# MPE #potato transcriptome v2 (evigene) STARlong mapping of transcripts to reference genome and MatchAnnot to get transcript annotations #last edited: 16.05.2019 cd /DATA/markop/potato_tr_v2_matchannot #mkdir star_index mkdir STARlong.ENCODE.denovo #ulimit default 1024 ulimit -n 10000 #create genome index for DM v4.04 using the merged iTag+PGSC gtf # STARlong \ # --runThreadN 28 \ # --runMode genomeGenerate \ # --genomeDir ./star_index \ # --genomeFastaFiles ./potatoDM404genome/StPGSC4.04n_2018-01-18.fasta \ # --sjdbGTFfile ./potatoDM404genome/StPGSC4.04n_PGSC-ITAG-merged_representative-transcript_genes_2019-04-23.gtf \ # --limitGenomeGenerateRAM 210000000000 #mapping Rywal transcriptome STARlong --genomeLoad LoadAndExit --genomeDir ./star_index #parameters from STARmanual ENCODE options, SAM attributes copied from Liz #more parameters here: https://groups.google.com/forum/#!topic/rna-star/E2eevlGWFbQ # and here: https://github.com/PacificBiosciences/IsoSeq_SA3nUP/wiki/Old-Tutorial:-Optimizing-STAR-aligner-for-Iso-Seq-data # also added multimap chimera detection # Between the two RH haplotypes, 6.6 Mb of sequence could be aligned with 96.5% identity, corresponding to 1 SNP per 29?bp and 1 indel per 253?bp (average length 10.4?bp) STARlong \ --runMode alignReads \ --outSAMattributes NH HI NM MD \ --outFilterType BySJout \ --outFilterMultimapNmax 20 \ --alignSJoverhangMin 8 \ --alignSJDBoverhangMin 1 \ --outFilterMismatchNmax 999 \ --outFilterMismatchNoverReadLmax 0.08\ --alignIntronMin 20 \ --alignIntronMax 1000000 \ --alignMatesGapMax 1000000 \ --outSAMtype SAM \ --limitBAMsortRAM 100000000000 \ --runThreadN 28 \ --genomeDir ./star_index \ --readFilesIn ../../_A_01_evigene/intermediate_tr2aacds_Rywal/okayset/all.okay.tr \ --outFileNamePrefix ./STARlong.ENCODE.denovo/Rywal.tr.okay_DM_STARlong. \ --seedPerReadNmax 100000 \ --seedPerWindowNmax 1000 \ --seedSearchLmax 30 \ --seedSearchStartLmax 30 \ --alignTranscriptsPerReadNmax 100000 \ --alignTranscriptsPerWindowNmax 10000 \ --scoreGapNoncan -20 \ --scoreDelOpen -1 \ --scoreDelBase -1 \ --scoreInsOpen -1 \ --scoreInsBase -1 \ --chimMultimapNmax 20 \ --chimSegmentMin 100 STARlong \ --runMode alignReads \ --outSAMattributes NH HI NM MD \ --outFilterType BySJout \ --outFilterMultimapNmax 20 \ --alignSJoverhangMin 8 \ --alignSJDBoverhangMin 1 \ --outFilterMismatchNmax 999 \ --outFilterMismatchNoverReadLmax 0.08\ --alignIntronMin 20 \ --alignIntronMax 1000000 \ --alignMatesGapMax 1000000 \ --outSAMtype SAM \ --limitBAMsortRAM 100000000000 \ --runThreadN 28 \ --genomeDir ./star_index \ --readFilesIn ../../_A_01_evigene/intermediate_tr2aacds_Rywal/okayset/all.okalt.tr \ --outFileNamePrefix ./STARlong.ENCODE.denovo/Rywal.tr.okalt_DM_STARlong. \ --seedPerReadNmax 100000 \ --seedPerWindowNmax 1000 \ --seedSearchLmax 30 \ --seedSearchStartLmax 30 \ --alignTranscriptsPerReadNmax 100000 \ --alignTranscriptsPerWindowNmax 10000 \ --scoreGapNoncan -20 \ --scoreDelOpen -1 \ --scoreDelBase -1 \ --scoreInsOpen -1 \ --scoreInsBase -1 \ --chimMultimapNmax 20 \ --chimSegmentMin 100 #mapping PW363 transcriptome STARlong \ --runMode alignReads \ --outSAMattributes NH HI NM MD \ --outFilterType BySJout \ --outFilterMultimapNmax 20 \ --alignSJoverhangMin 8 \ --alignSJDBoverhangMin 1 \ --outFilterMismatchNmax 999 \ --outFilterMismatchNoverReadLmax 0.08\ --alignIntronMin 20 \ --alignIntronMax 1000000 \ --alignMatesGapMax 1000000 \ --outSAMtype SAM \ --limitBAMsortRAM 100000000000 \ --runThreadN 28 \ --genomeDir ./star_index \ --readFilesIn ../../_A_01_evigene/intermediate_tr2aacds_PW363/okayset/all.okay.tr \ --outFileNamePrefix ./STARlong.ENCODE.denovo/PW363.tr.okay_DM_STARlong. \ --seedPerReadNmax 100000 \ --seedPerWindowNmax 1000 \ --seedSearchLmax 30 \ --seedSearchStartLmax 30 \ --alignTranscriptsPerReadNmax 100000 \ --alignTranscriptsPerWindowNmax 10000 \ --scoreGapNoncan -20 \ --scoreDelOpen -1 \ --scoreDelBase -1 \ --scoreInsOpen -1 \ --scoreInsBase -1 \ --chimMultimapNmax 20 \ --chimSegmentMin 100 STARlong \ --runMode alignReads \ --outSAMattributes NH HI NM MD \ --outFilterType BySJout \ --outFilterMultimapNmax 20 \ --alignSJoverhangMin 8 \ --alignSJDBoverhangMin 1 \ --outFilterMismatchNmax 999 \ --outFilterMismatchNoverReadLmax 0.08\ --alignIntronMin 20 \ --alignIntronMax 1000000 \ --alignMatesGapMax 1000000 \ --outSAMtype SAM \ --limitBAMsortRAM 100000000000 \ --runThreadN 28 \ --genomeDir ./star_index \ --readFilesIn ../../_A_01_evigene/intermediate_tr2aacds_PW363/okayset/all.okalt.tr \ --outFileNamePrefix ./STARlong.ENCODE.denovo/PW363.tr.okalt_DM_STARlong. \ --seedPerReadNmax 100000 \ --seedPerWindowNmax 1000 \ --seedSearchLmax 30 \ --seedSearchStartLmax 30 \ --alignTranscriptsPerReadNmax 100000 \ --alignTranscriptsPerWindowNmax 10000 \ --scoreGapNoncan -20 \ --scoreDelOpen -1 \ --scoreDelBase -1 \ --scoreInsOpen -1 \ --scoreInsBase -1 \ --chimMultimapNmax 20 \ --chimSegmentMin 100 #mapping Desiree transcriptome STARlong \ --runMode alignReads \ --outSAMattributes NH HI NM MD \ --outFilterType BySJout \ --outFilterMultimapNmax 20 \ --alignSJoverhangMin 8 \ --alignSJDBoverhangMin 1 \ --outFilterMismatchNmax 999 \ --outFilterMismatchNoverReadLmax 0.08\ --alignIntronMin 20 \ --alignIntronMax 1000000 \ --alignMatesGapMax 1000000 \ --outSAMtype SAM \ --limitBAMsortRAM 100000000000 \ --runThreadN 28 \ --genomeDir ./star_index \ --readFilesIn ../../_A_01_evigene/intermediate_tr2aacds_Desiree/okayset/all.okay.tr \ --outFileNamePrefix ./STARlong.ENCODE.denovo/Desiree.tr.okay_DM_STARlong. \ --seedPerReadNmax 100000 \ --seedPerWindowNmax 1000 \ --seedSearchLmax 30 \ --seedSearchStartLmax 30 \ --alignTranscriptsPerReadNmax 100000 \ --alignTranscriptsPerWindowNmax 10000 \ --scoreGapNoncan -20 \ --scoreDelOpen -1 \ --scoreDelBase -1 \ --scoreInsOpen -1 \ --scoreInsBase -1 \ --chimMultimapNmax 20 \ --chimSegmentMin 100 STARlong \ --runMode alignReads \ --outSAMattributes NH HI NM MD \ --outFilterType BySJout \ --outFilterMultimapNmax 20 \ --alignSJoverhangMin 8 \ --alignSJDBoverhangMin 1 \ --outFilterMismatchNmax 999 \ --outFilterMismatchNoverReadLmax 0.08\ --alignIntronMin 20 \ --alignIntronMax 1000000 \ --alignMatesGapMax 1000000 \ --outSAMtype SAM \ --limitBAMsortRAM 100000000000 \ --runThreadN 28 \ --genomeDir ./star_index \ --readFilesIn ../../_A_01_evigene/intermediate_tr2aacds_Desiree/okayset/all.okalt.tr \ --outFileNamePrefix ./STARlong.ENCODE.denovo/Desiree.tr.okalt_DM_STARlong. \ --seedPerReadNmax 100000 \ --seedPerWindowNmax 1000 \ --seedSearchLmax 30 \ --seedSearchStartLmax 30 \ --alignTranscriptsPerReadNmax 100000 \ --alignTranscriptsPerWindowNmax 10000 \ --scoreGapNoncan -20 \ --scoreDelOpen -1 \ --scoreDelBase -1 \ --scoreInsOpen -1 \ --scoreInsBase -1 \ --chimMultimapNmax 20 \ --chimSegmentMin 100 STARlong --genomeLoad Remove --genomeDir ./star_index #sorting SAM files (same as done by Liz) for i in $(ls ./STARlong.ENCODE.denovo/*.sam | sed s/.sam// ) do sort -k 3,3 -k 4,4n ${i}.sam > ${i}.sorted.sam done #MatchAnnot #/home/administrator/Software/MatchAnnot/matchAnnot.py --gtf=./potatoDM404genome/StPGSC4.04n_PGSC-ITAG-merged_representative-transcript_genes_2019-04-23.gtf --format=alt\ ./STARlong.ENCODE.denovo/Desiree.tr.okalt_DM_STARlong.Aligned.out.sorted.sam > ./STARlong.ENCODE.denovo/Desiree.tr.okalt_DM_STARlong.Aligned.out.sorted.sam.matchAnnot.txt for i in $(ls ./STARlong.ENCODE.denovo/*.sorted.sam ) do /home/administrator/Software/MatchAnnot/matchAnnot.py --gtf=./potatoDM404genome/StPGSC4.04n_PGSC-ITAG-merged_representative-transcript_genes_2019-04-23.gtf --format=alt\ ${i} > ${i}.matchAnnot.txt done #parsing the matchannot results: EvigenetrID DMgeneID DMtrID exon_match match_score fwd/rev # grep "result:" Desiree.tr.okay_DM_STARlong.Aligned.out.sorted.sam.matchAnnot.txt | tr -s ' ' | sed 's/5-3:/fwd/g' | awk -F'[ ]' '{print $2, $3, $4, $6, $8, $9}' > Desiree.tr.okay_DM_STARlong.Aligned.out.sorted.sam.matchAnnot.parsed.txt # actually I don't need info on fwd/rev for i in $(ls ./STARlong.ENCODE.denovo/*.sorted.sam.matchAnnot.txt | sed s/.txt// ) do grep "result:" ${i}.txt | tr -s ' ' | awk -F'[ ]' '{print $2, $3, $4, $6, $8}' > ${i}.parsed.txt done