#SUSPHIRE tomato dataset RNA-seq mapping with STAR v2.6.1d cd /DATA/markop/SexyPlant mkdir STAR_index mkdir STAR_output #ulimit default 1024 ulimit -n 10000 # generate genome index STAR \ --runThreadN 8 \ --runMode genomeGenerate \ --genomeDir ./STAR_index \ --genomeFastaFiles ./benty_genome/Supplemental_dataset_1_NbC_fasta.fasta \ --sjdbGTFfile ./benty_genome/NbC_gene_exon.gff \ --sjdbGTFtagExonParentTranscript ID \ --sjdbOverhang 100 \ --limitGenomeGenerateRAM 50000000000 # map reads STAR --genomeLoad LoadAndExit --genomeDir ./STAR_index for i in $(ls /STORK_ngs/NGS_mpe_2019_SUSPHIRE-Nbenthamiana/original/ | sed s/_[12].fq.gz// | sort -u) do STAR \ --genomeDir ./STAR_index \ --quantMode GeneCounts \ --runThreadN 8 \ --readFilesIn /STORK_ngs/NGS_mpe_2019_SUSPHIRE-Nbenthamiana/original/${i}_1.fq.gz,/STORK_ngs/NGS_mpe_2019_SUSPHIRE-Nbenthamiana/original/${i}_2.fq.gz \ --readFilesCommand gunzip -c \ --outFileNamePrefix ./STAR_output/SexyPlant_STAR_$i. \ --outFilterMultimapNmax 4 \ --outFilterMismatchNoverReadLmax 0.05 \ --outSAMtype BAM SortedByCoordinate \ --quantTranscriptomeBan Singleend \ --limitBAMsortRAM 50000000000 done STAR --genomeLoad Remove --genomeDir ./STAR_index #ReadsPerGene.out.tab does not contain reads per gene # https://groups.google.com/forum/#!msg/rna-star/oRvzihFXE8k/Xa-7YgUUBgAJ # For the ReadsPerGene counting to work, each "exon" in the GFF file needs to contain gene Name it belongs to. # Your file does not have this formatting - it only lists exon Parent (which is mRNA), and mRNA in turn has a Parent gene. # trying with R featureCounts #run R R # install library (unccomment and run in sudo R) #BiocManager::install("Rsubread") library("Rsubread") files <- list.files(path="./STAR_output", pattern=".bam", full.names=TRUE) counts <- featureCounts(files, # annotation annot.inbuilt=FALSE, annot.ext="./benty_genome/NbC_gene_exon_Desc_Name_CLCexport-geneOnly.gtf", isGTFAnnotationFile=TRUE, GTF.featureType="EXON", GTF.attrType="gene_id", GTF.attrType.extra=NULL, chrAliases=NULL, # level of summarization - do not put TRUE here because it will summarize exons by "gene_id" strings and these are not unique useMetaFeatures=FALSE, # overlap between reads and features allowMultiOverlap=FALSE, minOverlap=1, fracOverlap=0, fracOverlapFeature=0, largestOverlap=FALSE, nonOverlap=NULL, nonOverlapFeature=NULL, # Read shift, extension and reduction readShiftType="upstream", readShiftSize=0, readExtension5=0, readExtension3=0, read2pos=NULL, # multi-mapping reads countMultiMappingReads=TRUE, # fractional counting fraction=FALSE, # long reads isLongRead=FALSE, # read filtering minMQS=0, splitOnly=FALSE, nonSplitOnly=FALSE, primaryOnly=FALSE, ignoreDup=FALSE, # strandness strandSpecific=0, # exon-exon junctions juncCounts=FALSE, genome=NULL, # parameters specific to paired end reads isPairedEnd=TRUE, requireBothEndsMapped=FALSE, checkFragLength=FALSE, minFragLength=50, maxFragLength=600, countChimericFragments=TRUE, autosort=TRUE, # number of CPU threads nthreads=8, # read group byReadGroup=FALSE, # report assignment result for each read reportReads=NULL, reportReadsPath=NULL, # miscellaneous maxMOp=10, tmpDir=".", verbose=FALSE) colnames(counts$counts) <- unlist (strsplit(substr(colnames(counts$counts), 52, 59), ".", fixed = TRUE))[c(1:17,19,21,23)] counttable <- cbind (counts$annotation, counts$counts) write.table(counttable, file="SexyPlant_counts_STAR_bothStr.txt", sep="\t", row.names=FALSE) q("no")