#SUSPHIRE tomato dataset RNA-seq mapping with STAR v2.6.1d

cd /DATA/markop/SexyPlant

mkdir STAR_index
mkdir STAR_output

#ulimit default 1024
ulimit -n 10000

# generate genome index
STAR \
--runThreadN 8 \
--runMode genomeGenerate \
--genomeDir ./STAR_index \
--genomeFastaFiles ./benty_genome/Supplemental_dataset_1_NbC_fasta.fasta \
--sjdbGTFfile ./benty_genome/NbC_gene_exon.gff \
--sjdbGTFtagExonParentTranscript ID \
--sjdbOverhang 100 \
--limitGenomeGenerateRAM 50000000000

# map reads
STAR --genomeLoad LoadAndExit --genomeDir ./STAR_index

for i in $(ls /STORK_ngs/NGS_mpe_2019_SUSPHIRE-Nbenthamiana/original/ | sed s/_[12].fq.gz// | sort -u)
do  STAR \
	--genomeDir ./STAR_index \
	--quantMode GeneCounts \
	--runThreadN 8 \
	--readFilesIn /STORK_ngs/NGS_mpe_2019_SUSPHIRE-Nbenthamiana/original/${i}_1.fq.gz,/STORK_ngs/NGS_mpe_2019_SUSPHIRE-Nbenthamiana/original/${i}_2.fq.gz \
	--readFilesCommand gunzip -c \
	--outFileNamePrefix ./STAR_output/SexyPlant_STAR_$i. \
	--outFilterMultimapNmax 4 \
	--outFilterMismatchNoverReadLmax 0.05 \
	--outSAMtype BAM SortedByCoordinate \
	--quantTranscriptomeBan Singleend \
	--limitBAMsortRAM 50000000000
done

STAR --genomeLoad Remove --genomeDir ./STAR_index

#ReadsPerGene.out.tab does not contain reads per gene

# https://groups.google.com/forum/#!msg/rna-star/oRvzihFXE8k/Xa-7YgUUBgAJ
# For the ReadsPerGene counting to work, each "exon" in the GFF file needs to contain gene Name it belongs to.
# Your file does not have this formatting - it only lists exon Parent (which is mRNA), and mRNA in turn has a Parent gene.

# trying with R featureCounts

#run R
R

# install library (unccomment and run in sudo R)
#BiocManager::install("Rsubread")

library("Rsubread")
files <- list.files(path="./STAR_output", pattern=".bam", full.names=TRUE)
counts <- featureCounts(files,

	# annotation
	annot.inbuilt=FALSE,
	annot.ext="./benty_genome/NbC_gene_exon_Desc_Name_CLCexport-geneOnly.gtf",
	isGTFAnnotationFile=TRUE,
	GTF.featureType="EXON",
	GTF.attrType="gene_id",
	GTF.attrType.extra=NULL,
	chrAliases=NULL,
	
	# level of summarization - do not put TRUE here because it will summarize exons by "gene_id" strings and these are not unique
	useMetaFeatures=FALSE,
	
	# overlap between reads and features
	allowMultiOverlap=FALSE,
	minOverlap=1,
	fracOverlap=0,
	fracOverlapFeature=0,
	largestOverlap=FALSE,
	nonOverlap=NULL,
	nonOverlapFeature=NULL,

	# Read shift, extension and reduction
	readShiftType="upstream",
	readShiftSize=0,
	readExtension5=0,
	readExtension3=0,
	read2pos=NULL,
	
	# multi-mapping reads
	countMultiMappingReads=TRUE,

	# fractional counting
	fraction=FALSE,

	# long reads
	isLongRead=FALSE,

	# read filtering
	minMQS=0,
	splitOnly=FALSE,
	nonSplitOnly=FALSE,
	primaryOnly=FALSE,
	ignoreDup=FALSE,
	
	# strandness
	strandSpecific=0,
	
	# exon-exon junctions
	juncCounts=FALSE,
	genome=NULL,
	
	# parameters specific to paired end reads
	isPairedEnd=TRUE,
	requireBothEndsMapped=FALSE,
	checkFragLength=FALSE,
	minFragLength=50,
	maxFragLength=600,
	countChimericFragments=TRUE,	
	autosort=TRUE,
	
	# number of CPU threads
	nthreads=8,

	# read group
	byReadGroup=FALSE,

	# report assignment result for each read
	reportReads=NULL,
	reportReadsPath=NULL,

	# miscellaneous
	maxMOp=10,
	tmpDir=".",
	verbose=FALSE)
	
colnames(counts$counts) <- unlist (strsplit(substr(colnames(counts$counts), 52, 59), ".", fixed = TRUE))[c(1:17,19,21,23)]
counttable <- cbind (counts$annotation, counts$counts)
write.table(counttable, file="SexyPlant_counts_STAR_bothStr.txt", sep="\t", row.names=FALSE)

q("no")