# Kallisto, bustools script for single nuclei analysis of murine data
# 200124
# Author: M.Wolfien
# Adapted from https://bustools.github.io/BUS_notebooks_R/velocity.html

###################
# kallisto + bustools for spliced unspliced processing for RNAvelocity analysis
# Note: Generated mm10 index (cDNA_introns.idx) can also be obtained from https://doi.org/10.5281/zenodo.3623148 and you can continue with the kallisto alignment step

# https://www.kallistobus.tools/velocity_index_tutorial.html
# Determine your biological read length
zcat R1.fastq.gz | head -2
echo -n AGCGCCAAGCTGGTGAGGGGTCTCTCAATTTTTTTTTTTTTTNTNTTNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNCNNNNCNNNN | wc -c
150
wget ftp://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_mouse/release_M23/GRCm38.primary_assembly.genome.fa.gz

wget ftp://ftp.ensembl.org/pub/release-98/gtf/mus_musculus/

cat Mus_musculus.GRCm38.98.gtf | t2g make -p - > tr2g.txt

$ gunzip GRCm38.primary_assembly.genome.fa.gz
$ gunzip introns.bed.gz
$ head -4 introns.bed

# Using bedtools getfasta we will slice up the primary assembly with the BED file to give us a FASTA file of introns
$ bedtools getfasta -name -fo introns.fa -fi GRCm38.primary_assembly.genome.fa -bed introns.bed
$ head -2 introns.fa

# First we need to get a list of all of the intronic transcript IDs represented in our FASTA file, with or without version numbers.
$ cat introns.fa | awk '/^>/ {print $0}' | tr "_" " " | awk '{print $2"."$}' > introns_transcripts.txt
$ cat introns.fa | awk '/^>/ {print $0}' | tr "_" " " | awk '{print $1"."$3}' > introns_transcripts.txt
$ cat introns_transcripts.txt | tr "." " " | awk '{print $1}' > introns_transcripts_no_version.txt
sed -i -r 's/.{1}//' introns_transcripts_no_version.txt

# Now we add an identifier to the transcript IDs

$ cat introns_transcripts.txt | awk '{print $0"."NR"-I"}' > introns_transcripts.to_capture.txt
$ cat introns_transcripts_no_version.txt | awk '{print $0"."NR"-I"}' > introns_transcripts.to_capture.txt
sed -i -r 's/.{1}//' introns_transcripts.to_capture.txt


# Next we have to map the transcripts to their respective genes.  
$ awk 'NR==FNR{a[$1]=$2; b[$1]=$3;next} {$2=a[$1];$3=b[$1]} 1' tr2g.txt introns_transcripts.txt > introns_t2g.txt
$ awk 'NR==FNR{a[$1]=$2; b[$1]=$3;next} {$2=a[$1];$3=b[$1]} 1' tr2g.txt introns_transcripts_no_version.txt > introns_t2g.txt

$ awk 'NR==FNR{a[$1]=$2; b[$1]=$3;next} {$2=a[$1];$3=b[$1]} 1' tr2g.txt introns_transcripts.to_capture.txt > introns_t2g_1.txt
# Combine Both versions! 


# We need to fix all of the headers for the introns FASTA file so that they contain the transcript ID, an identifier specifying that the transcript is an “intronic” transcript, and a unique number to avoid duplicates.
$ awk '{print ">"$1"."NR"-I"" gene_id:"$2" gene_name:"$3}' introns_t2g.txt > introns_fasta_header.txt
$ awk -v var=1 'FNR==NR{a[NR]=$0;next}{ if ($0~/^>/) {print a[var], var++} else {print $0}}' introns_fasta_header.txt introns.fa > introns.correct_header.fa
$ head -1 introns.correct_header.fa

# Gunzip the cDNA FASTA file
$ gunzip cDNA.fa.gz
$ head -1 cDNA.fa  ## Mus mus_musculus

# Get list of transcripts
$ cat cDNA.fa | awk '/^>/ {print $0}' | tr "_" " " | awk '{print $3}' > cDNA_transcripts.txt
$ cat cDNA_transcripts.txt | tr "." " " | awk '{print $1}' > cDNA_transcripts_no_version.txt

# Add an identifier to the transcript IDs
$ cat cDNA_transcripts_no_version.txt | awk '{print $0"."NR}' > cDNA_transcripts.to_capture.txt

#Map the transcripts to genes
$ awk 'NR==FNR{a[$1]=$2; b[$1]=$3;next} {$2=a[$1];$3=b[$1]} 1' tr2g.txt cDNA_transcripts_no_version.txt > cDNA_t2g.txt
$ awk 'NR==FNR{a[$1]=$2; b[$1]=$3;next} {$2=a[$1];$3=b[$1]} 1' tr2g.txt cDNA_transcripts.to_capture.txt > cDNA_t2g_1.txt
# Combine Both versions! 

# Fix Introns fasta headers
$ awk '{print ">"$1"."NR" gene_id:"$2" gene_name:"$3}' cDNA_t2g.txt > cDNA_fasta_header.txt
$ awk -v var=1 'FNR==NR{a[NR]=$0;next}{ if ($0~/^>/) {print a[var], var++} else {print $0}}' cDNA_fasta_header.txt $cDNA.fa > cDNA.correct_header.fa
$ head -1 cDNA.correct_header.fa

# Make the kallisto index
$ cat cDNA.correct_header.fa introns.correct_header.fa > cDNA_introns.fa
$ cat cDNA_t2g.txt introns_t2g.txt > cDNA_introns_t2g.txt
$ kallisto index -i cDNA_introns.idx -k 31 cDNA_introns.fa

# Run kallisto for alignment
kallisto bus -i cDNA_introns.idx -o bus_output_fzDU/ -x 10xv2 -t 6 R1_001.fastq.gz R2_001.fastq.gz

# Run bustools - Correct, sort, capture, and count the spliced and unspliced matrices
$ cd bus_output_fzDU/
$ mkdir cDNA_capture/ introns_capture/ spliced/ unspliced/ tmp/ 
$ bustools correct -w ../10xv2_whitelist.txt -p output.bus | bustools sort -o output.correct.sort.bus -t 8 -
$ bustools correct -w ../3M-february-2018.txt -p output.bus | bustools sort -o output.correct.sort.bus -t 8 -
$ bustools capture -o cDNA_capture/cDNA_capture.bus -c ../cDNA_transcripts.to_capture.txt -e matrix.ec -t transcripts.txt output.correct.sort.bus -s
$ bustools capture -o introns_capture/intron_capture.bus -c ../introns_transcripts.to_capture.txt -e matrix.ec -t transcripts.txt output.correct.sort.bus -s
$ bustools count -o unspliced/unspliced -g ../cDNA_introns_t2g.txt -e matrix.ec -t transcripts.txt --genecounts cDNA_capture/cDNA_capture.bus
$ bustools count -o spliced/spliced -g ../cDNA_introns_t2g.txt -e matrix.ec -t transcripts.txt --genecounts introns_capture/intron_capture.bus

# Please continue with the provided R-script