# Kallisto, bustools script for single nuclei analysis of murine data # 200109 # Author: M.Wolfien # Adapted from https://bustools.github.io/BUS_notebooks_R/velocity.html ################### # kallisto + bustools were used for alignment and quantification of spliced unspliced reads and downstream RNAvelocity analysis # Note: you can skip this step and go to Zenodo (http://doi.org/10.5281/zenodo.3623148) where we host the fully build and used kallisto index that is being build with the recipe, just continue with step 2 after downloading the index from Zenodo. # However, this script can still be used as a guideline for another gene build version or different species. ##### General introduction on kallisto plus bustools for RNA velocity analysis # https://www.kallistobus.tools/velocity_index_tutorial.html ######## # 1. Determine your biological read length zcat R1.fastq.gz | head -2 echo -n AGCGCCAAGCTGGTGAGGGGTCTCTCAATTTTTTTTTTTTTTNTNTTNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNCNNNNCNNNN | wc -c 150 wget ftp://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_mouse/release_M23/GRCm38.primary_assembly.genome.fa.gz wget ftp://ftp.ensembl.org/pub/release-98/gtf/mus_musculus/ cat Mus_musculus.GRCm38.98.gtf | t2g make -p - > tr2g.txt $ gunzip GRCm38.primary_assembly.genome.fa.gz $ gunzip introns.bed.gz $ head -4 introns.bed # Using bedtools getfasta we will slice up the primary assembly with the BED file to give us a FASTA file of introns $ bedtools getfasta -name -fo introns.fa -fi GRCm38.primary_assembly.genome.fa -bed introns.bed $ head -2 introns.fa # First we need to get a list of all of the intronic transcript IDs represented in our FASTA file, with or without version numbers. $ cat introns.fa | awk '/^>/ {print $0}' | tr "_" " " | awk '{print $2"."$}' > introns_transcripts.txt $ cat introns.fa | awk '/^>/ {print $0}' | tr "_" " " | awk '{print $1"."$3}' > introns_transcripts.txt $ cat introns_transcripts.txt | tr "." " " | awk '{print $1}' > introns_transcripts_no_version.txt sed -i -r 's/.{1}//' introns_transcripts_no_version.txt # Now we add an identifier to the transcript IDs $ cat introns_transcripts.txt | awk '{print $0"."NR"-I"}' > introns_transcripts.to_capture.txt $ cat introns_transcripts_no_version.txt | awk '{print $0"."NR"-I"}' > introns_transcripts.to_capture.txt sed -i -r 's/.{1}//' introns_transcripts.to_capture.txt # Next we have to map the transcripts to their respective genes. $ awk 'NR==FNR{a[$1]=$2; b[$1]=$3;next} {$2=a[$1];$3=b[$1]} 1' tr2g.txt introns_transcripts.txt > introns_t2g.txt $ awk 'NR==FNR{a[$1]=$2; b[$1]=$3;next} {$2=a[$1];$3=b[$1]} 1' tr2g.txt introns_transcripts_no_version.txt > introns_t2g.txt $ awk 'NR==FNR{a[$1]=$2; b[$1]=$3;next} {$2=a[$1];$3=b[$1]} 1' tr2g.txt introns_transcripts.to_capture.txt > introns_t2g_1.txt # Combine Both versions! # We need to fix all of the headers for the introns FASTA file so that they contain the transcript ID, an identifier specifying that the transcript is an “intronic” transcript, and a unique number to avoid duplicates. $ awk '{print ">"$1"."NR"-I"" gene_id:"$2" gene_name:"$3}' introns_t2g.txt > introns_fasta_header.txt $ awk -v var=1 'FNR==NR{a[NR]=$0;next}{ if ($0~/^>/) {print a[var], var++} else {print $0}}' introns_fasta_header.txt introns.fa > introns.correct_header.fa $ head -1 introns.correct_header.fa # Gunzip the cDNA FASTA file $ gunzip cDNA.fa.gz $ head -1 cDNA.fa ## Mus mus_musculus # Get list of transcripts $ cat cDNA.fa | awk '/^>/ {print $0}' | tr "_" " " | awk '{print $3}' > cDNA_transcripts.txt $ cat cDNA_transcripts.txt | tr "." " " | awk '{print $1}' > cDNA_transcripts_no_version.txt # Add an identifier to the transcript IDs $ cat cDNA_transcripts_no_version.txt | awk '{print $0"."NR}' > cDNA_transcripts.to_capture.txt #Map the transcripts to genes $ awk 'NR==FNR{a[$1]=$2; b[$1]=$3;next} {$2=a[$1];$3=b[$1]} 1' tr2g.txt cDNA_transcripts_no_version.txt > cDNA_t2g.txt $ awk 'NR==FNR{a[$1]=$2; b[$1]=$3;next} {$2=a[$1];$3=b[$1]} 1' tr2g.txt cDNA_transcripts.to_capture.txt > cDNA_t2g_1.txt # Combine Both versions! # Fix Introns fasta headers $ awk '{print ">"$1"."NR" gene_id:"$2" gene_name:"$3}' cDNA_t2g.txt > cDNA_fasta_header.txt $ awk -v var=1 'FNR==NR{a[NR]=$0;next}{ if ($0~/^>/) {print a[var], var++} else {print $0}}' cDNA_fasta_header.txt $cDNA.fa > cDNA.correct_header.fa $ head -1 cDNA.correct_header.fa # Make the kallisto index $ cat cDNA.correct_header.fa introns.correct_header.fa > cDNA_introns.fa $ cat cDNA_t2g.txt introns_t2g.txt > cDNA_introns_t2g.txt $ kallisto index -i cDNA_introns.idx -k 31 cDNA_introns.fa ######## # 2. Run kallisto for alignment kallisto bus -i cDNA_introns.idx -o bus_output_fzDU/ -x 10xv2 -t 6 R1_001.fastq.gz R2_001.fastq.gz ########### # 3. Run bustools - Correct, sort, capture, and count the spliced and unspliced matrices $ cd bus_output_fzDU/ $ mkdir cDNA_capture/ introns_capture/ spliced/ unspliced/ tmp/ $ bustools correct -w ../10xv2_whitelist.txt -p output.bus | bustools sort -o output.correct.sort.bus -t 8 - $ bustools correct -w ../3M-february-2018.txt -p output.bus | bustools sort -o output.correct.sort.bus -t 8 - $ bustools capture -o cDNA_capture/cDNA_capture.bus -c ../cDNA_transcripts.to_capture.txt -e matrix.ec -t transcripts.txt output.correct.sort.bus -s $ bustools capture -o introns_capture/intron_capture.bus -c ../introns_transcripts.to_capture.txt -e matrix.ec -t transcripts.txt output.correct.sort.bus -s $ bustools count -o unspliced/unspliced -g ../cDNA_introns_t2g.txt -e matrix.ec -t transcripts.txt --genecounts cDNA_capture/cDNA_capture.bus $ bustools count -o spliced/spliced -g ../cDNA_introns_t2g.txt -e matrix.ec -t transcripts.txt --genecounts introns_capture/intron_capture.bus ############ # 4. Continue with the provided R.script for downstream processing