rm()
gc()

##           used (Mb) gc trigger (Mb) max used (Mb)
## Ncells  522230 27.9    1181252 63.1   621665 33.3
## Vcells 1003993  7.7    8388608 64.0  1601566 12.3

if(!require(data.table)) install.packages("data.table")

## Loading required package: data.table

library(data.table)
if(!require(tidyverse)) install.packages("tidyverse")

## Loading required package: tidyverse

## -- Attaching packages ---------------------------------- tidyverse 1.2.1 --

## <U+221A> ggplot2 3.2.1     <U+221A> purrr   0.3.2
## <U+221A> tibble  2.1.3     <U+221A> dplyr   0.8.3
## <U+221A> tidyr   0.8.3     <U+221A> stringr 1.4.0
## <U+221A> readr   1.3.1     <U+221A> forcats 0.4.0

## -- Conflicts ------------------------------------- tidyverse_conflicts() --
## x dplyr::between()   masks data.table::between()
## x dplyr::filter()    masks stats::filter()
## x dplyr::first()     masks data.table::first()
## x dplyr::lag()       masks stats::lag()
## x dplyr::last()      masks data.table::last()
## x purrr::transpose() masks data.table::transpose()

library(tidyverse)
if(!require(grr)) install.packages("grr")

## Loading required package: grr

## 
## Attaching package: 'grr'

## The following object is masked from 'package:dplyr':
## 
##     matches

## The following object is masked from 'package:tidyr':
## 
##     extract

library(grr)

'%ni%' = Negate('%in%')

# getwd()

Desiree

IDs, evg headers and seq lenghts

tr = read.table(file = "../output/intermediate/Desiree/Desiree.tr_lengths.tsv", 
               header = TRUE, 
               sep = "\t", 
               quote = NULL,
               dec = ".", 
               stringsAsFactors = FALSE,
               comment.char = "@")

cds = read.table(file = "../output/intermediate/Desiree/Desiree.cds_lengths.tsv", 
               header = TRUE, 
               sep = "\t", 
               quote = NULL,
               dec = ".", 
               stringsAsFactors = FALSE,
               comment.char = "@")

colnames(tr)[1] = "name"
colnames(cds)[1] = "name"

head(tr)

##                                                                                                 name
## 1          CLCdnDe10_23 evgclass=main,okay,match:CLCdnDe1_18118,pct:100/95/.; aalen=76,70%,complete;
## 2           CLCdnDe10_45 evgclass=main,okay,match:VdnDe1_1819,pct:100/100/.; aalen=717,91%,complete;
## 3 CLCdnDe10_71 evgclass=main,okay,match:TDe_comp105794_c1_seq3,pct:99/100/.; aalen=228,72%,complete;
## 4     CLCdnDe10_83 evgclass=main,okay,match:VdnDe3_203256,pct:100/100/-sense; aalen=43,47%,complete;
## 5        CLCdnDe10_113 evgclass=main,okay,match:CLCdnDe2_287,pct:100/100/.; aalen=1291,86%,complete;
## 6          CLCdnDe10_115 evgclass=main,okay,match:VdnDe3_98989,pct:99/100/.; aalen=267,70%,complete;
##   seq qual length
## 1  NA   NA    327
## 2  NA   NA   2364
## 3  NA   NA    954
## 4  NA   NA    276
## 5  NA   NA   4478
## 6  NA   NA   1143

head(cds)

##                                                                                                                                             name
## 1          CLCdnDe10_23 type=CDS; aalen=76,70%,complete; clen=327; strand=-; offs=269-39;  evgclass=main,okay,match:CLCdnDe1_18118,pct:100/95/.;
## 2        CLCdnDe10_45 type=CDS; aalen=717,91%,complete; clen=2364; strand=+; offs=147-2300;  evgclass=main,okay,match:VdnDe1_1819,pct:100/100/.;
## 3 CLCdnDe10_71 type=CDS; aalen=228,72%,complete; clen=954; strand=-; offs=762-76;  evgclass=main,okay,match:TDe_comp105794_c1_seq3,pct:99/100/.;
## 4     CLCdnDe10_83 type=CDS; aalen=43,47%,complete; clen=276; strand=-; offs=222-91;  evgclass=main,okay,match:VdnDe3_203256,pct:100/100/-sense;
## 5     CLCdnDe10_113 type=CDS; aalen=1291,86%,complete; clen=4478; strand=-; offs=4057-182;  evgclass=main,okay,match:CLCdnDe2_287,pct:100/100/.;
## 6         CLCdnDe10_115 type=CDS; aalen=267,70%,complete; clen=1143; strand=+; offs=91-894;  evgclass=main,okay,match:VdnDe3_98989,pct:99/100/.;
##   seq qual length
## 1  NA   NA    231
## 2  NA   NA   2154
## 3  NA   NA    687
## 4  NA   NA    132
## 5  NA   NA   3876
## 6  NA   NA    804

tr = tr[, -c(2,3)]
cds = cds[, -c(2,3)]

tmp = strsplit(tr$name, " ")
tmp = data.table::transpose(tmp)
length(tmp)

## [1] 3

ID = tmp[[1]]
evgHead1 = tmp[[2]]
evgHead2 = tmp[[3]]
## gsub("[;\"]"
evgHead2 = sapply(1:length(evgHead2), function(x) gsub("aalen=", "", evgHead2[x]))
evgHead2 = sapply(1:length(evgHead2), function(x) gsub(";", "", evgHead2[x]))
temp = strsplit(evgHead2, ",")
temp = data.table::transpose(temp)
evgAAlen = temp[[1]]
evgAAperc = temp[[2]]
evgAAcomplete = temp[[3]]

evgHead1 = sapply(1:length(evgHead1), function(x) gsub("evgclass=", "", evgHead1[x]))
evgHead1 = sapply(1:length(evgHead1), function(x) gsub(";", "", evgHead1[x]))
temp = strsplit(evgHead1, ",")
temp = data.table::transpose(temp)
class = temp[[1]]
pass = temp[[2]]
match = temp[[3]]
pct = temp[[4]]

tr.2 = cbind(ID, class, pass, match, pct, evgAAlen, evgAAperc, evgAAcomplete, tr$length)
colnames(tr.2)[dim(tr.2)[2]] = "length"

tmp = strsplit(cds$name, " ")
tmp = data.table::transpose(tmp)
length(tmp)

## [1] 8

ID = tmp[[1]]
evgHead1 = tmp[[8]]
evgHead2 = tmp[[3]]

evgHead2 = sapply(1:length(evgHead2), function(x) gsub("aalen=", "", evgHead2[x]))
evgHead2 = sapply(1:length(evgHead2), function(x) gsub(";", "", evgHead2[x]))
temp = strsplit(evgHead2, ",")
temp = data.table::transpose(temp)
evgAAlen = temp[[1]]
evgAAperc = temp[[2]]
evgAAcomplete = temp[[3]]

evgHead1 = sapply(1:length(evgHead1), function(x) gsub("evgclass=", "", evgHead1[x]))
evgHead1 = sapply(1:length(evgHead1), function(x) gsub(";", "", evgHead1[x]))
temp = strsplit(evgHead1, ",")
temp = data.table::transpose(temp)
class = temp[[1]]
pass = temp[[2]]
match = temp[[3]]
pct = temp[[4]]

clen = gsub("clen=", "", tmp[[4]])
strand = gsub("strand=", "", tmp[[5]])
offs = gsub("offs=", "", tmp[[6]])
clen = gsub(";", "", clen)
strand = gsub(";", "", strand)
offs = gsub(";", "", offs)

# dim(cbind(clen, strand, offs))

cds.2 = cbind(ID, class, pass, match, pct, evgAAlen, evgAAperc, evgAAcomplete, clen, strand, offs, cds$length)
colnames(cds.2)[dim(cds.2)[2]] = "length"

utrorfs

dim(cds.2) - dim(tr.2)

## [1] 3556    3

utrorf = cds.2[grep("utrorf", cds.2[,1])]
tr.utrorf = gsub("utrorf", "", utrorf)
ind = match(tr.utrorf, tr.2[,1])

cat("missing in .tr: ", sum(is.na(ind)), "sequences while existing in .cds as utrorf \n")

## missing in .tr:  3095 sequences while existing in .cds as utrorf

ind2 = which(is.na(ind))
ind3 = match(utrorf[ind2],cds.2[,1])

mymat = matrix(NA, length(ind2), dim(tr.2)[2])
mymat[,1] = tr.utrorf[ind2]
colnames(mymat) = colnames(tr.2)
mymat = as.data.frame(mymat)
mymat$class = rep("lost.dropped", length(ind2))
mymat$pass = rep("as.utrorfs.cds", length(ind2))

# tr.2 = rbind(tr.2, mymat) # shift to utrorf file


uniqueness = gsub("utrorf", "", cds.2[,1])
n_occur <- data.frame(table(uniqueness))
duplicates = n_occur[n_occur$Freq > 1,]
# dim(duplicates)
## partial match
## duplicatedCDS = sapply(1:dim(duplicates)[1], function(x) grep(as.character(duplicates[x,1]), cds.2[,1]))
## duplicatedCDSTable = cds.2[unlist(duplicatedCDS),]
duplicatedCDSTable = rbind (cds.2[match(duplicates[,1],cds.2[,1]),], 
                            cds.2[match(paste0(duplicates[,1], "utrorf"),cds.2[,1]),])
duplicatedCDSTable = as.data.frame(duplicatedCDSTable)
duplicatedCDSTable = duplicatedCDSTable[with(duplicatedCDSTable, order(ID)), ]


cat(".tr with 2nd .cds: ", dim(duplicatedCDSTable)[1]/2, " \n")

## .tr with 2nd .cds:  461

write.table(duplicatedCDSTable, file = "../output/Desiree/Desiree_secondaryORFTable.tsv", 
            append = FALSE, quote = FALSE, sep = "\t",
            eol = "\n", na = "NA", dec = ".", row.names = FALSE,
            col.names = TRUE, qmethod = c("escape", "double"),
            fileEncoding = "")

colnames(tr.2)[2:dim(tr.2)[2]] = paste0("tr.", colnames(tr.2)[2:dim(tr.2)[2]])
colnames(cds.2)[2:dim(cds.2)[2]] = paste0("cds.", colnames(cds.2)[2:dim(cds.2)[2]])

# http://www.datasciencemadesimple.com/join-in-r-merge-in-r/

tr.3 = merge(tr.2, cds.2, by="ID", all.x =TRUE)


utrorf = cds.2[grep("utrorf", cds.2[,1])]
tr.utrorf = gsub("utrorf", "", utrorf)


ind4 = match(tr.utrorf, tr.2[,1])
ind5 = match(utrorf, cds.2[,1])
# colnames(cds.2[ind5,])  
# colnames(tr.2[ind4,])
tmp = (cds.2[ind5,])  
# dim(tr.2[ind4,]) 
colnames(tmp) = colnames(cds.2)
colnames(tmp)[1] = "ID2"
tmp = cbind(tmp, gsub("utrorf", "", tmp[,1]))
colnames(tmp)[dim(tmp)[2]] = "ID"

tr.3_utrorf = merge(tr.2, tmp, by="ID", all.y =TRUE)
colnames(tr.3_utrorf)[1] = "ID3"
colnames(tr.3_utrorf)[dim(tr.2)[2]+1] = "ID"
colnames(tr.3_utrorf)[1] = "ID2"
# dim(tr.3_utrorf)

colnames(tr.3_utrorf)

##  [1] "ID2"               "tr.class"          "tr.pass"          
##  [4] "tr.match"          "tr.pct"            "tr.evgAAlen"      
##  [7] "tr.evgAAperc"      "tr.evgAAcomplete"  "tr.length"        
## [10] "ID"                "cds.class"         "cds.pass"         
## [13] "cds.match"         "cds.pct"           "cds.evgAAlen"     
## [16] "cds.evgAAperc"     "cds.evgAAcomplete" "cds.clen"         
## [19] "cds.strand"        "cds.offs"          "cds.length"

colnames(mymat)

## [1] "ID"            "class"         "pass"          "match"        
## [5] "pct"           "evgAAlen"      "evgAAperc"     "evgAAcomplete"
## [9] "length"

mymat2 = matrix(NA, dim(mymat)[1], dim(tr.3_utrorf)[2])
colnames(mymat2) = colnames(tr.3_utrorf)
mymat2 = as.data.frame(mymat2)
mymat2$ID2 = mymat$ID
mymat2$tr.class = mymat$class
mymat2$tr.pass = mymat$pass
mymat2$tr.length = mymat$length
mymat2$ID = mymat$ID
i <- sapply(tr.3_utrorf, is.factor) 
tr.3_utrorf[i] <- lapply(tr.3_utrorf[i], as.character) 
i <- sapply(mymat2, is.factor) 
mymat2[i] <- lapply(mymat2[i], as.character) 
dim(mymat2)

## [1] 3095   21

dim(tr.3_utrorf)

## [1] 3556   21

sum(mymat2$ID2 %in% tr.3_utrorf$ID2)

## [1] 3095

ind = match(mymat2$ID2, tr.3_utrorf$ID2)
# names(tr.3_utrorf)
tr.3_utrorf$tr.class[ind] = mymat2$tr.class # shift to utrorf file; already in
tr.3_utrorf$tr.pass[ind] = mymat2$tr.pass # shift to utrorf file; already in

sum(duplicatedCDSTable$ID %in% tr.3_utrorf$ID2)

## [1] 461

sum(duplicatedCDSTable$ID %in% tr.3_utrorf$ID)

## [1] 461

tmp = tr.3[which(tr.3$ID %in% duplicatedCDSTable$ID ),]
tmp$ID2 = tmp$ID
ind = match(colnames(tr.3_utrorf), colnames(tmp))
tmp = tmp[,ind]
removeAdd = match(tmp$ID, tr.3$ID)

tr.3_utrorf = rbind(tmp, tr.3_utrorf) # shift to utrorf file

tr.3 = tr.3[-removeAdd,] # shift to utrorf file

lenghts

hist(as.numeric(tr.3$tr.length), 
     breaks = seq(0, max(as.numeric(tr.3$tr.length), na.rm = TRUE) + 50, 50),
     main = "Desiree evigene transcripts length", xlab = "transcript length",
     xlim=c(0, max(300,max(as.numeric(tr.3$tr.length), na.rm = TRUE))),
     col = "grey90")

hist(as.numeric(tr.3$cds.length), 
     breaks = seq(0, max(as.numeric(tr.3$cds.length), na.rm = TRUE) + 50, 50),
     main = "Desiree evigene coding sequences length", xlab = "CDS length",
     xlim=c(0, max(300,max(as.numeric(tr.3$cds.length), na.rm = TRUE))),
     col = "grey45", las=2)
abline(v=180, col = "red")
abline(v=200, col = "blue")
abline(v=300, col = "green")
axis(1, at=c(180), labels=c(180), las=2, col = c("red"), cex.axis=0.75, col.axis = c("red"))
axis(1, at=c(200), labels=c(200), las=2, col = c("blue"), cex.axis=0.75, col.axis = c("blue"))
axis(1, at=c(300), labels=c(300), las=2, col = c("green"), cex.axis=0.75, col.axis = c("green"))

hist(as.numeric(tr.3_utrorf$tr.length), 
     breaks = seq(0, max(as.numeric(tr.3_utrorf$tr.length), na.rm = TRUE) + 50, 50),
      main = "Desiree evigene transcripts with secondary ORF length", xlab = "transcript length",
     xlim=c(0, max(300,max(as.numeric(tr.3_utrorf$tr.length), na.rm = TRUE))),
     col = "grey90")

hist(as.numeric(tr.3_utrorf$cds.length), 
     breaks = seq(0, max(as.numeric(tr.3_utrorf$cds.length), na.rm = TRUE) + 50, 50),
     main = "Desiree evigene secondary ORF coding sequences length", xlab = "CDS length",
     xlim=c(0, max(300,max(as.numeric(tr.3_utrorf$cds.length), na.rm = TRUE))),
     col = "grey45", las=2)
abline(v=180, col = "red")
abline(v=200, col = "blue")
abline(v=300, col = "green")
axis(1, at=c(180), labels=c(180), las=2, col = c("red"), cex.axis=0.75, col.axis = c("red"))
axis(1, at=c(200), labels=c(200), las=2, col = c("blue"), cex.axis=0.75, col.axis = c("blue"))
axis(1, at=c(300), labels=c(300), las=2, col = c("green"), cex.axis=0.75, col.axis = c("green"))

Match Annot

MA = read.table(file = "../output/intermediate/Desiree/Desiree.tr_DM_STARlong.Aligned.out.sorted.sam.matchAnnot.parsed.txt", 
               header = FALSE, 
               sep = " ", 
               quote = NULL,
               dec = ".", 
               stringsAsFactors = FALSE,
               na.strings = "",
               comment.char = "@")

# dim(MA)
head(MA)

##                 V1             V2                 V3 V4 V5
## 1 CLCdnDe12_152354 no_genes_found               <NA> NA NA
## 2 CLCdnDe12_152354 no_genes_found               <NA> NA NA
## 3  CLCdnDe4_156205 Sotub09g013800 Sotub09g013800.1.1  3  5
## 4  CLCdnDe14_73521 Sotub09g013810 Sotub09g013810.1.1  1  0
## 5  CLCdnDe14_73521 Sotub09g013810 Sotub09g013810.1.1  1  0
## 6  CLCdnDe7_166550 no_genes_found               <NA> NA NA

colnames(MA) = c("ID", "DMgeneID", "DMtrID", "exon_match", "match_score")
MA = MA[order(MA$ID),] 
duplRow = which(duplicated(MA) | duplicated(MA[nrow(MA):1, ])[nrow(MA):1])
MA[duplRow[1:10],]

##                    ID             DMgeneID               DMtrID exon_match
## 70269   CLCdnDe1_1001       no_genes_found                 <NA>         NA
## 70270   CLCdnDe1_1001       no_genes_found                 <NA>         NA
## 267173 CLCdnDe1_10011       Sotub09g013400   Sotub09g013400.1.1          1
## 267174 CLCdnDe1_10011       Sotub09g013400   Sotub09g013400.1.1          1
## 146880 CLCdnDe1_10049       Sotub01g044750   Sotub01g044750.1.1          1
## 146881 CLCdnDe1_10049       Sotub01g044750   Sotub01g044750.1.1          1
## 194974 CLCdnDe1_10203       Sotub04g025620   Sotub04g025620.1.1          1
## 194975 CLCdnDe1_10203       Sotub04g025620   Sotub04g025620.1.1          1
## 309523 CLCdnDe1_10234 PGSC0003DMG400013199 PGSC0003DMT400034331          1
## 309524 CLCdnDe1_10234 PGSC0003DMG400013199 PGSC0003DMT400034331          1
##        match_score
## 70269           NA
## 70270           NA
## 267173           5
## 267174           5
## 146880           5
## 146881           5
## 194974           2
## 194975           2
## 309523           2
## 309524           2

MA = MA[!duplicated(MA),]

hist(MA$match_score, main = 'All Match Annot scores', xlab = "MA score")

data <- data.frame(cbind(seq(1,dim(MA)[1],1),MA))
colnames(data)[1] = "enumerate"
setDT(data)
DT5 = data[ , .(match.score = paste(match_score,
                         collapse=",")), by = ID]

no.gene_score.0 = 
    unlist(sapply(1:length(DT5$match.score), function(i) {
      aa = as.numeric(unlist(strsplit(DT5$match.score[i], ",")))
      if(all(is.na(aa)) | max(aa, na.rm=TRUE) == 0) return(i)
    }))
ind = unlist(matches((unlist(DT5[no.gene_score.0 ,1])), MA[,1],  list =TRUE))
no.gene_score.0 = MA[ind ,]
unique(no.gene_score.0[,5])

## [1] NA  0

MA.2 = MA[-which(MA$match_score == 0),]
MA = MA.2[-which(is.na(MA.2$match_score)),]

# only ones with max score
# uniqueID = unique(MA$ID)
keep = NULL
# to slow!!!
# keep = unlist(
#   sapply(1:length(uniqueID), function(i) {
#     if (!(i %% 100)) print(i)
#       # print(uniqueID[i])
#       ind = matches(uniqueID[i], MA[,1], all.x=FALSE,all.y=FALSE)[,2]
#       mmax = max(MA[ind,5])
#       ind2 = matches(mmax, MA[ind,5], all.x=FALSE,all.y=FALSE)[,2]
#       c(keep,(ind[ind2]))
#     }
#   )
# )

data <- data.frame(cbind(seq(1,dim(MA)[1],1),MA))
colnames(data)[1] = "enumerate"

dd = data %>% group_by(ID) %>% summarize(maxID = paste(enumerate[which(match_score == max(match_score))],  
                                                       collapse=","))



keep = as.numeric(unlist(strsplit(dd$maxID, ",")))
dim(MA)[1] - length(keep)

## [1] 219

MA = MA[keep,]
hist(MA$match_score, main = 'Selected Match Annot scores', xlab = "MA score")

setDT(MA)
setDT(no.gene_score.0)
# dim(MA)
DT1 = MA[ , .(DM.geneID = paste(DMgeneID,
                         collapse="; ")), by = ID]
DT2 = MA[ , .(DM.trID = paste(DMtrID,
                         collapse="; ")), by = ID]
DT3 = MA[ , .(exon.match = paste(exon_match,
                         collapse="; ")), by = ID]
DT4 = MA[ , .(match.score = paste(match_score,
                         collapse="; ")), by = ID]
DT1.1 = no.gene_score.0[ , .(DM.geneID = paste(DMgeneID,
                         collapse="; ")), by = ID]
DT2.1 = no.gene_score.0[ , .(DM.trID = paste(DMtrID,
                         collapse="; ")), by = ID]
DT3.1 = no.gene_score.0[ , .(exon.match = paste(exon_match,
                         collapse="; ")), by = ID]
DT4.1 = no.gene_score.0[ , .(match.score = paste(match_score,
                         collapse="; ")), by = ID]

# dim(MA)

myVec = unique(MA[,1])
myVec.1 = unique(no.gene_score.0[,1])
# dim(myVec)
# dim(myVec.1)
# dim(DT1)
# dim(DT1.1)

myVec = merge(myVec, DT1, by="ID", all.x =TRUE)
myVec = merge(myVec, DT2, by="ID", all.x =TRUE)
myVec = merge(myVec, DT3, by="ID", all.x =TRUE)
myVec = merge(myVec, DT4, by="ID", all.x =TRUE)
myVec.1 = merge(myVec.1, DT1.1, by="ID", all.x =TRUE)
myVec.1 = merge(myVec.1, DT2.1, by="ID", all.x =TRUE)
myVec.1 = merge(myVec.1, DT3.1, by="ID", all.x =TRUE)
myVec.1 = merge(myVec.1, DT4.1, by="ID", all.x =TRUE)

# dim(myVec)
# dim(myVec.1)

no.gene_score.0 = merge(tr.3, myVec.1, all.y =TRUE)

# print(dim(merge(tr.3, myVec, by="ID", all.x =TRUE)))

tr.3 = merge(tr.3, myVec, by="ID", all.x =TRUE)
colnames(myVec)[1] = "ID2"
tr.3_utrorf = merge(tr.3_utrorf, myVec, by="ID2", all.x =TRUE)
# dim(tr.3_utrorf)

IPR

IPR = read.table(file = "../output/intermediate/Desiree/Desiree_IPS_filtered_aggregated_filtered.tsv", 
               header = TRUE, 
               sep = "\t", 
               quote = NULL,
               dec = ".", 
               stringsAsFactors = FALSE,
               comment.char = "@")

# dim(IPR)
colnames(IPR)[1] = c("ID")

# print(dim(merge(tr.3, IPR, by="ID", all.x =TRUE)))

tr.3 = merge(tr.3, IPR, by="ID", all.x = TRUE)
tr.3_utrorf = merge(tr.3_utrorf, IPR, by="ID", all.x = TRUE)
# dim(tr.3_utrorf)
no.gene_score.0 = merge(no.gene_score.0, IPR, all.x = TRUE)

colnames(tr.3)[which(colnames(tr.3) == "Analysis")] = "IPS_Analysis"
colnames(tr.3_utrorf)[which(colnames(tr.3_utrorf) == "Analysis")] = "IPS_Analysis"
colnames(no.gene_score.0)[which(colnames(no.gene_score.0) == "Analysis")] = "IPS_Analysis"

colnames(tr.3)[which(colnames(tr.3) == "Signature_Accession")] = "IPS_Signature_Accession"
colnames(tr.3_utrorf)[which(colnames(tr.3_utrorf) == "Signature_Accession")] = "IPS_Signature_Accession"
colnames(no.gene_score.0)[which(colnames(no.gene_score.0) == "Signature_Accession")] = "IPS_Signature_Accession"

vecscreen

vs = read.table(file = "../output/intermediate/Desiree/Desiree_vecscreen.tsv", 
               header = TRUE, 
               sep = "\t", 
               quote = NULL,
               dec = ".", 
               stringsAsFactors = FALSE,
               comment.char = "@")

# dim(vs)
colnames(vs)[1] = "ID"
length(unique(vs$ID))

## [1] 10813

setDT(vs)
# dim(vs)
DT1 = vs[ , .(coverage = paste(paste0(Matching_vector._starting_with_uv., " [",
                                       Lower_end_of_the_alignment_in_the_vector, "-",
                                       Upper_end_of_the_alignment_in_the_vector, "] ",
                                      coverage, "%"),
                         collapse="; ")), by = ID]
DT2 = vs[ , .(vectorEvidence = paste(paste0(The_strength_of_this_vecscreen_match_, ", ",
                                       The_strength_of_the_strongest_vecscreen_match_for_this_query, ", ",
                                       Whether_there_is_any_dangling_part_.called_.Suspect._by_vecscreen._at_either_end_of_the_query, ", ",
                                       the_classification_of_the_match),
                         collapse="; ")), by = ID]
DT3 = vs[ , .(blastnVector = paste(blastn_desc, collapse="; ")), by = ID]


myVec = unique(vs[,1])
# dim(myVec)
# dim(DT1)

myVec = merge(myVec, DT1, by="ID", all.x =TRUE)
myVec = merge(myVec, DT2, by="ID", all.x =TRUE)
myVec = merge(myVec, DT3, by="ID", all.x =TRUE)



# print(dim(merge(tr.3, myVec, by="ID", all.x =TRUE)))

tr.3 = merge(tr.3, myVec, by="ID", all.x =TRUE)
no.gene_score.0 = merge(no.gene_score.0, myVec, all.x = TRUE)
colnames(myVec)[1] = "ID2"
tr.3_utrorf = merge(tr.3_utrorf, myVec, by="ID2", all.x =TRUE)
# dim(tr.3_utrorf)

colnames(tr.3)[which(colnames(tr.3) == "coverage")] = "VecScreen_coverage"
colnames(tr.3_utrorf)[which(colnames(tr.3_utrorf) == "coverage")] = "VecScreen_coverage"
colnames(no.gene_score.0)[which(colnames(no.gene_score.0) == "coverage")] = "VecScreen_coverage"

DIAMOND BLASTX

(myfiles = list.files(path = "../output/intermediate/Desiree/", pattern = "ENCHformat_top1.tsv"))

## [1] "01_Desiree.cds_solanumTuberosum.out.ENCHformat_top1.tsv"
## [2] "02_Desiree.tr_solanumTuberosum.out.ENCHformat_top1.tsv" 
## [3] "03_Desiree.cds_Solanaceae.out.ENCHformat_top1.tsv"      
## [4] "04_Desiree.tr_Solanaceae.out.ENCHformat_top1.tsv"       
## [5] "05_Desiree.cds_SP_TrEMBL_plants.out.ENCHformat_top1.tsv"
## [6] "06_Desiree.tr_SP_TrEMBL_plants.out.ENCHformat_top1.tsv" 
## [7] "07_Desiree.cds_SwissProt_TrEMBL.out.ENCHformat_top1.tsv"
## [8] "08_Desiree.tr_SwissProt_TrEMBL.out.ENCHformat_top1.tsv"

# watch our for comment.chars # in tables
for (i in myfiles) {
  print(i)
  blast = read.table(file = paste0("../output/intermediate/Desiree/",i), 
               header = TRUE, 
               sep = "\t", 
               quote = NULL,
               dec = ".", 
               stringsAsFactors = FALSE,
               comment.char = "@")
  
  blast = blast[,-1]
  # if (grepl(".tr_", i)) {
  #   remove = which(!grepl("evgclass", blast[,1])) # lost
  #   blast = blast[-remove,]
  # }
  tmp = strsplit(blast[,1], " ")
  tmp = data.table::transpose(tmp)[[1]]
  blast[,1] = tmp
  remove = which(duplicated(blast[,1])) # duplicated
  if (length(remove)) blast = blast[-remove,]
  
  colnames(blast)[1] = "ID"
  blast$Gaps.num = round(blast$Gaps.*blast$Aligned_seq_length/100)
  blast$Query_coverage = (blast$Aligned_seq_length-blast$Gaps.num)/(blast$Query_length/3)
  blast$Target_coverage = (blast$Aligned_seq_length-blast$Gaps.num)/blast$Target_length
  blast$Query_Target_ratio = (blast$Query_length/3)/blast$Target_length
  ind1 = which(blast$Query_coverage >= 0.50)
  ind2 = which(blast$Target_coverage >= 0.50)
  ind = intersect(ind1, ind2)
  blast = blast[ind, ]
  blast$Target_coverage[blast$Target_coverage > 1.0] = 1.0
  blast$Query_coverage[blast$Query_coverage > 1.0] = 1.0
  ind = which(colnames(blast) %in% c("ID", "Target_ID", "Target_description", "Aligned_seq_length", "Target_coverage", "Query_coverage", 
                         "E.value", "Score"))
  blast = blast[, ind]

  
  j = gsub(".out.ENCHformat_top1.tsv", "", i)
  j = gsub("_Desiree", "", j)
  colnames(blast)[2:dim(blast)[2]] = paste0(j, "_", colnames(blast)[2:dim(blast)[2]])
  
  tr.3 = merge(tr.3, blast, by="ID", all.x =TRUE)
  no.gene_score.0 = merge(no.gene_score.0, blast, by="ID", all.x = TRUE)

  if ((grepl(".cds_", i))) {
    tr.3_utrorf = merge(tr.3_utrorf, blast, by="ID", all.x =TRUE)
    # cat (".cds:", dim(merge(tr.3_utrorf, blast, by="ID", all.x =TRUE)), "\n")
    # print(dim(tr.3_utrorf))
  } else {
    colnames(blast)[1] = "ID2"
    tr.3_utrorf = merge(tr.3_utrorf, blast, by="ID2", all.x =TRUE)
    # cat(".tr: ", dim(merge(tr.3_utrorf, blast, by="ID2", all.x =TRUE)), "\n")
    # print(dim(tr.3_utrorf))
  }
    
}

## [1] "01_Desiree.cds_solanumTuberosum.out.ENCHformat_top1.tsv"
## [1] "02_Desiree.tr_solanumTuberosum.out.ENCHformat_top1.tsv"
## [1] "03_Desiree.cds_Solanaceae.out.ENCHformat_top1.tsv"
## [1] "04_Desiree.tr_Solanaceae.out.ENCHformat_top1.tsv"
## [1] "05_Desiree.cds_SP_TrEMBL_plants.out.ENCHformat_top1.tsv"
## [1] "06_Desiree.tr_SP_TrEMBL_plants.out.ENCHformat_top1.tsv"
## [1] "07_Desiree.cds_SwissProt_TrEMBL.out.ENCHformat_top1.tsv"
## [1] "08_Desiree.tr_SwissProt_TrEMBL.out.ENCHformat_top1.tsv"

### save.image("../output/Desiree/Desiree_withFiltering.RData")

save(tr.3, tr.3_utrorf, no.gene_score.0, file = "../output/Desiree/tr3s.RData")

rm(list=ls())
gc()

##           used (Mb) gc trigger   (Mb)  max used   (Mb)
## Ncells  939040 50.2    7421482  396.4   9276852  495.5
## Vcells 3729369 28.5  132539537 1011.2 165508274 1262.8

load("../output/Desiree/tr3s.RData")

library(data.table)

library(tidyverse)

library(grr)

write

i <- sapply(tr.3, is.factor) 
tr.3[i] <- lapply(tr.3[i], as.character) 
tr.3[is.na(tr.3)] = NA
tr.3[tr.3 == ""] = NA

i <- sapply(no.gene_score.0, is.factor) 
no.gene_score.0[i] <- lapply(no.gene_score.0[i], as.character) 
no.gene_score.0[is.na(no.gene_score.0)] = NA
no.gene_score.0[no.gene_score.0 == ""] = NA
# write.table(no.gene_score.0, "../output/Desiree/no.gene_score.0_all.tsv", 
#             sep = "\t", row.names = FALSE,
#             quote = FALSE)

hist(as.numeric(no.gene_score.0$cds.length), 
     breaks = seq(0, max(as.numeric(no.gene_score.0$cds.length), na.rm = TRUE) + 50, 50),
     main = "Desiree evigene MA score NA/0 CDS sequence length", xlab = "CDS length",
     xlim=c(0, max(300,max(as.numeric(no.gene_score.0$cds.length), na.rm = TRUE))),
     col = "grey45", las=2)
abline(v=180, col = "red")
abline(v=200, col = "blue")
abline(v=300, col = "green")
axis(1, at=c(180), labels=c(180), las=2, col = c("red"), cex.axis=0.75, col.axis = c("red"))
axis(1, at=c(200), labels=c(200), las=2, col = c("blue"), cex.axis=0.75, col.axis = c("blue"))
axis(1, at=c(300), labels=c(300), las=2, col = c("green"), cex.axis=0.75, col.axis = c("green"))

table(as.numeric(no.gene_score.0$cds.length) >= 300)

## 
## FALSE  TRUE 
## 83136 20351

table(as.numeric(no.gene_score.0$cds.length) >= 200)

## 
## FALSE  TRUE 
## 70056 33431

table(as.numeric(no.gene_score.0$cds.length) >= 180)

## 
## FALSE  TRUE 
## 62979 40508

i <- sapply(tr.3_utrorf, is.factor) 
tr.3_utrorf[i] <- lapply(tr.3_utrorf[i], as.character) 
tr.3_utrorf[is.na(tr.3_utrorf)] = NA
tr.3_utrorf[tr.3_utrorf == ""] = NA

keep or remove

# colnames(tr.3_utrorf)

okay = which(!is.na(tr.3_utrorf$DM.trID))
length(okay)

## [1] 84

print("check if motif is in contamination")

## [1] "check if motif is in contamination"

okay = c(okay, which(!is.na(tr.3_utrorf$IPS_Analysis)))
length(okay)

## [1] 2447

ind = grep("Target_ID", colnames(tr.3_utrorf))

okay = c(okay, 
         which(!is.na(tr.3_utrorf[,ind[1]])),
         which(!is.na(tr.3_utrorf[,ind[2]])),
         which(!is.na(tr.3_utrorf[,ind[3]])),
         which(!is.na(tr.3_utrorf[,ind[4]])),
         which(!is.na(tr.3_utrorf[,ind[5]])),
         which(!is.na(tr.3_utrorf[,ind[6]])))

# which(as.numeric(tr.3_utrorf$cds.length) >= 180)
# which(as.numeric(tr.3_utrorf$cds.length) < 180)

okay = sort(unique(okay))
length(okay)

## [1] 3147

tr.3_utrorf.annot = tr.3_utrorf[okay,]
tr.3_utrorf.unknown = tr.3_utrorf[-okay,]

hist(as.numeric(tr.3_utrorf.annot$tr.length), 
     breaks = seq(0, max(as.numeric(tr.3_utrorf.annot$tr.length), na.rm = TRUE) + 50, 50),
      main = "OK Desiree evigene transcript with secondary ORF length", xlab = "transcript length",
     xlim=c(0, max(300,max(as.numeric(tr.3_utrorf.annot$tr.length), na.rm = TRUE))))

hist(as.numeric(tr.3_utrorf.annot$cds.length), 
     breaks = seq(0, max(as.numeric(tr.3_utrorf.annot$cds.length), na.rm = TRUE) + 50, 50),
     main = "OK Desiree evigene secondary ORF coding sequence length", xlab = "CDS length",
     xlim=c(0, max(300,max(as.numeric(tr.3_utrorf.annot$cds.length), na.rm = TRUE))),
     col = "grey45", las=2)
abline(v=180, col = "red")
abline(v=200, col = "blue")
abline(v=300, col = "green")
axis(1, at=c(180), labels=c(180), las=2, col = c("red"), cex.axis=0.75, col.axis = c("red"))
axis(1, at=c(200), labels=c(200), las=2, col = c("blue"), cex.axis=0.75, col.axis = c("blue"))
axis(1, at=c(300), labels=c(300), las=2, col = c("green"), cex.axis=0.75, col.axis = c("green"))

table(as.numeric(tr.3_utrorf.annot$cds.length) >= 300)

## 
## FALSE  TRUE 
##     2  3145

table(as.numeric(tr.3_utrorf.annot$cds.length) >= 200)

## 
## TRUE 
## 3147

table(as.numeric(tr.3_utrorf.annot$cds.length) >= 180)

## 
## TRUE 
## 3147

hist(as.numeric(tr.3_utrorf.unknown$tr.length), 
     breaks = seq(0, max(as.numeric(tr.3_utrorf.unknown$tr.length), na.rm = TRUE) + 50, 50),
      main = "notOK Desiree evigene transcript with secondary ORF length", xlab = "transcript length",
     xlim=c(0, max(300,max(as.numeric(tr.3_utrorf.unknown$tr.length), na.rm = TRUE))))

hist(as.numeric(tr.3_utrorf.unknown$cds.length), 
     breaks = seq(0, max(as.numeric(tr.3_utrorf.unknown$cds.length), na.rm = TRUE) + 50, 50),
     main = "notOK Desiree evigene secondary ORF coding sequence length", xlab = "CDS length",
     xlim=c(0, max(300,max(as.numeric(tr.3_utrorf.unknown$cds.length), na.rm = TRUE))),
     col = "grey45", las=2)
abline(v=180, col = "red")
abline(v=200, col = "blue")
abline(v=300, col = "green")
axis(1, at=c(180), labels=c(180), las=2, col = c("red"), cex.axis=0.75, col.axis = c("red"))
axis(1, at=c(200), labels=c(200), las=2, col = c("blue"), cex.axis=0.75, col.axis = c("blue"))
axis(1, at=c(300), labels=c(300), las=2, col = c("green"), cex.axis=0.75, col.axis = c("green"))

table(as.numeric(tr.3_utrorf.unknown$cds.length) >= 300)

## 
## FALSE  TRUE 
##     2   868

table(as.numeric(tr.3_utrorf.unknown$cds.length) >= 200)

## 
## TRUE 
##  870

table(as.numeric(tr.3_utrorf.unknown$cds.length) >= 180)

## 
## TRUE 
##  870

# shorty = tr.3_utrorf.annot[which(as.numeric(tr.3_utrorf.annot$cds.length) < 300),]

# weight =  ă¤Ľă¸¶2.1 + 0.02 ćĽă¸· Similarity% (or Positives%)) + 0.01 ćĽă¸· Coverage% > 0
# S ... Similarity% (BLAST-tsv file: Positives%) 
# C ... Coverage% = Aligned_seq_length / minimum(Query_length/1 OR 3 for prot, Target_length) 


tr.3_utrorf.annot[is.na(tr.3_utrorf.annot)] = "-"
tr.3_utrorf.unknown[is.na(tr.3_utrorf.unknown)] = "-"

# write.table(tr.3_utrorf.annot, file = "../output/Desiree/Desiree_tr.3_utrorf_annot_all.tsv", 
#             append = FALSE, quote = FALSE, sep = "\t",
#             eol = "\n", na = "NA", dec = ".", row.names = FALSE,
#             col.names = TRUE, qmethod = c("escape", "double"),
#             fileEncoding = "")

# write.table(tr.3_utrorf.unknown, file = "../output/Desiree/remove/Desiree_tr.3_utrorf_unknown.tsv", 
#             append = FALSE, quote = FALSE, sep = "\t",
#             eol = "\n", na = "NA", dec = ".", row.names = FALSE,
#             col.names = TRUE, qmethod = c("escape", "double"),
#             fileEncoding = "")

# write.table(tr.3_utrorf.annot$ID, file = "../output/Desiree/Desiree_tr.3_utrorf_annot_IDs_all.tsv", 
#             append = FALSE, quote = FALSE, sep = "\t",
#             eol = "\n", na = "NA", dec = ".", row.names = FALSE,
#             col.names = FALSE, qmethod = c("escape", "double"),
#             fileEncoding = "")

write.table(tr.3_utrorf.unknown$ID, file = "../output/Desiree/Desiree_tr.3_utrorf_drop_IDs.tsv",
            append = FALSE, quote = FALSE, sep = "\t",
            eol = "\n", na = "NA", dec = ".", row.names = FALSE,
            col.names = FALSE, qmethod = c("escape", "double"),
            fileEncoding = "")


tr.3_utrorf.annot$status="discard"
tr.3_utrorf.unknown$status="discard"

ind = which(tr.3_utrorf.annot$tr.pass == "okay")

# write.table(tr.3_utrorf.annot[ind,], file = "../output/Desiree/keep/Desiree_tr.3_utrorf_annot_keep.tsv", 
#             append = FALSE, quote = FALSE, sep = "\t",
#             eol = "\n", na = "NA", dec = ".", row.names = FALSE,
#             col.names = TRUE, qmethod = c("escape", "double"),
#             fileEncoding = "")
tr.3_utrorf.annot[ind,]$status = "keep"

write.table(tr.3_utrorf.annot$ID[ind], file = "../output/Desiree/Desiree_tr.3_utrorf_keep_IDs.tsv",
            append = FALSE, quote = FALSE, sep = "\t",
            eol = "\n", na = "NA", dec = ".", row.names = FALSE,
            col.names = FALSE, qmethod = c("escape", "double"),
            fileEncoding = "")

# write.table(tr.3_utrorf.annot[-ind,], file = "../output/Desiree/remove/Desiree_tr.3_utrorf_annot_drop.tsv", 
#             append = FALSE, quote = FALSE, sep = "\t",
#             eol = "\n", na = "NA", dec = ".", row.names = FALSE,
#             col.names = TRUE, qmethod = c("escape", "double"),
#             fileEncoding = "")

tr.3_utrorf.annot[-ind,]$status = "discard"

write.table(tr.3_utrorf.annot$ID[-ind], file = "../output/Desiree/Desiree_tr.3_utrorf_drop_IDs.tsv", 
            append = TRUE, quote = FALSE, sep = "\t",
            eol = "\n", na = "NA", dec = ".", row.names = FALSE,
            col.names = FALSE, qmethod = c("escape", "double"),
            fileEncoding = "")

# colnames(tr.3)

okay = which(!is.na(tr.3$DM.trID))
length(okay)

## [1] 128828

okay = c(okay, which(!is.na(tr.3$IPS_Analysis)))
length(okay)

## [1] 293246

ind = grep("Target_ID", colnames(tr.3))

okay = c(okay, 
         which(!is.na(tr.3[,ind[1]])),
         which(!is.na(tr.3[,ind[2]])),
         which(!is.na(tr.3[,ind[3]])),
         which(!is.na(tr.3[,ind[4]])),
         which(!is.na(tr.3[,ind[5]])),
         which(!is.na(tr.3[,ind[6]])))

# which(as.numeric(tr.3$cds.length) >= 180)
# which(as.numeric(tr.3$cds.length) < 180)

okay = sort(unique(okay))
length(okay)

## [1] 216880

tr.3.annot = tr.3[okay,]
tr.3.unknown = tr.3[-okay,]

tr.3.annot$status="keep"
tr.3.unknown$status="discard"

hist(as.numeric(tr.3.annot$tr.length), 
     breaks = seq(0, max(as.numeric(tr.3.annot$tr.length), na.rm = TRUE) + 50, 50),
      main = "OK Desiree evigene transcript length", xlab = "transcript length",
     xlim=c(0, max(300,max(as.numeric(tr.3.annot$tr.length), na.rm = TRUE))))

hist(as.numeric(tr.3.annot$cds.length), 
     breaks = seq(0, max(as.numeric(tr.3.annot$cds.length), na.rm = TRUE) + 50, 50),
     main = "OK Desiree evigene coding sequence length", xlab = "CDS length",
     xlim=c(0, max(300,max(as.numeric(tr.3.annot$cds.length), na.rm = TRUE))))
abline(v=180, col = "red")
abline(v=200, col = "blue")
abline(v=300, col = "green")

table(as.numeric(tr.3.annot$cds.length) >= 300)

## 
##  FALSE   TRUE 
##  45591 171289

table(as.numeric(tr.3.annot$cds.length) >= 200)

## 
##  FALSE   TRUE 
##  31936 184944

table(as.numeric(tr.3.annot$cds.length) >= 180)

## 
##  FALSE   TRUE 
##  27331 189549

hist(as.numeric(tr.3.unknown$tr.length), 
     breaks = seq(0, max(as.numeric(tr.3.unknown$tr.length), na.rm = TRUE) + 50, 50),
      main = "notOK Desiree evigene transcript with secondary ORF length", xlab = "transcript length",
     xlim=c(0, max(300,max(as.numeric(tr.3.unknown$tr.length), na.rm = TRUE))))

hist(as.numeric(tr.3.unknown$cds.length), 
     breaks = seq(0, max(as.numeric(tr.3.unknown$cds.length), na.rm = TRUE) + 50, 50),
     main = "notOK Desiree evigene secondary ORF coding sequence length", xlab = "CDS length",
     xlim=c(0, max(300,max(as.numeric(tr.3.unknown$cds.length), na.rm = TRUE))),     
     col = "grey45", las=2)
abline(v=180, col = "red")
abline(v=200, col = "blue")
abline(v=300, col = "green")
axis(1, at=c(180), labels=c(180), las=2, col = c("red"), cex.axis=0.75, col.axis = c("red"))
axis(1, at=c(200), labels=c(200), las=2, col = c("blue"), cex.axis=0.75, col.axis = c("blue"))
axis(1, at=c(300), labels=c(300), las=2, col = c("green"), cex.axis=0.75, col.axis = c("green"))

table(as.numeric(tr.3.unknown$cds.length) >= 300)

## 
##  FALSE   TRUE 
## 112820  16554

table(as.numeric(tr.3.unknown$cds.length) >= 200)

## 
## FALSE  TRUE 
## 93529 35845

table(as.numeric(tr.3.unknown$cds.length) >= 180)

## 
## FALSE  TRUE 
## 83420 45954

tr.3.annot[is.na(tr.3.annot)] = "-"
tr.3.unknown[is.na(tr.3.unknown)] = "-"

# write.table(tr.3.annot, file = "../output/Desiree/keep/Desiree_tr.3_annot.tsv", 
#             append = FALSE, quote = FALSE, sep = "\t",
#             eol = "\n", na = "NA", dec = ".", row.names = FALSE,
#             col.names = TRUE, qmethod = c("escape", "double"),
#             fileEncoding = "")
# 
# write.table(tr.3.unknown, file = "../output/Desiree/remove/Desiree_tr.3_unknown.tsv", 
#             append = FALSE, quote = FALSE, sep = "\t",
#             eol = "\n", na = "NA", dec = ".", row.names = FALSE,
#             col.names = TRUE, qmethod = c("escape", "double"),
#             fileEncoding = "")

write.table(tr.3.annot$ID, file = "../output/Desiree/Desiree_tr.3_keep_IDs.tsv", 
            append = FALSE, quote = FALSE, sep = "\t",
            eol = "\n", na = "NA", dec = ".", row.names = FALSE,
            col.names = FALSE, qmethod = c("escape", "double"),
            fileEncoding = "")

write.table(tr.3.unknown$ID, file = "../output/Desiree/Desiree_tr.3_drop_IDs.tsv", 
            append = FALSE, quote = FALSE, sep = "\t",
            eol = "\n", na = "NA", dec = ".", row.names = FALSE,
            col.names = FALSE, qmethod = c("escape", "double"),
            fileEncoding = "")

combo table

tr.3.unknown$ID2 = tr.3.unknown$ID
tr.3.annot$ID2 = tr.3.annot$ID
orderC = match(colnames(tr.3_utrorf.annot), colnames(tr.3.annot))
combo = rbind(tr.3.unknown[,orderC], tr.3_utrorf.unknown, tr.3_utrorf.annot, tr.3.annot[,orderC])
nrow(combo)==nrow(tr.3_utrorf)+nrow(tr.3)

## [1] TRUE

colnames(combo)[1:2]=c("tr.ID", "cds.ID")

write.table(combo, file = "../output/Desiree/Desiree_tr.cds.tsv", 
            append = FALSE, quote = FALSE, sep = "\t",
            eol = "\n", na = "NA", dec = ".", row.names = FALSE,
            col.names = TRUE, qmethod = c("escape", "double"),
            fileEncoding = "")

# colnames(tr.3.annot)

take = c(which(is.na(no.gene_score.0$IPS_Analysis)))
length(take)

## [1] 90148

ind = grep("Target_ID", colnames(no.gene_score.0))

take = intersect(take, (
         which(is.na(no.gene_score.0[,ind[1]]) &
         (is.na(no.gene_score.0[,ind[2]])) &
         (is.na(no.gene_score.0[,ind[3]])) &
         (is.na(no.gene_score.0[,ind[4]])) &
         (is.na(no.gene_score.0[,ind[5]])) &
         (is.na(no.gene_score.0[,ind[6]])))))


take = sort(unique(take))
length(take)

## [1] 86528

no.gene_score.0 = no.gene_score.0[take,]
no.gene_score.0 = no.gene_score.0[, 1:which(colnames(no.gene_score.0) == "blastnVector")]
no.gene_score.0 = no.gene_score.0[, -c(grep("IPR", colnames(no.gene_score.0)), grep("IPS", colnames(no.gene_score.0)))]

write.table(no.gene_score.0, file = "../output/Desiree/Desiree_justDMmapped_score.0.tsv", 
            append = FALSE, quote = FALSE, sep = "\t",
            eol = "\n", na = "NA", dec = ".", row.names = FALSE,
            col.names = TRUE, qmethod = c("escape", "double"),
            fileEncoding = "")

sessionInfo()

## R version 3.6.1 (2019-07-05)
## Platform: x86_64-w64-mingw32/x64 (64-bit)
## Running under: Windows 10 x64 (build 17134)
## 
## Matrix products: default
## 
## locale:
## [1] LC_COLLATE=Slovenian_Slovenia.1250  LC_CTYPE=Slovenian_Slovenia.1250   
## [3] LC_MONETARY=Slovenian_Slovenia.1250 LC_NUMERIC=C                       
## [5] LC_TIME=Slovenian_Slovenia.1250    
## 
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods   base     
## 
## other attached packages:
##  [1] grr_0.9.5         forcats_0.4.0     stringr_1.4.0    
##  [4] dplyr_0.8.3       purrr_0.3.2       readr_1.3.1      
##  [7] tidyr_0.8.3       tibble_2.1.3      ggplot2_3.2.1    
## [10] tidyverse_1.2.1   data.table_1.12.2
## 
## loaded via a namespace (and not attached):
##  [1] Rcpp_1.0.2       cellranger_1.1.0 pillar_1.4.2     compiler_3.6.1  
##  [5] tools_3.6.1      zeallot_0.1.0    digest_0.6.20    lubridate_1.7.4 
##  [9] jsonlite_1.6     evaluate_0.14    nlme_3.1-141     gtable_0.3.0    
## [13] lattice_0.20-38  pkgconfig_2.0.2  rlang_0.4.0      cli_1.1.0       
## [17] rstudioapi_0.10  yaml_2.2.0       haven_2.1.1      xfun_0.8        
## [21] withr_2.1.2      xml2_1.2.2       httr_1.4.1       knitr_1.24      
## [25] vctrs_0.2.0      hms_0.5.1        generics_0.0.2   grid_3.6.1      
## [29] tidyselect_0.2.5 glue_1.3.1       R6_2.4.0         readxl_1.3.1    
## [33] rmarkdown_1.14   modelr_0.1.5     magrittr_1.5     backports_1.1.4 
## [37] scales_1.0.0     htmltools_0.3.6  rvest_0.3.4      assertthat_0.2.1
## [41] colorspace_1.4-1 stringi_1.4.3    lazyeval_0.2.2   munsell_0.5.0   
## [45] broom_0.5.2      crayon_1.3.4

rm(list=ls())
gc()

##           used (Mb) gc trigger  (Mb)  max used   (Mb)
## Ncells  942843 50.4    4749749 253.7   9276852  495.5
## Vcells 3715083 28.4  127406989 972.1 165508274 1262.8

IDs_withFiltering

zagor

19 julij 2019

Desiree

IDs, evg headers and seq lenghts

utrorfs

lenghts

Match Annot

IPR

vecscreen

DIAMOND BLASTX

write

keep or remove