## ---- snpAnnotation ----
# Build the annotated SNP table for one patient and render it as an HTML table.
#
# Takes no arguments. Everything is read from objects that must already exist
# in the calling environment:
#   config        - list of paths: SNPIR_RESULT, INTOGEN_RESULTS, COSMIC,
#                   CLINVAR, PHARMGKB_rsID, PHARMGKB_Allele, DRUGBANK
#   patientID     - sub-directory name below the SNPIR / INTOGEN result paths
#   diffExpNBinom - object whose names(...$FCdiff) are the differentially
#                   expressed genes
# Requires data.table and plyr, plus a kable() implementation that is not
# loaded here (presumably knitr, attached by the report -- confirm).
#
# Steps: parse the VCF (alleles, genotype, snpEff EFF field), then annotate the
# per-position table with IntOGen, COSMIC, ClinVar, PharmGKB and DrugBank data,
# combine all annotations per CHROM/POS, flag differentially expressed genes
# and hand the final table to kable().
#
# NOTE(review): many steps select columns by position, so they depend on the
# exact column layout of the input files; changing any input schema requires
# re-checking those indices.
#
# Returns the value of kable(finalSNPs, format='html', ...).
snpAnnotation <- function(){

  ## Local helpers
  # Collapse the unique non-NA values of a group into one string; every value
  # is followed by a single space (stripped again later where needed)
  collapseUnique <- function(x) paste(unique(x[!is.na(x)]), " ", sep="", collapse="")

  # Remove the trailing spaces left behind by collapseUnique from every column
  # (note: this turns every column into character)
  stripTrailingSpaces <- function(df){
    for(i in seq_len(ncol(df))){
      df[,i] <- gsub("\\>\\s+$", "", df[,i])
    }
    df
  }

  # Give an empty annotation table a single placeholder row of "fake" values so
  # that rbind.fill() still contributes its columns; the row is removed again
  # after the final aggregation
  padEmptyAnnotation <- function(df){
    if(nrow(df)==0){
      for(i in seq_len(ncol(df))){
        df[1,i] <- "fake"
      }
    }
    df
  }


  ### Import files
  # import vcf
  snpVCF <- read.csv(paste0(config$SNPIR_RESULT, '/', patientID, '/intogen_input.vcf'), stringsAsFactors=FALSE, header=FALSE, sep='\t', comment.char="#")

  # import Intogen consequences and variant_genes
  intogenSNPs <- read.csv2(paste0(config$INTOGEN_RESULTS, '/', patientID, '/consequences.tsv'), stringsAsFactors=FALSE, header=TRUE, sep="\t", na.strings="", as.is=TRUE)
  intogenGenes <- read.csv2(paste0(config$INTOGEN_RESULTS, '/', patientID, '/variant_genes.tsv'), stringsAsFactors=FALSE, header=TRUE, sep="\t", na.strings="", as.is=TRUE)

  # import COSMIC complete export
  CosmicCompleteExport <- read.csv2(config$COSMIC, stringsAsFactors=FALSE, sep='\t')

  # import ClinVar variant summary
  clinVarSummary <- read.csv2(config$CLINVAR, stringsAsFactors=FALSE, sep='\t')

  # import pharmGkb clinical annotations per rsID and info for each SNP
  pharmgkbRSID <- read.csv2(config$PHARMGKB_rsID, stringsAsFactors=FALSE, sep=',')
  pharmgkbAllele <- read.csv2(config$PHARMGKB_Allele, stringsAsFactors=FALSE, sep='\t')

  # import DrugBank stable data file
  drugBankTable <- read.csv2(config$DRUGBANK, stringsAsFactors=FALSE, sep='\t')

  # import CADD data for all possible SNPs
  # cadd <- read.csv2(config$CADD, stringsAsFactors=FALSE, sep='\t')


  ### Format VCF
  # Rename columns
  names(snpVCF) <- c("CHROM", "POS", "rsID", "REF", "ALT", "QUAL", "FILTER", "INFO", "FORMAT", "SAMPLE")

  # Remove chr from chromosome column
  snpVCF$CHROM <- gsub("chr", "", snpVCF$CHROM)

  # Create Alternative allele columns for matching
  snpVCF$ALT1 <- sapply(strsplit(as.character(snpVCF$ALT), ","), "[", 1)
  snpVCF$ALT2 <- sapply(strsplit(as.character(snpVCF$ALT), ","), "[", 2)
  snpVCF$ALT3 <- sapply(strsplit(as.character(snpVCF$ALT), ","), "[", 3)

  ## Create Genotype allele columns for pharmGkb matching
  # The genotype is the first ':'-separated field of each SAMPLE entry. It is
  # extracted row by row so that the number of FORMAT fields does not matter
  # (stepping through a flattened vector in strides of 5 misaligns every row
  # as soon as one entry has a different number of fields).
  sampleFields <- strsplit(as.character(snpVCF$SAMPLE), ":", fixed=TRUE)
  genotype <- vapply(sampleFields, function(f) f[1], character(1))
  # Accept unphased ("0/1") and phased ("0|1") genotypes; a missing second
  # allele becomes NA
  genoAlleles <- strsplit(genotype, "[/|]")
  snpVCF$allele1value <- vapply(genoAlleles, function(a) a[1], character(1))
  snpVCF$allele2value <- vapply(genoAlleles, function(a) a[2], character(1))

  # Fill Genotype allele columns and Delete allele value columns
  snpVCF$allele1 <- ifelse(snpVCF$allele1value == 0, snpVCF$REF,
                             ifelse(snpVCF$allele1value == 1, snpVCF$ALT1,
                                    ifelse(snpVCF$allele1value == 2, snpVCF$ALT2,
                                           ifelse(snpVCF$allele1value == 3, snpVCF$ALT3, NA))))
  snpVCF$allele2 <- ifelse(snpVCF$allele2value == 0, snpVCF$REF,
                             ifelse(snpVCF$allele2value == 1, snpVCF$ALT1,
                                    ifelse(snpVCF$allele2value == 2, snpVCF$ALT2,
                                           ifelse(snpVCF$allele2value == 3, snpVCF$ALT3, NA))))
  snpVCF$allele1value <- NULL
  snpVCF$allele2value <- NULL


  ### Label Mutation Type
  # Create EFF column in vcf with single EFF for each snp
  # library() (not require()) so that a missing package stops here with a clear error
  library(data.table)
  # NOTE(review): regmatches() drops rows whose INFO has no "EFF=...)" entry,
  # so this assignment fails if any record lacks snpEff annotation
  snpVCF$EFF <- gsub("EFF=|;.*$", "", regmatches(snpVCF$INFO, regexpr("EFF=.*\\)", snpVCF$INFO)))
  dt <- data.table(snpVCF)
  # Expand to one row per effect. Group and merge on CHROM *and* POS: keying on
  # POS alone mixes the effects of SNPs that share a coordinate on different
  # chromosomes.
  snpVCFexpanded <- dt[, list(EFF = unlist(strsplit(EFF, ","))), by = list(CHROM, POS)]
  snpVCF <- merge(snpVCFexpanded, snpVCF, by=c("CHROM", "POS"))
  snpVCF$EFF.y <- NULL
  setnames(snpVCF, "EFF.x", "EFF")
  snpVCF <- as.data.frame(snpVCF)
  # Fixed column order; the positional indices used below rely on it
  snpVCF <- snpVCF[,c("CHROM", "POS", "rsID", "REF", "ALT", "QUAL", "FILTER", "INFO", "FORMAT", "SAMPLE", "ALT1", "ALT2", "ALT3", "allele1", "allele2", "EFF")]

  # Divide EFF info into separate columns
  snpVCF$Effect_type <- gsub("\\(", "", regmatches(snpVCF$EFF, regexpr("^.*\\(", snpVCF$EFF)))
  snpVCF$Effect_info <- gsub("\\(|\\)", "", regmatches(snpVCF$EFF, regexpr("\\(.*\\)", snpVCF$EFF)))
  snpVCF$Effect_impact <- sapply(strsplit(as.character(snpVCF$Effect_info), "\\|"), "[", 1)
  snpVCF$Functional_class <- sapply(strsplit(as.character(snpVCF$Effect_info), "\\|"), "[", 2)
  snpVCF$Codon_change <- sapply(strsplit(as.character(snpVCF$Effect_info), "\\|"), "[", 3)
  snpVCF$AA_change <- sapply(strsplit(as.character(snpVCF$Effect_info), "\\|"), "[", 4)
  snpVCF$Gene <- sapply(strsplit(as.character(snpVCF$Effect_info), "\\|"), "[", 6)
  snpVCF$Coding <- sapply(strsplit(as.character(snpVCF$Effect_info), "\\|"), "[", 8)

  # Aggregate Codon_change and AA_change (columns 21, 22) per CHROM, POS, Gene
  # (columns 1, 2, 23) and merge back with snpVCF
  snpVCFchangesAgged <- aggregate(snpVCF[,c(21,22)], snpVCF[,c(1,2,23)], collapseUnique)
  snpVCFchangesAgged <- merge(snpVCF, snpVCFchangesAgged, by=c("CHROM", "POS", "Gene"))

  # Format snpVCFchangesAgged and create final Effect column
  snpVCFchangesAgged$Codon_change.x <- NULL
  snpVCFchangesAgged$AA_change.x <- NULL
  setnames(snpVCFchangesAgged, "Codon_change.y", "Codon_change")
  setnames(snpVCFchangesAgged, "AA_change.y", "AA_change")
  snpVCFchangesAgged$Effect <- paste(snpVCFchangesAgged$Effect_impact, snpVCFchangesAgged$Functional_class, snpVCFchangesAgged$Effect_type, snpVCFchangesAgged$Codon_change, snpVCFchangesAgged$AA_change, sep = " ")
  snpVCFchangesAgged$Effect <- gsub("\\>\\s*$", "", snpVCFchangesAgged$Effect)
  snpVCFchangesAgged$Effect <- paste(snpVCFchangesAgged$Effect, ";  ", sep="")
  snpVCFchangesAgged$Effect <- gsub("\\s+\\<", ", ", snpVCFchangesAgged$Effect)
  snpVCFchangesAgged$Effect <- paste(snpVCFchangesAgged$Gene, snpVCFchangesAgged$Effect, sep=": ")

  # Aggregate by position to give final VCF
  # (columns: rsID, Gene, Effect, then REF..allele2, grouped by CHROM, POS)
  snpVCFchangesAgged <- snpVCFchangesAgged[order(snpVCFchangesAgged$Effect_impact, snpVCFchangesAgged$Coding),]
  snpVCFbyPos <- aggregate(snpVCFchangesAgged[,c(4, 3, 25, 5:16)], snpVCFchangesAgged[,c(1,2)], collapseUnique)

  # Create separate gene columns
  snpVCFbyPos$GENE1 <- sapply(strsplit(as.character(snpVCFbyPos$Gene), "\\s"), "[", 1)
  snpVCFbyPos$GENE2 <- sapply(strsplit(as.character(snpVCFbyPos$Gene), "\\s"), "[", 2)
  snpVCFbyPos$GENE3 <- sapply(strsplit(as.character(snpVCFbyPos$Gene), "\\s"), "[", 3)
  snpVCFbyPos$GENE4 <- sapply(strsplit(as.character(snpVCFbyPos$Gene), "\\s"), "[", 4)
  snpVCFbyPos$GENE5 <- sapply(strsplit(as.character(snpVCFbyPos$Gene), "\\s"), "[", 5)

  # Remove hanging spaces created by aggregate function
  snpVCFbyPos <- stripTrailingSpaces(snpVCFbyPos)

  # Format gene column and order VCF by chromosome and position
  snpVCFbyPos$Gene <- gsub("\\>\\s+\\<", "; ", snpVCFbyPos$Gene)
  snpVCFbyPos <- snpVCFbyPos[order(snpVCFbyPos$CHROM, snpVCFbyPos$POS),]


  ### Intogen Annotation
  ## Annotate with intogen_consequences.tsv (mainly for gene names and mutation type)
  # Format names and ALLELE column for matching
  names(intogenSNPs)[2] <- "CHROM"
  names(intogenSNPs)[4] <- "POS"
  intogenSNPs$Ref <- sapply(strsplit(as.character(intogenSNPs$ALLELE), "/"), "[", 1)
  intogenSNPs$Alt <- sapply(strsplit(as.character(intogenSNPs$ALLELE), "/"), "[", 2)


  # Merge by Chromosome and Position
  intogenSNPsAnno <- merge(snpVCFbyPos, intogenSNPs, by=c("CHROM", "POS"))

  # Subset to double check alleles and select only GENE_ID from intogenSNPs
  intogenSNPsAnno <- subset(intogenSNPsAnno, ((REF==Ref) & ((ALT1==Alt) | (ALT2==Alt) | (ALT3==Alt))), select = c(CHROM:GENE5, GENE_ID))

  ## Annotate with intogen_variant_genes.tsv based on Ensembl Gene ID (mainly for driver and impact)
  # Remove first five columns
  intogenGenes <- intogenGenes[,6:16]

  # Merge by Ensembl Gene ID
  intogenGenesAnno <- merge(intogenSNPsAnno, intogenGenes, by="GENE_ID")

  # Create driver "yes or no" column
  intogenGenesAnno$Intogen_driver <- ifelse(intogenGenesAnno$INTOGEN_DRIVER == 0, "No",
                              ifelse(intogenGenesAnno$INTOGEN_DRIVER == 1, "Yes", NA))

  # Replace gene names that are NA with Ensembl ID
  # (vectorised so that an empty table is handled without indexing row 1)
  missingSymbol <- is.na(intogenGenesAnno$SYMBOL)
  intogenGenesAnno$SYMBOL[missingSymbol] <- intogenGenesAnno$GENE_ID[missingSymbol]

  # Paste columns to create "intogen_info" column
  intogenGenesAnno$Intogen_info <- paste(intogenGenesAnno$SYMBOL, ": ", "Variant Impact: ", intogenGenesAnno$VAR_IMPACT_DESC, ", Impact Score: ", intogenGenesAnno$VAR_IMPACT, ";", sep="")

  # Aggregate by chromosome and position, keeping only intogen_info and driver columns
  intogenGenesAnno <- aggregate(intogenGenesAnno[,c(4:23, 34, 35)], intogenGenesAnno[,c(2,3)], collapseUnique)

  ## Format for future
  # Order by chromosome and position
  finalIntogenSNPs <- intogenGenesAnno[order(intogenGenesAnno$CHROM, intogenGenesAnno$POS),]
  # Remove hanging spaces created by aggregate function
  finalIntogenSNPs <- stripTrailingSpaces(finalIntogenSNPs)


  ### COSMIC Annotation
  #Create new dataframe with relevant information from Cosmic and trim gene names with _EN...
  CosmicVariants <- subset(CosmicCompleteExport, Mutation.GRCh37.genome.position != "", select = c(Mutation.GRCh37.genome.position, Pubmed_PMID, Gene.name, Mutation.CDS, ID_sample, Sample.name, Primary.site, Site.subtype, Primary.histology, Histology.subtype, Sample.source, Comments))
  CosmicVariants[,3] <- gsub("_EN.*$", "", CosmicVariants[,3])

  #Convert position format for merge. Merge. Remove extra column used for merging. Double check mutation using subset
  snpVCFbyPos$CosmicFormat <- paste(snpVCFbyPos$CHROM, ":", snpVCFbyPos$POS, "-", snpVCFbyPos$POS, sep="")
  snpsInCosmic <- merge(snpVCFbyPos, CosmicVariants, by.x = "CosmicFormat", by.y = "Mutation.GRCh37.genome.position")
  snpVCFbyPos$CosmicFormat <- NULL
  snpsInCosmic$CosmicFormat <- NULL
  # The mutated base parsed from Mutation.CDS has to be stored as altAllele1:
  # the complement lookup and the subset below both read that column
  snpsInCosmic$altAllele1 <- sapply(strsplit(as.character(snpsInCosmic$Mutation.CDS), ">"), "[", 2)
  # altAllele2 is the complementary base of altAllele1
  snpsInCosmic$altAllele2 <- ifelse((snpsInCosmic$altAllele1 == "A"), "T",
                                    ifelse((snpsInCosmic$altAllele1 == "T"), "A",
                                           ifelse((snpsInCosmic$altAllele1 == "G"), "C",
                                                  ifelse((snpsInCosmic$altAllele1 == "C"), "G", NA))))
  snpsInCosmic <- subset(snpsInCosmic, (is.na(snpsInCosmic$altAllele1)) | (snpsInCosmic$ALT1 == snpsInCosmic$altAllele1) | (snpsInCosmic$ALT2 == snpsInCosmic$altAllele1) | (snpsInCosmic$ALT3 == snpsInCosmic$altAllele1) | (snpsInCosmic$ALT1 == snpsInCosmic$altAllele2) | (snpsInCosmic$ALT2 == snpsInCosmic$altAllele2) | (snpsInCosmic$ALT3 == snpsInCosmic$altAllele2))
  snpsInCosmic$altAllele1 <- NULL
  snpsInCosmic$altAllele2 <- NULL

  #Aggregate duplicate positions and Format
  # Each aggregated cell holds two values: the number of unique entries and a
  # "value(Nx)" run-length summary
  snpsInCosmicAgged <- aggregate(snpsInCosmic[,c(3:5,23:33)], snpsInCosmic[,1:2], FUN = function(x) c(count = length(unique(x[!is.na(x)])), paste(rle(sort(x))$values, "(", rle(sort(x))$lengths, "x", ")", "  ", sep="", collapse="")))
  snpsInCosmicAgged <- apply(snpsInCosmicAgged, 2, unlist)
  # Needed by both branches below, so it is computed before the branch
  totalSamples <- length(unique(CosmicVariants[,5]))
  if(!is.null(nrow(snpsInCosmicAgged))) {
    snpsInCosmicAgged <- as.data.frame(snpsInCosmicAgged[,c(1,2,4,6,8,10,12,14,15,16,18,20,22,24,26,28,30)])
    snpsInCosmicAgged$ID_sample.count <- (as.numeric(snpsInCosmicAgged$ID_sample.count)/totalSamples)*100
    snpsInCosmicAgged <- snpsInCosmicAgged[order(snpsInCosmicAgged$CHROM, snpsInCosmicAgged$POS),]
    snpsInCosmicAgged <- apply(snpsInCosmicAgged, 2, function(x) gsub("\\(1x\\)|\\(x\\)", "", x))
    snpsInCosmicAgged <- apply(snpsInCosmicAgged, 2, function(x) gsub("\\<NS\\>", "", x))
    snpsInCosmicAgged <- as.data.frame(snpsInCosmicAgged)
  } else {
    # apply() returned a plain vector (no rows attribute), i.e. a single
    # aggregated position; rebuild a one-row data frame from it
    snpsInCosmicAgged <- as.data.frame(t(snpsInCosmicAgged[c(1,2,4,6,8,10,12,14,15,16,18,20,22,24,26,28,30)]))
    snpsInCosmicAgged$ID_sample.count <- (as.numeric(snpsInCosmicAgged$ID_sample.count)/totalSamples)*100
    snpsInCosmicAgged <- snpsInCosmicAgged[order(snpsInCosmicAgged$CHROM, snpsInCosmicAgged$POS),]
    snpsInCosmicAgged <- apply(snpsInCosmicAgged, 2, function(x) gsub("\\(1x\\)|\\(x\\)", "", x))
    snpsInCosmicAgged <- as.data.frame(t(snpsInCosmicAgged))
    snpsInCosmicAgged <- apply(snpsInCosmicAgged, 2, function(x) gsub("\\<NS\\>", "", x))
    snpsInCosmicAgged <- as.data.frame(t(snpsInCosmicAgged))
    snpsInCosmicAgged <- as.data.frame(snpsInCosmicAgged)
  }

  # Remove repeat numbers from rsID, Gene, and Effect columns
  snpsInCosmicAgged$rsID. <- gsub("\\(\\w+\\)", "", snpsInCosmicAgged$rsID.)
  snpsInCosmicAgged$Gene. <- gsub("\\(\\w+\\)", "", snpsInCosmicAgged$Gene.)
  snpsInCosmicAgged$Effect. <- gsub("\\(\\w+\\)", "", snpsInCosmicAgged$Effect.)

  # Remove repeat numbers for spaces
  for(i in seq_len(ncol(snpsInCosmicAgged))){
    snpsInCosmicAgged[,i] <- gsub("\\s+\\(\\w+\\)|^\\(\\w+\\)\\s+", "", snpsInCosmicAgged[,i])
  }

  # Remove spaces appended by aggregate
  snpsInCosmicAgged <- stripTrailingSpaces(snpsInCosmicAgged)

  # Format
  setnames(snpsInCosmicAgged, "Pubmed_PMID.", "COSMIC_reference")
  snpsInCosmicAgged$COSMIC_mutation <- paste(snpsInCosmicAgged$Gene.name, ": ", snpsInCosmicAgged$Mutation.CDS, sep="")
  snpsInCosmicAgged$COSMIC_sample <- paste("Sample: ", snpsInCosmicAgged$Sample.name, " ", snpsInCosmicAgged$Sample.source, " ID:", snpsInCosmicAgged$ID_sample., ", frequency in COSMIC: ", snpsInCosmicAgged$ID_sample.count, sep="")
  snpsInCosmicAgged$COSMIC_histology <- paste(snpsInCosmicAgged$Primary.site, snpsInCosmicAgged$Site.subtype, snpsInCosmicAgged$Primary.histology, snpsInCosmicAgged$Histology.subtype, snpsInCosmicAgged$Comments, sep=" ")
  snpsInCosmicAgged$COSMIC_sample <- gsub("\\s+", " ", snpsInCosmicAgged$COSMIC_sample)
  snpsInCosmicAgged$COSMIC_histology <- gsub("\\s+", " ", snpsInCosmicAgged$COSMIC_histology)
  setnames(snpsInCosmicAgged, "rsID.", "rsID")
  setnames(snpsInCosmicAgged, "Gene.", "Gene")
  setnames(snpsInCosmicAgged, "Effect.", "Effect")

  finalCosmicSNPs <- snpsInCosmicAgged[,c(1:6, 18:20)]


  ### ClinVar Annotation
  # Format and merge on rsID and Start
  vcfForClinvar <- snpVCFbyPos
  names(vcfForClinvar)[1] <- "Chromosome"
  names(vcfForClinvar)[2] <- "Start"
  names(vcfForClinvar)[3] <- "rsID"
  names(clinVarSummary)[7] <- "rsID"
  vcfForClinvar[,3] <- gsub("rs", "", vcfForClinvar[,3])
  clinVarRS <- merge(vcfForClinvar, clinVarSummary, by="rsID")
  clinVarStart <- merge(vcfForClinvar, clinVarSummary, by=c("Chromosome", "Start"))

  # Format and merge on Stop
  names(vcfForClinvar)[2] <- "Stop"
  clinVarStop <- merge(vcfForClinvar, clinVarSummary, by=c("Chromosome", "Stop"))

  # Remove nonSNPs
  clinVarStart <- subset(clinVarStart, Start == Stop)
  clinVarStop <- subset(clinVarStop, Start == Stop)

  # Format for combining dataframes (columns: rsID, CHROM, POS, Gene, Effect, Type, Name, ClinicalSignificance, PhenotypeIDs, Origin, ReviewStatus, NumberSubmitters, OtherIDs, X.AlleleID, REF:GENE5)
  names(clinVarRS)[c(2,3)] <- c("CHROM", "POS")
  clinVarRS$Chromosome.y <- NULL
  clinVarRS$Start.y <- NULL
  clinVarRS$Stop <- NULL
  clinVarRS <- clinVarRS[,c(1,2,3,4,5,24,25,28,32,33,36,39,42,23,6:22)]

  names(clinVarStart)[c(1,2,3)] <- c("CHROM", "POS", "rsID")
  clinVarStart$rsID.y <- NULL
  clinVarStart$Stop <- NULL
  clinVarStart <- clinVarStart[,c(3,1,2,4,5,24,25,28,32,33,36,39,42,23,6:22)]

  names(clinVarStop)[c(1,2,3)] <- c("CHROM", "POS", "rsID")
  clinVarStop$rsID.y <- NULL
  clinVarStop$Start <- NULL
  clinVarStop <- clinVarStop[,c(3,1,2,4,5,24,25,28,32,33,36,39,42,23,6:22)]

  # Combine dataframes and Keep unique entries
  clinVarAllMethods <- rbind(clinVarRS, clinVarStart, clinVarStop)
  clinVarFinal <- unique(clinVarAllMethods)

  ## Double check correct allele by matching mutation base from vcf and ClinVar's Name field
  # Remove rows without base change in Name column.
  # Guarded: indexing with -which(...) when nothing matches would select zero
  # rows and silently discard every ClinVar hit.
  noBaseChange <- which(regexpr(">.*$", clinVarFinal$Name)==-1)
  if(length(noBaseChange) > 0){
    clinVarFinal <- clinVarFinal[-noBaseChange,]
  }

  # Create clinVarAllele1 and complementary clinVarAllele2
  clinVarFinal$clinVarAllele1 <- NA
  # Iterate over rows (not columns) so every record gets its allele extracted
  for(i in seq_len(nrow(clinVarFinal))){
    if(grepl(">", clinVarFinal$Name[i])){
      clinVarFinal$clinVarAllele1[i] <- gsub(">|\\s.*$", "", regmatches(clinVarFinal$Name[i], regexpr(">.*$", clinVarFinal$Name[i])))
    }
  }
  clinVarFinal$clinVarAllele2 <- ifelse((clinVarFinal$clinVarAllele1 == "A"), "T",
                                        ifelse((clinVarFinal$clinVarAllele1 == "T"), "A",
                                               ifelse((clinVarFinal$clinVarAllele1 == "G"), "C",
                                                      ifelse((clinVarFinal$clinVarAllele1 == "C"), "G", NA))))

  clinVarFinal <- subset(clinVarFinal, (is.na(clinVarFinal$clinVarAllele1)) | (clinVarFinal$ALT1 == clinVarFinal$clinVarAllele1) | (clinVarFinal$ALT2 == clinVarFinal$clinVarAllele1) | (clinVarFinal$ALT3 == clinVarFinal$clinVarAllele1) | (clinVarFinal$ALT1 == clinVarFinal$clinVarAllele2) | (clinVarFinal$ALT2 == clinVarFinal$clinVarAllele2) | (clinVarFinal$ALT3 == clinVarFinal$clinVarAllele2))
  clinVarFinal$clinVarAllele1 <- NULL
  clinVarFinal$clinVarAllele2 <- NULL

  # Create rating column from the ReviewStatus text
  clinVarFinal$Rating <- ifelse(grepl("not", clinVarFinal$ReviewStatus), 0,
                                ifelse(grepl("conflicting", clinVarFinal$ReviewStatus), 0,
                                       ifelse(grepl("single", clinVarFinal$ReviewStatus), 1,
                                              ifelse(grepl("multiple", clinVarFinal$ReviewStatus), 2,
                                                     ifelse(grepl("expert", clinVarFinal$ReviewStatus), 3,
                                                            ifelse(grepl("professional", clinVarFinal$ReviewStatus), 4, NA  ))))))

  # Format (columns: CHROM, POS, rsID, Gene, Effect, Rating, ClinicalSignificance, PhenotypeIDs, Origin, Name, Type, ReviewStatus, NumberSubmitters, X.AlleleID, OtherIDs
  finalClinVarSNPs <- clinVarFinal[,c(2,3,1,4,5,32,8,9,10,7,6,11,12,14,13)]
  setnames(finalClinVarSNPs, "X.AlleleID", "AlleleID")
  finalClinVarSNPs$rsID <- paste("rs", finalClinVarSNPs$rsID, sep="")
  finalClinVarSNPs$Mutation <- paste(finalClinVarSNPs$Type, finalClinVarSNPs$Name, sep=" ")
  finalClinVarSNPs$Name <- NULL
  finalClinVarSNPs$Type <- NULL
  names(finalClinVarSNPs)[c(6:14)] <- paste("ClinVar_", names(finalClinVarSNPs)[c(6:14)], sep="")
  finalClinVarSNPs <- finalClinVarSNPs[,c(1:9, 14, 10:13)]


  ### pharmGkb Annotation
  # Merge based on rsID
  pharmVar <- merge(snpVCFbyPos, pharmgkbRSID, by.x = "rsID", by.y = "Variant")
  # Format pharmGkb individual SNP data
  names(pharmgkbAllele) <- c("rsID", "genotype", "description", "notes1", "notes2")
  pharmgkbAllele$allele1 <- substr(pharmgkbAllele$genotype, 1, 1)
  pharmgkbAllele$allele2 <- substr(pharmgkbAllele$genotype, 2, 2)

  # Add individual SNP annotation from pharmGkb
  pharmVar[,28] <- list("")
  pharmVar[,29] <- list("")
  pharmVar[,30] <- list("")

  names(pharmVar)[28] <- "description"
  names(pharmVar)[29] <- "notes1"
  names(pharmVar)[30] <- "notes2"

  # For every SNP, append the pharmGkb notes of each entry with the same rsID
  # and the same genotype in either allele order (pharmVar columns 16/17 are
  # allele1/allele2, pharmgkbAllele columns 6/7 likewise)
  if(nrow(pharmVar)!=0) {
    for(i in seq_len(nrow(pharmVar))){
      for(j in seq_len(nrow(pharmgkbAllele))){
        # isTRUE(): a comparison involving NA counts as "no match" instead of
        # aborting the if()
        if(isTRUE((pharmVar[i,1] == pharmgkbAllele[j,1]) & (((pharmVar[i,16] == pharmgkbAllele[j,6]) & (pharmVar[i,17] == pharmgkbAllele[j,7])) | ((pharmVar[i,16] == pharmgkbAllele[j,7]) & (pharmVar[i,17] == pharmgkbAllele[j,6]))))){
          pharmVar[i,28] <- paste(pharmVar[i,28], pharmgkbAllele[j,3], sep = " ")
          pharmVar[i,29] <- paste(pharmVar[i,29], pharmgkbAllele[j,4], sep = " ")
          pharmVar[i,30] <- paste(pharmVar[i,30], pharmgkbAllele[j,5], sep = " ")
        }
      }
    }
  }

  # Format
  finalPharmgkbSNPs <- pharmVar[,c(2,3,1,4,5,24,26,25,27,23,28:30)]
  setnames(finalPharmgkbSNPs, "Gene.x", "Gene")
  setnames(finalPharmgkbSNPs, "Type", "Reaction")
  setnames(finalPharmgkbSNPs, "Strength.of.evidence..level.", "Evidence.Level")
  names(finalPharmgkbSNPs)[c(6:13)] <- paste("PharmGkb_", names(finalPharmgkbSNPs)[c(6:13)], sep="")
  setnames(finalPharmgkbSNPs, "PharmGkb_Gene.y", "PharmGkb_Gene")


  ### DrugBank Annotation
  # Merge on snp rsID
  snpsInDrugBank <- merge(snpVCFbyPos, drugBankTable, by.x="rsID", by.y="snp")

  # Pull out allele (three possibilities: G>A; A allele; A Allele) and subset on it
  snpsInDrugBank$altAllele1 <- sapply(strsplit(as.character(snpsInDrugBank$allele.change), "> "), "[", 2)
  snpsInDrugBank$altAllele2 <- sapply(strsplit(as.character(snpsInDrugBank$allele.change), "\\sall|\\sAll"), "[", 1)

  # Double check that alternative alleles match
  snpsInDrugBank1 <- subset(snpsInDrugBank, (snpsInDrugBank$ALT1 == snpsInDrugBank$altAllele1) | (snpsInDrugBank$ALT2 == snpsInDrugBank$altAllele1) | (snpsInDrugBank$ALT3 == snpsInDrugBank$altAllele1))
  snpsInDrugBank2 <- subset(snpsInDrugBank, (snpsInDrugBank$ALT1 == snpsInDrugBank$altAllele2) | (snpsInDrugBank$ALT2 == snpsInDrugBank$altAllele2) | (snpsInDrugBank$ALT3 == snpsInDrugBank$altAllele2))
  snpsInDrugBank <- rbind(snpsInDrugBank1, snpsInDrugBank2)

  snpsInDrugBank$altAllele1 <- NULL
  snpsInDrugBank$altAllele2 <- NULL

  # Format final table (column order: CHROM, POS, rsID, Gene, Effect, drug.name, reaction, reference, drug.number, allele.name, allele.change, gene.name, gene.symbol, uniprot)
  finalDrugBankSNPs <- snpsInDrugBank[,c(2,3,1,4,5,27,25,26,28,23,24,29:31)]
  setnames(finalDrugBankSNPs, "drug.name", "drug")
  names(finalDrugBankSNPs)[c(6:14)] <- paste("DrugBank_", names(finalDrugBankSNPs)[c(6:14)], sep="")


  ### Combine and Format Annotations
  library(plyr)

  # Cast as data.frames, pad empty tables with a placeholder row, then rbind.fill
  finalIntogenSNPs <- padEmptyAnnotation(as.data.frame(finalIntogenSNPs))
  finalCosmicSNPs <- padEmptyAnnotation(as.data.frame(finalCosmicSNPs))
  finalClinVarSNPs <- padEmptyAnnotation(as.data.frame(finalClinVarSNPs))
  finalPharmgkbSNPs <- padEmptyAnnotation(as.data.frame(finalPharmgkbSNPs))
  finalDrugBankSNPs <- padEmptyAnnotation(as.data.frame(finalDrugBankSNPs))

  allAnnos <- rbind.fill(finalIntogenSNPs, finalCosmicSNPs, finalClinVarSNPs, finalPharmgkbSNPs, finalDrugBankSNPs)

  # Aggregate on chromosome and position
  allAnnos <- aggregate(allAnnos[,c(3:dim(allAnnos)[2])], allAnnos[,c(1,2)], collapseUnique)
  # Drop the placeholder rows. Guarded: when no table was empty there is no
  # "fake" row, and indexing with -which(...) would then remove every row.
  fakeRows <- which(allAnnos$CHROM=="fake")
  if(length(fakeRows) > 0){
    allAnnos <- allAnnos[-fakeRows,]
  }

  # Remove spaces appended by aggregate function
  allAnnos <- stripTrailingSpaces(allAnnos)


  ### Label Differentially Expressed (Yes or No)
  # Import list of differentially expressed genes
  diffGenes <- names(diffExpNBinom$FCdiff)

  ## Label differentially expressed genes
  # Create DIFF column
  allAnnos$DIFF <- list("")

  # Remove spaces in gene columns
  allAnnos$GENE1 <- gsub("\\s+", "", allAnnos$GENE1)
  allAnnos$GENE2 <- gsub("\\s+", "", allAnnos$GENE2)
  allAnnos$GENE3 <- gsub("\\s+", "", allAnnos$GENE3)
  allAnnos$GENE4 <- gsub("\\s+", "", allAnnos$GENE4)
  allAnnos$GENE5 <- gsub("\\s+", "", allAnnos$GENE5)

  # The first of GENE1..GENE5 found among the differentially expressed genes
  # decides the label
  for(i in seq_len(nrow(allAnnos))){
    if((allAnnos$GENE1[i] %in% diffGenes)){
      allAnnos$DIFF[i] <- paste(allAnnos$DIFF[i], "Yes", "(", allAnnos$GENE1[i], ")", sep="")
    }else if(allAnnos$GENE2[i] %in% diffGenes){
      allAnnos$DIFF[i] <- paste(allAnnos$DIFF[i], "Yes", "(", allAnnos$GENE2[i], ")", sep="")
    }else if(allAnnos$GENE3[i] %in% diffGenes){
      allAnnos$DIFF[i] <- paste(allAnnos$DIFF[i], "Yes", "(", allAnnos$GENE3[i], ")", sep="")
    }else if(allAnnos$GENE4[i] %in% diffGenes){
      allAnnos$DIFF[i] <- paste(allAnnos$DIFF[i], "Yes", "(", allAnnos$GENE4[i], ")", sep="")
    }else if(allAnnos$GENE5[i] %in% diffGenes){
      allAnnos$DIFF[i] <- paste(allAnnos$DIFF[i], "Yes", "(", allAnnos$GENE5[i], ")", sep="")
    }else{
      allAnnos$DIFF[i] <- "No"
    }
  }

  # Flatten the list column; assumes DIFF landed in column 55, which holds only
  # while the annotation tables keep their current column counts -- TODO confirm
  # after any input schema change
  allAnnos[,55] <- unlist(allAnnos[,55])
  finalSNPs <- data.frame(allAnnos[,c(1:5, 55, 23, 6, 29, 30, 28, 25, 38, 39, 46, 47, 48)])
  ### CADD Annotation (currently disabled)
  # Format for Merge
  #names(cadd)[c(1:6)] <- c("CHROM", "POS", "REF", "Alt", "CADD_Rawscore", "CADD_PHRED")

  # Merge
  #finalSNPsWithCADD <- merge(allAnnos, cadd, by=c("CHROM", "POS", "REF"))

  # Subset to check allele
  #finalSNPsWithCADD <- subset(finalSNPsWithCADD, ((ALT1==Alt) | (ALT2==Alt) | (ALT3==Alt)))
  #finalSNPsWithCADD$Alt <- NULL

  # Reformat (column order: CHROM, POS, rsID, Gene, DIFF, Intogen_driver, Effect, CADD_Rawscore, CADD_PHRED, ClinVar_Rating, ClinVar_ClinicalSignificance, COSMIC_histology, COSMIC_reference, PharmGkb_Reaction, PharmGkb_Drugs, DrugBank_drug, DrugBank_reaction, DrugBank_reference
  #finalSNPs <- finalSNPsWithCADD[,c(1, 2, 4, 5, 55, 23, 6, 56, 57, 29, 30, 28, 25, 38, 39, 46, 47, 48)]
  setnames(finalSNPs, "Effect", "SnpEff_Effect")


  ### Knitr
  # Last expression: its value is what snpAnnotation() returns
  kable(finalSNPs, format='html', table.attr = 'id=\"SNPs_table\"')

}
# Run the annotation at top level; the value returned is the kable() table
# built at the end of snpAnnotation() (presumably rendered as this knitr
# chunk's output -- confirm against the report that reads this chunk).
snpAnnotation()