-
Notifications
You must be signed in to change notification settings - Fork 2
/
IDSLGOA_main_script.R
61 lines (50 loc) · 6.04 KB
/
IDSLGOA_main_script.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
metabolites_to_go_inchikey <- function(iklist = "AADLTHQNYQJHQV-SVLGDMRNSA-N;ABVVZYXTZLEOHP-CMDGGOBGSA-O;AEUCYCQYAUFAKH-YEUCEMRASA-N;AFSHUZFNMVJNKX-CLFAGFIQSA-N;AFWBWPCXSWUCLB-WDSKDSINSA-N;AWUCVROLDVIAJX-VKHMYHEASA-N;AXKTVZFNYZPELI-GAXXYEEVSA-N;AYKZHWQCRQJYFJ-OWBHPGMISA-N;AYSIAIIEZAVRGL-NKHFZFFVSA-N;BDIIZHQNVIYLPJ-QOSDPKFLSA-N;BITHHVVYSMSWAG-KTKRTIGZSA-N;BRSRLMBMUIZSFE-UHKGRJRRSA-N;BWTDRYKILUYTBR-QCVIXQMPSA-N;CDAISMWEOUEBRE-GPIVLXJGSA-N;CDAISMWEOUEBRE-UHFFFAOYSA-N;CMUNUTVVOOHQPW-UHFFFAOYSA-N;CQOVPNPJLQNMDC-ZETCQYMHSA-N;CRPAVAKPGRALOU-UHFFFAOYSA-N;CSMIXYYBADWJBU-BBSIOPQKSA-N;DBYSGBYABPVNGO-HVLKLTLRSA-N;DHMQDGOQFOQNFH-UHFFFAOYSA-N;DJVOKHFQPGWUPK-HSBBONCLSA-N;DLHJJSHFSFAIEW-ZUELBQRLSA-N;DPUOLQHDNGRHBS-KTKRTIGZSA-N;DSOZLFZOYYFBEV-SJWJHEOESA-N;DTRODWMOJLEYHX-VDKLDXSNSA-N;FBPFZTCFMRRESA-JGWLITMVSA-N;FPHAZHKZJQOPBU-LGFBOSHVNA-N;FRYULLIZUDQONW-IMJSIDKUSA-N;FUJLYHJROOYKRA-QGZVFWFLSA-N;FWSWVUHDBYGKDC-DPTWWRMPSA-N;GCJYMGXKZROELS-DBEGNQDPSA-N;GDFAOVXKHJXLEI-VKHMYHEASA-N;GEUWKMFIZLQRRK-SXAUZNKPSA-N;GIMFDENDLRGOPW-YQSYOFKPSA-N;GVGOSRFGLYHWSF-KWKYUSQZSA-N;HAIPHKFLSXSDAN-FEOZPROASA-N;HJCMDXDYPOUFDY-WHFBIAKZSA-N;HMQPEDMEOBLSQB-RPHKZZMBSA-N;HZPUWSUHFOTSAY-NPIJDGQYSA-N;IDBFFSRZGUNQJB-VSCRCFLHSA-N;IGSRLTZSNICSAD-FRXQITOYSA-N;IIQYZGWQUHYURC-HWSICPKZSA-N;JLDQDLKBBDQYDE-IQDNDYAFSA-N;JLXVRFDTDUGQEE-YFKPBYRVSA-N;JOXIMZWYDAKGHI-UHFFFAOYSA-N;JPIJQSOTBSSVTP-STHAYSLISA-N;JQOHKCDMINQZRV-WDSKDSINSA-N;JYOAXOMPIXKMKK-YUMQZZPRSA-N;KDKDOORFPQYZGM-LOYOHVQTSA-N;KEVXBWUKDZIWFL-FEAYMNSJSA-N;KJDVLWWPMOBWGU-ZCXUNETKSA-N;KMWDOOIYMHAOTF-ZPHPHTNESA-N;KZVRAFHIKMDULK-MLWYYCKJSA-N;LWBHHRRTOZQPDM-UHFFFAOYSA-N;MTWJTFBVRDGROD-YUMQZZPRSA-N;NAJHAHQNQCNWOP-PUYNVXOJSA-N;NKKUHWUBWLYVOR-GGLIIMCGSA-N;NRJAVPSFFCBXDT-UHFFFAOYSA-N;OBRFWEGTASUAMF-UHFFFAOYSA-N;ODHCTXKNWHHXJC-VKHMYHEASA-N;ONCLVQFEAFTXMN-UHFFFAOYSA-N;OQKUDRFZPJFLJQ-KMUIXOOCSA-N;OTKBBAPJAYXBMS-MOHJPFBDSA-N;OYILGZBEAWRAOX-WNCGIYLDSA-N;POQRWMRXUOPCLD-MHAUTQJVSA-N;QEDPUVGSSDPBMD-OGSMZCTISA-N;QFVHCMLUKNHDSH-WTWBAFHPSA-N;QLABQLNIHHHJCI-NIZIMQIGSA-N;QMPJMPWWIPZEKC-AREXDUQASA-N;QPQYEJYOQHVWMK-MOHJPFBDSA-N;QZEYUKQNWUMXFF-BDMPKYRESA-N;QZZGJDVWLFXDLK-UHFFFAOYSA-N;RUHBZJXWUDKAMZ-ODAJXHQSSA-N;RVQPKPDZDJRSHQ-UHFFFAOYSA-N;RZEQTVHJZCIUBT-WDSKDSINSA-N;SCCPDJAQCXWPTF-VKHMYHEASA-N;SIFXMYAHXJGAFC-WDSKDSINSA-N;SLKDGVPOSSLUAI-UHFFFAOYSA-N;SNVYJLGKALMFOP-MOLROICHSA-N;SSISHJJTAXXQAX-ZETCQYMHSA-N;SUHOQUVVVLNYQR-MRVPVSSYSA-N;SVXWJFFKLMLOHO-BCTRXSSUSA-N;UCLYWNKSJDISRQ-PEZBUJJGSA-N;UFULAYFCSOUIOV-UHFFFAOYSA-N;UGQMRVRMYYASKQ-KQYNXXCUSA-N;UGXCZCKSRJRUTE-MWEXPPKQSA-N;UNSOEVNGAQIBOH-UZRIYOBRSA-N;UOPVDXWZFLMKIL-UHFFFAOYSA-N;VCWLZDVWHQVAJU-NWDGAFQWSA-N;WACUGXLVPJOLTK-ATEMJZPOSA-N;WIBLMBMOLOLTTN-PHHYKNGOSA-N;WVDDGKGOMKODPV-UHFFFAOYSA-N;WWUZIQQURGPMPG-CCEZHUSRSA-N;XDSKYRMPCFSZCU-BFZYJHCDNA-N;XMLLINSHRHLMER-ZCXUNETKSA-N;XRVROIJFISYTFL-HVZKQDKLSA-N;XWYSLMAMRKYUFH-SEYXRHQNSA-N;XXBYYQLCFDQVJY-JEPFLRBFSA-N;XXDVJSLVOGOVEP-NOVDPWBGSA-N;YDNKGFDKKRUKPY-TURZORIXSA-N;YIGARKIIFOHVPF-YAEABVQUSA-N;YODDLEUQZYIACW-ANYBSYGZSA-N;YUFFSWGQGVEMMI-JLNKQSITSA-N;ZDPHROOEEOARMN-UHFFFAOYSA-N;ZEANAEAKLIXUMF-APMJBGCSSA-N;ZYMZKMQPPJPQHY-HAGCXRLMSA-N") {
# load R objects.
load("go_names.RData")
load("ec_inchikey_links.RData")
load("go_obo_net.RData")
load("ec2gene.RData")
load("all_go_child_cpd_list_met_ik14.RData")
load("all_go_child_cpd_list_met.RData")
load("all_go_child_gene_list_met.RData")
load("all_go_child_ec_list_met.RData")
minimum_gene_go_ids_0 <- names(all_go_child_ec_list_met)[which(sapply(all_go_child_ec_list_met, length) >= 3)]
minimum_gene_go_ids_1 <- names(all_go_child_gene_list_met)[which(sapply(all_go_child_gene_list_met, length) >= 3)]
minimum_gene_go_ids <-unique(c(minimum_gene_go_ids_0,minimum_gene_go_ids_1))
ec_inchikey_links$ik14 <- as.character(sapply(ec_inchikey_links$V2, function(x) {strsplit(x,"-")[[1]][1]}))
go_names_1 <- gsub("metabolic process","*",go_names)
go_names_1 <- gsub("catabolic process","#",go_names_1)
go_names_1 <- gsub("biosynthetic process","^",go_names_1)
exclude_go_vec <- as.character(names(all_go_child_cpd_list_met_ik14))[grep("catabolic|anabolic|biosynthetic|tion$|phagy$|mRNA$|ing$",as.character(go_names[names(all_go_child_cpd_list_met_ik14)]))] # redundant and non-metabolic GO ids are excluded.
ik_vec_0 <- strsplit(iklist,";")[[1]]
ik_vec <- unique(as.character(sapply(ik_vec_0, function(ik) { strsplit(ik, "-")[[1]][1] })))
all_go_child_cpd_list_met_ik14 <- all_go_child_cpd_list_met_ik14[names(all_go_child_cpd_list_met_ik14)%in%minimum_gene_go_ids]
all_cpds <- unique(unlist(all_go_child_cpd_list_met_ik14))
xk <- length(ik_vec[ik_vec%in%all_cpds])
xn <- length(all_cpds)
if(xk > 3) {
go_analysis_list <- do.call(rbind, lapply(1:length(all_go_child_cpd_list_met_ik14), function(x) {
xm <- length(all_go_child_cpd_list_met_ik14[[x]][all_go_child_cpd_list_met_ik14[[x]]%in%ik_vec])
pl <- length(all_go_child_cpd_list_met_ik14[[x]])
# k <- length(kegg_vec)
# n <- length(all_cpds)
c(xm,pl,xk,xn,phyper(xm-1,pl,xn-pl,xk,lower.tail = F))
}))
go_res_df <- data.frame(go_analysis_list, stringsAsFactors = F)
names(go_res_df) <- c("Overlap","Process","Input", "All","P-value")
go_res_df$GO_term <- names(all_go_child_cpd_list_met_ik14)
# Filters
go_res_df$SetSizeRatio <- go_res_df$Process/xn
go_res_df <- go_res_df[go_res_df$SetSizeRatio <= 0.05,] ## We need to exclude the larger GO terms.
go_res_df <- go_res_df[!go_res_df$GO_term%in%exclude_go_vec,] ## exclude the redundant GO terms.
go_res_df$pAdjust <- p.adjust(go_res_df$`P-value`, method = "fdr")
go_res_df <- go_res_df[order(go_res_df$pAdjust),]
go_res_df <- go_res_df[which(go_res_df$pAdjust<0.05),] # padjust should be 0.05
go_res_df <- go_res_df[which(go_res_df$Process>0),]
go_res_df <- go_res_df[which(go_res_df$Overlap >= 3),]
go_res_df$SetOverlapRatio <- go_res_df$Overlap/go_res_df$Process
go_res_df <- go_res_df[which( go_res_df$SetOverlapRatio > 0.05),] # At least 5% coverage should be present in the input list.
go_res_df$GO_name = as.character(go_names[go_res_df$GO_term])
write.table(go_res_df, "goa_analysis_results.txt", col.names=T, row.names=F, sep="\t", quote=F)
}
}