-
Notifications
You must be signed in to change notification settings - Fork 0
/
BioMart.R
executable file
·95 lines (73 loc) · 3.44 KB
/
BioMart.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
require(biomaRt)
require(dplyr)
require(clusterProfiler)
############################################
# Annotation using BioMart - Configuration #
############################################
annot_file <- snakemake@params[["annotation_file_rda"]]
if (file.exists(annot_file) == TRUE){
load(annot_file)
}else{
# parameters to set BioMart
its_in_list_marts <- snakemake@params[["its_in_listMarts"]] # yes or no
biomart_name <- snakemake@params[["biomart_name"]]
biomart_dataset <- snakemake@params[["biomart_dataset"]]
# if Mart is not in listMarts, also set:
biomart_host <- snakemake@params[["biomart_host"]]
biomart_vschema <- snakemake@params[["biomart_vschema"]]
# Set what division in gff3 file column 9 containing the gene names
c_name <- as.numeric(snakemake@params[["c_name"]])
# Set the Mart to use.
if (its_in_list_marts == "yes"){
myusemart <- useMart(biomart_name, dataset = biomart_dataset)
}else if (its_in_list_marts == "no"){
## For use BioMarts from Phytozome
Mart <- new("Mart",
biomart = biomart_name,
vschema = biomart_vschema,
host = biomart_host)
## Select the dataset to use
mysets <- listDatasets(Mart)
mydataset <- mysets$dataset[mysets$dataset == biomart_dataset] # Select the dataset "Phytozome 12 Genomes"
myusemart <- useDataset(as.character(mydataset), mart = Mart)
}
######################################
# Annotation of all E. grandis genes #
######################################
# List all atributes available in selected Mart
attributes <- listAttributes(mart = myusemart)
# Select the attributes useful to annotation
my_attributes <- attributes[c(1:35), 1]
# List all filters available in selected Mart
filters <- listFilters(myusemart)
# Select the information which will be use as input (filter)
## As our list containing the Gene id, we selected the "gene_name_filter", filter 11 in the list of filters
my_filter <- filters[11, 1]
# Reaf the list of genes from gff3 file
genes_IDs <- read.table(snakemake@input[[1]], sep = "\t")
## Extract the names from the Tags column
genes_IDs <- genes_IDs[genes_IDs[, 3] == "gene", ]
genes_names <- data.frame(do.call("rbind",
strsplit(as.character(genes_IDs[,9]),
";",
fixed = TRUE)))
genes_names <- data.frame(do.call("rbind", strsplit(as.character(genes_names[, c_name]), "=", fixed = TRUE)))
genes_names <- as.character(genes_names$X2)
# Execute the annotation for the genes in "genes_list" vector
biomart_annot <- getBM(attributes = my_attributes,
filters = my_filter,
mart = myusemart,
values = genes_names)
save(file = annot_file, biomart_annot)
}
# Summarise to one row per gene. Multiples annotations will be separate by comma
annotation_file_res <- biomart_annot %>%
group_by(gene_name1) %>%
summarise_all(funs(paste(unique(.), collapse = ", ")))
# Write annotation table
write.table(annotation_file_res,
snakemake@output[[1]],
col.names = T,
row.names = F,
quote = F,
sep = "\t")