forked from LangilleLab/microbiome_helper
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathfilter_feature_table.R
executable file
·102 lines (78 loc) · 3.85 KB
/
filter_feature_table.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
#!/usr/bin/env Rscript
# Load optparse package to read in command-line arguments.
library("optparse")
version <- "1.0"
option_list <- list(
make_option(c("-i", "--input"), type="character", default=NULL,
help="Location of input file (required).", metavar="PATH"),
make_option(c("-o", "--output"), type="character", default=NULL,
help="Location of output file (required).", metavar="PATH"),
make_option(c("-f", "--feature_col"), type="integer", default=1,
help=paste("Last column containing feature name information. The script assumes",
"that the feature name and description will be in columns ranging from 1:feature_col.",
sep=" "), metavar="INT"),
make_option(c("-p", "--prop"), type="numeric", default=0.2,
help="Proportion of samples that must have the feature for it to be retained (default: 0.2).",
metavar="NUMERIC"),
make_option(c("-e", "--exclude"), type="character", default=NULL,
help="Any features matching the strings in this comma-delimited list will be excluded.",
metavar="STRING"),
make_option(c("-r", "--rel"), action = "store_true", type="logical", default=FALSE,
help="Flag to indicate that output table should be re-normalized in relative abundance (default: FALSE).",
metavar="BOOL")
)
opt_parser <- OptionParser(
option_list=option_list,
usage = "%prog [options] -i FILE",
description = paste(
"\nScript to filter a tab-delimited file with features as rows and columns as samples.\n",
"The first column is assumed to be the feature names unless the feature_col argument is changed.\n",
"Note that by default the output will not be re-normalized into relative abundance",
sep=" ")
)
opt <- parse_args(opt_parser)
# Check that input and output files set:
if(is.null(opt$input)) { stop("path to input file needs to be set.") }
if(is.null(opt$output)) { stop("path to output file needs to be set.") }
# Read in input file. Note that rownames are not assigned.
intable <- read.table(opt$input, header=T, sep="\t", stringsAsFactors = FALSE,
quote="", comment.char="")
# Get feature names / descriptions only in new df.
feature_info <- intable[ , 1:opt$feature_col, drop=FALSE]
# Get feature abundances only in separate df.
intable_counts <- intable[, -c(1:opt$feature_col), drop=FALSE]
# Identify all features that are observed in fewer than the specified proportion of samples.
features2remove <- which(rowSums(intable_counts > 0) < opt$prop*ncol(intable_counts))
# If features to exclude by name were also set then also identify those rows.
if(! is.null(opt$exclude)) {
str2exclude <- strsplit(x = opt$exclude, split = ",")[[1]]
# Loop over all feature columns and strings to exclude and find all matches.
for (feature_col in colnames(feature_info)){
for(exclude in str2exclude) {
features2remove <- c(features2remove, grep(exclude, feature_info[,feature_col]))
}
}
# Get only unique rows to discard.
features2remove <- unique(features2remove)
}
if(length(features2remove) > 0) {
# Remove features from table.
intable_filt <- intable[-features2remove,,drop=FALSE]
} else {
intable_filt <- intable
}
# Re-normalize data into relative abundance per-sample if option set.
if(opt$rel){
value_col <- c((opt$feature_col+1):ncol(intable_filt))
intable_filt[,value_col] <- sweep(intable_filt[,value_col],
2,
colSums(intable_filt[,value_col]),
'/') * 100
}
# Write out filtered table to file.
write.table(x = intable_filt,
file = opt$output,
quote = FALSE,
sep = "\t",
col.names = TRUE,
row.names = FALSE)