-
Notifications
You must be signed in to change notification settings - Fork 0
/
mutual_information_in_R.bash
executable file
·309 lines (294 loc) · 12.6 KB
/
mutual_information_in_R.bash
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
#!/bin/bash
# program by Andreas Hadjiprocopis
# livantes at soi.city.ac.uk
# Institute of Cancer Research
# London, 2010
# Free for everyone to copy, use, modify / quote author
appname=`basename "$0"`
bindir=$( cd "$( dirname "$0" )" && pwd )
now=`date`
rand=$$
param_ifs=':'
seed=$$
png_width=1024
png_height=768
png_fontsize=18
#extra
num_bins=100
function display_help_exit {
echo "Usage : $1 -i input_file -o output_file [-c col_num ... -c ... | -C col_num ... -C ...] [-z col_name ... -z ... | -Z col_name ... -Z ...] [-1] [-e input_data_separator] [-E output_data_separator] [-R R_output_script_name] [-2 seed] [-3 png_width:png_height:png_fontsize]"
echo "other params: [-b num_bins] "
echo "This script will calculate the mutual information between all possible pairs of the specified columns-to-do of the input file. Unfortunately, the number of elements in each column must be the same."
echo "The way to do that is 1) standardize (i.e. zero mean, unit stdev) each column, 2) put all the standardize columns together, 3) calculate the histogram of all the data together, with specified number of breaks (or bins) using -b (default is ${num_bins}), 4) label each value on each column according to which bin in the histogram it falls, e.g. if it falls to first bin, its label is 1, second bin, 2 etc., 5) calculate the mutual information between each pair of columns of labels"
echo "The output file can then be plotten using the 'heatmap_in_R.bash -i inp -o out' script"
echo "fixed params"
echo " -i inputFile"
echo " defines the input file."
echo " -o outputFile"
echo " defines the output file."
echo " -c col_num"
echo " choose the column number you wish to operate as a comma-separated list of column numbers (starting from 1) - repeat -c options if necessary, all columns will be processed unless this option is specified"
echo " -C col_num"
echo " choose the column number you DO NOT wish to operate as a comma-separated list of column numbers (starting from 1) - repeate -C options if necessary, all columns will be processed unless this option is specified"
echo " -z col_name"
echo " choose the column name you wish to operate - repeat -z options if necessary, all columns will be processed unless this option is specified. Use unique separator (-1), default is '${param_ifs}' or repeat -z options"
echo " -Z col_nam"
echo " choose the column number you DO NOT wish to operate as a list of column names or repeate -Z options. Use unique separator (-1), default is '${param_ifs}' or repeat -Z options"
echo " -1"
echo " include columns not selected for processing to the output, default is not to print un-processed columns"
echo " -2 random_number_generator_seed"
echo " useful to repeat experiments, otherwise seed is selected at random using the process id"
echo " -3 png_width:png_height:png_fontsize : the size of the output image if any in pixels and font size, default: ${png_width}:${png_height}:${png_fontsize}"
# standard features
echo " -h"
echo " optional, that the first row is not numerical but it holds column names, these names will be copied to the output. If the script fails, this could be the reason (i.e. calculations with column names)."
echo " -e sep"
echo " optional, 'sep' is the separator between columns in the input file. If the script fails, this could be the reason (i.e. spaces are used as col separator but in actual terms they are tab separated)."
echo " -R filename"
echo " specify the name of the file to contain the R script to execute (because this bash file"
echo " creates an R script which then runs. If this option is not used, then the output script name will be deleted."
echo " Otherwise, the R script will not be deleted and you can re-run it again using R < script_name"
echo ""
echo "Other options:"
echo "program by Andreas Hadjiprocopis"
echo "andreashad2 at gmail.com"
echo "Institute of Cancer Research"
echo "London, 2010"
echo "Free for everyone to copy, use, modify / quote author"
echo ""
exit 1
}
function column_names_to_column_numbers {
local _inputfile="$1"; shift # must have header
local _ifs="$1"; shift
local _column_names=${@}
local _rand=$$
local _i _ret="" _ifs2=""
if [ "${_ifs}" != "" ]; then _ifs2="--ifs '${_ifs}'"; fi
# dont echo anything apart from the last one before return (unless > /dev/stderr)
# echo extract_columns.pl --input "${_inputfile}" --output /tmp/${_rand}.cn2cn --include_column_names_in_output --have_column_names_header ${_ifs2} --print_columns_and_exit > /dev/stderr
cat << EOC | sh
extract_columns.pl --input "${_inputfile}" --output /tmp/${_rand}.cn2cn --include_column_names_in_output --have_column_names_header ${_ifs2} --print_columns_and_exit > /dev/null
EOC
if [ $? -ne 0 ]; then
echo "$appname : ${FUNCNAME}, line ${LINENO} : call to the following command has failed"
echo extract_columns.pl --input \"${_inputfile}\" --output /tmp/${_rand}.cn2cn --include_column_names_in_output --have_column_names_header ${_ifs2} --print_columns_and_exit
return 1
fi
oldIFS="${IFS}"; IFS=${param_ifs}
for _a_col_name in ${_column_names[*]}; do
# echo "checking for ${_a_col_name} (/tmp/${_rand}.cn2cn)" > /dev/stderr
_a_col=`awk -F '\t' -v n="${_a_col_name}" '{if($4==n){print ($2+1); exit 2}}' /tmp/${_rand}.cn2cn`
if [ $? -ne 2 ]; then
# echo "$appname : ${FUNCNAME}, line ${LINENO} : could not find column name '${_a_col_name}' in the header of the input file '${_inputfile}'."
# return 1
continue # baby
fi
# echo "found ${_a_col}" > /dev/stderr
_ret+=",${_a_col}"
done
IFS="${oldIFS}"
echo "${_ret#,}"
rm -f /tmp/${_rand}.cn2cn
return 0
}
inputFile=""
outputFile=""
# for standard arguments
separator=''; Rseparator=''
separatorOut='\t'
header="FALSE"
Rscript_name="${rand}.R"
outputPrecision="5"
doColumns=""
dontDoColumns=""
doColumnsNames=""
dontDoColumnsNames=""
do_only_selected_columns=0
include_discarded_columns_in_output=0
delete_Rscript_when_done=1
if [ "$*" = "" ]; then
display_help_exit $appname
fi
while getopts "c:C:z:Z:i:o:R:Hhe:E:p:12:3:b:" OPTION
do
case $OPTION in
c)
if [ "${doColumns}" == "" ]; then doColumns="${OPTARG}"; else doColumns="${doColumns},$OPTARG"; fi
do_only_selected_columns=1
;;
C)
if [ "${dontDoColumns}" == "" ]; then dontDoColumns="${OPTARG}"; else dontDoColumns="${dontDoColumns},$OPTARG"; fi
do_only_selected_columns=1
;;
z)
if [ "${doColumnsNames}" == "" ]; then doColumnsNames="${OPTARG}"; else doColumnsNames="${doColumnsNames}${param_ifs}$OPTARG"; fi
do_only_selected_columns=1
;;
Z)
if [ "${dontDoColumnsNames}" == "" ]; then dontDoColumnsNames="${OPTARG}"; else dontDoColumnsNames="${dontDoColumnsNames}${param_ifs}$OPTARG"; fi
do_only_selected_columns=1
;;
1)
include_discarded_columns_in_output=1
;;
i)
if [ "$inputFile" != "" ]; then
echo "$appname : you can specify only one input file"
exit 1
fi
inputFile="$OPTARG"
;;
o)
if [ "$outputFile" != "" ]; then
echo "$appname : you can specify only one output folder"
exit 1
fi
outputFile=$OPTARG
;;
# standard arguments
2)
seed=$OPTARG
;;
3)
png_width=`echo "${OPTARG}" | cut -d':' -f1`
png_height=`echo "${OPTARG}" | cut -d':' -f2`
png_fontsize=`echo "${OPTARG}" | cut -d':' -f3`
;;
R)
delete_Rscript_when_done=0
Rscript_name="$OPTARG"
;;
H)
display_help_exit "$appname"
;;
h)
# in case there is a header (column names at the first row of file)
header="TRUE"
;;
e)
separator="$OPTARG"; Rseparator=",sep='${separator}'"
;;
E)
separatorOut=$OPTARG
;;
p)
outputPrecision=$OPTARG
;;
# other options
b)
num_bins=$OPTARG
;;
esac
done
if [ ! -f "$inputFile" ]; then
echo "$0 : input file $inputFile does not exist."
exit 1
fi
if [ "$Rscript_name" = "" ]; then
echo "$0 : script file name is not defined (use the -R option to do it)."
exit 1
fi
if [ "$outputFile" = "" ]; then
echo "$0 : you need to specify an output file using the '-o' option."
exit 1
fi
if [[ ("${doColumns}" != "" || "${doColumnsNames}" != "") && ("${dontDoColumns}" != "" || "${dontDoColumnsNames}" != "") ]]; then
echo "$0 : either -c or -C options but not both can be used (either specify columns to do, or columns not to do - both makes no sense)"
exit 1
fi
if [ "${doColumnsNames}" != "" ]; then
crap=$(column_names_to_column_numbers "${inputFile}" "${separator}" ${doColumnsNames})
if [ $? -ne 0 ]; then echo ${crap}; exit 1; fi
if [ "${crap}" != "" ]; then
doColumns+="${crap}"
echo "$appname : extra column numbers to do via their column names : ${crap}"
fi
fi
if [ "${dontDoColumnsNames}" != "" ]; then
crap=$(column_names_to_column_numbers "${inputFile}" "${separator}" ${dontDoColumnsNames})
if [ $? -ne 0 ]; then echo ${crap}; exit 1; fi
if [ "${crap}" != "" ]; then
dontDoColumns+="${crap}"
echo "$appname : extra column numbers not to do via their column names : ${crap}"
fi
fi
doColumns=${doColumns/X,/}; doColumns=${doColumns##,}; dontDoColumns=${dontDoColumns/X,/}; dontDoColumns=${dontDoColumns##,};
cat << EOC > "$Rscript_name"
set.seed($seed); write("$appname : starting R, seed is $seed", stderr());
# R script by $appname on $now
write("$appname : reading data from '$inputFile'", stderr()); gotError<-FALSE
# fill=T allows for uneven number of rows per column
tryCatch({oridata = read.table("$inputFile", fill=T, header=$header $Rseparator, check.names=FALSE)}, error=function(ex){write(paste("$appname : error reading file $inputFile\n",ex,sep=""),stderr()); gotError<-TRUE});
if( gotError == TRUE ){ quit(status=1) }
write("$appname : done.", stderr())
actual_num_cols=ncol(oridata)
if( "${dontDoColumns}" == "" ){
if( "${doColumns}" == "" ){
# no columns specified, do all of them
columns_to_do = seq(from=1, to=actual_num_cols)
must_do_this_column=rep(TRUE, actual_num_cols)
} else { columns_to_do = c($doColumns); must_do_this_column=rep(FALSE, actual_num_cols); must_do_this_column[columns_to_do]=TRUE }
} else {
must_do_this_column=rep(FALSE, actual_num_cols)
crap=c($dontDoColumns); num_crap = length(crap); columns_to_do=c();
for(i in 1:actual_num_cols){
found = 0;
for(j in 1:num_crap){ if( crap[j] == i ){ found=1; break; } }
if( found == 0 ){ columns_to_do=append(columns_to_do, i) }
}
must_do_this_column[columns_to_do]=TRUE
}
num_columns_to_do = length(columns_to_do); num_rows = nrow(oridata)
if( num_rows==0 ){ write("$appname : no rows left to do", stderr()) }
if( num_columns_to_do==0 ){ write("$appname : no columns left to do", stderr()) }
if( (num_columns_to_do==0) || (num_rows==0) ){ quit(status=0) }
source("${bindir}/library_in_R.R")
library(infotheo)
# this is where the actual program starts
data_out=NULL
labeled_data=list(); standardised_data=list()
mutual_information=matrix(ncol=num_columns_to_do,nrow=num_columns_to_do)
# make a histogram OF ALL THE DATA after standardisation -- outliers are assumed to be removed before running this program
for(in_col_id1 in 1:num_columns_to_do){
# stdev = sd(oridata[,in_col_id1])
# if( stdev <= ${min_stdev_of_column_to_include} )
a_col_id = columns_to_do[in_col_id1]
tmpdat = oridata[,a_col_id]; tmpdat=tmpdat[!is.na(tmpdat)]
standardised_data[[in_col_id1]] = (tmpdat-mean(tmpdat)) / sd(tmpdat)
}
total_data=unlist(standardised_data)
total_histogram=hist(total_data, breaks=${num_bins}, plot=F); total_data=NULL
for(in_col_id1 in 1:num_columns_to_do){
labeled_data[[in_col_id1]] = label_data(total_histogram\$breaks, standardised_data[[in_col_id1]])
}
cat(paste("$appname : calculating ", num_columns_to_do, " columns :", sep=''), file=stderr())
for(in_col_id1 in 1:num_columns_to_do){
a_col1 = columns_to_do[in_col_id1]
cat(" ", a_col1, file=stderr())
for(in_col_id2 in 1:num_columns_to_do){
if( in_col_id1 == in_col_id2 ){ next }
a_col2 = columns_to_do[in_col_id2]
mutual_information[in_col_id1,in_col_id2] = mutinformation(labeled_data[[in_col_id1]], labeled_data[[in_col_id2]])
}
}
write(". Done all.",stderr())
# precision and when this precision is not enough
options(digits=$outputPrecision, scipen=100)
if( "$header" == "TRUE" ){ namr=names(oridata[,columns_to_do]) } else { namr=paste("col:",columns_to_do,sep="") }; namc=c("AA", namr)
write.table(cbind(namr,mutual_information), file="$outputFile", sep="$separatorOut", col.names=namc, row.names=F, append=FALSE, quote=FALSE)
write("$appname : done with precision $outputPrecision", stderr())
EOC
echo "$appname : executing R script ($Rscript_name)"
R --vanilla < "$Rscript_name" > /tmp/rlog$rand.txt
if [ $? -ne 0 ]; then
# cat /tmp/rlog$rand.txt; echo "$appname : error executing R script ($Rscript_name)";
echo "Command line:"
echo "$0 ${*}"
exit 1
fi
echo "Command line:"
echo "$0 ${*}"
rm -f /tmp/rlog$rand.txt /tmp/$rand.*
if [ "${delete_Rscript_when_done}" -eq 1 ]; then rm -f "$Rscript_name"; fi
exit 0