#!/usr/bin/env perl
use warnings;
$| = 1;
#Add 'use lib' statements assuming there is a directory called 'lib' at the same level as the 'bin' directory from which the script is run
use FindBin;
use lib "$FindBin::Bin/../lib";
use lib "$FindBin::Bin";
use strict;
use FileHandle;
use aomisc;
use Cwd;
use diagnostics;
use Getopt::Long;
use Data::Dumper;
use File::Basename;
use File::Temp;
use File::Spec;
use Statistics::R;
#use Bio::Perl;
#use Getopt::Std;
#use PostData;
#use Fasta_utils;
#use feature ":5.10"; #allows say (same as print, but adds "\n"), given/when switch, etc.
# Andrew J. Oler, PhD
# Computational Biology Section
# Bioinformatics and Computational Biosciences Branch (BCBB)
# OCICB/OSMO/OD/NIAID/NIH
# Bethesda, MD 20892
#
#
#This package is free software; you can redistribute it and/or modify
#it under the terms of the GPL (either version 1, or at your option,
#any later version) or the Artistic License 2.0.
#my %options; #Hash in which to store option arguments
#use vars qw($opt_s $opt_f $opt_n $opt);
#getopts ('s:f:n:bda:g:om:i:r:ecu:kh:j:l:ypx:',\%options);
#Print out the options
if (@ARGV){ print STDERR "Arguments: ", join(" ", @ARGV), "\n"; }
my $save = Cwd::cwd();
my $files;
my $verbose;
my $output;
my $gzip;
my $var = "aa,codon,nuc";
my $mu = 0.5;
my $gamma = 0.01;
my $FDR = 0.05;
my $quasi;
my $keeptmp;
my $method = 'fisher';
my $prefix = "Linkage_plot";
my $sample = "";
my $Rscript_loc = 'Rscript';
GetOptions('save|s=s' => \$save,
'output=s' => \$output,
'verbose' => \$verbose,
'files=s' => \$files,
'gzip' => \$gzip,
'mu=s' => \$mu,
'gamma=s' => \$gamma,
'FDR|F=s' => \$FDR,
'quasi|q' => \$quasi,
'var|v=s' => \$var,
'keeptmp' => \$keeptmp,
'method|m=s' => \$method,
'prefix=s' => \$prefix,
'sample=s' => \$sample,
'Rscript=s' => \$Rscript_loc,
);
#-----------------------------------------------------------------------------
#----------------------------------- MAIN ------------------------------------
#-----------------------------------------------------------------------------
my $exe = basename($0);
my $usage = "$exe <variants_linkage_file_1> [ <variants_linkage_file_2> ... ]
variants_linkage files are the output of calculate_linkage_disequilibrium. Each input file
is a separate replicate. Single replicate is fine too. Each file should have a separate
sample id in the 'sample' column.
$exe takes variants linkage files from calculate_linkage_disequilibrium.pl and plots them in R.
Requires Rscript on the PATH (or specify its location with --Rscript) and the 'network' package to be installed in R.
Example input variants file:
#group sample type gene comparison pos1 c1 v1 v1freq pos2 c2 v2 v2freq c1c2 c1v2 v1c2 v1v2 p-value OR FDR
Group1 Sample1 nuc Seq12_093009_HA_cds 437:A:C:713:G:A 437 A C 0.04494 713 G A 0.00924 25448 255 1255 0 8.18E-06 0 1.17E-04
Group1 Sample1 nuc Seq12_093009_HA_cds 437:A:C:471:A:C 437 A C 0.04494 471 A C 0.02352 25045 656 1255 0 2.83E-14 0 1.22E-12
Group1 Sample1 nuc Seq12_093009_HA_cds 437:A:G:713:G:A 437 A G 0.03477 713 G A 0.00924 25448 255 970 0 0.000139402 0 1.50E-03
Group1 Sample1 nuc Seq12_093009_HA_cds 437:A:G:471:A:C 437 A G 0.03477 471 A C 0.02352 25045 656 970 0 4.40E-11 0 9.45E-10
Group1 Sample1 nuc Seq12_093009_HA_cds 471:A:C:713:G:A 471 A C 0.02352 713 G A 0.00924 27015 255 656 0 0.005284684 0 3.25E-02
Group1 Sample1 codon Seq12_093009_HA_cds 146:AAC:AGC:157:AAA:AAC 146 AAC AGC 0.0347 157 AAA AAC 0.02349 24993 655 969 0 4.37E-11 0 8.73E-10
Group1 Sample1 codon Seq12_093009_HA_cds 146:AAC:AGC:238:GGT:GAT 146 AAC AGC 0.0347 238 GGT GAT 0.00913 25429 255 969 0 0.00013933 0 1.39E-03
Group1 Sample1 codon Seq12_093009_HA_cds 146:AAC:AGC:157:AAA:GAA 146 AAC AGC 0.0347 157 AAA GAA 0.00122 24993 32 969 0 0.6324233 0 1.00E+00
Group1 Sample1 codon Seq12_093009_HA_cds 146:AAC:ACC:238:GGT:GAT 146 AAC ACC 0.0449 238 GGT GAT 0.00913 25429 255 1254 0 8.18E-06 0 1.09E-04
Group1 Sample1 codon Seq12_093009_HA_cds 146:AAC:ACC:157:AAA:GAA 146 AAC ACC 0.0449 157 AAA GAA 0.00122 24993 32 1252 2 0.6767571 1.24764 1.00E+00
Group1 Sample1 codon Seq12_093009_HA_cds 146:AAC:ACC:157:AAA:AAC 146 AAC ACC 0.0449 157 AAA AAC 0.02349 24993 655 1252 0 2.83E-14 0 1.13E-12
Group1 Sample1 codon Seq12_093009_HA_cds 157:AAA:AAC:238:GGT:GAT 157 AAA AAC 0.02349 238 GGT GAT 0.00913 26970 255 655 0 0.005285926 0 3.02E-02
Group1 Sample1 codon Seq12_093009_HA_cds 157:AAA:GAA:238:GGT:GAT 157 AAA GAA 0.00122 238 GGT GAT 0.00913 26970 255 34 0 1 0 1.00E+00
Group1 Sample1 aa Seq12_093009_HA_cds 146:N:S:157:K:N 146 N S 0.0347 157 K N 0.02352 24997 656 969 0 4.43E-11 0 7.31E-10
Group1 Sample1 aa Seq12_093009_HA_cds 146:N:S:157:K:E 146 N S 0.0347 157 K E 0.00122 24997 32 969 0 0.632412 0 1.00E+00
Group1 Sample1 aa Seq12_093009_HA_cds 146:N:S:238:G:D 146 N S 0.0347 238 G D 0.00913 25433 255 969 0 0.000139298 0 1.15E-03
Group1 Sample1 aa Seq12_093009_HA_cds 146:N:T:157:K:E 146 N T 0.04494 157 K E 0.00122 24997 32 1253 2 0.6768856 1.246844 1.00E+00
Group1 Sample1 aa Seq12_093009_HA_cds 146:N:T:157:K:N 146 N T 0.04494 157 K N 0.02352 24997 656 1253 0 2.83E-14 0 9.33E-13
Group1 Sample1 aa Seq12_093009_HA_cds 146:N:T:238:G:D 146 N T 0.04494 238 G D 0.00913 25433 255 1255 0 8.18E-06 0 9.00E-05
Group1 Sample1 aa Seq12_093009_HA_cds 157:K:N:238:G:D 157 K N 0.02352 238 G D 0.00913 26971 255 656 0 0.005305044 0 1.95E-02
Group1 Sample1 aa Seq12_093009_HA_cds 157:K:E:238:G:D 157 K E 0.00122 238 G D 0.00913 26971 255 34 0 1 0 1.00E+00
OPTIONS:
-F/--FDR FDR threshold for inclusion of variants in the analysis. Default = 0.05.
-v/--var Type of variant(s) to consider (for plot), comma-delimited list. Possible values:
aa codon nuc. Default is \"aa,codon,nuc\" (i.e., all variants).
-m/--method Method to use for merging linkage values if multiple files are provided.
Possible values are 'fisher' and 'counts'. 'fisher' will use Fisher's method
to combine the p-values. 'counts' will combine the counts and recompute a
p-value and fdr. Default is 'fisher'.
--prefix Prefix for the output files. Default = 'Linkage_plot'. Output merged file
will be <prefix>.mergedLinkage.xls
--sample Sample name for the output file. Default is a concatenation of the sample
ids from the files. Note that each replicate file should have a separate sample id!
--Rscript Location of Rscript. Default is 'Rscript' (expected on the PATH).
--keeptmp Keep temporary files
-q/--quasi Calculate quasi-cliques in network using mgqce. Default is just to make
a network plot. Requires mgqce and rgc on the PATH.
--mu Mu value for mgqce. Default 0.5.
--gamma Gamma value for mgqce. Default 0.01.
-s/--save Directory in which to save files. Default = pwd. If the folder doesn't exist,
it will be created.
";
# To do:
# Maybe add filter for only nonsyn codons?
# Work on optimizing the circle sizes and edge thickness... see /Users/oleraj/Dropbox/NIAID_Data/WInce/NIH_Research_Festival_Poster/Some_analysis_for_figures/linkage for examples of where it is needed!
# Clean up a little (subroutine names, etc.)
# Change log
# 2014-04-17
# Created script, starting from calculate_quasi_cliques.pl
# 2014-09-09
# Added merge_files sub.
# Modified $edges variable in calculate_quasi_cliques sub to be a HoA instead of an AoA.
# Modified expected input format -- added columns for v1freq and v2freq.
# Added (or made functional) options --FDR, --var, --method, --keeptmp --quasi
# Add filter for aa, nuc, codon (default to do all, but can select only aa, or perhaps nonsyn codons or nuc?)
# Added R graphs
# Added File::Spec->rel2abs for output directory
# 2015-03-23
# Modified expected input file format to match output of calculate_linkage_disequilibrium.pl
# 2015-04-16
# Fixed a bug where $save is passed to File::Spec even if it doesn't exist. Made default for $save to be Cwd::cwd().
# 2017-02-27
# Merge the input files and output a merged output file! Also compute fishers test on the combined p-values.
unless($ARGV[0]){
print STDERR $usage;
exit;
}
my $save_dir = File::Spec->rel2abs( $save) || Cwd::cwd();
unless (-d $save_dir){ mkdir($save_dir) or warn "$!\n" }
my $start_time = time;
my @suffixes = (qw(.bed .bed.gz .bed12 .bed12.gz .txt .txt.gz .BED .BED.gz .BED12 .BED12.gz .fasta .fa .FA .FASTA .FAS .fas), @SUFFIXES); #for fileparse. Feel free to add more accepted extensions. @SUFFIXES comes from aomisc.pm.
# Make a hash of the types of variants allowed (nuc, aa, and/or codon)
my @types = split(/,/, $var);
my $types; #Hashref
foreach my $type (@types){
$types->{$type}++;
}
# Set up global temporary directory
my $tempdir;
my $template = "$save_dir/comb_link_value_tempXXXXX";
if ($keeptmp){
$tempdir = File::Temp->newdir( $template, DIR => $save_dir, CLEANUP => 0 ); # CLEANUP => 0 so it will not be deleted
}
else {
$tempdir = File::Temp->newdir( $template ); # CLEANUP => 1 by default so it will be deleted when out of scope
}
# Merge files if multiple files are provided
my $merged_file;
if (@ARGV > 1){
$sample = get_sample(\@ARGV) unless $sample;
$merged_file = merge_files(\@ARGV, $method, $sample, $prefix, $save_dir); # Merge linkage values based on designated method.
}
else {
$merged_file = $ARGV[0];
}
my $stats = read_variants($merged_file);
calculate_quasi_cliques($stats);
&elapsed($start_time, 'Total', $verbose);
#-----------------------------------------------------------------------------
#---------------------------------- SUBS -------------------------------------
#-----------------------------------------------------------------------------
sub merge_files {
my ($files,$method,$merged_sample_id,$prefix, $save_dir) = @_;
my @files = @$files;
my $merged_filename = "$save_dir/$prefix" . ".mergedLinkage.xls"; # Name for merged file.
# #group sample type gene comparison pos1 c1 v1 v1freq pos2 c2 v2 v2freq c1c2 c1v2 v1c2 v1v2 p-value OR FDR
# SV12_P3 70_S3.1 aa HA 149:T:S:157:K:E 149 T S 0.01264 157 K E 0.06848 43460 3241 598 0 4.636715e-19 0 2.384596e-18
# SV12_P3 70_S3.1 aa HA 149:T:S:238:G:D 149 T S 0.01264 238 G D 0.09135 40903 4326 591 0 5.387773e-26 0 3.232664e-25
# SV12_P3 70_S3.1 aa HA 157:K:E:158:N:D 157 K E 0.06848 158 N D 0.00627 43740 297 3242 0 1.081045e-09 0 3.537965e-09
# SV12_P3 70_S3.1 aa HA 157:K:E:187:K:E 157 K E 0.06848 187 K E 0.00186 43973 55 3206 33 1.309592e-16 8.227921 5.893164e-16
# How to merge the files?
# Merge the counts for c1c2, c1v2, v1c2, v1v2 for each comparison.
# Compute new v1freq and v2freq (e.g., v1freq = (v1c2 + v1v2) / (c1c2 + c1v2) = (598 + 0) / (43460 + 3241) = 598 / 46701 = 0.012804)
# Actually, that doesn't include all of the residues at this position. I would really need to include all variants at a particular position in the denominator (v1c2 and v1v2 columns for all v1 variants to compute v1freq...)
# "c" won't change for a particular file, but "v" will.
# Across different files though, "c" might be different for a particular position.
# Perhaps the easiest would be to tally up the counts for each residue at each position across all files, then use that to determine the new consensus and compute the vfreq for each variant.
# Read the files and store all information from all files to a data structure. ($data)
# keys (constant): group, type, gene
# comparisons -> tally c1c2, etc. for each "comparison" (also, don't need to save pos1, c1, v1, pos2, c2, v2 because that comes from "comparison")
# counts -> tally residue counts for each pos
# freq -> frequencies for each residue in counts (compute afterwards)
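# For illustration only (hypothetical values), $data ends up shaped roughly like this:
# $data = {
#     'Group1' => {
#         'aa' => {
#             'HA' => {
#                 comparisons => { '149:T:S:157:K:E' => { c1c2 => 43460, c1v2 => 3241, v1c2 => 598, v1v2 => 0 } },
#                 pvalue      => { '149:T:S:157:K:E' => { 'Sample1' => '2.4e-18', 'Sample2' => '1.1e-05' } },
#                 counts      => { 149 => { 'T' => { 'Sample1' => 46701 }, 'S' => { 'Sample1' => 598 } } },
#                 freq        => { },   # filled in below from the tallied counts
#             },
#         },
#     },
# };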
my $data; # See above for description of format
# Walk through each file and save the data
foreach my $file (@files){
my $readfh = open_to_read($file);
while(<$readfh>){
chomp;
# #group sample type gene comparison pos1 c1 v1 v1freq pos2 c2 v2 v2freq c1c2 c1v2 v1c2 v1v2 p-value OR FDR
# WT_P3 64_S1.2 nuc HA 897:G:T:930:T:C 897 G T 0.00040 930 T C 0.00011 56784 6 23 0 1 0 1
next if (m/^#/);
my ($group, $sample, $type, $gene, $comparison, $pos1, $c1, $v1, $v1freq, $pos2, $c2, $v2, $v2freq, $c1c2, $c1v2, $v1c2, $v1v2, $pvalue, $OR, $FDR) = split(/\t/);
# Save comparison counts
$data->{$group}->{$type}->{$gene}->{comparisons}->{$comparison}->{c1c2} += $c1c2;
$data->{$group}->{$type}->{$gene}->{comparisons}->{$comparison}->{c1v2} += $c1v2;
$data->{$group}->{$type}->{$gene}->{comparisons}->{$comparison}->{v1c2} += $v1c2;
$data->{$group}->{$type}->{$gene}->{comparisons}->{$comparison}->{v1v2} += $v1v2;
# Save pvalues (actually FDR)
warn "overwriting FDR value for $group $type $gene $sample ! Something is not unique between these files!\n" if exists($data->{$group}->{$type}->{$gene}->{pvalue}->{$comparison}->{$sample});
$data->{$group}->{$type}->{$gene}->{pvalue}->{$comparison}->{$sample} = $FDR; # Should this be $FDR or $pvalue?
# Save allele counts. Save once for each residue in each sample, then add across samples
$data->{$group}->{$type}->{$gene}->{counts}->{$pos1}->{$c1}->{$sample} = $c1c2 + $c1v2 unless (exists($data->{$group}->{$type}->{$gene}->{counts}->{$pos1}->{$c1}->{$sample}));
$data->{$group}->{$type}->{$gene}->{counts}->{$pos2}->{$c2}->{$sample} = $c1c2 + $v1c2 unless (exists($data->{$group}->{$type}->{$gene}->{counts}->{$pos2}->{$c2}->{$sample}));
$data->{$group}->{$type}->{$gene}->{counts}->{$pos1}->{$v1}->{$sample} = $v1c2 + $v1v2 unless (exists($data->{$group}->{$type}->{$gene}->{counts}->{$pos1}->{$v1}->{$sample}));
$data->{$group}->{$type}->{$gene}->{counts}->{$pos2}->{$v2}->{$sample} = $c1v2 + $v1v2 unless (exists($data->{$group}->{$type}->{$gene}->{counts}->{$pos2}->{$v2}->{$sample}));
}
close($readfh);
}
my $R = Statistics::R->new();
# Now tally up counts across samples, compute p-values
my $writefh = open_to_write($merged_filename);
print $writefh "#";
print $writefh join "\t", ( qw(group sample type gene comparison pos1 c1 v1 v1freq pos2 c2 v2 v2freq c1c2 c1v2 v1c2 v1v2 pvalues OR fisherFDR) );
print $writefh "\n";
foreach my $group (sort keys %$data){
TYPE:foreach my $type (sort keys %{$data->{$group}}){
next TYPE unless (exists($types->{$type}));
foreach my $gene (sort keys %{$data->{$group}->{$type}}){
# Tally up counts and compute frequency
foreach my $pos (keys %{$data->{$group}->{$type}->{$gene}->{counts}}){
my $pos_total = 0;
foreach my $residue (keys %{$data->{$group}->{$type}->{$gene}->{counts}->{$pos}}){
my $residue_total_count = total(values %{$data->{$group}->{$type}->{$gene}->{counts}->{$pos}->{$residue}}); # Count up all of the counts from all samples. This assumes that each file has a separate sample id...
$data->{$group}->{$type}->{$gene}->{counts}->{$pos}->{$residue}->{allsample} = $residue_total_count;
$pos_total += $residue_total_count;
}
foreach my $residue (keys %{$data->{$group}->{$type}->{$gene}->{counts}->{$pos}}){
my $freq = $data->{$group}->{$type}->{$gene}->{counts}->{$pos}->{$residue}->{allsample} / $pos_total;
$data->{$group}->{$type}->{$gene}->{freq}->{$pos}->{$residue}->{allsample} = sprintf("%.7f", $freq);
}
}
# Compute new p-values for each comparison
foreach my $comparison (sort keys %{$data->{$group}->{$type}->{$gene}->{comparisons}}){
# if ($method =~ m/fisher/i){
# fishersMethod = function(x) pchisq(-2 * sum(log(x)),df=2*length(x),lower=FALSE)
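# Quick worked example of Fisher's method with two hypothetical replicate values, 0.01 and 0.04:
# the statistic is -2 * (log(0.01) + log(0.04)) ~= 15.65 on 2*2 = 4 degrees of freedom,
# which gives a combined p-value of about 0.0035.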
my @pvalues = values %{$data->{$group}->{$type}->{$gene}->{pvalue}->{$comparison}}; # *** Double-check that this is grabbing the right p-values
# print Dumper(\@pvalues);
$R->set('x', [@pvalues]);
$R->run(q`f = pchisq(-2 * sum(log(x)),df=2*length(x),lower=FALSE)`);
my $f = $R->get('f');
$data->{$group}->{$type}->{$gene}->{pvalue}->{$comparison}->{allsamplefisher} = $f;
$data->{$group}->{$type}->{$gene}->{pvalue}->{$comparison}->{pvaluelist} = join ",", @pvalues;
# }
# elsif($method =~ m/count/i){
# Nothing at the moment
my $AB = $data->{$group}->{$type}->{$gene}->{comparisons}->{$comparison}->{c1c2};
my $Ab = $data->{$group}->{$type}->{$gene}->{comparisons}->{$comparison}->{c1v2};
my $aB = $data->{$group}->{$type}->{$gene}->{comparisons}->{$comparison}->{v1c2};
my $ab = $data->{$group}->{$type}->{$gene}->{comparisons}->{$comparison}->{v1v2};
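# The four merged counts form the 2x2 contingency table passed to fisher.test below
# (matrix() fills column-wise, so the layout is effectively):
#                  pos1 = c1   pos1 = v1
#     pos2 = c2      c1c2        v1c2
#     pos2 = v2      c1v2        v1v2
# For a 2x2 table, the orientation doesn't change Fisher's exact p-value or the odds ratio estimate.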
$R->set('x', [$AB,$Ab,$aB,$ab]);
$R->run(q`y = matrix(x,nrow=2)`);
$R->run(q`temp = fisher.test(y)`);
$R->run(q`p = temp$p.value`);
$R->run(q`or = temp$estimate`);
my $OR = $R->get('or');
my $p = $R->get('p');
$data->{$group}->{$type}->{$gene}->{pvalue}->{$comparison}->{allsamplepvalue} = $p;
$data->{$group}->{$type}->{$gene}->{pvalue}->{$comparison}->{allsampleOR} = $OR;
# }
}
# print Dumper($data->{$group}->{$type}->{$gene}->{pvalue});
# To do: compute adjusted p-value for method 'count'
# Now print out the data
foreach my $comparison (sort keys %{$data->{$group}->{$type}->{$gene}->{comparisons}}){
# 897:G:T:930:T:C
my ($pos1, $c1, $v1, $pos2, $c2, $v2) = split(/:/, $comparison);
my $v1freq = $data->{$group}->{$type}->{$gene}->{freq}->{$pos1}->{$v1}->{allsample};
my $v2freq = $data->{$group}->{$type}->{$gene}->{freq}->{$pos2}->{$v2}->{allsample};
my $c1c2 = $data->{$group}->{$type}->{$gene}->{comparisons}->{$comparison}->{c1c2};
my $c1v2 = $data->{$group}->{$type}->{$gene}->{comparisons}->{$comparison}->{c1v2};
my $v1c2 = $data->{$group}->{$type}->{$gene}->{comparisons}->{$comparison}->{v1c2};
my $v1v2 = $data->{$group}->{$type}->{$gene}->{comparisons}->{$comparison}->{v1v2};
my $fisher = $data->{$group}->{$type}->{$gene}->{pvalue}->{$comparison}->{allsamplefisher};
my $OR = $data->{$group}->{$type}->{$gene}->{pvalue}->{$comparison}->{allsampleOR};
my $pvalues = $data->{$group}->{$type}->{$gene}->{pvalue}->{$comparison}->{pvaluelist};
print $writefh join "\t", $group, $merged_sample_id, $type, $gene, $comparison, $pos1, $c1, $v1, $v1freq, $pos2, $c2, $v2, $v2freq, $c1c2, $c1v2, $v1c2, $v1v2, $pvalues, $OR, $fisher;
print $writefh "\n";
}
}
}
}
close($writefh);
return $merged_filename;
}
#-----------------------------------------------------------------------------
sub get_sample {
my $files = shift;
my @files = @$files;
my %samples; # hash to store all sample ids seen in the files
# Walk through each file and save the sample ids
foreach my $file (@files){
my $readfh = open_to_read($file);
while(<$readfh>){
chomp;
# #group sample type gene comparison pos1 c1 v1 v1freq pos2 c2 v2 v2freq c1c2 c1v2 v1c2 v1v2 p-value OR FDR
# WT_P3 64_S1.2 nuc HA 897:G:T:930:T:C 897 G T 0.00040 930 T C 0.00011 56784 6 23 0 1 0 1
next if (m/^#/);
my @F = split(/\t/);
$samples{$F[1]}++;
}
}
my $merged_sample_id = join "_", sort keys %samples;
return $merged_sample_id;
}
#-----------------------------------------------------------------------------
#-----------------------------------------------------------------------------
sub read_variants {
my $file = shift;
my $stats; # AoA containing all lines of the variants file
my $readfh = open_to_read($file);
while (<$readfh>){
chomp;
my @line = split(/\t/);
next if (/^#/);
push @$stats, \@line;
}
close($readfh);
return $stats;
}
#-----------------------------------------------------------------------------
sub calculate_quasi_cliques {
my $stats = shift;
# $stats is an AoA where each line represents a comparison of two variants
# [
# 'aa',
# 'Seq12_093009_HA_cds',
# '136',
# 'K',
# 'E',
# '238',
# 'G',
# 'D',
# 8569,
# 199,
# 101,
# 0,
# '0.176207',
# '0',
# '2.265519e-01'
# ],
# New format output of calculate_linkage_disequilibrium.pl :
# #type gene pos1 c1 v1 v1freq pos2 c2 v2 v2freq c1c2 c1v2 v1c2 v1v2 p-value OR FDR
# nuc Seq12_093009_HA_cds 406 A G 0.00684 469 A G 0.00125 27701 35 191 0 1 0 1.000000e+00
# nuc Seq12_093009_HA_cds 406 A G 0.00684 559 A G 0.00269 27662 75 191 0 1 0 1.000000e+00
# Yet newer format out of calculate_linkage_disequilibrium.pl (as of 2014-09-06) :
# #group sample type gene comparison pos1 c1 v1 v1freq pos2 c2 v2 v2freq c1c2 c1v2 v1c2 v1v2 p-value OR FDR
# Group1 Sample1 nuc Seq12_093009_HA_cds 437:A:C:713:G:A 437 A C 0.04494 713 G A 0.00924 25448 255 1255 0 8.18E-06 0 1.17E-04
# Group1 Sample1 nuc Seq12_093009_HA_cds 437:A:C:471:A:C 437 A C 0.04494 471 A C 0.02352 25045 656 1255 0 2.83E-14 0 1.22E-12
# Group1 Sample1 nuc Seq12_093009_HA_cds 437:A:G:713:G:A 437 A G 0.03477 713 G A 0.00924 25448 255 970 0 0.000139402 0 1.50E-03
# I want to plot the network in R using 'network' package.
# To do so, I need a graph file of edges and node attributes file
# It appears that the attributes file row names (numbers starting at 1) are what is to be used to identify the nodes in the edges file.
# Read the $stats struct
# Get a unique list of the alleles and classify as "var" or "cons". Store as HoH $alleles, where first key is the allele id and second key is var_cons.
# Store edges in $edges AoA
# Read the $alleles HoH and assign $alleles->{$allele_id}->{'node_num'}
# Read the $edges AoA and print out the node_num for each pair (from $alleles); two pairs per row.
# After much trial and error, I found that network() command for vertex.attr requires only numeric values for attributes. vertex.attrnames can be strings though. The edges can have strings.
# I still want to print out the names and attributes file separately from the edges because that is the only way to be sure that the nodes are given the correct order within the network object.
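# Illustrative contents of the temp files written below (values are hypothetical):
#   names.txt:       header 'allele', then one allele id per line, e.g. aa-146-N, aa-146-T, aa-238-G, aa-238-D
#   attributes.txt:  header 'consensus4.variant2<tab>freq', e.g. "4<tab>0.955" for a consensus, "2<tab>0.045" for a variant
#   edges.txt:       one edge per line as a pair of node numbers (row order in names.txt), e.g. "2<tab>4"
#   edge_label.txt:  one FDR value per edge, in the same order as edges.txt, e.g. "1.2e-04"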
# Files for plotting in R
my $temp_attrib = $tempdir."/attributes.txt";
my $temp_edges = $tempdir."/edges.txt";
my $temp_names = $tempdir."/names.txt";
my $temp_edge_lb= $tempdir."/edge_label.txt";
my $attr_fh = open_to_write($temp_attrib);
my $edges_fh = open_to_write($temp_edges);
my $names_fh = open_to_write($temp_names);
my $edge_lb_fh = open_to_write($temp_edge_lb);
my $alleles; # HoH to contain a non-redundant list of variants and consensus
my $edges; # HoA. This needs to be a hash, since I see that some edges are found multiple times. Make the pair of residues the key. Store an array of FDR values for each pair of residues and then use the best for the plot (?).
# When can an edge be found multiple times? When a position has multiple variant alleles whose OR status is the
# same in both comparisons (>1 or <1), so that the consensus alleles get linked to each other in each comparison.
# It doesn't duplicate the variant pairing, but it does duplicate the consensus pairing.
# e.g., 'codon-146-AAC:codon-238-GAT': the 146 consensus codon is AAC with variant alleles AGC and ACC; the 238
# consensus codon is GGT with variant allele GAT. 146 is compared to 238 twice (once for each variant allele at
# 146). The OR = 0 in both, so codon-146-AAC is linked to codon-238-GAT both times, with p-values '1.393300e-03'
# and '1.090292e-04'. Which to use? They are somewhat dependent p-values, so they probably shouldn't be combined
# with Fisher's method. Most of the time they are very similar (within an order of magnitude), so I'll just plot
# the stronger (smaller) of the two.
foreach (@$stats){
if (scalar(@{$_}) < 20){
print STDERR "Check input file to see if it has the correct number of columns.\n";
exit 1;
}
elsif (scalar(@{$_}) > 1){
my ( $group, $sample, $type, $gene, $comparison, $first_pos, $first_cons, $first_var, $first_var_freq, $second_pos, $second_cons, $second_var, $second_var_freq, $AB, $Ab, $aB, $ab, $p, $OR, $fdr ) = @{$_};
if ($fdr <= $FDR && exists($types->{$type})){ # Make sure the comparison passes the FDR threshold and is of the correct type.
$first_cons = join "-", $type, $first_pos, $first_cons; # Consensus residue at the first position in the comparison
$first_var = join "-", $type, $first_pos, $first_var; # Alternate residue at the first position
$second_cons = join "-", $type, $second_pos, $second_cons; # Consensus residue at the second position
$second_var = join "-", $type, $second_pos, $second_var; # Alternate residue at the second position
my $total = $AB + $Ab + $aB + $ab;
my %first_cons = ( 'var_cons' => "cons", 'allele' => $first_cons, 'freq' => ($AB + $Ab)/$total );
$alleles->{$first_cons} = \%first_cons;
my %second_cons = ( 'var_cons' => "cons", 'allele' => $second_cons, 'freq' => ($AB + $aB)/$total );
$alleles->{$second_cons} = \%second_cons;
my %first_var = ( 'var_cons' => "var", 'allele' => $first_var, 'freq' => ($aB + $ab)/$total );
$alleles->{$first_var} = \%first_var;
my %second_var = ( 'var_cons' => "var", 'allele' => $second_var, 'freq' => ($Ab + $ab)/$total );
$alleles->{$second_var} = \%second_var;
# Use OR to connect individual variants to consensus instead of just position.
# If OR > 1, then variant is linked to variant and consensus linked to consensus; if OR < 1, then variant of one is linked to consensus of another. Two lines printed for the graph per line in the original file.
if ($OR >= 1){
# variant is linked to variant and consensus linked to consensus
push @{$edges->{$first_var . ":" . $second_var }}, $fdr;
push @{$edges->{$first_cons . ":" . $second_cons}}, $fdr;
}
else {
# variant of one is linked to consensus of another
push @{$edges->{$first_var . ":" . $second_cons}}, $fdr;
push @{$edges->{$first_cons . ":" . $second_var }}, $fdr;
}
}
}
}
# print Dumper($alleles); exit;
# print Dumper($edges); exit;
# Now sort the alleles (position in the @sorted_alleles AoH will be the node_num)
my @sorted_alleles;
foreach my $allele (sort keys %$alleles){
push @sorted_alleles, $allele;
}
# print Dumper(\@sorted_alleles);
# Print out attributes file and names file for plotting in R (e.g., var or cons)
# Also add 'node_num' to $alleles
print $attr_fh "consensus4.variant2\tfreq\n";
print $names_fh "allele\n";
my $i = 1;
foreach my $allele (@sorted_alleles){
print $names_fh "$allele\n";
if ($alleles->{$allele}->{'var_cons'} eq 'cons'){
print $attr_fh "4"; # cons = 4 (blue in plot.network)
}
else {
print $attr_fh "2"; # var = 2 (red in plot.network)
}
print $attr_fh "\t$alleles->{$allele}->{'freq'}\n";
$alleles->{$allele}->{'node_num'} = $i;
$i++;
}
close ($attr_fh);
# print Dumper($alleles); exit;
print "There are no edges, I cannot procede, dying now.\n" and exit if scalar keys %$edges == 0; # philip macmenamin
# Read $edges and print out edges print out 'node_num' representing each node instead of the node name.
foreach my $edge (keys %$edges){
my ($first, $second) = split(/:/, $edge);
my @fdr_array = sort {$a <=> $b} @{$edges->{$edge}};
my $fdr = $fdr_array[0]; # Use the smaller of the values if there is more than one.
print $edges_fh "$alleles->{$first}->{'node_num'}\t$alleles->{$second}->{'node_num'}\n";
print $edge_lb_fh "$fdr\n";
}
close($edge_lb_fh);
close($edges_fh);
# Example graphing in R
# names <- read.table("~/temp/names.txt", header=T)
# edges <- read.table("~/temp/edges.txt")
# attr <- read.table("~/temp/attributes.txt", header=T)
# edglab <- read.table("~/temp/edge_label.txt")
# library(network)
# nw <- network(edges, vertex.attr=attr, directed=F)
# plot.network(nw, displaylabels=T, vertex.col=attr$consensus4.variant2, label=names$allele, label.cex=0.5, vertex.cex=attr$freq+0.5, edge.label=format(edglab$V1, scientific=T, digits=2), edge.label.cex=0.5, edge.label.col=6, edge.lwd=-0.5*log(edglab$V1))
# This plots red = variant, blue = consensus; thickness of edge represents the fdr value (thicker = lower fdr value); the size of the ball represents the frequency of the allele (bigger = higher frequency)
# Or, write R script and run it with Rscript from the system.
my $temp_R_script = $tempdir . "/temp_script.R";
my $temp_R_fh = open_to_write($temp_R_script);
print $temp_R_fh "setwd(\"$tempdir\")\n"; # Set working directory to $tempdir to read the files
print $temp_R_fh "names <- read.table(\"names.txt\", header=T)\n";
print $temp_R_fh "edges <- read.table(\"edges.txt\")\n";
print $temp_R_fh "attr <- read.table(\"attributes.txt\", header=T)\n";
print $temp_R_fh "edglab <- read.table(\"edge_label.txt\")\n";
print $temp_R_fh "library(network)\n";
print $temp_R_fh "nw <- network(edges, vertex.attr=attr, directed=F)\n";
print $temp_R_fh "setwd(\"$save_dir\")\n"; # Change working directory now to $save_dir for saving files
my $types_string = join "_", sort keys %$types;
my $outfile = $prefix . "_$types_string" . "_network.pdf";
print $temp_R_fh "pdf(\"$outfile\", height=8, width=8)\n";
print $temp_R_fh "plot.network(nw, displaylabels=T, vertex.col=attr\$consensus4.variant2, label=names\$allele, label.cex=0.5, edge.label=format(edglab\$V1, scientific=T, digits=1), edge.label.cex=0.5, edge.label.col=6, edge.lwd=-0.5*log(edglab\$V1))\n";
print $temp_R_fh "dev.off()\n";
# These next few steps are testing an alternative method of plotting, scaling frequency separately based on residue designation as either variant or consensus.
print $temp_R_fh 'meanvar = mean(subset(attr, attr$consensus4.variant2 == 2)$freq)' . "\n"; # Get mean value for all variant frequencies in the table.
print $temp_R_fh 'meancons = mean(subset(attr, attr$consensus4.variant2 == 4)$freq)'. "\n"; # Get mean value for all consensus frequencies in the table.
print $temp_R_fh 'for (i in row.names(attr)){if (attr[i, "consensus4.variant2"] == 2){attr[i, "centfreq"] = attr[i, "freq"] / meanvar} else {attr[i, "centfreq"] = attr[i, "freq"] / meancons} }' . "\n"; # transform the frequency by dividing the frequency by the mean of either the consensus frequencies or variant frequencies, depending on what type you are. Store this in "centfreq" column, for "centered frequency"
$outfile = $prefix . "_$types_string" . "_network_scaled_freqs.pdf";
print $temp_R_fh "pdf(\"$outfile\", height=8, width=8)\n";
print $temp_R_fh 'plot.network(nw, displaylabels=T, vertex.col=attr$consensus4.variant2, label=names$allele, label.cex=0.5, vertex.cex=attr$centfreq+0.5, edge.label=format(edglab$V1, scientific=T, digits=1), edge.label.cex=0.5, edge.label.col=6, edge.lwd=-0.5*log(edglab$V1))' . "\n";
print $temp_R_fh "dev.off()\n";
close($temp_R_fh);
# if (check_for_Rscript()){
# # Then we didn't find Rscript on the path.
# my $copy_R_script_file_dir = $save_dir . "/$prefix" . "_network_plot_files";
# print STDERR "Didn't find Rscript on the PATH!\nCopying the R script and files to $copy_R_script_file_dir. You can run it manually like this:\ncd $copy_R_script_file_dir; Rscript temp_script.R\n";
# system("cp -r $tempdir $copy_R_script_file_dir");
# exit;
# }
# else {
system("$Rscript_loc $temp_R_script");
# }
if ($quasi){
# This is for making quasi-cliques with mgqce
# Files for quasi-cliques in mgqce
my $temp_graph = $tempdir."/graph.mgqce.txt";
my $temp_rgc = $tempdir."/graph.rgc.txt";
my $temp_key = $tempdir."/key.rgc.txt";
my $temp_query = $tempdir."/query.txt";
my $temp_clique = $tempdir."/cliques.txt";
my $fh = open_to_write($temp_graph);
foreach (@$stats){
if (scalar(@{$_}) > 1){
my ( $group, $sample, $type, $gene, $comparison, $first_pos, $first_cons, $first_var, $first_var_freq, $second_pos, $second_cons, $second_var, $second_var_freq, $AB, $Ab, $aB, $ab, $p, $OR, $fdr ) = @{$_};
if ($fdr < $FDR){
my $first = join "-", $type, $first_pos, $first_cons . "/" . $first_var;
my $second = join "-", $type, $second_pos, $second_cons . "/" . $second_var;
print $fh "$first\t$second\n";
}
}
}
close($fh);
# Make mgqce/DENSE compatible graph and key files with rgc.
my $cmd = "rgc $temp_graph $temp_rgc $temp_key";
if ($verbose){ print STDERR "Running commdand: $cmd\n"; }
system($cmd);
# Make query file from $temp_key.
my $count = `cat $temp_key | wc -l`;
chomp($count);
$count =~ s/^\s+//; # wc pads its output with whitespace
my $query_fh = open_to_write($temp_query);
my $key_fh = open_to_read($temp_key);
print $query_fh "$count\n";
while(<$key_fh>){
chomp;
my @line = split(/\s/);
print $query_fh "$line[0]\n";
}
close($query_fh);
close($key_fh);
# Run mgqce to get quasi-cliques
# E.g., mgqce graph.rgc.txt 0.75 0.01 query.txt out.txt
# mgqce <graph> <mu> <gamma> <query> <cliques>
# "mgqce enumerates all subgraphs of the graph where at least mu percent of subgraph is made up of "query" vertices and each vertices is adjacent to at least gamma percent of the other vertices."
# "Note, gamma must be at least 0.5. Also, the algorithm may enumerate the enriched cliques of a graph more "intelligently" (i.e., quickly) if you use a value of gamma = 0.999..., rather than gamma = 1."
# $mu: at least mu percent of the subgraph is made up of "query" vertices. My default 0.5 (set via --mu).
# $gamma: each vertex is adjacent to at least gamma percent of the other vertices. My default 0.01 (set via --gamma).
$cmd = "mgqce $temp_rgc $mu $gamma $temp_query $temp_clique > stdout.txt";
system($cmd);
system("cat $temp_clique"); # Print the quasi-cliques found by mgqce to stdout
}
# Now I need to convert back to variant information using the key
# Graph in R.
}
#-----------------------------------------------------------------------------
#-----------------------------------------------------------------------------
#-----------------------------------------------------------------------------
#-----------------------------------------------------------------------------
#-----------------------------------------------------------------------------
#-----------------------------------------------------------------------------
#-----------------------------------------------------------------------------
#-----------------------------------------------------------------------------
#-----------------------------------------------------------------------------
#-----------------------------------------------------------------------------
#-----------------------------------------------------------------------------
#-----------------------------------------------------------------------------
#Useful things
#my $total_lines = 0;
# $total_lines++; #somewhere in subroutine, if I run through the file first.
# my $lines_done = 0;
# $lines_done++; #in while loop, increment as each new line is processed
# if (($lines_done % 25) == 0){ #put this in the while loop too.
# print "Done processing $lines_done of $total_lines sequences.";
# &elapsed($start_time, ' Elapsed', $verbose);
# }