forked from KorfLab/Centromere_repeat_paper
-
Notifications
You must be signed in to change notification settings - Fork 1
/
trf_hos_finder.pl
executable file
·229 lines (184 loc) · 8.57 KB
/
trf_hos_finder.pl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
#!/usr/bin/perl
#
# Process a TRF dat file to find potential candidates for higher order structure (HOS)
#
# Author: Keith Bradnam, Genome Center, UC Davis
# This work is licensed under a Creative Commons Attribution-NonCommercial-ShareAlike 3.0 Unported License.
#
# Last updated by: $Author$
# Last updated on: $Date$
use strict; use warnings;
my $usage = "
trf_hos_finder.pl
-----------------
Usage: trf_hos_finder.pl <trf dat file>
Processes a TRF *.dat output file to look for candidate higher order structure (HOS)
Looks for following criteria:
1) More than 1 tandem repeat in a sequence
2) multiple tandem repeats occupy approximately same range in sequence
3) One repeat is approximately double the size of a shorter repeat
4) TRF score of longer repeat is at least 10% higher than score of shorter repeat
5) average %identity of repeat units within longer repeat is >2% higher than in shorter repeat
Reports details of shorter and longer repeats that satisfy criteria 1–3. Additionally adds
'hos' to penultimae column of output if either criterion 4 or 5 is met. Adds 'HOS' to output
if conditions 4 AND 5 are met (these are the most likely contenders for HOS). Use sequence
IDs to go back to raw sequence files to investigate further.
Sample output (tab delimited)
-----------------------------
In this example, sequence 2 is a strong case for HOS, and sequence 1 is a weaker case. The
'hos' or 'HOS' information is only included in the line corresponding to the longer of the
two repeats.
ID LEVEL START END LENGTH COPIES SCORE %IDENT HOS? SEQ_ID
1 2 337 844 122 4.2 299 68 gnl|ti|2250106555 GGZG8286.b1
1 2 337 844 244 2.1 330 69 hos gnl|ti|2250106555 GGZG8286.b1
2 2 51 948 164 5.5 590 70 gnl|ti|2250104470 GGZG7125.g1
2 2 51 951 328 2.8 735 86 HOS gnl|ti|2250104470 GGZG7125.g1
3 2 54 641 125 4.7 467 83 gnl|ti|2250102961 GGZG6288.g1
3 2 54 641 250 2.3 479 83 gnl|ti|2250102961 GGZG6288.g1
";
die "$usage" unless (@ARGV == 1);
# keep track of how many sequences in TRF dat file
my $seq_counter = 0;
# keep track of how many repeats are found in any one sequence
# will store this data in an array
my $repeat_counter = 0; # count repeats belonging to each read
my @repeat_data;
# want to capture the (slightly processed) FASTA headers in the trf output
my $header;
# how different can start and end coordinates of 2 diff tandem repeats
# be in order to be considered occupying the same span? Default = 10 nt.
my $offset = 10;
# master hash to store all of the data for potential HOS repeats
my %hos;
# how much higher should the score of the longer repeat be in relation to shorter
# repeat. Try 15%
my $score_threshold = 1.15;
# and what about the absolute difference in %identity
# Using criteria of needing +5% in longer repeat
my $identity_threshold = 5;
################################
# MAIN LOOP OVER DAT FILE
################################
REPEAT: while(<>){
# skip blank likes
next if (m/^$/);
# extract info from Sequence headers
if (m/^Sequence: (.*)/) {
$header = $1;
$seq_counter++;
# process previous repeats (if not at first sequence)
process_repeats($repeat_counter) unless ($seq_counter == 1);
# reset certain counters and data. Want to track how many repeats are seen within each sequence
$repeat_counter = 0;
@repeat_data = ();
}
# the main output that we are interested in will all be on one line which starts with various
# numerical details of the repeat
next unless (m/^\d+ \d+ \d+ \d+\.\d /);
# capture repeat data into various variables (most will be unsused)
my ($start,$end,$period,$copies,$length,$identity,$indels,$score,$a,$c,$g,$t,$entropy,$seq) = split(/\s+/);
# if we get this far then we will capture some of the repeat data in a hash
# this is to potentially compare to other repeats in the same sequence
# duplicating some info just to be lazy later on
$repeat_data[$repeat_counter]{start} = "$start";
$repeat_data[$repeat_counter]{end} = "$end";
$repeat_data[$repeat_counter]{length} = "$length";
$repeat_data[$repeat_counter]{key} = "$header,$seq_counter";
$repeat_data[$repeat_counter]{info} = "$start,$end,$length,$copies,$score,$identity";
$repeat_counter++;
}
# process repeats for last repeat
process_repeats($repeat_counter);
print "ID\tLEVEL\tSTART\tEND\tLENGTH\tCOPIES\tSCORE\t%IDENT\tHOS?\tSEQ_ID\n";
my $counter = 1;
HOS: foreach my $key (keys %hos){
my ($seq_id, $seq_counter) = split(/,/,$key);
# only want to look at sequences with multiple levels of (potential) HOS
my $level = @{$hos{$key}};
next if ($level == 1);
# now loop over the different levels of HOS present
LEVELS: for (my $i = 0; $i < @{$hos{$key}}; $i++){
my ($start, $end, $length, $copies, $score, $identity) = split(/,/, ${$hos{$key}}[$i]);
if ($level == 2){
# first grab next tandem repeat in pair
my ($n_start, $n_end, $n_length, $n_copies, $n_score, $n_identity) = split(/,/, ${$hos{$key}}[$i+1]);
# is score of longer repeat 10% greater compared to shorter repeat?
# is average percentage identity of longer repeat 2% greater compared to shorter repeat?
# print 'hos' or 'HOS' in final output to signify weak or high confidence that this is a HOS repeat
# also need to double check whether first repeat in pair of repeats is shorter (sometimes the longer
# repeat is reported first). If this happens, reverse scores and identities
my ($s1, $s2, $i1, $i2) = ($score, $n_score, $identity, $n_identity);
my $longer_score;
my $shorter_score;
my $longer_ident;
my $shorter_ident;
#ID LEVEL START END LENGTH COPIES SCORE %IDENT HOS? SEQ_ID
#1 2 73 1098 273 3.7 770 79 gnl|ti|1516245357 FAPA551217.y1
#1 2 82 1098 137 7.4 634 67 gnl|ti|1516245357 FAPA551217.y1
if($length > $n_length){
($longer_score, $shorter_score) = ($score, $n_score);
($longer_ident, $shorter_ident) = ($identity, $n_identity);
} else{
($longer_score, $shorter_score) = ($n_score, $score);
($longer_ident, $shorter_ident) = ($n_identity, $identity);
}
my $hos_field;
if ((($longer_score / $shorter_score) > $score_threshold) and
(($longer_ident - $shorter_ident) > $identity_threshold)){
$hos_field = "HOS";
}
elsif ((($longer_score / $shorter_score) > $score_threshold) or
(($longer_ident - $shorter_ident) > $identity_threshold)){
$hos_field = "hos";
} else{
$hos_field = "";
}
# now print info for current repeat and next one, and increment value of $i
print "$counter\t$level\t$start\t$end\t$length\t$copies\t$score\t$identity\t$hos_field\t$seq_id\n";
print "$counter\t$level\t$n_start\t$n_end\t$n_length\t$n_copies\t$n_score\t$n_identity\t$hos_field\t$seq_id\n";
$i++;
} else{
print "$counter\t$level\t$start\t$end\t$length\t$copies\t$score\t$identity\t???\t$seq_id\n";
}
}
$counter++;
}
sub process_repeats{
my ($repeats) = @_;
# will add index values of potential HOS repeats to hash
my %potential_hos;
# nested loop through previous repeats. Want to see if any are multiples of another
for (my $i = 0; $i < $repeats; $i++){
for (my $j = $i+1; $j < $repeats; $j++){
# want to see if current start/end coordinates are identical or within $offset nt of previous repeat
next unless (abs($repeat_data[$i]{start} - $repeat_data[$j]{start}) <= $offset);
next unless (abs($repeat_data[$i]{end} - $repeat_data[$j]{end}) <= $offset);
# $ratio is the ratio of the longest repeat to the shorter one, need longer repeat to be at least
# 1.85x length of shorter repeat. How we calculate this depends on whether current repeat length is
# longer than previous one or not
my $ratio;
if ($repeat_data[$i]{length} < $repeat_data[$j]{length}){
$ratio = $repeat_data[$j]{length}/ $repeat_data[$i]{length};
} else{
$ratio = $repeat_data[$i]{length} / $repeat_data[$j]{length};
}
# need to factor in repeats which are not simple doublings in length
# e.g. compare 100 nt vs 300 nt, simple ratio is 3:1, but want to rule out
# lengths of 250 which are not multiples. So consider both ratio
# and processed ratio
my $processed_ratio = $ratio;
$processed_ratio =~ s/\d+\.(\d+)/0\.$1/;
next unless (($processed_ratio < 0.15 or $processed_ratio > 0.85) && ($ratio > 1.8));
# we should now be looking at two repeats which exhibit HOS
# store details as there may be other repeats in this sequence which overlap
$potential_hos{$i} = 1;
$potential_hos{$j} = 1;
}
}
# can now add details of each repeat to main hash
foreach my $key (keys %potential_hos){
my $hos_key = $repeat_data[$key]{key};
my $hos_info = $repeat_data[$key]{info};
push(@{$hos{$hos_key}}, $hos_info);
}
}