forked from weizhongli/cdhit
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathclstr_reps_faa_rev.pl
executable file
·57 lines (47 loc) · 1.13 KB
/
clstr_reps_faa_rev.pl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
#!/usr/bin/perl
# output single fasta file
# for each cluster output at least $cutoff seqs
$clstr_file = shift;
$fasta_file = shift;
$cutoff = shift;
die "" unless ((-e $clstr_file) and (-e $fasta_file) and ($cutoff>0));
my ($i, $j, $k, $cmd);
my %rep_ids = (); #rep_ids to be skipped
open(TMP, $clstr_file) || die "Can not open file";
my @gis = ();
my $clstr_id = "";
while(my $ll=<TMP>) {
if ($ll =~ /^>/ ) {
if (@gis) {
my $no = $#gis+1;
for ($i=$cutoff; $i<$no; $i++) { $rep_ids{$gis[$i]}=1; }
}
@gis = ();
}
else {
chop($ll);
if ($ll =~ />(.+)\.\.\./ ) {
my $id = $1;
if ($ll =~ /\*$/) { unshift(@gis, $id);}
else { push(@gis, $id);}
}
}
}
if (@gis) {
my $no = $#gis+1;
for ($i=$cutoff; $i<$no; $i++) { $rep_ids{$gis[$i]}=1; }
}
close(TMP);
#########################################################
my $flag = 0;
open(TMP, $fasta_file) || die;
while($ll = <TMP>) {
chomp($ll);$ll=~s/\r$//;
if ($ll =~ /^>/) {
$gi = substr($ll,1);
$gi =~ s/\s.+$//;
$flag = ($rep_ids{$gi}) ? 0 : 1;
}
if ($flag) {print "$ll\n";}
}
close(TMP);