-
Notifications
You must be signed in to change notification settings - Fork 0
/
get_seq.pl
110 lines (96 loc) · 2.31 KB
/
get_seq.pl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
#!/usr/bin/perl -w
use strict;
my $infa=shift;
my $ingff=shift;
my $inintron=shift;
my $half=shift;
$half ||=50;
my $species=$1 if ($infa=~/(\S+).fa/);
my %gene_group;
open IN, "$inintron" || die "$!\n";
$/="GROUP IIB";<IN>;$/="\n";
while(<IN>){
chomp;
#print "$_\n";
next unless ($_);
if (/- (\S+) \(intron (\d+)\)/){
push @{$gene_group{$1}}, $2;
}elsif(/- (\S+)/){
push @{$gene_group{$1}}, "0";
}
}
close IN;
my $seq;
open IN, "$infa" || die "$!\n";
$/=">";<IN>;$/="\n";
while(<IN>){
my $scaf=$1 if(/^(\S+)/);
$/='>';
$seq=<IN>;
chomp($seq);
$seq=~s/\s+//g;
$seq=~tr/atcg/ATCG/;
$/="\n";
#$seq{$scaf}=$seq;
}
close IN;
my %gff; my %strand;
open IN, "$ingff" || die "$!\n";
while(<IN>){
chomp;
my @info=split;
if($info[8]=~/^Parent=(\S+?);/){
push @{$gff{$1}}, ($info[3], $info[4]);
$strand{$1}=$info[6];
}
}
close IN;
#foreach my $gene(keys %gene_group){
#print "$gene\n";
open OUT, ">$ingff.group2.intron5.$half.fa" || die "$!\n";
open OUT2, ">$ingff.group2.intron3.$half.fa" || die "$!\n";
foreach my $gid(keys %gff){
#print "$gid";
next if ($gid =~ /rps12/);
next if ($gid =~ /rRNA/);
next if ($gid =~ /tRNA/);
my $gene=$1 if($gid=~/(\S+?)\_/);
#print "$gene\n";
next unless (exists $gene_group{$gene});
@{$gff{$gid}} = sort {$a <=> $b} @{$gff{$gid}};
@{$gff{$gid}} = reverse @{$gff{$gid}} if ($strand{$gid} eq '-');
for(my $i=1; $i<@{$gff{$gid}}-1; $i++){
my $type;
#if ($i==0){
# $type="translation_head";
#}elsif($i==@{$gff{$gid}}-1){
# $type="translation_tail";
#}elsif($i % 2 == 1){
# $type="5-intron";
#}else{
# $type="3-intron";
#}
#my $output_head =">$gid\_$gff{$gid}[$i]\_$strand{$gid}";
my $block_s=$gff{$gid}[$i]-$half;
my $block_seq=substr($seq, $block_s-1, $half+1+$half);
#print "$block_seq\n";
if ($strand{$gid} eq '-'){
$block_seq=~ tr/AGCTagct/TCGAtcga/;
$block_seq= reverse $block_seq;
}
#my $output_head="$block_seq";
if ($i==0){
$type="translation_head";
}elsif($i==@{$gff{$gid}}-1){
$type="translation_tail";
}elsif($i % 2 == 1){
$type="5-intron";
print OUT ">$species\_$gid\_$gff{$gid}[$i]\_$strand{$gid}\_$type\n$block_seq\n";
}else{
$type="3-intron";
print OUT2 ">$species\_$gid\_$gff{$gid}[$i]\_$strand{$gid}\_$type\n$block_seq\n";
}
}
#print "\n";
}
#}