-
Notifications
You must be signed in to change notification settings - Fork 0
/
getsubgff.pl
93 lines (78 loc) · 2.15 KB
/
getsubgff.pl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
#!/usr/bin/perl
use FindBin qw ($Bin);
use Getopt::Long;
GetOptions(\%opt,"refseq:s","start:s","end:s","repeat","gff:s","output:s","help:s");
my $help=<<USAGE;
Get gff for defined ref and coordinate
perl $0 -refseq chr01 --start 100 --end 1000000 --gff gene.gff --output sub.gff > log
--repeat: if set to repeat, only filter by each line in gff file
USAGE
if (exists $opt{help} or keys %opt < 1){
print $help;
exit;
}
if ($opt{repeat}){
open IN, "$opt{gff}" or die "cat not open outfile: $opt{gff}\n";
open OUT, ">$opt{output}" or die "cat not open outfile: $opt{output}\n";
while(<IN>){
chomp $_;
my $line=$_;
if ($line=~/^#/){
print OUT "$line\n";
}else{
my @unit=split("\t",$line);
if ($unit[0] eq $opt{refseq} and $unit[3] >= $opt{start} and $unit[4] <= $opt{end}){
$unit[3] =$unit[3]-$opt{start}+1;
$unit[4] =$unit[4]-$opt{start}+1;
my $temp=join("\t",@unit);
print OUT "$temp\n";
}
}
}
close IN;
close OUT;
}else{
my $refgff=parseGFF($opt{gff});
open OUT, ">$opt{output}" or die "cat not open outfile: $opt{output}\n";
foreach my $g (keys %$refgff){
my $gff=$refgff->{$g};
my @line=split("\n",$gff);
my @unit=split("\t",$line[0]);
if ($unit[0] eq $opt{refseq} and $unit[3] >= $opt{start} and $unit[4] <= $opt{end}){
foreach my $l (@line){
my @array=split("\t",$l);
$array[3] =$array[3]-$opt{start}+1;
$array[4] =$array[4]-$opt{start}+1;
my $temp=join("\t",@array);
print OUT "$temp\n";
}
}
}
close OUT;
}
sub parseGFF
{
my ($gff)=@_;
my %hash; ##hash to store every record by key of Seq_id
my $seq;
my $id; ##ID for element
my $record;##all line for this record
open IN, "$gff" or die "$!";
while (<IN>){
chomp $_;
next if ($_=~/^#/);
my @unit=split("\t",$_);
if ($unit[2]=~/mRNA/){
$seq=$unit[0];
if ($unit[8]=~/ID=(.*?);/ or $unit[8] =~/ID=(.*)/){
$id=$1;
}
$record="$_\n";
$hash{$id}=$record;
}elsif($unit[0] eq $seq and $unit[8] =~ /Parent=$id/){
$hash{$id}.="$_\n";
}
}
close IN;
return \%hash;
}