-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathgffSort.pl
121 lines (91 loc) · 3.06 KB
/
gffSort.pl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
#!/usr/bin/perl
use strict;
use Getopt::Std;
my %opts;
getopts('g:o:n', \%opts);
if (!$opts{'g'}) {
print "No gff file has been provided (-g flag is mandatory)\n";
&usage;
}
elsif (!(-e $opts{'g'})){
print STDERR "The file $opts{'g'} does not exist\n";
&usage;
}
my $outFile = $opts{'o'} || "-";
open (OUT,">$outFile") or die "Can't open $outFile\n";
my (@genes,%allLines,$header);
open (FILE,$opts{'g'}) or die "Can't open $opts{'g'}\n";
while (my $line = <FILE>){
next if $line=~/^[\n\r\s]+$/;
$header .= $line if $line =~/^#/;
if ($line =~/ID=([^,;\n]+)/i){
push (@genes,$line);
push (@{$allLines{$1}},$line);
#print "ID=$1\n";
}
if ($line =~/Parent=([^,;\n]+)/i){
push (@{$allLines{$1}},$line);
#print "count=".@{$allLines{$1}}."\n";
#print "Parent=$1\n";
}
}
close FILE;
my %typeOrder = qw(gene 1 pseudogene 2 mRNA 3 pseudogenic_transcript 4 exon 5 CDS 6);
print OUT $header;
@genes = sort msort (@genes);
foreach my $geneLine (@genes){
$geneLine =~ /ID=([^,;\n]+)/;
my $gName = $1;
my @lines = @{$allLines{$gName}};
#print "linecount=".@lines."\n";
@lines = sort msort (@lines);
#print OUT "gname=$gName, lines=".@lines."\n";
print OUT join("",@lines);
}
#print STDERR @genes;
#exit;
#my @orderedGff = sort msort @lines;
#my @tempFile = split(/./,$ARGV[0]);
#my $oFile = $tempFile[0],"_sorted.gff";
#print $oFile;
#print OUT @orderedGff;
##########################################################################
# Begin Subroutines
##########################################################################
sub msort{
my $aComment = $a=~/^#/ ? 1 : -1;
my $bComment = $b=~/^#/ ? 1 : -1;
my ($aScaf, $aTypeText, $aStart, $aStrand) = (split(/\t/,$a))[0,2,3,6];
my ($bScaf, $bTypeText, $bStart, $bStrand) = (split(/\t/,$b))[0,2,3,6];
if ($opts{'n'}){
$aScaf=~s/[^\d]+//g;
$bScaf=~s/[^\d]+//g;
}
my $aType = $typeOrder{$aTypeText} ? $typeOrder{$aTypeText} : 20;
my $bType = $typeOrder{$bTypeText} ? $typeOrder{$bTypeText} : 20;
$aStrand = $aStrand eq "+" ? 1 : -1;
$bStrand = $bStrand eq "+" ? 1 : -1;
$bComment <=> $aComment || $aScaf <=> $bScaf || $aStart <=> $bStart || $aType <=> $bType || $aStrand <=> $bStrand;
}
sub usage()
{
print STDERR q
(
Usage: snapSort.pl <gff File>
THIS SCRIPT SORT OF WORKS BUT NOT FOR COMPLICATED GFF FILES. NEEDS SOME WORK.
Does a multifield sort on a .gff file.
Sorts in the following order: scaffold/chromosome, start position, type, strand (+ > -).
Scaffold/Chromosome will be sorted alphabetically (should work fine with numbers unt).
Type is sorted as follows:
[#comments] > [gene|pseudogene] > [mRNA|pseudogenic_transcript] > [exon|CDS] > [all other]
Blank lines will not be printed.
ONLY WORKS IF ALL LINES HAVE EITHER ID OR PARENT TAGS. This is necessary to keep genes in blocks.
Sorting just by position doesn't work when genes overlap.
Required:
-g Gff file
Optional:
-n (flag) Tries to sort chomosomes numerically by stripping all non-numeric characters from the field
-o file. Output file. Default is STDOUT
);
exit;
}