-
Notifications
You must be signed in to change notification settings - Fork 4
/
MetaPrism_taxon_centrifuge.pl
96 lines (89 loc) · 2.84 KB
/
MetaPrism_taxon_centrifuge.pl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
# Author: Jiwoong Kim ([email protected])
use strict;
use warnings;
local $SIG{__WARN__} = sub { die "ERROR in $0: ", $_[0] };
use Cwd 'abs_path';
use Getopt::Long qw(:config no_ignore_case);
GetOptions(
'h' => \(my $help = ''),
'p=i' => \(my $threads = 1),
);
if($help || scalar(@ARGV) == 0) {
die <<EOF;
Usage: perl MetaPrism_taxon_centrifuge.pl [options] MetaPrism_gene.region.txt genome.fasta centrifuge.index > MetaPrism_gene.region.taxon.txt
Options: -h display this help message
-p INT number of threads [$threads]
EOF
}
foreach('centrifuge') {
die "ERROR in $0: '$_' is not executable.\n" unless(-x getCommandPath($_));
}
my ($regionFile, $genomeFastaFile, $centrifugeIndex) = @ARGV;
my %genomeSequenceHash = ();
{
open(my $reader, $genomeFastaFile);
my $contig = '';
while(my $line = <$reader>) {
chomp($line);
if($line =~ /^>(\S*)/) {
$contig = $1;
} else {
$genomeSequenceHash{$contig} .= $line;
}
}
close($reader);
}
(my $regionFastaFile = $regionFile) =~ s/(\.txt)?$/\.fasta/;
{
open(my $reader, $regionFile);
open(my $writer, "> $regionFastaFile");
chomp(my $line = <$reader>);
my @columnList = split(/\t/, $line);
while(my $line = <$reader>) {
chomp($line);
my %tokenHash = ();
@tokenHash{@columnList} = split(/\t/, $line);
my $region = join('|', my ($contig, $start, $end, $strand) = @tokenHash{'contig', 'start', 'end', 'strand'});
my $sequence = substr($genomeSequenceHash{$contig}, $start - 1, $end - ($start - 1));
$sequence = uc($sequence);
($sequence = reverse($sequence)) =~ tr/ACGT/TGCA/ if($strand eq '-');
print $writer ">$region\n";
print $writer "$sequence\n";
}
close($reader);
close($writer);
}
my %regionTaxonReferenceHash = ();
{
open(my $reader, "centrifuge -f -p $threads -x $centrifugeIndex -U $regionFastaFile --report-file /dev/null |");
chomp(my $line = <$reader>);
my @columnList = split(/\t/, $line);
while(my $line = <$reader>) {
my %tokenHash = ();
@tokenHash{@columnList} = split(/\t/, $line);
$regionTaxonReferenceHash{$tokenHash{'readID'}} = [@tokenHash{'taxID', 'seqID'}];
}
close($reader);
}
{
open(my $reader, $regionFile);
chomp(my $line = <$reader>);
my @columnList = split(/\t/, $line);
print join("\t", @columnList, 'taxon', 'reference'), "\n";
while(my $line = <$reader>) {
chomp($line);
my %tokenHash = ();
@tokenHash{@columnList} = split(/\t/, $line);
my $region = join('|', @tokenHash{'contig', 'start', 'end', 'strand'});
@tokenHash{'taxon', 'reference'} = defined($_ = $regionTaxonReferenceHash{$region}) ? @$_ : ('', '');
print join("\t", @tokenHash{@columnList, 'taxon', 'reference'}), "\n";
}
close($reader);
}
sub getCommandPath {
my ($command) = @_;
chomp($command = `which $command`) if($command ne '' && $command !~ /\//);
$command =~ s/^~\//$ENV{'HOME'}\//;
$command = abs_path($command) if($command ne '');
return $command;
}