Skip to content

Commit

Permalink
fix #448 long term issue related to bioperl parser( for GFF2 and GFF2…
Browse files Browse the repository at this point in the history
….5) (#449)

* fix #448, a long-standing issue where the BioPerl parser for GFF2 and GFF2.5 adds an extra empty ID attribute that messes up AGAT

* increment AGAT to 1.4
  • Loading branch information
Juke34 authored Apr 5, 2024
1 parent a23b8ea commit e1ed07f
Show file tree
Hide file tree
Showing 6 changed files with 90 additions and 4 deletions.
2 changes: 1 addition & 1 deletion lib/AGAT/AGAT.pm
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@ use AGAT::Utilities;
use AGAT::PlotR;
use Bio::Tools::GFF;

our $VERSION = "v1.3.3";
our $VERSION = "v1.4";
our @ISA = qw(Exporter);
our @EXPORT = qw(get_agat_header print_agat_version get_agat_config handle_levels);
sub import {
Expand Down
10 changes: 9 additions & 1 deletion lib/AGAT/OmniscientI.pm
Original file line number Diff line number Diff line change
Expand Up @@ -1345,8 +1345,16 @@ sub _check_uniq_id_feature{

my $uID=undef;
my $primary_tag = lc($feature->primary_tag);

my $id=undef;

# When the BioPerl GFF2 or GFF2.5 parser is used (at least up to version 1.7.8) and no ID attribute is present, it adds an empty ID attribute,
# which is problematic (all features end up with the same ID!), so we remove it.
if($feature->has_tag('ID')){ #has the tag
if ($feature->_tag_value('ID') eq " "){
$feature->remove_tag('ID');
}
}

if($feature->has_tag('ID')){ #has the tag
$id = $feature->_tag_value('ID');
}
Expand Down
22 changes: 20 additions & 2 deletions t/gff_other.t
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@

use strict;
use warnings;
use Test::More tests => 6;
use Test::More tests => 8;

=head1 DESCRIPTION
Expand Down Expand Up @@ -92,4 +92,22 @@ system("$script --g $input_folder/issue441.gtf -o $pathtmp 2>&1 1>/dev/null");
ok( system("diff $pathtmp $correct_output") == 0, "issue441 check");

unlink $pathtmp;
unlink $config;
unlink $config;

# --------- Issue 448: bioperl adds an extra empty ID attribute that messes up AGAT (only when input is parsed with version 2 and 2.5) ----
# The same GTF input is converted twice and each result is diffed against a
# reference file: first with a config exposed for GTF output, then again after
# that config has been unlinked.
$script = $script_prefix."bin/agat_convert_sp_gxf2gxf.pl";
$correct_output = "$output_folder/issue448.gtf";

# Case 1: expose a config requesting GTF output, then run the conversion.
system("$script_agat config --expose --output_format gtf 2>&1 1>/dev/null");
system("$script --g $input_folder/issue448.gtf -o $pathtmp 2>&1 1>/dev/null");

# diff exits with 0 only when the produced file matches the reference file.
ok( system("diff $pathtmp $correct_output") == 0, "issue448 gtf output check");

unlink $pathtmp;
unlink $config;

# Case 2: same input, compared against the GFF reference.
# NOTE(review): presumably $config is the config file exposed above, so this
# run falls back to AGAT's default output format -- confirm.
$correct_output = "$output_folder/issue448.gff";
system("$script --g $input_folder/issue448.gtf -o $pathtmp 2>&1 1>/dev/null");

ok( system("diff $pathtmp $correct_output") == 0, "issue448 gff output check");
unlink $pathtmp;
17 changes: 17 additions & 0 deletions t/gff_other/in/issue448.gtf
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
chr10p ambMex60DD gene 697815 769805 1000 - . gene_id "AMEX60DD000004"; gene_name "LOC115462503 [nr]|ZNF268 [hs]";
chr10p ambMex60DD transcript 697815 769805 1000 - . gene_id "AMEX60DD000004"; transcript_id "LOC115462503 [nr]|ZNF268 [hs]|AMEX60DD201000004.1"; gene_name "LOC115462503 [nr]|ZNF268 [hs]|AMEX60DD201000004.1"; homolog "XP_030048360.1"; ORF_type "Putative short PTC";
chr10p ambMex60DD exon 697815 702473 1000 - . gene_id "AMEX60DD000004"; transcript_id "LOC115462503 [nr]|ZNF268 [hs]|AMEX60DD201000004.1"; exon_number "1";
chr10p ambMex60DD exon 769665 769805 1000 - . gene_id "AMEX60DD000004"; transcript_id "LOC115462503 [nr]|ZNF268 [hs]|AMEX60DD201000004.1"; exon_number "2";
chr10p ambMex60DD CDS 699525 701216 1000 - . gene_id "AMEX60DD000004"; transcript_id "LOC115462503 [nr]|ZNF268 [hs]|AMEX60DD201000004.1";
chr10p ambMex60DD gene 770040 1018740 1000 + . gene_id "AMEX60DD000005"; gene_name "AMEX60DD000005";
chr10p ambMex60DD transcript 770040 1018740 1000 + . gene_id "AMEX60DD000005"; transcript_id "AMEX60DD201000005.1"; gene_name "AMEX60DD201000005.1";
chr10p ambMex60DD exon 770040 770424 1000 + . gene_id "AMEX60DD000005"; transcript_id "AMEX60DD201000005.1"; exon_number "1";
chr10p ambMex60DD exon 801485 801606 1000 + . gene_id "AMEX60DD000005"; transcript_id "AMEX60DD201000005.1"; exon_number "2";
chr10p ambMex60DD exon 915118 915167 1000 + . gene_id "AMEX60DD000005"; transcript_id "AMEX60DD201000005.1"; exon_number "3";
chr10p ambMex60DD exon 1018684 1018740 1000 + . gene_id "AMEX60DD000005"; transcript_id "AMEX60DD201000005.1"; exon_number "4";
chr10p ambMex60DD transcript 770040 961083 1000 + . gene_id "AMEX60DD000005"; transcript_id "AMEX60DD201000005.2"; gene_name "AMEX60DD201000005.2"; ORF_type "Predicted";
chr10p ambMex60DD exon 770040 770414 1000 + . gene_id "AMEX60DD000005"; transcript_id "AMEX60DD201000005.2"; exon_number "1";
chr10p ambMex60DD exon 801485 801606 1000 + . gene_id "AMEX60DD000005"; transcript_id "AMEX60DD201000005.2"; exon_number "2";
chr10p ambMex60DD exon 877425 877977 1000 + . gene_id "AMEX60DD000005"; transcript_id "AMEX60DD201000005.2"; exon_number "3";
chr10p ambMex60DD exon 915118 915167 1000 + . gene_id "AMEX60DD000005"; transcript_id "AMEX60DD201000005.2"; exon_number "4";
chr10p ambMex60DD exon 960764 961083 1000 + . gene_id "AMEX60DD000005"; transcript_id "AMEX60DD201000005.2"; exon_number "5";
21 changes: 21 additions & 0 deletions t/gff_other/out/issue448.gff
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
##gff-version 3
chr10p ambMex60DD gene 697815 769805 1000 - . ID=AMEX60DD000004;gene_id=AMEX60DD000004;gene_name=LOC115462503 [nr]|ZNF268 [hs]
chr10p ambMex60DD transcript 697815 769805 1000 - . ID=LOC115462503 [nr]|ZNF268 [hs]|AMEX60DD201000004.1;Parent=AMEX60DD000004;oRF_type=Putative short PTC;gene_id=AMEX60DD000004;gene_name=LOC115462503 [nr]|ZNF268 [hs]|AMEX60DD201000004.1;homolog=XP_030048360.1;transcript_id=LOC115462503 [nr]|ZNF268 [hs]|AMEX60DD201000004.1
chr10p ambMex60DD exon 697815 702473 1000 - . ID=agat-exon-1;Parent=LOC115462503 [nr]|ZNF268 [hs]|AMEX60DD201000004.1;exon_number=1;gene_id=AMEX60DD000004;transcript_id=LOC115462503 [nr]|ZNF268 [hs]|AMEX60DD201000004.1
chr10p ambMex60DD exon 769665 769805 1000 - . ID=agat-exon-2;Parent=LOC115462503 [nr]|ZNF268 [hs]|AMEX60DD201000004.1;exon_number=2;gene_id=AMEX60DD000004;transcript_id=LOC115462503 [nr]|ZNF268 [hs]|AMEX60DD201000004.1
chr10p ambMex60DD CDS 699525 701216 1000 - . ID=agat-cds-1;Parent=LOC115462503 [nr]|ZNF268 [hs]|AMEX60DD201000004.1;gene_id=AMEX60DD000004;transcript_id=LOC115462503 [nr]|ZNF268 [hs]|AMEX60DD201000004.1
chr10p AGAT five_prime_UTR 701217 702473 . - . ID=agat-five_prime_utr-1;Parent=LOC115462503 [nr]|ZNF268 [hs]|AMEX60DD201000004.1;exon_number=1;gene_id=AMEX60DD000004;transcript_id=LOC115462503 [nr]|ZNF268 [hs]|AMEX60DD201000004.1
chr10p AGAT five_prime_UTR 769665 769805 . - . ID=agat-five_prime_utr-2;Parent=LOC115462503 [nr]|ZNF268 [hs]|AMEX60DD201000004.1;exon_number=1;gene_id=AMEX60DD000004;transcript_id=LOC115462503 [nr]|ZNF268 [hs]|AMEX60DD201000004.1
chr10p AGAT three_prime_UTR 697815 699524 . - . ID=agat-three_prime_utr-1;Parent=LOC115462503 [nr]|ZNF268 [hs]|AMEX60DD201000004.1;exon_number=1;gene_id=AMEX60DD000004;transcript_id=LOC115462503 [nr]|ZNF268 [hs]|AMEX60DD201000004.1
chr10p ambMex60DD gene 770040 1018740 1000 + . ID=AMEX60DD000005;gene_id=AMEX60DD000005;gene_name=AMEX60DD000005
chr10p ambMex60DD transcript 770040 961083 1000 + . ID=AMEX60DD201000005.2;Parent=AMEX60DD000005;oRF_type=Predicted;gene_id=AMEX60DD000005;gene_name=AMEX60DD201000005.2;transcript_id=AMEX60DD201000005.2
chr10p ambMex60DD exon 770040 770414 1000 + . ID=agat-exon-7;Parent=AMEX60DD201000005.2;exon_number=1;gene_id=AMEX60DD000005;transcript_id=AMEX60DD201000005.2
chr10p ambMex60DD exon 801485 801606 1000 + . ID=agat-exon-8;Parent=AMEX60DD201000005.2;exon_number=2;gene_id=AMEX60DD000005;transcript_id=AMEX60DD201000005.2
chr10p ambMex60DD exon 877425 877977 1000 + . ID=agat-exon-9;Parent=AMEX60DD201000005.2;exon_number=3;gene_id=AMEX60DD000005;transcript_id=AMEX60DD201000005.2
chr10p ambMex60DD exon 915118 915167 1000 + . ID=agat-exon-10;Parent=AMEX60DD201000005.2;exon_number=4;gene_id=AMEX60DD000005;transcript_id=AMEX60DD201000005.2
chr10p ambMex60DD exon 960764 961083 1000 + . ID=agat-exon-11;Parent=AMEX60DD201000005.2;exon_number=5;gene_id=AMEX60DD000005;transcript_id=AMEX60DD201000005.2
chr10p ambMex60DD transcript 770040 1018740 1000 + . ID=AMEX60DD201000005.1;Parent=AMEX60DD000005;gene_id=AMEX60DD000005;gene_name=AMEX60DD201000005.1;transcript_id=AMEX60DD201000005.1
chr10p ambMex60DD exon 770040 770424 1000 + . ID=agat-exon-3;Parent=AMEX60DD201000005.1;exon_number=1;gene_id=AMEX60DD000005;transcript_id=AMEX60DD201000005.1
chr10p ambMex60DD exon 801485 801606 1000 + . ID=agat-exon-4;Parent=AMEX60DD201000005.1;exon_number=2;gene_id=AMEX60DD000005;transcript_id=AMEX60DD201000005.1
chr10p ambMex60DD exon 915118 915167 1000 + . ID=agat-exon-5;Parent=AMEX60DD201000005.1;exon_number=3;gene_id=AMEX60DD000005;transcript_id=AMEX60DD201000005.1
chr10p ambMex60DD exon 1018684 1018740 1000 + . ID=agat-exon-6;Parent=AMEX60DD201000005.1;exon_number=4;gene_id=AMEX60DD000005;transcript_id=AMEX60DD201000005.1
22 changes: 22 additions & 0 deletions t/gff_other/out/issue448.gtf
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
##gtf-version X
# GFF-like GTF i.e. not checked against any GTF specification. Conversion based on GFF input, standardised by AGAT.
chr10p ambMex60DD gene 697815 769805 1000 - . gene_id "AMEX60DD000004"; ID "AMEX60DD000004"; gene_name "LOC115462503 [nr]|ZNF268 [hs]";
chr10p ambMex60DD transcript 697815 769805 1000 - . gene_id "AMEX60DD000004"; transcript_id "LOC115462503 [nr]|ZNF268 [hs]|AMEX60DD201000004.1"; ID "LOC115462503 [nr]|ZNF268 [hs]|AMEX60DD201000004.1"; ORF_type "Putative short PTC"; Parent "AMEX60DD000004"; gene_name "LOC115462503 [nr]|ZNF268 [hs]|AMEX60DD201000004.1"; homolog "XP_030048360.1";
chr10p ambMex60DD exon 697815 702473 1000 - . gene_id "AMEX60DD000004"; transcript_id "LOC115462503 [nr]|ZNF268 [hs]|AMEX60DD201000004.1"; ID "agat-exon-1"; Parent "LOC115462503 [nr]|ZNF268 [hs]|AMEX60DD201000004.1"; exon_number "1";
chr10p ambMex60DD exon 769665 769805 1000 - . gene_id "AMEX60DD000004"; transcript_id "LOC115462503 [nr]|ZNF268 [hs]|AMEX60DD201000004.1"; ID "agat-exon-2"; Parent "LOC115462503 [nr]|ZNF268 [hs]|AMEX60DD201000004.1"; exon_number "2";
chr10p ambMex60DD CDS 699525 701216 1000 - . gene_id "AMEX60DD000004"; transcript_id "LOC115462503 [nr]|ZNF268 [hs]|AMEX60DD201000004.1"; ID "agat-cds-1"; Parent "LOC115462503 [nr]|ZNF268 [hs]|AMEX60DD201000004.1";
chr10p AGAT five_prime_UTR 701217 702473 . - . gene_id "AMEX60DD000004"; transcript_id "LOC115462503 [nr]|ZNF268 [hs]|AMEX60DD201000004.1"; ID "agat-five_prime_utr-1"; Parent "LOC115462503 [nr]|ZNF268 [hs]|AMEX60DD201000004.1"; exon_number "1";
chr10p AGAT five_prime_UTR 769665 769805 . - . gene_id "AMEX60DD000004"; transcript_id "LOC115462503 [nr]|ZNF268 [hs]|AMEX60DD201000004.1"; ID "agat-five_prime_utr-2"; Parent "LOC115462503 [nr]|ZNF268 [hs]|AMEX60DD201000004.1"; exon_number "1";
chr10p AGAT three_prime_UTR 697815 699524 . - . gene_id "AMEX60DD000004"; transcript_id "LOC115462503 [nr]|ZNF268 [hs]|AMEX60DD201000004.1"; ID "agat-three_prime_utr-1"; Parent "LOC115462503 [nr]|ZNF268 [hs]|AMEX60DD201000004.1"; exon_number "1";
chr10p ambMex60DD gene 770040 1018740 1000 + . gene_id "AMEX60DD000005"; ID "AMEX60DD000005"; gene_name "AMEX60DD000005";
chr10p ambMex60DD transcript 770040 961083 1000 + . gene_id "AMEX60DD000005"; transcript_id "AMEX60DD201000005.2"; ID "AMEX60DD201000005.2"; ORF_type "Predicted"; Parent "AMEX60DD000005"; gene_name "AMEX60DD201000005.2";
chr10p ambMex60DD exon 770040 770414 1000 + . gene_id "AMEX60DD000005"; transcript_id "AMEX60DD201000005.2"; ID "agat-exon-7"; Parent "AMEX60DD201000005.2"; exon_number "1";
chr10p ambMex60DD exon 801485 801606 1000 + . gene_id "AMEX60DD000005"; transcript_id "AMEX60DD201000005.2"; ID "agat-exon-8"; Parent "AMEX60DD201000005.2"; exon_number "2";
chr10p ambMex60DD exon 877425 877977 1000 + . gene_id "AMEX60DD000005"; transcript_id "AMEX60DD201000005.2"; ID "agat-exon-9"; Parent "AMEX60DD201000005.2"; exon_number "3";
chr10p ambMex60DD exon 915118 915167 1000 + . gene_id "AMEX60DD000005"; transcript_id "AMEX60DD201000005.2"; ID "agat-exon-10"; Parent "AMEX60DD201000005.2"; exon_number "4";
chr10p ambMex60DD exon 960764 961083 1000 + . gene_id "AMEX60DD000005"; transcript_id "AMEX60DD201000005.2"; ID "agat-exon-11"; Parent "AMEX60DD201000005.2"; exon_number "5";
chr10p ambMex60DD transcript 770040 1018740 1000 + . gene_id "AMEX60DD000005"; transcript_id "AMEX60DD201000005.1"; ID "AMEX60DD201000005.1"; Parent "AMEX60DD000005"; gene_name "AMEX60DD201000005.1";
chr10p ambMex60DD exon 770040 770424 1000 + . gene_id "AMEX60DD000005"; transcript_id "AMEX60DD201000005.1"; ID "agat-exon-3"; Parent "AMEX60DD201000005.1"; exon_number "1";
chr10p ambMex60DD exon 801485 801606 1000 + . gene_id "AMEX60DD000005"; transcript_id "AMEX60DD201000005.1"; ID "agat-exon-4"; Parent "AMEX60DD201000005.1"; exon_number "2";
chr10p ambMex60DD exon 915118 915167 1000 + . gene_id "AMEX60DD000005"; transcript_id "AMEX60DD201000005.1"; ID "agat-exon-5"; Parent "AMEX60DD201000005.1"; exon_number "3";
chr10p ambMex60DD exon 1018684 1018740 1000 + . gene_id "AMEX60DD000005"; transcript_id "AMEX60DD201000005.1"; ID "agat-exon-6"; Parent "AMEX60DD201000005.1"; exon_number "4";

0 comments on commit e1ed07f

Please sign in to comment.