From 39a59d0a6fab3c076b7d2e3447f59121648aa0ca Mon Sep 17 00:00:00 2001 From: Jacques Dainat Date: Mon, 3 Jun 2024 14:22:59 +0200 Subject: [PATCH] 458 (#463) * update doc fix_phases script. Fix #458 --- bin/agat_sp_fix_cds_phases.pl | 46 ++++++++++++++++++++++++++++++++--- lib/AGAT/OmniscientTool.pm | 34 +++++++++++++------------- 2 files changed, 59 insertions(+), 21 deletions(-) diff --git a/bin/agat_sp_fix_cds_phases.pl b/bin/agat_sp_fix_cds_phases.pl index a237d6ae..47d991b9 100755 --- a/bin/agat_sp_fix_cds_phases.pl +++ b/bin/agat_sp_fix_cds_phases.pl @@ -81,16 +81,54 @@ =head1 NAME -agat_sp_fix_cds_frame.pl +agat_sp_fix_cds_phases.pl =head1 DESCRIPTION -This script aims to fix the cds phases. +This script aims to fix the CDS phases. +The script is compatible with incomplete gene models (Missing start, CDS +multiple of 3 or not, i.e. with offset of 1 or 2) and + and - strand. + +How it works? + +AGAT uses the fasta sequence to verify the CDS frame. +In case the CDS start by a start codon the phase of the first CDS piece is set to 0. +In the case there is no start codon and: + - If there is only one stop codon in the sequence and it is located at the last position, the phase of the first CDS piece is set to 0. + - If there is no stop codon, the phase of the first CDS piece is set to 0 (because sequence can be translated without premature stop codon). + - If there is/are stop codon(s) in the middle of the sequence we re-execute the check with an offset of +2 nucleotides: + - If there is only one stop codon in the sequence and it is located at the last position, the phase of the first CDS piece is set to 0. + - If there is no stop codon, the phase of the first CDS piece is set to 0 (because sequence can be translated without premature stop codon). + - If there is/are stop codon(s) in the middle of the sequence we re-execute the check with an offset of +1 nucleotide: + - If there is only one stop codon in the sequence and it is located at the last position, the phase of the first CDS piece is set to 0. + - If there is no stop codon, the phase of the first CDS piece is set to 0 (because sequence can be translated without premature stop codon). + - If there is/are still stop codon(s) we keep original phase and throw a warning. In this last case it means we never succeded to make a translation without premature stop codon in all the 3 possible phases. +Then in case of CDS made of multiple CDS pieces (i.e. discontinuous feature), the rest of the CDS pieces will be checked accordingly to the first CDS piece. + +What is a phase? + +For features of type "CDS", the phase indicates where the next codon begins +relative to the 5' end (where the 5' end of the CDS is relative to the strand +of the CDS feature) of the current CDS feature. For clarification the 5' end +for CDS features on the plus strand is the feature's start and and the 5' end +for CDS features on the minus strand is the feature's end. The phase is one of +the integers 0, 1, or 2, indicating the number of bases forward from the start +of the current CDS feature the next codon begins. A phase of "0" indicates that +a codon begins on the first nucleotide of the CDS feature (i.e. 0 bases forward), +a phase of "1" indicates that the codon begins at the second nucleotide of this +CDS feature and a phase of "2" indicates that the codon begins at the third +nucleotide of this region. Note that "Phase" in the context of a GFF3 CDS +feature should not be confused with the similar concept of frame that is also a +common concept in bioinformatics. Frame is generally calculated as a value for +a given base relative to the start of the complete open reading frame (ORF) or +the codon (e.g. modulo 3) while CDS phase describes the start of the next codon +relative to a given CDS feature. +The phase is REQUIRED for all CDS features. =head1 SYNOPSIS - agat_sp_fix_cds_frame.pl --gff infile.gff -f fasta [ -o outfile ] - agat_sp_fix_cds_frame.pl --help + agat_sp_fix_cds_phases.pl --gff infile.gff -f fasta [ -o outfile ] + agat_sp_fix_cds_phases.pl --help =head1 OPTIONS diff --git a/lib/AGAT/OmniscientTool.pm b/lib/AGAT/OmniscientTool.pm index aa2be4e2..079f4731 100644 --- a/lib/AGAT/OmniscientTool.pm +++ b/lib/AGAT/OmniscientTool.pm @@ -1399,18 +1399,18 @@ sub fil_cds_frame { # If no phase found and a phase exists in the CDS feature we keep the original # otherwise we loop over CDS features to set the correct phase - if ( defined( $phase ) ) { - foreach my $cds_feature ( @cds_list) { - my $original_phase = $cds_feature->frame; + if ( defined( $phase ) ) { + foreach my $cds_feature ( @cds_list) { + my $original_phase = $cds_feature->frame; - if ( ($original_phase eq ".") or ($original_phase != $phase) ){ - print "Original phase $original_phase replaced by $phase for ".$cds_feature->_tag_value("ID")."\n" if $verbose; - $cds_feature->frame($phase); - } - my $cds_length=$cds_feature->end-$cds_feature->start +1; - $phase=(3-(($cds_length-$phase)%3))%3; #second modulo allows to avoid the frame with 3. Instead we have 0. - } - } + if ( ($original_phase eq ".") or ($original_phase != $phase) ){ + print "Original phase $original_phase replaced by $phase for ".$cds_feature->_tag_value("ID")."\n" if $verbose; + $cds_feature->frame($phase); + } + my $cds_length=$cds_feature->end-$cds_feature->start +1; + $phase=(3-(($cds_length-$phase)%3))%3; #second modulo allows to avoid the frame with 3. Instead we have 0. + } + } } } } @@ -1445,15 +1445,15 @@ sub _get_cds_start_phase { my $protein_seq_obj = $cds_obj->translate(); my $lastChar = substr $protein_seq_obj->seq(),-1,1; my $count = () = $protein_seq_obj->seq() =~ /\*/g; - if ($lastChar eq "*"){ # if last char is a stop we remove it - - if ($count == 1){ + if ($lastChar eq "*"){ + # The last char is a stop and in total we counted only one stop. + if ($count == 1){ #print "Missing start codon, phase 0, stop present\n"; return 0; } } - else{ - if ($count == 0){ + else{ # The last char is not a stop but we didn't find any stop in the middle on the sequence neither. + if ($count == 0){ #print "Missing start codon, phase 0, missing stop codon\n"; return 0; } @@ -1493,7 +1493,7 @@ sub _get_cds_start_phase { } } - # always stop codon in the middle of the sequence... cannot determine correct phase, keep original phase and trow a warning ! + # always stop codon in the middle of the sequence... cannot determine correct phase, keep original phase and throw a warning ! warn "WARNING OmniscientTools _get_cds_start_phase: No phase found for the CDS by looking at the ORFs. ". "All frames contain an internal stop codon, thus we cannot determine the correct phase. We will keep original stored phase information.\n"; return undef;