From ed66755375e8b7c0f67b67e0a0692f0a30c4feff Mon Sep 17 00:00:00 2001 From: Monica Poelchau Date: Mon, 16 Oct 2023 13:59:28 -0500 Subject: [PATCH 1/2] initial fix --- gff3tool/bin/gff3_to_fasta.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/gff3tool/bin/gff3_to_fasta.py b/gff3tool/bin/gff3_to_fasta.py index cfbadc6..499dff8 100755 --- a/gff3tool/bin/gff3_to_fasta.py +++ b/gff3tool/bin/gff3_to_fasta.py @@ -191,7 +191,10 @@ def splicer(gff, ftype, dline, stype, embedded_fasta=False): cname = child['attributes']['Name'] defline='>{0:s}'.format(cid) if stype == "pep": - cid = re.sub(r'(.+-)(R)([a-zA-Z]+)', r'\1P\3', cid) + if 'protein_id' in child['attributes']: + cid = child['attributes']['protein_id'] + else: + cid = re.sub(r'(.+-)(R)([a-zA-Z]+)', r'\1P\3', cid) defline = '>{0:s}'.format(cid) elif ftype[0] == 'CDS': defline='>{0:s}-CDS'.format(cid) From e267f053af06b2ede9eefd4a62a21ba5f93efcbd Mon Sep 17 00:00:00 2001 From: Monica Poelchau Date: Mon, 16 Oct 2023 15:15:10 -0500 Subject: [PATCH 2/2] get protein_id from CDS --- gff3tool/bin/gff3_to_fasta.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/gff3tool/bin/gff3_to_fasta.py b/gff3tool/bin/gff3_to_fasta.py index 499dff8..a059439 100755 --- a/gff3tool/bin/gff3_to_fasta.py +++ b/gff3tool/bin/gff3_to_fasta.py @@ -191,10 +191,11 @@ def splicer(gff, ftype, dline, stype, embedded_fasta=False): cname = child['attributes']['Name'] defline='>{0:s}'.format(cid) if stype == "pep": - if 'protein_id' in child['attributes']: - cid = child['attributes']['protein_id'] - else: - cid = re.sub(r'(.+-)(R)([a-zA-Z]+)', r'\1P\3', cid) + for grandchild in child['children']: #first try to get the CDS protein_id + if 'protein_id' in grandchild['attributes']: + cid = grandchild['attributes']['protein_id'] + + cid = re.sub(r'(.+-)(R)([a-zA-Z]+)', r'\1P\3', cid)#otherwise, if it has the -R[A-Z] format then modify that to -P[A-Z] defline = '>{0:s}'.format(cid) elif ftype[0] == 'CDS': defline='>{0:s}-CDS'.format(cid)