Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[BUFIX] fail when passed fastas with duplicate sequence ids #555

Merged
merged 2 commits into from
Dec 25, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
35 changes: 33 additions & 2 deletions sub-packages/bionemo-noodles/src/bionemo/noodles/nvfaidx.py
Original file line number Diff line number Diff line change
Expand Up @@ -147,14 +147,38 @@ class NvFaidx:
See Also: bionemo.noodles.nvfaidx.SequenceAccessor
"""

def __init__(self, fasta_path: str | Path, faidx_path: Optional[str | Path] = None, ignore_existing_fai=True):
def __init__(
self,
fasta_path: str | Path,
faidx_path: Optional[str | Path] = None,
ignore_existing_fai: bool = True,
allow_duplicate_seqids: bool = False,
):
"""Construct a dict-like object representing a memmapped, indexed FASTA file.

This is an indexed fasta reader. Consequences of this are that the FASTA file must be well formed, meaning
sequence-ids and line-lengths must conform to FASTA standards. Additionally, the order of returned seqid, sequence
pairs when iterating over the index is not guaranteed to be the same order as the underlying fasta file.

Args:
fasta_path (str): Path to the FASTA file.
faidx_path (str): Path to the FAI index file. If None, one will be created.
ignore_existing_fai (bool): If True, ignore any existing FAI file and create an in-memory index. Note that
this will also ignore `faidx_path`.
allow_duplicate_seqids (bool): If true, will produce index for invalid fastas which contain duplicate seqids.
In this scenario, indexing is performed by integer rather than strings.

Example with invalid seqids.
>chr1 dupes|not|allowd
ATGATGATGATG
>chr1 whoops|there|is|dupe
ATGATGATGATG
NvFaidx:
{
0 : SequenceAccessor(chr1 dupes|not|allowed),
1 : SequenceAccessor(chr1 whoops|there|is|dupe)
}

"""
if isinstance(fasta_path, Path):
fasta_path = str(fasta_path)
Expand All @@ -178,7 +202,14 @@ def __init__(self, fasta_path: str | Path, faidx_path: Optional[str | Path] = No
case _:
raise ValueError("unreachable condition.")

self.records: Dict[str, PyFaidxRecord] = {record.name: record for record in self.reader.records()}
self.records: Dict[str | int, PyFaidxRecord] = {record.name: record for record in self.reader.records()}
if len(self.records) != len(self.reader.records()):
if not allow_duplicate_seqids:
raise ValueError(
"Non-unique sequence-id detected in FASTA, this is invalid. Correct headers and try again or pass allow_duplicate_seqid'"
)
else:
self.records: Dict[str | int, PyFaidxRecord] = dict(enumerate(self.reader.records()))

def __getitem__(self, seqid: str) -> SequenceAccessor: # noqa: D105
if seqid not in self.records:
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
>chr1 version|of|seq1
ACTGACTGACTG
>chr1 version|of|seq2
GGTCAAGGTCAA
>chr1 some|random|inputs
AGTCAAGGTCCA
CGTCAAGGTCCC
GGTCAAGGTCCG
TGTCAAGGTCCT
AGTCAAGGTCAA
CGTCAAGGTCAC
GGTCAAGGTCAG
>chr1 why|is|this|done
CCCCCCCCCCCC
ACGT
>chr1 stop|violated|fasta|spec
A
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,11 @@ def sample_fasta():
return str(pathlib.Path(__file__).parent.parent.parent / "bionemo/noodles/data/sample.fasta")


@pytest.fixture
def dupes_fasta():
return str(pathlib.Path(__file__).parent.parent.parent / "bionemo/noodles/data/dupes.fasta")


def test_create_faidx_rustbind():
filename = create_test_fasta(num_seqs=2, seq_length=200)
faidx_filename = PyIndexedMmapFastaReader.create_faidx(filename, force=False)
Expand Down Expand Up @@ -345,6 +350,16 @@ def test_parallel_index_creation_nvfaidx():
assert all(lens_equal), (set(lens), sum(lens_equal))


def test_duplicate_seqids(dupes_fasta):
# Fails since we will get back 1 entry in our dict with 5 in our records list.
with pytest.raises(ValueError):
index = NvFaidx(dupes_fasta, allow_duplicate_seqids=False)

index = NvFaidx(dupes_fasta, allow_duplicate_seqids=True)
assert list(index.records.keys()) == list(range(5))
assert len(index) == 5


def test_file_errors():
# test missing fasta file
# test failure to parse fasta file
Expand Down
Loading