Skip to content

Commit

Permalink
Adding initial batch of samtools type hinted wrapper functions
Browse files Browse the repository at this point in the history
  • Loading branch information
Nathan Roach committed Dec 30, 2022
1 parent 299cc7c commit f2b31bc
Show file tree
Hide file tree
Showing 14 changed files with 1,222 additions and 94 deletions.
5 changes: 5 additions & 0 deletions fgpyo/pysam_dispatch/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
def pysam_fn(*args: str) -> None:
"""
Type hinted import of an example function from the pysam samtools dispatcher.
"""
pass
6 changes: 6 additions & 0 deletions fgpyo/pysam_dispatch/samtools/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
from fgpyo.pysam_dispatch.samtools.dict import dict
from fgpyo.pysam_dispatch.samtools.faidx import faidx
from fgpyo.pysam_dispatch.samtools.index import index
from fgpyo.pysam_dispatch.samtools.sort import sort

__all__ = ["dict", "faidx", "index", "sort"]
75 changes: 75 additions & 0 deletions fgpyo/pysam_dispatch/samtools/dict.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,75 @@
"""
Type-Hinted Wrapper around pysam samtools dict dispatch
-------------------------------------------------------
"""

from pathlib import Path
from typing import TYPE_CHECKING
from typing import List
from typing import Optional

if TYPE_CHECKING:
from fgpyo.pysam_dispatch import pysam_fn as _pysam_dict
else:
from pysam import dict as _pysam_dict


def dict(
input: Path,
output: Path,
no_header: bool = False,
alias: bool = False,
assembly: Optional[str] = None,
alt: Optional[Path] = None,
species: Optional[str] = None,
uri: Optional[str] = None,
) -> List[str]:
"""
Calls the `samtools` function `dict` using the pysam samtools dispatcher.
Arguments will be formatted into the appropriate command-line call which will be invoked
using the pysam dispatcher.
Returns the list of arguments passed to pysam.dict()
Args:
input: Path to the FASTA files to generate a sequence dictionary for.
output: Path to the file to write output to.
no_header: If true will not print the @HD header line.
assembly: The assembly for the AS tag.
alias: Adds an AN tag with the same value as the SN tag, except that a 'chr' prefix is
removed if SN has one or added if it does not. For mitochondria (i.e., when SN is “M”
or “MT”, with or without a “chr” prefix), also adds the remaining combinations of
“chr/M/MT” to the AN tag.
alt: Add an AH tag to each sequence listed in the specified bwa-style .alt file. These
files use SAM records to represent alternate locus sequences (as named in the QNAME
field) and their mappings to the primary assembly.
species: Specify the species for the SP tag.
uri: Specify the URI for the UR tag. Defaults to the absolute path of ref.fasta.
"""

args: List[str] = ["--output", str(output)]

if no_header:
args.append("--no-header")

if alias:
args.append("--alias")

if assembly is not None:
args.extend(["--assembly", assembly])

if alt is not None:
args.extend(["--alt", str(alt)])

if species is not None:
args.extend(["--species", species])

if uri is not None:
args.extend(["--uri", uri])

args.append(str(input))

_pysam_dict(*args)

return args
122 changes: 122 additions & 0 deletions fgpyo/pysam_dispatch/samtools/faidx.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,122 @@
"""
Type-Hinted Wrapper around pysam samtools faidx dispatch
--------------------------------------------------------
"""

import enum
from pathlib import Path
from typing import TYPE_CHECKING
from typing import List
from typing import Optional
from typing import Tuple

if TYPE_CHECKING:
from fgpyo.pysam_dispatch import pysam_fn as _pysam_faidx
else:
from pysam import faidx as _pysam_faidx


@enum.unique
class FaidxMarkStrand(enum.Enum):
RevComp = "rc"
No = "no"
Sign = "sign"
Custom = "custom"


def faidx(
input: Path,
output: Optional[Path] = None,
length: int = 60,
regions: Optional[List[str]] = None,
region_file: Optional[Path] = None,
fai_idx: Optional[Path] = None,
gzi_idx: Optional[Path] = None,
continue_if_non_existent: bool = False,
fastq: bool = False,
reverse_complement: bool = False,
mark_strand: FaidxMarkStrand = FaidxMarkStrand.RevComp,
custom_mark_strand: Optional[Tuple[str, str]] = None,
) -> List[str]:
"""
Calls the `samtools` function `faidx` using the pysam samtools dispatcher.
Arguments will be formatted into the appropriate command-line call which will be invoked
using the pysam dispatcher.
Returns the list of arguments passed to pysam.faidx()
Args:
input: Path to the FAIDX files to index / read from.
output: Path to the file to write FASTA output.
length: Length of FASTA sequence line.
regions: regions to extract from the FASTA file in samtools region format
(<chrom>:<1-based start>-<1-based end>)
region_file: Path to file containing regions to extract from the FASTA file in samtools
region format (<chrom>:<1-based start>-<1-based end>). 1 region descriptor per line.
fai_idx: Read/Write to specified FAI index file.
gzi_idx: Read/Write to specified compressed file index (used with .gz files).
continue_if_non_existent: If true continue qorking if a non-existent region is requested.
fastq: Read FASTQ files and output extracted sequences in FASTQ format. Same as using
samtools fqidx.
reverse_complement: Output the sequence as the reverse complement. When this option is
used, “/rc” will be appended to the sequence names. To turn this off or change the
string appended, use the mark_strand parameter.
mark_strand: Append strand indicator to sequence name. Type to append can be one of:
FaidxMarkStrand.RevComp - Append '/rc' when writing the reverse complement. This is the
default.
FaidxMarkStrand.No - Do not append anything.
FaidxMarkStrand.Sign - Append '(+)' for forward strand or '(-)' for reverse
complement. This matches the output of “bedtools getfasta -s”.
FaidxMarkStrand.Custom - custom,<custom_mark_strand[0]>,<custom_mark_strand[1]>
Append string <pos> to names when writing the forward strand and <neg> when writing the
reverse strand. Spaces are preserved, so it is possible to move the indicator into the
comment part of the description line by including a leading space in the strings
<custom_mark_strand[0]> and <custom_mark_strand[1]>.
custom_mark_strand: The custom strand indicators to use in in the Custom MarkStrand
setting. The first value of the tuple will be used as the positive strand indicator,
the second value will be used as the negative strand indicator.
"""

mark_strand_str: str
if mark_strand == FaidxMarkStrand.Custom:
assert custom_mark_strand is not None, (
"Cannot use custom mark strand without providing the custom strand indicators to "
+ "`custom_mark_string`"
)
mark_strand_str = f"{mark_strand.value},{custom_mark_strand[0]},{custom_mark_strand[1]}"
else:
mark_strand_str = mark_strand.value

args: List[str] = ["--length", str(length), "--mark-strand", mark_strand_str]

if output is not None:
args.extend(["--output", str(output)])

if continue_if_non_existent:
args.append("--continue")
if reverse_complement:
args.append("--reverse-complement")
if fastq:
args.append("--fastq")

if fai_idx is not None:
args.extend(["--fai-idx", str(fai_idx)])

if gzi_idx is not None:
args.extend(["--gzi-idx", str(gzi_idx)])

if region_file is not None:
args.extend(["--region-file", str(region_file)])

args.append(str(input))

if regions is not None:
args.extend(regions)

_pysam_faidx(*args)

return args
72 changes: 72 additions & 0 deletions fgpyo/pysam_dispatch/samtools/index.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,72 @@
"""
Type-Hinted Wrapper around pysam samtools index dispatch
--------------------------------------------------------
"""

import enum
from pathlib import Path
from typing import TYPE_CHECKING
from typing import List
from typing import Optional

if TYPE_CHECKING:
from fgpyo.pysam_dispatch import pysam_fn as _pysam_index
else:
from pysam import index as _pysam_index


@enum.unique
class SamIndexType(enum.Enum):
BAI = "-b"
CSI = "-c"


def index(
input: Path,
# See https://github.com/pysam-developers/pysam/issues/1155
# inputs: List[Path], Cant currently accept multiple
output: Optional[Path] = None,
threads: int = 0,
index_type: SamIndexType = SamIndexType.BAI,
csi_index_size: int = 14,
) -> List[str]:
"""
Calls the `samtools` function `index` using the pysam samtools dispatcher.
Arguments will be formatted into the appropriate command-line call which will be invoked
using the pysam dispatcher.
Returns the list of arguments passed to pysam.index()
Args:
input: Path to the SAM/BAM/CRAM file to index.
output: Path to the file to write output. (Currently may only be used when exactly one
alignment file is being indexed.)
threads: Number of input/output compression threads to use in addition to main thread.
index_type: The type of index file to produce when indexing.
csi_index_size: Sets the minimum interval size of CSI indices to 2^INT.
"""

# assert len(inputs) >= 1, "Must provide at least one input to samtools index."

# if len(inputs) != 1:
# assert (
# output is None
# ), "Output currently can only be used if there is exactly one input file being indexed"
# args = ["-M"]
# else:
# args = []
args = []

if index_type != SamIndexType.BAI:
args.append(index_type.value)

if index_type == SamIndexType.CSI:
args.extend(["-m", str(csi_index_size)])
args.extend(["-@", str(threads)])
args.append(str(input))
if output is not None:
args.extend(["-o", str(output)])
_pysam_index(*args)

return args
Loading

0 comments on commit f2b31bc

Please sign in to comment.