diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml index b7d54dae5..c36c7958d 100644 --- a/.github/workflows/ci.yaml +++ b/.github/workflows/ci.yaml @@ -93,3 +93,9 @@ jobs: - uses: codecov/codecov-action@v3 with: fail_ci_if_error: true + + build-docs: + uses: nextstrain/.github/.github/workflows/docs-ci.yaml@master + with: + docs-directory: docs/ + pip-install-target: .[dev] diff --git a/CHANGES.md b/CHANGES.md index 1c6155313..a31c60844 100644 --- a/CHANGES.md +++ b/CHANGES.md @@ -20,8 +20,10 @@ * translate: Fix error handling when features cannot be read from reference sequence file. [#1168][] (@victorlin) * translate: Remove an unnecessary check which allowed for inaccurate error messages to be shown. [#1169][] (@victorlin) * frequencies: Previously, monthly pivot points calculated from the end of a month may have been shifted by 1-3 days. This is now fixed. [#1150][] (@victorlin) +* docs: Fix minor formatting issues. [#1095][] (@victorlin) * Update development status on PyPI from "3 - Alpha" to "5 - Production/Stable". This should have been done since the beginning of this changelog, but now it is official. [#1160][] (@corneliusroemer) +[#1095]: https://github.com/nextstrain/augur/pull/1095 [#1150]: https://github.com/nextstrain/augur/pull/1150 [#1160]: https://github.com/nextstrain/augur/pull/1160 [#1168]: https://github.com/nextstrain/augur/pull/1168 diff --git a/augur/align.py b/augur/align.py index ebe038946..c9b021d34 100644 --- a/augur/align.py +++ b/augur/align.py @@ -54,7 +54,7 @@ def prepare(sequences, existing_aln_fname, output, ref_name, ref_seq_fname): Parameters ---------- - sequences : list[str] + sequences : list of str List of paths to FASTA-formatted sequences to align. existing_aln_fname : str Path of an existing alignment to use, or None @@ -67,7 +67,8 @@ def prepare(sequences, existing_aln_fname, output, ref_name, ref_seq_fname): Returns ------- - tuple: The existing alignment filename, the new sequences filename, and the name of the reference sequence. + tuple of str + The existing alignment filename, the new sequences filename, and the name of the reference sequence. """ seqs = read_sequences(*sequences) seqs_to_align_fname = output + ".to_align.fasta" @@ -104,7 +105,7 @@ def run(args): ''' Parameters ---------- - args : namespace + args : argparse.Namespace arguments passed in via the command-line from augur Returns @@ -152,6 +153,8 @@ def run(args): def postprocess(output_file, ref_name, keep_reference, fill_gaps): """Postprocessing of the combined alignment file. + The modified alignment is written directly to output_file. + Parameters ---------- output_file: str @@ -162,10 +165,6 @@ def postprocess(output_file, ref_name, keep_reference, fill_gaps): If the reference was provided, whether it should be kept in the alignment fill_gaps: bool Replace all gaps in the alignment with "N" to indicate ambiguous sites. 
- - Returns - ------- - None - the modified alignment is written directly to output_file """ # -- ref_name -- # reads the new alignment @@ -270,7 +269,7 @@ def strip_non_reference(aln, reference, insertion_csv=None): Parameters ---------- - aln : MultipleSeqAlign + aln : Bio.Align.MultipleSeqAlignment Biopython Alignment reference : str name of reference sequence, assumed to be part of the alignment @@ -280,9 +279,8 @@ def strip_non_reference(aln, reference, insertion_csv=None): list list of trimmed sequences, effectively a multiple alignment - - Tests - ----- + Examples + -------- >>> [s.name for s in strip_non_reference(read_alignment("tests/data/align/test_aligned_sequences.fasta"), "with_gaps")] Trimmed gaps in with_gaps from the alignment ['with_gaps', 'no_gaps', 'some_other_seq', '_R_crick_strand'] @@ -384,7 +382,7 @@ def prettify_alignment(aln): Parameters ---------- - aln : MultipleSeqAlign + aln : Bio.Align.MultipleSeqAlignment Biopython Alignment ''' for seq in aln: @@ -407,7 +405,7 @@ def make_gaps_ambiguous(aln): Parameters ---------- - aln : MultipleSeqAlign + aln : Bio.Align.MultipleSeqAlignment Biopython Alignment ''' for seq in aln: diff --git a/augur/ancestral.py b/augur/ancestral.py index 97279cbd4..bee763908 100644 --- a/augur/ancestral.py +++ b/augur/ancestral.py @@ -28,7 +28,7 @@ def ancestral_sequence_inference(tree=None, aln=None, ref=None, infer_gtr=True, Parameters ---------- - tree : Bio.Phylo tree or str + tree : Bio.Phylo.BaseTree.Tree or str tree or filename of tree aln : Bio.Align.MultipleSeqAlignment or str alignment or filename of alignment @@ -49,7 +49,7 @@ def ancestral_sequence_inference(tree=None, aln=None, ref=None, infer_gtr=True, Returns ------- - TreeAnc + treetime.TreeAnc treetime.TreeAnc instance """ @@ -78,7 +78,7 @@ def collect_mutations_and_sequences(tt, infer_tips=False, full_sequences=False, Parameters ---------- - tt : treetime + tt : treetime.TreeTime instance of treetime with valid ancestral reconstruction infer_tips : bool, optional if true, request the reconstructed tip sequences from treetime, otherwise retain input ambiguities diff --git a/augur/clades.py b/augur/clades.py index b3155f438..acfb4c301 100644 --- a/augur/clades.py +++ b/augur/clades.py @@ -124,9 +124,9 @@ def is_node_in_clade(clade_alleles, node, ref): ---------- clade_alleles : list list of clade defining alleles - node : Phylo.Node + node : Bio.Phylo.BaseTree.Clade node to check, assuming sequences (as mutations) are attached to node - ref : str/list + ref : str or list positions Returns @@ -162,9 +162,9 @@ def assign_clades(clade_designations, all_muts, tree, ref=None): clade definitions as :code:`{clade_name:[(gene, site, allele),...]}` all_muts : dict mutations in each node - tree : Phylo.Tree + tree : Bio.Phylo.BaseTree.Tree phylogenetic tree to process - ref : str/list, optional + ref : str or list, optional reference sequence to look up state when not mutated Returns diff --git a/augur/dates/__init__.py b/augur/dates/__init__.py index c8969ba19..472d208be 100644 --- a/augur/dates/__init__.py +++ b/augur/dates/__init__.py @@ -25,6 +25,8 @@ def numeric_date(date): 2. A string in the YYYY-MM-DD (ISO 8601) syntax 3. 
A string representing a relative date (duration before datetime.date.today()) + Examples + -------- >>> numeric_date("2020.42") 2020.42 >>> numeric_date("2020-06-04") diff --git a/augur/distance.py b/augur/distance.py index d41be670e..f38241cce 100644 --- a/augur/distance.py +++ b/augur/distance.py @@ -4,8 +4,7 @@ which sequences to compare) and a distance map (to determine the weight of a mismatch between any two sequences). -Comparison methods -================== +**Comparison methods** Comparison methods include: @@ -32,14 +31,15 @@ parameters allow users to specify a fixed time interval for pairwise calculations, limiting the computationally complexity of the comparisons. -Distance maps -============= +**Distance maps** Distance maps are defined in JSON format with two required top-level keys. The `default` key specifies the numeric (floating point) value to assign to all mismatches by default. The `map` key specifies a dictionary of weights to use for distance calculations. These weights are indexed hierarchically by gene name and one-based gene coordinate and are assigned in either a sequence-independent or sequence-dependent manner. -The simplest possible distance map calculates Hamming distance between sequences without any site-specific weights, as shown below:: +The simplest possible distance map calculates Hamming distance between sequences without any site-specific weights, as shown below: + +.. code-block:: json { "name": "Hamming distance", @@ -48,7 +48,9 @@ } By default, distances are floating point values whose precision can be controlled with the `precision` key that defines the number of decimal places to retain for each distance. -The following example shows how to specify a precision of two decimal places in the final output.:: +The following example shows how to specify a precision of two decimal places in the final output: + +.. code-block:: json { "name": "Hamming distance", @@ -57,7 +59,9 @@ "precision": 2 } -Distances can be reported as integer values by specifying an `output_type` as `integer` or `int` as follows.:: +Distances can be reported as integer values by specifying an `output_type` as `integer` or `int` as follows: + +.. code-block:: json { "name": "Hamming distance", @@ -70,7 +74,9 @@ value of the same type as the default value (integer or float). The following example is a distance map for antigenic amino acid substitutions near influenza A/H3N2 HA's receptor binding sites. This map calculates the Hamming distance -between amino acid sequences only at seven positions in the HA1 gene:: +between amino acid sequences only at seven positions in the HA1 gene: + +.. code-block:: json { "name": "Koel epitope sites", @@ -92,7 +98,9 @@ where the `from` sequence in each pair is interpreted as the ancestral state and the `to` sequence as the derived state. The following example is a distance map that assigns asymmetric weights to specific amino acid substitutions at a -specific position in the influenza gene HA1:: +specific position in the influenza gene HA1: + +.. code-block:: json { "default": 0.0, @@ -119,7 +127,9 @@ the JSON includes a `params` field that describes the mapping of attribute names to requested comparisons and distance maps and any date parameters specified by the user. The following example JSON shows a sample output when the distance -command is run with multiple comparisons and distance maps:: +command is run with multiple comparisons and distance maps: + +.. 
code-block:: json { "params": { @@ -177,7 +187,8 @@ def read_distance_map(map_file): dict : Python representation of the distance map JSON - + Examples + -------- >>> sorted(read_distance_map("tests/data/distance_map_weight_per_site.json").items()) [('default', 0), ('map', {'HA1': {144: 1}})] >>> sorted(read_distance_map("tests/data/distance_map_weight_per_site_and_sequence.json").items()) @@ -237,7 +248,8 @@ def get_distance_between_nodes(node_a_sequences, node_b_sequences, distance_map, float : distance between node sequences based on the given map - + Examples + -------- >>> node_a_sequences = {"gene": "ACTG"} >>> node_b_sequences = {"gene": "ACGG"} >>> distance_map = {"default": 0, "map": {}} @@ -465,7 +477,7 @@ def get_distances_to_root(tree, sequences_by_node_and_gene, distance_map): Parameters ---------- - tree : Bio.Phylo + tree : Bio.Phylo.BaseTree.Tree a rooted tree whose node names match the given dictionary of sequences by node and gene @@ -505,7 +517,7 @@ def get_distances_to_last_ancestor(tree, sequences_by_node_and_gene, distance_ma Parameters ---------- - tree : Bio.Phylo + tree : Bio.Phylo.BaseTree.Tree a rooted tree whose node names match the given dictionary of sequences by node and gene @@ -565,7 +577,7 @@ def get_distances_to_all_pairs(tree, sequences_by_node_and_gene, distance_map, e Parameters ---------- - tree : Bio.Phylo + tree : Bio.Phylo.BaseTree.Tree a rooted tree whose node names match the given dictionary of sequences by node and gene diff --git a/augur/export_v2.py b/augur/export_v2.py index 19bb6933c..3f606c797 100644 --- a/augur/export_v2.py +++ b/augur/export_v2.py @@ -584,7 +584,8 @@ def set_data_provenance(data_json, config): config : dict config JSON with an expected ``data_provenance`` key - + Examples + -------- >>> config = {"data_provenance": [{"name": "GISAID"}, {"name": "INSDC"}]} >>> data_json = {"meta": {}} >>> set_data_provenance(data_json, config) @@ -600,6 +601,8 @@ def counter_to_disambiguation_suffix(count): """Given a numeric count of author papers, return a distinct alphabetical disambiguation suffix. 
+ Examples + -------- >>> counter_to_disambiguation_suffix(0) 'A' >>> counter_to_disambiguation_suffix(25) diff --git a/augur/filter/include_exclude_rules.py b/augur/filter/include_exclude_rules.py index 8024000e8..6cfda62f5 100644 --- a/augur/filter/include_exclude_rules.py +++ b/augur/filter/include_exclude_rules.py @@ -24,10 +24,11 @@ def filter_by_exclude_all(metadata): Returns ------- - set[str]: + set of str: Empty set of strains - + Examples + -------- >>> metadata = pd.DataFrame([{"region": "Africa"}, {"region": "Europe"}], index=["strain1", "strain2"]) >>> filter_by_exclude_all(metadata) set() @@ -47,10 +48,11 @@ def filter_by_exclude(metadata, exclude_file): Returns ------- - set[str]: + set of str: Strains that pass the filter - + Examples + -------- >>> import os >>> from tempfile import NamedTemporaryFile >>> metadata = pd.DataFrame([{"region": "Africa"}, {"region": "Europe"}], index=["strain1", "strain2"]) @@ -82,7 +84,8 @@ def parse_filter_query(query): str : Value of column to query - + Examples + -------- >>> parse_filter_query("property=value") ('property', , 'value') >>> parse_filter_query("property!=value") @@ -114,10 +117,11 @@ def filter_by_exclude_where(metadata, exclude_where): Returns ------- - set[str]: + set of str: Strains that pass the filter - + Examples + -------- >>> metadata = pd.DataFrame([{"region": "Africa"}, {"region": "Europe"}], index=["strain1", "strain2"]) >>> filter_by_exclude_where(metadata, "region!=Europe") {'strain2'} @@ -166,10 +170,11 @@ def filter_by_query(metadata, query): Returns ------- - set[str]: + set of str: Strains that pass the filter - + Examples + -------- >>> metadata = pd.DataFrame([{"region": "Africa"}, {"region": "Europe"}], index=["strain1", "strain2"]) >>> filter_by_query(metadata, "region == 'Africa'") {'strain1'} @@ -195,10 +200,11 @@ def filter_by_ambiguous_date(metadata, date_column="date", ambiguity="any"): Returns ------- - set[str]: + set of str: Strains that pass the filter - + Examples + -------- >>> metadata = pd.DataFrame([{"region": "Africa", "date": "2020-01-XX"}, {"region": "Europe", "date": "2020-01-02"}], index=["strain1", "strain2"]) >>> filter_by_ambiguous_date(metadata) {'strain2'} @@ -238,10 +244,11 @@ def filter_by_date(metadata, date_column="date", min_date=None, max_date=None): Returns ------- - set[str]: + set of str: Strains that pass the filter - + Examples + -------- >>> metadata = pd.DataFrame([{"region": "Africa", "date": "2020-01-01"}, {"region": "Europe", "date": "2020-01-02"}], index=["strain1", "strain2"]) >>> filter_by_date(metadata, min_date=numeric_date("2020-01-02")) {'strain2'} @@ -309,10 +316,11 @@ def filter_by_sequence_index(metadata, sequence_index): Returns ------- - set[str]: + set of str: Strains that pass the filter - + Examples + -------- >>> metadata = pd.DataFrame([{"region": "Africa", "date": "2020-01-01"}, {"region": "Europe", "date": "2020-01-02"}], index=["strain1", "strain2"]) >>> sequence_index = pd.DataFrame([{"strain": "strain1", "ACGT": 28000}]).set_index("strain") >>> filter_by_sequence_index(metadata, sequence_index) @@ -339,10 +347,11 @@ def filter_by_sequence_length(metadata, sequence_index, min_length=0): Returns ------- - set[str]: + set of str: Strains that pass the filter - + Examples + -------- >>> metadata = pd.DataFrame([{"region": "Africa", "date": "2020-01-01"}, {"region": "Europe", "date": "2020-01-02"}], index=["strain1", "strain2"]) >>> sequence_index = pd.DataFrame([{"strain": "strain1", "A": 7000, "C": 7000, "G": 7000, "T": 7000}, {"strain": 
"strain2", "A": 6500, "C": 6500, "G": 6500, "T": 6500}]).set_index("strain") >>> filter_by_sequence_length(metadata, sequence_index, min_length=27000) @@ -376,10 +385,11 @@ def filter_by_non_nucleotide(metadata, sequence_index): Returns ------- - set[str]: + set of str: Strains that pass the filter - + Examples + -------- >>> metadata = pd.DataFrame([{"region": "Africa", "date": "2020-01-01"}, {"region": "Europe", "date": "2020-01-02"}], index=["strain1", "strain2"]) >>> sequence_index = pd.DataFrame([{"strain": "strain1", "invalid_nucleotides": 0}, {"strain": "strain2", "invalid_nucleotides": 1}]).set_index("strain") >>> filter_by_non_nucleotide(metadata, sequence_index) @@ -407,10 +417,11 @@ def force_include_strains(metadata, include_file): Returns ------- - set[str]: + set of str: Strains that pass the filter - + Examples + -------- >>> import os >>> from tempfile import NamedTemporaryFile >>> metadata = pd.DataFrame([{"region": "Africa"}, {"region": "Europe"}], index=["strain1", "strain2"]) @@ -442,10 +453,11 @@ def force_include_where(metadata, include_where): Returns ------- - set[str]: + set of str: Strains that pass the filter - + Examples + -------- >>> metadata = pd.DataFrame([{"region": "Africa"}, {"region": "Europe"}], index=["strain1", "strain2"]) >>> force_include_where(metadata, "region!=Europe") {'strain1'} @@ -626,11 +638,11 @@ def apply_filters(metadata, exclude_by, include_by): ---------- metadata : pandas.DataFrame Metadata to filter - exclude_by : list[tuple] + exclude_by : list of tuple A list of 2-element tuples with a callable to filter by in the first index and a dictionary of kwargs to pass to the function in the second index. - include_by : list[tuple] + include_by : list of tuple A list of 2-element tuples in the same format as the ``exclude_by`` argument. @@ -638,16 +650,17 @@ def apply_filters(metadata, exclude_by, include_by): ------- set : Strains to keep (those that passed all filters) - list[dict] : + list of dict : Strains to exclude along with the function that filtered them and the arguments used to run the function. - list[dict] : + list of dict : Strains to force-include along with the function that filtered them and the arguments used to run the function. For example, filter data by minimum date, but force the include of strains from Africa. - + Examples + -------- >>> metadata = pd.DataFrame([{"region": "Africa", "date": "2020-01-01"}, {"region": "Europe", "date": "2020-10-02"}, {"region": "North America", "date": "2020-01-01"}], index=["strain1", "strain2", "strain3"]) >>> exclude_by = [(filter_by_date, {"min_date": numeric_date("2020-04-01")})] >>> include_by = [(force_include_where, {"include_where": "region=Africa"})] diff --git a/augur/filter/io.py b/augur/filter/io.py index a7dc11933..75d542212 100644 --- a/augur/filter/io.py +++ b/augur/filter/io.py @@ -41,7 +41,8 @@ def filter_kwargs_to_str(kwargs): str : String representation of the kwargs for reporting. 
- + Examples + -------- >>> from augur.dates import numeric_date >>> from augur.filter.include_exclude_rules import filter_by_sequence_length, filter_by_date >>> sequence_index = pd.DataFrame([{"strain": "strain1", "ACGT": 28000}, {"strain": "strain2", "ACGT": 26000}, {"strain": "strain3", "ACGT": 5000}]).set_index("strain") diff --git a/augur/filter/subsample.py b/augur/filter/subsample.py index 823d01388..a79e9101d 100644 --- a/augur/filter/subsample.py +++ b/augur/filter/subsample.py @@ -31,7 +31,8 @@ def get_groups_for_subsampling(strains, metadata, group_by=None): list : A list of dictionaries with strains that were skipped from grouping and the reason why (see also: `apply_filters` output). - + Examples + -------- >>> strains = ["strain1", "strain2"] >>> metadata = pd.DataFrame([{"strain": "strain1", "date": "2020-01-01", "region": "Africa"}, {"strain": "strain2", "date": "2020-02-01", "region": "Europe"}]).set_index("strain") >>> group_by = ["region"] @@ -253,6 +254,9 @@ class PriorityQueue: """A priority queue implementation that automatically replaces lower priority items in the heap with incoming higher priority items. + Examples + -------- + Add a single record to a heap with a maximum of 2 records. >>> queue = PriorityQueue(max_size=2) @@ -334,6 +338,9 @@ def create_queues_by_group(groups, max_size, max_attempts=100, random_seed=None) attempts to create queues for which the sum of their maximum sizes is greater than zero. + Examples + -------- + Create queues for two groups with a fixed maximum size. >>> groups = ("2015", "2016") @@ -395,7 +402,7 @@ def calculate_sequences_per_group(target_max_value, group_sizes, allow_probabili target_max_value : int Maximum number of sequences to return by subsampling at some calculated number of sequences per group for the given counts per group. - group_sizes : list[int] + group_sizes : list of int A list with the number of sequences in each requested group. allow_probabilistic : bool Whether to allow probabilistic subsampling when the number of groups @@ -403,7 +410,7 @@ def calculate_sequences_per_group(target_max_value, group_sizes, allow_probabili Raises ------ - TooManyGroupsError : + TooManyGroupsError When there are more groups than sequences per group and probabilistic subsampling is not allowed. 
@@ -477,6 +484,8 @@ def _calculate_sequences_per_group( maximum number of sequences allowed per group to meet the required maximum total sequences allowed + Examples + -------- >>> _calculate_sequences_per_group(4, [4, 2]) 2 >>> _calculate_sequences_per_group(2, [4, 2]) @@ -532,6 +541,8 @@ def _calculate_fractional_sequences_per_group( fractional maximum number of sequences allowed per group to meet the required maximum total sequences allowed + Examples + -------- >>> np.around(_calculate_fractional_sequences_per_group(4, [4, 2]), 4) 1.9375 >>> np.around(_calculate_fractional_sequences_per_group(2, [4, 2]), 4) diff --git a/augur/frequency_estimators.py b/augur/frequency_estimators.py index 92206003b..666fcaafe 100644 --- a/augur/frequency_estimators.py +++ b/augur/frequency_estimators.py @@ -46,7 +46,7 @@ def get_pivots(observations, pivot_interval, start_date=None, end_date=None, piv Returns ------- - pivots : ndarray + pivots : numpy.ndarray floating point pivots spanning the given the dates """ @@ -92,18 +92,18 @@ def make_pivots(pivots, tps): Parameters ---------- - pivots : scalar or iterable + pivots : int or iterable either number of pivots (a scalar) or the actual pivots (will be cast to array and returned) - tps : np.array + tps : numpy.ndarray observation time points. Will generate pivots spanning min/max Returns ------- - pivots : np.array + pivots : numpy.ndarray array of pivot values ''' - if np.isscalar(pivots): + if isinstance(pivots, int): dt = np.max(tps)-np.min(tps) return np.linspace(np.min(tps)-0.01*dt, np.max(tps)+0.01*dt, pivots) else: @@ -124,14 +124,14 @@ def running_average(obs, ws): Parameters ---------- - obs : list/np.array(bool) + obs : list or numpy.ndarray(bool) observations ws : int window size as measured in number of consecutive points Returns ------- - np.array(float) + numpy.ndarray(float) running average of the boolean observations ''' ws=int(ws) @@ -157,14 +157,14 @@ def fix_freq(freq, pc): Parameters ---------- - freq : np.array + freq : numpy.ndarray frequency trajectory to be thresholded pc : float threshold value Returns ------- - np.array + numpy.ndarray thresholded frequency trajectory ''' freq[np.isnan(freq)]=pc @@ -200,11 +200,11 @@ def __init__(self, tps, obs, pivots, stiffness = 20.0, Parameters ---------- - tps : list/np.array(float) + tps : list or numpy.ndarray(float) array with numerical dates - obs : list/np.array(bool) + obs : list or numpy.ndarray(bool) array with boolean observations - pivots : int/np.array(float) + pivots : int or numpy.ndarray(float) either integer specifying the number of pivot values, or list of explicity pivots stiffness : float, optional @@ -329,25 +329,25 @@ class freq_est_clipped(object): Attributes ---------- - dtps : TYPE + dtps Description - fe : TYPE + fe Description - good_pivots : TYPE + good_pivots Description - good_tps : TYPE + good_tps Description - obs : TYPE + obs Description - pivot_freq : TYPE + pivot_freq Description - pivot_lower_cutoff : TYPE + pivot_lower_cutoff Description - pivot_upper_cutoff : TYPE + pivot_upper_cutoff Description - pivots : TYPE + pivots Description - tps : TYPE + tps Description valid : bool Description @@ -426,11 +426,11 @@ def __init__(self, tps, obs, pivots, **kwargs): Parameters ---------- - tps : np.array + tps : numpy.ndarray array of numerical dates - obs : np.array(bool) + obs : numpy.ndarray(bool) array of true/false observations - pivots : np.array + pivots : numpy.ndarray pivot values **kwargs Description @@ -476,9 +476,9 @@ def __init__(self, tree, pivots, 
node_filter=None, min_clades=10, verbose=0, pc= Parameters ---------- - tree : Bio.Phylo.calde + tree : Bio.Phylo.BaseTree.Tree Biopython tree - pivots : int/array + pivots : int or array number or list of pivots node_filter : callable, optional function that evaluates to true/false to filter nodes @@ -625,10 +625,10 @@ def __init__(self, aln, tps, pivots, **kwargs): ---------- aln : Bio.Align.MultipleSeqAlignment alignment - tps : np.array(float) + tps : np.ndarray(float) Array of numerical dates, one for each sequence in the alignment in the SAME ORDER! - pivots : np.array(float) + pivots : np.ndarray(float) pivot values for which frequencies are estimated **kwargs Description @@ -653,7 +653,7 @@ def estimate_genotype_frequency(self, gt): Returns ------- - np.array + numpy.ndarray frequency trajectory ''' match = [] @@ -676,7 +676,7 @@ def mutation_frequencies(self, min_freq=0.01, include_set=None, ignore_char=''): ---------- min_freq : float, optional minimal all-time frequency for an aligment column to be considered - include_set : list/set, optional + include_set : list or set, optional set of alignment column that will be used regardless of variation ignore_char : str, optional ignore this character in an alignment column (missing data) @@ -852,6 +852,8 @@ def timestamp_to_float(time): This is not entirely accurate as it doesn't account for months with different numbers of days, but should be close enough to be accurate for weekly pivots. + Examples + -------- >>> import datetime >>> time = datetime.date(2010, 10, 1) >>> timestamp_to_float(time) @@ -1074,7 +1076,7 @@ def tip_passes_filters(self, tip): If no filters are defined, returns True. Args: - tip (Bio.Phylo): tip from a Bio.Phylo tree annotated with attributes in `tip.attr` + tip (Bio.Phylo.BaseTree.Tree): tip from a Bio.Phylo tree annotated with attributes in `tip.attr` Returns: bool: whether the given tip passes the defined filters or not @@ -1130,10 +1132,10 @@ def estimate(self, tree): values in attribute defined by `self.weights_attribute`. Args: - tree (Bio.Phylo): annotated tree whose nodes all have an `attr` attribute with at least "num_date" key + tree (Bio.Phylo.BaseTree.Tree): annotated tree whose nodes all have an `attr` attribute with at least "num_date" key Returns: - frequencies (dict): node frequencies by clade + dict: node frequencies by clade """ # Calculate pivots for the given tree. diff --git a/augur/index.py b/augur/index.py index e53401607..c8b2e6f93 100644 --- a/augur/index.py +++ b/augur/index.py @@ -26,9 +26,9 @@ def index_vcf(vcf_path, index_path): Parameters ---------- - vcf_path : str or Path-like + vcf_path : str or `os.PathLike` path to a VCF file to index. - index_path : str or Path-like + index_path : str or `os.PathLike` path to a tab-delimited file containing the composition details for each sequence in the given input file. @@ -63,7 +63,7 @@ def index_sequence(sequence, values): sequence : Bio.SeqRecord.SeqRecord sequence record to index. - values : list of sets of str + values : list of set of str values to count; sets must be non-overlapping and contain only single-character, lowercase strings @@ -74,7 +74,8 @@ def index_sequence(sequence, values): for the given values, and a final column with the number of characters that didn't match any of those in the given values. 
- + Examples + -------- >>> other_IUPAC = {'r', 'y', 's', 'w', 'k', 'm', 'd', 'h', 'b', 'v'} >>> values = [{'a'},{'c'},{'g'},{'t'},{'n'}, other_IUPAC, {'-'}, {'?'}] >>> sequence_a = Bio.SeqRecord.SeqRecord(seq=Bio.Seq.Seq("ACTGN-?XWN"), id="seq_A") @@ -153,10 +154,10 @@ def index_sequences(sequences_path, sequence_index_path): Parameters ---------- - sequences_path : str or Path-like + sequences_path : str or `os.PathLike` path to a sequence file to index. - sequence_index_path : str or Path-like + sequence_index_path : str or `os.PathLike` path to a tab-delimited file containing the composition details for each sequence in the given input file. diff --git a/augur/io/file.py b/augur/io/file.py index c2c704acb..146455081 100644 --- a/augur/io/file.py +++ b/augur/io/file.py @@ -10,7 +10,7 @@ def open_file(path_or_buffer, mode="r", **kwargs): Parameters ---------- - path_or_buffer : str or Path-like or IO buffer + path_or_buffer : str or `os.PathLike` or `io.StringIO` Name of the file to open or an existing IO buffer mode : str diff --git a/augur/io/json.py b/augur/io/json.py index af75a1796..2a4678ea2 100644 --- a/augur/io/json.py +++ b/augur/io/json.py @@ -118,6 +118,8 @@ class JSONDecodeError(json.JSONDecodeError): raised by :func:`load_json` and be caught by except blocks which catch the standard :class:`json.JSONDecodeError`. + Examples + -------- >>> load_json('{foo: "bar"}') Traceback (most recent call last): ... @@ -218,6 +220,8 @@ def shorten_left(text, length, placeholder): intended for shortening sentences and works at the word, not character, level. + Examples + -------- >>> shorten_left("foobar", 6, "...") 'foobar' >>> shorten_left("foobarbaz", 6, "...") @@ -244,6 +248,8 @@ def contextualize_char(text, idx, context = 10): Avoids making a copy of *text* before snipping, in case *text* is very large. + Examples + -------- >>> contextualize_char('hello world', 0, context = 4) '▸▸▸h◂◂◂ello…' >>> contextualize_char('hello world', 5, context = 3) @@ -277,6 +283,8 @@ def mark_char(text, idx): """ Prominently marks the *idx* char in *text*. + Examples + -------- >>> mark_char('hello world', 0) '▸▸▸h◂◂◂ello world' >>> mark_char('hello world', 2) diff --git a/augur/io/metadata.py b/augur/io/metadata.py index c4ee27c62..79feb723e 100644 --- a/augur/io/metadata.py +++ b/augur/io/metadata.py @@ -20,20 +20,22 @@ def read_metadata(metadata_file, id_columns=("strain", "name"), chunk_size=None) ---------- metadata_file : str Path to a metadata file to load. - id_columns : list[str] + id_columns : list of str List of possible id column names to check for, ordered by priority. chunk_size : int Size of chunks to stream from disk with an iterator instead of loading the entire input file into memory. Returns ------- - pandas.DataFrame or pandas.TextFileReader + pandas.DataFrame or `pandas.io.parsers.TextFileReader` Raises ------ - KeyError : + KeyError When the metadata file does not have any valid index columns. + Examples + -------- For standard use, request a metadata file and get a pandas DataFrame. @@ -127,7 +129,7 @@ def read_table_to_dict(table, duplicate_reporting=DataErrorMethod.ERROR_FIRST, i Raises ------ - AugurError: + AugurError Raised for any of the following reasons: 1. There are parsing errors from the csv standard library 2. 
The provided *id_column* does not exist in the *metadata* @@ -391,7 +393,7 @@ def write_records_to_tsv(records, output_file): Parameters ---------- - records: iterator[dict] + records: iterable of dict Iterator that yields dict that contains sequences output_file: str diff --git a/augur/io/sequences.py b/augur/io/sequences.py index 7569b05d9..3497dbff7 100644 --- a/augur/io/sequences.py +++ b/augur/io/sequences.py @@ -12,7 +12,7 @@ def read_sequences(*paths, format="fasta"): Parameters ---------- - paths : list of str or Path-like objects + paths : list of str or `os.PathLike` One or more paths to sequence files of any type supported by BioPython. format : str @@ -44,10 +44,10 @@ def write_sequences(sequences, path_or_buffer, format="fasta"): Parameters ---------- - sequences : iterable of Bio.SeqRecord.SeqRecord objects + sequences : iterable of Bio.SeqRecord.SeqRecord A list-like collection of sequences to write - path_or_buffer : str or Path-like object or IO buffer + path_or_buffer : str or `os.PathLike` or `io.StringIO` A path to a file to write the given sequences in the given format. format : str @@ -82,7 +82,7 @@ def write_records_to_fasta(records, fasta, seq_id_field='strain', seq_field='seq Parameters ---------- - records: iterator[dict] + records: iterable of dict Iterator that yields dict that contains sequences fasta: str @@ -101,7 +101,7 @@ def write_records_to_fasta(records, fasta, seq_id_field='strain', seq_field='seq Raises ------ - AugurError: + AugurError When the sequence id field or sequence field does not exist in a record """ with open_file(fasta, "w") as output_fasta: diff --git a/augur/io/vcf.py b/augur/io/vcf.py index 9d65dda00..9808c5d38 100644 --- a/augur/io/vcf.py +++ b/augur/io/vcf.py @@ -10,6 +10,8 @@ def is_vcf(filename): """Convenience method to check if a file is a vcf file. + Examples + -------- >>> is_vcf(None) False >>> is_vcf("./foo") diff --git a/augur/mask.py b/augur/mask.py index 54d3638e7..0370ce121 100644 --- a/augur/mask.py +++ b/augur/mask.py @@ -35,7 +35,7 @@ def mask_vcf(mask_sites, in_file, out_file, cleanup=True): Parameters ---------- - mask_sites: list[int] + mask_sites: list of int A list of site indexes to exclude from the vcf. in_file: str The path to the vcf file you wish to mask. @@ -84,9 +84,9 @@ def mask_sequence(sequence, mask_sites, mask_from_beginning, mask_from_end, mask Parameters ---------- - sequence : Bio.SeqIO.SeqRecord + sequence : Bio.SeqRecord.SeqRecord A sequence to be masked - mask_sites: list[int] + mask_sites: list of int A list of site indexes to exclude from the FASTA. mask_from_beginning: int Number of sites to mask from the beginning of each sequence (default 0) @@ -97,7 +97,7 @@ def mask_sequence(sequence, mask_sites, mask_from_beginning, mask_from_end, mask Returns ------- - Bio.SeqIO.SeqRecord + Bio.SeqRecord.SeqRecord Masked sequence in its original record object """ @@ -132,7 +132,7 @@ def mask_fasta(mask_sites, in_file, out_file, mask_from_beginning=0, mask_from_e Parameters ---------- - mask_sites: list[int] + mask_sites: list of int A list of site indexes to exclude from the FASTA. in_file: str The path to the FASTA file you wish to mask. 
diff --git a/augur/sequence_traits.py b/augur/sequence_traits.py index 87659b13e..573511b57 100644 --- a/augur/sequence_traits.py +++ b/augur/sequence_traits.py @@ -256,7 +256,7 @@ def attach_features(annotations, label, count): ---------- annotations : dict annotations fo stgrains as globed together by `annotate_strains` - label : label + label : str label of the feature set as specified by as command line argument count : str if equal to traits, will count the number of distinct features that diff --git a/augur/titer_model.py b/augur/titer_model.py index 8b60848da..f92f34858 100644 --- a/augur/titer_model.py +++ b/augur/titer_model.py @@ -32,10 +32,11 @@ def load_from_file(filenames, excluded_sources=None): Returns ------- - tuple (dict, list, list) + tuple tuple of a dict of titer measurements, list of strains, list of sources - + Examples + -------- >>> measurements, strains, sources = TiterCollection.load_from_file("tests/data/titer_model/h3n2_titers_subset.tsv") >>> type(measurements) @@ -139,7 +140,7 @@ def count_strains(titers): Parameters ---------- - titers : defaultdict + titers : collections.defaultdict titer measurements indexed by test, reference, and serum @@ -148,7 +149,8 @@ def count_strains(titers): dict number of measurements per strain - + Examples + -------- >>> measurements, strains, sources = TiterCollection.load_from_file("tests/data/titer_model/h3n2_titers_subset.tsv") >>> titer_counts = TiterCollection.count_strains(measurements) >>> titer_counts["A/Acores/11/2013"] @@ -184,7 +186,8 @@ def filter_strains(titers, strains): reduced dictionary of titer measurements containing only those were test and reference virus are part of the strain list - + Examples + -------- >>> measurements, strains, sources = TiterCollection.load_from_file("tests/data/titer_model/h3n2_titers_subset.tsv") >>> len(measurements) 11 @@ -214,10 +217,8 @@ def __init__(self, titers, **kwargs): Parameters ---------- - titers : TYPE - Description + titers **kwargs - Description """ self.kwargs = kwargs @@ -248,15 +249,8 @@ def normalize(self, ref, val): Parameters ---------- - ref : TYPE - Description - val : TYPE - Description - - Returns - ------- - TYPE - Description + ref + val ''' consensus_func = np.mean return consensus_func(np.log2(self.autologous_titers[ref]['val'])) \ @@ -321,6 +315,8 @@ def strain_census(self, titers): make lists of reference viruses, test viruses and sera (there are often multiple sera per reference virus) + Examples + -------- >>> measurements, strains, sources = TiterCollection.load_from_file("tests/data/titer_model/h3n2_titers_subset.tsv") >>> titers = TiterCollection(measurements) >>> sera, ref_strains, test_strains = titers.strain_census(measurements) @@ -333,13 +329,7 @@ def strain_census(self, titers): Parameters ---------- - titers : TYPE - Description - - Returns - ------- - TYPE - Description + titers """ sera = set() ref_strains = set() @@ -459,15 +449,10 @@ def _train(self, method='nnl1reg', lam_drop=1.0, lam_pot = 0.5, lam_avi = 3.0, Parameters ---------- method : str, optional - Description lam_drop : float, optional - Description lam_pot : float, optional - Description lam_avi : float, optional - Description **kwargs - Description ''' self.lam_pot = lam_pot self.lam_avi = lam_avi @@ -510,18 +495,9 @@ def validate(self, plot=False, cutoff=0.0, validation_set = None, fname=None): Parameters ---------- plot : bool, optional - Description cutoff : float, optional - Description validation_set : None, optional - Description fname : None, optional - Description 
- - Returns - ------- - TYPE - Description ''' from scipy.stats import linregress, pearsonr if validation_set is None: @@ -592,11 +568,6 @@ def compile_titers(self): during visualization, we need the average distance of a test virus from a reference virus across sera. hence the hierarchy [ref][test][serum] NOTE: this uses node.name instead of node.clade - - Returns - ------- - TYPE - Description ''' def dstruct(): return defaultdict(dict) @@ -614,11 +585,6 @@ def compile_potencies(self): compile a json structure containing potencies for visualization we need rapid access to all sera for a given reference virus, hence the structure is organized by [ref][serum] - - Returns - ------- - TYPE - Description ''' potency_json = defaultdict(dict) for (ref_clade, serum), val in self.serum_potency.items(): @@ -639,11 +605,6 @@ def compile_potencies(self): def compile_virus_effects(self): ''' compile a json structure containing virus_effects for visualization - - Returns - ------- - TYPE - Description ''' return {test_vir:np.round(val,TITER_ROUND) for test_vir, val in self.virus_effect.items()} @@ -654,11 +615,6 @@ def compile_virus_effects(self): def fit_l1reg(self): ''' regularize genetic parameters with an l1 norm regardless of sign - - Returns - ------- - TYPE - Description ''' try: from cvxopt import matrix, solvers @@ -718,11 +674,6 @@ def fit_nnl2reg(self): def fit_nnl1reg(self): '''l1 regularization of titer drops with non-negativity constraints - - Returns - ------- - TYPE - Description ''' try: from cvxopt import matrix, solvers @@ -807,15 +758,8 @@ def cross_validate(self, n, **kwargs): Parameters ---------- - n : TYPE - Description + n **kwargs - Description - - Returns - ------- - TYPE - Description ''' model_performance = [] @@ -843,15 +787,8 @@ def get_path_no_terminals(self, v1, v2): Parameters ---------- - v1 : TYPE - Description - v2 : TYPE - Description - - Returns - ------- - TYPE - Description + v1 + v2 ''' if v1 in self.strain_lookup and v2 in self.strain_lookup: p1 = [self.strain_lookup[v1]] @@ -881,7 +818,6 @@ def find_titer_splits(self, criterium=None): Parameters ---------- criterium : None, optional - Description ''' if criterium is None: criterium = lambda x:True @@ -1029,15 +965,8 @@ def get_mutations(self, strain1, strain2): Parameters ---------- - strain1 : TYPE - Description - strain2 : TYPE - Description - - Returns - ------- - TYPE - Description + strain1 + strain2 ''' if strain1 in self.sequences and strain2 in self.sequences: muts = [] @@ -1084,7 +1013,6 @@ def make_seqgraph(self, colin_thres = 5): Parameters ---------- colin_thres : int, optional - Description ''' seq_graph = [] titer_dist = [] @@ -1134,8 +1062,7 @@ def collapse_colinear_mutations(self, colin_thres): Parameters ---------- - colin_thres : TYPE - Description + colin_thres ''' TT = self.design_matrix[:,:self.genetic_params].T mutation_clusters = [] @@ -1170,7 +1097,6 @@ def train(self,**kwargs): Parameters ---------- **kwargs - Description ''' self._train(**kwargs) for mi, mut in enumerate(self.relevant_muts): @@ -1195,12 +1121,6 @@ def compile_substitution_effects(self, cutoff=1e-4): Parameters ---------- cutoff : float, optional - Description - - Returns - ------- - TYPE - Description ''' return {mut[0]+':'+mut[1]:np.round(val,int(-np.log10(cutoff))) for mut, val in self.substitution_effect.items() if val>cutoff} @@ -1211,11 +1131,11 @@ def annotate_tree(self, tree): Parameters ---------- - tree : Bio.Phylo + tree : Bio.Phylo.BaseTree.Tree Returns ------- - Bio.Phylo + Bio.Phylo.BaseTree.Tree input 
tree instance with nodes annotated by per-branch and cumulative antigenic advance attributes `dTiterSub` and `cTiterSub` diff --git a/augur/traits.py b/augur/traits.py index d4879a666..c9f812ff4 100644 --- a/augur/traits.py +++ b/augur/traits.py @@ -34,7 +34,7 @@ def mugration_inference(tree=None, seq_meta=None, field='country', confidence=Tr Returns ------- - T : Phylo.Tree + T : Bio.Phylo.BaseTree.Tree Biophyton tree gtr : treetime.GTR GTR model @@ -98,13 +98,6 @@ def mugration_inference(tree=None, seq_meta=None, field='country', confidence=Tr def register_parser(parent_subparsers): - """Add subcommand specific arguments - - Parameters - ---------- - parser : argparse - subcommand argument parser - """ parser = parent_subparsers.add_parser("traits", help=__doc__) parser.add_argument('--tree', '-t', required=True, help="tree to perform trait reconstruction on") parser.add_argument('--metadata', required=True, metavar="FILE", help="table with metadata, as CSV or TSV") @@ -129,7 +122,7 @@ def run(args): Parameters ---------- - args : namespace + args : argparse.Namespace command line arguments are parsed by argparse """ tree_fname = args.tree diff --git a/augur/translate.py b/augur/translate.py index ca58fc75a..32c2598ad 100644 --- a/augur/translate.py +++ b/augur/translate.py @@ -33,6 +33,8 @@ def safe_translate(sequence, report_exceptions=False): Optionally, returns a tuple of the translated sequence and whether an exception was raised during initial translation. + Examples + -------- >>> safe_translate("ATG") 'M' >>> safe_translate("ATGGT-") diff --git a/augur/tree.py b/augur/tree.py index c6f60d8a8..63754cfb0 100644 --- a/augur/tree.py +++ b/augur/tree.py @@ -68,7 +68,8 @@ def check_conflicting_args(tree_builder_args, defaults): ConflictingArgumentsException When any user-provided arguments match those in the defaults. - + Examples + -------- >>> defaults = ("-ntmax", "-m", "-s") >>> check_conflicting_args("-czb -n 2", defaults) >>> check_conflicting_args("-czb -ntmax 2", defaults) diff --git a/augur/util_support/node_data_reader.py b/augur/util_support/node_data_reader.py index 191414318..4d365101e 100644 --- a/augur/util_support/node_data_reader.py +++ b/augur/util_support/node_data_reader.py @@ -16,7 +16,7 @@ class NodeDataReader: If a tree file is specified, it is used to verify the node names. - If validation_mode is set to :py:attr:`ValidationMode.SKIP`, Augur version of node data files is not checked. + If validation_mode is set to :py:attr:`augur.types.ValidationMode.SKIP`, Augur version of node data files is not checked. """ def __init__(self, filenames, tree_file=None, validation_mode=ValidationMode.ERROR): diff --git a/augur/utils.py b/augur/utils.py index b5146c75d..c26a696e9 100644 --- a/augur/utils.py +++ b/augur/utils.py @@ -58,7 +58,7 @@ def read_tree(fname, min_terminals=3): Returns ------- - Bio.Phylo : + Bio.Phylo.BaseTree.Tree : BioPython tree instance """ @@ -318,6 +318,8 @@ def get_parent_name_by_child_name_for_tree(tree): def annotate_parents_for_tree(tree): """Annotate each node in the given tree with its parent. + Examples + -------- >>> import io >>> tree = Bio.Phylo.read(io.StringIO("(A, (B, C))"), "newick") >>> not any([hasattr(node, "parent") for node in tree.find_clades()]) @@ -343,6 +345,9 @@ def json_to_tree(json_dict, root=True, parent_cumulative_branch_length=None): Assigns links back to parent nodes for the root of the tree. + Examples + -------- + Test opening a JSON from augur export v1. 
>>> import json @@ -456,7 +461,7 @@ def read_bed_file(bed_file): Returns ------- - list[int]: + list of int: Sorted list of unique zero-indexed sites """ mask_sites = [] @@ -487,7 +492,7 @@ def read_mask_file(mask_file): Returns ------- - list[int]: + list of int: Sorted list of unique zero-indexed sites """ mask_sites = [] @@ -513,7 +518,7 @@ def load_mask_sites(mask_file): Returns ------- - list[int] + list of int Sorted list of unique zero-indexed sites """ if mask_file.lower().endswith(".bed"): @@ -543,7 +548,7 @@ def read_strains(*files, comment_char="#"): Parameters ---------- - files : one or more str + files : iterable of str one or more names of text files with one strain name per line Returns diff --git a/augur/validate.py b/augur/validate.py index cb9f9526f..4860eca3f 100644 --- a/augur/validate.py +++ b/augur/validate.py @@ -203,6 +203,8 @@ def get_unique_keys(list_of_dicts): """ Returns a set of unique keys from a list of dicts + Examples + -------- >>> list_of_dicts = [{"key1": "val1", "key2": "val2"}, {"key1": "val1", "key3": "val3"}] >>> sorted(get_unique_keys(list_of_dicts)) ['key1', 'key2', 'key3'] diff --git a/docs/Makefile b/docs/Makefile index 0316170ff..21666bb84 100644 --- a/docs/Makefile +++ b/docs/Makefile @@ -1,9 +1,10 @@ # Minimal makefile for Sphinx documentation # -# You can set these variables from the command line. -SPHINXOPTS = -SPHINXBUILD = sphinx-build +# You can set these variables from the command line, and also +# from the environment for the first two. +SPHINXOPTS ?= -n +SPHINXBUILD ?= sphinx-build SOURCEDIR = . BUILDDIR = _build diff --git a/docs/api/developer/augur.errors.rst b/docs/api/developer/augur.errors.rst new file mode 100644 index 000000000..b35076ca3 --- /dev/null +++ b/docs/api/developer/augur.errors.rst @@ -0,0 +1,7 @@ +augur.errors +============ + +.. automodule:: augur.errors + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/api/developer/augur.io.rst b/docs/api/developer/augur.io.rst index 56c1737d8..13c044f60 100644 --- a/docs/api/developer/augur.io.rst +++ b/docs/api/developer/augur.io.rst @@ -5,6 +5,7 @@ augur.io :members: :undoc-members: :show-inheritance: + :noindex: .. toctree:: diff --git a/docs/api/developer/augur.rst b/docs/api/developer/augur.rst index 835535653..d7fc72359 100644 --- a/docs/api/developer/augur.rst +++ b/docs/api/developer/augur.rst @@ -13,6 +13,7 @@ augur augur.clades augur.dates augur.distance + augur.errors augur.export augur.export_v1 augur.export_v2 @@ -35,6 +36,7 @@ augur augur.traits augur.translate augur.tree + augur.types augur.util_support augur.utils augur.validate diff --git a/docs/api/developer/augur.types.rst b/docs/api/developer/augur.types.rst new file mode 100644 index 000000000..d37872b71 --- /dev/null +++ b/docs/api/developer/augur.types.rst @@ -0,0 +1,7 @@ +augur.types +=========== + +.. 
automodule:: augur.types + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/conf.py b/docs/conf.py index e4f383ffe..875637583 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -61,6 +61,7 @@ def prose_list(items): 'sphinx.ext.autodoc', 'sphinxarg.ext', 'sphinx.ext.napoleon', + 'sphinx_autodoc_typehints', # must come after napoleon https://github.com/tox-dev/sphinx-autodoc-typehints/blob/1.21.4/README.md#compatibility-with-sphinxextnapoleon 'sphinx_markdown_tables', 'sphinx.ext.intersphinx', 'nextstrain.sphinx.theme', @@ -116,8 +117,28 @@ def prose_list(items): 'css/custom.css', ] +# -- Resolve build warnings -------------------------------------------------- + +nitpick_ignore = [ + # These are valid numpydoc keywords¹, but somehow they are not recognized by + # napoleon. + # ¹ https://numpydoc.readthedocs.io/en/v1.5.0/format.html#parameters + ('py:class', 'optional'), + ('py:class', 'iterable'), + + # Some references get translated to these, but somehow they can't get + # resolved by intersphinx for a proper link. + ("py:class", "json.decoder.JSONDecodeError"), + ("py:class", "json.encoder.JSONEncoder"), +] + # -- Cross-project references ------------------------------------------------ intersphinx_mapping = { + 'Bio': ('https://biopython.org/docs/latest/api/', None), 'docs.nextstrain.org': ('https://docs.nextstrain.org/en/latest/', None), + 'python': ('https://docs.python.org/3', None), + 'numpy': ('https://numpy.org/doc/stable', None), + 'pandas': ('https://pandas.pydata.org/docs', None), + 'treetime': ('https://treetime.readthedocs.io/en/stable/', None), } diff --git a/docs/faq/metadata.md b/docs/faq/metadata.md deleted file mode 100644 index 9c455d288..000000000 --- a/docs/faq/metadata.md +++ /dev/null @@ -1,59 +0,0 @@ -# Preparing Your Metadata - -Analyses are vastly more interesting if the sequences or samples analyzed have rich 'meta data' wherever possible. This metadata could typically include collection dates, geographic location, symptoms of patients, host characteristics, etc. - -To make the most of augur's features, we recommend including sampling date and at least one type of geographic information if at all possible. However, you can also include things like symptoms, host, clinical outcome - and more! - -For augur to be able to parse this data, it needs to be formated consistently. Your data may have meta information coded into the sequence name (see example [below](#parsing-from-the-header)). If not, a very transparent way is to provide the meta data as a separate table in a tab- or comma-separated file. - -An example meta data file is shown here: - -``` -strain accession date region host -1_0087_PF KX447509 2013-12-XX Oceania Human -1_0181_PF KX447512 2013-12-XX Oceania Bat -1_0199_PF KX447519 2013-11-XX Oceania Human -BRA/2016 KY785433 2016-04-08 South America Cow -BRA/2015 KY558989 2015-02-23 South America Bat -``` - -### A note on Excel - -Because Excel will automatically change the date formatting, we recommend _not_ opening or preparing your meta data file in Excel. If the metadata is already in Excel, or you decide to prepare it in Excel, we recommend using another program to correct the dates afterwards (and don't open it in Excel again!). - -### Format - -**Strain names** - -You must have one column named `strain` or `name`. It contains your sequence names, and needs to match the identifiers of your sequences (in the Fasta or VCF file) _exactly_ and must not contain characters such as spaces, or `()[]{}|#><`. 
- -**Dates** - -Dates should be formated according as `YYYY-MM-DD`. You can specify unknown dates or month by replacing the respected values by `XX` (ex: `2013-01-XX` or `2011-XX-XX`) and completely unknown dates can be shown with `20XX-XX-XX` (which does not restrict the sequence to being in the 21st century - they could be earlier). - -**Geography** - -Geographic locations can be broken down, for example, into `region`, `country`, `division` or `city`. You can have as many levels of geographic information as you wish. For `region`, `country`, and some `division`s augur already knows many lat-long coordinates (see which ones it already knows by checking the list [here](https://github.com/nextstrain/augur/blob/master/augur/data/lat_longs.tsv)). - -It is important that these are spelled consistently. - -If you want to include locations where augur doesn't know the lat-long values, you can include them - see how [here](./lat_longs.html). - -### Consistancy and Style - -Check that your metadata is free from spelling mistakes and that values are consistant. Augur doesn't know that 'UK' and 'United Kingdom' or 'cat' and 'feline' are the same! - -Previously, auspice 'prettified' traits by capitalizing them automatically, and removing the underscores that separated two-word locations ('new_zealand' became 'New Zealand'). - -Auspice will still do this if you are exporting 'V1' type JSON files (from augur v5 or augur v6 using `export v1`), but will not do this if you are using `export v2` ([read more](../releases/migrating-v5-v6.html#prettifying-metadata-fields)). Instead, you should update your metadata files so that traits look the same as you'd like them to display in Auspice (change 'new_zealand' to 'New Zealand' in your metadata, and in any additional latitude-longitude or coloring files you use). - -### Parsing from the header - -Sometimes, metadata can be coded into the Fasta header, like so: - -``` ->1_0087_PF | KX447509 | 2013-12-XX | oceania -ACTCGCTGCATCG... -``` - -Augur can parse meta data from Fasta headers using the `parse` function (see [here](/usage/cli/parse)), but you have to make sure that every sequence has the exact same meta data fields (even if empty), and that they are consistently delimited with `|`. Furthermore, none of the metadata fields can contain the character `|`. diff --git a/docs/faq/metadata.rst b/docs/faq/metadata.rst new file mode 100644 index 000000000..cbf54850b --- /dev/null +++ b/docs/faq/metadata.rst @@ -0,0 +1,108 @@ +Preparing Your Metadata +======================= + +Analyses are vastly more interesting if the sequences or samples +analyzed have rich 'meta data' wherever possible. This metadata could +typically include collection dates, geographic location, symptoms of +patients, host characteristics, etc. + +To make the most of augur's features, we recommend including sampling +date and at least one type of geographic information if at all possible. +However, you can also include things like symptoms, host, clinical +outcome - and more! + +For augur to be able to parse this data, it needs to be formatted +consistently. Your data may have meta information coded into the +sequence name (see example :ref:`below <parsing-from-the-header>`). If +not, a very transparent way is to provide the meta data as a separate +table in a tab- or comma-separated file.
+ +An example meta data file is shown here: + +:: + + strain accession date region host + 1_0087_PF KX447509 2013-12-XX Oceania Human + 1_0181_PF KX447512 2013-12-XX Oceania Bat + 1_0199_PF KX447519 2013-11-XX Oceania Human + BRA/2016 KY785433 2016-04-08 South America Cow + BRA/2015 KY558989 2015-02-23 South America Bat + +A note on Excel +~~~~~~~~~~~~~~~ + +Because Excel will automatically change the date formatting, we +recommend *not* opening or preparing your meta data file in Excel. If +the metadata is already in Excel, or you decide to prepare it in Excel, +we recommend using another program to correct the dates afterwards (and +don't open it in Excel again!). + +Format +~~~~~~ + +**Strain names** + +You must have one column named ``strain`` or ``name``. It contains your +sequence names, and needs to match the identifiers of your sequences (in +the Fasta or VCF file) *exactly* and must not contain characters such as +spaces, or ``()[]{}|#><``. + +**Dates** + +Dates should be formatted as ``YYYY-MM-DD``. You can specify +unknown dates or months by replacing the respective values with ``XX`` (ex: +``2013-01-XX`` or ``2011-XX-XX``) and completely unknown dates can be +shown with ``20XX-XX-XX`` (which does not restrict the sequence to being +in the 21st century - they could be earlier). + +**Geography** + +Geographic locations can be broken down, for example, into ``region``, +``country``, ``division`` or ``city``. You can have as many levels of +geographic information as you wish. For ``region``, ``country``, and +some ``division``\ s augur already knows many lat-long coordinates (see +which ones it already knows by checking the list +`here <https://github.com/nextstrain/augur/blob/master/augur/data/lat_longs.tsv>`__). + +It is important that these are spelled consistently. + +If you want to include locations where augur doesn't know the lat-long +values, you can include them - see how :doc:`here <./lat_longs>`. + +Consistency and Style +~~~~~~~~~~~~~~~~~~~~~ + +Check that your metadata is free from spelling mistakes and that values +are consistent. Augur doesn't know that 'UK' and 'United Kingdom' or +'cat' and 'feline' are the same! + +Previously, auspice 'prettified' traits by capitalizing them +automatically, and removing the underscores that separated two-word +locations ('new_zealand' became 'New Zealand'). + +Auspice will still do this if you are exporting 'V1' type JSON files +(from augur v5 or augur v6 using ``export v1``), but will not do this if +you are using ``export v2`` (`read +more <../releases/migrating-v5-v6.html#prettifying-metadata-fields>`__). +Instead, you should update your metadata files so that traits look the +same as you'd like them to display in Auspice (change 'new_zealand' to +'New Zealand' in your metadata, and in any additional latitude-longitude +or coloring files you use). + +.. _parsing-from-the-header: + +Parsing from the header +~~~~~~~~~~~~~~~~~~~~~~~ + +Sometimes, metadata can be coded into the Fasta header, like so: + +:: + + >1_0087_PF | KX447509 | 2013-12-XX | oceania + ACTCGCTGCATCG... + +Augur can parse meta data from Fasta headers using the ``parse`` +function (see :doc:`here </usage/cli/parse>`), but you have to make sure +that every sequence has the exact same meta data fields (even if empty), +and that they are consistently delimited with ``|``. Furthermore, none +of the metadata fields can contain the character ``|``.
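The rules in the converted page above (an id column named ``strain`` or ``name``, dates written as ``YYYY-MM-DD`` with ``XX`` for unknown parts) can be checked mechanically before running a workflow. Below is a minimal sketch of such a pre-flight check, assuming pandas is available; the file name ``metadata.tsv`` and the ``date`` column name are illustrative, and the snippet is not part of augur itself.

.. code-block:: python

   import pandas as pd

   # Hypothetical input path; any tab- or comma-separated metadata table works.
   metadata = pd.read_csv("metadata.tsv", sep="\t", dtype=str)

   # The table needs at least one of the supported id columns.
   if not {"strain", "name"} & set(metadata.columns):
       raise SystemExit("metadata must contain a 'strain' or 'name' column")

   # Dates should look like YYYY-MM-DD, with XX standing in for unknown parts.
   is_valid_date = metadata["date"].fillna("").str.fullmatch(r"[0-9X]{4}-[0-9X]{2}-[0-9X]{2}")
   print(metadata.loc[~is_valid_date])  # rows whose dates need another look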
diff --git a/docs/usage/cli/distance.rst b/docs/usage/cli/distance.rst index 7ab45d20e..04a72dba0 100644 --- a/docs/usage/cli/distance.rst +++ b/docs/usage/cli/distance.rst @@ -2,6 +2,11 @@ augur distance ============== +.. contents:: Table of Contents + :local: + +---- + .. argparse:: :module: augur :func: make_parser diff --git a/docs/usage/cli/filter.rst b/docs/usage/cli/filter.rst index 36250aee5..85efd151d 100644 --- a/docs/usage/cli/filter.rst +++ b/docs/usage/cli/filter.rst @@ -2,7 +2,8 @@ augur filter ============ -* `How we subsample sequences in the zika-tutoral <#how-we-subsample-sequences-in-the-zika-tutoral>`__ +.. contents:: Table of Contents + :local: ---- @@ -15,7 +16,7 @@ augur filter How we subsample sequences in the zika-tutoral ============================================== -As an example, we'll look that the ``filter`` command in greater detail using material from the :doc:`Zika tutorial `. +As an example, we'll look that the ``filter`` command in greater detail using material from the :doc:`Zika tutorial `. The filter command allows you to selected various subsets of your input data for different types of analysis. A simple example use of this command would be @@ -45,7 +46,7 @@ To drop such strains, you can pass the name of this file to the augur filter com --output filtered.fasta (To improve legibility, we have wrapped the command across multiple lines.) -If you run this command (you should be able to copy-paste this into your terminal) on the data provided in the :doc:`Zika tutorial `, you should see that one of the sequences in the data set was dropped since its name was in the ``dropped_strains.txt`` file. +If you run this command (you should be able to copy-paste this into your terminal) on the data provided in the :doc:`Zika tutorial `, you should see that one of the sequences in the data set was dropped since its name was in the ``dropped_strains.txt`` file. Another common filtering operation is subsetting of data to a achieve a more even spatio-temporal distribution or to cut-down data set size to more manageable numbers. The filter command allows you to select a specific number of sequences from specific groups, for example one sequence per month from each country: diff --git a/docs/usage/cli/index.rst b/docs/usage/cli/index.rst index aea3f83e0..77a4c6b00 100644 --- a/docs/usage/cli/index.rst +++ b/docs/usage/cli/index.rst @@ -2,6 +2,11 @@ augur index ============ +.. contents:: Table of Contents + :local: + +---- + .. argparse:: :module: augur :func: make_parser @@ -11,7 +16,7 @@ augur index Speed up filtering with a sequence index ======================================== -As we describe in :doc:`the zika tutorial `, augur index precalculates the composition of the sequences (e.g., numbers of nucleotides, gaps, invalid characters, and total sequence length) prior to filtering. +As we describe in :doc:`the zika tutorial `, augur index precalculates the composition of the sequences (e.g., numbers of nucleotides, gaps, invalid characters, and total sequence length) prior to filtering. The resulting sequence index speeds up subsequent filter steps especially in more complex workflows. .. code-block:: bash diff --git a/docs/usage/cli/parse.rst b/docs/usage/cli/parse.rst index 283c7d94f..372a65b80 100644 --- a/docs/usage/cli/parse.rst +++ b/docs/usage/cli/parse.rst @@ -2,7 +2,8 @@ augur parse =========== -* `Example: how to parse metadata from fasta-headers <#example-how-to-parse-metadata-from-fasta-headers>`__ +.. 
contents:: Table of Contents + :local: ---- diff --git a/docs/usage/cli/refine.rst b/docs/usage/cli/refine.rst index 6608565d4..1eb58a5a3 100644 --- a/docs/usage/cli/refine.rst +++ b/docs/usage/cli/refine.rst @@ -3,11 +3,8 @@ augur refine =========================== -* `How we use refine in the zika tutorial <#how-we-use-refine-in-the-zika-tutorial>`__ -* `Specify the evolutionary rate <#specify-the-evolutionary-rate>`__ -* `Confidence intervals for divergence times <#confidence-intervals-for-divergence-times>`__ -* `Specifying the root of the tree <#specifying-the-root-of-the-tree>`__ -* `Polytomy resolution <#polytomy-resolution>`__ +.. contents:: Table of Contents + :local: ---- diff --git a/docs/usage/cli/traits.rst b/docs/usage/cli/traits.rst index 772fc5592..c1034158d 100644 --- a/docs/usage/cli/traits.rst +++ b/docs/usage/cli/traits.rst @@ -2,7 +2,10 @@ augur traits ============ -.. contents:: +.. contents:: Table of Contents + :local: + +---- .. argparse:: :module: augur diff --git a/scripts/diff_trees.py b/scripts/diff_trees.py index 27fd48334..ed652f441 100644 --- a/scripts/diff_trees.py +++ b/scripts/diff_trees.py @@ -7,6 +7,8 @@ def clade_to_items(clade, attrs=("name", "branch_length")): """Recursively convert a clade of a tree to a list of nested lists according to the topology of the clade with the requested attributes per node. + Examples + -------- >>> from io import StringIO >>> treedata = "(A, (B, C), (D, E))" >>> handle = StringIO(treedata) diff --git a/setup.py b/setup.py index 2ed0bdd1f..4f7f41431 100644 --- a/setup.py +++ b/setup.py @@ -82,6 +82,7 @@ "sphinx-argparse >=0.2.5", "sphinx-markdown-tables >= 0.0.9", "sphinx-rtd-theme >=0.4.3", + "sphinx-autodoc-typehints >=1.21.4", "wheel >=0.32.3", "ipdb >=0.10.1" ]
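Taken together, the docstring changes in this diff settle on a single referencing style: numpydoc sections with fully qualified type names (for example ``Bio.Phylo.BaseTree.Tree``, ``numpy.ndarray``, ``list of str``) that the new intersphinx mappings in ``docs/conf.py`` can resolve, plus an explicit ``Examples`` heading so doctests render correctly under napoleon. The sketch below shows roughly what a new docstring written against these conventions might look like; the function itself is hypothetical and not part of this diff.

.. code-block:: python

   import io

   import Bio.Phylo


   def count_tips(tree):
       """Count the terminal nodes of a tree.

       Parameters
       ----------
       tree : Bio.Phylo.BaseTree.Tree
           a rooted Biopython tree

       Returns
       -------
       int
           number of terminal nodes (tips) in the tree

       Examples
       --------
       >>> tree = Bio.Phylo.read(io.StringIO("(A, (B, C))"), "newick")
       >>> count_tips(tree)
       3
       """
       return len(tree.get_terminals())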