diff --git a/.git-blame-ignore-revs b/.git-blame-ignore-revs deleted file mode 100644 index 0b3bf1a..0000000 --- a/.git-blame-ignore-revs +++ /dev/null @@ -1,2 +0,0 @@ -# Reformat with black and isort -89abdee3b5fb2dc0216da5fcb1467010d32529d3 diff --git a/.github/CODEOWNERS b/.github/CODEOWNERS new file mode 100644 index 0000000..296dda6 --- /dev/null +++ b/.github/CODEOWNERS @@ -0,0 +1 @@ +* @dpoznik diff --git a/.gitignore b/.gitignore index bcbe672..4154aa1 100644 --- a/.gitignore +++ b/.gitignore @@ -1,8 +1,59 @@ -__pycache__ -*.egg-info -*.pyc +# Binaries, byte compilations, etc. +#---------------------------------- +__pycache__/ +*.py[cod] +*.so + +# Caches +#---------------------------------- +.cache +.ipynb_checkpoints +.metaflow +.minio.sys +.mypy_cache +.pytest_cache +.tox + +# Distribution & packaging +#---------------------------------- +build/ +dist/ +eggs/ +sdist/ +wheels/ +.eggs/ +*.egg +*.egg-info/ +_version.py + +# Editors & IDEs +#---------------------------------- +*~ +\#* +.#* .project .pydevproject .settings -.venv -output/ +.vscode + +# Environments +#---------------------------------- +.python-version + +# macOS +#---------------------------------- +.DS_Store +._* +.Trash* + +# Project-specific +#---------------------------------- +logs/ +output*/ +tests/fixtures/input/1000Y.all.bcf +tests/fixtures/input/1000Y.all.bcf.csi +tests/fixtures/input/1000Y.subset.bcf +tests/fixtures/input/1000Y.subset.bcf.csi +tests/fixtures/input/ALL.chrY_10Mbp_mask.glia_freebayes_maxLikGT_siteQC.20130502.60555_biallelic_snps.vcf.gz +tests/fixtures/input/ALL.chrY_10Mbp_mask.glia_freebayes_maxLikGT_siteQC.20130502.60555_biallelic_snps.vcf.gz.tbi +!tests/fixtures/output/ diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 598e232..fcd731f 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -1,17 +1,34 @@ default_stages: [commit, merge-commit] +fail_fast: true repos: - - repo: https://gitlab.com/pycqa/flake8 - rev: 3.9.0 - hooks: - - id: flake8 - types: [file, python] - args: [--select, "F401,F841"] # Check for unused imports and variables - - repo: https://github.com/pycqa/isort - rev: 5.8.0 + - repo: git@github.com:PyCQA/isort.git + rev: 5.12.0 hooks: - id: isort - - repo: https://github.com/psf/black - rev: 20.8b1 + - repo: git@github.com:psf/black.git + rev: 23.7.0 hooks: - id: black - language_version: python3 + - repo: git@github.com:pre-commit/pre-commit-hooks.git + rev: v4.4.0 + hooks: + - id: check-yaml + args: [--allow-multiple-documents] + - id: pretty-format-json + - id: trailing-whitespace + exclude: haplogroups.*.txt|isogg_tree/|isogg.[0-9.]*.txt + - repo: git@github.com:PyCQA/flake8.git + rev: 6.1.0 + hooks: + - id: flake8 + - repo: git@github.com:PyCQA/pydocstyle.git + rev: 6.3.0 + hooks: + - id: pydocstyle + additional_dependencies: [tomli] + exclude: tests/ + - repo: git@github.com:pre-commit/mirrors-mypy.git + rev: v1.5.1 + hooks: + - id: mypy + additional_dependencies: [types-PyYAML] diff --git a/CHANGELOG.md b/CHANGELOG.md new file mode 100644 index 0000000..7d0065f --- /dev/null +++ b/CHANGELOG.md @@ -0,0 +1,64 @@ +# Changelog for `yhaplo` + +Format based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/) + + +## Planned + +- Correct ISOGG polarization errors for a few dozen SNPs. + + +## [Unreleased] + +No unreleased changes + +[Unreleased]: https://github.com/23andMe/yhaplo/compare/2.0.2...HEAD + + +## [2.0.2] - 2023-09-15 + +This is a major clean-up and refactoring release. 
+Core logic has not changed, and output should be equivalent to prior versions. +The key changes from an end-user perspective are BCF support, a cleaner API, +and faster processing of most input types. + +### Added +- BCF support +- Automated tests +- Optional dependencies +- `Sample` subclasses: `TextSample`, `VCFSample`, `AblockSample` +- API for processing ablocks (23andMe internal) +- `Dockerfile` defining image for Batch computes (23andMe internal) +- Compute flow (23andMe internal) +- Script for copying and altering files for open sourcing (23andMe internal) +- `CHANGELOG.md` + +### Changed +- Lint and update pre-commit hooks +- Set up Drone CI (23andMe internal) +- Set up `tox` testing (23andMe internal) +- Update `Makefile` and configuration files +- Refactor for PEP-8 compliance (snake case, etc.) +- Update directory structure +- Modernize packaging and infer version dynamically +- Namespace command-line entry points: `yhaplo`, `yhaplo_convert_to_genos`, `yhaplo_plot_tree` +- Replace static methods +- Clean up logging and use file handlers +- Use f-strings +- Reformat docstrings +- Add type annotations +- Use `importlib.resources` to load metadata files +- Move example input from package to `tests/fixtures/` +- Update `README.md`, `README.23andMe.md`, and `yhaplo_manual.pdf` +- Speed up sample-major file processing +- Speed up ablock processing (23andMe internal) +- Use Pysam to process VCF/BCF input +- Map physical coordinates to block indexes (23andMe internal) +- Handle platform SNPs natively (23andMe internal) + +### Removed +- Support for Python 2 and Python 3.8 +- Use of research-environment utilities (23andMe internal) + +[2.0.2]: https://github.com/23andMe/yhaplo/compare/1.1.2..2.0.2 + diff --git a/LICENSE.txt b/LICENSE.txt index 05ad991..45d8266 100644 --- a/LICENSE.txt +++ b/LICENSE.txt @@ -1,233 +1,233 @@ YHAPLO LICENSE -Copyright (c) 2016, 23ANDME, INC. All rights reserved. - -Redistribution and use in source and binary forms are permitted, provided that the -above copyright notice and the License Agreement (LICENSE.txt, which is distributed -with the source code) are duplicated in all such forms and that any documentation, -advertising materials, and other materials related to such distribution and use -acknowledge that the yhaplo™ Software was developed by 23andMe, Inc. The name of -23andMe, Inc. may not be used to endorse or promote products derived from this -software without specific prior written permission. THIS SOFTWARE IS PROVIDED "AS IS" -AND WITHOUT ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, WITHOUT LIMITATION, +Copyright (c) 2016, 23ANDME, INC. All rights reserved. + +Redistribution and use in source and binary forms are permitted, provided that the +above copyright notice and the License Agreement (LICENSE.txt, which is distributed +with the source code) are duplicated in all such forms and that any documentation, +advertising materials, and other materials related to such distribution and use +acknowledge that the yhaplo™ Software was developed by 23andMe, Inc. The name of +23andMe, Inc. may not be used to endorse or promote products derived from this +software without specific prior written permission. THIS SOFTWARE IS PROVIDED "AS IS" +AND WITHOUT ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, WITHOUT LIMITATION, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. 
NON-EXCLUSIVE LICENSE AGREEMENT -This Agreement (the "License Agreement") is made and is effective as of the date when -Licensee (as defined below) exercises the Licensed Rights with respect to the Licensed -Materials (as both terms are defined below). By exercising the Licensed Rights, Licensee -accepts and agrees to be bound by the terms and conditions of this License Agreement. -Licensee is granted the Licensed Rights in consideration of Licensee's acceptance of -these terms and conditions. The Licensed Rights in the Licensed Materials are granted -by 23ndMe, Inc. Delaware corporation having a principal place of business at 899 West -Evelyn Avenue, Mountain View, CA 94041 ("23andMe") subject to the terms of this +This Agreement (the "License Agreement") is made and is effective as of the date when +Licensee (as defined below) exercises the Licensed Rights with respect to the Licensed +Materials (as both terms are defined below). By exercising the Licensed Rights, Licensee +accepts and agrees to be bound by the terms and conditions of this License Agreement. +Licensee is granted the Licensed Rights in consideration of Licensee's acceptance of +these terms and conditions. The Licensed Rights in the Licensed Materials are granted +by 23ndMe, Inc. Delaware corporation having a principal place of business at 899 West +Evelyn Avenue, Mountain View, CA 94041 ("23andMe") subject to the terms of this License Agreement. -IF YOU ARE EXERCISING THE LICENSED RIGHTS FOR NON-COMMERCIAL RESEARCH PURPOSES AND ON -BEHALF OF (i) AN ACADEMIC INSTITUTION, (ii) A NOT-FOR-PROFIT INSTITUTION, OR (iii) -YOURSELF, AS AN INDIVIDUAL, AND YOU AGREE TO BE BOUND BY THE TERMS OF THIS LICENSE -AGREEMENT, YOU MAY DOWNLOAD OR OTHERWISE USE THE LICENSED MATERIALS AS SET FORTH IN THIS -LICENSE AGREEMENT. OTHERWISE, 23ANDME DOES NOT GRANT YOU A LICENSE TO THE LICENSED +IF YOU ARE EXERCISING THE LICENSED RIGHTS FOR NON-COMMERCIAL RESEARCH PURPOSES AND ON +BEHALF OF (i) AN ACADEMIC INSTITUTION, (ii) A NOT-FOR-PROFIT INSTITUTION, OR (iii) +YOURSELF, AS AN INDIVIDUAL, AND YOU AGREE TO BE BOUND BY THE TERMS OF THIS LICENSE +AGREEMENT, YOU MAY DOWNLOAD OR OTHERWISE USE THE LICENSED MATERIALS AS SET FORTH IN THIS +LICENSE AGREEMENT. OTHERWISE, 23ANDME DOES NOT GRANT YOU A LICENSE TO THE LICENSED MATERIALS. -1. Definitions. In addition to the definitions appearing elsewhere in this - Agreement, when used in this Agreement (or its exhibits) the following terms shall +1. Definitions. In addition to the definitions appearing elsewhere in this + Agreement, when used in this Agreement (or its exhibits) the following terms shall have the following meanings: - a. "Derivative Work(s)" shall mean any revision, enhancement, modification, - translation, abridgement, condensation, or expansion created by Licensee for - internal purposes that is based upon the Licensed Materials or a portion - thereof that would be a copyright infringement if prepared without the - authorization of 23andMe or used for redistribution of the Licensed Materials + a. "Derivative Work(s)" shall mean any revision, enhancement, modification, + translation, abridgement, condensation, or expansion created by Licensee for + internal purposes that is based upon the Licensed Materials or a portion + thereof that would be a copyright infringement if prepared without the + authorization of 23andMe or used for redistribution of the Licensed Materials outside Licensee's Institution. b. 
"Institution" means an academic or not-for-profit institution performing bona- fide non-commercial research on the field of biotechnology. - c. "Licensed Materials" means the yhaplo™ Software, including the associated + c. "Licensed Materials" means the yhaplo™ Software, including the associated White Paper, Manual, and any additional documentation. - d. "Licensed Rights" means the non-exclusive rights to download, use, access, - copy or modify the Licensed Materials for non-commercial research purposes + d. "Licensed Rights" means the non-exclusive rights to download, use, access, + copy or modify the Licensed Materials for non-commercial research purposes and subject to the terms and conditions of this License. - e. "Licensee" means (i) the Institution exercising the Licensed Rights in the - Licensed Materials under this License. For the sake of clarity, the Agreement - applies to the Institution and the individual(s) employed by, affiliated with, or - enrolled at, an Institution, who exercise(s) the Licensed Rights on behalf of - such Institution; or (ii) an individual exercising the Licensed Rights in the - Licensed Materials on behalf of himself or herself exclusively for Non-Commercial + e. "Licensee" means (i) the Institution exercising the Licensed Rights in the + Licensed Materials under this License. For the sake of clarity, the Agreement + applies to the Institution and the individual(s) employed by, affiliated with, or + enrolled at, an Institution, who exercise(s) the Licensed Rights on behalf of + such Institution; or (ii) an individual exercising the Licensed Rights in the + Licensed Materials on behalf of himself or herself exclusively for Non-Commercial Research. f. "Licensor" means 23andMe, Inc. - g. "Non-Commercial Research" means bona-fide research on the field of - biotechnology (i) primarily intended to generate new knowledge and - understanding using scientific methods; (ii) conducted to publish the research - findings and share the derived data in the scientific community; (iii) conducted - in accordance with applicable laws, regulations and ethical requirements; and - (iv) not intended for, or directed towards, commercial advantage or monetary - compensation. + g. "Non-Commercial Research" means bona-fide research on the field of + biotechnology (i) primarily intended to generate new knowledge and + understanding using scientific methods; (ii) conducted to publish the research + findings and share the derived data in the scientific community; (iii) conducted + in accordance with applicable laws, regulations and ethical requirements; and + (iv) not intended for, or directed towards, commercial advantage or monetary + compensation. 2. Scope of Licensed Rights. - a. License grant. Subject to the terms and conditions of this Agreement, - 23andMe hereby grants Licensee a worldwide, royalty-free, non-sublicensable, - non-exclusive, irrevocable license to exercise the Licensed Rights in the + a. License grant. 
Subject to the terms and conditions of this Agreement, + 23andMe hereby grants Licensee a worldwide, royalty-free, non-sublicensable, + non-exclusive, irrevocable license to exercise the Licensed Rights in the Licensed Materials to: - (i) Use and reproduce the Licensed Materials, in whole or in part, for + (i) Use and reproduce the Licensed Materials, in whole or in part, for internal Non-Commercial Research purposes only; and - (ii) produce, reproduce, and create Derivative Works for internal + (ii) produce, reproduce, and create Derivative Works for internal Non-Commercial Research purposes only. - b. License Term. The term of the License under this Agreement is specified in + b. License Term. The term of the License under this Agreement is specified in Section 6(a). c. Downstream recipients. - (i) Additional offer from 23andMe; Derivative Works. If Licensee modifies - the Licensed Materials, every recipient of the Licensed Materials, as - modified per Licensee's contributions automatically receives an offer from - 23andMe to exercise the Licensed Rights in the Licensed Materials under the - conditions of the license that Licensee applies to their contributions to - the Licensed Materials ("Derivative Work License"), provided that any - Derivative Work License must be solely used internally, for Non-Commercial - Research purposes only and must be substantially similar to the Licensed + (i) Additional offer from 23andMe; Derivative Works. If Licensee modifies + the Licensed Materials, every recipient of the Licensed Materials, as + modified per Licensee's contributions automatically receives an offer from + 23andMe to exercise the Licensed Rights in the Licensed Materials under the + conditions of the license that Licensee applies to their contributions to + the Licensed Materials ("Derivative Work License"), provided that any + Derivative Work License must be solely used internally, for Non-Commercial + Research purposes only and must be substantially similar to the Licensed Rights granted under this Agreement. - (ii) No downstream restrictions. Licensee may not offer or impose any - additional or different terms or conditions on, or apply any technological - measures to, the Licensed Materials (when shared internally) if doing so - restricts exercise of the Licensed Rights by a recipient of the Licensed + (ii) No downstream restrictions. Licensee may not offer or impose any + additional or different terms or conditions on, or apply any technological + measures to, the Licensed Materials (when shared internally) if doing so + restricts exercise of the Licensed Rights by a recipient of the Licensed Material. - d. No endorsement. Nothing in this License constitutes or may be construed as - permission to assert or imply that Licensee is, or that Licensee's use of the - Licensed Materials is, connected with, or sponsored, endorsed, or granted - official status by, 23andMe or others designated to receive attribution as + d. No endorsement. Nothing in this License constitutes or may be construed as + permission to assert or imply that Licensee is, or that Licensee's use of the + Licensed Materials is, connected with, or sponsored, endorsed, or granted + official status by, 23andMe or others designated to receive attribution as provided in Section 3(a)(i)(A). e. Other rights. 
- (i) Moral rights, such as the right of integrity, are not licensed under this - License, nor are publicity, privacy, and/or other similar personality - rights; however, to the extent possible, 23andMe waives and/or agrees not - to assert any such rights held by 23andMe to the limited extent necessary + (i) Moral rights, such as the right of integrity, are not licensed under this + License, nor are publicity, privacy, and/or other similar personality + rights; however, to the extent possible, 23andMe waives and/or agrees not + to assert any such rights held by 23andMe to the limited extent necessary to allow Licensee to exercise the Licensed Rights, but not otherwise. (ii) Patent and trademark rights are not licensed under this License. - (iii) 23andMe expressly reserves any right to collect royalties and/or exercise - any legal or equitable remedy available to it when the Licensed Materials - are used other than for Non-Commercial Research purposes or otherwise + (iii) 23andMe expressly reserves any right to collect royalties and/or exercise + any legal or equitable remedy available to it when the Licensed Materials + are used other than for Non-Commercial Research purposes or otherwise beyond the scope of rights granted under this License Agreement. -3. License Conditions. Licensee's exercise of the Licensed Rights is expressly +3. License Conditions. Licensee's exercise of the Licensed Rights is expressly made subject to the following conditions. a. Attribution. - (i) Licensee may share the Licensed Materials internally (including in + (i) Licensee may share the Licensed Materials internally (including in modified form), provided that Licensee must: - A. Retain the following if it is supplied by 23andMe with the Licensed + A. Retain the following if it is supplied by 23andMe with the Licensed Material: - a. Identification of 23andMe as creator of the Licensed Materials in + a. Identification of 23andMe as creator of the Licensed Materials in the manner requested by 23andMe; b. A copyright notice; - c. A notice that refers to the file where this License Agreement is + c. A notice that refers to the file where this License Agreement is made available or a link to this License Agreement; d. A notice that refers to the disclaimer of warranties; - e. A URI or hyperlink to the Licensed Materials to the extent + e. A URI or hyperlink to the Licensed Materials to the extent reasonably practicable; - B. Indicate if Licensee modified the Licensed Material and retain an + B. Indicate if Licensee modified the Licensed Material and retain an indication of any previous modifications; and - C. Indicate the Licensed Materials are licensed under this Agreement, and + C. Indicate the Licensed Materials are licensed under this Agreement, and include the text of, or the URI or hyperlink to, this License Agreement. - (ii) Licensee may satisfy the conditions in this Section 3(a) in any reasonable - manner based on the medium, means, and context in which Licensee shares - the Licensed Material. For example, it may be reasonable to satisfy the - conditions by providing a URI or hyperlink to a resource that includes the + (ii) Licensee may satisfy the conditions in this Section 3(a) in any reasonable + manner based on the medium, means, and context in which Licensee shares + the Licensed Material. For example, it may be reasonable to satisfy the + conditions by providing a URI or hyperlink to a resource that includes the required information. 
- (iii) If requested by 23andMe, Licensee must remove any of the information + (iii) If requested by 23andMe, Licensee must remove any of the information required by Section 3(a) to the extent reasonably practicable. - b. No Redistribution or Sharing. In addition to the conditions in Section - 3(a) if Licensee modifies the Licensed Materials, such modifications may - only be used for internal purposes and not for distribution. The Licensed - Materials may not be used for redistribution purposes and any individual or - institution interested in obtaining Licensed Rights in the Licensed - Materials should be directed to 23andMe. + b. No Redistribution or Sharing. In addition to the conditions in Section + 3(a) if Licensee modifies the Licensed Materials, such modifications may + only be used for internal purposes and not for distribution. The Licensed + Materials may not be used for redistribution purposes and any individual or + institution interested in obtaining Licensed Rights in the Licensed + Materials should be directed to 23andMe. 4. DISCLAIMER OF WARRANTIES AND LIMITATION OF LIABILITY. - a. UNLESS OTHERWISE SEPARATELY UNDERTAKEN BY 23ANDME, TO THE - EXTENT POSSIBLE, 23ANDME OFFERS THE LICENSED MATERIALS AS-IS AND - AS-AVAILABLE, AND MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY - KIND CONCERNING THE LICENSED MATERIALS, WHETHER EXPRESS, IMPLIED, - STATUTORY, OR OTHER. THIS INCLUDES, WITHOUT LIMITATION, - WARRANTIES OF TITLE, MERCHANTABILITY, FITNESS FOR A PARTICULAR - PURPOSE, NON-INFRINGEMENT, ABSENCE OF LATENT OR OTHER DEFECTS, - ACCURACY, OR THE PRESENCE OR ABSENCE OF ERRORS, WHETHER OR NOT - KNOWN OR DISCOVERABLE. WHERE DISCLAIMERS OF WARRANTIES ARE NOT - ALLOWED IN FULL OR IN PART, THIS DISCLAIMER MAY NOT APPLY TO + a. UNLESS OTHERWISE SEPARATELY UNDERTAKEN BY 23ANDME, TO THE + EXTENT POSSIBLE, 23ANDME OFFERS THE LICENSED MATERIALS AS-IS AND + AS-AVAILABLE, AND MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY + KIND CONCERNING THE LICENSED MATERIALS, WHETHER EXPRESS, IMPLIED, + STATUTORY, OR OTHER. THIS INCLUDES, WITHOUT LIMITATION, + WARRANTIES OF TITLE, MERCHANTABILITY, FITNESS FOR A PARTICULAR + PURPOSE, NON-INFRINGEMENT, ABSENCE OF LATENT OR OTHER DEFECTS, + ACCURACY, OR THE PRESENCE OR ABSENCE OF ERRORS, WHETHER OR NOT + KNOWN OR DISCOVERABLE. WHERE DISCLAIMERS OF WARRANTIES ARE NOT + ALLOWED IN FULL OR IN PART, THIS DISCLAIMER MAY NOT APPLY TO LICENSEE. - b. TO THE EXTENT POSSIBLE, IN NO EVENT WILL 23ANDME BE LIABLE TO - LICENSEE OR TO ANY THIRD PARTY ON ANY LEGAL THEORY (INCLUDING, - WITHOUT LIMITATION, NEGLIGENCE) OR OTHERWISE FOR ANY DIRECT, - SPECIAL, INDIRECT, INCIDENTAL, CONSEQUENTIAL, PUNITIVE, EXEMPLARY, - OR OTHER LOSSES, COSTS, EXPENSES, OR DAMAGES ARISING OUT OF THIS - LICENSE AGREEMENT OR USE OF THE LICENSED MATERIALS, EVEN IF - 23ANDME HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH LOSSES, COSTS, - EXPENSES, OR DAMAGES. WHERE A LIMITATION OF LIABILITY IS NOT - ALLOWED IN FULL OR IN PART, THIS LIMITATION MAY NOT APPLY TO + b. TO THE EXTENT POSSIBLE, IN NO EVENT WILL 23ANDME BE LIABLE TO + LICENSEE OR TO ANY THIRD PARTY ON ANY LEGAL THEORY (INCLUDING, + WITHOUT LIMITATION, NEGLIGENCE) OR OTHERWISE FOR ANY DIRECT, + SPECIAL, INDIRECT, INCIDENTAL, CONSEQUENTIAL, PUNITIVE, EXEMPLARY, + OR OTHER LOSSES, COSTS, EXPENSES, OR DAMAGES ARISING OUT OF THIS + LICENSE AGREEMENT OR USE OF THE LICENSED MATERIALS, EVEN IF + 23ANDME HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH LOSSES, COSTS, + EXPENSES, OR DAMAGES. 
WHERE A LIMITATION OF LIABILITY IS NOT + ALLOWED IN FULL OR IN PART, THIS LIMITATION MAY NOT APPLY TO LICENSEE. - c. The disclaimer of warranties and limitation of liability provided above shall be - interpreted in a manner that, to the extent possible, most closely + c. The disclaimer of warranties and limitation of liability provided above shall be + interpreted in a manner that, to the extent possible, most closely approximates an absolute disclaimer and waiver of all liability. 5. Term and Termination. - a. The License Rights to the Licensed Materials will apply for the term of the - copyrights licensed under the Agreement. However, if Licensee fails to comply - with this License, then Licensee's rights under this License Agreement + a. The License Rights to the Licensed Materials will apply for the term of the + copyrights licensed under the Agreement. However, if Licensee fails to comply + with this License, then Licensee's rights under this License Agreement terminate automatically. - b. Where Licensee's right to use the Licensed Material has terminated under - Section 5(a), it may be reinstated: (i) Automatically as of the date the - violation is cured, provided it is cured within fifteen (15) days of Licensee's - discovery of the violation; or (ii) Upon express reinstatement by 23andMe. For - clarity, this Section 5 does not affect any right 23andMe may have to seek any - remedies available at law or equity for Licensee's violations of this License + b. Where Licensee's right to use the Licensed Material has terminated under + Section 5(a), it may be reinstated: (i) Automatically as of the date the + violation is cured, provided it is cured within fifteen (15) days of Licensee's + discovery of the violation; or (ii) Upon express reinstatement by 23andMe. For + clarity, this Section 5 does not affect any right 23andMe may have to seek any + remedies available at law or equity for Licensee's violations of this License Agreement. - c. 23andMe reserves the right to (i) offer the Licensed Materials under separate - terms or conditions or (ii) stop distributing the Licensed Materials at any time; + c. 23andMe reserves the right to (i) offer the Licensed Materials under separate + terms or conditions or (ii) stop distributing the Licensed Materials at any time; however, doing so will not terminate the License Rights granted herein. d. Sections 1, 4, 5, 6, and 7 survive termination of this License Agreement. -6. Other Terms and Conditions. By exercising the Licensed Rights, Licensee - understands and agrees that this License Agreement (and any dispute, controversy, - proceedings or claim of whatever nature arising out of this Agreement or its - formation) shall be construed, interpreted and governed by the laws of the State of - California and shall be subject to the exclusive jurisdiction of the California - Courts. 23andMe shall not be bound by any additional or different terms or conditions - communicated by Licensee unless expressly agreed in a writing executed by an - authorized officer. Nothing in this Agreement shall operate to transfer to Licensee - any intellectual property rights relating to the Licensed Materials. Any - arrangements, understandings, or agreements regarding the Licensed Material not - stated herein are separate from and independent of the terms and conditions of this +6. Other Terms and Conditions. 
By exercising the Licensed Rights, Licensee + understands and agrees that this License Agreement (and any dispute, controversy, + proceedings or claim of whatever nature arising out of this Agreement or its + formation) shall be construed, interpreted and governed by the laws of the State of + California and shall be subject to the exclusive jurisdiction of the California + Courts. 23andMe shall not be bound by any additional or different terms or conditions + communicated by Licensee unless expressly agreed in a writing executed by an + authorized officer. Nothing in this Agreement shall operate to transfer to Licensee + any intellectual property rights relating to the Licensed Materials. Any + arrangements, understandings, or agreements regarding the Licensed Material not + stated herein are separate from and independent of the terms and conditions of this License Agreement. -7. Interpretation. To the extent possible, if any provision of this License is - deemed unenforceable, it shall be automatically reformed to the minimum extent - necessary to make it enforceable. If the provision cannot be reformed, it shall be - severed from this License without affecting the enforceability of the remaining terms - and conditions. No term or condition of this License will be waived and no failure to - comply consented to unless expressly agreed to by 23andMe. Nothing in this License - constitutes or may be interpreted as a limitation upon, or waiver of, any privileges - and immunities that apply to 23andMe or Licensee, including from the legal processes +7. Interpretation. To the extent possible, if any provision of this License is + deemed unenforceable, it shall be automatically reformed to the minimum extent + necessary to make it enforceable. If the provision cannot be reformed, it shall be + severed from this License without affecting the enforceability of the remaining terms + and conditions. No term or condition of this License will be waived and no failure to + comply consented to unless expressly agreed to by 23andMe. Nothing in this License + constitutes or may be interpreted as a limitation upon, or waiver of, any privileges + and immunities that apply to 23andMe or Licensee, including from the legal processes of any jurisdiction or authority. 
End of License Agreement diff --git a/Makefile b/Makefile new file mode 100644 index 0000000..2757cc1 --- /dev/null +++ b/Makefile @@ -0,0 +1,60 @@ +BOLD_CYAN = \033[1;36m +GREEN = \033[0;32m +NO_COLOR = \033[0m + +## General +# ---------------------------------------------------------------------- +help: ## Print this help message + @egrep -h '(\s|^)##\s' $(MAKEFILE_LIST) \ + | sed -E "s/^## (.*)/\n$$(printf "${BOLD_CYAN}")\1$$(printf "${NO_COLOR}")/g" \ + | awk 'BEGIN {FS = ":.*?## "}; {printf "${GREEN}%-25s${NO_COLOR} %s\n", $$1, $$2}' + @echo + + +## Development environment +# ---------------------------------------------------------------------- +PACKAGE_NAME = yhaplo +ENV_NAME = $(PACKAGE_NAME) + +dev-pyenv-virtualenv: ## Set up pyenv-virtual-env-based development environment + pyenv uninstall --force $(ENV_NAME) + pyenv local --unset + pyenv virtualenv $(ENV_NAME) + pyenv local $(ENV_NAME) + pip install --upgrade pip setuptools wheel + $(MAKE) dev-install + $(MAKE) init-hooks + +dev-install: optional_deps := dev,plot,vcf +dev-install: ttam_deps := ,ttam +dev-install: optional_deps := $(if $(findstring ttam.yhaplo,$(CURDIR)),$(addsuffix $(ttam_deps),$(optional_deps)),$(optional_deps)) +dev-install: ## Install package as editable, with optional dependencies + pip install --editable .[$(optional_deps)] + python -m ipykernel install --user --name $(ENV_NAME) --display-name $(PACKAGE_NAME) + + +## Pre-commit hooks +# ---------------------------------------------------------------------- +init-hooks: install-hooks update-hooks ## Install and update hooks + +install-hooks: ## Install hooks + pre-commit install --install-hooks --hook-type pre-commit --hook-type commit-msg + +update-hooks: ## Update hooks + pre-commit autoupdate + +run-hooks: ## Run hooks + pre-commit run + +run-hooks-all: ## Run hooks on all files + pre-commit run --all-files + +lint: run-hooks-all ## Alias for run-hooks-all + + +## Testing +# ---------------------------------------------------------------------- +test: ## Run unit tests + pytest --verbose + + diff --git a/README.md b/README.md index b83221a..a8aa8ea 100644 --- a/README.md +++ b/README.md @@ -1,62 +1,101 @@ # `yhaplo` | Identifying Y-Chromosome Haplogroups -David Poznik -23andMe -October, 2016 +[![python]( +https://img.shields.io/badge/python-3.9%20%7C%203.10%20%7C%203.11-blue.svg)]( +https://docs.python.org) +[![style]( +https://img.shields.io/badge/style-black-blue.svg)]( +https://github.com/psf/black) +[![imports]( +https://img.shields.io/badge/imports-isort-blue.svg)]( +https://pycqa.github.io/isort) +[![docs]( +https://img.shields.io/badge/docs-pydocstyle-blue.svg)]( +https://github.com/PyCQA/pydocstyle) +[![format]( +https://img.shields.io/badge/format-flake8-blue.svg)]( +https://flake8.pycqa.org/en/latest/index.html) +[![mypy]( +https://img.shields.io/badge/annotations-mypy-blue.svg)]( +https://github.com/python/mypy) + +David Poznik, 23andMe --------------------------------------------------------------------------------- ## Overview -`yhaplo` identifies the Y-chromosome haplogroup of each male in a sample of one to -millions. It does not rely on any particular genotyping modality or platform, and it is -robust to missing data, genotype errors, mutation recurrence, and other complications. -Although full sequences yield the most granular haplogroup classifications, genotyping -arrays can yield reliable calls, provided a reasonable number of phylogenetically -informative variants has been assayed. - -Briefly, haplogroup calling involves two steps. 
The program first builds an internal -representation of the Y-chromosome phylogeny by reading its primary structure from -(Newick-formatted) text and importing phylogenetically informative SNPs from the -[ISOGG database](http://isogg.org/tree/ISOGG_YDNA_SNP_Index.html), affiliating each -SNP with the appropriate node and growing the tree as necessary. It then traverses the +`yhaplo` identifies the Y-chromosome haplogroup of each male in a sample of one to +millions. It does not rely on any particular genotyping modality or platform, and it is +robust to missing data, genotype errors, mutation recurrence, and other complications. +Although full sequences yield the most granular haplogroup classifications, genotyping +arrays can yield reliable calls, provided a reasonable number of phylogenetically +informative variants has been assayed. + +Briefly, haplogroup calling involves two steps. The program first builds an internal +representation of the Y-chromosome phylogeny by reading its primary structure from +(Newick-formatted) text and importing phylogenetically informative SNPs from the +[ISOGG database](http://isogg.org/tree/ISOGG_YDNA_SNP_Index.html), affiliating each +SNP with the appropriate node and growing the tree as necessary. It then traverses the tree for each individual, identifying the path of derived alleles leading to a haplogroup designation. -`yhaplo` is available for non-commercial use pursuant to the terms of the non-exclusive -license agreement, `LICENSE.txt`. To learn more about the algorithm, please see our -bioRxiv [pre-print](http://biorxiv.org/content/early/2016/11/19/088716): +`yhaplo` is available for non-commercial use pursuant to the terms of the non-exclusive +license agreement, `LICENSE.txt`. To learn more about the algorithm, please see our +bioRxiv [preprint](http://biorxiv.org/content/early/2016/11/19/088716): - Poznik GD. 2016. Identifying Y-chromosome haplogroups in arbitrarily large samples + Poznik GD. 2016. Identifying Y-chromosome haplogroups in arbitrarily large samples of sequenced or genotyped men. bioRxiv doi: 10.1101/088716 -To learn more about the software, please see the manual, `yhaplo.manual..pdf`. -For an overiew of command-line options, install the package and run: `yhaplo -h` +To learn more about the software, please see the manual, [`yhaplo_manual.pdf`]( +./yhaplo_manual.pdf). + +For an overview of command-line options, install the package and run `yhaplo --help`. + +For 23andMe-specific documentation, see [`README.23andMe.md`](./README.23andMe.md). --------------------------------------------------------------------------------- ## Installation -`yhaplo` is compatible with both Python 2 and Python 3. +### Basic installation To install: - -``` +```sh git clone git@github.com:23andMe/yhaplo.git cd yhaplo pip install --editable . ``` -To test-run on example data: +To update: +```sh +cd /path/to/yhaplo +git pull # Update code +pip install --editable . # Update version number +``` + +### Optional dependencies +To include optional dependencies for various features: +* `pip install --editable .[dev]` Includes development tools (e.g., `pytest`) +* `pip install --editable .[plot]` Enables tree plotting +* `pip install --editable .[ttam]` Enables running on 23andMe ablocks +* `pip install --editable .[vcf]` Enables running on VCF/BCF input + +To install multiple optional features, use a comma-separated list. 
For example: +```sh +pip install --editable .[dev,plot,vcf] ``` -yhaplo -ex + +### Test run + +To run on example data: +```sh +yhaplo --example_text ``` -The `-ex` option tells `yhaplo` to run on a subset of 1000 Genomes data -and sets the `--all_aux_output` flag to produce all auxiliary output. +The `--example_text` option tells `yhaplo` to run on a subset of 1000 Genomes data +in sample-major text format. It also sets the `--all_aux_output` flag +to produce all auxiliary output. --------------------------------------------------------------------------------- ## Caveats Please note the following caveats before running `yhaplo`: @@ -64,110 +103,123 @@ Please note the following caveats before running `yhaplo`: * `yhaplo` does not check for sex status; it assumes all individuals are male. * `yhaplo` expects SNP coordinates consistent with the hg19/GRCh37 reference assembly. * `yhaplo` expects data at a reasonable number of ISOGG SNPs. This assumption is violated by: - * variants-only sequence data - * very low-coverage sequencing - * genotyping arrays with few Y-chromosome probes - - -If, for a given individual, `yhaplo` observes no derived alleles at ISOGG SNPs on the upper -branches of the Y-chromosome phylogeny, it will call the individual haplogroup "A," -since all human Y-chromosome lineages are technically sublineages of A. -Before concluding that the individual sample belongs to paragroup A (which -includes haplogroups A00, A0, A1a, and A1b1), run with the `-as` option, and check the -auxiliary output for ancestral alleles at haplogroup-BT SNPs. If you do not see any, + * Variants-only sequence data + * Very low-coverage sequencing + * Genotyping arrays with few Y-chromosome probes + +If, for a given individual, `yhaplo` observes no derived alleles at ISOGG SNPs on the upper +branches of the Y-chromosome phylogeny, it will call the individual haplogroup "A," +since all human Y-chromosome lineages are technically sublineages of A. +Before concluding that the individual sample belongs to paragroup A (which +includes haplogroups A00, A0, A1a, and A1b1), run with the `--anc_snps` option, and check the +auxiliary output for ancestral alleles at haplogroup-BT SNPs. If you do not see any, your data set probably violates one or more of the assumptions listed above. -In particular, "variants-only" VCF files restrict to SNPs at which alternative alleles -were observed, but ref/alt status is unimportant to `yhaplo`. What is important is -ancestral/derived status. The reference sequence contains many derived alleles, -and `yhaplo` will not be happy if you discard these valuable data. So please emit all -confident sites when calling variants. To limit compute time and file size, you could -safely restrict to positions in `output/isogg.snps.unique.DATE.txt`, as these are the -only SNPs `yhaplo` considers. To generate this file, just run `yhaplo` with no arguments. +In particular, "variants-only" VCF files restrict to SNPs at which alternative alleles +were observed, but ref/alt status is unimportant to `yhaplo`. What is important is +ancestral/derived status. The reference sequence contains many derived alleles, +and `yhaplo` will not be happy if you discard these valuable data. So please emit all +confident sites when calling variants. To limit file size, you could safely restrict to +positions in `output/isogg.snps.unique.DATE.txt`, as these are the only SNPs `yhaplo` +considers. To generate this file, just run `yhaplo` with no arguments. 
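+ +For example (the input filename below is a placeholder for your own indexed VCF): +```sh +# Generate output/isogg.snps.unique.DATE.txt; no input data are required +yhaplo + +# Call haplogroups and write auxiliary ancestral-allele output for QC +yhaplo -i my_samples.vcf.gz --anc_snps +```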
--------------------------------------------------------------------------------- ## Input -### Phylogenetic data - -`input/` +The following input file types are supported: +* Indexed BCF: `.bcf`, `.bcf.csi` +* Indexed VCF: `.vcf.gz`, `.vcf.gz.tbi` +* Sample-major text: `.genos.txt` or `.genos.txt.gz` + * Row 0: Physical coordinates (GRCh37) + * Column 0: Individual identifiers + * Cell (*i*, *j*): Genotype for individual *i* at position *j*.
+ Values include {"A", "C", "G", "T", "."}, with "." indicating an unobserved value. -* `y.tree.primary.DATE.nwk` : primary structure of the Y-chromosome tree -* `isogg.DATE.txt` : phylogenetically informative SNPs scraped directly from the ISOGG website. -`yhaplo` resolves errors and formatting inconsistencies and emits cleaned versions -(`output/isogg.snps.cleaned.DATE.txt` and `output/isogg.snps.unique.DATE.txt`; -see `yhaplo.manual..pdf` for details). -* `isogg.correct.*.txt` : corrections to ISOGG data -* `isogg.omit.*.txt` : SNPs to drop due to inconsistencies observed in test data -* `isogg.multiallelic.txt` : physical coordinates of multiallelic sites to be excluded -* `representative.SNPs.*.txt` : SNPs deemed representative of corresponding haplogroups +In addition, the API supports running on a mapping of individual identifiers to 23andMe ablocks. -### Supported genotype formats - -* `.genos.txt` : sample-major genotypes - * row 1: physical coordinates - * column 1: individual IDs - * cell (i, j): genotype for individual i at position j, encoded as a single character -from the set { A, C, G, T, . }, with "." representing an unobserved value -* `.resid.txt` : file with 23andMe research IDs in the first column -* `.vcf`, `.vcf.gz` : snp-major VCF file. - It is most efficient to restrict input VCF files to the Y chromosome. -* `.vcf4` : snp-major pseudo-VCF. differences include: - * no "#" in header row - * fewer header columns - * GT values recorded as { A, C, G, T, . } rather than { 0, 1, . } - - --------------------------------------------------------------------------------- ## Output -All output file formats are described in detail in `yhaplo.manual..pdf`. +All output file formats are described in detail in [`yhaplo_manual.pdf`]( +./yhaplo_manual.pdf). The two primary output files are: -1. `log.projectName.txt` : log file containing details of the run -2. `haplogroups.projectName.txt` : haplogroup calls. The 4 columns are: +1. `log.project_name.txt` Log file containing details of the run +2. `haplogroups.project_name.txt` Haplogroup calls. The 4 columns are: 1. ID 2. Haplogroup short form, with the name of a SNP observed in the derived state 3. Haplogroup short form, with the name of a representative SNP 4. Haplogroup long form, using Y-Chromosome Consortium nomenclature -`yhaplo` also produces a number of SNP tables, tree files, and auxiliary output files. -Please see `yhaplo.manual..pdf` and `yhaplo -h` for details. +`yhaplo` also produces a number of SNP tables, tree files, and auxiliary output files.
+Please see [`yhaplo_manual.pdf`](./yhaplo_manual.pdf) and `yhaplo --help` for details. + + +## API + +See `yhaplo/api/call_haplogroups.py`. + + +## CLI + +The main command-line entry-point is `yhaplo`. +Additional commands include: +* `yhaplo_convert_to_genos` +* `yhaplo_plot_tree` + + +## Implementation details + +### Package data +#### `yhaplo/data/tree/` +`y.tree.primary.DATE.nwk` Primary structure of the Y-chromosome tree --------------------------------------------------------------------------------- -## Code +#### `yhaplo/data/variants/` +* `isogg.DATE.txt` Phylogenetically informative SNPs scraped directly from the ISOGG website.
+ `yhaplo` resolves errors and formatting inconsistencies and emits cleaned versions: + `isogg.snps.cleaned.DATE.txt`, `isogg.snps.unique.DATE.txt`.
+ See [`yhaplo_manual.pdf`](./yhaplo_manual.pdf) for details. +* `isogg.correct.*.txt` Corrections to ISOGG data +* `isogg.multiallelic.txt` Physical coordinates of multiallelic sites to be excluded +* `isogg.omit.*.txt` SNPs to drop due to inconsistencies observed in test data +* `isogg.split.txt` Not currently used +* `preferred.snp_names.txt` List of preferred SNP names +* `representative.SNPs.*.txt` SNPs deemed representative of corresponding haplogroups -### Driver script -`call_haplogroups.py` +### Classes -### Main classes +#### `tree.py` +`Tree` +* Parses a Newick file to build primary tree +* Parses ISOGG table to add SNPs to nodes and grow tree +* Finds the derived path leading from the root to an individual +* Knows root, depth, haplogroup-to-node mappings, etc. -* `Tree` : knows root, depth, haplogroup-to-node mappings, etc.; - parses a Newick file to build primary tree; - parses ISOGG table to add SNPs to nodes and grow tree; - finds the derived path leading from the root to an individual -* `Node` : element of the tree. knows parent, children, snps, etc. - represents the branch that leads to it -* `SNP` : knows position, ancestral and derived alleles, node, etc. -* `PlatformSNP` : knows position and ablock index -* `Sample` : knows genotypes and haplogroup of an individual -* `Customer` : (subclass of Sample) has 23andMe metadata and genotypes from ablocks -* `Path` : path through a tree; stores the next node to visit, a list of SNPs - observed in the derived state, the most derived SNP observed, - and the number of ancestral alleles encountered -* `Page` : 23andMe content page labels -* `Config` : container for parameters, command-line options, and filenames +#### `node.py` +`Node` +* Represents a phylogenetic branch +* Knows parent, children, SNPs, etc. -### Utilities +#### `snp.py` +* `SNP` Knows position, ancestral and derived alleles, node, etc. 
+* `PlatformSNP` Knows position and 23andMe ablock index +* `DroppedMarker` Represents a marker not used for classification -* `utils.py` : shared utility functions +#### `sample.py` +* `Sample` Knows genotypes and haplogroup of an individual + * `TextSample` Subclass for sample-major text input + * `VCFSample` Subclass for VCF/BCF input + * `AblockSample` Subclass for 23andMe ablock input -### Auxiliary scripts +#### `path.py` +`Path` Path through a tree; stores: +* The next node to visit +* A list of SNPs observed in the derived state +* The most derived SNP observed +* The number of ancestral alleles encountered -* `convert_to_genos.py` : converts data to `.genos.txt` format -* `plot_tree.py` : plots a newick tree +#### `config.py` +`Config` Container for parameters, command-line options, and filenames diff --git a/pyproject.toml b/pyproject.toml index 1e2b821..467856e 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,3 +1,95 @@ +[build-system] +requires = [ + "setuptools>=61.0", + "setuptools_scm[toml]>=6.2", +] +build-backend = "setuptools.build_meta" + + +[project] +name = "yhaplo" +authors = [ + {name = "Population Genetics R&D"}, + {email = "popgen.rd@23andme.com"}, +] +dependencies = [ + "numpy", + "pandas", + "pyyaml", +] +description = "Y-chromosome haplogroup caller" +dynamic = ["version"] +license = {file = "LICENSE.txt"} +readme = "README.md" +requires-python = ">=3.9" + +[project.optional-dependencies] +dev = [ + "ipykernel", + "ipython", + "pytest", + "setuptools_scm", +] +plot = [ + "biopython", +] +vcf = [ + "pysam", +] + +[project.scripts] +yhaplo = "yhaplo.cli.yhaplo:main" +yhaplo_convert_to_genos = "yhaplo.cli.convert_to_genos:main" +yhaplo_plot_tree = "yhaplo.cli.plot_tree:main" + +[project.urls] +homepage = "https://github.com/23andMe/yhaplo" + +[tool.setuptools_scm] +write_to = "yhaplo/_version.py" + [tool.isort] profile = "black" filter_files = true +known_first_party = ["logic"] # aws/logic/ includes logic for flow + +[tool.mypy] +ignore_missing_imports = true + +[tool.pydocstyle] +# D107 Missing docstring in __init__ +# D202 No blank lines allowed after function docstring +# Ignoring allows blank lines after one-line docstrings. +# D211 No blank lines allowed before class docstring +# Contradicts D203. +# D213 Multi-line docstring summary should start at the second line +# Contradicts D212. +# https://github.com/PyCQA/pydocstyle/issues/242#issuecomment-288166773 +ignore = "D107,D202,D211,D213" +match_dir = "(?!tests|\\.).*" + +[tool.pytest.ini_options] +filterwarnings = [ + # ttam.coregen: https://setuptools.pypa.io/en/latest/references/keywords.html#keyword-namespace-packages + "ignore:Deprecated call to `pkg_resources.declare_namespace", + "ignore:pkg_resources is deprecated as an API", +] +norecursedirs = [ + ".*", + "build", +] + +[tool.tox] +legacy_tox_ini = """ +[tox] +envlist = py39, py310, py311 +[testenv] +commands = + pytest +sitepackages = false +deps = + pysam + pytest +extras = + ttam +""" diff --git a/scripts/identify_mutation_updates.py b/scripts/identify_mutation_updates.py new file mode 100755 index 0000000..cd68657 --- /dev/null +++ b/scripts/identify_mutation_updates.py @@ -0,0 +1,275 @@ +#!/usr/bin/env python + +"""Identify mutation updates. + +Yhaplo relies on a specific freeze of the ISOGG tree and SNP dataset. +Occasionally, errors are found and corrected therein. This script assesses which +mutations used by Yhaplo are no longer concordant with the current ISOGG SNP dataset. 
+It then patches a Yhaplo input file whose purpose is to correct for these errors. + +For example, Y4010 (also known as FGC5628) on the R1b1a2a1a2c1f2a1a branch used to be +listed as "A->C", but it has since been corrected to "C->A". +""" + +import csv +import logging +import os + +import pandas as pd + +# Input +INPUT_DIR = "input" +CURRENT_ISOGG_FP = os.path.join(INPUT_DIR, "isogg.snp.index.2020.12.25.csv") +PREV_ISOGG_UNIQUE_FP = os.path.join(INPUT_DIR, "isogg.snps.unique.2016.01.04.txt") +MUTATION_CORRECTIONS_FP = os.path.join(INPUT_DIR, "isogg.correct.polarize.txt") + +# Output +OUTPUT_DIR = "output" +OVERLAP_FP = os.path.join(OUTPUT_DIR, "overlap.txt") +PATCH_SNPS_FP = os.path.join(OUTPUT_DIR, "isogg.correct.polarize.patch.txt") +UPDATED_CORRECTIONS_FP = os.path.join(OUTPUT_DIR, "isogg.correct.polarize.updated.txt") + +# Logging +LOG_FP = os.path.join(OUTPUT_DIR, "log.discordant.snps.txt") +logging.basicConfig(format="%(message)s", level=logging.INFO) +logger = logging.getLogger() + +# Constants +COL_NAMES = ["name", "haplogroup", "position", "mutation", "alt_names"] + + +# ---------------------------------------------------------------------- +class MutationUpdater: + + """Class for updating yhaplo's allele corrections file. + + Attributes + ---------- + current_isogg_fp + Filepath to current ISOGG SNP index. + Downloaded as CSV from: + https://docs.google.com/spreadsheets/d/1UY26FvLE3UmEmYFiXgOy0uezJi_wOut-V5TD0a_6-bE + prev_isogg_unique_fp + Filepath to previous table of unique ISOGG SNPs. + Generated by running `yhaplo` at the command line. + mutation_corrections_fp + Filepath of corrections yhaplo already knows about. + From yhaplo's input directory: + https://github.com/23andMe/yhaplo/blob/master/yhaplo/input/isogg.correct.polarize.txt + patch_snps_fp + Filepath to which discordant SNPs will be written. + updated_corrections_fp + Filepath to which updated corrections should be written. + overlap_fp + Filepath of overlap between extant file and putative patch. + current_name_to_mutation + Maps SNP name to mutation (e.g., "G->A"). + patch_df + DataFrame with updated mutations. + original_df + DataFrame with corrections yhaplo already knows about. + dup_df + DataFrame of overlap between patch_df and original_df. + merged_df + DataFrame with a reconciled and updated corrections table. + + """ + + def __init__( + self, + current_isogg_fp: str = CURRENT_ISOGG_FP, + prev_isogg_unique_fp: str = PREV_ISOGG_UNIQUE_FP, + mutation_corrections_fp: str = MUTATION_CORRECTIONS_FP, + patch_snps_fp: str = PATCH_SNPS_FP, + updated_corrections_fp: str = UPDATED_CORRECTIONS_FP, + overlap_fp: str = OVERLAP_FP, + ): + # Input + self.current_isogg_fp = current_isogg_fp + self.prev_isogg_unique_fp = prev_isogg_unique_fp + self.mutation_corrections_fp = mutation_corrections_fp + + # Output + self.patch_snps_fp = patch_snps_fp + self.updated_corrections_fp = updated_corrections_fp + self.overlap_fp = overlap_fp + + def run(self) -> None: + """Run.""" + + self.current_name_to_mutation = self._load_current_name_to_mutation() + self.patch_df = self._construct_patch_df() + self.original_df = self._load_previous_corrections() + self.dup_df, self.merged_df = self._merge_corrections() + + def _load_current_name_to_mutation(self) -> dict[str, str]: + """Read current ISOGG SNP index to define a mapping of SNP name to mutation. + + Returns + ------- + name_to_mutation : dict[str, str] + Maps SNP name to mutation. 
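+ For example, {"Y4010": "C->A"} would capture the corrected polarization described in the module docstring (illustrative).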
+ + """ + name_to_mutation = {} + num_read = 0 + with open(self.current_isogg_fp) as current_isogg_file: + current_isogg_reader = csv.reader(current_isogg_file) + for row_list in current_isogg_reader: + num_read += 1 + name, mutation = row_list[0], row_list[-1] + name_to_mutation[name] = mutation + + logger.info(f"{num_read:7d} current SNP records: {self.current_isogg_fp}") + + return name_to_mutation + + def _construct_patch_df(self) -> pd.DataFrame: + """Construct patch DataFrame. + + Read a previous table of unique ISOGG SNPs and report those whose mutations + are discordant with the current ISOGG SNP index. + + Returns + ------- + patch_df : pd.DataFrame + Table of mutation patches. + + """ + num_used = 0 + row_lists = [] + with open(self.prev_isogg_unique_fp) as prev_isogg_unique_file: + for line in prev_isogg_unique_file: + num_used += 1 + row_list = line.strip().split() + name, mutation = row_list[0], row_list[3] + current_mutation = self.current_name_to_mutation.get(name) + if current_mutation and current_mutation != mutation: + row_list[3] = current_mutation + row_lists.append(row_list) + + patch_df = pd.DataFrame(row_lists, columns=COL_NAMES).set_index("name") + write_snp_table(patch_df, self.patch_snps_fp) + logger.info( + f"{num_used:7d} used by yhaplo:" + f" {self.prev_isogg_unique_fp}\n\n" + f"{len(patch_df):7d} discordant mutations: {self.patch_snps_fp}" + ) + + return patch_df + + def _load_previous_corrections(self) -> pd.DataFrame: + """Load previous corrections file. + + Returns + ------- + original_df : pd.DataFrame + DataFrame of original corrections. + + """ + original_df = pd.read_csv( + self.mutation_corrections_fp, + sep=r"\s+", + header=None, + names=COL_NAMES, + ).set_index("name") + logger.info( + f"{len(original_df):7d} extant corrections: " + f"{self.mutation_corrections_fp}\n" + ) + + return original_df + + def _merge_corrections(self) -> tuple[pd.DataFrame, pd.DataFrame]: + """Load known mutation corrections and merge with new corrections. + + In doing so, take care not to inappropriately override. + + Returns + ------- + dup_df : pd.DataFrame + Duplicates. + merged_df : pd.DataFrame + Merged corrections. 
+ + """ + # Investigate overlap + suffixes = ("_orig", "_patch") + dup_df = self.original_df.merge( + self.patch_df, + left_index=True, + right_index=True, + suffixes=suffixes, + ) + for suffix in suffixes: + alleles = dup_df["mutation" + suffix].str.split("->", expand=True) + dup_df["anc" + suffix] = alleles[0] + dup_df["der" + suffix] = alleles[1] + + with open(self.overlap_fp, "w") as overlap_file: + overlap_file.write(dup_df.to_string() + "\n") + + logger.info(f"{len(dup_df):7d} overlap: {self.overlap_fp}") + + # Handle overlap + ancestral_consistent = dup_df["anc_orig"] == dup_df["anc_patch"] + override_indexes = dup_df.loc[ancestral_consistent].index + do_not_patch_indexes = dup_df.loc[~ancestral_consistent].index + original_df = self.original_df.drop(override_indexes) + patch_df = self.patch_df.drop(do_not_patch_indexes) + logger.info( + f"{len(override_indexes):7d} excluded from original " + "(ancestral alleles consistent): " + f"{', '.join(override_indexes)}" + ) + logger.info( + f"{len(do_not_patch_indexes):7d} excluded from patch: " + f"{', '.join(do_not_patch_indexes)}\n" + ) + + # Merge + merged_df = pd.concat([original_df, patch_df]).sort_values( + ["haplogroup", "position"] + ) + write_snp_table(merged_df, self.updated_corrections_fp) + logger.info( + f"{len(merged_df):7d} merged: " + f"{self.updated_corrections_fp}\n" + ) + + return dup_df, merged_df + + +def make_parent_dir(file_path: str) -> None: + """Make parent directory.""" + + dirname = os.path.dirname(file_path) + if dirname: + os.makedirs(dirname, exist_ok=True) + + +def write_snp_table(df: pd.DataFrame, out_fp: str) -> None: + """Write a SNP table to file.""" + + with open(out_fp, "w") as out_file: + for name, row in df.iterrows(): + out_file.write( + f"{name:15} {row.haplogroup:25} {row.position:>8} " + f"{row.mutation} {row.alt_names}\n" + ) + + +def main() -> None: + """Run script.""" + + make_parent_dir(LOG_FP) + logger.addHandler(logging.FileHandler(LOG_FP, "w")) + + mutation_updater = MutationUpdater() + mutation_updater.run() + + logger.info(f"Log: {LOG_FP}\n") + + +if __name__ == "__main__": + main() diff --git a/scripts/validate_yhaplo.sh b/scripts/validate_yhaplo.sh new file mode 100755 index 0000000..b5d08ac --- /dev/null +++ b/scripts/validate_yhaplo.sh @@ -0,0 +1,116 @@ +#!/usr/bin/env bash +# +# This script will: +# 1. Run yhaplo with various input types and output options. +# 2. Run other project scripts. +# 3. Compare output to expected. 
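+# +# Usage (paths assume invocation from the repository root): +#   scripts/validate_yhaplo.sh +#   scripts/validate_yhaplo.sh -m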
+#
+# Options:
+#   -m  Include multi-sample VCF validation
+#
+#----------------------------------------------------------------------
+set -o nounset -o pipefail
+
+# Command-line arguments and options
+if [ $# == 1 ] && [ "$1" == "-m" ]; then INCLUDE_BIG_VCF=1; fi
+
+# Input
+ttam_data_fp=data/example.23andMe.txt
+multi_sample_bcf_fp=data/1000Y.all.bcf  # See: tests/fixtures/generate_bcf_fixtures.sh
+expected_output_dir=output.expected  # See: README.23andMe.md
+
+# Output
+output_dir=output
+nwk_fp=${output_dir}/y.tree.primary.aligned.ycc.2016.01.04.nwk
+tree_drawing_fp=${nwk_fp%.nwk}.drawing.txt
+
+# Colors
+BOLD_CYAN="\033[1;36m"
+GREEN="\033[0;32m"
+NO_COLOR="\033[0m"
+
+
+rm -fr ${output_dir}
+echo -e "\n${BOLD_CYAN}Removed${NO_COLOR}: ${GREEN}${output_dir}\n\n${NO_COLOR}"
+
+
+echo -e "${BOLD_CYAN}Text Input${NO_COLOR}\n"
+yhaplo --example_text \
+    --breadth_first \
+    --depth_first \
+    --depth_first_table \
+    --hg_genos Q
+echo -e "\n"
+
+
+echo -e "${BOLD_CYAN}Single-Sample VCF Input\n${NO_COLOR}"
+yhaplo --example_vcf --hg_genos Q
+echo -e "\n"
+
+
+echo -e "${BOLD_CYAN}Multi-Sample VCF Input\n${NO_COLOR}"
+if [ ${INCLUDE_BIG_VCF:-""} ]; then
+    if [ -e ${multi_sample_bcf_fp} ]; then
+        yhaplo -i ${multi_sample_bcf_fp} --hg_genos Q
+    else
+        echo "File not found: ${multi_sample_bcf_fp}"
+        echo "See: tests/fixtures/generate_bcf_fixtures.sh"
+    fi
+else
+    echo "Skipping. To test multi-sample VCF, use -m option."
+fi
+echo -e "\n"
+
+
+echo -e "${BOLD_CYAN}ISOGG Parser\n${NO_COLOR}"
+echo Skipping.
+echo -e "\n"
+
+
+echo -e "${BOLD_CYAN}Tree Plotter\n${NO_COLOR}"
+yhaplo_plot_tree -n ${nwk_fp} | tee ${tree_drawing_fp}
+echo -e "\n"
+
+
+echo -e "${BOLD_CYAN}Format Converter\n${NO_COLOR}"
+if [ -e ${ttam_data_fp} ]; then
+    yhaplo_convert_to_genos ${ttam_data_fp}
+    mkdir -p ${output_dir}
+    mv converted/* ${output_dir}/
+    rmdir converted
+else
+    echo "Skipping. File not found: ${ttam_data_fp}"
+fi
+echo -e "\n"
+
+
+echo -e "${BOLD_CYAN}Expected versus Observed\n${NO_COLOR}"
+
+if [ -d ${expected_output_dir} ]; then
+    for fn in $(comm -13 <(ls ${output_dir}/) <(ls ${expected_output_dir}/)); do
+        echo "* Not found: ${fn}"
+    done
+    echo
+
+    for fn in $(ls ${expected_output_dir}); do
+        if [ -e ${output_dir}/${fn} ]; then
+            echo "Checking: ${fn}"
+            diff \
+                <(cat ${expected_output_dir}/${fn} \
+                    | grep -v " | Y-chromosome haplogroup caller") \
+                <(cat ${output_dir}/${fn} \
+                    | grep -v " | Y-chromosome haplogroup caller" \
+                    | sed 's| yhaplo\.| ttam.yhaplo.|g' \
+                    | sed 's|/yhaplo/|/ttam.yhaplo/|g')
+        fi
+    done
+    echo
+
+    for fn in $(comm -23 <(ls ${output_dir}/) <(ls ${expected_output_dir}/)); do
+        echo "* Unexpected: ${fn}"
+    done
+    echo
+else
+    echo "Directory not found: ${expected_output_dir}"
+    echo "See: README.23andMe.md"
+fi
diff --git a/setup.cfg b/setup.cfg
new file mode 100644
index 0000000..7272f1e
--- /dev/null
+++ b/setup.cfg
@@ -0,0 +1,6 @@
+[flake8]
+# E203 Whitespace before ':'
+#      Conflicts with NumPy-style docstring type annotations
+# E501 Line too long
+#      Enforce line length with Black rather than Flake8.
+extend-ignore = E203,E501
diff --git a/setup.py b/setup.py
index 7855238..10b9c04 100644
--- a/setup.py
+++ b/setup.py
@@ -1,62 +1,12 @@
-"""
-yhaplo identifies Y-chromosome haplogroups
+"""Specify packages and package data."""
-Sequence data will yield the most highly resolved classifications,
-but the algorithm also works well with chip-based genotype data,
-provided a reasonable number of phylogenetically informative sites
-have been assayed.
-""" +from setuptools import find_packages, setup -import subprocess - -import setuptools - -DOC_LIST = __doc__.split("\n") - - -def get_version_txt(): - "reads version from text file" - - from yhaplo import __version__ - - return __version__ - - -def get_version_git(): - "extracts version from git tag" - - checksum = ( - subprocess.check_output("git rev-list --tags --max-count=1".split()) - .strip() - .decode("utf-8") - ) - version = ( - subprocess.check_output("git describe --tags {}".format(checksum).split()) - .strip() - .decode("utf-8") - ) - - return version - - -setuptools.setup( - name="yhaplo", - version=get_version_txt(), - author="David Poznik", - description=DOC_LIST[1], - long_description="\n".join(DOC_LIST[3:]), - license="https://github.com/23andMe/yhaplo/blob/master/LICENSE.txt", - url="https://github.com/23andMe/yhaplo", - packages=setuptools.find_packages(), - include_package_data=True, - zip_safe=True, - install_requires=["six>=1.12"], - setup_requires=[], - entry_points={ - "console_scripts": [ - "yhaplo=yhaplo.call_haplogroups:call_haplogroups", - "convert_to_genos=yhaplo.convert_to_genos:main", - "plot_tree=yhaplo.plot_tree:main", - ], +setup( + package_data={ + "yhaplo.data.tree": ["*"], + "yhaplo.data.variants": ["*"], }, + packages=find_packages(exclude=["tests*"]), + url="https://github.com/23andMe/yhaplo", ) diff --git a/test_github_yhaplo.sh b/test_github_yhaplo.sh deleted file mode 100755 index b0804bc..0000000 --- a/test_github_yhaplo.sh +++ /dev/null @@ -1,96 +0,0 @@ -#!/bin/bash -# -# David Poznik -# 2020.08.22 -# test_github_yhaplo.sh -# -# Runs yhaplo with various configuations: -# {py2, py3} x {txt, single-sample VCF, multi-sample VCF} -# and runs other scripts from yhaplo package. -# -# Compares: -# - expected output to py2 output -# - py2 output to py3 output -# -# To generate expected output, run before making changes. -# -# Usage: -# test_github_yhaplo.sh -# test_github_yhaplo.sh m # include multi-sample VCF -#---------------------------------------------------------------------- -set -o nounset -o pipefail - -# input -ttam_data_fn=data/example.23andMe.txt -multi_sample_vcf_fn=data/ALL.chrY_10Mbp_mask.glia_freebayes_maxLikGT_siteQC.20130502.60555_biallelic_snps.vcf.gz -multi_sample_vcf_url=ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/release/20130502/supporting/chrY/ALL.chrY_10Mbp_mask.glia_freebayes_maxLikGT_siteQC.20130502.60555_biallelic_snps.vcf.gz -expected_output_dir=output.expected - -# output -py2_output_dir=output.yhaplo.py2 -py3_output_dir=output.yhaplo.py3 -nwk_fn=output/y.tree.primary.aligned.ycc.2016.01.04.nwk -tree_drawing_fn=${nwk_fn%.nwk}.drawing.txt - -py_version_start=$(pyenv global) - -rm -fr output -for py_version in 2.7.18 3.8.6; do - echo -e "Python ${py_version}\n\n" - pyenv global ${py_version} - - echo -e "Text Input\n" - yhaplo --example_1000Y_subset --hgGenos Q - - echo -e "\n\nSingle-Sample VCF Input\n" - yhaplo --example_single_sample_vcf --hgGenos Q - - echo -e "\n\nMulti-Sample VCF Input\n" - if [ $# == 1 ] && [ "$1" == "m" ]; then - if [ -e ${multi_sample_vcf_fn} ]; then - yhaplo -i ${multi_sample_vcf_fn} --hgGenos Q - else - echo "File not found: ${multi_sample_vcf_fn}" - echo -e "Download from:\n${multi_sample_vcf_url}" - fi - else - echo "Skipping. 
To test multi-sample VCF supply argument: m" - fi - - echo -e "\n\nFormat Converter\n" - if [ -e ${ttam_data_fn} ]; then - convert_to_genos ${ttam_data_fn} - mv converted/* output/ - rmdir converted - else - echo "File not found: ${ttam_data_fn}" - fi - - echo -e "\n\nTree Plotter\n" - plot_tree -n ${nwk_fn} | tee ${tree_drawing_fn} - - echo - py_version_major=$(echo $py_version | cut -f1 -d".") - out_dir=output.yhaplo.py${py_version_major} - rm -fr ${out_dir} - mv output ${out_dir} -done; -pyenv global ${py_version_start} - - -echo -e "\n\n\n----------------------------------------------------------------------" -echo -e "Expected versus py2\n" -if [ -d ${expected_output_dir} ]; then - ls ${expected_output_dir} \ - | xargs -I {} echo diff ${expected_output_dir}/{} ${py2_output_dir}/{} \ - | sh -v -else - echo "Directory not found: ${expected_output_dir}" -fi - - -echo -e "\n\n\n----------------------------------------------------------------------" -echo -e "Py2 versus Py3\n" -ls ${py2_output_dir} \ -| xargs -I {} echo diff ${py2_output_dir}/{} ${py3_output_dir}/{} \ -| sh -v diff --git a/tests/__init__.py b/tests/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/tests/common.py b/tests/common.py new file mode 100644 index 0000000..f82586a --- /dev/null +++ b/tests/common.py @@ -0,0 +1,51 @@ +import os + +import pandas as pd + +FIXTURES_DIR = os.path.join(os.path.dirname(os.path.realpath(__file__)), "fixtures") +FIXTURES_INPUT_DIR = os.path.join(FIXTURES_DIR, "input") +FIXTURES_OUTPUT_DIR = os.path.join(FIXTURES_DIR, "output") + +GENOTYPES_1000Y_ALL_BCF_FP = os.path.join( + FIXTURES_INPUT_DIR, + "1000Y.all.bcf", +) +GENOTYPES_1000Y_SUBSET_BCF_FP = os.path.join( + FIXTURES_INPUT_DIR, + "1000Y.subset.bcf", +) +GENOTYPES_1000Y_SUBSET_TEXT_FP = os.path.join( + FIXTURES_INPUT_DIR, + "1000Y.subset.genos.txt", +) +GENOTYPES_1000Y_ONE_VCF_FP = os.path.join( + FIXTURES_INPUT_DIR, + "HG01938.vcf.gz", +) +GENOTYPES_1000Y_ONE_ABLOCK_FP = os.path.join( + FIXTURES_INPUT_DIR, + "HG01938.ablock", +) + +HAPLOGROUPS_1000Y_ALL_FP = os.path.join( + FIXTURES_OUTPUT_DIR, + "haplogroups.1000Y.all.txt", +) +HAPLOGROUPS_1000Y_SUBSET_FP = os.path.join( + FIXTURES_OUTPUT_DIR, + "haplogroups.1000Y.subset.txt", +) +HAPLOGROUPS_1000Y_ONE_FP = os.path.join( + FIXTURES_OUTPUT_DIR, + "haplogroups.HG01938.txt", +) + + +def load_haplogroup_df(filepath): + df = pd.read_csv( + filepath, + delim_whitespace=True, + names=["iid", "hg_snp_obs", "hg_snp", "ycc_haplogroup"], + ).set_index("iid") + + return df diff --git a/tests/fixtures/generate_bcf_fixtures.sh b/tests/fixtures/generate_bcf_fixtures.sh new file mode 100755 index 0000000..4ab2d0e --- /dev/null +++ b/tests/fixtures/generate_bcf_fixtures.sh @@ -0,0 +1,28 @@ +#!/usr/bin/env bash + +# Generate two BCF fixtures ("subset", "all") based on the 1000Y VCF. +# The input file is available for download here: +# https://ftp.1000genomes.ebi.ac.uk/vol1/ftp/release/20130502/supporting/chrY/ + +source utils.sh + +data_dir=input + +# Input +all_vcf_fp=${data_dir}/ALL.chrY_10Mbp_mask.glia_freebayes_maxLikGT_siteQC.20130502.60555_biallelic_snps.vcf.gz +subset_sample_fp=${data_dir}/1000Y.subset.genos.txt + +# Output +subset_bcf_fp=${data_dir}/1000Y.subset.bcf +all_bcf_fp=${data_dir}/1000Y.all.bcf + +echo -e "\nGenerating ${subset_bcf_fp}..." 
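+# Extract a comma-separated sample list from column 1 of the subset
+# genotypes file, skipping the first (header) row.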
+samples=$(awk '(NR == 2){ printf $1 }(NR > 2){ printf ","$1 }' ${subset_sample_fp}) +echo_run "bcftools convert --samples ${samples} --output-type b --output ${subset_bcf_fp} ${all_vcf_fp}" +echo_run "bcftools index ${subset_bcf_fp}" + +echo -e "\nGenerating ${all_bcf_fp}..." +echo_run "bcftools convert --output-type b --output ${all_bcf_fp} ${all_vcf_fp}" +echo_run "bcftools index ${all_bcf_fp}" + +echo -e "\nDone!\n" diff --git a/yhaplo/data/1000Y.subset.genos.txt b/tests/fixtures/input/1000Y.subset.genos.txt similarity index 100% rename from yhaplo/data/1000Y.subset.genos.txt rename to tests/fixtures/input/1000Y.subset.genos.txt diff --git a/tests/fixtures/input/HG01938.vcf.gz b/tests/fixtures/input/HG01938.vcf.gz new file mode 100644 index 0000000..1296ba2 Binary files /dev/null and b/tests/fixtures/input/HG01938.vcf.gz differ diff --git a/tests/fixtures/input/HG01938.vcf.gz.tbi b/tests/fixtures/input/HG01938.vcf.gz.tbi new file mode 100644 index 0000000..3b78ff9 Binary files /dev/null and b/tests/fixtures/input/HG01938.vcf.gz.tbi differ diff --git a/tests/fixtures/output/haplogroups.1000Y.all.txt b/tests/fixtures/output/haplogroups.1000Y.all.txt new file mode 100644 index 0000000..1594a49 --- /dev/null +++ b/tests/fixtures/output/haplogroups.1000Y.all.txt @@ -0,0 +1,1244 @@ +HG02982 A0-V153 A0-V153 A0a1 +HG01890 A0-L1038 A0-L92.2 A0b +HG02613 A1a-M31 A1a-M31 A1a +HG02645 A1a-M31 A1a-M31 A1a +HG02666 A1a-M31 A1a-M31 A1a +HG00553 B-M109 B-M109 B2a1a +NA19043 B-M109 B-M109 B2a1a +NA19454 B-M109 B-M109 B2a1a +NA19384 B-M112 B-M112 B2b +HG02588 B-L1389 B-L1387 B3 +HG03225 B-L1389 B-L1387 B3 +HG03376 B-L1390 B-L1387 B3 +HG03096 E-CTS10935 E-CTS10935 E1a2a1 +NA19189 E-CTS1792 E-CTS1792 E1a2a1a1a +NA19239 E-CTS1792 E-CTS1792 E1a2a1a1a +HG01094 E-Z5993 E-Z5993 E1a2a1a2 +HG03048 E-Z5993 E-Z5993 E1a2a1a2 +NA18856 E-CTS736 E-CTS736 E1a2a1b1 +NA18910 E-Z12510.2 E-CTS736 E1a2a1b1 +HG01882 E-Z5992 E-CTS3764.1 E1a2a1b1a +HG02449 E-Z5992 E-CTS3764.1 E1a2a1b1a +HG03024 E-Z5991 E-Z5991 E1a2a2 +HG03027 E-Z5991 E-Z5991 E1a2a2 +HG02810 E-Z5985.1 E-Z5985.1 E1a2b +HG02819 E-L94 E-L94 E1a2b1 +HG02461 E-Z5988 E-Z5988 E1a2b1a1 +HG03439 E-Z5988 E-Z5988 E1a2b1a1 +HG02623 E-Z14898 E-Z14890 E1a2b1a1a1 +HG03246 E-Z14898 E-Z14890 E1a2b1a1a1 +HG03258 E-Z14898 E-Z14890 E1a2b1a1a1 +HG02570 E-M9119 E-M9119 E1a2b1a2 +HG02634 E-M9119 E-M9119 E1a2b1a2 +HG02756 E-M180 E-M180 E1b1a1a1 +HG02804 E-M180 E-M180 E1b1a1a1 +NA19900 E-M180 E-M180 E1b1a1a1 +HG01187 E-M58 E-M58 E1b1a1a1a +HG02107 E-M58 E-M58 E1b1a1a1a +HG02496 E-M58 E-M58 E1b1a1a1a +NA19429 E-M58 E-M58 E1b1a1a1a +NA20126 E-M58 E-M58 E1b1a1a1a +HG02759 E-L485 E-L485 E1b1a1a1c +HG03469 E-L485 E-L485 E1b1a1a1c +HG01305 E-M191 E-M191 E1b1a1a1c1a +HG02013 E-M191 E-M191 E1b1a1a1c1a +HG02255 E-M191 E-M191 E1b1a1a1c1a +HG03193 E-M191 E-M191 E1b1a1a1c1a +HG03301 E-M191 E-M191 E1b1a1a1c1a +HG03565 E-Z1712.1 E-M191 E1b1a1a1c1a +NA18868 E-M191 E-M191 E1b1a1a1c1a +NA19121 E-M191 E-M191 E1b1a1a1c1a +NA19350 E-M191 E-M191 E1b1a1a1c1a +HG01556 E-P252 E-P252 E1b1a1a1c1a1 +HG01986 E-P252 E-P252 E1b1a1a1c1a1 +HG01990 E-P252 E-P252 E1b1a1a1c1a1 +HG02283 E-P252 E-P252 E1b1a1a1c1a1 +HG02284 E-P252 E-P252 E1b1a1a1c1a1 +HG02307 E-P252 E-P252 E1b1a1a1c1a1 +HG02323 E-P252 E-P252 E1b1a1a1c1a1 +HG02442 E-P252 E-P252 E1b1a1a1c1a1 +HG02481 E-P252 E-P252 E1b1a1a1c1a1 +HG02484 E-P252 E-P252 E1b1a1a1c1a1 +HG02489 E-P252 E-P252 E1b1a1a1c1a1 +HG02971 E-P252 E-P252 E1b1a1a1c1a1 +HG02981 E-P252 E-P252 E1b1a1a1c1a1 +HG03060 E-P252 E-P252 E1b1a1a1c1a1 +HG03074 E-P252 E-P252 E1b1a1a1c1a1 +HG03103 E-P252 
E-P252 E1b1a1a1c1a1 +HG03127 E-P252 E-P252 E1b1a1a1c1a1 +HG03172 E-P252 E-P252 E1b1a1a1c1a1 +HG03190 E-P252 E-P252 E1b1a1a1c1a1 +HG03199 E-P252 E-P252 E1b1a1a1c1a1 +HG03224 E-P252 E-P252 E1b1a1a1c1a1 +HG03268 E-P252 E-P252 E1b1a1a1c1a1 +HG03271 E-P252 E-P252 E1b1a1a1c1a1 +HG03280 E-P252 E-P252 E1b1a1a1c1a1 +HG03295 E-P252 E-P252 E1b1a1a1c1a1 +HG03352 E-P252 E-P252 E1b1a1a1c1a1 +HG03367 E-P252 E-P252 E1b1a1a1c1a1 +HG03472 E-P252 E-P252 E1b1a1a1c1a1 +HG03515 E-P252 E-P252 E1b1a1a1c1a1 +HG03518 E-P252 E-P252 E1b1a1a1c1a1 +NA18507 E-P252 E-P252 E1b1a1a1c1a1 +NA18510 E-P252 E-P252 E1b1a1a1c1a1 +NA18516 E-P252 E-P252 E1b1a1a1c1a1 +NA18519 E-P252 E-P252 E1b1a1a1c1a1 +NA18874 E-P252 E-P252 E1b1a1a1c1a1 +NA18877 E-P252 E-P252 E1b1a1a1c1a1 +NA18908 E-P252 E-P252 E1b1a1a1c1a1 +NA18915 E-P252 E-P252 E1b1a1a1c1a1 +NA19107 E-P252 E-P252 E1b1a1a1c1a1 +NA19113 E-P252 E-P252 E1b1a1a1c1a1 +NA19198 E-P252 E-P252 E1b1a1a1c1a1 +NA19200 E-P252 E-P252 E1b1a1a1c1a1 +NA19207 E-P252 E-P252 E1b1a1a1c1a1 +NA19210 E-P252 E-P252 E1b1a1a1c1a1 +NA19248 E-P252 E-P252 E1b1a1a1c1a1 +NA19455 E-P252 E-P252 E1b1a1a1c1a1 +NA19834 E-P252 E-P252 E1b1a1a1c1a1 +NA19920 E-P252 E-P252 E1b1a1a1c1a1 +NA20291 E-P252 E-P252 E1b1a1a1c1a1 +NA20298 E-P252 E-P252 E1b1a1a1c1a1 +HG02947 E-CTS8030 E-CTS8030 E1b1a1a1c1a1c +HG02977 E-CTS8030 E-CTS8030 E1b1a1a1c1a1c +HG03109 E-CTS8030 E-CTS8030 E1b1a1a1c1a1c +HG03139 E-CTS8030 E-CTS8030 E1b1a1a1c1a1c +HG03160 E-CTS8030 E-CTS8030 E1b1a1a1c1a1c +HG03175 E-CTS8030 E-CTS8030 E1b1a1a1c1a1c +NA18917 E-CTS8030 E-CTS8030 E1b1a1a1c1a1c +NA19026 E-CTS8030 E-CTS8030 E1b1a1a1c1a1c +NA19027 E-CTS8030 E-CTS8030 E1b1a1a1c1a1c +NA19028 E-CTS8030 E-CTS8030 E1b1a1a1c1a1c +NA19092 E-CTS8030 E-CTS8030 E1b1a1a1c1a1c +NA19117 E-CTS8030 E-CTS8030 E1b1a1a1c1a1c +NA19171 E-CTS8030 E-CTS8030 E1b1a1a1c1a1c +NA19213 E-CTS8030 E-CTS8030 E1b1a1a1c1a1c +NA19223 E-CTS8030 E-CTS8030 E1b1a1a1c1a1c +NA19307 E-CTS8030 E-CTS8030 E1b1a1a1c1a1c +NA19308 E-CTS8030 E-CTS8030 E1b1a1a1c1a1c +NA19312 E-CTS8030 E-CTS8030 E1b1a1a1c1a1c +NA19317 E-CTS8030 E-CTS8030 E1b1a1a1c1a1c +NA19318 E-CTS8030 E-CTS8030 E1b1a1a1c1a1c +NA19319 E-CTS8030 E-CTS8030 E1b1a1a1c1a1c +NA19347 E-CTS8030 E-CTS8030 E1b1a1a1c1a1c +NA19360 E-CTS8030 E-CTS8030 E1b1a1a1c1a1c +NA19372 E-CTS8030 E-CTS8030 E1b1a1a1c1a1c +NA19376 E-CTS8030 E-CTS8030 E1b1a1a1c1a1c +NA19383 E-CTS8030 E-CTS8030 E1b1a1a1c1a1c +NA19385 E-CTS8030 E-CTS8030 E1b1a1a1c1a1c +NA19393 E-CTS8030 E-CTS8030 E1b1a1a1c1a1c +NA19394 E-CTS8030 E-CTS8030 E1b1a1a1c1a1c +NA19428 E-CTS8030 E-CTS8030 E1b1a1a1c1a1c +NA19430 E-CTS8030 E-CTS8030 E1b1a1a1c1a1c +NA19443 E-CTS8030 E-CTS8030 E1b1a1a1c1a1c +NA19451 E-CTS8030 E-CTS8030 E1b1a1a1c1a1c +NA19452 E-CTS8030 E-CTS8030 E1b1a1a1c1a1c +NA19461 E-CTS8030 E-CTS8030 E1b1a1a1c1a1c +NA19466 E-CTS8030 E-CTS8030 E1b1a1a1c1a1c +NA19908 E-CTS8030 E-CTS8030 E1b1a1a1c1a1c +NA20281 E-CTS8030 E-CTS8030 E1b1a1a1c1a1c +NA19041 E-M4451 E-M4451 E1b1a1a1c1a1c1 +NA19031 E-P116 E-P116 E1b1a1a1c1a1c1a +HG02944 E-CTS1313 E-CTS1313 E1b1a1a1c1a1d +HG03196 E-CTS1313 E-CTS1313 E1b1a1a1c1a1d +HG03370 E-CTS1313 E-CTS1313 E1b1a1a1c1a1d +NA18504 E-CTS1313 E-CTS1313 E1b1a1a1c1a1d +NA19146 E-CTS1313 E-CTS1313 E1b1a1a1c1a1d +NA19904 E-CTS1313 E-CTS1313 E1b1a1a1c1a1d +HG01302 E-M263.2 E-M263.2 E1b1a1a1c1b +HG02009 E-M263.2 E-M263.2 E1b1a1a1c1b +HG02334 E-M263.2 E-M263.2 E1b1a1a1c1b +HG02429 E-M263.2 E-M263.2 E1b1a1a1c1b +HG02854 E-M263.2 E-M263.2 E1b1a1a1c1b +HG03077 E-M263.2 E-M263.2 E1b1a1a1c1b +HG03382 E-M263.2 E-M263.2 E1b1a1a1c1b +HG03388 E-M263.2 E-M263.2 E1b1a1a1c1b +NA19098 E-M263.2 E-M263.2 
E1b1a1a1c1b +NA20362 E-M263.2 E-M263.2 E1b1a1a1c1b +HG03556 E-CTS3393 E-CTS3393 E1b1a1a1c1b1 +NA19916 E-CTS3393 E-CTS3393 E1b1a1a1c1b1 +HG02890 E-Z6003 E-Z6003 E1b1a1a1c2a +HG03445 E-Z6003 E-Z6003 E1b1a1a1c2a +HG02851 E-Z6004 E-Z6004 E1b1a1a1c2a1 +HG03433 E-Z6004 E-Z6004 E1b1a1a1c2a1 +HG03538 E-Z6006 E-Z6006 E1b1a1a1c2b1 +HG03547 E-Z6006 E-Z6006 E1b1a1a1c2b1 +HG02675 E-Z6007 E-Z6007 E1b1a1a1c2b2 +HG02836 E-Z16052 E-Z16052 E1b1a1a1c2b2a +HG03084 E-Z16052 E-Z16052 E1b1a1a1c2b2a +HG02464 E-CTS3274 E-CTS3274 E1b1a1a1c2c +HG02678 E-CTS3274 E-CTS3274 E1b1a1a1c2c +HG02860 E-CTS3274 E-CTS3274 E1b1a1a1c2c +HG03078 E-CTS3274 E-CTS3274 E1b1a1a1c2c +HG02878 E-CTS3725 E-CTS3725 E1b1a1a1c2c1a +NA19726 E-CTS3725 E-CTS3725 E1b1a1a1c2c1a +HG02585 E-Z6009 E-Z6009 E1b1a1a1c2c1b +HG02813 E-Z6009 E-Z6009 E1b1a1a1c2c1b +HG02620 E-Z6010 E-Z6010 E1b1a1a1c2c2 +HG02582 E-F2481 E-F2481 E1b1a1a1c2c2a +HG02887 E-F2481 E-F2481 E1b1a1a1c2c2a +HG02768 E-Z6014 E-Z6014 E1b1a1a1c2c3a1 +HG03559 E-Z6014 E-Z6014 E1b1a1a1c2c3a1 +HG02702 E-Z6015 E-Z6015 E1b1a1a1c2c3a2 +HG03039 E-Z6015 E-Z6015 E1b1a1a1c2c3a2 +HG02816 E-Z6016 E-Z6016 E1b1a1a1c2c3a2a +HG03432 E-Z6016 E-Z6016 E1b1a1a1c2c3a2a +HG02642 E-Z6017 E-Z6017 E1b1a1a1c2c3b +HG02715 E-Z6018 E-Z6018 E1b1a1a1c2c3b1 +HG03045 E-Z6018 E-Z6018 E1b1a1a1c2c3b1 +HG02561 E-Z6019 E-Z6019 E1b1a1a1c2c3c +HG02573 E-Z6019 E-Z6019 E1b1a1a1c2c3c +HG02610 E-U175 E-U175 E1b1a1a1d +HG02721 E-U175 E-U175 E1b1a1a1d +HG02884 E-U175 E-U175 E1b1a1a1d +HG02895 E-U175 E-U175 E1b1a1a1d +HG03240 E-U175 E-U175 E1b1a1a1d +HG03394 E-U175 E-U175 E1b1a1a1d +NA20346 E-U175 E-U175 E1b1a1a1d +HG01204 E-M4254 E-M4254 E1b1a1a1d1 +HG01485 E-M4254 E-M4254 E1b1a1a1d1 +HG01885 E-M4254 E-M4254 E1b1a1a1d1 +HG02143 E-M4254 E-M4254 E1b1a1a1d1 +HG02314 E-M4254 E-M4254 E1b1a1a1d1 +HG02439 E-P277 E-M4254 E1b1a1a1d1 +HG02455 E-M4254 E-M4254 E1b1a1a1d1 +HG02541 E-P277 E-M4254 E1b1a1a1d1 +HG02554 E-M4254 E-M4254 E1b1a1a1d1 +HG02594 E-M4254 E-M4254 E1b1a1a1d1 +HG02839 E-M4254 E-M4254 E1b1a1a1d1 +HG02881 E-M4254 E-M4254 E1b1a1a1d1 +HG02923 E-M4254 E-M4254 E1b1a1a1d1 +HG02938 E-M4254 E-M4254 E1b1a1a1d1 +HG02941 E-M4254 E-M4254 E1b1a1a1d1 +HG02953 E-M4254 E-M4254 E1b1a1a1d1 +HG02968 E-M4254 E-M4254 E1b1a1a1d1 +HG02973 E-M4254 E-M4254 E1b1a1a1d1 +HG03100 E-M4254 E-M4254 E1b1a1a1d1 +HG03115 E-M4254 E-M4254 E1b1a1a1d1 +HG03118 E-M4254 E-M4254 E1b1a1a1d1 +HG03120 E-M4254 E-M4254 E1b1a1a1d1 +HG03124 E-M4254 E-M4254 E1b1a1a1d1 +HG03130 E-M4254 E-M4254 E1b1a1a1d1 +HG03133 E-M4254 E-M4254 E1b1a1a1d1 +HG03136 E-M4254 E-M4254 E1b1a1a1d1 +HG03157 E-P277 E-M4254 E1b1a1a1d1 +HG03166 E-M4254 E-M4254 E1b1a1a1d1 +HG03169 E-M4254 E-M4254 E1b1a1a1d1 +HG03202 E-M4254 E-M4254 E1b1a1a1d1 +HG03265 E-M4254 E-M4254 E1b1a1a1d1 +HG03298 E-M4254 E-M4254 E1b1a1a1d1 +HG03304 E-M4254 E-M4254 E1b1a1a1d1 +HG03311 E-M4254 E-M4254 E1b1a1a1d1 +HG03313 E-P277 E-M4254 E1b1a1a1d1 +HG03343 E-M4254 E-M4254 E1b1a1a1d1 +HG03521 E-M4254 E-M4254 E1b1a1a1d1 +NA18498 E-P277 E-M4254 E1b1a1a1d1 +NA18501 E-M4254 E-M4254 E1b1a1a1d1 +NA18879 E-M4254 E-M4254 E1b1a1a1d1 +NA18923 E-M4254 E-M4254 E1b1a1a1d1 +NA18934 E-M4254 E-M4254 E1b1a1a1d1 +NA19020 E-P277 E-M4254 E1b1a1a1d1 +NA19025 E-P277 E-M4254 E1b1a1a1d1 +NA19141 E-M4254 E-M4254 E1b1a1a1d1 +NA19144 E-M4254 E-M4254 E1b1a1a1d1 +NA19236 E-M4254 E-M4254 E1b1a1a1d1 +NA19374 E-M4254 E-M4254 E1b1a1a1d1 +NA19711 E-M4254 E-M4254 E1b1a1a1d1 +HG01362 E-U290 E-U290 E1b1a1a1d1a +HG01912 E-U290 E-U290 E1b1a1a1d1a +HG01914 E-U290 E-U290 E1b1a1a1d1a +HG02053 E-U290 E-U290 E1b1a1a1d1a +HG02332 E-U290 E-U290 E1b1a1a1d1a +HG03072 E-U290 E-U290 
E1b1a1a1d1a +HG03081 E-U290 E-U290 E1b1a1a1d1a +HG03112 E-U290 E-U290 E1b1a1a1d1a +HG03163 E-U290 E-U290 E1b1a1a1d1a +HG03209 E-U290 E-U290 E1b1a1a1d1a +HG03451 E-U290 E-U290 E1b1a1a1d1a +HG03478 E-U290 E-U290 E1b1a1a1d1a +HG03484 E-U290 E-U290 E1b1a1a1d1a +HG03571 E-U290 E-U290 E1b1a1a1d1a +NA18522 E-U290 E-U290 E1b1a1a1d1a +NA18853 E-U290 E-U290 E1b1a1a1d1a +NA18865 E-U290 E-U290 E1b1a1a1d1a +NA18871 E-U290 E-U290 E1b1a1a1d1a +NA19119 E-U290 E-U290 E1b1a1a1d1a +NA19130 E-U290 E-U290 E1b1a1a1d1a +NA19138 E-U290 E-U290 E1b1a1a1d1a +NA19153 E-U290 E-U290 E1b1a1a1d1a +NA19160 E-U290 E-U290 E1b1a1a1d1a +NA19184 E-U290 E-U290 E1b1a1a1d1a +NA19700 E-U290 E-U290 E1b1a1a1d1a +NA19703 E-U181 E-U181 E1b1a1a1d1a1 +NA20340 E-U181 E-U181 E1b1a1a1d1a1 +HG02557 E-CTS99 E-CTS99 E1b1a1a1d1a2 +NA18486 E-CTS99 E-CTS99 E1b1a1a1d1a2 +NA19096 E-CTS99 E-CTS99 E1b1a1a1d1a2 +NA19256 E-CTS99 E-CTS99 E1b1a1a1d1a2 +NA19346 E-CTS99 E-CTS99 E1b1a1a1d1a2 +NA19375 E-CTS99 E-CTS99 E1b1a1a1d1a2 +NA19397 E-CTS99 E-CTS99 E1b1a1a1d1a2 +NA19448 E-CTS99 E-CTS99 E1b1a1a1d1a2 +HG03397 E-Z5994 E-Z5994 E1b1a1b +HG02628 E-Z22132 E-Z22132 E1b1a1b1 +HG03057 E-Z22132 E-Z22132 E1b1a1b1 +HG03063 E-Z22132 E-Z22132 E1b1a1b1 +HG03069 E-Z22132 E-Z22132 E1b1a1b1 +HG03460 E-Z22132 E-Z22132 E1b1a1b1 +HG03054 E-Z6000 E-Z6000 E1b1a1b1a1 +HG03066 E-Z6000 E-Z6000 E1b1a1b1a1 +HG03442 E-Z6000 E-Z6000 E1b1a1b1a1 +HG03385 E-Z5996 E-Z5996 E1b1a1b1b +HG03391 E-Z5996 E-Z5996 E1b1a1b1b +HG03436 E-Z5998 E-Z5998 E1b1a1b1c +HG03457 E-Z5998 E-Z5998 E1b1a1b1c +HG02807 E-L539 E-V68.1 E1b1b1a +NA19309 E-Z808 E-Z808 E1b1b1a1a1b1 +HG01497 E-CTS693 E-CTS693 E1b1b1a1a1c +HG01970 E-CTS693 E-CTS693 E1b1b1a1a1c +HG02330 E-CTS693 E-CTS693 E1b1b1a1a1c +NA19818 E-CTS693 E-CTS693 E1b1b1a1a1c +HG01377 E-PF2284 E-PF2187 E1b1b1a1a2a +NA20544 E-PF2284 E-PF2187 E1b1b1a1a2a +HG01054 E-V13 E-V13 E1b1b1a1b1a +HG01072 E-V13 E-V13 E1b1b1a1b1a +HG01107 E-V13 E-V13 E1b1b1a1b1a +HG01308 E-V13 E-V13 E1b1b1a1b1a +NA20510 E-V13 E-V13 E1b1b1a1b1a +NA20516 E-V13 E-V13 E1b1b1a1b1a +NA20528 E-V13 E-V13 E1b1b1a1b1a +HG01200 E-L677 E-L677 E1b1b1a1b2 +NA19780 E-L677 E-L677 E1b1b1a1b2 +HG01699 E-V257 E-V257 E1b1b1b1 +HG02317 E-V257 E-V257 E1b1b1b1 +HG02798 E-V257 E-V257 E1b1b1b1 +HG01088 E-M183 E-M183 E1b1b1b1a1 +HG01104 E-M183 E-M183 E1b1b1b1a1 +HG01161 E-M183 E-M183 E1b1b1b1a1 +HG01680 E-M183 E-M183 E1b1b1b1a1 +HG02150 E-M183 E-M183 E1b1b1b1a1 +NA19676 E-M183 E-M183 E1b1b1b1a1 +NA19759 E-M183 E-M183 E1b1b1b1a1 +NA19792 E-M183 E-M183 E1b1b1b1a1 +HG01110 E-L29 E-L29 E1b1b1b2a1a +HG01250 E-L29 E-L29 E1b1b1b2a1a +HG01325 E-L29 E-L29 E1b1b1b2a1a +HG01112 E-L792 E-L791 E1b1b1b2a1d +HG01455 E-L792 E-L791 E1b1b1b2a1d +NA19311 E-M293 E-M293 E1b1b1b2b +NA19331 E-M293 E-M293 E1b1b1b2b +NA19334 E-M293 E-M293 E1b1b1b2b +HG02771 E-M85 E-M85 E2b1a +NA19035 E-M200 E-M200 E2b1a1 +NA19380 E-M200 E-M200 E2b1a1 +NA19175 E-CTS1307 E-CTS1048 E2b2 +NA20348 E-CTS1307 E-CTS1048 E2b2 +NA18966 D-M125 D-M125 D1b1a +NA19089 D-M125 D-M125 D1b1a +NA18983 D-Page3 D-Page3 D1b1a2b +NA19012 D-CTS3397 D-CTS3397 D1b1a2b1 +NA19058 D-Z1500 D-Z1500 D1b1a2b1a +NA18944 D-Z1504 D-Z1504 D1b1a2b1a1 +NA18948 D-Z1504 D-Z1504 D1b1a2b1a1 +NA18967 D-Z1504 D-Z1504 D1b1a2b1a1 +NA19068 D-Z1504 D-Z1504 D1b1a2b1a1 +NA18988 D-FGC6373 D-FGC6373 D1b1a2b1a1a1 +NA19082 D-FGC6373 D-FGC6373 D1b1a2b1a1a1 +NA18960 D-CTS6609 D-CTS6609 D1b1d +NA18952 D-CTS1897 D-CTS1897 D1b1d1 +NA19056 D-CTS4617 D-CTS4617 D1b1d1a1 +NA18940 D-CTS6909 D-CTS6909 D1b1d1a1a +NA18961 D-CTS1964 D-CTS1964 D1b1d1b +NA18970 D-CTS1964 D-CTS1964 D1b1d1b +NA19004 D-CTS5302 D-CTS220 D1b2a 
+NA18995 D-Z17173 D-CTS10495 D1b2a1 +NA19063 D-CTS11285 D-CTS11285 D1b2a2 +NA18989 C-CTS6678 C-CTS6678 C1a1a1 +NA19006 C-CTS6678 C-CTS6678 C1a1a1 +NA18971 C-Z1356 C-Z1356 C1a1a2 +NA18974 C-Z1356 C-Z1356 C1a1a2 +NA21133 C-P92 C-P92 C1b1a1a +HG03850 C-Z5895 C-Z5895 C1b1a1a1 +HG03867 C-K96 C-K96 C1b1a1a1a +NA21118 C-Z5896 C-Z5896 C1b1a1a1a1a +NA21127 C-K225 C-Z5896 C1b1a1a1a1a +NA20911 C-K193 C-K193 C1b1a1a1a1a1 +NA21100 C-K193 C-K193 C1b1a1a1a1a1 +NA21117 C-Z12522 C-Z12522 C1b1a1a1a1a1a +NA21124 C-Z12522 C-Z12522 C1b1a1a1a1a1a +NA20889 C-K466 C-K466 C1b1a1a1a1a1a1 +NA21094 C-K466 C-K466 C1b1a1a1a1a1a1 +NA21123 C-K466 C-K466 C1b1a1a1a1a1a1 +NA21091 C-Z5898 C-Z5898 C1b1a1a1a1a1a1a1 +NA21119 C-Z5898 C-Z5898 C1b1a1a1a1a1a1a1 +HG03228 C-Z12381 C-Z5900 C1b1a1b1 +HG03593 C-Z5900 C-Z5900 C1b1a1b1 +HG04155 C-Z5900 C-Z5900 C1b1a1b1 +HG00628 C-M217 C-M217 C2 +NA18749 C-CTS8579 C-CTS8579 C2e1a1 +NA19091 C-M407 C-M407 C2e1a1a +HG03917 C-F1319 C-F1319 C2e1b1 +NA18620 C-M4236 C-F3735 C2e1b1a +NA18612 C-CTS3385 C-CTS3385 C2e1b2 +HG02141 C-F845 C-F845 C2e2 +NA19079 C-F845 C-F845 C2e2 +HG02373 C-K516 C-K516 C2e2a1 +HG02381 C-K516 C-K516 C2e2a1 +HG02040 F-M89 F-M89 F +NA20858 G-Z3313 G-Z3189 G1a1b +NA20870 G-Z3313 G-Z3189 G1a1b +HG03708 G-Z17775 G-Z17775 G2a1a2a +HG02681 G-L166 G-L166 G2a2a1a2a1a +HG01311 G-Z6800 G-Z6800 G2a2a1a2a1b +NA20767 G-Z37368 G-Z37368 G2a2b1a1b1a +HG02224 G-CTS12570 G-CTS796 G2a2b2a1 +NA19747 G-CTS9909 G-CTS9909 G2a2b2a1a1a1a +HG01097 G-CTS2100 G-CTS2100 G2a2b2a1a1b1a +NA12546 G-L42 G-L42 G2a2b2a1a1b1a1a +NA20536 G-CTS10391 G-CTS10391 G2a2b2a1a1b1a2a1a +NA19670 G-Z3441 G-Z30771 G2a2b2a1a1b1a2a2 +NA18549 G-CTS6763 G-CTS5990 G2a2b2a1a1c1a +NA12777 G-Z3520 G-Z3520 G2a2b2a1a1c1a1a +HG01512 G-Z3428 G-Z3428 G2a2b2a1a1c1a2 +HG03577 G-Z6523 G-Z6523 G2a2b2a1a1c1a2a1 +HG02687 G-Z30504 G-Z30503 G2a2b2a4 +NA12286 G-F1705 G-F1193 G2a2b2b1 +HG02789 G-M377 G-M377 G2b1 +HG03824 H-Z5870 H-Z5870 H1a1a +HG03846 H-M197 H-M36 H1a1a1 +HG02786 H-Z5871 H-Z5871 H1a1d1 +HG04164 H-Z5871 H-Z5871 H1a1d1 +HG01583 H-M2716 H-M2716 H1a1d2 +HG03778 H-M2716 H-M2716 H1a1d2 +HG04060 H-M2716 H-M2716 H1a1d2 +HG04002 H-Z5873 H-Z5873 H1a1d2a +HG03716 H-M2853 H-M2853 H1a1d2b +HG03941 H-M2853 H-M2853 H1a1d2b +HG04211 H-M2853 H-M2853 H1a1d2b +HG03815 H-Z5876 H-Z5876 H1a1d2b1 +HG04185 H-Z5877 H-Z5877 H1a1d2b1a +HG04235 H-Z5878 H-Z5878 H1a1d2b2 +HG03738 H-Z5879 H-Z5879 H1a1d2b2a +HG03755 H-Z5879 H-Z5879 H1a1d2b2a +HG04006 H-Z14441 H-Z14441 H1a1d2b3a +HG03600 H-Z5882 H-Z5882 H1a1d2b3a1 +HG03777 H-Z5883 H-Z5883 H1a1d2b3a1a +HG03009 H-Z5884 H-Z5884 H1a1d2b3a1a1 +NA20901 H-Z5884 H-Z5884 H1a1d2b3a1a1 +NA21107 H-Z5886 H-Z5886 H1a1d2c1a +NA20871 H-Z5887 H-Z5887 H1a1d2c1a1 +NA21126 H-Z5887 H-Z5887 H1a1d2c1a1 +NA21116 H-Z5888 H-Z5888 H1a1d2c1b +HG04146 H-Z5889 H-Z5889 H1a1d2c1b1 +HG04176 H-Z5889 H-Z5889 H1a1d2c1b1 +HG03644 H-Z5890 H-Z5890 H1a1d2c1b1a +HG03646 H-Z5890 H-Z5890 H1a1d2c1b1a +HG03745 H-Z5890 H-Z5890 H1a1d2c1b1a +HG03800 H-Z5890 H-Z5890 H1a1d2c1b1a +HG03950 H-Z5890 H-Z5890 H1a1d2c1b1a +HG03985 H-Z5890 H-Z5890 H1a1d2c1b1a +HG04061 H-Z5890 H-Z5890 H1a1d2c1b1a +HG04161 H-Z5890 H-Z5890 H1a1d2c1b1a +HG04206 H-Z5890 H-Z5890 H1a1d2c1b1a +NA20863 H-Z5890 H-Z5890 H1a1d2c1b1a +NA21087 H-Z4698 H-Z4542 H1a1d2c2a +NA21099 H-Z4542 H-Z4542 H1a1d2c2a +NA21113 H-Z4542 H-Z4542 H1a1d2c2a +HG04188 H-Z4469 H-Z4469 H1a2 +HG03812 H-Z4487 H-Z4487 H1a2a +NA21093 H-Z4417 H-Z4417 H1a2a1 +NA21111 H-Z4540 H-Z14686 H1a2a1a +NA21112 H-Z4540 H-Z14686 H1a2a1a +HG04198 H-Apt H-Apt H1b1 +HG04222 H-Apt H-Apt H1b1 +HG03809 H-Z14258 H-Z14258 H1b2 +HG03594 H-Z5868 
H-Z5868 H1b2a +HG03991 H-Z5868 H-Z5868 H1b2a +HG03870 H-M6531.2 H-Z5864 H3a1 +HG03872 H-M6531.2 H-Z5864 H3a1 +HG04238 H-M6531.2 H-Z5864 H3a1 +HG03848 H-Z12646 H-Z5860 H3a2a1 +HG03965 H-Z16780 H-Z16780 H3a2a1a +HG03792 H-Z5862 H-Z5862 H3a2a2 +HG04033 H-Z5862 H-Z5862 H3a2a2 +HG03680 H-Z5858 H-Z5858 H3a2b +HG03837 H-Z5858 H-Z5858 H3a2b +HG02684 H-Z13871 H-Z13871 H3b +HG03697 H-Z5859 H-Z5859 H3b1 +HG03890 H-Z5859 H-Z5859 H3b1 +HG00117 I-M253 I-M253 I1 +HG00140 I-M253 I-M253 I1 +NA19723 I-CTS6364 I-CTS6364 I1a1 +HG00190 I-Z74 I-Z74 I1a1b3 +HG00186 I-L258 I-L258 I1a1b3a1 +HG00308 I-L258 I-L258 I1a1b3a1 +HG00321 I-L258 I-L258 I1a1b3a1 +HG00325 I-L258 I-L258 I1a1b3a1 +HG00345 I-L258 I-L258 I1a1b3a1 +HG00372 I-L258 I-L258 I1a1b3a1 +NA11829 I-L813 I-L813 I1a1b3b +HG00189 I-L300 I-L300 I1a1b4 +HG00252 I-Z59 I-Z59 I1a2a +NA11881 I-Z59 I-Z59 I1a2a +NA12750 I-Z59 I-Z59 I1a2a +HG00159 I-Z60 I-Z60 I1a2a1 +HG02274 I-Z140 I-Z140 I1a2a1a +NA20762 I-Z140 I-Z140 I1a2a1a +HG02445 I-S1954 I-S1954 I1a2a1a1a +HG02420 I-L338 I-L338 I1a2a1a1a1 +HG00234 I-CTS10937 I-CTS10937 I1a2a1a1b +HG02433 I-F2642 I-F2642 I1a2a1a2 +NA19982 I-F2642 I-F2642 I1a2a1a2 +NA20511 I-F2642 I-F2642 I1a2a1a2 +HG00101 I-BY266 I-BY266 I1a2a1b1 +NA06994 I-BY266 I-BY266 I1a2a1b1 +HG00310 I-Z138 I-Z138 I1a2b +NA11919 I-Z2541 I-Z2541 I1a2b1 +NA11893 I-BY352 I-BY351 I1a3a2 +NA07051 I-CTS4279 I-CTS4279 I1a3a2c +NA11992 I-CTS6397 I-CTS6397 I1b1 +HG01167 I-M26 I-M26 I2a1a1 +HG01610 I-M26 I-M26 I2a1a1 +HG01344 I-PF4088 I-PF4088 I2a1a1a1a +HG01101 I-CTS11338 I-CTS11338 I2a1a1a1a1 +HG01197 I-CTS11338 I-CTS11338 I2a1a1a1a1 +HG00360 I-Y3118 I-Y3118 I2a1b2a1b1 +NA12413 I-L369 I-L126 I2a2a1a1a1 +HG02536 I-FGC20048 I-FGC20048 I2a2a1a1a1a +HG01527 I-Z2069 I-Y3681 I2a2a1a2a1 +HG02470 I-Z2069 I-Y3681 I2a2a1a2a1 +NA11932 I-BY138 I-BY138 I2a2a1a2a1a1d +NA20351 I-CTS10148 I-CTS10148 I2a2a1b2a1a2a +HG00136 I-Z180 I-Z180 I2a2a1b2a2a1a1a1 +NA12003 I-Z79 I-Z79 I2a2a1b2a2a1a1a1a1a1 +NA19984 I-Y8712 I-Y8712 I2a2a1b2a2a1a1a1a1a1c +NA12748 I-L1272 I-L1272 I2a2a1b2a2a2a +HG01988 I-S2488 I-S2488 I2a2b2b +HG01253 J-M267 J-M267 J1 +HG01256 J-M267 J-M267 J1 +HG03767 J-CTS5368 J-CTS5368 J1a +HG01069 J-P58 J-P58 J1a2b +HG01130 J-P58 J-P58 J1a2b +HG01431 J-P58 J-P58 J1a2b +HG01437 J-P58 J-P58 J1a2b +HG01571 J-P58 J-P58 J1a2b +HG01686 J-P58 J-P58 J1a2b +HG01935 J-P58 J-P58 J1a2b +HG02253 J-P58 J-P58 J1a2b +NA12282 J-P58 J-P58 J1a2b +NA20543 J-P58 J-P58 J1a2b +HG00181 J-L816 J-L816 J1a2b3a1 +HG01494 J-CTS15 J-CTS15 J1a3 +HG01589 J-M410 J-M410 J2a +HG02774 J-M410 J-M410 J2a +HG02236 J-L26 J-L26 J2a1 +HG02493 J-L26 J-L26 J2a1 +HG02724 J-L26 J-L26 J2a1 +HG02733 J-L26 J-L26 J2a1 +HG03012 J-L26 J-L26 J2a1 +HG03018 J-L26 J-L26 J2a1 +HG03021 J-L26 J-L26 J2a1 +HG03615 J-L26 J-L26 J2a1 +HG03660 J-L26 J-L26 J2a1 +HG03905 J-L26 J-L26 J2a1 +HG04003 J-L26 J-L26 J2a1 +HG04107 J-L26 J-L26 J2a1 +NA20884 J-L26 J-L26 J2a1 +NA21090 J-L26 J-L26 J2a1 +HG02490 J-M47 J-M47 J2a1a +HG01402 J-M67 J-M67 J2a1b +NA20534 J-M67 J-M67 J2a1b +NA20513 J-M92 J-M92 J2a1b1 +NA20765 J-M92 J-M92 J2a1b1 +NA20815 J-M92 J-M92 J2a1b1 +NA20827 J-M92 J-M92 J2a1b1 +NA20521 J-L210 J-L210 J2a1b3 +NA20787 J-L210 J-L210 J2a1b3 +NA20801 J-L210 J-L210 J2a1b3 +HG02651 J-M68 J-M68 J2a1c +HG03693 J-M68 J-M68 J2a1c +HG03854 J-M68 J-M68 J2a1c +HG01781 J-M319 J-M319 J2a1d +HG01991 J-M319 J-M319 J2a1d +HG01164 J-L24 J-L24 J2a1h +HG01672 J-L25 J-L25 J2a1h2 +HG03237 J-L25 J-L25 J2a1h2 +NA21130 J-L25 J-L25 J2a1h2 +HG01412 J-L70 J-L70 J2a1h2a1 +HG01756 J-L70 J-L70 J2a1h2a1 +NA20778 J-L70 J-L70 J2a1h2a1 +NA20805 J-L70 J-L70 J2a1h2a1 +NA20809 
J-L70 J-L70 J2a1h2a1 +HG03851 J-L192.2 J-L192.2 J2a1h2d +HG01619 J-M12 J-M12 J2b +NA20588 J-M12 J-M12 J2b +HG02690 J-M241 J-M241 J2b2 +HG03006 J-M241 J-M241 J2b2 +HG03696 J-M241 J-M241 J2b2 +HG03785 J-M241 J-M241 J2b2 +HG03908 J-M241 J-M241 J2b2 +HG03969 J-M241 J-M241 J2b2 +HG03976 J-M241 J-M241 J2b2 +HG03998 J-M241 J-M241 J2b2 +HG04210 J-M241 J-M241 J2b2 +HG04239 J-M241 J-M241 J2b2 +NA20885 J-M241 J-M241 J2b2 +NA20905 J-M241 J-M241 J2b2 +NA20763 J-L283 J-L283 J2b2a +HG00160 J-Z1297 J-Z1297 J2b2a1a +HG01509 J-Z631 J-Z631 J2b2a1a1 +NA11930 J-Z631 J-Z631 J2b2a1a1 +NA20525 J-Z631 J-Z631 J2b2a1a1 +NA20811 J-Z631 J-Z631 J2b2a1a1 +HG02654 L-M27 L-M27 L1a1 +HG02792 L-M27 L-M27 L1a1 +HG03686 L-M27 L-M27 L1a1 +HG03746 L-M27 L-M27 L1a1 +HG03786 L-M27 L-M27 L1a1 +HG03788 L-M27 L-M27 L1a1 +HG03875 L-M27 L-M27 L1a1 +HG03885 L-M27 L-M27 L1a1 +HG03920 L-M27 L-M27 L1a1 +HG03953 L-M27 L-M27 L1a1 +HG03971 L-M27 L-M27 L1a1 +HG03990 L-M27 L-M27 L1a1 +HG04080 L-M27 L-M27 L1a1 +HG04093 L-M27 L-M27 L1a1 +HG04100 L-M27 L-M27 L1a1 +NA20861 L-M27 L-M27 L1a1 +NA20867 L-M27 L-M27 L1a1 +NA21109 L-M27 L-M27 L1a1 +NA21128 L-M27 L-M27 L1a1 +NA21135 L-M27 L-M27 L1a1 +HG03672 L-M357 L-M357 L1a2 +HG03695 L-M357 L-M357 L1a2 +HG03753 L-M357 L-M357 L1a2 +HG03790 L-M357 L-M357 L1a2 +HG03821 L-M357 L-M357 L1a2 +HG03900 L-M357 L-M357 L1a2 +HG04094 L-M357 L-M357 L1a2 +NA20520 T-L208 T-L208 T1a1a +HG01133 T-CTS9882 T-CTS9882 T1a1a1a1a1a +NA20527 T-CTS9882 T-CTS9882 T1a1a1a1a1a +HG01190 T-CTS6280 T-CTS6280 T1a1a1a1a1b +NA19655 T-CTS6280 T-CTS6280 T1a1a1a1a1b +NA20758 T-CTS3767 T-CTS3767 T1a2b1 +HG01051 T-CTS8862 T-CTS8862 T1a2b1a +HG01530 T-CTS8862 T-CTS8862 T1a2b1a +HG03742 NO-M2313 NO-M2313 NO +HG02138 N-M231 N-M231 N +HG04015 N-M231 N-M231 N +NA18558 N-M231 N-M231 N +NA18608 N-M231 N-M231 N +NA18639 N-M231 N-M231 N +NA18747 N-M231 N-M231 N +NA18748 N-M231 N-M231 N +HG00280 N-CTS2929 N-CTS2929 N1c1a1a1 +HG00342 N-CTS2929 N-CTS2929 N1c1a1a1 +HG00369 N-CTS2929 N-CTS2929 N1c1a1a1 +HG00341 N-L550 N-L550 N1c1a1a1a +HG00351 N-L550 N-L550 N1c1a1a1a +HG00182 N-CTS1737 N-CTS1737 N1c1a1a2a1 +HG00183 N-CTS1737 N-CTS1737 N1c1a1a2a1 +HG00187 N-CTS1737 N-CTS1737 N1c1a1a2a1 +HG00278 N-CTS1737 N-CTS1737 N1c1a1a2a1 +HG00284 N-CTS1737 N-CTS1737 N1c1a1a2a1 +HG00311 N-CTS1737 N-CTS1737 N1c1a1a2a1 +HG00329 N-CTS1737 N-CTS1737 N1c1a1a2a1 +HG00371 N-CTS1737 N-CTS1737 N1c1a1a2a1 +HG00375 N-CTS1737 N-CTS1737 N1c1a1a2a1 +HG00185 N-Z1940 N-Z1940 N1c1a1a2a1a1 +HG00188 N-Z1940 N-Z1940 N1c1a1a2a1a1 +HG00271 N-Z1940 N-Z1940 N1c1a1a2a1a1 +HG00273 N-Z1940 N-Z1940 N1c1a1a2a1a1 +HG00290 N-Z1940 N-Z1940 N1c1a1a2a1a1 +HG00335 N-Z1940 N-Z1940 N1c1a1a2a1a1 +HG00336 N-Z1940 N-Z1940 N1c1a1a2a1a1 +HG00338 N-Z1940 N-Z1940 N1c1a1a2a1a1 +HG00358 N-Z1940 N-Z1940 N1c1a1a2a1a1 +HG02026 N-L666 N-L666 N1c2 +NA18613 N-L666 N-L666 N1c2 +HG00595 O-CTS2594 O-CTS2498 O1a1a1a1a1 +HG02384 O-F656 O-F656 O1a1a1a1a1a +HG02396 O-F656 O-F656 O1a1a1a1a1a +NA18536 O-F656 O-F656 O1a1a1a1a1a +NA18606 O-CTS4585 O-CTS4585 O1a1a1a1a2 +HG02017 O-Z23392 O-Z23392 O1a1a1b2 +HG02079 O-Z23442 O-Z23442 O1a1a1b2a +HG02360 O-Z39268 O-SK1571 O1a1a1b2a1 +NA18632 O-CTS701 O-CTS701 O1a1a2a +HG00580 O-Z23266 O-Z23266 O1a1a2a1 +HG01811 O-Z23266 O-Z23266 O1a1a2a1 +HG02250 O-Z23266 O-Z23266 O1a1a2a1 +HG02353 O-Z23266 O-Z23266 O1a1a2a1 +HG02363 O-Z23266 O-Z23266 O1a1a2a1 +HG02377 O-Z23266 O-Z23266 O1a1a2a1 +HG02385 O-Z23266 O-Z23266 O1a1a2a1 +HG02399 O-Z23266 O-Z23266 O1a1a2a1 +NA18740 O-Z23266 O-Z23266 O1a1a2a1 +NA18647 O-CTS5726 O-CTS5726 O1a1b +HG01840 O-F923 O-F923 O1b1a1a1a1a1a +HG01846 O-F923 O-F923 
O1b1a1a1a1a1a +HG01849 O-F923 O-F923 O1b1a1a1a1a1a +HG01867 O-F923 O-F923 O1b1a1a1a1a1a +HG02364 O-F923 O-F923 O1b1a1a1a1a1a +HG02386 O-F923 O-F923 O1b1a1a1a1a1a +HG02512 O-F1399 O-F1399 O1b1a1a1a1a1a1a +HG00701 O-F2415 O-F2415 O1b1a1a1a1a1a1a1 +HG02351 O-F2415 O-F2415 O1b1a1a1a1a1a1a1 +HG02382 O-F2415 O-F2415 O1b1a1a1a1a1a1a1 +HG02398 O-F2415 O-F2415 O1b1a1a1a1a1a1a1 +HG02394 O-Z24100 O-Z24100 O1b1a1a1a1a1a3 +HG02408 O-Z24100 O-Z24100 O1b1a1a1a1a1a3 +HG02410 O-Z24100 O-Z24100 O1b1a1a1a1a1a3 +HG02020 O-Z24091 O-Z24091 O1b1a1a1a1a1b +HG02521 O-Z24091 O-Z24091 O1b1a1a1a1a1b +HG02356 O-Z39410 O-Z39410 O1b1a1a1a1a1b1 +HG01842 O-Z24088 O-Z24088 O1b1a1a1a1a2 +HG02032 O-Z24088 O-Z24088 O1b1a1a1a1a2 +HG00844 O-F2890 O-F2890 O1b1a1a1a1b +HG00728 O-Z24050 O-Z24050 O1b1a1a1a1b1a +HG02395 O-Z24050 O-Z24050 O1b1a1a1a1b1a +HG02397 O-Z24050 O-Z24050 O1b1a1a1a1b1a +HG01873 O-Z24014 O-Z24014 O1b1a1a1a1b2 +HG02064 O-Z24014 O-Z24014 O1b1a1a1a1b2 +HG02085 O-Z24014 O-Z24014 O1b1a1a1a1b2 +NA19009 O-CTS7399 O-CTS7399 O1b1a1a1a2a1 +HG01028 O-FGC19713 O-FGC19713 O1b1a1a1a2a1a +HG02402 O-Z23858 O-FGC19713 O1b1a1a1a2a1a +HG00982 O-CTS7642 O-CTS651 O1b1a1a1a2a2 +HG01031 O-CTS7642 O-CTS651 O1b1a1a1a2a2 +HG02390 O-CTS651 O-CTS651 O1b1a1a1a2a2 +HG02401 O-CTS651 O-CTS651 O1b1a1a1a2a2 +NA18636 O-CTS9884 O-CTS9884 O1b1a1a1a2a2a +HG02374 O-F4229 O-F4229 O1b1a1a1a2b +HG02409 O-F4229 O-F4229 O1b1a1a1a2b +HG02407 O-Z23667 O-Z23667 O1b1a1a1b1 +HG04140 O-B426 O-B426 O1b1a1a1b1a +HG04173 O-B426 O-B426 O1b1a1a1b1a +HG01865 O-B427 O-B427 O1b1a1a1b1a2 +HG02061 O-B427 O-B427 O1b1a1a1b1a2 +HG02389 O-Z39485 O-Z39485 O1b1a1a1b1b +HG01844 O-SK1646 O-SK1646 O1b1a1a1b2 +HG00442 O-CTS350 O-CTS350 O1b1a1a2 +HG00457 O-F838 O-F838 O1b1a1b +NA18638 O-F1199 O-F1199 O1b1a1b1 +HG00634 O-F1759 O-F1759 O1b1a2a1 +NA18603 O-F1759 O-F1759 O1b1a2a1 +NA18637 O-F1759 O-F1759 O1b1a2a1 +NA18534 O-F2183 O-F1127 O1b1a2b1a +NA19055 O-CTS1451 O-CTS1451 O1b1a2b2 +HG00406 O-CTS9996 O-CTS9996 O1b1a2c +HG02076 O-CTS9996 O-CTS9996 O1b1a2c +NA18943 O-CTS713 O-CTS713 O1b2a1 +NA18962 O-CTS713 O-CTS713 O1b2a1 +NA18977 O-CTS713 O-CTS713 O1b2a1 +NA18985 O-CTS713 O-CTS713 O1b2a1 +NA18986 O-CTS713 O-CTS713 O1b2a1 +NA18990 O-CTS7776 O-CTS713 O1b2a1 +NA19000 O-CTS713 O-CTS713 O1b2a1 +NA19060 O-CTS713 O-CTS713 O1b2a1 +NA19066 O-CTS713 O-CTS713 O1b2a1 +NA18953 O-CTS1875 O-CTS1875 O1b2a1a +NA19088 O-CTS1875 O-CTS1875 O1b2a1a +NA18965 O-CTS1966 O-CTS10682 O1b2a1a1 +NA19005 O-CTS10682 O-CTS10682 O1b2a1a1 +NA19067 O-CTS10682 O-CTS10682 O1b2a1a1 +NA18994 O-CTS203 O-CTS203 O1b2a1c +NA19075 O-CTS2734 O-CTS2734 O1b2a2a +NA18561 O-CTS723 O-CTS723 O1b2a2a1 +NA19007 O-Z39506 O-CTS7620 O1b2a2a1a +HG00530 O-F940 O-F940 O1b2a2b +HG00533 O-F940 O-F940 O1b2a2b +NA19070 O-CTS1215 O-CTS1215 O1b2a3a +NA18563 O-CTS562 O-CTS562 O1b2b +HG00448 O-Page124 O-Page124 O2a1a1 +HG00478 O-Page124 O-Page124 O2a1a1 +HG02029 O-Page124 O-Page124 O2a1a1 +HG00463 O-F854 O-F854 O2a1a1b +HG00475 O-F854 O-F854 O2a1a1b +HG00500 O-F854 O-F854 O2a1a1b +HG00536 O-F854 O-F854 O2a1a1b +HG00542 O-F854 O-F854 O2a1a1b +NA18745 O-F854 O-F854 O2a1a1b +HG02067 O-CTS1936 O-CTS1936 O2a1a2a +NA18605 O-CTS1936 O-CTS1936 O2a1a2a +NA18984 O-JST002611 O-JST002611 O2a1c +HG00607 O-F11 O-F11 O2a1c1a +NA18633 O-F632 O-F632 O2a1c1a1 +NA18611 O-M11115 O-M11115 O2a1c1a1a +NA18629 O-M11115 O-M11115 O2a1c1a1a +HG00436 O-F856 O-F856 O2a1c1a1a1a1a +HG00451 O-F856 O-F856 O2a1c1a1a1a1a +HG00613 O-F856 O-F856 O2a1c1a1a1a1a +HG00421 O-CTS7501 O-CTS7501 O2a1c1a1a1a2 +HG00707 O-CTS7501 O-CTS7501 O2a1c1a1a1a2 +HG00445 O-F793 O-F793 O2a1c1a1a1b 
+HG01596 O-F196 O-F12 O2a1c1a3 +HG02128 O-F196 O-F12 O2a1c1a3 +NA18559 O-F2685 O-F2685 O2a1c1a4a +HG02047 O-M5420 O-M5420 O2a1c1a5 +HG00403 O-CTS12877 O-CTS12877 O2a1c1a6 +HG02070 O-CTS5409 O-CTS5409 O2a1c1a6a1 +NA18982 O-CTS5409 O-CTS5409 O2a1c1a6a1 +HG00610 O-F238 O-F238 O2a1c1b1 +HG00472 O-F1273 O-F1273 O2a1c1b1a1 +HG02392 O-SK1673 O-SK1673 O2a1c2 +NA18544 O-SK1673 O-SK1673 O2a1c2 +HG00650 O-M159 O-M159 O2a2a1a1a +HG00656 O-M159 O-M159 O2a2a1a1a +HG00662 O-M159 O-M159 O2a2a1a1a +HG00671 O-M159 O-M159 O2a2a1a1a +HG00674 O-M159 O-M159 O2a2a1a1a +HG00689 O-M159 O-M159 O2a2a1a1a +HG00692 O-M159 O-M159 O2a2a1a1a +HG00698 O-M159 O-M159 O2a2a1a1a +HG00704 O-M159 O-M159 O2a2a1a1a +HG00653 O-F2309 O-F1275 O2a2a1a2a1a +HG02035 O-F1275 O-F1275 O2a2a1a2a1a +HG02073 O-F1275 O-F1275 O2a2a1a2a1a +HG01816 O-F1863 O-F1863 O2a2a1a2a2 +HG01861 O-F1863 O-F1863 O2a2a1a2a2 +HG02023 O-F1863 O-F1863 O2a2a1a2a2 +HG02131 O-F1863 O-F1863 O2a2a1a2a2 +HG02137 O-F1863 O-F1863 O2a2a1a2a2 +HG00622 O-F879 O-F879 O2a2a2 +HG00625 O-F879 O-F879 O2a2a2 +NA18621 O-F1226 O-F1226 O2a2a2a +HG00565 O-F8 O-F8 O2a2b1a1a +HG00598 O-F8 O-F8 O2a2b1a1a +HG00619 O-F8 O-F8 O2a2b1a1a +HG00881 O-F8 O-F8 O2a2b1a1a +HG01810 O-F8 O-F8 O2a2b1a1a +HG01866 O-F8 O-F8 O2a2b1a1a +HG02082 O-F8 O-F8 O2a2b1a1a +HG02355 O-F8 O-F8 O2a2b1a1a +HG02371 O-F8 O-F8 O2a2b1a1a +HG02372 O-F8 O-F8 O2a2b1a1a +HG02375 O-F8 O-F8 O2a2b1a1a +HG02379 O-F8 O-F8 O2a2b1a1a +HG02380 O-F8 O-F8 O2a2b1a1a +HG02383 O-F8 O-F8 O2a2b1a1a +HG02387 O-F8 O-F8 O2a2b1a1a +HG02388 O-F8 O-F8 O2a2b1a1a +HG02391 O-F8 O-F8 O2a2b1a1a +HG02406 O-F8 O-F8 O2a2b1a1a +HG03830 O-F8 O-F8 O2a2b1a1a +HG04134 O-F8 O-F8 O2a2b1a1a +NA18530 O-F8 O-F8 O2a2b1a1a +NA18624 O-F8 O-F8 O2a2b1a1a +NA18945 O-F8 O-F8 O2a2b1a1a +NA19072 O-F8 O-F8 O2a2b1a1a +NA19086 O-F8 O-F8 O2a2b1a1a +HG00409 O-F438 O-F438 O2a2b1a1a1 +NA18622 O-Y17728 O-Y17728 O2a2b1a1a1a +NA18557 O-F155 O-F155 O2a2b1a1a1a1 +NA18757 O-Z25907 O-Z25907 O2a2b1a1a1a3 +NA19062 O-Z25907 O-Z25907 O2a2b1a1a1a3 +NA19085 O-F402 O-F402 O2a2b1a1a2a1 +HG00592 O-CTS7634 O-CTS7634 O2a2b1a1a3 +HG01852 O-F317 O-F317 O2a2b1a1a3a +HG02122 O-F317 O-F317 O2a2b1a1a3a +NA18623 O-CTS5488 O-CTS5488 O2a2b1a1a3b +NA18548 O-CTS4960 O-CTS4960 O2a2b1a1b +NA18635 O-F79 O-F79 O2a2b1a2a +HG00524 O-F46 O-F46 O2a2b1a2a1 +HG00583 O-F46 O-F46 O2a2b1a2a1 +HG00589 O-F46 O-F46 O2a2b1a2a1 +HG00683 O-F46 O-F46 O2a2b1a2a1 +HG01860 O-F46 O-F46 O2a2b1a2a1 +HG01864 O-F46 O-F46 O2a2b1a2a1 +HG02058 O-F46 O-F46 O2a2b1a2a1 +HG02367 O-F46 O-F46 O2a2b1a2a1 +NA18543 O-F46 O-F46 O2a2b1a2a1 +NA18546 O-F46 O-F46 O2a2b1a2a1 +NA18562 O-F46 O-F46 O2a2b1a2a1 +NA18572 O-F46 O-F46 O2a2b1a2a1 +NA18609 O-F46 O-F46 O2a2b1a2a1 +NA18643 O-F46 O-F46 O2a2b1a2a1 +NA18648 O-F46 O-F46 O2a2b1a2a1 +NA18959 O-F46 O-F46 O2a2b1a2a1 +NA19076 O-F46 O-F46 O2a2b1a2a1 +HG02050 O-F743 O-F743 O2a2b1a2b +NA19083 O-F743 O-F743 O2a2b1a2b +HG00556 O-F4068 O-F4068 O2a2b2a1a1 +HG01872 O-F4124 O-F4124 O2a2b2a1b +HG00631 O-JST008425p6 O-JST008425p6 O2a2b2a1b1 +HG02088 O-F1150 O-F1150 O2b1 +HG00559 O-F2244 O-F837 O2b1a +HG01944 Q-M120 Q-M120 Q1a1a1 +HG02116 Q-M120 Q-M120 Q1a1a1 +HG02134 Q-M120 Q-M120 Q1a1a1 +HG02696 Q-L712 Q-L712 Q1a1b1 +HG03681 Q-M346 Q-M346 Q1a2 +HG03943 Q-M346 Q-M346 Q1a2 +HG02090 Q-L54 Q-L54 Q1a2a1 +NA19774 Q-L54 Q-L54 Q1a2a1 +HG01124 Q-M3 Q-M3 Q1a2a1a1 +HG01139 Q-M3 Q-M3 Q1a2a1a1 +HG01142 Q-M3 Q-M3 Q1a2a1a1 +HG01565 Q-M3 Q-M3 Q1a2a1a1 +HG01892 Q-M3 Q-M3 Q1a2a1a1 +HG01920 Q-M3 Q-M3 Q1a2a1a1 +HG01923 Q-M3 Q-M3 Q1a2a1a1 +HG01926 Q-M3 Q-M3 Q1a2a1a1 +HG01938 Q-M3 Q-M3 Q1a2a1a1 +HG01950 Q-M3 Q-M3 Q1a2a1a1 +HG01961 Q-M3 
Q-M3 Q1a2a1a1 +HG01967 Q-M3 Q-M3 Q1a2a1a1 +HG01974 Q-M3 Q-M3 Q1a2a1a1 +HG01977 Q-M3 Q-M3 Q1a2a1a1 +HG01979 Q-M3 Q-M3 Q1a2a1a1 +HG02104 Q-M3 Q-M3 Q1a2a1a1 +HG02146 Q-M3 Q-M3 Q1a2a1a1 +HG02259 Q-M3 Q-M3 Q1a2a1a1 +HG02265 Q-M3 Q-M3 Q1a2a1a1 +HG02271 Q-M3 Q-M3 Q1a2a1a1 +HG02277 Q-M3 Q-M3 Q1a2a1a1 +HG02285 Q-M3 Q-M3 Q1a2a1a1 +HG02291 Q-M3 Q-M3 Q1a2a1a1 +HG02299 Q-M3 Q-M3 Q1a2a1a1 +HG02304 Q-M3 Q-M3 Q1a2a1a1 +NA19664 Q-M3 Q-M3 Q1a2a1a1 +NA19682 Q-M3 Q-M3 Q1a2a1a1 +NA19729 Q-M3 Q-M3 Q1a2a1a1 +NA19732 Q-M3 Q-M3 Q1a2a1a1 +NA19735 Q-M3 Q-M3 Q1a2a1a1 +NA19783 Q-M3 Q-M3 Q1a2a1a1 +NA19786 Q-M3 Q-M3 Q1a2a1a1 +NA19771 Q-M971 Q-M971 Q1a2a1b +NA19795 Q-M971 Q-M971 Q1a2a1b +HG03864 Q-Y1170 Q-Y1150 Q1b2 +HG03914 Q-Y1170 Q-Y1150 Q1b2 +HG03652 Q-Z3946 Q-Y15624 Q1b2a1a1 +NA12155 R-L664 R-L664 R1a1a1a +HG00151 R-Z91 R-Z280 R1a1a1b1a2 +HG00277 R-Z92 R-Z92 R1a1a1b1a2a +HG00382 R-YP270 R-YP270 R1a1a1b1a2a2 +HG00366 R-CTS3402 R-CTS3402 R1a1a1b1a2b3 +NA11843 R-CTS3390 R-CTS3390 R1a1a1b1a3a1c +HG00109 R-Z287 R-Z287 R1a1a1b1a3b +HG00113 R-Z87 R-CTS2243 R1a1a1b1a3b1a +HG00114 R-Z87 R-CTS2243 R1a1a1b1a3b1a +HG02603 R-Z93 R-Z93 R1a1a1b2 +HG02699 R-Z93 R-Z93 R1a1a1b2 +HG03629 R-Z93 R-Z93 R1a1a1b2 +HG03705 R-Z93 R-Z93 R1a1a1b2 +HG03750 R-Z93 R-Z93 R1a1a1b2 +HG03926 R-Z93 R-Z93 R1a1a1b2 +HG04194 R-Z93 R-Z93 R1a1a1b2 +NA20539 R-Z93 R-Z93 R1a1a1b2 +NA20796 R-Z93 R-Z93 R1a1a1b2 +NA20897 R-Z93 R-Z93 R1a1a1b2 +HG03899 R-L657 R-L657 R1a1a1b2a1 +HG03911 R-L657 R-L657 R1a1a1b2a1 +HG04039 R-L657 R-L657 R1a1a1b2a1 +HG04098 R-L657 R-L657 R1a1a1b2a1 +NA20904 R-L657 R-L657 R1a1a1b2a1 +HG02600 R-Y7 R-Y7 R1a1a1b2a1a +HG02648 R-Y7 R-Y7 R1a1a1b2a1a +HG02727 R-Y7 R-Y7 R1a1a1b2a1a +HG02736 R-Y7 R-Y7 R1a1a1b2a1a +HG03234 R-Y7 R-Y7 R1a1a1b2a1a +HG03490 R-Y7 R-Y7 R1a1a1b2a1a +HG03624 R-Y7 R-Y7 R1a1a1b2a1a +HG03649 R-Y7 R-Y7 R1a1a1b2a1a +HG03663 R-Y7 R-Y7 R1a1a1b2a1a +HG03667 R-Y7 R-Y7 R1a1a1b2a1a +HG03679 R-Y7 R-Y7 R1a1a1b2a1a +HG03694 R-Y7 R-Y7 R1a1a1b2a1a +HG03720 R-Y7 R-Y7 R1a1a1b2a1a +HG03779 R-Y7 R-Y7 R1a1a1b2a1a +HG03896 R-Y7 R-Y7 R1a1a1b2a1a +HG03902 R-Y7 R-Y7 R1a1a1b2a1a +HG04219 R-Y7 R-Y7 R1a1a1b2a1a +NA20846 R-Y7 R-Y7 R1a1a1b2a1a +NA20850 R-Y7 R-Y7 R1a1a1b2a1a +NA20891 R-Y7 R-Y7 R1a1a1b2a1a +NA21095 R-Y7 R-Y7 R1a1a1b2a1a +NA21114 R-Y7 R-Y7 R1a1a1b2a1a +NA21115 R-Y7 R-Y7 R1a1a1b2a1a +NA21129 R-Y7 R-Y7 R1a1a1b2a1a +HG02597 R-Y6 R-Y6 R1a1a1b2a1b +HG02780 R-Y6 R-Y6 R1a1a1b2a1b +HG03585 R-Y6 R-Y6 R1a1a1b2a1b +HG03603 R-Y6 R-Y6 R1a1a1b2a1b +HG03685 R-Y6 R-Y6 R1a1a1b2a1b +HG03711 R-Y6 R-Y6 R1a1a1b2a1b +HG03713 R-Y6 R-Y6 R1a1a1b2a1b +HG03715 R-Y6 R-Y6 R1a1a1b2a1b +HG03744 R-Y6 R-Y6 R1a1a1b2a1b +HG03775 R-Y6 R-Y6 R1a1a1b2a1b +HG03803 R-Y6 R-Y6 R1a1a1b2a1b +HG03833 R-Y6 R-Y6 R1a1a1b2a1b +HG03866 R-Y6 R-Y6 R1a1a1b2a1b +HG03871 R-Y6 R-Y6 R1a1a1b2a1b +HG04017 R-Y6 R-Y6 R1a1a1b2a1b +HG04229 R-Y6 R-Y6 R1a1a1b2a1b +NA20852 R-Y6 R-Y6 R1a1a1b2a1b +NA20890 R-Y6 R-Y6 R1a1a1b2a1b +NA20898 R-Y6 R-Y6 R1a1a1b2a1b +NA21098 R-Y6 R-Y6 R1a1a1b2a1b +NA21104 R-Y6 R-Y6 R1a1a1b2a1b +HG03636 R-Z2123 R-Z2123 R1a1a1b2a2a +HG03687 R-Z2123 R-Z2123 R1a1a1b2a2a +HG03740 R-Z2123 R-Z2123 R1a1a1b2a2a +HG03743 R-Z2123 R-Z2123 R1a1a1b2a2a +HG03887 R-Z2123 R-Z2123 R1a1a1b2a2a +HG03960 R-Z2123 R-Z2123 R1a1a1b2a2a +HG03963 R-Z2123 R-Z2123 R1a1a1b2a2a +HG03999 R-Z2123 R-Z2123 R1a1a1b2a2a +HG04019 R-Z2123 R-Z2123 R1a1a1b2a2a +HG04020 R-Z2123 R-Z2123 R1a1a1b2a2a +HG04023 R-Z2123 R-Z2123 R1a1a1b2a2a +HG04096 R-Z2123 R-Z2123 R1a1a1b2a2a +HG04131 R-Z2123 R-Z2123 R1a1a1b2a2a +NA20864 R-Z2123 R-Z2123 R1a1a1b2a2a +NA21092 R-Z2123 R-Z2123 R1a1a1b2a2a +HG01617 R-CTS6 R-CTS6 R1a1a1b2a2b1a 
+HG01982 R-Y57 R-Y57 R1a1a1b2a2b2 +HG00640 R-L278 R-P25_1 R1b1 +HG01947 R-L278 R-P25_1 R1b1 +HG01678 R-Z8056 R-Z8056 R1b1a2a1a1a +HG00108 R-Z372 R-Z372 R1b1a2a1a1b1a +HG00260 R-Z372 R-Z372 R1b1a2a1a1b1a +NA12842 R-S5741 R-S5741 R1b1a2a1a1b1a1a +NA12872 R-S5741 R-S5741 R1b1a2a1a1b1a1a +NA12827 R-DF95 R-DF95 R1b1a2a1a1b1b +HG00256 R-Z156 R-Z156 R1b1a2a1a1c1 +HG02238 R-Z156 R-Z156 R1b1a2a1a1c1 +HG00155 R-L1 R-L1 R1b1a2a1a1c1a2a +HG02233 R-Z301 R-Z301 R1b1a2a1a1c2 +NA12272 R-Z301 R-Z301 R1b1a2a1a1c2 +NA20356 R-Z301 R-Z301 R1b1a2a1a1c2 +HG00265 R-L45 R-L45 R1b1a2a1a1c2b1a1a1 +HG00131 R-Z159 R-Z159 R1b1a2a1a1c2b1b +HG02545 R-Z159 R-Z159 R1b1a2a1a1c2b1b +NA06986 R-Z159 R-Z159 R1b1a2a1a1c2b1b +NA12760 R-Z2 R-Z2 R1b1a2a1a1c2b2a1 +NA12775 R-Z8 R-Z8 R1b1a2a1a1c2b2a1a1 +NA07048 R-Z12 R-Z12 R1b1a2a1a1c2b2a1a1a1 +NA19922 R-Z12 R-Z12 R1b1a2a1a1c2b2a1a1a1 +HG00143 R-Z6 R-Z6 R1b1a2a1a1c2b2a1a1b1a +HG00149 R-S18951 R-S18951 R1b1a2a1a1c2b2a1a1b1a1 +HG00251 R-Z343 R-Z343 R1b1a2a1a1c2b2a1a1b2a +HG00267 R-CTS7080 R-CTS7080 R1b1a2a1a1c2b2a1a1b2a1a +HG02281 R-DF102 R-DF102 R1b1a2a1a1c2b2a1a1b2b1 +NA20586 R-CTS10893 R-CTS10893 R1b1a2a1a1c2b2a1a2 +NA20524 R-Z326 R-Z326 R1b1a2a1a1c2b2b1a +HG01550 R-Z319 R-CTS2509 R1b1a2a1a1c2b2b1a1 +HG02343 R-Z319 R-CTS2509 R1b1a2a1a1c2b2b1a1 +HG00157 R-P312 R-P312 R1b1a2a1a2 +HG01082 R-P312 R-P312 R1b1a2a1a2 +HG01170 R-P312 R-P312 R1b1a2a1a2 +HG01334 R-P312 R-P312 R1b1a2a1a2 +HG01682 R-P312 R-P312 R1b1a2a1a2 +HG01917 R-P312 R-P312 R1b1a2a1a2 +NA20755 R-P312 R-P312 R1b1a2a1a2 +HG00264 R-DF27 R-DF27 R1b1a2a1a2a +HG00739 R-DF27 R-DF27 R1b1a2a1a2a +HG01063 R-DF27 R-DF27 R1b1a2a1a2a +HG01085 R-DF27 R-DF27 R1b1a2a1a2a +HG01121 R-DF27 R-DF27 R1b1a2a1a2a +HG01182 R-DF27 R-DF27 R1b1a2a1a2a +HG01241 R-DF27 R-DF27 R1b1a2a1a2a +HG01247 R-DF27 R-DF27 R1b1a2a1a2a +HG01341 R-DF27 R-DF27 R1b1a2a1a2a +HG01353 R-DF27 R-DF27 R1b1a2a1a2a +HG01395 R-DF27 R-DF27 R1b1a2a1a2a +HG01398 R-DF27 R-DF27 R1b1a2a1a2a +HG01413 R-DF27 R-DF27 R1b1a2a1a2a +HG01443 R-DF27 R-DF27 R1b1a2a1a2a +HG01461 R-DF27 R-DF27 R1b1a2a1a2a +HG01491 R-DF27 R-DF27 R1b1a2a1a2a +HG01506 R-DF27 R-DF27 R1b1a2a1a2a +HG01518 R-DF27 R-DF27 R1b1a2a1a2a +HG01606 R-DF27 R-DF27 R1b1a2a1a2a +HG01615 R-DF27 R-DF27 R1b1a2a1a2a +HG01705 R-DF27 R-DF27 R1b1a2a1a2a +HG01708 R-DF27 R-DF27 R1b1a2a1a2a +HG01771 R-DF27 R-DF27 R1b1a2a1a2a +HG01775 R-DF27 R-DF27 R1b1a2a1a2a +HG01783 R-DF27 R-DF27 R1b1a2a1a2a +HG01785 R-DF27 R-DF27 R1b1a2a1a2a +HG01789 R-DF27 R-DF27 R1b1a2a1a2a +HG01879 R-DF27 R-DF27 R1b1a2a1a2a +HG02002 R-DF27 R-DF27 R1b1a2a1a2a +NA12347 R-DF27 R-DF27 R1b1a2a1a2a +NA12812 R-DF27 R-DF27 R1b1a2a1a2a +NA12814 R-DF27 R-DF27 R1b1a2a1a2a +NA20770 R-DF27 R-DF27 R1b1a2a1a2a +NA20519 R-Z195 R-Z195 R1b1a2a1a2a1 +HG01271 R-Z209 R-Z209 R1b1a2a1a2a1a1 +HG01524 R-Z209 R-Z209 R1b1a2a1a2a1a1 +HG01700 R-Z209 R-Z209 R1b1a2a1a2a1a1 +HG01765 R-Z209 R-Z209 R1b1a2a1a2a1a1 +HG01479 R-Z295 R-Z295 R1b1a2a1a2a1a1a +NA12874 R-Z295 R-Z295 R1b1a2a1a2a1a1a +HG01286 R-Z278 R-Z278 R1b1a2a1a2a1a1a1 +HG01440 R-Z278 R-Z278 R1b1a2a1a2a1a1a1 +HG01608 R-Z278 R-Z278 R1b1a2a1a2a1a1a1 +HG01630 R-Z278 R-Z278 R1b1a2a1a2a1a1a1 +HG01669 R-Z278 R-Z278 R1b1a2a1a2a1a1a1 +HG00637 R-Z214 R-Z214 R1b1a2a1a2a1a1a1a +HG01173 R-Z214 R-Z214 R1b1a2a1a2a1a1a1a +HG01259 R-Z214 R-Z214 R1b1a2a1a2a1a1a1a +HG01488 R-Z214 R-Z214 R1b1a2a1a2a1a1a1a +HG01603 R-Z214 R-Z214 R1b1a2a1a2a1a1a1a +HG01624 R-Z214 R-Z214 R1b1a2a1a2a1a1a1a +HG02221 R-Z214 R-Z214 R1b1a2a1a2a1a1a1a +NA19750 R-Z214 R-Z214 R1b1a2a1a2a1a1a1a +HG01047 R-M153 R-M153 R1b1a2a1a2a1a1a1a1 +HG01280 R-M153 R-M153 R1b1a2a1a2a1a1a1a1 +HG01521 
R-M153 R-M153 R1b1a2a1a2a1a1a1a1 +HG01932 R-M153 R-M153 R1b1a2a1a2a1a1a1a1 +NA20518 R-CTS4065 R-CTS4065 R1b1a2a1a2a1a1a2 +HG00141 R-DF17 R-DF17 R1b1a2a1a2a1a2 +HG01392 R-DF17 R-DF17 R1b1a2a1a2a1a2 +HG01631 R-DF17 R-DF17 R1b1a2a1a2a1a2 +NA20752 R-DF17 R-DF17 R1b1a2a1a2a1a2 +HG00112 R-Z198 R-Z198 R1b1a2a1a2a1b +HG01148 R-Z198 R-Z198 R1b1a2a1a2a1b +HG01365 R-Z292 R-Z292 R1b1a2a1a2a1b1 +HG00126 R-M167 R-M167 R1b1a2a1a2a1b1a1 +HG01359 R-M167 R-M167 R1b1a2a1a2a1b1a1 +HG01374 R-M167 R-M167 R1b1a2a1a2a1b1a1 +HG01464 R-M167 R-M167 R1b1a2a1a2a1b1a1 +HG01791 R-M167 R-M167 R1b1a2a1a2a1b1a1 +NA12716 R-M167 R-M167 R1b1a2a1a2a1b1a1 +NA19741 R-M167 R-M167 R1b1a2a1a2a1b1a1 +NA19777 R-M167 R-M167 R1b1a2a1a2a1b1a1 +NA20342 R-M167 R-M167 R1b1a2a1a2a1b1a1 +HG01075 R-CTS4188 R-CTS4188 R1b1a2a1a2a1b3 +HG01747 R-CTS4188 R-CTS4188 R1b1a2a1a2a1b3 +HG01761 R-CTS4188 R-CTS4188 R1b1a2a1a2a1b3 +NA19756 R-CTS4188 R-CTS4188 R1b1a2a1a2a1b3 +HG00731 R-Z225 R-Z225 R1b1a2a1a2a5 +HG00742 R-Z225 R-Z225 R1b1a2a1a2a5 +HG01048 R-Z225 R-Z225 R1b1a2a1a2a5 +HG01176 R-Z225 R-Z225 R1b1a2a1a2a5 +HG01350 R-Z225 R-Z225 R1b1a2a1a2a5 +HG01389 R-Z225 R-Z225 R1b1a2a1a2a5 +HG01694 R-Z225 R-Z225 R1b1a2a1a2a5 +HG01709 R-Z225 R-Z225 R1b1a2a1a2a5 +HG00107 R-Z2571 R-Z2571 R1b1a2a1a2a6 +HG01577 R-Z2571 R-Z2571 R1b1a2a1a2a6 +NA19762 R-Z2571 R-Z2571 R1b1a2a1a2a6 +HG00129 R-U152 R-U152 R1b1a2a1a2b +HG00145 R-U152 R-U152 R1b1a2a1a2b +HG01060 R-U152 R-U152 R1b1a2a1a2b +HG01383 R-U152 R-U152 R1b1a2a1a2b +HG01941 R-U152 R-U152 R1b1a2a1a2b +NA07357 R-U152 R-U152 R1b1a2a1a2b +NA12144 R-U152 R-U152 R1b1a2a1a2b +NA19661 R-U152 R-U152 R1b1a2a1a2b +NA19679 R-U152 R-U152 R1b1a2a1a2b +NA19685 R-U152 R-U152 R1b1a2a1a2b +NA19720 R-U152 R-U152 R1b1a2a1a2b +NA20512 R-U152 R-U152 R1b1a2a1a2b +NA20538 R-U152 R-U152 R1b1a2a1a2b +NA20754 R-U152 R-U152 R1b1a2a1a2b +NA20792 R-U152 R-U152 R1b1a2a1a2b +NA20798 R-U152 R-U152 R1b1a2a1a2b +NA20803 R-U152 R-U152 R1b1a2a1a2b +NA20806 R-U152 R-U152 R1b1a2a1a2b +HG01356 R-L20 R-L20 R1b1a2a1a2b1a1 +HG01767 R-L20 R-L20 R1b1a2a1a2b1a1 +NA12005 R-L20 R-L20 R1b1a2a1a2b1a1 +NA19649 R-L20 R-L20 R1b1a2a1a2b1a1 +NA20515 R-L20 R-L20 R1b1a2a1a2b1a1 +HG02262 R-Z35 R-Z35 R1b1a2a1a2b1a2a +NA20759 R-Z35 R-Z35 R1b1a2a1a2b1a2a +NA07347 R-Z275 R-Z275 R1b1a2a1a2b1a2a1 +NA11994 R-Z275 R-Z275 R1b1a2a1a2b1a2a1 +HG01536 R-Z51 R-Z51 R1b1a2a1a2b1c1a +HG01625 R-Z51 R-Z51 R1b1a2a1a2b1c1a +HG00142 R-CTS11232 R-CTS11232 R1b1a2a1a2b1c1a1a1a1 +NA12829 R-CTS278 R-CTS278 R1b1a2a1a2b1c1a1a1a2 +NA20812 R-CTS9490 R-CTS9490 R1b1a2a1a2b1c1b1 +HG00244 R-CTS8125 R-CTS8125 R1b1a2a1a2b1c1b1a2 +HG01777 R-Y3141 R-Y3141 R1b1a2a1a2b1c1b3a +HG00736 R-S8172 R-S8172 R1b1a2a1a2b1c2b1a2 +HG02219 R-PF6601 R-PF6601 R1b1a2a1a2b3 +HG02231 R-PF6601 R-PF6601 R1b1a2a1a2b3 +NA12342 R-PF6601 R-PF6601 R1b1a2a1a2b3 +NA19652 R-PF6601 R-PF6601 R1b1a2a1a2b3 +NA20810 R-PF6601 R-PF6601 R1b1a2a1a2b3 +NA20509 R-Z145 R-Z145 R1b1a2a1a2b3c +NA20581 R-Z145 R-Z145 R1b1a2a1a2b3c +NA20814 R-Z145 R-Z145 R1b1a2a1a2b3c +HG00115 R-CTS241 R-CTS241 R1b1a2a1a2c1 +HG00119 R-CTS8221 R-CTS241 R1b1a2a1a2c1 +HG01953 R-CTS241 R-CTS241 R1b1a2a1a2c1 +HG02051 R-CTS8221 R-CTS241 R1b1a2a1a2c1 +NA12045 R-CTS241 R-CTS241 R1b1a2a1a2c1 +NA19658 R-CTS8221 R-CTS241 R1b1a2a1a2c1 +NA12154 R-DF23 R-DF23 R1b1a2a1a2c1a1a1 +HG02501 R-Z2961 R-Z2961 R1b1a2a1a2c1a1a1a +NA20318 R-S659 R-S660 R1b1a2a1a2c1a1a1a1a1a +NA12399 R-CTS6621 R-CTS6621 R1b1a2a1a2c1b2a +HG00103 R-CTS3087 R-CTS3087 R1b1a2a1a2c1b2b1 +HG01079 R-Y3550 R-Y3550 R1b1a2a1a2c1c +NA20278 R-CTS3974 R-CTS4466 R1b1a2a1a2c1c1b +NA12340 R-CTS1751 R-CTS1751 R1b1a2a1a2c1d +HG00246 
R-Z255 R-Z255 R1b1a2a1a2c1e +NA12762 R-Z255 R-Z255 R1b1a2a1a2c1e +HG01136 R-Z253 R-Z253 R1b1a2a1a2c1f +NA19717 R-Z253 R-Z253 R1b1a2a1a2c1f +HG01500 R-L1066.1 R-L1066.1 R1b1a2a1a2c1f2c1a +NA06984 R-CTS4296 R-CTS4296 R1b1a2a1a2c1f2c1a1b1b +HG01405 R-DF73 R-DF73 R1b1a2a1a2c1f2d1 +HG01503 R-DF73 R-DF73 R1b1a2a1a2c1f2d1 +HG01675 R-DF73 R-DF73 R1b1a2a1a2c1f2d1 +HG00105 R-BY157 R-BY157 R1b1a2a1a2c1f5a1a +NA12043 R-S307 R-S190 R1b1a2a1a2c1g1b1a +HG00116 R-Z16539 R-Z16539 R1b1a2a1a2c1g2a1a2b +HG00242 R-FGC15498 R-FGC15498 R1b1a2a1a2c1g2a1a4 +NA10851 R-A874 R-A874 R1b1a2a1a2c1i4 +NA11831 R-Z251 R-Z251 R1b1a2a1a2c1j +NA19789 R-Z251 R-Z251 R1b1a2a1a2c1j +HG00096 R-CTS3057 R-L1065 R1b1a2a1a2c1k1 +HG02014 R-CTS300 R-CTS300 R1b1a2a1a2c2 +HG00243 R-CTS6919 R-CTS6919 R1b1a2a1a2c2a +HG00139 R-DF19 R-DF19 R1b1a2a1a2e +HG00138 R-Z302 R-Z302 R1b1a2a1a2e2 +NA12889 R-Z302 R-Z302 R1b1a2a1a2e2 +HG02008 R-DF99 R-DF99 R1b1a2a1a2f +NA20783 R-DF99 R-DF99 R1b1a2a1a2f +HG00148 R-CTS4528 R-CTS4528 R1b1a2a1a3 +NA20785 R-PF7589 R-PF7589 R1b1a2a1b +HG01066 R-CTS11824 R-CTS11824 R1b1a2a1b1a +NA20532 R-CTS1078 R-CTS1078 R1b1a2a2 +NA18645 R-Z2106 R-Z2106 R1b1a2a2c +NA20866 R-CTS1843 R-CTS1843 R1b1a2a2c1 +HG01277 R-CTS7822 R-CTS7822 R1b1a2a2c1a +HG01515 R-Y5587 R-Y5587 R1b1a2a2c1a1a1 +HG01586 R-L266 R-L266 R2a +HG02657 R-L266 R-L266 R2a +HG02783 R-L266 R-L266 R2a +HG03691 R-L266 R-L266 R2a +HG03702 R-L266 R-L266 R2a +HG03718 R-L266 R-L266 R2a +HG03727 R-L266 R-L266 R2a +HG03729 R-L266 R-L266 R2a +HG03771 R-L266 R-L266 R2a +HG03844 R-L266 R-L266 R2a +HG03856 R-L266 R-L266 R2a +HG03869 R-L266 R-L266 R2a +HG03967 R-L266 R-L266 R2a +HG03974 R-L266 R-L266 R2a +HG03978 R-L266 R-L266 R2a +HG04056 R-L266 R-L266 R2a +HG04152 R-L266 R-L266 R2a +HG04158 R-L266 R-L266 R2a +HG04225 R-L266 R-L266 R2a +NA20845 R-L266 R-L266 R2a +NA20887 R-L266 R-L266 R2a +NA20895 R-L266 R-L266 R2a +NA20903 R-L266 R-L266 R2a +NA21105 R-L266 R-L266 R2a +HG02660 R-L294.1 R-L294.1 R2a1a +HG03015 R-L294.1 R-L294.1 R2a1a +HG03773 R-L294.1 R-L294.1 R2a1a +HG04022 R-L294.1 R-L294.1 R2a1a +HG04182 R-L294.1 R-L294.1 R2a1a diff --git a/tests/fixtures/output/haplogroups.1000Y.subset.txt b/tests/fixtures/output/haplogroups.1000Y.subset.txt new file mode 100644 index 0000000..a054889 --- /dev/null +++ b/tests/fixtures/output/haplogroups.1000Y.subset.txt @@ -0,0 +1,35 @@ +HG02982 A0-V153 A0-V153 A0a1 +HG01890 A0-L1038 A0-L92.2 A0b +HG02645 A1a-M31 A1a-M31 A1a +NA19043 B-M109 B-M109 B2a1a +HG03225 B-L1389 B-L1387 B3 +NA19239 E-CTS1792 E-CTS1792 E1a2a1a1a +HG01187 E-M58 E-M58 E1b1a1a1a +NA19031 E-P116 E-P116 E1b1a1a1c1a1c1a +HG02573 E-Z6019 E-Z6019 E1b1a1a1c2c3c +HG03484 E-U290 E-U290 E1b1a1a1d1a +NA19035 E-M200 E-M200 E2b1a1 +NA18960 D-CTS6609 D-CTS6609 D1b1d +NA18971 C-Z1356 C-Z1356 C1a1a2 +NA21091 C-Z5898 C-Z5898 C1b1a1a1a1a1a1a1 +NA18620 C-M4236 C-F3735 C2e1b1a +HG02040 F-M89 F-M89 F +HG01311 G-Z6800 G-Z6800 G2a2a1a2a1b +HG03778 H-M2716 H-M2716 H1a1d2 +HG04198 H-Apt H-Apt H1b1 +HG03792 H-Z5862 H-Z5862 H3a2a2 +HG02420 I-L338 I-L338 I1a2a1a1a1 +HG02470 I-Z2069 I-Y3681 I2a2a1a2a1 +HG01437 J-P58 J-P58 J1a2b +NA20513 J-M92 J-M92 J2a1b1 +HG03990 L-M27 L-M27 L1a1 +HG01190 T-CTS6280 T-CTS6280 T1a1a1a1a1b +HG00188 N-Z1940 N-Z1940 N1c1a1a2a1a1 +HG02396 O-F656 O-F656 O1a1a1a1a1a +HG02067 O-CTS1936 O-CTS1936 O2a1a2a +HG01938 Q-M3 Q-M3 Q1a2a1a1 +NA19795 Q-M971 Q-M971 Q1a2a1b +HG03864 Q-Y1170 Q-Y1150 Q1b2 +HG03743 R-Z2123 R-Z2123 R1a1a1b2a2a +HG01669 R-Z278 R-Z278 R1b1a2a1a2a1a1a1 +HG03015 R-L294.1 R-L294.1 R2a1a diff --git a/tests/fixtures/output/haplogroups.HG01938.txt 
b/tests/fixtures/output/haplogroups.HG01938.txt new file mode 100644 index 0000000..ba14a3e --- /dev/null +++ b/tests/fixtures/output/haplogroups.HG01938.txt @@ -0,0 +1 @@ +HG01938 Q-M3 Q-M3 Q1a2a1a1 diff --git a/tests/fixtures/utils.sh b/tests/fixtures/utils.sh new file mode 100644 index 0000000..0fd7901 --- /dev/null +++ b/tests/fixtures/utils.sh @@ -0,0 +1,8 @@ +function echo_run { + # Echo and run a command. + # Usage: echo_run "command [options] [arguments]" + + command=$1 + echo -e "\n$ ${command}" + eval ${command} +} diff --git a/tests/test_calling_text.py b/tests/test_calling_text.py new file mode 100644 index 0000000..e29a251 --- /dev/null +++ b/tests/test_calling_text.py @@ -0,0 +1,15 @@ +from tests.common import ( + GENOTYPES_1000Y_SUBSET_TEXT_FP, + HAPLOGROUPS_1000Y_SUBSET_FP, + load_haplogroup_df, +) +from yhaplo.api.call_haplogroups import call_haplogroups +from yhaplo.api.command_line_args import get_command_line_arg_defaults + + +def test_text_input_1000y_subset(): + command_line_args = get_command_line_arg_defaults() + command_line_args.data_fp = GENOTYPES_1000Y_SUBSET_TEXT_FP + haplogroup_df = call_haplogroups(command_line_args, suppress_output=True) + expected_haplogroups_df = load_haplogroup_df(HAPLOGROUPS_1000Y_SUBSET_FP) + assert haplogroup_df.equals(expected_haplogroups_df) diff --git a/tests/test_calling_vcf.py b/tests/test_calling_vcf.py new file mode 100644 index 0000000..ce4b2ea --- /dev/null +++ b/tests/test_calling_vcf.py @@ -0,0 +1,39 @@ +import pytest + +from tests.common import ( + GENOTYPES_1000Y_ALL_BCF_FP, + GENOTYPES_1000Y_ONE_VCF_FP, + GENOTYPES_1000Y_SUBSET_BCF_FP, + HAPLOGROUPS_1000Y_ALL_FP, + HAPLOGROUPS_1000Y_ONE_FP, + HAPLOGROUPS_1000Y_SUBSET_FP, + load_haplogroup_df, +) +from yhaplo.api.call_haplogroups import call_haplogroups +from yhaplo.api.command_line_args import get_command_line_arg_defaults + + +def test_vcf_input_1000y_single_sample(): + command_line_args = get_command_line_arg_defaults() + command_line_args.data_fp = GENOTYPES_1000Y_ONE_VCF_FP + haplogroup_df = call_haplogroups(command_line_args, suppress_output=True) + expected_haplogroups_df = load_haplogroup_df(HAPLOGROUPS_1000Y_ONE_FP) + assert haplogroup_df.equals(expected_haplogroups_df) + + +@pytest.mark.skip("Large fixture. See: ./tests/fixtures/generate_bcf_fixtures.sh") +def test_bcf_input_1000y_subset(): + command_line_args = get_command_line_arg_defaults() + command_line_args.data_fp = GENOTYPES_1000Y_SUBSET_BCF_FP + haplogroup_df = call_haplogroups(command_line_args, suppress_output=True) + expected_haplogroups_df = load_haplogroup_df(HAPLOGROUPS_1000Y_SUBSET_FP) + assert haplogroup_df.equals(expected_haplogroups_df) + + +@pytest.mark.skip("Large fixture. 
See: ./tests/fixtures/generate_bcf_fixtures.sh")
+def test_bcf_input_1000y_all():
+    command_line_args = get_command_line_arg_defaults()
+    command_line_args.data_fp = GENOTYPES_1000Y_ALL_BCF_FP
+    haplogroup_df = call_haplogroups(command_line_args, suppress_output=True)
+    expected_haplogroups_df = load_haplogroup_df(HAPLOGROUPS_1000Y_ALL_FP)
+    assert haplogroup_df.equals(expected_haplogroups_df)
diff --git a/tests/test_load_module.py b/tests/test_load_module.py
new file mode 100644
index 0000000..25c46f5
--- /dev/null
+++ b/tests/test_load_module.py
@@ -0,0 +1,5 @@
+import yhaplo  # noqa: F401
+
+
+def test_load_module():
+    assert True
diff --git a/tests/test_utils_loaders.py b/tests/test_utils_loaders.py
new file mode 100644
index 0000000..4dd946e
--- /dev/null
+++ b/tests/test_utils_loaders.py
@@ -0,0 +1,35 @@
+import pytest
+
+from yhaplo.config import Config
+from yhaplo.utils.loaders import DataFile, TtamFileNotFoundError, load_data
+
+
+def test_load_data():
+    load_data(Config.primary_tree_data_file)
+
+
+def test_load_data_missing():
+    with pytest.raises(ModuleNotFoundError):
+        load_data(DataFile("foo", "bar"))
+
+    with pytest.raises(FileNotFoundError):
+        load_data(
+            DataFile(
+                Config.primary_tree_data_file.data_subdir,
+                "foo",
+            )
+        )
+
+
+def test_load_data_missing_ttam():
+    with pytest.raises(TtamFileNotFoundError):
+        load_data(DataFile("foo", "bar", ttam_only=True))
+
+    with pytest.raises(TtamFileNotFoundError):
+        load_data(
+            DataFile(
+                Config.primary_tree_data_file.data_subdir,
+                "foo",
+                ttam_only=True,
+            )
+        )
diff --git a/tests/test_utils_optional_dependencies.py b/tests/test_utils_optional_dependencies.py
new file mode 100644
index 0000000..46a4b82
--- /dev/null
+++ b/tests/test_utils_optional_dependencies.py
@@ -0,0 +1,16 @@
+import pytest
+
+from yhaplo.utils.optional_dependencies import optional_import_error_message
+
+
+def test_optional_import_error():
+    with pytest.raises(ImportError, match="pip"):
+        try:
+            from uninstalled_module import some_function  # noqa: F401
+        except ImportError as error:
+            error.msg = error.msg + optional_import_error_message(
+                "package name",
+                "do stuff",
+                "optional dependency category",
+            )
+            raise error
diff --git a/tests/test_utils_vcf.py b/tests/test_utils_vcf.py
new file mode 100644
index 0000000..39c2bf3
--- /dev/null
+++ b/tests/test_utils_vcf.py
@@ -0,0 +1,40 @@
+import pytest
+
+from yhaplo.utils.vcf import check_vcf_index
+
+
+def test_check_vcf_index_vcf(tmpdir):
+    vcf_fn = "foo.vcf.gz"
+    vcf_path = tmpdir.join(vcf_fn)
+    vcf_path.write("")
+    with pytest.raises(FileNotFoundError, match="VCF index"):
+        check_vcf_index(str(vcf_path))
+
+    tmpdir.join(f"{vcf_fn}.tbi").write("")
+    check_vcf_index(str(vcf_path))
+
+
+def test_check_vcf_index_bcf(tmpdir):
+    bcf_fn = "foo.bcf"
+    bcf_path = tmpdir.join(bcf_fn)
+    bcf_path.write("")
+    with pytest.raises(FileNotFoundError, match="BCF index"):
+        check_vcf_index(str(bcf_path))
+
+    tmpdir.join(f"{bcf_fn}.csi").write("")
+    check_vcf_index(str(bcf_path))
+
+
+def test_check_vcf_index_vcf_no_vcf(tmpdir):
+    vcf_fn = "foo.vcf"
+    vcf_path = tmpdir.join(vcf_fn)
+    with pytest.raises(FileNotFoundError, match="VCF/BCF file not found"):
+        check_vcf_index(str(vcf_path))
+
+
+def test_check_vcf_index_vcf_bad_extension(tmpdir):
+    vcf_fn = "foo.vcf"
+    vcf_path = tmpdir.join(vcf_fn)
+    vcf_path.write("")
+    with pytest.raises(ValueError, match="extension"):
+        check_vcf_index(str(vcf_path))
diff --git a/yhaplo.manual.2020.08.12.pdf b/yhaplo.manual.2020.08.12.pdf
deleted file mode 100644
index c654146..0000000
Binary files a/yhaplo.manual.2020.08.12.pdf and /dev/null differ diff --git a/yhaplo/__init__.py b/yhaplo/__init__.py index ed30332..942c89e 100644 --- a/yhaplo/__init__.py +++ b/yhaplo/__init__.py @@ -1,5 +1,9 @@ -import os +"""Yhaplo.""" -version_fn = os.path.join(os.path.dirname(os.path.realpath(__file__)), "version.txt") +import warnings +from importlib.metadata import PackageNotFoundError, version -__version__ = open(version_fn).readline().strip() +try: + __version__ = version(__package__) +except PackageNotFoundError: + warnings.warn(f"{__package__} version unavailable. Is it installed?", Warning) diff --git a/yhaplo/api/__init__.py b/yhaplo/api/__init__.py new file mode 100644 index 0000000..167292b --- /dev/null +++ b/yhaplo/api/__init__.py @@ -0,0 +1 @@ +"""Application programming interface.""" diff --git a/yhaplo/api/call_haplogroups.py b/yhaplo/api/call_haplogroups.py new file mode 100644 index 0000000..ddfcfa0 --- /dev/null +++ b/yhaplo/api/call_haplogroups.py @@ -0,0 +1,72 @@ +"""Call haplogroups.""" + +import argparse +import logging +from collections.abc import Mapping +from typing import Optional, Union + +import numpy as np +import pandas as pd + +from yhaplo.config import IID_TYPE, Config +from yhaplo.sample import call_haplogroups_from_config + + +def call_haplogroups( + command_line_args: Optional[argparse.Namespace] = None, + iid_to_ablock: Optional[Mapping[IID_TYPE, Union[bytes, np.ndarray]]] = None, + iid_to_platforms: Optional[Mapping[IID_TYPE, str]] = None, + suppress_output: bool = False, + out_dir: Optional[str] = None, + all_aux_output: bool = False, + root_logger: Optional[logging.Logger] = None, +) -> pd.DataFrame: + """Configure run, build tree, and call haplogroups. + + Parameters + ---------- + command_line_args : argparse.Namespace | None, optional + Command-line arguments. + iid_to_ablock : Mapping[IID_TYPE, bytes | np.ndarray] | None, optional + Mapping of individual identifiers to 23andMe ablocks. + iid_to_platforms : Mapping[IID_TYPE, str] | None, optional + Mapping of individual identifier to comma-separated string of + 23andMe platforms, each starting with "v". + suppress_output : bool, optional + When True, do not generate output files. + out_dir : str | None, optional + Output directory. + When not None, override command_line_args value. + all_aux_output : bool = False, optional + Generate all auxiliary output. + When True, override command_line_args value. + root_logger : logging.Logger | None, optional + If supplied, add a file handler. + To populate the log file, set the logging level to INFO or lower. + + Returns + ------- + haplogroup_df : pd.DataFrame + DataFrame of haplogroup calling results. + Index: Individual identifier. + Columns: + - hg_snp_obs: Haplogroup using a variant of representative-SNP form. + Rather than using one representative SNP per haplogroup, + use the most highly ranked SNP this individual was observed + to possess in the derived state. + - hg_snp: Haplogroup in representative-SNP form (e.g., "Q-M3"). + - ycc_haplogroup: Haplogroup using YCC nomenclature (e.g., "Q1a2a1a1"). 
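+
+    Examples
+    --------
+    A minimal usage sketch, mirroring the automated tests. The input
+    path here is hypothetical; any supported input type (BCF, indexed
+    VCF, or sample-major text) is handled the same way:
+
+    >>> from yhaplo.api.call_haplogroups import call_haplogroups
+    >>> from yhaplo.api.command_line_args import get_command_line_arg_defaults
+    >>> command_line_args = get_command_line_arg_defaults()
+    >>> command_line_args.data_fp = "cohort.vcf.gz"  # Hypothetical input path
+    >>> haplogroup_df = call_haplogroups(command_line_args, suppress_output=True)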
+ + """ + config = Config( + command_line_args=command_line_args, + iid_to_ablock=iid_to_ablock, + iid_to_platforms=iid_to_platforms, + suppress_output=suppress_output, + out_dir=out_dir, + all_aux_output=all_aux_output, + root_logger=root_logger, + ) + haplogroup_df = call_haplogroups_from_config(config) + + return haplogroup_df diff --git a/yhaplo/api/command_line_args.py b/yhaplo/api/command_line_args.py new file mode 100644 index 0000000..59b5bba --- /dev/null +++ b/yhaplo/api/command_line_args.py @@ -0,0 +1,303 @@ +"""Command-line arguments.""" + +import argparse + +from yhaplo import __version__ + +DESCRIPTION = """ +Yhaplo identifies the Y-chromosome haplogroup of each male in a sample of one +to millions. Sequence data will yield the most highly resolved classifications, +but the algorithm also works well with chip-based genotype data, provided a +reasonable number of phylogenetically informative sites have been assayed. +""" + +ANC_STOP_THRESH_DEFAULT = 2 # BFS stopping condition parameter default +DER_COLLAPSE_THRESH_DEFAULT = 2 # BFS collapsing parameter default + + +def get_command_line_args(set_defaults: bool = False) -> argparse.Namespace: + """Process command-line arguments or set defaults. + + Parameters + ---------- + set_defaults : bool + When True, ignore command-line arguments and return defaults. + + Returns + ------- + args : argparse.Namespace + Command-line options and arguments. + + """ + parser = argparse.ArgumentParser( + description=DESCRIPTION, + formatter_class=RawTextWithDefaultsHelpFormatter, + ) + parser.add_argument( + "-v", + "--version", + action="version", + version=f"yhaplo {__version__}", + ) + + group = parser.add_argument_group("Input") + group.add_argument( + "-i", + "--input", + dest="data_fp", + metavar="file_name", + help="Input file. Formats:\n" + "* Indexed BCF: .bcf, .bcf.csi\n" + "* Indexed VCF: .vcf.gz, .vcf.gz.tbi\n" + "* Sample-major text: .genos.txt or .genos.txt.gz\n" + " Row 0: Physical coordinates (GRCh37)\n" + " Column 0: Individual identifiers\n" + " Cell (i, j): Genotype for individual i at position j.\n" + ' Values include {"A", "C", "G", "T", "."}, \n' + ' with "." 
indicating an unobserved value.',
+    )
+
+    group = parser.add_argument_group("Output")
+    group.add_argument(
+        "-o",
+        "--out_dir",
+        dest="out_dir",
+        metavar="dir_name",
+        help="Output directory",
+    )
+
+    group = parser.add_argument_group("Example data")
+    group.add_argument(
+        "-ex-txt",
+        "--example_text",
+        action="store_true",
+        help="Run yhaplo on a subset of 1000 Genomes data\n"
+        "and produce all auxiliary output",
+    )
+    group.add_argument(
+        "-ex-vcf",
+        "--example_vcf",
+        action="store_true",
+        help="Run yhaplo on a single-sample 1000 Genomes VCF\n"
+        "and produce all auxiliary output",
+    )
+
+    group = parser.add_argument_group(
+        "Auxiliary output",
+        "Generate files detailing haplogroup calling for each individual.",
+    )
+    group.add_argument(
+        "-aao",
+        "--all_aux_output",
+        action="store_true",
+        help="Generate all auxiliary output.\n"
+        "Equivalent to these seven options:\n"
+        "--anc_der_counts --haplogroup_paths --haplogroup_paths_detail\n"
+        "--der_snps --der_snps_detail --anc_snps --anc_snps_detail",
+    )
+    group.add_argument(
+        "-c",
+        "--anc_der_counts",
+        dest="write_anc_der_counts",
+        action="store_true",
+        help="Counts of ancestral and derived alleles encountered\n"
+        "at each node visited (omits nodes with zero of each)",
+    )
+    group.add_argument(
+        "-hp",
+        "--haplogroup_paths",
+        dest="write_haplogroup_paths",
+        action="store_true",
+        help="Sequence of branch labels from root to call,\n"
+        "with counts of derived SNPs observed",
+    )
+    group.add_argument(
+        "-hpd",
+        "--haplogroup_paths_detail",
+        dest="write_haplogroup_paths_detail",
+        action="store_true",
+        help="Sequence of branch labels from root to call,\n"
+        "with counts of derived SNPs observed and lists thereof",
+    )
+    group.add_argument(
+        "-ds",
+        "--der_snps",
+        dest="write_der_snps",
+        action="store_true",
+        help="Lists of derived SNPs on path",
+    )
+    group.add_argument(
+        "-dsd",
+        "--der_snps_detail",
+        dest="write_der_snps_detail",
+        action="store_true",
+        help="Detailed information about each derived SNP on path",
+    )
+    group.add_argument(
+        "-as",
+        "--anc_snps",
+        dest="write_anc_snps",
+        action="store_true",
+        help="Lists of ancestral SNPs encountered in search",
+    )
+    group.add_argument(
+        "-asd",
+        "--anc_snps_detail",
+        dest="write_anc_snps_detail",
+        action="store_true",
+        help="Detailed information about each ancestral SNP\n"
+        "encountered in search",
+    )
+
+    group = parser.add_argument_group(
+        "Real-time auxiliary output",
+        "Write haplogroup calling information as each individual is processed.",
+    )
+    group.add_argument(
+        "-rt",
+        "--write_real_time",
+        dest="write_haplogroups_real_time",
+        action="store_true",
+        help="Write haplogroups in real time. Includes DFS rank,\n"
+        "to enable subsequent sorting: sort -nk5",
+    )
+    group.add_argument(
+        "-hg",
+        "--hg_genos",
+        dest="haplogroup_to_list_genotypes_for",
+        metavar="haplogroup",
+        help="Write genotypes observed for SNPs associated with\n"
+        "a specified node of the tree, when it is visited",
+    )
+
+    group = parser.add_argument_group("Tree traversal")
+    group.add_argument(
+        "-b",
+        "--breadth_first",
+        dest="traverse_bf",
+        action="store_true",
+        help="Write breadth-first traversal",
+    )
+    group.add_argument(
+        "-d",
+        "--depth_first",
+        dest="traverse_df",
+        action="store_true",
+        help="Write depth-first (pre-order) traversal",
+    )
+    group.add_argument(
+        "-dt",
+        "--depth_first_table",
+        dest="write_tree_table",
+        action="store_true",
+        help="Write depth-first (pre-order) traversal table",
+    )
+    group.add_argument(
+        "-m",
+        "--mrca",
+        nargs=2,
+        dest="mrca_haplogroup_list",
+        metavar=("haplogroup1", "haplogroup2"),
+        help="Output MRCA of two haplogroups",
+    )
+    group.add_argument(
+        "-sq",
+        "--snp_query",
+        dest="query_snp_name",
+        metavar="snp_name",
+        help="List phylogenetic path for a query SNP",
+    )
+    group.add_argument(
+        "-pt",
+        "--platform_trees",
+        dest="write_platform_trees",
+        action="store_true",
+        help="23andMe: Write trees whose branch lengths are numbers\n"
+        "of platform sites",
+    )
+
+    group = parser.add_argument_group("Search parameters")
+    group.add_argument(
+        "-ast",
+        "--anc_stop_thresh",
+        dest="anc_stop_thresh",
+        metavar="anc_stop_thresh",
+        type=int,
+        default=ANC_STOP_THRESH_DEFAULT,
+        help="BFS ancestral allele stopping condition",
+    )
+    group.add_argument(
+        "-dct",
+        "--der_collapse_thresh",
+        dest="der_collapse_thresh",
+        metavar="der_collapse_thresh",
+        type=int,
+        default=DER_COLLAPSE_THRESH_DEFAULT,
+        help="BFS derived allele collapsing parameter",
+    )
+
+    group = parser.add_argument_group("Restrictions")
+    group.add_argument(
+        "-po",
+        "--primary_only",
+        action="store_true",
+        help="Do NOT import ISOGG SNPs",
+    )
+    group.add_argument(
+        "-r",
+        "--root",
+        dest="alternative_root",
+        metavar="haplogroup",
+        help="Start searching tree from this branch",
+    )
+    group.add_argument(
+        "-s",
+        "--single_sample",
+        dest="single_sample_id",
+        metavar="ID",
+        help="Restrict to a single sample",
+    )
+
+    if set_defaults:
+        # Set default values for all options and arguments
+        args = parser.parse_args([])
+    else:
+        # Read options and arguments from sys.argv[1:]
+        args = parser.parse_args()
+
+    return args
+
+
+def get_command_line_arg_defaults() -> argparse.Namespace:
+    """Get default values of command-line arguments.
+
+    Returns
+    -------
+    args : argparse.Namespace
+        Default values of all command-line options and arguments.
+
+    """
+    args = get_command_line_args(set_defaults=True)
+
+    return args
+
+
+class RawTextWithDefaultsHelpFormatter(argparse.RawDescriptionHelpFormatter):
+
+    """Help message formatter that retains formatting and adds defaults.
+
+    Combines argparse.RawTextHelpFormatter and argparse.ArgumentDefaultsHelpFormatter.
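+
+    For example, the help for --anc_stop_thresh renders with its default
+    appended on its own line:
+
+        BFS ancestral allele stopping condition
+        (default: 2)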
+ + """ + + def _split_lines(self, text, _): + return text.splitlines() + + def _get_help_string(self, action): + help_message = action.help + if "%(default)" not in action.help: + if action.default is not argparse.SUPPRESS: + defaulting_nargs = [argparse.OPTIONAL, argparse.ZERO_OR_MORE] + if action.option_strings or action.nargs in defaulting_nargs: + help_message += "\n(default: %(default)s)" + + return help_message diff --git a/yhaplo/call_haplogroups.py b/yhaplo/call_haplogroups.py deleted file mode 100755 index 6fdbe3a..0000000 --- a/yhaplo/call_haplogroups.py +++ /dev/null @@ -1,57 +0,0 @@ -#!/usr/bin/env python -# -# David Poznik -# 2016.01.08 -# call_haplogroups.py -# -# yhaplo driver script -# -# To run: python -m yhaplo.call_haplogroups -# ---------------------------------------------------------------------- -from __future__ import absolute_import - -import six - -from .config import Config -from .sample import Sample -from .tree import Tree - - -def call_haplogroups( - useDefaultCmdLineArgs=False, suppressOutputAndLog=False, outDir=None, residList=None -): - "configures run, builds tree, and calls haplogroups" - - config = Config( - useDefaultCmdLineArgs=useDefaultCmdLineArgs, - suppressOutputAndLog=suppressOutputAndLog, - outDir=outDir, - residList=residList, - ) - tree = Tree(config) - Sample.callHaplogroups(config, tree) - - -def call_haplogroups_for_resid_list(resid_list): - """ - calls haplogroups over a list of 23andMe research IDs. - returns a dictionary: key=resid, value=dictionary of results - """ - - call_haplogroups( - useDefaultCmdLineArgs=True, suppressOutputAndLog=True, residList=resid_list - ) - - results_dict = dict() - for sample in Sample.sampleList: - results_dict[sample.ID] = { - "yhaplo:haplogroup": six.ensure_str(sample.haplogroup), - "yhaplo:hgSNP": six.ensure_str(sample.hgSNP), - "yhaplo:hgSNPobs": six.ensure_str(sample.hgSNPobs), - } - - return results_dict - - -if __name__ == "__main__": - call_haplogroups() diff --git a/yhaplo/cli/__init__.py b/yhaplo/cli/__init__.py new file mode 100644 index 0000000..429eb26 --- /dev/null +++ b/yhaplo/cli/__init__.py @@ -0,0 +1 @@ +"""Command-line interface.""" diff --git a/yhaplo/cli/convert_to_genos.py b/yhaplo/cli/convert_to_genos.py new file mode 100644 index 0000000..44280ca --- /dev/null +++ b/yhaplo/cli/convert_to_genos.py @@ -0,0 +1,216 @@ +"""Convert data to .genos.txt format for yhaplo. + +Input format options: + +1. .ped and .map + +2. 
.23andMe.txt
+   Column 1: SNP identifier (ignored)
+   Column 2: Chromosome (row retained only if chromosome in CHROMOSOME_SET)
+   Column 3: Physical coordinate (GRCh37 assumed)
+   Column 4: Allele 1 (row retained only if allele 1 in ALLELE_SET)
+   Column 5: Allele 2 (if present)
+
+For details, run:
+    yhaplo_convert_to_genos --help
+
+"""
+
+import argparse
+import os
+import sys
+from typing import TextIO
+
+CHROMOSOME_SET = {"24", "Y"}
+ALLELE_SET = set("ACGTDI")
+FN_ENDING_TO_FN_TYPE = {
+    ".ped": "ped",
+    ".23andMe.txt": "ttam",
+    ".acom.txt": "ttam",
+}
+OUT_DIR = "converted"
+
+
+# ----------------------------------------------------------------------
+# Ped and Map
+
+
+def convert_ped(
+    ped_fp: str,
+    fn_root: str,
+    fn_ending: str,
+) -> None:
+    """Convert a .ped and .map to a .genos.txt."""
+
+    map_fp = ped_fp.replace(fn_ending, ".map")
+    out_fp = os.path.join(OUT_DIR, f"{fn_root}.genos.txt")
+
+    os.makedirs(OUT_DIR, exist_ok=True)  # Ensure output directory exists
+    with open(out_fp, "w") as out_file:
+        index_list = read_map(map_fp, out_file)
+        process_ped(ped_fp, index_list, out_file)
+
+    print(f"Output: {out_fp}\n")
+
+
+def read_map(
+    map_fp: str,
+    out_file: TextIO,
+) -> list[int]:
+    """Read a .map file and write positions."""
+
+    if not os.path.exists(map_fp):
+        raise FileNotFoundError(f"Map file not found: {map_fp}")
+
+    print(f"Map: {map_fp}\n")
+
+    position_list = []
+    index_list = []
+    index = 0
+    with open(map_fp, "r") as map_file:
+        for line in map_file:
+            chromosome, _, _, position = line.strip().split()
+            if chromosome in CHROMOSOME_SET:
+                position_list.append(position)
+                index_list.append(index)
+            index += 1
+
+    positions_str = "\t".join(position_list)
+    out_file.write(f"ID\t{positions_str}\n")
+
+    return index_list
+
+
+def process_ped(
+    ped_fp: str,
+    index_list: list[int],
+    out_file: TextIO,
+) -> None:
+    """Process a .ped file."""
+
+    diploid_index_list = [2 * i for i in index_list]
+    num_individuals, num_female = 0, 0
+    with open(ped_fp, "r") as in_file:
+        for line in in_file:
+            line_list = line.strip().split()
+            sex = line_list[4]
+            if sex == "2":
+                num_female += 1
+                continue
+
+            diploid_geno_list = line_list[6:]
+            haploid_geno_list = []
+            for i in diploid_index_list:
+                allele1, allele2 = diploid_geno_list[i], diploid_geno_list[i + 1]
+                if allele1 in ALLELE_SET and allele1 == allele2:
+                    haploid_geno_list.append(allele1)
+                else:
+                    haploid_geno_list.append(".")
+
+            num_individuals += 1
+            iid = "-".join(line_list[:2])
+            genotypes_str = "\t".join(haploid_geno_list)
+            out_file.write(f"{iid}\t{genotypes_str}\n")
+
+    print(f"{num_female:5d} females ignored")
+    print(f"{num_individuals:5d} individuals written")
+    print(f"{len(index_list):5d} markers\n")
+
+
+# ----------------------------------------------------------------------
+# 23andMe
+
+
+def convert_ttam(
+    in_fp: str,
+    ID: str,
+) -> None:
+    """Read single-sample flat format and convert to .genos.txt."""
+
+    out_fp = os.path.join(OUT_DIR, f"{ID}.genos.txt")
+    geno_tuple_list = []
+    num_non_y, num_het_or_no_call = 0, 0
+    with open(in_fp, "r") as in_file:
+        for line in in_file:
+            if line[0] == "#" or line[:4] == "rsid":
+                continue
+
+            line_list = line.strip().split()
+            num_fields = len(line_list)
+            chromosome, position, allele1 = line_list[1:4]
+            if num_fields == 5:
+                allele2 = line_list[4]
+            elif num_fields != 4:
+                raise ValueError(
+                    f"Encountered line with {num_fields} elements:\n" + line
+                )
+
+            if chromosome in CHROMOSOME_SET:
+                if allele1 in ALLELE_SET and (num_fields == 4 or allele1 == allele2):
+                    geno_tuple_list.append((position, allele1))
+                else:
+                    num_het_or_no_call += 
1 + else: + num_non_y += 1 + + os.makedirs(OUT_DIR, exist_ok=True) + with open(out_fp, "w") as out_file: + write_line_from_tuple_list(0, geno_tuple_list, out_file, "ID") + write_line_from_tuple_list(1, geno_tuple_list, out_file, ID) + + print(f"{num_non_y:6d} non-Y genotypes ignored") + print(f"{num_het_or_no_call:6d} Y-chromosome genotypes ignored (het or no-call)") + print(f"{len(geno_tuple_list):6d} written\n") + print(f"Output: {out_fp}\n") + + +def write_line_from_tuple_list( + index: int, + tuple_list: list[tuple[str, str]], + out_file: TextIO, + row_header: str = "", +) -> None: + """Write one line with the i-th element of each tuple.""" + + out_file.write(row_header) + for my_tuple in tuple_list: + out_file.write(f"\t{my_tuple[index]}") + + out_file.write("\n") + + +def main() -> None: + """Run script.""" + + parser = argparse.ArgumentParser( + description=__doc__, + formatter_class=argparse.RawTextHelpFormatter, + ) + parser.add_argument("in_fp", type=str, help="input file name") + args = parser.parse_args() + in_fp = args.in_fp + + if not os.path.exists(in_fp): + sys.exit(f"ERROR. Input file does not exist: {in_fp}") + + print(f"Input: {in_fp}\n") + + fn_type = None + for fn_ending in FN_ENDING_TO_FN_TYPE: + if in_fp.endswith(fn_ending): + fn_type = FN_ENDING_TO_FN_TYPE[fn_ending] + fn_root = os.path.basename(in_fp).replace(fn_ending, "") + break + + if fn_type == "ped": + convert_ped(in_fp, fn_root, fn_ending) + elif fn_type == "ttam": + convert_ttam(in_fp, ID=fn_root) + else: + sys.exit( + "ERROR. Input file must be a .ped or a .23andMe.txt " + + "in the corresponding format" + ) + + +if __name__ == "__main__": + main() diff --git a/yhaplo/cli/plot_tree.py b/yhaplo/cli/plot_tree.py new file mode 100644 index 0000000..2c9529a --- /dev/null +++ b/yhaplo/cli/plot_tree.py @@ -0,0 +1,69 @@ +"""Plot a Newick-formatted tree. + +For details, run: + yhaplo_plot_tree --help + +""" + +import argparse +import os + +from yhaplo.config import Config +from yhaplo.utils.optional_dependencies import optional_import_error_message + +try: + from Bio import Phylo +except ImportError as error: + error.msg = error.msg + optional_import_error_message( + "Biopython", + "plot haplogroup trees", + "plot", + ) + raise error + + +def main() -> None: + """Plot a Newick-formatted tree.""" + + args = get_args() + phylo_tree = Phylo.read(args.newick_fp, "newick") + if args.draw: + Phylo.draw(phylo_tree) + else: + Phylo.draw_ascii(phylo_tree) + + +def get_args() -> argparse.Namespace: + """Get command-line arguments.""" + + parser = argparse.ArgumentParser( + description=__doc__, + formatter_class=argparse.RawTextHelpFormatter, + ) + parser.add_argument( + "-d", + "--draw", + action="store_true", + default=False, + help="Draw tree, rather than printing ASCII version", + ) + parser.add_argument( + "-n", + "--newick_fp", + type=str, + default=os.path.join( + os.path.dirname(os.path.dirname(os.path.realpath(__file__))), + "data", + Config.primary_tree_data_file.data_subdir, + Config.primary_tree_data_file.filename, + ), + help="Path to file containing Newick tree to plot", + ) + + args = parser.parse_args() + + return args + + +if __name__ == "__main__": + main() diff --git a/yhaplo/cli/yhaplo.py b/yhaplo/cli/yhaplo.py new file mode 100644 index 0000000..7ccda53 --- /dev/null +++ b/yhaplo/cli/yhaplo.py @@ -0,0 +1,28 @@ +"""Call haplogroups. 
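+
+Example invocation (a sketch; the input filename is hypothetical):
+
+    yhaplo --input cohort.vcf.gz --out_dir output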
+
+For details, run:
+    yhaplo --help
+
+"""
+
+import logging
+
+from yhaplo.api.call_haplogroups import call_haplogroups
+from yhaplo.api.command_line_args import get_command_line_args
+
+root_logger = logging.getLogger()
+
+
+def main() -> None:
+    """Configure logging and call haplogroups."""
+
+    logging.basicConfig(level=logging.INFO, format="%(message)s")
+    command_line_args = get_command_line_args()
+    call_haplogroups(
+        command_line_args=command_line_args,
+        root_logger=root_logger,
+    )
+
+
+if __name__ == "__main__":
+    main()
diff --git a/yhaplo/config.py b/yhaplo/config.py
index 996bc5d..c3c0e01 100644
--- a/yhaplo/config.py
+++ b/yhaplo/config.py
@@ -1,964 +1,506 @@
-# David Poznik
-# 2016.01.12
-# config.py
-#
-# Defines the Config class, which includes command-line arguments.
-# ----------------------------------------------------------------------
-from __future__ import absolute_import, print_function
+"""Define Config class, which includes command-line arguments."""
 
 import argparse
+import logging
 import os
 import sys
-from collections import defaultdict, namedtuple
+from collections.abc import Mapping
+from typing import Optional, Union
 
-import six
-from six.moves import range, zip
+import numpy as np
 
-from . import __version__, utils
+from yhaplo._version import __version__
+from yhaplo.api.command_line_args import get_command_line_arg_defaults
+from yhaplo.utils.loaders import DataFile
 
-DESCRIPTION = """
-yhaplo identifies the Y-chromosome haplogroup of each male in a sample of one
-to millions. Sequence data will yield the most highly resolved classifications,
-but the algorithm also works well with chip-based genotype data, provided a
-reasonable number of phylogenetically informative sites have been assayed.
-"""
+DASHED_LINE = "-" * 72 + "\n"
+IID_TYPE = Union[int, str]
 
-# ----------------------------------------------------------------------
-# constants
+logger = logging.getLogger(__name__)
 
-ANC_STOP_THRESH_DEFAULT = 2  # BFS stopping condition parameter default
-DER_COLLAPSE_THRESH_DEFAULT = 2  # BFS collapsing parameter default
 
-# ----------------------------------------------------------------------
-class Config(object):
-    "container for parameters, constants, filenames, and command-line arguments"
+class Config:
 
-    # parameters and constants
+    """Yhaplo configuration class.
+
+    This class is a container for parameters, constants, filenames, etc.
+
+    """
+
+    # Constants
     # --------------------------------------------------------------------
-    isoggDate = "2016.01.04"  # date ISOGG website scraped to isoggFN
-    rootHaplogroup = "A"  # haplogroup to associate with root node
-    missingGenotype = "."  # for text input
-    missingHaplogroup = "."  # for output
-    vcf_chrom_label_set = {"Y", "chrY", "24"}  # to restrict VCF input to chrY
-    vcfStartCol = 9  # first data column in .vcf
-    vcf4startCol = 7  # first data column in ".vcf4"
-    numCharsToCompareDefault = 3  # for matchFlag in Sample.__str__
-
-    newickSemanticTokenString = "(),:;"  # used in regex
-    allelesString = "A C G T D I"  # space of possible alleles
-    snpLabelLettersRankString = (
-        "M P V L CTS AF B F Page U PF Z SK"  # for prioritization
-    )
-    superfluousSNPtextList = ["IMS-", "-null"]  # stripped out of snp names
-    multiCharHgTruncString = (
+
+    # Independent constants
+    isogg_date = "2016.01.04"  # Date ISOGG website scraped
+    root_haplogroup = "A"  # Haplogroup to associate with root node
+    missing_genotype = "."  # For text input
+    missing_haplogroup = "." 
# For output + vcf_chrom_label_set = {"Y", "chrY", "24"} # To restrict VCF input to chrY + + newick_semantic_token_string = "(),:;" # Used in regex + alleles_string = "A C G T D I" # Allowable alleles + snp_label_letters_rank_string = "M P V L CTS AF B F Page U PF Z SK" + superfluous_snp_text_list = ["IMS-", "-null"] # Stripped out of snp names + multi_char_hg_trunc_string = ( "A00 A0-T A0 A1 A1a A1b A1b1 BT CT DE CF GHIJK HIJK IJK IJ LT NO" ) - # derived constants - newickSemanticTokenSet = set(newickSemanticTokenString) - multiCharHgTruncSet = set(multiCharHgTruncString.split()) - multiCharHgTruncMaxLen = max([len(elem) for elem in multiCharHgTruncSet]) - superfluousSNPtextList = set(superfluousSNPtextList) - alleleSet = set(allelesString.split()) - homozygousGenotypeSet = {"%s%s" % (allele, allele) for allele in alleleSet} - snpLabelLettersRankDict = { - letters: rank for rank, letters in enumerate(snpLabelLettersRankString.split()) + # Derived constants + newick_semantic_token_set = set(newick_semantic_token_string) + multi_char_hg_trunc_set = set(multi_char_hg_trunc_string.split()) + multi_char_hg_trunc_max_len = max([len(elem) for elem in multi_char_hg_trunc_set]) + allele_set = set(alleles_string.split()) + homozygous_genotype_set = {f"{allele}{allele}" for allele in allele_set} + snp_label_letters_rank_dict = { + letters: rank + for rank, letters in enumerate(snp_label_letters_rank_string.split()) } - # 23andMe-specific parameters and constants - # -------------------------------------------------------------------- - ttamHgCallReplacementDict = {"BT": "B"} # prevents artifactual calls - callingProgressEarlySet = {100, 500, 1000, 5000} # for progress messages - callingProgressInterval = 10000 # for progress messages - - # SNPs - maxPlatformVersion = 5 # most recent chip version - chromosomeInteger = 24 - snpMetaDSname = "Metadata.master_v%d" % maxPlatformVersion - snpMetaColList = [ - "platform:chrom", - "platform:pos", - ] # features to draw from SNP metadata - maxPlatformVersionPlusOne = maxPlatformVersion + 1 - - # genotypes - ablockDSnameDefault = "genotype.customer.ablock" - ablock_fn_tp = "{}.ablock.npy.gz" - bitpacked_ablock_fn_tp = "*.{}.ablock" # for use in a glob.glob - - # samples - customerMetaDSname = "Metadata.customer" # for when using ablockDSnameDefault - customerIDcol = "resid" - customerPlatformColList = [ - "is_v%d" % platformVersion - for platformVersion in range(1, maxPlatformVersionPlusOne) - ] - customerSexColList = ["sex", "sex_x", "sex_y"] # for --allMaleCustomers option - customerPrevHaplogroupCol = "y_haplogroup" # for --compareToMetadata option - customerMetaColList = [ - customerIDcol - ] + customerPlatformColList # for whenever metadata used - CustomerTuple = namedtuple( - "CustomerTuple", customerMetaColList + [customerPrevHaplogroupCol] - ) + # 23andMe-specific constants + ttam_hg_call_replacement_dict = {"BT": "B"} # Prevents artifactual calls + calling_progress_early_set = {100, 500, 1000, 5000} # For progress messages + calling_progress_interval = 10_000 # For progress messages + platforms = ["v1", "v2", "v3", "v4", "v5"] - # filenames + # Data files # ---------------------------------------------------------------------- - # directories - softwareDir = os.path.dirname(os.path.realpath(__file__)) - inDir = os.path.join(softwareDir, "input") - defaultOutDir = "output" - - # input: phylogenetic data - primaryTreeFN = "%s/y.tree.primary.%s.nwk" % (inDir, isoggDate) - isoggFN = "%s/isogg.%s.txt" % (inDir, isoggDate) - isoggCorrectionsFNlist = [ - 
"%s/isogg.correct.coordinate.txt" % inDir, - "%s/isogg.correct.polarize.txt" % inDir, + # Tree data + primary_tree_data_file = DataFile( + "tree", + f"y.tree.primary.{isogg_date}.nwk", + "primary tree topology", + ) + + # Variant data + isogg_data_file = DataFile( + "variants", + f"isogg.{isogg_date}.txt", + "ISOGG SNP data", + ) + isogg_corrections_data_files = [ + DataFile("variants", "isogg.correct.coordinate.txt"), + DataFile("variants", "isogg.correct.polarize.txt"), ] - isoggOmitFNlist = [ - "%s/isogg.omit.bad.txt" % inDir, - "%s/isogg.omit.bad.23andMe.txt" % inDir, - "%s/isogg.omit.branch.conflict.txt" % inDir, - "%s/isogg.omit.branch.conflict.23andMe.v5.txt" % inDir, + isogg_omit_data_files = [ + DataFile("variants", "isogg.omit.bad.txt"), + DataFile("variants", "isogg.omit.bad.23andMe.txt"), + DataFile("variants", "isogg.omit.branch.conflict.txt"), + DataFile("variants", "isogg.omit.branch.conflict.23andMe.v5.txt"), ] - isoggMultiAllelicFN = "%s/isogg.multiallelic.txt" % inDir - isoggRepSNPfn = "%s/representative.SNPs.isogg.2015tree.txt" % inDir - otherRepSNPfn = "%s/representative.SNPs.additional.txt" % inDir - preferredSNPnamesFN = "%s/preferred.snpNames.txt" % inDir - pagesFN = "%s/23andMe.content.pages.txt" % inDir - - # input: platform - platform_pos_fn_tp = os.path.join(softwareDir, "platform_sites/v{}.sites.txt") - - # input: example data - kgp_subset_fn = os.path.join(softwareDir, "data", "1000Y.subset.genos.txt") - kgp_single_sample_vcf_fn = os.path.join(softwareDir, "data", "HG01938.vcf.gz") - - # input: test data - thousandYdataFNtp = os.path.join(softwareDir, "1000Y/process/output/%s") - thousandYhgFN = os.path.join( - softwareDir, "1000Y/haplogroups/4.haplogroups.called.txt" + isogg_multi_allelic_data_file = DataFile( + "variants", + "isogg.multiallelic.txt", + ) + isogg_rep_snp_data_file = DataFile( + "variants", + "representative.SNPs.isogg.2015_tree.txt", + ) + other_rep_snp_data_file = DataFile( + "variants", + "representative.SNPs.additional.txt", + ) + preferred_snp_names_data_file = DataFile( + "variants", + "preferred.snp_names.txt", ) - # output: phylogenetic info - phyloOutputFNtpDictDict = { - "withOutdirAndIsoggDate": { - "alignedPrimaryTreeFN": "%s/y.tree.primary.aligned.ycc.%s.nwk", - "yccTreeFN": "%s/y.tree.ycc.%s.nwk", - "hgsnpTreeFN": "%s/y.tree.hgSNP.%s.nwk", - "alignedYccTreeFN": "%s/y.tree.aligned.ycc.%s.nwk", - "alignedHgsnpTreeFN": "%s/y.tree.aligned.hgSNP.%s.nwk", - "platformYccTreeFNtp": "%s/y.tree.platform.%%d.ycc.%s.nwk", - "platformHgsnpTreeFNtp": "%s/y.tree.platform.%%d.hgSNP.%s.nwk", - "bfTreeFN": "%s/y.tree.bf.traversal.%s.txt", - "dfTreeFN": "%s/y.tree.df.traversal.%s.txt", - "treeTableFN": "%s/y.tree.table.%s.tsv", - "bfPrimaryTreeFN": "%s/y.tree.primary.bf.traversal.%s.txt", - "dfPrimaryTreeFN": "%s/y.tree.primary.df.traversal.%s.txt", - "cleanedIsoggFN": "%s/isogg.snps.cleaned.%s.txt", - "uniqueIsoggFN": "%s/isogg.snps.unique.%s.txt", - "droppedIsoggFN": "%s/isogg.snps.dropped.%s.txt", - }, - "withOutdir": { - "multiAllelicFoundFN": "%s/multiallelic.pos", - "pageMappingsFN": "%s/23andMe.content.page.mappings.txt", - "pageUpdatesFN": "%s/23andMe.page.updates.txt", - }, - } + # 23andMe: Block data + pos_to_block_indexes_data_file = DataFile( + "block", + "pos_to_block_indexes.yaml", + "Position-to-block-index", + ttam_only=True, + ) + + # 23andMe: Platform positions + platform_pos_data_subdir = "platform" + platform_pos_data_filename_tp = "{platform}.b37.positions.txt" - # output: haplogroup calls, log, optional files, 23andMe auxiliary 
files - logFNtp = "%s/log.%stxt" - haplogroupCallsFNtp = "%s/haplogroups.%stxt" - haplogroupRealTimeFNtp = "%s/haplogroups.realTime.%stxt" - countsAncDerFNtp = "%s/counts.ancDer.%stxt" - haplogroupPathsFNtp = "%s/paths.%stxt" - derSNPsFNtp = "%s/derived.snps.%stxt" - derSNPsDetailFNtp = "%s/derived.snps.detail.%stxt" - ancSNPsFNtp = "%s/ancestral.snps.%stxt" - ancSNPsDetailFNtp = "%s/ancestral.snps.detail.%stxt" - hgGenosFNtp = "%s/hg.%%s.genotypes.%stxt" - noAblocksFNtp = "%s/ignored.noAblocks.%sresid.txt" - noGenotypesFNtp = "%s/ignored.noGenotypes.%sresid.txt" + # Example input files + # ---------------------------------------------------------------------- + repo_root_dir = os.path.dirname( + os.path.dirname(os.path.realpath(__file__)) + ).removesuffix("/ttam") + example_genotype_dir = os.path.join( + repo_root_dir, + "tests", + "fixtures", + "input", + ) + kgp_subset_fp = os.path.join(example_genotype_dir, "1000Y.subset.genos.txt") + kgp_single_sample_vcf_fp = os.path.join(example_genotype_dir, "HG01938.vcf.gz") + + # Output files + # ---------------------------------------------------------------------- + default_out_dir = "output" + + # Phylogenetic info + aligned_primary_tree_fn = f"y.tree.primary.aligned.ycc.{isogg_date}.nwk" + ycc_tree_fn = f"y.tree.ycc.{isogg_date}.nwk" + hg_snp_tree_fn = f"y.tree.hg_snp.{isogg_date}.nwk" + aligned_ycc_tree_fn = f"y.tree.aligned.ycc.{isogg_date}.nwk" + aligned_hg_snp_tree_fn = f"y.tree.aligned.hg_snp.{isogg_date}.nwk" + platform_ycc_tree_fn_tp = f"y.tree.platform.{{platform}}.ycc.{isogg_date}.nwk" + platform_hg_snp_tree_fn_tp = f"y.tree.platform.{{platform}}.hg_snp.{isogg_date}.nwk" + bf_tree_fn = f"y.tree.bf.traversal.{isogg_date}.txt" + df_tree_fn = f"y.tree.df.traversal.{isogg_date}.txt" + tree_table_fn = f"y.tree.table.{isogg_date}.tsv" + bf_primary_tree_fn = f"y.tree.primary.bf.traversal.{isogg_date}.txt" + df_primary_tree_fn = f"y.tree.primary.df.traversal.{isogg_date}.txt" + cleaned_isogg_fn = f"isogg.snps.cleaned.{isogg_date}.txt" + unique_isogg_fn = f"isogg.snps.unique.{isogg_date}.txt" + dropped_isogg_fn = f"isogg.snps.dropped.{isogg_date}.txt" + multi_allelic_found_fn = "multiallelic.pos" + + # Haplogroup calls, log, optional files, 23andMe auxiliary files + log_fn_tp = "log.{fn_label}.txt" + haplogroup_calls_fn_tp = "haplogroups.{fn_label}.txt" + haplogroup_real_time_fn_tp = "haplogroups.real_time.{fn_label}.txt" + counts_anc_der_fn_tp = "counts.anc_der.{fn_label}.txt" + haplogroup_paths_fn_tp = "paths.{fn_label}.txt" + der_snps_fn_tp = "derived.snps.{fn_label}.txt" + der_snps_detail_fn_tp = "derived.snps.detail.{fn_label}.txt" + anc_snps_fn_tp = "ancestral.snps.{fn_label}.txt" + anc_snps_detail_fn_tp = "ancestral.snps.detail.{fn_label}.txt" + hg_genos_fn_tp = "hg.{{haplogroup}}.genotypes.{fn_label}.txt" def __init__( self, - useDefaultCmdLineArgs=False, - suppressOutputAndLog=False, - outDir=None, - residList=None, + command_line_args: Optional[argparse.Namespace] = None, + iid_to_ablock: Optional[Mapping[IID_TYPE, Union[bytes, np.ndarray]]] = None, + iid_to_platforms: Optional[Mapping[IID_TYPE, str]] = None, + suppress_output: bool = False, + out_dir: Optional[str] = None, + all_aux_output: bool = False, + root_logger: Optional[logging.Logger] = None, ): - self.residList = residList - self.useDefaultCmdLineArgs = useDefaultCmdLineArgs - self.suppressOutputAndLog = suppressOutputAndLog - - self.setCommandLineArgs() - self.setParamsGeneral(outDir) - self.validateParams() - self.setParamsBasedOnInputType() - 
self.setParamsBasedOnRunType() - self.makeOutputDirectories() - self.setOutputFileNamesAndOpenSome() + """Instantiate Config. + + Parameters + ---------- + command_line_args : argparse.Namespace | None, optional + Command-line arguments. + iid_to_ablock : Mapping[IID_TYPE, bytes | np.ndarray] | None, optional + Mapping of individual identifier to 23andMe ablock. + iid_to_platforms : Mapping[IID_TYPE, str] | None, optional + Mapping of individual identifier to comma-separated string of + 23andMe platforms, each starting with "v". + suppress_output : bool, optional + When True, do not generate output files. + out_dir : str | None, optional + Output directory. + When not None, override command_line_args value. + all_aux_output : bool = False, optional + Generate all auxiliary output. + When True, override command_line_args value. + root_logger : logging.Logger | None, optional + If supplied, add a file handler. + To populate the log file, set the logging level to INFO or lower. - if self.args.fileNamesOnly: - self.printFileNamesAndExit() + """ + self.args = ( + command_line_args + if command_line_args is not None + else get_command_line_arg_defaults() + ) + self.invoked_from_command_line = command_line_args is not None - self.openLogAndWelcome() + self.iid_to_ablock = iid_to_ablock + self.iid_to_platforms = iid_to_platforms + if iid_to_ablock is not None and iid_to_platforms is None: + raise ValueError("Calling from ablocks requires iid_to_platforms") - if self.runFromAblocks: - self.setParams23andMe() - self.get23andMeDatasets() + self.suppress_output = suppress_output + self.set_params_general(out_dir, all_aux_output) + self.set_params_based_on_input_type() + self.make_output_directories() + self.set_output_file_paths_and_open_some(root_logger) - if self.suppressOutputAndLog: - self.overrideOutputGeneratingArgs() + self.log_welcome_message() + if self.suppress_output: + self.override_output_generating_args() - # ---------------------------------------------------------------------- - def setParamsGeneral(self, outDir): - "set general parameters" - - # run type: zero or one of these four will be set to True by downstream methods - self.runFromAblocks = False - self.runFromSampleMajorTxt = False - self.runFromVCF = False - self.runFromVCF4 = False - - # output directories - if outDir is not None: - self.outDir = outDir - elif self.args.outDir is not None: - self.outDir = self.args.outDir + def set_params_general( + self, + out_dir: Optional[str], + all_aux_output: bool, + ) -> None: + """Set general parameters.""" + + # Run type: Zero or one of these will be set True by downstream methods. 
+ self.run_from_ablocks = False + self.run_from_sample_major_txt = False + self.run_from_vcf = False + + # Output directories + if out_dir is not None: + self.out_dir = out_dir + elif self.args.out_dir is not None: + self.out_dir = self.args.out_dir else: - self.outDir = Config.defaultOutDir - self.phyloOutDir = self.outDir - - # dependent args - if self.args.example_1000Y_subset or self.args.example_single_sample_vcf: - self.args.all_aux_output = True + self.out_dir = type(self).default_out_dir - if self.args.all_aux_output: - aux_output_arg_list = [ - "writeAncDerCounts", - "writeHaplogroupPaths", - "writeHaplogroupPathsDetail", - "writeDerSNPs", - "writeDerSNPsDetail", - "writeAncSNPs", - "writeAncSNPsDetail", - ] - for aux_output_arg in aux_output_arg_list: - setattr(self.args, aux_output_arg, True) - - # example data - if self.args.example_1000Y_subset: - self.args.dataFN = type(self).kgp_subset_fn - elif self.args.example_single_sample_vcf: - self.args.dataFN = type(self).kgp_single_sample_vcf_fn - - # derived 1000Y testing parameters - self.test1000Y = ( - self.args.test1000Yall - or self.args.test1000YplatformVersion - or self.args.test1000Ysubset - or self.args.test1000YoneID - ) - self.test1000YformatSpecified = ( - self.args.test1000Yvcf or self.args.test1000Yvcf4 - ) + self.phylo_out_dir = self.out_dir - # other parameters - self.vcfStartCol = Config.vcfStartCol - self.numCharsToCompare = Config.numCharsToCompareDefault - self.prevCalledHgFN = self.args.prevCalledHgFN - self.compareToPrevCalls = ( - self.prevCalledHgFN - or ( - self.args.dataFN - and self.args.dataFN.endswith(".resid.txt") - and os.path.isfile(self.args.dataFN) - and len(open(self.args.dataFN).readline().split()) > 2 - ) - or self.test1000Y - or self.args.compareToMetadata - ) + # Example data + if self.args.example_text: + self.args.data_fp = type(self).kgp_subset_fp + elif self.args.example_vcf: + self.args.data_fp = type(self).kgp_single_sample_vcf_fp - # ---------------------------------------------------------------------- - def validateParams(self): - "ensure consistency of run options" - - # preclude specification of both a data option and a test option - if self.test1000Y: - if self.args.dataFN: - sys.exit( - "ERROR. Do not specify a 1000Y test option " - "and a specific data file.\n" - ) - if self.args.allMaleCustomers: - sys.exit( - "ERROR. Do not specify a 1000Y test option " - "to run on all male 23andMe customers.\n" - ) - if self.args.primaryOnly: - sys.exit( - "ERROR. Do not specify a 1000Y test option " - "if not importing ISOGG SNPs.\n" - ) - elif self.test1000YformatSpecified: - sys.exit( - "ERROR. Only specify a 1000Y data format " - "when running on 1000Y data.\n" - ) - - # require a resid list input file when ablock dataset specified - if self.args.ablockDSname or self.args.ablocks_dir: - if not self.args.dataFN or not self.args.dataFN.endswith(".resid.txt"): - sys.exit( - "ERROR. Provide a resid list (-i file_label.resid.txt) " - "when specifying\n a non-default ablock dataset " - "name or ablock directory" - ) - if self.args.compareToMetadata: - sys.exit( - "ERROR. 
Will not check metadata when using " - "non-default ablock dataset.\n" - ) + if self.args.example_text or self.args.example_vcf: + check_example_data_availability(self.args.data_fp) + self.args.all_aux_output = True - # ---------------------------------------------------------------------- - def setParamsBasedOnInputType(self): - "set parameters based on input type" - - if self.test1000Y: - self.setParams1000Y() - elif self.args.dataFN: # any type of input file, including resid list - self.parseDataFN() - elif self.args.allMaleCustomers: - self.runFromAblocks = True - self.outFNlabel = "23andMe.all." - elif self.residList: # residList specified at config instantiation - self.runFromAblocks = True - self.outFNlabel = "residList." - else: - self.outFNlabel = "" - - def setParams1000Y(self): - "set parameters for 1000Y testing" - - # parameters - self.outDir = self.outDir + ".1000Y" - self.args.alternativeRoot = "A0-T" - if not self.prevCalledHgFN: - self.prevCalledHgFN = Config.thousandYhgFN - - # output-file label - if self.args.test1000Yall: - self.outFNlabel = "1000Y.all." - elif self.args.test1000Ysubset: - self.outFNlabel = "1000Y.subset." - elif self.args.test1000YoneID: - self.outFNlabel = "1000Y.%s." % self.args.test1000YoneID - if ( - self.args.singleSampleID - and self.args.singleSampleID != self.args.test1000YoneID - ): - sys.exit( - "ERROR. Contradiction. %s vs. %s" - % (self.args.singleSampleID, self.args.test1000YoneID) - ) - elif self.args.test1000YplatformVersion: - self.outFNlabel = "1000Y.all.v%d." % self.args.test1000YplatformVersion - - # input file name - if self.args.test1000Yvcf: - self.runFromVCF = True - dataFNlabel = self.outFNlabel + "vcf.gz" - elif self.args.test1000Yvcf4: - self.runFromVCF4 = True - dataFNlabel = self.outFNlabel + "vcf4" + # All auxiliary output option + if self.args.all_aux_output or all_aux_output: + self.args.write_anc_der_counts = True + self.args.write_haplogroup_paths = True + self.args.write_haplogroup_paths_detail = True + self.args.write_der_snps = True + self.args.write_der_snps_detail = True + self.args.write_anc_snps = True + self.args.write_anc_snps_detail = True + + def set_params_based_on_input_type(self) -> None: + """Set parameters based on input type.""" + + if self.args.data_fp: + self.parse_data_fp() else: - self.runFromSampleMajorTxt = True - dataFNlabel = self.outFNlabel + "genos.txt" - self.args.dataFN = Config.thousandYdataFNtp % dataFNlabel - - def parseDataFN(self): - "set parameters based on data file name" - - dataFN = self.args.dataFN - if dataFN.endswith(".resid.txt"): - self.runFromAblocks = True - residFNlabel = utils.basenameNoEnding(dataFN, ".resid.txt") - self.outFNlabel = residFNlabel - elif dataFN.endswith(".genos.txt"): - self.runFromSampleMajorTxt = True - self.outFNlabel = utils.basenameNoEnding(dataFN, ".genos.txt") - elif dataFN.endswith(".vcf"): - self.runFromVCF = True - self.outFNlabel = utils.basenameNoEnding(dataFN, ".vcf") - elif dataFN.endswith(".vcf.gz"): - self.runFromVCF = True - self.outFNlabel = utils.basenameNoEnding(dataFN, ".vcf.gz") - elif dataFN.endswith(".vcf4"): - self.runFromVCF4 = True - self.outFNlabel = utils.basenameNoEnding(dataFN, ".vcf4") - else: - sys.exit("\nERROR. 
Unknown data type: %s\n\n" % dataFN) + self.run_from_ablocks = bool(self.iid_to_ablock) + self.out_fn_label = "" - # ---------------------------------------------------------------------- - def setParamsBasedOnRunType(self): - "set parameters based on run type" + def parse_data_fp(self) -> None: + """Set parameters based on data file name.""" - if self.runFromAblocks and self.args.ablockDSname is None: - self.outDir = self.outDir + ".23andMe" + sample_major_suffixes = {".genos.txt", ".genos.txt.gz"} + vcf_suffixes = {".vcf.gz", ".bcf"} + supported_suffixes = sample_major_suffixes | vcf_suffixes + suffix = "" + for supported_suffix in supported_suffixes: + if self.args.data_fp.endswith(supported_suffix): + suffix = supported_suffix - if self.runFromVCF4: - self.vcfStartCol = Config.vcf4startCol + if not suffix: + raise ValueError(f"Unknown data type: {self.args.data_fp}\n\n") - # ---------------------------------------------------------------------- - def makeOutputDirectories(self): - "make output directories if they do not already exist" + self.run_from_sample_major_txt = suffix in sample_major_suffixes + self.run_from_vcf = suffix in vcf_suffixes + self.out_fn_label = basename_no_suffix(self.args.data_fp, suffix) + if self.args.single_sample_id: + self.out_fn_label = f"{self.out_fn_label}.{self.args.single_sample_id}." - if not self.suppressOutputAndLog: - utils.mkdirP(self.outDir) - utils.mkdirP(self.phyloOutDir) + def make_output_directories(self) -> None: + """Make output directories.""" - # ---------------------------------------------------------------------- - def setOutputFileNamesAndOpenSome(self): - """ - set log and output file names. - open those to which we will be writing in real time. - """ + if not self.suppress_output: + for dir_ in [self.out_dir, self.phylo_out_dir]: + os.makedirs(dir_, exist_ok=True) - for fn, tp in six.iteritems( - Config.phyloOutputFNtpDictDict["withOutdirAndIsoggDate"] - ): - setattr(self, fn, tp % (self.phyloOutDir, self.isoggDate)) + def set_output_file_paths_and_open_some( + self, + root_logger: Optional[logging.Logger] = None, + ) -> None: + """Set log and output file paths. - for fn, tp in six.iteritems(Config.phyloOutputFNtpDictDict["withOutdir"]): - setattr(self, fn, tp % self.phyloOutDir) + If (and only if) `root_logger` is supplied, add a file handler. + In general, library code should not be in the business of adding + handlers, but we (conditionally) do so here since the log filepath + is dynamically determined. - if self.args.singleSampleID: - self.outFNlabel = "%s%s." % (self.outFNlabel, self.args.singleSampleID) - self.haplogroupCallsFN = self.constructOutFileName(Config.haplogroupCallsFNtp) - self.logFN = self.constructOutFileName(Config.logFNtp) + Open output files to which yhaplo will write in real time. - if self.args.writeAncDerCounts: - self.countsAncDerFN = self.constructOutFileName(Config.countsAncDerFNtp) - if self.args.writeHaplogroupPaths or self.args.writeHaplogroupPathsDetail: - self.haplogroupPathsFN = self.constructOutFileName( - Config.haplogroupPathsFNtp - ) - if self.args.writeDerSNPs: - self.derSNPsFN = self.constructOutFileName(Config.derSNPsFNtp) - if self.args.writeDerSNPsDetail: - self.derSNPsDetailFN = self.constructOutFileName(Config.derSNPsDetailFNtp) - if self.args.writeAncSNPs: - self.ancSNPsFN = self.constructOutFileName(Config.ancSNPsFNtp) - if self.args.writeAncSNPsDetail: - self.ancSNPsDetailFN = self.constructOutFileName(Config.ancSNPsDetailFNtp) - - # files written to in real time. open now. 
- if self.args.writeHaplogroupsRealTime: - self.haplogroupRealTimeFN = self.constructOutFileName( - Config.haplogroupRealTimeFNtp - ) - self.haplogroupRealTimeFile = open(self.haplogroupRealTimeFN, "w", 1) + Parameters + ---------- + root_logger : logging.Logger | None, optional + If supplied, add a file handler. - if self.args.haplogroupToListGenotypesFor: - self.hgGenosFN = ( - self.constructOutFileName(Config.hgGenosFNtp) - % self.args.haplogroupToListGenotypesFor - ) - self.hgGenosFile = open(self.hgGenosFN, "w", 1) + """ + self.log_fp = self.construct_out_path(type(self).log_fn_tp) + self.root_logger = root_logger + if self.root_logger is not None: + self.root_logger.addHandler(logging.FileHandler(self.log_fp, "w")) - def constructOutFileName(self, FNtp): - "returns an output file name, given a template" + self.haplogroup_calls_fp = self.construct_out_path( + type(self).haplogroup_calls_fn_tp + ) - return FNtp % (self.outDir, self.outFNlabel) + self.aligned_primary_tree_fp = self.construct_phylo_out_path( + type(self).aligned_primary_tree_fn + ) + self.ycc_tree_fp = self.construct_phylo_out_path(type(self).ycc_tree_fn) + self.hg_snp_tree_fp = self.construct_phylo_out_path(type(self).hg_snp_tree_fn) + self.aligned_ycc_tree_fp = self.construct_phylo_out_path( + type(self).aligned_ycc_tree_fn + ) + self.aligned_hg_snp_tree_fp = self.construct_phylo_out_path( + type(self).aligned_hg_snp_tree_fn + ) + self.platform_ycc_tree_fp_tp = self.construct_phylo_out_path( + type(self).platform_ycc_tree_fn_tp + ) + self.platform_hg_snp_tree_fp_tp = self.construct_phylo_out_path( + type(self).platform_hg_snp_tree_fn_tp + ) + self.bf_tree_fp = self.construct_phylo_out_path(type(self).bf_tree_fn) + self.df_tree_fp = self.construct_phylo_out_path(type(self).df_tree_fn) + self.tree_table_fp = self.construct_phylo_out_path(type(self).tree_table_fn) + self.bf_primary_tree_fp = self.construct_phylo_out_path( + type(self).bf_primary_tree_fn + ) + self.df_primary_tree_fp = self.construct_phylo_out_path( + type(self).df_primary_tree_fn + ) + self.cleaned_isogg_fp = self.construct_phylo_out_path( + type(self).cleaned_isogg_fn + ) + self.unique_isogg_fp = self.construct_phylo_out_path(type(self).unique_isogg_fn) + self.dropped_isogg_fp = self.construct_phylo_out_path( + type(self).dropped_isogg_fn + ) + self.multi_allelic_found_fp = self.construct_phylo_out_path( + type(self).multi_allelic_found_fn + ) - # ---------------------------------------------------------------------- - def printFileNamesAndExit(self): - "prints input and output file names to stdout, then exits" + if self.args: + if self.args.write_anc_der_counts: + self.counts_anc_der_fp = self.construct_out_path( + type(self).counts_anc_der_fn_tp + ) - print("in: %s" % self.args.dataFN) - print("out: %s" % self.haplogroupCallsFN) - sys.exit() + if ( + self.args.write_haplogroup_paths + or self.args.write_haplogroup_paths_detail + ): + self.haplogroup_paths_fp = self.construct_out_path( + type(self).haplogroup_paths_fn_tp + ) - # ---------------------------------------------------------------------- - def openLogAndWelcome(self): - "opens log file and emits a welcome message" + if self.args.write_der_snps: + self.der_snps_fp = self.construct_out_path(type(self).der_snps_fn_tp) - if self.suppressOutputAndLog: - self.logFile = None - else: - self.logFile = open(self.logFN, "w", 1) + if self.args.write_der_snps_detail: + self.der_snps_detail_fp = self.construct_out_path( + type(self).der_snps_detail_fn_tp + ) - self.errAndLog( - "\n%s yhaplo %s | 
Y-chromosome haplogroup caller\n" - % (utils.DASHES, __version__) - ) - if not self.useDefaultCmdLineArgs: - command = os.path.basename(sys.argv[0]) - args = " ".join(sys.argv[1:]) - self.errAndLog(" Command: %s %s\n" % (command, args)) - if not self.suppressOutputAndLog: - self.errAndLog(" Log: %s\n" % self.logFN) - self.errAndLog("%s" % utils.DASHES) + if self.args.write_anc_snps: + self.anc_snps_fp = self.construct_out_path(type(self).anc_snps_fn_tp) - self.emitWarnings() + if self.args.write_anc_snps_detail: + self.anc_snps_detail_fp = self.construct_out_path( + type(self).anc_snps_detail_fn_tp + ) - def errAndLog(self, message): - "output a message to stderr and write to the log file" + # Files to write to in real time. Open now. + if self.args.write_haplogroups_real_time: + self.haplogroup_real_time_fp = self.construct_out_path( + type(self).haplogroup_real_time_fn_tp + ) + self.haplogroup_real_time_file = open( + self.haplogroup_real_time_fp, "w", 1 + ) - message = message.replace(Config.softwareDir + "/", "") - sys.stderr.write(message) - if self.logFile: - self.logFile.write(message) + if self.args.haplogroup_to_list_genotypes_for: + self.hg_genos_fp = self.construct_out_path( + type(self).hg_genos_fn_tp + ).format(haplogroup=self.args.haplogroup_to_list_genotypes_for) + self.hg_genos_file = open(self.hg_genos_fp, "w", 1) - def emitWarnings(self): - "emit warnings for deprecated options, etc." + def construct_out_path(self, fn_tp: str) -> str: + """Return an output file path, given a filename template.""" - if self.args.compareToMetadata: - self.errAndLog( - "\nWARNING. Deprecated option: -mdh, --compareToMetadata.\n" - + "The old algorithm will soon be retired.\n\n" - ) + file_path = os.path.join( + self.out_dir, + fn_tp.format(fn_label=self.out_fn_label).replace("..", "."), + ) - # ---------------------------------------------------------------------- - def setParams23andMe(self): - "set arguments for 23andMe data" - - self.ablockDSname = None - self.ablocks_dir = None - if self.args.ablocks_dir: - self.ablocks_dir = self.args.ablocks_dir - elif self.args.ablockDSname: - self.ablockDSname = self.args.ablockDSname - else: - self.ablockDSname = type(self).ablockDSnameDefault + return file_path - self.noAblocksFN = self.constructOutFileName(Config.noAblocksFNtp) - self.noGenotypesFN = self.constructOutFileName(Config.noGenotypesFNtp) - self.args.writeContentMappings = True - self.numCharsToCompare = 1 + def construct_phylo_out_path(self, fn: str) -> str: + """Return an output file path, given a filename or filename template.""" - def get23andMeDatasets(self): - """ - get 3 datasets: - - snp metadata - - customer metadata (if pulling ablocks from ablockDSnameDefault) - - ablock dataset (unless self.args.ablocks_dir has been specified) - """ + file_path = os.path.join(self.phylo_out_dir, fn) + return file_path - # imports - self.errAndLog("\n%sAccessing 23andMe data...\n\n" % utils.DASHES) - try: - from rtk23.config import init_rtk23 - from rtk23.dataset import UnknownDatasetException, dataset_factory - from rtk23.lib.coregen import VALUE_TO_CALL as ablockCodeToGenotypeDict - except ImportError: - sys.exit( - "ERROR. Cannot import from rtk23.\n" - "This run configuration is only supported " - "in the 23andMe research environment." - ) - self.ablockCodeToGenotypeDict = ablockCodeToGenotypeDict - - # initialization - self.errAndLog(" Initializing rtk23 ... ") - init_rtk23() - self.errAndLog("Done.\n") - - # SNP metadata - self.errAndLog(" SNPs: inferring ablock indexes ... 
") - self.snpMetaDS = dataset_factory.get_dataset(Config.snpMetaDSname) - self.setPos2ablockIndexListDict() - self.errAndLog("Done.\n") - - # customer metadata - if not (self.args.ablockDSname or self.ablocks_dir): - self.errAndLog(" Customers: getting metadata...\n") - self.customerMetaDS = dataset_factory.get_dataset(Config.customerMetaDSname) - - # genotypes - if not self.ablocks_dir: - self.errAndLog(" Genotypes: getting ablock dataset...\n") - try: - self.ablockDS = dataset_factory.get_dataset(self.ablockDSname) - except UnknownDatasetException: - sys.exit("\nERROR. Unknown ablock dataset: %s" % self.ablockDSname) - - def setPos2ablockIndexListDict(self): - """ - builds a dictionary with: - key: physical coordinate - value: list of ablock indexes - """ + def log_welcome_message(self) -> None: + """Log welcome message.""" - snpMetaArrayDict = self.snpMetaDS.load(Config.snpMetaColList) - self.pos2ablockIndexListDict = defaultdict(list) - chrom_pos_tuple_list = zip( - *[snpMetaArrayDict[column] for column in Config.snpMetaColList] + logger.info( + f"\n{DASHED_LINE} yhaplo {__version__} " + "| Y-chromosome haplogroup caller" ) - for ablockIndex, (chromosome, position) in enumerate(chrom_pos_tuple_list): - if chromosome == Config.chromosomeInteger: - self.pos2ablockIndexListDict[position].append(ablockIndex) - # ---------------------------------------------------------------------- - def overrideOutputGeneratingArgs(self): - "turn off all auxiliary output options" - - self.args.traverseBF = False - self.args.traverseDF = False - self.args.writeTreeTable = False - self.args.writeContentMappings = False - self.args.writePlatformTrees = False - - self.args.writeAncDerCounts = False - self.args.writeHaplogroupPaths = False - self.args.writeHaplogroupPathsDetail = False - self.args.writeDerSNPs = False - self.args.writeDerSNPsDetail = False - self.args.writeAncSNPs = False - self.args.writeAncSNPsDetail = False - - self.args.writeHaplogroupsRealTime = False - self.args.haplogroupToListGenotypesFor = None - - # ---------------------------------------------------------------------- - def closeFiles(self): - "close optional real-time output files and log" - - if self.args.writeHaplogroupsRealTime: - self.haplogroupRealTimeFile.close() - - if self.args.haplogroupToListGenotypesFor: - self.hgGenosFile.close() - self.errAndLog( - ("Wrote genotypes at SNPs associated haplogroup %s:\n" + " %s\n\n") - % (self.args.haplogroupToListGenotypesFor, self.hgGenosFN) - ) + if self.invoked_from_command_line: + command = os.path.basename(sys.argv[0]) + args = " ".join(sys.argv[1:]) + logger.info(f" Command: {command} {args}") - if self.logFile: - self.logFile.close() + if self.root_logger is not None: + logger.info(f" Log: {self.log_fp}") - # ---------------------------------------------------------------------- - def setCommandLineArgs(self): - "reads command-line arguments or sets defaults if self.useDefaultCmdLineArgs" + logger.info(DASHED_LINE) - parser = argparse.ArgumentParser( - description=DESCRIPTION, - formatter_class=utils.RawTextWithDefaultsHelpFormatter, - ) + def override_output_generating_args(self) -> None: + """Turn off all auxiliary output options.""" - group = parser.add_argument_group("input") - group.add_argument( - "-i", - "--input", - dest="dataFN", - metavar="fileName", - help="input file in one of the following formats:\n" - " .resid.txt : col 1: 23andMe research IDs\n" - + " col 2: (optional) comma-separated int platforms\n" - + " col 3: (optional) previous haplogroup calls\n" - " 
.genos.txt : sample-major text data\n" - " row 1: coordinates, col 1: sample IDs\n" - " .vcf, .vcf.gz, .vcf4 : snp-major text data", - ) + self.args.traverse_bf = False + self.args.traverse_df = False + self.args.write_tree_table = False + self.args.write_platform_trees = False - group = parser.add_argument_group("[23andMe only] input") - group.add_argument( - "-a", - "--allMaleCustomers", - dest="allMaleCustomers", - action="store_true", - help="run on all male 23andMe customers (or one, when used with -s)", - ) - group.add_argument( - "-ab", - "--ablockDSname", - dest="ablockDSname", - metavar="dsName", - help="non-default ablock dataset name", - ) - group.add_argument( - "-ad", - "--ablocks_dir", - help="directory containing ablocks,\n" + "each named .ablock.npy.gz", - ) + self.args.write_anc_der_counts = False + self.args.write_haplogroup_paths = False + self.args.write_haplogroup_paths_detail = False + self.args.write_der_snps = False + self.args.write_der_snps_detail = False + self.args.write_anc_snps = False + self.args.write_anc_snps_detail = False - group = parser.add_argument_group("output") - group.add_argument( - "-o", "--outDir", dest="outDir", metavar="dirName", help="output directory" - ) + self.args.write_haplogroups_real_time = False + self.args.haplogroup_to_list_genotypes_for = None - group = parser.add_argument_group("run on example data") - group.add_argument( - "-ex", - "--example_1000Y_subset", - action="store_true", - help="run yhaplo on a subset of 1000 Genomes data\n" - "and produce all auxiliary output", - ) - group.add_argument( - "-ex1", - "--example_single_sample_vcf", - action="store_true", - help="run yhaplo on a single-sample 1000 Genomes VCF\n" - "and produce all auxiliary output", - ) + def close_real_time_output_files(self) -> None: + """Close optional real-time output files.""" - group = parser.add_argument_group( - "generate auxiliary output", - "generate files detailing haplogroup calling for each individual", - ) - group.add_argument( - "-aao", - "--all_aux_output", - action="store_true", - help="generate all auxilary output.\n" - "equivalent to these seven options:\n" - "--ancDerCounts --haplogroupPaths --haplogroupPathsDetail\n" - "--derSNPs --derSNPsDetail --ancSNPs --ancSNPsDetail", - ) - group.add_argument( - "-c", - "--ancDerCounts", - dest="writeAncDerCounts", - action="store_true", - help="counts of ancestral and derived alleles encountered\n" - "at each node visited (omits nodes with zero of each)", - ) - group.add_argument( - "-hp", - "--haplogroupPaths", - dest="writeHaplogroupPaths", - action="store_true", - help="sequence of branch labels from root to call,\n" - "with counts of derived SNPs observed", - ) - group.add_argument( - "-hpd", - "--haplogroupPathsDetail", - dest="writeHaplogroupPathsDetail", - action="store_true", - help="sequence of branch labels from root to call,\n" - "with counts of derived SNPs observed and lists thereof", - ) - group.add_argument( - "-ds", - "--derSNPs", - dest="writeDerSNPs", - action="store_true", - help="lists of derived SNPs on path", - ) - group.add_argument( - "-dsd", - "--derSNPsDetail", - dest="writeDerSNPsDetail", - action="store_true", - help="detailed information about each derived SNP on path", - ) - group.add_argument( - "-as", - "--ancSNPs", - dest="writeAncSNPs", - action="store_true", - help="lists of ancestral SNPs encountered in search", - ) - group.add_argument( - "-asd", - "--ancSNPsDetail", - dest="writeAncSNPsDetail", - action="store_true", - help="detailed information about each 
ancestral SNP\n" - "encountered in search", - ) + if self.args.write_haplogroups_real_time: + self.haplogroup_real_time_file.close() - group = parser.add_argument_group( - "generate real-time auxilary output", - "write haplogroup calling information as each individual is processed", - ) - group.add_argument( - "-rt", - "--writeRealTime", - dest="writeHaplogroupsRealTime", - action="store_true", - help="write haplogroups in real time. includes DFS rank,\n" - "to sort ex post facto: sort -nk5", - ) - group.add_argument( - "-hg", - "--hgGenos", - dest="haplogroupToListGenotypesFor", - metavar="haplogroup", - help="write genotypes observed for SNPs associated with\n" - "a specified node of the tree, when it is visited", - ) + if self.args.haplogroup_to_list_genotypes_for: + self.hg_genos_file.close() + logger.info( + ( + "Wrote genotypes at SNPs associated with haplogroup " + f"{self.args.haplogroup_to_list_genotypes_for}:\n" + f" {self.hg_genos_fp}\n" + ) + ) - group = parser.add_argument_group("traverse tree") - group.add_argument( - "-b", - "--breadthFirst", - dest="traverseBF", - action="store_true", - help="write bread-first traversal", - ) - group.add_argument( - "-d", - "--depthFirst", - dest="traverseDF", - action="store_true", - help="write depth-first (pre-order) traversal", - ) - group.add_argument( - "-dt", - "--depthFirstTable", - dest="writeTreeTable", - action="store_true", - help="write depth-first (pre-order) traversal table", - ) - group.add_argument( - "-m", - "--mrca", - nargs=2, - dest="mrcaHaplogroupList", - metavar=("haplogroup1", "haplogroup2"), - help="output mrca of two haplogroups", - ) - group.add_argument( - "-sq", - "--snpQuery", - dest="querySNPname", - metavar="snpName", - help="list phylogenetic path for a query SNP", - ) - group.add_argument( - "-cm", - "--contentMapping", - dest="writeContentMappings", - action="store_true", - help="23andMe: map each node to the most recent ancestor\n" - "with an info page", - ) - group.add_argument( - "-pt", - "--platformTrees", - dest="writePlatformTrees", - action="store_true", - help="23andMe: write trees whose branch lengths are numbers\n" - "of platform sites", - ) - group = parser.add_argument_group("compare to previously called haplogroups") - group.add_argument( - "-mdh", - "--compareToMetadata", - dest="compareToMetadata", - action="store_true", - help="23andMe: compare to haplogroups called by original algorithm", - ) - group.add_argument( - "-ph", - "--prevCalledHgFN", - dest="prevCalledHgFN", - metavar="fileName", - help="import previously called haplogroups:\n" - "ID in first column, Haplogroup in last", - ) +def basename_no_suffix(file_path: str, suffix: str) -> str: + """Return the basename of a file path, with the supplied suffix removed.""" - group = parser.add_argument_group("change search parameters") - group.add_argument( - "-ast", - "--ancStopThresh", - dest="ancStopThresh", - metavar="anc_stop_thresh", - type=int, - default=ANC_STOP_THRESH_DEFAULT, - help="BFS ancestral allele stopping condition", - ) - group.add_argument( - "-dct", - "--derCollapseThresh", - dest="derCollapseThresh", - metavar="der_collapse_thresh", - type=int, - default=DER_COLLAPSE_THRESH_DEFAULT, - help="BFS derived allele collapsing parameter", - ) + basename_no_suffix = os.path.basename(file_path).removesuffix(suffix) + return basename_no_suffix - group = parser.add_argument_group("restrict input or traversal") - group.add_argument( - "-po", "--primaryOnly", action="store_true", help="do NOT import ISOGG SNPs" - ) - 
group.add_argument( - "-r", - "--root", - dest="alternativeRoot", - metavar="haplogroup", - help="start searching tree from this branch", - ) - group.add_argument( - "-s", - "--singleSample", - dest="singleSampleID", - metavar="ID", - help="restrict to a single sample (resid for 23andMe data)", - ) - group = parser.add_argument_group("[dev only] test data") - group.add_argument( - "-ta", - "--test1000Yall", - action="store_true", - help="1000Y testing: all sites, all samples", - ) - group.add_argument( - "-ts", - "--test1000Ysubset", - action="store_true", - help="1000Y testing: all sites, subset of samples", - ) - group.add_argument( - "-t1", - "--test1000YoneID", - metavar="ID", - help="1000Y testing: all sites, one sample", - ) - group.add_argument( - "-tv", - "--test1000YplatformSites", - type=int, - dest="test1000YplatformVersion", - metavar="version", - help="1000Y testing: 23andMe sites, all samples", - ) +def check_example_data_availability(filepath: str) -> None: + """Check whether example data is available. - group = parser.add_argument_group("[dev only] test data format") - group.add_argument( - "-tvcf", - "--test1000Yvcf", - dest="test1000Yvcf", - action="store_true", - help="1000Y testing: use .vcf.gz file rather than .genos.txt", - ) - group.add_argument( - "-tvcf4", - "--test1000Yvcf4", - dest="test1000Yvcf4", - action="store_true", - help="1000Y testing: use .vcf4 file rather than .genos.txt", - ) + Raises + ------ + FileNotFoundError + If example data is unavailable. - group = parser.add_argument_group("spot-check and exit") - group.add_argument( - "-fn", - "--fileNamesOnly", - action="store_true", - help="print file names to stdout and exit", - ) - group.add_argument( - "-v", "--version", action="version", version="yhaplo %s" % __version__ + """ + if not os.path.isfile(filepath): + raise FileNotFoundError( + f"Example input file not available: {filepath}\n\n" + "There are two ways to run on example data:\n\n" + "- Clone the repo and install yhaplo as editable:\n\n" + " cd \n" + " pip install --editable .\n\n" + "- Download fixture data from tests/fixtures/input and run:\n\n" + " yhaplo --input --all_aux_output\n" ) - - if self.useDefaultCmdLineArgs: - self.args = parser.parse_args([]) - else: - self.args = parser.parse_args() # reads from sys.argv[1:] diff --git a/yhaplo/convert_to_genos.py b/yhaplo/convert_to_genos.py deleted file mode 100755 index e7f987d..0000000 --- a/yhaplo/convert_to_genos.py +++ /dev/null @@ -1,205 +0,0 @@ -#!/usr/bin/env python -# -# David Poznik -# 2016.06.30 -# convert_to_genos.py -# -# To run: python -m yhaplo.convert_to_genos -# ---------------------------------------------------------------------- -from __future__ import absolute_import, print_function - -import argparse -import os -import sys - -from . import utils - -DESCRIPTION = """ -Converts data to .genos.txt format for yhaplo sofware. - -Input format options: - -1. .ped and .map - -2. 
.23andMe.txt - Column 1: SNP identifier (ignored) - Column 2: Chromosome (row retained only if chromosome in CHROMOSOME_SET) - Column 3: Physical coordinate (GRCh37 assumed) - Column 4: Allele 1 (row retained only if allele 1 in ALLELE_SET) - Column 5: Allele 2 (if present) -""" - -CHROMOSOME_SET = {"24", "Y"} -ALLELE_SET = set("ACGTDI") - -fnEnding2fnTypeDict = { - ".ped": "ped", - ".23andMe.txt": "ttam", - ".acom.txt": "ttam", -} - -outDir = "converted" -utils.mkdirP(outDir) - - -# ---------------------------------------------------------------------- -# ped and map - - -def convertPed(pedFN, fnRoot, fnEnding): - "reads a .ped and a .map and converts to .genos.txt" - - mapFN = pedFN.replace(fnEnding, ".map") - outFN = "%s/%s.genos.txt" % (outDir, fnRoot) - outFile = open(outFN, "w") - - indexList = readMap(mapFN, outFile) - processPed(pedFN, indexList, outFile) - - print("Output: %s\n" % outFN) - outFile.close() - - -def readMap(mapFN, outFile): - "reads a .map file" - - if not os.path.exists(mapFN): - sys.exit("ERROR. Expecting map file: %s" % mapFN) - print("Map: %s\n" % mapFN) - - positionList = list() - indexList = list() - index = 0 - with open(mapFN, "r") as mapFile: - for line in mapFile: - chromosome, _, _, position = line.strip().split() - if chromosome in CHROMOSOME_SET: - positionList.append(position) - indexList.append(index) - index += 1 - - outFile.write("ID\t%s\n" % "\t".join(positionList)) - return indexList - - -def processPed(pedFN, indexList, outFile): - "process a .ped file" - - diploidIndexList = [2 * i for i in indexList] - numIndividuals, numFemale = 0, 0 - with open(pedFN, "r") as inFile: - for line in inFile: - lineList = line.strip().split() - sex = lineList[4] - if sex == "2": - numFemale += 1 - continue - - diploidGenoList = lineList[6:] - haploidGenoList = list() - for i in diploidIndexList: - allele1, allele2 = diploidGenoList[i], diploidGenoList[i + 1] - if allele1 in ALLELE_SET and allele1 == allele2: - haploidGenoList.append(allele1) - else: - haploidGenoList.append(".") - - numIndividuals += 1 - ID = "-".join(lineList[:2]) - outFile.write("%s\t%s\n" % (ID, "\t".join(haploidGenoList))) - - print("%5d females ignored" % numFemale) - print("%5d individuals written" % numIndividuals) - print("%5d markers\n" % len(indexList)) - - -# ---------------------------------------------------------------------- -# 23andMe - - -def convertTTAM(inFN, ID): - "reads single-sample flat format and converts to .genos.txt" - - outFN = "%s/%s.genos.txt" % (outDir, ID) - - genoTupleList = list() - numNonY, numHetOrNoCall = 0, 0 - with open(inFN, "r") as inFile: - for line in inFile: - if line[0] == "#" or line[:4] == "rsid": - continue - - lineList = line.strip().split() - numFields = len(lineList) - chromosome, position, allele1 = lineList[1:4] - if numFields == 5: - allele2 = lineList[4] - elif numFields != 4: - sys.exit( - "ERROR. 
Encountered line with %d elements:\n%s" % (numFields, line) - ) - - if chromosome in CHROMOSOME_SET: - if allele1 in ALLELE_SET and (numFields == 4 or allele1 == allele2): - genoTupleList.append((position, allele1)) - else: - numHetOrNoCall += 1 - else: - numNonY += 1 - - with open(outFN, "w") as outFile: - writeLineFromTupleList(0, genoTupleList, outFile, "ID") - writeLineFromTupleList(1, genoTupleList, outFile, ID) - - print("%6d non-Y genotypes ignored" % numNonY) - print("%6d Y-chromosome genotypes ignored (het or no-call)" % numHetOrNoCall) - print("%6d written\n" % len(genoTupleList)) - print("Output: %s\n" % outFN) - - -def writeLineFromTupleList(index, tupleList, outFile, rowHeader=""): - "given a list of tuples, writes one line with the i-th element of each tuple" - - outFile.write(rowHeader) - for myTuple in tupleList: - outFile.write("\t%s" % myTuple[index]) - - outFile.write("\n") - - -# ---------------------------------------------------------------------- -# main - - -def main(): - parser = argparse.ArgumentParser( - description=DESCRIPTION, formatter_class=argparse.RawTextHelpFormatter - ) - parser.add_argument("inFN", type=str, help="input file name") - args = parser.parse_args() - inFN = args.inFN - - if not os.path.exists(inFN): - sys.exit("ERROR. Input file does not exist: %s" % inFN) - print("Input: %s\n" % inFN) - - fnType = None - for fnEnding in fnEnding2fnTypeDict: - if inFN.endswith(fnEnding): - fnType = fnEnding2fnTypeDict[fnEnding] - fnRoot = os.path.basename(inFN).replace(fnEnding, "") - break - - if fnType == "ped": - convertPed(inFN, fnRoot, fnEnding) - elif fnType == "ttam": - convertTTAM(inFN, ID=fnRoot) - else: - sys.exit( - "ERROR. Input file must be a .ped or a .23andMe.txt " - + "in the corresponding format" - ) - - -if __name__ == "__main__": - main() diff --git a/yhaplo/data/HG01938.vcf.gz b/yhaplo/data/HG01938.vcf.gz deleted file mode 100644 index dbdf109..0000000 Binary files a/yhaplo/data/HG01938.vcf.gz and /dev/null differ diff --git a/yhaplo/data/__init__.py b/yhaplo/data/__init__.py new file mode 100644 index 0000000..b9241f5 --- /dev/null +++ b/yhaplo/data/__init__.py @@ -0,0 +1 @@ +"""Data files.""" diff --git a/yhaplo/data/tree/__init__.py b/yhaplo/data/tree/__init__.py new file mode 100644 index 0000000..a58363d --- /dev/null +++ b/yhaplo/data/tree/__init__.py @@ -0,0 +1 @@ +"""Newick file encoding the primary structure of the Y-chromosome haplogroup tree.""" diff --git a/yhaplo/input/y.tree.primary.2016.01.04.nwk b/yhaplo/data/tree/y.tree.primary.2016.01.04.nwk similarity index 100% rename from yhaplo/input/y.tree.primary.2016.01.04.nwk rename to yhaplo/data/tree/y.tree.primary.2016.01.04.nwk diff --git a/yhaplo/data/variants/__init__.py b/yhaplo/data/variants/__init__.py new file mode 100644 index 0000000..5470e69 --- /dev/null +++ b/yhaplo/data/variants/__init__.py @@ -0,0 +1 @@ +"""Variant data.""" diff --git a/yhaplo/input/isogg.2016.01.04.txt b/yhaplo/data/variants/isogg.2016.01.04.txt similarity index 100% rename from yhaplo/input/isogg.2016.01.04.txt rename to yhaplo/data/variants/isogg.2016.01.04.txt diff --git a/yhaplo/input/isogg.correct.coordinate.txt b/yhaplo/data/variants/isogg.correct.coordinate.txt similarity index 100% rename from yhaplo/input/isogg.correct.coordinate.txt rename to yhaplo/data/variants/isogg.correct.coordinate.txt diff --git a/yhaplo/input/isogg.correct.polarize.txt b/yhaplo/data/variants/isogg.correct.polarize.txt similarity index 100% rename from yhaplo/input/isogg.correct.polarize.txt rename to 
yhaplo/data/variants/isogg.correct.polarize.txt diff --git a/yhaplo/input/isogg.multiallelic.txt b/yhaplo/data/variants/isogg.multiallelic.txt similarity index 100% rename from yhaplo/input/isogg.multiallelic.txt rename to yhaplo/data/variants/isogg.multiallelic.txt diff --git a/yhaplo/input/isogg.omit.bad.23andMe.txt b/yhaplo/data/variants/isogg.omit.bad.23andMe.txt similarity index 99% rename from yhaplo/input/isogg.omit.bad.23andMe.txt rename to yhaplo/data/variants/isogg.omit.bad.23andMe.txt index 883a961..4d71bd8 100644 --- a/yhaplo/input/isogg.omit.bad.23andMe.txt +++ b/yhaplo/data/variants/isogg.omit.bad.23andMe.txt @@ -1,4 +1,4 @@ M288 B1 2649694 G->T M288 # many T carriers who were clearly B2 P117 M3 14819693 G->T P117 # lots of people have this allele -P80 H1b1a 6739899 G->C P80 # P80 and P266 often both derived, regardless of H1b1 status +P80 H1b1a 6739899 G->C P80 # P80 and P266 often both derived, regardless of H1b1 status P266 H1b1b 6739738 A->T P266 # P80 and P266 often both derived, regardless of H1b1 status diff --git a/yhaplo/input/isogg.omit.bad.txt b/yhaplo/data/variants/isogg.omit.bad.txt similarity index 100% rename from yhaplo/input/isogg.omit.bad.txt rename to yhaplo/data/variants/isogg.omit.bad.txt diff --git a/yhaplo/input/isogg.omit.branch.conflict.23andMe.v5.txt b/yhaplo/data/variants/isogg.omit.branch.conflict.23andMe.v5.txt similarity index 100% rename from yhaplo/input/isogg.omit.branch.conflict.23andMe.v5.txt rename to yhaplo/data/variants/isogg.omit.branch.conflict.23andMe.v5.txt diff --git a/yhaplo/input/isogg.omit.branch.conflict.txt b/yhaplo/data/variants/isogg.omit.branch.conflict.txt similarity index 100% rename from yhaplo/input/isogg.omit.branch.conflict.txt rename to yhaplo/data/variants/isogg.omit.branch.conflict.txt diff --git a/yhaplo/input/isogg.split.txt b/yhaplo/data/variants/isogg.split.txt similarity index 100% rename from yhaplo/input/isogg.split.txt rename to yhaplo/data/variants/isogg.split.txt diff --git a/yhaplo/input/preferred.snpNames.txt b/yhaplo/data/variants/preferred.snp_names.txt similarity index 100% rename from yhaplo/input/preferred.snpNames.txt rename to yhaplo/data/variants/preferred.snp_names.txt diff --git a/yhaplo/input/representative.SNPs.additional.txt b/yhaplo/data/variants/representative.SNPs.additional.txt similarity index 100% rename from yhaplo/input/representative.SNPs.additional.txt rename to yhaplo/data/variants/representative.SNPs.additional.txt diff --git a/yhaplo/input/representative.SNPs.isogg.2015tree.txt b/yhaplo/data/variants/representative.SNPs.isogg.2015_tree.txt similarity index 100% rename from yhaplo/input/representative.SNPs.isogg.2015tree.txt rename to yhaplo/data/variants/representative.SNPs.isogg.2015_tree.txt diff --git a/yhaplo/node.py b/yhaplo/node.py index 1840da2..d6b7ed4 100644 --- a/yhaplo/node.py +++ b/yhaplo/node.py @@ -1,466 +1,526 @@ -# David Poznik -# 2015.12.29 -# node.py -# -# Defines the Node class. -# ---------------------------------------------------------------------- -from __future__ import absolute_import +"""Define Node class.""" +from __future__ import annotations + +import argparse +import logging from collections import deque from operator import attrgetter +from typing import Optional, TextIO -from six.moves import range +from yhaplo import sample as sample_module # noqa F401 +from yhaplo import snp as snp_module # noqa F401 +from yhaplo import tree as tree_module # noqa F401 +from yhaplo.config import Config -from . 
import utils
-from .page import Page
-from .snp import SNP
+logger = logging.getLogger(__name__)

-class Node(object):
-    """
+class Node:
+
+    """Class representing one node of a haplogroup tree.
+
+    Each node represents the branch that leads to it.
+
     A node knows its:
-    - parent (self.parent is None == self.isRoot())
-    - depth
-    - children
-    - diagnostic SNPs
+    - Parent
+    - Depth
+    - Children
+    - Diagnostic SNPs

-    Throughout this code, each node represents the branch that leads to it.
     """

-    tree = None
-    config = None
-    args = None
-    errAndLog = None
-    pageList = list()
-    pageDict = dict()
-    hgSNPset = set()
+    tree: "tree_module.Tree"
+    config: Config
+    args: argparse.Namespace
+    hg_snp_set: set[str] = set()

-    def __init__(self, parent, tree=None):
+    def __init__(
+        self,
+        parent: Optional[Node],
+        tree: Optional["tree_module.Tree"] = None,
+    ):
         self.parent = parent
-        if self.isRoot():
-            Node.setTreeConfigAndArgs(tree)
-            self.depth = 0
+        if parent is None:
+            if tree is not None:
+                type(self).set_tree_config_and_args(tree)
+                self.depth = 0
+            else:
+                raise ValueError(
+                    "A tree instance must be supplied when instantiating a root node."
+                )
         else:
-            parent.addChild(self)
+            parent.add_child(self)
             self.depth = parent.depth + 1
-            if self.depth > Node.tree.maxDepth:
-                Node.tree.maxDepth = self.depth
+            if self.depth > type(self).tree.max_depth:
+                type(self).tree.max_depth = self.depth
+
+        self.haplogroup: str = ""  # YCC haplogroup name (e.g., "R1b1c")
+        self.label: str = ""  # YCC including alt names (e.g., "P/K2b2")
+        self.hg_trunc: str = ""  # Truncated haplogroup (e.g., "R")
+        self.hg_snp: str = ""  # Haplogroup with representative SNP (e.g., "R-V88")
+        self.child_list: list[Node] = []
+        self.snp_list: list["snp_module.SNP"] = []
+        self.dropped_marker_list: list["snp_module.DroppedMarker"] = []
+        self.branch_length: Optional[float] = None
+        self.dfs_rank: int = 0
+
+    # String representations
+    # ----------------------------------------------------------------------
+    def __str__(self) -> str:
+        """Return string representation."""

-        self.haplogroup = ""  # see setLabel | ycc haplogroup name e.g., R1b1c
-        self.label = ""  # see setLabel | ycc including alt names e.g., P/K2b2
-        self.hgTrunc = ""  # see setLabel | truncated haplogroup e.g., R1b1c -> R
-        self.hgSNP = ""  # see prioritySortSNPlistAndSetHgSNP | e.g., R-V88
-        self.childList = list()  # see addChild, bifurcate, serialSplit
-        self.snpList = list()  # see addSNP
-        self.droppedMarkerList = list()  # see addDroppedMarker
-        self.page = None  # see addSNP
-        self.branchLength = None  # see setBranchLength
-        self.DFSrank = 0  # see setDFSrank
+        return self.str_simple

-        if Node.args.writeContentMappings and self.isRoot():
-            self.page = Node.pageDict[Node.config.rootHaplogroup]
-            self.page.setNode(self)
+    @property
+    def str_simple(self) -> str:
+        """Return string representation with label and representative SNP."""

-    def __str__(self):
-        return self.strSimple()
+        return f"{self.label:25s} {self.hg_snp}"

-    def strSimple(self):
-        "string representation: label and representative SNP"
+    @property
+    def str_snp_list(self) -> str:
+        """Return string representation with label and list of SNPs."""

-        return "%-25s %s" % (self.label, self.hgSNP)
+        snp_string = " ".join(snp.label for snp in self.snp_list)
+        str_snp_list = f"{self.label:25s} {snp_string}"

-    def strSNPlist(self):
-        "string representation: label and list of snps"
+        return str_snp_list

+    @property
+    def 
str_dot_pipe_depth(self) -> str: + """Return string representation indicating depth with dots and pipes.""" - def strDotPipeDepth(self): - "string representation: indicates depth with a series of dots and pipes" + dot_list = list("." * (self.depth)) + for i in range(0, len(dot_list), 5): + dot_list[i] = "|" - dotList = list("." * (self.depth)) - for i in range(0, len(dotList), 5): - dotList[i] = "|" - return "%s%s %s" % ("".join(dotList), self.label, self.hgSNP) + dots = "".join(dot_list) + str_dot_pipe_depth = f"{dots}{self.label} {self.hg_snp}" - def strTreeTableRow(self): - "string representation: one row of tree table" + return str_dot_pipe_depth - yccLabel = self.haplogroup + # Other properties + # ---------------------------------------------------------------------- + @property + def tree_table_data(self) -> tuple[str, str, str, str, str]: + """Return a tuple of data summarizing the node. - if self.isRoot(): - parentDFSrank = parentHgSNP = "root" - else: - parentDFSrank = str(self.parent.DFSrank) - parentHgSNP = self.parent.hgSNP + Returns + ------- + tree_table_tuple : tuple[str, str, str, str, str] + Depth-first-search rank, YCC haplogroup label, SNP-based haplogroup, + parent DFS rank, parent SNP-based haplogroup. - return "\t".join( - [str(self.DFSrank), yccLabel, self.hgSNP, parentDFSrank, parentHgSNP] + """ + if self.parent is not None: + parent_dfs_rank = str(self.parent.dfs_rank) + parent_hg_snp = self.parent.hg_snp + else: + parent_dfs_rank = "root" + parent_hg_snp = "root" + + tree_table_row = ( + str(self.dfs_rank), + self.haplogroup, + self.hg_snp, + parent_dfs_rank, + parent_hg_snp, ) + return tree_table_row + @property - def mostHighlyRankedSNP(self): - "the most highly ranked SNP" + def most_highly_ranked_snp(self) -> "snp_module.SNP": + """Return the most highly ranked SNP.""" - return SNP.mostHighlyRankedMarkerOnList(self.snpList) + return self.snp_list[0] @property - def mostHighlyRankedDroppedMarker(self): - "the most highly ranked dropped marker" + def most_highly_ranked_dropped_marker(self) -> "snp_module.DroppedMarker": + """Return the most highly ranked dropped marker.""" - return SNP.mostHighlyRankedMarkerOnList(self.droppedMarkerList) + return self.dropped_marker_list[0] - # static methods, including class variable setters + # Class methods # ---------------------------------------------------------------------- - @staticmethod - def setTreeConfigAndArgs(tree): - "enables Node class to know about the tree instance, config, and args" - - Node.tree = tree - Node.config = tree.config - Node.args = tree.args - Node.errAndLog = tree.config.errAndLog - if Node.args.writeContentMappings: - Node.buildPageDict() - - @staticmethod - def buildPageDict(): - """ - builds a dictionary of 23andMe content pages. 
pagesFN comes from these two gdocs:
-    - https://docs.google.com/spreadsheets/d/1mf86slweZEKUd5hzG2GmKGTGIpHuDipJz2u221y2zVE/edit?ts=568eb997#gid=0
-    - https://docs.google.com/spreadsheets/d/1oo0sRmYFNeWikuOxcb_1obOoO35wQccmOzyGRmqDMtc/edit?ts=578578d0#gid=362797346
-    """
+    @classmethod
+    def set_tree_config_and_args(cls, tree: "tree_module.Tree") -> None:
+        """Set tree, config, and args."""

-        utils.checkFileExistence(Node.config.pagesFN, "Content pages")
-        with open(Node.config.pagesFN, "r") as pagesFile:
-            pagesFile.readline()  # header
-            for line in pagesFile:
-                yccOld, snpName = line.strip().split()
-                page = Page(yccOld, snpName)
-                Node.pageList.append(page)
+        cls.tree = tree
+        cls.config = tree.config
+        cls.args = tree.args

-                if yccOld == Node.config.rootHaplogroup:
-                    Node.pageDict[Node.config.rootHaplogroup] = page
-                elif snpName != ".":
-                    Node.pageDict[snpName] = page
+    @classmethod
+    def truncate_haplogroup_label(cls, haplogroup: str) -> str:
+        """Return truncated haplogroup label.

-    @staticmethod
-    def truncateHaplogroupLabel(haplogroup):
-        "returns first 2-5 characters of specified haplogroups and first letter of others"
+        Truncation here means the first two to five characters of specified haplogroups
+        and the first letter of others.

-        for numChars in range(Node.config.multiCharHgTruncMaxLen, 1, -1):
-            if haplogroup[:numChars] in Node.config.multiCharHgTruncSet:
-                return haplogroup[:numChars]
+        """
+        truncated_haplogroup_label = haplogroup[0]
+        for num_chars in range(cls.config.multi_char_hg_trunc_max_len, 1, -1):
+            if haplogroup[:num_chars] in cls.config.multi_char_hg_trunc_set:
+                truncated_haplogroup_label = haplogroup[:num_chars]
+                break

-        return haplogroup[0]
+        return truncated_haplogroup_label

-    # setters, mutaters
+    # Setters and mutators
     # ----------------------------------------------------------------------
-    def setLabel(self, label):
-        "sets label, haplogroup, and hgTrunc"
+    def set_label(self, label: str) -> None:
+        """Set label, haplogroup, and hg_trunc."""

         self.label = label
-        labelList = label.split("/")
+        label_list = label.split("/")

-        if self.isRoot():
-            self.haplogroup = self.hgTrunc = self.config.rootHaplogroup
-            Node.tree.hg2nodeDict[self.haplogroup] = self
+        if self.is_root():
+            self.haplogroup = self.hg_trunc = self.config.root_haplogroup
+            type(self).tree.haplogroup_to_node[self.haplogroup] = self
         else:
-            self.haplogroup = labelList[0]
-            self.hgTrunc = Node.truncateHaplogroupLabel(self.haplogroup)
+            self.haplogroup = label_list[0]
+            self.hg_trunc = type(self).truncate_haplogroup_label(self.haplogroup)

-        for key in labelList:
-            Node.tree.hg2nodeDict[key] = self
+        for key in label_list:
+            type(self).tree.haplogroup_to_node[key] = self

-    def setBranchLength(self, branchLength):
-        "sets the branch length"
+    def set_branch_length(self, branch_length: float) -> None:
+        """Set branch length."""

-        self.branchLength = branchLength
+        self.branch_length = branch_length

-    def setDFSrank(self, DFSrank):
-        "set depth-first search rank"
+    def set_dfs_rank(self, dfs_rank: int) -> None:
+        """Set depth-first search rank."""

-        self.DFSrank = DFSrank
+        self.dfs_rank = dfs_rank

-    def addSNP(self, snp):
-        "appends a snp to the snp list"
+    def add_snp(self, snp: "snp_module.SNP") -> None:
+        """Append a SNP to the SNP list."""

-        self.snpList.append(snp)
-        if snp.label in Node.pageDict:
-            self.page = Node.pageDict[snp.label]
-            self.page.setNode(self)
+        self.snp_list.append(snp)

-    def addDroppedMarker(self, droppedMarker):
-        "appends a dropped marker to the list"
+    def 
add_dropped_marker(
+        self,
+        dropped_marker: "snp_module.DroppedMarker",
+    ) -> None:
+        """Append a dropped marker to the list."""

-        self.droppedMarkerList.append(droppedMarker)
+        self.dropped_marker_list.append(dropped_marker)

-    def prioritySortSNPlistAndSetHgSNP(self):
-        """
-        first, sorts snp list (or dropped marker list) by priority ranking.
-        then, sets reresentative-SNP-based label: self.hgSNP
-        the standard form incudes the truncated haplogroup label
-        and the label of a representative SNP, separated by a hyphen (e.g. R-V88).
-        """
-
-        # root: no markers
-        if self.isRoot():
-            self.hgSNP = self.haplogroup
+    def priority_sort_snp_list_and_set_hg_snp(self) -> None:
+        """Sort SNP list and set SNP-based haplogroup.

-        # normal case
-        elif self.snpList:
-            self.snpList = SNP.prioritySortMarkerList(self.snpList)
-            self.hgSNP = self.mostHighlyRankedSNP.hgSNP
+        First, sort SNP list (or dropped marker list) by priority ranking.
+        Then, set representative-SNP-based label: self.hg_snp.
+        The standard form includes the truncated haplogroup label
+        and the label of a representative SNP, separated by a hyphen (e.g., R-V88).

-        # backup: use discared marker name
-        elif self.droppedMarkerList:
-            self.droppedMarkerList = SNP.prioritySortMarkerList(self.droppedMarkerList)
-            markerName = self.mostHighlyRankedDroppedMarker.name
-            self.hgSNP = "%s-%s" % (self.hgTrunc, markerName)
+        """
+        # Root: no markers
+        if self.is_root():
+            self.hg_snp = self.haplogroup
+
+        # Normal case
+        elif self.snp_list:
+            self.snp_list = snp_module.priority_sort_marker_list(self.snp_list)
+            self.hg_snp = self.most_highly_ranked_snp.hg_snp
+
+        # Backup: use discarded marker name
+        elif self.dropped_marker_list:
+            self.dropped_marker_list = snp_module.priority_sort_marker_list(
+                self.dropped_marker_list
+            )
+            marker_name = self.most_highly_ranked_dropped_marker.name
+            self.hg_snp = f"{self.hg_trunc}-{marker_name}"

-        # no markers to use
+        # No markers to use
         else:
-            if self.parent.hgSNP:
-                symbol = "*" if self.isLeaf() else "+"
-                self.hgSNP = self.parent.hgSNP + symbol
+            if self.parent is not None and self.parent.hg_snp:
+                symbol = "*" if self.is_leaf() else "+"
+                self.hg_snp = self.parent.hg_snp + symbol

-                # uniquify if necessary
-                if self.hgSNP in Node.hgSNPset:
+                # Uniquify if necessary
+                if self.hg_snp in type(self).hg_snp_set:
                     i = 1
-                    hgSNPuniqe = "%s%d" % (self.hgSNP, i)
-                    while hgSNPuniqe in Node.hgSNPset:
+                    hg_snp_unique = f"{self.hg_snp}{i}"
+                    while hg_snp_unique in type(self).hg_snp_set:
                         i += 1
-                        hgSNPuniqe = "%s%d" % (self.hgSNP, i)
+                        hg_snp_unique = f"{self.hg_snp}{i}"

-                    self.hgSNP = hgSNPuniqe
+                    self.hg_snp = hg_snp_unique
             else:
-                Node.errAndLog(
+                logger.warning(
                    "WARNING. 
Attempted to set star label, " - + "but parent.hgSNP not set yet: %s\n" % self.haplogroup + f"but parent.hg_snp not set yet: {self.haplogroup}\n" ) - self.hgSNP = self.haplogroup + self.hg_snp = self.haplogroup - Node.hgSNPset.add(self.hgSNP) + type(self).hg_snp_set.add(self.hg_snp) - # queries + # Queries # ---------------------------------------------------------------------- - def isRoot(self): + def is_root(self) -> bool: + """Return a Boolean indicating whether or not the Node is root.""" + return self.parent is None - def isLeaf(self): - return len(self.childList) == 0 - - def getBranchLength(self, alignTips=False, platformVersion=None): - if self.branchLength: - return self.branchLength - elif alignTips and self.isLeaf(): - return Node.tree.maxDepth - self.depth + 1 - elif alignTips: - return 1 - elif platformVersion: - branchLength = 0 - for snp in self.snpList: - if snp.isOnPlatform(platformVersion): - branchLength += 1 - return branchLength + def is_leaf(self) -> bool: + """Return a Boolean indicating whether or not the Node is a leaf.""" + + return len(self.child_list) == 0 + + def get_branch_length( + self, + align_tips: bool = False, + platform: Optional[str] = None, + ) -> Optional[float]: + """Get branch length.""" + + if self.branch_length: + branch_length = self.branch_length + elif align_tips and self.is_leaf(): + branch_length = type(self).tree.max_depth - self.depth + 1 + elif align_tips: + branch_length = 1 + elif platform: + branch_length = 0 + for snp in self.snp_list: + if snp.is_on_platform(platform): + branch_length += 1 else: - return None + branch_length = None - def backTracePath(self): - "returns a list of nodes from root to self" + return branch_length - nodeList = [self] + def back_trace_path(self) -> list[Node]: + """Return a list of nodes from root to self.""" + + node_list = [self] parent = self.parent while parent is not None: - nodeList.append(parent) + node_list.append(parent) parent = parent.parent - nodeList.reverse() - return nodeList - def assessGenotypes(self, sample): - """ - assess an individual's genotypes with respect to self.snpList - returns two lists of snps. those for which: - - ancestral genotypes were observed - - derived genotypes were observed - """ + node_list.reverse() - genotypedSnpList = [ - snp for snp in self.snpList if snp.position in sample.pos2genoDict - ] - ancSNPlist, derSNPlist = list(), list() - listAllGenotypes = Node.args.haplogroupToListGenotypesFor == self.haplogroup + return node_list - for snp in genotypedSnpList: - geno = sample.pos2genoDict[snp.position] + def assess_genotypes( + self, + sample: "sample_module.Sample", + ) -> tuple[list["snp_module.SNP"], list["snp_module.SNP"]]: + """Assess an individual's genotypes with respect to self.snp_list. - if snp.isAncestral(geno): - ancSNPlist.append(snp) - elif snp.isDerived(geno): - derSNPlist.append(snp) + Returns + ------- + anc_snp_list : list[SNP] + SNPs for which ancestral genotypes were observed. + der_snp_list : list[SNP] + SNPs for which derived genotypes were observed. 
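In effect, assess_genotypes partitions a node's genotyped SNPs by allele state. A minimal, self-contained sketch of that partitioning, with toy stand-ins (ToySNP and the plain genotype dict are illustrative only; the real classes live in yhaplo.snp and yhaplo.sample):

    from dataclasses import dataclass

    @dataclass
    class ToySNP:
        position: int
        ancestral: str
        derived: str

    def partition_by_allele_state(
        snps: list[ToySNP],
        genotypes: dict[int, str],
        missing: str = ".",
    ) -> tuple[list[ToySNP], list[ToySNP]]:
        """Split genotyped SNPs into ancestral and derived observations."""
        anc_snps: list[ToySNP] = []
        der_snps: list[ToySNP] = []
        for snp in snps:
            genotype = genotypes.get(snp.position, missing)
            if genotype == snp.ancestral:
                anc_snps.append(snp)
            elif genotype == snp.derived:
                der_snps.append(snp)
        return anc_snps, der_snps

    snps = [ToySNP(100, "G", "A"), ToySNP(200, "C", "T")]
    print(partition_by_allele_state(snps, {100: "A"}))
    # ([], [ToySNP(position=100, ancestral='G', derived='A')])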
- if listAllGenotypes: - derivedFlag = "*" if snp.isDerived(geno) else "" - Node.config.hgGenosFile.write( - "%-8s %s %s %s\n" % (sample.ID, snp, geno, derivedFlag) - ) + """ + anc_snp_list, der_snp_list = [], [] + for snp in self.snp_list: + genotype = sample.get_genotype(snp.position) + if genotype != Config.missing_genotype: + if snp.is_ancestral(genotype): + anc_snp_list.append(snp) + elif snp.is_derived(genotype): + der_snp_list.append(snp) + + if type(self).args.haplogroup_to_list_genotypes_for == self.haplogroup: + file = type(self).config.hg_genos_file + derived_flag = "*" if snp.is_derived(genotype) else "" + file.write( + f"{str(sample.iid):8s} {snp} {genotype} {derived_flag}\n" + ) - return ancSNPlist, derSNPlist + return anc_snp_list, der_snp_list - # children + # Children # ---------------------------------------------------------------------- - def addChild(self, child): - "appends a child to the child list" - - self.childList.append(child) - - def serialSplit(self, targetHaplogroup): - "serially split node until there is a spot for the target haplogroup" - - currentNode = self - startLength = len(self.haplogroup) - endLength = len(targetHaplogroup) - for strLen in range(startLength, endLength): - nextNode = None - targetHgSubstring = targetHaplogroup[: (strLen + 1)] - if currentNode.numChildren < 2: - currentNode.bifurcate() - for node in currentNode.childList: - if node.haplogroup == targetHgSubstring: - nextNode = node - if nextNode is None: - nextNode = Node(parent=currentNode) - nextNode.setLabel(targetHgSubstring) - currentNode.sortChildren() - - currentNode = nextNode - - return currentNode + def add_child(self, child: Node) -> None: + """Append a child to the child list.""" + + self.child_list.append(child) + + def serial_split(self, target_haplogroup: str) -> Node: + """Split node serially until there is a spot for the target haplogroup.""" + + current_node = self + start_length = len(self.haplogroup) + end_length = len(target_haplogroup) + for str_len in range(start_length, end_length): + next_node = None + target_hg_substring = target_haplogroup[: (str_len + 1)] + if current_node.num_children < 2: + current_node.bifurcate() + for node in current_node.child_list: + if node.haplogroup == target_hg_substring: + next_node = node + if next_node is None: + next_node = type(self)(parent=current_node) + next_node.set_label(target_hg_substring) + current_node.sort_children() + + current_node = next_node + + return current_node @property - def numChildren(self): - return len(self.childList) + def num_children(self) -> int: + """Return number of children.""" + + return len(self.child_list) - def bifurcate(self): - "split a node and return the two children" + def bifurcate(self) -> tuple[Node, Node]: + """Split a node and return the two children.""" - leftChild = Node(parent=self) - rightChild = Node(parent=self) + left_child = type(self)(parent=self) + right_child = type(self)(parent=self) if self.haplogroup[-1].isalpha(): - leftChild.setLabel(self.haplogroup + "1") - rightChild.setLabel(self.haplogroup + "2") + left_child.set_label(self.haplogroup + "1") + right_child.set_label(self.haplogroup + "2") else: - leftChild.setLabel(self.haplogroup + "a") - rightChild.setLabel(self.haplogroup + "b") - return leftChild, rightChild + left_child.set_label(self.haplogroup + "a") + right_child.set_label(self.haplogroup + "b") + + return left_child, right_child - def sortChildren(self): - self.childList = sorted(self.childList, key=attrgetter("haplogroup")) + def sort_children(self) -> 
None: + """Sort children by haplogroup.""" - def reverseChildren(self): - self.childList.reverse() + self.child_list = sorted(self.child_list, key=attrgetter("haplogroup")) - # tree traversals + def reverse_children(self) -> None: + """Reverse child list.""" + + self.child_list.reverse() + + # Tree traversals # ---------------------------------------------------------------------- - def writeBreadthFirstTraversal(self, bfTreeFile): - "writes breadth-first traversal" + def write_breadth_first_traversal(self, bf_tree_file: TextIO) -> None: + """Write breadth-first traversal.""" + + bf_tree_file.write(self.str_dot_pipe_depth + "\n") + node_deque = deque(self.child_list) + while node_deque: + node = node_deque.popleft() + bf_tree_file.write(node.str_dot_pipe_depth + "\n") + node_deque.extend(node.child_list) - bfTreeFile.write("%s\n" % self.strDotPipeDepth()) - nodeDeque = deque(self.childList) - while nodeDeque: - node = nodeDeque.popleft() - bfTreeFile.write("%s\n" % node.strDotPipeDepth()) - nodeDeque.extend(node.childList) + def get_depth_first_node_list(self) -> list[Node]: + """Conduct depth-first pre-order traversal.""" - def getDepthFirstNodeList(self): - "wrapper function for recursive depth-first pre-order traversal" + depth_first_node_list = [self] + self.traverse_depth_first_pre_order_recursive(depth_first_node_list) - depthFirstNodeList = [self] - self.traverseDepthFirstPreOrderRecursive(depthFirstNodeList) - return depthFirstNodeList + return depth_first_node_list - def traverseDepthFirstPreOrderRecursive(self, depthFirstNodeList): - "recursively appends each node in depth-first pre order" + def traverse_depth_first_pre_order_recursive( + self, + depth_first_node_list: list[Node], + ) -> None: + """Append each node in depth-first pre order, recursively.""" - for child in self.childList: - depthFirstNodeList.append(child) - child.traverseDepthFirstPreOrderRecursive(depthFirstNodeList) + for child in self.child_list: + depth_first_node_list.append(child) + child.traverse_depth_first_pre_order_recursive(depth_first_node_list) - def mrca(self, otherNode): - "returns the most recent common ancestor of this node and another" + def mrca(self, other_node: Node) -> Node: + """Return the most recent common ancestor of this node and another.""" - if self.depth < otherNode.depth: - higherNode, lowerNode = self, otherNode + if self.depth < other_node.depth: + higher_node, lower_node = self, other_node else: - higherNode, lowerNode = otherNode, self + higher_node, lower_node = other_node, self - while higherNode.depth < lowerNode.depth: - lowerNode = lowerNode.parent - while lowerNode != higherNode: - lowerNode = lowerNode.parent - higherNode = higherNode.parent + while higher_node.depth < lower_node.depth: + assert lower_node.parent is not None + lower_node = lower_node.parent + while lower_node != higher_node: + assert lower_node.parent is not None + assert higher_node.parent is not None + lower_node = lower_node.parent + higher_node = higher_node.parent - return higherNode + return higher_node - # writing tree to file in Newick format + # Writing tree to file in Newick format # ---------------------------------------------------------------------- - def writeNewick( - self, newickFN, useHgSNPlabel=False, alignTips=False, platformVersion=None - ): - "write Newick string for the subtree rooted at this node" - - if not Node.config.suppressOutputAndLog: - with open(newickFN, "w") as outFile: - outFile.write( - "%s;\n" - % self.buildNewickStringRecursive( - useHgSNPlabel, alignTips, 
platformVersion
+    def write_newick(
+        self,
+        newick_fp: str,
+        use_hg_snp_label: bool = False,
+        align_tips: bool = False,
+        platform: Optional[str] = None,
+    ) -> None:
+        """Write Newick string for the subtree rooted at this node."""
+
+        if not type(self).config.suppress_output:
+            with open(newick_fp, "w") as out_file:
+                out_file.write(
+                    self.build_newick_string_recursive(
+                        use_hg_snp_label,
+                        align_tips,
+                        platform,
                     )
+                    + ";\n"
                 )

-        if alignTips:
-            treeDescriptor = "aligned "
-        elif platformVersion:
-            treeDescriptor = "platform v%d " % platformVersion
+        if align_tips:
+            tree_descriptor = "aligned "
+        elif platform:
+            tree_descriptor = f"platform {platform} "
         else:
-            treeDescriptor = ""
+            tree_descriptor = ""

-        if useHgSNPlabel:
-            labelType = "representative-SNP"
+        if use_hg_snp_label:
+            label_type = "representative-SNP"
         else:
-            labelType = "YCC"
+            label_type = "YCC"

-        Node.errAndLog(
-            "Wrote %stree with %s labels:\n    %s\n\n"
-            % (treeDescriptor, labelType, newickFN)
+        logger.info(
+            f"Wrote {tree_descriptor}tree with {label_type} labels:\n"
+            f"    {newick_fp}\n"
         )

-    def buildNewickStringRecursive(
-        self, useHgSNPlabel=False, alignTips=False, platformVersion=None
-    ):
-        "recursively builds Newick string for the subtree rooted at this node"
-
-        if not self.isLeaf():
-            childStringList = list()
-            for child in self.childList[::-1]:
-                childString = child.buildNewickStringRecursive(
-                    useHgSNPlabel, alignTips, platformVersion
+    def build_newick_string_recursive(
+        self,
+        use_hg_snp_label: bool = False,
+        align_tips: bool = False,
+        platform: Optional[str] = None,
+    ) -> str:
+        """Build Newick string recursively for the subtree rooted at this node."""
+
+        if not self.is_leaf():
+            child_string_list = []
+            for child in self.child_list[::-1]:
+                child_string = child.build_newick_string_recursive(
+                    use_hg_snp_label,
+                    align_tips,
+                    platform,
                 )
-                childStringList.append(childString)
-            treeStringPart1 = "(%s)" % ",".join(childStringList)
+                child_string_list.append(child_string)
+
+            children = ",".join(child_string_list)
+            tree_string_part_1 = f"({children})"
         else:
-            treeStringPart1 = ""
-
-        branchLabel = self.hgSNP if useHgSNPlabel else self.label
-        branchLength = self.getBranchLength(alignTips, platformVersion)
-        if alignTips:
-            branchString = "%s:%d" % (branchLabel, branchLength)
-        elif branchLength is None or (self.isLeaf() and branchLength == 0):
-            branchString = branchLabel
-        elif branchLength > 0:
-            branchString = "%s|%d:%d" % (branchLabel, branchLength, branchLength)
+            tree_string_part_1 = ""
+
+        branch_label = self.hg_snp if use_hg_snp_label else self.label
+        branch_length = self.get_branch_length(align_tips, platform)
+        if align_tips:
+            branch_string = f"{branch_label}:{branch_length}"
+        elif branch_length is None or (self.is_leaf() and branch_length == 0):
+            branch_string = branch_label
+        elif branch_length > 0:
+            branch_string = f"{branch_label}|{branch_length}:{branch_length}"
         else:
-            branchString = ":0.5"
+            branch_string = ":0.5"
+
+        tree_string = f"{tree_string_part_1}{branch_string}"

-        treeString = "%s%s" % (treeStringPart1, branchString)
-        return treeString
+        return tree_string
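For intuition, the branch-string logic above renders a positive-length branch as "label|length:length", a zero-length leaf as its bare label, and a zero-length internal branch as ":0.5". A condensed, self-contained sketch of that logic (the function name, labels, and lengths are made up; the real method also reverses child order and supports tip alignment):

    from typing import Optional

    def toy_newick(label: str, children: list[str], branch_length: Optional[int]) -> str:
        """Render one branch in the Newick shape produced above."""
        child_part = "(" + ",".join(children) + ")" if children else ""
        if branch_length is None or (not children and branch_length == 0):
            branch = label
        elif branch_length > 0:
            branch = f"{label}|{branch_length}:{branch_length}"
        else:
            branch = ":0.5"
        return child_part + branch

    tree = toy_newick("A", [toy_newick("B", [], 2), toy_newick("C", [], 0)], 1)
    print(tree + ";")  # (B|2:2,C)A|1:1;

diff --git a/yhaplo/page.py b/yhaplo/page.py
deleted file mode 100644
index 58716de..0000000
--- a/yhaplo/page.py
+++ /dev/null
@@ -1,33 +0,0 @@
-#!/usr/bin/env python
-#
-# David Poznik
-# 2016.7.26
-# page.py
-#
-# Defines the Page class. 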
-# ---------------------------------------------------------------------- - - -class Page(object): - "A simple container for a 23andMe content page" - - stringTp = "%-10s %s" - - def __init__(self, yccOld, snpName): - self.yccOld = yccOld - self.snpName = snpName - self.node = None - - def __str__(self): - if not self.node: - return Page.stringTp % (".", ".") - elif self.node.isRoot(): - return Page.stringTp % (self.node.haplogroup, self.node.haplogroup) - else: - return Page.stringTp % (self.node.hgSNP, self.node.haplogroup) - - def strFull(self): - return "%-10s %-10s %s" % (self.yccOld, self.snpName, str(self)) - - def setNode(self, node): - self.node = node diff --git a/yhaplo/path.py b/yhaplo/path.py index 72c92a4..32df5e9 100644 --- a/yhaplo/path.py +++ b/yhaplo/path.py @@ -1,193 +1,225 @@ -# David Poznik -# 2016.06.08 -# path.py -# -# Defines the Path class. -# ---------------------------------------------------------------------- -from __future__ import absolute_import +"""Define Path class.""" + +from __future__ import annotations from collections import deque +from collections.abc import Sequence +from typing import Optional -from .snp import SNP +from yhaplo import node as node_module # noqa F401 +from yhaplo import snp as snp_module # noqa F401 -class Path(object): - """ - An instance of this class represents a path through a tree. - It stores the next node to visit, a list of SNPs observed in the derived state, - the most derived SNP observed, and the number of ancestral alleles encountered. - """ +class Path: - def __init__(self, node): - self.node = node - self.derSNPlist = list() - self.mostDerivedSNP = None - self.numAncestral = 0 - self.initPushThroughVars() - - def initPushThroughVars(self): - """ - initializes variables that track progress subsequent to pushing through - a branch with 1 ancestral and 0 derived alleles - """ + """Class representing a path through a haplogroup tree. - self.nodeWhenPushedThrough = None - self.mostDerivedSNPWhenPushedThrough = None - self.numAncSincePushThrough = 0 - self.numDerSincePushThrough = 0 + Instances store: + - The next node to visit + - A list of SNPs observed in the derived state + - The most derived SNP observed + - The number of ancestral alleles encountered. - def setPushThroughVars(self): - "set memory of pushthough state to current state" - - self.nodeWhenPushedThrough = self.node - self.mostDerivedSNPWhenPushedThrough = self.mostDerivedSNP + """ - def updatePushThroughVars(self, numAncestral, numDerived): - "update pushthrough state with data from most recent branch assessment" + def __init__( + self, + node: "node_module.Node", + ): + self.node = node + self.der_snp_list: list["snp_module.SNP"] = [] + self.most_derived_snp: Optional["snp_module.SNP"] = None + self.num_ancestral = 0 + self.init_push_through_vars() - self.numAncSincePushThrough += numAncestral - self.numDerSincePushThrough += numDerived - if self.numDerSincePushThrough > self.numAncSincePushThrough: - self.initPushThroughVars() + def init_push_through_vars(self) -> None: + """Initialize "push-through" variables. - def copyAllAttributesOtherThanNode(self, other): - "copies all attributes of another path, other than its node" + These track progress subsequent to "pushing through" a branch with + one ancestral allele and no derived alleles. 
-        self.derSNPlist = list(other.derSNPlist)
-        self.mostDerivedSNP = other.mostDerivedSNP
-        self.numAncestral = other.numAncestral
+        """
+        self.node_when_pushed_through: Optional["node_module.Node"] = None
+        self.most_derived_snp_when_pushed_through: Optional["snp_module.SNP"] = None
+        self.num_anc_since_push_through = 0
+        self.num_der_since_push_through = 0
+
+    def set_push_through_vars(self) -> None:
+        """Set memory of push-through state to current state."""
+
+        self.node_when_pushed_through = self.node
+        self.most_derived_snp_when_pushed_through = self.most_derived_snp
+
+    def update_push_through_vars(
+        self,
+        num_ancestral: int,
+        num_derived: int,
+    ) -> None:
+        """Update push-through state with data from most recent branch assessment."""
+
+        self.num_anc_since_push_through += num_ancestral
+        self.num_der_since_push_through += num_derived
+        if self.num_der_since_push_through > self.num_anc_since_push_through:
+            self.init_push_through_vars()
+
+    def copy_all_attributes_other_than_node(self, other: Path) -> None:
+        """Copy all attributes of another path, other than its node."""
+
+        self.der_snp_list = list(other.der_snp_list)
+        self.most_derived_snp = other.most_derived_snp
+        self.num_ancestral = other.num_ancestral
+
+        self.node_when_pushed_through = other.node_when_pushed_through
+        self.most_derived_snp_when_pushed_through = (
+            other.most_derived_snp_when_pushed_through
+        )
+        self.num_anc_since_push_through = other.num_anc_since_push_through
+        self.num_der_since_push_through = other.num_der_since_push_through

-        self.nodeWhenPushedThrough = other.nodeWhenPushedThrough
-        self.mostDerivedSNPWhenPushedThrough = other.mostDerivedSNPWhenPushedThrough
-        self.numAncSincePushThrough = other.numAncSincePushThrough
-        self.numDerSincePushThrough = other.numDerSincePushThrough
+    def __str__(self) -> str:
+        """Return string representation."""

-    def __str__(self):
-        return "%d %d\n%s\n%s\n" % (
-            self.numAncestral,
-            self.numDerived,
-            self.nodeString,
-            self.snpString,
+        str_ = (
+            f"{self.num_ancestral} {self.num_derived}\n"
+            f"{self.node_string}\n"
+            f"{self.snp_string}"
         )

-    # properties
+        return str_
+
+    # Properties
     # ----------------------------------------------------------------------
     @property
-    def hasPushedThrough(self):
-        "whether or not this path has pushed through a branch with 1 ancestral and 0 derived"
+    def has_pushed_through(self) -> bool:
+        """Whether or not this path has "pushed through".

-        return self.nodeWhenPushedThrough is not None
+        That is, whether or not a path has proceeded past a branch with
+        one ancestral allele and no derived alleles. 
+ + """ + has_pushed_through = self.node_when_pushed_through is not None + return has_pushed_through @property - def nodeString(self): - "string concatenation of nodes visited" + def node_string(self) -> str: + """String concatenation of nodes visited.""" - return " ".join([node.label for node in self.node.backTracePath()]) + node_string = " ".join([node.label for node in self.node.back_trace_path()]) + return node_string @property - def numDerived(self): - "number of derived SNPs in the list" + def num_derived(self) -> int: + """Number of derived SNPs in the list.""" - return len(self.derSNPlist) + num_derived = len(self.der_snp_list) + return num_derived @property - def snpString(self): - "string concatenation of derived SNPs observed" + def snp_string(self) -> str: + """String concatenation of derived SNPs observed.""" - return " ".join([snp.label for snp in self.derSNPlist]) + snp_string = " ".join([snp.label for snp in self.der_snp_list]) + return snp_string - # regular methods + # Regular methods # ---------------------------------------------------------------------- - def betterThan(self, other): - "evaluates whether this path is better than another" + def better_than(self, other: Path) -> bool: + """Evaluate whether this path is better than another.""" - return ( + better_than = ( other is None - or self.numDerived > other.numDerived + or self.num_derived > other.num_derived or ( - self.numDerived == other.numDerived - and self.numAncestral < other.numAncestral + self.num_derived == other.num_derived + and self.num_ancestral < other.num_ancestral ) ) + return better_than + + def fork(self, node_list: Sequence["node_module.Node"]) -> deque[Path]: + """Fork path. + + Returns + ------- + path_deque : deque[Path] + Deque of paths, each of which is identical to self, + but with a new current node. - def fork(self, nodeList): - """ - returns a deque of paths, each of which is identical to self - but with a new current node """ + path_deque: deque[Path] = deque() + for node in node_list: + path = type(self)(node) + path.copy_all_attributes_other_than_node(self) + path_deque.append(path) - pathDeque = deque() - for node in nodeList: - path = Path(node) - path.copyAllAttributesOtherThanNode(self) - pathDeque.append(path) + return path_deque - return pathDeque + def revert_if_pushed_through_too_far(self) -> None: + """Revert path to its state prior to pushing through. - def revertIfPushedThroughTooFar(self): - """ - if the path has pushed through a branch with 1 ancestral and 0 derived - and, after doing so, it has encountered just one derived allele and a nonzero - number of ancestral alleles, revert the path to its state before pushing through - """ + Do so if the path has pushed through a branch with one ancestral allele + and no derived alleles and, since doing so, it has encountered just + one derived allele and a nonzero number of ancestral alleles. 
+        """
         if (
-            self.hasPushedThrough
-            and self.numAncSincePushThrough > 0
-            and self.numDerSincePushThrough == 1
+            self.has_pushed_through
+            and self.num_anc_since_push_through > 0
+            and self.num_der_since_push_through == 1
         ):
-            self.node = self.nodeWhenPushedThrough
-            del self.derSNPlist[-1]
-            self.mostDerivedSNP = self.mostDerivedSNPWhenPushedThrough
-            self.numAncestral -= self.numAncSincePushThrough
-            self.initPushThroughVars()
+            assert isinstance(self.node_when_pushed_through, node_module.Node)
+            self.node = self.node_when_pushed_through
+            del self.der_snp_list[-1]
+            self.most_derived_snp = self.most_derived_snp_when_pushed_through
+            self.num_ancestral -= self.num_anc_since_push_through
+            self.init_push_through_vars()
+
+    def update_with_branch_assessment(
+        self,
+        anc_snp_list: Sequence["snp_module.SNP"],
+        der_snp_list: Sequence["snp_module.SNP"],
+    ) -> None:
+        """Update with branch assessment.
+
+        Extend derived SNP list.
+        Set most derived SNP.
+        Add number of ancestral alleles seen.
+        Track whether or not path has pushed through an (anc, der) == (1, 0) branch.

-    def updateWithBranchAssessment(self, ancSNPlist, derSNPlist):
-        """
-        extends derived SNP list, sets most derived SNP, and adds number of
-        ancestral alleles seen. also, manages tracking of whether or not path
-        has pushed through an (anc,der)==(1,0) branch
         """
-
-        numAncestral, numDerived = len(ancSNPlist), len(derSNPlist)
-        self.numAncestral += numAncestral
-        self.derSNPlist.extend(derSNPlist)
-        if derSNPlist:
-            self.mostDerivedSNP = SNP.mostHighlyRankedMarkerOnList(derSNPlist)
-
-        if self.hasPushedThrough:
-            self.updatePushThroughVars(numAncestral, numDerived)
-        elif (numAncestral, numDerived) == (1, 0):
-            self.setPushThroughVars()
-
-    # static methods
+        num_ancestral, num_derived = len(anc_snp_list), len(der_snp_list)
+        self.num_ancestral += num_ancestral
+        self.der_snp_list.extend(der_snp_list)
+        if der_snp_list:
+            self.most_derived_snp = der_snp_list[0]
+
+        if self.has_pushed_through:
+            self.update_push_through_vars(num_ancestral, num_derived)
+        elif (num_ancestral, num_derived) == (1, 0):
+            self.set_push_through_vars()
+
+    # Class methods
     # ----------------------------------------------------------------------
-    @staticmethod
-    def createPathDeque(nodeList):
-        "returns a deque of paths, with the node of each initialized from the given list"
-
-        pathDeque = deque()
-        for node in nodeList:
-            pathDeque.append(Path(node))
+    @classmethod
+    def create_path_deque(cls, node_list: Sequence["node_module.Node"]) -> deque[Path]:
+        """Return a deque of paths, each corresponding to one node in node_list."""

-        return pathDeque
+        path_deque: deque[Path] = deque()
+        for node in node_list:
+            path_deque.append(cls(node))

-    @staticmethod
-    def postProcessPathListAndSelectBest(pathList):
-        "post-processes each path in list and returns the best one"
+        return path_deque

-        for path in pathList:
-            path.revertIfPushedThroughTooFar()
-        return Path.bestPathInList(pathList)

+def post_process_path_list_and_select_best(path_list: Sequence[Path]) -> Path:
+    """Post-process each Path and return the best."""

-    @staticmethod
-    def bestPathInList(pathList):
-        "selects the best from a list of paths"
+    for path in path_list:
+        path.revert_if_pushed_through_too_far()

-        bestPath = None
-        for path in pathList:
-            if path.betterThan(bestPath):
-                bestPath = path
+    best_path = path_list[0]
+    for path in path_list[1:]:
+        if path.better_than(best_path):
+            best_path = path

-        return bestPath
+    return best_path
diff --git a/yhaplo/plot_tree.py b/yhaplo/plot_tree.py
deleted file mode 100755 index d4972bb..0000000 --- a/yhaplo/plot_tree.py +++ /dev/null @@ -1,61 +0,0 @@ -#!/usr/bin/env python -# -# David Poznik -# 2016.01.07 -# plot_tree.py -# -# To run: python -m yhaplo.plot_tree -# ---------------------------------------------------------------------- -from __future__ import absolute_import - -import argparse -import sys - -try: - from Bio import Phylo -except ImportError: - sys.exit( - "\nERROR. Please install Biopython with the following command:\n\n\t" - + "pip install biopython\n" - ) - -from .config import Config - -DESCRIPTION = "plots a newick tree" - -# ---------------------------------------------------------------------- -def main(): - args = get_args() - phyloTree = Phylo.read(args.newickFN, "newick") - if args.draw: - Phylo.draw(phyloTree) - else: - Phylo.draw_ascii(phyloTree) - - -def get_args(): - parser = argparse.ArgumentParser( - description=DESCRIPTION, formatter_class=argparse.RawTextHelpFormatter - ) - parser.add_argument( - "-d", - "--draw", - action="store_true", - default=False, - help="draw tree, rather than printing ascii version", - ) - parser.add_argument( - "-n", - "--newickFN", - type=str, - default=Config.primaryTreeFN, - help="name of file containing newick tree to plot", - ) - - args = parser.parse_args() - - return args - - -if __name__ == "__main__": - main() diff --git a/yhaplo/sample.py b/yhaplo/sample.py index efc864c..d5ca681 100644 --- a/yhaplo/sample.py +++ b/yhaplo/sample.py @@ -1,1028 +1,740 @@ -# David Poznik -# 2016.1.26 -# sample.py -# -# Defines two classes: -# - Sample -# - Customer (a subclass of Sample) -# ---------------------------------------------------------------------- -from __future__ import absolute_import - -import glob +"""Define Sample class and subclasses specific to various input formats. + +Classes defined herein include: +* Sample +* TextSample +* VCFSample +* AblockSample + +""" + +from __future__ import annotations + +import argparse +import csv import gzip +import logging import os -import sys -import warnings +import re from collections import defaultdict from operator import attrgetter +from typing import Optional + +import pandas as pd -import six -from six.moves import range +from yhaplo import node as node_module # noqa F401 +from yhaplo import snp as snp_module # noqa F401 +from yhaplo import tree as tree_module # noqa F401 +from yhaplo.config import IID_TYPE, Config +from yhaplo.utils.optional_dependencies import check_vcf_dependencies +from yhaplo.utils.vcf import check_vcf_index -# only used within 23andMe research env try: - import numpy as np - import pandas as pd - import rtk23.lib.coregen as coregen + from pysam import VariantFile except ImportError: pass -from . import utils -from .snp import PlatformSNP +logger = logging.getLogger(__name__) -# ---------------------------------------------------------------------- -class Sample(object): - """ - A sample: - - holds a genotype dictionary - - knows its haplogroup. +def call_haplogroups_from_config(config: Config) -> pd.DataFrame: + """Call haplogroups from a Config instance. + + Parameters + ---------- + config : Config + Yhaplo Config instance. + + Returns + ------- + haplogroup_df : pd.DataFrame + DataFrame of haplogroup calling results. + Index: Individual identifier. + Columns: + - hg_snp_obs: Haplogroup using a variant of representative-SNP form. + Rather than using one representative SNP per haplogroup, + use the most highly ranked SNP this individual was observed + to possess in the derived state. 
+ - hg_snp: Haplogroup in representative-SNP form (e.g., "Q-M3"). + - ycc_haplogroup: Haplogroup using YCC nomenclature (e.g., "Q1a2a1a1"). + """ + if config.run_from_sample_major_txt: + TextSample.call_haplogroups(config) + elif config.run_from_vcf: + VCFSample.call_haplogroups(config) + else: + logger.info("Mode: No input data\n") + tree_module.Tree(config) - tree = None - config = None - args = None - errAndLog = None - numAssigned = 0 - numRootCalls = 0 - sampleList = list() - prevCalledHaplogroupDict = dict() - - def __init__(self, ID, sampleIndex=None): - self.ID = ID - self.sampleIndex = sampleIndex # for snp-major data - self.pos2genoDict = dict() - - self.ancDerCountTupleList = list() - self.haplogroupNode = None - self.mostDerivedSNP = None - self.derSNPlist = None - self.ancSNPlist = None - - self.prevCalledHaplogroup = self.config.missingHaplogroup - self.prevCalledHaplogroupDFSrank = 0 - if Sample.config.compareToPrevCalls: - self.setPrevCalledHaplogroup() + haplogroup_df = Sample.haplogroup_df() - @property - def haplogroup(self): - "haplogroup: YCC nomenclature (e.g., Q1a2a1a1)" + return haplogroup_df - return self.haplogroupNodeAttribute("haplogroup") - @property - def hgSNP(self): - "haplogroup: representative-SNP form (e.g., Q-M3)" +class Sample: - return self.haplogroupNodeAttribute("hgSNP") + """Class representing an individual. - @property - def hgTrunc(self): - "haplogroup: truncated form (e.g., Q1a2a1a1 -> Q)" + A sample knows its: + - Genotypes + - Haplogroup, once called - return self.haplogroupNodeAttribute("hgTrunc") + """ - @property - def hgSNPobs(self): - """ - like hgSNP, but rather than one representative SNP per haplogroup, - uses the most highly ranked SNP this individual was observed to possess - """ + config: Config + args: argparse.Namespace + tree: "tree_module.Tree" + tree_has_been_set: bool = False - if self.mostDerivedSNP: - return self.mostDerivedSNP.hgSNP - elif self.haplogroupNode: - return Sample.tree.root.haplogroup - else: - self.prematureHaplogroupAccessError() + num_assigned = 0 + num_root_calls = 0 + sample_list: list[Sample] = [] - @property - def haplogroupDFSrank(self): - "depth-first-search ranking of haplogroup node" + def __init__(self, iid: IID_TYPE): + """Construct Sample instance and append to `Sample.sample_list`.""" - return self.haplogroupNodeAttribute("DFSrank") + self.iid = iid + self.haplogroup_node: Optional["node_module.Node"] = None + self.most_derived_snp: Optional["snp_module.SNP"] = None + self.der_snp_list: list["snp_module.SNP"] + self.anc_snp_list: list["snp_module.SNP"] + self.anc_der_count_tuples: list[tuple["node_module.Node", int, int]] - def haplogroupNodeAttribute(self, attribute): - "look up attribute of self.haplogroupNode" + type(self).sample_list.append(self) - if self.haplogroupNode: - return getattr(self.haplogroupNode, attribute) - else: - self.prematureHaplogroupAccessError() + def __str__(self) -> str: + """Return string representation.""" - def prematureHaplogroupAccessError(self): - sys.exit("ERROR. 
Attempted to access haplogroup before assigned: %s" % self.ID) + sample_string = ( + f"{str(self.iid):8s} {self.hg_snp_obs:15s} " + f"{self.hg_snp:15s} {self.haplogroup:25s}" + ) + return sample_string - # string representations and output + # Haplogroup calling # ---------------------------------------------------------------------- - def __str__(self): - "string representation gets extra information if previous calls exist" - - sampleString = "%-8s %-15s %-15s %-25s" % ( - self.ID, - self.hgSNPobs, - self.hgSNP, - self.haplogroup, - ) + def call_haplogroup(self) -> None: + """Call haplogroup.""" + + type(self).num_assigned += 1 + tree = type(self).tree + ( + path, + self.anc_snp_list, + self.anc_der_count_tuples, + ) = tree.identify_phylogenetic_path(self) + self.der_snp_list = path.der_snp_list + self.most_derived_snp = path.most_derived_snp + + if self.most_derived_snp: + self.haplogroup_node = self.most_derived_snp.node + else: + type(self).num_root_calls += 1 + self.haplogroup_node = type(self).tree.root - if Sample.config.compareToPrevCalls: - prevHgStart = self.prevCalledHaplogroup[: Sample.config.numCharsToCompare] - hgStart = self.haplogroup[: Sample.config.numCharsToCompare] - matchFlag = "." if prevHgStart == hgStart else "*" - sampleString = "%s %-25s %s" % ( - sampleString, - self.prevCalledHaplogroup, - matchFlag, - ) + try: + self.fix_haplogroup_if_artifact() + except NotImplementedError: + pass - return sampleString + self.write_real_time_output() + self.purge_data() - def strCompressed(self): - return utils.compressWhitespace(str(self)) + def get_genotype(self, position: int) -> str: + """Return consensus genotype for position. Subclasses must override.""" - def strSimple(self): - "string representation with just ID, haplogroup, and hgSNP" + raise NotImplementedError - return "%-8s %-25s %-15s" % (self.ID, self.haplogroup, self.hgSNP) + def fix_haplogroup_if_artifact(self) -> None: + """Fix artifactual haplogroup assignments. 
Subclasses may override.""" - def strForCounts(self): - "string representation for anc/der counts output" + raise NotImplementedError - leftPart = "%-8s %s" % (self.ID, self.haplogroup) - rightPart = "%s %s" % (self.hgSNPobs, self.hgSNP) - if Sample.config.compareToPrevCalls: - return "%s %s | %s" % (leftPart, self.prevCalledHaplogroup, rightPart) - else: - return "%s | %s" % (leftPart, rightPart) + def write_real_time_output(self) -> None: + """Write real-time output if requested.""" - def strSNPs(self, ancestral): - "constructs a string representation with derived or ancestral SNPs" + args, config = type(self).args, type(self).config - if ancestral: - snpList = self.ancSNPlist - else: - snpList = self.derSNPlist + if args.write_haplogroups_real_time: + config.haplogroup_real_time_file.write( + f"{str(self)} {self.haplogroup_dfs_rank:5d}\n" + ) - snpListString = " ".join(snp.strShort() for snp in snpList) - return "%s | %s" % (self.strSimple(), snpListString) + if args.haplogroup_to_list_genotypes_for: + config.hg_genos_file.write(f"{self.str_compressed}\n\n") - def strHaplogroupPath(self, include_SNPs=False): - "constructs a string representation with haplogroup path" + def purge_data(self) -> None: + """Clear data structures if no longer needed.""" - if self.mostDerivedSNP: - snp_label_list_dict = defaultdict(list) - for snp in self.derSNPlist: - snp_label_list_dict[snp.node.haplogroup].append(snp.labelCleaned) + args = type(self).args - path_info_list = list() - for node in self.mostDerivedSNP.backTracePath(): - if node.haplogroup in snp_label_list_dict: - snp_label_list = snp_label_list_dict[node.haplogroup] - num_snps = len(snp_label_list) - path_info = "%s:%d" % (node.haplogroup, num_snps) - if include_SNPs: - path_info = "%s:%s" % (path_info, ",".join(snp_label_list)) - path_info_list.append(path_info) + if not ( + args.write_der_snps + or args.write_der_snps_detail + or args.write_haplogroup_paths + or args.write_haplogroup_paths_detail + ): + self.der_snp_list.clear() - haplogroup_path = " ".join(path_info_list) - else: - haplogroup_path = "" + if not (args.write_anc_snps or args.write_anc_snps_detail): + self.anc_snp_list.clear() - return "%s | %s" % (self.strSimple(), haplogroup_path) + if not self.args.write_anc_der_counts: + self.anc_der_count_tuples.clear() - def realTimeOutput(self): - "generate real-time output if requested" + # Haplogroup properties + # ---------------------------------------------------------------------- + @property + def haplogroup(self) -> str: + """Return haplogroup using YCC nomenclature (e.g., "Q1a2a1a1").""" - if Sample.args.writeHaplogroupsRealTime: - output = "%s %5d\n" % (str(self), self.haplogroupDFSrank) - Sample.config.haplogroupRealTimeFile.write(output) + if self.haplogroup_node is None: + raise RuntimeError(f"Haplogroup not yet computed for {self.iid}") - if Sample.args.haplogroupToListGenotypesFor: - Sample.config.hgGenosFile.write("%s\n\n" % self.strCompressed()) + return self.haplogroup_node.haplogroup - # previously called haplogroups - # ---------------------------------------------------------------------- - def setPrevCalledHaplogroup(self): - """ - sets previously called haplogroup for testing/comparison. 
- also sets corresponding DFS rank for sorting - """ + @property + def hg_snp(self) -> str: + """Return haplogroup in representative-SNP form (e.g., "Q-M3").""" - if not Sample.prevCalledHaplogroupDict: - Sample.importPrevCalledHaplogroups() - if self.ID in Sample.prevCalledHaplogroupDict: - self.prevCalledHaplogroup = Sample.prevCalledHaplogroupDict[self.ID] - else: - Sample.errAndLog( - "WARNING. No previously called haplogroup for: %s\n" % self.ID - ) + if self.haplogroup_node is None: + raise RuntimeError(f"Haplogroup not yet computed for {self.iid}") - self.setPrevCalledHaplogroupDFSrank() + return self.haplogroup_node.hg_snp - @staticmethod - def importPrevCalledHaplogroups(): - """ - reads file with previously called haplogroups, - assuming first col = ID & last col = haplogroup - """ + @property + def hg_trunc(self) -> str: + """Return haplogroup in truncated form (e.g., "Q").""" - utils.checkFileExistence( - Sample.config.prevCalledHgFN, "Previously called haplogroups" - ) - with open(Sample.config.prevCalledHgFN, "r") as prevCalledHgFile: - for line in prevCalledHgFile: - lineList = line.strip().split() - ID, prevCalledHaplogroup = lineList[0], lineList[-1] - Sample.prevCalledHaplogroupDict[ID] = prevCalledHaplogroup - - Sample.errAndLog( - "%sRead previously called haplogroups:\n %s\n\n" - % (utils.DASHES, Sample.config.prevCalledHgFN) - ) + if self.haplogroup_node is None: + raise RuntimeError(f"Haplogroup not yet computed for {self.iid}") - def setPrevCalledHaplogroupDFSrank(self, ignore=False): - "sets depth-first search rank of previously called haplogroup" + return self.haplogroup_node.hg_trunc - hg2nodeDict = Sample.tree.hg2nodeDict - if not ignore and self.prevCalledHaplogroup != self.config.missingHaplogroup: - haplogroupKey = self.prevCalledHaplogroup - while haplogroupKey not in hg2nodeDict and len(haplogroupKey) > 0: - haplogroupKey = haplogroupKey[:-1] - if haplogroupKey in hg2nodeDict: - self.prevCalledHaplogroupDFSrank = hg2nodeDict[haplogroupKey].DFSrank + @property + def hg_snp_obs(self) -> str: + """Return haplogroup using a variant of representative-SNP form. + + Rather than using one representative SNP per haplogroup, + use the most highly ranked SNP this individual was observed to possess + in the derived state. - # mutaters - # ---------------------------------------------------------------------- - def addGeno(self, position, genotype): - """ - adds one value to the genotype dictionary - if a contradiction is encountered, sets value to missing - Note: there is no reason to call this method with a missing genotype, - this should be the only way missing values enter the dictionary. 
""" + if self.haplogroup_node is None: + raise RuntimeError(f"Haplogroup not yet computed for {self.iid}") - if position in self.pos2genoDict: - if genotype != self.pos2genoDict[position]: - self.pos2genoDict[position] = self.config.missingGenotype + if self.most_derived_snp: + hg_snp_obs = self.most_derived_snp.hg_snp + elif self.haplogroup_node: + hg_snp_obs = type(self).tree.root.haplogroup else: - self.pos2genoDict[position] = genotype + raise RuntimeError(f"Haplogroup not yet computed for {self.iid}") - def appendAncDerCountTuple(self, node, numAncestral, numDerived): - "stores results of search path" + return hg_snp_obs - ancDerCountTuple = (node, numAncestral, numDerived) - self.ancDerCountTupleList.append(ancDerCountTuple) + @property + def haplogroup_dfs_rank(self) -> int: + """Return depth-first-search ranking of haplogroup node.""" - def callHaplogroup(self): - "finds path through tree and returns haplogroup" + if self.haplogroup_node is None: + raise RuntimeError(f"Haplogroup not yet computed for {self.iid}") - Sample.numAssigned += 1 - path, self.ancSNPlist = Sample.tree.identifyPhylogeneticPath(self) - self.derSNPlist = path.derSNPlist - self.mostDerivedSNP = path.mostDerivedSNP + haplogroup_dfs_rank = self.haplogroup_node.dfs_rank + return haplogroup_dfs_rank - if self.mostDerivedSNP: - self.haplogroupNode = self.mostDerivedSNP.node - else: - Sample.numRootCalls += 1 - self.haplogroupNode = Sample.tree.root + # String-representation properties and methods + # ---------------------------------------------------------------------- + @property + def str_compressed(self) -> str: + """Return compressed string representation.""" + + str_compressed = re.sub(r"\s+", " ", str(self)) + return str_compressed - self.fixHaplogroupIfArtifact() - self.realTimeOutput() - self.freeUpMemory() + @property + def str_simple(self) -> str: + """Return string representation with just iid, haplogroup, and hg_snp.""" - def fixHaplogroupIfArtifact(self): - "fixes artifactual haplogroup assignments; override as appropriate" + return f"{str(self.iid):8s} {self.haplogroup:25s} {self.hg_snp:15s}" - pass + @property + def str_for_counts(self) -> str: + """Return string representation for ancestral/derived counts output.""" - def freeUpMemory(self): - "free up some memory if possible" + left_part = f"{str(self.iid):8s} {self.haplogroup}" + right_part = f"{self.hg_snp_obs} {self.hg_snp}" + str_for_counts = f"{left_part} | {right_part}" - self.pos2genoDict = None - if not ( - Sample.args.writeDerSNPs - or Sample.args.writeDerSNPsDetail - or Sample.args.writeHaplogroupPaths - or Sample.args.writeHaplogroupPathsDetail - ): - self.derSNPlist = None - if not (Sample.args.writeAncSNPs or Sample.args.writeAncSNPsDetail): - self.ancSNPlist = None + return str_for_counts - # ---------------------------------------------------------------------- - # Run: main entry point - # ---------------------------------------------------------------------- - @staticmethod - def callHaplogroups(config, tree): - "this method is the entry point and is to be called from outside." 
- - Sample.setTreeConfigAndArgs(config, tree) - Sample.testNumberOfRunModes(config) - - if config.runFromSampleMajorTxt: - Sample.runFromSampleMajorTxt() - elif config.runFromVCF or config.runFromVCF4: - Sample.runFromVCF() - elif config.runFromAblocks: - Customer.runFromAblocks() + def str_snps( + self, + allele_state: str = "derived", + ) -> str: + """Return string representation with derived or ancestral SNPs.""" + + if allele_state == "derived": + snp_list = self.der_snp_list + elif allele_state == "ancestral": + snp_list = self.anc_snp_list else: - Sample.errAndLog( - "%sNo input genotypes specified. Exiting.\n\n" % utils.DASHES + raise ValueError( + f"allele_state must be 'ancestral' or 'derived', not '{allele_state}'" ) - config.closeFiles() + snp_list_string = " ".join(snp.str_short for snp in snp_list) + str_snps = f"{self.str_simple} | {snp_list_string}" - @staticmethod - def setTreeConfigAndArgs(config, tree): - "enables Sample class to know about the tree instance, config, and args" + return str_snps - Sample.config = config - Sample.args = config.args - Sample.errAndLog = config.errAndLog - Sample.tree = tree + def str_haplogroup_path( + self, + include_snps: bool = False, + ) -> str: + """Return string representation with haplogroup path.""" - if Sample.args.writeHaplogroupsRealTime: - Sample.realTimeHaplogroupWritingMessage() + if self.most_derived_snp: + snp_label_list_dict = defaultdict(list) + for snp in self.der_snp_list: + snp_label_list_dict[snp.node.haplogroup].append(snp.label_cleaned) - @staticmethod - def realTimeHaplogroupWritingMessage(): - "emit a message for real-time haplogroup writing" + path_info_list = [] + for node in self.most_derived_snp.back_trace_path(): + if node.haplogroup in snp_label_list_dict: + snp_label_list = snp_label_list_dict[node.haplogroup] + num_snps = len(snp_label_list) + path_info = f"{node.haplogroup}:{num_snps}" + if include_snps: + path_info = f"{path_info}:{','.join(snp_label_list)}" - Sample.errAndLog( - "%sWill write haplogroups as they are called:\n" % utils.DASHES - + " %s\n\n" % Sample.config.haplogroupRealTimeFN - + "Note: This file includes DFS rank, so it can be sorted ex post facto with:\n" - + " sort -nk5 %s\n\n" % Sample.config.haplogroupRealTimeFN - ) + path_info_list.append(path_info) - @staticmethod - def testNumberOfRunModes(config): - "consistency check the number of run modes" + haplogroup_path = " ".join(path_info_list) + else: + haplogroup_path = "" - numberOfRunModesSelected = ( - config.runFromSampleMajorTxt - + config.runFromVCF - + config.runFromVCF4 - + config.runFromAblocks - ) - if numberOfRunModesSelected > 1: - sys.exit( - "ERROR. Expecting no more than one run mode\n" - + " %d selected\n" % numberOfRunModesSelected - ) + str_haplogroup_path = f"{self.str_simple} | {haplogroup_path}" + return str_haplogroup_path + # Class methods: results # ---------------------------------------------------------------------- - # Run option 1: sample-major text data - # ---------------------------------------------------------------------- - @staticmethod - def runFromSampleMajorTxt(): - "run pipeline on sample-major data" - - Sample.processSampleMajorTxtandCallHaplogroups() - Sample.sortSampleList() - Sample.writeSampleList() + @classmethod + def haplogroup_df(cls) -> pd.DataFrame: + """Return DataFrame of haplogroup calling results. + + Returns + ------- + haplogroup_df : pd.DataFrame + DataFrame of haplogroup calling results. + Index: Individual identifier. 
+            Columns:
+            - hg_snp_obs: Haplogroup using a variant of representative-SNP form.
+                  Rather than using one representative SNP per haplogroup,
+                  use the most highly ranked SNP this individual was observed
+                  to possess in the derived state.
+            - hg_snp: Haplogroup in representative-SNP form (e.g., "Q-M3").
+            - ycc_haplogroup: Haplogroup using YCC nomenclature (e.g., "Q1a2a1a1").

-    @staticmethod
-    def processSampleMajorTxtandCallHaplogroups():
         """
-        reads in sample major data, calling haplogroup for each line.
-        returns list of sample objects with genotype data purged.
+        haplogroup_df = pd.DataFrame(
+            [
+                (sample.iid, sample.hg_snp_obs, sample.hg_snp, sample.haplogroup)
+                for sample in cls.sample_list
+            ],
+            columns=["iid", "hg_snp_obs", "hg_snp", "ycc_haplogroup"],
+        ).set_index("iid")

-        assumed format:
-        row 1 = physical coordinates
-        column 1 = sample ID
-        """
+        return haplogroup_df

-        genoFN = Sample.args.dataFN
-        genoFile, genoReader = utils.getCSVreader(genoFN, delimiter="\t")
-        Sample.errAndLog(
-            "%sReading genotype data:\n    %s\n\n" % (utils.DASHES, genoFN)
-        )
+    # Class methods: configuration
+    # ----------------------------------------------------------------------
+    @classmethod
+    def configure(cls, config: Config) -> None:
+        """Configure class.

-        # determine relevant physical coordinates and corresponding columns
-        allPositionsList = [int(position) for position in six.next(genoReader)[1:]]
-        columnPositionTupleList = list()
-        for column, position in enumerate(allPositionsList):
-            if position in Sample.tree.snpPosSet:
-                columnPositionTupleList.append((column, position))
+        Set:
+        - Config instance
+        - Tree instance
+        - Parameters

-        # read genotypes, call haplogroups
-        for genoList in genoReader:
-            ID, genoList = genoList[0], genoList[1:]
-            if Sample.args.singleSampleID and ID != Sample.args.singleSampleID:
-                continue
+        """
+        cls.config = config
+        cls.args = cls.config.args
+        if not cls.tree_has_been_set:
+            cls.tree = tree_module.Tree(config)
+        else:
+            logger.info("\nUsing previously constructed tree\n")
+
+        cls.num_assigned = 0
+        cls.num_root_calls = 0
+        cls.sample_list.clear()
+        cls.check_number_of_run_modes()
+
+        if cls.args.write_haplogroups_real_time:
+            logger.info(
+                f"\nWill write haplogroups as they are called:\n"
+                f"    {cls.config.haplogroup_real_time_fp}\n"
+                "Note: This file includes DFS rank, so it can be sorted ex post facto with:\n"
+                f"    sort -nk5 {cls.config.haplogroup_real_time_fp}\n"
+            )

-            sample = Sample(ID)
-            for column, position in columnPositionTupleList:
-                genotype = genoList[column]
-                if genotype != Sample.config.missingGenotype:
-                    sample.addGeno(position, genotype)
+        if config.run_from_sample_major_txt:
+            input_description = f"sample-major text file:\n    {cls.args.data_fp}"
+        elif config.run_from_vcf:
+            input_description = f"variant-major VCF/BCF file:\n    {cls.args.data_fp}"
+        else:
+            assert config.iid_to_ablock is not None
+            num_ablocks = len(config.iid_to_ablock)
+            plural_s = "s" if num_ablocks > 1 else ""
+            input_description = f"[{num_ablocks}] 23andMe ablock{plural_s}..."

-            sample.callHaplogroup()
-            Sample.sampleList.append(sample)
+        logger.info(f"\nGenotypes\n\nLoading genotypes from {input_description}\n")

-        genoFile.close()
+    @classmethod
+    def check_number_of_run_modes(cls) -> None:
+        """Check the number of run modes.
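+
+        At most one of the sample-major text, VCF/BCF, and ablock run modes
+        may be selected. Selecting none is also valid; in that case, no
+        genotype data are processed.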
-
+        Raises
+        ------
+        ValueError
+            When more than one run mode has been selected.

-    # ----------------------------------------------------------------------
-    # Run option 2: snp-major txt data (VCF)
-    # ----------------------------------------------------------------------
-    @staticmethod
-    def runFromVCF():
-        "run pipeline on snp-major data"
+        """
+        number_of_run_modes_selected = (
+            cls.config.run_from_sample_major_txt
+            + cls.config.run_from_vcf
+            + cls.config.run_from_ablocks
+        )
+        if number_of_run_modes_selected > 1:
+            raise ValueError(
+                "Expecting no more than one run mode\n"
+                f"    {number_of_run_modes_selected} selected\n"
+            )

-        Sample.loadDataFromVCF()
-        if (
-            Sample.args.writeHaplogroupsRealTime
-            or Sample.args.haplogroupToListGenotypesFor
-        ):
-            Sample.sortSampleList(sortByPrevHg=True)
-        for sample in Sample.sampleList:
-            sample.callHaplogroup()
-        Sample.sortSampleList()
-        Sample.writeSampleList()
-
-    @staticmethod
-    def loadDataFromVCF():
-        "constructs list of sample objects, each with a genotype dictionary"
-
-        vcfFN = Sample.args.dataFN
-        vcfFile, vcfReader = utils.getCSVreader(vcfFN, delimiter="\t")
-        Sample.setSampleListFromVCFheader(vcfFile)
-        ref_geno_set = {"0", "0/0"}
-        alt_geno_set = {"1", "1/1"}
-
-        Sample.errAndLog(
-            "%sReading genotype data...\n    %s\n\n" % (utils.DASHES, vcfFN)
-        )
-
-        for lineList in vcfReader:
-            chromosome, position = lineList[0], int(lineList[1])
-            if (
-                chromosome in Sample.config.vcf_chrom_label_set
-                and position in Sample.tree.snpPosSet
-            ):
-                genoList = lineList[Sample.config.vcfStartCol :]
-                for sample in Sample.sampleList:
-                    genotype = genoList[sample.sampleIndex].split(":")[0]
-
-                    if genotype == Sample.config.missingGenotype:
-                        continue
-                    elif Sample.config.runFromVCF:  # as opposed to .vcf4
-                        ref, alt = lineList[3:5]
-                        if genotype in ref_geno_set:
-                            genotype = ref
-                        elif genotype in alt_geno_set:
-                            genotype = alt
-
-                    sample.addGeno(position, genotype)
-
-        vcfFile.close()
-
-    @staticmethod
-    def setSampleListFromVCFheader(vcfFile):
-        """
-        skips over VCF metadata,
-        checks that the next line contains the column names,
-        extracts sample IDs,
-        and construct a list of sample objects
-        """
-
-        for line in vcfFile:
-            if not line.startswith("##"):
-                break
-
-        lineList = line.strip().split("\t")
-        Sample.validateVCFheader(lineList)
-        idList = lineList[Sample.config.vcfStartCol :]
-        for sampleIndex, ID in enumerate(idList):
-            if not Sample.args.singleSampleID or ID == Sample.args.singleSampleID:
-                sample = Sample(ID, sampleIndex)
-                Sample.sampleList.append(sample)
-
-    @staticmethod
-    def validateVCFheader(lineList):
-        "checks that the second element of a list is POS"
-
-        col2label = lineList[1]
-        if col2label != "POS":
-            sys.exit(
-                "ERROR. Invalid VCF. Expected column 2 header to be POS.\n"
-                + "       Instead found: %s" % col2label
-            )

-    # sample list manipulation and writing
+    # Class methods: class variable mutaters
     # ----------------------------------------------------------------------
-    @staticmethod
-    def sortSampleList(sortByPrevHg=False):
-        """
-        sorts sample list:
-        1. primarily by haplogroup
-        2. secondarily by ID
-
-        If a previously called haplogroup is available and sortByPrevHg is True,
-        the primary sort uses it. Otherwise, if a newly called haplogroup is
-        available, the primary sort uses that. Otherwise, sorting is by ID only.
- """ + @classmethod + def sort_sample_list(cls) -> None: + """Sort sample list by haplogroup, then by iid.""" - Sample.sampleList = sorted(Sample.sampleList, key=attrgetter("ID")) - if sortByPrevHg and Sample.config.compareToPrevCalls: - Sample.sampleList = sorted( - Sample.sampleList, key=attrgetter("prevCalledHaplogroupDFSrank") - ) - elif Sample.sampleList[0].haplogroupNode: - Sample.sampleList = sorted( - Sample.sampleList, key=attrgetter("haplogroupDFSrank") - ) + cls.sample_list.sort(key=attrgetter("iid")) + if cls.sample_list[0].haplogroup_node: + cls.sample_list.sort(key=attrgetter("haplogroup_dfs_rank")) - @staticmethod - def writeSampleList(): - "writes haplogroup and other optional data for each sample" + # Class methods: output writers + # ---------------------------------------------------------------------- + @classmethod + def write_results(cls) -> None: + """Sort samples, write results, and close optional real-time output files.""" - Sample.reportCounts() + cls.sort_sample_list() - Sample.errAndLog("%sOutput\n\n" % utils.DASHES) + logger.info( + "\nHaplogroups\n\n" + f" {cls.num_assigned:8d} assigned\n" + f" {cls.num_root_calls:8d} assigned to root haplogroup: " + f"{cls.tree.root.haplogroup}\n" + ) + if cls.num_root_calls > 0: + logger.warning( + "WARNING. If the dataset does not include fixed reference sites,\n" + " re-run with alternative root (e.g., with: -r A0-T).\n\n\n" + ) - if Sample.config.suppressOutputAndLog: - Sample.errAndLog("None (suppressed).\n\n") - else: - Sample.writeHaplogroups() # uses str(sample) + logger.info("\nOutput\n") - if Sample.args.writeAncDerCounts: - Sample.writeAncDerCounts() # uses sample.strForCounts() + if cls.config.suppress_output: + logger.info("None (suppressed).\n") + else: # Use str(sample) + cls.write_haplogroups() - if Sample.args.writeHaplogroupPathsDetail: - Sample.writeHaplogroupPaths( - include_SNPs=True - ) # uses sample.strHaplogroupPath() - elif Sample.args.writeHaplogroupPaths: - Sample.writeHaplogroupPaths() # uses sample.strHaplogroupPath() + if cls.args.write_anc_der_counts: # Use sample.str_for_counts + cls.write_anc_der_counts() - if Sample.args.writeDerSNPs: - Sample.writeSNPs() # uses sample.strSNPs(ancestral) + if cls.args.write_haplogroup_paths_detail: # Use sample.str_haplogroup_path() + cls.write_haplogroup_paths(include_snps=True) + elif cls.args.write_haplogroup_paths: # Use sample.str_haplogroup_path() + cls.write_haplogroup_paths() - if Sample.args.writeDerSNPsDetail: - Sample.writeSNPsDetail() # uses sample.strCompressed() + if cls.args.write_der_snps: # Use sample.str_snps() + cls.write_snps(allele_state="derived") - if Sample.args.writeAncSNPs: - Sample.writeSNPs(ancestral=True) + if cls.args.write_der_snps_detail: # Use sample.str_compressed + cls.write_snps_detail() - if Sample.args.writeAncSNPsDetail: - Sample.writeSNPsDetail(ancestral=True) + if cls.args.write_anc_snps: # Use sample.str_snps() + cls.write_snps(allele_state="ancestral") - @staticmethod - def reportCounts(): - "report number assigned and number assigned to root" + if cls.args.write_anc_snps_detail: # Use sample.str_compressed + cls.write_snps_detail(ancestral=True) - Sample.errAndLog( - "%sCalled haplogroups:\n\n" % utils.DASHES - + " %8d assigned\n" % Sample.numAssigned - + " %8d assigned to root haplogroup: %s\n\n" - % (Sample.numRootCalls, Sample.tree.root.haplogroup) - ) + cls.config.close_real_time_output_files() - if Sample.numRootCalls > 0: - Sample.warnVariantsOnlyData() + @classmethod + def write_haplogroups(cls) -> None: + 
"""Write haplogroup of each sample.""" - @staticmethod - def warnVariantsOnlyData(): - "warning for datasets that exclude sites with no variation in the sample" + with open(cls.config.haplogroup_calls_fp, "w") as haplogroup_calls_file: + for sample in cls.sample_list: + haplogroup_calls_file.write(f"{sample}\n") - Sample.errAndLog( - "WARNING. If the dataset does not include fixed reference sites,\n" - + " re-run with alternative root (e.g., with: -r A0-T).\n\n\n" + logger.info( + f"Wrote called haplogroups:\n {cls.config.haplogroup_calls_fp}\n" ) - @staticmethod - def writeHaplogroups(): - "writes haplogroup of each sample" - - with open(Sample.config.haplogroupCallsFN, "w") as haplogroupCallsFile: - for sample in Sample.sampleList: - haplogroupCallsFile.write("%s\n" % str(sample)) + @classmethod + def write_anc_der_counts(cls) -> None: + """Write counts of ancestral and derived alleles encountered. - Sample.errAndLog( - "Wrote called haplogroups:\n" - + " %s\n\n" % Sample.config.haplogroupCallsFN - ) + This includes each visited node, + other than those with no ancestral or derived alleles. - @staticmethod - def writeAncDerCounts(): - """ - writes counts of ancestral and derived alleles encountered - at each node visited (excluding nodes with zero of each) """ - - with open(Sample.config.countsAncDerFN, "w") as countsAncDerFile: - for sample in Sample.sampleList: - for node, numAncestral, numDerived in sample.ancDerCountTupleList: - if numAncestral > 0 or numDerived > 0: - countsAncDerFile.write( - "%-8s %-20s %3d %3d\n" - % (sample.ID, node.label, numAncestral, numDerived) + with open(cls.config.counts_anc_der_fp, "w") as counts_anc_der_file: + for sample in cls.sample_list: + for node, num_ancestral, num_derived in sample.anc_der_count_tuples: + if num_ancestral > 0 or num_derived > 0: + counts_anc_der_file.write( + f"{str(sample.iid):8s} {node.label:20s} " + f"{num_ancestral:3d} {num_derived:3d}\n" ) - countsAncDerFile.write("%s\n\n" % sample.strForCounts()) + counts_anc_der_file.write(f"{sample.str_for_counts}\n\n") - Sample.errAndLog( - "Wrote counts of ancestral and derived alleles encountered\n" - + "at each node visited (excluding nodes with zero of each):\n" - + " %s\n\n" % Sample.config.countsAncDerFN + logger.info( + "Wrote counts of ancestral and derived alleles encountered:\n" + f" {cls.config.counts_anc_der_fp}\n" ) - @staticmethod - def writeHaplogroupPaths(include_SNPs=False): - "writes haplogroup path for each sample" - - with open(Sample.config.haplogroupPathsFN, "w") as haplogroupPathsFile: - for sample in Sample.sampleList: - path = sample.strHaplogroupPath(include_SNPs) - haplogroupPathsFile.write("%s\n" % path) - - snps_included_text = " and a list thereof" if include_SNPs else "" - Sample.errAndLog( - "Wrote sequences of haplogroups from root to calls,\n" - + "with counts of derived SNPs observed%s:\n" % snps_included_text - + " %s\n\n" % Sample.config.haplogroupPathsFN + @classmethod + def write_haplogroup_paths( + cls, + include_snps: bool = False, + ) -> None: + """Write haplogroup path for each sample.""" + + with open(cls.config.haplogroup_paths_fp, "w") as haplogroup_paths_file: + for sample in cls.sample_list: + path = sample.str_haplogroup_path(include_snps) + haplogroup_paths_file.write(f"{path}\n") + + logger.info( + "Wrote paths with counts of derived SNPs observed:\n" + f" {cls.config.haplogroup_paths_fp}\n" ) - @staticmethod - def writeSNPs(ancestral=False): - """ - for each sample, writes list of derived SNPs on path - or list of ancestral SNPs 
encountered in search - """ - - if ancestral: - snpFN = Sample.config.ancSNPsFN - typeOfSNPs = "ancestral SNPs encountered in search" - else: - snpFN = Sample.config.derSNPsFN - typeOfSNPs = "derived SNPs on path" - - with open(snpFN, "w") as snpFile: - for sample in Sample.sampleList: - snpFile.write("%s\n" % sample.strSNPs(ancestral)) + @classmethod + def write_snps( + cls, + allele_state: str = "derived", + ) -> None: + """Write list of derived or ancestral alleles encountered. - Sample.errAndLog("Wrote lists of %s:\n %s\n\n" % (typeOfSNPs, snpFN)) + Repeat for each sample. - @staticmethod - def writeSNPsDetail(ancestral=False): - """ - for each sample, writes detailed information about each - derived SNP on path or about each ancestral SNP encountered in search """ - - if ancestral: - snpDetailFN = Sample.config.ancSNPsDetailFN - typeOfSNPs = "ancestral SNP encountered in search" + if allele_state == "derived": + snp_fp = cls.config.der_snps_fp + type_of_snps = "derived SNPs on path" + elif allele_state == "ancestral": + snp_fp = cls.config.anc_snps_fp + type_of_snps = "ancestral SNPs encountered in search" else: - snpDetailFN = Sample.config.derSNPsDetailFN - typeOfSNPs = "derived SNP on path" - - with open(snpDetailFN, "w") as snpDetailFile: - for sample in Sample.sampleList: - snpDetailFile.write("%s\n" % sample.strCompressed()) - snpList = sample.ancSNPlist if ancestral else sample.derSNPlist - for snp in snpList: - snpDetailFile.write("%-8s %s\n" % (sample.ID, snp)) - snpDetailFile.write("\n") - - Sample.errAndLog( - "Wrote detailed information about each %s:\n" % typeOfSNPs - + " %s\n\n" % snpDetailFN - ) - - -# -------------------------------------------------------------------------- - + raise ValueError( + f"allele_state must be 'ancestral' or 'derived', not '{allele_state}'" + ) -class Customer(Sample): - 'a "customer" is any sample whose genotypes are stored as 23andMe ablocks' + with open(snp_fp, "w") as snp_file: + for sample in cls.sample_list: + snp_file.write(f"{sample.str_snps(allele_state)}\n") - numNoAblock, numNoGenotypes = 0, 0 - noAblocksFile, noGenotypesFile = None, None + logger.info(f"Wrote lists of {type_of_snps}:\n {snp_fp}\n") - def __init__(self, customerTuple): - self.customerTuple = customerTuple - Sample.__init__(self, customerTuple.resid) + @classmethod + def write_snps_detail( + cls, + ancestral: bool = False, + ) -> None: + """Write detailed information about derived or ancestral alleles observed. - def setPrevCalledHaplogroup(self): - """ - for testing/comparison, sets previously called haplogroup from - original 23andMe algorithm. does not set corresponding DFS rank, - since the nomenclature has changed substantially + Repeat for each sample. 
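+
+        For each sample, the output comprises one summary line
+        (the compressed string representation), one line per SNP,
+        and a trailing blank line.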
- called from Customer constructor via Sample constructor, - if config.compareToPrevCalls """ - - self.prevCalledHaplogroup = self.customerTuple.y_haplogroup - - def loadAblockAndCallHaplogroup(self): - "loads ablock, reads relevant genotypes, calls haplogroup" - - ablock = type(self).load_ablock(self.ID) - - if ablock is None: - Customer.numNoAblock += 1 - if Customer.noAblocksFile: - Customer.noAblocksFile.write("%d\n" % self.ID) - return False - - if self.readAblockGenotypes(ablock): - self.callHaplogroup() - return True + if ancestral: + snp_detail_fp = cls.config.anc_snps_detail_fp + type_of_snps = "ancestral SNP encountered" else: - Customer.numNoGenotypes += 1 - if Customer.noGenotypesFile: - Customer.noGenotypesFile.write("%d\n" % self.ID) - return False + snp_detail_fp = cls.config.der_snps_detail_fp + type_of_snps = "derived SNP on path" - @classmethod - def load_ablock(cls, ID): - """ - loads an a ablock + with open(snp_detail_fp, "w") as snp_detail_file: + for sample in cls.sample_list: + snp_detail_file.write(f"{sample.str_compressed}\n") + snp_list = sample.anc_snp_list if ancestral else sample.der_snp_list + for snp in snp_list: + snp_detail_file.write(f"{str(sample.iid):8s} {snp}\n") - if a directory has been supplied, ablocks can be bitpacked or .npy.gz. - if not, the ablock will be loaded from an ablock dataset. - """ + snp_detail_file.write("\n") - ablock = None + logger.info( + f"Wrote detailed information about each {type_of_snps}:\n" + f" {snp_detail_fp}\n" + ) - if cls.config.ablocks_dir: - # .npy.gz - ablock_fn = os.path.join( - cls.config.ablocks_dir, cls.config.ablock_fn_tp.format(ID) - ) - if os.path.isfile(ablock_fn): - ablock = np.load(gzip.open(ablock_fn)) - # bitpacked - else: - bitpacked_ablock_fn = glob.glob( - os.path.join( - cls.config.ablocks_dir, - cls.config.bitpacked_ablock_fn_tp.format(ID), - ) - ).pop() - if os.path.isfile(bitpacked_ablock_fn): - with open(bitpacked_ablock_fn) as bitpacked_ablock_file: - ablock = coregen.ABlockFormat.decompress( - bitpacked_ablock_file.read() - ) +class TextSample(Sample): - # ablock dataset - else: - with warnings.catch_warnings(): - warnings.simplefilter("ignore") - try: - ablock = cls.config.ablockDS.load_block(ID) - except KeyError: - pass - - return ablock - - def readAblockGenotypes(self, ablock): - "pulls phylogenetically informative genotypes from ablock" - - hasGenotypes = False - for platformVersion in range(1, Sample.config.maxPlatformVersionPlusOne): - if getattr(self.customerTuple, "is_v%d" % platformVersion): - hasGenotypes = True - platformSNPlist = PlatformSNP.platformSNPlistDict[platformVersion] - for platformSNP in platformSNPlist: - genotype = platformSNP.getConsensusGenotypeFromAblock(ablock) - self.addGeno(platformSNP.position, genotype) - - return hasGenotypes - - def fixHaplogroupIfArtifact(self): - "fixes artifactual haplogroup assignments" - - for calledHaplogroup, replacementHaplogroup in six.iteritems( - Sample.config.ttamHgCallReplacementDict - ): - if self.haplogroup == calledHaplogroup: - self.haplogroupNode = Sample.tree.hg2nodeDict[replacementHaplogroup] - break + """Class representing an individual whose data are in a sample-major text file. 
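+
+    Genotypes are held in memory only until the haplogroup has been called;
+    purge_data then clears them.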
- # ---------------------------------------------------------------------- - # Run option 3: sample-major 23andMe ablock data - # ---------------------------------------------------------------------- - @staticmethod - def runFromAblocks(): - "run pipeline on sample-major 23andMe ablock data" + Expected input format: + - Row 1: Physical coordinates + - Column 1: Individual identifiers - Customer.openAuxiliaryOutputFiles() + """ - Sample.errAndLog("%sProcessing 23andMe customer data...\n\n" % utils.DASHES) - customerTupleList = Customer.buildCustomerTupleList() + position_to_column_index: dict[int, int] - Sample.errAndLog( - "\n%sCalling haplogroups...\n\n" % (utils.DASHES) + " Progress...\n" - ) - for customerTuple in customerTupleList: - customer = Customer(customerTuple) - if customer.loadAblockAndCallHaplogroup(): - Sample.sampleList.append(customer) - Customer.emitProgress() + def __init__( + self, + iid: IID_TYPE, + genotypes: list[str], + ): + """Construct TextSample instance.""" - Customer.closeAuxiliaryFilesAndReportCounts() - Sample.sortSampleList() - Sample.writeSampleList() + super(TextSample, self).__init__(iid) + self.genotypes = genotypes - @staticmethod - def openAuxiliaryOutputFiles(): - "open auxiliary output files" + def get_genotype(self, position: int) -> str: + """Return genotype for position.""" - if not Sample.config.suppressOutputAndLog: - Customer.noAblocksFile = open(Sample.config.noAblocksFN, "w") - Customer.noGenotypesFile = open(Sample.config.noGenotypesFN, "w") + try: + genotype = self.genotypes[type(self).position_to_column_index[position]] + except KeyError: + genotype = Config.missing_genotype - @classmethod - def buildCustomerTupleList(cls): - "builds a list of CustomerTuple instances" + return genotype - if Sample.args.ablockDSname or cls.config.ablocks_dir: - customerTupleList = cls.buildCustomerTupleListFromFile() - else: - customerTupleList = cls.buildCustomerTupleListFromMetadata() + def purge_data(self) -> None: + """Clear genotype data and other data structures if no longer needed.""" - return customerTupleList + super(TextSample, self).purge_data() + self.genotypes.clear() - @staticmethod - def buildCustomerTupleListFromFile(): - """ - builds a list of CustomerTuple instances from a two-column file. 
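+    # A hypothetical sketch of the layout described in the class docstring
+    # (tab-separated; identifiers, positions, and genotypes below are
+    # illustrative only). The first cell of the header row is ignored;
+    # the remaining header cells are physical coordinates:
+    #
+    #     iid      2655180    2887824
+    #     sample1  G          A
+    #     sample2  G          C
+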
+ @classmethod + def call_haplogroups(cls, config: Config) -> None: + """Call haplogroups from sample-major text file.""" + + logger.info("Mode: Sample-major text\n") + cls.configure(config) + geno_file = ( + open(cls.args.data_fp, "r") + if os.path.splitext(cls.args.data_fp)[1] != ".gz" + else gzip.open(cls.args.data_fp, "rt") + ) + geno_reader = csv.reader(geno_file, delimiter="\t") + cls.position_to_column_index = { + position: column_index + for column_index, position in enumerate(map(int, next(geno_reader)[1:])) + if position in cls.tree.snp_pos_set + } + for genotypes in geno_reader: + iid, genotypes = genotypes[0], genotypes[1:] + if cls.args.single_sample_id is None or iid == cls.args.single_sample_id: + text_sample = cls(iid, genotypes) + text_sample.call_haplogroup() - column 1: ID - column 2: comma-separated list of platforms for this individual - column 3: (optional) previous haplogroup call + geno_file.close() - example: - 314159265358979323 1,2,5 R1b1a2a1a - """ + cls.write_results() - utils.checkFileExistence(Sample.args.dataFN, "Sample IDs") - Sample.errAndLog("Reading sample IDs:\n {}\n".format(Sample.args.dataFN)) - - customer_tuple_list = list() - id_set = set() - with open(Sample.args.dataFN) as id_file: - for line in id_file: - token_list = line.strip().split() - if len(token_list) < 2 or len(token_list) > 3: - sys.exit( - "ERROR. When loading ablocks from file or non-default " - + "ablock dataset,\ninput file must have 2 or 3 columns:\n" - + "1. ID\n" - + "2. comma-separated integer platform versions\n" - + "3. (optional) Previous haplogroup" - ) - ID, platform_versions = token_list[:2] - prev_haplogroup = ( - token_list[2] - if len(token_list) > 2 - else Sample.config.missingHaplogroup - ) - id_set.add(ID) - tuple_kwargs_dict = {"resid": ID, "y_haplogroup": prev_haplogroup} - platform_version_set = set( - [int(i) for i in platform_versions.split(",")] - ) - for i in range(1, Sample.config.maxPlatformVersionPlusOne): - tuple_kwargs_dict["is_v%d" % i] = i in platform_version_set - - customer_tuple = Sample.config.CustomerTuple(**tuple_kwargs_dict) - customer_tuple_list.append(customer_tuple) +class VCFSample(Sample): - Sample.errAndLog(" %8d read\n" % len(customer_tuple_list)) - Sample.errAndLog(" %8d unique\n\n" % len(id_set)) + """Class representing an individual whose data are in a VCF/BCF file.""" - return customer_tuple_list + def __init__(self, iid: IID_TYPE): + """Construct VCFSample instance.""" - @staticmethod - def buildCustomerTupleListFromMetadata(): - "builds a list of CustomerTuple instances from customer metadata." + super(VCFSample, self).__init__(iid) + self.position_to_genotype: dict[int, str] = {} - Sample.errAndLog("Building customer mask ... 
") - metaDS = Sample.config.customerMetaDS - metaColList = Sample.config.customerMetaColList - prevHapCol = Sample.config.customerPrevHaplogroupCol - metaDF = pd.DataFrame(metaDS.load(metaColList))[metaColList] - if Sample.config.compareToPrevCalls: - metaDF[prevHapCol] = pd.Series(metaDS.load([prevHapCol])[prevHapCol]) - else: - metaDF[prevHapCol] = Sample.config.missingHaplogroup - mask, maskType = Customer.buildCustomerMask(metaDF) - metaDF = metaDF[mask] - - customerTupleList = list() - for row in metaDF.itertuples(): - customerTupleList.append(Sample.config.CustomerTuple(*row[1:])) - - Sample.errAndLog( - "Done.\n" - + " %8d %s customers to be processed\n" - % (len(customerTupleList), maskType) - ) + def put_genotype(self, position: int, genotype: str) -> None: + """Store one genotype.""" - return customerTupleList + self.position_to_genotype[position] = genotype - @staticmethod - def buildCustomerMask(metaDF): - "if resids have been specified, use those. otherwise, use all males." + def get_genotype(self, position: int) -> str: + """Return genotype for position.""" - residList = Customer.generateResidList() - if residList: - maskType = "specified" - mask = np.in1d(metaDF[Sample.config.customerIDcol], residList) - else: - maskType = "male" - sexDF = pd.DataFrame( - Sample.config.customerMetaDS.load(Sample.config.customerSexColList) - ) - mask = np.ones(len(sexDF), dtype=bool) - for column in sexDF.columns: - mask = mask & (sexDF[column] == "M") + genotype = self.position_to_genotype.get(position, Config.missing_genotype) - return mask, maskType + return genotype - @staticmethod - def generateResidList(): - """ - 4 possibilities: - residList specified at config instantiation - -a -s RESID -> a single research ID has been specified - -i FILENAME.resid.txt -> read research IDs from file - -a -> return empty list to indicate no subsetting - """ + @classmethod + def call_haplogroups(cls, config: Config) -> None: + """Call haplogroups from variant-major VCF/BCF file.""" - residList = list() - if Sample.config.residList: - residList = Sample.config.residList - Sample.errAndLog( - "Research ID list supplied.\n" - + " %8d resids (%d unique)\n\n" - % (len(residList), len(set(residList))) - ) - elif Sample.args.singleSampleID: - resid = Customer.generateResid(Sample.args.singleSampleID) - residList = [resid] - Sample.errAndLog("Will call haplogroup for:\n %d\n\n" % resid) - elif Sample.args.dataFN: - utils.checkFileExistence(Sample.args.dataFN, "Research IDs") - Sample.errAndLog("Reading research IDs:\n %s\n" % Sample.args.dataFN) - with open(Sample.args.dataFN, "r") as residFile: - for line in residFile: - ID = line.strip().split()[0] - residList.append(Customer.generateResid(ID)) - - Sample.errAndLog(" %8d read\n" % len(residList)) - Sample.errAndLog(" %8d unique\n\n" % len(set(residList))) - - return residList - - @staticmethod - def generateResid(ID): - "converts ID to integer, exiting gracefully if not possible" + logger.info("Mode: VCF\n") + cls.configure(config) + cls.load_data_from_vcf() - try: - resid = int(ID) - except ValueError: - sys.exit("\nERROR. 
Cannot convert ID to integer: %s" % ID) + if ( + cls.args.write_haplogroups_real_time + or cls.args.haplogroup_to_list_genotypes_for + ): + cls.sort_sample_list() - return resid + for vcf_sample in cls.sample_list: + vcf_sample.call_haplogroup() - @staticmethod - def emitProgress(): - "emit a message indicating how many haplogroups have been assigned thus far" + cls.write_results() - if ( - Sample.numAssigned in Sample.config.callingProgressEarlySet - or Sample.numAssigned % Sample.config.callingProgressInterval == 0 - ): - Sample.errAndLog(" %8d haplogroups assigned\n" % Sample.numAssigned) + @classmethod + def load_data_from_vcf(cls) -> None: + """Load data from VCF.""" - @staticmethod - def closeAuxiliaryFilesAndReportCounts(): - "close auxiliary files and report counts" + check_vcf_dependencies() + check_vcf_index(cls.args.data_fp) - utils.closeFiles([Customer.noAblocksFile, Customer.noGenotypesFile]) + with VariantFile(cls.args.data_fp) as variant_file: + iids = list(variant_file.header.samples) + if cls.args.single_sample_id is None: + for iid in iids: + cls(iid) + else: + if cls.args.single_sample_id in iids: + cls(cls.args.single_sample_id) + else: + raise ValueError( + f"{cls.args.single_sample_id} " + f"not present in {cls.args.data_fp}" + ) - messageA = ( - "\n %8d resid(s) ignored | could not load ablock:\n" - % Customer.numNoAblock - ) - messageB = " %s\n" % Sample.config.noAblocksFN - messageC = ( - " %8d resid(s) ignored | no genotypes:\n" % Customer.numNoGenotypes - ) - messageD = " %s\n\n" % Sample.config.noGenotypesFN + chromosome_set = set(variant_file.header.contigs.keys()) + y_chromosome_set = chromosome_set.intersection(Config.vcf_chrom_label_set) + if len(y_chromosome_set) == 1: + chromosome = y_chromosome_set.pop() + else: + raise ValueError( + f"VCF must include exactly one contig with a label in: " + f"{sorted(Config.vcf_chrom_label_set)}\n" + f"Observed: {chromosome_set}" + ) - if Sample.config.suppressOutputAndLog: - Sample.errAndLog(messageA + messageC + "\n") - else: - Sample.errAndLog(messageA + messageB + messageC + messageD) + for variant_record in variant_file.fetch(chromosome): + if variant_record.pos in cls.tree.snp_pos_set: + for vcf_sample in cls.sample_list: + alleles = variant_record.samples[vcf_sample.iid].alleles + if len(alleles) > 1: + raise ValueError( + "More than one allele observed:\n" + f"IID: {vcf_sample.iid}\n" + f"Position: {variant_record.pos}\n" + f"Alleles: {alleles}" + ) + + genotype = alleles[0] + if genotype is not None: + assert isinstance(vcf_sample, cls) + vcf_sample.put_genotype(variant_record.pos, genotype) diff --git a/yhaplo/snp.py b/yhaplo/snp.py index e778948..6406fbe 100644 --- a/yhaplo/snp.py +++ b/yhaplo/snp.py @@ -1,328 +1,413 @@ -# -*- coding: utf-8 -*- -# David Poznik -# 2015.12.29 -# snp.py -# -# Defines two classes: -# - SNP -# - PlatformSNP (a subclass of SNP) -# ---------------------------------------------------------------------- -from __future__ import absolute_import +"""Define SNP and related classes. +Classes defined herein include: +* SNP +* PlatformSNP +* DroppedMarker + +""" + +from __future__ import annotations + +import itertools +import logging import re -import sys +from collections.abc import Mapping, Sequence from operator import attrgetter +from typing import TypeVar -import six -from six.moves import range +import yaml -from . 
import utils +from yhaplo import node as node_module # noqa F401 +from yhaplo import tree as tree_module # noqa F401 +from yhaplo.config import Config +from yhaplo.utils.loaders import DataFile, load_data, load_data_lines +logger = logging.getLogger(__name__) -class SNP(object): - """ - A snp knows its: - - names - - haplogroup - - physical info: position, ancestral, derived - """ - tree = None - config = None - args = None - errAndLog = None +class SNP: + + """Class representing a SNP. + + A SNP instance knows its: + - Names + - Haplogroup + - Position + - Ancestral and derived alleles - def __init__(self, name, haplogroup, position, ancestral, derived): - if SNP.tree is None: - sys.exit( - "ERROR. Before instantiating, must call SNP.setClassVariables(tree)." + """ + + tree: "tree_module.Tree" + + def __init__( + self, + name: str, + haplogroup: str, + position: int, + ancestral: str, + derived: str, + ): + if type(self).tree is None: + raise RuntimeError( + "Before instantiating, call: " + f"{self.__class__.__name__}.set_class_variables(tree)" ) - self.setLabel(name) - self.nameList = [name] - self.isRepresentative = name in SNP.tree.representativeSNPnameSet + self.set_label(name) + self.name_list = [name] + self.is_representative = name in type(self).tree.representative_snp_name_set self.haplogroup = haplogroup self.position = position self.ancestral = ancestral self.derived = derived - self.alleleSet = {ancestral, derived} + self.allele_set = {ancestral, derived} - self.node = SNP.tree.findOrCreateNode(haplogroup) - self.node.addSNP(self) + self.node = type(self).tree.find_or_create_node(haplogroup) + self.node.add_snp(self) - def setLabel(self, label): - "sets the label and associated ivars" + def set_label(self, label: str) -> None: + """Set label and associated instance variables.""" self.label = label - self.labelLettersRank, self.labelLetters, self.labelNumber = SNP.parseLabel( - label, SNP.config.snpLabelLettersRankDict + ( + self.label_letters_rank, + self.label_letters, + self.label_number, + ) = parse_snp_label(label, Config.snp_label_letters_rank_dict) + self.label_cleaned = clean_snp_label(label) + + def __str__(self) -> str: + """Return medium-length string representation.""" + + str_ = ( + f"{self.label:15s} {self.node.label:25s} {self.position:8d} " + f"{self.ancestral}->{self.derived}" ) - self.labelCleaned = SNP.cleanLabel(label) + return str_ - def __str__(self): - "medium-length string representation" - - return "%-15s %-25s %8d %s->%s" % ( - self.label, - self.node.label, - self.position, - self.ancestral, - self.derived, - ) + @property + def str_with_all_names(self) -> str: + """Return long string representation. - def strWithAllNames(self): - "long string representation: normal str plus comma-separated list of names" + This includes the normal string representation, + plus a comma-separated list of names. 
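+
+        A SNP accumulates multiple names when its sources supply aliases;
+        add_name appends each new name to name_list and promotes preferred
+        names to the label.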
- return "%s %s" % (str(self), ",".join(self.nameList)) + """ + names = ",".join(self.name_list) + str_with_all_names = f"{str(self)} {names}" + return str_with_all_names - def strShort(self): - "short string representation: node label and snp label" + @property + def str_short(self) -> str: + """Return short string representation: Node label and SNP label.""" - return "%s:%s" % (self.node.label, self.label) + str_short = f"{self.node.label}:{self.label}" + return str_short @property - def DFSrank(self): - "returns depth-first search rank" + def dfs_rank(self) -> int: + """Return depth-first search rank.""" - return self.node.DFSrank + return self.node.dfs_rank @property - def hgSNP(self): - "string representation: truncated haplogroup label with SNP label. e.g., R-V88" + def hg_snp(self) -> str: + """Return string representation with truncated haplogroup label and SNP label. - return "%s-%s" % (self.node.hgTrunc, self.labelCleaned) + Example: R-V88 - def isDerived(self, geno): - return geno == self.derived + """ + hg_snp = f"{self.node.hg_trunc}-{self.label_cleaned}" + return hg_snp + + def is_derived(self, geno: str) -> bool: + """Return True if geno is the derived allele.""" + + is_derived = geno == self.derived + return is_derived + + def is_ancestral(self, geno: str) -> bool: + """Return True if geno is the ancestral allele.""" - def isAncestral(self, geno): - return geno == self.ancestral + is_ancestral = geno == self.ancestral + return is_ancestral - def isOnPlatform(self, platformVersion): - return self.position in PlatformSNP.platformPosSetDict[platformVersion] + def is_on_platform(self, platform: str) -> bool: + """Return True if this SNP is on the supplied 23andMe platform.""" - def backTracePath(self): - "returns the backtrace path (node list) for the corresponding node" + is_on_platform = self.position in PlatformSNP.platform_to_pos_set[platform] + return is_on_platform - return self.node.backTracePath() + def back_trace_path(self) -> list["node_module.Node"]: + """Return the backtrace path (node list) for the corresponding node.""" - def addName(self, name): - "adds name to list and updates label if appropriate" + back_trace_path = self.node.back_trace_path() + return back_trace_path - self.nameList.append(name) - if name in SNP.tree.representativeSNPnameSet: - self.isRepresentative = True + def add_name(self, name: str) -> None: + """Add name to list and update label if appropriate.""" - if SNP.isApreferredName(self.label): - if SNP.isApreferredName(name): - SNP.errAndLog( + self.name_list.append(name) + if name in type(self).tree.representative_snp_name_set: + self.is_representative = True + + if type(self).is_a_preferred_name(self.label): + if type(self).is_a_preferred_name(name): + logger.warning( "WARNING. 
Two preferred names for one SNP: "
-                    + "%s, %s\n" % (name, self.label)
+                    f"{name}, {self.label}\n"
                 )
-        elif SNP.isApreferredName(name):
-            self.setLabel(name)
+        elif type(self).is_a_preferred_name(name):
+            self.set_label(name)
         else:
-            labelLettersRank, labelLetters, labelNumber = SNP.parseLabel(
-                name, SNP.config.snpLabelLettersRankDict
+            label_letters_rank, label_letters, label_number = parse_snp_label(
+                name,
+                Config.snp_label_letters_rank_dict,
             )
-            if labelLettersRank < self.labelLettersRank or (
-                labelLetters == self.labelLetters and labelNumber < self.labelNumber
+            if label_letters_rank < self.label_letters_rank or (
+                label_letters == self.label_letters and label_number < self.label_number
             ):
-                self.setLabel(name)
-
-    @staticmethod
-    def isApreferredName(name):
-        """checks wither a SNP name is in the set of preferred names,
-        with or without an extension (e.g., '.1') if present"""
+                self.set_label(name)

-        return (
-            name in SNP.tree.preferredSNPnameSet
-            or name.split(".")[0] in SNP.tree.preferredSNPnameSet
-        )
+    @classmethod
+    def set_class_variables(
+        cls,
+        tree: "tree_module.Tree",
+    ) -> None:
+        """Set class variables.

-    @staticmethod
-    def cleanLabel(label):
-        "removes superfluous text and hyphens from a SNP label"
+        Doing so enables the SNP class to know about the tree instance,
+        configuration, and command-line arguments.

-        for superfluousSNPtext in SNP.config.superfluousSNPtextList:
-            label = label.replace(superfluousSNPtext, "")
+        """
+        cls.tree = tree
+        if tree.config.run_from_ablocks or tree.args.write_platform_trees:
+            PlatformSNP.set_class_variables()

-        label = six.ensure_text(label)
-        label = label.replace("-", "_").replace("^", "")
-        label = six.ensure_text(label.replace(u"≤", "<=").encode("utf-8"))
+    @classmethod
+    def is_a_preferred_name(cls, name: str) -> bool:
+        """Return True if a SNP name is in the set of preferred names.

-        return label
+        Ignore extensions like ".1".
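+
+        For example, if "M269" were a preferred name, "M269.1" would also
+        be treated as preferred.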
- @staticmethod - def parseLabel(name, snpLabelLettersRankDict): - """ - returns the priority rank of a snp name - and a decomposition of the name into letters and a number """ + is_a_preferred_name = ( + name in cls.tree.preferred_snp_name_set + or name.split(".")[0] in cls.tree.preferred_snp_name_set + ) + return is_a_preferred_name - match = re.search(r"([a-zA-Z-]*)([0-9]*)", str(name)) - labelLetters, labelNumber = match.group(1), match.group(2) - labelNumber = int(labelNumber) if len(labelNumber) > 0 else 0 - if labelLetters in snpLabelLettersRankDict: - labelLettersRank = snpLabelLettersRankDict[labelLetters] - else: - labelLettersRank = len(snpLabelLettersRankDict) # max value - return labelLettersRank, labelLetters, labelNumber +def clean_snp_label(label: str) -> str: + """Remove superfluous text and hyphens from a SNP label.""" - @staticmethod - def setClassVariables(tree): - "enables SNP class to know about the tree instance, config, and args" + for superfluous_snp_text in Config.superfluous_snp_text_list: + label = label.replace(superfluous_snp_text, "") - SNP.tree = tree - SNP.config = tree.config - SNP.args = tree.args - SNP.errAndLog = tree.config.errAndLog + label = label.replace("-", "_").replace("^", "").replace("≤", "<=") - if SNP.config.runFromAblocks or SNP.args.writePlatformTrees: - PlatformSNP.buildPlatformPosSetDict() - if SNP.config.runFromAblocks: - PlatformSNP.buildPlatformSNPlistDict() + return label - @staticmethod - def prioritySortMarkerList(markerList): - """ - sorts a list of markers by priority ranking, with preference given to - those deemed representative for the corresponding haplogroup - """ - markerList = sorted( - markerList, - key=attrgetter("labelLettersRank", "labelLetters", "labelNumber"), - ) - markerList = sorted( - markerList, key=attrgetter("isRepresentative"), reverse=True - ) +def parse_snp_label( + name: str, + snp_label_letters_rank_dict: Mapping[str, int], +) -> tuple[int, str, int]: + """Parse SNP label. - return markerList + Returns + ------- + label_letters_rank + Priority rank of SNP name. + label_letters + SNP-name letters. + label_number + SNP-name numbers. - @staticmethod - def mostHighlyRankedMarkerOnList(markerList): - """ - returns the most highly ranked marker on a list. - the purpose of this method is to record the fact that marker lists are - sorted with highest priority first - """ + """ + match = re.search(r"([a-zA-Z-]*)([0-9]*)", str(name)) + if match is None: + raise ValueError(f"SNP name unparseable: {name}") + + label_letters, label_number = match.group(1), match.group(2) + label_number = int(label_number) if len(label_number) > 0 else 0 + if label_letters in snp_label_letters_rank_dict: + label_letters_rank = snp_label_letters_rank_dict[label_letters] + else: + label_letters_rank = len(snp_label_letters_rank_dict) # max value + + return label_letters_rank, label_letters, label_number - if markerList: - return markerList[0] - else: - return None +class PlatformSNP: -# -------------------------------------------------------------------------- + """Class representing a platform SNP. 
+ A PlatformSNP instance knows its: + - Position + - Block indexes -class PlatformSNP(object): - "A platform SNP knows its: position and ablock index" + """ - platformPosSetDict = dict() - platformSNPlistDict = dict() + pos_to_block_indexes: dict[int, list[int]] = {} + platform_to_pos_set: dict[str, set[int]] = {} - def __init__(self, position): + def __init__( + self, + position: int, + ablock_index_list: list[int], + ): self.position = position - self.ablockIndexList = SNP.config.pos2ablockIndexListDict[position] + self.ablock_index_list = ablock_index_list - def __str__(self): - return "%8d %7d" % (self.position, self.ablockIndex) + @classmethod + def set_class_variables(cls) -> None: + """Set class variables.""" - def getConsensusGenotypeFromAblock(self, ablock): - "given an ablock, returns a consensus genotype for this SNP" + cls.pos_to_block_indexes = load_pos_to_block_indexes() + cls.platform_to_pos_set = build_platform_to_pos_set() - genotypeList = [ - PlatformSNP.getGenotypeFromAblock(ablock, ablockIndex) - for ablockIndex in self.ablockIndexList - ] - if len(genotypeList) == 1: - genotype = genotypeList[0] - else: - genotypeSet = set(genotypeList) - genotypeSet.discard(SNP.config.missingGenotype) - if len(genotypeSet) == 1: - genotype = genotypeSet.pop() - else: - genotype = SNP.config.missingGenotype + @classmethod + def build_platform_to_platform_snps(cls) -> dict[str, list[PlatformSNP]]: + """Build mapping from platform to PlatformSNPs. - return genotype + Note: This method is no longer in use. - @staticmethod - def getGenotypeFromAblock(ablock, ablockIndex): """ - gets genotype from ablock + logger.info("Building mapping from platform version to PlatformSNPs...") + + if not cls.pos_to_block_indexes: + cls.set_class_variables() + + platform_to_platform_snps = {} + for platform in Config.platforms: + platform_snps = [] + unknown_positions_set = set() + for position in cls.platform_to_pos_set[platform]: + try: + ablock_index_list = cls.pos_to_block_indexes[position] + platform_snp = cls(position, ablock_index_list) + platform_snps.append(platform_snp) + except KeyError: + unknown_positions_set.add(position) + + platform_to_platform_snps[platform] = platform_snps + logger.info( + f"{platform}: {len(platform_snps):5d} Platform SNPs, " + f"{len(unknown_positions_set)} positions with unknown block indexes" + ) - input: ablock : a numpy array of {0, ..., 15} - ablockIndex - output: genotype - """ + return platform_to_platform_snps - if ablockIndex < len(ablock): - diploidGenotype = SNP.config.ablockCodeToGenotypeDict[ablock[ablockIndex]] - if diploidGenotype in SNP.config.homozygousGenotypeSet: - return diploidGenotype[0] - return SNP.config.missingGenotype +def load_pos_to_block_indexes() -> dict[int, list[int]]: + """Load mapping from physical position to block indexes.""" - @staticmethod - def buildPlatformPosSetDict(): - "reads files to build a dictionary: platform_version -> set of positions" + pos_to_block_indexes = yaml.safe_load( + load_data(Config.pos_to_block_indexes_data_file) + ) + num_positions = len(pos_to_block_indexes) + num_block_indexes = len( + list(itertools.chain.from_iterable(pos_to_block_indexes.values())) + ) + logger.info( + "Loaded mapping from physical position to block indexes\n" + f" {num_positions:6d} positions: " + f"{Config.pos_to_block_indexes_data_file.filename}\n" + f" {num_block_indexes:6d} block indexes\n\n" + ) - SNP.errAndLog("%sReading platform positions...\n\n" % utils.DASHES) - for platform_version in range(1, 
SNP.config.maxPlatformVersionPlusOne): - platform_pos_fn = SNP.config.platform_pos_fn_tp.format(platform_version) - platform_pos_set = utils.readPositionsSet( - platform_pos_fn, logFunction=SNP.errAndLog - ) - PlatformSNP.platformPosSetDict[platform_version] = platform_pos_set + return pos_to_block_indexes - SNP.errAndLog("\n") - @staticmethod - def buildPlatformSNPlistDict(): - "builds dictionary of platformSNP lists. key = platformVersion" +def build_platform_to_pos_set() -> dict[str, set[int]]: + """Build mapping from platform to a set of physical positions.""" - SNP.errAndLog("Building dictionary of platform SNP lists...\n\n") - for platformVersion in range(1, SNP.config.maxPlatformVersionPlusOne): - platformPosSet = PlatformSNP.platformPosSetDict[platformVersion] - platformSNPlist = list() - for position in platformPosSet: - platformSNPlist.append(PlatformSNP(position)) + logger.info("Loading platform positions...") - PlatformSNP.platformSNPlistDict[platformVersion] = platformSNPlist + platform_to_pos_set = {} + for platform in Config.platforms: + platform_pos_data_file = DataFile( + Config.platform_pos_data_subdir, + Config.platform_pos_data_filename_tp.format(platform=platform), + f"Platform {platform} SNP positions", + ttam_only=True, + ) + pos_set = set( + int(line.strip().split()[0]) + for line in load_data_lines(platform_pos_data_file) + ) + platform_to_pos_set[platform] = pos_set + logger.info( + f"{platform}: {len(pos_set):5d} unique positions loaded: " + f"{platform_pos_data_file.filename}" + ) + logger.info("") -# -------------------------------------------------------------------------- + return platform_to_pos_set -class DroppedMarker(object): - """ - a marker not used for classification but potentially useful for node labeling - examples: non-SNPs, multiallelic SNPs, and SNPs not meeting ISOGG quality guidelines +class DroppedMarker: + + """Class representing a marker not used for classification. + + Such a marker may be useful for node labeling. 
Examples: + - Non-SNPs + - Multiallelic SNPs + - SNPs not meeting ISOGG quality guidelines + """ - def __init__(self, name, haplogroup): - self.name = SNP.cleanLabel(name) + def __init__( + self, + name: str, + haplogroup: str, + tree: "tree_module.Tree", + ): + self.name = clean_snp_label(name) self.haplogroup = haplogroup + self.tree = tree - def addToNode(self): - "adds this dropped marker to the corresponding node, if it exists" + def add_to_node(self) -> bool: + """Add this dropped marker to the corresponding node, if it exists.""" - if self.haplogroup in SNP.tree.hg2nodeDict: - self.setSortVariables() - SNP.tree.hg2nodeDict[self.haplogroup].addDroppedMarker(self) - return True - else: - return False + added = False + if self.haplogroup in self.tree.haplogroup_to_node: + ( + self.label_letters_rank, + self.label_letters, + self.label_number, + ) = parse_snp_label(self.name, Config.snp_label_letters_rank_dict) + self.is_representative = self.name in self.tree.representative_snp_name_set + self.tree.haplogroup_to_node[self.haplogroup].add_dropped_marker(self) + added = True - def setSortVariables(self): - "set variables used for priority sorting" + return added - self.labelLettersRank, self.labelLetters, self.labelNumber = SNP.parseLabel( - self.name, SNP.config.snpLabelLettersRankDict - ) - self.isRepresentative = self.name in SNP.tree.representativeSNPnameSet + +Marker = TypeVar("Marker", SNP, DroppedMarker) + + +def priority_sort_marker_list(marker_list: Sequence[Marker]) -> list[Marker]: + """Sort a list of markers by priority ranking. + + Preference is given to those deemed representative for the corresponding haplogroup. + + """ + sorted_marker_list = sorted( + sorted( + marker_list, + key=attrgetter( + "label_letters_rank", + "label_letters", + "label_number", + ), + ), + key=attrgetter("is_representative"), + reverse=True, + ) + + return sorted_marker_list diff --git a/yhaplo/tree.py b/yhaplo/tree.py index 37d4673..50d9ecc 100644 --- a/yhaplo/tree.py +++ b/yhaplo/tree.py @@ -1,738 +1,772 @@ -# David Poznik -# 2015.01.12 -# tree.py -# -# Defines the Tree class. -# ---------------------------------------------------------------------- -from __future__ import absolute_import - -import csv -import os +"""Define Tree class.""" + +import logging import re -import sys from collections import defaultdict, deque from operator import attrgetter +from typing import Optional, Union -import six -from six.moves import range +from yhaplo import node as node_module # noqa F401 +from yhaplo import path as path_module # noqa F401 +from yhaplo import sample as sample_module # noqa F401 +from yhaplo import snp as snp_module # noqa F401 +from yhaplo.config import Config +from yhaplo.utils.loaders import load_data, load_data_lines -from . import utils -from .node import Node -from .path import Path -from .snp import SNP, DroppedMarker +logger = logging.getLogger(__name__) -class Tree(object): - """ +class Tree: + + """Class representing a haplogroup tree. + A tree has single Node instance (the root) as well as a dictionary that maps haplogroup labels to node instances. 
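+
+    For example, haplogroup_to_node["Q"] would map to the Node
+    representing haplogroup Q.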
""" - def __init__(self, config): + def __init__( + self, + config: Config, + ): self.config = config self.args = config.args - self.errAndLog = config.errAndLog - self.maxDepth = 0 - - # node info - self.hg2nodeDict = dict() - self.depthFirstNodeList = list() - - # snp info - self.snpDict = dict() # keys: name, (haplogroup, position), position - self.snpList = list() - self.snpPosSet = set() - self.snpNameSet = set() - self.preferredSNPnameSet = set() - self.representativeSNPnameSet = set() - self.multiAllelicOldPosSet = set() - self.multiAllelicNewPosSet = set() - self.isoggOmitSet = set() - self.isoggCorrectionDict = dict() - self.isoggCountsDict = defaultdict(int) - self.numSNPsCorrected = 0 - - # build tree - self.root = self.buildTreeFromNewick() - if self.args.primaryOnly: - self.setDepthFirstNodeList() + + self.max_depth = 0 + self.haplogroup_to_node: dict[str, "node_module.Node"] = {} + self.depth_first_node_list: list["node_module.Node"] = [] + + self.snp_dict: dict[ + Union[str, tuple[str, int], int], + "snp_module.SNP", + ] = {} # Possible keys: name, (haplogroup, position), position + self.snp_list: list["snp_module.SNP"] = [] + self.snp_pos_set: set[int] = set() + self.snp_name_set: set[str] = set() + self.preferred_snp_name_set: set[str] = set() + self.representative_snp_name_set: set[str] = set() + self.multi_allelic_old_pos_set: set[int] = set() + self.multi_allelic_new_pos_set: set[int] = set() + self.isogg_omit_set: set[tuple[str, str]] = set() + self.isogg_correction_dict: dict[str, tuple[str, str, str]] = {} + self.isogg_counts_dict: dict[str, int] = defaultdict(int) + self.num_snps_corrected = 0 + + self.root = self.build_tree_from_newick() + if self.args.primary_only: + self.set_depth_first_node_list() else: - self.importIsoggSnps() - self.setSearchRoot() - self.optionalTraversalOutput() - if self.args.writeContentMappings: - self.writeContentMappings() + self.import_isogg_snps() + + self.set_search_root() + self.write_optional_traversal_output() - # setters + # Setters # ---------------------------------------------------------------------- - def setSearchRoot(self): - "set node from which to start haplogroup-calling traversals" - - if self.args.alternativeRoot: - alternativeRootHg = self.args.alternativeRoot - if alternativeRootHg in self.hg2nodeDict: - self.searchRoot = self.hg2nodeDict[alternativeRootHg] - self.errAndLog( + def set_search_root(self) -> None: + """Set node from which to start haplogroup-calling traversals.""" + + if self.args.alternative_root: + alternative_root_hg = self.args.alternative_root + if alternative_root_hg in self.haplogroup_to_node: + self.search_root = self.haplogroup_to_node[alternative_root_hg] + logger.info( "Will start haplogroup assignment traversal from:\n" - + " %s\n\n" % alternativeRootHg + f" {alternative_root_hg}\n\n" ) else: - sys.exit( - "\nERROR. 
Cannot start traversal "
-                    + "from non-existant haplogroup: %s\n" % alternativeRootHg
+                raise ValueError(
+                    "Cannot start traversal "
+                    f"from non-existent haplogroup: {alternative_root_hg}\n"
                 )
         else:
-            self.searchRoot = self.root
+            self.search_root = self.root

-    def setDepthFirstNodeList(self):
-        "build node list from depth-first pre-order traversal"
+    def set_depth_first_node_list(self) -> None:
+        """Build Node list from depth-first pre-order traversal."""

-        self.depthFirstNodeList = self.root.getDepthFirstNodeList()
-        for DFSrank, node in enumerate(self.depthFirstNodeList):
-            node.setDFSrank(DFSrank)
+        self.depth_first_node_list = self.root.get_depth_first_node_list()
+        for dfs_rank, node in enumerate(self.depth_first_node_list):
+            node.set_dfs_rank(dfs_rank)

-    # traversals
+    # Traversals
     # ----------------------------------------------------------------------
-    def optionalTraversalOutput(self):
-        "optional tree-traversal output"
-
-        if self.args.traverseBF:
-            self.writeBreadthFirst()
-        if self.args.traverseDF:
-            self.writeDepthFirstPreOrder()
-        if self.args.writeTreeTable:
-            self.writeTreeTable()
-        if self.args.mrcaHaplogroupList:
-            self.queryMRCA()
-        if self.args.querySNPname:
-            self.querySNPpath()
-
-    def writeBreadthFirst(self):
-        "writes bread-first traversal in pipe/dot format"
-
-        bfTreeFN = (
-            self.config.bfPrimaryTreeFN
-            if self.args.primaryOnly
-            else self.config.bfTreeFN
+    def write_optional_traversal_output(self) -> None:
+        """Write optional tree-traversal output."""
+
+        if self.args.traverse_bf:
+            self.write_breadth_first()
+        if self.args.traverse_df:
+            self.write_depth_first_pre_order()
+        if self.args.write_tree_table:
+            self.write_tree_table()
+        if self.args.mrca_haplogroup_list:
+            self.query_mrca()
+        if self.args.query_snp_name:
+            self.query_snp_path()
+
+    def write_breadth_first(self) -> None:
+        """Write breadth-first traversal in pipe/dot format."""
+
+        bf_tree_fp = (
+            self.config.bf_primary_tree_fp
+            if self.args.primary_only
+            else self.config.bf_tree_fp
         )
-        with open(bfTreeFN, "w") as bfTreeFile:
-            self.root.writeBreadthFirstTraversal(bfTreeFile)
+        with open(bf_tree_fp, "w") as bf_tree_file:
+            self.root.write_breadth_first_traversal(bf_tree_file)

-        self.errAndLog("Wrote breadth-first tree traveral:\n    %s\n\n" % bfTreeFN)
+        logger.info(f"Wrote breadth-first tree traversal:\n    {bf_tree_fp}\n")

-    def writeDepthFirstPreOrder(self):
-        "writes depth-first pre-order traversal in pipe/dot format"
+    def write_depth_first_pre_order(self) -> None:
+        """Write depth-first pre-order traversal in pipe/dot format."""

-        dfTreeFN = (
-            self.config.dfPrimaryTreeFN
-            if self.args.primaryOnly
-            else self.config.dfTreeFN
+        df_tree_fp = (
+            self.config.df_primary_tree_fp
+            if self.args.primary_only
+            else self.config.df_tree_fp
         )
-        with open(dfTreeFN, "w") as dfTreeFile:
-            for node in self.depthFirstNodeList:
-                dfTreeFile.write("%s\n" % node.strDotPipeDepth())
+        with open(df_tree_fp, "w") as df_tree_file:
+            for node in self.depth_first_node_list:
+                df_tree_file.write(node.str_dot_pipe_depth + "\n")

-        self.errAndLog("Wrote depth-first tree traveral:\n    %s\n\n" % dfTreeFN)
+        logger.info(f"Wrote depth-first tree traversal:\n    {df_tree_fp}\n")

-    def writeTreeTable(self):
-        "writes depth-first pre-order traversal in table format"
+    def write_tree_table(self) -> None:
+        """Write depth-first pre-order traversal in table format."""

-        treeTableFN = self.config.treeTableFN
-        headerList = "#index ycc_label label parent_index parent_label".split()
-        with open(treeTableFN, "w") as treeTableFile:
-
treeTableFile.write("%s\n" % "\t".join(headerList)) - for node in self.depthFirstNodeList: - treeTableFile.write("%s\n" % node.strTreeTableRow()) + tree_table_fp = self.config.tree_table_fp + header_list = "#index ycc_label label parent_index parent_label".split() + with open(tree_table_fp, "w") as tree_table_file: + tree_table_file.write("\t".join(header_list) + "\n") + for node in self.depth_first_node_list: + tree_table_file.write("\t".join(node.tree_table_data) + "\n") - self.errAndLog("Wrote tree table:\n %s\n\n" % treeTableFN) + logger.info(f"Wrote tree table:\n {tree_table_fp}\n") - def writeContentMappings(self): - "writes the best 23andMe content page for each node" + def query_mrca(self) -> None: + """Write MRCA of two haplogroups.""" - with open(self.config.pageUpdatesFN, "w") as pageUpdatesFile: - pageUpdatesFile.write( - "%-10s %-10s %-10s %s\n" % ("yccOld", "SNP", "hgSNP", "ycc") + mrca_haplogroup_list = self.args.mrca_haplogroup_list + if not isinstance(mrca_haplogroup_list, list) or len(mrca_haplogroup_list) != 2: + raise ValueError( + f"mrca expects a list of 2 haplogroups, not this: {mrca_haplogroup_list}\n" ) - for page in Node.pageList: - pageUpdatesFile.write("%s\n" % page.strFull()) - - with open(self.config.pageMappingsFN, "w") as pageMappingsFile: - for node in self.depthFirstNodeList: - ancestorNode = node - while node.page is None: - ancestorNode = ancestorNode.parent - node.page = ancestorNode.page - - pageMappingsFile.write( - "%-25s %-15s | %s\n" - % (node.haplogroup, node.hgSNP if node.hgSNP else ".", node.page) - ) - - self.errAndLog( - "%s23andMe content pages\n\n" % utils.DASHES - + "Read %4d titles:\n %s\n\n" - % (len(Node.pageList), self.config.pagesFN) - + "Wrote %4d updates:\n %s\n\n" - % (len(Node.pageList), self.config.pageUpdatesFN) - + "Wrote %4d mappings:\n %s\n\n" - % (len(self.depthFirstNodeList), self.config.pageMappingsFN) - ) - - def queryMRCA(self): - "writes MRCA of two haplogroups" - - mrcaHaplogroupList = self.args.mrcaHaplogroupList - if type(mrcaHaplogroupList) != list or len(mrcaHaplogroupList) != 2: - sys.exit( - "ERROR. 
mrca expects a list of 2 haplogroups: %s\n" % mrcaHaplogroupList - ) - haplogroup1, haplogroup2 = mrcaHaplogroupList - node1 = self.hg2nodeDict[haplogroup1] - node2 = self.hg2nodeDict[haplogroup2] + haplogroup1, haplogroup2 = mrca_haplogroup_list + node1 = self.haplogroup_to_node[haplogroup1] + node2 = self.haplogroup_to_node[haplogroup2] mrca = node1.mrca(node2) - self.errAndLog( - "%sMRCA Query\n\n" % utils.DASHES - + "Haplogroup 1: %s\n" % node1.haplogroup - + "Haplogroup 2: %s\n" % node2.haplogroup - + "MRCA: %s\n\n" % mrca.haplogroup + logger.info( + "\nMRCA Query\n\n" + f"Haplogroup 1: {node1.haplogroup}\n" + f"Haplogroup 2: {node2.haplogroup}\n" + f"MRCA: {mrca.haplogroup}\n" ) - def querySNPpath(self): - "lists phylogenetic path for a query SNP" + def query_snp_path(self) -> None: + """List phylogenetic path for a query SNP.""" - queryName = self.args.querySNPname - self.errAndLog("%sSNP Query: %s\n\n" % (utils.DASHES, queryName)) - snp = self.snpDict.get(queryName, None) + query_name = self.args.query_snp_name + logger.info(f"\nSNP Query: {query_name}\n\n") + snp = self.snp_dict.get(query_name, None) if snp: - for node in snp.backTracePath(): - self.errAndLog("%s\n" % node.strSimple()) - if snp.label != queryName: - self.errAndLog( - "\nNote: %s is an alias of %s.\n" % (queryName, snp.label) - ) + for node in snp.back_trace_path(): + logger.info(node.str_simple + "\n") + if snp.label != query_name: + logger.info(f"\nNote: {query_name} is an alias of {snp.label}.\n") else: - self.errAndLog("Not found.\n") + logger.info("Not found.\n") - self.errAndLog("\n") + logger.info("") - # write newick files + # Write Newick files # ---------------------------------------------------------------------- - def writeNewick(self): - "write tree as is and with aligned terminal branch lengths" - - if not self.config.suppressOutputAndLog: - self.errAndLog("%sWriting trees...\n\n" % utils.DASHES) - self.root.writeNewick(self.config.yccTreeFN) - self.root.writeNewick(self.config.hgsnpTreeFN, useHgSNPlabel=True) - self.root.writeNewick(self.config.alignedYccTreeFN, alignTips=True) - self.root.writeNewick( - self.config.alignedHgsnpTreeFN, useHgSNPlabel=True, alignTips=True + def write_newick(self) -> None: + """Write tree as-is and with aligned terminal branch lengths.""" + + if not self.config.suppress_output: + logger.info("\nTree output\n") + self.root.write_newick(self.config.ycc_tree_fp) + self.root.write_newick(self.config.hg_snp_tree_fp, use_hg_snp_label=True) + self.root.write_newick(self.config.aligned_ycc_tree_fp, align_tips=True) + self.root.write_newick( + self.config.aligned_hg_snp_tree_fp, + use_hg_snp_label=True, + align_tips=True, ) - if self.args.writePlatformTrees: - self.writePlatformTrees() + if self.args.write_platform_trees: + self.write_platform_trees() - def writePlatformTrees(self): - "write trees whose branch lengths are numbers of platform sites" + def write_platform_trees(self) -> None: + """Write trees whose branch lengths are numbers of platform sites.""" - for platformVersion in range(1, self.config.maxPlatformVersionPlusOne): - self.root.writeNewick( - self.config.platformYccTreeFNtp % platformVersion, - platformVersion=platformVersion, + for platform in Config.platforms: + self.root.write_newick( + self.config.platform_ycc_tree_fp_tp.format(platform=platform), + platform=platform, ) - self.root.writeNewick( - self.config.platformHgsnpTreeFNtp % platformVersion, - useHgSNPlabel=True, - platformVersion=platformVersion, + self.root.write_newick( + 
self.config.platform_hg_snp_tree_fp_tp.format(platform=platform),
+                use_hg_snp_label=True,
+                platform=platform,
             )

-    # query
+    # Query
     # ----------------------------------------------------------------------
-    def identifyPhylogeneticPath(self, sample):
-        """
-        conducts a modified breadth-first search (bfs) to identify
+    def identify_phylogenetic_path(
+        self,
+        sample: "sample_module.Sample",
+    ) -> tuple[
+        "path_module.Path",
+        list["snp_module.SNP"],
+        list[tuple["node_module.Node", int, int]],
+    ]:
+        """Identify phylogenetic path for haplogroup call.
+
+        Conduct a modified breadth-first search (BFS) to identify
         the phylogenetic path leading from the root to the most terminal branch
-        representing a sample's haplogroup.
+        representing a Sample's haplogroup.

-        returns: best path, list of SNPs observed in the ancestral state.
+        Returns
+        -------
+        best_path : Path
+            The best phylogenetic path.
+        anc_snp_full_list : list[SNP]
+            List of SNPs observed in the ancestral state.
+        anc_der_count_tuples : list[tuple[Node, int, int]]
+            List of (node, num_ancestral, num_derived) tuples,
+            one per branch assessed.

-        key differences from a standard bfs are:
-        - stopping condition is robust to genotype error, homoplasy, etc.
-        - collapsing condition to speed up and (marginally) improve accuracy
+        Notes
+        -----
+        The key differences from a standard BFS are:
+        - Stopping condition is robust to genotype error, homoplasy, etc.
+        - Collapsing condition to speed up and (marginally) improve accuracy

-        when the stopping condition is met, adds the current path to a list.
-        at the end, post-processes this list and selects the best element.
+        When the stopping condition is met, add the current path to a list.
+        At the end, post-process this list and select the best element.

-        the stopping condition is a disjunction of three literals.
-        the first is trivial:
+        The stopping condition is a disjunction of three atomic conditions.
+        The first is trivial:

-        a. node.isLeaf()
-           we cannot go any further
+        a. node.is_leaf()
+           We cannot go any further.

-        the following table enumerates possible cases for the two other literals.
-        #anc: number of ancestral alleles observed on a branch
-        #der: number of derived alleles observed on the branch
-              (only considered if #anc == 2)
-        stop: whether or not to stop
+        The following table enumerates possible cases for the other two
+        atomic conditions.

-        | #anc | #der | stop | reason
+        #Anc: Number of ancestral alleles observed on a branch.
+        #Der: Number of derived alleles observed on the branch.
+              These are only considered if #anc == 2.
+        Stop: Whether or not to stop.
+
+        | #Anc | #Der | Stop | Reason
         |------|------|------|--------------------------------------------------------
-        | 0, 1 |  .   |  no  | insufficient evidence to stop, or none at all
-        |  2   |  1+  |  no  | given evidence to continue, do so for robustness
-        |  2   |  0   | yes  | reasonable evidence to stop and no evidence to continue
-        |  3+  |  .   | yes  | strong evidence to stop
+        | 0, 1 |  .   |  no  | Insufficient evidence to stop
+        |  2   |  1+  |  no  | Given evidence to continue, do so for robustness
+        |  2   |  0   | yes  | Reasonable evidence to stop and no evidence to continue
+        |  3+  |  .   | yes  | Strong evidence to stop

-        b. row 4: numAncestral > 2
-           der = 0: compelling evidence to stop.
-           der = 1+: the sample's lineage probably diverges from the known tree here.
+        b. Row 4: num_ancestral >= 3
+           num_derived == 0: Compelling evidence to stop.
+           num_derived >= 1: The sample's lineage probably diverges from the known tree here.

-        c. row 3: numAncestral == 2 and numDerived == 0
-           it is safe to assume that this path will not yield fruit.
+        c.
Row 3: num_ancestral == 2 and num_derived == 0 + It is safe to assume that this path will not yield fruit. - these conditions are robust to the most challenging case: + These conditions are robust to the most challenging case: when just a single SNP is genotyped on a branch, and the observed genotype corresponds to the ancestral allele due to genotype error, homoplasy, - or an undetected isogg error. when at least one derived allele is observed, + or an uncorrected ISOGG error. When at least one derived allele is observed, the conditions are also robust to two false ancestral alleles on a branch. - """ - - pathDeque = Path.createPathDeque(self.searchRoot.childList) - stoppedPathList = list() - ancSNPfullList = list() - while pathDeque: - path = pathDeque.popleft() - ancSNPlist, derSNPlist = path.node.assessGenotypes(sample) - path.updateWithBranchAssessment(ancSNPlist, derSNPlist) - ancSNPfullList.extend(ancSNPlist) - numAncestral, numDerived = len(ancSNPlist), len(derSNPlist) - if self.args.writeAncDerCounts: - sample.appendAncDerCountTuple(path.node, numAncestral, numDerived) + """ + path_deque = path_module.Path.create_path_deque(self.search_root.child_list) + stopped_path_list = [] + anc_snp_full_list = [] + anc_der_count_tuples = [] + while path_deque: + path = path_deque.popleft() + anc_snp_list, der_snp_list = path.node.assess_genotypes(sample) + path.update_with_branch_assessment(anc_snp_list, der_snp_list) + anc_snp_full_list.extend(anc_snp_list) + num_ancestral, num_derived = len(anc_snp_list), len(der_snp_list) + anc_der_count_tuples.append((path.node, num_ancestral, num_derived)) if ( - path.node.isLeaf() - or (numAncestral > self.config.args.ancStopThresh) - or (numAncestral == self.config.args.ancStopThresh and numDerived == 0) + path.node.is_leaf() + or (num_ancestral > self.config.args.anc_stop_thresh) + or ( + num_ancestral == self.config.args.anc_stop_thresh + and num_derived == 0 + ) ): - stoppedPathList.append(path) + stopped_path_list.append(path) else: - if numDerived >= self.config.args.derCollapseThresh: - pathDeque = deque() - pathDeque.extend(path.fork(path.node.childList)) + if num_derived >= self.config.args.der_collapse_thresh: + path_deque = deque() + + path_deque.extend(path.fork(path.node.child_list)) + + best_path = path_module.post_process_path_list_and_select_best( + stopped_path_list + ) - bestPath = Path.postProcessPathListAndSelectBest(stoppedPathList) - return bestPath, ancSNPfullList + return best_path, anc_snp_full_list, anc_der_count_tuples - def getDFSrank(self, haplogroup): - "returns the DFS rank of a haplogroup" + def get_dfs_rank(self, haplogroup: str) -> int: + """Return the DFS rank of a haplogroup.""" - return self.hg2nodeDict[haplogroup].DFSrank + dfs_rank = self.haplogroup_to_node[haplogroup].dfs_rank + return dfs_rank - # build tree from Newick-formatted text file + # Build tree from Newick-formatted text file # ---------------------------------------------------------------------- - def buildTreeFromNewick(self): - """ - Reads in a Newick-formatted tree, strips out bootstraps, - tokenizes it, and initiates tree building. - Returns a node instance: the root. - """ + def build_tree_from_newick(self) -> "node_module.Node": + """Read a Newick-formatted tree and build a Tree instance. 
-        utils.checkFileExistence(self.config.primaryTreeFN, "Primary tree")
-        with open(self.config.primaryTreeFN, "r") as treeFile:
-            treeString = treeFile.readline().strip()
-        self.errAndLog(
-            "\n%sRead primary tree topology:\n    %s\n\n"
-            % (utils.DASHES, self.config.primaryTreeFN)
-        )

-        """
-        Tokenization:
-            a. strip out bootstraps: text within brackets
-            b. split on any semantic token: [%s]
-            c. but group to retain retain tokens themselves: ()
-            d. then drop empty tokens from splitting adjacent semantic tokens
-        """
-        treeString = re.subn(r"\[.*?\]", "", treeString)[0]
-        treeList = re.split(
-            "([%s])" % self.config.newickSemanticTokenString, treeString
+        logger.info("\nPrimary tree\n")
+        tree_string = load_data(self.config.primary_tree_data_file, log=True).strip()
+
+        # Tokenization:
+        # a. Strip out bootstraps: text within brackets.
+        # b. Split on any semantic token.
+        # c. Group to retain the tokens themselves.
+        # d. Drop empty tokens from splitting adjacent semantic tokens.
+        tree_string = re.subn(r"\[.*?\]", "", tree_string)[0]
+        tree_list = re.split(
+            f"([{self.config.newick_semantic_token_string}])",
+            tree_string,
         )
-        treeList = [token for token in treeList if token is not ""]
-        treeDeque = deque(treeList)
+        tree_list = [token for token in tree_list if token != ""]
+        tree_deque = deque(tree_list)

-        hasLengths = ":" in treeDeque  # determine whether tree has lengths
-        root = self.addChildSubtreeFromNewickDeque(None, treeDeque, hasLengths)
-        root.writeNewick(self.config.alignedPrimaryTreeFN, alignTips=True)
+        has_lengths = ":" in tree_deque  # Determine whether the tree has lengths
+        root = self.add_child_subtree_from_newick_deque(None, tree_deque, has_lengths)
+        root.write_newick(self.config.aligned_primary_tree_fp, align_tips=True)
         return root

-    def addChildSubtreeFromNewickDeque(self, parent, treeDeque, hasLengths):
-        """
-        Recursively processes a deque of Newick tokens to build a tree.
+    def add_child_subtree_from_newick_deque(
+        self,
+        parent: Optional["node_module.Node"],
+        tree_deque: deque[str],
+        has_lengths: bool,
+    ) -> "node_module.Node":
+        """Process a deque of Newick tokens to build a tree.
+
         Each call constructs one subtree and returns its root.
-        1. Recursive case: an open paren indicates a compound subtree.
-           The function calls itself to add the first child.
-        2. Base case: an alphanumeric label indicates a leaf.
-           Return a simple leaf node.
-        3. Following the first child subtree, there will be
-           an arbitrary number of sibling subtrees, each preceeded by a comma.
-           The function calls itself to add each in turn.
-        4. The end of a subtree signaled by a close paren.
-           At this point, add a label and/or length, if either are provided.
-        """
+
+        1. Recursive case
+           An open paren indicates a compound subtree.
+           The function calls itself to add the first child.
+        2. Base case
+           An alphanumeric label indicates a leaf.
+           Return a simple leaf node.
+        3. Following the first child subtree
+           There will be an arbitrary number of sibling subtrees,
+           each preceded by a comma.
+           The function calls itself to add each in turn.
+        4. The end of a subtree
+           Signaled by a close paren.
+           At this point, add a label and/or length, if either are provided.
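+
+        For example, the token sequence for "(A,B)C;" would yield a node
+        labeled "C" with children labeled "A" and "B".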
+ """ # ------------------------------------------------------------------------- - # first node of subtree - node = Node(parent=parent, tree=self) - token = treeDeque.popleft() - if token == "(": # recursive case: compound subtree - self.addChildSubtreeFromNewickDeque(node, treeDeque, hasLengths) - else: # base case: leaf tree - node.setLabel(token) - if hasLengths: - Tree.processNewickLength(node, treeDeque) + # First node of subtree + node = node_module.Node(parent=parent, tree=self) + token = tree_deque.popleft() + if token == "(": # Recursive case: compound subtree + self.add_child_subtree_from_newick_deque(node, tree_deque, has_lengths) + else: # Base case: leaf tree + node.set_label(token) + if has_lengths: + type(self).process_newick_length(node, tree_deque) + return node # ------------------------------------------------------------------------- - # second through nth nodes of subtree - token = treeDeque.popleft() + # Second through n-th nodes of subtree + token = tree_deque.popleft() while token == ",": - self.addChildSubtreeFromNewickDeque(node, treeDeque, hasLengths) - token = treeDeque.popleft() + self.add_child_subtree_from_newick_deque(node, tree_deque, has_lengths) + token = tree_deque.popleft() # ------------------------------------------------------------------------- - # end of subtree - Tree.verifyToken(token, ")") - node.reverseChildren() - token = treeDeque.popleft() - if token not in self.config.newickSemanticTokenSet: - node.setLabel(token) - if hasLengths and treeDeque[0] != ";": - self.config.processNewickLength(node, treeDeque) - return node + # End of subtree + verify_newick_token(token, ")") + node.reverse_children() + token = tree_deque.popleft() + if token not in self.config.newick_semantic_token_set: + node.set_label(token) - @staticmethod - def verifyToken(observed, expected): - "exits program if observed and expected strings do not match" + if has_lengths and tree_deque[0] != ";": + self.process_newick_length(node, tree_deque) - if observed != expected: - sys.exit( - "ERROR. Malformed newick file.\n" - + "Expected this token: %s\n" % expected - + "Got this one: %s\n" % observed - ) + return node - @staticmethod - def processNewickLength(node, treeDeque): - "processes a Newick-format branch length of the form :length" + @classmethod + def process_newick_length( + cls, + node: "node_module.Node", + tree_deque: deque[str], + ) -> None: + """Set branch length from Newick tokens.""" - Tree.verifyToken(treeDeque.popleft(), ":") # next token should be colon - branchLength = float(treeDeque.popleft()) # then branch length - node.setBranchLength(branchLength) + verify_newick_token(tree_deque.popleft(), ":") # Next token should be colon + branch_length = float(tree_deque.popleft()) # Branch length + node.set_branch_length(branch_length) - # import SNPs and assign to branches + # Import SNPs and assign to branches # ---------------------------------------------------------------------- - def importIsoggSnps(self): - "import ISOGG SNPs" - - SNP.setClassVariables(self) - self.readPreferredSNPnameSet() - self.readRepresentativeSNPnameSet() - self.readIsoggMultiAllelicPosSet() - self.readIsoggOmitSet() - self.readIsoggCorrectionsDict() - self.parseIsoggTable() - self.setDepthFirstNodeList() - self.sortSNPlistsAndSetRepresentatives() - self.writeIsoggCounts() - self.writeUniqueSNPtable() - self.writeNewick() - self.checkMultiAllelics() - - def readPreferredSNPnameSet(self): - """reads a set of widely known SNP names. 
presence on this list is - the primary selection criterion for SNP labels""" - - preferredSNPnamesFN = self.config.preferredSNPnamesFN - - utils.checkFileExistence(preferredSNPnamesFN, "Preferred SNP names") - with open(preferredSNPnamesFN, "r") as preferredSNPnamesFile: - for line in preferredSNPnamesFile: - self.preferredSNPnameSet.add(line.strip()) - - self.errAndLog( - "%sRead preferred SNP names\n" % utils.DASHES - + "%6d SNP names: %s\n\n" - % (len(self.preferredSNPnameSet), preferredSNPnamesFN) + def import_isogg_snps(self) -> None: + """Import ISOGG SNPs.""" + + snp_module.SNP.set_class_variables(self) + self.load_preferred_snp_name_set() + self.load_representative_snp_name_set() + self.load_isogg_multi_allelic_pos_set() + self.load_isogg_omit_set() + self.load_isogg_corrections() + self.load_and_parse_isogg_table() + self.set_depth_first_node_list() + self.sort_snplists_and_set_representatives() + self.log_isogg_counts() + self.write_unique_snp_table() + self.write_newick() + self.check_multi_allelics() + + def load_preferred_snp_name_set(self) -> None: + """Load preferred SNP names. + + Presence on this list is the primary selection criterion for SNP labels. + + Set self.preferred_snp_name_set. + + """ + for line in load_data_lines(self.config.preferred_snp_names_data_file): + self.preferred_snp_name_set.add(line.strip()) + + logger.info( + "\nVariant names\n\n" + "Loaded preferred SNP names\n" + f"{len(self.preferred_snp_name_set):6d} SNP names: " + f"{self.config.preferred_snp_names_data_file.filename}\n" ) - def readRepresentativeSNPnameSet(self): - "reads the names of SNPs deemed representative for their respective lineages" + def load_representative_snp_name_set(self) -> None: + """Load the names of SNPs deemed representative for their respective lineages. + + Set self.representative_snp_name_set. 
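+
+        The primary source file is expected to hold, in its second column,
+        comma-separated SNPs, each given as slash-separated aliases
+        ("." denotes none).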
- isoggRepSNPfn = self.config.isoggRepSNPfn - otherRepSNPfn = self.config.otherRepSNPfn - countsDicts = defaultdict(int) + """ + counts_dict: dict[str, int] = defaultdict(int) set1 = set() - utils.checkFileExistence(isoggRepSNPfn, "First representative SNPs") - with open(isoggRepSNPfn, "r") as isoggRepSNPfile: - for line in isoggRepSNPfile: - countsDicts["lines"] += 1 - snpAliasesString = line.strip().split()[1] - if snpAliasesString != ".": - countsDicts["haplogroups"] += 1 - for snpAliases in snpAliasesString.split(","): - countsDicts["snps"] += 1 - for snpName in snpAliases.split("/"): - set1.add(snpName) + for line in load_data_lines(self.config.isogg_rep_snp_data_file): + counts_dict["lines"] += 1 + snp_aliases_string = line.strip().split()[1] + if snp_aliases_string != ".": + counts_dict["haplogroups"] += 1 + for snp_aliases in snp_aliases_string.split(","): + counts_dict["snps"] += 1 + for snp_name in snp_aliases.split("/"): + set1.add(snp_name) set2 = set() - utils.checkFileExistence(otherRepSNPfn, "Second representative SNPs") - with open(otherRepSNPfn, "r") as otherRepSNPfile: - for line in otherRepSNPfile: - set2.add(line.strip().split()[1]) - - self.representativeSNPnameSet = set1 | set2 - self.errAndLog( - "Read representative SNPs\n" - + "%6d haplogroups in: %s\n" % (countsDicts["lines"], isoggRepSNPfn) - + "%6d haplogroups with at least one ISOGG-designated representative SNP\n" - % countsDicts["haplogroups"] - + "%6d SNPs, as some haplogroups have more than one representative\n" - % countsDicts["snps"] - + "%6d SNP names, including aliases\n" % len(set1) - + "%6d additional representative SNPs read from: %s\n" - % (len(set2), otherRepSNPfn) - + "%6d total SNP names\n\n" % len(self.representativeSNPnameSet) + for line in load_data_lines(self.config.other_rep_snp_data_file): + set2.add(line.strip().split()[1]) + + self.representative_snp_name_set = set1 | set2 + logger.info( + "Loaded representative SNPs\n" + f"{counts_dict['lines']:6d} Haplogroups in: " + f"{self.config.isogg_rep_snp_data_file.filename}\n" + f"{counts_dict['haplogroups']:6d} " + "Haplogroups with at least one ISOGG-designated representative SNP\n" + f"{counts_dict['snps']:6d} " + "SNPs, as some haplogroups have more than one representative\n" + f"{len(set1):6d} SNP names, including aliases\n" + f"{len(set2):6d} Additional representative SNPs loaded from: " + f"{self.config.other_rep_snp_data_file.filename}\n" + f"{len(self.representative_snp_name_set):6d} Total SNP names\n" ) - def readIsoggMultiAllelicPosSet(self): - "reads list of positions to exclude because of multiple alleles" + def load_isogg_multi_allelic_pos_set(self) -> None: + """Load list of positions to exclude due to multiple alleles. - if not os.path.isfile(self.config.isoggMultiAllelicFN): - return - with open(self.config.isoggMultiAllelicFN, "r") as multiFile: - for line in multiFile: - position = int(line.strip()) - self.multiAllelicOldPosSet.add(position) + Set self.multi_allelic_old_pos_set. 
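+
+        Each line of the source file is expected to hold a single integer
+        position.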
- def readIsoggOmitSet(self): - "reads a list of SNPs to omit from ISOGG db" + """ + for line in load_data_lines(self.config.isogg_multi_allelic_data_file): + position = int(line.strip()) + self.multi_allelic_old_pos_set.add(position) - for isoggOmitFN in self.config.isoggOmitFNlist: - if not os.path.isfile(isoggOmitFN): - continue - with open(isoggOmitFN, "r") as omitFile: - for line in omitFile: - lineList = line.strip().split() - if len(lineList) > 0 and lineList[0] != "#": - position, mutation = lineList[2:4] - self.isoggOmitSet.add((position, mutation)) - - def readIsoggCorrectionsDict(self): - "reads a list of SNPs to correct from ISOGG db" - - for isoggCorrectionsFN in self.config.isoggCorrectionsFNlist: - if not os.path.isfile(isoggCorrectionsFN): - continue - with open(isoggCorrectionsFN, "r") as correctionsFile: - for line in correctionsFile: - lineList = line.strip().split() - if len(lineList) > 0 and lineList[0] != "#": - haplogroup, position, mutation, aliases = lineList[1:5] - for alias in aliases.split(","): - self.isoggCorrectionDict[alias] = ( - haplogroup, - position, - mutation, - ) - - def parseIsoggTable(self): - "parses ISOGG table" - - # input reader - utils.checkFileExistence(self.config.isoggFN, "Isogg") - isoggInFile = open(self.config.isoggFN, "r") - isoggReader = csv.reader(isoggInFile, delimiter="\t") - next(isoggReader) # ignore header - - # output file handles - if self.config.suppressOutputAndLog: - isoggOutFile = None - isoggDropOutFile = None + def load_isogg_omit_set(self) -> None: + """Load list of SNPs to omit from ISOGG database. + + Set self.isogg_omit_set. + + """ + for isogg_omit_data_file in self.config.isogg_omit_data_files: + for line in load_data_lines(isogg_omit_data_file): + line_list = line.strip().split() + if len(line_list) > 0 and line_list[0] != "#": + position_str, mutation = line_list[2:4] + self.isogg_omit_set.add((position_str, mutation)) + + def load_isogg_corrections(self) -> None: + """Load SNPs to correct from ISOGG database. + + Set self.isogg_correction_dict. 
+ + """ + for isogg_corrections_data_file in self.config.isogg_corrections_data_files: + for line in load_data_lines(isogg_corrections_data_file): + line_list = line.strip().split() + if len(line_list) > 0 and line_list[0] != "#": + haplogroup, position_str, mutation, aliases = line_list[1:5] + for alias in aliases.split(","): + self.isogg_correction_dict[alias] = ( + haplogroup, + position_str, + mutation, + ) + + def load_and_parse_isogg_table(self) -> None: + """Load and parse ISOGG table.""" + + logger.info("\nISOGG variant data\n") + if self.config.suppress_output: + isogg_out_file = None + isogg_drop_out_file = None else: - isoggOutFile = open(self.config.cleanedIsoggFN, "w") - isoggDropOutFile = open(self.config.droppedIsoggFN, "w") + isogg_out_file = open(self.config.cleaned_isogg_fp, "w") + isogg_drop_out_file = open(self.config.dropped_isogg_fp, "w") - droppedMarkerList = list() + dropped_marker_list = [] + for line in load_data_lines(self.config.isogg_data_file, log=True)[1:]: + line_list = line.split("\t") + self.isogg_counts_dict["read"] += 1 - for lineList in isoggReader: - self.isoggCountsDict["read"] += 1 + # Clean up data row and extract values + line_list = [element.strip() for element in line_list] + if line_list[1] == "": # When present, remove extra tab after SNP name + del line_list[1] - # clean up data row and extract values - lineList = [element.strip() for element in lineList] - if lineList[1] == "": # when present, remove extra tab after snp name - del lineList[1] - if len(lineList) != 6: - self.isoggCountsDict["badLines"] += 1 + if len(line_list) != 6: + self.isogg_counts_dict["bad_lines"] += 1 continue - name, haplogroup, _, _, position, mutation = lineList - # apply corrections - if name in self.isoggCorrectionDict: - haplogroup, position, mutation = self.isoggCorrectionDict[name] - self.numSNPsCorrected += 1 + name, haplogroup, _, _, position_str, mutation = line_list - # identify markers to drop - recordIsBad, markerIsOkToRepresentNode = self.checkIsoggRecord( - name, haplogroup, position, mutation + # Apply corrections + if name in self.isogg_correction_dict: + haplogroup, position_str, mutation = self.isogg_correction_dict[name] + self.num_snps_corrected += 1 + + # Identify markers to drop + record_is_bad, marker_is_ok_to_represent_node = self.check_isogg_record( + name, + haplogroup, + position_str, + mutation, ) - if recordIsBad: - self.isoggCountsDict["dropped"] += 1 - if isoggDropOutFile: - isogg_drop_output = six.ensure_str( - "%-10s %-25s %8s %s\n" - % (six.ensure_text(name), haplogroup, position, mutation) + if record_is_bad: + self.isogg_counts_dict["dropped"] += 1 + if isogg_drop_out_file: + isogg_drop_out_file.write( + f"{name:10s} {haplogroup:25s} {position_str:>8s} {mutation}\n" ) - isoggDropOutFile.write(isogg_drop_output) - if markerIsOkToRepresentNode: - droppedMarkerList.append(DroppedMarker(name, haplogroup)) + + if marker_is_ok_to_represent_node: + dropped_marker = snp_module.DroppedMarker(name, haplogroup, self) + dropped_marker_list.append(dropped_marker) + continue - # process retained SNPs - self.isoggCountsDict["retained"] += 1 - position = int(position) - if isoggOutFile: - isoggOutFile.write( - "%-10s %-25s %8d %s\n" % (name, haplogroup, position, mutation) + # Process retained SNPs + self.isogg_counts_dict["retained"] += 1 + position = int(position_str) + if isogg_out_file: + isogg_out_file.write( + f"{name:10s} {haplogroup:25s} {position:8d} {mutation}\n" ) - self.constructSNP(name, haplogroup, position, mutation) - 
self.addDroppedMarkersToNodes(droppedMarkerList)
-        utils.closeFiles([isoggInFile, isoggOutFile, isoggDropOutFile])
+            self.construct_snp(name, haplogroup, position, mutation)

-    def constructSNP(self, name, haplogroup, position, mutation):
-        """
-        typically, instantiates a SNP and adds it to various containers.
-        note that when SNPs are instantiated, they are added to the tree,
-        and this process may entail growing the tree to include the corresponding node.
+        self.add_dropped_markers_to_nodes(dropped_marker_list)
+        for file in [isogg_out_file, isogg_drop_out_file]:
+            if file is not None:
+                file.close()

-        more specialized things occur if a SNP already exists at this position.
-        """
+    def construct_snp(
+        self,
+        name: str,
+        haplogroup: str,
+        position: int,
+        mutation: str,
+    ) -> None:
+        """Construct SNP.

-        if self.hg2nodeDict:
+        Typically, instantiate a SNP and add it to various containers.
+        When SNPs are instantiated, they are added to the tree.
+        This process may entail growing the tree to include the corresponding node.
+        More specialized things occur if a SNP already exists at this position.
+
+        """
+        if self.haplogroup_to_node:
             ancestral, derived = mutation[0], mutation[3]
-            snpKey = (haplogroup, position)
+            snp_key = (haplogroup, position)

-            if snpKey in self.snpDict:  # snp exists under an alias
-                snp = self.snpDict[snpKey]
-                if snp.isAncestral(ancestral) and snp.isDerived(derived):
-                    snp.addName(name)
-                    self.snpDict[name] = snp
+            if snp_key in self.snp_dict:  # SNP exists under an alias
+                snp = self.snp_dict[snp_key]
+                if snp.is_ancestral(ancestral) and snp.is_derived(derived):
+                    snp.add_name(name)
+                    self.snp_dict[name] = snp
                 else:
-                    newSNP = SNP(name, haplogroup, position, ancestral, derived)
-                    sys.exit("\n\nERROR! Conlicting SNPs:\n%s\n%s\n" % (snp, newSNP))
+                    new_snp = snp_module.SNP(
+                        name,
+                        haplogroup,
+                        position,
+                        ancestral,
+                        derived,
+                    )
+                    raise ValueError(f"Conflicting SNPs:\n{snp}\n{new_snp}\n")
             else:
-                if position in self.snpDict:  # another snp with same position
-                    oldSNP = self.snpDict[position]
+                if position in self.snp_dict:  # Another SNP with same position
+                    old_snp = self.snp_dict[position]
                     if (
-                        ancestral not in oldSNP.alleleSet
-                        or derived not in oldSNP.alleleSet
+                        ancestral not in old_snp.allele_set
+                        or derived not in old_snp.allele_set
                     ):
-                        self.multiAllelicNewPosSet.add(position)
+                        self.multi_allelic_new_pos_set.add(position)

-                # typical behavior
-                snp = SNP(name, haplogroup, position, ancestral, derived)
-                self.snpDict[(haplogroup, position)] = snp
-                self.snpDict[name] = snp
-                self.snpDict[position] = snp
-                self.snpList.append(snp)
-                self.snpPosSet.add(position)
-                self.snpNameSet.add(name)
-                self.isoggCountsDict["unique"] += 1
+                # Typical behavior
+                snp = snp_module.SNP(name, haplogroup, position, ancestral, derived)
+                self.snp_dict[(haplogroup, position)] = snp
+                self.snp_dict[name] = snp
+                self.snp_dict[position] = snp
+                self.snp_list.append(snp)
+                self.snp_pos_set.add(position)
+                self.snp_name_set.add(name)
+                self.isogg_counts_dict["unique"] += 1

-    def addDroppedMarkersToNodes(self, droppedMarkerList):
-        "adds dropped markers to coresponding nodes"
+    def add_dropped_markers_to_nodes(
+        self,
+        dropped_marker_list: list["snp_module.DroppedMarker"],
+    ) -> None:
+        """Add dropped markers to corresponding nodes."""

-        for droppedMarker in droppedMarkerList:
-            droppedMarker.addToNode()
+        for dropped_marker in dropped_marker_list:
+            dropped_marker.add_to_node()

-    def sortSNPlistsAndSetRepresentatives(self):
-        "for each node, sorts snps by priority ranking and
selects the best representative"
+    def sort_snplists_and_set_representatives(self) -> None:
+        """Sort SNPs by priority ranking and select the best representative.
+
+        Repeat for each Node.
+
+        """
+        if not self.depth_first_node_list:
+            self.set_depth_first_node_list()

-        if not self.depthFirstNodeList:
-            self.setDepthFirstNodeList
+        for node in self.depth_first_node_list:
+            node.priority_sort_snp_list_and_set_hg_snp()

-        for node in self.depthFirstNodeList:
-            node.prioritySortSNPlistAndSetHgSNP()
+    def log_isogg_counts(self) -> None:
+        """Log counts of ISOGG SNPs."""

-    def writeIsoggCounts(self):
         config = self.config
-        countsDict = self.isoggCountsDict
-        numAltNames = countsDict["retained"] - countsDict["unique"]
-
-        logTextList = [
-            "%sRead ISOGG SNP data:\n    %s\n" % (utils.DASHES, config.isoggFN),
-            "  %5d SNPs read" % countsDict["read"],
-            "  %5d corrected " % self.numSNPsCorrected
-            + "based on:\n            %s\n"
-            % ("\n" + " " * 12).join(config.isoggCorrectionsFNlist),
-            "- %5d SNPs dropped" % countsDict["dropped"],
-            "  %5d flagged as not meeting quality guidelines" % countsDict["qc"],
-            "  %5d tree location approximate" % countsDict["approxLoc"],
-            "  %5d removed, flagged as provisional, or otherwise problematic"
-            % countsDict["provisional"],
-            "  %5d non-SNPs" % countsDict["nonSNP"],
-            "  %5d excluded as multiallelic " % countsDict["multiallelic"]
-            + "based on:\n        %s" % config.isoggMultiAllelicFN,
-            "  %5d duplicated names" % countsDict["duplicatedNames"],
-            "  %5d explicitly excluded " % countsDict["omitted"]
-            + "based on:\n                  %s"
-            % ("\n" + " " * 18).join(config.isoggOmitFNlist),
-            "- %5d bad lines" % countsDict["badLines"],
-            "= %5d SNPs retained\n" % countsDict["retained"],
-            "- %5d alternative names" % numAltNames,
-            "= %5d unique SNPs added to the tree\n" % countsDict["unique"],
+        counts_dict = self.isogg_counts_dict
+        num_alt_names = counts_dict["retained"] - counts_dict["unique"]
+        correction_fps_str = ("\n" + " " * 12).join(
+            [
+                isogg_corrections_data_file.filename
+                for isogg_corrections_data_file in config.isogg_corrections_data_files
+            ]
+        )
+        omit_fps_str = ("\n" + " " * 18).join(
+            [
+                isogg_omit_data_file.filename
+                for isogg_omit_data_file in config.isogg_omit_data_files
+            ]
+        )
+
+        log_text_list = [
+            f"  {counts_dict['read']:5d} SNPs loaded",
+            f"  {self.num_snps_corrected:5d} Corrected based on:\n"
+            f"            {correction_fps_str}\n",
+            f"- {counts_dict['dropped']:5d} SNPs dropped",
+            f"  {counts_dict['qc']:5d} Flagged as not meeting quality guidelines",
+            f"  {counts_dict['approx_loc']:5d} Tree location approximate",
+            f"  {counts_dict['provisional']:5d} Removed, flagged as provisional, "
+            "or otherwise problematic",
+            f"  {counts_dict['non_snp']:5d} Non-SNPs",
+            f"  {counts_dict['multiallelic']:5d} Excluded as multiallelic "
+            f"based on: {config.isogg_multi_allelic_data_file.filename}",
+            f"  {counts_dict['duplicated_names']:5d} Duplicated names",
+            f"  {counts_dict['omitted']:5d} Explicitly excluded based on:\n"
+            f"                  {omit_fps_str}",
+            f"- {counts_dict['bad_lines']:5d} Bad lines",
+            f"= {counts_dict['retained']:5d} SNPs retained\n",
+            f"- {num_alt_names:5d} Alternative names",
+            f"= {counts_dict['unique']:5d} Unique SNPs added to the tree\n",
         ]

-        if not config.suppressOutputAndLog:
-            logTextList.extend(
+        if not config.suppress_output:
+            log_text_list.extend(
                 [
-                    "Wrote summary tables",
-                    "  dropped:  %s" % config.droppedIsoggFN,
-                    "  retained: %s" % config.cleanedIsoggFN,
-                    "  unique:   %s\n" % config.uniqueIsoggFN,
+                    "Wrote summary tables:",
+                    f"- Dropped: {config.dropped_isogg_fp}",
+                    f"-
+    def log_isogg_counts(self) -> None:
+        """Log counts of ISOGG SNPs."""
-    def writeIsoggCounts(self):
         config = self.config
-        countsDict = self.isoggCountsDict
-        numAltNames = countsDict["retained"] - countsDict["unique"]
-
-        logTextList = [
-            "%sRead ISOGG SNP data:\n  %s\n" % (utils.DASHES, config.isoggFN),
-            "  %5d SNPs read" % countsDict["read"],
-            "  %5d corrected " % self.numSNPsCorrected
-            + "based on:\n    %s\n"
-            % ("\n" + " " * 12).join(config.isoggCorrectionsFNlist),
-            "- %5d SNPs dropped" % countsDict["dropped"],
-            "  %5d flagged as not meeting quality guidelines" % countsDict["qc"],
-            "  %5d tree location approximate" % countsDict["approxLoc"],
-            "  %5d removed, flagged as provisional, or otherwise problematic"
-            % countsDict["provisional"],
-            "  %5d non-SNPs" % countsDict["nonSNP"],
-            "  %5d excluded as multiallelic " % countsDict["multiallelic"]
-            + "based on:\n    %s" % config.isoggMultiAllelicFN,
-            "  %5d duplicated names" % countsDict["duplicatedNames"],
-            "  %5d explicitly excluded " % countsDict["omitted"]
-            + "based on:\n    %s"
-            % ("\n" + " " * 18).join(config.isoggOmitFNlist),
-            "- %5d bad lines" % countsDict["badLines"],
-            "= %5d SNPs retained\n" % countsDict["retained"],
-            "- %5d alternative names" % numAltNames,
-            "= %5d unique SNPs added to the tree\n" % countsDict["unique"],
+        counts_dict = self.isogg_counts_dict
+        num_alt_names = counts_dict["retained"] - counts_dict["unique"]
+        correction_fps_str = ("\n" + " " * 12).join(
+            [
+                isogg_corrections_data_file.filename
+                for isogg_corrections_data_file in config.isogg_corrections_data_files
+            ]
+        )
+        omit_fps_str = ("\n" + " " * 18).join(
+            [
+                isogg_omit_data_file.filename
+                for isogg_omit_data_file in config.isogg_omit_data_files
+            ]
+        )
+
+        log_text_list = [
+            f"  {counts_dict['read']:5d} SNPs loaded",
+            f"  {self.num_snps_corrected:5d} Corrected based on:\n"
+            f"            {correction_fps_str}\n",
+            f"- {counts_dict['dropped']:5d} SNPs dropped",
+            f"  {counts_dict['qc']:5d} Flagged as not meeting quality guidelines",
+            f"  {counts_dict['approx_loc']:5d} Tree location approximate",
+            f"  {counts_dict['provisional']:5d} Removed, flagged as provisional, "
+            "or otherwise problematic",
+            f"  {counts_dict['non_snp']:5d} Non-SNPs",
+            f"  {counts_dict['multiallelic']:5d} Excluded as multiallelic "
+            f"based on: {config.isogg_multi_allelic_data_file.filename}",
+            f"  {counts_dict['duplicated_names']:5d} Duplicated names",
+            f"  {counts_dict['omitted']:5d} Explicitly excluded based on:\n"
+            f"                  {omit_fps_str}",
+            f"- {counts_dict['bad_lines']:5d} Bad lines",
+            f"= {counts_dict['retained']:5d} SNPs retained\n",
+            f"- {num_alt_names:5d} Alternative names",
+            f"= {counts_dict['unique']:5d} Unique SNPs added to the tree\n",
         ]

-        if not config.suppressOutputAndLog:
-            logTextList.extend(
+        if not config.suppress_output:
+            log_text_list.extend(
                 [
-                    "Wrote summary tables",
-                    "  dropped:  %s" % config.droppedIsoggFN,
-                    "  retained: %s" % config.cleanedIsoggFN,
-                    "  unique:   %s\n" % config.uniqueIsoggFN,
+                    "Wrote summary tables:",
+                    f"- Dropped:  {config.dropped_isogg_fp}",
+                    f"- Retained: {config.cleaned_isogg_fp}",
+                    f"- Unique:   {config.unique_isogg_fp}\n",
                 ]
             )

-        self.errAndLog(("\n" + " " * 4).join(logTextList) + "\n")
+        logger.info(("\n" + " " * 4).join(log_text_list))

-    def writeUniqueSNPtable(self):
-        "sort unique snp list by phylogeny and position; write to file"
+    def write_unique_snp_table(self) -> None:
+        """Sort unique SNP list by phylogeny and position, then write to file."""

-        if not self.config.suppressOutputAndLog:
-            self.snpList = sorted(self.snpList, key=attrgetter("DFSrank", "position"))
-            with open(self.config.uniqueIsoggFN, "w") as uniqueIsoggFile:
-                for snp in self.snpList:
-                    uniqueIsoggFile.write("%s\n" % snp.strWithAllNames())
+        if not self.config.suppress_output:
+            self.snp_list = sorted(
+                self.snp_list,
+                key=attrgetter("dfs_rank", "position"),
+            )
+            with open(self.config.unique_isogg_fp, "w") as unique_isogg_file:
+                for snp in self.snp_list:
+                    unique_isogg_file.write(f"{snp.str_with_all_names}\n")
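
Note: `attrgetter("dfs_rank", "position")` sorts on two attributes at once:
depth-first tree rank first, with position as the tie-breaker, so the table
comes out in phylogenetic order. A self-contained stdlib example:

    from dataclasses import dataclass
    from operator import attrgetter

    @dataclass
    class Marker:  # Hypothetical stand-in for a yhaplo SNP
        dfs_rank: int
        position: int

    markers = [Marker(2, 100), Marker(1, 300), Marker(1, 200)]
    markers.sort(key=attrgetter("dfs_rank", "position"))
    # Result: (1, 200), (1, 300), (2, 100)
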
-    def checkIsoggRecord(self, name, haplogroup, position, mutation):
-        """
-        returns a tuple of booleans: (recordIsBad, nameIsOkToRepresentNode)
+    def check_isogg_record(
+        self,
+        name: str,
+        haplogroup: str,
+        position_str: str,
+        mutation: str,
+    ) -> tuple[bool, bool]:
+        """Check an ISOGG record.
+
+        Returns
+        -------
+        record_is_bad : bool
+            When True, do not use this marker for classification.
+        name_is_ok_to_represent_node : bool
+            When True, if no SNPs are retained for the corresponding node,
+            it is OK to use this marker name for the node's hg_snp representation.
+
-        how to interpret TRUE values:
-        1. recordIsBad : do not use this marker for classification
-        2. markerIsOkToRepresentNode : if no SNPs are retained for the corresponding node,
-           it's OK to use this marker name for the node's hgSNP representation
         """
-
         if name.endswith("^"):
-            self.isoggCountsDict["qc"] += 1
+            self.isogg_counts_dict["qc"] += 1
             return True, True

         if haplogroup.find("~") >= 0:
-            self.isoggCountsDict["approxLoc"] += 1
-            return True, False  # second value irrelevant: no corresponding node
+            self.isogg_counts_dict["approx_loc"] += 1
+            return True, False  # Second value irrelevant: no corresponding node

         if (
             haplogroup.find("Investigation") >= 0
@@ -743,66 +777,83 @@ def checkIsoggRecord(self, name, haplogroup, position, mutation):
             or haplogroup.find("Freq. Mut.") >= 0
             or len(haplogroup) < 1
         ):
-            self.isoggCountsDict["provisional"] += 1
-            return True, False  # second value irrelevant: no corresponding node
+            self.isogg_counts_dict["provisional"] += 1
+            return True, False  # Second value irrelevant: no corresponding node

-        if len(mutation) != 4 or mutation.find("?") >= 0 or position.find("..") >= 0:
-            self.isoggCountsDict["nonSNP"] += 1
+        if (
+            len(mutation) != 4
+            or mutation.find("?") >= 0
+            or position_str.find("..") >= 0
+        ):
+            self.isogg_counts_dict["non_snp"] += 1
             return True, True

-        if int(position) in self.multiAllelicOldPosSet:
-            self.isoggCountsDict["multiallelic"] += 1
+        try:
+            position = int(position_str)
+        except ValueError:
+            logger.info(f"\nERROR. Invalid position: {position_str}\n")
+            return True, False
+
+        if position in self.multi_allelic_old_pos_set:
+            self.isogg_counts_dict["multiallelic"] += 1
             return True, True

-        if name in self.snpNameSet:
-            self.isoggCountsDict["duplicatedNames"] += 1
+        if name in self.snp_name_set:
+            self.isogg_counts_dict["duplicated_names"] += 1
             return True, False

-        if (position, mutation) in self.isoggOmitSet:
-            self.isoggCountsDict["omitted"] += 1
-            return True, False
-
-        try:
-            position = int(position)
-        except ValueError:
-            self.errAndLog("\nERROR. Invalid position: %s\n" % position)
+        if (position_str, mutation) in self.isogg_omit_set:
+            self.isogg_counts_dict["omitted"] += 1
             return True, False

         return False, True

-    def checkMultiAllelics(self):
-        "checks for mutliallelic variants and writes list to file"
-
-        if len(self.multiAllelicNewPosSet) > 0:
-            if not self.config.suppressOutputAndLog:
-                with open(self.config.multiAllelicFoundFN, "w") as outFile:
-                    for position in sorted(list(self.multiAllelicNewPosSet)):
-                        outFile.write("%8d\n" % position)
-
-            self.errAndLog(
-                "%s*** Dectected %d multiallelic positions. ***\n\n"
-                % (utils.DASHES, len(self.multiAllelicNewPosSet))
-                + "Please do the following and then re-run:\n"
-                + "    cat %s >> %s\n\n"
-                % (self.config.multiAllelicFoundFN, self.config.isoggMultiAllelicFN)
+    def check_multi_allelics(self) -> None:
+        """Check for multiallelic variants and write list to file."""
+
+        if not self.config.suppress_output and len(self.multi_allelic_new_pos_set) > 0:
+            with open(self.config.multi_allelic_found_fp, "w") as out_file:
+                for position in sorted(list(self.multi_allelic_new_pos_set)):
+                    out_file.write(f"{position:8d}\n")
+
+            num_multiallelic = len(self.multi_allelic_new_pos_set)
+            logger.info(
+                f"\n*** Detected {num_multiallelic} multiallelic positions. ***\n\n"
+                "Please do the following and then re-run:\n"
+                f"    cat {self.config.multi_allelic_found_fp}"
+                f" >> {self.config.isogg_multi_allelic_data_file.filename}\n\n"
             )
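
Note: `check_isogg_record` reports problems through its two return values
rather than raising, so the parsing loop can keep counting and move on. A
sketch of how a caller might consume the pair; the loop, `isogg_records`, and
`dropped_names` below are assumptions for illustration, not the actual
parsing code:

    for name, haplogroup, position_str, mutation in isogg_records:
        record_is_bad, name_ok_to_represent = tree.check_isogg_record(
            name, haplogroup, position_str, mutation
        )
        if record_is_bad:
            if name_ok_to_represent:
                # Candidate fallback label for the node's hg_snp representation
                dropped_names.append((name, haplogroup))
            continue
        tree.construct_snp(name, haplogroup, int(position_str), mutation)
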
-    def findOrCreateNode(self, haplogroup):
-        """
-        given a haplogroup, returns corresponding node if it exists.
-        if not, first serially split most recent ancestor that does exists
-        until there is a place for it.
+    def find_or_create_node(self, haplogroup: str) -> "node_module.Node":
+        """Return Node corresponding to a haplogroup, if it exists.
+
+        If no Node corresponds to the haplogroup, serially split the
+        most recent ancestor that does exist until there is a place for a new Node.
+
-        Note: the loop has to start at 1, because myString[:0] --> ''
         """
+        node = None
+        if haplogroup in self.haplogroup_to_node:
+            node = self.haplogroup_to_node[haplogroup]
+        else:
+            for num_chars_to_chop in range(1, len(haplogroup)):
+                ancestor_string = haplogroup[:-num_chars_to_chop]
+                if ancestor_string in self.haplogroup_to_node:
+                    ancestor = self.haplogroup_to_node[ancestor_string]
+                    node = ancestor.serial_split(haplogroup)
+                    break
+
+        if node is None:
+            raise ValueError(f"Unplaceable haplogroup: {haplogroup}")

-        if haplogroup in self.hg2nodeDict:
-            return self.hg2nodeDict[haplogroup]
+        return node

-        for numCharsToChop in range(1, len(haplogroup)):
-            ancestorString = haplogroup[:-numCharsToChop]
-            if ancestorString in self.hg2nodeDict:
-                ancestor = self.hg2nodeDict[ancestorString]
-                return ancestor.serialSplit(haplogroup)
-
-        self.errAndLog("Unplaceable haplogroup: %s" % haplogroup)
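
Note: the ancestor search in `find_or_create_node` chops one trailing
character at a time (`haplogroup[:-num_chars_to_chop]`), so "E1b1a1" is tried
as "E1b1a", then "E1b1", and so on until a known haplogroup is hit. The range
starts at 1 because `haplogroup[:0]` would yield the empty string, which is
the point of the removed "Note" comment above. A standalone sketch of just
the lookup, with a toy mapping in place of the real tree:

    haplogroup_to_node = {"E": "node_E", "E1b": "node_E1b"}  # Toy mapping
    haplogroup = "E1b1a1"

    ancestor = None
    for num_chars_to_chop in range(1, len(haplogroup)):
        ancestor_string = haplogroup[:-num_chars_to_chop]
        if ancestor_string in haplogroup_to_node:
            ancestor = haplogroup_to_node[ancestor_string]
            break

    assert ancestor == "node_E1b"  # "E1b1a" and "E1b1" miss; "E1b" hits
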
+
+
+def verify_newick_token(observed: str, expected: str) -> None:
+    """Raise ValueError if observed and expected strings do not match."""
+
+    if observed != expected:
+        raise ValueError(
+            "Malformed newick file.\n"
+            f"Expected this token: {expected}\n"
+            f"Got this one: {observed}\n"
+        )
diff --git a/yhaplo/utils.py b/yhaplo/utils.py
deleted file mode 100644
index e1cfd46..0000000
--- a/yhaplo/utils.py
+++ /dev/null
@@ -1,158 +0,0 @@
-# David Poznik
-# 2016.01.05
-# utils.py
-#
-# Defines utility functions and non-application-specific global constants.
-# ----------------------------------------------------------------------
-from __future__ import absolute_import, print_function
-
-import argparse
-import csv
-import errno
-import gzip
-import logging
-import os
-import re
-import sys
-
-# ----------------------------------------------------------------------
-# constants
-
-DASHES = "-" * 72 + "\n"
-
-type2fmtDict = {bool: "%i", int: "%i", str: "%s", float: "%f"}
-
-
-# ----------------------------------------------------------------------
-# utility functions
-
-
-def basenameNoEnding(fn, ending):
-    "returns the basename of a file and removes the supplied ending"
-
-    return os.path.basename(fn)[: (1 - len(ending))]
-
-
-def checkFileExistence(fn, fileDescription=None):
-    "exits if file does not exist"
-
-    if fileDescription:
-        message = "%s file not found" % fileDescription
-    else:
-        message = "File not found"
-
-    if not os.path.isfile(fn):
-        sys.exit("\nERROR. %s: %s\n" % (message, fn))
-
-
-def closeFiles(fileList):
-    "closes files from list, ignoring any that are set to None"
-
-    for File in fileList:
-        if File:
-            File.close()
-
-
-def compressWhitespace(myString):
-    "replaces whitespace with a single space"
-
-    return re.sub(r"\s+", " ", myString)
-
-
-def getCSVreader(inFN, delimiter="\t"):
-    "opens a (possibly gzipped) file and creates a csv reader"
-
-    extension = os.path.splitext(inFN)[1]
-    if extension == ".gz":
-        try:
-            inFile = gzip.open(inFN, "rt")
-        except IOError:
-            sys.exit("\nERROR. Could not open: %s\n" % inFN)
-    else:
-        try:
-            inFile = open(inFN, "r")
-        except IOError:
-            sys.exit("\nERROR. Could not open: %s\n" % inFN)
-
-    return inFile, csv.reader(inFile, delimiter=delimiter)
-
-
-def mkdirP(dirName):
-    "makes a directory"
-
-    try:
-        os.makedirs(dirName)
-    except OSError as exc:
-        if exc.errno == errno.EEXIST and os.path.isdir(dirName):
-            pass
-        else:
-            raise
-
-
-def object2fmt(x):
-    "returns a printf style format string appropriate to the object"
-
-    return type2fmtDict[type(x)]
-
-
-def printAndLogger(message):
-    "output a message to stdout and to the logger"
-
-    print(message)
-    logging.info(message)
-
-
-def printIterable(myIterable):
-    "cycles through an iterable, printing each item"
-
-    for item in myIterable:
-        print(item)
-
-
-def readPositionsSet(inFN, column=0, logFunction=None):
-    "reads positions from the specified column of a file and constructs a set"
-
-    positionsSet = set()
-    checkFileExistence(inFN, "SNP positions")
-    with open(inFN, "r") as inFile:
-        for line in inFile:
-            pos = int(line.strip().split()[column])
-            positionsSet.add(pos)
-
-    message = "%5d unique positions read: %s\n" % (len(positionsSet), inFN)
-    if logFunction:
-        logFunction(message)
-    else:
-        sys.stderr.write(message)
-    return positionsSet
-
-
-def unimplementedMessage(methodName):
-    "emits message and exits"
-
-    sys.exit("\n\n! Unimplemented method: %s\nExiting.\n" % methodName)
-
-
-# ----------------------------------------------------------------------
-# command-line arguments
-
-
-class RawTextWithDefaultsHelpFormatter(argparse.RawDescriptionHelpFormatter):
-    """
-    argparse help message formatter which:
-    - retains help text formatting
-    - adds default values to argument help
-    combines argparse.RawTextHelpFormatter and argparse.ArgumentDefaultsHelpFormatter
-    """
-
-    def _split_lines(self, text, _):
-        return text.splitlines()
-
-    def _get_help_string(self, action):
-        help_message = action.help
-        if "%(default)" not in action.help:
-            if action.default is not argparse.SUPPRESS:
-                defaulting_nargs = [argparse.OPTIONAL, argparse.ZERO_OR_MORE]
-                if action.option_strings or action.nargs in defaulting_nargs:
-                    help_message += "\n(default: %(default)s)"
-        return help_message
diff --git a/yhaplo/utils/__init__.py b/yhaplo/utils/__init__.py
new file mode 100644
index 0000000..f5d3dc7
--- /dev/null
+++ b/yhaplo/utils/__init__.py
@@ -0,0 +1 @@
+"""Utility functions."""
diff --git a/yhaplo/utils/loaders.py b/yhaplo/utils/loaders.py
new file mode 100644
index 0000000..feb5b8a
--- /dev/null
+++ b/yhaplo/utils/loaders.py
@@ -0,0 +1,76 @@
+"""Data loaders."""
+
+import importlib.resources
+import logging
+from typing import NamedTuple
+
+DATA_SUBPACKAGE = __package__.replace("utils", "data")
+
+logger = logging.getLogger(__name__)
+
+
+class DataFile(NamedTuple):
+
+    """Attributes of a yhaplo data file."""
+
+    data_subdir: str
+    filename: str
+    description: str = "Data"
+    ttam_only: bool = False
+
+
+class TtamFileNotFoundError(FileNotFoundError):
+
+    """Exception indicating that an unfound data file is not publicly available."""
+
+    def __init__(
+        self,
+        data_file: DataFile,
+        package: str,
+    ):
+        super(TtamFileNotFoundError, self).__init__(
+            f'Failed to load "{data_file.filename}" from {package}.\n'
+            f"{data_file.description} file only available internally at 23andMe.\n"
+        )
+
+
+def load_data_lines(
+    data_file: DataFile,
+    log: bool = False,
+) -> list[str]:
+    """Load yhaplo data file and split into lines."""
+
+    lines = load_data(data_file, log=log).strip().split("\n")
+    return lines
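
Note: `DataFile` plus `load_data_lines` replace the deleted file-handling
helpers from `utils.py` with `importlib.resources` lookups against the data
subpackage. A hedged usage sketch; the subdirectory and filename below are
illustrative, not actual yhaplo data files:

    from yhaplo.utils.loaders import DataFile, load_data_lines

    example_data_file = DataFile(
        data_subdir="variants",        # Hypothetical subdirectory
        filename="example.snps.txt",   # Hypothetical filename
        description="Example SNP data",
    )
    lines = load_data_lines(example_data_file, log=True)  # Raises if not packaged
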
+
+
+def load_data(
+    data_file: DataFile,
+    log: bool = False,
+) -> str:
+    """Load yhaplo data file.
+
+    Raises
+    ------
+    TtamFileNotFoundError
+        When failing to load a non-public data file.
+
+    """
+    package = f"{DATA_SUBPACKAGE}.{data_file.data_subdir}"
+
+    try:
+        data = (
+            importlib.resources.files(package).joinpath(data_file.filename).read_text()
+        )
+    except (FileNotFoundError, ModuleNotFoundError):
+        if data_file.ttam_only:
+            raise TtamFileNotFoundError(data_file, package)
+        else:
+            raise
+
+    if log:
+        logger.info(
+            f"Loaded {data_file.description}:\n  {package}: {data_file.filename}\n"
+        )
+
+    return data
diff --git a/yhaplo/utils/optional_dependencies.py b/yhaplo/utils/optional_dependencies.py
new file mode 100644
index 0000000..0b6882e
--- /dev/null
+++ b/yhaplo/utils/optional_dependencies.py
@@ -0,0 +1,35 @@
+"""Functions for checking whether optional dependencies are available."""
+
+ROOT_PACKAGE = __package__.removesuffix(".utils")
+
+
+def check_vcf_dependencies():
+    """Check that "vcf" dependencies are available."""
+
+    try:
+        from pysam import VariantFile  # noqa F401
+    except ImportError as error:
+        error.msg = error.msg + optional_import_error_message(
+            "Pysam",
+            "process VCF/BCF input",
+            "vcf",
+        )
+        raise error
+
+
+def optional_import_error_message(
+    package_name: str,
+    package_use: str,
+    optional_dep_category: str,
+    root_package: str = ROOT_PACKAGE,
+) -> str:
+    """Construct message indicating that an optional dependency is required."""
+
+    message = (
+        f"\n\n{package_name} is required to {package_use}.\n"
+        f'Please re-install yhaplo with the "{optional_dep_category}" '
+        "optional dependencies.\n"
+        "For example:\n"
+        f"    pip install {root_package}[{optional_dep_category}]\n"
+    )
+    return message
diff --git a/yhaplo/utils/vcf.py b/yhaplo/utils/vcf.py
new file mode 100644
index 0000000..7c845e9
--- /dev/null
+++ b/yhaplo/utils/vcf.py
@@ -0,0 +1,33 @@
+"""VCF utilities."""
+
+import os
+
+
+def check_vcf_index(vcf_fp: str) -> None:
+    """Check that VCF or BCF index file is present.
+
+    Raises
+    ------
+    FileNotFoundError
+        If the VCF/BCF file or its corresponding index file is not present.
+    ValueError
+        If the file path does not have a .vcf.gz or .bcf extension.
+
+    """
+    if not os.path.isfile(vcf_fp):
+        raise FileNotFoundError(f"VCF/BCF file not found: {vcf_fp}")
+
+    if vcf_fp.endswith(".vcf.gz"):
+        index_fp = vcf_fp.replace(".vcf.gz", ".vcf.gz.tbi")
+        if not os.path.isfile(index_fp):
+            raise FileNotFoundError(f"VCF index file not found: {index_fp}")
+
+    elif vcf_fp.endswith(".bcf"):
+        index_fp = vcf_fp.replace(".bcf", ".bcf.csi")
+        if not os.path.isfile(index_fp):
+            raise FileNotFoundError(f"BCF index file not found: {index_fp}")
+
+    else:
+        raise ValueError(
+            f"VCF/BCF file path does not have .vcf.gz or .bcf extension: {vcf_fp}"
+        )
diff --git a/yhaplo/version.txt b/yhaplo/version.txt
deleted file mode 100644
index 45a1b3f..0000000
--- a/yhaplo/version.txt
+++ /dev/null
@@ -1 +0,0 @@
-1.1.2
diff --git a/yhaplo_manual.pdf b/yhaplo_manual.pdf
new file mode 100644
index 0000000..1e5b50f
Binary files /dev/null and b/yhaplo_manual.pdf differ
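
Note: the two new utility modules are meant to run as pre-flight checks
before Pysam is touched: `check_vcf_dependencies` fails fast with an install
hint when Pysam is absent, and `check_vcf_index` validates the file/index
pair. A sketch of the intended sequence; the file path is illustrative:

    from yhaplo.utils.optional_dependencies import check_vcf_dependencies
    from yhaplo.utils.vcf import check_vcf_index

    vcf_fp = "1000Y.subset.bcf"  # Illustrative; expects 1000Y.subset.bcf.csi

    check_vcf_dependencies()  # ImportError with pip-install hint if no Pysam
    check_vcf_index(vcf_fp)   # FileNotFoundError / ValueError on bad input

    from pysam import VariantFile  # Safe to import now

    with VariantFile(vcf_fp) as variant_file:
        num_records = sum(1 for _ in variant_file)
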