diff --git a/.devcontainer/devcontainer.json b/.devcontainer/devcontainer.json new file mode 100644 index 00000000..4ecfbfe3 --- /dev/null +++ b/.devcontainer/devcontainer.json @@ -0,0 +1,28 @@ +{ + "name": "nfcore", + "image": "nfcore/gitpod:latest", + "remoteUser": "gitpod", + "runArgs": ["--privileged"], + + // Configure tool-specific properties. + "customizations": { + // Configure properties specific to VS Code. + "vscode": { + // Set *default* container specific settings.json values on container create. + "settings": { + "python.defaultInterpreterPath": "/opt/conda/bin/python", + "python.linting.enabled": true, + "python.linting.pylintEnabled": true, + "python.formatting.autopep8Path": "/opt/conda/bin/autopep8", + "python.formatting.yapfPath": "/opt/conda/bin/yapf", + "python.linting.flake8Path": "/opt/conda/bin/flake8", + "python.linting.pycodestylePath": "/opt/conda/bin/pycodestyle", + "python.linting.pydocstylePath": "/opt/conda/bin/pydocstyle", + "python.linting.pylintPath": "/opt/conda/bin/pylint" + }, + + // Add the IDs of extensions you want installed when the container is created. + "extensions": ["ms-python.python", "ms-python.vscode-pylance", "nf-core.nf-core-extensionpack"] + } + } +} diff --git a/.editorconfig b/.editorconfig index 95549501..9b990088 100644 --- a/.editorconfig +++ b/.editorconfig @@ -8,12 +8,9 @@ trim_trailing_whitespace = true indent_size = 4 indent_style = space -[*.{yml,yaml}] +[*.{md,yml,yaml,html,css,scss,js}] indent_size = 2 -[*.json] -insert_final_newline = unset - # These files are edited and tested upstream in nf-core/modules [/modules/nf-core/**] charset = unset @@ -25,3 +22,11 @@ indent_size = unset [/assets/email*] indent_size = unset + +# ignore Readme +[README.md] +indent_style = unset + +# ignore python +[*.{py}] +indent_style = unset diff --git a/.gitattributes b/.gitattributes index 7fe55006..7a2dabc2 100644 --- a/.gitattributes +++ b/.gitattributes @@ -1 +1,4 @@ *.config linguist-language=nextflow +*.nf.test linguist-language=nextflow +modules/nf-core/** linguist-generated +subworkflows/nf-core/** linguist-generated diff --git a/.github/CONTRIBUTING.md b/.github/CONTRIBUTING.md index ef0e8f8f..c53b7965 100644 --- a/.github/CONTRIBUTING.md +++ b/.github/CONTRIBUTING.md @@ -9,14 +9,15 @@ Please use the pre-filled template to save time. However, don't be put off by this template - other more general issues and suggestions are welcome! Contributions to the code are even more welcome ;) -> If you need help using or modifying nf-core/metatdenovo then the best place to ask is on the nf-core Slack [#metatdenovo](https://nfcore.slack.com/channels/metatdenovo) channel ([join our Slack here](https://nf-co.re/join/slack)). +:::info +If you need help using or modifying nf-core/metatdenovo then the best place to ask is on the nf-core Slack [#metatdenovo](https://nfcore.slack.com/channels/metatdenovo) channel ([join our Slack here](https://nf-co.re/join/slack)). +::: ## Contribution workflow If you'd like to write some code for nf-core/metatdenovo, the standard workflow is as follows: -1. Check that there isn't already an issue about your idea in the [nf-core/metatdenovo issues](https://github.com/nf-core/metatdenovo/issues) to avoid duplicating work - * If there isn't one already, please create one so that others know you're working on this +1. Check that there isn't already an issue about your idea in the [nf-core/metatdenovo issues](https://github.com/nf-core/metatdenovo/issues) to avoid duplicating work. 
If there isn't one already, please create one so that others know you're working on this 2. [Fork](https://help.github.com/en/github/getting-started-with-github/fork-a-repo) the [nf-core/metatdenovo repository](https://github.com/nf-core/metatdenovo) to your GitHub account 3. Make the necessary changes / additions within your forked repository following [Pipeline conventions](#pipeline-contribution-conventions) 4. Use `nf-core schema build` and add any new parameters to the pipeline JSON schema (requires [nf-core tools](https://github.com/nf-core/tools) >= 1.10). @@ -26,6 +27,9 @@ If you're not used to this workflow with git, you can start with some [docs from ## Tests +You can optionally test your changes by running the pipeline locally. Then it is recommended to use the `debug` profile to +receive warnings about process selectors and other debug info. Example: `nextflow run . -profile debug,test,docker --outdir `. + When you create a pull request with changes, [GitHub Actions](https://github.com/features/actions) will run automatic tests. Typically, pull-requests are only fully reviewed when these tests are passing, though of course we can help out before then. @@ -49,9 +53,9 @@ These tests are run both with the latest available version of `Nextflow` and als :warning: Only in the unlikely and regretful event of a release happening with a bug. -* On your own fork, make a new branch `patch` based on `upstream/master`. -* Fix the bug, and bump version (X.Y.Z+1). -* A PR should be made on `master` from patch to directly this particular bug. +- On your own fork, make a new branch `patch` based on `upstream/master`. +- Fix the bug, and bump version (X.Y.Z+1). +- A PR should be made on `master` from patch to directly this particular bug. ## Getting help @@ -68,16 +72,13 @@ If you wish to contribute a new step, please use the following coding standards: 1. Define the corresponding input channel into your new process from the expected previous process channel 2. Write the process block (see below). 3. Define the output channel if needed (see below). -4. Add any new flags/options to `nextflow.config` with a default (see below). -5. Add any new flags/options to `nextflow_schema.json` with help text (with `nf-core schema build`). -6. Add any new flags/options to the help message (for integer/text parameters, print to help the corresponding `nextflow.config` parameter). -7. Add sanity checks for all relevant parameters. -8. Add any new software to the `scrape_software_versions.py` script in `bin/` and the version command to the `scrape_software_versions` process in `main.nf`. -9. Do local tests that the new code works properly and as expected. -10. Add a new test command in `.github/workflow/ci.yml`. -11. If applicable add a [MultiQC](https://https://multiqc.info/) module. -12. Update MultiQC config `assets/multiqc_config.yaml` so relevant suffixes, name clean up, General Statistics Table column order, and module figures are in the right order. -13. Optional: Add any descriptions of MultiQC report sections and output files to `docs/output.md`. +4. Add any new parameters to `nextflow.config` with a default (see below). +5. Add any new parameters to `nextflow_schema.json` with help text (via the `nf-core schema build` tool). +6. Add sanity checks and validation for all relevant parameters. +7. Perform local tests to validate that the new code works as expected. +8. If applicable, add a new test command in `.github/workflow/ci.yml`. +9. 
Update MultiQC config `assets/multiqc_config.yml` so relevant suffixes, file name clean up and module plots are in the appropriate order. If applicable, add a [MultiQC](https://https://multiqc.info/) module. +10. Add a description of the output files and if relevant any appropriate images from the MultiQC report to `docs/output.md`. ### Default values @@ -95,34 +96,28 @@ The process resources can be passed on to the tool dynamically within the proces Please use the following naming schemes, to make it easy to understand what is going where. -* initial process channel: `ch_output_from_` -* intermediate and terminal channels: `ch__for_` +- initial process channel: `ch_output_from_` +- intermediate and terminal channels: `ch__for_` ### Nextflow version bumping If you are using a new feature from core Nextflow, you may bump the minimum required version of nextflow in the pipeline with: `nf-core bump-version --nextflow . [min-nf-version]` -### Software version reporting - -If you add a new tool to the pipeline, please ensure you add the information of the tool to the `get_software_version` process. - -Add to the script block of the process, something like the following: +### Images and figures -```bash - --version &> v_.txt 2>&1 || true -``` +For overview images and other documents we follow the nf-core [style guidelines and examples](https://nf-co.re/developers/design_guidelines). -or +## GitHub Codespaces -```bash - --help | head -n 1 &> v_.txt 2>&1 || true -``` +This repo includes a devcontainer configuration which will create a GitHub Codespaces for Nextflow development! This is an online developer environment that runs in your browser, complete with VSCode and a terminal. -You then need to edit the script `bin/scrape_software_versions.py` to: +To get started: -1. Add a Python regex for your tool's `--version` output (as in stored in the `v_.txt` file), to ensure the version is reported as a `v` and the version number e.g. `v2.1.1` -2. Add a HTML entry to the `OrderedDict` for formatting in MultiQC. +- Open the repo in [Codespaces](https://github.com/nf-core/metatdenovo/codespaces) +- Tools installed + - nf-core + - Nextflow -### Images and figures +Devcontainer specs: -For overview images and other documents we follow the nf-core [style guidelines and examples](https://nf-co.re/developers/design_guidelines). +- [DevContainer config](.devcontainer/devcontainer.json) diff --git a/.github/ISSUE_TEMPLATE/bug_report.md b/.github/ISSUE_TEMPLATE/bug_report.md deleted file mode 100644 index 3feffe2b..00000000 --- a/.github/ISSUE_TEMPLATE/bug_report.md +++ /dev/null @@ -1,63 +0,0 @@ ---- -name: Bug report -about: Report something that is broken or incorrect -labels: bug ---- - - - -## Check Documentation - -I have checked the following places for your error: - -- [ ] [nf-core website: troubleshooting](https://nf-co.re/usage/troubleshooting) -- [ ] [nf-core/metatdenovo pipeline documentation](https://nf-co.re/metatdenovo/usage) - -## Description of the bug - - - -## Steps to reproduce - -Steps to reproduce the behaviour: - -1. Command line: -2. 
See error: - -## Expected behaviour - - - -## Log files - -Have you provided the following extra information/files: - -- [ ] The command used to run the pipeline -- [ ] The `.nextflow.log` file - -## System - -- Hardware: -- Executor: -- OS: -- Version - -## Nextflow Installation - -- Version: - -## Container engine - -- Engine: -- version: - -## Additional context - - diff --git a/.github/ISSUE_TEMPLATE/bug_report.yml b/.github/ISSUE_TEMPLATE/bug_report.yml new file mode 100644 index 00000000..8c055936 --- /dev/null +++ b/.github/ISSUE_TEMPLATE/bug_report.yml @@ -0,0 +1,50 @@ +name: Bug report +description: Report something that is broken or incorrect +labels: bug +body: + - type: markdown + attributes: + value: | + Before you post this issue, please check the documentation: + + - [nf-core website: troubleshooting](https://nf-co.re/usage/troubleshooting) + - [nf-core/metatdenovo pipeline documentation](https://nf-co.re/metatdenovo/usage) + + - type: textarea + id: description + attributes: + label: Description of the bug + description: A clear and concise description of what the bug is. + validations: + required: true + + - type: textarea + id: command_used + attributes: + label: Command used and terminal output + description: Steps to reproduce the behaviour. Please paste the command you used to launch the pipeline and the output from your terminal. + render: console + placeholder: | + $ nextflow run ... + + Some output where something broke + + - type: textarea + id: files + attributes: + label: Relevant files + description: | + Please drag and drop the relevant files here. Create a `.zip` archive if the extension is not allowed. + Your verbose log file `.nextflow.log` is often useful _(this is a hidden file in the directory where you launched the pipeline)_ as well as custom Nextflow configuration files. + + - type: textarea + id: system + attributes: + label: System information + description: | + * Nextflow version _(eg. 23.04.0)_ + * Hardware _(eg. HPC, Desktop, Cloud)_ + * Executor _(eg. slurm, local, awsbatch)_ + * Container engine: _(e.g. Docker, Singularity, Conda, Podman, Shifter, Charliecloud, or Apptainer)_ + * OS _(eg. CentOS Linux, macOS, Linux Mint)_ + * Version of nf-core/metatdenovo _(eg. 1.1, 1.5, 1.8.2)_ diff --git a/.github/ISSUE_TEMPLATE/config.yml b/.github/ISSUE_TEMPLATE/config.yml index e9b6f73f..f2a952a8 100644 --- a/.github/ISSUE_TEMPLATE/config.yml +++ b/.github/ISSUE_TEMPLATE/config.yml @@ -1,4 +1,3 @@ -blank_issues_enabled: false contact_links: - name: Join nf-core url: https://nf-co.re/join diff --git a/.github/ISSUE_TEMPLATE/feature_request.md b/.github/ISSUE_TEMPLATE/feature_request.md deleted file mode 100644 index 8855d26f..00000000 --- a/.github/ISSUE_TEMPLATE/feature_request.md +++ /dev/null @@ -1,32 +0,0 @@ ---- -name: Feature request -about: Suggest an idea for the nf-core/metatdenovo pipeline -labels: enhancement ---- - - - -## Is your feature request related to a problem? 
Please describe - - - - - -## Describe the solution you'd like - - - -## Describe alternatives you've considered - - - -## Additional context - - diff --git a/.github/ISSUE_TEMPLATE/feature_request.yml b/.github/ISSUE_TEMPLATE/feature_request.yml new file mode 100644 index 00000000..d5a02558 --- /dev/null +++ b/.github/ISSUE_TEMPLATE/feature_request.yml @@ -0,0 +1,11 @@ +name: Feature request +description: Suggest an idea for the nf-core/metatdenovo pipeline +labels: enhancement +body: + - type: textarea + id: description + attributes: + label: Description of feature + description: Please describe your suggestion for a new feature. It might help to describe a problem or use case, plus any alternatives that you have considered. + validations: + required: true diff --git a/.github/PULL_REQUEST_TEMPLATE.md b/.github/PULL_REQUEST_TEMPLATE.md index 1214a01c..57b3bcb2 100644 --- a/.github/PULL_REQUEST_TEMPLATE.md +++ b/.github/PULL_REQUEST_TEMPLATE.md @@ -10,16 +10,16 @@ Remember that PRs should be made against the dev branch, unless you're preparing Learn more about contributing: [CONTRIBUTING.md](https://github.com/nf-core/metatdenovo/tree/master/.github/CONTRIBUTING.md) --> - ## PR checklist - [ ] This comment contains a description of changes (with reason). - [ ] If you've fixed a bug or added code that should be tested, add tests! - - [ ] If you've added a new tool - have you followed the pipeline conventions in the [contribution docs](https://github.com/nf-core/metatdenovo/tree/master/.github/CONTRIBUTING.md) - - [ ] If necessary, also make a PR on the nf-core/metatdenovo _branch_ on the [nf-core/test-datasets](https://github.com/nf-core/test-datasets) repository. +- [ ] If you've added a new tool - have you followed the pipeline conventions in the [contribution docs](https://github.com/nf-core/metatdenovo/tree/master/.github/CONTRIBUTING.md) +- [ ] If necessary, also make a PR on the nf-core/metatdenovo _branch_ on the [nf-core/test-datasets](https://github.com/nf-core/test-datasets) repository. - [ ] Make sure your code lints (`nf-core lint`). -- [ ] Ensure the test suite passes (`nextflow run . -profile test,docker`). +- [ ] Ensure the test suite passes (`nextflow run . -profile test,docker --outdir `). +- [ ] Check for unexpected warnings in debug mode (`nextflow run . -profile debug,test,docker --outdir `). - [ ] Usage Documentation in `docs/usage.md` is updated. - [ ] Output Documentation in `docs/output.md` is updated. - [ ] `CHANGELOG.md` is updated. 
diff --git a/.github/workflows/awsfulltest.yml b/.github/workflows/awsfulltest.yml index f27f724c..68eb9e78 100644 --- a/.github/workflows/awsfulltest.yml +++ b/.github/workflows/awsfulltest.yml @@ -14,21 +14,23 @@ jobs: runs-on: ubuntu-latest steps: - name: Launch workflow via tower - uses: nf-core/tower-action@master - # TODO nf-core: You can customise AWS full pipeline tests as required - # Add full size test data (but still relatively small datasets for few samples) - # on the `test_full.config` test runs with only one set of parameters - + uses: seqeralabs/action-tower-launch@v2 with: workspace_id: ${{ secrets.TOWER_WORKSPACE_ID }} - bearer_token: ${{ secrets.TOWER_BEARER_TOKEN }} + access_token: ${{ secrets.TOWER_ACCESS_TOKEN }} compute_env: ${{ secrets.TOWER_COMPUTE_ENV }} - pipeline: ${{ github.repository }} revision: ${{ github.sha }} workdir: s3://${{ secrets.AWS_S3_BUCKET }}/work/metatdenovo/work-${{ github.sha }} parameters: | { + "hook_url": "${{ secrets.MEGATESTS_ALERTS_SLACK_HOOK_URL }}", "outdir": "s3://${{ secrets.AWS_S3_BUCKET }}/metatdenovo/results-${{ github.sha }}" } - profiles: '[ "test_full", "aws_tower" ]' + profiles: test_full + - uses: actions/upload-artifact@v4 + with: + name: Tower debug log file + path: | + tower_action_*.log + tower_action_*.json diff --git a/.github/workflows/awstest.yml b/.github/workflows/awstest.yml index 52e953cf..9bc63325 100644 --- a/.github/workflows/awstest.yml +++ b/.github/workflows/awstest.yml @@ -10,19 +10,24 @@ jobs: if: github.repository == 'nf-core/metatdenovo' runs-on: ubuntu-latest steps: + # Launch workflow using Tower CLI tool action - name: Launch workflow via tower - uses: nf-core/tower-action@master - + uses: seqeralabs/action-tower-launch@v2 with: workspace_id: ${{ secrets.TOWER_WORKSPACE_ID }} - bearer_token: ${{ secrets.TOWER_BEARER_TOKEN }} + access_token: ${{ secrets.TOWER_ACCESS_TOKEN }} compute_env: ${{ secrets.TOWER_COMPUTE_ENV }} - pipeline: ${{ github.repository }} revision: ${{ github.sha }} workdir: s3://${{ secrets.AWS_S3_BUCKET }}/work/metatdenovo/work-${{ github.sha }} parameters: | { - "outdir": "s3://${{ secrets.AWS_S3_BUCKET }}/metatdenovo/results-${{ github.sha }}" + "outdir": "s3://${{ secrets.AWS_S3_BUCKET }}/metatdenovo/results-test-${{ github.sha }}" } - profiles: '[ "test", "aws_tower" ]' + profiles: test + - uses: actions/upload-artifact@v4 + with: + name: Tower debug log file + path: | + tower_action_*.log + tower_action_*.json diff --git a/.github/workflows/branch.yml b/.github/workflows/branch.yml index 993ef74f..614d385e 100644 --- a/.github/workflows/branch.yml +++ b/.github/workflows/branch.yml @@ -13,14 +13,13 @@ jobs: - name: Check PRs if: github.repository == 'nf-core/metatdenovo' run: | - { [[ ${{github.event.pull_request.head.repo.full_name }} == nf-core/metatdenovo ]] && [[ $GITHUB_HEAD_REF = "dev" ]]; } || [[ $GITHUB_HEAD_REF == "patch" ]] - + { [[ ${{github.event.pull_request.head.repo.full_name }} == nf-core/metatdenovo ]] && [[ $GITHUB_HEAD_REF == "dev" ]]; } || [[ $GITHUB_HEAD_REF == "patch" ]] # If the above check failed, post a comment on the PR explaining the failure # NOTE - this doesn't currently work if the PR is coming from a fork, due to limitations in GitHub actions secrets - name: Post PR comment if: failure() - uses: mshick/add-pr-comment@v1 + uses: mshick/add-pr-comment@v2 with: message: | ## This PR is against the `master` branch :x: @@ -43,4 +42,3 @@ jobs: Thanks again for your contribution! 
repo-token: ${{ secrets.GITHUB_TOKEN }} allow-repeats: false - diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index f2992382..1aac440d 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -8,36 +8,56 @@ on: release: types: [published] -# Uncomment if we need an edge release of Nextflow again -# env: NXF_EDGE: 1 +env: + NXF_ANSI_LOG: false + +concurrency: + group: "${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}" + cancel-in-progress: true jobs: test: - name: Run workflow tests + name: Run pipeline with test data + # Only run on push if this is the nf-core dev branch (merged PRs) + if: "${{ github.event_name != 'push' || (github.event_name == 'push' && github.repository == 'nf-core/metatdenovo') }}" + runs-on: ubuntu-latest + strategy: + matrix: + NXF_VER: + - "23.04.0" + - "latest-everything" + steps: + - name: Check out pipeline code + uses: actions/checkout@v4 + + - name: Install Nextflow + uses: nf-core/setup-nextflow@v1 + with: + version: "${{ matrix.NXF_VER }}" + + - name: Run pipeline with test data + run: | + nextflow run ${GITHUB_WORKSPACE} -profile test,docker --outdir ./results + + profiles: + name: Run workflow profile # Only run on push if this is the nf-core dev branch (merged PRs) if: ${{ github.event_name != 'push' || (github.event_name == 'push' && github.repository == 'nf-core/metatdenovo') }} runs-on: ubuntu-latest - env: - NXF_VER: ${{ matrix.nxf_ver }} - NXF_ANSI_LOG: false strategy: matrix: - # Nextflow versions: check pipeline minimum and current latest - nxf_ver: ['21.04.0', ''] + # Run remaining test profiles with minimum nextflow version + profile: [test, test_prokka, test_rnaspades, test_transdecoder] steps: - name: Check out pipeline code uses: actions/checkout@v2 - name: Install Nextflow - env: - CAPSULE_LOG: none run: | wget -qO- get.nextflow.io | bash sudo mv nextflow /usr/local/bin/ - - - name: Run pipeline with test data - # TODO nf-core: You can customise CI pipeline run tests as required - # For example: adding multiple test runs with different parameters - # Remember that you can parallelise this by using strategy.matrix + - name: Run pipeline with ${{ matrix.profile }} test profile run: | - nextflow run ${GITHUB_WORKSPACE} -profile test,docker + nextflow run ${GITHUB_WORKSPACE} -profile ${{ matrix.profile }},docker --outdir ./results + +# diff --git a/.github/workflows/clean-up.yml b/.github/workflows/clean-up.yml new file mode 100644 index 00000000..e37cfda5 --- /dev/null +++ b/.github/workflows/clean-up.yml @@ -0,0 +1,24 @@ +name: "Close user-tagged issues and PRs" +on: + schedule: + - cron: "0 0 * * 0" # Once a week + +jobs: + clean-up: + runs-on: ubuntu-latest + permissions: + issues: write + pull-requests: write + steps: + - uses: actions/stale@v9 + with: + stale-issue-message: "This issue has been tagged as awaiting-changes or awaiting-feedback by an nf-core contributor. Remove stale label or add a comment otherwise this issue will be closed in 20 days." + stale-pr-message: "This PR has been tagged as awaiting-changes or awaiting-feedback by an nf-core contributor. Remove stale label or add a comment if it is still useful." + close-issue-message: "This issue was closed because it has been tagged as awaiting-changes or awaiting-feedback by an nf-core contributor and then staled for 20 days with no activity." 
+ days-before-stale: 30 + days-before-close: 20 + days-before-pr-close: -1 + any-of-labels: "awaiting-changes,awaiting-feedback" + exempt-issue-labels: "WIP" + exempt-pr-labels: "WIP" + repo-token: "${{ secrets.GITHUB_TOKEN }}" diff --git a/.github/workflows/download_pipeline.yml b/.github/workflows/download_pipeline.yml new file mode 100644 index 00000000..8a330045 --- /dev/null +++ b/.github/workflows/download_pipeline.yml @@ -0,0 +1,67 @@ +name: Test successful pipeline download with 'nf-core download' + +# Run the workflow when: +# - dispatched manually +# - when a PR is opened or reopened to master branch +# - the head branch of the pull request is updated, i.e. if fixes for a release are pushed last minute to dev. +on: + workflow_dispatch: + pull_request: + types: + - opened + branches: + - master + pull_request_target: + branches: + - master + +env: + NXF_ANSI_LOG: false + +jobs: + download: + runs-on: ubuntu-latest + steps: + - name: Install Nextflow + uses: nf-core/setup-nextflow@v1 + + - uses: actions/setup-python@v5 + with: + python-version: "3.11" + architecture: "x64" + - uses: eWaterCycle/setup-singularity@v7 + with: + singularity-version: 3.8.3 + + - name: Install dependencies + run: | + python -m pip install --upgrade pip + pip install git+https://github.com/nf-core/tools.git@dev + + - name: Get the repository name and current branch set as environment variable + run: | + echo "REPO_LOWERCASE=${GITHUB_REPOSITORY,,}" >> ${GITHUB_ENV} + echo "REPOTITLE_LOWERCASE=$(basename ${GITHUB_REPOSITORY,,})" >> ${GITHUB_ENV} + echo "REPO_BRANCH=${GITHUB_REF#refs/heads/}" >> ${GITHUB_ENV} + + - name: Download the pipeline + env: + NXF_SINGULARITY_CACHEDIR: ./ + run: | + nf-core download ${{ env.REPO_LOWERCASE }} \ + --revision ${{ env.REPO_BRANCH }} \ + --outdir ./${{ env.REPOTITLE_LOWERCASE }} \ + --compress "none" \ + --container-system 'singularity' \ + --container-library "quay.io" -l "docker.io" -l "ghcr.io" \ + --container-cache-utilisation 'amend' \ + --download-configuration + + - name: Inspect download + run: tree ./${{ env.REPOTITLE_LOWERCASE }} + + - name: Run the downloaded pipeline + env: + NXF_SINGULARITY_CACHEDIR: ./ + NXF_SINGULARITY_HOME_MOUNT: true + run: nextflow run ./${{ env.REPOTITLE_LOWERCASE }}/$( sed 's/\W/_/g' <<< ${{ env.REPO_BRANCH }}) -stub -profile test,singularity --outdir ./results diff --git a/.github/workflows/fix-linting.yml b/.github/workflows/fix-linting.yml new file mode 100644 index 00000000..cc7770bc --- /dev/null +++ b/.github/workflows/fix-linting.yml @@ -0,0 +1,89 @@ +name: Fix linting from a comment +on: + issue_comment: + types: [created] + +jobs: + fix-linting: + # Only run if comment is on a PR with the main repo, and if it contains the magic keywords + if: > + contains(github.event.comment.html_url, '/pull/') && + contains(github.event.comment.body, '@nf-core-bot fix linting') && + github.repository == 'nf-core/metatdenovo' + runs-on: ubuntu-latest + steps: + # Use the @nf-core-bot token to check out so we can push later + - uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # v4 + with: + token: ${{ secrets.nf_core_bot_auth_token }} + + # indication that the linting is being fixed + - name: React on comment + uses: peter-evans/create-or-update-comment@71345be0265236311c031f5c7866368bd1eff043 # v4 + with: + comment-id: ${{ github.event.comment.id }} + reactions: eyes + + # Action runs on the issue comment, so we don't get the PR by default + # Use the gh cli to check out the PR + - name: Checkout Pull Request + run: gh pr 
checkout ${{ github.event.issue.number }} + env: + GITHUB_TOKEN: ${{ secrets.nf_core_bot_auth_token }} + + # Install and run pre-commit + - uses: actions/setup-python@0a5c61591373683505ea898e09a3ea4f39ef2b9c # v5 + with: + python-version: 3.11 + + - name: Install pre-commit + run: pip install pre-commit + + - name: Run pre-commit + id: pre-commit + run: pre-commit run --all-files + continue-on-error: true + + # indication that the linting has finished + - name: react if linting finished succesfully + if: steps.pre-commit.outcome == 'success' + uses: peter-evans/create-or-update-comment@71345be0265236311c031f5c7866368bd1eff043 # v4 + with: + comment-id: ${{ github.event.comment.id }} + reactions: "+1" + + - name: Commit & push changes + id: commit-and-push + if: steps.pre-commit.outcome == 'failure' + run: | + git config user.email "core@nf-co.re" + git config user.name "nf-core-bot" + git config push.default upstream + git add . + git status + git commit -m "[automated] Fix code linting" + git push + + - name: react if linting errors were fixed + id: react-if-fixed + if: steps.commit-and-push.outcome == 'success' + uses: peter-evans/create-or-update-comment@71345be0265236311c031f5c7866368bd1eff043 # v4 + with: + comment-id: ${{ github.event.comment.id }} + reactions: hooray + + - name: react if linting errors were not fixed + if: steps.commit-and-push.outcome == 'failure' + uses: peter-evans/create-or-update-comment@71345be0265236311c031f5c7866368bd1eff043 # v4 + with: + comment-id: ${{ github.event.comment.id }} + reactions: confused + + - name: react if linting errors were not fixed + if: steps.commit-and-push.outcome == 'failure' + uses: peter-evans/create-or-update-comment@71345be0265236311c031f5c7866368bd1eff043 # v4 + with: + issue-number: ${{ github.event.issue.number }} + body: | + @${{ github.actor }} I tried to fix the linting errors, but it didn't work. Please fix them manually. + See [CI log](https://github.com/nf-core/metatdenovo/actions/runs/${{ github.run_id }}) for more details. diff --git a/.github/workflows/linting.yml b/.github/workflows/linting.yml index 3b448773..81cd098e 100644 --- a/.github/workflows/linting.yml +++ b/.github/workflows/linting.yml @@ -1,121 +1,46 @@ name: nf-core linting # This workflow is triggered on pushes and PRs to the repository. -# It runs the `nf-core lint` and markdown lint tests to ensure that the code meets the nf-core guidelines +# It runs the `nf-core lint` and markdown lint tests to ensure +# that the code meets the nf-core guidelines. on: push: + branches: + - dev pull_request: release: types: [published] jobs: - Markdown: + pre-commit: runs-on: ubuntu-latest steps: - - uses: actions/checkout@v2 - - uses: actions/setup-node@v1 - with: - node-version: '10' - - name: Install markdownlint - run: npm install -g markdownlint-cli - - name: Run Markdownlint - run: markdownlint . - - # If the above check failed, post a comment on the PR explaining the failure - - name: Post PR comment - if: failure() - uses: mshick/add-pr-comment@v1 - with: - message: | - ## Markdown linting is failing - - To keep the code consistent with lots of contributors, we run automated code consistency checks. - To fix this CI test, please run: - - * Install `markdownlint-cli` - * On Mac: `brew install markdownlint-cli` - * Everything else: [Install `npm`](https://www.npmjs.com/get-npm) then [install `markdownlint-cli`](https://www.npmjs.com/package/markdownlint-cli) (`npm install -g markdownlint-cli`) - * Fix the markdown errors - * Automatically: `markdownlint . 
--fix` - * Manually resolve anything left from `markdownlint .` - - Once you push these changes the test should pass, and you can hide this comment :+1: - - We highly recommend setting up markdownlint in your code editor so that this formatting is done automatically on save. Ask about it on Slack for help! - - Thanks again for your contribution! - repo-token: ${{ secrets.GITHUB_TOKEN }} - allow-repeats: false - - EditorConfig: - runs-on: ubuntu-latest - steps: - - uses: actions/checkout@v2 - - - uses: actions/setup-node@v1 - with: - node-version: '10' + - uses: actions/checkout@v4 - - name: Install editorconfig-checker - run: npm install -g editorconfig-checker - - - name: Run ECLint check - run: editorconfig-checker -exclude README.md $(git ls-files | grep -v test) - - YAML: - runs-on: ubuntu-latest - steps: - - uses: actions/checkout@v1 - - uses: actions/setup-node@v1 + - name: Set up Python 3.11 + uses: actions/setup-python@v5 with: - node-version: '10' - - name: Install yaml-lint - run: npm install -g yaml-lint - - name: Run yaml-lint - run: yamllint $(find ${GITHUB_WORKSPACE} -type f -name "*.yml" -o -name "*.yaml") - - # If the above check failed, post a comment on the PR explaining the failure - - name: Post PR comment - if: failure() - uses: mshick/add-pr-comment@v1 - with: - message: | - ## YAML linting is failing - - To keep the code consistent with lots of contributors, we run automated code consistency checks. - To fix this CI test, please run: + python-version: 3.11 + cache: "pip" - * Install `yaml-lint` - * [Install `npm`](https://www.npmjs.com/get-npm) then [install `yaml-lint`](https://www.npmjs.com/package/yaml-lint) (`npm install -g yaml-lint`) - * Fix the markdown errors - * Run the test locally: `yamllint $(find . -type f -name "*.yml" -o -name "*.yaml")` - * Fix any reported errors in your YAML files + - name: Install pre-commit + run: pip install pre-commit - Once you push these changes the test should pass, and you can hide this comment :+1: - - We highly recommend setting up yaml-lint in your code editor so that this formatting is done automatically on save. Ask about it on Slack for help! - - Thanks again for your contribution! 
- repo-token: ${{ secrets.GITHUB_TOKEN }} - allow-repeats: false + - name: Run pre-commit + run: pre-commit run --all-files nf-core: runs-on: ubuntu-latest steps: - - name: Check out pipeline code - uses: actions/checkout@v2 + uses: actions/checkout@v4 - name: Install Nextflow - env: - CAPSULE_LOG: none - run: | - wget -qO- get.nextflow.io | bash - sudo mv nextflow /usr/local/bin/ + uses: nf-core/setup-nextflow@v1 - - uses: actions/setup-python@v1 + - uses: actions/setup-python@v5 with: - python-version: '3.6' - architecture: 'x64' + python-version: "3.11" + architecture: "x64" - name: Install dependencies run: | @@ -135,11 +60,10 @@ jobs: - name: Upload linting log file artifact if: ${{ always() }} - uses: actions/upload-artifact@v2 + uses: actions/upload-artifact@v4 with: name: linting-logs path: | lint_log.txt lint_results.md PR_number.txt - diff --git a/.github/workflows/linting_comment.yml b/.github/workflows/linting_comment.yml index 90f03c6f..147bcd10 100644 --- a/.github/workflows/linting_comment.yml +++ b/.github/workflows/linting_comment.yml @@ -1,4 +1,3 @@ - name: nf-core linting comment # This workflow is triggered after the linting action is complete # It posts an automated comment to the PR, even if the PR is coming from a fork @@ -12,13 +11,14 @@ jobs: runs-on: ubuntu-latest steps: - name: Download lint results - uses: dawidd6/action-download-artifact@v2 + uses: dawidd6/action-download-artifact@v3 with: workflow: linting.yml + workflow_conclusion: completed - name: Get PR number id: pr_number - run: echo "::set-output name=pr_number::$(cat linting-logs/PR_number.txt)" + run: echo "pr_number=$(cat linting-logs/PR_number.txt)" >> $GITHUB_OUTPUT - name: Post PR comment uses: marocchino/sticky-pull-request-comment@v2 @@ -26,4 +26,3 @@ jobs: GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} number: ${{ steps.pr_number.outputs.pr_number }} path: linting-logs/lint_results.md - diff --git a/.github/workflows/release-announcements.yml b/.github/workflows/release-announcements.yml new file mode 100644 index 00000000..21ac3f06 --- /dev/null +++ b/.github/workflows/release-announcements.yml @@ -0,0 +1,68 @@ +name: release-announcements +# Automatic release toot and tweet anouncements +on: + release: + types: [published] + workflow_dispatch: + +jobs: + toot: + runs-on: ubuntu-latest + steps: + - uses: rzr/fediverse-action@master + with: + access-token: ${{ secrets.MASTODON_ACCESS_TOKEN }} + host: "mstdn.science" # custom host if not "mastodon.social" (default) + # GitHub event payload + # https://docs.github.com/en/developers/webhooks-and-events/webhooks/webhook-events-and-payloads#release + message: | + Pipeline release! ${{ github.repository }} v${{ github.event.release.tag_name }} - ${{ github.event.release.name }}! + + Please see the changelog: ${{ github.event.release.html_url }} + + send-tweet: + runs-on: ubuntu-latest + + steps: + - uses: actions/setup-python@v5 + with: + python-version: "3.10" + - name: Install dependencies + run: pip install tweepy==4.14.0 + - name: Send tweet + shell: python + run: | + import os + import tweepy + + client = tweepy.Client( + access_token=os.getenv("TWITTER_ACCESS_TOKEN"), + access_token_secret=os.getenv("TWITTER_ACCESS_TOKEN_SECRET"), + consumer_key=os.getenv("TWITTER_CONSUMER_KEY"), + consumer_secret=os.getenv("TWITTER_CONSUMER_SECRET"), + ) + tweet = os.getenv("TWEET") + client.create_tweet(text=tweet) + env: + TWEET: | + Pipeline release! ${{ github.repository }} v${{ github.event.release.tag_name }} - ${{ github.event.release.name }}! 
+ + Please see the changelog: ${{ github.event.release.html_url }} + TWITTER_CONSUMER_KEY: ${{ secrets.TWITTER_CONSUMER_KEY }} + TWITTER_CONSUMER_SECRET: ${{ secrets.TWITTER_CONSUMER_SECRET }} + TWITTER_ACCESS_TOKEN: ${{ secrets.TWITTER_ACCESS_TOKEN }} + TWITTER_ACCESS_TOKEN_SECRET: ${{ secrets.TWITTER_ACCESS_TOKEN_SECRET }} + + bsky-post: + runs-on: ubuntu-latest + steps: + - uses: zentered/bluesky-post-action@v0.1.0 + with: + post: | + Pipeline release! ${{ github.repository }} v${{ github.event.release.tag_name }} - ${{ github.event.release.name }}! + + Please see the changelog: ${{ github.event.release.html_url }} + env: + BSKY_IDENTIFIER: ${{ secrets.BSKY_IDENTIFIER }} + BSKY_PASSWORD: ${{ secrets.BSKY_PASSWORD }} + # diff --git a/.gitignore b/.gitignore index 5124c9ac..bed1df98 100644 --- a/.gitignore +++ b/.gitignore @@ -6,3 +6,9 @@ results/ testing/ testing* *.pyc +.*.sw? +.screenrc +eggnog +kofam +kofamscan +eukulele diff --git a/.gitpod.yml b/.gitpod.yml new file mode 100644 index 00000000..363d5b1d --- /dev/null +++ b/.gitpod.yml @@ -0,0 +1,22 @@ +image: nfcore/gitpod:latest +tasks: + - name: Update Nextflow and setup pre-commit + command: | + pre-commit install --install-hooks + nextflow self-update + - name: unset JAVA_TOOL_OPTIONS + command: | + unset JAVA_TOOL_OPTIONS + +vscode: + extensions: # based on nf-core.nf-core-extensionpack + - codezombiech.gitignore # Language support for .gitignore files + # - cssho.vscode-svgviewer # SVG viewer + - esbenp.prettier-vscode # Markdown/CommonMark linting and style checking for Visual Studio Code + - eamodio.gitlens # Quickly glimpse into whom, why, and when a line or code block was changed + - EditorConfig.EditorConfig # override user/workspace settings with settings found in .editorconfig files + - Gruntfuggly.todo-tree # Display TODO and FIXME in a tree view in the activity bar + - mechatroner.rainbow-csv # Highlight columns in csv files in different colors + # - nextflow.nextflow # Nextflow syntax highlighting + - oderwat.indent-rainbow # Highlight indentation level + - streetsidesoftware.code-spell-checker # Spelling checker for source code diff --git a/.markdownlint.yml b/.markdownlint.yml deleted file mode 100644 index 9e605fcf..00000000 --- a/.markdownlint.yml +++ /dev/null @@ -1,14 +0,0 @@ -# Markdownlint configuration file -default: true -line-length: false -ul-indent: - indent: 4 -no-duplicate-header: - siblings_only: true -no-inline-html: - allowed_elements: - - img - - p - - kbd - - details - - summary diff --git a/.nf-core.yml b/.nf-core.yml new file mode 100644 index 00000000..3805dc81 --- /dev/null +++ b/.nf-core.yml @@ -0,0 +1 @@ +repository_type: pipeline diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml new file mode 100644 index 00000000..af57081f --- /dev/null +++ b/.pre-commit-config.yaml @@ -0,0 +1,10 @@ +repos: + - repo: https://github.com/pre-commit/mirrors-prettier + rev: "v3.1.0" + hooks: + - id: prettier + - repo: https://github.com/editorconfig-checker/editorconfig-checker.python + rev: "2.7.3" + hooks: + - id: editorconfig-checker + alias: ec diff --git a/.prettierignore b/.prettierignore new file mode 100644 index 00000000..437d763d --- /dev/null +++ b/.prettierignore @@ -0,0 +1,12 @@ +email_template.html +adaptivecard.json +slackreport.json +.nextflow* +work/ +data/ +results/ +.DS_Store +testing/ +testing* +*.pyc +bin/ diff --git a/.prettierrc.yml b/.prettierrc.yml new file mode 100644 index 00000000..c81f9a76 --- /dev/null +++ b/.prettierrc.yml @@ -0,0 +1 @@ +printWidth: 120 diff --git 
a/CHANGELOG.md b/CHANGELOG.md index 89787aea..e6e05ffe 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -3,7 +3,7 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/) and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). -## v1.0dev - [date] +## v1.0.0 - [date] Initial release of nf-core/metatdenovo, created with the [nf-core](https://nf-co.re/) template. diff --git a/CITATIONS.md b/CITATIONS.md index 05f2fecd..4722acbb 100644 --- a/CITATIONS.md +++ b/CITATIONS.md @@ -10,23 +10,94 @@ ## Pipeline tools -* [FastQC](https://www.bioinformatics.babraham.ac.uk/projects/fastqc/) +- [FastQC](https://www.bioinformatics.babraham.ac.uk/projects/fastqc/) -* [MultiQC](https://pubmed.ncbi.nlm.nih.gov/27312411/) - > Ewels P, Magnusson M, Lundin S, Käller M. MultiQC: summarize analysis results for multiple tools and samples in a single report. Bioinformatics. 2016 Oct 1;32(19):3047-8. doi: 10.1093/bioinformatics/btw354. Epub 2016 Jun 16. PubMed PMID: 27312411; PubMed Central PMCID: PMC5039924. + > Andrews, S. (2010). FastQC: A Quality Control Tool for High Throughput Sequence Data [Online]. + +- [MultiQC](https://pubmed.ncbi.nlm.nih.gov/27312411/) + + > Ewels P, Magnusson M, Lundin S, Käller M. MultiQC: summarize analysis results for multiple tools and samples in a single report. Bioinformatics. 2016 Oct 1;32(19):3047-8. doi: 10.1093/bioinformatics/btw354. Epub 2016 Jun 16. PubMed PMID: 27312411; PubMed Central PMCID: PMC5039924. + +- [Trim Galore!](https://www.bioinformatics.babraham.ac.uk/projects/trim_galore/) + +- [khmer](https://github.com/dib-lab/khmer) + + > Brown, C. Titus, Adina Howe, Qingpeng Zhang, Alexis B. Pyrkosz, and Timothy H. Brom. 2012. “A Reference-Free Algorithm for Computational Normalization of Shotgun Sequencing Data.” ArXiv:1203.4802 [q-Bio], May. arxiv.org/abs/1203.4802. + > Crusoe, Michael R., Hussien F. Alameldin, Sherine Awad, Elmar Boucher, Adam Caldwell, Reed Cartwright, Amanda Charbonneau, et al. 2015. “The Khmer Software Package: Enabling Efficient Nucleotide Sequence Analysis.” F1000Research 4 (September): 900. doi.org/10.12688/f1000research.6924.1. + > Zhang, Qingpeng, Jason Pell, Rosangela Canino-Koning, Adina Chuang Howe, and C. Titus Brown. 2014. “These Are Not the K-Mers You Are Looking For: Efficient Online K-Mer Counting Using a Probabilistic Data Structure.” PLOS ONE 9 (7): e101271. doi.org/10.1371/journal.pone.0101271. + +- [Seqtk](https://github.com/lh3/seqtk) + +- [RNAspade](https://cab.spbu.ru/software/rnaspades/) + + > Elena Bushmanova, Dmitry Antipov, Alla Lapidus, Andrey D Prjibelski rnaSPAdes: a de novo transcriptome assembler and its application to RNA-Seq data GigaScience, 2019 + +- [Megahit](https://github.com/voutcn/megahit) + + > Li, D., Liu, C-M., Luo, R., Sadakane, K., and Lam, T-W., (2015) MEGAHIT: An ultra-fast single-node solution for large and complex metagenomics assembly via succinct de Bruijn graph. Bioinformatics, doi: 10.1093/bioinformatics/btv033 [PMID: 25609793]. + > Li, D., Luo, R., Liu, C.M., Leung, C.M., Ting, H.F., Sadakane, K., Yamashita, H. and Lam, T.W., 2016. MEGAHIT v1.0: A Fast and Scalable Metagenome Assembler driven by Advanced Methodologies and Community Practices. Methods. + +- [TransDecoder](https://github.com/TransDecoder/TransDecoder) + +- [Prokka](https://github.com/tseemann/prokka) + + > Seemann T. Prokka: rapid prokaryotic genome annotation Bioinformatics 2014 Jul 15;30(14):2068-9. 
PMID:24642063 + +- [Prodigal](https://github.com/hyattpd/Prodigal) + +- [BBmap](https://sourceforge.net/projects/bbmap/) + +- [FeatureCounts](https://subread.sourceforge.net) + + > Liao Y, Smyth GK and Shi W. The R package Rsubread is easier, faster, cheaper and better for alignment and quantification of RNA sequencing reads. Nucleic Acids Research, 47(8):e47, 2019 + > Liao Y, Smyth GK and Shi W. featureCounts: an efficient general-purpose program for assigning sequence reads to genomic features. Bioinformatics, 30(7):923-30, 2014 + > Liao Y, Smyth GK and Shi W. The Subread aligner: fast, accurate and scalable read mapping by seed-and-vote. Nucleic Acids Research, 41(10):e108, 2013 + +- [Eggnog](https://github.com/eggnogdb/eggnog-mapper) + + > eggNOG-mapper v2: functional annotation, orthology assignments, and domain + > prediction at the metagenomic scale. Carlos P. Cantalapiedra, + > Ana Hernandez-Plaza, Ivica Letunic, Peer Bork, Jaime Huerta-Cepas. 2021. + > Molecular Biology and Evolution, msab293, https://doi.org/10.1093/molbev/msab293 + > eggNOG 5.0: a hierarchical, functionally and phylogenetically annotated + > orthology resource based on 5090 organisms and 2502 viruses. Jaime + > Huerta-Cepas, Damian Szklarczyk, Davide Heller, Ana Hernández-Plaza, Sofia + > K Forslund, Helen Cook, Daniel R Mende, Ivica Letunic, Thomas Rattei, Lars + > J Jensen, Christian von Mering, Peer Bork Nucleic Acids Res. 2019 Jan 8; + > 47(Database issue): D309–D314. doi: 10.1093/nar/gky1085 + +- [Kofamscan](https://github.com/takaram/kofam_scan) + +- [HMMsearch](https://www.ebi.ac.uk/Tools/hmmer/search/hmmsearch) + +- [EUKulele](https://github.com/AlexanderLabWHOI/EUKulele) + +- [CAT](https://github.com/dutilh/CAT) + + > von Meijenfeldt FAB, Arkhipova K, Cambuy DD, Coutinho FH, Dutilh BE. Robust taxonomic classification of uncharted microbial sequences and bins with CAT and BAT. Genome Biology. 2019;20:217. + +- [transrate](https://hibberdlab.com/transrate/) + + > TransRate: reference free quality assessment of de-novo transcriptome assemblies (2016). Richard D Smith-Unna, Chris Boursnell, Rob Patro, Julian M Hibberd, Steven Kelly. Genome Research doi: [http://dx.doi.org/10.1101/gr.196469.115](http://dx.doi.org/10.1101/gr.196469.115) ## Software packaging/containerisation tools -* [Anaconda](https://anaconda.com) - > Anaconda Software Distribution. Computer software. Vers. 2-2.4.0. Anaconda, Nov. 2016. Web. +- [Anaconda](https://anaconda.com) + + > Anaconda Software Distribution. Computer software. Vers. 2-2.4.0. Anaconda, Nov. 2016. Web. + +- [Bioconda](https://pubmed.ncbi.nlm.nih.gov/29967506/) + + > Grüning B, Dale R, Sjödin A, Chapman BA, Rowe J, Tomkins-Tinch CH, Valieris R, Köster J; Bioconda Team. Bioconda: sustainable and comprehensive software distribution for the life sciences. Nat Methods. 2018 Jul;15(7):475-476. doi: 10.1038/s41592-018-0046-7. PubMed PMID: 29967506. + +- [BioContainers](https://pubmed.ncbi.nlm.nih.gov/28379341/) + + > da Veiga Leprevost F, Grüning B, Aflitos SA, Röst HL, Uszkoreit J, Barsnes H, Vaudel M, Moreno P, Gatto L, Weber J, Bai M, Jimenez RC, Sachsenberg T, Pfeuffer J, Alvarez RV, Griss J, Nesvizhskii AI, Perez-Riverol Y. BioContainers: an open-source and community-driven framework for software standardization. Bioinformatics. 2017 Aug 15;33(16):2580-2582. doi: 10.1093/bioinformatics/btx192. PubMed PMID: 28379341; PubMed Central PMCID: PMC5870671. 
-* [Bioconda](https://pubmed.ncbi.nlm.nih.gov/29967506/) - > Grüning B, Dale R, Sjödin A, Chapman BA, Rowe J, Tomkins-Tinch CH, Valieris R, Köster J; Bioconda Team. Bioconda: sustainable and comprehensive software distribution for the life sciences. Nat Methods. 2018 Jul;15(7):475-476. doi: 10.1038/s41592-018-0046-7. PubMed PMID: 29967506. +- [Docker](https://dl.acm.org/doi/10.5555/2600239.2600241) -* [BioContainers](https://pubmed.ncbi.nlm.nih.gov/28379341/) - > da Veiga Leprevost F, Grüning B, Aflitos SA, Röst HL, Uszkoreit J, Barsnes H, Vaudel M, Moreno P, Gatto L, Weber J, Bai M, Jimenez RC, Sachsenberg T, Pfeuffer J, Alvarez RV, Griss J, Nesvizhskii AI, Perez-Riverol Y. BioContainers: an open-source and community-driven framework for software standardization. Bioinformatics. 2017 Aug 15;33(16):2580-2582. doi: 10.1093/bioinformatics/btx192. PubMed PMID: 28379341; PubMed Central PMCID: PMC5870671. + > Merkel, D. (2014). Docker: lightweight linux containers for consistent development and deployment. Linux Journal, 2014(239), 2. doi: 10.5555/2600239.2600241. -* [Docker](https://dl.acm.org/doi/10.5555/2600239.2600241) +- [Singularity](https://pubmed.ncbi.nlm.nih.gov/28494014/) -* [Singularity](https://pubmed.ncbi.nlm.nih.gov/28494014/) - > Kurtzer GM, Sochat V, Bauer MW. Singularity: Scientific containers for mobility of compute. PLoS One. 2017 May 11;12(5):e0177459. doi: 10.1371/journal.pone.0177459. eCollection 2017. PubMed PMID: 28494014; PubMed Central PMCID: PMC5426675. + > Kurtzer GM, Sochat V, Bauer MW. Singularity: Scientific containers for mobility of compute. PLoS One. 2017 May 11;12(5):e0177459. doi: 10.1371/journal.pone.0177459. eCollection 2017. PubMed PMID: 28494014; PubMed Central PMCID: PMC5426675. diff --git a/CODE_OF_CONDUCT.md b/CODE_OF_CONDUCT.md index f4fd052f..c089ec78 100644 --- a/CODE_OF_CONDUCT.md +++ b/CODE_OF_CONDUCT.md @@ -1,18 +1,20 @@ -# Code of Conduct at nf-core (v1.0) +# Code of Conduct at nf-core (v1.4) ## Our Pledge -In the interest of fostering an open, collaborative, and welcoming environment, we as contributors and maintainers of nf-core, pledge to making participation in our projects and community a harassment-free experience for everyone, regardless of: +In the interest of fostering an open, collaborative, and welcoming environment, we as contributors and maintainers of nf-core pledge to making participation in our projects and community a harassment-free experience for everyone, regardless of: - Age +- Ability - Body size +- Caste - Familial status - Gender identity and expression - Geographical location - Level of experience - Nationality and national origins - Native language -- Physical and neurological ability +- Neurodiversity - Race or ethnicity - Religion - Sexual identity and orientation @@ -22,80 +24,133 @@ Please note that the list above is alphabetised and is therefore not ranked in a ## Preamble -> Note: This Code of Conduct (CoC) has been drafted by the nf-core Safety Officer and been edited after input from members of the nf-core team and others. "We", in this document, refers to the Safety Officer and members of the nf-core core team, both of whom are deemed to be members of the nf-core community and are therefore required to abide by this Code of Conduct. This document will amended periodically to keep it up-to-date, and in case of any dispute, the most current version will apply. 
+:::note +This Code of Conduct (CoC) has been drafted by Renuka Kudva, Cris Tuñí, and Michael Heuer, with input from the nf-core Core Team and Susanna Marquez from the nf-core community. "We", in this document, refers to the Safety Officers and members of the nf-core Core Team, both of whom are deemed to be members of the nf-core community and are therefore required to abide by this Code of Conduct. This document will be amended periodically to keep it up-to-date. In case of any dispute, the most current version will apply. +::: -An up-to-date list of members of the nf-core core team can be found [here](https://nf-co.re/about). Our current safety officer is Renuka Kudva. +An up-to-date list of members of the nf-core core team can be found [here](https://nf-co.re/about). + +Our Safety Officers are Saba Nafees, Cris Tuñí, and Michael Heuer. nf-core is a young and growing community that welcomes contributions from anyone with a shared vision for [Open Science Policies](https://www.fosteropenscience.eu/taxonomy/term/8). Open science policies encompass inclusive behaviours and we strive to build and maintain a safe and inclusive environment for all individuals. -We have therefore adopted this code of conduct (CoC), which we require all members of our community and attendees in nf-core events to adhere to in all our workspaces at all times. Workspaces include but are not limited to Slack, meetings on Zoom, Jitsi, YouTube live etc. +We have therefore adopted this CoC, which we require all members of our community and attendees of nf-core events to adhere to in all our workspaces at all times. Workspaces include, but are not limited to, Slack, meetings on Zoom, gather.town, YouTube live etc. -Our CoC will be strictly enforced and the nf-core team reserve the right to exclude participants who do not comply with our guidelines from our workspaces and future nf-core activities. +Our CoC will be strictly enforced and the nf-core team reserves the right to exclude participants who do not comply with our guidelines from our workspaces and future nf-core activities. -We ask all members of our community to help maintain a supportive and productive workspace and to avoid behaviours that can make individuals feel unsafe or unwelcome. Please help us maintain and uphold this CoC. +We ask all members of our community to help maintain supportive and productive workspaces and to avoid behaviours that can make individuals feel unsafe or unwelcome. Please help us maintain and uphold this CoC. -Questions, concerns or ideas on what we can include? Contact safety [at] nf-co [dot] re +Questions, concerns, or ideas on what we can include? Contact members of the Safety Team on Slack or email safety [at] nf-co [dot] re. ## Our Responsibilities -The safety officer is responsible for clarifying the standards of acceptable behavior and are expected to take appropriate and fair corrective action in response to any instances of unacceptable behaviour. +Members of the Safety Team (the Safety Officers) are responsible for clarifying the standards of acceptable behavior and are expected to take appropriate and fair corrective action in response to any instances of unacceptable behaviour. 
-The safety officer in consultation with the nf-core core team have the right and responsibility to remove, edit, or reject comments, commits, code, wiki edits, issues, and other contributions that are not aligned to this Code of Conduct, or to ban temporarily or permanently any contributor for other behaviors that they deem inappropriate, threatening, offensive, or harmful. +The Safety Team, in consultation with the nf-core core team, have the right and responsibility to remove, edit, or reject comments, commits, code, wiki edits, issues, and other contributions that are not aligned to this CoC, or to ban temporarily or permanently any contributor for other behaviors that they deem inappropriate, threatening, offensive, or harmful. -Members of the core team or the safety officer who violate the CoC will be required to recuse themselves pending investigation. They will not have access to any reports of the violations and be subject to the same actions as others in violation of the CoC. +Members of the core team or the Safety Team who violate the CoC will be required to recuse themselves pending investigation. They will not have access to any reports of the violations and will be subject to the same actions as others in violation of the CoC. -## When are where does this Code of Conduct apply? +## When and where does this Code of Conduct apply? -Participation in the nf-core community is contingent on following these guidelines in all our workspaces and events. This includes but is not limited to the following listed alphabetically and therefore in no order of preference: +Participation in the nf-core community is contingent on following these guidelines in all our workspaces and events, such as hackathons, workshops, bytesize, and collaborative workspaces on gather.town. These guidelines include, but are not limited to, the following (listed alphabetically and therefore in no order of preference): - Communicating with an official project email address. - Communicating with community members within the nf-core Slack channel. - Participating in hackathons organised by nf-core (both online and in-person events). -- Participating in collaborative work on GitHub, Google Suite, community calls, mentorship meetings, email correspondence. -- Participating in workshops, training, and seminar series organised by nf-core (both online and in-person events). This applies to events hosted on web-based platforms such as Zoom, Jitsi, YouTube live etc. +- Participating in collaborative work on GitHub, Google Suite, community calls, mentorship meetings, email correspondence, and on the nf-core gather.town workspace. +- Participating in workshops, training, and seminar series organised by nf-core (both online and in-person events). This applies to events hosted on web-based platforms such as Zoom, gather.town, Jitsi, YouTube live etc. - Representing nf-core on social media. This includes both official and personal accounts. ## nf-core cares 😊 -nf-core's CoC and expectations of respectful behaviours for all participants (including organisers and the nf-core team) include but are not limited to the following (listed in alphabetical order): +nf-core's CoC and expectations of respectful behaviours for all participants (including organisers and the nf-core team) include, but are not limited to, the following (listed in alphabetical order): - Ask for consent before sharing another community member’s personal information (including photographs) on social media. - Be respectful of differing viewpoints and experiences. 
We are all here to learn from one another and a difference in opinion can present a good learning opportunity. -- Celebrate your accomplishments at events! (Get creative with your use of emojis 🎉 🥳 💯 🙌 !) +- Celebrate your accomplishments! (Get creative with your use of emojis 🎉 🥳 💯 🙌 !) - Demonstrate empathy towards other community members. (We don’t all have the same amount of time to dedicate to nf-core. If tasks are pending, don’t hesitate to gently remind members of your team. If you are leading a task, ask for help if you feel overwhelmed.) - Engage with and enquire after others. (This is especially important given the geographically remote nature of the nf-core community, so let’s do this the best we can) - Focus on what is best for the team and the community. (When in doubt, ask) -- Graciously accept constructive criticism, yet be unafraid to question, deliberate, and learn. +- Accept feedback, yet be unafraid to question, deliberate, and learn. - Introduce yourself to members of the community. (We’ve all been outsiders and we know that talking to strangers can be hard for some, but remember we’re interested in getting to know you and your visions for open science!) -- Show appreciation and **provide clear feedback**. (This is especially important because we don’t see each other in person and it can be harder to interpret subtleties. Also remember that not everyone understands a certain language to the same extent as you do, so **be clear in your communications to be kind.**) +- Show appreciation and **provide clear feedback**. (This is especially important because we don’t see each other in person and it can be harder to interpret subtleties. Also remember that not everyone understands a certain language to the same extent as you do, so **be clear in your communication to be kind.**) - Take breaks when you feel like you need them. -- Using welcoming and inclusive language. (Participants are encouraged to display their chosen pronouns on Zoom or in communication on Slack.) +- Use welcoming and inclusive language. (Participants are encouraged to display their chosen pronouns on Zoom or in communication on Slack) ## nf-core frowns on 😕 -The following behaviours from any participants within the nf-core community (including the organisers) will be considered unacceptable under this code of conduct. Engaging or advocating for any of the following could result in expulsion from nf-core workspaces. +The following behaviours from any participants within the nf-core community (including the organisers) will be considered unacceptable under this CoC. Engaging or advocating for any of the following could result in expulsion from nf-core workspaces: - Deliberate intimidation, stalking or following and sustained disruption of communication among participants of the community. This includes hijacking shared screens through actions such as using the annotate tool in conferencing software such as Zoom. - “Doxing” i.e. posting (or threatening to post) another person’s personal identifying information online. - Spamming or trolling of individuals on social media. -- Use of sexual or discriminatory imagery, comments, or jokes and unwelcome sexual attention. -- Verbal and text comments that reinforce social structures of domination related to gender, gender identity and expression, sexual orientation, ability, physical appearance, body size, race, age, religion or work experience. +- Use of sexual or discriminatory imagery, comments, jokes, or unwelcome sexual attention. 
+- Verbal and text comments that reinforce social structures of domination related to gender, gender identity and expression, sexual orientation, ability, physical appearance, body size, race, age, religion, or work experience.

### Online Trolling

-The majority of nf-core interactions and events are held online. Unfortunately, holding events online comes with the added issue of online trolling. This is unacceptable, reports of such behaviour will be taken very seriously, and perpetrators will be excluded from activities immediately.
+The majority of nf-core interactions and events are held online. Unfortunately, holding events online comes with the risk of online trolling. This is unacceptable — reports of such behaviour will be taken very seriously and perpetrators will be excluded from activities immediately.

-All community members are required to ask members of the group they are working within for explicit consent prior to taking screenshots of individuals during video calls.
+All community members are **required** to ask members of the group they are working with for explicit consent prior to taking screenshots of individuals during video calls.

-## Procedures for Reporting CoC violations
+## Procedures for reporting CoC violations

If someone makes you feel uncomfortable through their behaviours or actions, report it as soon as possible.

-You can reach out to members of the [nf-core core team](https://nf-co.re/about) and they will forward your concerns to the safety officer(s).
+You can reach out to members of the Safety Team (Saba Nafees, Cris Tuñí, and Michael Heuer) on Slack. Alternatively, contact a member of the [nf-core core team](https://nf-co.re/about), and they will forward your concerns to the Safety Team.
+
+Issues directly concerning members of the Core Team or the Safety Team will be dealt with by other members of the core team and the safety manager — possible conflicts of interest will be taken into account. nf-core is also in discussions about having an ombudsperson and details will be shared in due course.
+
+All reports will be handled with the utmost discretion and confidentiality.
+
+You can also report any CoC violations to safety [at] nf-co [dot] re. In your email report, please do your best to include:
+
+- Your contact information.
+- Identifying information (e.g. names, nicknames, pseudonyms) of the participant who has violated the Code of Conduct.
+- The behaviour that was in violation and the circumstances surrounding the incident.
+- The approximate time of the behaviour (if different than the time the report was made).
+- Other people involved in the incident, if applicable.
+- If you believe the incident is ongoing.
+- If there is a publicly available record (e.g. mailing list record, a screenshot).
+- Any additional information.
+
+After you file a report, one or more members of our Safety Team will contact you to follow up on your report.
+
+## Who will read and handle reports
+
+All reports will be read and handled by the members of the Safety Team at nf-core.
+
+If members of the Safety Team are deemed to have a conflict of interest with a report, they will be required to recuse themselves as per our Code of Conduct and will not have access to any follow-ups.
+ +To keep this first report confidential from any of the Safety Team members, please submit your first report by direct messaging on Slack/direct email to any of the nf-core members you are comfortable disclosing the information to, and be explicit about which member(s) you do not consent to sharing the information with. + +## Reviewing reports + +After receiving the report, members of the Safety Team will review the incident report to determine whether immediate action is required, for example, whether there is immediate threat to participants’ safety. + +The Safety Team, in consultation with members of the nf-core core team, will assess the information to determine whether the report constitutes a Code of Conduct violation, for them to decide on a course of action. + +In the case of insufficient information, one or more members of the Safety Team may contact the reporter, the reportee, or any other attendees to obtain more information. -Issues directly concerning members of the core team will be dealt with by other members of the core team and the safety manager, and possible conflicts of interest will be taken into account. nf-core is also in discussions about having an ombudsperson, and details will be shared in due course. +Once additional information is gathered, the Safety Team will collectively review and decide on the best course of action to take, if any. The Safety Team reserves the right to not act on a report. -All reports will be handled with utmost discretion and confidentially. +## Confidentiality + +All reports, and any additional information included, are only shared with the team of safety officers (and possibly members of the core team, in case the safety officer is in violation of the CoC). We will respect confidentiality requests for the purpose of protecting victims of abuse. + +We will not name harassment victims, beyond discussions between the safety officer and members of the nf-core team, without the explicit consent of the individuals involved. + +## Enforcement + +Actions taken by the nf-core’s Safety Team may include, but are not limited to: + +- Asking anyone to stop a behaviour. +- Asking anyone to leave the event and online spaces either temporarily, for the remainder of the event, or permanently. +- Removing access to the gather.town and Slack, either temporarily or permanently. +- Communicating to all participants to reinforce our expectations for conduct and remind what is unacceptable behaviour; this may be public for practical reasons. +- Communicating to all participants that an incident has taken place and how we will act or have acted — this may be for the purpose of letting event participants know we are aware of and dealing with the incident. +- Banning anyone from participating in nf-core-managed spaces, future events, and activities, either temporarily or permanently. +- No action. ## Attribution and Acknowledgements @@ -106,6 +161,22 @@ All reports will be handled with utmost discretion and confidentially. ## Changelog -### v1.0 - March 12th, 2021 +### v1.4 - February 8th, 2022 + +- Included a new member of the Safety Team. Corrected a typographical error in the text. + +### v1.3 - December 10th, 2021 + +- Added a statement that the CoC applies to nf-core gather.town workspaces. Corrected typographical errors in the text. + +### v1.2 - November 12th, 2021 + +- Removed information specific to reporting CoC violations at the Hackathon in October 2021. 
+ +### v1.1 - October 14th, 2021 + +- Updated with names of new Safety Officers and specific information for the hackathon in October 2021. + +### v1.0 - March 15th, 2021 - Complete rewrite from original [Contributor Covenant](http://contributor-covenant.org/) CoC. diff --git a/LICENSE b/LICENSE index d55ce4bc..16b993a7 100644 --- a/LICENSE +++ b/LICENSE @@ -1,6 +1,6 @@ MIT License -Copyright (c) Daniel Lundin +Copyright (c) Danilo Di Leo, Emelie Nilsson & Daniel Lundin Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal diff --git a/README.md b/README.md index 28903613..ba40c257 100644 --- a/README.md +++ b/README.md @@ -1,71 +1,105 @@ -# ![nf-core/metatdenovo](docs/images/nf-core-metatdenovo_logo.png) - +

+ + + nf-core/metatdenovo + +

[![GitHub Actions CI Status](https://github.com/nf-core/metatdenovo/workflows/nf-core%20CI/badge.svg)](https://github.com/nf-core/metatdenovo/actions?query=workflow%3A%22nf-core+CI%22) -[![GitHub Actions Linting Status](https://github.com/nf-core/metatdenovo/workflows/nf-core%20linting/badge.svg)](https://github.com/nf-core/metatdenovo/actions?query=workflow%3A%22nf-core+linting%22) -[![AWS CI](https://img.shields.io/badge/CI%20tests-full%20size-FF9900?labelColor=000000&logo=Amazon%20AWS)](https://nf-co.re/metatdenovo/results) -[![Cite with Zenodo](http://img.shields.io/badge/DOI-10.5281/zenodo.XXXXXXX-1073c8?labelColor=000000)](https://doi.org/10.5281/zenodo.XXXXXXX) +[![GitHub Actions Linting Status](https://github.com/nf-core/metatdenovo/workflows/nf-core%20linting/badge.svg)](https://github.com/nf-core/metatdenovo/actions?query=workflow%3A%22nf-core+linting%22)[![AWS CI](https://img.shields.io/badge/CI%20tests-full%20size-FF9900?labelColor=000000&logo=Amazon%20AWS)](https://nf-co.re/metatdenovo/results)[![Cite with Zenodo](http://img.shields.io/badge/DOI-10.5281/zenodo.XXXXXXX-1073c8?labelColor=000000)](https://doi.org/10.5281/zenodo.XXXXXXX) -[![Nextflow](https://img.shields.io/badge/nextflow%20DSL2-%E2%89%A521.04.0-23aa62.svg?labelColor=000000)](https://www.nextflow.io/) +[![Nextflow](https://img.shields.io/badge/nextflow%20DSL2-%E2%89%A523.04.0-23aa62.svg)](https://www.nextflow.io/) [![run with conda](http://img.shields.io/badge/run%20with-conda-3EB049?labelColor=000000&logo=anaconda)](https://docs.conda.io/en/latest/) [![run with docker](https://img.shields.io/badge/run%20with-docker-0db7ed?labelColor=000000&logo=docker)](https://www.docker.com/) [![run with singularity](https://img.shields.io/badge/run%20with-singularity-1d355c.svg?labelColor=000000)](https://sylabs.io/docs/) +[![Launch on Nextflow Tower](https://img.shields.io/badge/Launch%20%F0%9F%9A%80-Nextflow%20Tower-%234256e7)](https://tower.nf/launch?pipeline=https://github.com/nf-core/metatdenovo) -[![Get help on Slack](http://img.shields.io/badge/slack-nf--core%20%23metatdenovo-4A154B?labelColor=000000&logo=slack)](https://nfcore.slack.com/channels/metatdenovo) -[![Follow on Twitter](http://img.shields.io/badge/twitter-%40nf__core-1DA1F2?labelColor=000000&logo=twitter)](https://twitter.com/nf_core) -[![Watch on YouTube](http://img.shields.io/badge/youtube-nf--core-FF0000?labelColor=000000&logo=youtube)](https://www.youtube.com/c/nf-core) +[![Get help on Slack](http://img.shields.io/badge/slack-nf--core%20%23metatdenovo-4A154B?labelColor=000000&logo=slack)](https://nfcore.slack.com/channels/metatdenovo)[![Follow on Twitter](http://img.shields.io/badge/twitter-%40nf__core-1DA1F2?labelColor=000000&logo=twitter)](https://twitter.com/nf_core)[![Follow on Mastodon](https://img.shields.io/badge/mastodon-nf__core-6364ff?labelColor=FFFFFF&logo=mastodon)](https://mstdn.science/@nf_core)[![Watch on YouTube](http://img.shields.io/badge/youtube-nf--core-FF0000?labelColor=000000&logo=youtube)](https://www.youtube.com/c/nf-core) ## Introduction - -**nf-core/metatdenovo** is a bioinformatics best-practice analysis pipeline for Assembly and annotation of metatranscriptomic data, both prokaryotic and eukaryotic. +**nf-core/metatdenovo** is a bioinformatics best-practice analysis pipeline for assembly and annotation of metatranscriptomic data, both prokaryotic and eukaryotic. The pipeline is built using [Nextflow](https://www.nextflow.io), a workflow tool to run tasks across multiple compute infrastructures in a very portable manner. 
It uses Docker/Singularity containers making installation trivial and results highly reproducible. The [Nextflow DSL2](https://www.nextflow.io/docs/latest/dsl2.html) implementation of this pipeline uses one container per process which makes it much easier to maintain and update software dependencies. Where possible, these processes have been submitted to and installed from [nf-core/modules](https://github.com/nf-core/modules) in order to make them available to all nf-core pipelines, and to everyone within the Nextflow community! - On release, automated continuous integration tests run the pipeline on a full-sized dataset on the AWS cloud infrastructure. This ensures that the pipeline runs on AWS, has sensible resource allocation defaults set to run on real-world datasets, and permits the persistent storage of results to benchmark between pipeline releases and other analysis sources. The results obtained from the full-sized test can be viewed on the [nf-core website](https://nf-co.re/metatdenovo/results). ## Pipeline summary - +![nf-core/metatdenovo metro map](docs/images/metat_v6.svg) 1. Read QC ([`FastQC`](https://www.bioinformatics.babraham.ac.uk/projects/fastqc/)) 2. Present QC for raw reads ([`MultiQC`](http://multiqc.info/)) - -## Quick Start - -1. Install [`Nextflow`](https://www.nextflow.io/docs/latest/getstarted.html#installation) (`>=21.04.0`) - -2. Install any of [`Docker`](https://docs.docker.com/engine/installation/), [`Singularity`](https://www.sylabs.io/guides/3.0/user-guide/), [`Podman`](https://podman.io/), [`Shifter`](https://nersc.gitlab.io/development/shifter/how-to-use/) or [`Charliecloud`](https://hpc.github.io/charliecloud/) for full pipeline reproducibility _(please only use [`Conda`](https://conda.io/miniconda.html) as a last resort; see [docs](https://nf-co.re/usage/configuration#basic-configuration-profiles))_ - -3. Download the pipeline and test it on a minimal dataset with a single command: - - ```console - nextflow run nf-core/metatdenovo -profile test, - ``` - - > * Please check [nf-core/configs](https://github.com/nf-core/configs#documentation) to see if a custom config file to run nf-core pipelines already exists for your Institute. If so, you can simply use `-profile ` in your command. This will enable either `docker` or `singularity` and set the appropriate execution settings for your local compute environment. - > * If you are using `singularity` then the pipeline will auto-detect this and attempt to download the Singularity images directly as opposed to performing a conversion from Docker images. If you are persistently observing issues downloading Singularity images directly due to timeout or network issues then please use the `--singularity_pull_docker_container` parameter to pull and convert the Docker image instead. Alternatively, it is highly recommended to use the [`nf-core download`](https://nf-co.re/tools/#downloading-pipelines-for-offline-use) command to pre-download all of the required containers before running the pipeline and to set the [`NXF_SINGULARITY_CACHEDIR` or `singularity.cacheDir`](https://www.nextflow.io/docs/latest/singularity.html?#singularity-docker-hub) Nextflow options to be able to store and re-use the images from a central location for future pipeline runs. - > * If you are using `conda`, it is highly recommended to use the [`NXF_CONDA_CACHEDIR` or `conda.cacheDir`](https://www.nextflow.io/docs/latest/conda.html) settings to store the environments in a central location for future pipeline runs. - -4. 
Start running your own analysis! - - - - ```console - nextflow run nf-core/metatdenovo -profile --input samplesheet.csv --genome GRCh37 - ``` - -## Documentation - -The nf-core/metatdenovo pipeline comes with documentation about the pipeline [usage](https://nf-co.re/metatdenovo/usage), [parameters](https://nf-co.re/metatdenovo/parameters) and [output](https://nf-co.re/metatdenovo/output).
+3. Quality trimming and adapter removal for raw reads ([`Trim Galore!`](https://www.bioinformatics.babraham.ac.uk/projects/trim_galore/))
+4. Optional: Filter sequences with [`BBduk`](https://jgi.doe.gov/data-and-tools/software-tools/bbtools/bb-tools-user-guide/bbduk-guide/)
+5. Optional: Normalize the sequencing depth with [`BBnorm`](https://jgi.doe.gov/data-and-tools/software-tools/bbtools/bb-tools-user-guide/bbnorm-guide/)
+6. Merge trimmed, paired-end reads ([`Seqtk`](https://github.com/lh3/seqtk))
+7. Choice of de novo assembly programs:
+ 1. [`RNAspades`](https://cab.spbu.ru/software/rnaspades/) suggested for Eukaryote de novo assembly
+ 2. [`Megahit`](https://github.com/voutcn/megahit) suggested for Prokaryote de novo assembly
+8. Choice of ORF caller:
+ 1. [`TransDecoder`](https://github.com/TransDecoder/TransDecoder) suggested for Eukaryotes
+ 2. [`Prokka`](https://github.com/tseemann/prokka) suggested for Prokaryotes
+ 3. [`Prodigal`](https://github.com/hyattpd/Prodigal) suggested for Prokaryotes
+9. Quantification of genes identified in assemblies:
+ 1. Generate index of assembly ([`BBmap index`](https://sourceforge.net/projects/bbmap/))
+ 2. Map cleaned reads to the assembly for quantification ([`BBmap`](https://sourceforge.net/projects/bbmap/))
+ 3. Get raw counts for each gene present in the assembly ([`Featurecounts`](http://subread.sourceforge.net)) -> TSV table with collected featurecounts output
+10. Functional annotation:
+ 1. [`Eggnog`](https://github.com/eggnogdb/eggnog-mapper) -> Reformat TSV output "eggnog table"
+ 2. [`KOfamscan`](https://github.com/takaram/kofam_scan)
+ 3. [`HMMERsearch`](https://www.ebi.ac.uk/Tools/hmmer/search/hmmsearch) -> Ranking ORFs based on HMM profiles with [`Hmmrank`](https://github.com/erikrikarddaniel/hmmrank)
+11. Taxonomic annotation:
+ 1. [`EUKulele`](https://github.com/AlexanderLabWHOI/EUKulele) -> Reformat TSV output "Reformat_tax.R"
+ 2. [`CAT`](https://github.com/dutilh/CAT)
+12. Summary statistics table ("Collect_stats.R")
+
+## Usage
+
+> [!NOTE]
+> If you are new to Nextflow and nf-core, please refer to [this page](https://nf-co.re/docs/usage/installation) on how to set up Nextflow. Make sure to [test your setup](https://nf-co.re/docs/usage/introduction#how-to-run-a-pipeline) with `-profile test` before running the workflow on actual data.
+
+First, prepare a samplesheet with your input data that looks as follows:
+
+`samplesheet.csv`:
+
+```
+| sample   | fastq_1                   | fastq_2                   |
+| -------- | ------------------------- | ------------------------- |
+| sample1  | ./data/S1_R1_001.fastq.gz | ./data/S1_R2_001.fastq.gz |
+| sample2  | ./data/S2_fw.fastq.gz     | ./data/S2_rv.fastq.gz     |
+| sample3  | ./S4x.fastq.gz            | ./S4y.fastq.gz            |
+| sample4  | ./a.fastq.gz              | ./b.fastq.gz              |
+```
+
+Each row represents a fastq file (single-end) or a pair of fastq files (paired-end).
+
+Now, you can run the pipeline using:
+
+```bash
+nextflow run nf-core/metatdenovo \
+   -profile <docker/singularity/.../institute> \
+   --input samplesheet.csv \
+   --outdir <OUTDIR>
+```
+
+> [!WARNING]
+> Please provide pipeline parameters via the CLI or Nextflow `-params-file` option.
Custom config files including those provided by the `-c` Nextflow option can be used to provide any configuration _**except for parameters**_; +> see [docs](https://nf-co.re/usage/configuration#custom-configuration-files). + +For more details and further functionality, please refer to the [usage documentation](https://nf-co.re/metatdenovo/usage) and the [parameter documentation](https://nf-co.re/metatdenovo/parameters). + +## Pipeline output + +To see the results of an example test run with a full size dataset refer to the [results](https://nf-co.re/metatdenovo/results) tab on the nf-core website pipeline page. +For more details about the output files and reports, please refer to the +[output documentation](https://nf-co.re/metatdenovo/output). + +> [!NOTE] +> Tables in `summary_tables` directory under the output directory are made especially for further analysis in tools like R or Python. ## Credits -nf-core/metatdenovo was originally written by Daniel Lundin. - -We thank the following people for their extensive assistance in the development of this pipeline: - - +nf-core/metatdenovo was originally written by Danilo Di Leo (@danilodileo), Emelie Nilsson (@emnilsson) & Daniel Lundin (@erikrikarddaniel). ## Contributions and Support @@ -76,9 +110,8 @@ For further information or help, don't hesitate to get in touch on the [Slack `# ## Citations - + - An extensive list of references for the tools used by the pipeline can be found in the [`CITATIONS.md`](CITATIONS.md) file. You can cite the `nf-core` publication as follows: diff --git a/assets/adaptivecard.json b/assets/adaptivecard.json new file mode 100644 index 00000000..4646348f --- /dev/null +++ b/assets/adaptivecard.json @@ -0,0 +1,67 @@ +{ + "type": "message", + "attachments": [ + { + "contentType": "application/vnd.microsoft.card.adaptive", + "contentUrl": null, + "content": { + "\$schema": "http://adaptivecards.io/schemas/adaptive-card.json", + "msteams": { + "width": "Full" + }, + "type": "AdaptiveCard", + "version": "1.2", + "body": [ + { + "type": "TextBlock", + "size": "Large", + "weight": "Bolder", + "color": "<% if (success) { %>Good<% } else { %>Attention<%} %>", + "text": "nf-core/metatdenovo v${version} - ${runName}", + "wrap": true + }, + { + "type": "TextBlock", + "spacing": "None", + "text": "Completed at ${dateComplete} (duration: ${duration})", + "isSubtle": true, + "wrap": true + }, + { + "type": "TextBlock", + "text": "<% if (success) { %>Pipeline completed successfully!<% } else { %>Pipeline completed with errors. The full error message was: ${errorReport}.<% } %>", + "wrap": true + }, + { + "type": "TextBlock", + "text": "The command used to launch the workflow was as follows:", + "wrap": true + }, + { + "type": "TextBlock", + "text": "${commandLine}", + "isSubtle": true, + "wrap": true + } + ], + "actions": [ + { + "type": "Action.ShowCard", + "title": "Pipeline Configuration", + "card": { + "type": "AdaptiveCard", + "\$schema": "http://adaptivecards.io/schemas/adaptive-card.json", + "body": [ + { + "type": "FactSet", + "facts": [<% out << summary.collect{ k,v -> "{\"title\": \"$k\", \"value\" : \"$v\"}"}.join(",\n") %> + ] + } + ] + } + } + ] + } + } + ] +} diff --git a/assets/email_template.html b/assets/email_template.html index 4d5b455c..c88601b6 100644 --- a/assets/email_template.html +++ b/assets/email_template.html @@ -12,7 +12,7 @@ -

nf-core/metatdenovo v${version}

+

nf-core/metatdenovo ${version}

Run Name: $runName

<% if (!success){ diff --git a/assets/email_template.txt b/assets/email_template.txt index 9fc04806..5f1e0726 100644 --- a/assets/email_template.txt +++ b/assets/email_template.txt @@ -4,9 +4,8 @@ |\\ | |__ __ / ` / \\ |__) |__ } { | \\| | \\__, \\__/ | \\ |___ \\`-._,-`-, `._,._,' - nf-core/metatdenovo v${version} + nf-core/metatdenovo ${version} ---------------------------------------------------- - Run Name: $runName <% if (success){ diff --git a/assets/methods_description_template.yml b/assets/methods_description_template.yml new file mode 100644 index 00000000..4804e43b --- /dev/null +++ b/assets/methods_description_template.yml @@ -0,0 +1,27 @@ +id: "nf-core-metatdenovo-methods-description" +description: "Suggested text and references to use when describing pipeline usage within the methods section of a publication." +section_name: "nf-core/metatdenovo Methods Description" +section_href: "https://github.com/nf-core/metatdenovo" +plot_type: "html" +data: | +

Methods

+

Data was processed using nf-core/metatdenovo v${workflow.manifest.version} ${doi_text} of the nf-core collection of workflows (Ewels et al., 2020), utilising reproducible software environments from the Bioconda (Grüning et al., 2018) and Biocontainers (da Veiga Leprevost et al., 2017) projects.

+

The pipeline was executed with Nextflow v${workflow.nextflow.version} (Di Tommaso et al., 2017) with the following command:

+
${workflow.commandLine}
+

${tool_citations}

+

References

+
    +
  • Di Tommaso, P., Chatzou, M., Floden, E. W., Barja, P. P., Palumbo, E., & Notredame, C. (2017). Nextflow enables reproducible computational workflows. Nature Biotechnology, 35(4), 316-319. doi: 10.1038/nbt.3820
  • +
  • Ewels, P. A., Peltzer, A., Fillinger, S., Patel, H., Alneberg, J., Wilm, A., Garcia, M. U., Di Tommaso, P., & Nahnsen, S. (2020). The nf-core framework for community-curated bioinformatics pipelines. Nature Biotechnology, 38(3), 276-278. doi: 10.1038/s41587-020-0439-x
  • +
  • Grüning, B., Dale, R., Sjödin, A., Chapman, B. A., Rowe, J., Tomkins-Tinch, C. H., Valieris, R., Köster, J., & Bioconda Team. (2018). Bioconda: sustainable and comprehensive software distribution for the life sciences. Nature Methods, 15(7), 475–476. doi: 10.1038/s41592-018-0046-7
  • +
  • da Veiga Leprevost, F., Grüning, B. A., Alves Aflitos, S., Röst, H. L., Uszkoreit, J., Barsnes, H., Vaudel, M., Moreno, P., Gatto, L., Weber, J., Bai, M., Jimenez, R. C., Sachsenberg, T., Pfeuffer, J., Vera Alvarez, R., Griss, J., Nesvizhskii, A. I., & Perez-Riverol, Y. (2017). BioContainers: an open-source and community-driven framework for software standardization. Bioinformatics (Oxford, England), 33(16), 2580–2582. doi: 10.1093/bioinformatics/btx192
  • + ${tool_bibliography} +
+
+
Notes:
+
    + ${nodoi_text} +
  • The command above does not include parameters contained in any configs or profiles that may have been used. Ensure the config file is also uploaded with your publication!
  • +
  • You should also cite all software used within this run. Check the "Software Versions" of this report to get version information.
  • +
+
diff --git a/assets/multiqc_config.yaml b/assets/multiqc_config.yaml index 27ccd018..6dd8f722 100644 --- a/assets/multiqc_config.yaml +++ b/assets/multiqc_config.yaml @@ -1,11 +1,11 @@ report_comment: > - This report has been generated by the nf-core/metatdenovo - analysis pipeline. For information about how to interpret these results, please see the - documentation. + This report has been generated by the nf-core/metatdenovo + analysis pipeline. For information about how to interpret these results, please see the + documentation. report_section_order: - software_versions: - order: -1000 - nf-core-metatdenovo-summary: - order: -1001 + software_versions: + order: -1000 + "nf-core-metatdenovo-summary": + order: -1001 export_plots: true diff --git a/assets/multiqc_config.yml b/assets/multiqc_config.yml new file mode 100644 index 00000000..4c8e1f76 --- /dev/null +++ b/assets/multiqc_config.yml @@ -0,0 +1,27 @@ +report_comment: > + This report has been generated by the nf-core/metatdenovo + analysis pipeline. For information about how to interpret these results, please see the + documentation. +report_section_order: + "nf-core-metatdenovo-methods-description": + order: -1000 + software_versions: + order: -1001 + "nf-core-metatdenovo-summary": + order: -1002 + +export_plots: true + +custom_data: + megahit_assemblies: + description: "Describes assembly statistics, generated by TransRate." + plot_type: table + rnaspades_assemblies: + description: "Describes assembly statistics, generated by TransRate." + plot_type: table + +custom_plot_config: + megahit_assemblies-plot: + col1_header: "File Name" + rnaspades_assemblies-plot: + col1_header: "File Name" diff --git a/assets/nf-core-metatdenovo_logo.png b/assets/nf-core-metatdenovo_logo.png deleted file mode 100644 index 9cf290d4..00000000 Binary files a/assets/nf-core-metatdenovo_logo.png and /dev/null differ diff --git a/assets/nf-core-metatdenovo_logo_light.png b/assets/nf-core-metatdenovo_logo_light.png new file mode 100644 index 00000000..2056280e Binary files /dev/null and b/assets/nf-core-metatdenovo_logo_light.png differ diff --git a/assets/samplesheet.csv b/assets/samplesheet.csv index 5f653ab7..448cbead 100644 --- a/assets/samplesheet.csv +++ b/assets/samplesheet.csv @@ -1,3 +1,3 @@ sample,fastq_1,fastq_2 -SAMPLE_PAIRED_END,/path/to/fastq/files/AEG588A1_S1_L002_R1_001.fastq.gz,/path/to/fastq/files/AEG588A1_S1_L002_R2_001.fastq.gz -SAMPLE_SINGLE_END,/path/to/fastq/files/AEG588A4_S4_L003_R1_001.fastq.gz, +SAMPLE1_PE,https://github.com/nf-core/test-datasets/raw/metatdenovo/test_data/test_minigut_R1.fastq.gz,https://github.com/nf-core/test-datasets/raw/metatdenovo/test_data/test_minigut_R2.fastq.gz +SAMPLE2_PE,https://github.com/nf-core/test-datasets/raw/metatdenovo/test_data/test_minigut_sample2_R1.fastq.gz,https://github.com/nf-core/test-datasets/raw/metatdenovo/test_data/test_minigut_sample2_R2.fastq.gz diff --git a/assets/schema_input.json b/assets/schema_input.json index 02378da2..db2b3c70 100644 --- a/assets/schema_input.json +++ b/assets/schema_input.json @@ -31,9 +31,6 @@ ] } }, - "required": [ - "sample", - "fastq_1" - ] + "required": ["sample", "fastq_1"] } } diff --git a/assets/sendmail_template.txt b/assets/sendmail_template.txt index 6bf6049e..0749c9f6 100644 --- a/assets/sendmail_template.txt +++ b/assets/sendmail_template.txt @@ -12,9 +12,9 @@ $email_html Content-Type: image/png;name="nf-core-metatdenovo_logo.png" Content-Transfer-Encoding: base64 Content-ID: -Content-Disposition: inline; filename="nf-core-metatdenovo_logo.png" 
+Content-Disposition: inline; filename="nf-core-metatdenovo_logo_light.png" -<% out << new File("$projectDir/assets/nf-core-metatdenovo_logo.png"). +<% out << new File("$projectDir/assets/nf-core-metatdenovo_logo_light.png"). bytes. encodeBase64(). toString(). diff --git a/assets/slackreport.json b/assets/slackreport.json new file mode 100644 index 00000000..fcab916f --- /dev/null +++ b/assets/slackreport.json @@ -0,0 +1,34 @@ +{ + "attachments": [ + { + "fallback": "Plain-text summary of the attachment.", + "color": "<% if (success) { %>good<% } else { %>danger<%} %>", + "author_name": "nf-core/metatdenovo ${version} - ${runName}", + "author_icon": "https://www.nextflow.io/docs/latest/_static/favicon.ico", + "text": "<% if (success) { %>Pipeline completed successfully!<% } else { %>Pipeline completed with errors<% } %>", + "fields": [ + { + "title": "Command used to launch the workflow", + "value": "```${commandLine}```", + "short": false + } + <% + if (!success) { %> + , + { + "title": "Full error message", + "value": "```${errorReport}```", + "short": false + }, + { + "title": "Pipeline configuration", + "value": "<% out << summary.collect{ k,v -> k == "hook_url" ? "_${k}_: (_hidden_)" : ( ( v.class.toString().contains('Path') || ( v.class.toString().contains('String') && v.contains('/') ) ) ? "_${k}_: `${v}`" : (v.class.toString().contains('DateTime') ? ("_${k}_: " + v.format(java.time.format.DateTimeFormatter.ofLocalizedDateTime(java.time.format.FormatStyle.MEDIUM))) : "_${k}_: ${v}") ) }.join(",\n") %>", + "short": false + } + <% } + %> + ], + "footer": "Completed at <% out << dateComplete.format(java.time.format.DateTimeFormatter.ofLocalizedDateTime(java.time.format.FormatStyle.MEDIUM)) %> (duration: ${duration})" + } + ] +} diff --git a/bin/check_samplesheet.py b/bin/check_samplesheet.py index 8ba9b9d3..4a758fe0 100755 --- a/bin/check_samplesheet.py +++ b/bin/check_samplesheet.py @@ -1,145 +1,258 @@ #!/usr/bin/env python -# TODO nf-core: Update the script to check the samplesheet -# This script is based on the example at: https://raw.githubusercontent.com/nf-core/test-datasets/viralrecon/samplesheet/samplesheet_test_illumina_amplicon.csv -import os -import sys -import errno +"""Provide a command line tool to validate and transform tabular samplesheets.""" + + import argparse +import csv +import logging +import sys +from collections import Counter +from pathlib import Path +logger = logging.getLogger() -def parse_args(args=None): - Description = "Reformat nf-core/metatdenovo samplesheet file and check its contents." - Epilog = "Example usage: python check_samplesheet.py " - parser = argparse.ArgumentParser(description=Description, epilog=Epilog) - parser.add_argument("FILE_IN", help="Input samplesheet file.") - parser.add_argument("FILE_OUT", help="Output file.") - return parser.parse_args(args) +class RowChecker: + """ + Define a service that can validate and transform each given row. + Attributes: + modified (list): A list of dicts, where each dict corresponds to a previously + validated and transformed row. The order of rows is maintained. 
-def make_dir(path): - if len(path) > 0: - try: - os.makedirs(path) - except OSError as exception: - if exception.errno != errno.EEXIST: - raise exception + """ + VALID_FORMATS = ( + ".fq.gz", + ".fastq.gz", + ) -def print_error(error, context="Line", context_str=""): - error_str = "ERROR: Please check samplesheet -> {}".format(error) - if context != "" and context_str != "": - error_str = "ERROR: Please check samplesheet -> {}\n{}: '{}'".format( - error, context.strip(), context_str.strip() - ) - print(error_str) - sys.exit(1) + def __init__( + self, + sample_col="sample", + first_col="fastq_1", + second_col="fastq_2", + single_col="single_end", + **kwargs, + ): + """ + Initialize the row checker with the expected column names. + Args: + sample_col (str): The name of the column that contains the sample name + (default "sample"). + first_col (str): The name of the column that contains the first (or only) + FASTQ file path (default "fastq_1"). + second_col (str): The name of the column that contains the second (if any) + FASTQ file path (default "fastq_2"). + single_col (str): The name of the new column that will be inserted and + records whether the sample contains single- or paired-end sequencing + reads (default "single_end"). + + """ + super().__init__(**kwargs) + self._sample_col = sample_col + self._first_col = first_col + self._second_col = second_col + self._single_col = single_col + self._seen = set() + self.modified = [] + + def validate_and_transform(self, row): + """ + Perform all validations on the given row and insert the read pairing status. + + Args: + row (dict): A mapping from column headers (keys) to elements of that row + (values). + + """ + self._validate_sample(row) + self._validate_first(row) + self._validate_second(row) + self._validate_pair(row) + self._seen.add((row[self._sample_col], row[self._first_col])) + self.modified.append(row) + + def _validate_sample(self, row): + """Assert that the sample name exists and convert spaces to underscores.""" + if len(row[self._sample_col]) <= 0: + raise AssertionError("Sample input is required.") + # Sanitize samples slightly. + row[self._sample_col] = row[self._sample_col].replace(" ", "_") + + def _validate_first(self, row): + """Assert that the first FASTQ entry is non-empty and has the right format.""" + if len(row[self._first_col]) <= 0: + raise AssertionError("At least the first FASTQ file is required.") + self._validate_fastq_format(row[self._first_col]) + + def _validate_second(self, row): + """Assert that the second FASTQ entry has the right format if it exists.""" + if len(row[self._second_col]) > 0: + self._validate_fastq_format(row[self._second_col]) + + def _validate_pair(self, row): + """Assert that read pairs have the same file extension. 
Report pair status.""" + if row[self._first_col] and row[self._second_col]: + row[self._single_col] = False + first_col_suffix = Path(row[self._first_col]).suffixes[-2:] + second_col_suffix = Path(row[self._second_col]).suffixes[-2:] + if first_col_suffix != second_col_suffix: + raise AssertionError("FASTQ pairs must have the same file extensions.") + else: + row[self._single_col] = True + + def _validate_fastq_format(self, filename): + """Assert that a given filename has one of the expected FASTQ extensions.""" + if not any(filename.endswith(extension) for extension in self.VALID_FORMATS): + raise AssertionError( + f"The FASTQ file has an unrecognized extension: {filename}\n" + f"It should be one of: {', '.join(self.VALID_FORMATS)}" + ) + + def validate_unique_samples(self): + """ + Assert that the combination of sample name and FASTQ filename is unique. + + In addition to the validation, also rename all samples to have a suffix of _T{n}, where n is the + number of times the same sample exist, but with different FASTQ files, e.g., multiple runs per experiment. + + """ + if len(self._seen) != len(self.modified): + raise AssertionError("The pair of sample name and FASTQ must be unique.") + seen = Counter() + for row in self.modified: + sample = row[self._sample_col] + seen[sample] += 1 + row[self._sample_col] = f"{sample}_T{seen[sample]}" + + +def read_head(handle, num_lines=10): + """Read the specified number of lines from the current position in the file.""" + lines = [] + for idx, line in enumerate(handle): + if idx == num_lines: + break + lines.append(line) + return "".join(lines) + + +def sniff_format(handle): + """ + Detect the tabular format. + + Args: + handle (text file): A handle to a `text file`_ object. The read position is + expected to be at the beginning (index 0). + + Returns: + csv.Dialect: The detected tabular format. + + .. _text file: + https://docs.python.org/3/glossary.html#term-text-file -# TODO nf-core: Update the check_samplesheet function -def check_samplesheet(file_in, file_out): """ - This function checks that the samplesheet follows the following structure: + peek = read_head(handle) + handle.seek(0) + sniffer = csv.Sniffer() + dialect = sniffer.sniff(peek) + return dialect - sample,fastq_1,fastq_2 - SAMPLE_PE,SAMPLE_PE_RUN1_1.fastq.gz,SAMPLE_PE_RUN1_2.fastq.gz - SAMPLE_PE,SAMPLE_PE_RUN2_1.fastq.gz,SAMPLE_PE_RUN2_2.fastq.gz - SAMPLE_SE,SAMPLE_SE_RUN1_1.fastq.gz, - For an example see: - https://raw.githubusercontent.com/nf-core/test-datasets/viralrecon/samplesheet/samplesheet_test_illumina_amplicon.csv +def check_samplesheet(file_in, file_out): """ + Check that the tabular samplesheet has the structure expected by nf-core pipelines. + + Validate the general shape of the table, expected columns, and each row. Also add + an additional column which records whether one or two FASTQ reads were found. + + Args: + file_in (pathlib.Path): The given tabular samplesheet. The format can be either + CSV, TSV, or any other format automatically recognized by ``csv.Sniffer``. + file_out (pathlib.Path): Where the validated and transformed samplesheet should + be created; always in CSV format. + + Example: + This function checks that the samplesheet follows the following structure, + see also the `viral recon samplesheet`_:: + + sample,fastq_1,fastq_2 + SAMPLE_PE,SAMPLE_PE_RUN1_1.fastq.gz,SAMPLE_PE_RUN1_2.fastq.gz + SAMPLE_PE,SAMPLE_PE_RUN2_1.fastq.gz,SAMPLE_PE_RUN2_2.fastq.gz + SAMPLE_SE,SAMPLE_SE_RUN1_1.fastq.gz, - sample_mapping_dict = {} - with open(file_in, "r") as fin: + .. 
_viral recon samplesheet: + https://raw.githubusercontent.com/nf-core/test-datasets/viralrecon/samplesheet/samplesheet_test_illumina_amplicon.csv - ## Check header - MIN_COLS = 2 - # TODO nf-core: Update the column names for the input samplesheet - HEADER = ["sample", "fastq_1", "fastq_2"] - header = [x.strip('"') for x in fin.readline().strip().split(",")] - if header[: len(HEADER)] != HEADER: - print("ERROR: Please check samplesheet header -> {} != {}".format(",".join(header), ",".join(HEADER))) + """ + required_columns = {"sample", "fastq_1", "fastq_2"} + # See https://docs.python.org/3.9/library/csv.html#id3 to read up on `newline=""`. + with file_in.open(newline="") as in_handle: + reader = csv.DictReader(in_handle, dialect=sniff_format(in_handle)) + # Validate the existence of the expected header columns. + if not required_columns.issubset(reader.fieldnames): + req_cols = ", ".join(required_columns) + logger.critical(f"The sample sheet **must** contain these column headers: {req_cols}.") sys.exit(1) + # Validate each row. + checker = RowChecker() + for i, row in enumerate(reader): + try: + checker.validate_and_transform(row) + except AssertionError as error: + logger.critical(f"{str(error)} On line {i + 2}.") + sys.exit(1) + checker.validate_unique_samples() + header = list(reader.fieldnames) + header.insert(1, "single_end") + # See https://docs.python.org/3.9/library/csv.html#id3 to read up on `newline=""`. + with file_out.open(mode="w", newline="") as out_handle: + writer = csv.DictWriter(out_handle, header, delimiter=",") + writer.writeheader() + for row in checker.modified: + writer.writerow(row) + + +def parse_args(argv=None): + """Define and immediately parse command line arguments.""" + parser = argparse.ArgumentParser( + description="Validate and transform a tabular samplesheet.", + epilog="Example: python check_samplesheet.py samplesheet.csv samplesheet.valid.csv", + ) + parser.add_argument( + "file_in", + metavar="FILE_IN", + type=Path, + help="Tabular input samplesheet in CSV or TSV format.", + ) + parser.add_argument( + "file_out", + metavar="FILE_OUT", + type=Path, + help="Transformed output samplesheet in CSV format.", + ) + parser.add_argument( + "-l", + "--log-level", + help="The desired log level (default WARNING).", + choices=("CRITICAL", "ERROR", "WARNING", "INFO", "DEBUG"), + default="WARNING", + ) + return parser.parse_args(argv) + - ## Check sample entries - for line in fin: - lspl = [x.strip().strip('"') for x in line.strip().split(",")] - - # Check valid number of columns per row - if len(lspl) < len(HEADER): - print_error( - "Invalid number of columns (minimum = {})!".format(len(HEADER)), - "Line", - line, - ) - num_cols = len([x for x in lspl if x]) - if num_cols < MIN_COLS: - print_error( - "Invalid number of populated columns (minimum = {})!".format(MIN_COLS), - "Line", - line, - ) - - ## Check sample name entries - sample, fastq_1, fastq_2 = lspl[: len(HEADER)] - sample = sample.replace(" ", "_") - if not sample: - print_error("Sample entry has not been specified!", "Line", line) - - ## Check FastQ file extension - for fastq in [fastq_1, fastq_2]: - if fastq: - if fastq.find(" ") != -1: - print_error("FastQ file contains spaces!", "Line", line) - if not fastq.endswith(".fastq.gz") and not fastq.endswith(".fq.gz"): - print_error( - "FastQ file does not have extension '.fastq.gz' or '.fq.gz'!", - "Line", - line, - ) - - ## Auto-detect paired-end/single-end - sample_info = [] ## [single_end, fastq_1, fastq_2] - if sample and fastq_1 and fastq_2: ## 
Paired-end short reads - sample_info = ["0", fastq_1, fastq_2] - elif sample and fastq_1 and not fastq_2: ## Single-end short reads - sample_info = ["1", fastq_1, fastq_2] - else: - print_error("Invalid combination of columns provided!", "Line", line) - - ## Create sample mapping dictionary = { sample: [ single_end, fastq_1, fastq_2 ] } - if sample not in sample_mapping_dict: - sample_mapping_dict[sample] = [sample_info] - else: - if sample_info in sample_mapping_dict[sample]: - print_error("Samplesheet contains duplicate rows!", "Line", line) - else: - sample_mapping_dict[sample].append(sample_info) - - ## Write validated samplesheet with appropriate columns - if len(sample_mapping_dict) > 0: - out_dir = os.path.dirname(file_out) - make_dir(out_dir) - with open(file_out, "w") as fout: - fout.write(",".join(["sample", "single_end", "fastq_1", "fastq_2"]) + "\n") - for sample in sorted(sample_mapping_dict.keys()): - - ## Check that multiple runs of the same sample are of the same datatype - if not all(x[0] == sample_mapping_dict[sample][0][0] for x in sample_mapping_dict[sample]): - print_error("Multiple runs of a sample must be of the same datatype!", "Sample: {}".format(sample)) - - for idx, val in enumerate(sample_mapping_dict[sample]): - fout.write(",".join(["{}_T{}".format(sample, idx + 1)] + val) + "\n") - else: - print_error("No entries to process!", "Samplesheet: {}".format(file_in)) - - -def main(args=None): - args = parse_args(args) - check_samplesheet(args.FILE_IN, args.FILE_OUT) +def main(argv=None): + """Coordinate argument parsing and program execution.""" + args = parse_args(argv) + logging.basicConfig(level=args.log_level, format="[%(levelname)s] %(message)s") + if not args.file_in.is_file(): + logger.error(f"The given input file {args.file_in} was not found!") + sys.exit(2) + args.file_out.parent.mkdir(parents=True, exist_ok=True) + check_samplesheet(args.file_in, args.file_out) if __name__ == "__main__": diff --git a/conf/base.config b/conf/base.config index 44d4f460..39352384 100644 --- a/conf/base.config +++ b/conf/base.config @@ -1,7 +1,7 @@ /* -======================================================================================== +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ nf-core/metatdenovo Nextflow base config file -======================================================================================== +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ A 'blank slate' config file, appropriate for general use on most high performance compute environments. Assumes that all software is installed and available on the PATH. Runs in `local` mode - all jobs will be run on the logged in environment. @@ -10,12 +10,11 @@ process { - // TODO nf-core: Check the defaults for all processes cpus = { check_max( 1 * task.attempt, 'cpus' ) } memory = { check_max( 6.GB * task.attempt, 'memory' ) } time = { check_max( 4.h * task.attempt, 'time' ) } - errorStrategy = { task.exitStatus in [143,137,104,134,139] ? 'retry' : 'finish' } + errorStrategy = { task.exitStatus in ((130..145) + 104) ? 'retry' : 'finish' } maxRetries = 1 maxErrors = '-1' @@ -24,8 +23,12 @@ process { // These labels are used and recognised by default in DSL2 files hosted on nf-core/modules. // If possible, it would be nice to keep the same label naming convention when // adding in your local modules too. - // TODO nf-core: Customise requirements for specific processes. 
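Since the rewritten `bin/check_samplesheet.py` shown a few hunks above exposes a small argparse CLI (two positional paths plus an optional `-l/--log-level`), it can also be run by hand when debugging a samplesheet. A minimal invocation sketch, with illustrative file names:

```bash
# Validate samplesheet.csv and write the transformed copy (a single_end column is
# added and sample names gain a _T{n} run suffix) to samplesheet.valid.csv.
# A validation failure is logged and the script exits with a non-zero status.
python bin/check_samplesheet.py --log-level INFO samplesheet.csv samplesheet.valid.csv
```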
// See https://www.nextflow.io/docs/latest/config.html#config-process-selectors + withLabel:process_single { + cpus = { check_max( 1 , 'cpus' ) } + memory = { check_max( 6.GB * task.attempt, 'memory' ) } + time = { check_max( 4.h * task.attempt, 'time' ) } + } withLabel:process_low { cpus = { check_max( 2 * task.attempt, 'cpus' ) } memory = { check_max( 12.GB * task.attempt, 'memory' ) } diff --git a/conf/igenomes.config b/conf/igenomes.config index 855948de..e69de29b 100644 --- a/conf/igenomes.config +++ b/conf/igenomes.config @@ -1,432 +0,0 @@ -/* -======================================================================================== - Nextflow config file for iGenomes paths -======================================================================================== - Defines reference genomes using iGenome paths. - Can be used by any config that customises the base path using: - $params.igenomes_base / --igenomes_base ----------------------------------------------------------------------------------------- -*/ - -params { - // illumina iGenomes reference file paths - genomes { - 'GRCh37' { - fasta = "${params.igenomes_base}/Homo_sapiens/Ensembl/GRCh37/Sequence/WholeGenomeFasta/genome.fa" - bwa = "${params.igenomes_base}/Homo_sapiens/Ensembl/GRCh37/Sequence/BWAIndex/genome.fa" - bowtie2 = "${params.igenomes_base}/Homo_sapiens/Ensembl/GRCh37/Sequence/Bowtie2Index/" - star = "${params.igenomes_base}/Homo_sapiens/Ensembl/GRCh37/Sequence/STARIndex/" - bismark = "${params.igenomes_base}/Homo_sapiens/Ensembl/GRCh37/Sequence/BismarkIndex/" - gtf = "${params.igenomes_base}/Homo_sapiens/Ensembl/GRCh37/Annotation/Genes/genes.gtf" - bed12 = "${params.igenomes_base}/Homo_sapiens/Ensembl/GRCh37/Annotation/Genes/genes.bed" - readme = "${params.igenomes_base}/Homo_sapiens/Ensembl/GRCh37/Annotation/README.txt" - mito_name = "MT" - macs_gsize = "2.7e9" - blacklist = "${projectDir}/assets/blacklists/GRCh37-blacklist.bed" - } - 'GRCh38' { - fasta = "${params.igenomes_base}/Homo_sapiens/NCBI/GRCh38/Sequence/WholeGenomeFasta/genome.fa" - bwa = "${params.igenomes_base}/Homo_sapiens/NCBI/GRCh38/Sequence/BWAIndex/genome.fa" - bowtie2 = "${params.igenomes_base}/Homo_sapiens/NCBI/GRCh38/Sequence/Bowtie2Index/" - star = "${params.igenomes_base}/Homo_sapiens/NCBI/GRCh38/Sequence/STARIndex/" - bismark = "${params.igenomes_base}/Homo_sapiens/NCBI/GRCh38/Sequence/BismarkIndex/" - gtf = "${params.igenomes_base}/Homo_sapiens/NCBI/GRCh38/Annotation/Genes/genes.gtf" - bed12 = "${params.igenomes_base}/Homo_sapiens/NCBI/GRCh38/Annotation/Genes/genes.bed" - mito_name = "chrM" - macs_gsize = "2.7e9" - blacklist = "${projectDir}/assets/blacklists/hg38-blacklist.bed" - } - 'GRCm38' { - fasta = "${params.igenomes_base}/Mus_musculus/Ensembl/GRCm38/Sequence/WholeGenomeFasta/genome.fa" - bwa = "${params.igenomes_base}/Mus_musculus/Ensembl/GRCm38/Sequence/BWAIndex/genome.fa" - bowtie2 = "${params.igenomes_base}/Mus_musculus/Ensembl/GRCm38/Sequence/Bowtie2Index/" - star = "${params.igenomes_base}/Mus_musculus/Ensembl/GRCm38/Sequence/STARIndex/" - bismark = "${params.igenomes_base}/Mus_musculus/Ensembl/GRCm38/Sequence/BismarkIndex/" - gtf = "${params.igenomes_base}/Mus_musculus/Ensembl/GRCm38/Annotation/Genes/genes.gtf" - bed12 = "${params.igenomes_base}/Mus_musculus/Ensembl/GRCm38/Annotation/Genes/genes.bed" - readme = "${params.igenomes_base}/Mus_musculus/Ensembl/GRCm38/Annotation/README.txt" - mito_name = "MT" - macs_gsize = "1.87e9" - blacklist = "${projectDir}/assets/blacklists/GRCm38-blacklist.bed" - } - 'TAIR10' { - fasta = 
"${params.igenomes_base}/Arabidopsis_thaliana/Ensembl/TAIR10/Sequence/WholeGenomeFasta/genome.fa" - bwa = "${params.igenomes_base}/Arabidopsis_thaliana/Ensembl/TAIR10/Sequence/BWAIndex/genome.fa" - bowtie2 = "${params.igenomes_base}/Arabidopsis_thaliana/Ensembl/TAIR10/Sequence/Bowtie2Index/" - star = "${params.igenomes_base}/Arabidopsis_thaliana/Ensembl/TAIR10/Sequence/STARIndex/" - bismark = "${params.igenomes_base}/Arabidopsis_thaliana/Ensembl/TAIR10/Sequence/BismarkIndex/" - gtf = "${params.igenomes_base}/Arabidopsis_thaliana/Ensembl/TAIR10/Annotation/Genes/genes.gtf" - bed12 = "${params.igenomes_base}/Arabidopsis_thaliana/Ensembl/TAIR10/Annotation/Genes/genes.bed" - readme = "${params.igenomes_base}/Arabidopsis_thaliana/Ensembl/TAIR10/Annotation/README.txt" - mito_name = "Mt" - } - 'EB2' { - fasta = "${params.igenomes_base}/Bacillus_subtilis_168/Ensembl/EB2/Sequence/WholeGenomeFasta/genome.fa" - bwa = "${params.igenomes_base}/Bacillus_subtilis_168/Ensembl/EB2/Sequence/BWAIndex/genome.fa" - bowtie2 = "${params.igenomes_base}/Bacillus_subtilis_168/Ensembl/EB2/Sequence/Bowtie2Index/" - star = "${params.igenomes_base}/Bacillus_subtilis_168/Ensembl/EB2/Sequence/STARIndex/" - bismark = "${params.igenomes_base}/Bacillus_subtilis_168/Ensembl/EB2/Sequence/BismarkIndex/" - gtf = "${params.igenomes_base}/Bacillus_subtilis_168/Ensembl/EB2/Annotation/Genes/genes.gtf" - bed12 = "${params.igenomes_base}/Bacillus_subtilis_168/Ensembl/EB2/Annotation/Genes/genes.bed" - readme = "${params.igenomes_base}/Bacillus_subtilis_168/Ensembl/EB2/Annotation/README.txt" - } - 'UMD3.1' { - fasta = "${params.igenomes_base}/Bos_taurus/Ensembl/UMD3.1/Sequence/WholeGenomeFasta/genome.fa" - bwa = "${params.igenomes_base}/Bos_taurus/Ensembl/UMD3.1/Sequence/BWAIndex/genome.fa" - bowtie2 = "${params.igenomes_base}/Bos_taurus/Ensembl/UMD3.1/Sequence/Bowtie2Index/" - star = "${params.igenomes_base}/Bos_taurus/Ensembl/UMD3.1/Sequence/STARIndex/" - bismark = "${params.igenomes_base}/Bos_taurus/Ensembl/UMD3.1/Sequence/BismarkIndex/" - gtf = "${params.igenomes_base}/Bos_taurus/Ensembl/UMD3.1/Annotation/Genes/genes.gtf" - bed12 = "${params.igenomes_base}/Bos_taurus/Ensembl/UMD3.1/Annotation/Genes/genes.bed" - readme = "${params.igenomes_base}/Bos_taurus/Ensembl/UMD3.1/Annotation/README.txt" - mito_name = "MT" - } - 'WBcel235' { - fasta = "${params.igenomes_base}/Caenorhabditis_elegans/Ensembl/WBcel235/Sequence/WholeGenomeFasta/genome.fa" - bwa = "${params.igenomes_base}/Caenorhabditis_elegans/Ensembl/WBcel235/Sequence/BWAIndex/genome.fa" - bowtie2 = "${params.igenomes_base}/Caenorhabditis_elegans/Ensembl/WBcel235/Sequence/Bowtie2Index/" - star = "${params.igenomes_base}/Caenorhabditis_elegans/Ensembl/WBcel235/Sequence/STARIndex/" - bismark = "${params.igenomes_base}/Caenorhabditis_elegans/Ensembl/WBcel235/Sequence/BismarkIndex/" - gtf = "${params.igenomes_base}/Caenorhabditis_elegans/Ensembl/WBcel235/Annotation/Genes/genes.gtf" - bed12 = "${params.igenomes_base}/Caenorhabditis_elegans/Ensembl/WBcel235/Annotation/Genes/genes.bed" - mito_name = "MtDNA" - macs_gsize = "9e7" - } - 'CanFam3.1' { - fasta = "${params.igenomes_base}/Canis_familiaris/Ensembl/CanFam3.1/Sequence/WholeGenomeFasta/genome.fa" - bwa = "${params.igenomes_base}/Canis_familiaris/Ensembl/CanFam3.1/Sequence/BWAIndex/genome.fa" - bowtie2 = "${params.igenomes_base}/Canis_familiaris/Ensembl/CanFam3.1/Sequence/Bowtie2Index/" - star = "${params.igenomes_base}/Canis_familiaris/Ensembl/CanFam3.1/Sequence/STARIndex/" - bismark = 
"${params.igenomes_base}/Canis_familiaris/Ensembl/CanFam3.1/Sequence/BismarkIndex/" - gtf = "${params.igenomes_base}/Canis_familiaris/Ensembl/CanFam3.1/Annotation/Genes/genes.gtf" - bed12 = "${params.igenomes_base}/Canis_familiaris/Ensembl/CanFam3.1/Annotation/Genes/genes.bed" - readme = "${params.igenomes_base}/Canis_familiaris/Ensembl/CanFam3.1/Annotation/README.txt" - mito_name = "MT" - } - 'GRCz10' { - fasta = "${params.igenomes_base}/Danio_rerio/Ensembl/GRCz10/Sequence/WholeGenomeFasta/genome.fa" - bwa = "${params.igenomes_base}/Danio_rerio/Ensembl/GRCz10/Sequence/BWAIndex/genome.fa" - bowtie2 = "${params.igenomes_base}/Danio_rerio/Ensembl/GRCz10/Sequence/Bowtie2Index/" - star = "${params.igenomes_base}/Danio_rerio/Ensembl/GRCz10/Sequence/STARIndex/" - bismark = "${params.igenomes_base}/Danio_rerio/Ensembl/GRCz10/Sequence/BismarkIndex/" - gtf = "${params.igenomes_base}/Danio_rerio/Ensembl/GRCz10/Annotation/Genes/genes.gtf" - bed12 = "${params.igenomes_base}/Danio_rerio/Ensembl/GRCz10/Annotation/Genes/genes.bed" - mito_name = "MT" - } - 'BDGP6' { - fasta = "${params.igenomes_base}/Drosophila_melanogaster/Ensembl/BDGP6/Sequence/WholeGenomeFasta/genome.fa" - bwa = "${params.igenomes_base}/Drosophila_melanogaster/Ensembl/BDGP6/Sequence/BWAIndex/genome.fa" - bowtie2 = "${params.igenomes_base}/Drosophila_melanogaster/Ensembl/BDGP6/Sequence/Bowtie2Index/" - star = "${params.igenomes_base}/Drosophila_melanogaster/Ensembl/BDGP6/Sequence/STARIndex/" - bismark = "${params.igenomes_base}/Drosophila_melanogaster/Ensembl/BDGP6/Sequence/BismarkIndex/" - gtf = "${params.igenomes_base}/Drosophila_melanogaster/Ensembl/BDGP6/Annotation/Genes/genes.gtf" - bed12 = "${params.igenomes_base}/Drosophila_melanogaster/Ensembl/BDGP6/Annotation/Genes/genes.bed" - mito_name = "M" - macs_gsize = "1.2e8" - } - 'EquCab2' { - fasta = "${params.igenomes_base}/Equus_caballus/Ensembl/EquCab2/Sequence/WholeGenomeFasta/genome.fa" - bwa = "${params.igenomes_base}/Equus_caballus/Ensembl/EquCab2/Sequence/BWAIndex/genome.fa" - bowtie2 = "${params.igenomes_base}/Equus_caballus/Ensembl/EquCab2/Sequence/Bowtie2Index/" - star = "${params.igenomes_base}/Equus_caballus/Ensembl/EquCab2/Sequence/STARIndex/" - bismark = "${params.igenomes_base}/Equus_caballus/Ensembl/EquCab2/Sequence/BismarkIndex/" - gtf = "${params.igenomes_base}/Equus_caballus/Ensembl/EquCab2/Annotation/Genes/genes.gtf" - bed12 = "${params.igenomes_base}/Equus_caballus/Ensembl/EquCab2/Annotation/Genes/genes.bed" - readme = "${params.igenomes_base}/Equus_caballus/Ensembl/EquCab2/Annotation/README.txt" - mito_name = "MT" - } - 'EB1' { - fasta = "${params.igenomes_base}/Escherichia_coli_K_12_DH10B/Ensembl/EB1/Sequence/WholeGenomeFasta/genome.fa" - bwa = "${params.igenomes_base}/Escherichia_coli_K_12_DH10B/Ensembl/EB1/Sequence/BWAIndex/genome.fa" - bowtie2 = "${params.igenomes_base}/Escherichia_coli_K_12_DH10B/Ensembl/EB1/Sequence/Bowtie2Index/" - star = "${params.igenomes_base}/Escherichia_coli_K_12_DH10B/Ensembl/EB1/Sequence/STARIndex/" - bismark = "${params.igenomes_base}/Escherichia_coli_K_12_DH10B/Ensembl/EB1/Sequence/BismarkIndex/" - gtf = "${params.igenomes_base}/Escherichia_coli_K_12_DH10B/Ensembl/EB1/Annotation/Genes/genes.gtf" - bed12 = "${params.igenomes_base}/Escherichia_coli_K_12_DH10B/Ensembl/EB1/Annotation/Genes/genes.bed" - readme = "${params.igenomes_base}/Escherichia_coli_K_12_DH10B/Ensembl/EB1/Annotation/README.txt" - } - 'Galgal4' { - fasta = "${params.igenomes_base}/Gallus_gallus/Ensembl/Galgal4/Sequence/WholeGenomeFasta/genome.fa" - bwa = 
"${params.igenomes_base}/Gallus_gallus/Ensembl/Galgal4/Sequence/BWAIndex/genome.fa" - bowtie2 = "${params.igenomes_base}/Gallus_gallus/Ensembl/Galgal4/Sequence/Bowtie2Index/" - star = "${params.igenomes_base}/Gallus_gallus/Ensembl/Galgal4/Sequence/STARIndex/" - bismark = "${params.igenomes_base}/Gallus_gallus/Ensembl/Galgal4/Sequence/BismarkIndex/" - gtf = "${params.igenomes_base}/Gallus_gallus/Ensembl/Galgal4/Annotation/Genes/genes.gtf" - bed12 = "${params.igenomes_base}/Gallus_gallus/Ensembl/Galgal4/Annotation/Genes/genes.bed" - mito_name = "MT" - } - 'Gm01' { - fasta = "${params.igenomes_base}/Glycine_max/Ensembl/Gm01/Sequence/WholeGenomeFasta/genome.fa" - bwa = "${params.igenomes_base}/Glycine_max/Ensembl/Gm01/Sequence/BWAIndex/genome.fa" - bowtie2 = "${params.igenomes_base}/Glycine_max/Ensembl/Gm01/Sequence/Bowtie2Index/" - star = "${params.igenomes_base}/Glycine_max/Ensembl/Gm01/Sequence/STARIndex/" - bismark = "${params.igenomes_base}/Glycine_max/Ensembl/Gm01/Sequence/BismarkIndex/" - gtf = "${params.igenomes_base}/Glycine_max/Ensembl/Gm01/Annotation/Genes/genes.gtf" - bed12 = "${params.igenomes_base}/Glycine_max/Ensembl/Gm01/Annotation/Genes/genes.bed" - readme = "${params.igenomes_base}/Glycine_max/Ensembl/Gm01/Annotation/README.txt" - } - 'Mmul_1' { - fasta = "${params.igenomes_base}/Macaca_mulatta/Ensembl/Mmul_1/Sequence/WholeGenomeFasta/genome.fa" - bwa = "${params.igenomes_base}/Macaca_mulatta/Ensembl/Mmul_1/Sequence/BWAIndex/genome.fa" - bowtie2 = "${params.igenomes_base}/Macaca_mulatta/Ensembl/Mmul_1/Sequence/Bowtie2Index/" - star = "${params.igenomes_base}/Macaca_mulatta/Ensembl/Mmul_1/Sequence/STARIndex/" - bismark = "${params.igenomes_base}/Macaca_mulatta/Ensembl/Mmul_1/Sequence/BismarkIndex/" - gtf = "${params.igenomes_base}/Macaca_mulatta/Ensembl/Mmul_1/Annotation/Genes/genes.gtf" - bed12 = "${params.igenomes_base}/Macaca_mulatta/Ensembl/Mmul_1/Annotation/Genes/genes.bed" - readme = "${params.igenomes_base}/Macaca_mulatta/Ensembl/Mmul_1/Annotation/README.txt" - mito_name = "MT" - } - 'IRGSP-1.0' { - fasta = "${params.igenomes_base}/Oryza_sativa_japonica/Ensembl/IRGSP-1.0/Sequence/WholeGenomeFasta/genome.fa" - bwa = "${params.igenomes_base}/Oryza_sativa_japonica/Ensembl/IRGSP-1.0/Sequence/BWAIndex/genome.fa" - bowtie2 = "${params.igenomes_base}/Oryza_sativa_japonica/Ensembl/IRGSP-1.0/Sequence/Bowtie2Index/" - star = "${params.igenomes_base}/Oryza_sativa_japonica/Ensembl/IRGSP-1.0/Sequence/STARIndex/" - bismark = "${params.igenomes_base}/Oryza_sativa_japonica/Ensembl/IRGSP-1.0/Sequence/BismarkIndex/" - gtf = "${params.igenomes_base}/Oryza_sativa_japonica/Ensembl/IRGSP-1.0/Annotation/Genes/genes.gtf" - bed12 = "${params.igenomes_base}/Oryza_sativa_japonica/Ensembl/IRGSP-1.0/Annotation/Genes/genes.bed" - mito_name = "Mt" - } - 'CHIMP2.1.4' { - fasta = "${params.igenomes_base}/Pan_troglodytes/Ensembl/CHIMP2.1.4/Sequence/WholeGenomeFasta/genome.fa" - bwa = "${params.igenomes_base}/Pan_troglodytes/Ensembl/CHIMP2.1.4/Sequence/BWAIndex/genome.fa" - bowtie2 = "${params.igenomes_base}/Pan_troglodytes/Ensembl/CHIMP2.1.4/Sequence/Bowtie2Index/" - star = "${params.igenomes_base}/Pan_troglodytes/Ensembl/CHIMP2.1.4/Sequence/STARIndex/" - bismark = "${params.igenomes_base}/Pan_troglodytes/Ensembl/CHIMP2.1.4/Sequence/BismarkIndex/" - gtf = "${params.igenomes_base}/Pan_troglodytes/Ensembl/CHIMP2.1.4/Annotation/Genes/genes.gtf" - bed12 = "${params.igenomes_base}/Pan_troglodytes/Ensembl/CHIMP2.1.4/Annotation/Genes/genes.bed" - readme = 
"${params.igenomes_base}/Pan_troglodytes/Ensembl/CHIMP2.1.4/Annotation/README.txt" - mito_name = "MT" - } - 'Rnor_5.0' { - fasta = "${params.igenomes_base}/Rattus_norvegicus/Ensembl/Rnor_5.0/Sequence/WholeGenomeFasta/genome.fa" - bwa = "${params.igenomes_base}/Rattus_norvegicus/Ensembl/Rnor_5.0/Sequence/BWAIndex/genome.fa" - bowtie2 = "${params.igenomes_base}/Rattus_norvegicus/Ensembl/Rnor_5.0/Sequence/Bowtie2Index/" - star = "${params.igenomes_base}/Rattus_norvegicus/Ensembl/Rnor_5.0/Sequence/STARIndex/" - bismark = "${params.igenomes_base}/Rattus_norvegicus/Ensembl/Rnor_5.0/Sequence/BismarkIndex/" - gtf = "${params.igenomes_base}/Rattus_norvegicus/Ensembl/Rnor_5.0/Annotation/Genes/genes.gtf" - bed12 = "${params.igenomes_base}/Rattus_norvegicus/Ensembl/Rnor_5.0/Annotation/Genes/genes.bed" - mito_name = "MT" - } - 'Rnor_6.0' { - fasta = "${params.igenomes_base}/Rattus_norvegicus/Ensembl/Rnor_6.0/Sequence/WholeGenomeFasta/genome.fa" - bwa = "${params.igenomes_base}/Rattus_norvegicus/Ensembl/Rnor_6.0/Sequence/BWAIndex/genome.fa" - bowtie2 = "${params.igenomes_base}/Rattus_norvegicus/Ensembl/Rnor_6.0/Sequence/Bowtie2Index/" - star = "${params.igenomes_base}/Rattus_norvegicus/Ensembl/Rnor_6.0/Sequence/STARIndex/" - bismark = "${params.igenomes_base}/Rattus_norvegicus/Ensembl/Rnor_6.0/Sequence/BismarkIndex/" - gtf = "${params.igenomes_base}/Rattus_norvegicus/Ensembl/Rnor_6.0/Annotation/Genes/genes.gtf" - bed12 = "${params.igenomes_base}/Rattus_norvegicus/Ensembl/Rnor_6.0/Annotation/Genes/genes.bed" - mito_name = "MT" - } - 'R64-1-1' { - fasta = "${params.igenomes_base}/Saccharomyces_cerevisiae/Ensembl/R64-1-1/Sequence/WholeGenomeFasta/genome.fa" - bwa = "${params.igenomes_base}/Saccharomyces_cerevisiae/Ensembl/R64-1-1/Sequence/BWAIndex/genome.fa" - bowtie2 = "${params.igenomes_base}/Saccharomyces_cerevisiae/Ensembl/R64-1-1/Sequence/Bowtie2Index/" - star = "${params.igenomes_base}/Saccharomyces_cerevisiae/Ensembl/R64-1-1/Sequence/STARIndex/" - bismark = "${params.igenomes_base}/Saccharomyces_cerevisiae/Ensembl/R64-1-1/Sequence/BismarkIndex/" - gtf = "${params.igenomes_base}/Saccharomyces_cerevisiae/Ensembl/R64-1-1/Annotation/Genes/genes.gtf" - bed12 = "${params.igenomes_base}/Saccharomyces_cerevisiae/Ensembl/R64-1-1/Annotation/Genes/genes.bed" - mito_name = "MT" - macs_gsize = "1.2e7" - } - 'EF2' { - fasta = "${params.igenomes_base}/Schizosaccharomyces_pombe/Ensembl/EF2/Sequence/WholeGenomeFasta/genome.fa" - bwa = "${params.igenomes_base}/Schizosaccharomyces_pombe/Ensembl/EF2/Sequence/BWAIndex/genome.fa" - bowtie2 = "${params.igenomes_base}/Schizosaccharomyces_pombe/Ensembl/EF2/Sequence/Bowtie2Index/" - star = "${params.igenomes_base}/Schizosaccharomyces_pombe/Ensembl/EF2/Sequence/STARIndex/" - bismark = "${params.igenomes_base}/Schizosaccharomyces_pombe/Ensembl/EF2/Sequence/BismarkIndex/" - gtf = "${params.igenomes_base}/Schizosaccharomyces_pombe/Ensembl/EF2/Annotation/Genes/genes.gtf" - bed12 = "${params.igenomes_base}/Schizosaccharomyces_pombe/Ensembl/EF2/Annotation/Genes/genes.bed" - readme = "${params.igenomes_base}/Schizosaccharomyces_pombe/Ensembl/EF2/Annotation/README.txt" - mito_name = "MT" - macs_gsize = "1.21e7" - } - 'Sbi1' { - fasta = "${params.igenomes_base}/Sorghum_bicolor/Ensembl/Sbi1/Sequence/WholeGenomeFasta/genome.fa" - bwa = "${params.igenomes_base}/Sorghum_bicolor/Ensembl/Sbi1/Sequence/BWAIndex/genome.fa" - bowtie2 = "${params.igenomes_base}/Sorghum_bicolor/Ensembl/Sbi1/Sequence/Bowtie2Index/" - star = 
"${params.igenomes_base}/Sorghum_bicolor/Ensembl/Sbi1/Sequence/STARIndex/" - bismark = "${params.igenomes_base}/Sorghum_bicolor/Ensembl/Sbi1/Sequence/BismarkIndex/" - gtf = "${params.igenomes_base}/Sorghum_bicolor/Ensembl/Sbi1/Annotation/Genes/genes.gtf" - bed12 = "${params.igenomes_base}/Sorghum_bicolor/Ensembl/Sbi1/Annotation/Genes/genes.bed" - readme = "${params.igenomes_base}/Sorghum_bicolor/Ensembl/Sbi1/Annotation/README.txt" - } - 'Sscrofa10.2' { - fasta = "${params.igenomes_base}/Sus_scrofa/Ensembl/Sscrofa10.2/Sequence/WholeGenomeFasta/genome.fa" - bwa = "${params.igenomes_base}/Sus_scrofa/Ensembl/Sscrofa10.2/Sequence/BWAIndex/genome.fa" - bowtie2 = "${params.igenomes_base}/Sus_scrofa/Ensembl/Sscrofa10.2/Sequence/Bowtie2Index/" - star = "${params.igenomes_base}/Sus_scrofa/Ensembl/Sscrofa10.2/Sequence/STARIndex/" - bismark = "${params.igenomes_base}/Sus_scrofa/Ensembl/Sscrofa10.2/Sequence/BismarkIndex/" - gtf = "${params.igenomes_base}/Sus_scrofa/Ensembl/Sscrofa10.2/Annotation/Genes/genes.gtf" - bed12 = "${params.igenomes_base}/Sus_scrofa/Ensembl/Sscrofa10.2/Annotation/Genes/genes.bed" - readme = "${params.igenomes_base}/Sus_scrofa/Ensembl/Sscrofa10.2/Annotation/README.txt" - mito_name = "MT" - } - 'AGPv3' { - fasta = "${params.igenomes_base}/Zea_mays/Ensembl/AGPv3/Sequence/WholeGenomeFasta/genome.fa" - bwa = "${params.igenomes_base}/Zea_mays/Ensembl/AGPv3/Sequence/BWAIndex/genome.fa" - bowtie2 = "${params.igenomes_base}/Zea_mays/Ensembl/AGPv3/Sequence/Bowtie2Index/" - star = "${params.igenomes_base}/Zea_mays/Ensembl/AGPv3/Sequence/STARIndex/" - bismark = "${params.igenomes_base}/Zea_mays/Ensembl/AGPv3/Sequence/BismarkIndex/" - gtf = "${params.igenomes_base}/Zea_mays/Ensembl/AGPv3/Annotation/Genes/genes.gtf" - bed12 = "${params.igenomes_base}/Zea_mays/Ensembl/AGPv3/Annotation/Genes/genes.bed" - mito_name = "Mt" - } - 'hg38' { - fasta = "${params.igenomes_base}/Homo_sapiens/UCSC/hg38/Sequence/WholeGenomeFasta/genome.fa" - bwa = "${params.igenomes_base}/Homo_sapiens/UCSC/hg38/Sequence/BWAIndex/genome.fa" - bowtie2 = "${params.igenomes_base}/Homo_sapiens/UCSC/hg38/Sequence/Bowtie2Index/" - star = "${params.igenomes_base}/Homo_sapiens/UCSC/hg38/Sequence/STARIndex/" - bismark = "${params.igenomes_base}/Homo_sapiens/UCSC/hg38/Sequence/BismarkIndex/" - gtf = "${params.igenomes_base}/Homo_sapiens/UCSC/hg38/Annotation/Genes/genes.gtf" - bed12 = "${params.igenomes_base}/Homo_sapiens/UCSC/hg38/Annotation/Genes/genes.bed" - mito_name = "chrM" - macs_gsize = "2.7e9" - blacklist = "${projectDir}/assets/blacklists/hg38-blacklist.bed" - } - 'hg19' { - fasta = "${params.igenomes_base}/Homo_sapiens/UCSC/hg19/Sequence/WholeGenomeFasta/genome.fa" - bwa = "${params.igenomes_base}/Homo_sapiens/UCSC/hg19/Sequence/BWAIndex/genome.fa" - bowtie2 = "${params.igenomes_base}/Homo_sapiens/UCSC/hg19/Sequence/Bowtie2Index/" - star = "${params.igenomes_base}/Homo_sapiens/UCSC/hg19/Sequence/STARIndex/" - bismark = "${params.igenomes_base}/Homo_sapiens/UCSC/hg19/Sequence/BismarkIndex/" - gtf = "${params.igenomes_base}/Homo_sapiens/UCSC/hg19/Annotation/Genes/genes.gtf" - bed12 = "${params.igenomes_base}/Homo_sapiens/UCSC/hg19/Annotation/Genes/genes.bed" - readme = "${params.igenomes_base}/Homo_sapiens/UCSC/hg19/Annotation/README.txt" - mito_name = "chrM" - macs_gsize = "2.7e9" - blacklist = "${projectDir}/assets/blacklists/hg19-blacklist.bed" - } - 'mm10' { - fasta = "${params.igenomes_base}/Mus_musculus/UCSC/mm10/Sequence/WholeGenomeFasta/genome.fa" - bwa = 
"${params.igenomes_base}/Mus_musculus/UCSC/mm10/Sequence/BWAIndex/genome.fa" - bowtie2 = "${params.igenomes_base}/Mus_musculus/UCSC/mm10/Sequence/Bowtie2Index/" - star = "${params.igenomes_base}/Mus_musculus/UCSC/mm10/Sequence/STARIndex/" - bismark = "${params.igenomes_base}/Mus_musculus/UCSC/mm10/Sequence/BismarkIndex/" - gtf = "${params.igenomes_base}/Mus_musculus/UCSC/mm10/Annotation/Genes/genes.gtf" - bed12 = "${params.igenomes_base}/Mus_musculus/UCSC/mm10/Annotation/Genes/genes.bed" - readme = "${params.igenomes_base}/Mus_musculus/UCSC/mm10/Annotation/README.txt" - mito_name = "chrM" - macs_gsize = "1.87e9" - blacklist = "${projectDir}/assets/blacklists/mm10-blacklist.bed" - } - 'bosTau8' { - fasta = "${params.igenomes_base}/Bos_taurus/UCSC/bosTau8/Sequence/WholeGenomeFasta/genome.fa" - bwa = "${params.igenomes_base}/Bos_taurus/UCSC/bosTau8/Sequence/BWAIndex/genome.fa" - bowtie2 = "${params.igenomes_base}/Bos_taurus/UCSC/bosTau8/Sequence/Bowtie2Index/" - star = "${params.igenomes_base}/Bos_taurus/UCSC/bosTau8/Sequence/STARIndex/" - bismark = "${params.igenomes_base}/Bos_taurus/UCSC/bosTau8/Sequence/BismarkIndex/" - gtf = "${params.igenomes_base}/Bos_taurus/UCSC/bosTau8/Annotation/Genes/genes.gtf" - bed12 = "${params.igenomes_base}/Bos_taurus/UCSC/bosTau8/Annotation/Genes/genes.bed" - mito_name = "chrM" - } - 'ce10' { - fasta = "${params.igenomes_base}/Caenorhabditis_elegans/UCSC/ce10/Sequence/WholeGenomeFasta/genome.fa" - bwa = "${params.igenomes_base}/Caenorhabditis_elegans/UCSC/ce10/Sequence/BWAIndex/genome.fa" - bowtie2 = "${params.igenomes_base}/Caenorhabditis_elegans/UCSC/ce10/Sequence/Bowtie2Index/" - star = "${params.igenomes_base}/Caenorhabditis_elegans/UCSC/ce10/Sequence/STARIndex/" - bismark = "${params.igenomes_base}/Caenorhabditis_elegans/UCSC/ce10/Sequence/BismarkIndex/" - gtf = "${params.igenomes_base}/Caenorhabditis_elegans/UCSC/ce10/Annotation/Genes/genes.gtf" - bed12 = "${params.igenomes_base}/Caenorhabditis_elegans/UCSC/ce10/Annotation/Genes/genes.bed" - readme = "${params.igenomes_base}/Caenorhabditis_elegans/UCSC/ce10/Annotation/README.txt" - mito_name = "chrM" - macs_gsize = "9e7" - } - 'canFam3' { - fasta = "${params.igenomes_base}/Canis_familiaris/UCSC/canFam3/Sequence/WholeGenomeFasta/genome.fa" - bwa = "${params.igenomes_base}/Canis_familiaris/UCSC/canFam3/Sequence/BWAIndex/genome.fa" - bowtie2 = "${params.igenomes_base}/Canis_familiaris/UCSC/canFam3/Sequence/Bowtie2Index/" - star = "${params.igenomes_base}/Canis_familiaris/UCSC/canFam3/Sequence/STARIndex/" - bismark = "${params.igenomes_base}/Canis_familiaris/UCSC/canFam3/Sequence/BismarkIndex/" - gtf = "${params.igenomes_base}/Canis_familiaris/UCSC/canFam3/Annotation/Genes/genes.gtf" - bed12 = "${params.igenomes_base}/Canis_familiaris/UCSC/canFam3/Annotation/Genes/genes.bed" - readme = "${params.igenomes_base}/Canis_familiaris/UCSC/canFam3/Annotation/README.txt" - mito_name = "chrM" - } - 'danRer10' { - fasta = "${params.igenomes_base}/Danio_rerio/UCSC/danRer10/Sequence/WholeGenomeFasta/genome.fa" - bwa = "${params.igenomes_base}/Danio_rerio/UCSC/danRer10/Sequence/BWAIndex/genome.fa" - bowtie2 = "${params.igenomes_base}/Danio_rerio/UCSC/danRer10/Sequence/Bowtie2Index/" - star = "${params.igenomes_base}/Danio_rerio/UCSC/danRer10/Sequence/STARIndex/" - bismark = "${params.igenomes_base}/Danio_rerio/UCSC/danRer10/Sequence/BismarkIndex/" - gtf = "${params.igenomes_base}/Danio_rerio/UCSC/danRer10/Annotation/Genes/genes.gtf" - bed12 = "${params.igenomes_base}/Danio_rerio/UCSC/danRer10/Annotation/Genes/genes.bed" - 
mito_name = "chrM" - macs_gsize = "1.37e9" - } - 'dm6' { - fasta = "${params.igenomes_base}/Drosophila_melanogaster/UCSC/dm6/Sequence/WholeGenomeFasta/genome.fa" - bwa = "${params.igenomes_base}/Drosophila_melanogaster/UCSC/dm6/Sequence/BWAIndex/genome.fa" - bowtie2 = "${params.igenomes_base}/Drosophila_melanogaster/UCSC/dm6/Sequence/Bowtie2Index/" - star = "${params.igenomes_base}/Drosophila_melanogaster/UCSC/dm6/Sequence/STARIndex/" - bismark = "${params.igenomes_base}/Drosophila_melanogaster/UCSC/dm6/Sequence/BismarkIndex/" - gtf = "${params.igenomes_base}/Drosophila_melanogaster/UCSC/dm6/Annotation/Genes/genes.gtf" - bed12 = "${params.igenomes_base}/Drosophila_melanogaster/UCSC/dm6/Annotation/Genes/genes.bed" - mito_name = "chrM" - macs_gsize = "1.2e8" - } - 'equCab2' { - fasta = "${params.igenomes_base}/Equus_caballus/UCSC/equCab2/Sequence/WholeGenomeFasta/genome.fa" - bwa = "${params.igenomes_base}/Equus_caballus/UCSC/equCab2/Sequence/BWAIndex/genome.fa" - bowtie2 = "${params.igenomes_base}/Equus_caballus/UCSC/equCab2/Sequence/Bowtie2Index/" - star = "${params.igenomes_base}/Equus_caballus/UCSC/equCab2/Sequence/STARIndex/" - bismark = "${params.igenomes_base}/Equus_caballus/UCSC/equCab2/Sequence/BismarkIndex/" - gtf = "${params.igenomes_base}/Equus_caballus/UCSC/equCab2/Annotation/Genes/genes.gtf" - bed12 = "${params.igenomes_base}/Equus_caballus/UCSC/equCab2/Annotation/Genes/genes.bed" - readme = "${params.igenomes_base}/Equus_caballus/UCSC/equCab2/Annotation/README.txt" - mito_name = "chrM" - } - 'galGal4' { - fasta = "${params.igenomes_base}/Gallus_gallus/UCSC/galGal4/Sequence/WholeGenomeFasta/genome.fa" - bwa = "${params.igenomes_base}/Gallus_gallus/UCSC/galGal4/Sequence/BWAIndex/genome.fa" - bowtie2 = "${params.igenomes_base}/Gallus_gallus/UCSC/galGal4/Sequence/Bowtie2Index/" - star = "${params.igenomes_base}/Gallus_gallus/UCSC/galGal4/Sequence/STARIndex/" - bismark = "${params.igenomes_base}/Gallus_gallus/UCSC/galGal4/Sequence/BismarkIndex/" - gtf = "${params.igenomes_base}/Gallus_gallus/UCSC/galGal4/Annotation/Genes/genes.gtf" - bed12 = "${params.igenomes_base}/Gallus_gallus/UCSC/galGal4/Annotation/Genes/genes.bed" - readme = "${params.igenomes_base}/Gallus_gallus/UCSC/galGal4/Annotation/README.txt" - mito_name = "chrM" - } - 'panTro4' { - fasta = "${params.igenomes_base}/Pan_troglodytes/UCSC/panTro4/Sequence/WholeGenomeFasta/genome.fa" - bwa = "${params.igenomes_base}/Pan_troglodytes/UCSC/panTro4/Sequence/BWAIndex/genome.fa" - bowtie2 = "${params.igenomes_base}/Pan_troglodytes/UCSC/panTro4/Sequence/Bowtie2Index/" - star = "${params.igenomes_base}/Pan_troglodytes/UCSC/panTro4/Sequence/STARIndex/" - bismark = "${params.igenomes_base}/Pan_troglodytes/UCSC/panTro4/Sequence/BismarkIndex/" - gtf = "${params.igenomes_base}/Pan_troglodytes/UCSC/panTro4/Annotation/Genes/genes.gtf" - bed12 = "${params.igenomes_base}/Pan_troglodytes/UCSC/panTro4/Annotation/Genes/genes.bed" - readme = "${params.igenomes_base}/Pan_troglodytes/UCSC/panTro4/Annotation/README.txt" - mito_name = "chrM" - } - 'rn6' { - fasta = "${params.igenomes_base}/Rattus_norvegicus/UCSC/rn6/Sequence/WholeGenomeFasta/genome.fa" - bwa = "${params.igenomes_base}/Rattus_norvegicus/UCSC/rn6/Sequence/BWAIndex/genome.fa" - bowtie2 = "${params.igenomes_base}/Rattus_norvegicus/UCSC/rn6/Sequence/Bowtie2Index/" - star = "${params.igenomes_base}/Rattus_norvegicus/UCSC/rn6/Sequence/STARIndex/" - bismark = "${params.igenomes_base}/Rattus_norvegicus/UCSC/rn6/Sequence/BismarkIndex/" - gtf = 
"${params.igenomes_base}/Rattus_norvegicus/UCSC/rn6/Annotation/Genes/genes.gtf" - bed12 = "${params.igenomes_base}/Rattus_norvegicus/UCSC/rn6/Annotation/Genes/genes.bed" - mito_name = "chrM" - } - 'sacCer3' { - fasta = "${params.igenomes_base}/Saccharomyces_cerevisiae/UCSC/sacCer3/Sequence/WholeGenomeFasta/genome.fa" - bwa = "${params.igenomes_base}/Saccharomyces_cerevisiae/UCSC/sacCer3/Sequence/BWAIndex/genome.fa" - bowtie2 = "${params.igenomes_base}/Saccharomyces_cerevisiae/UCSC/sacCer3/Sequence/Bowtie2Index/" - star = "${params.igenomes_base}/Saccharomyces_cerevisiae/UCSC/sacCer3/Sequence/STARIndex/" - bismark = "${params.igenomes_base}/Saccharomyces_cerevisiae/UCSC/sacCer3/Sequence/BismarkIndex/" - readme = "${params.igenomes_base}/Saccharomyces_cerevisiae/UCSC/sacCer3/Annotation/README.txt" - mito_name = "chrM" - macs_gsize = "1.2e7" - } - 'susScr3' { - fasta = "${params.igenomes_base}/Sus_scrofa/UCSC/susScr3/Sequence/WholeGenomeFasta/genome.fa" - bwa = "${params.igenomes_base}/Sus_scrofa/UCSC/susScr3/Sequence/BWAIndex/genome.fa" - bowtie2 = "${params.igenomes_base}/Sus_scrofa/UCSC/susScr3/Sequence/Bowtie2Index/" - star = "${params.igenomes_base}/Sus_scrofa/UCSC/susScr3/Sequence/STARIndex/" - bismark = "${params.igenomes_base}/Sus_scrofa/UCSC/susScr3/Sequence/BismarkIndex/" - gtf = "${params.igenomes_base}/Sus_scrofa/UCSC/susScr3/Annotation/Genes/genes.gtf" - bed12 = "${params.igenomes_base}/Sus_scrofa/UCSC/susScr3/Annotation/Genes/genes.bed" - readme = "${params.igenomes_base}/Sus_scrofa/UCSC/susScr3/Annotation/README.txt" - mito_name = "chrM" - } - } -} diff --git a/conf/modules.config b/conf/modules.config index 0b1bfdec..b1d5db6c 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -1,32 +1,330 @@ /* -======================================================================================== - Config file for defining DSL2 per module options -======================================================================================== +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Config file for defining DSL2 per module options and publishing paths +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Available keys to override module options: - args = Additional arguments appended to command in module. - args2 = Second set of arguments appended to command in module (multi-tool modules). - args3 = Third set of arguments appended to command in module (multi-tool modules). - publish_dir = Directory to publish results. - publish_by_meta = Groovy list of keys available in meta map to append as directories to "publish_dir" path - If publish_by_meta = true - Value of ${meta['id']} is appended as a directory to "publish_dir" path - If publish_by_meta = ['id', 'custompath'] - If "id" is in meta map and "custompath" isn't then "${meta['id']}/custompath/" - is appended as a directory to "publish_dir" path - If publish_by_meta = false / null - No directories are appended to "publish_dir" path - publish_files = Groovy map where key = "file_ext" and value = "directory" to publish results for that file extension - The value of "directory" is appended to the standard "publish_dir" path as defined above. - If publish_files = null (unspecified) - All files are published. - If publish_files = false - No files are published. - suffix = File name suffix for output files. + ext.args = Additional arguments appended to command in module. 
+ ext.args2 = Second set of arguments appended to command in module (multi-tool modules). + ext.args3 = Third set of arguments appended to command in module (multi-tool modules). + ext.prefix = File name prefix for output files. ---------------------------------------------------------------------------------------- */ +process { + publishDir = [ + path: { "${params.outdir}/${task.process.tokenize(':')[-1].tokenize('_')[0].toLowerCase()}" }, + mode: params.publish_dir_mode, + saveAs: { filename -> filename.equals('versions.yml') ? null : filename }, + enabled: false + ] -params { - modules { - 'fastqc' { - args = "--quiet" - } - 'multiqc' { - args = "" - } + withName: SAMPLESHEET_CHECK { + publishDir = [ + path: { "${params.outdir}/pipeline_info" }, + mode: params.publish_dir_mode, + saveAs: { filename -> filename.equals('versions.yml') ? null : filename } + ] + } + + withName: FASTQC { + ext.args = '--quiet' + } + + withName: TRIMGALORE { + ext.args = { [ + '--fastqc', + params.trim_nextseq > 0 ? "--nextseq ${params.trim_nextseq}" : '' + ].join(' ').trim() } + publishDir = [ + [ + path: { "${params.outdir}/trimgalore/fastqc" }, + mode: params.publish_dir_mode, + pattern: "*.{html,zip}" + ], + [ + path: { "${params.outdir}/trimgalore" }, + mode: params.publish_dir_mode, + pattern: "*.txt" + ] + ] + } + + withName: SEQTK_MERGEPE { + ext.prefix = { "${meta.id}.processed" } + } + + withName: BBMAP_BBNORM { + ext.args = { [ + params.bbnorm_target ? "target=${params.bbnorm_target}" : '', + params.bbnorm_min ? "min=${params.bbnorm_min}" : '', + ].join(' ').trim() } + publishDir = [ + [ + path : { "${params.outdir}/bbmap/bbnorm/logs" }, + mode: params.publish_dir_mode, + pattern: "*.log" + ], + [ + path : { "${params.outdir}/bbmap/bbnorm/"}, + mode: params.publish_dir_mode, + pattern: "*.fastq.gz", + enabled: params.save_bbnorm_fastq + ] + ] + } + + withName: BBMAP_BBDUK { + publishDir = [ + path : { "${params.outdir}/bbmap/bbduk/"}, + pattern: '*.fastq.gz', + mode: params.publish_dir_mode + ] + } + + withName: MEGAHIT_INTERLEAVED { + publishDir = [ + path: { "${params.outdir}/megahit" }, + mode: params.publish_dir_mode, + pattern: '**/*.{gz,log}' + ] + } + + withName: WRITESPADESYAML { + publishDir = [ + path: { "${params.outdir}/rnaspades" }, + mode: params.publish_dir_mode, + pattern: '*.yaml' + ] + } + + withName: SPADES { + ext.args = "--rna" + publishDir = [ + path: { "${params.outdir}/rnaspades" }, + mode: params.publish_dir_mode, + pattern: '*.{gz,log}' + ] + } + + withName: SEQTK_SEQ_CONTIG_FILTER { + ext.args = { "-L ${params.min_contig_length}" } + } + + withName: BBMAP_ALIGN { + ext.args = "trimreaddescriptions=t pigz=t" + publishDir = [ + [ + path: { "${params.outdir}/bbmap/bam" }, + mode: params.publish_dir_mode, + pattern: "*.bam", + enabled: params.save_bam + ], + [ + path: { "${params.outdir}/bbmap/logs" }, + mode: params.publish_dir_mode, + pattern: "*.log" + ] + ] + } + + withName: SAMTOOLS_SORT { + ext.prefix = { "${meta.id}.sorted" } + } + + withName: ".*SAMTOOLS.*" { + publishDir = [ + [ + pattern: "*.bam", + path: { "${params.outdir}/samtools/" }, + mode: params.publish_dir_mode + + ], + [ + pattern: "*.bam.bai", + path: { "${params.outdir}/samtools/" }, + mode: params.publish_dir_mode + ], + [ + pattern: "*.{flagstat,idxstats}", + path: { "${params.outdir}/samtools/" }, + mode: params.publish_dir_mode, + enabled: params.save_samtools, + saveAs: { filename -> "${params.assembly ? 
'user_assembly' : params.assembler}.${filename}" } + ] + ] + } + + withName: PRODIGAL { + ext.args = { params.prodigal_trainingfile ? "-t $params.prodigal_trainingfile" : "-p meta" } + publishDir = [ + path: { "${params.outdir}/prodigal" }, + mode: params.publish_dir_mode, + pattern: "*.gz" + ] + } + + withName: FORMAT_PRODIGAL_GFF { + publishDir = [ + path: { "${params.outdir}/prodigal" }, + mode: params.publish_dir_mode, + pattern: "*.gz" + ] + } + + withName: PREDICT { + publishDir = [ + path: { "${params.outdir}/transdecoder" }, + mode: params.publish_dir_mode, + pattern: "*.transdecoder.*" + ] + } + + withName: 'PROKKA' { + ext.args = '--prodigal --metagenome' + } + + withName: 'FAA_CAT' { + ext.prefix = 'prokka.faa.gz' + publishDir = [ + path: { "${params.outdir}/prokka" }, + mode: params.publish_dir_mode, + pattern: "*.gz" + ] + } + + withName: 'FFN_CAT' { + ext.prefix = 'prokka.ffn.gz' + publishDir = [ + path: { "${params.outdir}/prokka" }, + mode: params.publish_dir_mode, + pattern: "*.gz" + ] + } + + withName: 'GFF_CAT' { + ext.prefix = 'prokka.gff.gz' + publishDir = [ + path: { "${params.outdir}/prokka" }, + mode: params.publish_dir_mode, + pattern: "*.gz" + ] + } + + withName: PROKKAGFF2TSV { + publishDir = [ + path: { "${params.outdir}/summary_tables" }, + mode: params.publish_dir_mode, + pattern: "*.prokka-annotations.tsv.gz" + ] + } + + withName: '.*:FEATURECOUNTS_CDS' { + ext.args = '-g ID -t CDS -F gtf' + publishDir = [ + path: { "${params.outdir}/featurecounts" }, + mode: params.publish_dir_mode, + pattern: "*.featureCounts.*" + ] + } + + withName: COLLECT_FEATURECOUNTS { + publishDir = [ + path: { "${params.outdir}/summary_tables" }, + mode: params.publish_dir_mode, + pattern: "*.gz" + ] + } + + withName: 'HMMRANK' { + publishDir = [ + path: { "${params.outdir}/summary_tables" }, + mode: params.publish_dir_mode, + pattern: "*.tsv.gz" + ] + } + + withName: EGGNOG_DOWNLOAD { + storeDir = { "${params.eggnog_dbpath}" } + } + + withName: EGGNOG_MAPPER { + publishDir = [ + [ + path: { "${params.outdir}/eggnog" }, + mode: params.publish_dir_mode, + pattern: "*.emapper.*" + ], + [ + path: { "${params.outdir}/summary_tables" }, + mode: params.publish_dir_mode, + pattern: "*.emapper.tsv.gz" + ] + ] + } + + withName: KOFAMSCAN_DOWNLOAD { + storeDir = { "${params.kofam_dir}" } + } + + withName: KOFAMSCAN_SCAN { + publishDir = [ + [ + path: { "${params.outdir}/summary_tables/" }, + pattern: "kofamscan.tsv.gz", + mode: params.publish_dir_mode, + saveAs: { filename -> "${params.assembly ? 'user_assembly' : params.assembler}.${params.gff ? 'user_orfs' : params.orf_caller}.${filename}" } + ], + [ + path: { "${params.outdir}/kofamscan/" }, + pattern: "kofamscan_output.tsv.gz", + mode: params.publish_dir_mode, + saveAs: { filename -> "${params.assembly ? 'user_assembly' : params.assembler}.${params.gff ? 'user_orfs' : params.orf_caller}.${filename}" } + ] + ] + } + + withName: EUKULELE_SEARCH { + publishDir = [ + path: { "${params.outdir}/eukulele" }, + mode: params.publish_dir_mode + ] + ext.args = { params.eukulele_method ? "-m ${params.eukulele_method}" : '' } + } + + withName: FORMAT_TAX { + publishDir = [ + path: { "${params.outdir}/summary_tables" }, + mode: params.publish_dir_mode, + pattern: '*.tsv.gz', + saveAs: { filename -> "${params.assembly ? 'user_assembly' : params.assembler}.${params.gff ? 
'user_orfs' : params.orf_caller}.${params.eukulele_db ?: 'userdb'}_taxonomy.tsv.gz" } + ] + } + + withName: SUM_TAXONOMY { + ext.prefix = { "${params.eukulele_db ?: 'userdb'}" } + } + + withName: COLLECT_STATS { + publishDir = [ + path: { "${params.outdir}/summary_tables" }, + mode: params.publish_dir_mode, + pattern: "*.gz" + ] + } + + withName: CUSTOM_DUMPSOFTWAREVERSIONS { + publishDir = [ + path: { "${params.outdir}/pipeline_info" }, + mode: params.publish_dir_mode, + pattern: '*_versions.yml' + ] + } + + withName: 'MULTIQC' { + ext.args = { params.multiqc_title ? "--title \"$params.multiqc_title\"" : '' } + publishDir = [ + path: "${params.outdir}/multiqc", + mode: params.publish_dir_mode, + saveAs: { filename -> filename.equals('versions.yml') ? null : filename } + ] } } diff --git a/conf/test.config b/conf/test.config index 5600067e..df9c177a 100644 --- a/conf/test.config +++ b/conf/test.config @@ -1,11 +1,11 @@ /* -======================================================================================== +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Nextflow config file for running minimal tests -======================================================================================== +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Defines input files and everything required to run a fast and simple pipeline test. Use as follows: - nextflow run nf-core/metatdenovo -profile test, + nextflow run nf-core/metatdenovo -profile test, --outdir ---------------------------------------------------------------------------------------- */ @@ -13,17 +13,18 @@ params { config_profile_name = 'Test profile' config_profile_description = 'Minimal test dataset to check pipeline function' - // Limit resources so that this can run on GitHub Actions max_cpus = 2 - max_memory = 6.GB - max_time = 6.h + max_memory = '6.GB' + max_time = '6.h' // Input data - // TODO nf-core: Specify the paths to your test data on nf-core/test-datasets - // TODO nf-core: Give any required params for the test so that command line flags are not needed - input = 'https://raw.githubusercontent.com/nf-core/test-datasets/viralrecon/samplesheet/samplesheet_test_illumina_amplicon.csv' + input = 'https://raw.githubusercontent.com/nf-core/test-datasets/metatdenovo/samplesheet/samplesheet.csv' + hmmfiles = 'https://raw.githubusercontent.com/nf-core/test-datasets/metatdenovo/test_data/PF00317.hmm,https://raw.githubusercontent.com/nf-core/test-datasets/metatdenovo/test_data/PF00848.hmm,https://raw.githubusercontent.com/nf-core/test-datasets/metatdenovo/test_data/PF03477.hmm,https://raw.githubusercontent.com/nf-core/test-datasets/metatdenovo/test_data/PF13597.hmm' + + // Other options + skip_eukulele = true + skip_eggnog = true + skip_kofamscan = true - // Genome references - genome = 'R64-1-1' } diff --git a/conf/test_eggnog.config b/conf/test_eggnog.config new file mode 100644 index 00000000..036a7fea --- /dev/null +++ b/conf/test_eggnog.config @@ -0,0 +1,28 @@ +/* +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Nextflow config file for running minimal tests +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Defines input files and everything required to run a fast and simple pipeline test. 
+ + Use as follows: + nextflow run nf-core/metatdenovo -profile test_eggnog, + +---------------------------------------------------------------------------------------- +*/ + +params { + config_profile_name = 'Test eggnog profile' + config_profile_description = 'Minimal test dataset to check pipeline with eggnog function added' + // Limit resources so that this can run on GitHub Actions + max_cpus = 8 + max_memory = '24.GB' + max_time = '6.h' + + // Input data + input = 'https://raw.githubusercontent.com/nf-core/test-datasets/metatdenovo/samplesheet/samplesheet.csv' + + // parameters + skip_eukulele = true + skip_eggnog = false + skip_kofamscan = true +} diff --git a/conf/test_eukulele.config b/conf/test_eukulele.config new file mode 100644 index 00000000..ab61ac51 --- /dev/null +++ b/conf/test_eukulele.config @@ -0,0 +1,28 @@ +/* +======================================================================================== + Nextflow config file for running minimal tests +======================================================================================== + Defines input files and everything required to run a fast and simple pipeline test. + + Use as follows: + nextflow run nf-core/metatdenovo -profile test_eukulele, + +---------------------------------------------------------------------------------------- +*/ + +params { + config_profile_name = 'Test profile for eukulele taxonomic annotation' + config_profile_description = 'Minimal test dataset to check pipeline function' + + // Limit resources so that this can run on GitHub Actions + max_cpus = 8 + max_memory = '24.GB' + max_time = '6.h' + + // Input data + input = 'https://raw.githubusercontent.com/nf-core/test-datasets/metatdenovo/samplesheet/samplesheet.csv' + + skip_eggnog = true + skip_kofamscan = true + eukulele_db = 'phylodb' +} diff --git a/conf/test_filter.config b/conf/test_filter.config new file mode 100644 index 00000000..597862a8 --- /dev/null +++ b/conf/test_filter.config @@ -0,0 +1,30 @@ +/* +======================================================================================== + Nextflow config file for running minimal tests +======================================================================================== + Defines input files and everything required to run a fast and simple pipeline test. + + Use as follows: + nextflow run nf-core/metatdenovo -profile test, + +---------------------------------------------------------------------------------------- +*/ + +params { + config_profile_name = 'Test profile' + config_profile_description = 'Minimal test dataset to check pipeline function, including removal of contaminating sequences (e.g. 
rRNA)' + + // Limit resources so that this can run on GitHub Actions + max_cpus = 2 + max_memory = 6.GB + max_time = 6.h + + // Input data + input = 'https://raw.githubusercontent.com/nf-core/test-datasets/metatdenovo/samplesheet/samplesheet.csv' + + sequence_filter = 'https://github.com/nf-core/test-datasets/raw/metatdenovo/test_data/rrna.fna.gz' + + skip_eukulele = true + skip_eggnog = true + skip_kofamscan = true +} diff --git a/conf/test_full.config b/conf/test_full.config index 8358d505..ca9fe91e 100644 --- a/conf/test_full.config +++ b/conf/test_full.config @@ -1,11 +1,11 @@ /* -======================================================================================== +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Nextflow config file for running full-size tests -======================================================================================== +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Defines input files and everything required to run a full size pipeline test. Use as follows: - nextflow run nf-core/metatdenovo -profile test_full, + nextflow run nf-core/metatdenovo -profile test_full, --outdir ---------------------------------------------------------------------------------------- */ @@ -15,10 +15,15 @@ params { config_profile_description = 'Full test dataset to check pipeline function' // Input data for full size test - // TODO nf-core: Specify the paths to your full test data ( on nf-core/test-datasets or directly in repositories, e.g. SRA) - // TODO nf-core: Give any required params for the test so that command line flags are not needed - input = 'https://raw.githubusercontent.com/nf-core/test-datasets/viralrecon/samplesheet/samplesheet_full_illumina_amplicon.csv' + input = 'https://raw.githubusercontent.com/nf-core/test-datasets/metatdenovo/samplesheet/samplesheet_full_test.csv' // Genome references genome = 'R64-1-1' + + // parameters + skip_eukulele = false + skip_eggnog = false + skip_kofamscan = true + eukulele_dbpath = 's3://ngi-igenomes/test-data/metatdenovo/gtdb_eukulele/' + eggnog_dbpath = 's3://ngi-igenomes/test-data/metatdenovo/eggnog/' } diff --git a/conf/test_kofamscan.config b/conf/test_kofamscan.config new file mode 100644 index 00000000..9584e0f9 --- /dev/null +++ b/conf/test_kofamscan.config @@ -0,0 +1,29 @@ +/* +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Nextflow config file for running minimal tests +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Defines input files and everything required to run a fast and simple pipeline test. 
+ + Use as follows: + nextflow run nf-core/metatdenovo -profile test_eggnog, + +---------------------------------------------------------------------------------------- +*/ + +params { + config_profile_name = 'Test kofamscan profile' + config_profile_description = 'Minimal test dataset to check pipeline with kofamscan function added' + + // Limit resources so that this can run on GitHub Actions + max_cpus = 8 + max_memory = '24.GB' + max_time = '6.h' + + // Input data + input = 'https://raw.githubusercontent.com/nf-core/test-datasets/metatdenovo/samplesheet/samplesheet.csv' + + // parameters + skip_eukulele = true + skip_eggnog = true + skip_kofamscan = false +} diff --git a/conf/test_prokka.config b/conf/test_prokka.config new file mode 100644 index 00000000..49783626 --- /dev/null +++ b/conf/test_prokka.config @@ -0,0 +1,30 @@ +/* +======================================================================================== + Nextflow config file for running minimal tests +======================================================================================== + Defines input files and everything required to run a fast and simple pipeline test. + + Use as follows: + nextflow run nf-core/metatdenovo -profile test_prokka, + +---------------------------------------------------------------------------------------- +*/ + +params { + config_profile_name = 'Test profile for prokka orf caller' + config_profile_description = 'Minimal test dataset to check pipeline function' + + // Limit resources so that this can run on GitHub Actions + max_cpus = 2 + max_memory = 6.GB + max_time = 6.h + + // Input data + input = 'https://raw.githubusercontent.com/nf-core/test-datasets/metatdenovo/samplesheet/samplesheet.csv' + + // skip taxonomy + orf_caller = 'prokka' + skip_eukulele = true + skip_kofamscan = true + skip_eggnog = true +} diff --git a/conf/test_rnaspades.config b/conf/test_rnaspades.config new file mode 100644 index 00000000..9a329baf --- /dev/null +++ b/conf/test_rnaspades.config @@ -0,0 +1,33 @@ +/* +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Nextflow config file for running minimal tests +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Defines input files and everything required to run a fast and simple pipeline test. 
+ + Use as follows: + nextflow run nf-core/metatdenovo -profile test, --outdir + +---------------------------------------------------------------------------------------- +*/ + +params { + config_profile_name = 'Test rnaspades assembler profile' + config_profile_description = 'Minimal test dataset to check pipeline function' + // Limit resources so that this can run on GitHub Actions + max_cpus = 2 + max_memory = '6.GB' + max_time = '6.h' + + // Input data + input = 'https://raw.githubusercontent.com/nf-core/test-datasets/metatdenovo/samplesheet/samplesheet.csv' + + // Assembler option + assembler = 'rnaspades' + + skip_eukulele = true + skip_eggnog = true + skip_kofamscan = true + + // genomes parameter + genomes = '' +} diff --git a/conf/test_transdecoder.config b/conf/test_transdecoder.config new file mode 100644 index 00000000..7766a2d1 --- /dev/null +++ b/conf/test_transdecoder.config @@ -0,0 +1,30 @@ +/* +======================================================================================== + Nextflow config file for running minimal tests +======================================================================================== + Defines input files and everything required to run a fast and simple pipeline test. + + Use as follows: + nextflow run nf-core/metatdenovo -profile test_transdecoder, + +---------------------------------------------------------------------------------------- +*/ + +params { + config_profile_name = 'Test profile for transdecoder orf caller' + config_profile_description = 'Minimal test dataset to check pipeline function' + + // Limit resources so that this can run on GitHub Actions + max_cpus = 2 + max_memory = 6.GB + max_time = 6.h + + // Input data + input = 'https://raw.githubusercontent.com/nf-core/test-datasets/metatdenovo/samplesheet/samplesheet.csv' + + // params + orf_caller = 'transdecoder' + skip_eukulele = true + skip_eggnog = true + skip_kofamscan = true +} diff --git a/docs/README.md b/docs/README.md index f691b641..69958ee1 100644 --- a/docs/README.md +++ b/docs/README.md @@ -2,9 +2,9 @@ The nf-core/metatdenovo documentation is split into the following pages: -* [Usage](usage.md) - * An overview of how the pipeline works, how to run it and a description of all of the different command-line flags. -* [Output](output.md) - * An overview of the different results produced by the pipeline and how to interpret them. +- [Usage](usage.md) + - An overview of how the pipeline works, how to run it and a description of all of the different command-line flags. +- [Output](output.md) + - An overview of the different results produced by the pipeline and how to interpret them. You can find a lot more documentation about installing, configuring and running nf-core pipelines on the website: [https://nf-co.re](https://nf-co.re) diff --git a/docs/images/metat_v6.svg b/docs/images/metat_v6.svg new file mode 100644 index 00000000..34abdb94 --- /dev/null +++ b/docs/images/metat_v6.svg @@ -0,0 +1,2537 @@ + [2,537 lines of SVG markup omitted: the image is the pipeline overview diagram, with stages pre-processing (Trim galore!, BBduk, BBnorm, Seqtk), assembly (Megahit, RNAspades), gene annotation (Prodigal, Prokka, TransDecoder), mapping and quantification (BBmap, FeatureCounts), taxonomy and functional annotation (EUKulele, Eggnog-mapper, KOfamScan, Hmmsearch/Hmmrank, Prokka annotation), and reporting (MultiQC, Collect statistics)] diff --git a/docs/images/mqc_fastqc_adapter.png b/docs/images/mqc_fastqc_adapter.png deleted file mode 100755 index 361d0e47..00000000 Binary files a/docs/images/mqc_fastqc_adapter.png and /dev/null differ diff --git a/docs/images/mqc_fastqc_counts.png b/docs/images/mqc_fastqc_counts.png deleted file mode 100755 index cb39ebb8..00000000 Binary files a/docs/images/mqc_fastqc_counts.png and /dev/null differ diff --git a/docs/images/mqc_fastqc_quality.png b/docs/images/mqc_fastqc_quality.png deleted file mode 100755 index a4b89bf5..00000000 Binary files a/docs/images/mqc_fastqc_quality.png and /dev/null differ diff --git a/docs/images/nf-core-metatdenovo_logo.png b/docs/images/nf-core-metatdenovo_logo.png deleted file mode 100644 index c699d66a..00000000 Binary files a/docs/images/nf-core-metatdenovo_logo.png and /dev/null differ diff --git a/docs/images/nf-core-metatdenovo_logo_dark.png b/docs/images/nf-core-metatdenovo_logo_dark.png new file mode 100644 index 00000000..7fd78d9b Binary files /dev/null and b/docs/images/nf-core-metatdenovo_logo_dark.png differ diff --git a/docs/images/nf-core-metatdenovo_logo_light.png b/docs/images/nf-core-metatdenovo_logo_light.png new file mode 100644 index 00000000..50ea4e01 Binary files /dev/null and b/docs/images/nf-core-metatdenovo_logo_light.png differ diff --git a/docs/output.md b/docs/output.md index d805aa31..711fac74 100644 --- a/docs/output.md +++ b/docs/output.md @@ -2,67 +2,286 @@ ## Introduction -This document describes the output produced by the pipeline. Most of the plots are taken from the MultiQC report, which summarises results at the end of the pipeline. +This document describes the output produced by the pipeline. -The directories listed below will be created in the results directory after the pipeline has finished. All paths are relative to the top-level results directory. - - +The directories listed below will be created in the results directory after the pipeline has finished. +All paths are relative to the top-level results directory. 
## Pipeline overview -The pipeline is built using [Nextflow](https://www.nextflow.io/) and processes data using the following steps: +The pipeline is built using [Nextflow](https://www.nextflow.io/) and the results are organized as follows: + +- [Module output](#module-output) + - [Preprocessing](#preprocessing) + - [FastQC](#fastqc) - Read quality control + - [Trim galore!](#trim-galore) - Primer trimming + - [MultiQC](#multiqc) - Aggregate report describing results + - [BBduk](#bbduk) - Filter out sequences from samples that match sequences in a user-provided fasta file (optional) + - [BBnorm](#bbnorm) - Normalize the reads in the samples to use less resources for assembly (optional) + - [Assembly step](#assembly-step) - Generate contigs with an assembler program + - [Megahit](#megahit) - Output from Megahit assembly (default) + - [RNASpades](#rnaspades) - Output from Spades assembly (optional) + - [ORF Caller step](#orf-caller-step) - Identify protein-coding genes (ORFs) with an ORF caller + - [Prodigal](#prodigal) - Output from Prodigal (default) + - [Prokka](#prokka) - Output from Prokka (optional) + - [TransDecoder](#transdecoder) - Output from TransDecoder (optional) + - [Functional and taxonomical annotation](#functional-and-taxonomical-annotation) - Predict the function and the taxonomy of ORFs + - [EggNOG](#eggnog) - Output from EggNOG-mapper (default; optional) + - [KOfamSCAN](#kofamscan) - Output from KOfamSCAN (optional) + - [EUKulele](#eukulele) - Output from EUKulele taxonomy annotation (default; optional) + - [Hmmsearch](#hmmsearch) - Output from HMMER run with user-supplied HMM profiles (optional) +- [Custom metatdenovo output](#metatdenovo-output) + - [Summary tables folder](#summary-tables) - Tab-separated tables ready for further analysis in tools like R and Python + - [Pipeline information](#pipeline-information) - Report metrics generated during the workflow execution + +## Module output + +### Preprocessing -* [FastQC](#fastqc) - Raw read QC -* [MultiQC](#multiqc) - Aggregate report describing results and QC from the whole pipeline -* [Pipeline information](#pipeline-information) - Report metrics generated during the workflow execution +#### FastQC -### FastQC +[FastQC](http://www.bioinformatics.babraham.ac.uk/projects/fastqc/) gives general quality metrics about your sequenced reads. It provides information about the quality score distribution across your reads, per base sequence content (%A/T/G/C), adapter contamination and overrepresented sequences. For further reading and documentation see the [FastQC help pages](http://www.bioinformatics.babraham.ac.uk/projects/fastqc/Help/). FastQC is run as part of Trim galore!, so its output can be found in the Trim galore! output folder.
Output files -* `fastqc/` - * `*_fastqc.html`: FastQC report containing quality metrics. - * `*_fastqc.zip`: Zip archive containing the FastQC report, tab-delimited data file and plot images. +- `trimgalore/fastqc/` + - `*_fastqc.html`: FastQC report containing quality metrics for your untrimmed raw fastq files.
-[FastQC](http://www.bioinformatics.babraham.ac.uk/projects/fastqc/) gives general quality metrics about your sequenced reads. It provides information about the quality score distribution across your reads, per base sequence content (%A/T/G/C), adapter contamination and overrepresented sequences. For further reading and documentation see the [FastQC help pages](http://www.bioinformatics.babraham.ac.uk/projects/fastqc/Help/). +#### Trim galore! -![MultiQC - FastQC sequence counts plot](images/mqc_fastqc_counts.png) +[Trimgalore](https://github.com/FelixKrueger/TrimGalore) is trimming primer sequences from sequencing reads. Primer sequences are non-biological sequences that often introduce point mutations that do not reflect sample sequences. This is especially true for degenerated PCR primers. If primer trimming would be omitted, artifactual amplicon sequence variants might be computed by the denoising tool or sequences might be lost due to become labelled as PCR chimera. -![MultiQC - FastQC mean quality scores plot](images/mqc_fastqc_quality.png) +
+Output files -![MultiQC - FastQC adapter content plot](images/mqc_fastqc_adapter.png) +- `trimgalore/`: directory containing log files with retained reads, trimming percentage, etc. for each sample. + - `*trimming_report.txt`: report of read numbers that pass trimgalore. -> **NB:** The FastQC plots displayed in the MultiQC report shows _untrimmed_ reads. They may contain adapter sequence and potentially regions with low quality. +
-### MultiQC +#### MultiQC + +[MultiQC](http://multiqc.info) is a visualization tool that generates a single HTML report summarising all samples in your project. Most of the pipeline QC results are visualised in the report and further statistics are available in the report data directory. + +Results generated by MultiQC collate pipeline QC from supported tools e.g. FastQC. The pipeline has special steps which also allow the software versions to be reported in the MultiQC output for future traceability. For more information about how to use MultiQC reports, see .
Output files -* `multiqc/` - * `multiqc_report.html`: a standalone HTML file that can be viewed in your web browser. - * `multiqc_data/`: directory containing parsed statistics from the different tools used in the pipeline. - * `multiqc_plots/`: directory containing static images from the report in various formats. +- `multiqc/` + - `multiqc_report.html`: a standalone HTML file that can be viewed in your web browser. + - `multiqc_data/`: directory containing parsed statistics from the different tools used in the pipeline. + - `multiqc_plots/`: directory containing static images from the report in various formats.
-[MultiQC](http://multiqc.info) is a visualization tool that generates a single HTML report summarising all samples in your project. Most of the pipeline QC results are visualised in the report and further statistics are available in the report data directory. +:::note +The FastQC plots displayed in the MultiQC report shows _untrimmed_ reads. They may contain adapter sequence and potentially regions with low quality. +::: -Results generated by MultiQC collate pipeline QC from supported tools e.g. FastQC. The pipeline has special steps which also allow the software versions to be reported in the MultiQC output for future traceability. For more information about how to use MultiQC reports, see . +#### BBduk -### Pipeline information +[BBduk](https://jgi.doe.gov/data-and-tools/software-tools/bbtools/bb-tools-user-guide/bbnorm-guide/) is a filtering tool that removes specific sequences from the samples using a reference fasta file. +BBduk is built-in tool from BBmap. + +
+Output files + +- `bbmap/` + - `*.bbduk.log`: a text file with the results from BBduk analysis. Number of filtered reads can be seen in this log. + +
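As a sketch of how this filtering step is switched on (the file name `rrna.fasta` is only an illustration of a user-supplied reference, not something shipped with the pipeline):

```bash
# Filter out reads matching a user-provided reference (e.g. rRNA sequences) with BBduk.
# 'rrna.fasta' is a hypothetical file you provide yourself; it may also be gzipped.
nextflow run nf-core/metatdenovo \
    -profile docker \
    --input samplesheet.csv \
    --outdir results \
    --sequence_filter rrna.fasta
```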
+ +#### BBnorm + +[BBnorm](https://jgi.doe.gov/data-and-tools/software-tools/bbtools/bb-tools-user-guide/bbnorm-guide/) is a tool from the BBmap suite that reduces the coverage of highly abundant sequence kmers and removes sequences representing kmers that are below a threshold. +It can be useful if the data set is too large to assemble, and it can also potentially improve an assembly. +N.B. the digital normalization is done only for the assembly; the non-normalized sequences will be used for quantification. + +
+Output files + +- `bbmap/bbnorm/logs/` + - `*.logs`: it is a log file of the bbnorm run. + +
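A minimal sketch of turning on digital normalization; the target and minimum depth values below are illustrative only and should be adjusted to the data set:

```bash
# Normalize read depth with BBnorm before assembly only; quantification still uses all reads.
nextflow run nf-core/metatdenovo \
    -profile docker \
    --input samplesheet.csv \
    --outdir results \
    --bbnorm \
    --bbnorm_target 100 \
    --bbnorm_min 5
```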
+ +### Assembly step + +#### Megahit + +[Megahit](https://github.com/voutcn/megahit) is used to assemble the cleaned and trimmed FastQ reads into contigs. + +
+Output files + +- `megahit/megahit_out/` + - `*.log`: log file of Megahit run. + - `megahit_assembly.contigs.fa.gz`: reference genome created by Megahit. + - `intermediate_contigs`: folder that contains the intermediate steps of Megahit run. + +
+ +#### RNASpades + +Optionally, you can use [RNASpades](https://cab.spbu.ru/software/rnaspades/) to assemble reads into contigs. + +
+Output files + +- `rnaspades/` + - `rnaspades.assembly.gfa.gz`: gfa file output from rnaspades + - `rnaspades.spades.log`: log file output from rnaspades run + - `rnaspades.transcripts.fa.gz`: reference genome created by RNASpades + +
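A sketch of selecting the assembler, or of bypassing assembly with user-provided contigs (the file name `contigs.fna` is a placeholder):

```bash
# Assemble with RNAspades instead of the default Megahit
nextflow run nf-core/metatdenovo -profile docker --input samplesheet.csv --outdir results --assembler rnaspades

# Or skip assembly and provide an existing assembly ('contigs.fna' is a placeholder name)
nextflow run nf-core/metatdenovo -profile docker --input samplesheet.csv --outdir results --assembly contigs.fna
```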
+ +### ORF caller step + +#### Prodigal + +As default, [Prodigal](https://github.com/hyattpd/Prodigal) is used to identify ORFs in the assembly. + +
+Output files + +- `prodigal/` + - `*.fna.gz`: nucleotides fasta file output + - `*.faa.gz`: amino acids fasta file output + - `*.gff.gz`: genome feature file output + +
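As a sketch, a pre-computed Prodigal training file can also be supplied; the pipeline forwards it to Prodigal's `-t` option (see `conf/modules.config`), and `my_training.trn` below is a hypothetical file name:

```bash
# Run with the default ORF caller (Prodigal) and a user-supplied training file.
# Without --prodigal_trainingfile, Prodigal is run in metagenome mode (-p meta).
nextflow run nf-core/metatdenovo \
    -profile docker \
    --input samplesheet.csv \
    --outdir results \
    --prodigal_trainingfile my_training.trn
```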
+ +#### Prokka + +As one alternative, you can use [Prokka](https://github.com/tseemann/prokka) to identify ORFs in the assembly. +In addition to calling ORFs (done with Prodigal), Prokka will filter the ORFs to only retain quality ORFs and will functionally annotate them. +NB: Prodigal or Prokka are recommended for prokaryotic samples. + +
+Output files + +- `prokka/` + - `*.ffn.gz`: nucleotides fasta file output + - `*.faa.gz`: amino acids fasta file output + - `*.gff.gz`: genome feature file output + +
+ +#### TransDecoder + +As another alternative, you can use [TransDecoder](https://github.com/sghignone/TransDecoder) to find ORFs in the assembly. +N.B. TransDecoder is recommended for eukaryotic samples. + +
+Output files + +- `transdecoder/` + - `*.cds`: nucleotides fasta file output + - `*.pep`: amino acids fasta file output + - `*.gff3`: genome feature file output + +
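A sketch of choosing among the three ORF callers on the command line:

```bash
# prodigal is the default; prokka and transdecoder are the alternatives
nextflow run nf-core/metatdenovo -profile docker --input samplesheet.csv --outdir results --orf_caller prokka
nextflow run nf-core/metatdenovo -profile docker --input samplesheet.csv --outdir results --orf_caller transdecoder
```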
+ +### Functional and taxonomical annotation + +#### EggNOG + +[EggNOG-mapper](https://github.com/eggnogdb/eggnog-mapper) will perform an analysis to assign functions to the ORFs. + +
+Output files + +- `eggnog/` + - `*.emapper.annotations.gz`: a file with the results from the annotation phase, see the [EggNOG-mapper documentation](https://github.com/eggnogdb/eggnog-mapper/wiki/). + - `*.emapper.hits.gz`: a file with the results from the search phase, from HMMER, Diamond or MMseqs2. + - `*.emapper.seed_orthologs.gz`: a file with the results from parsing the hits. Each row links a query with a seed ortholog. This file has the same format independently of which searcher was used, except that it can be in short format (4 fields), or full. + +
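A sketch of pointing the pipeline at a pre-downloaded EggNOG database; the directory path is an assumption, and the same parameter is used as `storeDir` for the download step in `conf/modules.config`:

```bash
# Reuse an existing EggNOG-mapper database directory instead of downloading it again
nextflow run nf-core/metatdenovo \
    -profile docker \
    --input samplesheet.csv \
    --outdir results \
    --eggnog_dbpath /path/to/eggnog_db
```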
+ +#### KOfamScan + +[KOfamScan](https://github.com/takaram/kofam_scan) will perform an analysis to assign KEGG orthologs to ORFs. + +
+Output files + +- `kofamscan/` + - `*.kofamscan_output.tsv.gz`: kofamscan output. + +
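As a sketch (the directory path is assumed, and defaults for the skip flags may differ between releases), the KOfamScan profile directory and the skip parameters seen in the test profiles can be set on the command line:

```bash
# Keep KOfamScan profiles in a reusable directory and turn off the other annotation tools
nextflow run nf-core/metatdenovo \
    -profile docker \
    --input samplesheet.csv \
    --outdir results \
    --kofam_dir /path/to/kofam \
    --skip_eggnog \
    --skip_eukulele
```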
+ +#### EUKulele + +[EUKulele](https://github.com/AlexanderLabWHOI/EUKulele) will perform an analysis to assign taxonomy to the ORFs. +A number of databases are supported: MMETSP, PhyloDB and GTDB. +GTDB currently only works as a user provided database, i.e. data must be downloaded before running nf-core/metatdenovo. + +
+Output files + +- `eukulele/assembler.orfcaller/mets_full/diamond/` + - `*.diamond.out.gz`: Diamond output +- `eukulele/assembler.orfcaller/taxonomy_estimation/` + - `*-estimated-taxonomy.out.gz`: EUKulele output + +
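A sketch of selecting the EUKulele database; `phylodb` is the database used by the `test_eukulele` profile, and the database path is an assumed location for a pre-downloaded copy:

```bash
# Taxonomic annotation with EUKulele against PhyloDB, using a local database directory
nextflow run nf-core/metatdenovo \
    -profile docker \
    --input samplesheet.csv \
    --outdir results \
    --eukulele_db phylodb \
    --eukulele_dbpath /path/to/eukulele_db
```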
+ +#### Hmmsearch + +You can run [hmmsearch](https://www.ebi.ac.uk/Tools/hmmer/search/hmmsearch) on ORFs using a set of HMM profiles provided to the pipeline (see the `--hmmdir`, `--hmmpatern` and `--hmmfiles` parameters). + +
+Output files + +- `hmmer/` + - `*.tbl.gz`: Table output gzipped as result of Hmmsearch run. + +
+ +After the search, hits for each ORF and HMM will be summarised and ranked based on scores for the hits (see also output in [summary tables](#summary-tables)). + +
+Output files + +- `hmmrank/` + - `*.tsv.gz`: tab separeted file with the ranked ORFs for each HMM profile. + +
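A sketch of providing HMM profiles on the command line, mirroring the comma-separated list used by the `test` profile (the Pfam files named below are the ones from the test data):

```bash
# Search the ORFs with a set of HMM profiles; hits are then ranked per ORF and HMM
nextflow run nf-core/metatdenovo \
    -profile docker \
    --input samplesheet.csv \
    --outdir results \
    --hmmfiles PF00317.hmm,PF00848.hmm,PF03477.hmm
```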
+ +## Metatdenovo output + +### Summary tables + +Consistently named and formatted output tables in tsv format, ready for further analysis. +Filenames start with the assembly program and the ORF caller, to allow reruns of the pipeline with different parameter settings without overwriting output files.
Output files -* `pipeline_info/` - * Reports generated by Nextflow: `execution_report.html`, `execution_timeline.html`, `execution_trace.txt` and `pipeline_dag.dot`/`pipeline_dag.svg`. - * Reports generated by the pipeline: `pipeline_report.html`, `pipeline_report.txt` and `software_versions.yml`. The `pipeline_report*` files will only be present if the `--email` / `--email_on_fail` parameter's are used when running the pipeline. - * Reformatted samplesheet files used as input to the pipeline: `samplesheet.valid.csv`. +- `summary_tables/` + - `{assembler}.{orf_caller}.overall_stats.tsv.gz`: overall statistics from the pipeline, e.g. number of reads, number of called ORFs, number of reads mapping back to contigs/ORFs etc. + - `{assembler}.{orf_caller}.counts.tsv.gz`: read counts per ORF and sample. + - `{assembler}.{orf_caller}.emapper.tsv.gz`: reformatted output from EggNOG-mapper. + - `{assembler}.{orf_caller}.{db}_eukulele.tsv.gz`: taxonomic annotation per ORF for specific database. + - `{assembler}.{orf_caller}.prokka-annotations.tsv.gz`: reformatted annotation output from Prokka. + - `{assembler}.{orf_caller}.hmmrank.tsv.gz`: ranked summary table from HMMER results.
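As a quick check from the shell, the tables can be inspected directly; the `megahit.prodigal` prefix below assumes the default assembler and ORF caller and will differ for other settings:

```bash
# Peek at the per-sample ORF counts table produced by a default run
zcat results/summary_tables/megahit.prodigal.counts.tsv.gz | head
```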
-[Nextflow](https://www.nextflow.io/docs/latest/tracing.html) provides excellent functionality for generating various reports relevant to the running and execution of the pipeline. This will allow you to troubleshoot errors with the running of the pipeline, and also provide you with other information such as launch commands, run times and resource usage. +### Pipeline information + +
+Output files + +- `pipeline_info/` + - reports generated by Nextflow: `execution_report.html`, `execution_timeline.html`, `execution_trace.txt` and `pipeline_dag.dot`/`pipeline_dag.svg`. + - reports generated by the pipeline: `pipeline_report.html`, `pipeline_report.txt` and `software_versions.yml`. The `pipeline_report*` files will only be present if the `--email` / `--email_on_fail` parameter's are used when running the pipeline. + - reformatted samplesheet files used as input to the pipeline: `samplesheet.valid.csv`. + - parameters used by the pipeline run: `params.json`. + +
diff --git a/docs/usage.md b/docs/usage.md index 0d205d0a..ba80678a 100644 --- a/docs/usage.md +++ b/docs/usage.md @@ -1,4 +1,4 @@ -# nf-core/metatdenovo: Usage +# nf-core/metatdenovo: Usage ## :warning: Please read this documentation on the nf-core website: [https://nf-co.re/metatdenovo/usage](https://nf-co.re/metatdenovo/usage) @@ -6,256 +6,320 @@ ## Introduction - +Metatdenovo is a workflow primarily designed for annotation of metatranscriptomes for which reference genomes are not available. +The approach is to first create an assembly, then call genes and finally quantify and annotate the genes. +Since the workflow includes gene callers and annotation tools and databases both for prokaryotes and eukaryotes, the workflow should be suitable for both +organism groups and mixed communities can be handled by trying different gene callers and comparing the results. -## Samplesheet input +While the rationale for writing the workflow was metatranscriptomes, there is nothing in the workflow that precludes use for single organisms rather than +communities nor genomes rather than transcriptomes. +Instead, the workflow should be usable for any project in which a de novo assembly followed by quantification and annotation is suitable. -You will need to create a samplesheet with information about the samples you would like to analyse before running the pipeline. Use this parameter to specify its location. It has to be a comma-separated file with 3 columns, and a header row as shown in the examples below. +## Running the workflow -```console ---input '[path to samplesheet file]' +### Quickstart + +A typical command for running the workflow is: + +```bash +nextflow run nf-core/metatdenovo -profile docker --outdir results/ --input samples.csv ``` -### Multiple runs of the same sample +### Samplesheet input -The `sample` identifiers have to be the same when you have re-sequenced the same sample more than once e.g. to increase sequencing depth. The pipeline will concatenate the raw reads before performing any downstream analysis. Below is an example for the same sample sequenced across 3 lanes: +You will need to create a samplesheet with information about the samples you would like to analyse before running the pipeline. Use this parameter to specify its location. It must be a comma-separated file with 3 columns, and a header row as shown in the examples below -```console +```csv title="samplesheet.csv" sample,fastq_1,fastq_2 CONTROL_REP1,AEG588A1_S1_L002_R1_001.fastq.gz,AEG588A1_S1_L002_R2_001.fastq.gz CONTROL_REP1,AEG588A1_S1_L003_R1_001.fastq.gz,AEG588A1_S1_L003_R2_001.fastq.gz CONTROL_REP1,AEG588A1_S1_L004_R1_001.fastq.gz,AEG588A1_S1_L004_R2_001.fastq.gz ``` -### Full samplesheet +#### Full samplesheet -The pipeline will auto-detect whether a sample is single- or paired-end using the information provided in the samplesheet. The samplesheet can have as many columns as you desire, however, there is a strict requirement for the first 3 columns to match those defined in the table below. + + -A final samplesheet file consisting of both single- and paired-end data may look something like the one below. This is for 6 samples, where `TREATMENT_REP3` has been sequenced twice. + -```console +A final samplesheet file consisting of samples taken at time 0 and 24 in triplicate may look like the one below. 
+ +```csv title="samplesheet.csv" sample,fastq_1,fastq_2 -CONTROL_REP1,AEG588A1_S1_L002_R1_001.fastq.gz,AEG588A1_S1_L002_R2_001.fastq.gz -CONTROL_REP2,AEG588A2_S2_L002_R1_001.fastq.gz,AEG588A2_S2_L002_R2_001.fastq.gz -CONTROL_REP3,AEG588A3_S3_L002_R1_001.fastq.gz,AEG588A3_S3_L002_R2_001.fastq.gz -TREATMENT_REP1,AEG588A4_S4_L003_R1_001.fastq.gz, -TREATMENT_REP2,AEG588A5_S5_L003_R1_001.fastq.gz, -TREATMENT_REP3,AEG588A6_S6_L003_R1_001.fastq.gz, -TREATMENT_REP3,AEG588A6_S6_L004_R1_001.fastq.gz, +T0a,AEG588A1_S1_L002_R1_001.fastq.gz,AEG588A1_S1_L002_R2_001.fastq.gz +T0b,AEG588A2_S2_L002_R1_001.fastq.gz,AEG588A2_S2_L002_R2_001.fastq.gz +T0c,AEG588A3_S3_L002_R1_001.fastq.gz,AEG588A3_S3_L002_R2_001.fastq.gz +T24a,AEG588A4_S1_L002_R1_001.fastq.gz,AEG588A4_S1_L002_R2_001.fastq.gz +T24b,AEG588A5_S2_L002_R1_001.fastq.gz,AEG588A5_S2_L002_R2_001.fastq.gz +T24c,AEG588A6_S3_L002_R1_001.fastq.gz,AEG588A6_S3_L002_R2_001.fastq.gz ``` -| Column | Description | -|----------------|----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| -| `sample` | Custom sample name. This entry will be identical for multiple sequencing libraries/runs from the same sample. Spaces in sample names are automatically converted to underscores (`_`). | -| `fastq_1` | Full path to FastQ file for Illumina short reads 1. File has to be gzipped and have the extension ".fastq.gz" or ".fq.gz". | -| `fastq_2` | Full path to FastQ file for Illumina short reads 2. File has to be gzipped and have the extension ".fastq.gz" or ".fq.gz". | +| Column | Description | +| --------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `sample` | Custom sample name. This entry will be identical for multiple sequencing libraries/runs from the same sample. Spaces in sample names are automatically converted to underscores (`_`). | +| `fastq_1` | Full path to FastQ file for Illumina short reads 1. File has to be gzipped and have the extension ".fastq.gz" or ".fq.gz". | +| `fastq_2` | Full path to FastQ file for Illumina short reads 2. File has to be gzipped and have the extension ".fastq.gz" or ".fq.gz". | An [example samplesheet](../assets/samplesheet.csv) has been provided with the pipeline. -## Running the pipeline +#### Multiple runs of the same sample -The typical command for running the pipeline is as follows: +The `sample` identifiers have to be the same when you have re-sequenced the same sample more than once e.g. to increase sequencing depth. The pipeline will concatenate the raw reads before performing any downstream analysis. Below is an example for the same sample sequenced across 3 lanes: ```console -nextflow run nf-core/metatdenovo --input samplesheet.csv --genome GRCh37 -profile docker +sample,fastq_1,fastq_2 +T0a,AEG588A1_S1_L002_R1_001.fastq.gz,AEG588A1_S1_L002_R2_001.fastq.gz +T0a,AEG588A1_S1_L003_R1_001.fastq.gz,AEG588A1_S1_L003_R2_001.fastq.gz +T0a,AEG588A1_S1_L004_R1_001.fastq.gz,AEG588A1_S1_L004_R2_001.fastq.gz ``` -This will launch the pipeline with the `docker` configuration profile. See below for more information about profiles. +### Filter/remove sequences from the samples (e.g. 
rRNA sequences with SILVA database) -Note that the pipeline will create the following files in your working directory: +The pipeline can remove potential contaminants using the BBduk program. +Specify a fasta file, gzipped or not, with the --sequence_filter sequences.fasta parameter. +For further documentation, see the [BBduk official website](https://jgi.doe.gov/data-and-tools/software-tools/bbtools/bb-tools-user-guide/bbduk-guide/). -```console -work # Directory containing the nextflow working files -results # Finished results (configurable, see below) -.nextflow_log # Log file from Nextflow -# Other nextflow hidden files, eg. history of pipeline runs and old logs. -``` +### Digital normalization -### Updating the pipeline +Metatdenovo can perform "digital normalization" of the reads before the assembly. +This will reduce coverage of highly abundant sequences and remove sequences that are below a threshold, and can be useful if the data set is too large to assemble but also potentially improve an assembly. +N.B. the digital normalization is done only for the assembly and the full set of sequences will be used for quantification. +To turn on digital normalization, use the `--bbnorm` parameter and, if required, adjust the `--bbnorm_target` and `--bbnorm_min` parameters. -When you run the above command, Nextflow automatically pulls the pipeline code from GitHub and stores it as a cached version. When running the pipeline after this, it will always use the cached version if available - even if the pipeline has been updated since. To make sure that you're running the latest version of the pipeline, make sure that you regularly update the cached version of the pipeline: +> Please, check the [bbnorm](https://jgi.doe.gov/data-and-tools/software-tools/bbtools/bb-tools-user-guide/bbnorm-guide/) documentation for further information about these programs and how digital normalization works. Remember to check [Parameters](https://nf-co.re/metatdenovo/parameters) page for the all options that can be used for this step. -```console -nextflow pull nf-core/metatdenovo -``` +### Assembler options -### Reproducibility +By default, the pipeline uses Megahit (`--assembler megahit`) to assemble the cleaned and trimmed reads to create the reference contigs. +Megahit is fast and it does not require a lot of memory to run, making it ideal for large sets of samples. +The workflow also supports RNAspades, (`--assembler rnaspades` ) as an alternative. -It is a good idea to specify a pipeline version when running the pipeline on your data. This ensures that a specific version of the pipeline code and software are used when you run your pipeline. If you keep using the same tag, you'll be running the same version of the pipeline, even if there have been changes to the code since. +You can also choose to input contigs from an assembly that you made outside the pipeline using the `--assembly file.fna` (where `file.fna` is the name of a fasta file with contigs) option. -First, go to the [nf-core/metatdenovo releases page](https://github.com/nf-core/metatdenovo/releases) and find the latest version number - numeric only (eg. `1.3.1`). Then specify this when running the pipeline with `-r` (one hyphen) - eg. `-r 1.3.1`. +### ORF caller options -This version number will be logged in reports when you run the pipeline, so that you'll know what you used when you look back in the future. +By default, the pipeline uses prodigal (`--orf_caller prodigal` ) to call genes/ORFs from the assembly. 
+This is suitable for prokaryotes, as is the Prokka alternative (`--orf_caller prokka`).
+The latter uses Prodigal internally, making it suitable for prokaryotic genes.
+It also performs functional annotation of ORFs.

-## Core Nextflow arguments

+For eukaryotic genes, we recommend using Transdecoder (`--orf_caller transdecoder`) to call ORFs.

-> **NB:** These options are part of Nextflow and use a _single_ hyphen (pipeline parameters use a double-hyphen).

+### Taxonomic annotation options

-### `-profile`

+Metatdenovo uses EUKulele as the main program for taxonomic annotation.
+EUKulele can be run with different reference datasets.
+The default dataset is PhyloDB (`--eukulele_db phylodb`), which works for mixed communities of prokaryotes and eukaryotes.
+Other database options for running the pipeline are MMETSP (`--eukulele_db mmetsp`; for marine protists) and GTDB (`--eukulele_db gtdb`; for prokaryotes
+[under development]).

-Use this parameter to choose a configuration profile. Profiles can give configuration presets for different compute environments.

+Options:

-Several generic profiles are bundled with the pipeline which instruct the pipeline to use software packaged using different methods (Docker, Singularity, Podman, Shifter, Charliecloud, Conda) - see below. When using Biocontainers, most of these software packaging methods pull Docker containers from quay.io e.g [FastQC](https://quay.io/repository/biocontainers/fastqc) except for Singularity which directly downloads Singularity images via https hosted by the [Galaxy project](https://depot.galaxyproject.org/singularity/) and Conda which downloads and installs software locally from [Bioconda](https://bioconda.github.io/).

+- PhyloDB: default, covers both prokaryotes and eukaryotes
+- MMETSP: marine protists
+- GTDB: prokaryotes, both bacteria and archaea

-> We highly recommend the use of Docker or Singularity containers for full pipeline reproducibility, however when this is not possible, Conda is also supported.

+You can also provide your own database; see the [EUKulele documentation](https://eukulele.readthedocs.io/en/latest/#).

-The pipeline also dynamically loads configurations from [https://github.com/nf-core/configs](https://github.com/nf-core/configs) when it runs, making multiple config profiles for various institutional clusters available at run time. For more information and to see if your system is available in these configs please see the [nf-core/configs documentation](https://github.com/nf-core/configs#documentation).

+Databases are automatically downloaded by the workflow, but if you already have them available you can use `--eukulele_dbpath path/to/db`, pointing
+to the root directory of the EUKulele databases.
+(The default for this parameter is `eukulele`.)

-Note that multiple profiles can be loaded, for example: `-profile test,docker` - the order of arguments is important!
-They are loaded in sequence, so later profiles can overwrite earlier profiles.

+> Please check the [EUKulele documentation](https://eukulele.readthedocs.io/en/latest/#) for more information about the databases.

-If `-profile` is not specified, the pipeline will run locally and expect all software to be installed and available on the `PATH`. This is _not_ recommended. 
- -* `docker` - * A generic configuration profile to be used with [Docker](https://docker.com/) -* `singularity` - * A generic configuration profile to be used with [Singularity](https://sylabs.io/docs/) -* `podman` - * A generic configuration profile to be used with [Podman](https://podman.io/) -* `shifter` - * A generic configuration profile to be used with [Shifter](https://nersc.gitlab.io/development/shifter/how-to-use/) -* `charliecloud` - * A generic configuration profile to be used with [Charliecloud](https://hpc.github.io/charliecloud/) -* `conda` - * A generic configuration profile to be used with [Conda](https://conda.io/docs/). Please only use Conda as a last resort i.e. when it's not possible to run the pipeline with Docker, Singularity, Podman, Shifter or Charliecloud. -* `test` - * A profile with a complete configuration for automated testing - * Includes links to test data so needs no other parameters +> :warning: There is currently a bug in the EUKulele program so that some databases properly do not download properly, check [EUKulele issue](https://github.com/AlexanderLabWHOI/EUKulele/issues/60). Until the developers have fixed this bug, we recommend downloading the database manually. To do so, follow these steps: -### `-resume` +- Create conda environment: -Specify this when restarting a pipeline. Nextflow will used cached results from any pipeline steps where the inputs are the same, continuing from where it got to previously. +```bash +conda create -n eukulele -c akrinos -c bioconda -c conda-forge EUKulele +conda activate EUKulele +``` -You can also supply a run name to resume a specific run: `-resume [run-name]`. Use the `nextflow log` command to show previous run names. +- Download the database you need: -### `-c` +```bash +mkdir eukulele +cd eukulele +EUKulele download --database mmetsp (you can use the name of the database you would like to download) +``` -Specify the path to a specific config file (this is a core Nextflow command). See the [nf-core website documentation](https://nf-co.re/usage/configuration) for more information. +- Fix the problematic database tables: + +```bash +mkdir mmetsp +cd mmetsp +mv reference.pep.fa reference.pep.fa.gz +gunzip reference.pep.fa.gz +create_protein_table.py --infile_peptide reference.pep.fa \ + --infile_taxonomy taxonomy-table.txt --outfile_json prot-map.json \ + --output tax-table.txt --delim "/" --col_source_id Source_ID \ + --taxonomy_col_id taxonomy --column SOURCE_ID +``` -## Custom configuration +> :warning: -### Resource requests + + + +### Functional annotation options + +Besides the functional annotation that the gene caller Prokka gives (see above) there are two general purpose functional annotation programs available +in the workflow: the [eggNOG-mapper](http://eggnog-mapper.embl.de/) and [KofamScan](https://github.com/takaram/kofam_scan). +Both are suitable for both prokaryotic and eukaryotic genes and both are run by default, but can be skipped using the `--skip_eggnog` and +`--skip_kofamscan` options respectivelly. +The tools use large databases which are downloaded automatically but paths can be provided by the user through the `--eggnog_dbpath directory` +and `--kofam_dir dir` parameters respectively. + +A more targeted annotation option offered by the workflow is the possibility for the user to provide a set of +[HMMER HMM profiles](http://eddylab.org/software/hmmer/Userguide.pdf) through the `--hmmdir dir` or `hmmfiles file0.hmm,file1.hmm,...,filen.hmm` parameters. 
+Each HMM file will be used to search the amino acid sequences of the ORF set and the results will be summarized in a tab separated file in which each +ORF-HMM combination will be ranked according to score and E-value. + +## Example pipeline command with some common features -Caused by: - Process `RNASEQ:ALIGN_STAR:STAR_ALIGN (WT_REP1)` terminated with an error exit status (137) +```bash +nextflow run nf-core/metatdenovo -profile docker --input samplesheet.csv --assembler rnaspades --orf_caller prokka --eggnog --eukulele_db gtdb +``` + +In this example, we are running metatdenovo with `rnaspades` as assembler, `prokka` as ORF caller, `eggnog` for functional annotation and EUKulele with the GTDB database for taxonomic annotation. + +Note that the pipeline will create the following files in your working directory: -Command executed: - STAR \ - --genomeDir star \ - --readFilesIn WT_REP1_trimmed.fq.gz \ - --runThreadN 2 \ - --outFileNamePrefix WT_REP1. \ - +```bash +work # Directory containing the nextflow working files + # Finished results in specified location (defined with --outdir) +.nextflow_log # Log file from Nextflow +# Other nextflow hidden files, eg. history of pipeline runs and old logs. +``` -Command exit status: - 137 +If you wish to repeatedly use the same parameters for multiple runs, rather than specifying each flag in the command, you can specify these in a params file. -Command output: - (empty) +Pipeline settings can be provided in a `yaml` or `json` file via `-params-file `. -Command error: - .command.sh: line 9: 30 Killed STAR --genomeDir star --readFilesIn WT_REP1_trimmed.fq.gz --runThreadN 2 --outFileNamePrefix WT_REP1. -Work dir: - /home/pipelinetest/work/9d/172ca5881234073e8d76f2a19c88fb +:::warning +Do not use `-c ` to specify parameters as this will result in errors. Custom config files specified with `-c` must only be used for [tuning process resource specifications](https://nf-co.re/docs/usage/configuration#tuning-workflow-resources), other infrastructural tweaks (such as output directories), or module arguments (args). +::: -Tip: you can replicate the issue by changing to the process work dir and entering the command `bash .command.run` +The above pipeline run specified with a params file in yaml format: + +```bash +nextflow run nf-core/metatdenovo -profile docker -params-file params.yaml ``` -To bypass this error you would need to find exactly which resources are set by the `STAR_ALIGN` process. The quickest way is to search for `process STAR_ALIGN` in the [nf-core/rnaseq Github repo](https://github.com/nf-core/rnaseq/search?q=process+STAR_ALIGN). We have standardised the structure of Nextflow DSL2 pipelines such that all module files will be present in the `modules/` directory and so based on the search results the file we want is `modules/nf-core/software/star/align/main.nf`. If you click on the link to that file you will notice that there is a `label` directive at the top of the module that is set to [`label process_high`](https://github.com/nf-core/rnaseq/blob/4c27ef5610c87db00c3c5a3eed10b1d161abf575/modules/nf-core/software/star/align/main.nf#L9). The [Nextflow `label`](https://www.nextflow.io/docs/latest/process.html#label) directive allows us to organise workflow processes in separate groups which can be referenced in a configuration file to select and configure subset of processes having similar computing requirements. 
The default values for the `process_high` label are set in the pipeline's [`base.config`](https://github.com/nf-core/rnaseq/blob/4c27ef5610c87db00c3c5a3eed10b1d161abf575/conf/base.config#L33-L37) which in this case is defined as 72GB. Providing you haven't set any other standard nf-core parameters to __cap__ the [maximum resources](https://nf-co.re/usage/configuration#max-resources) used by the pipeline then we can try and bypass the `STAR_ALIGN` process failure by creating a custom config file that sets at least 72GB of memory, in this case increased to 100GB. The custom config below can then be provided to the pipeline via the [`-c`](#-c) parameter as highlighted in previous sections. +with `params.yaml` containing: -```nextflow -process { - withName: STAR_ALIGN { - memory = 100.GB - } -} +```yaml +input: 'samplesheet.csv' +assembler: 'rnaspades' +orf_caller: 'prokka' +eggnog: true +eukulele_db: 'gtdb' +<...> ``` -> **NB:** We specify just the process name i.e. `STAR_ALIGN` in the config file and not the full task name string that is printed to screen in the error message or on the terminal whilst the pipeline is running i.e. `RNASEQ:ALIGN_STAR:STAR_ALIGN`. You may get a warning suggesting that the process selector isn't recognised but you can ignore that if the process name has been specified correctly. This is something that needs to be fixed upstream in core Nextflow. +You can also generate such `YAML`/`JSON` files via [nf-core/launch](https://nf-co.re/launch). -### Tool-specific options +### Updating the pipeline -For the ultimate flexibility, we have implemented and are using Nextflow DSL2 modules in a way where it is possible for both developers and users to change tool-specific command-line arguments (e.g. providing an additional command-line argument to the `STAR_ALIGN` process) as well as publishing options (e.g. saving files produced by the `STAR_ALIGN` process that aren't saved by default by the pipeline). In the majority of instances, as a user you won't have to change the default options set by the pipeline developer(s), however, there may be edge cases where creating a simple custom config file can improve the behaviour of the pipeline if for example it is failing due to a weird error that requires setting a tool-specific parameter to deal with smaller / larger genomes. +When you run the above command, Nextflow automatically pulls the pipeline code from GitHub and stores it as a cached version. When running the pipeline after this, it will always use the cached version if available - even if the pipeline has been updated since. To make sure that you're running the latest version of the pipeline, make sure that you regularly update the cached version of the pipeline: -The command-line arguments passed to STAR in the `STAR_ALIGN` module are a combination of: +```bash +nextflow pull nf-core/metatdenovo +``` -* Mandatory arguments or those that need to be evaluated within the scope of the module, as supplied in the [`script`](https://github.com/nf-core/rnaseq/blob/4c27ef5610c87db00c3c5a3eed10b1d161abf575/modules/nf-core/software/star/align/main.nf#L49-L55) section of the module file. +### Reproducibility -* An [`options.args`](https://github.com/nf-core/rnaseq/blob/4c27ef5610c87db00c3c5a3eed10b1d161abf575/modules/nf-core/software/star/align/main.nf#L56) string of non-mandatory parameters that is set to be empty by default in the module but can be overwritten when including the module in the sub-workflow / workflow context via the `addParams` Nextflow option. 
+It is a good idea to specify a pipeline version when running the pipeline on your data. This ensures that a specific version of the pipeline code and software are used when you run your pipeline. If you keep using the same tag, you'll be running the same version of the pipeline, even if there have been changes to the code since. -The nf-core/rnaseq pipeline has a sub-workflow (see [terminology](https://github.com/nf-core/modules#terminology)) specifically to align reads with STAR and to sort, index and generate some basic stats on the resulting BAM files using SAMtools. At the top of this file we import the `STAR_ALIGN` module via the Nextflow [`include`](https://github.com/nf-core/rnaseq/blob/4c27ef5610c87db00c3c5a3eed10b1d161abf575/subworkflows/nf-core/align_star.nf#L10) keyword and by default the options passed to the module via the `addParams` option are set as an empty Groovy map [here](https://github.com/nf-core/rnaseq/blob/4c27ef5610c87db00c3c5a3eed10b1d161abf575/subworkflows/nf-core/align_star.nf#L5); this in turn means `options.args` will be set to empty by default in the module file too. This is an intentional design choice and allows us to implement well-written sub-workflows composed of a chain of tools that by default run with the bare minimum parameter set for any given tool in order to make it much easier to share across pipelines and to provide the flexibility for users and developers to customise any non-mandatory arguments. +First, go to the [nf-core/metatdenovo releases page](https://github.com/nf-core/metatdenovo/releases) and find the latest pipeline version - numeric only (eg. `1.3.1`). Then specify this when running the pipeline with `-r` (one hyphen) - eg. `-r 1.3.1`. Of course, you can switch to another version by changing the number after the `-r` flag. -When including the sub-workflow above in the main pipeline workflow we use the same `include` statement, however, we now have the ability to overwrite options for each of the tools in the sub-workflow including the [`align_options`](https://github.com/nf-core/rnaseq/blob/4c27ef5610c87db00c3c5a3eed10b1d161abf575/workflows/rnaseq.nf#L225) variable that will be used specifically to overwrite the optional arguments passed to the `STAR_ALIGN` module. In this case, the options to be provided to `STAR_ALIGN` have been assigned sensible defaults by the developer(s) in the pipeline's [`modules.config`](https://github.com/nf-core/rnaseq/blob/4c27ef5610c87db00c3c5a3eed10b1d161abf575/conf/modules.config#L70-L74) and can be accessed and customised in the [workflow context](https://github.com/nf-core/rnaseq/blob/4c27ef5610c87db00c3c5a3eed10b1d161abf575/workflows/rnaseq.nf#L201-L204) too before eventually passing them to the sub-workflow as a Groovy map called `star_align_options`. These options will then be propagated from `workflow -> sub-workflow -> module`. +This version number will be logged in reports when you run the pipeline, so that you'll know what you used when you look back in the future. For example, at the bottom of the MultiQC reports. -As mentioned at the beginning of this section it may also be necessary for users to overwrite the options passed to modules to be able to customise specific aspects of the way in which a particular tool is executed by the pipeline. 
Given that all of the default module options are stored in the pipeline's `modules.config` as a [`params` variable](https://github.com/nf-core/rnaseq/blob/4c27ef5610c87db00c3c5a3eed10b1d161abf575/conf/modules.config#L24-L25) it is also possible to overwrite any of these options via a custom config file. +To further assist in reproducbility, you can use share and re-use [parameter files](#running-the-pipeline) to repeat pipeline runs with the same settings without having to write out a command with every single parameter. -Say for example we want to append an additional, non-mandatory parameter (i.e. `--outFilterMismatchNmax 16`) to the arguments passed to the `STAR_ALIGN` module. Firstly, we need to copy across the default `args` specified in the [`modules.config`](https://github.com/nf-core/rnaseq/blob/4c27ef5610c87db00c3c5a3eed10b1d161abf575/conf/modules.config#L71) and create a custom config file that is a composite of the default `args` as well as the additional options you would like to provide. This is very important because Nextflow will overwrite the default value of `args` that you provide via the custom config. +:::tip +If you wish to share such profile (such as uploaded as supplementary material for academic publications), make sure to NOT include cluster specific paths to files, nor institutional specific profiles. +::: -As you will see in the example below, we have: +## Core Nextflow arguments -* appended `--outFilterMismatchNmax 16` to the default `args` used by the module. -* changed the default `publish_dir` value to where the files will eventually be published in the main results directory. -* appended `'bam':''` to the default value of `publish_files` so that the BAM files generated by the process will also be saved in the top-level results directory for the module. Note: `'out':'log'` means any file/directory ending in `out` will now be saved in a separate directory called `my_star_directory/log/`. +:::note +These options are part of Nextflow and use a _single_ hyphen (pipeline parameters use a double-hyphen). +::: -```nextflow -params { - modules { - 'star_align' { - args = "--quantMode TranscriptomeSAM --twopassMode Basic --outSAMtype BAM Unsorted --readFilesCommand zcat --runRNGseed 0 --outFilterMultimapNmax 20 --alignSJDBoverhangMin 1 --outSAMattributes NH HI AS NM MD --quantTranscriptomeBan Singleend --outFilterMismatchNmax 16" - publish_dir = "my_star_directory" - publish_files = ['out':'log', 'tab':'log', 'bam':''] - } - } -} -``` +### `-profile` + +Use this parameter to choose a configuration profile. Profiles can give configuration presets for different compute environments. + +Several generic profiles are bundled with the pipeline which instruct the pipeline to use software packaged using different methods (Docker, Singularity, Podman, Shifter, Charliecloud, Apptainer, Conda) - see below. + +:::info +We highly recommend the use of Docker or Singularity containers for full pipeline reproducibility, however when this is not possible, Conda is also supported. +::: + +The pipeline also dynamically loads configurations from [https://github.com/nf-core/configs](https://github.com/nf-core/configs) when it runs, making multiple config profiles for various institutional clusters available at run time. For more information and to see if your system is available in these configs please see the [nf-core/configs documentation](https://github.com/nf-core/configs#documentation). 
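For example, an institutional profile from nf-core/configs is selected in the same way as the generic software profiles. A minimal sketch, using `uppmax` purely as an illustration of an institutional profile name (substitute the profile for your own system):

```bash
# Run using an institutional configuration profile loaded from nf-core/configs
nextflow run nf-core/metatdenovo -profile uppmax --input samplesheet.csv --outdir results
```
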
-### Updating containers +Note that multiple profiles can be loaded, for example: `-profile test,docker` - the order of arguments is important! +They are loaded in sequence, so later profiles can overwrite earlier profiles. -The [Nextflow DSL2](https://www.nextflow.io/docs/latest/dsl2.html) implementation of this pipeline uses one container per process which makes it much easier to maintain and update software dependencies. If for some reason you need to use a different version of a particular tool with the pipeline then you just need to identify the `process` name and override the Nextflow `container` definition for that process using the `withName` declaration. For example, in the [nf-core/viralrecon](https://nf-co.re/viralrecon) pipeline a tool called [Pangolin](https://github.com/cov-lineages/pangolin) has been used during the COVID-19 pandemic to assign lineages to SARS-CoV-2 genome sequenced samples. Given that the lineage assignments change quite frequently it doesn't make sense to re-release the nf-core/viralrecon everytime a new version of Pangolin has been released. However, you can override the default container used by the pipeline by creating a custom config file and passing it as a command-line argument via `-c custom.config`. +If `-profile` is not specified, the pipeline will run locally and expect all software to be installed and available on the `PATH`. This is _not_ recommended, since it can lead to different results on different machines dependent on the computer enviroment. + +- `test` + - A profile with a complete configuration for automated testing + - Includes links to test data so needs no other parameters +- `docker` + - A generic configuration profile to be used with [Docker](https://docker.com/) +- `singularity` + - A generic configuration profile to be used with [Singularity](https://sylabs.io/docs/) +- `podman` + - A generic configuration profile to be used with [Podman](https://podman.io/) +- `shifter` + - A generic configuration profile to be used with [Shifter](https://nersc.gitlab.io/development/shifter/how-to-use/) +- `charliecloud` + - A generic configuration profile to be used with [Charliecloud](https://hpc.github.io/charliecloud/) +- `apptainer` + - A generic configuration profile to be used with [Apptainer](https://apptainer.org/) +- `conda` + - A generic configuration profile to be used with [Conda](https://conda.io/docs/). Please only use Conda as a last resort i.e. when it's not possible to run the pipeline with Docker, Singularity, Podman, Shifter, Charliecloud, or Apptainer. -1. Check the default version used by the pipeline in the module file for [Pangolin](https://github.com/nf-core/viralrecon/blob/a85d5969f9025409e3618d6c280ef15ce417df65/modules/nf-core/software/pangolin/main.nf#L14-L19) -2. Find the latest version of the Biocontainer available on [Quay.io](https://quay.io/repository/biocontainers/pangolin?tag=latest&tab=tags) -3. Create the custom config accordingly: +### `-resume` - * For Docker: +Specify this when restarting a pipeline. Nextflow will use cached results from any pipeline steps where the inputs are the same, continuing from where it got to previously. For input to be considered the same, not only the names must be identical but the files' contents as well. For more info about this parameter, see [this blog post](https://www.nextflow.io/blog/2019/demystifying-nextflow-resume.html). 
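For example, an interrupted run can be re-launched with the same command plus `-resume` (a sketch; adjust the profile and paths to match your original command):

```bash
# Re-launch a run, re-using cached results for all steps whose inputs are unchanged
nextflow run nf-core/metatdenovo -profile docker --input samplesheet.csv --outdir results -resume
```
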
- ```nextflow - process { - withName: PANGOLIN { - container = 'quay.io/biocontainers/pangolin:3.0.5--pyhdfd78af_0' - } - } - ``` +You can also supply a run name to resume a specific run: `-resume [run-name]`. Use the `nextflow log` command to show previous run names. - * For Singularity: +### `-c` - ```nextflow - process { - withName: PANGOLIN { - container = 'https://depot.galaxyproject.org/singularity/pangolin:3.0.5--pyhdfd78af_0' - } - } - ``` +Specify the path to a specific config file (this is a core Nextflow command). See the [nf-core website documentation](https://nf-co.re/usage/configuration) for more information. - * For Conda: +## Custom configuration - ```nextflow - process { - withName: PANGOLIN { - conda = 'bioconda::pangolin=3.0.5' - } - } - ``` +### Resource requests + +Whilst the default requirements set within the pipeline will hopefully work for most people and with most input data, you may find that you want to customise the compute resources that the pipeline requests. Each step in the pipeline has a default set of requirements for number of CPUs, memory and time. For most of the steps in the pipeline, if the job exits with any of the error codes specified [here](https://github.com/nf-core/rnaseq/blob/4c27ef5610c87db00c3c5a3eed10b1d161abf575/conf/base.config#L18) it will automatically be resubmitted with higher requests (2 x original, then 3 x original). If it still fails after the third attempt then the pipeline execution is stopped. -> **NB:** If you wish to periodically update individual tool-specific results (e.g. Pangolin) generated by the pipeline then you must ensure to keep the `work/` directory otherwise the `-resume` ability of the pipeline will be compromised and it will restart from scratch. +To change the resource requests, please see the [max resources](https://nf-co.re/docs/usage/configuration#max-resources) and [tuning workflow resources](https://nf-co.re/docs/usage/configuration#tuning-workflow-resources) section of the nf-core website. + +### Custom Containers + +In some cases you may wish to change which container or conda environment a step of the pipeline uses for a particular tool. By default nf-core pipelines use containers and software from the [biocontainers](https://biocontainers.pro/) or [bioconda](https://bioconda.github.io/) projects. However in some cases the pipeline specified version maybe out of date. + +To use a different container from the default container or conda environment specified in a pipeline, please see the [updating tool versions](https://nf-co.re/docs/usage/configuration#updating-tool-versions) section of the nf-core website. + +### Custom Tool Arguments + +A pipeline might not always support every possible argument or option of a particular tool used in pipeline. Fortunately, nf-core pipelines provide some freedom to users to insert additional parameters that the pipeline does not include by default. + +To learn how to provide additional arguments to a particular tool of the pipeline, please see the [customising tool arguments](https://nf-co.re/docs/usage/configuration#customising-tool-arguments) section of the nf-core website. ### nf-core/configs @@ -265,6 +329,14 @@ See the main [Nextflow documentation](https://www.nextflow.io/docs/latest/config If you have any questions or issues please send us a message on [Slack](https://nf-co.re/join/slack) on the [`#configs` channel](https://nfcore.slack.com/channels/configs). 
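As a minimal sketch of how the custom configuration described above is supplied in practice, `custom.config` below is a hypothetical file holding your resource, container or module-argument overrides:

```bash
# Pass a custom configuration file alongside the usual profiles and parameters
nextflow run nf-core/metatdenovo -profile docker --input samplesheet.csv --outdir results -c custom.config
```
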
+## Azure Resource Requests + +To be used with the `azurebatch` profile by specifying the `-profile azurebatch`. +We recommend providing a compute `params.vm_type` of `Standard_D16_v3` VMs by default but these options can be changed if required. + +Note that the choice of VM size depends on your quota and the overall workload during the analysis. +For a thorough list, please refer the [Azure Sizes for virtual machines in Azure](https://docs.microsoft.com/en-us/azure/virtual-machines/sizes). + ## Running in the background Nextflow handles job submissions and supervises the running jobs. The Nextflow process must run until the pipeline is finished. @@ -279,6 +351,6 @@ Some HPC setups also allow you to run nextflow within a cluster job submitted yo In some cases, the Nextflow Java virtual machines can start to request a large amount of memory. We recommend adding the following line to your environment to limit this (typically in `~/.bashrc` or `~./bash_profile`): -```console +```bash NXF_OPTS='-Xms1g -Xmx4g' ``` diff --git a/lib/NfcoreSchema.groovy b/lib/NfcoreSchema.groovy index 8d6920dd..33cd4f6e 100755 --- a/lib/NfcoreSchema.groovy +++ b/lib/NfcoreSchema.groovy @@ -27,7 +27,7 @@ class NfcoreSchema { /* groovylint-disable-next-line UnusedPrivateMethodParameter */ public static void validateParameters(workflow, params, log, schema_filename='nextflow_schema.json') { def has_error = false - //=====================================================================// + //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// // Check for nextflow core params and unexpected params def json = new File(getSchemaPath(workflow, schema_filename=schema_filename)).text def Map schemaParams = (Map) new JsonSlurper().parseText(json).get('definitions') @@ -46,7 +46,6 @@ class NfcoreSchema { 'quiet', 'syslog', 'v', - 'version', // Options for `nextflow run` command 'ansi', @@ -105,9 +104,13 @@ class NfcoreSchema { // Collect expected parameters from the schema def expectedParams = [] + def enums = [:] for (group in schemaParams) { for (p in group.value['properties']) { expectedParams.push(p.key) + if (group.value['properties'][p.key].containsKey('enum')) { + enums[p.key] = group.value['properties'][p.key]['enum'] + } } } @@ -131,7 +134,7 @@ class NfcoreSchema { } } - //=====================================================================// + //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// // Validate parameters against the schema InputStream input_stream = new File(getSchemaPath(workflow, schema_filename=schema_filename)).newInputStream() JSONObject raw_schema = new JSONObject(new JSONTokener(input_stream)) @@ -155,7 +158,7 @@ class NfcoreSchema { println '' log.error 'ERROR: Validation of pipeline parameters failed!' JSONObject exceptionJSON = e.toJSON() - printExceptions(exceptionJSON, params_json, log) + printExceptions(exceptionJSON, params_json, log, enums) println '' has_error = true } @@ -202,7 +205,7 @@ class NfcoreSchema { } def type = '[' + group_params.get(param).type + ']' def description = group_params.get(param).description - def defaultValue = group_params.get(param).default ? " [default: " + group_params.get(param).default.toString() + "]" : '' + def defaultValue = group_params.get(param).default != null ? 
" [default: " + group_params.get(param).default.toString() + "]" : '' def description_default = description + colors.dim + defaultValue + colors.reset // Wrap long description texts // Loosely based on https://dzone.com/articles/groovy-plain-text-word-wrap @@ -260,13 +263,12 @@ class NfcoreSchema { // Get pipeline parameters defined in JSON Schema def Map params_summary = [:] - def blacklist = ['hostnames'] def params_map = paramsLoad(getSchemaPath(workflow, schema_filename=schema_filename)) for (group in params_map.keySet()) { def sub_params = new LinkedHashMap() def group_params = params_map.get(group) // This gets the parameters of that particular group for (param in group_params.keySet()) { - if (params.containsKey(param) && !blacklist.contains(param)) { + if (params.containsKey(param)) { def params_value = params.get(param) def schema_value = group_params.get(param).default def param_type = group_params.get(param).type @@ -330,7 +332,7 @@ class NfcoreSchema { // // Loop over nested exceptions and print the causingException // - private static void printExceptions(ex_json, params_json, log) { + private static void printExceptions(ex_json, params_json, log, enums, limit=5) { def causingExceptions = ex_json['causingExceptions'] if (causingExceptions.length() == 0) { def m = ex_json['message'] =~ /required key \[([^\]]+)\] not found/ @@ -346,11 +348,20 @@ class NfcoreSchema { else { def param = ex_json['pointerToViolation'] - ~/^#\// def param_val = params_json[param].toString() - log.error "* --${param}: ${ex_json['message']} (${param_val})" + if (enums.containsKey(param)) { + def error_msg = "* --${param}: '${param_val}' is not a valid choice (Available choices" + if (enums[param].size() > limit) { + log.error "${error_msg} (${limit} of ${enums[param].size()}): ${enums[param][0..limit-1].join(', ')}, ... )" + } else { + log.error "${error_msg}: ${enums[param].join(', ')})" + } + } else { + log.error "* --${param}: ${ex_json['message']} (${param_val})" + } } } for (ex in causingExceptions) { - printExceptions(ex, params_json, log) + printExceptions(ex, params_json, log, enums) } } diff --git a/lib/NfcoreTemplate.groovy b/lib/NfcoreTemplate.groovy index 44551e0a..e248e4c3 100755 --- a/lib/NfcoreTemplate.groovy +++ b/lib/NfcoreTemplate.groovy @@ -3,6 +3,8 @@ // import org.yaml.snakeyaml.Yaml +import groovy.json.JsonOutput +import nextflow.extension.FilesEx class NfcoreTemplate { @@ -19,30 +21,38 @@ class NfcoreTemplate { } // - // Check params.hostnames + // Warn if a -profile or Nextflow config has not been provided to run the pipeline // - public static void hostName(workflow, params, log) { - Map colors = logColours(params.monochrome_logs) - if (params.hostnames) { - try { - def hostname = "hostname".execute().text.trim() - params.hostnames.each { prof, hnames -> - hnames.each { hname -> - if (hostname.contains(hname) && !workflow.profile.contains(prof)) { - log.info "=${colors.yellow}====================================================${colors.reset}=\n" + - "${colors.yellow}WARN: You are running with `-profile $workflow.profile`\n" + - " but your machine hostname is ${colors.white}'$hostname'${colors.reset}.\n" + - " ${colors.yellow_bold}Please use `-profile $prof${colors.reset}`\n" + - "=${colors.yellow}====================================================${colors.reset}=" - } - } - } - } catch (Exception e) { - log.warn "[$workflow.manifest.name] Could not determine 'hostname' - skipping check. Reason: ${e.message}." 
- } + public static void checkConfigProvided(workflow, log) { + if (workflow.profile == 'standard' && workflow.configFiles.size() <= 1) { + log.warn "[$workflow.manifest.name] You are attempting to run the pipeline without any custom configuration!\n\n" + + "This will be dependent on your local compute environment but can be achieved via one or more of the following:\n" + + " (1) Using an existing pipeline profile e.g. `-profile docker` or `-profile singularity`\n" + + " (2) Using an existing nf-core/configs for your Institution e.g. `-profile crick` or `-profile uppmax`\n" + + " (3) Using your own local custom config e.g. `-c /path/to/your/custom.config`\n\n" + + "Please refer to the quick start section and usage docs for the pipeline.\n " } } + // + // Generate version string + // + public static String version(workflow) { + String version_string = "" + + if (workflow.manifest.version) { + def prefix_v = workflow.manifest.version[0] != 'v' ? 'v' : '' + version_string += "${prefix_v}${workflow.manifest.version}" + } + + if (workflow.commitId) { + def git_shortsha = workflow.commitId.substring(0, 7) + version_string += "-g${git_shortsha}" + } + + return version_string + } + // // Construct and send completion email // @@ -72,7 +82,7 @@ class NfcoreTemplate { misc_fields['Nextflow Compile Timestamp'] = workflow.nextflow.timestamp def email_fields = [:] - email_fields['version'] = workflow.manifest.version + email_fields['version'] = NfcoreTemplate.version(workflow) email_fields['runName'] = workflow.runName email_fields['success'] = workflow.success email_fields['dateComplete'] = workflow.complete @@ -120,7 +130,7 @@ class NfcoreTemplate { def email_html = html_template.toString() // Render the sendmail template - def max_multiqc_email_size = params.max_multiqc_email_size as nextflow.util.MemoryUnit + def max_multiqc_email_size = (params.containsKey('max_multiqc_email_size') ? 
params.max_multiqc_email_size : 0) as nextflow.util.MemoryUnit def smail_fields = [ email: email_address, subject: subject, email_txt: email_txt, email_html: email_html, projectDir: "$projectDir", mqcFile: mqc_report, mqcMaxSize: max_multiqc_email_size.toBytes() ] def sf = new File("$projectDir/assets/sendmail_template.txt") def sendmail_template = engine.createTemplate(sf).make(smail_fields) @@ -132,12 +142,14 @@ class NfcoreTemplate { try { if (params.plaintext_email) { throw GroovyException('Send plaintext e-mail, not HTML') } // Try to send HTML e-mail using sendmail + def sendmail_tf = new File(workflow.launchDir.toString(), ".sendmail_tmp.html") + sendmail_tf.withWriter { w -> w << sendmail_html } [ 'sendmail', '-t' ].execute() << sendmail_html log.info "-${colors.purple}[$workflow.manifest.name]${colors.green} Sent summary e-mail to $email_address (sendmail)-" } catch (all) { // Catch failures and try with plaintext def mail_cmd = [ 'mail', '-s', subject, '--content-type=text/html', email_address ] - if ( mqc_report.size() <= max_multiqc_email_size.toBytes() ) { + if ( mqc_report != null && mqc_report.size() <= max_multiqc_email_size.toBytes() ) { mail_cmd += [ '-A', mqc_report ] } mail_cmd.execute() << email_html @@ -146,14 +158,88 @@ class NfcoreTemplate { } // Write summary e-mail HTML to a file - def output_d = new File("${params.outdir}/pipeline_info/") - if (!output_d.exists()) { - output_d.mkdirs() - } - def output_hf = new File(output_d, "pipeline_report.html") + def output_hf = new File(workflow.launchDir.toString(), ".pipeline_report.html") output_hf.withWriter { w -> w << email_html } - def output_tf = new File(output_d, "pipeline_report.txt") + FilesEx.copyTo(output_hf.toPath(), "${params.outdir}/pipeline_info/pipeline_report.html"); + output_hf.delete() + + // Write summary e-mail TXT to a file + def output_tf = new File(workflow.launchDir.toString(), ".pipeline_report.txt") output_tf.withWriter { w -> w << email_txt } + FilesEx.copyTo(output_tf.toPath(), "${params.outdir}/pipeline_info/pipeline_report.txt"); + output_tf.delete() + } + + // + // Construct and send a notification to a web server as JSON + // e.g. 
Microsoft Teams and Slack + // + public static void IM_notification(workflow, params, summary_params, projectDir, log) { + def hook_url = params.hook_url + + def summary = [:] + for (group in summary_params.keySet()) { + summary << summary_params[group] + } + + def misc_fields = [:] + misc_fields['start'] = workflow.start + misc_fields['complete'] = workflow.complete + misc_fields['scriptfile'] = workflow.scriptFile + misc_fields['scriptid'] = workflow.scriptId + if (workflow.repository) misc_fields['repository'] = workflow.repository + if (workflow.commitId) misc_fields['commitid'] = workflow.commitId + if (workflow.revision) misc_fields['revision'] = workflow.revision + misc_fields['nxf_version'] = workflow.nextflow.version + misc_fields['nxf_build'] = workflow.nextflow.build + misc_fields['nxf_timestamp'] = workflow.nextflow.timestamp + + def msg_fields = [:] + msg_fields['version'] = NfcoreTemplate.version(workflow) + msg_fields['runName'] = workflow.runName + msg_fields['success'] = workflow.success + msg_fields['dateComplete'] = workflow.complete + msg_fields['duration'] = workflow.duration + msg_fields['exitStatus'] = workflow.exitStatus + msg_fields['errorMessage'] = (workflow.errorMessage ?: 'None') + msg_fields['errorReport'] = (workflow.errorReport ?: 'None') + msg_fields['commandLine'] = workflow.commandLine.replaceFirst(/ +--hook_url +[^ ]+/, "") + msg_fields['projectDir'] = workflow.projectDir + msg_fields['summary'] = summary << misc_fields + + // Render the JSON template + def engine = new groovy.text.GStringTemplateEngine() + // Different JSON depending on the service provider + // Defaults to "Adaptive Cards" (https://adaptivecards.io), except Slack which has its own format + def json_path = hook_url.contains("hooks.slack.com") ? "slackreport.json" : "adaptivecard.json" + def hf = new File("$projectDir/assets/${json_path}") + def json_template = engine.createTemplate(hf).make(msg_fields) + def json_message = json_template.toString() + + // POST + def post = new URL(hook_url).openConnection(); + post.setRequestMethod("POST") + post.setDoOutput(true) + post.setRequestProperty("Content-Type", "application/json") + post.getOutputStream().write(json_message.getBytes("UTF-8")); + def postRC = post.getResponseCode(); + if (! 
postRC.equals(200)) { + log.warn(post.getErrorStream().getText()); + } + } + + // + // Dump pipeline parameters in a json file + // + public static void dump_parameters(workflow, params) { + def timestamp = new java.util.Date().format( 'yyyy-MM-dd_HH-mm-ss') + def filename = "params_${timestamp}.json" + def temp_pf = new File(workflow.launchDir.toString(), ".${filename}") + def jsonStr = JsonOutput.toJson(params) + temp_pf.text = JsonOutput.prettyPrint(jsonStr) + + FilesEx.copyTo(temp_pf.toPath(), "${params.outdir}/pipeline_info/params_${timestamp}.json") + temp_pf.delete() } // @@ -165,10 +251,9 @@ class NfcoreTemplate { if (workflow.stats.ignoredCount == 0) { log.info "-${colors.purple}[$workflow.manifest.name]${colors.green} Pipeline completed successfully${colors.reset}-" } else { - log.info "-${colors.purple}[$workflow.manifest.name]${colors.red} Pipeline completed successfully, but with errored process(es) ${colors.reset}-" + log.info "-${colors.purple}[$workflow.manifest.name]${colors.yellow} Pipeline completed successfully, but with errored process(es) ${colors.reset}-" } } else { - hostName(workflow, params, log) log.info "-${colors.purple}[$workflow.manifest.name]${colors.red} Pipeline completed with errors${colors.reset}-" } } @@ -254,6 +339,7 @@ class NfcoreTemplate { // public static String logo(workflow, monochrome_logs) { Map colors = logColours(monochrome_logs) + String workflow_version = NfcoreTemplate.version(workflow) String.format( """\n ${dashedLine(monochrome_logs)} @@ -262,7 +348,7 @@ class NfcoreTemplate { ${colors.blue} |\\ | |__ __ / ` / \\ |__) |__ ${colors.yellow}} {${colors.reset} ${colors.blue} | \\| | \\__, \\__/ | \\ |___ ${colors.green}\\`-._,-`-,${colors.reset} ${colors.green}`._,._,\'${colors.reset} - ${colors.purple} ${workflow.manifest.name} v${workflow.manifest.version}${colors.reset} + ${colors.purple} ${workflow.manifest.name} ${workflow_version}${colors.reset} ${dashedLine(monochrome_logs)} """.stripIndent() ) diff --git a/lib/Utils.groovy b/lib/Utils.groovy old mode 100755 new mode 100644 index 18173e98..8d030f4e --- a/lib/Utils.groovy +++ b/lib/Utils.groovy @@ -21,27 +21,27 @@ class Utils { } // Check that all channels are present - def required_channels = ['conda-forge', 'bioconda', 'defaults'] - def conda_check_failed = !required_channels.every { ch -> ch in channels } + // This channel list is ordered by required channel priority. 
+ def required_channels_in_order = ['conda-forge', 'bioconda', 'defaults'] + def channels_missing = ((required_channels_in_order as Set) - (channels as Set)) as Boolean // Check that they are in the right order - conda_check_failed |= !(channels.indexOf('conda-forge') < channels.indexOf('bioconda')) - conda_check_failed |= !(channels.indexOf('bioconda') < channels.indexOf('defaults')) + def channel_priority_violation = false + def n = required_channels_in_order.size() + for (int i = 0; i < n - 1; i++) { + channel_priority_violation |= !(channels.indexOf(required_channels_in_order[i]) < channels.indexOf(required_channels_in_order[i+1])) + } - if (conda_check_failed) { - log.warn "=============================================================================\n" + + if (channels_missing | channel_priority_violation) { + log.warn "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~\n" + " There is a problem with your Conda configuration!\n\n" + " You will need to set-up the conda-forge and bioconda channels correctly.\n" + - " Please refer to https://bioconda.github.io/user/install.html#set-up-channels\n" + - " NB: The order of the channels matters!\n" + - "===================================================================================" + " Please refer to https://bioconda.github.io/\n" + + " The observed channel order is \n" + + " ${channels}\n" + + " but the following channel order is required:\n" + + " ${required_channels_in_order}\n" + + "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~" } } - - // - // Join module args with appropriate spacing - // - public static String joinModuleArgs(args_list) { - return ' ' + args_list.join(' ') - } } diff --git a/lib/WorkflowMain.groovy b/lib/WorkflowMain.groovy index d32af566..3b6502b6 100755 --- a/lib/WorkflowMain.groovy +++ b/lib/WorkflowMain.groovy @@ -2,6 +2,8 @@ // This file holds several functions specific to the main.nf workflow in the nf-core/metatdenovo pipeline // +import nextflow.Nextflow + class WorkflowMain { // @@ -9,86 +11,63 @@ class WorkflowMain { // public static String citation(workflow) { return "If you use ${workflow.manifest.name} for your analysis please cite:\n\n" + - // TODO nf-core: Add Zenodo DOI for pipeline after first release - //"* The pipeline\n" + - //" https://doi.org/10.5281/zenodo.XXXXXXX\n\n" + "* The nf-core framework\n" + " https://doi.org/10.1038/s41587-020-0439-x\n\n" + "* Software dependencies\n" + " https://github.com/${workflow.manifest.name}/blob/master/CITATIONS.md" } - // - // Print help to screen if required - // - public static String help(workflow, params, log) { - def command = "nextflow run ${workflow.manifest.name} --input samplesheet.csv --genome GRCh37 -profile docker" - def help_string = '' - help_string += NfcoreTemplate.logo(workflow, params.monochrome_logs) - help_string += NfcoreSchema.paramsHelp(workflow, params, command) - help_string += '\n' + citation(workflow) + '\n' - help_string += NfcoreTemplate.dashedLine(params.monochrome_logs) - return help_string - } - - // - // Print parameter summary log to screen - // - public static String paramsSummaryLog(workflow, params, log) { - def summary_log = '' - summary_log += NfcoreTemplate.logo(workflow, params.monochrome_logs) - summary_log += NfcoreSchema.paramsSummaryLog(workflow, params) - summary_log += '\n' + citation(workflow) + '\n' - summary_log += NfcoreTemplate.dashedLine(params.monochrome_logs) - return summary_log - } - // // Validate parameters and print summary to 
screen // - public static void initialise(workflow, params, log) { - // Print help to screen if required - if (params.help) { - log.info help(workflow, params, log) - System.exit(0) - } + public static void initialise(workflow, params, log, args) { - // Validate workflow parameters via the JSON schema - if (params.validate_params) { - NfcoreSchema.validateParameters(workflow, params, log) + // Print workflow version and exit on --version + if (params.version) { + String workflow_version = NfcoreTemplate.version(workflow) + log.info "${workflow.manifest.name} ${workflow_version}" + System.exit(0) } - // Print parameter summary log to screen - log.info paramsSummaryLog(workflow, params, log) + // Check that a -profile or Nextflow config has been provided to run the pipeline + NfcoreTemplate.checkConfigProvided(workflow, log) + // Check that the profile doesn't contain spaces and doesn't end with a trailing comma + checkProfile(workflow.profile, args, log) // Check that conda channels are set-up correctly - if (params.enable_conda) { + if (workflow.profile.tokenize(',').intersect(['conda', 'mamba']).size() >= 1) { Utils.checkCondaChannels(log) } // Check AWS batch settings NfcoreTemplate.awsBatch(workflow, params) - // Check the hostnames against configured profiles - NfcoreTemplate.hostName(workflow, params, log) - // Check input has been provided if (!params.input) { - log.error "Please provide an input samplesheet to the pipeline e.g. '--input samplesheet.csv'" - System.exit(1) + Nextflow.error("Please provide an input samplesheet to the pipeline e.g. '--input samplesheet.csv'") } } - // // Get attribute from genome config file e.g. fasta // - public static String getGenomeAttribute(params, attribute) { - def val = '' + public static Object getGenomeAttribute(params, attribute) { if (params.genomes && params.genome && params.genomes.containsKey(params.genome)) { if (params.genomes[ params.genome ].containsKey(attribute)) { - val = params.genomes[ params.genome ][ attribute ] + return params.genomes[ params.genome ][ attribute ] } } - return val + return null + } + + // + // Exit pipeline if --profile contains spaces + // + private static void checkProfile(profile, args, log) { + if (profile.endsWith(',')) { + Nextflow.error "Profile cannot end with a trailing comma. Please remove the comma from the end of the profile string.\nHint: A common mistake is to provide multiple values to `-profile` separated by spaces. Please use commas to separate profiles instead,e.g., `-profile docker,test`." + } + if (args[0]) { + log.warn "nf-core pipelines do not accept positional arguments. The positional argument `${args[0]}` has been detected.\n Hint: A common mistake is to provide multiple values to `-profile` separated by spaces. Please use commas to separate profiles instead,e.g., `-profile docker,test`." + } } } diff --git a/lib/WorkflowMetatdenovo.groovy b/lib/WorkflowMetatdenovo.groovy index 3b2ad8f9..8b97653c 100755 --- a/lib/WorkflowMetatdenovo.groovy +++ b/lib/WorkflowMetatdenovo.groovy @@ -2,18 +2,18 @@ // This file holds several functions specific to the workflow/metatdenovo.nf in the nf-core/metatdenovo pipeline // +import nextflow.Nextflow +import groovy.text.SimpleTemplateEngine + class WorkflowMetatdenovo { // // Check and validate parameters // public static void initialise(params, log) { + genomeExistsError(params, log) - if (!params.fasta) { - log.error "Genome fasta file not specified with e.g. '--fasta genome.fa' or via a detectable config file." 
- System.exit(1) - } } // @@ -43,17 +43,73 @@ class WorkflowMetatdenovo { return yaml_file_text } + // + // Generate methods description for MultiQC + // + + public static String toolCitationText(params) { + + // Can use ternary operators to dynamically construct based conditions, e.g. params["run_xyz"] ? "Tool (Foo et al. 2023)" : "", + // Uncomment function in methodsDescriptionText to render in MultiQC report + def citation_text = [ + "Tools used in the workflow included:", + "FastQC (Andrews 2010),", + "MultiQC (Ewels et al. 2016)", + "." + ].join(' ').trim() + + return citation_text + } + + public static String toolBibliographyText(params) { + + // Can use ternary operators to dynamically construct based conditions, e.g. params["run_xyz"] ? "
<li>Author (2023) Pub name, Journal, DOI</li>" : "",
+        // Uncomment function in methodsDescriptionText to render in MultiQC report
+        def citation_text = [
+            "<li>Andrews S, (2010) FastQC, URL: https://www.bioinformatics.babraham.ac.uk/projects/fastqc/.</li>",
+            "<li>Ewels, P., Magnusson, M., Lundin, S., & Käller, M. (2016). MultiQC: summarize analysis results for multiple tools and samples in a single report. Bioinformatics, 32(19), 3047–3048. doi: 10.1093/bioinformatics/btw354</li>"
+        ].join(' ').trim()
+
+        return reference_text
+    }
+
+    public static String methodsDescriptionText(run_workflow, mqc_methods_yaml, params) {
+        // Convert to a named map so it can be used with the familiar NXF ${workflow} variable syntax in the MultiQC YML file
+        def meta = [:]
+        meta.workflow = run_workflow.toMap()
+        meta["manifest_map"] = run_workflow.manifest.toMap()
+
+        // Pipeline DOI
+        meta["doi_text"] = meta.manifest_map.doi ? "(doi: ${meta.manifest_map.doi})" : ""
+        meta["nodoi_text"] = meta.manifest_map.doi ? "": "<li>If available, make sure to update the text to include the Zenodo DOI of the version of the pipeline used.</li>
  • " + + // Tool references + meta["tool_citations"] = "" + meta["tool_bibliography"] = "" + + //meta["tool_citations"] = toolCitationText(params).replaceAll(", \\.", ".").replaceAll("\\. \\.", ".").replaceAll(", \\.", ".") + //meta["tool_bibliography"] = toolBibliographyText(params) + + + def methods_text = mqc_methods_yaml.text + + def engine = new SimpleTemplateEngine() + def description_html = engine.createTemplate(methods_text).make(meta) + + return description_html + } + // // Exit pipeline if incorrect --genome key provided // private static void genomeExistsError(params, log) { if (params.genomes && params.genome && !params.genomes.containsKey(params.genome)) { - log.error "=============================================================================\n" + + def error_string = "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~\n" + " Genome '${params.genome}' not found in any config files provided to the pipeline.\n" + " Currently, the available genome keys are:\n" + " ${params.genomes.keySet().join(", ")}\n" + - "===================================================================================" - System.exit(1) + "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~" + Nextflow.error(error_string) } } } diff --git a/lib/nfcore_external_java_deps.jar b/lib/nfcore_external_java_deps.jar deleted file mode 100644 index 805c8bb5..00000000 Binary files a/lib/nfcore_external_java_deps.jar and /dev/null differ diff --git a/main.nf b/main.nf index b9ec7f20..622e1cfa 100644 --- a/main.nf +++ b/main.nf @@ -1,8 +1,8 @@ #!/usr/bin/env nextflow /* -======================================================================================== +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ nf-core/metatdenovo -======================================================================================== +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Github : https://github.com/nf-core/metatdenovo Website: https://nf-co.re/metatdenovo Slack : https://nfcore.slack.com/channels/metatdenovo @@ -12,25 +12,40 @@ nextflow.enable.dsl = 2 /* -======================================================================================== +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ GENOME PARAMETER VALUES -======================================================================================== +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ */ -params.fasta = WorkflowMain.getGenomeAttribute(params, 'fasta') /* -======================================================================================== +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ VALIDATE & PRINT PARAMETER SUMMARY -======================================================================================== +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ */ -WorkflowMain.initialise(workflow, params, log) +include { validateParameters; paramsHelp } from 'plugin/nf-validation' + +// Print help message if needed +if (params.help) { + def logo = NfcoreTemplate.logo(workflow, params.monochrome_logs) + def citation = '\n' + WorkflowMain.citation(workflow) + '\n' + def String command = "nextflow run ${workflow.manifest.name} --input samplesheet.csv --genome GRCh37 -profile docker" + log.info logo + paramsHelp(command) + citation + 
NfcoreTemplate.dashedLine(params.monochrome_logs) + System.exit(0) +} + +// Validate input parameters +if (params.validate_params) { + validateParameters() +} + +WorkflowMain.initialise(workflow, params, log, args) /* -======================================================================================== +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ NAMED WORKFLOW FOR PIPELINE -======================================================================================== +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ */ include { METATDENOVO } from './workflows/metatdenovo' @@ -43,9 +58,9 @@ workflow NFCORE_METATDENOVO { } /* -======================================================================================== +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ RUN ALL WORKFLOWS -======================================================================================== +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ */ // @@ -57,7 +72,7 @@ workflow { } /* -======================================================================================== +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ THE END -======================================================================================== +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ */ diff --git a/modules.json b/modules.json index d2976a18..7ad9351c 100644 --- a/modules.json +++ b/modules.json @@ -2,16 +2,145 @@ "name": "nf-core/metatdenovo", "homePage": "https://github.com/nf-core/metatdenovo", "repos": { - "nf-core/modules": { - "custom/dumpsoftwareversions": { - "git_sha": "84f2302920078b0cf7716b2a2e5fcc0be5c4531d" + "https://github.com/nf-core/modules.git": { + "modules": { + "nf-core": { + "bbmap/align": { + "branch": "master", + "git_sha": "3f5420aa22e00bd030a2556dfdffc9e164ec0ec5", + "installed_by": ["modules"] + }, + "bbmap/bbduk": { + "branch": "master", + "git_sha": "3f5420aa22e00bd030a2556dfdffc9e164ec0ec5", + "installed_by": ["modules"] + }, + "bbmap/bbnorm": { + "branch": "master", + "git_sha": "3f5420aa22e00bd030a2556dfdffc9e164ec0ec5", + "installed_by": ["modules"] + }, + "bbmap/index": { + "branch": "master", + "git_sha": "3f5420aa22e00bd030a2556dfdffc9e164ec0ec5", + "installed_by": ["modules"] + }, + "cat/cat": { + "branch": "master", + "git_sha": "81f27e75847087865299cc46605deb3b09b4e0a2", + "installed_by": ["modules"] + }, + "cat/fastq": { + "branch": "master", + "git_sha": "02fd5bd7275abad27aad32d5c852e0a9b1b98882", + "installed_by": ["modules"] + }, + "custom/dumpsoftwareversions": { + "branch": "master", + "git_sha": "8ec825f465b9c17f9d83000022995b4f7de6fe93", + "installed_by": ["modules"] + }, + "fastqc": { + "branch": "master", + "git_sha": "f4ae1d942bd50c5c0b9bd2de1393ce38315ba57c", + "installed_by": ["modules"] + }, + "hmmer/hmmsearch": { + "branch": "master", + "git_sha": "3f5420aa22e00bd030a2556dfdffc9e164ec0ec5", + "installed_by": ["modules"] + }, + "multiqc": { + "branch": "master", + "git_sha": "9e71d8519dfbfc328c078bba14d4bd4c99e39a94", + "installed_by": ["modules"] + }, + "prodigal": { + "branch": "master", + "git_sha": "3f5420aa22e00bd030a2556dfdffc9e164ec0ec5", + "installed_by": ["modules"] + }, + "prokka": { + "branch": "master", + "git_sha": "49ebda931c36c2b282f7958d00e1236b751f1031", + "installed_by": ["modules"] + }, + "samtools/flagstat": { + "branch": 
"master", + "git_sha": "f4596fe0bdc096cf53ec4497e83defdb3a94ff62", + "installed_by": ["bam_stats_samtools"] + }, + "samtools/idxstats": { + "branch": "master", + "git_sha": "f4596fe0bdc096cf53ec4497e83defdb3a94ff62", + "installed_by": ["bam_stats_samtools"] + }, + "samtools/index": { + "branch": "master", + "git_sha": "f4596fe0bdc096cf53ec4497e83defdb3a94ff62", + "installed_by": ["bam_sort_stats_samtools"] + }, + "samtools/sort": { + "branch": "master", + "git_sha": "f4596fe0bdc096cf53ec4497e83defdb3a94ff62", + "installed_by": ["bam_sort_stats_samtools"] + }, + "samtools/stats": { + "branch": "master", + "git_sha": "f4596fe0bdc096cf53ec4497e83defdb3a94ff62", + "installed_by": ["bam_stats_samtools"] + }, + "seqtk/mergepe": { + "branch": "master", + "git_sha": "3f5420aa22e00bd030a2556dfdffc9e164ec0ec5", + "installed_by": ["modules"] + }, + "seqtk/seq": { + "branch": "master", + "git_sha": "3f5420aa22e00bd030a2556dfdffc9e164ec0ec5", + "installed_by": ["modules"] + }, + "spades": { + "branch": "master", + "git_sha": "3f5420aa22e00bd030a2556dfdffc9e164ec0ec5", + "installed_by": ["modules"] + }, + "subread/featurecounts": { + "branch": "master", + "git_sha": "f6bba1a67cdbb605f24d7a4e8dd383b0eec45b52", + "installed_by": ["modules"] + }, + "transdecoder/longorf": { + "branch": "master", + "git_sha": "3f5420aa22e00bd030a2556dfdffc9e164ec0ec5", + "installed_by": ["modules"] + }, + "transdecoder/predict": { + "branch": "master", + "git_sha": "3f5420aa22e00bd030a2556dfdffc9e164ec0ec5", + "installed_by": ["modules"] + }, + "trimgalore": { + "branch": "master", + "git_sha": "2e6c468e0ad43b23df71d7a7f130d5c0e0aa89e3", + "installed_by": ["modules"] + } + } }, - "fastqc": { - "git_sha": "7b3315591a149609e27914965f858c9a7e071564" - }, - "multiqc": { - "git_sha": "7b3315591a149609e27914965f858c9a7e071564" + "subworkflows": { + "nf-core": { + "bam_sort_stats_samtools": { + "branch": "master", + "git_sha": "f4596fe0bdc096cf53ec4497e83defdb3a94ff62", + "installed_by": ["subworkflows"] + }, + "bam_stats_samtools": { + "branch": "master", + "git_sha": "f4596fe0bdc096cf53ec4497e83defdb3a94ff62", + "installed_by": ["bam_sort_stats_samtools"] + } + } } } } -} \ No newline at end of file +} diff --git a/modules/local/cat/cat_contigs.nf b/modules/local/cat/cat_contigs.nf new file mode 100644 index 00000000..0e64ab8d --- /dev/null +++ b/modules/local/cat/cat_contigs.nf @@ -0,0 +1,49 @@ +process CAT_CONTIGS { + tag "${meta.id}-${db_name}" + label 'process_high' + + conda "bioconda::cat=4.6 bioconda::diamond=2.0.6" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/cat:5.2.3--hdfd78af_1' : + 'quay.io/biocontainers/cat:5.2.3--hdfd78af_1' }" + + input: + tuple val(meta), path(assembly) + tuple val(db_name), path("database/*"), path("taxonomy/*") + + output: + path("*.names.txt.gz") , emit: tax_classification + path("raw/*.ORF2LCA.txt.gz") , emit: orf2lca + path("raw/*.predicted_proteins.faa.gz") , emit: faa + path("raw/*.predicted_proteins.gff.gz") , emit: gff + path("raw/*.log") , emit: log + path("raw/*.contig2classification.txt.gz"), emit: tax_classification_taxids + path "versions.yml" , emit: versions + + script: + def official_taxonomy = params.cat_official_taxonomy ? 
"--only_official" : "" + """ + CAT contigs -c "$assembly" -d database/ -t taxonomy/ -n 4 -o "${meta.id}" --top 11 --I_know_what_Im_doing + CAT add_names -i "${meta.id}.ORF2LCA.txt" -o "${meta.id}.ORF2LCA.names.txt" -t taxonomy/ ${official_taxonomy} + CAT add_names \\ + -i "${meta.id}.contig2classification.txt" \\ + -o "${meta.id}.contig2classification.names.txt" \\ + -t taxonomy/ ${official_taxonomy} + + mkdir raw + mv *.ORF2LCA.txt *.predicted_proteins.faa *.predicted_proteins.gff *.log *.contig2classification.txt raw/ + gzip \ + "raw/${meta.id}.predicted_proteins.faa" \ + "raw/${meta.id}.predicted_proteins.gff" \ + "raw/${meta.id}.contig2classification.txt" \ + "${meta.id}.ORF2LCA.names.txt" \ + "raw/${meta.id}.ORF2LCA.txt" \ + "${meta.id}.contig2classification.names.txt" + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + CAT_CONTIGS: \$(CAT --version | sed "s/CAT v//; s/(.*//") + diamond: \$(diamond --version 2>&1 | tail -n 1 | sed 's/^diamond version //') + END_VERSIONS + """ +} diff --git a/modules/local/cat/cat_db.nf b/modules/local/cat/cat_db.nf new file mode 100644 index 00000000..ea3b55c8 --- /dev/null +++ b/modules/local/cat/cat_db.nf @@ -0,0 +1,33 @@ +process CAT_DB { + tag "${database.baseName}" + + conda "conda-forge::sed=4.7" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/ubuntu:20.04' : + 'ubuntu:20.04' }" + + input: + path(database) + + output: + tuple val("${database.toString().replace(".tar.gz", "")}"), path("database/*"), path("taxonomy/*"), emit: db + path "versions.yml" , emit: versions + + script: + """ + if [[ ${database} != *.tar.gz ]]; then + ln -sr `find ${database}/ -type d -name "*taxonomy*"` taxonomy + ln -sr `find ${database}/ -type d -name "*database*"` database + else + mkdir catDB + tar -xf ${database} -C catDB + mv `find catDB/ -type d -name "*taxonomy*"` taxonomy/ + mv `find catDB/ -type d -name "*database*"` database/ + fi + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + tar: \$(tar --version 2>&1 | sed -n 1p | sed 's/tar (GNU tar) //') + END_VERSIONS + """ +} diff --git a/modules/local/cat/cat_db_generate.nf b/modules/local/cat/cat_db_generate.nf new file mode 100644 index 00000000..2b0382ea --- /dev/null +++ b/modules/local/cat/cat_db_generate.nf @@ -0,0 +1,36 @@ +process CAT_DB_GENERATE { + + conda "bioconda::cat=4.6 bioconda::diamond=2.0.6" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/cat:5.2.3--hdfd78af_1' : + 'quay.io/biocontainers/cat:5.2.3--hdfd78af_1' }" + + output: + tuple env(DB_NAME), path("database/*"), path("taxonomy/*"), emit: db + path("CAT_prepare_*.tar.gz"), optional:true , emit: db_tar_gz + path "versions.yml" , emit: versions + + script: + def save_db = params.save_cat_db ? 
"Y" : "N" + """ + CAT prepare --fresh + + # get name/date of generated datase + out=(*_taxonomy/) + [[ \$out =~ (.*)_taxonomy/ ]]; + DB_NAME="CAT_prepare_\${BASH_REMATCH[1]}" + + mv *_taxonomy taxonomy + mv *_database database + rm database/*.nr.gz + if [ ${save_db} = "Y" ] ; then + tar -cf - taxonomy database | gzip > "\${DB_NAME}".tar.gz + fi + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + CAT: \$(CAT --version | sed "s/CAT v//; s/(.*//") + diamond: \$(diamond --version 2>&1 | tail -n 1 | sed 's/^diamond version //') + END_VERSIONS + """ +} diff --git a/modules/local/cat/cat_summary.nf b/modules/local/cat/cat_summary.nf new file mode 100644 index 00000000..11d08661 --- /dev/null +++ b/modules/local/cat/cat_summary.nf @@ -0,0 +1,29 @@ +process CAT_SUMMARY { + label 'process_low' + + conda "bioconda::bioawk=1.0" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/bioawk:1.0--hed695b0_5' : + 'quay.io/biocontainers/bioawk:1.0--hed695b0_5' }" + + input: + path(cat_summaries) + + output: + path("*.tsv") , emit: combined + path "versions.yml", emit: versions + + script: + def prefix = task.ext.prefix ?: "cat_summary" + """ + # use find as sometimes these are empty and need to fail gracefully + find -L -type f -name "*contig2classification.names.txt.gz" -exec sh -c 'for f do gunzip -c \$f > \${f%.*}; done' find-sh {} + + + bioawk '(NR == 1) || (FNR > 1)' *.txt > ${prefix}.tsv + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + bioawk: \$(bioawk --version | cut -f 3 -d ' ' ) + END_VERSIONS + """ +} diff --git a/modules/local/collect_featurecounts.nf b/modules/local/collect_featurecounts.nf new file mode 100644 index 00000000..c78723d3 --- /dev/null +++ b/modules/local/collect_featurecounts.nf @@ -0,0 +1,70 @@ +process COLLECT_FEATURECOUNTS { + tag "$meta.id" + label 'process_high' + + conda "conda-forge::r-tidyverse=2.0.0 conda-forge::r-dtplyr=1.3.1 conda-forge::r-data.table=1.14.8" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
+ 'https://depot.galaxyproject.org/singularity/mulled-v2-b2ec1fea5791d428eebb8c8ea7409c350d31dada:a447f6b7a6afde38352b24c30ae9cd6e39df95c4-1' : + 'biocontainers/mulled-v2-b2ec1fea5791d428eebb8c8ea7409c350d31dada:a447f6b7a6afde38352b24c30ae9cd6e39df95c4-1' }" + + input: + tuple val(meta), path(inputfiles) + + output: + tuple val(meta), path("*.counts.tsv.gz"), emit: counts + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + """ + #!/usr/bin/env Rscript + + library(data.table) + library(dtplyr) + library(readr) + library(dplyr) + library(stringr) + + setDTthreads($task.cpus) + + tibble(f = Sys.glob('*.featureCounts.txt')) %>% + mutate( + d = purrr::map( + f, + function(file) { + fread(file, sep = '\\t', skip = 1) %>% + melt(measure.vars = c(ncol(.)), variable.name = 'sample', value.name = 'count') %>% + lazy_dt() %>% + filter(count > 0) %>% + mutate( + sample = str_remove(sample, '.sorted.bam'), + r = count/Length + ) %>% + rename( orf = Geneid, chr = Chr, start = Start, end = End, strand = Strand, length = Length ) %>% + group_by(sample) %>% + mutate(tpm = r/sum(r) * 1e6) %>% ungroup() %>% + select(-r) %>% + as_tibble() + } + ) + ) %>% + tidyr::unnest(d) %>% + select(-f) %>% + write_tsv("${prefix}.counts.tsv.gz") + + writeLines( + c( + "\\"${task.process}\\":", + paste0(" R: ", paste0(R.Version()[c("major","minor")], collapse = ".")), + paste0(" dplyr: ", packageVersion('dplyr')), + paste0(" dtplyr: ", packageVersion('dtplyr')), + paste0(" data.table: ", packageVersion('data.table')) + ), + "versions.yml" + ) + """ +} diff --git a/modules/local/collect_stats.nf b/modules/local/collect_stats.nf new file mode 100644 index 00000000..59ab2ffa --- /dev/null +++ b/modules/local/collect_stats.nf @@ -0,0 +1,133 @@ +process COLLECT_STATS { + tag "$meta.id" + label 'process_low' + + conda "conda-forge::r-tidyverse=2.0.0 conda-forge::r-dtplyr=1.3.1 conda-forge::r-data.table=1.14.8" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
+ 'https://depot.galaxyproject.org/singularity/mulled-v2-b2ec1fea5791d428eebb8c8ea7409c350d31dada:a447f6b7a6afde38352b24c30ae9cd6e39df95c4-1' : + 'biocontainers/mulled-v2-b2ec1fea5791d428eebb8c8ea7409c350d31dada:a447f6b7a6afde38352b24c30ae9cd6e39df95c4-1' }" + + input: + tuple val(meta), val(samples), path(trimlogs), path(bblogs), path(idxstats), path(fcs), path(mergetab) + + output: + path "${meta.id}.overall_stats.tsv.gz", emit: overall_stats + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + + if ( trimlogs ) { + read_trimlogs = """%>% + mutate( + d = map( + sample, + function(s) { + fread(cmd = sprintf("grep 'Reads written (passing filters)' %s*trimming_report.txt | sed 's/.*: *//' | sed 's/ .*//' | sed 's/,//g'", s)) %>% + as_tibble() + } + ) + ) %>% + unnest(d) %>% + rename(n_trimmed = V1) %>% + mutate(n_trimmed = n_trimmed*2) %>% + """ + } else { + read_trimlogs = "%>%" + } + + if (mergetab) { + read_mergetab = """ + + mergetab <- list.files(pattern = "*_merged_table.tsv.gz" ) %>% + map_df(~read_tsv(., show_col_types = FALSE)) %>% + mutate(sample = as.character(sample)) + + """ + } else { + read_mergetab = """ + mergetab <- data.frame(sample = character(), stringsAsFactors = FALSE) + """ + } + + """ + #!/usr/bin/env Rscript + + library(data.table) + library(dtplyr) + library(dplyr) + library(readr) + library(purrr) + library(tidyr) + library(stringr) + + TYPE_ORDER = c('n_trimmed', 'n_non_contaminated', 'idxs_n_mapped', 'idxs_n_unmapped', 'n_feature_count') + + # Collect stats for each sample, create a table in long format that can be appended to + t <- tibble(sample = c("${samples.join('", "')}")) ${read_trimlogs} + # add samtools idxstats output + mutate( + i = map( + sample, + function(s) { + fread(cmd = sprintf("grep -v '^*' %s*idxstats", s), sep = '\\t', col.names = c('chr', 'length', 'idxs_n_mapped', 'idxs_n_unmapped')) %>% + lazy_dt() %>% + summarise(idxs_n_mapped = sum(idxs_n_mapped), idxs_n_unmapped = sum(idxs_n_unmapped)) %>% + as_tibble() + } + ) + ) %>% + unnest(i) %>% + pivot_longer(2:ncol(.), names_to = 'm', values_to = 'v') %>% + union( + # Total observation after featureCounts + tibble(file = Sys.glob('*.counts.tsv.gz')) %>% + mutate(d = map(file, function(f) fread(cmd = sprintf("gunzip -c %s", f), sep = '\\t'))) %>% + as_tibble() %>% + unnest(d) %>% + mutate(sample = as.character(sample)) %>% + group_by(sample) %>% summarise(n_feature_count = sum(count), .groups = 'drop') %>% + pivot_longer(2:ncol(.), names_to = 'm', values_to = 'v') + ) + + # Add in stats from BBDuk, if present + for ( f in Sys.glob('*.bbduk.log') ) { + s = str_remove(f, '.bbduk.log') + t <- t %>% union( + fread(cmd = sprintf("grep 'Result:' %s | sed 's/Result:[ \\t]*//; s/ reads.*//'", f), col.names = c('v')) %>% + as_tibble() %>% + mutate(sample = s, m = 'n_non_contaminated') + ) + } + + # Add in stats from taxonomy and function + ${read_mergetab} + + # Write the table in wide format + t %>% + mutate(m = parse_factor(m, levels = TYPE_ORDER, ordered = TRUE)) %>% + arrange(sample, m) %>% + pivot_wider(names_from = m, values_from = v) %>% + left_join(mergetab, by = 'sample') %>% + write_tsv('${prefix}.overall_stats.tsv.gz') + + writeLines( + c( + "\\"${task.process}\\":", + paste0(" R: ", paste0(R.Version()[c("major","minor")], collapse = ".")), + paste0(" dplyr: ", packageVersion('dplyr')), + paste0(" dtplyr: ", packageVersion('dtplyr')), + paste0(" data.table: ", 
packageVersion('data.table')), + paste0(" readr: ", packageVersion('readr')), + paste0(" purrr: ", packageVersion('purrr')), + paste0(" tidyr: ", packageVersion('tidyr')), + paste0(" stringr: ", packageVersion('stringr')) + ), + "versions.yml" + ) + """ +} diff --git a/modules/local/eggnog/download.nf b/modules/local/eggnog/download.nf new file mode 100644 index 00000000..e2d50917 --- /dev/null +++ b/modules/local/eggnog/download.nf @@ -0,0 +1,52 @@ +process EGGNOG_DOWNLOAD { + tag 'EggNOG' + label 'process_low' + + conda "bioconda::eggnog-mapper=2.1.9" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/eggnog-mapper:2.1.9--pyhdfd78af_0': + 'biocontainers/eggnog-mapper:2.1.9--pyhdfd78af_0' }" + + input: + + output: + path "eggnog.db" , emit: eggnog_db + path "eggnog_proteins.dmnd" , emit: dmnd + path "eggnog.taxa.db" , emit: taxa_db + path "eggnog.taxa.db.traverse.pkl", emit: pkl + path "*" , emit: all + path "versions.yml" , emit: versions, optional: true // Optional to allow skipping if this is the only file that's missing + + script: + def args = task.ext.args ?: '' + + """ + download_eggnog_data.py \\ + $args \\ + -y \\ + --data_dir . + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + eggnog: \$( echo \$(emapper.py --version 2>&1)| sed 's/.* emapper-//' | sed 's/ \\/ Expected.*//') + END_VERSIONS + + """ + + stub: + + """ + + mkdir eggnog + touch ./eggnog/eggnog.db + touch ./eggnog/eggnog.taxa.db + touch ./eggnog/eggnog.taxa.db.traverse.pkl + ln -s eggnog/* ./ + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + eggnog: \$( echo \$(emapper.py --version 2>&1)| sed 's/.* emapper-//' ) + END_VERSIONS + + """ +} diff --git a/modules/local/eggnog/mapper.nf b/modules/local/eggnog/mapper.nf new file mode 100644 index 00000000..1a5e1fbd --- /dev/null +++ b/modules/local/eggnog/mapper.nf @@ -0,0 +1,65 @@ +process EGGNOG_MAPPER { + tag "$meta.id" + label 'process_medium' + + conda "bioconda::eggnog-mapper=2.1.9" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/eggnog-mapper:2.1.9--pyhdfd78af_0': + 'biocontainers/eggnog-mapper:2.1.9--pyhdfd78af_0' }" + + input: + tuple val(meta), path(fasta) + path(eggnog_files), stageAs: 'eggnog/*' + + output: + tuple val(meta), path("*.emapper.hits.gz") , emit: hits + tuple val(meta), path("*.emapper.seed_orthologs.gz") , emit: seed_orthologs + tuple val(meta), path("*.emapper.annotations.gz") , emit: annotations + tuple val(meta), path("*.emapper.tsv.gz") , emit: emappertsv + tuple val(meta), path("*.emapper.annotations.xlsx") , emit: xlsx, optional: true + tuple val(meta), path("*.emapper.orthologs.gz") , emit: orthologs, optional: true + tuple val(meta), path("*.emapper.genepred.fasta.gz") , emit: genepred, optional: true + tuple val(meta), path("*.emapper.gff.gz") , emit: gff, optional: true + tuple val(meta), path("*.emapper.no_annotations.fasta.gz"), emit: no_anno, optional: true + tuple val(meta), path("*.emapper.pfam.gz") , emit: pfam, optional: true + path "versions.yml" , emit: versions + + script: + def args = task.ext.args ?: '' + prefix = task.ext.prefix ?: "${meta.id}" + input = fasta.name.endsWith(".gz") ? fasta.baseName : fasta + gunzip = fasta =~ /\.gz$/ ? 
"gunzip -c ${fasta} > ${input}" : "" + + """ + $gunzip + + emapper.py \\ + $args \\ + --cpu $task.cpus \\ + --data_dir eggnog \\ + --output $prefix \\ + -i $input + + gzip ${prefix}.emapper.* + zgrep -v '^##' ${prefix}.emapper.annotations | \\ + sed 's/^#// ; /^query/s/.*/\\L&/ ; s/query/orf/' | \\ + gzip -c > ${prefix}.emapper.tsv.gz + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + eggnog: \$( echo \$(emapper.py --version 2>&1)| sed 's/.* emapper-//' | sed 's/ \\/ Expected.*//') + END_VERSIONS + """ + + stub: + """ + touch test.emapper.hits + touch test.emapper.seed_orthologs + touch test.emapper.annotations + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + eggnog: \$( echo \$(emapper.py --version 2>&1)| sed 's/.* emapper-//' | sed 's/ \\/ Expected.*//') + END_VERSIONS + """ +} diff --git a/modules/local/eggnog/sum.nf b/modules/local/eggnog/sum.nf new file mode 100644 index 00000000..7eb1bfa5 --- /dev/null +++ b/modules/local/eggnog/sum.nf @@ -0,0 +1,66 @@ +process EGGNOG_SUM { + tag "$meta.id" + label 'process_low' + + conda "conda-forge::r-tidyverse=2.0.0 conda-forge::r-dtplyr=1.3.1 conda-forge::r-data.table=1.14.8" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/mulled-v2-b2ec1fea5791d428eebb8c8ea7409c350d31dada:a447f6b7a6afde38352b24c30ae9cd6e39df95c4-1' : + 'biocontainers/mulled-v2-b2ec1fea5791d428eebb8c8ea7409c350d31dada:a447f6b7a6afde38352b24c30ae9cd6e39df95c4-1' }" + + input: + + tuple val(meta), path(eggnog) + path(fcs) + + output: + + tuple val(meta), path("${meta.id}.eggnog_summary.tsv.gz") , emit: eggnog_summary + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + + """ + #!/usr/bin/env Rscript + + library(dplyr) + library(readr) + library(tidyr) + library(stringr) + library(tidyverse) + + # call the tables into variables + eggnog <- read_tsv("${eggnog}", show_col_types = FALSE ) + + counts <- list.files(pattern = "*.counts.tsv.gz") %>% + map_df(~read_tsv(., show_col_types = FALSE)) %>% + mutate(sample = as.character(sample)) + + counts %>% + inner_join(eggnog, by = 'orf') %>% + group_by(sample) %>% + drop_na() %>% + summarise( value = sum(count), .groups = 'drop') %>% + add_column(database = "eggnog", field = "eggnog_n_counts") %>% + relocate(value, .after = last_col()) %>% + write_tsv('${meta.id}.eggnog_summary.tsv.gz') + + writeLines( + c( + "\\"${task.process}\\":", + paste0(" R: ", paste0(R.Version()[c("major","minor")], collapse = ".")), + paste0(" dplyr: ", packageVersion('dplyr')), + paste0(" dtplyr: ", packageVersion('dtplyr')), + paste0(" data.table: ", packageVersion('data.table')), + paste0(" readr: ", packageVersion('readr')), + paste0(" purrr: ", packageVersion('purrr')), + paste0(" tidyr: ", packageVersion('tidyr')), + paste0(" stringr: ", packageVersion('stringr')) + ), + "versions.yml") + """ +} diff --git a/modules/local/eukulele/search.nf b/modules/local/eukulele/search.nf new file mode 100644 index 00000000..b9677220 --- /dev/null +++ b/modules/local/eukulele/search.nf @@ -0,0 +1,55 @@ +process EUKULELE_SEARCH { + tag "$meta.id" + label 'process_high' + + conda "bioconda::eukulele=2.0.5" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
+ 'https://depot.galaxyproject.org/singularity/eukulele:2.0.5--pyh723bec7_0' : + 'biocontainers/eukulele:2.0.5--pyh723bec7_0' }" + + input: + tuple val(meta), path(fasta), val(dbname), path(eukdb) + + output: + tuple val(meta), path("*/taxonomy_estimation/*.out.gz"), val("${dbname}") , emit: taxonomy_estimation + tuple val(meta), path("*/taxonomy_counts/*.csv.gz") , emit: taxonomy_counts + tuple val(meta), path("*/mets_full/diamond/*") , emit: diamond + + path "versions.yml" , emit: versions + + script: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + def gunzip = fasta =~ /\.gz$/ ? "gunzip -c ${fasta} > ./contigs/proteins.faa" : "mv ${fasta} contigs/proteins.faa" + def database = dbname ? "--database ${dbname}" : '' + + """ + rc=0 + mkdir contigs + $gunzip + EUKulele \\ + $args \\ + $database \\ + --protein_extension .faa \\ + --reference_dir $eukdb \\ + -o ${prefix} \\ + --CPUs ${task.cpus} \\ + -s \\ + contigs || rc=\$? + + gzip ${prefix}/mets_full/diamond/*.out + gzip ${prefix}/taxonomy_counts/*.csv + gzip ${prefix}/taxonomy_estimation/*.out + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + eukulele: \$(echo \$(EUKulele --version 2>&1) | sed -n 's/.* \\([0-9]\\+\\.[0-9]\\+\\.[0-9]\\+\\).*/\\1/p') + END_VERSIONS + + if [ \$rc -le 1 ]; then + exit 0 + else + exit \$rc; + fi + """ +} diff --git a/modules/local/format_prodigal.nf b/modules/local/format_prodigal.nf new file mode 100644 index 00000000..af6cdf6c --- /dev/null +++ b/modules/local/format_prodigal.nf @@ -0,0 +1,33 @@ +process FORMAT_PRODIGAL_GFF { + tag "$meta.id" + label 'process_low' + + conda "conda-forge::gzip=1.12" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/gzip:1.11': + 'biocontainers/gzip:1.11' }" + + input: + tuple val(meta), path (gff) + + output: + tuple val(meta), path("${prefix}_format.gff.gz"), emit: format_gff + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + prefix = task.ext.prefix ?: "${meta.id}" + cat_input = gff =~ /\.gz$/ ? "gunzip -c ${gff}" : "cat ${gff}" + + """ + $cat_input | sed 's/^\\([^\\t]\\+\\)\\(.*ID=\\)[0-9]\\+\\(_[0-9]\\+\\)/\\1\\2\\1\\3/' | gzip -c > ${prefix}_format.gff.gz + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + gzip: \$( echo \$(gzip --version 2>&1) | sed -n 's/.*gzip \\([0-9.]\\+\\).*/\\1/p') + END_VERSIONS + """ +} diff --git a/modules/local/format_tax.nf b/modules/local/format_tax.nf new file mode 100644 index 00000000..cdfe915a --- /dev/null +++ b/modules/local/format_tax.nf @@ -0,0 +1,63 @@ +process FORMAT_TAX { + tag "$meta.id" + label 'process_low' + + conda "conda-forge::r-tidyverse=2.0.0 conda-forge::r-dtplyr=1.3.1 conda-forge::r-data.table=1.14.8" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
+ 'https://depot.galaxyproject.org/singularity/mulled-v2-b2ec1fea5791d428eebb8c8ea7409c350d31dada:a447f6b7a6afde38352b24c30ae9cd6e39df95c4-1' : + 'biocontainers/mulled-v2-b2ec1fea5791d428eebb8c8ea7409c350d31dada:a447f6b7a6afde38352b24c30ae9cd6e39df95c4-1' }" + + input: + tuple val(meta), path(taxtable) + + output: + tuple val(meta), path("*.taxonomy_classification.tsv.gz"), emit: tax + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + + """ + #!/usr/bin/env Rscript + + library(readr) + library(dplyr) + library(tidyr) + + # Create and write a table with taxonomy categories in each column + read_tsv("${taxtable}") %>% + select(-1) %>% + rename(orf = transcript_name) %>% + group_by(orf) %>% + filter(max_pid == max(max_pid)) %>% + ungroup() %>% + separate( + full_classification, + c("domain","phylum", "class", "order", "family", "genus", "species"), + sep = "\\\\s*;\\\\s*" + ) %>% + mutate( + domain = ifelse(is.na(domain) | domain == '', 'Uncl.', domain), + phylum = ifelse(is.na(phylum) | phylum == '', sprintf("%s uncl.", domain), phylum), + class = ifelse(is.na(class) | class == '', sprintf("%s uncl.", phylum), class), + order = ifelse(is.na(order) | order == '', sprintf("%s uncl.", class), order), + family = ifelse(is.na(family) | family == '', sprintf("%s uncl.", order), family), + genus = ifelse(is.na(genus) | genus == '', sprintf("%s uncl.", family), genus), + species = ifelse(is.na(species) | species == '', sprintf("%s uncl.", genus), species) + ) %>% + write_tsv("${prefix}.taxonomy_classification.tsv.gz") + + writeLines( + c( + "\\"${task.process}\\":", + paste0(" R: ", paste0(R.Version()[c("major","minor")], collapse = ".")), + paste0(" dplyr: ", packageVersion("dplyr")) + ), + "versions.yml" + ) + """ +} diff --git a/modules/local/formatspades.nf b/modules/local/formatspades.nf new file mode 100644 index 00000000..adac0ca2 --- /dev/null +++ b/modules/local/formatspades.nf @@ -0,0 +1,33 @@ +process FORMATSPADES { + tag "$meta.id" + label 'process_low' + + conda "conda-forge::gzip=1.11" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
+ 'https://depot.galaxyproject.org/singularity/gzip:1.11': + 'biocontainers/gzip:1.11' }" + + input: + tuple val(meta), path(assembly) + + output: + tuple val(meta), path("rnaspades.format_header.transcript.fa.gz") , emit: assembly + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + + """ + cp $assembly rnaspades.fa.gz + gunzip -c rnaspades.fa.gz | sed 's/>NODE_\\([0-9]*\\).*/>NODE_\\1/g' | gzip > rnaspades.format_header.transcript.fa.gz + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + gzip: \$( echo \$(gzip --version 2>&1)| sed 's/.* gzip//') + END_VERSIONS + """ +} diff --git a/modules/local/functions.nf b/modules/local/functions.nf deleted file mode 100644 index 85628ee0..00000000 --- a/modules/local/functions.nf +++ /dev/null @@ -1,78 +0,0 @@ -// -// Utility functions used in nf-core DSL2 module files -// - -// -// Extract name of software tool from process name using $task.process -// -def getSoftwareName(task_process) { - return task_process.tokenize(':')[-1].tokenize('_')[0].toLowerCase() -} - -// -// Extract name of module from process name using $task.process -// -def getProcessName(task_process) { - return task_process.tokenize(':')[-1] -} - -// -// Function to initialise default values and to generate a Groovy Map of available options for nf-core modules -// -def initOptions(Map args) { - def Map options = [:] - options.args = args.args ?: '' - options.args2 = args.args2 ?: '' - options.args3 = args.args3 ?: '' - options.publish_by_meta = args.publish_by_meta ?: [] - options.publish_dir = args.publish_dir ?: '' - options.publish_files = args.publish_files - options.suffix = args.suffix ?: '' - return options -} - -// -// Tidy up and join elements of a list to return a path string -// -def getPathFromList(path_list) { - def paths = path_list.findAll { item -> !item?.trim().isEmpty() } // Remove empty entries - paths = paths.collect { it.trim().replaceAll("^[/]+|[/]+\$", "") } // Trim whitespace and trailing slashes - return paths.join('/') -} - -// -// Function to save/publish module results -// -def saveFiles(Map args) { - def ioptions = initOptions(args.options) - def path_list = [ ioptions.publish_dir ?: args.publish_dir ] - - // Do not publish versions.yml unless running from pytest workflow - if (args.filename.equals('versions.yml') && !System.getenv("NF_CORE_MODULES_TEST")) { - return null - } - if (ioptions.publish_by_meta) { - def key_list = ioptions.publish_by_meta instanceof List ? ioptions.publish_by_meta : args.publish_by_meta - for (key in key_list) { - if (args.meta && key instanceof String) { - def path = key - if (args.meta.containsKey(key)) { - path = args.meta[key] instanceof Boolean ? "${key}_${args.meta[key]}".toString() : args.meta[key] - } - path = path instanceof String ? 
path : '' - path_list.add(path) - } - } - } - if (ioptions.publish_files instanceof Map) { - for (ext in ioptions.publish_files) { - if (args.filename.endsWith(ext.key)) { - def ext_list = path_list.collect() - ext_list.add(ext.value) - return "${getPathFromList(ext_list)}/$args.filename" - } - } - } else if (ioptions.publish_files == null) { - return "${getPathFromList(path_list)}/$args.filename" - } -} diff --git a/modules/local/hmmrank.nf b/modules/local/hmmrank.nf new file mode 100644 index 00000000..62756158 --- /dev/null +++ b/modules/local/hmmrank.nf @@ -0,0 +1,56 @@ +process HMMRANK { + tag "$meta.id" + label 'process_low' + + conda "conda-forge::r-tidyverse=2.0.0 conda-forge::r-dtplyr=1.3.1 conda-forge::r-data.table=1.14.8" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/mulled-v2-b2ec1fea5791d428eebb8c8ea7409c350d31dada:a447f6b7a6afde38352b24c30ae9cd6e39df95c4-1' : + 'biocontainers/mulled-v2-b2ec1fea5791d428eebb8c8ea7409c350d31dada:a447f6b7a6afde38352b24c30ae9cd6e39df95c4-1' }" + + input: + tuple val(meta), path(tblouts) + + output: + tuple val(meta), path("*.hmmrank.tsv.gz"), emit: hmmrank + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def prefix = task.ext.prefix ?: "${meta.id}" + + """ + #!/usr/bin/env Rscript + library(readr) + library(dplyr) + library(tidyr) + library(stringr) + + # Read all the tblout files + + read_fwf(c('${tblouts.join("','")}'), fwf_cols(content = c(1, NA)), col_types = cols(content = col_character()), comment='#', id = 'fname') %>% + filter(! grepl('^ *#', content)) %>% + separate( + content, + c('accno', 't0', 'profile_desc', 't1', 'evalue', 'score', 'bias', 'f0', 'f1', 'f2', 'f3', 'f4', 'f5', 'f6', 'f7', 'f8', 'f9', 'f10', 'rest'), + '\\\\s+', extra='merge', convert = FALSE + ) %>% + transmute(profile = basename(fname) %>% str_remove('${prefix}\\\\.') %>% str_remove('.tbl.gz'), accno, profile_desc, evalue = as.double(evalue), score = as.double(score)) %>% + # Group and calculate a rank based on score and evalue; let ties be resolved by profile in alphabetical order + group_by(accno) %>% + arrange(desc(score), evalue, profile) %>% + mutate(rank = row_number()) %>% + ungroup() %>% + write_tsv('${prefix}.hmmrank.tsv.gz') + + writeLines( + c( + "\\"${task.process}\\":", + paste0(" R: ", paste0(R.Version()[c("major","minor")], collapse = ".")), + paste0(" tidyverse: ", packageVersion('tidyverse')) + ), + "versions.yml" + ) + """ +} diff --git a/modules/local/kofamscan/download.nf b/modules/local/kofamscan/download.nf new file mode 100644 index 00000000..78712a3c --- /dev/null +++ b/modules/local/kofamscan/download.nf @@ -0,0 +1,33 @@ +process KOFAMSCAN_DOWNLOAD { + tag "KEGG data" + label 'process_long' + + conda "bioconda::gnu-wget=1.18" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
+ 'https://depot.galaxyproject.org/singularity/gnu-wget:1.18--h36e9172_9': + 'quay.io/biocontainers/gnu-wget:1.18--h36e9172_9' }" + + output: + path "ko_list" , emit: ko_list + path "profiles" , emit: koprofiles + path "versions.yml", emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + + """ + wget https://www.genome.jp/ftp/db/kofam/ko_list.gz + gunzip ko_list.gz + + wget https://www.genome.jp/ftp/db/kofam/profiles.tar.gz + tar -zxf profiles.tar.gz + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + wget: \$(wget --version | grep 'GNU Wget' | sed 's/GNU Wget \\([0-9.]\\+\\) .*/\\1/') + END_VERSIONS + """ +} diff --git a/modules/local/kofamscan/scan.nf b/modules/local/kofamscan/scan.nf new file mode 100644 index 00000000..eaa0aac7 --- /dev/null +++ b/modules/local/kofamscan/scan.nf @@ -0,0 +1,52 @@ +process KOFAMSCAN_SCAN { + tag "$meta.id" + label 'process_medium' + + conda "bioconda::kofamscan=1.3.0" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/kofamscan:1.3.0--hdfd78af_2': + 'quay.io/biocontainers/kofamscan:1.3.0--hdfd78af_2' }" + + input: + tuple val(meta), path(fasta) + path(ko_list) + path(koprofiles) + + output: + tuple val(meta), path("kofamscan_output.tsv.gz"), emit: kout + path("kofamscan.tsv.gz") , emit: kofamtsv + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + input = fasta =~ /\.gz$/ ? fasta.name.take(fasta.name.lastIndexOf('.')) : fasta + gunzip = fasta =~ /\.gz$/ ? "gunzip -c ${fasta} > ${input}" : "" + + """ + $gunzip + + exec_annotation \\ + --profile $koprofiles \\ + --ko-list $ko_list \\ + --format detail-tsv \\ + --cpu $task.cpus \\ + $input \\ + -o kofamscan_output.tsv + + # Create a cleaned up version for summary_tables + echo "orf ko thrshld score evalue ko_definition" | gzip -c > kofamscan.tsv.gz + grep -v '#' kofamscan_output.tsv | cut -f 2-7|sed 's/\t"/\t/' | sed 's/"\$//' | gzip -c >> kofamscan.tsv.gz + + # Gzip the original file + gzip kofamscan_output.tsv + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + kofamscan: \$(echo \$(exec_annotation --version 2>&1) | sed 's/^.*exec_annotation//' ) + END_VERSIONS + """ +} diff --git a/modules/local/megahit/interleaved.nf b/modules/local/megahit/interleaved.nf new file mode 100644 index 00000000..4c940320 --- /dev/null +++ b/modules/local/megahit/interleaved.nf @@ -0,0 +1,51 @@ +process MEGAHIT_INTERLEAVED { + tag "$assembly" + label 'process_high' + + conda "bioconda::megahit=1.2.9 conda-forge::pigz=2.6" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
+ 'https://depot.galaxyproject.org/singularity/mulled-v2-0f92c152b180c7cd39d9b0e6822f8c89ccb59c99:8ec213d21e5d03f9db54898a2baeaf8ec729b447-0' : + 'biocontainers/mulled-v2-0f92c152b180c7cd39d9b0e6822f8c89ccb59c99:8ec213d21e5d03f9db54898a2baeaf8ec729b447-0' }" + + input: + path intl_pe_reads + path se_reads + val assembly + + output: + path("megahit_out/*.contigs.fa.gz") , emit: contigs + path("megahit_out/*.log") , emit: log + path("megahit_out/intermediate_contigs/k*.contigs.fa.gz") , emit: k_contigs + path("megahit_out/intermediate_contigs/k*.addi.fa.gz") , emit: addi_contigs + path("megahit_out/intermediate_contigs/k*.local.fa.gz") , emit: local_contigs + path("megahit_out/intermediate_contigs/k*.final.contigs.fa.gz"), emit: kfinal_contigs + path "versions.yml" , emit: versions + + script: + def args = task.ext.args ?: '' + def args2 = task.ext.args2 ?: '' + def prefix = task.ext.prefix ?: '' + single_ends = se_reads ? "--read ${se_reads.join(',')}" : "" + pair_ends = intl_pe_reads ? "--12 ${intl_pe_reads.join(',')}" : "" + + """ + megahit \\ + $pair_ends \\ + ${single_ends} \\ + -t $task.cpus \\ + $args \\ + --out-prefix $assembly + + pigz \\ + --no-name \\ + -p $task.cpus \\ + $args2 \\ + megahit_out/*.fa \\ + megahit_out/intermediate_contigs/*.fa + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + megahit_interleaved: \$(echo \$(megahit -v 2>&1) | sed 's/MEGAHIT v//') + END_VERSIONS + """ +} diff --git a/modules/local/merge_summary_tables.nf b/modules/local/merge_summary_tables.nf new file mode 100644 index 00000000..ab7fafbd --- /dev/null +++ b/modules/local/merge_summary_tables.nf @@ -0,0 +1,56 @@ +process MERGE_TABLES { + tag "$meta.id" + label 'process_low' + + conda "conda-forge::r-tidyverse=2.0.0 conda-forge::r-dtplyr=1.3.1 conda-forge::r-data.table=1.14.8" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
+ 'https://depot.galaxyproject.org/singularity/mulled-v2-b2ec1fea5791d428eebb8c8ea7409c350d31dada:a447f6b7a6afde38352b24c30ae9cd6e39df95c4-1' : + 'biocontainers/mulled-v2-b2ec1fea5791d428eebb8c8ea7409c350d31dada:a447f6b7a6afde38352b24c30ae9cd6e39df95c4-1' }" + + input: + + tuple val(meta), path(eggtab), path(taxtab), path(kofamscan) + + output: + tuple val(meta), path("${meta.id}_merged_table.tsv.gz") , emit: merged_table + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + + """ + #!/usr/bin/env Rscript + + library(dplyr) + library(readr) + library(purrr) + library(tidyr) + library(stringr) + + Sys.glob('*.tsv.gz') %>% + read_tsv() %>% + mutate(sample = as.character(sample)) %>% + pivot_wider(names_from = c(database,field), values_from = value) %>% + write_tsv('${prefix}_merged_table.tsv.gz') + + writeLines( + c( + "\\"${task.process}\\":", + paste0(" R: ", + paste0(R.Version()[c("major","minor")], collapse = ".")), + paste0(" dplyr: ", packageVersion('dplyr')), + paste0(" dtplyr: ", packageVersion('dtplyr')), + paste0(" data.table: ", packageVersion('data.table')), + paste0(" readr: ", packageVersion('readr')), + paste0(" purrr: ", packageVersion('purrr')), + paste0(" tidyr: ", packageVersion('tidyr')), + paste0(" stringr: ", packageVersion('stringr')) + ), + "versions.yml" + ) + """ +} diff --git a/modules/local/prokkagff2tsv.nf b/modules/local/prokkagff2tsv.nf new file mode 100644 index 00000000..b8735162 --- /dev/null +++ b/modules/local/prokkagff2tsv.nf @@ -0,0 +1,62 @@ +process PROKKAGFF2TSV { + tag "$meta.id" + label 'process_medium' + + conda "conda-forge::r-tidyverse=2.0.0 conda-forge::r-dtplyr=1.3.1 conda-forge::r-data.table=1.14.8" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
+ 'https://depot.galaxyproject.org/singularity/mulled-v2-b2ec1fea5791d428eebb8c8ea7409c350d31dada:a447f6b7a6afde38352b24c30ae9cd6e39df95c4-1' : + 'biocontainers/mulled-v2-b2ec1fea5791d428eebb8c8ea7409c350d31dada:a447f6b7a6afde38352b24c30ae9cd6e39df95c4-1' }" + + input: + tuple val(meta), path(gff) + + output: + tuple val(meta), path("*.prokka-annotations.tsv.gz"), emit: tsv + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + + """ + #!/usr/bin/env Rscript + + library(data.table) + library(dtplyr) + library(dplyr) + library(tidyr) + library(readr) + library(stringr) + + fread( + cmd = "zgrep -P '\\t' $gff", + col.names = c('contig', 'gene_caller', 'feature', 'start', 'end', 'a', 'strand', 'b', 'c') + ) %>% + separate_rows(c, sep = ';') %>% + separate(c, c('k', 'v'), sep = '=') %>% + pivot_wider(names_from = k, values_from = v) %>% + select(-a, -b) %>% + rename(orf = ID) %>% + rename_all(str_to_lower) %>% + relocate(sort(colnames(.)[8:ncol(.)]), .after = 7) %>% + relocate(orf) %>% + as.data.table() %>% + write_tsv("${prefix}.prokka-annotations.tsv.gz") + + writeLines( + c( + "\\"${task.process}\\":", + paste0(" R: ", paste0(R.Version()[c("major","minor")], collapse = ".")), + paste0(" data.table: ", packageVersion("data.table")), + paste0(" dtplyr: " , packageVersion("dtplyr")), + paste0(" dplyr: " , packageVersion("dplyr")), + paste0(" tidyr: " , packageVersion("tidyr")), + paste0(" readr: " , packageVersion("readr")) + ), + "versions.yml" + ) + """ +} diff --git a/modules/local/samplesheet_check.nf b/modules/local/samplesheet_check.nf index 8f6acadf..f24f9d22 100644 --- a/modules/local/samplesheet_check.nf +++ b/modules/local/samplesheet_check.nf @@ -1,20 +1,11 @@ -// Import generic module functions -include { saveFiles; getProcessName } from './functions' - -params.options = [:] - process SAMPLESHEET_CHECK { tag "$samplesheet" - publishDir "${params.outdir}", - mode: params.publish_dir_mode, - saveAs: { filename -> saveFiles(filename:filename, options:params.options, publish_dir:'pipeline_info', meta:[:], publish_by_meta:[]) } + label 'process_single' - conda (params.enable_conda ? "conda-forge::python=3.8.3" : null) - if (workflow.containerEngine == 'singularity' && !params.singularity_pull_docker_container) { - container "https://depot.galaxyproject.org/singularity/python:3.8.3" - } else { - container "quay.io/biocontainers/python:3.8.3" - } + conda "conda-forge::python=3.8.3" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
+ 'https://depot.galaxyproject.org/singularity/python:3.8.3' : + 'biocontainers/python:3.8.3' }" input: path samplesheet @@ -23,6 +14,9 @@ process SAMPLESHEET_CHECK { path '*.csv' , emit: csv path "versions.yml", emit: versions + when: + task.ext.when == null || task.ext.when + script: // This script is bundled with the pipeline, in nf-core/metatdenovo/bin/ """ check_samplesheet.py \\ @@ -30,7 +24,7 @@ process SAMPLESHEET_CHECK { samplesheet.valid.csv cat <<-END_VERSIONS > versions.yml - ${getProcessName(task.process)}: + "${task.process}": python: \$(python --version | sed 's/Python //g') END_VERSIONS """ diff --git a/modules/local/sum_kofamscan.nf b/modules/local/sum_kofamscan.nf new file mode 100644 index 00000000..490ff543 --- /dev/null +++ b/modules/local/sum_kofamscan.nf @@ -0,0 +1,67 @@ +process SUM_KOFAMSCAN { + tag "$meta.id" + label 'process_low' + + conda "conda-forge::r-tidyverse=2.0.0 conda-forge::r-dtplyr=1.3.1 conda-forge::r-data.table=1.14.8" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/mulled-v2-b2ec1fea5791d428eebb8c8ea7409c350d31dada:a447f6b7a6afde38352b24c30ae9cd6e39df95c4-1' : + 'biocontainers/mulled-v2-b2ec1fea5791d428eebb8c8ea7409c350d31dada:a447f6b7a6afde38352b24c30ae9cd6e39df95c4-1' }" + + input: + tuple val(meta), path(kofmascan) + path(fcs) + + output: + tuple val(meta), path("${meta.id}.kofamscan_summary.tsv.gz") , emit: kofamscan_summary + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + """ + #!/usr/bin/env Rscript + + library(dplyr) + library(readr) + library(tidyr) + library(stringr) + library(tidyverse) + + # call the tables into variables + kofams <- read_tsv("kofamscan_output.tsv.gz", show_col_types = FALSE ) %>% + select(-"#") %>% + slice(-1) %>% + rename(orf = "gene name") %>% + distinct(orf, .keep_all = TRUE) + + counts <- list.files(pattern = "*.counts.tsv.gz") %>% + map_df(~read_tsv(., show_col_types = FALSE)) %>% + mutate(sample = as.character(sample)) + + counts %>% + inner_join(kofams, by = 'orf') %>% + group_by(sample) %>% + summarise(value = sum(count), .groups = 'drop') %>% + add_column(database = "kofamscan", field = "kofamscan_n_counts") %>% + relocate(value, .after = last_col()) %>% + write_tsv('${meta.id}.kofamscan_summary.tsv.gz') + + writeLines( + c( + "\\"${task.process}\\":", + paste0(" R: ", paste0(R.Version()[c("major","minor")], collapse = ".")), + paste0(" dplyr: ", packageVersion('dplyr')), + paste0(" dtplyr: ", packageVersion('dtplyr')), + paste0(" data.table: ", packageVersion('data.table')), + paste0(" readr: ", packageVersion('readr')), + paste0(" purrr: ", packageVersion('purrr')), + paste0(" tidyr: ", packageVersion('tidyr')), + paste0(" stringr: ", packageVersion('stringr')) + ), + "versions.yml" + ) + """ +} diff --git a/modules/local/sum_taxonomy.nf b/modules/local/sum_taxonomy.nf new file mode 100644 index 00000000..3a3f349d --- /dev/null +++ b/modules/local/sum_taxonomy.nf @@ -0,0 +1,54 @@ +process SUM_TAXONOMY { + tag "$meta.id" + label 'process_low' + + conda "conda-forge::r-tidyverse=2.0.0 conda-forge::r-dtplyr=1.3.1 conda-forge::r-data.table=1.14.8" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
+ 'https://depot.galaxyproject.org/singularity/mulled-v2-b2ec1fea5791d428eebb8c8ea7409c350d31dada:a447f6b7a6afde38352b24c30ae9cd6e39df95c4-1' : + 'biocontainers/mulled-v2-b2ec1fea5791d428eebb8c8ea7409c350d31dada:a447f6b7a6afde38352b24c30ae9cd6e39df95c4-1' }" + + input: + tuple val(meta), val(db), path(taxonomy) + path feature_counts + + output: + tuple val(meta), path("*_summary.tsv.gz") , emit: taxonomy_summary + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + + """ + #!/usr/bin/env Rscript + + library(tidyverse) + + # Read the taxonomy and counts tables + taxonomy <- read_tsv("${taxonomy}", show_col_types = FALSE ) + + counts <- read_tsv("${feature_counts}", show_col_types = FALSE) %>% + mutate(sample = as.character(sample)) + + # Join the two and count the number of ORFs with assigned taxonomy + counts %>% + inner_join(taxonomy, by = 'orf') %>% + group_by(sample) %>% + summarise(value = sum(count), .groups = 'drop') %>% + mutate(database = "${db ?: 'userdb'}", field = "eukulele_n_counts") %>% + relocate(value, .after = last_col()) %>% + write_tsv('${prefix}_summary.tsv.gz') + + writeLines( + c( + "\\"${task.process}\\":", + paste0(" R: ", paste0(R.Version()[c("major","minor")], collapse = ".")), + paste0(" tidyverse: ", packageVersion('tidyverse')) + ), + "versions.yml" + ) + """ +} diff --git a/modules/local/transrate.nf b/modules/local/transrate.nf new file mode 100644 index 00000000..59690a41 --- /dev/null +++ b/modules/local/transrate.nf @@ -0,0 +1,39 @@ +process TRANSRATE { + tag "$meta.id" + label 'process_low' + + conda "bioconda::transrate=1.0.3" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/transrate:1.0.3--hec16e2b_4': + 'biocontainers/transrate:1.0.3--hec16e2b_4' }" + + input: + tuple val(meta), path(assembly) + + output: + tuple val(meta), path("*assemblies_mqc.csv") , emit: assembly_qc + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + + """ + + transrate \\ + --threads $task.cpus \\ + --assembly $assembly \\ + --output ${prefix}_transrate \\ + $args + + mv ${prefix}_transrate/assemblies.csv ${prefix}_assemblies_mqc.csv + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + transrate: \$(transrate --version) + END_VERSIONS + """ +} diff --git a/modules/local/unpigz.nf b/modules/local/unpigz.nf new file mode 100644 index 00000000..ebb293b0 --- /dev/null +++ b/modules/local/unpigz.nf @@ -0,0 +1,34 @@ +process UNPIGZ { + tag "$file" + label 'process_low' + + conda "conda-forge::pigz=2.3.4" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
+ 'https://depot.galaxyproject.org/singularity/pigz:2.3.4': + 'biocontainers/pigz:2.3.4' }" + + input: + tuple val(meta), path(file) + + output: + tuple val(meta), path("$gunzip") , emit: unzipped + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + + gunzip = file.toString() - '.gz' + + """ + unpigz \\ + -c \\ + -p $task.cpus \\ + ${file} > $gunzip + cat <<-END_VERSIONS > versions.yml + "${task.process}": + pigz: \$( echo \$(pigz --version 2>&1) | sed 's/pigz //') + END_VERSIONS + """ +} diff --git a/modules/local/writespadesyaml.nf b/modules/local/writespadesyaml.nf new file mode 100644 index 00000000..60621378 --- /dev/null +++ b/modules/local/writespadesyaml.nf @@ -0,0 +1,37 @@ +process WRITESPADESYAML { + tag "spades.yaml" + label 'process_single' + + conda "conda-forge::pigz=2.3.4=2.6" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/pigz:2.3.4' : + 'biocontainers/pigz:2.3.4' }" + + input: + path(pe) + path(se) + + output: + path("*.yaml") , emit: yaml + path "versions.yml", emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def read_list = [] + if ( pe ) read_list.add('{ orientation: "fr", type: "paired-end", interlaced reads: [ "' + pe.join('", "') + '" ] }') + if ( se ) read_list.add('{ type: "single", single reads: [ "' + se.join('", "') + '" ] }') + def reads = read_list.join(", ") + """ + cat <<-YAML > spades.yaml + [ $reads ] + YAML + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + writespadesyaml: \$(echo \$(bash --version | grep 'GNU bash' | sed 's/.*version //' | sed 's/ .*//')) + END_VERSIONS + """ +} diff --git a/modules/nf-core/bbmap/align/environment.yml b/modules/nf-core/bbmap/align/environment.yml new file mode 100644 index 00000000..96c4c32c --- /dev/null +++ b/modules/nf-core/bbmap/align/environment.yml @@ -0,0 +1,9 @@ +name: bbmap_align +channels: + - conda-forge + - bioconda + - defaults +dependencies: + - bioconda::bbmap=39.01 + - bioconda::samtools=1.16.1 + - pigz=2.6 diff --git a/modules/nf-core/bbmap/align/main.nf b/modules/nf-core/bbmap/align/main.nf new file mode 100644 index 00000000..e3fee17d --- /dev/null +++ b/modules/nf-core/bbmap/align/main.nf @@ -0,0 +1,58 @@ +process BBMAP_ALIGN { + tag "$meta.id" + label 'process_medium' + + conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/mulled-v2-008daec56b7aaf3f162d7866758142b9f889d690:e8a286b2e789c091bac0a57302cdc78aa0112353-0' : + 'biocontainers/mulled-v2-008daec56b7aaf3f162d7866758142b9f889d690:e8a286b2e789c091bac0a57302cdc78aa0112353-0' }" + + input: + tuple val(meta), path(fastq) + path ref + + output: + tuple val(meta), path("*.bam"), emit: bam + tuple val(meta), path("*.log"), emit: log + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + + input = meta.single_end ? 
"in=${fastq}" : "in=${fastq[0]} in2=${fastq[1]}" + + // Set the db variable to reflect the three possible types of reference input: 1) directory + // named 'ref', 2) directory named something else (containg a 'ref' subdir) or 3) a sequence + // file in fasta format + if ( ref.isDirectory() ) { + if ( ref ==~ /(.\/)?ref\/?/ ) { + db = '' + } else { + db = "path=${ref}" + } + } else { + db = "ref=${ref}" + } + + """ + bbmap.sh \\ + $db \\ + $input \\ + out=${prefix}.bam \\ + $args \\ + threads=$task.cpus \\ + -Xmx${task.memory.toGiga()}g \\ + &> ${prefix}.bbmap.log + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + bbmap: \$(bbversion.sh | grep -v "Duplicate cpuset") + samtools: \$(echo \$(samtools --version 2>&1) | sed 's/^.*samtools //; s/Using.*\$//') + pigz: \$( pigz --version 2>&1 | sed 's/pigz //g' ) + END_VERSIONS + """ +} diff --git a/modules/nf-core/bbmap/align/meta.yml b/modules/nf-core/bbmap/align/meta.yml new file mode 100644 index 00000000..a0bddba4 --- /dev/null +++ b/modules/nf-core/bbmap/align/meta.yml @@ -0,0 +1,50 @@ +name: bbmap_align +description: Align short or PacBio reads to a reference genome using BBMap +keywords: + - align + - map + - fasta + - fastq + - genome + - reference +tools: + - bbmap: + description: BBMap is a short read aligner, as well as various other bioinformatic tools. + homepage: https://jgi.doe.gov/data-and-tools/bbtools/bb-tools-user-guide/ + documentation: https://jgi.doe.gov/data-and-tools/bbtools/bb-tools-user-guide/ + licence: ["UC-LBL license (see package)"] +input: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - fastq: + type: file + description: | + List of input FastQ files of size 1 and 2 for single-end and paired-end data, + respectively. + - ref: + type: file + description: | + Either "ref" a directory containing an index, the name of another directory + with a "ref" subdirectory containing an index or the name of a fasta formatted + nucleotide file containg the reference to map to. +output: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" + - bam: + type: file + description: BAM file + pattern: "*.{bam}" +authors: + - "@erikrikarddaniel" +maintainers: + - "@erikrikarddaniel" diff --git a/modules/nf-core/bbmap/bbduk/environment.yml b/modules/nf-core/bbmap/bbduk/environment.yml new file mode 100644 index 00000000..1221474c --- /dev/null +++ b/modules/nf-core/bbmap/bbduk/environment.yml @@ -0,0 +1,7 @@ +name: bbmap_bbduk +channels: + - conda-forge + - bioconda + - defaults +dependencies: + - bioconda::bbmap=39.01 diff --git a/modules/nf-core/bbmap/bbduk/main.nf b/modules/nf-core/bbmap/bbduk/main.nf new file mode 100644 index 00000000..6453afc6 --- /dev/null +++ b/modules/nf-core/bbmap/bbduk/main.nf @@ -0,0 +1,43 @@ +process BBMAP_BBDUK { + tag "$meta.id" + label 'process_medium' + + conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
+ 'https://depot.galaxyproject.org/singularity/bbmap:39.01--h5c4e2a8_0': + 'biocontainers/bbmap:39.01--h5c4e2a8_0' }" + + input: + tuple val(meta), path(reads) + path contaminants + + output: + tuple val(meta), path('*.fastq.gz'), emit: reads + tuple val(meta), path('*.log') , emit: log + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + def raw = meta.single_end ? "in=${reads[0]}" : "in1=${reads[0]} in2=${reads[1]}" + def trimmed = meta.single_end ? "out=${prefix}.fastq.gz" : "out1=${prefix}_1.fastq.gz out2=${prefix}_2.fastq.gz" + def contaminants_fa = contaminants ? "ref=$contaminants" : '' + """ + maxmem=\$(echo \"$task.memory\"| sed 's/ GB/g/g') + bbduk.sh \\ + -Xmx\$maxmem \\ + $raw \\ + $trimmed \\ + threads=$task.cpus \\ + $args \\ + $contaminants_fa \\ + &> ${prefix}.bbduk.log + cat <<-END_VERSIONS > versions.yml + "${task.process}": + bbmap: \$(bbversion.sh | grep -v "Duplicate cpuset") + END_VERSIONS + """ +} diff --git a/modules/nf-core/bbmap/bbduk/meta.yml b/modules/nf-core/bbmap/bbduk/meta.yml new file mode 100644 index 00000000..9a1f0562 --- /dev/null +++ b/modules/nf-core/bbmap/bbduk/meta.yml @@ -0,0 +1,50 @@ +name: bbmap_bbduk +description: Adapter and quality trimming of sequencing reads +keywords: + - trimming + - adapter trimming + - quality trimming + - fastq +tools: + - bbmap: + description: BBMap is a short read aligner, as well as various other bioinformatic tools. + homepage: https://jgi.doe.gov/data-and-tools/bbtools/bb-tools-user-guide/ + documentation: https://jgi.doe.gov/data-and-tools/bbtools/bb-tools-user-guide/ + licence: ["UC-LBL license (see package)"] +input: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - reads: + type: file + description: | + List of input FastQ files of size 1 and 2 for single-end and paired-end data, + respectively. + - contaminants: + type: file + description: | + Reference files containing adapter and/or contaminant sequences for sequence kmer matching +output: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - reads: + type: file + description: The trimmed/modified fastq reads + pattern: "*fastq.gz" + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" + - log: + type: file + description: Bbduk log file + pattern: "*bbduk.log" +authors: + - "@MGordon09" +maintainers: + - "@MGordon09" diff --git a/modules/nf-core/bbmap/bbnorm/environment.yml b/modules/nf-core/bbmap/bbnorm/environment.yml new file mode 100644 index 00000000..8b97ffda --- /dev/null +++ b/modules/nf-core/bbmap/bbnorm/environment.yml @@ -0,0 +1,8 @@ +name: bbmap_bbnorm +channels: + - conda-forge + - bioconda + - defaults +dependencies: + - bioconda::bbmap=39.01 + - pigz=2.6 diff --git a/modules/nf-core/bbmap/bbnorm/main.nf b/modules/nf-core/bbmap/bbnorm/main.nf new file mode 100644 index 00000000..1cac93de --- /dev/null +++ b/modules/nf-core/bbmap/bbnorm/main.nf @@ -0,0 +1,42 @@ +process BBMAP_BBNORM { + tag "$meta.id" + label 'process_medium' + + conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
+ 'https://depot.galaxyproject.org/singularity/mulled-v2-008daec56b7aaf3f162d7866758142b9f889d690:e8a286b2e789c091bac0a57302cdc78aa0112353-0': + 'biocontainers/mulled-v2-008daec56b7aaf3f162d7866758142b9f889d690:e8a286b2e789c091bac0a57302cdc78aa0112353-0' }" + + input: + tuple val(meta), path(fastq) + + output: + tuple val(meta), path("*.fastq.gz"), emit: fastq + tuple val(meta), path("*.log") , emit: log + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + + input = meta.single_end ? "in=${fastq.join(',')}" : "in=${fastq[0]} in2=${fastq[1]}" + output = meta.single_end ? "out=${prefix}.fastq.gz" : "out1=${prefix}_1.nm.fastq.gz out2=${prefix}_2.nm.fastq.gz" + + """ + bbnorm.sh \\ + $input \\ + $output \\ + $args \\ + threads=$task.cpus \\ + -Xmx${task.memory.toGiga()}g \\ + &> ${prefix}.bbnorm.log + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + bbmap: \$(bbversion.sh | grep -v "Duplicate cpuset") + END_VERSIONS + """ +} diff --git a/modules/nf-core/bbmap/bbnorm/meta.yml b/modules/nf-core/bbmap/bbnorm/meta.yml new file mode 100644 index 00000000..6c8426f8 --- /dev/null +++ b/modules/nf-core/bbmap/bbnorm/meta.yml @@ -0,0 +1,41 @@ +name: bbmap_bbnorm +description: BBNorm is designed to normalize coverage by down-sampling reads over high-depth areas of a genome, to result in a flat coverage distribution. +keywords: + - normalization + - assembly + - coverage +tools: + - bbmap: + description: "BBMap is a short read aligner, as well as various other bioinformatic tools." + homepage: "https://jgi.doe.gov/data-and-tools/bbtools/bb-tools-user-guide/" + documentation: "https://jgi.doe.gov/data-and-tools/bbtools/bb-tools-user-guide/" + tool_dev_url: "https://jgi.doe.gov/data-and-tools/software-tools/bbtools/bb-tools-user-guide/bbnorm-guide/" + licence: "BBMap - Bushnell B. - sourceforge.net/projects/bbmap/" +input: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - fastq: + type: file + description: fastq file + pattern: "*.{fastq,fq}(.gz)?" +output: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" + - fastq: + type: file + description: fastq file + pattern: "*.{fastq, fq}.gz" +authors: + - "@danilodileo" +maintainers: + - "@danilodileo" diff --git a/modules/nf-core/bbmap/index/environment.yml b/modules/nf-core/bbmap/index/environment.yml new file mode 100644 index 00000000..515f8d21 --- /dev/null +++ b/modules/nf-core/bbmap/index/environment.yml @@ -0,0 +1,7 @@ +name: bbmap_index +channels: + - conda-forge + - bioconda + - defaults +dependencies: + - bioconda::bbmap=39.01 diff --git a/modules/nf-core/bbmap/index/main.nf b/modules/nf-core/bbmap/index/main.nf new file mode 100644 index 00000000..c9b61e17 --- /dev/null +++ b/modules/nf-core/bbmap/index/main.nf @@ -0,0 +1,34 @@ +process BBMAP_INDEX { + tag "$fasta" + label 'process_long' + + conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
+ 'https://depot.galaxyproject.org/singularity/bbmap:39.01--h5c4e2a8_0': + 'biocontainers/bbmap:39.01--h5c4e2a8_0' }" + + input: + path fasta + + output: + path 'ref' , emit: index + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + """ + bbmap.sh \\ + ref=${fasta} \\ + $args \\ + threads=$task.cpus \\ + -Xmx${task.memory.toGiga()}g + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + bbmap: \$(bbversion.sh | grep -v "Duplicate cpuset") + END_VERSIONS + """ +} diff --git a/modules/nf-core/bbmap/index/meta.yml b/modules/nf-core/bbmap/index/meta.yml new file mode 100644 index 00000000..7e1cbc9f --- /dev/null +++ b/modules/nf-core/bbmap/index/meta.yml @@ -0,0 +1,30 @@ +name: bbmap_index +description: Creates an index from a fasta file, ready to be used by bbmap.sh in mapping mode. +keywords: + - map + - index + - fasta +tools: + - bbmap: + description: BBMap is a short read aligner, as well as various other bioinformatic tools. + homepage: https://jgi.doe.gov/data-and-tools/bbtools/bb-tools-user-guide/ + documentation: https://jgi.doe.gov/data-and-tools/bbtools/bb-tools-user-guide/ + licence: ["UC-LBL license (see package)"] +input: + - fasta: + type: fasta + description: fasta formatted file with nucleotide sequences + pattern: "*.{fna,fa,fasta}" +output: + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" + - db: + type: directory + description: Directory with index files + pattern: "ref" +authors: + - "@daniellundin" +maintainers: + - "@daniellundin" diff --git a/modules/nf-core/cat/cat/environment.yml b/modules/nf-core/cat/cat/environment.yml new file mode 100644 index 00000000..17a04ef2 --- /dev/null +++ b/modules/nf-core/cat/cat/environment.yml @@ -0,0 +1,7 @@ +name: cat_cat +channels: + - conda-forge + - bioconda + - defaults +dependencies: + - conda-forge::pigz=2.3.4 diff --git a/modules/nf-core/cat/cat/main.nf b/modules/nf-core/cat/cat/main.nf new file mode 100644 index 00000000..adbdbd7b --- /dev/null +++ b/modules/nf-core/cat/cat/main.nf @@ -0,0 +1,79 @@ +process CAT_CAT { + tag "$meta.id" + label 'process_low' + + conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/pigz:2.3.4' : + 'biocontainers/pigz:2.3.4' }" + + input: + tuple val(meta), path(files_in) + + output: + tuple val(meta), path("${prefix}"), emit: file_out + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def args2 = task.ext.args2 ?: '' + def file_list = files_in.collect { it.toString() } + + // choose appropriate concatenation tool depending on input and output format + + // | input | output | command1 | command2 | + // |-----------|------------|----------|----------| + // | gzipped | gzipped | cat | | + // | ungzipped | ungzipped | cat | | + // | gzipped | ungzipped | zcat | | + // | ungzipped | gzipped | cat | pigz | + + // Use input file ending as default + prefix = task.ext.prefix ?: "${meta.id}${getFileSuffix(file_list[0])}" + out_zip = prefix.endsWith('.gz') + in_zip = file_list[0].endsWith('.gz') + command1 = (in_zip && !out_zip) ? 'zcat' : 'cat' + command2 = (!in_zip && out_zip) ? 
"| pigz -c -p $task.cpus $args2" : '' + if(file_list.contains(prefix.trim())) { + error "The name of the input file can't be the same as for the output prefix in the " + + "module CAT_CAT (currently `$prefix`). Please choose a different one." + } + """ + $command1 \\ + $args \\ + ${file_list.join(' ')} \\ + $command2 \\ + > ${prefix} + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + pigz: \$( pigz --version 2>&1 | sed 's/pigz //g' ) + END_VERSIONS + """ + + stub: + def file_list = files_in.collect { it.toString() } + prefix = task.ext.prefix ?: "${meta.id}${file_list[0].substring(file_list[0].lastIndexOf('.'))}" + if(file_list.contains(prefix.trim())) { + error "The name of the input file can't be the same as for the output prefix in the " + + "module CAT_CAT (currently `$prefix`). Please choose a different one." + } + """ + touch $prefix + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + pigz: \$( pigz --version 2>&1 | sed 's/pigz //g' ) + END_VERSIONS + """ +} + +// for .gz files also include the second to last extension if it is present. E.g., .fasta.gz +def getFileSuffix(filename) { + def match = filename =~ /^.*?((\.\w{1,5})?(\.\w{1,5}\.gz$))/ + return match ? match[0][1] : filename.substring(filename.lastIndexOf('.')) +} + diff --git a/modules/nf-core/cat/cat/meta.yml b/modules/nf-core/cat/cat/meta.yml new file mode 100644 index 00000000..00a8db0b --- /dev/null +++ b/modules/nf-core/cat/cat/meta.yml @@ -0,0 +1,36 @@ +name: cat_cat +description: A module for concatenation of gzipped or uncompressed files +keywords: + - concatenate + - gzip + - cat +tools: + - cat: + description: Just concatenation + documentation: https://man7.org/linux/man-pages/man1/cat.1.html + licence: ["GPL-3.0-or-later"] +input: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - files_in: + type: file + description: List of compressed / uncompressed files + pattern: "*" +output: + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" + - file_out: + type: file + description: Concatenated file. 
Will be gzipped if file_out ends with ".gz" + pattern: "${file_out}" +authors: + - "@erikrikarddaniel" + - "@FriederikeHanssen" +maintainers: + - "@erikrikarddaniel" + - "@FriederikeHanssen" diff --git a/modules/nf-core/cat/cat/tests/main.nf.test b/modules/nf-core/cat/cat/tests/main.nf.test new file mode 100644 index 00000000..aaae04f9 --- /dev/null +++ b/modules/nf-core/cat/cat/tests/main.nf.test @@ -0,0 +1,177 @@ +nextflow_process { + + name "Test Process CAT_CAT" + script "../main.nf" + process "CAT_CAT" + tag "modules" + tag "modules_nfcore" + tag "cat" + tag "cat/cat" + + test("test_cat_name_conflict") { + when { + params { + outdir = "${outputDir}" + } + process { + """ + input[0] = + [ + [ id:'genome', single_end:true ], + [ + file(params.test_data['sarscov2']['genome']['genome_fasta'], checkIfExists: true), + file(params.test_data['sarscov2']['genome']['genome_sizes'], checkIfExists: true) + ] + ] + """ + } + } + then { + assertAll( + { assert !process.success }, + { assert process.stdout.toString().contains("The name of the input file can't be the same as for the output prefix") } + ) + } + } + + test("test_cat_unzipped_unzipped") { + when { + params { + outdir = "${outputDir}" + } + process { + """ + input[0] = + [ + [ id:'test', single_end:true ], + [ + file(params.test_data['sarscov2']['genome']['genome_fasta'], checkIfExists: true), + file(params.test_data['sarscov2']['genome']['genome_sizes'], checkIfExists: true) + ] + ] + """ + } + } + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out).match() } + ) + } + } + + + test("test_cat_zipped_zipped") { + when { + params { + outdir = "${outputDir}" + } + process { + """ + input[0] = + [ + [ id:'test', single_end:true ], + [ + file(params.test_data['sarscov2']['genome']['genome_gff3_gz'], checkIfExists: true), + file(params.test_data['sarscov2']['genome']['contigs_genome_maf_gz'], checkIfExists: true) + ] + ] + """ + } + } + then { + def lines = path(process.out.file_out.get(0).get(1)).linesGzip + assertAll( + { assert process.success }, + { assert snapshot(process.out).match() } + ) + } + } + + test("test_cat_zipped_unzipped") { + config './nextflow_zipped_unzipped.config' + + when { + params { + outdir = "${outputDir}" + } + process { + """ + input[0] = + [ + [ id:'test', single_end:true ], + [ + file(params.test_data['sarscov2']['genome']['genome_gff3_gz'], checkIfExists: true), + file(params.test_data['sarscov2']['genome']['contigs_genome_maf_gz'], checkIfExists: true) + ] + ] + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out).match() } + ) + } + + } + + test("test_cat_unzipped_zipped") { + config './nextflow_unzipped_zipped.config' + when { + params { + outdir = "${outputDir}" + } + process { + """ + input[0] = + [ + [ id:'test', single_end:true ], + [ + file(params.test_data['sarscov2']['genome']['genome_fasta'], checkIfExists: true), + file(params.test_data['sarscov2']['genome']['genome_sizes'], checkIfExists: true) + ] + ] + """ + } + } + then { + def lines = path(process.out.file_out.get(0).get(1)).linesGzip + assertAll( + { assert process.success }, + { assert snapshot(process.out).match() } + ) + } + } + + test("test_cat_one_file_unzipped_zipped") { + config './nextflow_unzipped_zipped.config' + when { + params { + outdir = "${outputDir}" + } + process { + """ + input[0] = + [ + [ id:'test', single_end:true ], + [ + file(params.test_data['sarscov2']['genome']['genome_fasta'], checkIfExists: true) + ] + ] + """ + } + } + then { + def lines = 
path(process.out.file_out.get(0).get(1)).linesGzip + assertAll( + { assert process.success }, + { assert snapshot(lines[0..5]).match("test_cat_one_file_unzipped_zipped_lines") }, + { assert snapshot(lines.size()).match("test_cat_one_file_unzipped_zipped_size")} + ) + } + } +} + diff --git a/modules/nf-core/cat/cat/tests/main.nf.test.snap b/modules/nf-core/cat/cat/tests/main.nf.test.snap new file mode 100644 index 00000000..0c9bfe8d --- /dev/null +++ b/modules/nf-core/cat/cat/tests/main.nf.test.snap @@ -0,0 +1,145 @@ +{ + "test_cat_unzipped_unzipped": { + "content": [ + { + "0": [ + [ + { + "id": "test", + "single_end": true + }, + "test.fasta:md5,f44b33a0e441ad58b2d3700270e2dbe2" + ] + ], + "1": [ + "versions.yml:md5,115ed6177ebcff24eb99d503fa5ef894" + ], + "file_out": [ + [ + { + "id": "test", + "single_end": true + }, + "test.fasta:md5,f44b33a0e441ad58b2d3700270e2dbe2" + ] + ], + "versions": [ + "versions.yml:md5,115ed6177ebcff24eb99d503fa5ef894" + ] + } + ], + "timestamp": "2023-10-16T14:32:18.500464399" + }, + "test_cat_zipped_unzipped": { + "content": [ + { + "0": [ + [ + { + "id": "test", + "single_end": true + }, + "cat.txt:md5,c439d3b60e7bc03e8802a451a0d9a5d9" + ] + ], + "1": [ + "versions.yml:md5,115ed6177ebcff24eb99d503fa5ef894" + ], + "file_out": [ + [ + { + "id": "test", + "single_end": true + }, + "cat.txt:md5,c439d3b60e7bc03e8802a451a0d9a5d9" + ] + ], + "versions": [ + "versions.yml:md5,115ed6177ebcff24eb99d503fa5ef894" + ] + } + ], + "timestamp": "2023-10-16T14:32:49.642741302" + }, + "test_cat_zipped_zipped": { + "content": [ + { + "0": [ + [ + { + "id": "test", + "single_end": true + }, + "test.gff3.gz:md5,c439d3b60e7bc03e8802a451a0d9a5d9" + ] + ], + "1": [ + "versions.yml:md5,115ed6177ebcff24eb99d503fa5ef894" + ], + "file_out": [ + [ + { + "id": "test", + "single_end": true + }, + "test.gff3.gz:md5,c439d3b60e7bc03e8802a451a0d9a5d9" + ] + ], + "versions": [ + "versions.yml:md5,115ed6177ebcff24eb99d503fa5ef894" + ] + } + ], + "timestamp": "2024-01-12T14:02:02.999254641" + }, + "test_cat_one_file_unzipped_zipped_lines": { + "content": [ + [ + ">MT192765.1 Severe acute respiratory syndrome coronavirus 2 isolate SARS-CoV-2/human/USA/PC00101P/2020, complete genome", + "GTTTATACCTTCCCAGGTAACAAACCAACCAACTTTCGATCTCTTGTAGATCTGTTCTCTAAACGAACTTTAAAATCTGT", + "GTGGCTGTCACTCGGCTGCATGCTTAGTGCACTCACGCAGTATAATTAATAACTAATTACTGTCGTTGACAGGACACGAG", + "TAACTCGTCTATCTTCTGCAGGCTGCTTACGGTTTCGTCCGTGTTGCAGCCGATCATCAGCACATCTAGGTTTTGTCCGG", + "GTGTGACCGAAAGGTAAGATGGAGAGCCTTGTCCCTGGTTTCAACGAGAAAACACACGTCCAACTCAGTTTGCCTGTTTT", + "ACAGGTTCGCGACGTGCTCGTACGTGGCTTTGGAGACTCCGTGGAGGAGGTCTTATCAGAGGCACGTCAACATCTTAAAG" + ] + ], + "timestamp": "2023-10-16T14:33:21.39642399" + }, + "test_cat_unzipped_zipped": { + "content": [ + { + "0": [ + [ + { + "id": "test", + "single_end": true + }, + "cat.txt.gz:md5,f44b33a0e441ad58b2d3700270e2dbe2" + ] + ], + "1": [ + "versions.yml:md5,115ed6177ebcff24eb99d503fa5ef894" + ], + "file_out": [ + [ + { + "id": "test", + "single_end": true + }, + "cat.txt.gz:md5,f44b33a0e441ad58b2d3700270e2dbe2" + ] + ], + "versions": [ + "versions.yml:md5,115ed6177ebcff24eb99d503fa5ef894" + ] + } + ], + "timestamp": "2024-01-12T14:08:26.948048418" + }, + "test_cat_one_file_unzipped_zipped_size": { + "content": [ + 374 + ], + "timestamp": "2024-01-12T14:10:22.445700266" + } +} diff --git a/modules/nf-core/cat/cat/tests/nextflow_unzipped_zipped.config b/modules/nf-core/cat/cat/tests/nextflow_unzipped_zipped.config new file mode 100644 index 00000000..ec26b0fd --- /dev/null +++ 
b/modules/nf-core/cat/cat/tests/nextflow_unzipped_zipped.config @@ -0,0 +1,6 @@ + +process { + withName: CAT_CAT { + ext.prefix = 'cat.txt.gz' + } +} diff --git a/modules/nf-core/cat/cat/tests/nextflow_zipped_unzipped.config b/modules/nf-core/cat/cat/tests/nextflow_zipped_unzipped.config new file mode 100644 index 00000000..fbc79783 --- /dev/null +++ b/modules/nf-core/cat/cat/tests/nextflow_zipped_unzipped.config @@ -0,0 +1,8 @@ + +process { + + withName: CAT_CAT { + ext.prefix = 'cat.txt' + } + +} diff --git a/modules/nf-core/cat/cat/tests/tags.yml b/modules/nf-core/cat/cat/tests/tags.yml new file mode 100644 index 00000000..37b578f5 --- /dev/null +++ b/modules/nf-core/cat/cat/tests/tags.yml @@ -0,0 +1,2 @@ +cat/cat: + - modules/nf-core/cat/cat/** diff --git a/modules/nf-core/cat/fastq/environment.yml b/modules/nf-core/cat/fastq/environment.yml new file mode 100644 index 00000000..bff93add --- /dev/null +++ b/modules/nf-core/cat/fastq/environment.yml @@ -0,0 +1,7 @@ +name: cat_fastq +channels: + - conda-forge + - bioconda + - defaults +dependencies: + - conda-forge::sed=4.7 diff --git a/modules/nf-core/cat/fastq/main.nf b/modules/nf-core/cat/fastq/main.nf new file mode 100644 index 00000000..3d963784 --- /dev/null +++ b/modules/nf-core/cat/fastq/main.nf @@ -0,0 +1,80 @@ +process CAT_FASTQ { + tag "$meta.id" + label 'process_single' + + conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/ubuntu:20.04' : + 'nf-core/ubuntu:20.04' }" + + input: + tuple val(meta), path(reads, stageAs: "input*/*") + + output: + tuple val(meta), path("*.merged.fastq.gz"), emit: reads + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + def readList = reads instanceof List ? reads.collect{ it.toString() } : [reads.toString()] + if (meta.single_end) { + if (readList.size >= 1) { + """ + cat ${readList.join(' ')} > ${prefix}.merged.fastq.gz + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + cat: \$(echo \$(cat --version 2>&1) | sed 's/^.*coreutils) //; s/ .*\$//') + END_VERSIONS + """ + } + } else { + if (readList.size >= 2) { + def read1 = [] + def read2 = [] + readList.eachWithIndex{ v, ix -> ( ix & 1 ? read2 : read1 ) << v } + """ + cat ${read1.join(' ')} > ${prefix}_1.merged.fastq.gz + cat ${read2.join(' ')} > ${prefix}_2.merged.fastq.gz + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + cat: \$(echo \$(cat --version 2>&1) | sed 's/^.*coreutils) //; s/ .*\$//') + END_VERSIONS + """ + } + } + + stub: + def prefix = task.ext.prefix ?: "${meta.id}" + def readList = reads instanceof List ? 
reads.collect{ it.toString() } : [reads.toString()] + if (meta.single_end) { + if (readList.size > 1) { + """ + touch ${prefix}.merged.fastq.gz + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + cat: \$(echo \$(cat --version 2>&1) | sed 's/^.*coreutils) //; s/ .*\$//') + END_VERSIONS + """ + } + } else { + if (readList.size > 2) { + """ + touch ${prefix}_1.merged.fastq.gz + touch ${prefix}_2.merged.fastq.gz + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + cat: \$(echo \$(cat --version 2>&1) | sed 's/^.*coreutils) //; s/ .*\$//') + END_VERSIONS + """ + } + } + +} diff --git a/modules/nf-core/cat/fastq/meta.yml b/modules/nf-core/cat/fastq/meta.yml new file mode 100644 index 00000000..db4ac3c7 --- /dev/null +++ b/modules/nf-core/cat/fastq/meta.yml @@ -0,0 +1,42 @@ +name: cat_fastq +description: Concatenates fastq files +keywords: + - cat + - fastq + - concatenate +tools: + - cat: + description: | + The cat utility reads files sequentially, writing them to the standard output. + documentation: https://www.gnu.org/software/coreutils/manual/html_node/cat-invocation.html + licence: ["GPL-3.0-or-later"] +input: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - reads: + type: file + description: | + List of input FastQ files to be concatenated. +output: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - reads: + type: file + description: Merged fastq file + pattern: "*.{merged.fastq.gz}" + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" +authors: + - "@joseespinosa" + - "@drpatelh" +maintainers: + - "@joseespinosa" + - "@drpatelh" diff --git a/modules/nf-core/cat/fastq/tests/main.nf.test b/modules/nf-core/cat/fastq/tests/main.nf.test new file mode 100644 index 00000000..dab2e14c --- /dev/null +++ b/modules/nf-core/cat/fastq/tests/main.nf.test @@ -0,0 +1,138 @@ +nextflow_process { + + name "Test Process CAT_FASTQ" + script "../main.nf" + process "CAT_FASTQ" + tag "modules" + tag "modules_nfcore" + tag "cat" + tag "cat/fastq" + + test("test_cat_fastq_single_end") { + + when { + params { + outdir = "$outputDir" + } + process { + """ + input[0] = Channel.of([ + [ id:'test', single_end:true ], // meta map + [ file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/fastq/test_1.fastq.gz', checkIfExists: true), + file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/fastq/test_2.fastq.gz', checkIfExists: true)] + ]) + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out).match() } + ) + } + } + + test("test_cat_fastq_paired_end") { + + when { + params { + outdir = "$outputDir" + } + process { + """ + input[0] = Channel.of([ + [ id:'test', single_end:false ], // meta map + [ file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/fastq/test_1.fastq.gz', checkIfExists: true), + file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/fastq/test_2.fastq.gz', checkIfExists: true), + file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/fastq/test2_1.fastq.gz', checkIfExists: true), + file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/fastq/test2_2.fastq.gz', checkIfExists: true)] + ]) + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out).match() } + ) + } + } + + test("test_cat_fastq_single_end_same_name") { + + 
when { + params { + outdir = "$outputDir" + } + process { + """ + input[0] = Channel.of([ + [ id:'test', single_end:true ], // meta map + [ file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/fastq/test_1.fastq.gz', checkIfExists: true), + file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/fastq/test_1.fastq.gz', checkIfExists: true)] + ]) + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out).match() } + ) + } + } + + test("test_cat_fastq_paired_end_same_name") { + + when { + params { + outdir = "$outputDir" + } + process { + """ + input[0] = Channel.of([ + [ id:'test', single_end:false ], // meta map + [ file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/fastq/test_1.fastq.gz', checkIfExists: true), + file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/fastq/test_2.fastq.gz', checkIfExists: true), + file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/fastq/test_1.fastq.gz', checkIfExists: true), + file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/fastq/test_2.fastq.gz', checkIfExists: true)] + ]) + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out).match() } + ) + } + } + + test("test_cat_fastq_single_end_single_file") { + + when { + params { + outdir = "$outputDir" + } + process { + """ + input[0] = Channel.of([ + [ id:'test', single_end:true ], // meta map + [ file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/fastq/test_1.fastq.gz', checkIfExists: true)] + ]) + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out).match() } + ) + } + } +} diff --git a/modules/nf-core/cat/fastq/tests/main.nf.test.snap b/modules/nf-core/cat/fastq/tests/main.nf.test.snap new file mode 100644 index 00000000..43dfe28f --- /dev/null +++ b/modules/nf-core/cat/fastq/tests/main.nf.test.snap @@ -0,0 +1,169 @@ +{ + "test_cat_fastq_single_end": { + "content": [ + { + "0": [ + [ + { + "id": "test", + "single_end": true + }, + "test.merged.fastq.gz:md5,ee314a9bd568d06617171b0c85f508da" + ] + ], + "1": [ + "versions.yml:md5,d42d6e24d67004608495883e00bd501b" + ], + "reads": [ + [ + { + "id": "test", + "single_end": true + }, + "test.merged.fastq.gz:md5,ee314a9bd568d06617171b0c85f508da" + ] + ], + "versions": [ + "versions.yml:md5,d42d6e24d67004608495883e00bd501b" + ] + } + ], + "timestamp": "2024-01-17T17:30:39.816981" + }, + "test_cat_fastq_single_end_same_name": { + "content": [ + { + "0": [ + [ + { + "id": "test", + "single_end": true + }, + "test.merged.fastq.gz:md5,3ad9406595fafec8172368f9cd0b6a22" + ] + ], + "1": [ + "versions.yml:md5,d42d6e24d67004608495883e00bd501b" + ], + "reads": [ + [ + { + "id": "test", + "single_end": true + }, + "test.merged.fastq.gz:md5,3ad9406595fafec8172368f9cd0b6a22" + ] + ], + "versions": [ + "versions.yml:md5,d42d6e24d67004608495883e00bd501b" + ] + } + ], + "timestamp": "2024-01-17T17:32:35.229332" + }, + "test_cat_fastq_single_end_single_file": { + "content": [ + { + "0": [ + [ + { + "id": "test", + "single_end": true + }, + "test.merged.fastq.gz:md5,4161df271f9bfcd25d5845a1e220dbec" + ] + ], + "1": [ + "versions.yml:md5,d42d6e24d67004608495883e00bd501b" + ], + "reads": [ + [ + { + "id": "test", + "single_end": true + }, + "test.merged.fastq.gz:md5,4161df271f9bfcd25d5845a1e220dbec" + ] + ], + "versions": [ + "versions.yml:md5,d42d6e24d67004608495883e00bd501b" + ] + } + ], + "timestamp": "2024-01-17T17:34:00.058829" + }, + 
"test_cat_fastq_paired_end_same_name": { + "content": [ + { + "0": [ + [ + { + "id": "test", + "single_end": false + }, + [ + "test_1.merged.fastq.gz:md5,3ad9406595fafec8172368f9cd0b6a22", + "test_2.merged.fastq.gz:md5,a52cab0b840c7178b0ea83df1fdbe8d5" + ] + ] + ], + "1": [ + "versions.yml:md5,d42d6e24d67004608495883e00bd501b" + ], + "reads": [ + [ + { + "id": "test", + "single_end": false + }, + [ + "test_1.merged.fastq.gz:md5,3ad9406595fafec8172368f9cd0b6a22", + "test_2.merged.fastq.gz:md5,a52cab0b840c7178b0ea83df1fdbe8d5" + ] + ] + ], + "versions": [ + "versions.yml:md5,d42d6e24d67004608495883e00bd501b" + ] + } + ], + "timestamp": "2024-01-17T17:33:33.031555" + }, + "test_cat_fastq_paired_end": { + "content": [ + { + "0": [ + [ + { + "id": "test", + "single_end": false + }, + [ + "test_1.merged.fastq.gz:md5,3ad9406595fafec8172368f9cd0b6a22", + "test_2.merged.fastq.gz:md5,a52cab0b840c7178b0ea83df1fdbe8d5" + ] + ] + ], + "1": [ + "versions.yml:md5,d42d6e24d67004608495883e00bd501b" + ], + "reads": [ + [ + { + "id": "test", + "single_end": false + }, + [ + "test_1.merged.fastq.gz:md5,3ad9406595fafec8172368f9cd0b6a22", + "test_2.merged.fastq.gz:md5,a52cab0b840c7178b0ea83df1fdbe8d5" + ] + ] + ], + "versions": [ + "versions.yml:md5,d42d6e24d67004608495883e00bd501b" + ] + } + ], + "timestamp": "2024-01-17T17:32:02.270935" + } +} \ No newline at end of file diff --git a/modules/nf-core/cat/fastq/tests/tags.yml b/modules/nf-core/cat/fastq/tests/tags.yml new file mode 100644 index 00000000..6ac43614 --- /dev/null +++ b/modules/nf-core/cat/fastq/tests/tags.yml @@ -0,0 +1,2 @@ +cat/fastq: + - modules/nf-core/cat/fastq/** diff --git a/modules/nf-core/custom/dumpsoftwareversions/environment.yml b/modules/nf-core/custom/dumpsoftwareversions/environment.yml new file mode 100644 index 00000000..9b3272bc --- /dev/null +++ b/modules/nf-core/custom/dumpsoftwareversions/environment.yml @@ -0,0 +1,7 @@ +name: custom_dumpsoftwareversions +channels: + - conda-forge + - bioconda + - defaults +dependencies: + - bioconda::multiqc=1.19 diff --git a/modules/nf-core/custom/dumpsoftwareversions/main.nf b/modules/nf-core/custom/dumpsoftwareversions/main.nf new file mode 100644 index 00000000..f2187611 --- /dev/null +++ b/modules/nf-core/custom/dumpsoftwareversions/main.nf @@ -0,0 +1,24 @@ +process CUSTOM_DUMPSOFTWAREVERSIONS { + label 'process_single' + + // Requires `pyyaml` which does not have a dedicated container but is in the MultiQC container + conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
+ 'https://depot.galaxyproject.org/singularity/multiqc:1.19--pyhdfd78af_0' : + 'biocontainers/multiqc:1.19--pyhdfd78af_0' }" + + input: + path versions + + output: + path "software_versions.yml" , emit: yml + path "software_versions_mqc.yml", emit: mqc_yml + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + template 'dumpsoftwareversions.py' +} diff --git a/modules/nf-core/modules/custom/dumpsoftwareversions/meta.yml b/modules/nf-core/custom/dumpsoftwareversions/meta.yml similarity index 82% rename from modules/nf-core/modules/custom/dumpsoftwareversions/meta.yml rename to modules/nf-core/custom/dumpsoftwareversions/meta.yml index 8d4a6ed4..5f15a5fd 100644 --- a/modules/nf-core/modules/custom/dumpsoftwareversions/meta.yml +++ b/modules/nf-core/custom/dumpsoftwareversions/meta.yml @@ -1,20 +1,21 @@ +# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/modules/meta-schema.json name: custom_dumpsoftwareversions description: Custom module used to dump software versions within the nf-core pipeline template keywords: - custom + - dump - version tools: - custom: description: Custom module used to dump software versions within the nf-core pipeline template homepage: https://github.com/nf-core/tools documentation: https://github.com/nf-core/tools - + licence: ["MIT"] input: - versions: type: file description: YML file containing software versions pattern: "*.yml" - output: - yml: type: file @@ -28,6 +29,9 @@ output: type: file description: File containing software versions pattern: "versions.yml" - authors: - "@drpatelh" + - "@grst" +maintainers: + - "@drpatelh" + - "@grst" diff --git a/modules/nf-core/custom/dumpsoftwareversions/templates/dumpsoftwareversions.py b/modules/nf-core/custom/dumpsoftwareversions/templates/dumpsoftwareversions.py new file mode 100755 index 00000000..da033408 --- /dev/null +++ b/modules/nf-core/custom/dumpsoftwareversions/templates/dumpsoftwareversions.py @@ -0,0 +1,101 @@ +#!/usr/bin/env python + + +"""Provide functions to merge multiple versions.yml files.""" + + +import yaml +import platform +from textwrap import dedent + + +def _make_versions_html(versions): + """Generate a tabular HTML output of all versions for MultiQC.""" + html = [ + dedent( + """\\ + + + + + + + + + + """ + ) + ] + for process, tmp_versions in sorted(versions.items()): + html.append("") + for i, (tool, version) in enumerate(sorted(tmp_versions.items())): + html.append( + dedent( + f"""\\ + + + + + + """ + ) + ) + html.append("") + html.append("
    ") + return "\\n".join(html) + + +def main(): + """Load all version files and generate merged output.""" + versions_this_module = {} + versions_this_module["${task.process}"] = { + "python": platform.python_version(), + "yaml": yaml.__version__, + } + + with open("$versions") as f: + versions_by_process = yaml.load(f, Loader=yaml.BaseLoader) | versions_this_module + + # aggregate versions by the module name (derived from fully-qualified process name) + versions_by_module = {} + for process, process_versions in versions_by_process.items(): + module = process.split(":")[-1] + try: + if versions_by_module[module] != process_versions: + raise AssertionError( + "We assume that software versions are the same between all modules. " + "If you see this error-message it means you discovered an edge-case " + "and should open an issue in nf-core/tools. " + ) + except KeyError: + versions_by_module[module] = process_versions + + versions_by_module["Workflow"] = { + "Nextflow": "$workflow.nextflow.version", + "$workflow.manifest.name": "$workflow.manifest.version", + } + + versions_mqc = { + "id": "software_versions", + "section_name": "${workflow.manifest.name} Software Versions", + "section_href": "https://github.com/${workflow.manifest.name}", + "plot_type": "html", + "description": "are collected at run time from the software output.", + "data": _make_versions_html(versions_by_module), + } + + with open("software_versions.yml", "w") as f: + yaml.dump(versions_by_module, f, default_flow_style=False) + with open("software_versions_mqc.yml", "w") as f: + yaml.dump(versions_mqc, f, default_flow_style=False) + + with open("versions.yml", "w") as f: + yaml.dump(versions_this_module, f, default_flow_style=False) + + +if __name__ == "__main__": + main() diff --git a/modules/nf-core/custom/dumpsoftwareversions/tests/main.nf.test b/modules/nf-core/custom/dumpsoftwareversions/tests/main.nf.test new file mode 100644 index 00000000..b1e1630b --- /dev/null +++ b/modules/nf-core/custom/dumpsoftwareversions/tests/main.nf.test @@ -0,0 +1,43 @@ +nextflow_process { + + name "Test Process CUSTOM_DUMPSOFTWAREVERSIONS" + script "../main.nf" + process "CUSTOM_DUMPSOFTWAREVERSIONS" + tag "modules" + tag "modules_nfcore" + tag "custom" + tag "dumpsoftwareversions" + tag "custom/dumpsoftwareversions" + + test("Should run without failures") { + when { + process { + """ + def tool1_version = ''' + TOOL1: + tool1: 0.11.9 + '''.stripIndent() + + def tool2_version = ''' + TOOL2: + tool2: 1.9 + '''.stripIndent() + + input[0] = Channel.of(tool1_version, tool2_version).collectFile() + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot( + process.out.versions, + file(process.out.mqc_yml[0]).readLines()[0..10], + file(process.out.yml[0]).readLines()[0..7] + ).match() + } + ) + } + } +} diff --git a/modules/nf-core/custom/dumpsoftwareversions/tests/main.nf.test.snap b/modules/nf-core/custom/dumpsoftwareversions/tests/main.nf.test.snap new file mode 100644 index 00000000..5f59a936 --- /dev/null +++ b/modules/nf-core/custom/dumpsoftwareversions/tests/main.nf.test.snap @@ -0,0 +1,33 @@ +{ + "Should run without failures": { + "content": [ + [ + "versions.yml:md5,76d454d92244589d32455833f7c1ba6d" + ], + [ + "data: \"\\n\\n \\n \\n \\n \\n \\n \\n \\n\\", + " \\n\\n\\n \\n \\n\\", + " \\ \\n\\n\\n\\n \\n \\", + " \\ \\n \\n\\n\\n\\n\\", + " \\n\\n \\n \\n\\", + " \\ \\n\\n\\n\\n\\n\\n \\n\\", + " \\ \\n \\n\\n\\n\\n\\", + " \\n\\n \\n \\n\\" + ], + [ + "CUSTOM_DUMPSOFTWAREVERSIONS:", + " python: 
3.11.7", + " yaml: 5.4.1", + "TOOL1:", + " tool1: 0.11.9", + "TOOL2:", + " tool2: '1.9'", + "Workflow:" + ] + ], + "timestamp": "2024-01-09T23:01:18.710682" + } +} \ No newline at end of file diff --git a/modules/nf-core/custom/dumpsoftwareversions/tests/tags.yml b/modules/nf-core/custom/dumpsoftwareversions/tests/tags.yml new file mode 100644 index 00000000..405aa24a --- /dev/null +++ b/modules/nf-core/custom/dumpsoftwareversions/tests/tags.yml @@ -0,0 +1,2 @@ +custom/dumpsoftwareversions: + - modules/nf-core/custom/dumpsoftwareversions/** diff --git a/modules/nf-core/fastqc/environment.yml b/modules/nf-core/fastqc/environment.yml new file mode 100644 index 00000000..1787b38a --- /dev/null +++ b/modules/nf-core/fastqc/environment.yml @@ -0,0 +1,7 @@ +name: fastqc +channels: + - conda-forge + - bioconda + - defaults +dependencies: + - bioconda::fastqc=0.12.1 diff --git a/modules/nf-core/fastqc/main.nf b/modules/nf-core/fastqc/main.nf new file mode 100644 index 00000000..9e19a74c --- /dev/null +++ b/modules/nf-core/fastqc/main.nf @@ -0,0 +1,55 @@ +process FASTQC { + tag "$meta.id" + label 'process_medium' + + conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/fastqc:0.12.1--hdfd78af_0' : + 'biocontainers/fastqc:0.12.1--hdfd78af_0' }" + + input: + tuple val(meta), path(reads) + + output: + tuple val(meta), path("*.html"), emit: html + tuple val(meta), path("*.zip") , emit: zip + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + // Make list of old name and new name pairs to use for renaming in the bash while loop + def old_new_pairs = reads instanceof Path || reads.size() == 1 ? [[ reads, "${prefix}.${reads.extension}" ]] : reads.withIndex().collect { entry, index -> [ entry, "${prefix}_${index + 1}.${entry.extension}" ] } + def rename_to = old_new_pairs*.join(' ').join(' ') + def renamed_files = old_new_pairs.collect{ old_name, new_name -> new_name }.join(' ') + """ + printf "%s %s\\n" $rename_to | while read old_name new_name; do + [ -f "\${new_name}" ] || ln -s \$old_name \$new_name + done + + fastqc \\ + $args \\ + --threads $task.cpus \\ + $renamed_files + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + fastqc: \$( fastqc --version | sed '/FastQC v/!d; s/.*v//' ) + END_VERSIONS + """ + + stub: + def prefix = task.ext.prefix ?: "${meta.id}" + """ + touch ${prefix}.html + touch ${prefix}.zip + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + fastqc: \$( fastqc --version | sed '/FastQC v/!d; s/.*v//' ) + END_VERSIONS + """ +} diff --git a/modules/nf-core/fastqc/meta.yml b/modules/nf-core/fastqc/meta.yml new file mode 100644 index 00000000..ee5507e0 --- /dev/null +++ b/modules/nf-core/fastqc/meta.yml @@ -0,0 +1,57 @@ +name: fastqc +description: Run FastQC on sequenced reads +keywords: + - quality control + - qc + - adapters + - fastq +tools: + - fastqc: + description: | + FastQC gives general quality metrics about your reads. + It provides information about the quality score distribution + across your reads, the per base sequence content (%A/C/G/T). + You get information about adapter contamination and other + overrepresented sequences. 
+ homepage: https://www.bioinformatics.babraham.ac.uk/projects/fastqc/ + documentation: https://www.bioinformatics.babraham.ac.uk/projects/fastqc/Help/ + licence: ["GPL-2.0-only"] +input: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - reads: + type: file + description: | + List of input FastQ files of size 1 and 2 for single-end and paired-end data, + respectively. +output: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - html: + type: file + description: FastQC report + pattern: "*_{fastqc.html}" + - zip: + type: file + description: FastQC report archive + pattern: "*_{fastqc.zip}" + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" +authors: + - "@drpatelh" + - "@grst" + - "@ewels" + - "@FelixKrueger" +maintainers: + - "@drpatelh" + - "@grst" + - "@ewels" + - "@FelixKrueger" diff --git a/modules/nf-core/fastqc/tests/main.nf.test b/modules/nf-core/fastqc/tests/main.nf.test new file mode 100644 index 00000000..70edae4d --- /dev/null +++ b/modules/nf-core/fastqc/tests/main.nf.test @@ -0,0 +1,212 @@ +nextflow_process { + + name "Test Process FASTQC" + script "../main.nf" + process "FASTQC" + + tag "modules" + tag "modules_nfcore" + tag "fastqc" + + test("sarscov2 single-end [fastq]") { + + when { + process { + """ + input[0] = Channel.of([ + [ id: 'test', single_end:true ], + [ file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/fastq/test_1.fastq.gz', checkIfExists: true) ] + ]) + """ + } + } + + then { + assertAll ( + { assert process.success }, + + // NOTE The report contains the date inside it, which means that the md5sum is stable per day, but not longer than that. So you can't md5sum it. + // looks like this:
<div id="header_filename">Mon 2 Oct 2023<br/>test.gz</div>
    + // https://github.com/nf-core/modules/pull/3903#issuecomment-1743620039 + + { assert process.out.html[0][1] ==~ ".*/test_fastqc.html" }, + { assert process.out.zip[0][1] ==~ ".*/test_fastqc.zip" }, + { assert path(process.out.html[0][1]).text.contains("") }, + + { assert snapshot(process.out.versions).match("fastqc_versions_single") } + ) + } + } + + test("sarscov2 paired-end [fastq]") { + + when { + process { + """ + input[0] = Channel.of([ + [id: 'test', single_end: false], // meta map + [ file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/fastq/test_1.fastq.gz', checkIfExists: true), + file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/fastq/test_2.fastq.gz', checkIfExists: true) ] + ]) + """ + } + } + + then { + assertAll ( + { assert process.success }, + + { assert process.out.html[0][1][0] ==~ ".*/test_1_fastqc.html" }, + { assert process.out.html[0][1][1] ==~ ".*/test_2_fastqc.html" }, + { assert process.out.zip[0][1][0] ==~ ".*/test_1_fastqc.zip" }, + { assert process.out.zip[0][1][1] ==~ ".*/test_2_fastqc.zip" }, + { assert path(process.out.html[0][1][0]).text.contains("") }, + { assert path(process.out.html[0][1][1]).text.contains("") }, + + { assert snapshot(process.out.versions).match("fastqc_versions_paired") } + ) + } + } + + test("sarscov2 interleaved [fastq]") { + + when { + process { + """ + input[0] = Channel.of([ + [id: 'test', single_end: false], // meta map + file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/fastq/test_interleaved.fastq.gz', checkIfExists: true) + ]) + """ + } + } + + then { + assertAll ( + { assert process.success }, + + { assert process.out.html[0][1] ==~ ".*/test_fastqc.html" }, + { assert process.out.zip[0][1] ==~ ".*/test_fastqc.zip" }, + { assert path(process.out.html[0][1]).text.contains("") }, + + { assert snapshot(process.out.versions).match("fastqc_versions_interleaved") } + ) + } + } + + test("sarscov2 paired-end [bam]") { + + when { + process { + """ + input[0] = Channel.of([ + [id: 'test', single_end: false], // meta map + file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/bam/test.paired_end.sorted.bam', checkIfExists: true) + ]) + """ + } + } + + then { + assertAll ( + { assert process.success }, + + { assert process.out.html[0][1] ==~ ".*/test_fastqc.html" }, + { assert process.out.zip[0][1] ==~ ".*/test_fastqc.zip" }, + { assert path(process.out.html[0][1]).text.contains("") }, + + { assert snapshot(process.out.versions).match("fastqc_versions_bam") } + ) + } + } + + test("sarscov2 multiple [fastq]") { + + when { + process { + """ + input[0] = Channel.of([ + [id: 'test', single_end: false], // meta map + [ file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/fastq/test_1.fastq.gz', checkIfExists: true), + file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/fastq/test_2.fastq.gz', checkIfExists: true), + file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/fastq/test2_1.fastq.gz', checkIfExists: true), + file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/fastq/test2_2.fastq.gz', checkIfExists: true) ] + ]) + """ + } + } + + then { + assertAll ( + { assert process.success }, + + { assert process.out.html[0][1][0] ==~ ".*/test_1_fastqc.html" }, + { assert process.out.html[0][1][1] ==~ ".*/test_2_fastqc.html" }, + { assert process.out.html[0][1][2] ==~ ".*/test_3_fastqc.html" }, + { assert process.out.html[0][1][3] ==~ ".*/test_4_fastqc.html" }, + { assert process.out.zip[0][1][0] ==~ 
".*/test_1_fastqc.zip" }, + { assert process.out.zip[0][1][1] ==~ ".*/test_2_fastqc.zip" }, + { assert process.out.zip[0][1][2] ==~ ".*/test_3_fastqc.zip" }, + { assert process.out.zip[0][1][3] ==~ ".*/test_4_fastqc.zip" }, + { assert path(process.out.html[0][1][0]).text.contains("") }, + { assert path(process.out.html[0][1][1]).text.contains("") }, + { assert path(process.out.html[0][1][2]).text.contains("") }, + { assert path(process.out.html[0][1][3]).text.contains("") }, + + { assert snapshot(process.out.versions).match("fastqc_versions_multiple") } + ) + } + } + + test("sarscov2 custom_prefix") { + + when { + process { + """ + input[0] = Channel.of([ + [ id:'mysample', single_end:true ], // meta map + file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/fastq/test_1.fastq.gz', checkIfExists: true) + ]) + """ + } + } + + then { + assertAll ( + { assert process.success }, + + { assert process.out.html[0][1] ==~ ".*/mysample_fastqc.html" }, + { assert process.out.zip[0][1] ==~ ".*/mysample_fastqc.zip" }, + { assert path(process.out.html[0][1]).text.contains("") }, + + { assert snapshot(process.out.versions).match("fastqc_versions_custom_prefix") } + ) + } + } + + test("sarscov2 single-end [fastq] - stub") { + + options "-stub" + + when { + process { + """ + input[0] = Channel.of([ + [ id: 'test', single_end:true ], + [ file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/fastq/test_1.fastq.gz', checkIfExists: true) ] + ]) + """ + } + } + + then { + assertAll ( + { assert process.success }, + { assert snapshot(process.out.html.collect { file(it[1]).getName() } + + process.out.zip.collect { file(it[1]).getName() } + + process.out.versions ).match("fastqc_stub") } + ) + } + } + +} diff --git a/modules/nf-core/fastqc/tests/main.nf.test.snap b/modules/nf-core/fastqc/tests/main.nf.test.snap new file mode 100644 index 00000000..86f7c311 --- /dev/null +++ b/modules/nf-core/fastqc/tests/main.nf.test.snap @@ -0,0 +1,88 @@ +{ + "fastqc_versions_interleaved": { + "content": [ + [ + "versions.yml:md5,e1cc25ca8af856014824abd842e93978" + ] + ], + "meta": { + "nf-test": "0.8.4", + "nextflow": "23.10.1" + }, + "timestamp": "2024-01-31T17:40:07.293713" + }, + "fastqc_stub": { + "content": [ + [ + "test.html", + "test.zip", + "versions.yml:md5,e1cc25ca8af856014824abd842e93978" + ] + ], + "meta": { + "nf-test": "0.8.4", + "nextflow": "23.10.1" + }, + "timestamp": "2024-01-31T17:31:01.425198" + }, + "fastqc_versions_multiple": { + "content": [ + [ + "versions.yml:md5,e1cc25ca8af856014824abd842e93978" + ] + ], + "meta": { + "nf-test": "0.8.4", + "nextflow": "23.10.1" + }, + "timestamp": "2024-01-31T17:40:55.797907" + }, + "fastqc_versions_bam": { + "content": [ + [ + "versions.yml:md5,e1cc25ca8af856014824abd842e93978" + ] + ], + "meta": { + "nf-test": "0.8.4", + "nextflow": "23.10.1" + }, + "timestamp": "2024-01-31T17:40:26.795862" + }, + "fastqc_versions_single": { + "content": [ + [ + "versions.yml:md5,e1cc25ca8af856014824abd842e93978" + ] + ], + "meta": { + "nf-test": "0.8.4", + "nextflow": "23.10.1" + }, + "timestamp": "2024-01-31T17:39:27.043675" + }, + "fastqc_versions_paired": { + "content": [ + [ + "versions.yml:md5,e1cc25ca8af856014824abd842e93978" + ] + ], + "meta": { + "nf-test": "0.8.4", + "nextflow": "23.10.1" + }, + "timestamp": "2024-01-31T17:39:47.584191" + }, + "fastqc_versions_custom_prefix": { + "content": [ + [ + "versions.yml:md5,e1cc25ca8af856014824abd842e93978" + ] + ], + "meta": { + "nf-test": "0.8.4", + "nextflow": "23.10.1" + }, + "timestamp": 
"2024-01-31T17:41:14.576531" + } +} \ No newline at end of file diff --git a/modules/nf-core/fastqc/tests/tags.yml b/modules/nf-core/fastqc/tests/tags.yml new file mode 100644 index 00000000..7834294b --- /dev/null +++ b/modules/nf-core/fastqc/tests/tags.yml @@ -0,0 +1,2 @@ +fastqc: + - modules/nf-core/fastqc/** diff --git a/modules/nf-core/hmmer/hmmsearch/environment.yml b/modules/nf-core/hmmer/hmmsearch/environment.yml new file mode 100644 index 00000000..89978984 --- /dev/null +++ b/modules/nf-core/hmmer/hmmsearch/environment.yml @@ -0,0 +1,7 @@ +name: hmmer_hmmsearch +channels: + - conda-forge + - bioconda + - defaults +dependencies: + - bioconda::hmmer=3.3.2 diff --git a/modules/nf-core/hmmer/hmmsearch/main.nf b/modules/nf-core/hmmer/hmmsearch/main.nf new file mode 100644 index 00000000..ad3117c8 --- /dev/null +++ b/modules/nf-core/hmmer/hmmsearch/main.nf @@ -0,0 +1,51 @@ +process HMMER_HMMSEARCH { + tag "$meta.id" + label 'process_medium' + + conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/hmmer:3.3.2--h1b792b2_1' : + 'biocontainers/hmmer:3.3.2--h1b792b2_1' }" + + input: + tuple val(meta), path(hmmfile), path(seqdb), val(write_align), val(write_target), val(write_domain) + + output: + tuple val(meta), path('*.txt.gz') , emit: output + tuple val(meta), path('*.sto.gz') , emit: alignments , optional: true + tuple val(meta), path('*.tbl.gz') , emit: target_summary, optional: true + tuple val(meta), path('*.domtbl.gz'), emit: domain_summary, optional: true + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + output = "${prefix}.txt" + alignment = write_align ? "-A ${prefix}.sto" : '' + target_summary = write_target ? "--tblout ${prefix}.tbl" : '' + domain_summary = write_domain ? "--domtblout ${prefix}.domtbl" : '' + """ + hmmsearch \\ + $args \\ + --cpu $task.cpus \\ + -o $output \\ + $alignment \\ + $target_summary \\ + $domain_summary \\ + $hmmfile \\ + $seqdb + + gzip --no-name *.txt \\ + ${write_align ? '*.sto' : ''} \\ + ${write_target ? '*.tbl' : ''} \\ + ${write_domain ? '*.domtbl' : ''} + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + hmmer: \$(hmmsearch -h | grep -o '^# HMMER [0-9.]*' | sed 's/^# HMMER *//') + END_VERSIONS + """ +} diff --git a/modules/nf-core/hmmer/hmmsearch/meta.yml b/modules/nf-core/hmmer/hmmsearch/meta.yml new file mode 100644 index 00000000..a0c39761 --- /dev/null +++ b/modules/nf-core/hmmer/hmmsearch/meta.yml @@ -0,0 +1,68 @@ +name: hmmer_hmmsearch +description: search profile(s) against a sequence database +keywords: + - hidden Markov model + - HMM + - hmmer + - hmmsearch +tools: + - hmmer: + description: Biosequence analysis using profile hidden Markov models + homepage: http://hmmer.org/ + documentation: http://hmmer.org/documentation.html + tool_dev_url: https://github.com/EddyRivasLab/hmmer + doi: "10.1371/journal.pcbi.1002195" + licence: ["BSD"] +input: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. 
[ id:'test', single_end:false ] + - hmmfile: + type: file + description: One or more HMM profiles created with hmmbuild + pattern: "*.{hmm,hmm.gz}" + - seqdb: + type: file + description: Database of sequences in FASTA format + pattern: "*.{fasta,fna,faa,fa,fasta.gz,fna.gz,faa.gz,fa.gz}" + - write_align: + type: val + description: Flag to write optional alignment output. Specify with 'true' to output + - write_target: + type: val + description: Flag to write optional per target summary . Specify with 'true' to output + - write_domain: + type: val + description: Flag to write optional per domain summary. Specify with 'true' to output +output: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" + - output: + type: file + description: Human readable output summarizing hmmsearch results + pattern: "*.{txt.gz}" + - alignments: + type: file + description: Optional multiple sequence alignment (MSA) in Stockholm format + pattern: "*.{sto.gz}" + - target_summary: + type: file + description: Optional tabular (space-delimited) summary of per-target output + pattern: "*.{tbl.gz}" + - domain_summary: + type: file + description: Optional tabular (space-delimited) summary of per-domain output + pattern: "*.{domtbl.gz}" +authors: + - "@Midnighter" +maintainers: + - "@Midnighter" diff --git a/modules/nf-core/modules/custom/dumpsoftwareversions/functions.nf b/modules/nf-core/modules/custom/dumpsoftwareversions/functions.nf deleted file mode 100644 index 85628ee0..00000000 --- a/modules/nf-core/modules/custom/dumpsoftwareversions/functions.nf +++ /dev/null @@ -1,78 +0,0 @@ -// -// Utility functions used in nf-core DSL2 module files -// - -// -// Extract name of software tool from process name using $task.process -// -def getSoftwareName(task_process) { - return task_process.tokenize(':')[-1].tokenize('_')[0].toLowerCase() -} - -// -// Extract name of module from process name using $task.process -// -def getProcessName(task_process) { - return task_process.tokenize(':')[-1] -} - -// -// Function to initialise default values and to generate a Groovy Map of available options for nf-core modules -// -def initOptions(Map args) { - def Map options = [:] - options.args = args.args ?: '' - options.args2 = args.args2 ?: '' - options.args3 = args.args3 ?: '' - options.publish_by_meta = args.publish_by_meta ?: [] - options.publish_dir = args.publish_dir ?: '' - options.publish_files = args.publish_files - options.suffix = args.suffix ?: '' - return options -} - -// -// Tidy up and join elements of a list to return a path string -// -def getPathFromList(path_list) { - def paths = path_list.findAll { item -> !item?.trim().isEmpty() } // Remove empty entries - paths = paths.collect { it.trim().replaceAll("^[/]+|[/]+\$", "") } // Trim whitespace and trailing slashes - return paths.join('/') -} - -// -// Function to save/publish module results -// -def saveFiles(Map args) { - def ioptions = initOptions(args.options) - def path_list = [ ioptions.publish_dir ?: args.publish_dir ] - - // Do not publish versions.yml unless running from pytest workflow - if (args.filename.equals('versions.yml') && !System.getenv("NF_CORE_MODULES_TEST")) { - return null - } - if (ioptions.publish_by_meta) { - def key_list = ioptions.publish_by_meta instanceof List ? 
ioptions.publish_by_meta : args.publish_by_meta - for (key in key_list) { - if (args.meta && key instanceof String) { - def path = key - if (args.meta.containsKey(key)) { - path = args.meta[key] instanceof Boolean ? "${key}_${args.meta[key]}".toString() : args.meta[key] - } - path = path instanceof String ? path : '' - path_list.add(path) - } - } - } - if (ioptions.publish_files instanceof Map) { - for (ext in ioptions.publish_files) { - if (args.filename.endsWith(ext.key)) { - def ext_list = path_list.collect() - ext_list.add(ext.value) - return "${getPathFromList(ext_list)}/$args.filename" - } - } - } else if (ioptions.publish_files == null) { - return "${getPathFromList(path_list)}/$args.filename" - } -} diff --git a/modules/nf-core/modules/custom/dumpsoftwareversions/main.nf b/modules/nf-core/modules/custom/dumpsoftwareversions/main.nf deleted file mode 100644 index faf2073f..00000000 --- a/modules/nf-core/modules/custom/dumpsoftwareversions/main.nf +++ /dev/null @@ -1,106 +0,0 @@ -// Import generic module functions -include { initOptions; saveFiles; getSoftwareName; getProcessName } from './functions' - -params.options = [:] -options = initOptions(params.options) - -process CUSTOM_DUMPSOFTWAREVERSIONS { - label 'process_low' - publishDir "${params.outdir}", - mode: params.publish_dir_mode, - saveAs: { filename -> saveFiles(filename:filename, options:params.options, publish_dir:'pipeline_info', meta:[:], publish_by_meta:[]) } - - // Requires `pyyaml` which does not have a dedicated container but is in the MultiQC container - conda (params.enable_conda ? "bioconda::multiqc=1.11" : null) - if (workflow.containerEngine == 'singularity' && !params.singularity_pull_docker_container) { - container "https://depot.galaxyproject.org/singularity/multiqc:1.11--pyhdfd78af_0" - } else { - container "quay.io/biocontainers/multiqc:1.11--pyhdfd78af_0" - } - - input: - path versions - - output: - path "software_versions.yml" , emit: yml - path "software_versions_mqc.yml", emit: mqc_yml - path "versions.yml" , emit: versions - - script: - """ - #!/usr/bin/env python - - import yaml - import platform - from textwrap import dedent - - def _make_versions_html(versions): - html = [ - dedent( - '''\\ - -
-                <style>
-                #nf-core-versions tbody:nth-child(even) {
-                    background-color: #f2f2f2;
-                }
-                </style>
-                <table class="table" style="width:100%" id="nf-core-versions">
-                    <thead>
-                        <tr>
-                            <th> Process Name </th>
-                            <th> Software </th>
-                            <th> Version  </th>
-                        </tr>
-                    </thead>
-                '''
-            )
-        ]
-        for process, tmp_versions in sorted(versions.items()):
-            html.append("<tbody>")
-            for i, (tool, version) in enumerate(sorted(tmp_versions.items())):
-                html.append(
-                    dedent(
-                        f'''\\
-                        <tr>
-                            <td><samp>{process if (i == 0) else ''}</samp></td>
-                            <td><samp>{tool}</samp></td>
-                            <td><samp>{version}</samp></td>
-                        </tr>
-                        '''
-                    )
-                )
-            html.append("</tbody>")
-        html.append("</table>
    ") - return "\\n".join(html) - - module_versions = {} - module_versions["${getProcessName(task.process)}"] = { - 'python': platform.python_version(), - 'yaml': yaml.__version__ - } - - with open("$versions") as f: - workflow_versions = yaml.load(f, Loader=yaml.BaseLoader) | module_versions - - workflow_versions["Workflow"] = { - "Nextflow": "$workflow.nextflow.version", - "$workflow.manifest.name": "$workflow.manifest.version" - } - - versions_mqc = { - 'id': 'software_versions', - 'section_name': '${workflow.manifest.name} Software Versions', - 'section_href': 'https://github.com/${workflow.manifest.name}', - 'plot_type': 'html', - 'description': 'are collected at run time from the software output.', - 'data': _make_versions_html(workflow_versions) - } - - with open("software_versions.yml", 'w') as f: - yaml.dump(workflow_versions, f, default_flow_style=False) - with open("software_versions_mqc.yml", 'w') as f: - yaml.dump(versions_mqc, f, default_flow_style=False) - - with open('versions.yml', 'w') as f: - yaml.dump(module_versions, f, default_flow_style=False) - """ -} diff --git a/modules/nf-core/modules/fastqc/functions.nf b/modules/nf-core/modules/fastqc/functions.nf deleted file mode 100644 index 85628ee0..00000000 --- a/modules/nf-core/modules/fastqc/functions.nf +++ /dev/null @@ -1,78 +0,0 @@ -// -// Utility functions used in nf-core DSL2 module files -// - -// -// Extract name of software tool from process name using $task.process -// -def getSoftwareName(task_process) { - return task_process.tokenize(':')[-1].tokenize('_')[0].toLowerCase() -} - -// -// Extract name of module from process name using $task.process -// -def getProcessName(task_process) { - return task_process.tokenize(':')[-1] -} - -// -// Function to initialise default values and to generate a Groovy Map of available options for nf-core modules -// -def initOptions(Map args) { - def Map options = [:] - options.args = args.args ?: '' - options.args2 = args.args2 ?: '' - options.args3 = args.args3 ?: '' - options.publish_by_meta = args.publish_by_meta ?: [] - options.publish_dir = args.publish_dir ?: '' - options.publish_files = args.publish_files - options.suffix = args.suffix ?: '' - return options -} - -// -// Tidy up and join elements of a list to return a path string -// -def getPathFromList(path_list) { - def paths = path_list.findAll { item -> !item?.trim().isEmpty() } // Remove empty entries - paths = paths.collect { it.trim().replaceAll("^[/]+|[/]+\$", "") } // Trim whitespace and trailing slashes - return paths.join('/') -} - -// -// Function to save/publish module results -// -def saveFiles(Map args) { - def ioptions = initOptions(args.options) - def path_list = [ ioptions.publish_dir ?: args.publish_dir ] - - // Do not publish versions.yml unless running from pytest workflow - if (args.filename.equals('versions.yml') && !System.getenv("NF_CORE_MODULES_TEST")) { - return null - } - if (ioptions.publish_by_meta) { - def key_list = ioptions.publish_by_meta instanceof List ? ioptions.publish_by_meta : args.publish_by_meta - for (key in key_list) { - if (args.meta && key instanceof String) { - def path = key - if (args.meta.containsKey(key)) { - path = args.meta[key] instanceof Boolean ? "${key}_${args.meta[key]}".toString() : args.meta[key] - } - path = path instanceof String ? 
path : '' - path_list.add(path) - } - } - } - if (ioptions.publish_files instanceof Map) { - for (ext in ioptions.publish_files) { - if (args.filename.endsWith(ext.key)) { - def ext_list = path_list.collect() - ext_list.add(ext.value) - return "${getPathFromList(ext_list)}/$args.filename" - } - } - } else if (ioptions.publish_files == null) { - return "${getPathFromList(path_list)}/$args.filename" - } -} diff --git a/modules/nf-core/modules/fastqc/main.nf b/modules/nf-core/modules/fastqc/main.nf deleted file mode 100644 index 9f6cfc55..00000000 --- a/modules/nf-core/modules/fastqc/main.nf +++ /dev/null @@ -1,54 +0,0 @@ -// Import generic module functions -include { initOptions; saveFiles; getSoftwareName; getProcessName } from './functions' - -params.options = [:] -options = initOptions(params.options) - -process FASTQC { - tag "$meta.id" - label 'process_medium' - publishDir "${params.outdir}", - mode: params.publish_dir_mode, - saveAs: { filename -> saveFiles(filename:filename, options:params.options, publish_dir:getSoftwareName(task.process), meta:meta, publish_by_meta:['id']) } - - conda (params.enable_conda ? "bioconda::fastqc=0.11.9" : null) - if (workflow.containerEngine == 'singularity' && !params.singularity_pull_docker_container) { - container "https://depot.galaxyproject.org/singularity/fastqc:0.11.9--0" - } else { - container "quay.io/biocontainers/fastqc:0.11.9--0" - } - - input: - tuple val(meta), path(reads) - - output: - tuple val(meta), path("*.html"), emit: html - tuple val(meta), path("*.zip") , emit: zip - path "versions.yml" , emit: versions - - script: - // Add soft-links to original FastQs for consistent naming in pipeline - def prefix = options.suffix ? "${meta.id}${options.suffix}" : "${meta.id}" - if (meta.single_end) { - """ - [ ! -f ${prefix}.fastq.gz ] && ln -s $reads ${prefix}.fastq.gz - fastqc $options.args --threads $task.cpus ${prefix}.fastq.gz - - cat <<-END_VERSIONS > versions.yml - ${getProcessName(task.process)}: - ${getSoftwareName(task.process)}: \$( fastqc --version | sed -e "s/FastQC v//g" ) - END_VERSIONS - """ - } else { - """ - [ ! -f ${prefix}_1.fastq.gz ] && ln -s ${reads[0]} ${prefix}_1.fastq.gz - [ ! -f ${prefix}_2.fastq.gz ] && ln -s ${reads[1]} ${prefix}_2.fastq.gz - fastqc $options.args --threads $task.cpus ${prefix}_1.fastq.gz ${prefix}_2.fastq.gz - - cat <<-END_VERSIONS > versions.yml - ${getProcessName(task.process)}: - ${getSoftwareName(task.process)}: \$( fastqc --version | sed -e "s/FastQC v//g" ) - END_VERSIONS - """ - } -} diff --git a/modules/nf-core/modules/fastqc/meta.yml b/modules/nf-core/modules/fastqc/meta.yml deleted file mode 100644 index 48031356..00000000 --- a/modules/nf-core/modules/fastqc/meta.yml +++ /dev/null @@ -1,51 +0,0 @@ -name: fastqc -description: Run FastQC on sequenced reads -keywords: - - quality control - - qc - - adapters - - fastq -tools: - - fastqc: - description: | - FastQC gives general quality metrics about your reads. - It provides information about the quality score distribution - across your reads, the per base sequence content (%A/C/G/T). - You get information about adapter contamination and other - overrepresented sequences. - homepage: https://www.bioinformatics.babraham.ac.uk/projects/fastqc/ - documentation: https://www.bioinformatics.babraham.ac.uk/projects/fastqc/Help/ -input: - - meta: - type: map - description: | - Groovy Map containing sample information - e.g. 
[ id:'test', single_end:false ] - - reads: - type: file - description: | - List of input FastQ files of size 1 and 2 for single-end and paired-end data, - respectively. -output: - - meta: - type: map - description: | - Groovy Map containing sample information - e.g. [ id:'test', single_end:false ] - - html: - type: file - description: FastQC report - pattern: "*_{fastqc.html}" - - zip: - type: file - description: FastQC report archive - pattern: "*_{fastqc.zip}" - - version: - type: file - description: File containing software version - pattern: "versions.yml" -authors: - - "@drpatelh" - - "@grst" - - "@ewels" - - "@FelixKrueger" diff --git a/modules/nf-core/modules/multiqc/functions.nf b/modules/nf-core/modules/multiqc/functions.nf deleted file mode 100644 index 85628ee0..00000000 --- a/modules/nf-core/modules/multiqc/functions.nf +++ /dev/null @@ -1,78 +0,0 @@ -// -// Utility functions used in nf-core DSL2 module files -// - -// -// Extract name of software tool from process name using $task.process -// -def getSoftwareName(task_process) { - return task_process.tokenize(':')[-1].tokenize('_')[0].toLowerCase() -} - -// -// Extract name of module from process name using $task.process -// -def getProcessName(task_process) { - return task_process.tokenize(':')[-1] -} - -// -// Function to initialise default values and to generate a Groovy Map of available options for nf-core modules -// -def initOptions(Map args) { - def Map options = [:] - options.args = args.args ?: '' - options.args2 = args.args2 ?: '' - options.args3 = args.args3 ?: '' - options.publish_by_meta = args.publish_by_meta ?: [] - options.publish_dir = args.publish_dir ?: '' - options.publish_files = args.publish_files - options.suffix = args.suffix ?: '' - return options -} - -// -// Tidy up and join elements of a list to return a path string -// -def getPathFromList(path_list) { - def paths = path_list.findAll { item -> !item?.trim().isEmpty() } // Remove empty entries - paths = paths.collect { it.trim().replaceAll("^[/]+|[/]+\$", "") } // Trim whitespace and trailing slashes - return paths.join('/') -} - -// -// Function to save/publish module results -// -def saveFiles(Map args) { - def ioptions = initOptions(args.options) - def path_list = [ ioptions.publish_dir ?: args.publish_dir ] - - // Do not publish versions.yml unless running from pytest workflow - if (args.filename.equals('versions.yml') && !System.getenv("NF_CORE_MODULES_TEST")) { - return null - } - if (ioptions.publish_by_meta) { - def key_list = ioptions.publish_by_meta instanceof List ? ioptions.publish_by_meta : args.publish_by_meta - for (key in key_list) { - if (args.meta && key instanceof String) { - def path = key - if (args.meta.containsKey(key)) { - path = args.meta[key] instanceof Boolean ? "${key}_${args.meta[key]}".toString() : args.meta[key] - } - path = path instanceof String ? 
path : '' - path_list.add(path) - } - } - } - if (ioptions.publish_files instanceof Map) { - for (ext in ioptions.publish_files) { - if (args.filename.endsWith(ext.key)) { - def ext_list = path_list.collect() - ext_list.add(ext.value) - return "${getPathFromList(ext_list)}/$args.filename" - } - } - } else if (ioptions.publish_files == null) { - return "${getPathFromList(path_list)}/$args.filename" - } -} diff --git a/modules/nf-core/modules/multiqc/main.nf b/modules/nf-core/modules/multiqc/main.nf deleted file mode 100644 index 0861aa59..00000000 --- a/modules/nf-core/modules/multiqc/main.nf +++ /dev/null @@ -1,38 +0,0 @@ -// Import generic module functions -include { initOptions; saveFiles; getSoftwareName; getProcessName } from './functions' - -params.options = [:] -options = initOptions(params.options) - -process MULTIQC { - label 'process_medium' - publishDir "${params.outdir}", - mode: params.publish_dir_mode, - saveAs: { filename -> saveFiles(filename:filename, options:params.options, publish_dir:getSoftwareName(task.process), meta:[:], publish_by_meta:[]) } - - conda (params.enable_conda ? 'bioconda::multiqc=1.11' : null) - if (workflow.containerEngine == 'singularity' && !params.singularity_pull_docker_container) { - container "https://depot.galaxyproject.org/singularity/multiqc:1.11--pyhdfd78af_0" - } else { - container "quay.io/biocontainers/multiqc:1.11--pyhdfd78af_0" - } - - input: - path multiqc_files - - output: - path "*multiqc_report.html", emit: report - path "*_data" , emit: data - path "*_plots" , optional:true, emit: plots - path "versions.yml" , emit: versions - - script: - """ - multiqc -f $options.args . - - cat <<-END_VERSIONS > versions.yml - ${getProcessName(task.process)}: - ${getSoftwareName(task.process)}: \$( multiqc --version | sed -e "s/multiqc, version //g" ) - END_VERSIONS - """ -} diff --git a/modules/nf-core/modules/multiqc/meta.yml b/modules/nf-core/modules/multiqc/meta.yml deleted file mode 100644 index 2d99ec0d..00000000 --- a/modules/nf-core/modules/multiqc/meta.yml +++ /dev/null @@ -1,39 +0,0 @@ -name: MultiQC -description: Aggregate results from bioinformatics analyses across many samples into a single report -keywords: - - QC - - bioinformatics tools - - Beautiful stand-alone HTML report -tools: - - multiqc: - description: | - MultiQC searches a given directory for analysis logs and compiles a HTML report. - It's a general use tool, perfect for summarising the output from numerous bioinformatics tools. 
- homepage: https://multiqc.info/ - documentation: https://multiqc.info/docs/ -input: - - multiqc_files: - type: file - description: | - List of reports / files recognised by MultiQC, for example the html and zip output of FastQC -output: - - report: - type: file - description: MultiQC report file - pattern: "multiqc_report.html" - - data: - type: dir - description: MultiQC data dir - pattern: "multiqc_data" - - plots: - type: file - description: Plots created by MultiQC - pattern: "*_data" - - version: - type: file - description: File containing software version - pattern: "versions.yml" -authors: - - "@abhi18av" - - "@bunop" - - "@drpatelh" diff --git a/modules/nf-core/multiqc/environment.yml b/modules/nf-core/multiqc/environment.yml new file mode 100644 index 00000000..7625b752 --- /dev/null +++ b/modules/nf-core/multiqc/environment.yml @@ -0,0 +1,7 @@ +name: multiqc +channels: + - conda-forge + - bioconda + - defaults +dependencies: + - bioconda::multiqc=1.19 diff --git a/modules/nf-core/multiqc/main.nf b/modules/nf-core/multiqc/main.nf new file mode 100644 index 00000000..1b9f7c43 --- /dev/null +++ b/modules/nf-core/multiqc/main.nf @@ -0,0 +1,55 @@ +process MULTIQC { + label 'process_single' + + conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/multiqc:1.19--pyhdfd78af_0' : + 'biocontainers/multiqc:1.19--pyhdfd78af_0' }" + + input: + path multiqc_files, stageAs: "?/*" + path(multiqc_config) + path(extra_multiqc_config) + path(multiqc_logo) + + output: + path "*multiqc_report.html", emit: report + path "*_data" , emit: data + path "*_plots" , optional:true, emit: plots + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def config = multiqc_config ? "--config $multiqc_config" : '' + def extra_config = extra_multiqc_config ? "--config $extra_multiqc_config" : '' + def logo = multiqc_logo ? /--cl-config 'custom_logo: "${multiqc_logo}"'/ : '' + """ + multiqc \\ + --force \\ + $args \\ + $config \\ + $extra_config \\ + $logo \\ + . + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + multiqc: \$( multiqc --version | sed -e "s/multiqc, version //g" ) + END_VERSIONS + """ + + stub: + """ + mkdir multiqc_data + touch multiqc_plots + touch multiqc_report.html + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + multiqc: \$( multiqc --version | sed -e "s/multiqc, version //g" ) + END_VERSIONS + """ +} diff --git a/modules/nf-core/multiqc/meta.yml b/modules/nf-core/multiqc/meta.yml new file mode 100644 index 00000000..45a9bc35 --- /dev/null +++ b/modules/nf-core/multiqc/meta.yml @@ -0,0 +1,58 @@ +name: multiqc +description: Aggregate results from bioinformatics analyses across many samples into a single report +keywords: + - QC + - bioinformatics tools + - Beautiful stand-alone HTML report +tools: + - multiqc: + description: | + MultiQC searches a given directory for analysis logs and compiles a HTML report. + It's a general use tool, perfect for summarising the output from numerous bioinformatics tools. 
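Usage note: a minimal sketch of how the updated MULTIQC module might be wired from a DSL2 workflow. The include path follows this repository's layout, but the channel names (ch_multiqc_files, ch_multiqc_config), the glob pattern, and the workflow name are illustrative assumptions, not part of this change; passing empty lists for the optional config and logo inputs mirrors the module tests above.

    // Hypothetical wiring; channel names and paths below are placeholders.
    include { MULTIQC } from './modules/nf-core/multiqc/main'

    workflow QC_REPORT {
        main:
        ch_multiqc_files  = Channel.fromPath('results/**/*_fastqc.zip')   // any MultiQC-readable logs
        ch_multiqc_config = Channel.fromPath("$projectDir/assets/multiqc_config.yml", checkIfExists: true)

        MULTIQC (
            ch_multiqc_files.collect(),    // stage all report inputs together ("?/*")
            ch_multiqc_config.toList(),    // optional main config
            [],                            // optional extra_multiqc_config: none
            []                             // optional custom logo: none
        )

        emit:
        report   = MULTIQC.out.report      // *multiqc_report.html
        versions = MULTIQC.out.versions    // versions.yml
    }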
+ homepage: https://multiqc.info/ + documentation: https://multiqc.info/docs/ + licence: ["GPL-3.0-or-later"] +input: + - multiqc_files: + type: file + description: | + List of reports / files recognised by MultiQC, for example the html and zip output of FastQC + - multiqc_config: + type: file + description: Optional config yml for MultiQC + pattern: "*.{yml,yaml}" + - extra_multiqc_config: + type: file + description: Second optional config yml for MultiQC. Will override common sections in multiqc_config. + pattern: "*.{yml,yaml}" + - multiqc_logo: + type: file + description: Optional logo file for MultiQC + pattern: "*.{png}" +output: + - report: + type: file + description: MultiQC report file + pattern: "multiqc_report.html" + - data: + type: directory + description: MultiQC data dir + pattern: "multiqc_data" + - plots: + type: file + description: Plots created by MultiQC + pattern: "*_data" + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" +authors: + - "@abhi18av" + - "@bunop" + - "@drpatelh" + - "@jfy133" +maintainers: + - "@abhi18av" + - "@bunop" + - "@drpatelh" + - "@jfy133" diff --git a/modules/nf-core/multiqc/tests/main.nf.test b/modules/nf-core/multiqc/tests/main.nf.test new file mode 100644 index 00000000..f1c4242e --- /dev/null +++ b/modules/nf-core/multiqc/tests/main.nf.test @@ -0,0 +1,84 @@ +nextflow_process { + + name "Test Process MULTIQC" + script "../main.nf" + process "MULTIQC" + + tag "modules" + tag "modules_nfcore" + tag "multiqc" + + test("sarscov2 single-end [fastqc]") { + + when { + process { + """ + input[0] = Channel.of(file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/fastqc/test_fastqc.zip', checkIfExists: true)) + input[1] = [] + input[2] = [] + input[3] = [] + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert process.out.report[0] ==~ ".*/multiqc_report.html" }, + { assert process.out.data[0] ==~ ".*/multiqc_data" }, + { assert snapshot(process.out.versions).match("multiqc_versions_single") } + ) + } + + } + + test("sarscov2 single-end [fastqc] [config]") { + + when { + process { + """ + input[0] = Channel.of(file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/fastqc/test_fastqc.zip', checkIfExists: true)) + input[1] = Channel.of(file("https://github.com/nf-core/tools/raw/dev/nf_core/pipeline-template/assets/multiqc_config.yml", checkIfExists: true)) + input[2] = [] + input[3] = [] + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert process.out.report[0] ==~ ".*/multiqc_report.html" }, + { assert process.out.data[0] ==~ ".*/multiqc_data" }, + { assert snapshot(process.out.versions).match("multiqc_versions_config") } + ) + } + } + + test("sarscov2 single-end [fastqc] - stub") { + + options "-stub" + + when { + process { + """ + input[0] = Channel.of(file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/fastqc/test_fastqc.zip', checkIfExists: true)) + input[1] = [] + input[2] = [] + input[3] = [] + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out.report.collect { file(it).getName() } + + process.out.data.collect { file(it).getName() } + + process.out.plots.collect { file(it).getName() } + + process.out.versions ).match("multiqc_stub") } + ) + } + + } +} diff --git a/modules/nf-core/multiqc/tests/main.nf.test.snap b/modules/nf-core/multiqc/tests/main.nf.test.snap new file mode 100644 index 00000000..549ba79c --- /dev/null +++ 
b/modules/nf-core/multiqc/tests/main.nf.test.snap @@ -0,0 +1,41 @@ +{ + "multiqc_versions_single": { + "content": [ + [ + "versions.yml:md5,14e9a2661241abd828f4f06a7b5c222d" + ] + ], + "meta": { + "nf-test": "0.8.4", + "nextflow": "23.10.1" + }, + "timestamp": "2024-01-31T17:43:40.529579" + }, + "multiqc_stub": { + "content": [ + [ + "multiqc_report.html", + "multiqc_data", + "multiqc_plots", + "versions.yml:md5,14e9a2661241abd828f4f06a7b5c222d" + ] + ], + "meta": { + "nf-test": "0.8.4", + "nextflow": "23.10.1" + }, + "timestamp": "2024-01-31T17:45:09.605359" + }, + "multiqc_versions_config": { + "content": [ + [ + "versions.yml:md5,14e9a2661241abd828f4f06a7b5c222d" + ] + ], + "meta": { + "nf-test": "0.8.4", + "nextflow": "23.10.1" + }, + "timestamp": "2024-01-31T17:44:53.535994" + } +} \ No newline at end of file diff --git a/modules/nf-core/multiqc/tests/tags.yml b/modules/nf-core/multiqc/tests/tags.yml new file mode 100644 index 00000000..bea6c0d3 --- /dev/null +++ b/modules/nf-core/multiqc/tests/tags.yml @@ -0,0 +1,2 @@ +multiqc: + - modules/nf-core/multiqc/** diff --git a/modules/nf-core/prodigal/environment.yml b/modules/nf-core/prodigal/environment.yml new file mode 100644 index 00000000..85746534 --- /dev/null +++ b/modules/nf-core/prodigal/environment.yml @@ -0,0 +1,8 @@ +name: prodigal +channels: + - conda-forge + - bioconda + - defaults +dependencies: + - bioconda::prodigal=2.6.3 + - conda-forge::pigz=2.6 diff --git a/modules/nf-core/prodigal/main.nf b/modules/nf-core/prodigal/main.nf new file mode 100644 index 00000000..b7df4787 --- /dev/null +++ b/modules/nf-core/prodigal/main.nf @@ -0,0 +1,44 @@ +process PRODIGAL { + tag "$meta.id" + label 'process_single' + + conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
+ 'https://depot.galaxyproject.org/singularity/mulled-v2-2e442ba7b07bfa102b9cf8fac6221263cd746ab8:57f05cfa73f769d6ed6d54144cb3aa2a6a6b17e0-0' : + 'biocontainers/mulled-v2-2e442ba7b07bfa102b9cf8fac6221263cd746ab8:57f05cfa73f769d6ed6d54144cb3aa2a6a6b17e0-0' }" + + input: + tuple val(meta), path(genome) + val(output_format) + + output: + tuple val(meta), path("${prefix}.${output_format}.gz"), emit: gene_annotations + tuple val(meta), path("${prefix}.fna.gz"), emit: nucleotide_fasta + tuple val(meta), path("${prefix}.faa.gz"), emit: amino_acid_fasta + tuple val(meta), path("${prefix}_all.txt.gz"), emit: all_gene_annotations + path "versions.yml", emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + prefix = task.ext.prefix ?: "${meta.id}" + """ + pigz -cdf ${genome} | prodigal \\ + $args \\ + -f $output_format \\ + -d "${prefix}.fna" \\ + -o "${prefix}.${output_format}" \\ + -a "${prefix}.faa" \\ + -s "${prefix}_all.txt" + + pigz -nm ${prefix}* + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + prodigal: \$(prodigal -v 2>&1 | sed -n 's/Prodigal V\\(.*\\):.*/\\1/p') + pigz: \$(pigz -V 2>&1 | sed 's/pigz //g') + END_VERSIONS + """ +} diff --git a/modules/nf-core/prodigal/meta.yml b/modules/nf-core/prodigal/meta.yml new file mode 100644 index 00000000..a5d15d58 --- /dev/null +++ b/modules/nf-core/prodigal/meta.yml @@ -0,0 +1,56 @@ +name: prodigal +description: Prodigal (Prokaryotic Dynamic Programming Genefinding Algorithm) is a microbial (bacterial and archaeal) gene finding program +keywords: + - prokaryotes + - gene finding + - microbial +tools: + - prodigal: + description: Prodigal (Prokaryotic Dynamic Programming Genefinding Algorithm) is a microbial (bacterial and archaeal) gene finding program + homepage: https://github.com/hyattpd/Prodigal + documentation: https://github.com/hyattpd/prodigal/wiki + tool_dev_url: https://github.com/hyattpd/Prodigal + doi: "10.1186/1471-2105-11-119" + licence: ["GPL v3"] +input: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - genome: + type: file + description: fasta/fasta.gz file + - output_format: + type: string + description: Output format ("gbk"/"gff"/"sqn"/"sco") +output: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. 
[ id:'test', single_end:false ] + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" + - nucleotide_fasta: + type: file + description: nucleotide sequences file + pattern: "*.{fna}" + - amino_acid_fasta: + type: file + description: protein translations file + pattern: "*.{faa}" + - all_gene_annotations: + type: file + description: complete starts file + pattern: "*.{_all.txt}" + - gene_annotations: + type: file + description: gene annotations in output_format given as input + pattern: "*.{output_format}" +authors: + - "@grst" +maintainers: + - "@grst" diff --git a/modules/nf-core/prokka/environment.yml b/modules/nf-core/prokka/environment.yml new file mode 100644 index 00000000..d7c44d5a --- /dev/null +++ b/modules/nf-core/prokka/environment.yml @@ -0,0 +1,7 @@ +name: prokka +channels: + - conda-forge + - bioconda + - defaults +dependencies: + - bioconda::prokka=1.14.6 diff --git a/modules/nf-core/prokka/main.nf b/modules/nf-core/prokka/main.nf new file mode 100644 index 00000000..adfda037 --- /dev/null +++ b/modules/nf-core/prokka/main.nf @@ -0,0 +1,52 @@ +process PROKKA { + tag "$meta.id" + label 'process_low' + + conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/prokka:1.14.6--pl5321hdfd78af_4' : + 'biocontainers/prokka:1.14.6--pl5321hdfd78af_4' }" + + input: + tuple val(meta), path(fasta) + path proteins + path prodigal_tf + + output: + tuple val(meta), path("${prefix}/*.gff"), emit: gff + tuple val(meta), path("${prefix}/*.gbk"), emit: gbk + tuple val(meta), path("${prefix}/*.fna"), emit: fna + tuple val(meta), path("${prefix}/*.faa"), emit: faa + tuple val(meta), path("${prefix}/*.ffn"), emit: ffn + tuple val(meta), path("${prefix}/*.sqn"), emit: sqn + tuple val(meta), path("${prefix}/*.fsa"), emit: fsa + tuple val(meta), path("${prefix}/*.tbl"), emit: tbl + tuple val(meta), path("${prefix}/*.err"), emit: err + tuple val(meta), path("${prefix}/*.log"), emit: log + tuple val(meta), path("${prefix}/*.txt"), emit: txt + tuple val(meta), path("${prefix}/*.tsv"), emit: tsv + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + prefix = task.ext.prefix ?: "${meta.id}" + def proteins_opt = proteins ? "--proteins ${proteins[0]}" : "" + def prodigal_tf = prodigal_tf ? "--prodigaltf ${prodigal_tf[0]}" : "" + """ + prokka \\ + $args \\ + --cpus $task.cpus \\ + --prefix $prefix \\ + $proteins_opt \\ + $prodigal_tf \\ + $fasta + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + prokka: \$(echo \$(prokka --version 2>&1) | sed 's/^.*prokka //') + END_VERSIONS + """ +} diff --git a/modules/nf-core/prokka/meta.yml b/modules/nf-core/prokka/meta.yml new file mode 100644 index 00000000..9d82ffac --- /dev/null +++ b/modules/nf-core/prokka/meta.yml @@ -0,0 +1,90 @@ +name: prokka +description: Whole genome annotation of small genomes (bacterial, archeal, viral) +keywords: + - annotation + - fasta + - prokka +tools: + - prokka: + description: Rapid annotation of prokaryotic genomes + homepage: https://github.com/tseemann/prokka + doi: "10.1093/bioinformatics/btu153" + licence: ["GPL v2"] +input: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - fasta: + type: file + description: | + FASTA file to be annotated. 
Has to contain at least a non-empty string dummy value. + - proteins: + type: file + description: FASTA file of trusted proteins to first annotate from (optional) + - prodigal_tf: + type: file + description: Training file to use for Prodigal (optional) +output: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" + - gff: + type: file + description: annotation in GFF3 format, containing both sequences and annotations + pattern: "*.{gff}" + - gbk: + type: file + description: annotation in GenBank format, containing both sequences and annotations + pattern: "*.{gbk}" + - fna: + type: file + description: nucleotide FASTA file of the input contig sequences + pattern: "*.{fna}" + - faa: + type: file + description: protein FASTA file of the translated CDS sequences + pattern: "*.{faa}" + - ffn: + type: file + description: nucleotide FASTA file of all the prediction transcripts (CDS, rRNA, tRNA, tmRNA, misc_RNA) + pattern: "*.{ffn}" + - sqn: + type: file + description: an ASN1 format "Sequin" file for submission to Genbank + pattern: "*.{sqn}" + - fsa: + type: file + description: nucleotide FASTA file of the input contig sequences, used by "tbl2asn" to create the .sqn file + pattern: "*.{fsa}" + - tbl: + type: file + description: feature Table file, used by "tbl2asn" to create the .sqn file + pattern: "*.{tbl}" + - err: + type: file + description: unacceptable annotations - the NCBI discrepancy report. + pattern: "*.{err}" + - log: + type: file + description: contains all the output that Prokka produced during its run + pattern: "*.{log}" + - txt: + type: file + description: statistics relating to the annotated features found + pattern: "*.{txt}" + - tsv: + type: file + description: tab-separated file of all features (locus_tag,ftype,len_bp,gene,EC_number,COG,product) + pattern: "*.{tsv}" +authors: + - "@rpetit3" +maintainers: + - "@rpetit3" diff --git a/modules/nf-core/prokka/tests/main.nf.test b/modules/nf-core/prokka/tests/main.nf.test new file mode 100644 index 00000000..3b59ef3a --- /dev/null +++ b/modules/nf-core/prokka/tests/main.nf.test @@ -0,0 +1,46 @@ +nextflow_process { + + name "Test Process PROKKA" + script "../main.nf" + process "PROKKA" + + tag "modules" + tag "modules_nfcore" + tag "prokka" + + test("Prokka - sarscov2 - genome.fasta") { + + when { + process { + """ + input[0] = Channel.fromList([ + tuple([ id:'test', single_end:false ], // meta map + file(params.test_data['sarscov2']['genome']['genome_fasta'], checkIfExists: true)) + ]) + input[1] = [] + input[2] = [] + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out.gff).match("gff") }, + { assert snapshot(process.out.fna).match("fna") }, + { assert snapshot(process.out.faa).match("faa") }, + { assert snapshot(process.out.ffn).match("ffn") }, + { assert snapshot(process.out.fsa).match("fsa") }, + { assert snapshot(process.out.tbl).match("tbl") }, + { assert snapshot(process.out.err).match("err") }, + { assert snapshot(process.out.txt).match("txt") }, + { assert snapshot(process.out.tsv).match("tsv") }, + { assert path(process.out.gbk.get(0).get(1)).exists() }, + { assert path(process.out.log.get(0).get(1)).exists() }, + { assert path(process.out.sqn.get(0).get(1)).exists() } + ) + } + + } + +} diff --git a/modules/nf-core/prokka/tests/main.nf.test.snap b/modules/nf-core/prokka/tests/main.nf.test.snap new file mode 
100644 index 00000000..859e8df8 --- /dev/null +++ b/modules/nf-core/prokka/tests/main.nf.test.snap @@ -0,0 +1,128 @@ +{ + "txt": { + "content": [ + [ + [ + { + "id": "test", + "single_end": false + }, + "test.txt:md5,b40e485ffc8eaf1feacf8d79d9751a33" + ] + ] + ], + "timestamp": "2023-12-14T15:19:54.84139118" + }, + "err": { + "content": [ + [ + [ + { + "id": "test", + "single_end": false + }, + "test.err:md5,b3daedc646fddd422824e2b3e5e9229d" + ] + ] + ], + "timestamp": "2023-12-14T15:19:54.837204155" + }, + "fsa": { + "content": [ + [ + [ + { + "id": "test", + "single_end": false + }, + "test.fsa:md5,71bbefcb7f12046bcd3263f58cfd5404" + ] + ] + ], + "timestamp": "2023-12-14T15:19:54.803513721" + }, + "gff": { + "content": [ + [ + [ + { + "id": "test", + "single_end": false + }, + "test.gff:md5,5dbfb8fcf2db020564c16045976a0933" + ] + ] + ], + "timestamp": "2023-12-14T15:19:54.710100529" + }, + "tsv": { + "content": [ + [ + [ + { + "id": "test", + "single_end": false + }, + "test.tsv:md5,da7c720c3018c5081d6a70b517b7d450" + ] + ] + ], + "timestamp": "2023-12-14T15:19:54.846026731" + }, + "faa": { + "content": [ + [ + [ + { + "id": "test", + "single_end": false + }, + "test.faa:md5,a4ceda83262b3c222a6b1f508fb9e24b" + ] + ] + ], + "timestamp": "2023-12-14T15:19:54.722112433" + }, + "fna": { + "content": [ + [ + [ + { + "id": "test", + "single_end": false + }, + "test.fna:md5,787307f29a263e5657cc276ebbf7e2b3" + ] + ] + ], + "timestamp": "2023-12-14T15:19:54.717325796" + }, + "ffn": { + "content": [ + [ + [ + { + "id": "test", + "single_end": false + }, + "test.ffn:md5,80f474b5367b7ea5ed23791935f65e34" + ] + ] + ], + "timestamp": "2023-12-14T15:19:54.727149899" + }, + "tbl": { + "content": [ + [ + [ + { + "id": "test", + "single_end": false + }, + "test.tbl:md5,d8f816a066ced94b62d9618b13fb8add" + ] + ] + ], + "timestamp": "2023-12-14T15:19:54.831206944" + } +} \ No newline at end of file diff --git a/modules/nf-core/prokka/tests/tags.yml b/modules/nf-core/prokka/tests/tags.yml new file mode 100644 index 00000000..a2dc7bdc --- /dev/null +++ b/modules/nf-core/prokka/tests/tags.yml @@ -0,0 +1,2 @@ +prokka: + - "modules/nf-core/prokka/**" diff --git a/modules/nf-core/samtools/flagstat/environment.yml b/modules/nf-core/samtools/flagstat/environment.yml new file mode 100644 index 00000000..bd57cb54 --- /dev/null +++ b/modules/nf-core/samtools/flagstat/environment.yml @@ -0,0 +1,8 @@ +name: samtools_flagstat +channels: + - conda-forge + - bioconda + - defaults +dependencies: + - bioconda::samtools=1.19.2 + - bioconda::htslib=1.19.1 diff --git a/modules/nf-core/samtools/flagstat/main.nf b/modules/nf-core/samtools/flagstat/main.nf new file mode 100644 index 00000000..eb5f5252 --- /dev/null +++ b/modules/nf-core/samtools/flagstat/main.nf @@ -0,0 +1,46 @@ +process SAMTOOLS_FLAGSTAT { + tag "$meta.id" + label 'process_single' + + conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
+ 'https://depot.galaxyproject.org/singularity/samtools:1.19.2--h50ea8bc_0' : + 'biocontainers/samtools:1.19.2--h50ea8bc_0' }" + + input: + tuple val(meta), path(bam), path(bai) + + output: + tuple val(meta), path("*.flagstat"), emit: flagstat + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + """ + samtools \\ + flagstat \\ + --threads ${task.cpus} \\ + $bam \\ + > ${prefix}.flagstat + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + samtools: \$(echo \$(samtools --version 2>&1) | sed 's/^.*samtools //; s/Using.*\$//') + END_VERSIONS + """ + + stub: + def prefix = task.ext.prefix ?: "${meta.id}" + """ + touch ${prefix}.flagstat + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + samtools: \$(echo \$(samtools --version 2>&1) | sed 's/^.*samtools //; s/Using.*\$//') + END_VERSIONS + """ +} diff --git a/modules/nf-core/samtools/flagstat/meta.yml b/modules/nf-core/samtools/flagstat/meta.yml new file mode 100644 index 00000000..97991358 --- /dev/null +++ b/modules/nf-core/samtools/flagstat/meta.yml @@ -0,0 +1,51 @@ +name: samtools_flagstat +description: Counts the number of alignments in a BAM/CRAM/SAM file for each FLAG type +keywords: + - stats + - mapping + - counts + - bam + - sam + - cram +tools: + - samtools: + description: | + SAMtools is a set of utilities for interacting with and post-processing + short DNA sequence read alignments in the SAM, BAM and CRAM formats, written by Heng Li. + These files are generated as output by short read aligners like BWA. + homepage: http://www.htslib.org/ + documentation: http://www.htslib.org/doc/samtools.html + doi: 10.1093/bioinformatics/btp352 + licence: ["MIT"] +input: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - bam: + type: file + description: BAM/CRAM/SAM file + pattern: "*.{bam,cram,sam}" + - bai: + type: file + description: Index for BAM/CRAM/SAM file + pattern: "*.{bai,crai,sai}" +output: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. 
[ id:'test', single_end:false ] + - flagstat: + type: file + description: File containing samtools flagstat output + pattern: "*.{flagstat}" + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" +authors: + - "@drpatelh" +maintainers: + - "@drpatelh" diff --git a/modules/nf-core/samtools/flagstat/tests/main.nf.test b/modules/nf-core/samtools/flagstat/tests/main.nf.test new file mode 100644 index 00000000..24c3c04b --- /dev/null +++ b/modules/nf-core/samtools/flagstat/tests/main.nf.test @@ -0,0 +1,36 @@ +nextflow_process { + + name "Test Process SAMTOOLS_FLAGSTAT" + script "../main.nf" + process "SAMTOOLS_FLAGSTAT" + tag "modules" + tag "modules_nfcore" + tag "samtools" + tag "samtools/flagstat" + + test("BAM") { + + when { + params { + outdir = "$outputDir" + } + process { + """ + input[0] = Channel.of([ + [ id:'test', single_end:false ], // meta map + file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/bam/test.paired_end.sorted.bam', checkIfExists: true), + file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/bam/test.paired_end.sorted.bam.bai', checkIfExists: true) + ]) + """ + } + } + + then { + assertAll ( + { assert process.success }, + { assert snapshot(process.out.flagstat).match("flagstat") }, + { assert snapshot(process.out.versions).match("versions") } + ) + } + } +} diff --git a/modules/nf-core/samtools/flagstat/tests/main.nf.test.snap b/modules/nf-core/samtools/flagstat/tests/main.nf.test.snap new file mode 100644 index 00000000..a76fc27e --- /dev/null +++ b/modules/nf-core/samtools/flagstat/tests/main.nf.test.snap @@ -0,0 +1,32 @@ +{ + "flagstat": { + "content": [ + [ + [ + { + "id": "test", + "single_end": false + }, + "test.flagstat:md5,4f7ffd1e6a5e85524d443209ac97d783" + ] + ] + ], + "meta": { + "nf-test": "0.8.4", + "nextflow": "23.04.3" + }, + "timestamp": "2024-02-12T18:31:37.783927" + }, + "versions": { + "content": [ + [ + "versions.yml:md5,fd0030ce49ab3a92091ad80260226452" + ] + ], + "meta": { + "nf-test": "0.8.4", + "nextflow": "24.01.0" + }, + "timestamp": "2024-02-13T16:11:44.299617452" + } +} \ No newline at end of file diff --git a/modules/nf-core/samtools/flagstat/tests/tags.yml b/modules/nf-core/samtools/flagstat/tests/tags.yml new file mode 100644 index 00000000..2d2b7255 --- /dev/null +++ b/modules/nf-core/samtools/flagstat/tests/tags.yml @@ -0,0 +1,2 @@ +samtools/flagstat: + - modules/nf-core/samtools/flagstat/** diff --git a/modules/nf-core/samtools/idxstats/environment.yml b/modules/nf-core/samtools/idxstats/environment.yml new file mode 100644 index 00000000..174973b8 --- /dev/null +++ b/modules/nf-core/samtools/idxstats/environment.yml @@ -0,0 +1,8 @@ +name: samtools_idxstats +channels: + - conda-forge + - bioconda + - defaults +dependencies: + - bioconda::samtools=1.19.2 + - bioconda::htslib=1.19.1 diff --git a/modules/nf-core/samtools/idxstats/main.nf b/modules/nf-core/samtools/idxstats/main.nf new file mode 100644 index 00000000..a544026f --- /dev/null +++ b/modules/nf-core/samtools/idxstats/main.nf @@ -0,0 +1,48 @@ +process SAMTOOLS_IDXSTATS { + tag "$meta.id" + label 'process_single' + + conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
+ 'https://depot.galaxyproject.org/singularity/samtools:1.19.2--h50ea8bc_0' : + 'biocontainers/samtools:1.19.2--h50ea8bc_0' }" + + input: + tuple val(meta), path(bam), path(bai) + + output: + tuple val(meta), path("*.idxstats"), emit: idxstats + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + + """ + samtools \\ + idxstats \\ + --threads ${task.cpus-1} \\ + $bam \\ + > ${prefix}.idxstats + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + samtools: \$(echo \$(samtools --version 2>&1) | sed 's/^.*samtools //; s/Using.*\$//') + END_VERSIONS + """ + + stub: + def prefix = task.ext.prefix ?: "${meta.id}" + + """ + touch ${prefix}.idxstats + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + samtools: \$(echo \$(samtools --version 2>&1) | sed 's/^.*samtools //; s/Using.*\$//') + END_VERSIONS + """ +} diff --git a/modules/nf-core/samtools/idxstats/meta.yml b/modules/nf-core/samtools/idxstats/meta.yml new file mode 100644 index 00000000..344e92a3 --- /dev/null +++ b/modules/nf-core/samtools/idxstats/meta.yml @@ -0,0 +1,52 @@ +name: samtools_idxstats +description: Reports alignment summary statistics for a BAM/CRAM/SAM file +keywords: + - stats + - mapping + - counts + - chromosome + - bam + - sam + - cram +tools: + - samtools: + description: | + SAMtools is a set of utilities for interacting with and post-processing + short DNA sequence read alignments in the SAM, BAM and CRAM formats, written by Heng Li. + These files are generated as output by short read aligners like BWA. + homepage: http://www.htslib.org/ + documentation: http://www.htslib.org/doc/samtools.html + doi: 10.1093/bioinformatics/btp352 + licence: ["MIT"] +input: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - bam: + type: file + description: BAM/CRAM/SAM file + pattern: "*.{bam,cram,sam}" + - bai: + type: file + description: Index for BAM/CRAM/SAM file + pattern: "*.{bai,crai,sai}" +output: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. 
[ id:'test', single_end:false ] + - idxstats: + type: file + description: File containing samtools idxstats output + pattern: "*.{idxstats}" + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" +authors: + - "@drpatelh" +maintainers: + - "@drpatelh" diff --git a/modules/nf-core/samtools/idxstats/tests/main.nf.test b/modules/nf-core/samtools/idxstats/tests/main.nf.test new file mode 100644 index 00000000..a2dcb27c --- /dev/null +++ b/modules/nf-core/samtools/idxstats/tests/main.nf.test @@ -0,0 +1,36 @@ +nextflow_process { + + name "Test Process SAMTOOLS_IDXSTATS" + script "../main.nf" + process "SAMTOOLS_IDXSTATS" + tag "modules" + tag "modules_nfcore" + tag "samtools" + tag "samtools/idxstats" + + test("bam") { + + when { + params { + outdir = "$outputDir" + } + process { + """ + input[0] = Channel.of([ + [ id:'test', single_end:false ], // meta map + file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/bam/test.paired_end.sorted.bam', checkIfExists: true), + file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/bam/test.paired_end.sorted.bam.bai', checkIfExists: true) + ]) + """ + } + } + + then { + assertAll ( + { assert process.success }, + { assert snapshot(process.out.idxstats).match("idxstats") }, + { assert snapshot(process.out.versions).match("versions") } + ) + } + } +} diff --git a/modules/nf-core/samtools/idxstats/tests/main.nf.test.snap b/modules/nf-core/samtools/idxstats/tests/main.nf.test.snap new file mode 100644 index 00000000..a7050bdc --- /dev/null +++ b/modules/nf-core/samtools/idxstats/tests/main.nf.test.snap @@ -0,0 +1,32 @@ +{ + "versions": { + "content": [ + [ + "versions.yml:md5,613dde56f108418039ffcdeeddba397a" + ] + ], + "meta": { + "nf-test": "0.8.4", + "nextflow": "24.01.0" + }, + "timestamp": "2024-02-13T16:16:50.147462763" + }, + "idxstats": { + "content": [ + [ + [ + { + "id": "test", + "single_end": false + }, + "test.idxstats:md5,df60a8c8d6621100d05178c93fb053a2" + ] + ] + ], + "meta": { + "nf-test": "0.8.4", + "nextflow": "23.04.3" + }, + "timestamp": "2024-02-12T18:36:41.561026" + } +} \ No newline at end of file diff --git a/modules/nf-core/samtools/idxstats/tests/tags.yml b/modules/nf-core/samtools/idxstats/tests/tags.yml new file mode 100644 index 00000000..d3057c61 --- /dev/null +++ b/modules/nf-core/samtools/idxstats/tests/tags.yml @@ -0,0 +1,2 @@ +samtools/idxstats: + - modules/nf-core/samtools/idxstats/** diff --git a/modules/nf-core/samtools/index/environment.yml b/modules/nf-core/samtools/index/environment.yml new file mode 100644 index 00000000..a5e50649 --- /dev/null +++ b/modules/nf-core/samtools/index/environment.yml @@ -0,0 +1,8 @@ +name: samtools_index +channels: + - conda-forge + - bioconda + - defaults +dependencies: + - bioconda::samtools=1.19.2 + - bioconda::htslib=1.19.1 diff --git a/modules/nf-core/samtools/index/main.nf b/modules/nf-core/samtools/index/main.nf new file mode 100644 index 00000000..dc14f98d --- /dev/null +++ b/modules/nf-core/samtools/index/main.nf @@ -0,0 +1,48 @@ +process SAMTOOLS_INDEX { + tag "$meta.id" + label 'process_low' + + conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
+ 'https://depot.galaxyproject.org/singularity/samtools:1.19.2--h50ea8bc_0' : + 'biocontainers/samtools:1.19.2--h50ea8bc_0' }" + + input: + tuple val(meta), path(input) + + output: + tuple val(meta), path("*.bai") , optional:true, emit: bai + tuple val(meta), path("*.csi") , optional:true, emit: csi + tuple val(meta), path("*.crai"), optional:true, emit: crai + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + """ + samtools \\ + index \\ + -@ ${task.cpus-1} \\ + $args \\ + $input + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + samtools: \$(echo \$(samtools --version 2>&1) | sed 's/^.*samtools //; s/Using.*\$//') + END_VERSIONS + """ + + stub: + """ + touch ${input}.bai + touch ${input}.crai + touch ${input}.csi + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + samtools: \$(echo \$(samtools --version 2>&1) | sed 's/^.*samtools //; s/Using.*\$//') + END_VERSIONS + """ +} diff --git a/modules/nf-core/samtools/index/meta.yml b/modules/nf-core/samtools/index/meta.yml new file mode 100644 index 00000000..01a4ee03 --- /dev/null +++ b/modules/nf-core/samtools/index/meta.yml @@ -0,0 +1,57 @@ +name: samtools_index +description: Index SAM/BAM/CRAM file +keywords: + - index + - bam + - sam + - cram +tools: + - samtools: + description: | + SAMtools is a set of utilities for interacting with and post-processing + short DNA sequence read alignments in the SAM, BAM and CRAM formats, written by Heng Li. + These files are generated as output by short read aligners like BWA. + homepage: http://www.htslib.org/ + documentation: http://www.htslib.org/doc/samtools.html + doi: 10.1093/bioinformatics/btp352 + licence: ["MIT"] +input: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - bam: + type: file + description: BAM/CRAM/SAM file + pattern: "*.{bam,cram,sam}" +output: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. 
[ id:'test', single_end:false ] + - bai: + type: file + description: BAM/CRAM/SAM index file + pattern: "*.{bai,crai,sai}" + - crai: + type: file + description: BAM/CRAM/SAM index file + pattern: "*.{bai,crai,sai}" + - csi: + type: file + description: CSI index file + pattern: "*.{csi}" + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" +authors: + - "@drpatelh" + - "@ewels" + - "@maxulysse" +maintainers: + - "@drpatelh" + - "@ewels" + - "@maxulysse" diff --git a/modules/nf-core/samtools/index/tests/csi.nextflow.config b/modules/nf-core/samtools/index/tests/csi.nextflow.config new file mode 100644 index 00000000..0ed260ef --- /dev/null +++ b/modules/nf-core/samtools/index/tests/csi.nextflow.config @@ -0,0 +1,7 @@ +process { + + withName: SAMTOOLS_INDEX { + ext.args = '-c' + } + +} diff --git a/modules/nf-core/samtools/index/tests/main.nf.test b/modules/nf-core/samtools/index/tests/main.nf.test new file mode 100644 index 00000000..bb7756d1 --- /dev/null +++ b/modules/nf-core/samtools/index/tests/main.nf.test @@ -0,0 +1,87 @@ +nextflow_process { + + name "Test Process SAMTOOLS_INDEX" + script "../main.nf" + process "SAMTOOLS_INDEX" + tag "modules" + tag "modules_nfcore" + tag "samtools" + tag "samtools/index" + + test("bai") { + + when { + params { + outdir = "$outputDir" + } + process { + """ + input[0] = Channel.of([ + [ id:'test', single_end:false ], // meta map + file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/bam/test.paired_end.sorted.bam', checkIfExists: true) + ]) + """ + } + } + + then { + assertAll ( + { assert process.success }, + { assert snapshot(process.out.bai).match("bai") }, + { assert snapshot(process.out.versions).match("bai_versions") } + ) + } + } + + test("crai") { + + when { + params { + outdir = "$outputDir" + } + process { + """ + input[0] = Channel.of([ + [ id:'test', single_end:false ], // meta map + file(params.modules_testdata_base_path + 'genomics/homo_sapiens/illumina/cram/test.paired_end.recalibrated.sorted.cram', checkIfExists: true) + ]) + """ + } + } + + then { + assertAll ( + { assert process.success }, + { assert snapshot(process.out.crai).match("crai") }, + { assert snapshot(process.out.versions).match("crai_versions") } + ) + } + } + + test("csi") { + + config "./csi.nextflow.config" + + when { + params { + outdir = "$outputDir" + } + process { + """ + input[0] = Channel.of([ + [ id:'test', single_end:false ], // meta map + file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/bam/test.paired_end.sorted.bam', checkIfExists: true) + ]) + """ + } + } + + then { + assertAll ( + { assert process.success }, + { assert path(process.out.csi.get(0).get(1)).exists() }, + { assert snapshot(process.out.versions).match("csi_versions") } + ) + } + } +} diff --git a/modules/nf-core/samtools/index/tests/main.nf.test.snap b/modules/nf-core/samtools/index/tests/main.nf.test.snap new file mode 100644 index 00000000..3dc8e7de --- /dev/null +++ b/modules/nf-core/samtools/index/tests/main.nf.test.snap @@ -0,0 +1,74 @@ +{ + "crai_versions": { + "content": [ + [ + "versions.yml:md5,cc4370091670b64bba7c7206403ffb3e" + ] + ], + "meta": { + "nf-test": "0.8.4", + "nextflow": "24.01.0" + }, + "timestamp": "2024-02-13T16:12:00.324667957" + }, + "csi_versions": { + "content": [ + [ + "versions.yml:md5,cc4370091670b64bba7c7206403ffb3e" + ] + ], + "meta": { + "nf-test": "0.8.4", + "nextflow": "24.01.0" + }, + "timestamp": "2024-02-13T16:12:07.885103162" + }, + "crai": { + "content": [ + [ + [ + { + 
"id": "test", + "single_end": false + }, + "test.paired_end.recalibrated.sorted.cram.crai:md5,14bc3bd5c89cacc8f4541f9062429029" + ] + ] + ], + "meta": { + "nf-test": "0.8.4", + "nextflow": "23.04.3" + }, + "timestamp": "2024-02-12T18:41:38.446424" + }, + "bai": { + "content": [ + [ + [ + { + "id": "test", + "single_end": false + }, + "test.paired_end.sorted.bam.bai:md5,704c10dd1326482448ca3073fdebc2f4" + ] + ] + ], + "meta": { + "nf-test": "0.8.4", + "nextflow": "23.04.3" + }, + "timestamp": "2024-02-12T18:40:46.579747" + }, + "bai_versions": { + "content": [ + [ + "versions.yml:md5,cc4370091670b64bba7c7206403ffb3e" + ] + ], + "meta": { + "nf-test": "0.8.4", + "nextflow": "24.01.0" + }, + "timestamp": "2024-02-13T16:11:51.641425452" + } +} \ No newline at end of file diff --git a/modules/nf-core/samtools/index/tests/tags.yml b/modules/nf-core/samtools/index/tests/tags.yml new file mode 100644 index 00000000..e0f58a7a --- /dev/null +++ b/modules/nf-core/samtools/index/tests/tags.yml @@ -0,0 +1,2 @@ +samtools/index: + - modules/nf-core/samtools/index/** diff --git a/modules/nf-core/samtools/sort/environment.yml b/modules/nf-core/samtools/sort/environment.yml new file mode 100644 index 00000000..4d898e48 --- /dev/null +++ b/modules/nf-core/samtools/sort/environment.yml @@ -0,0 +1,8 @@ +name: samtools_sort +channels: + - conda-forge + - bioconda + - defaults +dependencies: + - bioconda::samtools=1.19.2 + - bioconda::htslib=1.19.1 diff --git a/modules/nf-core/samtools/sort/main.nf b/modules/nf-core/samtools/sort/main.nf new file mode 100644 index 00000000..cdd8305d --- /dev/null +++ b/modules/nf-core/samtools/sort/main.nf @@ -0,0 +1,49 @@ +process SAMTOOLS_SORT { + tag "$meta.id" + label 'process_medium' + + conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/samtools:1.19.2--h50ea8bc_0' : + 'biocontainers/samtools:1.19.2--h50ea8bc_0' }" + + input: + tuple val(meta), path(bam) + + output: + tuple val(meta), path("*.bam"), emit: bam + tuple val(meta), path("*.csi"), emit: csi, optional: true + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + if ("$bam" == "${prefix}.bam") error "Input and output names are the same, use \"task.ext.prefix\" to disambiguate!" + """ + samtools sort \\ + $args \\ + -@ $task.cpus \\ + -o ${prefix}.bam \\ + -T $prefix \\ + $bam + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + samtools: \$(echo \$(samtools --version 2>&1) | sed 's/^.*samtools //; s/Using.*\$//') + END_VERSIONS + """ + + stub: + def prefix = task.ext.prefix ?: "${meta.id}" + """ + touch ${prefix}.bam + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + samtools: \$(echo \$(samtools --version 2>&1) | sed 's/^.*samtools //; s/Using.*\$//') + END_VERSIONS + """ +} diff --git a/modules/nf-core/samtools/sort/meta.yml b/modules/nf-core/samtools/sort/meta.yml new file mode 100644 index 00000000..2200de72 --- /dev/null +++ b/modules/nf-core/samtools/sort/meta.yml @@ -0,0 +1,51 @@ +name: samtools_sort +description: Sort SAM/BAM/CRAM file +keywords: + - sort + - bam + - sam + - cram +tools: + - samtools: + description: | + SAMtools is a set of utilities for interacting with and post-processing + short DNA sequence read alignments in the SAM, BAM and CRAM formats, written by Heng Li. 
+ These files are generated as output by short read aligners like BWA. + homepage: http://www.htslib.org/ + documentation: http://www.htslib.org/doc/samtools.html + doi: 10.1093/bioinformatics/btp352 + licence: ["MIT"] +input: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - bam: + type: file + description: BAM/CRAM/SAM file + pattern: "*.{bam,cram,sam}" +output: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - bam: + type: file + description: Sorted BAM/CRAM/SAM file + pattern: "*.{bam,cram,sam}" + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" + - csi: + type: file + description: BAM index file (optional) + pattern: "*.csi" +authors: + - "@drpatelh" + - "@ewels" +maintainers: + - "@drpatelh" + - "@ewels" diff --git a/modules/nf-core/samtools/sort/tests/main.nf.test b/modules/nf-core/samtools/sort/tests/main.nf.test new file mode 100644 index 00000000..31e24b88 --- /dev/null +++ b/modules/nf-core/samtools/sort/tests/main.nf.test @@ -0,0 +1,64 @@ +nextflow_process { + + name "Test Process SAMTOOLS_SORT" + script "../main.nf" + process "SAMTOOLS_SORT" + tag "modules" + tag "modules_nfcore" + tag "samtools" + tag "samtools/sort" + + test("bam") { + + config "./nextflow.config" + + when { + params { + outdir = "$outputDir" + } + process { + """ + input[0] = Channel.of([ + [ id:'test', single_end:false ], // meta map + file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/bam/test.paired_end.bam', checkIfExists: true) + ]) + """ + } + } + + then { + assertAll ( + { assert process.success }, + { assert snapshot(process.out).match() } + ) + } + } + + test("bam_stub") { + + config "./nextflow.config" + options "-stub-run" + + when { + params { + outdir = "$outputDir" + } + process { + """ + input[0] = Channel.of([ + [ id:'test', single_end:false ], // meta map + file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/bam/test.paired_end.bam', checkIfExists: true) + ]) + """ + } + } + + then { + assertAll ( + { assert process.success }, + { assert snapshot(file(process.out.bam[0][1]).name).match("bam_stub_bam") }, + { assert snapshot(process.out.versions).match("bam_stub_versions") } + ) + } + } +} diff --git a/modules/nf-core/samtools/sort/tests/main.nf.test.snap b/modules/nf-core/samtools/sort/tests/main.nf.test.snap new file mode 100644 index 00000000..a7cf0210 --- /dev/null +++ b/modules/nf-core/samtools/sort/tests/main.nf.test.snap @@ -0,0 +1,65 @@ +{ + "bam_stub_bam": { + "content": [ + "test.sorted.bam" + ], + "meta": { + "nf-test": "0.8.4", + "nextflow": "23.04.3" + }, + "timestamp": "2024-02-12T19:21:04.364044" + }, + "bam_stub_versions": { + "content": [ + [ + "versions.yml:md5,e6d43fefc9a8bff91c2ce6e3a1716eca" + ] + ], + "meta": { + "nf-test": "0.8.4", + "nextflow": "24.01.0" + }, + "timestamp": "2024-02-13T16:15:00.20800281" + }, + "bam": { + "content": [ + { + "0": [ + [ + { + "id": "test", + "single_end": false + }, + "test.sorted.bam:md5,c6ea1346ec4aae007eb40b708935088c" + ] + ], + "1": [ + + ], + "2": [ + "versions.yml:md5,e6d43fefc9a8bff91c2ce6e3a1716eca" + ], + "bam": [ + [ + { + "id": "test", + "single_end": false + }, + "test.sorted.bam:md5,c6ea1346ec4aae007eb40b708935088c" + ] + ], + "csi": [ + + ], + "versions": [ + "versions.yml:md5,e6d43fefc9a8bff91c2ce6e3a1716eca" + ] + } + ], + "meta": { + "nf-test": "0.8.4", + "nextflow": "24.01.0" + }, + 
"timestamp": "2024-02-13T16:14:52.736359271" + } +} \ No newline at end of file diff --git a/modules/nf-core/samtools/sort/tests/nextflow.config b/modules/nf-core/samtools/sort/tests/nextflow.config new file mode 100644 index 00000000..d0f35086 --- /dev/null +++ b/modules/nf-core/samtools/sort/tests/nextflow.config @@ -0,0 +1,7 @@ +process { + + withName: SAMTOOLS_SORT { + ext.prefix = { "${meta.id}.sorted" } + } + +} diff --git a/modules/nf-core/samtools/sort/tests/tags.yml b/modules/nf-core/samtools/sort/tests/tags.yml new file mode 100644 index 00000000..cd63ea20 --- /dev/null +++ b/modules/nf-core/samtools/sort/tests/tags.yml @@ -0,0 +1,3 @@ +samtools/sort: + - modules/nf-core/samtools/sort/** + - tests/modules/nf-core/samtools/sort/** diff --git a/modules/nf-core/samtools/stats/environment.yml b/modules/nf-core/samtools/stats/environment.yml new file mode 100644 index 00000000..67bb0ca4 --- /dev/null +++ b/modules/nf-core/samtools/stats/environment.yml @@ -0,0 +1,8 @@ +name: samtools_stats +channels: + - conda-forge + - bioconda + - defaults +dependencies: + - bioconda::samtools=1.19.2 + - bioconda::htslib=1.19.1 diff --git a/modules/nf-core/samtools/stats/main.nf b/modules/nf-core/samtools/stats/main.nf new file mode 100644 index 00000000..52b00f4b --- /dev/null +++ b/modules/nf-core/samtools/stats/main.nf @@ -0,0 +1,49 @@ +process SAMTOOLS_STATS { + tag "$meta.id" + label 'process_single' + + conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/samtools:1.19.2--h50ea8bc_0' : + 'biocontainers/samtools:1.19.2--h50ea8bc_0' }" + + input: + tuple val(meta), path(input), path(input_index) + tuple val(meta2), path(fasta) + + output: + tuple val(meta), path("*.stats"), emit: stats + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + def reference = fasta ? "--reference ${fasta}" : "" + """ + samtools \\ + stats \\ + --threads ${task.cpus} \\ + ${reference} \\ + ${input} \\ + > ${prefix}.stats + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + samtools: \$(echo \$(samtools --version 2>&1) | sed 's/^.*samtools //; s/Using.*\$//') + END_VERSIONS + """ + + stub: + def prefix = task.ext.prefix ?: "${meta.id}" + """ + touch ${prefix}.stats + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + samtools: \$(echo \$(samtools --version 2>&1) | sed 's/^.*samtools //; s/Using.*\$//') + END_VERSIONS + """ +} diff --git a/modules/nf-core/samtools/stats/meta.yml b/modules/nf-core/samtools/stats/meta.yml new file mode 100644 index 00000000..735ff812 --- /dev/null +++ b/modules/nf-core/samtools/stats/meta.yml @@ -0,0 +1,63 @@ +name: samtools_stats +description: Produces comprehensive statistics from SAM/BAM/CRAM file +keywords: + - statistics + - counts + - bam + - sam + - cram +tools: + - samtools: + description: | + SAMtools is a set of utilities for interacting with and post-processing + short DNA sequence read alignments in the SAM, BAM and CRAM formats, written by Heng Li. + These files are generated as output by short read aligners like BWA. + homepage: http://www.htslib.org/ + documentation: http://www.htslib.org/doc/samtools.html + doi: 10.1093/bioinformatics/btp352 + licence: ["MIT"] +input: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. 
[ id:'test', single_end:false ] + - input: + type: file + description: BAM/CRAM file from alignment + pattern: "*.{bam,cram}" + - input_index: + type: file + description: BAI/CRAI file from alignment + pattern: "*.{bai,crai}" + - meta2: + type: map + description: | + Groovy Map containing reference information + e.g. [ id:'genome' ] + - fasta: + type: file + description: Reference file the CRAM was created with (optional) + pattern: "*.{fasta,fa}" +output: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - stats: + type: file + description: File containing samtools stats output + pattern: "*.{stats}" + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" +authors: + - "@drpatelh" + - "@FriederikeHanssen" + - "@ramprasadn" +maintainers: + - "@drpatelh" + - "@FriederikeHanssen" + - "@ramprasadn" diff --git a/modules/nf-core/samtools/stats/tests/main.nf.test b/modules/nf-core/samtools/stats/tests/main.nf.test new file mode 100644 index 00000000..e3d5cb14 --- /dev/null +++ b/modules/nf-core/samtools/stats/tests/main.nf.test @@ -0,0 +1,65 @@ +nextflow_process { + + name "Test Process SAMTOOLS_STATS" + script "../main.nf" + process "SAMTOOLS_STATS" + tag "modules" + tag "modules_nfcore" + tag "samtools" + tag "samtools/stats" + + test("bam") { + + when { + params { + outdir = "$outputDir" + } + process { + """ + input[0] = Channel.of([ + [ id:'test', single_end:false ], // meta map + file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/bam/test.paired_end.sorted.bam', checkIfExists: true), + file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/bam/test.paired_end.sorted.bam.bai', checkIfExists: true) + ]) + input[1] = [[],[]] + """ + } + } + + then { + assertAll( + {assert process.success}, + {assert snapshot(process.out).match()} + ) + } + } + + test("cram") { + + when { + params { + outdir = "$outputDir" + } + process { + """ + input[0] = Channel.of([ + [ id:'test', single_end:false ], // meta map + file(params.modules_testdata_base_path + 'genomics/homo_sapiens/illumina/cram/test.paired_end.recalibrated.sorted.cram', checkIfExists: true), + file(params.modules_testdata_base_path + 'genomics/homo_sapiens/illumina/cram/test.paired_end.recalibrated.sorted.cram.crai', checkIfExists: true) + ]) + input[1] = Channel.of([ + [ id:'genome' ], // meta map + file(params.modules_testdata_base_path + 'genomics/homo_sapiens/genome/genome.fasta', checkIfExists: true) + ]) + """ + } + } + + then { + assertAll( + {assert process.success}, + {assert snapshot(process.out).match()} + ) + } + } +} diff --git a/modules/nf-core/samtools/stats/tests/main.nf.test.snap b/modules/nf-core/samtools/stats/tests/main.nf.test.snap new file mode 100644 index 00000000..1b7c9ba4 --- /dev/null +++ b/modules/nf-core/samtools/stats/tests/main.nf.test.snap @@ -0,0 +1,72 @@ +{ + "cram": { + "content": [ + { + "0": [ + [ + { + "id": "test", + "single_end": false + }, + "test.stats:md5,01812900aa4027532906c5d431114233" + ] + ], + "1": [ + "versions.yml:md5,0514ceb1769b2a88843e08c1f82624a9" + ], + "stats": [ + [ + { + "id": "test", + "single_end": false + }, + "test.stats:md5,01812900aa4027532906c5d431114233" + ] + ], + "versions": [ + "versions.yml:md5,0514ceb1769b2a88843e08c1f82624a9" + ] + } + ], + "meta": { + "nf-test": "0.8.4", + "nextflow": "24.01.0" + }, + "timestamp": "2024-02-13T16:15:25.562429714" + }, + "bam": { + "content": [ + { + "0": [ + [ + { + "id": "test", + 
"single_end": false + }, + "test.stats:md5,5d8681bf541199898c042bf400391d59" + ] + ], + "1": [ + "versions.yml:md5,0514ceb1769b2a88843e08c1f82624a9" + ], + "stats": [ + [ + { + "id": "test", + "single_end": false + }, + "test.stats:md5,5d8681bf541199898c042bf400391d59" + ] + ], + "versions": [ + "versions.yml:md5,0514ceb1769b2a88843e08c1f82624a9" + ] + } + ], + "meta": { + "nf-test": "0.8.4", + "nextflow": "24.01.0" + }, + "timestamp": "2024-02-13T16:15:07.857611509" + } +} \ No newline at end of file diff --git a/modules/nf-core/samtools/stats/tests/tags.yml b/modules/nf-core/samtools/stats/tests/tags.yml new file mode 100644 index 00000000..7c28e30f --- /dev/null +++ b/modules/nf-core/samtools/stats/tests/tags.yml @@ -0,0 +1,2 @@ +samtools/stats: + - modules/nf-core/samtools/stats/** diff --git a/modules/nf-core/seqtk/mergepe/environment.yml b/modules/nf-core/seqtk/mergepe/environment.yml new file mode 100644 index 00000000..cefab216 --- /dev/null +++ b/modules/nf-core/seqtk/mergepe/environment.yml @@ -0,0 +1,7 @@ +name: seqtk_mergepe +channels: + - conda-forge + - bioconda + - defaults +dependencies: + - bioconda::seqtk=1.3 diff --git a/modules/nf-core/seqtk/mergepe/main.nf b/modules/nf-core/seqtk/mergepe/main.nf new file mode 100644 index 00000000..8a42428d --- /dev/null +++ b/modules/nf-core/seqtk/mergepe/main.nf @@ -0,0 +1,46 @@ +process SEQTK_MERGEPE { + tag "$meta.id" + label 'process_single' + + conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/seqtk:1.3--h5bf99c6_3' : + 'biocontainers/seqtk:1.3--h5bf99c6_3' }" + + input: + tuple val(meta), path(reads) + + output: + tuple val(meta), path("*.fastq.gz"), emit: reads + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + if (meta.single_end) { + """ + ln -s ${reads} ${prefix}.fastq.gz + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + seqtk: \$(echo \$(seqtk 2>&1) | sed 's/^.*Version: //; s/ .*\$//') + END_VERSIONS + """ + } else { + """ + seqtk \\ + mergepe \\ + $args \\ + ${reads} \\ + | gzip -n >> ${prefix}.fastq.gz + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + seqtk: \$(echo \$(seqtk 2>&1) | sed 's/^.*Version: //; s/ .*\$//') + END_VERSIONS + """ + } +} diff --git a/modules/nf-core/seqtk/mergepe/meta.yml b/modules/nf-core/seqtk/mergepe/meta.yml new file mode 100644 index 00000000..80df9e78 --- /dev/null +++ b/modules/nf-core/seqtk/mergepe/meta.yml @@ -0,0 +1,39 @@ +name: seqtk_mergepe +description: Interleave pair-end reads from FastQ files +keywords: + - interleave +tools: + - seqtk: + description: Seqtk is a fast and lightweight tool for processing sequences in the FASTA or FASTQ format. Seqtk mergepe command merges pair-end reads into one interleaved file. + homepage: https://github.com/lh3/seqtk + documentation: https://docs.csc.fi/apps/seqtk/ + tool_dev_url: https://github.com/lh3/seqtk + licence: ["MIT"] +input: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - reads: + type: file + description: List of input FastQ files of size 1 and 2 for single-end and paired-end data,respectively. + pattern: "*.{fastq.gz}" +output: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. 
[ id:'test', single_end:false ] + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" + - reads: + type: file + description: If single-end reads, the output is the same as the input, 1 FastQ file for each read. If pair-end reads, the read pairs will be interleaved and output as 1 FastQ file for each read pair. + pattern: "*.{fastq.gz}" +authors: + - "@emnilsson" +maintainers: + - "@emnilsson" diff --git a/modules/nf-core/seqtk/seq/environment.yml b/modules/nf-core/seqtk/seq/environment.yml new file mode 100644 index 00000000..78dc2c84 --- /dev/null +++ b/modules/nf-core/seqtk/seq/environment.yml @@ -0,0 +1,7 @@ +name: seqtk_seq +channels: + - conda-forge + - bioconda + - defaults +dependencies: + - bioconda::seqtk=1.4 diff --git a/modules/nf-core/seqtk/seq/main.nf b/modules/nf-core/seqtk/seq/main.nf new file mode 100644 index 00000000..af085f0d --- /dev/null +++ b/modules/nf-core/seqtk/seq/main.nf @@ -0,0 +1,40 @@ +process SEQTK_SEQ { + tag "$meta.id" + label 'process_single' + + conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/seqtk:1.4--he4a0461_1' : + 'biocontainers/seqtk:1.4--he4a0461_1' }" + + input: + tuple val(meta), path(fastx) + + output: + tuple val(meta), path("*.gz") , emit: fastx + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + + def extension = "fastq" + if ("$fastx" ==~ /.+\.fasta|.+\.fasta.gz|.+\.fa|.+\.fa.gz|.+\.fas|.+\.fas.gz|.+\.fna|.+\.fna.gz/ || "$args" ==~ /\-[aA]/ ) { + extension = "fasta" + } + """ + seqtk \\ + seq \\ + $args \\ + $fastx | \\ + gzip -c > ${prefix}.seqtk-seq.${extension}.gz + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + seqtk: \$(echo \$(seqtk 2>&1) | sed 's/^.*Version: //; s/ .*\$//') + END_VERSIONS + """ +} diff --git a/modules/nf-core/seqtk/seq/meta.yml b/modules/nf-core/seqtk/seq/meta.yml new file mode 100644 index 00000000..780ccc0c --- /dev/null +++ b/modules/nf-core/seqtk/seq/meta.yml @@ -0,0 +1,45 @@ +name: seqtk_seq +description: Common transformation operations on FASTA or FASTQ files. +keywords: + - seq + - filter + - transformation +tools: + - seqtk: + description: Seqtk is a fast and lightweight tool for processing sequences in the FASTA or FASTQ format. The seqtk seq command enables common transformation operations on FASTA or FASTQ files. + homepage: https://github.com/lh3/seqtk + documentation: https://docs.csc.fi/apps/seqtk/ + tool_dev_url: https://github.com/lh3/seqtk + licence: ["MIT"] +input: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test' ] + - sequences: + type: file + description: A FASTQ or FASTA file + pattern: "*.{fastq.gz, fastq, fq, fq.gz, fasta, fastq.gz, fa, fa.gz, fas, fas.gz, fna, fna.gz}" +output: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. 
[ id:'test' ] + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" + - sequences: + type: file + description: FASTQ/FASTA file containing renamed sequences + pattern: "*.{fastq.gz, fasta.gz}" +authors: + - "@hseabolt" + - "@mjcipriano" + - "@sateeshperi" +maintainers: + - "@hseabolt" + - "@mjcipriano" + - "@sateeshperi" diff --git a/modules/nf-core/spades/environment.yml b/modules/nf-core/spades/environment.yml new file mode 100644 index 00000000..12315814 --- /dev/null +++ b/modules/nf-core/spades/environment.yml @@ -0,0 +1,7 @@ +name: spades +channels: + - conda-forge + - bioconda + - defaults +dependencies: + - bioconda::spades=3.15.5 diff --git a/modules/nf-core/spades/main.nf b/modules/nf-core/spades/main.nf new file mode 100644 index 00000000..010525e9 --- /dev/null +++ b/modules/nf-core/spades/main.nf @@ -0,0 +1,73 @@ +process SPADES { + tag "$meta.id" + label 'process_high' + + conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/spades:3.15.5--h95f258a_1' : + 'biocontainers/spades:3.15.5--h95f258a_1' }" + + input: + tuple val(meta), path(illumina), path(pacbio), path(nanopore) + path yml + path hmm + + output: + tuple val(meta), path('*.scaffolds.fa.gz') , optional:true, emit: scaffolds + tuple val(meta), path('*.contigs.fa.gz') , optional:true, emit: contigs + tuple val(meta), path('*.transcripts.fa.gz') , optional:true, emit: transcripts + tuple val(meta), path('*.gene_clusters.fa.gz'), optional:true, emit: gene_clusters + tuple val(meta), path('*.assembly.gfa.gz') , optional:true, emit: gfa + tuple val(meta), path('*.log') , emit: log + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + def maxmem = task.memory.toGiga() + def illumina_reads = illumina ? ( meta.single_end ? "-s $illumina" : "-1 ${illumina[0]} -2 ${illumina[1]}" ) : "" + def pacbio_reads = pacbio ? "--pacbio $pacbio" : "" + def nanopore_reads = nanopore ? "--nanopore $nanopore" : "" + def custom_hmms = hmm ? "--custom-hmms $hmm" : "" + def reads = yml ? 
"--dataset $yml" : "$illumina_reads $pacbio_reads $nanopore_reads" + """ + spades.py \\ + $args \\ + --threads $task.cpus \\ + --memory $maxmem \\ + $custom_hmms \\ + $reads \\ + -o ./ + mv spades.log ${prefix}.spades.log + + if [ -f scaffolds.fasta ]; then + mv scaffolds.fasta ${prefix}.scaffolds.fa + gzip -n ${prefix}.scaffolds.fa + fi + if [ -f contigs.fasta ]; then + mv contigs.fasta ${prefix}.contigs.fa + gzip -n ${prefix}.contigs.fa + fi + if [ -f transcripts.fasta ]; then + mv transcripts.fasta ${prefix}.transcripts.fa + gzip -n ${prefix}.transcripts.fa + fi + if [ -f assembly_graph_with_scaffolds.gfa ]; then + mv assembly_graph_with_scaffolds.gfa ${prefix}.assembly.gfa + gzip -n ${prefix}.assembly.gfa + fi + + if [ -f gene_clusters.fasta ]; then + mv gene_clusters.fasta ${prefix}.gene_clusters.fa + gzip -n ${prefix}.gene_clusters.fa + fi + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + spades: \$(spades.py --version 2>&1 | sed 's/^.*SPAdes genome assembler v//; s/ .*\$//') + END_VERSIONS + """ +} diff --git a/modules/nf-core/spades/meta.yml b/modules/nf-core/spades/meta.yml new file mode 100644 index 00000000..99c482cd --- /dev/null +++ b/modules/nf-core/spades/meta.yml @@ -0,0 +1,94 @@ +name: spades +description: Assembles a small genome (bacterial, fungal, viral) +keywords: + - genome + - assembly + - genome assembler + - small genome + - de novo assembler +tools: + - spades: + description: SPAdes (St. Petersburg genome assembler) is intended for both standard isolates and single-cell MDA bacteria assemblies. + homepage: http://cab.spbu.ru/files/release3.15.0/manual.html + documentation: http://cab.spbu.ru/files/release3.15.0/manual.html + tool_dev_url: https://github.com/ablab/spades + doi: 10.1089/cmb.2012.0021 + licence: ["GPL v2"] +input: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - illumina: + type: file + description: | + List of input FastQ (Illumina or PacBio CCS reads) files + of size 1 and 2 for single-end and paired-end data, + respectively. This input data type is required. + - pacbio: + type: file + description: | + List of input PacBio CLR FastQ files of size 1. + - nanopore: + type: file + description: | + List of input FastQ files of size 1, originating from Oxford Nanopore technology. + - yml: + type: file + description: | + Path to yml file containing read information. + The raw FASTQ files listed in this YAML file MUST be supplied to the respective illumina/pacbio/nanopore input channel(s) _in addition_ to this YML. + File entries in this yml must contain only the file name and no paths. + pattern: "*.{yml,yaml}" + - hmm: + type: file + description: File or directory with amino acid HMMs for Spades HMM-guided mode. +output: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. 
[ id:'test', single_end:false ] + - scaffolds: + type: file + description: | + Fasta file containing scaffolds + pattern: "*.fa.gz" + - contigs: + type: file + description: | + Fasta file containing contigs + pattern: "*.fa.gz" + - transcripts: + type: file + description: | + Fasta file containing transcripts + pattern: "*.fa.gz" + - gene_clusters: + type: file + description: | + Fasta file containing gene_clusters + pattern: "*.fa.gz" + - gfa: + type: file + description: | + gfa file containing assembly + pattern: "*.gfa.gz" + - log: + type: file + description: | + Spades log file + pattern: "*.log" + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" +authors: + - "@JoseEspinosa" + - "@drpatelh" + - "@d4straub" +maintainers: + - "@JoseEspinosa" + - "@drpatelh" + - "@d4straub" diff --git a/modules/nf-core/subread/featurecounts/environment.yml b/modules/nf-core/subread/featurecounts/environment.yml new file mode 100644 index 00000000..ca19439d --- /dev/null +++ b/modules/nf-core/subread/featurecounts/environment.yml @@ -0,0 +1,7 @@ +name: subread_featurecounts +channels: + - conda-forge + - bioconda + - defaults +dependencies: + - bioconda::subread=2.0.1 diff --git a/modules/nf-core/subread/featurecounts/main.nf b/modules/nf-core/subread/featurecounts/main.nf new file mode 100644 index 00000000..20979962 --- /dev/null +++ b/modules/nf-core/subread/featurecounts/main.nf @@ -0,0 +1,47 @@ +process SUBREAD_FEATURECOUNTS { + tag "$meta.id" + label 'process_medium' + + conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/subread:2.0.1--hed695b0_0' : + 'biocontainers/subread:2.0.1--hed695b0_0' }" + + input: + tuple val(meta), path(bams), path(annotation) + + output: + tuple val(meta), path("*featureCounts.txt") , emit: counts + tuple val(meta), path("*featureCounts.txt.summary"), emit: summary + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + def paired_end = meta.single_end ? '' : '-p' + + def strandedness = 0 + if (meta.strandedness == 'forward') { + strandedness = 1 + } else if (meta.strandedness == 'reverse') { + strandedness = 2 + } + """ + featureCounts \\ + $args \\ + $paired_end \\ + -T $task.cpus \\ + -a $annotation \\ + -s $strandedness \\ + -o ${prefix}.featureCounts.txt \\ + ${bams.join(' ')} + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + subread: \$( echo \$(featureCounts -v 2>&1) | sed -e "s/featureCounts v//g") + END_VERSIONS + """ +} diff --git a/modules/nf-core/subread/featurecounts/meta.yml b/modules/nf-core/subread/featurecounts/meta.yml new file mode 100644 index 00000000..38a37945 --- /dev/null +++ b/modules/nf-core/subread/featurecounts/meta.yml @@ -0,0 +1,50 @@ +name: subread_featurecounts +description: Count reads that map to genomic features +keywords: + - counts + - fasta + - genome + - reference +tools: + - featurecounts: + description: featureCounts is a highly efficient general-purpose read summarization program that counts mapped reads for genomic features such as genes, exons, promoter, gene bodies, genomic bins and chromosomal locations. It can be used to count both RNA-seq and genomic DNA-seq reads. 
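For clarity, the strandedness handling in the SUBREAD_FEATURECOUNTS module above maps the meta map's strandedness value to the featureCounts -s flag (unstranded = 0, forward = 1, reverse = 2). A small standalone Groovy sketch of that same mapping, for illustration only:

// Illustrative only: same mapping as in the module script above.
def strandednessFlag = { meta ->
    meta.strandedness == 'forward' ? 1 : meta.strandedness == 'reverse' ? 2 : 0
}
assert strandednessFlag([ strandedness: 'forward' ])    == 1
assert strandednessFlag([ strandedness: 'reverse' ])    == 2
assert strandednessFlag([ strandedness: 'unstranded' ]) == 0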
+ homepage: http://bioinf.wehi.edu.au/featureCounts/ + documentation: http://bioinf.wehi.edu.au/subread-package/SubreadUsersGuide.pdf + doi: "10.1093/bioinformatics/btt656" + licence: ["GPL v3"] +input: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - bam: + type: file + description: BAM/SAM file containing read alignments + pattern: "*.{bam}" + - annotation: + type: file + description: Genomic features annotation in GTF or SAF + pattern: "*.{gtf,saf}" +output: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - counts: + type: file + description: Counts of reads mapping to features + pattern: "*featureCounts.txt" + - summary: + type: file + description: Summary log file + pattern: "*.featureCounts.txt.summary" + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" +authors: + - "@ntoda03" +maintainers: + - "@ntoda03" diff --git a/modules/nf-core/subread/featurecounts/tests/main.nf.test b/modules/nf-core/subread/featurecounts/tests/main.nf.test new file mode 100644 index 00000000..6ff22179 --- /dev/null +++ b/modules/nf-core/subread/featurecounts/tests/main.nf.test @@ -0,0 +1,82 @@ +nextflow_process { + + name "Test Process SUBREAD_FEATURECOUNTS" + script "../main.nf" + process "SUBREAD_FEATURECOUNTS" + config "./nextflow.config" + tag "modules" + tag "modules_nfcore" + tag "subread" + tag "subread/featurecounts" + + test("sarscov2 [bam] - forward") { + + when { + process { + """ + input[0] = [ + [ id:'test', single_end:true, strandedness:'forward' ], // meta map + file(params.modules_testdata_base_path + "genomics/sarscov2/illumina/bam/test.single_end.bam", checkIfExists: true), + file(params.modules_testdata_base_path + "genomics/sarscov2/genome/genome.gtf", checkIfExists: true) + ] + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out.counts).match("forward_counts") }, + { assert snapshot(process.out.summary).match("forward_summary") }, + { assert snapshot(process.out.versions).match("forward_versions") } + ) + } + } + + test("sarscov2 [bam] - reverse") { + + when { + process { + """ + input[0] = [ + [ id:'test', single_end:true, strandedness:'reverse' ], // meta map + file(params.modules_testdata_base_path + "genomics/sarscov2/illumina/bam/test.single_end.bam", checkIfExists: true), + file(params.modules_testdata_base_path + "genomics/sarscov2/genome/genome.gtf", checkIfExists: true) + ] + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out.counts).match("reverse_counts") }, + { assert snapshot(process.out.summary).match("reverse_summary") }, + { assert snapshot(process.out.versions).match("reverse_versions") } + ) + } + } + + test("sarscov2 [bam] - unstranded") { + + when { + process { + """ + input[0] = [ [ id:'test', single_end:true, strandedness:'unstranded' ], // meta map + file(params.test_data['sarscov2']['illumina']['test_single_end_bam'], checkIfExists: true), + file(params.test_data['sarscov2']['genome']['genome_gtf'], checkIfExists: true) + ] + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out.counts).match("unstranded_counts") }, + { assert snapshot(process.out.summary).match("unstranded_summary") }, + { assert snapshot(process.out.versions).match("unstranded_versions") } + ) + } + } +} diff --git 
a/modules/nf-core/subread/featurecounts/tests/main.nf.test.snap b/modules/nf-core/subread/featurecounts/tests/main.nf.test.snap new file mode 100644 index 00000000..ad5524f6 --- /dev/null +++ b/modules/nf-core/subread/featurecounts/tests/main.nf.test.snap @@ -0,0 +1,116 @@ +{ + "forward_counts": { + "content": [ + [ + [ + { + "id": "test", + "single_end": true, + "strandedness": "forward" + }, + "test.featureCounts.txt:md5,0012df4c0a0e47eec1440017ab34f75f" + ] + ] + ], + "timestamp": "2023-11-23T15:50:10.685863663" + }, + "unstranded_counts": { + "content": [ + [ + [ + { + "id": "test", + "single_end": true, + "strandedness": "unstranded" + }, + "test.featureCounts.txt:md5,3307d31b44a5d6bb3389786bb8f4e91f" + ] + ] + ], + "timestamp": "2023-11-23T15:50:38.67903701" + }, + "reverse_summary": { + "content": [ + [ + [ + { + "id": "test", + "single_end": true, + "strandedness": "reverse" + }, + "test.featureCounts.txt.summary:md5,7cfa30ad678b9bc1bc63afbb0281547b" + ] + ] + ], + "timestamp": "2023-11-23T15:50:25.168206514" + }, + "reverse_counts": { + "content": [ + [ + [ + { + "id": "test", + "single_end": true, + "strandedness": "reverse" + }, + "test.featureCounts.txt:md5,8175816b8260ed444d59232bd7e7120b" + ] + ] + ], + "timestamp": "2023-11-23T15:50:25.160010804" + }, + "forward_summary": { + "content": [ + [ + [ + { + "id": "test", + "single_end": true, + "strandedness": "forward" + }, + "test.featureCounts.txt.summary:md5,8f602ff9a8ef467af43294e80b367cdf" + ] + ] + ], + "timestamp": "2023-11-23T15:50:10.699024934" + }, + "forward_versions": { + "content": [ + [ + "versions.yml:md5,c2c0903b93c93d9afd2667052b9ee726" + ] + ], + "timestamp": "2023-11-23T15:50:10.704797013" + }, + "unstranded_summary": { + "content": [ + [ + [ + { + "id": "test", + "single_end": true, + "strandedness": "unstranded" + }, + "test.featureCounts.txt.summary:md5,23164b79f9f23f11c82820db61a35560" + ] + ] + ], + "timestamp": "2023-11-23T15:50:38.68776235" + }, + "reverse_versions": { + "content": [ + [ + "versions.yml:md5,c2c0903b93c93d9afd2667052b9ee726" + ] + ], + "timestamp": "2023-11-23T15:50:25.175265594" + }, + "unstranded_versions": { + "content": [ + [ + "versions.yml:md5,c2c0903b93c93d9afd2667052b9ee726" + ] + ], + "timestamp": "2023-11-23T15:50:38.69390501" + } +} \ No newline at end of file diff --git a/modules/nf-core/subread/featurecounts/tests/nextflow.config b/modules/nf-core/subread/featurecounts/tests/nextflow.config new file mode 100644 index 00000000..d9fd4fd5 --- /dev/null +++ b/modules/nf-core/subread/featurecounts/tests/nextflow.config @@ -0,0 +1,9 @@ +process { + + publishDir = { "${params.outdir}/${task.process.tokenize(':')[-1].tokenize('_')[0].toLowerCase()}" } + + withName: SUBREAD_FEATURECOUNTS { + ext.args = '-t CDS' + } + +} diff --git a/modules/nf-core/subread/featurecounts/tests/tags.yml b/modules/nf-core/subread/featurecounts/tests/tags.yml new file mode 100644 index 00000000..6d2534bf --- /dev/null +++ b/modules/nf-core/subread/featurecounts/tests/tags.yml @@ -0,0 +1,2 @@ +subread/featurecounts: + - modules/nf-core/subread/featurecounts/** diff --git a/modules/nf-core/transdecoder/longorf/environment.yml b/modules/nf-core/transdecoder/longorf/environment.yml new file mode 100644 index 00000000..29a06d31 --- /dev/null +++ b/modules/nf-core/transdecoder/longorf/environment.yml @@ -0,0 +1,7 @@ +name: transdecoder_longorf +channels: + - conda-forge + - bioconda + - defaults +dependencies: + - bioconda::transdecoder=5.5.0 diff --git a/modules/nf-core/transdecoder/longorf/main.nf 
b/modules/nf-core/transdecoder/longorf/main.nf new file mode 100644 index 00000000..19466443 --- /dev/null +++ b/modules/nf-core/transdecoder/longorf/main.nf @@ -0,0 +1,40 @@ +process TRANSDECODER_LONGORF { + tag "$meta.id" + label 'process_medium' + + conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/transdecoder:5.5.0--pl5262hdfd78af_4' : + 'quay.io/comp-bio-aging/transdecoder' }" + + input: + tuple val(meta), path(fasta) + + output: + tuple val(meta), path("${meta.id}/*.pep") , emit: pep + tuple val(meta), path("${meta.id}/*.gff3"), emit: gff3 + tuple val(meta), path("${meta.id}/*.cds") , emit: cds + tuple val(meta), path("${meta.id}/*.dat") , emit: dat + path("${meta.id}/") , emit: folder + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + + """ + TransDecoder.LongOrfs \\ + $args \\ + -O $prefix \\ + -t \\ + $fasta + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + transdecoder: \$(echo \$(TransDecoder.LongOrfs --version) | sed -e "s/TransDecoder.LongOrfs //g") + END_VERSIONS + """ +} diff --git a/modules/nf-core/transdecoder/longorf/meta.yml b/modules/nf-core/transdecoder/longorf/meta.yml new file mode 100644 index 00000000..e6c6bea3 --- /dev/null +++ b/modules/nf-core/transdecoder/longorf/meta.yml @@ -0,0 +1,55 @@ +name: transdecoder_longorf +description: TransDecoder itentifies candidate coding regions within transcript sequences. it is used to build gff file. +keywords: + - eucaryotes + - gff +tools: + - transdecoder: + description: TransDecoder identifies candidate coding regions within transcript sequences, such as those generated by de novo RNA-Seq transcript assembly using Trinity, or constructed based on RNA-Seq alignments to the genome using Tophat and Cufflinks. + homepage: https://github.com/TransDecoder + documentation: https://github.com/TransDecoder/TransDecoder/wiki + tool_dev_url: https://github.com/TransDecoder/TransDecoder + licence: ["Broad Institute"] +input: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - fasta: + type: file + description: fasta file + pattern: "*.{fasta}" +output: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" + - pep: + type: Amino acids fasta file + description: all ORFs meeting the minimum length criteria, regardless of coding potential. file + pattern: "*.{pep}" + - gff3: + type: gff file + description: positions of all ORFs as found in the target transcripts. file + pattern: "*.{gff3}" + - cds: + type: fasta file + description: the nucleotide coding sequence for all detected ORFs. 
file + pattern: "*{cds}" + - dat: + type: tsv file + description: nucleotide frequencies + pattern: "*{dat}" + - folder: + type: directory + description: contains all the files from the run +authors: + - "@Danilo2771" +maintainers: + - "@Danilo2771" diff --git a/modules/nf-core/transdecoder/predict/environment.yml b/modules/nf-core/transdecoder/predict/environment.yml new file mode 100644 index 00000000..07efaa4f --- /dev/null +++ b/modules/nf-core/transdecoder/predict/environment.yml @@ -0,0 +1,7 @@ +name: transdecoder_predict +channels: + - conda-forge + - bioconda + - defaults +dependencies: + - bioconda::transdecoder=5.5.0 diff --git a/modules/nf-core/transdecoder/predict/main.nf b/modules/nf-core/transdecoder/predict/main.nf new file mode 100644 index 00000000..7d56a79d --- /dev/null +++ b/modules/nf-core/transdecoder/predict/main.nf @@ -0,0 +1,40 @@ +process TRANSDECODER_PREDICT { + tag "$meta.id" + label 'process_medium' + + conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/transdecoder:5.5.0--pl5262hdfd78af_4': + 'quay.io/comp-bio-aging/transdecoder' }" + + input: + tuple val(meta), path(fasta) + path(fold) + + output: + tuple val(meta), path("*.transdecoder.pep") , emit: pep + tuple val(meta), path("*.transdecoder.gff3") , emit: gff3 + tuple val(meta), path("*.transdecoder.cds") , emit: cds + tuple val(meta), path("*.transdecoder.bed") , emit: bed + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + + """ + TransDecoder.Predict \\ + $args \\ + -O ${prefix} \\ + -t \\ + $fasta + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + transdecoder: \$(echo \$(TransDecoder.Predict --version) | sed -e "s/TransDecoder.Predict //g") + END_VERSIONS + """ +} diff --git a/modules/nf-core/transdecoder/predict/meta.yml b/modules/nf-core/transdecoder/predict/meta.yml new file mode 100644 index 00000000..ea2c9b8e --- /dev/null +++ b/modules/nf-core/transdecoder/predict/meta.yml @@ -0,0 +1,56 @@ +name: transdecoder_predict +description: TransDecoder identifies candidate coding regions within transcript sequences. It is used to build gff file. You can use this module after transdecoder_longorf +keywords: + - eukaryotes + - gff +tools: + - transdecoder: + description: TransDecoder identifies candidate coding regions within transcript sequences, such as those generated by de novo RNA-Seq transcript assembly using Trinity, or constructed based on RNA-Seq alignments to the genome using Tophat and Cufflinks. + homepage: https://github.com/TransDecoder + documentation: https://github.com/TransDecoder/TransDecoder/wiki + tool_dev_url: https://github.com/TransDecoder/TransDecoder + licence: ["Broad Institute"] +input: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - fasta: + type: file + description: fasta file + pattern: "*.{fasta}" + - fold: + type: directory + description: Output from the module transdecoder_longorf + pattern: "*" +output: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. 
[ id:'test', single_end:false ] + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" + - pep: + type: amino acids fasta file + description: All ORFs meeting the minimum length criteria, regardless of coding potential + pattern: "*.{pep}" + - gff3: + type: gff3 file + description: Positions of all ORFs as found in the target transcripts + pattern: "*.{gff3}" + - cds: + type: nucleotide fasta file + description: the nucleotide coding sequence for all detected ORFs + pattern: "*{cds}" + - bed: + type: bed file + description: bed file + pattern: "*{bed}" +authors: + - "@Danilo2771" +maintainers: + - "@Danilo2771" diff --git a/modules/nf-core/trimgalore/environment.yml b/modules/nf-core/trimgalore/environment.yml new file mode 100644 index 00000000..6cd0f51b --- /dev/null +++ b/modules/nf-core/trimgalore/environment.yml @@ -0,0 +1,7 @@ +name: trimgalore +channels: + - conda-forge + - bioconda + - defaults +dependencies: + - bioconda::trim-galore=0.6.7 diff --git a/modules/nf-core/trimgalore/main.nf b/modules/nf-core/trimgalore/main.nf new file mode 100644 index 00000000..24ead871 --- /dev/null +++ b/modules/nf-core/trimgalore/main.nf @@ -0,0 +1,75 @@ +process TRIMGALORE { + tag "$meta.id" + label 'process_high' + + conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/trim-galore:0.6.7--hdfd78af_0' : + 'biocontainers/trim-galore:0.6.7--hdfd78af_0' }" + + input: + tuple val(meta), path(reads) + + output: + tuple val(meta), path("*{3prime,5prime,trimmed,val}*.fq.gz"), emit: reads + tuple val(meta), path("*report.txt") , emit: log , optional: true + tuple val(meta), path("*unpaired*.fq.gz") , emit: unpaired, optional: true + tuple val(meta), path("*.html") , emit: html , optional: true + tuple val(meta), path("*.zip") , emit: zip , optional: true + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + // Calculate number of --cores for TrimGalore based on value of task.cpus + // See: https://github.com/FelixKrueger/TrimGalore/blob/master/Changelog.md#version-060-release-on-1-mar-2019 + // See: https://github.com/nf-core/atacseq/pull/65 + def cores = 1 + if (task.cpus) { + cores = (task.cpus as int) - 4 + if (meta.single_end) cores = (task.cpus as int) - 3 + if (cores < 1) cores = 1 + if (cores > 8) cores = 8 + } + + // Added soft-links to original fastqs for consistent naming in MultiQC + def prefix = task.ext.prefix ?: "${meta.id}" + if (meta.single_end) { + def args_list = args.split("\\s(?=--)").toList() + args_list.removeAll { it.toLowerCase().contains('_r2 ') } + """ + [ ! -f ${prefix}.fastq.gz ] && ln -s $reads ${prefix}.fastq.gz + trim_galore \\ + ${args_list.join(' ')} \\ + --cores $cores \\ + --gzip \\ + ${prefix}.fastq.gz + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + trimgalore: \$(echo \$(trim_galore --version 2>&1) | sed 's/^.*version //; s/Last.*\$//') + cutadapt: \$(cutadapt --version) + END_VERSIONS + """ + } else { + """ + [ ! -f ${prefix}_1.fastq.gz ] && ln -s ${reads[0]} ${prefix}_1.fastq.gz + [ ! 
-f ${prefix}_2.fastq.gz ] && ln -s ${reads[1]} ${prefix}_2.fastq.gz + trim_galore \\ + $args \\ + --cores $cores \\ + --paired \\ + --gzip \\ + ${prefix}_1.fastq.gz \\ + ${prefix}_2.fastq.gz + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + trimgalore: \$(echo \$(trim_galore --version 2>&1) | sed 's/^.*version //; s/Last.*\$//') + cutadapt: \$(cutadapt --version) + END_VERSIONS + """ + } +} diff --git a/modules/nf-core/trimgalore/meta.yml b/modules/nf-core/trimgalore/meta.yml new file mode 100644 index 00000000..e649088c --- /dev/null +++ b/modules/nf-core/trimgalore/meta.yml @@ -0,0 +1,68 @@ +name: trimgalore +description: Trim FastQ files using Trim Galore! +keywords: + - trimming + - adapters + - sequencing adapters + - fastq +tools: + - trimgalore: + description: | + A wrapper tool around Cutadapt and FastQC to consistently apply quality + and adapter trimming to FastQ files, with some extra functionality for + MspI-digested RRBS-type (Reduced Representation Bisufite-Seq) libraries. + homepage: https://www.bioinformatics.babraham.ac.uk/projects/trim_galore/ + documentation: https://github.com/FelixKrueger/TrimGalore/blob/master/Docs/Trim_Galore_User_Guide.md + licence: ["GPL-3.0-or-later"] +input: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - reads: + type: file + description: | + List of input FastQ files of size 1 and 2 for single-end and paired-end data, + respectively. +output: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - reads: + type: file + description: | + List of input adapter trimmed FastQ files of size 1 and 2 for + single-end and paired-end data, respectively. + pattern: "*{3prime,5prime,trimmed,val}*.fq.gz" + - unpaired: + type: file + description: | + FastQ files containing unpaired reads from read 1 or read 2 + pattern: "*unpaired*.fq.gz" + - html: + type: file + description: FastQC report (optional) + pattern: "*_{fastqc.html}" + - zip: + type: file + description: FastQC report archive (optional) + pattern: "*_{fastqc.zip}" + - log: + type: file + description: Trim Galore! 
trimming report + pattern: "*_{report.txt}" + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" +authors: + - "@drpatelh" + - "@ewels" + - "@FelixKrueger" +maintainers: + - "@drpatelh" + - "@ewels" + - "@FelixKrueger" diff --git a/modules/nf-core/trimgalore/tests/main.nf.test b/modules/nf-core/trimgalore/tests/main.nf.test new file mode 100644 index 00000000..3e3819b6 --- /dev/null +++ b/modules/nf-core/trimgalore/tests/main.nf.test @@ -0,0 +1,107 @@ +nextflow_process { + + name "Test Process TRIMGALORE" + script "../main.nf" + process "TRIMGALORE" + tag "modules" + tag "modules_nfcore" + tag "trimgalore" + + test("test_trimgalore_single_end") { + + when { + params { + outdir = "$outputDir" + } + process { + """ + input[0] = [ [ id:'test', single_end:true ], // meta map + [ file(params.modules_testdata_base_path + "genomics/sarscov2/illumina/fastq/test_1.fastq.gz", checkIfExists: true) ] + ] + """ + } + } + + then { + def read_lines = ["@ERR5069949.2151832 NS500628:121:HK3MMAFX2:2:21208:10793:15304/1", + "TCATAAACCAAAGCACTCACAGTGTCAACAATTTCAGCAGGACAACGCCGACAAGTTCCGAGGAACATGTCTGGACCTATAGTTTTCATAAGTCTACACACTGAATTGAAATATTCTGGTTCTAGTGTGCCCTTAGTTAGCAATGTGCGT", + "AAAAAAEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEAAEEEEAEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEAAEEEEE + { assert path(process.out.reads.get(0).get(1)).linesGzip.contains(read_line) } + } + }, + { report1_lines.each { report1_line -> + { assert path(process.out.log.get(0).get(1)).getText().contains(report1_line) } + } + } + ) + } + } + + test("test_trimgalore_paired_end") { + + when { + params { + outdir = "$outputDir" + } + process { + """ + input[0] = [ [ id:'test', single_end:false ], // meta map + [ + file(params.modules_testdata_base_path + "genomics/sarscov2/illumina/fastq/test_1.fastq.gz", checkIfExists: true), + file(params.modules_testdata_base_path + "genomics/sarscov2/illumina/fastq/test_2.fastq.gz", checkIfExists: true) + ] + ] + """ + } + } + + then { + def read1_lines = ["@ERR5069949.2151832 NS500628:121:HK3MMAFX2:2:21208:10793:15304/1", + "TCATAAACCAAAGCACTCACAGTGTCAACAATTTCAGCAGGACAACGCCGACAAGTTCCGAGGAACATGTCTGGACCTATAGTTTTCATAAGTCTACACACTGAATTGAAATATTCTGGTTCTAGTGTGCCCTTAGTTAGCAATGTGCGT", + "AAAAAAEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEAAEEEEAEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEAAEEEEE + { assert path(process.out.reads.get(0).get(1).get(0)).linesGzip.contains(read1_line) } + } + }, + { read2_lines.each { read2_line -> + { assert path(process.out.reads.get(0).get(1).get(1)).linesGzip.contains(read2_line) } + } + }, + { report1_lines.each { report1_line -> + { assert path(process.out.log.get(0).get(1).get(0)).getText().contains(report1_line) } + } + }, + { report2_lines.each { report2_line -> + { assert path(process.out.log.get(0).get(1).get(1)).getText().contains(report2_line) } + } + } + ) + } + } +} diff --git a/modules/nf-core/trimgalore/tests/main.nf.test.snap b/modules/nf-core/trimgalore/tests/main.nf.test.snap new file mode 100644 index 00000000..84feacca --- /dev/null +++ b/modules/nf-core/trimgalore/tests/main.nf.test.snap @@ -0,0 +1,148 @@ +{ + "test_trimgalore_single_end": { + "content": [ + { + "0": [ + [ + { + "id": "test", + "single_end": true + }, + "test_trimmed.fq.gz:md5,e0a7516b8ea8d6467d6306acb2cd13c4" + ] + ], + "1": [ + [ + { + "id": "test", + "single_end": true + }, + "test.fastq.gz_trimming_report.txt:md5,a1ab3958205f1ddf48af623242b5b429" + ] + ], + "2": [ + + ], + "3": [ + + ], + "4": [ + + ], + "5": [ + 
"versions.yml:md5,47d966cbb31c80eb8f7fe860d55659b7" + ], + "html": [ + + ], + "log": [ + [ + { + "id": "test", + "single_end": true + }, + "test.fastq.gz_trimming_report.txt:md5,a1ab3958205f1ddf48af623242b5b429" + ] + ], + "reads": [ + [ + { + "id": "test", + "single_end": true + }, + "test_trimmed.fq.gz:md5,e0a7516b8ea8d6467d6306acb2cd13c4" + ] + ], + "unpaired": [ + + ], + "versions": [ + "versions.yml:md5,47d966cbb31c80eb8f7fe860d55659b7" + ], + "zip": [ + + ] + } + ], + "timestamp": "2023-10-17T15:24:57.782141441" + }, + "test_trimgalore_paired_end": { + "content": [ + { + "0": [ + [ + { + "id": "test", + "single_end": false + }, + [ + "test_1_val_1.fq.gz:md5,e0a7516b8ea8d6467d6306acb2cd13c4", + "test_2_val_2.fq.gz:md5,f3d61189e6d10202da7b8686f1dbb71b" + ] + ] + ], + "1": [ + [ + { + "id": "test", + "single_end": false + }, + [ + "test_1.fastq.gz_trimming_report.txt:md5,315d40465412f9909bbaabf52269274d", + "test_2.fastq.gz_trimming_report.txt:md5,34436303da1c78811103427a2fb57f7b" + ] + ] + ], + "2": [ + + ], + "3": [ + + ], + "4": [ + + ], + "5": [ + "versions.yml:md5,47d966cbb31c80eb8f7fe860d55659b7" + ], + "html": [ + + ], + "log": [ + [ + { + "id": "test", + "single_end": false + }, + [ + "test_1.fastq.gz_trimming_report.txt:md5,315d40465412f9909bbaabf52269274d", + "test_2.fastq.gz_trimming_report.txt:md5,34436303da1c78811103427a2fb57f7b" + ] + ] + ], + "reads": [ + [ + { + "id": "test", + "single_end": false + }, + [ + "test_1_val_1.fq.gz:md5,e0a7516b8ea8d6467d6306acb2cd13c4", + "test_2_val_2.fq.gz:md5,f3d61189e6d10202da7b8686f1dbb71b" + ] + ] + ], + "unpaired": [ + + ], + "versions": [ + "versions.yml:md5,47d966cbb31c80eb8f7fe860d55659b7" + ], + "zip": [ + + ] + } + ], + "timestamp": "2023-10-17T15:25:08.513589909" + } +} \ No newline at end of file diff --git a/modules/nf-core/trimgalore/tests/tags.yml b/modules/nf-core/trimgalore/tests/tags.yml new file mode 100644 index 00000000..e9937691 --- /dev/null +++ b/modules/nf-core/trimgalore/tests/tags.yml @@ -0,0 +1,2 @@ +trimgalore: + - modules/nf-core/trimgalore/** diff --git a/nextflow.config b/nextflow.config index d23b3a81..266de3d6 100644 --- a/nextflow.config +++ b/nextflow.config @@ -1,7 +1,7 @@ /* -======================================================================================== +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ nf-core/metatdenovo Nextflow config file -======================================================================================== +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Default config options for all compute environments ---------------------------------------------------------------------------------------- */ @@ -9,58 +9,113 @@ // Global default params, used in configs params { - // TODO nf-core: Specify your pipeline's command line flags // Input options - input = null + input = null + assembly = null + genomes = null + protein_fasta = null + gff = null + se_reads = false - // References - genome = null - igenomes_base = 's3://ngi-igenomes/igenomes' - igenomes_ignore = false + // QC options + skip_qc = false + skip_fastqc = false + + // Trimming options + clip_r1 = null + clip_r2 = null + three_prime_clip_r1 = null + three_prime_clip_r2 = null + trim_nextseq = null + save_trimmed = false + skip_trimming = false + + // BBDuk options + sequence_filter = null + + // Digital normalization options + bbnorm = false + bbnorm_target = 100 + bbnorm_min = 5 + save_bbnorm_fastq = false + + // assembler option + 
assembler = 'megahit' + min_contig_length = 0 + + // Mapping options + save_samtools = false + save_bam = false + + // orf caller options + orf_caller = 'prodigal' + prodigal_trainingfile = null + prokka_batchsize = 10.MB + + // HMMSEARCH options + hmmdir = null + hmmpattern = '*.hmm' + hmmfiles = null + + // Eggnog options + eggnog_dbpath = 'eggnog' + skip_eggnog = false + + // KOfamscan options + skip_kofamscan = false + kofam_dir = './kofam/' + + // Eukulele options + eukulele_db = '' + skip_eukulele = false + eukulele_dbpath = './eukulele/' + eukulele_method = 'mets' // MultiQC options multiqc_config = null multiqc_title = null + multiqc_logo = null max_multiqc_email_size = '25.MB' + multiqc_methods_description = null // Boilerplate options outdir = './results' - tracedir = "${params.outdir}/pipeline_info" publish_dir_mode = 'copy' email = null email_on_fail = null plaintext_email = false monochrome_logs = false + hook_url = null help = false - validate_params = true - show_hidden_params = false - schema_ignore_params = 'genomes,modules' - enable_conda = false - singularity_pull_docker_container = false + version = false // Config options + config_profile_name = null + config_profile_description = null custom_config_version = 'master' custom_config_base = "https://raw.githubusercontent.com/nf-core/configs/${params.custom_config_version}" - hostnames = [:] - config_profile_description = null config_profile_contact = null config_profile_url = null - config_profile_name = null + // Max resource options // Defaults only, expecting to be overwritten - max_memory = '128.GB' - max_cpus = 16 - max_time = '240.h' + max_memory = '128.GB' + max_cpus = 16 + max_time = '240.h' + + // Schema validation default options + validationFailUnrecognisedParams = false + validationLenientMode = false + validationSchemaIgnoreParams = 'genomes,igenomes_base' + validationShowHiddenParams = false + validate_params = false } // Load base.config by default for all pipelines includeConfig 'conf/base.config' -// Load modules.config for DSL2 module specific options -includeConfig 'conf/modules.config' - // Load nf-core custom profiles from different Institutions try { includeConfig "${params.custom_config_base}/nfcore_custom.config" @@ -68,102 +123,178 @@ try { System.err.println("WARNING: Could not load nf-core/config profiles: ${params.custom_config_base}/nfcore_custom.config") } -// Load igenomes.config if required -if (!params.igenomes_ignore) { - includeConfig 'conf/igenomes.config' -} else { - params.genomes = [:] -} - +// Load nf-core/metatdenovo custom profiles from different institutions. +// Warning: Uncomment only if a pipeline-specific institutional config already exists on nf-core/configs! 
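As a hedged illustration of the new parameters introduced in the params block above (assembler, orf_caller, bbnorm and related options), a user-side configuration supplied with -c could override the defaults as sketched below; the chosen values are examples only, not recommendations from this changeset.

// Illustrative only: example user config overriding the new defaults.
params {
    assembler     = 'rnaspades'   // alternative to the 'megahit' default
    orf_caller    = 'prodigal'    // the default ORF caller
    bbnorm        = true          // enable digital normalization before assembly
    bbnorm_target = 50            // lower than the default of 100
}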
+// try { +// includeConfig "${params.custom_config_base}/pipeline/metatdenovo.config" +// } catch (Exception e) { +// System.err.println("WARNING: Could not load nf-core/config/metatdenovo profiles: ${params.custom_config_base}/pipeline/metatdenovo.config") +// } profiles { - debug { process.beforeScript = 'echo $HOSTNAME' } + debug { + dumpHashes = true + process.beforeScript = 'echo $HOSTNAME' + cleanup = false + nextflow.enable.configProcessNamesValidation = true + } conda { - params.enable_conda = true + conda.enabled = true docker.enabled = false singularity.enabled = false podman.enabled = false shifter.enabled = false charliecloud.enabled = false + channels = ['conda-forge', 'bioconda', 'defaults'] + apptainer.enabled = false + } + mamba { + conda.enabled = true + conda.useMamba = true + docker.enabled = false + singularity.enabled = false + podman.enabled = false + shifter.enabled = false + charliecloud.enabled = false + apptainer.enabled = false } docker { docker.enabled = true - docker.userEmulation = true + conda.enabled = false singularity.enabled = false podman.enabled = false shifter.enabled = false charliecloud.enabled = false + apptainer.enabled = false + docker.runOptions = '-u $(id -u):$(id -g)' + } + arm { + docker.runOptions = '-u $(id -u):$(id -g) --platform=linux/amd64' } singularity { singularity.enabled = true singularity.autoMounts = true + conda.enabled = false docker.enabled = false podman.enabled = false shifter.enabled = false charliecloud.enabled = false + apptainer.enabled = false } podman { podman.enabled = true + conda.enabled = false docker.enabled = false singularity.enabled = false shifter.enabled = false charliecloud.enabled = false + apptainer.enabled = false } shifter { shifter.enabled = true + conda.enabled = false docker.enabled = false singularity.enabled = false podman.enabled = false charliecloud.enabled = false + apptainer.enabled = false } charliecloud { charliecloud.enabled = true + conda.enabled = false + docker.enabled = false + singularity.enabled = false + podman.enabled = false + shifter.enabled = false + apptainer.enabled = false + } + apptainer { + apptainer.enabled = true + apptainer.autoMounts = true + conda.enabled = false docker.enabled = false singularity.enabled = false podman.enabled = false shifter.enabled = false + charliecloud.enabled = false + } + test { includeConfig 'conf/test.config' } + test_filter { includeConfig 'conf/test_filter.config' } + test_full { includeConfig 'conf/test_full.config' } + test_prokka { includeConfig 'conf/test_prokka.config' } + test_transdecoder { includeConfig 'conf/test_transdecoder.config' } + test_eggnog { includeConfig 'conf/test_eggnog.config' } + test_eukulele { includeConfig 'conf/test_eukulele.config' } + test_kofamscan { includeConfig 'conf/test_kofamscan.config' } + test_rnaspades { includeConfig 'conf/test_rnaspades.config' } + gitpod { + executor.name = 'local' + executor.cpus = 16 + executor.memory = 60.GB } - test { includeConfig 'conf/test.config' } - test_full { includeConfig 'conf/test_full.config' } +} + +// Set default registry for Apptainer, Docker, Podman and Singularity independent of -profile +// Will not be used unless Apptainer / Docker / Podman / Singularity are enabled +// Set to your registry if you have a mirror of containers +apptainer.registry = 'quay.io' +docker.registry = 'quay.io' +podman.registry = 'quay.io' +singularity.registry = 'quay.io' + +// Nextflow plugins +plugins { + id 'nf-validation@1.1.3' // Validation of pipeline parameters and creation of an 
input channel from a sample sheet } // Export these variables to prevent local Python/R libraries from conflicting with those in the container +// The JULIA depot path has been adjusted to a fixed path `/usr/local/share/julia` that needs to be used for packages in the container. +// See https://apeltzer.github.io/post/03-julia-lang-nextflow/ for details on that. Once we have a common agreement on where to keep Julia packages, this is adjustable. + env { PYTHONNOUSERSITE = 1 R_PROFILE_USER = "/.Rprofile" R_ENVIRON_USER = "/.Renviron" + JULIA_DEPOT_PATH = "/usr/local/share/julia" } // Capture exit codes from upstream processes when piping process.shell = ['/bin/bash', '-euo', 'pipefail'] +// Disable process selector warnings by default. Use debug profile to enable warnings. +nextflow.enable.configProcessNamesValidation = false + def trace_timestamp = new java.util.Date().format( 'yyyy-MM-dd_HH-mm-ss') timeline { enabled = true - file = "${params.tracedir}/execution_timeline_${trace_timestamp}.html" + file = "${params.outdir}/pipeline_info/execution_timeline_${trace_timestamp}.html" } report { enabled = true - file = "${params.tracedir}/execution_report_${trace_timestamp}.html" + file = "${params.outdir}/pipeline_info/execution_report_${trace_timestamp}.html" } trace { enabled = true - file = "${params.tracedir}/execution_trace_${trace_timestamp}.txt" + file = "${params.outdir}/pipeline_info/execution_trace_${trace_timestamp}.txt" } dag { enabled = true - file = "${params.tracedir}/pipeline_dag_${trace_timestamp}.svg" + file = "${params.outdir}/pipeline_info/pipeline_dag_${trace_timestamp}.html" } manifest { name = 'nf-core/metatdenovo' - author = 'Daniel Lundin' + author = """Danilo Di Leo, Emelie Nilsson & Daniel Lundin""" homePage = 'https://github.com/nf-core/metatdenovo' - description = 'Assembly and annotation of metatranscriptomic data, both prokaryotic and eukaryotic' + description = """Assembly and annotation of metatranscriptomic data, both prokaryotic and eukaryotic""" mainScript = 'main.nf' - nextflowVersion = '!>=21.04.0' - version = '1.0dev' + nextflowVersion = '!>=23.04.0' + version = '1.0.0' + doi = '' } +// Load modules.config for DSL2 module specific options +includeConfig 'conf/modules.config' + // Function to ensure that resource requirements don't go beyond // a maximum limit def check_max(obj, type) { diff --git a/nextflow_schema.json b/nextflow_schema.json index 4b4a194f..ed156e20 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -10,25 +10,29 @@ "type": "object", "fa_icon": "fas fa-terminal", "description": "Define where the pipeline should find input data and save output data.", - "required": [ - "input" - ], + "required": ["input", "outdir"], "properties": { "input": { "type": "string", "format": "file-path", + "exists": true, "mimetype": "text/csv", "pattern": "^\\S+\\.csv$", - "schema": "assets/schema_input.json", "description": "Path to comma-separated file containing information about the samples in the experiment.", - "help_text": "You will need to create a design file with information about the samples in your experiment before running the pipeline. Use this parameter to specify its location. It has to be a comma-separated file with 3 columns, and a header row. See [usage docs](https://nf-co.re/metatdenovo/usage#samplesheet-input).", + "help_text": "You will need to create a design file with information about the samples in your experiment before running the pipeline. Use this parameter to specify its location. 
It has to be a comma-separated file with 3 columns, and a header row.", "fa_icon": "fas fa-file-csv" }, + "se_reads": { + "type": "boolean", + "description": "activate when using single end reads input", + "fa_icon": "fas fa-long-arrow-alt-right" + }, "outdir": { "type": "string", - "description": "Path to the output directory where the results will be saved.", - "default": "./results", - "fa_icon": "fas fa-folder-open" + "format": "directory-path", + "description": "The output directory where the results will be saved. You have to use absolute paths to storage on Cloud infrastructure.", + "fa_icon": "fas fa-folder-open", + "default": "./results" }, "email": { "type": "string", @@ -44,43 +48,296 @@ } } }, - "reference_genome_options": { - "title": "Reference genome options", + "quality_control_options": { + "title": "Quality control options", "type": "object", - "fa_icon": "fas fa-dna", - "description": "Reference genome related files and options required for the workflow.", + "description": "Option for QC steps", + "default": "", + "fa_icon": "fas fa-check", "properties": { - "genome": { + "skip_qc": { + "type": "boolean", + "description": "Skip all QC steps except for MultiQC.", + "fa_icon": "fas fa-forward" + }, + "skip_fastqc": { + "type": "boolean", + "description": "Skip FastQC.", + "fa_icon": "fas fa-forward" + } + } + }, + "trimming_options": { + "title": "trimming options", + "type": "object", + "fa_icon": "fas fa-cut", + "properties": { + "clip_r1": { + "type": "string", + "description": "Instructs Trim Galore to remove bp from the 5' end of read 1 (or single-end reads).", + "fa_icon": "fas fa-cut" + }, + "clip_r2": { + "type": "string", + "description": "Instructs Trim Galore to remove bp from the 5' end of read 2 (paired-end reads only).", + "fa_icon": "fas fa-cut" + }, + "three_prime_clip_r1": { + "type": "string", + "description": "Instructs Trim Galore to remove bp from the 3' end of read 1 AFTER adapter/quality trimming has been performed.", + "fa_icon": "fas fa-cut" + }, + "three_prime_clip_r2": { "type": "string", - "description": "Name of iGenomes reference.", - "fa_icon": "fas fa-book", - "help_text": "If using a reference genome configured in the pipeline using iGenomes, use this parameter to give the ID for the reference. This is then used to build the full paths for all required reference genome files e.g. `--genome GRCh38`. \n\nSee the [nf-core website docs](https://nf-co.re/usage/reference_genomes) for more details." + "description": "Instructs Trim Galore to remove bp from the 3' end of read 2 AFTER adapter/quality trimming has been performed.", + "fa_icon": "fas fa-cut" + }, + "save_trimmed": { + "type": "boolean", + "description": "Save the trimmed FastQ files in the results directory.", + "help_text": "By default, trimmed FastQ files will not be saved to the results directory. Specify this flag (or set to true in your config file) to copy these files to the results directory when complete.", + "fa_icon": "fas fa-cut" }, - "fasta": { + "trim_nextseq": { + "type": "string", + "description": "Instructs Trim Galore to apply the --nextseq=X option, to trim based on quality after removing poly-G tails.", + "help_text": "This enables the option Cutadapt `--nextseq-trim=3'CUTOFF` option via Trim Galore, which will set a quality cutoff (that is normally given with -q instead), but qualities of G bases are ignored. 
This type of trimming is common on the NextSeq and NovaSeq platforms, where basecalls without any signal are called as high-quality G bases.", + "fa_icon": "fas fa-cut" + }, + "skip_trimming": { + "type": "boolean", + "description": "Skip the adapter trimming step.", + "help_text": "Use this if your input FastQ files have already been trimmed outside of the workflow or if you're very confident that there is no adapter contamination in your data.", + "fa_icon": "fas fa-forward" + } + }, + "description": "All the trimming options are listed below." + }, + "bbnorm_options": { + "title": "Filtering options", + "type": "object", + "description": "", + "default": "", + "properties": { + "sequence_filter": { + "type": "string", + "description": "Fasta file with sequences to filter out before assembly and downstream steps.", + "help_text": "Read sequences matching this file will be filtered out from samples with BBDuk before mapping. If no file is specified, BBDuk will not be run.", + "fa_icon": "fas fa-filter" + } + }, + "fa_icon": "fas fa-filter" + }, + "digital_normalization_options": { + "title": "Digital normalization options", + "type": "object", + "description": "Use these options if you need to normalize the reads before the assembly.", + "default": "", + "fa_icon": "fas fa-align-justify", + "properties": { + "bbnorm": { + "type": "boolean", + "description": "Perform normalization to reduce sequencing depth.", + "help_text": "Normalization is performed following the example in https://jgi.doe.gov/data-and-tools/software-tools/bbtools/bb-tools-user-guide/bbnorm-guide/", + "fa_icon": "fas fa-align-justify" + }, + "bbnorm_target": { + "type": "integer", + "default": 100, + "description": "Reduce the number of reads to reach an average assembly coverage of this number.", + "fa_icon": "fas fa-align-justify" + }, + "bbnorm_min": { + "type": "integer", + "default": 5, + "description": "Reads with an apparent depth below this value will be presumed to be errors and discarded.", + "fa_icon": "fas fa-align-justify" + }, + "save_bbnorm_fastq": { + "type": "boolean", + "description": "Save the resulting fastq files from normalization.", + "fa_icon": "fas fa-align-center" + } + } + }, + "assembler_options": { + "title": "Assembler options", + "type": "object", + "description": "", + "default": "", + "properties": { + "assembler": { + "type": "string", + "default": "megahit", + "enum": ["megahit", "rnaspades"], + "description": "Specify which assembler you would like to run. Possible alternatives: megahit, rnaspades. Default: megahit.", + "fa_icon": "fas fa-bezier-curve" + }, + "assembly": { "type": "string", "format": "file-path", + "exists": true, "mimetype": "text/plain", - "pattern": "^\\S+\\.fn?a(sta)?(\\.gz)?$", - "description": "Path to FASTA genome file.", - "help_text": "This parameter is *mandatory* if `--genome` is not specified. If you don't have a BWA index available this will be generated for you automatically. Combine with `--save_reference` to save BWA index for future runs.", + "description": "Path to a fasta file with a finished assembly. 
Assembly will be skipped by the pipeline.", "fa_icon": "far fa-file-code" }, - "igenomes_base": { + "min_contig_length": { + "type": "integer", + "default": 0, + "description": "Filter out contigs shorter than this.", + "fa_icon": "fas fa-align-justify" + } + }, + "fa_icon": "fas fa-bezier-curve" + }, + "mapping_options": { + "title": "Mapping options", + "type": "object", + "description": "", + "default": "", + "properties": { + "save_bam": { + "type": "boolean", + "description": "Save the bam files from mapping", + "fa_icon": "fas fa-align-center" + }, + "save_samtools": { + "type": "boolean", + "description": "Save the output from samtools" + } + }, + "fa_icon": "fas fa-filter" + }, + "orf_caller_options": { + "title": "Orf Caller options", + "type": "object", + "description": "", + "default": "", + "properties": { + "protein_fasta": { + "type": "string", + "fa_icon": "fas fa-file-code", + "description": "Path to a protein fasta file" + }, + "gff": { + "type": "string", + "fa_icon": "fas fa-file-code", + "description": "Path to a gff file" + }, + "orf_caller": { + "type": "string", + "default": "prodigal", + "enum": ["prodigal", "prokka", "transdecoder"], + "description": "Specify which ORF caller you would like to run, possible alternatives: prodigal, prokka, transdecoder, default: prodigal.", + "fa_icon": "fas fa-phone-volume" + }, + "prodigal_trainingfile": { + "type": "string", + "description": "Specify a training file for prodigal. By default prodigal will learn from the input sequences", + "fa_icon": "fas fa-phone-volume" + }, + "prokka_batchsize": { + "type": "string", + "description": "Size of individual files annotated by Prokka in one batch.", + "default": "10 MB", + "pattern": "^\\d+(\\.\\d+)?\\.?\\s*(K|M|G|T)?B$", + "hidden": true, + "help_text": "Prokka usually fails on very large input files. This parameter controls the size of smaller batches for which Prokka will be called. Should be a string in the format integer-unit e.g. `--prokka_batchsize '8.MB'`" + } + }, + "fa_icon": "fas fa-phone-volume" + }, + "functional_annotation_options": { + "title": "Functional annotation options", + "type": "object", + "description": "", + "default": "", + "properties": { + "skip_eggnog": { + "type": "boolean", + "default": false, + "description": "Skip EGGNOG functional annotation", + "fa_icon": "fas fa-forward" + }, + "eggnog_dbpath": { "type": "string", + "default": "eggnog", + "description": "Specify EGGNOG database path", + "fa_icon": "far fa-file-code", + "help_text": "This parameter specifies where you have an EGGNOG database, or, where it will be created using the --create_eggnog_db parameter. The directory must exist." + }, + "skip_kofamscan": { + "type": "boolean", + "default": false, + "description": "skip kofamscan run", + "fa_icon": "fas fa-forward" + }, + "kofam_dir": { + "type": "string", + "default": "./kofam/", + "description": "Path to a directory with KOfam files. 
Will be created if it doesn't exist.", + "help_text": "If a `ko_list` file and/or `profiles` does not exist, they will be downloaded.", + "fa_icon": "fas fa-folder-open" + }, + "hmmdir": { + "type": "string", + "description": "Directory with hmm files which will be searched for among ORFs", + "pattern": "^\\S+", "format": "directory-path", - "description": "Directory / URL base for iGenomes references.", - "default": "s3://ngi-igenomes/igenomes", - "fa_icon": "fas fa-cloud-download-alt", - "hidden": true + "fa_icon": "fas fa-folder-open" }, - "igenomes_ignore": { + "hmmfiles": { + "type": "string", + "description": "Comma-separated list of hmm files which will be searched for among ORFs", + "pattern": "\\S+hmm(\\.gz)?", + "format": "file-path", + "mimetype": "text/plain", + "fa_icon": "far fa-file-code" + }, + "hmmpattern": { + "type": "string", + "default": "*.hmm", + "description": "specify which pattern hmm files end with", + "fa_icon": "fas fa-barcode" + } + }, + "fa_icon": "fas fa-clipboard" + }, + "taxonomy_annotation_options": { + "title": "Taxonomy annotation options", + "type": "object", + "description": "", + "default": "", + "properties": { + "skip_eukulele": { "type": "boolean", - "description": "Do not load the iGenomes reference config.", - "fa_icon": "fas fa-ban", - "hidden": true, - "help_text": "Do not load `igenomes.config` when running the pipeline. You may choose this option if you observe clashes between custom parameters and those supplied in `igenomes.config`." + "default": false, + "description": "skip eukulele run", + "fa_icon": "fas fa-forward" + }, + "eukulele_method": { + "type": "string", + "default": "mets", + "description": "Specify which method to use for EUKulele. the alternatives are: mets (metatranscriptomics) or mags (Metagenome Assembled Genomes). default: mets", + "enum": ["mets", "mags"], + "fa_icon": "fas fa-bezier-curve" + }, + "eukulele_db": { + "type": "string", + "enum": ["gtdb", "phylodb", "marmmetsp", "mmetsp", "eukprot"], + "description": "EUKulele database.", + "help_text": "This option allows the user to specify which database (or set of databases, separated by comma) to use with EUKulele. Databases that are provided with EUKulele will be downloaded if not already present inside the database directory (see --eukulele_dbpath). Possible alternatives: phylodb, mmetsp, marmmetsp, eukprot. NB: you can't use this option with a custom database as eukulele will not recognize the name and it will start to download phylodb by default. If you want to use a custom database, please skip this option and specify only --eukulele_dbpath.", + "fa_icon": "far fa-file-code" + }, + "eukulele_dbpath": { + "type": "string", + "default": "./eukulele/", + "description": "EUKulele database folder.", + "help_text": "If this parameter is set, EUKulele will look for a database to use in this folder. If --eukulele_db also is set, the specified database will be searched for in this directory and if it is not present it will be downloaded. If a custom database (see EUKulele documentation) should be used, EUKulele will assume that it is present in this folder - N.B. 
only works with one custom database (if using a custom database, point to a directory that only contains that database).", + "fa_icon": "far fa-file-code" } - } + }, + "fa_icon": "fas fa-address-card" }, "institutional_config_options": { "title": "Institutional config options", @@ -104,12 +361,6 @@ "help_text": "If you're running offline, Nextflow will not be able to fetch the institutional config files from the internet. If you don't need them, then this is not a problem. If you do need them, you should download the files from the repo and tell Nextflow where to find them with this parameter.", "fa_icon": "fas fa-users-cog" }, - "hostnames": { - "type": "string", - "description": "Institutional configs hostname.", - "hidden": true, - "fa_icon": "fas fa-users-cog" - }, "config_profile_name": { "type": "string", "description": "Institutional config name.", @@ -139,7 +390,7 @@ "max_job_request_options": { "title": "Max job request options", "type": "object", - "fa_icon": "fab fa-acquisitions-incorporated", + "fa_icon": "fas fa-microchip", "description": "Set the top limit for requested resources for any single job.", "help_text": "If you are running on a smaller system, a pipeline step requesting more resources than are available may cause the Nextflow to stop the run with an error. These options allow you to cap the maximum resources requested by any single job so that the pipeline will run on your system.\n\nNote that you can not _increase_ the resources requested by any job using these options. For that you will need your own configuration file. See [the nf-core website](https://nf-co.re/usage/configuration) for details.", "properties": { @@ -165,7 +416,7 @@ "description": "Maximum amount of time that can be requested for any single job.", "default": "240.h", "fa_icon": "far fa-clock", - "pattern": "^(\\d+\\.?\\s*(s|m|h|day)\\s*)+$", + "pattern": "^(\\d+\\.?\\s*(s|m|h|d|day)\\s*)+$", "hidden": true, "help_text": "Use to set an upper-limit for the time requirement for each process. Should be a string in the format integer-unit e.g. `--max_time '2.h'`" } @@ -184,20 +435,19 @@ "fa_icon": "fas fa-question-circle", "hidden": true }, + "version": { + "type": "boolean", + "description": "Display version and exit.", + "fa_icon": "fas fa-question-circle", + "hidden": true + }, "publish_dir_mode": { "type": "string", "default": "copy", "description": "Method used to save pipeline results to output directory.", "help_text": "The Nextflow `publishDir` option specifies which intermediate files should be saved to the output directory. This option tells the pipeline what method should be used to move these files. See [Nextflow docs](https://www.nextflow.io/docs/latest/process.html#publishdir) for details.", "fa_icon": "fas fa-copy", - "enum": [ - "symlink", - "rellink", - "link", - "copy", - "copyNoFollow", - "move" - ], + "enum": ["symlink", "rellink", "link", "copy", "copyNoFollow", "move"], "hidden": true }, "email_on_fail": { @@ -228,45 +478,57 @@ "fa_icon": "fas fa-palette", "hidden": true }, + "hook_url": { + "type": "string", + "description": "Incoming hook URL for messaging service", + "fa_icon": "fas fa-people-group", + "help_text": "Incoming hook URL for messaging service. 
Currently, MS Teams and Slack are supported.", + "hidden": true + }, "multiqc_config": { "type": "string", + "format": "file-path", "description": "Custom config file to supply to MultiQC.", "fa_icon": "fas fa-cog", "hidden": true }, - "tracedir": { + "multiqc_logo": { "type": "string", - "description": "Directory to keep pipeline Nextflow logs and reports.", - "default": "${params.outdir}/pipeline_info", - "fa_icon": "fas fa-cogs", + "description": "Custom logo file to supply to MultiQC. File name must also be set in the MultiQC config file", + "fa_icon": "fas fa-image", "hidden": true }, + "multiqc_methods_description": { + "type": "string", + "description": "Custom MultiQC yaml file containing HTML including a methods description.", + "fa_icon": "fas fa-cog" + }, "validate_params": { "type": "boolean", "description": "Boolean whether to validate parameters against the schema at runtime", - "default": true, "fa_icon": "fas fa-check-square", "hidden": true }, - "show_hidden_params": { + "validationShowHiddenParams": { "type": "boolean", "fa_icon": "far fa-eye-slash", "description": "Show all params when using `--help`", "hidden": true, "help_text": "By default, parameters set as _hidden_ in the schema are not shown on the command line when a user runs with `--help`. Specifying this option will tell the pipeline to show all parameters." }, - "enable_conda": { + "validationFailUnrecognisedParams": { "type": "boolean", - "description": "Run this workflow with Conda. You can also use '-profile conda' instead of providing this parameter.", + "fa_icon": "far fa-check-circle", + "description": "Validation of parameters fails when an unrecognised parameter is found.", "hidden": true, - "fa_icon": "fas fa-bacon" + "help_text": "By default, when an unrecognised parameter is found, it returns a warning." }, - "singularity_pull_docker_container": { + "validationLenientMode": { "type": "boolean", - "description": "Instead of directly downloading Singularity images for use with Singularity, force the workflow to pull and convert Docker containers instead.", + "fa_icon": "far fa-check-circle", + "description": "Validation of parameters in lenient mode.", "hidden": true, - "fa_icon": "fas fa-toolbox", - "help_text": "This may be useful for example if you are unable to directly pull Singularity containers to run the pipeline due to http/https proxy issues." + "help_text": "Allows string values that are parseable as numbers or booleans. For further information see [JSONSchema docs](https://github.com/everit-org/json-schema#lenient-mode)." } } } @@ -276,7 +538,31 @@ "$ref": "#/definitions/input_output_options" }, { - "$ref": "#/definitions/reference_genome_options" + "$ref": "#/definitions/quality_control_options" + }, + { + "$ref": "#/definitions/trimming_options" + }, + { + "$ref": "#/definitions/bbnorm_options" + }, + { + "$ref": "#/definitions/digital_normalization_options" + }, + { + "$ref": "#/definitions/assembler_options" + }, + { + "$ref": "#/definitions/mapping_options" + }, + { + "$ref": "#/definitions/orf_caller_options" + }, + { + "$ref": "#/definitions/functional_annotation_options" + }, + { + "$ref": "#/definitions/taxonomy_annotation_options" }, { "$ref": "#/definitions/institutional_config_options" diff --git a/pyproject.toml b/pyproject.toml new file mode 100644 index 00000000..7d08e1c8 --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,13 @@ +# Config file for Python. Mostly used to configure linting of bin/*.py with Ruff. 
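+# Note: the "select" list below enables isort (I), pycodestyle error (E1/E4/E7/E9), Pyflakes (F), pyupgrade (UP) and pep8-naming (N) rule sets.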
+# Should be kept the same as nf-core/tools to avoid fighting with template synchronisation. +[tool.ruff] +line-length = 120 +target-version = "py38" +select = ["I", "E1", "E4", "E7", "E9", "F", "UP", "N"] +cache-dir = "~/.cache/ruff" + +[tool.ruff.isort] +known-first-party = ["nf_core"] + +[tool.ruff.per-file-ignores] +"__init__.py" = ["E402", "F401"] diff --git a/subworkflows/local/eggnog.nf b/subworkflows/local/eggnog.nf new file mode 100644 index 00000000..6dc19697 --- /dev/null +++ b/subworkflows/local/eggnog.nf @@ -0,0 +1,38 @@ +// +// Run eggnog-mapper on called ORFs, first optionally downloading the required databases +// + +include { EGGNOG_DOWNLOAD } from '../../modules/local/eggnog/download' +include { EGGNOG_MAPPER } from '../../modules/local/eggnog/mapper' +include { EGGNOG_SUM } from '../../modules/local/eggnog/sum' + +workflow EGGNOG { + take: + faa + collect_fcs + + main: + ch_versions = Channel.empty() + + EGGNOG_DOWNLOAD() + ch_versions = ch_versions.mix ( EGGNOG_DOWNLOAD.out.versions ) + + EGGNOG_DOWNLOAD.out.eggnog_db + .combine(EGGNOG_DOWNLOAD.out.dmnd) + .combine(EGGNOG_DOWNLOAD.out.taxa_db) + .combine(EGGNOG_DOWNLOAD.out.pkl) + .set{ ch_eggnog_database } + + EGGNOG_MAPPER ( faa, ch_eggnog_database ) + ch_versions = ch_versions.mix ( EGGNOG_MAPPER.out.versions ) + + EGGNOG_SUM ( EGGNOG_MAPPER.out.emappertsv, collect_fcs ) + ch_versions = ch_versions.mix ( EGGNOG_SUM.out.versions ) + + emit: + hits = EGGNOG_MAPPER.out.hits + emappertsv = EGGNOG_MAPPER.out.emappertsv + sumtable = EGGNOG_SUM.out.eggnog_summary + versions = ch_versions + +} diff --git a/subworkflows/local/eukulele.nf b/subworkflows/local/eukulele.nf new file mode 100644 index 00000000..70408b4c --- /dev/null +++ b/subworkflows/local/eukulele.nf @@ -0,0 +1,38 @@ +// +// Run EUKULELE on protein fasta from orf_caller output +// + +include { EUKULELE_SEARCH } from '../../modules/local/eukulele/search' +include { FORMAT_TAX } from '../../modules/local/format_tax' +include { SUM_TAXONOMY } from '../../modules/local/sum_taxonomy' + +workflow SUB_EUKULELE { + + take: + eukulele // Channel: val(meta), path(fasta), val(database), path(directory) + feature_counts + + main: + ch_versions = Channel.empty() + + EUKULELE_SEARCH( eukulele ) + ch_versions = ch_versions.mix ( EUKULELE_SEARCH.out.versions ) + + FORMAT_TAX( EUKULELE_SEARCH.out.taxonomy_estimation.map { meta, taxonomy, dbname -> [ meta, taxonomy ] } ) + ch_versions = ch_versions.mix ( FORMAT_TAX.out.versions ) + + FORMAT_TAX.out.tax + .join(eukulele) + .map { meta, taxonomy, protein, dbname, database -> [ meta, dbname, taxonomy ] } + .set { ch_sum_taxonomy } + + SUM_TAXONOMY ( ch_sum_taxonomy, feature_counts ) + ch_versions = ch_versions.mix ( SUM_TAXONOMY.out.versions ) + + emit: + taxonomy_summary = SUM_TAXONOMY.out.taxonomy_summary + taxonomy_estimation = EUKULELE_SEARCH.out.taxonomy_estimation + taxonomy_counts = EUKULELE_SEARCH.out.taxonomy_counts + diamond = EUKULELE_SEARCH.out.diamond + versions = ch_versions +} diff --git a/subworkflows/local/fastqc_trimgalore.nf b/subworkflows/local/fastqc_trimgalore.nf new file mode 100644 index 00000000..5b06f5b6 --- /dev/null +++ b/subworkflows/local/fastqc_trimgalore.nf @@ -0,0 +1,46 @@ +// +// Read QC, UMI extraction and trimming +// + +include { FASTQC } from '../../modules/nf-core/fastqc/main' +include { TRIMGALORE } from '../../modules/nf-core/trimgalore/main' + +workflow FASTQC_TRIMGALORE { + take: + reads // channel: [ val(meta), [ reads ] ] + skip_fastqc // boolean: true/false + skip_trimming // boolean: 
true/false + + main: + ch_versions = Channel.empty() + fastqc_html = Channel.empty() + fastqc_zip = Channel.empty() + + if (!skip_fastqc) { + FASTQC ( reads ).html.set { fastqc_html } + fastqc_zip = FASTQC.out.zip + ch_versions = ch_versions.mix(FASTQC.out.versions.first()) + } + + trim_reads = reads + trim_html = Channel.empty() + trim_zip = Channel.empty() + trim_log = Channel.empty() + + if (!skip_trimming) { + TRIMGALORE ( reads ).reads.set { trim_reads } + trim_html = TRIMGALORE.out.html + trim_zip = TRIMGALORE.out.zip + trim_log = TRIMGALORE.out.log + ch_versions = ch_versions.mix(TRIMGALORE.out.versions.first()) + } + + emit: + reads = trim_reads // channel: [ val(meta), [ reads ] ] + fastqc_html // channel: [ val(meta), [ html ] ] + fastqc_zip // channel: [ val(meta), [ zip ] ] + trim_html // channel: [ val(meta), [ html ] ] + trim_zip // channel: [ val(meta), [ zip ] ] + trim_log // channel: [ val(meta), [ txt ] ] + versions = ch_versions // channel: [ versions.yml ] +} diff --git a/subworkflows/local/hmmclassify.nf b/subworkflows/local/hmmclassify.nf new file mode 100644 index 00000000..c7d15fb1 --- /dev/null +++ b/subworkflows/local/hmmclassify.nf @@ -0,0 +1,32 @@ +include { HMMER_HMMSEARCH } from '../../modules/nf-core/hmmer/hmmsearch/main' +include { HMMRANK } from '../../modules/local/hmmrank' + +workflow HMMCLASSIFY { + + take: + ch_hmmclassify // channel: [ val(meta), [ hmm, aa_fasta ] ] + + main: + ch_versions = Channel.empty() + + HMMER_HMMSEARCH ( + ch_hmmclassify + .map { meta, hmm, seqdb -> [ [ id: "${meta.id}.${hmm.baseName}" ], hmm, seqdb, false, true, false ] } + ) + ch_versions = ch_versions.mix(HMMER_HMMSEARCH.out.versions.first()) + + HMMRANK ( + ch_hmmclassify + .map { meta, hmm, seqdb -> meta } + .distinct() + .combine ( HMMER_HMMSEARCH.out.target_summary.collect { meta, summary -> summary } ) + .map { [ it[0], it[1..-1] ] } + ) + ch_versions = ch_versions.mix(HMMRANK.out.versions) + + emit: + HMMRANK.out.hmmrank + + versions = ch_versions // channel: [ versions.yml ] +} + diff --git a/subworkflows/local/input_check.nf b/subworkflows/local/input_check.nf index c1b07196..4f0919c2 100644 --- a/subworkflows/local/input_check.nf +++ b/subworkflows/local/input_check.nf @@ -2,9 +2,7 @@ // Check input samplesheet and get read channels // -params.options = [:] - -include { SAMPLESHEET_CHECK } from '../../modules/local/samplesheet_check' addParams( options: params.options ) +include { SAMPLESHEET_CHECK } from '../../modules/local/samplesheet_check' workflow INPUT_CHECK { take: @@ -14,7 +12,7 @@ workflow INPUT_CHECK { SAMPLESHEET_CHECK ( samplesheet ) .csv .splitCsv ( header:true, sep:',' ) - .map { create_fastq_channels(it) } + .map { create_fastq_channel(it) } .set { reads } emit: @@ -23,22 +21,24 @@ workflow INPUT_CHECK { } // Function to get list of [ meta, [ fastq_1, fastq_2 ] ] -def create_fastq_channels(LinkedHashMap row) { +def create_fastq_channel(LinkedHashMap row) { + // create meta map def meta = [:] - meta.id = row.sample - meta.single_end = row.single_end.toBoolean() + meta.id = row.sample + meta.single_end = row.single_end.toBoolean() - def array = [] + // add path(s) of the fastq file(s) to the meta map + def fastq_meta = [] if (!file(row.fastq_1).exists()) { - exit 1, "ERROR: Please check input samplesheet -> Read 1 FastQ file does not exist!\n${row.fastq_1}" + error "ERROR: Please check input samplesheet -> Read 1 FastQ file does not exist!\n${row.fastq_1}" } if (meta.single_end) { - array = [ meta, [ file(row.fastq_1) ] ] + fastq_meta = [ meta, [ 
file(row.fastq_1) ] ] } else { if (!file(row.fastq_2).exists()) { - exit 1, "ERROR: Please check input samplesheet -> Read 2 FastQ file does not exist!\n${row.fastq_2}" + error "ERROR: Please check input samplesheet -> Read 2 FastQ file does not exist!\n${row.fastq_2}" } - array = [ meta, [ file(row.fastq_1), file(row.fastq_2) ] ] + fastq_meta = [ meta, [ file(row.fastq_1), file(row.fastq_2) ] ] } - return array + return fastq_meta } diff --git a/subworkflows/local/kofamscan.nf b/subworkflows/local/kofamscan.nf new file mode 100644 index 00000000..c1e8cdc7 --- /dev/null +++ b/subworkflows/local/kofamscan.nf @@ -0,0 +1,31 @@ +// +// Run KOFAMSCAN on protein fasta from orf_caller output +// + +include { KOFAMSCAN_SCAN } from '../../modules/local/kofamscan/scan' +include { KOFAMSCAN_DOWNLOAD } from '../../modules/local/kofamscan/download' +include { SUM_KOFAMSCAN } from '../../modules/local/sum_kofamscan' + +workflow KOFAMSCAN { + + take: + kofamscan // Channel: val(meta), path(fasta) + fcs // featureCounts output + + main: + ch_versions = Channel.empty() + + KOFAMSCAN_DOWNLOAD() + ch_versions = ch_versions.mix ( KOFAMSCAN_DOWNLOAD.out.versions ) + + KOFAMSCAN_SCAN( kofamscan, KOFAMSCAN_DOWNLOAD.out.ko_list, KOFAMSCAN_DOWNLOAD.out.koprofiles ) + ch_versions = ch_versions.mix(KOFAMSCAN_SCAN.out.versions) + + SUM_KOFAMSCAN( KOFAMSCAN_SCAN.out.kout, fcs ) + ch_versions = ch_versions.mix(SUM_KOFAMSCAN.out.versions) + + emit: + kofam_table_out = KOFAMSCAN_SCAN.out.kout + kofamscan_summary = SUM_KOFAMSCAN.out.kofamscan_summary + versions = ch_versions +} diff --git a/subworkflows/local/prodigal.nf b/subworkflows/local/prodigal.nf new file mode 100644 index 00000000..6f94b040 --- /dev/null +++ b/subworkflows/local/prodigal.nf @@ -0,0 +1,25 @@ +// +// Run prodigal as orf caller then generate nice format for gff +// + +include { PRODIGAL as PRODIGAL_MODULE } from '../../modules/nf-core/prodigal/main' +include { FORMAT_PRODIGAL_GFF } from '../../modules/local/format_prodigal' + +workflow PRODIGAL { + take: + fastafile + + main: + ch_versions = Channel.empty() + + PRODIGAL_MODULE ( fastafile, 'gff' ) + ch_versions = ch_versions.mix(PRODIGAL_MODULE.out.versions) + FORMAT_PRODIGAL_GFF ( PRODIGAL_MODULE.out.gene_annotations ) + ch_versions = ch_versions.mix(FORMAT_PRODIGAL_GFF.out.versions) + + emit: + faa = PRODIGAL_MODULE.out.amino_acid_fasta + gff = FORMAT_PRODIGAL_GFF.out.format_gff + versions = ch_versions + +} diff --git a/subworkflows/local/prokka_subsets.nf b/subworkflows/local/prokka_subsets.nf new file mode 100644 index 00000000..c8c2d600 --- /dev/null +++ b/subworkflows/local/prokka_subsets.nf @@ -0,0 +1,51 @@ +// +// Run PROKKA on contigs that are split by size, then concatenate output and gunzip it +// + +include { PROKKA } from '../../modules/nf-core/prokka/main' +include { CAT_CAT as GFF_CAT } from '../../modules/nf-core/cat/cat/main' +include { CAT_CAT as FAA_CAT } from '../../modules/nf-core/cat/cat/main' +include { CAT_CAT as FFN_CAT } from '../../modules/nf-core/cat/cat/main' +include { PROKKAGFF2TSV } from '../../modules/local/prokkagff2tsv' + +workflow PROKKA_SUBSETS { + take: + contigs // channel: tuple val(meta), file(contigs) + batchsize // channel: strings like '10.MB'. 
Usually from params.prokka_batchsize + + main: + ch_versions = Channel.empty() + + PROKKA ( contigs.map{ meta, contigs -> contigs }.splitFasta(size: batchsize, file: true).map { contigs -> [ [ id: contigs.getBaseName() ], contigs] }, [], [] ) + ch_versions = ch_versions.mix(PROKKA.out.versions) + ch_log = PROKKA.out.txt.map { meta, log -> log }.collect() + contigs.map{ meta, contigs -> [ id:"${meta.id}.prokka" ] } + .combine(PROKKA.out.gff.collect { meta, gff -> gff }.map { [ it ] }) + .set { ch_gff } + GFF_CAT ( ch_gff ) + ch_versions = ch_versions.mix(GFF_CAT.out.versions) + + contigs.map{ meta, contigs -> [ id:"${meta.id}.prokka" ] } + .combine(PROKKA.out.faa.collect { meta, protein -> protein }.map { [ it ] }) + .set { ch_faa } + FAA_CAT ( ch_faa ) + ch_versions = ch_versions.mix(FAA_CAT.out.versions) + + contigs.map{ meta, contigs -> [ id:"${meta.id}.prokka" ] } + .combine(PROKKA.out.ffn.collect { meta, fnn -> fnn }.map { [ it ] }) + .set { ch_ffn } + FFN_CAT ( ch_ffn ) + ch_versions = ch_versions.mix(FFN_CAT.out.versions) + + PROKKAGFF2TSV ( GFF_CAT.out.file_out) + ch_versions = ch_versions.mix(PROKKAGFF2TSV.out.versions) + + emit: + gff = GFF_CAT.out.file_out + faa = FAA_CAT.out.file_out + ffn = FFN_CAT.out.file_out + gfftsv = PROKKAGFF2TSV.out.tsv + prokka_log = ch_log + versions = ch_versions + +} diff --git a/subworkflows/local/transdecoder.nf b/subworkflows/local/transdecoder.nf new file mode 100644 index 00000000..d5aab6ea --- /dev/null +++ b/subworkflows/local/transdecoder.nf @@ -0,0 +1,28 @@ +// +// TRANSDECODER SUBWORKFLOW +// + +include { TRANSDECODER_LONGORF as LONGORF } from '../../modules/nf-core/transdecoder/longorf/main' +include { TRANSDECODER_PREDICT as PREDICT } from '../../modules/nf-core/transdecoder/predict/main' + +workflow TRANSDECODER { + take: + contigs // channel: [ val(meta), [ contigs ] ] + + main: + ch_versions = Channel.empty() + + LONGORF (contigs) + ch_versions = ch_versions.mix(LONGORF.out.versions) + PREDICT (contigs, LONGORF.out.folder) + ch_versions = ch_versions.mix(PREDICT.out.versions) + + emit: + gff = PREDICT.out.gff3 + cds = PREDICT.out.cds + pep = PREDICT.out.pep + bed = PREDICT.out.bed + + versions = ch_versions + +} diff --git a/subworkflows/nf-core/bam_sort_stats_samtools/main.nf b/subworkflows/nf-core/bam_sort_stats_samtools/main.nf new file mode 100644 index 00000000..fc1c652b --- /dev/null +++ b/subworkflows/nf-core/bam_sort_stats_samtools/main.nf @@ -0,0 +1,50 @@ +// +// Sort, index BAM file and run samtools stats, flagstat and idxstats +// + +include { SAMTOOLS_SORT } from '../../../modules/nf-core/samtools/sort/main' +include { SAMTOOLS_INDEX } from '../../../modules/nf-core/samtools/index/main' +include { BAM_STATS_SAMTOOLS } from '../bam_stats_samtools/main' + +workflow BAM_SORT_STATS_SAMTOOLS { + take: + ch_bam // channel: [ val(meta), [ bam ] ] + ch_fasta // channel: [ val(meta), path(fasta) ] + + main: + + ch_versions = Channel.empty() + + SAMTOOLS_SORT ( ch_bam ) + ch_versions = ch_versions.mix(SAMTOOLS_SORT.out.versions.first()) + + SAMTOOLS_INDEX ( SAMTOOLS_SORT.out.bam ) + ch_versions = ch_versions.mix(SAMTOOLS_INDEX.out.versions.first()) + + SAMTOOLS_SORT.out.bam + .join(SAMTOOLS_INDEX.out.bai, by: [0], remainder: true) + .join(SAMTOOLS_INDEX.out.csi, by: [0], remainder: true) + .map { + meta, bam, bai, csi -> + if (bai) { + [ meta, bam, bai ] + } else { + [ meta, bam, csi ] + } + } + .set { ch_bam_bai } + + BAM_STATS_SAMTOOLS ( ch_bam_bai, ch_fasta ) + ch_versions = ch_versions.mix(BAM_STATS_SAMTOOLS.out.versions) + + 
emit: + bam = SAMTOOLS_SORT.out.bam // channel: [ val(meta), [ bam ] ] + bai = SAMTOOLS_INDEX.out.bai // channel: [ val(meta), [ bai ] ] + csi = SAMTOOLS_INDEX.out.csi // channel: [ val(meta), [ csi ] ] + + stats = BAM_STATS_SAMTOOLS.out.stats // channel: [ val(meta), [ stats ] ] + flagstat = BAM_STATS_SAMTOOLS.out.flagstat // channel: [ val(meta), [ flagstat ] ] + idxstats = BAM_STATS_SAMTOOLS.out.idxstats // channel: [ val(meta), [ idxstats ] ] + + versions = ch_versions // channel: [ versions.yml ] +} diff --git a/subworkflows/nf-core/bam_sort_stats_samtools/meta.yml b/subworkflows/nf-core/bam_sort_stats_samtools/meta.yml new file mode 100644 index 00000000..e01f9ccf --- /dev/null +++ b/subworkflows/nf-core/bam_sort_stats_samtools/meta.yml @@ -0,0 +1,70 @@ +# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/subworkflows/yaml-schema.json +name: bam_sort_stats_samtools +description: Sort SAM/BAM/CRAM file +keywords: + - sort + - bam + - sam + - cram +components: + - samtools/sort + - samtools/index + - samtools/stats + - samtools/idxstats + - samtools/flagstat + - bam_stats_samtools +input: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - bam: + type: file + description: BAM/CRAM/SAM file + pattern: "*.{bam,cram,sam}" + - fasta: + type: file + description: Reference genome fasta file + pattern: "*.{fasta,fa}" +# TODO Update when we decide on a standard for subworkflow docs +output: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - bam: + type: file + description: Sorted BAM/CRAM/SAM file + pattern: "*.{bam,cram,sam}" + - bai: + type: file + description: BAM/CRAM/SAM index file + pattern: "*.{bai,crai,sai}" + - crai: + type: file + description: BAM/CRAM/SAM index file + pattern: "*.{bai,crai,sai}" + - stats: + type: file + description: File containing samtools stats output + pattern: "*.{stats}" + - flagstat: + type: file + description: File containing samtools flagstat output + pattern: "*.{flagstat}" + - idxstats: + type: file + description: File containing samtools idxstats output + pattern: "*.{idxstats}" + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" +authors: + - "@drpatelh" + - "@ewels" +maintainers: + - "@drpatelh" + - "@ewels" diff --git a/subworkflows/nf-core/bam_sort_stats_samtools/tests/main.nf.test b/subworkflows/nf-core/bam_sort_stats_samtools/tests/main.nf.test new file mode 100644 index 00000000..75b5b934 --- /dev/null +++ b/subworkflows/nf-core/bam_sort_stats_samtools/tests/main.nf.test @@ -0,0 +1,82 @@ +nextflow_workflow { + + name "Test Workflow BAM_SORT_STATS_SAMTOOLS" + script "../main.nf" + workflow "BAM_SORT_STATS_SAMTOOLS" + tag "subworkflows" + tag "subworkflows_nfcore" + tag "subworkflows/bam_sort_stats_samtools" + tag "bam_sort_stats_samtools" + tag "subworkflows/bam_stats_samtools" + tag "bam_stats_samtools" + tag "samtools" + tag "samtools/index" + tag "samtools/sort" + tag "samtools/stats" + tag "samtools/idxstats" + tag "samtools/flagstat" + + test("test_bam_sort_stats_samtools_single_end") { + + when { + params { + outdir = "$outputDir" + } + workflow { + """ + input[0] = Channel.of([ + [ id:'test', single_end:false ], // meta map + file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/bam/test.single_end.bam', checkIfExists: true) + ]) + input[1] = Channel.of([ + [ id:'genome' ], + 
file(params.modules_testdata_base_path + 'genomics/sarscov2/genome/genome.fasta', checkIfExists: true) + ]) + """ + } + } + + then { + assertAll( + { assert workflow.success}, + { assert workflow.out.bam.get(0).get(1) ==~ ".*.bam"}, + { assert workflow.out.bai.get(0).get(1) ==~ ".*.bai"}, + { assert snapshot(workflow.out.stats).match("test_bam_sort_stats_samtools_single_end_stats") }, + { assert snapshot(workflow.out.flagstat).match("test_bam_sort_stats_samtools_single_end_flagstats") }, + { assert snapshot(workflow.out.idxstats).match("test_bam_sort_stats_samtools_single_end_idxstats") } + ) + } + } + + test("test_bam_sort_stats_samtools_paired_end") { + + when { + params { + outdir = "$outputDir" + } + workflow { + """ + input[0] = Channel.of([ + [ id:'test', single_end:false ], // meta map + file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/bam/test.paired_end.bam', checkIfExists: true) + ]) + input[1] = Channel.of([ + [ id:'genome' ], + file(params.modules_testdata_base_path + 'genomics/sarscov2/genome/genome.fasta', checkIfExists: true) + ]) + """ + } + } + + then { + assertAll( + { assert workflow.success}, + { assert workflow.out.bam.get(0).get(1) ==~ ".*.bam"}, + { assert workflow.out.bai.get(0).get(1) ==~ ".*.bai"}, + { assert snapshot(workflow.out.stats).match("test_bam_sort_stats_samtools_paired_end_stats") }, + { assert snapshot(workflow.out.flagstat).match("test_bam_sort_stats_samtools_paired_end_flagstats") }, + { assert snapshot(workflow.out.idxstats).match("test_bam_sort_stats_samtools_paired_end_idxstats") } + ) + } + } +} diff --git a/subworkflows/nf-core/bam_sort_stats_samtools/tests/main.nf.test.snap b/subworkflows/nf-core/bam_sort_stats_samtools/tests/main.nf.test.snap new file mode 100644 index 00000000..6645a092 --- /dev/null +++ b/subworkflows/nf-core/bam_sort_stats_samtools/tests/main.nf.test.snap @@ -0,0 +1,110 @@ +{ + "test_bam_sort_stats_samtools_paired_end_flagstats": { + "content": [ + [ + [ + { + "id": "test", + "single_end": false + }, + "test.flagstat:md5,4f7ffd1e6a5e85524d443209ac97d783" + ] + ] + ], + "meta": { + "nf-test": "0.8.4", + "nextflow": "24.01.0" + }, + "timestamp": "2023-10-22T20:25:03.687121177" + }, + "test_bam_sort_stats_samtools_paired_end_idxstats": { + "content": [ + [ + [ + { + "id": "test", + "single_end": false + }, + "test.idxstats:md5,df60a8c8d6621100d05178c93fb053a2" + ] + ] + ], + "meta": { + "nf-test": "0.8.4", + "nextflow": "24.01.0" + }, + "timestamp": "2023-10-22T20:25:03.709648916" + }, + "test_bam_sort_stats_samtools_single_end_stats": { + "content": [ + [ + [ + { + "id": "test", + "single_end": false + }, + "test.stats:md5,cb0bf2b79de52fdf0c61e80efcdb0bb4" + ] + ] + ], + "meta": { + "nf-test": "0.8.4", + "nextflow": "24.01.0" + }, + "timestamp": "2024-02-13T16:44:38.553256801" + }, + "test_bam_sort_stats_samtools_paired_end_stats": { + "content": [ + [ + [ + { + "id": "test", + "single_end": false + }, + "test.stats:md5,d7796222a087b9bb97f631f1c21b9c95" + ] + ] + ], + "meta": { + "nf-test": "0.8.4", + "nextflow": "24.01.0" + }, + "timestamp": "2024-02-13T16:44:48.355870518" + }, + "test_bam_sort_stats_samtools_single_end_idxstats": { + "content": [ + [ + [ + { + "id": "test", + "single_end": false + }, + "test.idxstats:md5,613e048487662c694aa4a2f73ca96a20" + ] + ] + ], + "meta": { + "nf-test": "0.8.4", + "nextflow": "24.01.0" + }, + "timestamp": "2024-01-18T17:10:02.84631" + }, + "test_bam_sort_stats_samtools_single_end_flagstats": { + "content": [ + [ + [ + { + "id": "test", + "single_end": false + }, + 
"test.flagstat:md5,2191911d72575a2358b08b1df64ccb53" + ] + ] + ], + "meta": { + "nf-test": "0.8.4", + "nextflow": "24.01.0" + }, + "timestamp": "2024-01-18T17:10:02.829756" + } +} \ No newline at end of file diff --git a/subworkflows/nf-core/bam_sort_stats_samtools/tests/tags.yml b/subworkflows/nf-core/bam_sort_stats_samtools/tests/tags.yml new file mode 100644 index 00000000..30b69d6a --- /dev/null +++ b/subworkflows/nf-core/bam_sort_stats_samtools/tests/tags.yml @@ -0,0 +1,2 @@ +subworkflows/bam_sort_stats_samtools: + - subworkflows/nf-core/bam_sort_stats_samtools/** diff --git a/subworkflows/nf-core/bam_stats_samtools/main.nf b/subworkflows/nf-core/bam_stats_samtools/main.nf new file mode 100644 index 00000000..44d4c010 --- /dev/null +++ b/subworkflows/nf-core/bam_stats_samtools/main.nf @@ -0,0 +1,32 @@ +// +// Run SAMtools stats, flagstat and idxstats +// + +include { SAMTOOLS_STATS } from '../../../modules/nf-core/samtools/stats/main' +include { SAMTOOLS_IDXSTATS } from '../../../modules/nf-core/samtools/idxstats/main' +include { SAMTOOLS_FLAGSTAT } from '../../../modules/nf-core/samtools/flagstat/main' + +workflow BAM_STATS_SAMTOOLS { + take: + ch_bam_bai // channel: [ val(meta), path(bam), path(bai) ] + ch_fasta // channel: [ val(meta), path(fasta) ] + + main: + ch_versions = Channel.empty() + + SAMTOOLS_STATS ( ch_bam_bai, ch_fasta ) + ch_versions = ch_versions.mix(SAMTOOLS_STATS.out.versions) + + SAMTOOLS_FLAGSTAT ( ch_bam_bai ) + ch_versions = ch_versions.mix(SAMTOOLS_FLAGSTAT.out.versions) + + SAMTOOLS_IDXSTATS ( ch_bam_bai ) + ch_versions = ch_versions.mix(SAMTOOLS_IDXSTATS.out.versions) + + emit: + stats = SAMTOOLS_STATS.out.stats // channel: [ val(meta), path(stats) ] + flagstat = SAMTOOLS_FLAGSTAT.out.flagstat // channel: [ val(meta), path(flagstat) ] + idxstats = SAMTOOLS_IDXSTATS.out.idxstats // channel: [ val(meta), path(idxstats) ] + + versions = ch_versions // channel: [ path(versions.yml) ] +} diff --git a/subworkflows/nf-core/bam_stats_samtools/meta.yml b/subworkflows/nf-core/bam_stats_samtools/meta.yml new file mode 100644 index 00000000..809bf736 --- /dev/null +++ b/subworkflows/nf-core/bam_stats_samtools/meta.yml @@ -0,0 +1,43 @@ +# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/subworkflows/yaml-schema.json +name: bam_stats_samtools +description: Produces comprehensive statistics from SAM/BAM/CRAM file +keywords: + - statistics + - counts + - bam + - sam + - cram +components: + - samtools/stats + - samtools/idxstats + - samtools/flagstat +input: + - ch_bam_bai: + description: | + The input channel containing the BAM/CRAM and it's index + Structure: [ val(meta), path(bam), path(bai) ] + - ch_fasta: + description: | + Reference genome fasta file + Structure: [ path(fasta) ] +output: + - stats: + description: | + File containing samtools stats output + Structure: [ val(meta), path(stats) ] + - flagstat: + description: | + File containing samtools flagstat output + Structure: [ val(meta), path(flagstat) ] + - idxstats: + description: | + File containing samtools idxstats output + Structure: [ val(meta), path(idxstats)] + - versions: + description: | + Files containing software versions + Structure: [ path(versions.yml) ] +authors: + - "@drpatelh" +maintainers: + - "@drpatelh" diff --git a/subworkflows/nf-core/bam_stats_samtools/tests/main.nf.test b/subworkflows/nf-core/bam_stats_samtools/tests/main.nf.test new file mode 100644 index 00000000..c8b21f28 --- /dev/null +++ b/subworkflows/nf-core/bam_stats_samtools/tests/main.nf.test 
@@ -0,0 +1,108 @@ +nextflow_workflow { + + name "Test Workflow BAM_STATS_SAMTOOLS" + script "../main.nf" + workflow "BAM_STATS_SAMTOOLS" + tag "subworkflows" + tag "subworkflows_nfcore" + tag "bam_stats_samtools" + tag "subworkflows/bam_stats_samtools" + tag "samtools" + tag "samtools/flagstat" + tag "samtools/idxstats" + tag "samtools/stats" + + test("test_bam_stats_samtools_single_end") { + + when { + params { + outdir = "$outputDir" + } + workflow { + """ + input[0] = Channel.of([ + [ id:'test', single_end:true ], // meta map + file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/bam/test.single_end.sorted.bam', checkIfExists: true), + file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/bam/test.single_end.sorted.bam.bai', checkIfExists: true) + ]) + input[1] = Channel.of([ + [ id:'genome' ], + file(params.modules_testdata_base_path + 'genomics/sarscov2/genome/genome.fasta', checkIfExists: true) + ]) + """ + } + } + + then { + assertAll( + { assert workflow.success}, + { assert snapshot(workflow.out.stats).match("test_bam_stats_samtools_single_end_stats") }, + { assert snapshot(workflow.out.flagstat).match("test_bam_stats_samtools_single_end_flagstats") }, + { assert snapshot(workflow.out.idxstats).match("test_bam_stats_samtools_single_end_idxstats") } + ) + } + } + + test("test_bam_stats_samtools_paired_end") { + + when { + params { + outdir = "$outputDir" + } + workflow { + """ + input[0] = Channel.of([ + [ id:'test', single_end:true ], // meta map + file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/bam/test.paired_end.sorted.bam', checkIfExists: true), + file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/bam/test.paired_end.sorted.bam.bai', checkIfExists: true) + ]) + input[1] = Channel.of([ + [ id:'genome' ], + file(params.modules_testdata_base_path + 'genomics/sarscov2/genome/genome.fasta', checkIfExists: true) + ]) + """ + } + } + + then { + assertAll( + { assert workflow.success }, + { assert snapshot(workflow.out.stats).match("test_bam_stats_samtools_paired_end_stats") }, + { assert snapshot(workflow.out.flagstat).match("test_bam_stats_samtools_paired_end_flagstats") }, + { assert snapshot(workflow.out.idxstats).match("test_bam_stats_samtools_paired_end_idxstats") } + ) + } + } + + test("test_bam_stats_samtools_paired_end_cram") { + + when { + params { + outdir = "$outputDir" + } + workflow { + """ + input[0] = Channel.of([ + [ id:'test', single_end:false ], // meta map + file(params.modules_testdata_base_path + 'genomics/homo_sapiens/illumina/cram/test.paired_end.sorted.cram', checkIfExists: true), + file(params.modules_testdata_base_path + 'genomics/homo_sapiens/illumina/cram/test.paired_end.sorted.cram.crai', checkIfExists: true) + ]) + input[1] = Channel.of([ + [ id:'genome' ], + file(params.modules_testdata_base_path + 'genomics/homo_sapiens/genome/genome.fasta', checkIfExists: true) + ]) + """ + } + } + + then { + assertAll( + { assert workflow.success}, + { assert snapshot(workflow.out.stats).match("test_bam_stats_samtools_paired_end_cram_stats") }, + { assert snapshot(workflow.out.flagstat).match("test_bam_stats_samtools_paired_end_cram_flagstats") }, + { assert snapshot(workflow.out.idxstats).match("test_bam_stats_samtools_paired_end_cram_idxstats") } + ) + } + } + +} diff --git a/subworkflows/nf-core/bam_stats_samtools/tests/main.nf.test.snap b/subworkflows/nf-core/bam_stats_samtools/tests/main.nf.test.snap new file mode 100644 index 00000000..bf0b0c69 --- /dev/null +++ 
b/subworkflows/nf-core/bam_stats_samtools/tests/main.nf.test.snap @@ -0,0 +1,164 @@ +{ + "test_bam_stats_samtools_paired_end_cram_flagstats": { + "content": [ + [ + [ + { + "id": "test", + "single_end": false + }, + "test.flagstat:md5,a53f3d26e2e9851f7d528442bbfe9781" + ] + ] + ], + "meta": { + "nf-test": "0.8.4", + "nextflow": "24.01.0" + }, + "timestamp": "2023-11-06T09:31:26.194017574" + }, + "test_bam_stats_samtools_paired_end_stats": { + "content": [ + [ + [ + { + "id": "test", + "single_end": true + }, + "test.stats:md5,ddaf8f33fe9c1ebe9b06933213aec8ed" + ] + ] + ], + "meta": { + "nf-test": "0.8.4", + "nextflow": "24.01.0" + }, + "timestamp": "2024-02-13T16:45:06.230091746" + }, + "test_bam_stats_samtools_paired_end_flagstats": { + "content": [ + [ + [ + { + "id": "test", + "single_end": true + }, + "test.flagstat:md5,4f7ffd1e6a5e85524d443209ac97d783" + ] + ] + ], + "meta": { + "nf-test": "0.8.4", + "nextflow": "24.01.0" + }, + "timestamp": "2024-01-18T17:17:27.717482" + }, + "test_bam_stats_samtools_single_end_flagstats": { + "content": [ + [ + [ + { + "id": "test", + "single_end": true + }, + "test.flagstat:md5,2191911d72575a2358b08b1df64ccb53" + ] + ] + ], + "meta": { + "nf-test": "0.8.4", + "nextflow": "24.01.0" + }, + "timestamp": "2023-11-06T09:26:10.340046381" + }, + "test_bam_stats_samtools_paired_end_cram_idxstats": { + "content": [ + [ + [ + { + "id": "test", + "single_end": false + }, + "test.idxstats:md5,e179601fa7b8ebce81ac3765206f6c15" + ] + ] + ], + "meta": { + "nf-test": "0.8.4", + "nextflow": "24.01.0" + }, + "timestamp": "2023-11-06T09:31:26.207052003" + }, + "test_bam_stats_samtools_single_end_stats": { + "content": [ + [ + [ + { + "id": "test", + "single_end": true + }, + "test.stats:md5,dc178e1a4956043aba8abc83e203521b" + ] + ] + ], + "meta": { + "nf-test": "0.8.4", + "nextflow": "24.01.0" + }, + "timestamp": "2024-02-13T16:44:57.442208382" + }, + "test_bam_stats_samtools_paired_end_idxstats": { + "content": [ + [ + [ + { + "id": "test", + "single_end": true + }, + "test.idxstats:md5,df60a8c8d6621100d05178c93fb053a2" + ] + ] + ], + "meta": { + "nf-test": "0.8.4", + "nextflow": "24.01.0" + }, + "timestamp": "2024-01-18T17:17:27.726719" + }, + "test_bam_stats_samtools_single_end_idxstats": { + "content": [ + [ + [ + { + "id": "test", + "single_end": true + }, + "test.idxstats:md5,613e048487662c694aa4a2f73ca96a20" + ] + ] + ], + "meta": { + "nf-test": "0.8.4", + "nextflow": "24.01.0" + }, + "timestamp": "2023-11-06T09:26:10.349439801" + }, + "test_bam_stats_samtools_paired_end_cram_stats": { + "content": [ + [ + [ + { + "id": "test", + "single_end": false + }, + "test.stats:md5,d3345c4887f4a9ea4f7f56405b495db0" + ] + ] + ], + "meta": { + "nf-test": "0.8.4", + "nextflow": "24.01.0" + }, + "timestamp": "2024-02-13T16:45:14.997164209" + } +} \ No newline at end of file diff --git a/subworkflows/nf-core/bam_stats_samtools/tests/tags.yml b/subworkflows/nf-core/bam_stats_samtools/tests/tags.yml new file mode 100644 index 00000000..ec2f2d68 --- /dev/null +++ b/subworkflows/nf-core/bam_stats_samtools/tests/tags.yml @@ -0,0 +1,2 @@ +subworkflows/bam_stats_samtools: + - subworkflows/nf-core/bam_stats_samtools/** diff --git a/tower.yml b/tower.yml new file mode 100644 index 00000000..787aedfe --- /dev/null +++ b/tower.yml @@ -0,0 +1,5 @@ +reports: + multiqc_report.html: + display: "MultiQC HTML report" + samplesheet.csv: + display: "Auto-created samplesheet with collated metadata and FASTQ paths" diff --git a/workflows/metatdenovo.nf b/workflows/metatdenovo.nf index 
9d466a88..ea51fb76 100644 --- a/workflows/metatdenovo.nf +++ b/workflows/metatdenovo.nf @@ -1,65 +1,136 @@ /* -======================================================================================== - VALIDATE INPUTS -======================================================================================== +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + PRINT PARAMS SUMMARY +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ */ -def summary_params = NfcoreSchema.paramsSummaryMap(workflow, params) +include { paramsSummaryLog; paramsSummaryMap } from 'plugin/nf-validation' -// Validate input parameters -WorkflowMetatdenovo.initialise(params, log) +def logo = NfcoreTemplate.logo(workflow, params.monochrome_logs) +def citation = '\n' + WorkflowMain.citation(workflow) + '\n' +def summary_params = paramsSummaryMap(workflow) -// TODO nf-core: Add all file path parameters for the pipeline to the list below // Check input path parameters to see if they exist -def checkPathParamList = [ params.input, params.multiqc_config, params.fasta ] +def checkPathParamList = [ params.input, params.multiqc_config ] for (param in checkPathParamList) { if (param) { file(param, checkIfExists: true) } } +// Print parameter summary log to screen +log.info logo + paramsSummaryLog(workflow) + citation + +WorkflowMetatdenovo.initialise(params, log) + +// Deal with user-supplied assembly to make sure output names are correct +if ( params.assembly ) { + assembler = 'user_assembly' +} else { + assembler = params.assembler +} + +// Deal with params from user-supplied ORFs, and set orf_caller correctly +if ( params.gff && params.protein_fasta ) { + orf_caller = 'user_orfs' +} else if ( params.gff && ! params.protein_fasta ) { + error 'When supplying ORFs, both --gff and --protein_fasta must be specified, --protein_fasta file is missing!' +} else if ( params.protein_fasta && ! params.gff ) { + error 'When supplying ORFs, both --gff and --protein_fasta must be specified, --gff file is missing!' +} else { + orf_caller = params.orf_caller +} -// Check mandatory parameters -if (params.input) { ch_input = file(params.input) } else { exit 1, 'Input samplesheet not specified!' } +// set an empty multiqc channel +ch_multiqc_files = Channel.empty() + +// If the user supplied hmm files, we will run hmmsearch and then rank the results. +// Create a channel for hmm files. +ch_hmmrs = Channel.empty() +if ( params.hmmdir ) { + Channel + .fromPath(params.hmmdir + params.hmmpattern, checkIfExists: true) + .set { ch_hmmrs } +} else if ( params.hmmfiles ) { + Channel + .fromList( params.hmmfiles.tokenize(',') ) + .map { [ file(it) ] } + .set { ch_hmmrs } +} /* -======================================================================================== +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ CONFIG FILES -======================================================================================== +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ */ -ch_multiqc_config = file("$projectDir/assets/multiqc_config.yaml", checkIfExists: true) -ch_multiqc_custom_config = params.multiqc_config ? Channel.fromPath(params.multiqc_config) : Channel.empty() +ch_multiqc_config = Channel.fromPath("$projectDir/assets/multiqc_config.yml", checkIfExists: true) +ch_multiqc_custom_config = params.multiqc_config ? 
Channel.fromPath( params.multiqc_config, checkIfExists: true ) : Channel.empty() +ch_multiqc_logo = params.multiqc_logo ? Channel.fromPath( params.multiqc_logo, checkIfExists: true ) : Channel.empty() +ch_multiqc_custom_methods_description = params.multiqc_methods_description ? file(params.multiqc_methods_description, checkIfExists: true) : file("$projectDir/assets/methods_description_template.yml", checkIfExists: true) /* -======================================================================================== +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ IMPORT LOCAL MODULES/SUBWORKFLOWS -======================================================================================== +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ */ -// Don't overwrite global params.modules, create a copy instead and use that within the main script. -def modules = params.modules.clone() +// +// MODULE: local +// +include { WRITESPADESYAML } from '../modules/local/writespadesyaml' +include { MEGAHIT_INTERLEAVED } from '../modules/local/megahit/interleaved' +include { COLLECT_FEATURECOUNTS } from '../modules/local/collect_featurecounts' +include { COLLECT_STATS } from '../modules/local/collect_stats' +include { FORMATSPADES } from '../modules/local/formatspades' +include { UNPIGZ as UNPIGZ_CONTIGS } from '../modules/local/unpigz' +include { UNPIGZ as UNPIGZ_GFF } from '../modules/local/unpigz' +include { MERGE_TABLES } from '../modules/local/merge_summary_tables' +include { TRANSRATE } from '../modules/local/transrate' // // SUBWORKFLOW: Consisting of a mix of local and nf-core/modules // -include { INPUT_CHECK } from '../subworkflows/local/input_check' addParams( options: [:] ) +include { INPUT_CHECK } from '../subworkflows/local/input_check' + +// +// SUBWORKFLOW: Consisting of local modules +// +include { EGGNOG } from '../subworkflows/local/eggnog' +include { SUB_EUKULELE } from '../subworkflows/local/eukulele' +include { HMMCLASSIFY } from '../subworkflows/local/hmmclassify' +include { PROKKA_SUBSETS } from '../subworkflows/local/prokka_subsets' +include { TRANSDECODER } from '../subworkflows/local/transdecoder' +include { FASTQC_TRIMGALORE } from '../subworkflows/local/fastqc_trimgalore' +include { PRODIGAL } from '../subworkflows/local/prodigal' +include { KOFAMSCAN } from '../subworkflows/local/kofamscan' /* -======================================================================================== +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ IMPORT NF-CORE MODULES/SUBWORKFLOWS -======================================================================================== +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ */ -def multiqc_options = modules['multiqc'] -multiqc_options.args += params.multiqc_title ? 
Utils.joinModuleArgs(["--title \"$params.multiqc_title\""]) : '' - // // MODULE: Installed directly from nf-core/modules // -include { FASTQC } from '../modules/nf-core/modules/fastqc/main' addParams( options: modules['fastqc'] ) -include { MULTIQC } from '../modules/nf-core/modules/multiqc/main' addParams( options: multiqc_options ) -include { CUSTOM_DUMPSOFTWAREVERSIONS } from '../modules/nf-core/modules/custom/dumpsoftwareversions/main' addParams( options: [publish_files : ['_versions.yml':'']] ) +include { BBMAP_BBDUK } from '../modules/nf-core/bbmap/bbduk/main' +include { BBMAP_INDEX } from '../modules/nf-core/bbmap/index/main' +include { BBMAP_ALIGN } from '../modules/nf-core/bbmap/align/main' +include { BBMAP_BBNORM } from '../modules/nf-core/bbmap/bbnorm/main' +include { SEQTK_MERGEPE } from '../modules/nf-core/seqtk/mergepe/main' +include { SUBREAD_FEATURECOUNTS as FEATURECOUNTS_CDS } from '../modules/nf-core/subread/featurecounts/main' +include { SPADES } from '../modules/nf-core/spades/main' +include { SEQTK_SEQ as SEQTK_SEQ_CONTIG_FILTER } from '../modules/nf-core/seqtk/seq/main' +include { CAT_FASTQ } from '../modules/nf-core/cat/fastq/main' +include { FASTQC } from '../modules/nf-core/fastqc/main' +include { MULTIQC } from '../modules/nf-core/multiqc/main' +include { CUSTOM_DUMPSOFTWAREVERSIONS } from '../modules/nf-core/custom/dumpsoftwareversions/main' + +// +// SUBWORKFLOWS: Installed directly from nf-core/modules +// +include { BAM_SORT_STATS_SAMTOOLS } from '../subworkflows/nf-core/bam_sort_stats_samtools/main' /* -======================================================================================== +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ RUN MAIN WORKFLOW -======================================================================================== +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ */ // Info required for completion email and summary @@ -73,17 +144,376 @@ workflow METATDENOVO { // SUBWORKFLOW: Read in samplesheet, validate and stage input files // INPUT_CHECK ( - ch_input + file(params.input) ) + .reads + .map { + meta, fastq -> + new_id = meta.id - ~/_T\d+/ + [ meta + [id: new_id], fastq ] + } + .groupTuple() + .branch { + meta, fastq -> + single : fastq.size() == 1 + return [ meta, fastq.flatten() ] + multiple: fastq.size() > 1 + return [ meta, fastq.flatten() ] + } + .set { ch_fastq } ch_versions = ch_versions.mix(INPUT_CHECK.out.versions) // - // MODULE: Run FastQC + // MODULE: Concatenate FastQ files from same sample if required + // + CAT_FASTQ ( + ch_fastq.multiple + ) + .reads + .mix(ch_fastq.single) + .set { ch_cat_fastq } + + ch_versions = ch_versions.mix(CAT_FASTQ.out.versions.first()) + + // + // SUBWORKFLOW: Read QC and trim adapters // - FASTQC ( - INPUT_CHECK.out.reads + FASTQC_TRIMGALORE ( + ch_cat_fastq, + params.skip_fastqc || params.skip_qc, + params.skip_trimming ) - ch_versions = ch_versions.mix(FASTQC.out.versions.first()) + ch_versions = ch_versions.mix(FASTQC_TRIMGALORE.out.versions) + ch_collect_stats = ch_cat_fastq.collect { meta, fasta -> meta.id }.map { [ [ id:"${assembler}.${orf_caller}" ], it ] } + if ( params.skip_trimming ) { + ch_collect_stats + .map { meta, samples -> [ meta, samples, [] ] } + .set { ch_collect_stats } + + } else { + if ( params.se_reads ) { + ch_collect_stats + .combine(FASTQC_TRIMGALORE.out.trim_log.collect { meta, report -> report }.map { [ it ] }) + .set { ch_collect_stats } + } else { + ch_collect_stats + 
.combine(FASTQC_TRIMGALORE.out.trim_log.collect { meta, report -> report[0] }.map { [ it ] }) + .set { ch_collect_stats } + } + } + + // + // MODULE: Run BBDuk to filter out sequences matching the file supplied via params.sequence_filter + // + if ( params.sequence_filter ) { + BBMAP_BBDUK ( FASTQC_TRIMGALORE.out.reads, params.sequence_filter ) + ch_clean_reads = BBMAP_BBDUK.out.reads + ch_bbduk_logs = BBMAP_BBDUK.out.log.collect { meta, log -> log }.map { [ it ] } + ch_versions = ch_versions.mix(BBMAP_BBDUK.out.versions.first()) + ch_collect_stats + .combine(ch_bbduk_logs) + .set {ch_collect_stats} + ch_multiqc_files = ch_multiqc_files.mix(BBMAP_BBDUK.out.log.collect{ meta, log -> log }) + } else { + ch_clean_reads = FASTQC_TRIMGALORE.out.reads + ch_bbduk_logs = Channel.empty() + ch_collect_stats + .map { meta, samples, report -> [ meta, samples, report, [] ] } + .set { ch_collect_stats } + } + + // + // MODULE: Interleave sequences for assembly + // + // DL & DDL: We probably cannot deal with single-end input here + ch_interleaved = Channel.empty() + if ( ! params.assembly ) { + if ( params.se_reads) { + ch_single_end = ch_clean_reads + } else { + SEQTK_MERGEPE(ch_clean_reads) + ch_interleaved = SEQTK_MERGEPE.out.reads + ch_versions = ch_versions.mix(SEQTK_MERGEPE.out.versions) + } + } + + // + // SUBWORKFLOW: Perform digital normalization. There are two options: khmer or BBnorm. The latter is faster. + // + if ( ! params.assembly ) { + if ( params.se_reads ) { + if ( params.bbnorm ) { + BBMAP_BBNORM(ch_single_end.collect { meta, fastq -> fastq }.map {[ [id:'all_samples', single_end:true], it ] } ) + ch_se_reads_to_assembly = BBMAP_BBNORM.out.fastq.map { meta, fasta -> fasta } + ch_pe_reads_to_assembly = Channel.empty() + ch_versions = ch_versions.mix(BBMAP_BBNORM.out.versions) + } else { + ch_se_reads_to_assembly = ch_single_end.map { meta, fastq -> fastq } + ch_pe_reads_to_assembly = Channel.empty() + } + } + else if ( params.bbnorm ) { + BBMAP_BBNORM(ch_interleaved.collect { meta, fastq -> fastq }.map {[ [id:'all_samples', single_end:true], it ] } ) + ch_pe_reads_to_assembly = BBMAP_BBNORM.out.fastq.map { meta, fasta -> fasta } + ch_se_reads_to_assembly = Channel.empty() + ch_versions = ch_versions.mix(BBMAP_BBNORM.out.versions) + } else { + ch_pe_reads_to_assembly = ch_interleaved.map { meta, fastq -> fastq } + ch_se_reads_to_assembly = Channel.empty() + } + } + + // + // MODULE: Run MEGAHIT or rnaSPAdes on all interleaved fastq files + // + if ( params.assembly ) { + Channel + .value ( [ [ id: 'user_assembly' ], file(params.assembly) ] ) + .set { ch_assembly_contigs } + } else if ( assembler == 'rnaspades' ) { + // 1. Write a YAML file for SPAdes + WRITESPADESYAML ( + ch_pe_reads_to_assembly.toList(), + ch_se_reads_to_assembly.toList() + ) + ch_versions = ch_versions.mix(WRITESPADESYAML.out.versions) + // 2. 
Call the module with a channel containing all fastq files plus the YAML + ch_pe_reads_to_assembly + .mix(ch_se_reads_to_assembly) + .collect() + .map { [ [ id:'rnaspades' ], it, [], [] ] } + .set { ch_spades } + SPADES ( + ch_spades, + WRITESPADESYAML.out.yaml, + [] + ) + ch_assembly = SPADES.out.transcripts + ch_versions = ch_versions.mix(SPADES.out.versions) + FORMATSPADES( ch_assembly ) + ch_assembly_contigs = FORMATSPADES.out.assembly + ch_versions = ch_versions.mix(FORMATSPADES.out.versions) + } else if ( assembler == 'megahit' ) { + MEGAHIT_INTERLEAVED( + ch_pe_reads_to_assembly.toList(), + ch_se_reads_to_assembly.toList(), + 'megahit_assembly' + ) + MEGAHIT_INTERLEAVED.out.contigs + .map { [ [ id: 'megahit' ], it ] } + .set { ch_assembly_contigs } + ch_versions = ch_versions.mix(MEGAHIT_INTERLEAVED.out.versions) + } else { error 'Assembler not specified!' } + + // If the user asked for length filtering, perform that with SEQTK_SEQ (the actual length parameter is used in modules.config) + if ( params.min_contig_length > 0 ) { + SEQTK_SEQ_CONTIG_FILTER ( ch_assembly_contigs ) + ch_assembly_contigs = SEQTK_SEQ_CONTIG_FILTER.out.fastx + ch_versions = ch_versions.mix(SEQTK_SEQ_CONTIG_FILTER.out.versions) + } + + // + // Call ORFs + // + ch_gff = Channel.empty() + ch_protein = Channel.empty() + + // + // SUBWORKFLOW: Run PROKKA_SUBSETS on assembly output, but split the fasta file into chunks of 10 MB, then concatenate and compress output. + // + if ( params.orf_caller == 'prokka' ) { + PROKKA_SUBSETS(ch_assembly_contigs, params.prokka_batchsize) + ch_versions = ch_versions.mix(PROKKA_SUBSETS.out.versions) + ch_protein = PROKKA_SUBSETS.out.faa + ch_multiqc_files = ch_multiqc_files.mix(PROKKA_SUBSETS.out.prokka_log) + + UNPIGZ_GFF(PROKKA_SUBSETS.out.gff.map { meta, gff -> [ [id: "${params.orf_caller}.${meta}"], gff ] }) + ch_gff = UNPIGZ_GFF.out.unzipped + ch_versions = ch_versions.mix(UNPIGZ_GFF.out.versions) + } + + // + // MODULE: Run PRODIGAL on assembly output. + // + if ( orf_caller == 'prodigal' ) { + PRODIGAL( ch_assembly_contigs.map { meta, contigs -> [ [id: "${assembler}.${orf_caller}"], contigs ] } ) + ch_protein = PRODIGAL.out.faa + ch_versions = ch_versions.mix(PRODIGAL.out.versions) + + UNPIGZ_GFF(PRODIGAL.out.gff.map { meta, gff -> [ [id: "${meta.id}.${orf_caller}"], gff ] }) + ch_gff = UNPIGZ_GFF.out.unzipped + ch_versions = ch_versions.mix(UNPIGZ_GFF.out.versions) + } + + // + // SUBWORKFLOW: Run TRANSDECODER, an alternative ORF caller for eukaryotes. 
+ // + if ( orf_caller == 'transdecoder' ) { + TRANSDECODER ( ch_assembly_contigs.map { meta, contigs -> [ [id: "transdecoder.${meta.id}" ], contigs ] } ) + ch_gff = TRANSDECODER.out.gff + ch_protein = TRANSDECODER.out.pep + ch_versions = ch_versions.mix(TRANSDECODER.out.versions) + } + + // Populate channels if the user provided the ORFs + if ( orf_caller == 'user_orfs' ) { + Channel + .value ( [ [ id: "${assembler}.${orf_caller}" ], file(params.gff) ] ) + .set { ch_gff } + Channel + .value ( [ [ id: "${assembler}.${orf_caller}" ], file(params.protein_fasta) ] ) + .set { ch_protein } + } + + // + // MODULE: Create a BBMap index + // + BBMAP_INDEX(ch_assembly_contigs.map { meta, contigs -> contigs }) + ch_versions = ch_versions.mix(BBMAP_INDEX.out.versions) + + // + // MODULE: Call BBMap with the index once per sample + // + BBMAP_ALIGN ( ch_clean_reads, BBMAP_INDEX.out.index ) + ch_versions = ch_versions.mix(BBMAP_ALIGN.out.versions) + + // + // SUBWORKFLOW: Classify ORFs with a set of HMM files + // + ch_hmmrs + .combine(ch_protein) + .map { hmm, meta, protein ->[ [ id: "${assembler}.${orf_caller}" ], hmm, protein ] } + .set { ch_hmmclassify } + HMMCLASSIFY ( ch_hmmclassify ) + ch_versions = ch_versions.mix(HMMCLASSIFY.out.versions) + + // + // MODULE: FeatureCounts. Create a table for each sample that provides raw counts as a result of the alignment. + // + + BAM_SORT_STATS_SAMTOOLS ( BBMAP_ALIGN.out.bam, ch_assembly_contigs ) + ch_versions = ch_versions.mix(BAM_SORT_STATS_SAMTOOLS.out.versions) + + BAM_SORT_STATS_SAMTOOLS.out.bam + .combine(ch_gff.map { meta, bam -> bam } ) + .set { ch_featurecounts } + + ch_collect_stats + .combine(BAM_SORT_STATS_SAMTOOLS.out.idxstats.collect { meta, idxstats -> idxstats }.map { [ it ] } ) + .set { ch_collect_stats } + + FEATURECOUNTS_CDS ( ch_featurecounts) + ch_versions = ch_versions.mix(FEATURECOUNTS_CDS.out.versions) + + // + // MODULE: Collect featurecounts output counts into one table + // + FEATURECOUNTS_CDS.out.counts + .collect() { meta, featurecounts -> featurecounts } + .map { featurecounts -> [ [ id:"${assembler}.${orf_caller}" ], featurecounts ] } + .set { ch_collect_feature } + + COLLECT_FEATURECOUNTS ( ch_collect_feature ) + ch_versions = ch_versions.mix(COLLECT_FEATURECOUNTS.out.versions) + ch_fcs_for_stats = COLLECT_FEATURECOUNTS.out.counts.collect { meta, tsv -> tsv }.map { [ it ] } + ch_fcs_for_summary = COLLECT_FEATURECOUNTS.out.counts.map { meta, tsv -> tsv } + ch_collect_stats + .combine(ch_fcs_for_stats) + .set { ch_collect_stats } + + // + // SUBWORKFLOW: Run eggnog_mapper on the ORF-called amino acid sequences + // + if ( ! 
params.skip_eggnog ) { + EGGNOG(ch_protein, ch_fcs_for_summary) + ch_versions = ch_versions.mix(EGGNOG.out.versions) + ch_merge_tables = EGGNOG.out.sumtable + } else { + ch_protein + .map { meta, protein -> [ meta, [] ] } + .set { ch_merge_tables } + } + + + // + // SUBWORKFLOW: run kofamscan on the ORF-called amino acid sequences + // + if( !params.skip_kofamscan ) { + ch_protein + .map { meta, protein -> [ meta, protein ] } + .set { ch_kofamscan } + KOFAMSCAN( ch_kofamscan, ch_fcs_for_summary) + ch_versions = ch_versions.mix(KOFAMSCAN.out.versions) + ch_kofamscan_summary = KOFAMSCAN.out.kofamscan_summary.collect{ meta, tsv -> tsv } + ch_merge_tables + .combine( ch_kofamscan_summary ) + .set { ch_merge_tables } + } else { + ch_merge_tables + .map { meta, tsv -> [ meta, tsv, [] ] } + .set { ch_merge_tables } + } + + // set up contig channel to use in CAT and TransRate + UNPIGZ_CONTIGS(ch_assembly_contigs) + ch_unzipped_contigs = UNPIGZ_CONTIGS.out.unzipped + ch_versions = ch_versions.mix(UNPIGZ_CONTIGS.out.versions) + + // + // MODULE: Use TransRate to judge assembly quality, piped into MultiQC + // + TRANSRATE(ch_unzipped_contigs) + ch_versions = ch_versions.mix(TRANSRATE.out.versions) + + // + // SUBWORKFLOW: Eukulele + // + ch_eukulele_db = Channel.empty() + if( !params.skip_eukulele){ + // Create a channel for EUKulele either with a named database or not. The latter means a user-provided database in a directory. + if ( params.eukulele_db ) { + Channel + .fromList ( params.eukulele_db.split(',') ) + .map { [ it, file(params.eukulele_dbpath) ] } + .set { ch_eukulele_db } + } else { + Channel.fromPath(params.eukulele_dbpath, checkIfExists: true) + .map { [ [], it ] } + .set { ch_eukulele_db } + } + ch_protein + .map { meta, protein -> [ [ id:"${meta.id}" ], protein ] } + .combine( ch_eukulele_db ) + .set { ch_eukulele } + SUB_EUKULELE( ch_eukulele, ch_fcs_for_summary ) + ch_taxonomy_summary = SUB_EUKULELE.out.taxonomy_summary.collect{ meta, tsv -> tsv } + ch_versions = ch_versions.mix(SUB_EUKULELE.out.versions) + ch_merge_tables + .combine( ch_taxonomy_summary ) + .set { ch_merge_tables } + } else { + ch_merge_tables + .map { meta, tsv1, tsv2 -> [ meta, tsv1, tsv2, [] ] } + .set { ch_merge_tables } + } + + // + // MODULE: Collect statistics from mapping analysis + // + if( !params.skip_eggnog || !params.skip_eukulele || !params.skip_kofamscan) { + MERGE_TABLES ( ch_merge_tables ) + ch_collect_stats + .combine(MERGE_TABLES.out.merged_table.collect{ meta, tblout -> tblout }.map { [ it ] }) + .set { ch_collect_stats } + ch_versions = ch_versions.mix(MERGE_TABLES.out.versions) + } else { + ch_collect_stats + .map { meta, samples, report, tsv, idxstats, counts -> [ meta, samples, report, tsv, idxstats, counts, [] ] } + .set { ch_collect_stats } + } + + COLLECT_STATS(ch_collect_stats) + ch_versions = ch_versions.mix(COLLECT_STATS.out.versions) CUSTOM_DUMPSOFTWAREVERSIONS ( ch_versions.unique().collectFile(name: 'collated_versions.yml') @@ -95,35 +525,51 @@ workflow METATDENOVO { workflow_summary = WorkflowMetatdenovo.paramsSummaryMultiqc(workflow, summary_params) ch_workflow_summary = Channel.value(workflow_summary) - ch_multiqc_files = Channel.empty() - ch_multiqc_files = ch_multiqc_files.mix(Channel.from(ch_multiqc_config)) - ch_multiqc_files = ch_multiqc_files.mix(ch_multiqc_custom_config.collect().ifEmpty([])) - ch_multiqc_files = ch_multiqc_files.mix(ch_workflow_summary.collectFile(name: 'workflow_summary_mqc.yaml')) + methods_description = 
WorkflowMetatdenovo.methodsDescriptionText(workflow, ch_multiqc_custom_methods_description, params) + ch_methods_description = Channel.value(methods_description) + ch_multiqc_files = ch_multiqc_files.mix(CUSTOM_DUMPSOFTWAREVERSIONS.out.mqc_yml.collect()) - ch_multiqc_files = ch_multiqc_files.mix(FASTQC.out.zip.collect{it[1]}.ifEmpty([])) + ch_multiqc_files = ch_multiqc_files.mix(FASTQC_TRIMGALORE.out.trim_zip.collect{ meta, zip -> zip }) + ch_multiqc_files = ch_multiqc_files.mix(TRANSRATE.out.assembly_qc.collect{ meta, tbl -> tbl }) + ch_multiqc_files = ch_multiqc_files.mix(BAM_SORT_STATS_SAMTOOLS.out.idxstats.collect{ meta, idxstats -> idxstats }) + ch_multiqc_files = ch_multiqc_files.mix(FEATURECOUNTS_CDS.out.summary.collect{ meta, summary -> summary }) + MULTIQC ( - ch_multiqc_files.collect() + ch_multiqc_files.collect(), + ch_multiqc_config.toList(), + ch_multiqc_custom_config.toList(), + ch_multiqc_logo.toList() ) multiqc_report = MULTIQC.out.report.toList() - ch_versions = ch_versions.mix(MULTIQC.out.versions) } /* -======================================================================================== +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ COMPLETION EMAIL AND SUMMARY -======================================================================================== +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ */ workflow.onComplete { if (params.email || params.email_on_fail) { NfcoreTemplate.email(workflow, params, summary_params, projectDir, log, multiqc_report) } + NfcoreTemplate.dump_parameters(workflow, params) NfcoreTemplate.summary(workflow, params, log) + if (params.hook_url) { + NfcoreTemplate.IM_notification(workflow, params, summary_params, projectDir, log) + } +} + +workflow.onError { + if (workflow.errorReport.contains("Process requirement exceeds available memory")) { + println("🛑 Default resources exceed availability 🛑 ") + println("💡 See here on how to configure pipeline: https://nf-co.re/docs/usage/configuration#tuning-workflow-resources 💡") + } } /* -======================================================================================== +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ THE END -======================================================================================== +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ */