From 2462a9040de785f254893f96453ae12ebeb36619 Mon Sep 17 00:00:00 2001 From: Ksenia Krasheninnikova Date: Thu, 15 Aug 2024 16:27:38 +0100 Subject: [PATCH] Merge in from dev --- .github/workflows/ci.yml | 6 +- assets/test.yaml | 13 ++- assets/test_github.yaml | 19 ---- assets/test_gsMetZobe1.yaml | 5 +- bin/generate_cram_csv.sh | 7 +- conf/modules.config | 35 +----- conf/test.config | 7 +- conf/test_full.config | 1 - conf/test_github.config | 28 ----- docs/usage.md | 9 +- modules/local/generate_cram_csv.nf | 4 +- modules/local/longranger/align/main.nf | 4 +- .../batch_summary.txt | 2 - .../logs/busco.log | 107 ------------------ modules/nf-core/busco/main.nf_ | 83 -------------- nextflow.config | 1 - subworkflows/local/organelles.nf | 12 +- subworkflows/local/prepare_input.nf | 6 +- workflows/genomeassembly.nf | 2 +- 19 files changed, 38 insertions(+), 313 deletions(-) delete mode 100644 assets/test_github.yaml delete mode 100644 conf/test_github.config delete mode 100644 modules/nf-core/busco/iyVesGerm1-insecta_odb10-busco/batch_summary.txt delete mode 100644 modules/nf-core/busco/iyVesGerm1-insecta_odb10-busco/logs/busco.log delete mode 100644 modules/nf-core/busco/main.nf_ diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 845d8539..ea139834 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -48,10 +48,6 @@ jobs: run: | nextflow secrets set NCBI_API_KEY ${{ secrets.NCBI_API_KEY }} - - name: Download test data - run: | - curl https://tolit.cog.sanger.ac.uk/test-data/resources/genomeassembly/genomeassembly_test_data.tar.gz | tar xzf - - - name: Setup apptainer uses: eWaterCycle/setup-apptainer@main @@ -62,4 +58,4 @@ jobs: - name: Run pipeline with test data run: | - nextflow run ${GITHUB_WORKSPACE} -profile test_github,singularity --outdir ./results + nextflow run ${GITHUB_WORKSPACE} -profile test,singularity --outdir ./results diff --git a/assets/test.yaml b/assets/test.yaml index aff832fd..423fdc22 100644 --- a/assets/test.yaml +++ b/assets/test.yaml @@ -1,13 +1,16 @@ dataset: id: baUndUnlc1 illumina_10X: - reads: /lustre/scratch123/tol/resources/nextflow/test-data/Undibacterium_unclassified/genomic_data/baUndUnlc1/10x/ + reads: + - https://tolit.cog.sanger.ac.uk/test-data/Undibacterium_unclassified/genomic_data/baUndUnlc1/10x/baUndUnlc1_S12_L002_R1_001.fastq.gz + - https://tolit.cog.sanger.ac.uk/test-data/Undibacterium_unclassified/genomic_data/baUndUnlc1/10x/baUndUnlc1_S12_L002_R2_001.fastq.gz + - https://tolit.cog.sanger.ac.uk/test-data/Undibacterium_unclassified/genomic_data/baUndUnlc1/10x/baUndUnlc1_S12_L002_I1_001.fastq.gz pacbio: reads: - - reads: /lustre/scratch124/tol/projects/darwin/users/kk16/development/test/test/HiFi.reads.BIG.fasta + - reads: https://tolit.cog.sanger.ac.uk/test-data/Undibacterium_unclassified/genomic_data/baUndUnlc1/pacbio/fasta/HiFi.reads.fasta HiC: reads: - - reads: /lustre/scratch123/tol/resources/nextflow/test-data/Undibacterium_unclassified/genomic_data/baUndUnlc1/hic-arima2/41741_2#7.sub.cram + - reads: https://tolit.cog.sanger.ac.uk/test-data/Undibacterium_unclassified/genomic_data/baUndUnlc1/hic-arima2/41741_2%237.sub.cram hic_motif: GATC,GANTC,CTNAG,TTAA hic_aligner: bwamem2 busco: @@ -16,6 +19,6 @@ mito: species: Caradrina clavipalpis min_length: 15000 code: 5 - fam: /lustre/scratch124/tol/projects/darwin/users/cz3/organelle_asm/hmm_db/insecta_mito.fam + fam: https://github.com/c-zhou/OatkDB/raw/main/v20230921/insecta_mito.fam plastid: - fam: /lustre/scratch124/tol/projects/darwin/users/cz3/organelle_asm/hmm_db/acrogymnospermae_pltd.fam + fam: https://github.com/c-zhou/OatkDB/raw/main/v20230921/acrogymnospermae_pltd.fam diff --git a/assets/test_github.yaml b/assets/test_github.yaml deleted file mode 100644 index 3bf048f3..00000000 --- a/assets/test_github.yaml +++ /dev/null @@ -1,19 +0,0 @@ -dataset: - id: baUndUnlc1 - illumina_10X: - reads: /home/runner/work/genomeassembly/genomeassembly/Undibacterium_unclassified/genomic_data/baUndUnlc1/10x/ - pacbio: - reads: - - reads: /home/runner/work/genomeassembly/genomeassembly/Undibacterium_unclassified/genomic_data/baUndUnlc1/pacbio/fasta/HiFi.reads.fasta - HiC: - reads: - - reads: /home/runner/work/genomeassembly/genomeassembly/Undibacterium_unclassified/genomic_data/baUndUnlc1/hic-arima2/41741_2#7.sub.cram -hic_motif: GATC,GANTC,CTNAG,TTAA -hic_aligner: minimap2 -busco: - lineage: bacteria_odb10 -mito: - species: Caradrina clavipalpis - min_length: 15000 - code: 5 - fam: /home/runner/work/genomeassembly/genomeassembly/Undibacterium_unclassified/hmm_db/insecta_mito.fam diff --git a/assets/test_gsMetZobe1.yaml b/assets/test_gsMetZobe1.yaml index c115395f..56f7a484 100644 --- a/assets/test_gsMetZobe1.yaml +++ b/assets/test_gsMetZobe1.yaml @@ -1,7 +1,10 @@ dataset: id: gsMetZobe1 illumina_10X: - reads: /lustre/scratch123/tol/resources/genomeassembly/testdata/gsMetZobe1/10x/ + reads: + - /lustre/scratch123/tol/resources/genomeassembly/testdata/gsMetZobe1/10x/gsMetZobe1_S6_L008_R1_001.fastq.gz + - /lustre/scratch123/tol/resources/genomeassembly/testdata/gsMetZobe1/10x/gsMetZobe1_S6_L008_R2_001.fastq.gz + - /lustre/scratch123/tol/resources/genomeassembly/testdata/gsMetZobe1/10x/gsMetZobe1_S6_L008_I1_001.fastq.gz pacbio: reads: - reads: /lustre/scratch123/tol/resources/genomeassembly/testdata/gsMetZobe1/pacbio/m64125_200823_145825.ccs.bc1019_BAK8B_OA--bc1019_BAK8B_OA.filtered.fasta.gz diff --git a/bin/generate_cram_csv.sh b/bin/generate_cram_csv.sh index 81eaad34..15e51694 100755 --- a/bin/generate_cram_csv.sh +++ b/bin/generate_cram_csv.sh @@ -12,8 +12,9 @@ for cram in "$@"; do rgline=$(samtools view -H $cram|grep "RG"|sed 's/\t/\\t/g'|sed "s/'//g") crampath=$(readlink -f ${cram}) + craipath=$(readlink -f ${cram}.crai) - ncontainers=$(zcat ${crampath}.crai|wc -l) + ncontainers=$(zcat ${craipath} | wc -l) base=$(basename $cram .cram) from=0 @@ -22,7 +23,7 @@ for cram in "$@"; do while [ $to -lt $ncontainers ] do - echo $crampath,${crampath}.crai,${from},${to},${base},${chunkn},${rgline} + echo $crampath,${craipath},${from},${to},${base},${chunkn},${rgline} from=$((to+1)) ((to+=10000)) ((chunkn++)) @@ -30,7 +31,7 @@ for cram in "$@"; do if [ $from -le $ncontainers ] then - echo $crampath,${crampath}.crai,${from},${ncontainers},${base},${chunkn},${rgline} + echo $crampath,${craipath},${from},${ncontainers},${base},${chunkn},${rgline} ((chunkn++)) fi done diff --git a/conf/modules.config b/conf/modules.config index 2c30e6a3..21cb4af8 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -372,6 +372,9 @@ process { // Set up of the polishing pipeline if (params.polishing_on) { withName: LONGRANGER_MKREF { + if(System.getenv('GITHUB_ACTION') != null ) { + container = "ghcr.io/sanger-tol/longranger:2.2.2-c4" + } publishDir = [ path: { "${params.outdir}/${meta.id}.${params.hifiasm}/polishing" }, mode: params.publish_dir_mode, @@ -382,6 +385,9 @@ process { withName: LONGRANGER_ALIGN { // Keep in sync with `longranger_lsf_sanger.config` ext.args = "--disable-ui --nopreflight" + if(System.getenv('GITHUB_ACTION') != null ) { + container = "ghcr.io/sanger-tol/longranger:2.2.2-c4" + } publishDir = [ path: { "${params.outdir}/${meta.id}.${params.hifiasm}/polishing" }, mode: params.publish_dir_mode, @@ -843,33 +849,4 @@ profiles { } } } - - test_github { - process { - // Set up of the scaffolding pipeline - withName: 'YAHS' { - // Skip the initial assembly error correction step - ext.args = '-r 1000,2000,5000' - } - - withName: '.*HIFIASM.*' { - // Skip bloom filter - ext.args = '--primary -f0' - } - - withName: '.*OATK' { - // Set kmer size and minimal coverage - ext.args = "-k1001 -c5 -Ttmp" - } - - if (params.polishing_on) { - withName: LONGRANGER_MKREF { - container = "ghcr.io/sanger-tol/longranger:2.2.2-c4" - } - withName: LONGRANGER_ALIGN { - container = "ghcr.io/sanger-tol/longranger:2.2.2-c4" - } - } - } - } } diff --git a/conf/test.config b/conf/test.config index 03941e71..84874df3 100644 --- a/conf/test.config +++ b/conf/test.config @@ -14,15 +14,14 @@ params { config_profile_name = 'Test profile' config_profile_description = 'Minimal test dataset to check pipeline function' - // Limit resources so that this can run on GitHub Actions - max_cpus = 2 - max_memory = '6.GB' + // Match resource limits with the ubuntu2204-4c runner + max_cpus = 4 + max_memory = '15.GB' max_time = '6.h' // Input data input = "${projectDir}/assets/test.yaml" bed_chunks_polishing = 2 organelles_on = true - polishing_on = false hifiasm_hic_on = true } diff --git a/conf/test_full.config b/conf/test_full.config index e83aac0d..eb2551fb 100644 --- a/conf/test_full.config +++ b/conf/test_full.config @@ -22,6 +22,5 @@ params { // Input data for full size test input = "${projectDir}/assets/test_gsMetZobe1.yaml" - polishing_on = true hifiasm_hic_on = true } diff --git a/conf/test_github.config b/conf/test_github.config deleted file mode 100644 index 8086af2d..00000000 --- a/conf/test_github.config +++ /dev/null @@ -1,28 +0,0 @@ -/* -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - Nextflow config file for running minimal tests -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - Defines input files and everything required to run a fast and simple pipeline test. - - Use as follows: - nextflow run sanger-tol/genomeassembly -profile test, --outdir - ----------------------------------------------------------------------------------------- -*/ - -params { - config_profile_name = 'Github test profile' - config_profile_description = 'Minimal test dataset to check pipeline function' - - max_cpus = 4 - max_memory = '15.GB' - max_time = '6.h' - // Limit resources so that this can run on GitHub Actions - - // Input data - input = "${projectDir}/assets/test_github.yaml" - bed_chunks_polishing = 2 - polishing_on = false - hifiasm_hic_on = true - organelles_on = true -} diff --git a/docs/usage.md b/docs/usage.md index 0bc8e117..660ebf5a 100644 --- a/docs/usage.md +++ b/docs/usage.md @@ -86,17 +86,12 @@ mito: The pipeline can be tested locally using a provided small test dataset: ``` -cd ${GENOMEASSEMBLY_TEST_DATA} -curl https://darwin.cog.sanger.ac.uk/genomeassembly_test_data.tar.gz | tar xzf - - git clone git@github.com:sanger-tol/genomeassembly.git cd genomeassembly/ -sed -i "s|/home/runner/work/genomeassembly/genomeassembly|${GENOMEASSEMBLY_TEST_DATA}|" assets/test_github.yaml -nextflow run main.nf -profile test_github,singularity --outdir ${OUTDIR} {OTHER ARGUMENTS} +nextflow run main.nf -profile test,singularity --outdir ${OUTDIR} {OTHER ARGUMENTS} ``` -These command line steps will download and decompress the test data first, then download the pipeline and modify YAML so that it matches dataset location in your file system. -The last command line runs the test. +These command line steps will download the pipeline and run the test. You should now be able to run the pipeline as you see fit. diff --git a/modules/local/generate_cram_csv.nf b/modules/local/generate_cram_csv.nf index 860dfe65..85d78516 100644 --- a/modules/local/generate_cram_csv.nf +++ b/modules/local/generate_cram_csv.nf @@ -13,7 +13,7 @@ process GENERATE_CRAM_CSV { 'biocontainers/samtools:1.17--h00cdaf9_0' }" input: - tuple val(meta), path(crampaths, stageAs: "?/*") + tuple val(meta), path(crampaths, stageAs: "?/*"), path(craipaths, stageAs: "?/*") output: @@ -23,7 +23,7 @@ process GENERATE_CRAM_CSV { script: def prefix = task.ext.prefix ?: "${meta.id}" """ - generate_cram_csv.sh $crampaths >> ${prefix}_cram.csv + generate_cram_csv.sh $crampaths > ${prefix}_cram.csv cat <<-END_VERSIONS > versions.yml "${task.process}": diff --git a/modules/local/longranger/align/main.nf b/modules/local/longranger/align/main.nf index 728cd9af..0f3009a7 100644 --- a/modules/local/longranger/align/main.nf +++ b/modules/local/longranger/align/main.nf @@ -11,7 +11,7 @@ process LONGRANGER_ALIGN { input: tuple val(meta), path(reference) - path(fastqs) + path(fastqs, stageAs: "10X_inputs/*") output: tuple val(meta), path("${meta.id}/outs/possorted_bam.bam"), emit: bam @@ -26,7 +26,7 @@ process LONGRANGER_ALIGN { def args = task.ext.args ?: '' def sample = "${meta.id}" """ - longranger align --id=$sample --fastqs=$fastqs \ + longranger align --id=$sample --fastqs=10X_inputs \ --sample=$sample --reference=$reference \ ${args} diff --git a/modules/nf-core/busco/iyVesGerm1-insecta_odb10-busco/batch_summary.txt b/modules/nf-core/busco/iyVesGerm1-insecta_odb10-busco/batch_summary.txt deleted file mode 100644 index 226d64fd..00000000 --- a/modules/nf-core/busco/iyVesGerm1-insecta_odb10-busco/batch_summary.txt +++ /dev/null @@ -1,2 +0,0 @@ -Input_file Dataset Complete Single Duplicated Fragmented Missing n_markers Scaffold N50 Contigs N50 Percent gaps Number of scaffolds -iyVesGerm1_scaffolds_final.fa Run failed; check logs diff --git a/modules/nf-core/busco/iyVesGerm1-insecta_odb10-busco/logs/busco.log b/modules/nf-core/busco/iyVesGerm1-insecta_odb10-busco/logs/busco.log deleted file mode 100644 index 583cbbff..00000000 --- a/modules/nf-core/busco/iyVesGerm1-insecta_odb10-busco/logs/busco.log +++ /dev/null @@ -1,107 +0,0 @@ -2023-03-31 12:35:30 DEBUG:busco.run_BUSCO Command line: /usr/local/bin/busco --cpu 2 --in input_seqs --out iyVesGerm1-insecta_odb10-busco --out_path /lustre/scratch124/tol/projects/darwin/users/kk16/development/nextflow/dev/genomeassembly/./workflows/../subworkflows/local/../../modules/nf-core/busco --lineage_dataset /lustre/scratch123/tol/resources/busco/v5/lineages/insecta_odb10 --mode genome -2023-03-31 12:35:30 INFO:busco.run_BUSCO ***** Start a BUSCO v5.4.3 analysis, current time: 03/31/2023 12:35:30 ***** -2023-03-31 12:35:30 DEBUG:busco.ConfigManager Getting config file -2023-03-31 12:35:30 INFO:busco.ConfigManager Configuring BUSCO with local environment -2023-03-31 12:35:30 INFO:busco.BuscoConfig Mode is genome -2023-03-31 12:35:30 INFO:busco.BuscoDownloadManager Downloading information on latest versions of BUSCO data... -2023-03-31 12:35:32 DEBUG:busco.BuscoConfig State of BUSCO config before run: -2023-03-31 12:35:32 DEBUG:busco.BuscoConfig {'_allow_no_value': False, - '_comment_prefixes': ('#', ';'), - '_converters': , - '_defaults': {}, - '_delimiters': ('=', ':'), - '_dict': , - '_empty_lines_in_values': True, - '_inline_comment_prefixes': (), - '_input_filepath': '/lustre/scratch124/tol/projects/darwin/users/kk16/development/nextflow/dev/genomeassembly/work/06/5d8ceb4658862a248b20dcc4c3b27a/input_seqs', - '_interpolation': , - '_mode': 'genome', - '_optcre': re.compile('\n (?P