From 8bdeffb7c04ebec46d75353f1284e3615ba3f4a8 Mon Sep 17 00:00:00 2001
From: aschroed
Date: Tue, 6 Aug 2024 17:35:28 -0400
Subject: [PATCH 1/8] refactoring getting workflow details from db

---
 functions/cleanup.py          |  90 ++--
 functions/wfr_settings.py     | 450 ------------------
 .../01_find_and_release.ipynb |  12 +-
 3 files changed, 37 insertions(+), 515 deletions(-)
 delete mode 100644 functions/wfr_settings.py

diff --git a/functions/cleanup.py b/functions/cleanup.py
index 78a9542..808f929 100644
--- a/functions/cleanup.py
+++ b/functions/cleanup.py
@@ -1,63 +1,33 @@
 from dcicutils import ff_utils
 from datetime import datetime
 
-# accepted workflows
-# workflow name, accepted revision numbers (0 if none), accetable run time (hours)
-workflow_details = [
-    # TODO: take this info from foursight
-    # common ones
-    ['md5', ['0.0.4', '0.2.6'], 12],
-    ['fastqc-0-11-4-1', ['0.2.0'], 50],
-    ['fastqc', ['v1', 'v2'], 50],
-    # 4dn ones
-    ['bwa-mem', ['0.2.6'], 50],
-    ['pairsqc-single', ['0.2.5', '0.2.6'], 100],
-    ['hi-c-processing-bam', ['0.2.6'], 50],
-    ['hi-c-processing-pairs', ['0.2.6', '0.2.7'], 200],
-    ['hi-c-processing-pairs-nore', ['0.2.6'], 200],
-    ['hi-c-processing-pairs-nonorm', ['0.2.6'], 200],
-    ['hi-c-processing-pairs-nore-nonorm', ['0.2.6'], 200],
-    ['imargi-processing-fastq', ["1.1.1_dcic_4"], 200],
-    ['imargi-processing-bam', ["1.1.1_dcic_4"], 200],
-    ['imargi-processing-pairs', ["1.1.1_dcic_4"], 200],
-    ['repliseq-parta', ['v13.1', 'v14', 'v16', 'v16.1'], 200],
-    ['bedGraphToBigWig', ['v4'], 24],
-    ['bedtobeddb', ['v2', 'v3'], 24],
-    ['encode-chipseq-aln-chip', ['1.1.1', '2.1.6'], 200],
-    ['encode-chipseq-aln-ctl', ['1.1.1', '2.1.6'], 200],
-    ['encode-chipseq-postaln', ['1.1.1', '2.1.6'], 200],
-    ['encode-atacseq-aln', ['1.1.1'], 200],
-    ['encode-atacseq-postaln', ['1.1.1'], 200],
-    ['mergebed', ['v1'], 200],
-    ['merge-fastq', ['v1'], 200],
-    ['bamqc', ['v2', 'v3'], 200],
-    ['encode-rnaseq-stranded', ['1.1'], 200],
-    ['encode-rnaseq-unstranded', ['1.1'], 200],
-    ['rna-strandedness', ['v2'], 200],
-    ['fastq-first-line', ['v2'], 200],
-    ['re_checker_workflow', ['v1.1', 'v1.2'], 200],
-    ['mad_qc_workflow', ['1.1_dcic_2'], 200],
-    ['insulation-scores-and-boundaries-caller', ['v1'], 200],
-    ['compartments-caller', ['v1.2'], 200],
-    ['mcoolQC', ['v1'], 200],
-    # cgap ones
-    ['workflow_bwa-mem_no_unzip-check', ['v9', 'v10', 'v11', 'v12', 'v13'], 48],
-    ['workflow_add-readgroups-check', ['v9', 'v10', 'v11', 'v12', 'v13'], 12],
-    ['workflow_merge-bam-check', ['v9', 'v10', 'v11', 'v12', 'v13'], 12],
-    ['workflow_picard-MarkDuplicates-check', ['v9', 'v10', 'v11', 'v12', 'v13'], 12],
-    ['workflow_sort-bam-check', ['v9', 'v10', 'v11', 'v12', 'v13'], 12],
-    ['workflow_gatk-BaseRecalibrator', ['v9', 'v10', 'v11', 'v12', 'v13'], 12],
-    ['workflow_gatk-ApplyBQSR-check', ['v9', 'v10', 'v11', 'v12', 'v13'], 12],
-    ['workflow_index-sorted-bam', ['v9'], 12],
-    ['workflow_gatk-HaplotypeCaller', ['v10', 'v11', 'v12', 'v13'], 12],
-    ['workflow_gatk-CombineGVCFs', ['v10', 'v11', 'v12', 'v13'], 12],
-    ['workflow_gatk-GenotypeGVCFs-check', ['v10', 'v11', 'v12', 'v13'], 12],
-    ['workflow_gatk-VQSR-check', ['v10', 'v11', 'v12', 'v13'], 12],
-    ['workflow_qcboard-bam', ['v9'], 12],
-    ['workflow_cram2fastq', ['v12', 'v13'], 12],
-]
-workflow_names = [i[0] for i in workflow_details]
+# function to get workflow_details info from db 
+# the initial data structure, which is the same one used to get info for foursight,
+# is transformed into the format used in the cleanup functions:
+# workflow name, accepted revision numbers (0 if none), acceptable run time (hours)
+def get_workflow_details(my_auth):
+    wf_details = {}
+    wf_query = "search/?type=Workflow&tags=current&tags=accepted&field=max_runtime" \
+        "&app_name!=No value&app_version!=No value&field=app_name&field=app_version"
+    workflows = ff_utils.search_metadata(wf_query, my_auth)
+    for wf in workflows:
+        app_name = wf.get('app_name')
+        app_version = wf.get('app_version')
+        run_time = wf.get('max_runtime', 0)
+        wf_details.setdefault(app_name, {})
+        wf_details[app_name].setdefault('accepted_versions', []).append(app_version)
+        wf_details[app_name].setdefault('run_time', run_time)
+        # for the unexpected case of different wf items with the same app_name having
+        # different run times - use the max value
+        if run_time > wf_details[app_name].get('run_time'):
+            wf_details[app_name]['run_time'] = run_time
+    # here is the transformation
+    # workflow_details = []
+    # for wfname, wf_info in wf_details.items():
+    #     workflow_details.append((wfname, wf_info.get('accepted_versions'), [], wf_info.get('run_time')))
+    return [(wfname, wf_details[wfname].get('accepted_versions', []), wf_details[wfname].get('run_time'))
+            for wfname in wf_details.keys()]
 
 
 def fetch_pf_associated(pf_id_or_dict, my_key):
@@ -124,11 +94,6 @@ def get_wfr_report(wfrs):
             except ValueError:
                 wfr_time = datetime.strptime(time_info, '%Y-%m-%d %H:%M:%S')
             run_hours = (datetime.utcnow() - wfr_time).total_seconds() / 3600
-            # try:
-            #     wfr_time = datetime.strptime(wfr_data['date_created'], '%Y-%m-%dT%H:%M:%S.%f+00:00')
-            # except ValueError:  # if it was exact second, no fraction is in value
-            #     print("wfr time bingo", wfr_uuid)
-            #     wfr_time = datetime.strptime(wfr_data['date_created'], '%Y-%m-%dT%H:%M:%S+00:00')
             output_files = wfr_data.get('output_files', None)
             output_uuids = []
             qc_uuids = []
@@ -158,7 +123,7 @@ def get_wfr_report(wfrs):
     return wfr_report
 
 
-def delete_wfrs(file_resp, my_key, delete=False, stash=None):
+def delete_wfrs(file_resp, my_key, workflow_details, delete=False, stash=None):
     # file_resp in embedded frame
     # stash: all related wfrs for file_resp
     deleted_wfrs = []  # reports WorkflowRun items deleted by this function
@@ -221,6 +186,7 @@ def _delete_action(wfr_to_del):
             return
 
     # CLEAN UP IF FILE IS DELETED
+    workflow_names = [wfinfo[0] for wfinfo in workflow_details]
     if file_resp['status'] == 'deleted':
         if file_resp.get('quality_metric'):
             if delete:
diff --git a/functions/wfr_settings.py b/functions/wfr_settings.py
deleted file mode 100644
index e3a7167..0000000
--- a/functions/wfr_settings.py
+++ /dev/null
@@ -1,450 +0,0 @@
-# Step Settings
-def step_settings(step_name, my_organism, attribution, params={}):
-    """Return a setting dict for given step, and modify variables in
-    output files; genome assembly, file_type, desc, contributing lab."""
-    genome = ""
-    mapper = {'human': 'GRCh38', 'mouse': 'GRCm38', 'fruit-fly': 'dm6', 'chicken': 'galGal5'}
-    genome = mapper.get(my_organism)
-
-    out_n = "This is an output file of the Hi-C processing pipeline"
-    int_n = "This is an intermediate file in the HiC processing pipeline"
-    out_n_rep = "This is an output file of the RepliSeq processing pipeline"
-    # int_n_rep = "This is an intermediate file in the Repliseq processing pipeline"
-
-    wf_dict = [
-        {
-            'wf_name': 'md5',
-            'wf_uuid': 'c77a117b-9a58-477e-aaa5-291a109a99f6',
-            'parameters': {}
-        },
-        {
-            'wf_name': 'fastqc-0-11-4-1',
-            'wf_uuid': '2324ad76-ff37-4157-8bcc-3ce72b7dace9',
-            'parameters': {}
-        },
-        {
-            'wf_name': 'bwa-mem',
-            'wf_uuid': '3feedadc-50f9-4bb4-919b-09a8b731d0cc',
-            'parameters': {"nThreads": 16},
-            'custom_pf_fields': {
-                'out_bam': {
-                    'genome_assembly': genome,
-                    'file_type': 'intermediate file',
-                    'description': int_n}
-            }
-        },
-        {
-            'wf_name': 'hi-c-processing-bam',
-            'wf_uuid': '023bfb3e-9a8b-42b9-a9d4-216079526f68',
-            'parameters': {"nthreads_merge": 16, "nthreads_parse_sort": 16},
-            'custom_pf_fields': {
-                'annotated_bam': {
-                    'genome_assembly': genome,
-                    'file_type': 'alignment',
-                    'description': out_n},
-                'filtered_pairs': {
-                    'genome_assembly': genome,
-                    'file_type': 'contact list-replicate',
-                    'description': out_n}
-            }
-        },
-        {
-            'wf_name': 'hi-c-processing-pairs',
-            'wf_uuid': '4dn-dcic-lab:wf-hi-c-processing-pairs-0.2.7',
-            'parameters': {"nthreads": 4,
-                           "maxmem": "32g",
-                           "max_split_cooler": 10,
-                           "no_balance": False
-                           },
-            'custom_pf_fields': {
-                'hic': {
-                    'genome_assembly': genome,
-                    'file_type': 'contact matrix',
-                    'description': out_n},
-                'mcool': {
-                    'genome_assembly': genome,
-                    'file_type': 'contact matrix',
-                    'description': out_n},
-                'merged_pairs': {
-                    'genome_assembly': genome,
-                    'file_type': 'contact list-combined',
-                    'description': out_n}
-            }
-        },
-        {
-            'wf_name': 'repliseq-parta',
-            'workflow_uuid': '4dn-dcic-lab:wf-repliseq-parta-v16.1',
-            "parameters": {"nthreads": 4, "memperthread": "2G"},
-            'custom_pf_fields': {
-                'filtered_sorted_deduped_bam': {
-                    'genome_assembly': genome,
-                    'file_type': 'alignments',
-                    'description': 'This is an output file of the RepliSeq processing pipeline'},
-                'count_bg_rpkm': {
-                    'genome_assembly': genome,
-                    'file_type': 'normalized counts',
-                    'description': 'read counts, unfiltered RPKM'},
-                'count_bg': {
-                    'genome_assembly': genome,
-                    'file_type': 'counts',
-                    'description': 'read counts, unfiltered, unnormalized'}
-            }
-        },
-        {
-            "wf_name": "bedGraphToBigWig",
-            "wf_uuid": "667b14a7-a47e-4857-adf1-12a6393c4b8e",
-            "parameters": {},
-            "config": {
-                "instance_type": "t2.micro",
-                "EBS_optimized": False,
-                "ebs_size": 10,
-                "ebs_type": "gp2",
-                "json_bucket": "4dn-aws-pipeline-run-json",
-                "ebs_iops": "",
-                "shutdown_min": "now",
-                "password": "",
-                "log_bucket": "tibanna-output",
-                "key_name": "4dn-encode"
-            },
-            "overwrite_input_extra": False
-        },
-        {
-            "wf_name": "bedtobeddb",
-            "wf_uuid": "9d575e99-5ffe-4ea4-b74f-ad40f621cd39",
-            "parameters": {},
-            "config": {
-                "instance_type": "m3.2xlarge",
-                "EBS_optimized": False,
-                "ebs_size": 10,
-                "ebs_type": "gp2",
-                "json_bucket": "4dn-aws-pipeline-run-json",
-                "ebs_iops": "",
-                "shutdown_min": "now",
-                "password": "",
-                "log_bucket": "tibanna-output",
-                "key_name": "4dn-encode"
-            },
-            "overwrite_input_extra": False
-        },
-        {
-            "wf_name": "encode-chipseq-aln-chip",
-            "wf_uuid": "4dn-dcic-lab:wf-encode-chipseq-aln-chip",
-            "parameters": {},
-            "config": {
-                "ebs_size": 0,
-                "ebs_type": "gp2",
-                "json_bucket": "4dn-aws-pipeline-run-json",
-                "EBS_optimized": "",
-                "ebs_iops": "",
-                "shutdown_min": "now",
-                "instance_type": "",
-                "password": "",
-                "log_bucket": "tibanna-output",
-                "key_name": "",
-                "cloudwatch_dashboard": True
-            },
-            'custom_pf_fields': {
-                'chip.first_ta': {
-                    'genome_assembly': genome,
-                    'file_type': 'read positions',
-                    'description': 'Positions of aligned reads in bed format, one line per read mate, for control experiment, from ENCODE ChIP-Seq Pipeline'},
-                'chip.first_ta_xcor': {
-                    'genome_assembly': genome,
-                    'file_type': 'intermediate file',
-                    'description': 'Counts file used only for QC'}
-            }
-        },
-        {
-            "wf_name": "encode-chipseq-aln-ctl",
-            "wf_uuid": "4dn-dcic-lab:wf-encode-chipseq-aln-ctl",
-            "parameters": {},
-            "config": {
-                "ebs_size": 0,
-                "ebs_type": "gp2",
- "json_bucket": "4dn-aws-pipeline-run-json", - "EBS_optimized": "", - "ebs_iops": "", - "shutdown_min": 'now', - "instance_type": "", - "password": "", - "log_bucket": "tibanna-output", - "key_name": "", - "cloudwatch_dashboard": True - }, - 'custom_pf_fields': { - 'chip.first_ta_ctl': { - 'genome_assembly': genome, - 'file_type': 'read positions', - 'description': 'Positions of aligned reads in bed format, one line per read mate, for control experiment, from ENCODE ChIP-Seq Pipeline'} - } - }, - { - "wf_name": "encode-chipseq-postaln", - "wf_uuid": "4dn-dcic-lab:wf-encode-chipseq-postaln", - "parameters": {}, - "config": { - "ebs_size": 0, - "ebs_type": "gp2", - "json_bucket": "4dn-aws-pipeline-run-json", - "EBS_optimized": "", - "ebs_iops": "", - "shutdown_min": "now", - "instance_type": "", - "password": "", - "log_bucket": "tibanna-output", - "key_name": "", - "cloudwatch_dashboard": True - }, - 'custom_pf_fields': { - 'chip.optimal_peak': { - 'genome_assembly': genome, - 'file_type': 'peaks', - 'description': 'Peak calls from ENCODE ChIP-Seq Pipeline'}, - 'chip.conservative_peak': { - 'genome_assembly': genome, - 'file_type': 'conservative peaks', - 'description': 'Conservative peak calls from ENCODE ChIP-Seq Pipeline'}, - 'chip.sig_fc': { - 'genome_assembly': genome, - 'file_type': 'signal fold change', - 'description': 'ChIP-seq signal fold change over input control'} - } - }, - { - "wf_name": "encode-atacseq-aln", - "wf_uuid": "4dn-dcic-lab:wf-encode-atacseq-aln", - "parameters": {}, - "config": { - "ebs_size": 0, - "ebs_type": "gp2", - "json_bucket": "4dn-aws-pipeline-run-json", - "EBS_optimized": "", - "ebs_iops": "", - "shutdown_min": 'now', - "instance_type": "", - "password": "", - "log_bucket": "tibanna-output", - "key_name": "", - "cloudwatch_dashboard": True - }, - 'custom_pf_fields': { - 'atac.first_ta': { - 'genome_assembly': genome, - 'file_type': 'read positions', - 'description': 'Positions of aligned reads in bed format, one line per read mate, from ENCODE ATAC-Seq Pipeline'} - } - }, - { - "wf_name": "encode-atacseq-postaln", - "wf_uuid": "4dn-dcic-lab:wf-encode-atacseq-postaln", - "parameters": {}, - "config": { - "ebs_size": 0, - "ebs_type": "gp2", - "json_bucket": "4dn-aws-pipeline-run-json", - "EBS_optimized": "", - "ebs_iops": "", - "shutdown_min": "now", - "instance_type": "", - "password": "", - "log_bucket": "tibanna-output", - "key_name": "", - "cloudwatch_dashboard": True - }, - 'custom_pf_fields': { - 'atac.optimal_peak': { - 'genome_assembly': genome, - 'file_type': 'peaks', - 'description': 'Peak calls from ENCODE ATAC-Seq Pipeline'}, - 'atac.conservative_peak': { - 'genome_assembly': genome, - 'file_type': 'conservative peaks', - 'description': 'Conservative peak calls from ENCODE ATAC-Seq Pipeline'}, - 'atac.sig_fc': { - 'genome_assembly': genome, - 'file_type': 'signal fold change', - 'description': 'ATAC-seq signal fold change'} - } - }, - { - "wf_name": "mergebed", - "wf_uuid": "2b10e472-065e-43ed-992c-fccad6417b65", - "parameters": {"sortv": "0"}, - "config": { - "ebs_size": 0, - "ebs_type": "gp2", - "json_bucket": "4dn-aws-pipeline-run-json", - "EBS_optimized": "", - "ebs_iops": "", - "shutdown_min": "now", - "instance_type": "", - "password": "", - "log_bucket": "tibanna-output", - "key_name": "", - "cloudwatch_dashboard": True - }, - 'custom_pf_fields': { - 'merged_bed': { - 'genome_assembly': genome, - 'file_type': 'read positions', - 'description': 'Merged file, positions of aligned reads in bed format, one line per read mate'} - } - }, - { - 
"app_name": "insulation-scores-and-boundaries-caller", - "workflow_uuid": "dc9efc2d-baa5-4304-b72b-14610d8d5fc4", - "parameters": {"binsize": -1, "windowsize": 100000}, - "config": {'mem': 32}, - 'custom_pf_fields': { - 'bwfile': { - 'genome_assembly': genome, - 'file_type': 'insulation score-diamond', - 'description': 'Diamond insulation scores calls on Hi-C contact matrices'}, - 'bedfile': { - 'genome_assembly': genome, - 'file_type': 'boundaries', - 'description': 'Boundaries calls on Hi-C contact matrices'} - } - }, - { - "app_name": "compartments-caller", - "workflow_uuid": "d07fa5d4-8721-403e-89b5-e8f323ac9ece", - "parameters": {"binsize": 250000, "contact_type": "cis"}, - "config": {'mem': 4, 'cpu': 1, 'ebs_size': '1.1x', 'EBS_optimized': 'false'}, - 'custom_pf_fields': { - 'bwfile': { - 'genome_assembly': genome, - 'file_type': 'compartments', - 'description': 'Compartments signals on Hi-C contact matrices'} - }, - }, - { - "app_name": "rna-strandedness", - "workflow_uuid": "af97597e-877a-40b7-b211-98ec0cfb17b4", - 'config': {'mem': 2, 'cpu': 2, "instance_type": "t3.small", 'ebs_size': '1.1x', 'EBS_optimized': 'false'} - }, - # RNA SEQ - { - "app_name": "encode-rnaseq-stranded", - "workflow_uuid": "4dn-dcic-lab:wf-encode-rnaseq-stranded", - "parameters": { - 'rna.strandedness': 'stranded', - 'rna.strandedness_direction': '', - 'rna.endedness': '' - }, - 'custom_pf_fields': { - 'rna.outbam': { - 'genome_assembly': genome, - 'file_type': 'read positions', - 'description': 'Output file from RNA seq pipeline' - }, - 'rna.plusbw': { - 'genome_assembly': genome, - 'file_type': 'read counts (plus)', - 'description': 'Output file from RNA seq pipeline' - }, - 'rna.minusbw': { - 'genome_assembly': genome, - 'file_type': 'read counts (minus)', - 'description': 'Output file from RNA seq pipeline' - }, - 'rna.gene_expression': { - 'genome_assembly': genome, - 'file_type': 'gene expression', - 'description': 'Output file from RNA seq pipeline' - }, - 'rna.isoform_expression': { - 'genome_assembly': genome, - 'file_type': 'isoform expression', - 'description': 'Output file from RNA seq pipeline' - } - } - }, - { - "app_name": "encode-rnaseq-unstranded", - "workflow_uuid": "4dn-dcic-lab:wf-encode-rnaseq-unstranded", - "parameters": { - 'rna.strandedness': 'unstranded', - 'rna.strandedness_direction': 'unstranded', - 'rna.endedness': 'paired' - }, - 'custom_pf_fields': { - 'rna.outbam': { - 'genome_assembly': genome, - 'file_type': 'read positions', - 'description': 'Output file from RNA seq pipeline' - }, - 'rna.outbw': { - 'genome_assembly': genome, - 'file_type': 'read counts', - 'description': 'Output file from RNA seq pipeline' - }, - 'rna.gene_expression': { - 'genome_assembly': genome, - 'file_type': 'gene expression', - 'description': 'Output file from RNA seq pipeline' - }, - 'rna.isoform_expression': { - 'genome_assembly': genome, - 'file_type': 'isoform expression', - 'description': 'Output file from RNA seq pipeline' - } - } - }, - { - "app_name": "bamqc", - "workflow_uuid": "42683ab1-59bf-4ec5-a973-030053a134f1", - "overwrite_input_extra": False, - "config": {"ebs_size": 10} - }, - { - "app_name": "fastq-first-line", - "workflow_uuid": "93a1a931-d55d-4623-adfb-0fa735daf6ae", - "overwrite_input_extra": False, - 'config': {'mem': 2, 'cpu': 2, "instance_type": "t3.small"} - }, - { - "app_name": "re_checker_workflow", - "workflow_uuid": "8479d16e-667a-41e9-8ace-391128f50dc5", - "parameters": {}, - "config": { - "mem": 4, - "ebs_size": 10, - "instance_type": "t3.medium" - } - }, - { - 
"app_name": "mad_qc_workflow", - "workflow_uuid": "4dba38f0-af7a-4432-88e4-ca804dea64f8", - "parameters": {}, - "config": {"ebs_size": 10, "instance_type": "t3.medium"} - }, - { - "app_name": "mcoolQC", - "workflow_uuid": "0bf9f47a-dec1-4324-9b41-fa183880a7db", - "overwrite_input_extra": False, - "config": {"ebs_size": 10, "instance_type": "c5ad.2xlarge"} - }, - # temp - { - "app_name": "", - "workflow_uuid": "", - "parameters": {}, - 'custom_pf_fields': { - '': { - 'genome_assembly': genome, - 'file_type': '', - 'description': ''} - } - } - ] - # if params, overwrite parameters - template = [i for i in wf_dict if i['wf_name'] == step_name][0] - if params: - template['parameters'] = params - - if template.get('custom_pf_fields'): - for a_file in template['custom_pf_fields']: - template['custom_pf_fields'][a_file].update(attribution) - template['wfr_meta'] = attribution - return template diff --git a/notebooks/useful_notebooks/01_find_and_release.ipynb b/notebooks/useful_notebooks/01_find_and_release.ipynb index 208c397..c472ee2 100644 --- a/notebooks/useful_notebooks/01_find_and_release.ipynb +++ b/notebooks/useful_notebooks/01_find_and_release.ipynb @@ -26,7 +26,7 @@ "source": [ "from dcicutils import ff_utils\n", "from functions.notebook_functions import *\n", - "from functions.cleanup import *\n", + "from functions.cleanup import get_workflow_details, delete_wfrs\n", "import time\n", "\n", "# status mapping for ordering purposes\n", @@ -195,6 +195,12 @@ "# do you want to check for duplicate/problematic runs on files?\n", "# it will take some time\n", "check_wfrs = True\n", + "# get workflow_details in format to use in these checks\n", + "# NOTE: this format is different than that used in foursight\n", + "wf_details = None\n", + "if check_wfrs:\n", + " wf_details = get_workflow_details(my_auth)\n", + "\n", "# if any are found do you want to remove them?\n", "delete_problematic = False\n", "\n", @@ -251,7 +257,7 @@ " if not a_file.get('md5sum'):\n", " print(a_file['accession'], 'md5 was not calculated during upload, missing md5sum')\n", " if check_wfrs:\n", - " new_deleted_items = delete_wfrs(a_file, my_auth, delete=delete_problematic, stash=stash)\n", + " new_deleted_items = delete_wfrs(a_file, my_auth, wf_details, delete=delete_problematic, stash=stash)\n", " if new_deleted_items:\n", " deleted_items.extend(new_deleted_items)\n", "\n", @@ -286,7 +292,7 @@ " if not a_file.get('source_experiments'):\n", " print(a_file['accession'], 'user submitted or produced by sbg runs')\n", " if check_wfrs:\n", - " new_deleted_items = delete_wfrs(a_file, my_auth, delete=delete_problematic, stash=stash)\n", + " new_deleted_items = delete_wfrs(a_file, my_auth, wf_details, delete=delete_problematic, stash=stash)\n", " if new_deleted_items:\n", " deleted_items.extend(new_deleted_items)\n", "\n", From cafe3a0367c820b3a73c5b4af7af4495c3de2d66 Mon Sep 17 00:00:00 2001 From: aschroed Date: Thu, 8 Aug 2024 15:00:00 -0400 Subject: [PATCH 2/8] add pdb --- functions/cleanup.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/functions/cleanup.py b/functions/cleanup.py index 808f929..5e3a057 100644 --- a/functions/cleanup.py +++ b/functions/cleanup.py @@ -201,6 +201,8 @@ def _delete_action(wfr_to_del): else: wfr_report = get_wfr_report(wfrs) for wfr_to_del in wfr_report: + if wfr_to_del['uuid'] == '15700187-3843-4062-95ff-57c8ac913a1d': + import pdb; pdb.set_trace() if wfr_to_del['status'] != 'deleted': if wfr_to_del['wfr_name'] not in workflow_names: print('Unlisted Workflow', wfr_to_del['wfr_name'], 'deleted 
From c97daedb314d74b042eb691407f31f702bb5ff59 Mon Sep 17 00:00:00 2001
From: aschroed
Date: Thu, 8 Aug 2024 15:07:03 -0400
Subject: [PATCH 3/8] move pdb

---
 functions/cleanup.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/functions/cleanup.py b/functions/cleanup.py
index 5e3a057..5f8db9b 100644
--- a/functions/cleanup.py
+++ b/functions/cleanup.py
@@ -151,6 +151,8 @@ def delete_wfrs(file_resp, my_key, workflow_details, delete=False, stash=None):
     wfr_uuids = [i['uuid'] for i in file_resp.get('workflow_run_inputs')]
     wfrs = []
     if wfr_uuids:
+        if '15700187-3843-4062-95ff-57c8ac913a1d' in wfr_uuids:
+            import pdb; pdb.set_trace()
         # fetch them from stash
         if stash:
             wfrs = [i for i in stash if i['uuid'] in wfr_uuids]
@@ -201,8 +203,6 @@ def _delete_action(wfr_to_del):
     else:
         wfr_report = get_wfr_report(wfrs)
         for wfr_to_del in wfr_report:
-            if wfr_to_del['uuid'] == '15700187-3843-4062-95ff-57c8ac913a1d':
-                import pdb; pdb.set_trace()
             if wfr_to_del['status'] != 'deleted':
                 if wfr_to_del['wfr_name'] not in workflow_names:
                     print('Unlisted Workflow', wfr_to_del['wfr_name'], 'deleted file workflow',

From c084fb9f740668531227c8f73c4ee5239f4b87ae Mon Sep 17 00:00:00 2001
From: aschroed
Date: Thu, 8 Aug 2024 15:14:39 -0400
Subject: [PATCH 4/8] remove debugger; add print

---
 functions/cleanup.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/functions/cleanup.py b/functions/cleanup.py
index 5f8db9b..d279c78 100644
--- a/functions/cleanup.py
+++ b/functions/cleanup.py
@@ -152,7 +152,7 @@ def delete_wfrs(file_resp, my_key, workflow_details, delete=False, stash=None):
     wfrs = []
     if wfr_uuids:
         if '15700187-3843-4062-95ff-57c8ac913a1d' in wfr_uuids:
-            import pdb; pdb.set_trace()
+            print("WE HAVE THE PROBLEM HERE")
         # fetch them from stash
         if stash:
             wfrs = [i for i in stash if i['uuid'] in wfr_uuids]

From ee57d6c9bbfe17ee3959792e30bd81a770176586 Mon Sep 17 00:00:00 2001
From: aschroed
Date: Thu, 8 Aug 2024 15:39:12 -0400
Subject: [PATCH 5/8] deal with skipped pf with no output_wfr

---
 functions/cleanup.py | 13 ++++++-------
 1 file changed, 6 insertions(+), 7 deletions(-)

diff --git a/functions/cleanup.py b/functions/cleanup.py
index d279c78..caefdac 100644
--- a/functions/cleanup.py
+++ b/functions/cleanup.py
@@ -135,12 +135,13 @@ def delete_wfrs(file_resp, my_key, workflow_details, delete=False, stash=None):
     # do not delete output wfrs of control files
     output_wfrs = file_resp.get('workflow_run_outputs')
     if not output_wfrs:
-        if file_type == 'files-processed':
-            # user submtted processed files
-            return
-        else:
+        #if file_type == 'files-processed':
+        # user submitted processed files but we still want to see if there is an associated errored wfr
+        #    return
+        # continue
+        #else:
             # raw files:
-            pass
+        pass
     else:
         output_wfr = output_wfrs[0]
         wfr_type, _ = output_wfr['display_title'].split(' run ')

From 837b9f0168eda1264a1498b974097925fe934e68 Mon Sep 17 00:00:00 2001
From: aschroed
Date: Thu, 8 Aug 2024 15:45:34 -0400
Subject: [PATCH 6/8] removed debugging and updated so user uploaded processed
 files weren't skipped

---
 functions/cleanup.py | 8 ++------
 1 file changed, 2 insertions(+), 6 deletions(-)

diff --git a/functions/cleanup.py b/functions/cleanup.py
index caefdac..4948078 100644
--- a/functions/cleanup.py
+++ b/functions/cleanup.py
@@ -135,12 +135,8 @@ def delete_wfrs(file_resp, my_key, workflow_details, delete=False, stash=None):
     # do not delete output wfrs of control files
     output_wfrs = file_resp.get('workflow_run_outputs')
     if not output_wfrs:
-        #if file_type == 'files-processed':
-        # user submitted processed files but we still want to see if there is an associated errored wfr
-        #    return
-        # continue
-        #else:
-            # raw files:
+        # user submitted and raw files generally lack wfr_outputs, but they can still have
+        # duplicate and errored runs, so the return (for file_processed) was changed to pass for all
         pass
     else:
         output_wfr = output_wfrs[0]

From d01787337c1b9ec538878f565c2d92662b9f207a Mon Sep 17 00:00:00 2001
From: aschroed
Date: Thu, 8 Aug 2024 15:54:23 -0400
Subject: [PATCH 7/8] change log and version bump

---
 CHANGELOG.rst        | 8 ++++++++
 functions/cleanup.py | 6 +++---
 2 files changed, 11 insertions(+), 3 deletions(-)

diff --git a/CHANGELOG.rst b/CHANGELOG.rst
index 95f2032..67c3414 100644
--- a/CHANGELOG.rst
+++ b/CHANGELOG.rst
@@ -7,6 +7,14 @@ Change Log
 ------
 
+3.2.0
+=====
+
+* refactor of cleanup.py to get info about accepted workflow versions and run times from the db and remove the hard-coded array
+* updated the find_and_release notebook to use this - not backward compatible
+* corrected a small bug in the delete_wfrs function that failed to clean up errored or duplicated workflow runs on user-submitted files, as they are not outputs of wfrs
+
+
 3.1.1
 =====
 
diff --git a/functions/cleanup.py b/functions/cleanup.py
index 4948078..34afba9 100644
--- a/functions/cleanup.py
+++ b/functions/cleanup.py
@@ -2,7 +2,7 @@ from datetime import datetime
 
 
-# function to get workflow_details info from db 
+# function to get workflow_details info from db
 # the initial data structure, which is the same one used to get info for foursight,
 # is transformed into the format used in the cleanup functions:
 # workflow name, accepted revision numbers (0 if none), acceptable run time (hours)
 def get_workflow_details(my_auth):
@@ -86,7 +86,7 @@ def get_wfr_report(wfrs):
             # skip all style awsem runs
             try:
                 wfr_type_base, wfr_version = wfr_type.strip().split(' ')
-            except:
+            except Exception:
                 continue
             time_info = time_info.strip('on').strip()
             try:
@@ -141,7 +141,7 @@ def delete_wfrs(file_resp, my_key, workflow_details, delete=False, stash=None):
     else:
         output_wfr = output_wfrs[0]
         wfr_type, _ = output_wfr['display_title'].split(' run ')
-    if wfr_type in ['encode-chipseq-aln-ctl 1.1.1', 'encode-chipseq-aln-ctl 2.1.6'] :
+    if wfr_type in ['encode-chipseq-aln-ctl 1.1.1', 'encode-chipseq-aln-ctl 2.1.6']:
         print('skipping control file for wfr check', file_resp['accession'])
         return

From 364d4ca6080e7b32949a227bef6c374ea5c58664 Mon Sep 17 00:00:00 2001
From: aschroed
Date: Thu, 8 Aug 2024 16:07:59 -0400
Subject: [PATCH 8/8] update pyproject version

---
 pyproject.toml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pyproject.toml b/pyproject.toml
index f13c9a9..3a50557 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "dcicwrangling"
-version = "3.1.1"
+version = "3.2.0"
 description = "Scripts and Jupyter notebooks for 4DN wrangling"
 authors = ["4DN-DCIC Team "]
 license = "MIT"
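
For reference, a minimal sketch of how the helpers refactored in this series are meant to be wired together, following the usage in the updated find_and_release notebook. The function names and signatures (get_workflow_details, delete_wfrs) come from the patches above; the contents of the auth key dict, the search query, and the printed summary are illustrative assumptions, not part of the patch set:

    from dcicutils import ff_utils
    from functions.cleanup import get_workflow_details, delete_wfrs

    # illustrative auth dict -- substitute real portal credentials
    my_auth = {'key': 'XXXXXXXX', 'secret': 'xxxxxxxx',
               'server': 'https://data.4dnucleome.org/'}

    # workflow details now come from tagged Workflow items in the db instead of
    # the old hard-coded array; each entry is (app_name, accepted_versions, run_time)
    wf_details = get_workflow_details(my_auth)

    # dry run: report (but do not delete) problematic workflow runs on a few files;
    # delete_wfrs expects file metadata in embedded frame (illustrative query)
    search = 'search/?type=FileProcessed&limit=5&frame=embedded'
    deleted_items = []
    for a_file in ff_utils.search_metadata(search, my_auth):
        new_deleted_items = delete_wfrs(a_file, my_auth, wf_details, delete=False)
        if new_deleted_items:
            deleted_items.extend(new_deleted_items)
    print(len(deleted_items), 'problematic workflow runs flagged')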