From 4743ef28c9ccc8edc9454cf9124507820790c75b Mon Sep 17 00:00:00 2001 From: Mingjie Li <44071821+mingjiecn@users.noreply.github.com> Date: Wed, 8 Feb 2023 18:50:09 -0600 Subject: [PATCH] DSERV-81-footprint-assay-term-name (#78) --- genomic_data_service/region_indexer.py | 38 +++++++++++++-------- genomic_data_service/region_indexer_task.py | 13 ++++--- 2 files changed, 31 insertions(+), 20 deletions(-) diff --git a/genomic_data_service/region_indexer.py b/genomic_data_service/region_indexer.py index 937925d..948f81b 100644 --- a/genomic_data_service/region_indexer.py +++ b/genomic_data_service/region_indexer.py @@ -3,6 +3,7 @@ RegionIndexerElasticSearch, ) import requests +from requests.adapters import HTTPAdapter, Retry import pickle from genomic_data_service.constants import FILE_HG19 import argparse @@ -123,6 +124,7 @@ 'https://www.encodeproject.org/search/?control_type!=*&status=released&perturbed=false&assay_title=Histone+ChIP-seq&target.label=H3K27ac&target.label=H3K36me3&target.label=H3K4me3&target.label=H3K4me1&target.label=H3K27me3&replicates.library.biosample.donor.organism.scientific_name=Homo+sapiens&assembly=GRCh38&files.file_type=bed+narrowPeak&type=Experiment&files.analyses.status=released&files.preferred_default=true&limit=all&format=json' + '&field=files.accession&field=files.preferred_default&field=files.file_format&field=files.analyses.@id&field=default_analysis' ) + parser = argparse.ArgumentParser( description='indexing files for genomic data service.' 
) @@ -149,6 +151,11 @@ choices=['RegulomeDB_2_0', 'RegulomeDB_2_1'], ) +session = requests.Session() +retries = Retry(total=5, backoff_factor=1, + status_forcelist=[500, 502, 503, 504]) +session.mount('https://', HTTPAdapter(max_retries=retries)) + def clean_up(obj, fields): clean_obj = {} @@ -185,7 +192,7 @@ def encode_graph(query): query += ['field=*', 'limit=all', 'format=json'] endpoint = f"{ENCODE_DOMAIN}/search/?{'&'.join(query)}" - return requests.get(endpoint).json()['@graph'] + return session.get(endpoint).json()['@graph'] def need_to_fetch_documents(dataset): @@ -218,7 +225,7 @@ def fetch_documents(dataset): documents = [] for document_id in dataset.get('documents', []): endpoint = f'{ENCODE_DOMAIN}{document_id}?format=json' - documents.append(requests.get(endpoint).json()) + documents.append(session.get(endpoint).json()) dataset['documents'] = documents @@ -329,38 +336,39 @@ def make_pickle_file(encode_accessions): def get_encode_accessions_from_portal(): encode_accessions = [] # get files in experiment TF ChIP-seq using assembly GRCh38 - experiments = requests.get( + experiments = session.get( TF_CHIP_SEQ_EXPS_GRCH38_ENDPOINT).json()['@graph'] # get files in experiment DNase-seq using assembly GRCh38 - experiments.extend(requests.get( + experiments.extend(session.get( DNASE_SEQ_EXPS_GRCH38_ENDPOINT).json()['@graph']) # get files in experiment ATAC-seq using assembly GRCh38 - experiments.extend(requests.get( + experiments.extend(session.get( ATAC_SEQ_EXPS_GRCH38_ENDPOINT).json()['@graph']) # get files in experiment histone ChIP-seq using assembly GRCh38 - experiments.extend(requests.get( + experiments.extend(session.get( HISTONE_CHIP_SEQ_EXPS_GRCH38_ENDPOINT).json()['@graph']) # get files in footprints - annotations = requests.get( + annotations = session.get( FOOTPRINT_ANNOTATIONS_GRCH38_ENDPOINT).json()['@graph'] # get files in PWMs - annotations.extend(requests.get( + annotations.extend(session.get( PWM_ANNOTATIONS_GRCH38_ENDPOINT).json()['@graph']) # 
get files in eQTLs - annotations.extend(requests.get( + annotations.extend(session.get( EQTL_ANNOTATIONS_GRCH38_ENDPOINT).json()['@graph']) # get files for chromatin state for grch38 - chromatin_state_files = requests.get( + chromatin_state_files = session.get( CHROMATIN_STATE_FILES_GRCH38_ENDPOINT).json()['@graph'] # get ds_qtl annotations for grch38 - ds_qtls = requests.get(CAQTL_ANNOTATIONS_GRCH38_ENDPOINT).json()['@graph'] + ds_qtls = session.get(CAQTL_ANNOTATIONS_GRCH38_ENDPOINT).json()['@graph'] for experiment in experiments: files = experiment.get('files', []) - default_analysis_id = experiment['default_analysis'] - for file in files: - if is_preferred_default_bed_from_default_analysis(default_analysis_id, file): - encode_accessions.append(file['accession']) + default_analysis_id = experiment.get('default_analysis') + if default_analysis_id: + for file in files: + if is_preferred_default_bed_from_default_analysis(default_analysis_id, file): + encode_accessions.append(file['accession']) for annotation in annotations: files = annotation.get('files', []) diff --git a/genomic_data_service/region_indexer_task.py b/genomic_data_service/region_indexer_task.py index 2513609..499a04f 100644 --- a/genomic_data_service/region_indexer_task.py +++ b/genomic_data_service/region_indexer_task.py @@ -423,16 +423,19 @@ def metadata_doc(file_uuid, file_metadata, dataset_metadata): assay_title = dataset_metadata.get('assay_title') if assay_title == 'Histone ChIP-seq': meta_doc['dataset']['collection_type'] = assay_title + # footprints have both assay_term_name(a list) and annotation_type elif dataset_metadata.get('annotation_type') == 'footprints': meta_doc['dataset']['collection_type'] = 'footprints' - if dataset_metadata.get('assay_term_name') and 'ATAC-seq' in dataset_metadata.get('assay_term_name'): - meta_doc['dataset']['footprint_assay_term_name'] = 'ATAC-seq' - else: - meta_doc['dataset']['footprint_assay_term_name'] = 'DNase-seq' + assay_term_name = 
dataset_metadata.get('assay_term_name') + if assay_term_name: + if 'ATAC-seq' in assay_term_name: + meta_doc['dataset']['footprint_assay_term_name'] = 'ATAC-seq' + elif 'DNase-seq' in assay_term_name: + meta_doc['dataset']['footprint_assay_term_name'] = 'DNase-seq' else: # regulome use three type of datasets: experiments, annotations and references. experiements has property assay_term_name, annotations has property annotation_type, references has property reference_type. # Those properties will be indexed as dataset collection_type in regulome datase base. - # Annotations can have both assay_term_name and annotation_type, for example, imputations and gkm-SVMs, but we don't use those datasets in regulome. + # Annotations can have both assay_term_name and annotation_type (for example footprints, imputations and gkm-SVMs); of these, regulome uses footprints, which are handled by the branch above. for prop in REGULOME_COLLECTION_TYPES: prop_value = dataset_metadata.get(prop) if prop_value: