From 4743ef28c9ccc8edc9454cf9124507820790c75b Mon Sep 17 00:00:00 2001
From: Mingjie Li <44071821+mingjiecn@users.noreply.github.com>
Date: Wed, 8 Feb 2023 18:50:09 -0600
Subject: [PATCH] DSERV-81-footprint-assay-term-name (#78)

---
 genomic_data_service/region_indexer.py      | 38 +++++++++++++--------
 genomic_data_service/region_indexer_task.py | 13 ++++---
 2 files changed, 31 insertions(+), 20 deletions(-)

diff --git a/genomic_data_service/region_indexer.py b/genomic_data_service/region_indexer.py
index 937925d..948f81b 100644
--- a/genomic_data_service/region_indexer.py
+++ b/genomic_data_service/region_indexer.py
@@ -3,6 +3,7 @@
     RegionIndexerElasticSearch,
 )
 import requests
+from requests.adapters import HTTPAdapter, Retry
 import pickle
 from genomic_data_service.constants import FILE_HG19
 import argparse
@@ -123,6 +124,7 @@
     'https://www.encodeproject.org/search/?control_type!=*&status=released&perturbed=false&assay_title=Histone+ChIP-seq&target.label=H3K27ac&target.label=H3K36me3&target.label=H3K4me3&target.label=H3K4me1&target.label=H3K27me3&replicates.library.biosample.donor.organism.scientific_name=Homo+sapiens&assembly=GRCh38&files.file_type=bed+narrowPeak&type=Experiment&files.analyses.status=released&files.preferred_default=true&limit=all&format=json'
     + '&field=files.accession&field=files.preferred_default&field=files.file_format&field=files.analyses.@id&field=default_analysis'
 )
+
 parser = argparse.ArgumentParser(
     description='indexing files for genomic data service.'
 )
@@ -149,6 +151,11 @@
     choices=['RegulomeDB_2_0', 'RegulomeDB_2_1'],
 )
 
+session = requests.Session()
+retries = Retry(total=5, backoff_factor=1,
+                status_forcelist=[500, 502, 503, 504])
+session.mount('https://', HTTPAdapter(max_retries=retries))
+
 
 def clean_up(obj, fields):
     clean_obj = {}
@@ -185,7 +192,7 @@ def encode_graph(query):
     query += ['field=*', 'limit=all', 'format=json']
 
     endpoint = f"{ENCODE_DOMAIN}/search/?{'&'.join(query)}"
-    return requests.get(endpoint).json()['@graph']
+    return session.get(endpoint).json()['@graph']
 
 
 def need_to_fetch_documents(dataset):
@@ -218,7 +225,7 @@ def fetch_documents(dataset):
     documents = []
     for document_id in dataset.get('documents', []):
         endpoint = f'{ENCODE_DOMAIN}{document_id}?format=json'
-        documents.append(requests.get(endpoint).json())
+        documents.append(session.get(endpoint).json())
 
     dataset['documents'] = documents
 
@@ -329,38 +336,39 @@ def make_pickle_file(encode_accessions):
 def get_encode_accessions_from_portal():
     encode_accessions = []
     # get files in experiment TF ChIP-seq using assembly GRCh38
-    experiments = requests.get(
+    experiments = session.get(
         TF_CHIP_SEQ_EXPS_GRCH38_ENDPOINT).json()['@graph']
     # get files in experiment DNase-seq using assembly GRCh38
-    experiments.extend(requests.get(
+    experiments.extend(session.get(
         DNASE_SEQ_EXPS_GRCH38_ENDPOINT).json()['@graph'])
     # get files in experiment ATAC-seq using assembly GRCh38
-    experiments.extend(requests.get(
+    experiments.extend(session.get(
         ATAC_SEQ_EXPS_GRCH38_ENDPOINT).json()['@graph'])
     # get files in experiment histone ChIP-seq using assembly GRCh38
-    experiments.extend(requests.get(
+    experiments.extend(session.get(
         HISTONE_CHIP_SEQ_EXPS_GRCH38_ENDPOINT).json()['@graph'])
     # get files in footprints
-    annotations = requests.get(
+    annotations = session.get(
         FOOTPRINT_ANNOTATIONS_GRCH38_ENDPOINT).json()['@graph']
     # get files in PWMs
-    annotations.extend(requests.get(
+    annotations.extend(session.get(
         PWM_ANNOTATIONS_GRCH38_ENDPOINT).json()['@graph'])
     # get files in eQTLs
-    annotations.extend(requests.get(
+    annotations.extend(session.get(
         EQTL_ANNOTATIONS_GRCH38_ENDPOINT).json()['@graph'])
     # get files for chromatin state for grch38
-    chromatin_state_files = requests.get(
+    chromatin_state_files = session.get(
         CHROMATIN_STATE_FILES_GRCH38_ENDPOINT).json()['@graph']
     # get ds_qtl annotations for grch38
-    ds_qtls = requests.get(CAQTL_ANNOTATIONS_GRCH38_ENDPOINT).json()['@graph']
+    ds_qtls = session.get(CAQTL_ANNOTATIONS_GRCH38_ENDPOINT).json()['@graph']
 
     for experiment in experiments:
         files = experiment.get('files', [])
-        default_analysis_id = experiment['default_analysis']
-        for file in files:
-            if is_preferred_default_bed_from_default_analysis(default_analysis_id, file):
-                encode_accessions.append(file['accession'])
+        default_analysis_id = experiment.get('default_analysis')
+        if default_analysis_id:
+            for file in files:
+                if is_preferred_default_bed_from_default_analysis(default_analysis_id, file):
+                    encode_accessions.append(file['accession'])
 
     for annotation in annotations:
         files = annotation.get('files', [])
diff --git a/genomic_data_service/region_indexer_task.py b/genomic_data_service/region_indexer_task.py
index 2513609..499a04f 100644
--- a/genomic_data_service/region_indexer_task.py
+++ b/genomic_data_service/region_indexer_task.py
@@ -423,16 +423,19 @@ def metadata_doc(file_uuid, file_metadata, dataset_metadata):
     assay_title = dataset_metadata.get('assay_title')
     if assay_title == 'Histone ChIP-seq':
         meta_doc['dataset']['collection_type'] = assay_title
+    # Footprints have both assay_term_name (a list) and annotation_type.
     elif dataset_metadata.get('annotation_type') == 'footprints':
         meta_doc['dataset']['collection_type'] = 'footprints'
-        if dataset_metadata.get('assay_term_name') and 'ATAC-seq' in dataset_metadata.get('assay_term_name'):
-            meta_doc['dataset']['footprint_assay_term_name'] = 'ATAC-seq'
-        else:
-            meta_doc['dataset']['footprint_assay_term_name'] = 'DNase-seq'
+        assay_term_name = dataset_metadata.get('assay_term_name')
+        if assay_term_name:
+            if 'ATAC-seq' in assay_term_name:
+                meta_doc['dataset']['footprint_assay_term_name'] = 'ATAC-seq'
+            elif 'DNase-seq' in assay_term_name:
+                meta_doc['dataset']['footprint_assay_term_name'] = 'DNase-seq'
     else:
         # regulome use three type of datasets: experiments, annotations and references. experiements has property assay_term_name, annotations has property annotation_type, references has property reference_type.
         # Those properties will be indexed as dataset collection_type in regulome datase base.
-        # Annotations can have both assay_term_name and annotation_type, for example, imputations and gkm-SVMs, but we don't use those datasets in regulome.
+        # Annotations can have both assay_term_name and annotation_type (for example footprints, imputations and gkm-SVMs); of these, only footprints are used in regulome.
         for prop in REGULOME_COLLECTION_TYPES:
             prop_value = dataset_metadata.get(prop)
             if prop_value: