Tradis vault bulk import from xlsx script

AU-Biocommons · Dec 11, 2023 · 50a89da · 50a89da
1 parent b51c2ab
commit 50a89da
Show file tree

Hide file tree

Showing 5 changed files with 172 additions and 1 deletion.
diff --git a/apollo_portal/tracks/management/commands/import_genomes.py b/apollo_portal/tracks/management/commands/import_genomes.py
@@ -0,0 +1,88 @@
+"""Bulk import Genome records to the database from a JSON file.
+
+Expect a JSON array of objects, each shaped like:
+
+{
+    "group_name": value,
+    "lab_name": value,
+    "name": value,
+    "description": value,
+    "reference": value,
+    "doi": value,
+    "strain": value,
+    "species": value,
+    "condition": value,
+    "ncbi_bioproject": value,
+    "metadata": {
+        "key": "value",
+        ...
+    },
+}
+
+"""
+
+import json
+from django.db import transaction
+from django.core.management.base import BaseCommand
+from django.contrib.auth.models import Group
+from pathlib import Path
+
+from tracks.models import Genome, Lab
+
+
+class Command(BaseCommand):
+    """Bulk import Genome records from a JSON file."""
+
+    help = "Bulk import records from a JSON file."
+
+    def add_arguments(self, parser):
+        """Register the mandatory -j/--json option (path of the file to load)."""
+        parser.add_argument(
+            "-j", "--json", type=Path,
+            required=True,  # otherwise handle() crashes on open(None)
+            help="Path to JSON to import",
+        )
+
+    @transaction.atomic
+    def handle(self, *args, **kwargs):
+        """Import every record in one transaction; any error rolls back all."""
+        with open(kwargs["json"], "r", encoding="utf-8") as f:
+            data = json.load(f)  # parsed fully, so the file can close right away
+        self.stdout.write(f"\nImporting {len(data)} genome records...")
+        input("\nPress ENTER to continue or CTRL+C to cancel\n\n> ")
+        for genome in data:
+            create_genome_from_json(genome)
+
+
+def create_genome_from_json(genome):
+    """Create a Genome record from one parsed JSON object.
+
+    Group and Lab rows are fetched or created by name; a falsy "group_name"
+    or "lab_name" leaves that relation unset. Each "metadata" item is
+    applied with Genome.set_metadata before the final save.
+    """
+    group = lab = None
+    if genome["group_name"]:
+        group, _ = Group.objects.get_or_create(name=genome["group_name"])
+    if genome["lab_name"]:
+        lab, _ = Lab.objects.get_or_create(name=genome["lab_name"])
+
+    g = Genome.objects.create(
+        group=group,
+        lab=lab,
+        name=genome["name"],
+        description_html=genome["description"],
+        reference=genome["reference"],
+        doi=genome["doi"],
+        strain=genome["strain"],
+        species=genome["species"],
+        condition=genome["condition"],
+        ncbi_bioproject=genome["ncbi_bioproject"],
+    )
+
+    for k, v in (genome.get("metadata") or {}).items():
+        g.set_metadata(k, v)
+    g.save()
+
+    # lab may be None here; lab.name would raise and abort the whole import.
+    print(f"Created genome {genome['name']} in lab {lab.name if lab else None}")
diff --git a/apollo_portal/tracks/models.py b/apollo_portal/tracks/models.py
@@ -112,7 +112,6 @@ def metadata(self):
             return {}
         return yaml.safe_load(self._metadata_yaml)
 
-    @property
     def set_metadata(self, k, v):
         """Set metadata key to given value."""
         data = self.metadata

diff --git a/scripts/import_genomes/.gitignore b/scripts/import_genomes/.gitignore
@@ -0,0 +1 @@
+data/
diff --git a/scripts/import_genomes/requirements.txt b/scripts/import_genomes/requirements.txt
@@ -0,0 +1,2 @@
+pandas
+openpyxl
diff --git a/scripts/import_genomes/tradis_import.py b/scripts/import_genomes/tradis_import.py
@@ -0,0 +1,81 @@
+"""Bulk import tradis-vault genomes from excel sheet."""
+
+import json
+import os
+import pandas as pd
+import requests
+from pathlib import Path
+
+os.chdir(Path(__file__).parent)  # relative data/ paths below resolve from this script's folder
+
+XLS_PATH = Path("data/230828_tradis_vault_samples_update.xlsx")  # input spreadsheet
+JSON_OUTFILE = Path("data/genomes.json")  # where the parsed records are written
+
+pmid_dois = {}  # cache filled by the row loop: Pubmed ID -> DOI (or None)
+
+
+def get_value(row, key):
+    """Return row[key], or None for NaN/NA, blank or "..." placeholder cells.
+
+    Only strings are tested for placeholders; other non-missing values (e.g.
+    numpy numbers, which have no .strip()) are returned unchanged.
+    """
+    value = row[key]
+    if pd.isna(value):
+        return None
+    if isinstance(value, str) and value.strip() in ("", "..."):
+        return None
+    return value
+
+
+def get_doi_from_pubmed(pubmed_id):
+    """Scrape the DOI from a PubMed page; return None when it is unavailable."""
+    if not pubmed_id or pd.isna(pubmed_id):
+        return None  # nothing to look up, so skip the log line and the request
+    print(f"Fetching DOI for pubmed ID {pubmed_id}... ")
+    r = requests.get(f"https://pubmed.ncbi.nlm.nih.gov/{pubmed_id}", timeout=30)
+    # partition() never raises: a page without a DOI link leaves `found` empty
+    _, found, rest = r.text.partition('data-ga-action="DOI"')
+    if r.status_code != 200 or not found:
+        return None
+    return rest.split('<')[0].strip(' \n<>')
+
+
+# Read the first worksheet; its first row supplies the column names.
+df = pd.read_excel(XLS_PATH, sheet_name=0, header=0)
+genomes = []
+
+# Build one JSON-ready record per spreadsheet row.
+for ix, row in df.iterrows():
+    print(f"Parsing row {ix}... ")
+    pmid = row["Pubmed ID"]
+    # Fetch the DOI once per Pubmed ID; later rows reuse the cached result.
+    if pmid not in pmid_dois:
+        pmid_dois[pmid] = get_doi_from_pubmed(pmid)
+
+    # Optional columns: evaluated lazily, kept only when the value is truthy.
+    extras = (
+        (column, get_value(row, column))
+        for column in ("File Type", "Taxonomy ID", "Platform")
+    )
+
+    genomes.append(
+        {
+            "group_name": "tradis-vault",
+            "lab_name": get_value(row, "Lab Name"),
+            "name": get_value(row, "Name"),
+            "description": get_value(row, "Description"),
+            "reference": get_value(row, "Source"),
+            "doi": pmid_dois[pmid],
+            "strain": get_value(row, "Strain"),
+            "species": "Escherichia coli",
+            "condition": get_value(row, "Condition"),
+            "ncbi_bioproject": get_value(row, "Submission ID"),
+            "metadata": {column: found for column, found in extras if found},
+        }
+    )
+
+# Persist every record as a single JSON array.
+print(f"Writing genomes to {JSON_OUTFILE}")
+with JSON_OUTFILE.open("w") as f:
+    json.dump(genomes, f)