Skip to content

Commit

Permalink
Tradis vault bulk import from xlsx script
Browse files Browse the repository at this point in the history
  • Loading branch information
neoformit committed Dec 11, 2023
1 parent b51c2ab commit 50a89da
Show file tree
Hide file tree
Showing 5 changed files with 172 additions and 1 deletion.
88 changes: 88 additions & 0 deletions apollo_portal/tracks/management/commands/import_genomes.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,88 @@
"""Bulk import Genome records to the database from a JSON file.

Expects a JSON list of genome objects, each shaped like:

[
    {
        "group_name": value,
        "lab_name": value,
        "name": value,
        "description": value,
        "reference": value,
        "doi": value,
        "strain": value,
        "species": value,
        "condition": value,
        "ncbi_bioproject": value,
        "metadata": {
            "key": "value",
            ...
        }
    },
    ...
]
"""

import json
from django.db import transaction
from django.core.management.base import BaseCommand
from django.contrib.auth.models import Group
from pathlib import Path

from tracks.models import Genome, Lab


class Command(BaseCommand):
    """Management command that bulk-imports Genome records from a JSON file.

    ``handle`` is wrapped in ``transaction.atomic``, so an exception raised
    while importing any record rolls back every record created by the run.
    """

    help = "Bulk import records from a JSON file."

    def add_arguments(self, parser):
        """Register the ``-j/--json`` option that points at the input file."""
        parser.add_argument(
            "-j",
            "--json",
            type=Path,
            # handle() cannot run without an input file; let argparse report
            # a usage error instead of failing later on open(None).
            required=True,
            help=("Path to JSON to import"),
        )

    @transaction.atomic
    def handle(self, *args, **kwargs):
        """Load the JSON file, ask for confirmation, then create each genome.

        The file is expected to hold a list of genome objects; each one is
        passed to ``create_genome_from_json``.
        """
        # JSON is UTF-8 by specification; do not depend on the locale default.
        with open(kwargs["json"], "r", encoding="utf-8") as f:
            data = json.load(f)
        self.stdout.write(f"\nImporting {len(data)} genome records...")
        # Give the operator a chance to abort before anything is written.
        input("\nPress ENTER to continue or CTRL+C to cancel\n\n> ")
        for genome in data:
            create_genome_from_json(genome)


def create_genome_from_json(genome):
    """Create a Genome record from one parsed JSON object.

    The Group and Lab named by ``group_name`` and ``lab_name`` are fetched,
    or created when they do not exist yet. A falsy name leaves the
    corresponding relation unset (``None``). Each entry of the optional
    ``metadata`` mapping is stored with ``Genome.set_metadata``.

    Args:
        genome: dict with the keys ``group_name``, ``lab_name``, ``name``,
            ``description``, ``reference``, ``doi``, ``strain``, ``species``,
            ``condition``, ``ncbi_bioproject`` and, optionally, ``metadata``.

    Raises:
        KeyError: if one of the required keys is missing.
    """
    if genome["group_name"]:
        group, _ = Group.objects.get_or_create(name=genome["group_name"])
    else:
        group = None
    if genome["lab_name"]:
        lab, _ = Lab.objects.get_or_create(name=genome["lab_name"])
    else:
        lab = None

    g = Genome.objects.create(
        group=group,
        lab=lab,
        name=genome["name"],
        description_html=genome["description"],
        reference=genome["reference"],
        doi=genome["doi"],
        strain=genome["strain"],
        species=genome["species"],
        condition=genome["condition"],
        ncbi_bioproject=genome["ncbi_bioproject"],
    )

    # A missing or null "metadata" entry is treated as "no metadata".
    for k, v in (genome.get("metadata") or {}).items():
        g.set_metadata(k, v)
    g.save()

    # lab is None when no lab name was given; dereferencing lab.name there
    # would raise and roll back the caller's whole transaction.
    if lab is not None:
        print(f"Created genome {genome['name']} in lab {lab.name}")
    else:
        print(f"Created genome {genome['name']} (no lab)")
1 change: 0 additions & 1 deletion apollo_portal/tracks/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -112,7 +112,6 @@ def metadata(self):
return {}
return yaml.safe_load(self._metadata_yaml)

@property
def set_metadata(self, k, v):
"""Set metadata key to given value."""
data = self.metadata
Expand Down
1 change: 1 addition & 0 deletions scripts/import_genomes/.gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
data/
2 changes: 2 additions & 0 deletions scripts/import_genomes/requirements.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
pandas
openpyxl
81 changes: 81 additions & 0 deletions scripts/import_genomes/tradis_import.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,81 @@
"""Bulk import tradis-vault genomes from excel sheet."""

import json
import os
import pandas as pd
import requests
from pathlib import Path

# Work from this script's own directory so the relative data paths below
# resolve the same way wherever the script is launched from.
os.chdir(Path(__file__).parent)

_DATA_DIR = Path("data")
# Input spreadsheet and the JSON file generated from it.
XLS_PATH = _DATA_DIR / "230828_tradis_vault_samples_update.xlsx"
JSON_OUTFILE = _DATA_DIR / "genomes.json"

# Cache of Pubmed ID -> DOI, so each ID is only looked up once.
pmid_dois = dict()


def get_value(row, key):
    """Return ``row[key]``, mapping empty or placeholder cells to None.

    Args:
        row: mapping-like object indexable by column name (for example a
            pandas Series yielded by ``DataFrame.iterrows``).
        key: column name to read.

    Returns:
        None when the cell is missing (NaN/None/NaT), blank, or the literal
        placeholder ``"..."``; otherwise the cell value unchanged (strings
        are returned unstripped).
    """
    value = row[key]
    if pd.isna(value):
        return None
    # Only strings can be blank or a placeholder. Returning every other type
    # as-is avoids calling .strip() on bools, timestamps or numpy scalars.
    if not isinstance(value, str):
        return value
    if value.strip() in ("", "..."):
        return None
    return value


def get_doi_from_pubmed(pubmed_id):
    """Fetch the PubMed page for ``pubmed_id`` and parse out its DOI.

    Args:
        pubmed_id: PubMed identifier as read from the spreadsheet; may be
            None/NaN/empty, or a float when pandas loaded the column as
            floating point.

    Returns:
        The DOI string, or None when the ID is missing, the request fails or
        returns a non-200 status, or the page contains no DOI marker.
    """
    if not pubmed_id or pd.isna(pubmed_id):
        return None
    # A column containing blanks is loaded as float, which would render the
    # ID as e.g. "12345678.0" in the URL.
    if isinstance(pubmed_id, float) and pubmed_id.is_integer():
        pubmed_id = int(pubmed_id)
    print(f"Fetching DOI for pubmed ID {pubmed_id}... ")
    url = f"https://pubmed.ncbi.nlm.nih.gov/{pubmed_id}"
    try:
        # Without a timeout a stalled connection would hang the whole run.
        r = requests.get(url, timeout=30)
    except requests.RequestException as exc:
        print(f"Request for pubmed ID {pubmed_id} failed: {exc}")
        return None
    if r.status_code != 200:
        return None
    # The DOI is the text that follows the link carrying this attribute.
    parts = r.text.split('data-ga-action="DOI"')
    if len(parts) < 2:
        # Page has no DOI link (the article has none, or the markup changed).
        return None
    doi = parts[1].split('<')[0].strip(' \n<>')
    return doi


# Convert every spreadsheet row into a genome dict and dump the list as JSON.
df = pd.read_excel(XLS_PATH, sheet_name=0, header=0)
genomes = []

# Optional columns copied into each genome's "metadata" when non-empty.
metadata_columns = ("File Type", "Taxonomy ID", "Platform")

for ix, row in df.iterrows():
    print(f"Parsing row {ix}... ")

    # Resolve the DOI once per Pubmed ID; later rows reuse the cached value.
    pubmed_id = row["Pubmed ID"]
    try:
        doi = pmid_dois[pubmed_id]
    except KeyError:
        doi = pmid_dois[pubmed_id] = get_doi_from_pubmed(pubmed_id)

    metadata = {}
    for column in metadata_columns:
        cell = get_value(row, column)
        if not cell:
            continue
        metadata[column] = cell

    genomes.append({
        "group_name": "tradis-vault",
        "lab_name": get_value(row, "Lab Name"),
        "name": get_value(row, "Name"),
        "description": get_value(row, "Description"),
        "reference": get_value(row, "Source"),
        "doi": doi,
        "strain": get_value(row, "Strain"),
        "species": "Escherichia coli",
        "condition": get_value(row, "Condition"),
        "ncbi_bioproject": get_value(row, "Submission ID"),
        "metadata": metadata,
    })

print(f"Writing genomes to {JSON_OUTFILE}")
with JSON_OUTFILE.open("w") as out:
    json.dump(genomes, out)

0 comments on commit 50a89da

Please sign in to comment.