-
Notifications
You must be signed in to change notification settings - Fork 1
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Tradis vault bulk import from xlsx script
- Loading branch information
Showing
5 changed files
with
172 additions
and
1 deletion.
There are no files selected for viewing
88 changes: 88 additions & 0 deletions
88
apollo_portal/tracks/management/commands/import_genomes.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,88 @@ | ||
"""Bulk import Genome records to the database from a JSON file. | ||
Expected JSON (a list of objects of the form): | ||
{ | ||
"group_name": value, | ||
"lab_name": value, | ||
"name": value, | ||
"description": value, | ||
"reference": value, | ||
"doi": value, | ||
"strain": value, | ||
"species": value, | ||
"condition": value, | ||
"ncbi_bioproject": value, | ||
"metadata": { | ||
"key": "value", | ||
... | ||
}, | ||
} | ||
""" | ||
|
||
import json | ||
from django.db import transaction | ||
from django.core.management.base import BaseCommand | ||
from django.contrib.auth.models import Group | ||
from pathlib import Path | ||
|
||
from tracks.models import Genome, Lab | ||
|
||
|
||
class Command(BaseCommand):
    """Bulk import Genome records from a JSON file.

    The file is loaded with ``json.load`` and iterated; each element is
    handed to ``create_genome_from_json``.
    """

    help = "Bulk import records from a JSON file."

    def add_arguments(self, parser):
        """Register the ``-j/--json`` option holding the input file path."""
        parser.add_argument(
            "-j",
            "--json",
            type=Path,
            # Without a path there is nothing to import; let argparse report
            # the omission instead of failing later inside ``open(None)``.
            required=True,
            help=("Path to JSON to import"),
        )

    @transaction.atomic
    def handle(self, *args, **kwargs):
        """Load the JSON file, ask for confirmation, then create each genome.

        Decorated with ``transaction.atomic``, so an exception raised while
        importing any record rolls back the records created before it.
        """
        # JSON is UTF-8 by specification; do not depend on the locale default.
        with open(kwargs["json"], "r", encoding="utf-8") as f:
            data = json.load(f)
        self.stdout.write(f"\nImporting {len(data)} genome records...")
        # Blocks until the operator confirms; CTRL+C aborts before any write.
        input("\nPress ENTER to continue or CTRL+C to cancel\n\n> ")
        for genome in data:
            create_genome_from_json(genome)
|
||
|
||
def create_genome_from_json(genome):
    """Create a Genome record from one imported JSON object.

    Group and Lab records referenced by name are fetched, or created when
    they do not exist yet. A missing or empty ``group_name`` / ``lab_name``
    leaves the corresponding relation unset.

    Args:
        genome: Mapping with the keys ``name``, ``description``,
            ``reference``, ``doi``, ``strain``, ``species``, ``condition``
            and ``ncbi_bioproject``; optionally ``group_name``, ``lab_name``
            and a ``metadata`` mapping of extra key/value pairs.

    Raises:
        KeyError: If one of the required keys is absent.
    """
    group_name = genome.get("group_name")
    lab_name = genome.get("lab_name")

    group = None
    if group_name:
        group, _ = Group.objects.get_or_create(name=group_name)

    lab = None
    if lab_name:
        lab, _ = Lab.objects.get_or_create(name=lab_name)

    g = Genome.objects.create(
        group=group,
        lab=lab,
        name=genome["name"],
        description_html=genome["description"],
        reference=genome["reference"],
        doi=genome["doi"],
        strain=genome["strain"],
        species=genome["species"],
        condition=genome["condition"],
        ncbi_bioproject=genome["ncbi_bioproject"],
    )

    # Tolerate a missing or null "metadata" entry instead of crashing.
    for k, v in (genome.get("metadata") or {}).items():
        g.set_metadata(k, v)
    g.save()

    # ``lab`` is None when no lab name was given; dereferencing ``lab.name``
    # unconditionally would raise after the record has been created.
    if lab is not None:
        print(f"Created genome {genome['name']} in lab {lab.name}")
    else:
        print(f"Created genome {genome['name']} without a lab")
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1 @@ | ||
data/ |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,2 @@ | ||
pandas | ||
openpyxl |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,81 @@ | ||
"""Bulk import tradis-vault genomes from excel sheet.""" | ||
|
||
import json | ||
import os | ||
import pandas as pd | ||
import requests | ||
from pathlib import Path | ||
|
||
# Work from this script's own directory so the relative data/ paths below
# resolve regardless of where the script is launched from.
os.chdir(Path(__file__).parent)

# Input spreadsheet and the JSON file generated from it.
XLS_PATH = Path("data") / "230828_tradis_vault_samples_update.xlsx"
JSON_OUTFILE = Path("data") / "genomes.json"

# DOIs already looked up, keyed by the "Pubmed ID" cell value.
pmid_dois = {}
|
||
|
||
def get_value(row, key):
    """Return ``row[key]``, mapping missing or placeholder cells to None.

    Args:
        row: Mapping-like object (e.g. a pandas Series) indexed by column name.
        key: Column name to read.

    Returns:
        None when the cell is NaN/None, or is a string that is blank or just
        ``"..."`` once surrounding whitespace is ignored; otherwise the cell
        value unchanged (strings are returned unstripped).
    """
    value = row[key]
    if pd.isna(value):
        return None
    # Only real strings can be blank/placeholder text. Every other type
    # (int, float, bool, timestamps, numpy scalars) is passed through rather
    # than being sent to ``.strip()``.
    if isinstance(value, str) and value.strip() in ("", "..."):
        return None
    return value
|
||
|
||
def get_doi_from_pubmed(pubmed_id):
    """Fetch the PubMed page for ``pubmed_id`` and parse out its DOI.

    Args:
        pubmed_id: PubMed identifier; may be None, NaN or empty when the
            source row has no publication.

    Returns:
        The DOI string, or None when no ID was given, the page did not
        return HTTP 200, or no DOI could be found in the page.

    Raises:
        requests.RequestException: On connection failure or timeout.
    """
    # Check for a missing ID first so nothing is printed or fetched for it.
    if pd.isna(pubmed_id) or not pubmed_id:
        return None
    # A spreadsheet column containing blanks is read as floats; drop the
    # trailing ".0" so the URL carries the plain integer ID.
    if isinstance(pubmed_id, float) and pubmed_id.is_integer():
        pubmed_id = int(pubmed_id)
    print(f"Fetching DOI for pubmed ID {pubmed_id}... ")
    url = f"https://pubmed.ncbi.nlm.nih.gov/{pubmed_id}"
    # Bound the wait so one stalled request cannot hang the whole run.
    r = requests.get(url, timeout=30)
    if r.status_code != 200:
        return None
    # The DOI is the text that follows the DOI link's analytics attribute,
    # up to the next tag. Pages without that marker have no DOI listed.
    _, marker, tail = r.text.partition('data-ga-action="DOI"')
    if not marker:
        return None
    doi = tail.partition('<')[0].strip(' \n<>')
    return doi or None
|
||
|
||
# Build one genome dict per spreadsheet row and dump them all as JSON.
genomes = []
df = pd.read_excel(XLS_PATH, sheet_name=0, header=0)

for ix, row in df.iterrows():
    print(f"Parsing row {ix}... ")

    # Many rows share a publication; look each PubMed ID up only once.
    pmid = row["Pubmed ID"]
    if pmid not in pmid_dois:
        pmid_dois[pmid] = get_doi_from_pubmed(pmid)
    doi = pmid_dois[pmid]

    genome = {
        "group_name": "tradis-vault",
        "lab_name": get_value(row, "Lab Name"),
        "name": get_value(row, "Name"),
        "description": get_value(row, "Description"),
        "reference": get_value(row, "Source"),
        "doi": doi,
        "strain": get_value(row, "Strain"),
        "species": "Escherichia coli",
        "condition": get_value(row, "Condition"),
        "ncbi_bioproject": get_value(row, "Submission ID"),
    }

    metadata = {}
    for key in [
        "File Type",
        "Taxonomy ID",
        "Platform",
    ]:
        val = get_value(row, key)
        # get_value signals "missing" with None; a plain truthiness test
        # would also discard legitimate falsy values such as 0.
        if val is not None:
            metadata[key] = val
    genome["metadata"] = metadata
    genomes.append(genome)

print(f"Writing genomes to {JSON_OUTFILE}")
with open(JSON_OUTFILE, "w", encoding="utf-8") as f:
    json.dump(genomes, f)