
Commit

Merge pull request #12 from acdh-oeaw/new-relations
Migrate to new relations
gythaogg authored Oct 3, 2024
2 parents 85d02fc + 4c4ee6f commit caaa69c
Showing 12 changed files with 17,058 additions and 80 deletions.
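Note: with this change, both importers take the dump file as a positional argument instead of a hard-coded path. A minimal usage sketch (the file name is illustrative; the entities-before-relations ordering follows from import_relations resolving entities by pk_old):

# From the shell:
#   python manage.py import_data data/dump.json
#   python manage.py import_relations data/dump.json
# Or programmatically, via Django's call_command:
from django.core.management import call_command

call_command("import_data", "data/dump.json")       # entities first
call_command("import_relations", "data/dump.json")  # relations resolve entities via pk_old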
17 changes: 14 additions & 3 deletions apis_ontology/management/commands/import_data.py
@@ -47,10 +47,13 @@


class Command(BaseCommand):
help = "import entities from vanilla APIS in data/dump_3105.json"
help = "import entities from vanilla APIS"

def add_arguments(self, parser):
parser.add_argument("dump", type=str, help="dump file name")

def handle(self, *args, **kwargs):
df = pd.read_json("data/dump_3105.json")
df = pd.read_json(kwargs["dump"])
FIELDNAME_MAPPING = {
"apis_entities.person": {"first_name": "forename", "name": "surname"},
"apis_entities.place": {
@@ -66,7 +69,7 @@ def get_base_vocab_data(vocab_pk):
(df.model == "apis_vocabularies.vocabsbaseclass") & (df.pk == vocab_pk)
]
if match.shape[0]:
return {"name": match.iloc[0].fields["name"]}
return {"name": match.iloc[0].fields["name"]}print
return {}

def get_text_field(text_pk):
@@ -152,6 +155,7 @@ def create_collections(collection_ids, entity):

def import_persons():
MODEL = "apis_entities.person"
self.stdout.write("importing persons")
persons = df[df.model == MODEL]
for _, p in tqdm(persons.iterrows(), total=persons.shape[0]):
title_ids = None
@@ -202,6 +206,7 @@ def import_persons():

def import_places():
MODEL = "apis_entities.place"
self.stdout.write("importing places")
places = df[df.model == MODEL]
for _, p in tqdm(places.iterrows(), total=places.shape[0]):
place_type_pk = None
@@ -227,6 +232,7 @@ def import_places():

def import_institutions():
MODEL = "apis_entities.institution"
self.stdout.write("importing institutions")
institutions = df[df.model == MODEL]
for _, p in tqdm(institutions.iterrows(), total=institutions.shape[0]):
i_type_pk = None
@@ -253,6 +259,7 @@ def import_institutions():

def import_events():
MODEL = "apis_entities.event"
self.stdout.write("importing events")
df_subset = df[df.model == MODEL]
for _, row in tqdm(df_subset.iterrows(), total=df_subset.shape[0]):
type_pk = None
@@ -279,6 +286,7 @@ def import_events():

def import_works():
MODEL = "apis_entities.work"
self.stdout.write("importing works")
df_subset = df[df.model == MODEL]
for _, row in tqdm(df_subset.iterrows(), total=df_subset.shape[0]):
type_pk = None
@@ -316,6 +324,7 @@ def import_works():

def import_expressions():
MODEL = "apis_entities.expression"
self.stdout.write("importing expressions")
df_subset = df[df.model == MODEL]
for _, row in tqdm(df_subset.iterrows(), total=df_subset.shape[0]):
ted = get_all_entity_data(row.pk, MODEL)
@@ -364,6 +373,7 @@ def import_expressions():

def import_manuscripts():
MODEL = "apis_entities.manuscript"
self.stdout.write("importing manuscripts")
df_subset = df[df.model == MODEL]
for _, row in tqdm(df_subset.iterrows(), total=df_subset.shape[0]):
condition_pks = None
@@ -392,6 +402,7 @@ def import_manuscripts():

def import_manuscript_parts():
MODEL = "apis_entities.manuscriptpart"
self.stdout.write("importing manuscript parts")
df_subset = df[df.model == MODEL]
for _, row in tqdm(df_subset.iterrows(), total=df_subset.shape[0]):
mpart_type_pk = None
210 changes: 210 additions & 0 deletions apis_ontology/management/commands/import_relations.py
@@ -0,0 +1,210 @@
from django.contrib.contenttypes.models import ContentType
from django.core.management.base import BaseCommand
import pandas as pd
from django.apps import apps
from tqdm.auto import tqdm
import re


class Command(BaseCommand):
help = "import relations from file"

def add_arguments(self, parser):
parser.add_argument("dump", type=str, help="dump file name")

def handle(self, *args, **kwargs):
def extract_relation_models():
df_vocabsbaseclass = df[
df["model"] == "apis_vocabularies.vocabsbaseclass"
].copy()
df_relationbaseclass = df[
df["model"] == "apis_vocabularies.relationbaseclass"
].copy()
df_vocabnames = df[df["model"] == "apis_vocabularies.vocabnames"].copy()

# Extract necessary fields from the fields dictionaries
df_vocabsbaseclass["name"] = df_vocabsbaseclass["fields"].apply(
lambda x: x.get("name", "")
)
df_vocabsbaseclass["vocab_name"] = df_vocabsbaseclass["fields"].apply(
lambda x: x.get("vocab_name", None)
)
df_relationbaseclass["name_reverse"] = df_relationbaseclass["fields"].apply(
lambda x: x.get("name_reverse", "")
)
df_vocabnames["vocabnames"] = df_vocabnames["fields"].apply(
lambda x: x.get("name", "")
)

# Filter the vocabnames DataFrame to include only rows with pk in df_vocabsbaseclass['vocab_name']
filtered_vocabnames_df = df_vocabnames[
df_vocabnames["pk"].isin(df_vocabsbaseclass["vocab_name"])
]

# Merge the DataFrames on the 'pk' column using inner joins
merged_df = pd.merge(
df_relationbaseclass[["pk", "name_reverse"]],
df_vocabsbaseclass[["pk", "name", "vocab_name"]],
on="pk",
how="inner",
)
relations_df = pd.merge(
merged_df,
filtered_vocabnames_df[["pk", "vocabnames"]],
left_on="vocab_name",
right_on="pk",
how="left",
suffixes=("", "_vocabname"),
)

# Drop redundant columns
relations_df = relations_df.drop(columns=["vocab_name", "pk_vocabname"])
relations_df.rename(columns={"name": "name_forward"}, inplace=True)

# Function to split vocabnames into two words based on uppercase transition
def split_vocabname(vocabname):
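# e.g. "PersonPlace" -> ("apis_ontology.person", "apis_ontology.place");
# names without two CamelCase words come back unchanged, with an empty obj class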
matches = re.findall(r"[A-Z][a-z]*", vocabname)
if len(matches) >= 2:
return (
f"apis_ontology.{matches[0].lower()}",
f"apis_ontology.{matches[1].lower()}",
)
return vocabname, ""

def new_rel_classname(property_name):
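# e.g. "is part of" -> "IsPartOf"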
return f"{property_name.title().replace('-','').replace(' ', '')}"

# Apply the function to split the vocabnames column
relations_df[["subj_class", "obj_class"]] = relations_df[
"vocabnames"
].apply(lambda x: pd.Series(split_vocabname(x)))
relations_df["class_name"] = relations_df["name_forward"].apply(
new_rel_classname
)
df_grouped = (
relations_df.groupby("class_name")
.agg(
{
"subj_class": lambda x: sorted(set(x)),
"obj_class": lambda x: sorted(set(x)),
"name_reverse": "first",
"name_forward": "first",
"pk": list,
}
)
.reset_index()
)
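# df_grouped has one row per new relation class: its legacy Property pks
# plus the sorted subj/obj entity classes it connects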

return df_grouped

def get_rel_class(rel_pk):
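# Map a legacy relation_type pk to the dotted path of its new Relation subclass.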
match = df_relations[df_relations["pk"].apply(lambda x: rel_pk in x)]
if match.shape[0] != 1:
self.stdout.write(
self.style.ERROR("Cannot finf the correct match for %s.", rel_pk)
)
print(match)

return "apis_ontology." + match.iloc[0].class_name.lower()

def validate_relations_model():
print("Validating relations model ...")
model_is_valid = True
df_relations = extract_relation_models()
for i, p in tqdm(
df_relations.sort_values(by="class_name").iterrows(),
total=df_relations.shape[0],
):
class_name = p.class_name
try:
model = apps.get_model(f"apis_ontology.{class_name.lower()}")
except LookupError as le:
model_is_valid = False
print(
f"class {class_name}(Relation, NomanslandRelationMixin, VersionMixin): "
)
print(
f"\trelation_type_old = {p.pk} # pk of Property in apis_relations"
)
print(f"\tsubj_model = {p.subj_class}")
print(f"\tobj_model = {p.obj_class}")
print(f"\t@classmethod")
print(f"\tdef reverse_name(cls) -> str:")
print(f'\t\treturn "{p.name_reverse}"\n')

return model_is_valid, df_relations

def create_relations_instances():
print("Creating relations instances ...")
relation_rows = df[df.model.str.startswith("apis_relations.")]
for i, row in tqdm(relation_rows.iterrows(), total=relation_rows.shape[0]):
RelModel = apps.get_model(get_rel_class(row.fields["relation_type"]))
rel_data = {"pk_old": row.pk, **row.fields}
subj_key = None
obj_key = None
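# Legacy dumps store reflexive relations (same model as both endpoints) as
# related_<model>A / related_<model>B; all other relations use related_<model>.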
for subj_model in RelModel.subj_model:
subj_model = subj_model.removeprefix("apis_ontology.")
if f"related_{subj_model}A" in rel_data.keys():
subj_key = f"related_{subj_model}A"
obj_key = f"related_{subj_model}B"
break
if f"related_{subj_model}" in rel_data.keys():
subj_key = f"related_{subj_model}"
try:
obj_key = [
f"related_{obj_model.removeprefix('apis_ontology.')}"
for obj_model in RelModel.obj_model
if f"related_{obj_model.removeprefix('apis_ontology.')}"
in rel_data.keys()
][0]
break
except IndexError:
continue

try:
subj_pk, obj_pk = (rel_data[subj_key], rel_data[obj_key])
subj_model_name = f"apis_ontology.{subj_key.removeprefix('related_').removesuffix('A')}"
obj_model_name = f"apis_ontology.{obj_key.removeprefix('related_').removesuffix('B')}"
subj = apps.get_model(subj_model_name).objects.get(pk_old=subj_pk)
obj = apps.get_model(obj_model_name).objects.get(pk_old=obj_pk)
data = {
"subj_object_id": subj.pk,
"obj_object_id": obj.pk,
"pk_old": row.pk,
"certainty": (
rel_data.get("certainty")
if rel_data.get("certainty")
else "unknown"
),
"subj_content_type": ContentType.objects.get(
app_label="apis_ontology",
model=subj_model_name.split(".")[-1],
),
"obj_content_type": ContentType.objects.get(
app_label="apis_ontology",
model=obj_model_name.split(".")[-1],
),
}
rel, _ = RelModel.objects.get_or_create(**data)
except Exception as e:
print(e)
break

### handle command
try:
df = pd.read_json(kwargs["dump"])
model_is_valid, df_relations = validate_relations_model()
if not model_is_valid:
self.stdout.write(
self.style.ERROR(
"Please update the Relations model before importing."
)
)
return
create_relations_instances()
except (KeyError, FileNotFoundError):
# missing or unreadable dump file: print help message and quit
self.stdout.write(self.style.ERROR("Please provide a valid dump file."))
return

self.stdout.write(self.style.SUCCESS("Relations imported."))
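For reference, validate_relations_model prints a stub for each missing Relation subclass. A sketch of the generated shape (the class name, pk, models, and reverse name below are hypothetical examples, not values from any dump):

class PersonIsAuthorOfWork(Relation, NomanslandRelationMixin, VersionMixin):
    relation_type_old = [123]  # pk of Property in apis_relations (hypothetical)
    subj_model = ['apis_ontology.person']
    obj_model = ['apis_ontology.work']

    @classmethod
    def reverse_name(cls) -> str:
        return "has author"  # hypothetical reverse label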
