Skip to content

Commit

Permalink
Browse files Browse the repository at this point in the history
  • Loading branch information
martinantonmueller committed Feb 27, 2024
2 parents 759865c + cf775de commit 92cb67b
Show file tree
Hide file tree
Showing 21 changed files with 534 additions and 132 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/test.yml
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,7 @@ jobs:
- name: Create Coverage Report
run: coverage xml
- name: "Upload coverage to Codecov"
uses: codecov/codecov-action@v3
uses: codecov/codecov-action@v4
with:
token: ${{secrets.CODECOV_TOKEN}}
file: ./coverage.xml
Expand Down
5 changes: 5 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -180,3 +180,8 @@ hansi.csv
media/duplicated_*.csv
Untitled.ipynb
listevent.xml
relations.csv
hansi.*
media/relations.gexf
edges.csv
nodes.csv
4 changes: 2 additions & 2 deletions apis_core/apis_entities/list_view_work.py
Original file line number Diff line number Diff line change
Expand Up @@ -49,9 +49,9 @@

class WorkListFilter(MyBaseFilter):
name = django_filters.CharFilter(
lookup_expr="icontains",
method="name_label_filter",
label="Werktitel",
help_text="eingegebene Zeichenkette muss im Titel enthalten sein",
help_text="eingegebene Zeichenkette muss im Titel oder in einem der Labels enthalten sein",
)
references = django_filters.CharFilter(
lookup_expr="icontains",
Expand Down
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
import os
import pandas as pd
import recordlinkage
from recordlinkage.compare import Geographic

from typing import Any
from django.conf import settings
Expand All @@ -15,14 +16,15 @@ class Command(BaseCommand):
def handle(self, *args: Any, **options: Any) -> str | None:
print("searching for potential duplicates")

props = [
"id",
"name",
]
df = pd.DataFrame(
Place.objects.values_list(*props),
columns=props,
).astype("str")
props = ["id", "name", "lat", "lng"]
df = (
pd.DataFrame(
Place.objects.values_list(*props),
columns=props,
)
.astype("str")
.fillna("nix")
)
df["custom_index"] = df["id"].astype(str) + " " + df["name"]
df.set_index("custom_index", inplace=True)
indexer = recordlinkage.Index()
Expand All @@ -31,8 +33,10 @@ def handle(self, *args: Any, **options: Any) -> str | None:
len(candidate_links)
compare_cl = recordlinkage.Compare()
compare_cl.exact("name", "name", label="name")
compare_cl.exact("lat", "lat", label="lat")
compare_cl.exact("lng", "lng", label="lng")
features = compare_cl.compute(candidate_links, df)
matches = features[features.sum(axis=1) > 0]
matches = features[features.sum(axis=1) > 2]
save_path = os.path.join(settings.MEDIA_ROOT, "duplicated_places.csv")
matches.to_csv(save_path)
print(f"found {len(matches)} potential duplicates")
Original file line number Diff line number Diff line change
@@ -0,0 +1,63 @@
import pandas as pd

from django.core.exceptions import ObjectDoesNotExist
from django.core.management.base import BaseCommand
from tqdm import tqdm

from apis_core.apis_metainfo.models import TempEntityClass


def get_id(row, col):
    """Return the integer id that prefixes the value stored in ``row[col]``.

    The value is expected to look like ``"<id> <rest>"``; the text in front
    of the first space is converted with ``int``.
    """
    id_part, _, _ = row[col].partition(" ")
    return int(id_part)


def is_greater(row, col_a, col_b):
    """Return ``True`` if ``row[col_a]`` is strictly greater than ``row[col_b]``.

    ``row`` may be any mapping-like object supporting ``row[key]`` lookups.
    """
    # bool() keeps the result a plain Python bool, whatever type the
    # comparison itself yields.
    return bool(row[col_a] > row[col_b])


class Command(BaseCommand):
    """Management command that merges entity pairs listed in a duplicates CSV.

    The CSV must provide the columns ``custom_index_1`` and
    ``custom_index_2``; each value starts with an entity id followed by a
    space (see ``get_id``). For every row whose first id is greater than the
    second one, the entity with the smaller id is kept and the other one is
    merged into it via ``merge_with``.
    """

    help = """merges duplicated entities\
    e.g. python manage.py merge_duplicated_entities --csv https://pmb.acdh.oeaw.ac.at/media/duplicated_places.csv
    """

    def add_arguments(self, parser):
        """Register the ``--csv`` option on the command's argument parser."""
        parser.add_argument(
            "--csv",
            required=True,
            help="path or URL of the CSV listing the duplicated entities",
        )

    def handle(self, *args, **kwargs):
        """Read the CSV given by ``--csv`` and merge the listed duplicates.

        Ids whose keep-entity cannot be found and rows whose merge raised an
        exception are collected and printed once the loop has finished.
        """
        csv_url = kwargs["csv"]
        print(f"reading duplicated objects from csv: {csv_url}")
        df = pd.read_csv(csv_url)
        df["id_a"] = df.apply(lambda row: get_id(row, "custom_index_1"), axis=1)
        df["id_b"] = df.apply(lambda row: get_id(row, "custom_index_2"), axis=1)
        # Work on an independent copy so that adding a column below does not
        # write into a slice of the original frame.
        df = df[["id_a", "id_b"]].copy()
        df["b_smaller_a"] = df.apply(
            lambda row: is_greater(row, "id_a", "id_b"), axis=1
        )

        keep_not_found = set()
        merge_did_not_work = []
        print(f"start merging of {len(df)} duplicated objects")
        for _, row in tqdm(df.iterrows(), total=len(df)):
            # Only rows with id_a > id_b are handled; all others are skipped.
            if not row["b_smaller_a"]:
                continue
            # Values read from the DataFrame may be numpy integers; pass
            # plain ints on to the ORM and to merge_with.
            keep_id = int(row["id_b"])
            merge_id = int(row["id_a"])
            try:
                keep = TempEntityClass.objects.get(id=keep_id).get_child_entity()
            except ObjectDoesNotExist:
                keep_not_found.add(keep_id)
                # Without a keep-entity there is nothing to merge into. Going
                # on would either hit an unbound ``keep`` or merge into the
                # entity left over from a previous row.
                continue
            try:
                keep.merge_with(merge_id)
            except Exception as e:
                # Deliberately broad: a failing merge is recorded and
                # reported below instead of aborting the whole run.
                merge_did_not_work.append([row, e])
        if keep_not_found:
            print("following potential to keep objects could not be found")
            for x in keep_not_found:
                print(x)
        if merge_did_not_work:
            print("for following objects the merge did not work")
            for x in merge_did_not_work:
                print(x)
        print("done")
30 changes: 30 additions & 0 deletions apis_core/apis_entities/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -512,6 +512,8 @@ def save(self, *args, **kwargs):
return self

class Meta:
verbose_name = "Person"
verbose_name_plural = "Personen"
ordering = [
"id",
]
Expand All @@ -526,6 +528,10 @@ def get_api_url(self):
def get_icon(self):
return "bi bi-people apis-person"

@classmethod
def get_color(cls):
    """Return the hex color code for this entity class."""
    return "#720e07"


class Place(AbstractEntity):
kind = models.ForeignKey(
Expand All @@ -541,6 +547,8 @@ def save(self, *args, **kwargs):
return self

class Meta:
verbose_name = "Ort"
verbose_name_plural = "Orte"
ordering = [
"id",
]
Expand All @@ -555,13 +563,19 @@ def get_api_url(self):
def get_icon(self):
return "bi bi-map apis-place"

@classmethod
def get_color(cls):
    """Return the hex color code for this entity class."""
    return "#5bc0eb"


class Institution(AbstractEntity):
kind = models.ForeignKey(
InstitutionType, blank=True, null=True, on_delete=models.SET_NULL
)

class Meta:
verbose_name = "Institution"
verbose_name_plural = "Institutionen"
ordering = [
"id",
]
Expand All @@ -576,13 +590,19 @@ def get_api_url(self):
def get_icon(self):
return "bi bi-building-gear apis-institution"

@classmethod
def get_color(cls):
    """Return the hex color code for this entity class."""
    return "#1d3461"


class Event(AbstractEntity):
kind = models.ForeignKey(
EventType, blank=True, null=True, on_delete=models.SET_NULL
)

class Meta:
verbose_name = "Ereignis"
verbose_name_plural = "Ereignisse"
ordering = [
"id",
]
Expand All @@ -597,11 +617,17 @@ def get_api_url(self):
def get_icon(self):
return "bi bi-calendar3 apis-event"

@classmethod
def get_color(cls):
    """Return the hex color code for this entity class."""
    return "#9bc53d"


class Work(AbstractEntity):
kind = models.ForeignKey(WorkType, blank=True, null=True, on_delete=models.SET_NULL)

class Meta:
verbose_name = "Werk"
verbose_name_plural = "Werke"
ordering = [
"id",
]
Expand All @@ -616,6 +642,10 @@ def get_api_url(self):
def get_icon(self):
return "bi bi-book apis-work"

@classmethod
def get_color(cls):
    """Return the hex color code for this entity class."""
    return "#ff8600"


a_ents = getattr(settings, "APIS_ADDITIONAL_ENTITIES", False)

Expand Down
21 changes: 21 additions & 0 deletions apis_core/apis_entities/tests.py
Original file line number Diff line number Diff line change
Expand Up @@ -138,6 +138,27 @@ def test_009_merge_view(self):
after = Person.objects.all().count()
self.assertTrue(before > after)

def test_009a_merge_notesandreferences(self):
    """Check that merging keeps notes and references of both entities."""
    source_one = Person.objects.create(
        name="Person which will be merged",
        notes="notes_one",
        references="references_one",
    )
    source_two = Person.objects.create(
        name="Person two which will be merged",
    )
    target = Person.objects.create(
        name="Person which will be kept",
        notes="target_notes",
        references="target_references",
    )
    target.merge_with(source_one.id)
    # assertIn reports both operands on failure, unlike assertTrue(a in b).
    self.assertIn("notes_one", target.notes)
    self.assertIn("target_notes", target.notes)
    self.assertIn("references_one", target.references)
    self.assertIn("target_references", target.references)
    # source_two was created without notes/references and is passed as an
    # instance rather than an id; this call only has to complete without
    # raising.
    target.merge_with(source_two)

def test_010_delete_views(self):
client.login(**USER)
for x in MODELS:
Expand Down
18 changes: 17 additions & 1 deletion apis_core/apis_metainfo/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -288,7 +288,13 @@ def merge_with(self, entities):
rels = ContentType.objects.filter(
app_label="apis_relations", model__icontains=e_a
)
notes = []
references = []
for ent in entities:
if isinstance(ent.notes, str):
notes.append(ent.notes)
if isinstance(ent.references, str):
references.append(ent.references)
e_b = type(ent).__name__
e_b_pk = ent.pk
if e_b_pk == e_a_pk:
Expand Down Expand Up @@ -332,8 +338,18 @@ def merge_with(self, entities):
for t in k:
setattr(t, "related_{}".format(e_a.lower()), self)
t.save()

ent.delete()
save_target = False
if len(notes) > 0:
additional_notes = " ".join(notes)
self.notes = f"{self.notes} {additional_notes}"
save_target = True
if len(references) > 0:
additional_references = " ".join(references)
self.references = f"{self.references} {additional_references}"
save_target = True
if save_target:
self.save()


class Source(models.Model):
Expand Down
13 changes: 12 additions & 1 deletion apis_core/apis_relations/forms2.py
Original file line number Diff line number Diff line change
Expand Up @@ -73,8 +73,19 @@ def save(self, site_instance, instance=None, commit=True):
target = AbstractEntity.get_entity_class_of_name(self.rel_accessor[0])
t1 = target.get_or_create_uri(cd["target"])
setattr(x, self.rel_accessor[2], t1)
params = {
self.rel_accessor[3]: site_instance,
self.rel_accessor[2]: t1,
"start_date_written": cd["start_date_written"],
"end_date_written": cd["end_date_written"],
"relation_type_id": cd["relation_type"],
}
if commit:
x.save()
qs = x.__class__.objects.filter(**params)
if qs.count() > 0:
pass
else:
x.save()
return x

def get_text_id(self):
Expand Down
Empty file.
Loading

0 comments on commit 92cb67b

Please sign in to comment.