diff --git a/.gitignore b/.gitignore
index fb46f7a..df1222a 100644
--- a/.gitignore
+++ b/.gitignore
@@ -177,3 +177,5 @@ media/listbibl.xml
 staticfiles/
 hansi.csv
 .docker
+media/duplicated_*.csv
+Untitled.ipynb
diff --git a/apis_core/apis_entities/management/commands/find_duplicated_persons.py b/apis_core/apis_entities/management/commands/find_duplicated_persons.py
new file mode 100644
index 0000000..31be1d0
--- /dev/null
+++ b/apis_core/apis_entities/management/commands/find_duplicated_persons.py
@@ -0,0 +1,62 @@
+import os
+import pandas as pd
+import recordlinkage
+
+from typing import Any
+from django.conf import settings
+from django.core.management.base import BaseCommand
+
+from apis_core.apis_entities.models import Person
+
+
+class Command(BaseCommand):
+    """Report pairs of Person records that look like duplicates.
+
+    Persons with a start date that share the same ``name`` are paired up; a
+    pair is reported when first name, start year and end year all agree. The
+    result is written to ``duplicated_persons.csv`` in ``settings.MEDIA_ROOT``.
+    """
+
+    help = "lists potential duplicated entities"
+
+    def handle(self, *args: Any, **options: Any) -> str | None:
+        """Find candidate duplicates and store them as a CSV file."""
+        self.stdout.write("searching for potential duplicates")
+
+        props = [
+            "id",
+            "name",
+            "first_name",
+            "start_date__year",
+            "end_date__year",
+        ]
+        # Everything is cast to str so the exact comparisons below operate on
+        # plain strings.
+        # NOTE(review): missing values are stringified too, so two persons
+        # without an end date presumably agree on that field - confirm intent.
+        df = pd.DataFrame(
+            Person.objects.exclude(start_date__isnull=True).values_list(*props),
+            columns=props,
+        ).astype("str")
+        # Readable row label "<id> <name> <first_name>" for the exported pairs.
+        df["custom_index"] = df["id"] + " " + df["name"] + " " + df["first_name"]
+        df.set_index("custom_index", inplace=True)
+
+        # Only persons with an identical name are paired up as candidates.
+        indexer = recordlinkage.Index()
+        indexer.block(["name"])
+        candidate_links = indexer.index(df)
+
+        compare_cl = recordlinkage.Compare()
+        compare_cl.exact("first_name", "first_name", label="first_name")
+        compare_cl.exact(
+            "start_date__year", "start_date__year", label="start_date__year"
+        )
+        compare_cl.exact("end_date__year", "end_date__year", label="end_date__year")
+        features = compare_cl.compute(candidate_links, df)
+
+        # A pair is reported only when every compared field agrees.
+        matches = features[features.sum(axis=1) == len(features.columns)]
+        save_path = os.path.join(settings.MEDIA_ROOT, "duplicated_persons.csv")
+        matches.to_csv(save_path)
+        self.stdout.write(f"found {len(matches)} potential duplicates")
diff --git a/apis_core/apis_entities/management/commands/find_duplicated_places.py b/apis_core/apis_entities/management/commands/find_duplicated_places.py
new file mode 100644
index 0000000..c4f7add
--- /dev/null
+++ b/apis_core/apis_entities/management/commands/find_duplicated_places.py
@@ -0,0 +1,51 @@
+import os
+import pandas as pd
+import recordlinkage
+
+from typing import Any
+from django.conf import settings
+from django.core.management.base import BaseCommand
+
+from apis_core.apis_entities.models import Place
+
+
+class Command(BaseCommand):
+    """Report pairs of Place records that share exactly the same name.
+
+    The result is written to ``duplicated_places.csv`` inside
+    ``settings.MEDIA_ROOT``.
+    """
+
+    help = "lists potential duplicated entities"
+
+    def handle(self, *args: Any, **options: Any) -> str | None:
+        """Find candidate duplicates and store them as a CSV file."""
+        self.stdout.write("searching for potential duplicates")
+
+        props = [
+            "id",
+            "name",
+        ]
+        df = pd.DataFrame(
+            Place.objects.values_list(*props),
+            columns=props,
+        ).astype("str")
+        # Readable row label "<id> <name>" for the exported pairs.
+        df["custom_index"] = df["id"] + " " + df["name"]
+        df.set_index("custom_index", inplace=True)
+
+        # Only places with an identical name are paired up as candidates.
+        indexer = recordlinkage.Index()
+        indexer.block(["name"])
+        candidate_links = indexer.index(df)
+
+        # Blocking on "name" should already make this comparison agree for
+        # every pair; it is kept so the CSV carries a "name" feature column.
+        compare_cl = recordlinkage.Compare()
+        compare_cl.exact("name", "name", label="name")
+        features = compare_cl.compute(candidate_links, df)
+
+        matches = features[features.sum(axis=1) > 0]
+        save_path = os.path.join(settings.MEDIA_ROOT, "duplicated_places.csv")
+        matches.to_csv(save_path)
+        self.stdout.write(f"found {len(matches)} potential duplicates")
diff --git a/requirements.txt b/requirements.txt
index 21ce1e6..95c4b56 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -18,4 +18,5 @@ psycopg2
 pyocclient==0.6
 icecream
 flake8
-black
\ No newline at end of file
+black
+recordlinkage>0.15,<1
\ No newline at end of file