From d2cb6384ccffe12a24410376609923872dfe707d Mon Sep 17 00:00:00 2001 From: csae8092 Date: Tue, 28 May 2024 16:51:59 +0200 Subject: [PATCH 1/2] deleted nonsense professions, closes #195 --- issue__195_deletenotusedprofessions.ipynb | 127 ++++++++++++++++++++++ 1 file changed, 127 insertions(+) create mode 100644 issue__195_deletenotusedprofessions.ipynb diff --git a/issue__195_deletenotusedprofessions.ipynb b/issue__195_deletenotusedprofessions.ipynb new file mode 100644 index 0000000..09a4a23 --- /dev/null +++ b/issue__195_deletenotusedprofessions.ipynb @@ -0,0 +1,127 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "id": "1e1c119a", + "metadata": {}, + "outputs": [], + "source": [ + "# 2024-05-28 run against production\n", + "import pandas as pd" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d95d08e9", + "metadata": {}, + "outputs": [], + "source": [ + "ProfessionType.objects.all().count()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "5543f8e0", + "metadata": {}, + "outputs": [], + "source": [ + "ProfessionType.objects.filter(person=None).count()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "5b071e98", + "metadata": {}, + "outputs": [], + "source": [ + "data = []\n", + "for x in ProfessionType.objects.filter(person=None).distinct():\n", + " data.append([x.id, x.name])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "30beda86", + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "4fcb8186", + "metadata": {}, + "outputs": [], + "source": [ + "df = pd.DataFrame(data, columns=[\"id\", \"name\"])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d7acc58e", + "metadata": {}, + "outputs": [], + "source": [ + "df.to_csv(\"hansi.csv\", index=False)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "18d88961", + "metadata": {}, + "outputs": [], + "source": [ + "for x in ProfessionType.objects.filter(person=None).distinct():\n", + " x.delete()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "6bac8d6e", + "metadata": {}, + "outputs": [], + "source": [ + "ProfessionType.objects.all().count()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "17dc6357", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Django Shell-Plus", + "language": "python", + "name": "django_extensions" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.12" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} From 75f34a916879bb3a9fb1cbe0a088262989a3a4fb Mon Sep 17 00:00:00 2001 From: csae8092 Date: Mon, 3 Jun 2024 20:22:19 +0200 Subject: [PATCH 2/2] removes entities with wikidata-ids from no wikidata collection --- dumper/management/commands/wikidata_minter.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/dumper/management/commands/wikidata_minter.py b/dumper/management/commands/wikidata_minter.py index 795aca9..d2de2fc 100644 --- a/dumper/management/commands/wikidata_minter.py +++ b/dumper/management/commands/wikidata_minter.py @@ -16,9 +16,15 @@ class Command(BaseCommand): help = "mint WikiData IDs for GND-URIs" def handle(self, *args, **kwargs): - LIMIT = 100 + LIMIT = 2 USER_AGENT_PMB = "pmb (https://pmb.acdh.oeaw.ac.at)" col, _ = Collection.objects.get_or_create(name="No WikiData-ID found") + ents = TempEntityClass.objects.filter(uri__uri__icontains="wikidata").filter(collection=col) + print(f"found {ents.count()} entities with wikidata-ids but related to {col}") + if ents: + print(f"remove relation to {col}") + for x in tqdm(ents, total=ents.count()): + x.collection.remove(col) types = ["d-nb.info", "geonames"] for uri_type in types: print(f"processing URIS with type: {uri_type}")