From f8b1cd9223c42609a31bf2806b904c1f6b66b728 Mon Sep 17 00:00:00 2001 From: Roman Kalyakin Date: Wed, 16 Oct 2024 12:53:52 +0200 Subject: [PATCH] added cluster_id filter (#7) --- examples/notebooks/textReuse.ipynb | 142 ++++++++++++++++++++++ impresso/resources/text_reuse/clusters.py | 3 + impresso/resources/text_reuse/passages.py | 4 + 3 files changed, 149 insertions(+) diff --git a/examples/notebooks/textReuse.ipynb b/examples/notebooks/textReuse.ipynb index 613351d..f8eb89f 100644 --- a/examples/notebooks/textReuse.ipynb +++ b/examples/notebooks/textReuse.ipynb @@ -2048,6 +2048,148 @@ ")" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Find passages for a cluster by its ID" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "

FindTextReusePassages result

\n", + "
Contains 2 items of 2 total items.
\n", + "
\n", + "See this result in the Impresso App.\n", + "

Data preview:

\n", + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
offsetStartoffsetEndcontenttitlepageNumberscollectionsconnectedClustersisFrontsizedatepageRegionsarticle.idtextReuseCluster.idtextReuseCluster.clusterSizetextReuseCluster.timeDifferenceDaytextReuseCluster.lexicalOverlapnewspaper.idissue.id
id
c137438978332-LLE-1891-07-21-a-i0023@3354:568233545682Un rapport géné-\\nral au roi Léopold sur la si...BULLETIN POLITIQUE[1][local-duma-x9GD_Bj6, local-eb-ikYoMqvi][{'id': 'tr-nobp-all-v01-c137438978332'}, {'id...True23281891-07-21T00:00:00+00:00[657,609,1237,3220]LLE-1891-07-21-a-i0023tr-nobp-all-v01-c1374389783322159.183673LLELLE-1891-07-21-a
c137438978332-indeplux-1891-07-20-a-i0012@2514:510425145104Un rapport général au roi Léopold\\nsur la situ...Le Congo.[1][][{'id': 'tr-nobp-all-v01-c137438978332'}, {'id...True25901891-07-20T00:00:00+00:00[1822,976,1565,3353]indeplux-1891-07-20-a-i0012tr-nobp-all-v01-c1374389783322159.183673indepluxindeplux-1891-07-20-a
\n", + "
" + ], + "text/plain": [ + "" + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "impresso.text_reuse.passages.find(\n", + " cluster_id=\"tr-nobp-all-v01-c137438978332\",\n", + " order_by=\"clusterSize\",\n", + ")" + ] + }, { "cell_type": "markdown", "metadata": {}, diff --git a/impresso/resources/text_reuse/clusters.py b/impresso/resources/text_reuse/clusters.py index 888ed9d..0264150 100644 --- a/impresso/resources/text_reuse/clusters.py +++ b/impresso/resources/text_reuse/clusters.py @@ -254,6 +254,7 @@ def _build_cluster_facet_filters( def _build_filters( text: str | None = None, + cluster_id: str | AND[str] | OR[str] | None = None, cluster_size: Range | AND[Range] | OR[Range] | None = None, title: str | AND[str] | OR[str] | None = None, lexical_overlap: Range | AND[Range] | OR[Range] | None = None, @@ -273,6 +274,8 @@ def _build_filters( filters: list[Filter] = [] if text is not None: filters.extend(and_or_filter(text, "string")) + if cluster_id is not None: + filters.extend(and_or_filter(cluster_id, "text_reuse_cluster")) if cluster_size is not None: filters.extend( and_or_filter( diff --git a/impresso/resources/text_reuse/passages.py b/impresso/resources/text_reuse/passages.py index 9ee0804..5176552 100644 --- a/impresso/resources/text_reuse/passages.py +++ b/impresso/resources/text_reuse/passages.py @@ -54,6 +54,7 @@ def find( limit: int | None = None, offset: int | None = None, order_by: FindTextReusePassagesOrderByLiteral | None = None, + cluster_id: str | AND[str] | OR[str] | None = None, cluster_size: Range | AND[Range] | OR[Range] | None = None, title: str | AND[str] | OR[str] | None = None, lexical_overlap: Range | AND[Range] | OR[Range] | None = None, @@ -70,6 +71,7 @@ def find( ) -> FindTextReusePassagesContainer: # reusing build filters from clusters - they are the same filters = _build_filters( + cluster_id=cluster_id, cluster_size=cluster_size, title=title, lexical_overlap=lexical_overlap, @@ -119,6 +121,7 @@ def facet( limit: int | None = None, offset: int | None = None, order_by: FindTextReusePassagesOrderByLiteral | None = None, + cluster_id: str | AND[str] | OR[str] | None = None, cluster_size: Range | AND[Range] | OR[Range] | None = None, title: str | AND[str] | OR[str] | None = None, lexical_overlap: Range | AND[Range] | OR[Range] | None = None, @@ -138,6 +141,7 @@ def facet( raise ValueError(f"{facet} is not a valid value") filters = _build_filters( + cluster_id=cluster_id, cluster_size=cluster_size, title=title, lexical_overlap=lexical_overlap,