Skip to content

Commit

Permalink
added cluster_id filter (#7)
Browse files Browse the repository at this point in the history
  • Loading branch information
theorm authored Oct 16, 2024
1 parent c9ef79d commit f8b1cd9
Show file tree
Hide file tree
Showing 3 changed files with 149 additions and 0 deletions.
142 changes: 142 additions & 0 deletions examples/notebooks/textReuse.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -2048,6 +2048,148 @@
")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Find passages for a cluster by its ID"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<h2>FindTextReusePassages result</h2>\n",
"<div>Contains <b>2</b> items of <b>2</b> total items.</div>\n",
"<br/>\n",
"See this result in the <a href=\"https://impresso-project.ch/app/text-reuse/passages?sort=clusterSize&sq=CiEYHSoddHItbm9icC1hbGwtdjAxLWMxMzc0Mzg5NzgzMzI=&p=1\">Impresso App</a>.\n",
"<h3>Data preview:</h3>\n",
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>offsetStart</th>\n",
" <th>offsetEnd</th>\n",
" <th>content</th>\n",
" <th>title</th>\n",
" <th>pageNumbers</th>\n",
" <th>collections</th>\n",
" <th>connectedClusters</th>\n",
" <th>isFront</th>\n",
" <th>size</th>\n",
" <th>date</th>\n",
" <th>pageRegions</th>\n",
" <th>article.id</th>\n",
" <th>textReuseCluster.id</th>\n",
" <th>textReuseCluster.clusterSize</th>\n",
" <th>textReuseCluster.timeDifferenceDay</th>\n",
" <th>textReuseCluster.lexicalOverlap</th>\n",
" <th>newspaper.id</th>\n",
" <th>issue.id</th>\n",
" </tr>\n",
" <tr>\n",
" <th>id</th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>c137438978332-LLE-1891-07-21-a-i0023@3354:5682</th>\n",
" <td>3354</td>\n",
" <td>5682</td>\n",
" <td>Un rapport géné-\\nral au roi Léopold sur la si...</td>\n",
" <td>BULLETIN POLITIQUE</td>\n",
" <td>[1]</td>\n",
" <td>[local-duma-x9GD_Bj6, local-eb-ikYoMqvi]</td>\n",
" <td>[{'id': 'tr-nobp-all-v01-c137438978332'}, {'id...</td>\n",
" <td>True</td>\n",
" <td>2328</td>\n",
" <td>1891-07-21T00:00:00+00:00</td>\n",
" <td>[657,609,1237,3220]</td>\n",
" <td>LLE-1891-07-21-a-i0023</td>\n",
" <td>tr-nobp-all-v01-c137438978332</td>\n",
" <td>2</td>\n",
" <td>1</td>\n",
" <td>59.183673</td>\n",
" <td>LLE</td>\n",
" <td>LLE-1891-07-21-a</td>\n",
" </tr>\n",
" <tr>\n",
" <th>c137438978332-indeplux-1891-07-20-a-i0012@2514:5104</th>\n",
" <td>2514</td>\n",
" <td>5104</td>\n",
" <td>Un rapport général au roi Léopold\\nsur la situ...</td>\n",
" <td>Le Congo.</td>\n",
" <td>[1]</td>\n",
" <td>[]</td>\n",
" <td>[{'id': 'tr-nobp-all-v01-c137438978332'}, {'id...</td>\n",
" <td>True</td>\n",
" <td>2590</td>\n",
" <td>1891-07-20T00:00:00+00:00</td>\n",
" <td>[1822,976,1565,3353]</td>\n",
" <td>indeplux-1891-07-20-a-i0012</td>\n",
" <td>tr-nobp-all-v01-c137438978332</td>\n",
" <td>2</td>\n",
" <td>1</td>\n",
" <td>59.183673</td>\n",
" <td>indeplux</td>\n",
" <td>indeplux-1891-07-20-a</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
"<impresso.resources.text_reuse.passages.FindTextReusePassagesContainer at 0x111a1d8d0>"
]
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"impresso.text_reuse.passages.find(\n",
" cluster_id=\"tr-nobp-all-v01-c137438978332\",\n",
" order_by=\"clusterSize\",\n",
")"
]
},
{
"cell_type": "markdown",
"metadata": {},
Expand Down
3 changes: 3 additions & 0 deletions impresso/resources/text_reuse/clusters.py
Original file line number Diff line number Diff line change
Expand Up @@ -254,6 +254,7 @@ def _build_cluster_facet_filters(

def _build_filters(
text: str | None = None,
cluster_id: str | AND[str] | OR[str] | None = None,
cluster_size: Range | AND[Range] | OR[Range] | None = None,
title: str | AND[str] | OR[str] | None = None,
lexical_overlap: Range | AND[Range] | OR[Range] | None = None,
Expand All @@ -273,6 +274,8 @@ def _build_filters(
filters: list[Filter] = []
if text is not None:
filters.extend(and_or_filter(text, "string"))
if cluster_id is not None:
filters.extend(and_or_filter(cluster_id, "text_reuse_cluster"))
if cluster_size is not None:
filters.extend(
and_or_filter(
Expand Down
4 changes: 4 additions & 0 deletions impresso/resources/text_reuse/passages.py
Original file line number Diff line number Diff line change
Expand Up @@ -54,6 +54,7 @@ def find(
limit: int | None = None,
offset: int | None = None,
order_by: FindTextReusePassagesOrderByLiteral | None = None,
cluster_id: str | AND[str] | OR[str] | None = None,
cluster_size: Range | AND[Range] | OR[Range] | None = None,
title: str | AND[str] | OR[str] | None = None,
lexical_overlap: Range | AND[Range] | OR[Range] | None = None,
Expand All @@ -70,6 +71,7 @@ def find(
) -> FindTextReusePassagesContainer:
# Reuse _build_filters from the clusters module: passages accept the same filters as clusters.
filters = _build_filters(
cluster_id=cluster_id,
cluster_size=cluster_size,
title=title,
lexical_overlap=lexical_overlap,
Expand Down Expand Up @@ -119,6 +121,7 @@ def facet(
limit: int | None = None,
offset: int | None = None,
order_by: FindTextReusePassagesOrderByLiteral | None = None,
cluster_id: str | AND[str] | OR[str] | None = None,
cluster_size: Range | AND[Range] | OR[Range] | None = None,
title: str | AND[str] | OR[str] | None = None,
lexical_overlap: Range | AND[Range] | OR[Range] | None = None,
Expand All @@ -138,6 +141,7 @@ def facet(
raise ValueError(f"{facet} is not a valid value")

filters = _build_filters(
cluster_id=cluster_id,
cluster_size=cluster_size,
title=title,
lexical_overlap=lexical_overlap,
Expand Down

0 comments on commit f8b1cd9

Please sign in to comment.