Skip to content

Commit

Permalink
Methods for mapping Wikipedia ID (Page ID) to Wikipedia page title an…
Browse files Browse the repository at this point in the history
…d back (tests included) (#15)

* added methods for mapping Wikipedia id (page id) to Wikipedia page title and back
  • Loading branch information
Goader authored May 11, 2023
1 parent efab44a commit 8eca8e6
Show file tree
Hide file tree
Showing 3 changed files with 120 additions and 0 deletions.
22 changes: 22 additions & 0 deletions README.rst
Original file line number Diff line number Diff line change
Expand Up @@ -131,6 +131,28 @@ Map Wikidata id to Wikipedia id
Mapping Wikidata id to Wikipedia id can lead to more than one result, as some pages in Wikipedia are
redirects, all linking to the same Wikidata item.

Map Wikipedia id to Wikipedia page title
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

.. code:: python

    from wikimapper import WikiMapper

    mapper = WikiMapper("index_enwiki-latest.db")
    page_title = mapper.wikipedia_id_to_title(3342)
    print(page_title) # Bundesrepublik_Deutschland

Map Wikipedia page title to Wikipedia id
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

.. code:: python

    from wikimapper import WikiMapper

    mapper = WikiMapper("index_enwiki-latest.db")
    wikipedia_id = mapper.title_to_wikipedia_id("Germany")
    print(wikipedia_id) # 11867

Create your own index
~~~~~~~~~~~~~~~~~~~~~

Expand Down
55 changes: 55 additions & 0 deletions tests/test_mapper.py
Original file line number Diff line number Diff line change
Expand Up @@ -94,3 +94,58 @@ def test_id_to_wikipedia_ids(bavarian_wiki_mapper, wikidata_id: str, expected: L
wikipedia_ids = mapper.id_to_wikipedia_ids(wikidata_id)

assert set(wikipedia_ids) == set(expected)


@pytest.mark.parametrize(
    "wikipedia_id, expected",
    [
        (24520, "Stoaboog"),
        (8535, "Wechslkrod"),
        (32218, None),  # Wechslkrod, but namespace 1, so cannot be in the database
        (32176, "Wickiana"),
        (32252, "Ulrich_Zwingli"),
        (32311, "Jingstes_Gricht"),
        (2143, "Sånkt_Johann_im_Pongau"),
        (2217, "Quadrátkilometa"),
        (4209, "D'_boarische_Woocha"),
        (1997, "Brezn"),
        (5740, None),  # Brezn, but namespace 1, so cannot be in the database
        (24100, "Brezel"),
        (28193, "Brezen"),
        (105208, "Vulkanologie"),
        (105288, "Vuikanologie"),
    ],
)
def test_wikipedia_id_to_title(bavarian_wiki_mapper, wikipedia_id: int, expected: str):
    """Check that each Wikipedia page id resolves to the expected page title.

    `expected` is `None` for ids that must not be present in the index.
    """
    assert bavarian_wiki_mapper.wikipedia_id_to_title(wikipedia_id) == expected


@pytest.mark.parametrize(
    "title, expected",
    [
        ("Stoaboog", 24520),
        ("Wechslkrod", 8535),
        ("Wickiana", 32176),
        ("Ulrich_Zwingli", 32252),
        ("Jingstes_Gricht", 32311),
        ("Sånkt_Johann_im_Pongau", 2143),
        ("Quadrátkilometa", 2217),
        ("D'_boarische_Woocha", 4209),
        ("Brezn", 1997),
        ("Brezel", 24100),
        ("Brezen", 28193),
        ("Vulkanologie", 105208),
        ("Vuikanologie", 105288),
        ("xxxxxxxxxx", None),
    ],
)
def test_title_to_wikipedia_id(bavarian_wiki_mapper, title: str, expected: int):
    """Check that each Wikipedia page title resolves to the expected page id.

    `expected` is `None` for a title that is not present in the index.
    """
    # NOTE: this test needs its own name. Reusing the name of the
    # id -> title test would rebind that module attribute, and pytest would
    # then collect only one of the two tests.
    mapper = bavarian_wiki_mapper

    wikipedia_id = mapper.title_to_wikipedia_id(title)

    assert wikipedia_id == expected
43 changes: 43 additions & 0 deletions wikimapper/mapper.py
Original file line number Diff line number Diff line change
Expand Up @@ -114,3 +114,46 @@ def id_to_wikipedia_ids(self, wikidata_id: str) -> List[int]:
results = c.fetchall()

return [e[0] for e in results]

def wikipedia_id_to_title(self, wikipedia_id: int) -> Optional[str]:
    """Look up the page title stored for a Wikipedia ID (also known as Page ID).

    Args:
        wikipedia_id (int): The Wikipedia ID to map, e.g. `11867`

    Returns:
        Optional[str]: The title mapped to `wikipedia_id`, or `None` when no
            row matches or the matching row holds no title.
    """
    query = "SELECT wikipedia_title FROM mapping WHERE wikipedia_id=?"
    row = self.conn.execute(query, (wikipedia_id,)).fetchone()

    # No matching row at all: nothing to report.
    if row is None:
        return None

    # A row whose title column is empty yields `None` here as well.
    return row[0]

def title_to_wikipedia_id(self, page_title: str) -> Optional[int]:
    """Given a Wikipedia page title, returns the corresponding Wikipedia id.

    Args:
        page_title (str): The Wikipedia page title to map, e.g. `Germany`

    Returns:
        Optional[int]: If a mapping is found for `page_title`, then return
            its Wikipedia id, else return `None`.
    """

    # no need for `DISTINCT` as `wikipedia_id` is a PRIMARY KEY, thus we have no duplicates there
    c = self.conn.execute(
        "SELECT wikipedia_id FROM mapping WHERE wikipedia_title=?", (page_title,)
    )
    result = c.fetchone()

    # `fetchone` gives `None` when nothing matched; otherwise the single
    # selected column is the id (which may itself be `None`).
    return None if result is None else result[0]

0 comments on commit 8eca8e6

Please sign in to comment.