Skip to content

Commit

Permalink
Merge pull request #272 from amosproj/archive-combined-search-terms
Browse files Browse the repository at this point in the history
Archive combined search terms
  • Loading branch information
eloinoel authored Jul 13, 2024
2 parents 7164af7 + 84dd471 commit e2c632a
Showing 1 changed file with 14 additions and 2 deletions.
16 changes: 14 additions & 2 deletions src/backend/Scrapers/Archive/archive.py
Original file line number Diff line number Diff line change
Expand Up @@ -54,13 +54,25 @@ def extract_arxiv_id_from_url(cls, url):
# Runs one ArXiv search per keyword and concatenates the results.
# For a single combined search over all keywords, see query_ids_from_keywords.
@classmethod
def query_ids_per_keyword(cls, keywords, max_results=100_000) -> list[str]:
    """Query ArXiv once per keyword and return the collected paper ids.

    Each keyword is sent as its own query, and the ids extracted from every
    result set are appended to one list in keyword order. Ids matched by more
    than one keyword therefore appear more than once; callers that need
    unique ids must de-duplicate.

    Args:
        keywords: Iterable of search terms; each term becomes one query.
        max_results: Forwarded unchanged to ``search_arxiv_ids`` for every
            keyword (presumably a per-query cap on results; confirm against
            ``search_arxiv_ids``).

    Returns:
        The extracted ids; an empty list when ``keywords`` is empty.
    """
    ids = []
    for keyword in keywords:
        # Go through ``cls`` (as for the id extraction below) so subclass
        # overrides of the search are honoured.
        arxiv_links = cls.search_arxiv_ids(keyword, max_results)
        ids.extend(cls.extract_arxiv_id_from_url(url) for url in arxiv_links)
    return ids

@classmethod
def query_ids_from_keywords(cls, keywords, max_results=100_000) -> list[str]:
    """Query ArXiv once with all keywords combined and return the paper ids.

    The usable keywords are joined with ``' AND '`` into a single query
    string, which is passed to ``search_arxiv_ids``.

    Args:
        keywords: Iterable of search terms. Empty or whitespace-only entries
            are ignored so they cannot leave a dangling ``AND`` in the query.
        max_results: Forwarded unchanged to ``search_arxiv_ids`` (presumably
            a cap on the number of results; confirm against
            ``search_arxiv_ids``).

    Returns:
        The ids extracted from the search results; an empty list when no
        usable keyword is given (no search is issued in that case).
    """
    terms = [keyword for keyword in keywords if keyword and keyword.strip()]
    if not terms:
        # Mirror query_ids_per_keyword: nothing to search for means no ids,
        # rather than sending an empty query to the search backend.
        return []
    arxiv_links = cls.search_arxiv_ids(' AND '.join(terms), max_results)
    return [cls.extract_arxiv_id_from_url(url) for url in arxiv_links]

@classmethod
def search_arxiv_ids(cls, query, max_results) -> list[str]:
"""Search ArXiv for articles related to a query and return the ids."""
Expand Down Expand Up @@ -204,6 +216,6 @@ def _scrape(self) -> TypeArchiveScrappingData:
@classmethod
def get_all_possible_elements(cls, target) -> List[BaseScraper]:
    """Build scrapers for every paper id matching ``target`` that is not indexed yet.

    Runs one combined keyword search via ``query_ids_from_keywords`` and
    subtracts the ids already listed under ``cls.INDEX['indexes']``.

    Args:
        target: Object providing ``keywords`` and ``max_results``; both are
            passed straight to ``query_ids_from_keywords``.

    Returns:
        One ``ArchiveScraper`` per newly found id. The order is unspecified
        because the ids pass through a set difference.
    """
    old_indexes = set(cls.INDEX['indexes'])
    # A single combined search only: the earlier per-keyword search was
    # superseded, and running it first would just be a discarded query.
    new_indexes = set(cls.query_ids_from_keywords(target.keywords, target.max_results))
    new_target_elements = new_indexes - old_indexes
    return [ArchiveScraper(element_id=element_id) for element_id in new_target_elements]

0 comments on commit e2c632a

Please sign in to comment.