From 9d2f5551b56d55b5b0255c8cb3799a8efc834a3e Mon Sep 17 00:00:00 2001 From: irenayli Date: Mon, 9 Dec 2024 17:06:04 -0500 Subject: [PATCH 1/2] implemented more than 100 article fetches using sliding time window, added respective test case --- gnews/gnews.py | 55 +++++++++++++++++++++++++++++++++++++++++++++ tests/test_gnews.py | 17 ++++++++++++++ 2 files changed, 72 insertions(+) diff --git a/gnews/gnews.py b/gnews/gnews.py index fe689d2..a5ed539 100644 --- a/gnews/gnews.py +++ b/gnews/gnews.py @@ -233,10 +233,65 @@ def get_news(self, key): :return: A list of dictionaries with structure: {0}. """ if key: + if self._max_results > 100: + return self._get_more_than_100(key) + key = "%20".join(key.split(" ")) query = '/search?q={}'.format(key) return self._get_news(query) + def _get_more_than_100(self, key): + """ + Fetch more than 100 news articles by iterating backward in time, dynamically adjusting + the date range based on the earliest date seen so far. + """ + articles = [] + seen_urls = set() + earliest_date = None + + if self._start_date or self._end_date or self._period: + warnings.warn(message=("Searches for over 100 articles do not currently support date ranges. 
\nStart " + "date, end date, and period will be ignored"), category=UserWarning, stacklevel=4) + + # Start with no specific date range for the first query + self._start_date = None + self._end_date = None + + while len(articles) < self._max_results: + # Fetch articles for the current range + fetched_articles = self._get_news(f'/search?q={key}') + if not fetched_articles: # Stop if no more articles are found + break + + for article in fetched_articles: + if article['url'] not in seen_urls: + articles.append(article) + seen_urls.add(article['url']) + + # Track the earliest published date + published_date = article.get("published date") + try: + published_date = datetime.datetime.strptime(published_date, '%a, %d %b %Y %H:%M:%S GMT') + except Exception as e: + logger.warning(f"Failed to parse published date: {e}") + continue + + if earliest_date is None or published_date < earliest_date: + earliest_date = published_date + + if len(articles) >= self._max_results: + return articles + + # If fewer than 100 articles were fetched, assume the range is exhausted + if len(fetched_articles) < 100: + break + + # Update the sliding window to fetch older articles + self._end_date = earliest_date + self._start_date = earliest_date - datetime.timedelta(days=7) + + return articles + @docstring_parameter(standard_output) def get_top_news(self): """ diff --git a/tests/test_gnews.py b/tests/test_gnews.py index ece1fb8..7ec74dd 100644 --- a/tests/test_gnews.py +++ b/tests/test_gnews.py @@ -45,6 +45,23 @@ def test_get_news_by_site_invalid(self): news_articles = self.gnews.get_news_by_site(site) self.assertEqual(news_articles, []) + def test_get_news_more_than_100(self): + # Set up a GNews instance with a high max_results value + self.gnews = GNews(max_results=150) + query = "technology" + + # Call get_news with the query + news_articles = self.gnews.get_news(query) + + # Verify the result respects the maximum result cap + self.assertTrue(isinstance(news_articles, list)) + 
self.assertTrue(len(news_articles) > 0)
+        self.assertTrue(len(news_articles) <= 150, "Should fetch no more than max_results")
+
+        # Ensure no duplicates in the results
+        urls = [article['url'] for article in news_articles]
+        self.assertEqual(len(urls), len(set(urls)), "No duplicate articles should be fetched")
+
     def test_get_full_article(self):
         pass
         # Test that get_full_article returns a valid article object for a valid URL

From 5dff304b788c935f8fd1bf964f3469d170ecb498 Mon Sep 17 00:00:00 2001
From: irenayli
Date: Mon, 9 Dec 2024 17:11:18 -0500
Subject: [PATCH 2/2] renamed helper function

Rename also updates the call site in get_news; renaming only the
definition would leave get_news calling a method that no longer
exists (AttributeError whenever max_results > 100).
---
 gnews/gnews.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/gnews/gnews.py b/gnews/gnews.py
index a5ed539..a4dd7c7 100644
--- a/gnews/gnews.py
+++ b/gnews/gnews.py
@@ -235,12 +235,12 @@ def get_news(self, key):
         if key:
             if self._max_results > 100:
-                return self._get_more_than_100(key)
+                return self._get_news_more_than_100(key)
 
             key = "%20".join(key.split(" "))
             query = '/search?q={}'.format(key)
             return self._get_news(query)
 
-    def _get_more_than_100(self, key):
+    def _get_news_more_than_100(self, key):
         """
         Fetch more than 100 news articles by iterating backward in time, dynamically adjusting
         the date range based on the earliest date seen so far.