From 9d2f5551b56d55b5b0255c8cb3799a8efc834a3e Mon Sep 17 00:00:00 2001 From: irenayli Date: Mon, 9 Dec 2024 17:06:04 -0500 Subject: [PATCH 1/2] implemented more than 100 article fetches using sliding time window, added respective test case --- gnews/gnews.py | 55 +++++++++++++++++++++++++++++++++++++++++++++ tests/test_gnews.py | 17 ++++++++++++++ 2 files changed, 72 insertions(+) diff --git a/gnews/gnews.py b/gnews/gnews.py index fe689d2..a5ed539 100644 --- a/gnews/gnews.py +++ b/gnews/gnews.py @@ -233,10 +233,65 @@ def get_news(self, key): :return: A list of dictionaries with structure: {0}. """ if key: + if self._max_results > 100: + return self._get_more_than_100(key) + key = "%20".join(key.split(" ")) query = '/search?q={}'.format(key) return self._get_news(query) + def _get_more_than_100(self, key): + """ + Fetch more than 100 news articles by iterating backward in time, dynamically adjusting + the date range based on the earliest date seen so far. + """ + articles = [] + seen_urls = set() + earliest_date = None + + if self._start_date or self._end_date or self._period: + warnings.warn(message=("Searches for over 100 articles do not currently support date ranges. 
\nStart " + "date, end date, and period will be ignored"), category=UserWarning, stacklevel=4) + + # Start with no specific date range for the first query + self._start_date = None + self._end_date = None + + while len(articles) < self._max_results: + # Fetch articles for the current range + fetched_articles = self._get_news(f'/search?q={key}') + if not fetched_articles: # Stop if no more articles are found + break + + for article in fetched_articles: + if article['url'] not in seen_urls: + articles.append(article) + seen_urls.add(article['url']) + + # Track the earliest published date + published_date = article.get("published date") + try: + published_date = datetime.datetime.strptime(published_date, '%a, %d %b %Y %H:%M:%S GMT') + except Exception as e: + logger.warning(f"Failed to parse published date: {e}") + continue + + if earliest_date is None or published_date < earliest_date: + earliest_date = published_date + + if len(articles) >= self._max_results: + return articles + + # If fewer than 100 articles were fetched, assume the range is exhausted + if len(fetched_articles) < 100: + break + + # Update the sliding window to fetch older articles + self._end_date = earliest_date + self._start_date = earliest_date - datetime.timedelta(days=7) + + return articles + @docstring_parameter(standard_output) def get_top_news(self): """ diff --git a/tests/test_gnews.py b/tests/test_gnews.py index ece1fb8..7ec74dd 100644 --- a/tests/test_gnews.py +++ b/tests/test_gnews.py @@ -45,6 +45,23 @@ def test_get_news_by_site_invalid(self): news_articles = self.gnews.get_news_by_site(site) self.assertEqual(news_articles, []) + def test_get_news_more_than_100(self): + # Set up a GNews instance with a high max_results value + self.gnews = GNews(max_results=150) + query = "technology" + + # Call get_news with the query + news_articles = self.gnews.get_news(query) + + # Verify the result respects the maximum result cap + self.assertTrue(isinstance(news_articles, list)) + 
self.assertTrue(len(news_articles) > 0)
+        self.assertTrue(len(news_articles) <= 150, "Should fetch no more than max_results")
+
+        # Ensure no duplicates in the results
+        urls = [article['url'] for article in news_articles]
+        self.assertEqual(len(urls), len(set(urls)), "No duplicate articles should be fetched")
+
     def test_get_full_article(self):
         pass
         # Test that get_full_article returns a valid article object for a valid URL

From 5dff304b788c935f8fd1bf964f3469d170ecb498 Mon Sep 17 00:00:00 2001
From: irenayli
Date: Mon, 9 Dec 2024 17:11:18 -0500
Subject: [PATCH 2/2] renamed helper function

Rename also updates the call site in get_news; renaming only the
definition would leave get_news calling a method that no longer
exists (AttributeError whenever max_results > 100).
---
 gnews/gnews.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/gnews/gnews.py b/gnews/gnews.py
index a5ed539..a4dd7c7 100644
--- a/gnews/gnews.py
+++ b/gnews/gnews.py
@@ -235,12 +235,12 @@ def get_news(self, key):
         if key:
             if self._max_results > 100:
-                return self._get_more_than_100(key)
+                return self._get_news_more_than_100(key)
 
             key = "%20".join(key.split(" "))
             query = '/search?q={}'.format(key)
             return self._get_news(query)
 
-    def _get_more_than_100(self, key):
+    def _get_news_more_than_100(self, key):
         """
         Fetch more than 100 news articles by iterating backward in time, dynamically adjusting
         the date range based on the earliest date seen so far.