Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add preliminary support for fetching more than 100 articles with dynamic date range adjustments #116

Open
wants to merge 2 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
55 changes: 55 additions & 0 deletions gnews/gnews.py
Original file line number Diff line number Diff line change
Expand Up @@ -233,10 +233,65 @@ def get_news(self, key):
:return: A list of dictionaries with structure: {0}.
"""
if key:
if self._max_results > 100:
return self._get_more_than_100(key)

key = "%20".join(key.split(" "))
query = '/search?q={}'.format(key)
return self._get_news(query)

def _get_news_more_than_100(self, key):
Copy link
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Hey @irenayli Thanks for the PR, can you please add some examples in readme ?

"""
Fetch more than 100 news articles by iterating backward in time, dynamically adjusting
the date range based on the earliest date seen so far.
"""
articles = []
seen_urls = set()
earliest_date = None

if self._start_date or self._end_date or self._period:
warnings.warn(message=("Searches for over 100 articles do not currently support date ranges. \nStart "
"date, end date, and period will be ignored"), category=UserWarning, stacklevel=4)

# Start with no specific date range for the first query
self._start_date = None
self._end_date = None

while len(articles) < self._max_results:
# Fetch articles for the current range
fetched_articles = self._get_news(f'/search?q={key}')
if not fetched_articles: # Stop if no more articles are found
break

for article in fetched_articles:
if article['url'] not in seen_urls:
articles.append(article)
seen_urls.add(article['url'])

# Track the earliest published date
published_date = article.get("published date")
try:
published_date = datetime.datetime.strptime(published_date, '%a, %d %b %Y %H:%M:%S GMT')
except Exception as e:
logger.warning(f"Failed to parse published date: {e}")
continue

if earliest_date is None or published_date < earliest_date:
earliest_date = published_date

if len(articles) >= self._max_results:
return articles

# If fewer than 100 articles were fetched, assume the range is exhausted
if len(fetched_articles) < 100:
break

# Update the sliding window to fetch older articles
self._end_date = earliest_date
self._start_date = earliest_date - datetime.timedelta(days=7)

return articles

@docstring_parameter(standard_output)
def get_top_news(self):
"""
Expand Down
17 changes: 17 additions & 0 deletions tests/test_gnews.py
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,23 @@ def test_get_news_by_site_invalid(self):
news_articles = self.gnews.get_news_by_site(site)
self.assertEqual(news_articles, [])

def test_get_news_more_than_100(self):
    # Request more articles than a single Google News query can return (100)
    self.gnews = GNews(max_results=150)

    results = self.gnews.get_news("technology")

    # The call should yield a non-empty list capped at max_results
    self.assertTrue(isinstance(results, list))
    self.assertTrue(len(results) > 0)
    self.assertTrue(len(results) <= 150, "Should fetch no more than max_results")

    # Every article URL should appear exactly once
    fetched_urls = [item['url'] for item in results]
    self.assertEqual(len(fetched_urls), len(set(fetched_urls)), "No duplicate articles should be fetched")

def test_get_full_article(self):
pass
# Test that get_full_article returns a valid article object for a valid URL
Expand Down