From f0eccb3790e53586640c415a1f030075d7c60a07 Mon Sep 17 00:00:00 2001 From: Haofei Yu <1125027232@qq.com> Date: Mon, 22 Jul 2024 13:16:12 -0400 Subject: [PATCH] update function name --- research_town/utils/paper_collector.py | 22 ++++++++-------------- tests/utils/test_paper_collector.py | 6 +++--- 2 files changed, 11 insertions(+), 17 deletions(-) diff --git a/research_town/utils/paper_collector.py b/research_town/utils/paper_collector.py index dbb9fad9..49fd5942 100644 --- a/research_town/utils/paper_collector.py +++ b/research_town/utils/paper_collector.py @@ -32,7 +32,7 @@ def get_daily_papers( paper_table_captions, paper_figure_captions, paper_bibliography, - ) = get_full_content(paper_url) + ) = get_paper_content(paper_url) if publish_time in content: content[publish_time]['title'].append(paper_title) @@ -41,12 +41,9 @@ def get_daily_papers( content[publish_time]['url'].append(paper_url) content[publish_time]['domain'].append(paper_domain) content[publish_time]['timestamp'].append(paper_timestamp) - content[publish_time]['section_contents'].append( - paper_section_contents) - content[publish_time]['table_captions'].append( - paper_table_captions) - content[publish_time]['figure_captions'].append( - paper_figure_captions) + content[publish_time]['section_contents'].append(paper_section_contents) + content[publish_time]['table_captions'].append(paper_table_captions) + content[publish_time]['figure_captions'].append(paper_figure_captions) content[publish_time]['bibliography'].append(paper_bibliography) else: content[publish_time] = {} @@ -56,15 +53,14 @@ def get_daily_papers( content[publish_time]['url'] = [paper_url] content[publish_time]['domain'] = [paper_domain] content[publish_time]['timestamp'] = [paper_timestamp] - content[publish_time]['section_contents'] = [ - paper_section_contents] + content[publish_time]['section_contents'] = [paper_section_contents] content[publish_time]['table_captions'] = [paper_table_captions] content[publish_time]['figure_captions'] = [paper_figure_captions] content[publish_time]['bibliography'] = [paper_bibliography] return content, publish_time -def get_full_content( +def get_paper_content( url: str, ) -> Tuple[ Optional[Dict[str, str]], @@ -95,8 +91,7 @@ def get_full_content( section_contents[section_title] = section_content # bibliography - bibliography_raw = article.find( - 'section', class_='ltx_bibliography') + bibliography_raw = article.find('section', class_='ltx_bibliography') if bibliography_raw is not None: bibliography = {} bibliography_list = bibliography_raw.find_all( @@ -140,8 +135,7 @@ def get_full_content( table_captions = {} table_index = 0 for table in tables: - table_caption_raw = table.find( - 'figcaption', class_='ltx_caption') + table_caption_raw = table.find('figcaption', class_='ltx_caption') if table_caption_raw: table_caption = table_caption_raw.text else: diff --git a/tests/utils/test_paper_collector.py b/tests/utils/test_paper_collector.py index b4cea8b1..12d5ec2b 100644 --- a/tests/utils/test_paper_collector.py +++ b/tests/utils/test_paper_collector.py @@ -1,7 +1,7 @@ import datetime from unittest.mock import MagicMock, patch -from research_town.utils.paper_collector import get_daily_papers, get_full_content +from research_town.utils.paper_collector import get_daily_papers, get_paper_content def test_get_daily_papers() -> None: @@ -29,8 +29,8 @@ def test_get_daily_papers() -> None: assert newest_day == datetime.date(2023, 7, 2) # Compare to the date part only -def test_get_full_content() -> None: - section_contents, table_captions, figure_captions, bibliography = get_full_content( +def test_get_paper_content() -> None: + section_contents, table_captions, figure_captions, bibliography = get_paper_content( 'https://arxiv.org/html/2403.05534v1' ) assert section_contents is not None