Skip to content

Commit

Permalink
Update function name: rename get_full_content to get_paper_content
Browse files (browse the repository at this point in the history)
  • Loading branch information
lwaekfjlk committed Jul 22, 2024
1 parent e144a32 commit f0eccb3
Show file tree
Hide file tree
Showing 2 changed files with 11 additions and 17 deletions.
22 changes: 8 additions & 14 deletions research_town/utils/paper_collector.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,7 @@ def get_daily_papers(
paper_table_captions,
paper_figure_captions,
paper_bibliography,
) = get_full_content(paper_url)
) = get_paper_content(paper_url)

if publish_time in content:
content[publish_time]['title'].append(paper_title)
Expand All @@ -41,12 +41,9 @@ def get_daily_papers(
content[publish_time]['url'].append(paper_url)
content[publish_time]['domain'].append(paper_domain)
content[publish_time]['timestamp'].append(paper_timestamp)
content[publish_time]['section_contents'].append(
paper_section_contents)
content[publish_time]['table_captions'].append(
paper_table_captions)
content[publish_time]['figure_captions'].append(
paper_figure_captions)
content[publish_time]['section_contents'].append(paper_section_contents)
content[publish_time]['table_captions'].append(paper_table_captions)
content[publish_time]['figure_captions'].append(paper_figure_captions)
content[publish_time]['bibliography'].append(paper_bibliography)
else:
content[publish_time] = {}
Expand All @@ -56,15 +53,14 @@ def get_daily_papers(
content[publish_time]['url'] = [paper_url]
content[publish_time]['domain'] = [paper_domain]
content[publish_time]['timestamp'] = [paper_timestamp]
content[publish_time]['section_contents'] = [
paper_section_contents]
content[publish_time]['section_contents'] = [paper_section_contents]
content[publish_time]['table_captions'] = [paper_table_captions]
content[publish_time]['figure_captions'] = [paper_figure_captions]
content[publish_time]['bibliography'] = [paper_bibliography]
return content, publish_time


def get_full_content(
def get_paper_content(
url: str,
) -> Tuple[
Optional[Dict[str, str]],
Expand Down Expand Up @@ -95,8 +91,7 @@ def get_full_content(
section_contents[section_title] = section_content

# bibliography
bibliography_raw = article.find(
'section', class_='ltx_bibliography')
bibliography_raw = article.find('section', class_='ltx_bibliography')
if bibliography_raw is not None:
bibliography = {}
bibliography_list = bibliography_raw.find_all(
Expand Down Expand Up @@ -140,8 +135,7 @@ def get_full_content(
table_captions = {}
table_index = 0
for table in tables:
table_caption_raw = table.find(
'figcaption', class_='ltx_caption')
table_caption_raw = table.find('figcaption', class_='ltx_caption')
if table_caption_raw:
table_caption = table_caption_raw.text
else:
Expand Down
6 changes: 3 additions & 3 deletions tests/utils/test_paper_collector.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
import datetime
from unittest.mock import MagicMock, patch

from research_town.utils.paper_collector import get_daily_papers, get_full_content
from research_town.utils.paper_collector import get_daily_papers, get_paper_content


def test_get_daily_papers() -> None:
Expand Down Expand Up @@ -29,8 +29,8 @@ def test_get_daily_papers() -> None:
assert newest_day == datetime.date(2023, 7, 2) # Compare to the date part only


def test_get_full_content() -> None:
section_contents, table_captions, figure_captions, bibliography = get_full_content(
def test_get_paper_content() -> None:
section_contents, table_captions, figure_captions, bibliography = get_paper_content(
'https://arxiv.org/html/2403.05534v1'
)
assert section_contents is not None
Expand Down

0 comments on commit f0eccb3

Please sign in to comment.