From 0989e2e1b3e8eb80b87ee65d272d0edbeba49de2 Mon Sep 17 00:00:00 2001 From: chengzr01 Date: Mon, 22 Jul 2024 16:25:43 +0800 Subject: [PATCH 1/6] initialize full paper api (#423) --- research_town/utils/paper_collector.py | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/research_town/utils/paper_collector.py b/research_town/utils/paper_collector.py index 367d8ff5..29478de3 100644 --- a/research_town/utils/paper_collector.py +++ b/research_town/utils/paper_collector.py @@ -1,7 +1,9 @@ import datetime import arxiv +import requests from beartype.typing import Any, Dict, List, Tuple +from bs4 import BeautifulSoup def get_daily_papers( @@ -42,4 +44,19 @@ def get_daily_papers( content[publish_time]['url'] = [paper_url] content[publish_time]['domain'] = [paper_domain] content[publish_time]['timestamp'] = [timestamp] + + html_url = paper_url.replace('abs', 'html') + response = requests.get(html_url, timeout=60) + if response.status_code == 200: + soup = BeautifulSoup(response.text, 'lxml') + article = soup.find('article', class_='ltx_document') + sections = article.find_all('section', class_='ltx_section') + bibliography = article.find('section', class_='ltx_bibliography') + appendices = article.find_all('section', class_='ltx_appendix') + figures = article.find_all('figure', class_='ltx_figure') + figure_captions = [figure.find( + 'figcaption').text for figure in figures] + tables = article.find_all('figure', class_='ltx_table') + table_captions = [table.find( + 'figcaption').text for table in tables] return content, newest_day From 674157d5314e8c5d75bf1a931a9149bcdd2c9884 Mon Sep 17 00:00:00 2001 From: Haofei Yu <1125027232@qq.com> Date: Mon, 22 Jul 2024 10:30:12 -0400 Subject: [PATCH 2/6] add dependency --- poetry.lock | 93 +++++++++++++++++++++++++++++++++++++------------- pyproject.toml | 5 ++- 2 files changed, 73 insertions(+), 25 deletions(-) diff --git a/poetry.lock b/poetry.lock index 46776dcf..35847a18 100644 --- a/poetry.lock +++ b/poetry.lock @@ -1,4 +1,4 @@ -# This file is automatically @generated by Poetry 1.8.3 and should not be changed by hand. +# This file is automatically @generated by Poetry 1.8.2 and should not be changed by hand. [[package]] name = "aiohttp" @@ -235,6 +235,41 @@ doc-rtd = ["autoapi (>=0.9.0)", "pydata-sphinx-theme (<=0.7.2)", "sphinx (>=4.2. test-tox = ["equinox", "mypy (>=0.800)", "numpy", "pandera", "pytest (>=4.0.0)", "sphinx", "typing-extensions (>=3.10.0.0)"] test-tox-coverage = ["coverage (>=5.5)"] +[[package]] +name = "beautifulsoup4" +version = "4.12.3" +description = "Screen-scraping library" +optional = true +python-versions = ">=3.6.0" +files = [ + {file = "beautifulsoup4-4.12.3-py3-none-any.whl", hash = "sha256:b80878c9f40111313e55da8ba20bdba06d8fa3969fc68304167741bbf9e082ed"}, + {file = "beautifulsoup4-4.12.3.tar.gz", hash = "sha256:74e3d1928edc070d21748185c46e3fb33490f22f52a3addee9aee0f4f7781051"}, +] + +[package.dependencies] +soupsieve = ">1.2" + +[package.extras] +cchardet = ["cchardet"] +chardet = ["chardet"] +charset-normalizer = ["charset-normalizer"] +html5lib = ["html5lib"] +lxml = ["lxml"] + +[[package]] +name = "bs4" +version = "0.0.2" +description = "Dummy package for Beautiful Soup (beautifulsoup4)" +optional = true +python-versions = "*" +files = [ + {file = "bs4-0.0.2-py2.py3-none-any.whl", hash = "sha256:abf8742c0805ef7f662dce4b51cca104cffe52b835238afc169142ab9b3fbccc"}, + {file = "bs4-0.0.2.tar.gz", hash = "sha256:a48685c58f50fe127722417bae83fe6badf500d54b55f7e39ffe43b798653925"}, +] + +[package.dependencies] +beautifulsoup4 = "*" + [[package]] name = "certifi" version = "2024.7.4" @@ -848,13 +883,13 @@ socks = ["socksio (==1.*)"] [[package]] name = "huggingface-hub" -version = "0.23.5" +version = "0.24.0" description = "Client library to download and publish models, datasets and other repos on the huggingface.co hub" optional = false python-versions = ">=3.8.0" files = [ - {file = "huggingface_hub-0.23.5-py3-none-any.whl", hash = "sha256:d7a7d337615e11a45cc14a0ce5a605db6b038dc24af42866f731684825226e90"}, - {file = "huggingface_hub-0.23.5.tar.gz", hash = "sha256:67a9caba79b71235be3752852ca27da86bd54311d2424ca8afdb8dda056edf98"}, + {file = "huggingface_hub-0.24.0-py3-none-any.whl", hash = "sha256:7ad92edefb93d8145c061f6df8d99df2ff85f8379ba5fac8a95aca0642afa5d7"}, + {file = "huggingface_hub-0.24.0.tar.gz", hash = "sha256:6c7092736b577d89d57b3cdfea026f1b0dc2234ae783fa0d59caf1bf7d52dfa7"}, ] [package.dependencies] @@ -867,17 +902,17 @@ tqdm = ">=4.42.1" typing-extensions = ">=3.7.4.3" [package.extras] -all = ["InquirerPy (==0.3.4)", "Jinja2", "Pillow", "aiohttp", "fastapi", "gradio", "jedi", "minijinja (>=1.0)", "mypy (==1.5.1)", "numpy", "pytest", "pytest-asyncio", "pytest-cov", "pytest-env", "pytest-rerunfailures", "pytest-vcr", "pytest-xdist", "ruff (>=0.3.0)", "soundfile", "types-PyYAML", "types-requests", "types-simplejson", "types-toml", "types-tqdm", "types-urllib3", "typing-extensions (>=4.8.0)", "urllib3 (<2.0)"] +all = ["InquirerPy (==0.3.4)", "Jinja2", "Pillow", "aiohttp", "fastapi", "gradio", "jedi", "minijinja (>=1.0)", "mypy (==1.5.1)", "numpy", "pytest (>=8.1.1,<8.2.2)", "pytest-asyncio", "pytest-cov", "pytest-env", "pytest-mock", "pytest-rerunfailures", "pytest-vcr", "pytest-xdist", "ruff (>=0.5.0)", "soundfile", "types-PyYAML", "types-requests", "types-simplejson", "types-toml", "types-tqdm", "types-urllib3", "typing-extensions (>=4.8.0)", "urllib3 (<2.0)"] cli = ["InquirerPy (==0.3.4)"] -dev = ["InquirerPy (==0.3.4)", "Jinja2", "Pillow", "aiohttp", "fastapi", "gradio", "jedi", "minijinja (>=1.0)", "mypy (==1.5.1)", "numpy", "pytest", "pytest-asyncio", "pytest-cov", "pytest-env", "pytest-rerunfailures", "pytest-vcr", "pytest-xdist", "ruff (>=0.3.0)", "soundfile", "types-PyYAML", "types-requests", "types-simplejson", "types-toml", "types-tqdm", "types-urllib3", "typing-extensions (>=4.8.0)", "urllib3 (<2.0)"] +dev = ["InquirerPy (==0.3.4)", "Jinja2", "Pillow", "aiohttp", "fastapi", "gradio", "jedi", "minijinja (>=1.0)", "mypy (==1.5.1)", "numpy", "pytest (>=8.1.1,<8.2.2)", "pytest-asyncio", "pytest-cov", "pytest-env", "pytest-mock", "pytest-rerunfailures", "pytest-vcr", "pytest-xdist", "ruff (>=0.5.0)", "soundfile", "types-PyYAML", "types-requests", "types-simplejson", "types-toml", "types-tqdm", "types-urllib3", "typing-extensions (>=4.8.0)", "urllib3 (<2.0)"] fastai = ["fastai (>=2.4)", "fastcore (>=1.3.27)", "toml"] hf-transfer = ["hf-transfer (>=0.1.4)"] inference = ["aiohttp", "minijinja (>=1.0)"] -quality = ["mypy (==1.5.1)", "ruff (>=0.3.0)"] +quality = ["mypy (==1.5.1)", "ruff (>=0.5.0)"] tensorflow = ["graphviz", "pydot", "tensorflow"] tensorflow-testing = ["keras (<3.0)", "tensorflow"] -testing = ["InquirerPy (==0.3.4)", "Jinja2", "Pillow", "aiohttp", "fastapi", "gradio", "jedi", "minijinja (>=1.0)", "numpy", "pytest", "pytest-asyncio", "pytest-cov", "pytest-env", "pytest-rerunfailures", "pytest-vcr", "pytest-xdist", "soundfile", "urllib3 (<2.0)"] -torch = ["safetensors", "torch"] +testing = ["InquirerPy (==0.3.4)", "Jinja2", "Pillow", "aiohttp", "fastapi", "gradio", "jedi", "minijinja (>=1.0)", "numpy", "pytest (>=8.1.1,<8.2.2)", "pytest-asyncio", "pytest-cov", "pytest-env", "pytest-mock", "pytest-rerunfailures", "pytest-vcr", "pytest-xdist", "soundfile", "urllib3 (<2.0)"] +torch = ["safetensors[torch]", "torch"] typing = ["types-PyYAML", "types-requests", "types-simplejson", "types-toml", "types-tqdm", "types-urllib3", "typing-extensions (>=4.8.0)"] [[package]] @@ -1675,7 +1710,6 @@ description = "Nvidia JIT LTO Library" optional = true python-versions = ">=3" files = [ - {file = "nvidia_nvjitlink_cu12-12.5.82-py3-none-manylinux2014_aarch64.whl", hash = "sha256:98103729cc5226e13ca319a10bbf9433bbbd44ef64fe72f45f067cacc14b8d27"}, {file = "nvidia_nvjitlink_cu12-12.5.82-py3-none-manylinux2014_x86_64.whl", hash = "sha256:f9b37bc5c8cf7509665cb6ada5aaa0ce65618f2332b7d3e78e9790511f111212"}, {file = "nvidia_nvjitlink_cu12-12.5.82-py3-none-win_amd64.whl", hash = "sha256:e782564d705ff0bf61ac3e1bf730166da66dd2fe9012f111ede5fc49b64ae697"}, ] @@ -1693,13 +1727,13 @@ files = [ [[package]] name = "openai" -version = "1.35.14" +version = "1.36.1" description = "The official Python library for the openai API" optional = false python-versions = ">=3.7.1" files = [ - {file = "openai-1.35.14-py3-none-any.whl", hash = "sha256:adadf8c176e0b8c47ad782ed45dc20ef46438ee1f02c7103c4155cff79c8f68b"}, - {file = "openai-1.35.14.tar.gz", hash = "sha256:394ba1dfd12ecec1d634c50e512d24ff1858bbc2674ffcce309b822785a058de"}, + {file = "openai-1.36.1-py3-none-any.whl", hash = "sha256:d399b9d476dbbc167aceaac6bc6ed0b2e2bb6c9e189c7f7047f822743ae62e64"}, + {file = "openai-1.36.1.tar.gz", hash = "sha256:41be9e0302e95dba8a9374b885c5cb1cec2202816a70b98736fee25a2cadd1f2"}, ] [package.dependencies] @@ -1931,13 +1965,13 @@ files = [ [[package]] name = "pure-eval" -version = "0.2.2" +version = "0.2.3" description = "Safely evaluate AST nodes without side effects" optional = false python-versions = "*" files = [ - {file = "pure_eval-0.2.2-py3-none-any.whl", hash = "sha256:01eaab343580944bc56080ebe0a674b39ec44a945e6d09ba7db3cb8cec289350"}, - {file = "pure_eval-0.2.2.tar.gz", hash = "sha256:2b45320af6dfaa1750f543d714b6d1c520a1688dec6fd24d339063ce0aaa9ac3"}, + {file = "pure_eval-0.2.3-py3-none-any.whl", hash = "sha256:1db8e35b67b3d218d818ae653e27f06c3aa420901fa7b081ca98cbedc874e0d0"}, + {file = "pure_eval-0.2.3.tar.gz", hash = "sha256:5f4e983f40564c576c7c8635ae88db5956bb2229d7e9237d03b3c0b0190eaf42"}, ] [package.extras] @@ -2781,6 +2815,17 @@ files = [ {file = "sniffio-1.3.1.tar.gz", hash = "sha256:f4324edc670a0f49750a81b895f35c3adb843cca46f0530f79fc1babb23789dc"}, ] +[[package]] +name = "soupsieve" +version = "2.5" +description = "A modern CSS selector implementation for Beautiful Soup." +optional = true +python-versions = ">=3.8" +files = [ + {file = "soupsieve-2.5-py3-none-any.whl", hash = "sha256:eaa337ff55a1579b6549dc679565eac1e3d000563bcb1c8ab0d0fefbc0c2cdc7"}, + {file = "soupsieve-2.5.tar.gz", hash = "sha256:5663d5a7b3bfaeee0bc4372e7fc48f9cff4940b3eec54a6451cc5299f1097690"}, +] + [[package]] name = "stack-data" version = "0.6.3" @@ -2802,13 +2847,13 @@ tests = ["cython", "littleutils", "pygments", "pytest", "typeguard"] [[package]] name = "sympy" -version = "1.13.0" +version = "1.13.1" description = "Computer algebra system (CAS) in Python" optional = true python-versions = ">=3.8" files = [ - {file = "sympy-1.13.0-py3-none-any.whl", hash = "sha256:6b0b32a4673fb91bd3cac3b55406c8e01d53ae22780be467301cc452f6680c92"}, - {file = "sympy-1.13.0.tar.gz", hash = "sha256:3b6af8f4d008b9a1a6a4268b335b984b23835f26d1d60b0526ebc71d48a25f57"}, + {file = "sympy-1.13.1-py3-none-any.whl", hash = "sha256:db36cdc64bf61b9b24578b6f7bab1ecdd2452cf008f34faa33776680c26d66f8"}, + {file = "sympy-1.13.1.tar.gz", hash = "sha256:9cebf7e04ff162015ce31c9c6c9144daa34a93bd082f54fd8f12deca4f47515f"}, ] [package.dependencies] @@ -3242,13 +3287,13 @@ urllib3 = ">=2" [[package]] name = "types-setuptools" -version = "70.3.0.20240710" +version = "71.0.0.20240722" description = "Typing stubs for setuptools" optional = false python-versions = ">=3.8" files = [ - {file = "types-setuptools-70.3.0.20240710.tar.gz", hash = "sha256:842cbf399812d2b65042c9d6ff35113bbf282dee38794779aa1f94e597bafc35"}, - {file = "types_setuptools-70.3.0.20240710-py3-none-any.whl", hash = "sha256:bd0db2a4b9f2c49ac5564be4e0fb3125c4c46b1f73eafdcbceffa5b005cceca4"}, + {file = "types-setuptools-71.0.0.20240722.tar.gz", hash = "sha256:8f1fd5281945ed8f5a896f05dd50bc31917d6e2487ff9508f4bac522d13ad395"}, + {file = "types_setuptools-71.0.0.20240722-py3-none-any.whl", hash = "sha256:04a383bd1a2dcdb6a85397516ce2d7b46617d89f1d758f686d0d9069943d9811"}, ] [[package]] @@ -3451,7 +3496,7 @@ doc = ["furo", "jaraco.packaging (>=9.3)", "jaraco.tidelift (>=1.4)", "rst.linke test = ["big-O", "importlib-resources", "jaraco.functools", "jaraco.itertools", "jaraco.test", "more-itertools", "pytest (>=6,!=8.1.*)", "pytest-checkdocs (>=2.4)", "pytest-cov", "pytest-enabler (>=2.2)", "pytest-ignore-flaky", "pytest-mypy", "pytest-ruff (>=0.2.1)"] [extras] -collector = ["arxiv"] +collector = ["arxiv", "bs4", "requests"] humaneval = ["pandas"] logger = ["termcolor"] retriever = ["torch", "transformers"] @@ -3459,4 +3504,4 @@ retriever = ["torch", "transformers"] [metadata] lock-version = "2.0" python-versions = ">=3.10, <3.12" -content-hash = "aecfdb73f916b65c9f14246962a7eb231cae24e487e304aeda621cddc395e941" +content-hash = "319e7ee84886f0680e1251cc9e4879551774053597f69698711f368e47420c93" diff --git a/pyproject.toml b/pyproject.toml index 9f273a71..0ae0544d 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -17,6 +17,9 @@ transformers = { version = "^4.42.4", optional = true } termcolor = { version = "^2.4.0", optional = true } arxiv = { version = "^2.1.0", optional = true } pandas = { version = "*", optional = true } +requests = { version = "*", optional = true } +bs4 = { version = "*", optional = true } + [tool.poetry.group.dev.dependencies] pre-commit = "*" @@ -35,7 +38,7 @@ mypy = "^1.11.0" [tool.poetry.extras] retriever = ["torch", "transformers"] -collector = ["arxiv"] +collector = ["arxiv", "requests", "bs4"] logger = ["termcolor"] humaneval = ["pandas"] From 88d502e7612a8a1d80a000543f366b7f2b3bbc4f Mon Sep 17 00:00:00 2001 From: chengzr01 Date: Tue, 23 Jul 2024 00:31:15 +0800 Subject: [PATCH 3/6] update full paper fetching (#423) --- research_town/dbs/db_paper.py | 10 ++- research_town/utils/paper_collector.py | 118 ++++++++++++++++++++++--- tests/utils/test_paper_collector.py | 18 +++- 3 files changed, 130 insertions(+), 16 deletions(-) diff --git a/research_town/dbs/db_paper.py b/research_town/dbs/db_paper.py index 82ecd692..a7a60d2e 100644 --- a/research_town/dbs/db_paper.py +++ b/research_town/dbs/db_paper.py @@ -34,14 +34,22 @@ def pull_papers(self, num: int, domain: str) -> None: url=url, domain=domain, timestamp=int(timestamp), + section_contents=section_contents, + bibliography=bibliography, + figure_captions=figure_captions, + table_captions=table_captions, ) - for title, abstract, authors, url, domain, timestamp in zip( + for title, abstract, authors, url, domain, timestamp, section_contents, bibliography, figure_captions, table_captions in zip( paper_data['title'], paper_data['abstract'], paper_data['authors'], paper_data['url'], paper_data['domain'], paper_data['timestamp'], + paper_data['section_contents'], + paper_data['bibliography'], + paper_data['figure_captions'], + paper_data['table_captions'], ) ] diff --git a/research_town/utils/paper_collector.py b/research_town/utils/paper_collector.py index 29478de3..37a4ce94 100644 --- a/research_town/utils/paper_collector.py +++ b/research_town/utils/paper_collector.py @@ -2,7 +2,7 @@ import arxiv import requests -from beartype.typing import Any, Dict, List, Tuple +from beartype.typing import Any, Dict, List, Optional, Tuple from bs4 import BeautifulSoup @@ -15,7 +15,6 @@ def get_daily_papers( ) results = client.results(search) content: Dict[str, Dict[str, Any]] = {} - newest_day = '' for result in results: paper_title = result.title paper_url = result.entry_id @@ -23,19 +22,29 @@ def get_daily_papers( paper_authors = [author.name for author in result.authors] paper_domain = result.primary_category publish_time = result.published.date() - timestamp = int( + paper_timestamp = int( datetime.datetime( publish_time.year, publish_time.month, publish_time.day ).timestamp() ) - newest_day = publish_time + ( + paper_section_contents, + paper_table_captions, + paper_figure_captions, + paper_bibliography, + ) = get_full_content(paper_url) + if publish_time in content: content[publish_time]['title'].append(paper_title) content[publish_time]['abstract'].append(paper_abstract) content[publish_time]['authors'].append(paper_authors) content[publish_time]['url'].append(paper_url) content[publish_time]['domain'].append(paper_domain) - content[publish_time]['timestamp'].append(timestamp) + content[publish_time]['timestamp'].append(paper_timestamp) + content[publish_time]['section_contents'].append(paper_section_contents) + content[publish_time]['table_captions'].append(paper_table_captions) + content[publish_time]['figure_captions'].append(paper_figure_captions) + content[publish_time]['bibliography'].append(paper_bibliography) else: content[publish_time] = {} content[publish_time]['title'] = [paper_title] @@ -43,20 +52,101 @@ def get_daily_papers( content[publish_time]['authors'] = [paper_authors] content[publish_time]['url'] = [paper_url] content[publish_time]['domain'] = [paper_domain] - content[publish_time]['timestamp'] = [timestamp] + content[publish_time]['timestamp'] = [paper_timestamp] + content[publish_time]['section_contents'] = [paper_section_contents] + content[publish_time]['table_captions'] = [paper_table_captions] + content[publish_time]['figure_captions'] = [paper_figure_captions] + content[publish_time]['bibliography'] = [paper_bibliography] + return content, publish_time + - html_url = paper_url.replace('abs', 'html') +def get_full_content( + url: str, +) -> Tuple[ + Optional[Dict[str, str]], + Optional[Dict[str, str]], + Optional[Dict[str, str]], + Optional[Dict[str, str]], +]: + section_contents = None + table_captions = None + figure_captions = None + bibliography = None + html_url = url.replace('abs', 'html') + try: response = requests.get(html_url, timeout=60) if response.status_code == 200: + print(html_url) soup = BeautifulSoup(response.text, 'lxml') article = soup.find('article', class_='ltx_document') + + # section contents sections = article.find_all('section', class_='ltx_section') - bibliography = article.find('section', class_='ltx_bibliography') - appendices = article.find_all('section', class_='ltx_appendix') + sections.extend(article.find_all('section', class_='ltx_appendix')) + if len(sections) > 0: + section_contents = {} + for section in sections: + section_title = section.find('h2', class_='ltx_title').text + section_content = section.text + section_contents[section_title] = section_content + + # bibliography + bibliography_raw = article.find('section', class_='ltx_bibliography') + if bibliography_raw is not None: + bibliography = {} + bibliography_list = bibliography_raw.find_all( + 'li', class_='ltx_bibitem' + ) + for bibliography_item in bibliography_list: + bibliography_tag_raw = bibliography_item.find( + 'span', class_='ltx_tag' + ) + if bibliography_tag_raw: + bibliography_tag = bibliography_tag_raw.text + else: + continue + bibliography_content = bibliography_item.text + bibliography[bibliography_tag] = bibliography_content + + # figure captions figures = article.find_all('figure', class_='ltx_figure') - figure_captions = [figure.find( - 'figcaption').text for figure in figures] + if len(figures) > 0: + figure_captions = {} + figure_index = 0 + for figure in figures: + figure_caption_raw = figure.find_all( + 'figcaption', class_='ltx_caption' + ) + if len(figure_caption_raw) > 0: + figure_caption = figure_caption_raw[-1].text + else: + continue + figure_tag_raw = figure.find_all('span', class_='ltx_tag') + if len(figure_tag_raw) > 0: + figure_tag = figure_tag_raw[-1].text + else: + figure_index += 1 + figure_tag = str(figure_index) + figure_captions[figure_tag] = figure_caption + + # table_captions tables = article.find_all('figure', class_='ltx_table') - table_captions = [table.find( - 'figcaption').text for table in tables] - return content, newest_day + if len(tables) > 0: + table_captions = {} + table_index = 0 + for table in tables: + table_caption_raw = table.find('figcaption', class_='ltx_caption') + if table_caption_raw: + table_caption = table_caption_raw.text + else: + continue + table_tag_raw = table.find('span', class_='ltx_tag') + if table_tag_raw: + table_tag = table_tag_raw.text + else: + table_index += 1 + table_tag = str(table_index) + table_captions[table_tag] = table_caption + except ConnectionError: + pass + return section_contents, table_captions, figure_captions, bibliography diff --git a/tests/utils/test_paper_collector.py b/tests/utils/test_paper_collector.py index f9fdcaef..b4cea8b1 100644 --- a/tests/utils/test_paper_collector.py +++ b/tests/utils/test_paper_collector.py @@ -1,7 +1,7 @@ import datetime from unittest.mock import MagicMock, patch -from research_town.utils.paper_collector import get_daily_papers +from research_town.utils.paper_collector import get_daily_papers, get_full_content def test_get_daily_papers() -> None: @@ -27,3 +27,19 @@ def test_get_daily_papers() -> None: assert len(result) == 2 assert newest_day == datetime.date(2023, 7, 2) # Compare to the date part only + + +def test_get_full_content() -> None: + section_contents, table_captions, figure_captions, bibliography = get_full_content( + 'https://arxiv.org/html/2403.05534v1' + ) + assert section_contents is not None + assert '\n1 Introduction' in section_contents + assert len(section_contents['\n1 Introduction']) > 0 + assert table_captions is None + assert figure_captions is not None + assert 'Figure 1: ' in figure_captions + assert len(figure_captions['Figure 1: ']) > 0 + assert bibliography is not None + assert 'Arora and Huber (2001)' in bibliography + assert len(bibliography['Arora and Huber (2001)']) > 0 From cfd3698c621aec0a38246aac833f76436dbf493a Mon Sep 17 00:00:00 2001 From: chengzr01 Date: Tue, 23 Jul 2024 00:49:42 +0800 Subject: [PATCH 4/6] add dependency & fix errors --- pyproject.toml | 3 ++- research_town/utils/paper_collector.py | 20 +++++++++++++------- 2 files changed, 15 insertions(+), 8 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 0ae0544d..40b47d0a 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -19,6 +19,7 @@ arxiv = { version = "^2.1.0", optional = true } pandas = { version = "*", optional = true } requests = { version = "*", optional = true } bs4 = { version = "*", optional = true } +lxml = { version = "*", optional = true } [tool.poetry.group.dev.dependencies] @@ -38,7 +39,7 @@ mypy = "^1.11.0" [tool.poetry.extras] retriever = ["torch", "transformers"] -collector = ["arxiv", "requests", "bs4"] +collector = ["arxiv", "requests", "bs4", "lxml"] logger = ["termcolor"] humaneval = ["pandas"] diff --git a/research_town/utils/paper_collector.py b/research_town/utils/paper_collector.py index 37a4ce94..dbb9fad9 100644 --- a/research_town/utils/paper_collector.py +++ b/research_town/utils/paper_collector.py @@ -41,9 +41,12 @@ def get_daily_papers( content[publish_time]['url'].append(paper_url) content[publish_time]['domain'].append(paper_domain) content[publish_time]['timestamp'].append(paper_timestamp) - content[publish_time]['section_contents'].append(paper_section_contents) - content[publish_time]['table_captions'].append(paper_table_captions) - content[publish_time]['figure_captions'].append(paper_figure_captions) + content[publish_time]['section_contents'].append( + paper_section_contents) + content[publish_time]['table_captions'].append( + paper_table_captions) + content[publish_time]['figure_captions'].append( + paper_figure_captions) content[publish_time]['bibliography'].append(paper_bibliography) else: content[publish_time] = {} @@ -53,7 +56,8 @@ def get_daily_papers( content[publish_time]['url'] = [paper_url] content[publish_time]['domain'] = [paper_domain] content[publish_time]['timestamp'] = [paper_timestamp] - content[publish_time]['section_contents'] = [paper_section_contents] + content[publish_time]['section_contents'] = [ + paper_section_contents] content[publish_time]['table_captions'] = [paper_table_captions] content[publish_time]['figure_captions'] = [paper_figure_captions] content[publish_time]['bibliography'] = [paper_bibliography] @@ -91,7 +95,8 @@ def get_full_content( section_contents[section_title] = section_content # bibliography - bibliography_raw = article.find('section', class_='ltx_bibliography') + bibliography_raw = article.find( + 'section', class_='ltx_bibliography') if bibliography_raw is not None: bibliography = {} bibliography_list = bibliography_raw.find_all( @@ -135,7 +140,8 @@ def get_full_content( table_captions = {} table_index = 0 for table in tables: - table_caption_raw = table.find('figcaption', class_='ltx_caption') + table_caption_raw = table.find( + 'figcaption', class_='ltx_caption') if table_caption_raw: table_caption = table_caption_raw.text else: @@ -147,6 +153,6 @@ def get_full_content( table_index += 1 table_tag = str(table_index) table_captions[table_tag] = table_caption - except ConnectionError: + except requests.exceptions.RequestException: pass return section_contents, table_captions, figure_captions, bibliography From e144a32da474bc20b85073dab67d92797c4ac620 Mon Sep 17 00:00:00 2001 From: chengzr01 Date: Tue, 23 Jul 2024 00:55:17 +0800 Subject: [PATCH 5/6] update dependency --- poetry.lock | 171 ++++++++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 165 insertions(+), 6 deletions(-) diff --git a/poetry.lock b/poetry.lock index 35847a18..d668e4bd 100644 --- a/poetry.lock +++ b/poetry.lock @@ -1,4 +1,4 @@ -# This file is automatically @generated by Poetry 1.8.2 and should not be changed by hand. +# This file is automatically @generated by Poetry 1.8.3 and should not be changed by hand. [[package]] name = "aiohttp" @@ -1182,6 +1182,164 @@ tokenizers = "*" extra-proxy = ["azure-identity (>=1.15.0,<2.0.0)", "azure-keyvault-secrets (>=4.8.0,<5.0.0)", "google-cloud-kms (>=2.21.3,<3.0.0)", "prisma (==0.11.0)", "pynacl (>=1.5.0,<2.0.0)", "resend (>=0.8.0,<0.9.0)"] proxy = ["PyJWT (>=2.8.0,<3.0.0)", "apscheduler (>=3.10.4,<4.0.0)", "backoff", "cryptography (>=42.0.5,<43.0.0)", "fastapi (>=0.111.0,<0.112.0)", "fastapi-sso (>=0.10.0,<0.11.0)", "gunicorn (>=22.0.0,<23.0.0)", "orjson (>=3.9.7,<4.0.0)", "python-multipart (>=0.0.9,<0.0.10)", "pyyaml (>=6.0.1,<7.0.0)", "rq", "uvicorn (>=0.22.0,<0.23.0)"] +[[package]] +name = "lxml" +version = "5.2.2" +description = "Powerful and Pythonic XML processing library combining libxml2/libxslt with the ElementTree API." +optional = true +python-versions = ">=3.6" +files = [ + {file = "lxml-5.2.2-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:364d03207f3e603922d0d3932ef363d55bbf48e3647395765f9bfcbdf6d23632"}, + {file = "lxml-5.2.2-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:50127c186f191b8917ea2fb8b206fbebe87fd414a6084d15568c27d0a21d60db"}, + {file = "lxml-5.2.2-cp310-cp310-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:74e4f025ef3db1c6da4460dd27c118d8cd136d0391da4e387a15e48e5c975147"}, + {file = "lxml-5.2.2-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:981a06a3076997adf7c743dcd0d7a0415582661e2517c7d961493572e909aa1d"}, + {file = "lxml-5.2.2-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:aef5474d913d3b05e613906ba4090433c515e13ea49c837aca18bde190853dff"}, + {file = "lxml-5.2.2-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:1e275ea572389e41e8b039ac076a46cb87ee6b8542df3fff26f5baab43713bca"}, + {file = "lxml-5.2.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f5b65529bb2f21ac7861a0e94fdbf5dc0daab41497d18223b46ee8515e5ad297"}, + {file = "lxml-5.2.2-cp310-cp310-manylinux_2_28_aarch64.whl", hash = "sha256:bcc98f911f10278d1daf14b87d65325851a1d29153caaf146877ec37031d5f36"}, + {file = "lxml-5.2.2-cp310-cp310-manylinux_2_28_ppc64le.whl", hash = "sha256:b47633251727c8fe279f34025844b3b3a3e40cd1b198356d003aa146258d13a2"}, + {file = "lxml-5.2.2-cp310-cp310-manylinux_2_28_s390x.whl", hash = "sha256:fbc9d316552f9ef7bba39f4edfad4a734d3d6f93341232a9dddadec4f15d425f"}, + {file = "lxml-5.2.2-cp310-cp310-manylinux_2_28_x86_64.whl", hash = "sha256:13e69be35391ce72712184f69000cda04fc89689429179bc4c0ae5f0b7a8c21b"}, + {file = "lxml-5.2.2-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:3b6a30a9ab040b3f545b697cb3adbf3696c05a3a68aad172e3fd7ca73ab3c835"}, + {file = "lxml-5.2.2-cp310-cp310-musllinux_1_1_ppc64le.whl", hash = "sha256:a233bb68625a85126ac9f1fc66d24337d6e8a0f9207b688eec2e7c880f012ec0"}, + {file = "lxml-5.2.2-cp310-cp310-musllinux_1_1_s390x.whl", hash = "sha256:dfa7c241073d8f2b8e8dbc7803c434f57dbb83ae2a3d7892dd068d99e96efe2c"}, + {file = "lxml-5.2.2-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:1a7aca7964ac4bb07680d5c9d63b9d7028cace3e2d43175cb50bba8c5ad33316"}, + {file = "lxml-5.2.2-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:ae4073a60ab98529ab8a72ebf429f2a8cc612619a8c04e08bed27450d52103c0"}, + {file = "lxml-5.2.2-cp310-cp310-musllinux_1_2_ppc64le.whl", hash = "sha256:ffb2be176fed4457e445fe540617f0252a72a8bc56208fd65a690fdb1f57660b"}, + {file = "lxml-5.2.2-cp310-cp310-musllinux_1_2_s390x.whl", hash = "sha256:e290d79a4107d7d794634ce3e985b9ae4f920380a813717adf61804904dc4393"}, + {file = "lxml-5.2.2-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:96e85aa09274955bb6bd483eaf5b12abadade01010478154b0ec70284c1b1526"}, + {file = "lxml-5.2.2-cp310-cp310-win32.whl", hash = "sha256:f956196ef61369f1685d14dad80611488d8dc1ef00be57c0c5a03064005b0f30"}, + {file = "lxml-5.2.2-cp310-cp310-win_amd64.whl", hash = "sha256:875a3f90d7eb5c5d77e529080d95140eacb3c6d13ad5b616ee8095447b1d22e7"}, + {file = "lxml-5.2.2-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:45f9494613160d0405682f9eee781c7e6d1bf45f819654eb249f8f46a2c22545"}, + {file = "lxml-5.2.2-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:b0b3f2df149efb242cee2ffdeb6674b7f30d23c9a7af26595099afaf46ef4e88"}, + {file = "lxml-5.2.2-cp311-cp311-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:d28cb356f119a437cc58a13f8135ab8a4c8ece18159eb9194b0d269ec4e28083"}, + {file = "lxml-5.2.2-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:657a972f46bbefdbba2d4f14413c0d079f9ae243bd68193cb5061b9732fa54c1"}, + {file = "lxml-5.2.2-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:b74b9ea10063efb77a965a8d5f4182806fbf59ed068b3c3fd6f30d2ac7bee734"}, + {file = "lxml-5.2.2-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:07542787f86112d46d07d4f3c4e7c760282011b354d012dc4141cc12a68cef5f"}, + {file = "lxml-5.2.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:303f540ad2dddd35b92415b74b900c749ec2010e703ab3bfd6660979d01fd4ed"}, + {file = "lxml-5.2.2-cp311-cp311-manylinux_2_28_aarch64.whl", hash = "sha256:2eb2227ce1ff998faf0cd7fe85bbf086aa41dfc5af3b1d80867ecfe75fb68df3"}, + {file = "lxml-5.2.2-cp311-cp311-manylinux_2_28_ppc64le.whl", hash = "sha256:1d8a701774dfc42a2f0b8ccdfe7dbc140500d1049e0632a611985d943fcf12df"}, + {file = "lxml-5.2.2-cp311-cp311-manylinux_2_28_s390x.whl", hash = "sha256:56793b7a1a091a7c286b5f4aa1fe4ae5d1446fe742d00cdf2ffb1077865db10d"}, + {file = "lxml-5.2.2-cp311-cp311-manylinux_2_28_x86_64.whl", hash = "sha256:eb00b549b13bd6d884c863554566095bf6fa9c3cecb2e7b399c4bc7904cb33b5"}, + {file = "lxml-5.2.2-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:1a2569a1f15ae6c8c64108a2cd2b4a858fc1e13d25846be0666fc144715e32ab"}, + {file = "lxml-5.2.2-cp311-cp311-musllinux_1_1_ppc64le.whl", hash = "sha256:8cf85a6e40ff1f37fe0f25719aadf443686b1ac7652593dc53c7ef9b8492b115"}, + {file = "lxml-5.2.2-cp311-cp311-musllinux_1_1_s390x.whl", hash = "sha256:d237ba6664b8e60fd90b8549a149a74fcc675272e0e95539a00522e4ca688b04"}, + {file = "lxml-5.2.2-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:0b3f5016e00ae7630a4b83d0868fca1e3d494c78a75b1c7252606a3a1c5fc2ad"}, + {file = "lxml-5.2.2-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:23441e2b5339bc54dc949e9e675fa35efe858108404ef9aa92f0456929ef6fe8"}, + {file = "lxml-5.2.2-cp311-cp311-musllinux_1_2_ppc64le.whl", hash = "sha256:2fb0ba3e8566548d6c8e7dd82a8229ff47bd8fb8c2da237607ac8e5a1b8312e5"}, + {file = "lxml-5.2.2-cp311-cp311-musllinux_1_2_s390x.whl", hash = "sha256:79d1fb9252e7e2cfe4de6e9a6610c7cbb99b9708e2c3e29057f487de5a9eaefa"}, + {file = "lxml-5.2.2-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:6dcc3d17eac1df7859ae01202e9bb11ffa8c98949dcbeb1069c8b9a75917e01b"}, + {file = "lxml-5.2.2-cp311-cp311-win32.whl", hash = "sha256:4c30a2f83677876465f44c018830f608fa3c6a8a466eb223535035fbc16f3438"}, + {file = "lxml-5.2.2-cp311-cp311-win_amd64.whl", hash = "sha256:49095a38eb333aaf44c06052fd2ec3b8f23e19747ca7ec6f6c954ffea6dbf7be"}, + {file = "lxml-5.2.2-cp312-cp312-macosx_10_9_universal2.whl", hash = "sha256:7429e7faa1a60cad26ae4227f4dd0459efde239e494c7312624ce228e04f6391"}, + {file = "lxml-5.2.2-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:50ccb5d355961c0f12f6cf24b7187dbabd5433f29e15147a67995474f27d1776"}, + {file = "lxml-5.2.2-cp312-cp312-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:dc911208b18842a3a57266d8e51fc3cfaccee90a5351b92079beed912a7914c2"}, + {file = "lxml-5.2.2-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:33ce9e786753743159799fdf8e92a5da351158c4bfb6f2db0bf31e7892a1feb5"}, + {file = "lxml-5.2.2-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:ec87c44f619380878bd49ca109669c9f221d9ae6883a5bcb3616785fa8f94c97"}, + {file = "lxml-5.2.2-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:08ea0f606808354eb8f2dfaac095963cb25d9d28e27edcc375d7b30ab01abbf6"}, + {file = "lxml-5.2.2-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:75a9632f1d4f698b2e6e2e1ada40e71f369b15d69baddb8968dcc8e683839b18"}, + {file = "lxml-5.2.2-cp312-cp312-manylinux_2_28_aarch64.whl", hash = "sha256:74da9f97daec6928567b48c90ea2c82a106b2d500f397eeb8941e47d30b1ca85"}, + {file = "lxml-5.2.2-cp312-cp312-manylinux_2_28_ppc64le.whl", hash = "sha256:0969e92af09c5687d769731e3f39ed62427cc72176cebb54b7a9d52cc4fa3b73"}, + {file = "lxml-5.2.2-cp312-cp312-manylinux_2_28_s390x.whl", hash = "sha256:9164361769b6ca7769079f4d426a41df6164879f7f3568be9086e15baca61466"}, + {file = "lxml-5.2.2-cp312-cp312-manylinux_2_28_x86_64.whl", hash = "sha256:d26a618ae1766279f2660aca0081b2220aca6bd1aa06b2cf73f07383faf48927"}, + {file = "lxml-5.2.2-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:ab67ed772c584b7ef2379797bf14b82df9aa5f7438c5b9a09624dd834c1c1aaf"}, + {file = "lxml-5.2.2-cp312-cp312-musllinux_1_1_ppc64le.whl", hash = "sha256:3d1e35572a56941b32c239774d7e9ad724074d37f90c7a7d499ab98761bd80cf"}, + {file = "lxml-5.2.2-cp312-cp312-musllinux_1_1_s390x.whl", hash = "sha256:8268cbcd48c5375f46e000adb1390572c98879eb4f77910c6053d25cc3ac2c67"}, + {file = "lxml-5.2.2-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:e282aedd63c639c07c3857097fc0e236f984ceb4089a8b284da1c526491e3f3d"}, + {file = "lxml-5.2.2-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:6dfdc2bfe69e9adf0df4915949c22a25b39d175d599bf98e7ddf620a13678585"}, + {file = "lxml-5.2.2-cp312-cp312-musllinux_1_2_ppc64le.whl", hash = "sha256:4aefd911793b5d2d7a921233a54c90329bf3d4a6817dc465f12ffdfe4fc7b8fe"}, + {file = "lxml-5.2.2-cp312-cp312-musllinux_1_2_s390x.whl", hash = "sha256:8b8df03a9e995b6211dafa63b32f9d405881518ff1ddd775db4e7b98fb545e1c"}, + {file = "lxml-5.2.2-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:f11ae142f3a322d44513de1018b50f474f8f736bc3cd91d969f464b5bfef8836"}, + {file = "lxml-5.2.2-cp312-cp312-win32.whl", hash = "sha256:16a8326e51fcdffc886294c1e70b11ddccec836516a343f9ed0f82aac043c24a"}, + {file = "lxml-5.2.2-cp312-cp312-win_amd64.whl", hash = "sha256:bbc4b80af581e18568ff07f6395c02114d05f4865c2812a1f02f2eaecf0bfd48"}, + {file = "lxml-5.2.2-cp36-cp36m-macosx_10_9_x86_64.whl", hash = "sha256:e3d9d13603410b72787579769469af730c38f2f25505573a5888a94b62b920f8"}, + {file = "lxml-5.2.2-cp36-cp36m-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:38b67afb0a06b8575948641c1d6d68e41b83a3abeae2ca9eed2ac59892b36706"}, + {file = "lxml-5.2.2-cp36-cp36m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:c689d0d5381f56de7bd6966a4541bff6e08bf8d3871bbd89a0c6ab18aa699573"}, + {file = "lxml-5.2.2-cp36-cp36m-manylinux_2_28_x86_64.whl", hash = "sha256:cf2a978c795b54c539f47964ec05e35c05bd045db5ca1e8366988c7f2fe6b3ce"}, + {file = "lxml-5.2.2-cp36-cp36m-manylinux_2_5_x86_64.manylinux1_x86_64.whl", hash = "sha256:739e36ef7412b2bd940f75b278749106e6d025e40027c0b94a17ef7968d55d56"}, + {file = "lxml-5.2.2-cp36-cp36m-musllinux_1_1_x86_64.whl", hash = "sha256:d8bbcd21769594dbba9c37d3c819e2d5847656ca99c747ddb31ac1701d0c0ed9"}, + {file = "lxml-5.2.2-cp36-cp36m-musllinux_1_2_x86_64.whl", hash = "sha256:2304d3c93f2258ccf2cf7a6ba8c761d76ef84948d87bf9664e14d203da2cd264"}, + {file = "lxml-5.2.2-cp36-cp36m-win32.whl", hash = "sha256:02437fb7308386867c8b7b0e5bc4cd4b04548b1c5d089ffb8e7b31009b961dc3"}, + {file = "lxml-5.2.2-cp36-cp36m-win_amd64.whl", hash = "sha256:edcfa83e03370032a489430215c1e7783128808fd3e2e0a3225deee278585196"}, + {file = "lxml-5.2.2-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:28bf95177400066596cdbcfc933312493799382879da504633d16cf60bba735b"}, + {file = "lxml-5.2.2-cp37-cp37m-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:3a745cc98d504d5bd2c19b10c79c61c7c3df9222629f1b6210c0368177589fb8"}, + {file = "lxml-5.2.2-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:1b590b39ef90c6b22ec0be925b211298e810b4856909c8ca60d27ffbca6c12e6"}, + {file = "lxml-5.2.2-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:b336b0416828022bfd5a2e3083e7f5ba54b96242159f83c7e3eebaec752f1716"}, + {file = "lxml-5.2.2-cp37-cp37m-manylinux_2_28_aarch64.whl", hash = "sha256:c2faf60c583af0d135e853c86ac2735ce178f0e338a3c7f9ae8f622fd2eb788c"}, + {file = "lxml-5.2.2-cp37-cp37m-manylinux_2_28_x86_64.whl", hash = "sha256:4bc6cb140a7a0ad1f7bc37e018d0ed690b7b6520ade518285dc3171f7a117905"}, + {file = "lxml-5.2.2-cp37-cp37m-musllinux_1_1_aarch64.whl", hash = "sha256:7ff762670cada8e05b32bf1e4dc50b140790909caa8303cfddc4d702b71ea184"}, + {file = "lxml-5.2.2-cp37-cp37m-musllinux_1_1_x86_64.whl", hash = "sha256:57f0a0bbc9868e10ebe874e9f129d2917750adf008fe7b9c1598c0fbbfdde6a6"}, + {file = "lxml-5.2.2-cp37-cp37m-musllinux_1_2_aarch64.whl", hash = "sha256:a6d2092797b388342c1bc932077ad232f914351932353e2e8706851c870bca1f"}, + {file = "lxml-5.2.2-cp37-cp37m-musllinux_1_2_x86_64.whl", hash = "sha256:60499fe961b21264e17a471ec296dcbf4365fbea611bf9e303ab69db7159ce61"}, + {file = "lxml-5.2.2-cp37-cp37m-win32.whl", hash = "sha256:d9b342c76003c6b9336a80efcc766748a333573abf9350f4094ee46b006ec18f"}, + {file = "lxml-5.2.2-cp37-cp37m-win_amd64.whl", hash = "sha256:b16db2770517b8799c79aa80f4053cd6f8b716f21f8aca962725a9565ce3ee40"}, + {file = "lxml-5.2.2-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:7ed07b3062b055d7a7f9d6557a251cc655eed0b3152b76de619516621c56f5d3"}, + {file = "lxml-5.2.2-cp38-cp38-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:f60fdd125d85bf9c279ffb8e94c78c51b3b6a37711464e1f5f31078b45002421"}, + {file = "lxml-5.2.2-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:8a7e24cb69ee5f32e003f50e016d5fde438010c1022c96738b04fc2423e61706"}, + {file = "lxml-5.2.2-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:23cfafd56887eaed93d07bc4547abd5e09d837a002b791e9767765492a75883f"}, + {file = "lxml-5.2.2-cp38-cp38-manylinux_2_28_aarch64.whl", hash = "sha256:19b4e485cd07b7d83e3fe3b72132e7df70bfac22b14fe4bf7a23822c3a35bff5"}, + {file = "lxml-5.2.2-cp38-cp38-manylinux_2_28_x86_64.whl", hash = "sha256:7ce7ad8abebe737ad6143d9d3bf94b88b93365ea30a5b81f6877ec9c0dee0a48"}, + {file = "lxml-5.2.2-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:e49b052b768bb74f58c7dda4e0bdf7b79d43a9204ca584ffe1fb48a6f3c84c66"}, + {file = "lxml-5.2.2-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:d14a0d029a4e176795cef99c056d58067c06195e0c7e2dbb293bf95c08f772a3"}, + {file = "lxml-5.2.2-cp38-cp38-musllinux_1_2_aarch64.whl", hash = "sha256:be49ad33819d7dcc28a309b86d4ed98e1a65f3075c6acd3cd4fe32103235222b"}, + {file = "lxml-5.2.2-cp38-cp38-musllinux_1_2_x86_64.whl", hash = "sha256:a6d17e0370d2516d5bb9062c7b4cb731cff921fc875644c3d751ad857ba9c5b1"}, + {file = "lxml-5.2.2-cp38-cp38-win32.whl", hash = "sha256:5b8c041b6265e08eac8a724b74b655404070b636a8dd6d7a13c3adc07882ef30"}, + {file = "lxml-5.2.2-cp38-cp38-win_amd64.whl", hash = "sha256:f61efaf4bed1cc0860e567d2ecb2363974d414f7f1f124b1df368bbf183453a6"}, + {file = "lxml-5.2.2-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:fb91819461b1b56d06fa4bcf86617fac795f6a99d12239fb0c68dbeba41a0a30"}, + {file = "lxml-5.2.2-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:d4ed0c7cbecde7194cd3228c044e86bf73e30a23505af852857c09c24e77ec5d"}, + {file = "lxml-5.2.2-cp39-cp39-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:54401c77a63cc7d6dc4b4e173bb484f28a5607f3df71484709fe037c92d4f0ed"}, + {file = "lxml-5.2.2-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:625e3ef310e7fa3a761d48ca7ea1f9d8718a32b1542e727d584d82f4453d5eeb"}, + {file = "lxml-5.2.2-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:519895c99c815a1a24a926d5b60627ce5ea48e9f639a5cd328bda0515ea0f10c"}, + {file = "lxml-5.2.2-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:c7079d5eb1c1315a858bbf180000757db8ad904a89476653232db835c3114001"}, + {file = "lxml-5.2.2-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:343ab62e9ca78094f2306aefed67dcfad61c4683f87eee48ff2fd74902447726"}, + {file = "lxml-5.2.2-cp39-cp39-manylinux_2_28_aarch64.whl", hash = "sha256:cd9e78285da6c9ba2d5c769628f43ef66d96ac3085e59b10ad4f3707980710d3"}, + {file = "lxml-5.2.2-cp39-cp39-manylinux_2_28_ppc64le.whl", hash = "sha256:546cf886f6242dff9ec206331209db9c8e1643ae642dea5fdbecae2453cb50fd"}, + {file = "lxml-5.2.2-cp39-cp39-manylinux_2_28_s390x.whl", hash = "sha256:02f6a8eb6512fdc2fd4ca10a49c341c4e109aa6e9448cc4859af5b949622715a"}, + {file = "lxml-5.2.2-cp39-cp39-manylinux_2_28_x86_64.whl", hash = "sha256:339ee4a4704bc724757cd5dd9dc8cf4d00980f5d3e6e06d5847c1b594ace68ab"}, + {file = "lxml-5.2.2-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:0a028b61a2e357ace98b1615fc03f76eb517cc028993964fe08ad514b1e8892d"}, + {file = "lxml-5.2.2-cp39-cp39-musllinux_1_1_ppc64le.whl", hash = "sha256:f90e552ecbad426eab352e7b2933091f2be77115bb16f09f78404861c8322981"}, + {file = "lxml-5.2.2-cp39-cp39-musllinux_1_1_s390x.whl", hash = "sha256:d83e2d94b69bf31ead2fa45f0acdef0757fa0458a129734f59f67f3d2eb7ef32"}, + {file = "lxml-5.2.2-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:a02d3c48f9bb1e10c7788d92c0c7db6f2002d024ab6e74d6f45ae33e3d0288a3"}, + {file = "lxml-5.2.2-cp39-cp39-musllinux_1_2_aarch64.whl", hash = "sha256:6d68ce8e7b2075390e8ac1e1d3a99e8b6372c694bbe612632606d1d546794207"}, + {file = "lxml-5.2.2-cp39-cp39-musllinux_1_2_ppc64le.whl", hash = "sha256:453d037e09a5176d92ec0fd282e934ed26d806331a8b70ab431a81e2fbabf56d"}, + {file = "lxml-5.2.2-cp39-cp39-musllinux_1_2_s390x.whl", hash = "sha256:3b019d4ee84b683342af793b56bb35034bd749e4cbdd3d33f7d1107790f8c472"}, + {file = "lxml-5.2.2-cp39-cp39-musllinux_1_2_x86_64.whl", hash = "sha256:cb3942960f0beb9f46e2a71a3aca220d1ca32feb5a398656be934320804c0df9"}, + {file = "lxml-5.2.2-cp39-cp39-win32.whl", hash = "sha256:ac6540c9fff6e3813d29d0403ee7a81897f1d8ecc09a8ff84d2eea70ede1cdbf"}, + {file = "lxml-5.2.2-cp39-cp39-win_amd64.whl", hash = "sha256:610b5c77428a50269f38a534057444c249976433f40f53e3b47e68349cca1425"}, + {file = "lxml-5.2.2-pp310-pypy310_pp73-macosx_10_9_x86_64.whl", hash = "sha256:b537bd04d7ccd7c6350cdaaaad911f6312cbd61e6e6045542f781c7f8b2e99d2"}, + {file = "lxml-5.2.2-pp310-pypy310_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:4820c02195d6dfb7b8508ff276752f6b2ff8b64ae5d13ebe02e7667e035000b9"}, + {file = "lxml-5.2.2-pp310-pypy310_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f2a09f6184f17a80897172863a655467da2b11151ec98ba8d7af89f17bf63dae"}, + {file = "lxml-5.2.2-pp310-pypy310_pp73-manylinux_2_28_aarch64.whl", hash = "sha256:76acba4c66c47d27c8365e7c10b3d8016a7da83d3191d053a58382311a8bf4e1"}, + {file = "lxml-5.2.2-pp310-pypy310_pp73-manylinux_2_28_x86_64.whl", hash = "sha256:b128092c927eaf485928cec0c28f6b8bead277e28acf56800e972aa2c2abd7a2"}, + {file = "lxml-5.2.2-pp310-pypy310_pp73-win_amd64.whl", hash = "sha256:ae791f6bd43305aade8c0e22f816b34f3b72b6c820477aab4d18473a37e8090b"}, + {file = "lxml-5.2.2-pp37-pypy37_pp73-macosx_10_9_x86_64.whl", hash = "sha256:a2f6a1bc2460e643785a2cde17293bd7a8f990884b822f7bca47bee0a82fc66b"}, + {file = "lxml-5.2.2-pp37-pypy37_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:8e8d351ff44c1638cb6e980623d517abd9f580d2e53bfcd18d8941c052a5a009"}, + {file = "lxml-5.2.2-pp37-pypy37_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:bec4bd9133420c5c52d562469c754f27c5c9e36ee06abc169612c959bd7dbb07"}, + {file = "lxml-5.2.2-pp37-pypy37_pp73-manylinux_2_28_aarch64.whl", hash = "sha256:55ce6b6d803890bd3cc89975fca9de1dff39729b43b73cb15ddd933b8bc20484"}, + {file = "lxml-5.2.2-pp37-pypy37_pp73-manylinux_2_28_x86_64.whl", hash = "sha256:8ab6a358d1286498d80fe67bd3d69fcbc7d1359b45b41e74c4a26964ca99c3f8"}, + {file = "lxml-5.2.2-pp37-pypy37_pp73-win_amd64.whl", hash = "sha256:06668e39e1f3c065349c51ac27ae430719d7806c026fec462e5693b08b95696b"}, + {file = "lxml-5.2.2-pp38-pypy38_pp73-macosx_10_9_x86_64.whl", hash = "sha256:9cd5323344d8ebb9fb5e96da5de5ad4ebab993bbf51674259dbe9d7a18049525"}, + {file = "lxml-5.2.2-pp38-pypy38_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:89feb82ca055af0fe797a2323ec9043b26bc371365847dbe83c7fd2e2f181c34"}, + {file = "lxml-5.2.2-pp38-pypy38_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:e481bba1e11ba585fb06db666bfc23dbe181dbafc7b25776156120bf12e0d5a6"}, + {file = "lxml-5.2.2-pp38-pypy38_pp73-manylinux_2_28_aarch64.whl", hash = "sha256:9d6c6ea6a11ca0ff9cd0390b885984ed31157c168565702959c25e2191674a14"}, + {file = "lxml-5.2.2-pp38-pypy38_pp73-manylinux_2_28_x86_64.whl", hash = "sha256:3d98de734abee23e61f6b8c2e08a88453ada7d6486dc7cdc82922a03968928db"}, + {file = "lxml-5.2.2-pp38-pypy38_pp73-win_amd64.whl", hash = "sha256:69ab77a1373f1e7563e0fb5a29a8440367dec051da6c7405333699d07444f511"}, + {file = "lxml-5.2.2-pp39-pypy39_pp73-macosx_10_9_x86_64.whl", hash = "sha256:34e17913c431f5ae01d8658dbf792fdc457073dcdfbb31dc0cc6ab256e664a8d"}, + {file = "lxml-5.2.2-pp39-pypy39_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:05f8757b03208c3f50097761be2dea0aba02e94f0dc7023ed73a7bb14ff11eb0"}, + {file = "lxml-5.2.2-pp39-pypy39_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6a520b4f9974b0a0a6ed73c2154de57cdfd0c8800f4f15ab2b73238ffed0b36e"}, + {file = "lxml-5.2.2-pp39-pypy39_pp73-manylinux_2_28_aarch64.whl", hash = "sha256:5e097646944b66207023bc3c634827de858aebc226d5d4d6d16f0b77566ea182"}, + {file = "lxml-5.2.2-pp39-pypy39_pp73-manylinux_2_28_x86_64.whl", hash = "sha256:b5e4ef22ff25bfd4ede5f8fb30f7b24446345f3e79d9b7455aef2836437bc38a"}, + {file = "lxml-5.2.2-pp39-pypy39_pp73-win_amd64.whl", hash = "sha256:ff69a9a0b4b17d78170c73abe2ab12084bdf1691550c5629ad1fe7849433f324"}, + {file = "lxml-5.2.2.tar.gz", hash = "sha256:bb2dc4898180bea79863d5487e5f9c7c34297414bad54bcd0f0852aee9cfdb87"}, +] + +[package.extras] +cssselect = ["cssselect (>=0.7)"] +html-clean = ["lxml-html-clean"] +html5 = ["html5lib"] +htmlsoup = ["BeautifulSoup4"] +source = ["Cython (>=3.0.10)"] + [[package]] name = "markupsafe" version = "2.1.5" @@ -1710,6 +1868,7 @@ description = "Nvidia JIT LTO Library" optional = true python-versions = ">=3" files = [ + {file = "nvidia_nvjitlink_cu12-12.5.82-py3-none-manylinux2014_aarch64.whl", hash = "sha256:98103729cc5226e13ca319a10bbf9433bbbd44ef64fe72f45f067cacc14b8d27"}, {file = "nvidia_nvjitlink_cu12-12.5.82-py3-none-manylinux2014_x86_64.whl", hash = "sha256:f9b37bc5c8cf7509665cb6ada5aaa0ce65618f2332b7d3e78e9790511f111212"}, {file = "nvidia_nvjitlink_cu12-12.5.82-py3-none-win_amd64.whl", hash = "sha256:e782564d705ff0bf61ac3e1bf730166da66dd2fe9012f111ede5fc49b64ae697"}, ] @@ -1727,13 +1886,13 @@ files = [ [[package]] name = "openai" -version = "1.36.1" +version = "1.37.0" description = "The official Python library for the openai API" optional = false python-versions = ">=3.7.1" files = [ - {file = "openai-1.36.1-py3-none-any.whl", hash = "sha256:d399b9d476dbbc167aceaac6bc6ed0b2e2bb6c9e189c7f7047f822743ae62e64"}, - {file = "openai-1.36.1.tar.gz", hash = "sha256:41be9e0302e95dba8a9374b885c5cb1cec2202816a70b98736fee25a2cadd1f2"}, + {file = "openai-1.37.0-py3-none-any.whl", hash = "sha256:a903245c0ecf622f2830024acdaa78683c70abb8e9d37a497b851670864c9f73"}, + {file = "openai-1.37.0.tar.gz", hash = "sha256:dc8197fc40ab9d431777b6620d962cc49f4544ffc3011f03ce0a805e6eb54adb"}, ] [package.dependencies] @@ -3496,7 +3655,7 @@ doc = ["furo", "jaraco.packaging (>=9.3)", "jaraco.tidelift (>=1.4)", "rst.linke test = ["big-O", "importlib-resources", "jaraco.functools", "jaraco.itertools", "jaraco.test", "more-itertools", "pytest (>=6,!=8.1.*)", "pytest-checkdocs (>=2.4)", "pytest-cov", "pytest-enabler (>=2.2)", "pytest-ignore-flaky", "pytest-mypy", "pytest-ruff (>=0.2.1)"] [extras] -collector = ["arxiv", "bs4", "requests"] +collector = ["arxiv", "bs4", "lxml", "requests"] humaneval = ["pandas"] logger = ["termcolor"] retriever = ["torch", "transformers"] @@ -3504,4 +3663,4 @@ retriever = ["torch", "transformers"] [metadata] lock-version = "2.0" python-versions = ">=3.10, <3.12" -content-hash = "319e7ee84886f0680e1251cc9e4879551774053597f69698711f368e47420c93" +content-hash = "e7ce61e78f3bf58e5b43af8f14d2d8ef37d76631b12dfd608e02599033213887" From f0eccb3790e53586640c415a1f030075d7c60a07 Mon Sep 17 00:00:00 2001 From: Haofei Yu <1125027232@qq.com> Date: Mon, 22 Jul 2024 13:16:12 -0400 Subject: [PATCH 6/6] update function name --- research_town/utils/paper_collector.py | 22 ++++++++-------------- tests/utils/test_paper_collector.py | 6 +++--- 2 files changed, 11 insertions(+), 17 deletions(-) diff --git a/research_town/utils/paper_collector.py b/research_town/utils/paper_collector.py index dbb9fad9..49fd5942 100644 --- a/research_town/utils/paper_collector.py +++ b/research_town/utils/paper_collector.py @@ -32,7 +32,7 @@ def get_daily_papers( paper_table_captions, paper_figure_captions, paper_bibliography, - ) = get_full_content(paper_url) + ) = get_paper_content(paper_url) if publish_time in content: content[publish_time]['title'].append(paper_title) @@ -41,12 +41,9 @@ def get_daily_papers( content[publish_time]['url'].append(paper_url) content[publish_time]['domain'].append(paper_domain) content[publish_time]['timestamp'].append(paper_timestamp) - content[publish_time]['section_contents'].append( - paper_section_contents) - content[publish_time]['table_captions'].append( - paper_table_captions) - content[publish_time]['figure_captions'].append( - paper_figure_captions) + content[publish_time]['section_contents'].append(paper_section_contents) + content[publish_time]['table_captions'].append(paper_table_captions) + content[publish_time]['figure_captions'].append(paper_figure_captions) content[publish_time]['bibliography'].append(paper_bibliography) else: content[publish_time] = {} @@ -56,15 +53,14 @@ def get_daily_papers( content[publish_time]['url'] = [paper_url] content[publish_time]['domain'] = [paper_domain] content[publish_time]['timestamp'] = [paper_timestamp] - content[publish_time]['section_contents'] = [ - paper_section_contents] + content[publish_time]['section_contents'] = [paper_section_contents] content[publish_time]['table_captions'] = [paper_table_captions] content[publish_time]['figure_captions'] = [paper_figure_captions] content[publish_time]['bibliography'] = [paper_bibliography] return content, publish_time -def get_full_content( +def get_paper_content( url: str, ) -> Tuple[ Optional[Dict[str, str]], @@ -95,8 +91,7 @@ def get_full_content( section_contents[section_title] = section_content # bibliography - bibliography_raw = article.find( - 'section', class_='ltx_bibliography') + bibliography_raw = article.find('section', class_='ltx_bibliography') if bibliography_raw is not None: bibliography = {} bibliography_list = bibliography_raw.find_all( @@ -140,8 +135,7 @@ def get_full_content( table_captions = {} table_index = 0 for table in tables: - table_caption_raw = table.find( - 'figcaption', class_='ltx_caption') + table_caption_raw = table.find('figcaption', class_='ltx_caption') if table_caption_raw: table_caption = table_caption_raw.text else: diff --git a/tests/utils/test_paper_collector.py b/tests/utils/test_paper_collector.py index b4cea8b1..12d5ec2b 100644 --- a/tests/utils/test_paper_collector.py +++ b/tests/utils/test_paper_collector.py @@ -1,7 +1,7 @@ import datetime from unittest.mock import MagicMock, patch -from research_town.utils.paper_collector import get_daily_papers, get_full_content +from research_town.utils.paper_collector import get_daily_papers, get_paper_content def test_get_daily_papers() -> None: @@ -29,8 +29,8 @@ def test_get_daily_papers() -> None: assert newest_day == datetime.date(2023, 7, 2) # Compare to the date part only -def test_get_full_content() -> None: - section_contents, table_captions, figure_captions, bibliography = get_full_content( +def test_get_paper_content() -> None: + section_contents, table_captions, figure_captions, bibliography = get_paper_content( 'https://arxiv.org/html/2403.05534v1' ) assert section_contents is not None