fix(collector): fix pdf parsing error catch (#741)

* fix pdf error
* fix pdf error
* fix pdf error
* fix pdf error
ulab-uiuc · Oct 4, 2024 · 28e8809 · 28e8809
1 parent dda113d
commit 28e8809
Show file tree

Hide file tree

Showing 8 changed files with 86 additions and 74 deletions.
diff --git a/.github/workflows/aws-backend.yml b/.github/workflows/aws-backend.yml
@@ -45,4 +45,4 @@ jobs:
           fi
           tmux send-keys -t 0 "uvicorn main:app --host 127.0.0.1 --port 8000" C-m
 
-        EOF
+        EOF
diff --git a/.github/workflows/aws-frontend.yml b/.github/workflows/aws-frontend.yml
@@ -27,7 +27,7 @@ jobs:
           git fetch origin
           git reset --hard origin/main
           sed -i 's|http://localhost:8000|https://demo.auto-research.dev|g' ./frontend/src/pages/Home.jsx
-          
+
           cd ./frontend/
           export NVM_DIR="$HOME/.nvm"
           [ -s "$NVM_DIR/nvm.sh" ] && \. "$NVM_DIR/nvm.sh"

diff --git a/configs/agent_prompt/brainstorm_idea.yaml b/configs/agent_prompt/brainstorm_idea.yaml
@@ -20,13 +20,13 @@ fewshot_examples:
   Develop a graph-based reinforcement learning framework that leverages node embeddings to represent states and actions, enabling more efficient exploration and value function approximation in environments with complex relational structures, while incorporating boundary-invariant analyses to ensure optimality guarantees regardless of how the agent-environment boundary is defined.
 
 template: |
-  Here is your research background: 
+  Here is your research background:
   {bio}
 
-  Here are the research insights: 
+  Here are the research insights:
   {insights}
 
-  Here are the related works: 
+  Here are the related works:
   {papers}
 
   Please begin brainstorming idea conditioned on your research background. Please keep it within one to two sentences. Your idea should be different from those in the related papers.
diff --git a/configs/agent_prompt/review_literature.yaml b/configs/agent_prompt/review_literature.yaml
@@ -45,10 +45,10 @@ fewshot_examples:
  The challenges highlighted in watermarking techniques, particularly the trade-off between robustness and text quality, can inspire further research into ethical AI decision-making frameworks. By exploring how watermarking can be integrated into AI systems to ensure accountability in AI-generated content, researchers could develop methods that enhance cooperation among AI agents while maintaining ethical standards. Additionally, investigating the implications of detectable watermarks on user trust and the ethical responsibilities of AI developers may yield valuable insights into the societal impacts of AI technologies.
 
 template: |
- Here is my profile biology: 
+ Here is my profile biology:
  {bio}
 
- Here are some recent papers: 
+ Here are some recent papers:
  {papers}
 
  Please begin writing research insights that you can gain based on your research experience and the academic context. Mainly based on the academic context. Please make sure it is only two to three sentences.

diff --git a/research_town/utils/paper_collector.py b/research_town/utils/paper_collector.py
@@ -305,62 +305,70 @@ def get_paper_bibliography_from_html(url: str) -> Optional[Dict[str, str]]:
 
 
 def get_paper_content_from_pdf(url: str) -> Optional[Dict[str, str]]:
-    if 'abs' in url:
-        pdf_url = url.replace('abs', 'pdf')
-    elif 'html' in url:
-        pdf_url = url.replace('html', 'pdf')
-    else:
-        pdf_url = url
-
-    response = requests.get(pdf_url)
-    file_stream = BytesIO(response.content)
-    reader = PdfReader(file_stream)
-
-    text = ''
-    for page in reader.pages:
-        text += page.extract_text()
-
-    if text == '':
-        return None
+    try:
+        if 'abs' in url:
+            pdf_url = url.replace('abs', 'pdf')
+        elif 'html' in url:
+            pdf_url = url.replace('html', 'pdf')
+        else:
+            pdf_url = url
+
+        response = requests.get(pdf_url)
+        file_stream = BytesIO(response.content)
+        reader = PdfReader(file_stream)
+
+        text = ''
+        for page in reader.pages:
+            text += page.extract_text()
+
+        if text == '':
+            return None
+
+        section_titles = [
+            'Abstract',
+            'Introduction',
+            'Related Work',
+            'Background',
+            'Methods',
+            'Experiments',
+            'Results',
+            'Discussion',
+            'Conclusion',
+            'Conclusions',
+            'Acknowledgments',
+            'References',
+            'Appendix',
+            'Materials and Methods',
+        ]
+
+        section_pattern = re.compile(
+            r'\b(' + '|'.join(re.escape(title) for title in section_titles) + r')\b',
+            re.IGNORECASE,
+        )
 
-    section_titles = [
-        'Abstract',
-        'Introduction',
-        'Related Work',
-        'Background',
-        'Methods',
-        'Experiments',
-        'Results',
-        'Discussion',
-        'Conclusion',
-        'Conclusions',
-        'Acknowledgments',
-        'References',
-        'Appendix',
-        'Materials and Methods',
-    ]
-
-    section_pattern = re.compile(
-        r'\b(' + '|'.join(re.escape(title) for title in section_titles) + r')\b',
-        re.IGNORECASE,
-    )
+        sections = {}
+        matches = list(section_pattern.finditer(text))
 
-    sections = {}
-    matches = list(section_pattern.finditer(text))
+        for i, match in enumerate(matches):
+            section_name = match.group()
+            section_start = match.start()
 
-    for i, match in enumerate(matches):
-        section_name = match.group()
-        section_start = match.start()
+            if i + 1 < len(matches):
+                section_end = matches[i + 1].start()
+            else:
+                section_end = len(text)
 
-        if i + 1 < len(matches):
-            section_end = matches[i + 1].start()
-        else:
-            section_end = len(text)
+            section_content = text[section_start:section_end].strip()
+            sections[section_name] = section_content
 
-        section_content = text[section_start:section_end].strip()
-        sections[section_name] = section_content
+        return sections
 
-    return sections
+    except requests.exceptions.RequestException as e:
+        print(f'Error fetching the PDF from the URL: {e}')
+        return None
+    except Exception as e:
+        print(f'An unexpected error occurred: {e}')
+        return None
 
 
 def get_paper_introduction(url: str) -> Optional[str]:

diff --git a/tests/mocks/mocking_func.py b/tests/mocks/mocking_func.py
@@ -22,10 +22,14 @@ def mock_prompting(
     eval_prompt_template = example_config.eval_prompt_template
     if prompt[0]['content'] == agent_prompt_template.write_bio['sys_prompt']:
         return ['Bio1', 'Bio2', 'Bio3']
+    elif prompt[0]['content'] == agent_prompt_template.summarize_domain['sys_prompt']:
+        return ['Domain1; Domain2', 'Domain3; Domain4', 'Domain5; Domain6']
     elif prompt[0]['content'] == agent_prompt_template.review_literature['sys_prompt']:
-        return ['Summary of Target Paper: Summary1. Keywords of Target Paper: keyword1. Valuable Points from Target Paper: Insight1',
-                'Summary of Target Paper: Summary2. Keywords of Target Paper: keyword2. Valuable Points from Target Paper: Insight2',
-                'Summary of Target Paper: Summary3. Keywords of Target Paper: keyword3. Valuable Points from Target Paper: Insight3']
+        return [
+            'Summary of Target Paper: Summary1. Keywords of Target Paper: keyword1. Valuable Points from Target Paper: Insight1',
+            'Summary of Target Paper: Summary2. Keywords of Target Paper: keyword2. Valuable Points from Target Paper: Insight2',
+            'Summary of Target Paper: Summary3. Keywords of Target Paper: keyword3. Valuable Points from Target Paper: Insight3',
+        ]
     elif prompt[0]['content'] == agent_prompt_template.brainstorm_idea['sys_prompt']:
         return ['Idea1', 'Idea2', 'Idea3']
     elif prompt[0]['content'] == agent_prompt_template.discuss_idea['sys_prompt']:

diff --git a/tests/utils/test_model_prompting.py b/tests/utils/test_model_prompting.py
@@ -3,12 +3,10 @@
 
 def test_openai_call() -> None:
     # supported by OPENAI_API_KEY
-    prompt = [
-        {
-            'role': 'user',
-            'content': 'Here is a high-level summarized insight of a research field Machine Learning. ',
-        }
-    ]
+    prompt = [{
+        'role': 'user',
+        'content': 'Here is a high-level summarized insight of a research field Machine Learning. ',
+    }]
     response = model_prompting('gpt-4o-mini', prompt, mode='TEST')
     assert response is not None
     assert len(response) > 0
@@ -17,12 +15,10 @@ def test_openai_call() -> None:
 
 def test_togetherai_mistral_call() -> None:
     # supported by TOGETHERAI_API_KEY
-    prompt = [
-        {
-            'role': 'user',
-            'content': 'Here is a high-level summarized insight of a research field Machine Learning. ',
-        }
-    ]
+    prompt = [{
+        'role': 'user',
+        'content': 'Here is a high-level summarized insight of a research field Machine Learning. ',
+    }]
     response = model_prompting(
         'together_ai/mistralai/Mixtral-8x7B-Instruct-v0.1', prompt, mode='TEST'
     )

diff --git a/tests/utils/test_string_mapper.py b/tests/utils/test_string_mapper.py
@@ -33,7 +33,9 @@ def test_map_paper_list_to_str() -> None:
         {'abstract': 'Abstract 2'},
         {'abstract': 'Abstract 3'},
     ]
-    expected_result = '1th paper: Abstract 1\n2th paper: Abstract 2\n3th paper: Abstract 3\n'
+    expected_result = (
+        '1th paper: Abstract 1\n2th paper: Abstract 2\n3th paper: Abstract 3\n'
+    )
     assert map_paper_list_to_str(papers) == expected_result
 
 
@@ -129,7 +131,9 @@ def test_map_insight_list_to_str() -> None:
         {'content': 'Insight 2'},
         {'content': 'Insight 3'},
     ]
-    expected_result = '1th insight: Insight 1\n2th insight: Insight 2\n3th insight: Insight 3\n'
+    expected_result = (
+        '1th insight: Insight 1\n2th insight: Insight 2\n3th insight: Insight 3\n'
+    )
     assert map_insight_list_to_str(insights) == expected_result