Skip to content

Commit

Permalink
fix(collector): fix pdf parsing error catch (#741)
Browse files Browse the repository at this point in the history
* fix pdf error

* fix pdf error

* fix pdf error

* fix pdf error
  • Loading branch information
lwaekfjlk authored Oct 4, 2024
1 parent dda113d commit 28e8809
Show file tree
Hide file tree
Showing 8 changed files with 86 additions and 74 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/aws-backend.yml
Original file line number Diff line number Diff line change
Expand Up @@ -45,4 +45,4 @@ jobs:
fi
tmux send-keys -t 0 "uvicorn main:app --host 127.0.0.1 --port 8000" C-m
EOF
EOF
2 changes: 1 addition & 1 deletion .github/workflows/aws-frontend.yml
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@ jobs:
git fetch origin
git reset --hard origin/main
sed -i 's|http://localhost:8000|https://demo.auto-research.dev|g' ./frontend/src/pages/Home.jsx
cd ./frontend/
export NVM_DIR="$HOME/.nvm"
[ -s "$NVM_DIR/nvm.sh" ] && \. "$NVM_DIR/nvm.sh"
Expand Down
6 changes: 3 additions & 3 deletions configs/agent_prompt/brainstorm_idea.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -20,13 +20,13 @@ fewshot_examples:
Develop a graph-based reinforcement learning framework that leverages node embeddings to represent states and actions, enabling more efficient exploration and value function approximation in environments with complex relational structures, while incorporating boundary-invariant analyses to ensure optimality guarantees regardless of how the agent-environment boundary is defined.
template: |
Here is your research background:
Here is your research background:
{bio}
Here are the research insights:
Here are the research insights:
{insights}
Here are the related works:
Here are the related works:
{papers}
Please begin brainstorming idea conditioned on your research background. Please keep it within one to two sentences. Your idea should be different from those in the related papers.
4 changes: 2 additions & 2 deletions configs/agent_prompt/review_literature.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -45,10 +45,10 @@ fewshot_examples:
The challenges highlighted in watermarking techniques, particularly the trade-off between robustness and text quality, can inspire further research into ethical AI decision-making frameworks. By exploring how watermarking can be integrated into AI systems to ensure accountability in AI-generated content, researchers could develop methods that enhance cooperation among AI agents while maintaining ethical standards. Additionally, investigating the implications of detectable watermarks on user trust and the ethical responsibilities of AI developers may yield valuable insights into the societal impacts of AI technologies.

template: |
Here is my profile biology:
Here is my profile biology:
{bio}

Here are some recent papers:
Here are some recent papers:
{papers}

Please begin writing research insights that you can gain based on your research experience and the academic context. Mainly based on the academic context. Please make sure it is only two to three sentences.
Expand Down
108 changes: 58 additions & 50 deletions research_town/utils/paper_collector.py
Original file line number Diff line number Diff line change
Expand Up @@ -305,62 +305,70 @@ def get_paper_bibliography_from_html(url: str) -> Optional[Dict[str, str]]:


# Headings treated as section boundaries when splitting extracted PDF text.
_PDF_SECTION_TITLES = [
    'Abstract',
    'Introduction',
    'Related Work',
    'Background',
    'Methods',
    'Experiments',
    'Results',
    'Discussion',
    'Conclusion',
    'Conclusions',
    'Acknowledgments',
    'References',
    'Appendix',
    'Materials and Methods',
]

# Compiled once at import time instead of on every call.
_PDF_SECTION_PATTERN = re.compile(
    r'\b(' + '|'.join(re.escape(title) for title in _PDF_SECTION_TITLES) + r')\b',
    re.IGNORECASE,
)

# Seconds to wait for the server before giving up; without a timeout
# ``requests.get`` can block indefinitely.
_PDF_REQUEST_TIMEOUT = 30


def _resolve_pdf_url(url: str) -> str:
    """Return the PDF URL for *url*, rewriting an ``/abs/`` or ``/html/`` path.

    Only the first matching path segment is rewritten, so an unrelated
    "abs"/"html" substring elsewhere in the URL (for example a host name
    containing "labs") is left untouched.
    """
    if '/abs/' in url:
        return url.replace('/abs/', '/pdf/', 1)
    if '/html/' in url:
        return url.replace('/html/', '/pdf/', 1)
    return url


def _split_text_into_sections(text: str) -> Dict[str, str]:
    """Split *text* at every occurrence of a known section title.

    Each key is the matched title exactly as it appears in *text*; each value
    runs from that title up to the next match (or the end of the text),
    stripped of surrounding whitespace. A later match with the same spelling
    overwrites the earlier entry.
    """
    matches = list(_PDF_SECTION_PATTERN.finditer(text))
    sections: Dict[str, str] = {}
    for i, match in enumerate(matches):
        if i + 1 < len(matches):
            section_end = matches[i + 1].start()
        else:
            section_end = len(text)
        sections[match.group()] = text[match.start():section_end].strip()
    return sections


def get_paper_content_from_pdf(url: str) -> Optional[Dict[str, str]]:
    """Download a paper's PDF and split its extracted text into sections.

    Args:
        url: Link to the paper. An ``/abs/`` or ``/html/`` path segment is
            rewritten to ``/pdf/``; any other URL is fetched unchanged.

    Returns:
        A mapping from matched section title to section text, or ``None`` when
        the download fails, the PDF cannot be parsed, or no text could be
        extracted. Errors are printed rather than raised.
    """
    try:
        pdf_url = _resolve_pdf_url(url)

        response = requests.get(pdf_url, timeout=_PDF_REQUEST_TIMEOUT)
        # Fail fast on HTTP errors instead of handing an error page to the
        # PDF parser.
        response.raise_for_status()

        reader = PdfReader(BytesIO(response.content))
        # ``or ''`` guards against a page yielding no text, so one empty page
        # does not discard the whole document.
        text = ''.join(page.extract_text() or '' for page in reader.pages)
        if not text:
            return None

        return _split_text_into_sections(text)
    except requests.exceptions.RequestException as e:
        print(f'Error fetching the PDF from the URL: {e}')
        return None
    except Exception as e:
        # Deliberate best-effort boundary: any parsing failure yields None.
        print(f'An unexpected error occurred: {e}')
        return None


def get_paper_introduction(url: str) -> Optional[str]:
Expand Down
10 changes: 7 additions & 3 deletions tests/mocks/mocking_func.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,10 +22,14 @@ def mock_prompting(
eval_prompt_template = example_config.eval_prompt_template
if prompt[0]['content'] == agent_prompt_template.write_bio['sys_prompt']:
return ['Bio1', 'Bio2', 'Bio3']
elif prompt[0]['content'] == agent_prompt_template.summarize_domain['sys_prompt']:
return ['Domain1; Domain2', 'Domain3; Domain4', 'Domain5; Domain6']
elif prompt[0]['content'] == agent_prompt_template.review_literature['sys_prompt']:
return ['Summary of Target Paper: Summary1. Keywords of Target Paper: keyword1. Valuable Points from Target Paper: Insight1',
'Summary of Target Paper: Summary2. Keywords of Target Paper: keyword2. Valuable Points from Target Paper: Insight2',
'Summary of Target Paper: Summary3. Keywords of Target Paper: keyword3. Valuable Points from Target Paper: Insight3']
return [
'Summary of Target Paper: Summary1. Keywords of Target Paper: keyword1. Valuable Points from Target Paper: Insight1',
'Summary of Target Paper: Summary2. Keywords of Target Paper: keyword2. Valuable Points from Target Paper: Insight2',
'Summary of Target Paper: Summary3. Keywords of Target Paper: keyword3. Valuable Points from Target Paper: Insight3',
]
elif prompt[0]['content'] == agent_prompt_template.brainstorm_idea['sys_prompt']:
return ['Idea1', 'Idea2', 'Idea3']
elif prompt[0]['content'] == agent_prompt_template.discuss_idea['sys_prompt']:
Expand Down
20 changes: 8 additions & 12 deletions tests/utils/test_model_prompting.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,12 +3,10 @@

def test_openai_call() -> None:
# supported by OPENAI_API_KEY
prompt = [
{
'role': 'user',
'content': 'Here is a high-level summarized insight of a research field Machine Learning. ',
}
]
prompt = [{
'role': 'user',
'content': 'Here is a high-level summarized insight of a research field Machine Learning. ',
}]
response = model_prompting('gpt-4o-mini', prompt, mode='TEST')
assert response is not None
assert len(response) > 0
Expand All @@ -17,12 +15,10 @@ def test_openai_call() -> None:

def test_togetherai_mistral_call() -> None:
# supported by TOGETHERAI_API_KEY
prompt = [
{
'role': 'user',
'content': 'Here is a high-level summarized insight of a research field Machine Learning. ',
}
]
prompt = [{
'role': 'user',
'content': 'Here is a high-level summarized insight of a research field Machine Learning. ',
}]
response = model_prompting(
'together_ai/mistralai/Mixtral-8x7B-Instruct-v0.1', prompt, mode='TEST'
)
Expand Down
8 changes: 6 additions & 2 deletions tests/utils/test_string_mapper.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,9 @@ def test_map_paper_list_to_str() -> None:
{'abstract': 'Abstract 2'},
{'abstract': 'Abstract 3'},
]
expected_result = '1th paper: Abstract 1\n2th paper: Abstract 2\n3th paper: Abstract 3\n'
expected_result = (
'1th paper: Abstract 1\n2th paper: Abstract 2\n3th paper: Abstract 3\n'
)
assert map_paper_list_to_str(papers) == expected_result


Expand Down Expand Up @@ -129,7 +131,9 @@ def test_map_insight_list_to_str() -> None:
{'content': 'Insight 2'},
{'content': 'Insight 3'},
]
expected_result = '1th insight: Insight 1\n2th insight: Insight 2\n3th insight: Insight 3\n'
expected_result = (
'1th insight: Insight 1\n2th insight: Insight 2\n3th insight: Insight 3\n'
)
assert map_insight_list_to_str(insights) == expected_result


Expand Down

0 comments on commit 28e8809

Please sign in to comment.