-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathmain.py
177 lines (157 loc) · 5.54 KB
/
main.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
#!/usr/bin/env python3
import asyncio
import argparse
import logging
from datetime import datetime
from pathlib import Path
from typing import Optional
import yaml
import pytz
from src.core.connector import ArxivZoteroCollector
from src.core.search_params import ArxivSearchParams
from src.utils.credentials import load_credentials, CredentialsError
from src.utils.summarizer import PaperSummarizer
# Logging setup: records at INFO and above are sent both to the console
# stream and to an append-mode, UTF-8 encoded log file in the working directory.
_LOG_FORMAT = '%(asctime)s - %(levelname)s - %(message)s'
_log_handlers = [
    logging.StreamHandler(),
    logging.FileHandler('arxiv_zotero.log', mode='a', encoding='utf-8'),
]
logging.basicConfig(level=logging.INFO, format=_LOG_FORMAT, handlers=_log_handlers)

# Module-level logger used by the functions in this script.
logger = logging.getLogger(__name__)
def parse_date(date_str: Optional[str]) -> Optional[datetime]:
    """Convert a ``YYYY-MM-DD`` value into a timezone-aware UTC datetime.

    Besides strings, ``datetime.date`` and ``datetime.datetime`` objects are
    accepted: YAML loaders turn an unquoted ``2024-01-01`` into a date object
    rather than a string, and ``strptime`` rejects those with ``TypeError``.

    Args:
        date_str: Date text in ``YYYY-MM-DD`` format, a date/datetime object,
            or a falsy value (``None`` or an empty string).

    Returns:
        ``None`` when ``date_str`` is falsy. Otherwise a datetime tagged as
        UTC: midnight of the given day for strings and plain dates, the same
        wall-clock time for naive datetimes. A datetime that already carries
        a timezone is returned unchanged.

    Raises:
        argparse.ArgumentTypeError: If the value cannot be interpreted as a
            ``YYYY-MM-DD`` date.
    """
    # Function-scope import: the file's import block only brings in ``datetime``.
    from datetime import date

    if not date_str:
        return None

    # ``datetime`` is a subclass of ``date``, so it has to be tested first.
    if isinstance(date_str, datetime):
        if date_str.tzinfo is not None:
            return date_str
        return date_str.replace(tzinfo=pytz.UTC)
    if isinstance(date_str, date):
        return datetime(date_str.year, date_str.month, date_str.day, tzinfo=pytz.UTC)

    try:
        parsed_date = datetime.strptime(date_str, '%Y-%m-%d')
    except (ValueError, TypeError) as err:
        # TypeError covers non-string input (e.g. an int read from a config file).
        raise argparse.ArgumentTypeError(
            f"Invalid date format: {date_str}. Use YYYY-MM-DD"
        ) from err

    # Make the parsed value timezone-aware.
    return parsed_date.replace(tzinfo=pytz.UTC)
def load_yaml_config(config_path: Path) -> dict:
    """Load search parameters from a YAML configuration file.

    Args:
        config_path: Location of the YAML file to read.

    Returns:
        The top-level mapping stored in the file. An empty file (which the
        YAML loader reports as ``None``) yields an empty dict, so callers can
        always use ``.get`` on the result.

    Raises:
        argparse.ArgumentTypeError: If the file cannot be opened or parsed,
            or if its top-level value is not a mapping.
    """
    try:
        # Explicit encoding so the result does not depend on the platform locale.
        with open(config_path, 'r', encoding='utf-8') as f:
            loaded = yaml.safe_load(f)
    except Exception as e:
        raise argparse.ArgumentTypeError(f"Error loading config file: {str(e)}") from e

    if loaded is None:
        return {}
    if not isinstance(loaded, dict):
        raise argparse.ArgumentTypeError(
            "Error loading config file: expected a mapping at the top level, "
            f"got {type(loaded).__name__}"
        )
    return loaded
# Result cap applied when neither the command line nor the config file sets one.
_DEFAULT_MAX_RESULTS = 50


def parse_arguments():
    """Parse command line arguments.

    Returns:
        The ``argparse.Namespace`` produced from ``sys.argv``. ``max_results``
        is ``None`` when ``--max-results`` was not given, so the caller can
        tell "not specified" apart from an explicit ``--max-results 50``.
    """
    parser = argparse.ArgumentParser(
        description='ArXiv-Zotero Connector: Download and organize arXiv papers in Zotero'
    )
    parser.add_argument(
        '--config',
        type=Path,
        help='Path to YAML configuration file'
    )

    # Search parameters
    parser.add_argument(
        '--keywords', '-k',
        nargs='+',
        help='Keywords to search for (space-separated)'
    )
    parser.add_argument(
        '--title', '-t',
        help='Search specifically in paper titles'
    )
    parser.add_argument(
        '--categories', '-c',
        nargs='+',
        help='arXiv categories to search in (e.g., cs.AI cs.MA)'
    )
    parser.add_argument(
        '--author', '-a',
        help='Author name to search for'
    )
    parser.add_argument(
        '--start-date',
        type=parse_date,
        help='Start date for papers (YYYY-MM-DD)'
    )
    parser.add_argument(
        '--end-date',
        type=parse_date,
        help='End date for papers (YYYY-MM-DD)'
    )
    parser.add_argument(
        '--content-type',
        choices=['journal', 'conference', 'preprint'],
        help='Type of content to filter for'
    )
    parser.add_argument(
        '--max-results', '-m',
        type=int,
        # None (not 50) marks "flag absent"; main() applies the real default.
        default=None,
        help='Maximum number of results to retrieve (default: 50)'
    )

    # Application settings
    parser.add_argument(
        '--env-file',
        type=Path,
        help='Path to .env file containing credentials'
    )
    parser.add_argument(
        '--no-pdf',
        action='store_true',
        help='Skip downloading PDFs'
    )
    return parser.parse_args()


async def main():
    """Run one collection pass and report the outcome as an exit code.

    Loads credentials and the optional YAML config, builds the collector
    (with a summarizer when the config enables one), merges command line
    and config search settings with the command line taking precedence,
    and runs the collection.

    Returns:
        0 when the collection finished with no failed items; 1 when any item
        failed or when an error was raised along the way.
    """
    args = parse_arguments()
    collector = None
    try:
        # Load credentials
        credentials = load_credentials(args.env_file)

        # Load YAML config if provided
        config_params = {}
        if args.config:
            # An empty YAML document loads as None; treat it as "no settings".
            config_params = load_yaml_config(args.config) or {}

        summarizer = None
        # ``summarizer:`` with no value loads as None, hence the ``or {}``.
        summarizer_cfg = config_params.get('summarizer') or {}
        if isinstance(summarizer_cfg, dict) and summarizer_cfg.get('enabled'):
            summarizer = PaperSummarizer(
                api_key=credentials['gemini_api_key'],
                config=config_params
            )

        # Initialize collector
        collector = ArxivZoteroCollector(
            zotero_library_id=credentials['library_id'],
            zotero_api_key=credentials['api_key'],
            collection_key=credentials['collection_key'],
            summarizer=summarizer,
            config=config_params
        )

        # Command line wins over the config file, which wins over the default.
        max_results = args.max_results
        if max_results is None:
            max_results = config_params.get('max_results')
        if max_results is None:
            max_results = _DEFAULT_MAX_RESULTS

        # Merge command line arguments with config file, preferring command line
        search_params = ArxivSearchParams(
            keywords=args.keywords or config_params.get('keywords'),
            title_search=args.title or config_params.get('title_search'),
            categories=args.categories or config_params.get('categories'),
            author=args.author or config_params.get('author'),
            start_date=args.start_date or parse_date(config_params.get('start_date')),
            end_date=args.end_date or parse_date(config_params.get('end_date')),
            content_type=args.content_type or config_params.get('content_type'),
            max_results=max_results
        )

        # Run collection process
        successful, failed = await collector.run_collection_async(
            search_params=search_params,
            download_pdfs=not args.no_pdf
        )
        logger.info(f"Collection complete. Successfully processed: {successful}, Failed: {failed}")
    except CredentialsError as e:
        logger.error(f"Credential error: {str(e)}")
        return 1
    except Exception as e:
        # logger.exception keeps the traceback, which the bare message loses
        # (e.g. a KeyError would otherwise log only the missing key name).
        logger.exception(f"Application error: {str(e)}")
        return 1
    finally:
        if collector:
            await collector.close()

    # Only reached when no exception occurred, so ``failed`` is always bound here.
    return 0 if failed == 0 else 1
if __name__ == "__main__":
    # Use main()'s return value as the process exit status. SystemExit is
    # raised directly because the bare ``exit`` helper is installed by the
    # ``site`` module for interactive use and is not guaranteed to exist
    # (for example when Python runs with -S).
    exit_code = asyncio.run(main())
    raise SystemExit(exit_code)