Commit
fix: [crawler] fix crawler queue stats
Terrtia committed Sep 17, 2024
1 parent cc7e67d commit a20b605
Showing 2 changed files with 12 additions and 0 deletions.
2 changes: 2 additions & 0 deletions bin/crawlers/Crawler.py
@@ -61,6 +61,8 @@ def __init__(self):
        crawlers.load_blacklist()
        # update captures cache
        crawlers.reload_crawler_captures()
        # update crawler queue stats
        crawlers.reload_crawlers_stats()

        self.crawler_scheduler = crawlers.CrawlerScheduler()

10 changes: 10 additions & 0 deletions bin/lib/crawlers.py
@@ -1018,6 +1018,16 @@ def get_crawlers_stats(domain_type=None):
        stats[domain_type] = {'queue': queue, 'up': up, 'down': down, 'crawled': crawled}
    return stats

def reload_crawlers_stats():
    for domain_type in get_crawler_all_types():
        to_remove = []
        for task_uuid in r_crawler.smembers(f'crawler:queue:type:{domain_type}'):
            task = CrawlerTask(task_uuid)
            if not task.exists():
                to_remove.append(task_uuid)
        for task_uuid in to_remove:
            r_crawler.srem(f'crawler:queue:type:{domain_type}', task_uuid)

#### Blocklist ####

def get_blacklist():
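
Note on the change: reload_crawlers_stats() walks each per-type crawler queue set and drops task UUIDs whose CrawlerTask no longer exists, so the queue counts reported by get_crawlers_stats() no longer include stale entries. Below is a minimal standalone sketch of the same cleanup pattern using redis-py; the task_exists() helper, the key names, and the example domain types are illustrative assumptions, not AIL's actual implementation.

    # sketch.py - stale-queue-entry cleanup, assuming a local Redis instance
    import redis

    r = redis.Redis(host='localhost', port=6379, db=0, decode_responses=True)

    def task_exists(task_uuid):
        # Hypothetical existence check: treat a task as live if its
        # (assumed) metadata key is still present in Redis
        return r.exists(f'crawler:task:{task_uuid}') == 1

    def cleanup_queue_stats(domain_types):
        for domain_type in domain_types:
            queue_key = f'crawler:queue:type:{domain_type}'
            # Collect stale UUIDs first, then remove them, so the set is
            # not mutated while it is being iterated
            to_remove = [uuid for uuid in r.smembers(queue_key) if not task_exists(uuid)]
            if to_remove:
                r.srem(queue_key, *to_remove)

    if __name__ == '__main__':
        cleanup_queue_stats(['onion', 'web'])  # example domain types
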
