Commit
fix: [crawler] fix crawler queue stats
Terrtia committed Sep 17, 2024
1 parent cc7e67d commit a20b605
Showing 2 changed files with 12 additions and 0 deletions.
2 changes: 2 additions & 0 deletions bin/crawlers/Crawler.py
@@ -61,6 +61,8 @@ def __init__(self):
        crawlers.load_blacklist()
        # update captures cache
        crawlers.reload_crawler_captures()
        # update crawler queue stats
        crawlers.reload_crawlers_stats()

        self.crawler_scheduler = crawlers.CrawlerScheduler()

10 changes: 10 additions & 0 deletions bin/lib/crawlers.py
@@ -1018,6 +1018,16 @@ def get_crawlers_stats(domain_type=None):
        stats[domain_type] = {'queue': queue, 'up': up, 'down': down, 'crawled': crawled}
    return stats

def reload_crawlers_stats():
    for domain_type in get_crawler_all_types():
        to_remove = []
        for task_uuid in r_crawler.smembers(f'crawler:queue:type:{domain_type}'):
            task = CrawlerTask(task_uuid)
            if not task.exists():
                to_remove.append(task_uuid)
        for task_uuid in to_remove:
            r_crawler.srem(f'crawler:queue:type:{domain_type}', task_uuid)

#### Blocklist ####

def get_blacklist():
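
Note on the change: reload_crawlers_stats() walks each per-type crawler queue set and drops task UUIDs whose CrawlerTask no longer exists, so the queue counts reported by get_crawlers_stats() no longer include stale entries. Below is a minimal standalone sketch of the same cleanup pattern using redis-py; the task_exists() helper, the key names, and the example domain types are illustrative assumptions, not AIL's actual implementation.

    # sketch.py - stale-queue-entry cleanup, assuming a local Redis instance
    import redis

    r = redis.Redis(host='localhost', port=6379, db=0, decode_responses=True)

    def task_exists(task_uuid):
        # Hypothetical existence check: treat a task as live if its
        # (assumed) metadata key is still present in Redis
        return r.exists(f'crawler:task:{task_uuid}') == 1

    def cleanup_queue_stats(domain_types):
        for domain_type in domain_types:
            queue_key = f'crawler:queue:type:{domain_type}'
            # Collect stale UUIDs first, then remove them, so the set is
            # not mutated while it is being iterated
            to_remove = [uuid for uuid in r.smembers(queue_key) if not task_exists(uuid)]
            if to_remove:
                r.srem(queue_key, *to_remove)

    if __name__ == '__main__':
        cleanup_queue_stats(['onion', 'web'])  # example domain types
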
