From eaf28a772dbbca3e5719c01ca8ae8c9271bc84ae Mon Sep 17 00:00:00 2001
From: adhishthite
Date: Tue, 19 Mar 2024 10:39:52 +0530
Subject: [PATCH] implemented Marathi crawler

---
 app/core/crawler.py | 33 ++++++++++++++++++++++++++-------
 1 file changed, 26 insertions(+), 7 deletions(-)

diff --git a/app/core/crawler.py b/app/core/crawler.py
index f4ca9b1..fb4421e 100644
--- a/app/core/crawler.py
+++ b/app/core/crawler.py
@@ -47,8 +47,8 @@ def process_and_store_page(url: str):
 
     data = {"title": title, "url": url, "content": text}
 
-    # create a unique file name using hashing algorithm
-    file_name = hashlib.md5(string=url.encode()).hexdigest()
+    # Create a unique file name using MD5 hashing algorithm
+    file_name = get_string_hash(url)
 
     with open(
         os.path.join("data", f"{file_name}.json"), "w"
@@ -58,7 +58,7 @@ def process_and_store_page(url: str):
 
 def crawl_and_process_page(args):
     """Crawls and processes a single Wikipedia page."""
-    current_url, depth, max_depth, rp, visited, to_visit = args
+    current_url, depth, max_depth, rp, visited, to_visit, crawled_urls = args
 
     new_links = []
 
@@ -67,7 +67,12 @@ def crawl_and_process_page(args):
 
         if rp.can_fetch("*", current_url):
             print(f"Visiting: {current_url} at depth {depth}")
-            process_and_store_page(current_url)
+
+            url_hash = get_string_hash(current_url)
+            if url_hash not in crawled_urls:
+                process_and_store_page(current_url)
+            else:
+                print(f"Skipping: (Already processed) {url_hash}")
 
             if depth < max_depth:
                 links = get_wiki_links(current_url)
@@ -85,6 +90,7 @@ def crawl_and_process_page(args):
 def crawl_wikipedia(start_url: str, max_depth: int = 2):
     """Crawls Wikipedia pages."""
     init_data_folder()  # Create data folder if it doesn't exist
+    crawled_urls: set = get_crawled_urls()  # Get a list of already crawled URLs
 
     rp = RobotFileParser()
     rp.set_url("https://mr.wikipedia.org/robots.txt")
@@ -94,11 +100,11 @@ def crawl_wikipedia(start_url: str, max_depth: int = 2):
     visited = manager.list()
     to_visit = manager.list([(start_url, 0)])  # A queue of (url, depth) pairs
 
-    with Pool(processes=8) as pool:
+    with Pool(processes=os.cpu_count()) as pool:
         with tqdm(total=0, desc="Crawling", unit="page") as pbar:
             while to_visit:
                 args_list = [
-                    (url, depth, max_depth, rp, visited, to_visit)
+                    (url, depth, max_depth, rp, visited, to_visit, crawled_urls)
                     for url, depth in to_visit
                 ]
                 results = pool.map(crawl_and_process_page, args_list)
@@ -108,6 +114,19 @@ def crawl_wikipedia(start_url: str, max_depth: int = 2):
                 pbar.update(len(args_list))
 
 
+def get_crawled_urls(data_path: str = "data") -> set[str]:
+    """Returns a list of crawled URLs."""
+    if not os.path.exists(data_path):
+        return set()
+    # Strip the extension and get the hash of the filename
+    return set([file_name.split(".")[0] for file_name in os.listdir(data_path)])
+
+
+def get_string_hash(string: str) -> str:
+    """Returns the MD5 hash of a string."""
+    return hashlib.md5(string.encode()).hexdigest()
+
+
 if __name__ == "__main__":
-    start_page = "https://mr.wikipedia.org/wiki/%E0%A4%AE%E0%A4%B9%E0%A4%BE%E0%A4%B0%E0%A4%BE%E0%A4%B7%E0%A5%8D%E0%A4%9F%E0%A5%8D%E0%A4%B0"
+    start_page = "https://mr.wikipedia.org/wiki/%E0%A4%A4%E0%A4%BF%E0%A4%AC%E0%A5%87%E0%A4%9F%E0%A5%80_%E0%A4%AD%E0%A4%BE%E0%A4%B7%E0%A4%BE"
     crawl_wikipedia(start_url=start_page, max_depth=3)  # Start the crawl
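
Note for reviewers: below is a minimal, standalone sketch of the de-duplication flow this patch introduces, i.e. hash each URL with MD5 and compare against the file names already written to data/. It mirrors get_string_hash and get_crawled_urls from the diff above; the example URLs and the __main__ driver are illustrative assumptions, not part of the patch.

import hashlib
import os


def get_string_hash(string: str) -> str:
    """Return the MD5 hex digest of a string (used as the on-disk file name)."""
    return hashlib.md5(string.encode()).hexdigest()


def get_crawled_urls(data_path: str = "data") -> set[str]:
    """Return the set of URL hashes already stored as data/<hash>.json files."""
    if not os.path.exists(data_path):
        return set()
    # File names are "<md5>.json", so stripping the extension recovers the hash.
    return {file_name.split(".")[0] for file_name in os.listdir(data_path)}


if __name__ == "__main__":
    crawled_urls = get_crawled_urls()
    # Hypothetical URLs, only to demonstrate the skip/process decision.
    for url in (
        "https://mr.wikipedia.org/wiki/Example_page_1",
        "https://mr.wikipedia.org/wiki/Example_page_2",
    ):
        url_hash = get_string_hash(url)
        if url_hash in crawled_urls:
            print(f"Skipping: (Already processed) {url_hash}")
        else:
            print(f"Would process: {url}")

Because the hash is only used to derive a stable, filesystem-safe file name (not for any security purpose), MD5 is sufficient here, and restarting the crawler skips pages whose data/<hash>.json file already exists.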