From b25c1e182a5d3116b7d851e55048735bb3585dbe Mon Sep 17 00:00:00 2001 From: TofyLion Date: Sun, 8 Sep 2024 19:43:09 +0300 Subject: [PATCH] Updated crawler for new immowelt website --- flathunter/crawler/immowelt.py | 56 +++++++++++++++++++--------------- 1 file changed, 31 insertions(+), 25 deletions(-) diff --git a/flathunter/crawler/immowelt.py b/flathunter/crawler/immowelt.py index 8f6b113f..070c58fa 100644 --- a/flathunter/crawler/immowelt.py +++ b/flathunter/crawler/immowelt.py @@ -49,61 +49,68 @@ def get_expose_details(self, expose): def extract_data(self, soup: BeautifulSoup): """Extracts all exposes from a provided Soup object""" entries = [] - soup_res = soup.find("main") + soup_res = soup if not isinstance(soup_res, Tag): return [] - title_elements = soup_res.find_all("h2") - expose_ids = soup_res.find_all("a", id=True) + advertisements = soup_res.find_all("div", attrs={"class": "css-79elbk"}) + for adv in advertisements: + try: + title = adv.find("div", {"class": "css-1cbj9xw"}).text + except: + title = "" - for idx, title_el in enumerate(title_elements): try: - price = expose_ids[idx].find( - "div", attrs={"data-test": "price"}).text - except IndexError: + price = adv.find( + "div", attrs={"data-test": "cardmfe-price-testid"}).text + except: price = "" try: - size = expose_ids[idx].find( - "div", attrs={"data-test": "area"}).text + descriptions = adv.find("div", attrs={"data-testid": "cardmfe-keyfacts-testid"}).children + descriptions = [result.text for result in descriptions] + except: + descriptions = [] + + size = list(filter(lambda x: "m²" in x, descriptions)) + try: + size = size[0] except IndexError: size = "" + rooms = list(filter(lambda x: "Zimmer" in x, descriptions)) try: - rooms = expose_ids[idx].find( - "div", attrs={"data-test": "rooms"}).text.replace(" Zi.", "") + rooms = rooms[0] except IndexError: rooms = "" + id_element = adv.find("a") try: - url = expose_ids[idx].get("href") + url = "https://immowelt.de" + id_element.get("href") except IndexError: continue - picture = expose_ids[idx].find("picture") + picture = adv.find("img") image = None if picture: - src = picture.find("source") - if src: - image = src.get("data-srcset") + image = picture.get('src') try: - address = expose_ids[idx].find( - "div", attrs={"class": re.compile("IconFact.*")} - ) - address = address.find("span").text + address = adv.find( + "div", attrs={"data-testid": "cardmfe-description-box-address"} + ).text except (IndexError, AttributeError): address = "" - + ad_id = url.split('/')[-1] processed_id = int( - hashlib.sha256(expose_ids[idx].get("id").encode('utf-8')).hexdigest(), 16 + hashlib.sha256(ad_id.encode('utf-8')).hexdigest(), 16 ) % 10**16 details = { 'id': processed_id, 'image': image, 'url': url, - 'title': title_el.text.strip(), + 'title': title.strip(), 'rooms': rooms, 'price': price, 'size': size, @@ -113,5 +120,4 @@ def extract_data(self, soup: BeautifulSoup): entries.append(details) logger.debug('Number of entries found: %d', len(entries)) - - return entries + return entries \ No newline at end of file