diff --git a/Raw DB Tasks/__pycache__/settings.cpython-310.pyc b/Raw DB Tasks/__pycache__/settings.cpython-310.pyc
index 3a1f70a..1451921 100644
Binary files a/Raw DB Tasks/__pycache__/settings.cpython-310.pyc and b/Raw DB Tasks/__pycache__/settings.cpython-310.pyc differ
diff --git a/__pycache__/config.cpython-310.pyc b/__pycache__/config.cpython-310.pyc
index bc3cd4e..bf94cfa 100644
Binary files a/__pycache__/config.cpython-310.pyc and b/__pycache__/config.cpython-310.pyc differ
diff --git a/extractors/__pycache__/__init__.cpython-310.pyc b/extractors/__pycache__/__init__.cpython-310.pyc
index 3495dbb..fa8e1a8 100644
Binary files a/extractors/__pycache__/__init__.cpython-310.pyc and b/extractors/__pycache__/__init__.cpython-310.pyc differ
diff --git a/extractors/__pycache__/items.cpython-310.pyc b/extractors/__pycache__/items.cpython-310.pyc
index ff31a0a..2332f75 100644
Binary files a/extractors/__pycache__/items.cpython-310.pyc and b/extractors/__pycache__/items.cpython-310.pyc differ
diff --git a/extractors/__pycache__/pipelines.cpython-310.pyc b/extractors/__pycache__/pipelines.cpython-310.pyc
index 42ecea5..9db3d1a 100644
Binary files a/extractors/__pycache__/pipelines.cpython-310.pyc and b/extractors/__pycache__/pipelines.cpython-310.pyc differ
diff --git a/extractors/__pycache__/settings.cpython-310.pyc b/extractors/__pycache__/settings.cpython-310.pyc
index 78d6ec9..8ef0207 100644
Binary files a/extractors/__pycache__/settings.cpython-310.pyc and b/extractors/__pycache__/settings.cpython-310.pyc differ
diff --git a/extractors/__pycache__/utils.cpython-310.pyc b/extractors/__pycache__/utils.cpython-310.pyc
index ffbb07f..7819fe6 100644
Binary files a/extractors/__pycache__/utils.cpython-310.pyc and b/extractors/__pycache__/utils.cpython-310.pyc differ
diff --git a/extractors/items.py b/extractors/items.py
index 1befaed..14f0359 100644
--- a/extractors/items.py
+++ b/extractors/items.py
@@ -21,3 +21,5 @@ class MarketItem(scrapy.Item):
     oldPrice = scrapy.Field()
     productProcessTime= scrapy.Field()
     productProcessSize= scrapy.Field()
+    variant= scrapy.Field()
+    discountType= scrapy.Field()
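Note on the two new fields: Scrapy Items are dict-like, and reading a field that was never populated raises KeyError rather than returning None, which is why the pipelines.py hunk below guards the "variant" lookup with a try/except. A minimal standalone sketch (not part of the patch):

    import scrapy

    class MarketItem(scrapy.Item):
        variant = scrapy.Field()
        discountType = scrapy.Field()

    item = MarketItem()
    item["discountType"] = "Percent"
    print(item.get("variant"))   # None -- the safe accessor
    # item["variant"]            # raises KeyError: 'variant' when the spider never set it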
diff --git a/extractors/pipelines.py b/extractors/pipelines.py
index 41fff87..55a51da 100644
--- a/extractors/pipelines.py
+++ b/extractors/pipelines.py
@@ -79,6 +79,10 @@ def process_item(self, item, spider):
         productToSave["productLocalId"] = product["productLocalId"]
         productToSave["productProcessTime"] = product["productProcessTime"]
         productToSave["productProcessSize"] = product["productProcessSize"]
+        try:
+            productToSave["productVariants"] = product["variant"]
+        except Exception as error:
+            print("no variant")
 
         # add necessary data related to collections.
         productToSave["lastUpdate"] = datetime.timestamp(datetime.now())
@@ -112,13 +116,21 @@ def process_item(self, item, spider):
         try:
             oldPrice = float(sub(r'[^\d.]', '', product["oldPrice"]))
             currentPrice = float(sub(r'[^\d.]', '', product["price"]))
-            discountValue = 100 - currentPrice * 100 / oldPrice or 0
-            price["productDiscountValue"] = float(f'{discountValue:.2f}')
+            if product["discountType"] == "Percent":
+                discountValue = 100 - currentPrice * 100 / oldPrice or 0
+            elif product["discountType"] == "Fixed":
+                discountValue = oldPrice - currentPrice
+
+            discountValue = int(discountValue)
+            price["productDiscount"] = {
+                "productDiscountValue" : discountValue,
+                "productDiscountType" : product["discountType"]
+            }
             price["productOldPrice"] = oldPrice
         except Exception as inst:
             print(inst)
             price["productOldPrice"] = float(format(0, '.2f'))
-            price["productDiscountValue"] = float(format(0, '.2f'))
+            price["productDiscount"] = {}
 
         self.productPriceHistoryCollection.insert_one(price)
         return item
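For reference, the discount arithmetic from the hunk above as a standalone function with a worked example (illustrative names, not part of the patch). Note that in the patched code a discountType that is neither "Percent" nor "Fixed" leaves discountValue unbound, and it is the surrounding try/except that swallows the resulting NameError:

    def discount_value(old_price: float, current_price: float, discount_type: str) -> int:
        if discount_type == "Percent":
            # e.g. 100.00 -> 80.00 is a discount of 20 (percent)
            return int(100 - current_price * 100 / old_price)
        if discount_type == "Fixed":
            # e.g. 100.00 -> 80.00 is a discount of 20 (currency units)
            return int(old_price - current_price)
        return 0  # explicit fallback instead of relying on the except block

    assert discount_value(100.0, 80.0, "Percent") == 20
    assert discount_value(100.0, 80.0, "Fixed") == 20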
diff --git a/extractors/selectors/__pycache__/amazon.cpython-310.pyc b/extractors/selectors/__pycache__/amazon.cpython-310.pyc
index 45edec0..b17d136 100644
Binary files a/extractors/selectors/__pycache__/amazon.cpython-310.pyc and b/extractors/selectors/__pycache__/amazon.cpython-310.pyc differ
diff --git a/extractors/selectors/amazon.py b/extractors/selectors/amazon.py
index 70b22ec..d871e9c 100644
--- a/extractors/selectors/amazon.py
+++ b/extractors/selectors/amazon.py
@@ -20,20 +20,51 @@
     "userRatingCount": ['//span[@id="acrCustomerReviewText"]/text()'],
     "userRatingStar": ['//span[@id="acrPopover"]/@title'],
     "price": [
-        # '//span[contains(@class,"a-price")]/span[1]/text()', '//div[@id="centerCol"]/div[
-        # @id="apex_desktop"]/div/div/div[@id="corePrice_desktop"]/div/table/tr/td/span',
-        # '//div[@id="centerCol"]/div[@id="apex_desktop"]/div/div/div[@id="corePrice_desktop"]/div/table/tr/td/span[
-        # contains(@class,"a-price") and not(contains(@data-a-strike,"true"))]',
-        '//div[@id="centerCol"]/div[@id="apex_desktop"]/div/div/div[@id="corePrice_desktop"]/div/table/tr/td/span['
-        'contains(@class,"a-price") and not(contains(@data-a-strike,"true"))]/span[1]/text()',
-        '//div[@id="centerCol"]/div/div[@id="corePriceDisplay_desktop_feature_div"]/div[1]/span/span/text()'
+        '//div[@id="centerCol"]/div[@id="apex_desktop"]/div/div/div[@id="corePrice_desktop"]/div/table/tr/td/span[contains(@class,"a-price") and not(contains(@data-a-strike,"true"))]/span[1]/text()',
+        '//*[@id="corePriceDisplay_desktop_feature_div"]/div[1]/span/span[1]/text()',
+        '//span[contains(@class, "priceToPay")]/span[1]/text()',
+        '//*[@id="snsDetailPagePrice"]/span[@id="sns-base-price"]/text()',
+        '//*[@id="priceblock_ourprice"]/text()',
+        '//*[@id="corePrice_desktop"]/div/table/tr[2]/td[2]/span[1]/span[1]/text()'
     ],
     "oldPrice": [
-        '//div[@id="centerCol"]/div[@id="apex_desktop"]/div/div/div[@id="corePrice_desktop"]/div/table/tr/td/span['
-        'contains(@class,"a-price") and @data-a-strike="true"]/span[1]/text()',
-        # '//span[contains(@class,"a-price") and @data-a-strike="true"]/span[1]/text()',
+        '//div[@id="centerCol"]/div[@id="apex_desktop"]/div/div/div[@id="corePrice_desktop"]/div/table/tr/td/span[contains(@class,"a-price") and @data-a-strike="true"]/span[1]/text()',
+        '//*[@id="corePrice_desktop"]/div/table/tr[1]/td[2]/span[@data-a-strike="true"]/span[1]/text()'
+
     ],
+    "discountType":[
+        '//*[@id="savingsPercentage"]/text()',
+        '//*[@id="corePrice_desktop"]/div/table/tr[3]/td[2]/span[1]/text()',
+    ],
     "variants": [
-        '//li[@data-defaultasin]/@data-defaultasin'
+        '//li[@data-defaultasin]/@data-dp-url',
+        '//option[@class="dropdownAvailable"]/@value'
+    ],
+    "variantName":[
+        '//div[contains(@class,"twisterTextDiv")]/p/text()',
+        '/@data-a-html-content'
+    ],
+    'variantPrice':[
+        '//p[contains(@class,"twisterSwatchPrice")]/text()'
+    ],
+    'variantGroups':[
+        '//form[@id="twister"]/div[contains(@id,"variation_")]'
     ]
 }
+
+#price data
+# '//span[contains(@class,"a-price")]/span[1]/text()', '//div[@id="centerCol"]/div[
+# @id="apex_desktop"]/div/div/div[@id="corePrice_desktop"]/div/table/tr/td/span',
+# '//div[@id="centerCol"]/div[@id="apex_desktop"]/div/div/div[@id="corePrice_desktop"]/div/table/tr/td/span[
+# contains(@class,"a-price") and not(contains(@data-a-strike,"true"))]',
+
+# '//span[contains(@class, "apexPriceToPay")]/span[1]/text()',
+# '//div[@id="centerCol"]/div[@id="apex_desktop"]/div/div/div[@id="corePrice_desktop"]/div/table/tr/td/span[contains(@class,"a-price") and not(contains(@data-a-strike,"true"))]/span[1]/text()',
+# '//div[@id="centerCol"]/div/div[@id="corePriceDisplay_desktop_feature_div"]/div[1]/span/span/text()',
+# '//*[@id="snsDetailPagePrice"]/span[@id="sns-base-price"]/text()',
+# '//*[@id="corePrice_desktop"]/div/table/tr[2]/td[2]/span[1]/span[1]/text()',
+# '//*[@id="corePriceDisplay_desktop_feature_div"]/div[1]/span[contains(@class,"priceToPay")]/span[1]/text()'
+
+# old price data
+# '//span[contains(@class,"a-price") and @data-a-strike="true"]/span[1]/text()',
\ No newline at end of file
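Each selector key maps to an ordered list of fallback XPaths, and the spider reads them through getElement(selectors[...], response).extract_first(default="NA"). The helper's source is not shown in this diff; the sketch below is an assumption of how such a first-match lookup typically works, not the repo's actual implementation:

    def first_match(xpaths, response):
        # Try each candidate XPath in priority order; return the first
        # non-empty SelectorList so .extract_first(default=...) keeps working.
        selection = None
        for xpath in xpaths:
            selection = response.xpath(xpath)
            if selection:
                return selection
        return selection  # empty (or None for an empty list): caller falls back to its default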
""" - # url = "https://www.amazon.com/Azzaro-Wanted-Eau-Toilette-5-1/dp/B078P7YZ3L/ref=sxin_15_pa_sp_search_thematic_sspa?content-id=amzn1.sym.ee6a664f-a1c5-4f93-a61f-81d41af42efc%3Aamzn1.sym.ee6a664f-a1c5-4f93-a61f-81d41af42efc&crid=HQB58X9PHWMD&cv_ct_cx=dior+sauvage+men&keywords=dior+sauvage+men&pd_rd_i=B078P7YZ3L&pd_rd_r=1e0d974b-6cda-46c9-a707-8bc83fb8491a&pd_rd_w=YoqOE&pd_rd_wg=0Trhw&pf_rd_p=ee6a664f-a1c5-4f93-a61f-81d41af42efc&pf_rd_r=YZTS4H22J6C2NJ9DG4XD&qid=1669453831&sprefix=dio+savage+me%2Caps%2C340&sr=1-2-cbc80bc4-104b-44f8-8e5c-6397d5250496-spons&psc=1&spLa=ZW5jcnlwdGVkUXVhbGlmaWVyPUEyTVVNNFJKQkc4SjdTJmVuY3J5cHRlZElkPUEwMjM4Nzk4SE42S1dMTzlKTVhDJmVuY3J5cHRlZEFkSWQ9QTA3ODA4NzkxMDBGR1FYSEFNWkRIJndpZGdldE5hbWU9c3Bfc2VhcmNoX3RoZW1hdGljJmFjdGlvbj1jbGlja1JlZGlyZWN0JmRvTm90TG9nQ2xpY2s9dHJ1ZQ==" - # self.meta["asin"] = "B078P7YZ3L" - url = "https://www.amazon.com/New-Apple-AirPods-Max-Green/dp/B08PZDSP2Z/ref=sr_1_3?crid=1V8XTXSXHHBI2&keywords=apple+airpods+max&qid=1669453913&sprefix=apple+airpods+max%2Caps%2C335&sr=8-3" - self.meta["asin"] = "B08PZDSP2Z" - # request with category url - yield scrapy.Request(url=url, callback=self.parse_product, - headers=getRandomUAgents(settings.get('UAGENTS'), settings.get('HEADERS')), meta=self.meta) - # yield scrapy.Request(url=self.categoryUrl, callback=self.parse_category, headers = getRandomUAgents( - # settings.get('UAGENTS'), settings.get('HEADERS')), meta=self.meta) - - # def parse_category(self, response): - # ''' - # This method is to extract product pages from given category - - # ''' - - # # check if the Captcha exists. - # if response.css('#captchacharacters').extract_first(): - # self.log("Captcha found") - - # # get products from the category - # products = getElement(selectors["products"], response).getall() - - # for productLink in products: - - # # get asin - # if re.search(r'dp\/(.*)\/', productLink): - # asin = re.search(r'dp\/(.*)\/', productLink).group(1) - # else: - # asin = "" - - # # get current link - # productUrl = urljoin(self.baseUrl, productLink) - - # # get rid of unnecessary query params - # if re.search(r'https:\/\/[^\/]+\/[^\/]+\/dp\/[^\/]+',productUrl): - # realProductlink = re.search(r'https:\/\/[^\/]+\/[^\/]+\/dp\/[^\/]+',productUrl).group(0) - # else: - # realProductlink = "" - - # # get product page if asin: if asin not in self.productLists: self.productLists.append(asin) customMeta = - # copy.deepcopy(self.meta) customMeta['asin'] = asin yield scrapy.Request(url=realProductlink, - # callback=self.parse_product,headers = getRandomUAgents(settings.get('UAGENTS'), settings.get('HEADERS')), - # meta=customMeta) - - # # get next page url nextPage = getElement(selectors["nextPage"], response).extract_first(default="NA") if - # nextPage: nextUrl = urljoin(self.baseUrl, nextPage) yield scrapy.Request(url=nextUrl, - # callback=self.parse_category, headers = getRandomUAgents(settings.get('UAGENTS'), settings.get('HEADERS')), - # meta=self.meta) - - def parse_product(self, response): + test_urls = [ + 'https://www.amazon.com/DreamController-Original-Controller-Compatible-Wireless/dp/B09V37CLLR?th=1', + 'https://www.amazon.com/Razer-Universal-Quick-Charging-Xbox-S/dp/B09DHSJ4SZ', + 'https://www.amazon.com/CableMod-CM-PCSR-FKIT-NKW-R-Cable-Kit-White/dp/B089KPWW3J?th=1', + 
+
+    def parse_category(self, response):
+        '''
+        This method is to extract product pages from given category
+
+        '''
+
+        # check if the Captcha exists.
+        if response.css('#captchacharacters').extract_first():
+            self.log("Captcha found")
+
+        # get products from the category
+        products = getElement(selectors["products"], response).getall()
+
+        for productLink in products:
+
+            # get asin
+            if re.search(r'dp\/(.*)\/', productLink):
+                asin = re.search(r'dp\/(.*)\/', productLink).group(1)
+            else:
+                asin = ""
+
+            # get current link
+            productUrl = urljoin(self.baseUrl, productLink)
+
+            # get rid of unnecessary query params
+            if re.search(r'https:\/\/[^\/]+\/[^\/]+\/dp\/[^\/]+',productUrl):
+                realProductlink = re.search(r'https:\/\/[^\/]+\/[^\/]+\/dp\/[^\/]+',productUrl).group(0)
+            else:
+                realProductlink = ""
+
+            # get product page
+            if asin:
+                if asin not in self.productLists:
+                    self.productLists.append(asin)
+                    customMeta = copy.deepcopy(self.meta)
+                    customMeta['asin'] = asin
+                    yield scrapy.Request(url=realProductlink, callback=self.parse_product,headers = getRandomUAgents(settings.get('UAGENTS'), settings.get('HEADERS')),meta=customMeta, cb_kwargs = {"isProduct":True})
+
+        # get next page url
+        nextPage = getElement(selectors["nextPage"], response).extract_first(default="NA")
+        if nextPage:
+            nextUrl = urljoin(self.baseUrl, nextPage)
+            yield scrapy.Request(url=cleanUrl(nextUrl), callback=self.parse_category, headers = getRandomUAgents(settings.get('UAGENTS'), settings.get('HEADERS')),meta=self.meta)
+
+    def parse_product(self, response, isProduct = False):
         """
         This method is to extract data from product page.
         """
-        # try:
-        #     with open('response.html', 'w', encoding='utf-8') as file:
-        #         file.write(response.body.decode('utf-8'))
-        #     file.close()
-        # except Exception:
-        #     print(Exception)
-
         # check if the recaptcha exists.
         if response.css('#captchacharacters').extract_first():
             self.log("Captcha found ")
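Both start_requests and the variant loop below lean on the same 10-character ASIN pattern. Here is the expression from the dev branch run standalone (worked example, not part of the patch); one caveat: it matches the first slash followed by ten uppercase letters/digits anywhere in the URL, not specifically the segment after /dp/:

    import re

    url = 'https://www.amazon.com/Razer-Universal-Quick-Charging-Xbox-S/dp/B09DHSJ4SZ'
    match = re.search(r'\/[0-9A-Z]{10}', url)
    asin = match.group(0)[1:] if match else ""   # strip the leading slash
    print(asin)  # B09DHSJ4SZ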
@@ -196,6 +208,20 @@ def parse_product(self, response):
         # price
         Item["price"] = getElement(selectors["price"], response).extract_first(default="NA")
         Item["oldPrice"] = getElement(selectors["oldPrice"], response).extract_first(default="NA")
+        discountTypeList = getElement(selectors["discountType"], response).getall()
+
+        if Item["price"] != "NA" and Item["oldPrice"] != "NA":
+
+            if len(discountTypeList) > 1:
+                discountType = discountTypeList[1]
+            else:
+                discountType = "Fixed"
+        else:
+            discountType = "NA"
+        if '%' in discountType:
+            discountType = "Percent"
+
+        Item["discountType"] = discountType
 
         # productProcessTime
         Item["productProcessTime"] = round(response.meta.get('download_latency'), 2)
@@ -205,17 +231,76 @@ def parse_product(self, response):
         Item["productProcessSize"] = round(len(response.body) / 1024, 2)
 
         # other variants
+
+        if isProduct:
+            variantId = str(uuid.uuid5(uuid.NAMESPACE_DNS, response.meta['asin']))
+        else:
+            variantId = response.meta["variantId"]
+
+        variantGroups = getElement(selectors["variantGroups"], response)
         variants = getElement(selectors["variants"], response).getall()
-        base_variant_url = response.url.split("/dp/", 1)[0]
-        for variant in variants:
-            if variant != response.meta['asin']:
-                self.productLists.append(variant)
-                customMeta = copy.deepcopy(self.meta)
-                customMeta['asin'] = variant
-                url = base_variant_url + "/dp/" + variant
-                yield scrapy.Request(url=url, callback=self.parse_product,
-                                     headers=getRandomUAgents(settings.get('UAGENTS'), settings.get('HEADERS')),
-                                     meta=customMeta)
+        variantPrices = getElement(selectors["variantPrice"], response).getall()
+
+        if len(variantPrices) <2 and len(variantGroups) < 2:
+            variantId = "NA"
+            print('HERE?????')
+            print(len(variantPrices))
+            print(len(variantGroups))
+
+        #variantId
+        try:
+            if response.meta["variantId"] != "NA":
+                Item["variant"] = {
+                    "variantId": response.meta["variantId"],
+                    "variantName": response.meta["variantName"]
+                }
+        except Exception as inst:
+            if len(variantPrices) > 1:
+                variantName = response.xpath('//li[@data-defaultasin="'+Item['productLocalId']+'"]' + selectors["variantName"][0]).get()
+                Item["variant"] = {
+                    "variantId": variantId,
+                    "variantName": variantName
+                }
+            if len(variantGroups) > 1:
+                variantName = "Many Variants"
+                Item["variant"] = {
+                    "variantId": variantId,
+                    "variantName": variantName
+                }
+
+        for temp_variant in variants:
+            r = re.search(r'\/[A-Z0-9]{10}\/',temp_variant)
+            if r is not None:
+                variant = r.group(0)
+                variant = variant[1:-1]
+            else:
+                r = re.search(r',[A-Z0-9]{10}',temp_variant)
+                if r is not None:
+                    variant = r.group(0)
+                    variant = variant[1:]
+                else:
+                    variant = ""
+
+            if variant != "" and variant != response.meta['asin']:
+                if variant not in self.productLists:
+                    self.productLists.append(variant)
+                    customMeta = copy.deepcopy(self.meta)
+                    customMeta['asin'] = variant
+
+                    if len(variantGroups) > 1:
+                        variantName = "Many Variants"
+                    else:
+                        variantName = response.xpath('//li[@data-defaultasin="'+variant+'"]' + selectors["variantName"][0]).get(default = "NA")
+                        if variantName == "NA":
+                            variantName = response.xpath('//option[contains(@value,"'+variant+'")]' + selectors["variantName"][1]).get(default = "NA")
+
+                    customMeta["variantId"] = variantId
+                    customMeta["variantName"] = variantName
+                    url = re.sub(r'\/[0-9A-Z]{10}','/'+variant, response.url)
+
+                    yield scrapy.Request(url=cleanUrl(url), callback=self.parse_product,
+                                         headers=getRandomUAgents(settings.get('UAGENTS'), settings.get('HEADERS')),
+                                         meta=customMeta)
 
         yield Item
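The parent product mints a single deterministic group id with uuid5 and forwards it to every variant request via meta, so all variants of one ASIN share an identifier without any extra shared state. Standalone illustration (not part of the patch):

    import uuid

    asin = 'B09V37CLLR'
    group_id = str(uuid.uuid5(uuid.NAMESPACE_DNS, asin))
    # uuid5 is a pure function of (namespace, name), so a re-crawl of the
    # same parent ASIN reproduces the same group id.
    assert group_id == str(uuid.uuid5(uuid.NAMESPACE_DNS, asin))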
diff --git a/extractors/utils.py b/extractors/utils.py
index 3b29478..f976581 100644
--- a/extractors/utils.py
+++ b/extractors/utils.py
@@ -1,4 +1,5 @@
 import random
+import re
 
 def getCategoryName(name):
     name = name.title()
@@ -26,4 +27,26 @@ def getRandomUAgents(agents, headers):
     randIndex = random.randint(0, len(agents)-1)
     headers["'User-Agent'"] = agents[randIndex]
-    return headers
\ No newline at end of file
+    return headers
+
+def cleanUrl(url):
+    try:
+        #detect asin as this type /DHA2423SLA/
+        search_result = re.search(r'https:\/\/.*?\/[0-9A-Z]{10}\/',url)
+
+        if search_result is not None:
+            result = search_result.group(0)
+            result = result[:-1]
+        else:
+            search_result = re.search(r'https:\/\/.*?\/[0-9A-Z]{10}\?',url)
+            if search_result is not None:
+                result = search_result.group(0)
+                result = result[:-1]
+            else:
+                result = url
+
+    except Exception as inst:
+        print(inst)
+        result = url
+
+    return result
\ No newline at end of file
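What cleanUrl does to one of the test URLs above, shown standalone (not part of the patch):

    from extractors.utils import cleanUrl

    url = 'https://www.amazon.com/DreamController-Original-Controller-Compatible-Wireless/dp/B09V37CLLR?th=1'
    print(cleanUrl(url))
    # -> https://www.amazon.com/DreamController-Original-Controller-Compatible-Wireless/dp/B09V37CLLR
    # It truncates right after the first /ASIN/ or ASIN? boundary, dropping ref paths
    # and query strings; URLs with neither pattern (e.g. category pages) pass through unchanged.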