Merge pull request #2 from leostech/test
update
ai-to-ai authored Dec 4, 2022
2 parents 8970809 + a64a844 commit 594d793
Showing 15 changed files with 237 additions and 84 deletions.
Binary file modified Raw DB Tasks/__pycache__/settings.cpython-310.pyc
Binary file modified __pycache__/config.cpython-310.pyc
Binary file modified extractors/__pycache__/__init__.cpython-310.pyc
Binary file modified extractors/__pycache__/items.cpython-310.pyc
Binary file modified extractors/__pycache__/pipelines.cpython-310.pyc
Binary file modified extractors/__pycache__/settings.cpython-310.pyc
Binary file modified extractors/__pycache__/utils.cpython-310.pyc
2 changes: 2 additions & 0 deletions extractors/items.py
@@ -21,3 +21,5 @@ class MarketItem(scrapy.Item):
oldPrice = scrapy.Field()
productProcessTime = scrapy.Field()
productProcessSize = scrapy.Field()
variant = scrapy.Field()
discountType = scrapy.Field()
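
The two new fields travel on MarketItem alongside the existing ones; a minimal sketch of how the spider code below populates them (illustrative values, not taken from this diff):

item = MarketItem()
# variant groups related ASINs under a shared, deterministic id (see parse_product below)
item["variant"] = {"variantId": "3b8f...-uuid5-of-asin", "variantName": "Green"}
# one of "Percent", "Fixed", or "NA" (see the discountType logic in the spider)
item["discountType"] = "Percent"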
18 changes: 15 additions & 3 deletions extractors/pipelines.py
@@ -79,6 +79,10 @@ def process_item(self, item, spider):
productToSave["productLocalId"] = product["productLocalId"]
productToSave["productProcessTime"] = product["productProcessTime"]
productToSave["productProcessSize"] = product["productProcessSize"]
try:
    productToSave["productVariants"] = product["variant"]
except KeyError:
    # not every product page yields variant data
    print("no variant")

# add necessary data related to collections.
productToSave["lastUpdate"] = datetime.timestamp(datetime.now())
@@ -112,13 +116,21 @@ def process_item(self, item, spider):
try:
oldPrice = float(sub(r'[^\d.]', '', product["oldPrice"]))
currentPrice = float(sub(r'[^\d.]', '', product["price"]))
discountValue = 100 - currentPrice * 100 / oldPrice or 0
price["productDiscountValue"] = float(f'{discountValue:.2f}')
if product["discountType"] == "Percent":
discountValue = 100 - currentPrice * 100 / oldPrice or 0
elif product["discountType"] == "Fixed":
discountValue = oldPrice - currentPrice

discountValue = int(discountValue)
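# worked example (illustrative): oldPrice=200.0, currentPrice=150.0
#   "Percent" -> 100 - 150*100/200 = 25    "Fixed" -> 200 - 150 = 50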
price["productDiscount"] = {
"productDiscountValue" : discountValue,
"productDiscountType" : product["discountType"]
}
price["productOldPrice"] = oldPrice
except Exception as inst:
print(inst)
price["productOldPrice"] = float(format(0, '.2f'))
price["productDiscountValue"] = float(format(0, '.2f'))
price["productDiscount"] = {}

self.productPriceHistoryCollection.insert_one(price)
return item
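
With this change the pipeline nests the discount under productDiscount instead of the old flat productDiscountValue; a representative price-history document shape (illustrative values; the remaining fields are set outside the hunks shown):

price = {
    "productOldPrice": 200.0,
    "productDiscount": {
        "productDiscountValue": 25,
        "productDiscountType": "Percent",
    },
}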
Binary file modified extractors/selectors/__pycache__/amazon.cpython-310.pyc
53 changes: 42 additions & 11 deletions extractors/selectors/amazon.py
@@ -20,20 +20,51 @@
"userRatingCount": ['//span[@id="acrCustomerReviewText"]/text()'],
"userRatingStar": ['//span[@id="acrPopover"]/@title'],
"price": [
# '//span[contains(@class,"a-price")]/span[1]/text()', '//div[@id="centerCol"]/div[
# @id="apex_desktop"]/div/div/div[@id="corePrice_desktop"]/div/table/tr/td/span',
# '//div[@id="centerCol"]/div[@id="apex_desktop"]/div/div/div[@id="corePrice_desktop"]/div/table/tr/td/span[
# contains(@class,"a-price") and not(contains(@data-a-strike,"true"))]',
'//div[@id="centerCol"]/div[@id="apex_desktop"]/div/div/div[@id="corePrice_desktop"]/div/table/tr/td/span['
'contains(@class,"a-price") and not(contains(@data-a-strike,"true"))]/span[1]/text()',
'//div[@id="centerCol"]/div/div[@id="corePriceDisplay_desktop_feature_div"]/div[1]/span/span/text()'
'//div[@id="centerCol"]/div[@id="apex_desktop"]/div/div/div[@id="corePrice_desktop"]/div/table/tr/td/span[contains(@class,"a-price") and not(contains(@data-a-strike,"true"))]/span[1]/text()',
'//*[@id="corePriceDisplay_desktop_feature_div"]/div[1]/span/span[1]/text()',
'//span[contains(@class, "priceToPay")]/span[1]/text()',
'//*[@id="snsDetailPagePrice"]/span[@id="sns-base-price"]/text()',
'//*[@id="priceblock_ourprice"]/text()',
'//*[@id="corePrice_desktop"]/div/table/tr[2]/td[2]/span[1]/span[1]/text()'
],
"oldPrice": [
'//div[@id="centerCol"]/div[@id="apex_desktop"]/div/div/div[@id="corePrice_desktop"]/div/table/tr/td/span['
'contains(@class,"a-price") and @data-a-strike="true"]/span[1]/text()',
# '//span[contains(@class,"a-price") and @data-a-strike="true"]/span[1]/text()',
'//div[@id="centerCol"]/div[@id="apex_desktop"]/div/div/div[@id="corePrice_desktop"]/div/table/tr/td/span[contains(@class,"a-price") and @data-a-strike="true"]/span[1]/text()',
'//*[@id="corePrice_desktop"]/div/table/tr[1]/td[2]/span[@data-a-strike="true"]/span[1]/text()'

],
"discountType":[
'//*[@id="savingsPercentage"]/text()',
'//*[@id="corePrice_desktop"]/div/table/tr[3]/td[2]/span[1]/text()',

],
"variants": [
'//li[@data-defaultasin]/@data-defaultasin',
'//li[@data-defaultasin]/@data-dp-url',
'//option[@class="dropdownAvailable"]/@value'
],
"variantName":[
'//div[contains(@class,"twisterTextDiv")]/p/text()',
'/@data-a-html-content'
],
'variantPrice':[
'//p[contains(@class,"twisterSwatchPrice")]/text()'
],
'variantGroups':[
'//form[@id="twister"]/div[contains(@id,"variation_")]'
]
}

#price data
# '//span[contains(@class,"a-price")]/span[1]/text()', '//div[@id="centerCol"]/div[
# @id="apex_desktop"]/div/div/div[@id="corePrice_desktop"]/div/table/tr/td/span',
# '//div[@id="centerCol"]/div[@id="apex_desktop"]/div/div/div[@id="corePrice_desktop"]/div/table/tr/td/span[
# contains(@class,"a-price") and not(contains(@data-a-strike,"true"))]',

# '//span[contains(@class, "apexPriceToPay")]/span[1]/text()',
# '//div[@id="centerCol"]/div[@id="apex_desktop"]/div/div/div[@id="corePrice_desktop"]/div/table/tr/td/span[contains(@class,"a-price") and not(contains(@data-a-strike,"true"))]/span[1]/text()',
# '//div[@id="centerCol"]/div/div[@id="corePriceDisplay_desktop_feature_div"]/div[1]/span/span/text()',
# '//*[@id="snsDetailPagePrice"]/span[@id="sns-base-price"]/text()',
# '//*[@id="corePrice_desktop"]/div/table/tr[2]/td[2]/span[1]/span[1]/text()',
# '//*[@id="corePriceDisplay_desktop_feature_div"]/div[1]/span[contains(@class,"priceToPay")]/span[1]/text()'

# old price data
# '//span[contains(@class,"a-price") and @data-a-strike="true"]/span[1]/text()',
Binary file modified extractors/spiders/__pycache__/__init__.cpython-310.pyc
Binary file modified extractors/spiders/__pycache__/newegg.cpython-310.pyc
223 changes: 154 additions & 69 deletions extractors/spiders/amazon.py
@@ -4,7 +4,7 @@
from scrapy.utils.project import get_project_settings

from extractors.items import MarketItem
from extractors.utils import getCategoryName, getElement, getRandomUAgents
from extractors.utils import getCategoryName, getElement, getRandomUAgents, cleanUrl
from extractors.selectors.amazon import selectors

from dataclasses import asdict
@@ -13,6 +13,7 @@
from urllib.parse import urljoin
from urllib.parse import unquote
import copy
import uuid

import random

@@ -24,6 +25,9 @@ class AmazonSpider(scrapy.Spider):

baseUrl = "https://www.amazon.com"

env = "dev"
# env = "prod"

# custom_settings = {
# 'CONCURRENT_REQUESTS':30,
# 'DOWNLOAD_DELAY': requestInterval
@@ -34,68 +38,76 @@ def start_requests(self):
This method is to get content of given category url.
"""
# url = "https://www.amazon.com/Azzaro-Wanted-Eau-Toilette-5-1/dp/B078P7YZ3L/ref=sxin_15_pa_sp_search_thematic_sspa?content-id=amzn1.sym.ee6a664f-a1c5-4f93-a61f-81d41af42efc%3Aamzn1.sym.ee6a664f-a1c5-4f93-a61f-81d41af42efc&crid=HQB58X9PHWMD&cv_ct_cx=dior+sauvage+men&keywords=dior+sauvage+men&pd_rd_i=B078P7YZ3L&pd_rd_r=1e0d974b-6cda-46c9-a707-8bc83fb8491a&pd_rd_w=YoqOE&pd_rd_wg=0Trhw&pf_rd_p=ee6a664f-a1c5-4f93-a61f-81d41af42efc&pf_rd_r=YZTS4H22J6C2NJ9DG4XD&qid=1669453831&sprefix=dio+savage+me%2Caps%2C340&sr=1-2-cbc80bc4-104b-44f8-8e5c-6397d5250496-spons&psc=1&spLa=ZW5jcnlwdGVkUXVhbGlmaWVyPUEyTVVNNFJKQkc4SjdTJmVuY3J5cHRlZElkPUEwMjM4Nzk4SE42S1dMTzlKTVhDJmVuY3J5cHRlZEFkSWQ9QTA3ODA4NzkxMDBGR1FYSEFNWkRIJndpZGdldE5hbWU9c3Bfc2VhcmNoX3RoZW1hdGljJmFjdGlvbj1jbGlja1JlZGlyZWN0JmRvTm90TG9nQ2xpY2s9dHJ1ZQ=="
# self.meta["asin"] = "B078P7YZ3L"
url = "https://www.amazon.com/New-Apple-AirPods-Max-Green/dp/B08PZDSP2Z/ref=sr_1_3?crid=1V8XTXSXHHBI2&keywords=apple+airpods+max&qid=1669453913&sprefix=apple+airpods+max%2Caps%2C335&sr=8-3"
self.meta["asin"] = "B08PZDSP2Z"
# request with category url
yield scrapy.Request(url=url, callback=self.parse_product,
headers=getRandomUAgents(settings.get('UAGENTS'), settings.get('HEADERS')), meta=self.meta)
# yield scrapy.Request(url=self.categoryUrl, callback=self.parse_category, headers = getRandomUAgents(
# settings.get('UAGENTS'), settings.get('HEADERS')), meta=self.meta)

# def parse_category(self, response):
# '''
# This method is to extract product pages from given category

# '''

# # check if the Captcha exists.
# if response.css('#captchacharacters').extract_first():
# self.log("Captcha found")

# # get products from the category
# products = getElement(selectors["products"], response).getall()

# for productLink in products:

# # get asin
# if re.search(r'dp\/(.*)\/', productLink):
# asin = re.search(r'dp\/(.*)\/', productLink).group(1)
# else:
# asin = ""

# # get current link
# productUrl = urljoin(self.baseUrl, productLink)

# # get rid of unnecessary query params
# if re.search(r'https:\/\/[^\/]+\/[^\/]+\/dp\/[^\/]+',productUrl):
# realProductlink = re.search(r'https:\/\/[^\/]+\/[^\/]+\/dp\/[^\/]+',productUrl).group(0)
# else:
# realProductlink = ""

# # get product page if asin: if asin not in self.productLists: self.productLists.append(asin) customMeta =
# copy.deepcopy(self.meta) customMeta['asin'] = asin yield scrapy.Request(url=realProductlink,
# callback=self.parse_product,headers = getRandomUAgents(settings.get('UAGENTS'), settings.get('HEADERS')),
# meta=customMeta)

# # get next page url nextPage = getElement(selectors["nextPage"], response).extract_first(default="NA") if
# nextPage: nextUrl = urljoin(self.baseUrl, nextPage) yield scrapy.Request(url=nextUrl,
# callback=self.parse_category, headers = getRandomUAgents(settings.get('UAGENTS'), settings.get('HEADERS')),
# meta=self.meta)

def parse_product(self, response):
test_urls = [
'https://www.amazon.com/DreamController-Original-Controller-Compatible-Wireless/dp/B09V37CLLR?th=1',
'https://www.amazon.com/Razer-Universal-Quick-Charging-Xbox-S/dp/B09DHSJ4SZ',
'https://www.amazon.com/CableMod-CM-PCSR-FKIT-NKW-R-Cable-Kit-White/dp/B089KPWW3J?th=1',
'https://www.amazon.com/Azzaro-Most-Wanted-Parfum-Fragrance/dp/B09VN2FCDF/?_encoding=UTF8&pd_rd_w=jVQKE&content-id=amzn1.sym.aa5d5fb8-9ab9-46ea-8709-d60f551faa80&pf_rd_p=aa5d5fb8-9ab9-46ea-8709-d60f551faa80&pf_rd_r=F2CTCZ402NYW0D04S2DQ&pd_rd_wg=7duSD&pd_rd_r=f5ad392d-c089-448e-afc3-213f9cefcfc3&ref_=pd_gw_deals_gi'

]
if self.env == "dev":
for url in test_urls:
# self.meta["asin"] = "B08WC2SMSN"
asin = re.search(r'\/[0-9A-Z]{10}',url).group(0)
asin = asin[1:]
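# e.g. ".../dp/B09V37CLLR?th=1": r'\/[0-9A-Z]{10}' matches "/B09V37CLLR",
# and asin[1:] drops the leading slash, leaving the bare ASIN "B09V37CLLR"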
self.meta['asin'] = asin
self.productLists = []
# request with category url
yield scrapy.Request(url=cleanUrl(url), callback=self.parse_product,
headers=getRandomUAgents(settings.get('UAGENTS'), settings.get('HEADERS')), meta=self.meta, cb_kwargs={"isProduct":True})
else:
yield scrapy.Request(url=cleanUrl(self.categoryUrl), callback=self.parse_category, headers = getRandomUAgents(
settings.get('UAGENTS'), settings.get('HEADERS')), meta=self.meta)

def parse_category(self, response):
'''
This method is to extract product pages from given category
'''

# check if the Captcha exists.
if response.css('#captchacharacters').extract_first():
self.log("Captcha found")

# get products from the category
products = getElement(selectors["products"], response).getall()

for productLink in products:

# get asin
if re.search(r'dp\/(.*)\/', productLink):
asin = re.search(r'dp\/(.*)\/', productLink).group(1)
else:
asin = ""

# get current link
productUrl = urljoin(self.baseUrl, productLink)

# get rid of unnecessary query params
if re.search(r'https:\/\/[^\/]+\/[^\/]+\/dp\/[^\/]+',productUrl):
realProductlink = re.search(r'https:\/\/[^\/]+\/[^\/]+\/dp\/[^\/]+',productUrl).group(0)
else:
realProductlink = ""

# get product page
if asin:
if asin not in self.productLists:
self.productLists.append(asin)
customMeta = copy.deepcopy(self.meta)
customMeta['asin'] = asin
yield scrapy.Request(url=realProductlink, callback=self.parse_product,headers = getRandomUAgents(settings.get('UAGENTS'), settings.get('HEADERS')),meta=customMeta, cb_kwargs = {"isProduct":True})

# get next page url
nextPage = getElement(selectors["nextPage"], response).extract_first(default="NA")
if nextPage:
nextUrl = urljoin(self.baseUrl, nextPage)
yield scrapy.Request(url=cleanUrl(nextUrl), callback=self.parse_category, headers = getRandomUAgents(settings.get('UAGENTS'), settings.get('HEADERS')),meta=self.meta)

def parse_product(self, response, isProduct = False):
"""
This method is to extract data from product page.
"""

# try:
# with open('response.html', 'w', encoding='utf-8') as file:
# file.write(response.body.decode('utf-8'))
# file.close()
# except Exception:
# print(Exception)

# check if the recaptcha exists.
if response.css('#captchacharacters').extract_first():
self.log("Captcha found ")
@@ -196,6 +208,20 @@ def parse_product(self, response):
# price
Item["price"] = getElement(selectors["price"], response).extract_first(default="NA")
Item["oldPrice"] = getElement(selectors["oldPrice"], response).extract_first(default="NA")
discountTypeList = getElement(selectors["discountType"], response).getall()

if Item["price"] != "NA" and Item["oldPrice"] != "NA":

if len(discountTypeList) > 1:
discountType = discountTypeList[1]
else:
discountType = "Fixed"
else:
discountType = "NA"
if '%' in discountType:
discountType = "Percent"

Item["discountType"] = discountType

# productProcessTime
Item["productProcessTime"] = round(response.meta.get('download_latency'), 2)
@@ -205,17 +231,76 @@
Item["productProcessSize"] = round(len(response.body) / 1024, 2)

# other variants

if isProduct:
variantId = str(uuid.uuid5(uuid.NAMESPACE_DNS, response.meta['asin']))
else:
variantId = response.meta["variantId"]

variantGroups = getElement(selectors["variantGroups"], response)

variants = getElement(selectors["variants"], response).getall()

base_variant_url = response.url.split("/dp/", 1)[0]
for variant in variants:
if variant != response.meta['asin']:
self.productLists.append(variant)
customMeta = copy.deepcopy(self.meta)
customMeta['asin'] = variant
url = base_variant_url + "/dp/" + variant
yield scrapy.Request(url=url, callback=self.parse_product,
headers=getRandomUAgents(settings.get('UAGENTS'), settings.get('HEADERS')),
meta=customMeta)
variantPrices = getElement(selectors["variantPrice"], response).getall()

if len(variantPrices) < 2 and len(variantGroups) < 2:
    variantId = "NA"

#variantId
try:
if response.meta["variantId"] != "NA":
Item["variant"] = {
"variantId": response.meta["variantId"],
"variantName": response.meta["variantName"]
}
except Exception as inst:
if len(variantPrices) > 1:
variantName = response.xpath('//li[@data-defaultasin="'+Item['productLocalId']+'"]' + selectors["variantName"][0]).get()
Item["variant"] = {
"variantId": variantId,
"variantName": variantName
}
if len(variantGroups) > 1:
variantName = "Many Variants"
Item["variant"] = {
"variantId": variantId,
"variantName": variantName
}
for temp_variant in variants:
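# each entry is either a dp-style URL ("/dp/B08PZDSP2Z/ref=...") matched by
# the first pattern, or a dropdown option value that appears to embed the
# ASIN after a comma ("1,B08PZDSP2Z") matched by the second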
r = re.search(r'\/[A-Z0-9]{10}\/',temp_variant)
if r is not None:
variant = r.group(0)
variant = variant[1:-1]
else:
r = re.search(r',[A-Z0-9]{10}',temp_variant)
if r is not None:
variant = r.group(0)
variant = variant[1:]
else:
variant = ""

if variant != "" and variant != response.meta['asin']:
if variant not in self.productLists:
self.productLists.append(variant)
customMeta = copy.deepcopy(self.meta)
customMeta['asin'] = variant

if len(variantGroups) > 1:
variantName = "Many Variants"
else:
variantName = response.xpath('//li[@data-defaultasin="'+variant+'"]' + selectors["variantName"][0]).get(default = "NA")
if variantName == "NA":
variantName = response.xpath('//option[contains(@value,"'+variant+'")]' + selectors["variantName"][1]).get(default = "NA")

customMeta["variantId"] = variantId
customMeta["variantName"] = variantName
# swap only the first ASIN-shaped segment for the variant's ASIN
url = re.sub(r'\/[0-9A-Z]{10}', '/' + variant, response.url, count=1)

yield scrapy.Request(url=cleanUrl(url), callback=self.parse_product,
headers=getRandomUAgents(settings.get('UAGENTS'), settings.get('HEADERS')),
meta=customMeta)

yield Item
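
A quick way to exercise the dev path (assuming the spider is registered under the name "amazon"; its name attribute sits outside the hunks shown) is simply:

scrapy crawl amazon

With env = "dev" the spider walks the hard-coded test_urls; switching to the commented-out env = "prod" routes it through parse_category on self.categoryUrl instead.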