-
Notifications
You must be signed in to change notification settings - Fork 1
/
scrap.py
76 lines (64 loc) · 2.94 KB
/
scrap.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
from lxml import html
import csv,os,json
import requests
from builtins import ValueError
from time import sleep
import sys
def _extract_product_fields(doc):
    """Pull the product fields out of a parsed Amazon page.

    Args:
        doc: Parsed page; any object exposing an lxml-style ``xpath(expr)``
            method that returns a list of text nodes.

    Returns:
        dict with keys NAME, SALE_PRICE, CATEGORY, ORIGINAL_PRICE,
        AVAILABILITY and CONTENT. A field whose XPath matches nothing is
        None, except ORIGINAL_PRICE which falls back to SALE_PRICE.
    """
    raw_name = doc.xpath('//h1[@id="title"]//text()')
    raw_sale_price = doc.xpath(
        '//span[contains(@id,"ourprice") or contains(@id,"saleprice")]/text()')
    raw_original_price = doc.xpath(
        '//td[contains(text(),"List Price") or contains(text(),"M.R.P") '
        'or contains(text(),"Price")]/following-sibling::td/text()')
    raw_category = doc.xpath('//a[@class="a-link-normal a-color-tertiary"]//text()')
    raw_availability = doc.xpath('//div[@id="availability"]//text()')
    raw_content = doc.xpath(
        '//div[@id="bookDescription_feature_div"]//noscript//text()')

    # Name and sale price have their internal whitespace runs collapsed to
    # single spaces; the remaining fields are only stripped at the ends.
    name = ' '.join(''.join(raw_name).split()) if raw_name else None
    sale_price = ' '.join(''.join(raw_sale_price).split()).strip() if raw_sale_price else None
    category = ' > '.join([part.strip() for part in raw_category]) if raw_category else None
    original_price = ''.join(raw_original_price).strip() if raw_original_price else None
    availability = ''.join(raw_availability).strip() if raw_availability else None
    content = ''.join(raw_content).strip() if raw_content else None

    # No list price on the page: report the sale price as the original price.
    if not original_price:
        original_price = sale_price

    return {
        'NAME': name,
        'SALE_PRICE': sale_price,
        'CATEGORY': category,
        'ORIGINAL_PRICE': original_price,
        'AVAILABILITY': availability,
        'CONTENT': content,
    }


def AmzonParser(url, max_attempts=5, retry_delay=3, timeout=30):
    """Download a product page and return its book description.

    The page is fetched again on every attempt, so a transient failure
    (network error, non-200 response, unparsable body) can actually recover.
    Failures are reported on stderr so that stdout carries only the data
    printed by the caller.

    Args:
        url: Product page URL.
        max_attempts: Maximum number of fetch attempts before giving up.
        retry_delay: Seconds to wait between two attempts.
        timeout: Seconds to wait for the server before an attempt fails.

    Returns:
        ``{'CONTENT': <description text or None>}`` on success, or None when
        every attempt failed.
    """
    headers = {'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/42.0.2311.90 Safari/537.36'}

    for attempt in range(max_attempts):
        if attempt:
            # Back off only between attempts; the first one runs immediately.
            sleep(retry_delay)
        try:
            page = requests.get(url, headers=headers, timeout=timeout)
            # Check the status before parsing so an error/captcha page is
            # never mistaken for a product page.
            if page.status_code != 200:
                raise ValueError(
                    'unexpected HTTP status %d (possibly a captcha page)'
                    % page.status_code)
            fields = _extract_product_fields(html.fromstring(page.content))
            # Only the description is exposed; the other extracted fields
            # are currently not part of the result.
            return {'CONTENT': fields['CONTENT']}
        except Exception as e:
            # Retry boundary: report whatever went wrong and try again.
            sys.stderr.write('%s\n' % e)

    return None
def ReadAsin(asin):
    """Scrape the product page of one ASIN and print the extracted record.

    The record returned by ``AmzonParser`` is written to stdout and the
    stream is flushed right away.

    Args:
        asin: Product identifier; it is converted with ``str`` and appended
            to the product URL prefix.
    """
    product_url = "http://www.amazon.com/dp/" + str(asin)
    record = AmzonParser(product_url)
    # Fixed pause after the request, before the result is emitted.
    sleep(5)
    print(record)
    sys.stdout.flush()
if __name__ == "__main__":
    # The ASIN to look up is the single required command-line argument.
    # Exit with a usage message (stderr, status 1) instead of an IndexError
    # traceback when it is missing.
    if len(sys.argv) < 2:
        sys.exit("usage: %s <ASIN>" % sys.argv[0])
    ReadAsin(sys.argv[1])