-
Notifications
You must be signed in to change notification settings - Fork 16
/
cc.py
56 lines (48 loc) · 1.51 KB
/
cc.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
import argparse
import json
import queue
import sys
import threading

import requests
class commonCrawlDataClass():
    """Collect the URLs Common Crawl has indexed for a domain.

    The list of index collections is fetched once, every collection ID is
    pushed onto a queue, and a pool of daemon threads queries each
    collection for ``<domain>/*``. Each URL is printed to stdout the first
    time it is seen and appended to ``self.domains``.

    Args:
        domain: Domain to look up, e.g. ``"test.com"``.
        threadCount: Number of worker threads started by ``start()``.
        timeout: Per-request timeout in seconds, so a stalled connection
            cannot block the run forever.
    """

    def __init__(self, domain, threadCount=10, timeout=60):
        self.jsonIndexData = ""
        self.domain = domain
        self.domains = []
        self.q = queue.Queue()
        self.threadCount = threadCount
        self.timeout = timeout
        # Set mirror of self.domains for O(1) duplicate checks.
        self._seen = set()
        # Guards _seen, domains and the print so workers cannot race.
        self._lock = threading.Lock()

    def getIndexes(self):
        """Fetch the collection list and queue every collection ID.

        Raises:
            requests.RequestException: If the list cannot be downloaded.
            ValueError: If the response body is not valid JSON.
        """
        indexURL = "https://index.commoncrawl.org/collinfo.json"
        r = requests.get(indexURL, timeout=self.timeout)
        r.raise_for_status()
        self.jsonIndexData = json.loads(r.text)
        for index in self.jsonIndexData:
            self.q.put(index['id'])

    def getIndexData(self, indexID):
        """Query one collection for the domain and record the URLs found.

        Best effort: a failed request is reported on stderr and skipped so
        that one unreachable collection does not stop the others.
        """
        commonCrawlURL = "https://index.commoncrawl.org/" + indexID + "-index"
        # Passed as params so requests percent-encodes the domain.
        query = {"url": self.domain + "/*", "output": "json"}
        try:
            r = requests.get(commonCrawlURL, params=query, timeout=self.timeout)
        except requests.RequestException as exc:
            print("[!] " + str(indexID) + ": " + str(exc), file=sys.stderr)
            return
        self._recordUrls(r.text)

    def _recordUrls(self, text):
        """Parse newline-delimited JSON records and store unseen URLs.

        Blank lines, lines that are not valid JSON and records without a
        string ``url`` field are skipped individually, so one bad line
        does not discard the rest of the response.

        Returns:
            The URLs newly added by this call, in input order.
        """
        added = []
        for line in text.split("\n"):
            line = line.strip()
            if not line:
                continue
            try:
                url = json.loads(line)['url']
            except (ValueError, KeyError, TypeError):
                continue
            if not isinstance(url, str):
                continue
            with self._lock:
                if url in self._seen:
                    continue
                self._seen.add(url)
                self.domains.append(url)
                print(url)
            added.append(url)
        return added

    def worker(self):
        """Process queued collection IDs forever (run as a daemon thread)."""
        while True:
            indexID = self.q.get()
            try:
                self.getIndexData(indexID)
            except Exception as exc:
                # Keep the thread alive; a dead worker would leave items
                # unprocessed and q.join() would never return.
                print("[!] " + str(indexID) + ": " + repr(exc), file=sys.stderr)
            finally:
                # Always mark the item done, even on failure.
                self.q.task_done()

    def start(self):
        """Queue all collections, start the workers and wait until done."""
        self.getIndexes()
        # At least one worker, otherwise q.join() below would hang.
        for _ in range(max(1, self.threadCount)):
            t = threading.Thread(target=self.worker)
            t.daemon = True
            t.start()
        self.q.join()
def main(argv=None):
    """Command-line entry point: print the indexed URLs for a domain.

    Args:
        argv: Argument list to parse; ``None`` means ``sys.argv[1:]``.

    Returns:
        Process exit status: 0 on success, 2 when no domain was given,
        130 when interrupted with Ctrl-C.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("-d","--domain", help="Domain Name; EX: test.com")
    args = parser.parse_args(argv)
    if not args.domain:
        # Say why nothing happened instead of exiting silently.
        parser.print_usage(sys.stderr)
        print("error: a domain is required (-d/--domain)", file=sys.stderr)
        return 2
    cc = commonCrawlDataClass(args.domain)
    try:
        cc.start()
    except KeyboardInterrupt:
        # Workers are daemon threads, so returning here ends the process.
        return 130
    return 0


# Guarded so importing this module does not parse arguments or crawl.
if __name__ == "__main__":
    sys.exit(main())