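# everynoise.py
# Scrapes everynoise.com's "New Releases by Genre" page for every region in the site's
# "region" drop-down, saves one raw HTML dump per country, writes the scraped album rows
# to a newline-delimited JSON file, and backs both up to the S3 bucket configured below.
# Usage: python everynoise.py (boto3 picks up AWS credentials from the shell environment
# or shared credentials file).
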
import scrapy
from scrapy.crawler import CrawlerProcess
import json
import io
import time
from datetime import datetime
import os
from urllib.parse import parse_qs, urlparse
import boto3
import logging

# enable logging
logging.basicConfig(level=logging.INFO, filename="everynoise_newreleases_logs.log", filemode="a", format="%(asctime)s - %(levelname)s - %(name)s - %(message)s")

bucket_name = "uvt-streaming-data"
bucket_dir = 'everynoise/new-releases/'

# function to back up a file to S3
def uploadToS3(filepath, filename):
    logging.info("Trying to upload file to S3... %s", filename)
    # uploads the file via a managed uploader, which will split up large files automatically and upload in parallel
    try:
        s3.upload_file(filepath, bucket_name, filename)
        logging.info("Upload to S3 OK.")
        return True
    except boto3.exceptions.S3UploadFailedError:
        logging.critical("Upload to S3 ERROR.", exc_info=True)
        return False

# function to move files to errorsDirectory when an error is raised
def moveFile(filepath, filename):
    try:
        os.rename(filepath, errorsDirectory + "/" + filename)
        logging.error("File moved to /errors: %s", filepath)
    except OSError:
        logging.critical("Can't move file: %s", filepath, exc_info=True)
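
# Spider: parse() walks the "region" drop-down on the landing page and issues one request per
# country; parse_page() dumps that country's raw HTML and yields one item per album card.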
class EveryNoiseSpider(scrapy.Spider):
    name = "releases"
    start_urls = ['http://everynoise.com/new_releases_by_genre.cgi?region=']

    def parse(self, response):
        for region in response.xpath('//select[@name="region"]/option'):
            # reconstruct the url using the country code obtained from the "region" drop-down menu
            final_url = self.start_urls[0] + region.css('option::attr(value)').get() + "&albumsonly=&style=cards&date=&genre=anygenre&artistsfrom="
            # final_url = self.start_urls[0] + "US" + "&albumsonly=&style=cards&date=&genre=anygenre&artistsfrom="  # Uncomment for US only - use it to debug
            yield scrapy.Request(final_url, callback=self.parse_page)

    def parse_page(self, response):
        logging.info("Crawling started...")
        # retrieve date from "date" drop-down menu
        everyNoiseDate = response.xpath('//select[@name="date"]/option[@selected]/text()').get()
        # retrieve country code from the current url parameter
        parsed_url = urlparse(response.request.url)
        countryCode = parse_qs(parsed_url.query)['region'][0]  # the list contains only 1 item, the current country code
        with open(htmlDirectory + '/page_' + runDate + '_' + countryCode + '.html', 'wb') as html_file:
            html_file.write(response.body)
            files_to_handle.append(os.path.basename(html_file.name))  # add html_file filename to files_to_handle list
        for albumrow in response.css('div.albumrow'):
            # if a:nth-child(3) contains no text, then the album name is in the child i
            if albumrow.css('a:nth-child(3)::text').get() is None:
                albumName = albumrow.css('a > i::text').get()
            else:
                albumName = albumrow.css('a:nth-child(3)::text').get()
            yield {
                'countryCode': countryCode,
                'trackId': albumrow.css('span.play::attr(trackid)').get(),
                'artistId': albumrow.css('a::attr(href)').extract()[0],
                'rank': albumrow.css('a::attr(title)').extract()[0],
                'artistName': albumrow.css('a > b::text').get(),
                'albumId': albumrow.css('a::attr(href)').extract()[1],
                'albumName': albumName,
                'scrapeUnix': runUnix,
                'scrapeDate': runDate,
                'everyNoiseDate': everyNoiseDate,
            }
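
# Everything below runs at module level: collect the scraped items via the pipeline, prepare
# output directories and timestamps, run the (blocking) crawl, then write the JSON output and
# back both the JSON and the HTML dumps up to S3.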

# define list to collect all items
items = []

# pipeline that appends every scraped item to the module-level items list
class EveryNoisePipeline(object):
    def __init__(self):
        self.ids_seen = set()  # currently unused

    def process_item(self, item, spider):
        items.append(item)
        return item  # return the item so Scrapy (and any later pipelines) can keep processing it

# crawler settings: register the item pipeline (referenced through this file's module name) and set Scrapy's own log level
process = CrawlerProcess(settings={
    'ITEM_PIPELINES': {'everynoise.EveryNoisePipeline': 300},
    'LOG_LEVEL': 'INFO',
})

# define directories
directory = "output"
htmlDirectory = "html_dumbs"
errorsDirectory = "errors"

# create the directories if they do not exist yet
for dirname in (directory, htmlDirectory, errorsDirectory):
    os.makedirs(dirname, exist_ok=True)

# define timestamps
runUnix = int(time.time())
runDate = datetime.now().strftime("%Y%m%d")

# define empty list for file names that need to be handled (uploaded) after the crawl
files_to_handle = []

# create an S3 client (credentials are configured from the shell / environment)
s3 = boto3.client("s3")

# launch the spider
process.crawl(EveryNoiseSpider)
process.start()  # the script will block here until the crawling is finished

# write output file
with io.open(directory + "/everynoise_newreleases_" + runDate + ".json", "w", encoding="UTF-8") as json_output:
    for item in items:  # loop through objects to add new lines between them
        json.dump(item, json_output, ensure_ascii=False)
        json_output.write("\n")  # add new line for the next object
    logging.info("File written.")
    files_to_handle.append(os.path.basename(json_output.name))  # append json_output filename to files_to_handle list

# upload all json files to S3
for file in os.scandir(directory):
    if file.name.endswith(".json") and file.name in files_to_handle:  # only upload files from the current crawl
        uploadResult = uploadToS3(file.path, bucket_dir + file.name)
        if uploadResult is False:
            moveFile(file.path, file.name)  # move file to errorsDirectory if an error is raised
logging.info("All done with json files.")

# upload all html files from htmlDirectory to S3
for file in os.scandir(htmlDirectory):
    if file.name.endswith(".html") and file.name in files_to_handle:  # only upload files from the current crawl
        uploadResult = uploadToS3(file.path, bucket_dir + "html_dumbs/" + file.name)
        if uploadResult is False:
            moveFile(file.path, file.name)  # move file to errorsDirectory if an error is raised
logging.info("All done with html files.")