forked from 0916dhkim/new-hack-who-this-project
-
Notifications
You must be signed in to change notification settings - Fork 0
/
scrape.py
193 lines (169 loc) · 5.6 KB
/
scrape.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
from config import config
import spotipy
from spotipy.oauth2 import SpotifyClientCredentials
import pylast
import asyncio
from typing import List
import csv
# Abort at import time if any required credential is absent from the
# configuration. Each entry pairs a config key with the environment variable
# named in the corresponding error message.
for _configKey, _envVar in (
    ("spotifyClientId", "SPOTIFY_CLIENT_ID"),
    ("spotifyClientSecret", "SPOTIFY_CLIENT_SECRET"),
    ("lastFmKey", "LASTFM_API_KEY"),
    ("lastFmSecret", "LASTFM_API_SECRET"),
):
    if config[_configKey] is None:
        raise Exception(f"Environment variable {_envVar} is required.")

# Use spotipy package for managing access tokens.
spotifyCredentials = SpotifyClientCredentials(
    client_id=config["spotifyClientId"],
    client_secret=config["spotifyClientSecret"],
)
# Shared API clients used by the scraping functions in this module.
spotify = spotipy.Spotify(client_credentials_manager=spotifyCredentials)
lastfm = pylast.LastFMNetwork(
    api_key=config["lastFmKey"], api_secret=config["lastFmSecret"]
)
class SpotifyTrack:
    """Identifying details of a single Spotify track.

    Stores the track id, the track title and one artist name exactly as
    supplied by the caller.
    """

    def __init__(self, id: str, title: str, artist: str):
        # `id` shadows the builtin, but it is part of the constructor's
        # keyword interface and is therefore kept as-is.
        self.id = id
        self.title = title
        self.artist = artist

    def __str__(self) -> str:
        """Return a compact ``<[id] title ; artist>`` rendering."""
        return f"<[{self.id}] {self.title} ; {self.artist}>"
class SpotifyFeatures:
    """Plain container for the audio features of one track.

    Each constructor argument is stored unchanged on the attribute of the
    same name. The values are whatever the caller passes in; no validation
    or conversion is performed here.
    """

    def __init__(
        self,
        key,
        mode,
        acousticness,
        danceability,
        energy,
        instrumentalness,
        liveness,
        loudness,
        speechiness,
        valence,
        tempo,
    ):
        self.key = key
        self.mode = mode
        self.acousticness = acousticness
        self.danceability = danceability
        self.energy = energy
        self.instrumentalness = instrumentalness
        self.liveness = liveness
        self.loudness = loudness
        self.speechiness = speechiness
        self.valence = valence
        self.tempo = tempo
class Track:
    """One scraped track: Spotify identity, audio features and tags.

    The three constructor arguments are stored unchanged on attributes of
    the same name.
    """

    def __init__(
        self,
        spotifyTrack: "SpotifyTrack",
        spotifyFeatures: "SpotifyFeatures",
        tags: "List[str]",
    ):
        self.spotifyTrack = spotifyTrack
        self.spotifyFeatures = spotifyFeatures
        self.tags = tags

    def __str__(self) -> str:
        """Return ``<Track | title | artist | tags>`` for progress output."""
        return (
            f"<Track | {self.spotifyTrack.title} | "
            f"{self.spotifyTrack.artist} | {self.tags}>"
        )
# All tracks.
# Module-level accumulator: handleSpotifyTrack appends each scraped Track
# here, and the __main__ block iterates over it when writing the CSV.
allTracks: List[Track] = []
# Get playlists from toplists category.
async def handleTopLists() -> None:
    """Scrape every playlist of the "toplists" category (US, at most 50).

    Fetches the category's playlists and hands each playlist id to
    handlePlaylist.
    """
    # Plain, non-awaited call: it blocks the event loop until it returns.
    res = spotify.category_playlists("toplists", country="US", limit=50)
    # Skip null entries so that a single bad item cannot abort the whole
    # scrape with a TypeError on item["id"].
    # NOTE(review): presumably the API can return null placeholders for
    # unavailable playlists - confirm against the Spotify Web API docs.
    playlistIds = [
        playlist["id"]
        for playlist in res["playlists"]["items"]
        if playlist is not None
    ]
    await asyncio.gather(
        *(handlePlaylist(playlistId) for playlistId in playlistIds)
    )
# Get tracks in playlist.
async def handlePlaylist(playlistId: str) -> None:
    """Scrape the tracks returned for one playlist.

    Builds a SpotifyTrack (id, name, first artist's name) for every usable
    playlist item and hands each one to handleSpotifyTrack.

    Args:
        playlistId: Spotify id of the playlist to read.
    """
    # NOTE(review): only the items of this single response are processed;
    # if the endpoint is paginated, later pages are not fetched - confirm.
    res = spotify.playlist_tracks(playlistId)

    spotifyTracks = []
    for item in res["items"]:
        track = item["track"] if item else None
        # Skip entries that cannot be looked up later: a null track, a track
        # without an id, or a track without any artist. Previously any of
        # these raised and aborted the entire gather.
        # NOTE(review): presumably these are removed/local tracks - confirm.
        if not track or not track.get("id") or not track.get("artists"):
            continue
        spotifyTracks.append(
            SpotifyTrack(
                track["id"], track["name"], track["artists"][0]["name"]
            )
        )

    await asyncio.gather(
        *(handleSpotifyTrack(track) for track in spotifyTracks)
    )
async def handleSpotifyTrack(spotifyTrack: SpotifyTrack) -> None:
    """Collect features and tags for one track and record the result.

    Tracks for which no audio features are available are dropped silently.
    Otherwise a Track is built, printed as progress output and appended to
    the module-level allTracks list.

    Args:
        spotifyTrack: The track to enrich.
    """
    features = await getSpotifyFeatures(spotifyTrack.id)
    if features is None:
        return
    # Look up tags only after features are known, so no Last.fm request is
    # spent on a track that is about to be discarded.
    tags = getLastFmTags(spotifyTrack.title, spotifyTrack.artist)
    track = Track(spotifyTrack, features, tags)
    print(track)
    allTracks.append(track)
async def getSpotifyFeatures(trackId: str):
    """Fetch the audio features of one track.

    Args:
        trackId: Spotify id of the track.

    Returns:
        A SpotifyFeatures instance, or None when the id is empty/None or the
        response holds no features for the track.
    """
    # Without an id there is nothing to look up; avoid calling the API.
    if not trackId:
        return None

    # Plain, non-awaited call: it blocks the event loop until it returns.
    results = spotify.audio_features([trackId])
    # Tolerate an empty or missing response instead of failing on unpacking.
    res = results[0] if results else None
    if res is None:
        return None

    # Order matches the positional parameters of SpotifyFeatures.__init__.
    featureNames = (
        "key",
        "mode",
        "acousticness",
        "danceability",
        "energy",
        "instrumentalness",
        "liveness",
        "loudness",
        "speechiness",
        "valence",
        "tempo",
    )
    return SpotifyFeatures(*(res[name] for name in featureNames))
def getLastFmTags(title: str, artist: str) -> List[str]:
    """Return the names of a track's top Last.fm tags.

    Args:
        title: Track title.
        artist: Artist name.

    Returns:
        A list of tag names, or an empty list if the lookup fails for any
        reason.
    """
    try:
        topTags = lastfm.get_track(artist, title).get_top_tags()
        return [tag.item.name for tag in topTags]
    except Exception:
        # Deliberate best-effort: tags are optional enrichment, so a failed
        # lookup must not abort the scrape of the remaining tracks.
        return []
def writeTracksCsv(path, tracks):
    """Write scraped tracks to a CSV file.

    The file has a header row followed by one row per track. Non-numeric
    fields are quoted and rows end with a bare newline. The "tags" column
    receives the tags object as-is, so the csv module writes its str() form.

    Args:
        path: Destination file path; an existing file is overwritten.
        tracks: Iterable of objects exposing spotifyTrack, spotifyFeatures
            and tags attributes.
    """
    # Single source of truth for the feature columns, so the header and the
    # row contents cannot drift apart.
    featureNames = (
        "key",
        "mode",
        "acousticness",
        "danceability",
        "energy",
        "instrumentalness",
        "liveness",
        "loudness",
        "speechiness",
        "valence",
        "tempo",
    )
    fieldnames = ["spotify_id", "title", "artist", *featureNames, "tags"]

    def toRow(track):
        # Flatten one track into a dict keyed by CSV column name.
        row = {
            "spotify_id": track.spotifyTrack.id,
            "title": track.spotifyTrack.title,
            "artist": track.spotifyTrack.artist,
            "tags": track.tags,
        }
        for name in featureNames:
            row[name] = getattr(track.spotifyFeatures, name)
        return row

    with open(path, "w", newline="", encoding="utf-8") as csvfile:
        writer = csv.DictWriter(
            csvfile,
            fieldnames=fieldnames,
            quoting=csv.QUOTE_NONNUMERIC,
            lineterminator="\n",
        )
        writer.writeheader()
        writer.writerows(toRow(track) for track in tracks)


if __name__ == "__main__":
    # Scrape first; the output file is only opened once scraping finished.
    asyncio.run(handleTopLists())
    writeTracksCsv("scraped.csv", allTracks)