-
Notifications
You must be signed in to change notification settings - Fork 3
/
GeniusArtistDataCollect.py
153 lines (106 loc) · 5.06 KB
/
GeniusArtistDataCollect.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
import os
import re
import requests
import pandas as pd
import urllib.request
from bs4 import BeautifulSoup
class GeniusArtistDataCollect:
    """Retrieve, clean, and organize the album songs of a given artist.

    Song titles and URLs come from the Genius API; the album name and the
    lyrics of each song are scraped from the song's Genius web page.
    """

    def __init__(self, client_access_token, artist_name, albums, timeout=30):
        """
        Instantiate a GeniusArtistDataCollect object

        :param client_access_token: str - Token to access the Genius API. Create one at https://genius.com/developers
        :param artist_name: str - The name of the artist of interest
        :param albums: list - A list of all the artist's albums to be collected
        :param timeout: float - Seconds to wait on each HTTP request before giving up
        """
        self.client_access_token = client_access_token
        self.artist_name = artist_name
        self.albums = albums
        self.timeout = timeout
        self.base_url = 'https://api.genius.com/'
        self.headers = {'Authorization': 'Bearer ' + self.client_access_token}
        # Filled in by get_artist_songs(); None until that method has run.
        self.artist_songs = None

    def search(self, query):
        """Make a search request to the Genius API and return the JSON response.

        :param query: str - The text to search for
        :return: The decoded JSON body of the response
        """
        request_url = self.base_url + 'search'
        # A GET request carries its arguments in the query string, so the
        # search term goes in ``params`` rather than in a request body.
        response = requests.get(
            request_url,
            params={'q': query},
            headers=self.headers,
            timeout=self.timeout,
        )
        return response.json()

    def get_artist_songs(self):
        """Get the album songs of self.artist_name as a pandas.DataFrame.

        The artist id is taken from the primary artist of the first search
        hit for self.artist_name. Every page of that artist's songs is then
        fetched, keeping only songs whose primary artist is that same id.
        Each kept song's page is scraped for its album name; songs whose
        album is not in self.albums are dropped, and lyrics are extracted
        for the rest.

        :return: pandas.DataFrame with the columns Title, URL, Album,
            InAnAlbum and Lyrics. The same frame is stored in
            self.artist_songs.
        :raises IndexError: if the search for self.artist_name has no hits
        """
        # Search for the artist and get their id
        search_artist = self.search(self.artist_name)
        hits = search_artist['response']['hits']
        if not hits:
            raise IndexError("No Genius search results for artist: " + str(self.artist_name))
        artist_id = str(hits[0]['result']['primary_artist']['id'])
        print("ID: " + artist_id)

        # Iterate through all the pages of the artist's songs. Rows are
        # gathered in a plain list and turned into a DataFrame once, instead
        # of growing the frame one row at a time.
        request_url = self.base_url + 'artists/' + artist_id + '/songs'
        rows = []
        page = 1
        while page is not None:
            print("page: " + str(page))
            response = requests.get(
                request_url,
                params={'per_page': 50, 'page': page},
                headers=self.headers,
                timeout=self.timeout,
            ).json()

            # Keep only the songs whose primary artist is the given artist
            for song in response['response']['songs']:
                if str(song['primary_artist']['id']) == artist_id:
                    rows.append([song['title'], song['url']])

            # The API reports the next page number, or None on the last page
            page = response['response']['next_page']

        df = pd.DataFrame(rows, columns=['Title', 'URL'])

        # Get the HTML, Album Name, and Song Lyrics from helper methods in the class
        df['html'] = df['URL'].apply(self.get_song_html)
        df['Album'] = df['html'].apply(self.get_album_from_html)
        df['InAnAlbum'] = df['Album'].apply(lambda a: self.is_track_in_an_album(a, self.albums))

        # astype(bool) keeps the mask boolean even when the frame is empty;
        # copy() makes the filtered frame independent so adding a column
        # below does not write into a slice of the unfiltered frame.
        df = df[df['InAnAlbum'].astype(bool)].copy()
        df['Lyrics'] = df['html'].apply(self.get_lyrics)
        del df['html']

        self.artist_songs = df
        return self.artist_songs

    def get_song_html(self, url):
        """Download the page at ``url`` and return it parsed with BeautifulSoup.

        :param url: str - Address of the page to scrape
        :return: BeautifulSoup - The parsed document (built with the lxml parser)
        """
        request = urllib.request.Request(url)
        request.add_header("Authorization", "Bearer " + self.client_access_token)
        request.add_header("User-Agent",
                           "curl/7.9.8 (i686-pc-linux-gnu) libcurl 7.9.8 (OpenSSL 0.9.6b) (ipv6 enabled)")
        # The response is parsed inside the ``with`` block so the connection
        # is closed as soon as the document has been read.
        with urllib.request.urlopen(request, timeout=self.timeout) as page:
            html = BeautifulSoup(page, "lxml")
        print("Scraped: " + url)
        return html

    def get_lyrics(self, html):
        """Extract the song lyrics from a parsed Genius page as one string.

        :param html: Parsed page; its first ``div`` with class ``lyrics`` is used
        :return: str - The cleaned lyrics, or '' when the page has no such div
        """
        lyrics = html.find("div", class_="lyrics")
        if lyrics is None:
            return ''

        all_words = lyrics.get_text()

        # Remove identifiers like chorus, verse, etc
        all_words = re.sub(r'[\(\[].*?[\)\]]', '', all_words)

        # Remove empty lines. splitlines() already consumes every '\r', so
        # joining with '\n' yields the same text on every platform.
        all_words = '\n'.join(s for s in all_words.splitlines() if s)

        # Collapse runs of spaces into a single space
        all_words = re.sub(r' {2,}', ' ', all_words)

        return all_words

    def get_album_from_html(self, html):
        """Extract the album name of the song from a parsed Genius page.

        The album name is read from the ``span`` that directly follows the
        first ``span`` whose text is exactly 'Album'.

        :param html: Parsed page to look through
        :return: str - The stripped album name, or '' when there is no
            'Album' span or nothing follows it
        """
        spans = html.find_all("span")
        # Pair every span with its successor; the last span is never used as
        # a label, so a trailing 'Album' span cannot index past the end.
        for label, value in zip(spans, spans[1:]):
            if label.text == 'Album':
                return value.text.strip()
        return ''

    def is_track_in_an_album(self, album, albums_list):
        """Return True when ``album`` is contained in ``albums_list``, else False."""
        return album in albums_list