-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathget_lyrics.py
117 lines (103 loc) · 3.65 KB
/
get_lyrics.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
from bs4 import BeautifulSoup as bs, SoupStrainer
from bs4 import Comment
import urllib
import requests
import time
import nltk
from gensim import corpora
from lxml.html import fromstring
import lxml.html as PARSER
import os
import re
import sys
import requests
import time
import nltk
from gensim import corpora
import sys
from analyze_lyrics import *
from urllib.request import urlopen
#from find_artist_songs import find_artist_songs
#from analyze_lyrics import get_sentiment
#artist = sys.argv[1]
#response = urllib.request.urlopen(path + 'acrosstheline.html').read()
#
# artists = ['Mitski', 'LinkinPark', 'Beatles', 'TaylorSwift', 'TwentyOnePilots']
# for artist in artists:
# path = 'Artists/' + artist +'/urls/'
# for filename in os.listdir(path):
# print(filename)
# if(filename=='urls.txt'):
# continue
# f = open(path + "../lyrics/" + filename.rstrip('.html') + ".txt", "w")
#
# soup = bs(open(path + filename), "lxml")
# title = soup.find_all('b', class_=False)
# if len(title) == 0:
# continue
# f.write(artist + "\n")
# f.write(re.sub('<.*?>|"', '', str(title[len(title) - 1])))
# for lyrics in soup.find_all(string=lambda text:isinstance(text,Comment)):
# if "start of lyrics" in lyrics or "Usage" in lyrics:
# curr = re.sub('<.*?>|([^\s\w]|_)', '', str(lyrics.parent))
# f.write(curr)
# break
# f.close()
# print("====================================")
#
def format_artist(artist):
    """Turn an artist name into the slug used in the lyrics URL path.

    ``get_lyrics`` places the result between ``/lyrics/`` and the song slug
    when it builds the page URL.

    Steps:
      1. Fold accented letters to their base letter and lower-case the name.
      2. Drop one leading article, ``"a "`` or ``"the "``.
      3. Remove every character that is not an ASCII letter or digit.

    Args:
        artist: Human-readable artist name, e.g. ``"The Beatles"``.

    Returns:
        The slug, e.g. ``"beatles"``. Empty if no letter or digit survives.
    """
    # Imported locally so the function is self-contained with respect to the
    # module's import block.
    import unicodedata

    # NFKD splits "é" into "e" plus a combining accent; the accent is then
    # removed by the final regex, so the base letter is kept instead of the
    # whole character being dropped.
    artist = unicodedata.normalize("NFKD", artist).lower()

    # Only the first matching article is stripped ("a " is tested first).
    for article in ("a ", "the "):
        if artist.startswith(article):
            artist = artist[len(article):]
            break

    # Spaces and punctuation are all removed here, so no separate space
    # handling is needed.
    return re.sub(r'[^0-9a-zA-Z]+', '', artist)
def format_song(song):
    """Turn a song title into the slug used as the lyrics page file name.

    ``get_lyrics`` appends ``".html"`` to the result when it builds the
    page URL.

    The title is lower-cased, accented letters are folded to their base
    letter, and every character that is not an ASCII letter or digit is
    removed.

    Args:
        song: Human-readable song title, e.g. ``"Hey Jude"``.

    Returns:
        The slug, e.g. ``"heyjude"``. Empty if no letter or digit survives.
    """
    # Imported locally so the function is self-contained with respect to the
    # module's import block.
    import unicodedata

    # NFKD splits "é" into "e" plus a combining accent; the regex below then
    # removes the accent and keeps the base letter.
    song = unicodedata.normalize("NFKD", song).lower()

    # Spaces and punctuation are all removed by this single substitution.
    return re.sub(r'[^0-9a-zA-Z]+', '', song)
def get_lyrics_with_urls(urls):
    """Fetch each URL and collect the lyrics blocks found on the page.

    A lyrics block is a top-level ``<p>`` tag that carries a ``style``
    attribute. For each one, ``<br>`` tags become newlines, all other markup
    is stripped, and a space is inserted before every newline. Each block is
    printed together with the result of ``get_sentiment`` for it.

    Args:
        urls: Iterable of page URLs, e.g.
            ``['http://lyrics123.net/snoop-dogg/deeez-nuuuts/']``.

    Returns:
        A list with one cleaned-up string per matching ``<p>`` tag, in the
        order encountered across all URLs. Empty if ``urls`` is empty.

    Raises:
        Any exception raised by ``urlopen`` or ``read`` propagates to the
        caller; nothing is caught here.
    """
    # TODO (left by the original author; no details given)
    ret = []
    for url in urls:
        # Pause before every request -- presumably to avoid hammering the
        # remote site; confirm before changing the delay.
        time.sleep(3)
        print(url)

        # Context manager guarantees the connection is closed even if
        # reading fails.
        with urlopen(url, timeout=5) as response:
            content = response.read()

        soup = bs(content, "html.parser", parse_only=SoupStrainer('p'))
        # find_all yields Tag objects only. Iterating the soup directly can
        # also yield non-tag nodes (e.g. a doctype), which have no has_attr()
        # and would raise AttributeError.
        for paragraph in soup.find_all('p', recursive=False):
            if not paragraph.has_attr('style'):
                continue
            lyrics = re.sub('</?br/?>', '\n', str(paragraph))
            lyrics = re.sub('<.*?>', '', lyrics)
            lyrics = re.sub('\n', ' \n', lyrics)
            ret.append(lyrics)
            print(lyrics)
            # get_sentiment is not defined in this file; presumably it comes
            # from the analyze_lyrics star-import -- verify.
            print(str(get_sentiment(lyrics)))
    return ret
def get_lyrics(artist, song):
    """Fetch the lyrics of one song from an archived azlyrics page.

    The artist and song names are turned into URL slugs with
    ``format_artist`` / ``format_song``, and the page is downloaded from a
    fixed web.archive.org snapshot. The lyrics are located by an HTML comment
    containing ``"start of lyrics"`` or ``"Usage"``; the element holding that
    comment is returned with its markup stripped.

    Args:
        artist: Human-readable artist name.
        song: Human-readable song title.

    Returns:
        The lyrics text as a string, or ``None`` when the download fails or
        no marker comment is found on the page.
    """
    artist = format_artist(artist)
    song = format_song(song)

    # Pause before the request -- presumably rate limiting; confirm before
    # changing the delay.
    time.sleep(1)
    url = ("https://web.archive.org/web/20161007001058/http://www.azlyrics.com/lyrics/"
           + artist + "/" + song + ".html")

    try:
        # Context manager closes the connection even if read() fails.
        with urlopen(url, timeout=5) as response:
            content = response.read()
    except Exception:
        # Best-effort lookup: any fetch error is reported and turned into
        # None. Catching Exception rather than using a bare except lets
        # KeyboardInterrupt and SystemExit propagate.
        print(url)
        print("failed\n")
        return None

    soup = bs(content, "html.parser", parse_only=SoupStrainer('div'))
    # One document-wide search is enough; the first matching comment wins.
    for comment in soup.find_all(string=lambda text: isinstance(text, Comment)):
        if "start of lyrics" in comment or "Usage" in comment:
            # The comment sits inside the element that holds the lyrics, so
            # serialise that parent and strip the tags from it.
            lyrics = re.sub('</?br/?>', '', str(comment.parent))
            return re.sub('<.*?>', '', lyrics)

    # No marker comment on the page.
    return None