-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathscratch.py
2154 lines (1342 loc) · 55.3 KB
/
scratch.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
#!/usr/bin/env python
# coding: utf-8
# # **Introduction**
#
# Our goal is to predict the popularity of songs within a Spotify dataset. This task involves developing a predictive model to identify the key factors contributing to a song's popularity on the platform.
# In[1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
sns.set()
import re
import ast
from scipy import stats
from sklearn.model_selection import GridSearchCV
import xgboost as xgb
from sklearn.model_selection import KFold, cross_val_score
from sklearn.linear_model import Lasso
from sklearn.linear_model import Ridge
from sklearn.preprocessing import StandardScaler
from scipy.stats import uniform, randint
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.preprocessing import PolynomialFeatures
from sklearn.pipeline import make_pipeline
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import RandomizedSearchCV
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error, r2_score, accuracy_score, mean_absolute_error
import plotly.express as px
from sklearn.feature_selection import SelectKBest, f_regression, chi2, mutual_info_regression, f_classif
import subprocess
import sys

# Install the encoder dependency into the interpreter that is running this
# script. A bare "pip" found on PATH may belong to a different Python
# environment, in which case the import of category_encoders would still fail.
subprocess.run([sys.executable, "-m", "pip", "install", "category_encoders"])
from scipy.stats import pearsonr, spearmanr
from category_encoders import TargetEncoder
from sklearn.preprocessing import LabelEncoder
import ast
import pickle
pd.set_option('display.max_colwidth', None)
# In[2]:
import os

# Location of the dataset. The default is the original author's local path;
# set the SONG_POPULARITY_CSV environment variable to run the script elsewhere.
DATA_PATH = os.environ.get(
    "SONG_POPULARITY_CSV",
    "C:\\Users\\lenovo\\anaconda3\\SongPopularity.csv",
)
df = pd.read_csv(DATA_PATH)
# Show every column when a frame is displayed.
pd.set_option('display.max_columns', None)
df.head()
# **Hot100 Ranking Year:** The year in which the song achieved its ranking on the Billboard Hot 100 chart.
#
# **Hot100 Rank:** The specific ranking of the song on the Billboard Hot 100 chart during a particular year.
#
# **Acousticness:** tells us how much of a song is made with real instruments versus electronic ones: higher numbers mean more real instruments, while lower numbers mean more electronic sounds.
#
# **Danceability:** measures how easy it is to dance to a song: higher values mean it's easier to dance to, while lower values mean it might be harder to dance to.
#
# **Energy:** A measure of the song's intensity and activity, often associated with loudness and speed.
#
# **Instrumentalness:** Indicates the presence of vocals vs. instrumental elements in the song.
#
# **Liveness:** Reflects the likelihood of the song being performed live, based on audience noises and crowd sounds.
#
# **Speechiness:** Measures the presence of spoken words or speech-like elements in the song.
#
# **Tempo:** The speed or pace of the song, typically measured in beats per minute (BPM).
#
# **Valence:** Describes the musical positiveness conveyed by the song, such as happiness or cheerfulness.
#
# **Key:** The musical key or tonality of the song, which influences its mood and sound.
#
# **Time Signature:** Specifies the number of beats in each bar and the type of note that receives one beat, defining the song's rhythmic structure.
# # **Data Preprocessing**
# In[3]:
# (rows, columns) of the raw dataset.
df.shape
# In[4]:
# Column names as a plain list.
list(df.columns)
# In[5]:
# Dtype and non-null count of every column.
df.info()
# All datatypes are correct
# In[6]:
# Summary statistics, rounded to two decimals.
df.describe().round(2)
# **Hot100 Ranking Year:** The ranking years range between 2010 and 2020
#
# **Hot100 Rank:** The average rank is 50.5 and There are no missing values.
#
# **Popularity:** The average popularity score is 66.4
#
# **Energy:** The minimum energy value is 0.0112, the maximum energy value is 0.996.
#
# **Instrumentalness:** The average instrumentalness value is 0.117.
#
# **Liveness:** The liveness value ranges from 0.0158 to 0.995.
#
# **Loudness:** The minimum loudness value is -14.874, the maximum loudness value is 5.787.
#
# **Acousticness:** The most acoustic songs tend to have an acousticness score of 0.8 or higher.
#
# **Danceability:** The distribution of danceability scores is approximately normal, with a slight skew to the right.
#
# **Key:** The distribution of key scores is approximately uniform.
#
# **Speechiness:** The distribution of speechiness scores is skewed to the right.
#
# **Tempo:** The fastest songs tend to have a tempo score of 120 or higher.
#
# **Valence:** The minimum valence score is 0.0337, the maximum valence score is 0.982.
#
# **Time Signature:** The average time signature is 4.0.
#
# **Mode:** There are two modes in the dataset, "Major" and "Minor", and the most common mode is "Major", which appears 61 times in the dataset.
# In[7]:
# Missing values per column.
df.isnull().sum()
# In[8]:
# Number of fully duplicated rows.
df.duplicated().sum()
# No null or duplicated values
# ### Exploring unique values of some features
# In[9]:
df['Mode'].unique()
# In[10]:
df['Hot100 Ranking Year'].unique()
# In[11]:
df['Key'].unique()
# In[12]:
df['Time Signature'].unique()
# In[13]:
# How many rows each 'Artist Names' value accounts for.
df['Artist Names'].value_counts()
# In the dataset, many artists have multiple songs, and the artist with the highest contribution is 'The Karaoke Channel' with 42 songs. Following them are 'Madonna' and 'Janet Jackson' with their respective song counts.
# In[14]:
# Album names that occur on more than one row.
album_counts = df['Album'].value_counts()
multiple_occurrences = album_counts.loc[album_counts.gt(1)]
print(multiple_occurrences)
# For albums, the most prolific is 'Greatest Hits' with 48 entries. This is followed by 'Super Hits' and '16 Most Requested Songs', each having their own respective counts.
# In[15]:
# Song titles that occur on more than one row.
song_counts = df['Song'].value_counts()
multiple_occurrences = song_counts.loc[song_counts.gt(1)]
print(multiple_occurrences)
# **Many song names are duplicated!**
# In[16]:
# Every row that shares the example title 'I Like It'.
df1 = df.loc[df['Song'].eq('I Like It')]
df1
# Song names are duplicated but not all links are
# In[17]:
df1['Spotify Link'].unique()
# In[18]:
# Spotify links that occur on more than one row.
link_counts = df['Spotify Link'].value_counts()
link_multiple_occurrences = link_counts.loc[link_counts.gt(1)]
print(link_multiple_occurrences)
# In[19]:
# keep=False flags every member of a group of rows sharing the same link.
shares_link = df.duplicated(subset=['Spotify Link'], keep=False)
duplicate_links = df.loc[shares_link]
duplicate_links
# ### If a link appears more than once but with different top 100 ranking years, it may not be an issue since the song could be featured in multiple significant song lists across different years. However, if the top 100 ranking years are identical for duplicated links, it presents a conflict for the model. This is uncommon because a song typically shouldn't have multiple rankings in the same year. Therefore, I will investigate rows where both the link and top 100 ranking year are duplicated. ###
#
# In[20]:
# Rows whose (link, ranking year) pair occurs more than once.
link_year_key = ['Spotify Link', 'Hot100 Ranking Year']
duplicate_links = df.loc[df.duplicated(subset=link_year_key, keep=False)]
len(duplicate_links)
# In[21]:
duplicate_links
# Since there are only 12 rows where both the top 100 ranking year and the link are duplicated, I will drop these rows from the dataset.
#
# In[22]:
# keep=False removes every member of a duplicated (link, year) group,
# not just the later occurrences.
df.drop_duplicates(subset=link_year_key, keep=False, inplace=True)
df.reset_index(inplace=True, drop=True)
# In[23]:
# Links that are still shared after the drop (same link, different years).
duplicate_links = df.loc[df.duplicated(subset=['Spotify Link'], keep=False)]
len(duplicate_links)
# In[24]:
# Spotify URIs that occur on more than one row.
link_counts = df['Spotify URI'].value_counts()
link_multiple_occurrences = link_counts.loc[link_counts.gt(1)]
print(link_multiple_occurrences)
# In[25]:
# Number of rows per chart year.
df['Hot100 Ranking Year'].value_counts()
# In[26]:
# Number of rows per album.
df['Album'].value_counts()
# Most ranking years contribute a song count in the mid to high 90s, with 2017 and 1974 among the best-represented years (these figures are row counts from value_counts, not chart ranks). Earlier years such as 1955, 1950, and 1952 contribute fewer songs, which indicates a smaller sample for those years rather than lower rankings.
#
# ### Handling Some Hidden Nulls
# Nulls in Artists' Genres are represented as "[]" in the dataset.
# In[27]:
# Rows whose genre list is the literal string "[]", i.e. an empty list that
# stands in for a missing value.
empty_genres = df['Artist(s) Genres'] == "[]"
df[empty_genres].head()
# In[28]:
empty_genres.sum()
# Filling null values in the Artists' Genres, which is categorical data, with the mode is a suitable approach. This will replace missing values with the most frequently occurring genre in the dataset.
# In[29]:
# Blank out the "[]" placeholders, then impute them with the most frequent
# genre list. The result is assigned back to the column: calling
# fillna(inplace=True) on a column selection is chained assignment, which
# warns on pandas 2.x and has no effect once copy-on-write is enabled.
genres = df['Artist(s) Genres'].mask(empty_genres)
mode_value = genres.mode()[0]
df['Artist(s) Genres'] = genres.fillna(mode_value)
# In[30]:
# Should print 0 once the placeholders have been replaced.
print(df['Artist(s) Genres'].isin(["[]"]).sum())
# **Checking if there any other hidden Nulls**
# In[31]:
# Songs whose title contains any character other than ASCII letters/whitespace.
pattern = re.compile(r'[^a-zA-Z\s]')
df[df['Song'].str.contains(pattern, na=False)].head()
# In[32]:
# Album names consisting solely of non-word, non-space characters are
# treated as placeholders for a missing album.
pattern = r'^[^\w\s]+$'
HiddenNulls = df[df['Album'].str.match(pattern, na=False)]
HiddenNulls
# **There are hidden nulls in album column in form of "?" !!!**
# In[33]:
HiddenNulls.shape[0]
# In[34]:
# HiddenNulls was sliced from df, so its index labels identify exactly the
# rows to remove. This replaces a column-by-column isin() match against the
# pooled values of HiddenNulls, which was harder to read and more expensive.
rows_to_delete = HiddenNulls.index
df.drop(rows_to_delete, inplace=True)
df.reset_index(drop=True, inplace=True)
# In[35]:
# Sanity check: no placeholder albums should remain.
pattern = r'^[^\w\s]+$'
HiddenNulls = df[df['Album'].str.match(pattern, na=False)]
HiddenNulls.shape[0]
# I will convert the 'Album Release Date' column to only display the 'Year' since it is the most relevant information for our analysis.
#
# In[36]:
# Take the last '/'-separated field and then its first '-'-separated field,
# which yields the year for both 'x/x/YYYY' and 'YYYY-MM-DD' style strings.
# The year is cast to int immediately so that every later plot and comparison
# treats it as a number rather than as an unordered string category.
df['Year'] = (
    df['Album Release Date']
    .apply(lambda x: x.split('/')[-1].split('-')[0])
    .astype(int)
)
df.head()
# A tempo of 0 is not possible for any song, so I will remove entries with a tempo value of 0 from the dataset.
#
# In[37]:
# Number of rows whose tempo is exactly 0.
df[df['Tempo'] == 0].shape[0]
# In[38]:
# The string literal below is disabled code kept for reference; it has no
# effect at runtime. If enabled it would drop every row with Tempo <= 0.
'''
rows_to_delete = df[df['Tempo'] <= 0].index
df.drop(rows_to_delete, inplace=True)
df.reset_index(drop=True, inplace=True)
'''
# When an attempt was made to exclude instances where Tempo was equal to 0, it resulted in a decrease in accuracy. Consequently, these instances were retained in the dataset.
# # **EDA**
# In[39]:
# Continuous features used for the distribution plots, with the target
# 'Popularity' listed last.
continuous_columns = [
    'Song Length(ms)', 'Acousticness', 'Danceability', 'Energy',
    'Instrumentalness', 'Liveness', 'Loudness', 'Speechiness', 'Tempo',
    'Valence', 'Popularity',
]
cont = df[continuous_columns]
# Every remaining column is handled as categorical / descriptive.
cat = df.drop(columns=cont.columns)
# **Boxplot**
# In[40]:
# One horizontal boxplot per continuous feature, laid out on a 12x2 grid.
# NOTE(review): the source lost its indentation; tight_layout/show are assumed
# to run once after the loop.
plt.figure(figsize=(15, 25))
for slot, feature in enumerate(cont.columns, start=1):
    plt.subplot(12, 2, slot)
    sns.boxplot(data=df, x=feature, palette="mako")
    plt.title(feature, color='black', fontsize=15)
    plt.xlabel(feature, size=12)
plt.tight_layout()
plt.show()
# There are outliers but we will handle them later.
# # **Histplots and Kdeplots**
# Check distributions
# In[41]:
# One row per continuous feature: histogram on the left, KDE on the right.
fig, axs = plt.subplots(len(cont.columns), 2, figsize=(20, 60))
axs = axs.flatten()
for row, column in enumerate(cont.columns):
    hist_ax = axs[2 * row]
    kde_ax = axs[2 * row + 1]

    sns.histplot(cont[column], bins=50, ax=hist_ax)
    hist_ax.set_title(f'Histogram of {column}')
    hist_ax.set_xlabel(column)
    hist_ax.set_ylabel('Frequency')

    sns.kdeplot(cont[column], ax=kde_ax, fill=True)
    kde_ax.set_title(f'KDE Plot of {column}')
    kde_ax.set_xlabel(column)
    kde_ax.set_ylabel('Density')
plt.tight_layout()
plt.show()
# The 'popularity' feature has over 250 zeros. We will further investigate to determine whether these zeros represent null values or are genuine popularity scores.
#
# In[42]:
def check_distribution(data, alpha=0.05):
    """Label a numeric sample as normal, right-skewed or left-skewed.

    A Shapiro-Wilk test decides normality: when its p-value exceeds ``alpha``
    normality is not rejected and the sample is reported as normal. Otherwise
    the sign of the sample skewness picks the label.

    The previous version had these branches inverted: it reported a skew
    direction only when the test did NOT reject normality, and returned a
    generic "Not normally distributed" for every sample that failed the test.

    Parameters:
        data: array-like of numbers (e.g. one DataFrame column).
        alpha: significance level for the Shapiro-Wilk test.

    Returns:
        "Normally distributed", "Right-skewed", "Left-skewed", or
        "Not normally distributed" (non-normal with skewness exactly 0).
    """
    skewness = stats.skew(data)
    _, shapiro_p_value = stats.shapiro(data)
    if shapiro_p_value > alpha:
        return "Normally distributed"
    if skewness > 0:
        return "Right-skewed"
    if skewness < 0:
        return "Left-skewed"
    return "Not normally distributed"
# Classify each continuous column; DataFrame.apply passes one column at a time.
results = cont.apply(check_distribution)
print(results)
# Some features, like 'danceability', appear to have a distribution close to normal. However, most of the other features seem to be right-skewed.
#
# In[43]:
# rows_to_delete = df[df['Tempo'] > 230].index
# df.drop(rows_to_delete, inplace=True)
# rows_to_delete = df[df['Loudness'] < -35].index
# df.drop(rows_to_delete, inplace=True)
# rows_to_delete = df[df['Speechiness'] > 0.8].index
# df.drop(rows_to_delete, inplace=True)
# df.reset_index(drop=True, inplace=True)
# In[44]:
list(cat.columns)
# **Countplot**
# Most artists have multiple songs in the dataset.
#
# In[45]:
# Rows whose artist value has already appeared earlier in the frame.
duplicatedArtists = df.loc[df['Artist Names'].duplicated()]
# The 30 artist values with the most rows.
artist_counts = df['Artist Names'].value_counts()
top_30_artists = artist_counts.head(30)
in_top_30 = df['Artist Names'].isin(top_30_artists.index)
plt.figure(figsize=(15, 10))
sns.countplot(
    data=df.loc[in_top_30],
    y='Artist Names',
    order=top_30_artists.index,
    palette='mako',
)
plt.title('Count of Duplicated Artists (Top 30)')
plt.xlabel('Count')
plt.ylabel('Artist Names')
plt.show()
# Average popularity in these top 30 artists.
# In[46]:
# Mean popularity for the 30 artist values with the most rows.
top_30_artists = df['Artist Names'].value_counts().head(30).index
frequent_rows = df.loc[df['Artist Names'].isin(top_30_artists)]
average_popularity = frequent_rows.groupby('Artist Names')['Popularity'].mean()
plt.figure(figsize=(15, 10))
average_popularity.plot(kind='bar')
plt.title('Average Popularity of Songs by Top 30 Artists')
plt.xlabel('Artist Names')
plt.ylabel('Average Popularity')
plt.xticks(rotation=90)
plt.show()
# In[47]:
# Inspect the rows of one specific artist value.
df.loc[df['Artist Names'].eq("['Ameritz Countdown Karaoke']")]
# The artists with the most popular songs are typically the most successful and famous in the industry.
#
# In[48]:
# The 30 artist values with the highest mean popularity.
average_popularity = df.groupby('Artist Names')['Popularity'].mean()
ranked_artists = average_popularity.sort_values(ascending=False)
top_30_artists = ranked_artists.head(30)
plt.figure(figsize=(15, 10))
top_30_artists.plot(kind='bar')
plt.title('Average Popularity of Songs by Top 30 Artists')
plt.xlabel('Artist Names')
plt.ylabel('Average Popularity')
plt.xticks(rotation=90)
plt.show()
# The albums with the most popular songs.
#
# In[49]:
# The 30 albums with the highest mean popularity.
average_popularity = df.groupby('Album')['Popularity'].mean()
ranked_albums = average_popularity.sort_values(ascending=False)
top_30_albums = ranked_albums.head(30)
plt.figure(figsize=(15, 10))
top_30_albums.plot(kind='bar')
plt.title('Average Popularity of Songs from Top 30 Albums')
plt.xlabel('ALbums')
plt.ylabel('Average Popularity')
plt.xticks(rotation=90)
plt.show()
# In[50]:
# Show full cell contents, then list the songs whose popularity is exactly 0.
pd.set_option('display.max_colwidth', None)
df.loc[df['Popularity'].eq(0)]
# Most repeated years in the dataset
# In[51]:
# Number of songs per album release year.
plt.figure(figsize=(10, 6))
sns.countplot(x='Year', data=df, palette='mako')
plt.title('Count Plot for Album Release Date')
plt.xlabel('Album Release Date')
plt.ylabel('Count')
# Vertical, small tick labels so they do not overlap.
plt.xticks(rotation=90, fontsize=8)
plt.yticks(fontsize=8)
plt.show()
# In[52]:
# Number of songs per Hot100 ranking year. The title and x-label previously
# read "Album Release Date", copied from the release-year plot, although this
# chart counts rows by 'Hot100 Ranking Year'.
plt.figure(figsize=(10, 6))
sns.countplot(data=df, x='Hot100 Ranking Year', palette='mako')
plt.title('Count Plot for Hot100 Ranking Year')
plt.xlabel('Hot100 Ranking Year')
plt.ylabel('Count')
plt.xticks(rotation=90)
plt.xticks(fontsize=8)
plt.yticks(fontsize=8)
plt.show()
# In[53]:
df['Year'] = df['Year'].astype(int)
# Albums with the most songs
# In[54]:
album_counts = df['Album'].value_counts().reset_index()
album_counts.columns = ['Album', 'Count']
# Keep 30 albums so the data matches the variable name and the chart title
# (the slice previously took 50 rows while the title said "Top 30").
top_30_albums = album_counts.head(30)
plt.figure(figsize=(15, 8))
sns.barplot(data=top_30_albums, x='Album', y='Count', palette='mako')
plt.title('Top 30 Albums by Number of Songs')
plt.xlabel('Album')
plt.ylabel('Number of Songs')
plt.xticks(rotation=90)
plt.xticks(fontsize=8)
plt.yticks(fontsize=8)
plt.show()
# **Scatterplots**
# In[55]:
def _popularity_scatter(feature, title, ylabel, hue=None):
    """Scatter `feature` (y-axis) against 'Popularity' (x-axis) on a new 10x6 figure."""
    plt.figure(figsize=(10, 6))
    sns.scatterplot(data=df, x='Popularity', y=feature, hue=hue, palette='mako')
    plt.title(title)
    plt.xlabel('Popularity')
    plt.ylabel(ylabel)
    plt.grid(True)
    plt.show()


# In[55]: album release year
_popularity_scatter('Year', 'Scatter Plot of Popularity vs Year', 'Year')
# In[56]: chart year, coloured by chart rank
_popularity_scatter(
    'Hot100 Ranking Year',
    'Hot100 Ranking Year vs Popularity',
    'Hot 100 Ranking Year',
    hue='Hot100 Rank',
)
# In[57]:
_popularity_scatter('Valence', 'Popularity vs Valence', 'Valence')
# no direct relation between valence and popularity
# In[58]:
_popularity_scatter('Danceability', 'Popularity vs Danceability', 'Danceability')
# no direct relation between danceability and popularity
# (the original note said "valence" here; presumably a copy-paste slip)
# In[59]:
_popularity_scatter('Tempo', 'Popularity vs Tempo', 'Tempo')
# In[60]:
_popularity_scatter('Loudness', 'Popularity vs Loudness', 'Loudness')
# In[61]:
# Danceability against valence, coloured by popularity. The title previously
# read "Scatter Plot of Popularity vs Year", copied from an earlier plot, and
# did not describe the axes shown here.
plt.figure(figsize=(10, 6))
sns.scatterplot(data=df, y='Danceability', x='Valence', hue='Popularity', palette='mako')
plt.title('Danceability vs Valence')
plt.xlabel('Valence')
plt.ylabel('Danceability')
plt.grid(True)
plt.show()
# In[62]:
# In[62]-In[64]: three scatter plots sharing the same layout. Each entry is
# (x column, y column, hue column or None, title); the axis labels are the
# column names themselves.
scatter_specs = [
    ('Popularity', 'Danceability', None, 'Popularity vs Danceability'),
    ('Loudness', 'Energy', 'Popularity', 'Energy vs Loudness'),
    ('Energy', 'Acousticness', 'Popularity', 'Acousticness vs Energy'),
]
for x_col, y_col, hue_col, plot_title in scatter_specs:
    plt.figure(figsize=(10, 6))
    sns.scatterplot(data=df, x=x_col, y=y_col, hue=hue_col, palette='mako')
    plt.title(plot_title)
    plt.xlabel(x_col)
    plt.ylabel(y_col)
    plt.grid(True)
    plt.show()
# In[65]:
# Song length converted from milliseconds to minutes.
df['minutes_length'] = df['Song Length(ms)'] / 60000
df.head()
# In[66]:
plt.figure(figsize=(10, 6))
sns.scatterplot(
    data=df,
    x='Popularity',
    y='minutes_length',
    hue='Popularity',
    palette='mako',
)
plt.title('Scatter Plot of Popularity vs Length')
plt.xlabel('Popularity')
plt.ylabel('Length')
plt.grid(True)
plt.show()
# **Pie Charts**
# In[67]:
# Backup of the numeric speechiness scores, taken before the column is
# temporarily replaced by labels for the pie chart.
original_speechiness_values = df['Speechiness'].copy()
# In[68]:
def type_of_Song(x):
    """Map a speechiness score to a coarse label.

    Bands are half-open: [0.0, 0.1) "very low", [0.1, 0.3) "low",
    [0.3, 0.5) "medium", [0.5, 0.7) "high". Any value outside those bands
    (including negatives and NaN) is labelled "very high".
    """
    bands = (
        (0.0, 0.1, "very low"),
        (0.1, 0.3, "low"),
        (0.3, 0.5, "medium"),
        (0.5, 0.7, "high"),
    )
    for lower, upper, label in bands:
        if lower <= x < upper:
            return label
    return "very high"
# In[69]:
# Temporarily overwrite the numeric column with its category labels; the
# numeric values are put back from original_speechiness_values below.
# NOTE(review): re-running this line before the restore would apply
# type_of_Song to labels instead of numbers.
df['Speechiness'] = df['Speechiness'].apply(type_of_Song)
# In[70]:
# Number of songs per speechiness label, shown as a pie chart.
n_songs_per_category = df.groupby('Speechiness').size()
fig = px.pie(df, names=n_songs_per_category.index, values=n_songs_per_category.values)
fig.update_layout(title='type of songs')
fig.show()
# In[71]:
# Restore the numeric speechiness scores.
df['Speechiness'] = original_speechiness_values
# In[72]:
# Backup of the numeric tempo values, taken before the column is temporarily
# replaced by labels for the next pie chart.
Tempo_original_value = df['Tempo'].copy()
# In[73]:
def classify_tempo(bpm):
    """Label a tempo in beats per minute.

    Below 90 is "Slow", 90 through 130 inclusive is "Moderate", and anything
    else (including NaN) is "Fast".
    """
    if bpm < 90:
        return "Slow"
    return "Moderate" if 90 <= bpm <= 130 else "Fast"
# In[74]:
# Temporarily overwrite the numeric tempo with its label; the numeric values
# are put back from Tempo_original_value below.
df['Tempo'] = df['Tempo'].apply(classify_tempo)
# In[75]:
# Number of songs per tempo label, shown as a pie chart.
n_songs_per_category = df.groupby('Tempo').size()
fig = px.pie(names=n_songs_per_category.index,
values=n_songs_per_category.values,
title='Types of Tempo')
fig.update_layout(title='Types of Tempo')
fig.show()
# In[76]:
# Restore the numeric tempo values.
df['Tempo'] = Tempo_original_value
# In[77]:
# Backup of the numeric valence scores, taken before the column is temporarily
# replaced by labels for the next pie chart.
valence_original_value = df['Valence'].copy()
# In[78]:
def valence_type(x):
    """Bucket a valence score into a mood label.

    Valence measures musical positiveness, so the upper half of the scale is
    the happy/positive bucket and the lower half the sad/negative one. The
    previous version had the two labels swapped and returned None for a
    valence of exactly 1.0.

    Returns:
        "Sad|Negative" for 0.0 <= x < 0.5, "Happy|Positive" for
        0.5 <= x <= 1.0, and None for any other value (including NaN).
    """
    if 0.0 <= x < 0.5:
        return "Sad|Negative"
    if 0.5 <= x <= 1.0:
        return "Happy|Positive"
    return None
# Temporarily overwrite the numeric valence with the label returned by
# valence_type; the numeric values are put back from valence_original_value
# below.
df['Valence'] = df['Valence'].apply(valence_type)
# In[79]:
# Number of songs per valence label, shown as a pie chart.
n_songs_per_category = df.groupby('Valence').size()
fig = px.pie(df, names=n_songs_per_category.index, values=n_songs_per_category.values)
fig.update_layout(title='types of valence (Happy or sad)')
fig.show()
# In[80]:
# Restore the numeric valence scores.
df['Valence'] = valence_original_value
# **How songs popularity increases over time**
# In[81]:
# Average popularity per release decade. Bins are left-closed
# ([1900, 1910), ..., [2010, 2020)); years outside 1900-2019 fall in no bin.
df_sorted = df.sort_values(by='Year')
bins = range(1900, 2030, 10)
labels = [f"{start}-{start + 9}" for start in bins[:-1]]
df_sorted['Year Group'] = pd.cut(
    df_sorted['Year'],
    bins=bins,
    labels=labels,
    right=False,
)
avg_popularity_by_year = df_sorted.groupby('Year Group')['Popularity'].mean()
plt.figure(figsize=(15, 5))
plt.plot(
    avg_popularity_by_year.index,
    avg_popularity_by_year.values,
    marker='o',
    linestyle='-',
)
plt.title('Average Popularity Over 10-Year Intervals (1900-2019)')
plt.xlabel('Year Interval')
plt.ylabel('Average Popularity')
plt.xticks(rotation=45)
plt.grid(True)
plt.show()
# Higher popularities tend to be in more recent years. As we look further back in time, the popularities of songs generally decrease.
#
# ## Insights from EDA Visualizations
#
# **Hot Ranking Year & Popularity**: There is a direct proportional relationship between the hot ranking year and popularity rank.
#
#
# **Valence & Popularity**: There is no direct relationship between valence and popularity, loudness, or tempo.
#
#
# **Valence & Danceability**: Valence and danceability are somewhat directly proportional. Songs with higher danceability tend to have higher popularity.
#
#
# **Energy, Loudness & Popularity**: Energy and loudness are directly proportional to each other. Higher loudness and energy levels correlate with higher popularity.
#
#
# **Acousticness & Energy**: Acousticness and energy show an inverse relationship. Popularity tends to be higher with lower acousticness.
#
#
# **Song Length & Popularity**: There is no direct proportional relationship between song length and popularity. However, songs with extremely long lengths do not have high popularity.
#
#
# **Speechiness**: Most of the dataset has very low speechiness.
#
#
# **Tempo & Speed**: Over 54% of the data indicates songs that are neither too fast nor too slow. Fast songs with high tempo are twice as common as slow songs.
#
#
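# **Mood & Popularity**: More than 64% of the dataset has a valence of 0.5 or higher. Since higher valence means more musical positiveness (see the Valence definition above), these are the happier-sounding songs rather than the sad ones, and they seem to have higher popularity.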
#
# # **Feature Engineering**
# In[82]:
df.head()
# In[83]:
# Drop the raw release date (replaced by 'Year') and the plotting-only
# 'minutes_length' helper column.
df.drop(['Album Release Date', 'minutes_length'], axis=1, inplace=True)
df.reset_index(drop=True, inplace=True)
# In[84]:
# Reorder the columns so that the target 'Popularity' comes last. The explicit
# .copy() makes df an independent frame: later cells assign into its columns,
# which would otherwise trigger pandas' SettingWithCopyWarning.
df = df[['Song', 'Album', 'Year', 'Artist Names', 'Artist(s) Genres',
         'Hot100 Ranking Year', 'Hot100 Rank', 'Song Length(ms)', 'Spotify Link',
         'Song Image', 'Spotify URI', 'Acousticness',
         'Danceability', 'Energy', 'Instrumentalness', 'Liveness', 'Loudness',
         'Speechiness', 'Tempo', 'Valence', 'Key', 'Mode', 'Time Signature', 'Popularity']].copy()
# ## Encoding
# ### **Since the 'artist name' and 'artist genres' columns contain lists of strings, I will split each list into multiple rows, with each element in its own row. I will perform this transformation for both the 'artist name' and 'genre' columns.**
# In[85]:
# Both columns hold Python list literals stored as strings; parse them into
# real lists, then give every (artist, genre) combination its own row.
for list_column in ('Artist Names', 'Artist(s) Genres'):
    df[list_column] = df[list_column].apply(ast.literal_eval)
df_exploded = df.explode('Artist Names').explode('Artist(s) Genres')
df_exploded.info()
# In[86]:
df_exploded
# ### After splitting the 'artist name' and 'artist genres' columns into individual rows, I applied both target and label encoding to these features. Subsequently, I combined the encoded values back into a single cell for lists with more than one string.
#
# Target Encoding for Artist Genres
# In[87]:
# Target-encode the genre column against 'Popularity' and persist the fitted
# encoder so it can be reloaded later.
# NOTE(review): the encoder is fitted on every row visible here; no train/test
# split appears earlier in this section - confirm this does not leak the target.
encoder = TargetEncoder(cols=['Artist(s) Genres'])
popularity_target = df_exploded['Popularity']
encoder.fit(df_exploded, popularity_target)
df_encoded = encoder.transform(df_exploded)
with open('target_encoder.pkl', 'wb') as f:
    pickle.dump(encoder, f)
# Label Encoding for Artist Names
# In[88]:
# Replace every artist name with an integer id and persist the fitted encoder.
le = LabelEncoder()
artist_ids = le.fit_transform(df_encoded['Artist Names'])
df_encoded['Artist Names'] = artist_ids
with open('label_encoder.pkl', 'wb') as f:
    pickle.dump(le, f)
# In[89]:
df_encoded
# In[90]:
def aggregate_rows(group):
    """Collapse one group of rows into two summed encodings.

    For each of the two encoded columns the distinct values within `group`
    are added together, so a value repeated across rows is counted once.

    Returns:
        pandas Series indexed by 'Artist Names Encoded' and
        'Artist(s) Genres Encoded'.
    """
    source_by_output = {
        'Artist Names Encoded': 'Artist Names',
        'Artist(s) Genres Encoded': 'Artist(s) Genres',
    }
    return pd.Series({
        output: sum(group[source].unique())
        for output, source in source_by_output.items()
    })
# Group the encoded rows by their index label and collapse each group with
# aggregate_rows, then renumber the result from 0.
aggregated_df = (
    df_encoded
    .groupby(df_encoded.index)
    .apply(aggregate_rows)
    .reset_index(drop=True)
)
aggregated_df.info()
# In[91]:
aggregated_df
# In[92]: