-
Notifications
You must be signed in to change notification settings - Fork 1
/
data_preparation.py
63 lines (37 loc) · 1.3 KB
/
data_preparation.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
#!/usr/bin/env python
# coding: utf-8
import pandas as pd
import numpy as np
# Matches a four-digit release year wrapped in parentheses, e.g. "(1995)".
# The parentheses are required so that digits inside the movie name itself
# (e.g. "2001: A Space Odyssey") are not mistaken for the year.
YEAR_IN_PARENS = r'\((\d{4})\)'

# Placeholder genre used by the input for movies without any genre.
NO_GENRES_LABEL = '(no genres listed)'


def split_title_and_date(movies):
    """Return a copy of *movies* with the title split at its first '('.

    'title' becomes the text before the first '(' (whitespace is kept) and a
    new 'date' column holds the text after it with every ')' removed.  Rows
    whose title has no '(' keep their title and get an empty 'date'.
    """
    parts = movies['title'].str.partition('(', expand=True)
    result = movies.copy()
    result['title'] = parts[0]
    # regex=False: ')' is meant literally and is not a valid pattern on its own.
    result['date'] = parts[2].str.replace(')', '', regex=False)
    return result


def build_genre_indicators(movies):
    """Return one row per movieId with a 0/1 column for each genre.

    The '|'-separated 'genres' strings are exploded and one-hot encoded.
    Rows are grouped by 'movieId' rather than by title so that distinct
    movies sharing a title do not get their genre counts added together.
    The '(no genres listed)' placeholder column is dropped when present.
    """
    exploded = (
        movies.set_index('movieId')['genres']
        .str.split('|', expand=True)
        .stack()
    )
    indicators = pd.get_dummies(exploded, dtype=int).groupby(level=0).sum()
    # errors='ignore': the placeholder only exists if some movie lacks genres.
    return indicators.drop(columns=NO_GENRES_LABEL, errors='ignore')


def prepare_movies(movies):
    """Build both output tables from the raw movies frame.

    *movies* must provide the columns 'movieId', 'title' and 'genres'.

    Returns a tuple ``(split, final)``:

    * ``split`` - *movies* plus a 'date' column, with 'title' cut at the
      first '(' (see ``split_title_and_date``).
    * ``final`` - the genre indicator columns followed by 'movieId',
      'title' (cut at the first '(' and stripped) and 'year', sorted by
      'movieId' with a fresh 0..n-1 index.  'year' is the four-digit string
      found between parentheses in the raw title, or NaN if there is none.
    """
    split = split_title_and_date(movies)

    # The year has to come from the raw title: once the title has been cut at
    # the first '(' it no longer contains the parenthesised year.
    year = movies['title'].str.extract(YEAR_IN_PARENS, expand=False)

    info = pd.DataFrame({
        'movieId': movies['movieId'],
        # .str.strip() leaves missing titles as NaN instead of raising.
        'title': split['title'].str.strip(),
        'year': year,
    })

    indicators = build_genre_indicators(movies)
    merged = info.merge(
        indicators, how='inner', left_on='movieId', right_index=True
    )
    # Genre columns first, then the movie details.
    column_order = list(indicators.columns) + ['movieId', 'title', 'year']
    final = (
        merged[column_order]
        .sort_values(by='movieId')
        .reset_index(drop=True)
    )
    return split, final


def main():
    """Read movies.csv, print a preview, and write both CSV outputs."""
    dm = pd.read_csv('movies.csv')
    print(dm.head(10))
    print(dm.shape)

    dg, dmain = prepare_movies(dm)
    dg.to_csv('xyz.csv')
    dmain.to_csv('movie_withgenres.csv')


if __name__ == '__main__':
    main()