-
Notifications
You must be signed in to change notification settings - Fork 5
/
Copy pathdata_cleaner.py
45 lines (36 loc) · 1.34 KB
/
data_cleaner.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
import nltk
from nltk.corpus import stopwords
import pandas as pd
from utils import logger
nltk.download("stopwords", quiet=True)
def clean_comments(dataframe: pd.DataFrame) -> pd.DataFrame:
    """Perform data cleaning on comments to prepare for analysis.

    Reads the ``"Original Comment Text"`` column, normalises each comment and
    stores the result in a ``"Cleaned Comment Text"`` column. The dataframe is
    modified in place and is also returned.

    Cleaning steps, in order: strip surrounding whitespace, replace newlines
    with spaces, drop mentions and links, drop hashtags, drop punctuation /
    emojis / special characters, lowercase, drop digits, drop English stop
    words. Missing comments are cleaned to an empty string.

    :type dataframe: pd.DataFrame
    :param dataframe: Uncleaned dataframe for comments; must contain an
        ``"Original Comment Text"`` column
    :rtype: pd.DataFrame
    :returns: The same dataframe with the ``"Cleaned Comment Text"`` column set
    """
    logger.info("Cleaning data for analysis...")

    dataframe["Cleaned Comment Text"] = (
        dataframe["Original Comment Text"]
        # missing values would otherwise flow through as NaN and break the
        # stop-word step below, so treat them as empty comments
        .fillna("")
        .astype(str)
        # remove whitespace
        .str.strip()
        # replace newlines with space (literal match, not a regex)
        .str.replace("\n", " ", regex=False)
        # remove mentions and links
        .str.replace(r"(?:\@|http?\://|https?\://|www)\S+", "", regex=True)
        # remove hashtags; this has to run before punctuation is stripped,
        # otherwise the leading '#' is already gone and nothing matches
        .str.replace(r"#\S+", " ", regex=True)
        # remove punctuations, emojis, special characters
        .str.replace(r"[^\w\s]+", "", regex=True)
        # turn into lowercase
        .str.lower()
        # remove numbers
        .str.replace(r"\d+", "", regex=True)
    )

    # remove stop words; a set makes each membership test O(1) instead of a
    # scan over the whole stop-word list
    stop_words = frozenset(stopwords.words("english"))
    # split()/join also collapses the runs of whitespace left by the removals
    dataframe["Cleaned Comment Text"] = dataframe["Cleaned Comment Text"].apply(
        lambda comment: " ".join(
            word for word in comment.split() if word not in stop_words
        )
    )
    return dataframe