articles_classification.py
#!/usr/bin/env python
# coding: utf-8
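# findspark locates the local Spark installation so that pyspark can be imported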
import findspark
findspark.init()
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, concat_ws
from pyspark.ml.feature import (RegexTokenizer, StopWordsRemover,
StringIndexer, HashingTF, IDF)
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
# start a Spark session (getOrCreate reuses one if already running)
spark = SparkSession.builder.appName('articles_classification').getOrCreate()
# read the CSV file into a DataFrame
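# header=True treats the first row as column names; inferSchema=True makes
# Spark scan the data to guess column types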
data = spark.read.csv('articles.csv', header=True, inferSchema=True)
# drop rows with null values and duplicate articles
data = data.na.drop() \
.distinct()
data.show()
# combine Title and Content into a single 'article' column, then drop the
# originals; concat_ws inserts a space so the words at the boundary don't fuse
data_combined = data.withColumn('article', concat_ws(' ', 'Title', 'Content')) \
    .drop('Title') \
    .drop('Content')
# show the categories and the number of articles for each
data_combined.groupBy("Category") \
.count() \
.orderBy(col("count").desc()) \
.show()
# word tokenization
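# RegexTokenizer splits text on the given regex; \W matches any non-word
# character, and tokens are lowercased by default (toLowercase=True)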
regex_tokenizer = RegexTokenizer(pattern=r'\W',
                                 inputCol='article',
                                 outputCol='words')
# apply tokenization
words = regex_tokenizer.transform(data_combined).drop('article')
words.show()
# stopword remover object
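# the default stop word list is English
# (StopWordsRemover.loadDefaultStopWords('english'))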
remover = StopWordsRemover(inputCol='words', outputCol='filtered')
# apply stopword removal
filtered_words = remover.transform(words).drop('words')
filtered_words.show()
# defining a HashingTF object
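# HashingTF hashes each term into a slot of a fixed-size term-frequency vector;
# the default numFeatures is 2**18 (262144), so distinct terms can collide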
hashingTF = HashingTF(inputCol='filtered', outputCol='tf')
# transform the filtered words into term-frequency vectors
tf = hashingTF.transform(filtered_words).drop('filtered')
# the output column 'features' holds the TF-IDF vector
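# IDF is an estimator: fit() computes document frequencies over the corpus,
# and transform() down-weights terms that appear in many articles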
idf = IDF(inputCol='tf', outputCol='features')
idf_model = idf.fit(tf)
# transforming the data into TF-IDF vectors
tf_idf = idf_model.transform(tf).drop('tf')
# convert the Category class into a numeric label
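# StringIndexer orders labels by frequency by default, so the most common
# category receives index 0.0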
category_numeric = StringIndexer(inputCol='Category', outputCol='label')
ready_data = category_numeric.fit(tf_idf) \
    .transform(tf_idf).drop('Category')
# optionally keep only the label and features columns
# ready_data = ready_data.select(['label', 'features'])
ready_data.show()
# splitting the data into 70% training and 30% testing
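# randomSplit is random and only approximately 70/30; pass a seed,
# e.g. randomSplit([0.7, 0.3], seed=42), for a reproducible split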
train_data, test_data = ready_data.randomSplit([0.7, 0.3])
# define a LogisticRegression estimator
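# LogisticRegression reads the default 'features' and 'label' columns and
# fits a multinomial model automatically when there are more than two classes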
lr = LogisticRegression()
# training the model
lr_model = lr.fit(train_data)
# get predictions on the test data
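# transform() appends rawPrediction, probability, and prediction columns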
test_results = lr_model.transform(test_data)
# evaluation: MulticlassClassificationEvaluator defaults to F1, so request
# accuracy explicitly
acc_eval = MulticlassClassificationEvaluator(metricName='accuracy')
acc = acc_eval.evaluate(test_results)
print('Test accuracy:', acc)