-
Notifications
You must be signed in to change notification settings - Fork 16
/
Copy pathhashtag_parse.py
52 lines (32 loc) · 1.15 KB
/
hashtag_parse.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
# Module metadata: author handle and version string for this file.
__author__ = 'seandolinar'
__version__ = '0.1.0'
import collections as col
import pandas as pd
import re
class HashtagParse(object):
    """Extract hashtags from text items and keep running totals.

    Attributes:
        list: Not used by this class; kept so existing callers that touch
            it keep working.
        dict: ``collections.Counter`` mapping each hashtag (including the
            leading ``#``) to the number of items it appeared in.
        baskets: One list of unique hashtags for every counted item that
            contained at least one hashtag.
    """

    # Any character that is not an ASCII letter, a digit, '#' or a space is
    # treated as a separator.  The range is 0-9 (not 1-9) so that tags such
    # as "#top10" keep their zeros.
    _SEPARATORS = re.compile(r'[^A-Za-z0-9# ]')

    def __init__(self):
        self.list = []
        self.dict = col.Counter()
        self.baskets = []

    def count(self, text, case=False):
        """Find the hashtags in ``text`` and add them to the running totals.

        A hashtag repeated inside one item is counted once for that item.

        Args:
            text: The string to scan.
            case: When false (the default) the text is lower-cased first, so
                ``#Tag`` and ``#tag`` are the same hashtag.  When true the
                original casing is preserved.

        Returns:
            The unique hashtags found in ``text``, in order of first
            appearance.  The list is empty when there are none.
        """
        if not case:
            text = text.lower()

        # Pad every '#' with a space so tags glued to each other ("#a#b") or
        # to a preceding word ("word#a") become separate tokens.
        text = text.replace('#', ' #')
        text = self._SEPARATORS.sub(' ', text)

        # De-duplicate while keeping first-seen order so results are
        # reproducible; skip a bare '#' that has no tag text after it.
        seen = set()
        tags = []
        for token in text.split():
            if token.startswith('#') and len(token) > 1 and token not in seen:
                seen.add(token)
                tags.append(token)

        # Tolerate callers that reset ``self.dict`` to a plain dict.
        if not isinstance(self.dict, col.Counter):
            self.dict = col.Counter(self.dict)
        self.dict.update(tags)

        if tags:
            self.baskets.append(tags)
        return tags

    def create_csv(self, file='hashtag_out.csv'):
        """Write the hashtag totals to a CSV file.

        The output has a header row ``hashtag,total`` followed by one row
        per hashtag.

        Args:
            file: Destination path, or an already open file object.
        """
        rows = list(self.dict.items())
        hashtag_count_df = pd.DataFrame(rows, columns=['hashtag', 'total'])
        # Let pandas open the target itself so the utf-8 encoding and its
        # newline handling are actually applied to the file.
        hashtag_count_df.to_csv(file, sep=',', index=False, encoding='utf-8')
        return