-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathenhance_extract_and_validate_emails.py
40 lines (31 loc) · 1.33 KB
/
enhance_extract_and_validate_emails.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
import re
import etl_plugin_core
from email_validator import validate_email, EmailNotValidError
class enhance_extract_and_validate_emails(object):
    """ETL plugin that extracts and validates email addresses from document text.

    Candidate addresses are found with a permissive regular expression and
    kept only if ``email_validator.validate_email`` accepts them (syntax
    check only, no deliverability lookup). Accepted addresses are stored in
    the ``email_ss`` facet; sender, recipient and domain facets are derived
    from them.
    """

    # Raw strings keep the regex escapes (\w, \.) from being parsed as
    # invalid string escape sequences. Compiled once, reused on every call.
    _EMAIL_PATTERN = re.compile(r'[\w\.-]+@[\w\.-]+', re.IGNORECASE)
    _FROM_PATTERN = re.compile(r'From: (.* )?([\w\.-]+@[\w\.-]+)', re.IGNORECASE)
    _TO_PATTERN = re.compile(r'To: (.* )?([\w\.-]+@[\w\.-]+)', re.IGNORECASE)

    def process(self, parameters=None, data=None):
        """Add email related facets to ``data``.

        Args:
            parameters: Plugin parameters; returned untouched. A fresh dict
                is used when omitted.
            data: Document data dict; modified in place. A fresh dict is
                used when omitted, so calls never share state through the
                default argument.

        Returns:
            The tuple ``(parameters, data)``.
        """
        if parameters is None:
            parameters = {}
        if data is None:
            data = {}

        text = etl_plugin_core.get_text(data=data)

        # Collect every candidate that passes validation, in normalized form.
        for match in self._EMAIL_PATTERN.finditer(text):
            candidate = match.group(0)
            try:
                valid = validate_email(candidate, check_deliverability=False)
            except EmailNotValidError:
                # Deliberate best effort: the regex is permissive, so
                # rejected candidates are expected and simply skipped.
                continue
            # NOTE(review): newer email_validator releases appear to prefer
            # ``valid.normalized`` over ``valid.email`` - confirm against the
            # installed version before switching.
            etl_plugin_core.append(data, 'email_ss', valid.email)

        # Only derive the specialized facets if any email address is known.
        if 'email_ss' in data:

            # Extract the email addresses of senders (From).
            for match in self._FROM_PATTERN.finditer(text):
                etl_plugin_core.append(data, 'Message-From_ss', match.group(2))

            # Extract the email addresses of recipients (To). The pattern is
            # not anchored, so it also matches inside e.g. "Reply-To: ".
            for match in self._TO_PATTERN.finditer(text):
                etl_plugin_core.append(data, 'Message-To_ss', match.group(2))

            # Extract the domain part of every email address so that email
            # domains can be used as a facet. The facet is rebuilt from
            # scratch on each call.
            data['email_domain_ss'] = []
            emails = data['email_ss']
            if not isinstance(emails, list):
                # The facet may hold a single value instead of a list.
                emails = [emails]

            for email in emails:
                domain = email.split('@')[1]
                etl_plugin_core.append(data, 'email_domain_ss', domain)

        return parameters, data