Skip to content

Commit

Permalink
Replaced emails by [email]
Browse files Browse the repository at this point in the history
  • Loading branch information
fexfl committed Oct 31, 2024
1 parent 720b9f2 commit 799faf6
Show file tree
Hide file tree
Showing 2 changed files with 29 additions and 0 deletions.
11 changes: 11 additions & 0 deletions mailcom/parse.py
Original file line number Diff line number Diff line change
Expand Up @@ -182,6 +182,16 @@ def pseudonymize_numbers(self, sentence):

return "".join(new_list)

def pseudonymize_email_addresses(self, sentence):
    """Replace every whitespace-delimited token containing ``@`` by ``[email]``.

    A token is a maximal run of non-whitespace characters. The whole token
    is replaced, so punctuation glued to an address (for example a trailing
    full stop) is removed together with it. All whitespace between tokens,
    including repeated spaces, tabs and line breaks, is kept unchanged.

    Args:
        sentence: The text to process.

    Returns:
        The text with each token that contains ``@`` replaced by the
        literal placeholder ``[email]``.
    """
    # Imported locally so this method does not depend on the module's
    # import block.
    import re

    def _mask(match):
        token = match.group()
        return "[email]" if "@" in token else token

    # Tokenising on any whitespace (not only " ") stops an address that
    # follows a newline or tab from dragging the neighbouring word into
    # the replacement.
    return re.sub(r"\S+", _mask, sentence)

def concatenate(self, sentences):
    """Join the given sentences into one string, separated by single spaces."""
    separator = " "
    joined = separator.join(sentences)
    return joined

Expand All @@ -193,6 +203,7 @@ def pseudonymize(self, text: str):
ner = self.get_ner(sent)
ps_sent = " ".join(self.pseudonymize_ne(ner, sent)) if ner else sent
ps_sent = self.pseudonymize_numbers(ps_sent)
ps_sent = self.pseudonymize_email_addresses(ps_sent)
pseudonymized_sentences.append(ps_sent)
return self.concatenate(pseudonymized_sentences)

Expand Down
18 changes: 18 additions & 0 deletions mailcom/test/test_parse.py
Original file line number Diff line number Diff line change
Expand Up @@ -204,3 +204,21 @@ def test_pseudonymize_no_entities(get_default_fr):
text = "Ceci est une phrase simple sans entités nommées ni chiffres."
pseudonymized_text = get_default_fr.pseudonymize(text)
assert pseudonymized_text == text


def test_pseudonymize_email_addresses(get_default_fr):
    """Check that tokens containing ``@`` are replaced by ``[email]``.

    The inputs use addresses on the reserved example domains. Each input
    must contain a literal ``@``; otherwise the method under test has
    nothing to replace and the expected outputs below cannot be produced.
    """
    # Single address; the trailing full stop is part of the same
    # space-delimited token and is therefore replaced along with it.
    sentence = "My email is jane.doe@example.com."
    pseudonymized_sentence = get_default_fr.pseudonymize_email_addresses(sentence)
    assert pseudonymized_sentence == "My email is [email]"

    # Several addresses in one sentence are each replaced.
    sentence = "Contact us at support@example.com or sales@example.org."
    pseudonymized_sentence = get_default_fr.pseudonymize_email_addresses(sentence)
    assert pseudonymized_sentence == "Contact us at [email] or [email]"

    # Text without any "@" is returned unchanged.
    sentence = "No email addresses here!"
    pseudonymized_sentence = get_default_fr.pseudonymize_email_addresses(sentence)
    assert pseudonymized_sentence == "No email addresses here!"

    # Empty input stays empty.
    sentence = ""
    pseudonymized_sentence = get_default_fr.pseudonymize_email_addresses(sentence)
    assert pseudonymized_sentence == ""

0 comments on commit 799faf6

Please sign in to comment.