Skip to content

Commit

Permalink
Replace emails, numbers and other NE by brackets (#48)
Browse files Browse the repository at this point in the history
* Replaced numbers by [number]

* Replaced emails by [email]

* Replaced locations, organizations and misc with brackets

* Changed email pseudonymization to run before NER
  • Loading branch information
fexfl authored Nov 5, 2024
1 parent 5a4dd30 commit b9d14f8
Show file tree
Hide file tree
Showing 2 changed files with 146 additions and 39 deletions.
41 changes: 31 additions & 10 deletions mailcom/parse.py
Original file line number Diff line number Diff line change
Expand Up @@ -145,6 +145,7 @@ def pseudonymize_ne(self, ner, sentence):
for i in range(len(ner)):
entity = ner[i]
ent_string = entity["entity_group"] # noqa
ent_word = entity["word"]
# here we could check that string is "PER"
ent_conf = entity["score"] # noqa
ent_position = entity["start"], entity["end"]
Expand All @@ -156,14 +157,16 @@ def pseudonymize_ne(self, ner, sentence):
# replace PER
if ent_string == "PER":
# add the name of this entity to list
nelist.append(entity["word"])
else:
# Locations and Organizations
new_sentence = (
new_sentence[: (ent_position[0])]
+ "x" * (ent_position[1] - ent_position[0])
+ new_sentence[(ent_position[1]) :] # noqa
)
nelist.append(ent_word)
# replace LOC
elif ent_string == "LOC":
new_sentence = new_sentence.replace(ent_word, "[location]")
# replace ORG
elif ent_string == "ORG":
new_sentence = new_sentence.replace(ent_word, "[organization]")
# replace MISC
elif ent_string == "MISC":
new_sentence = new_sentence.replace(ent_word, "[misc]")
# replace all unique PER now
new_sentence = self.pseudonymize_per(new_sentence, nelist)

Expand All @@ -172,8 +175,25 @@ def pseudonymize_ne(self, ner, sentence):

def pseudonymize_numbers(self, sentence):
    """Replace every run of consecutive digits in ``sentence`` with ``[number]``.

    A run is a maximal sequence of characters for which ``str.isdigit()`` is
    true, so ``"123-456"`` becomes ``"[number]-[number]"``. All other
    characters are copied through unchanged.

    Args:
        sentence: The text to process.

    Returns:
        The text with each digit run collapsed into a single ``[number]``
        placeholder. An empty input yields an empty string.
    """
    # Function-scope import so the method does not depend on module imports.
    from itertools import groupby

    # groupby yields maximal runs of digit / non-digit characters in order;
    # a digit run becomes one placeholder, anything else is kept verbatim.
    return "".join(
        "[number]" if is_digit else "".join(chars)
        for is_digit, chars in groupby(sentence, key=str.isdigit)
    )

def pseudonymize_email_addresses(self, sentence):
    """Replace every whitespace-delimited token containing ``@`` with ``[email]``.

    The whole token is replaced, so punctuation attached to an address
    (for example a trailing full stop) is removed together with it.
    Tokens are delimited by any whitespace, not only the space character,
    so an address next to a newline or tab does not swallow the
    neighbouring word. The whitespace itself is preserved as-is.

    Args:
        sentence: The text to process.

    Returns:
        The text with address-like tokens replaced by ``[email]``.
    """
    # Function-scope import so the method does not depend on module imports.
    import re

    # The capturing group makes re.split return the separators as well, so
    # joining the pieces reproduces the original spacing exactly.
    pieces = re.split(r"(\s+)", sentence)
    return "".join("[email]" if "@" in piece else piece for piece in pieces)

def concatenate(self, sentences):
    """Join the given sentences into one string, separated by single spaces."""
    separator = " "
    return separator.join(sentences)
Expand All @@ -183,6 +203,7 @@ def pseudonymize(self, text: str):
sentences = self.get_sentences(text)
pseudonymized_sentences = []
for sent in sentences:
sent = self.pseudonymize_email_addresses(sent)
ner = self.get_ner(sent)
ps_sent = " ".join(self.pseudonymize_ne(ner, sent)) if ner else sent
ps_sent = self.pseudonymize_numbers(ps_sent)
Expand Down
144 changes: 115 additions & 29 deletions mailcom/test/test_parse.py
Original file line number Diff line number Diff line change
Expand Up @@ -114,46 +114,23 @@ def test_pseudonymize_per(get_default_fr):
)


def test_pseudonymize_ne(get_default_fr):
    """PER entities must disappear and a French pseudonym must appear."""
    sentence = "Francois and Agathe are friends."
    # Entity records as passed to pseudonymize_ne; start/end are character
    # offsets into ``sentence``.
    francois = {
        "entity_group": "PER",
        "score": 0.99,
        "word": "Francois",
        "start": 0,
        "end": 8,
    }
    agathe = {
        "entity_group": "PER",
        "score": 0.99,
        "word": "Agathe",
        "start": 13,
        "end": 19,
    }
    result = get_default_fr.pseudonymize_ne([francois, agathe], sentence)
    for original_name in ("Francois", "Agathe"):
        assert original_name not in result[0]
    known_pseudonyms = get_default_fr.pseudo_first_names["fr"]
    assert any(pseudo in result[0] for pseudo in known_pseudonyms)


def test_pseudonymize_numbers(get_default_fr):
    """Each run of digits is replaced by one ``[number]`` placeholder."""
    # (input sentence, expected output). The former "xxx" expectations are
    # dropped because they contradict the "[number]" replacement.
    cases = [
        (
            "My phone number is 123-456-7890.",
            "My phone number is [number]-[number]-[number].",
        ),
        ("The year 2023 is almost over.", "The year [number] is almost over."),
        ("No digits here!", "No digits here!"),
        ("", ""),
    ]
    for sentence, expected in cases:
        assert get_default_fr.pseudonymize_numbers(sentence) == expected


def test_concatenate_empty_list(get_default_fr):
sentences = []
Expand Down Expand Up @@ -200,3 +177,112 @@ def test_pseudonymize_no_entities(get_default_fr):
text = "Ceci est une phrase simple sans entités nommées ni chiffres."
pseudonymized_text = get_default_fr.pseudonymize(text)
assert pseudonymized_text == text


def test_pseudonymize_email_addresses(get_default_fr):
    """Tokens containing ``@`` are replaced by ``[email]``."""
    # NOTE(review): the sample addresses are placeholders on reserved example
    # domains; the literals in the captured source were unreadable
    # ("[email protected]") and contained no "@", so they could not trigger
    # the replacement.
    # The expected strings lack the final full stop because the whole
    # space-delimited token, including attached punctuation, is replaced.
    cases = [
        ("My email is jane.doe@example.com.", "My email is [email]"),
        (
            "Contact us at support@example.com or sales@example.org.",
            "Contact us at [email] or [email]",
        ),
        ("No email addresses here!", "No email addresses here!"),
        ("", ""),
    ]
    for sentence, expected in cases:
        assert get_default_fr.pseudonymize_email_addresses(sentence) == expected


def test_pseudonymize_ne_with_person_entities(get_default_fr):
    """PER entities must disappear and a French pseudonym must appear."""
    sentence = "Francois et Agathe sont amis."
    ner = [
        {
            "entity_group": "PER",
            "score": 0.99,
            "word": "Francois",
            "start": 0,
            "end": 8,
        },
        {
            "entity_group": "PER",
            "score": 0.99,
            "word": "Agathe",
            # "Agathe" starts at index 12 in the French sentence.
            "start": 12,
            "end": 18,
        },
    ]
    # Guard the fixture itself: offsets must slice out the entity text.
    for entity in ner:
        assert sentence[entity["start"] : entity["end"]] == entity["word"]
    pseudonymized_sentence = get_default_fr.pseudonymize_ne(ner, sentence)
    assert "Francois" not in pseudonymized_sentence[0]
    assert "Agathe" not in pseudonymized_sentence[0]
    assert any(
        pseudo in pseudonymized_sentence[0]
        for pseudo in get_default_fr.pseudo_first_names["fr"]
    )


def test_pseudonymize_ne_with_location_entities(get_default_fr):
    """LOC entities must be replaced by the ``[location]`` placeholder."""
    sentence = "Paris et New York sont des villes."
    ner = [
        {
            "entity_group": "LOC",
            "score": 0.99,
            "word": "Paris",
            "start": 0,
            "end": 5,
        },
        {
            "entity_group": "LOC",
            "score": 0.99,
            "word": "New York",
            # "New York" spans indices 9..16 of the sentence.
            "start": 9,
            "end": 17,
        },
    ]
    # Guard the fixture itself: offsets must slice out the entity text.
    for entity in ner:
        assert sentence[entity["start"] : entity["end"]] == entity["word"]
    pseudonymized_sentence = get_default_fr.pseudonymize_ne(ner, sentence)
    assert "Paris" not in pseudonymized_sentence[0]
    assert "New York" not in pseudonymized_sentence[0]
    assert "[location]" in pseudonymized_sentence[0]


def test_pseudonymize_ne_with_organization_entities(get_default_fr):
    """ORG entities must be replaced by the ``[organization]`` placeholder."""
    sentence = "Google et Microsoft sont des géants de la technologie."
    ner = [
        {
            "entity_group": "ORG",
            "score": 0.99,
            "word": "Google",
            "start": 0,
            "end": 6,
        },
        {
            "entity_group": "ORG",
            "score": 0.99,
            "word": "Microsoft",
            # "Microsoft" spans indices 10..18 of the sentence.
            "start": 10,
            "end": 19,
        },
    ]
    # Guard the fixture itself: offsets must slice out the entity text.
    for entity in ner:
        assert sentence[entity["start"] : entity["end"]] == entity["word"]
    pseudonymized_sentence = get_default_fr.pseudonymize_ne(ner, sentence)
    assert "Google" not in pseudonymized_sentence[0]
    assert "Microsoft" not in pseudonymized_sentence[0]
    assert "[organization]" in pseudonymized_sentence[0]


def test_pseudonymize_ne_with_misc_entities(get_default_fr):
    """MISC entities must be replaced by the ``[misc]`` placeholder."""
    sentence = "La tour Eiffel est un monument célèbre."
    ner = [
        {
            "entity_group": "MISC",
            "score": 0.99,
            "word": "tour Eiffel",
            # "tour Eiffel" is 11 characters long and starts at index 3.
            "start": 3,
            "end": 14,
        },
    ]
    # Guard the fixture itself: offsets must slice out the entity text.
    for entity in ner:
        assert sentence[entity["start"] : entity["end"]] == entity["word"]
    pseudonymized_sentence = get_default_fr.pseudonymize_ne(ner, sentence)
    assert "tour Eiffel" not in pseudonymized_sentence[0]
    assert "[misc]" in pseudonymized_sentence[0]

0 comments on commit b9d14f8

Please sign in to comment.