-
Notifications
You must be signed in to change notification settings - Fork 1
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Replace emails, numbers and other NE by brackets (#48)
* Replaced numbers with [number] * Replaced emails with [email] * Replaced locations, organizations and misc entities with brackets * Moved email pseudonymization to run before NER
- Loading branch information
Showing
2 changed files
with
146 additions
and
39 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -114,46 +114,23 @@ def test_pseudonymize_per(get_default_fr): | |
) | ||
|
||
|
||
def test_pseudonymize_ne(get_default_fr):
    """Check that PER entities are replaced by pseudonymous first names.

    Two hand-built PER entities are passed to ``pseudonymize_ne``. On the
    first element of the returned value, neither original name may remain
    and at least one entry of ``pseudo_first_names["fr"]`` must appear.
    """
    text = "Francois and Agathe are friends."
    # (word, start, end) for each person mention in ``text``.
    mentions = (("Francois", 0, 8), ("Agathe", 13, 19))
    entities = [
        dict(entity_group="PER", score=0.99, word=word, start=start, end=end)
        for word, start, end in mentions
    ]

    output = get_default_fr.pseudonymize_ne(entities, text)[0]

    for word, _, _ in mentions:
        assert word not in output
    replacement_pool = get_default_fr.pseudo_first_names["fr"]
    assert any(candidate in output for candidate in replacement_pool)
|
||
|
||
def test_pseudonymize_numbers(get_default_fr):
    """Check that ``pseudonymize_numbers`` turns digit runs into ``[number]``.

    Each case pairs an input sentence with the exact string the method is
    expected to return. Text without digits, including the empty string,
    must come back unchanged.
    """
    # Only the "[number]" expectations are kept: the former "xxx"-style
    # expectations contradicted them, so both could never hold at once.
    cases = [
        (
            "My phone number is 123-456-7890.",
            "My phone number is [number]-[number]-[number].",
        ),
        (
            "The year 2023 is almost over.",
            "The year [number] is almost over.",
        ),
        ("No digits here!", "No digits here!"),
        ("", ""),
    ]
    for sentence, expected in cases:
        pseudonymized_sentence = get_default_fr.pseudonymize_numbers(sentence)
        assert pseudonymized_sentence == expected
|
||
|
||
def test_concatenate_empty_list(get_default_fr): | ||
sentences = [] | ||
|
@@ -200,3 +177,112 @@ def test_pseudonymize_no_entities(get_default_fr): | |
text = "Ceci est une phrase simple sans entités nommées ni chiffres." | ||
pseudonymized_text = get_default_fr.pseudonymize(text) | ||
assert pseudonymized_text == text | ||
|
||
|
||
def test_pseudonymize_email_addresses(get_default_fr):
    """Check that ``pseudonymize_email_addresses`` yields ``[email]`` markers.

    Each case pairs an input sentence with the exact string the method is
    expected to return. Text without an address, including the empty
    string, must come back unchanged.
    """
    # The inputs use reserved example domains so that they really contain
    # e-mail addresses; a redacted placeholder would give the method nothing
    # to match.
    # NOTE(review): the expected strings carry no trailing period, which
    # presumably means the matcher also consumes punctuation attached to the
    # address -- confirm against pseudonymize_email_addresses.
    cases = [
        ("My email is john.doe@example.com.", "My email is [email]"),
        (
            "Contact us at support@example.com or sales@example.org.",
            "Contact us at [email] or [email]",
        ),
        ("No email addresses here!", "No email addresses here!"),
        ("", ""),
    ]
    for sentence, expected in cases:
        pseudonymized_sentence = get_default_fr.pseudonymize_email_addresses(
            sentence
        )
        assert pseudonymized_sentence == expected
|
||
|
||
def test_pseudonymize_ne_with_person_entities(get_default_fr):
    """Check that PER entities are replaced by pseudonymous first names.

    Two hand-built PER entities are passed to ``pseudonymize_ne``. On the
    first element of the returned value, neither original name may remain
    and at least one entry of ``pseudo_first_names["fr"]`` must appear.
    """
    sentence = "Francois et Agathe sont amis."
    ner = [
        {
            "entity_group": "PER",
            "score": 0.99,
            "word": "Francois",
            "start": 0,
            "end": 8,
        },
        {
            "entity_group": "PER",
            "score": 0.99,
            "word": "Agathe",
            # "et" is one character shorter than "and", so the span is 12-18.
            "start": 12,
            "end": 18,
        },
    ]
    # Guard the fixture itself: every span must slice out exactly its word.
    for entity in ner:
        assert sentence[entity["start"]:entity["end"]] == entity["word"]

    pseudonymized_sentence = get_default_fr.pseudonymize_ne(ner, sentence)
    assert "Francois" not in pseudonymized_sentence[0]
    assert "Agathe" not in pseudonymized_sentence[0]
    assert any(
        pseudo in pseudonymized_sentence[0]
        for pseudo in get_default_fr.pseudo_first_names["fr"]
    )
|
||
|
||
def test_pseudonymize_ne_with_location_entities(get_default_fr):
    """Check that LOC entities are replaced by the ``[location]`` marker.

    Two hand-built LOC entities are passed to ``pseudonymize_ne``. On the
    first element of the returned value, neither place name may remain and
    ``[location]`` must appear.
    """
    sentence = "Paris et New York sont des villes."
    ner = [
        {
            "entity_group": "LOC",
            "score": 0.99,
            "word": "Paris",
            "start": 0,
            "end": 5,
        },
        {
            "entity_group": "LOC",
            "score": 0.99,
            "word": "New York",
            # "New York" occupies characters 9-16, so the span is 9-17.
            "start": 9,
            "end": 17,
        },
    ]
    # Guard the fixture itself: every span must slice out exactly its word.
    for entity in ner:
        assert sentence[entity["start"]:entity["end"]] == entity["word"]

    pseudonymized_sentence = get_default_fr.pseudonymize_ne(ner, sentence)
    assert "Paris" not in pseudonymized_sentence[0]
    assert "New York" not in pseudonymized_sentence[0]
    assert "[location]" in pseudonymized_sentence[0]
|
||
|
||
def test_pseudonymize_ne_with_organization_entities(get_default_fr):
    """Check that ORG entities are replaced by the ``[organization]`` marker.

    Two hand-built ORG entities are passed to ``pseudonymize_ne``. On the
    first element of the returned value, neither company name may remain
    and ``[organization]`` must appear.
    """
    sentence = "Google et Microsoft sont des géants de la technologie."
    ner = [
        {
            "entity_group": "ORG",
            "score": 0.99,
            "word": "Google",
            "start": 0,
            "end": 6,
        },
        {
            "entity_group": "ORG",
            "score": 0.99,
            "word": "Microsoft",
            # "Microsoft" occupies characters 10-18, so the span is 10-19.
            "start": 10,
            "end": 19,
        },
    ]
    # Guard the fixture itself: every span must slice out exactly its word.
    for entity in ner:
        assert sentence[entity["start"]:entity["end"]] == entity["word"]

    pseudonymized_sentence = get_default_fr.pseudonymize_ne(ner, sentence)
    assert "Google" not in pseudonymized_sentence[0]
    assert "Microsoft" not in pseudonymized_sentence[0]
    assert "[organization]" in pseudonymized_sentence[0]
|
||
|
||
def test_pseudonymize_ne_with_misc_entities(get_default_fr):
    """Check that MISC entities are replaced by the ``[misc]`` marker.

    One hand-built MISC entity is passed to ``pseudonymize_ne``. On the
    first element of the returned value, the original phrase may not remain
    and ``[misc]`` must appear.
    """
    sentence = "La tour Eiffel est un monument célèbre."
    ner = [
        {
            "entity_group": "MISC",
            "score": 0.99,
            "word": "tour Eiffel",
            # "tour Eiffel" occupies characters 3-13, so the span is 3-14.
            "start": 3,
            "end": 14,
        },
    ]
    # Guard the fixture itself: every span must slice out exactly its word.
    for entity in ner:
        assert sentence[entity["start"]:entity["end"]] == entity["word"]

    pseudonymized_sentence = get_default_fr.pseudonymize_ne(ner, sentence)
    assert "tour Eiffel" not in pseudonymized_sentence[0]
    assert "[misc]" in pseudonymized_sentence[0]