Skip to content

Commit

Permalink
Merge pull request #47 from ssciwr/pseudonymization-tests
Browse files Browse the repository at this point in the history
Pseudonymization class method tests
  • Loading branch information
fexfl authored Oct 31, 2024
2 parents f8e9afd + 61732c9 commit 5a4dd30
Show file tree
Hide file tree
Showing 3 changed files with 200 additions and 8 deletions.
15 changes: 12 additions & 3 deletions mailcom/parse.py
Original file line number Diff line number Diff line change
Expand Up @@ -67,9 +67,18 @@ def init_spacy(self, language: str, model="default"):
model, exclude=["morphologizer", "attribute_ruler", "lemmatizer", "ner"]
)
except OSError:
raise OSError("Could not find {} in standard directory.".format(model))

self.nlp_spacy = sp.load(model)
try:
print(
"Could not find model in standard directory. Trying to download model from repo." # noqa
)
# try downloading model
sp.cli.download(model)
self.nlp_spacy = sp.load(
model,
exclude=["morphologizer", "attribute_ruler", "lemmatizer", "ner"],
)
except SystemExit:
raise SystemExit("Could not download {} from repo".format(model))

def init_transformers(
self,
Expand Down
191 changes: 186 additions & 5 deletions mailcom/test/test_parse.py
Original file line number Diff line number Diff line change
@@ -1,21 +1,202 @@
from mailcom.parse import make_dir, check_dir
from mailcom import parse
import pytest


# These tests were written when the directory helpers took plain strings.
# With the update to Path, the tests need to be changed accordingly.
def test_check_dir(tmpdir):
mydir = tmpdir.mkdir("sub")
assert check_dir(str(mydir))
assert parse.check_dir(str(mydir))
with pytest.raises(OSError):
check_dir(str(tmpdir.join("sub2")))
parse.check_dir(str(tmpdir.join("sub2")))


def test_make_dir(tmpdir):
mydir = tmpdir.join("sub")
make_dir(str(mydir))
parse.make_dir(str(mydir))
assert mydir.check()


def test_check_dir_fail():
with pytest.raises(OSError):
check_dir(str("mydir"))
parse.check_dir(str("mydir"))


@pytest.fixture()
def get_instant():
    """Provide a freshly constructed ``parse.Pseudonymize`` object.

    The fixture itself calls none of the ``init_*`` methods; tests that
    need them must call them explicitly.
    """
    pseudonymizer = parse.Pseudonymize()
    return pseudonymizer


@pytest.fixture()
def get_default_fr():
    """Provide a ``parse.Pseudonymize`` object initialised for French.

    spaCy is initialised for the ``"fr"`` language and the transformers
    pipeline is initialised with its default arguments.
    """
    pseudonymizer = parse.Pseudonymize()
    pseudonymizer.init_spacy("fr")
    pseudonymizer.init_transformers()
    return pseudonymizer


def test_init_spacy(get_instant):
    """Check that ``init_spacy`` rejects unknown languages and unknown models."""
    # An unsupported language key is expected to surface as a KeyError.
    with pytest.raises(KeyError):
        get_instant.init_spacy("not_a_language")

    # A model name that cannot be loaded or downloaded must end in SystemExit.
    bogus_model = "not_an_existing_spacy_model"
    with pytest.raises(SystemExit):
        get_instant.init_spacy("fr", bogus_model)


def test_init_transformers(get_instant):
    """Check ``init_transformers`` with default, invalid-model and invalid-revision arguments."""
    # Default model and revision: the recognizer attribute must be set.
    get_instant.init_transformers()
    assert get_instant.ner_recognizer is not None

    # An invalid model name and an invalid revision number must each raise
    # OSError; the cases are tried in the same order as listed here.
    failing_kwargs = (
        {"model": "invalid-model"},
        {
            "model": "xlm-roberta-large-finetuned-conll03-english",
            "model_revision_number": "invalid-revision",
        },
    )
    for kwargs in failing_kwargs:
        with pytest.raises(OSError):
            get_instant.init_transformers(**kwargs)


def test_reset(get_default_fr):
    """Check that ``reset`` empties ``used_first_names`` after each email.

    Two sample texts are pseudonymized one after the other; after each
    ``reset()`` call the list of used first names must be empty.
    """
    sample_texts = [
        "ceci est un exemple de texte écrit par Claude. Il contient trois noms différents, comme celui de Dominique. Voyons si Martin est reconnu.",  # noqa
        "ceci est un exemple de texte écrit par Francois. Il contient trois noms différents, comme celui de Agathe. Voyons si Antoine est reconnu.",  # noqa
    ]
    for text in sample_texts:
        # Pseudonymize one email, then clear the per-email state.
        get_default_fr.pseudonymize(text)
        get_default_fr.reset()
        # The used-names list should be cleared after every email, so the
        # check runs inside the loop rather than once after the last text.
        assert len(get_default_fr.used_first_names) == 0


def test_get_ner(get_default_fr):
    """Check that ``get_ner`` returns a truthy result for every sentence.

    Each sentence of the sample text contains a first name.
    """
    text = "ceci est un exemple de texte écrit par Claude. Il contient trois noms différents, comme celui de Dominique. Voyons si Martin est reconnu."  # noqa
    sents = get_default_fr.get_sentences(text)
    # Guard against a vacuous pass: with no sentences the loop below would
    # never execute and the test would succeed without checking anything.
    assert sents
    for sent in sents:
        assert get_default_fr.get_ner(sent)


def test_get_sentences_empty_string(get_default_fr):
    """An empty input string must yield an empty list of sentences."""
    sentences = get_default_fr.get_sentences("")
    assert sentences == []


def test_get_sentences_multiple_sentences(get_default_fr):
    """A three-sentence French text must be split into its three sentences."""
    expected = [
        "Ceci est la première phrase.",
        "Voici la deuxième phrase.",
        "Et enfin, la troisième phrase.",
    ]
    text = "Ceci est la première phrase. Voici la deuxième phrase. Et enfin, la troisième phrase."  # noqa
    sentences = get_default_fr.get_sentences(text)
    assert len(sentences) == 3
    # Compare position by position so a failure names the offending sentence.
    for idx, wanted in enumerate(expected):
        assert sentences[idx] == wanted


def test_get_sentences_with_punctuation(get_default_fr):
    """Sentences ending in ``!``, ``?`` and ``.`` must each be split off."""
    expected = ["Bonjour!", "Comment ça va?", "Très bien, merci."]
    sentences = get_default_fr.get_sentences(
        "Bonjour! Comment ça va? Très bien, merci."
    )
    assert len(sentences) == 3
    for idx, wanted in enumerate(expected):
        assert sentences[idx] == wanted


def test_pseudonymize_per(get_default_fr):
    """Check that ``pseudonymize_per`` replaces the listed person names.

    Neither original name may remain, and at least one of the French
    pseudonym first names must appear in the result.
    """
    originals = ("Francois", "Agathe")
    # Pass a fresh list so the checks below do not depend on whether the
    # method modifies its argument.
    result = get_default_fr.pseudonymize_per(
        "Francois and Agathe are friends.", list(originals)
    )
    for name in originals:
        assert name not in result
    candidates = get_default_fr.pseudo_first_names["fr"]
    assert any(candidate in result for candidate in candidates)


def test_pseudonymize_ne(get_default_fr):
    """Check that ``pseudonymize_ne`` replaces names described by NER entries.

    The NER input is hand-built: two ``PER`` entities with the character
    offsets of "Francois" and "Agathe" in the sample sentence.
    """
    sentence = "Francois and Agathe are friends."
    # (word, start, end) for each person entity in ``sentence``.
    spans = (("Francois", 0, 8), ("Agathe", 13, 19))
    ner = [
        {
            "entity_group": "PER",
            "score": 0.99,
            "word": word,
            "start": start,
            "end": end,
        }
        for word, start, end in spans
    ]
    result = get_default_fr.pseudonymize_ne(ner, sentence)
    # Only the first element of the returned value is inspected.
    first = result[0]
    assert "Francois" not in first
    assert "Agathe" not in first
    candidates = get_default_fr.pseudo_first_names["fr"]
    assert any(candidate in first for candidate in candidates)


def test_pseudonymize_numbers(get_default_fr):
    """Check that ``pseudonymize_numbers`` replaces every digit with ``x``.

    Text without digits must come back unchanged.
    """
    cases = (
        ("My phone number is 123-456-7890.", "My phone number is xxx-xxx-xxxx."),
        ("The year 2023 is almost over.", "The year xxxx is almost over."),
        ("No digits here!", "No digits here!"),
    )
    for sentence, expected in cases:
        assert get_default_fr.pseudonymize_numbers(sentence) == expected


def test_concatenate_empty_list(get_default_fr):
    """Concatenating an empty list of sentences must give an empty string."""
    assert get_default_fr.concatenate([]) == ""


def test_concatenate_multiple_sentences(get_default_fr):
    """Three sentences must be joined into one string separated by single spaces."""
    expected = "This is the first sentence. This is the second sentence. This is the third sentence."  # noqa
    parts = [
        "This is the first sentence.",
        "This is the second sentence.",
        "This is the third sentence.",
    ]
    joined = get_default_fr.concatenate(parts)
    assert joined == expected


def test_pseudonymize(get_default_fr):
    """Check ``pseudonymize`` end to end on a text with names and a phone number."""
    result = get_default_fr.pseudonymize(
        "Francois et Agathe sont amis. Mon numéro de téléphone est 123-456-7890."
    )

    # Names: the originals must be gone and a French pseudonym must be present.
    for name in ("Francois", "Agathe"):
        assert name not in result
    candidates = get_default_fr.pseudo_first_names["fr"]
    assert any(candidate in result for candidate in candidates)

    # Numbers: the original phone number must no longer appear.
    assert "123-456-7890" not in result


def test_pseudonymize_empty_string(get_default_fr):
    """Pseudonymizing an empty string must return an empty string."""
    assert get_default_fr.pseudonymize("") == ""


def test_pseudonymize_no_entities(get_default_fr):
    """Text without named entities or digits must be returned unchanged."""
    original = "Ceci est une phrase simple sans entités nommées ni chiffres."
    result = get_default_fr.pseudonymize(original)
    assert result == original
2 changes: 2 additions & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
@@ -1,2 +1,4 @@
spacy
fr_core_news_md @ https://github.com/explosion/spacy-models/releases/download/fr_core_news_md-3.8.0/fr_core_news_md-3.8.0-py3-none-any.whl
es_core_news_md @ https://github.com/explosion/spacy-models/releases/download/es_core_news_md-3.8.0/es_core_news_md-3.8.0-py3-none-any.whl
transformers

0 comments on commit 5a4dd30

Please sign in to comment.