From 99a562d609558485cf9fb6cbc47de75939d70d52 Mon Sep 17 00:00:00 2001 From: Felix <65565033+fexfl@users.noreply.github.com> Date: Tue, 15 Oct 2024 14:10:41 +0200 Subject: [PATCH 01/13] Added test for init_spacy --- mailcom/test/test_parse.py | 23 ++++++++++++++++++----- 1 file changed, 18 insertions(+), 5 deletions(-) diff --git a/mailcom/test/test_parse.py b/mailcom/test/test_parse.py index 80e8caf..f006131 100644 --- a/mailcom/test/test_parse.py +++ b/mailcom/test/test_parse.py @@ -1,4 +1,4 @@ -from mailcom.parse import make_dir, check_dir +from mailcom import parse import pytest @@ -6,16 +6,29 @@ # with the update to Path, we need to change the tests def test_check_dir(tmpdir): mydir = tmpdir.mkdir("sub") - assert check_dir(str(mydir)) + assert parse.check_dir(str(mydir)) with pytest.raises(OSError): - check_dir(str(tmpdir.join("sub2"))) + parse.check_dir(str(tmpdir.join("sub2"))) def test_make_dir(tmpdir): mydir = tmpdir.join("sub") - make_dir(str(mydir)) + parse.make_dir(str(mydir)) assert mydir.check() + def test_check_dir_fail(): with pytest.raises(OSError): - check_dir(str("mydir")) + parse.check_dir(str("mydir")) + + +@pytest.fixture() +def get_instant(): + return parse.Pseudonymize() + + +def test_init_spacy(get_instant): + with pytest.raises(KeyError): + get_instant.init_spacy("not_a_language") + with pytest.raises(OSError): + get_instant.init_spacy("fr", "not_an_existing_spacy_model") From c6121c4e2b88a967793a97190e09e9aeed844fda Mon Sep 17 00:00:00 2001 From: Felix <65565033+fexfl@users.noreply.github.com> Date: Tue, 15 Oct 2024 14:36:39 +0200 Subject: [PATCH 02/13] Added test for reset --- mailcom/test/test_parse.py | 43 +++++++++++++++++++++++++++++++++++++- 1 file changed, 42 insertions(+), 1 deletion(-) diff --git a/mailcom/test/test_parse.py b/mailcom/test/test_parse.py index f006131..fee6023 100644 --- a/mailcom/test/test_parse.py +++ b/mailcom/test/test_parse.py @@ -1,5 +1,11 @@ -from mailcom import parse +from mailcom import parse, inout import pytest +from pathlib import Path +from importlib import resources + +pkg = resources.files("mailcom") + +FILE_PATH = Path(pkg / "test" / "data") # these worked when we were using strings @@ -27,8 +33,43 @@ def get_instant(): return parse.Pseudonymize() +@pytest.fixture() +def get_default_fr(): + inst = parse.Pseudonymize() + inst.init_spacy("fr") + inst.init_transformers() + return inst + + +@pytest.fixture() +def get_sample_texts(): + inst = inout.InoutHandler(FILE_PATH) + inst.list_of_files() + email_list = [] + for file in inst.email_list: + text = inst.get_text(file) + text = inst.get_html_text(text) + if not text: + continue + email_list.append(text) + return email_list + + def test_init_spacy(get_instant): with pytest.raises(KeyError): get_instant.init_spacy("not_a_language") with pytest.raises(OSError): get_instant.init_spacy("fr", "not_an_existing_spacy_model") + + +# TODO init_transformers + + +def test_reset(get_default_fr, get_sample_texts): + for text in get_sample_texts: + # Test that used names lists are empty + # They should be cleared after every email + assert len(get_default_fr.used_first_names) == 0 + assert len(get_default_fr.used_last_names) == 0 + # pseudonymize email + get_default_fr.pseudonymize(text) From ccb078418dda380f4b1d7c0a3d5540a07543e20e Mon Sep 17 00:00:00 2001 From: Felix <65565033+fexfl@users.noreply.github.com> Date: Tue, 15 Oct 2024 15:04:50 +0200 Subject: [PATCH 03/13] Added test for get_sentences --- mailcom/test/test_parse.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/mailcom/test/test_parse.py b/mailcom/test/test_parse.py index fee6023..aefde24 100644 --- a/mailcom/test/test_parse.py +++ b/mailcom/test/test_parse.py @@ -73,3 +73,8 @@ def test_reset(get_default_fr, get_sample_texts): assert len(get_default_fr.used_last_names) == 0 # pseudonymize email get_default_fr.pseudonymize(text) + + +def test_get_sentences(get_default_fr): + text = "ceci est un exemple de texte. Il doit comprendre 3 phrases. Si ce n’est pas le cas, quelque chose ne va vraiment pas." # noqa + assert len(get_default_fr.get_sentences(text)) == 3 From be4795bb2f0b8e3e9c38ba22d7e32717e3c87b9c Mon Sep 17 00:00:00 2001 From: Felix <65565033+fexfl@users.noreply.github.com> Date: Tue, 15 Oct 2024 15:11:56 +0200 Subject: [PATCH 04/13] Added test for get_ner --- mailcom/test/test_parse.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/mailcom/test/test_parse.py b/mailcom/test/test_parse.py index aefde24..d2754e5 100644 --- a/mailcom/test/test_parse.py +++ b/mailcom/test/test_parse.py @@ -78,3 +78,10 @@ def test_reset(get_default_fr, get_sample_texts): def test_get_sentences(get_default_fr): text = "ceci est un exemple de texte. Il doit comprendre 3 phrases. Si ce n’est pas le cas, quelque chose ne va vraiment pas." # noqa assert len(get_default_fr.get_sentences(text)) == 3 + + +def test_get_ner(get_default_fr): + text = "ceci est un exemple de texte écrit par Claude. Il contient trois noms différents, comme celui de Dominique. Voyons si Martin est reconnu." # noqa + sents = get_default_fr.get_sentences(text) + for sent in sents: + assert get_default_fr.get_ner(sent) From 55d0a4d09b5a9547d3d464c61df86d722739d592 Mon Sep 17 00:00:00 2001 From: Felix <65565033+fexfl@users.noreply.github.com> Date: Tue, 15 Oct 2024 15:18:34 +0200 Subject: [PATCH 05/13] Added test for pseudonymize_ne --- mailcom/test/test_parse.py | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/mailcom/test/test_parse.py b/mailcom/test/test_parse.py index d2754e5..ea1c97b 100644 --- a/mailcom/test/test_parse.py +++ b/mailcom/test/test_parse.py @@ -85,3 +85,13 @@ def test_get_ner(get_default_fr): sents = get_default_fr.get_sentences(text) for sent in sents: assert get_default_fr.get_ner(sent) + + +def test_pseudonymize_ne(get_default_fr): + text = "ceci est un exemple de texte écrit par Francois. Il contient trois noms différents, comme celui de Agathe. Voyons si Antoine est reconnu." # noqa + sents = get_default_fr.get_sentences(text) + names = ["Francois", "Agathe", "Antoine"] + for i in range(len(sents)): + ner = get_default_fr.get_ner(sents[i]) + ps_sent = " ".join(get_default_fr.pseudonymize_ne(ner, sents[i])) + assert names[i] not in ps_sent From b7c15c63602707caf5ec922122bc5ca95c799192 Mon Sep 17 00:00:00 2001 From: Felix <65565033+fexfl@users.noreply.github.com> Date: Mon, 21 Oct 2024 08:48:37 +0200 Subject: [PATCH 06/13] Added spacy models to requirements --- requirements.txt | 2 ++ 1 file changed, 2 insertions(+) diff --git a/requirements.txt b/requirements.txt index a54ec31..6cd0a84 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,2 +1,4 @@ spacy +fr_core_news_md @ https://github.com/explosion/spacy-models/releases/download/fr_core_news_md-3.8.0/fr_core_news_md-3.8.0-py3-none-any.whl +es_core_news_md @ https://github.com/explosion/spacy-models/releases/download/es_core_news_md-3.8.0/es_core_news_md-3.8.0-py3-none-any.whl transformers \ No newline at end of file From b667a24358210c72a6ba6a8a3f778533eca6b89a Mon Sep 17 00:00:00 2001 From: Felix <65565033+fexfl@users.noreply.github.com> Date: Mon, 21 Oct 2024 09:00:01 +0200 Subject: [PATCH 07/13] Adjusted test_reset to changes in class --- mailcom/test/test_parse.py | 31 ++++++++++--------------------- 1 file changed, 10 insertions(+), 21 deletions(-) diff --git a/mailcom/test/test_parse.py b/mailcom/test/test_parse.py index ea1c97b..6383880 100644 --- a/mailcom/test/test_parse.py +++ b/mailcom/test/test_parse.py @@ -1,4 +1,4 @@ -from mailcom import parse, inout +from mailcom import parse import pytest from pathlib import Path from importlib import resources @@ -41,20 +41,6 @@ def get_default_fr(): return inst -@pytest.fixture() -def get_sample_texts(): - inst = inout.InoutHandler(FILE_PATH) - inst.list_of_files() - email_list = [] - for file in inst.email_list: - text = inst.get_text(file) - text = inst.get_html_text(text) - if not text: - continue - email_list.append(text) - return email_list - - def test_init_spacy(get_instant): with pytest.raises(KeyError): get_instant.init_spacy("not_a_language") @@ -65,14 +51,17 @@ def test_init_spacy(get_instant): # TODO init_transformers -def test_reset(get_default_fr, get_sample_texts): - for text in get_sample_texts: - # Test that used names lists are empty - # They should be cleared after every email - assert len(get_default_fr.used_first_names) == 0 - assert len(get_default_fr.used_last_names) == 0 +def test_reset(get_default_fr): + text1 = "ceci est un exemple de texte écrit par Claude. Il contient trois noms différents, comme celui de Dominique. Voyons si Martin est reconnu." # noqa + text2 = "ceci est un exemple de texte écrit par Francois. Il contient trois noms différents, comme celui de Agathe. Voyons si Antoine est reconnu." # noqa + sample_texts = [text1, text2] + for text in sample_texts: # pseudonymize email get_default_fr.pseudonymize(text) + get_default_fr.reset() + # Test that used names lists are empty now + # They should be cleared after every email + assert len(get_default_fr.used_first_names) == 0 def test_get_sentences(get_default_fr): From bf191339e11c5f63b9b6021a9fcd75ddc2c5c90c Mon Sep 17 00:00:00 2001 From: Felix <65565033+fexfl@users.noreply.github.com> Date: Thu, 24 Oct 2024 12:03:27 +0200 Subject: [PATCH 08/13] Added model download if not found --- mailcom/parse.py | 18 +++++++++++++++--- mailcom/test/test_parse.py | 2 +- 2 files changed, 16 insertions(+), 4 deletions(-) diff --git a/mailcom/parse.py b/mailcom/parse.py index 75171dd..81ea972 100644 --- a/mailcom/parse.py +++ b/mailcom/parse.py @@ -67,9 +67,21 @@ def init_spacy(self, language: str, model="default"): model, exclude=["morphologizer", "attribute_ruler", "lemmatizer", "ner"] ) except OSError: - raise OSError("Could not find {} in standard directory.".format(model)) - - self.nlp_spacy = sp.load(model) + pass + try: + print( + "Could not find model in standard directory. Trying to download model from repo." # noqa + ) + # try downloading model + sp.cli.download(model) + self.nlp_spacy = sp.load( + model, + exclude=["morphologizer", "attribute_ruler", "lemmatizer", "ner"], + ) + except SystemExit: + raise SystemExit("Could not download {} from repo".format(model)) + except OSError: + raise OSError("Could not find {} in standard directory".format(model)) def init_transformers( self, diff --git a/mailcom/test/test_parse.py b/mailcom/test/test_parse.py index 6383880..b7a782e 100644 --- a/mailcom/test/test_parse.py +++ b/mailcom/test/test_parse.py @@ -44,7 +44,7 @@ def get_default_fr(): def test_init_spacy(get_instant): with pytest.raises(KeyError): get_instant.init_spacy("not_a_language") - with pytest.raises(OSError): + with pytest.raises(SystemExit): get_instant.init_spacy("fr", "not_an_existing_spacy_model") From 7b8d31a4d7b9ad4e6e2a93caa9915bb8eb2f5761 Mon Sep 17 00:00:00 2001 From: Felix <65565033+fexfl@users.noreply.github.com> Date: Thu, 24 Oct 2024 12:17:20 +0200 Subject: [PATCH 09/13] Added more tests for get_sentences --- mailcom/test/test_parse.py | 28 +++++++++++++++++++++++----- 1 file changed, 23 insertions(+), 5 deletions(-) diff --git a/mailcom/test/test_parse.py b/mailcom/test/test_parse.py index b7a782e..94c4271 100644 --- a/mailcom/test/test_parse.py +++ b/mailcom/test/test_parse.py @@ -64,11 +64,6 @@ def test_reset(get_default_fr): assert len(get_default_fr.used_first_names) == 0 -def test_get_sentences(get_default_fr): - text = "ceci est un exemple de texte. Il doit comprendre 3 phrases. Si ce n’est pas le cas, quelque chose ne va vraiment pas." # noqa - assert len(get_default_fr.get_sentences(text)) == 3 - - def test_get_ner(get_default_fr): text = "ceci est un exemple de texte écrit par Claude. Il contient trois noms différents, comme celui de Dominique. Voyons si Martin est reconnu." # noqa sents = get_default_fr.get_sentences(text) @@ -84,3 +79,26 @@ def test_pseudonymize_ne(get_default_fr): ner = get_default_fr.get_ner(sents[i]) ps_sent = " ".join(get_default_fr.pseudonymize_ne(ner, sents[i])) assert names[i] not in ps_sent + + +def test_get_sentences_empty_string(get_default_fr): + text = "" + assert get_default_fr.get_sentences(text) == [] + + +def test_get_sentences_multiple_sentences(get_default_fr): + text = "Ceci est la première phrase. Voici la deuxième phrase. Et enfin, la troisième phrase." # noqa + sentences = get_default_fr.get_sentences(text) + assert len(sentences) == 3 + assert sentences[0] == "Ceci est la première phrase." + assert sentences[1] == "Voici la deuxième phrase." + assert sentences[2] == "Et enfin, la troisième phrase." + + +def test_get_sentences_with_punctuation(get_default_fr): + text = "Bonjour! Comment ça va? Très bien, merci." + sentences = get_default_fr.get_sentences(text) + assert len(sentences) == 3 + assert sentences[0] == "Bonjour!" + assert sentences[1] == "Comment ça va?" + assert sentences[2] == "Très bien, merci." From 6675d7a99ffb676ce2503624cdf238e86df5f5bf Mon Sep 17 00:00:00 2001 From: Felix <65565033+fexfl@users.noreply.github.com> Date: Fri, 25 Oct 2024 09:06:53 +0200 Subject: [PATCH 10/13] Added tests for concatenate and pseudonymize --- mailcom/test/test_parse.py | 110 +++++++++++++++++++++++++++++++++---- 1 file changed, 100 insertions(+), 10 deletions(-) diff --git a/mailcom/test/test_parse.py b/mailcom/test/test_parse.py index 94c4271..d7191a2 100644 --- a/mailcom/test/test_parse.py +++ b/mailcom/test/test_parse.py @@ -71,16 +71,6 @@ def test_get_ner(get_default_fr): assert get_default_fr.get_ner(sent) -def test_pseudonymize_ne(get_default_fr): - text = "ceci est un exemple de texte écrit par Francois. Il contient trois noms différents, comme celui de Agathe. Voyons si Antoine est reconnu." # noqa - sents = get_default_fr.get_sentences(text) - names = ["Francois", "Agathe", "Antoine"] - for i in range(len(sents)): - ner = get_default_fr.get_ner(sents[i]) - ps_sent = " ".join(get_default_fr.pseudonymize_ne(ner, sents[i])) - assert names[i] not in ps_sent - - def test_get_sentences_empty_string(get_default_fr): text = "" assert get_default_fr.get_sentences(text) == [] @@ -102,3 +92,103 @@ def test_get_sentences_with_punctuation(get_default_fr): assert sentences[0] == "Bonjour!" assert sentences[1] == "Comment ça va?" assert sentences[2] == "Très bien, merci." + + +def test_pseudonymize_per(get_default_fr): + sentence = "Francois and Agathe are friends." + nelist = ["Francois", "Agathe"] + pseudonymized_sentence = get_default_fr.pseudonymize_per(sentence, nelist) + assert "Francois" not in pseudonymized_sentence + assert "Agathe" not in pseudonymized_sentence + assert any( + pseudo in pseudonymized_sentence + for pseudo in get_default_fr.pseudo_first_names["fr"] + ) + + +def test_pseudonymize_ne(get_default_fr): + sentence = "Francois and Agathe are friends." + ner = [ + { + "entity_group": "PER", + "score": 0.99, + "word": "Francois", + "start": 0, + "end": 8, + }, + { + "entity_group": "PER", + "score": 0.99, + "word": "Agathe", + "start": 13, + "end": 19, + }, + ] + pseudonymized_sentence = get_default_fr.pseudonymize_ne(ner, sentence) + assert "Francois" not in pseudonymized_sentence[0] + assert "Agathe" not in pseudonymized_sentence[0] + assert any( + pseudo in pseudonymized_sentence[0] + for pseudo in get_default_fr.pseudo_first_names["fr"] + ) + + +def test_pseudonymize_numbers(get_default_fr): + sentence = "My phone number is 123-456-7890." + pseudonymized_sentence = get_default_fr.pseudonymize_numbers(sentence) + assert pseudonymized_sentence == "My phone number is xxx-xxx-xxxx." + + sentence = "The year 2023 is almost over." + pseudonymized_sentence = get_default_fr.pseudonymize_numbers(sentence) + assert pseudonymized_sentence == "The year xxxx is almost over." + + sentence = "No digits here!" + pseudonymized_sentence = get_default_fr.pseudonymize_numbers(sentence) + assert pseudonymized_sentence == "No digits here!" + + +def test_concatenate_empty_list(get_default_fr): + sentences = [] + concatenated = get_default_fr.concatenate(sentences) + assert concatenated == "" + + +def test_concatenate_multiple_sentences(get_default_fr): + sentences = [ + "This is the first sentence.", + "This is the second sentence.", + "This is the third sentence.", + ] + concatenated = get_default_fr.concatenate(sentences) + assert ( + concatenated + == "This is the first sentence. This is the second sentence. This is the third sentence." # noqa + ) + + +def test_pseudonymize(get_default_fr): + text = "Francois et Agathe sont amis. Mon numéro de téléphone est 123-456-7890." + pseudonymized_text = get_default_fr.pseudonymize(text) + + # Check that names are pseudonymized + assert "Francois" not in pseudonymized_text + assert "Agathe" not in pseudonymized_text + assert any( + pseudo in pseudonymized_text + for pseudo in get_default_fr.pseudo_first_names["fr"] + ) + + # Check that numbers are pseudonymized + assert "123-456-7890" not in pseudonymized_text + + +def test_pseudonymize_empty_string(get_default_fr): + text = "" + pseudonymized_text = get_default_fr.pseudonymize(text) + assert pseudonymized_text == "" + + +def test_pseudonymize_no_entities(get_default_fr): + text = "Ceci est une phrase simple sans entités nommées ni chiffres." + pseudonymized_text = get_default_fr.pseudonymize(text) + assert pseudonymized_text == text From 40b6b14859183085de6d9c888ccce5c5fe613cfb Mon Sep 17 00:00:00 2001 From: Felix <65565033+fexfl@users.noreply.github.com> Date: Tue, 29 Oct 2024 14:20:13 +0100 Subject: [PATCH 11/13] Removed unused imports --- mailcom/test/test_parse.py | 6 ------ 1 file changed, 6 deletions(-) diff --git a/mailcom/test/test_parse.py b/mailcom/test/test_parse.py index d7191a2..566694c 100644 --- a/mailcom/test/test_parse.py +++ b/mailcom/test/test_parse.py @@ -1,11 +1,5 @@ from mailcom import parse import pytest -from pathlib import Path -from importlib import resources - -pkg = resources.files("mailcom") - -FILE_PATH = Path(pkg / "test" / "data") # these worked when we were using strings From 2f13356a1b689bfa1a8bce0b8c13cffd2f4e98e0 Mon Sep 17 00:00:00 2001 From: Felix <65565033+fexfl@users.noreply.github.com> Date: Tue, 29 Oct 2024 14:29:36 +0100 Subject: [PATCH 12/13] Added test for init_transformers --- mailcom/test/test_parse.py | 16 +++++++++++++++- 1 file changed, 15 insertions(+), 1 deletion(-) diff --git a/mailcom/test/test_parse.py b/mailcom/test/test_parse.py index 566694c..6629fef 100644 --- a/mailcom/test/test_parse.py +++ b/mailcom/test/test_parse.py @@ -42,7 +42,21 @@ def test_init_spacy(get_instant): get_instant.init_spacy("fr", "not_an_existing_spacy_model") -# TODO init_transformers +def test_init_transformers(get_instant): + # Test with default model and revision number + get_instant.init_transformers() + assert get_instant.ner_recognizer is not None + + # Test with an invalid model + with pytest.raises(OSError): + get_instant.init_transformers(model="invalid-model") + + # Test with an invalid revision number + with pytest.raises(OSError): + get_instant.init_transformers( + model="xlm-roberta-large-finetuned-conll03-english", + model_revision_number="invalid-revision", + ) def test_reset(get_default_fr): From 61732c9a2df3fe9b9d814bdce678dee87f16cc76 Mon Sep 17 00:00:00 2001 From: Felix <65565033+fexfl@users.noreply.github.com> Date: Tue, 29 Oct 2024 15:16:28 +0100 Subject: [PATCH 13/13] Fixed the model download in init_spacy --- mailcom/parse.py | 27 ++++++++++++--------------- 1 file changed, 12 insertions(+), 15 deletions(-) diff --git a/mailcom/parse.py b/mailcom/parse.py index e0579c6..472c352 100644 --- a/mailcom/parse.py +++ b/mailcom/parse.py @@ -67,21 +67,18 @@ def init_spacy(self, language: str, model="default"): model, exclude=["morphologizer", "attribute_ruler", "lemmatizer", "ner"] ) except OSError: - pass - try: - print( - "Could not find model in standard directory. Trying to download model from repo." # noqa - ) - # try downloading model - sp.cli.download(model) - self.nlp_spacy = sp.load( - model, - exclude=["morphologizer", "attribute_ruler", "lemmatizer", "ner"], - ) - except SystemExit: - raise SystemExit("Could not download {} from repo".format(model)) - except OSError: - raise OSError("Could not find {} in standard directory".format(model)) + try: + print( + "Could not find model in standard directory. Trying to download model from repo." # noqa + ) + # try downloading model + sp.cli.download(model) + self.nlp_spacy = sp.load( + model, + exclude=["morphologizer", "attribute_ruler", "lemmatizer", "ner"], + ) + except SystemExit: + raise SystemExit("Could not download {} from repo".format(model)) def init_transformers( self,