diff --git a/mailcom/parse.py b/mailcom/parse.py index 8af7fcd..472c352 100644 --- a/mailcom/parse.py +++ b/mailcom/parse.py @@ -67,9 +67,18 @@ def init_spacy(self, language: str, model="default"): model, exclude=["morphologizer", "attribute_ruler", "lemmatizer", "ner"] ) except OSError: - raise OSError("Could not find {} in standard directory.".format(model)) - - self.nlp_spacy = sp.load(model) + try: + print( + "Could not find model in standard directory. Trying to download model from repo." # noqa + ) + # try downloading model + sp.cli.download(model) + self.nlp_spacy = sp.load( + model, + exclude=["morphologizer", "attribute_ruler", "lemmatizer", "ner"], + ) + except SystemExit: + raise SystemExit("Could not download {} from repo".format(model)) def init_transformers( self, diff --git a/mailcom/test/test_parse.py b/mailcom/test/test_parse.py index 80e8caf..6629fef 100644 --- a/mailcom/test/test_parse.py +++ b/mailcom/test/test_parse.py @@ -1,4 +1,4 @@ -from mailcom.parse import make_dir, check_dir +from mailcom import parse import pytest @@ -6,16 +6,197 @@ # with the update to Path, we need to change the tests def test_check_dir(tmpdir): mydir = tmpdir.mkdir("sub") - assert check_dir(str(mydir)) + assert parse.check_dir(str(mydir)) with pytest.raises(OSError): - check_dir(str(tmpdir.join("sub2"))) + parse.check_dir(str(tmpdir.join("sub2"))) def test_make_dir(tmpdir): mydir = tmpdir.join("sub") - make_dir(str(mydir)) + parse.make_dir(str(mydir)) assert mydir.check() + def test_check_dir_fail(): with pytest.raises(OSError): - check_dir(str("mydir")) + parse.check_dir(str("mydir")) + + +@pytest.fixture() +def get_instant(): + return parse.Pseudonymize() + + +@pytest.fixture() +def get_default_fr(): + inst = parse.Pseudonymize() + inst.init_spacy("fr") + inst.init_transformers() + return inst + + +def test_init_spacy(get_instant): + with pytest.raises(KeyError): + get_instant.init_spacy("not_a_language") + with pytest.raises(SystemExit): + get_instant.init_spacy("fr", "not_an_existing_spacy_model") + + +def test_init_transformers(get_instant): + # Test with default model and revision number + get_instant.init_transformers() + assert get_instant.ner_recognizer is not None + + # Test with an invalid model + with pytest.raises(OSError): + get_instant.init_transformers(model="invalid-model") + + # Test with an invalid revision number + with pytest.raises(OSError): + get_instant.init_transformers( + model="xlm-roberta-large-finetuned-conll03-english", + model_revision_number="invalid-revision", + ) + + +def test_reset(get_default_fr): + text1 = "ceci est un exemple de texte écrit par Claude. Il contient trois noms différents, comme celui de Dominique. Voyons si Martin est reconnu." # noqa + text2 = "ceci est un exemple de texte écrit par Francois. Il contient trois noms différents, comme celui de Agathe. Voyons si Antoine est reconnu." # noqa + sample_texts = [text1, text2] + for text in sample_texts: + # pseudonymize email + get_default_fr.pseudonymize(text) + get_default_fr.reset() + # Test that used names lists are empty now + # They should be cleared after every email + assert len(get_default_fr.used_first_names) == 0 + + +def test_get_ner(get_default_fr): + text = "ceci est un exemple de texte écrit par Claude. Il contient trois noms différents, comme celui de Dominique. Voyons si Martin est reconnu." # noqa + sents = get_default_fr.get_sentences(text) + for sent in sents: + assert get_default_fr.get_ner(sent) + + +def test_get_sentences_empty_string(get_default_fr): + text = "" + assert get_default_fr.get_sentences(text) == [] + + +def test_get_sentences_multiple_sentences(get_default_fr): + text = "Ceci est la première phrase. Voici la deuxième phrase. Et enfin, la troisième phrase." # noqa + sentences = get_default_fr.get_sentences(text) + assert len(sentences) == 3 + assert sentences[0] == "Ceci est la première phrase." + assert sentences[1] == "Voici la deuxième phrase." + assert sentences[2] == "Et enfin, la troisième phrase." + + +def test_get_sentences_with_punctuation(get_default_fr): + text = "Bonjour! Comment ça va? Très bien, merci." + sentences = get_default_fr.get_sentences(text) + assert len(sentences) == 3 + assert sentences[0] == "Bonjour!" + assert sentences[1] == "Comment ça va?" + assert sentences[2] == "Très bien, merci." + + +def test_pseudonymize_per(get_default_fr): + sentence = "Francois and Agathe are friends." + nelist = ["Francois", "Agathe"] + pseudonymized_sentence = get_default_fr.pseudonymize_per(sentence, nelist) + assert "Francois" not in pseudonymized_sentence + assert "Agathe" not in pseudonymized_sentence + assert any( + pseudo in pseudonymized_sentence + for pseudo in get_default_fr.pseudo_first_names["fr"] + ) + + +def test_pseudonymize_ne(get_default_fr): + sentence = "Francois and Agathe are friends." + ner = [ + { + "entity_group": "PER", + "score": 0.99, + "word": "Francois", + "start": 0, + "end": 8, + }, + { + "entity_group": "PER", + "score": 0.99, + "word": "Agathe", + "start": 13, + "end": 19, + }, + ] + pseudonymized_sentence = get_default_fr.pseudonymize_ne(ner, sentence) + assert "Francois" not in pseudonymized_sentence[0] + assert "Agathe" not in pseudonymized_sentence[0] + assert any( + pseudo in pseudonymized_sentence[0] + for pseudo in get_default_fr.pseudo_first_names["fr"] + ) + + +def test_pseudonymize_numbers(get_default_fr): + sentence = "My phone number is 123-456-7890." + pseudonymized_sentence = get_default_fr.pseudonymize_numbers(sentence) + assert pseudonymized_sentence == "My phone number is xxx-xxx-xxxx." + + sentence = "The year 2023 is almost over." + pseudonymized_sentence = get_default_fr.pseudonymize_numbers(sentence) + assert pseudonymized_sentence == "The year xxxx is almost over." + + sentence = "No digits here!" + pseudonymized_sentence = get_default_fr.pseudonymize_numbers(sentence) + assert pseudonymized_sentence == "No digits here!" + + +def test_concatenate_empty_list(get_default_fr): + sentences = [] + concatenated = get_default_fr.concatenate(sentences) + assert concatenated == "" + + +def test_concatenate_multiple_sentences(get_default_fr): + sentences = [ + "This is the first sentence.", + "This is the second sentence.", + "This is the third sentence.", + ] + concatenated = get_default_fr.concatenate(sentences) + assert ( + concatenated + == "This is the first sentence. This is the second sentence. This is the third sentence." # noqa + ) + + +def test_pseudonymize(get_default_fr): + text = "Francois et Agathe sont amis. Mon numéro de téléphone est 123-456-7890." + pseudonymized_text = get_default_fr.pseudonymize(text) + + # Check that names are pseudonymized + assert "Francois" not in pseudonymized_text + assert "Agathe" not in pseudonymized_text + assert any( + pseudo in pseudonymized_text + for pseudo in get_default_fr.pseudo_first_names["fr"] + ) + + # Check that numbers are pseudonymized + assert "123-456-7890" not in pseudonymized_text + + +def test_pseudonymize_empty_string(get_default_fr): + text = "" + pseudonymized_text = get_default_fr.pseudonymize(text) + assert pseudonymized_text == "" + + +def test_pseudonymize_no_entities(get_default_fr): + text = "Ceci est une phrase simple sans entités nommées ni chiffres." + pseudonymized_text = get_default_fr.pseudonymize(text) + assert pseudonymized_text == text diff --git a/requirements.txt b/requirements.txt index a54ec31..6cd0a84 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,2 +1,4 @@ spacy +fr_core_news_md @ https://github.com/explosion/spacy-models/releases/download/fr_core_news_md-3.8.0/fr_core_news_md-3.8.0-py3-none-any.whl +es_core_news_md @ https://github.com/explosion/spacy-models/releases/download/es_core_news_md-3.8.0/es_core_news_md-3.8.0-py3-none-any.whl transformers \ No newline at end of file