Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Changed inout to a class structure #33

Merged
merged 4 commits into from
Sep 23, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
129 changes: 69 additions & 60 deletions mailcom/inout.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,69 +3,78 @@
import eml_parser
from bs4 import BeautifulSoup

def list_of_files(directory_name: str) -> list[Path]:
    """Create a list of the email files found below a directory.

    The directory is searched recursively. Only regular files whose suffix
    is ".eml" or ".html" are returned; directories that merely carry such a
    suffix are skipped, since they cannot be opened as email files.

    Args:
        directory_name (str): The directory where the files are located.

    Returns:
        list[Path]: Resolved Path objects for the matching files.

    Raises:
        OSError: If ``directory_name`` does not exist.
        ValueError: If no .eml or .html file is found below the directory.
    """
    if not os.path.exists(directory_name):  # check if given dir exists raises error otherwise
        raise OSError("Path {} does not exist".format(directory_name))
    mypath = Path(directory_name)
    pattern = [".eml", ".html"]  # we would not change the file type through user input
    email_list = [
        mp.resolve()
        for mp in mypath.glob("**/*")
        # is_file() keeps out directories named like "archive.eml"
        if mp.suffix in pattern and mp.is_file()
    ]
    if not email_list:
        raise ValueError("The directory {} does not contain .eml or .html files. Please check that the directory is containing the email data files".format(mypath))
    return email_list
class InoutHandler:
    """Locate the email files (.eml / .html) stored below a directory."""

    def __init__(self, directory_name: str):
        """Constructor for the InoutHandler class.

        Args:
            directory_name (str): The directory where the files are located.
                The value is stored unchanged; it is only checked when
                ``list_of_files`` is called.
        """
        self.directory_name = directory_name
        # presets
        self.pattern = [".eml", ".html"]
        # Populated by list_of_files(); defined here so the attribute exists
        # (as an empty list) even before the directory has been scanned.
        self.email_list = []

    def list_of_files(self):
        """Collect the files present in the directory as Path objects.

        The directory is searched recursively. Regular files whose suffix is
        listed in ``self.pattern`` are resolved and stored in
        ``self.email_list``; directories that merely carry such a suffix are
        skipped. Nothing is returned.

        Raises:
            OSError: If ``self.directory_name`` does not exist.
            ValueError: If no matching file is found below the directory.
        """
        if not os.path.exists(self.directory_name):  # check if given dir exists raises error otherwise
            raise OSError("Path {} does not exist".format(self.directory_name))
        mypath = Path(self.directory_name)
        self.email_list = [
            mp.resolve()
            for mp in mypath.glob("**/*")
            # is_file() keeps out directories named like "archive.eml"
            if mp.suffix in self.pattern and mp.is_file()
        ]
        if not self.email_list:
            raise ValueError("The directory {} does not contain .eml or .html files. Please check that the directory is containing the email data files".format(mypath))

def get_html_text(text_check: str) -> str:
"""Clean up a string if it contains html content.
Args:
text_check (str): The string that may contain html content.
def get_html_text(self, text_check: str) -> str:
    """Strip html markup from a string, if it contains any.

    The string is parsed with BeautifulSoup's 'html.parser'. When the parse
    yields at least one tag, the text extracted from the document is
    returned; otherwise the input is handed back unchanged.

    Args:
        text_check (str): The string that may contain html content.

    Returns:
        str: The (potentially) cleaned up string.
    """
    parsed = BeautifulSoup(text_check , 'html.parser')
    first_tag = parsed.find()
    if not first_tag:
        # no tag found, so there is nothing to clean up
        return text_check
    return parsed.get_text()

def get_text(self, file: Path) -> str:
"""Function to extract the textual content and other metadata from an email file.

Returns:
str: The (potentially) cleaned up string."""
soup = BeautifulSoup(text_check , 'html.parser')
if soup.find():
text_check = soup.get_text()
return text_check
Args:
file (Path): The path to the email file.

Returns:
str: The textual content of the email. In the future, this will return the
complete dictionary with the metadata."""
if not file.is_file(): # check if given file exists raises error otherwise
raise OSError("File {} does not exist".format(file))
with open(file, 'rb') as fhdl:
raw_email = fhdl.read()
ep = eml_parser.EmlParser(include_raw_body=True)
parsed_eml = ep.decode_email_bytes(raw_email)
attachmenttypes = []
# find if there are any attachements, and if yes, how many
attachments = len(parsed_eml["attachment"]) if "attachment" in parsed_eml else 0
# find the types of attachements
if attachments > 0:
attachmenttypes = [parsed_eml["attachment"][i]["extension"] for i in range(attachments)]
self.email_content = {"content": parsed_eml["body"][0]["content"],
"date": parsed_eml["header"]["date"],
"attachment": attachments,
"attachement type": attachmenttypes
}
return(self.email_content["content"])

def get_text(file: Path) -> str:
"""Function to extract the textual content and other metadata from an email file.
def validate_data(self):
    """Placeholder method; currently a no-op that returns None."""
    pass

Check warning on line 68 in mailcom/inout.py

View check run for this annotation

Codecov / codecov/patch

mailcom/inout.py#L68

Added line #L68 was not covered by tests

Args:
file (Path): The path to the email file.

Returns:
str: The textual content of the email. In the future, this will return the
complete dictionary with the metadata."""
if not file.is_file(): # check if given file exists raises error otherwise
raise OSError("File {} does not exist".format(file))
with open(file, 'rb') as fhdl:
raw_email = fhdl.read()
ep = eml_parser.EmlParser(include_raw_body=True)
parsed_eml = ep.decode_email_bytes(raw_email)
attachmenttypes = []
# find if there are any attachements, and if yes, how many
attachments = len(parsed_eml["attachment"]) if "attachment" in parsed_eml else 0
# find the types of attachements
if attachments > 0:
attachmenttypes = [parsed_eml["attachment"][i]["extension"] for i in range(attachments)]
email_content = {"content": parsed_eml["body"][0]["content"],
"date": parsed_eml["header"]["date"],
"attachment": attachments,
"attachement type": attachmenttypes
}
return(email_content["content"])
def data_to_xml(self):
    """Placeholder method; currently a no-op that returns None."""
    pass

Check warning on line 71 in mailcom/inout.py

View check run for this annotation

Codecov / codecov/patch

mailcom/inout.py#L71

Added line #L71 was not covered by tests

def write_file(self, text: str, name: str)-> None:
"""Write the extracted string to a text file.

def write_file(text: str, name: str)-> None:
    """Write the extracted string to a text file.

    The output path is ``name`` with ".out" appended; an existing file at
    that path is overwritten. The file is always encoded as UTF-8 so that
    non-ASCII email text is written the same way on every platform instead
    of depending on the locale's default encoding.

    Args:
        text (str): The string to be written to the file.
        name (str): The name of the file to be written, without the ".out"
            suffix.
    """
    with open("{}.out".format(name), "w", encoding="utf-8") as file:
        file.write(text)
Args:
text (str): The string to be written to the file.
name (str): The name of the file to be written."""
with open("{}.out".format(name), "w") as file:
file.write(text)

Check warning on line 80 in mailcom/inout.py

View check run for this annotation

Codecov / codecov/patch

mailcom/inout.py#L79-L80

Added lines #L79 - L80 were not covered by tests
14 changes: 9 additions & 5 deletions mailcom/parse.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
import spacy as sp
from transformers import pipeline
from pathlib import Path
from mailcom.inout import get_text, list_of_files, get_html_text
from mailcom.inout import InoutHandler

# please modify this section depending on your setup
# input language - either "es" or "fr"
Expand Down Expand Up @@ -116,12 +116,16 @@
print("Generating output directory/ies.")
make_dir(path_output)
# process the text
eml_files = list_of_files(path_input)
io = InoutHandler(path_input)
io.list_of_files()

Check warning on line 120 in mailcom/parse.py

View check run for this annotation

Codecov / codecov/patch

mailcom/parse.py#L119-L120

Added lines #L119 - L120 were not covered by tests
# html_files = list_of_files(path_input, "html")
for file in eml_files:
text = get_text(file)
text = get_html_text(text)
for file in io.email_list:
text = io.get_text(file)
text = io.get_html_text(text)

Check warning on line 124 in mailcom/parse.py

View check run for this annotation

Codecov / codecov/patch

mailcom/parse.py#L122-L124

Added lines #L122 - L124 were not covered by tests
print(text)
print(io.email_content["date"])
print(io.email_content["attachment"])
print(io.email_content["attachement type"])

Check warning on line 128 in mailcom/parse.py

View check run for this annotation

Codecov / codecov/patch

mailcom/parse.py#L126-L128

Added lines #L126 - L128 were not covered by tests
# skip this text if email could not be parsed
if not text:
continue
Expand Down
65 changes: 30 additions & 35 deletions mailcom/test/test_inout.py
Original file line number Diff line number Diff line change
@@ -1,56 +1,51 @@
from mailcom.inout import list_of_files, get_text, get_html_text
from mailcom import inout
import pytest
from pathlib import Path
from importlib import resources
import datetime

pkg = resources.files("mailcom")

FILE_PATH = Path(pkg / "test" / "data" / "Bonjour Agathe.eml")

TEXT_REF = "J'espère que tu vas bien!"

def test_list_of_files_found(tmp_path):
    """A directory holding a single .eml file yields a non-empty result."""
    eml_file = tmp_path / "test.eml"
    eml_file.write_text("test")
    found = list_of_files(tmp_path)
    assert len(found) != 0
@pytest.fixture()
def get_instant(tmp_path):
    """Provide an InoutHandler built on pytest's per-test temporary directory."""
    handler = inout.InoutHandler(tmp_path)
    return handler

def test_list_of_files_empty(tmp_path):
def test_list_of_files(get_instant):
with pytest.raises(ValueError):
list_of_files(tmp_path)

def test_list_of_files_dir_not_existing():
with pytest.raises(OSError):
list_of_files("nonexistingDir")

def test_list_of_files_correct_format(tmp_path):
p = tmp_path / "test.eml"
get_instant.list_of_files()
p = get_instant.directory_name / "test.eml"
p.write_text("test")
p = tmp_path / "test2.html"
get_instant.list_of_files()
assert len(get_instant.email_list) != 0
get_instant2 = inout.InoutHandler("nonexistingDir")
with pytest.raises(OSError):
get_instant2.list_of_files()
p = get_instant.directory_name / "test2.html"
p.write_text("test2")
p = tmp_path / "test3.xml"
p = get_instant.directory_name / "test3.xml"
p.write_text("test3")
assert tmp_path / "test3.xml" not in list_of_files(tmp_path)
get_instant.list_of_files()
assert get_instant.directory_name / "test3.xml" not in get_instant.email_list

def test_get_text(tmp_path):
p = tmp_path / "test.eml"
def test_get_text(get_instant):
p = get_instant.directory_name / "test.eml"
p.write_text("test")
assert get_text(p) == 'test'
text = get_text(FILE_PATH)
print(text[0:25])
extracted_text = get_instant.get_text(p)
assert extracted_text == 'test'
text = get_instant.get_text(FILE_PATH)
assert text[0:25] == TEXT_REF

def test_get_text_err():
assert get_instant.email_content["date"] == datetime.datetime(2024, 4, 17, 15, 13, 56, tzinfo=datetime.timezone.utc)
assert get_instant.email_content["attachment"] == 2
assert get_instant.email_content["attachement type"] == ['jpg', 'jpg']
with pytest.raises(OSError):
list_of_files("nonexistingDir")
get_instant.get_text(get_instant.directory_name / "nonexisting.eml")

def test_get_html_text():
def test_get_html_text(get_instant):
html = """<html><head><title>Test</title></head></html>"""
assert get_html_text(html) == 'Test'

def test_get_html_text_noHtml():
assert get_instant.get_html_text(html) == 'Test'
noHtml = """Test"""
assert get_html_text(noHtml) == 'Test'

def test_get_text_no_file(tmp_path):
p = tmp_path / "test.eml"
with pytest.raises(OSError):
get_text(p)
assert get_instant.get_html_text(noHtml) == 'Test'