diff --git a/mailcom/inout.py b/mailcom/inout.py index 5b6956b..00a6706 100644 --- a/mailcom/inout.py +++ b/mailcom/inout.py @@ -4,6 +4,13 @@ from bs4 import BeautifulSoup def list_of_files(directory_name: str) -> list[Path]: + """Function to create a list of files that are present in a directory as path objects. + + Args: + directory_name (str): The directory where the files are located. + + Returns: + list[Path]: A list of Path objects that represent the files in the directory.""" if not os.path.exists(directory_name): # check if given dir exists raises error otherwise raise OSError("Path {} does not exist".format(directory_name)) mypath = Path(directory_name) @@ -13,28 +20,39 @@ def list_of_files(directory_name: str) -> list[Path]: raise ValueError("The directory {} does not contain .eml or .html files. Please check that the directory is containing the email data files".format(mypath)) return email_list -def get_html_text(text_check): +def get_html_text(text_check: str) -> str: + """Clean up a string if it contains html content. + Args: + text_check (str): The string that may contain html content. + + Returns: + str: The (potentially) cleaned up string.""" soup = BeautifulSoup(text_check , 'html.parser') if soup.find(): - text = soup.get_text() - return text + text_check = soup.get_text() return text_check -def get_text(file): +def get_text(file: Path) -> str: + """Function to extract the textual content and other metadata from an email file. + + Args: + file (Path): The path to the email file. + + Returns: + str: The textual content of the email. 
In the future, this will return the + complete dictionary with the metadata.""" if not file.is_file(): # check if given file exists raises error otherwise raise OSError("File {} does not exist".format(file)) with open(file, 'rb') as fhdl: raw_email = fhdl.read() ep = eml_parser.EmlParser(include_raw_body=True) parsed_eml = ep.decode_email_bytes(raw_email) - attachments = 0 attachmenttypes = [] - if "attachment" in parsed_eml: - attachments = len(parsed_eml["attachment"]) - if attachments > 0: - for i in range(attachments): - attachmenttypes.append(parsed_eml["attachment"][i]["extension"]) - + # find if there are any attachments, and if yes, how many + attachments = len(parsed_eml["attachment"]) if "attachment" in parsed_eml else 0 + # find the types of attachments + if attachments > 0: + attachmenttypes = [parsed_eml["attachment"][i]["extension"] for i in range(attachments)] email_content = {"content": parsed_eml["body"][0]["content"], "date": parsed_eml["header"]["date"], "attachment": attachments, @@ -43,6 +61,11 @@ def get_text(file): return(email_content["content"]) -def write_file(text, name): +def write_file(text: str, name: str)-> None: + """Write the extracted string to a text file. + + Args: + text (str): The string to be written to the file. + name (str): The name of the file to be written.""" with open("{}.out".format(name), "w") as file: file.write(text) diff --git a/mailcom/test/test_inout.py b/mailcom/test/test_inout.py index 5db705f..61fc553 100644 --- a/mailcom/test/test_inout.py +++ b/mailcom/test/test_inout.py @@ -2,6 +2,10 @@ import pytest from pathlib import Path + +FILE_PATH = Path("mailcom/test/data/Bonjour Agathe.eml") +TEXT_REF = "J'espère que tu vas bien!" 
+ def test_list_of_files_found(tmp_path): p = tmp_path / "test.eml" p.write_text("test") @@ -28,6 +32,9 @@ def test_get_text(tmp_path): p = tmp_path / "test.eml" p.write_text("test") assert get_text(p) == 'test' + text = get_text(FILE_PATH) + print(text[0:25]) + assert text[0:25] == TEXT_REF def test_get_text_err(): with pytest.raises(OSError):