Skip to content

Commit

Permalink
add docstrings, more tests, type hints, simplify logic
Browse files Browse the repository at this point in the history
  • Loading branch information
iulusoy committed Sep 5, 2024
1 parent 1f2f113 commit c0839b6
Show file tree
Hide file tree
Showing 2 changed files with 42 additions and 12 deletions.
47 changes: 35 additions & 12 deletions mailcom/inout.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,13 @@
from bs4 import BeautifulSoup

def list_of_files(directory_name: str) -> list[Path]:
"""Function to create a list of files that are present in a directory as path objects.
Args:
directory_name (str): The directory where the files are located.
Returns:
list[Path]: A list of Path objects that represent the files in the directory."""
if not os.path.exists(directory_name): # check if given dir exists raises error otherwise
raise OSError("Path {} does not exist".format(directory_name))
mypath = Path(directory_name)
Expand All @@ -13,28 +20,39 @@ def list_of_files(directory_name: str) -> list[Path]:
raise ValueError("The directory {} does not contain .eml or .html files. Please check that the directory is containing the email data files".format(mypath))
return email_list

def get_html_text(text_check: str) -> str:
    """Clean up a string if it contains html content.

    The string is parsed with BeautifulSoup's ``html.parser``. When the
    parse yields at least one tag, the text extracted with ``get_text()``
    is returned; otherwise the input string is returned unchanged, so the
    function always hands back a string.

    Args:
        text_check (str): The string that may contain html content.

    Returns:
        str: The (potentially) cleaned up string.
    """
    soup = BeautifulSoup(text_check, 'html.parser')
    # soup.find() gives the first tag or None; input without any markup
    # therefore skips the extraction and is returned as it came in.
    if soup.find():
        return soup.get_text()
    return text_check

def get_text(file):
def get_text(file: Path) -> str:
"""Function to extract the textual content and other metadata from an email file.
Args:
file (Path): The path to the email file.
Returns:
str: The textual content of the email. In the future, this will return the
complete dictionary with the metadata."""
if not file.is_file(): # check if given file exists raises error otherwise
raise OSError("File {} does not exist".format(file))
with open(file, 'rb') as fhdl:
raw_email = fhdl.read()
ep = eml_parser.EmlParser(include_raw_body=True)
parsed_eml = ep.decode_email_bytes(raw_email)
attachments = 0
attachmenttypes = []
if "attachment" in parsed_eml:
attachments = len(parsed_eml["attachment"])
if attachments > 0:
for i in range(attachments):
attachmenttypes.append(parsed_eml["attachment"][i]["extension"])

# find out whether there are any attachments and, if so, how many
attachments = len(parsed_eml["attachment"]) if "attachment" in parsed_eml else 0
# find the types of the attachments
if attachments > 0:
attachmenttypes = [parsed_eml["attachment"][i]["extension"] for i in range(attachments)]
email_content = {"content": parsed_eml["body"][0]["content"],
"date": parsed_eml["header"]["date"],
"attachment": attachments,
Expand All @@ -43,6 +61,11 @@ def get_text(file):
return(email_content["content"])


def write_file(text: str, name: str) -> None:
    """Write the extracted string to a text file.

    The file is created (or overwritten) at ``"<name>.out"`` and is always
    encoded as UTF-8, so non-ASCII characters in the text are written the
    same way regardless of the platform's default locale encoding.

    Args:
        text (str): The string to be written to the file.
        name (str): The name of the file to be written; the ".out" suffix
            is appended to it.
    """
    # Explicit encoding: without it open() falls back to the locale default,
    # which can fail or garble accented characters on some systems.
    with open("{}.out".format(name), "w", encoding="utf-8") as file:
        file.write(text)
7 changes: 7 additions & 0 deletions mailcom/test/test_inout.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,10 @@
import pytest
from pathlib import Path


# Sample .eml file passed to get_text in test_get_text.
# NOTE(review): the path is relative, so the test presumably has to be run
# from the repository root — confirm.
FILE_PATH = Path("mailcom/test/data/Bonjour Agathe.eml")
# Reference string compared against the first 25 characters of the text
# extracted from FILE_PATH.
TEXT_REF = "J'espère que tu vas bien!"

def test_list_of_files_found(tmp_path):
p = tmp_path / "test.eml"
p.write_text("test")
Expand All @@ -28,6 +32,9 @@ def test_get_text(tmp_path):
p = tmp_path / "test.eml"
p.write_text("test")
assert get_text(p) == 'test'
text = get_text(FILE_PATH)
print(text[0:25])
assert text[0:25] == TEXT_REF

def test_get_text_err():
with pytest.raises(OSError):
Expand Down

0 comments on commit c0839b6

Please sign in to comment.