Skip to content

Commit

Permalink
changed inout to a class structure
Browse files Browse the repository at this point in the history
  • Loading branch information
Olthoff231381 committed Sep 10, 2024
1 parent b67c25f commit 4a37487
Show file tree
Hide file tree
Showing 2 changed files with 67 additions and 57 deletions.
115 changes: 62 additions & 53 deletions mailcom/inout.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,63 +3,72 @@
import eml_parser
from bs4 import BeautifulSoup

def list_of_files(directory_name: str) -> list[Path]:
    """Collect the email files found below a directory as resolved Path objects.

    The directory is searched recursively. Only regular files whose suffix is
    ".eml" or ".html" are returned; directories that happen to carry one of
    these suffixes are skipped.

    Args:
        directory_name (str): The directory where the files are located.
    Returns:
        list[Path]: A list of Path objects that represent the files in the directory.
    Raises:
        OSError: If directory_name does not exist.
        ValueError: If no .eml or .html file is found below directory_name."""
    if not os.path.exists(directory_name):
        raise OSError(f"Path {directory_name} does not exist")
    mypath = Path(directory_name)
    # the accepted file types are fixed on purpose and not taken from user input
    pattern = (".eml", ".html")
    email_list = [
        mp.resolve()
        for mp in mypath.glob("**/*")
        # is_file() keeps directories named like "something.eml" out of the result
        if mp.suffix in pattern and mp.is_file()
    ]
    if not email_list:
        raise ValueError(
            f"The directory {mypath} does not contain .eml or .html files. "
            "Please check that the directory is containing the email data files"
        )
    return email_list

def get_html_text(text_check: str) -> str:
"""Clean up a string if it contains html content.
Args:
text_check (str): The string that may contain html content.
class InoutHandler:
@staticmethod
def list_of_files(directory_name: str) -> list[Path]:
"""Function to create a list of files that are present in a directory as path objects.
Args:
directory_name (str): The directory where the files are located.
Returns:
str: The (potentially) cleaned up string."""
soup = BeautifulSoup(text_check , 'html.parser')
if soup.find():
text_check = soup.get_text()
return text_check
Returns:
list[Path]: A list of Path objects that represent the files in the directory."""
if not os.path.exists(directory_name): # check if given dir exists raises error otherwise
raise OSError("Path {} does not exist".format(directory_name))
mypath = Path(directory_name)
pattern = [".eml", ".html"] # we would not change the file type through user input
email_list = [mp.resolve() for mp in mypath.glob("**/*") if mp.suffix in pattern]
if len(email_list) == 0:
raise ValueError("The directory {} does not contain .eml or .html files. Please check that the directory is containing the email data files".format(mypath))
return email_list

def get_text(file: Path) -> str:
"""Function to extract the textual content and other metadata from an email file.
Args:
file (Path): The path to the email file.
@staticmethod
def get_html_text(text_check: str) -> str:
    """Return the plain text of a string that may contain html markup.

    Args:
        text_check (str): The string that may contain html content.
    Returns:
        str: The text extracted by the html parser if the parser found
            something in the string, otherwise the input string unchanged."""
    parsed = BeautifulSoup(text_check, 'html.parser')
    # only replace the input when the parser actually located an element
    contains_markup = parsed.find()
    return parsed.get_text() if contains_markup else text_check

@staticmethod
def get_text(file: Path) -> str:
"""Function to extract the textual content and other metadata from an email file.
Returns:
str: The textual content of the email. In the future, this will return the
complete dictionary with the metadata."""
if not file.is_file(): # check if given file exists raises error otherwise
raise OSError("File {} does not exist".format(file))
with open(file, 'rb') as fhdl:
raw_email = fhdl.read()
ep = eml_parser.EmlParser(include_raw_body=True)
parsed_eml = ep.decode_email_bytes(raw_email)
attachmenttypes = []
# find if there are any attachements, and if yes, how many
attachments = len(parsed_eml["attachment"]) if "attachment" in parsed_eml else 0
# find the types of attachements
if attachments > 0:
attachmenttypes = [parsed_eml["attachment"][i]["extension"] for i in range(attachments)]
email_content = {"content": parsed_eml["body"][0]["content"],
"date": parsed_eml["header"]["date"],
"attachment": attachments,
"attachement type": attachmenttypes
}
return(email_content["content"])
Args:
file (Path): The path to the email file.
Returns:
str: The textual content of the email. In the future, this will return the
complete dictionary with the metadata."""
if not file.is_file(): # check if given file exists raises error otherwise
raise OSError("File {} does not exist".format(file))
with open(file, 'rb') as fhdl:
raw_email = fhdl.read()
ep = eml_parser.EmlParser(include_raw_body=True)
parsed_eml = ep.decode_email_bytes(raw_email)
attachmenttypes = []
# find if there are any attachements, and if yes, how many
attachments = len(parsed_eml["attachment"]) if "attachment" in parsed_eml else 0
# find the types of attachements
if attachments > 0:
attachmenttypes = [parsed_eml["attachment"][i]["extension"] for i in range(attachments)]
email_content = {"content": parsed_eml["body"][0]["content"],
"date": parsed_eml["header"]["date"],
"attachment": attachments,
"attachement type": attachmenttypes
}
return(email_content["content"])

def validate_data():
    """Placeholder without any implementation yet; always returns None."""
    return None

def data_to_xml():
    """Placeholder without any implementation yet; always returns None."""
    return None

def write_file(text: str, name: str)-> None:
"""Write the extracted string to a text file.
Expand Down
9 changes: 5 additions & 4 deletions mailcom/parse.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
import spacy as sp
from transformers import pipeline
from pathlib import Path
from mailcom.inout import get_text, list_of_files, get_html_text
from mailcom import inout

# please modify this section depending on your setup
# input language - either "es" or "fr"
Expand Down Expand Up @@ -116,11 +116,12 @@ def make_dir(path: str):
print("Generating output directory/ies.")
make_dir(path_output)
# process the text
eml_files = list_of_files(path_input)
io = inout.InoutHandler()
eml_files = io.list_of_files(path_input)
# html_files = list_of_files(path_input, "html")
for file in eml_files:
text = get_text(file)
text = get_html_text(text)
text = io.get_text(file)
text = io.get_html_text(text)
print(text)
# skip this text if email could not be parsed
if not text:
Expand Down

0 comments on commit 4a37487

Please sign in to comment.