Changed inout to a class structure

ssciwr · Sep 10, 2024 · 4a37487 · 4a37487
1 parent b67c25f
commit 4a37487
Show file tree

Hide file tree

Showing 2 changed files with 67 additions and 57 deletions.
diff --git a/mailcom/inout.py b/mailcom/inout.py
@@ -3,63 +3,72 @@
 import eml_parser
 from bs4 import BeautifulSoup
 
-def list_of_files(directory_name: str) -> list[Path]:
-    """Function to create a list of files that are present in a directory as path objects.
-    
-    Args: 
-        directory_name (str): The directory where the files are located.
-    
-    Returns:
-        list[Path]: A list of Path objects that represent the files in the directory."""
-    if not os.path.exists(directory_name): # check if given dir exists raises error otherwise
-        raise OSError("Path {} does not exist".format(directory_name))
-    mypath = Path(directory_name)
-    pattern = [".eml", ".html"]  # we would not change the file type through user input
-    email_list = [mp.resolve() for mp in mypath.glob("**/*") if mp.suffix in pattern]
-    if len(email_list) == 0:
-        raise ValueError("The directory {} does not contain .eml or .html files. Please check that the directory is containing the email data files".format(mypath))
-    return email_list
-
-def get_html_text(text_check: str) -> str:
-    """Clean up a string if it contains html content.
-    Args:
-        text_check (str): The string that may contain html content.
+class InoutHandler:
+    @staticmethod
+    def list_of_files(directory_name: str) -> list[Path]:
+        """Function to create a list of files that are present in a directory as path objects.
+        
+        Args: 
+            directory_name (str): The directory where the files are located.
         
-    Returns:
-        str: The (potentially) cleaned up string."""
-    soup = BeautifulSoup(text_check , 'html.parser')
-    if soup.find():
-        text_check = soup.get_text()
-    return text_check
+        Returns:
+            list[Path]: A list of Path objects that represent the files in the directory."""
+        if not os.path.exists(directory_name): # check if given dir exists raises error otherwise
+            raise OSError("Path {} does not exist".format(directory_name))
+        mypath = Path(directory_name)
+        pattern = [".eml", ".html"]  # we would not change the file type through user input
+        email_list = [mp.resolve() for mp in mypath.glob("**/*") if mp.suffix in pattern]
+        if len(email_list) == 0:
+            raise ValueError("The directory {} does not contain .eml or .html files. Please check that the directory is containing the email data files".format(mypath))
+        return email_list
 
-def get_text(file: Path) -> str:
-    """Function to extract the textual content and other metadata from an email file.
-    
-    Args:
-        file (Path): The path to the email file.
+    @staticmethod
+    def get_html_text(text_check: str) -> str:
+        """Clean up a string if it contains html content.
+        Args:
+            text_check (str): The string that may contain html content.
+            
+        Returns:
+            str: The (potentially) cleaned up string."""
+        soup = BeautifulSoup(text_check , 'html.parser')
+        if soup.find():
+            text_check = soup.get_text()
+        return text_check
+
+    @staticmethod
+    def get_text(file: Path) -> str:
+        """Function to extract the textual content and other metadata from an email file.
         
-    Returns:
-        str: The textual content of the email. In the future, this will return the 
-        complete dictionary with the metadata."""
-    if not file.is_file(): # check if given file exists raises error otherwise
-        raise OSError("File {} does not exist".format(file))
-    with open(file, 'rb') as fhdl:
-        raw_email = fhdl.read()
-    ep = eml_parser.EmlParser(include_raw_body=True)
-    parsed_eml = ep.decode_email_bytes(raw_email)
-    attachmenttypes = []
-    # find if there are any attachements, and if yes, how many
-    attachments = len(parsed_eml["attachment"]) if "attachment" in parsed_eml else 0
-    # find the types of attachements
-    if attachments > 0:
-        attachmenttypes = [parsed_eml["attachment"][i]["extension"] for i in range(attachments)]
-    email_content = {"content": parsed_eml["body"][0]["content"], 
-                 "date": parsed_eml["header"]["date"], 
-                 "attachment": attachments, 
-                 "attachement type": attachmenttypes
-                 }
-    return(email_content["content"])
+        Args:
+            file (Path): The path to the email file.
+            
+        Returns:
+            str: The textual content of the email. In the future, this will return the 
+            complete dictionary with the metadata."""
+        if not file.is_file(): # check if given file exists raises error otherwise
+            raise OSError("File {} does not exist".format(file))
+        with open(file, 'rb') as fhdl:
+            raw_email = fhdl.read()
+        ep = eml_parser.EmlParser(include_raw_body=True)
+        parsed_eml = ep.decode_email_bytes(raw_email)
+        attachmenttypes = []
+        # find if there are any attachements, and if yes, how many
+        attachments = len(parsed_eml["attachment"]) if "attachment" in parsed_eml else 0
+        # find the types of attachements
+        if attachments > 0:
+            attachmenttypes = [parsed_eml["attachment"][i]["extension"] for i in range(attachments)]
+        email_content = {"content": parsed_eml["body"][0]["content"], 
+                    "date": parsed_eml["header"]["date"], 
+                    "attachment": attachments, 
+                    "attachement type": attachmenttypes
+                    }
+        return(email_content["content"])
 
+    def validate_data():
+        return
+
+    def data_to_xml():
+        return
 
 def write_file(text: str, name: str)-> None:
     """Write the extracted string to a text file.

diff --git a/mailcom/parse.py b/mailcom/parse.py
@@ -2,7 +2,7 @@
 import spacy as sp
 from transformers import pipeline
 from pathlib import Path
-from mailcom.inout import get_text, list_of_files, get_html_text
+from mailcom import inout
 
 # please modify this section depending on your setup
 # input language - either "es" or "fr"
@@ -116,11 +116,12 @@ def make_dir(path: str):
         print("Generating output directory/ies.")
         make_dir(path_output)
     # process the text
-    eml_files = list_of_files(path_input)
+    io = inout.InoutHandler()
+    eml_files = io.list_of_files(path_input)
     # html_files = list_of_files(path_input, "html")
     for file in eml_files:
-        text = get_text(file)
-        text = get_html_text(text)
+        text = io.get_text(file)
+        text = io.get_html_text(text)
         print(text)
         # skip this text if email could not be parsed
         if not text: