Skip to content

Commit

Permalink
changed text extraction
Browse files Browse the repository at this point in the history
  • Loading branch information
Olthoff231381 committed Sep 2, 2024
1 parent c70f681 commit 7666065
Show file tree
Hide file tree
Showing 2 changed files with 21 additions and 11 deletions.
29 changes: 19 additions & 10 deletions mailcom/inout.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
from email import policy
from email.parser import BytesParser
import eml_parser
from pathlib import Path


Expand All @@ -14,16 +15,24 @@ def list_of_files(directory_name: str) -> list[Path]:


def get_text(name):
    """Return the text of the first body part of an e-mail file.

    The file is read as raw bytes and parsed with ``eml_parser``. The date
    header, the number of attachments and their file extensions are gathered
    alongside the body text, but only the body text is returned.

    Args:
        name: Path of the e-mail file to read.

    Returns:
        The content of the first body part, or ``None`` (after printing a
        warning) when the parsed message has no body content.
    """
    with open(name, "rb") as fhdl:
        raw_email = fhdl.read()

    ep = eml_parser.EmlParser(include_raw_body=True)
    parsed_eml = ep.decode_email_bytes(raw_email)

    # Guard the body lookup: a message without any body part would otherwise
    # fail with IndexError/KeyError instead of being reported to the user.
    bodies = parsed_eml.get("body") or []
    if not bodies or "content" not in bodies[0]:
        print("ATTENTION ATTENTION ATTENTION")
        print("Could not parse email {}".format(name))
        print("ATTENTION ATTENTION ATTENTION")
        return None

    # The "attachment" key is only looked at when present in the parse result.
    attachment_list = parsed_eml.get("attachment", [])
    # NOTE(review): presumably an attachment without a file extension has no
    # "extension" entry; .get() records None for it -- confirm with eml_parser.
    attachmenttypes = [item.get("extension") for item in attachment_list]

    # Metadata is collected next to the text; callers currently receive only
    # the "content" entry.
    email_content = {
        "content": bodies[0]["content"],
        "date": parsed_eml["header"]["date"],
        "attachment": len(attachment_list),
        "attachement type": attachmenttypes,
    }
    return email_content["content"]


def delete_header(text):
Expand Down
3 changes: 2 additions & 1 deletion requirements.txt
Original file line number Diff line number Diff line change
@@ -1,2 +1,3 @@
spacy
transformers
transformers
eml_parser

0 comments on commit 7666065

Please sign in to comment.