Skip to content

Commit

Permalink
Added functionality to process RTF and Word (DOC, DOCX) files
Browse files Browse the repository at this point in the history
  • Loading branch information
beveradb committed Nov 21, 2024
1 parent cea00cf commit dbf636b
Show file tree
Hide file tree
Showing 5 changed files with 645 additions and 319 deletions.
3 changes: 2 additions & 1 deletion karaoke_lyrics_processor/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -61,7 +61,8 @@ def main():
processor.process()
processor.write_to_output_file()

logger.info(f"Lyrics processing complete, lyrics written to output file: {output_filename}")
output_file = processor.output_filename
logger.info(f"Lyrics processing complete, lyrics written to output file: {output_file}")


if __name__ == "__main__":
Expand Down
64 changes: 57 additions & 7 deletions karaoke_lyrics_processor/karaoke_lyrics_processor.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,9 @@
import logging
import pyperclip
import unicodedata
import docx2txt
from striprtf.striprtf import rtf_to_text
import os


class KaraokeLyricsProcessor:
Expand Down Expand Up @@ -36,13 +39,44 @@ def __init__(
if input_lyrics_text is not None and input_filename is None:
self.input_lyrics_lines = input_lyrics_text.splitlines()
elif input_filename is not None and input_lyrics_text is None:
self.input_lyrics_lines = self.read_input_lyrics_file()
self.input_lyrics_lines = self.read_input_file()
else:
raise ValueError("Either input_lyrics or input_filename must be set, but not both.")

def read_input_lyrics_file(self):
    """
    Read the input lyrics file and return its lines.

    The file is decoded explicitly as UTF-8 so the result does not depend on
    the platform's default encoding.

    Returns:
        A list of lines exactly as produced by ``readlines()``, i.e. each
        line keeps its trailing newline character.
    """
    with open(self.input_filename, "r", encoding="utf-8") as infile:
        return infile.readlines()
def read_input_file(self):
    """
    Read the input file using the reader that matches its extension.

    The extension of ``self.input_filename`` is compared case-insensitively.
    ``.txt`` goes to ``read_txt_file``, ``.docx`` and ``.doc`` go to
    ``read_doc_file`` and ``.rtf`` goes to ``read_rtf_file``.

    Returns:
        Whatever the selected reader returns.

    Raises:
        ValueError: If the extension is not one of the supported ones.
    """
    _, suffix = os.path.splitext(self.input_filename)
    file_extension = suffix.lower()

    # Map each supported extension to the name of the method that reads it.
    reader_names = {
        ".txt": "read_txt_file",
        ".docx": "read_doc_file",
        ".doc": "read_doc_file",
        ".rtf": "read_rtf_file",
    }

    reader_name = reader_names.get(file_extension)
    if reader_name is None:
        raise ValueError(f"Unsupported file format: {file_extension}")

    return getattr(self, reader_name)()

def read_txt_file(self):
    """
    Read a plain-text lyrics file and return its cleaned lines.

    The file at ``self.input_filename`` is decoded as UTF-8, passed through
    ``clean_text`` and split into a list of lines.
    """
    with open(self.input_filename, "r", encoding="utf-8") as source:
        raw_contents = source.read()

    cleaned = self.clean_text(raw_contents)
    return cleaned.splitlines()

def read_doc_file(self):
    """
    Extract the text of a Word document and return its cleaned lines.

    The text is obtained with ``docx2txt.process``, passed through
    ``clean_text`` and split into a list of lines.
    """
    # NOTE(review): this reader is also used for ".doc" inputs; docx2txt
    # appears to target the .docx format - confirm legacy .doc files parse.
    extracted = docx2txt.process(self.input_filename)
    cleaned = self.clean_text(extracted)
    return cleaned.splitlines()

def read_rtf_file(self):
    """
    Read an RTF lyrics file and return its cleaned plain-text lines.

    The file at ``self.input_filename`` is decoded as UTF-8, converted to
    plain text with ``rtf_to_text``, passed through ``clean_text`` and split
    into a list of lines.
    """
    with open(self.input_filename, "r", encoding="utf-8") as rtf_handle:
        raw_rtf = rtf_handle.read()

    return self.clean_text(rtf_to_text(raw_rtf)).splitlines()

def clean_text(self, text):
    """
    Normalise raw text extracted from an input file.

    Steps, in order:

    1. Convert ``\\r\\n`` and lone ``\\r`` line endings to ``\\n``.
    2. Replace non-printable whitespace (tabs, non-breaking spaces, ...)
       with a regular space and drop every other non-printable character.
    3. Strip leading/trailing whitespace from each line.
    4. Collapse runs of newlines into a single newline.

    Args:
        text: The raw text to clean.

    Returns:
        The cleaned text, with lines separated by single ``\\n`` characters.
    """
    # Normalise line endings first so a carriage return is never mistaken
    # for an ordinary whitespace character below.
    text = text.replace("\r\n", "\n").replace("\r", "\n")

    kept_chars = []
    for char in text:
        if char == "\n" or char.isprintable():
            kept_chars.append(char)
        elif char.isspace():
            # str.isprintable() is False for tabs and non-breaking spaces;
            # deleting them would glue neighbouring words together, so they
            # become a plain space instead.
            kept_chars.append(" ")
        # Anything else (control/format characters) is dropped.
    text = "".join(kept_chars)

    # Strip each line before collapsing newlines, so that lines containing
    # only whitespace are treated as blank lines and collapsed too.
    text = "\n".join(line.strip() for line in text.splitlines())

    # Replace multiple newlines with a single newline
    return re.sub(r"\n{2,}", "\n", text)

def find_best_split_point(self, line):
"""
Expand Down Expand Up @@ -84,7 +118,7 @@ def find_best_split_point(self, line):

# If the line is still too long, find the last space before max_line_length
if len(line) > self.max_line_length:
last_space = line.rfind(' ', 0, self.max_line_length)
last_space = line.rfind(" ", 0, self.max_line_length)
if last_space != -1:
self.logger.debug(f"Splitting at last space before max_line_length: {last_space}")
return last_space
Expand All @@ -111,13 +145,25 @@ def replace_non_printable_spaces(self, text):
self.logger.debug(f"Text after replacing non-printable spaces: {cleaned_text}")
return cleaned_text

def clean_punctuation_spacing(self, text):
    """
    Remove unnecessary spaces before punctuation marks.

    Only horizontal whitespace is removed. Line breaks are left untouched,
    so a line that begins with a punctuation mark is never merged into the
    previous line when this is applied to multi-line text.

    Args:
        text: A single line or a block of newline-separated lines.

    Returns:
        The text with whitespace before ``, . ! ? : ;`` removed.
    """
    self.logger.debug(f"Cleaning punctuation spacing in: {text}")
    # Remove space before comma, period, exclamation mark, question mark, colon, and semicolon.
    # "[^\S\r\n]" means "whitespace other than a line break"; a plain "\s" would
    # also swallow the newline in front of a line starting with punctuation.
    cleaned_text = re.sub(r"[^\S\r\n]+([,.!?:;])", r"\1", text)
    self.logger.debug(f"Text after cleaning punctuation spacing: {cleaned_text}")
    return cleaned_text

def process_line(self, line):
"""
Process a single line to ensure it's within the maximum length,
handle parentheses, and replace non-printable spaces.
"""
# Replace non-printable spaces at the beginning
line = self.replace_non_printable_spaces(line)
# Clean up punctuation spacing
line = self.clean_punctuation_spacing(line)

processed_lines = []
iteration_count = 0
Expand Down Expand Up @@ -180,8 +226,9 @@ def process(self):

processed_lyrics_text = "\n".join(lyrics_lines)

# Final pass to replace any remaining non-printable spaces
# Final pass to replace any remaining non-printable spaces and clean punctuation
processed_lyrics_text = self.replace_non_printable_spaces(processed_lyrics_text)
processed_lyrics_text = self.clean_punctuation_spacing(processed_lyrics_text)

self.processed_lyrics_text = processed_lyrics_text
pyperclip.copy(processed_lyrics_text)
Expand All @@ -191,8 +238,11 @@ def process(self):
return processed_lyrics_text

def write_to_output_file(self):
    """
    Write the processed lyrics to the output file as UTF-8 text.

    ``self.output_filename`` is rewritten so that it always ends in
    ``.txt`` (any existing extension is replaced) before the file is
    opened, so callers can read the attribute afterwards to learn the path
    that was actually written.
    """
    # Ensure the output filename has a .txt extension
    base, _ = os.path.splitext(self.output_filename)
    self.output_filename = f"{base}.txt"

    # Explicit encoding keeps the output independent of the platform default.
    with open(self.output_filename, "w", encoding="utf-8") as outfile:
        outfile.write(self.processed_lyrics_text)

    self.logger.info(f"Processed lyrics written to output file {self.output_filename}")
Loading

0 comments on commit dbf636b

Please sign in to comment.