Merge branch 'main' of github.com:pbs/pycaption into OCTO-10460-detec…

…t-format-crashed-with-empty-string
pbs · Aug 6, 2024 · 223524d · 223524d
2 parents 03c3544 + 6ae805c
commit 223524d
Show file tree

Hide file tree

Showing 20 changed files with 454 additions and 235 deletions.
diff --git a/docs/changelog.rst b/docs/changelog.rst
@@ -1,5 +1,48 @@
 Changelog
 ---------
+2.2.11
+^^^^^^
+- A space should not be placed before a mid row code if it follows a PAC command or a Tab Offset
+- The backspace command should be treated like other commands and duplicates should be skipped if PAC commands are duplicated
+- Prevent webvtt writer from creating a new cue in case of line break
+- In case of style setting PAC which also breaks the line, we add the break first, then the style tag
+
+2.2.10
+^^^^^^
+- Yanked.
+
+2.2.9
+^^^^^
+- Yanked.
+
+2.2.8
+^^^^^
+- Honor backspaces on captions in scc files
+- Mid-row codes which are preceded by a PAC command don't add spaces
+- Mid row codes which don't follow after a PAC and don't have a style reset command before will add a space to the end of the previous text node
+- Mid row codes which don't follow after a PAC and have a style reset command before will add a space to the beginning of the next text node
+- Background color codes delete the space in front of them
+
+2.2.7
+^^^^^
+- The cursor moves automatically one column to the right after each character or Mid-Row Code received.
+
+2.2.6
+^^^^^
+- Pass the caption cue time with all error messages.
+
+2.2.5
+^^^^^
+- Yanked.
+
+2.2.4
+^^^^^
+- Skip duplicated extended characters.
+
+2.2.3
+^^^^^
+- Add new substitute character to ignore before extended character in SCC input files
+
 2.2.2
 ^^^^^
 - Remove support for Python 3.6 & 3.7

diff --git a/docs/conf.py b/docs/conf.py
@@ -53,9 +53,9 @@
 # built documents.
 #
 # The short X.Y version.
-version = '2.2.0'
+version = '2.2.11'
 # The full version, including alpha/beta/rc tags.
-release = '2.2.0'
+release = '2.2.11'
 
 # The language for content autogenerated by Sphinx. Refer to documentation
 # for a list of supported languages.

diff --git a/pycaption/__init__.py b/pycaption/__init__.py
@@ -10,7 +10,7 @@
 from .transcript import TranscriptWriter
 from .webvtt import WebVTTReader, WebVTTWriter
 from .exceptions import (
-    CaptionReadError, CaptionReadNoCaptions, CaptionReadSyntaxError,
+    CaptionReadError, CaptionReadNoCaptions, CaptionReadSyntaxError, CaptionLineLengthError
 )
 
 

diff --git a/pycaption/base.py b/pycaption/base.py
@@ -114,13 +114,16 @@ class CaptionNode:
     STYLE = 2
     BREAK = 3
 
-    def __init__(self, type_, layout_info=None, content=None, start=None):
+    def __init__(
+            self, type_, layout_info=None, content=None, start=None, position=None
+    ):
         """
         :type type_: int
         :type layout_info: Layout
         """
         self.type_ = type_
         self.content = content
+        self.position = position
 
         # Boolean. Marks the beginning/ end of a Style node.
         self.start = start
@@ -139,19 +142,24 @@ def __repr__(self):
             raise RuntimeError(f'Unknown node type: {t}')
 
     @staticmethod
-    def create_text(text, layout_info=None):
+    def create_text(text, layout_info=None, position=None):
         return CaptionNode(
-            CaptionNode.TEXT, layout_info=layout_info, content=text)
+            type_=CaptionNode.TEXT, layout_info=layout_info,
+            position=position, content=text
+        )
 
     @staticmethod
     def create_style(start, content, layout_info=None):
         return CaptionNode(
-            CaptionNode.STYLE, layout_info=layout_info, content=content,
+            type_=CaptionNode.STYLE, layout_info=layout_info, content=content,
             start=start)
 
     @staticmethod
-    def create_break(layout_info=None):
-        return CaptionNode(CaptionNode.BREAK, layout_info=layout_info)
+    def create_break(layout_info=None, content=None):
+        return CaptionNode(
+            type_=CaptionNode.BREAK, layout_info=layout_info,
+            content=content
+        )
 
 
 class Caption:

diff --git a/pycaption/scc/__init__.py b/pycaption/scc/__init__.py
@@ -81,11 +81,11 @@
 import math
 import re
 import textwrap
-from collections import deque
+from collections import deque, defaultdict
 from copy import deepcopy
 
 from pycaption.base import (
-    BaseReader, BaseWriter, CaptionSet, CaptionNode,
+    BaseReader, BaseWriter, CaptionSet
 )
 from pycaption.exceptions import CaptionReadNoCaptions, InvalidInputError, \
     CaptionReadTimingError, CaptionLineLengthError
@@ -94,7 +94,7 @@
     MICROSECONDS_PER_CODEWORD, CHARACTER_TO_CODE,
     SPECIAL_OR_EXTENDED_CHAR_TO_CODE, PAC_BYTES_TO_POSITIONING_MAP,
     PAC_HIGH_BYTE_BY_ROW, PAC_LOW_BYTE_BY_ROW_RESTRICTED,
-    PAC_TAB_OFFSET_COMMANDS,
+    PAC_TAB_OFFSET_COMMANDS, CUE_STARTING_COMMAND
 )
 from .specialized_collections import (  # noqa: F401
     TimingCorrectingCaptionList, NotifyingDict, CaptionCreator,
@@ -164,6 +164,7 @@ def __init__(self, *args, **kw):
         )
 
         self.last_command = ''
+        self.double_starter = False
 
         self.buffer_dict = NotifyingDict()
 
@@ -223,6 +224,7 @@ def read(self, content, lang='en-US', simulate_roll_up=False, offset=0):
         # split lines
         lines = content.splitlines()
 
+
         # loop through each line except the first
         for line in lines[1:]:
             self._translate_line(line)
@@ -232,16 +234,24 @@ def read(self, content, lang='en-US', simulate_roll_up=False, offset=0):
         captions = CaptionSet({lang: self.caption_stash.get_all()})
 
         # check captions for incorrect lengths
-        lines = []
+        lines_too_long = defaultdict(list)
         for caption in self.caption_stash._collection:
+            caption_start = caption.to_real_caption().format_start()
             caption_text = "".join(caption.to_real_caption().get_text_nodes())
-            lines.extend(caption_text.split("\n"))
-        lines_too_long = [line for line in lines if len(line) > 32]
-
-        if bool(lines_too_long):
-            msg = ""
-            for line in lines_too_long:
-                msg += line + f" - Length { len(line)}" + "\n"
+            text_too_long = [line for line in caption_text.split("\n") if len(line) > 32]
+            if caption_start in lines_too_long:
+                lines_too_long[caption_start] = text_too_long
+            else:
+                lines_too_long[caption_start].extend(text_too_long)
+
+        msg = ""
+        if bool(lines_too_long.keys()):
+            for key in lines_too_long:
+                if lines_too_long[key]:
+                    msg += f"around {key} - "
+                    for line in lines_too_long[key]:
+                        msg += line + f" - Length { len(line)}" + "\n"
+        if len(msg):
             raise CaptionLineLengthError(
                 f"32 character limit for caption cue in scc file.\n"
                 f"Lines longer than 32:\n"
@@ -299,15 +309,20 @@ def _translate_line(self, line):
         parts = r.findall(line.lower())
 
         self.time_translator.start_at(parts[0][0])
+        word_list = parts[0][2].split(' ')
 
-        # loop through each word
-        for word in parts[0][2].split(' '):
-            # ignore empty results or invalid commands
+        for idx, word in enumerate(word_list):
             word = word.strip()
+            previous_is_pac_or_tab = len(word_list) > 1 and (
+                    _is_pac_command(word_list[idx - 1]) or word_list[idx - 1] in PAC_TAB_OFFSET_COMMANDS
+            )
             if len(word) == 4:
-                self._translate_word(word)
+                self._translate_word(
+                    word=word,
+                    previous_is_pac_or_tab=previous_is_pac_or_tab,
+                )
 
-    def _translate_word(self, word):
+    def _translate_word(self, word, previous_is_pac_or_tab):
         if self._handle_double_command(word):
             # count frames for timing
             self.time_translator.increment_frames()
@@ -316,7 +331,7 @@ def _translate_word(self, word):
         # TODO - check that all the positioning commands are here, or use
         # some other strategy to determine if the word is a command.
         if word in COMMANDS or _is_pac_command(word):
-            self._translate_command(word)
+            self._translate_command(word=word, previous_is_pac_or_tab=previous_is_pac_or_tab)
 
         # second, check if word is a special character
         elif word in SPECIAL_CHARS:
@@ -337,23 +352,33 @@ def _handle_double_command(self, word):
         # up for redundancy in case the signal is garbled in transmission.
         # The decoder is programmed to ignore a second command when it is the
         # same as the first.
-        # Also like codes, Special Characters are always doubled up,
+        # When a cue-starting command is doubled, doubled special characters
+        # and doubled extended characters are skipped as well,
         # with only one member of each pair being displayed.
-        if word in COMMANDS or _is_pac_command(word) or word in SPECIAL_CHARS:
-            if word == self.last_command:
-                self.last_command = ''
-                return True
+
+        doubled_types = word != "94a1" and word in COMMANDS or _is_pac_command(word)
+        if self.double_starter:
+            doubled_types = doubled_types or word in EXTENDED_CHARS or word == "94a1" or word in SPECIAL_CHARS
+
+        if word in CUE_STARTING_COMMAND and word != self.last_command:
+            self.double_starter = False
+
+        if doubled_types and word == self.last_command:
+            if word in CUE_STARTING_COMMAND:
+                self.double_starter = True
+            self.last_command = ''
+            return True
             # Fix for the <position> <tab offset> <position> <tab offset>
             # repetition
-            elif _is_pac_command(word) and word in self.last_command:
-                self.last_command = ''
+        elif _is_pac_command(word) and word in self.last_command:
+            self.last_command = ''
+            return True
+        elif word in PAC_TAB_OFFSET_COMMANDS:
+            if _is_pac_command(self.last_command):
+                self.last_command += f" {word}"
+                return False
+            else:
                 return True
-            elif word in PAC_TAB_OFFSET_COMMANDS:
-                if _is_pac_command(self.last_command):
-                    self.last_command += f" {word}"
-                    return False
-                else:
-                    return True
 
         self.last_command = word
         return False
@@ -362,12 +387,18 @@ def _translate_special_char(self, word):
         self.buffer.add_chars(SPECIAL_CHARS[word])
 
     def _translate_extended_char(self, word):
-        self.buffer.remove_ascii_duplicate(EXTENDED_CHARS[word])
-
+        """
+        Each of the 64 Extended Characters incorporates an automatic BS.
+        When an Extended Character is received, the cursor moves to the
+        left one column position (unless the Extended Character is the first
+        character on a row), erasing any character which may be in that location,
+        then displays the Extended Character.
+        """
+        self.buffer.handle_backspace(word)
         # add to buffer
         self.buffer.add_chars(EXTENDED_CHARS[word])
 
-    def _translate_command(self, word):
+    def _translate_command(self, word, previous_is_pac_or_tab):
         # if command is pop_up
         if word == '9420':
             self.buffer_dict.set_active('pop')
@@ -436,7 +467,10 @@ def _translate_command(self, word):
 
         # If command is not one of the aforementioned, add it to buffer
         else:
-            self.buffer.interpret_command(word)
+            self.buffer.interpret_command(
+                command=word,
+                previous_is_pac_or_tab=previous_is_pac_or_tab
+            )
 
     def _translate_characters(self, word):
         # split word into the 2 bytes