Skip to content

Commit

Permalink
Merge branch 'main' of github.com:pbs/pycaption into OCTO-10460-detec…
Browse files Browse the repository at this point in the history
…t-format-crashed-with-empty-string
  • Loading branch information
OlteanuRares committed Aug 6, 2024
2 parents 03c3544 + 6ae805c commit 223524d
Show file tree
Hide file tree
Showing 20 changed files with 454 additions and 235 deletions.
43 changes: 43 additions & 0 deletions docs/changelog.rst
Original file line number Diff line number Diff line change
@@ -1,5 +1,48 @@
Changelog
---------
2.2.11
^^^^^^
- A space should not be placed before a mid row code if it follows a PAC command or a Tab Offset
- The backspace command should be treated like other commands and duplicates should be skipped if PAC commands are duplicated
- Prevent webvtt writer from creating a new cue in case of line break
- In case of style setting PAC which also breaks the line, we add the break first, then the style tag

2.2.10
^^^^^^
- Yanked.

2.2.9
^^^^^
- Yanked.

2.2.8
^^^^^
- Honor backspaces on captions in scc files
- Mid-row codes which are preceded by a PAC command don't add spaces
- Mid-row codes which don't follow a PAC and are not preceded by a style reset command add a space to the end of the previous text node
- Mid-row codes which don't follow a PAC and are preceded by a style reset command add a space to the beginning of the next text node
- Background color codes delete the space in front of them

2.2.7
^^^^^
- The cursor moves automatically one column to the right after each character or Mid-Row Code received.

2.2.6
^^^^^
- Pass the caption cue time with all error messages.

2.2.5
^^^^^
- Yanked.

2.2.4
^^^^^
- Skip duplicated extended characters.

2.2.3
^^^^^
- Add new substitute character to ignore before extended character in SCC input files

2.2.2
^^^^^
- Remove support for Python 3.6 & 3.7
Expand Down
4 changes: 2 additions & 2 deletions docs/conf.py
Original file line number Diff line number Diff line change
Expand Up @@ -53,9 +53,9 @@
# built documents.
#
# The short X.Y version.
version = '2.2.0'
version = '2.2.11'
# The full version, including alpha/beta/rc tags.
release = '2.2.0'
release = '2.2.11'

# The language for content autogenerated by Sphinx. Refer to documentation
# for a list of supported languages.
Expand Down
2 changes: 1 addition & 1 deletion pycaption/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@
from .transcript import TranscriptWriter
from .webvtt import WebVTTReader, WebVTTWriter
from .exceptions import (
CaptionReadError, CaptionReadNoCaptions, CaptionReadSyntaxError,
CaptionReadError, CaptionReadNoCaptions, CaptionReadSyntaxError, CaptionLineLengthError
)


Expand Down
20 changes: 14 additions & 6 deletions pycaption/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -114,13 +114,16 @@ class CaptionNode:
STYLE = 2
BREAK = 3

def __init__(self, type_, layout_info=None, content=None, start=None):
def __init__(
self, type_, layout_info=None, content=None, start=None, position=None
):
"""
:type type_: int
:type layout_info: Layout
"""
self.type_ = type_
self.content = content
self.position = position

# Boolean. Marks the beginning/ end of a Style node.
self.start = start
Expand All @@ -139,19 +142,24 @@ def __repr__(self):
raise RuntimeError(f'Unknown node type: {t}')

@staticmethod
def create_text(text, layout_info=None):
def create_text(text, layout_info=None, position=None):
return CaptionNode(
CaptionNode.TEXT, layout_info=layout_info, content=text)
type_=CaptionNode.TEXT, layout_info=layout_info,
position=position, content=text
)

@staticmethod
def create_style(start, content, layout_info=None):
return CaptionNode(
CaptionNode.STYLE, layout_info=layout_info, content=content,
type_=CaptionNode.STYLE, layout_info=layout_info, content=content,
start=start)

@staticmethod
def create_break(layout_info=None):
return CaptionNode(CaptionNode.BREAK, layout_info=layout_info)
def create_break(layout_info=None, content=None):
return CaptionNode(
type_=CaptionNode.BREAK, layout_info=layout_info,
content=content
)


class Caption:
Expand Down
102 changes: 68 additions & 34 deletions pycaption/scc/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -81,11 +81,11 @@
import math
import re
import textwrap
from collections import deque
from collections import deque, defaultdict
from copy import deepcopy

from pycaption.base import (
BaseReader, BaseWriter, CaptionSet, CaptionNode,
BaseReader, BaseWriter, CaptionSet
)
from pycaption.exceptions import CaptionReadNoCaptions, InvalidInputError, \
CaptionReadTimingError, CaptionLineLengthError
Expand All @@ -94,7 +94,7 @@
MICROSECONDS_PER_CODEWORD, CHARACTER_TO_CODE,
SPECIAL_OR_EXTENDED_CHAR_TO_CODE, PAC_BYTES_TO_POSITIONING_MAP,
PAC_HIGH_BYTE_BY_ROW, PAC_LOW_BYTE_BY_ROW_RESTRICTED,
PAC_TAB_OFFSET_COMMANDS,
PAC_TAB_OFFSET_COMMANDS, CUE_STARTING_COMMAND
)
from .specialized_collections import ( # noqa: F401
TimingCorrectingCaptionList, NotifyingDict, CaptionCreator,
Expand Down Expand Up @@ -164,6 +164,7 @@ def __init__(self, *args, **kw):
)

self.last_command = ''
self.double_starter = False

self.buffer_dict = NotifyingDict()

Expand Down Expand Up @@ -223,6 +224,7 @@ def read(self, content, lang='en-US', simulate_roll_up=False, offset=0):
# split lines
lines = content.splitlines()


# loop through each line except the first
for line in lines[1:]:
self._translate_line(line)
Expand All @@ -232,16 +234,24 @@ def read(self, content, lang='en-US', simulate_roll_up=False, offset=0):
captions = CaptionSet({lang: self.caption_stash.get_all()})

# check captions for incorrect lengths
lines = []
lines_too_long = defaultdict(list)
for caption in self.caption_stash._collection:
caption_start = caption.to_real_caption().format_start()
caption_text = "".join(caption.to_real_caption().get_text_nodes())
lines.extend(caption_text.split("\n"))
lines_too_long = [line for line in lines if len(line) > 32]

if bool(lines_too_long):
msg = ""
for line in lines_too_long:
msg += line + f" - Length { len(line)}" + "\n"
text_too_long = [line for line in caption_text.split("\n") if len(line) > 32]
if caption_start in lines_too_long:
lines_too_long[caption_start] = text_too_long
else:
lines_too_long[caption_start].extend(text_too_long)

msg = ""
if bool(lines_too_long.keys()):
for key in lines_too_long:
if lines_too_long[key]:
msg += f"around {key} - "
for line in lines_too_long[key]:
msg += line + f" - Length { len(line)}" + "\n"
if len(msg):
raise CaptionLineLengthError(
f"32 character limit for caption cue in scc file.\n"
f"Lines longer than 32:\n"
Expand Down Expand Up @@ -299,15 +309,20 @@ def _translate_line(self, line):
parts = r.findall(line.lower())

self.time_translator.start_at(parts[0][0])
word_list = parts[0][2].split(' ')

# loop through each word
for word in parts[0][2].split(' '):
# ignore empty results or invalid commands
for idx, word in enumerate(word_list):
word = word.strip()
previous_is_pac_or_tab = len(word_list) > 1 and (
_is_pac_command(word_list[idx - 1]) or word_list[idx - 1] in PAC_TAB_OFFSET_COMMANDS
)
if len(word) == 4:
self._translate_word(word)
self._translate_word(
word=word,
previous_is_pac_or_tab=previous_is_pac_or_tab,
)

def _translate_word(self, word):
def _translate_word(self, word, previous_is_pac_or_tab):
if self._handle_double_command(word):
# count frames for timing
self.time_translator.increment_frames()
Expand All @@ -316,7 +331,7 @@ def _translate_word(self, word):
# TODO - check that all the positioning commands are here, or use
# some other strategy to determine if the word is a command.
if word in COMMANDS or _is_pac_command(word):
self._translate_command(word)
self._translate_command(word=word, previous_is_pac_or_tab=previous_is_pac_or_tab)

# second, check if word is a special character
elif word in SPECIAL_CHARS:
Expand All @@ -337,23 +352,33 @@ def _handle_double_command(self, word):
# up for redundancy in case the signal is garbled in transmission.
# The decoder is programmed to ignore a second command when it is the
# same as the first.
# Also like codes, Special Characters are always doubled up,
# If we have doubled commands we're skipping also
# doubled special characters and doubled extended characters
# with only one member of each pair being displayed.
if word in COMMANDS or _is_pac_command(word) or word in SPECIAL_CHARS:
if word == self.last_command:
self.last_command = ''
return True

doubled_types = word != "94a1" and word in COMMANDS or _is_pac_command(word)
if self.double_starter:
doubled_types = doubled_types or word in EXTENDED_CHARS or word == "94a1" or word in SPECIAL_CHARS

if word in CUE_STARTING_COMMAND and word != self.last_command:
self.double_starter = False

if doubled_types and word == self.last_command:
if word in CUE_STARTING_COMMAND:
self.double_starter = True
self.last_command = ''
return True
# Fix for the <position> <tab offset> <position> <tab offset>
# repetition
elif _is_pac_command(word) and word in self.last_command:
self.last_command = ''
elif _is_pac_command(word) and word in self.last_command:
self.last_command = ''
return True
elif word in PAC_TAB_OFFSET_COMMANDS:
if _is_pac_command(self.last_command):
self.last_command += f" {word}"
return False
else:
return True
elif word in PAC_TAB_OFFSET_COMMANDS:
if _is_pac_command(self.last_command):
self.last_command += f" {word}"
return False
else:
return True

self.last_command = word
return False
Expand All @@ -362,12 +387,18 @@ def _translate_special_char(self, word):
self.buffer.add_chars(SPECIAL_CHARS[word])

def _translate_extended_char(self, word):
self.buffer.remove_ascii_duplicate(EXTENDED_CHARS[word])

"""
Each of the 64 Extended Characters incorporates an automatic BS.
When an Extended Character is received, the cursor moves to the
left one column position (unless the Extended Character is the first
character on a row), erasing any character which may be in that location,
then displays the Extended Character.
"""
self.buffer.handle_backspace(word)
# add to buffer
self.buffer.add_chars(EXTENDED_CHARS[word])

def _translate_command(self, word):
def _translate_command(self, word, previous_is_pac_or_tab):
# if command is pop_up
if word == '9420':
self.buffer_dict.set_active('pop')
Expand Down Expand Up @@ -436,7 +467,10 @@ def _translate_command(self, word):

# If command is not one of the aforementioned, add it to buffer
else:
self.buffer.interpret_command(word)
self.buffer.interpret_command(
command=word,
previous_is_pac_or_tab=previous_is_pac_or_tab
)

def _translate_characters(self, word):
# split word into the 2 bytes
Expand Down
Loading

0 comments on commit 223524d

Please sign in to comment.