Skip to content

Commit

Permalink
Implement suppress_numerals
Browse files Browse the repository at this point in the history
  • Loading branch information
darkmirage committed Oct 15, 2024
1 parent d57c5b4 commit 4e31cf4
Showing 1 changed file with 21 additions and 1 deletion.
22 changes: 21 additions & 1 deletion faster_whisper/transcribe.py
Original file line number Diff line number Diff line change
Expand Up @@ -100,6 +100,16 @@ class TranscriptionInfo(NamedTuple):
vad_options: VadOptions


def find_numeral_symbol_tokens(tokenizer: Tokenizer):
    """Return the ids of vocabulary tokens that contain a digit or symbol.

    Each id in ``range(tokenizer.eot)`` is decoded on its own, and the id is
    kept when the decoded text contains at least one of the characters
    ``0123456789%$£¥``.

    Args:
        tokenizer: Object exposing an integer ``eot`` attribute and a
            ``decode(list_of_ids)`` method returning the decoded text.

    Returns:
        The matching token ids as a list, in ascending order.
    """
    numeral_symbols = frozenset("0123456789%$£¥")
    # The decoded text is scanned as-is: a leading space can never match the
    # set above, so stripping it first would not change the result (and
    # str.removeprefix only exists on Python 3.9+).
    return [
        token_id
        for token_id in range(tokenizer.eot)
        if any(char in numeral_symbols for char in tokenizer.decode([token_id]))
    ]


# The code below originally comes from the HF pipeline; it is also used in
# whisper-x (https://github.com/m-bain/whisperX) and was adapted for faster_whisper

Expand Down Expand Up @@ -314,6 +324,7 @@ def transcribe(
prefix: Optional[str] = None,
suppress_blank: bool = True,
suppress_tokens: Optional[List[int]] = [-1],
suppress_numerals: bool = False,
prepend_punctuations: str = "\"'“¿([{-",
append_punctuations: str = "\"'.。,,!!??::”)]}、",
max_new_tokens: Optional[int] = None,
Expand Down Expand Up @@ -485,7 +496,9 @@ def transcribe(
initial_prompt=initial_prompt,
prefix=prefix,
suppress_blank=suppress_blank,
suppress_tokens=get_suppressed_tokens(self.tokenizer, suppress_tokens),
suppress_tokens=get_suppressed_tokens(
self.tokenizer, suppress_tokens, suppress_numerals
),
prepend_punctuations=prepend_punctuations,
append_punctuations=append_punctuations,
max_new_tokens=max_new_tokens,
Expand Down Expand Up @@ -2110,6 +2123,7 @@ def get_compression_ratio(text: str) -> float:
def get_suppressed_tokens(
tokenizer: Tokenizer,
suppress_tokens: Tuple[int],
suppress_numerals: bool = False,
) -> Optional[List[int]]:
if -1 in suppress_tokens:
suppress_tokens = [t for t in suppress_tokens if t >= 0]
Expand All @@ -2129,6 +2143,12 @@ def get_suppressed_tokens(
]
)

# This is not present in the original faster_whisper implementation
# Follows the same logic as whisperx
if suppress_numerals:
numeral_symbol_tokens = find_numeral_symbol_tokens(tokenizer)
suppress_tokens.extend(numeral_symbol_tokens)

return tuple(sorted(set(suppress_tokens)))


Expand Down

0 comments on commit 4e31cf4

Please sign in to comment.