Skip to content

Commit

Permalink
Remove duplicate lines and long words during verification
Browse files Browse the repository at this point in the history
  • Loading branch information
sashacmc committed May 18, 2024
1 parent 68d30ff commit 7fc97bf
Show file tree
Hide file tree
Showing 2 changed files with 43 additions and 2 deletions.
25 changes: 24 additions & 1 deletion src/mmdiary/transcriber/verifier.py
Original file line number Diff line number Diff line change
Expand Up @@ -103,6 +103,28 @@ def clean_wrong_symbols(s, language):
return res


def cut_long_words(s, max_len=30):
    """Truncate every space-separated word of *s* to at most *max_len* characters.

    The string is split on the single space character only, so runs of
    spaces are preserved as-is and other whitespace (tabs, newlines) is
    treated as part of a word.

    Args:
        s: The string to process.
        max_len: Maximum number of characters kept per word. Defaults to 30.

    Returns:
        The string with over-long words cut down. It equals *s* when no
        word exceeded the limit.

    Raises:
        ValueError: If *max_len* is negative.
    """
    if max_len < 0:
        # A negative bound would slice from the end of each word instead of
        # limiting its length, which is never what the caller wants.
        raise ValueError("max_len must be non-negative")

    res = " ".join([word[:max_len] for word in s.split(" ")])
    if res != s:
        logging.debug("Has long words: '%s'->'%s'", s, res)
    return res


def remove_duplicate_lines(src):
    """Collapse runs of three or more identical consecutive entries to two.

    An entry is dropped when it equals both of the two entries that
    precede it in *src*; every other entry is kept in its original order.
    The input sequence itself is not modified.

    Args:
        src: Sequence of lines to filter.

    Returns:
        A new sequence holding the kept entries.
    """
    # The first two entries can never complete a run of three.
    kept = src[:2]
    # Slide a three-wide window over the input: (two back, one back, current).
    for two_back, one_back, current in zip(src, src[1:], src[2:]):
        if current != one_back or current != two_back:
            kept.append(current)
    if kept != src:
        logging.debug("Has string duplicates: '%s'->'%s'", src, kept)
    return kept


def check_text(text, language):
if text == "":
return text
Expand All @@ -115,7 +137,8 @@ def check_text(text, language):
if s == "":
logging.debug("Has empty string (was '%s')", t)
else:
res.append(s)
res.append(cut_long_words(s))
res = remove_duplicate_lines(res)
return "\n".join(res)


Expand Down
20 changes: 19 additions & 1 deletion tests/transcriber/test_verifier.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
import pytest

from mmdiary.transcriber.verifier import check_text
from mmdiary.transcriber.verifier import check_text, remove_duplicate_lines


@pytest.mark.parametrize(
Expand Down Expand Up @@ -35,3 +35,21 @@
)
def test_hall_match(par, expected):
assert check_text(par, "ru") == expected


# Table-driven test for remove_duplicate_lines: each case pairs an input list
# of lines ("par") with the list the function is expected to return.
@pytest.mark.parametrize(
"par,expected",
[
# Empty input comes back empty.
([], []),
# A single line is returned unchanged.
(["line1"], ["line1"]),
# Lines with no repeats are returned unchanged.
(["line1", "line2", "line3"], ["line1", "line2", "line3"]),
# A run of two identical lines is kept as is.
(["line1", "line2", "line2"], ["line1", "line2", "line2"]),
# A run of three identical lines is cut down to two.
(["line1", "line1", "line1"], ["line1", "line1"]),
# A run of five in the middle is cut to two; the lines around it are kept.
(
["line1", "line2", "line3", "line3", "line3", "line3", "line3", "line1", "line2"],
["line1", "line2", "line3", "line3", "line1", "line2"],
),
],
)
def test_remove_duplicate_lines(par, expected):
assert remove_duplicate_lines(par) == expected

0 comments on commit 7fc97bf

Please sign in to comment.