Skip to content

Commit

Permalink
Merge remote-tracking branch 'origin/main' into clipping_subsampler_r…
Browse files Browse the repository at this point in the history
…efactor
  • Loading branch information
MattUnderscoreZhang committed Jan 22, 2024
2 parents 388f51a + 0e68456 commit 5101379
Show file tree
Hide file tree
Showing 13 changed files with 185 additions and 42 deletions.
2 changes: 1 addition & 1 deletion API.md
Original file line number Diff line number Diff line change
Expand Up @@ -141,7 +141,7 @@ reading:
yt_args:
download_size: 360
yt_metadata_args:
writesubtitles: True
writesubtitles: 'all'
subtitleslangs: ['en']
writeautomaticsub: True
get_info: True
Expand Down
2 changes: 1 addition & 1 deletion dataset_examples/HDVILA.md
Original file line number Diff line number Diff line change
Expand Up @@ -68,7 +68,7 @@ reading:
download_size: 360
download_audio_rate: 44100
yt_metadata_args:
writesubtitles: True
writesubtitles: 'all'
subtitleslangs: ['en']
writeautomaticsub: True
get_info: True
Expand Down
2 changes: 1 addition & 1 deletion dataset_examples/VideoCC.md
Original file line number Diff line number Diff line change
Expand Up @@ -44,7 +44,7 @@ reading:
download_size: 360
download_audio_rate: 44100
yt_metadata_args:
writesubtitles: True
writesubtitles: 'all'
subtitleslangs: ['en']
writeautomaticsub: True
get_info: True
Expand Down
2 changes: 1 addition & 1 deletion examples/default_slurm.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@ reading:
download_size: 360
download_audio_rate: 44100
yt_metadata_args:
writesubtitles: True
writesubtitles: 'all'
subtitleslangs: ['en']
writeautomaticsub: True
get_info: True
Expand Down
117 changes: 116 additions & 1 deletion examples/yt_metadata.md
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@ yt_args:
download_size: 360
download_audio_rate: 44100
yt_metadata_args:
writesubtitles: True
writesubtitles: 'all'
subtitleslangs: ['en']
writeautomaticsub: True
get_info: True
Expand Down Expand Up @@ -123,3 +123,118 @@ For every sample the metadata will be present in the json file as such:
```

And since we specified `captions_are_subtitles`, the txt file will contain the subtitle for that given clip. For this particular example it would be: "analytics to assess performance based on"

#### Multilingual Subtitles
To control the language/s of the subtitles from your videos, you can provide either `'first'` or `'all'` for `writesubtitles` (any other value that evaluates to True will also work as `'all'`).

`first`: This will extract subtitles only for the first language in `subtitleslangs` for which subtitles exist. \
`all`: Attempt to extract subtitles for every language in `subtitleslangs`.

Below are some example outputs with `subtitleslangs: ['en', 'es', 'fr']`.

Using `writesubtitles: 'first'`:
```json
{
"url": "https://www.youtube.com/watch?v=CvHAfXKIvgw",
...
"yt_meta_dict": {
...
"subtitles": {
"en": [
{
"start": "00:00:02.100",
"end": "00:00:03.360",
"lines": [
"Good morning Lisa"
]
},
...
]
}
},
"clips": [
[
2.1,
3.36
]
],
"clip_subtitles": [
{
"start": "00:00:02.100",
"end": "00:00:03.360",
"lines": {
"en": [
"Good morning Lisa"
]
}
}
]
}
```


Using `writesubtitles: 'all'`:
```json
{
"url": "https://www.youtube.com/watch?v=CvHAfXKIvgw",
...
"yt_meta_dict": {
...
"subtitles": {
"en": [
{
"start": "00:00:02.100",
"end": "00:00:03.360",
"lines": [
"Good morning Lisa"
]
},
...
],
"es": [
{
"start": "00:00:02.100",
"end": "00:00:03.360",
"lines": [
"Buenos d\u00edas Lisa"
]
},
...
],
"fr": [
{
"start": "00:00:02.100",
"end": "00:00:03.360",
"lines": [
"Bonjour Lisa"
]
},
...
]
}
},
"clips": [
[
2.1,
3.36
]
],
"clip_subtitles": [
{
"start": "00:00:02.100",
"end": "00:00:03.360",
"lines": {
"en": [
"Good morning Lisa"
],
"es": [
"Buenos d\u00edas Lisa"
],
"fr": [
"Bonjour Lisa"
]
}
}
]
}
```
4 changes: 2 additions & 2 deletions requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -14,10 +14,10 @@ timeout-decorator
scenedetect[opencv]==0.6
decord
#eva-decord # use eva-decord instead of decord on Mac (https://github.com/dmlc/decord/issues/213)
torch==2.0.0
torch==2.1.2
langdetect
torchdata==0.7.1
torchaudio==2.0.0
torchaudio==2.1.2
soundfile
omegaconf
einops
Expand Down
5 changes: 3 additions & 2 deletions tests/test_yt_meta.py
Original file line number Diff line number Diff line change
Expand Up @@ -51,8 +51,9 @@ def test_subtitles(input_file):
yt_meta_dict = get_yt_meta(url, yt_metadata_args)

assert type(yt_meta_dict) == dict
assert type(yt_meta_dict["subtitles"]) == list
assert type(yt_meta_dict["subtitles"][0]) == dict
assert type(yt_meta_dict["subtitles"]) == dict
assert type(yt_meta_dict["subtitles"]["en"]) == list
assert type(yt_meta_dict["subtitles"]["en"][0]) == dict


@pytest.mark.parametrize("input_file", ["test_yt.csv"])
Expand Down
2 changes: 1 addition & 1 deletion video2dataset/configs/default.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@ reading:
download_size: 360
download_audio_rate: 44100
yt_metadata_args:
writesubtitles: True
writesubtitles: 'all'
subtitleslangs: ['en']
writeautomaticsub: True
get_info: True
Expand Down
2 changes: 1 addition & 1 deletion video2dataset/configs/downsample_ml.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@ reading:
download_size: 360
download_audio_rate: 44100
yt_metadata_args:
writesubtitles: True
writesubtitles: 'all'
subtitleslangs: ['en']
writeautomaticsub: True
get_info: True
Expand Down
25 changes: 16 additions & 9 deletions video2dataset/data_reader.py
Original file line number Diff line number Diff line change
Expand Up @@ -56,16 +56,16 @@ def get_yt_meta(url, yt_metadata_args: dict) -> dict:
"""Return yt meta dict with meta data and/or subtitles
yt_metadata_args is a dict of follwing format:
yt_metadata_args = {
'writesubtitles': True,
'writesubtitles': 'first',
'subtitleslangs': ['en'],
'writeautomaticsub': True,
'get_info': True
}
writesubtitles: Whether to write subtitles
writesubtitles: Whether to write subtitles for each provided language or just the first present
writeautomaticsub: Write the automatically generated subtitles to a file
subtitleslangs: List of languages of the subtitles to download.
get_info: whether to add info (title, description, tags etc) to the output.
get_info: Whether to add info (title, description, tags etc) to the output.
"""

write_subs = yt_metadata_args.get("writesubtitles", None)
Expand All @@ -74,15 +74,22 @@ def get_yt_meta(url, yt_metadata_args: dict) -> dict:
yt_metadata_args["ignoreerrors"] = True
yt_metadata_args["quiet"] = True

info_dict, sub_dict = None, None
info_dict, full_sub_dict = None, None

with yt_dlp.YoutubeDL(yt_metadata_args) as yt:
info_dict = yt.extract_info(url, download=False)
if write_subs:
sub_url = info_dict["requested_subtitles"][yt_metadata_args["subtitleslangs"][0]]["url"]
res = requests.get(sub_url, timeout=10)
sub = io.TextIOWrapper(io.BytesIO(res.content)).read()
sub_dict = sub_to_dict(sub)
full_sub_dict = {}
for lang in yt_metadata_args["subtitleslangs"]:
if lang not in info_dict["requested_subtitles"]:
continue
sub_url = info_dict["requested_subtitles"][lang]["url"]
res = requests.get(sub_url, timeout=10)
sub = io.TextIOWrapper(io.BytesIO(res.content)).read()
full_sub_dict[lang] = sub_to_dict(sub)

if write_subs == "first":
break

if yt_metadata_args["get_info"]:
info_dict.pop("subtitles")
Expand All @@ -93,7 +100,7 @@ def get_yt_meta(url, yt_metadata_args: dict) -> dict:
else:
info_dict = None

yt_meta_dict = {"info": info_dict, "subtitles": sub_dict}
yt_meta_dict = {"info": info_dict, "subtitles": full_sub_dict}

return yt_meta_dict

Expand Down
3 changes: 2 additions & 1 deletion video2dataset/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -126,7 +126,8 @@ def video2dataset(
assert clip_col is None # no weird double-clipping
if config["reading"]["yt_args"]["yt_metadata_args"] is None:
config["reading"]["yt_args"]["yt_metadata_args"] = {}
config["reading"]["yt_args"]["yt_metadata_args"]["writesubtitles"] = True # type: ignore
if not config["reading"]["yt_args"]["yt_metadata_args"].get("writesubtitles", None): # type: ignore
config["reading"]["yt_args"]["yt_metadata_args"]["writesubtitles"] = "all" # type: ignore

if encode_formats is None:
encode_formats = {"video": "mp4"}
Expand Down
56 changes: 37 additions & 19 deletions video2dataset/subsamplers/clipping_subsampler.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@
from .subsampler import Subsampler


ClipSpans = List[float] # [start, end]
ClipSpan = List[float] # [start, end]


class EncodeFormats(TypedDict):
Expand Down Expand Up @@ -45,7 +45,7 @@ def _get_strtime(t_sec: float) -> str:
return f"{hour:02d}:{minute:02d}:{second:02d}.{microsecond:03d}"


def _split_time_frame(s: float, e: float, min_length: float, max_length: float) -> List[ClipSpans]:
def _split_time_frame(s: float, e: float, min_length: float, max_length: float) -> List[ClipSpan]:
"""Filters out cuts by min and max length"""
time_d = e - s
n_full_clips = int(time_d // max_length)
Expand All @@ -55,7 +55,7 @@ def _split_time_frame(s: float, e: float, min_length: float, max_length: float)
return clip_spans


def _adjust_clip_spans_to_keyframes(clip_spans: List[ClipSpans], keyframes: List[float]) -> List[ClipSpans]:
def _adjust_clip_spans_to_keyframes(clip_spans: List[ClipSpan], keyframes: List[float]) -> List[ClipSpan]:
"""Translates clip_spans into keyframe vocab"""
adjusted_clip_spans = []
for start, end in clip_spans:
Expand All @@ -69,15 +69,15 @@ def _adjust_clip_spans_to_keyframes(clip_spans: List[ClipSpans], keyframes: List


def _adjust_clip_spans(
clip_spans: List[ClipSpans],
clip_spans: List[ClipSpan],
keyframe_timestamps: Union[List[float], None],
min_length: float,
max_length: float,
max_length_strategy: str,
) -> List[ClipSpans]:
) -> List[ClipSpan]:
"""Adjusts cut times around keyframes, filtering by min and max length"""
if not isinstance(clip_spans[0], Iterable): # make sure clip_spans looks like [[start, end]] and not [start, end]
clip_spans = cast(List[ClipSpans], [clip_spans])
clip_spans = cast(List[ClipSpan], [clip_spans])
clip_spans = [[_get_seconds(s), _get_seconds(e)] for [s, e] in clip_spans]

if keyframe_timestamps:
Expand All @@ -92,7 +92,7 @@ def _adjust_clip_spans(
return filtered_clip_spans


def _collate_clip_spans(clip_spans: List[ClipSpans]) -> Tuple[str, List[int]]:
def _collate_clip_spans(clip_spans: List[ClipSpan]) -> Tuple[str, List[int]]:
"""Collates clip spans into a single string for ffmpeg and a list of clip idxs"""
clip_times = []
clip_idxs = []
Expand Down Expand Up @@ -138,8 +138,30 @@ def _process_stream(
return stream_clips


def _extract_subtitles(clip_span: ClipSpan, meta_clip: dict) -> List[dict]:
    """Extracts the subtitle lines overlapping a clip and groups them by language.

    Args:
        clip_span: [start, end] of the clip; both values are converted with _get_seconds.
        meta_clip: clip metadata; meta_clip["yt_meta_dict"]["subtitles"] must map each
            language to a list of dicts with "start", "end" and "lines" keys.

    Returns:
        A list of subtitle entries. Each entry is a copy of a subtitle line whose
        "lines" value is replaced by a {language: lines} dict. The i-th overlapping
        line of each language is merged into the i-th entry; an overlapping line with
        no entry at that position yet is appended as a new entry instead of raising
        IndexError.
    """
    clip_subtitles: List[dict] = []
    s_c, e_c = _get_seconds(clip_span[0]), _get_seconds(clip_span[1])
    for lang, subtitles in meta_clip["yt_meta_dict"]["subtitles"].items():
        idx = 0  # position of the next overlapping line within this language
        for line in subtitles:
            s, e = _get_seconds(line["start"]), _get_seconds(line["end"])
            if max(s_c, s) < min(e_c, e):  # the line and the clip overlap in time
                if idx < len(clip_subtitles):
                    # an entry already exists at this position: add this language to it
                    clip_subtitles[idx]["lines"][lang] = line["lines"]
                else:
                    # no entry at this position yet (always the case for the first
                    # language, and for surplus lines of later ones): create one
                    temp_line = copy.deepcopy(line)
                    temp_line["lines"] = {lang: line["lines"]}
                    clip_subtitles.append(temp_line)
                idx += 1
            elif s > e_c:
                # presumably lines are ordered by start time, so no later line can
                # overlap the clip -- TODO confirm against sub_to_dict
                break
    return clip_subtitles


def _get_clip_metadata(
clip_spans: List[ClipSpans],
clip_spans: List[ClipSpan],
clip_idxs: List[int],
metadata: dict,
oom_clip_count: int,
Expand All @@ -162,25 +184,21 @@ def _get_clip_metadata(

yt_md_dict = meta_clip.get("yt_meta_dict", {})
if (yt_md_dict is not None) and (yt_md_dict.get("subtitles", None) is not None):
clip_subtitles = []
s_c, e_c = _get_seconds(clip_span[0]), _get_seconds(clip_span[1])
for line in meta_clip["yt_meta_dict"]["subtitles"]:
s, e = _get_seconds(line["start"]), _get_seconds(line["end"])
if max(s_c, s) < min(e_c, e):
clip_subtitles.append(line)
elif s > e_c:
break
# full video subtitles might still be useful for context
meta_clip["clip_subtitles"] = clip_subtitles
meta_clip["clip_subtitles"] = _extract_subtitles(clip_span, meta_clip)
metadata_clips.append(meta_clip)

# remove redundant metadata from clips after the first
for m_clips in metadata_clips[1:]:
m_clips["yt_meta_dict"] = {}

return metadata_clips


def _get_clips(
streams: Streams,
encode_formats: EncodeFormats,
precision: str,
clip_spans: List[ClipSpans],
clip_spans: List[ClipSpan],
metadata: dict,
oom_clip_count: int,
strtime_formatting: bool,
Expand Down
Loading

0 comments on commit 5101379

Please sign in to comment.