From 2efa849602b6b6040d7f93690068a1b9842eff43 Mon Sep 17 00:00:00 2001 From: Matt Zhang Date: Thu, 18 Jan 2024 10:03:29 -0500 Subject: [PATCH 01/13] ClippingSubsampler rewrite and bug fixes --- .../subsamplers/clipping_subsampler.py | 201 ++++++++++-------- 1 file changed, 115 insertions(+), 86 deletions(-) diff --git a/video2dataset/subsamplers/clipping_subsampler.py b/video2dataset/subsamplers/clipping_subsampler.py index 466fd8e8..6424eaf9 100644 --- a/video2dataset/subsamplers/clipping_subsampler.py +++ b/video2dataset/subsamplers/clipping_subsampler.py @@ -7,12 +7,26 @@ import ffmpeg import tempfile from collections.abc import Iterable +from typing import Annotated, TypedDict, Literal, cast import datetime from .subsampler import Subsampler -def _get_seconds(t): +ClipTimes = Annotated[list[float], 2] + + +class EncodeFormats(TypedDict): + video: str + audio: str + + +class Streams(TypedDict): + video: bytes + audio: bytes + + +def _get_seconds(t: str | float) -> float: if not isinstance(t, str): return float(t) # already seconds time_format = "%H:%M:%S.%f" # TODO: maybe parameterize this? 
@@ -20,7 +34,7 @@ def _get_seconds(t): return t_obj.second + t_obj.microsecond / 1e6 + t_obj.minute * 60 + t_obj.hour * 3600 -def _get_strtime(t_sec): +def _get_strtime(t_sec: float) -> str: hour = int(t_sec // 3600) minute = int((t_sec // 60) % 60) second = int(t_sec % 60) @@ -29,24 +43,20 @@ def _get_strtime(t_sec): return f"{hour:02d}:{minute:02d}:{second:02d}.{microsecond:03d}" -def _split_time_frame(s, e, min_length, max_length): +def _split_time_frame(s: float, e: float, min_length: float, max_length: float) -> list[ClipTimes]: """Filters out cuts by min and max length""" time_d = e - s - time_frames = [ - (s + i * max_length, min(s + (i + 1) * max_length, e)) - for i in range(int(time_d // max_length) + (1 if time_d % max_length > 0 else 0)) - ] - if len(time_frames) == 0: - return [] - last_time_d = time_frames[-1][1] - time_frames[-1][0] - time_frames = time_frames if last_time_d >= min_length else time_frames[:-1] - return time_frames - - -def _adjust_ranges_to_keyframes(ranges, keyframes): - """Translates ranges into keyframe vocab""" + n_full_clips = int(time_d // max_length) + clip_times = [[s + i * max_length, s + (i + 1) * max_length] for i in range(n_full_clips)] + ( + [[s + (n_full_clips - 1) * max_length, e]] if time_d % max_length > min_length else [] + ) + return clip_times + + +def _adjust_clip_times_to_keyframes(clip_times: list[ClipTimes], keyframes: list[float]) -> list[ClipTimes]: + """Translates clip_times into keyframe vocab""" adjusted_ranges = [] - for start, end in ranges: + for start, end in clip_times: keyframes_in_range = [k for k in keyframes if start <= k <= end] if keyframes_in_range: adjusted_start = min(keyframes_in_range) @@ -56,6 +66,52 @@ def _adjust_ranges_to_keyframes(ranges, keyframes): return adjusted_ranges +def _adjust_clip_times( + clip_times: list[ClipTimes], + keyframe_timestamps: list[float] | None, + min_length: float, + max_length: float, + max_length_strategy: str, +) -> list[ClipTimes]: + if not 
isinstance(clip_times[0], Iterable): # make sure clip_times looks like [[start, end]] and not [start, end] + clip_times = cast(list[ClipTimes], [clip_times]) + clip_times = [[_get_seconds(s), _get_seconds(e)] for [s, e] in clip_times] + + if keyframe_timestamps: + clip_times = _adjust_clip_times_to_keyframes(clip_times, keyframe_timestamps) + + filtered_clip_times = [] + for s, e in clip_times: + max_len_clip_times = _split_time_frame(s, e, min_length, max_length) + if max_length_strategy == "first": + max_len_clip_times = max_len_clip_times[:1] + filtered_clip_times += max_len_clip_times + return filtered_clip_times + + +def _get_clip_intervals(clip_times: list[ClipTimes]) -> tuple[str, list[int]]: + s_clip, e_clip = clip_times[0] + skip_first_interval = int(s_clip > 0.0) + + # which timestamp intervals to take, used to discard non-contiguous sections + intervals = [skip_first_interval] + timestamps = [0.0] + skip_first_interval * [s_clip] + [e_clip] + interval = 1 + skip_first_interval + for s, e in clip_times[1:]: + if s == e_clip: # situations like [0, 1], [1, 2], [2, 3] -> 1, 2 + timestamps += [e] + intervals.append(interval) + interval += 1 + else: + timestamps += [s, e] + intervals.append(interval + 1) + interval += 2 + e_clip = e + + timestamps = ",".join([str(time) for time in timestamps]) + return timestamps, intervals + + class ClippingSubsampler(Subsampler): """ Cuts videos up into segments according to the 'clips' metadata @@ -85,72 +141,59 @@ class ClippingSubsampler(Subsampler): def __init__( self, - oom_clip_count, - encode_formats, - min_length=0.0, - max_length=999999.0, - max_length_strategy="all", - precision="low", + oom_clip_count: int, + encode_formats: EncodeFormats, + min_length: float = 0.0, + max_length: float = 999999.0, + max_length_strategy: Literal["all", "first"] = "all", + precision: Literal["low", "keyframe_adjusted", "exact"] = "low", ): + assert max_length_strategy in ["all", "first"] + assert precision in ["exact", "low", 
"keyframe_adjusted"] self.oom_clip_count = oom_clip_count self.encode_formats = encode_formats self.min_length = min_length - self.max_length, self.max_length_strategy = max_length, max_length_strategy - assert precision in ["exact", "low", "keyframe_adjusted"] + self.max_length = max_length + self.max_length_strategy = max_length_strategy self.precision = precision def __call__(self, streams, metadata): - clips = metadata.pop("clips") - - if not isinstance(clips[0], Iterable): # make sure clips looks like [[start, end]] and not [start, end] - clips = [clips] + strtime_formatting = isinstance(metadata["clips"][0][0], str) - is_strtime = isinstance(clips[0][0], str) + clip_times = _adjust_clip_times( + clip_times=metadata.pop("clips"), + keyframe_timestamps=( + # TODO: make it so if keyframe timestamps not present, get it yourself + metadata["video_metadata"].pop("keyframe_timestamps") + if self.precision == "keyframe_adjusted" + else None + ), + min_length=self.min_length, + max_length=self.max_length, + max_length_strategy=self.max_length_strategy, + ) + if len(clip_times) == 0: + return {}, [], f"Video had no clip_times longer than {self.min_length}" - if self.precision == "keyframe_adjusted": - # TODO: make it so if not present, get it yourself - keyframe_timestamps = metadata["video_metadata"].pop("keyframe_timestamps") - s_clips = [[_get_seconds(s), _get_seconds(e)] for (s, e) in clips] - clips = _adjust_ranges_to_keyframes(s_clips, keyframe_timestamps) + timestamps, intervals = _get_clip_intervals(clip_times) - filtered_clips = [] - for s, e in clips: - max_len_clips = _split_time_frame(_get_seconds(s), _get_seconds(e), self.min_length, self.max_length) + ffmpeg_kwargs = { + "map": 0, + "f": "segment", + "segment_times": timestamps, + "reset_timestamps": 1, + } + if self.precision == "exact": + ffmpeg_kwargs["force_key_frames"] = timestamps + else: + ffmpeg_kwargs["c"] = "copy" - if self.max_length_strategy == "first": - max_len_clips = max_len_clips[:1] - 
filtered_clips += max_len_clips - clips = filtered_clips - if len(clips) == 0: - # return an error - return {}, [], f"Video had no clips longer than {self.min_length}" - start_0 = _get_seconds(clips[0][0]) == 0.0 - ind = 1 + int(not start_0) - s_p, e_p = clips[0] - s_p, e_p = _get_seconds(s_p), _get_seconds(e_p) - splits = (not start_0) * [s_p] + [e_p] - # list of indicies of clips to take, used to discard non-contiguous sections - take_inds = [int(not start_0)] - # TODO: make nicer - for s, e in clips[1:]: - s, e = _get_seconds(s), _get_seconds(e) - if s == e_p: # situations like [0, 1], [1, 2], [2, 3] -> 1, 2 - splits += [e] - take_inds.append(ind) - ind += 1 - else: - splits += [s, e] - take_inds.append(ind + 1) - ind += 2 - e_p = e - - segment_times = ",".join([str(spl) for spl in splits]) streams_clips = {} for k in streams.keys(): @@ -165,25 +208,11 @@ def __call__(self, streams, metadata): with open(os.path.join(tmpdir, f"input.{encode_format}"), "wb") as f: f.write(stream_bytes) try: - kwargs = { - "map": 0, - "f": "segment", - "segment_times": segment_times, - "reset_timestamps": 1, - } - - # Precision things, tradeoff for speed - if self.precision != "exact": - kwargs["c"] = "copy" - else: - kwargs["force_key_frames"] = segment_times - - _ = ( + ( ffmpeg.input(f"{tmpdir}/input.{encode_format}") - .output(f"{tmpdir}/clip_%d.{encode_format}", **kwargs) + .output(f"{tmpdir}/clip_%d.{encode_format}", **ffmpeg_kwargs) .run(capture_stdout=True, quiet=True) ) - except Exception as err: # pylint: disable=broad-except return {}, [], str(err) @@ -191,10 +220,10 @@ def __call__(self, streams, metadata): stream_clips.sort(key=lambda x: int(x.split("_")[-1].split(".")[0])) correct_clips = [] - for clip_id, (clip, ind) in enumerate(zip(clips, take_inds)): + for clip_id, (clip, ind) in enumerate(zip(clip_times, intervals)): if ind < len(stream_clips): correct_clips.append((clip_id, clip, stream_clips[ind])) - # clips_lost = len(take_inds) - len(correct_clips) # TODO 
report this somehow + # clips_lost = len(intervals) - len(correct_clips) # TODO report this somehow stream_clips, metadata_clips = [], [] for clip_id, clip_span, clip_pth in correct_clips: @@ -207,8 +236,8 @@ def __call__(self, streams, metadata): ) meta_clip = copy.deepcopy(metadata) # set the timeframe of this clip - if is_strtime: - # Keep clips in the original format to be compatible with the data schema. + if strtime_formatting: + # Keep clip_times in the original format to be compatible with the data schema. meta_clip["clips"] = [(_get_strtime(clip_span[0]), _get_strtime(clip_span[1]))] else: meta_clip["clips"] = [clip_span] From a5c9649b32af7541e7887a0460e4ecf46e855f4f Mon Sep 17 00:00:00 2001 From: Matt Zhang Date: Thu, 18 Jan 2024 11:59:45 -0500 Subject: [PATCH 02/13] More refactoring of ClippingSubsampler, plus a fix to _get_clip_intervals --- .../subsamplers/clipping_subsampler.py | 52 +++++++++---------- 1 file changed, 25 insertions(+), 27 deletions(-) diff --git a/video2dataset/subsamplers/clipping_subsampler.py b/video2dataset/subsamplers/clipping_subsampler.py index 6424eaf9..4d2e578e 100644 --- a/video2dataset/subsamplers/clipping_subsampler.py +++ b/video2dataset/subsamplers/clipping_subsampler.py @@ -55,15 +55,15 @@ def _split_time_frame(s: float, e: float, min_length: float, max_length: float) def _adjust_clip_times_to_keyframes(clip_times: list[ClipTimes], keyframes: list[float]) -> list[ClipTimes]: """Translates clip_times into keyframe vocab""" - adjusted_ranges = [] + adjusted_clip_times = [] for start, end in clip_times: keyframes_in_range = [k for k in keyframes if start <= k <= end] if keyframes_in_range: adjusted_start = min(keyframes_in_range) adjusted_end = max(keyframes_in_range) if adjusted_start != adjusted_end: - adjusted_ranges.append((adjusted_start, adjusted_end)) - return adjusted_ranges + adjusted_clip_times.append((adjusted_start, adjusted_end)) + return adjusted_clip_times def _adjust_clip_times( @@ -89,27 +89,25 @@ def 
_adjust_clip_times( return filtered_clip_times -def _get_clip_intervals(clip_times: list[ClipTimes]) -> tuple[str, list[int]]: - s_clip, e_clip = clip_times[0] - skip_first_interval = int(s_clip > 0.0) +def _get_clip_times(clip_times: list[ClipTimes]) -> tuple[str, list[int]]: + all_clip_times = [0.0] + clip_idxs = [] + e_prev = 0.0 + clip_idx = 0 - # which timestamp intervals to take, used to discard non-contiguous sections - intervals = [skip_first_interval] - timestamps = [0.0] + skip_first_interval * [s_clip] + [e_clip] - interval = 1 + skip_first_interval - for s, e in clip_times[1:]: - if s == e_clip: # situations like [0, 1], [1, 2], [2, 3] -> 1, 2 - timestamps += [e] - intervals.append(interval) - interval += 1 - else: - timestamps += [s, e] - intervals.append(interval + 1) - interval += 2 - e_clip = e + for s, e in clip_times: + if s == e_prev: # clip starts where last one left off + all_clip_times += [e] + clip_idxs.append(clip_idx) + clip_idx += 1 + else: # next clip skips over some time + all_clip_times += [s, e] + clip_idxs.append(clip_idx + 1) + clip_idx += 2 + e_prev = e - timestamps = ",".join([str(time) for time in timestamps]) - return timestamps, intervals + all_clip_times = ",".join([str(time) for time in all_clip_times]) + return all_clip_times, clip_idxs class ClippingSubsampler(Subsampler): @@ -175,16 +173,16 @@ def __call__(self, streams, metadata): if len(clip_times) == 0: return {}, [], f"Video had no clip_times longer than {self.min_length}" - timestamps, intervals = _get_clip_intervals(clip_times) + all_clip_times, clip_idxs = _get_clip_times(clip_times) ffmpeg_kwargs = { "map": 0, "f": "segment", - "segment_times": timestamps, + "segment_times": all_clip_times, "reset_timestamps": 1, } if self.precision == "exact": - ffmpeg_kwargs["force_key_frames"] = timestamps + ffmpeg_kwargs["force_key_frames"] = all_clip_times else: ffmpeg_kwargs["c"] = "copy" @@ -220,10 +218,10 @@ def __call__(self, streams, metadata): stream_clips.sort(key=lambda 
x: int(x.split("_")[-1].split(".")[0])) correct_clips = [] - for clip_id, (clip, ind) in enumerate(zip(clip_times, intervals)): + for clip_id, (clip, ind) in enumerate(zip(clip_times, clip_idxs)): if ind < len(stream_clips): correct_clips.append((clip_id, clip, stream_clips[ind])) - # clips_lost = len(intervals) - len(correct_clips) # TODO report this somehow + # clips_lost = len(clip_idxs) - len(correct_clips) # TODO report this somehow stream_clips, metadata_clips = [], [] for clip_id, clip_span, clip_pth in correct_clips: From 2cb5854b03760d4d75404c25ac3f553586c43876 Mon Sep 17 00:00:00 2001 From: Matt Zhang Date: Thu, 18 Jan 2024 12:48:17 -0500 Subject: [PATCH 03/13] Finished refactoring ClippingSubsampler --- .../subsamplers/clipping_subsampler.py | 281 ++++++++++-------- 1 file changed, 159 insertions(+), 122 deletions(-) diff --git a/video2dataset/subsamplers/clipping_subsampler.py b/video2dataset/subsamplers/clipping_subsampler.py index 4d2e578e..9ae4ee60 100644 --- a/video2dataset/subsamplers/clipping_subsampler.py +++ b/video2dataset/subsamplers/clipping_subsampler.py @@ -13,7 +13,7 @@ from .subsampler import Subsampler -ClipTimes = Annotated[list[float], 2] +ClipSpans = Annotated[list[float], 2] class EncodeFormats(TypedDict): @@ -43,71 +43,180 @@ def _get_strtime(t_sec: float) -> str: return f"{hour:02d}:{minute:02d}:{second:02d}.{microsecond:03d}" -def _split_time_frame(s: float, e: float, min_length: float, max_length: float) -> list[ClipTimes]: +def _split_time_frame(s: float, e: float, min_length: float, max_length: float) -> list[ClipSpans]: """Filters out cuts by min and max length""" time_d = e - s n_full_clips = int(time_d // max_length) - clip_times = [[s + i * max_length, s + (i + 1) * max_length] for i in range(n_full_clips)] + ( + clip_spans = [[s + i * max_length, s + (i + 1) * max_length] for i in range(n_full_clips)] + ( [[s + (n_full_clips - 1) * max_length, e]] if time_d % max_length > min_length else [] ) - return clip_times + return 
clip_spans -def _adjust_clip_times_to_keyframes(clip_times: list[ClipTimes], keyframes: list[float]) -> list[ClipTimes]: - """Translates clip_times into keyframe vocab""" - adjusted_clip_times = [] - for start, end in clip_times: +def _adjust_clip_spans_to_keyframes(clip_spans: list[ClipSpans], keyframes: list[float]) -> list[ClipSpans]: + """Translates clip_spans into keyframe vocab""" + adjusted_clip_spans = [] + for start, end in clip_spans: keyframes_in_range = [k for k in keyframes if start <= k <= end] if keyframes_in_range: adjusted_start = min(keyframes_in_range) adjusted_end = max(keyframes_in_range) if adjusted_start != adjusted_end: - adjusted_clip_times.append((adjusted_start, adjusted_end)) - return adjusted_clip_times + adjusted_clip_spans.append((adjusted_start, adjusted_end)) + return adjusted_clip_spans -def _adjust_clip_times( - clip_times: list[ClipTimes], +def _adjust_clip_spans( + clip_spans: list[ClipSpans], keyframe_timestamps: list[float] | None, min_length: float, max_length: float, max_length_strategy: str, -) -> list[ClipTimes]: - if not isinstance(clip_times[0], Iterable): # make sure clip_times looks like [[start, end]] and not [start, end] - clip_times = cast(list[ClipTimes], [clip_times]) - clip_times = [[_get_seconds(s), _get_seconds(e)] for [s, e] in clip_times] +) -> list[ClipSpans]: + if not isinstance(clip_spans[0], Iterable): # make sure clip_spans looks like [[start, end]] and not [start, end] + clip_spans = cast(list[ClipSpans], [clip_spans]) + clip_spans = [[_get_seconds(s), _get_seconds(e)] for [s, e] in clip_spans] if keyframe_timestamps: - clip_times = _adjust_clip_times_to_keyframes(clip_times, keyframe_timestamps) + clip_spans = _adjust_clip_spans_to_keyframes(clip_spans, keyframe_timestamps) - filtered_clip_times = [] - for s, e in clip_times: - max_len_clip_times = _split_time_frame(s, e, min_length, max_length) + filtered_clip_spans = [] + for s, e in clip_spans: + max_len_clip_spans = _split_time_frame(s, e, 
min_length, max_length) if max_length_strategy == "first": - max_len_clip_times = max_len_clip_times[:1] - filtered_clip_times += max_len_clip_times - return filtered_clip_times + max_len_clip_spans = max_len_clip_spans[:1] + filtered_clip_spans += max_len_clip_spans + return filtered_clip_spans -def _get_clip_times(clip_times: list[ClipTimes]) -> tuple[str, list[int]]: - all_clip_times = [0.0] +def _get_clip_spans(clip_spans: list[ClipSpans]) -> tuple[str, list[int]]: + segment_times = [0.0] clip_idxs = [] e_prev = 0.0 clip_idx = 0 - for s, e in clip_times: + for s, e in clip_spans: if s == e_prev: # clip starts where last one left off - all_clip_times += [e] + segment_times += [e] clip_idxs.append(clip_idx) clip_idx += 1 else: # next clip skips over some time - all_clip_times += [s, e] + segment_times += [s, e] clip_idxs.append(clip_idx + 1) clip_idx += 2 e_prev = e - all_clip_times = ",".join([str(time) for time in all_clip_times]) - return all_clip_times, clip_idxs + segment_times = ",".join([str(time) for time in segment_times]) + return segment_times, clip_idxs + + +def _process_stream(stream_bytes: bytes, encode_format: str, ffmpeg_kwargs: dict) -> list[str]: + with tempfile.TemporaryDirectory() as tmpdir: + # TODO: we need to put the extension into the metadata + # TODO: This can be done better using pipes I just don't feel like sinking too much time into this rn + with open(os.path.join(tmpdir, f"input.{encode_format}"), "wb") as f: + f.write(stream_bytes) + try: + ( + ffmpeg.input(f"{tmpdir}/input.{encode_format}") + .output(f"{tmpdir}/clip_%d.{encode_format}", **ffmpeg_kwargs) + .run(capture_stdout=True, quiet=True) + ) + except Exception as err: # pylint: disable=broad-except + raise err + stream_clips = glob.glob(f"{tmpdir}/clip*.{encode_format}") + stream_clips.sort(key=lambda x: int(x.split("_")[-1].split(".")[0])) + return stream_clips + + +def _get_clip_metadata( + clip_spans: list[ClipSpans], + clip_idxs: list[int], + metadata: dict, + 
oom_clip_count: int, + strtime_formatting: bool, +) -> list[dict]: + metadata_clips = [] + for clip_id, (clip_span, _) in enumerate(zip(clip_spans, clip_idxs)): + clip_key = "{clip_id:0{oom_clip_count}d}".format( # pylint: disable=consider-using-f-string + clip_id=clip_id, oom_clip_count=oom_clip_count + ) + meta_clip = copy.deepcopy(metadata) + # set the timeframe of this clip + if strtime_formatting: + # Keep clip_spans in the original format to be compatible with the data schema. + meta_clip["clips"] = [(_get_strtime(clip_span[0]), _get_strtime(clip_span[1]))] + else: + meta_clip["clips"] = [clip_span] + meta_clip["key"] = f"{meta_clip['key']}_{clip_key}" + + yt_md_dict = meta_clip.get("yt_meta_dict", {}) + if (yt_md_dict is not None) and (yt_md_dict.get("subtitles", None) is not None): + clip_subtitles = [] + s_c, e_c = _get_seconds(clip_span[0]), _get_seconds(clip_span[1]) + for line in meta_clip["yt_meta_dict"]["subtitles"]: + s, e = _get_seconds(line["start"]), _get_seconds(line["end"]) + if max(s_c, s) < min(e_c, e): + clip_subtitles.append(line) + elif s > e_c: + break + # full video subtitles might still be useful for context + meta_clip["clip_subtitles"] = clip_subtitles + metadata_clips.append(meta_clip) + return metadata_clips + + +def _get_clips( + streams: Streams, + encode_formats: EncodeFormats, + precision: str, + clip_spans: list[ClipSpans], + metadata: dict, + oom_clip_count: int, + strtime_formatting: bool, +) -> tuple[dict[str, list[str]], list[dict]]: + segment_times, clip_idxs = _get_clip_spans(clip_spans) + + ffmpeg_kwargs = { + "map": 0, + "f": "segment", + "segment_times": segment_times, + "reset_timestamps": 1, + } + if precision == "exact": + ffmpeg_kwargs["force_key_frames"] = segment_times + else: + ffmpeg_kwargs["c"] = "copy" + + clips = {} + for k in streams.keys(): + stream_bytes = streams[k][0] # pre-broadcast so only one + if stream_bytes is None: + continue + try: + stream_clips = _process_stream( + stream_bytes=stream_bytes, + 
encode_format=encode_formats[k], + ffmpeg_kwargs=ffmpeg_kwargs, + ) + except Exception as err: + raise err + + clips[k] = [] + for _, (_, clip_idx) in enumerate(zip(clip_spans, clip_idxs)): + with open(stream_clips[clip_idx], "rb") as vid_f: + clip_bytes = vid_f.read() + clips[k].append(clip_bytes) + + clip_metadata = _get_clip_metadata( + clip_spans=clip_spans, + clip_idxs=clip_idxs, + metadata=metadata, + oom_clip_count=oom_clip_count, + strtime_formatting=strtime_formatting, + ) + + return clips, clip_metadata class ClippingSubsampler(Subsampler): @@ -155,11 +264,11 @@ def __init__( self.max_length_strategy = max_length_strategy self.precision = precision - def __call__(self, streams, metadata): + def __call__(self, streams: Streams, metadata: dict): strtime_formatting = isinstance(metadata["clips"][0][0], str) - clip_times = _adjust_clip_times( - clip_times=metadata.pop("clips"), + clip_spans = _adjust_clip_spans( + clip_spans=metadata.pop("clips"), keyframe_timestamps=( # TODO: make it so if keyframe timestamps not present, get it yourself metadata["video_metadata"].pop("keyframe_timestamps") @@ -170,92 +279,20 @@ def __call__(self, streams, metadata): max_length=self.max_length, max_length_strategy=self.max_length_strategy, ) - if len(clip_times) == 0: - return {}, [], f"Video had no clip_times longer than {self.min_length}" - - all_clip_times, clip_idxs = _get_clip_times(clip_times) - - ffmpeg_kwargs = { - "map": 0, - "f": "segment", - "segment_times": all_clip_times, - "reset_timestamps": 1, - } - if self.precision == "exact": - ffmpeg_kwargs["force_key_frames"] = all_clip_times - else: - ffmpeg_kwargs["c"] = "copy" - - - - - - - - streams_clips = {} - - for k in streams.keys(): - stream_bytes = streams[k][0] # pre-broadcast so only one - if stream_bytes is None: - continue - encode_format = self.encode_formats[k] - - with tempfile.TemporaryDirectory() as tmpdir: - # TODO: we need to put the extension into the metadata - # TODO: This can be done better 
using pipes I just don't feel like sinking too much time into this rn - with open(os.path.join(tmpdir, f"input.{encode_format}"), "wb") as f: - f.write(stream_bytes) - try: - ( - ffmpeg.input(f"{tmpdir}/input.{encode_format}") - .output(f"{tmpdir}/clip_%d.{encode_format}", **ffmpeg_kwargs) - .run(capture_stdout=True, quiet=True) - ) - except Exception as err: # pylint: disable=broad-except - return {}, [], str(err) - - stream_clips = glob.glob(f"{tmpdir}/clip*.{encode_format}") - stream_clips.sort(key=lambda x: int(x.split("_")[-1].split(".")[0])) - - correct_clips = [] - for clip_id, (clip, ind) in enumerate(zip(clip_times, clip_idxs)): - if ind < len(stream_clips): - correct_clips.append((clip_id, clip, stream_clips[ind])) - # clips_lost = len(clip_idxs) - len(correct_clips) # TODO report this somehow - - stream_clips, metadata_clips = [], [] - for clip_id, clip_span, clip_pth in correct_clips: - with open(clip_pth, "rb") as vid_f: - clip_bytes = vid_f.read() - stream_clips.append(clip_bytes) - - clip_key = "{clip_id:0{oom_clip_count}d}".format( # pylint: disable=consider-using-f-string - clip_id=clip_id, oom_clip_count=self.oom_clip_count - ) - meta_clip = copy.deepcopy(metadata) - # set the timeframe of this clip - if strtime_formatting: - # Keep clip_times in the original format to be compatible with the data schema. 
- meta_clip["clips"] = [(_get_strtime(clip_span[0]), _get_strtime(clip_span[1]))] - else: - meta_clip["clips"] = [clip_span] - meta_clip["key"] = f"{meta_clip['key']}_{clip_key}" - - yt_md_dict = meta_clip.get("yt_meta_dict", {}) - if (yt_md_dict is not None) and (yt_md_dict.get("subtitles", None) is not None): - clip_subtitles = [] - s_c, e_c = _get_seconds(clip_span[0]), _get_seconds(clip_span[1]) - for line in meta_clip["yt_meta_dict"]["subtitles"]: - s, e = _get_seconds(line["start"]), _get_seconds(line["end"]) - if max(s_c, s) < min(e_c, e): - clip_subtitles.append(line) - elif s > e_c: - break - # full video subtitles might still be useful for context - meta_clip["clip_subtitles"] = clip_subtitles - - metadata_clips.append(meta_clip) - - streams_clips[k] = stream_clips - - return streams_clips, metadata_clips, None + if len(clip_spans) == 0: + return {}, [], f"Video had no clip_spans longer than {self.min_length}" + + try: + clips, clip_metadata = _get_clips( + streams=streams, + encode_formats=self.encode_formats, + precision=self.precision, + clip_spans=clip_spans, + metadata=metadata, + oom_clip_count=self.oom_clip_count, + strtime_formatting=strtime_formatting, + ) + except Exception as err: + return {}, [], str(err) + + return clips, clip_metadata, None From 5d03b720e10345a5492de4d0c8529a354c3a48fc Mon Sep 17 00:00:00 2001 From: Matt Zhang Date: Thu, 18 Jan 2024 20:48:19 -0500 Subject: [PATCH 04/13] Final code changes --- .../subsamplers/clipping_subsampler.py | 94 ++++++++++--------- 1 file changed, 50 insertions(+), 44 deletions(-) diff --git a/video2dataset/subsamplers/clipping_subsampler.py b/video2dataset/subsamplers/clipping_subsampler.py index 9ae4ee60..a0960ff0 100644 --- a/video2dataset/subsamplers/clipping_subsampler.py +++ b/video2dataset/subsamplers/clipping_subsampler.py @@ -7,7 +7,7 @@ import ffmpeg import tempfile from collections.abc import Iterable -from typing import Annotated, TypedDict, Literal, cast +from typing import Any, 
Annotated, TypedDict, Literal, cast import datetime from .subsampler import Subsampler @@ -89,42 +89,46 @@ def _adjust_clip_spans( return filtered_clip_spans -def _get_clip_spans(clip_spans: list[ClipSpans]) -> tuple[str, list[int]]: - segment_times = [0.0] +def _collate_clip_spans(clip_spans: list[ClipSpans]) -> tuple[str, list[int]]: + clip_times = [0.0] clip_idxs = [] e_prev = 0.0 clip_idx = 0 for s, e in clip_spans: if s == e_prev: # clip starts where last one left off - segment_times += [e] + clip_times += [e] clip_idxs.append(clip_idx) clip_idx += 1 else: # next clip skips over some time - segment_times += [s, e] + clip_times += [s, e] clip_idxs.append(clip_idx + 1) clip_idx += 2 e_prev = e - segment_times = ",".join([str(time) for time in segment_times]) - return segment_times, clip_idxs - - -def _process_stream(stream_bytes: bytes, encode_format: str, ffmpeg_kwargs: dict) -> list[str]: - with tempfile.TemporaryDirectory() as tmpdir: - # TODO: we need to put the extension into the metadata - # TODO: This can be done better using pipes I just don't feel like sinking too much time into this rn - with open(os.path.join(tmpdir, f"input.{encode_format}"), "wb") as f: - f.write(stream_bytes) - try: - ( - ffmpeg.input(f"{tmpdir}/input.{encode_format}") - .output(f"{tmpdir}/clip_%d.{encode_format}", **ffmpeg_kwargs) - .run(capture_stdout=True, quiet=True) - ) - except Exception as err: # pylint: disable=broad-except - raise err - stream_clips = glob.glob(f"{tmpdir}/clip*.{encode_format}") + clip_times = ",".join([str(time) for time in clip_times]) + return clip_times, clip_idxs + + +def _process_stream( + tmpdir: Any, # BytesPath + stream_bytes: bytes, + encode_format: str, + ffmpeg_kwargs: dict, +) -> list[str]: + # TODO: we need to put the extension into the metadata + # TODO: This can be done better using pipes I just don't feel like sinking too much time into this rn + with open(os.path.join(tmpdir, f"input.{encode_format}"), "wb") as f: + f.write(stream_bytes) 
+ try: + ( + ffmpeg.input(f"{tmpdir}/input.{encode_format}") + .output(f"{tmpdir}/clip_%d.{encode_format}", **ffmpeg_kwargs) + .run(capture_stdout=True, quiet=True) + ) + except Exception as err: # pylint: disable=broad-except + raise err + stream_clips = glob.glob(f"{tmpdir}/clip*.{encode_format}") stream_clips.sort(key=lambda x: int(x.split("_")[-1].split(".")[0])) return stream_clips @@ -175,38 +179,40 @@ def _get_clips( oom_clip_count: int, strtime_formatting: bool, ) -> tuple[dict[str, list[str]], list[dict]]: - segment_times, clip_idxs = _get_clip_spans(clip_spans) + clip_times, clip_idxs = _collate_clip_spans(clip_spans) ffmpeg_kwargs = { "map": 0, "f": "segment", - "segment_times": segment_times, + "segment_times": clip_times, "reset_timestamps": 1, } if precision == "exact": - ffmpeg_kwargs["force_key_frames"] = segment_times + ffmpeg_kwargs["force_key_frames"] = clip_times else: ffmpeg_kwargs["c"] = "copy" clips = {} for k in streams.keys(): - stream_bytes = streams[k][0] # pre-broadcast so only one - if stream_bytes is None: - continue - try: - stream_clips = _process_stream( - stream_bytes=stream_bytes, - encode_format=encode_formats[k], - ffmpeg_kwargs=ffmpeg_kwargs, - ) - except Exception as err: - raise err - - clips[k] = [] - for _, (_, clip_idx) in enumerate(zip(clip_spans, clip_idxs)): - with open(stream_clips[clip_idx], "rb") as vid_f: - clip_bytes = vid_f.read() - clips[k].append(clip_bytes) + with tempfile.TemporaryDirectory() as tmpdir: + stream_bytes = streams[k][0] # pre-broadcast so only one + if stream_bytes is None: + continue + try: + stream_clips = _process_stream( + tmpdir=tmpdir, + stream_bytes=stream_bytes, + encode_format=encode_formats[k], + ffmpeg_kwargs=ffmpeg_kwargs, + ) + except Exception as err: + raise err + + clips[k] = [] + for _, (_, clip_idx) in enumerate(zip(clip_spans, clip_idxs)): + with open(stream_clips[clip_idx], "rb") as vid_f: + clip_bytes = vid_f.read() + clips[k].append(clip_bytes) clip_metadata = 
_get_clip_metadata( clip_spans=clip_spans, From 47c7d647a5fc36cd11953aaa5baaf000f200458e Mon Sep 17 00:00:00 2001 From: Matt Zhang Date: Thu, 18 Jan 2024 20:53:16 -0500 Subject: [PATCH 05/13] Added docstrings --- video2dataset/subsamplers/clipping_subsampler.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/video2dataset/subsamplers/clipping_subsampler.py b/video2dataset/subsamplers/clipping_subsampler.py index a0960ff0..3f18d703 100644 --- a/video2dataset/subsamplers/clipping_subsampler.py +++ b/video2dataset/subsamplers/clipping_subsampler.py @@ -27,6 +27,7 @@ class Streams(TypedDict): def _get_seconds(t: str | float) -> float: + """Converts time to seconds""" if not isinstance(t, str): return float(t) # already seconds time_format = "%H:%M:%S.%f" # TODO: maybe parameterize this? @@ -35,6 +36,7 @@ def _get_seconds(t: str | float) -> float: def _get_strtime(t_sec: float) -> str: + """Converts time to string""" hour = int(t_sec // 3600) minute = int((t_sec // 60) % 60) second = int(t_sec % 60) @@ -73,6 +75,7 @@ def _adjust_clip_spans( max_length: float, max_length_strategy: str, ) -> list[ClipSpans]: + """Adjusts cut times around keyframes, filtering by min and max length""" if not isinstance(clip_spans[0], Iterable): # make sure clip_spans looks like [[start, end]] and not [start, end] clip_spans = cast(list[ClipSpans], [clip_spans]) clip_spans = [[_get_seconds(s), _get_seconds(e)] for [s, e] in clip_spans] @@ -90,6 +93,7 @@ def _adjust_clip_spans( def _collate_clip_spans(clip_spans: list[ClipSpans]) -> tuple[str, list[int]]: + """Collates clip spans into a single string for ffmpeg and a list of clip idxs""" clip_times = [0.0] clip_idxs = [] e_prev = 0.0 @@ -116,6 +120,7 @@ def _process_stream( encode_format: str, ffmpeg_kwargs: dict, ) -> list[str]: + """Processes a stream into clips using ffmpeg""" # TODO: we need to put the extension into the metadata # TODO: This can be done better using pipes I just don't feel like sinking too much time into this rn 
with open(os.path.join(tmpdir, f"input.{encode_format}"), "wb") as f: @@ -140,6 +145,7 @@ def _get_clip_metadata( oom_clip_count: int, strtime_formatting: bool, ) -> list[dict]: + """Gets metadata for each clip""" metadata_clips = [] for clip_id, (clip_span, _) in enumerate(zip(clip_spans, clip_idxs)): clip_key = "{clip_id:0{oom_clip_count}d}".format( # pylint: disable=consider-using-f-string @@ -179,6 +185,7 @@ def _get_clips( oom_clip_count: int, strtime_formatting: bool, ) -> tuple[dict[str, list[str]], list[dict]]: + """Gets clips from streams""" clip_times, clip_idxs = _collate_clip_spans(clip_spans) ffmpeg_kwargs = { From 5aa84d49d95535fd3db4f109ef94ba8238973e87 Mon Sep 17 00:00:00 2001 From: Matt Zhang Date: Thu, 18 Jan 2024 21:35:15 -0500 Subject: [PATCH 06/13] Passed tests and linting --- tests/test_subsamplers.py | 10 +++++----- video2dataset/subsamplers/__init__.py | 2 +- video2dataset/subsamplers/clipping_subsampler.py | 10 +++++----- 3 files changed, 11 insertions(+), 11 deletions(-) diff --git a/tests/test_subsamplers.py b/tests/test_subsamplers.py index e6a5b5f0..28ace480 100644 --- a/tests/test_subsamplers.py +++ b/tests/test_subsamplers.py @@ -11,6 +11,7 @@ ClippingSubsampler, _get_seconds, _split_time_frame, + Streams, FFProbeSubsampler, ResolutionSubsampler, FrameSubsampler, @@ -45,8 +46,8 @@ def test_clipping_subsampler(clips): min_length = 5.0 if clips == MULTI else 2.0 max_length = 999999.0 if clips == MULTI else 3.0 subsampler = ClippingSubsampler( - 3, - {"video": "mp4", "audio": "mp3"}, + oom_clip_count=3, + encode_formats={"video": "mp4", "audio": "mp3"}, min_length=min_length, max_length=max_length, max_length_strategy="all", @@ -58,7 +59,7 @@ def test_clipping_subsampler(clips): "clips": clips, } - streams = {"video": [video_bytes], "audio": [audio_bytes]} + streams: Streams = {"video": [video_bytes], "audio": [audio_bytes]} stream_fragments, meta_fragments, error_message = subsampler(streams, metadata) video_fragments = 
stream_fragments["video"] audio_fragments = stream_fragments["audio"] @@ -84,7 +85,7 @@ def test_clipping_subsampler(clips): s_target, e_target = clips[key_ind] s_target, e_target = _get_seconds(s_target), _get_seconds(e_target) expected_clips = _split_time_frame(s_target, e_target, min_length, max_length) - assert (_get_seconds(s), _get_seconds(e)) in expected_clips + assert [_get_seconds(s), _get_seconds(e)] in expected_clips assert _get_seconds(e) - _get_seconds(s) >= min_length s_s, e_s = _get_seconds(s), _get_seconds(e) @@ -92,7 +93,6 @@ def test_clipping_subsampler(clips): video_stream = [stream for stream in probe["streams"] if stream["codec_type"] == "video"][0] frag_len = float(video_stream["duration"]) - # currently some segments can be pretty innacurate assert abs(frag_len - (e_s - s_s)) < 5.0 diff --git a/video2dataset/subsamplers/__init__.py b/video2dataset/subsamplers/__init__.py index 5d4741f8..90e4cd58 100644 --- a/video2dataset/subsamplers/__init__.py +++ b/video2dataset/subsamplers/__init__.py @@ -3,7 +3,7 @@ """ from .audio_rate_subsampler import AudioRateSubsampler -from .clipping_subsampler import ClippingSubsampler, _get_seconds, _split_time_frame +from .clipping_subsampler import ClippingSubsampler, _get_seconds, _split_time_frame, Streams from .frame_subsampler import FrameSubsampler from .ffprobe_subsampler import FFProbeSubsampler from .noop_subsampler import NoOpSubsampler diff --git a/video2dataset/subsamplers/clipping_subsampler.py b/video2dataset/subsamplers/clipping_subsampler.py index 3f18d703..b3ae717a 100644 --- a/video2dataset/subsamplers/clipping_subsampler.py +++ b/video2dataset/subsamplers/clipping_subsampler.py @@ -22,8 +22,8 @@ class EncodeFormats(TypedDict): class Streams(TypedDict): - video: bytes - audio: bytes + video: list[bytes] + audio: list[bytes] def _get_seconds(t: str | float) -> float: @@ -50,7 +50,7 @@ def _split_time_frame(s: float, e: float, min_length: float, max_length: float) time_d = e - s n_full_clips = 
int(time_d // max_length) clip_spans = [[s + i * max_length, s + (i + 1) * max_length] for i in range(n_full_clips)] + ( - [[s + (n_full_clips - 1) * max_length, e]] if time_d % max_length > min_length else [] + [[s + (n_full_clips) * max_length, e]] if time_d % max_length > min_length else [] ) return clip_spans @@ -94,7 +94,7 @@ def _adjust_clip_spans( def _collate_clip_spans(clip_spans: list[ClipSpans]) -> tuple[str, list[int]]: """Collates clip spans into a single string for ffmpeg and a list of clip idxs""" - clip_times = [0.0] + clip_times = [] clip_idxs = [] e_prev = 0.0 clip_idx = 0 @@ -216,7 +216,7 @@ def _get_clips( raise err clips[k] = [] - for _, (_, clip_idx) in enumerate(zip(clip_spans, clip_idxs)): + for clip_idx in clip_idxs: with open(stream_clips[clip_idx], "rb") as vid_f: clip_bytes = vid_f.read() clips[k].append(clip_bytes) From 140e1abbe4445916b5f81347673adba1f7e9ebbe Mon Sep 17 00:00:00 2001 From: Matt Zhang Date: Thu, 18 Jan 2024 21:48:17 -0500 Subject: [PATCH 07/13] Made type annotations consistent with Python 3.8 --- .../subsamplers/clipping_subsampler.py | 36 +++++++++---------- 1 file changed, 18 insertions(+), 18 deletions(-) diff --git a/video2dataset/subsamplers/clipping_subsampler.py b/video2dataset/subsamplers/clipping_subsampler.py index b3ae717a..25c7f665 100644 --- a/video2dataset/subsamplers/clipping_subsampler.py +++ b/video2dataset/subsamplers/clipping_subsampler.py @@ -7,13 +7,13 @@ import ffmpeg import tempfile from collections.abc import Iterable -from typing import Any, Annotated, TypedDict, Literal, cast +from typing import Any, Union, List, TypedDict, Literal, cast import datetime from .subsampler import Subsampler -ClipSpans = Annotated[list[float], 2] +ClipSpans = List[float] # [start, end] class EncodeFormats(TypedDict): @@ -22,11 +22,11 @@ class EncodeFormats(TypedDict): class Streams(TypedDict): - video: list[bytes] - audio: list[bytes] + video: List[bytes] + audio: List[bytes] -def _get_seconds(t: str | float) -> 
float: +def _get_seconds(t: Union[str, float]) -> float: """Converts time to seconds""" if not isinstance(t, str): return float(t) # already seconds @@ -45,7 +45,7 @@ def _get_strtime(t_sec: float) -> str: return f"{hour:02d}:{minute:02d}:{second:02d}.{microsecond:03d}" -def _split_time_frame(s: float, e: float, min_length: float, max_length: float) -> list[ClipSpans]: +def _split_time_frame(s: float, e: float, min_length: float, max_length: float) -> List[ClipSpans]: """Filters out cuts by min and max length""" time_d = e - s n_full_clips = int(time_d // max_length) @@ -55,7 +55,7 @@ def _split_time_frame(s: float, e: float, min_length: float, max_length: float) return clip_spans -def _adjust_clip_spans_to_keyframes(clip_spans: list[ClipSpans], keyframes: list[float]) -> list[ClipSpans]: +def _adjust_clip_spans_to_keyframes(clip_spans: List[ClipSpans], keyframes: List[float]) -> List[ClipSpans]: """Translates clip_spans into keyframe vocab""" adjusted_clip_spans = [] for start, end in clip_spans: @@ -69,15 +69,15 @@ def _adjust_clip_spans_to_keyframes(clip_spans: list[ClipSpans], keyframes: list def _adjust_clip_spans( - clip_spans: list[ClipSpans], - keyframe_timestamps: list[float] | None, + clip_spans: List[ClipSpans], + keyframe_timestamps: List[float] | None, min_length: float, max_length: float, max_length_strategy: str, -) -> list[ClipSpans]: +) -> List[ClipSpans]: """Adjusts cut times around keyframes, filtering by min and max length""" if not isinstance(clip_spans[0], Iterable): # make sure clip_spans looks like [[start, end]] and not [start, end] - clip_spans = cast(list[ClipSpans], [clip_spans]) + clip_spans = cast(List[ClipSpans], [clip_spans]) clip_spans = [[_get_seconds(s), _get_seconds(e)] for [s, e] in clip_spans] if keyframe_timestamps: @@ -92,7 +92,7 @@ def _adjust_clip_spans( return filtered_clip_spans -def _collate_clip_spans(clip_spans: list[ClipSpans]) -> tuple[str, list[int]]: +def _collate_clip_spans(clip_spans: List[ClipSpans]) -> 
tuple[str, List[int]]: """Collates clip spans into a single string for ffmpeg and a list of clip idxs""" clip_times = [] clip_idxs = [] @@ -119,7 +119,7 @@ def _process_stream( stream_bytes: bytes, encode_format: str, ffmpeg_kwargs: dict, -) -> list[str]: +) -> List[str]: """Processes a stream into clips using ffmpeg""" # TODO: we need to put the extension into the metadata # TODO: This can be done better using pipes I just don't feel like sinking too much time into this rn @@ -139,12 +139,12 @@ def _process_stream( def _get_clip_metadata( - clip_spans: list[ClipSpans], - clip_idxs: list[int], + clip_spans: List[ClipSpans], + clip_idxs: List[int], metadata: dict, oom_clip_count: int, strtime_formatting: bool, -) -> list[dict]: +) -> List[dict]: """Gets metadata for each clip""" metadata_clips = [] for clip_id, (clip_span, _) in enumerate(zip(clip_spans, clip_idxs)): @@ -180,11 +180,11 @@ def _get_clips( streams: Streams, encode_formats: EncodeFormats, precision: str, - clip_spans: list[ClipSpans], + clip_spans: List[ClipSpans], metadata: dict, oom_clip_count: int, strtime_formatting: bool, -) -> tuple[dict[str, list[str]], list[dict]]: +) -> tuple[dict[str, List[str]], List[dict]]: """Gets clips from streams""" clip_times, clip_idxs = _collate_clip_spans(clip_spans) From 077ca27e78d713ff5b13c69692a66ed2dc95381d Mon Sep 17 00:00:00 2001 From: Matt Zhang Date: Thu, 18 Jan 2024 21:59:26 -0500 Subject: [PATCH 08/13] More annotation fixes --- .../subsamplers/clipping_subsampler.py | 20 ++++++++++--------- 1 file changed, 11 insertions(+), 9 deletions(-) diff --git a/video2dataset/subsamplers/clipping_subsampler.py b/video2dataset/subsamplers/clipping_subsampler.py index 25c7f665..317c6f92 100644 --- a/video2dataset/subsamplers/clipping_subsampler.py +++ b/video2dataset/subsamplers/clipping_subsampler.py @@ -7,7 +7,7 @@ import ffmpeg import tempfile from collections.abc import Iterable -from typing import Any, Union, List, TypedDict, Literal, cast +from typing import 
Any, Union, List, Tuple, Dict, TypedDict, Literal, cast import datetime from .subsampler import Subsampler @@ -64,13 +64,13 @@ def _adjust_clip_spans_to_keyframes(clip_spans: List[ClipSpans], keyframes: List adjusted_start = min(keyframes_in_range) adjusted_end = max(keyframes_in_range) if adjusted_start != adjusted_end: - adjusted_clip_spans.append((adjusted_start, adjusted_end)) + adjusted_clip_spans.append([adjusted_start, adjusted_end]) return adjusted_clip_spans def _adjust_clip_spans( clip_spans: List[ClipSpans], - keyframe_timestamps: List[float] | None, + keyframe_timestamps: Union[List[float], None], min_length: float, max_length: float, max_length_strategy: str, @@ -92,7 +92,7 @@ def _adjust_clip_spans( return filtered_clip_spans -def _collate_clip_spans(clip_spans: List[ClipSpans]) -> tuple[str, List[int]]: +def _collate_clip_spans(clip_spans: List[ClipSpans]) -> Tuple[str, List[int]]: """Collates clip spans into a single string for ffmpeg and a list of clip idxs""" clip_times = [] clip_idxs = [] @@ -110,8 +110,8 @@ def _collate_clip_spans(clip_spans: List[ClipSpans]) -> tuple[str, List[int]]: clip_idx += 2 e_prev = e - clip_times = ",".join([str(time) for time in clip_times]) - return clip_times, clip_idxs + clip_times_str = ",".join([str(time) for time in clip_times]) + return clip_times_str, clip_idxs def _process_stream( @@ -184,7 +184,7 @@ def _get_clips( metadata: dict, oom_clip_count: int, strtime_formatting: bool, -) -> tuple[dict[str, List[str]], List[dict]]: +) -> Tuple[Dict[str, List[bytes]], List[dict]]: """Gets clips from streams""" clip_times, clip_idxs = _collate_clip_spans(clip_spans) @@ -199,8 +199,10 @@ def _get_clips( else: ffmpeg_kwargs["c"] = "copy" - clips = {} - for k in streams.keys(): + clips: Dict[str, List[bytes]] = {} + for k in Streams.__annotations__.keys(): + if k not in streams: + continue with tempfile.TemporaryDirectory() as tmpdir: stream_bytes = streams[k][0] # pre-broadcast so only one if stream_bytes is None: From 
32fa4eaf760302a011bf263876c9f7bb17313205 Mon Sep 17 00:00:00 2001 From: Matt Zhang Date: Thu, 18 Jan 2024 22:05:04 -0500 Subject: [PATCH 09/13] The Python 3.8 annotation needs a lot of hand-holding, it seems --- video2dataset/subsamplers/clipping_subsampler.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/video2dataset/subsamplers/clipping_subsampler.py b/video2dataset/subsamplers/clipping_subsampler.py index 317c6f92..3c07e2de 100644 --- a/video2dataset/subsamplers/clipping_subsampler.py +++ b/video2dataset/subsamplers/clipping_subsampler.py @@ -200,9 +200,8 @@ def _get_clips( ffmpeg_kwargs["c"] = "copy" clips: Dict[str, List[bytes]] = {} - for k in Streams.__annotations__.keys(): - if k not in streams: - continue + for k in streams.keys(): + k = cast(Literal["audio", "video"], k) with tempfile.TemporaryDirectory() as tmpdir: stream_bytes = streams[k][0] # pre-broadcast so only one if stream_bytes is None: From 5a8957fce3285632bbf566c5c577f498b407415f Mon Sep 17 00:00:00 2001 From: Matt Zhang Date: Fri, 19 Jan 2024 00:00:31 -0500 Subject: [PATCH 10/13] Pylint has to cut it out, I swear to God --- video2dataset/subsamplers/clipping_subsampler.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/video2dataset/subsamplers/clipping_subsampler.py b/video2dataset/subsamplers/clipping_subsampler.py index 3c07e2de..2af9a93c 100644 --- a/video2dataset/subsamplers/clipping_subsampler.py +++ b/video2dataset/subsamplers/clipping_subsampler.py @@ -213,7 +213,7 @@ def _get_clips( encode_format=encode_formats[k], ffmpeg_kwargs=ffmpeg_kwargs, ) - except Exception as err: + except Exception as err: # pylint: disable=broad-except raise err clips[k] = [] @@ -306,7 +306,7 @@ def __call__(self, streams: Streams, metadata: dict): oom_clip_count=self.oom_clip_count, strtime_formatting=strtime_formatting, ) - except Exception as err: + except Exception as err: # pylint: disable=broad-except return {}, [], str(err) return clips, clip_metadata, 
None From f0f01688fe3d60069d51af3fe61565d8e35eda04 Mon Sep 17 00:00:00 2001 From: Matt Zhang Date: Fri, 19 Jan 2024 08:15:29 -0500 Subject: [PATCH 11/13] No real change, just relaunching unit tests which failed due to connection timeouts --- video2dataset/subsamplers/clipping_subsampler.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/video2dataset/subsamplers/clipping_subsampler.py b/video2dataset/subsamplers/clipping_subsampler.py index 2af9a93c..439fd7b9 100644 --- a/video2dataset/subsamplers/clipping_subsampler.py +++ b/video2dataset/subsamplers/clipping_subsampler.py @@ -1,13 +1,13 @@ """ clipping subsampler turns full videos into clips of videos according to clip_col """ -import os +from collections.abc import Iterable +from typing import Any, Union, List, Tuple, Dict, TypedDict, Literal, cast import copy -import glob import ffmpeg +import glob +import os import tempfile -from collections.abc import Iterable -from typing import Any, Union, List, Tuple, Dict, TypedDict, Literal, cast import datetime from .subsampler import Subsampler From 1df88dd6b1bc1a8f3236418a863703a200aaa019 Mon Sep 17 00:00:00 2001 From: Matt Zhang Date: Sun, 21 Jan 2024 22:46:56 -0500 Subject: [PATCH 12/13] Linting issue --- video2dataset/subsamplers/clipping_subsampler.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/video2dataset/subsamplers/clipping_subsampler.py b/video2dataset/subsamplers/clipping_subsampler.py index df68fb46..508c6ed8 100644 --- a/video2dataset/subsamplers/clipping_subsampler.py +++ b/video2dataset/subsamplers/clipping_subsampler.py @@ -140,7 +140,7 @@ def _process_stream( def _extract_subtitles(clip_span: ClipSpan, meta_clip: dict) -> List[dict]: """Extracts subtitles and groups them by language""" - clip_subtitles = [] + clip_subtitles: list[dict] = [] s_c, e_c = _get_seconds(clip_span[0]), _get_seconds(clip_span[1]) for lang_id, (lang, subtitles) in enumerate(meta_clip["yt_meta_dict"]["subtitles"].items()): idx = 0
From 226fba3bbf5ae98c689dc1f95f911a0532b6fe5c Mon Sep 17 00:00:00 2001 From: Matt Zhang Date: Sun, 21 Jan 2024 22:51:59 -0500 Subject: [PATCH 13/13] Another linting issue --- video2dataset/subsamplers/clipping_subsampler.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/video2dataset/subsamplers/clipping_subsampler.py b/video2dataset/subsamplers/clipping_subsampler.py index 508c6ed8..73eae18f 100644 --- a/video2dataset/subsamplers/clipping_subsampler.py +++ b/video2dataset/subsamplers/clipping_subsampler.py @@ -140,7 +140,7 @@ def _process_stream( def _extract_subtitles(clip_span: ClipSpan, meta_clip: dict) -> List[dict]: """Extracts subtitles and groups them by language""" - clip_subtitles: list[dict] = [] + clip_subtitles: List[dict] = [] s_c, e_c = _get_seconds(clip_span[0]), _get_seconds(clip_span[1]) for lang_id, (lang, subtitles) in enumerate(meta_clip["yt_meta_dict"]["subtitles"].items()): idx = 0