From 2efa849602b6b6040d7f93690068a1b9842eff43 Mon Sep 17 00:00:00 2001 From: Matt Zhang Date: Thu, 18 Jan 2024 10:03:29 -0500 Subject: [PATCH 01/13] ClippingSubsampler rewrite and bug fixes --- .../subsamplers/clipping_subsampler.py | 201 ++++++++++-------- 1 file changed, 115 insertions(+), 86 deletions(-) diff --git a/video2dataset/subsamplers/clipping_subsampler.py b/video2dataset/subsamplers/clipping_subsampler.py index 466fd8e8..6424eaf9 100644 --- a/video2dataset/subsamplers/clipping_subsampler.py +++ b/video2dataset/subsamplers/clipping_subsampler.py @@ -7,12 +7,26 @@ import ffmpeg import tempfile from collections.abc import Iterable +from typing import Annotated, TypedDict, Literal, cast import datetime from .subsampler import Subsampler -def _get_seconds(t): +ClipTimes = Annotated[list[float], 2] + + +class EncodeFormats(TypedDict): + video: str + audio: str + + +class Streams(TypedDict): + video: bytes + audio: bytes + + +def _get_seconds(t: str | float) -> float: if not isinstance(t, str): return float(t) # already seconds time_format = "%H:%M:%S.%f" # TODO: maybe parameterize this? 
@@ -20,7 +34,7 @@ def _get_seconds(t): return t_obj.second + t_obj.microsecond / 1e6 + t_obj.minute * 60 + t_obj.hour * 3600 -def _get_strtime(t_sec): +def _get_strtime(t_sec: float) -> str: hour = int(t_sec // 3600) minute = int((t_sec // 60) % 60) second = int(t_sec % 60) @@ -29,24 +43,20 @@ def _get_strtime(t_sec): return f"{hour:02d}:{minute:02d}:{second:02d}.{microsecond:03d}" -def _split_time_frame(s, e, min_length, max_length): +def _split_time_frame(s: float, e: float, min_length: float, max_length: float) -> list[ClipTimes]: """Filters out cuts by min and max length""" time_d = e - s - time_frames = [ - (s + i * max_length, min(s + (i + 1) * max_length, e)) - for i in range(int(time_d // max_length) + (1 if time_d % max_length > 0 else 0)) - ] - if len(time_frames) == 0: - return [] - last_time_d = time_frames[-1][1] - time_frames[-1][0] - time_frames = time_frames if last_time_d >= min_length else time_frames[:-1] - return time_frames - - -def _adjust_ranges_to_keyframes(ranges, keyframes): - """Translates ranges into keyframe vocab""" + n_full_clips = int(time_d // max_length) + clip_times = [[s + i * max_length, s + (i + 1) * max_length] for i in range(n_full_clips)] + ( + [[s + (n_full_clips - 1) * max_length, e]] if time_d % max_length > min_length else [] + ) + return clip_times + + +def _adjust_clip_times_to_keyframes(clip_times: list[ClipTimes], keyframes: list[float]) -> list[ClipTimes]: + """Translates clip_times into keyframe vocab""" adjusted_ranges = [] - for start, end in ranges: + for start, end in clip_times: keyframes_in_range = [k for k in keyframes if start <= k <= end] if keyframes_in_range: adjusted_start = min(keyframes_in_range) @@ -56,6 +66,52 @@ def _adjust_ranges_to_keyframes(ranges, keyframes): return adjusted_ranges +def _adjust_clip_times( + clip_times: list[ClipTimes], + keyframe_timestamps: list[float] | None, + min_length: float, + max_length: float, + max_length_strategy: str, +) -> list[ClipTimes]: + if not 
isinstance(clip_times[0], Iterable): # make sure clip_times looks like [[start, end]] and not [start, end] + clip_times = cast(list[ClipTimes], [clip_times]) + clip_times = [[_get_seconds(s), _get_seconds(e)] for [s, e] in clip_times] + + if keyframe_timestamps: + clip_times = _adjust_clip_times_to_keyframes(clip_times, keyframe_timestamps) + + filtered_clip_times = [] + for s, e in clip_times: + max_len_clip_times = _split_time_frame(s, e, min_length, max_length) + if max_length_strategy == "first": + max_len_clip_times = max_len_clip_times[:1] + filtered_clip_times += max_len_clip_times + return filtered_clip_times + + +def _get_clip_intervals(clip_times: list[ClipTimes]) -> tuple[str, list[int]]: + s_clip, e_clip = clip_times[0] + skip_first_interval = int(s_clip > 0.0) + + # which timestamp intervals to take, used to discard non-contiguous sections + intervals = [skip_first_interval] + timestamps = [0.0] + skip_first_interval * [s_clip] + [e_clip] + interval = 1 + skip_first_interval + for s, e in clip_times[1:]: + if s == e_clip: # situations like [0, 1], [1, 2], [2, 3] -> 1, 2 + timestamps += [e] + intervals.append(interval) + interval += 1 + else: + timestamps += [s, e] + intervals.append(interval + 1) + interval += 2 + e_clip = e + + timestamps = ",".join([str(time) for time in timestamps]) + return timestamps, intervals + + class ClippingSubsampler(Subsampler): """ Cuts videos up into segments according to the 'clips' metadata @@ -85,72 +141,59 @@ class ClippingSubsampler(Subsampler): def __init__( self, - oom_clip_count, - encode_formats, - min_length=0.0, - max_length=999999.0, - max_length_strategy="all", - precision="low", + oom_clip_count: int, + encode_formats: EncodeFormats, + min_length: float = 0.0, + max_length: float = 999999.0, + max_length_strategy: Literal["all", "first"] = "all", + precision: Literal["low", "keyframe_adjusted", "exact"] = "low", ): + assert max_length_strategy in ["all", "first"] + assert precision in ["exact", "low", 
"keyframe_adjusted"] self.oom_clip_count = oom_clip_count self.encode_formats = encode_formats self.min_length = min_length - self.max_length, self.max_length_strategy = max_length, max_length_strategy - assert precision in ["exact", "low", "keyframe_adjusted"] + self.max_length = max_length + self.max_length_strategy = max_length_strategy self.precision = precision def __call__(self, streams, metadata): - clips = metadata.pop("clips") - - if not isinstance(clips[0], Iterable): # make sure clips looks like [[start, end]] and not [start, end] - clips = [clips] + strtime_formatting = isinstance(metadata["clips"][0][0], str) - is_strtime = isinstance(clips[0][0], str) + clip_times = _adjust_clip_times( + clip_times=metadata.pop("clips"), + keyframe_timestamps=( + # TODO: make it so if keyframe timestamps not present, get it yourself + metadata["video_metadata"].pop("keyframe_timestamps") + if self.precision == "keyframe_adjusted" + else None + ), + min_length=self.min_length, + max_length=self.max_length, + max_length_strategy=self.max_length_strategy, + ) + if len(clip_times) == 0: + return {}, [], f"Video had no clip_times longer than {self.min_length}" - if self.precision == "keyframe_adjusted": - # TODO: make it so if not present, get it yourself - keyframe_timestamps = metadata["video_metadata"].pop("keyframe_timestamps") - s_clips = [[_get_seconds(s), _get_seconds(e)] for (s, e) in clips] - clips = _adjust_ranges_to_keyframes(s_clips, keyframe_timestamps) + timestamps, intervals = _get_clip_intervals(clip_times) - filtered_clips = [] - for s, e in clips: - max_len_clips = _split_time_frame(_get_seconds(s), _get_seconds(e), self.min_length, self.max_length) + ffmpeg_kwargs = { + "map": 0, + "f": "segment", + "segment_times": timestamps, + "reset_timestamps": 1, + } + if self.precision == "exact": + ffmpeg_kwargs["force_key_frames"] = timestamps + else: + ffmpeg_kwargs["c"] = "copy" - if self.max_length_strategy == "first": - max_len_clips = max_len_clips[:1] - 
filtered_clips += max_len_clips - clips = filtered_clips - if len(clips) == 0: - # return an error - return {}, [], f"Video had no clips longer than {self.min_length}" - start_0 = _get_seconds(clips[0][0]) == 0.0 - ind = 1 + int(not start_0) - s_p, e_p = clips[0] - s_p, e_p = _get_seconds(s_p), _get_seconds(e_p) - splits = (not start_0) * [s_p] + [e_p] - # list of indicies of clips to take, used to discard non-contiguous sections - take_inds = [int(not start_0)] - # TODO: make nicer - for s, e in clips[1:]: - s, e = _get_seconds(s), _get_seconds(e) - if s == e_p: # situations like [0, 1], [1, 2], [2, 3] -> 1, 2 - splits += [e] - take_inds.append(ind) - ind += 1 - else: - splits += [s, e] - take_inds.append(ind + 1) - ind += 2 - e_p = e - - segment_times = ",".join([str(spl) for spl in splits]) streams_clips = {} for k in streams.keys(): @@ -165,25 +208,11 @@ def __call__(self, streams, metadata): with open(os.path.join(tmpdir, f"input.{encode_format}"), "wb") as f: f.write(stream_bytes) try: - kwargs = { - "map": 0, - "f": "segment", - "segment_times": segment_times, - "reset_timestamps": 1, - } - - # Precision things, tradeoff for speed - if self.precision != "exact": - kwargs["c"] = "copy" - else: - kwargs["force_key_frames"] = segment_times - - _ = ( + ( ffmpeg.input(f"{tmpdir}/input.{encode_format}") - .output(f"{tmpdir}/clip_%d.{encode_format}", **kwargs) + .output(f"{tmpdir}/clip_%d.{encode_format}", **ffmpeg_kwargs) .run(capture_stdout=True, quiet=True) ) - except Exception as err: # pylint: disable=broad-except return {}, [], str(err) @@ -191,10 +220,10 @@ def __call__(self, streams, metadata): stream_clips.sort(key=lambda x: int(x.split("_")[-1].split(".")[0])) correct_clips = [] - for clip_id, (clip, ind) in enumerate(zip(clips, take_inds)): + for clip_id, (clip, ind) in enumerate(zip(clip_times, intervals)): if ind < len(stream_clips): correct_clips.append((clip_id, clip, stream_clips[ind])) - # clips_lost = len(take_inds) - len(correct_clips) # TODO 
report this somehow + # clips_lost = len(intervals) - len(correct_clips) # TODO report this somehow stream_clips, metadata_clips = [], [] for clip_id, clip_span, clip_pth in correct_clips: @@ -207,8 +236,8 @@ def __call__(self, streams, metadata): ) meta_clip = copy.deepcopy(metadata) # set the timeframe of this clip - if is_strtime: - # Keep clips in the original format to be compatible with the data schema. + if strtime_formatting: + # Keep clip_times in the original format to be compatible with the data schema. meta_clip["clips"] = [(_get_strtime(clip_span[0]), _get_strtime(clip_span[1]))] else: meta_clip["clips"] = [clip_span] From a5c9649b32af7541e7887a0460e4ecf46e855f4f Mon Sep 17 00:00:00 2001 From: Matt Zhang Date: Thu, 18 Jan 2024 11:59:45 -0500 Subject: [PATCH 02/13] More refactoring of ClippingSubsampler, plus a fix to _get_clip_intervals --- .../subsamplers/clipping_subsampler.py | 52 +++++++++---------- 1 file changed, 25 insertions(+), 27 deletions(-) diff --git a/video2dataset/subsamplers/clipping_subsampler.py b/video2dataset/subsamplers/clipping_subsampler.py index 6424eaf9..4d2e578e 100644 --- a/video2dataset/subsamplers/clipping_subsampler.py +++ b/video2dataset/subsamplers/clipping_subsampler.py @@ -55,15 +55,15 @@ def _split_time_frame(s: float, e: float, min_length: float, max_length: float) def _adjust_clip_times_to_keyframes(clip_times: list[ClipTimes], keyframes: list[float]) -> list[ClipTimes]: """Translates clip_times into keyframe vocab""" - adjusted_ranges = [] + adjusted_clip_times = [] for start, end in clip_times: keyframes_in_range = [k for k in keyframes if start <= k <= end] if keyframes_in_range: adjusted_start = min(keyframes_in_range) adjusted_end = max(keyframes_in_range) if adjusted_start != adjusted_end: - adjusted_ranges.append((adjusted_start, adjusted_end)) - return adjusted_ranges + adjusted_clip_times.append((adjusted_start, adjusted_end)) + return adjusted_clip_times def _adjust_clip_times( @@ -89,27 +89,25 @@ def 
_adjust_clip_times( return filtered_clip_times -def _get_clip_intervals(clip_times: list[ClipTimes]) -> tuple[str, list[int]]: - s_clip, e_clip = clip_times[0] - skip_first_interval = int(s_clip > 0.0) +def _get_clip_times(clip_times: list[ClipTimes]) -> tuple[str, list[int]]: + all_clip_times = [0.0] + clip_idxs = [] + e_prev = 0.0 + clip_idx = 0 - # which timestamp intervals to take, used to discard non-contiguous sections - intervals = [skip_first_interval] - timestamps = [0.0] + skip_first_interval * [s_clip] + [e_clip] - interval = 1 + skip_first_interval - for s, e in clip_times[1:]: - if s == e_clip: # situations like [0, 1], [1, 2], [2, 3] -> 1, 2 - timestamps += [e] - intervals.append(interval) - interval += 1 - else: - timestamps += [s, e] - intervals.append(interval + 1) - interval += 2 - e_clip = e + for s, e in clip_times: + if s == e_prev: # clip starts where last one left off + all_clip_times += [e] + clip_idxs.append(clip_idx) + clip_idx += 1 + else: # next clip skips over some time + all_clip_times += [s, e] + clip_idxs.append(clip_idx + 1) + clip_idx += 2 + e_prev = e - timestamps = ",".join([str(time) for time in timestamps]) - return timestamps, intervals + all_clip_times = ",".join([str(time) for time in all_clip_times]) + return all_clip_times, clip_idxs class ClippingSubsampler(Subsampler): @@ -175,16 +173,16 @@ def __call__(self, streams, metadata): if len(clip_times) == 0: return {}, [], f"Video had no clip_times longer than {self.min_length}" - timestamps, intervals = _get_clip_intervals(clip_times) + all_clip_times, clip_idxs = _get_clip_times(clip_times) ffmpeg_kwargs = { "map": 0, "f": "segment", - "segment_times": timestamps, + "segment_times": all_clip_times, "reset_timestamps": 1, } if self.precision == "exact": - ffmpeg_kwargs["force_key_frames"] = timestamps + ffmpeg_kwargs["force_key_frames"] = all_clip_times else: ffmpeg_kwargs["c"] = "copy" @@ -220,10 +218,10 @@ def __call__(self, streams, metadata): stream_clips.sort(key=lambda 
x: int(x.split("_")[-1].split(".")[0])) correct_clips = [] - for clip_id, (clip, ind) in enumerate(zip(clip_times, intervals)): + for clip_id, (clip, ind) in enumerate(zip(clip_times, clip_idxs)): if ind < len(stream_clips): correct_clips.append((clip_id, clip, stream_clips[ind])) - # clips_lost = len(intervals) - len(correct_clips) # TODO report this somehow + # clips_lost = len(clip_idxs) - len(correct_clips) # TODO report this somehow stream_clips, metadata_clips = [], [] for clip_id, clip_span, clip_pth in correct_clips: From 2cb5854b03760d4d75404c25ac3f553586c43876 Mon Sep 17 00:00:00 2001 From: Matt Zhang Date: Thu, 18 Jan 2024 12:48:17 -0500 Subject: [PATCH 03/13] Finished refactoring ClippingSubsampler --- .../subsamplers/clipping_subsampler.py | 281 ++++++++++-------- 1 file changed, 159 insertions(+), 122 deletions(-) diff --git a/video2dataset/subsamplers/clipping_subsampler.py b/video2dataset/subsamplers/clipping_subsampler.py index 4d2e578e..9ae4ee60 100644 --- a/video2dataset/subsamplers/clipping_subsampler.py +++ b/video2dataset/subsamplers/clipping_subsampler.py @@ -13,7 +13,7 @@ from .subsampler import Subsampler -ClipTimes = Annotated[list[float], 2] +ClipSpans = Annotated[list[float], 2] class EncodeFormats(TypedDict): @@ -43,71 +43,180 @@ def _get_strtime(t_sec: float) -> str: return f"{hour:02d}:{minute:02d}:{second:02d}.{microsecond:03d}" -def _split_time_frame(s: float, e: float, min_length: float, max_length: float) -> list[ClipTimes]: +def _split_time_frame(s: float, e: float, min_length: float, max_length: float) -> list[ClipSpans]: """Filters out cuts by min and max length""" time_d = e - s n_full_clips = int(time_d // max_length) - clip_times = [[s + i * max_length, s + (i + 1) * max_length] for i in range(n_full_clips)] + ( + clip_spans = [[s + i * max_length, s + (i + 1) * max_length] for i in range(n_full_clips)] + ( [[s + (n_full_clips - 1) * max_length, e]] if time_d % max_length > min_length else [] ) - return clip_times + return 
clip_spans -def _adjust_clip_times_to_keyframes(clip_times: list[ClipTimes], keyframes: list[float]) -> list[ClipTimes]: - """Translates clip_times into keyframe vocab""" - adjusted_clip_times = [] - for start, end in clip_times: +def _adjust_clip_spans_to_keyframes(clip_spans: list[ClipSpans], keyframes: list[float]) -> list[ClipSpans]: + """Translates clip_spans into keyframe vocab""" + adjusted_clip_spans = [] + for start, end in clip_spans: keyframes_in_range = [k for k in keyframes if start <= k <= end] if keyframes_in_range: adjusted_start = min(keyframes_in_range) adjusted_end = max(keyframes_in_range) if adjusted_start != adjusted_end: - adjusted_clip_times.append((adjusted_start, adjusted_end)) - return adjusted_clip_times + adjusted_clip_spans.append((adjusted_start, adjusted_end)) + return adjusted_clip_spans -def _adjust_clip_times( - clip_times: list[ClipTimes], +def _adjust_clip_spans( + clip_spans: list[ClipSpans], keyframe_timestamps: list[float] | None, min_length: float, max_length: float, max_length_strategy: str, -) -> list[ClipTimes]: - if not isinstance(clip_times[0], Iterable): # make sure clip_times looks like [[start, end]] and not [start, end] - clip_times = cast(list[ClipTimes], [clip_times]) - clip_times = [[_get_seconds(s), _get_seconds(e)] for [s, e] in clip_times] +) -> list[ClipSpans]: + if not isinstance(clip_spans[0], Iterable): # make sure clip_spans looks like [[start, end]] and not [start, end] + clip_spans = cast(list[ClipSpans], [clip_spans]) + clip_spans = [[_get_seconds(s), _get_seconds(e)] for [s, e] in clip_spans] if keyframe_timestamps: - clip_times = _adjust_clip_times_to_keyframes(clip_times, keyframe_timestamps) + clip_spans = _adjust_clip_spans_to_keyframes(clip_spans, keyframe_timestamps) - filtered_clip_times = [] - for s, e in clip_times: - max_len_clip_times = _split_time_frame(s, e, min_length, max_length) + filtered_clip_spans = [] + for s, e in clip_spans: + max_len_clip_spans = _split_time_frame(s, e, 
min_length, max_length) if max_length_strategy == "first": - max_len_clip_times = max_len_clip_times[:1] - filtered_clip_times += max_len_clip_times - return filtered_clip_times + max_len_clip_spans = max_len_clip_spans[:1] + filtered_clip_spans += max_len_clip_spans + return filtered_clip_spans -def _get_clip_times(clip_times: list[ClipTimes]) -> tuple[str, list[int]]: - all_clip_times = [0.0] +def _get_clip_spans(clip_spans: list[ClipSpans]) -> tuple[str, list[int]]: + segment_times = [0.0] clip_idxs = [] e_prev = 0.0 clip_idx = 0 - for s, e in clip_times: + for s, e in clip_spans: if s == e_prev: # clip starts where last one left off - all_clip_times += [e] + segment_times += [e] clip_idxs.append(clip_idx) clip_idx += 1 else: # next clip skips over some time - all_clip_times += [s, e] + segment_times += [s, e] clip_idxs.append(clip_idx + 1) clip_idx += 2 e_prev = e - all_clip_times = ",".join([str(time) for time in all_clip_times]) - return all_clip_times, clip_idxs + segment_times = ",".join([str(time) for time in segment_times]) + return segment_times, clip_idxs + + +def _process_stream(stream_bytes: bytes, encode_format: str, ffmpeg_kwargs: dict) -> list[str]: + with tempfile.TemporaryDirectory() as tmpdir: + # TODO: we need to put the extension into the metadata + # TODO: This can be done better using pipes I just don't feel like sinking too much time into this rn + with open(os.path.join(tmpdir, f"input.{encode_format}"), "wb") as f: + f.write(stream_bytes) + try: + ( + ffmpeg.input(f"{tmpdir}/input.{encode_format}") + .output(f"{tmpdir}/clip_%d.{encode_format}", **ffmpeg_kwargs) + .run(capture_stdout=True, quiet=True) + ) + except Exception as err: # pylint: disable=broad-except + raise err + stream_clips = glob.glob(f"{tmpdir}/clip*.{encode_format}") + stream_clips.sort(key=lambda x: int(x.split("_")[-1].split(".")[0])) + return stream_clips + + +def _get_clip_metadata( + clip_spans: list[ClipSpans], + clip_idxs: list[int], + metadata: dict, + 
oom_clip_count: int, + strtime_formatting: bool, +) -> list[dict]: + metadata_clips = [] + for clip_id, (clip_span, _) in enumerate(zip(clip_spans, clip_idxs)): + clip_key = "{clip_id:0{oom_clip_count}d}".format( # pylint: disable=consider-using-f-string + clip_id=clip_id, oom_clip_count=oom_clip_count + ) + meta_clip = copy.deepcopy(metadata) + # set the timeframe of this clip + if strtime_formatting: + # Keep clip_spans in the original format to be compatible with the data schema. + meta_clip["clips"] = [(_get_strtime(clip_span[0]), _get_strtime(clip_span[1]))] + else: + meta_clip["clips"] = [clip_span] + meta_clip["key"] = f"{meta_clip['key']}_{clip_key}" + + yt_md_dict = meta_clip.get("yt_meta_dict", {}) + if (yt_md_dict is not None) and (yt_md_dict.get("subtitles", None) is not None): + clip_subtitles = [] + s_c, e_c = _get_seconds(clip_span[0]), _get_seconds(clip_span[1]) + for line in meta_clip["yt_meta_dict"]["subtitles"]: + s, e = _get_seconds(line["start"]), _get_seconds(line["end"]) + if max(s_c, s) < min(e_c, e): + clip_subtitles.append(line) + elif s > e_c: + break + # full video subtitles might still be useful for context + meta_clip["clip_subtitles"] = clip_subtitles + metadata_clips.append(meta_clip) + return metadata_clips + + +def _get_clips( + streams: Streams, + encode_formats: EncodeFormats, + precision: str, + clip_spans: list[ClipSpans], + metadata: dict, + oom_clip_count: int, + strtime_formatting: bool, +) -> tuple[dict[str, list[str]], list[dict]]: + segment_times, clip_idxs = _get_clip_spans(clip_spans) + + ffmpeg_kwargs = { + "map": 0, + "f": "segment", + "segment_times": segment_times, + "reset_timestamps": 1, + } + if precision == "exact": + ffmpeg_kwargs["force_key_frames"] = segment_times + else: + ffmpeg_kwargs["c"] = "copy" + + clips = {} + for k in streams.keys(): + stream_bytes = streams[k][0] # pre-broadcast so only one + if stream_bytes is None: + continue + try: + stream_clips = _process_stream( + stream_bytes=stream_bytes, + 
encode_format=encode_formats[k], + ffmpeg_kwargs=ffmpeg_kwargs, + ) + except Exception as err: + raise err + + clips[k] = [] + for _, (_, clip_idx) in enumerate(zip(clip_spans, clip_idxs)): + with open(stream_clips[clip_idx], "rb") as vid_f: + clip_bytes = vid_f.read() + clips[k].append(clip_bytes) + + clip_metadata = _get_clip_metadata( + clip_spans=clip_spans, + clip_idxs=clip_idxs, + metadata=metadata, + oom_clip_count=oom_clip_count, + strtime_formatting=strtime_formatting, + ) + + return clips, clip_metadata class ClippingSubsampler(Subsampler): @@ -155,11 +264,11 @@ def __init__( self.max_length_strategy = max_length_strategy self.precision = precision - def __call__(self, streams, metadata): + def __call__(self, streams: Streams, metadata: dict): strtime_formatting = isinstance(metadata["clips"][0][0], str) - clip_times = _adjust_clip_times( - clip_times=metadata.pop("clips"), + clip_spans = _adjust_clip_spans( + clip_spans=metadata.pop("clips"), keyframe_timestamps=( # TODO: make it so if keyframe timestamps not present, get it yourself metadata["video_metadata"].pop("keyframe_timestamps") @@ -170,92 +279,20 @@ def __call__(self, streams, metadata): max_length=self.max_length, max_length_strategy=self.max_length_strategy, ) - if len(clip_times) == 0: - return {}, [], f"Video had no clip_times longer than {self.min_length}" - - all_clip_times, clip_idxs = _get_clip_times(clip_times) - - ffmpeg_kwargs = { - "map": 0, - "f": "segment", - "segment_times": all_clip_times, - "reset_timestamps": 1, - } - if self.precision == "exact": - ffmpeg_kwargs["force_key_frames"] = all_clip_times - else: - ffmpeg_kwargs["c"] = "copy" - - - - - - - - streams_clips = {} - - for k in streams.keys(): - stream_bytes = streams[k][0] # pre-broadcast so only one - if stream_bytes is None: - continue - encode_format = self.encode_formats[k] - - with tempfile.TemporaryDirectory() as tmpdir: - # TODO: we need to put the extension into the metadata - # TODO: This can be done better 
using pipes I just don't feel like sinking too much time into this rn - with open(os.path.join(tmpdir, f"input.{encode_format}"), "wb") as f: - f.write(stream_bytes) - try: - ( - ffmpeg.input(f"{tmpdir}/input.{encode_format}") - .output(f"{tmpdir}/clip_%d.{encode_format}", **ffmpeg_kwargs) - .run(capture_stdout=True, quiet=True) - ) - except Exception as err: # pylint: disable=broad-except - return {}, [], str(err) - - stream_clips = glob.glob(f"{tmpdir}/clip*.{encode_format}") - stream_clips.sort(key=lambda x: int(x.split("_")[-1].split(".")[0])) - - correct_clips = [] - for clip_id, (clip, ind) in enumerate(zip(clip_times, clip_idxs)): - if ind < len(stream_clips): - correct_clips.append((clip_id, clip, stream_clips[ind])) - # clips_lost = len(clip_idxs) - len(correct_clips) # TODO report this somehow - - stream_clips, metadata_clips = [], [] - for clip_id, clip_span, clip_pth in correct_clips: - with open(clip_pth, "rb") as vid_f: - clip_bytes = vid_f.read() - stream_clips.append(clip_bytes) - - clip_key = "{clip_id:0{oom_clip_count}d}".format( # pylint: disable=consider-using-f-string - clip_id=clip_id, oom_clip_count=self.oom_clip_count - ) - meta_clip = copy.deepcopy(metadata) - # set the timeframe of this clip - if strtime_formatting: - # Keep clip_times in the original format to be compatible with the data schema. 
- meta_clip["clips"] = [(_get_strtime(clip_span[0]), _get_strtime(clip_span[1]))] - else: - meta_clip["clips"] = [clip_span] - meta_clip["key"] = f"{meta_clip['key']}_{clip_key}" - - yt_md_dict = meta_clip.get("yt_meta_dict", {}) - if (yt_md_dict is not None) and (yt_md_dict.get("subtitles", None) is not None): - clip_subtitles = [] - s_c, e_c = _get_seconds(clip_span[0]), _get_seconds(clip_span[1]) - for line in meta_clip["yt_meta_dict"]["subtitles"]: - s, e = _get_seconds(line["start"]), _get_seconds(line["end"]) - if max(s_c, s) < min(e_c, e): - clip_subtitles.append(line) - elif s > e_c: - break - # full video subtitles might still be useful for context - meta_clip["clip_subtitles"] = clip_subtitles - - metadata_clips.append(meta_clip) - - streams_clips[k] = stream_clips - - return streams_clips, metadata_clips, None + if len(clip_spans) == 0: + return {}, [], f"Video had no clip_spans longer than {self.min_length}" + + try: + clips, clip_metadata = _get_clips( + streams=streams, + encode_formats=self.encode_formats, + precision=self.precision, + clip_spans=clip_spans, + metadata=metadata, + oom_clip_count=self.oom_clip_count, + strtime_formatting=strtime_formatting, + ) + except Exception as err: + return {}, [], str(err) + + return clips, clip_metadata, None From 5d03b720e10345a5492de4d0c8529a354c3a48fc Mon Sep 17 00:00:00 2001 From: Matt Zhang Date: Thu, 18 Jan 2024 20:48:19 -0500 Subject: [PATCH 04/13] Final code changes --- .../subsamplers/clipping_subsampler.py | 94 ++++++++++--------- 1 file changed, 50 insertions(+), 44 deletions(-) diff --git a/video2dataset/subsamplers/clipping_subsampler.py b/video2dataset/subsamplers/clipping_subsampler.py index 9ae4ee60..a0960ff0 100644 --- a/video2dataset/subsamplers/clipping_subsampler.py +++ b/video2dataset/subsamplers/clipping_subsampler.py @@ -7,7 +7,7 @@ import ffmpeg import tempfile from collections.abc import Iterable -from typing import Annotated, TypedDict, Literal, cast +from typing import Any, 
Annotated, TypedDict, Literal, cast import datetime from .subsampler import Subsampler @@ -89,42 +89,46 @@ def _adjust_clip_spans( return filtered_clip_spans -def _get_clip_spans(clip_spans: list[ClipSpans]) -> tuple[str, list[int]]: - segment_times = [0.0] +def _collate_clip_spans(clip_spans: list[ClipSpans]) -> tuple[str, list[int]]: + clip_times = [0.0] clip_idxs = [] e_prev = 0.0 clip_idx = 0 for s, e in clip_spans: if s == e_prev: # clip starts where last one left off - segment_times += [e] + clip_times += [e] clip_idxs.append(clip_idx) clip_idx += 1 else: # next clip skips over some time - segment_times += [s, e] + clip_times += [s, e] clip_idxs.append(clip_idx + 1) clip_idx += 2 e_prev = e - segment_times = ",".join([str(time) for time in segment_times]) - return segment_times, clip_idxs - - -def _process_stream(stream_bytes: bytes, encode_format: str, ffmpeg_kwargs: dict) -> list[str]: - with tempfile.TemporaryDirectory() as tmpdir: - # TODO: we need to put the extension into the metadata - # TODO: This can be done better using pipes I just don't feel like sinking too much time into this rn - with open(os.path.join(tmpdir, f"input.{encode_format}"), "wb") as f: - f.write(stream_bytes) - try: - ( - ffmpeg.input(f"{tmpdir}/input.{encode_format}") - .output(f"{tmpdir}/clip_%d.{encode_format}", **ffmpeg_kwargs) - .run(capture_stdout=True, quiet=True) - ) - except Exception as err: # pylint: disable=broad-except - raise err - stream_clips = glob.glob(f"{tmpdir}/clip*.{encode_format}") + clip_times = ",".join([str(time) for time in clip_times]) + return clip_times, clip_idxs + + +def _process_stream( + tmpdir: Any, # BytesPath + stream_bytes: bytes, + encode_format: str, + ffmpeg_kwargs: dict, +) -> list[str]: + # TODO: we need to put the extension into the metadata + # TODO: This can be done better using pipes I just don't feel like sinking too much time into this rn + with open(os.path.join(tmpdir, f"input.{encode_format}"), "wb") as f: + f.write(stream_bytes) 
+ try: + ( + ffmpeg.input(f"{tmpdir}/input.{encode_format}") + .output(f"{tmpdir}/clip_%d.{encode_format}", **ffmpeg_kwargs) + .run(capture_stdout=True, quiet=True) + ) + except Exception as err: # pylint: disable=broad-except + raise err + stream_clips = glob.glob(f"{tmpdir}/clip*.{encode_format}") stream_clips.sort(key=lambda x: int(x.split("_")[-1].split(".")[0])) return stream_clips @@ -175,38 +179,40 @@ def _get_clips( oom_clip_count: int, strtime_formatting: bool, ) -> tuple[dict[str, list[str]], list[dict]]: - segment_times, clip_idxs = _get_clip_spans(clip_spans) + clip_times, clip_idxs = _collate_clip_spans(clip_spans) ffmpeg_kwargs = { "map": 0, "f": "segment", - "segment_times": segment_times, + "segment_times": clip_times, "reset_timestamps": 1, } if precision == "exact": - ffmpeg_kwargs["force_key_frames"] = segment_times + ffmpeg_kwargs["force_key_frames"] = clip_times else: ffmpeg_kwargs["c"] = "copy" clips = {} for k in streams.keys(): - stream_bytes = streams[k][0] # pre-broadcast so only one - if stream_bytes is None: - continue - try: - stream_clips = _process_stream( - stream_bytes=stream_bytes, - encode_format=encode_formats[k], - ffmpeg_kwargs=ffmpeg_kwargs, - ) - except Exception as err: - raise err - - clips[k] = [] - for _, (_, clip_idx) in enumerate(zip(clip_spans, clip_idxs)): - with open(stream_clips[clip_idx], "rb") as vid_f: - clip_bytes = vid_f.read() - clips[k].append(clip_bytes) + with tempfile.TemporaryDirectory() as tmpdir: + stream_bytes = streams[k][0] # pre-broadcast so only one + if stream_bytes is None: + continue + try: + stream_clips = _process_stream( + tmpdir=tmpdir, + stream_bytes=stream_bytes, + encode_format=encode_formats[k], + ffmpeg_kwargs=ffmpeg_kwargs, + ) + except Exception as err: + raise err + + clips[k] = [] + for _, (_, clip_idx) in enumerate(zip(clip_spans, clip_idxs)): + with open(stream_clips[clip_idx], "rb") as vid_f: + clip_bytes = vid_f.read() + clips[k].append(clip_bytes) clip_metadata = 
_get_clip_metadata( clip_spans=clip_spans, From 47c7d647a5fc36cd11953aaa5baaf000f200458e Mon Sep 17 00:00:00 2001 From: Matt Zhang Date: Thu, 18 Jan 2024 20:53:16 -0500 Subject: [PATCH 05/13] Added docstrings --- video2dataset/subsamplers/clipping_subsampler.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/video2dataset/subsamplers/clipping_subsampler.py b/video2dataset/subsamplers/clipping_subsampler.py index a0960ff0..3f18d703 100644 --- a/video2dataset/subsamplers/clipping_subsampler.py +++ b/video2dataset/subsamplers/clipping_subsampler.py @@ -27,6 +27,7 @@ class Streams(TypedDict): def _get_seconds(t: str | float) -> float: + """Converts time to seconds""" if not isinstance(t, str): return float(t) # already seconds time_format = "%H:%M:%S.%f" # TODO: maybe parameterize this? @@ -35,6 +36,7 @@ def _get_seconds(t: str | float) -> float: def _get_strtime(t_sec: float) -> str: + """Converts time to string""" hour = int(t_sec // 3600) minute = int((t_sec // 60) % 60) second = int(t_sec % 60) @@ -73,6 +75,7 @@ def _adjust_clip_spans( max_length: float, max_length_strategy: str, ) -> list[ClipSpans]: + """Adjusts cut times around keyframes, filtering by min and max length""" if not isinstance(clip_spans[0], Iterable): # make sure clip_spans looks like [[start, end]] and not [start, end] clip_spans = cast(list[ClipSpans], [clip_spans]) clip_spans = [[_get_seconds(s), _get_seconds(e)] for [s, e] in clip_spans] @@ -90,6 +93,7 @@ def _adjust_clip_spans( def _collate_clip_spans(clip_spans: list[ClipSpans]) -> tuple[str, list[int]]: + """Collates clip spans into a single string for ffmpeg and a list of clip idxs""" clip_times = [0.0] clip_idxs = [] e_prev = 0.0 @@ -116,6 +120,7 @@ def _process_stream( encode_format: str, ffmpeg_kwargs: dict, ) -> list[str]: + """Processes a stream into clips using ffmpeg""" # TODO: we need to put the extension into the metadata # TODO: This can be done better using pipes I just don't feel like sinking too much time into this rn 
with open(os.path.join(tmpdir, f"input.{encode_format}"), "wb") as f: @@ -140,6 +145,7 @@ def _get_clip_metadata( oom_clip_count: int, strtime_formatting: bool, ) -> list[dict]: + """Gets metadata for each clip""" metadata_clips = [] for clip_id, (clip_span, _) in enumerate(zip(clip_spans, clip_idxs)): clip_key = "{clip_id:0{oom_clip_count}d}".format( # pylint: disable=consider-using-f-string @@ -179,6 +185,7 @@ def _get_clips( oom_clip_count: int, strtime_formatting: bool, ) -> tuple[dict[str, list[str]], list[dict]]: + """Gets clips from streams""" clip_times, clip_idxs = _collate_clip_spans(clip_spans) ffmpeg_kwargs = { From 5aa84d49d95535fd3db4f109ef94ba8238973e87 Mon Sep 17 00:00:00 2001 From: Matt Zhang Date: Thu, 18 Jan 2024 21:35:15 -0500 Subject: [PATCH 06/13] Passed tests and linting --- tests/test_subsamplers.py | 10 +++++----- video2dataset/subsamplers/__init__.py | 2 +- video2dataset/subsamplers/clipping_subsampler.py | 10 +++++----- 3 files changed, 11 insertions(+), 11 deletions(-) diff --git a/tests/test_subsamplers.py b/tests/test_subsamplers.py index e6a5b5f0..28ace480 100644 --- a/tests/test_subsamplers.py +++ b/tests/test_subsamplers.py @@ -11,6 +11,7 @@ ClippingSubsampler, _get_seconds, _split_time_frame, + Streams, FFProbeSubsampler, ResolutionSubsampler, FrameSubsampler, @@ -45,8 +46,8 @@ def test_clipping_subsampler(clips): min_length = 5.0 if clips == MULTI else 2.0 max_length = 999999.0 if clips == MULTI else 3.0 subsampler = ClippingSubsampler( - 3, - {"video": "mp4", "audio": "mp3"}, + oom_clip_count=3, + encode_formats={"video": "mp4", "audio": "mp3"}, min_length=min_length, max_length=max_length, max_length_strategy="all", @@ -58,7 +59,7 @@ def test_clipping_subsampler(clips): "clips": clips, } - streams = {"video": [video_bytes], "audio": [audio_bytes]} + streams: Streams = {"video": [video_bytes], "audio": [audio_bytes]} stream_fragments, meta_fragments, error_message = subsampler(streams, metadata) video_fragments = 
stream_fragments["video"] audio_fragments = stream_fragments["audio"] @@ -84,7 +85,7 @@ def test_clipping_subsampler(clips): s_target, e_target = clips[key_ind] s_target, e_target = _get_seconds(s_target), _get_seconds(e_target) expected_clips = _split_time_frame(s_target, e_target, min_length, max_length) - assert (_get_seconds(s), _get_seconds(e)) in expected_clips + assert [_get_seconds(s), _get_seconds(e)] in expected_clips assert _get_seconds(e) - _get_seconds(s) >= min_length s_s, e_s = _get_seconds(s), _get_seconds(e) @@ -92,7 +93,6 @@ def test_clipping_subsampler(clips): video_stream = [stream for stream in probe["streams"] if stream["codec_type"] == "video"][0] frag_len = float(video_stream["duration"]) - # currently some segments can be pretty innacurate assert abs(frag_len - (e_s - s_s)) < 5.0 diff --git a/video2dataset/subsamplers/__init__.py b/video2dataset/subsamplers/__init__.py index 5d4741f8..90e4cd58 100644 --- a/video2dataset/subsamplers/__init__.py +++ b/video2dataset/subsamplers/__init__.py @@ -3,7 +3,7 @@ """ from .audio_rate_subsampler import AudioRateSubsampler -from .clipping_subsampler import ClippingSubsampler, _get_seconds, _split_time_frame +from .clipping_subsampler import ClippingSubsampler, _get_seconds, _split_time_frame, Streams from .frame_subsampler import FrameSubsampler from .ffprobe_subsampler import FFProbeSubsampler from .noop_subsampler import NoOpSubsampler diff --git a/video2dataset/subsamplers/clipping_subsampler.py b/video2dataset/subsamplers/clipping_subsampler.py index 3f18d703..b3ae717a 100644 --- a/video2dataset/subsamplers/clipping_subsampler.py +++ b/video2dataset/subsamplers/clipping_subsampler.py @@ -22,8 +22,8 @@ class EncodeFormats(TypedDict): class Streams(TypedDict): - video: bytes - audio: bytes + video: list[bytes] + audio: list[bytes] def _get_seconds(t: str | float) -> float: @@ -50,7 +50,7 @@ def _split_time_frame(s: float, e: float, min_length: float, max_length: float) time_d = e - s n_full_clips = 
int(time_d // max_length) clip_spans = [[s + i * max_length, s + (i + 1) * max_length] for i in range(n_full_clips)] + ( - [[s + (n_full_clips - 1) * max_length, e]] if time_d % max_length > min_length else [] + [[s + (n_full_clips) * max_length, e]] if time_d % max_length > min_length else [] ) return clip_spans @@ -94,7 +94,7 @@ def _adjust_clip_spans( def _collate_clip_spans(clip_spans: list[ClipSpans]) -> tuple[str, list[int]]: """Collates clip spans into a single string for ffmpeg and a list of clip idxs""" - clip_times = [0.0] + clip_times = [] clip_idxs = [] e_prev = 0.0 clip_idx = 0 @@ -216,7 +216,7 @@ def _get_clips( raise err clips[k] = [] - for _, (_, clip_idx) in enumerate(zip(clip_spans, clip_idxs)): + for clip_idx in clip_idxs: with open(stream_clips[clip_idx], "rb") as vid_f: clip_bytes = vid_f.read() clips[k].append(clip_bytes) From 140e1abbe4445916b5f81347673adba1f7e9ebbe Mon Sep 17 00:00:00 2001 From: Matt Zhang Date: Thu, 18 Jan 2024 21:48:17 -0500 Subject: [PATCH 07/13] Made type annotations consistent with Python 3.8 --- .../subsamplers/clipping_subsampler.py | 36 +++++++++---------- 1 file changed, 18 insertions(+), 18 deletions(-) diff --git a/video2dataset/subsamplers/clipping_subsampler.py b/video2dataset/subsamplers/clipping_subsampler.py index b3ae717a..25c7f665 100644 --- a/video2dataset/subsamplers/clipping_subsampler.py +++ b/video2dataset/subsamplers/clipping_subsampler.py @@ -7,13 +7,13 @@ import ffmpeg import tempfile from collections.abc import Iterable -from typing import Any, Annotated, TypedDict, Literal, cast +from typing import Any, Union, List, TypedDict, Literal, cast import datetime from .subsampler import Subsampler -ClipSpans = Annotated[list[float], 2] +ClipSpans = List[float] # [start, end] class EncodeFormats(TypedDict): @@ -22,11 +22,11 @@ class EncodeFormats(TypedDict): class Streams(TypedDict): - video: list[bytes] - audio: list[bytes] + video: List[bytes] + audio: List[bytes] -def _get_seconds(t: str | float) -> 
float: +def _get_seconds(t: Union[str, float]) -> float: """Converts time to seconds""" if not isinstance(t, str): return float(t) # already seconds @@ -45,7 +45,7 @@ def _get_strtime(t_sec: float) -> str: return f"{hour:02d}:{minute:02d}:{second:02d}.{microsecond:03d}" -def _split_time_frame(s: float, e: float, min_length: float, max_length: float) -> list[ClipSpans]: +def _split_time_frame(s: float, e: float, min_length: float, max_length: float) -> List[ClipSpans]: """Filters out cuts by min and max length""" time_d = e - s n_full_clips = int(time_d // max_length) @@ -55,7 +55,7 @@ def _split_time_frame(s: float, e: float, min_length: float, max_length: float) return clip_spans -def _adjust_clip_spans_to_keyframes(clip_spans: list[ClipSpans], keyframes: list[float]) -> list[ClipSpans]: +def _adjust_clip_spans_to_keyframes(clip_spans: List[ClipSpans], keyframes: List[float]) -> List[ClipSpans]: """Translates clip_spans into keyframe vocab""" adjusted_clip_spans = [] for start, end in clip_spans: @@ -69,15 +69,15 @@ def _adjust_clip_spans_to_keyframes(clip_spans: list[ClipSpans], keyframes: list def _adjust_clip_spans( - clip_spans: list[ClipSpans], - keyframe_timestamps: list[float] | None, + clip_spans: List[ClipSpans], + keyframe_timestamps: List[float] | None, min_length: float, max_length: float, max_length_strategy: str, -) -> list[ClipSpans]: +) -> List[ClipSpans]: """Adjusts cut times around keyframes, filtering by min and max length""" if not isinstance(clip_spans[0], Iterable): # make sure clip_spans looks like [[start, end]] and not [start, end] - clip_spans = cast(list[ClipSpans], [clip_spans]) + clip_spans = cast(List[ClipSpans], [clip_spans]) clip_spans = [[_get_seconds(s), _get_seconds(e)] for [s, e] in clip_spans] if keyframe_timestamps: @@ -92,7 +92,7 @@ def _adjust_clip_spans( return filtered_clip_spans -def _collate_clip_spans(clip_spans: list[ClipSpans]) -> tuple[str, list[int]]: +def _collate_clip_spans(clip_spans: List[ClipSpans]) -> 
tuple[str, List[int]]: """Collates clip spans into a single string for ffmpeg and a list of clip idxs""" clip_times = [] clip_idxs = [] @@ -119,7 +119,7 @@ def _process_stream( stream_bytes: bytes, encode_format: str, ffmpeg_kwargs: dict, -) -> list[str]: +) -> List[str]: """Processes a stream into clips using ffmpeg""" # TODO: we need to put the extension into the metadata # TODO: This can be done better using pipes I just don't feel like sinking too much time into this rn @@ -139,12 +139,12 @@ def _process_stream( def _get_clip_metadata( - clip_spans: list[ClipSpans], - clip_idxs: list[int], + clip_spans: List[ClipSpans], + clip_idxs: List[int], metadata: dict, oom_clip_count: int, strtime_formatting: bool, -) -> list[dict]: +) -> List[dict]: """Gets metadata for each clip""" metadata_clips = [] for clip_id, (clip_span, _) in enumerate(zip(clip_spans, clip_idxs)): @@ -180,11 +180,11 @@ def _get_clips( streams: Streams, encode_formats: EncodeFormats, precision: str, - clip_spans: list[ClipSpans], + clip_spans: List[ClipSpans], metadata: dict, oom_clip_count: int, strtime_formatting: bool, -) -> tuple[dict[str, list[str]], list[dict]]: +) -> tuple[dict[str, List[str]], List[dict]]: """Gets clips from streams""" clip_times, clip_idxs = _collate_clip_spans(clip_spans) From 077ca27e78d713ff5b13c69692a66ed2dc95381d Mon Sep 17 00:00:00 2001 From: Matt Zhang Date: Thu, 18 Jan 2024 21:59:26 -0500 Subject: [PATCH 08/13] More annotation fixes --- .../subsamplers/clipping_subsampler.py | 20 ++++++++++--------- 1 file changed, 11 insertions(+), 9 deletions(-) diff --git a/video2dataset/subsamplers/clipping_subsampler.py b/video2dataset/subsamplers/clipping_subsampler.py index 25c7f665..317c6f92 100644 --- a/video2dataset/subsamplers/clipping_subsampler.py +++ b/video2dataset/subsamplers/clipping_subsampler.py @@ -7,7 +7,7 @@ import ffmpeg import tempfile from collections.abc import Iterable -from typing import Any, Union, List, TypedDict, Literal, cast +from typing import 
Any, Union, List, Tuple, Dict, TypedDict, Literal, cast import datetime from .subsampler import Subsampler @@ -64,13 +64,13 @@ def _adjust_clip_spans_to_keyframes(clip_spans: List[ClipSpans], keyframes: List adjusted_start = min(keyframes_in_range) adjusted_end = max(keyframes_in_range) if adjusted_start != adjusted_end: - adjusted_clip_spans.append((adjusted_start, adjusted_end)) + adjusted_clip_spans.append([adjusted_start, adjusted_end]) return adjusted_clip_spans def _adjust_clip_spans( clip_spans: List[ClipSpans], - keyframe_timestamps: List[float] | None, + keyframe_timestamps: Union[List[float], None], min_length: float, max_length: float, max_length_strategy: str, @@ -92,7 +92,7 @@ def _adjust_clip_spans( return filtered_clip_spans -def _collate_clip_spans(clip_spans: List[ClipSpans]) -> tuple[str, List[int]]: +def _collate_clip_spans(clip_spans: List[ClipSpans]) -> Tuple[str, List[int]]: """Collates clip spans into a single string for ffmpeg and a list of clip idxs""" clip_times = [] clip_idxs = [] @@ -110,8 +110,8 @@ def _collate_clip_spans(clip_spans: List[ClipSpans]) -> tuple[str, List[int]]: clip_idx += 2 e_prev = e - clip_times = ",".join([str(time) for time in clip_times]) - return clip_times, clip_idxs + clip_times_str = ",".join([str(time) for time in clip_times]) + return clip_times_str, clip_idxs def _process_stream( @@ -184,7 +184,7 @@ def _get_clips( metadata: dict, oom_clip_count: int, strtime_formatting: bool, -) -> tuple[dict[str, List[str]], List[dict]]: +) -> Tuple[Dict[str, List[bytes]], List[dict]]: """Gets clips from streams""" clip_times, clip_idxs = _collate_clip_spans(clip_spans) @@ -199,8 +199,10 @@ def _get_clips( else: ffmpeg_kwargs["c"] = "copy" - clips = {} - for k in streams.keys(): + clips: Dict[str, List[bytes]] = {} + for k in Streams.__annotations__.keys(): + if k not in streams: + continue with tempfile.TemporaryDirectory() as tmpdir: stream_bytes = streams[k][0] # pre-broadcast so only one if stream_bytes is None: From 
32fa4eaf760302a011bf263876c9f7bb17313205 Mon Sep 17 00:00:00 2001 From: Matt Zhang Date: Thu, 18 Jan 2024 22:05:04 -0500 Subject: [PATCH 09/13] The Python 3.8 annotation needs a lot of hand-holding, it seems --- video2dataset/subsamplers/clipping_subsampler.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/video2dataset/subsamplers/clipping_subsampler.py b/video2dataset/subsamplers/clipping_subsampler.py index 317c6f92..3c07e2de 100644 --- a/video2dataset/subsamplers/clipping_subsampler.py +++ b/video2dataset/subsamplers/clipping_subsampler.py @@ -200,9 +200,8 @@ def _get_clips( ffmpeg_kwargs["c"] = "copy" clips: Dict[str, List[bytes]] = {} - for k in Streams.__annotations__.keys(): - if k not in streams: - continue + for k in streams.keys(): + k = cast(Literal["audio", "video"], k) with tempfile.TemporaryDirectory() as tmpdir: stream_bytes = streams[k][0] # pre-broadcast so only one if stream_bytes is None: From 5a8957fce3285632bbf566c5c577f498b407415f Mon Sep 17 00:00:00 2001 From: Matt Zhang Date: Fri, 19 Jan 2024 00:00:31 -0500 Subject: [PATCH 10/13] Pylint has to cut it out, I swear to God --- video2dataset/subsamplers/clipping_subsampler.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/video2dataset/subsamplers/clipping_subsampler.py b/video2dataset/subsamplers/clipping_subsampler.py index 3c07e2de..2af9a93c 100644 --- a/video2dataset/subsamplers/clipping_subsampler.py +++ b/video2dataset/subsamplers/clipping_subsampler.py @@ -213,7 +213,7 @@ def _get_clips( encode_format=encode_formats[k], ffmpeg_kwargs=ffmpeg_kwargs, ) - except Exception as err: + except Exception as err: # pylint: disable=broad-except raise err clips[k] = [] @@ -306,7 +306,7 @@ def __call__(self, streams: Streams, metadata: dict): oom_clip_count=self.oom_clip_count, strtime_formatting=strtime_formatting, ) - except Exception as err: + except Exception as err: # pylint: disable=broad-except return {}, [], str(err) return clips, clip_metadata, 
None From f0f01688fe3d60069d51af3fe61565d8e35eda04 Mon Sep 17 00:00:00 2001 From: Matt Zhang Date: Fri, 19 Jan 2024 08:15:29 -0500 Subject: [PATCH 11/13] No real change, just relaunching unit tests which failed due to connection timeouts --- video2dataset/subsamplers/clipping_subsampler.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/video2dataset/subsamplers/clipping_subsampler.py b/video2dataset/subsamplers/clipping_subsampler.py index 2af9a93c..439fd7b9 100644 --- a/video2dataset/subsamplers/clipping_subsampler.py +++ b/video2dataset/subsamplers/clipping_subsampler.py @@ -1,13 +1,13 @@ """ clipping subsampler turns full videos into clips of videos according to clip_col """ -import os +from collections.abc import Iterable +from typing import Any, Union, List, Tuple, Dict, TypedDict, Literal, cast import copy -import glob import ffmpeg +import glob +import os import tempfile -from collections.abc import Iterable -from typing import Any, Union, List, Tuple, Dict, TypedDict, Literal, cast import datetime from .subsampler import Subsampler From 1df88dd6b1bc1a8f3236418a863703a200aaa019 Mon Sep 17 00:00:00 2001 From: Matt Zhang Date: Sun, 21 Jan 2024 22:46:56 -0500 Subject: [PATCH 12/13] Linting issue --- video2dataset/subsamplers/clipping_subsampler.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/video2dataset/subsamplers/clipping_subsampler.py b/video2dataset/subsamplers/clipping_subsampler.py index df68fb46..508c6ed8 100644 --- a/video2dataset/subsamplers/clipping_subsampler.py +++ b/video2dataset/subsamplers/clipping_subsampler.py @@ -140,7 +140,7 @@ def _process_stream( def _extract_subtitles(clip_span: ClipSpan, meta_clip: dict) -> List[dict]: """Extracts subtitles and groups them by language""" - clip_subtitles = [] + clip_subtitles: list[dict] = [] s_c, e_c = _get_seconds(clip_span[0]), _get_seconds(clip_span[1]) for lang_id, (lang, subtitles) in enumerate(meta_clip["yt_meta_dict"]["subtitles"].items()): idx = 0
From 226fba3bbf5ae98c689dc1f95f911a0532b6fe5c Mon Sep 17 00:00:00 2001 From: Matt Zhang Date: Sun, 21 Jan 2024 22:51:59 -0500 Subject: [PATCH 13/13] Another linting issue --- video2dataset/subsamplers/clipping_subsampler.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/video2dataset/subsamplers/clipping_subsampler.py b/video2dataset/subsamplers/clipping_subsampler.py index 508c6ed8..73eae18f 100644 --- a/video2dataset/subsamplers/clipping_subsampler.py +++ b/video2dataset/subsamplers/clipping_subsampler.py @@ -140,7 +140,7 @@ def _process_stream( def _extract_subtitles(clip_span: ClipSpan, meta_clip: dict) -> List[dict]: """Extracts subtitles and groups them by language""" - clip_subtitles: list[dict] = [] + clip_subtitles: List[dict] = [] s_c, e_c = _get_seconds(clip_span[0]), _get_seconds(clip_span[1]) for lang_id, (lang, subtitles) in enumerate(meta_clip["yt_meta_dict"]["subtitles"].items()): idx = 0