Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Subset worker refactor #287

Merged
merged 28 commits into from
Jan 25, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
28 commits
Select commit Hold shift + click to select a range
2efa849
ClippingSubsampler rewrite and bug fixes
MattUnderscoreZhang Jan 18, 2024
a5c9649
More refactoring of ClippingSubsampler, plus a fix to _get_clip_inter…
MattUnderscoreZhang Jan 18, 2024
2cb5854
Finished refactoring ClippingSubsampler
MattUnderscoreZhang Jan 18, 2024
6106f62
Merge branch 'clipping_subsampler_rewrite' into all_fixes
MattUnderscoreZhang Jan 18, 2024
5d03b72
Final code changes
MattUnderscoreZhang Jan 19, 2024
47c7d64
Added docstrings
MattUnderscoreZhang Jan 19, 2024
5aa84d4
Passed tests and linting
MattUnderscoreZhang Jan 19, 2024
140e1ab
Made type annotations consistent with Python 3.8
MattUnderscoreZhang Jan 19, 2024
077ca27
More annotation fixes
MattUnderscoreZhang Jan 19, 2024
32fa4ea
The Python 3.8 annotation needs a lot of hand-holding, it seems
MattUnderscoreZhang Jan 19, 2024
5a8957f
Pylint has to cut it out, I swear to God
MattUnderscoreZhang Jan 19, 2024
f0f0168
No real change, just relauching unit tests which failed due to connec…
MattUnderscoreZhang Jan 19, 2024
f5d7c85
Merge branch 'main' into clipping_subsampler_refactor
iejMac Jan 19, 2024
388f51a
Merge branch 'main' into clipping_subsampler_refactor
rom1504 Jan 21, 2024
5101379
Merge remote-tracking branch 'origin/main' into clipping_subsampler_r…
MattUnderscoreZhang Jan 22, 2024
1df88dd
Linting issue
MattUnderscoreZhang Jan 22, 2024
226fba3
Another linting issue
MattUnderscoreZhang Jan 22, 2024
8ed5074
Separated per-shard code from code that should only be executed once
MattUnderscoreZhang Jan 24, 2024
e862eaa
Pulled ShardStatus parameters into their own data type
MattUnderscoreZhang Jan 24, 2024
d158106
Cleaned up shard processing error handling
MattUnderscoreZhang Jan 24, 2024
5cd53a9
Cleaned up code
MattUnderscoreZhang Jan 24, 2024
ffe0e71
Bug fixes
MattUnderscoreZhang Jan 24, 2024
2c7daf8
Formatting
MattUnderscoreZhang Jan 24, 2024
ac5a35b
Fixed linting issues
MattUnderscoreZhang Jan 24, 2024
5222f39
Fixing more damn linting
MattUnderscoreZhang Jan 24, 2024
6dc8991
Added a missing docstring
MattUnderscoreZhang Jan 24, 2024
bdc47cc
Merge branch 'main' into subset_worker_refactoring
MattUnderscoreZhang Jan 24, 2024
cd0d27c
Removed git worktree folder (ugh)
MattUnderscoreZhang Jan 24, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
17 changes: 9 additions & 8 deletions video2dataset/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,25 +9,26 @@
from typing import List, Optional, Any
import numpy as np # pylint: disable=unused-import

from .logger import LoggerProcess
from .data_writer import (
from video2dataset.logger import LoggerProcess
from video2dataset.data_writer import (
WebDatasetSampleWriter,
FilesSampleWriter,
ParquetSampleWriter,
TFRecordSampleWriter,
DummySampleWriter,
)
from .input_sharder import InputSharder
from .output_sharder import OutputSharder
from .distributor import (
from video2dataset.input_sharder import InputSharder
from video2dataset.output_sharder import OutputSharder
from video2dataset.distributor import (
no_distributor,
multiprocessing_distributor,
pyspark_distributor,
SlurmDistributor,
SlurmShardSampler,
)
from .workers import DownloadWorker, SubsetWorker, OpticalFlowWorker, CaptionWorker, WhisperWorker
from .configs import CONFIGS
from video2dataset.workers import DownloadWorker, SubsetWorker, OpticalFlowWorker, CaptionWorker, WhisperWorker
from video2dataset.configs import CONFIGS
from video2dataset.types import EncodeFormats


def identity(x):
Expand All @@ -42,7 +43,7 @@ def video2dataset(
output_folder: str = "dataset",
output_format: str = "files",
input_format: str = "csv",
encode_formats: Optional[dict] = None,
encode_formats: Optional[EncodeFormats] = None,
stage: str = "download",
url_col: str = "url",
caption_col: Optional[str] = None,
Expand Down
17 changes: 4 additions & 13 deletions video2dataset/subsamplers/clipping_subsampler.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,30 +2,21 @@
clipping subsampler turns full videos into clips of videos according to clip_col
"""
from collections.abc import Iterable
from typing import Any, Union, List, Tuple, Dict, TypedDict, Literal, cast
import copy
import datetime
import ffmpeg
import glob
import os
import tempfile
from typing import Any, Union, List, Tuple, Dict, Literal, cast

import datetime
from .subsampler import Subsampler
from video2dataset.subsamplers.subsampler import Subsampler
from video2dataset.types import EncodeFormats, Streams


# Type alias for one clip's time span: a two-element list of floats ordered
# as [start, end]. NOTE(review): values are presumably in seconds (this module
# has a _get_seconds helper that "converts time to seconds") — confirm.
ClipSpan = List[float] # [start, end]


class EncodeFormats(TypedDict):
    """Typed dictionary holding one string entry per modality.

    Declared with the default totality, so type checkers require both the
    ``video`` and ``audio`` keys to be present.
    """

    # NOTE(review): presumably the encoding/container format name for the
    # video stream — confirm against callers.
    video: str
    # NOTE(review): presumably the encoding/container format name for the
    # audio stream — confirm against callers.
    audio: str


class Streams(TypedDict):
    """Typed dictionary holding a list of byte strings per modality.

    Declared with the default totality, so type checkers require both the
    ``video`` and ``audio`` keys to be present.
    """

    # NOTE(review): presumably one bytes object per video stream/clip —
    # confirm against callers.
    video: List[bytes]
    # NOTE(review): presumably one bytes object per audio stream/clip —
    # confirm against callers.
    audio: List[bytes]


def _get_seconds(t: Union[str, float]) -> float:
"""Converts time to seconds"""
if not isinstance(t, str):
Expand Down
12 changes: 12 additions & 0 deletions video2dataset/types.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
"""Type definitions for video2dataset."""
from typing import List, TypedDict


class EncodeFormats(TypedDict, total=False):
    """Typed dictionary with optional ``video`` and ``audio`` string entries.

    ``total=False`` makes every key optional for type checkers, so a value
    may carry only ``video``, only ``audio``, both, or neither.
    """

    # NOTE(review): presumably the encoding/container format name for the
    # video stream — confirm against callers.
    video: str
    # NOTE(review): presumably the encoding/container format name for the
    # audio stream — confirm against callers.
    audio: str


class Streams(TypedDict, total=False):
    """Typed dictionary with optional ``video`` and ``audio`` byte-string lists.

    ``total=False`` makes every key optional for type checkers, so a value
    may carry only ``video``, only ``audio``, both, or neither.
    """

    # NOTE(review): presumably one bytes object per video stream/clip —
    # confirm against callers.
    video: List[bytes]
    # NOTE(review): presumably one bytes object per audio stream/clip —
    # confirm against callers.
    audio: List[bytes]
Loading
Loading