From 5e6ce95bb45daa02482b3484148b04cc8510e7b4 Mon Sep 17 00:00:00 2001 From: Ali Hamdi Ali Fadel Date: Fri, 1 Nov 2024 16:42:04 +0000 Subject: [PATCH] Remove --playlist_items option --- README.en.md | 23 ++++++----------------- README.md | 23 ++++++----------------- tafrigh/cli.py | 3 +-- tafrigh/config.py | 2 -- tafrigh/downloader.py | 4 +--- tafrigh/utils/cli_utils.py | 32 -------------------------------- 6 files changed, 14 insertions(+), 73 deletions(-) diff --git a/README.en.md b/README.en.md index 8a23bf0..178a65b 100644 --- a/README.en.md +++ b/README.en.md @@ -73,7 +73,6 @@ @@ -159,14 +158,10 @@ ``` ➜ tafrigh --help -usage: tafrigh [-h] [--version] [--skip_if_output_exist | --no-skip_if_output_exist] [--playlist_items PLAYLIST_ITEMS] - [--download_retries DOWNLOAD_RETRIES] [--verbose | --no-verbose] [-m MODEL_NAME_OR_PATH] [-t {transcribe,translate}] +usage: tafrigh [-h] [--version] [--skip_if_output_exist | --no-skip_if_output_exist] [--download_retries DOWNLOAD_RETRIES] [--verbose | --no-verbose] [-m MODEL_NAME_OR_PATH] [-t {transcribe,translate}] [-l {af,am,ar,as,az,ba,be,bg,bn,bo,br,bs,ca,cs,cy,da,de,el,en,es,et,eu,fa,fi,fo,fr,gl,gu,ha,haw,he,hi,hr,ht,hu,hy,id,is,it,ja,jw,ka,kk,km,kn,ko,la,lb,ln,lo,lt,lv,mg,mi,mk,ml,mn,mr,ms,mt,my,ne,nl,nn,no,oc,pa,pl,ps,pt,ro,ru,sa,sd,si,sk,sl,sn,so,sq,sr,su,sv,sw,ta,te,tg,th,tk,tl,tr,tt,uk,ur,uz,vi,yi,yo,zh}] - [--use_faster_whisper | --no-use_faster_whisper] [--beam_size BEAM_SIZE] - [--ct2_compute_type {default,int8,int8_float16,int16,float16}] - [-w WIT_CLIENT_ACCESS_TOKENS [WIT_CLIENT_ACCESS_TOKENS ...]] [--max_cutting_duration [1-17]] - [--min_words_per_segment MIN_WORDS_PER_SEGMENT] [--save_files_before_compact | --no-save_files_before_compact] - [--save_yt_dlp_responses | --no-save_yt_dlp_responses] [--output_sample OUTPUT_SAMPLE] + [--use_faster_whisper | --no-use_faster_whisper] [--beam_size BEAM_SIZE] [--ct2_compute_type {default,int8,int8_float16,int16,float16}] [-w WIT_CLIENT_ACCESS_TOKENS 
[WIT_CLIENT_ACCESS_TOKENS ...]] [--max_cutting_duration [1-17]] + [--min_words_per_segment MIN_WORDS_PER_SEGMENT] [--save_files_before_compact | --no-save_files_before_compact] [--save_yt_dlp_responses | --no-save_yt_dlp_responses] [--output_sample OUTPUT_SAMPLE] [-f {all,txt,srt,vtt,csv,tsv,json,none} [{all,txt,srt,vtt,csv,tsv,json,none} ...]] [-o OUTPUT_DIR] urls_or_paths [urls_or_paths ...] @@ -178,8 +173,6 @@ Input: urls_or_paths Video/Playlist URLs or local folder/file(s) to transcribe. --skip_if_output_exist, --no-skip_if_output_exist Whether to skip generating the output if the output file already exists. - --playlist_items PLAYLIST_ITEMS - Comma separated playlist_index of the items to download. You can specify a range using "[START]:[STOP][:STEP]". --download_retries DOWNLOAD_RETRIES Number of retries for yt-dlp downloads that fail. --verbose, --no-verbose @@ -201,22 +194,19 @@ Whisper: Wit: -w WIT_CLIENT_ACCESS_TOKENS [WIT_CLIENT_ACCESS_TOKENS ...], --wit_client_access_tokens WIT_CLIENT_ACCESS_TOKENS [WIT_CLIENT_ACCESS_TOKENS ...] - List of wit.ai client access tokens. If provided, wit.ai APIs will be used to do the transcription, otherwise - whisper will be used. + List of wit.ai client access tokens. If provided, wit.ai APIs will be used to do the transcription, otherwise whisper will be used. --max_cutting_duration [1-17] The maximum allowed cutting duration. It should be between 1 and 17. Output: --min_words_per_segment MIN_WORDS_PER_SEGMENT - The minimum number of words should appear in each transcript segment. Any segment have words count less than - this threshold will be merged with the next one. Pass 0 to disable this behavior. + The minimum number of words should appear in each transcript segment. Any segment have words count less than this threshold will be merged with the next one. Pass 0 to disable this behavior. 
--save_files_before_compact, --no-save_files_before_compact Saves the output files before applying the compact logic that is based on --min_words_per_segment. --save_yt_dlp_responses, --no-save_yt_dlp_responses Whether to save the yt-dlp library JSON responses or not. --output_sample OUTPUT_SAMPLE - Samples random compacted segments from the output and generates a CSV file contains the sampled data. Pass 0 to - disable this behavior. + Samples random compacted segments from the output and generates a CSV file contains the sampled data. Pass 0 to disable this behavior. -f {all,txt,srt,vtt,csv,tsv,json,none} [{all,txt,srt,vtt,csv,tsv,json,none} ...], --output_formats {all,txt,srt,vtt,csv,tsv,json,none} [{all,txt,srt,vtt,csv,tsv,json,none} ...] Format of the output file; if not specified, all available formats will be produced. -o OUTPUT_DIR, --output_dir OUTPUT_DIR @@ -321,7 +311,6 @@ if __name__ == '__main__': input=Config.Input( urls_or_paths=['https://youtu.be/qFsUwp5iomU'], skip_if_output_exist=False, - playlist_items='', download_retries=3, verbose=False, ), diff --git a/README.md b/README.md index 5eacfab..c244393 100644 --- a/README.md +++ b/README.md @@ -72,7 +72,6 @@ @@ -158,14 +157,10 @@ ``` ➜ tafrigh --help -usage: tafrigh [-h] [--version] [--skip_if_output_exist | --no-skip_if_output_exist] [--playlist_items PLAYLIST_ITEMS] - [--download_retries DOWNLOAD_RETRIES] [--verbose | --no-verbose] [-m MODEL_NAME_OR_PATH] [-t {transcribe,translate}] +usage: tafrigh [-h] [--version] [--skip_if_output_exist | --no-skip_if_output_exist] [--download_retries DOWNLOAD_RETRIES] [--verbose | --no-verbose] [-m MODEL_NAME_OR_PATH] [-t {transcribe,translate}] [-l {af,am,ar,as,az,ba,be,bg,bn,bo,br,bs,ca,cs,cy,da,de,el,en,es,et,eu,fa,fi,fo,fr,gl,gu,ha,haw,he,hi,hr,ht,hu,hy,id,is,it,ja,jw,ka,kk,km,kn,ko,la,lb,ln,lo,lt,lv,mg,mi,mk,ml,mn,mr,ms,mt,my,ne,nl,nn,no,oc,pa,pl,ps,pt,ro,ru,sa,sd,si,sk,sl,sn,so,sq,sr,su,sv,sw,ta,te,tg,th,tk,tl,tr,tt,uk,ur,uz,vi,yi,yo,zh}] - 
[--use_faster_whisper | --no-use_faster_whisper] [--beam_size BEAM_SIZE] - [--ct2_compute_type {default,int8,int8_float16,int16,float16}] - [-w WIT_CLIENT_ACCESS_TOKENS [WIT_CLIENT_ACCESS_TOKENS ...]] [--max_cutting_duration [1-17]] - [--min_words_per_segment MIN_WORDS_PER_SEGMENT] [--save_files_before_compact | --no-save_files_before_compact] - [--save_yt_dlp_responses | --no-save_yt_dlp_responses] [--output_sample OUTPUT_SAMPLE] + [--use_faster_whisper | --no-use_faster_whisper] [--beam_size BEAM_SIZE] [--ct2_compute_type {default,int8,int8_float16,int16,float16}] [-w WIT_CLIENT_ACCESS_TOKENS [WIT_CLIENT_ACCESS_TOKENS ...]] [--max_cutting_duration [1-17]] + [--min_words_per_segment MIN_WORDS_PER_SEGMENT] [--save_files_before_compact | --no-save_files_before_compact] [--save_yt_dlp_responses | --no-save_yt_dlp_responses] [--output_sample OUTPUT_SAMPLE] [-f {all,txt,srt,vtt,csv,tsv,json,none} [{all,txt,srt,vtt,csv,tsv,json,none} ...]] [-o OUTPUT_DIR] urls_or_paths [urls_or_paths ...] @@ -177,8 +172,6 @@ Input: urls_or_paths Video/Playlist URLs or local folder/file(s) to transcribe. --skip_if_output_exist, --no-skip_if_output_exist Whether to skip generating the output if the output file already exists. - --playlist_items PLAYLIST_ITEMS - Comma separated playlist_index of the items to download. You can specify a range using "[START]:[STOP][:STEP]". --download_retries DOWNLOAD_RETRIES Number of retries for yt-dlp downloads that fail. --verbose, --no-verbose @@ -200,22 +193,19 @@ Whisper: Wit: -w WIT_CLIENT_ACCESS_TOKENS [WIT_CLIENT_ACCESS_TOKENS ...], --wit_client_access_tokens WIT_CLIENT_ACCESS_TOKENS [WIT_CLIENT_ACCESS_TOKENS ...] - List of wit.ai client access tokens. If provided, wit.ai APIs will be used to do the transcription, otherwise - whisper will be used. + List of wit.ai client access tokens. If provided, wit.ai APIs will be used to do the transcription, otherwise whisper will be used. --max_cutting_duration [1-17] The maximum allowed cutting duration. 
It should be between 1 and 17. Output: --min_words_per_segment MIN_WORDS_PER_SEGMENT - The minimum number of words should appear in each transcript segment. Any segment have words count less than - this threshold will be merged with the next one. Pass 0 to disable this behavior. + The minimum number of words should appear in each transcript segment. Any segment have words count less than this threshold will be merged with the next one. Pass 0 to disable this behavior. --save_files_before_compact, --no-save_files_before_compact Saves the output files before applying the compact logic that is based on --min_words_per_segment. --save_yt_dlp_responses, --no-save_yt_dlp_responses Whether to save the yt-dlp library JSON responses or not. --output_sample OUTPUT_SAMPLE - Samples random compacted segments from the output and generates a CSV file contains the sampled data. Pass 0 to - disable this behavior. + Samples random compacted segments from the output and generates a CSV file contains the sampled data. Pass 0 to disable this behavior. -f {all,txt,srt,vtt,csv,tsv,json,none} [{all,txt,srt,vtt,csv,tsv,json,none} ...], --output_formats {all,txt,srt,vtt,csv,tsv,json,none} [{all,txt,srt,vtt,csv,tsv,json,none} ...] Format of the output file; if not specified, all available formats will be produced. 
-o OUTPUT_DIR, --output_dir OUTPUT_DIR @@ -320,7 +310,6 @@ if __name__ == '__main__': input=Config.Input( urls_or_paths=['https://youtu.be/qFsUwp5iomU'], skip_if_output_exist=False, - playlist_items='', download_retries=3, verbose=False, ), diff --git a/tafrigh/cli.py b/tafrigh/cli.py index c62fcce..fb2668c 100644 --- a/tafrigh/cli.py +++ b/tafrigh/cli.py @@ -42,7 +42,6 @@ def main(): input=Config.Input( urls_or_paths=args.urls_or_paths, skip_if_output_exist=args.skip_if_output_exist, - playlist_items=args.playlist_items, download_retries=args.download_retries, verbose=args.verbose, ), @@ -193,7 +192,7 @@ def process_url( config: Config, progress_info: dict, ) -> Generator[tuple[dict[str, Any], list[SegmentType]], None, None]: - url_data = Downloader(playlist_items=config.input.playlist_items, output_dir=config.output.output_dir).download( + url_data = Downloader(output_dir=config.output.output_dir).download( url, retries=config.input.download_retries, save_response=config.output.save_yt_dlp_responses, diff --git a/tafrigh/config.py b/tafrigh/config.py index aa938ec..42cc9c4 100644 --- a/tafrigh/config.py +++ b/tafrigh/config.py @@ -20,13 +20,11 @@ def __init__( self, urls_or_paths: list[str], skip_if_output_exist: bool, - playlist_items: str, download_retries: int, verbose: bool, ): self.urls_or_paths = urls_or_paths self.skip_if_output_exist = skip_if_output_exist - self.playlist_items = playlist_items self.download_retries = download_retries self.verbose = verbose diff --git a/tafrigh/downloader.py b/tafrigh/downloader.py index d97eb2a..674ef55 100644 --- a/tafrigh/downloader.py +++ b/tafrigh/downloader.py @@ -7,8 +7,7 @@ class Downloader: - def __init__(self, playlist_items: str, output_dir: str): - self.playlist_items = playlist_items + def __init__(self, output_dir: str): self.output_dir = output_dir self._initialize_youtube_dl_with_archive() @@ -44,7 +43,6 @@ def _config(self, **kwargs: Any) -> dict[str, Any]: 'format': 'bestaudio', 'ignoreerrors': True, 
'outtmpl': os.path.join(self.output_dir, '%(id)s.%(ext)s'), - 'playlist_items': self.playlist_items, 'postprocessors': [ { 'key': 'FFmpegExtractAudio', diff --git a/tafrigh/utils/cli_utils.py b/tafrigh/utils/cli_utils.py index 2058c0f..d4bd996 100644 --- a/tafrigh/utils/cli_utils.py +++ b/tafrigh/utils/cli_utils.py @@ -5,16 +5,6 @@ from tafrigh.types.transcript_type import TranscriptType -PLAYLIST_ITEMS_RE = re.compile( - r'''(?x) - (?P<start>[+-]?\d+)? - (?P<range>[:-] - (?P<end>[+-]?\d+|inf(?:inite)?)? - (?::(?P<step>[+-]?\d+))? - )?''' -) - - def parse_args(argv: list[str]) -> argparse.Namespace: parser = argparse.ArgumentParser() @@ -39,12 +29,6 @@ def parse_args(argv: list[str]) -> argparse.Namespace: help='Whether to skip generating the output if the output file already exists.', ) - input_group.add_argument( - '--playlist_items', - type=parse_playlist_items, - help='Comma separated playlist_index of the items to download. You can specify a range using "[START]:[STOP][:STEP]".', - ) - input_group.add_argument( '--download_retries', type=int, @@ -181,22 +165,6 @@ def parse_args(argv: list[str]) -> argparse.Namespace: return parser.parse_args(argv) -def parse_playlist_items(arg_value: str) -> str: - for segment in arg_value.split(','): - if not segment: - raise ValueError('There is two or more consecutive commas.') - - mobj = PLAYLIST_ITEMS_RE.fullmatch(segment) - if not mobj: - raise ValueError(f'{segment!r} is not a valid specification.') - - _, _, step, _ = mobj.group('start', 'end', 'step', 'range') - if int_or_none(step) == 0: - raise ValueError(f'Step in {segment!r} cannot be zero.') - - return arg_value - - def int_or_none(v, scale=1, default=None, get_attr=None, invscale=1): if get_attr and v is not None: v = getattr(v, get_attr, None)