From 2a209efb089dbf47d64e8a59c85b9d06b79abd86 Mon Sep 17 00:00:00 2001 From: Ali Hamdi Ali Fadel Date: Fri, 1 Nov 2024 16:59:17 +0000 Subject: [PATCH] Add --yt_dlp_options option --- README.en.md | 5 ++++- README.md | 5 ++++- pyproject.toml | 2 +- tafrigh/cli.py | 3 ++- tafrigh/config.py | 2 ++ tafrigh/downloader.py | 4 +++- tafrigh/utils/cli_utils.py | 7 +++++++ 7 files changed, 23 insertions(+), 5 deletions(-) diff --git a/README.en.md b/README.en.md index 178a65b..94f39c0 100644 --- a/README.en.md +++ b/README.en.md @@ -74,6 +74,7 @@
  • Links or file paths: Pass the links or file paths of the materials to be transcribed directly after the Tafrigh tool name. For example: tafrigh "https://yout..." "https://yout..." "C:\Users\ieasybooks\lecture.wav"
  • Skip transcription if output exists: Use the --skip_if_output_exist option to skip transcription if the required outputs already exist in the specified output folder
  • Number of download retries: If downloading a full playlist using the yt-dlp library, some items may fail to download. The --download_retries option can be used to specify the number of retry attempts if a download fails. The default value is 3
  • +
  • Additional options for yt-dlp: Use the --yt_dlp_options option to pass additional options to the yt-dlp library as a valid JSON object. For example, to download only the first 10 items of a playlist, pass --yt_dlp_options '{"playlist_items": "1-10"}'
  • @@ -158,7 +159,7 @@ ``` ➜ tafrigh --help -usage: tafrigh [-h] [--version] [--skip_if_output_exist | --no-skip_if_output_exist] [--download_retries DOWNLOAD_RETRIES] [--verbose | --no-verbose] [-m MODEL_NAME_OR_PATH] [-t {transcribe,translate}] +usage: tafrigh [-h] [--version] [--skip_if_output_exist | --no-skip_if_output_exist] [--download_retries DOWNLOAD_RETRIES] [--yt_dlp_options YT_DLP_OPTIONS] [--verbose | --no-verbose] [-m MODEL_NAME_OR_PATH] [-t {transcribe,translate}] [-l {af,am,ar,as,az,ba,be,bg,bn,bo,br,bs,ca,cs,cy,da,de,el,en,es,et,eu,fa,fi,fo,fr,gl,gu,ha,haw,he,hi,hr,ht,hu,hy,id,is,it,ja,jw,ka,kk,km,kn,ko,la,lb,ln,lo,lt,lv,mg,mi,mk,ml,mn,mr,ms,mt,my,ne,nl,nn,no,oc,pa,pl,ps,pt,ro,ru,sa,sd,si,sk,sl,sn,so,sq,sr,su,sv,sw,ta,te,tg,th,tk,tl,tr,tt,uk,ur,uz,vi,yi,yo,zh}] [--use_faster_whisper | --no-use_faster_whisper] [--beam_size BEAM_SIZE] [--ct2_compute_type {default,int8,int8_float16,int16,float16}] [-w WIT_CLIENT_ACCESS_TOKENS [WIT_CLIENT_ACCESS_TOKENS ...]] [--max_cutting_duration [1-17]] [--min_words_per_segment MIN_WORDS_PER_SEGMENT] [--save_files_before_compact | --no-save_files_before_compact] [--save_yt_dlp_responses | --no-save_yt_dlp_responses] [--output_sample OUTPUT_SAMPLE] @@ -175,6 +176,8 @@ Input: Whether to skip generating the output if the output file already exists. --download_retries DOWNLOAD_RETRIES Number of retries for yt-dlp downloads that fail. + --yt_dlp_options YT_DLP_OPTIONS + Additional options to pass to yt-dlp in valid JSON format (e.g. `'{"playlist_items": "1-10"}'`). --verbose, --no-verbose Whether to print out the progress and debug messages. diff --git a/README.md b/README.md index c244393..b6d0cd3 100644 --- a/README.md +++ b/README.md @@ -73,6 +73,7 @@
  • الروابط أو مسارات الملفات: يجب تمرير الروابط أو مسارات الملفات للمواد المُراد تفريغها بعد اسم أداة تفريغ بشكل مباشر. على سبيل المثال: tafrigh "https://yout..." "https://yout..." "C:\Users\ieasybooks\lecture.wav"
  • تخطي عملية التفريغ في حال وجود المخرجات مسبقًا: يمكن تمرير الاختيار --skip_if_output_exist لتخطي عملية التفريغ إذا كانت المخرجات المطلوبة موجودة بالفعل في مجلد الإخراج المحدد
  • عدد مرات محاولة إعادة تحميل المواد: قد يفشل تحميل بعض المواد عند تحميل قائمة تشغيل كاملة باستخدام مكتبة yt-dlp، يمكن من خلال الاختيار --download_retries تحديد عدد مرات محاولة إعادة التحميل في حال فشل تحميل إحدى المواد. القيمة الافتراضية هي 3
  • +
  • إضافة الخيارات المخصصة لمكتبة yt-dlp: يمكنك تمرير الاختيار --yt_dlp_options لإضافة الخيارات المخصصة لمكتبة yt-dlp في صيغة JSON صحيحة. على سبيل المثال، لتحميل أول 10 مواد فقط من قائمة تشغيل، يمكنك تمرير --yt_dlp_options '{"playlist_items": "1-10"}'
  • @@ -157,7 +158,7 @@ ``` ➜ tafrigh --help -usage: tafrigh [-h] [--version] [--skip_if_output_exist | --no-skip_if_output_exist] [--download_retries DOWNLOAD_RETRIES] [--verbose | --no-verbose] [-m MODEL_NAME_OR_PATH] [-t {transcribe,translate}] +usage: tafrigh [-h] [--version] [--skip_if_output_exist | --no-skip_if_output_exist] [--download_retries DOWNLOAD_RETRIES] [--yt_dlp_options YT_DLP_OPTIONS] [--verbose | --no-verbose] [-m MODEL_NAME_OR_PATH] [-t {transcribe,translate}] [-l {af,am,ar,as,az,ba,be,bg,bn,bo,br,bs,ca,cs,cy,da,de,el,en,es,et,eu,fa,fi,fo,fr,gl,gu,ha,haw,he,hi,hr,ht,hu,hy,id,is,it,ja,jw,ka,kk,km,kn,ko,la,lb,ln,lo,lt,lv,mg,mi,mk,ml,mn,mr,ms,mt,my,ne,nl,nn,no,oc,pa,pl,ps,pt,ro,ru,sa,sd,si,sk,sl,sn,so,sq,sr,su,sv,sw,ta,te,tg,th,tk,tl,tr,tt,uk,ur,uz,vi,yi,yo,zh}] [--use_faster_whisper | --no-use_faster_whisper] [--beam_size BEAM_SIZE] [--ct2_compute_type {default,int8,int8_float16,int16,float16}] [-w WIT_CLIENT_ACCESS_TOKENS [WIT_CLIENT_ACCESS_TOKENS ...]] [--max_cutting_duration [1-17]] [--min_words_per_segment MIN_WORDS_PER_SEGMENT] [--save_files_before_compact | --no-save_files_before_compact] [--save_yt_dlp_responses | --no-save_yt_dlp_responses] [--output_sample OUTPUT_SAMPLE] @@ -174,6 +175,8 @@ Input: Whether to skip generating the output if the output file already exists. --download_retries DOWNLOAD_RETRIES Number of retries for yt-dlp downloads that fail. + --yt_dlp_options YT_DLP_OPTIONS + Additional options to pass to yt-dlp in valid JSON format (e.g. `'{"playlist_items": "1-10"}'`). --verbose, --no-verbose Whether to print out the progress and debug messages. diff --git a/pyproject.toml b/pyproject.toml index 1dcb496..5110857 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "tafrigh" -version = "1.6.2" +version = "1.7.0" description = "تفريغ النصوص وإنشاء ملفات SRT و VTT باستخدام نماذج Whisper وتقنية wit.ai." 
authors = ["EasyBooks "] license = "MIT" diff --git a/tafrigh/cli.py b/tafrigh/cli.py index fb2668c..9ca3517 100644 --- a/tafrigh/cli.py +++ b/tafrigh/cli.py @@ -43,6 +43,7 @@ def main(): urls_or_paths=args.urls_or_paths, skip_if_output_exist=args.skip_if_output_exist, download_retries=args.download_retries, + yt_dlp_options=args.yt_dlp_options, verbose=args.verbose, ), whisper=Config.Whisper( @@ -192,7 +193,7 @@ def process_url( config: Config, progress_info: dict, ) -> Generator[tuple[dict[str, Any], list[SegmentType]], None, None]: - url_data = Downloader(output_dir=config.output.output_dir).download( + url_data = Downloader(yt_dlp_options=config.input.yt_dlp_options, output_dir=config.output.output_dir).download( url, retries=config.input.download_retries, save_response=config.output.save_yt_dlp_responses, diff --git a/tafrigh/config.py b/tafrigh/config.py index 42cc9c4..6b4417d 100644 --- a/tafrigh/config.py +++ b/tafrigh/config.py @@ -21,11 +21,13 @@ def __init__( urls_or_paths: list[str], skip_if_output_exist: bool, download_retries: int, + yt_dlp_options: str, verbose: bool, ): self.urls_or_paths = urls_or_paths self.skip_if_output_exist = skip_if_output_exist self.download_retries = download_retries + self.yt_dlp_options = yt_dlp_options self.verbose = verbose class Whisper: diff --git a/tafrigh/downloader.py b/tafrigh/downloader.py index 674ef55..3b7f12f 100644 --- a/tafrigh/downloader.py +++ b/tafrigh/downloader.py @@ -7,7 +7,8 @@ class Downloader: - def __init__(self, output_dir: str): + def __init__(self, yt_dlp_options: str, output_dir: str): + self.yt_dlp_options = yt_dlp_options self.output_dir = output_dir self._initialize_youtube_dl_with_archive() @@ -54,6 +55,7 @@ def _config(self, **kwargs: Any) -> dict[str, Any]: } config.update(kwargs) + config.update(json.loads(self.yt_dlp_options)) return config diff --git a/tafrigh/utils/cli_utils.py b/tafrigh/utils/cli_utils.py index d4bd996..0b1b80d 100644 --- a/tafrigh/utils/cli_utils.py +++ 
b/tafrigh/utils/cli_utils.py @@ -36,6 +36,13 @@ def parse_args(argv: list[str]) -> argparse.Namespace: help="Number of retries for yt-dlp downloads that fail.", ) + input_group.add_argument( + '--yt_dlp_options', + type=str, + default='{}', + help="Additional options to pass to yt-dlp in valid JSON format (e.g. `'{\"playlist_items\": \"1-10\"}'`).", + ) + input_group.add_argument( '--verbose', action=argparse.BooleanOptionalAction,