Remove --playlist_items option

ieasybooks · Nov 1, 2024 · 5e6ce95 · 5e6ce95
1 parent 2fc1d3b
commit 5e6ce95
Show file tree

Hide file tree

Showing 6 changed files with 14 additions and 73 deletions.
diff --git a/README.en.md b/README.en.md
@@ -73,7 +73,6 @@
     <ul>
       <li>Links or file paths: Pass the links or file paths of the materials to be transcribed directly after the Tafrigh tool name. For example: <code>tafrigh "https://yout..." "https://yout..." "C:\Users\ieasybooks\lecture.wav"</code></li>
       <li>Skip transcription if output exists: Use the <code>--skip_if_output_exist</code> option to skip transcription if the required outputs already exist in the specified output folder</li>
-      <li>Specify items to transcribe from a playlist: You can specify a range of items to be transcribed from a playlist using the <code>--playlist_items</code> option by passing a value in the format <code>"[START]:[STOP][:STEP]"</code>. For example, passing <code>2:5</code> will download items from <code>2</code> to <code>5</code> from the playlist. This option affects all playlists passed as inputs to Tafrigh</li>
       <li>Number of download retries: If downloading a full playlist using the <code>yt-dlp</code> library, some items may fail to download. The <code>--download_retries</code> option can be used to specify the number of retry attempts if a download fails. The default value is <code>3</code></li>
     </ul>
   </li>
@@ -159,14 +158,10 @@
 
 ```
 ➜ tafrigh --help
-usage: tafrigh [-h] [--version] [--skip_if_output_exist | --no-skip_if_output_exist] [--playlist_items PLAYLIST_ITEMS]
-               [--download_retries DOWNLOAD_RETRIES] [--verbose | --no-verbose] [-m MODEL_NAME_OR_PATH] [-t {transcribe,translate}]
+usage: tafrigh [-h] [--version] [--skip_if_output_exist | --no-skip_if_output_exist] [--download_retries DOWNLOAD_RETRIES] [--verbose | --no-verbose] [-m MODEL_NAME_OR_PATH] [-t {transcribe,translate}]
                [-l {af,am,ar,as,az,ba,be,bg,bn,bo,br,bs,ca,cs,cy,da,de,el,en,es,et,eu,fa,fi,fo,fr,gl,gu,ha,haw,he,hi,hr,ht,hu,hy,id,is,it,ja,jw,ka,kk,km,kn,ko,la,lb,ln,lo,lt,lv,mg,mi,mk,ml,mn,mr,ms,mt,my,ne,nl,nn,no,oc,pa,pl,ps,pt,ro,ru,sa,sd,si,sk,sl,sn,so,sq,sr,su,sv,sw,ta,te,tg,th,tk,tl,tr,tt,uk,ur,uz,vi,yi,yo,zh}]
-               [--use_faster_whisper | --no-use_faster_whisper] [--beam_size BEAM_SIZE]
-               [--ct2_compute_type {default,int8,int8_float16,int16,float16}]
-               [-w WIT_CLIENT_ACCESS_TOKENS [WIT_CLIENT_ACCESS_TOKENS ...]] [--max_cutting_duration [1-17]]
-               [--min_words_per_segment MIN_WORDS_PER_SEGMENT] [--save_files_before_compact | --no-save_files_before_compact]
-               [--save_yt_dlp_responses | --no-save_yt_dlp_responses] [--output_sample OUTPUT_SAMPLE]
+               [--use_faster_whisper | --no-use_faster_whisper] [--beam_size BEAM_SIZE] [--ct2_compute_type {default,int8,int8_float16,int16,float16}] [-w WIT_CLIENT_ACCESS_TOKENS [WIT_CLIENT_ACCESS_TOKENS ...]] [--max_cutting_duration [1-17]]
+               [--min_words_per_segment MIN_WORDS_PER_SEGMENT] [--save_files_before_compact | --no-save_files_before_compact] [--save_yt_dlp_responses | --no-save_yt_dlp_responses] [--output_sample OUTPUT_SAMPLE]
                [-f {all,txt,srt,vtt,csv,tsv,json,none} [{all,txt,srt,vtt,csv,tsv,json,none} ...]] [-o OUTPUT_DIR]
                urls_or_paths [urls_or_paths ...]
 
@@ -178,8 +173,6 @@ Input:
   urls_or_paths         Video/Playlist URLs or local folder/file(s) to transcribe.
   --skip_if_output_exist, --no-skip_if_output_exist
                         Whether to skip generating the output if the output file already exists.
-  --playlist_items PLAYLIST_ITEMS
-                        Comma separated playlist_index of the items to download. You can specify a range using "[START]:[STOP][:STEP]".
   --download_retries DOWNLOAD_RETRIES
                         Number of retries for yt-dlp downloads that fail.
   --verbose, --no-verbose
@@ -201,22 +194,19 @@ Whisper:
 
 Wit:
   -w WIT_CLIENT_ACCESS_TOKENS [WIT_CLIENT_ACCESS_TOKENS ...], --wit_client_access_tokens WIT_CLIENT_ACCESS_TOKENS [WIT_CLIENT_ACCESS_TOKENS ...]
-                        List of wit.ai client access tokens. If provided, wit.ai APIs will be used to do the transcription, otherwise
-                        whisper will be used.
+                        List of wit.ai client access tokens. If provided, wit.ai APIs will be used to do the transcription, otherwise whisper will be used.
   --max_cutting_duration [1-17]
                         The maximum allowed cutting duration. It should be between 1 and 17.
 
 Output:
   --min_words_per_segment MIN_WORDS_PER_SEGMENT
-                        The minimum number of words should appear in each transcript segment. Any segment have words count less than
-                        this threshold will be merged with the next one. Pass 0 to disable this behavior.
+                        The minimum number of words should appear in each transcript segment. Any segment have words count less than this threshold will be merged with the next one. Pass 0 to disable this behavior.
   --save_files_before_compact, --no-save_files_before_compact
                         Saves the output files before applying the compact logic that is based on --min_words_per_segment.
   --save_yt_dlp_responses, --no-save_yt_dlp_responses
                         Whether to save the yt-dlp library JSON responses or not.
   --output_sample OUTPUT_SAMPLE
-                        Samples random compacted segments from the output and generates a CSV file contains the sampled data. Pass 0 to
-                        disable this behavior.
+                        Samples random compacted segments from the output and generates a CSV file contains the sampled data. Pass 0 to disable this behavior.
   -f {all,txt,srt,vtt,csv,tsv,json,none} [{all,txt,srt,vtt,csv,tsv,json,none} ...], --output_formats {all,txt,srt,vtt,csv,tsv,json,none} [{all,txt,srt,vtt,csv,tsv,json,none} ...]
                         Format of the output file; if not specified, all available formats will be produced.
   -o OUTPUT_DIR, --output_dir OUTPUT_DIR
@@ -321,7 +311,6 @@ if __name__ == '__main__':
     input=Config.Input(
       urls_or_paths=['https://youtu.be/qFsUwp5iomU'],
       skip_if_output_exist=False,
-      playlist_items='',
       download_retries=3,
       verbose=False,
     ),

diff --git a/README.md b/README.md
@@ -72,7 +72,6 @@
     <ul dir="rtl">
       <li>الروابط أو مسارات الملفات: يجب تمرير الروابط أو مسارات الملفات للمواد المُراد تفريغها بعد اسم أداة تفريغ بشكل مباشر. على سبيل المثال: <code dir="ltr">tafrigh "https://yout..." "https://yout..." "C:\Users\ieasybooks\lecture.wav"</code></li>
       <li>تخطي عملية التفريغ في حال وجود المخرجات مسبقًا: يمكن تمرير الاختيار <code dir="ltr">--skip_if_output_exist</code> لتخطي عملية التفريغ إذا كانت المخرجات المطلوبة موجودة بالفعل في مجلد الإخراج المحدد</li>
-      <li>المواد المُراد تفريفها من قائمة التشغيل: يمكن تحديد نطاق معين من المواد ليتم تفريغه من قائمة التشغيل من خلال الاختيار <code dir="ltr">--playlist_items</code> من خلال تمرير قيمة على صيغة <code dir="ltr">"[START]:[STOP][:STEP]"</code>. على سبيل المثال، عند تمرير <code dir="ltr">2:5</code> سيتم تنزيل المواد من <code>2</code> إلى <code>5</code> من قائمة التشغيل. هذا الاختيار يُؤثّر على كل قوائم التشغيل التي يتم تمريرها كمدخلات لتفريغ</li>
       <li>عدد مرات محاولة إعادة تحميل المواد: قد يفشل تحميل بعض المواد عند تحميل قائمة تشغيل كاملة باستخدام مكتبة <code dir="ltr">yt-dlp</code>، يمكن من خلال الاختيار <code dir="ltr">--download_retries</code> تحديد عدد مرات محاولة إعادة التحميل في حال فشل تحميل إحدى المواد. القيمة الافتراضية هي <code dir="ltr">3</code></li>
     </ul>
   </li>
@@ -158,14 +157,10 @@
 
 ```
 ➜ tafrigh --help
-usage: tafrigh [-h] [--version] [--skip_if_output_exist | --no-skip_if_output_exist] [--playlist_items PLAYLIST_ITEMS]
-               [--download_retries DOWNLOAD_RETRIES] [--verbose | --no-verbose] [-m MODEL_NAME_OR_PATH] [-t {transcribe,translate}]
+usage: tafrigh [-h] [--version] [--skip_if_output_exist | --no-skip_if_output_exist] [--download_retries DOWNLOAD_RETRIES] [--verbose | --no-verbose] [-m MODEL_NAME_OR_PATH] [-t {transcribe,translate}]
                [-l {af,am,ar,as,az,ba,be,bg,bn,bo,br,bs,ca,cs,cy,da,de,el,en,es,et,eu,fa,fi,fo,fr,gl,gu,ha,haw,he,hi,hr,ht,hu,hy,id,is,it,ja,jw,ka,kk,km,kn,ko,la,lb,ln,lo,lt,lv,mg,mi,mk,ml,mn,mr,ms,mt,my,ne,nl,nn,no,oc,pa,pl,ps,pt,ro,ru,sa,sd,si,sk,sl,sn,so,sq,sr,su,sv,sw,ta,te,tg,th,tk,tl,tr,tt,uk,ur,uz,vi,yi,yo,zh}]
-               [--use_faster_whisper | --no-use_faster_whisper] [--beam_size BEAM_SIZE]
-               [--ct2_compute_type {default,int8,int8_float16,int16,float16}]
-               [-w WIT_CLIENT_ACCESS_TOKENS [WIT_CLIENT_ACCESS_TOKENS ...]] [--max_cutting_duration [1-17]]
-               [--min_words_per_segment MIN_WORDS_PER_SEGMENT] [--save_files_before_compact | --no-save_files_before_compact]
-               [--save_yt_dlp_responses | --no-save_yt_dlp_responses] [--output_sample OUTPUT_SAMPLE]
+               [--use_faster_whisper | --no-use_faster_whisper] [--beam_size BEAM_SIZE] [--ct2_compute_type {default,int8,int8_float16,int16,float16}] [-w WIT_CLIENT_ACCESS_TOKENS [WIT_CLIENT_ACCESS_TOKENS ...]] [--max_cutting_duration [1-17]]
+               [--min_words_per_segment MIN_WORDS_PER_SEGMENT] [--save_files_before_compact | --no-save_files_before_compact] [--save_yt_dlp_responses | --no-save_yt_dlp_responses] [--output_sample OUTPUT_SAMPLE]
                [-f {all,txt,srt,vtt,csv,tsv,json,none} [{all,txt,srt,vtt,csv,tsv,json,none} ...]] [-o OUTPUT_DIR]
                urls_or_paths [urls_or_paths ...]
 
@@ -177,8 +172,6 @@ Input:
   urls_or_paths         Video/Playlist URLs or local folder/file(s) to transcribe.
   --skip_if_output_exist, --no-skip_if_output_exist
                         Whether to skip generating the output if the output file already exists.
-  --playlist_items PLAYLIST_ITEMS
-                        Comma separated playlist_index of the items to download. You can specify a range using "[START]:[STOP][:STEP]".
   --download_retries DOWNLOAD_RETRIES
                         Number of retries for yt-dlp downloads that fail.
   --verbose, --no-verbose
@@ -200,22 +193,19 @@ Whisper:
 
 Wit:
   -w WIT_CLIENT_ACCESS_TOKENS [WIT_CLIENT_ACCESS_TOKENS ...], --wit_client_access_tokens WIT_CLIENT_ACCESS_TOKENS [WIT_CLIENT_ACCESS_TOKENS ...]
-                        List of wit.ai client access tokens. If provided, wit.ai APIs will be used to do the transcription, otherwise
-                        whisper will be used.
+                        List of wit.ai client access tokens. If provided, wit.ai APIs will be used to do the transcription, otherwise whisper will be used.
   --max_cutting_duration [1-17]
                         The maximum allowed cutting duration. It should be between 1 and 17.
 
 Output:
   --min_words_per_segment MIN_WORDS_PER_SEGMENT
-                        The minimum number of words should appear in each transcript segment. Any segment have words count less than
-                        this threshold will be merged with the next one. Pass 0 to disable this behavior.
+                        The minimum number of words should appear in each transcript segment. Any segment have words count less than this threshold will be merged with the next one. Pass 0 to disable this behavior.
   --save_files_before_compact, --no-save_files_before_compact
                         Saves the output files before applying the compact logic that is based on --min_words_per_segment.
   --save_yt_dlp_responses, --no-save_yt_dlp_responses
                         Whether to save the yt-dlp library JSON responses or not.
   --output_sample OUTPUT_SAMPLE
-                        Samples random compacted segments from the output and generates a CSV file contains the sampled data. Pass 0 to
-                        disable this behavior.
+                        Samples random compacted segments from the output and generates a CSV file contains the sampled data. Pass 0 to disable this behavior.
   -f {all,txt,srt,vtt,csv,tsv,json,none} [{all,txt,srt,vtt,csv,tsv,json,none} ...], --output_formats {all,txt,srt,vtt,csv,tsv,json,none} [{all,txt,srt,vtt,csv,tsv,json,none} ...]
                         Format of the output file; if not specified, all available formats will be produced.
   -o OUTPUT_DIR, --output_dir OUTPUT_DIR
@@ -320,7 +310,6 @@ if __name__ == '__main__':
     input=Config.Input(
       urls_or_paths=['https://youtu.be/qFsUwp5iomU'],
       skip_if_output_exist=False,
-      playlist_items='',
       download_retries=3,
       verbose=False,
     ),

diff --git a/tafrigh/cli.py b/tafrigh/cli.py
@@ -42,7 +42,6 @@ def main():
     input=Config.Input(
       urls_or_paths=args.urls_or_paths,
       skip_if_output_exist=args.skip_if_output_exist,
-      playlist_items=args.playlist_items,
       download_retries=args.download_retries,
       verbose=args.verbose,
     ),
@@ -193,7 +192,7 @@ def process_url(
   config: Config,
   progress_info: dict,
 ) -> Generator[tuple[dict[str, Any], list[SegmentType]], None, None]:
-  url_data = Downloader(playlist_items=config.input.playlist_items, output_dir=config.output.output_dir).download(
+  url_data = Downloader(output_dir=config.output.output_dir).download(
     url,
     retries=config.input.download_retries,
     save_response=config.output.save_yt_dlp_responses,

diff --git a/tafrigh/config.py b/tafrigh/config.py
@@ -20,13 +20,11 @@ def __init__(
       self,
       urls_or_paths: list[str],
       skip_if_output_exist: bool,
-      playlist_items: str,
       download_retries: int,
       verbose: bool,
     ):
       self.urls_or_paths = urls_or_paths
       self.skip_if_output_exist = skip_if_output_exist
-      self.playlist_items = playlist_items
       self.download_retries = download_retries
       self.verbose = verbose
 

diff --git a/tafrigh/downloader.py b/tafrigh/downloader.py
@@ -7,8 +7,7 @@
 
 
 class Downloader:
-  def __init__(self, playlist_items: str, output_dir: str):
-    self.playlist_items = playlist_items
+  def __init__(self, output_dir: str):
     self.output_dir = output_dir
 
     self._initialize_youtube_dl_with_archive()
@@ -44,7 +43,6 @@ def _config(self, **kwargs: Any) -> dict[str, Any]:
       'format': 'bestaudio',
       'ignoreerrors': True,
       'outtmpl': os.path.join(self.output_dir, '%(id)s.%(ext)s'),
-      'playlist_items': self.playlist_items,
       'postprocessors': [
         {
           'key': 'FFmpegExtractAudio',

diff --git a/tafrigh/utils/cli_utils.py b/tafrigh/utils/cli_utils.py
@@ -5,16 +5,6 @@
 from tafrigh.types.transcript_type import TranscriptType
 
 
-PLAYLIST_ITEMS_RE = re.compile(
-  r'''(?x)
-      (?P<start>[+-]?\d+)?
-      (?P<range>[:-]
-          (?P<end>[+-]?\d+|inf(?:inite)?)?
-          (?::(?P<step>[+-]?\d+))?
-      )?'''
-)
-
-
 def parse_args(argv: list[str]) -> argparse.Namespace:
   parser = argparse.ArgumentParser()
 
@@ -39,12 +29,6 @@ def parse_args(argv: list[str]) -> argparse.Namespace:
     help='Whether to skip generating the output if the output file already exists.',
   )
 
-  input_group.add_argument(
-    '--playlist_items',
-    type=parse_playlist_items,
-    help='Comma separated playlist_index of the items to download. You can specify a range using "[START]:[STOP][:STEP]".',
-  )
-
   input_group.add_argument(
     '--download_retries',
     type=int,
@@ -181,22 +165,6 @@ def parse_args(argv: list[str]) -> argparse.Namespace:
   return parser.parse_args(argv)
 
 
-def parse_playlist_items(arg_value: str) -> str:
-  for segment in arg_value.split(','):
-    if not segment:
-      raise ValueError('There is two or more consecutive commas.')
-
-    mobj = PLAYLIST_ITEMS_RE.fullmatch(segment)
-    if not mobj:
-      raise ValueError(f'{segment!r} is not a valid specification.')
-
-    _, _, step, _ = mobj.group('start', 'end', 'step', 'range')
-    if int_or_none(step) == 0:
-      raise ValueError(f'Step in {segment!r} cannot be zero.')
-
-  return arg_value
-
-
 def int_or_none(v, scale=1, default=None, get_attr=None, invscale=1):
   if get_attr and v is not None:
     v = getattr(v, get_attr, None)