Remove --playlist_items and --download_retries and improve the downloader
ieasybooks · Oct 30, 2024 · 8f78721 · 8f78721
1 parent f0b3fc7
commit 8f78721
Show file tree

Hide file tree

Showing 7 changed files with 54 additions and 107 deletions.
diff --git a/README.en.md b/README.en.md
@@ -73,8 +73,6 @@
     <ul>
       <li>Links or file paths: Pass the links or file paths of the materials to be transcribed directly after the Tafrigh tool name. For example: <code>tafrigh "https://yout..." "https://yout..." "C:\Users\ieasybooks\leactue.wav"</code></li>
       <li>Skip transcription if output exists: Use the <code>--skip_if_output_exist</code> option to skip transcription if the required outputs already exist in the specified output folder</li>
-      <li>Specify items to transcribe from a playlist: You can specify a range of items to be transcribed from a playlist using the <code>--playlist_items</code> option by passing a value in the format <code>"[START]:[STOP][:STEP]"</code>. For example, passing <code>2:5</code> will download items from <code>2</code> to <code>5</code> from the playlist. This option affects all playlists passed as inputs to Tafrigh</li>
-      <li>Number of download retries: If downloading a full playlist using the <code>yt-dlp</code> library, some items may fail to download. The <code>--download_retries</code> option can be used to specify the number of retry attempts if a download fails. The default value is <code>3</code></li>
     </ul>
   </li>
 
@@ -321,8 +319,6 @@ if __name__ == '__main__':
     input=Config.Input(
       urls_or_paths=['https://youtu.be/qFsUwp5iomU'],
       skip_if_output_exist=False,
-      playlist_items='',
-      download_retries=3,
       verbose=False,
     ),
     whisper=Config.Whisper(

diff --git a/README.md b/README.md
@@ -72,8 +72,6 @@
     <ul dir="rtl">
       <li>الروابط أو مسارات الملفات: يجب تمرير الروابط أو مسارات الملفات للمواد المُراد تفريغها بعد اسم أداة تفريغ بشكل مباشر. على سبيل المثال: <code dir="ltr">tafrigh "https://yout..." "https://yout..." "C:\Users\ieasybooks\leactue.wav"</code></li>
       <li>تخطي عملية التفريغ في حال وجود المخرجات مسبقًا: يمكن تمرير الاختيار <code dir="ltr">--skip_if_output_exist</code> لتخطي عملية التفريغ إذا كانت المخرجات المطلوبة موجودة بالفعل في مجلد الإخراج المحدد</li>
-      <li>المواد المُراد تفريفها من قائمة التشغيل: يمكن تحديد نطاق معين من المواد ليتم تفريغه من قائمة التشغيل من خلال الاختيار <code dir="ltr">--playlist_items</code> من خلال تمرير قيمة على صيغة <code dir="ltr">"[START]:[STOP][:STEP]"</code>. على سبيل المثال، عند تمرير <code dir="ltr">2:5</code> سيتم تنزيل المواد من <code>2</code> إلى <code>5</code> من قائمة التشغيل. هذا الاختيار يُؤثّر على كل قوائم التشغيل التي يتم تمريرها كمدخلات لتفريغ</li>
-      <li>عدد مرات محاولة إعادة تحميل المواد: قد يفشل تحميل بعض المواد عند تحميل قائمة تشغيل كاملة باستخدام مكتبة <code dir="ltr">yt-dlp</code>، يمكن من خلال الاختيار <code dir="ltr">--download_retries</code> تحديد عدد مرات محاولة إعادة التحميل في حال فشل تحميل إحدى المواد. القيمة الافتراضية هي <code dir="ltr">3</code></li>
     </ul>
   </li>
 
@@ -320,8 +318,6 @@ if __name__ == '__main__':
     input=Config.Input(
       urls_or_paths=['https://youtu.be/qFsUwp5iomU'],
       skip_if_output_exist=False,
-      playlist_items='',
-      download_retries=3,
       verbose=False,
     ),
     whisper=Config.Whisper(

diff --git a/pyproject.toml b/pyproject.toml
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "tafrigh"
-version = "1.6.2"
+version = "2.0.0"
 description = "تفريغ النصوص وإنشاء ملفات SRT و VTT باستخدام نماذج Whisper وتقنية wit.ai."
 authors = ["EasyBooks <[email protected]>"]
 license = "MIT"

diff --git a/tafrigh/cli.py b/tafrigh/cli.py
@@ -42,8 +42,6 @@ def main():
     input=Config.Input(
       urls_or_paths=args.urls_or_paths,
       skip_if_output_exist=args.skip_if_output_exist,
-      playlist_items=args.playlist_items,
-      download_retries=args.download_retries,
       verbose=args.verbose,
     ),
     whisper=Config.Whisper(
@@ -193,9 +191,8 @@ def process_url(
   config: Config,
   progress_info: dict,
 ) -> Generator[tuple[dict[str, Any], list[SegmentType]], None, None]:
-  url_data = Downloader(playlist_items=config.input.playlist_items, output_dir=config.output.output_dir).download(
+  url_data = Downloader(output_dir=config.output.output_dir).download(
     url,
-    retries=config.input.download_retries,
     save_response=config.output.save_yt_dlp_responses,
   )
 

diff --git a/tafrigh/config.py b/tafrigh/config.py
@@ -20,14 +20,10 @@ def __init__(
       self,
       urls_or_paths: list[str],
       skip_if_output_exist: bool,
-      playlist_items: str,
-      download_retries: int,
       verbose: bool,
     ):
       self.urls_or_paths = urls_or_paths
       self.skip_if_output_exist = skip_if_output_exist
-      self.playlist_items = playlist_items
-      self.download_retries = download_retries
       self.verbose = verbose
 
   class Whisper:

diff --git a/tafrigh/downloader.py b/tafrigh/downloader.py
@@ -7,22 +7,44 @@
 
 
 class Downloader:
-  def __init__(self, playlist_items: str, output_dir: str):
-    self.playlist_items = playlist_items
+  def __init__(self, output_dir: str):
     self.output_dir = output_dir
-    self._initialize_youtube_dl_with_archive()
-    self._initialize_youtube_dl_without_archive()
 
-  def _config(self, download_archive: str | bool) -> dict[str, Any]:
-    return {
+    self._initialize_youtube_dl()
+
+  def download(self, url: str, save_response: bool = False) -> dict[str, Any]:
+    url_data: dict[str, Any] = {}
+
+    while True:
+      old_mp3_count = self._mp3_count()
+      current_url_data = self.youtube_dl.extract_info(url)
+
+      if old_mp3_count == self._mp3_count():
+        break
+      else:
+        if url_data == {}:
+          url_data = current_url_data
+        else:
+          url_data = self._merge_yt_dlp_responses(url_data, current_url_data)
+
+      self._initialize_youtube_dl()
+
+    if save_response:
+      self._save_response(url_data)
+
+    return url_data
+
+  def _initialize_youtube_dl(self) -> None:
+    self.youtube_dl = yt_dlp.YoutubeDL(self._config(download_archive=os.path.join(self.output_dir, 'archive.txt')))
+
+  def _config(self, **kwargs: Any) -> dict[str, Any]:
+    config = {
       'quiet': True,
       'verbose': False,
       'format': 'bestaudio',
       'extract_audio': True,
       'outtmpl': os.path.join(self.output_dir, '%(id)s.%(ext)s'),
       'ignoreerrors': True,
-      'download_archive': download_archive,
-      'playlist_items': self.playlist_items,
       'postprocessors': [
         {
           'key': 'FFmpegExtractAudio',
@@ -31,56 +53,36 @@ def _config(self, download_archive: str | bool) -> dict[str, Any]:
       ],
     }
 
-  def download(self, url: str, retries: int = 3, save_response: bool = False) -> dict[str, Any]:
-    while True:
-      self.youtube_dl_with_archive.download(url)
-      url_data = self.youtube_dl_without_archive.extract_info(url, download=False)
-
-      if retries <= 0 or not self._should_retry(url_data):
-        break
+    config.update(kwargs)
 
-      self._initialize_youtube_dl_with_archive()
-      retries -= 1
+    return config
 
-    if save_response:
-      self._save_response(url_data)
-
-    return url_data
+  def _mp3_count(self) -> int:
+    return len([file_name for file_name in os.listdir(self.output_dir) if file_name.endswith('.mp3')])
 
-  def _initialize_youtube_dl_with_archive(self) -> None:
-    self.youtube_dl_with_archive = yt_dlp.YoutubeDL(self._config(os.path.join(self.output_dir, 'archive.txt')))
+  def _save_response(self, url_data: dict[str, Any]) -> None:
+    file_path = os.path.join(self.output_dir, f"{url_data['id']}.json")
 
-  def _initialize_youtube_dl_without_archive(self) -> None:
-    self.youtube_dl_without_archive = yt_dlp.YoutubeDL(self._config(False))
+    if os.path.exists(file_path):
+      old_url_data = json.load(open(file_path, encoding='utf-8'))
 
-  def _should_retry(self, url_data: dict[str, Any]) -> bool:
-    def file_exists(file_name: str) -> bool:
-      extensions = ['mp3', 'wav', 'm4a', 'webm']
-      return any(os.path.exists(os.path.join(self.output_dir, f"{file_name}.{ext}")) for ext in extensions)
+      url_data = self._merge_yt_dlp_responses(old_url_data, url_data)
 
-    if '_type' in url_data and url_data['_type'] == 'playlist':
-      for entry in url_data['entries']:
-        if entry and not file_exists(entry['id']):
-          return True
-    else:
-      if not file_exists(url_data['id']):
-        return True
+    with open(file_path, 'w', encoding='utf-8') as fp:
+      json.dump(self.youtube_dl.sanitize_info(url_data), fp, indent=2, ensure_ascii=False)
 
-    return False
+  def _merge_yt_dlp_responses(self, old_response: dict[str, Any], new_response: dict[str, Any]) -> dict[str, Any]:
+    if 'entries' not in old_response.keys() or 'entries' not in new_response.keys():
+      raise ValueError('No entries found in the responses')
 
-  def _save_response(self, url_data: dict[str, Any]) -> None:
-    if '_type' in url_data and url_data['_type'] == 'playlist':
-      for entry in url_data['entries']:
-        if entry and 'requested_downloads' in entry:
-          self._remove_postprocessors(entry['requested_downloads'])
-    elif 'requested_downloads' in url_data:
-      self._remove_postprocessors(url_data['requested_downloads'])
+    seen_ids = set()
+    unique_entries = []
 
-    file_path = os.path.join(self.output_dir, f"{url_data['id']}.json")
+    for entry in old_response['entries'] + new_response['entries']:
+      if entry['id'] not in seen_ids:
+        seen_ids.add(entry['id'])
+        unique_entries.append(entry)
 
-    with open(file_path, 'w', encoding='utf-8') as fp:
-      json.dump(url_data, fp, indent=2, ensure_ascii=False)
+    new_response['entries'] = unique_entries
 
-  def _remove_postprocessors(self, requested_downloads: list[dict[str, Any]]) -> None:
-    for requested_download in requested_downloads:
-      requested_download.pop('__postprocessors')
+    return new_response
diff --git a/tafrigh/utils/cli_utils.py b/tafrigh/utils/cli_utils.py
@@ -1,20 +1,9 @@
 import argparse
 import importlib.metadata
-import re
 
 from tafrigh.types.transcript_type import TranscriptType
 
 
-PLAYLIST_ITEMS_RE = re.compile(
-  r'''(?x)
-      (?P<start>[+-]?\d+)?
-      (?P<range>[:-]
-          (?P<end>[+-]?\d+|inf(?:inite)?)?
-          (?::(?P<step>[+-]?\d+))?
-      )?'''
-)
-
-
 def parse_args(argv: list[str]) -> argparse.Namespace:
   parser = argparse.ArgumentParser()
 
@@ -39,19 +28,6 @@ def parse_args(argv: list[str]) -> argparse.Namespace:
     help='Whether to skip generating the output if the output file already exists.',
   )
 
-  input_group.add_argument(
-    '--playlist_items',
-    type=parse_playlist_items,
-    help='Comma separated playlist_index of the items to download. You can specify a range using "[START]:[STOP][:STEP]".',
-  )
-
-  input_group.add_argument(
-    '--download_retries',
-    type=int,
-    default=3,
-    help="Number of retries for yt-dlp downloads that fail.",
-  )
-
   input_group.add_argument(
     '--verbose',
     action=argparse.BooleanOptionalAction,
@@ -181,22 +157,6 @@ def parse_args(argv: list[str]) -> argparse.Namespace:
   return parser.parse_args(argv)
 
 
-def parse_playlist_items(arg_value: str) -> str:
-  for segment in arg_value.split(','):
-    if not segment:
-      raise ValueError('There is two or more consecutive commas.')
-
-    mobj = PLAYLIST_ITEMS_RE.fullmatch(segment)
-    if not mobj:
-      raise ValueError(f'{segment!r} is not a valid specification.')
-
-    _, _, step, _ = mobj.group('start', 'end', 'step', 'range')
-    if int_or_none(step) == 0:
-      raise ValueError(f'Step in {segment!r} cannot be zero.')
-
-  return arg_value
-
-
 def int_or_none(v, scale=1, default=None, get_attr=None, invscale=1):
   if get_attr and v is not None:
     v = getattr(v, get_attr, None)