fix: Danbooru download Cloudflare 403 error

y-young · Jan 8, 2024 · 7c0df16 · 7c0df16
1 parent aabab6d
commit 7c0df16
Show file tree

Hide file tree

Showing 11 changed files with 190 additions and 51 deletions.
diff --git a/Dockerfile b/Dockerfile
@@ -1,12 +1,18 @@
 ARG PYTHON_VERSION=3.8
 
+ARG CURL_IMPERSONATE_VERSION=0.5-chrome
+FROM lwthiker/curl-impersonate:${CURL_IMPERSONATE_VERSION} as curl
+
 # Builder
 FROM python:${PYTHON_VERSION}-alpine as builder
 
-RUN apk add --update git build-base libffi-dev
+RUN apk add --update git build-base libffi-dev curl-dev
 
 WORKDIR /root
 
+COPY --from=curl /usr/local/bin/curl_* /usr/local/bin/
+COPY --from=curl /usr/local/lib/ /usr/local/lib/
+
 # Install requirements
 COPY requirements.txt /root
 RUN pip install --prefix="/install" --no-warn-script-location -r requirements.txt
@@ -34,9 +40,18 @@ RUN apk add --no-cache curl
 # Install FFmpeg
 COPY --from=builder /usr/local/bin/ffmpeg /usr/local/bin/
 
+# cURL Impersonate libraries
+COPY --from=builder /usr/local/bin/curl_* /usr/local/bin/
+COPY --from=builder /usr/local/lib/libcurl-* /usr/local/lib/
+
 # Copy pip requirements
 COPY --from=builder /install /usr/local
 
+# Copy certifi's CA bundle into curl_cffi; this step can be removed once curl_cffi v0.6 is officially released
+RUN PYTHON_LIB_PATH="$(python -c "import site; print(site.getsitepackages()[0])")" && \
+    CA_FILE="$(python -c "import certifi; print(certifi.where())")" && \
+    cp "$CA_FILE" "$PYTHON_LIB_PATH/curl_cffi/"
+
 WORKDIR /app
 COPY nazurin ./nazurin
 

diff --git a/Dockerfile.debian b/Dockerfile.debian
@@ -1,12 +1,20 @@
 ARG PYTHON_VERSION=3.8
 
+ARG CURL_IMPERSONATE_VERSION=0.5-chrome
+FROM lwthiker/curl-impersonate:${CURL_IMPERSONATE_VERSION} as curl
+
 # Builder
 FROM python:${PYTHON_VERSION}-slim as builder
 
-RUN apt-get update && apt-get install -y --no-install-recommends git wget gcc xz-utils
+RUN apt-get update && \
+    apt-get install -y --no-install-recommends \
+    git wget gcc xz-utils libcurl4-openssl-dev
 
 WORKDIR /root
 
+COPY --from=curl /usr/local/bin/curl_* /usr/local/bin/
+COPY --from=curl /usr/local/lib/ /usr/local/lib/
+
 # Install requirements
 COPY requirements.txt /root
 RUN pip install --prefix="/install" --no-warn-script-location -r requirements.txt
@@ -34,9 +42,18 @@ RUN apt-get update && apt-get install -y --no-install-recommends curl
 # Install FFmpeg
 COPY --from=builder /usr/local/bin/ffmpeg /usr/local/bin/
 
+# cURL Impersonate libraries
+COPY --from=builder /usr/local/bin/curl_* /usr/local/bin/
+COPY --from=builder /usr/local/lib/libcurl-* /usr/local/lib/
+
 # Copy pip requirements
 COPY --from=builder /install /usr/local
 
+# Copy certifi's CA bundle into curl_cffi; this step can be removed once curl_cffi v0.6 is officially released
+RUN PYTHON_LIB_PATH="$(python -c "import site; print(site.getsitepackages()[0])")" && \
+    CA_FILE="$(python -c "import certifi; print(certifi.where())")" && \
+    cp "$CA_FILE" "$PYTHON_LIB_PATH/curl_cffi/"
+
 WORKDIR /app
 COPY nazurin ./nazurin
 

diff --git a/nazurin/config.py b/nazurin/config.py
@@ -1,50 +1,51 @@
 from os import path
+from typing import List, Optional
 
 from environs import Env
 
 env = Env()
 # read config from .env file if exists
 env.read_env()
 
-ENV = env.str("ENV", default="production")
-TOKEN = env.str("TOKEN")
+ENV: str = env.str("ENV", default="production")
+TOKEN: str = env.str("TOKEN")
 
 # Webhook url, eg: https://xxx.fly.dev/, should end with '/'
-WEBHOOK_URL = env.str("WEBHOOK_URL", default=None)
-HOST = env.str("HOST", default="0.0.0.0")
+WEBHOOK_URL: Optional[str] = env.str("WEBHOOK_URL", default=None)
+HOST: str = env.str("HOST", default="0.0.0.0")
 # Port is automatically set if on Heroku or fly.io
-PORT = env.int("PORT", default=80)
+PORT: int = env.int("PORT", default=80)
 
-STORAGE = env.list("STORAGE", subcast=str, default=["Local"])
-STORAGE_DIR = env.str("STORAGE_DIR", default="Pictures")
+STORAGE: List[str] = env.list("STORAGE", subcast=str, default=["Local"])
+STORAGE_DIR: str = env.str("STORAGE_DIR", default="Pictures")
 
-DATABASE = env.str("DATABASE", default="Local")
+DATABASE: str = env.str("DATABASE", default="Local")
 # Nazurin data collection in database
-NAZURIN_DATA = "nazurin"
+NAZURIN_DATA: str = "nazurin"
 # Ignored items in image caption
-CAPTION_IGNORE = env.list("CAPTION_IGNORE", subcast=str, default=[])
+CAPTION_IGNORE: List[str] = env.list("CAPTION_IGNORE", subcast=str, default=[])
 
-GALLERY_ID = env.int("GALLERY_ID", default=None)
+GALLERY_ID: Optional[int] = env.int("GALLERY_ID", default=None)
 
-ADMIN_ID = env.int("ADMIN_ID")
-IS_PUBLIC = env.bool("IS_PUBLIC", default=False)
+ADMIN_ID: int = env.int("ADMIN_ID")
+IS_PUBLIC: bool = env.bool("IS_PUBLIC", default=False)
 # If IS_PUBLIC is True, the following items will be ignored
-ALLOW_ID = env.list("ALLOW_ID", subcast=int, default=[])
-ALLOW_USERNAME = env.list("ALLOW_USERNAME", default=[])
-ALLOW_GROUP = env.list("ALLOW_GROUP", subcast=int, default=[])
-
-RETRIES = env.int("RETRIES", default=5)
-TIMEOUT = env.int("TIMEOUT", default=20)
-DOWNLOAD_CHUNK_SIZE = env.int("DOWNLOAD_CHUNK_SIZE", default=4096)
-PROXY = env.str("HTTP_PROXY", default=None)
-UA = (
+ALLOW_ID: List[int] = env.list("ALLOW_ID", subcast=int, default=[])
+ALLOW_USERNAME: List[str] = env.list("ALLOW_USERNAME", default=[])
+ALLOW_GROUP: List[int] = env.list("ALLOW_GROUP", subcast=int, default=[])
+
+RETRIES: int = env.int("RETRIES", default=5)
+TIMEOUT: int = env.int("TIMEOUT", default=20)
+DOWNLOAD_CHUNK_SIZE: int = env.int("DOWNLOAD_CHUNK_SIZE", default=4096)
+PROXY: Optional[str] = env.str("HTTP_PROXY", default=None)
+UA: str = (
     "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
     "AppleWebKit/537.36 (KHTML, like Gecko) "
     "Chrome/120.0.0.0 Safari/537.36"
 )
 
 # Local directory to store database and temporary files
-DATA_DIR = "data"
-TEMP_DIR = path.join(DATA_DIR, "temp")
-CLEANUP_INTERVAL = env.int("CLEANUP_INTERVAL", default=7)
-ACCESS_LOG_FORMAT = '%a "%r" %s %b "%{Referer}i" "%{User-Agent}i"'
+DATA_DIR: str = "data"
+TEMP_DIR: str = path.join(DATA_DIR, "temp")
+CLEANUP_INTERVAL: int = env.int("CLEANUP_INTERVAL", default=7)
+ACCESS_LOG_FORMAT: str = '%a "%r" %s %b "%{Referer}i" "%{User-Agent}i"'
diff --git a/nazurin/models/file.py b/nazurin/models/file.py
@@ -4,16 +4,16 @@
 
 import aiofiles
 import aiofiles.os
-import aiohttp
 
-from nazurin.config import DOWNLOAD_CHUNK_SIZE, STORAGE_DIR, TEMP_DIR
+from nazurin.config import STORAGE_DIR, TEMP_DIR
 from nazurin.utils import logger
 from nazurin.utils.decorators import network_retry
 from nazurin.utils.helpers import (
     ensure_existence_async,
     sanitize_filename,
     sanitize_path,
 )
+from nazurin.utils.network import NazurinRequestSession
 
 
 @dataclass
@@ -63,15 +63,11 @@ async def exists(self) -> bool:
         return False
 
     @network_retry
-    async def download(self, session: aiohttp.ClientSession):
+    async def download(self, session: NazurinRequestSession):
         if await self.exists():
             logger.info("File {} already exists", self.path)
             return True
         await ensure_existence_async(TEMP_DIR)
-        async with session.get(self.url) as response:
-            logger.info("Downloading {} to {}...", self.url, self.path)
-            response.raise_for_status()
-            async with aiofiles.open(self.path, "wb") as f:
-                async for chunk in response.content.iter_chunked(DOWNLOAD_CHUNK_SIZE):
-                    await f.write(chunk)
+        logger.info("Downloading {} to {}...", self.url, self.path)
+        await session.download(self.url, self.path)
         logger.info("Downloaded to {}", self.path)
diff --git a/nazurin/models/illust.py b/nazurin/models/illust.py
@@ -3,6 +3,7 @@
 from typing import List
 
 from nazurin.utils import Request
+from nazurin.utils.network import NazurinRequestSession
 
 from .caption import Caption
 from .file import File
@@ -26,8 +27,10 @@ def has_image(self) -> bool:
     def has_multiple_images(self) -> bool:
         return len(self.images) > 1
 
-    async def download(self, **kwargs):
-        async with Request(**kwargs) as session:
+    async def download(
+        self, *, request_class: NazurinRequestSession = Request, **kwargs
+    ):
+        async with request_class(**kwargs) as session:
             tasks = []
             for file in self.all_files:
                 if not file.url:

diff --git a/nazurin/models/image.py b/nazurin/models/image.py
@@ -98,7 +98,9 @@ async def download(self, session: aiohttp.ClientSession):
                 i + 1,
                 RETRIES,
             )
-            os.remove(self.path)
+            if i < RETRIES - 1:
+                # Keep the last one for debugging
+                os.remove(self.path)
         if not is_valid:
             raise NazurinError(
                 "Download failed with invalid image, please check logs for details"

diff --git a/nazurin/sites/danbooru/api.py b/nazurin/sites/danbooru/api.py
@@ -7,7 +7,8 @@
 from pybooru import Danbooru as danbooru
 from pybooru import PybooruHTTPError
 
-from nazurin.models import Caption, File, Illust, Image
+from nazurin.models import Caption, File, Image
+from nazurin.sites.danbooru.models import DanbooruIllust
 from nazurin.utils.decorators import async_wrap
 from nazurin.utils.exceptions import NazurinError
 from nazurin.utils.helpers import is_image
@@ -43,12 +44,12 @@ async def get_post(self, post_id: Optional[int] = None, md5: Optional[str] = Non
 
     async def view(
         self, post_id: Optional[int] = None, md5: Optional[str] = None
-    ) -> Illust:
+    ) -> DanbooruIllust:
         post = await self.get_post(post_id, md5)
         illust = self.parse_post(post)
         return illust
 
-    def parse_post(self, post) -> Illust:
+    def parse_post(self, post) -> DanbooruIllust:
         """Get images and build caption."""
         # Get images
         url = post["file_url"]
@@ -88,7 +89,7 @@ def parse_post(self, post) -> Illust:
                 "has_children": post["has_children"],
             }
         )
-        return Illust(imgs, caption, post, files)
+        return DanbooruIllust(imgs, caption, post, files)
 
     @staticmethod
     def get_storage_dest(post: dict, filename: str) -> Tuple[str, str]:

diff --git a/nazurin/sites/danbooru/models.py b/nazurin/sites/danbooru/models.py
@@ -0,0 +1,10 @@
+from dataclasses import dataclass
+
+from nazurin.models import Illust
+from nazurin.utils.network import CurlRequest
+
+
+@dataclass
+class DanbooruIllust(Illust):  # Illust that downloads via CurlRequest
+    async def download(self, **kwargs):  # kwargs are forwarded to Illust.download
+        await super().download(request_class=CurlRequest, **kwargs)
diff --git a/nazurin/utils/helpers.py b/nazurin/utils/helpers.py
@@ -208,6 +208,6 @@ def check_image(path: Union[str, os.PathLike]) -> bool:
             image = Image.open(path)
             image.load()
         return True
-    except Exception as error:
+    except OSError as error:
         logger.warning("Invalid image {}: {}", path, error)
         return False