Skip to content

Commit

Permalink
fix: Danbooru download Cloudflare 403 error
Browse files Browse the repository at this point in the history
  • Loading branch information
y-young committed Jan 8, 2024
1 parent aabab6d commit 7c0df16
Show file tree
Hide file tree
Showing 11 changed files with 190 additions and 51 deletions.
17 changes: 16 additions & 1 deletion Dockerfile
Original file line number Diff line number Diff line change
@@ -1,12 +1,18 @@
ARG PYTHON_VERSION=3.8

ARG CURL_IMPERSONATE_VERSION=0.5-chrome
FROM lwthiker/curl-impersonate:${CURL_IMPERSONATE_VERSION} as curl

# Builder
FROM python:${PYTHON_VERSION}-alpine as builder

RUN apk add --update git build-base libffi-dev
RUN apk add --update git build-base libffi-dev curl-dev

WORKDIR /root

COPY --from=curl /usr/local/bin/curl_* /usr/local/bin/
COPY --from=curl /usr/local/lib/ /usr/local/lib/

# Install requirements
COPY requirements.txt /root
RUN pip install --prefix="/install" --no-warn-script-location -r requirements.txt
Expand Down Expand Up @@ -34,9 +40,18 @@ RUN apk add --no-cache curl
# Install FFmpeg
COPY --from=builder /usr/local/bin/ffmpeg /usr/local/bin/

# cURL Impersonate libraries
COPY --from=builder /usr/local/bin/curl_* /usr/local/bin/
COPY --from=builder /usr/local/lib/libcurl-* /usr/local/lib/

# Copy pip requirements
COPY --from=builder /install /usr/local

# Copy CA certificates for curl_cffi; this can be removed once v0.6 is officially released
RUN PYTHON_LIB_PATH="$(python -c "import site; print(site.getsitepackages()[0])")" &&\

Check notice on line 51 in Dockerfile

View check run for this annotation

Codacy Production / Codacy Static Code Analysis

Dockerfile#L51

Double quote to prevent globbing and word splitting.
CA_FILE="$(python -c "import certifi; print(certifi.where())")" && \
cp "$CA_FILE" "$PYTHON_LIB_PATH/curl_cffi/"

WORKDIR /app
COPY nazurin ./nazurin

Expand Down
19 changes: 18 additions & 1 deletion Dockerfile.debian
Original file line number Diff line number Diff line change
@@ -1,12 +1,20 @@
ARG PYTHON_VERSION=3.8

ARG CURL_IMPERSONATE_VERSION=0.5-chrome
FROM lwthiker/curl-impersonate:${CURL_IMPERSONATE_VERSION} as curl

# Builder
FROM python:${PYTHON_VERSION}-slim as builder

RUN apt-get update && apt-get install -y --no-install-recommends git wget gcc xz-utils
RUN apt-get update && \
apt-get install -y --no-install-recommends \
git wget gcc xz-utils libcurl4-openssl-dev

WORKDIR /root

COPY --from=curl /usr/local/bin/curl_* /usr/local/bin/
COPY --from=curl /usr/local/lib/ /usr/local/lib/

# Install requirements
COPY requirements.txt /root
RUN pip install --prefix="/install" --no-warn-script-location -r requirements.txt
Expand Down Expand Up @@ -34,9 +42,18 @@ RUN apt-get update && apt-get install -y --no-install-recommends curl
# Install FFmpeg
COPY --from=builder /usr/local/bin/ffmpeg /usr/local/bin/

# cURL Impersonate libraries
COPY --from=builder /usr/local/bin/curl_* /usr/local/bin/
COPY --from=builder /usr/local/lib/libcurl-* /usr/local/lib/

# Copy pip requirements
COPY --from=builder /install /usr/local

# Copy CA certificates for curl_cffi; this can be removed once v0.6 is officially released.
# The expansions are double-quoted so that a path containing spaces or glob
# characters is passed to cp as a single, literal argument.
RUN PYTHON_LIB_PATH="$(python -c "import site; print(site.getsitepackages()[0])")" && \
CA_FILE="$(python -c "import certifi; print(certifi.where())")" && \
cp "$CA_FILE" "$PYTHON_LIB_PATH/curl_cffi/"

WORKDIR /app
COPY nazurin ./nazurin

Expand Down
53 changes: 27 additions & 26 deletions nazurin/config.py
Original file line number Diff line number Diff line change
@@ -1,50 +1,51 @@
from os import path
from typing import List, Optional

from environs import Env

env = Env()
# read config from .env file if exists
env.read_env()

ENV = env.str("ENV", default="production")
TOKEN = env.str("TOKEN")
ENV: str = env.str("ENV", default="production")
TOKEN: str = env.str("TOKEN")

# Webhook URL, e.g. https://xxx.fly.dev/ (should end with '/')
WEBHOOK_URL = env.str("WEBHOOK_URL", default=None)
HOST = env.str("HOST", default="0.0.0.0")
# No default value is configured, so this is None when WEBHOOK_URL is unset
WEBHOOK_URL: Optional[str] = env.str("WEBHOOK_URL", default=None)
HOST: str = env.str("HOST", default="0.0.0.0")
# Port is automatically set if on Heroku or fly.io
PORT = env.int("PORT", default=80)
PORT: int = env.int("PORT", default=80)

STORAGE = env.list("STORAGE", subcast=str, default=["Local"])
STORAGE_DIR = env.str("STORAGE_DIR", default="Pictures")
STORAGE: List[str] = env.list("STORAGE", subcast=str, default=["Local"])
STORAGE_DIR: str = env.str("STORAGE_DIR", default="Pictures")

DATABASE = env.str("DATABASE", default="Local")
DATABASE: str = env.str("DATABASE", default="Local")
# Nazurin data collection in database
NAZURIN_DATA = "nazurin"
NAZURIN_DATA: str = "nazurin"
# Ignored items in image caption
CAPTION_IGNORE = env.list("CAPTION_IGNORE", subcast=str, default=[])
CAPTION_IGNORE: List[str] = env.list("CAPTION_IGNORE", subcast=str, default=[])

GALLERY_ID = env.int("GALLERY_ID", default=None)
GALLERY_ID: Optional[int] = env.int("GALLERY_ID", default=None)

ADMIN_ID = env.int("ADMIN_ID")
IS_PUBLIC = env.bool("IS_PUBLIC", default=False)
ADMIN_ID: int = env.int("ADMIN_ID")
IS_PUBLIC: bool = env.bool("IS_PUBLIC", default=False)
# If IS_PUBLIC is True, the following items will be ignored
ALLOW_ID = env.list("ALLOW_ID", subcast=int, default=[])
ALLOW_USERNAME = env.list("ALLOW_USERNAME", default=[])
ALLOW_GROUP = env.list("ALLOW_GROUP", subcast=int, default=[])

RETRIES = env.int("RETRIES", default=5)
TIMEOUT = env.int("TIMEOUT", default=20)
DOWNLOAD_CHUNK_SIZE = env.int("DOWNLOAD_CHUNK_SIZE", default=4096)
PROXY = env.str("HTTP_PROXY", default=None)
UA = (
ALLOW_ID: List[int] = env.list("ALLOW_ID", subcast=int, default=[])
ALLOW_USERNAME: List[str] = env.list("ALLOW_USERNAME", default=[])
ALLOW_GROUP: List[int] = env.list("ALLOW_GROUP", subcast=int, default=[])

RETRIES: int = env.int("RETRIES", default=5)
TIMEOUT: int = env.int("TIMEOUT", default=20)
DOWNLOAD_CHUNK_SIZE: int = env.int("DOWNLOAD_CHUNK_SIZE", default=4096)
# Read from the HTTP_PROXY environment variable; None when it is unset
PROXY: Optional[str] = env.str("HTTP_PROXY", default=None)
UA: str = (
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
"AppleWebKit/537.36 (KHTML, like Gecko) "
"Chrome/120.0.0.0 Safari/537.36"
)

# Local directory to store database and temporary files
DATA_DIR = "data"
TEMP_DIR = path.join(DATA_DIR, "temp")
CLEANUP_INTERVAL = env.int("CLEANUP_INTERVAL", default=7)
ACCESS_LOG_FORMAT = '%a "%r" %s %b "%{Referer}i" "%{User-Agent}i"'
DATA_DIR: str = "data"
TEMP_DIR: str = path.join(DATA_DIR, "temp")
CLEANUP_INTERVAL: int = env.int("CLEANUP_INTERVAL", default=7)
ACCESS_LOG_FORMAT: str = '%a "%r" %s %b "%{Referer}i" "%{User-Agent}i"'
14 changes: 5 additions & 9 deletions nazurin/models/file.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,16 +4,16 @@

import aiofiles
import aiofiles.os
import aiohttp

from nazurin.config import DOWNLOAD_CHUNK_SIZE, STORAGE_DIR, TEMP_DIR
from nazurin.config import STORAGE_DIR, TEMP_DIR
from nazurin.utils import logger
from nazurin.utils.decorators import network_retry
from nazurin.utils.helpers import (
ensure_existence_async,
sanitize_filename,
sanitize_path,
)
from nazurin.utils.network import NazurinRequestSession


@dataclass
Expand Down Expand Up @@ -63,15 +63,11 @@ async def exists(self) -> bool:
return False

@network_retry
async def download(self, session: aiohttp.ClientSession):
async def download(self, session: NazurinRequestSession):
if await self.exists():
logger.info("File {} already exists", self.path)
return True
await ensure_existence_async(TEMP_DIR)
async with session.get(self.url) as response:
logger.info("Downloading {} to {}...", self.url, self.path)
response.raise_for_status()
async with aiofiles.open(self.path, "wb") as f:
async for chunk in response.content.iter_chunked(DOWNLOAD_CHUNK_SIZE):
await f.write(chunk)
logger.info("Downloading {} to {}...", self.url, self.path)
await session.download(self.url, self.path)
logger.info("Downloaded to {}", self.path)
7 changes: 5 additions & 2 deletions nazurin/models/illust.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
from typing import List

from nazurin.utils import Request
from nazurin.utils.network import NazurinRequestSession

from .caption import Caption
from .file import File
Expand All @@ -26,8 +27,10 @@ def has_image(self) -> bool:
def has_multiple_images(self) -> bool:
return len(self.images) > 1

async def download(self, **kwargs):
async with Request(**kwargs) as session:
async def download(
self, *, request_class: NazurinRequestSession = Request, **kwargs
):
async with request_class(**kwargs) as session:
tasks = []
for file in self.all_files:
if not file.url:
Expand Down
4 changes: 3 additions & 1 deletion nazurin/models/image.py
Original file line number Diff line number Diff line change
Expand Up @@ -98,7 +98,9 @@ async def download(self, session: aiohttp.ClientSession):
i + 1,
RETRIES,
)
os.remove(self.path)
if i < RETRIES - 1:
# Keep the last one for debugging
os.remove(self.path)
if not is_valid:
raise NazurinError(
"Download failed with invalid image, please check logs for details"
Expand Down
9 changes: 5 additions & 4 deletions nazurin/sites/danbooru/api.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,8 @@
from pybooru import Danbooru as danbooru
from pybooru import PybooruHTTPError

from nazurin.models import Caption, File, Illust, Image
from nazurin.models import Caption, File, Image
from nazurin.sites.danbooru.models import DanbooruIllust
from nazurin.utils.decorators import async_wrap
from nazurin.utils.exceptions import NazurinError
from nazurin.utils.helpers import is_image
Expand Down Expand Up @@ -43,12 +44,12 @@ async def get_post(self, post_id: Optional[int] = None, md5: Optional[str] = Non

async def view(
self, post_id: Optional[int] = None, md5: Optional[str] = None
) -> Illust:
) -> DanbooruIllust:
post = await self.get_post(post_id, md5)
illust = self.parse_post(post)
return illust

def parse_post(self, post) -> Illust:
def parse_post(self, post) -> DanbooruIllust:
"""Get images and build caption."""
# Get images
url = post["file_url"]
Expand Down Expand Up @@ -88,7 +89,7 @@ def parse_post(self, post) -> Illust:
"has_children": post["has_children"],
}
)
return Illust(imgs, caption, post, files)
return DanbooruIllust(imgs, caption, post, files)

@staticmethod
def get_storage_dest(post: dict, filename: str) -> Tuple[str, str]:
Expand Down
10 changes: 10 additions & 0 deletions nazurin/sites/danbooru/models.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
from dataclasses import dataclass

from nazurin.models import Illust
from nazurin.utils.network import CurlRequest


@dataclass
class DanbooruIllust(Illust):
    """Illust subclass for Danbooru that downloads using CurlRequest."""

    async def download(self, **kwargs) -> None:
        """Download via Illust.download with request_class set to CurlRequest.

        All keyword arguments are forwarded unchanged. Callers must not pass
        ``request_class`` themselves, since it is already supplied here.
        """
        await super().download(request_class=CurlRequest, **kwargs)
2 changes: 1 addition & 1 deletion nazurin/utils/helpers.py
Original file line number Diff line number Diff line change
Expand Up @@ -208,6 +208,6 @@ def check_image(path: Union[str, os.PathLike]) -> bool:
image = Image.open(path)
image.load()
return True
except Exception as error:
except OSError as error:
logger.warning("Invalid image {}: {}", path, error)
return False
Loading

0 comments on commit 7c0df16

Please sign in to comment.