Skip to content

Commit

Permalink
refactor: convert other into dataclass
Browse files Browse the repository at this point in the history
  • Loading branch information
yzqzss committed Apr 8, 2024
1 parent 0923364 commit 14fa468
Show file tree
Hide file tree
Showing 7 changed files with 104 additions and 90 deletions.
2 changes: 1 addition & 1 deletion wikiteam3/dumpgenerator/api/get_json.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ def get_JSON(request: requests.Response):
# request.encoding = request.apparent_encoding
try:
return request.json()
except:
except Exception:
# Maybe an older API version which did not return correct JSON
print("Error: Could not parse JSON")
return {}
43 changes: 22 additions & 21 deletions wikiteam3/dumpgenerator/cli/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@
import re
import sys
import traceback
from typing import Dict, Tuple
from typing import Tuple

import requests
from requests.adapters import DEFAULT_RETRIES as REQUESTS_DEFAULT_RETRIES
Expand All @@ -22,7 +22,7 @@
)
from wikiteam3.dumpgenerator.api.index_check import check_index
from wikiteam3.dumpgenerator.cli.delay import Delay
from wikiteam3.dumpgenerator.config import Config, new_config
from wikiteam3.dumpgenerator.config import Config, OtherConfig, new_config
from wikiteam3.dumpgenerator.version import getVersion
from wikiteam3.utils import (
get_random_UserAgent,
Expand Down Expand Up @@ -276,7 +276,7 @@ def checkParameters(args=argparse.Namespace()) -> bool:

return passed

def get_parameters(params=None) -> Tuple[Config, Dict]:
def get_parameters(params=None) -> Tuple[Config, OtherConfig]:
# if not params:
# params = sys.argv

Expand Down Expand Up @@ -535,24 +535,25 @@ def sleep(self, response=None):
"retries": int(args.retries),
})

other = {
"resume": args.resume,
"force": args.force,
"session": session,
"stdout_log_path": args.stdout_log_path,
"bypass_cdn_image_compression": args.bypass_cdn_image_compression,
"add_referer_header": args.add_referer_header,
"image_timestamp_interval": args.image_timestamp_interval,
"ia_wbm_booster": args.ia_wbm_booster,

"assert_max_pages": args.assert_max_pages,
"assert_max_edits": args.assert_max_edits,
"assert_max_images": args.assert_max_images,
"assert_max_images_bytes": args.assert_max_images_bytes,

"upload": args.upload,
"uploader_args": args.uploader_args,
}

other = OtherConfig(
resume = args.resume,
force = args.force,
session = session,
stdout_log_path = args.stdout_log_path,
bypass_cdn_image_compression = args.bypass_cdn_image_compression,
add_referer_header = args.add_referer_header,
image_timestamp_interval = args.image_timestamp_interval,
ia_wbm_booster = args.ia_wbm_booster,

assert_max_pages = args.assert_max_pages,
assert_max_edits = args.assert_max_edits,
assert_max_images = args.assert_max_images,
assert_max_images_bytes = args.assert_max_images_bytes,

upload = args.upload,
uploader_args = args.uploader_args,
)

# calculating path, if not defined by user with --path=
if not config.path:
Expand Down
28 changes: 26 additions & 2 deletions wikiteam3/dumpgenerator/config.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
import dataclasses
import json
from typing import List
from typing import List, Optional

import requests


def _dataclass_from_dict(klass_or_obj, d: dict):
Expand Down Expand Up @@ -90,4 +92,26 @@ def save_config(config: Config, config_filename: str):
"""Save config file"""

with open(f"{config.path}/{config_filename}", "w", encoding="utf-8") as outfile:
json.dump(dataclasses.asdict(config), outfile, indent=4, sort_keys=True)
json.dump(dataclasses.asdict(config), outfile, indent=4, sort_keys=True)


@dataclasses.dataclass
class OtherConfig:
    """Runtime-only options parsed from the CLI.

    Unlike `Config`, these are never serialized to the on-disk dump config
    (they hold non-persistable state such as the live `requests.Session`).
    """
    resume: bool  # resume a previously interrupted dump in an existing path
    force: bool  # skip safety checks (e.g. the "recent IA upload exists" abort)
    session: requests.Session  # shared HTTP session used for all requests
    stdout_log_path: Optional[str]  # if set, tee stdout into this log file
    bypass_cdn_image_compression: bool  # add a cache-busting query param to image URLs (Cloudflare Polish bypass)
    add_referer_header: Optional[str]
    '''None, "auto", {URL}'''
    image_timestamp_interval: Optional[str]
    ''' 2019-01-02T01:36:06Z/2023-08-12T10:36:06Z '''
    ia_wbm_booster: int  # 0 = off; 1/2/3 select a Wayback Machine snapshot strategy (earliest/latest/best)

    # Dump-size assertions checked against siteinfo / the images list;
    # None disables the corresponding check.
    assert_max_pages: Optional[int]
    assert_max_edits: Optional[int]
    assert_max_images: Optional[int]
    assert_max_images_bytes: Optional[int]

    upload: bool  # after a successful dump, invoke `wikiteam3.uploader` on the dump path
    uploader_args: List[str]  # extra CLI arguments forwarded to the uploader subprocess
52 changes: 26 additions & 26 deletions wikiteam3/dumpgenerator/dump/generator.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@

from file_read_backwards import FileReadBackwards

from wikiteam3.dumpgenerator.config import load_config, save_config
from wikiteam3.dumpgenerator.config import OtherConfig, load_config, save_config
from wikiteam3.dumpgenerator.config import Config
from wikiteam3.dumpgenerator.cli import get_parameters, bye, welcome
from wikiteam3.dumpgenerator.dump.image.image import FILENAME_LIMIT, Image
Expand Down Expand Up @@ -56,12 +56,12 @@ def __init__(params=None):
config, other = get_parameters(params=params)
avoid_WikiMedia_projects(config=config, other=other)

with (Tee(other["stdout_log_path"]) if other["stdout_log_path"] is not None else contextlib.nullcontext()):
with (Tee(other.stdout_log_path) if other.stdout_log_path else contextlib.nullcontext()):
print(welcome())
print("Analysing %s" % (config.api if config.api else config.index))

# do not enter if resume is requested from begining
while not other["resume"] and os.path.isdir(config.path):
while not other.resume and os.path.isdir(config.path):
print('\nWarning!: "%s" path exists' % (config.path))
reply = "y" if config.failfast else ""
while reply.lower()[:1] not in ["y", "n"]:
Expand All @@ -79,45 +79,45 @@ def __init__(params=None):
print("No config file found. I can't resume. Aborting.")
sys.exit(1)
print("You have selected: YES")
other["resume"] = True
other.resume = True
break
elif reply == "n":
print("You have selected: NO.\nbye.")
# other["resume"] = False
# other.resume = False
sys.exit(0)

if asserts_enabled := [(arg, v) for arg, v in other.items() if arg.startswith("assert_") and v is not None]:
site_info = get_siteinfo(config=config, session=other["session"])
if asserts_enabled := [(arg, v) for arg, v in other.__dict__.items() if arg.startswith("assert_") and v is not None]:
site_info = get_siteinfo(config=config, session=other.session)
assert_siteinfo(site_info, other)
[print(f"--{arg}: {v}, passed") for arg, v in asserts_enabled]

if other["resume"]:
if other.resume:
print("Loading config file to resume...")
config = load_config(config=config, config_filename=config_filename)
else:
if not other['force'] and any_recent_ia_item_exists(config, days=365):
if not other.force and any_recent_ia_item_exists(config, days=365):
print("A dump of this wiki was uploaded to IA in the last 365 days. Aborting.")
sys.exit(88)

os.mkdir(config.path)
save_config(config=config, config_filename=config_filename)

if other["resume"]:
if other.resume:
DumpGenerator.resumePreviousDump(config=config, other=other)
else:
DumpGenerator.createNewDump(config=config, other=other)

if config.index:
save_IndexPHP(config=config, session=other["session"])
save_SpecialVersion(config=config, session=other["session"])
save_IndexPHP(config=config, session=other.session)
save_SpecialVersion(config=config, session=other.session)
if config.api:
save_siteinfo(config=config, session=other["session"])
save_siteinfo(config=config, session=other.session)

mark_as_done(config=config, mark=ALL_DUMPED_MARK)
bye(config.path)
if other["upload"]:
if other.upload:
print('Calling uploader... (--upload)')
retcode = subprocess.call([sys.executable, '-m', 'wikiteam3.uploader', config.path] + other["uploader_args"],
retcode = subprocess.call([sys.executable, '-m', 'wikiteam3.uploader', config.path] + other.uploader_args,
shell=False)
if retcode:
print(f'--upload: Failed: {retcode}')
Expand All @@ -126,25 +126,25 @@ def __init__(params=None):
print('--upload: Done')

@staticmethod
def createNewDump(config: Config, other: Dict):
def createNewDump(config: Config, other: OtherConfig):
# we do lazy title dumping here :)
images = []
print("Trying generating a new dump into a new directory...")
if config.xml:
generate_XML_dump(config=config, session=other["session"])
check_XML_integrity(config=config, session=other["session"])
generate_XML_dump(config=config, session=other.session)
check_XML_integrity(config=config, session=other.session)
if config.images:
images += Image.get_image_names(config=config, session=other["session"])
images += Image.get_image_names(config=config, session=other.session)
Image.save_image_names(config=config, other=other, images=images)
Image.generate_image_dump(
config=config, other=other, images=images, session=other["session"]
config=config, other=other, images=images, session=other.session
)
if config.logs:
pass # TODO
# save_SpecialLog(config=config, session=other["session"])
# save_SpecialLog(config=config, session=other.session)

@staticmethod
def resumePreviousDump(config: Config, other: Dict):
def resumePreviousDump(config: Config, other: OtherConfig):
images = []
print("Resuming previous dump process...")
if config.xml:
Expand Down Expand Up @@ -188,13 +188,13 @@ def resumePreviousDump(config: Config, other: Dict):
print('Resuming XML dump from "%s" (revision id %s)' % (last_xml_title, last_xml_revid))
generate_XML_dump(
config=config,
session=other["session"],
session=other.session,
resume=True,
)
else:
# corrupt? only has XML header?
print("XML is corrupt? Regenerating...")
generate_XML_dump(config=config, session=other["session"])
generate_XML_dump(config=config, session=other.session)

if config.images:
# load images list
Expand All @@ -219,7 +219,7 @@ def resumePreviousDump(config: Config, other: Dict):
print("Image list is incomplete. Reloading...")
# do not resume, reload, to avoid inconsistences, deleted images or
# so
images = Image.get_image_names(config=config, session=other["session"])
images = Image.get_image_names(config=config, session=other.session)
Image.save_image_names(config=config, other=other, images=images)
# checking images directory
files = set()
Expand Down Expand Up @@ -282,7 +282,7 @@ def resumePreviousDump(config: Config, other: Dict):
config=config,
other=other,
images=images,
session=other["session"],
session=other.session,
)

if config.logs:
Expand Down
49 changes: 21 additions & 28 deletions wikiteam3/dumpgenerator/dump/image/image.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@

from wikiteam3.dumpgenerator.api import get_JSON, handle_StatusCode
from wikiteam3.dumpgenerator.cli import Delay
from wikiteam3.dumpgenerator.config import Config
from wikiteam3.dumpgenerator.config import Config, OtherConfig
from wikiteam3.dumpgenerator.dump.image.html_regexs import R_NEXT, REGEX_CANDIDATES
from wikiteam3.dumpgenerator.exceptions import FileSha1Error, FileSizeError
from wikiteam3.dumpgenerator.log import log_error
Expand Down Expand Up @@ -42,18 +42,13 @@ def check_response(r: requests.Response) -> None:
class Image:

@staticmethod
def generate_image_dump(config: Config, other: Dict, images: List[List],
def generate_image_dump(config: Config, other: OtherConfig, images: List[List],
session: requests.Session):
""" Save files and descriptions using a file list """

bypass_cdn_image_compression: bool = other["bypass_cdn_image_compression"]
image_timestamp_interval: Optional[str] = other["image_timestamp_interval"]
ia_wbm_booster: int = other["ia_wbm_booster"]
add_referer_header: Optional[str] = other["add_referer_header"] # None, "auto", {URL}

image_timestamp_intervals = None
if image_timestamp_interval: # 2019-01-02T01:36:06Z/2023-08-12T10:36:06Z
image_timestamp_intervals = image_timestamp_interval.split("/")
if other.image_timestamp_interval:
image_timestamp_intervals = other.image_timestamp_interval.split("/")
assert len(image_timestamp_intervals) == 2
image_timestamp_intervals = [
datetime.datetime.strptime(x, "%Y-%m-%dT%H:%M:%SZ")
Expand Down Expand Up @@ -88,7 +83,7 @@ def modify_params(params: Optional[Dict] = None) -> Dict:
""" bypass Cloudflare Polish (image optimization) """
if params is None:
params = {}
if bypass_cdn_image_compression is True:
if other.bypass_cdn_image_compression is True:
# bypass Cloudflare Polish (image optimization)
# <https://developers.cloudflare.com/images/polish/>
params["_wiki_t"] = int(time.time()*1000)
Expand All @@ -100,11 +95,11 @@ def modify_headers(headers: Optional[Dict] = None) -> Dict:
""" add HTTP Referer header """
if headers is None:
headers = {}
if add_referer_header:
if other.add_referer_header:
url = config.index if config.index else config.api
parsed_url = urllib.parse.urlparse(
add_referer_header
if add_referer_header != "auto"
other.add_referer_header
if other.add_referer_header != "auto"
else url
)

Expand Down Expand Up @@ -144,10 +139,10 @@ def modify_headers(headers: Optional[Dict] = None) -> Dict:
<= datetime.datetime.strptime(timestamp, "%Y-%m-%dT%H:%M:%SZ")
<= image_timestamp_intervals[1]
):
print(f" timestamp {timestamp} is not in interval {image_timestamp_interval}: {filename_underscore}")
print(f" timestamp {timestamp} is not in interval {other.image_timestamp_interval}: {filename_underscore}")
continue
else:
print(f" timestamp {timestamp} is in interval {image_timestamp_interval}: {filename_underscore}")
print(f" timestamp {timestamp} is in interval {other.image_timestamp_interval}: {filename_underscore}")

# saving file
if filename_underscore != urllib.parse.unquote(filename_underscore):
Expand Down Expand Up @@ -188,21 +183,21 @@ def modify_headers(headers: Optional[Dict] = None) -> Dict:
url = url_raw

r: Optional[requests.Response] = None
if ia_wbm_booster:
if other.ia_wbm_booster:
def get_ia_wbm_response() -> Optional[requests.Response]:
""" Get response from Internet Archive Wayback Machine
return None if not found / failed """
if ia_wbm_booster in (WBM_EARLIEST, WBN_LATEST):
ia_timestamp = ia_wbm_booster
elif ia_wbm_booster == WBM_BEST:
if other.ia_wbm_booster in (WBM_EARLIEST, WBN_LATEST):
ia_timestamp = other.ia_wbm_booster
elif other.ia_wbm_booster == WBM_BEST:
if timestamp != NULL:
ia_timestamp = [x for x in timestamp if x.isdigit()][0:8]
ia_timestamp = "".join(ia_timestamp)
else:
print(f"ia_wbm_booster: timestamp is {NULL}, use latest timestamp")
ia_timestamp = 2
else:
raise ValueError(f"ia_wbm_booster is {ia_wbm_booster}, but it should be 0, 1, 2 or 3")
raise ValueError(f"ia_wbm_booster is {other.ia_wbm_booster}, but it should be 0, 1, 2 or 3")

available_api = "http://archive.org/wayback/available"
# TODO: cdx_api = "http://web.archive.org/cdx/search/cdx"
Expand Down Expand Up @@ -358,7 +353,7 @@ def get_ia_wbm_response() -> Optional[requests.Response]:
patch_sess.release()
print(f"Downloaded {c_savedImageFiles} files to 'images' dir")
print(f"Downloaded {c_savedMismatchImageFiles} files to 'images_mismatch' dir")
if ia_wbm_booster and c_wbm_speedup_files:
if other.ia_wbm_booster and c_wbm_speedup_files:
print(f"(WBM speedup: {c_wbm_speedup_files} files)")


Expand Down Expand Up @@ -646,7 +641,7 @@ def get_image_names_API(config: Config, session: requests.Session):


@staticmethod
def save_image_names(config: Config, other: Dict, images: List[List]):
def save_image_names(config: Config, other: OtherConfig, images: List[List]):
"""Save image list in a file, including filename, url, uploader and other metadata"""

images_filename = "{}-{}-images.txt".format(
Expand Down Expand Up @@ -681,13 +676,11 @@ def save_image_names(config: Config, other: Dict, images: List[List]):
print("Image metadata (images.txt) saved at:", images_filename)
print(f"Estimated size of all images (images.txt): {c_images_size} bytes ({c_images_size/1024/1024/1024:.2f} GiB)")

assert_max_images: Optional[int] = other["assert_max_images"]
assert_max_images_bytes: Optional[int] = other["assert_max_images_bytes"]
try:
assert len(images) <= assert_max_images if assert_max_images is not None else True
print(f"--assert_max_images: {assert_max_images}, passed")
assert c_images_size <= assert_max_images_bytes if assert_max_images_bytes is not None else True
print(f"--assert_max_images_bytes: {assert_max_images_bytes}, passed")
assert len(images) <= other.assert_max_images if other.assert_max_images is not None else True
print(f"--assert_max_images: {other.assert_max_images}, passed")
assert c_images_size <= other.assert_max_images_bytes if other.assert_max_images_bytes is not None else True
print(f"--assert_max_images_bytes: {other.assert_max_images_bytes}, passed")
except AssertionError:
import traceback
traceback.print_exc()
Expand Down
Loading

0 comments on commit 14fa468

Please sign in to comment.