-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Showing
10 changed files
with
420 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Empty file.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,28 @@ | ||
from dataclasses import dataclass | ||
|
||
@dataclass
class ResponseCheck:
    """Fields of the JSON object returned by the mobile API ``check`` request.

    The comment beside each field is a sample value or a short description.
    """

    discuzversion: str  # e.g. "X3.4"
    charset: str  # e.g. "utf-8"
    version: str  # e.g. "4"
    pluginversion: str  # e.g. "1.4.7"
    oemversion: str  # e.g. "0"
    regname: str  # e.g. "ref"
    qqconnect: str  # e.g. "0"
    sitename: str  # forum name
    mysiteid: str  # forum id (marked as uncertain by the original author)
    ucenterurl: str  # e.g. "https://avatar.elecfans.com/uc_server"
    setting: dict  # e.g. {'closeforumorderby': '0'}
    extends: dict  # e.g. {'used': None, 'lastupdate': None}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,43 @@ | ||
from dataclasses import dataclass | ||
|
||
@dataclass
class Catlist:
    """One entry of the ``catlist`` array in a forum-index response."""

    fid: str
    name: str
    # NOTE(review): annotated as str, but the name suggests a collection of
    # forum ids -- confirm against a real API response.
    forums: str
|
||
|
||
class Forumlist:
    """One entry of the ``forumlist`` array in a forum-index response.

    The annotations below list the expected keys; the constructor itself
    accepts any keyword arguments and stores each one as an attribute.
    """

    fid: str
    name: str
    threads: str  # inaccurate
    posts: str  # inaccurate
    todayposts: str
    description: str
    icon: str
    sublist: list

    def __init__(self, **kwargs):
        """Store every keyword argument as an instance attribute."""
        for attr_name in kwargs:
            setattr(self, attr_name, kwargs[attr_name])

    def __repr__(self):
        """Return ``<Forumlist k=v ...>`` listing the instance attributes."""
        pairs = (f"{key}={value}" for key, value in self.__dict__.items())
        joined = " ".join(pairs)
        return f"<Forumlist {joined}>"
|
||
@dataclass
class ResponseForumIndex:
    """Top-level JSON object of a forum-index response."""

    Version: str  # e.g. "4"
    Charset: str  # e.g. "UTF-8"
    Variables: dict

    @property
    def catlist(self):
        """Build a ``Catlist`` from each entry of ``Variables["catlist"]``."""
        categories = []
        for raw_category in self.Variables["catlist"]:
            categories.append(Catlist(**raw_category))
        return categories

    @property
    def forumlist(self):
        """Build a ``Forumlist`` from each entry of ``Variables["forumlist"]``."""
        forums = []
        for raw_forum in self.Variables["forumlist"]:
            forums.append(Forumlist(**raw_forum))
        return forums
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,29 @@ | ||
|
||
from dataclasses import dataclass | ||
from typing import Optional | ||
|
||
|
||
@dataclass
class Post:
    """A single post record; every required field is carried as a string."""

    # Required fields, in constructor order.
    pid: str
    tid: str
    first: str
    author: str
    authorid: str
    dateline: str
    message: str
    anonymous: str
    attachment: str
    status: str
    replycredit: str
    position: str
    groupid: str
    number: str
    dbdateline: str
    groupiconid: str

    # Optional fields; each defaults to None when not supplied.
    adminid: Optional[str] = None
    attachments: Optional[list] = None
    imagelist: Optional[list] = None
    memberstatus: Optional[str] = None
    username: Optional[str] = None
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,64 @@ | ||
from dataclasses import dataclass | ||
|
||
# { | ||
# "tid": "23816", | ||
# "typeid": "9", | ||
# "readperm": "0", | ||
# "price": "0", | ||
# "author": "gl695133087", | ||
# "authorid": "54639", | ||
# "subject": "关于一些homeassistant启动后掉线的经验", | ||
# "dateline": "2023-12-31", | ||
# "lastpost": "2023-12-31 11:15", | ||
# "lastposter": "gl695133087", | ||
# "views": "570", | ||
# "replies": "0", | ||
# "displayorder": "0", | ||
# "digest": "0", | ||
# "special": "0", | ||
# "attachment": "0", | ||
# "recommend_add": "0", | ||
# "replycredit": "0", | ||
# "dbdateline": "1703992521", | ||
# "dblastpost": "1703992521", | ||
# "rushreply": "0", | ||
# "reply": [ | ||
# { | ||
# "pid": "581729", | ||
# "author": "pangls", | ||
# "authorid": "82180", | ||
# "message": "百度胖老师吧上海宝山公安通河新村派出所民警欺负绑 架谋杀胖老师百度360搜索百度胖老师吧上海宝山公安通河 ..." | ||
# } | ||
# ] | ||
# }, | ||
@dataclass
class ThreadMeta:
    """Metadata for one thread.

    The annotations list the expected keys. The hand-written constructor
    accepts any keyword arguments and stores each as an attribute; it does
    not require that every annotated field is supplied.
    """

    tid: str
    typeid: str
    readperm: str
    price: str
    author: str
    authorid: str
    subject: str
    dateline: str
    lastpost: str
    lastposter: str
    views: str
    replies: str
    displayorder: str
    digest: str
    special: str
    attachment: str
    recommend_add: str
    replycredit: str
    dbdateline: str
    dblastpost: str
    rushreply: str
    reply: list

    def __init__(self, **kwargs):
        """Store every keyword argument as an instance attribute."""
        for attr_name in kwargs:
            setattr(self, attr_name, kwargs[attr_name])

    def __repr__(self):
        """Return ``<ThreadMeta k=v ...>`` listing the instance attributes."""
        pairs = (f"{key}={value}" for key, value in self.__dict__.items())
        joined = " ".join(pairs)
        return f"<ThreadMeta {joined}>"
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,119 @@ | ||
import asyncio | ||
import json | ||
from pathlib import Path | ||
from typing import Coroutine, Tuple | ||
|
||
import httpx | ||
from tqdm import tqdm | ||
|
||
from discuz_logger.define.forum_index import Forumlist | ||
from discuz_logger.define.thread import ThreadMeta | ||
from discuz_logger.mobile_api import MobileApi | ||
from discuz_logger.utils import APIHelper, arg_parser, json_dump | ||
|
||
async def forum_worker(queue: asyncio.Queue[Coroutine]):
    """Run coroutines taken from *queue*, one at a time, forever.

    Each queue item is awaited to completion. A coroutine that raises is
    reported and the worker moves on to the next item; ``task_done()`` is
    always called, so ``queue.join()`` cannot hang on a failed item.
    The loop only ends when the worker task is cancelled.
    """
    while True:
        task = await queue.get()
        try:
            await task
        except Exception as e:
            # Keep the worker alive: a dead worker would stop draining the queue.
            print(f"forum task failed: {e!r}")
        finally:
            queue.task_done()
|
||
|
||
async def _fetch_viewthread_with_retry(site, tid, page, attempts=3, delay=10):
    """Fetch one ``viewthread`` page, retrying on any exception.

    Returns the first element of the tuple returned by ``site.viewthread``.
    The request is issued at most *attempts* times, sleeping *delay* seconds
    between tries; the last failure is re-raised.
    """
    if attempts < 1:
        raise ValueError("attempts must be >= 1")
    for attempt in range(attempts):
        try:
            viewthread, _ = await site.viewthread(tid, page=page)
        except Exception as e:
            if attempt == attempts - 1:
                raise
            print(f"tid={tid} page={page} ,retry={attempt} error:{e}")
            await asyncio.sleep(delay)
        else:
            # Success: stop retrying instead of re-requesting the same page.
            return viewthread


async def _archive_thread(site, thread_meta, thread_dir, skip_existing):
    """Download every page of one thread into *thread_dir*.

    Each post is written to ``pid-<pid>.json`` and the thread record to
    ``thread.json``. When ``thread.json`` already exists the thread is
    skipped if *skip_existing* is true; otherwise its stored ``maxposition``
    is used to detect that nothing new was posted.
    """
    thread_dir.mkdir(parents=True, exist_ok=True)
    thread_json_path = thread_dir / "thread.json"

    # Start values are chosen so the fetch loop runs at least once.
    maxposition = maxposition_last_run = -1
    fetched_position = -2
    if thread_json_path.exists():
        if skip_existing:
            return
        thread = json.loads(thread_json_path.read_text(encoding="utf-8"))
        maxposition_last_run = int(thread["maxposition"])

    viewthread = None
    tqd = None
    page = 1
    try:
        while fetched_position < maxposition:
            viewthread = await _fetch_viewthread_with_retry(site, thread_meta.tid, page)
            maxposition = APIHelper.get_maxposition(viewthread)

            if maxposition == maxposition_last_run:
                # Same highest position as the previous run: nothing new.
                break

            posts = APIHelper.get_posts(viewthread)
            if len(posts) == 0:
                if int(thread_meta.readperm) == 0:
                    # Explicit error instead of `assert`, which -O strips.
                    raise RuntimeError(
                        f"tid={thread_meta.tid} page={page} returned no posts "
                        f"although readperm is 0"
                    )
                print(f"tid={thread_meta.tid} readperm:{thread_meta.readperm}")
                break

            if tqd is None:
                tqd = tqdm(desc=f"tid={thread_meta.tid} subj:{thread_meta.subject}", unit="posts", dynamic_ncols=True)

            for post, post_raw in posts:
                fetched_position = max(fetched_position, int(post.position))
                post_path = thread_dir / f"pid-{post.pid}.json"
                post_path.write_text(json_dump(post_raw), encoding="utf-8")

            if fetched_position < 0:
                fetched_position = 0

            tqd.total = maxposition
            tqd.n = fetched_position
            tqd.refresh()

            page += 1

        # The loop body always runs at least once, so a response was received.
        assert viewthread is not None
        thread_json_path.write_text(json_dump(APIHelper.get_thread(viewthread)), encoding="utf-8")
    finally:
        # Close the progress bar even when a fetch or write fails.
        if tqd is not None:
            tqd.close()


async def thread_worker(queue: asyncio.Queue[Tuple[MobileApi, ThreadMeta, Path]]):
    """Archive threads taken from *queue*, one at a time, forever.

    Each queue item is a ``(site, thread_meta, thread_dir)`` tuple. A thread
    that fails is reported and skipped; its ``thread.json`` is not written.
    ``task_done()`` is called for every item, including skipped and failed
    ones, so ``queue.join()`` can complete. The loop only ends when the
    worker task is cancelled.
    """
    # When true, a thread that already has thread.json is skipped outright.
    no_first_check = True
    while True:
        site, thread_meta, thread_dir = await queue.get()
        try:
            await _archive_thread(site, thread_meta, thread_dir, skip_existing=no_first_check)
        except Exception as e:
            # Keep the worker alive: a dead worker would stop draining the queue.
            print(f"tid={thread_meta.tid} failed, skipping: {e!r}")
        finally:
            queue.task_done()
|
||
|
||
async def _main():
    """Crawl the site given on the command line and archive its threads.

    Starts 5 forum workers and 3 thread workers, records the ``check``
    response under ``data/site/<mysiteid>/check.json``, then queues one
    thread-listing coroutine per forum and waits for both queues to drain.
    The HTTP client is closed and the workers are cancelled on exit, also
    when an error occurs.
    """
    args = arg_parser()

    forum_queue: asyncio.Queue[Coroutine] = asyncio.Queue(maxsize=5)
    threads_queue: asyncio.Queue[Tuple[MobileApi, ThreadMeta, Path]] = asyncio.Queue(maxsize=100)
    # Keep references: the event loop holds tasks only weakly, so an
    # unreferenced task may be garbage-collected before it finishes.
    workers = [asyncio.create_task(forum_worker(forum_queue)) for _ in range(5)]
    workers += [asyncio.create_task(thread_worker(threads_queue)) for _ in range(3)]

    transport = httpx.AsyncHTTPTransport(retries=5, http1=True, http2=True)
    try:
        async with httpx.AsyncClient(
            transport=transport,
            headers={"User-Agent": "saveweb/0.1 ([email protected])"},
            timeout=30,
        ) as client:
            site = MobileApi(client, args.site)
            check, r = await site.check_mobile_api()
            site_dir = Path("data") / "site" / check.mysiteid
            site_dir.mkdir(parents=True, exist_ok=True)
            (site_dir / "check.json").write_text(json_dump(r.json()), encoding="utf-8")
            print(check)
            index, _ = await site.get_forum_index()

            async def put_threads_to_queue(forum: Forumlist):
                """Queue every thread of *forum* for the thread workers."""
                async for thread in site.iter_threads(forum.fid):
                    await threads_queue.put((site, thread, site_dir / "thread" / thread.tid))

            for forum in index.forumlist:
                await forum_queue.put(put_threads_to_queue(forum))

            # Forums first: they are what feeds the threads queue.
            await forum_queue.join()
            await threads_queue.join()
    finally:
        for worker in workers:
            worker.cancel()
        # Collect cancellations and worker errors without re-raising them.
        await asyncio.gather(*workers, return_exceptions=True)
|
||
|
||
def main():
    """Synchronous entry point: run the async ``_main`` to completion."""
    coroutine = _main()
    asyncio.run(coroutine)


if __name__ == "__main__":
    main()
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,97 @@ | ||
# from rich import print | ||
import httpx | ||
from tqdm import tqdm | ||
|
||
from discuz_logger.define.check import ResponseCheck | ||
from discuz_logger.define.forum_index import ResponseForumIndex | ||
from discuz_logger.utils import APIHelper | ||
# https://bbs.emath.ac.cn/api/mobile/index.php?check=check | ||
|
||
|
||
class MobileApi:
    """Async helper for a Discuz! site's mobile API and a few fixed site URLs.

    All requests go through the ``httpx.AsyncClient`` given to the
    constructor; this class never closes that client.
    """

    client: httpx.AsyncClient
    base_url: str
    API_PATH: str = "/api/mobile/index.php"

    def __init__(self, client: httpx.AsyncClient, site: str):
        """Remember *client* and the site root URL.

        A trailing ``/`` on *site* is stripped so that paths can be appended
        to ``base_url`` directly.
        """
        self.client = client
        self.base_url = site.rstrip("/")

    async def check_mobile_api(self):
        """Send the ``check`` request; return ``(ResponseCheck, response)``."""
        r = await self.client.get(
            self.base_url + self.API_PATH, params={"check": "check"}
        )
        return ResponseCheck(**r.json()), r

    @property
    def bbs_logo_url(self):
        """URL of the site's common logo image."""
        return self.base_url + "/static/image/common/logo.png"

    def get_bbs_medal_image_url(self, image):
        """URL of *image* under ``/static/image/common/``."""
        return self.base_url + f"/static/image/common/{image}"

    def get_bbs_mobile_image_url(self, image):
        """URL of *image* under ``/static/image/mobile/``."""
        return self.base_url + f"/static/image/mobile/{image}"

    @property
    def login_web_url(self):
        """URL of the web login page."""
        return self.base_url + "/member.php?mod=logging&action=login"

    def get_attachment_with_alien_code(self, alien_code):
        """Attachment URL whose ``aid`` parameter is *alien_code*."""
        return self.base_url + "/forum.php?mod=attachment&aid=" + alien_code

    async def get_forum_index(self):
        """Request the ``forumindex`` module; return ``(ResponseForumIndex, response)``."""
        r = await self.client.get(
            self.base_url + self.API_PATH, params={"version": 4, "module": "forumindex"}
        )
        index = ResponseForumIndex(**r.json())
        return index, r

    async def viewthread(self, tid, page=1):
        """Request one page of thread *tid*; return ``(decoded_json, response)``."""
        r = await self.client.get(
            self.base_url + self.API_PATH,
            params={"version": 4, "module": "viewthread", "tid": tid, "page": page},
        )
        return r.json(), r

    # p?mod=forumdisplay&fid=2&orderby=dateline&orderby=dateline&filter=author&page=4&t=1812754
    async def forumdisplay(self, fid, page=1, orderby="dateline", filter="author"):
        """Request one page of forum *fid*'s thread list.

        Unlike the other request methods, this returns only the decoded JSON,
        not a ``(json, response)`` tuple.
        """
        r = await self.client.get(
            self.base_url + self.API_PATH,
            params={
                "version": 4,
                "module": "forumdisplay",
                "fid": fid,
                "page": page,
                "orderby": orderby,
                "filter": filter,
            },
        )
        return r.json()

    # forum.php?mod=forumdisplay&fid=39
    # forum.php?mod=forumdisplay&fid=39&orderby=dateline&filter=author&orderby=dateline&page=2
    # forum.php?mod=redirect&tid=479118&goto=lastpost  # lastpost redirects to the last reply on the last page
    async def iter_threads(self, fid: str | int):
        """Yield the thread metadata of forum *fid*, page by page.

        Paging stops once as many threads were yielded as the forum reports,
        or as soon as a page comes back empty. The empty-page stop matters
        because the reported total can exceed what the listing returns, which
        would otherwise page forever. A tqdm progress bar tracks the count
        and is closed even if the consumer stops early.
        """
        page = 1
        forumdisplay = await self.forumdisplay(fid, page=page)
        forum = forumdisplay["Variables"]["forum"]
        assert int(forum["threads"]) <= int(forum["threadcount"])
        threads_total: int = int(forum["threads"])
        threads_fetched: int = 0
        tqd = tqdm(total=threads_total, desc=f"fid={fid}", unit="threads", dynamic_ncols=True)
        try:
            while threads_fetched < threads_total:
                if page > 1:
                    # Page 1 was fetched above to read the total; reuse it.
                    forumdisplay = await self.forumdisplay(fid, page=page)
                threads_total = int(forumdisplay["Variables"]["forum"]["threads"])
                threads = APIHelper.get_thread_metas(forumdisplay)
                if len(threads) == 0:
                    # Past the last page: stop rather than loop forever.
                    break
                for thread in threads:
                    yield thread

                threads_fetched += len(threads)
                page += 1

                tqd.total = threads_total
                tqd.update(len(threads))
        finally:
            tqd.close()
Oops, something went wrong.