Skip to content

Commit

Permalink
poc
Browse files Browse the repository at this point in the history
  • Loading branch information
yzqzss committed May 2, 2024
1 parent 70d9464 commit 42d9a6f
Show file tree
Hide file tree
Showing 10 changed files with 420 additions and 0 deletions.
3 changes: 3 additions & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,9 @@ requires-python = ">=3.9"
readme = "README.md"
license = {text = "AGPLv3"}

[project.scripts]
discuz-logger = "discuz_logger.main:main"

[build-system]
requires = ["pdm-backend"]
build-backend = "pdm.backend"
Expand Down
Empty file added src/discuz_logger/__init__.py
Empty file.
28 changes: 28 additions & 0 deletions src/discuz_logger/define/check.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
from dataclasses import dataclass

@dataclass
class ResponseCheck:
    """Fields of the JSON object returned by the mobile API ``check`` request.

    Every key of the response maps to one field; the values quoted in the
    comments below are samples of what a site returned.
    """

    # Discuz! release, e.g. "X3.4".
    discuzversion: str
    # Site character set, e.g. "utf-8".
    charset: str
    # Mobile API version, e.g. "4".
    version: str
    # Mobile plugin version, e.g. "1.4.7".
    pluginversion: str
    # e.g. "0".
    oemversion: str
    # e.g. "ref".
    regname: str
    # e.g. "0".
    qqconnect: str
    # Name of the forum.
    sitename: str
    # Forum id (the original author was not sure about this one).
    mysiteid: str
    # e.g. "https://avatar.elecfans.com/uc_server".
    ucenterurl: str
    # e.g. {'closeforumorderby': '0'}.
    setting: dict
    # e.g. {'used': None, 'lastupdate': None}.
    extends: dict
43 changes: 43 additions & 0 deletions src/discuz_logger/define/forum_index.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
from dataclasses import dataclass

@dataclass
class Catlist:
    """One entry of the ``catlist`` array in a forum index response."""

    fid: str
    name: str
    # NOTE(review): annotated as str, but the name suggests a collection of
    # forum ids -- confirm against a real response.
    forums: str


class Forumlist:
    """Attribute bag for one entry of the ``forumlist`` array of a forum index.

    The annotations name the keys that are expected, but nothing enforces
    them: the constructor accepts arbitrary keyword arguments and stores each
    one as an instance attribute.
    """

    fid: str
    name: str
    threads: str  # not accurate
    posts: str  # not accurate
    todayposts: str
    description: str
    icon: str
    sublist: list

    def __init__(self, **kwargs):
        """Store every keyword argument as an attribute of the same name."""
        for attr_name in kwargs:
            setattr(self, attr_name, kwargs[attr_name])

    def __repr__(self):
        """Return ``<Forumlist key=value ...>`` built from the instance attributes."""
        rendered = (f"{key}={value}" for key, value in vars(self).items())
        return "<Forumlist " + " ".join(rendered) + ">"

@dataclass
class ResponseForumIndex:
    """Top-level object of a forum index response.

    ``catlist`` and ``forumlist`` are convenience views that wrap the raw
    dictionaries found under the same keys of ``Variables``.
    """

    Version: str  # e.g. "4"
    Charset: str  # e.g. "UTF-8"
    Variables: dict

    @property
    def catlist(self):
        """Return a ``Catlist`` for each entry of ``Variables["catlist"]``."""
        categories = []
        for raw_category in self.Variables["catlist"]:
            categories.append(Catlist(**raw_category))
        return categories

    @property
    def forumlist(self):
        """Return a ``Forumlist`` for each entry of ``Variables["forumlist"]``."""
        forums = []
        for raw_forum in self.Variables["forumlist"]:
            forums.append(Forumlist(**raw_forum))
        return forums
29 changes: 29 additions & 0 deletions src/discuz_logger/define/post.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@

from dataclasses import dataclass
from typing import Optional


@dataclass
class Post:
    """Record for a single post; every value is kept as the string received.

    The first sixteen fields are required. The remaining five default to
    ``None`` and only need to be supplied when they are available.
    """

    # --- required fields -------------------------------------------------
    pid: str
    tid: str
    first: str
    author: str
    authorid: str
    dateline: str
    message: str
    anonymous: str
    attachment: str
    status: str
    replycredit: str
    position: str
    groupid: str
    number: str
    dbdateline: str
    groupiconid: str

    # --- optional fields -------------------------------------------------
    adminid: Optional[str] = None
    attachments: Optional[list] = None
    imagelist: Optional[list] = None
    memberstatus: Optional[str] = None
    username: Optional[str] = None
64 changes: 64 additions & 0 deletions src/discuz_logger/define/thread.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,64 @@
from dataclasses import dataclass

# {
# "tid": "23816",
# "typeid": "9",
# "readperm": "0",
# "price": "0",
# "author": "gl695133087",
# "authorid": "54639",
# "subject": "关于一些homeassistant启动后掉线的经验",
# "dateline": "2023-12-31",
# "lastpost": "2023-12-31 11:15",
# "lastposter": "gl695133087",
# "views": "570",
# "replies": "0",
# "displayorder": "0",
# "digest": "0",
# "special": "0",
# "attachment": "0",
# "recommend_add": "0",
# "replycredit": "0",
# "dbdateline": "1703992521",
# "dblastpost": "1703992521",
# "rushreply": "0",
# "reply": [
# {
# "pid": "581729",
# "author": "pangls",
# "authorid": "82180",
# "message": "百度胖老师吧上海宝山公安通河新村派出所民警欺负绑 架谋杀胖老师百度360搜索百度胖老师吧上海宝山公安通河 ..."
# }
# ]
# },
@dataclass
class ThreadMeta:
    """Attribute bag holding the metadata of one thread.

    The annotations list the keys that are expected. The hand-written
    constructor takes precedence over the one ``@dataclass`` would generate,
    so any keyword arguments are accepted and none are mandatory.
    """

    tid: str
    typeid: str
    readperm: str
    price: str
    author: str
    authorid: str
    subject: str
    dateline: str
    lastpost: str
    lastposter: str
    views: str
    replies: str
    displayorder: str
    digest: str
    special: str
    attachment: str
    recommend_add: str
    replycredit: str
    dbdateline: str
    dblastpost: str
    rushreply: str
    reply: list

    def __init__(self, **kwargs):
        """Store every keyword argument as an attribute of the same name."""
        for attr_name in kwargs:
            setattr(self, attr_name, kwargs[attr_name])

    def __repr__(self):
        """Return ``<ThreadMeta key=value ...>`` built from the instance attributes."""
        rendered = (f"{key}={value}" for key, value in vars(self).items())
        return "<ThreadMeta " + " ".join(rendered) + ">"
119 changes: 119 additions & 0 deletions src/discuz_logger/main.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,119 @@
import asyncio
import json
from pathlib import Path
from typing import Coroutine, Tuple

import httpx
from tqdm import tqdm

from discuz_logger.define.forum_index import Forumlist
from discuz_logger.define.thread import ThreadMeta
from discuz_logger.mobile_api import MobileApi
from discuz_logger.utils import APIHelper, arg_parser, json_dump

async def forum_worker(queue: asyncio.Queue[Coroutine]):
    """Await coroutines taken from ``queue`` until the task is cancelled.

    Args:
        queue: Queue of not-yet-awaited coroutine objects.

    Raises:
        Exception: Whatever an awaited coroutine raises; the error propagates
            and ends this worker.
    """
    while True:
        task = await queue.get()
        try:
            await task
        finally:
            # Acknowledge the item even when the coroutine failed; otherwise
            # the unfinished-task counter never reaches zero and
            # ``queue.join()`` blocks forever.
            queue.task_done()


async def thread_worker(queue: asyncio.Queue[Tuple[MobileApi, ThreadMeta, Path]]):
    """Archive threads taken from ``queue`` until the task is cancelled.

    Each queue item is a ``(site, thread_meta, thread_dir)`` tuple. For every
    item the worker pages through ``site.viewthread`` and writes one
    ``pid-<pid>.json`` file per post into ``thread_dir``, then writes the
    thread record to ``thread_dir / "thread.json"``.

    ``queue.task_done()`` is called exactly once per item, including when the
    item is skipped or when processing raises, so a ``queue.join()`` in the
    producer cannot wait forever on an item that is never acknowledged.

    Args:
        queue: Queue of ``(site, thread_meta, thread_dir)`` work items.

    Raises:
        Exception: Whatever ``site.viewthread`` raised on the third failed
            attempt for a page; the error propagates and ends this worker.
        AssertionError: If a page contains no posts although
            ``thread_meta.readperm`` is ``0``.
    """
    # While True, a thread that already has a thread.json is skipped outright
    # instead of being compared against its previous ``maxposition``.
    no_first_check = True
    while True:
        site, thread_meta, thread_dir = await queue.get()
        tqd = None
        try:
            thread_dir.mkdir(parents=True, exist_ok=True)

            maxposition = maxposition_last_run = -1
            # Starts below ``maxposition`` so the paging loop runs at least once.
            fetched_position = -2
            thread_json_path = thread_dir / "thread.json"
            if thread_json_path.exists():
                if no_first_check:
                    # Already archived; the ``finally`` clause below still
                    # acknowledges the queue item.
                    continue
                thread = json.loads(thread_json_path.read_text())
                maxposition_last_run = int(thread["maxposition"])

            viewthread = None
            page = 1
            while fetched_position < maxposition:
                for attempt in range(3):
                    try:
                        viewthread, _ = await site.viewthread(thread_meta.tid, page=page)
                        # Success: stop retrying instead of fetching the same
                        # page again on the remaining attempts.
                        break
                    except Exception as e:
                        if attempt == 2:
                            raise
                        print(f"tid={thread_meta.tid} page={page} ,retry={attempt} error:{e}")
                        await asyncio.sleep(10)
                maxposition = APIHelper.get_maxposition(viewthread)

                if maxposition == maxposition_last_run:
                    # Nothing new since the previous run.
                    break

                posts = APIHelper.get_posts(viewthread)
                if len(posts) == 0:
                    # An empty page is only expected for threads that require
                    # a read permission.
                    assert int(thread_meta.readperm) != 0
                    print(f"tid={thread_meta.tid} readperm:{thread_meta.readperm}")
                    break

                if tqd is None:
                    tqd = tqdm(desc=f"tid={thread_meta.tid} subj:{thread_meta.subject}", unit="posts", dynamic_ncols=True)

                for post, post_raw in posts:
                    fetched_position = max(fetched_position, int(post.position))

                    with open(thread_dir / f"pid-{post.pid}.json", "w") as f:
                        f.write(json_dump(post_raw))

                if fetched_position < 0:
                    fetched_position = 0

                tqd.total = maxposition
                tqd.n = fetched_position
                tqd.refresh()

                page += 1

            assert viewthread is not None
            thread_json_path.write_text(json_dump(APIHelper.get_thread(viewthread)))
        finally:
            if tqd is not None:
                tqd.close()
            queue.task_done()


async def _main():
    """Archive every thread of every forum of the site given on the command line.

    Starts five forum workers and three thread workers, writes the site's
    ``check`` response to ``data/site/<mysiteid>/check.json``, then queues one
    listing coroutine per forum and waits until both queues are drained.
    """
    args = arg_parser()

    forum_queue: asyncio.Queue[Coroutine] = asyncio.Queue(maxsize=5)
    threads_queue: asyncio.Queue[Tuple[MobileApi, ThreadMeta, Path]] = asyncio.Queue(maxsize=100)
    # The event loop keeps only weak references to tasks, so hold strong ones
    # here to stop a worker from being garbage-collected while it is running.
    workers = [asyncio.create_task(forum_worker(forum_queue)) for _ in range(5)]
    workers += [asyncio.create_task(thread_worker(threads_queue)) for _ in range(3)]

    transport = httpx.AsyncHTTPTransport(retries=5, http1=True, http2=True)
    try:
        # ``async with`` closes the client and its connections on every exit path.
        async with httpx.AsyncClient(transport=transport, headers={"User-Agent": "saveweb/0.1 ([email protected])"}, timeout=30) as client:
            site = MobileApi(client, args.site)
            check, r = await site.check_mobile_api()
            site_dir = Path("data") / "site" / check.mysiteid
            site_dir.mkdir(parents=True, exist_ok=True)
            with open(site_dir / "check.json", "w") as f:
                f.write(json_dump(r.json()))
            print(check)
            index, _ = await site.get_forum_index()

            async def put_threads_to_queue(forum: Forumlist):
                """Queue one work item for every thread listed in ``forum``."""
                async for thread in site.iter_threads(forum.fid):
                    await threads_queue.put((site, thread, site_dir / "thread" / thread.tid))

            for forum in index.forumlist:
                co = put_threads_to_queue(forum)
                await forum_queue.put(co)

            # Forum workers feed ``threads_queue``, so wait for them first.
            await forum_queue.join()
            await threads_queue.join()
    finally:
        # Workers loop forever; stop them once the work is done or has failed.
        for worker in workers:
            worker.cancel()


def main():
    """Synchronous entry point: run the async ``_main`` to completion."""
    entry_coroutine = _main()
    asyncio.run(entry_coroutine)


if __name__ == "__main__":
    main()
97 changes: 97 additions & 0 deletions src/discuz_logger/mobile_api.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,97 @@
# from rich import print
import httpx
from tqdm import tqdm

from discuz_logger.define.check import ResponseCheck
from discuz_logger.define.forum_index import ResponseForumIndex
from discuz_logger.utils import APIHelper
# https://bbs.emath.ac.cn/api/mobile/index.php?check=check


class MobileApi:
    """Thin async wrapper around a Discuz! site's mobile API.

    All requests go through the ``httpx.AsyncClient`` passed to the
    constructor; this class never creates or closes a client itself.
    """

    client: httpx.AsyncClient
    base_url: str
    API_PATH: str = "/api/mobile/index.php"

    def __init__(self, client: httpx.AsyncClient, site: str):
        """Bind the wrapper to ``client`` and to the site root URL ``site``.

        Trailing slashes are stripped from ``site`` so that paths, which all
        start with ``/``, can be appended directly.
        """
        self.client = client
        self.base_url = site.rstrip("/")

    async def check_mobile_api(self):
        """Call the ``check`` endpoint.

        Returns:
            ``(ResponseCheck, httpx.Response)`` -- the parsed payload and the
            raw response.
        """
        r = await self.client.get(
            self.base_url + self.API_PATH, params={"check": "check"}
        )
        return ResponseCheck(**r.json()), r

    @property
    def bbs_logo_url(self):
        """URL of the site's common logo image."""
        return self.base_url + "/static/image/common/logo.png"

    def get_bbs_medal_image_url(self, image):
        """Return the URL of ``image`` under ``/static/image/common/``."""
        return self.base_url + f"/static/image/common/{image}"

    def get_bbs_mobile_image_url(self, image):
        """Return the URL of ``image`` under ``/static/image/mobile/``."""
        return self.base_url + f"/static/image/mobile/{image}"

    @property
    def login_web_url(self):
        """URL of the web login page."""
        return self.base_url + "/member.php?mod=logging&action=login"

    def get_attachment_with_alien_code(self, alien_code):
        """Return the attachment URL whose ``aid`` parameter is ``alien_code``."""
        return self.base_url + "/forum.php?mod=attachment&aid=" + alien_code

    async def get_forum_index(self):
        """Fetch the ``forumindex`` module.

        Returns:
            ``(ResponseForumIndex, httpx.Response)``.
        """
        r = await self.client.get(
            self.base_url + self.API_PATH, params={"version": 4, "module": "forumindex"}
        )
        index = ResponseForumIndex(**r.json())
        return index, r

    async def viewthread(self, tid, page=1):
        """Fetch one page of thread ``tid``.

        Returns:
            ``(decoded_json, httpx.Response)``.
        """
        r = await self.client.get(
            self.base_url + self.API_PATH,
            params={"version": 4, "module": "viewthread", "tid": tid, "page": page},
        )
        return r.json(), r

    # p?mod=forumdisplay&fid=2&orderby=dateline&orderby=dateline&filter=author&page=4&t=1812754
    async def forumdisplay(self, fid, page=1, orderby="dateline", filter="author"):
        """Fetch one page of the thread listing of forum ``fid``.

        Returns:
            The decoded JSON body only (unlike the other request methods, the
            raw response is not returned).
        """
        r = await self.client.get(
            self.base_url + self.API_PATH,
            params={
                "version": 4,
                "module": "forumdisplay",
                "fid": fid,
                "page": page,
                "orderby": orderby,
                "filter": filter,
            },
        )
        return r.json()

    # forum.php?mod=forumdisplay&fid=39
    # forum.php?mod=forumdisplay&fid=39&orderby=dateline&filter=author&orderby=dateline&page=2
    # forum.php?mod=redirect&tid=479118&goto=lastpost  # lastpost redirects to the last reply on the last page
    # The annotation is quoted because ``str | int`` cannot be evaluated at
    # definition time before Python 3.10.
    async def iter_threads(self, fid: "str | int"):
        """Yield the thread metadata of forum ``fid``, page by page.

        Paging stops once as many threads were yielded as the forum reports,
        or as soon as a page comes back empty. The second condition matters
        because the loop would otherwise never end when the reported count is
        higher than the number of threads the listing actually returns.

        Raises:
            AssertionError: If the first response reports more ``threads``
                than ``threadcount``.
        """
        page = 1
        forumdisplay = await self.forumdisplay(fid, page=page)
        forum_info = forumdisplay["Variables"]["forum"]
        assert int(forum_info["threads"]) <= int(forum_info["threadcount"])
        threads_total: int = int(forum_info["threads"])
        threads_fetched: int = 0
        tqd = tqdm(total=threads_total, desc=f"fid={fid}", unit="threads",dynamic_ncols=True)
        try:
            while threads_fetched < threads_total:
                if page > 1:
                    # Page 1 was already fetched above; reuse that response
                    # instead of requesting it a second time.
                    forumdisplay = await self.forumdisplay(fid, page=page)
                threads_total = int(forumdisplay["Variables"]["forum"]["threads"])
                threads = APIHelper.get_thread_metas(forumdisplay)
                if len(threads) == 0:
                    # No further pages will help; avoid spinning forever.
                    break
                for thread in threads:
                    yield thread

                threads_fetched += len(threads)
                page += 1

                tqd.total = threads_total
                tqd.update(len(threads))
        finally:
            # Also runs when the consumer stops early or a request fails.
            tqd.close()
Loading

0 comments on commit 42d9a6f

Please sign in to comment.