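# 2.py: reads "Title URL" lines from 1.txt, fetches each page with headless
# Chrome, and saves the text of the page's #content div as SEP/<name>.txt.
# The /entries/ URL pattern and the SEP folder name suggest the target is the
# Stanford Encyclopedia of Philosophy, though the script never names the site.
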
import os
import re
import logging
from tqdm import tqdm
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from bs4 import BeautifulSoup

logging.basicConfig(level=logging.INFO)

def clean_filename(filename):
    # Collapse runs of punctuation, whitespace, and underscores into a single
    # underscore, and make sure the result does not end with an underscore
    filename = re.sub(r'[\W_]+', '_', filename)
    return filename.rstrip('_')  # strip any trailing underscore
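# For example, clean_filename("abstract-objects") returns "abstract_objects",
# and clean_filename("a,,b  c__") returns "a_b_c".
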
chrome_options = Options()
chrome_options.add_argument("--headless")
driver_path = "C:\\Users\\ykla\\chromedriver.exe"
service = Service(executable_path=driver_path)
driver = webdriver.Chrome(service=service, options=chrome_options)
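
# Note: the chromedriver path above is machine-specific. With Selenium 4.6+,
# Selenium Manager can locate a matching driver automatically, so
# webdriver.Chrome(options=chrome_options) alone would likely work as well.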

# Archive any previous output directory and start fresh
# (note: os.rename raises if SEP_old already exists from an earlier run)
if os.path.exists('SEP'):
    os.rename('SEP', 'SEP_old')
if not os.path.exists("SEP"):
    os.makedirs("SEP")
with open("1.txt", "r", encoding='utf-8') as f:
links = f.readlines()
failed_links = []
total_titles = len(links)
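
# Each line of 1.txt is expected to look like "<title> <url>"; the URL is
# recovered by taking everything from the last occurrence of "http" onward.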
with tqdm(total=total_titles, desc="Progress") as pbar:
    for link in links:
        link = link.strip()
        last_http_index = link.rfind("http")
        if last_http_index != -1:
            title = link[:last_http_index].strip().replace(" ", "")  # parsed but not used below
            url = link[last_http_index:].strip()
        else:
            logging.warning(f"Could not extract a valid link: {link}")
            failed_links.append(link)
            continue
        # Use the path segment after entries/ as the filename
        entry_content = re.search(r'/entries/([^/]+)/?$', url)
        if entry_content:
            filename = clean_filename(entry_content.group(1))  # sanitize the filename
        else:
            logging.warning(f"Could not extract a valid filename: {url}")
            failed_links.append(link)
            continue
        logging.info(f"Fetching: {url}")
        try:
            driver.get(url)
            if driver.title:
                logging.info("Connected to the site")
                page_source = driver.page_source
                soup = BeautifulSoup(page_source, 'html.parser')
                content_div = soup.find('div', {'id': 'content'})
                if content_div:
                    content = content_div.get_text(separator='\n')
                    file_path = os.path.join("SEP", f"{filename}.txt")
                    if not os.path.exists(file_path):
                        with open(file_path, "w", encoding='utf-8') as f:
                            f.write(content)
                        logging.info(f"Wrote content to: {file_path}")
                    else:
                        logging.warning(f"File already exists: {file_path}. Skipping.")
                else:
                    logging.warning("Could not find the content div on the page.")
            else:
                logging.warning("Connection failed")
        except Exception as e:
            logging.error(f"Failed to fetch {url}. Error: {e}")
            failed_links.append(link)
        pbar.update(1)

driver.quit()

if failed_links:
    logging.error("The following links could not be fetched:")
    for link in failed_links:
        logging.error(link)