-
Notifications
You must be signed in to change notification settings - Fork 0
/
1.py
84 lines (69 loc) · 2.92 KB
/
1.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
import os
import requests
from bs4 import BeautifulSoup
def fetch_and_write_links(url, filename, timeout=30):
    """Scrape the entry links from a contents page and save them to a file.

    Downloads ``url``, collects every ``<a>`` element whose ``href`` starts
    with ``entries/``, skips entry ids that were already seen, and writes one
    ``"<link text>: <absolute url>"`` line per unique entry to ``filename``
    (UTF-8). Progress and the final count are printed to stdout.

    Args:
        url: Address of the contents page to download.
        filename: Path of the output text file; it is overwritten.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        None. Request failures (connection errors, timeouts, HTTP error
        statuses) are reported on stdout and ``filename`` is not touched.
    """
    entry_prefix = 'entries/'
    base_url = 'https://plato.stanford.edu/'

    try:
        print("正在连接到目录地址...")
        # Without a timeout, requests may block forever on a stalled server.
        response = requests.get(url, timeout=timeout)
        # Turn 4xx/5xx answers into an exception so an error page is never
        # parsed as if it were the contents page.
        response.raise_for_status()
    except requests.RequestException as e:
        print("请求页面内容时发生错误:", e)
        return
    print("连接成功!")

    soup = BeautifulSoup(response.text, 'html.parser')
    unique_entries = set()  # entry ids (the part after "entries/") already written
    with open(filename, 'w', encoding='utf-8') as f:
        for link in soup.find_all('a'):
            href = link.get('href')
            if not href or not href.startswith(entry_prefix):
                continue
            # Everything after the leading "entries/" identifies the entry.
            entry_id = href[len(entry_prefix):]
            if entry_id in unique_entries:
                print(f"重复的entries/内容已排除:{entry_id}")
                continue
            unique_entries.add(entry_id)
            line = f"{link.text.strip()}: {base_url}{href}\n"
            f.write(line)
            print(f"写入链接:{line.strip()}")
    print("爬取完成!包含'entries'的链接数量为:", len(unique_entries))
    print("结果已保存到", filename)
def compare_and_output_diff(new_path='1.txt', old_path='1_old.txt',
                            diff_path='diff_txt.txt'):
    """Report which lines were added to or removed from the link list.

    Compares the lines of ``new_path`` with those of ``old_path``. Lines only
    present in the new file are "new entries"; lines only present in the old
    file are "missing entries". Both groups are written to ``diff_path``
    (which is created even when there is no difference) and echoed to stdout,
    each keeping the order of the file it came from.

    Args:
        new_path: Path of the freshly generated list.
        old_path: Path of the list kept from the previous run.
        diff_path: Path of the report file; it is overwritten.

    Returns:
        None. If either input file does not exist, a message is printed and
        no report is written.
    """
    if not os.path.exists(old_path):
        print(f"不存在 {old_path} 文件,无法进行比较。")
        return
    if not os.path.exists(new_path):
        # The new list is absent when the download failed; there is nothing
        # to compare, so bail out instead of raising FileNotFoundError.
        print(f"不存在 {new_path} 文件,无法进行比较。")
        return

    with open(new_path, 'r', encoding='utf-8') as f_new, \
            open(old_path, 'r', encoding='utf-8') as f_old:
        new_lines = f_new.readlines()
        old_lines = f_old.readlines()

    # Sets give O(1) membership tests; iterating the lists keeps file order.
    old_seen = set(old_lines)
    new_seen = set(new_lines)
    new_entries = [line for line in new_lines if line not in old_seen]
    missing_entries = [line for line in old_lines if line not in new_seen]

    sections = [("新增的条目:", new_entries), ("缺少的条目:", missing_entries)]

    with open(diff_path, 'w', encoding='utf-8') as f_diff:
        for title, entries in sections:
            if entries:
                f_diff.write(title + "\n")
                f_diff.writelines(entries)

    for title, entries in sections:
        if entries:
            print(title)
            for entry in entries:
                print(entry.strip())
def main():
    """Refresh the link list and report what changed since the last run.

    The ``check_time`` marker file is written at the end of every run. When
    it is present at start-up, the existing ``1.txt`` is kept as
    ``1_old.txt`` so it can be compared with the new download; when it is
    absent, any existing ``1.txt`` is deleted. The contents page is then
    scraped into ``1.txt`` and the comparison step is run.
    """
    if os.path.exists("check_time"):
        if os.path.exists("1.txt"):
            # os.replace overwrites an existing 1_old.txt on every platform;
            # os.rename raises FileExistsError on Windows once a previous
            # rotation has left 1_old.txt behind.
            os.replace("1.txt", "1_old.txt")
    elif os.path.exists("1.txt"):
        os.remove("1.txt")

    url = "https://plato.stanford.edu/contents.html"
    fetch_and_write_links(url, '1.txt')
    compare_and_output_diff()

    # Create (or truncate) the empty marker file checked at the next start-up.
    with open("check_time", "w"):
        pass
# Run the scraper only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()