xml-to-txt.py
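# Note (added for context, inferred from the code below): this script walks a
# directory of XML sitemaps and collects every "x-default" alternate URL into
# a single urls.txt file, one URL per line.
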
import os

import xmltodict  # The scrapy imports were unused and have been removed


class WebsiteAnalyzer:
    def __init__(self, input_dir, output_file="urls.txt"):
        """
        Initializes the WebsiteAnalyzer with the input directory and output
        file paths.

        Args:
            input_dir (str): Path to the directory containing XML sitemap files.
            output_file (str, optional): Path to the output file where extracted
                URLs will be saved. Defaults to "urls.txt".
        """
        self.input_dir = input_dir
        self.output_file = output_file
        self.seen_urls = set()  # Track processed URLs; a set makes duplicate checks O(1)

    def parse_sitemap(self, xml_file):
        """
        Parses a single XML sitemap file and extracts relevant URLs, appending
        them to the output file.

        Args:
            xml_file (str): Path to the XML sitemap file.
        """
        if not os.path.isfile(xml_file):
            print(f"Warning: Skipping non-existent file: {xml_file}")
            return

        with open(xml_file, "r") as f:  # New name; don't shadow the xml_file argument
            obj = xmltodict.parse(f.read())

        urls = obj["urlset"]["url"]
        if isinstance(urls, dict):
            urls = [urls]  # xmltodict yields a dict, not a list, for a single <url>

        # "a" creates the file if it doesn't exist, so no mode check is needed.
        with open(self.output_file, "a") as text_file:
            for url in urls:
                href = None  # Stays None if no matching 'xhtml:link' is found
                links = url.get("xhtml:link", [])
                if isinstance(links, dict):
                    links = [links]  # Same single-element quirk as above
                for link in links:
                    if link.get("@hreflang") == "x-default":
                        href = link.get("@href")
                        break  # Stop after finding the first "x-default" hreflang
                if href and href not in self.seen_urls:
                    text_file.write(href + "\n")
                    self.seen_urls.add(href)  # Remember it so duplicates are skipped

    def parse_multiple_sitemaps(self):
        """
        Iterates through all XML files in the input directory, calling
        `parse_sitemap` for each.
        """
        for filename in os.listdir(self.input_dir):
            if filename.endswith(".xml"):
                xml_file = os.path.join(self.input_dir, filename)
                self.parse_sitemap(xml_file)


if __name__ == "__main__":
    input_dir = "sitemaps/"  # Replace with your actual directory path
    analyzer = WebsiteAnalyzer(input_dir)
    analyzer.parse_multiple_sitemaps()
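
# --- Usage sketch (added; not part of the original script) ---
# A minimal end-to-end example, assuming a sitemaps/ directory holding files
# shaped like the one below; the directory name and XML layout are
# illustrative assumptions. Save this as sitemaps/example.xml:
#
#   <?xml version="1.0" encoding="UTF-8"?>
#   <urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9"
#           xmlns:xhtml="http://www.w3.org/1999/xhtml">
#     <url>
#       <loc>https://example.com/page</loc>
#       <xhtml:link rel="alternate" hreflang="x-default"
#                   href="https://example.com/page"/>
#     </url>
#   </urlset>
#
# Running `python xml-to-txt.py` should then append one line to urls.txt:
#
#   https://example.com/page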