xml2urlparse_v2.py
from bs4 import BeautifulSoup
import urllib.request
import lxml  # parser backend used by BeautifulSoup below
import pandas as pd
#import logging

urlstart = 'https://www.domain.com/sitemap_index.xml'

# Function to parse an XML sitemap for urls
def xmlparse(urlstart):
    urllist = []
    ## Fetch and clean up the XML
    req = urllib.request.urlopen(urlstart)
    xml = BeautifulSoup(req, 'lxml')
    for item in xml.find_all('loc'):
        urllist.append(str(item))
    for x in range(len(urllist)):
        urllist[x] = urllist[x].replace("<loc>", "")
        urllist[x] = urllist[x].replace("</loc>", "")
    ## Walk the urls and build the final list of urls and sub-urls
    url_interm = []
    for url in urllist:
        if url.endswith(".xml"):
            url_interm.append(url)
            # Recursively call the function when the entry is a nested .xml sitemap;
            # this lets us grab every level of the sitemap index
            burner_list = xmlparse(url)
            # Combine the sub-urls with the urls found at this level
            url_interm = url_interm + burner_list
        else:
            # If the url doesn't end in .xml, it's a page url: append it and keep going
            url_interm.append(url)
    # The final output for this level of the sitemap
    return url_interm

output = xmlparse(urlstart)
# Get the data into a dataframe
df = pd.DataFrame(output, columns=['url'])
# Then export to a csv
df.to_csv('urls.csv')
# Tell me it worked!
print("The output csv file was generated successfully.")