I made a few corrections to get the code working. I masked out the things I don't need but left them in for other users. This code collects User, Date, Title and Review. Note: there is an issue with the user field, as it sometimes has the city/country appended.
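If the appended city/country is a problem for you, here is a minimal cleanup sketch. It is untested and assumes the location sits in a nested `div.userLoc` element, as the commented-out selector in the script below suggests; adjust the selector if the markup differs:

```python
# Hypothetical cleanup for review_user: strip the nested location text
# out of the combined info_text string.
# Assumption: TripAdvisor nests the city/country inside div.userLoc.
def clean_review_user(review):
    user = review.find('div', class_='info_text').text
    loc = review.select_one('div.userLoc')  # assumption: holds the city/country
    if loc:
        user = user.replace(loc.text, '').strip()
    return user
```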
I needed the code to export to JSON, so I created a list of lists:
[0, {user, date, review}], [1, {user, date, review}], [2, {user, date, review}], ...
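With the fields the script collects, the exported file looks roughly like this (values abbreviated; `json.dump` serializes each `(index, dict)` tuple as a two-element array):

```json
[
  [0, {"review_user": "...", "review_date": "...", "review_title": "..."}],
  [1, {"review_user": "...", "review_date": "...", "review_title": "..."}]
]
```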
I also corrected some cosmetic issues with the URL (it requested or{} and or{0}), and added a way out of the infinite loop reported in another thread.
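For reference, this is how the pagination in `parse()` works: the first page uses the plain URL, later pages rewrite `.html` to `-or<offset>.html` and step the offset by 10, stopping when a page returns fewer than 10 reviews or the offset passes the review count:

```python
# Subpage URLs as built in parse(); the offset steps by 10 per page.
url = 'https://www.tripadvisor.es/Restaurant_Review-g187452-d12206985-Reviews-Baiba_Cafe-Oviedo_Asturias.html'
url_template = url.replace('.html', '-or{}.html')
print(url_template.format(10))
# -> https://www.tripadvisor.es/Restaurant_Review-g187452-d12206985-Reviews-Baiba_Cafe-Oviedo_Asturias-or10.html
```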
The initial code was VERY HELPFUL, but it didn't do what I needed and had some bugs. I hope this helps others like me who want to extract reviews to JSON.
```python
# -*- coding: utf-8 -*-
import requests
from bs4 import BeautifulSoup
import json
import webbrowser

def display(content, filename='output.html'):
    with open(filename, 'wb') as f:
        f.write(content)
    webbrowser.open(filename)

def get_soup(session, url, show=False):
    r = session.get(url)
    if show:
        display(r.content, 'temp.html')
    if r.status_code != 200:  # not OK
        print('[get_soup] status code:', r.status_code)
    else:
        return BeautifulSoup(r.text, 'html.parser')

def post_soup(session, url, params, show=False):
    '''Read HTML from server and convert to Soup'''
    r = session.post(url, data=params)
    if show:
        display(r.content, 'temp.html')
    if r.status_code != 200:  # not OK
        print('[post_soup] status code:', r.status_code)
    else:
        return BeautifulSoup(r.text, 'html.parser')

def scrape(url, lang='ALL'):
    # create session to keep all cookies (etc.) between requests
    session = requests.Session()
    session.headers.update({
        'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:57.0) Gecko/20100101 Firefox/57.0',
    })
    items = parse(session, url + '?filterLang=' + lang)
    return items

def parse(session, url):
    '''Get number of reviews and start getting subpages with reviews'''
    print('[parse] url:', url)
    soup = get_soup(session, url)
    if not soup:
        print('[parse] no soup:', url)
        return
    num_reviews = soup.find('span', class_='reviews_header_count').text  # get text
    num_reviews = num_reviews[1:-1]            # strip the surrounding parentheses
    num_reviews = num_reviews.replace(',', '')
    num_reviews = int(num_reviews)             # convert text into integer
    print('[parse] num_reviews ALL:', num_reviews)

    url_template = url.replace('.html', '-or{}.html')

    items = []
    offset = 0
    while True:
        subpage_url = url_template.format(offset)
        if offset == 0:
            # the first page uses the plain URL, not the -or0 variant
            subpage_items = parse_reviews(session, url)
        else:
            subpage_items = parse_reviews(session, subpage_url)
        if not subpage_items:
            break
        items += subpage_items
        # exits added to avoid the infinite loop: a short page means we
        # reached the last one, and the offset must not pass the total
        if len(subpage_items) < 10:
            break
        offset += 10
        if offset > num_reviews:
            break
    return items

def get_reviews_ids(soup):
    items = soup.find_all('div', attrs={'data-reviewid': True})
    if items:
        # each id appears twice in the markup, so take every other one
        reviews_ids = [x.attrs['data-reviewid'] for x in items][::2]
        print('[get_reviews_ids] data-reviewid:', reviews_ids)
        return reviews_ids

def parse_reviews(session, url):
    '''Get all reviews from one page'''
    print('[parse_reviews] url:', url)
    soup = get_soup(session, url)
    if not soup:
        print('[parse_reviews] no soup:', url)
        return
    hotel_name = soup.find('h1').text
    reviews_ids = get_reviews_ids(soup)
    if not reviews_ids:
        return
    #soup = get_more(session, reviews_ids)
    items = []
    for idx, review in enumerate(soup.find_all('div', class_='reviewSelector')):
        # badgets = review.find_all('span', class_='badgetext')
        # if len(badgets) > 0:
        #     contributions = badgets[0].text
        # else:
        #     contributions = '0'
        #
        # if len(badgets) > 1:
        #     helpful_vote = badgets[1].text
        # else:
        #     helpful_vote = '0'
        #
        # user_loc = review.select_one('div.userLoc strong')
        # if user_loc:
        #     user_loc = user_loc.text
        # else:
        #     user_loc = ''
        #
        # bubble_rating = review.select_one('span.ui_bubble_rating')['class']
        # bubble_rating = bubble_rating[1].split('_')[-1]
        item = {
            'review_user': review.find('div', class_='info_text').text,
            'review_date': review.find('span', class_='ratingDate').text,  # 'ratingDate' instead of 'relativeDate'
            'review_title': review.find('span', class_='noQuotes').text,
            #'review_content': review.find('div', class_='entry').text,
        }
        items.append(item)
        print('\n--- Review ---\n')
        for key, val in item.items():
            print(key + ' :', val)
        print()
    return items

def write_in_json(items, filename):
    review = 0
    reviews = []
    total_reviews = len(items)
    # create a list of lists: (index, review_dict) pairs
    while review < total_reviews:
        reviews.append((review, items[review]))
        print(reviews[review])
        review += 1
    print('--- Writing File ---')
    # explicit utf-8 so ensure_ascii=False can't hit platform encoding errors
    with open(filename, 'w', encoding='utf-8') as jsonfile:
        json.dump(reviews, jsonfile, ensure_ascii=False, indent=2)
    print('--- Done ---')

start_urls = [
    'https://www.tripadvisor.es/Restaurant_Review-g187452-d12206985-Reviews-Baiba_Cafe-Oviedo_Asturias.html',
]
lang = 'es'

for url in start_urls:
    # get all reviews for 'url' and 'lang'
    items = scrape(url, lang)
    if not items:
        print('No reviews')
    else:
        # write the JSON file, named after the URL slug and the language
        filename = url.split('Reviews-')[1][:-5] + '_' + lang
        write_in_json(items, filename + '.json')
```
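Run as-is, the script writes the reviews to `Baiba_Cafe-Oviedo_Asturias_es.json` next to the script, since the filename is built from the part of the URL after `Reviews-` (with `.html` stripped) plus the language suffix.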