Merge pull request #9 from KennBro/add-output-param
Add output param
opsdisk authored Jan 21, 2022
2 parents d80e9de + fde7bcb commit ab59ec3
Showing 4 changed files with 54 additions and 19 deletions.
7 changes: 6 additions & 1 deletion README.md
@@ -61,8 +61,9 @@ client = yagooglesearch.SearchClient(
     max_search_result_urls_to_return=100,
     http_429_cool_off_time_in_minutes=45,
     http_429_cool_off_factor=1.5,
-    proxy="socks5h://127.0.0.1:9050",
+    # proxy="socks5h://127.0.0.1:9050",
     verbosity=5,
+    verbose_output=True,  # False (only URLs) or True (rank, title, description, and URL)
 )
 client.assign_random_user_agent()
@@ -286,3 +287,7 @@ Project Link: [https://github.com/opsdisk/yagooglesearch](https://github.com/opsdisk/yagooglesearch)
 
 * [Mario Vilas](https://github.com/MarioVilas) for his amazing work on the original
   [googlesearch](https://github.com/MarioVilas/googlesearch) library.
+
+## Contributors
+
+* [KennBro](https://github.com/KennBro) - <https://github.com/opsdisk/yagooglesearch/pull/9>
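
For reference, a minimal sketch of the new option from a calling script, assuming the query is passed as the first argument as in the fuller README example; the query string and option values here are illustrative, not part of the diff:

import yagooglesearch

client = yagooglesearch.SearchClient(
    "site:github.com yagooglesearch",  # Illustrative query.
    max_search_result_urls_to_return=10,
    verbosity=5,
    verbose_output=True,  # Return rank, title, description, and URL instead of bare URLs.
)
client.assign_random_user_agent()

for result in client.search():
    # With verbose_output=True each entry is a dict, but the
    # "HTTP_429_DETECTED" sentinel string can still appear in the list.
    if not isinstance(result, dict):
        continue
    print(f'{result["rank"]}. {result["title"]} - {result["url"]}')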
Binary file added img/http429_detection_string_in_returned_list.png
2 changes: 1 addition & 1 deletion setup.py
@@ -5,7 +5,7 @@
 
 setuptools.setup(
     name="yagooglesearch",
-    version="1.5.0",
+    version="1.6.0",
     author="Brennon Thomas",
     author_email="[email protected]",
     description="A Python library for executing intelligent, realistic-looking, and tunable Google searches.",
64 changes: 47 additions & 17 deletions yagooglesearch/__init__.py
@@ -12,7 +12,7 @@
 
 # Custom Python libraries.
 
-__version__ = "1.5.0"
+__version__ = "1.6.0"
 
 # Logging
 ROOT_LOGGER = logging.getLogger("yagooglesearch")
@@ -85,6 +85,7 @@ def __init__(
         proxy="",
         verify_ssl=True,
         verbosity=5,
+        verbose_output=False,
     ):
 
         """
@@ -116,9 +117,10 @@ def __init__(
         :param bool verify_ssl: Verify the SSL certificate to prevent traffic interception attacks. Defaults to True.
             This may need to be disabled in some HTTPS proxy instances.
         :param int verbosity: Logging and console output verbosity.
+        :param bool verbose_output: False (only URLs) or True (rank, title, description, and URL). Defaults to False.
 
         :rtype: List of str
-        :return: List of found URLs.
+        :return: List of URLs found or list of {"rank", "title", "description", "url"}
         """
 
         self.query = urllib.parse.quote_plus(query)
@@ -139,6 +141,7 @@
         self.proxy = proxy
         self.verify_ssl = verify_ssl
         self.verbosity = verbosity
+        self.verbose_output = verbose_output
 
         # Assign log level.
         ROOT_LOGGER.setLevel((6 - self.verbosity) * 10)
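
As an aside, the `(6 - self.verbosity) * 10` expression in the context lines above maps the verbosity knob onto the standard `logging` levels; a quick illustrative check (not part of the diff):

import logging

# verbosity=5 -> 10 (DEBUG), 4 -> 20 (INFO), 3 -> 30 (WARNING),
# 2 -> 40 (ERROR), 1 -> 50 (CRITICAL).
for verbosity in range(1, 6):
    print(verbosity, logging.getLevelName((6 - verbosity) * 10))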
@@ -394,11 +397,11 @@ def search(self):
         """Start the Google search.
 
         :rtype: List of str
-        :return: List of URLs found
+        :return: List of URLs found or list of {"rank", "title", "description", "url"}
         """
 
-        # Set of URLs for the results found.
-        unique_urls_set = set()
+        # Consolidate search results.
+        self.search_result_list = []
 
         # Count the number of valid, non-duplicate links found.
         total_valid_links_found = 0
@@ -450,9 +453,8 @@ def search(self):
             # HTTP 429 message returned from get_page() function, add "HTTP_429_DETECTED" to the set and return to the
             # calling script.
             if html == "HTTP_429_DETECTED":
-                unique_urls_set.add("HTTP_429_DETECTED")
-                self.unique_urls_list = list(unique_urls_set)
-                return self.unique_urls_list
+                self.search_result_list.append("HTTP_429_DETECTED")
+                return self.search_result_list
 
             # Create the BeautifulSoup object.
             soup = BeautifulSoup(html, "html.parser")
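
Since the sentinel is now appended to the same list as real results, a calling script can detect it directly; a hedged sketch, assuming `client` was built as in the README example:

results = client.search()

# The sentinel is a plain string, so membership testing works in both
# output modes (dict entries never compare equal to the string).
if "HTTP_429_DETECTED" in results:
    results.remove("HTTP_429_DETECTED")
    # Google is rate limiting; cool off or rotate proxies before searching again.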
@@ -486,32 +488,60 @@ if not link:
                 if not link:
                     continue
 
+                if self.verbose_output:
+
+                    # Extract the URL title.
+                    try:
+                        title = a.get_text()
+                    except Exception:
+                        ROOT_LOGGER.warning(f"No title for link: {link}")
+                        title = ""
+
+                    # Extract the URL description.
+                    try:
+                        description = a.parent.parent.contents[1].get_text()
+
+                        # Sometimes Google returns different structures.
+                        if description == "":
+                            description = a.parent.parent.contents[2].get_text()
+
+                    except Exception:
+                        ROOT_LOGGER.warning(f"No description for link: {link}")
+                        description = ""
+
                 # Check if URL has already been found.
-                if link not in unique_urls_set:
+                if link not in self.search_result_list:
 
                     # Increase the counters.
                     valid_links_found_in_this_search += 1
                     total_valid_links_found += 1
 
                     ROOT_LOGGER.info(f"Found unique URL #{total_valid_links_found}: {link}")
-                    unique_urls_set.add(link)
+
+                    if self.verbose_output:
+                        self.search_result_list.append(
+                            {
+                                "rank": total_valid_links_found,  # Approximate rank according to yagooglesearch.
+                                "title": title.strip(),  # Remove leading and trailing spaces.
+                                "description": description.strip(),  # Remove leading and trailing spaces.
+                                "url": link,
+                            }
+                        )
+                    else:
+                        self.search_result_list.append(link)
 
                 else:
                     ROOT_LOGGER.info(f"Duplicate URL found: {link}")
 
                 # If we reached the limit of requested URLS, return with the results.
-                if self.max_search_result_urls_to_return <= len(unique_urls_set):
-                    # Convert to a list.
-                    self.unique_urls_list = list(unique_urls_set)
-                    return self.unique_urls_list
+                if self.max_search_result_urls_to_return <= len(self.search_result_list):
+                    return self.search_result_list
 
             # Determining if a "Next" URL page of results is not straightforward. If no valid links are found, the
             # search results have been exhausted.
             if valid_links_found_in_this_search == 0:
                 ROOT_LOGGER.info("No valid search results found on this page. Moving on...")
-                # Convert to a list.
-                self.unique_urls_list = list(unique_urls_set)
-                return self.unique_urls_list
+                return self.search_result_list
 
             # Bump the starting page URL parameter for the next request.
             self.start += self.num
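
One caveat: with `verbose_output=True` the list holds dicts, so the `link not in self.search_result_list` check above compares a URL string against dicts and never flags duplicates. A calling script that needs strict deduplication can key on the `url` field; a sketch (the helper name is hypothetical):

def deduplicate_on_url(results):
    """Drop repeated URLs from a search() result list (either output mode)."""
    seen_urls = set()
    unique_results = []
    for result in results:
        # Verbose entries are dicts; plain entries are the URL strings themselves.
        url = result["url"] if isinstance(result, dict) else result
        if url not in seen_urls:
            seen_urls.add(url)
            unique_results.append(result)
    return unique_results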
