Add output param #9

Merged · 9 commits · Jan 21, 2022
7 changes: 6 additions & 1 deletion README.md
@@ -61,8 +61,9 @@ client = yagooglesearch.SearchClient(
     max_search_result_urls_to_return=100,
     http_429_cool_off_time_in_minutes=45,
     http_429_cool_off_factor=1.5,
-    proxy="socks5h://127.0.0.1:9050",
+    # proxy="socks5h://127.0.0.1:9050",
     verbosity=5,
+    verbose_output=True,  # False (only URLs) or True (rank, title, description, and URL)
 )
 client.assign_random_user_agent()
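
For a quick sense of what the new parameter changes at the call site, here is a minimal sketch. The query string and printed fields are illustrative; the dict keys come from the diff to yagooglesearch/__init__.py below.

import yagooglesearch

# A minimal sketch, assuming the README example above.
client = yagooglesearch.SearchClient(
    "site:github.com yagooglesearch",  # Illustrative query.
    verbose_output=True,
)
client.assign_random_user_agent()

for result in client.search():
    # With verbose_output=True each result is a dict; with the default of
    # False, search() returns plain URL strings instead.
    print(result["rank"], result["title"], result["url"], sep=" | ")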

@@ -286,3 +287,7 @@ Project Link: [https://github.com/opsdisk/yagooglesearch](https://github.com/opsdisk/yagooglesearch)
 
 * [Mario Vilas](https://github.com/MarioVilas) for his amazing work on the original
   [googlesearch](https://github.com/MarioVilas/googlesearch) library.
+
+## Contributors
+
+* [KennBro](https://github.com/KennBro) - <https://github.com/opsdisk/yagooglesearch/pull/9>
Binary file added img/http429_detection_string_in_returned_list.png
2 changes: 1 addition & 1 deletion setup.py
@@ -5,7 +5,7 @@
 
 setuptools.setup(
     name="yagooglesearch",
-    version="1.5.0",
+    version="1.6.0",
     author="Brennon Thomas",
     author_email="[email protected]",
     description="A Python library for executing intelligent, realistic-looking, and tunable Google searches.",
64 changes: 47 additions & 17 deletions yagooglesearch/__init__.py
@@ -12,7 +12,7 @@
 
 # Custom Python libraries.
 
-__version__ = "1.5.0"
+__version__ = "1.6.0"
 
 # Logging
 ROOT_LOGGER = logging.getLogger("yagooglesearch")
@@ -85,6 +85,7 @@ def __init__(
         proxy="",
         verify_ssl=True,
         verbosity=5,
+        verbose_output=False,
     ):
 
         """
@@ -116,9 +117,10 @@ def __init__(
         :param bool verify_ssl: Verify the SSL certificate to prevent traffic interception attacks. Defaults to True.
             This may need to be disabled in some HTTPS proxy instances.
         :param int verbosity: Logging and console output verbosity.
+        :param bool verbose_output: False (only URLs) or True (rank, title, description, and URL). Defaults to False.
 
         :rtype: List of str
-        :return: List of found URLs.
+        :return: List of URLs found, or a list of {"rank", "title", "description", "url"} dicts.
         """
 
         self.query = urllib.parse.quote_plus(query)
@@ -139,6 +141,7 @@ def __init__(
         self.proxy = proxy
         self.verify_ssl = verify_ssl
         self.verbosity = verbosity
+        self.verbose_output = verbose_output
 
         # Assign log level.
         ROOT_LOGGER.setLevel((6 - self.verbosity) * 10)
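
As an aside on the unchanged setLevel() line above: the (6 - verbosity) * 10 expression maps the documented verbosity range 1-5 onto the standard logging levels. A quick way to see the mapping, using only the standard library:

import logging

# (6 - verbosity) * 10: verbosity=5 -> 10 (DEBUG) ... verbosity=1 -> 50 (CRITICAL).
for verbosity in range(1, 6):
    print(verbosity, "->", logging.getLevelName((6 - verbosity) * 10))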
@@ -394,11 +397,11 @@ def search(self):
         """Start the Google search.
 
         :rtype: List of str
-        :return: List of URLs found
+        :return: List of URLs found, or a list of {"rank", "title", "description", "url"} dicts.
         """
 
-        # Set of URLs for the results found.
-        unique_urls_set = set()
+        # Consolidate search results.
+        self.search_result_list = []
 
         # Count the number of valid, non-duplicate links found.
         total_valid_links_found = 0
@@ -450,9 +453,8 @@ def search(self):
             # HTTP 429 message returned from get_page() function, add "HTTP_429_DETECTED" to the set and return to the
             # calling script.
             if html == "HTTP_429_DETECTED":
-                unique_urls_set.add("HTTP_429_DETECTED")
-                self.unique_urls_list = list(unique_urls_set)
-                return self.unique_urls_list
+                self.search_result_list.append("HTTP_429_DETECTED")
+                return self.search_result_list
 
             # Create the BeautifulSoup object.
             soup = BeautifulSoup(html, "html.parser")
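
Since the sentinel now travels in the same list as the results, a caller has to check for it before consuming the list. A minimal caller-side sketch (the cool-off message is illustrative, not part of this PR):

results = client.search()

# The sentinel is appended as a plain string in both output modes, so this
# membership test works whether the list holds URLs or result dicts.
if "HTTP_429_DETECTED" in results:
    results.remove("HTTP_429_DETECTED")
    print("Google returned HTTP 429; cool off before searching again.")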
@@ -486,32 +488,60 @@ def search(self):
                 if not link:
                     continue
 
+                if self.verbose_output:
+
+                    # Extract the URL title.
+                    try:
+                        title = a.get_text()
+                    except Exception:
+                        ROOT_LOGGER.warning(f"No title for link: {link}")
+                        title = ""
+
+                    # Extract the URL description.
+                    try:
+                        description = a.parent.parent.contents[1].get_text()
+
+                        # Sometimes Google returns different structures.
+                        if description == "":
+                            description = a.parent.parent.contents[2].get_text()
+
+                    except Exception:
+                        ROOT_LOGGER.warning(f"No description for link: {link}")
+                        description = ""
+
                 # Check if URL has already been found.
-                if link not in unique_urls_set:
+                if link not in self.search_result_list:
 
                     # Increase the counters.
                     valid_links_found_in_this_search += 1
                     total_valid_links_found += 1
 
                     ROOT_LOGGER.info(f"Found unique URL #{total_valid_links_found}: {link}")
-                    unique_urls_set.add(link)
+
+                    if self.verbose_output:
+                        self.search_result_list.append(
+                            {
+                                "rank": total_valid_links_found,  # Approximate rank according to yagooglesearch.
+                                "title": title.strip(),  # Remove leading and trailing spaces.
+                                "description": description.strip(),  # Remove leading and trailing spaces.
+                                "url": link,
+                            }
+                        )
+                    else:
+                        self.search_result_list.append(link)
 
                 else:
                     ROOT_LOGGER.info(f"Duplicate URL found: {link}")
 
                 # If we reached the limit of requested URLs, return with the results.
-                if self.max_search_result_urls_to_return <= len(unique_urls_set):
-                    # Convert to a list.
-                    self.unique_urls_list = list(unique_urls_set)
-                    return self.unique_urls_list
+                if self.max_search_result_urls_to_return <= len(self.search_result_list):
+                    return self.search_result_list
 
             # Determining if a "Next" page of results exists is not straightforward. If no valid links are found,
             # the search results have been exhausted.
             if valid_links_found_in_this_search == 0:
                 ROOT_LOGGER.info("No valid search results found on this page. Moving on...")
-                # Convert to a list.
-                self.unique_urls_list = list(unique_urls_set)
-                return self.unique_urls_list
+                return self.search_result_list
 
             # Bump the starting page URL parameter for the next request.
             self.start += self.num
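
One caveat in the new duplicate check above: when verbose_output is True, self.search_result_list holds dicts, so the "link not in self.search_result_list" test compares a string against dicts and never matches, which means duplicate URLs are not filtered out in verbose mode. A sketch of one way to restore deduplication (not part of this PR) is to track seen URLs in a separate set:

# Sketch: deduplicate with a separate set so the check works in both output
# modes. candidate_links stands in for the URLs extracted from a results page.
seen_urls = set()
search_result_list = []
candidate_links = [
    "https://example.com/a",
    "https://example.com/a",  # Duplicate that the set-based check filters out.
    "https://example.com/b",
]

for link in candidate_links:
    if link in seen_urls:
        continue
    seen_urls.add(link)
    search_result_list.append(link)  # Or the verbose dict, in verbose mode.

print(search_result_list)  # ['https://example.com/a', 'https://example.com/b']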