Add output param #9

Merged: 9 commits, Jan 21, 2022
1 change: 1 addition & 0 deletions README.md
@@ -63,6 +63,7 @@ client = yagooglesearch.SearchClient(
http_429_cool_off_factor=1.5,
proxy="socks5h://127.0.0.1:9050",
verbosity=5,
output="complete" # "normal" : Only url list // "complete" : List of {title, desc, url}
)
client.assign_random_user_agent()
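
For illustration, a minimal sketch of consuming both output modes; it assumes the SearchClient constructor takes the search query as its first positional argument (as in the project README) and leaves the other options at their defaults:

import yagooglesearch

# "normal" (default): search() returns a list of URL strings.
client = yagooglesearch.SearchClient("site:github.com")
urls = client.search()

# "complete": search() returns a list of {"title", "desc", "url"} dicts.
client = yagooglesearch.SearchClient("site:github.com", output="complete")
for result in client.search():
    print(result["url"], "-", result["title"])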

48 changes: 42 additions & 6 deletions yagooglesearch/__init__.py
@@ -84,6 +84,7 @@ def __init__(
proxy="",
verify_ssl=True,
verbosity=5,
output="normal",
):

"""
@@ -112,6 +113,7 @@ def __init__(
:param bool verify_ssl: Verify the SSL certificate to prevent traffic interception attacks. Defaults to True.
This may need to be disabled in some HTTPS proxy instances.
:param int verbosity: Logging and console output verbosity.
:param str output: "normal" (URLs only) or "complete" (title, description, and URL for each result).

:rtype: List of str
:return: List of found URLs.
@@ -134,6 +136,7 @@ def __init__(
self.proxy = proxy
self.verify_ssl = verify_ssl
self.verbosity = verbosity
self.output = output

# Assign log level.
ROOT_LOGGER.setLevel((6 - self.verbosity) * 10)
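
# For reference, the formula above maps verbosity onto the standard logging levels:
#   verbosity=5 -> (6 - 5) * 10 = 10 (logging.DEBUG)
#   verbosity=4 -> 20 (logging.INFO)
#   verbosity=3 -> 30 (logging.WARNING)
#   verbosity=2 -> 40 (logging.ERROR)
#   verbosity=1 -> 50 (logging.CRITICAL)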
@@ -379,11 +382,12 @@ def search(self):
"""Start the Google search.

:rtype: list
:return: List of URLs found, or a list of {"title", "desc", "url"} dicts when output="complete".
"""

# Set of URLs for the results found.
unique_urls_set = set()
# List of {"title", "desc", "url"} dicts, populated when output="complete".
unique_complete_result = []

# Count the number of valid, non-duplicate links found.
total_valid_links_found = 0
@@ -461,6 +465,25 @@ def search(self):
ROOT_LOGGER.warning(f"No href for link: {link}")
continue

if self.output == "complete":
    # Extract the result title from the anchor tag's text.
    try:
        title = a.get_text()
    except Exception:
        ROOT_LOGGER.warning(f"No title found for link: {link}")
        title = ""
Owner:

With this continue, it will go to the top of the for loop and not populate a "complete" dictionary for the result found. So setting title="" does nothing.

The logic is: if there is no title (same for the desc found later in the code), that search result is discarded. Are you looking to have a result populated even if the title or desc isn't found? I'd say yes (option 1), but wanted to confirm.

  1. Option 1: even if not all values are populated, the dictionary is still added to the unique_complete_result list.
{
    "url": "https://github.com",
    "title": "GitHub site",
    "desc": "",
}
  2. Option 2: if not all values are populated, discard the result and don't return it as part of the unique_complete_result list.
{
    "url": "https://example.com",
    "title": "",
    "desc": "Example site description",
}

Contributor Author:

I need it to behave like option 1.
You are right, both 'continue' statements interfere with the behavior of option 1. I would only discard a result if the link does not exist.
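
A minimal sketch of the agreed option 1 handling, using a hypothetical build_complete_result helper (not part of the PR); only a missing link discards a result, while a missing title or desc falls back to an empty string:

def build_complete_result(a, link):
    """Option 1: always return a dict; absent fields become ""."""
    try:
        title = a.get_text()
    except Exception:
        title = ""
    try:
        desc = a.parent.parent.contents[1].get_text()
    except Exception:
        desc = ""
    return {"title": title, "desc": desc, "url": link}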


try:
    desc = a.parent.parent.contents[1].get_text()
    # Sometimes Google returns a different page structure.
    if desc == "":
        desc = a.parent.parent.contents[2].get_text()
except Exception:
    ROOT_LOGGER.warning(f"No description found for link: {link}")
    desc = ""
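
For illustration only, a toy example of the markup shape that the a.parent.parent.contents[...] traversal assumes (the HTML here is made up; Google's real result markup varies):

from bs4 import BeautifulSoup

html = (
    '<div>'
    '<div><a href="https://example.com">Example title</a></div>'
    '<div>Example site description</div>'
    '</div>'
)
a = BeautifulSoup(html, "html.parser").find("a")
# a.parent is the anchor's <div>; a.parent.parent is the outer <div>,
# whose contents[1] is the sibling <div> holding the description.
print(a.parent.parent.contents[1].get_text())  # Example site description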

# Filter invalid links and links pointing to Google itself.
link = self.filter_search_result_urls(link)
if not link:
@@ -476,11 +499,20 @@ def search(self):
ROOT_LOGGER.info(f"Found unique URL #{total_valid_links_found}: {link}")
unique_urls_set.add(link)

if self.output == "complete":
    unique_complete_result.append({"title": title, "desc": desc, "url": link})


# If we reached the limit of requested URLs, return with the results.
if self.max_search_result_urls_to_return <= len(unique_urls_set):
    if self.output == "complete":
        return unique_complete_result
    else:
        # Convert to a list.
        self.unique_urls_list = list(unique_urls_set)
        return self.unique_urls_list

# See comment for the "valid_links_found_in_this_search" variable. This is because determining if a "Next"
# page of results exists is not straightforward. For example, this can happen if
@@ -492,8 +524,12 @@ def search(self):
"any search results on the next page either. Moving on..."
)
if self.output == "complete":
    return unique_complete_result
else:
    # Convert to a list.
    self.unique_urls_list = list(unique_urls_set)
    return self.unique_urls_list

# Bump the starting page URL parameter for the next request.
self.start += self.num