From 4e3c7ecc8e4ef40b4bc589cbab52ef2d34155b08 Mon Sep 17 00:00:00 2001 From: Max Maisel Date: Fri, 1 Jun 2018 15:18:13 +0200 Subject: [PATCH 01/29] Started implementation of fake_browser and distributor base classes. --- kicost/distributors/distributor.py | 211 +++++++++++++++++++++++++++ kicost/distributors/fake_browser.py | 215 ++++++++++++++++++++++++++++ 2 files changed, 426 insertions(+) create mode 100644 kicost/distributors/distributor.py create mode 100644 kicost/distributors/fake_browser.py diff --git a/kicost/distributors/distributor.py b/kicost/distributors/distributor.py new file mode 100644 index 000000000..5510ac4b3 --- /dev/null +++ b/kicost/distributors/distributor.py @@ -0,0 +1,211 @@ +# MIT license +# +# Copyright (C) 2018 by XESS Corporation / Max Maisel +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in +# all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +# THE SOFTWARE. + +# Author information. +__author__ = 'Max Maisel' +__webpage__ = 'https://github.com/mmmaisel/' + +# Libraries. +import sys +from bs4 import BeautifulSoup # XML file interpreter. +import multiprocessing # To deal with the parallel scrape. +import logging +import time +from random import choice +from ..eda_tools.eda_tools import order_refs # To better print the warnings about the parts. + +from . import fake_browser + +import http.client # For web scraping exceptions. +try: + # This is for Python 3. + from urllib.parse import urlencode, quote_plus as urlquote, urlsplit, urlunsplit + from urllib.request import urlopen, Request + import urllib.error + WEB_SCRAPE_EXCEPTIONS = (urllib.error.URLError, http.client.HTTPException) +except ImportError: + # This is for Python 2. + from urlparse import urlsplit, urlunsplit + from urllib import urlencode, quote_plus as urlquote + from urllib2 import urlopen, Request + import urllib2 + WEB_SCRAPE_EXCEPTIONS = (urllib2.URLError, http.client.HTTPException) + +from ..globals import logger, DEBUG_OVERVIEW, DEBUG_DETAILED, DEBUG_OBSESSIVE # Debug configurations. +from ..globals import SEPRTR +from ..globals import PartHtmlError +from . 
import distributor_dict + +import os, re + +class distributor: + def __init__(self, scrape_retries, log_level, throttle_delay): + self.name = None + self.page_accessed = False + self.scrape_retries = scrape_retries + self.logger = logger + self.log_level = log_level + self.throttle_delay = throttle_delay + self.throttle_timeout = time.time() + self.domain = None + self.browser = fake_browser.fake_browser(self.logger, self.scrape_retries) + + # Abstract methods, implemented in distributor specific modules + def dist_get_part_html_tree(self, pn, extra_search_terms, url, descend): + raise NotImplementedError() + + def dist_get_part_num(self, html_tree): + raise NotImplementedError() + + def dist_get_qty_avail(self, html_tree): + raise NotImplementedError() + + def dist_get_price_tiers(self, html_tree): + raise NotImplementedError() + + def dist_get_extra_info(self, html_tree): + raise NotImplementedError() + + def dist_define_locale_currency(self, locale, currency): + raise NotImplementedError() + + def define_locale_currency(self, locale_currency='USD'): + '''@brief Configure the distributor for some locale/country and + currency second ISO3166 and ISO4217 + + @param `str` Alpha 2 country or alpha 3 currency or even one slash other.''' + try: + if distributor_dict[self.name]['scrape'] == 'web': + # Not make sense to configurate a local distributor (yet). + locale_currency = re.findall('\w{2,}', locale_currency) + locale = None + currency = None + for alpha in locale_currency: + if len(alpha)==2: + locale = alpha + elif len(alpha)==3: + currency = alpha + self.dist_define_locale_currency(locale, currency) + except NotImplementedError: + logger.warning('No currency/country configuration for {}.'.format(self.name)) + pass + + def scrape_part(self, id, part, local_part_html): + '''@brief Scrape the data for a part from each distributor website or local HTML. + + Use distributors submodules to scrape each distributor part page and get + informations such as price, quantity avaliable and others; + + @param `int` Count of the main loop. + @param `str` String with the part number / distributor stock. + @param `str` Local part HTML + @return id, distributor_name, url, `str` distributor stock part number, + `dict` price tiers, `int` qty avail, `dict` extrainfo dist + ''' + + if multiprocessing.current_process().name == "MainProcess": + self.logger = logging.getLogger('kicost') + else: + self.logger = multiprocessing.get_logger() + handler = logging.StreamHandler(sys.stdout) + handler.setLevel(self.log_level) + self.logger.addHandler(handler) + self.logger.setLevel(self.log_level) + self.browser.logger = self.logger + + url = {} + part_num = {} + qty_avail = {} + price_tiers = {} + info_dist = {} + + if distributor_dict[self.name]['scrape']=='web': + if self.page_accessed == True: + # Check the throttling timeout for the chosen distributor to see if + # another access to its website is allowed. + if self.throttle_timeout > time.time(): + time.sleep(self.throttle_timeout - time.time()) + + # Update the timeout for this distributor website and release the sync. lock. + self.throttle_timeout = time.time() + self.throttle_delay + # Founded manufacturer / distributor code valid (not empty). + else: + self.logger.log(DEBUG_OBSESSIVE,'No delay for %s, type=%s' \ + % (self.name, distributor_dict[self.name]['scrape'])) + + # Get the HTML tree for the part. + html_tree, url = self.get_part_html_tree(part, local_part_html=local_part_html) + + # Call the functions that extract the data from the HTML tree. 
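+        # These dist_get_*() calls are the abstract hooks declared above; each
+        # distributor subclass (e.g. dist_mouser in a later patch of this series)
+        # supplies its own page-parsing implementation.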
+ part_num = self.dist_get_part_num(html_tree) + qty_avail = self.dist_get_qty_avail(html_tree) + price_tiers = self.dist_get_price_tiers(html_tree) + + try: + # Get extra characeristics of the part in the web page. + # This will be use to comment in the 'cat#' column of the + # spreadsheet and some validations (in the future implementaions) + info_dist = self.dist_get_extra_info(html_tree) + except: + info_dist = {} + pass + + # Return the part data. + return id, self.name, url, part_num, price_tiers, qty_avail, info_dist + + def get_part_html_tree(self, part, local_part_html): + '''@brief Get the HTML tree for a part. + + Get the HTML tree for a part from the given distributor website or local HTML. + @param `str` part Part manufactor code or distributor stock code. + @param `str` local_part_html + @return `str` with the HTML webpage.''' + + self.logger.log(DEBUG_OBSESSIVE, 'Looking in %s by %s:', self.name, order_refs(part.refs, True)) + + for extra_search_terms in set([part.fields.get('manf', ''), '']): + try: + # Search for part information using one of the following: + # 1) the distributor's catalog number. + # 2) the manufacturer's part number. + for key in (self.name+'#', self.name+SEPRTR+'cat#', 'manf#'): + if key in part.fields: + if part.fields[key]: + self.page_accessed = True + return self.dist_get_part_html_tree \ + (part.fields[key], extra_search_terms, local_part_html=local_part_html) + # No distributor or manufacturer number, so give up. + else: + self.page_accessed = False + self.logger.warning("No '%s#' or 'manf#' field: cannot lookup part %s at %s.", \ + self.name, part.refs, self.name) + return BeautifulSoup('', 'lxml'), '' + #raise PartHtmlError + except PartHtmlError: + pass + except AttributeError: + break + self.logger.warning("Part %s not found at %s.", order_refs(part.refs, False), self.name) + # If no HTML page was found, then return a tree for an empty page. + return BeautifulSoup('', 'lxml'), '' + + diff --git a/kicost/distributors/fake_browser.py b/kicost/distributors/fake_browser.py new file mode 100644 index 000000000..7477cdec8 --- /dev/null +++ b/kicost/distributors/fake_browser.py @@ -0,0 +1,215 @@ +# -*- coding: utf-8 -*- +# MIT license +# +# Copyright (C) 2018 by XESS Corporation / Hildo Guillardi Junior / Max Maisel +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in +# all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +# THE SOFTWARE. + +__author__ = 'XESS Corporation' +__email__ = 'info@xess.com' + +from random import choice + +import http.client # For web scraping exceptions. 
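+# Cookie support: each fake_browser instance keeps its own cookie jar, so every
+# distributor gets a stateful scraping session of its own.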
+import http.cookiejar + +from ..globals import DEBUG_OVERVIEW, DEBUG_DETAILED, DEBUG_OBSESSIVE + +try: + # This is for Python 3 + from urllib.parse import urlencode, quote_plus as urlquote, urlsplit, urlunsplit + from urllib.request import urlopen, Request + import urllib.error + WEB_SCRAPE_EXCEPTIONS = (urllib.error.URLError, http.client.HTTPException) +except ImportError: + # This is for Python 2 + from urlparse import urlsplit, urlunsplit + from urllib import urlencode, quote_plus as urlquote + from urllib2 import urlopen, Request + import urllib2 + WEB_SCRAPE_EXCEPTIONS = (urllib2.URLError, http.client.HTTPException) + + +def get_user_agent(): + ''' The default user_agent_list comprises chrome, IE, firefox, Mozilla, opera, netscape. + You can find more user agent strings at https://techblog.willshouse.com/2012/01/03/most-common-user-agents/. + Used for the function `fake_browser(url, retries)` + ''' + user_agent_list = [ + "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.94 Safari/537.36", + "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.94 Safari/537.36", + "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36", + "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:57.0) Gecko/20100101 Firefox/57.0", + "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.94 Safari/537.36", + "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_1) AppleWebKit/604.3.5 (KHTML, like Gecko) Version/11.0.1 Safari/604.3.5", + "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.94 Safari/537.36", + "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36", + "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36", + "Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:57.0) Gecko/20100101 Firefox/57.0", + "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:56.0) Gecko/20100101 Firefox/56.0", + "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) AppleWebKit/604.3.5 (KHTML, like Gecko) Version/11.0.1 Safari/604.3.5", + "Mozilla/5.0 (X11; Linux x86_64; rv:57.0) Gecko/20100101 Firefox/57.0", + "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.89 Safari/537.36", + "Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; rv:11.0) like Gecko", + "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.94 Safari/537.36", + "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.94 Safari/537.36", + "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/52.0.2743.116 Safari/537.36 Edge/15.15063", + "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.94 Safari/537.36", + "Mozilla/5.0 (Windows NT 6.3; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.94 Safari/537.36", + "Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:56.0) Gecko/20100101 Firefox/56.0", + "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.13; rv:57.0) Gecko/20100101 Firefox/57.0", + "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:56.0) Gecko/20100101 Firefox/56.0", + "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36 Edge/16.16299", + 
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10.12; rv:57.0) Gecko/20100101 Firefox/57.0", + "Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:52.0) Gecko/20100101 Firefox/52.0", + "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.89 Safari/537.36", + "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36", + "Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; rv:11.0) like Gecko", + "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.89 Safari/537.36", + "Mozilla/5.0 (X11; Linux x86_64; rv:52.0) Gecko/20100101 Firefox/52.0", + "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.89 Safari/537.36 OPR/49.0.2725.47", + "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:57.0) Gecko/20100101 Firefox/57.0", + "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36", + "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6) AppleWebKit/604.3.5 (KHTML, like Gecko) Version/11.0.1 Safari/604.3.5", + "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.89 Safari/537.36", + "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.12; rv:56.0) Gecko/20100101 Firefox/56.0", + "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_3) AppleWebKit/600.5.17 (KHTML, like Gecko) Version/8.0.5 Safari/600.5.17", + "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13) AppleWebKit/604.1.38 (KHTML, like Gecko) Version/11.0 Safari/604.1.38", + "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.75 Safari/537.36", + "Mozilla/5.0 (Windows NT 6.3; Win64; x64; rv:57.0) Gecko/20100101 Firefox/57.0", + "Mozilla/5.0 (Windows NT 6.3; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36", + "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.75 Safari/537.36", + "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) AppleWebKit/604.1.38 (KHTML, like Gecko) Version/11.0 Safari/604.1.38", + "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:52.0) Gecko/20100101 Firefox/52.0", + "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36", + "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.11; rv:57.0) Gecko/20100101 Firefox/57.0", + "Mozilla/5.0 (Windows NT 10.0; WOW64; rv:56.0) Gecko/20100101 Firefox/56.0", + "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.94 Safari/537.36", + "Mozilla/5.0 (Windows NT 6.1; rv:57.0) Gecko/20100101 Firefox/57.0", + "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.94 Safari/537.36", + "Mozilla/5.0 (Windows NT 6.3; Win64; x64; rv:56.0) Gecko/20100101 Firefox/56.0", + "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.89 Safari/537.36", + "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36", + "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.89 Safari/537.36 OPR/49.0.2725.39", + "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:58.0) Gecko/20100101 Firefox/58.0", + "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_5) AppleWebKit/603.3.8 (KHTML, like Gecko) Version/10.1.2 Safari/603.3.8", + "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) 
AppleWebKit/603.3.8 (KHTML, like Gecko) Version/10.1.2 Safari/603.3.8", + "Mozilla/5.0 (Windows NT 6.1; rv:52.0) Gecko/20100101 Firefox/52.0", + "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.79 Safari/537.36 Edge/14.14393", + "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36 OPR/48.0.2685.52", + "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.94 Safari/537.36", + "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Ubuntu Chromium/62.0.3202.75 Chrome/62.0.3202.75 Safari/537.36", + "Mozilla/5.0 (Windows NT 10.0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.94 Safari/537.36", + "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.106 Safari/537.36", + "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.94 Safari/537.36", + "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.75 Safari/537.36", + "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.62 Safari/537.36", + "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Ubuntu Chromium/62.0.3202.94 Chrome/62.0.3202.94 Safari/537.36", + "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0; Trident/5.0)", + "Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; Touch; rv:11.0) like Gecko", + "Mozilla/5.0 (X11; Linux x86_64; rv:56.0) Gecko/20100101 Firefox/56.0", + "Mozilla/5.0 (X11; Linux x86_64; rv:58.0) Gecko/20100101 Firefox/58.0", + "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36", + "Mozilla/5.0 (iPad; CPU OS 11_1_2 like Mac OS X) AppleWebKit/604.3.5 (KHTML, like Gecko) Version/11.0 Mobile/15B202 Safari/604.1", + "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.75 Safari/537.36", + "Mozilla/5.0 (Windows NT 6.1; Trident/7.0; rv:11.0) like Gecko", + "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.0; Trident/5.0; Trident/5.0)", + "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.113 Safari/537.36", + "Mozilla/5.0 (Windows NT 6.1; rv:56.0) Gecko/20100101 Firefox/56.0", + "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36", + "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.62 Safari/537.36", + "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_5) AppleWebKit/603.2.4 (KHTML, like Gecko) Version/10.1.1 Safari/603.2.4", + "Mozilla/5.0 (Windows NT 10.0; WOW64; rv:52.0) Gecko/20100101 Firefox/52.0", + "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.13; rv:56.0) Gecko/20100101 Firefox/56.0", + "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Ubuntu Chromium/62.0.3202.89 Chrome/62.0.3202.89 Safari/537.36", + "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36", + "Mozilla/5.0 (Windows NT 6.2; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.94 Safari/537.36", + "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.89 Safari/537.36", + "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_5) AppleWebKit/537.36 (KHTML, like Gecko) 
Chrome/61.0.3163.100 Safari/537.36", + "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.94 Safari/537.36", + "Mozilla/5.0 (Windows NT 5.1; rv:52.0) Gecko/20100101 Firefox/52.0", + "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.115 Safari/537.36", + ] + return choice(user_agent_list) + +# Open the URL, read the HTML from it, and parse it into a tree structure. +class fake_browser: + def __init__(self, logger, scrape_retries): + '''@brief fake_browser + @param logger + @param scrape_retries `int` Quantity of retries in case of fail. + ''' + self.cookiejar = http.cookiejar.CookieJar() + self.userAgent = get_user_agent() + self.opener = urllib.request.build_opener(urllib.request.HTTPCookieProcessor(self.cookiejar)) + self.scrape_retries = scrape_retries + self.logger = logger + + def show_cookies(self, name): + for x in self.cookiejar: + # TODO: use logger + self.logger.log(DEBUG_OBSESSIVE,"%s Cookie %s" % (name, x.name)) + print("%s Cookie %s" % (name, x.name)) + + def add_cookie(self, domain, name, value): + self.cookiejar.set_cookie(http.cookiejar.Cookie( + version=0, + name=name, + value=value, + port=None, + port_specified=False, + domain=domain, + domain_specified=True, + domain_initial_dot=False, + path="/", + path_specified=False, + secure=False, + expires=None, + discard=False, + comment=None, + comment_url=None, + rest=None)) + + def scrape_URL(self, url, add_header=None): + for _ in range(self.scrape_retries): + try: + req = Request(url) + if add_header: + req.add_header(add_header) + req.add_header('User-agent', self.userAgent) + req.add_header('Accept', 'text/html') + req.add_header('Accept-Language', 'en-US') + req.add_header('Accept-Encoding', 'identity') + response = self.opener.open(req, timeout=10) + html = response.read() + break + #except WEB_SCRAPE_EXCEPTIONS: + except Exception as ex: + # TODO: remove print + print('Exception of type "%s" while web-scraping %s' \ + % (type(ex).__name__, format(url))) + self.logger.log(DEBUG_DETAILED,'Exception of type "%s" while web-scraping %s' \ + % (type(ex).__name__, format(url))) + pass + else: + # TODO: remove print + print('No page') + raise ValueError('No page') + return html + From 54e1a8db7fc9a4facc7ba817443a585a920e03d0 Mon Sep 17 00:00:00 2001 From: Max Maisel Date: Fri, 1 Jun 2018 15:18:56 +0200 Subject: [PATCH 02/29] Started refactoring as discussed in issue #242. Goals are: - Add stateful user-agent and cookie handling in fake_browser - Scrape parts in distrtibutor then part order - Use a class inheritance approach to distributor modules, this allows adding state information and reduces the amount of variabled passed around. - One scraping thread per distributor, simplify locking Implemented in this commit: - Used class approach to distributors and fake_browser - Parts are scraped in distributor -> part order - One (IO limited) python thread per distributor - Simplified locking --- kicost/distributors/__init__.py | 143 --------------------------- kicost/kicost.py | 168 ++++++++++++++++++-------------- 2 files changed, 93 insertions(+), 218 deletions(-) diff --git a/kicost/distributors/__init__.py b/kicost/distributors/__init__.py index 85e47b3bd..666d7388e 100644 --- a/kicost/distributors/__init__.py +++ b/kicost/distributors/__init__.py @@ -24,149 +24,6 @@ __author__ = 'XESS Corporation' __email__ = 'info@xess.com' -from random import choice - -import http.client # For web scraping exceptions. 
-try: - # This is for Python 3 - from urllib.parse import urlencode, quote_plus as urlquote, urlsplit, urlunsplit - from urllib.request import urlopen, Request - import urllib.error - WEB_SCRAPE_EXCEPTIONS = (urllib.error.URLError, http.client.HTTPException) -except ImportError: - # This is for Python 2 - from urlparse import urlsplit, urlunsplit - from urllib import urlencode, quote_plus as urlquote - from urllib2 import urlopen, Request - import urllib2 - WEB_SCRAPE_EXCEPTIONS = (urllib2.URLError, http.client.HTTPException) - - -def get_user_agent(): - ''' The default user_agent_list comprises chrome, IE, firefox, Mozilla, opera, netscape. - You can find more user agent strings at https://techblog.willshouse.com/2012/01/03/most-common-user-agents/. - Used for the function `fake_browser(url, retries)` - ''' - user_agent_list = [ - "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.94 Safari/537.36", - "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.94 Safari/537.36", - "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36", - "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:57.0) Gecko/20100101 Firefox/57.0", - "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.94 Safari/537.36", - "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_1) AppleWebKit/604.3.5 (KHTML, like Gecko) Version/11.0.1 Safari/604.3.5", - "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.94 Safari/537.36", - "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36", - "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36", - "Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:57.0) Gecko/20100101 Firefox/57.0", - "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:56.0) Gecko/20100101 Firefox/56.0", - "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) AppleWebKit/604.3.5 (KHTML, like Gecko) Version/11.0.1 Safari/604.3.5", - "Mozilla/5.0 (X11; Linux x86_64; rv:57.0) Gecko/20100101 Firefox/57.0", - "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.89 Safari/537.36", - "Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; rv:11.0) like Gecko", - "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.94 Safari/537.36", - "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.94 Safari/537.36", - "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/52.0.2743.116 Safari/537.36 Edge/15.15063", - "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.94 Safari/537.36", - "Mozilla/5.0 (Windows NT 6.3; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.94 Safari/537.36", - "Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:56.0) Gecko/20100101 Firefox/56.0", - "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.13; rv:57.0) Gecko/20100101 Firefox/57.0", - "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:56.0) Gecko/20100101 Firefox/56.0", - "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36 Edge/16.16299", - "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.12; rv:57.0) Gecko/20100101 Firefox/57.0", - "Mozilla/5.0 
(Windows NT 6.1; Win64; x64; rv:52.0) Gecko/20100101 Firefox/52.0", - "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.89 Safari/537.36", - "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36", - "Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; rv:11.0) like Gecko", - "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.89 Safari/537.36", - "Mozilla/5.0 (X11; Linux x86_64; rv:52.0) Gecko/20100101 Firefox/52.0", - "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.89 Safari/537.36 OPR/49.0.2725.47", - "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:57.0) Gecko/20100101 Firefox/57.0", - "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36", - "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6) AppleWebKit/604.3.5 (KHTML, like Gecko) Version/11.0.1 Safari/604.3.5", - "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.89 Safari/537.36", - "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.12; rv:56.0) Gecko/20100101 Firefox/56.0", - "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_3) AppleWebKit/600.5.17 (KHTML, like Gecko) Version/8.0.5 Safari/600.5.17", - "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13) AppleWebKit/604.1.38 (KHTML, like Gecko) Version/11.0 Safari/604.1.38", - "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.75 Safari/537.36", - "Mozilla/5.0 (Windows NT 6.3; Win64; x64; rv:57.0) Gecko/20100101 Firefox/57.0", - "Mozilla/5.0 (Windows NT 6.3; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36", - "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.75 Safari/537.36", - "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) AppleWebKit/604.1.38 (KHTML, like Gecko) Version/11.0 Safari/604.1.38", - "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:52.0) Gecko/20100101 Firefox/52.0", - "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36", - "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.11; rv:57.0) Gecko/20100101 Firefox/57.0", - "Mozilla/5.0 (Windows NT 10.0; WOW64; rv:56.0) Gecko/20100101 Firefox/56.0", - "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.94 Safari/537.36", - "Mozilla/5.0 (Windows NT 6.1; rv:57.0) Gecko/20100101 Firefox/57.0", - "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.94 Safari/537.36", - "Mozilla/5.0 (Windows NT 6.3; Win64; x64; rv:56.0) Gecko/20100101 Firefox/56.0", - "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.89 Safari/537.36", - "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36", - "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.89 Safari/537.36 OPR/49.0.2725.39", - "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:58.0) Gecko/20100101 Firefox/58.0", - "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_5) AppleWebKit/603.3.8 (KHTML, like Gecko) Version/10.1.2 Safari/603.3.8", - "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) AppleWebKit/603.3.8 (KHTML, like Gecko) Version/10.1.2 Safari/603.3.8", - "Mozilla/5.0 (Windows NT 6.1; rv:52.0) 
Gecko/20100101 Firefox/52.0", - "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.79 Safari/537.36 Edge/14.14393", - "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36 OPR/48.0.2685.52", - "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.94 Safari/537.36", - "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Ubuntu Chromium/62.0.3202.75 Chrome/62.0.3202.75 Safari/537.36", - "Mozilla/5.0 (Windows NT 10.0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.94 Safari/537.36", - "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.106 Safari/537.36", - "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.94 Safari/537.36", - "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.75 Safari/537.36", - "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.62 Safari/537.36", - "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Ubuntu Chromium/62.0.3202.94 Chrome/62.0.3202.94 Safari/537.36", - "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0; Trident/5.0)", - "Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; Touch; rv:11.0) like Gecko", - "Mozilla/5.0 (X11; Linux x86_64; rv:56.0) Gecko/20100101 Firefox/56.0", - "Mozilla/5.0 (X11; Linux x86_64; rv:58.0) Gecko/20100101 Firefox/58.0", - "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36", - "Mozilla/5.0 (iPad; CPU OS 11_1_2 like Mac OS X) AppleWebKit/604.3.5 (KHTML, like Gecko) Version/11.0 Mobile/15B202 Safari/604.1", - "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.75 Safari/537.36", - "Mozilla/5.0 (Windows NT 6.1; Trident/7.0; rv:11.0) like Gecko", - "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.0; Trident/5.0; Trident/5.0)", - "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.113 Safari/537.36", - "Mozilla/5.0 (Windows NT 6.1; rv:56.0) Gecko/20100101 Firefox/56.0", - "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36", - "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.62 Safari/537.36", - "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_5) AppleWebKit/603.2.4 (KHTML, like Gecko) Version/10.1.1 Safari/603.2.4", - "Mozilla/5.0 (Windows NT 10.0; WOW64; rv:52.0) Gecko/20100101 Firefox/52.0", - "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.13; rv:56.0) Gecko/20100101 Firefox/56.0", - "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Ubuntu Chromium/62.0.3202.89 Chrome/62.0.3202.89 Safari/537.36", - "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36", - "Mozilla/5.0 (Windows NT 6.2; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.94 Safari/537.36", - "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.89 Safari/537.36", - "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36", - "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_5) AppleWebKit/537.36 (KHTML, like 
Gecko) Chrome/62.0.3202.94 Safari/537.36", - "Mozilla/5.0 (Windows NT 5.1; rv:52.0) Gecko/20100101 Firefox/52.0", - "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.115 Safari/537.36", - ] - return choice(user_agent_list) - - -# Open the URL, read the HTML from it, and parse it into a tree structure. - -def fake_browser(url, scrape_retries=4, add_header=None): - for _ in range(scrape_retries): - try: - req = Request(url) - if add_header: - req.add_header(add_header) - req.add_header('Accept-Language', 'en-US') - req.add_header('Accept', 'text/html') - req.add_header('Cookie', 'foo=bar') - req.add_header('User-agent', get_user_agent()) - response = urlopen(req) - html = response.read() - break - except WEB_SCRAPE_EXCEPTIONS: - logger.log(DEBUG_DETAILED,'Exception while web-scraping {}'.format(url)) - pass - else: - raise ValueError('No page') - return html - # Extra informations to by got by each part in the distributors. EXTRA_INFO_DIST = ['value', 'tolerance', 'footprint', 'power', 'current', 'voltage', 'frequency', 'temp_coeff', 'manf', 'size', 'op temp', 'orientation', 'color', diff --git a/kicost/kicost.py b/kicost/kicost.py index 15cab4b33..3a09507e6 100644 --- a/kicost/kicost.py +++ b/kicost/kicost.py @@ -31,7 +31,7 @@ import pprint import tqdm from time import time -from multiprocessing import Pool, Manager, Lock +from multiprocessing.pool import ThreadPool # Stops UnicodeDecodeError exceptions. try: @@ -51,8 +51,27 @@ # Import information about various distributors. from .distributors import distributor_dict -from .distributors.web_routines import scrape_part, config_distributor -from .distributors.local.local import create_part_html as create_local_part_html +from .distributors import distributor, fake_browser + +# The distributor module directories will be found in this directory. +directory = os.path.dirname(__file__) + "/distributors" + +# Search for the distributor modules and import them. +for module in os.listdir(directory): + + # Avoid importing non-directories. + abs_module = os.path.join(directory, module) + if not os.path.isdir(abs_module): + continue + + # Avoid directories like __pycache__. + if module.startswith('__'): + continue + + # Import the module. + tmp = __import__("distributors."+module, globals(), locals(), [], level=1) + tmp_mod = getattr(tmp, module); + globals()["dist_"+module] = getattr(tmp_mod, "dist_"+module) # Import information for various EDA tools. from .eda_tools import eda_modules @@ -63,7 +82,7 @@ def kicost(in_file, eda_tool_name, out_filename, user_fields, ignore_fields, group_fields, variant, dist_list=list(distributor_dict.keys()), - num_processes=4, scrape_retries=5, throttling_delay=0.0, + num_processes=4, scrape_retries=5, throttling_delay=5.0, collapse_refs=True, local_currency='USD'): ''' @brief Run KiCost. @@ -186,7 +205,8 @@ def kicost(in_file, eda_tool_name, out_filename, distributor_dict.pop(d, None) # Create an HTML page containing all the local part information. - local_part_html = create_local_part_html(parts, distributor_dict) + local_distributor = dist_local(scrape_retries, 5, throttling_delay) # TODO: log level + local_part_html = local_distributor.create_part_html(parts, distributor_dict) if logger.isEnabledFor(DEBUG_DETAILED): pprint.pprint(distributor_dict) @@ -194,27 +214,32 @@ def kicost(in_file, eda_tool_name, out_filename, # Get the distributor product page for each part and scrape the part data. 
if dist_list: + # Instanciate distributors + for d in list(distributor_dict.keys()): + try: + ctor = globals()["dist_"+d] + # TODO: use logger, not print + # TODO: logger does not print anything + logger.log(DEBUG_OVERVIEW, "Initialising %s" % d) + print("Initialising %s" % d) + # TODO: farnell does not respond + distributor_dict[d]['instance'] = ctor(scrape_retries, 5, throttling_delay) # TODO: log level + except: + logger.log(DEBUG_OVERVIEW, "Initialising %s failed, exculding this distributor..." % d) + distributor_dict.pop(d, None) + pass + + # TODO: multithreaded init, use another pool + if local_currency: logger.log(DEBUG_OVERVIEW, '# Configuring the distributors locale and currency...') - if num_processes <= 1: - for d in distributor_dict: - config_distributor(distributor_dict[d]['module'], local_currency) - else: - logger.log(DEBUG_OBSESSIVE, 'Using {} simultaneous access...'.format(min(len(distributor_dict), num_processes))) - pool = Pool(num_processes) - for d in distributor_dict: - args = [distributor_dict[d]['module'], local_currency] - pool.apply_async(config_distributor, args) - pool.close() - pool.join() + for d in distributor_dict: + distributor_dict[d]['instance'].define_locale_currency(local_currency) logger.log(DEBUG_OVERVIEW, '# Scraping part data for each component group...') - # Set the throttling delay for each distributor. - for d in distributor_dict: - distributor_dict[d]['throttling_delay'] = throttling_delay global scraping_progress - scraping_progress = tqdm.tqdm(desc='Progress', total=len(parts), unit='part', miniters=1) + scraping_progress = tqdm.tqdm(desc='Progress', total=len(parts)*len(distributor_dict), unit='part', miniters=1) # Change the logging print channel to `tqdm` to keep the process bar to the end of terminal. class TqdmLoggingHandler(logging.Handler): @@ -232,77 +257,70 @@ def emit(self, record): self.handleError(record) logger.addHandler(TqdmLoggingHandler()) + # Init part info dictionaries + for part in parts: + pprint.pprint(vars(part)) + part.part_num = {} + part.url = {} + part.price_tiers = {} + part.qty_avail = {} + part.info_dist = {} + #partsByDist = partListByDistributors(parts) + if num_processes <= 1: # Scrape data, one part at a time using single processing. - - class DummyLock: - """Dummy synchronization lock used when single processing.""" - def __init__(self): - pass - def acquire(*args, **kwargs): - return True # Lock can ALWAYS be acquired when just one process is running. - def release(*args, **kwargs): - pass - - # Create sync lock and timeouts to control the rate at which distributor - # websites are scraped. - throttle_lock = DummyLock() - throttle_timeouts = dict() - throttle_timeouts = {d:time() for d in distributor_dict} - - for i in range(len(parts)): - args = (i, parts[i], distributor_dict, local_part_html, scrape_retries, - logger.getEffectiveLevel(), throttle_lock, throttle_timeouts) - id, url, part_num, price_tiers, qty_avail, info_dist = scrape_part(args) - parts[id].part_num = part_num - parts[id].url = url - parts[id].price_tiers = price_tiers - parts[id].qty_avail = qty_avail - parts[id].info_dist = info_dist # Extra distributor web page. 
- scraping_progress.update(1) + for d in distributor_dict: + print("Dist loop d=%s" % d) + for i in range(len(parts)): + print("Part loop i=%d" % i) + id, dist, url, part_num, price_tiers, qty_avail, info_dist = \ + scrape_result = distributor_dict[d]['instance'].scrape_part \ + (i, parts[i], local_part_html) + + parts[id].part_num[dist] = part_num + parts[id].url[dist] = url + parts[id].price_tiers[dist] = price_tiers + parts[id].qty_avail[dist] = qty_avail + parts[id].info_dist[dist] = info_dist # Extra distributor web page. + scraping_progress.update(1) else: # Scrape data, multiple parts at a time using multiprocessing. - # Create sync lock and timeouts to control the rate at which distributor - # websites are scraped. - throttle_manager = Manager() # Manages shared lock and `dict`. - throttle_lock = throttle_manager.Lock() - throttle_timeouts = throttle_manager.dict() - for d in distributor_dict: - throttle_timeouts[d] = time() - - # Create pool of processes to scrape data for multiple parts simultaneously. - pool = Pool(num_processes) + # Create thread pool to scrape data for multiple distributors simultaneously. + # PYthon threads are time-sliced but they work in our I/O limited scenario + # and avoid all kinds of pickle issues. + pool = ThreadPool(num_processes) # Package part data for passing to each process. - arg_sets = [(i, parts[i], distributor_dict, local_part_html, scrape_retries, - logger.getEffectiveLevel(), throttle_lock, throttle_timeouts) for i in range(len(parts))] - - # Define a callback routine for updating the scraping progress bar. - def update(x): - scraping_progress.update(1) - return x + arg_sets = [(distributor_dict[d]['instance'], parts, \ + local_part_html, scraping_progress) for d in distributor_dict] + + def mt_scrape_part(inst, parts, local_part_html, scraping_progress): + retval = list() + for i in range(len(parts)): + retval.append(inst.scrape_part(i, parts[i], local_part_html)) + scraping_progress.update(1) + return retval # Start the web scraping processes, one for each part. logger.log(DEBUG_OBSESSIVE, 'Starting {} parallels process to scrap parts...'.format(num_processes)) - results = [pool.apply_async(scrape_part, [args], callback=update) for args in arg_sets] + results = [pool.apply_async(mt_scrape_part, args) for args in arg_sets] # Wait for all the processes to have results, then kill-off all the scraping processes. - for r in results: - while(not r.ready()): - pass - logger.log(DEBUG_OVERVIEW, 'All parallels process finished with success.') pool.close() pool.join() + logger.log(DEBUG_OVERVIEW, 'All parallels process finished with success.') # Get the data from each process result structure. - for result in results: - id, url, part_num, price_tiers, qty_avail, info_dist = result.get() - parts[id].part_num = part_num - parts[id].url = url - parts[id].price_tiers = price_tiers - parts[id].qty_avail = qty_avail - parts[id].info_dist = info_dist # Extra distributor web page. + for res_proc in results: + res_dist = res_proc.get() + for res_part in res_dist: + id, dist, url, part_num, price_tiers, qty_avail, info_dist = res_part + parts[id].part_num[dist] = part_num + parts[id].url[dist] = url + parts[id].price_tiers[dist] = price_tiers + parts[id].qty_avail[dist] = qty_avail + parts[id].info_dist[dist] = info_dist # Extra distributor web page. # Done with the scraping progress bar so delete it or else we get an # error when the program terminates. 
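For orientation before the individual distributor ports that follow: the sketch below outlines what a distributor module now has to provide under the patch-01 base class and the patch-02 loader (which imports dist_<name> from each kicost/distributors/<name>/ package). The class name dist_example and its URL are placeholders, not part of these patches; the real implementations start with dist_mouser in the next patch.

    from bs4 import BeautifulSoup
    from . import distributor, distributor_dict

    class dist_example(distributor.distributor):
        def __init__(self, scrape_retries, log_level, throttle_delay):
            super(dist_example, self).__init__(scrape_retries, log_level, throttle_delay)
            self.name = 'example'                     # Key into distributor_dict.
            self.domain = 'https://www.example.com/'  # Placeholder distributor site.

        # The base class drives these hooks from scrape_part()/get_part_html_tree();
        # each one extracts a piece of part data from the product page tree.
        def dist_get_part_html_tree(self, pn, extra_search_terms='', url=None,
                                    descend=2, local_part_html=None):
            # Fetch the product page with the shared fake_browser and return its
            # parse tree plus the URL that was used (placeholder URL here).
            html = self.browser.scrape_URL(self.domain)
            return BeautifulSoup(html, 'lxml'), self.domain

        def dist_get_part_num(self, html_tree):
            return ''    # Distributor stock number found on the page.

        def dist_get_qty_avail(self, html_tree):
            return None  # Available quantity, or None for non-stocked parts.

        def dist_get_price_tiers(self, html_tree):
            return {}    # {break quantity: unit price}.

        def dist_get_extra_info(self, html_tree):
            return {}    # Optional extra fields for the spreadsheet.

        def dist_define_locale_currency(self, locale, currency):
            pass         # Optional ISO 3166 / ISO 4217 configuration.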
From f58cca99aad241b4c402276e85d6e8e8c6f9c3fa Mon Sep 17 00:00:00 2001 From: Max Maisel Date: Fri, 1 Jun 2018 15:21:20 +0200 Subject: [PATCH 03/29] Implemented dist_mouser class. --- kicost/distributors/mouser/mouser.py | 344 ++++++++++++++------------- 1 file changed, 176 insertions(+), 168 deletions(-) diff --git a/kicost/distributors/mouser/mouser.py b/kicost/distributors/mouser/mouser.py index aade51493..ecd454c97 100644 --- a/kicost/distributors/mouser/mouser.py +++ b/kicost/distributors/mouser/mouser.py @@ -1,6 +1,6 @@ # MIT license # -# Copyright (C) 2018 by XESS Corporation / Hildo Guillardi Junior +# Copyright (C) 2018 by XESS Corporation / Hildo Guillardi Junior / Max Maisel # # Permission is hereby granted, free of charge, to any person obtaining a copy # of this software and associated documentation files (the "Software"), to deal @@ -38,176 +38,184 @@ import difflib from bs4 import BeautifulSoup import http.client # For web scraping exceptions. -from .. import urlencode, urlquote, urlsplit, urlunsplit +#from .. import urlencode, urlquote, urlsplit, urlunsplit from .. import fake_browser from ...globals import PartHtmlError -from ...globals import logger, DEBUG_OVERVIEW, DEBUG_DETAILED, DEBUG_OBSESSIVE - - -def get_price_tiers(html_tree): - '''@brief Get the pricing tiers from the parsed tree of the Mouser product page. - @param html_tree `str()` html of the distributor part page. - @return `dict()` price breaks, the keys are the quantities breaks. - ''' - price_tiers = {} - try: - pricing_tbl_tree = html_tree.find('div', class_='pdp-pricing-table') - price_row_trees = pricing_tbl_tree.find_all('div', class_='div-table-row') - for row_tree in price_row_trees: - qty_tree, unit_price_tree, _ = row_tree.find('div', class_='row').find_all('div', class_='col-xs-4') - try: - qty = int(re.sub('[^0-9]', '', qty_tree.text)) - unit_price = float(re.sub('[^0-9.]', '', unit_price_tree.text)) - price_tiers[qty] = unit_price - except ValueError: - pass # In case of "quote price", ignore and pass to next (check pn STM32F411RCT6). +from ...globals import DEBUG_OVERVIEW, DEBUG_DETAILED, DEBUG_OBSESSIVE + +from .. import distributor, distributor_dict + +from urllib.parse import quote_plus as urlquote + +class dist_mouser(distributor.distributor): + def __init__(self, scrape_retries, log_level, throttle_delay): + super(dist_mouser, self).__init__(scrape_retries, log_level, throttle_delay) + self.name = 'mouser' + self.domain = distributor_dict[self.name]['site']['url'] + self.browser.add_cookie('.mouser.com', 'preferences', 'ps=www2&pl=en-US&pc_www2=USDe') + + self.browser.scrape_URL(self.domain) + self.browser.show_cookies(self.name) + + def dist_get_price_tiers(self, html_tree): + '''@brief Get the pricing tiers from the parsed tree of the Mouser product page. + @param html_tree `str()` html of the distributor part page. + @return `dict()` price breaks, the keys are the quantities breaks. + ''' + price_tiers = {} + try: + pricing_tbl_tree = html_tree.find('div', class_='pdp-pricing-table') + price_row_trees = pricing_tbl_tree.find_all('div', class_='div-table-row') + for row_tree in price_row_trees: + qty_tree, unit_price_tree, _ = row_tree.find('div', class_='row').find_all('div', class_='col-xs-4') + try: + qty = int(re.sub('[^0-9]', '', qty_tree.text)) + unit_price = float(re.sub('[^0-9.]', '', unit_price_tree.text)) + price_tiers[qty] = unit_price + except ValueError: + pass # In case of "quote price", ignore and pass to next (check pn STM32F411RCT6). 
+ return price_tiers + + qty_strs = [] + for qty in html_tree.find('div', + class_='PriceBreaks').find_all( + 'div', + class_='PriceBreakQuantity'): + qty_strs.append(qty.text) + price_strs = [] + for price in html_tree.find('div', + class_='PriceBreaks').find_all( + 'div', + class_='PriceBreakPrice'): + price_strs.append(price.text) + qtys_prices = list(zip(qty_strs, price_strs)) + for qty_str, price_str in qtys_prices: + try: + qty = re.search('(\s*)([0-9,]+)', qty_str).group(2) + qty = int(re.sub('[^0-9]', '', qty)) + price_tiers[qty] = float(re.sub('[^0-9\.]', '', price_str)) + except (TypeError, AttributeError, ValueError, IndexError): + continue + except AttributeError: + # This happens when no pricing info is found in the tree. + self.logger.log(DEBUG_OBSESSIVE, 'No Mouser pricing information found!') + return price_tiers # Return empty price tiers. return price_tiers - qty_strs = [] - for qty in html_tree.find('div', - class_='PriceBreaks').find_all( - 'div', - class_='PriceBreakQuantity'): - qty_strs.append(qty.text) - price_strs = [] - for price in html_tree.find('div', - class_='PriceBreaks').find_all( - 'div', - class_='PriceBreakPrice'): - price_strs.append(price.text) - qtys_prices = list(zip(qty_strs, price_strs)) - for qty_str, price_str in qtys_prices: - try: - qty = re.search('(\s*)([0-9,]+)', qty_str).group(2) - qty = int(re.sub('[^0-9]', '', qty)) - price_tiers[qty] = float(re.sub('[^0-9\.]', '', price_str)) - except (TypeError, AttributeError, ValueError, IndexError): - continue - except AttributeError: - # This happens when no pricing info is found in the tree. - logger.log(DEBUG_OBSESSIVE, 'No Mouser pricing information found!') - return price_tiers # Return empty price tiers. - return price_tiers - - -def get_part_num(html_tree): - '''@brief Get the part number from the Mouser product page. - @param html_tree `str()` html of the distributor part page. - @return `list()`of the parts that match. - ''' - try: - partnum = html_tree.find( - 'span', id='spnMouserPartNumFormattedForProdInfo' - ).text - return partnum.strip() - except AttributeError: - logger.log(DEBUG_OBSESSIVE, 'No Mouser part number found!') - return '' - - -def get_qty_avail(html_tree): - '''@brief Get the available quantity of the part from the Mouser product page. - @param html_tree `str()` html of the distributor part page. - @return `int` avaliable quantity. - ''' - try: - qty_str = html_tree.find( - 'div', class_='pdp-product-availability').find( - 'div', class_='row').find( - 'div', class_='col-xs-8').find('div').text - except AttributeError as e: - # No quantity found (not even 0) so this is probably a non-stocked part. - # Return None so the part won't show in the spreadsheet for this dist. - logger.log(DEBUG_OBSESSIVE, 'No Mouser part quantity found!') - return None - try: - qty_str = re.search('(\s*)([0-9,]*)', qty_str, re.IGNORECASE).group(2) - return int(re.sub('[^0-9]', '', qty_str)) - except ValueError: - # No quantity found (not even 0) so this is probably a non-stocked part. - # Return None so the part won't show in the spreadsheet for this dist. - logger.log(DEBUG_OBSESSIVE, 'No Mouser part quantity found!') - return None - - -def get_part_html_tree(dist, pn, extra_search_terms='', url=None, descend=2, local_part_html=None, scrape_retries=2): - '''@brief Find the Mouser HTML page for a part number and return the URL and parse tree. - @param dist - @param pn Part number `str()`. 
- @param extra_search_terms - @param url - @param descend - @param local_part_html - @param scrape_retries `int` Quantity of retries in case of fail. - @return (html `str()` of the page, url) - ''' - - # Use the part number to lookup the part using the site search function, unless a starting url was given. - if url is None: - url = 'https://www.mouser.com/Search/Refine.aspx?Keyword=' + urlquote( - pn + ' ' + extra_search_terms, - safe='') - elif url[0] == '/': - url = 'https://www.mouser.com' + url - elif url.startswith('..'): - url = 'https://www.mouser.com/Search/' + url - - # Open the URL, read the HTML from it, and parse it into a tree structure. - try: - html = fake_browser(url, scrape_retries, ('Cookie', 'preferences=ps=www2&pl=en-US&pc_www2=USDe')) - except: - logger.log(DEBUG_OBSESSIVE,'No HTML page for {} from {}'.format(pn, dist)) - raise PartHtmlError - - # Abort if the part number isn't in the HTML somewhere. - # (Only use the numbers and letters to compare PN to HTML.) - if re.sub('[\W_]','',str.lower(pn)) not in re.sub('[\W_]','',str.lower(str(html))): - logger.log(DEBUG_OBSESSIVE,'No part number {} in HTML page from {}'.format(pn, dist)) - raise PartHtmlError - - try: - tree = BeautifulSoup(html, 'lxml') - except Exception: - logger.log(DEBUG_OBSESSIVE,'No HTML tree for {} from {}'.format(pn, dist)) - raise PartHtmlError - - # If the tree contains the tag for a product page, then just return it. - if tree.find('div', id='pdpPricingAvailability') is not None: - return tree, url + def dist_get_part_num(self, html_tree): + '''@brief Get the part number from the Mouser product page. + @param html_tree `str()` html of the distributor part page. + @return `list()`of the parts that match. + ''' + try: + partnum = html_tree.find( + 'span', id='spnMouserPartNumFormattedForProdInfo' + ).text + return partnum.strip() + except AttributeError: + self.logger.log(DEBUG_OBSESSIVE, 'No Mouser part number found!') + return '' + + def dist_get_qty_avail(self, html_tree): + '''@brief Get the available quantity of the part from the Mouser product page. + @param html_tree `str()` html of the distributor part page. + @return `int` avaliable quantity. + ''' + try: + qty_str = html_tree.find( + 'div', class_='pdp-product-availability').find( + 'div', class_='row').find( + 'div', class_='col-xs-8').find('div').text + except AttributeError as e: + # No quantity found (not even 0) so this is probably a non-stocked part. + # Return None so the part won't show in the spreadsheet for this dist. + self.logger.log(DEBUG_OBSESSIVE, 'No Mouser part quantity found!') + return None + try: + qty_str = re.search('(\s*)([0-9,]*)', qty_str, re.IGNORECASE).group(2) + return int(re.sub('[^0-9]', '', qty_str)) + except ValueError: + # No quantity found (not even 0) so this is probably a non-stocked part. + # Return None so the part won't show in the spreadsheet for this dist. + self.logger.log(DEBUG_OBSESSIVE, 'No Mouser part quantity found!') + return None + + def dist_get_part_html_tree(self, pn, extra_search_terms='', url=None, descend=2, local_part_html=None): + '''@brief Find the Mouser HTML page for a part number and return the URL and parse tree. + @param pn Part number `str()`. + @param extra_search_terms + @param url + @param descend + @param local_part_html + @return (html `str()` of the page, url) + ''' + + # Use the part number to lookup the part using the site search function, unless a starting url was given. 
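+        # Relative ('/...') and search-relative ('..') URLs coming from a previous
+        # results page are expanded to absolute mouser.com URLs below.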
+ if url is None: + url = 'https://www.mouser.com/Search/Refine.aspx?Keyword=' + urlquote( + pn + ' ' + extra_search_terms, + safe='') + elif url[0] == '/': + url = 'https://www.mouser.com' + url + elif url.startswith('..'): + url = 'https://www.mouser.com/Search/' + url + + # Open the URL, read the HTML from it, and parse it into a tree structure. + try: + html = self.browser.scrape_URL(url) + except Exception as ex: + self.logger.log(DEBUG_OBSESSIVE,'No HTML page for {} from {}'.format(pn, self.name)) + raise PartHtmlError - # If the tree is for a list of products, then examine the links to try to find the part number. - if tree.find('div', id='searchResultsTbl') is not None: - logger.log(DEBUG_OBSESSIVE,'Found product table for {} from {}'.format(pn, dist)) - if descend <= 0: - logger.log(DEBUG_OBSESSIVE,'Passed descent limit for {} from {}'.format(pn, dist)) + # Abort if the part number isn't in the HTML somewhere. + # (Only use the numbers and letters to compare PN to HTML.) + if re.sub('[\W_]','',str.lower(pn)) not in re.sub('[\W_]','',str.lower(str(html))): + self.logger.log(DEBUG_OBSESSIVE,'No part number {} in HTML page from {}'.format(pn, self.name)) + raise PartHtmlError + + try: + tree = BeautifulSoup(html, 'lxml') + except Exception: + self.logger.log(DEBUG_OBSESSIVE,'No HTML tree for {} from {}'.format(pn, self.name)) raise PartHtmlError - else: - # Look for the table of products. - products = tree.find( - 'table', - class_='SearchResultsTable').find_all( - 'tr', - class_=('SearchResultsRowOdd', 'SearchResultsRowEven')) - - # Extract the product links for the part numbers from the table. - product_links = [p.find('div', class_='mfrDiv').a for p in products] - - # Extract all the part numbers from the text portion of the links. - part_numbers = [l.text for l in product_links] - - # Look for the part number in the list that most closely matches the requested part number. - match = difflib.get_close_matches(pn, part_numbers, 1, 0.0)[0] - - # Now look for the link that goes with the closest matching part number. - for l in product_links: - if l.text == match: - # Get the tree for the linked-to page and return that. - logger.log(DEBUG_OBSESSIVE,'Selecting {} from product table for {} from {}'.format(l.text, pn, dist)) - return get_part_html_tree(dist, pn, extra_search_terms, - url=l.get('href', ''), - descend=descend-1, - scrape_retries=scrape_retries) - - # I don't know what happened here, so give up. - logger.log(DEBUG_OBSESSIVE,'Unknown error for {} from {}'.format(pn, dist)) - raise PartHtmlError + + # If the tree contains the tag for a product page, then just return it. + if tree.find('div', id='pdpPricingAvailability') is not None: + return tree, url + + # If the tree is for a list of products, then examine the links to try to find the part number. + if tree.find('div', id='searchResultsTbl') is not None: + self.logger.log(DEBUG_OBSESSIVE,'Found product table for {} from {}'.format(pn, self.name)) + if descend <= 0: + self.logger.log(DEBUG_OBSESSIVE,'Passed descent limit for {} from {}'.format(pn, self.name)) + raise PartHtmlError + else: + # Look for the table of products. + products = tree.find( + 'table', + class_='SearchResultsTable').find_all( + 'tr', + class_=('SearchResultsRowOdd', 'SearchResultsRowEven')) + + # Extract the product links for the part numbers from the table. + product_links = [p.find('div', class_='mfrDiv').a for p in products] + + # Extract all the part numbers from the text portion of the links. 
+ part_numbers = [l.text for l in product_links] + + # Look for the part number in the list that most closely matches the requested part number. + match = difflib.get_close_matches(pn, part_numbers, 1, 0.0)[0] + + # Now look for the link that goes with the closest matching part number. + for l in product_links: + if l.text == match: + # Get the tree for the linked-to page and return that. + self.logger.log(DEBUG_OBSESSIVE,'Selecting {} from product table for {} from {}'.format(l.text, pn, self.name)) + return self.dist_get_part_html_tree(pn, extra_search_terms, + url=l.get('href', ''), + descend=descend-1) + + # I don't know what happened here, so give up. + self.logger.log(DEBUG_OBSESSIVE,'Unknown error for {} from {}'.format(pn, self.name)) + self.logger.log(DEBUG_OBSESSIVE,'Response was %s' % html) + raise PartHtmlError From 817b136a8672b797cbd44e447f33024f2b2bc5b6 Mon Sep 17 00:00:00 2001 From: Max Maisel Date: Fri, 1 Jun 2018 15:21:39 +0200 Subject: [PATCH 04/29] Implemented dist_farnell class. --- kicost/distributors/farnell/__init__.py | 2 +- kicost/distributors/farnell/farnell.py | 331 ++++++++++++------------ 2 files changed, 172 insertions(+), 161 deletions(-) diff --git a/kicost/distributors/farnell/__init__.py b/kicost/distributors/farnell/__init__.py index 7af68b1a4..c0203fa64 100644 --- a/kicost/distributors/farnell/__init__.py +++ b/kicost/distributors/farnell/__init__.py @@ -25,7 +25,7 @@ }, # Web site defitions. 'site': { - 'url': 'http://farnell.com/', + 'url': 'https://it.farnell.com/', 'currency': 'USD', 'locale': 'US' }, diff --git a/kicost/distributors/farnell/farnell.py b/kicost/distributors/farnell/farnell.py index 59776392f..fb645c25f 100644 --- a/kicost/distributors/farnell/farnell.py +++ b/kicost/distributors/farnell/farnell.py @@ -1,6 +1,6 @@ # MIT license # -# Copyright (C) 2018 by XESS Corporation / Hildo Guillardi Junior +# Copyright (C) 2018 by XESS Corporation / Hildo Guillardi Junior / Max Maisel # # Permission is hereby granted, free of charge, to any person obtaining a copy # of this software and associated documentation files (the "Software"), to deal @@ -38,173 +38,184 @@ import difflib from bs4 import BeautifulSoup import http.client # For web scraping exceptions. -from .. import urlencode, urlquote, urlsplit, urlunsplit -from .. import fake_browser, WEB_SCRAPE_EXCEPTIONS +#from .. import urlencode, urlquote, urlsplit, urlunsplit +from .. import fake_browser from ...globals import PartHtmlError from ...globals import logger, DEBUG_OVERVIEW, DEBUG_DETAILED, DEBUG_OBSESSIVE from currency_converter import CurrencyConverter currency = CurrencyConverter() -__author__='Giacinto Luigi Cerone' - +from .. import distributor, distributor_dict -def get_price_tiers(html_tree): - '''@brief Get the pricing tiers from the parsed tree of the farnell product page. - @param html_tree `str()` html of the distributor part page. - @return `dict()` price breaks, the keys are the quantities breaks. 
- ''' - price_tiers = {} - try: - qty_strs = [] - for qty in html_tree.find( - 'table', - class_=('tableProductDetailPrice', 'pricing')).find_all( - 'td', - class_='qty'): - qty_strs.append(qty.text) - price_strs = [] - for price in html_tree.find( - 'table', - class_=('tableProductDetailPrice', 'pricing')).find_all( - 'td', - class_='threeColTd'): - price_strs.append(price.text) - qtys_prices = list(zip(qty_strs, price_strs)) - for qty_str, price_str in qtys_prices: - try: - qty = re.search('(\s*)([0-9,]+)', qty_str).group(2) - qty = int(re.sub('[^0-9]', '', qty)) - price_str=price_str.replace(',','.') - price_tiers[qty] = float(re.sub('[^0-9\.]', '', price_str)) - price_tiers[qty] = currency.convert(price_tiers[qty], 'EUR', 'USD') - except (TypeError, AttributeError, ValueError): - continue - except AttributeError: - # This happens when no pricing info is found in the tree. - return price_tiers # Return empty price tiers. - return price_tiers - -def get_part_num(html_tree): - '''@brief Get the part number from the farnell product page. - @param html_tree `str()` html of the distributor part page. - @return `list()`of the parts that match. - ''' - try: - # farnell catalog number is stored in a description list, so get - # all the list terms and descriptions, strip all the spaces from those, - # and pair them up. - div = html_tree.find('div', class_='productDescription').find('dl') - dt = [re.sub('\s','',d.text) for d in div.find_all('dt')] - dd = [re.sub('\s','',d.text) for d in div.find_all('dd')] - dtdd = {k:v for k,v in zip(dt,dd)} # Pair terms with descriptions. -# return dtdd.get('farnellPartNo.:', '') - return dtdd.get('CodiceProdotto', '') - except KeyError: - return '' # No catalog number found in page. - except AttributeError: - return '' # No ProductDescription found in page. - -def get_qty_avail(html_tree): - '''@brief Get the available quantity of the part from the farnell product page. - @param html_tree `str()` html of the distributor part page. - @return `int` avaliable quantity. - ''' - try: - qty_str = html_tree.find('p', class_='availabilityHeading').text - except (AttributeError, ValueError): - # No quantity found (not even 0) so this is probably a non-stocked part. - # Return None so the part won't show in the spreadsheet for this dist. - return None - try: - qty = re.sub('[^0-9]','',qty_str) # Strip all non-number chars. - return int(re.sub('[^0-9]', '', qty_str)) # Return integer for quantity. - except ValueError: - # No quantity found (not even 0) so this is probably a non-stocked part. - # Return None so the part won't show in the spreadsheet for this dist. - return None - -def get_part_html_tree(dist, pn, extra_search_terms='', url=None, descend=2, local_part_html=None, scrape_retries=2): - '''@brief Find the farnell HTML page for a part number and return the URL and parse tree. - @param dist - @param pn Part number `str()`. - @param extra_search_terms - @param url - @param descend - @param local_part_html - @param scrape_retries `int` Quantity of retries in case of fail. - @return (html `str()` of the page, url) - ''' - - # Use the part number to lookup the part using the site search function, unless a starting url was given. 
- if url is None: - url = 'http://it.farnell.com/Search?catalogId=15001&langId=-4&storeId=10165&gs=true&st=' + urlquote( - pn + ' ' + extra_search_terms, - safe='') - elif url[0] == '/': - url = 'http://www.farnell.com' + url - elif url.startswith('..'): - url = 'http://www.farnell.com/Search/' + url - - # Open the URL, read the HTML from it, and parse it into a tree structure. - try: - html = fake_browser(url, scrape_retries) - except: - logger.log(DEBUG_OBSESSIVE,'No HTML page for {} from {}'.format(pn, dist)) - raise PartHtmlError +from urllib.parse import quote_plus as urlquote - # Abort if the part number isn't in the HTML somewhere. - # (Only use the numbers and letters to compare PN to HTML.) - if re.sub('[\W_]','',str.lower(pn)) not in re.sub('[\W_]','',str.lower(str(html))): - logger.log(DEBUG_OBSESSIVE,'No part number {} in HTML page from {}'.format(pn, dist)) - raise PartHtmlError +__author__='Giacinto Luigi Cerone' - try: - tree = BeautifulSoup(html, 'lxml') - except Exception: - logger.log(DEBUG_OBSESSIVE,'No HTML tree for {} from {}'.format(pn, dist)) - raise PartHtmlError +class dist_farnell(distributor.distributor): + def __init__(self, scrape_retries, log_level, throttle_delay): + super(dist_farnell, self).__init__(scrape_retries, log_level, throttle_delay) + self.name = 'farnell' + self.domain = distributor_dict[self.name]['site']['url'] + + self.browser.scrape_URL(self.domain) + self.browser.show_cookies(self.name) + + def dist_get_price_tiers(self, html_tree): + '''@brief Get the pricing tiers from the parsed tree of the farnell product page. + @param html_tree `str()` html of the distributor part page. + @return `dict()` price breaks, the keys are the quantities breaks. + ''' + price_tiers = {} + try: + qty_strs = [] + for qty in html_tree.find( + 'table', + class_=('tableProductDetailPrice', 'pricing')).find_all( + 'td', + class_='qty'): + qty_strs.append(qty.text) + price_strs = [] + for price in html_tree.find( + 'table', + class_=('tableProductDetailPrice', 'pricing')).find_all( + 'td', + class_='threeColTd'): + price_strs.append(price.text) + qtys_prices = list(zip(qty_strs, price_strs)) + for qty_str, price_str in qtys_prices: + try: + qty = re.search('(\s*)([0-9,]+)', qty_str).group(2) + qty = int(re.sub('[^0-9]', '', qty)) + price_str=price_str.replace(',','.') + price_tiers[qty] = float(re.sub('[^0-9\.]', '', price_str)) + price_tiers[qty] = currency.convert(price_tiers[qty], 'EUR', 'USD') + except (TypeError, AttributeError, ValueError): + continue + except AttributeError: + # This happens when no pricing info is found in the tree. + return price_tiers # Return empty price tiers. + return price_tiers + + def dist_get_part_num(self, html_tree): + '''@brief Get the part number from the farnell product page. + @param html_tree `str()` html of the distributor part page. + @return `list()`of the parts that match. + ''' + try: + # farnell catalog number is stored in a description list, so get + # all the list terms and descriptions, strip all the spaces from those, + # and pair them up. + div = html_tree.find('div', class_='productDescription').find('dl') + dt = [re.sub('\s','',d.text) for d in div.find_all('dt')] + dd = [re.sub('\s','',d.text) for d in div.find_all('dd')] + dtdd = {k:v for k,v in zip(dt,dd)} # Pair terms with descriptions. + # return dtdd.get('farnellPartNo.:', '') + return dtdd.get('CodiceProdotto', '') + except KeyError: + return '' # No catalog number found in page. + except AttributeError: + return '' # No ProductDescription found in page. 
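(Illustration only, not part of the patch: the <dt>/<dd> pairing that dist_get_part_num() relies on can be exercised in isolation. The HTML fragment below is invented and only approximates the Farnell product-description markup, so treat this as a sketch under that assumption.)

    from bs4 import BeautifulSoup
    import re

    frag = ("<div class='productDescription'><dl>"
            "<dt>Codice Prodotto</dt><dd> 1234567 </dd>"
            "<dt>Produttore</dt><dd> Acme </dd></dl></div>")
    dl = BeautifulSoup(frag, 'lxml').find('div', class_='productDescription').find('dl')
    dt = [re.sub('\s', '', d.text) for d in dl.find_all('dt')]  # ['CodiceProdotto', 'Produttore']
    dd = [re.sub('\s', '', d.text) for d in dl.find_all('dd')]  # ['1234567', 'Acme']
    dtdd = {k: v for k, v in zip(dt, dd)}                       # Pair labels with values.
    print(dtdd.get('CodiceProdotto', ''))                       # -> 1234567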
+ + def dist_get_qty_avail(self, html_tree): + '''@brief Get the available quantity of the part from the farnell product page. + @param html_tree `str()` html of the distributor part page. + @return `int` avaliable quantity. + ''' + try: + qty_str = html_tree.find('p', class_='availabilityHeading').text + except (AttributeError, ValueError): + # No quantity found (not even 0) so this is probably a non-stocked part. + # Return None so the part won't show in the spreadsheet for this dist. + return None + try: + qty = re.sub('[^0-9]','',qty_str) # Strip all non-number chars. + return int(re.sub('[^0-9]', '', qty_str)) # Return integer for quantity. + except ValueError: + # No quantity found (not even 0) so this is probably a non-stocked part. + # Return None so the part won't show in the spreadsheet for this dist. + return None + + def dist_get_part_html_tree(self, pn, extra_search_terms='', url=None, descend=2, local_part_html=None): + '''@brief Find the farnell HTML page for a part number and return the URL and parse tree. + @param pn Part number `str()`. + @param extra_search_terms + @param url + @param descend + @param local_part_html + @return (html `str()` of the page, url) + ''' + + # Use the part number to lookup the part using the site search function, unless a starting url was given. + if url is None: + url = 'http://it.farnell.com/Search?storeId=10165&catalogId=15001&categoryName=&selectedCategoryId=&langId=-4&categoryIdBox=&st=' + urlquote( + pn + ' ' + extra_search_terms, + safe='') + + elif url[0] == '/': + url = 'http://www.farnell.com' + url + elif url.startswith('..'): + url = 'http://www.farnell.com/Search/' + url + + # Open the URL, read the HTML from it, and parse it into a tree structure. + try: + html = self.browser.scrape_URL(url) + except: + self.logger.log(DEBUG_OBSESSIVE,'No HTML page for {} from {}'.format(pn, self.name)) + raise PartHtmlError - # If the tree contains the tag for a product page, then just return it. - if tree.find('div', class_='productDisplay', id='page') is not None: - return tree, url + # Abort if the part number isn't in the HTML somewhere. + # (Only use the numbers and letters to compare PN to HTML.) + if re.sub('[\W_]','',str.lower(pn)) not in re.sub('[\W_]','',str.lower(str(html))): + self.logger.log(DEBUG_OBSESSIVE,'No part number {} in HTML page from {}'.format(pn, self.name)) + raise PartHtmlError - # If the tree is for a list of products, then examine the links to try to find the part number. - if tree.find('table', class_='productLister', id='sProdList') is not None: - logger.log(DEBUG_OBSESSIVE,'Found product table for {} from {}'.format(pn, dist)) - if descend <= 0: - logger.log(DEBUG_OBSESSIVE,'Passed descent limit for {} from {}'.format(pn, dist)) + try: + tree = BeautifulSoup(html, 'lxml') + except Exception: + self.logger.log(DEBUG_OBSESSIVE,'No HTML tree for {} from {}'.format(pn, self.name)) raise PartHtmlError - else: - # Look for the table of products. - products = tree.find('table', - class_='productLister').find_all('tr', - class_='altRow') - - # Extract the product links for the part numbers from the table. - product_links = [] - for p in products: - try: - product_links.append(p.find('td', class_='mftrPart').find('a')) - except AttributeError: - continue - print('>>> ',pn,products,product_liks)#TODO - - # Extract all the part numbers from the text portion of the links. - part_numbers = [l.text for l in product_links] - - # Look for the part number in the list that most closely matches the requested part number. 
- match = difflib.get_close_matches(pn, part_numbers, 1, 0.0)[0] - - # Now look for the link that goes with the closest matching part number. - for l in product_links: - if l.text == match: - # Get the tree for the linked-to page and return that. - logger.log(DEBUG_OBSESSIVE,'Selecting {} from product table for {} from {}'.format(l.text.strip(), pn, dist)) - return get_part_html_tree(dist, pn, extra_search_terms, - url=l.get('href', ''), - descend=descend-1, - scrape_retries=scrape_retries) - - # I don't know what happened here, so give up. - logger.log(DEBUG_OBSESSIVE,'Unknown error for {} from {}'.format(pn, dist)) - raise PartHtmlError + + # If the tree contains the tag for a product page, then just return it. + if tree.find('div', class_='productDisplay', id='page') is not None: + return tree, url + + # If the tree is for a list of products, then examine the links to try to find the part number. + if tree.find('table', class_='productLister', id='sProdList') is not None: + self.logger.log(DEBUG_OBSESSIVE,'Found product table for {} from {}'.format(pn, self.name)) + if descend <= 0: + self.logger.log(DEBUG_OBSESSIVE,'Passed descent limit for {} from {}'.format(pn, self.name)) + raise PartHtmlError + else: + # Look for the table of products. + products = tree.find('table', + class_='productLister').find_all('tr', + class_='altRow') + + # Extract the product links for the part numbers from the table. + product_links = [] + for p in products: + try: + product_links.append(p.find('td', class_='mftrPart').find('a')) + except AttributeError: + continue + print('>>> ',pn,products,product_links)#TODO + + # Extract all the part numbers from the text portion of the links. + part_numbers = [l.text for l in product_links] + + # Look for the part number in the list that most closely matches the requested part number. + match = difflib.get_close_matches(pn, part_numbers, 1, 0.0)[0] + + # Now look for the link that goes with the closest matching part number. + for l in product_links: + if l.text == match: + # Get the tree for the linked-to page and return that. + self.logger.log(DEBUG_OBSESSIVE,'Selecting {} from product table for {} from {}'.format(l.text.strip(), pn, self.name)) + return self.dist_get_part_html_tree(pn, extra_search_terms, + url=l.get('href', ''), + descend=descend-1) + + # I don't know what happened here, so give up. + self.logger.log(DEBUG_OBSESSIVE,'Unknown error for {} from {}'.format(pn, self.name)) + self.logger.log(DEBUG_OBSESSIVE,'Response was %s' % html) + raise PartHtmlError From aa1f175db99e5783bbc2a9a844868ddb2ba7e7be Mon Sep 17 00:00:00 2001 From: Max Maisel Date: Fri, 1 Jun 2018 15:22:01 +0200 Subject: [PATCH 05/29] Implemented dist_local class. --- kicost/distributors/local/local.py | 323 +++++++++++++++-------------- 1 file changed, 164 insertions(+), 159 deletions(-) diff --git a/kicost/distributors/local/local.py b/kicost/distributors/local/local.py index 882fa5fee..a3900b6cd 100644 --- a/kicost/distributors/local/local.py +++ b/kicost/distributors/local/local.py @@ -1,6 +1,6 @@ # MIT license # -# Copyright (C) 2018 by XESS Corporation / Hildo Guillardi Junior +# Copyright (C) 2018 by XESS Corporation / Hildo Guillardi Junior / Max Maisel # # Permission is hereby granted, free of charge, to any person obtaining a copy # of this software and associated documentation files (the "Software"), to deal @@ -35,165 +35,170 @@ from bs4 import BeautifulSoup from yattag import Doc, indent # For generating HTML page for local parts. 
import copy # To be possible create more than one local distributor. -from .. import urlsplit, urlunsplit from ...globals import PartHtmlError from ...globals import logger, DEBUG_OVERVIEW, DEBUG_DETAILED, DEBUG_OBSESSIVE -from ...globals import SEPRTR - - -def create_part_html(parts, distributors): - '''@brief Create HTML page containing info for local (non-webscraped) parts. - @param parts `list()` of parts. - @parm `list()`of the distributors to check each one is local. - @return `str()` of the HTML page to be read by `get_part_html_tree()` - ''' - - logger.log(DEBUG_OVERVIEW, 'Create HTML page for parts with custom pricing...') - - doc, tag, text = Doc().tagtext() - with tag('html'): - with tag('body'): - for p in parts: - # Find the manufacturer's part number if it exists. - pn = p.fields.get('manf#') # Returns None if no manf# field. - - # Find the various distributors for this part by - # looking for leading fields terminated by SEPRTR. - for key in p.fields: - try: - dist = key[:key.index(SEPRTR)] - except ValueError: - continue - - # If the distributor is not in the list of web-scrapable distributors, - # then it's a local distributor. Copy the local distributor template - # and add it to the table of distributors. - if dist not in distributors: - distributors[dist] = copy.copy(distributors['local_template']) - distributors[dist]['label'] = dist # Set dist name for spreadsheet header. - - # Now look for catalog number, price list and webpage link for this part. - for dist in distributors: - cat_num = p.fields.get(dist+':cat#') - pricing = p.fields.get(dist+':pricing') - link = p.fields.get(dist+':link') - if cat_num is None and pricing is None and link is None: - continue - - def make_random_catalog_number(p): - hash_fields = {k: p.fields[k] for k in p.fields} - hash_fields['dist'] = dist - return '#{0:08X}'.format(abs(hash(tuple(sorted(hash_fields.items()))))) - - cat_num = cat_num or pn or make_random_catalog_number(p) - p.fields[dist+':cat#'] = cat_num # Store generated cat#. - with tag('div', klass=dist+SEPRTR+cat_num): - with tag('div', klass='cat#'): - text(cat_num) - if pricing is not None: - with tag('div', klass='pricing'): - text(pricing) - if link is not None: - url_parts = list(urlsplit(link)) - if url_parts[0] == '': - url_parts[0] = u'http' - link = urlunsplit(url_parts) - with tag('div', klass='link'): - text(link) - - # Remove the local distributor template so it won't be processed later on. - # It has served its purpose. - try: - del distributors['local_template'] - except: - pass - - html = doc.getvalue() - if logger.isEnabledFor(DEBUG_OBSESSIVE): - print(indent(html)) - return html - - -def get_price_tiers(html_tree): - '''@brief Get the pricing tiers from the parsed tree of the local product page. - @param html_tree `str()` html of the distributor part page. - @return `dict()` price breaks, the keys are the quantities breaks. - ''' - price_tiers = {} - try: - pricing = html_tree.find('div', class_='pricing').text - pricing = re.sub('[^0-9.;:]', '', pricing) # Keep only digits, decimals, delimiters. - for qty_price in pricing.split(';'): - qty, price = qty_price.split(SEPRTR) - price_tiers[int(qty)] = float(price) - except AttributeError: - # This happens when no pricing info is found in the tree. - logger.log(DEBUG_OBSESSIVE, 'No local pricing information found!') - return price_tiers # Return empty price tiers. - return price_tiers - - -def get_part_num(html_tree): - '''@brief Get the part number from the local product page. 
- @param html_tree `str()` html of the distributor part page. - @return `list()`of the parts that match. - ''' - try: - part_num_str = html_tree.find('div', class_='cat#').text - return part_num_str - except AttributeError: - return '' - - -def get_qty_avail(html_tree): - '''@brief Get the available quantity of the part from the local product page. - @param html_tree `str()` html of the distributor part page. - @return `int` avaliable quantity. - ''' - try: - qty_str = html_tree.find('div', class_='quantity').text - except (AttributeError, ValueError): - # Return 0 (not None) so this part will show in the spreadsheet - # even if there is no quantity found. - return 0 - try: - return int(re.sub('[^0-9]', '', qty_str)) - except ValueError: - # Return 0 (not None) so this part will show in the spreadsheet - # even if there is no quantity found. - logger.log(DEBUG_OBSESSIVE, 'No local part quantity found!') - return 0 - - -def get_part_html_tree(dist, pn, extra_search_terms='', url=None, descend=None, local_part_html=None, scrape_retries=2): - '''Extract the HTML tree from the HTML page for local parts. - @param dist - @param pn Part number `str()`. - @param extra_search_terms - @param url - @param descend - @param local_part_html - @param scrape_retries `int` Quantity of retries in case of fail. - @return (html `str()` of the page, `None`) The second argument is always `None` bacause there is not url to return. - ''' - - # Extract the HTML tree from the local part HTML page. - try: - tree = BeautifulSoup(local_part_html, 'lxml') - except Exception: - raise PartHtmlError - - try: - # Find the DIV in the tree for the given part and distributor. - class_ = dist + SEPRTR + pn - part_tree = tree.find('div', class_=class_) - url_tree = part_tree.find('div', class_='link') +from ...globals import SEPRTR as SEPRTR + +from .. import distributor + +from urllib.parse import urlsplit, urlunsplit + +class dist_local(distributor.distributor): + def __init__(self, scrape_retries, log_level, throttle_delay): + super(dist_local, self).__init__(scrape_retries, log_level, throttle_delay) + self.name = 'local' + + def create_part_html(self, parts, distributors): + '''@brief Create HTML page containing info for local (non-webscraped) parts. + @param parts `list()` of parts. + @parm `list()`of the distributors to check each one is local. + @return `str()` of the HTML page to be read by `get_part_html_tree()` + ''' + + self.logger.log(DEBUG_OVERVIEW, 'Create HTML page for parts with custom pricing...') + + doc, tag, text = Doc().tagtext() + with tag('html'): + with tag('body'): + for p in parts: + # Find the manufacturer's part number if it exists. + pn = p.fields.get('manf#') # Returns None if no manf# field. + + # Find the various distributors for this part by + # looking for leading fields terminated by SEPRTR. + for key in p.fields: + try: + dist = key[:key.index(SEPRTR)] + except ValueError: + continue + + # If the distributor is not in the list of web-scrapable distributors, + # then it's a local distributor. Copy the local distributor template + # and add it to the table of distributors. + if dist not in distributors: + distributors[dist] = copy.copy(distributors['local_template']) + distributors[dist]['label'] = dist # Set dist name for spreadsheet header. + + # Now look for catalog number, price list and webpage link for this part. 
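                    # (Illustration only, not part of the patch: these come from per-part fields
                    #  named '<dist>:cat#', '<dist>:pricing' and '<dist>:link'. Assuming SEPRTR is
                    #  the same ':' used in those field names, a pricing value such as
                    #  '1:0.59;10:0.41;100:0.23' is later decoded by dist_get_price_tiers() into
                    #  {1: 0.59, 10: 0.41, 100: 0.23}.)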
+ for dist in distributors: + cat_num = p.fields.get(dist+':cat#') + pricing = p.fields.get(dist+':pricing') + link = p.fields.get(dist+':link') + if cat_num is None and pricing is None and link is None: + continue + + def make_random_catalog_number(p): + hash_fields = {k: p.fields[k] for k in p.fields} + hash_fields['dist'] = dist + return '#{0:08X}'.format(abs(hash(tuple(sorted(hash_fields.items()))))) + + cat_num = cat_num or pn or make_random_catalog_number(p) + p.fields[dist+':cat#'] = cat_num # Store generated cat#. + with tag('div', klass=dist+SEPRTR+cat_num): + with tag('div', klass='cat#'): + text(cat_num) + if pricing is not None: + with tag('div', klass='pricing'): + text(pricing) + if link is not None: + url_parts = list(urlsplit(link)) + if url_parts[0] == '': + url_parts[0] = u'http' + link = urlunsplit(url_parts) + with tag('div', klass='link'): + text(link) + + # Remove the local distributor template so it won't be processed later on. + # It has served its purpose. try: - # Return the part data tree and any URL associated with the part. - return part_tree, url_tree.text.strip() + del distributors['local_template'] + except: + pass + + html = doc.getvalue() + if self.logger.isEnabledFor(DEBUG_OBSESSIVE): + print(indent(html)) + return html + + + def dist_get_price_tiers(self, html_tree): + '''@brief Get the pricing tiers from the parsed tree of the local product page. + @param html_tree `str()` html of the distributor part page. + @return `dict()` price breaks, the keys are the quantities breaks. + ''' + price_tiers = {} + try: + pricing = html_tree.find('div', class_='pricing').text + pricing = re.sub('[^0-9.;:]', '', pricing) # Keep only digits, decimals, delimiters. + for qty_price in pricing.split(';'): + qty, price = qty_price.split(SEPRTR) + price_tiers[int(qty)] = float(price) + except AttributeError: + # This happens when no pricing info is found in the tree. + self.logger.log(DEBUG_OBSESSIVE, 'No local pricing information found!') + return price_tiers # Return empty price tiers. + return price_tiers + + + def dist_get_part_num(self, html_tree): + '''@brief Get the part number from the local product page. + @param html_tree `str()` html of the distributor part page. + @return `list()`of the parts that match. + ''' + try: + part_num_str = html_tree.find('div', class_='cat#').text + return part_num_str + except AttributeError: + return '' + + + def dist_get_qty_avail(self, html_tree): + '''@brief Get the available quantity of the part from the local product page. + @param html_tree `str()` html of the distributor part page. + @return `int` avaliable quantity. + ''' + try: + qty_str = html_tree.find('div', class_='quantity').text + except (AttributeError, ValueError): + # Return 0 (not None) so this part will show in the spreadsheet + # even if there is no quantity found. + return 0 + try: + return int(re.sub('[^0-9]', '', qty_str)) + except ValueError: + # Return 0 (not None) so this part will show in the spreadsheet + # even if there is no quantity found. + self.logger.log(DEBUG_OBSESSIVE, 'No local part quantity found!') + return 0 + + # TODO: dist param + def dist_get_part_html_tree(self, pn, extra_search_terms='', url=None, descend=None, local_part_html=None): + '''Extract the HTML tree from the HTML page for local parts. + @param pn Part number `str()`. + @param extra_search_terms + @param url + @param descend + @param local_part_html + @return (html `str()` of the page, `None`) The second argument is always `None` bacause there is not url to return. 
+ ''' + + # Extract the HTML tree from the local part HTML page. + try: + tree = BeautifulSoup(local_part_html, 'lxml') + except Exception: + raise PartHtmlError + + try: + # Find the DIV in the tree for the given part and distributor. + class_ = self.name + SEPRTR + pn + part_tree = tree.find('div', class_=class_) + url_tree = part_tree.find('div', class_='link') + try: + # Return the part data tree and any URL associated with the part. + return part_tree, url_tree.text.strip() + except AttributeError: + # Return part data tree and None if the URL is not found. + return part_tree, None except AttributeError: - # Return part data tree and None if the URL is not found. - return part_tree, None - except AttributeError: - # Return an error if the part_tree is not found. - raise PartHtmlError + # Return an error if the part_tree is not found. + raise PartHtmlError From 0ec9ee17e69ded78d036e63df8d84f2b628a8573 Mon Sep 17 00:00:00 2001 From: Max Maisel Date: Fri, 1 Jun 2018 15:22:12 +0200 Subject: [PATCH 06/29] Implemented dist_digikey class. --- kicost/distributors/digikey/digikey.py | 628 +++++++++++++------------ 1 file changed, 320 insertions(+), 308 deletions(-) diff --git a/kicost/distributors/digikey/digikey.py b/kicost/distributors/digikey/digikey.py index 0653485a3..bd33295bd 100644 --- a/kicost/distributors/digikey/digikey.py +++ b/kicost/distributors/digikey/digikey.py @@ -1,6 +1,6 @@ # MIT license # -# Copyright (C) 2015 by XESS Corporation / Hildo Guillardi Junior +# Copyright (C) 2015 by XESS Corporation / Hildo Guillardi Junior / Max Maisel # # Permission is hereby granted, free of charge, to any person obtaining a copy # of this software and associated documentation files (the "Software"), to deal @@ -34,193 +34,343 @@ import future +# TODO: not working yet ? + import re, difflib from bs4 import BeautifulSoup import http.client # For web scraping exceptions. -from .. import urlencode, urlquote, urlsplit, urlunsplit +#from .. import urlencode, urlquote, urlsplit, urlunsplit from .. import fake_browser from .. import EXTRA_INFO_DIST, extra_info_dist_name_translations from ...globals import PartHtmlError from ...globals import logger, DEBUG_OVERVIEW, DEBUG_DETAILED, DEBUG_OBSESSIVE -from .. import distributor_dict +from .. import distributor, distributor_dict + +from urllib.parse import quote_plus as urlquote + import pycountry -def define_locale_currency(locale_iso=None, currency_iso=None): - '''@brief Configure the distributor for the country and currency intended. - - Scrape the configuration page and define the base URL of DigiKey for the - currency and locale chosen. - The currency is predominant over the locale/country and the defauld are - currency='USD' and locale='US' for DigiKey. - - @param locale_iso `str` Country in ISO3166 alpha 2 standard. - @param currency_iso `str` Currency in ISO4217 alpha 3 standard.''' - url = 'https://www.digikey.com/en/resources/international' - - try: - html = fake_browser(url, 4) - except: # Could not get a good read from the website. 
- logger.log(DEBUG_OBSESSIVE,'No HTML page for DigiKey configuration.') - raise PartHtmlError - html = BeautifulSoup(html, 'lxml') - try: - if currency_iso and not locale_iso: - money = pycountry.currencies.get(alpha_3=currency_iso.upper()) - locale_iso = pycountry.countries.get(numeric=money.numeric).alpha_2 - if locale_iso: - locale_iso = locale_iso.upper() - country = pycountry.countries.get(alpha_2=locale_iso.upper()).name - html = html.find('li', text=re.compile(country, re.IGNORECASE)) - url = html.find('a', id='linkcolor').get('href') - - distributor_dict['digikey']['site']['url'] = url - distributor_dict['digikey']['site']['currency'] = pycountry.currencies.get(numeric=country.numeric).alpha_3 - distributor_dict['digikey']['site']['locale'] = locale_iso - except: - logger.log(DEBUG_OVERVIEW, 'Kept the last configuration {}, {} on {}.'.format( - pycountry.currencies.get(alpha_3=distributor_dict['digikey']['site']['currency']).name, - pycountry.countries.get(alpha_2=distributor_dict['digikey']['site']['locale']).name, - distributor_dict['digikey']['site']['url'] - )) # Keep the current configuration. - return - - -def get_extra_info(html_tree): - '''@brief Get the extra characteristics `EXTRA_INFO_DIST` from the part web page. - @param html_tree `str()` html of the distributor part page. - @return `dict()` keys as characteristics names. - ''' - info = {} - try: - table = html_tree.find('table', id='prod-att-table') - for row in table.find_all('tr', id=None): # `None`to ignore the header row. - try: - k = row.find('th').text.strip().lower() - v = row.find('td').text.strip() - k = extra_info_dist_name_translations.get(k, k) - if k in EXTRA_INFO_DIST: - info[k] = v - except: - continue - if 'datasheet' in EXTRA_INFO_DIST: - try: - info['datasheet'] = html_tree.find('a', href=True, target='_blank').get('href') - if info['datasheet'][0:2]=='//': - info['datasheet'] = 'https:' + info['datasheet'] # Digikey missing definitions. - except: - pass - if 'image' in EXTRA_INFO_DIST: - try: - info['image'] = html_tree.find('img', itemprop="image").get('src') - if info['image'][0:2]=='//': - info['image'] = 'https:' + info['image'] # Digikey missing definitions. - except: - pass - except AttributeError: - # This happens when no pricing info is found in the tree. - logger.log(DEBUG_OBSESSIVE, 'No Digikey pricing information found!') - return info - - -def get_price_tiers(html_tree): - '''@brief Get the pricing tiers from the parsed tree of the Digikey product page. - @param html_tree `str()` html of the distributor part page. - @return `dict()` price breaks, the keys are the quantities breaks. - ''' - price_tiers = {} - try: - for tr in html_tree.find('table', id='product-dollars').find_all('tr'): - try: - td = tr.find_all('td') - qty = int(re.sub('[^0-9]', '', td[0].text)) - price_tiers[qty] = float(re.sub('[^0-9\.]', '', td[1].text)) - except (TypeError, AttributeError, ValueError, - IndexError): # Happens when there's no in table row. - continue - except AttributeError: - # This happens when no pricing info is found in the tree. - logger.log(DEBUG_OBSESSIVE, 'No Digikey pricing information found!') - return price_tiers - - -def part_is_reeled(html_tree): - '''@brief Returns True if this Digi-Key part is reeled or Digi-reeled. - @param html_tree `str()` html of the distributor part page. - @return `True` or `False`. 
- ''' - qty_tiers = list(get_price_tiers(html_tree).keys()) - if len(qty_tiers) > 0 and min(qty_tiers) >= 100: - return True - if html_tree.find('table', - id='product-details-reel-pricing') is not None: - return True - return False - - -def get_part_num(html_tree): - '''@brief Get the part number from the Digikey product page. - @param html_tree `str()` html of the distributor part page. - @return `list()`of the parts that match. - ''' - try: - return re.sub('\s', '', html_tree.find('td', - id='reportPartNumber').text) - except AttributeError: - logger.log(DEBUG_OBSESSIVE, 'No Digikey part number found!') - return '' - - -def get_qty_avail(html_tree): - '''@brief Get the available quantity of the part from the Digikey product page. - @param html_tree `str()` html of the distributor part page. - @return `int` avaliable quantity. - ''' - try: - qty_tree = html_tree.find('td', id='quantityAvailable').find('span', id='dkQty') - qty_str = qty_tree.text - except AttributeError: - # No quantity found (not even 0) so this is probably a non-stocked part. - # Return None so the part won't show in the spreadsheet for this dist. - return None - try: - qty_str = re.search('([0-9,]*)', qty_str, re.IGNORECASE).group(1) - return int(re.sub('[^0-9]', '', qty_str)) - except (AttributeError, ValueError): - # Didn't find the usual quantity text field. This might be one of those - # input fields for requesting a quantity, so get the value from the - # input field. +class dist_digikey(distributor.distributor): + def __init__(self, scrape_retries, log_level, throttle_delay): + super(dist_digikey, self).__init__(scrape_retries, log_level, throttle_delay) + self.name = 'digikey' + self.domain = distributor_dict[self.name]['site']['url'] + + self.browser.scrape_URL(self.domain) + self.browser.show_cookies(self.name) + + def dist_get_price_tiers(self, html_tree): + '''@brief Get the pricing tiers from the parsed tree of the Digikey product page. + @param html_tree `str()` html of the distributor part page. + @return `dict()` price breaks, the keys are the quantities breaks. + ''' + price_tiers = {} try: - logger.log(DEBUG_OBSESSIVE, 'No Digikey part quantity found!') - return int(qty_tree.find('input', type='text').get('value')) + for tr in html_tree.find('table', id='product-dollars').find_all('tr'): + try: + td = tr.find_all('td') + qty = int(re.sub('[^0-9]', '', td[0].text)) + price_tiers[qty] = float(re.sub('[^0-9\.]', '', td[1].text)) + except (TypeError, AttributeError, ValueError, + IndexError): # Happens when there's no in table row. + continue + except AttributeError: + # This happens when no pricing info is found in the tree. + self.logger.log(DEBUG_OBSESSIVE, 'No Digikey pricing information found!') + return price_tiers + + def dist_get_extra_info(self, html_tree): + '''@brief Get the extra characteristics `EXTRA_INFO_DIST` from the part web page. + @param html_tree `str()` html of the distributor part page. + @return `dict()` keys as characteristics names. + ''' + info = {} + try: + table = html_tree.find('table', id='prod-att-table') + for row in table.find_all('tr', id=None): # `None`to ignore the header row. 
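                # (Illustration only, not part of the patch: each data row is expected to pair a
                #  <th> label with a <td> value, e.g. <tr><th>Packaging</th><td>Tube</td></tr>;
                #  the label is lower-cased, mapped through extra_info_dist_name_translations,
                #  and kept only if it appears in EXTRA_INFO_DIST. 'Packaging' here is just an
                #  assumed example label.)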
+ try: + k = row.find('th').text.strip().lower() + v = row.find('td').text.strip() + k = extra_info_dist_name_translations.get(k, k) + if k in EXTRA_INFO_DIST: + info[k] = v + except: + continue + if 'datasheet' in EXTRA_INFO_DIST: + try: + info['datasheet'] = html_tree.find('a', href=True, target='_blank').get('href') + if info['datasheet'][0:2]=='//': + info['datasheet'] = 'https:' + info['datasheet'] # Digikey missing definitions. + except: + pass + if 'image' in EXTRA_INFO_DIST: + try: + info['image'] = html_tree.find('img', itemprop="image").get('src') + if info['image'][0:2]=='//': + info['image'] = 'https:' + info['image'] # Digikey missing definitions. + except: + pass + except AttributeError: + # This happens when no pricing info is found in the tree. + self.logger.log(DEBUG_OBSESSIVE, 'No Digikey pricing information found!') + return info + + def dist_define_locale_currency(self, locale_iso=None, currency_iso=None): + '''@brief Configure the distributor for the country and currency intended. + + Scrape the configuration page and define the base URL of DigiKey for the + currency and locale chosen. + The currency is predominant over the locale/country and the defauld are + currency='USD' and locale='US' for DigiKey. + + @param locale_iso `str` Country in ISO3166 alpha 2 standard. + @param currency_iso `str` Currency in ISO4217 alpha 3 standard.''' + + url = 'https://www.digikey.com/en/resources/international' + + try: + html = self.browser.scrape_URL(url) + except: # Could not get a good read from the website. + self.logger.log(DEBUG_OBSESSIVE,'No HTML page for DigiKey configuration.') + raise PartHtmlError + html = BeautifulSoup(html, 'lxml') + try: + if currency_iso and not locale_iso: + money = pycountry.currencies.get(alpha_3=currency_iso.upper()) + locale_iso = pycountry.countries.get(numeric=money.numeric).alpha_2 + if locale_iso: + locale_iso = locale_iso.upper() + country = pycountry.countries.get(alpha_2=locale_iso.upper()).name + html = html.find('li', text=re.compile(country, re.IGNORECASE)) + url = html.find('a', id='linkcolor').get('href') + + distributor_dict[self.name]['site']['url'] = url + distributor_dict[self.name]['site']['currency'] = pycountry.currencies.get(numeric=country.numeric).alpha_3 + distributor_dict[self.name]['site']['locale'] = locale_iso + + # Fetch cookies for new URL. + self.browser.scrape_URL(url) + except: + self.logger.log(DEBUG_OVERVIEW, 'Kept the last configuration {}, {} on {}.'.format( + pycountry.currencies.get(alpha_3=distributor_dict['digikey']['site']['currency']).name, + pycountry.countries.get(alpha_2=distributor_dict['digikey']['site']['locale']).name, + distributor_dict[self.name]['site']['url'] + )) # Keep the current configuration. + return + + def dist_get_part_num(self, html_tree): + '''@brief Get the part number from the Digikey product page. + @param html_tree `str()` html of the distributor part page. + @return `list()`of the parts that match. + ''' + try: + return re.sub('\s', '', html_tree.find('td', + id='reportPartNumber').text) + except AttributeError: + self.logger.log(DEBUG_OBSESSIVE, 'No Digikey part number found!') + return '' + + + def dist_get_qty_avail(self, html_tree): + '''@brief Get the available quantity of the part from the Digikey product page. + @param html_tree `str()` html of the distributor part page. + @return `int` avaliable quantity. 
+        '''
+        try:
+            qty_tree = html_tree.find('td', id='quantityAvailable').find('span', id='dkQty')
+            qty_str = qty_tree.text
+        except AttributeError:
+            # No quantity found (not even 0) so this is probably a non-stocked part.
+            # Return None so the part won't show in the spreadsheet for this dist.
+            return None
+        try:
+            qty_str = re.search('([0-9,]*)', qty_str, re.IGNORECASE).group(1)
+            return int(re.sub('[^0-9]', '', qty_str))
         except (AttributeError, ValueError):
-        # Didn't find the usual quantity text field. This might be one of those
-        # input fields for requesting a quantity, so get the value from the
-        # input field.
+            try:
+                self.logger.log(DEBUG_OBSESSIVE, 'No Digikey part quantity found!')
+                return int(qty_tree.find('input', type='text').get('value'))
+            except (AttributeError, ValueError):
+                # Well, there's a quantityAvailable section in the website, but
+                # it doesn't contain anything decipherable. Let's just assume it's 0.
+                return 0
+
+    def dist_get_part_html_tree(self, pn, extra_search_terms='', url=None, descend=2, local_part_html=None):
+        '''@brief Find the Digikey HTML page for a part number and return the URL and parse tree.
+        @param pn Part number `str()`.
+        @param extra_search_terms
+        @param url
+        @param descend
+        @param local_part_html
+        @return (html `str()` of the page, url)
+        '''
+
+        # Use the part number to lookup the part using the site search function, unless a starting url was given.
+        if url is None:
+            url = distributor_dict['digikey']['site']['url'] + '/products/en?keywords=' + urlquote(
+                #'/scripts/DkSearch/dksus.dll?WT.z_header=search_go&lang=en&keywords=' + urlquote(
+                pn + ' ' + extra_search_terms,
+                safe='')
+            #url = distributor_dict['digikey']['site']['url'] + '/product-search/en?KeyWords=' + urlquote(pn,safe='') + '&WT.z_header=search_go'
+        elif url[0] == '/':
+            url = distributor_dict['digikey']['site']['url'] + url
+
+        # Open the URL, read the HTML from it, and parse it into a tree structure.
+        try:
+            html = self.browser.scrape_URL(url)
+        except:
+            self.logger.log(DEBUG_OBSESSIVE,'No HTML page for {} from {}'.format(pn, self.name))
+            raise PartHtmlError
+
+        # Abort if the part number isn't in the HTML somewhere.
+        # (Only use the numbers and letters to compare PN to HTML.)
+        if re.sub('[\W_]','',str.lower(pn)) not in re.sub('[\W_]','',str.lower(str(html))):
+            self.logger.log(DEBUG_OBSESSIVE,'No part number {} in HTML page from {}'.format(pn, self.name))
+            raise PartHtmlError
+
+        # Use the following code if Javascript challenge pages are used to block scrapers.
+        # try:
+        #     ghst = Ghost()
+        #     sess = ghst.start(plugins_enabled=False, download_images=False, show_scrollbars=False, javascript_enabled=False)
+        #     html, resources = sess.open(url)
+        #     print('type of HTML is {}'.format(type(html.content)))
+        #     html = html.content
+        # except Exception as e:
+        #     print('Exception reading with Ghost: {}'.format(e))
+
+        try:
+            tree = BeautifulSoup(html, 'lxml')
+        except Exception:
+            self.logger.log(DEBUG_OBSESSIVE,'No HTML tree for {} from {}'.format(pn, self.name))
+            raise PartHtmlError
+
+        # If the tree contains the tag for a product page, then return it.
+        if tree.find('div', class_='product-top-section') is not None:
+
+            # Digikey separates cut-tape and reel packaging, so we need to examine more pages
+            # to get all the pricing info. But don't descend any further if limit has been reached.
+            if descend > 0:
+                try:
+                    # Find all the URLs to alternate-packaging pages for this part.
+                    ap_urls = [
+                        ap.find('li', class_='lnkAltPack').find_all('a')[-1].get('href')
+                        for ap in tree.find(
+                            'div', class_='bota',
+                            id='additionalPackaging').find_all(
+                                'ul', class_='more-expander-item')
+                    ]
+                    self.logger.log(DEBUG_OBSESSIVE,'Found {} alternate packagings for {} from {}'.format(len(ap_urls), pn, self.name))
+                    ap_trees_and_urls = []  # Initialize as empty in case no alternate packagings are found.
+                    try:
+                        ap_trees_and_urls = [self.dist_get_part_html_tree(pn,
+                                    extra_search_terms, ap_url, descend=0)
+                                             for ap_url in ap_urls]
+                    except Exception:
+                        self.logger.log(DEBUG_OBSESSIVE,'Failed to find alternate packagings for {} from {}'.format(pn, self.name))
+
+                    # Put the main tree on the list as well and then look through
+                    # the entire list for one that's non-reeled. Use this as the
+                    # main page for the part.
+                    ap_trees_and_urls.append((tree, url))
+                    if self.part_is_reeled(tree):
+                        for ap_tree, ap_url in ap_trees_and_urls:
+                            if not self.part_is_reeled(ap_tree):
+                                # Found a non-reeled part, so use it as the main page.
+                                tree = ap_tree
+                                url = ap_url
+                                break  # Done looking.
+
+                    # Now go through the other pages, merging their pricing and quantity
+                    # info into the main page.
+                    for ap_tree, ap_url in ap_trees_and_urls:
+                        if ap_tree is tree:
+                            continue  # Skip examining the main tree. It already contains its info.
+                        try:
+                            # Merge the pricing info from that into the main parse tree to make
+                            # a single, unified set of price tiers...
+                            self.merge_price_tiers(tree, ap_tree)
+                            # and merge available quantity, using the maximum found.
+                            self.merge_qty_avail(tree, ap_tree)
+                        except AttributeError:
+                            self.logger.log(DEBUG_OBSESSIVE,'Problem merging price/qty for {} from {}'.format(pn, self.name))
+                            continue
+                except AttributeError as e:
+                    self.logger.log(DEBUG_OBSESSIVE,'Problem parsing URLs from product page for {} from {}'.format(pn, self.name))
+
+            return tree, url  # Return the parse tree and the URL where it came from.
+
+        # If the tree is for a list of products, then examine the links to try to find the part number.
+        if tree.find('table', id='productTable') is not None:
+            self.logger.log(DEBUG_OBSESSIVE,'Found product table for {} from {}'.format(pn, self.name))
+            if descend <= 0:
+                self.logger.log(DEBUG_OBSESSIVE,'Passed descent limit for {} from {}'.format(pn, self.name))
+                raise PartHtmlError
+            else:
+                # Look for the table of products.
+                products = tree.find(
+                    'table',
+                    id='productTable').find('tbody').find_all('tr')
+
+                # Extract the product links for the part numbers from the table.
+                # Extract links for both manufacturer and catalog numbers.
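                # (Illustration only, not part of the patch: each result row is expected to
                #  carry both a manufacturer part number cell, 'tr-mfgPartNumber', and a
                #  Digi-Key catalog number cell, 'tr-dkPartNumber', so the closest-match
                #  search can hit either numbering scheme. For assumed values such as
                #      pn = 'MAX232CPE'
                #      part_numbers = ['MAX232ACPE+', 'MAX232CPE+', 'MAX3232CPE+']
                #  difflib.get_close_matches(pn, part_numbers, 1, 0.0) returns ['MAX232CPE+'];
                #  the 0.0 cutoff means the best available candidate is always chosen, however
                #  poor the match.)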
+                product_links = [p.find('td',
+                                         class_='tr-mfgPartNumber').a
+                                 for p in products]
+                product_links.extend([p.find('td',
+                                              class_='tr-dkPartNumber').a
+                                      for p in products])
+
+                # Extract all the part numbers from the text portion of the links.
+                part_numbers = [l.text for l in product_links]
+
+                # Look for the part number in the list that most closely matches the requested part number.
+                match = difflib.get_close_matches(pn, part_numbers, 1, 0.0)[0]
+
+                # Now look for the link that goes with the closest matching part number.
+                for l in product_links:
+                    if l.text == match:
+                        # Get the tree for the linked-to page and return that.
+                        self.logger.log(DEBUG_OBSESSIVE,'Selecting {} from product table for {} from {}'.format(l.text.strip(), pn, self.name))
+                        return self.dist_get_part_html_tree(pn, extra_search_terms,
+                                    url=l.get('href', ''),
+                                    descend=descend - 1)
+
+        # If the HTML contains a list of part categories, then give up.
+        if tree.find('form', id='keywordSearchForm') is not None:
+            self.logger.log(DEBUG_OBSESSIVE,'Found high-level part categories for {} from {}'.format(pn, self.name))
+            raise PartHtmlError
+
+        # I don't know what happened here, so give up.
+        self.logger.log(DEBUG_OBSESSIVE,'Unknown error for {} from {}'.format(pn, self.name))
+        raise PartHtmlError
+
+    def part_is_reeled(self, html_tree):
+        '''@brief Returns True if this Digi-Key part is reeled or Digi-reeled.
+        @param html_tree `str()` html of the distributor part page.
+        @return `True` or `False`.
+        '''
+        qty_tiers = list(self.dist_get_price_tiers(html_tree).keys())
+        if len(qty_tiers) > 0 and min(qty_tiers) >= 100:
+            return True
+        if html_tree.find('table',
+                          id='product-details-reel-pricing') is not None:
+            return True
+        return False
+
+    def merge_price_tiers(self, main_tree, alt_tree):
         '''Merge the price tiers from the alternate-packaging tree into the main tree.'''
         try:
             insertion_point = main_tree.find('table', id='product-dollars').find('tr')
             for tr in alt_tree.find('table', id='product-dollars').find_all('tr'):
                 insertion_point.insert_after(tr)
         except AttributeError:
-            logger.log(DEBUG_OBSESSIVE, 'Problem merging price tiers for Digikey part {} with alternate packaging!'.format(pn))
+            self.logger.log(DEBUG_OBSESSIVE, 'Problem merging price tiers for Digikey part {} with alternate packaging!'.format(pn))

-    def merge_qty_avail(main_tree, alt_tree):
+    def merge_qty_avail(self, main_tree, alt_tree):
         '''Merge the quantities from the alternate-packaging tree into the main tree.'''
         try:
             main_qty = get_qty_avail(main_tree)
@@ -235,144 +385,6 @@ def merge_qty_avail(main_tree, alt_tree):
             insertion_point = main_tree.find('td', id='quantityAvailable').find('span', id='dkQty')
             insertion_point.string = '{}'.format(merged_qty)
         except AttributeError:
-            logger.log(DEBUG_OBSESSIVE, 'Problem merging available quantities for Digikey part {} with alternate packaging!'.format(pn))
+            self.logger.log(DEBUG_OBSESSIVE, 'Problem merging available quantities for Digikey part {} with alternate packaging!'.format(pn))
-
-    # Use the part number to lookup the part using the site search function, unless a starting url was given.
-    if url is None:
-        url = distributor_dict['digikey']['site']['url'] + '/products/en?keywords=' + urlquote(
-            #'/scripts/DkSearch/dksus.dll?WT.z_header=search_go&lang=en&keywords=' + urlquote(
-            pn + ' ' + extra_search_terms,
-            safe='')
-        #url = distributor_dict['digikey']['site']['url'] + '/product-search/en?KeyWords=' + urlquote(pn,safe='') + '&WT.z_header=search_go'
-    elif url[0] == '/':
-        url = distributor_dict['digikey']['site']['url'] + url
-
-    # Open the URL, read the HTML from it, and parse it into a tree structure.
- try: - html = fake_browser(url, scrape_retries) - except: - logger.log(DEBUG_OBSESSIVE,'No HTML page for {} from {}'.format(pn, dist)) - raise PartHtmlError - - # Abort if the part number isn't in the HTML somewhere. - # (Only use the numbers and letters to compare PN to HTML.) - if re.sub('[\W_]','',str.lower(pn)) not in re.sub('[\W_]','',str.lower(str(html))): - logger.log(DEBUG_OBSESSIVE,'No part number {} in HTML page from {}'.format(pn, dist)) - raise PartHtmlError - - # Use the following code if Javascript challenge pages are used to block scrapers. - # try: - # ghst = Ghost() - # sess = ghst.start(plugins_enabled=False, download_images=False, show_scrollbars=False, javascript_enabled=False) - # html, resources = sess.open(url) - # print('type of HTML is {}'.format(type(html.content))) - # html = html.content - # except Exception as e: - # print('Exception reading with Ghost: {}'.format(e)) - - try: - tree = BeautifulSoup(html, 'lxml') - except Exception: - logger.log(DEBUG_OBSESSIVE,'No HTML tree for {} from {}'.format(pn, dist)) - raise PartHtmlError + self.logger.log(DEBUG_OBSESSIVE, 'Problem merging available quantities for Digikey part {} with alternate packaging!'.format(pn)) - # If the tree contains the tag for a product page, then return it. - if tree.find('div', class_='product-top-section') is not None: - - # Digikey separprint(ates cut-tape and reel packaging, so we need to examine more pages - # to get all the pricing info. But don't descend any further if limit has been reached. - if descend > 0: - try: - # Find all the URLs to alternate-packaging pages for this part. - ap_urls = [ - ap.find('li', class_='lnkAltPack').find_all('a')[-1].get('href') - for ap in tree.find( - 'div', class_='bota', - id='additionalPackaging').find_all( - 'ul', class_='more-expander-item') - ] - logger.log(DEBUG_OBSESSIVE,'Found {} alternate packagings for {} from {}'.format(len(ap_urls), pn, dist)) - ap_trees_and_urls = [] # Initialize as empty in case no alternate packagings are found. - try: - ap_trees_and_urls = [get_part_html_tree(dist, pn, - extra_search_terms, ap_url, descend=0, scrape_retries=scrape_retries) - for ap_url in ap_urls] - except Exception: - logger.log(DEBUG_OBSESSIVE,'Failed to find alternate packagings for {} from {}'.format(pn, dist)) - - # Put the main tree on the list as well and then look through - # the entire list for one that's non-reeled. Use this as the - # main page for the part. - ap_trees_and_urls.append((tree, url)) - if part_is_reeled(tree): - for ap_tree, ap_url in ap_trees_and_urls: - if not part_is_reeled(ap_tree): - # Found a non-reeled part, so use it as the main page. - tree = ap_tree - url = ap_url - break # Done looking. - - # Now go through the other pages, merging their pricing and quantity - # info into the main page. - for ap_tree, ap_url in ap_trees_and_urls: - if ap_tree is tree: - continue # Skip examining the main tree. It already contains its info. - try: - # Merge the pricing info from that into the main parse tree to make - # a single, unified set of price tiers... - merge_price_tiers(tree, ap_tree) - # and merge available quantity, using the maximum found. - merge_qty_avail(tree, ap_tree) - except AttributeError: - logger.log(DEBUG_OBSESSIVE,'Problem merging price/qty for {} from {}'.format(pn, dist)) - continue - except AttributeError as e: - logger.log(DEBUG_OBSESSIVE,'Problem parsing URLs from product page for {} from {}'.format(pn, dist)) - - return tree, url # Return the parse tree and the URL where it came from. 
- - # If the tree is for a list of products, then examine the links to try to find the part number. - if tree.find('table', id='productTable') is not None: - logger.log(DEBUG_OBSESSIVE,'Found product table for {} from {}'.format(pn, dist)) - if descend <= 0: - logger.log(DEBUG_OBSESSIVE,'Passed descent limit for {} from {}'.format(pn, dist)) - raise PartHtmlError - else: - # Look for the table of products. - products = tree.find( - 'table', - id='productTable').find('tbody').find_all('tr') - - # Extract the product links for the part numbers from the table. - # Extract links for both manufacturer and catalog numbers. - product_links = [p.find('td', - class_='tr-mfgPartNumber').a - for p in products] - product_links.extend([p.find('td', - class_='tr-dkPartNumber').a - for p in products]) - - # Extract all the part numbers from the text portion of the links. - part_numbers = [l.text for l in product_links] - - # Look for the part number in the list that most closely matches the requested part number. - match = difflib.get_close_matches(pn, part_numbers, 1, 0.0)[0] - - # Now look for the link that goes with the closest matching part number. - for l in product_links: - if l.text == match: - # Get the tree for the linked-to page and return that. - logger.log(DEBUG_OBSESSIVE,'Selecting {} from product table for {} from {}'.format(l.text.strip(), pn, dist)) - return get_part_html_tree(dist, pn, extra_search_terms, - url=l.get('href', ''), - descend=descend - 1, - scrape_retries=scrape_retries) - - # If the HTML contains a list of part categories, then give up. - if tree.find('form', id='keywordSearchForm') is not None: - logger.log(DEBUG_OBSESSIVE,'Found high-level part categories for {} from {}'.format(pn, dist)) - raise PartHtmlError - # I don't know what happened here, so give up. - logger.log(DEBUG_OBSESSIVE,'Unknown error for {} from {}'.format(pn, dist)) - raise PartHtmlError From 4e10f2b58eacff38617c654b6aa33437da81bd98 Mon Sep 17 00:00:00 2001 From: Max Maisel Date: Fri, 1 Jun 2018 15:22:26 +0200 Subject: [PATCH 07/29] Implemented dist_newark class. --- kicost/distributors/newark/__init__.py | 2 +- kicost/distributors/newark/newark.py | 337 +++++++++++++------------ 2 files changed, 174 insertions(+), 165 deletions(-) diff --git a/kicost/distributors/newark/__init__.py b/kicost/distributors/newark/__init__.py index 3be052ed8..ec6b05197 100644 --- a/kicost/distributors/newark/__init__.py +++ b/kicost/distributors/newark/__init__.py @@ -26,7 +26,7 @@ }, # Web site defitions. 'site': { - 'url': 'http://www.newark.com/', + 'url': 'https://www.newark.com/', 'currency': 'USD', 'locale': 'US' }, diff --git a/kicost/distributors/newark/newark.py b/kicost/distributors/newark/newark.py index 88079b644..bbd03fa9e 100644 --- a/kicost/distributors/newark/newark.py +++ b/kicost/distributors/newark/newark.py @@ -38,176 +38,185 @@ import difflib from bs4 import BeautifulSoup import http.client # For web scraping exceptions. -from .. import urlencode, urlquote, urlsplit, urlunsplit -from .. import fake_browser, WEB_SCRAPE_EXCEPTIONS +#from .. import urlencode, urlquote, urlsplit, urlunsplit +from .. import fake_browser from ...globals import PartHtmlError from ...globals import logger, DEBUG_OVERVIEW, DEBUG_DETAILED, DEBUG_OBSESSIVE +from .. 
import distributor, distributor_dict + +from urllib.parse import quote_plus as urlquote + +class dist_newark(distributor.distributor): + def __init__(self, scrape_retries, log_level, throttle_delay): + super(dist_newark, self).__init__(scrape_retries, log_level, throttle_delay) + self.name = 'newark' + self.domain = distributor_dict[self.name]['site']['url'] + + self.browser.scrape_URL(self.domain) + self.browser.show_cookies(self.name) + + def dist_get_price_tiers(self, html_tree): + '''@brief Get the pricing tiers from the parsed tree of the Newark product page. + @param html_tree `str()` html of the distributor part page. + @return `dict()` price breaks, the keys are the quantities breaks. + ''' + price_tiers = {} + try: + qty_strs = [] + for qty in html_tree.find( + 'table', + class_=('tableProductDetailPrice', 'pricing')).find_all( + 'td', + class_='qty'): + qty_strs.append(qty.text) + price_strs = [] + for price in html_tree.find( + 'table', + class_=('tableProductDetailPrice', 'pricing')).find_all( + 'td', + class_='threeColTd'): + price_strs.append(price.text) + qtys_prices = list(zip(qty_strs, price_strs)) + for qty_str, price_str in qtys_prices: + try: + qty = re.search('(\s*)([0-9,]+)', qty_str).group(2) + qty = int(re.sub('[^0-9]', '', qty)) + price_tiers[qty] = float(re.sub('[^0-9\.]', '', price_str)) + except (TypeError, AttributeError, ValueError): + continue + except AttributeError: + # This happens when no pricing info is found in the tree. + self.logger.log(DEBUG_OBSESSIVE, 'No Newark pricing information found!') + return price_tiers # Return empty price tiers. + return price_tiers + + + def dist_get_part_num(self, html_tree): + '''@brief Get the part number from the Newark product page. + @param html_tree `str()` html of the distributor part page. + @return `list()`of the parts that match. + ''' + try: + # Newark catalog number is stored in a description list, so get + # all the list terms and descriptions, strip all the spaces from those, + # and pair them up. + div = html_tree.find('div', class_='productDescription').find('dl') + dt = [re.sub('\s','',d.text) for d in div.find_all('dt')] + dd = [re.sub('\s','',d.text) for d in div.find_all('dd')] + dtdd = {k:v for k,v in zip(dt,dd)} # Pair terms with descriptions. + return dtdd.get('NewarkPartNo.:', '') + except KeyError: + self.logger.log(DEBUG_OBSESSIVE, 'No Newark catalog number found!') + return '' # No catalog number found in page. + except AttributeError: + self.logger.log(DEBUG_OBSESSIVE, 'No Newark product description found!') + return '' # No ProductDescription found in page. + + + def dist_get_qty_avail(self, html_tree): + '''@brief Get the available quantity of the part from the Newark product page. + @param html_tree `str()` html of the distributor part page. + @return `int` avaliable quantity. + ''' + try: + qty_str = html_tree.find('p', class_='availabilityHeading').text + except (AttributeError, ValueError): + # No quantity found (not even 0) so this is probably a non-stocked part. + # Return None so the part won't show in the spreadsheet for this dist. + return None + try: + qty = re.sub('[^0-9]','',qty_str) # Strip all non-number chars. + return int(re.sub('[^0-9]', '', qty_str)) # Return integer for quantity. + except ValueError: + # No quantity found (not even 0) so this is probably a non-stocked part. + # Return None so the part won't show in the spreadsheet for this dist. 
+ self.logger.log(DEBUG_OBSESSIVE, 'No Newark part quantity found!') + return None + + + def dist_get_part_html_tree(self, pn, extra_search_terms='', url=None, descend=2, local_part_html=None): + '''@brief Find the Newark HTML page for a part number and return the URL and parse tree. + @param pn Part number `str()`. + @param extra_search_terms + @param url + @param descend + @param local_part_html + @return (html `str()` of the page, url) + ''' + + # Use the part number to lookup the part using the site search function, unless a starting url was given. + if url is None: + url = 'http://www.newark.com/webapp/wcs/stores/servlet/Search?catalogId=15003&langId=-1&storeId=10194&gs=true&st=' + urlquote( + pn + ' ' + extra_search_terms, + safe='') + elif url[0] == '/': + url = 'http://www.newark.com' + url + elif url.startswith('..'): + url = 'http://www.newark.com/Search/' + url + + # Open the URL, read the HTML from it, and parse it into a tree structure. + try: + html = self.browser.scrape_URL(url) + except: + self.logger.log(DEBUG_OBSESSIVE,'No HTML page for {} from {}'.format(pn, self.name)) + raise PartHtmlError -def get_price_tiers(html_tree): - '''@brief Get the pricing tiers from the parsed tree of the Newark product page. - @param html_tree `str()` html of the distributor part page. - @return `dict()` price breaks, the keys are the quantities breaks. - ''' - price_tiers = {} - try: - qty_strs = [] - for qty in html_tree.find( - 'table', - class_=('tableProductDetailPrice', 'pricing')).find_all( - 'td', - class_='qty'): - qty_strs.append(qty.text) - price_strs = [] - for price in html_tree.find( - 'table', - class_=('tableProductDetailPrice', 'pricing')).find_all( - 'td', - class_='threeColTd'): - price_strs.append(price.text) - qtys_prices = list(zip(qty_strs, price_strs)) - for qty_str, price_str in qtys_prices: - try: - qty = re.search('(\s*)([0-9,]+)', qty_str).group(2) - qty = int(re.sub('[^0-9]', '', qty)) - price_tiers[qty] = float(re.sub('[^0-9\.]', '', price_str)) - except (TypeError, AttributeError, ValueError): - continue - except AttributeError: - # This happens when no pricing info is found in the tree. - logger.log(DEBUG_OBSESSIVE, 'No Newark pricing information found!') - return price_tiers # Return empty price tiers. - return price_tiers - - -def get_part_num(html_tree): - '''@brief Get the part number from the Newark product page. - @param html_tree `str()` html of the distributor part page. - @return `list()`of the parts that match. - ''' - try: - # Newark catalog number is stored in a description list, so get - # all the list terms and descriptions, strip all the spaces from those, - # and pair them up. - div = html_tree.find('div', class_='productDescription').find('dl') - dt = [re.sub('\s','',d.text) for d in div.find_all('dt')] - dd = [re.sub('\s','',d.text) for d in div.find_all('dd')] - dtdd = {k:v for k,v in zip(dt,dd)} # Pair terms with descriptions. - return dtdd.get('NewarkPartNo.:', '') - except KeyError: - logger.log(DEBUG_OBSESSIVE, 'No Newark catalog number found!') - return '' # No catalog number found in page. - except AttributeError: - logger.log(DEBUG_OBSESSIVE, 'No Newark product description found!') - return '' # No ProductDescription found in page. - - -def get_qty_avail(html_tree): - '''@brief Get the available quantity of the part from the Newark product page. - @param html_tree `str()` html of the distributor part page. - @return `int` avaliable quantity. 
- ''' - try: - qty_str = html_tree.find('p', class_='availabilityHeading').text - except (AttributeError, ValueError): - # No quantity found (not even 0) so this is probably a non-stocked part. - # Return None so the part won't show in the spreadsheet for this dist. - return None - try: - qty = re.sub('[^0-9]','',qty_str) # Strip all non-number chars. - return int(re.sub('[^0-9]', '', qty_str)) # Return integer for quantity. - except ValueError: - # No quantity found (not even 0) so this is probably a non-stocked part. - # Return None so the part won't show in the spreadsheet for this dist. - logger.log(DEBUG_OBSESSIVE, 'No Newark part quantity found!') - return None - - -def get_part_html_tree(dist, pn, extra_search_terms='', url=None, descend=2, local_part_html=None, scrape_retries=2): - '''@brief Find the Newark HTML page for a part number and return the URL and parse tree. - @param dist - @param pn Part number `str()`. - @param extra_search_terms - @param url - @param descend - @param local_part_html - @param scrape_retries `int` Quantity of retries in case of fail. - @return (html `str()` of the page, url) - ''' - - # Use the part number to lookup the part using the site search function, unless a starting url was given. - if url is None: - url = 'http://www.newark.com/webapp/wcs/stores/servlet/Search?catalogId=15003&langId=-1&storeId=10194&gs=true&st=' + urlquote( - pn + ' ' + extra_search_terms, - safe='') - elif url[0] == '/': - url = 'http://www.newark.com' + url - elif url.startswith('..'): - url = 'http://www.newark.com/Search/' + url - - # Open the URL, read the HTML from it, and parse it into a tree structure. - try: - html = fake_browser(url, scrape_retries) - except: - logger.log(DEBUG_OBSESSIVE,'No HTML page for {} from {}'.format(pn, dist)) - raise PartHtmlError - - try: - tree = BeautifulSoup(html, 'lxml') - except Exception: - logger.log(DEBUG_OBSESSIVE,'No HTML tree for {} from {}'.format(pn, dist)) - raise PartHtmlError - - # Abort if the part number isn't in the HTML somewhere. - # (Only use the numbers and letters to compare PN to HTML.) - if re.sub('[\W_]','',str.lower(pn)) not in re.sub('[\W_]','',str.lower(str(html))): - logger.log(DEBUG_OBSESSIVE,'No part number {} in HTML page from {}'.format(pn, dist)) - raise PartHtmlError - - # If the tree contains the tag for a product page, then just return it. - if tree.find('div', class_='productDisplay', id='page') is not None: - return tree, url + try: + tree = BeautifulSoup(html, 'lxml') + except Exception: + self.logger.log(DEBUG_OBSESSIVE,'No HTML tree for {} from {}'.format(pn, self.name)) + raise PartHtmlError - # If the tree is for a list of products, then examine the links to try to find the part number. - if tree.find('table', class_='productLister', id='sProdList') is not None: - logger.log(DEBUG_OBSESSIVE,'Found product table for {} from {}'.format(pn, dist)) - if descend <= 0: - logger.log(DEBUG_OBSESSIVE,'Passed descent limit for {} from {}'.format(pn, dist)) + # Abort if the part number isn't in the HTML somewhere. + # (Only use the numbers and letters to compare PN to HTML.) + if re.sub('[\W_]','',str.lower(pn)) not in re.sub('[\W_]','',str.lower(str(html))): + self.logger.log(DEBUG_OBSESSIVE,'No part number {} in HTML page from {}'.format(pn, self.name)) raise PartHtmlError - else: - # Look for the table of products. - products = tree.find('table', - class_='productLister', - id='sProdList').find('tbody').find_all('tr') - - # Extract the product links for the part numbers from the table. 
- product_links = [] - for p in products: - try: - product_links.append( - p.find('td', class_='mftrPart').find('a')) - except AttributeError: - continue - # Extract all the part numbers from the text portion of the links. - part_numbers = [l.text for l in product_links] + # If the tree contains the tag for a product page, then just return it. + if tree.find('div', class_='productDisplay', id='page') is not None: + return tree, url - # Look for the part number in the list that most closely matches the requested part number. - try: - match = difflib.get_close_matches(pn, part_numbers, 1, 0.0)[0] - except IndexError: + # If the tree is for a list of products, then examine the links to try to find the part number. + if tree.find('table', class_='productLister', id='sProdList') is not None: + self.logger.log(DEBUG_OBSESSIVE,'Found product table for {} from {}'.format(pn, self.name)) + if descend <= 0: + self.logger.log(DEBUG_OBSESSIVE,'Passed descent limit for {} from {}'.format(pn, self.name)) raise PartHtmlError - - # Now look for the link that goes with the closest matching part number. - for l in product_links: - if l.text == match: - # Get the tree for the linked-to page and return that. - logger.log(DEBUG_OBSESSIVE,'Selecting {} from product table for {} from {}'.format(l.text.strip(), pn, dist)) - return get_part_html_tree(dist, pn, extra_search_terms, - url=l.get('href', ''), - descend=descend-1, - scrape_retries=scrape_retries) - - # I don't know what happened here, so give up. - logger.log(DEBUG_OBSESSIVE,'Unknown error for {} from {}'.format(pn, dist)) - raise PartHtmlError + else: + # Look for the table of products. + products = tree.find('table', + class_='productLister', + id='sProdList').find('tbody').find_all('tr') + + # Extract the product links for the part numbers from the table. + product_links = [] + for p in products: + try: + product_links.append( + p.find('td', class_='mftrPart').find('a')) + except AttributeError: + continue + + # Extract all the part numbers from the text portion of the links. + part_numbers = [l.text for l in product_links] + + # Look for the part number in the list that most closely matches the requested part number. + try: + match = difflib.get_close_matches(pn, part_numbers, 1, 0.0)[0] + except IndexError: + raise PartHtmlError + + # Now look for the link that goes with the closest matching part number. + for l in product_links: + if l.text == match: + # Get the tree for the linked-to page and return that. + self.logger.log(DEBUG_OBSESSIVE,'Selecting {} from product table for {} from {}'.format(l.text.strip(), pn, self.name)) + return self.dist_get_part_html_tree(pn, extra_search_terms, + url=l.get('href', ''), + descend=descend-1) + + # I don't know what happened here, so give up. + self.logger.log(DEBUG_OBSESSIVE,'Unknown error for {} from {}'.format(pn, self.name)) + raise PartHtmlError From 7fcc91fcb9f8e1d9a5cacf81dbc75e5fe05f0f91 Mon Sep 17 00:00:00 2001 From: Max Maisel Date: Fri, 1 Jun 2018 15:22:39 +0200 Subject: [PATCH 08/29] Implemented dist_rs class. --- kicost/distributors/rs/__init__.py | 2 +- kicost/distributors/rs/rs.py | 275 +++++++++++++++-------------- 2 files changed, 143 insertions(+), 134 deletions(-) diff --git a/kicost/distributors/rs/__init__.py b/kicost/distributors/rs/__init__.py index de299c740..5c06b17b3 100644 --- a/kicost/distributors/rs/__init__.py +++ b/kicost/distributors/rs/__init__.py @@ -25,7 +25,7 @@ }, # Web site defitions. 
'site': { - 'url': 'http://rs-online.com/', + 'url': 'https://rs-online.com/', 'currency': 'USD', 'locale': 'UK' }, diff --git a/kicost/distributors/rs/rs.py b/kicost/distributors/rs/rs.py index 61de641ad..34b58fa8d 100644 --- a/kicost/distributors/rs/rs.py +++ b/kicost/distributors/rs/rs.py @@ -34,148 +34,157 @@ import re, difflib from bs4 import BeautifulSoup import http.client # For web scraping exceptions. -from .. import urlencode, urlquote, urlsplit, urlunsplit +#from .. import urlencode, urlquote, urlsplit, urlunsplit from .. import fake_browser from ...globals import PartHtmlError from ...globals import logger, DEBUG_OVERVIEW, DEBUG_DETAILED, DEBUG_OBSESSIVE from currency_converter import CurrencyConverter currency = CurrencyConverter() +from .. import distributor, distributor_dict -def get_price_tiers(html_tree): - '''@brief Get the pricing tiers from the parsed tree of the RS Components product page. - @param html_tree `str()` html of the distributor part page. - @return `dict()` price breaks, the keys are the quantities breaks. - ''' - price_tiers = {} - - try: - for row in html_tree.find_all('div', class_='table-row value-row'): - qty = row.find('div', - class_='breakRangeWithoutUnit col-xs-4').text - price = row.find('div', - class_='unitPrice col-xs-4').text - try: - qty = int( re.findall('\s*([0-9\,]+)', qty)[0] ) - price = re.sub('[^0-9\.]', '', price.replace(',','.') ) - price = currency.convert(float(price), 'EUR', 'USD') - price_tiers[qty] = price - except (TypeError, AttributeError, ValueError): - continue - except AttributeError: - # This happens when no pricing info is found in the tree. - return price_tiers # Return empty price tiers. - return price_tiers - -def get_part_num(html_tree): - '''@brief Get the part number from the RS product page. - @param html_tree `str()` html of the distributor part page. - @return `dict()` price breaks, the keys are the quantities breaks. - ''' - try: - pn_str = html_tree.find('span', class_='keyValue').text - pn = re.sub('[^0-9\-]','', pn_str) - return pn - except KeyError: - return '' # No catalog number found in page. - except AttributeError: - return '' # No ProductDescription found in page. - -def get_qty_avail(html_tree): - '''Get the available quantity of the part from the RS product page. - @param html_tree `str()` html of the distributor part page. - @return `int` avaliable quantity. - ''' +from urllib.parse import quote_plus as urlquote + +class dist_rs(distributor.distributor): + def __init__(self, scrape_retries, log_level, throttle_delay): + super(dist_rs, self).__init__(scrape_retries, log_level, throttle_delay) + self.name = 'rs' + self.domain = distributor_dict[self.name]['site']['url'] + + self.browser.scrape_URL(self.domain) + self.browser.show_cookies(self.name) + + def dist_get_price_tiers(self, html_tree): + '''@brief Get the pricing tiers from the parsed tree of the RS Components product page. + @param html_tree `str()` html of the distributor part page. + @return `dict()` price breaks, the keys are the quantities breaks. + ''' + price_tiers = {} - try: - # Note that 'availability' is misspelled in the container class name! - qty_str = html_tree.find('span', class_=('stock-msg-content', 'table-cell')).text - except (AttributeError, ValueError): - # No quantity found (not even 0) so this is probably a non-stocked part. - # Return None so the part won't show in the spreadsheet for this dist. - return None - try: - qty = re.sub('[^0-9]','',qty_str[0:10]) # Strip all non-number chars. 
- return int(qty) # Return integer for quantity. - except ValueError: - # No quantity found (not even 0) so this is probably a non-stocked part. - # Return None so the part won't show in the spreadsheet for this dist. - return None - -def get_part_html_tree(dist, pn, extra_search_terms='', url=None, descend=2, local_part_html=None, scrape_retries=2): - '''@brief Find the RS Components HTML page for a part number and return the URL and parse tree. - @param dist - @param pn Part number `str()`. - @param extra_search_terms - @param url - @param descend - @param local_part_html - @param scrape_retries `int` Quantity of retries in case of fail. - @return (html `str()` of the page, url) - ''' + try: + for row in html_tree.find_all('div', class_='table-row value-row'): + qty = row.find('div', + class_='breakRangeWithoutUnit col-xs-4').text + price = row.find('div', + class_='unitPrice col-xs-4').text + try: + qty = int( re.findall('\s*([0-9\,]+)', qty)[0] ) + price = re.sub('[^0-9\.]', '', price.replace(',','.') ) + price = currency.convert(float(price), 'EUR', 'USD') + price_tiers[qty] = price + except (TypeError, AttributeError, ValueError): + continue + except AttributeError: + # This happens when no pricing info is found in the tree. + return price_tiers # Return empty price tiers. + return price_tiers + + def dist_get_part_num(self, html_tree): + '''@brief Get the part number from the RS product page. + @param html_tree `str()` html of the distributor part page. + @return `dict()` price breaks, the keys are the quantities breaks. + ''' + try: + pn_str = html_tree.find('span', class_='keyValue').text + pn = re.sub('[^0-9\-]','', pn_str) + return pn + except KeyError: + return '' # No catalog number found in page. + except AttributeError: + return '' # No ProductDescription found in page. + + def dist_get_qty_avail(self, html_tree): + '''Get the available quantity of the part from the RS product page. + @param html_tree `str()` html of the distributor part page. + @return `int` avaliable quantity. + ''' - # Use the part number to lookup the part using the site search function, unless a starting url was given. - if url is None: - url = 'http://it.rs-online.com/web/c/?searchTerm=' + urlquote(pn + ' ' + extra_search_terms, safe='') - - elif url[0] == '/': - url = 'http://it.rs-online.com' + url - elif url.startswith('..'): - url = 'http://it.rs-online.com/Search/' + url - - # Open the URL, read the HTML from it, and parse it into a tree structure. - try: - html = fake_browser(url, scrape_retries) - except: - logger.log(DEBUG_OBSESSIVE,'No HTML page for {} from {}'.format(pn, dist)) - raise PartHtmlError + try: + # Note that 'availability' is misspelled in the container class name! + qty_str = html_tree.find('span', class_=('stock-msg-content', 'table-cell')).text + except (AttributeError, ValueError): + # No quantity found (not even 0) so this is probably a non-stocked part. + # Return None so the part won't show in the spreadsheet for this dist. + return None + try: + qty = re.sub('[^0-9]','',qty_str[0:10]) # Strip all non-number chars. + return int(qty) # Return integer for quantity. + except ValueError: + # No quantity found (not even 0) so this is probably a non-stocked part. + # Return None so the part won't show in the spreadsheet for this dist. + return None + + def dist_get_part_html_tree(self, pn, extra_search_terms='', url=None, descend=2, local_part_html=None): + '''@brief Find the RS Components HTML page for a part number and return the URL and parse tree. 
+ @param pn Part number `str()`. + @param extra_search_terms + @param url + @param descend + @param local_part_html + @return (html `str()` of the page, url) + ''' + + # Use the part number to lookup the part using the site search function, unless a starting url was given. + if url is None: + url = 'http://it.rs-online.com/web/c/?searchTerm=' + urlquote(pn + ' ' + extra_search_terms, safe='') + + elif url[0] == '/': + url = 'http://it.rs-online.com' + url + elif url.startswith('..'): + url = 'http://it.rs-online.com/Search/' + url + + # Open the URL, read the HTML from it, and parse it into a tree structure. + try: + html = self.browser.scrape_URL(url) + except: + self.logger.log(DEBUG_OBSESSIVE,'No HTML page for {} from {}'.format(pn, dist)) + raise PartHtmlError - try: - tree = BeautifulSoup(html, 'lxml') - except Exception: - logger.log(DEBUG_OBSESSIVE,'No HTML tree for {} from {}'.format(pn, dist)) - raise PartHtmlError + try: + tree = BeautifulSoup(html, 'lxml') + except Exception: + self.logger.log(DEBUG_OBSESSIVE,'No HTML tree for {} from {}'.format(pn, dist)) + raise PartHtmlError - # Abort if the part number isn't in the HTML somewhere. - # (Only use the numbers and letters to compare PN to HTML.) - if re.sub('[\W_]','',str.lower(pn)) not in re.sub('[\W_]','',str.lower(str(html))): - logger.log(DEBUG_OBSESSIVE,'No part number {} in HTML page from {}'.format(pn, dist)) - raise PartHtmlError - - # If the tree contains the tag for a product page, then just return it. - if tree.find('div', class_='advLineLevelContainer'): - return tree, url - - # If the tree is for a list of products, then examine the links to try to find the part number. - if tree.find('div', class_=('resultsTable','results-table-container')) is not None: - logger.log(DEBUG_OBSESSIVE,'Found product table for {} from {}'.format(pn, dist)) - if descend <= 0: - logger.log(DEBUG_OBSESSIVE,'Passed descent limit for {} from {}'.format(pn, dist)) + # Abort if the part number isn't in the HTML somewhere. + # (Only use the numbers and letters to compare PN to HTML.) + if re.sub('[\W_]','',str.lower(pn)) not in re.sub('[\W_]','',str.lower(str(html))): + self.logger.log(DEBUG_OBSESSIVE,'No part number {} in HTML page from {}'.format(pn, dist)) raise PartHtmlError - else: - # Look for the table of products. - products = tree.find('table', id='results-table').find_all( - 'tr', class_='resultRow') - - # Extract the product links for the part numbers from the table. - product_links = [p.find('a', class_='product-name').get('href') for p in products] - - # Extract all the part numbers from the text portion of the links. - part_numbers = [p.find('span', class_='text-contents').get_text() for p in products] - - # Look for the part number in the list that most closely matches the requested part number. - match = difflib.get_close_matches(pn, part_numbers, 1, 0.0)[0] - - # Now look for the link that goes with the closest matching part number. - for i in range(len(product_links)): - if part_numbers[i] == match: - # Get the tree for the linked-to page and return that. - logger.log(DEBUG_OBSESSIVE,'Selecting {} from product table for {} from {}'.format(part_numbers[i], pn, dist)) - return get_part_html_tree(dist, pn, extra_search_terms, - url=product_links[i], - descend=descend-1, - scrape_retries=scrape_retries) - - # I don't know what happened here, so give up. - logger.log(DEBUG_OBSESSIVE,'Unknown error for {} from {}'.format(pn, dist)) - raise PartHtmlError + + # If the tree contains the tag for a product page, then just return it. 
+ if tree.find('div', class_='advLineLevelContainer'): + return tree, url + + # If the tree is for a list of products, then examine the links to try to find the part number. + if tree.find('div', class_=('resultsTable','results-table-container')) is not None: + self.logger.log(DEBUG_OBSESSIVE,'Found product table for {} from {}'.format(pn, dist)) + if descend <= 0: + self.logger.log(DEBUG_OBSESSIVE,'Passed descent limit for {} from {}'.format(pn, dist)) + raise PartHtmlError + else: + # Look for the table of products. + products = tree.find('table', id='results-table').find_all( + 'tr', class_='resultRow') + + # Extract the product links for the part numbers from the table. + product_links = [p.find('a', class_='product-name').get('href') for p in products] + + # Extract all the part numbers from the text portion of the links. + part_numbers = [p.find('span', class_='text-contents').get_text() for p in products] + + # Look for the part number in the list that most closely matches the requested part number. + match = difflib.get_close_matches(pn, part_numbers, 1, 0.0)[0] + + # Now look for the link that goes with the closest matching part number. + for i in range(len(product_links)): + if part_numbers[i] == match: + # Get the tree for the linked-to page and return that. + self.logger.log(DEBUG_OBSESSIVE,'Selecting {} from product table for {} from {}'.format(part_numbers[i], pn, dist)) + return self.dist_get_part_html_tree(pn, extra_search_terms, + url=product_links[i], + descend=descend-1) + + # I don't know what happened here, so give up. + self.logger.log(DEBUG_OBSESSIVE,'Unknown error for {} from {}'.format(pn, dist)) + raise PartHtmlError From 5c7be6a22bcd5aba97fe839decaade166be9cdc6 Mon Sep 17 00:00:00 2001 From: Max Maisel Date: Fri, 1 Jun 2018 15:22:54 +0200 Subject: [PATCH 09/29] Implemented dist_tme class. --- kicost/distributors/tme/tme.py | 401 +++++++++++++++++---------------- 1 file changed, 203 insertions(+), 198 deletions(-) diff --git a/kicost/distributors/tme/tme.py b/kicost/distributors/tme/tme.py index a593b9e68..f2403e629 100644 --- a/kicost/distributors/tme/tme.py +++ b/kicost/distributors/tme/tme.py @@ -39,211 +39,216 @@ import json from bs4 import BeautifulSoup import http.client # For web scraping exceptions. -from .. import urlencode, urlquote, urlsplit, urlunsplit +#from .. import urlencode, urlquote, urlsplit, urlunsplit from .. import fake_browser from ...globals import PartHtmlError from ...globals import logger, DEBUG_OVERVIEW, DEBUG_DETAILED, DEBUG_OBSESSIVE -HTML_RESPONSE_RETRIES = 2 - -def __ajax_details(pn): - '''@brief Load part details from TME using XMLHttpRequest. - @param pn `str()` part number - @return (html, quantity avaliable) - ''' - data = urlencode({ - 'symbol': pn, - 'currency': 'USD' - }).encode("utf-8") - - try: - html = fake_browser('https://www.tme.eu/en/_ajax/ProductInformationPage/_getStocks.html', 4, ('X-Requested-With', 'XMLHttpRequest') ) - except: # Couldn't get a good read from the website. - logger.log(DEBUG_OBSESSIVE,'No AJAX data for {} from {}'.format(pn, 'TME')) - return None, None - - try: - r = r.decode('utf-8') # Convert bytes to string in Python 3. - p = json.loads(r).get('Products') - if p is not None and isinstance(p, list): - p = p[0] - html_tree = BeautifulSoup(p.get('PriceTpl', '').replace("\n", ""), "lxml") - quantity = p.get('InStock', '0') - return html_tree, quantity - else: +from .. 
import distributor, distributor_dict + +from urllib.parse import quote_plus as urlquote, urlencode + +class dist_tme(distributor.distributor): + def __init__(self, scrape_retries, log_level, throttle_delay): + super(dist_tme, self).__init__(scrape_retries, log_level, throttle_delay) + self.name = 'tme' + self.domain = distributor_dict[self.name]['site']['url'] + + self.browser.scrape_URL(self.domain) + self.browser.show_cookies(self.name) + + def __ajax_details(self, pn): + '''@brief Load part details from TME using XMLHttpRequest. + @param pn `str()` part number + @return (html, quantity avaliable) + ''' + data = urlencode({ + 'symbol': pn, + 'currency': 'USD' + }).encode("utf-8") + + try: + html = self.browser.scrape_URL('https://www.tme.eu/en/_ajax/ProductInformationPage/_getStocks.html', ('X-Requested-With', 'XMLHttpRequest')) + except: # Couldn't get a good read from the website. + self.logger.log(DEBUG_OBSESSIVE,'No AJAX data for {} from {}'.format(pn, 'TME')) return None, None - except (ValueError, KeyError, IndexError): - logger.log(DEBUG_OBSESSIVE, 'Could not obtain AJAX data from TME!') - return None, None - -def get_price_tiers(html_tree): - '''@brief Get the pricing tiers from the parsed tree of the TME product page. - @param html_tree `str()` html of the distributor part page. - @return `dict()` price breaks, the keys are the quantities breaks. - ''' - price_tiers = {} - try: - pn = get_part_num(html_tree) - if pn == '': - return price_tiers - - ajax_tree, quantity = __ajax_details(pn) - if ajax_tree is None: - return price_tiers - - qty_strs = [] - price_strs = [] - for tr in ajax_tree.find('tbody', id='prices_body').find_all('tr'): - td = tr.find_all('td') - if len(td) == 3: - qty_strs.append(td[0].text) - price_strs.append(td[2].text) - - qtys_prices = list(zip(qty_strs, price_strs)) - for qty_str, price_str in qtys_prices: - try: - qty = re.search('(\s*)([0-9,]+)', qty_str).group(2) - qty = int(re.sub('[^0-9]', '', qty)) - price_tiers[qty] = float(re.sub('[^0-9\.]', '', price_str)) - except (TypeError, AttributeError, ValueError, IndexError): - continue - except AttributeError: - # This happens when no pricing info is found in the tree. - logger.log(DEBUG_OBSESSIVE, 'No TME pricing information found!') - return price_tiers # Return empty price tiers. - return price_tiers - - -def get_part_num(html_tree): - '''@brief Get the part number from the TME product page. - @param html_tree `str()` html of the distributor part page. - @return `list()`of the parts that match. - ''' - try: - return html_tree.find('td', class_="pip-product-symbol").text - except AttributeError: - logger.log(DEBUG_OBSESSIVE, 'No TME part number found!') - return '' - - -def get_qty_avail(html_tree): - '''@brief Get the available quantity of the part from the TME product page. - @param html_tree `str()` html of the distributor part page. - @return `int` avaliable quantity. - ''' - pn = get_part_num(html_tree) - if pn == '': - logger.log(DEBUG_OBSESSIVE, 'No TME part quantity found!') - return None - - ajax_tree, qty_str = __ajax_details(pn) - if qty_str is None: - return None - - try: - return int(qty_str) - except ValueError: - # No quantity found (not even 0) so this is probably a non-stocked part. - # Return None so the part won't show in the spreadsheet for this dist. 
- logger.log(DEBUG_OBSESSIVE, 'No TME part quantity found!') - return None - - -def get_part_html_tree(dist, pn, extra_search_terms='', url=None, descend=2, local_part_html=None, scrape_retries=2): - '''@brief Find the TME HTML page for a part number and return the URL and parse tree. - @param dist - @param pn Part number `str()`. - @param extra_search_terms - @param url - @param descend - @param local_part_html - @param scrape_retries `int` Quantity of retries in case of fail. - @return (html `str()` of the page, url) - ''' - - global HTML_RESPONSE_RETRIES - HTML_RESPONSE_RETRIES = scrape_retries - - # Use the part number to lookup the part using the site search function, unless a starting url was given. - if url is None: - url = 'https://www.tme.eu/en/katalog/?search=' + urlquote( - pn + ' ' + extra_search_terms, - safe='') - elif url[0] == '/': - url = 'https://www.tme.eu' + url - - # Open the URL, read the HTML from it, and parse it into a tree structure. - try: - html = fake_browser(url, scrape_retries) - except: - logger.log(DEBUG_OBSESSIVE,'No HTML page for {} from {}'.format(pn, dist)) - raise PartHtmlError - # Abort if the part number isn't in the HTML somewhere. - # (Only use the numbers and letters to compare PN to HTML.) - if re.sub('[\W_]','',str.lower(pn)) not in re.sub('[\W_]','',str.lower(str(html))): - logger.log(DEBUG_OBSESSIVE,'No part number {} in HTML page from {} ({})'.format(pn, dist, url)) - raise PartHtmlError + try: + r = r.decode('utf-8') # Convert bytes to string in Python 3. + p = json.loads(r).get('Products') + if p is not None and isinstance(p, list): + p = p[0] + html_tree = BeautifulSoup(p.get('PriceTpl', '').replace("\n", ""), "lxml") + quantity = p.get('InStock', '0') + return html_tree, quantity + else: + return None, None + except (ValueError, KeyError, IndexError): + self.logger.log(DEBUG_OBSESSIVE, 'Could not obtain AJAX data from TME!') + return None, None - try: - tree = BeautifulSoup(html, 'lxml') - except Exception: - logger.log(DEBUG_OBSESSIVE,'No HTML tree for {} from {}'.format(pn, dist)) - raise PartHtmlError + def dist_get_price_tiers(self, html_tree): + '''@brief Get the pricing tiers from the parsed tree of the TME product page. + @param html_tree `str()` html of the distributor part page. + @return `dict()` price breaks, the keys are the quantities breaks. + ''' + price_tiers = {} + try: + pn = self.dist_get_part_num(html_tree) + if pn == '': + return price_tiers + + ajax_tree, quantity = self.__ajax_details(pn) + if ajax_tree is None: + return price_tiers + + qty_strs = [] + price_strs = [] + for tr in ajax_tree.find('tbody', id='prices_body').find_all('tr'): + td = tr.find_all('td') + if len(td) == 3: + qty_strs.append(td[0].text) + price_strs.append(td[2].text) + + qtys_prices = list(zip(qty_strs, price_strs)) + for qty_str, price_str in qtys_prices: + try: + qty = re.search('(\s*)([0-9,]+)', qty_str).group(2) + qty = int(re.sub('[^0-9]', '', qty)) + price_tiers[qty] = float(re.sub('[^0-9\.]', '', price_str)) + except (TypeError, AttributeError, ValueError, IndexError): + continue + except AttributeError: + # This happens when no pricing info is found in the tree. + self.logger.log(DEBUG_OBSESSIVE, 'No TME pricing information found!') + return price_tiers # Return empty price tiers. + return price_tiers + + + def dist_get_part_num(self, html_tree): + '''@brief Get the part number from the TME product page. + @param html_tree `str()` html of the distributor part page. + @return `list()`of the parts that match. 
+ ''' + try: + return html_tree.find('td', class_="pip-product-symbol").text + except AttributeError: + self.logger.log(DEBUG_OBSESSIVE, 'No TME part number found!') + return '' + + + def dist_get_qty_avail(self, html_tree): + '''@brief Get the available quantity of the part from the TME product page. + @param html_tree `str()` html of the distributor part page. + @return `int` avaliable quantity. + ''' + pn = self.dist_get_part_num(html_tree) + if pn == '': + self.logger.log(DEBUG_OBSESSIVE, 'No TME part quantity found!') + return None + + ajax_tree, qty_str = self.__ajax_details(pn) + if qty_str is None: + return None + + try: + return int(qty_str) + except ValueError: + # No quantity found (not even 0) so this is probably a non-stocked part. + # Return None so the part won't show in the spreadsheet for this dist. + self.logger.log(DEBUG_OBSESSIVE, 'No TME part quantity found!') + return None + + + def dist_get_part_html_tree(self, pn, extra_search_terms='', url=None, descend=2, local_part_html=None): + '''@brief Find the TME HTML page for a part number and return the URL and parse tree. + @param pn Part number `str()`. + @param extra_search_terms + @param url + @param descend + @param local_part_html + @return (html `str()` of the page, url) + ''' + + # Use the part number to lookup the part using the site search function, unless a starting url was given. + if url is None: + url = 'https://www.tme.eu/en/katalog/?search=' + urlquote( + pn + ' ' + extra_search_terms, + safe='') + elif url[0] == '/': + url = 'https://www.tme.eu' + url + + # Open the URL, read the HTML from it, and parse it into a tree structure. + try: + html = self.browser.scrape_URL(url) + except: + self.logger.log(DEBUG_OBSESSIVE,'No HTML page for {} from {}'.format(pn, self.name)) + raise PartHtmlError - # If the tree contains the tag for a product page, then just return it. - if tree.find('div', id='ph') is not None: - return tree, url + # Abort if the part number isn't in the HTML somewhere. + # (Only use the numbers and letters to compare PN to HTML.) + if re.sub('[\W_]','',str.lower(pn)) not in re.sub('[\W_]','',str.lower(str(html))): + self.logger.log(DEBUG_OBSESSIVE,'No part number {} in HTML page from {} ({})'.format(pn, self.name, url)) + raise PartHtmlError - # If the tree is for a list of products, then examine the links to try to find the part number. - if tree.find('table', id="products") is not None: - logger.log(DEBUG_OBSESSIVE,'Found product table for {} from {}'.format(pn, dist)) - if descend <= 0: - logger.log(DEBUG_OBSESSIVE,'Passed descent limit for {} from {}'.format(pn, dist)) + try: + tree = BeautifulSoup(html, 'lxml') + except Exception: + self.logger.log(DEBUG_OBSESSIVE,'No HTML tree for {} from {}'.format(pn, self.name)) raise PartHtmlError - else: - # Look for the table of products. - products = tree.find( - 'table', - id="products").find_all( - 'tr', - class_=('product-row')) - - # Extract the product links for the part numbers from the table. - product_links = [] - for p in products: - for a in p.find('td', class_='product').find_all('a'): - product_links.append(a) - - # Extract all the part numbers from the text portion of the links. - part_numbers = [l.text for l in product_links] - - # Look for the part number in the list that most closely matches the requested part number. - match = difflib.get_close_matches(pn, part_numbers, 1, 0.0)[0] - - # Now look for the link that goes with the closest matching part number. 
- for l in product_links: - try: - if (not l.get('href', '').startswith('./katalog')) and l.text == match: - # Get the tree for the linked-to page and return that. - logger.log(DEBUG_OBSESSIVE,'Selecting {} from product table for {} from {}'.format(l.text, pn, dist)) - # TODO: The current implementation does up to four HTTP - # requests per part (search, part details page for TME P/N, - # XHR for pricing information, and XHR for stock - # availability). This is mainly for the compatibility with - # other distributor implementations (html_tree gets passed - # to all functions). - # A modified implementation (which would pass JSON data - # obtained by the XHR instead of the HTML DOM tree) might be - # able to do the same with just two requests (search for TME - # P/N, XHR for pricing and stock availability). - return get_part_html_tree(dist, pn, extra_search_terms, - url=l.get('href', ''), - descend=descend-1, - scrape_retries=scrape_retries) - except KeyError: - pass # This happens if there is no 'href' in the link, so just skip it. - - # I don't know what happened here, so give up. - logger.log(DEBUG_OBSESSIVE,'Unknown error for {} from {}'.format(pn, dist)) - raise PartHtmlError + + # If the tree contains the tag for a product page, then just return it. + if tree.find('div', id='ph') is not None: + return tree, url + + # If the tree is for a list of products, then examine the links to try to find the part number. + if tree.find('table', id="products") is not None: + self.logger.log(DEBUG_OBSESSIVE,'Found product table for {} from {}'.format(pn, self.name)) + if descend <= 0: + self.logger.log(DEBUG_OBSESSIVE,'Passed descent limit for {} from {}'.format(pn, self.name)) + raise PartHtmlError + else: + # Look for the table of products. + products = tree.find( + 'table', + id="products").find_all( + 'tr', + class_=('product-row')) + + # Extract the product links for the part numbers from the table. + product_links = [] + for p in products: + for a in p.find('td', class_='product').find_all('a'): + product_links.append(a) + + # Extract all the part numbers from the text portion of the links. + part_numbers = [l.text for l in product_links] + + # Look for the part number in the list that most closely matches the requested part number. + match = difflib.get_close_matches(pn, part_numbers, 1, 0.0)[0] + + # Now look for the link that goes with the closest matching part number. + for l in product_links: + try: + if (not l.get('href', '').startswith('./katalog')) and l.text == match: + # Get the tree for the linked-to page and return that. + self.logger.log(DEBUG_OBSESSIVE,'Selecting {} from product table for {} from {}'.format(l.text, pn, self.name)) + # TODO: The current implementation does up to four HTTP + # requests per part (search, part details page for TME P/N, + # XHR for pricing information, and XHR for stock + # availability). This is mainly for the compatibility with + # other distributor implementations (html_tree gets passed + # to all functions). + # A modified implementation (which would pass JSON data + # obtained by the XHR instead of the HTML DOM tree) might be + # able to do the same with just two requests (search for TME + # P/N, XHR for pricing and stock availability). + return self.dist_get_part_html_tree(pn, extra_search_terms, + url=l.get('href', ''), + descend=descend-1) + except KeyError: + pass # This happens if there is no 'href' in the link, so just skip it. + + # I don't know what happened here, so give up. 
+ self.logger.log(DEBUG_OBSESSIVE,'Unknown error for {} from {}'.format(pn, self.name)) + raise PartHtmlError From 0b688232aeada2142b76c27c89b8d998e0f996fa Mon Sep 17 00:00:00 2001 From: Max Maisel Date: Fri, 1 Jun 2018 19:47:14 +0200 Subject: [PATCH 10/29] fake_browser: Replace "urllib" with "requests" to remove problematic "Connection: close" http header. --- kicost/distributors/fake_browser.py | 59 ++++++++++------------------- 1 file changed, 19 insertions(+), 40 deletions(-) diff --git a/kicost/distributors/fake_browser.py b/kicost/distributors/fake_browser.py index 7477cdec8..f9612251a 100644 --- a/kicost/distributors/fake_browser.py +++ b/kicost/distributors/fake_browser.py @@ -27,7 +27,7 @@ from random import choice import http.client # For web scraping exceptions. -import http.cookiejar +import requests from ..globals import DEBUG_OVERVIEW, DEBUG_DETAILED, DEBUG_OBSESSIVE @@ -155,61 +155,40 @@ def __init__(self, logger, scrape_retries): @param logger @param scrape_retries `int` Quantity of retries in case of fail. ''' - self.cookiejar = http.cookiejar.CookieJar() + self.userAgent = get_user_agent() - self.opener = urllib.request.build_opener(urllib.request.HTTPCookieProcessor(self.cookiejar)) + + # Use "requests" instead of "urllib" because "urllib" does not allow + # to remove "Connection: close" header which causes problems with some servers. + self.session = requests.session() + self.session.headers["User-Agent"] = self.userAgent + self.scrape_retries = scrape_retries self.logger = logger def show_cookies(self, name): - for x in self.cookiejar: - # TODO: use logger - self.logger.log(DEBUG_OBSESSIVE,"%s Cookie %s" % (name, x.name)) - print("%s Cookie %s" % (name, x.name)) + for x in self.session.cookies: + self.logger.log(DEBUG_OBSESSIVE,"%s Cookie %s" % (x.domain, x.name)) def add_cookie(self, domain, name, value): - self.cookiejar.set_cookie(http.cookiejar.Cookie( - version=0, - name=name, - value=value, - port=None, - port_specified=False, - domain=domain, - domain_specified=True, - domain_initial_dot=False, - path="/", - path_specified=False, - secure=False, - expires=None, - discard=False, - comment=None, - comment_url=None, - rest=None)) + self.session.cookies.set(name, value, domain=domain) + + def scrape_URL(self, url, add_header=[]): + headers = self.session.headers + for header in add_header: + self.session.headers[header[1]] = header[2] - def scrape_URL(self, url, add_header=None): for _ in range(self.scrape_retries): try: - req = Request(url) - if add_header: - req.add_header(add_header) - req.add_header('User-agent', self.userAgent) - req.add_header('Accept', 'text/html') - req.add_header('Accept-Language', 'en-US') - req.add_header('Accept-Encoding', 'identity') - response = self.opener.open(req, timeout=10) - html = response.read() + html = self.session.get(url, timeout=5).text break - #except WEB_SCRAPE_EXCEPTIONS: except Exception as ex: - # TODO: remove print - print('Exception of type "%s" while web-scraping %s' \ - % (type(ex).__name__, format(url))) self.logger.log(DEBUG_DETAILED,'Exception of type "%s" while web-scraping %s' \ % (type(ex).__name__, format(url))) pass else: - # TODO: remove print - print('No page') + self.session.headers = headers raise ValueError('No page') + self.session.headers = headers return html From 0bbfe98677dccaac9dcb90905dc7b5a64ad1a191 Mon Sep 17 00:00:00 2001 From: Max Maisel Date: Sat, 2 Jun 2018 11:15:55 +0200 Subject: [PATCH 11/29] Removed now unused file web_routines.py. 
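The fake_browser change just above replaces urllib with a persistent requests session so that cookies survive between requests and the problematic forced "Connection: close" header goes away; it retries a configurable number of times and adds extra headers only for the duration of a single request. A self-contained sketch of that pattern, written as a free function (the name, defaults, and User-Agent string here are illustrative, not part of the module):

    import requests

    def fetch_page(url, retries=3, extra_headers=None, timeout=5):
        # One persistent session keeps cookies and reuses connections.
        session = requests.Session()
        session.headers['User-Agent'] = 'Mozilla/5.0 (X11; Linux x86_64)'
        saved = dict(session.headers)       # Copy, so the extras can be undone later.
        session.headers.update(extra_headers or {})
        try:
            last_exc = None
            for _ in range(retries):
                try:
                    return session.get(url, timeout=timeout).text
                except requests.RequestException as exc:
                    last_exc = exc          # Transport-level error: try again.
            raise ValueError('No page') from last_exc
        finally:
            session.headers.clear()
            session.headers.update(saved)   # Restore the original header set.

Note the explicit dict() copy of the headers: restoring from an alias of the same header object would be a no-op, so temporary headers such as "X-Requested-With" need to be snapshotted before they are added.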
--- kicost/distributors/web_routines.py | 225 ---------------------------- 1 file changed, 225 deletions(-) delete mode 100644 kicost/distributors/web_routines.py diff --git a/kicost/distributors/web_routines.py b/kicost/distributors/web_routines.py deleted file mode 100644 index 450bc642c..000000000 --- a/kicost/distributors/web_routines.py +++ /dev/null @@ -1,225 +0,0 @@ -# MIT license -# -# Copyright (C) 2018 by XESS Corporation / Hildo G Jr -# -# Permission is hereby granted, free of charge, to any person obtaining a copy -# of this software and associated documentation files (the "Software"), to deal -# in the Software without restriction, including without limitation the rights -# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -# copies of the Software, and to permit persons to whom the Software is -# furnished to do so, subject to the following conditions: -# -# The above copyright notice and this permission notice shall be included in -# all copies or substantial portions of the Software. -# -# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN -# THE SOFTWARE. - -# Author information. -__author__ = 'Hildo Guillardi Junior' -__webpage__ = 'https://github.com/hildogjr/' -__company__ = 'University of Campinas - Brazil' - -# Libraries. -import sys -from bs4 import BeautifulSoup # XML file interpreter. -import multiprocessing # To deal with the parallel scrape. -import logging -from time import time -from random import choice -from ..eda_tools.eda_tools import order_refs # To better print the warnings about the parts. - -try: - # This is for Python 3. - from urllib.parse import urlsplit, urlunsplit -except ImportError: - # This is for Python 2. - from urlparse import urlsplit, urlunsplit - -from ..globals import logger, DEBUG_OVERVIEW, DEBUG_DETAILED, DEBUG_OBSESSIVE # Debug configurations. -from ..globals import SEPRTR -from ..globals import PartHtmlError -from . import distributor_dict - -import os, re - -# The distributor module directories will be found in this directory. -directory = os.path.dirname(__file__) - -# Search for the distributor modules and import them. -dist_modules = {} -for module in os.listdir(directory): - - # Avoid importing non-directories. - abs_module = os.path.join(directory, module) - if not os.path.isdir(abs_module): - continue - - # Avoid directories like __pycache__. - if module.startswith('__'): - continue - - # Import the module. - dist_modules[module] = __import__(module, globals(), locals(), [], level=1) - -__all__ = ['scrape_part', 'config_distributor'] - -def config_distributor(dist_name, locale_currency='USD'): - '''@brief Configure the distributor for some locale/country and - currency second ISO3166 and ISO4217 - - @param `str` dist Distributor to configure. - @param `str` Alpha 2 country or alpha 3 currency or even one slash other.''' - try: - dist_module = dist_modules[dist_name] - except KeyError: # When use local distributor with personalized name. 
- dist_module = dist_modules[distributor_dict[dist_name]['module']] - try: - if distributor_dict[dist_name]['scrape']=='web': - # Not make sense to configurate a local distributor (yet). - locale_currency = re.findall('\w{2,}', locale_currency) - locale = None - currency = None - for alpha in locale_currency: - if len(alpha)==2: - locale = alpha - elif len(alpha)==3: - currency = alpha - dist_module.define_locale_currency(locale_iso=locale, currency_iso=currency) - except AttributeError: - logger.warning('No currency/country configuration for {}.'.format(distributor_dict[dist_name]['label'])) - pass - - -def get_part_html_tree(part, dist, get_html_tree_func, local_part_html, scrape_retries, logger): - '''@brief Get the HTML tree for a part. - - Get the HTML tree for a part from the given distributor website or local HTML. - @param `str` part Part manufactor code or distributor stock code. - @param `str` dist Distributor do scrape. - @param `str` get_html_tree_func - @param `str` local_part_html - @param `int` scrape_retries Maximum times of web ritries. - @param logger Logger handle. - @return `str` with the HTML webpage.''' - - logger.log(DEBUG_OBSESSIVE, 'Looking in %s by %s:', distributor_dict[dist]['label'], order_refs(part.refs, True)) - - for extra_search_terms in set([part.fields.get('manf', ''), '']): - try: - # Search for part information using one of the following: - # 1) the distributor's catalog number. - # 2) the manufacturer's part number. - for key in (dist+'#', dist+SEPRTR+'cat#', 'manf#'): - if key in part.fields: - if part.fields[key]: - # Founded manufacturer / distributor code valid (not empty). - return get_html_tree_func(dist, part.fields[key], extra_search_terms, local_part_html=local_part_html, scrape_retries=scrape_retries) - # No distributor or manufacturer number, so give up. - else: - logger.warning("No '%s#' or 'manf#' field: cannot lookup part %s at %s.", dist, part.refs, dist) - return BeautifulSoup('', 'lxml'), '' - #raise PartHtmlError - except PartHtmlError: - pass - except AttributeError: - break - logger.warning("Part %s not found at %s.", order_refs(part.refs, False), distributor_dict[dist]['label']) - # If no HTML page was found, then return a tree for an empty page. - return BeautifulSoup('', 'lxml'), '' - - -def scrape_part(args): - '''@brief Scrape the data for a part from each distributor website or local HTML. - - Use distributors submodules to scrape each distributor part page and get - informations such as price, quantity avaliable and others; - - @param `int` Count of the main loop. - @param `str`String with the part number / distributor stock. - @param `dict` - @param `str` - @param `int`Number of scrape retries. - @param logger.getEffectiveLevel() - @param throttle_lock - @param throttle_tim - @return id, url, `str` distributor stock part number, `dict` price tiers, `int` qty avail, `dict` extrainfo dist - ''' - - id, part, distributor_dict, local_part_html, scrape_retries, log_level, throttle_lock, throttle_timeouts = args # Unpack the arguments. - - if multiprocessing.current_process().name == "MainProcess": - scrape_logger = logging.getLogger('kicost') - else: - scrape_logger = multiprocessing.get_logger() - handler = logging.StreamHandler(sys.stdout) - handler.setLevel(log_level) - scrape_logger.addHandler(handler) - scrape_logger.setLevel(log_level) - - # Create dictionaries for the various items of part data from each distributor. 
- url = {} - part_num = {} - price_tiers = {} - qty_avail = {} - info_dist = {} - - # Scrape the part data from each distributor website or the local HTML. - # Create a list of the distributor keys and randomly choose one of the - # keys to scrape. After scraping, remove the distributor key. - # Do this until all the distributors have been scraped. - distributors = list(distributor_dict.keys()) - while distributors: - - d = choice(distributors) # Randomly choose one of the available distributors. - - try: - #dist_module = getattr(THIS_MODULE, d) - dist_module = dist_modules[d] - except KeyError: # When use local distributor with personalized name. - dist_module = dist_modules[distributor_dict[d]['module']] - - # Try to access the list of distributor throttling timeouts. - # Abort if some other process is already using the timeouts. - if throttle_lock.acquire(blocking=False): - - # Check the throttling timeout for the chosen distributor to see if - # another access to its website is allowed. - if throttle_timeouts[d] <= time(): - - # Update the timeout for this distributor website and release the sync. lock. - throttle_timeouts[d] = time() + distributor_dict[d]['throttling_delay'] - throttle_lock.release() - - # Get the HTML tree for the part. - html_tree, url[d] = get_part_html_tree(part, d, dist_module.get_part_html_tree, local_part_html, scrape_retries, scrape_logger) - - # Call the functions that extract the data from the HTML tree. - part_num[d] = dist_module.get_part_num(html_tree) - qty_avail[d] = dist_module.get_qty_avail(html_tree) - price_tiers[d] = dist_module.get_price_tiers(html_tree) - - try: - # Get extra characeristics of the part in the web page. - # This will be use to comment in the 'cat#' column of the - # spreadsheet and some validations (in the future implementaions) - info_dist[d] = dist_module.get_extra_info(html_tree) - except: - info_dist[d] = {} - pass - - # The part data has been scraped from this distributor, so remove it from the list. - distributors.remove(d) - - # If the timeout for this distributor has not expired, then release - # the sync. lock and try another distributor. - else: - throttle_lock.release() - - # Return the part data. - return id, url, part_num, price_tiers, qty_avail, info_dist From 8cccfc771962b37b381cb37d008a1e00dc8c3a63 Mon Sep 17 00:00:00 2001 From: Max Maisel Date: Sat, 2 Jun 2018 11:22:01 +0200 Subject: [PATCH 12/29] Local distributor improvements and fixed custom local distributor names regression. 
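The web_routines.py module deleted above coordinated the parallel scrape: each worker grabbed a shared lock, randomly picked a distributor whose throttle timeout had expired, scraped it, and advanced that distributor's next allowed access time by its configured throttling delay. A single-process sketch of that bookkeeping (no multiprocessing lock; function and parameter names are illustrative):

    import time
    from random import choice

    def throttled_order(distributors, delays):
        # Visit every distributor once, honouring a per-distributor minimum
        # delay between accesses; 'delays' maps name -> seconds between hits.
        next_allowed = {d: 0.0 for d in distributors}
        remaining = list(distributors)
        while remaining:
            d = choice(remaining)              # Randomize order, like the removed code.
            wait = next_allowed[d] - time.time()
            if wait > 0:
                time.sleep(wait)               # Single process: just sleep it off.
            next_allowed[d] = time.time() + delays.get(d, 0.0)
            yield d                            # Caller scrapes distributor 'd' here.
            remaining.remove(d)

A caller would iterate it, e.g. for d in throttled_order(['digikey', 'mouser'], {'digikey': 5.0}): scrape(d), keeping the randomized order while never hitting a site faster than its configured delay.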
--- kicost/distributors/digikey/digikey.py | 8 ++--- kicost/distributors/distributor.py | 15 ++++------ kicost/distributors/farnell/farnell.py | 8 ++--- kicost/distributors/local/local.py | 29 +++++++++--------- kicost/distributors/mouser/mouser.py | 8 ++--- kicost/distributors/newark/newark.py | 8 ++--- kicost/distributors/rs/rs.py | 8 ++--- kicost/distributors/tme/tme.py | 8 ++--- kicost/kicost.py | 41 +++++++++++--------------- 9 files changed, 56 insertions(+), 77 deletions(-) diff --git a/kicost/distributors/digikey/digikey.py b/kicost/distributors/digikey/digikey.py index bd33295bd..b8f7561cb 100644 --- a/kicost/distributors/digikey/digikey.py +++ b/kicost/distributors/digikey/digikey.py @@ -52,9 +52,8 @@ import pycountry class dist_digikey(distributor.distributor): - def __init__(self, scrape_retries, log_level, throttle_delay): - super(dist_digikey, self).__init__(scrape_retries, log_level, throttle_delay) - self.name = 'digikey' + def __init__(self, name, scrape_retries, log_level, throttle_delay): + super(dist_digikey, self).__init__(name, scrape_retries, log_level, throttle_delay) self.domain = distributor_dict[self.name]['site']['url'] self.browser.scrape_URL(self.domain) @@ -199,13 +198,12 @@ def dist_get_qty_avail(self, html_tree): # it doesn't contain anything decipherable. Let's just assume it's 0. return 0 - def dist_get_part_html_tree(self, pn, extra_search_terms='', url=None, descend=2, local_part_html=None): + def dist_get_part_html_tree(self, pn, extra_search_terms='', url=None, descend=2): '''@brief Find the Digikey HTML page for a part number and return the URL and parse tree. @param pn Part number `str()`. @param extra_search_terms @param url @param descend - @param local_part_html @return (html `str()` of the page, url) ''' diff --git a/kicost/distributors/distributor.py b/kicost/distributors/distributor.py index 5510ac4b3..e14278176 100644 --- a/kicost/distributors/distributor.py +++ b/kicost/distributors/distributor.py @@ -58,8 +58,8 @@ import os, re class distributor: - def __init__(self, scrape_retries, log_level, throttle_delay): - self.name = None + def __init__(self, name, scrape_retries, log_level, throttle_delay): + self.name = name self.page_accessed = False self.scrape_retries = scrape_retries self.logger = logger @@ -109,7 +109,7 @@ def define_locale_currency(self, locale_currency='USD'): logger.warning('No currency/country configuration for {}.'.format(self.name)) pass - def scrape_part(self, id, part, local_part_html): + def scrape_part(self, id, part): '''@brief Scrape the data for a part from each distributor website or local HTML. Use distributors submodules to scrape each distributor part page and get @@ -117,7 +117,6 @@ def scrape_part(self, id, part, local_part_html): @param `int` Count of the main loop. @param `str` String with the part number / distributor stock. - @param `str` Local part HTML @return id, distributor_name, url, `str` distributor stock part number, `dict` price tiers, `int` qty avail, `dict` extrainfo dist ''' @@ -153,7 +152,7 @@ def scrape_part(self, id, part, local_part_html): % (self.name, distributor_dict[self.name]['scrape'])) # Get the HTML tree for the part. - html_tree, url = self.get_part_html_tree(part, local_part_html=local_part_html) + html_tree, url = self.get_part_html_tree(part) # Call the functions that extract the data from the HTML tree. part_num = self.dist_get_part_num(html_tree) @@ -172,12 +171,11 @@ def scrape_part(self, id, part, local_part_html): # Return the part data. 
return id, self.name, url, part_num, price_tiers, qty_avail, info_dist - def get_part_html_tree(self, part, local_part_html): + def get_part_html_tree(self, part): '''@brief Get the HTML tree for a part. Get the HTML tree for a part from the given distributor website or local HTML. @param `str` part Part manufactor code or distributor stock code. - @param `str` local_part_html @return `str` with the HTML webpage.''' self.logger.log(DEBUG_OBSESSIVE, 'Looking in %s by %s:', self.name, order_refs(part.refs, True)) @@ -191,8 +189,7 @@ def get_part_html_tree(self, part, local_part_html): if key in part.fields: if part.fields[key]: self.page_accessed = True - return self.dist_get_part_html_tree \ - (part.fields[key], extra_search_terms, local_part_html=local_part_html) + return self.dist_get_part_html_tree(part.fields[key], extra_search_terms) # No distributor or manufacturer number, so give up. else: self.page_accessed = False diff --git a/kicost/distributors/farnell/farnell.py b/kicost/distributors/farnell/farnell.py index fb645c25f..287cb368f 100644 --- a/kicost/distributors/farnell/farnell.py +++ b/kicost/distributors/farnell/farnell.py @@ -52,9 +52,8 @@ __author__='Giacinto Luigi Cerone' class dist_farnell(distributor.distributor): - def __init__(self, scrape_retries, log_level, throttle_delay): - super(dist_farnell, self).__init__(scrape_retries, log_level, throttle_delay) - self.name = 'farnell' + def __init__(self, name, scrape_retries, log_level, throttle_delay): + super(dist_farnell, self).__init__(name, scrape_retries, log_level, throttle_delay) self.domain = distributor_dict[self.name]['site']['url'] self.browser.scrape_URL(self.domain) @@ -135,13 +134,12 @@ def dist_get_qty_avail(self, html_tree): # Return None so the part won't show in the spreadsheet for this dist. return None - def dist_get_part_html_tree(self, pn, extra_search_terms='', url=None, descend=2, local_part_html=None): + def dist_get_part_html_tree(self, pn, extra_search_terms='', url=None, descend=2): '''@brief Find the farnell HTML page for a part number and return the URL and parse tree. @param pn Part number `str()`. @param extra_search_terms @param url @param descend - @param local_part_html @return (html `str()` of the page, url) ''' diff --git a/kicost/distributors/local/local.py b/kicost/distributors/local/local.py index a3900b6cd..673fda658 100644 --- a/kicost/distributors/local/local.py +++ b/kicost/distributors/local/local.py @@ -44,18 +44,20 @@ from urllib.parse import urlsplit, urlunsplit class dist_local(distributor.distributor): - def __init__(self, scrape_retries, log_level, throttle_delay): - super(dist_local, self).__init__(scrape_retries, log_level, throttle_delay) - self.name = 'local' + # Static variable which contains local part html. + html = None - def create_part_html(self, parts, distributors): + def __init__(self, name, scrape_retries, log_level, throttle_delay): + super(dist_local, self).__init__(name, scrape_retries, log_level, throttle_delay) + + def create_part_html(parts, distributors, logger): '''@brief Create HTML page containing info for local (non-webscraped) parts. @param parts `list()` of parts. @parm `list()`of the distributors to check each one is local. 
- @return `str()` of the HTML page to be read by `get_part_html_tree()` + @param logger ''' - self.logger.log(DEBUG_OVERVIEW, 'Create HTML page for parts with custom pricing...') + logger.log(DEBUG_OVERVIEW, 'Create HTML page for parts with custom pricing...') doc, tag, text = Doc().tagtext() with tag('html'): @@ -115,10 +117,9 @@ def make_random_catalog_number(p): except: pass - html = doc.getvalue() - if self.logger.isEnabledFor(DEBUG_OBSESSIVE): - print(indent(html)) - return html + dist_local.html = doc.getvalue() + if logger.isEnabledFor(DEBUG_OBSESSIVE): + print(indent(dist_local.html)) def dist_get_price_tiers(self, html_tree): @@ -171,20 +172,20 @@ def dist_get_qty_avail(self, html_tree): self.logger.log(DEBUG_OBSESSIVE, 'No local part quantity found!') return 0 - # TODO: dist param - def dist_get_part_html_tree(self, pn, extra_search_terms='', url=None, descend=None, local_part_html=None): + def dist_get_part_html_tree(self, pn, extra_search_terms='', url=None, descend=None): '''Extract the HTML tree from the HTML page for local parts. @param pn Part number `str()`. @param extra_search_terms @param url @param descend - @param local_part_html @return (html `str()` of the page, `None`) The second argument is always `None` bacause there is not url to return. ''' # Extract the HTML tree from the local part HTML page. try: - tree = BeautifulSoup(local_part_html, 'lxml') + print("dist_local.html") + print(dist_local.html) + tree = BeautifulSoup(dist_local.html, 'lxml') except Exception: raise PartHtmlError diff --git a/kicost/distributors/mouser/mouser.py b/kicost/distributors/mouser/mouser.py index ecd454c97..23fd54b4e 100644 --- a/kicost/distributors/mouser/mouser.py +++ b/kicost/distributors/mouser/mouser.py @@ -48,9 +48,8 @@ from urllib.parse import quote_plus as urlquote class dist_mouser(distributor.distributor): - def __init__(self, scrape_retries, log_level, throttle_delay): - super(dist_mouser, self).__init__(scrape_retries, log_level, throttle_delay) - self.name = 'mouser' + def __init__(self, name, scrape_retries, log_level, throttle_delay): + super(dist_mouser, self).__init__(name, scrape_retries, log_level, throttle_delay) self.domain = distributor_dict[self.name]['site']['url'] self.browser.add_cookie('.mouser.com', 'preferences', 'ps=www2&pl=en-US&pc_www2=USDe') @@ -140,13 +139,12 @@ def dist_get_qty_avail(self, html_tree): self.logger.log(DEBUG_OBSESSIVE, 'No Mouser part quantity found!') return None - def dist_get_part_html_tree(self, pn, extra_search_terms='', url=None, descend=2, local_part_html=None): + def dist_get_part_html_tree(self, pn, extra_search_terms='', url=None, descend=2): '''@brief Find the Mouser HTML page for a part number and return the URL and parse tree. @param pn Part number `str()`. 
@param extra_search_terms @param url @param descend - @param local_part_html @return (html `str()` of the page, url) ''' diff --git a/kicost/distributors/newark/newark.py b/kicost/distributors/newark/newark.py index bbd03fa9e..44532dc95 100644 --- a/kicost/distributors/newark/newark.py +++ b/kicost/distributors/newark/newark.py @@ -48,9 +48,8 @@ from urllib.parse import quote_plus as urlquote class dist_newark(distributor.distributor): - def __init__(self, scrape_retries, log_level, throttle_delay): - super(dist_newark, self).__init__(scrape_retries, log_level, throttle_delay) - self.name = 'newark' + def __init__(self, name, scrape_retries, log_level, throttle_delay): + super(dist_newark, self).__init__(name, scrape_retries, log_level, throttle_delay) self.domain = distributor_dict[self.name]['site']['url'] self.browser.scrape_URL(self.domain) @@ -135,13 +134,12 @@ def dist_get_qty_avail(self, html_tree): return None - def dist_get_part_html_tree(self, pn, extra_search_terms='', url=None, descend=2, local_part_html=None): + def dist_get_part_html_tree(self, pn, extra_search_terms='', url=None, descend=2): '''@brief Find the Newark HTML page for a part number and return the URL and parse tree. @param pn Part number `str()`. @param extra_search_terms @param url @param descend - @param local_part_html @return (html `str()` of the page, url) ''' diff --git a/kicost/distributors/rs/rs.py b/kicost/distributors/rs/rs.py index 34b58fa8d..bf07c42ec 100644 --- a/kicost/distributors/rs/rs.py +++ b/kicost/distributors/rs/rs.py @@ -46,9 +46,8 @@ from urllib.parse import quote_plus as urlquote class dist_rs(distributor.distributor): - def __init__(self, scrape_retries, log_level, throttle_delay): - super(dist_rs, self).__init__(scrape_retries, log_level, throttle_delay) - self.name = 'rs' + def __init__(self, name, scrape_retries, log_level, throttle_delay): + super(dist_rs, self).__init__(name, scrape_retries, log_level, throttle_delay) self.domain = distributor_dict[self.name]['site']['url'] self.browser.scrape_URL(self.domain) @@ -114,13 +113,12 @@ def dist_get_qty_avail(self, html_tree): # Return None so the part won't show in the spreadsheet for this dist. return None - def dist_get_part_html_tree(self, pn, extra_search_terms='', url=None, descend=2, local_part_html=None): + def dist_get_part_html_tree(self, pn, extra_search_terms='', url=None, descend=2): '''@brief Find the RS Components HTML page for a part number and return the URL and parse tree. @param pn Part number `str()`. 
@param extra_search_terms @param url @param descend - @param local_part_html @return (html `str()` of the page, url) ''' diff --git a/kicost/distributors/tme/tme.py b/kicost/distributors/tme/tme.py index f2403e629..edf3ccb62 100644 --- a/kicost/distributors/tme/tme.py +++ b/kicost/distributors/tme/tme.py @@ -49,9 +49,8 @@ from urllib.parse import quote_plus as urlquote, urlencode class dist_tme(distributor.distributor): - def __init__(self, scrape_retries, log_level, throttle_delay): - super(dist_tme, self).__init__(scrape_retries, log_level, throttle_delay) - self.name = 'tme' + def __init__(self, name, scrape_retries, log_level, throttle_delay): + super(dist_tme, self).__init__(name, scrape_retries, log_level, throttle_delay) self.domain = distributor_dict[self.name]['site']['url'] self.browser.scrape_URL(self.domain) @@ -160,13 +159,12 @@ def dist_get_qty_avail(self, html_tree): return None - def dist_get_part_html_tree(self, pn, extra_search_terms='', url=None, descend=2, local_part_html=None): + def dist_get_part_html_tree(self, pn, extra_search_terms='', url=None, descend=2): '''@brief Find the TME HTML page for a part number and return the URL and parse tree. @param pn Part number `str()`. @param extra_search_terms @param url @param descend - @param local_part_html @return (html `str()` of the page, url) ''' diff --git a/kicost/kicost.py b/kicost/kicost.py index 3a09507e6..b592fcd93 100644 --- a/kicost/kicost.py +++ b/kicost/kicost.py @@ -204,33 +204,29 @@ def kicost(in_file, eda_tool_name, out_filename, logger.warning("No 'manf#' and '%s#' field in any part: distributor '%s' will be not scraped.", d, distributor_dict[d]['label']) distributor_dict.pop(d, None) - # Create an HTML page containing all the local part information. - local_distributor = dist_local(scrape_retries, 5, throttling_delay) # TODO: log level - local_part_html = local_distributor.create_part_html(parts, distributor_dict) - if logger.isEnabledFor(DEBUG_DETAILED): pprint.pprint(distributor_dict) + # Create an HTML page containing all the local part information. + dist_local.create_part_html(parts, distributor_dict, logger) + # Get the distributor product page for each part and scrape the part data. if dist_list: - # Instanciate distributors for d in list(distributor_dict.keys()): try: - ctor = globals()["dist_"+d] - # TODO: use logger, not print - # TODO: logger does not print anything logger.log(DEBUG_OVERVIEW, "Initialising %s" % d) - print("Initialising %s" % d) - # TODO: farnell does not respond - distributor_dict[d]['instance'] = ctor(scrape_retries, 5, throttling_delay) # TODO: log level - except: - logger.log(DEBUG_OVERVIEW, "Initialising %s failed, exculding this distributor..." % d) + if distributor_dict[d]['scrape'] == 'local': + ctor = globals()['dist_local'] + else: + ctor = globals()['dist_'+d] + distributor_dict[d]['instance'] = ctor(d, scrape_retries, 5, throttling_delay) # TODO: log level + except Exception as ex: + logger.log(DEBUG_OVERVIEW, "Initialising %s failed with %s, exculding this distributor..." 
\ + % (d, type(ex).__name__)) distributor_dict.pop(d, None) pass - # TODO: multithreaded init, use another pool - if local_currency: logger.log(DEBUG_OVERVIEW, '# Configuring the distributors locale and currency...') for d in distributor_dict: @@ -259,23 +255,19 @@ def emit(self, record): # Init part info dictionaries for part in parts: - pprint.pprint(vars(part)) part.part_num = {} part.url = {} part.price_tiers = {} part.qty_avail = {} part.info_dist = {} - #partsByDist = partListByDistributors(parts) if num_processes <= 1: # Scrape data, one part at a time using single processing. for d in distributor_dict: - print("Dist loop d=%s" % d) + logger.log(DEBUG_OVERVIEW, "Scraping "+ inst.name) for i in range(len(parts)): - print("Part loop i=%d" % i) id, dist, url, part_num, price_tiers, qty_avail, info_dist = \ - scrape_result = distributor_dict[d]['instance'].scrape_part \ - (i, parts[i], local_part_html) + scrape_result = distributor_dict[d]['instance'].scrape_part(i, parts[i]) parts[id].part_num[dist] = part_num parts[id].url[dist] = url @@ -293,12 +285,13 @@ def emit(self, record): # Package part data for passing to each process. arg_sets = [(distributor_dict[d]['instance'], parts, \ - local_part_html, scraping_progress) for d in distributor_dict] + scraping_progress) for d in distributor_dict] - def mt_scrape_part(inst, parts, local_part_html, scraping_progress): + def mt_scrape_part(inst, parts, scraping_progress): + logger.log(DEBUG_OVERVIEW, "Scraping "+ inst.name) retval = list() for i in range(len(parts)): - retval.append(inst.scrape_part(i, parts[i], local_part_html)) + retval.append(inst.scrape_part(i, parts[i])) scraping_progress.update(1) return retval From 7182bfe49f6de9930637c44dbc1395b0e4b134bf Mon Sep 17 00:00:00 2001 From: Max Maisel Date: Sat, 2 Jun 2018 11:23:52 +0200 Subject: [PATCH 13/29] Fixed logger initialization, log messages from kicost.py should now show up correctly. --- kicost/__main__.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/kicost/__main__.py b/kicost/__main__.py index 9f26339df..e4178eb5c 100644 --- a/kicost/__main__.py +++ b/kicost/__main__.py @@ -47,7 +47,6 @@ HTML_RESPONSE_RETRIES = 2 # Number of attempts to retrieve part data from a website. from .globals import * -logger = logging.getLogger('kicost') ############################################################################### # Command-line interface. @@ -174,7 +173,7 @@ def main(): #handler = logging.StreamHandler(sys.stdout) #handler.setLevel(log_level) #logger.addHandler(handler) # It's not necessary to add a handle here, the default is already `sys.stdout` and adding twice it creates the BUG #193, doesn't allowing to use correctly the `tqdm` (process bar) print handle. - logger.setLevel(log_level) + logging.basicConfig(level=log_level, format='%(message)s') if args.show_dist_list: print('Distributor list:', *sorted(list(distributor_dict.keys()))) From 071f5d031a8fea06ff528a15fa924513e7deaaa5 Mon Sep 17 00:00:00 2001 From: Max Maisel Date: Sat, 2 Jun 2018 13:43:38 +0200 Subject: [PATCH 14/29] Added new debugging level DEBUG_HTTP_RESPONSES and added per thread timing traces. 
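
Raw HTML responses can be very large, so dumping them at DEBUG_OBSESSIVE made that level hard to use; they now go to a dedicated level one step below it. A minimal, self-contained illustration of how the new level slots into the existing scheme (the numeric values match kicost/globals.py, the handler setup is only for the example):

    import logging

    DEBUG_OVERVIEW       = logging.DEBUG
    DEBUG_DETAILED       = logging.DEBUG - 1
    DEBUG_OBSESSIVE      = logging.DEBUG - 2
    DEBUG_HTTP_RESPONSES = logging.DEBUG - 3   # new level added by this patch

    logging.basicConfig(level=DEBUG_HTTP_RESPONSES, format='%(message)s')
    logger = logging.getLogger('kicost')

    logger.log(DEBUG_OBSESSIVE, 'Unknown error for %s from %s', 'PN1234', 'mouser')
    logger.log(DEBUG_HTTP_RESPONSES, 'Response was %s', '<html>...</html>')

The timing trace simply logs, per distributor, the seconds elapsed since the distributor module was imported, which makes it easier to see how the scraping threads interleave.
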
--- kicost/distributors/digikey/digikey.py | 3 ++- kicost/distributors/distributor.py | 3 +++ kicost/distributors/farnell/farnell.py | 6 +++--- kicost/distributors/local/local.py | 2 -- kicost/distributors/mouser/mouser.py | 4 ++-- kicost/distributors/newark/newark.py | 3 ++- kicost/distributors/rs/rs.py | 3 ++- kicost/distributors/tme/tme.py | 3 ++- kicost/globals.py | 1 + 9 files changed, 17 insertions(+), 11 deletions(-) diff --git a/kicost/distributors/digikey/digikey.py b/kicost/distributors/digikey/digikey.py index b8f7561cb..7564beab0 100644 --- a/kicost/distributors/digikey/digikey.py +++ b/kicost/distributors/digikey/digikey.py @@ -43,7 +43,7 @@ from .. import fake_browser from .. import EXTRA_INFO_DIST, extra_info_dist_name_translations from ...globals import PartHtmlError -from ...globals import logger, DEBUG_OVERVIEW, DEBUG_DETAILED, DEBUG_OBSESSIVE +from ...globals import logger, DEBUG_OVERVIEW, DEBUG_DETAILED, DEBUG_OBSESSIVE, DEBUG_HTTP_RESPONSES from .. import distributor, distributor_dict @@ -344,6 +344,7 @@ def dist_get_part_html_tree(self, pn, extra_search_terms='', url=None, descend=2 # I don't know what happened here, so give up. self.logger.log(DEBUG_OBSESSIVE,'Unknown error for {} from {}'.format(pn, self.name)) + self.logger.log(DEBUG_HTTP_RESPONSES,'Response was %s' % html) raise PartHtmlError def part_is_reeled(self, html_tree): diff --git a/kicost/distributors/distributor.py b/kicost/distributors/distributor.py index e14278176..a4f5b4631 100644 --- a/kicost/distributors/distributor.py +++ b/kicost/distributors/distributor.py @@ -58,6 +58,7 @@ import os, re class distributor: + start_time = time.time() def __init__(self, name, scrape_retries, log_level, throttle_delay): self.name = name self.page_accessed = False @@ -189,6 +190,8 @@ def get_part_html_tree(self, part): if key in part.fields: if part.fields[key]: self.page_accessed = True + self.logger.log(DEBUG_OBSESSIVE, "%s: scrape timing: %.2f" \ + % (self.name, time.time() - distributor.start_time)) return self.dist_get_part_html_tree(part.fields[key], extra_search_terms) # No distributor or manufacturer number, so give up. else: diff --git a/kicost/distributors/farnell/farnell.py b/kicost/distributors/farnell/farnell.py index 287cb368f..cd6d0758d 100644 --- a/kicost/distributors/farnell/farnell.py +++ b/kicost/distributors/farnell/farnell.py @@ -41,7 +41,7 @@ #from .. import urlencode, urlquote, urlsplit, urlunsplit from .. import fake_browser from ...globals import PartHtmlError -from ...globals import logger, DEBUG_OVERVIEW, DEBUG_DETAILED, DEBUG_OBSESSIVE +from ...globals import logger, DEBUG_OVERVIEW, DEBUG_DETAILED, DEBUG_OBSESSIVE, DEBUG_HTTP_RESPONSES from currency_converter import CurrencyConverter currency = CurrencyConverter() @@ -196,7 +196,7 @@ def dist_get_part_html_tree(self, pn, extra_search_terms='', url=None, descend=2 product_links.append(p.find('td', class_='mftrPart').find('a')) except AttributeError: continue - print('>>> ',pn,products,product_links)#TODO + #print('>>> ',pn,products,product_links)#TODO # Extract all the part numbers from the text portion of the links. part_numbers = [l.text for l in product_links] @@ -215,5 +215,5 @@ def dist_get_part_html_tree(self, pn, extra_search_terms='', url=None, descend=2 # I don't know what happened here, so give up. 
self.logger.log(DEBUG_OBSESSIVE,'Unknown error for {} from {}'.format(pn, self.name)) - self.logger.log(DEBUG_OBSESSIVE,'Response was %s' % html) + self.logger.log(DEBUG_HTTP_RESPONSES,'Response was %s' % html) raise PartHtmlError diff --git a/kicost/distributors/local/local.py b/kicost/distributors/local/local.py index 673fda658..a5ced0487 100644 --- a/kicost/distributors/local/local.py +++ b/kicost/distributors/local/local.py @@ -183,8 +183,6 @@ def dist_get_part_html_tree(self, pn, extra_search_terms='', url=None, descend=N # Extract the HTML tree from the local part HTML page. try: - print("dist_local.html") - print(dist_local.html) tree = BeautifulSoup(dist_local.html, 'lxml') except Exception: raise PartHtmlError diff --git a/kicost/distributors/mouser/mouser.py b/kicost/distributors/mouser/mouser.py index 23fd54b4e..be2108267 100644 --- a/kicost/distributors/mouser/mouser.py +++ b/kicost/distributors/mouser/mouser.py @@ -41,7 +41,7 @@ #from .. import urlencode, urlquote, urlsplit, urlunsplit from .. import fake_browser from ...globals import PartHtmlError -from ...globals import DEBUG_OVERVIEW, DEBUG_DETAILED, DEBUG_OBSESSIVE +from ...globals import logger, DEBUG_OVERVIEW, DEBUG_DETAILED, DEBUG_OBSESSIVE, DEBUG_HTTP_RESPONSES from .. import distributor, distributor_dict @@ -215,5 +215,5 @@ def dist_get_part_html_tree(self, pn, extra_search_terms='', url=None, descend=2 # I don't know what happened here, so give up. self.logger.log(DEBUG_OBSESSIVE,'Unknown error for {} from {}'.format(pn, self.name)) - self.logger.log(DEBUG_OBSESSIVE,'Response was %s' % html) + self.logger.log(DEBUG_HTTP_RESPONSES,'Response was %s' % html) raise PartHtmlError diff --git a/kicost/distributors/newark/newark.py b/kicost/distributors/newark/newark.py index 44532dc95..5f216b9e1 100644 --- a/kicost/distributors/newark/newark.py +++ b/kicost/distributors/newark/newark.py @@ -41,7 +41,7 @@ #from .. import urlencode, urlquote, urlsplit, urlunsplit from .. import fake_browser from ...globals import PartHtmlError -from ...globals import logger, DEBUG_OVERVIEW, DEBUG_DETAILED, DEBUG_OBSESSIVE +from ...globals import logger, DEBUG_OVERVIEW, DEBUG_DETAILED, DEBUG_OBSESSIVE, DEBUG_HTTP_RESPONSES from .. import distributor, distributor_dict @@ -217,4 +217,5 @@ def dist_get_part_html_tree(self, pn, extra_search_terms='', url=None, descend=2 # I don't know what happened here, so give up. self.logger.log(DEBUG_OBSESSIVE,'Unknown error for {} from {}'.format(pn, self.name)) + self.logger.log(DEBUG_HTTP_RESPONSES,'Response was %s' % html) raise PartHtmlError diff --git a/kicost/distributors/rs/rs.py b/kicost/distributors/rs/rs.py index bf07c42ec..960cf87d7 100644 --- a/kicost/distributors/rs/rs.py +++ b/kicost/distributors/rs/rs.py @@ -37,7 +37,7 @@ #from .. import urlencode, urlquote, urlsplit, urlunsplit from .. import fake_browser from ...globals import PartHtmlError -from ...globals import logger, DEBUG_OVERVIEW, DEBUG_DETAILED, DEBUG_OBSESSIVE +from ...globals import logger, DEBUG_OVERVIEW, DEBUG_DETAILED, DEBUG_OBSESSIVE, DEBUG_HTTP_RESPONSES from currency_converter import CurrencyConverter currency = CurrencyConverter() @@ -185,4 +185,5 @@ def dist_get_part_html_tree(self, pn, extra_search_terms='', url=None, descend=2 # I don't know what happened here, so give up. 
self.logger.log(DEBUG_OBSESSIVE,'Unknown error for {} from {}'.format(pn, dist)) + self.logger.log(DEBUG_HTTP_RESPONSES,'Response was %s' % html) raise PartHtmlError diff --git a/kicost/distributors/tme/tme.py b/kicost/distributors/tme/tme.py index edf3ccb62..1c22b6d89 100644 --- a/kicost/distributors/tme/tme.py +++ b/kicost/distributors/tme/tme.py @@ -42,7 +42,7 @@ #from .. import urlencode, urlquote, urlsplit, urlunsplit from .. import fake_browser from ...globals import PartHtmlError -from ...globals import logger, DEBUG_OVERVIEW, DEBUG_DETAILED, DEBUG_OBSESSIVE +from ...globals import logger, DEBUG_OVERVIEW, DEBUG_DETAILED, DEBUG_OBSESSIVE, DEBUG_HTTP_RESPONSES from .. import distributor, distributor_dict @@ -249,4 +249,5 @@ def dist_get_part_html_tree(self, pn, extra_search_terms='', url=None, descend=2 # I don't know what happened here, so give up. self.logger.log(DEBUG_OBSESSIVE,'Unknown error for {} from {}'.format(pn, self.name)) + self.logger.log(DEBUG_HTTP_RESPONSES,'Response was %s' % html) raise PartHtmlError diff --git a/kicost/globals.py b/kicost/globals.py index 444b22198..226da2797 100644 --- a/kicost/globals.py +++ b/kicost/globals.py @@ -28,6 +28,7 @@ DEBUG_OVERVIEW = logging.DEBUG DEBUG_DETAILED = logging.DEBUG-1 DEBUG_OBSESSIVE = logging.DEBUG-2 +DEBUG_HTTP_RESPONSES = logging.DEBUG-3 SEPRTR = ':' # Delimiter between library:component, distributor:field, etc. From 253346bef776c3de561dfb170066c4f56f50c01d Mon Sep 17 00:00:00 2001 From: Max Maisel Date: Sat, 2 Jun 2018 13:44:51 +0200 Subject: [PATCH 15/29] Limit num_processes to distributor count, fixed name error. --- kicost/kicost.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/kicost/kicost.py b/kicost/kicost.py index b592fcd93..58b84a5eb 100644 --- a/kicost/kicost.py +++ b/kicost/kicost.py @@ -210,6 +210,10 @@ def kicost(in_file, eda_tool_name, out_filename, # Create an HTML page containing all the local part information. dist_local.create_part_html(parts, distributor_dict, logger) + num_processes = min(num_processes, len(distributor_dict)) + logger.log(DEBUG_OBSESSIVE, "Initialising scraper with %d processes" % num_processes) + logger.log(DEBUG_OBSESSIVE, "throttling_delay=%d" % throttling_delay) + # Get the distributor product page for each part and scrape the part data. if dist_list: # Instanciate distributors @@ -261,10 +265,12 @@ def emit(self, record): part.qty_avail = {} part.info_dist = {} + num_processes = min(num_processes, len(distributor_dict)) + if num_processes <= 1: # Scrape data, one part at a time using single processing. for d in distributor_dict: - logger.log(DEBUG_OVERVIEW, "Scraping "+ inst.name) + logger.log(DEBUG_OVERVIEW, "Scraping "+ distributor_dict[d]['instance'].name) for i in range(len(parts)): id, dist, url, part_num, price_tiers, qty_avail, info_dist = \ scrape_result = distributor_dict[d]['instance'].scrape_part(i, parts[i]) From 53bd3621ba6196b4c9d1ba5b1c649b3523ce477a Mon Sep 17 00:00:00 2001 From: Max Maisel Date: Sat, 2 Jun 2018 13:45:46 +0200 Subject: [PATCH 16/29] Default throttling_delay to 5 seconds to avoid getting banned. 
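
Several distributors temporarily ban clients that issue requests back to back, so the minimum delay between successive requests to the same site now defaults to 5 s instead of 0. The old behaviour can still be requested explicitly with --throttling_delay 0. The delay bookkeeping amounts to the simplified model below (the real code lives in distributor.py and, after a later patch in this series, in fake_browser.py; the class here is only an illustration):

    import time

    class throttle(object):
        '''Simplified model of the per-site request delay.'''
        def __init__(self, delay):
            self.delay = delay              # minimum seconds between requests
            self.timeout = time.time()      # earliest time for the next request

        def wait(self):
            sleep_time = self.timeout - time.time()
            if sleep_time > 0:
                time.sleep(sleep_time)
            self.timeout = time.time() + self.delay

    t = throttle(5.0)
    for url in ('https://www.mouser.com/a', 'https://www.mouser.com/b'):
        t.wait()                            # the second iteration sleeps roughly 5 s
        # ... fetch and parse url here ...
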
--- kicost/__main__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kicost/__main__.py b/kicost/__main__.py index e4178eb5c..ea06262d7 100644 --- a/kicost/__main__.py +++ b/kicost/__main__.py @@ -148,7 +148,7 @@ def main(): metavar = 'NUM_RETRIES', help='Specify the number of attempts to retrieve part data from a website.') parser.add_argument('--throttling_delay', - nargs='?', type=float, default=0.0, + nargs='?', type=float, default=5.0, metavar='DELAY', help="Specify minimum delay (in seconds) between successive accesses to a distributor's website.") parser.add_argument('--currency', '--locale', From 9f38aabce0e930f416f787006c2b7489061b965d Mon Sep 17 00:00:00 2001 From: Max Maisel Date: Sun, 3 Jun 2018 10:06:54 +0200 Subject: [PATCH 17/29] Moved throttling_delay handling to fake_browser. --- kicost/distributors/distributor.py | 20 +------------------- kicost/distributors/fake_browser.py | 18 +++++++++++++++++- 2 files changed, 18 insertions(+), 20 deletions(-) diff --git a/kicost/distributors/distributor.py b/kicost/distributors/distributor.py index a4f5b4631..121cb56ba 100644 --- a/kicost/distributors/distributor.py +++ b/kicost/distributors/distributor.py @@ -61,14 +61,11 @@ class distributor: start_time = time.time() def __init__(self, name, scrape_retries, log_level, throttle_delay): self.name = name - self.page_accessed = False self.scrape_retries = scrape_retries self.logger = logger self.log_level = log_level - self.throttle_delay = throttle_delay - self.throttle_timeout = time.time() self.domain = None - self.browser = fake_browser.fake_browser(self.logger, self.scrape_retries) + self.browser = fake_browser.fake_browser(self.logger, self.scrape_retries, throttle_delay) # Abstract methods, implemented in distributor specific modules def dist_get_part_html_tree(self, pn, extra_search_terms, url, descend): @@ -138,20 +135,6 @@ def scrape_part(self, id, part): price_tiers = {} info_dist = {} - if distributor_dict[self.name]['scrape']=='web': - if self.page_accessed == True: - # Check the throttling timeout for the chosen distributor to see if - # another access to its website is allowed. - if self.throttle_timeout > time.time(): - time.sleep(self.throttle_timeout - time.time()) - - # Update the timeout for this distributor website and release the sync. lock. - self.throttle_timeout = time.time() + self.throttle_delay - # Founded manufacturer / distributor code valid (not empty). - else: - self.logger.log(DEBUG_OBSESSIVE,'No delay for %s, type=%s' \ - % (self.name, distributor_dict[self.name]['scrape'])) - # Get the HTML tree for the part. html_tree, url = self.get_part_html_tree(part) @@ -189,7 +172,6 @@ def get_part_html_tree(self, part): for key in (self.name+'#', self.name+SEPRTR+'cat#', 'manf#'): if key in part.fields: if part.fields[key]: - self.page_accessed = True self.logger.log(DEBUG_OBSESSIVE, "%s: scrape timing: %.2f" \ % (self.name, time.time() - distributor.start_time)) return self.dist_get_part_html_tree(part.fields[key], extra_search_terms) diff --git a/kicost/distributors/fake_browser.py b/kicost/distributors/fake_browser.py index f9612251a..7ab5e4d26 100644 --- a/kicost/distributors/fake_browser.py +++ b/kicost/distributors/fake_browser.py @@ -25,6 +25,7 @@ __email__ = 'info@xess.com' from random import choice +import time import http.client # For web scraping exceptions. import requests @@ -150,7 +151,7 @@ def get_user_agent(): # Open the URL, read the HTML from it, and parse it into a tree structure. 
class fake_browser: - def __init__(self, logger, scrape_retries): + def __init__(self, logger, scrape_retries, throttle_delay): '''@brief fake_browser @param logger @param scrape_retries `int` Quantity of retries in case of fail. @@ -163,6 +164,9 @@ def __init__(self, logger, scrape_retries): self.session = requests.session() self.session.headers["User-Agent"] = self.userAgent + self.throttle_delay = throttle_delay + self.throttle_timeout = time.time() + self.scrape_retries = scrape_retries self.logger = logger @@ -180,6 +184,18 @@ def scrape_URL(self, url, add_header=[]): for _ in range(self.scrape_retries): try: + # Check the throttling timeout of this browser to see if + # another access to its website is allowed. + + sleepTime = self.throttle_timeout - time.time() + self.logger.log(DEBUG_OBSESSIVE, "browser: time=%.2f, timeout=%.2f, sleep=%.2f" \ + % (time.time(), self.throttle_timeout, sleepTime)) + if sleepTime > 0: + time.sleep(sleepTime) + + # Update the timeout for this browser. + self.throttle_timeout = time.time() + self.throttle_delay + html = self.session.get(url, timeout=5).text break except Exception as ex: From 6ac3ec258dbe863cb5961c5b4f979df0d1cefaf7 Mon Sep 17 00:00:00 2001 From: Max Maisel Date: Sun, 3 Jun 2018 10:08:59 +0200 Subject: [PATCH 18/29] Revised python2/3 import differences as suggested by hildogr. --- kicost/distributors/distributor.py | 4 ++-- kicost/distributors/fake_browser.py | 5 +++-- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/kicost/distributors/distributor.py b/kicost/distributors/distributor.py index 121cb56ba..3989458a1 100644 --- a/kicost/distributors/distributor.py +++ b/kicost/distributors/distributor.py @@ -36,13 +36,13 @@ from . import fake_browser import http.client # For web scraping exceptions. -try: +if sys.version_info>=(3,0): # This is for Python 3. from urllib.parse import urlencode, quote_plus as urlquote, urlsplit, urlunsplit from urllib.request import urlopen, Request import urllib.error WEB_SCRAPE_EXCEPTIONS = (urllib.error.URLError, http.client.HTTPException) -except ImportError: +else: # This is for Python 2. from urlparse import urlsplit, urlunsplit from urllib import urlencode, quote_plus as urlquote diff --git a/kicost/distributors/fake_browser.py b/kicost/distributors/fake_browser.py index 7ab5e4d26..0fd332e33 100644 --- a/kicost/distributors/fake_browser.py +++ b/kicost/distributors/fake_browser.py @@ -24,6 +24,7 @@ __author__ = 'XESS Corporation' __email__ = 'info@xess.com' +import sys from random import choice import time @@ -32,13 +33,13 @@ from ..globals import DEBUG_OVERVIEW, DEBUG_DETAILED, DEBUG_OBSESSIVE -try: +if sys.version_info>=(3,0): # This is for Python 3 from urllib.parse import urlencode, quote_plus as urlquote, urlsplit, urlunsplit from urllib.request import urlopen, Request import urllib.error WEB_SCRAPE_EXCEPTIONS = (urllib.error.URLError, http.client.HTTPException) -except ImportError: +else: # This is for Python 2 from urlparse import urlsplit, urlunsplit from urllib import urlencode, quote_plus as urlquote From dd69881ff18e6efdf5a4e6aa68a7fc2ea3ce269b Mon Sep 17 00:00:00 2001 From: Max Maisel Date: Sun, 3 Jun 2018 10:11:05 +0200 Subject: [PATCH 19/29] Implemented multithreaded distributor initialisation and reduced mt_scrape_part argument count. 
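
Creating a distributor instance mostly waits on network I/O (the first page fetch that sets up cookies), so the instances are now created from a thread pool, mirroring the pool already used for scraping; an initialiser that raises reports (name, None) and that distributor is dropped. The shape of the pattern, with dummy initialisers standing in for the real classes:

    from multiprocessing.pool import ThreadPool
    import time

    def mt_init_dist(name, scrape):
        '''Stand-in for the real per-distributor initialisation.'''
        try:
            time.sleep(0.1)                 # placeholder for the first page fetch
            return (name, 'instance of dist_' + name)
        except Exception:
            return (name, None)             # failed init: caller drops this distributor

    names = {'digikey': 'web', 'mouser': 'web', 'local': 'local'}
    pool = ThreadPool(min(4, len(names)))
    results = [pool.apply_async(mt_init_dist, (d, names[d])) for d in names]
    pool.close()
    pool.join()
    instances = {d: inst for d, inst in (r.get() for r in results) if inst is not None}

Threads rather than processes are enough here because the work is I/O bound, and they avoid the pickling problems that real process pools have with the distributor objects.
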
--- kicost/kicost.py | 61 +++++++++++++++++++++++++++++++++--------------- 1 file changed, 42 insertions(+), 19 deletions(-) diff --git a/kicost/kicost.py b/kicost/kicost.py index 58b84a5eb..4d83b0474 100644 --- a/kicost/kicost.py +++ b/kicost/kicost.py @@ -211,34 +211,56 @@ def kicost(in_file, eda_tool_name, out_filename, dist_local.create_part_html(parts, distributor_dict, logger) num_processes = min(num_processes, len(distributor_dict)) - logger.log(DEBUG_OBSESSIVE, "Initialising scraper with %d processes" % num_processes) + logger.log(DEBUG_OBSESSIVE, "Initialising scraper with %d threads" % num_processes) logger.log(DEBUG_OBSESSIVE, "throttling_delay=%d" % throttling_delay) # Get the distributor product page for each part and scrape the part data. if dist_list: - # Instanciate distributors - for d in list(distributor_dict.keys()): + + # Create thread pool to init multiple distributors simultaneously. + pool = ThreadPool(num_processes) + + # Package part data for passing to each process. + arg_sets = [(d, distributor_dict[d]['scrape']) for d in distributor_dict] + + def mt_init_dist(d, scrape): + instance = None try: logger.log(DEBUG_OVERVIEW, "Initialising %s" % d) - if distributor_dict[d]['scrape'] == 'local': + if scrape == 'local': ctor = globals()['dist_local'] else: ctor = globals()['dist_'+d] - distributor_dict[d]['instance'] = ctor(d, scrape_retries, 5, throttling_delay) # TODO: log level + instance = ctor(d, scrape_retries, 5, throttling_delay) # TODO: log level except Exception as ex: logger.log(DEBUG_OVERVIEW, "Initialising %s failed with %s, exculding this distributor..." \ % (d, type(ex).__name__)) + return (d, None) + + if local_currency: + logger.log(DEBUG_OVERVIEW, '# Configuring the distributors locale and currency...') + instance.define_locale_currency(local_currency) + return (d, instance) + + logger.log(DEBUG_OBSESSIVE, 'Starting {} threads to init distributors...'.format(num_processes)) + pprint.pprint(arg_sets) + results = [pool.apply_async(mt_init_dist, args) for args in arg_sets] + + # Wait for all the processes to have results. + pool.close() + pool.join() + + # Get the data from each process result structure. + for result in results: + d, instance = result.get() + # Distributor initialisation failed, remove it from distributor_dict. + if instance == None: distributor_dict.pop(d, None) - pass - - if local_currency: - logger.log(DEBUG_OVERVIEW, '# Configuring the distributors locale and currency...') - for d in distributor_dict: - distributor_dict[d]['instance'].define_locale_currency(local_currency) + # Distributor initialised successfully, add instance to distributor_dict. + else: + distributor_dict[d]['instance'] = instance logger.log(DEBUG_OVERVIEW, '# Scraping part data for each component group...') - - global scraping_progress scraping_progress = tqdm.tqdm(desc='Progress', total=len(parts)*len(distributor_dict), unit='part', miniters=1) # Change the logging print channel to `tqdm` to keep the process bar to the end of terminal. @@ -285,15 +307,16 @@ def emit(self, record): # Scrape data, multiple parts at a time using multiprocessing. # Create thread pool to scrape data for multiple distributors simultaneously. - # PYthon threads are time-sliced but they work in our I/O limited scenario + # Python threads are time-sliced but they work in our I/O limited scenario # and avoid all kinds of pickle issues. pool = ThreadPool(num_processes) # Package part data for passing to each process. 
- arg_sets = [(distributor_dict[d]['instance'], parts, \ - scraping_progress) for d in distributor_dict] + # pool.async_apply needs at least two arguments per function so add dummy argument + # (otherwise it fails with "arguments after * must be an iterable, not ...") + arg_sets = [(distributor_dict[d]['instance'], None) for d in distributor_dict] - def mt_scrape_part(inst, parts, scraping_progress): + def mt_scrape_part(inst, dummy): logger.log(DEBUG_OVERVIEW, "Scraping "+ inst.name) retval = list() for i in range(len(parts)): @@ -302,13 +325,13 @@ def mt_scrape_part(inst, parts, scraping_progress): return retval # Start the web scraping processes, one for each part. - logger.log(DEBUG_OBSESSIVE, 'Starting {} parallels process to scrap parts...'.format(num_processes)) + logger.log(DEBUG_OBSESSIVE, 'Starting {} parallel threads to scrap parts...'.format(num_processes)) results = [pool.apply_async(mt_scrape_part, args) for args in arg_sets] # Wait for all the processes to have results, then kill-off all the scraping processes. pool.close() pool.join() - logger.log(DEBUG_OVERVIEW, 'All parallels process finished with success.') + logger.log(DEBUG_OVERVIEW, 'All parallel threads finished with success.') # Get the data from each process result structure. for res_proc in results: From 0718fd3b713e805199a46f304ff7be084e1affc6 Mon Sep 17 00:00:00 2001 From: Max Maisel Date: Mon, 4 Jun 2018 17:03:49 +0200 Subject: [PATCH 20/29] Added "requests" dependency to setup.py. --- setup.py | 1 + 1 file changed, 1 insertion(+) diff --git a/setup.py b/setup.py index 689abd43a..8e0c3c202 100644 --- a/setup.py +++ b/setup.py @@ -23,6 +23,7 @@ 'lxml >= 3.7.2', 'yattag >= 1.5.2', 'tqdm >= 4.4.0', + 'requests >= 2.18.4', 'CurrencyConverter >= 0.5', # Used to convert price to a not avaiable currecy in one distributor. 'pycountry >= 18.2', # ISO4117, ISO3166 country and currency definitons from Debian’s pkg-isocodes. # 'wxPython >= 4.0', # Graphical package/library needed to user guide. From 500116e413bae1abc053fd4cd15ec6e75d33bc0b Mon Sep 17 00:00:00 2001 From: Max Maisel Date: Mon, 4 Jun 2018 17:05:21 +0200 Subject: [PATCH 21/29] Moved CurrencyConverter instance to globals. --- kicost/distributors/farnell/farnell.py | 3 +-- kicost/distributors/rs/rs.py | 3 +-- kicost/globals.py | 2 ++ kicost/kicost.py | 2 ++ 4 files changed, 6 insertions(+), 4 deletions(-) diff --git a/kicost/distributors/farnell/farnell.py b/kicost/distributors/farnell/farnell.py index cd6d0758d..3d0afdce5 100644 --- a/kicost/distributors/farnell/farnell.py +++ b/kicost/distributors/farnell/farnell.py @@ -41,9 +41,8 @@ #from .. import urlencode, urlquote, urlsplit, urlunsplit from .. import fake_browser from ...globals import PartHtmlError +from ...globals import currency from ...globals import logger, DEBUG_OVERVIEW, DEBUG_DETAILED, DEBUG_OBSESSIVE, DEBUG_HTTP_RESPONSES -from currency_converter import CurrencyConverter -currency = CurrencyConverter() from .. import distributor, distributor_dict diff --git a/kicost/distributors/rs/rs.py b/kicost/distributors/rs/rs.py index 960cf87d7..bf88b44f4 100644 --- a/kicost/distributors/rs/rs.py +++ b/kicost/distributors/rs/rs.py @@ -38,8 +38,7 @@ from .. import fake_browser from ...globals import PartHtmlError from ...globals import logger, DEBUG_OVERVIEW, DEBUG_DETAILED, DEBUG_OBSESSIVE, DEBUG_HTTP_RESPONSES -from currency_converter import CurrencyConverter -currency = CurrencyConverter() +from ...globals import currency from .. 
import distributor, distributor_dict diff --git a/kicost/globals.py b/kicost/globals.py index 226da2797..4d95d392f 100644 --- a/kicost/globals.py +++ b/kicost/globals.py @@ -23,6 +23,7 @@ """Stuff that everybody else needs to know about.""" import logging +from currency_converter import CurrencyConverter logger = logging.getLogger('kicost') DEBUG_OVERVIEW = logging.DEBUG @@ -32,6 +33,7 @@ SEPRTR = ':' # Delimiter between library:component, distributor:field, etc. +currency = CurrencyConverter() class PartHtmlError(Exception): '''Exception for failed retrieval of an HTML parse tree for a part.''' diff --git a/kicost/kicost.py b/kicost/kicost.py index 4d83b0474..9a2ed03e9 100644 --- a/kicost/kicost.py +++ b/kicost/kicost.py @@ -109,6 +109,8 @@ def kicost(in_file, eda_tool_name, out_filename, @param local_currency `str()` Local/country in ISO3166:2 and currency in ISO4217. Default 'USD'. ''' + logger.log(DEBUG_OVERVIEW, 'Exchange rate: 1 EUR = %.2f USD' % currency.convert(1, 'EUR', 'USD')) + # Only keep distributors in the included list and not in the excluded list. if dist_list!=None: if not dist_list: From 34c34faf25c5205d82223314f220af6f4cfde04b Mon Sep 17 00:00:00 2001 From: Max Maisel Date: Mon, 4 Jun 2018 17:06:09 +0200 Subject: [PATCH 22/29] Fixed remaining digikey refactoring issues. --- kicost/distributors/digikey/digikey.py | 22 ++++++++++------------ 1 file changed, 10 insertions(+), 12 deletions(-) diff --git a/kicost/distributors/digikey/digikey.py b/kicost/distributors/digikey/digikey.py index 7564beab0..2c8e5ca87 100644 --- a/kicost/distributors/digikey/digikey.py +++ b/kicost/distributors/digikey/digikey.py @@ -34,8 +34,6 @@ import future -# TODO: not working yet ? - import re, difflib from bs4 import BeautifulSoup import http.client # For web scraping exceptions. @@ -219,9 +217,9 @@ def dist_get_part_html_tree(self, pn, extra_search_terms='', url=None, descend=2 # Open the URL, read the HTML from it, and parse it into a tree structure. try: - html = fake_browser(url, scrape_retries) - except: - self.logger.log(DEBUG_OBSESSIVE,'No HTML page for {} from {}'.format(pn, self.name)) + html = self.browser.scrape_URL(url) + except Exception as ex: + self.logger.log(DEBUG_OBSESSIVE,'No HTML page for {} from {}, ex: {}'.format(pn, self.name, type(ex).__name__)) raise PartHtmlError # Abort if the part number isn't in the HTML somewhere. @@ -274,9 +272,9 @@ def dist_get_part_html_tree(self, pn, extra_search_terms='', url=None, descend=2 # the entire list for one that's non-reeled. Use this as the # main page for the part. ap_trees_and_urls.append((tree, url)) - if part_is_reeled(tree): + if self.part_is_reeled(tree): for ap_tree, ap_url in ap_trees_and_urls: - if not part_is_reeled(ap_tree): + if not self.part_is_reeled(ap_tree): # Found a non-reeled part, so use it as the main page. tree = ap_tree url = ap_url @@ -290,9 +288,9 @@ def dist_get_part_html_tree(self, pn, extra_search_terms='', url=None, descend=2 try: # Merge the pricing info from that into the main parse tree to make # a single, unified set of price tiers... - merge_price_tiers(tree, ap_tree) + self.merge_price_tiers(tree, ap_tree) # and merge available quantity, using the maximum found. - merge_qty_avail(tree, ap_tree) + self.merge_qty_avail(tree, ap_tree) except AttributeError: self.logger.log(DEBUG_OBSESSIVE,'Problem merging price/qty for {} from {}'.format(pn, self.name)) continue @@ -352,7 +350,7 @@ def part_is_reeled(self, html_tree): @param html_tree `str()` html of the distributor part page. 
@return `True` or `False`. ''' - qty_tiers = list(get_price_tiers(html_tree).keys()) + qty_tiers = list(self.dist_get_price_tiers(html_tree).keys()) if len(qty_tiers) > 0 and min(qty_tiers) >= 100: return True if html_tree.find('table', @@ -372,8 +370,8 @@ def merge_price_tiers(self, main_tree, alt_tree): def merge_qty_avail(self, main_tree, alt_tree): '''Merge the quantities from the alternate-packaging tree into the main tree.''' try: - main_qty = get_qty_avail(main_tree) - alt_qty = get_qty_avail(alt_tree) + main_qty = self.dist_get_qty_avail(main_tree) + alt_qty = self.dist_get_qty_avail(alt_tree) if main_qty is None: merged_qty = alt_qty elif alt_qty is None: From 08412e359c056f914abd7230acd8783f42601ad6 Mon Sep 17 00:00:00 2001 From: Max Maisel Date: Mon, 4 Jun 2018 17:06:40 +0200 Subject: [PATCH 23/29] Fixed remaining RS refactoring issues and updated URL. --- kicost/distributors/rs/__init__.py | 2 +- kicost/distributors/rs/rs.py | 14 +++++++------- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/kicost/distributors/rs/__init__.py b/kicost/distributors/rs/__init__.py index 5c06b17b3..0b4b173d7 100644 --- a/kicost/distributors/rs/__init__.py +++ b/kicost/distributors/rs/__init__.py @@ -25,7 +25,7 @@ }, # Web site defitions. 'site': { - 'url': 'https://rs-online.com/', + 'url': 'https://it.rs-online.com/', 'currency': 'USD', 'locale': 'UK' }, diff --git a/kicost/distributors/rs/rs.py b/kicost/distributors/rs/rs.py index bf88b44f4..1fd87af0e 100644 --- a/kicost/distributors/rs/rs.py +++ b/kicost/distributors/rs/rs.py @@ -134,19 +134,19 @@ def dist_get_part_html_tree(self, pn, extra_search_terms='', url=None, descend=2 try: html = self.browser.scrape_URL(url) except: - self.logger.log(DEBUG_OBSESSIVE,'No HTML page for {} from {}'.format(pn, dist)) + self.logger.log(DEBUG_OBSESSIVE,'No HTML page for {} from {}'.format(pn, self.name)) raise PartHtmlError try: tree = BeautifulSoup(html, 'lxml') except Exception: - self.logger.log(DEBUG_OBSESSIVE,'No HTML tree for {} from {}'.format(pn, dist)) + self.logger.log(DEBUG_OBSESSIVE,'No HTML tree for {} from {}'.format(pn, self.name)) raise PartHtmlError # Abort if the part number isn't in the HTML somewhere. # (Only use the numbers and letters to compare PN to HTML.) if re.sub('[\W_]','',str.lower(pn)) not in re.sub('[\W_]','',str.lower(str(html))): - self.logger.log(DEBUG_OBSESSIVE,'No part number {} in HTML page from {}'.format(pn, dist)) + self.logger.log(DEBUG_OBSESSIVE,'No part number {} in HTML page from {}'.format(pn, self.name)) raise PartHtmlError # If the tree contains the tag for a product page, then just return it. @@ -155,9 +155,9 @@ def dist_get_part_html_tree(self, pn, extra_search_terms='', url=None, descend=2 # If the tree is for a list of products, then examine the links to try to find the part number. if tree.find('div', class_=('resultsTable','results-table-container')) is not None: - self.logger.log(DEBUG_OBSESSIVE,'Found product table for {} from {}'.format(pn, dist)) + self.logger.log(DEBUG_OBSESSIVE,'Found product table for {} from {}'.format(pn, self.name)) if descend <= 0: - self.logger.log(DEBUG_OBSESSIVE,'Passed descent limit for {} from {}'.format(pn, dist)) + self.logger.log(DEBUG_OBSESSIVE,'Passed descent limit for {} from {}'.format(pn, self.name)) raise PartHtmlError else: # Look for the table of products. 
@@ -177,12 +177,12 @@ def dist_get_part_html_tree(self, pn, extra_search_terms='', url=None, descend=2 for i in range(len(product_links)): if part_numbers[i] == match: # Get the tree for the linked-to page and return that. - self.logger.log(DEBUG_OBSESSIVE,'Selecting {} from product table for {} from {}'.format(part_numbers[i], pn, dist)) + self.logger.log(DEBUG_OBSESSIVE,'Selecting {} from product table for {} from {}'.format(part_numbers[i], pn, self.name)) return self.dist_get_part_html_tree(pn, extra_search_terms, url=product_links[i], descend=descend-1) # I don't know what happened here, so give up. - self.logger.log(DEBUG_OBSESSIVE,'Unknown error for {} from {}'.format(pn, dist)) + self.logger.log(DEBUG_OBSESSIVE,'Unknown error for {} from {}'.format(pn, self.name)) self.logger.log(DEBUG_HTTP_RESPONSES,'Response was %s' % html) raise PartHtmlError From 5a904941bbfea34beb29d4fa167b7c14581779ca Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Hildo=20Guillardi=20J=C3=BAnior?= Date: Mon, 4 Jun 2018 17:27:15 +0200 Subject: [PATCH 24/29] TME fix and clean-up imports --- AUTHORS.rst | 3 ++- HISTORY.rst | 12 +++++++----- kicost/distributors/digikey/digikey.py | 12 ++---------- kicost/distributors/farnell/farnell.py | 15 +++------------ kicost/distributors/local/local.py | 6 +----- kicost/distributors/mouser/mouser.py | 15 +++------------ kicost/distributors/newark/newark.py | 15 +++------------ kicost/distributors/rs/rs.py | 7 +------ kicost/distributors/tme/tme.py | 18 ++++-------------- 9 files changed, 26 insertions(+), 77 deletions(-) diff --git a/AUTHORS.rst b/AUTHORS.rst index df70686fb..8e7c4299a 100644 --- a/AUTHORS.rst +++ b/AUTHORS.rst @@ -16,4 +16,5 @@ Contributors * Diorcet Yann: https://github.com/diorcety * Giacinto Luigi Cerone https://github.com/glcerone * Hildo Guillardi Júnior https://github.com/hildogjr -* Adam Heinrich https://github.com/adamheinrich \ No newline at end of file +* Adam Heinrich https://github.com/adamheinrich +* Max Maisel https://github.com/mmmaisel diff --git a/HISTORY.rst b/HISTORY.rst index f8ca7a592..a9165c305 100644 --- a/HISTORY.rst +++ b/HISTORY.rst @@ -7,14 +7,16 @@ History ______________________ * Changed Farnell link and table result format. - +* Fixed TME `fake_browser` migration. +* Re-factored the distributors modules to class kind and improved the scrape sequence to decrease probability of ban. +* Fixed the multi-threading configuration. 0.1.44 (2018-05-28) ______________________ -* Fixed `logging` messages when using `tqdm`(process bar) for sequencial scrape, missing fix for multithreads scrape. -* Improve the `spreadsheet.py` to a lighter file when use just one distributor. -* Improved log messages to better comunity debug. +* Fixed ``logging`` messages when using ``tqdm``(process bar) for sequential scrape, missing fix for multithreads scrape. +* Improve the ``spreadsheet.py`` to a lighter file when use just one distributor. +* Improved log messages to better community debug. * Add Upverter CSV compatibility. * Fixed Mouser "quote price" exception in the price tiers. * Fixed wxPython exception import. @@ -38,7 +40,7 @@ ______________________ * Added ``--group_fields`` option to ignore differences in fields of the components and group them. * Fixed the not ungrouping issue when ``manf#`` equal ``None``. * CSV now accepts files from Proteus and Eagle EDA tools. -* Cleared up unused Python imports and better placed functions into files (spreasheet creation files are now in ``spreadsheet.py``). 
+* Cleared up unused Python imports and better placed functions into files (spreadsheet creation files are now in ``spreadsheet.py``). * Added a KiCost stamp version at the end of the spreadsheet and file information in the beginning, if they are not inside it. * Fixed issues related to user visualization in the spreadsheet (added gray formatted conditioning and the "exclude desc and manf columns"). * Added "user errors" and software scape in the case of not recognized references characters given the message of how to solve. diff --git a/kicost/distributors/digikey/digikey.py b/kicost/distributors/digikey/digikey.py index 2c8e5ca87..2dc56c7d0 100644 --- a/kicost/distributors/digikey/digikey.py +++ b/kicost/distributors/digikey/digikey.py @@ -21,23 +21,15 @@ # THE SOFTWARE. # Inserted by Pasteurize tool. -from __future__ import print_function -from __future__ import unicode_literals -from __future__ import division -from __future__ import absolute_import -from builtins import zip -from builtins import range -from builtins import int -from builtins import str +from __future__ import print_function, unicode_literals, division, absolute_import +from builtins import zip, range, int, str from future import standard_library standard_library.install_aliases() - import future import re, difflib from bs4 import BeautifulSoup import http.client # For web scraping exceptions. -#from .. import urlencode, urlquote, urlsplit, urlunsplit from .. import fake_browser from .. import EXTRA_INFO_DIST, extra_info_dist_name_translations from ...globals import PartHtmlError diff --git a/kicost/distributors/farnell/farnell.py b/kicost/distributors/farnell/farnell.py index 3d0afdce5..fa64308f9 100644 --- a/kicost/distributors/farnell/farnell.py +++ b/kicost/distributors/farnell/farnell.py @@ -21,24 +21,15 @@ # THE SOFTWARE. # Inserted by Pasteurize tool. -from __future__ import print_function -from __future__ import unicode_literals -from __future__ import division -from __future__ import absolute_import -from builtins import zip -from builtins import range -from builtins import int -from builtins import str +from __future__ import print_function, unicode_literals, division, absolute_import +from builtins import zip, range, int, str from future import standard_library standard_library.install_aliases() - import future -import re -import difflib +import re, difflib from bs4 import BeautifulSoup import http.client # For web scraping exceptions. -#from .. import urlencode, urlquote, urlsplit, urlunsplit from .. import fake_browser from ...globals import PartHtmlError from ...globals import currency diff --git a/kicost/distributors/local/local.py b/kicost/distributors/local/local.py index a5ced0487..93fff0723 100644 --- a/kicost/distributors/local/local.py +++ b/kicost/distributors/local/local.py @@ -21,14 +21,10 @@ # THE SOFTWARE. # Inserted by Pasteurize tool. -from __future__ import print_function -from __future__ import unicode_literals -from __future__ import division -from __future__ import absolute_import +from __future__ import print_function, unicode_literals, division, absolute_import from builtins import zip, range, int, str from future import standard_library standard_library.install_aliases() - import future import re, difflib diff --git a/kicost/distributors/mouser/mouser.py b/kicost/distributors/mouser/mouser.py index be2108267..ea724f78a 100644 --- a/kicost/distributors/mouser/mouser.py +++ b/kicost/distributors/mouser/mouser.py @@ -21,24 +21,15 @@ # THE SOFTWARE. # Inserted by Pasteurize tool. 
-from __future__ import print_function -from __future__ import unicode_literals -from __future__ import division -from __future__ import absolute_import -from builtins import zip -from builtins import range -from builtins import int -from builtins import str +from __future__ import print_function, unicode_literals, division, absolute_import +from builtins import zip, range, int, str from future import standard_library standard_library.install_aliases() - import future -import re -import difflib +import re, difflib from bs4 import BeautifulSoup import http.client # For web scraping exceptions. -#from .. import urlencode, urlquote, urlsplit, urlunsplit from .. import fake_browser from ...globals import PartHtmlError from ...globals import logger, DEBUG_OVERVIEW, DEBUG_DETAILED, DEBUG_OBSESSIVE, DEBUG_HTTP_RESPONSES diff --git a/kicost/distributors/newark/newark.py b/kicost/distributors/newark/newark.py index 5f216b9e1..5deb4a5f6 100644 --- a/kicost/distributors/newark/newark.py +++ b/kicost/distributors/newark/newark.py @@ -21,24 +21,15 @@ # THE SOFTWARE. # Inserted by Pasteurize tool. -from __future__ import print_function -from __future__ import unicode_literals -from __future__ import division -from __future__ import absolute_import -from builtins import zip -from builtins import range -from builtins import int -from builtins import str +from __future__ import print_function, unicode_literals, division, absolute_import +from builtins import zip, range, int, str from future import standard_library standard_library.install_aliases() - import future -import re -import difflib +import re, difflib from bs4 import BeautifulSoup import http.client # For web scraping exceptions. -#from .. import urlencode, urlquote, urlsplit, urlunsplit from .. import fake_browser from ...globals import PartHtmlError from ...globals import logger, DEBUG_OVERVIEW, DEBUG_DETAILED, DEBUG_OBSESSIVE, DEBUG_HTTP_RESPONSES diff --git a/kicost/distributors/rs/rs.py b/kicost/distributors/rs/rs.py index 1fd87af0e..5ca7dbc5b 100644 --- a/kicost/distributors/rs/rs.py +++ b/kicost/distributors/rs/rs.py @@ -21,20 +21,15 @@ # THE SOFTWARE. # Inserted by Pasteurize tool. -from __future__ import print_function -from __future__ import unicode_literals -from __future__ import division -from __future__ import absolute_import +from __future__ import print_function, unicode_literals, division, absolute_import from builtins import zip, range, int, str from future import standard_library standard_library.install_aliases() - import future import re, difflib from bs4 import BeautifulSoup import http.client # For web scraping exceptions. -#from .. import urlencode, urlquote, urlsplit, urlunsplit from .. import fake_browser from ...globals import PartHtmlError from ...globals import logger, DEBUG_OVERVIEW, DEBUG_DETAILED, DEBUG_OBSESSIVE, DEBUG_HTTP_RESPONSES diff --git a/kicost/distributors/tme/tme.py b/kicost/distributors/tme/tme.py index 1c22b6d89..138771d96 100644 --- a/kicost/distributors/tme/tme.py +++ b/kicost/distributors/tme/tme.py @@ -21,25 +21,16 @@ # THE SOFTWARE. # Inserted by Pasteurize tool. 
-from __future__ import print_function -from __future__ import unicode_literals -from __future__ import division -from __future__ import absolute_import -from builtins import zip -from builtins import range -from builtins import int -from builtins import str +from __future__ import print_function, unicode_literals, division, absolute_import +from builtins import zip, range, int, str from future import standard_library standard_library.install_aliases() - import future -import re -import difflib +import re, difflib import json from bs4 import BeautifulSoup import http.client # For web scraping exceptions. -#from .. import urlencode, urlquote, urlsplit, urlunsplit from .. import fake_browser from ...globals import PartHtmlError from ...globals import logger, DEBUG_OVERVIEW, DEBUG_DETAILED, DEBUG_OBSESSIVE, DEBUG_HTTP_RESPONSES @@ -73,8 +64,7 @@ def __ajax_details(self, pn): return None, None try: - r = r.decode('utf-8') # Convert bytes to string in Python 3. - p = json.loads(r).get('Products') + p = json.loads(html).get('Products') if p is not None and isinstance(p, list): p = p[0] html_tree = BeautifulSoup(p.get('PriceTpl', '').replace("\n", ""), "lxml") From bcfbe7de293d70ec15d0c0c66d8370726615dd65 Mon Sep 17 00:00:00 2001 From: Max Maisel Date: Tue, 5 Jun 2018 17:09:27 +0200 Subject: [PATCH 25/29] Fixed syntax/naming error in digikey part number from table function. --- kicost/distributors/digikey/digikey.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/kicost/distributors/digikey/digikey.py b/kicost/distributors/digikey/digikey.py index 2dc56c7d0..a07f3ce8e 100644 --- a/kicost/distributors/digikey/digikey.py +++ b/kicost/distributors/digikey/digikey.py @@ -254,8 +254,8 @@ def dist_get_part_html_tree(self, pn, extra_search_terms='', url=None, descend=2 self.logger.log(DEBUG_OBSESSIVE,'Found {} alternate packagings for {} from {}'.format(len(ap_urls), pn, self.name)) ap_trees_and_urls = [] # Initialize as empty in case no alternate packagings are found. try: - ap_trees_and_urls = [get_part_html_tree(self.name, pn, - extra_search_terms, ap_url, descend=0, scrape_retries=scrape_retries) + ap_trees_and_urls = [self.dist_get_part_html_tree(pn, + extra_search_terms, ap_url, descend=0) for ap_url in ap_urls] except Exception: self.logger.log(DEBUG_OBSESSIVE,'Failed to find alternate packagings for {} from {}'.format(pn, self.name)) From d1902b37ba8107a5fa142d2aa820544c147dc698 Mon Sep 17 00:00:00 2001 From: Max Maisel Date: Wed, 6 Jun 2018 17:15:00 +0200 Subject: [PATCH 26/29] Fixed compiler errors in digikey dist_define_locale_currency, removed unnecessary non-logger debug print in kicost.py.
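
dist_define_locale_currency() kept only the country name string returned by pycountry but later tried to read .numeric from it, which failed; the full pycountry record is now kept and only its .name is used in the regular expression. For reference, the pycountry lookups involved behave roughly like the example below (illustration only; the values in the comments are what pycountry 18.x is expected to return):

    import pycountry   # already a KiCost dependency (see setup.py)

    country = pycountry.countries.get(alpha_2='US')
    print(country.name)      # 'United States'
    print(country.numeric)   # '840'
    print(pycountry.currencies.get(numeric=country.numeric).alpha_3)   # 'USD'
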
--- kicost/distributors/digikey/digikey.py | 7 ++++--- kicost/kicost.py | 1 - 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/kicost/distributors/digikey/digikey.py b/kicost/distributors/digikey/digikey.py index a07f3ce8e..da7a970eb 100644 --- a/kicost/distributors/digikey/digikey.py +++ b/kicost/distributors/digikey/digikey.py @@ -130,10 +130,11 @@ def dist_define_locale_currency(self, locale_iso=None, currency_iso=None): locale_iso = pycountry.countries.get(numeric=money.numeric).alpha_2 if locale_iso: locale_iso = locale_iso.upper() - country = pycountry.countries.get(alpha_2=locale_iso.upper()).name - html = html.find('li', text=re.compile(country, re.IGNORECASE)) + country = pycountry.countries.get(alpha_2=locale_iso.upper()) + html = html.find('li', text=re.compile(country.name, re.IGNORECASE)) url = html.find('a', id='linkcolor').get('href') - + + # Store new localized url in distributor_dict. distributor_dict[self.name]['site']['url'] = url distributor_dict[self.name]['site']['currency'] = pycountry.currencies.get(numeric=country.numeric).alpha_3 distributor_dict[self.name]['site']['locale'] = locale_iso diff --git a/kicost/kicost.py b/kicost/kicost.py index 9a2ed03e9..5f121fb3f 100644 --- a/kicost/kicost.py +++ b/kicost/kicost.py @@ -245,7 +245,6 @@ def mt_init_dist(d, scrape): return (d, instance) logger.log(DEBUG_OBSESSIVE, 'Starting {} threads to init distributors...'.format(num_processes)) - pprint.pprint(arg_sets) results = [pool.apply_async(mt_init_dist, args) for args in arg_sets] # Wait for all the processes to have results. From 3aa06596a897ce1fdf1b2c667e136d95ac0cefe1 Mon Sep 17 00:00:00 2001 From: Max Maisel Date: Wed, 6 Jun 2018 17:17:54 +0200 Subject: [PATCH 27/29] Implemented session recreation strategy if scraper gets detected. --- kicost/distributors/digikey/digikey.py | 7 +--- kicost/distributors/distributor.py | 10 +++-- kicost/distributors/fake_browser.py | 58 +++++++++++++++++++++----- kicost/distributors/farnell/farnell.py | 7 +--- kicost/distributors/local/local.py | 2 +- kicost/distributors/mouser/mouser.py | 7 +--- kicost/distributors/newark/newark.py | 7 +--- kicost/distributors/rs/rs.py | 7 +--- kicost/distributors/tme/tme.py | 7 +--- kicost/globals.py | 3 +- 10 files changed, 69 insertions(+), 46 deletions(-) diff --git a/kicost/distributors/digikey/digikey.py b/kicost/distributors/digikey/digikey.py index da7a970eb..0999f005e 100644 --- a/kicost/distributors/digikey/digikey.py +++ b/kicost/distributors/digikey/digikey.py @@ -43,11 +43,8 @@ class dist_digikey(distributor.distributor): def __init__(self, name, scrape_retries, log_level, throttle_delay): - super(dist_digikey, self).__init__(name, scrape_retries, log_level, throttle_delay) - self.domain = distributor_dict[self.name]['site']['url'] - - self.browser.scrape_URL(self.domain) - self.browser.show_cookies(self.name) + super(dist_digikey, self).__init__(name, distributor_dict[name]['site']['url'], + scrape_retries, log_level, throttle_delay) def dist_get_price_tiers(self, html_tree): '''@brief Get the pricing tiers from the parsed tree of the Digikey product page. 
diff --git a/kicost/distributors/distributor.py b/kicost/distributors/distributor.py index 3989458a1..1c2b96f56 100644 --- a/kicost/distributors/distributor.py +++ b/kicost/distributors/distributor.py @@ -59,13 +59,17 @@ class distributor: start_time = time.time() - def __init__(self, name, scrape_retries, log_level, throttle_delay): + def __init__(self, name, domain, scrape_retries, log_level, throttle_delay): self.name = name self.scrape_retries = scrape_retries self.logger = logger self.log_level = log_level - self.domain = None - self.browser = fake_browser.fake_browser(self.logger, self.scrape_retries, throttle_delay) + self.domain = domain + + # Don't create fake_browser for "local" distributor. + if self.domain != None: + self.browser = fake_browser.fake_browser \ + (self.domain, self.logger, self.scrape_retries, throttle_delay) # Abstract methods, implemented in distributor specific modules def dist_get_part_html_tree(self, pn, extra_search_terms, url, descend): diff --git a/kicost/distributors/fake_browser.py b/kicost/distributors/fake_browser.py index 0fd332e33..2d8cd99bf 100644 --- a/kicost/distributors/fake_browser.py +++ b/kicost/distributors/fake_browser.py @@ -31,7 +31,7 @@ import http.client # For web scraping exceptions. import requests -from ..globals import DEBUG_OVERVIEW, DEBUG_DETAILED, DEBUG_OBSESSIVE +from ..globals import DEBUG_OVERVIEW, DEBUG_DETAILED, DEBUG_OBSESSIVE, DEBUG_HTTP_HEADERS, DEBUG_HTTP_RESPONSES if sys.version_info>=(3,0): # This is for Python 3 @@ -152,12 +152,23 @@ def get_user_agent(): # Open the URL, read the HTML from it, and parse it into a tree structure. class fake_browser: - def __init__(self, logger, scrape_retries, throttle_delay): + def __init__(self, domain, logger, scrape_retries, throttle_delay): '''@brief fake_browser @param logger @param scrape_retries `int` Quantity of retries in case of fail. ''' - + + self.config_cookies = list() + self.domain = domain + self.throttle_delay = throttle_delay + self.throttle_timeout = time.time() + + self.scrape_retries = scrape_retries + self.logger = logger + + self.start_new_session() + + def start_new_session(self): self.userAgent = get_user_agent() # Use "requests" instead of "urllib" because "urllib" does not allow @@ -165,25 +176,31 @@ def __init__(self, logger, scrape_retries, throttle_delay): self.session = requests.session() self.session.headers["User-Agent"] = self.userAgent - self.throttle_delay = throttle_delay - self.throttle_timeout = time.time() + # Restore configuration cookies from previous session. + for c in self.config_cookies: + print("Restore cookie: %s", c) + self.session.cookies.set(c[1], c[2], domain=c[0]) - self.scrape_retries = scrape_retries - self.logger = logger + self.scrape_URL(self.domain, retry=False) + self.show_cookies() - def show_cookies(self, name): + def show_cookies(self): for x in self.session.cookies: self.logger.log(DEBUG_OBSESSIVE,"%s Cookie %s" % (x.domain, x.name)) def add_cookie(self, domain, name, value): self.session.cookies.set(name, value, domain=domain) + self.config_cookies.append((domain, name, value)) - def scrape_URL(self, url, add_header=[]): + def scrape_URL(self, url, add_header=[], retry=True): headers = self.session.headers for header in add_header: self.session.headers[header[1]] = header[2] - for _ in range(self.scrape_retries): + retries = self.scrape_retries + if retry == False: + retries = 1 + for _ in range(retries): try: # Check the throttling timeout of this browser to see if # another access to its website is allowed. 
@@ -197,7 +214,26 @@ def scrape_URL(self, url, add_header=[]): # Update the timeout for this browser. self.throttle_timeout = time.time() + self.throttle_delay - html = self.session.get(url, timeout=5).text + resp = self.session.get(url, timeout=5) + self.logger.log(DEBUG_HTTP_HEADERS, "Request headers: %s" % resp.request.headers) + self.logger.log(DEBUG_HTTP_HEADERS, "Response headers: %s" % resp.headers) + + # Uncomment this to dump received HTML to file. + #if self.logger.isEnabledFor(DEBUG_HTTP_RESPONSES): + # f = open("debug-page.html", "w") + # f.write(resp.text) + # f.close() + # input("Received page dumped, Press enter to continue.") + + # start new session if we are detected (received 403) + # TODO: add detection logic for captchas and javascript only pages as well + if resp.status_code == 403: + self.start_new_session() + self.logger.warning("Received 403, scraper possibly detected:" \ + " Starting new session for %s" % self.domain) + continue + + html = resp.text break except Exception as ex: self.logger.log(DEBUG_DETAILED,'Exception of type "%s" while web-scraping %s' \ diff --git a/kicost/distributors/farnell/farnell.py b/kicost/distributors/farnell/farnell.py index fa64308f9..b7289b2d0 100644 --- a/kicost/distributors/farnell/farnell.py +++ b/kicost/distributors/farnell/farnell.py @@ -43,11 +43,8 @@ class dist_farnell(distributor.distributor): def __init__(self, name, scrape_retries, log_level, throttle_delay): - super(dist_farnell, self).__init__(name, scrape_retries, log_level, throttle_delay) - self.domain = distributor_dict[self.name]['site']['url'] - - self.browser.scrape_URL(self.domain) - self.browser.show_cookies(self.name) + super(dist_farnell, self).__init__(name, distributor_dict[name]['site']['url'], + scrape_retries, log_level, throttle_delay) def dist_get_price_tiers(self, html_tree): '''@brief Get the pricing tiers from the parsed tree of the farnell product page. diff --git a/kicost/distributors/local/local.py b/kicost/distributors/local/local.py index 93fff0723..20489d1b2 100644 --- a/kicost/distributors/local/local.py +++ b/kicost/distributors/local/local.py @@ -44,7 +44,7 @@ class dist_local(distributor.distributor): html = None def __init__(self, name, scrape_retries, log_level, throttle_delay): - super(dist_local, self).__init__(name, scrape_retries, log_level, throttle_delay) + super(dist_local, self).__init__(name, None, scrape_retries, log_level, throttle_delay) def create_part_html(parts, distributors, logger): '''@brief Create HTML page containing info for local (non-webscraped) parts. diff --git a/kicost/distributors/mouser/mouser.py b/kicost/distributors/mouser/mouser.py index ea724f78a..1d95de231 100644 --- a/kicost/distributors/mouser/mouser.py +++ b/kicost/distributors/mouser/mouser.py @@ -40,13 +40,10 @@ class dist_mouser(distributor.distributor): def __init__(self, name, scrape_retries, log_level, throttle_delay): - super(dist_mouser, self).__init__(name, scrape_retries, log_level, throttle_delay) - self.domain = distributor_dict[self.name]['site']['url'] + super(dist_mouser, self).__init__(name, distributor_dict[name]['site']['url'], + scrape_retries, log_level, throttle_delay) self.browser.add_cookie('.mouser.com', 'preferences', 'ps=www2&pl=en-US&pc_www2=USDe') - self.browser.scrape_URL(self.domain) - self.browser.show_cookies(self.name) - def dist_get_price_tiers(self, html_tree): '''@brief Get the pricing tiers from the parsed tree of the Mouser product page. @param html_tree `str()` html of the distributor part page. 
diff --git a/kicost/distributors/newark/newark.py b/kicost/distributors/newark/newark.py index 5deb4a5f6..356053600 100644 --- a/kicost/distributors/newark/newark.py +++ b/kicost/distributors/newark/newark.py @@ -40,11 +40,8 @@ class dist_newark(distributor.distributor): def __init__(self, name, scrape_retries, log_level, throttle_delay): - super(dist_newark, self).__init__(name, scrape_retries, log_level, throttle_delay) - self.domain = distributor_dict[self.name]['site']['url'] - - self.browser.scrape_URL(self.domain) - self.browser.show_cookies(self.name) + super(dist_newark, self).__init__(name, distributor_dict[name]['site']['url'], + scrape_retries, log_level, throttle_delay) def dist_get_price_tiers(self, html_tree): '''@brief Get the pricing tiers from the parsed tree of the Newark product page. diff --git a/kicost/distributors/rs/rs.py b/kicost/distributors/rs/rs.py index 5ca7dbc5b..fd80265ef 100644 --- a/kicost/distributors/rs/rs.py +++ b/kicost/distributors/rs/rs.py @@ -41,11 +41,8 @@ class dist_rs(distributor.distributor): def __init__(self, name, scrape_retries, log_level, throttle_delay): - super(dist_rs, self).__init__(name, scrape_retries, log_level, throttle_delay) - self.domain = distributor_dict[self.name]['site']['url'] - - self.browser.scrape_URL(self.domain) - self.browser.show_cookies(self.name) + super(dist_rs, self).__init__(name, distributor_dict[name]['site']['url'], + scrape_retries, log_level, throttle_delay) def dist_get_price_tiers(self, html_tree): '''@brief Get the pricing tiers from the parsed tree of the RS Components product page. diff --git a/kicost/distributors/tme/tme.py b/kicost/distributors/tme/tme.py index 138771d96..21f8aa6d5 100644 --- a/kicost/distributors/tme/tme.py +++ b/kicost/distributors/tme/tme.py @@ -41,11 +41,8 @@ class dist_tme(distributor.distributor): def __init__(self, name, scrape_retries, log_level, throttle_delay): - super(dist_tme, self).__init__(name, scrape_retries, log_level, throttle_delay) - self.domain = distributor_dict[self.name]['site']['url'] - - self.browser.scrape_URL(self.domain) - self.browser.show_cookies(self.name) + super(dist_tme, self).__init__(name, distributor_dict[name]['site']['url'], + scrape_retries, log_level, throttle_delay) def __ajax_details(self, pn): '''@brief Load part details from TME using XMLHttpRequest. diff --git a/kicost/globals.py b/kicost/globals.py index 4d95d392f..d6e75fa12 100644 --- a/kicost/globals.py +++ b/kicost/globals.py @@ -29,7 +29,8 @@ DEBUG_OVERVIEW = logging.DEBUG DEBUG_DETAILED = logging.DEBUG-1 DEBUG_OBSESSIVE = logging.DEBUG-2 -DEBUG_HTTP_RESPONSES = logging.DEBUG-3 +DEBUG_HTTP_HEADERS = logging.DEBUG-3 +DEBUG_HTTP_RESPONSES = logging.DEBUG-4 SEPRTR = ':' # Delimiter between library:component, distributor:field, etc. From fa1d98f0db7065230a00f026bf6e1a69b42a3c84 Mon Sep 17 00:00:00 2001 From: Max Maisel Date: Wed, 6 Jun 2018 17:20:07 +0200 Subject: [PATCH 28/29] Removed unused "log_level" parameter in distributor class. This commit finishes the refactoring. 
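
Since all KiCost debug levels sit just below logging.DEBUG (DEBUG_HTTP_RESPONSES
is the lowest in use at DEBUG-4), the per-process handlers can simply be pinned
to level 1 instead of threading log_level through every constructor; level 0
would mean NOTSET, which is why globals.py now notes logging.DEBUG-9 as the
floor. A short standalone sketch of that behaviour (only the
DEBUG_HTTP_RESPONSES name is taken from globals.py, the rest is illustrative):

    import logging
    import sys

    DEBUG_HTTP_RESPONSES = logging.DEBUG - 4   # = 6, most verbose level in use

    log = logging.getLogger('kicost-sketch')
    handler = logging.StreamHandler(sys.stdout)
    handler.setLevel(1)        # 1 <= 6, so even the most verbose records pass
    log.addHandler(handler)
    log.setLevel(1)

    log.log(DEBUG_HTTP_RESPONSES, 'emitted despite the sub-DEBUG level')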
--- kicost/distributors/digikey/digikey.py | 4 ++-- kicost/distributors/distributor.py | 7 +++---- kicost/distributors/farnell/farnell.py | 4 ++-- kicost/distributors/local/local.py | 4 ++-- kicost/distributors/mouser/mouser.py | 4 ++-- kicost/distributors/newark/newark.py | 4 ++-- kicost/distributors/rs/rs.py | 4 ++-- kicost/distributors/tme/tme.py | 4 ++-- kicost/globals.py | 1 + kicost/kicost.py | 2 +- 10 files changed, 19 insertions(+), 19 deletions(-) diff --git a/kicost/distributors/digikey/digikey.py b/kicost/distributors/digikey/digikey.py index 0999f005e..1059392b7 100644 --- a/kicost/distributors/digikey/digikey.py +++ b/kicost/distributors/digikey/digikey.py @@ -42,9 +42,9 @@ import pycountry class dist_digikey(distributor.distributor): - def __init__(self, name, scrape_retries, log_level, throttle_delay): + def __init__(self, name, scrape_retries, throttle_delay): super(dist_digikey, self).__init__(name, distributor_dict[name]['site']['url'], - scrape_retries, log_level, throttle_delay) + scrape_retries, throttle_delay) def dist_get_price_tiers(self, html_tree): '''@brief Get the pricing tiers from the parsed tree of the Digikey product page. diff --git a/kicost/distributors/distributor.py b/kicost/distributors/distributor.py index 1c2b96f56..cdddfbde2 100644 --- a/kicost/distributors/distributor.py +++ b/kicost/distributors/distributor.py @@ -59,11 +59,10 @@ class distributor: start_time = time.time() - def __init__(self, name, domain, scrape_retries, log_level, throttle_delay): + def __init__(self, name, domain, scrape_retries, throttle_delay): self.name = name self.scrape_retries = scrape_retries self.logger = logger - self.log_level = log_level self.domain = domain # Don't create fake_browser for "local" distributor. @@ -128,9 +127,9 @@ def scrape_part(self, id, part): else: self.logger = multiprocessing.get_logger() handler = logging.StreamHandler(sys.stdout) - handler.setLevel(self.log_level) + handler.setLevel(1) self.logger.addHandler(handler) - self.logger.setLevel(self.log_level) + self.logger.setLevel(1) self.browser.logger = self.logger url = {} diff --git a/kicost/distributors/farnell/farnell.py b/kicost/distributors/farnell/farnell.py index b7289b2d0..78e6f05bc 100644 --- a/kicost/distributors/farnell/farnell.py +++ b/kicost/distributors/farnell/farnell.py @@ -42,9 +42,9 @@ __author__='Giacinto Luigi Cerone' class dist_farnell(distributor.distributor): - def __init__(self, name, scrape_retries, log_level, throttle_delay): + def __init__(self, name, scrape_retries, throttle_delay): super(dist_farnell, self).__init__(name, distributor_dict[name]['site']['url'], - scrape_retries, log_level, throttle_delay) + scrape_retries, throttle_delay) def dist_get_price_tiers(self, html_tree): '''@brief Get the pricing tiers from the parsed tree of the farnell product page. diff --git a/kicost/distributors/local/local.py b/kicost/distributors/local/local.py index 20489d1b2..a203c2a2e 100644 --- a/kicost/distributors/local/local.py +++ b/kicost/distributors/local/local.py @@ -43,8 +43,8 @@ class dist_local(distributor.distributor): # Static variable which contains local part html. 
html = None - def __init__(self, name, scrape_retries, log_level, throttle_delay): - super(dist_local, self).__init__(name, None, scrape_retries, log_level, throttle_delay) + def __init__(self, name, scrape_retries, throttle_delay): + super(dist_local, self).__init__(name, None, scrape_retries, throttle_delay) def create_part_html(parts, distributors, logger): '''@brief Create HTML page containing info for local (non-webscraped) parts. diff --git a/kicost/distributors/mouser/mouser.py b/kicost/distributors/mouser/mouser.py index 1d95de231..60b813e5e 100644 --- a/kicost/distributors/mouser/mouser.py +++ b/kicost/distributors/mouser/mouser.py @@ -39,9 +39,9 @@ from urllib.parse import quote_plus as urlquote class dist_mouser(distributor.distributor): - def __init__(self, name, scrape_retries, log_level, throttle_delay): + def __init__(self, name, scrape_retries, throttle_delay): super(dist_mouser, self).__init__(name, distributor_dict[name]['site']['url'], - scrape_retries, log_level, throttle_delay) + scrape_retries, throttle_delay) self.browser.add_cookie('.mouser.com', 'preferences', 'ps=www2&pl=en-US&pc_www2=USDe') def dist_get_price_tiers(self, html_tree): diff --git a/kicost/distributors/newark/newark.py b/kicost/distributors/newark/newark.py index 356053600..3aa69043c 100644 --- a/kicost/distributors/newark/newark.py +++ b/kicost/distributors/newark/newark.py @@ -39,9 +39,9 @@ from urllib.parse import quote_plus as urlquote class dist_newark(distributor.distributor): - def __init__(self, name, scrape_retries, log_level, throttle_delay): + def __init__(self, name, scrape_retries, throttle_delay): super(dist_newark, self).__init__(name, distributor_dict[name]['site']['url'], - scrape_retries, log_level, throttle_delay) + scrape_retries, throttle_delay) def dist_get_price_tiers(self, html_tree): '''@brief Get the pricing tiers from the parsed tree of the Newark product page. diff --git a/kicost/distributors/rs/rs.py b/kicost/distributors/rs/rs.py index fd80265ef..f63109ce0 100644 --- a/kicost/distributors/rs/rs.py +++ b/kicost/distributors/rs/rs.py @@ -40,9 +40,9 @@ from urllib.parse import quote_plus as urlquote class dist_rs(distributor.distributor): - def __init__(self, name, scrape_retries, log_level, throttle_delay): + def __init__(self, name, scrape_retries, throttle_delay): super(dist_rs, self).__init__(name, distributor_dict[name]['site']['url'], - scrape_retries, log_level, throttle_delay) + scrape_retries, throttle_delay) def dist_get_price_tiers(self, html_tree): '''@brief Get the pricing tiers from the parsed tree of the RS Components product page. diff --git a/kicost/distributors/tme/tme.py b/kicost/distributors/tme/tme.py index 21f8aa6d5..301434485 100644 --- a/kicost/distributors/tme/tme.py +++ b/kicost/distributors/tme/tme.py @@ -40,9 +40,9 @@ from urllib.parse import quote_plus as urlquote, urlencode class dist_tme(distributor.distributor): - def __init__(self, name, scrape_retries, log_level, throttle_delay): + def __init__(self, name, scrape_retries, throttle_delay): super(dist_tme, self).__init__(name, distributor_dict[name]['site']['url'], - scrape_retries, log_level, throttle_delay) + scrape_retries, throttle_delay) def __ajax_details(self, pn): '''@brief Load part details from TME using XMLHttpRequest. 
diff --git a/kicost/globals.py b/kicost/globals.py index d6e75fa12..54e78caf8 100644 --- a/kicost/globals.py +++ b/kicost/globals.py @@ -31,6 +31,7 @@ DEBUG_OBSESSIVE = logging.DEBUG-2 DEBUG_HTTP_HEADERS = logging.DEBUG-3 DEBUG_HTTP_RESPONSES = logging.DEBUG-4 +# Minimum possible log level is logging.DEBUG-9 ! SEPRTR = ':' # Delimiter between library:component, distributor:field, etc. diff --git a/kicost/kicost.py b/kicost/kicost.py index 5f121fb3f..8b567b158 100644 --- a/kicost/kicost.py +++ b/kicost/kicost.py @@ -233,7 +233,7 @@ def mt_init_dist(d, scrape): ctor = globals()['dist_local'] else: ctor = globals()['dist_'+d] - instance = ctor(d, scrape_retries, 5, throttling_delay) # TODO: log level + instance = ctor(d, scrape_retries, throttling_delay) except Exception as ex: logger.log(DEBUG_OVERVIEW, "Initialising %s failed with %s, exculding this distributor..." \ % (d, type(ex).__name__)) From 51ba35d8c903120ea2a4c8991199d770523984be Mon Sep 17 00:00:00 2001 From: Max Maisel Date: Thu, 7 Jun 2018 17:08:19 +0200 Subject: [PATCH 29/29] Remove trailing space from search URLs. --- kicost/distributors/digikey/digikey.py | 8 +++----- kicost/distributors/farnell/farnell.py | 8 ++++---- kicost/distributors/mouser/mouser.py | 6 +++--- kicost/distributors/newark/newark.py | 7 ++++--- kicost/distributors/rs/rs.py | 5 +++-- kicost/distributors/tme/tme.py | 6 +++--- 6 files changed, 20 insertions(+), 20 deletions(-) diff --git a/kicost/distributors/digikey/digikey.py b/kicost/distributors/digikey/digikey.py index 1059392b7..5863e8b7d 100644 --- a/kicost/distributors/digikey/digikey.py +++ b/kicost/distributors/digikey/digikey.py @@ -197,11 +197,9 @@ def dist_get_part_html_tree(self, pn, extra_search_terms='', url=None, descend=2 # Use the part number to lookup the part using the site search function, unless a starting url was given. if url is None: - url = distributor_dict['digikey']['site']['url'] + '/products/en?keywords=' + urlquote( - #'/scripts/DkSearch/dksus.dll?WT.z_header=search_go&lang=en&keywords=' + urlquote( - pn + ' ' + extra_search_terms, - safe='') - #url = distributor_dict['digikey']['site']['url'] + '/product-search/en?KeyWords=' + urlquote(pn,safe='') + '&WT.z_header=search_go' + url = distributor_dict['digikey']['site']['url'] + '/products/en?keywords=' + urlquote(pn, safe='') + if extra_search_terms: + url = url + urlquote(' ' + extra_search_terms, safe='') elif url[0] == '/': url = distributor_dict['digikey']['site']['url'] + url diff --git a/kicost/distributors/farnell/farnell.py b/kicost/distributors/farnell/farnell.py index 78e6f05bc..7509d8186 100644 --- a/kicost/distributors/farnell/farnell.py +++ b/kicost/distributors/farnell/farnell.py @@ -132,10 +132,10 @@ def dist_get_part_html_tree(self, pn, extra_search_terms='', url=None, descend=2 # Use the part number to lookup the part using the site search function, unless a starting url was given. 
if url is None: - url = 'http://it.farnell.com/Search?storeId=10165&catalogId=15001&categoryName=&selectedCategoryId=&langId=-4&categoryIdBox=&st=' + urlquote( - pn + ' ' + extra_search_terms, - safe='') - + url = 'http://it.farnell.com/Search?storeId=10165&catalogId=15001&categoryName=&selectedCategoryId=&langId=-4&categoryIdBox=&st=' \ + + urlquote(pn, safe='') + if extra_search_terms: + url = url + urlquote(' ' + extra_search_terms, safe='') elif url[0] == '/': url = 'http://www.farnell.com' + url elif url.startswith('..'): diff --git a/kicost/distributors/mouser/mouser.py b/kicost/distributors/mouser/mouser.py index 60b813e5e..e299147cb 100644 --- a/kicost/distributors/mouser/mouser.py +++ b/kicost/distributors/mouser/mouser.py @@ -138,9 +138,9 @@ def dist_get_part_html_tree(self, pn, extra_search_terms='', url=None, descend=2 # Use the part number to lookup the part using the site search function, unless a starting url was given. if url is None: - url = 'https://www.mouser.com/Search/Refine.aspx?Keyword=' + urlquote( - pn + ' ' + extra_search_terms, - safe='') + url = 'https://www.mouser.com/Search/Refine.aspx?Keyword=' + urlquote(pn, safe='') + if extra_search_terms: + url = url + urlquote(' ' + extra_search_terms, safe='') elif url[0] == '/': url = 'https://www.mouser.com' + url elif url.startswith('..'): diff --git a/kicost/distributors/newark/newark.py b/kicost/distributors/newark/newark.py index 3aa69043c..1e3a2176d 100644 --- a/kicost/distributors/newark/newark.py +++ b/kicost/distributors/newark/newark.py @@ -133,9 +133,10 @@ def dist_get_part_html_tree(self, pn, extra_search_terms='', url=None, descend=2 # Use the part number to lookup the part using the site search function, unless a starting url was given. if url is None: - url = 'http://www.newark.com/webapp/wcs/stores/servlet/Search?catalogId=15003&langId=-1&storeId=10194&gs=true&st=' + urlquote( - pn + ' ' + extra_search_terms, - safe='') + url = 'http://www.newark.com/webapp/wcs/stores/servlet/Search?catalogId=15003&langId=-1&storeId=10194&gs=true&st=' \ + + urlquote(pn, safe='') + if extra_search_terms: + url = url + urlquote(' ' + extra_search_terms, safe='') elif url[0] == '/': url = 'http://www.newark.com' + url elif url.startswith('..'): diff --git a/kicost/distributors/rs/rs.py b/kicost/distributors/rs/rs.py index f63109ce0..67dabcee2 100644 --- a/kicost/distributors/rs/rs.py +++ b/kicost/distributors/rs/rs.py @@ -115,8 +115,9 @@ def dist_get_part_html_tree(self, pn, extra_search_terms='', url=None, descend=2 # Use the part number to lookup the part using the site search function, unless a starting url was given. if url is None: - url = 'http://it.rs-online.com/web/c/?searchTerm=' + urlquote(pn + ' ' + extra_search_terms, safe='') - + url = 'http://it.rs-online.com/web/c/?searchTerm=' + urlquote(pn, safe='') + if extra_search_terms: + url = url + urlquote(' ' + extra_search_terms, safe='') elif url[0] == '/': url = 'http://it.rs-online.com' + url elif url.startswith('..'): diff --git a/kicost/distributors/tme/tme.py b/kicost/distributors/tme/tme.py index 301434485..1b1aa4e3f 100644 --- a/kicost/distributors/tme/tme.py +++ b/kicost/distributors/tme/tme.py @@ -157,9 +157,9 @@ def dist_get_part_html_tree(self, pn, extra_search_terms='', url=None, descend=2 # Use the part number to lookup the part using the site search function, unless a starting url was given. 
if url is None: - url = 'https://www.tme.eu/en/katalog/?search=' + urlquote( - pn + ' ' + extra_search_terms, - safe='') + url = 'https://www.tme.eu/en/katalog/?search=' + urlquote(pn, safe='') + if extra_search_terms: + url = url + urlquote(' ' + extra_search_terms, safe='') elif url[0] == '/': url = 'https://www.tme.eu' + url
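
With this patch, all six distributor modules it touches build their search URLs
the same way: quote the bare part number first and append the extra search
terms only when they are non-empty, so an empty extra_search_terms no longer
leaves an encoded trailing space in the query. A standalone sketch of that
shape (helper name and example values are illustrative, not KiCost API):

    from urllib.parse import quote_plus as urlquote

    def build_search_url(base, pn, extra_search_terms=''):
        url = base + urlquote(pn, safe='')
        if extra_search_terms:
            url += urlquote(' ' + extra_search_terms, safe='')
        return url

    # The old form, urlquote(pn + ' ' + extra_search_terms), gave '...?search=NE555+'
    # when extra_search_terms was empty; the new form drops the trailing '+':
    print(build_search_url('https://www.tme.eu/en/katalog/?search=', 'NE555'))
    # -> https://www.tme.eu/en/katalog/?search=NE555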