From 4e3c7ecc8e4ef40b4bc589cbab52ef2d34155b08 Mon Sep 17 00:00:00 2001 From: Max Maisel Date: Fri, 1 Jun 2018 15:18:13 +0200 Subject: [PATCH 01/29] Started implementation of fake_browser and distributor base classes. --- kicost/distributors/distributor.py | 211 +++++++++++++++++++++++++++ kicost/distributors/fake_browser.py | 215 ++++++++++++++++++++++++++++ 2 files changed, 426 insertions(+) create mode 100644 kicost/distributors/distributor.py create mode 100644 kicost/distributors/fake_browser.py diff --git a/kicost/distributors/distributor.py b/kicost/distributors/distributor.py new file mode 100644 index 000000000..5510ac4b3 --- /dev/null +++ b/kicost/distributors/distributor.py @@ -0,0 +1,211 @@ +# MIT license +# +# Copyright (C) 2018 by XESS Corporation / Max Maisel +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in +# all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +# THE SOFTWARE. + +# Author information. +__author__ = 'Max Maisel' +__webpage__ = 'https://github.com/mmmaisel/' + +# Libraries. +import sys +from bs4 import BeautifulSoup # XML file interpreter. +import multiprocessing # To deal with the parallel scrape. +import logging +import time +from random import choice +from ..eda_tools.eda_tools import order_refs # To better print the warnings about the parts. + +from . import fake_browser + +import http.client # For web scraping exceptions. +try: + # This is for Python 3. + from urllib.parse import urlencode, quote_plus as urlquote, urlsplit, urlunsplit + from urllib.request import urlopen, Request + import urllib.error + WEB_SCRAPE_EXCEPTIONS = (urllib.error.URLError, http.client.HTTPException) +except ImportError: + # This is for Python 2. + from urlparse import urlsplit, urlunsplit + from urllib import urlencode, quote_plus as urlquote + from urllib2 import urlopen, Request + import urllib2 + WEB_SCRAPE_EXCEPTIONS = (urllib2.URLError, http.client.HTTPException) + +from ..globals import logger, DEBUG_OVERVIEW, DEBUG_DETAILED, DEBUG_OBSESSIVE # Debug configurations. +from ..globals import SEPRTR +from ..globals import PartHtmlError +from . 
import distributor_dict + +import os, re + +class distributor: + def __init__(self, scrape_retries, log_level, throttle_delay): + self.name = None + self.page_accessed = False + self.scrape_retries = scrape_retries + self.logger = logger + self.log_level = log_level + self.throttle_delay = throttle_delay + self.throttle_timeout = time.time() + self.domain = None + self.browser = fake_browser.fake_browser(self.logger, self.scrape_retries) + + # Abstract methods, implemented in distributor specific modules + def dist_get_part_html_tree(self, pn, extra_search_terms, url, descend): + raise NotImplementedError() + + def dist_get_part_num(self, html_tree): + raise NotImplementedError() + + def dist_get_qty_avail(self, html_tree): + raise NotImplementedError() + + def dist_get_price_tiers(self, html_tree): + raise NotImplementedError() + + def dist_get_extra_info(self, html_tree): + raise NotImplementedError() + + def dist_define_locale_currency(self, locale, currency): + raise NotImplementedError() + + def define_locale_currency(self, locale_currency='USD'): + '''@brief Configure the distributor for some locale/country and + currency second ISO3166 and ISO4217 + + @param `str` Alpha 2 country or alpha 3 currency or even one slash other.''' + try: + if distributor_dict[self.name]['scrape'] == 'web': + # Not make sense to configurate a local distributor (yet). + locale_currency = re.findall('\w{2,}', locale_currency) + locale = None + currency = None + for alpha in locale_currency: + if len(alpha)==2: + locale = alpha + elif len(alpha)==3: + currency = alpha + self.dist_define_locale_currency(locale, currency) + except NotImplementedError: + logger.warning('No currency/country configuration for {}.'.format(self.name)) + pass + + def scrape_part(self, id, part, local_part_html): + '''@brief Scrape the data for a part from each distributor website or local HTML. + + Use distributors submodules to scrape each distributor part page and get + informations such as price, quantity avaliable and others; + + @param `int` Count of the main loop. + @param `str` String with the part number / distributor stock. + @param `str` Local part HTML + @return id, distributor_name, url, `str` distributor stock part number, + `dict` price tiers, `int` qty avail, `dict` extrainfo dist + ''' + + if multiprocessing.current_process().name == "MainProcess": + self.logger = logging.getLogger('kicost') + else: + self.logger = multiprocessing.get_logger() + handler = logging.StreamHandler(sys.stdout) + handler.setLevel(self.log_level) + self.logger.addHandler(handler) + self.logger.setLevel(self.log_level) + self.browser.logger = self.logger + + url = {} + part_num = {} + qty_avail = {} + price_tiers = {} + info_dist = {} + + if distributor_dict[self.name]['scrape']=='web': + if self.page_accessed == True: + # Check the throttling timeout for the chosen distributor to see if + # another access to its website is allowed. + if self.throttle_timeout > time.time(): + time.sleep(self.throttle_timeout - time.time()) + + # Update the timeout for this distributor website and release the sync. lock. + self.throttle_timeout = time.time() + self.throttle_delay + # Founded manufacturer / distributor code valid (not empty). + else: + self.logger.log(DEBUG_OBSESSIVE,'No delay for %s, type=%s' \ + % (self.name, distributor_dict[self.name]['scrape'])) + + # Get the HTML tree for the part. + html_tree, url = self.get_part_html_tree(part, local_part_html=local_part_html) + + # Call the functions that extract the data from the HTML tree. 
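+        # These dist_get_*() calls are the abstract hooks declared above; each
+        # distributor subclass (e.g. dist_mouser in a later patch of this series)
+        # supplies its own page-parsing implementation.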
+ part_num = self.dist_get_part_num(html_tree) + qty_avail = self.dist_get_qty_avail(html_tree) + price_tiers = self.dist_get_price_tiers(html_tree) + + try: + # Get extra characeristics of the part in the web page. + # This will be use to comment in the 'cat#' column of the + # spreadsheet and some validations (in the future implementaions) + info_dist = self.dist_get_extra_info(html_tree) + except: + info_dist = {} + pass + + # Return the part data. + return id, self.name, url, part_num, price_tiers, qty_avail, info_dist + + def get_part_html_tree(self, part, local_part_html): + '''@brief Get the HTML tree for a part. + + Get the HTML tree for a part from the given distributor website or local HTML. + @param `str` part Part manufactor code or distributor stock code. + @param `str` local_part_html + @return `str` with the HTML webpage.''' + + self.logger.log(DEBUG_OBSESSIVE, 'Looking in %s by %s:', self.name, order_refs(part.refs, True)) + + for extra_search_terms in set([part.fields.get('manf', ''), '']): + try: + # Search for part information using one of the following: + # 1) the distributor's catalog number. + # 2) the manufacturer's part number. + for key in (self.name+'#', self.name+SEPRTR+'cat#', 'manf#'): + if key in part.fields: + if part.fields[key]: + self.page_accessed = True + return self.dist_get_part_html_tree \ + (part.fields[key], extra_search_terms, local_part_html=local_part_html) + # No distributor or manufacturer number, so give up. + else: + self.page_accessed = False + self.logger.warning("No '%s#' or 'manf#' field: cannot lookup part %s at %s.", \ + self.name, part.refs, self.name) + return BeautifulSoup('', 'lxml'), '' + #raise PartHtmlError + except PartHtmlError: + pass + except AttributeError: + break + self.logger.warning("Part %s not found at %s.", order_refs(part.refs, False), self.name) + # If no HTML page was found, then return a tree for an empty page. + return BeautifulSoup('', 'lxml'), '' + + diff --git a/kicost/distributors/fake_browser.py b/kicost/distributors/fake_browser.py new file mode 100644 index 000000000..7477cdec8 --- /dev/null +++ b/kicost/distributors/fake_browser.py @@ -0,0 +1,215 @@ +# -*- coding: utf-8 -*- +# MIT license +# +# Copyright (C) 2018 by XESS Corporation / Hildo Guillardi Junior / Max Maisel +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in +# all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +# THE SOFTWARE. + +__author__ = 'XESS Corporation' +__email__ = 'info@xess.com' + +from random import choice + +import http.client # For web scraping exceptions. 
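+# Cookie support: each fake_browser instance keeps its own cookie jar, so every
+# distributor gets a stateful scraping session of its own.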
+import http.cookiejar + +from ..globals import DEBUG_OVERVIEW, DEBUG_DETAILED, DEBUG_OBSESSIVE + +try: + # This is for Python 3 + from urllib.parse import urlencode, quote_plus as urlquote, urlsplit, urlunsplit + from urllib.request import urlopen, Request + import urllib.error + WEB_SCRAPE_EXCEPTIONS = (urllib.error.URLError, http.client.HTTPException) +except ImportError: + # This is for Python 2 + from urlparse import urlsplit, urlunsplit + from urllib import urlencode, quote_plus as urlquote + from urllib2 import urlopen, Request + import urllib2 + WEB_SCRAPE_EXCEPTIONS = (urllib2.URLError, http.client.HTTPException) + + +def get_user_agent(): + ''' The default user_agent_list comprises chrome, IE, firefox, Mozilla, opera, netscape. + You can find more user agent strings at https://techblog.willshouse.com/2012/01/03/most-common-user-agents/. + Used for the function `fake_browser(url, retries)` + ''' + user_agent_list = [ + "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.94 Safari/537.36", + "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.94 Safari/537.36", + "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36", + "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:57.0) Gecko/20100101 Firefox/57.0", + "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.94 Safari/537.36", + "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_1) AppleWebKit/604.3.5 (KHTML, like Gecko) Version/11.0.1 Safari/604.3.5", + "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.94 Safari/537.36", + "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36", + "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36", + "Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:57.0) Gecko/20100101 Firefox/57.0", + "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:56.0) Gecko/20100101 Firefox/56.0", + "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) AppleWebKit/604.3.5 (KHTML, like Gecko) Version/11.0.1 Safari/604.3.5", + "Mozilla/5.0 (X11; Linux x86_64; rv:57.0) Gecko/20100101 Firefox/57.0", + "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.89 Safari/537.36", + "Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; rv:11.0) like Gecko", + "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.94 Safari/537.36", + "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.94 Safari/537.36", + "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/52.0.2743.116 Safari/537.36 Edge/15.15063", + "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.94 Safari/537.36", + "Mozilla/5.0 (Windows NT 6.3; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.94 Safari/537.36", + "Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:56.0) Gecko/20100101 Firefox/56.0", + "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.13; rv:57.0) Gecko/20100101 Firefox/57.0", + "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:56.0) Gecko/20100101 Firefox/56.0", + "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36 Edge/16.16299", + 
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10.12; rv:57.0) Gecko/20100101 Firefox/57.0", + "Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:52.0) Gecko/20100101 Firefox/52.0", + "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.89 Safari/537.36", + "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36", + "Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; rv:11.0) like Gecko", + "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.89 Safari/537.36", + "Mozilla/5.0 (X11; Linux x86_64; rv:52.0) Gecko/20100101 Firefox/52.0", + "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.89 Safari/537.36 OPR/49.0.2725.47", + "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:57.0) Gecko/20100101 Firefox/57.0", + "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36", + "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6) AppleWebKit/604.3.5 (KHTML, like Gecko) Version/11.0.1 Safari/604.3.5", + "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.89 Safari/537.36", + "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.12; rv:56.0) Gecko/20100101 Firefox/56.0", + "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_3) AppleWebKit/600.5.17 (KHTML, like Gecko) Version/8.0.5 Safari/600.5.17", + "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13) AppleWebKit/604.1.38 (KHTML, like Gecko) Version/11.0 Safari/604.1.38", + "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.75 Safari/537.36", + "Mozilla/5.0 (Windows NT 6.3; Win64; x64; rv:57.0) Gecko/20100101 Firefox/57.0", + "Mozilla/5.0 (Windows NT 6.3; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36", + "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.75 Safari/537.36", + "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) AppleWebKit/604.1.38 (KHTML, like Gecko) Version/11.0 Safari/604.1.38", + "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:52.0) Gecko/20100101 Firefox/52.0", + "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36", + "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.11; rv:57.0) Gecko/20100101 Firefox/57.0", + "Mozilla/5.0 (Windows NT 10.0; WOW64; rv:56.0) Gecko/20100101 Firefox/56.0", + "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.94 Safari/537.36", + "Mozilla/5.0 (Windows NT 6.1; rv:57.0) Gecko/20100101 Firefox/57.0", + "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.94 Safari/537.36", + "Mozilla/5.0 (Windows NT 6.3; Win64; x64; rv:56.0) Gecko/20100101 Firefox/56.0", + "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.89 Safari/537.36", + "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36", + "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.89 Safari/537.36 OPR/49.0.2725.39", + "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:58.0) Gecko/20100101 Firefox/58.0", + "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_5) AppleWebKit/603.3.8 (KHTML, like Gecko) Version/10.1.2 Safari/603.3.8", + "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) 
AppleWebKit/603.3.8 (KHTML, like Gecko) Version/10.1.2 Safari/603.3.8", + "Mozilla/5.0 (Windows NT 6.1; rv:52.0) Gecko/20100101 Firefox/52.0", + "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.79 Safari/537.36 Edge/14.14393", + "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36 OPR/48.0.2685.52", + "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.94 Safari/537.36", + "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Ubuntu Chromium/62.0.3202.75 Chrome/62.0.3202.75 Safari/537.36", + "Mozilla/5.0 (Windows NT 10.0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.94 Safari/537.36", + "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.106 Safari/537.36", + "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.94 Safari/537.36", + "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.75 Safari/537.36", + "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.62 Safari/537.36", + "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Ubuntu Chromium/62.0.3202.94 Chrome/62.0.3202.94 Safari/537.36", + "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0; Trident/5.0)", + "Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; Touch; rv:11.0) like Gecko", + "Mozilla/5.0 (X11; Linux x86_64; rv:56.0) Gecko/20100101 Firefox/56.0", + "Mozilla/5.0 (X11; Linux x86_64; rv:58.0) Gecko/20100101 Firefox/58.0", + "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36", + "Mozilla/5.0 (iPad; CPU OS 11_1_2 like Mac OS X) AppleWebKit/604.3.5 (KHTML, like Gecko) Version/11.0 Mobile/15B202 Safari/604.1", + "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.75 Safari/537.36", + "Mozilla/5.0 (Windows NT 6.1; Trident/7.0; rv:11.0) like Gecko", + "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.0; Trident/5.0; Trident/5.0)", + "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.113 Safari/537.36", + "Mozilla/5.0 (Windows NT 6.1; rv:56.0) Gecko/20100101 Firefox/56.0", + "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36", + "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.62 Safari/537.36", + "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_5) AppleWebKit/603.2.4 (KHTML, like Gecko) Version/10.1.1 Safari/603.2.4", + "Mozilla/5.0 (Windows NT 10.0; WOW64; rv:52.0) Gecko/20100101 Firefox/52.0", + "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.13; rv:56.0) Gecko/20100101 Firefox/56.0", + "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Ubuntu Chromium/62.0.3202.89 Chrome/62.0.3202.89 Safari/537.36", + "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36", + "Mozilla/5.0 (Windows NT 6.2; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.94 Safari/537.36", + "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.89 Safari/537.36", + "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_5) AppleWebKit/537.36 (KHTML, like Gecko) 
Chrome/61.0.3163.100 Safari/537.36", + "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.94 Safari/537.36", + "Mozilla/5.0 (Windows NT 5.1; rv:52.0) Gecko/20100101 Firefox/52.0", + "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.115 Safari/537.36", + ] + return choice(user_agent_list) + +# Open the URL, read the HTML from it, and parse it into a tree structure. +class fake_browser: + def __init__(self, logger, scrape_retries): + '''@brief fake_browser + @param logger + @param scrape_retries `int` Quantity of retries in case of fail. + ''' + self.cookiejar = http.cookiejar.CookieJar() + self.userAgent = get_user_agent() + self.opener = urllib.request.build_opener(urllib.request.HTTPCookieProcessor(self.cookiejar)) + self.scrape_retries = scrape_retries + self.logger = logger + + def show_cookies(self, name): + for x in self.cookiejar: + # TODO: use logger + self.logger.log(DEBUG_OBSESSIVE,"%s Cookie %s" % (name, x.name)) + print("%s Cookie %s" % (name, x.name)) + + def add_cookie(self, domain, name, value): + self.cookiejar.set_cookie(http.cookiejar.Cookie( + version=0, + name=name, + value=value, + port=None, + port_specified=False, + domain=domain, + domain_specified=True, + domain_initial_dot=False, + path="/", + path_specified=False, + secure=False, + expires=None, + discard=False, + comment=None, + comment_url=None, + rest=None)) + + def scrape_URL(self, url, add_header=None): + for _ in range(self.scrape_retries): + try: + req = Request(url) + if add_header: + req.add_header(add_header) + req.add_header('User-agent', self.userAgent) + req.add_header('Accept', 'text/html') + req.add_header('Accept-Language', 'en-US') + req.add_header('Accept-Encoding', 'identity') + response = self.opener.open(req, timeout=10) + html = response.read() + break + #except WEB_SCRAPE_EXCEPTIONS: + except Exception as ex: + # TODO: remove print + print('Exception of type "%s" while web-scraping %s' \ + % (type(ex).__name__, format(url))) + self.logger.log(DEBUG_DETAILED,'Exception of type "%s" while web-scraping %s' \ + % (type(ex).__name__, format(url))) + pass + else: + # TODO: remove print + print('No page') + raise ValueError('No page') + return html + From 54e1a8db7fc9a4facc7ba817443a585a920e03d0 Mon Sep 17 00:00:00 2001 From: Max Maisel Date: Fri, 1 Jun 2018 15:18:56 +0200 Subject: [PATCH 02/29] Started refactoring as discussed in issue #242. Goals are: - Add stateful user-agent and cookie handling in fake_browser - Scrape parts in distrtibutor then part order - Use a class inheritance approach to distributor modules, this allows adding state information and reduces the amount of variabled passed around. - One scraping thread per distributor, simplify locking Implemented in this commit: - Used class approach to distributors and fake_browser - Parts are scraped in distributor -> part order - One (IO limited) python thread per distributor - Simplified locking --- kicost/distributors/__init__.py | 143 --------------------------- kicost/kicost.py | 168 ++++++++++++++++++-------------- 2 files changed, 93 insertions(+), 218 deletions(-) diff --git a/kicost/distributors/__init__.py b/kicost/distributors/__init__.py index 85e47b3bd..666d7388e 100644 --- a/kicost/distributors/__init__.py +++ b/kicost/distributors/__init__.py @@ -24,149 +24,6 @@ __author__ = 'XESS Corporation' __email__ = 'info@xess.com' -from random import choice - -import http.client # For web scraping exceptions. 
-try: - # This is for Python 3 - from urllib.parse import urlencode, quote_plus as urlquote, urlsplit, urlunsplit - from urllib.request import urlopen, Request - import urllib.error - WEB_SCRAPE_EXCEPTIONS = (urllib.error.URLError, http.client.HTTPException) -except ImportError: - # This is for Python 2 - from urlparse import urlsplit, urlunsplit - from urllib import urlencode, quote_plus as urlquote - from urllib2 import urlopen, Request - import urllib2 - WEB_SCRAPE_EXCEPTIONS = (urllib2.URLError, http.client.HTTPException) - - -def get_user_agent(): - ''' The default user_agent_list comprises chrome, IE, firefox, Mozilla, opera, netscape. - You can find more user agent strings at https://techblog.willshouse.com/2012/01/03/most-common-user-agents/. - Used for the function `fake_browser(url, retries)` - ''' - user_agent_list = [ - "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.94 Safari/537.36", - "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.94 Safari/537.36", - "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36", - "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:57.0) Gecko/20100101 Firefox/57.0", - "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.94 Safari/537.36", - "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_1) AppleWebKit/604.3.5 (KHTML, like Gecko) Version/11.0.1 Safari/604.3.5", - "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.94 Safari/537.36", - "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36", - "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36", - "Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:57.0) Gecko/20100101 Firefox/57.0", - "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:56.0) Gecko/20100101 Firefox/56.0", - "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) AppleWebKit/604.3.5 (KHTML, like Gecko) Version/11.0.1 Safari/604.3.5", - "Mozilla/5.0 (X11; Linux x86_64; rv:57.0) Gecko/20100101 Firefox/57.0", - "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.89 Safari/537.36", - "Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; rv:11.0) like Gecko", - "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.94 Safari/537.36", - "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.94 Safari/537.36", - "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/52.0.2743.116 Safari/537.36 Edge/15.15063", - "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.94 Safari/537.36", - "Mozilla/5.0 (Windows NT 6.3; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.94 Safari/537.36", - "Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:56.0) Gecko/20100101 Firefox/56.0", - "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.13; rv:57.0) Gecko/20100101 Firefox/57.0", - "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:56.0) Gecko/20100101 Firefox/56.0", - "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36 Edge/16.16299", - "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.12; rv:57.0) Gecko/20100101 Firefox/57.0", - "Mozilla/5.0 
(Windows NT 6.1; Win64; x64; rv:52.0) Gecko/20100101 Firefox/52.0", - "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.89 Safari/537.36", - "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36", - "Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; rv:11.0) like Gecko", - "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.89 Safari/537.36", - "Mozilla/5.0 (X11; Linux x86_64; rv:52.0) Gecko/20100101 Firefox/52.0", - "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.89 Safari/537.36 OPR/49.0.2725.47", - "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:57.0) Gecko/20100101 Firefox/57.0", - "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36", - "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6) AppleWebKit/604.3.5 (KHTML, like Gecko) Version/11.0.1 Safari/604.3.5", - "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.89 Safari/537.36", - "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.12; rv:56.0) Gecko/20100101 Firefox/56.0", - "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_3) AppleWebKit/600.5.17 (KHTML, like Gecko) Version/8.0.5 Safari/600.5.17", - "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13) AppleWebKit/604.1.38 (KHTML, like Gecko) Version/11.0 Safari/604.1.38", - "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.75 Safari/537.36", - "Mozilla/5.0 (Windows NT 6.3; Win64; x64; rv:57.0) Gecko/20100101 Firefox/57.0", - "Mozilla/5.0 (Windows NT 6.3; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36", - "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.75 Safari/537.36", - "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) AppleWebKit/604.1.38 (KHTML, like Gecko) Version/11.0 Safari/604.1.38", - "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:52.0) Gecko/20100101 Firefox/52.0", - "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36", - "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.11; rv:57.0) Gecko/20100101 Firefox/57.0", - "Mozilla/5.0 (Windows NT 10.0; WOW64; rv:56.0) Gecko/20100101 Firefox/56.0", - "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.94 Safari/537.36", - "Mozilla/5.0 (Windows NT 6.1; rv:57.0) Gecko/20100101 Firefox/57.0", - "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.94 Safari/537.36", - "Mozilla/5.0 (Windows NT 6.3; Win64; x64; rv:56.0) Gecko/20100101 Firefox/56.0", - "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.89 Safari/537.36", - "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36", - "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.89 Safari/537.36 OPR/49.0.2725.39", - "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:58.0) Gecko/20100101 Firefox/58.0", - "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_5) AppleWebKit/603.3.8 (KHTML, like Gecko) Version/10.1.2 Safari/603.3.8", - "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) AppleWebKit/603.3.8 (KHTML, like Gecko) Version/10.1.2 Safari/603.3.8", - "Mozilla/5.0 (Windows NT 6.1; rv:52.0) 
Gecko/20100101 Firefox/52.0", - "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.79 Safari/537.36 Edge/14.14393", - "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36 OPR/48.0.2685.52", - "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.94 Safari/537.36", - "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Ubuntu Chromium/62.0.3202.75 Chrome/62.0.3202.75 Safari/537.36", - "Mozilla/5.0 (Windows NT 10.0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.94 Safari/537.36", - "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.106 Safari/537.36", - "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.94 Safari/537.36", - "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.75 Safari/537.36", - "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.62 Safari/537.36", - "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Ubuntu Chromium/62.0.3202.94 Chrome/62.0.3202.94 Safari/537.36", - "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0; Trident/5.0)", - "Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; Touch; rv:11.0) like Gecko", - "Mozilla/5.0 (X11; Linux x86_64; rv:56.0) Gecko/20100101 Firefox/56.0", - "Mozilla/5.0 (X11; Linux x86_64; rv:58.0) Gecko/20100101 Firefox/58.0", - "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36", - "Mozilla/5.0 (iPad; CPU OS 11_1_2 like Mac OS X) AppleWebKit/604.3.5 (KHTML, like Gecko) Version/11.0 Mobile/15B202 Safari/604.1", - "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.75 Safari/537.36", - "Mozilla/5.0 (Windows NT 6.1; Trident/7.0; rv:11.0) like Gecko", - "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.0; Trident/5.0; Trident/5.0)", - "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.113 Safari/537.36", - "Mozilla/5.0 (Windows NT 6.1; rv:56.0) Gecko/20100101 Firefox/56.0", - "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36", - "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.62 Safari/537.36", - "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_5) AppleWebKit/603.2.4 (KHTML, like Gecko) Version/10.1.1 Safari/603.2.4", - "Mozilla/5.0 (Windows NT 10.0; WOW64; rv:52.0) Gecko/20100101 Firefox/52.0", - "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.13; rv:56.0) Gecko/20100101 Firefox/56.0", - "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Ubuntu Chromium/62.0.3202.89 Chrome/62.0.3202.89 Safari/537.36", - "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36", - "Mozilla/5.0 (Windows NT 6.2; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.94 Safari/537.36", - "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.89 Safari/537.36", - "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36", - "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_5) AppleWebKit/537.36 (KHTML, like 
Gecko) Chrome/62.0.3202.94 Safari/537.36", - "Mozilla/5.0 (Windows NT 5.1; rv:52.0) Gecko/20100101 Firefox/52.0", - "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.115 Safari/537.36", - ] - return choice(user_agent_list) - - -# Open the URL, read the HTML from it, and parse it into a tree structure. - -def fake_browser(url, scrape_retries=4, add_header=None): - for _ in range(scrape_retries): - try: - req = Request(url) - if add_header: - req.add_header(add_header) - req.add_header('Accept-Language', 'en-US') - req.add_header('Accept', 'text/html') - req.add_header('Cookie', 'foo=bar') - req.add_header('User-agent', get_user_agent()) - response = urlopen(req) - html = response.read() - break - except WEB_SCRAPE_EXCEPTIONS: - logger.log(DEBUG_DETAILED,'Exception while web-scraping {}'.format(url)) - pass - else: - raise ValueError('No page') - return html - # Extra informations to by got by each part in the distributors. EXTRA_INFO_DIST = ['value', 'tolerance', 'footprint', 'power', 'current', 'voltage', 'frequency', 'temp_coeff', 'manf', 'size', 'op temp', 'orientation', 'color', diff --git a/kicost/kicost.py b/kicost/kicost.py index 15cab4b33..3a09507e6 100644 --- a/kicost/kicost.py +++ b/kicost/kicost.py @@ -31,7 +31,7 @@ import pprint import tqdm from time import time -from multiprocessing import Pool, Manager, Lock +from multiprocessing.pool import ThreadPool # Stops UnicodeDecodeError exceptions. try: @@ -51,8 +51,27 @@ # Import information about various distributors. from .distributors import distributor_dict -from .distributors.web_routines import scrape_part, config_distributor -from .distributors.local.local import create_part_html as create_local_part_html +from .distributors import distributor, fake_browser + +# The distributor module directories will be found in this directory. +directory = os.path.dirname(__file__) + "/distributors" + +# Search for the distributor modules and import them. +for module in os.listdir(directory): + + # Avoid importing non-directories. + abs_module = os.path.join(directory, module) + if not os.path.isdir(abs_module): + continue + + # Avoid directories like __pycache__. + if module.startswith('__'): + continue + + # Import the module. + tmp = __import__("distributors."+module, globals(), locals(), [], level=1) + tmp_mod = getattr(tmp, module); + globals()["dist_"+module] = getattr(tmp_mod, "dist_"+module) # Import information for various EDA tools. from .eda_tools import eda_modules @@ -63,7 +82,7 @@ def kicost(in_file, eda_tool_name, out_filename, user_fields, ignore_fields, group_fields, variant, dist_list=list(distributor_dict.keys()), - num_processes=4, scrape_retries=5, throttling_delay=0.0, + num_processes=4, scrape_retries=5, throttling_delay=5.0, collapse_refs=True, local_currency='USD'): ''' @brief Run KiCost. @@ -186,7 +205,8 @@ def kicost(in_file, eda_tool_name, out_filename, distributor_dict.pop(d, None) # Create an HTML page containing all the local part information. - local_part_html = create_local_part_html(parts, distributor_dict) + local_distributor = dist_local(scrape_retries, 5, throttling_delay) # TODO: log level + local_part_html = local_distributor.create_part_html(parts, distributor_dict) if logger.isEnabledFor(DEBUG_DETAILED): pprint.pprint(distributor_dict) @@ -194,27 +214,32 @@ def kicost(in_file, eda_tool_name, out_filename, # Get the distributor product page for each part and scrape the part data. 
if dist_list: + # Instanciate distributors + for d in list(distributor_dict.keys()): + try: + ctor = globals()["dist_"+d] + # TODO: use logger, not print + # TODO: logger does not print anything + logger.log(DEBUG_OVERVIEW, "Initialising %s" % d) + print("Initialising %s" % d) + # TODO: farnell does not respond + distributor_dict[d]['instance'] = ctor(scrape_retries, 5, throttling_delay) # TODO: log level + except: + logger.log(DEBUG_OVERVIEW, "Initialising %s failed, exculding this distributor..." % d) + distributor_dict.pop(d, None) + pass + + # TODO: multithreaded init, use another pool + if local_currency: logger.log(DEBUG_OVERVIEW, '# Configuring the distributors locale and currency...') - if num_processes <= 1: - for d in distributor_dict: - config_distributor(distributor_dict[d]['module'], local_currency) - else: - logger.log(DEBUG_OBSESSIVE, 'Using {} simultaneous access...'.format(min(len(distributor_dict), num_processes))) - pool = Pool(num_processes) - for d in distributor_dict: - args = [distributor_dict[d]['module'], local_currency] - pool.apply_async(config_distributor, args) - pool.close() - pool.join() + for d in distributor_dict: + distributor_dict[d]['instance'].define_locale_currency(local_currency) logger.log(DEBUG_OVERVIEW, '# Scraping part data for each component group...') - # Set the throttling delay for each distributor. - for d in distributor_dict: - distributor_dict[d]['throttling_delay'] = throttling_delay global scraping_progress - scraping_progress = tqdm.tqdm(desc='Progress', total=len(parts), unit='part', miniters=1) + scraping_progress = tqdm.tqdm(desc='Progress', total=len(parts)*len(distributor_dict), unit='part', miniters=1) # Change the logging print channel to `tqdm` to keep the process bar to the end of terminal. class TqdmLoggingHandler(logging.Handler): @@ -232,77 +257,70 @@ def emit(self, record): self.handleError(record) logger.addHandler(TqdmLoggingHandler()) + # Init part info dictionaries + for part in parts: + pprint.pprint(vars(part)) + part.part_num = {} + part.url = {} + part.price_tiers = {} + part.qty_avail = {} + part.info_dist = {} + #partsByDist = partListByDistributors(parts) + if num_processes <= 1: # Scrape data, one part at a time using single processing. - - class DummyLock: - """Dummy synchronization lock used when single processing.""" - def __init__(self): - pass - def acquire(*args, **kwargs): - return True # Lock can ALWAYS be acquired when just one process is running. - def release(*args, **kwargs): - pass - - # Create sync lock and timeouts to control the rate at which distributor - # websites are scraped. - throttle_lock = DummyLock() - throttle_timeouts = dict() - throttle_timeouts = {d:time() for d in distributor_dict} - - for i in range(len(parts)): - args = (i, parts[i], distributor_dict, local_part_html, scrape_retries, - logger.getEffectiveLevel(), throttle_lock, throttle_timeouts) - id, url, part_num, price_tiers, qty_avail, info_dist = scrape_part(args) - parts[id].part_num = part_num - parts[id].url = url - parts[id].price_tiers = price_tiers - parts[id].qty_avail = qty_avail - parts[id].info_dist = info_dist # Extra distributor web page. 
- scraping_progress.update(1) + for d in distributor_dict: + print("Dist loop d=%s" % d) + for i in range(len(parts)): + print("Part loop i=%d" % i) + id, dist, url, part_num, price_tiers, qty_avail, info_dist = \ + scrape_result = distributor_dict[d]['instance'].scrape_part \ + (i, parts[i], local_part_html) + + parts[id].part_num[dist] = part_num + parts[id].url[dist] = url + parts[id].price_tiers[dist] = price_tiers + parts[id].qty_avail[dist] = qty_avail + parts[id].info_dist[dist] = info_dist # Extra distributor web page. + scraping_progress.update(1) else: # Scrape data, multiple parts at a time using multiprocessing. - # Create sync lock and timeouts to control the rate at which distributor - # websites are scraped. - throttle_manager = Manager() # Manages shared lock and `dict`. - throttle_lock = throttle_manager.Lock() - throttle_timeouts = throttle_manager.dict() - for d in distributor_dict: - throttle_timeouts[d] = time() - - # Create pool of processes to scrape data for multiple parts simultaneously. - pool = Pool(num_processes) + # Create thread pool to scrape data for multiple distributors simultaneously. + # PYthon threads are time-sliced but they work in our I/O limited scenario + # and avoid all kinds of pickle issues. + pool = ThreadPool(num_processes) # Package part data for passing to each process. - arg_sets = [(i, parts[i], distributor_dict, local_part_html, scrape_retries, - logger.getEffectiveLevel(), throttle_lock, throttle_timeouts) for i in range(len(parts))] - - # Define a callback routine for updating the scraping progress bar. - def update(x): - scraping_progress.update(1) - return x + arg_sets = [(distributor_dict[d]['instance'], parts, \ + local_part_html, scraping_progress) for d in distributor_dict] + + def mt_scrape_part(inst, parts, local_part_html, scraping_progress): + retval = list() + for i in range(len(parts)): + retval.append(inst.scrape_part(i, parts[i], local_part_html)) + scraping_progress.update(1) + return retval # Start the web scraping processes, one for each part. logger.log(DEBUG_OBSESSIVE, 'Starting {} parallels process to scrap parts...'.format(num_processes)) - results = [pool.apply_async(scrape_part, [args], callback=update) for args in arg_sets] + results = [pool.apply_async(mt_scrape_part, args) for args in arg_sets] # Wait for all the processes to have results, then kill-off all the scraping processes. - for r in results: - while(not r.ready()): - pass - logger.log(DEBUG_OVERVIEW, 'All parallels process finished with success.') pool.close() pool.join() + logger.log(DEBUG_OVERVIEW, 'All parallels process finished with success.') # Get the data from each process result structure. - for result in results: - id, url, part_num, price_tiers, qty_avail, info_dist = result.get() - parts[id].part_num = part_num - parts[id].url = url - parts[id].price_tiers = price_tiers - parts[id].qty_avail = qty_avail - parts[id].info_dist = info_dist # Extra distributor web page. + for res_proc in results: + res_dist = res_proc.get() + for res_part in res_dist: + id, dist, url, part_num, price_tiers, qty_avail, info_dist = res_part + parts[id].part_num[dist] = part_num + parts[id].url[dist] = url + parts[id].price_tiers[dist] = price_tiers + parts[id].qty_avail[dist] = qty_avail + parts[id].info_dist[dist] = info_dist # Extra distributor web page. # Done with the scraping progress bar so delete it or else we get an # error when the program terminates. 
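For orientation before the individual distributor ports that follow: the sketch below outlines what a distributor module now has to provide under the patch-01 base class and the patch-02 loader (which imports dist_<name> from each kicost/distributors/<name>/ package). The class name dist_example and its URL are placeholders, not part of these patches; the real implementations start with dist_mouser in the next patch.

    from bs4 import BeautifulSoup
    from . import distributor, distributor_dict

    class dist_example(distributor.distributor):
        def __init__(self, scrape_retries, log_level, throttle_delay):
            super(dist_example, self).__init__(scrape_retries, log_level, throttle_delay)
            self.name = 'example'                     # Key into distributor_dict.
            self.domain = 'https://www.example.com/'  # Placeholder distributor site.

        # The base class drives these hooks from scrape_part()/get_part_html_tree();
        # each one extracts a piece of part data from the product page tree.
        def dist_get_part_html_tree(self, pn, extra_search_terms='', url=None,
                                    descend=2, local_part_html=None):
            # Fetch the product page with the shared fake_browser and return its
            # parse tree plus the URL that was used (placeholder URL here).
            html = self.browser.scrape_URL(self.domain)
            return BeautifulSoup(html, 'lxml'), self.domain

        def dist_get_part_num(self, html_tree):
            return ''    # Distributor stock number found on the page.

        def dist_get_qty_avail(self, html_tree):
            return None  # Available quantity, or None for non-stocked parts.

        def dist_get_price_tiers(self, html_tree):
            return {}    # {break quantity: unit price}.

        def dist_get_extra_info(self, html_tree):
            return {}    # Optional extra fields for the spreadsheet.

        def dist_define_locale_currency(self, locale, currency):
            pass         # Optional ISO 3166 / ISO 4217 configuration.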
From f58cca99aad241b4c402276e85d6e8e8c6f9c3fa Mon Sep 17 00:00:00 2001 From: Max Maisel Date: Fri, 1 Jun 2018 15:21:20 +0200 Subject: [PATCH 03/29] Implemented dist_mouser class. --- kicost/distributors/mouser/mouser.py | 344 ++++++++++++++------------- 1 file changed, 176 insertions(+), 168 deletions(-) diff --git a/kicost/distributors/mouser/mouser.py b/kicost/distributors/mouser/mouser.py index aade51493..ecd454c97 100644 --- a/kicost/distributors/mouser/mouser.py +++ b/kicost/distributors/mouser/mouser.py @@ -1,6 +1,6 @@ # MIT license # -# Copyright (C) 2018 by XESS Corporation / Hildo Guillardi Junior +# Copyright (C) 2018 by XESS Corporation / Hildo Guillardi Junior / Max Maisel # # Permission is hereby granted, free of charge, to any person obtaining a copy # of this software and associated documentation files (the "Software"), to deal @@ -38,176 +38,184 @@ import difflib from bs4 import BeautifulSoup import http.client # For web scraping exceptions. -from .. import urlencode, urlquote, urlsplit, urlunsplit +#from .. import urlencode, urlquote, urlsplit, urlunsplit from .. import fake_browser from ...globals import PartHtmlError -from ...globals import logger, DEBUG_OVERVIEW, DEBUG_DETAILED, DEBUG_OBSESSIVE - - -def get_price_tiers(html_tree): - '''@brief Get the pricing tiers from the parsed tree of the Mouser product page. - @param html_tree `str()` html of the distributor part page. - @return `dict()` price breaks, the keys are the quantities breaks. - ''' - price_tiers = {} - try: - pricing_tbl_tree = html_tree.find('div', class_='pdp-pricing-table') - price_row_trees = pricing_tbl_tree.find_all('div', class_='div-table-row') - for row_tree in price_row_trees: - qty_tree, unit_price_tree, _ = row_tree.find('div', class_='row').find_all('div', class_='col-xs-4') - try: - qty = int(re.sub('[^0-9]', '', qty_tree.text)) - unit_price = float(re.sub('[^0-9.]', '', unit_price_tree.text)) - price_tiers[qty] = unit_price - except ValueError: - pass # In case of "quote price", ignore and pass to next (check pn STM32F411RCT6). +from ...globals import DEBUG_OVERVIEW, DEBUG_DETAILED, DEBUG_OBSESSIVE + +from .. import distributor, distributor_dict + +from urllib.parse import quote_plus as urlquote + +class dist_mouser(distributor.distributor): + def __init__(self, scrape_retries, log_level, throttle_delay): + super(dist_mouser, self).__init__(scrape_retries, log_level, throttle_delay) + self.name = 'mouser' + self.domain = distributor_dict[self.name]['site']['url'] + self.browser.add_cookie('.mouser.com', 'preferences', 'ps=www2&pl=en-US&pc_www2=USDe') + + self.browser.scrape_URL(self.domain) + self.browser.show_cookies(self.name) + + def dist_get_price_tiers(self, html_tree): + '''@brief Get the pricing tiers from the parsed tree of the Mouser product page. + @param html_tree `str()` html of the distributor part page. + @return `dict()` price breaks, the keys are the quantities breaks. + ''' + price_tiers = {} + try: + pricing_tbl_tree = html_tree.find('div', class_='pdp-pricing-table') + price_row_trees = pricing_tbl_tree.find_all('div', class_='div-table-row') + for row_tree in price_row_trees: + qty_tree, unit_price_tree, _ = row_tree.find('div', class_='row').find_all('div', class_='col-xs-4') + try: + qty = int(re.sub('[^0-9]', '', qty_tree.text)) + unit_price = float(re.sub('[^0-9.]', '', unit_price_tree.text)) + price_tiers[qty] = unit_price + except ValueError: + pass # In case of "quote price", ignore and pass to next (check pn STM32F411RCT6). 
+ return price_tiers + + qty_strs = [] + for qty in html_tree.find('div', + class_='PriceBreaks').find_all( + 'div', + class_='PriceBreakQuantity'): + qty_strs.append(qty.text) + price_strs = [] + for price in html_tree.find('div', + class_='PriceBreaks').find_all( + 'div', + class_='PriceBreakPrice'): + price_strs.append(price.text) + qtys_prices = list(zip(qty_strs, price_strs)) + for qty_str, price_str in qtys_prices: + try: + qty = re.search('(\s*)([0-9,]+)', qty_str).group(2) + qty = int(re.sub('[^0-9]', '', qty)) + price_tiers[qty] = float(re.sub('[^0-9\.]', '', price_str)) + except (TypeError, AttributeError, ValueError, IndexError): + continue + except AttributeError: + # This happens when no pricing info is found in the tree. + self.logger.log(DEBUG_OBSESSIVE, 'No Mouser pricing information found!') + return price_tiers # Return empty price tiers. return price_tiers - qty_strs = [] - for qty in html_tree.find('div', - class_='PriceBreaks').find_all( - 'div', - class_='PriceBreakQuantity'): - qty_strs.append(qty.text) - price_strs = [] - for price in html_tree.find('div', - class_='PriceBreaks').find_all( - 'div', - class_='PriceBreakPrice'): - price_strs.append(price.text) - qtys_prices = list(zip(qty_strs, price_strs)) - for qty_str, price_str in qtys_prices: - try: - qty = re.search('(\s*)([0-9,]+)', qty_str).group(2) - qty = int(re.sub('[^0-9]', '', qty)) - price_tiers[qty] = float(re.sub('[^0-9\.]', '', price_str)) - except (TypeError, AttributeError, ValueError, IndexError): - continue - except AttributeError: - # This happens when no pricing info is found in the tree. - logger.log(DEBUG_OBSESSIVE, 'No Mouser pricing information found!') - return price_tiers # Return empty price tiers. - return price_tiers - - -def get_part_num(html_tree): - '''@brief Get the part number from the Mouser product page. - @param html_tree `str()` html of the distributor part page. - @return `list()`of the parts that match. - ''' - try: - partnum = html_tree.find( - 'span', id='spnMouserPartNumFormattedForProdInfo' - ).text - return partnum.strip() - except AttributeError: - logger.log(DEBUG_OBSESSIVE, 'No Mouser part number found!') - return '' - - -def get_qty_avail(html_tree): - '''@brief Get the available quantity of the part from the Mouser product page. - @param html_tree `str()` html of the distributor part page. - @return `int` avaliable quantity. - ''' - try: - qty_str = html_tree.find( - 'div', class_='pdp-product-availability').find( - 'div', class_='row').find( - 'div', class_='col-xs-8').find('div').text - except AttributeError as e: - # No quantity found (not even 0) so this is probably a non-stocked part. - # Return None so the part won't show in the spreadsheet for this dist. - logger.log(DEBUG_OBSESSIVE, 'No Mouser part quantity found!') - return None - try: - qty_str = re.search('(\s*)([0-9,]*)', qty_str, re.IGNORECASE).group(2) - return int(re.sub('[^0-9]', '', qty_str)) - except ValueError: - # No quantity found (not even 0) so this is probably a non-stocked part. - # Return None so the part won't show in the spreadsheet for this dist. - logger.log(DEBUG_OBSESSIVE, 'No Mouser part quantity found!') - return None - - -def get_part_html_tree(dist, pn, extra_search_terms='', url=None, descend=2, local_part_html=None, scrape_retries=2): - '''@brief Find the Mouser HTML page for a part number and return the URL and parse tree. - @param dist - @param pn Part number `str()`. 
- @param extra_search_terms - @param url - @param descend - @param local_part_html - @param scrape_retries `int` Quantity of retries in case of fail. - @return (html `str()` of the page, url) - ''' - - # Use the part number to lookup the part using the site search function, unless a starting url was given. - if url is None: - url = 'https://www.mouser.com/Search/Refine.aspx?Keyword=' + urlquote( - pn + ' ' + extra_search_terms, - safe='') - elif url[0] == '/': - url = 'https://www.mouser.com' + url - elif url.startswith('..'): - url = 'https://www.mouser.com/Search/' + url - - # Open the URL, read the HTML from it, and parse it into a tree structure. - try: - html = fake_browser(url, scrape_retries, ('Cookie', 'preferences=ps=www2&pl=en-US&pc_www2=USDe')) - except: - logger.log(DEBUG_OBSESSIVE,'No HTML page for {} from {}'.format(pn, dist)) - raise PartHtmlError - - # Abort if the part number isn't in the HTML somewhere. - # (Only use the numbers and letters to compare PN to HTML.) - if re.sub('[\W_]','',str.lower(pn)) not in re.sub('[\W_]','',str.lower(str(html))): - logger.log(DEBUG_OBSESSIVE,'No part number {} in HTML page from {}'.format(pn, dist)) - raise PartHtmlError - - try: - tree = BeautifulSoup(html, 'lxml') - except Exception: - logger.log(DEBUG_OBSESSIVE,'No HTML tree for {} from {}'.format(pn, dist)) - raise PartHtmlError - - # If the tree contains the tag for a product page, then just return it. - if tree.find('div', id='pdpPricingAvailability') is not None: - return tree, url + def dist_get_part_num(self, html_tree): + '''@brief Get the part number from the Mouser product page. + @param html_tree `str()` html of the distributor part page. + @return `list()`of the parts that match. + ''' + try: + partnum = html_tree.find( + 'span', id='spnMouserPartNumFormattedForProdInfo' + ).text + return partnum.strip() + except AttributeError: + self.logger.log(DEBUG_OBSESSIVE, 'No Mouser part number found!') + return '' + + def dist_get_qty_avail(self, html_tree): + '''@brief Get the available quantity of the part from the Mouser product page. + @param html_tree `str()` html of the distributor part page. + @return `int` avaliable quantity. + ''' + try: + qty_str = html_tree.find( + 'div', class_='pdp-product-availability').find( + 'div', class_='row').find( + 'div', class_='col-xs-8').find('div').text + except AttributeError as e: + # No quantity found (not even 0) so this is probably a non-stocked part. + # Return None so the part won't show in the spreadsheet for this dist. + self.logger.log(DEBUG_OBSESSIVE, 'No Mouser part quantity found!') + return None + try: + qty_str = re.search('(\s*)([0-9,]*)', qty_str, re.IGNORECASE).group(2) + return int(re.sub('[^0-9]', '', qty_str)) + except ValueError: + # No quantity found (not even 0) so this is probably a non-stocked part. + # Return None so the part won't show in the spreadsheet for this dist. + self.logger.log(DEBUG_OBSESSIVE, 'No Mouser part quantity found!') + return None + + def dist_get_part_html_tree(self, pn, extra_search_terms='', url=None, descend=2, local_part_html=None): + '''@brief Find the Mouser HTML page for a part number and return the URL and parse tree. + @param pn Part number `str()`. + @param extra_search_terms + @param url + @param descend + @param local_part_html + @return (html `str()` of the page, url) + ''' + + # Use the part number to lookup the part using the site search function, unless a starting url was given. 
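+        # Relative ('/...') and search-relative ('..') URLs coming from a previous
+        # results page are expanded to absolute mouser.com URLs below.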
+ if url is None: + url = 'https://www.mouser.com/Search/Refine.aspx?Keyword=' + urlquote( + pn + ' ' + extra_search_terms, + safe='') + elif url[0] == '/': + url = 'https://www.mouser.com' + url + elif url.startswith('..'): + url = 'https://www.mouser.com/Search/' + url + + # Open the URL, read the HTML from it, and parse it into a tree structure. + try: + html = self.browser.scrape_URL(url) + except Exception as ex: + self.logger.log(DEBUG_OBSESSIVE,'No HTML page for {} from {}'.format(pn, self.name)) + raise PartHtmlError - # If the tree is for a list of products, then examine the links to try to find the part number. - if tree.find('div', id='searchResultsTbl') is not None: - logger.log(DEBUG_OBSESSIVE,'Found product table for {} from {}'.format(pn, dist)) - if descend <= 0: - logger.log(DEBUG_OBSESSIVE,'Passed descent limit for {} from {}'.format(pn, dist)) + # Abort if the part number isn't in the HTML somewhere. + # (Only use the numbers and letters to compare PN to HTML.) + if re.sub('[\W_]','',str.lower(pn)) not in re.sub('[\W_]','',str.lower(str(html))): + self.logger.log(DEBUG_OBSESSIVE,'No part number {} in HTML page from {}'.format(pn, self.name)) + raise PartHtmlError + + try: + tree = BeautifulSoup(html, 'lxml') + except Exception: + self.logger.log(DEBUG_OBSESSIVE,'No HTML tree for {} from {}'.format(pn, self.name)) raise PartHtmlError - else: - # Look for the table of products. - products = tree.find( - 'table', - class_='SearchResultsTable').find_all( - 'tr', - class_=('SearchResultsRowOdd', 'SearchResultsRowEven')) - - # Extract the product links for the part numbers from the table. - product_links = [p.find('div', class_='mfrDiv').a for p in products] - - # Extract all the part numbers from the text portion of the links. - part_numbers = [l.text for l in product_links] - - # Look for the part number in the list that most closely matches the requested part number. - match = difflib.get_close_matches(pn, part_numbers, 1, 0.0)[0] - - # Now look for the link that goes with the closest matching part number. - for l in product_links: - if l.text == match: - # Get the tree for the linked-to page and return that. - logger.log(DEBUG_OBSESSIVE,'Selecting {} from product table for {} from {}'.format(l.text, pn, dist)) - return get_part_html_tree(dist, pn, extra_search_terms, - url=l.get('href', ''), - descend=descend-1, - scrape_retries=scrape_retries) - - # I don't know what happened here, so give up. - logger.log(DEBUG_OBSESSIVE,'Unknown error for {} from {}'.format(pn, dist)) - raise PartHtmlError + + # If the tree contains the tag for a product page, then just return it. + if tree.find('div', id='pdpPricingAvailability') is not None: + return tree, url + + # If the tree is for a list of products, then examine the links to try to find the part number. + if tree.find('div', id='searchResultsTbl') is not None: + self.logger.log(DEBUG_OBSESSIVE,'Found product table for {} from {}'.format(pn, self.name)) + if descend <= 0: + self.logger.log(DEBUG_OBSESSIVE,'Passed descent limit for {} from {}'.format(pn, self.name)) + raise PartHtmlError + else: + # Look for the table of products. + products = tree.find( + 'table', + class_='SearchResultsTable').find_all( + 'tr', + class_=('SearchResultsRowOdd', 'SearchResultsRowEven')) + + # Extract the product links for the part numbers from the table. + product_links = [p.find('div', class_='mfrDiv').a for p in products] + + # Extract all the part numbers from the text portion of the links. 
+ part_numbers = [l.text for l in product_links] + + # Look for the part number in the list that most closely matches the requested part number. + match = difflib.get_close_matches(pn, part_numbers, 1, 0.0)[0] + + # Now look for the link that goes with the closest matching part number. + for l in product_links: + if l.text == match: + # Get the tree for the linked-to page and return that. + self.logger.log(DEBUG_OBSESSIVE,'Selecting {} from product table for {} from {}'.format(l.text, pn, self.name)) + return self.dist_get_part_html_tree(pn, extra_search_terms, + url=l.get('href', ''), + descend=descend-1) + + # I don't know what happened here, so give up. + self.logger.log(DEBUG_OBSESSIVE,'Unknown error for {} from {}'.format(pn, self.name)) + self.logger.log(DEBUG_OBSESSIVE,'Response was %s' % html) + raise PartHtmlError From 817b136a8672b797cbd44e447f33024f2b2bc5b6 Mon Sep 17 00:00:00 2001 From: Max Maisel Date: Fri, 1 Jun 2018 15:21:39 +0200 Subject: [PATCH 04/29] Implemented dist_farnell class. --- kicost/distributors/farnell/__init__.py | 2 +- kicost/distributors/farnell/farnell.py | 331 ++++++++++++------------ 2 files changed, 172 insertions(+), 161 deletions(-) diff --git a/kicost/distributors/farnell/__init__.py b/kicost/distributors/farnell/__init__.py index 7af68b1a4..c0203fa64 100644 --- a/kicost/distributors/farnell/__init__.py +++ b/kicost/distributors/farnell/__init__.py @@ -25,7 +25,7 @@ }, # Web site defitions. 'site': { - 'url': 'http://farnell.com/', + 'url': 'https://it.farnell.com/', 'currency': 'USD', 'locale': 'US' }, diff --git a/kicost/distributors/farnell/farnell.py b/kicost/distributors/farnell/farnell.py index 59776392f..fb645c25f 100644 --- a/kicost/distributors/farnell/farnell.py +++ b/kicost/distributors/farnell/farnell.py @@ -1,6 +1,6 @@ # MIT license # -# Copyright (C) 2018 by XESS Corporation / Hildo Guillardi Junior +# Copyright (C) 2018 by XESS Corporation / Hildo Guillardi Junior / Max Maisel # # Permission is hereby granted, free of charge, to any person obtaining a copy # of this software and associated documentation files (the "Software"), to deal @@ -38,173 +38,184 @@ import difflib from bs4 import BeautifulSoup import http.client # For web scraping exceptions. -from .. import urlencode, urlquote, urlsplit, urlunsplit -from .. import fake_browser, WEB_SCRAPE_EXCEPTIONS +#from .. import urlencode, urlquote, urlsplit, urlunsplit +from .. import fake_browser from ...globals import PartHtmlError from ...globals import logger, DEBUG_OVERVIEW, DEBUG_DETAILED, DEBUG_OBSESSIVE from currency_converter import CurrencyConverter currency = CurrencyConverter() -__author__='Giacinto Luigi Cerone' - +from .. import distributor, distributor_dict -def get_price_tiers(html_tree): - '''@brief Get the pricing tiers from the parsed tree of the farnell product page. - @param html_tree `str()` html of the distributor part page. - @return `dict()` price breaks, the keys are the quantities breaks. 
- ''' - price_tiers = {} - try: - qty_strs = [] - for qty in html_tree.find( - 'table', - class_=('tableProductDetailPrice', 'pricing')).find_all( - 'td', - class_='qty'): - qty_strs.append(qty.text) - price_strs = [] - for price in html_tree.find( - 'table', - class_=('tableProductDetailPrice', 'pricing')).find_all( - 'td', - class_='threeColTd'): - price_strs.append(price.text) - qtys_prices = list(zip(qty_strs, price_strs)) - for qty_str, price_str in qtys_prices: - try: - qty = re.search('(\s*)([0-9,]+)', qty_str).group(2) - qty = int(re.sub('[^0-9]', '', qty)) - price_str=price_str.replace(',','.') - price_tiers[qty] = float(re.sub('[^0-9\.]', '', price_str)) - price_tiers[qty] = currency.convert(price_tiers[qty], 'EUR', 'USD') - except (TypeError, AttributeError, ValueError): - continue - except AttributeError: - # This happens when no pricing info is found in the tree. - return price_tiers # Return empty price tiers. - return price_tiers - -def get_part_num(html_tree): - '''@brief Get the part number from the farnell product page. - @param html_tree `str()` html of the distributor part page. - @return `list()`of the parts that match. - ''' - try: - # farnell catalog number is stored in a description list, so get - # all the list terms and descriptions, strip all the spaces from those, - # and pair them up. - div = html_tree.find('div', class_='productDescription').find('dl') - dt = [re.sub('\s','',d.text) for d in div.find_all('dt')] - dd = [re.sub('\s','',d.text) for d in div.find_all('dd')] - dtdd = {k:v for k,v in zip(dt,dd)} # Pair terms with descriptions. -# return dtdd.get('farnellPartNo.:', '') - return dtdd.get('CodiceProdotto', '') - except KeyError: - return '' # No catalog number found in page. - except AttributeError: - return '' # No ProductDescription found in page. - -def get_qty_avail(html_tree): - '''@brief Get the available quantity of the part from the farnell product page. - @param html_tree `str()` html of the distributor part page. - @return `int` avaliable quantity. - ''' - try: - qty_str = html_tree.find('p', class_='availabilityHeading').text - except (AttributeError, ValueError): - # No quantity found (not even 0) so this is probably a non-stocked part. - # Return None so the part won't show in the spreadsheet for this dist. - return None - try: - qty = re.sub('[^0-9]','',qty_str) # Strip all non-number chars. - return int(re.sub('[^0-9]', '', qty_str)) # Return integer for quantity. - except ValueError: - # No quantity found (not even 0) so this is probably a non-stocked part. - # Return None so the part won't show in the spreadsheet for this dist. - return None - -def get_part_html_tree(dist, pn, extra_search_terms='', url=None, descend=2, local_part_html=None, scrape_retries=2): - '''@brief Find the farnell HTML page for a part number and return the URL and parse tree. - @param dist - @param pn Part number `str()`. - @param extra_search_terms - @param url - @param descend - @param local_part_html - @param scrape_retries `int` Quantity of retries in case of fail. - @return (html `str()` of the page, url) - ''' - - # Use the part number to lookup the part using the site search function, unless a starting url was given. 
- if url is None: - url = 'http://it.farnell.com/Search?catalogId=15001&langId=-4&storeId=10165&gs=true&st=' + urlquote( - pn + ' ' + extra_search_terms, - safe='') - elif url[0] == '/': - url = 'http://www.farnell.com' + url - elif url.startswith('..'): - url = 'http://www.farnell.com/Search/' + url - - # Open the URL, read the HTML from it, and parse it into a tree structure. - try: - html = fake_browser(url, scrape_retries) - except: - logger.log(DEBUG_OBSESSIVE,'No HTML page for {} from {}'.format(pn, dist)) - raise PartHtmlError +from urllib.parse import quote_plus as urlquote - # Abort if the part number isn't in the HTML somewhere. - # (Only use the numbers and letters to compare PN to HTML.) - if re.sub('[\W_]','',str.lower(pn)) not in re.sub('[\W_]','',str.lower(str(html))): - logger.log(DEBUG_OBSESSIVE,'No part number {} in HTML page from {}'.format(pn, dist)) - raise PartHtmlError +__author__='Giacinto Luigi Cerone' - try: - tree = BeautifulSoup(html, 'lxml') - except Exception: - logger.log(DEBUG_OBSESSIVE,'No HTML tree for {} from {}'.format(pn, dist)) - raise PartHtmlError +class dist_farnell(distributor.distributor): + def __init__(self, scrape_retries, log_level, throttle_delay): + super(dist_farnell, self).__init__(scrape_retries, log_level, throttle_delay) + self.name = 'farnell' + self.domain = distributor_dict[self.name]['site']['url'] + + self.browser.scrape_URL(self.domain) + self.browser.show_cookies(self.name) + + def dist_get_price_tiers(self, html_tree): + '''@brief Get the pricing tiers from the parsed tree of the farnell product page. + @param html_tree `str()` html of the distributor part page. + @return `dict()` price breaks, the keys are the quantities breaks. + ''' + price_tiers = {} + try: + qty_strs = [] + for qty in html_tree.find( + 'table', + class_=('tableProductDetailPrice', 'pricing')).find_all( + 'td', + class_='qty'): + qty_strs.append(qty.text) + price_strs = [] + for price in html_tree.find( + 'table', + class_=('tableProductDetailPrice', 'pricing')).find_all( + 'td', + class_='threeColTd'): + price_strs.append(price.text) + qtys_prices = list(zip(qty_strs, price_strs)) + for qty_str, price_str in qtys_prices: + try: + qty = re.search('(\s*)([0-9,]+)', qty_str).group(2) + qty = int(re.sub('[^0-9]', '', qty)) + price_str=price_str.replace(',','.') + price_tiers[qty] = float(re.sub('[^0-9\.]', '', price_str)) + price_tiers[qty] = currency.convert(price_tiers[qty], 'EUR', 'USD') + except (TypeError, AttributeError, ValueError): + continue + except AttributeError: + # This happens when no pricing info is found in the tree. + return price_tiers # Return empty price tiers. + return price_tiers + + def dist_get_part_num(self, html_tree): + '''@brief Get the part number from the farnell product page. + @param html_tree `str()` html of the distributor part page. + @return `list()`of the parts that match. + ''' + try: + # farnell catalog number is stored in a description list, so get + # all the list terms and descriptions, strip all the spaces from those, + # and pair them up. + div = html_tree.find('div', class_='productDescription').find('dl') + dt = [re.sub('\s','',d.text) for d in div.find_all('dt')] + dd = [re.sub('\s','',d.text) for d in div.find_all('dd')] + dtdd = {k:v for k,v in zip(dt,dd)} # Pair terms with descriptions. + # return dtdd.get('farnellPartNo.:', '') + return dtdd.get('CodiceProdotto', '') + except KeyError: + return '' # No catalog number found in page. + except AttributeError: + return '' # No ProductDescription found in page. 
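(Illustration only, not part of the patch: the <dt>/<dd> pairing that dist_get_part_num() relies on can be exercised in isolation. The HTML fragment below is invented and only approximates the Farnell product-description markup, so treat this as a sketch under that assumption.)

    from bs4 import BeautifulSoup
    import re

    frag = ("<div class='productDescription'><dl>"
            "<dt>Codice Prodotto</dt><dd> 1234567 </dd>"
            "<dt>Produttore</dt><dd> Acme </dd></dl></div>")
    dl = BeautifulSoup(frag, 'lxml').find('div', class_='productDescription').find('dl')
    dt = [re.sub('\s', '', d.text) for d in dl.find_all('dt')]  # ['CodiceProdotto', 'Produttore']
    dd = [re.sub('\s', '', d.text) for d in dl.find_all('dd')]  # ['1234567', 'Acme']
    dtdd = {k: v for k, v in zip(dt, dd)}                       # Pair labels with values.
    print(dtdd.get('CodiceProdotto', ''))                       # -> 1234567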
+ + def dist_get_qty_avail(self, html_tree): + '''@brief Get the available quantity of the part from the farnell product page. + @param html_tree `str()` html of the distributor part page. + @return `int` avaliable quantity. + ''' + try: + qty_str = html_tree.find('p', class_='availabilityHeading').text + except (AttributeError, ValueError): + # No quantity found (not even 0) so this is probably a non-stocked part. + # Return None so the part won't show in the spreadsheet for this dist. + return None + try: + qty = re.sub('[^0-9]','',qty_str) # Strip all non-number chars. + return int(re.sub('[^0-9]', '', qty_str)) # Return integer for quantity. + except ValueError: + # No quantity found (not even 0) so this is probably a non-stocked part. + # Return None so the part won't show in the spreadsheet for this dist. + return None + + def dist_get_part_html_tree(self, pn, extra_search_terms='', url=None, descend=2, local_part_html=None): + '''@brief Find the farnell HTML page for a part number and return the URL and parse tree. + @param pn Part number `str()`. + @param extra_search_terms + @param url + @param descend + @param local_part_html + @return (html `str()` of the page, url) + ''' + + # Use the part number to lookup the part using the site search function, unless a starting url was given. + if url is None: + url = 'http://it.farnell.com/Search?storeId=10165&catalogId=15001&categoryName=&selectedCategoryId=&langId=-4&categoryIdBox=&st=' + urlquote( + pn + ' ' + extra_search_terms, + safe='') + + elif url[0] == '/': + url = 'http://www.farnell.com' + url + elif url.startswith('..'): + url = 'http://www.farnell.com/Search/' + url + + # Open the URL, read the HTML from it, and parse it into a tree structure. + try: + html = self.browser.scrape_URL(url) + except: + self.logger.log(DEBUG_OBSESSIVE,'No HTML page for {} from {}'.format(pn, self.name)) + raise PartHtmlError - # If the tree contains the tag for a product page, then just return it. - if tree.find('div', class_='productDisplay', id='page') is not None: - return tree, url + # Abort if the part number isn't in the HTML somewhere. + # (Only use the numbers and letters to compare PN to HTML.) + if re.sub('[\W_]','',str.lower(pn)) not in re.sub('[\W_]','',str.lower(str(html))): + self.logger.log(DEBUG_OBSESSIVE,'No part number {} in HTML page from {}'.format(pn, self.name)) + raise PartHtmlError - # If the tree is for a list of products, then examine the links to try to find the part number. - if tree.find('table', class_='productLister', id='sProdList') is not None: - logger.log(DEBUG_OBSESSIVE,'Found product table for {} from {}'.format(pn, dist)) - if descend <= 0: - logger.log(DEBUG_OBSESSIVE,'Passed descent limit for {} from {}'.format(pn, dist)) + try: + tree = BeautifulSoup(html, 'lxml') + except Exception: + self.logger.log(DEBUG_OBSESSIVE,'No HTML tree for {} from {}'.format(pn, self.name)) raise PartHtmlError - else: - # Look for the table of products. - products = tree.find('table', - class_='productLister').find_all('tr', - class_='altRow') - - # Extract the product links for the part numbers from the table. - product_links = [] - for p in products: - try: - product_links.append(p.find('td', class_='mftrPart').find('a')) - except AttributeError: - continue - print('>>> ',pn,products,product_liks)#TODO - - # Extract all the part numbers from the text portion of the links. - part_numbers = [l.text for l in product_links] - - # Look for the part number in the list that most closely matches the requested part number. 
- match = difflib.get_close_matches(pn, part_numbers, 1, 0.0)[0] - - # Now look for the link that goes with the closest matching part number. - for l in product_links: - if l.text == match: - # Get the tree for the linked-to page and return that. - logger.log(DEBUG_OBSESSIVE,'Selecting {} from product table for {} from {}'.format(l.text.strip(), pn, dist)) - return get_part_html_tree(dist, pn, extra_search_terms, - url=l.get('href', ''), - descend=descend-1, - scrape_retries=scrape_retries) - - # I don't know what happened here, so give up. - logger.log(DEBUG_OBSESSIVE,'Unknown error for {} from {}'.format(pn, dist)) - raise PartHtmlError + + # If the tree contains the tag for a product page, then just return it. + if tree.find('div', class_='productDisplay', id='page') is not None: + return tree, url + + # If the tree is for a list of products, then examine the links to try to find the part number. + if tree.find('table', class_='productLister', id='sProdList') is not None: + self.logger.log(DEBUG_OBSESSIVE,'Found product table for {} from {}'.format(pn, self.name)) + if descend <= 0: + self.logger.log(DEBUG_OBSESSIVE,'Passed descent limit for {} from {}'.format(pn, self.name)) + raise PartHtmlError + else: + # Look for the table of products. + products = tree.find('table', + class_='productLister').find_all('tr', + class_='altRow') + + # Extract the product links for the part numbers from the table. + product_links = [] + for p in products: + try: + product_links.append(p.find('td', class_='mftrPart').find('a')) + except AttributeError: + continue + print('>>> ',pn,products,product_links)#TODO + + # Extract all the part numbers from the text portion of the links. + part_numbers = [l.text for l in product_links] + + # Look for the part number in the list that most closely matches the requested part number. + match = difflib.get_close_matches(pn, part_numbers, 1, 0.0)[0] + + # Now look for the link that goes with the closest matching part number. + for l in product_links: + if l.text == match: + # Get the tree for the linked-to page and return that. + self.logger.log(DEBUG_OBSESSIVE,'Selecting {} from product table for {} from {}'.format(l.text.strip(), pn, self.name)) + return self.dist_get_part_html_tree(pn, extra_search_terms, + url=l.get('href', ''), + descend=descend-1) + + # I don't know what happened here, so give up. + self.logger.log(DEBUG_OBSESSIVE,'Unknown error for {} from {}'.format(pn, self.name)) + self.logger.log(DEBUG_OBSESSIVE,'Response was %s' % html) + raise PartHtmlError From aa1f175db99e5783bbc2a9a844868ddb2ba7e7be Mon Sep 17 00:00:00 2001 From: Max Maisel Date: Fri, 1 Jun 2018 15:22:01 +0200 Subject: [PATCH 05/29] Implemented dist_local class. --- kicost/distributors/local/local.py | 323 +++++++++++++++-------------- 1 file changed, 164 insertions(+), 159 deletions(-) diff --git a/kicost/distributors/local/local.py b/kicost/distributors/local/local.py index 882fa5fee..a3900b6cd 100644 --- a/kicost/distributors/local/local.py +++ b/kicost/distributors/local/local.py @@ -1,6 +1,6 @@ # MIT license # -# Copyright (C) 2018 by XESS Corporation / Hildo Guillardi Junior +# Copyright (C) 2018 by XESS Corporation / Hildo Guillardi Junior / Max Maisel # # Permission is hereby granted, free of charge, to any person obtaining a copy # of this software and associated documentation files (the "Software"), to deal @@ -35,165 +35,170 @@ from bs4 import BeautifulSoup from yattag import Doc, indent # For generating HTML page for local parts. 
import copy # To be possible create more than one local distributor. -from .. import urlsplit, urlunsplit from ...globals import PartHtmlError from ...globals import logger, DEBUG_OVERVIEW, DEBUG_DETAILED, DEBUG_OBSESSIVE -from ...globals import SEPRTR - - -def create_part_html(parts, distributors): - '''@brief Create HTML page containing info for local (non-webscraped) parts. - @param parts `list()` of parts. - @parm `list()`of the distributors to check each one is local. - @return `str()` of the HTML page to be read by `get_part_html_tree()` - ''' - - logger.log(DEBUG_OVERVIEW, 'Create HTML page for parts with custom pricing...') - - doc, tag, text = Doc().tagtext() - with tag('html'): - with tag('body'): - for p in parts: - # Find the manufacturer's part number if it exists. - pn = p.fields.get('manf#') # Returns None if no manf# field. - - # Find the various distributors for this part by - # looking for leading fields terminated by SEPRTR. - for key in p.fields: - try: - dist = key[:key.index(SEPRTR)] - except ValueError: - continue - - # If the distributor is not in the list of web-scrapable distributors, - # then it's a local distributor. Copy the local distributor template - # and add it to the table of distributors. - if dist not in distributors: - distributors[dist] = copy.copy(distributors['local_template']) - distributors[dist]['label'] = dist # Set dist name for spreadsheet header. - - # Now look for catalog number, price list and webpage link for this part. - for dist in distributors: - cat_num = p.fields.get(dist+':cat#') - pricing = p.fields.get(dist+':pricing') - link = p.fields.get(dist+':link') - if cat_num is None and pricing is None and link is None: - continue - - def make_random_catalog_number(p): - hash_fields = {k: p.fields[k] for k in p.fields} - hash_fields['dist'] = dist - return '#{0:08X}'.format(abs(hash(tuple(sorted(hash_fields.items()))))) - - cat_num = cat_num or pn or make_random_catalog_number(p) - p.fields[dist+':cat#'] = cat_num # Store generated cat#. - with tag('div', klass=dist+SEPRTR+cat_num): - with tag('div', klass='cat#'): - text(cat_num) - if pricing is not None: - with tag('div', klass='pricing'): - text(pricing) - if link is not None: - url_parts = list(urlsplit(link)) - if url_parts[0] == '': - url_parts[0] = u'http' - link = urlunsplit(url_parts) - with tag('div', klass='link'): - text(link) - - # Remove the local distributor template so it won't be processed later on. - # It has served its purpose. - try: - del distributors['local_template'] - except: - pass - - html = doc.getvalue() - if logger.isEnabledFor(DEBUG_OBSESSIVE): - print(indent(html)) - return html - - -def get_price_tiers(html_tree): - '''@brief Get the pricing tiers from the parsed tree of the local product page. - @param html_tree `str()` html of the distributor part page. - @return `dict()` price breaks, the keys are the quantities breaks. - ''' - price_tiers = {} - try: - pricing = html_tree.find('div', class_='pricing').text - pricing = re.sub('[^0-9.;:]', '', pricing) # Keep only digits, decimals, delimiters. - for qty_price in pricing.split(';'): - qty, price = qty_price.split(SEPRTR) - price_tiers[int(qty)] = float(price) - except AttributeError: - # This happens when no pricing info is found in the tree. - logger.log(DEBUG_OBSESSIVE, 'No local pricing information found!') - return price_tiers # Return empty price tiers. - return price_tiers - - -def get_part_num(html_tree): - '''@brief Get the part number from the local product page. 
- @param html_tree `str()` html of the distributor part page. - @return `list()`of the parts that match. - ''' - try: - part_num_str = html_tree.find('div', class_='cat#').text - return part_num_str - except AttributeError: - return '' - - -def get_qty_avail(html_tree): - '''@brief Get the available quantity of the part from the local product page. - @param html_tree `str()` html of the distributor part page. - @return `int` avaliable quantity. - ''' - try: - qty_str = html_tree.find('div', class_='quantity').text - except (AttributeError, ValueError): - # Return 0 (not None) so this part will show in the spreadsheet - # even if there is no quantity found. - return 0 - try: - return int(re.sub('[^0-9]', '', qty_str)) - except ValueError: - # Return 0 (not None) so this part will show in the spreadsheet - # even if there is no quantity found. - logger.log(DEBUG_OBSESSIVE, 'No local part quantity found!') - return 0 - - -def get_part_html_tree(dist, pn, extra_search_terms='', url=None, descend=None, local_part_html=None, scrape_retries=2): - '''Extract the HTML tree from the HTML page for local parts. - @param dist - @param pn Part number `str()`. - @param extra_search_terms - @param url - @param descend - @param local_part_html - @param scrape_retries `int` Quantity of retries in case of fail. - @return (html `str()` of the page, `None`) The second argument is always `None` bacause there is not url to return. - ''' - - # Extract the HTML tree from the local part HTML page. - try: - tree = BeautifulSoup(local_part_html, 'lxml') - except Exception: - raise PartHtmlError - - try: - # Find the DIV in the tree for the given part and distributor. - class_ = dist + SEPRTR + pn - part_tree = tree.find('div', class_=class_) - url_tree = part_tree.find('div', class_='link') +from ...globals import SEPRTR as SEPRTR + +from .. import distributor + +from urllib.parse import urlsplit, urlunsplit + +class dist_local(distributor.distributor): + def __init__(self, scrape_retries, log_level, throttle_delay): + super(dist_local, self).__init__(scrape_retries, log_level, throttle_delay) + self.name = 'local' + + def create_part_html(self, parts, distributors): + '''@brief Create HTML page containing info for local (non-webscraped) parts. + @param parts `list()` of parts. + @parm `list()`of the distributors to check each one is local. + @return `str()` of the HTML page to be read by `get_part_html_tree()` + ''' + + self.logger.log(DEBUG_OVERVIEW, 'Create HTML page for parts with custom pricing...') + + doc, tag, text = Doc().tagtext() + with tag('html'): + with tag('body'): + for p in parts: + # Find the manufacturer's part number if it exists. + pn = p.fields.get('manf#') # Returns None if no manf# field. + + # Find the various distributors for this part by + # looking for leading fields terminated by SEPRTR. + for key in p.fields: + try: + dist = key[:key.index(SEPRTR)] + except ValueError: + continue + + # If the distributor is not in the list of web-scrapable distributors, + # then it's a local distributor. Copy the local distributor template + # and add it to the table of distributors. + if dist not in distributors: + distributors[dist] = copy.copy(distributors['local_template']) + distributors[dist]['label'] = dist # Set dist name for spreadsheet header. + + # Now look for catalog number, price list and webpage link for this part. 
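                    # (Illustration only, not part of the patch: these come from per-part fields
                    #  named '<dist>:cat#', '<dist>:pricing' and '<dist>:link'. Assuming SEPRTR is
                    #  the same ':' used in those field names, a pricing value such as
                    #  '1:0.59;10:0.41;100:0.23' is later decoded by dist_get_price_tiers() into
                    #  {1: 0.59, 10: 0.41, 100: 0.23}.)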
+ for dist in distributors: + cat_num = p.fields.get(dist+':cat#') + pricing = p.fields.get(dist+':pricing') + link = p.fields.get(dist+':link') + if cat_num is None and pricing is None and link is None: + continue + + def make_random_catalog_number(p): + hash_fields = {k: p.fields[k] for k in p.fields} + hash_fields['dist'] = dist + return '#{0:08X}'.format(abs(hash(tuple(sorted(hash_fields.items()))))) + + cat_num = cat_num or pn or make_random_catalog_number(p) + p.fields[dist+':cat#'] = cat_num # Store generated cat#. + with tag('div', klass=dist+SEPRTR+cat_num): + with tag('div', klass='cat#'): + text(cat_num) + if pricing is not None: + with tag('div', klass='pricing'): + text(pricing) + if link is not None: + url_parts = list(urlsplit(link)) + if url_parts[0] == '': + url_parts[0] = u'http' + link = urlunsplit(url_parts) + with tag('div', klass='link'): + text(link) + + # Remove the local distributor template so it won't be processed later on. + # It has served its purpose. try: - # Return the part data tree and any URL associated with the part. - return part_tree, url_tree.text.strip() + del distributors['local_template'] + except: + pass + + html = doc.getvalue() + if self.logger.isEnabledFor(DEBUG_OBSESSIVE): + print(indent(html)) + return html + + + def dist_get_price_tiers(self, html_tree): + '''@brief Get the pricing tiers from the parsed tree of the local product page. + @param html_tree `str()` html of the distributor part page. + @return `dict()` price breaks, the keys are the quantities breaks. + ''' + price_tiers = {} + try: + pricing = html_tree.find('div', class_='pricing').text + pricing = re.sub('[^0-9.;:]', '', pricing) # Keep only digits, decimals, delimiters. + for qty_price in pricing.split(';'): + qty, price = qty_price.split(SEPRTR) + price_tiers[int(qty)] = float(price) + except AttributeError: + # This happens when no pricing info is found in the tree. + self.logger.log(DEBUG_OBSESSIVE, 'No local pricing information found!') + return price_tiers # Return empty price tiers. + return price_tiers + + + def dist_get_part_num(self, html_tree): + '''@brief Get the part number from the local product page. + @param html_tree `str()` html of the distributor part page. + @return `list()`of the parts that match. + ''' + try: + part_num_str = html_tree.find('div', class_='cat#').text + return part_num_str + except AttributeError: + return '' + + + def dist_get_qty_avail(self, html_tree): + '''@brief Get the available quantity of the part from the local product page. + @param html_tree `str()` html of the distributor part page. + @return `int` avaliable quantity. + ''' + try: + qty_str = html_tree.find('div', class_='quantity').text + except (AttributeError, ValueError): + # Return 0 (not None) so this part will show in the spreadsheet + # even if there is no quantity found. + return 0 + try: + return int(re.sub('[^0-9]', '', qty_str)) + except ValueError: + # Return 0 (not None) so this part will show in the spreadsheet + # even if there is no quantity found. + self.logger.log(DEBUG_OBSESSIVE, 'No local part quantity found!') + return 0 + + # TODO: dist param + def dist_get_part_html_tree(self, pn, extra_search_terms='', url=None, descend=None, local_part_html=None): + '''Extract the HTML tree from the HTML page for local parts. + @param pn Part number `str()`. + @param extra_search_terms + @param url + @param descend + @param local_part_html + @return (html `str()` of the page, `None`) The second argument is always `None` bacause there is not url to return. 
+ ''' + + # Extract the HTML tree from the local part HTML page. + try: + tree = BeautifulSoup(local_part_html, 'lxml') + except Exception: + raise PartHtmlError + + try: + # Find the DIV in the tree for the given part and distributor. + class_ = self.name + SEPRTR + pn + part_tree = tree.find('div', class_=class_) + url_tree = part_tree.find('div', class_='link') + try: + # Return the part data tree and any URL associated with the part. + return part_tree, url_tree.text.strip() + except AttributeError: + # Return part data tree and None if the URL is not found. + return part_tree, None except AttributeError: - # Return part data tree and None if the URL is not found. - return part_tree, None - except AttributeError: - # Return an error if the part_tree is not found. - raise PartHtmlError + # Return an error if the part_tree is not found. + raise PartHtmlError From 0ec9ee17e69ded78d036e63df8d84f2b628a8573 Mon Sep 17 00:00:00 2001 From: Max Maisel Date: Fri, 1 Jun 2018 15:22:12 +0200 Subject: [PATCH 06/29] Implemented dist_digikey class. --- kicost/distributors/digikey/digikey.py | 628 +++++++++++++------------ 1 file changed, 320 insertions(+), 308 deletions(-) diff --git a/kicost/distributors/digikey/digikey.py b/kicost/distributors/digikey/digikey.py index 0653485a3..bd33295bd 100644 --- a/kicost/distributors/digikey/digikey.py +++ b/kicost/distributors/digikey/digikey.py @@ -1,6 +1,6 @@ # MIT license # -# Copyright (C) 2015 by XESS Corporation / Hildo Guillardi Junior +# Copyright (C) 2015 by XESS Corporation / Hildo Guillardi Junior / Max Maisel # # Permission is hereby granted, free of charge, to any person obtaining a copy # of this software and associated documentation files (the "Software"), to deal @@ -34,193 +34,343 @@ import future +# TODO: not working yet ? + import re, difflib from bs4 import BeautifulSoup import http.client # For web scraping exceptions. -from .. import urlencode, urlquote, urlsplit, urlunsplit +#from .. import urlencode, urlquote, urlsplit, urlunsplit from .. import fake_browser from .. import EXTRA_INFO_DIST, extra_info_dist_name_translations from ...globals import PartHtmlError from ...globals import logger, DEBUG_OVERVIEW, DEBUG_DETAILED, DEBUG_OBSESSIVE -from .. import distributor_dict +from .. import distributor, distributor_dict + +from urllib.parse import quote_plus as urlquote + import pycountry -def define_locale_currency(locale_iso=None, currency_iso=None): - '''@brief Configure the distributor for the country and currency intended. - - Scrape the configuration page and define the base URL of DigiKey for the - currency and locale chosen. - The currency is predominant over the locale/country and the defauld are - currency='USD' and locale='US' for DigiKey. - - @param locale_iso `str` Country in ISO3166 alpha 2 standard. - @param currency_iso `str` Currency in ISO4217 alpha 3 standard.''' - url = 'https://www.digikey.com/en/resources/international' - - try: - html = fake_browser(url, 4) - except: # Could not get a good read from the website. 
- logger.log(DEBUG_OBSESSIVE,'No HTML page for DigiKey configuration.') - raise PartHtmlError - html = BeautifulSoup(html, 'lxml') - try: - if currency_iso and not locale_iso: - money = pycountry.currencies.get(alpha_3=currency_iso.upper()) - locale_iso = pycountry.countries.get(numeric=money.numeric).alpha_2 - if locale_iso: - locale_iso = locale_iso.upper() - country = pycountry.countries.get(alpha_2=locale_iso.upper()).name - html = html.find('li', text=re.compile(country, re.IGNORECASE)) - url = html.find('a', id='linkcolor').get('href') - - distributor_dict['digikey']['site']['url'] = url - distributor_dict['digikey']['site']['currency'] = pycountry.currencies.get(numeric=country.numeric).alpha_3 - distributor_dict['digikey']['site']['locale'] = locale_iso - except: - logger.log(DEBUG_OVERVIEW, 'Kept the last configuration {}, {} on {}.'.format( - pycountry.currencies.get(alpha_3=distributor_dict['digikey']['site']['currency']).name, - pycountry.countries.get(alpha_2=distributor_dict['digikey']['site']['locale']).name, - distributor_dict['digikey']['site']['url'] - )) # Keep the current configuration. - return - - -def get_extra_info(html_tree): - '''@brief Get the extra characteristics `EXTRA_INFO_DIST` from the part web page. - @param html_tree `str()` html of the distributor part page. - @return `dict()` keys as characteristics names. - ''' - info = {} - try: - table = html_tree.find('table', id='prod-att-table') - for row in table.find_all('tr', id=None): # `None`to ignore the header row. - try: - k = row.find('th').text.strip().lower() - v = row.find('td').text.strip() - k = extra_info_dist_name_translations.get(k, k) - if k in EXTRA_INFO_DIST: - info[k] = v - except: - continue - if 'datasheet' in EXTRA_INFO_DIST: - try: - info['datasheet'] = html_tree.find('a', href=True, target='_blank').get('href') - if info['datasheet'][0:2]=='//': - info['datasheet'] = 'https:' + info['datasheet'] # Digikey missing definitions. - except: - pass - if 'image' in EXTRA_INFO_DIST: - try: - info['image'] = html_tree.find('img', itemprop="image").get('src') - if info['image'][0:2]=='//': - info['image'] = 'https:' + info['image'] # Digikey missing definitions. - except: - pass - except AttributeError: - # This happens when no pricing info is found in the tree. - logger.log(DEBUG_OBSESSIVE, 'No Digikey pricing information found!') - return info - - -def get_price_tiers(html_tree): - '''@brief Get the pricing tiers from the parsed tree of the Digikey product page. - @param html_tree `str()` html of the distributor part page. - @return `dict()` price breaks, the keys are the quantities breaks. - ''' - price_tiers = {} - try: - for tr in html_tree.find('table', id='product-dollars').find_all('tr'): - try: - td = tr.find_all('td') - qty = int(re.sub('[^0-9]', '', td[0].text)) - price_tiers[qty] = float(re.sub('[^0-9\.]', '', td[1].text)) - except (TypeError, AttributeError, ValueError, - IndexError): # Happens when there's no in table row. - continue - except AttributeError: - # This happens when no pricing info is found in the tree. - logger.log(DEBUG_OBSESSIVE, 'No Digikey pricing information found!') - return price_tiers - - -def part_is_reeled(html_tree): - '''@brief Returns True if this Digi-Key part is reeled or Digi-reeled. - @param html_tree `str()` html of the distributor part page. - @return `True` or `False`. 
- ''' - qty_tiers = list(get_price_tiers(html_tree).keys()) - if len(qty_tiers) > 0 and min(qty_tiers) >= 100: - return True - if html_tree.find('table', - id='product-details-reel-pricing') is not None: - return True - return False - - -def get_part_num(html_tree): - '''@brief Get the part number from the Digikey product page. - @param html_tree `str()` html of the distributor part page. - @return `list()`of the parts that match. - ''' - try: - return re.sub('\s', '', html_tree.find('td', - id='reportPartNumber').text) - except AttributeError: - logger.log(DEBUG_OBSESSIVE, 'No Digikey part number found!') - return '' - - -def get_qty_avail(html_tree): - '''@brief Get the available quantity of the part from the Digikey product page. - @param html_tree `str()` html of the distributor part page. - @return `int` avaliable quantity. - ''' - try: - qty_tree = html_tree.find('td', id='quantityAvailable').find('span', id='dkQty') - qty_str = qty_tree.text - except AttributeError: - # No quantity found (not even 0) so this is probably a non-stocked part. - # Return None so the part won't show in the spreadsheet for this dist. - return None - try: - qty_str = re.search('([0-9,]*)', qty_str, re.IGNORECASE).group(1) - return int(re.sub('[^0-9]', '', qty_str)) - except (AttributeError, ValueError): - # Didn't find the usual quantity text field. This might be one of those - # input fields for requesting a quantity, so get the value from the - # input field. +class dist_digikey(distributor.distributor): + def __init__(self, scrape_retries, log_level, throttle_delay): + super(dist_digikey, self).__init__(scrape_retries, log_level, throttle_delay) + self.name = 'digikey' + self.domain = distributor_dict[self.name]['site']['url'] + + self.browser.scrape_URL(self.domain) + self.browser.show_cookies(self.name) + + def dist_get_price_tiers(self, html_tree): + '''@brief Get the pricing tiers from the parsed tree of the Digikey product page. + @param html_tree `str()` html of the distributor part page. + @return `dict()` price breaks, the keys are the quantities breaks. + ''' + price_tiers = {} try: - logger.log(DEBUG_OBSESSIVE, 'No Digikey part quantity found!') - return int(qty_tree.find('input', type='text').get('value')) + for tr in html_tree.find('table', id='product-dollars').find_all('tr'): + try: + td = tr.find_all('td') + qty = int(re.sub('[^0-9]', '', td[0].text)) + price_tiers[qty] = float(re.sub('[^0-9\.]', '', td[1].text)) + except (TypeError, AttributeError, ValueError, + IndexError): # Happens when there's no in table row. + continue + except AttributeError: + # This happens when no pricing info is found in the tree. + self.logger.log(DEBUG_OBSESSIVE, 'No Digikey pricing information found!') + return price_tiers + + def dist_get_extra_info(self, html_tree): + '''@brief Get the extra characteristics `EXTRA_INFO_DIST` from the part web page. + @param html_tree `str()` html of the distributor part page. + @return `dict()` keys as characteristics names. + ''' + info = {} + try: + table = html_tree.find('table', id='prod-att-table') + for row in table.find_all('tr', id=None): # `None`to ignore the header row. 
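                # (Illustration only, not part of the patch: each data row is expected to pair a
                #  <th> label with a <td> value, e.g. <tr><th>Packaging</th><td>Tube</td></tr>;
                #  the label is lower-cased, mapped through extra_info_dist_name_translations,
                #  and kept only if it appears in EXTRA_INFO_DIST. 'Packaging' here is just an
                #  assumed example label.)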
+ try: + k = row.find('th').text.strip().lower() + v = row.find('td').text.strip() + k = extra_info_dist_name_translations.get(k, k) + if k in EXTRA_INFO_DIST: + info[k] = v + except: + continue + if 'datasheet' in EXTRA_INFO_DIST: + try: + info['datasheet'] = html_tree.find('a', href=True, target='_blank').get('href') + if info['datasheet'][0:2]=='//': + info['datasheet'] = 'https:' + info['datasheet'] # Digikey missing definitions. + except: + pass + if 'image' in EXTRA_INFO_DIST: + try: + info['image'] = html_tree.find('img', itemprop="image").get('src') + if info['image'][0:2]=='//': + info['image'] = 'https:' + info['image'] # Digikey missing definitions. + except: + pass + except AttributeError: + # This happens when no pricing info is found in the tree. + self.logger.log(DEBUG_OBSESSIVE, 'No Digikey pricing information found!') + return info + + def dist_define_locale_currency(self, locale_iso=None, currency_iso=None): + '''@brief Configure the distributor for the country and currency intended. + + Scrape the configuration page and define the base URL of DigiKey for the + currency and locale chosen. + The currency is predominant over the locale/country and the defauld are + currency='USD' and locale='US' for DigiKey. + + @param locale_iso `str` Country in ISO3166 alpha 2 standard. + @param currency_iso `str` Currency in ISO4217 alpha 3 standard.''' + + url = 'https://www.digikey.com/en/resources/international' + + try: + html = self.browser.scrape_URL(url) + except: # Could not get a good read from the website. + self.logger.log(DEBUG_OBSESSIVE,'No HTML page for DigiKey configuration.') + raise PartHtmlError + html = BeautifulSoup(html, 'lxml') + try: + if currency_iso and not locale_iso: + money = pycountry.currencies.get(alpha_3=currency_iso.upper()) + locale_iso = pycountry.countries.get(numeric=money.numeric).alpha_2 + if locale_iso: + locale_iso = locale_iso.upper() + country = pycountry.countries.get(alpha_2=locale_iso.upper()).name + html = html.find('li', text=re.compile(country, re.IGNORECASE)) + url = html.find('a', id='linkcolor').get('href') + + distributor_dict[self.name]['site']['url'] = url + distributor_dict[self.name]['site']['currency'] = pycountry.currencies.get(numeric=country.numeric).alpha_3 + distributor_dict[self.name]['site']['locale'] = locale_iso + + # Fetch cookies for new URL. + self.browser.scrape_URL(url) + except: + self.logger.log(DEBUG_OVERVIEW, 'Kept the last configuration {}, {} on {}.'.format( + pycountry.currencies.get(alpha_3=distributor_dict['digikey']['site']['currency']).name, + pycountry.countries.get(alpha_2=distributor_dict['digikey']['site']['locale']).name, + distributor_dict[self.name]['site']['url'] + )) # Keep the current configuration. + return + + def dist_get_part_num(self, html_tree): + '''@brief Get the part number from the Digikey product page. + @param html_tree `str()` html of the distributor part page. + @return `list()`of the parts that match. + ''' + try: + return re.sub('\s', '', html_tree.find('td', + id='reportPartNumber').text) + except AttributeError: + self.logger.log(DEBUG_OBSESSIVE, 'No Digikey part number found!') + return '' + + + def dist_get_qty_avail(self, html_tree): + '''@brief Get the available quantity of the part from the Digikey product page. + @param html_tree `str()` html of the distributor part page. + @return `int` avaliable quantity. 
+        '''
+        try:
+            qty_tree = html_tree.find('td', id='quantityAvailable').find('span', id='dkQty')
+            qty_str = qty_tree.text
+        except AttributeError:
+            # No quantity found (not even 0) so this is probably a non-stocked part.
+            # Return None so the part won't show in the spreadsheet for this dist.
+            return None
+        try:
+            qty_str = re.search('([0-9,]*)', qty_str, re.IGNORECASE).group(1)
+            return int(re.sub('[^0-9]', '', qty_str))
         except (AttributeError, ValueError):
-        # Didn't find the usual quantity text field. This might be one of those
-        # input fields for requesting a quantity, so get the value from the
-        # input field.
+            try:
+                self.logger.log(DEBUG_OBSESSIVE, 'No Digikey part quantity found!')
+                return int(qty_tree.find('input', type='text').get('value'))
+            except (AttributeError, ValueError):
+                # Well, there's a quantityAvailable section in the website, but
+                # it doesn't contain anything decipherable. Let's just assume it's 0.
+                return 0
+
+    def dist_get_part_html_tree(self, pn, extra_search_terms='', url=None, descend=2, local_part_html=None):
+        '''@brief Find the Digikey HTML page for a part number and return the URL and parse tree.
+        @param pn Part number `str()`.
+        @param extra_search_terms
+        @param url
+        @param descend
+        @param local_part_html
+        @return (html `str()` of the page, url)
+        '''
+
+        # Use the part number to lookup the part using the site search function, unless a starting url was given.
+        if url is None:
+            url = distributor_dict['digikey']['site']['url'] + '/products/en?keywords=' + urlquote(
+                #'/scripts/DkSearch/dksus.dll?WT.z_header=search_go&lang=en&keywords=' + urlquote(
+                pn + ' ' + extra_search_terms,
+                safe='')
+            #url = distributor_dict['digikey']['site']['url'] + '/product-search/en?KeyWords=' + urlquote(pn,safe='') + '&WT.z_header=search_go'
+        elif url[0] == '/':
+            url = distributor_dict['digikey']['site']['url'] + url
+
+        # Open the URL, read the HTML from it, and parse it into a tree structure.
+        try:
+            html = self.browser.scrape_URL(url)
+        except:
+            self.logger.log(DEBUG_OBSESSIVE,'No HTML page for {} from {}'.format(pn, self.name))
+            raise PartHtmlError
+
+        # Abort if the part number isn't in the HTML somewhere.
+        # (Only use the numbers and letters to compare PN to HTML.)
+        if re.sub('[\W_]','',str.lower(pn)) not in re.sub('[\W_]','',str.lower(str(html))):
+            self.logger.log(DEBUG_OBSESSIVE,'No part number {} in HTML page from {}'.format(pn, self.name))
+            raise PartHtmlError
+
+        # Use the following code if Javascript challenge pages are used to block scrapers.
+        # try:
+        #     ghst = Ghost()
+        #     sess = ghst.start(plugins_enabled=False, download_images=False, show_scrollbars=False, javascript_enabled=False)
+        #     html, resources = sess.open(url)
+        #     print('type of HTML is {}'.format(type(html.content)))
+        #     html = html.content
+        # except Exception as e:
+        #     print('Exception reading with Ghost: {}'.format(e))
+
+        try:
+            tree = BeautifulSoup(html, 'lxml')
+        except Exception:
+            self.logger.log(DEBUG_OBSESSIVE,'No HTML tree for {} from {}'.format(pn, self.name))
+            raise PartHtmlError
+
+        # If the tree contains the tag for a product page, then return it.
+        if tree.find('div', class_='product-top-section') is not None:
+
+            # Digikey separates cut-tape and reel packaging, so we need to examine more pages
+            # to get all the pricing info. But don't descend any further if limit has been reached.
+            if descend > 0:
+                try:
+                    # Find all the URLs to alternate-packaging pages for this part.
+                    ap_urls = [
+                        ap.find('li', class_='lnkAltPack').find_all('a')[-1].get('href')
+                        for ap in tree.find(
+                            'div', class_='bota',
+                            id='additionalPackaging').find_all(
+                                'ul', class_='more-expander-item')
+                    ]
+                    self.logger.log(DEBUG_OBSESSIVE,'Found {} alternate packagings for {} from {}'.format(len(ap_urls), pn, self.name))
+                    ap_trees_and_urls = []  # Initialize as empty in case no alternate packagings are found.
+                    try:
+                        ap_trees_and_urls = [self.dist_get_part_html_tree(pn,
+                                    extra_search_terms, ap_url, descend=0)
+                                             for ap_url in ap_urls]
+                    except Exception:
+                        self.logger.log(DEBUG_OBSESSIVE,'Failed to find alternate packagings for {} from {}'.format(pn, self.name))
+
+                    # Put the main tree on the list as well and then look through
+                    # the entire list for one that's non-reeled. Use this as the
+                    # main page for the part.
+                    ap_trees_and_urls.append((tree, url))
+                    if self.part_is_reeled(tree):
+                        for ap_tree, ap_url in ap_trees_and_urls:
+                            if not self.part_is_reeled(ap_tree):
+                                # Found a non-reeled part, so use it as the main page.
+                                tree = ap_tree
+                                url = ap_url
+                                break  # Done looking.
+
+                    # Now go through the other pages, merging their pricing and quantity
+                    # info into the main page.
+                    for ap_tree, ap_url in ap_trees_and_urls:
+                        if ap_tree is tree:
+                            continue  # Skip examining the main tree. It already contains its info.
+                        try:
+                            # Merge the pricing info from that into the main parse tree to make
+                            # a single, unified set of price tiers...
+                            self.merge_price_tiers(tree, ap_tree)
+                            # and merge available quantity, using the maximum found.
+                            self.merge_qty_avail(tree, ap_tree)
+                        except AttributeError:
+                            self.logger.log(DEBUG_OBSESSIVE,'Problem merging price/qty for {} from {}'.format(pn, self.name))
+                            continue
+                except AttributeError as e:
+                    self.logger.log(DEBUG_OBSESSIVE,'Problem parsing URLs from product page for {} from {}'.format(pn, self.name))
+
+            return tree, url  # Return the parse tree and the URL where it came from.
+
+        # If the tree is for a list of products, then examine the links to try to find the part number.
+        if tree.find('table', id='productTable') is not None:
+            self.logger.log(DEBUG_OBSESSIVE,'Found product table for {} from {}'.format(pn, self.name))
+            if descend <= 0:
+                self.logger.log(DEBUG_OBSESSIVE,'Passed descent limit for {} from {}'.format(pn, self.name))
+                raise PartHtmlError
+            else:
+                # Look for the table of products.
+                products = tree.find(
+                    'table',
+                    id='productTable').find('tbody').find_all('tr')
+
+                # Extract the product links for the part numbers from the table.
+                # Extract links for both manufacturer and catalog numbers.
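                # (Illustration only, not part of the patch: each result row is expected to
                #  carry both a manufacturer part number cell, 'tr-mfgPartNumber', and a
                #  Digi-Key catalog number cell, 'tr-dkPartNumber', so the closest-match
                #  search can hit either numbering scheme. For assumed values such as
                #      pn = 'MAX232CPE'
                #      part_numbers = ['MAX232ACPE+', 'MAX232CPE+', 'MAX3232CPE+']
                #  difflib.get_close_matches(pn, part_numbers, 1, 0.0) returns ['MAX232CPE+'];
                #  the 0.0 cutoff means the best available candidate is always chosen, however
                #  poor the match.)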
+                product_links = [p.find('td',
+                                         class_='tr-mfgPartNumber').a
+                                 for p in products]
+                product_links.extend([p.find('td',
+                                              class_='tr-dkPartNumber').a
+                                      for p in products])
+
+                # Extract all the part numbers from the text portion of the links.
+                part_numbers = [l.text for l in product_links]
+
+                # Look for the part number in the list that most closely matches the requested part number.
+                match = difflib.get_close_matches(pn, part_numbers, 1, 0.0)[0]
+
+                # Now look for the link that goes with the closest matching part number.
+                for l in product_links:
+                    if l.text == match:
+                        # Get the tree for the linked-to page and return that.
+                        self.logger.log(DEBUG_OBSESSIVE,'Selecting {} from product table for {} from {}'.format(l.text.strip(), pn, self.name))
+                        return self.dist_get_part_html_tree(pn, extra_search_terms,
+                                    url=l.get('href', ''),
+                                    descend=descend - 1)
+
+        # If the HTML contains a list of part categories, then give up.
+        if tree.find('form', id='keywordSearchForm') is not None:
+            self.logger.log(DEBUG_OBSESSIVE,'Found high-level part categories for {} from {}'.format(pn, self.name))
+            raise PartHtmlError
+
+        # I don't know what happened here, so give up.
+        self.logger.log(DEBUG_OBSESSIVE,'Unknown error for {} from {}'.format(pn, self.name))
+        raise PartHtmlError
+
+    def part_is_reeled(self, html_tree):
+        '''@brief Returns True if this Digi-Key part is reeled or Digi-reeled.
+        @param html_tree `str()` html of the distributor part page.
+        @return `True` or `False`.
+        '''
+        qty_tiers = list(self.dist_get_price_tiers(html_tree).keys())
+        if len(qty_tiers) > 0 and min(qty_tiers) >= 100:
+            return True
+        if html_tree.find('table',
+                          id='product-details-reel-pricing') is not None:
+            return True
+        return False
+
+    def merge_price_tiers(self, main_tree, alt_tree):
         '''Merge the price tiers from the alternate-packaging tree into the main tree.'''
         try:
             insertion_point = main_tree.find('table', id='product-dollars').find('tr')
             for tr in alt_tree.find('table', id='product-dollars').find_all('tr'):
                 insertion_point.insert_after(tr)
         except AttributeError:
-            logger.log(DEBUG_OBSESSIVE, 'Problem merging price tiers for Digikey part {} with alternate packaging!'.format(pn))
+            self.logger.log(DEBUG_OBSESSIVE, 'Problem merging price tiers for Digikey part {} with alternate packaging!'.format(pn))

-    def merge_qty_avail(main_tree, alt_tree):
+    def merge_qty_avail(self, main_tree, alt_tree):
         '''Merge the quantities from the alternate-packaging tree into the main tree.'''
         try:
             main_qty = get_qty_avail(main_tree)
@@ -235,144 +385,6 @@ def merge_qty_avail(main_tree, alt_tree):
             insertion_point = main_tree.find('td', id='quantityAvailable').find('span', id='dkQty')
             insertion_point.string = '{}'.format(merged_qty)
         except AttributeError:
-            logger.log(DEBUG_OBSESSIVE, 'Problem merging available quantities for Digikey part {} with alternate packaging!'.format(pn))
+            self.logger.log(DEBUG_OBSESSIVE, 'Problem merging available quantities for Digikey part {} with alternate packaging!'.format(pn))
-
-    # Use the part number to lookup the part using the site search function, unless a starting url was given.
-    if url is None:
-        url = distributor_dict['digikey']['site']['url'] + '/products/en?keywords=' + urlquote(
-            #'/scripts/DkSearch/dksus.dll?WT.z_header=search_go&lang=en&keywords=' + urlquote(
-            pn + ' ' + extra_search_terms,
-            safe='')
-        #url = distributor_dict['digikey']['site']['url'] + '/product-search/en?KeyWords=' + urlquote(pn,safe='') + '&WT.z_header=search_go'
-    elif url[0] == '/':
-        url = distributor_dict['digikey']['site']['url'] + url
-
-    # Open the URL, read the HTML from it, and parse it into a tree structure.
- try: - html = fake_browser(url, scrape_retries) - except: - logger.log(DEBUG_OBSESSIVE,'No HTML page for {} from {}'.format(pn, dist)) - raise PartHtmlError - - # Abort if the part number isn't in the HTML somewhere. - # (Only use the numbers and letters to compare PN to HTML.) - if re.sub('[\W_]','',str.lower(pn)) not in re.sub('[\W_]','',str.lower(str(html))): - logger.log(DEBUG_OBSESSIVE,'No part number {} in HTML page from {}'.format(pn, dist)) - raise PartHtmlError - - # Use the following code if Javascript challenge pages are used to block scrapers. - # try: - # ghst = Ghost() - # sess = ghst.start(plugins_enabled=False, download_images=False, show_scrollbars=False, javascript_enabled=False) - # html, resources = sess.open(url) - # print('type of HTML is {}'.format(type(html.content))) - # html = html.content - # except Exception as e: - # print('Exception reading with Ghost: {}'.format(e)) - - try: - tree = BeautifulSoup(html, 'lxml') - except Exception: - logger.log(DEBUG_OBSESSIVE,'No HTML tree for {} from {}'.format(pn, dist)) - raise PartHtmlError + self.logger.log(DEBUG_OBSESSIVE, 'Problem merging available quantities for Digikey part {} with alternate packaging!'.format(pn)) - # If the tree contains the tag for a product page, then return it. - if tree.find('div', class_='product-top-section') is not None: - - # Digikey separprint(ates cut-tape and reel packaging, so we need to examine more pages - # to get all the pricing info. But don't descend any further if limit has been reached. - if descend > 0: - try: - # Find all the URLs to alternate-packaging pages for this part. - ap_urls = [ - ap.find('li', class_='lnkAltPack').find_all('a')[-1].get('href') - for ap in tree.find( - 'div', class_='bota', - id='additionalPackaging').find_all( - 'ul', class_='more-expander-item') - ] - logger.log(DEBUG_OBSESSIVE,'Found {} alternate packagings for {} from {}'.format(len(ap_urls), pn, dist)) - ap_trees_and_urls = [] # Initialize as empty in case no alternate packagings are found. - try: - ap_trees_and_urls = [get_part_html_tree(dist, pn, - extra_search_terms, ap_url, descend=0, scrape_retries=scrape_retries) - for ap_url in ap_urls] - except Exception: - logger.log(DEBUG_OBSESSIVE,'Failed to find alternate packagings for {} from {}'.format(pn, dist)) - - # Put the main tree on the list as well and then look through - # the entire list for one that's non-reeled. Use this as the - # main page for the part. - ap_trees_and_urls.append((tree, url)) - if part_is_reeled(tree): - for ap_tree, ap_url in ap_trees_and_urls: - if not part_is_reeled(ap_tree): - # Found a non-reeled part, so use it as the main page. - tree = ap_tree - url = ap_url - break # Done looking. - - # Now go through the other pages, merging their pricing and quantity - # info into the main page. - for ap_tree, ap_url in ap_trees_and_urls: - if ap_tree is tree: - continue # Skip examining the main tree. It already contains its info. - try: - # Merge the pricing info from that into the main parse tree to make - # a single, unified set of price tiers... - merge_price_tiers(tree, ap_tree) - # and merge available quantity, using the maximum found. - merge_qty_avail(tree, ap_tree) - except AttributeError: - logger.log(DEBUG_OBSESSIVE,'Problem merging price/qty for {} from {}'.format(pn, dist)) - continue - except AttributeError as e: - logger.log(DEBUG_OBSESSIVE,'Problem parsing URLs from product page for {} from {}'.format(pn, dist)) - - return tree, url # Return the parse tree and the URL where it came from. 
- - # If the tree is for a list of products, then examine the links to try to find the part number. - if tree.find('table', id='productTable') is not None: - logger.log(DEBUG_OBSESSIVE,'Found product table for {} from {}'.format(pn, dist)) - if descend <= 0: - logger.log(DEBUG_OBSESSIVE,'Passed descent limit for {} from {}'.format(pn, dist)) - raise PartHtmlError - else: - # Look for the table of products. - products = tree.find( - 'table', - id='productTable').find('tbody').find_all('tr') - - # Extract the product links for the part numbers from the table. - # Extract links for both manufacturer and catalog numbers. - product_links = [p.find('td', - class_='tr-mfgPartNumber').a - for p in products] - product_links.extend([p.find('td', - class_='tr-dkPartNumber').a - for p in products]) - - # Extract all the part numbers from the text portion of the links. - part_numbers = [l.text for l in product_links] - - # Look for the part number in the list that most closely matches the requested part number. - match = difflib.get_close_matches(pn, part_numbers, 1, 0.0)[0] - - # Now look for the link that goes with the closest matching part number. - for l in product_links: - if l.text == match: - # Get the tree for the linked-to page and return that. - logger.log(DEBUG_OBSESSIVE,'Selecting {} from product table for {} from {}'.format(l.text.strip(), pn, dist)) - return get_part_html_tree(dist, pn, extra_search_terms, - url=l.get('href', ''), - descend=descend - 1, - scrape_retries=scrape_retries) - - # If the HTML contains a list of part categories, then give up. - if tree.find('form', id='keywordSearchForm') is not None: - logger.log(DEBUG_OBSESSIVE,'Found high-level part categories for {} from {}'.format(pn, dist)) - raise PartHtmlError - # I don't know what happened here, so give up. - logger.log(DEBUG_OBSESSIVE,'Unknown error for {} from {}'.format(pn, dist)) - raise PartHtmlError From 4e10f2b58eacff38617c654b6aa33437da81bd98 Mon Sep 17 00:00:00 2001 From: Max Maisel Date: Fri, 1 Jun 2018 15:22:26 +0200 Subject: [PATCH 07/29] Implemented dist_newark class. --- kicost/distributors/newark/__init__.py | 2 +- kicost/distributors/newark/newark.py | 337 +++++++++++++------------ 2 files changed, 174 insertions(+), 165 deletions(-) diff --git a/kicost/distributors/newark/__init__.py b/kicost/distributors/newark/__init__.py index 3be052ed8..ec6b05197 100644 --- a/kicost/distributors/newark/__init__.py +++ b/kicost/distributors/newark/__init__.py @@ -26,7 +26,7 @@ }, # Web site defitions. 'site': { - 'url': 'http://www.newark.com/', + 'url': 'https://www.newark.com/', 'currency': 'USD', 'locale': 'US' }, diff --git a/kicost/distributors/newark/newark.py b/kicost/distributors/newark/newark.py index 88079b644..bbd03fa9e 100644 --- a/kicost/distributors/newark/newark.py +++ b/kicost/distributors/newark/newark.py @@ -38,176 +38,185 @@ import difflib from bs4 import BeautifulSoup import http.client # For web scraping exceptions. -from .. import urlencode, urlquote, urlsplit, urlunsplit -from .. import fake_browser, WEB_SCRAPE_EXCEPTIONS +#from .. import urlencode, urlquote, urlsplit, urlunsplit +from .. import fake_browser from ...globals import PartHtmlError from ...globals import logger, DEBUG_OVERVIEW, DEBUG_DETAILED, DEBUG_OBSESSIVE +from .. 
import distributor, distributor_dict + +from urllib.parse import quote_plus as urlquote + +class dist_newark(distributor.distributor): + def __init__(self, scrape_retries, log_level, throttle_delay): + super(dist_newark, self).__init__(scrape_retries, log_level, throttle_delay) + self.name = 'newark' + self.domain = distributor_dict[self.name]['site']['url'] + + self.browser.scrape_URL(self.domain) + self.browser.show_cookies(self.name) + + def dist_get_price_tiers(self, html_tree): + '''@brief Get the pricing tiers from the parsed tree of the Newark product page. + @param html_tree `str()` html of the distributor part page. + @return `dict()` price breaks, the keys are the quantities breaks. + ''' + price_tiers = {} + try: + qty_strs = [] + for qty in html_tree.find( + 'table', + class_=('tableProductDetailPrice', 'pricing')).find_all( + 'td', + class_='qty'): + qty_strs.append(qty.text) + price_strs = [] + for price in html_tree.find( + 'table', + class_=('tableProductDetailPrice', 'pricing')).find_all( + 'td', + class_='threeColTd'): + price_strs.append(price.text) + qtys_prices = list(zip(qty_strs, price_strs)) + for qty_str, price_str in qtys_prices: + try: + qty = re.search('(\s*)([0-9,]+)', qty_str).group(2) + qty = int(re.sub('[^0-9]', '', qty)) + price_tiers[qty] = float(re.sub('[^0-9\.]', '', price_str)) + except (TypeError, AttributeError, ValueError): + continue + except AttributeError: + # This happens when no pricing info is found in the tree. + self.logger.log(DEBUG_OBSESSIVE, 'No Newark pricing information found!') + return price_tiers # Return empty price tiers. + return price_tiers + + + def dist_get_part_num(self, html_tree): + '''@brief Get the part number from the Newark product page. + @param html_tree `str()` html of the distributor part page. + @return `list()`of the parts that match. + ''' + try: + # Newark catalog number is stored in a description list, so get + # all the list terms and descriptions, strip all the spaces from those, + # and pair them up. + div = html_tree.find('div', class_='productDescription').find('dl') + dt = [re.sub('\s','',d.text) for d in div.find_all('dt')] + dd = [re.sub('\s','',d.text) for d in div.find_all('dd')] + dtdd = {k:v for k,v in zip(dt,dd)} # Pair terms with descriptions. + return dtdd.get('NewarkPartNo.:', '') + except KeyError: + self.logger.log(DEBUG_OBSESSIVE, 'No Newark catalog number found!') + return '' # No catalog number found in page. + except AttributeError: + self.logger.log(DEBUG_OBSESSIVE, 'No Newark product description found!') + return '' # No ProductDescription found in page. + + + def dist_get_qty_avail(self, html_tree): + '''@brief Get the available quantity of the part from the Newark product page. + @param html_tree `str()` html of the distributor part page. + @return `int` avaliable quantity. + ''' + try: + qty_str = html_tree.find('p', class_='availabilityHeading').text + except (AttributeError, ValueError): + # No quantity found (not even 0) so this is probably a non-stocked part. + # Return None so the part won't show in the spreadsheet for this dist. + return None + try: + qty = re.sub('[^0-9]','',qty_str) # Strip all non-number chars. + return int(re.sub('[^0-9]', '', qty_str)) # Return integer for quantity. + except ValueError: + # No quantity found (not even 0) so this is probably a non-stocked part. + # Return None so the part won't show in the spreadsheet for this dist. 
+ self.logger.log(DEBUG_OBSESSIVE, 'No Newark part quantity found!') + return None + + + def dist_get_part_html_tree(self, pn, extra_search_terms='', url=None, descend=2, local_part_html=None): + '''@brief Find the Newark HTML page for a part number and return the URL and parse tree. + @param pn Part number `str()`. + @param extra_search_terms + @param url + @param descend + @param local_part_html + @return (html `str()` of the page, url) + ''' + + # Use the part number to lookup the part using the site search function, unless a starting url was given. + if url is None: + url = 'http://www.newark.com/webapp/wcs/stores/servlet/Search?catalogId=15003&langId=-1&storeId=10194&gs=true&st=' + urlquote( + pn + ' ' + extra_search_terms, + safe='') + elif url[0] == '/': + url = 'http://www.newark.com' + url + elif url.startswith('..'): + url = 'http://www.newark.com/Search/' + url + + # Open the URL, read the HTML from it, and parse it into a tree structure. + try: + html = self.browser.scrape_URL(url) + except: + self.logger.log(DEBUG_OBSESSIVE,'No HTML page for {} from {}'.format(pn, self.name)) + raise PartHtmlError -def get_price_tiers(html_tree): - '''@brief Get the pricing tiers from the parsed tree of the Newark product page. - @param html_tree `str()` html of the distributor part page. - @return `dict()` price breaks, the keys are the quantities breaks. - ''' - price_tiers = {} - try: - qty_strs = [] - for qty in html_tree.find( - 'table', - class_=('tableProductDetailPrice', 'pricing')).find_all( - 'td', - class_='qty'): - qty_strs.append(qty.text) - price_strs = [] - for price in html_tree.find( - 'table', - class_=('tableProductDetailPrice', 'pricing')).find_all( - 'td', - class_='threeColTd'): - price_strs.append(price.text) - qtys_prices = list(zip(qty_strs, price_strs)) - for qty_str, price_str in qtys_prices: - try: - qty = re.search('(\s*)([0-9,]+)', qty_str).group(2) - qty = int(re.sub('[^0-9]', '', qty)) - price_tiers[qty] = float(re.sub('[^0-9\.]', '', price_str)) - except (TypeError, AttributeError, ValueError): - continue - except AttributeError: - # This happens when no pricing info is found in the tree. - logger.log(DEBUG_OBSESSIVE, 'No Newark pricing information found!') - return price_tiers # Return empty price tiers. - return price_tiers - - -def get_part_num(html_tree): - '''@brief Get the part number from the Newark product page. - @param html_tree `str()` html of the distributor part page. - @return `list()`of the parts that match. - ''' - try: - # Newark catalog number is stored in a description list, so get - # all the list terms and descriptions, strip all the spaces from those, - # and pair them up. - div = html_tree.find('div', class_='productDescription').find('dl') - dt = [re.sub('\s','',d.text) for d in div.find_all('dt')] - dd = [re.sub('\s','',d.text) for d in div.find_all('dd')] - dtdd = {k:v for k,v in zip(dt,dd)} # Pair terms with descriptions. - return dtdd.get('NewarkPartNo.:', '') - except KeyError: - logger.log(DEBUG_OBSESSIVE, 'No Newark catalog number found!') - return '' # No catalog number found in page. - except AttributeError: - logger.log(DEBUG_OBSESSIVE, 'No Newark product description found!') - return '' # No ProductDescription found in page. - - -def get_qty_avail(html_tree): - '''@brief Get the available quantity of the part from the Newark product page. - @param html_tree `str()` html of the distributor part page. - @return `int` avaliable quantity. 
- ''' - try: - qty_str = html_tree.find('p', class_='availabilityHeading').text - except (AttributeError, ValueError): - # No quantity found (not even 0) so this is probably a non-stocked part. - # Return None so the part won't show in the spreadsheet for this dist. - return None - try: - qty = re.sub('[^0-9]','',qty_str) # Strip all non-number chars. - return int(re.sub('[^0-9]', '', qty_str)) # Return integer for quantity. - except ValueError: - # No quantity found (not even 0) so this is probably a non-stocked part. - # Return None so the part won't show in the spreadsheet for this dist. - logger.log(DEBUG_OBSESSIVE, 'No Newark part quantity found!') - return None - - -def get_part_html_tree(dist, pn, extra_search_terms='', url=None, descend=2, local_part_html=None, scrape_retries=2): - '''@brief Find the Newark HTML page for a part number and return the URL and parse tree. - @param dist - @param pn Part number `str()`. - @param extra_search_terms - @param url - @param descend - @param local_part_html - @param scrape_retries `int` Quantity of retries in case of fail. - @return (html `str()` of the page, url) - ''' - - # Use the part number to lookup the part using the site search function, unless a starting url was given. - if url is None: - url = 'http://www.newark.com/webapp/wcs/stores/servlet/Search?catalogId=15003&langId=-1&storeId=10194&gs=true&st=' + urlquote( - pn + ' ' + extra_search_terms, - safe='') - elif url[0] == '/': - url = 'http://www.newark.com' + url - elif url.startswith('..'): - url = 'http://www.newark.com/Search/' + url - - # Open the URL, read the HTML from it, and parse it into a tree structure. - try: - html = fake_browser(url, scrape_retries) - except: - logger.log(DEBUG_OBSESSIVE,'No HTML page for {} from {}'.format(pn, dist)) - raise PartHtmlError - - try: - tree = BeautifulSoup(html, 'lxml') - except Exception: - logger.log(DEBUG_OBSESSIVE,'No HTML tree for {} from {}'.format(pn, dist)) - raise PartHtmlError - - # Abort if the part number isn't in the HTML somewhere. - # (Only use the numbers and letters to compare PN to HTML.) - if re.sub('[\W_]','',str.lower(pn)) not in re.sub('[\W_]','',str.lower(str(html))): - logger.log(DEBUG_OBSESSIVE,'No part number {} in HTML page from {}'.format(pn, dist)) - raise PartHtmlError - - # If the tree contains the tag for a product page, then just return it. - if tree.find('div', class_='productDisplay', id='page') is not None: - return tree, url + try: + tree = BeautifulSoup(html, 'lxml') + except Exception: + self.logger.log(DEBUG_OBSESSIVE,'No HTML tree for {} from {}'.format(pn, self.name)) + raise PartHtmlError - # If the tree is for a list of products, then examine the links to try to find the part number. - if tree.find('table', class_='productLister', id='sProdList') is not None: - logger.log(DEBUG_OBSESSIVE,'Found product table for {} from {}'.format(pn, dist)) - if descend <= 0: - logger.log(DEBUG_OBSESSIVE,'Passed descent limit for {} from {}'.format(pn, dist)) + # Abort if the part number isn't in the HTML somewhere. + # (Only use the numbers and letters to compare PN to HTML.) + if re.sub('[\W_]','',str.lower(pn)) not in re.sub('[\W_]','',str.lower(str(html))): + self.logger.log(DEBUG_OBSESSIVE,'No part number {} in HTML page from {}'.format(pn, self.name)) raise PartHtmlError - else: - # Look for the table of products. - products = tree.find('table', - class_='productLister', - id='sProdList').find('tbody').find_all('tr') - - # Extract the product links for the part numbers from the table. 
- product_links = [] - for p in products: - try: - product_links.append( - p.find('td', class_='mftrPart').find('a')) - except AttributeError: - continue - # Extract all the part numbers from the text portion of the links. - part_numbers = [l.text for l in product_links] + # If the tree contains the tag for a product page, then just return it. + if tree.find('div', class_='productDisplay', id='page') is not None: + return tree, url - # Look for the part number in the list that most closely matches the requested part number. - try: - match = difflib.get_close_matches(pn, part_numbers, 1, 0.0)[0] - except IndexError: + # If the tree is for a list of products, then examine the links to try to find the part number. + if tree.find('table', class_='productLister', id='sProdList') is not None: + self.logger.log(DEBUG_OBSESSIVE,'Found product table for {} from {}'.format(pn, self.name)) + if descend <= 0: + self.logger.log(DEBUG_OBSESSIVE,'Passed descent limit for {} from {}'.format(pn, self.name)) raise PartHtmlError - - # Now look for the link that goes with the closest matching part number. - for l in product_links: - if l.text == match: - # Get the tree for the linked-to page and return that. - logger.log(DEBUG_OBSESSIVE,'Selecting {} from product table for {} from {}'.format(l.text.strip(), pn, dist)) - return get_part_html_tree(dist, pn, extra_search_terms, - url=l.get('href', ''), - descend=descend-1, - scrape_retries=scrape_retries) - - # I don't know what happened here, so give up. - logger.log(DEBUG_OBSESSIVE,'Unknown error for {} from {}'.format(pn, dist)) - raise PartHtmlError + else: + # Look for the table of products. + products = tree.find('table', + class_='productLister', + id='sProdList').find('tbody').find_all('tr') + + # Extract the product links for the part numbers from the table. + product_links = [] + for p in products: + try: + product_links.append( + p.find('td', class_='mftrPart').find('a')) + except AttributeError: + continue + + # Extract all the part numbers from the text portion of the links. + part_numbers = [l.text for l in product_links] + + # Look for the part number in the list that most closely matches the requested part number. + try: + match = difflib.get_close_matches(pn, part_numbers, 1, 0.0)[0] + except IndexError: + raise PartHtmlError + + # Now look for the link that goes with the closest matching part number. + for l in product_links: + if l.text == match: + # Get the tree for the linked-to page and return that. + self.logger.log(DEBUG_OBSESSIVE,'Selecting {} from product table for {} from {}'.format(l.text.strip(), pn, self.name)) + return self.dist_get_part_html_tree(pn, extra_search_terms, + url=l.get('href', ''), + descend=descend-1) + + # I don't know what happened here, so give up. + self.logger.log(DEBUG_OBSESSIVE,'Unknown error for {} from {}'.format(pn, self.name)) + raise PartHtmlError From 7fcc91fcb9f8e1d9a5cacf81dbc75e5fe05f0f91 Mon Sep 17 00:00:00 2001 From: Max Maisel Date: Fri, 1 Jun 2018 15:22:39 +0200 Subject: [PATCH 08/29] Implemented dist_rs class. --- kicost/distributors/rs/__init__.py | 2 +- kicost/distributors/rs/rs.py | 275 +++++++++++++++-------------- 2 files changed, 143 insertions(+), 134 deletions(-) diff --git a/kicost/distributors/rs/__init__.py b/kicost/distributors/rs/__init__.py index de299c740..5c06b17b3 100644 --- a/kicost/distributors/rs/__init__.py +++ b/kicost/distributors/rs/__init__.py @@ -25,7 +25,7 @@ }, # Web site defitions. 
'site': { - 'url': 'http://rs-online.com/', + 'url': 'https://rs-online.com/', 'currency': 'USD', 'locale': 'UK' }, diff --git a/kicost/distributors/rs/rs.py b/kicost/distributors/rs/rs.py index 61de641ad..34b58fa8d 100644 --- a/kicost/distributors/rs/rs.py +++ b/kicost/distributors/rs/rs.py @@ -34,148 +34,157 @@ import re, difflib from bs4 import BeautifulSoup import http.client # For web scraping exceptions. -from .. import urlencode, urlquote, urlsplit, urlunsplit +#from .. import urlencode, urlquote, urlsplit, urlunsplit from .. import fake_browser from ...globals import PartHtmlError from ...globals import logger, DEBUG_OVERVIEW, DEBUG_DETAILED, DEBUG_OBSESSIVE from currency_converter import CurrencyConverter currency = CurrencyConverter() +from .. import distributor, distributor_dict -def get_price_tiers(html_tree): - '''@brief Get the pricing tiers from the parsed tree of the RS Components product page. - @param html_tree `str()` html of the distributor part page. - @return `dict()` price breaks, the keys are the quantities breaks. - ''' - price_tiers = {} - - try: - for row in html_tree.find_all('div', class_='table-row value-row'): - qty = row.find('div', - class_='breakRangeWithoutUnit col-xs-4').text - price = row.find('div', - class_='unitPrice col-xs-4').text - try: - qty = int( re.findall('\s*([0-9\,]+)', qty)[0] ) - price = re.sub('[^0-9\.]', '', price.replace(',','.') ) - price = currency.convert(float(price), 'EUR', 'USD') - price_tiers[qty] = price - except (TypeError, AttributeError, ValueError): - continue - except AttributeError: - # This happens when no pricing info is found in the tree. - return price_tiers # Return empty price tiers. - return price_tiers - -def get_part_num(html_tree): - '''@brief Get the part number from the RS product page. - @param html_tree `str()` html of the distributor part page. - @return `dict()` price breaks, the keys are the quantities breaks. - ''' - try: - pn_str = html_tree.find('span', class_='keyValue').text - pn = re.sub('[^0-9\-]','', pn_str) - return pn - except KeyError: - return '' # No catalog number found in page. - except AttributeError: - return '' # No ProductDescription found in page. - -def get_qty_avail(html_tree): - '''Get the available quantity of the part from the RS product page. - @param html_tree `str()` html of the distributor part page. - @return `int` avaliable quantity. - ''' +from urllib.parse import quote_plus as urlquote + +class dist_rs(distributor.distributor): + def __init__(self, scrape_retries, log_level, throttle_delay): + super(dist_rs, self).__init__(scrape_retries, log_level, throttle_delay) + self.name = 'rs' + self.domain = distributor_dict[self.name]['site']['url'] + + self.browser.scrape_URL(self.domain) + self.browser.show_cookies(self.name) + + def dist_get_price_tiers(self, html_tree): + '''@brief Get the pricing tiers from the parsed tree of the RS Components product page. + @param html_tree `str()` html of the distributor part page. + @return `dict()` price breaks, the keys are the quantities breaks. + ''' + price_tiers = {} - try: - # Note that 'availability' is misspelled in the container class name! - qty_str = html_tree.find('span', class_=('stock-msg-content', 'table-cell')).text - except (AttributeError, ValueError): - # No quantity found (not even 0) so this is probably a non-stocked part. - # Return None so the part won't show in the spreadsheet for this dist. - return None - try: - qty = re.sub('[^0-9]','',qty_str[0:10]) # Strip all non-number chars. 
- return int(qty) # Return integer for quantity. - except ValueError: - # No quantity found (not even 0) so this is probably a non-stocked part. - # Return None so the part won't show in the spreadsheet for this dist. - return None - -def get_part_html_tree(dist, pn, extra_search_terms='', url=None, descend=2, local_part_html=None, scrape_retries=2): - '''@brief Find the RS Components HTML page for a part number and return the URL and parse tree. - @param dist - @param pn Part number `str()`. - @param extra_search_terms - @param url - @param descend - @param local_part_html - @param scrape_retries `int` Quantity of retries in case of fail. - @return (html `str()` of the page, url) - ''' + try: + for row in html_tree.find_all('div', class_='table-row value-row'): + qty = row.find('div', + class_='breakRangeWithoutUnit col-xs-4').text + price = row.find('div', + class_='unitPrice col-xs-4').text + try: + qty = int( re.findall('\s*([0-9\,]+)', qty)[0] ) + price = re.sub('[^0-9\.]', '', price.replace(',','.') ) + price = currency.convert(float(price), 'EUR', 'USD') + price_tiers[qty] = price + except (TypeError, AttributeError, ValueError): + continue + except AttributeError: + # This happens when no pricing info is found in the tree. + return price_tiers # Return empty price tiers. + return price_tiers + + def dist_get_part_num(self, html_tree): + '''@brief Get the part number from the RS product page. + @param html_tree `str()` html of the distributor part page. + @return `dict()` price breaks, the keys are the quantities breaks. + ''' + try: + pn_str = html_tree.find('span', class_='keyValue').text + pn = re.sub('[^0-9\-]','', pn_str) + return pn + except KeyError: + return '' # No catalog number found in page. + except AttributeError: + return '' # No ProductDescription found in page. + + def dist_get_qty_avail(self, html_tree): + '''Get the available quantity of the part from the RS product page. + @param html_tree `str()` html of the distributor part page. + @return `int` avaliable quantity. + ''' - # Use the part number to lookup the part using the site search function, unless a starting url was given. - if url is None: - url = 'http://it.rs-online.com/web/c/?searchTerm=' + urlquote(pn + ' ' + extra_search_terms, safe='') - - elif url[0] == '/': - url = 'http://it.rs-online.com' + url - elif url.startswith('..'): - url = 'http://it.rs-online.com/Search/' + url - - # Open the URL, read the HTML from it, and parse it into a tree structure. - try: - html = fake_browser(url, scrape_retries) - except: - logger.log(DEBUG_OBSESSIVE,'No HTML page for {} from {}'.format(pn, dist)) - raise PartHtmlError + try: + # Note that 'availability' is misspelled in the container class name! + qty_str = html_tree.find('span', class_=('stock-msg-content', 'table-cell')).text + except (AttributeError, ValueError): + # No quantity found (not even 0) so this is probably a non-stocked part. + # Return None so the part won't show in the spreadsheet for this dist. + return None + try: + qty = re.sub('[^0-9]','',qty_str[0:10]) # Strip all non-number chars. + return int(qty) # Return integer for quantity. + except ValueError: + # No quantity found (not even 0) so this is probably a non-stocked part. + # Return None so the part won't show in the spreadsheet for this dist. + return None + + def dist_get_part_html_tree(self, pn, extra_search_terms='', url=None, descend=2, local_part_html=None): + '''@brief Find the RS Components HTML page for a part number and return the URL and parse tree. 
+ @param pn Part number `str()`. + @param extra_search_terms + @param url + @param descend + @param local_part_html + @return (html `str()` of the page, url) + ''' + + # Use the part number to lookup the part using the site search function, unless a starting url was given. + if url is None: + url = 'http://it.rs-online.com/web/c/?searchTerm=' + urlquote(pn + ' ' + extra_search_terms, safe='') + + elif url[0] == '/': + url = 'http://it.rs-online.com' + url + elif url.startswith('..'): + url = 'http://it.rs-online.com/Search/' + url + + # Open the URL, read the HTML from it, and parse it into a tree structure. + try: + html = self.browser.scrape_URL(url) + except: + self.logger.log(DEBUG_OBSESSIVE,'No HTML page for {} from {}'.format(pn, dist)) + raise PartHtmlError - try: - tree = BeautifulSoup(html, 'lxml') - except Exception: - logger.log(DEBUG_OBSESSIVE,'No HTML tree for {} from {}'.format(pn, dist)) - raise PartHtmlError + try: + tree = BeautifulSoup(html, 'lxml') + except Exception: + self.logger.log(DEBUG_OBSESSIVE,'No HTML tree for {} from {}'.format(pn, dist)) + raise PartHtmlError - # Abort if the part number isn't in the HTML somewhere. - # (Only use the numbers and letters to compare PN to HTML.) - if re.sub('[\W_]','',str.lower(pn)) not in re.sub('[\W_]','',str.lower(str(html))): - logger.log(DEBUG_OBSESSIVE,'No part number {} in HTML page from {}'.format(pn, dist)) - raise PartHtmlError - - # If the tree contains the tag for a product page, then just return it. - if tree.find('div', class_='advLineLevelContainer'): - return tree, url - - # If the tree is for a list of products, then examine the links to try to find the part number. - if tree.find('div', class_=('resultsTable','results-table-container')) is not None: - logger.log(DEBUG_OBSESSIVE,'Found product table for {} from {}'.format(pn, dist)) - if descend <= 0: - logger.log(DEBUG_OBSESSIVE,'Passed descent limit for {} from {}'.format(pn, dist)) + # Abort if the part number isn't in the HTML somewhere. + # (Only use the numbers and letters to compare PN to HTML.) + if re.sub('[\W_]','',str.lower(pn)) not in re.sub('[\W_]','',str.lower(str(html))): + self.logger.log(DEBUG_OBSESSIVE,'No part number {} in HTML page from {}'.format(pn, dist)) raise PartHtmlError - else: - # Look for the table of products. - products = tree.find('table', id='results-table').find_all( - 'tr', class_='resultRow') - - # Extract the product links for the part numbers from the table. - product_links = [p.find('a', class_='product-name').get('href') for p in products] - - # Extract all the part numbers from the text portion of the links. - part_numbers = [p.find('span', class_='text-contents').get_text() for p in products] - - # Look for the part number in the list that most closely matches the requested part number. - match = difflib.get_close_matches(pn, part_numbers, 1, 0.0)[0] - - # Now look for the link that goes with the closest matching part number. - for i in range(len(product_links)): - if part_numbers[i] == match: - # Get the tree for the linked-to page and return that. - logger.log(DEBUG_OBSESSIVE,'Selecting {} from product table for {} from {}'.format(part_numbers[i], pn, dist)) - return get_part_html_tree(dist, pn, extra_search_terms, - url=product_links[i], - descend=descend-1, - scrape_retries=scrape_retries) - - # I don't know what happened here, so give up. - logger.log(DEBUG_OBSESSIVE,'Unknown error for {} from {}'.format(pn, dist)) - raise PartHtmlError + + # If the tree contains the tag for a product page, then just return it. 
+ if tree.find('div', class_='advLineLevelContainer'): + return tree, url + + # If the tree is for a list of products, then examine the links to try to find the part number. + if tree.find('div', class_=('resultsTable','results-table-container')) is not None: + self.logger.log(DEBUG_OBSESSIVE,'Found product table for {} from {}'.format(pn, dist)) + if descend <= 0: + self.logger.log(DEBUG_OBSESSIVE,'Passed descent limit for {} from {}'.format(pn, dist)) + raise PartHtmlError + else: + # Look for the table of products. + products = tree.find('table', id='results-table').find_all( + 'tr', class_='resultRow') + + # Extract the product links for the part numbers from the table. + product_links = [p.find('a', class_='product-name').get('href') for p in products] + + # Extract all the part numbers from the text portion of the links. + part_numbers = [p.find('span', class_='text-contents').get_text() for p in products] + + # Look for the part number in the list that most closely matches the requested part number. + match = difflib.get_close_matches(pn, part_numbers, 1, 0.0)[0] + + # Now look for the link that goes with the closest matching part number. + for i in range(len(product_links)): + if part_numbers[i] == match: + # Get the tree for the linked-to page and return that. + self.logger.log(DEBUG_OBSESSIVE,'Selecting {} from product table for {} from {}'.format(part_numbers[i], pn, dist)) + return self.dist_get_part_html_tree(pn, extra_search_terms, + url=product_links[i], + descend=descend-1) + + # I don't know what happened here, so give up. + self.logger.log(DEBUG_OBSESSIVE,'Unknown error for {} from {}'.format(pn, dist)) + raise PartHtmlError From 5c7be6a22bcd5aba97fe839decaade166be9cdc6 Mon Sep 17 00:00:00 2001 From: Max Maisel Date: Fri, 1 Jun 2018 15:22:54 +0200 Subject: [PATCH 09/29] Implemented dist_tme class. --- kicost/distributors/tme/tme.py | 401 +++++++++++++++++---------------- 1 file changed, 203 insertions(+), 198 deletions(-) diff --git a/kicost/distributors/tme/tme.py b/kicost/distributors/tme/tme.py index a593b9e68..f2403e629 100644 --- a/kicost/distributors/tme/tme.py +++ b/kicost/distributors/tme/tme.py @@ -39,211 +39,216 @@ import json from bs4 import BeautifulSoup import http.client # For web scraping exceptions. -from .. import urlencode, urlquote, urlsplit, urlunsplit +#from .. import urlencode, urlquote, urlsplit, urlunsplit from .. import fake_browser from ...globals import PartHtmlError from ...globals import logger, DEBUG_OVERVIEW, DEBUG_DETAILED, DEBUG_OBSESSIVE -HTML_RESPONSE_RETRIES = 2 - -def __ajax_details(pn): - '''@brief Load part details from TME using XMLHttpRequest. - @param pn `str()` part number - @return (html, quantity avaliable) - ''' - data = urlencode({ - 'symbol': pn, - 'currency': 'USD' - }).encode("utf-8") - - try: - html = fake_browser('https://www.tme.eu/en/_ajax/ProductInformationPage/_getStocks.html', 4, ('X-Requested-With', 'XMLHttpRequest') ) - except: # Couldn't get a good read from the website. - logger.log(DEBUG_OBSESSIVE,'No AJAX data for {} from {}'.format(pn, 'TME')) - return None, None - - try: - r = r.decode('utf-8') # Convert bytes to string in Python 3. - p = json.loads(r).get('Products') - if p is not None and isinstance(p, list): - p = p[0] - html_tree = BeautifulSoup(p.get('PriceTpl', '').replace("\n", ""), "lxml") - quantity = p.get('InStock', '0') - return html_tree, quantity - else: +from .. 
import distributor, distributor_dict + +from urllib.parse import quote_plus as urlquote, urlencode + +class dist_tme(distributor.distributor): + def __init__(self, scrape_retries, log_level, throttle_delay): + super(dist_tme, self).__init__(scrape_retries, log_level, throttle_delay) + self.name = 'tme' + self.domain = distributor_dict[self.name]['site']['url'] + + self.browser.scrape_URL(self.domain) + self.browser.show_cookies(self.name) + + def __ajax_details(self, pn): + '''@brief Load part details from TME using XMLHttpRequest. + @param pn `str()` part number + @return (html, quantity avaliable) + ''' + data = urlencode({ + 'symbol': pn, + 'currency': 'USD' + }).encode("utf-8") + + try: + html = self.browser.scrape_URL('https://www.tme.eu/en/_ajax/ProductInformationPage/_getStocks.html', ('X-Requested-With', 'XMLHttpRequest')) + except: # Couldn't get a good read from the website. + self.logger.log(DEBUG_OBSESSIVE,'No AJAX data for {} from {}'.format(pn, 'TME')) return None, None - except (ValueError, KeyError, IndexError): - logger.log(DEBUG_OBSESSIVE, 'Could not obtain AJAX data from TME!') - return None, None - -def get_price_tiers(html_tree): - '''@brief Get the pricing tiers from the parsed tree of the TME product page. - @param html_tree `str()` html of the distributor part page. - @return `dict()` price breaks, the keys are the quantities breaks. - ''' - price_tiers = {} - try: - pn = get_part_num(html_tree) - if pn == '': - return price_tiers - - ajax_tree, quantity = __ajax_details(pn) - if ajax_tree is None: - return price_tiers - - qty_strs = [] - price_strs = [] - for tr in ajax_tree.find('tbody', id='prices_body').find_all('tr'): - td = tr.find_all('td') - if len(td) == 3: - qty_strs.append(td[0].text) - price_strs.append(td[2].text) - - qtys_prices = list(zip(qty_strs, price_strs)) - for qty_str, price_str in qtys_prices: - try: - qty = re.search('(\s*)([0-9,]+)', qty_str).group(2) - qty = int(re.sub('[^0-9]', '', qty)) - price_tiers[qty] = float(re.sub('[^0-9\.]', '', price_str)) - except (TypeError, AttributeError, ValueError, IndexError): - continue - except AttributeError: - # This happens when no pricing info is found in the tree. - logger.log(DEBUG_OBSESSIVE, 'No TME pricing information found!') - return price_tiers # Return empty price tiers. - return price_tiers - - -def get_part_num(html_tree): - '''@brief Get the part number from the TME product page. - @param html_tree `str()` html of the distributor part page. - @return `list()`of the parts that match. - ''' - try: - return html_tree.find('td', class_="pip-product-symbol").text - except AttributeError: - logger.log(DEBUG_OBSESSIVE, 'No TME part number found!') - return '' - - -def get_qty_avail(html_tree): - '''@brief Get the available quantity of the part from the TME product page. - @param html_tree `str()` html of the distributor part page. - @return `int` avaliable quantity. - ''' - pn = get_part_num(html_tree) - if pn == '': - logger.log(DEBUG_OBSESSIVE, 'No TME part quantity found!') - return None - - ajax_tree, qty_str = __ajax_details(pn) - if qty_str is None: - return None - - try: - return int(qty_str) - except ValueError: - # No quantity found (not even 0) so this is probably a non-stocked part. - # Return None so the part won't show in the spreadsheet for this dist. 
- logger.log(DEBUG_OBSESSIVE, 'No TME part quantity found!') - return None - - -def get_part_html_tree(dist, pn, extra_search_terms='', url=None, descend=2, local_part_html=None, scrape_retries=2): - '''@brief Find the TME HTML page for a part number and return the URL and parse tree. - @param dist - @param pn Part number `str()`. - @param extra_search_terms - @param url - @param descend - @param local_part_html - @param scrape_retries `int` Quantity of retries in case of fail. - @return (html `str()` of the page, url) - ''' - - global HTML_RESPONSE_RETRIES - HTML_RESPONSE_RETRIES = scrape_retries - - # Use the part number to lookup the part using the site search function, unless a starting url was given. - if url is None: - url = 'https://www.tme.eu/en/katalog/?search=' + urlquote( - pn + ' ' + extra_search_terms, - safe='') - elif url[0] == '/': - url = 'https://www.tme.eu' + url - - # Open the URL, read the HTML from it, and parse it into a tree structure. - try: - html = fake_browser(url, scrape_retries) - except: - logger.log(DEBUG_OBSESSIVE,'No HTML page for {} from {}'.format(pn, dist)) - raise PartHtmlError - # Abort if the part number isn't in the HTML somewhere. - # (Only use the numbers and letters to compare PN to HTML.) - if re.sub('[\W_]','',str.lower(pn)) not in re.sub('[\W_]','',str.lower(str(html))): - logger.log(DEBUG_OBSESSIVE,'No part number {} in HTML page from {} ({})'.format(pn, dist, url)) - raise PartHtmlError + try: + r = r.decode('utf-8') # Convert bytes to string in Python 3. + p = json.loads(r).get('Products') + if p is not None and isinstance(p, list): + p = p[0] + html_tree = BeautifulSoup(p.get('PriceTpl', '').replace("\n", ""), "lxml") + quantity = p.get('InStock', '0') + return html_tree, quantity + else: + return None, None + except (ValueError, KeyError, IndexError): + self.logger.log(DEBUG_OBSESSIVE, 'Could not obtain AJAX data from TME!') + return None, None - try: - tree = BeautifulSoup(html, 'lxml') - except Exception: - logger.log(DEBUG_OBSESSIVE,'No HTML tree for {} from {}'.format(pn, dist)) - raise PartHtmlError + def dist_get_price_tiers(self, html_tree): + '''@brief Get the pricing tiers from the parsed tree of the TME product page. + @param html_tree `str()` html of the distributor part page. + @return `dict()` price breaks, the keys are the quantities breaks. + ''' + price_tiers = {} + try: + pn = self.dist_get_part_num(html_tree) + if pn == '': + return price_tiers + + ajax_tree, quantity = self.__ajax_details(pn) + if ajax_tree is None: + return price_tiers + + qty_strs = [] + price_strs = [] + for tr in ajax_tree.find('tbody', id='prices_body').find_all('tr'): + td = tr.find_all('td') + if len(td) == 3: + qty_strs.append(td[0].text) + price_strs.append(td[2].text) + + qtys_prices = list(zip(qty_strs, price_strs)) + for qty_str, price_str in qtys_prices: + try: + qty = re.search('(\s*)([0-9,]+)', qty_str).group(2) + qty = int(re.sub('[^0-9]', '', qty)) + price_tiers[qty] = float(re.sub('[^0-9\.]', '', price_str)) + except (TypeError, AttributeError, ValueError, IndexError): + continue + except AttributeError: + # This happens when no pricing info is found in the tree. + self.logger.log(DEBUG_OBSESSIVE, 'No TME pricing information found!') + return price_tiers # Return empty price tiers. + return price_tiers + + + def dist_get_part_num(self, html_tree): + '''@brief Get the part number from the TME product page. + @param html_tree `str()` html of the distributor part page. + @return `list()`of the parts that match. 
+ ''' + try: + return html_tree.find('td', class_="pip-product-symbol").text + except AttributeError: + self.logger.log(DEBUG_OBSESSIVE, 'No TME part number found!') + return '' + + + def dist_get_qty_avail(self, html_tree): + '''@brief Get the available quantity of the part from the TME product page. + @param html_tree `str()` html of the distributor part page. + @return `int` avaliable quantity. + ''' + pn = self.dist_get_part_num(html_tree) + if pn == '': + self.logger.log(DEBUG_OBSESSIVE, 'No TME part quantity found!') + return None + + ajax_tree, qty_str = self.__ajax_details(pn) + if qty_str is None: + return None + + try: + return int(qty_str) + except ValueError: + # No quantity found (not even 0) so this is probably a non-stocked part. + # Return None so the part won't show in the spreadsheet for this dist. + self.logger.log(DEBUG_OBSESSIVE, 'No TME part quantity found!') + return None + + + def dist_get_part_html_tree(self, pn, extra_search_terms='', url=None, descend=2, local_part_html=None): + '''@brief Find the TME HTML page for a part number and return the URL and parse tree. + @param pn Part number `str()`. + @param extra_search_terms + @param url + @param descend + @param local_part_html + @return (html `str()` of the page, url) + ''' + + # Use the part number to lookup the part using the site search function, unless a starting url was given. + if url is None: + url = 'https://www.tme.eu/en/katalog/?search=' + urlquote( + pn + ' ' + extra_search_terms, + safe='') + elif url[0] == '/': + url = 'https://www.tme.eu' + url + + # Open the URL, read the HTML from it, and parse it into a tree structure. + try: + html = self.browser.scrape_URL(url) + except: + self.logger.log(DEBUG_OBSESSIVE,'No HTML page for {} from {}'.format(pn, self.name)) + raise PartHtmlError - # If the tree contains the tag for a product page, then just return it. - if tree.find('div', id='ph') is not None: - return tree, url + # Abort if the part number isn't in the HTML somewhere. + # (Only use the numbers and letters to compare PN to HTML.) + if re.sub('[\W_]','',str.lower(pn)) not in re.sub('[\W_]','',str.lower(str(html))): + self.logger.log(DEBUG_OBSESSIVE,'No part number {} in HTML page from {} ({})'.format(pn, self.name, url)) + raise PartHtmlError - # If the tree is for a list of products, then examine the links to try to find the part number. - if tree.find('table', id="products") is not None: - logger.log(DEBUG_OBSESSIVE,'Found product table for {} from {}'.format(pn, dist)) - if descend <= 0: - logger.log(DEBUG_OBSESSIVE,'Passed descent limit for {} from {}'.format(pn, dist)) + try: + tree = BeautifulSoup(html, 'lxml') + except Exception: + self.logger.log(DEBUG_OBSESSIVE,'No HTML tree for {} from {}'.format(pn, self.name)) raise PartHtmlError - else: - # Look for the table of products. - products = tree.find( - 'table', - id="products").find_all( - 'tr', - class_=('product-row')) - - # Extract the product links for the part numbers from the table. - product_links = [] - for p in products: - for a in p.find('td', class_='product').find_all('a'): - product_links.append(a) - - # Extract all the part numbers from the text portion of the links. - part_numbers = [l.text for l in product_links] - - # Look for the part number in the list that most closely matches the requested part number. - match = difflib.get_close_matches(pn, part_numbers, 1, 0.0)[0] - - # Now look for the link that goes with the closest matching part number. 
- for l in product_links: - try: - if (not l.get('href', '').startswith('./katalog')) and l.text == match: - # Get the tree for the linked-to page and return that. - logger.log(DEBUG_OBSESSIVE,'Selecting {} from product table for {} from {}'.format(l.text, pn, dist)) - # TODO: The current implementation does up to four HTTP - # requests per part (search, part details page for TME P/N, - # XHR for pricing information, and XHR for stock - # availability). This is mainly for the compatibility with - # other distributor implementations (html_tree gets passed - # to all functions). - # A modified implementation (which would pass JSON data - # obtained by the XHR instead of the HTML DOM tree) might be - # able to do the same with just two requests (search for TME - # P/N, XHR for pricing and stock availability). - return get_part_html_tree(dist, pn, extra_search_terms, - url=l.get('href', ''), - descend=descend-1, - scrape_retries=scrape_retries) - except KeyError: - pass # This happens if there is no 'href' in the link, so just skip it. - - # I don't know what happened here, so give up. - logger.log(DEBUG_OBSESSIVE,'Unknown error for {} from {}'.format(pn, dist)) - raise PartHtmlError + + # If the tree contains the tag for a product page, then just return it. + if tree.find('div', id='ph') is not None: + return tree, url + + # If the tree is for a list of products, then examine the links to try to find the part number. + if tree.find('table', id="products") is not None: + self.logger.log(DEBUG_OBSESSIVE,'Found product table for {} from {}'.format(pn, self.name)) + if descend <= 0: + self.logger.log(DEBUG_OBSESSIVE,'Passed descent limit for {} from {}'.format(pn, self.name)) + raise PartHtmlError + else: + # Look for the table of products. + products = tree.find( + 'table', + id="products").find_all( + 'tr', + class_=('product-row')) + + # Extract the product links for the part numbers from the table. + product_links = [] + for p in products: + for a in p.find('td', class_='product').find_all('a'): + product_links.append(a) + + # Extract all the part numbers from the text portion of the links. + part_numbers = [l.text for l in product_links] + + # Look for the part number in the list that most closely matches the requested part number. + match = difflib.get_close_matches(pn, part_numbers, 1, 0.0)[0] + + # Now look for the link that goes with the closest matching part number. + for l in product_links: + try: + if (not l.get('href', '').startswith('./katalog')) and l.text == match: + # Get the tree for the linked-to page and return that. + self.logger.log(DEBUG_OBSESSIVE,'Selecting {} from product table for {} from {}'.format(l.text, pn, self.name)) + # TODO: The current implementation does up to four HTTP + # requests per part (search, part details page for TME P/N, + # XHR for pricing information, and XHR for stock + # availability). This is mainly for the compatibility with + # other distributor implementations (html_tree gets passed + # to all functions). + # A modified implementation (which would pass JSON data + # obtained by the XHR instead of the HTML DOM tree) might be + # able to do the same with just two requests (search for TME + # P/N, XHR for pricing and stock availability). + return self.dist_get_part_html_tree(pn, extra_search_terms, + url=l.get('href', ''), + descend=descend-1) + except KeyError: + pass # This happens if there is no 'href' in the link, so just skip it. + + # I don't know what happened here, so give up. 
+ self.logger.log(DEBUG_OBSESSIVE,'Unknown error for {} from {}'.format(pn, self.name)) + raise PartHtmlError From 0b688232aeada2142b76c27c89b8d998e0f996fa Mon Sep 17 00:00:00 2001 From: Max Maisel Date: Fri, 1 Jun 2018 19:47:14 +0200 Subject: [PATCH 10/29] fake_browser: Replace "urllib" with "requests" to remove problematic "Connection: close" http header. --- kicost/distributors/fake_browser.py | 59 ++++++++++------------------- 1 file changed, 19 insertions(+), 40 deletions(-) diff --git a/kicost/distributors/fake_browser.py b/kicost/distributors/fake_browser.py index 7477cdec8..f9612251a 100644 --- a/kicost/distributors/fake_browser.py +++ b/kicost/distributors/fake_browser.py @@ -27,7 +27,7 @@ from random import choice import http.client # For web scraping exceptions. -import http.cookiejar +import requests from ..globals import DEBUG_OVERVIEW, DEBUG_DETAILED, DEBUG_OBSESSIVE @@ -155,61 +155,40 @@ def __init__(self, logger, scrape_retries): @param logger @param scrape_retries `int` Quantity of retries in case of fail. ''' - self.cookiejar = http.cookiejar.CookieJar() + self.userAgent = get_user_agent() - self.opener = urllib.request.build_opener(urllib.request.HTTPCookieProcessor(self.cookiejar)) + + # Use "requests" instead of "urllib" because "urllib" does not allow + # to remove "Connection: close" header which causes problems with some servers. + self.session = requests.session() + self.session.headers["User-Agent"] = self.userAgent + self.scrape_retries = scrape_retries self.logger = logger def show_cookies(self, name): - for x in self.cookiejar: - # TODO: use logger - self.logger.log(DEBUG_OBSESSIVE,"%s Cookie %s" % (name, x.name)) - print("%s Cookie %s" % (name, x.name)) + for x in self.session.cookies: + self.logger.log(DEBUG_OBSESSIVE,"%s Cookie %s" % (x.domain, x.name)) def add_cookie(self, domain, name, value): - self.cookiejar.set_cookie(http.cookiejar.Cookie( - version=0, - name=name, - value=value, - port=None, - port_specified=False, - domain=domain, - domain_specified=True, - domain_initial_dot=False, - path="/", - path_specified=False, - secure=False, - expires=None, - discard=False, - comment=None, - comment_url=None, - rest=None)) + self.session.cookies.set(name, value, domain=domain) + + def scrape_URL(self, url, add_header=[]): + headers = self.session.headers + for header in add_header: + self.session.headers[header[1]] = header[2] - def scrape_URL(self, url, add_header=None): for _ in range(self.scrape_retries): try: - req = Request(url) - if add_header: - req.add_header(add_header) - req.add_header('User-agent', self.userAgent) - req.add_header('Accept', 'text/html') - req.add_header('Accept-Language', 'en-US') - req.add_header('Accept-Encoding', 'identity') - response = self.opener.open(req, timeout=10) - html = response.read() + html = self.session.get(url, timeout=5).text break - #except WEB_SCRAPE_EXCEPTIONS: except Exception as ex: - # TODO: remove print - print('Exception of type "%s" while web-scraping %s' \ - % (type(ex).__name__, format(url))) self.logger.log(DEBUG_DETAILED,'Exception of type "%s" while web-scraping %s' \ % (type(ex).__name__, format(url))) pass else: - # TODO: remove print - print('No page') + self.session.headers = headers raise ValueError('No page') + self.session.headers = headers return html From 0bbfe98677dccaac9dcb90905dc7b5a64ad1a191 Mon Sep 17 00:00:00 2001 From: Max Maisel Date: Sat, 2 Jun 2018 11:15:55 +0200 Subject: [PATCH 11/29] Removed now unused file web_routines.py. 
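The fake_browser change just above replaces urllib with a persistent requests session so that cookies survive between requests and the problematic forced "Connection: close" header goes away; it retries a configurable number of times and adds extra headers only for the duration of a single request. A self-contained sketch of that pattern, written as a free function (the name, defaults, and User-Agent string here are illustrative, not part of the module):

    import requests

    def fetch_page(url, retries=3, extra_headers=None, timeout=5):
        # One persistent session keeps cookies and reuses connections.
        session = requests.Session()
        session.headers['User-Agent'] = 'Mozilla/5.0 (X11; Linux x86_64)'
        saved = dict(session.headers)       # Copy, so the extras can be undone later.
        session.headers.update(extra_headers or {})
        try:
            last_exc = None
            for _ in range(retries):
                try:
                    return session.get(url, timeout=timeout).text
                except requests.RequestException as exc:
                    last_exc = exc          # Transport-level error: try again.
            raise ValueError('No page') from last_exc
        finally:
            session.headers.clear()
            session.headers.update(saved)   # Restore the original header set.

Note the explicit dict() copy of the headers: restoring from an alias of the same header object would be a no-op, so temporary headers such as "X-Requested-With" need to be snapshotted before they are added.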
--- kicost/distributors/web_routines.py | 225 ---------------------------- 1 file changed, 225 deletions(-) delete mode 100644 kicost/distributors/web_routines.py diff --git a/kicost/distributors/web_routines.py b/kicost/distributors/web_routines.py deleted file mode 100644 index 450bc642c..000000000 --- a/kicost/distributors/web_routines.py +++ /dev/null @@ -1,225 +0,0 @@ -# MIT license -# -# Copyright (C) 2018 by XESS Corporation / Hildo G Jr -# -# Permission is hereby granted, free of charge, to any person obtaining a copy -# of this software and associated documentation files (the "Software"), to deal -# in the Software without restriction, including without limitation the rights -# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -# copies of the Software, and to permit persons to whom the Software is -# furnished to do so, subject to the following conditions: -# -# The above copyright notice and this permission notice shall be included in -# all copies or substantial portions of the Software. -# -# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN -# THE SOFTWARE. - -# Author information. -__author__ = 'Hildo Guillardi Junior' -__webpage__ = 'https://github.com/hildogjr/' -__company__ = 'University of Campinas - Brazil' - -# Libraries. -import sys -from bs4 import BeautifulSoup # XML file interpreter. -import multiprocessing # To deal with the parallel scrape. -import logging -from time import time -from random import choice -from ..eda_tools.eda_tools import order_refs # To better print the warnings about the parts. - -try: - # This is for Python 3. - from urllib.parse import urlsplit, urlunsplit -except ImportError: - # This is for Python 2. - from urlparse import urlsplit, urlunsplit - -from ..globals import logger, DEBUG_OVERVIEW, DEBUG_DETAILED, DEBUG_OBSESSIVE # Debug configurations. -from ..globals import SEPRTR -from ..globals import PartHtmlError -from . import distributor_dict - -import os, re - -# The distributor module directories will be found in this directory. -directory = os.path.dirname(__file__) - -# Search for the distributor modules and import them. -dist_modules = {} -for module in os.listdir(directory): - - # Avoid importing non-directories. - abs_module = os.path.join(directory, module) - if not os.path.isdir(abs_module): - continue - - # Avoid directories like __pycache__. - if module.startswith('__'): - continue - - # Import the module. - dist_modules[module] = __import__(module, globals(), locals(), [], level=1) - -__all__ = ['scrape_part', 'config_distributor'] - -def config_distributor(dist_name, locale_currency='USD'): - '''@brief Configure the distributor for some locale/country and - currency second ISO3166 and ISO4217 - - @param `str` dist Distributor to configure. - @param `str` Alpha 2 country or alpha 3 currency or even one slash other.''' - try: - dist_module = dist_modules[dist_name] - except KeyError: # When use local distributor with personalized name. 
- dist_module = dist_modules[distributor_dict[dist_name]['module']] - try: - if distributor_dict[dist_name]['scrape']=='web': - # Not make sense to configurate a local distributor (yet). - locale_currency = re.findall('\w{2,}', locale_currency) - locale = None - currency = None - for alpha in locale_currency: - if len(alpha)==2: - locale = alpha - elif len(alpha)==3: - currency = alpha - dist_module.define_locale_currency(locale_iso=locale, currency_iso=currency) - except AttributeError: - logger.warning('No currency/country configuration for {}.'.format(distributor_dict[dist_name]['label'])) - pass - - -def get_part_html_tree(part, dist, get_html_tree_func, local_part_html, scrape_retries, logger): - '''@brief Get the HTML tree for a part. - - Get the HTML tree for a part from the given distributor website or local HTML. - @param `str` part Part manufactor code or distributor stock code. - @param `str` dist Distributor do scrape. - @param `str` get_html_tree_func - @param `str` local_part_html - @param `int` scrape_retries Maximum times of web ritries. - @param logger Logger handle. - @return `str` with the HTML webpage.''' - - logger.log(DEBUG_OBSESSIVE, 'Looking in %s by %s:', distributor_dict[dist]['label'], order_refs(part.refs, True)) - - for extra_search_terms in set([part.fields.get('manf', ''), '']): - try: - # Search for part information using one of the following: - # 1) the distributor's catalog number. - # 2) the manufacturer's part number. - for key in (dist+'#', dist+SEPRTR+'cat#', 'manf#'): - if key in part.fields: - if part.fields[key]: - # Founded manufacturer / distributor code valid (not empty). - return get_html_tree_func(dist, part.fields[key], extra_search_terms, local_part_html=local_part_html, scrape_retries=scrape_retries) - # No distributor or manufacturer number, so give up. - else: - logger.warning("No '%s#' or 'manf#' field: cannot lookup part %s at %s.", dist, part.refs, dist) - return BeautifulSoup('', 'lxml'), '' - #raise PartHtmlError - except PartHtmlError: - pass - except AttributeError: - break - logger.warning("Part %s not found at %s.", order_refs(part.refs, False), distributor_dict[dist]['label']) - # If no HTML page was found, then return a tree for an empty page. - return BeautifulSoup('', 'lxml'), '' - - -def scrape_part(args): - '''@brief Scrape the data for a part from each distributor website or local HTML. - - Use distributors submodules to scrape each distributor part page and get - informations such as price, quantity avaliable and others; - - @param `int` Count of the main loop. - @param `str`String with the part number / distributor stock. - @param `dict` - @param `str` - @param `int`Number of scrape retries. - @param logger.getEffectiveLevel() - @param throttle_lock - @param throttle_tim - @return id, url, `str` distributor stock part number, `dict` price tiers, `int` qty avail, `dict` extrainfo dist - ''' - - id, part, distributor_dict, local_part_html, scrape_retries, log_level, throttle_lock, throttle_timeouts = args # Unpack the arguments. - - if multiprocessing.current_process().name == "MainProcess": - scrape_logger = logging.getLogger('kicost') - else: - scrape_logger = multiprocessing.get_logger() - handler = logging.StreamHandler(sys.stdout) - handler.setLevel(log_level) - scrape_logger.addHandler(handler) - scrape_logger.setLevel(log_level) - - # Create dictionaries for the various items of part data from each distributor. 
- url = {} - part_num = {} - price_tiers = {} - qty_avail = {} - info_dist = {} - - # Scrape the part data from each distributor website or the local HTML. - # Create a list of the distributor keys and randomly choose one of the - # keys to scrape. After scraping, remove the distributor key. - # Do this until all the distributors have been scraped. - distributors = list(distributor_dict.keys()) - while distributors: - - d = choice(distributors) # Randomly choose one of the available distributors. - - try: - #dist_module = getattr(THIS_MODULE, d) - dist_module = dist_modules[d] - except KeyError: # When use local distributor with personalized name. - dist_module = dist_modules[distributor_dict[d]['module']] - - # Try to access the list of distributor throttling timeouts. - # Abort if some other process is already using the timeouts. - if throttle_lock.acquire(blocking=False): - - # Check the throttling timeout for the chosen distributor to see if - # another access to its website is allowed. - if throttle_timeouts[d] <= time(): - - # Update the timeout for this distributor website and release the sync. lock. - throttle_timeouts[d] = time() + distributor_dict[d]['throttling_delay'] - throttle_lock.release() - - # Get the HTML tree for the part. - html_tree, url[d] = get_part_html_tree(part, d, dist_module.get_part_html_tree, local_part_html, scrape_retries, scrape_logger) - - # Call the functions that extract the data from the HTML tree. - part_num[d] = dist_module.get_part_num(html_tree) - qty_avail[d] = dist_module.get_qty_avail(html_tree) - price_tiers[d] = dist_module.get_price_tiers(html_tree) - - try: - # Get extra characeristics of the part in the web page. - # This will be use to comment in the 'cat#' column of the - # spreadsheet and some validations (in the future implementaions) - info_dist[d] = dist_module.get_extra_info(html_tree) - except: - info_dist[d] = {} - pass - - # The part data has been scraped from this distributor, so remove it from the list. - distributors.remove(d) - - # If the timeout for this distributor has not expired, then release - # the sync. lock and try another distributor. - else: - throttle_lock.release() - - # Return the part data. - return id, url, part_num, price_tiers, qty_avail, info_dist From 8cccfc771962b37b381cb37d008a1e00dc8c3a63 Mon Sep 17 00:00:00 2001 From: Max Maisel Date: Sat, 2 Jun 2018 11:22:01 +0200 Subject: [PATCH 12/29] Local distributor improvements and fixed custom local distributor names regression. 
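The web_routines.py module deleted above coordinated the parallel scrape: each worker grabbed a shared lock, randomly picked a distributor whose throttle timeout had expired, scraped it, and advanced that distributor's next allowed access time by its configured throttling delay. A single-process sketch of that bookkeeping (no multiprocessing lock; function and parameter names are illustrative):

    import time
    from random import choice

    def throttled_order(distributors, delays):
        # Visit every distributor once, honouring a per-distributor minimum
        # delay between accesses; 'delays' maps name -> seconds between hits.
        next_allowed = {d: 0.0 for d in distributors}
        remaining = list(distributors)
        while remaining:
            d = choice(remaining)              # Randomize order, like the removed code.
            wait = next_allowed[d] - time.time()
            if wait > 0:
                time.sleep(wait)               # Single process: just sleep it off.
            next_allowed[d] = time.time() + delays.get(d, 0.0)
            yield d                            # Caller scrapes distributor 'd' here.
            remaining.remove(d)

A caller would iterate it, e.g. for d in throttled_order(['digikey', 'mouser'], {'digikey': 5.0}): scrape(d), keeping the randomized order while never hitting a site faster than its configured delay.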
--- kicost/distributors/digikey/digikey.py | 8 ++--- kicost/distributors/distributor.py | 15 ++++------ kicost/distributors/farnell/farnell.py | 8 ++--- kicost/distributors/local/local.py | 29 +++++++++--------- kicost/distributors/mouser/mouser.py | 8 ++--- kicost/distributors/newark/newark.py | 8 ++--- kicost/distributors/rs/rs.py | 8 ++--- kicost/distributors/tme/tme.py | 8 ++--- kicost/kicost.py | 41 +++++++++++--------------- 9 files changed, 56 insertions(+), 77 deletions(-) diff --git a/kicost/distributors/digikey/digikey.py b/kicost/distributors/digikey/digikey.py index bd33295bd..b8f7561cb 100644 --- a/kicost/distributors/digikey/digikey.py +++ b/kicost/distributors/digikey/digikey.py @@ -52,9 +52,8 @@ import pycountry class dist_digikey(distributor.distributor): - def __init__(self, scrape_retries, log_level, throttle_delay): - super(dist_digikey, self).__init__(scrape_retries, log_level, throttle_delay) - self.name = 'digikey' + def __init__(self, name, scrape_retries, log_level, throttle_delay): + super(dist_digikey, self).__init__(name, scrape_retries, log_level, throttle_delay) self.domain = distributor_dict[self.name]['site']['url'] self.browser.scrape_URL(self.domain) @@ -199,13 +198,12 @@ def dist_get_qty_avail(self, html_tree): # it doesn't contain anything decipherable. Let's just assume it's 0. return 0 - def dist_get_part_html_tree(self, pn, extra_search_terms='', url=None, descend=2, local_part_html=None): + def dist_get_part_html_tree(self, pn, extra_search_terms='', url=None, descend=2): '''@brief Find the Digikey HTML page for a part number and return the URL and parse tree. @param pn Part number `str()`. @param extra_search_terms @param url @param descend - @param local_part_html @return (html `str()` of the page, url) ''' diff --git a/kicost/distributors/distributor.py b/kicost/distributors/distributor.py index 5510ac4b3..e14278176 100644 --- a/kicost/distributors/distributor.py +++ b/kicost/distributors/distributor.py @@ -58,8 +58,8 @@ import os, re class distributor: - def __init__(self, scrape_retries, log_level, throttle_delay): - self.name = None + def __init__(self, name, scrape_retries, log_level, throttle_delay): + self.name = name self.page_accessed = False self.scrape_retries = scrape_retries self.logger = logger @@ -109,7 +109,7 @@ def define_locale_currency(self, locale_currency='USD'): logger.warning('No currency/country configuration for {}.'.format(self.name)) pass - def scrape_part(self, id, part, local_part_html): + def scrape_part(self, id, part): '''@brief Scrape the data for a part from each distributor website or local HTML. Use distributors submodules to scrape each distributor part page and get @@ -117,7 +117,6 @@ def scrape_part(self, id, part, local_part_html): @param `int` Count of the main loop. @param `str` String with the part number / distributor stock. - @param `str` Local part HTML @return id, distributor_name, url, `str` distributor stock part number, `dict` price tiers, `int` qty avail, `dict` extrainfo dist ''' @@ -153,7 +152,7 @@ def scrape_part(self, id, part, local_part_html): % (self.name, distributor_dict[self.name]['scrape'])) # Get the HTML tree for the part. - html_tree, url = self.get_part_html_tree(part, local_part_html=local_part_html) + html_tree, url = self.get_part_html_tree(part) # Call the functions that extract the data from the HTML tree. part_num = self.dist_get_part_num(html_tree) @@ -172,12 +171,11 @@ def scrape_part(self, id, part, local_part_html): # Return the part data. 
return id, self.name, url, part_num, price_tiers, qty_avail, info_dist - def get_part_html_tree(self, part, local_part_html): + def get_part_html_tree(self, part): '''@brief Get the HTML tree for a part. Get the HTML tree for a part from the given distributor website or local HTML. @param `str` part Part manufactor code or distributor stock code. - @param `str` local_part_html @return `str` with the HTML webpage.''' self.logger.log(DEBUG_OBSESSIVE, 'Looking in %s by %s:', self.name, order_refs(part.refs, True)) @@ -191,8 +189,7 @@ def get_part_html_tree(self, part, local_part_html): if key in part.fields: if part.fields[key]: self.page_accessed = True - return self.dist_get_part_html_tree \ - (part.fields[key], extra_search_terms, local_part_html=local_part_html) + return self.dist_get_part_html_tree(part.fields[key], extra_search_terms) # No distributor or manufacturer number, so give up. else: self.page_accessed = False diff --git a/kicost/distributors/farnell/farnell.py b/kicost/distributors/farnell/farnell.py index fb645c25f..287cb368f 100644 --- a/kicost/distributors/farnell/farnell.py +++ b/kicost/distributors/farnell/farnell.py @@ -52,9 +52,8 @@ __author__='Giacinto Luigi Cerone' class dist_farnell(distributor.distributor): - def __init__(self, scrape_retries, log_level, throttle_delay): - super(dist_farnell, self).__init__(scrape_retries, log_level, throttle_delay) - self.name = 'farnell' + def __init__(self, name, scrape_retries, log_level, throttle_delay): + super(dist_farnell, self).__init__(name, scrape_retries, log_level, throttle_delay) self.domain = distributor_dict[self.name]['site']['url'] self.browser.scrape_URL(self.domain) @@ -135,13 +134,12 @@ def dist_get_qty_avail(self, html_tree): # Return None so the part won't show in the spreadsheet for this dist. return None - def dist_get_part_html_tree(self, pn, extra_search_terms='', url=None, descend=2, local_part_html=None): + def dist_get_part_html_tree(self, pn, extra_search_terms='', url=None, descend=2): '''@brief Find the farnell HTML page for a part number and return the URL and parse tree. @param pn Part number `str()`. @param extra_search_terms @param url @param descend - @param local_part_html @return (html `str()` of the page, url) ''' diff --git a/kicost/distributors/local/local.py b/kicost/distributors/local/local.py index a3900b6cd..673fda658 100644 --- a/kicost/distributors/local/local.py +++ b/kicost/distributors/local/local.py @@ -44,18 +44,20 @@ from urllib.parse import urlsplit, urlunsplit class dist_local(distributor.distributor): - def __init__(self, scrape_retries, log_level, throttle_delay): - super(dist_local, self).__init__(scrape_retries, log_level, throttle_delay) - self.name = 'local' + # Static variable which contains local part html. + html = None - def create_part_html(self, parts, distributors): + def __init__(self, name, scrape_retries, log_level, throttle_delay): + super(dist_local, self).__init__(name, scrape_retries, log_level, throttle_delay) + + def create_part_html(parts, distributors, logger): '''@brief Create HTML page containing info for local (non-webscraped) parts. @param parts `list()` of parts. @parm `list()`of the distributors to check each one is local. 
- @return `str()` of the HTML page to be read by `get_part_html_tree()` + @param logger ''' - self.logger.log(DEBUG_OVERVIEW, 'Create HTML page for parts with custom pricing...') + logger.log(DEBUG_OVERVIEW, 'Create HTML page for parts with custom pricing...') doc, tag, text = Doc().tagtext() with tag('html'): @@ -115,10 +117,9 @@ def make_random_catalog_number(p): except: pass - html = doc.getvalue() - if self.logger.isEnabledFor(DEBUG_OBSESSIVE): - print(indent(html)) - return html + dist_local.html = doc.getvalue() + if logger.isEnabledFor(DEBUG_OBSESSIVE): + print(indent(dist_local.html)) def dist_get_price_tiers(self, html_tree): @@ -171,20 +172,20 @@ def dist_get_qty_avail(self, html_tree): self.logger.log(DEBUG_OBSESSIVE, 'No local part quantity found!') return 0 - # TODO: dist param - def dist_get_part_html_tree(self, pn, extra_search_terms='', url=None, descend=None, local_part_html=None): + def dist_get_part_html_tree(self, pn, extra_search_terms='', url=None, descend=None): '''Extract the HTML tree from the HTML page for local parts. @param pn Part number `str()`. @param extra_search_terms @param url @param descend - @param local_part_html @return (html `str()` of the page, `None`) The second argument is always `None` bacause there is not url to return. ''' # Extract the HTML tree from the local part HTML page. try: - tree = BeautifulSoup(local_part_html, 'lxml') + print("dist_local.html") + print(dist_local.html) + tree = BeautifulSoup(dist_local.html, 'lxml') except Exception: raise PartHtmlError diff --git a/kicost/distributors/mouser/mouser.py b/kicost/distributors/mouser/mouser.py index ecd454c97..23fd54b4e 100644 --- a/kicost/distributors/mouser/mouser.py +++ b/kicost/distributors/mouser/mouser.py @@ -48,9 +48,8 @@ from urllib.parse import quote_plus as urlquote class dist_mouser(distributor.distributor): - def __init__(self, scrape_retries, log_level, throttle_delay): - super(dist_mouser, self).__init__(scrape_retries, log_level, throttle_delay) - self.name = 'mouser' + def __init__(self, name, scrape_retries, log_level, throttle_delay): + super(dist_mouser, self).__init__(name, scrape_retries, log_level, throttle_delay) self.domain = distributor_dict[self.name]['site']['url'] self.browser.add_cookie('.mouser.com', 'preferences', 'ps=www2&pl=en-US&pc_www2=USDe') @@ -140,13 +139,12 @@ def dist_get_qty_avail(self, html_tree): self.logger.log(DEBUG_OBSESSIVE, 'No Mouser part quantity found!') return None - def dist_get_part_html_tree(self, pn, extra_search_terms='', url=None, descend=2, local_part_html=None): + def dist_get_part_html_tree(self, pn, extra_search_terms='', url=None, descend=2): '''@brief Find the Mouser HTML page for a part number and return the URL and parse tree. @param pn Part number `str()`. 
@param extra_search_terms @param url @param descend - @param local_part_html @return (html `str()` of the page, url) ''' diff --git a/kicost/distributors/newark/newark.py b/kicost/distributors/newark/newark.py index bbd03fa9e..44532dc95 100644 --- a/kicost/distributors/newark/newark.py +++ b/kicost/distributors/newark/newark.py @@ -48,9 +48,8 @@ from urllib.parse import quote_plus as urlquote class dist_newark(distributor.distributor): - def __init__(self, scrape_retries, log_level, throttle_delay): - super(dist_newark, self).__init__(scrape_retries, log_level, throttle_delay) - self.name = 'newark' + def __init__(self, name, scrape_retries, log_level, throttle_delay): + super(dist_newark, self).__init__(name, scrape_retries, log_level, throttle_delay) self.domain = distributor_dict[self.name]['site']['url'] self.browser.scrape_URL(self.domain) @@ -135,13 +134,12 @@ def dist_get_qty_avail(self, html_tree): return None - def dist_get_part_html_tree(self, pn, extra_search_terms='', url=None, descend=2, local_part_html=None): + def dist_get_part_html_tree(self, pn, extra_search_terms='', url=None, descend=2): '''@brief Find the Newark HTML page for a part number and return the URL and parse tree. @param pn Part number `str()`. @param extra_search_terms @param url @param descend - @param local_part_html @return (html `str()` of the page, url) ''' diff --git a/kicost/distributors/rs/rs.py b/kicost/distributors/rs/rs.py index 34b58fa8d..bf07c42ec 100644 --- a/kicost/distributors/rs/rs.py +++ b/kicost/distributors/rs/rs.py @@ -46,9 +46,8 @@ from urllib.parse import quote_plus as urlquote class dist_rs(distributor.distributor): - def __init__(self, scrape_retries, log_level, throttle_delay): - super(dist_rs, self).__init__(scrape_retries, log_level, throttle_delay) - self.name = 'rs' + def __init__(self, name, scrape_retries, log_level, throttle_delay): + super(dist_rs, self).__init__(name, scrape_retries, log_level, throttle_delay) self.domain = distributor_dict[self.name]['site']['url'] self.browser.scrape_URL(self.domain) @@ -114,13 +113,12 @@ def dist_get_qty_avail(self, html_tree): # Return None so the part won't show in the spreadsheet for this dist. return None - def dist_get_part_html_tree(self, pn, extra_search_terms='', url=None, descend=2, local_part_html=None): + def dist_get_part_html_tree(self, pn, extra_search_terms='', url=None, descend=2): '''@brief Find the RS Components HTML page for a part number and return the URL and parse tree. @param pn Part number `str()`. 
@param extra_search_terms @param url @param descend - @param local_part_html @return (html `str()` of the page, url) ''' diff --git a/kicost/distributors/tme/tme.py b/kicost/distributors/tme/tme.py index f2403e629..edf3ccb62 100644 --- a/kicost/distributors/tme/tme.py +++ b/kicost/distributors/tme/tme.py @@ -49,9 +49,8 @@ from urllib.parse import quote_plus as urlquote, urlencode class dist_tme(distributor.distributor): - def __init__(self, scrape_retries, log_level, throttle_delay): - super(dist_tme, self).__init__(scrape_retries, log_level, throttle_delay) - self.name = 'tme' + def __init__(self, name, scrape_retries, log_level, throttle_delay): + super(dist_tme, self).__init__(name, scrape_retries, log_level, throttle_delay) self.domain = distributor_dict[self.name]['site']['url'] self.browser.scrape_URL(self.domain) @@ -160,13 +159,12 @@ def dist_get_qty_avail(self, html_tree): return None - def dist_get_part_html_tree(self, pn, extra_search_terms='', url=None, descend=2, local_part_html=None): + def dist_get_part_html_tree(self, pn, extra_search_terms='', url=None, descend=2): '''@brief Find the TME HTML page for a part number and return the URL and parse tree. @param pn Part number `str()`. @param extra_search_terms @param url @param descend - @param local_part_html @return (html `str()` of the page, url) ''' diff --git a/kicost/kicost.py b/kicost/kicost.py index 3a09507e6..b592fcd93 100644 --- a/kicost/kicost.py +++ b/kicost/kicost.py @@ -204,33 +204,29 @@ def kicost(in_file, eda_tool_name, out_filename, logger.warning("No 'manf#' and '%s#' field in any part: distributor '%s' will be not scraped.", d, distributor_dict[d]['label']) distributor_dict.pop(d, None) - # Create an HTML page containing all the local part information. - local_distributor = dist_local(scrape_retries, 5, throttling_delay) # TODO: log level - local_part_html = local_distributor.create_part_html(parts, distributor_dict) - if logger.isEnabledFor(DEBUG_DETAILED): pprint.pprint(distributor_dict) + # Create an HTML page containing all the local part information. + dist_local.create_part_html(parts, distributor_dict, logger) + # Get the distributor product page for each part and scrape the part data. if dist_list: - # Instanciate distributors for d in list(distributor_dict.keys()): try: - ctor = globals()["dist_"+d] - # TODO: use logger, not print - # TODO: logger does not print anything logger.log(DEBUG_OVERVIEW, "Initialising %s" % d) - print("Initialising %s" % d) - # TODO: farnell does not respond - distributor_dict[d]['instance'] = ctor(scrape_retries, 5, throttling_delay) # TODO: log level - except: - logger.log(DEBUG_OVERVIEW, "Initialising %s failed, exculding this distributor..." % d) + if distributor_dict[d]['scrape'] == 'local': + ctor = globals()['dist_local'] + else: + ctor = globals()['dist_'+d] + distributor_dict[d]['instance'] = ctor(d, scrape_retries, 5, throttling_delay) # TODO: log level + except Exception as ex: + logger.log(DEBUG_OVERVIEW, "Initialising %s failed with %s, exculding this distributor..." 
\ + % (d, type(ex).__name__)) distributor_dict.pop(d, None) pass - # TODO: multithreaded init, use another pool - if local_currency: logger.log(DEBUG_OVERVIEW, '# Configuring the distributors locale and currency...') for d in distributor_dict: @@ -259,23 +255,19 @@ def emit(self, record): # Init part info dictionaries for part in parts: - pprint.pprint(vars(part)) part.part_num = {} part.url = {} part.price_tiers = {} part.qty_avail = {} part.info_dist = {} - #partsByDist = partListByDistributors(parts) if num_processes <= 1: # Scrape data, one part at a time using single processing. for d in distributor_dict: - print("Dist loop d=%s" % d) + logger.log(DEBUG_OVERVIEW, "Scraping "+ inst.name) for i in range(len(parts)): - print("Part loop i=%d" % i) id, dist, url, part_num, price_tiers, qty_avail, info_dist = \ - scrape_result = distributor_dict[d]['instance'].scrape_part \ - (i, parts[i], local_part_html) + scrape_result = distributor_dict[d]['instance'].scrape_part(i, parts[i]) parts[id].part_num[dist] = part_num parts[id].url[dist] = url @@ -293,12 +285,13 @@ def emit(self, record): # Package part data for passing to each process. arg_sets = [(distributor_dict[d]['instance'], parts, \ - local_part_html, scraping_progress) for d in distributor_dict] + scraping_progress) for d in distributor_dict] - def mt_scrape_part(inst, parts, local_part_html, scraping_progress): + def mt_scrape_part(inst, parts, scraping_progress): + logger.log(DEBUG_OVERVIEW, "Scraping "+ inst.name) retval = list() for i in range(len(parts)): - retval.append(inst.scrape_part(i, parts[i], local_part_html)) + retval.append(inst.scrape_part(i, parts[i])) scraping_progress.update(1) return retval From 7182bfe49f6de9930637c44dbc1395b0e4b134bf Mon Sep 17 00:00:00 2001 From: Max Maisel Date: Sat, 2 Jun 2018 11:23:52 +0200 Subject: [PATCH 13/29] Fixed logger initialization, log messages from kicost.py should now show up correctly. --- kicost/__main__.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/kicost/__main__.py b/kicost/__main__.py index 9f26339df..e4178eb5c 100644 --- a/kicost/__main__.py +++ b/kicost/__main__.py @@ -47,7 +47,6 @@ HTML_RESPONSE_RETRIES = 2 # Number of attempts to retrieve part data from a website. from .globals import * -logger = logging.getLogger('kicost') ############################################################################### # Command-line interface. @@ -174,7 +173,7 @@ def main(): #handler = logging.StreamHandler(sys.stdout) #handler.setLevel(log_level) #logger.addHandler(handler) # It's not necessary to add a handle here, the default is already `sys.stdout` and adding twice it creates the BUG #193, doesn't allowing to use correctly the `tqdm` (process bar) print handle. - logger.setLevel(log_level) + logging.basicConfig(level=log_level, format='%(message)s') if args.show_dist_list: print('Distributor list:', *sorted(list(distributor_dict.keys()))) From 071f5d031a8fea06ff528a15fa924513e7deaaa5 Mon Sep 17 00:00:00 2001 From: Max Maisel Date: Sat, 2 Jun 2018 13:43:38 +0200 Subject: [PATCH 14/29] Added new debugging level DEBUG_HTTP_RESPONSES and added per thread timing traces. 
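
Raw HTML responses can be very large, so dumping them at DEBUG_OBSESSIVE made that level hard to use; they now go to a dedicated level one step below it. A minimal, self-contained illustration of how the new level slots into the existing scheme (the numeric values match kicost/globals.py, the handler setup is only for the example):

    import logging

    DEBUG_OVERVIEW       = logging.DEBUG
    DEBUG_DETAILED       = logging.DEBUG - 1
    DEBUG_OBSESSIVE      = logging.DEBUG - 2
    DEBUG_HTTP_RESPONSES = logging.DEBUG - 3   # new level added by this patch

    logging.basicConfig(level=DEBUG_HTTP_RESPONSES, format='%(message)s')
    logger = logging.getLogger('kicost')

    logger.log(DEBUG_OBSESSIVE, 'Unknown error for %s from %s', 'PN1234', 'mouser')
    logger.log(DEBUG_HTTP_RESPONSES, 'Response was %s', '<html>...</html>')

The timing trace simply logs, per distributor, the seconds elapsed since the distributor module was imported, which makes it easier to see how the scraping threads interleave.
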
--- kicost/distributors/digikey/digikey.py | 3 ++- kicost/distributors/distributor.py | 3 +++ kicost/distributors/farnell/farnell.py | 6 +++--- kicost/distributors/local/local.py | 2 -- kicost/distributors/mouser/mouser.py | 4 ++-- kicost/distributors/newark/newark.py | 3 ++- kicost/distributors/rs/rs.py | 3 ++- kicost/distributors/tme/tme.py | 3 ++- kicost/globals.py | 1 + 9 files changed, 17 insertions(+), 11 deletions(-) diff --git a/kicost/distributors/digikey/digikey.py b/kicost/distributors/digikey/digikey.py index b8f7561cb..7564beab0 100644 --- a/kicost/distributors/digikey/digikey.py +++ b/kicost/distributors/digikey/digikey.py @@ -43,7 +43,7 @@ from .. import fake_browser from .. import EXTRA_INFO_DIST, extra_info_dist_name_translations from ...globals import PartHtmlError -from ...globals import logger, DEBUG_OVERVIEW, DEBUG_DETAILED, DEBUG_OBSESSIVE +from ...globals import logger, DEBUG_OVERVIEW, DEBUG_DETAILED, DEBUG_OBSESSIVE, DEBUG_HTTP_RESPONSES from .. import distributor, distributor_dict @@ -344,6 +344,7 @@ def dist_get_part_html_tree(self, pn, extra_search_terms='', url=None, descend=2 # I don't know what happened here, so give up. self.logger.log(DEBUG_OBSESSIVE,'Unknown error for {} from {}'.format(pn, self.name)) + self.logger.log(DEBUG_HTTP_RESPONSES,'Response was %s' % html) raise PartHtmlError def part_is_reeled(self, html_tree): diff --git a/kicost/distributors/distributor.py b/kicost/distributors/distributor.py index e14278176..a4f5b4631 100644 --- a/kicost/distributors/distributor.py +++ b/kicost/distributors/distributor.py @@ -58,6 +58,7 @@ import os, re class distributor: + start_time = time.time() def __init__(self, name, scrape_retries, log_level, throttle_delay): self.name = name self.page_accessed = False @@ -189,6 +190,8 @@ def get_part_html_tree(self, part): if key in part.fields: if part.fields[key]: self.page_accessed = True + self.logger.log(DEBUG_OBSESSIVE, "%s: scrape timing: %.2f" \ + % (self.name, time.time() - distributor.start_time)) return self.dist_get_part_html_tree(part.fields[key], extra_search_terms) # No distributor or manufacturer number, so give up. else: diff --git a/kicost/distributors/farnell/farnell.py b/kicost/distributors/farnell/farnell.py index 287cb368f..cd6d0758d 100644 --- a/kicost/distributors/farnell/farnell.py +++ b/kicost/distributors/farnell/farnell.py @@ -41,7 +41,7 @@ #from .. import urlencode, urlquote, urlsplit, urlunsplit from .. import fake_browser from ...globals import PartHtmlError -from ...globals import logger, DEBUG_OVERVIEW, DEBUG_DETAILED, DEBUG_OBSESSIVE +from ...globals import logger, DEBUG_OVERVIEW, DEBUG_DETAILED, DEBUG_OBSESSIVE, DEBUG_HTTP_RESPONSES from currency_converter import CurrencyConverter currency = CurrencyConverter() @@ -196,7 +196,7 @@ def dist_get_part_html_tree(self, pn, extra_search_terms='', url=None, descend=2 product_links.append(p.find('td', class_='mftrPart').find('a')) except AttributeError: continue - print('>>> ',pn,products,product_links)#TODO + #print('>>> ',pn,products,product_links)#TODO # Extract all the part numbers from the text portion of the links. part_numbers = [l.text for l in product_links] @@ -215,5 +215,5 @@ def dist_get_part_html_tree(self, pn, extra_search_terms='', url=None, descend=2 # I don't know what happened here, so give up. 
self.logger.log(DEBUG_OBSESSIVE,'Unknown error for {} from {}'.format(pn, self.name)) - self.logger.log(DEBUG_OBSESSIVE,'Response was %s' % html) + self.logger.log(DEBUG_HTTP_RESPONSES,'Response was %s' % html) raise PartHtmlError diff --git a/kicost/distributors/local/local.py b/kicost/distributors/local/local.py index 673fda658..a5ced0487 100644 --- a/kicost/distributors/local/local.py +++ b/kicost/distributors/local/local.py @@ -183,8 +183,6 @@ def dist_get_part_html_tree(self, pn, extra_search_terms='', url=None, descend=N # Extract the HTML tree from the local part HTML page. try: - print("dist_local.html") - print(dist_local.html) tree = BeautifulSoup(dist_local.html, 'lxml') except Exception: raise PartHtmlError diff --git a/kicost/distributors/mouser/mouser.py b/kicost/distributors/mouser/mouser.py index 23fd54b4e..be2108267 100644 --- a/kicost/distributors/mouser/mouser.py +++ b/kicost/distributors/mouser/mouser.py @@ -41,7 +41,7 @@ #from .. import urlencode, urlquote, urlsplit, urlunsplit from .. import fake_browser from ...globals import PartHtmlError -from ...globals import DEBUG_OVERVIEW, DEBUG_DETAILED, DEBUG_OBSESSIVE +from ...globals import logger, DEBUG_OVERVIEW, DEBUG_DETAILED, DEBUG_OBSESSIVE, DEBUG_HTTP_RESPONSES from .. import distributor, distributor_dict @@ -215,5 +215,5 @@ def dist_get_part_html_tree(self, pn, extra_search_terms='', url=None, descend=2 # I don't know what happened here, so give up. self.logger.log(DEBUG_OBSESSIVE,'Unknown error for {} from {}'.format(pn, self.name)) - self.logger.log(DEBUG_OBSESSIVE,'Response was %s' % html) + self.logger.log(DEBUG_HTTP_RESPONSES,'Response was %s' % html) raise PartHtmlError diff --git a/kicost/distributors/newark/newark.py b/kicost/distributors/newark/newark.py index 44532dc95..5f216b9e1 100644 --- a/kicost/distributors/newark/newark.py +++ b/kicost/distributors/newark/newark.py @@ -41,7 +41,7 @@ #from .. import urlencode, urlquote, urlsplit, urlunsplit from .. import fake_browser from ...globals import PartHtmlError -from ...globals import logger, DEBUG_OVERVIEW, DEBUG_DETAILED, DEBUG_OBSESSIVE +from ...globals import logger, DEBUG_OVERVIEW, DEBUG_DETAILED, DEBUG_OBSESSIVE, DEBUG_HTTP_RESPONSES from .. import distributor, distributor_dict @@ -217,4 +217,5 @@ def dist_get_part_html_tree(self, pn, extra_search_terms='', url=None, descend=2 # I don't know what happened here, so give up. self.logger.log(DEBUG_OBSESSIVE,'Unknown error for {} from {}'.format(pn, self.name)) + self.logger.log(DEBUG_HTTP_RESPONSES,'Response was %s' % html) raise PartHtmlError diff --git a/kicost/distributors/rs/rs.py b/kicost/distributors/rs/rs.py index bf07c42ec..960cf87d7 100644 --- a/kicost/distributors/rs/rs.py +++ b/kicost/distributors/rs/rs.py @@ -37,7 +37,7 @@ #from .. import urlencode, urlquote, urlsplit, urlunsplit from .. import fake_browser from ...globals import PartHtmlError -from ...globals import logger, DEBUG_OVERVIEW, DEBUG_DETAILED, DEBUG_OBSESSIVE +from ...globals import logger, DEBUG_OVERVIEW, DEBUG_DETAILED, DEBUG_OBSESSIVE, DEBUG_HTTP_RESPONSES from currency_converter import CurrencyConverter currency = CurrencyConverter() @@ -185,4 +185,5 @@ def dist_get_part_html_tree(self, pn, extra_search_terms='', url=None, descend=2 # I don't know what happened here, so give up. 
self.logger.log(DEBUG_OBSESSIVE,'Unknown error for {} from {}'.format(pn, dist)) + self.logger.log(DEBUG_HTTP_RESPONSES,'Response was %s' % html) raise PartHtmlError diff --git a/kicost/distributors/tme/tme.py b/kicost/distributors/tme/tme.py index edf3ccb62..1c22b6d89 100644 --- a/kicost/distributors/tme/tme.py +++ b/kicost/distributors/tme/tme.py @@ -42,7 +42,7 @@ #from .. import urlencode, urlquote, urlsplit, urlunsplit from .. import fake_browser from ...globals import PartHtmlError -from ...globals import logger, DEBUG_OVERVIEW, DEBUG_DETAILED, DEBUG_OBSESSIVE +from ...globals import logger, DEBUG_OVERVIEW, DEBUG_DETAILED, DEBUG_OBSESSIVE, DEBUG_HTTP_RESPONSES from .. import distributor, distributor_dict @@ -249,4 +249,5 @@ def dist_get_part_html_tree(self, pn, extra_search_terms='', url=None, descend=2 # I don't know what happened here, so give up. self.logger.log(DEBUG_OBSESSIVE,'Unknown error for {} from {}'.format(pn, self.name)) + self.logger.log(DEBUG_HTTP_RESPONSES,'Response was %s' % html) raise PartHtmlError diff --git a/kicost/globals.py b/kicost/globals.py index 444b22198..226da2797 100644 --- a/kicost/globals.py +++ b/kicost/globals.py @@ -28,6 +28,7 @@ DEBUG_OVERVIEW = logging.DEBUG DEBUG_DETAILED = logging.DEBUG-1 DEBUG_OBSESSIVE = logging.DEBUG-2 +DEBUG_HTTP_RESPONSES = logging.DEBUG-3 SEPRTR = ':' # Delimiter between library:component, distributor:field, etc. From 253346bef776c3de561dfb170066c4f56f50c01d Mon Sep 17 00:00:00 2001 From: Max Maisel Date: Sat, 2 Jun 2018 13:44:51 +0200 Subject: [PATCH 15/29] Limit num_processes to distributor count, fixed name error. --- kicost/kicost.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/kicost/kicost.py b/kicost/kicost.py index b592fcd93..58b84a5eb 100644 --- a/kicost/kicost.py +++ b/kicost/kicost.py @@ -210,6 +210,10 @@ def kicost(in_file, eda_tool_name, out_filename, # Create an HTML page containing all the local part information. dist_local.create_part_html(parts, distributor_dict, logger) + num_processes = min(num_processes, len(distributor_dict)) + logger.log(DEBUG_OBSESSIVE, "Initialising scraper with %d processes" % num_processes) + logger.log(DEBUG_OBSESSIVE, "throttling_delay=%d" % throttling_delay) + # Get the distributor product page for each part and scrape the part data. if dist_list: # Instanciate distributors @@ -261,10 +265,12 @@ def emit(self, record): part.qty_avail = {} part.info_dist = {} + num_processes = min(num_processes, len(distributor_dict)) + if num_processes <= 1: # Scrape data, one part at a time using single processing. for d in distributor_dict: - logger.log(DEBUG_OVERVIEW, "Scraping "+ inst.name) + logger.log(DEBUG_OVERVIEW, "Scraping "+ distributor_dict[d]['instance'].name) for i in range(len(parts)): id, dist, url, part_num, price_tiers, qty_avail, info_dist = \ scrape_result = distributor_dict[d]['instance'].scrape_part(i, parts[i]) From 53bd3621ba6196b4c9d1ba5b1c649b3523ce477a Mon Sep 17 00:00:00 2001 From: Max Maisel Date: Sat, 2 Jun 2018 13:45:46 +0200 Subject: [PATCH 16/29] Default throttling_delay to 5 seconds to avoid getting banned. 
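
Several distributors temporarily ban clients that issue requests back to back, so the minimum delay between successive requests to the same site now defaults to 5 s instead of 0. The old behaviour can still be requested explicitly with --throttling_delay 0. The delay bookkeeping amounts to the simplified model below (the real code lives in distributor.py and, after a later patch in this series, in fake_browser.py; the class here is only an illustration):

    import time

    class throttle(object):
        '''Simplified model of the per-site request delay.'''
        def __init__(self, delay):
            self.delay = delay              # minimum seconds between requests
            self.timeout = time.time()      # earliest time for the next request

        def wait(self):
            sleep_time = self.timeout - time.time()
            if sleep_time > 0:
                time.sleep(sleep_time)
            self.timeout = time.time() + self.delay

    t = throttle(5.0)
    for url in ('https://www.mouser.com/a', 'https://www.mouser.com/b'):
        t.wait()                            # the second iteration sleeps roughly 5 s
        # ... fetch and parse url here ...
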
--- kicost/__main__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kicost/__main__.py b/kicost/__main__.py index e4178eb5c..ea06262d7 100644 --- a/kicost/__main__.py +++ b/kicost/__main__.py @@ -148,7 +148,7 @@ def main(): metavar = 'NUM_RETRIES', help='Specify the number of attempts to retrieve part data from a website.') parser.add_argument('--throttling_delay', - nargs='?', type=float, default=0.0, + nargs='?', type=float, default=5.0, metavar='DELAY', help="Specify minimum delay (in seconds) between successive accesses to a distributor's website.") parser.add_argument('--currency', '--locale', From 9f38aabce0e930f416f787006c2b7489061b965d Mon Sep 17 00:00:00 2001 From: Max Maisel Date: Sun, 3 Jun 2018 10:06:54 +0200 Subject: [PATCH 17/29] Moved throttling_delay handling to fake_browser. --- kicost/distributors/distributor.py | 20 +------------------- kicost/distributors/fake_browser.py | 18 +++++++++++++++++- 2 files changed, 18 insertions(+), 20 deletions(-) diff --git a/kicost/distributors/distributor.py b/kicost/distributors/distributor.py index a4f5b4631..121cb56ba 100644 --- a/kicost/distributors/distributor.py +++ b/kicost/distributors/distributor.py @@ -61,14 +61,11 @@ class distributor: start_time = time.time() def __init__(self, name, scrape_retries, log_level, throttle_delay): self.name = name - self.page_accessed = False self.scrape_retries = scrape_retries self.logger = logger self.log_level = log_level - self.throttle_delay = throttle_delay - self.throttle_timeout = time.time() self.domain = None - self.browser = fake_browser.fake_browser(self.logger, self.scrape_retries) + self.browser = fake_browser.fake_browser(self.logger, self.scrape_retries, throttle_delay) # Abstract methods, implemented in distributor specific modules def dist_get_part_html_tree(self, pn, extra_search_terms, url, descend): @@ -138,20 +135,6 @@ def scrape_part(self, id, part): price_tiers = {} info_dist = {} - if distributor_dict[self.name]['scrape']=='web': - if self.page_accessed == True: - # Check the throttling timeout for the chosen distributor to see if - # another access to its website is allowed. - if self.throttle_timeout > time.time(): - time.sleep(self.throttle_timeout - time.time()) - - # Update the timeout for this distributor website and release the sync. lock. - self.throttle_timeout = time.time() + self.throttle_delay - # Founded manufacturer / distributor code valid (not empty). - else: - self.logger.log(DEBUG_OBSESSIVE,'No delay for %s, type=%s' \ - % (self.name, distributor_dict[self.name]['scrape'])) - # Get the HTML tree for the part. html_tree, url = self.get_part_html_tree(part) @@ -189,7 +172,6 @@ def get_part_html_tree(self, part): for key in (self.name+'#', self.name+SEPRTR+'cat#', 'manf#'): if key in part.fields: if part.fields[key]: - self.page_accessed = True self.logger.log(DEBUG_OBSESSIVE, "%s: scrape timing: %.2f" \ % (self.name, time.time() - distributor.start_time)) return self.dist_get_part_html_tree(part.fields[key], extra_search_terms) diff --git a/kicost/distributors/fake_browser.py b/kicost/distributors/fake_browser.py index f9612251a..7ab5e4d26 100644 --- a/kicost/distributors/fake_browser.py +++ b/kicost/distributors/fake_browser.py @@ -25,6 +25,7 @@ __email__ = 'info@xess.com' from random import choice +import time import http.client # For web scraping exceptions. import requests @@ -150,7 +151,7 @@ def get_user_agent(): # Open the URL, read the HTML from it, and parse it into a tree structure. 
class fake_browser: - def __init__(self, logger, scrape_retries): + def __init__(self, logger, scrape_retries, throttle_delay): '''@brief fake_browser @param logger @param scrape_retries `int` Quantity of retries in case of fail. @@ -163,6 +164,9 @@ def __init__(self, logger, scrape_retries): self.session = requests.session() self.session.headers["User-Agent"] = self.userAgent + self.throttle_delay = throttle_delay + self.throttle_timeout = time.time() + self.scrape_retries = scrape_retries self.logger = logger @@ -180,6 +184,18 @@ def scrape_URL(self, url, add_header=[]): for _ in range(self.scrape_retries): try: + # Check the throttling timeout of this browser to see if + # another access to its website is allowed. + + sleepTime = self.throttle_timeout - time.time() + self.logger.log(DEBUG_OBSESSIVE, "browser: time=%.2f, timeout=%.2f, sleep=%.2f" \ + % (time.time(), self.throttle_timeout, sleepTime)) + if sleepTime > 0: + time.sleep(sleepTime) + + # Update the timeout for this browser. + self.throttle_timeout = time.time() + self.throttle_delay + html = self.session.get(url, timeout=5).text break except Exception as ex: From 6ac3ec258dbe863cb5961c5b4f979df0d1cefaf7 Mon Sep 17 00:00:00 2001 From: Max Maisel Date: Sun, 3 Jun 2018 10:08:59 +0200 Subject: [PATCH 18/29] Revised python2/3 import differences as suggested by hildogr. --- kicost/distributors/distributor.py | 4 ++-- kicost/distributors/fake_browser.py | 5 +++-- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/kicost/distributors/distributor.py b/kicost/distributors/distributor.py index 121cb56ba..3989458a1 100644 --- a/kicost/distributors/distributor.py +++ b/kicost/distributors/distributor.py @@ -36,13 +36,13 @@ from . import fake_browser import http.client # For web scraping exceptions. -try: +if sys.version_info>=(3,0): # This is for Python 3. from urllib.parse import urlencode, quote_plus as urlquote, urlsplit, urlunsplit from urllib.request import urlopen, Request import urllib.error WEB_SCRAPE_EXCEPTIONS = (urllib.error.URLError, http.client.HTTPException) -except ImportError: +else: # This is for Python 2. from urlparse import urlsplit, urlunsplit from urllib import urlencode, quote_plus as urlquote diff --git a/kicost/distributors/fake_browser.py b/kicost/distributors/fake_browser.py index 7ab5e4d26..0fd332e33 100644 --- a/kicost/distributors/fake_browser.py +++ b/kicost/distributors/fake_browser.py @@ -24,6 +24,7 @@ __author__ = 'XESS Corporation' __email__ = 'info@xess.com' +import sys from random import choice import time @@ -32,13 +33,13 @@ from ..globals import DEBUG_OVERVIEW, DEBUG_DETAILED, DEBUG_OBSESSIVE -try: +if sys.version_info>=(3,0): # This is for Python 3 from urllib.parse import urlencode, quote_plus as urlquote, urlsplit, urlunsplit from urllib.request import urlopen, Request import urllib.error WEB_SCRAPE_EXCEPTIONS = (urllib.error.URLError, http.client.HTTPException) -except ImportError: +else: # This is for Python 2 from urlparse import urlsplit, urlunsplit from urllib import urlencode, quote_plus as urlquote From dd69881ff18e6efdf5a4e6aa68a7fc2ea3ce269b Mon Sep 17 00:00:00 2001 From: Max Maisel Date: Sun, 3 Jun 2018 10:11:05 +0200 Subject: [PATCH 19/29] Implemented multithreaded distributor initialisation and reduced mt_scrape_part argument count. 
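
Creating a distributor instance mostly waits on network I/O (the first page fetch that sets up cookies), so the instances are now created from a thread pool, mirroring the pool already used for scraping; an initialiser that raises reports (name, None) and that distributor is dropped. The shape of the pattern, with dummy initialisers standing in for the real classes:

    from multiprocessing.pool import ThreadPool
    import time

    def mt_init_dist(name, scrape):
        '''Stand-in for the real per-distributor initialisation.'''
        try:
            time.sleep(0.1)                 # placeholder for the first page fetch
            return (name, 'instance of dist_' + name)
        except Exception:
            return (name, None)             # failed init: caller drops this distributor

    names = {'digikey': 'web', 'mouser': 'web', 'local': 'local'}
    pool = ThreadPool(min(4, len(names)))
    results = [pool.apply_async(mt_init_dist, (d, names[d])) for d in names]
    pool.close()
    pool.join()
    instances = {d: inst for d, inst in (r.get() for r in results) if inst is not None}

Threads rather than processes are enough here because the work is I/O bound, and they avoid the pickling problems that real process pools have with the distributor objects.
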
--- kicost/kicost.py | 61 +++++++++++++++++++++++++++++++++--------------- 1 file changed, 42 insertions(+), 19 deletions(-) diff --git a/kicost/kicost.py b/kicost/kicost.py index 58b84a5eb..4d83b0474 100644 --- a/kicost/kicost.py +++ b/kicost/kicost.py @@ -211,34 +211,56 @@ def kicost(in_file, eda_tool_name, out_filename, dist_local.create_part_html(parts, distributor_dict, logger) num_processes = min(num_processes, len(distributor_dict)) - logger.log(DEBUG_OBSESSIVE, "Initialising scraper with %d processes" % num_processes) + logger.log(DEBUG_OBSESSIVE, "Initialising scraper with %d threads" % num_processes) logger.log(DEBUG_OBSESSIVE, "throttling_delay=%d" % throttling_delay) # Get the distributor product page for each part and scrape the part data. if dist_list: - # Instanciate distributors - for d in list(distributor_dict.keys()): + + # Create thread pool to init multiple distributors simultaneously. + pool = ThreadPool(num_processes) + + # Package part data for passing to each process. + arg_sets = [(d, distributor_dict[d]['scrape']) for d in distributor_dict] + + def mt_init_dist(d, scrape): + instance = None try: logger.log(DEBUG_OVERVIEW, "Initialising %s" % d) - if distributor_dict[d]['scrape'] == 'local': + if scrape == 'local': ctor = globals()['dist_local'] else: ctor = globals()['dist_'+d] - distributor_dict[d]['instance'] = ctor(d, scrape_retries, 5, throttling_delay) # TODO: log level + instance = ctor(d, scrape_retries, 5, throttling_delay) # TODO: log level except Exception as ex: logger.log(DEBUG_OVERVIEW, "Initialising %s failed with %s, exculding this distributor..." \ % (d, type(ex).__name__)) + return (d, None) + + if local_currency: + logger.log(DEBUG_OVERVIEW, '# Configuring the distributors locale and currency...') + instance.define_locale_currency(local_currency) + return (d, instance) + + logger.log(DEBUG_OBSESSIVE, 'Starting {} threads to init distributors...'.format(num_processes)) + pprint.pprint(arg_sets) + results = [pool.apply_async(mt_init_dist, args) for args in arg_sets] + + # Wait for all the processes to have results. + pool.close() + pool.join() + + # Get the data from each process result structure. + for result in results: + d, instance = result.get() + # Distributor initialisation failed, remove it from distributor_dict. + if instance == None: distributor_dict.pop(d, None) - pass - - if local_currency: - logger.log(DEBUG_OVERVIEW, '# Configuring the distributors locale and currency...') - for d in distributor_dict: - distributor_dict[d]['instance'].define_locale_currency(local_currency) + # Distributor initialised successfully, add instance to distributor_dict. + else: + distributor_dict[d]['instance'] = instance logger.log(DEBUG_OVERVIEW, '# Scraping part data for each component group...') - - global scraping_progress scraping_progress = tqdm.tqdm(desc='Progress', total=len(parts)*len(distributor_dict), unit='part', miniters=1) # Change the logging print channel to `tqdm` to keep the process bar to the end of terminal. @@ -285,15 +307,16 @@ def emit(self, record): # Scrape data, multiple parts at a time using multiprocessing. # Create thread pool to scrape data for multiple distributors simultaneously. - # PYthon threads are time-sliced but they work in our I/O limited scenario + # Python threads are time-sliced but they work in our I/O limited scenario # and avoid all kinds of pickle issues. pool = ThreadPool(num_processes) # Package part data for passing to each process. 
- arg_sets = [(distributor_dict[d]['instance'], parts, \ - scraping_progress) for d in distributor_dict] + # pool.async_apply needs at least two arguments per function so add dummy argument + # (otherwise it fails with "arguments after * must be an iterable, not ...") + arg_sets = [(distributor_dict[d]['instance'], None) for d in distributor_dict] - def mt_scrape_part(inst, parts, scraping_progress): + def mt_scrape_part(inst, dummy): logger.log(DEBUG_OVERVIEW, "Scraping "+ inst.name) retval = list() for i in range(len(parts)): @@ -302,13 +325,13 @@ def mt_scrape_part(inst, parts, scraping_progress): return retval # Start the web scraping processes, one for each part. - logger.log(DEBUG_OBSESSIVE, 'Starting {} parallels process to scrap parts...'.format(num_processes)) + logger.log(DEBUG_OBSESSIVE, 'Starting {} parallel threads to scrap parts...'.format(num_processes)) results = [pool.apply_async(mt_scrape_part, args) for args in arg_sets] # Wait for all the processes to have results, then kill-off all the scraping processes. pool.close() pool.join() - logger.log(DEBUG_OVERVIEW, 'All parallels process finished with success.') + logger.log(DEBUG_OVERVIEW, 'All parallel threads finished with success.') # Get the data from each process result structure. for res_proc in results: From 0718fd3b713e805199a46f304ff7be084e1affc6 Mon Sep 17 00:00:00 2001 From: Max Maisel Date: Mon, 4 Jun 2018 17:03:49 +0200 Subject: [PATCH 20/29] Added "requests" dependency to setup.py. --- setup.py | 1 + 1 file changed, 1 insertion(+) diff --git a/setup.py b/setup.py index 689abd43a..8e0c3c202 100644 --- a/setup.py +++ b/setup.py @@ -23,6 +23,7 @@ 'lxml >= 3.7.2', 'yattag >= 1.5.2', 'tqdm >= 4.4.0', + 'requests >= 2.18.4', 'CurrencyConverter >= 0.5', # Used to convert price to a not avaiable currecy in one distributor. 'pycountry >= 18.2', # ISO4117, ISO3166 country and currency definitons from Debian’s pkg-isocodes. # 'wxPython >= 4.0', # Graphical package/library needed to user guide. From 500116e413bae1abc053fd4cd15ec6e75d33bc0b Mon Sep 17 00:00:00 2001 From: Max Maisel Date: Mon, 4 Jun 2018 17:05:21 +0200 Subject: [PATCH 21/29] Moved CurrencyConverter instance to globals. --- kicost/distributors/farnell/farnell.py | 3 +-- kicost/distributors/rs/rs.py | 3 +-- kicost/globals.py | 2 ++ kicost/kicost.py | 2 ++ 4 files changed, 6 insertions(+), 4 deletions(-) diff --git a/kicost/distributors/farnell/farnell.py b/kicost/distributors/farnell/farnell.py index cd6d0758d..3d0afdce5 100644 --- a/kicost/distributors/farnell/farnell.py +++ b/kicost/distributors/farnell/farnell.py @@ -41,9 +41,8 @@ #from .. import urlencode, urlquote, urlsplit, urlunsplit from .. import fake_browser from ...globals import PartHtmlError +from ...globals import currency from ...globals import logger, DEBUG_OVERVIEW, DEBUG_DETAILED, DEBUG_OBSESSIVE, DEBUG_HTTP_RESPONSES -from currency_converter import CurrencyConverter -currency = CurrencyConverter() from .. import distributor, distributor_dict diff --git a/kicost/distributors/rs/rs.py b/kicost/distributors/rs/rs.py index 960cf87d7..bf88b44f4 100644 --- a/kicost/distributors/rs/rs.py +++ b/kicost/distributors/rs/rs.py @@ -38,8 +38,7 @@ from .. import fake_browser from ...globals import PartHtmlError from ...globals import logger, DEBUG_OVERVIEW, DEBUG_DETAILED, DEBUG_OBSESSIVE, DEBUG_HTTP_RESPONSES -from currency_converter import CurrencyConverter -currency = CurrencyConverter() +from ...globals import currency from .. 
import distributor, distributor_dict diff --git a/kicost/globals.py b/kicost/globals.py index 226da2797..4d95d392f 100644 --- a/kicost/globals.py +++ b/kicost/globals.py @@ -23,6 +23,7 @@ """Stuff that everybody else needs to know about.""" import logging +from currency_converter import CurrencyConverter logger = logging.getLogger('kicost') DEBUG_OVERVIEW = logging.DEBUG @@ -32,6 +33,7 @@ SEPRTR = ':' # Delimiter between library:component, distributor:field, etc. +currency = CurrencyConverter() class PartHtmlError(Exception): '''Exception for failed retrieval of an HTML parse tree for a part.''' diff --git a/kicost/kicost.py b/kicost/kicost.py index 4d83b0474..9a2ed03e9 100644 --- a/kicost/kicost.py +++ b/kicost/kicost.py @@ -109,6 +109,8 @@ def kicost(in_file, eda_tool_name, out_filename, @param local_currency `str()` Local/country in ISO3166:2 and currency in ISO4217. Default 'USD'. ''' + logger.log(DEBUG_OVERVIEW, 'Exchange rate: 1 EUR = %.2f USD' % currency.convert(1, 'EUR', 'USD')) + # Only keep distributors in the included list and not in the excluded list. if dist_list!=None: if not dist_list: From 34c34faf25c5205d82223314f220af6f4cfde04b Mon Sep 17 00:00:00 2001 From: Max Maisel Date: Mon, 4 Jun 2018 17:06:09 +0200 Subject: [PATCH 22/29] Fixed remaining digikey refactoring issues. --- kicost/distributors/digikey/digikey.py | 22 ++++++++++------------ 1 file changed, 10 insertions(+), 12 deletions(-) diff --git a/kicost/distributors/digikey/digikey.py b/kicost/distributors/digikey/digikey.py index 7564beab0..2c8e5ca87 100644 --- a/kicost/distributors/digikey/digikey.py +++ b/kicost/distributors/digikey/digikey.py @@ -34,8 +34,6 @@ import future -# TODO: not working yet ? - import re, difflib from bs4 import BeautifulSoup import http.client # For web scraping exceptions. @@ -219,9 +217,9 @@ def dist_get_part_html_tree(self, pn, extra_search_terms='', url=None, descend=2 # Open the URL, read the HTML from it, and parse it into a tree structure. try: - html = fake_browser(url, scrape_retries) - except: - self.logger.log(DEBUG_OBSESSIVE,'No HTML page for {} from {}'.format(pn, self.name)) + html = self.browser.scrape_URL(url) + except Exception as ex: + self.logger.log(DEBUG_OBSESSIVE,'No HTML page for {} from {}, ex: {}'.format(pn, self.name, type(ex).__name__)) raise PartHtmlError # Abort if the part number isn't in the HTML somewhere. @@ -274,9 +272,9 @@ def dist_get_part_html_tree(self, pn, extra_search_terms='', url=None, descend=2 # the entire list for one that's non-reeled. Use this as the # main page for the part. ap_trees_and_urls.append((tree, url)) - if part_is_reeled(tree): + if self.part_is_reeled(tree): for ap_tree, ap_url in ap_trees_and_urls: - if not part_is_reeled(ap_tree): + if not self.part_is_reeled(ap_tree): # Found a non-reeled part, so use it as the main page. tree = ap_tree url = ap_url @@ -290,9 +288,9 @@ def dist_get_part_html_tree(self, pn, extra_search_terms='', url=None, descend=2 try: # Merge the pricing info from that into the main parse tree to make # a single, unified set of price tiers... - merge_price_tiers(tree, ap_tree) + self.merge_price_tiers(tree, ap_tree) # and merge available quantity, using the maximum found. - merge_qty_avail(tree, ap_tree) + self.merge_qty_avail(tree, ap_tree) except AttributeError: self.logger.log(DEBUG_OBSESSIVE,'Problem merging price/qty for {} from {}'.format(pn, self.name)) continue @@ -352,7 +350,7 @@ def part_is_reeled(self, html_tree): @param html_tree `str()` html of the distributor part page. 
@return `True` or `False`. ''' - qty_tiers = list(get_price_tiers(html_tree).keys()) + qty_tiers = list(self.dist_get_price_tiers(html_tree).keys()) if len(qty_tiers) > 0 and min(qty_tiers) >= 100: return True if html_tree.find('table', @@ -372,8 +370,8 @@ def merge_price_tiers(self, main_tree, alt_tree): def merge_qty_avail(self, main_tree, alt_tree): '''Merge the quantities from the alternate-packaging tree into the main tree.''' try: - main_qty = get_qty_avail(main_tree) - alt_qty = get_qty_avail(alt_tree) + main_qty = self.dist_get_qty_avail(main_tree) + alt_qty = self.dist_get_qty_avail(alt_tree) if main_qty is None: merged_qty = alt_qty elif alt_qty is None: From 08412e359c056f914abd7230acd8783f42601ad6 Mon Sep 17 00:00:00 2001 From: Max Maisel Date: Mon, 4 Jun 2018 17:06:40 +0200 Subject: [PATCH 23/29] Fixed remaining RS refactoring issues and updated URL. --- kicost/distributors/rs/__init__.py | 2 +- kicost/distributors/rs/rs.py | 14 +++++++------- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/kicost/distributors/rs/__init__.py b/kicost/distributors/rs/__init__.py index 5c06b17b3..0b4b173d7 100644 --- a/kicost/distributors/rs/__init__.py +++ b/kicost/distributors/rs/__init__.py @@ -25,7 +25,7 @@ }, # Web site defitions. 'site': { - 'url': 'https://rs-online.com/', + 'url': 'https://it.rs-online.com/', 'currency': 'USD', 'locale': 'UK' }, diff --git a/kicost/distributors/rs/rs.py b/kicost/distributors/rs/rs.py index bf88b44f4..1fd87af0e 100644 --- a/kicost/distributors/rs/rs.py +++ b/kicost/distributors/rs/rs.py @@ -134,19 +134,19 @@ def dist_get_part_html_tree(self, pn, extra_search_terms='', url=None, descend=2 try: html = self.browser.scrape_URL(url) except: - self.logger.log(DEBUG_OBSESSIVE,'No HTML page for {} from {}'.format(pn, dist)) + self.logger.log(DEBUG_OBSESSIVE,'No HTML page for {} from {}'.format(pn, self.name)) raise PartHtmlError try: tree = BeautifulSoup(html, 'lxml') except Exception: - self.logger.log(DEBUG_OBSESSIVE,'No HTML tree for {} from {}'.format(pn, dist)) + self.logger.log(DEBUG_OBSESSIVE,'No HTML tree for {} from {}'.format(pn, self.name)) raise PartHtmlError # Abort if the part number isn't in the HTML somewhere. # (Only use the numbers and letters to compare PN to HTML.) if re.sub('[\W_]','',str.lower(pn)) not in re.sub('[\W_]','',str.lower(str(html))): - self.logger.log(DEBUG_OBSESSIVE,'No part number {} in HTML page from {}'.format(pn, dist)) + self.logger.log(DEBUG_OBSESSIVE,'No part number {} in HTML page from {}'.format(pn, self.name)) raise PartHtmlError # If the tree contains the tag for a product page, then just return it. @@ -155,9 +155,9 @@ def dist_get_part_html_tree(self, pn, extra_search_terms='', url=None, descend=2 # If the tree is for a list of products, then examine the links to try to find the part number. if tree.find('div', class_=('resultsTable','results-table-container')) is not None: - self.logger.log(DEBUG_OBSESSIVE,'Found product table for {} from {}'.format(pn, dist)) + self.logger.log(DEBUG_OBSESSIVE,'Found product table for {} from {}'.format(pn, self.name)) if descend <= 0: - self.logger.log(DEBUG_OBSESSIVE,'Passed descent limit for {} from {}'.format(pn, dist)) + self.logger.log(DEBUG_OBSESSIVE,'Passed descent limit for {} from {}'.format(pn, self.name)) raise PartHtmlError else: # Look for the table of products. 
@@ -177,12 +177,12 @@ def dist_get_part_html_tree(self, pn, extra_search_terms='', url=None, descend=2 for i in range(len(product_links)): if part_numbers[i] == match: # Get the tree for the linked-to page and return that. - self.logger.log(DEBUG_OBSESSIVE,'Selecting {} from product table for {} from {}'.format(part_numbers[i], pn, dist)) + self.logger.log(DEBUG_OBSESSIVE,'Selecting {} from product table for {} from {}'.format(part_numbers[i], pn, self.name)) return self.dist_get_part_html_tree(pn, extra_search_terms, url=product_links[i], descend=descend-1) # I don't know what happened here, so give up. - self.logger.log(DEBUG_OBSESSIVE,'Unknown error for {} from {}'.format(pn, dist)) + self.logger.log(DEBUG_OBSESSIVE,'Unknown error for {} from {}'.format(pn, self.name)) self.logger.log(DEBUG_HTTP_RESPONSES,'Response was %s' % html) raise PartHtmlError From 5a904941bbfea34beb29d4fa167b7c14581779ca Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Hildo=20Guillardi=20J=C3=BAnior?= Date: Mon, 4 Jun 2018 17:27:15 +0200 Subject: [PATCH 24/29] TME fix and clean-up imports --- AUTHORS.rst | 3 ++- HISTORY.rst | 12 +++++++----- kicost/distributors/digikey/digikey.py | 12 ++---------- kicost/distributors/farnell/farnell.py | 15 +++------------ kicost/distributors/local/local.py | 6 +----- kicost/distributors/mouser/mouser.py | 15 +++------------ kicost/distributors/newark/newark.py | 15 +++------------ kicost/distributors/rs/rs.py | 7 +------ kicost/distributors/tme/tme.py | 18 ++++-------------- 9 files changed, 26 insertions(+), 77 deletions(-) diff --git a/AUTHORS.rst b/AUTHORS.rst index df70686fb..8e7c4299a 100644 --- a/AUTHORS.rst +++ b/AUTHORS.rst @@ -16,4 +16,5 @@ Contributors * Diorcet Yann: https://github.com/diorcety * Giacinto Luigi Cerone https://github.com/glcerone * Hildo Guillardi Júnior https://github.com/hildogjr -* Adam Heinrich https://github.com/adamheinrich \ No newline at end of file +* Adam Heinrich https://github.com/adamheinrich +* Max Maisel https://github.com/mmmaisel diff --git a/HISTORY.rst b/HISTORY.rst index f8ca7a592..a9165c305 100644 --- a/HISTORY.rst +++ b/HISTORY.rst @@ -7,14 +7,16 @@ History ______________________ * Changed Farnell link and table result format. - +* Fixed TME `fake_browser` migration. +* Re-factored the distributors modules to class kind and improved the scrape sequence to decrease probability of ban. +* Fixed the multi-threading configuration. 0.1.44 (2018-05-28) ______________________ -* Fixed `logging` messages when using `tqdm`(process bar) for sequencial scrape, missing fix for multithreads scrape. -* Improve the `spreadsheet.py` to a lighter file when use just one distributor. -* Improved log messages to better comunity debug. +* Fixed ``logging`` messages when using ``tqdm``(process bar) for sequential scrape, missing fix for multithreads scrape. +* Improve the ``spreadsheet.py`` to a lighter file when use just one distributor. +* Improved log messages to better community debug. * Add Upverter CSV compatibility. * Fixed Mouser "quote price" exception in the price tiers. * Fixed wxPython exception import. @@ -38,7 +40,7 @@ ______________________ * Added ``--group_fields`` option to ignore differences in fields of the components and group them. * Fixed the not ungrouping issue when ``manf#`` equal ``None``. * CSV now accepts files from Proteus and Eagle EDA tools. -* Cleared up unused Python imports and better placed functions into files (spreasheet creation files are now in ``spreadsheet.py``). 
+* Cleared up unused Python imports and better placed functions into files (spreadsheet creation files are now in ``spreadsheet.py``). * Added a KiCost stamp version at the end of the spreadsheet and file information in the beginning, if they are not inside it. * Fixed issues related to user visualization in the spreadsheet (added gray formatted conditioning and the "exclude desc and manf columns"). * Added "user errors" and software scape in the case of not recognized references characters given the message of how to solve. diff --git a/kicost/distributors/digikey/digikey.py b/kicost/distributors/digikey/digikey.py index 2c8e5ca87..2dc56c7d0 100644 --- a/kicost/distributors/digikey/digikey.py +++ b/kicost/distributors/digikey/digikey.py @@ -21,23 +21,15 @@ # THE SOFTWARE. # Inserted by Pasteurize tool. -from __future__ import print_function -from __future__ import unicode_literals -from __future__ import division -from __future__ import absolute_import -from builtins import zip -from builtins import range -from builtins import int -from builtins import str +from __future__ import print_function, unicode_literals, division, absolute_import +from builtins import zip, range, int, str from future import standard_library standard_library.install_aliases() - import future import re, difflib from bs4 import BeautifulSoup import http.client # For web scraping exceptions. -#from .. import urlencode, urlquote, urlsplit, urlunsplit from .. import fake_browser from .. import EXTRA_INFO_DIST, extra_info_dist_name_translations from ...globals import PartHtmlError diff --git a/kicost/distributors/farnell/farnell.py b/kicost/distributors/farnell/farnell.py index 3d0afdce5..fa64308f9 100644 --- a/kicost/distributors/farnell/farnell.py +++ b/kicost/distributors/farnell/farnell.py @@ -21,24 +21,15 @@ # THE SOFTWARE. # Inserted by Pasteurize tool. -from __future__ import print_function -from __future__ import unicode_literals -from __future__ import division -from __future__ import absolute_import -from builtins import zip -from builtins import range -from builtins import int -from builtins import str +from __future__ import print_function, unicode_literals, division, absolute_import +from builtins import zip, range, int, str from future import standard_library standard_library.install_aliases() - import future -import re -import difflib +import re, difflib from bs4 import BeautifulSoup import http.client # For web scraping exceptions. -#from .. import urlencode, urlquote, urlsplit, urlunsplit from .. import fake_browser from ...globals import PartHtmlError from ...globals import currency diff --git a/kicost/distributors/local/local.py b/kicost/distributors/local/local.py index a5ced0487..93fff0723 100644 --- a/kicost/distributors/local/local.py +++ b/kicost/distributors/local/local.py @@ -21,14 +21,10 @@ # THE SOFTWARE. # Inserted by Pasteurize tool. -from __future__ import print_function -from __future__ import unicode_literals -from __future__ import division -from __future__ import absolute_import +from __future__ import print_function, unicode_literals, division, absolute_import from builtins import zip, range, int, str from future import standard_library standard_library.install_aliases() - import future import re, difflib diff --git a/kicost/distributors/mouser/mouser.py b/kicost/distributors/mouser/mouser.py index be2108267..ea724f78a 100644 --- a/kicost/distributors/mouser/mouser.py +++ b/kicost/distributors/mouser/mouser.py @@ -21,24 +21,15 @@ # THE SOFTWARE. # Inserted by Pasteurize tool. 
-from __future__ import print_function -from __future__ import unicode_literals -from __future__ import division -from __future__ import absolute_import -from builtins import zip -from builtins import range -from builtins import int -from builtins import str +from __future__ import print_function, unicode_literals, division, absolute_import +from builtins import zip, range, int, str from future import standard_library standard_library.install_aliases() - import future -import re -import difflib +import re, difflib from bs4 import BeautifulSoup import http.client # For web scraping exceptions. -#from .. import urlencode, urlquote, urlsplit, urlunsplit from .. import fake_browser from ...globals import PartHtmlError from ...globals import logger, DEBUG_OVERVIEW, DEBUG_DETAILED, DEBUG_OBSESSIVE, DEBUG_HTTP_RESPONSES diff --git a/kicost/distributors/newark/newark.py b/kicost/distributors/newark/newark.py index 5f216b9e1..5deb4a5f6 100644 --- a/kicost/distributors/newark/newark.py +++ b/kicost/distributors/newark/newark.py @@ -21,24 +21,15 @@ # THE SOFTWARE. # Inserted by Pasteurize tool. -from __future__ import print_function -from __future__ import unicode_literals -from __future__ import division -from __future__ import absolute_import -from builtins import zip -from builtins import range -from builtins import int -from builtins import str +from __future__ import print_function, unicode_literals, division, absolute_import +from builtins import zip, range, int, str from future import standard_library standard_library.install_aliases() - import future -import re -import difflib +import re, difflib from bs4 import BeautifulSoup import http.client # For web scraping exceptions. -#from .. import urlencode, urlquote, urlsplit, urlunsplit from .. import fake_browser from ...globals import PartHtmlError from ...globals import logger, DEBUG_OVERVIEW, DEBUG_DETAILED, DEBUG_OBSESSIVE, DEBUG_HTTP_RESPONSES diff --git a/kicost/distributors/rs/rs.py b/kicost/distributors/rs/rs.py index 1fd87af0e..5ca7dbc5b 100644 --- a/kicost/distributors/rs/rs.py +++ b/kicost/distributors/rs/rs.py @@ -21,20 +21,15 @@ # THE SOFTWARE. # Inserted by Pasteurize tool. -from __future__ import print_function -from __future__ import unicode_literals -from __future__ import division -from __future__ import absolute_import +from __future__ import print_function, unicode_literals, division, absolute_import from builtins import zip, range, int, str from future import standard_library standard_library.install_aliases() - import future import re, difflib from bs4 import BeautifulSoup import http.client # For web scraping exceptions. -#from .. import urlencode, urlquote, urlsplit, urlunsplit from .. import fake_browser from ...globals import PartHtmlError from ...globals import logger, DEBUG_OVERVIEW, DEBUG_DETAILED, DEBUG_OBSESSIVE, DEBUG_HTTP_RESPONSES diff --git a/kicost/distributors/tme/tme.py b/kicost/distributors/tme/tme.py index 1c22b6d89..138771d96 100644 --- a/kicost/distributors/tme/tme.py +++ b/kicost/distributors/tme/tme.py @@ -21,25 +21,16 @@ # THE SOFTWARE. # Inserted by Pasteurize tool. 
-from __future__ import print_function -from __future__ import unicode_literals -from __future__ import division -from __future__ import absolute_import -from builtins import zip -from builtins import range -from builtins import int -from builtins import str +from __future__ import print_function, unicode_literals, division, absolute_import +from builtins import zip, range, int, str from future import standard_library standard_library.install_aliases() - import future -import re -import difflib +import re, difflib import json from bs4 import BeautifulSoup import http.client # For web scraping exceptions. -#from .. import urlencode, urlquote, urlsplit, urlunsplit from .. import fake_browser from ...globals import PartHtmlError from ...globals import logger, DEBUG_OVERVIEW, DEBUG_DETAILED, DEBUG_OBSESSIVE, DEBUG_HTTP_RESPONSES @@ -73,8 +64,7 @@ def __ajax_details(self, pn): return None, None try: - r = r.decode('utf-8') # Convert bytes to string in Python 3. - p = json.loads(r).get('Products') + p = json.loads(html).get('Products') if p is not None and isinstance(p, list): p = p[0] html_tree = BeautifulSoup(p.get('PriceTpl', '').replace("\n", ""), "lxml") From bcfbe7de293d70ec15d0c0c66d8370726615dd65 Mon Sep 17 00:00:00 2001 From: Max Maisel Date: Tue, 5 Jun 2018 17:09:27 +0200 Subject: [PATCH 25/29] Fixed syntax/naming error in digikey part number from table function. --- kicost/distributors/digikey/digikey.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/kicost/distributors/digikey/digikey.py b/kicost/distributors/digikey/digikey.py index 2dc56c7d0..a07f3ce8e 100644 --- a/kicost/distributors/digikey/digikey.py +++ b/kicost/distributors/digikey/digikey.py @@ -254,8 +254,8 @@ def dist_get_part_html_tree(self, pn, extra_search_terms='', url=None, descend=2 self.logger.log(DEBUG_OBSESSIVE,'Found {} alternate packagings for {} from {}'.format(len(ap_urls), pn, self.name)) ap_trees_and_urls = [] # Initialize as empty in case no alternate packagings are found. try: - ap_trees_and_urls = [get_part_html_tree(self.name, pn, - extra_search_terms, ap_url, descend=0, scrape_retries=scrape_retries) + ap_trees_and_urls = [self.dist_get_part_html_tree(pn, + extra_search_terms, ap_url, descend=0) for ap_url in ap_urls] except Exception: self.logger.log(DEBUG_OBSESSIVE,'Failed to find alternate packagings for {} from {}'.format(pn, self.name)) From d1902b37ba8107a5fa142d2aa820544c147dc698 Mon Sep 17 00:00:00 2001 From: Max Maisel Date: Wed, 6 Jun 2018 17:15:00 +0200 Subject: [PATCH 26/29] Fixed compiler errors in digikey dist_define_locale_currency, removed unnecessary non-logger debug print in kicost.py.
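
dist_define_locale_currency() kept only the country name string returned by pycountry but later tried to read .numeric from it, which failed; the full pycountry record is now kept and only its .name is used in the regular expression. For reference, the pycountry lookups involved behave roughly like the example below (illustration only; the values in the comments are what pycountry 18.x is expected to return):

    import pycountry   # already a KiCost dependency (see setup.py)

    country = pycountry.countries.get(alpha_2='US')
    print(country.name)      # 'United States'
    print(country.numeric)   # '840'
    print(pycountry.currencies.get(numeric=country.numeric).alpha_3)   # 'USD'
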
--- kicost/distributors/digikey/digikey.py | 7 ++++--- kicost/kicost.py | 1 - 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/kicost/distributors/digikey/digikey.py b/kicost/distributors/digikey/digikey.py index a07f3ce8e..da7a970eb 100644 --- a/kicost/distributors/digikey/digikey.py +++ b/kicost/distributors/digikey/digikey.py @@ -130,10 +130,11 @@ def dist_define_locale_currency(self, locale_iso=None, currency_iso=None): locale_iso = pycountry.countries.get(numeric=money.numeric).alpha_2 if locale_iso: locale_iso = locale_iso.upper() - country = pycountry.countries.get(alpha_2=locale_iso.upper()).name - html = html.find('li', text=re.compile(country, re.IGNORECASE)) + country = pycountry.countries.get(alpha_2=locale_iso.upper()) + html = html.find('li', text=re.compile(country.name, re.IGNORECASE)) url = html.find('a', id='linkcolor').get('href') - + + # Store new localized url in distributor_dict. distributor_dict[self.name]['site']['url'] = url distributor_dict[self.name]['site']['currency'] = pycountry.currencies.get(numeric=country.numeric).alpha_3 distributor_dict[self.name]['site']['locale'] = locale_iso diff --git a/kicost/kicost.py b/kicost/kicost.py index 9a2ed03e9..5f121fb3f 100644 --- a/kicost/kicost.py +++ b/kicost/kicost.py @@ -245,7 +245,6 @@ def mt_init_dist(d, scrape): return (d, instance) logger.log(DEBUG_OBSESSIVE, 'Starting {} threads to init distributors...'.format(num_processes)) - pprint.pprint(arg_sets) results = [pool.apply_async(mt_init_dist, args) for args in arg_sets] # Wait for all the processes to have results. From 3aa06596a897ce1fdf1b2c667e136d95ac0cefe1 Mon Sep 17 00:00:00 2001 From: Max Maisel Date: Wed, 6 Jun 2018 17:17:54 +0200 Subject: [PATCH 27/29] Implemented session recreation strategy if scraper gets detected. --- kicost/distributors/digikey/digikey.py | 7 +--- kicost/distributors/distributor.py | 10 +++-- kicost/distributors/fake_browser.py | 58 +++++++++++++++++++++----- kicost/distributors/farnell/farnell.py | 7 +--- kicost/distributors/local/local.py | 2 +- kicost/distributors/mouser/mouser.py | 7 +--- kicost/distributors/newark/newark.py | 7 +--- kicost/distributors/rs/rs.py | 7 +--- kicost/distributors/tme/tme.py | 7 +--- kicost/globals.py | 3 +- 10 files changed, 69 insertions(+), 46 deletions(-) diff --git a/kicost/distributors/digikey/digikey.py b/kicost/distributors/digikey/digikey.py index da7a970eb..0999f005e 100644 --- a/kicost/distributors/digikey/digikey.py +++ b/kicost/distributors/digikey/digikey.py @@ -43,11 +43,8 @@ class dist_digikey(distributor.distributor): def __init__(self, name, scrape_retries, log_level, throttle_delay): - super(dist_digikey, self).__init__(name, scrape_retries, log_level, throttle_delay) - self.domain = distributor_dict[self.name]['site']['url'] - - self.browser.scrape_URL(self.domain) - self.browser.show_cookies(self.name) + super(dist_digikey, self).__init__(name, distributor_dict[name]['site']['url'], + scrape_retries, log_level, throttle_delay) def dist_get_price_tiers(self, html_tree): '''@brief Get the pricing tiers from the parsed tree of the Digikey product page. 
diff --git a/kicost/distributors/distributor.py b/kicost/distributors/distributor.py index 3989458a1..1c2b96f56 100644 --- a/kicost/distributors/distributor.py +++ b/kicost/distributors/distributor.py @@ -59,13 +59,17 @@ class distributor: start_time = time.time() - def __init__(self, name, scrape_retries, log_level, throttle_delay): + def __init__(self, name, domain, scrape_retries, log_level, throttle_delay): self.name = name self.scrape_retries = scrape_retries self.logger = logger self.log_level = log_level - self.domain = None - self.browser = fake_browser.fake_browser(self.logger, self.scrape_retries, throttle_delay) + self.domain = domain + + # Don't create fake_browser for "local" distributor. + if self.domain != None: + self.browser = fake_browser.fake_browser \ + (self.domain, self.logger, self.scrape_retries, throttle_delay) # Abstract methods, implemented in distributor specific modules def dist_get_part_html_tree(self, pn, extra_search_terms, url, descend): diff --git a/kicost/distributors/fake_browser.py b/kicost/distributors/fake_browser.py index 0fd332e33..2d8cd99bf 100644 --- a/kicost/distributors/fake_browser.py +++ b/kicost/distributors/fake_browser.py @@ -31,7 +31,7 @@ import http.client # For web scraping exceptions. import requests -from ..globals import DEBUG_OVERVIEW, DEBUG_DETAILED, DEBUG_OBSESSIVE +from ..globals import DEBUG_OVERVIEW, DEBUG_DETAILED, DEBUG_OBSESSIVE, DEBUG_HTTP_HEADERS, DEBUG_HTTP_RESPONSES if sys.version_info>=(3,0): # This is for Python 3 @@ -152,12 +152,23 @@ def get_user_agent(): # Open the URL, read the HTML from it, and parse it into a tree structure. class fake_browser: - def __init__(self, logger, scrape_retries, throttle_delay): + def __init__(self, domain, logger, scrape_retries, throttle_delay): '''@brief fake_browser @param logger @param scrape_retries `int` Quantity of retries in case of fail. ''' - + + self.config_cookies = list() + self.domain = domain + self.throttle_delay = throttle_delay + self.throttle_timeout = time.time() + + self.scrape_retries = scrape_retries + self.logger = logger + + self.start_new_session() + + def start_new_session(self): self.userAgent = get_user_agent() # Use "requests" instead of "urllib" because "urllib" does not allow @@ -165,25 +176,31 @@ def __init__(self, logger, scrape_retries, throttle_delay): self.session = requests.session() self.session.headers["User-Agent"] = self.userAgent - self.throttle_delay = throttle_delay - self.throttle_timeout = time.time() + # Restore configuration cookies from previous session. + for c in self.config_cookies: + print("Restore cookie: %s", c) + self.session.cookies.set(c[1], c[2], domain=c[0]) - self.scrape_retries = scrape_retries - self.logger = logger + self.scrape_URL(self.domain, retry=False) + self.show_cookies() - def show_cookies(self, name): + def show_cookies(self): for x in self.session.cookies: self.logger.log(DEBUG_OBSESSIVE,"%s Cookie %s" % (x.domain, x.name)) def add_cookie(self, domain, name, value): self.session.cookies.set(name, value, domain=domain) + self.config_cookies.append((domain, name, value)) - def scrape_URL(self, url, add_header=[]): + def scrape_URL(self, url, add_header=[], retry=True): headers = self.session.headers for header in add_header: self.session.headers[header[1]] = header[2] - for _ in range(self.scrape_retries): + retries = self.scrape_retries + if retry == False: + retries = 1 + for _ in range(retries): try: # Check the throttling timeout of this browser to see if # another access to its website is allowed. 
@@ -197,7 +214,26 @@ def scrape_URL(self, url, add_header=[]): # Update the timeout for this browser. self.throttle_timeout = time.time() + self.throttle_delay - html = self.session.get(url, timeout=5).text + resp = self.session.get(url, timeout=5) + self.logger.log(DEBUG_HTTP_HEADERS, "Request headers: %s" % resp.request.headers) + self.logger.log(DEBUG_HTTP_HEADERS, "Response headers: %s" % resp.headers) + + # Uncomment this to dump received HTML to file. + #if self.logger.isEnabledFor(DEBUG_HTTP_RESPONSES): + # f = open("debug-page.html", "w") + # f.write(resp.text) + # f.close() + # input("Received page dumped, Press enter to continue.") + + # start new session if we are detected (received 403) + # TODO: add detection logic for captchas and javascript only pages as well + if resp.status_code == 403: + self.start_new_session() + self.logger.warning("Received 403, scraper possibly detected:" \ + " Starting new session for %s" % self.domain) + continue + + html = resp.text break except Exception as ex: self.logger.log(DEBUG_DETAILED,'Exception of type "%s" while web-scraping %s' \ diff --git a/kicost/distributors/farnell/farnell.py b/kicost/distributors/farnell/farnell.py index fa64308f9..b7289b2d0 100644 --- a/kicost/distributors/farnell/farnell.py +++ b/kicost/distributors/farnell/farnell.py @@ -43,11 +43,8 @@ class dist_farnell(distributor.distributor): def __init__(self, name, scrape_retries, log_level, throttle_delay): - super(dist_farnell, self).__init__(name, scrape_retries, log_level, throttle_delay) - self.domain = distributor_dict[self.name]['site']['url'] - - self.browser.scrape_URL(self.domain) - self.browser.show_cookies(self.name) + super(dist_farnell, self).__init__(name, distributor_dict[name]['site']['url'], + scrape_retries, log_level, throttle_delay) def dist_get_price_tiers(self, html_tree): '''@brief Get the pricing tiers from the parsed tree of the farnell product page. diff --git a/kicost/distributors/local/local.py b/kicost/distributors/local/local.py index 93fff0723..20489d1b2 100644 --- a/kicost/distributors/local/local.py +++ b/kicost/distributors/local/local.py @@ -44,7 +44,7 @@ class dist_local(distributor.distributor): html = None def __init__(self, name, scrape_retries, log_level, throttle_delay): - super(dist_local, self).__init__(name, scrape_retries, log_level, throttle_delay) + super(dist_local, self).__init__(name, None, scrape_retries, log_level, throttle_delay) def create_part_html(parts, distributors, logger): '''@brief Create HTML page containing info for local (non-webscraped) parts. diff --git a/kicost/distributors/mouser/mouser.py b/kicost/distributors/mouser/mouser.py index ea724f78a..1d95de231 100644 --- a/kicost/distributors/mouser/mouser.py +++ b/kicost/distributors/mouser/mouser.py @@ -40,13 +40,10 @@ class dist_mouser(distributor.distributor): def __init__(self, name, scrape_retries, log_level, throttle_delay): - super(dist_mouser, self).__init__(name, scrape_retries, log_level, throttle_delay) - self.domain = distributor_dict[self.name]['site']['url'] + super(dist_mouser, self).__init__(name, distributor_dict[name]['site']['url'], + scrape_retries, log_level, throttle_delay) self.browser.add_cookie('.mouser.com', 'preferences', 'ps=www2&pl=en-US&pc_www2=USDe') - self.browser.scrape_URL(self.domain) - self.browser.show_cookies(self.name) - def dist_get_price_tiers(self, html_tree): '''@brief Get the pricing tiers from the parsed tree of the Mouser product page. @param html_tree `str()` html of the distributor part page. 
diff --git a/kicost/distributors/newark/newark.py b/kicost/distributors/newark/newark.py index 5deb4a5f6..356053600 100644 --- a/kicost/distributors/newark/newark.py +++ b/kicost/distributors/newark/newark.py @@ -40,11 +40,8 @@ class dist_newark(distributor.distributor): def __init__(self, name, scrape_retries, log_level, throttle_delay): - super(dist_newark, self).__init__(name, scrape_retries, log_level, throttle_delay) - self.domain = distributor_dict[self.name]['site']['url'] - - self.browser.scrape_URL(self.domain) - self.browser.show_cookies(self.name) + super(dist_newark, self).__init__(name, distributor_dict[name]['site']['url'], + scrape_retries, log_level, throttle_delay) def dist_get_price_tiers(self, html_tree): '''@brief Get the pricing tiers from the parsed tree of the Newark product page. diff --git a/kicost/distributors/rs/rs.py b/kicost/distributors/rs/rs.py index 5ca7dbc5b..fd80265ef 100644 --- a/kicost/distributors/rs/rs.py +++ b/kicost/distributors/rs/rs.py @@ -41,11 +41,8 @@ class dist_rs(distributor.distributor): def __init__(self, name, scrape_retries, log_level, throttle_delay): - super(dist_rs, self).__init__(name, scrape_retries, log_level, throttle_delay) - self.domain = distributor_dict[self.name]['site']['url'] - - self.browser.scrape_URL(self.domain) - self.browser.show_cookies(self.name) + super(dist_rs, self).__init__(name, distributor_dict[name]['site']['url'], + scrape_retries, log_level, throttle_delay) def dist_get_price_tiers(self, html_tree): '''@brief Get the pricing tiers from the parsed tree of the RS Components product page. diff --git a/kicost/distributors/tme/tme.py b/kicost/distributors/tme/tme.py index 138771d96..21f8aa6d5 100644 --- a/kicost/distributors/tme/tme.py +++ b/kicost/distributors/tme/tme.py @@ -41,11 +41,8 @@ class dist_tme(distributor.distributor): def __init__(self, name, scrape_retries, log_level, throttle_delay): - super(dist_tme, self).__init__(name, scrape_retries, log_level, throttle_delay) - self.domain = distributor_dict[self.name]['site']['url'] - - self.browser.scrape_URL(self.domain) - self.browser.show_cookies(self.name) + super(dist_tme, self).__init__(name, distributor_dict[name]['site']['url'], + scrape_retries, log_level, throttle_delay) def __ajax_details(self, pn): '''@brief Load part details from TME using XMLHttpRequest. diff --git a/kicost/globals.py b/kicost/globals.py index 4d95d392f..d6e75fa12 100644 --- a/kicost/globals.py +++ b/kicost/globals.py @@ -29,7 +29,8 @@ DEBUG_OVERVIEW = logging.DEBUG DEBUG_DETAILED = logging.DEBUG-1 DEBUG_OBSESSIVE = logging.DEBUG-2 -DEBUG_HTTP_RESPONSES = logging.DEBUG-3 +DEBUG_HTTP_HEADERS = logging.DEBUG-3 +DEBUG_HTTP_RESPONSES = logging.DEBUG-4 SEPRTR = ':' # Delimiter between library:component, distributor:field, etc. From fa1d98f0db7065230a00f026bf6e1a69b42a3c84 Mon Sep 17 00:00:00 2001 From: Max Maisel Date: Wed, 6 Jun 2018 17:20:07 +0200 Subject: [PATCH 28/29] Removed unused "log_level" parameter in distributor class. This commit finishes the refactoring. 
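
Since all KiCost debug levels sit just below logging.DEBUG (DEBUG_HTTP_RESPONSES
is the lowest in use at DEBUG-4), the per-process handlers can simply be pinned
to level 1 instead of threading log_level through every constructor; level 0
would mean NOTSET, which is why globals.py now notes logging.DEBUG-9 as the
floor. A short standalone sketch of that behaviour (only the
DEBUG_HTTP_RESPONSES name is taken from globals.py, the rest is illustrative):

    import logging
    import sys

    DEBUG_HTTP_RESPONSES = logging.DEBUG - 4   # = 6, most verbose level in use

    log = logging.getLogger('kicost-sketch')
    handler = logging.StreamHandler(sys.stdout)
    handler.setLevel(1)        # 1 <= 6, so even the most verbose records pass
    log.addHandler(handler)
    log.setLevel(1)

    log.log(DEBUG_HTTP_RESPONSES, 'emitted despite the sub-DEBUG level')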
--- kicost/distributors/digikey/digikey.py | 4 ++-- kicost/distributors/distributor.py | 7 +++---- kicost/distributors/farnell/farnell.py | 4 ++-- kicost/distributors/local/local.py | 4 ++-- kicost/distributors/mouser/mouser.py | 4 ++-- kicost/distributors/newark/newark.py | 4 ++-- kicost/distributors/rs/rs.py | 4 ++-- kicost/distributors/tme/tme.py | 4 ++-- kicost/globals.py | 1 + kicost/kicost.py | 2 +- 10 files changed, 19 insertions(+), 19 deletions(-) diff --git a/kicost/distributors/digikey/digikey.py b/kicost/distributors/digikey/digikey.py index 0999f005e..1059392b7 100644 --- a/kicost/distributors/digikey/digikey.py +++ b/kicost/distributors/digikey/digikey.py @@ -42,9 +42,9 @@ import pycountry class dist_digikey(distributor.distributor): - def __init__(self, name, scrape_retries, log_level, throttle_delay): + def __init__(self, name, scrape_retries, throttle_delay): super(dist_digikey, self).__init__(name, distributor_dict[name]['site']['url'], - scrape_retries, log_level, throttle_delay) + scrape_retries, throttle_delay) def dist_get_price_tiers(self, html_tree): '''@brief Get the pricing tiers from the parsed tree of the Digikey product page. diff --git a/kicost/distributors/distributor.py b/kicost/distributors/distributor.py index 1c2b96f56..cdddfbde2 100644 --- a/kicost/distributors/distributor.py +++ b/kicost/distributors/distributor.py @@ -59,11 +59,10 @@ class distributor: start_time = time.time() - def __init__(self, name, domain, scrape_retries, log_level, throttle_delay): + def __init__(self, name, domain, scrape_retries, throttle_delay): self.name = name self.scrape_retries = scrape_retries self.logger = logger - self.log_level = log_level self.domain = domain # Don't create fake_browser for "local" distributor. @@ -128,9 +127,9 @@ def scrape_part(self, id, part): else: self.logger = multiprocessing.get_logger() handler = logging.StreamHandler(sys.stdout) - handler.setLevel(self.log_level) + handler.setLevel(1) self.logger.addHandler(handler) - self.logger.setLevel(self.log_level) + self.logger.setLevel(1) self.browser.logger = self.logger url = {} diff --git a/kicost/distributors/farnell/farnell.py b/kicost/distributors/farnell/farnell.py index b7289b2d0..78e6f05bc 100644 --- a/kicost/distributors/farnell/farnell.py +++ b/kicost/distributors/farnell/farnell.py @@ -42,9 +42,9 @@ __author__='Giacinto Luigi Cerone' class dist_farnell(distributor.distributor): - def __init__(self, name, scrape_retries, log_level, throttle_delay): + def __init__(self, name, scrape_retries, throttle_delay): super(dist_farnell, self).__init__(name, distributor_dict[name]['site']['url'], - scrape_retries, log_level, throttle_delay) + scrape_retries, throttle_delay) def dist_get_price_tiers(self, html_tree): '''@brief Get the pricing tiers from the parsed tree of the farnell product page. diff --git a/kicost/distributors/local/local.py b/kicost/distributors/local/local.py index 20489d1b2..a203c2a2e 100644 --- a/kicost/distributors/local/local.py +++ b/kicost/distributors/local/local.py @@ -43,8 +43,8 @@ class dist_local(distributor.distributor): # Static variable which contains local part html. 
html = None - def __init__(self, name, scrape_retries, log_level, throttle_delay): - super(dist_local, self).__init__(name, None, scrape_retries, log_level, throttle_delay) + def __init__(self, name, scrape_retries, throttle_delay): + super(dist_local, self).__init__(name, None, scrape_retries, throttle_delay) def create_part_html(parts, distributors, logger): '''@brief Create HTML page containing info for local (non-webscraped) parts. diff --git a/kicost/distributors/mouser/mouser.py b/kicost/distributors/mouser/mouser.py index 1d95de231..60b813e5e 100644 --- a/kicost/distributors/mouser/mouser.py +++ b/kicost/distributors/mouser/mouser.py @@ -39,9 +39,9 @@ from urllib.parse import quote_plus as urlquote class dist_mouser(distributor.distributor): - def __init__(self, name, scrape_retries, log_level, throttle_delay): + def __init__(self, name, scrape_retries, throttle_delay): super(dist_mouser, self).__init__(name, distributor_dict[name]['site']['url'], - scrape_retries, log_level, throttle_delay) + scrape_retries, throttle_delay) self.browser.add_cookie('.mouser.com', 'preferences', 'ps=www2&pl=en-US&pc_www2=USDe') def dist_get_price_tiers(self, html_tree): diff --git a/kicost/distributors/newark/newark.py b/kicost/distributors/newark/newark.py index 356053600..3aa69043c 100644 --- a/kicost/distributors/newark/newark.py +++ b/kicost/distributors/newark/newark.py @@ -39,9 +39,9 @@ from urllib.parse import quote_plus as urlquote class dist_newark(distributor.distributor): - def __init__(self, name, scrape_retries, log_level, throttle_delay): + def __init__(self, name, scrape_retries, throttle_delay): super(dist_newark, self).__init__(name, distributor_dict[name]['site']['url'], - scrape_retries, log_level, throttle_delay) + scrape_retries, throttle_delay) def dist_get_price_tiers(self, html_tree): '''@brief Get the pricing tiers from the parsed tree of the Newark product page. diff --git a/kicost/distributors/rs/rs.py b/kicost/distributors/rs/rs.py index fd80265ef..f63109ce0 100644 --- a/kicost/distributors/rs/rs.py +++ b/kicost/distributors/rs/rs.py @@ -40,9 +40,9 @@ from urllib.parse import quote_plus as urlquote class dist_rs(distributor.distributor): - def __init__(self, name, scrape_retries, log_level, throttle_delay): + def __init__(self, name, scrape_retries, throttle_delay): super(dist_rs, self).__init__(name, distributor_dict[name]['site']['url'], - scrape_retries, log_level, throttle_delay) + scrape_retries, throttle_delay) def dist_get_price_tiers(self, html_tree): '''@brief Get the pricing tiers from the parsed tree of the RS Components product page. diff --git a/kicost/distributors/tme/tme.py b/kicost/distributors/tme/tme.py index 21f8aa6d5..301434485 100644 --- a/kicost/distributors/tme/tme.py +++ b/kicost/distributors/tme/tme.py @@ -40,9 +40,9 @@ from urllib.parse import quote_plus as urlquote, urlencode class dist_tme(distributor.distributor): - def __init__(self, name, scrape_retries, log_level, throttle_delay): + def __init__(self, name, scrape_retries, throttle_delay): super(dist_tme, self).__init__(name, distributor_dict[name]['site']['url'], - scrape_retries, log_level, throttle_delay) + scrape_retries, throttle_delay) def __ajax_details(self, pn): '''@brief Load part details from TME using XMLHttpRequest. 
diff --git a/kicost/globals.py b/kicost/globals.py index d6e75fa12..54e78caf8 100644 --- a/kicost/globals.py +++ b/kicost/globals.py @@ -31,6 +31,7 @@ DEBUG_OBSESSIVE = logging.DEBUG-2 DEBUG_HTTP_HEADERS = logging.DEBUG-3 DEBUG_HTTP_RESPONSES = logging.DEBUG-4 +# Minimum possible log level is logging.DEBUG-9 ! SEPRTR = ':' # Delimiter between library:component, distributor:field, etc. diff --git a/kicost/kicost.py b/kicost/kicost.py index 5f121fb3f..8b567b158 100644 --- a/kicost/kicost.py +++ b/kicost/kicost.py @@ -233,7 +233,7 @@ def mt_init_dist(d, scrape): ctor = globals()['dist_local'] else: ctor = globals()['dist_'+d] - instance = ctor(d, scrape_retries, 5, throttling_delay) # TODO: log level + instance = ctor(d, scrape_retries, throttling_delay) except Exception as ex: logger.log(DEBUG_OVERVIEW, "Initialising %s failed with %s, exculding this distributor..." \ % (d, type(ex).__name__)) From 51ba35d8c903120ea2a4c8991199d770523984be Mon Sep 17 00:00:00 2001 From: Max Maisel Date: Thu, 7 Jun 2018 17:08:19 +0200 Subject: [PATCH 29/29] Remove trailing space from search URLs. --- kicost/distributors/digikey/digikey.py | 8 +++----- kicost/distributors/farnell/farnell.py | 8 ++++---- kicost/distributors/mouser/mouser.py | 6 +++--- kicost/distributors/newark/newark.py | 7 ++++--- kicost/distributors/rs/rs.py | 5 +++-- kicost/distributors/tme/tme.py | 6 +++--- 6 files changed, 20 insertions(+), 20 deletions(-) diff --git a/kicost/distributors/digikey/digikey.py b/kicost/distributors/digikey/digikey.py index 1059392b7..5863e8b7d 100644 --- a/kicost/distributors/digikey/digikey.py +++ b/kicost/distributors/digikey/digikey.py @@ -197,11 +197,9 @@ def dist_get_part_html_tree(self, pn, extra_search_terms='', url=None, descend=2 # Use the part number to lookup the part using the site search function, unless a starting url was given. if url is None: - url = distributor_dict['digikey']['site']['url'] + '/products/en?keywords=' + urlquote( - #'/scripts/DkSearch/dksus.dll?WT.z_header=search_go&lang=en&keywords=' + urlquote( - pn + ' ' + extra_search_terms, - safe='') - #url = distributor_dict['digikey']['site']['url'] + '/product-search/en?KeyWords=' + urlquote(pn,safe='') + '&WT.z_header=search_go' + url = distributor_dict['digikey']['site']['url'] + '/products/en?keywords=' + urlquote(pn, safe='') + if extra_search_terms: + url = url + urlquote(' ' + extra_search_terms, safe='') elif url[0] == '/': url = distributor_dict['digikey']['site']['url'] + url diff --git a/kicost/distributors/farnell/farnell.py b/kicost/distributors/farnell/farnell.py index 78e6f05bc..7509d8186 100644 --- a/kicost/distributors/farnell/farnell.py +++ b/kicost/distributors/farnell/farnell.py @@ -132,10 +132,10 @@ def dist_get_part_html_tree(self, pn, extra_search_terms='', url=None, descend=2 # Use the part number to lookup the part using the site search function, unless a starting url was given. 
if url is None: - url = 'http://it.farnell.com/Search?storeId=10165&catalogId=15001&categoryName=&selectedCategoryId=&langId=-4&categoryIdBox=&st=' + urlquote( - pn + ' ' + extra_search_terms, - safe='') - + url = 'http://it.farnell.com/Search?storeId=10165&catalogId=15001&categoryName=&selectedCategoryId=&langId=-4&categoryIdBox=&st=' \ + + urlquote(pn, safe='') + if extra_search_terms: + url = url + urlquote(' ' + extra_search_terms, safe='') elif url[0] == '/': url = 'http://www.farnell.com' + url elif url.startswith('..'): diff --git a/kicost/distributors/mouser/mouser.py b/kicost/distributors/mouser/mouser.py index 60b813e5e..e299147cb 100644 --- a/kicost/distributors/mouser/mouser.py +++ b/kicost/distributors/mouser/mouser.py @@ -138,9 +138,9 @@ def dist_get_part_html_tree(self, pn, extra_search_terms='', url=None, descend=2 # Use the part number to lookup the part using the site search function, unless a starting url was given. if url is None: - url = 'https://www.mouser.com/Search/Refine.aspx?Keyword=' + urlquote( - pn + ' ' + extra_search_terms, - safe='') + url = 'https://www.mouser.com/Search/Refine.aspx?Keyword=' + urlquote(pn, safe='') + if extra_search_terms: + url = url + urlquote(' ' + extra_search_terms, safe='') elif url[0] == '/': url = 'https://www.mouser.com' + url elif url.startswith('..'): diff --git a/kicost/distributors/newark/newark.py b/kicost/distributors/newark/newark.py index 3aa69043c..1e3a2176d 100644 --- a/kicost/distributors/newark/newark.py +++ b/kicost/distributors/newark/newark.py @@ -133,9 +133,10 @@ def dist_get_part_html_tree(self, pn, extra_search_terms='', url=None, descend=2 # Use the part number to lookup the part using the site search function, unless a starting url was given. if url is None: - url = 'http://www.newark.com/webapp/wcs/stores/servlet/Search?catalogId=15003&langId=-1&storeId=10194&gs=true&st=' + urlquote( - pn + ' ' + extra_search_terms, - safe='') + url = 'http://www.newark.com/webapp/wcs/stores/servlet/Search?catalogId=15003&langId=-1&storeId=10194&gs=true&st=' \ + + urlquote(pn, safe='') + if extra_search_terms: + url = url + urlquote(' ' + extra_search_terms, safe='') elif url[0] == '/': url = 'http://www.newark.com' + url elif url.startswith('..'): diff --git a/kicost/distributors/rs/rs.py b/kicost/distributors/rs/rs.py index f63109ce0..67dabcee2 100644 --- a/kicost/distributors/rs/rs.py +++ b/kicost/distributors/rs/rs.py @@ -115,8 +115,9 @@ def dist_get_part_html_tree(self, pn, extra_search_terms='', url=None, descend=2 # Use the part number to lookup the part using the site search function, unless a starting url was given. if url is None: - url = 'http://it.rs-online.com/web/c/?searchTerm=' + urlquote(pn + ' ' + extra_search_terms, safe='') - + url = 'http://it.rs-online.com/web/c/?searchTerm=' + urlquote(pn, safe='') + if extra_search_terms: + url = url + urlquote(' ' + extra_search_terms, safe='') elif url[0] == '/': url = 'http://it.rs-online.com' + url elif url.startswith('..'): diff --git a/kicost/distributors/tme/tme.py b/kicost/distributors/tme/tme.py index 301434485..1b1aa4e3f 100644 --- a/kicost/distributors/tme/tme.py +++ b/kicost/distributors/tme/tme.py @@ -157,9 +157,9 @@ def dist_get_part_html_tree(self, pn, extra_search_terms='', url=None, descend=2 # Use the part number to lookup the part using the site search function, unless a starting url was given. 
if url is None: - url = 'https://www.tme.eu/en/katalog/?search=' + urlquote( - pn + ' ' + extra_search_terms, - safe='') + url = 'https://www.tme.eu/en/katalog/?search=' + urlquote(pn, safe='') + if extra_search_terms: + url = url + urlquote(' ' + extra_search_terms, safe='') elif url[0] == '/': url = 'https://www.tme.eu' + url
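
With this patch, all six distributor modules it touches build their search URLs
the same way: quote the bare part number first and append the extra search
terms only when they are non-empty, so an empty extra_search_terms no longer
leaves an encoded trailing space in the query. A standalone sketch of that
shape (helper name and example values are illustrative, not KiCost API):

    from urllib.parse import quote_plus as urlquote

    def build_search_url(base, pn, extra_search_terms=''):
        url = base + urlquote(pn, safe='')
        if extra_search_terms:
            url += urlquote(' ' + extra_search_terms, safe='')
        return url

    # The old form, urlquote(pn + ' ' + extra_search_terms), gave '...?search=NE555+'
    # when extra_search_terms was empty; the new form drops the trailing '+':
    print(build_search_url('https://www.tme.eu/en/katalog/?search=', 'NE555'))
    # -> https://www.tme.eu/en/katalog/?search=NE555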