diff --git a/.bumpversion.cfg b/.bumpversion.cfg new file mode 100644 index 0000000..6ef1061 --- /dev/null +++ b/.bumpversion.cfg @@ -0,0 +1,6 @@ +[bumpversion] +current_version = 0.2.2 +commit = True +tag = True + +[bumpversion:file:setup.py] diff --git a/.circleci/config.yml b/.circleci/config.yml index 67fdae6..c598217 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -1,47 +1,35 @@ -# Python CircleCI 2.0 configuration file -# -# Check https://circleci.com/docs/2.0/language-python/ for more details -# version: 2 jobs: - toxify: + python3.6: docker: - - image: themattrix/tox + - image: python:3.6 steps: - checkout - - run: - name: Run tests in supported Python versions - command: | - pip install tox tox-pyenv - pyenv local 3.5.3 3.6.0 3.7.0 3.8.0 - tox - build: + - run: | + pip install tox + tox + python3.7: docker: - - image: circleci/python:3.6.1 + - image: python:3.7 steps: - checkout - # Download and cache dependencies - - restore_cache: - keys: - - v1-dependencies-{{ checksum "requirements.txt" }} - # fallback to using the latest cache if no exact match is found - - v1-dependencies- - - run: - name: install dependencies - command: | - python3 -m venv venv - . venv/bin/activate - pip install -r requirements.txt - pip install -r requirements-dev.txt - - save_cache: - paths: - - ./venv - key: v1-dependencies-{{ checksum "requirements.txt" }} + - run: | + pip install tox + tox + python3.8: + docker: + - image: python:3.8 + steps: + - checkout + - run: | + pip install tox + tox workflows: version: 2 shublang: jobs: - - toxify - - build + - python3.6 + - python3.7 + - python3.8 diff --git a/LICENSE b/LICENSE new file mode 100644 index 0000000..7d27cbc --- /dev/null +++ b/LICENSE @@ -0,0 +1,26 @@ +Copyright 2020 Scrapinghub + +Redistribution and use in source and binary forms, with or without modification, +are permitted provided that the following conditions are met: + +1. 
Redistributions of source code must retain the above copyright notice, +this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright notice, +this list of conditions and the following disclaimer in the documentation and/or +other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its contributors +may be used to endorse or promote products derived from this software without +specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR +ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON +ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
\ No newline at end of file diff --git a/requirements.txt b/requirements.txt index 8fd2a16..f75d9d8 100644 --- a/requirements.txt +++ b/requirements.txt @@ -16,6 +16,7 @@ py==1.8.0 pyparsing==2.4.2 pytest==5.2.1 six==1.12.0 +unidecode==0.4.20 w3lib==1.21.0 wcwidth==0.1.7 zipp==0.6.0 diff --git a/setup.py b/setup.py index 9159c4f..b40053d 100644 --- a/setup.py +++ b/setup.py @@ -4,13 +4,13 @@ setup( name='shublang', - version='0.1.2', + version='0.2.2', license='BSD', + author='Akshay Philar', + author_email='akshayphilar@gmail.com', description='Shublang - Data Extraction DSL', - author='Akshay', - author_email='akshay@scrapinghub.com', + url="https://github.com/scrapinghub/shublang", packages=find_packages(exclude=exclude), - #package_data={'shublang': ['*.py']}, include_package_data=True, entry_points={ 'console_scripts': [ @@ -24,11 +24,11 @@ 'Operating System :: OS Independent', 'Programming Language :: Python', 'Programming Language :: Python :: 3', - 'Programming Language :: Python :: 3.5', 'Programming Language :: Python :: 3.6', 'Programming Language :: Python :: 3.7', 'Programming Language :: Python :: 3.8', ], + python_requires='>=3.6', install_requires=[ 'pipe >= 1.5.0', 'jmespath >= 0.9.4', @@ -36,5 +36,6 @@ 'parsel >= 1.5.2', 'dateparser >= 0.7.2', 'price-parser >= 0.3.2', + 'unidecode >= 0.4.20' ] ) diff --git a/shublang/shublang.py b/shublang/shublang.py index 23a0852..01d98a2 100644 --- a/shublang/shublang.py +++ b/shublang/shublang.py @@ -10,6 +10,8 @@ import dateparser from price_parser import Price import types +from unidecode import unidecode +from urllib import parse """ Conventions @@ -29,13 +31,110 @@ logging.basicConfig(format='%(levelname)s:%(message)s', level=logging.DEBUG) + +@Pipe +def map_value(iterable, rules_dict): + """Maps an input text to an output according to the map settings + configured at rules_dict. 
+ + :param iterable: collection of data to transform + :type iterable: list + + :param rules_dict: rules dictionary where the key should be the + exact text to look for and the value should be the desired output + :type rules_dict: dict + + """ + + return (rules_dict.get(x, x) for x in iterable) + + @Pipe def sub(iterable, pattern, repl=None): - if not repl: - repl = "" + """Replaces a substring with another substring using regular expressions. + + :param iterable: collection of data to transform + :type iterable: list + + :param pattern: regular expression to match and be replaced + :type pattern: string + + :param repl: (optional) the replacement substring + :type repl: string + """ + + repl = repl or "" return (re.sub(pattern, repl, x) for x in iterable) +@Pipe +def replace(iterable, old, new, count=None): + """Replaces a substring with another substring. + + :param iterable: collection of data to transform + :type iterable: list + + :param old: substring to be replaced + :type old: string + + :param new: the replacement substring + :type new: string + + :param count: (optional) The first n substring occurrences to be replaced + :type count: int + + NOTE: This doesn't support regular expressions which makes it safer and + easier. If you need regular expressions, make use of :func:`sub` which supports + it. + """ + + if count: + return (x.replace(old, new, count) for x in iterable) + return (x.replace(old, new) for x in iterable) + + +@Pipe +def format(iterable, template): + """Formats an iterable using a given string template + + :param iterable: collection of data to transform + :type iterable: list + + :param template: format string filled with the unpacked values of each item + :type template: string + """ + return (template.format(*x) for x in iterable) + + +@Pipe +def append(iterable, data): + """Appends data to the iterable.
+ + :param iterable: collection of data to transform + :type iterable: list + + :param data: any type of data to be appended + """ + + iterable.append(data) + return iterable + + +@Pipe +def extend(iterable, extension): + """Extends the iterable using another iterable. + + :param iterable: collection of data to transform + :type iterable: list + + :param extension: contains the additional iterable to extend the current one + :type extension: iterable + """ + + iterable.extend(extension) + return iterable + + @Pipe def encode(iterable, encoding, errors='ignore'): return (x.encode(encoding, errors=errors) for x in iterable) @@ -46,13 +145,54 @@ def decode(iterable, encoding): return (x.decode(encoding) for x in iterable) +@Pipe +def find(iterable, sub, start=None, end=None): + """Returns the lowest index in the string where the sub is found. + If specified, the start and end params serve to slice the string + where sub should be searched. + + :param iterable: collection of data to transform + :type iterable: list + + :param sub: the substring to search for. + :type sub: string + + :param start: (optional) where to start the search. Defaults to 0. + :type start: int + + :param end: (optional) where to end the search. Defaults to the + end of the string. + :type end: int + """ + + return (x.find(sub, start, end) for x in iterable) + + +@Pipe +def split(iterable, sep, maxsplit=-1): + """Returns a list of words in the string, using sep as the delimiter. + If maxsplit is given, at most maxsplit splits are done. + + :param iterable: collection of data to transform + :type iterable: list + + :param sep: this is a delimiter. The string will be split by this separator. + :type sep: string + + :param maxsplit: (optional) if given, there will be at most maxsplit splits.
+ :type maxsplit: int + """ + + return (x.split(sep, maxsplit) for x in iterable) + + @Pipe def sanitize(iterable): # TODO change name and add other options iterable = (x.strip() for x in iterable) iterable = (re.sub(r'[\n\t\r\s]+', ' ', x) for x in iterable) - iterable = (x.encode('ascii', errors='ignore').decode('ascii') for x in iterable) + iterable = (unidecode(x) for x in iterable) iterable = (replace_entities(x) for x in iterable) iterable = (remove_tags(x) for x in iterable) return iterable @@ -67,10 +207,12 @@ def xpath_getall(iterable, pred): def xpath_get(iterable, pred): return (Selector(x).xpath(pred).get() for x in iterable) + @Pipe def css_getall(iterable, pred): return (Selector(x).css(pred).getall() for x in iterable) + @Pipe def css_get(iterable, pred): return (Selector(x).css(pred).get() for x in iterable) @@ -80,14 +222,17 @@ def css_get(iterable, pred): def jmespath(iterable, query): return (jp.search(query, x) for x in iterable) + @Pipe def any(iterable): return builtins.any(iterable) + @Pipe def all(iterable): return builtins.all(iterable) + @Pipe def exists(iterable, pred): if pred in iterable: @@ -95,6 +240,7 @@ def exists(iterable, pred): else: return False + @Pipe def none(iterable, pred): if pred not in iterable: @@ -102,14 +248,22 @@ def none(iterable, pred): else: return False + @Pipe def length(iterable): return len(iterable) + @Pipe def bool(iterable): return (builtins.bool(x) for x in iterable) + +@Pipe +def str(iterable): + return (builtins.str(x) for x in iterable) + + @Pipe def float(iterable): return (builtins.float(x) for x in iterable) @@ -119,21 +273,26 @@ def float(iterable): def int(iterable): return (builtins.int(x) for x in iterable) + @Pipe def abs(iterable): return (builtins.abs(x) for x in iterable) + @Pipe def ceil(iterable): return (math.ceil(x) for x in iterable) + @Pipe def round(iterable, pred): return (builtins.round(x, pred) for x in iterable) + @Pipe def join(iterable, separator=", "): - return 
separator.join(builtins.map(str, iterable)) + return separator.join(builtins.map(builtins.str, iterable)) + @Pipe def capitalize(iterable): @@ -149,39 +308,88 @@ def isdigit(iterable): def isdecimal(iterable): return (x.isdecimal() for x in iterable) + @Pipe def startswith(iterable, pred): return (x.startswith(pred) for x in iterable) + @Pipe def endswith(iterable, pred): return (x.endswith(pred) for x in iterable) + @Pipe def re_search(iterable, pattern): #return (re.sub(pattern, repl, x) for x in iterable) iterable = builtins.map(lambda x: re.search(pattern, x), iterable) return (x.groups() if x else None for x in iterable) + @Pipe def json_loads(iterable): return (json.loads(x) for x in iterable) + @Pipe def date_format(iterable, fmt): return (dateparser.parse(item).strftime(fmt) for item in iterable) + @Pipe def extract_price(iterable): - return (str(Price.fromstring(item).amount) for item in iterable) + return (builtins.str(Price.fromstring(item).amount) for item in iterable) + @Pipe def extract_currency(iterable): return (Price.fromstring(item).currency for item in iterable) + +@Pipe +def urljoin(iterable, base): + return (parse.urljoin(base, url) for url in iterable) + +@Pipe +def identity(iterable, element): + """ Return the same element that is passed as parameter.""" + return (element) + +@Pipe +def urlparse_netloc(iterable): + return (parse.urlparse(url).netloc for url in iterable) + +@Pipe +def urlparse_params(iterable): + return (parse.urlparse(url).params for url in iterable) + +@Pipe +def urlparse_path(iterable): + return (parse.urlparse(url).path for url in iterable) + +@Pipe +def urlparse_query(iterable): + return (parse.urlparse(url).query for url in iterable) + +@Pipe +def urlparse_scheme(iterable): + return (parse.urlparse(url).scheme for url in iterable) + +@Pipe +def urlparse_fragment(iterable): + return (parse.urlparse(url).fragment for url in iterable) + +@Pipe +def urlparse(iterable): + parsed_iterable = (parse.urlparse(url) for url in iterable)
+ parsed_iterable = ({"scheme": it.scheme, "netloc": it.netloc, "path": it.path, + "params": it.params, "query": it.query, "fragment": it.fragment} for it in parsed_iterable) + return parsed_iterable + filter = where map = select + def evaluate(expression, data): # TODO use StatementParser.is_safe before evaluating code. # if StatementParser.is_safe(expression): diff --git a/tests/test_functions.py b/tests/test_functions.py index a52dc72..30f86e5 100644 --- a/tests/test_functions.py +++ b/tests/test_functions.py @@ -1,37 +1,281 @@ # TODO add tests for functions +import pytest from shublang import evaluate -def test_sub(): - text = "Python,Haskell,Scala,Rust" - assert evaluate('sub(",", " ")', data=[text]) == ["Python Haskell Scala Rust"] -def test_sub_2(): - text = "Python,Haskell,Scala,Rust" - assert evaluate('sub(",")', data=[text]) == ["PythonHaskellScalaRust"] +@pytest.mark.parametrize( + "test_input,expected", + [ + # if a mapping is not found, return the value itself + ( + [ + 'map_value({"This is foo": "foo", "This is bar": "bar"})', + ["This is foo", "This is not bar"] + ], + ['foo', 'This is not bar'] + ), + + # usual mapping + ( + [ + 'map_value({"1": "Available", "2": "Unavailable"})', + ['1', '2'] + ], + ['Available', 'Unavailable'] + ), + + # map string to number + ( + [ + 'map_value({"InStock": 1, "OutOfStock": 2})', + ['OutOfStock', 'InStock'] + ], + [2, 1] + ), + + # map number to string + ( + [ + 'map_value({0: "Online", 1: "Offline"})', + [0, 1] + ], + ['Online', 'Offline'] + ) + ] +) +def test_map_value(test_input, expected): + assert evaluate(*test_input) == expected + + +@pytest.mark.parametrize( + "test_input,expected", + [ + ( + ['str', [1, 2, 3]], + ['1', '2', '3'] + ), + ( + ['str', [1.1, 2.2, 3.3]], + ['1.1', '2.2', '3.3'] + ), + ( + ['str', ['1', '2', '3']], + ['1', '2', '3'] + ), + ] +) +def test_str(test_input, expected): + assert evaluate(*test_input) == expected + + +@pytest.mark.parametrize( + "test_input,expected", + [ + ( + 
['sub(",", " ")', ['Python,Haskell,Scala,Rust']], + ['Python Haskell Scala Rust'] + ), + + # Optional 'repl' param should work. + ( + ['sub(",")', ['Python,Haskell,Scala,Rust']], + ['PythonHaskellScalaRust'] + ), + + # Regular Expressions should work. + ( + ['sub("b{2}(?:\\s+)", "xx ")', ['b bb bbb']], + ['b xx bbb'] + ), + ] +) +def test_sub(test_input, expected): + assert evaluate(*test_input) == expected + + +@pytest.mark.parametrize( + "test_input,expected", + [ + ( + ['replace("cool", "dope")', ['Pretty cool']], + ['Pretty dope'] + ), + + # Optional 'count' param should work on the first n patterns encountered. + ( + ['replace("bb", "xx", 2)', ['bbb bbb bbb']], + ['xxb xxb bbb'] + ), + + # Regular expressions won't work on `replace`. + ( + ['replace("t+", "xx")', ['Regex Attempt']], + ['Regex Attempt'] + ), + ] +) +def test_replace(test_input, expected): + assert evaluate(*test_input) == expected + + +@pytest.mark.parametrize( + "test_input,expected", + [ + ( + ['format("Now Playing: {} and {}")', [['Rick', 'Morty']]], + ['Now Playing: Rick and Morty'] + ), + # Ordering should be respected + ( + ['format("{2}, {1}, and {0}")', [['a', 'b', 'c']]], + ['c, b, and a'] + ), + # Lists of lists are aggregated into a list + ( + ['format("{} and some value {}")', [[1, 2], ['x', 'y']]], + ['1 and some value 2', 'x and some value y'] + ), + # Args could be repeated + ( + ['format("{0}--{0}-{1}!")', [['Re', 'Remix']]], + ['Re--Re-Remix!'] + ), + # Standard Formatting should work + ( + ['format("{:.2f}")', [[7/3]]], + ['2.33'] + ), + ] +) +def test_format(test_input, expected): + assert evaluate(*test_input) == expected + + +@pytest.mark.parametrize( + "test_input,expected", + [ + ( + ['append("new thing")', ['A', 'B']], + ['A', 'B', 'new thing'] + ), + # list could be added as a single item + ( + ['append([1, 2, "3"])', ['A', 'B']], + ['A', 'B', [1, 2, '3']] + ), + ] +) +def test_append(test_input, expected): + assert evaluate(*test_input) == expected + + 
+@pytest.mark.parametrize( + "test_input,expected", + [ + ( + ['extend([1, 2, "3"])', ['A', 'B']], + ['A', 'B', 1, 2, '3'] + ), + + # generators will also work + ( + ['extend(range(3, 6))', ['A', 'B']], + ['A', 'B', 3, 4, 5] + ), + + # single strings are treated as iterables + ( + ['extend("new")', ['A', 'B']], + ['A', 'B', 'n', 'e', 'w'] + ), + ] +) +def test_extend(test_input, expected): + assert evaluate(*test_input) == expected + + +def test_extend_with_non_iterable(): + """It should raise a TypeError.""" + + with pytest.raises(TypeError): + evaluate("extend(123)", ['A', 'B']) + def test_encode(): text = "ἀἐἠἰὀὐὠὰᾀᾐ" assert evaluate('encode("UTF8")', data=[text]) ==\ - [b'\xe1\xbc\x80\xe1\xbc\x90\xe1\xbc\xa0\xe1\xbc\xb0\xe1\xbd\x80\xe1\xbd\x90\xe1\xbd\xa0\xe1\xbd\xb0\xe1' \ + [b'\xe1\xbc\x80\xe1\xbc\x90\xe1\xbc\xa0\xe1\xbc\xb0\xe1\xbd\x80\xe1\xbd\x90\xe1\xbd\xa0\xe1\xbd\xb0\xe1' b'\xbe\x80\xe1\xbe\x90'] + def test_decode(): text = b'\xe1\xbc\x80\xe1\xbc\x90\xe1\xbc\xa0\xe1\xbc\xb0\xe1\xbd\x80\xe1\xbd\x90\xe1\xbd\xa0\xe1\xbd\xb0\xe1' \ b'\xbe\x80\xe1\xbe\x90' assert evaluate('decode("UTF8")', data=[text]) == ["ἀἐἠἰὀὐὠὰᾀᾐ"] + +@pytest.mark.parametrize( + "test_input,expected", + [ + # should find at the entire string + ( + ['find("th")', ['Python']], + [2] + ), + + # should respect where the search starts + ( + ['find("th", 3)', ['Python']], + [-1] + ), + + # should respect where the search ends + ( + ['find("th", 0, 1)', ['Python']], + [-1] + ), + ] +) +def test_find(test_input, expected): + assert evaluate(*test_input) == expected + + +@pytest.mark.parametrize( + "test_input,expected", + [ + ( + ['split(",")', ['Python,Haskell,Scala,Rust']], + [['Python', 'Haskell', 'Scala', 'Rust']] + ), + + # maxsplit should limit the number of separations + ( + ['split(",", 2)', ['Python,Haskell,Scala,Rust']], + [['Python', 'Haskell', 'Scala,Rust']] + ), + ] +) +def test_split(test_input, expected): + assert evaluate(*test_input) == expected + + def test_sanitize(): text = 
["Python \t\t\t\t", "
Haskell", " Rust"] assert evaluate("sanitize", data=text) == ["Python", "Haskell", "Rust"] +def test_sanitize_1(): + text = [u"Checking unicode ko\u017eu\u0161\u010dek \t\t\t\t"] + assert evaluate("sanitize", data=text) == ["Checking unicode kozuscek"] + def test_xpath_getall(): html = '
  • ' assert evaluate(f'xpath_getall(\'//li[@class="results"]/ul/text()\')', data=[html]) == \ [["Skoda", "Vauxhall", "Peugot"]] + def test_xpath_get(): html = '
  • ' assert evaluate(f'xpath_get(\'//li[@class="results"]/ul/text()\')', data=[html]) == ["Skoda"] @@ -42,57 +286,70 @@ def test_css_getall(): assert evaluate(f'css_getall("li.results>ul::text")', data=[html]) == \ [["Skoda", "Vauxhall", "Peugot"]] + def test_css_get(): html = '
  • ' assert evaluate(f'css_get("li.results>ul::text")', data=[html]) == ["Skoda"] + def test_any(): - assert evaluate('any', [1, False, 2]) == True - assert evaluate('any', [0, False, None]) == False + assert evaluate('any', [1, False, 2]) is True + assert evaluate('any', [0, False, None]) is False + def test_all(): - assert evaluate('all', [1, False, []]) == False - assert evaluate('all', [1, True, [1]]) == True + assert evaluate('all', [1, False, []]) is False + assert evaluate('all', [1, True, [1]]) is True def test_exists(): - assert evaluate('exists(20)', [20, 0, 1]) == True - assert evaluate('exists(0)', [1, True, [1]]) == False + assert evaluate('exists(20)', [20, 0, 1]) is True + assert evaluate('exists(0)', [1, True, [1]]) is False def test_none(): - assert evaluate('none(20)', [20, 0, 1]) == False - assert evaluate('none(0)', [1, True, [1]]) == True + assert evaluate('none(20)', [20, 0, 1]) is False + assert evaluate('none(0)', [1, True, [1]]) is True + def test_length(): assert evaluate('length', [20, 0, 1]) == 3 assert evaluate('length', "length") == 6 + def test_bool(): assert list(evaluate('bool', [0, [], ''])) == [False, False, False] assert list(evaluate('bool', [1, [1], ''])) == [True, True, False] + def test_float(): assert evaluate('float', [20, 1, 2]) == [20.0, 1.0, 2.0] + def test_float_2(): assert evaluate('float', ["20", "1", "2"]) == [20.0, 1.0, 2.0] + def test_int(): assert evaluate('int', [20.0, 1.0, 2.0]) == [20, 1, 2] + def test_int_2(): assert evaluate('int', ["20", "1", "2"]) == [20, 1, 2] + def test_abs(): assert evaluate('abs', [-1, -2, 3]) == [1, 2, 3] + def test_ceil(): assert evaluate('ceil', [1.5, -2.2, 3.1]) == [2, -2, 4] + def test_round(): assert list(evaluate('round(2)', [1.221123])) == [1.22] + def test_filter(): assert list(evaluate('filter(lambda x: x>1)', [0, 1, 2])) == [2] @@ -101,35 +358,146 @@ def test_startswith(): assert list(evaluate('startswith("a")', ["andrew", "alex", "akshay"])) == [True, True, True] assert 
list(evaluate('startswith("b")', ["ian"])) == [False] + def test_endswith(): assert list(evaluate('endswith("a")', ["andrew", "alex", "akshay"])) == [False, False, False] assert list(evaluate('endswith("b")', ["Rob"])) == [True] + def test_re_search(): text = "Expected Price: $1233" assert evaluate('re_search(r"(\\d+)")', data=[text]) == [('1233', )] + def test_json_loads(): json_data = '{"results":["Skoda", "Peugot", "Vauxhall"]}' - assert evaluate("json_loads", data=[json_data]) == [{"results":["Skoda", "Peugot", "Vauxhall"]}] + assert evaluate("json_loads", data=[json_data]) == [{"results": ["Skoda", "Peugot", "Vauxhall"]}] + def test_date_format(): assert evaluate('date_format("%Y-%m-%d")|first', data=['15th August 2016']) == '2016-08-15' + def test_price_1(): assert evaluate('extract_price', data=['22,90 €']) == ['22.90'] + def test_price_2(): assert evaluate('extract_price', data=['$1,199.00']) == ['1199.00'] + def test_price_3(): assert evaluate('extract_price', data=['$12']) == ['12'] + def test_price_4(): assert evaluate('extract_price', data=['12.000,95']) == ['12000.95'] + def test_currency_1(): assert evaluate('extract_currency', data=['22,90 €']) == ['€'] + def test_currency_2(): assert evaluate('extract_currency', data=['$1,199.00']) == ['$'] + + +def test_urlparse_netloc(): + url = "https://scrapinghub.bamboohr.com/employees/performance/index.php?page=2107&subpage=1#123" + assert evaluate('urlparse_netloc', data=[url]) == ['scrapinghub.bamboohr.com'] + + +def test_urlparse_path(): + url = "https://scrapinghub.bamboohr.com/employees/performance/index.php?page=2107&subpage=1#123" + assert evaluate('urlparse_path', data=[url]) == ['/employees/performance/index.php'] + + +def test_urlparse_query(): + url = "https://scrapinghub.bamboohr.com/employees/performance/index.php?page=2107&subpage=1#123" + assert evaluate('urlparse_query', data=[url]) == ['page=2107&subpage=1'] + + +def test_urlparse_scheme(): + url = 
"https://scrapinghub.bamboohr.com/employees/performance/index.php?page=2107&subpage=1#123" + assert evaluate('urlparse_scheme', data=[url]) == ['https'] + + +def test_urlparse_fragment(): + url = "https://scrapinghub.bamboohr.com/employees/performance/index.php?page=2107&subpage=1#123" + assert evaluate('urlparse_fragment', data=[url]) == ['123'] + + +def test_urlparse(): + url = "https://scrapinghub.bamboohr.com/employees/performance/index.php?page=2107&subpage=1#123" + assert list(evaluate('urlparse', data=[url])[0].keys()) == ['scheme', 'netloc', 'path', 'params', 'query', 'fragment'] + + +@pytest.mark.parametrize( + "test_input,expected", + [ + ( + ['join("")', ['A', 'B']], + 'AB' + ), + ( + ['join("")', ("A", "B")], + 'AB' + ), + ( + ['join("")', (1, 2)], + '12' + ), + + ] +) +def test_join(test_input, expected): + assert evaluate(*test_input) == expected + + +@pytest.mark.parametrize( + "test_input,expected", + [ + ( + ['urljoin("http://scrapinghub.com/products.html")', ['autoextract.html', 'uncork.html', 'crawlera.html']], + [ + 'http://scrapinghub.com/autoextract.html', + 'http://scrapinghub.com/uncork.html', + 'http://scrapinghub.com/crawlera.html', + ] + ), + # If url is an absolute URL, the url’s host name and/or scheme will be present in the result + ( + ['urljoin("http://www.scrapinghub.com")', + ['//doc.scrapinghub.com/unified_schema.html#operation/product']], + [ + 'http://doc.scrapinghub.com/unified_schema.html#operation/product', + ] + ), + ] +) +def test_urljoin(test_input, expected): + assert evaluate(*test_input) == expected + +@pytest.mark.parametrize( + "test_input,expected", + [ + ( + ['identity(True)', + [10, 'far', ['boo', 3]]], + True, + ), + ( + ['identity("InStock")', + ["In Stock.", "Only 3 in Stock", "Stock Ok"]], + "InStock", + ), + ( + ['identity((1,2,3,4,5))', + "foo"], + (1,2,3,4,5), + ), + ] +) + +def test_identity(test_input, expected): + assert evaluate(*test_input) == expected diff --git a/tox.ini b/tox.ini index 
f08b092..75d1088 100644 --- a/tox.ini +++ b/tox.ini @@ -1,6 +1,5 @@ [tox] -envlist = py35, py36, py37, py38 -skip_missing_interpreters = True +envlist = py [testenv] commands = pytest tests/