diff --git a/.bumpversion.cfg b/.bumpversion.cfg
new file mode 100644
index 0000000..6ef1061
--- /dev/null
+++ b/.bumpversion.cfg
@@ -0,0 +1,6 @@
+[bumpversion]
+current_version = 0.2.2
+commit = True
+tag = True
+
+[bumpversion:file:setup.py]
diff --git a/.circleci/config.yml b/.circleci/config.yml
index 67fdae6..c598217 100644
--- a/.circleci/config.yml
+++ b/.circleci/config.yml
@@ -1,47 +1,35 @@
-# Python CircleCI 2.0 configuration file
-#
-# Check https://circleci.com/docs/2.0/language-python/ for more details
-#
version: 2
jobs:
- toxify:
+ python3.6:
docker:
- - image: themattrix/tox
+ - image: python:3.6
steps:
- checkout
- - run:
- name: Run tests in supported Python versions
- command: |
- pip install tox tox-pyenv
- pyenv local 3.5.3 3.6.0 3.7.0 3.8.0
- tox
- build:
+ - run: |
+ pip install tox
+ tox
+ python3.7:
docker:
- - image: circleci/python:3.6.1
+ - image: python:3.7
steps:
- checkout
- # Download and cache dependencies
- - restore_cache:
- keys:
- - v1-dependencies-{{ checksum "requirements.txt" }}
- # fallback to using the latest cache if no exact match is found
- - v1-dependencies-
- - run:
- name: install dependencies
- command: |
- python3 -m venv venv
- . venv/bin/activate
- pip install -r requirements.txt
- pip install -r requirements-dev.txt
- - save_cache:
- paths:
- - ./venv
- key: v1-dependencies-{{ checksum "requirements.txt" }}
+ - run: |
+ pip install tox
+ tox
+ python3.8:
+ docker:
+ - image: python:3.8
+ steps:
+ - checkout
+ - run: |
+ pip install tox
+ tox
workflows:
version: 2
shublang:
jobs:
- - toxify
- - build
+ - python3.6
+ - python3.7
+ - python3.8
diff --git a/LICENSE b/LICENSE
new file mode 100644
index 0000000..7d27cbc
--- /dev/null
+++ b/LICENSE
@@ -0,0 +1,26 @@
+Copyright 2020 Scrapinghub
+
+Redistribution and use in source and binary forms, with or without modification,
+are permitted provided that the following conditions are met:
+
+1. Redistributions of source code must retain the above copyright notice,
+this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright notice,
+this list of conditions and the following disclaimer in the documentation and/or
+other materials provided with the distribution.
+
+3. Neither the name of the copyright holder nor the names of its contributors
+may be used to endorse or promote products derived from this software without
+specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR
+ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
+ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
\ No newline at end of file
diff --git a/requirements.txt b/requirements.txt
index 8fd2a16..f75d9d8 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -16,6 +16,7 @@ py==1.8.0
pyparsing==2.4.2
pytest==5.2.1
six==1.12.0
+unidecode==0.4.20
w3lib==1.21.0
wcwidth==0.1.7
zipp==0.6.0
diff --git a/setup.py b/setup.py
index 9159c4f..b40053d 100644
--- a/setup.py
+++ b/setup.py
@@ -4,13 +4,13 @@
setup(
name='shublang',
- version='0.1.2',
+ version='0.2.2',
license='BSD',
+ author='Akshay Philar',
+ author_email='akshayphilar@gmail.com',
description='Shublang - Data Extraction DSL',
- author='Akshay',
- author_email='akshay@scrapinghub.com',
+ url="https://github.com/scrapinghub/shublang",
packages=find_packages(exclude=exclude),
- #package_data={'shublang': ['*.py']},
include_package_data=True,
entry_points={
'console_scripts': [
@@ -24,11 +24,11 @@
'Operating System :: OS Independent',
'Programming Language :: Python',
'Programming Language :: Python :: 3',
- 'Programming Language :: Python :: 3.5',
'Programming Language :: Python :: 3.6',
'Programming Language :: Python :: 3.7',
'Programming Language :: Python :: 3.8',
],
+ python_requires='>=3.6',
install_requires=[
'pipe >= 1.5.0',
'jmespath >= 0.9.4',
@@ -36,5 +36,6 @@
'parsel >= 1.5.2',
'dateparser >= 0.7.2',
'price-parser >= 0.3.2',
+ 'unidecode >= 0.4.20'
]
)
diff --git a/shublang/shublang.py b/shublang/shublang.py
index 23a0852..01d98a2 100644
--- a/shublang/shublang.py
+++ b/shublang/shublang.py
@@ -10,6 +10,8 @@
import dateparser
from price_parser import Price
import types
+from unidecode import unidecode
+from urllib import parse
"""
Conventions
@@ -29,13 +31,110 @@
logging.basicConfig(format='%(levelname)s:%(message)s', level=logging.DEBUG)
+
+@Pipe
+def map_value(iterable, rules_dict):
+ """Maps an input text to an output according to the map settings
+ configured at rules_dict.
+
+ :param iterable: collection of data to transform
+ :type iterable: list
+
+ :param rules_dict: rules dictionary where the key should be the
+ exact text to look for and the value should be the desired output
+ :type rules_dict: dict
+
+ """
+
+ return (rules_dict.get(x, x) for x in iterable)
+
+
@Pipe
def sub(iterable, pattern, repl=None):
- if not repl:
- repl = ""
+ """Replaces a substring with another substring using regular expressions.
+
+ :param iterable: collection of data to transform
+ :type iterable: list
+
+ :param pattern: regular expression to match and be replaced
+ :type pattern: string
+
+ :param repl: (optional) the replacement substring
+ :type repl: string
+ """
+
+ repl = repl or ""
return (re.sub(pattern, repl, x) for x in iterable)
+@Pipe
+def replace(iterable, old, new, count=None):
+ """Replaces a substring with another substring.
+
+ :param iterable: collection of data to transform
+ :type iterable: list
+
+ :param old: substring to be replaced
+ :type old: string
+
+ :param new: the replacement substring
+ :type new: string
+
+ :param count: (optional) The first n substring occurrences to be replaced
+ :type count: int
+
+ NOTE: This doesn't support regular expressions which makes it safer and
+ easier. If you need regular expressions, use :func:`sub`, which supports
+ them.
+ """
+
+ if count:
+ return (x.replace(old, new, count) for x in iterable)
+ return (x.replace(old, new) for x in iterable)
+
+
+@Pipe
+def format(iterable, template):
+ """Formats an iterable using a given string template
+
+ :param iterable: collection of data to transform
+ :type iterable: list
+
+ :param template: format string applied to each item via ``str.format``
+ :type template: string
+ """
+ return (template.format(*x) for x in iterable)
+
+
+@Pipe
+def append(iterable, data):
+ """Appends data to the iterable.
+
+ :param iterable: collection of data to transform
+ :type iterable: list
+
+ :param data: any type of data to be appended
+ """
+
+ iterable.append(data)
+ return iterable
+
+
+@Pipe
+def extend(iterable, extension):
+ """Extends the iterable using another iterable.
+
+ :param iterable: collection of data to transform
+ :type iterable: list
+
+ :param extension: contains the additional iterable to extend the current one
+ :type extension: iterable
+ """
+
+ iterable.extend(extension)
+ return iterable
+
+
@Pipe
def encode(iterable, encoding, errors='ignore'):
return (x.encode(encoding, errors=errors) for x in iterable)
@@ -46,13 +145,54 @@ def decode(iterable, encoding):
return (x.decode(encoding) for x in iterable)
+@Pipe
+def find(iterable, sub, start=None, end=None):
+ """Returns the lowest index in the string where the sub is found.
+ If specified, the start and end params serve to slice the string
+ where sub should be searched.
+
+ :param iterable: collection of data to transform
+ :type iterable: list
+
+ :param sub: the substring to search for.
+ :type sub: string
+
+ :param start: (optional) where to start the search. Defaults to 0.
+ :type start: int
+
+ :param end: (optional) where to end the search. Defaults to the
+ end of the string.
+ :type end: int
+ """
+
+ return (x.find(sub, start, end) for x in iterable)
+
+
+@Pipe
+def split(iterable, sep, maxsplit=-1):
+ """Returns a list of words in the string, using sep as the delimiter.
+ If maxsplit is given, at most maxsplit splits are done.
+
+ :param iterable: collection of data to transform
+ :type iterable: list
+
+ :param sep: this is a delimiter. The string will be split by this separator.
+ :type sep: string
+
+ :param maxsplit: (optional) if given, there will be at most maxsplit splits.
+ :type maxsplit: int
+ """
+
+ return (x.split(sep, maxsplit) for x in iterable)
+
+
@Pipe
def sanitize(iterable):
# TODO change name and add other options
iterable = (x.strip() for x in iterable)
iterable = (re.sub(r'[\n\t\r\s]+', ' ', x) for x in iterable)
- iterable = (x.encode('ascii', errors='ignore').decode('ascii') for x in iterable)
+ iterable = (unidecode(x) for x in iterable)
iterable = (replace_entities(x) for x in iterable)
iterable = (remove_tags(x) for x in iterable)
return iterable
@@ -67,10 +207,12 @@ def xpath_getall(iterable, pred):
def xpath_get(iterable, pred):
return (Selector(x).xpath(pred).get() for x in iterable)
+
@Pipe
def css_getall(iterable, pred):
return (Selector(x).css(pred).getall() for x in iterable)
+
@Pipe
def css_get(iterable, pred):
return (Selector(x).css(pred).get() for x in iterable)
@@ -80,14 +222,17 @@ def css_get(iterable, pred):
def jmespath(iterable, query):
return (jp.search(query, x) for x in iterable)
+
@Pipe
def any(iterable):
return builtins.any(iterable)
+
@Pipe
def all(iterable):
return builtins.all(iterable)
+
@Pipe
def exists(iterable, pred):
if pred in iterable:
@@ -95,6 +240,7 @@ def exists(iterable, pred):
else:
return False
+
@Pipe
def none(iterable, pred):
if pred not in iterable:
@@ -102,14 +248,22 @@ def none(iterable, pred):
else:
return False
+
@Pipe
def length(iterable):
return len(iterable)
+
@Pipe
def bool(iterable):
return (builtins.bool(x) for x in iterable)
+
+@Pipe
+def str(iterable):
+ return (builtins.str(x) for x in iterable)
+
+
@Pipe
def float(iterable):
return (builtins.float(x) for x in iterable)
@@ -119,21 +273,26 @@ def float(iterable):
def int(iterable):
return (builtins.int(x) for x in iterable)
+
@Pipe
def abs(iterable):
return (builtins.abs(x) for x in iterable)
+
@Pipe
def ceil(iterable):
return (math.ceil(x) for x in iterable)
+
@Pipe
def round(iterable, pred):
return (builtins.round(x, pred) for x in iterable)
+
@Pipe
def join(iterable, separator=", "):
- return separator.join(builtins.map(str, iterable))
+ return separator.join(builtins.map(builtins.str, iterable))
+
@Pipe
def capitalize(iterable):
@@ -149,39 +308,88 @@ def isdigit(iterable):
def isdecimal(iterable):
return (x.isdecimal() for x in iterable)
+
@Pipe
def startswith(iterable, pred):
return (x.startswith(pred) for x in iterable)
+
@Pipe
def endswith(iterable, pred):
return (x.endswith(pred) for x in iterable)
+
@Pipe
def re_search(iterable, pattern):
#return (re.sub(pattern, repl, x) for x in iterable)
iterable = builtins.map(lambda x: re.search(pattern, x), iterable)
return (x.groups() if x else None for x in iterable)
+
@Pipe
def json_loads(iterable):
return (json.loads(x) for x in iterable)
+
@Pipe
def date_format(iterable, fmt):
return (dateparser.parse(item).strftime(fmt) for item in iterable)
+
@Pipe
def extract_price(iterable):
- return (str(Price.fromstring(item).amount) for item in iterable)
+ return (builtins.str(Price.fromstring(item).amount) for item in iterable)
+
@Pipe
def extract_currency(iterable):
return (Price.fromstring(item).currency for item in iterable)
+
+@Pipe
+def urljoin(iterable, base):
+ return (parse.urljoin(base, url) for url in iterable)
+
+@Pipe
+def identity(iterable, element):
+ """Return the element passed as parameter, ignoring the iterable."""
+ return (element)
+
+@Pipe
+def urlparse_netloc(iterable):
+ return (parse.urlparse(url).netloc for url in iterable)
+
+@Pipe
+def urlparse_params(iterable):
+ return (parse.urlparse(url).params for url in iterable)
+
+@Pipe
+def urlparse_path(iterable):
+ return (parse.urlparse(url).path for url in iterable)
+
+@Pipe
+def urlparse_query(iterable):
+ return (parse.urlparse(url).query for url in iterable)
+
+@Pipe
+def urlparse_scheme(iterable):
+ return (parse.urlparse(url).scheme for url in iterable)
+
+@Pipe
+def urlparse_fragment(iterable):
+ return (parse.urlparse(url).fragment for url in iterable)
+
+@Pipe
+def urlparse(iterable):
+ parsed_iterable = (parse.urlparse(url) for url in iterable)
+ parsed_iterable = ({"scheme": it.scheme, "netloc": it.netloc, "path": it.path,
+ "params": it.params, "query": it.query, "fragment": it.fragment} for it in parsed_iterable)
+ return parsed_iterable
+
filter = where
map = select
+
def evaluate(expression, data):
# TODO use StatementParser.is_safe before evaluating code.
# if StatementParser.is_safe(expression):
diff --git a/tests/test_functions.py b/tests/test_functions.py
index a52dc72..30f86e5 100644
--- a/tests/test_functions.py
+++ b/tests/test_functions.py
@@ -1,37 +1,281 @@
# TODO add tests for functions
+import pytest
from shublang import evaluate
-def test_sub():
- text = "Python,Haskell,Scala,Rust"
- assert evaluate('sub(",", " ")', data=[text]) == ["Python Haskell Scala Rust"]
-def test_sub_2():
- text = "Python,Haskell,Scala,Rust"
- assert evaluate('sub(",")', data=[text]) == ["PythonHaskellScalaRust"]
+@pytest.mark.parametrize(
+ "test_input,expected",
+ [
+ # if a mapping is not found, return the value itself
+ (
+ [
+ 'map_value({"This is foo": "foo", "This is bar": "bar"})',
+ ["This is foo", "This is not bar"]
+ ],
+ ['foo', 'This is not bar']
+ ),
+
+ # usual mapping
+ (
+ [
+ 'map_value({"1": "Available", "2": "Unavailable"})',
+ ['1', '2']
+ ],
+ ['Available', 'Unavailable']
+ ),
+
+ # map string to number
+ (
+ [
+ 'map_value({"InStock": 1, "OutOfStock": 2})',
+ ['OutOfStock', 'InStock']
+ ],
+ [2, 1]
+ ),
+
+ # map number to string
+ (
+ [
+ 'map_value({0: "Online", 1: "Offline"})',
+ [0, 1]
+ ],
+ ['Online', 'Offline']
+ )
+ ]
+)
+def test_map_value(test_input, expected):
+ assert evaluate(*test_input) == expected
+
+
+@pytest.mark.parametrize(
+ "test_input,expected",
+ [
+ (
+ ['str', [1, 2, 3]],
+ ['1', '2', '3']
+ ),
+ (
+ ['str', [1.1, 2.2, 3.3]],
+ ['1.1', '2.2', '3.3']
+ ),
+ (
+ ['str', ['1', '2', '3']],
+ ['1', '2', '3']
+ ),
+ ]
+)
+def test_str(test_input, expected):
+ assert evaluate(*test_input) == expected
+
+
+@pytest.mark.parametrize(
+ "test_input,expected",
+ [
+ (
+ ['sub(",", " ")', ['Python,Haskell,Scala,Rust']],
+ ['Python Haskell Scala Rust']
+ ),
+
+ # Optional 'repl' param should work.
+ (
+ ['sub(",")', ['Python,Haskell,Scala,Rust']],
+ ['PythonHaskellScalaRust']
+ ),
+
+ # Regular Expressions should work.
+ (
+ ['sub("b{2}(?:\\s+)", "xx ")', ['b bb bbb']],
+ ['b xx bbb']
+ ),
+ ]
+)
+def test_sub(test_input, expected):
+ assert evaluate(*test_input) == expected
+
+
+@pytest.mark.parametrize(
+ "test_input,expected",
+ [
+ (
+ ['replace("cool", "dope")', ['Pretty cool']],
+ ['Pretty dope']
+ ),
+
+ # Optional 'count' param should work on the first n patterns encountered.
+ (
+ ['replace("bb", "xx", 2)', ['bbb bbb bbb']],
+ ['xxb xxb bbb']
+ ),
+
+ # Regular expressions won't work on `replace`.
+ (
+ ['replace("t+", "xx")', ['Regex Attempt']],
+ ['Regex Attempt']
+ ),
+ ]
+)
+def test_replace(test_input, expected):
+ assert evaluate(*test_input) == expected
+
+
+@pytest.mark.parametrize(
+ "test_input,expected",
+ [
+ (
+ ['format("Now Playing: {} and {}")', [['Rick', 'Morty']]],
+ ['Now Playing: Rick and Morty']
+ ),
+ # Ordering should be respected
+ (
+ ['format("{2}, {1}, and {0}")', [['a', 'b', 'c']]],
+ ['c, b, and a']
+ ),
+ # Lists of lists are aggregated into a list
+ (
+ ['format("{} and some value {}")', [[1, 2], ['x', 'y']]],
+ ['1 and some value 2', 'x and some value y']
+ ),
+ # Args could be repeated
+ (
+ ['format("{0}--{0}-{1}!")', [['Re', 'Remix']]],
+ ['Re--Re-Remix!']
+ ),
+ # Standard Formatting should work
+ (
+ ['format("{:.2f}")', [[7/3]]],
+ ['2.33']
+ ),
+ ]
+)
+def test_format(test_input, expected):
+ assert evaluate(*test_input) == expected
+
+
+@pytest.mark.parametrize(
+ "test_input,expected",
+ [
+ (
+ ['append("new thing")', ['A', 'B']],
+ ['A', 'B', 'new thing']
+ ),
+ # list could be added as a single item
+ (
+ ['append([1, 2, "3"])', ['A', 'B']],
+ ['A', 'B', [1, 2, '3']]
+ ),
+ ]
+)
+def test_append(test_input, expected):
+ assert evaluate(*test_input) == expected
+
+
+@pytest.mark.parametrize(
+ "test_input,expected",
+ [
+ (
+ ['extend([1, 2, "3"])', ['A', 'B']],
+ ['A', 'B', 1, 2, '3']
+ ),
+
+ # generators will also work
+ (
+ ['extend(range(3, 6))', ['A', 'B']],
+ ['A', 'B', 3, 4, 5]
+ ),
+
+ # single strings are treated as iterables
+ (
+ ['extend("new")', ['A', 'B']],
+ ['A', 'B', 'n', 'e', 'w']
+ ),
+ ]
+)
+def test_extend(test_input, expected):
+ assert evaluate(*test_input) == expected
+
+
+def test_extend_with_non_iterable():
+ """It should raise a TypeError."""
+
+ with pytest.raises(TypeError):
+ evaluate("extend(123)", ['A', 'B'])
+
def test_encode():
text = "ἀἐἠἰὀὐὠὰᾀᾐ"
assert evaluate('encode("UTF8")', data=[text]) ==\
- [b'\xe1\xbc\x80\xe1\xbc\x90\xe1\xbc\xa0\xe1\xbc\xb0\xe1\xbd\x80\xe1\xbd\x90\xe1\xbd\xa0\xe1\xbd\xb0\xe1' \
+ [b'\xe1\xbc\x80\xe1\xbc\x90\xe1\xbc\xa0\xe1\xbc\xb0\xe1\xbd\x80\xe1\xbd\x90\xe1\xbd\xa0\xe1\xbd\xb0\xe1'
b'\xbe\x80\xe1\xbe\x90']
+
def test_decode():
text = b'\xe1\xbc\x80\xe1\xbc\x90\xe1\xbc\xa0\xe1\xbc\xb0\xe1\xbd\x80\xe1\xbd\x90\xe1\xbd\xa0\xe1\xbd\xb0\xe1' \
b'\xbe\x80\xe1\xbe\x90'
assert evaluate('decode("UTF8")', data=[text]) == ["ἀἐἠἰὀὐὠὰᾀᾐ"]
+
+@pytest.mark.parametrize(
+ "test_input,expected",
+ [
+ # should search the entire string
+ (
+ ['find("th")', ['Python']],
+ [2]
+ ),
+
+ # should respect where the search starts
+ (
+ ['find("th", 3)', ['Python']],
+ [-1]
+ ),
+
+ # should respect where the search ends
+ (
+ ['find("th", 0, 1)', ['Python']],
+ [-1]
+ ),
+ ]
+)
+def test_find(test_input, expected):
+ assert evaluate(*test_input) == expected
+
+
+@pytest.mark.parametrize(
+ "test_input,expected",
+ [
+ (
+ ['split(",")', ['Python,Haskell,Scala,Rust']],
+ [['Python', 'Haskell', 'Scala', 'Rust']]
+ ),
+
+ # maxsplit should limit the number of separations
+ (
+ ['split(",", 2)', ['Python,Haskell,Scala,Rust']],
+ [['Python', 'Haskell', 'Scala,Rust']]
+ ),
+ ]
+)
+def test_split(test_input, expected):
+ assert evaluate(*test_input) == expected
+
+
def test_sanitize():
text = ["Python \t\t\t\t",
"
Haskell",
" Rust"]
assert evaluate("sanitize", data=text) == ["Python", "Haskell", "Rust"]
+def test_sanitize_1():
+ text = [u"Checking unicode ko\u017eu\u0161\u010dek \t\t\t\t"]
+ assert evaluate("sanitize", data=text) == ["Checking unicode kozuscek"]
+
def test_xpath_getall():
html = '