From f7a6c6579b5de85a953d20c28f2ef4f0be95106e Mon Sep 17 00:00:00 2001 From: Kevin Lloyd Bernal Date: Thu, 6 Aug 2020 13:57:55 +0800 Subject: [PATCH 01/31] add new 'replace' functionality --- shublang/shublang.py | 23 +++++++++++++++++++++++ tests/test_functions.py | 27 +++++++++++++++++++++++++++ 2 files changed, 50 insertions(+) diff --git a/shublang/shublang.py b/shublang/shublang.py index 23a0852..b6741f2 100644 --- a/shublang/shublang.py +++ b/shublang/shublang.py @@ -36,6 +36,29 @@ def sub(iterable, pattern, repl=None): return (re.sub(pattern, repl, x) for x in iterable) +@Pipe +def replace(iterable, old, new, count=None): + """Replaces a substring with another substring. + + :param old: substring to be replaced + :type old: string + + :param new: the replacement substring + :type new: string + + :param count: (optional) The first n substring occurrences to be replaced + :type count: int + + NOTE: This doesn't support regular expressions which makes it safer and + easier. If you need regular expressions, make use :func:`sub` which supports + it. + """ + + if count: + return (x.replace(old, new, count) for x in iterable) + return (x.replace(old, new) for x in iterable) + + @Pipe def encode(iterable, encoding, errors='ignore'): return (x.encode(encoding, errors=errors) for x in iterable) diff --git a/tests/test_functions.py b/tests/test_functions.py index a52dc72..8a3d16b 100644 --- a/tests/test_functions.py +++ b/tests/test_functions.py @@ -1,5 +1,6 @@ # TODO add tests for functions +import pytest from shublang import evaluate def test_sub(): @@ -10,6 +11,32 @@ def test_sub_2(): text = "Python,Haskell,Scala,Rust" assert evaluate('sub(",")', data=[text]) == ["PythonHaskellScalaRust"] + +@pytest.mark.parametrize( + "test_input,expected", + [ + ( + ['replace("cool", "dope")', ['Pretty cool']], + ['Pretty dope'] + ), + + # Optional count param should work on the first n patterns encountered. + ( + ['replace("bb", "xx", 2)', ['bbb bbb bbb']], + ['xxb xxb bbb'] + ), + + # Regular expressions won't work on `replace`. + ( + ['replace("t+", "xx")', ['Regex Attempt']], + ['Regex Attempt'] + ), + ] +) +def test_replace(test_input, expected): + assert evaluate(*test_input) == expected + + def test_encode(): text = "ἀἐἠἰὀὐὠὰᾀᾐ" assert evaluate('encode("UTF8")', data=[text]) ==\ From a7de3410fa2a5f788b679a1c4dbc09d2b9e1da6e Mon Sep 17 00:00:00 2001 From: Kevin Lloyd Bernal Date: Thu, 6 Aug 2020 14:10:07 +0800 Subject: [PATCH 02/31] update 'sub' with tests and docs to differentiate it with 'replace' --- shublang/shublang.py | 19 +++++++++++++++++-- tests/test_functions.py | 31 ++++++++++++++++++++++++------- 2 files changed, 41 insertions(+), 9 deletions(-) diff --git a/shublang/shublang.py b/shublang/shublang.py index b6741f2..365c38e 100644 --- a/shublang/shublang.py +++ b/shublang/shublang.py @@ -29,10 +29,22 @@ logging.basicConfig(format='%(levelname)s:%(message)s', level=logging.DEBUG) + @Pipe def sub(iterable, pattern, repl=None): - if not repl: - repl = "" + """Replaces a substring with another substring using regular expressions. + + :param iterable: collection of data to transform + :type iterable: list + + :param pattern: regular expression to match and be replaced + :type pattern: string + + :param repl: (optional) the replacement substring + :type rep: string + """ + + repl = repl or "" return (re.sub(pattern, repl, x) for x in iterable) @@ -40,6 +52,9 @@ def sub(iterable, pattern, repl=None): def replace(iterable, old, new, count=None): """Replaces a substring with another substring. + :param iterable: collection of data to transform + :type iterable: list + :param old: substring to be replaced :type old: string diff --git a/tests/test_functions.py b/tests/test_functions.py index 8a3d16b..1003de5 100644 --- a/tests/test_functions.py +++ b/tests/test_functions.py @@ -3,13 +3,30 @@ import pytest from shublang import evaluate -def test_sub(): - text = "Python,Haskell,Scala,Rust" - assert evaluate('sub(",", " ")', data=[text]) == ["Python Haskell Scala Rust"] -def test_sub_2(): - text = "Python,Haskell,Scala,Rust" - assert evaluate('sub(",")', data=[text]) == ["PythonHaskellScalaRust"] +@pytest.mark.parametrize( + "test_input,expected", + [ + ( + ['sub(",", " ")', ['Python,Haskell,Scala,Rust']], + ['Python Haskell Scala Rust'] + ), + + # Optional 'repl' param should work. + ( + ['sub(",")', ['Python,Haskell,Scala,Rust']], + ['PythonHaskellScalaRust'] + ), + + # Regular Expressions should work. + ( + ['sub("b{2}(?:\s+)", "xx ")', ['b bb bbb']], + ['b xx bbb'] + ), + ] +) +def test_sub(test_input, expected): + assert evaluate(*test_input) == expected @pytest.mark.parametrize( @@ -20,7 +37,7 @@ def test_sub_2(): ['Pretty dope'] ), - # Optional count param should work on the first n patterns encountered. + # Optional 'count' param should work on the first n patterns encountered. ( ['replace("bb", "xx", 2)', ['bbb bbb bbb']], ['xxb xxb bbb'] From 2752e998e2e50c5d191d5432105b21e1a4e7b3d3 Mon Sep 17 00:00:00 2001 From: Renan Cunha Date: Thu, 6 Aug 2020 09:33:33 -0300 Subject: [PATCH 03/31] Add new 'split' functionality --- shublang/shublang.py | 19 +++++++++++++++++++ tests/test_functions.py | 21 +++++++++++++++++++++ 2 files changed, 40 insertions(+) diff --git a/shublang/shublang.py b/shublang/shublang.py index 23a0852..4f9a53d 100644 --- a/shublang/shublang.py +++ b/shublang/shublang.py @@ -46,6 +46,25 @@ def decode(iterable, encoding): return (x.decode(encoding) for x in iterable) +@Pipe +def split(iterable, sep, maxsplit=-1): + """Returns a list of words in the string, using sep as the delimiter. + If maxsplit is given, at most maxsplit splits are done. + + :param iterable: collection of data to transform + :type iterable: list + + :param sep: this is a delimiter. The string will be split by this separator. + :type sep: string + + :param maxsplit: (optional) if given, there will be at most maxsplit splits. + :type maxsplit: int + """ + + + return (x.split(sep, maxsplit) for x in iterable) + + @Pipe def sanitize(iterable): # TODO change name and add other options diff --git a/tests/test_functions.py b/tests/test_functions.py index a52dc72..a0c019a 100644 --- a/tests/test_functions.py +++ b/tests/test_functions.py @@ -1,5 +1,6 @@ # TODO add tests for functions +import pytest from shublang import evaluate def test_sub(): @@ -21,6 +22,26 @@ def test_decode(): b'\xbe\x80\xe1\xbe\x90' assert evaluate('decode("UTF8")', data=[text]) == ["ἀἐἠἰὀὐὠὰᾀᾐ"] + +@pytest.mark.parametrize( + "test_input,expected", + [ + ( + ['split(",")', ['Python,Haskell,Scala,Rust']], + [['Python', 'Haskell', 'Scala', 'Rust']] + ), + + # maxsplit should limit the number of separations + ( + ['split(",", 2)', ['Python,Haskell,Scala,Rust']], + [['Python', 'Haskell', 'Scala,Rust']] + ), + ] +) +def test_split(test_input, expected): + assert evaluate(*test_input) == expected + + def test_sanitize(): text = ["Python \t\t\t\t", "
Haskell", From 8cff274802490127fd2b2959f06e65253ce38ae8 Mon Sep 17 00:00:00 2001 From: Renan Cunha Date: Thu, 6 Aug 2020 15:53:40 -0300 Subject: [PATCH 04/31] Add new 'find' functionality --- shublang/shublang.py | 24 ++++++++++++++++++++++++ tests/test_functions.py | 26 ++++++++++++++++++++++++++ 2 files changed, 50 insertions(+) diff --git a/shublang/shublang.py b/shublang/shublang.py index 23a0852..7e21c1d 100644 --- a/shublang/shublang.py +++ b/shublang/shublang.py @@ -46,6 +46,30 @@ def decode(iterable, encoding): return (x.decode(encoding) for x in iterable) +@Pipe +def find(iterable, sub, start=None, end=None): + """Returns the lowest index in the string where the sub is found. + If specified, the start and end params serve to slice the string + where sub should be searched. + + :param iterable: collection of data to transform + :type iterable: list + + :param sub: the substring to search for. + :type sub: string + + :param start: (optional) where to start the search. Default to 0. + :type start: int + + :param end: (optional) where to end the search. Default to the + end of the string. + :type end: int + """ + + + return (x.find(sub, start, end) for x in iterable) + + @Pipe def sanitize(iterable): # TODO change name and add other options diff --git a/tests/test_functions.py b/tests/test_functions.py index a52dc72..b0b739f 100644 --- a/tests/test_functions.py +++ b/tests/test_functions.py @@ -1,5 +1,6 @@ # TODO add tests for functions +import pytest from shublang import evaluate def test_sub(): @@ -21,6 +22,31 @@ def test_decode(): b'\xbe\x80\xe1\xbe\x90' assert evaluate('decode("UTF8")', data=[text]) == ["ἀἐἠἰὀὐὠὰᾀᾐ"] +@pytest.mark.parametrize( + "test_input,expected", + [ + # should find at the entire string + ( + ['find("th")', ['Python']], + [2] + ), + + # should respect where the search starts + ( + ['find("th", 3)', ['Python']], + [-1] + ), + + # should respect where the search ends + ( + ['find("th", 0, 1)', ['Python']], + [-1] + ), + ] +) +def test_find(test_input, expected): + assert evaluate(*test_input) == expected + def test_sanitize(): text = ["Python \t\t\t\t", "
Haskell", From 313dd67bc3286381fc8a648b51f5fb19d9bbd131 Mon Sep 17 00:00:00 2001 From: Kevin Lloyd Bernal Date: Fri, 7 Aug 2020 12:46:24 +0800 Subject: [PATCH 05/31] add new 'format' functionality --- shublang/shublang.py | 13 +++++++++++++ tests/test_functions.py | 37 +++++++++++++++++++++++++++++++++++++ 2 files changed, 50 insertions(+) diff --git a/shublang/shublang.py b/shublang/shublang.py index 365c38e..3e39792 100644 --- a/shublang/shublang.py +++ b/shublang/shublang.py @@ -74,6 +74,19 @@ def replace(iterable, old, new, count=None): return (x.replace(old, new) for x in iterable) +@Pipe +def format(iterable, template): + """Formats an iterable using a given string template + + :param iterable: collection of data to transform + :type iterable: list + + :param template: substring to be replaced + :type template: string + """ + return (template.format(*x) for x in iterable) + + @Pipe def encode(iterable, encoding, errors='ignore'): return (x.encode(encoding, errors=errors) for x in iterable) diff --git a/tests/test_functions.py b/tests/test_functions.py index 1003de5..efb87c0 100644 --- a/tests/test_functions.py +++ b/tests/test_functions.py @@ -54,6 +54,43 @@ def test_replace(test_input, expected): assert evaluate(*test_input) == expected +@pytest.mark.parametrize( + "test_input,expected", + [ + ( + ['format("Now Playing: {} and {}")', [['Rick', 'Morty']]], + ['Now Playing: Rick and Morty'] + ), + + # Ordering should be respected + ( + ['format("{2}, {1}, and {0}")', [['a', 'b', 'c']]], + ['c, b, and a'] + ), + + # Lists of lists are aggregated into a list + ( + ['format("{} and some value {}")', [[1, 2], ['x', 'y']]], + ['1 and some value 2', 'x and some value y'] + ), + + # Args could be repeated + ( + ['format("{0}--{0}-{1}!")', [['Re', 'Remix']]], + ['Re--Re-Remix!'] + ), + + # Standard Formatting should work + ( + ['format("{:.2f}")', [[7/3]]], + ['2.33'] + ), + ] +) +def test_format(test_input, expected): + assert evaluate(*test_input) == expected + + def test_encode(): text = "ἀἐἠἰὀὐὠὰᾀᾐ" assert evaluate('encode("UTF8")', data=[text]) ==\ From b4fbef0e277049f7e4979875c50ca25b8d3204f1 Mon Sep 17 00:00:00 2001 From: Kevin Lloyd Bernal Date: Thu, 9 Apr 2020 12:43:50 +0800 Subject: [PATCH 06/31] Add new 'append' functionality --- shublang/shublang.py | 29 ++++++++++++++++++++++ tests/test_functions.py | 54 +++++++++++++++++++++++++++++++++++++---- 2 files changed, 78 insertions(+), 5 deletions(-) diff --git a/shublang/shublang.py b/shublang/shublang.py index 532bf91..38e9edd 100644 --- a/shublang/shublang.py +++ b/shublang/shublang.py @@ -87,6 +87,35 @@ def format(iterable, template): return (template.format(*x) for x in iterable) +@Pipe +def append(iterable, data): + """Appends data to the iterable. + + :param iterable: collection of data to transform + :type iterable: list + + :param data: any type of data to be appended + """ + + iterable.append(data) + return iterable + + +@Pipe +def extend(iterable, extension): + """Extends the iterable using another iterable. + + :param iterable: collection of data to transform + :type iterable: list + + :param extension: contains the additional iterable to extend the current one + :param extension: iterable + """ + + iterable.extend(extension) + return iterable + + @Pipe def encode(iterable, encoding, errors='ignore'): return (x.encode(encoding, errors=errors) for x in iterable) diff --git a/tests/test_functions.py b/tests/test_functions.py index 245ebff..8999817 100644 --- a/tests/test_functions.py +++ b/tests/test_functions.py @@ -61,25 +61,21 @@ def test_replace(test_input, expected): ['format("Now Playing: {} and {}")', [['Rick', 'Morty']]], ['Now Playing: Rick and Morty'] ), - # Ordering should be respected ( ['format("{2}, {1}, and {0}")', [['a', 'b', 'c']]], ['c, b, and a'] ), - # Lists of lists are aggregated into a list ( ['format("{} and some value {}")', [[1, 2], ['x', 'y']]], ['1 and some value 2', 'x and some value y'] ), - # Args could be repeated ( ['format("{0}--{0}-{1}!")', [['Re', 'Remix']]], ['Re--Re-Remix!'] ), - # Standard Formatting should work ( ['format("{:.2f}")', [[7/3]]], @@ -91,6 +87,54 @@ def test_format(test_input, expected): assert evaluate(*test_input) == expected +@pytest.mark.parametrize( + "test_input,expected", + [ + ( + ['append("new thing")', ['A', 'B']], + ['A', 'B', 'new thing'] + ), + # list could be added as a single item + ( + ['append([1, 2, "3"])', ['A', 'B']], + ['A', 'B', [1, 2, '3']] + ), + ] +) +def test_append(test_input, expected): + assert evaluate(*test_input) == expected + + +@pytest.mark.parametrize( + "test_input,expected", + [ + ( + ['extend([1, 2, "3"])', ['A', 'B']], + ['A', 'B', 1, 2, '3'] + ), + # generators will also work + ( + ['extend(range(3, 6))', ['A', 'B']], + ['A', 'B', 3, 4, 5] + ), + # single strings are treated as iterables + ( + ['extend("new")', ['A', 'B']], + ['A', 'B', 'n', 'e', 'w'] + ), + ] +) +def test_extend(test_input, expected): + assert evaluate(*test_input) == expected + + +def test_extend_with_non_iterable(): + """It should raise a TypeError.""" + + with pytest.raises(TypeError): + evaluate("extend(123)", ['A', 'B']) + + def test_encode(): text = "ἀἐἠἰὀὐὠὰᾀᾐ" assert evaluate('encode("UTF8")', data=[text]) ==\ @@ -110,7 +154,7 @@ def test_decode(): ['split(",")', ['Python,Haskell,Scala,Rust']], [['Python', 'Haskell', 'Scala', 'Rust']] ), - + # maxsplit should limit the number of separations ( ['split(",", 2)', ['Python,Haskell,Scala,Rust']], From a59a0d2615018cb5f165b4a84bf1e1b48cdb3243 Mon Sep 17 00:00:00 2001 From: Kevin Lloyd Bernal Date: Mon, 10 Aug 2020 12:52:19 +0800 Subject: [PATCH 07/31] Add new 'extend' functionality --- tests/test_functions.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tests/test_functions.py b/tests/test_functions.py index 8999817..0a9bb2e 100644 --- a/tests/test_functions.py +++ b/tests/test_functions.py @@ -112,11 +112,13 @@ def test_append(test_input, expected): ['extend([1, 2, "3"])', ['A', 'B']], ['A', 'B', 1, 2, '3'] ), + # generators will also work ( ['extend(range(3, 6))', ['A', 'B']], ['A', 'B', 3, 4, 5] ), + # single strings are treated as iterables ( ['extend("new")', ['A', 'B']], From 875126ae0a79c375a179e65cad8200c5d1069b77 Mon Sep 17 00:00:00 2001 From: Renan Cunha Date: Mon, 24 Aug 2020 13:38:54 -0300 Subject: [PATCH 08/31] add new 'str' functionality --- shublang/shublang.py | 6 +++++- tests/test_functions.py | 21 +++++++++++++++++++++ 2 files changed, 26 insertions(+), 1 deletion(-) diff --git a/shublang/shublang.py b/shublang/shublang.py index a15eb21..32f6b5a 100644 --- a/shublang/shublang.py +++ b/shublang/shublang.py @@ -231,6 +231,10 @@ def length(iterable): def bool(iterable): return (builtins.bool(x) for x in iterable) +@Pipe +def str(iterable): + return (builtins.str(x) for x in iterable) + @Pipe def float(iterable): return (builtins.float(x) for x in iterable) @@ -294,7 +298,7 @@ def date_format(iterable, fmt): @Pipe def extract_price(iterable): - return (str(Price.fromstring(item).amount) for item in iterable) + return (builtins.str(Price.fromstring(item).amount) for item in iterable) @Pipe def extract_currency(iterable): diff --git a/tests/test_functions.py b/tests/test_functions.py index fbfec7a..416dd8d 100644 --- a/tests/test_functions.py +++ b/tests/test_functions.py @@ -4,6 +4,27 @@ from shublang import evaluate +@pytest.mark.parametrize( + "test_input,expected", + [ + ( + ['str', [1, 2, 3]], + ['1', '2', '3'] + ), + ( + ['str', [1.1, 2.2, 3.3]], + ['1.1', '2.2', '3.3'] + ), + ( + ['str', ['1', '2', '3']], + ['1', '2', '3'] + ), + ] +) +def test_str(test_input, expected): + assert evaluate(*test_input) == expected + + @pytest.mark.parametrize( "test_input,expected", [ From da9be872f8b3950df389877a6482d6dd244978ee Mon Sep 17 00:00:00 2001 From: Akshay Philar Date: Fri, 28 Aug 2020 08:11:06 +0530 Subject: [PATCH 09/31] flak8 linting --- shublang/shublang.py | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) diff --git a/shublang/shublang.py b/shublang/shublang.py index 32f6b5a..eaf14e8 100644 --- a/shublang/shublang.py +++ b/shublang/shublang.py @@ -188,10 +188,12 @@ def xpath_getall(iterable, pred): def xpath_get(iterable, pred): return (Selector(x).xpath(pred).get() for x in iterable) + @Pipe def css_getall(iterable, pred): return (Selector(x).css(pred).getall() for x in iterable) + @Pipe def css_get(iterable, pred): return (Selector(x).css(pred).get() for x in iterable) @@ -201,14 +203,17 @@ def css_get(iterable, pred): def jmespath(iterable, query): return (jp.search(query, x) for x in iterable) + @Pipe def any(iterable): return builtins.any(iterable) + @Pipe def all(iterable): return builtins.all(iterable) + @Pipe def exists(iterable, pred): if pred in iterable: @@ -216,6 +221,7 @@ def exists(iterable, pred): else: return False + @Pipe def none(iterable, pred): if pred not in iterable: @@ -223,18 +229,22 @@ def none(iterable, pred): else: return False + @Pipe def length(iterable): return len(iterable) + @Pipe def bool(iterable): return (builtins.bool(x) for x in iterable) + @Pipe def str(iterable): return (builtins.str(x) for x in iterable) + @Pipe def float(iterable): return (builtins.float(x) for x in iterable) @@ -244,22 +254,27 @@ def float(iterable): def int(iterable): return (builtins.int(x) for x in iterable) + @Pipe def abs(iterable): return (builtins.abs(x) for x in iterable) + @Pipe def ceil(iterable): return (math.ceil(x) for x in iterable) + @Pipe def round(iterable, pred): return (builtins.round(x, pred) for x in iterable) + @Pipe def join(iterable, separator=", "): return separator.join(builtins.map(str, iterable)) + @Pipe def capitalize(iterable): return (x.capitalize() for x in iterable) @@ -274,39 +289,48 @@ def isdigit(iterable): def isdecimal(iterable): return (x.isdecimal() for x in iterable) + @Pipe def startswith(iterable, pred): return (x.startswith(pred) for x in iterable) + @Pipe def endswith(iterable, pred): return (x.endswith(pred) for x in iterable) + @Pipe def re_search(iterable, pattern): #return (re.sub(pattern, repl, x) for x in iterable) iterable = builtins.map(lambda x: re.search(pattern, x), iterable) return (x.groups() if x else None for x in iterable) + @Pipe def json_loads(iterable): return (json.loads(x) for x in iterable) + @Pipe def date_format(iterable, fmt): return (dateparser.parse(item).strftime(fmt) for item in iterable) + @Pipe def extract_price(iterable): return (builtins.str(Price.fromstring(item).amount) for item in iterable) + @Pipe def extract_currency(iterable): return (Price.fromstring(item).currency for item in iterable) + filter = where map = select + def evaluate(expression, data): # TODO use StatementParser.is_safe before evaluating code. # if StatementParser.is_safe(expression): From aa6a082818b8c5fce739c4ed239395b5413f2690 Mon Sep 17 00:00:00 2001 From: Akshay Philar Date: Fri, 28 Aug 2020 08:14:17 +0530 Subject: [PATCH 10/31] renamed str to string --- shublang/shublang.py | 2 +- tests/test_functions.py | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/shublang/shublang.py b/shublang/shublang.py index eaf14e8..74ec431 100644 --- a/shublang/shublang.py +++ b/shublang/shublang.py @@ -241,7 +241,7 @@ def bool(iterable): @Pipe -def str(iterable): +def string(iterable): return (builtins.str(x) for x in iterable) diff --git a/tests/test_functions.py b/tests/test_functions.py index 416dd8d..de90efb 100644 --- a/tests/test_functions.py +++ b/tests/test_functions.py @@ -8,15 +8,15 @@ "test_input,expected", [ ( - ['str', [1, 2, 3]], + ['string', [1, 2, 3]], ['1', '2', '3'] ), ( - ['str', [1.1, 2.2, 3.3]], + ['string', [1.1, 2.2, 3.3]], ['1.1', '2.2', '3.3'] ), ( - ['str', ['1', '2', '3']], + ['string', ['1', '2', '3']], ['1', '2', '3'] ), ] From f6d606daf25fa45b2b4d6e9aefdeedaf6eb46737 Mon Sep 17 00:00:00 2001 From: Akshay Philar Date: Fri, 28 Aug 2020 08:22:00 +0530 Subject: [PATCH 11/31] fixing join function --- shublang/shublang.py | 4 ++-- tests/test_functions.py | 28 +++++++++++++++++++++++++--- 2 files changed, 27 insertions(+), 5 deletions(-) diff --git a/shublang/shublang.py b/shublang/shublang.py index 74ec431..1217f38 100644 --- a/shublang/shublang.py +++ b/shublang/shublang.py @@ -241,7 +241,7 @@ def bool(iterable): @Pipe -def string(iterable): +def str(iterable): return (builtins.str(x) for x in iterable) @@ -272,7 +272,7 @@ def round(iterable, pred): @Pipe def join(iterable, separator=", "): - return separator.join(builtins.map(str, iterable)) + return separator.join(builtins.map(builtins.str, iterable)) @Pipe diff --git a/tests/test_functions.py b/tests/test_functions.py index de90efb..30d2f49 100644 --- a/tests/test_functions.py +++ b/tests/test_functions.py @@ -8,15 +8,15 @@ "test_input,expected", [ ( - ['string', [1, 2, 3]], + ['str', [1, 2, 3]], ['1', '2', '3'] ), ( - ['string', [1.1, 2.2, 3.3]], + ['str', [1.1, 2.2, 3.3]], ['1.1', '2.2', '3.3'] ), ( - ['string', ['1', '2', '3']], + ['str', ['1', '2', '3']], ['1', '2', '3'] ), ] @@ -324,3 +324,25 @@ def test_currency_1(): def test_currency_2(): assert evaluate('extract_currency', data=['$1,199.00']) == ['$'] + + +@pytest.mark.parametrize( + "test_input,expected", + [ + ( + ['join("")', ['A', 'B']], + 'AB' + ), + ( + ['join("")', ("A", "B")], + 'AB' + ), + ( + ['join("")', (1, 2)], + '12' + ), + + ] +) +def test_join(test_input, expected): + assert evaluate(*test_input) == expected From da72822828642195e778574a61d2faa6e1e467a9 Mon Sep 17 00:00:00 2001 From: Akshay Philar Date: Fri, 28 Aug 2020 08:25:07 +0530 Subject: [PATCH 12/31] fixing join function --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index a55f8ce..2e72da2 100644 --- a/setup.py +++ b/setup.py @@ -4,7 +4,7 @@ setup( name='shublang', - version='0.1.2', + version='0.2.0', license='BSD', description='Shublang - Data Extraction DSL', author='Akshay', From 55937a36797e2b82b89896a4afc6ca977b0546f4 Mon Sep 17 00:00:00 2001 From: Akshay Philar Date: Sat, 29 Aug 2020 20:41:27 +0530 Subject: [PATCH 13/31] added urljoin pipe function --- setup.py | 3 +- shublang/shublang.py | 6 ++++ tests/test_functions.py | 76 +++++++++++++++++++++++++++++++++++------ 3 files changed, 72 insertions(+), 13 deletions(-) diff --git a/setup.py b/setup.py index 2e72da2..5b4d56d 100644 --- a/setup.py +++ b/setup.py @@ -4,13 +4,12 @@ setup( name='shublang', - version='0.2.0', + version='0.2.1', license='BSD', description='Shublang - Data Extraction DSL', author='Akshay', author_email='akshay@scrapinghub.com', packages=find_packages(exclude=exclude), - #package_data={'shublang': ['*.py']}, include_package_data=True, entry_points={ 'console_scripts': [ diff --git a/shublang/shublang.py b/shublang/shublang.py index 1217f38..330670c 100644 --- a/shublang/shublang.py +++ b/shublang/shublang.py @@ -10,6 +10,7 @@ import dateparser from price_parser import Price import types +from urllib import parse """ Conventions @@ -327,6 +328,11 @@ def extract_currency(iterable): return (Price.fromstring(item).currency for item in iterable) +@Pipe +def urljoin(iterable, base): + return (parse.urljoin(base, url) for url in iterable) + + filter = where map = select diff --git a/tests/test_functions.py b/tests/test_functions.py index 30d2f49..d18252a 100644 --- a/tests/test_functions.py +++ b/tests/test_functions.py @@ -41,7 +41,7 @@ def test_str(test_input, expected): # Regular Expressions should work. ( - ['sub("b{2}(?:\s+)", "xx ")', ['b bb bbb']], + ['sub("b{2}(?:\\s+)", "xx ")', ['b bb bbb']], ['b xx bbb'] ), ] @@ -161,14 +161,16 @@ def test_extend_with_non_iterable(): def test_encode(): text = "ἀἐἠἰὀὐὠὰᾀᾐ" assert evaluate('encode("UTF8")', data=[text]) ==\ - [b'\xe1\xbc\x80\xe1\xbc\x90\xe1\xbc\xa0\xe1\xbc\xb0\xe1\xbd\x80\xe1\xbd\x90\xe1\xbd\xa0\xe1\xbd\xb0\xe1' \ + [b'\xe1\xbc\x80\xe1\xbc\x90\xe1\xbc\xa0\xe1\xbc\xb0\xe1\xbd\x80\xe1\xbd\x90\xe1\xbd\xa0\xe1\xbd\xb0\xe1' b'\xbe\x80\xe1\xbe\x90'] + def test_decode(): text = b'\xe1\xbc\x80\xe1\xbc\x90\xe1\xbc\xa0\xe1\xbc\xb0\xe1\xbd\x80\xe1\xbd\x90\xe1\xbd\xa0\xe1\xbd\xb0\xe1' \ b'\xbe\x80\xe1\xbe\x90' assert evaluate('decode("UTF8")', data=[text]) == ["ἀἐἠἰὀὐὠὰᾀᾐ"] + @pytest.mark.parametrize( "test_input,expected", [ @@ -194,6 +196,7 @@ def test_decode(): def test_find(test_input, expected): assert evaluate(*test_input) == expected + @pytest.mark.parametrize( "test_input,expected", [ @@ -212,17 +215,20 @@ def test_find(test_input, expected): def test_split(test_input, expected): assert evaluate(*test_input) == expected + def test_sanitize(): text = ["Python \t\t\t\t", "
Haskell", " Rust"] assert evaluate("sanitize", data=text) == ["Python", "Haskell", "Rust"] + def test_xpath_getall(): html = '
    • Skoda
      Vauxhall
      Peugot
  • ' assert evaluate(f'xpath_getall(\'//li[@class="results"]/ul/text()\')', data=[html]) == \ [["Skoda", "Vauxhall", "Peugot"]] + def test_xpath_get(): html = '
    • Skoda
      Vauxhall
      Peugot
  • ' assert evaluate(f'xpath_get(\'//li[@class="results"]/ul/text()\')', data=[html]) == ["Skoda"] @@ -233,57 +239,70 @@ def test_css_getall(): assert evaluate(f'css_getall("li.results>ul::text")', data=[html]) == \ [["Skoda", "Vauxhall", "Peugot"]] + def test_css_get(): html = '
    • Skoda
      Vauxhall
      Peugot
  • ' assert evaluate(f'css_get("li.results>ul::text")', data=[html]) == ["Skoda"] + def test_any(): - assert evaluate('any', [1, False, 2]) == True - assert evaluate('any', [0, False, None]) == False + assert evaluate('any', [1, False, 2]) is True + assert evaluate('any', [0, False, None]) is False + def test_all(): - assert evaluate('all', [1, False, []]) == False - assert evaluate('all', [1, True, [1]]) == True + assert evaluate('all', [1, False, []]) is False + assert evaluate('all', [1, True, [1]]) is True def test_exists(): - assert evaluate('exists(20)', [20, 0, 1]) == True - assert evaluate('exists(0)', [1, True, [1]]) == False + assert evaluate('exists(20)', [20, 0, 1]) is True + assert evaluate('exists(0)', [1, True, [1]]) is False def test_none(): - assert evaluate('none(20)', [20, 0, 1]) == False - assert evaluate('none(0)', [1, True, [1]]) == True + assert evaluate('none(20)', [20, 0, 1]) is False + assert evaluate('none(0)', [1, True, [1]]) is True + def test_length(): assert evaluate('length', [20, 0, 1]) == 3 assert evaluate('length', "length") == 6 + def test_bool(): assert list(evaluate('bool', [0, [], ''])) == [False, False, False] assert list(evaluate('bool', [1, [1], ''])) == [True, True, False] + def test_float(): assert evaluate('float', [20, 1, 2]) == [20.0, 1.0, 2.0] + def test_float_2(): assert evaluate('float', ["20", "1", "2"]) == [20.0, 1.0, 2.0] + def test_int(): assert evaluate('int', [20.0, 1.0, 2.0]) == [20, 1, 2] + def test_int_2(): assert evaluate('int', ["20", "1", "2"]) == [20, 1, 2] + def test_abs(): assert evaluate('abs', [-1, -2, 3]) == [1, 2, 3] + def test_ceil(): assert evaluate('ceil', [1.5, -2.2, 3.1]) == [2, -2, 4] + def test_round(): assert list(evaluate('round(2)', [1.221123])) == [1.22] + def test_filter(): assert list(evaluate('filter(lambda x: x>1)', [0, 1, 2])) == [2] @@ -292,36 +311,46 @@ def test_startswith(): assert list(evaluate('startswith("a")', ["andrew", "alex", "akshay"])) == [True, True, True] assert list(evaluate('startswith("b")', ["ian"])) == [False] + def test_endswith(): assert list(evaluate('endswith("a")', ["andrew", "alex", "akshay"])) == [False, False, False] assert list(evaluate('endswith("b")', ["Rob"])) == [True] + def test_re_search(): text = "Expected Price: $1233" assert evaluate('re_search(r"(\\d+)")', data=[text]) == [('1233', )] + def test_json_loads(): json_data = '{"results":["Skoda", "Peugot", "Vauxhall"]}' - assert evaluate("json_loads", data=[json_data]) == [{"results":["Skoda", "Peugot", "Vauxhall"]}] + assert evaluate("json_loads", data=[json_data]) == [{"results": ["Skoda", "Peugot", "Vauxhall"]}] + def test_date_format(): assert evaluate('date_format("%Y-%m-%d")|first', data=['15th August 2016']) == '2016-08-15' + def test_price_1(): assert evaluate('extract_price', data=['22,90 €']) == ['22.90'] + def test_price_2(): assert evaluate('extract_price', data=['$1,199.00']) == ['1199.00'] + def test_price_3(): assert evaluate('extract_price', data=['$12']) == ['12'] + def test_price_4(): assert evaluate('extract_price', data=['12.000,95']) == ['12000.95'] + def test_currency_1(): assert evaluate('extract_currency', data=['22,90 €']) == ['€'] + def test_currency_2(): assert evaluate('extract_currency', data=['$1,199.00']) == ['$'] @@ -346,3 +375,28 @@ def test_currency_2(): ) def test_join(test_input, expected): assert evaluate(*test_input) == expected + + +@pytest.mark.parametrize( + "test_input,expected", + [ + ( + ['urljoin("http://scrapinghub.com/products.html")', ['autoextract.html', 'uncork.html', 'crawlera.html']], + [ + 'http://scrapinghub.com/autoextract.html', + 'http://scrapinghub.com/uncork.html', + 'http://scrapinghub.com/crawlera.html', + ] + ), + # If url is an absolute URL, the url’s host name and/or scheme will be present in the result + ( + ['urljoin("http://www.scrapinghub.com")', + ['//doc.scrapinghub.com/unified_schema.html#operation/product']], + [ + 'http://doc.scrapinghub.com/unified_schema.html#operation/product', + ] + ), + ] +) +def test_urljoin(test_input, expected): + assert evaluate(*test_input) == expected From 01597cd60e0f42b1223a17719f4da555bb05bf71 Mon Sep 17 00:00:00 2001 From: Akshay Philar Date: Sat, 29 Aug 2020 23:57:30 +0530 Subject: [PATCH 14/31] updated setup.py --- setup.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/setup.py b/setup.py index 5b4d56d..cb659e6 100644 --- a/setup.py +++ b/setup.py @@ -6,9 +6,10 @@ name='shublang', version='0.2.1', license='BSD', + author='Akshay Philar', + author_email='akshayphilar@gmail.com', description='Shublang - Data Extraction DSL', - author='Akshay', - author_email='akshay@scrapinghub.com', + url="https://github.com/scrapinghub/shublang", packages=find_packages(exclude=exclude), include_package_data=True, entry_points={ @@ -27,6 +28,7 @@ 'Programming Language :: Python :: 3.6', 'Programming Language :: Python :: 3.7', ], + python_requires='>=3.6', install_requires=[ 'pipe >= 1.5.0', 'jmespath >= 0.9.4', From ee59e09660121efcd9f0ef67ec084d8dd9f06ab7 Mon Sep 17 00:00:00 2001 From: Akshay Philar Date: Sat, 29 Aug 2020 23:59:01 +0530 Subject: [PATCH 15/31] added license --- LICENSE | 26 ++++++++++++++++++++++++++ 1 file changed, 26 insertions(+) create mode 100644 LICENSE diff --git a/LICENSE b/LICENSE new file mode 100644 index 0000000..7d27cbc --- /dev/null +++ b/LICENSE @@ -0,0 +1,26 @@ +Copyright 2020 Scrapinghub + +Redistribution and use in source and binary forms, with or without modification, +are permitted provided that the following conditions are met: + +1. Redistributions of source code must retain the above copyright notice, +this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright notice, +this list of conditions and the following disclaimer in the documentation and/or +other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its contributors +may be used to endorse or promote products derived from this software without +specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR +ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON +ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. \ No newline at end of file From 57e68595be3e5ebc3d1f92f842e250840d6f8609 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?V=C3=ADctor=20Ruiz?= Date: Fri, 4 Sep 2020 16:12:09 +0200 Subject: [PATCH 16/31] Added Identity function --- shublang/shublang.py | 7 ++++++- tests/test_functions.py | 24 ++++++++++++++++++++++++ 2 files changed, 30 insertions(+), 1 deletion(-) diff --git a/shublang/shublang.py b/shublang/shublang.py index 330670c..a3d8243 100644 --- a/shublang/shublang.py +++ b/shublang/shublang.py @@ -130,7 +130,7 @@ def decode(iterable, encoding): @Pipe def find(iterable, sub, start=None, end=None): """Returns the lowest index in the string where the sub is found. - If specified, the start and end params serve to slice the string + If specified, the start and end params serve to slice the string where sub should be searched. :param iterable: collection of data to transform @@ -332,6 +332,11 @@ def extract_currency(iterable): def urljoin(iterable, base): return (parse.urljoin(base, url) for url in iterable) +@Pipe +def identity(iterable, element): + """ Return the same element is passed as parameter.""" + return element + filter = where map = select diff --git a/tests/test_functions.py b/tests/test_functions.py index d18252a..6b2795e 100644 --- a/tests/test_functions.py +++ b/tests/test_functions.py @@ -400,3 +400,27 @@ def test_join(test_input, expected): ) def test_urljoin(test_input, expected): assert evaluate(*test_input) == expected + +@pytest.mark.parametrize( + "test_input,expected", + [ + ( + ['identity(True)', + [10, 'far', ['boo', 3]]], + True, + ), + ( + ['identity("InStock")', + ["In Stock.", "Only 3 in Stock", "Stock Ok"]], + "InStock", + ), + ( + ['identity((1,2,3,4,5))', + "foo"], + (1,2,3,4,5), + ), + ] +) + +def test_identity(test_input, expected): + assert evaluate(*test_input) == expected From 55f129a052a44a68c7b0d0aff1f853ee44e7f652 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?V=C3=ADctor=20Ruiz?= Date: Mon, 7 Sep 2020 08:51:15 +0200 Subject: [PATCH 17/31] Update shublang/shublang.py Co-authored-by: Akshay Philar --- shublang/shublang.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/shublang/shublang.py b/shublang/shublang.py index a3d8243..481db71 100644 --- a/shublang/shublang.py +++ b/shublang/shublang.py @@ -335,7 +335,7 @@ def urljoin(iterable, base): @Pipe def identity(iterable, element): """ Return the same element is passed as parameter.""" - return element + return (element) filter = where From 859b0d92c054514464e746fdda2b556c953ce2b4 Mon Sep 17 00:00:00 2001 From: Akshay Philar Date: Mon, 7 Sep 2020 13:21:48 +0530 Subject: [PATCH 18/31] adding bumpversion config --- .bumpversion.cfg | 6 ++++++ 1 file changed, 6 insertions(+) create mode 100644 .bumpversion.cfg diff --git a/.bumpversion.cfg b/.bumpversion.cfg new file mode 100644 index 0000000..e2ae8a6 --- /dev/null +++ b/.bumpversion.cfg @@ -0,0 +1,6 @@ +[bumpversion] +current_version = 0.2.1 +commit = True +tag = True + +[bumpversion:file:setup.py] \ No newline at end of file From a22ba9e9ed42570dde23a51977cf9777bc2e5cf8 Mon Sep 17 00:00:00 2001 From: Akshay Philar Date: Mon, 7 Sep 2020 13:21:52 +0530 Subject: [PATCH 19/31] =?UTF-8?q?Bump=20version:=200.2.1=20=E2=86=92=200.2?= =?UTF-8?q?.2?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .bumpversion.cfg | 4 ++-- setup.py | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/.bumpversion.cfg b/.bumpversion.cfg index e2ae8a6..6ef1061 100644 --- a/.bumpversion.cfg +++ b/.bumpversion.cfg @@ -1,6 +1,6 @@ [bumpversion] -current_version = 0.2.1 +current_version = 0.2.2 commit = True tag = True -[bumpversion:file:setup.py] \ No newline at end of file +[bumpversion:file:setup.py] diff --git a/setup.py b/setup.py index cb659e6..550766b 100644 --- a/setup.py +++ b/setup.py @@ -4,7 +4,7 @@ setup( name='shublang', - version='0.2.1', + version='0.2.2', license='BSD', author='Akshay Philar', author_email='akshayphilar@gmail.com', From 668bdeeb8c420af342ddb4d30b8f5b44772d0f93 Mon Sep 17 00:00:00 2001 From: Renan Cunha Date: Tue, 15 Sep 2020 09:14:01 -0300 Subject: [PATCH 20/31] add new 'map_value' feature --- shublang/shublang.py | 17 +++++++++++++++++ tests/test_functions.py | 37 +++++++++++++++++++++++++++++++++++++ 2 files changed, 54 insertions(+) diff --git a/shublang/shublang.py b/shublang/shublang.py index 481db71..2c86ca6 100644 --- a/shublang/shublang.py +++ b/shublang/shublang.py @@ -31,6 +31,23 @@ logging.basicConfig(format='%(levelname)s:%(message)s', level=logging.DEBUG) +@Pipe +def map_value(iterable, rules_dict): + """Maps an input value to an output according to the map settings + configured at rules_dict. + + :param iterable: collection of data to transform + :type iterable: list + + :param rules_dict: rules dictionary where the key should be the + exactly pattern to look for and the value should be the desired output + :type rules_dict: dict + + """ + + return (rules_dict.get(x, x) for x in iterable) + + @Pipe def sub(iterable, pattern, repl=None): """Replaces a substring with another substring using regular expressions. diff --git a/tests/test_functions.py b/tests/test_functions.py index 6b2795e..3c55144 100644 --- a/tests/test_functions.py +++ b/tests/test_functions.py @@ -4,6 +4,43 @@ from shublang import evaluate +@pytest.mark.parametrize( + "test_input,expected", + [ + ( + [ + 'map_value({"This is foo": "foo", "This is bar": "bar"})', + ["This is foo", "This is not bar"] + ], + ['foo', 'This is not bar'] + ), + ( + [ + 'map_value({"1": "Available", "2": "Unavailable"})', + ['1', '2'] + ], + ['Available', 'Unavailable'] + ), + ( + [ + 'map_value({"InStock": "1", "OutOfStock": "2"})', + ['OutOfStock', 'InStock'] + ], + ['2', '1'] + ), + ( + [ + 'map_value({1: "Online", 2: "Offline"})', + [1, 2] + ], + ['Online', 'Offline'] + ) + ] +) +def test_map_value(test_input, expected): + assert evaluate(*test_input) == expected + + @pytest.mark.parametrize( "test_input,expected", [ From 9c1c3b421b2a4982d1c271c959a8d22a86cc8295 Mon Sep 17 00:00:00 2001 From: sagar arora Date: Fri, 25 Sep 2020 16:13:57 +0530 Subject: [PATCH 21/31] Added support for urlparse functions --- shublang/shublang.py | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/shublang/shublang.py b/shublang/shublang.py index 481db71..bf01919 100644 --- a/shublang/shublang.py +++ b/shublang/shublang.py @@ -11,6 +11,7 @@ from price_parser import Price import types from urllib import parse +from urllib.parse import urlparse """ Conventions @@ -337,6 +338,25 @@ def identity(iterable, element): """ Return the same element is passed as parameter.""" return (element) +@Pipe +def urlparse_netloc(iterable): + return (urlparse(url).netloc for url in iterable) + +@Pipe +def urlparse_params(iterable): + return (urlparse(url).params for url in iterable) + +@Pipe +def urlparse_path(iterable): + return (urlparse(url).path for url in iterable) + +@Pipe +def urlparse_query(iterable): + return (urlparse(url).query for url in iterable) + +@Pipe +def urlparse_scheme(iterable): + return (urlparse(url).scheme for url in iterable) filter = where map = select From c625b58c36198c2bf75a5daa580eb925f0dc467d Mon Sep 17 00:00:00 2001 From: Renan Cunha Date: Tue, 29 Sep 2020 14:33:11 -0300 Subject: [PATCH 22/31] fix typos --- shublang/shublang.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/shublang/shublang.py b/shublang/shublang.py index 2c86ca6..010f0da 100644 --- a/shublang/shublang.py +++ b/shublang/shublang.py @@ -33,14 +33,14 @@ @Pipe def map_value(iterable, rules_dict): - """Maps an input value to an output according to the map settings + """Maps an input text to an output according to the map settings configured at rules_dict. :param iterable: collection of data to transform :type iterable: list :param rules_dict: rules dictionary where the key should be the - exactly pattern to look for and the value should be the desired output + exactly text to look for and the value should be the desired output :type rules_dict: dict """ From e10b4d245fd9bb10de342ca6825e4b5314db0d67 Mon Sep 17 00:00:00 2001 From: Renan Cunha Date: Tue, 29 Sep 2020 14:38:16 -0300 Subject: [PATCH 23/31] improvements on map_value unit tests --- tests/test_functions.py | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) diff --git a/tests/test_functions.py b/tests/test_functions.py index 3c55144..7df8f77 100644 --- a/tests/test_functions.py +++ b/tests/test_functions.py @@ -7,6 +7,7 @@ @pytest.mark.parametrize( "test_input,expected", [ + # if a mapping is not found, return the value itself ( [ 'map_value({"This is foo": "foo", "This is bar": "bar"})', @@ -14,6 +15,8 @@ ], ['foo', 'This is not bar'] ), + + # usual mapping ( [ 'map_value({"1": "Available", "2": "Unavailable"})', @@ -21,17 +24,21 @@ ], ['Available', 'Unavailable'] ), + + # map string to number ( [ - 'map_value({"InStock": "1", "OutOfStock": "2"})', + 'map_value({"InStock": 1, "OutOfStock": 2})', ['OutOfStock', 'InStock'] ], - ['2', '1'] + [2, 1] ), + + # map number to string ( [ - 'map_value({1: "Online", 2: "Offline"})', - [1, 2] + 'map_value({0: "Online", 1: "Offline"})', + [0, 1] ], ['Online', 'Offline'] ) From 4553e14fafdbab540cfb70c1e170c25535bc2952 Mon Sep 17 00:00:00 2001 From: sagar arora Date: Fri, 2 Oct 2020 15:47:36 +0530 Subject: [PATCH 24/31] Added urlparse method to return dict & added test cases --- shublang/shublang.py | 22 ++++++++++++++++------ tests/test_functions.py | 30 ++++++++++++++++++++++++++++++ 2 files changed, 46 insertions(+), 6 deletions(-) diff --git a/shublang/shublang.py b/shublang/shublang.py index bf01919..ea13b74 100644 --- a/shublang/shublang.py +++ b/shublang/shublang.py @@ -11,7 +11,6 @@ from price_parser import Price import types from urllib import parse -from urllib.parse import urlparse """ Conventions @@ -340,23 +339,34 @@ def identity(iterable, element): @Pipe def urlparse_netloc(iterable): - return (urlparse(url).netloc for url in iterable) + return (parse.urlparse(url).netloc for url in iterable) @Pipe def urlparse_params(iterable): - return (urlparse(url).params for url in iterable) + return (parse.urlparse(url).params for url in iterable) @Pipe def urlparse_path(iterable): - return (urlparse(url).path for url in iterable) + return (parse.urlparse(url).path for url in iterable) @Pipe def urlparse_query(iterable): - return (urlparse(url).query for url in iterable) + return (parse.urlparse(url).query for url in iterable) @Pipe def urlparse_scheme(iterable): - return (urlparse(url).scheme for url in iterable) + return (parse.urlparse(url).scheme for url in iterable) + +@Pipe +def urlparse_fragment(iterable): + return (parse.urlparse(url).fragment for url in iterable) + +@Pipe +def urlparse(iterable): + parsed_iterable = (parse.urlparse(url) for url in iterable) + parsed_iterable = ({"scheme": it.scheme, "netloc": it.netloc, "path": it.path, + "params": it.params, "query": it.query, "fragment": it.fragment} for it in parsed_iterable) + return parsed_iterable filter = where map = select diff --git a/tests/test_functions.py b/tests/test_functions.py index 6b2795e..e336114 100644 --- a/tests/test_functions.py +++ b/tests/test_functions.py @@ -355,6 +355,36 @@ def test_currency_2(): assert evaluate('extract_currency', data=['$1,199.00']) == ['$'] +def test_urlparse_netloc(): + url = "https://scrapinghub.bamboohr.com/employees/performance/index.php?page=2107&subpage=1#123" + assert evaluate('urlparse_netloc', data=[url]) == ['scrapinghub.bamboohr.com'] + + +def test_urlparse_path(): + url = "https://scrapinghub.bamboohr.com/employees/performance/index.php?page=2107&subpage=1#123" + assert evaluate('urlparse_path', data=[url]) == ['/employees/performance/index.php'] + + +def test_urlparse_query(): + url = "https://scrapinghub.bamboohr.com/employees/performance/index.php?page=2107&subpage=1#123" + assert evaluate('urlparse_query', data=[url]) == ['page=2107&subpage=1'] + + +def test_urlparse_scheme(): + url = "https://scrapinghub.bamboohr.com/employees/performance/index.php?page=2107&subpage=1#123" + assert evaluate('urlparse_scheme', data=[url]) == ['https'] + + +def test_urlparse_fragment(): + url = "https://scrapinghub.bamboohr.com/employees/performance/index.php?page=2107&subpage=1#123" + assert evaluate('urlparse_fragment', data=[url]) == ['123'] + + +def test_urlparse(): + url = "https://scrapinghub.bamboohr.com/employees/performance/index.php?page=2107&subpage=1#123" + assert list(evaluate('urlparse', data=[url])[0].keys()) == ['scheme', 'netloc', 'path', 'params', 'query', 'fragment'] + + @pytest.mark.parametrize( "test_input,expected", [ From 02edc0788a051727416deae5e02683b1c4a73080 Mon Sep 17 00:00:00 2001 From: sagar arora Date: Fri, 2 Oct 2020 16:47:30 +0530 Subject: [PATCH 25/31] introduced unidecode while doing encoding-decoding in sanitize --- requirements.txt | 3 ++- shublang/shublang.py | 3 ++- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/requirements.txt b/requirements.txt index 2865f3c..ca84cae 100644 --- a/requirements.txt +++ b/requirements.txt @@ -18,4 +18,5 @@ pytest==5.2.1 six==1.12.0 w3lib==1.21.0 wcwidth==0.1.7 -zipp==0.6.0 \ No newline at end of file +zipp==0.6.0 +unidecode==0.4.20 diff --git a/shublang/shublang.py b/shublang/shublang.py index 010f0da..ac222e8 100644 --- a/shublang/shublang.py +++ b/shublang/shublang.py @@ -11,6 +11,7 @@ from price_parser import Price import types from urllib import parse +from unidecode import unidecode """ Conventions @@ -191,7 +192,7 @@ def sanitize(iterable): iterable = (x.strip() for x in iterable) iterable = (re.sub(r'[\n\t\r\s]+', ' ', x) for x in iterable) - iterable = (x.encode('ascii', errors='ignore').decode('ascii') for x in iterable) + iterable = (unidecode(x) for x in iterable) iterable = (replace_entities(x) for x in iterable) iterable = (remove_tags(x) for x in iterable) return iterable From e74d195d1192b9e1438bbef15aa9ed755465b7c6 Mon Sep 17 00:00:00 2001 From: sagar arora Date: Mon, 5 Oct 2020 20:20:58 +0530 Subject: [PATCH 26/31] added test case and re-order imports --- requirements.txt | 4 ++-- setup.py | 1 + shublang/shublang.py | 2 +- tests/test_functions.py | 3 +++ 4 files changed, 7 insertions(+), 3 deletions(-) diff --git a/requirements.txt b/requirements.txt index ca84cae..f6d9787 100644 --- a/requirements.txt +++ b/requirements.txt @@ -16,7 +16,7 @@ py==1.8.0 pyparsing==2.4.2 pytest==5.2.1 six==1.12.0 +unidecode==0.4.20 w3lib==1.21.0 wcwidth==0.1.7 -zipp==0.6.0 -unidecode==0.4.20 +zipp==0.6.0 \ No newline at end of file diff --git a/setup.py b/setup.py index 550766b..2c78758 100644 --- a/setup.py +++ b/setup.py @@ -36,5 +36,6 @@ 'parsel >= 1.5.2', 'dateparser >= 0.7.2', 'price-parser >= 0.3.2', + 'unidecode >= 0.4.20' ] ) \ No newline at end of file diff --git a/shublang/shublang.py b/shublang/shublang.py index ac222e8..6b2f977 100644 --- a/shublang/shublang.py +++ b/shublang/shublang.py @@ -10,8 +10,8 @@ import dateparser from price_parser import Price import types -from urllib import parse from unidecode import unidecode +from urllib import parse """ Conventions diff --git a/tests/test_functions.py b/tests/test_functions.py index 7df8f77..dc1c14c 100644 --- a/tests/test_functions.py +++ b/tests/test_functions.py @@ -266,6 +266,9 @@ def test_sanitize(): " Rust"] assert evaluate("sanitize", data=text) == ["Python", "Haskell", "Rust"] +def test_sanitize_1(): + text = [u"Checking unicode ko\u017eu\u0161\u010dek \t\t\t\t"] + assert evaluate("sanitize", data=text) == ["Checking unicode kozuscek"] def test_xpath_getall(): html = '
    • Skoda
      Vauxhall
      Peugot
  • ' From 338e0bdd124bafa38d8348c8ad5a2f0d8192a626 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adri=C3=A1n=20Chaves?= Date: Wed, 24 Feb 2021 11:44:36 +0100 Subject: [PATCH 27/31] Refactor tox.ini to ease local runs --- tox.ini | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/tox.ini b/tox.ini index f08b092..75d1088 100644 --- a/tox.ini +++ b/tox.ini @@ -1,6 +1,5 @@ [tox] -envlist = py35, py36, py37, py38 -skip_missing_interpreters = True +envlist = py [testenv] commands = pytest tests/ From 7813175a2f885d567f20efd37319f6e65aedb94a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adri=C3=A1n=20Chaves?= Date: Wed, 24 Feb 2021 11:50:59 +0100 Subject: [PATCH 28/31] Create separate CircleCI jobs for separate Python version --- .circleci/config.yml | 57 ++++++++++++++++++++------------------------ 1 file changed, 26 insertions(+), 31 deletions(-) diff --git a/.circleci/config.yml b/.circleci/config.yml index 67fdae6..48f5bf0 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -1,42 +1,37 @@ -# Python CircleCI 2.0 configuration file -# -# Check https://circleci.com/docs/2.0/language-python/ for more details -# version: 2 jobs: - toxify: + python3.5: docker: - - image: themattrix/tox + - image: python:3.5 steps: - checkout - - run: - name: Run tests in supported Python versions - command: | - pip install tox tox-pyenv - pyenv local 3.5.3 3.6.0 3.7.0 3.8.0 - tox - build: + - run: | + pip install tox + tox + python3.6: docker: - - image: circleci/python:3.6.1 + - image: python:3.6 steps: - checkout - # Download and cache dependencies - - restore_cache: - keys: - - v1-dependencies-{{ checksum "requirements.txt" }} - # fallback to using the latest cache if no exact match is found - - v1-dependencies- - - run: - name: install dependencies - command: | - python3 -m venv venv - . venv/bin/activate - pip install -r requirements.txt - pip install -r requirements-dev.txt - - save_cache: - paths: - - ./venv - key: v1-dependencies-{{ checksum "requirements.txt" }} + - run: | + pip install tox + tox + python3.7: + docker: + - image: python:3.7 + steps: + - checkout + - run: | + pip install tox + tox + python3.8: + docker: + - image: python:3.8 + steps: + - checkout + - run: | + pip install tox + tox workflows: version: 2 From ab729a90b7225d3c009522ece128e51671b4cb85 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adri=C3=A1n=20Chaves?= Date: Wed, 24 Feb 2021 11:54:35 +0100 Subject: [PATCH 29/31] Fix broken references in the CircleCI config file --- .circleci/config.yml | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/.circleci/config.yml b/.circleci/config.yml index 48f5bf0..3ab7596 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -38,5 +38,7 @@ workflows: shublang: jobs: - - toxify - - build + - python3.5 + - python3.6 + - python3.7 + - python3.8 From a2adc3165e09ddfa308bb8b6535350de253ab816 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adri=C3=A1n=20Chaves?= Date: Wed, 24 Feb 2021 12:06:19 +0100 Subject: [PATCH 30/31] Remove unsupported Python 3.5 from CircleCI jobs --- .circleci/config.yml | 8 -------- setup.py | 1 - 2 files changed, 9 deletions(-) diff --git a/.circleci/config.yml b/.circleci/config.yml index 3ab7596..66fef35 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -1,13 +1,5 @@ version: 2 jobs: - python3.5: - docker: - - image: python:3.5 - steps: - - checkout - - run: | - pip install tox - tox python3.6: docker: - image: python:3.6 diff --git a/setup.py b/setup.py index 613f5fd..b40053d 100644 --- a/setup.py +++ b/setup.py @@ -24,7 +24,6 @@ 'Operating System :: OS Independent', 'Programming Language :: Python', 'Programming Language :: Python :: 3', - 'Programming Language :: Python :: 3.5', 'Programming Language :: Python :: 3.6', 'Programming Language :: Python :: 3.7', 'Programming Language :: Python :: 3.8', From d78c59ed380f59fa3974ae3ecbeabd1e16156435 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adri=C3=A1n=20Chaves?= Date: Wed, 24 Feb 2021 12:07:23 +0100 Subject: [PATCH 31/31] Fix broken references in the CircleCI config file --- .circleci/config.yml | 1 - 1 file changed, 1 deletion(-) diff --git a/.circleci/config.yml b/.circleci/config.yml index 66fef35..c598217 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -30,7 +30,6 @@ workflows: shublang: jobs: - - python3.5 - python3.6 - python3.7 - python3.8