From 9c1c3b421b2a4982d1c271c959a8d22a86cc8295 Mon Sep 17 00:00:00 2001 From: sagar arora Date: Fri, 25 Sep 2020 16:13:57 +0530 Subject: [PATCH 1/2] Added support for urlparse functions --- shublang/shublang.py | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/shublang/shublang.py b/shublang/shublang.py index 481db71..bf01919 100644 --- a/shublang/shublang.py +++ b/shublang/shublang.py @@ -11,6 +11,7 @@ from price_parser import Price import types from urllib import parse +from urllib.parse import urlparse """ Conventions @@ -337,6 +338,25 @@ def identity(iterable, element): """ Return the same element is passed as parameter.""" return (element) +@Pipe +def urlparse_netloc(iterable): + return (urlparse(url).netloc for url in iterable) + +@Pipe +def urlparse_params(iterable): + return (urlparse(url).params for url in iterable) + +@Pipe +def urlparse_path(iterable): + return (urlparse(url).path for url in iterable) + +@Pipe +def urlparse_query(iterable): + return (urlparse(url).query for url in iterable) + +@Pipe +def urlparse_scheme(iterable): + return (urlparse(url).scheme for url in iterable) filter = where map = select From 4553e14fafdbab540cfb70c1e170c25535bc2952 Mon Sep 17 00:00:00 2001 From: sagar arora Date: Fri, 2 Oct 2020 15:47:36 +0530 Subject: [PATCH 2/2] Added urlparse method to return dict & added test cases --- shublang/shublang.py | 22 ++++++++++++++++------ tests/test_functions.py | 30 ++++++++++++++++++++++++++++++ 2 files changed, 46 insertions(+), 6 deletions(-) diff --git a/shublang/shublang.py b/shublang/shublang.py index bf01919..ea13b74 100644 --- a/shublang/shublang.py +++ b/shublang/shublang.py @@ -11,7 +11,6 @@ from price_parser import Price import types from urllib import parse -from urllib.parse import urlparse """ Conventions @@ -340,23 +339,34 @@ def identity(iterable, element): @Pipe def urlparse_netloc(iterable): - return (urlparse(url).netloc for url in iterable) + return (parse.urlparse(url).netloc for url in iterable) @Pipe def urlparse_params(iterable): - return (urlparse(url).params for url in iterable) + return (parse.urlparse(url).params for url in iterable) @Pipe def urlparse_path(iterable): - return (urlparse(url).path for url in iterable) + return (parse.urlparse(url).path for url in iterable) @Pipe def urlparse_query(iterable): - return (urlparse(url).query for url in iterable) + return (parse.urlparse(url).query for url in iterable) @Pipe def urlparse_scheme(iterable): - return (urlparse(url).scheme for url in iterable) + return (parse.urlparse(url).scheme for url in iterable) + +@Pipe +def urlparse_fragment(iterable): + return (parse.urlparse(url).fragment for url in iterable) + +@Pipe +def urlparse(iterable): + parsed_iterable = (parse.urlparse(url) for url in iterable) + parsed_iterable = ({"scheme": it.scheme, "netloc": it.netloc, "path": it.path, + "params": it.params, "query": it.query, "fragment": it.fragment} for it in parsed_iterable) + return parsed_iterable filter = where map = select diff --git a/tests/test_functions.py b/tests/test_functions.py index 6b2795e..e336114 100644 --- a/tests/test_functions.py +++ b/tests/test_functions.py @@ -355,6 +355,36 @@ def test_currency_2(): assert evaluate('extract_currency', data=['$1,199.00']) == ['$'] +def test_urlparse_netloc(): + url = "https://scrapinghub.bamboohr.com/employees/performance/index.php?page=2107&subpage=1#123" + assert evaluate('urlparse_netloc', data=[url]) == ['scrapinghub.bamboohr.com'] + + +def test_urlparse_path(): + url = "https://scrapinghub.bamboohr.com/employees/performance/index.php?page=2107&subpage=1#123" + assert evaluate('urlparse_path', data=[url]) == ['/employees/performance/index.php'] + + +def test_urlparse_query(): + url = "https://scrapinghub.bamboohr.com/employees/performance/index.php?page=2107&subpage=1#123" + assert evaluate('urlparse_query', data=[url]) == ['page=2107&subpage=1'] + + +def test_urlparse_scheme(): + url = "https://scrapinghub.bamboohr.com/employees/performance/index.php?page=2107&subpage=1#123" + assert evaluate('urlparse_scheme', data=[url]) == ['https'] + + +def test_urlparse_fragment(): + url = "https://scrapinghub.bamboohr.com/employees/performance/index.php?page=2107&subpage=1#123" + assert evaluate('urlparse_fragment', data=[url]) == ['123'] + + +def test_urlparse(): + url = "https://scrapinghub.bamboohr.com/employees/performance/index.php?page=2107&subpage=1#123" + assert list(evaluate('urlparse', data=[url])[0].keys()) == ['scheme', 'netloc', 'path', 'params', 'query', 'fragment'] + + @pytest.mark.parametrize( "test_input,expected", [