Skip to content
This repository has been archived by the owner on Mar 2, 2022. It is now read-only.

Commit

Permalink
[FIX]: Parsing tweaks.
Browse files Browse the repository at this point in the history
  • Loading branch information
Jason Yip committed Sep 23, 2021
1 parent aa5a526 commit 27bfb03
Show file tree
Hide file tree
Showing 5 changed files with 44 additions and 14 deletions.
2 changes: 1 addition & 1 deletion VERSION
Original file line number Diff line number Diff line change
@@ -1 +1 @@
2.1.3
2.1.4
9 changes: 8 additions & 1 deletion musescore_scraper/MuseScraper.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,8 @@
import warnings
from operator import itemgetter

from .helper import _valid_url




Expand Down Expand Up @@ -118,7 +120,6 @@ async def get_score_tags() -> str:
"Keywords": await get_score_tags(),
}

# svgs = await page.evaluate(bytes(get_data("musescore_scraper", "script.js"), "utf-8"))
svgs: List[str] = await page.evaluate(str(get_data("musescore_scraper",
"script.js",
), "utf-8"))
Expand Down Expand Up @@ -261,6 +262,9 @@ async def to_pdf(
:rtype: Output destination as ``pathlib.Path`` object.
May or may not differ depending on the output argument.
"""
if not _valid_url(url):
raise TypeError("Invalid URL.")

return self._convert(output, await asyncio.wait_for(
self._pyppeteer_main(url), self.timeout
))
Expand Down Expand Up @@ -319,6 +323,9 @@ def to_pdf(
:rtype: Output destination as ``pathlib.Path`` object.
May or may not differ depending on the output argument.
"""
if not _valid_url(url):
raise TypeError("Invalid URL.")

return self._convert(output, asyncio.get_event_loop().run_until_complete(
asyncio.wait_for(self._pyppeteer_main(url), self.timeout)
))
24 changes: 15 additions & 9 deletions musescore_scraper/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,15 +3,17 @@
import argparse

from pathlib import Path
from urllib.parse import urlparse
from urllib.parse import urlparse, urljoin
from typing import Optional, Union, List
from .MuseScraper import MuseScraper, AsyncMuseScraper
from .helper import _valid_url
import asyncio
from functools import partial

def _url_parse(url: str) -> str:
    """argparse ``type=`` hook: validate *url* and return it unchanged.

    :param url: candidate MuseScore score URL from the command line.
    :raises argparse.ArgumentTypeError: if *url* is not a well-formed URL,
        so argparse reports a clean usage error instead of a traceback.
    """
    if not _valid_url(url):
        raise argparse.ArgumentTypeError("Invalid URL.")
    return url

def _debug_path(path: str) -> Union[Path, str]:
return Path(path) if path else path
Expand All @@ -20,12 +22,15 @@ def _main(args: Union[None, List[str], str] = None) -> None:

parser = argparse.ArgumentParser(description="A MuseScore PDF scraper."
+ " Input a URL to a MuseScore score"
+ ", then outputs a multi-page PDF.")
+ ", then outputs a multi-page PDF."
)
parser.add_argument("urls", nargs='+', type=_url_parse,
help="an amount of valid MuseScore score URLs")
parser.add_argument("-o", "--output", nargs='*', type=Path, help="file destination(s)")
help="an amount of valid MuseScore score URLs"
)
parser.add_argument("-o", "--output", nargs='+', type=Path, help="file destination(s)")
parser.add_argument("-t", "--timeout", type=int, help=
"how many milliseconds should be given before aborting.")
"how many milliseconds should be given before aborting."
)
parser.add_argument("-d", "--debug-log", type=_debug_path, nargs="?", const="",
help="receive debug messages, to a log file if destination provided."
)
Expand All @@ -35,7 +40,8 @@ def _main(args: Union[None, List[str], str] = None) -> None:

args = parser.parse_args(args)

assert not args.output or len(args.urls) == len(args.output)
if not (not args.output or len(args.urls) == len(args.output)):
parser.error("# of outputs must match # of urls or omit output flag.")

outputs: List[Optional[Path]] = [None] * len(args.urls)
def set_output(i: int, task: asyncio.Task) -> None:
Expand All @@ -57,7 +63,7 @@ async def run():
task.add_done_callback(partial(set_output, i))
tasks.append(task)

result = await asyncio.wait_for(asyncio.gather(*tasks), args.timeout)
result = await asyncio.gather(*tasks)

return result

Expand Down
6 changes: 6 additions & 0 deletions musescore_scraper/helper.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
from urllib.parse import urlparse

def _valid_url(url: str) -> bool:
final_url = urlparse(url + '/' * int(not url.endswith('/')))
return (all([final_url.scheme, final_url.netloc, final_url.path])
and '.' in final_url.netloc)
17 changes: 14 additions & 3 deletions test/test_main.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,10 +8,11 @@
import pytest
from typing import Any
from tempfile import NamedTemporaryFile
import argparse

sys.path.insert(0, str(Path(__file__).parents[1].resolve()))

from musescore_scraper import main
from musescore_scraper import _main


URLS = [
Expand All @@ -32,22 +33,32 @@ def test_main():
for i in range(len(URLS)):
with NamedTemporaryFile(suffix=".pdf", delete=False) as tf:
fname: Path = Path(tf.name)
main([URLS[i], "-o", str(fname)])
_main([URLS[i], "-o", str(fname)])

assert fname.read_bytes() == DATA_PDFS[i]

fname.unlink()


def test_main_multiple():
    """One ``_main`` invocation with several URLs writes each score's PDF
    to its matching output file."""
    handles = [NamedTemporaryFile(suffix=".pdf", delete=False) for _ in URLS]
    names = [handle.name for handle in handles]

    _main([*URLS, "-o", *names])

    for i, name in enumerate(names):
        out = Path(name)
        assert out.read_bytes() == DATA_PDFS[i]

        out.unlink()


def test_invalid_opts():
    """CLI rejects a url/output count mismatch and a malformed URL."""
    # Several URLs but a single output file -> argparse-level error.
    with NamedTemporaryFile(suffix=".pdf") as tf, \
            pytest.raises((argparse.ArgumentError, SystemExit)):
        _main([*URLS, "-o", tf.name])

    # "foo" fails URL validation in the type= hook.
    with pytest.raises((argparse.ArgumentTypeError, SystemExit)):
        _main(["foo"])

0 comments on commit 27bfb03

Please sign in to comment.