asdf-format · eslavich · Aug 6, 2020 · Aug 6, 2020 · jdavies-st · Aug 6, 2020
diff --git a/asdf/generic_io.py b/asdf/generic_io.py
@@ -20,13 +20,13 @@
 from os import SEEK_SET, SEEK_CUR, SEEK_END
 
 import http.client
-from urllib import parse as urlparse
 from urllib.request import url2pathname
 
 import numpy as np
 
 from . import util
 from .extern import atomicfile
+from .util import patched_urllib_parse
 
 
 __all__ = ['get_file', 'get_uri', 'resolve_uri', 'relative_uri']
@@ -144,8 +144,8 @@ def resolve_uri(base, uri):
     """
     if base is None:
         base = ''
-    resolved = urlparse.urljoin(base, uri)
-    parsed = urlparse.urlparse(resolved)
+    resolved = patched_urllib_parse.urljoin(base, uri)
+    parsed = patched_urllib_parse.urlparse(resolved)
     if parsed.path != '' and not parsed.path.startswith('/'):
         raise ValueError(
             "Resolved to relative URL")
@@ -156,8 +156,8 @@ def relative_uri(source, target):
     """
     Make a relative URI from source to target.
     """
-    su = urlparse.urlparse(source)
-    tu = urlparse.urlparse(target)
+    su = patched_urllib_parse.urlparse(source)
+    tu = patched_urllib_parse.urlparse(target)
     extra = list(tu[3:])
     relative = None
     if tu[0] == '' and tu[1] == '':
@@ -175,7 +175,7 @@ def relative_uri(source, target):
             relative = os.path.relpath(tu[2], os.path.dirname(su[2]))
     if relative == '.':
         relative = ''
-    relative = urlparse.urlunparse(["", "", relative] + extra)
+    relative = patched_urllib_parse.urlunparse(["", "", relative] + extra)
     return relative
 
 
@@ -1044,7 +1044,7 @@ def _make_http_connection(init, mode, uri=None):
     Creates a HTTPConnection instance if the HTTP server supports
     Range requests, otherwise falls back to a generic InputStream.
     """
-    parsed = urlparse.urlparse(init)
+    parsed = patched_urllib_parse.urlparse(init)
     connection = http.client.HTTPConnection(parsed.netloc)
     connection.connect()
 
@@ -1166,7 +1166,7 @@ def get_file(init, mode='r', uri=None, close=False):
         return GenericWrapper(init)
 
     elif isinstance(init, (str, pathlib.Path)):
-        parsed = urlparse.urlparse(str(init))
+        parsed = patched_urllib_parse.urlparse(str(init))
         if parsed.scheme in ['http', 'https']:
             if 'w' in mode:
                 raise ValueError(

diff --git a/asdf/schema.py b/asdf/schema.py
@@ -454,7 +454,7 @@ def resolve_refs(node, json_id):
                 else:
                     suburl_path = suburl
                 suburl_path = resolver(suburl_path)
-                if suburl_path == url:
+                if suburl_path == url or suburl_path == schema.get("id"):
                     subschema = schema
                 else:
                     subschema = load_schema(suburl_path, resolver, True)

diff --git a/asdf/tests/test_generic_io.py b/asdf/tests/test_generic_io.py
@@ -453,6 +453,22 @@ def test_relative_uri():
         'http://www.google.com', 'file://local') == 'file://local'
 
 
+@pytest.mark.parametrize("protocol", ["http", "asdf"])
+def test_resolve_uri(protocol):
+    """
+    Check that a fragment and a parent-relative path both resolve
+    against a base URI, for http:// as well as asdf:// URIs.
+    """
+    host = '{}://somewhere.org'.format(protocol)
+
+    resolved = generic_io.resolve_uri(host + '/some-schema', '#/definitions/foo')
+    assert resolved == host + '/some-schema#/definitions/foo'
+
+    target = '../../some/other/path/to/some-other-schema'
+    resolved = generic_io.resolve_uri(host + '/path/to/some-schema', target)
+    assert resolved == host + '/some/other/path/to/some-other-schema'
+
+
 def test_arbitrary_file_object():
     class Wrapper:
         def __init__(self, init):

diff --git a/asdf/tests/test_schema.py b/asdf/tests/test_schema.py
@@ -163,6 +163,37 @@ def test_load_schema_with_file_url(tmpdir):
     schema.check_schema(schema_tree)
 
 
+def test_load_schema_with_asdf_uri_scheme():
+    """
+    Load a schema registered under an asdf:// URI with reference
+    resolution enabled, then run check_schema on the result.
+    """
+    # The $ref value is quoted because an unquoted '#' that follows
+    # whitespace starts a YAML comment, which would leave $ref null.
+    content = """%YAML 1.1
+---
+$schema: http://stsci.edu/schemas/asdf/asdf-schema-1.0.0
+id: asdf://somewhere.org/schemas/foo
+
+definitions:
+  bar:
+    type: string
+
+type: object
+properties:
+  id:
+    type: string
+  bar:
+    $ref: '#/definitions/bar'
+...
+"""
+    with asdf.config_context() as config:
+        config.add_resource_mapping({"asdf://somewhere.org/schemas/foo": content})
+
+        schema_tree = schema.load_schema("asdf://somewhere.org/schemas/foo", resolve_references=True)
+        schema.check_schema(schema_tree)
+
+
 def test_schema_caching():
     # Make sure that if we request the same URL, we get a different object
     # (despite the caching internal to load_schema).  Changes to a schema

diff --git a/asdf/tests/test_util.py b/asdf/tests/test_util.py
@@ -30,3 +30,13 @@ def test_get_class_name():
 
 def test_get_class_name_override():
     assert util.get_class_name(BuiltinExtension, instance=False) == "asdf.extension.BuiltinExtension"
+
+
+def test_patched_urllib_parse():
+    """The asdf scheme is registered on our private copy only."""
+    import urllib.parse
+    patched, stock = util.patched_urllib_parse, urllib.parse
+    assert patched is not stock
+    for registry in ("uses_relative", "uses_netloc"):
+        assert "asdf" in getattr(patched, registry)
+        assert "asdf" not in getattr(stock, registry)
diff --git a/asdf/treeutil.py b/asdf/treeutil.py
@@ -375,8 +375,10 @@ def _recurse(node, json_id=None):
         with _context.pending(node):
             # Take note of the "id" field, in case we're modifying
             # a schema and need to know the namespace for resolving
-            # URIs.
-            if isinstance(node, dict) and "id" in node:
+            # URIs.  Ignore an id that is not a string, since it may
+            # be an object defining an id property and not an id
+            # itself (this is common in metaschemas).
+            if isinstance(node, dict) and "id" in node and isinstance(node["id"], str):
                 json_id = node["id"]
 
             if postorder:

diff --git a/asdf/util.py b/asdf/util.py
@@ -2,13 +2,27 @@
 import math
 import struct
 import types
+import importlib.util
 
-from urllib.parse import urljoin
 from urllib.request import pathname2url
-from urllib import parse as urlparse
 
 import numpy as np
 
+# We're importing our own private copy of urllib.parse because
+# we need to patch it to support asdf:// URIs, and patching the
+# shared module would change URL handling for every other
+# user of the standard library in this process.
+urllib_parse_spec = importlib.util.find_spec('urllib.parse')
+patched_urllib_parse = importlib.util.module_from_spec(urllib_parse_spec)
+urllib_parse_spec.loader.exec_module(patched_urllib_parse)
+del urllib_parse_spec
+
+# urllib.parse needs to know that it should treat asdf://
+# URIs like http:// URIs for the purposes of joining
+# a relative path to a base URI.
+patched_urllib_parse.uses_relative.append('asdf')
+patched_urllib_parse.uses_netloc.append('asdf')
+
 
 __all__ = ['human_list', 'get_array_base', 'get_base_uri', 'filepath_to_url',
            'iter_subclasses', 'calculate_padding', 'resolve_name', 'NotSet',
@@ -58,15 +72,15 @@ def get_base_uri(uri):
     """
     For a given URI, return the part without any fragment.
     """
-    parts = urlparse.urlparse(uri)
-    return urlparse.urlunparse(list(parts[:5]) + [''])
+    parts = patched_urllib_parse.urlparse(uri)
+    return patched_urllib_parse.urlunparse(list(parts[:5]) + [''])
 
 
 def filepath_to_url(path):
     """
     For a given local file path, return a file:// url.
     """
-    return urljoin('file:', pathname2url(path))
+    return patched_urllib_parse.urljoin('file:', pathname2url(path))
 
 
 def iter_subclasses(cls):