Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Cherry-pick into 1.5 #443

Merged
merged 5 commits into from
Jun 19, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions edgedb/codegen/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -56,6 +56,11 @@ def error(self, message):
nargs="?",
help="Generate a single file instead of one per .edgeql file.",
)
parser.add_argument(
"--dir",
action="append",
help="Only search .edgeql files under specified directories.",
)
parser.add_argument(
"--target",
choices=["blocking", "async"],
Expand Down
20 changes: 19 additions & 1 deletion edgedb/codegen/generator.py
Original file line number Diff line number Diff line change
Expand Up @@ -145,6 +145,20 @@ def __init__(self, args: argparse.Namespace):
print_msg(f"Found EdgeDB project: {C.BOLD}{self._project_dir}{C.ENDC}")
self._client = edgedb.create_client(**_get_conn_args(args))
self._single_mode_files = args.file
self._search_dirs = []
for search_dir in args.dir or []:
search_dir = pathlib.Path(search_dir).absolute()
if (
search_dir == self._project_dir
or self._project_dir in search_dir.parents
):
self._search_dirs.append(search_dir)
else:
print(
f"--dir '{search_dir}' is not under "
f"the project directory: {self._project_dir}"
)
sys.exit(1)
self._method_names = set()
self._describe_results = []

Expand All @@ -170,7 +184,11 @@ def run(self):
print(f"Failed to connect to EdgeDB instance: {e}")
sys.exit(61)
with self._client:
self._process_dir(self._project_dir)
if self._search_dirs:
for search_dir in self._search_dirs:
self._process_dir(search_dir)
else:
self._process_dir(self._project_dir)
for target, suffix, is_async in SUFFIXES:
if target in self._targets:
self._async = is_async
Expand Down
9 changes: 9 additions & 0 deletions edgedb/con_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -706,6 +706,12 @@ def _parse_connect_dsn_and_args(
f'project defined cloud profile ("{cloud_profile}")'
),
)

opt_database_file = os.path.join(stash_dir, 'database')
if os.path.exists(opt_database_file):
with open(opt_database_file, 'rt') as f:
database = f.read().strip()
resolved_config.set_database(database, "project")
else:
raise errors.ClientConnectionError(
f'Found `edgedb.toml` but the project is not initialized. '
Expand Down Expand Up @@ -870,6 +876,9 @@ def _parse_cloud_instance_name_into_config(
org_slug: str,
instance_name: str,
):
org_slug = org_slug.lower()
instance_name = instance_name.lower()

label = f"{instance_name}--{org_slug}"
if len(label) > DOMAIN_LABEL_MAX_LENGTH:
raise ValueError(
Expand Down
107 changes: 104 additions & 3 deletions edgedb/protocol/codecs/codecs.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -17,13 +17,16 @@
#


import array
import decimal
import uuid
import datetime
from edgedb import describe
from edgedb import enums
from edgedb.datatypes import datatypes

from libc.string cimport memcpy


include "./edb_types.pxi"

Expand Down Expand Up @@ -347,14 +350,16 @@ cdef dict BASE_SCALAR_CODECS = {}
cdef register_base_scalar_codec(
str name,
pgproto.encode_func encoder,
pgproto.decode_func decoder):
pgproto.decode_func decoder,
object tid = None):

cdef:
BaseCodec codec

tid = TYPE_IDS.get(name)
if tid is None:
raise RuntimeError(f'cannot find known ID for type {name!r}')
tid = TYPE_IDS.get(name)
if tid is None:
raise RuntimeError(f'cannot find known ID for type {name!r}')
tid = tid.bytes

if tid in BASE_SCALAR_CODECS:
Expand Down Expand Up @@ -510,6 +515,94 @@ cdef config_memory_decode(pgproto.CodecContext settings, FRBuffer *buf):
return datatypes.ConfigMemory(bytes=bytes)


DEF PGVECTOR_MAX_DIM = (1 << 16) - 1


cdef pgvector_encode_memview(pgproto.CodecContext settings, WriteBuffer buf,
                             float[:] obj):
    # Serialize a float32 memoryview in the pgvector wire format:
    # int32 payload length, uint16 dimension count, uint16 reserved (0),
    # followed by one float32 per element.
    cdef:
        Py_ssize_t n_elems
        Py_ssize_t idx

    n_elems = len(obj)
    if n_elems > PGVECTOR_MAX_DIM:
        raise ValueError('too many elements in vector value')

    # 4 header bytes (dim + reserved) plus 4 bytes per float32 element.
    buf.write_int32(4 + n_elems * 4)
    buf.write_int16(n_elems)
    buf.write_int16(0)

    idx = 0
    while idx < n_elems:
        buf.write_float(obj[idx])
        idx += 1


cdef pgvector_encode(pgproto.CodecContext settings, WriteBuffer buf,
                     object obj):
    # Encode an arbitrary Python container as a pgvector value.
    #
    # Raises TypeError when *obj* is not a sized iterable, and
    # ValueError when it has more than PGVECTOR_MAX_DIM elements.
    cdef:
        Py_ssize_t objlen
        float[:] memview
        Py_ssize_t i

    # If we can take a typed memview of the object, we use that.
    # That is good, because it means we can consume array.array and
    # numpy.ndarray without needing to unbox.
    # Otherwise we take the slow path, indexing into the array using
    # the normal protocol.
    try:
        memview = obj
    except (ValueError, TypeError):
        # Not float32-buffer-compatible; fall through to the generic path.
        pass
    else:
        pgvector_encode_memview(settings, buf, memview)
        return

    if not _is_array_iterable(obj):
        raise TypeError(
            'a sized iterable container expected (got type {!r})'.format(
                type(obj).__name__))

    # Annoyingly, this is literally identical code to the fast path...
    # but the types are different in critical ways.
    objlen = len(obj)
    if objlen > PGVECTOR_MAX_DIM:
        raise ValueError('too many elements in vector value')

    # 4 header bytes (dim + reserved) plus 4 bytes per float32 element.
    buf.write_int32(4 + objlen*4)
    buf.write_int16(objlen)
    buf.write_int16(0)
    for i in range(objlen):
        buf.write_float(obj[i])


cdef object ONE_EL_ARRAY = array.array('f', [0.0])


cdef pgvector_decode(pgproto.CodecContext settings, FRBuffer *buf):
    # Decode a pgvector wire value into an array.array('f').
    cdef:
        int32_t dim
        Py_ssize_t size
        Py_buffer view
        char *p
        float[:] array_view

    # Wire header: uint16 dimension count, then 2 reserved bytes (skipped).
    dim = hton.unpack_uint16(frb_read(buf, 2))
    frb_read(buf, 2)

    # Payload is dim packed float32 values (4 bytes each).
    size = dim * 4
    p = frb_read(buf, size)

    # Create a float array with size dim
    val = ONE_EL_ARRAY * dim

    # And fill it with the buffer contents
    # NOTE(review): &array_view[0] on a zero-dimension vector would index an
    # empty memoryview — presumably dim >= 1 on the wire; confirm.
    array_view = val
    memcpy(&array_view[0], p, size)
    # NOTE(review): unconditional byteswap() assumes a little-endian host
    # (wire data is big-endian) — confirm behavior on big-endian platforms.
    val.byteswap()

    return val


cdef checked_decimal_encode(
pgproto.CodecContext settings, WriteBuffer buf, obj
):
Expand Down Expand Up @@ -708,4 +801,12 @@ cdef register_base_scalar_codecs():
config_memory_decode)


register_base_scalar_codec(
'ext::pgvector::vector',
pgvector_encode,
pgvector_decode,
uuid.UUID('9565dd88-04f5-11ee-a691-0b6ebe179825'),
)


register_base_scalar_codecs()
2 changes: 1 addition & 1 deletion tests/shared-client-testcases
4 changes: 4 additions & 0 deletions tests/test_con_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,7 @@


class TestConUtils(unittest.TestCase):
maxDiff = 1000

error_mapping = {
'credentials_file_not_found': (
Expand Down Expand Up @@ -184,6 +185,9 @@ def run_testcase(self, testcase):
if 'cloud-profile' in v:
profile = os.path.join(dir, 'cloud-profile')
files[profile] = v['cloud-profile']
if 'database' in v:
database_file = os.path.join(dir, 'database')
files[database_file] = v['database']
del files[f]

es.enter_context(
Expand Down
131 changes: 131 additions & 0 deletions tests/test_vector.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,131 @@
#
# This source file is part of the EdgeDB open source project.
#
# Copyright 2019-present MagicStack Inc. and the EdgeDB authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

from edgedb import _testbase as tb
import edgedb

import array


class brokenarray(array.array):
    """An ``array.array`` subtype on which item access always fails.

    Used to verify that the non-boxing, memoryview-based fast path is
    taken by the encoder: any fallback to per-element indexing (the slow
    path) immediately trips the error below.
    """

    def __getitem__(self, i):
        raise AssertionError("the fast path wasn't used!")


class TestVector(tb.SyncQueryTestCase):
    """End-to-end tests for the ext::pgvector::vector codec."""

    def setUp(self):
        # Skip when the server does not ship the pgvector extension
        # package; otherwise enable it for the test database.
        super().setUp()

        if not self.client.query_required_single('''
            select exists (
                select sys::ExtensionPackage filter .name = 'pgvector'
            )
        '''):
            self.skipTest("feature not implemented")

        self.client.execute('''
            create extension pgvector;
        ''')

    def tearDown(self):
        # Always drop the extension created in setUp, even if the test
        # body failed, then let the base class do its own teardown.
        try:
            self.client.execute('''
                drop extension pgvector;
            ''')
        finally:
            super().tearDown()

    # NOTE(review): `async def` on a SyncQueryTestCase — presumably the
    # project test metaclass awaits coroutine test methods; confirm,
    # otherwise this test silently returns an un-awaited coroutine.
    async def test_vector_01(self):
        # Decoding: a vector literal comes back as array.array('f').
        val = self.client.query_single('''
            select <ext::pgvector::vector>[1.5,2.0,3.8]
        ''')
        self.assertTrue(isinstance(val, array.array))
        self.assertEqual(val, array.array('f', [1.5, 2.0, 3.8]))

        # Encoding: a plain list of floats (slow path).
        val = self.client.query_single(
            '''
            select <json><ext::pgvector::vector>$0
            ''',
            [3.0, 9.0, -42.5],
        )
        self.assertEqual(val, '[3, 9, -42.5]')

        # Encoding: a float32 array.array (memoryview fast path).
        val = self.client.query_single(
            '''
            select <json><ext::pgvector::vector>$0
            ''',
            array.array('f', [3.0, 9.0, -42.5])
        )
        self.assertEqual(val, '[3, 9, -42.5]')

        # Encoding: an int array.array (not float32, so slow path).
        val = self.client.query_single(
            '''
            select <json><ext::pgvector::vector>$0
            ''',
            array.array('i', [1, 2, 3]),
        )
        self.assertEqual(val, '[1, 2, 3]')

        # Test that the fast-path works: if the encoder tries to
        # call __getitem__ on this brokenarray, it will fail.
        val = self.client.query_single(
            '''
            select <json><ext::pgvector::vector>$0
            ''',
            brokenarray('f', [3.0, 9.0, -42.5])
        )
        self.assertEqual(val, '[3, 9, -42.5]')

        # I don't think it's worth adding a dependency to test this,
        # but this works too:
        # import numpy as np
        # val = self.client.query_single(
        #     '''
        #     select <json><ext::pgvector::vector>$0
        #     ''',
        #     np.asarray([3.0, 9.0, -42.5], dtype=np.float32),
        # )
        # self.assertEqual(val, '[3,9,-42.5]')

        # Some sad path tests
        with self.assertRaises(edgedb.InvalidArgumentError):
            self.client.query_single(
                '''
                select <ext::pgvector::vector>$0
                ''',
                [3.0, None, -42.5],
            )

        with self.assertRaises(edgedb.InvalidArgumentError):
            self.client.query_single(
                '''
                select <ext::pgvector::vector>$0
                ''',
                [3.0, 'x', -42.5],
            )

        with self.assertRaises(edgedb.InvalidArgumentError):
            self.client.query_single(
                '''
                select <ext::pgvector::vector>$0
                ''',
                'foo',
            )