
Commit

Merge pull request #552 from tseaver/447-require_key_dataset_ids_match_branch

Require that keys for 'put' / 'delete' match the 'dataset_id' of the batch
tseaver committed Jan 28, 2015
2 parents 50f0684 + 49977da commit 2c75763
Showing 6 changed files with 489 additions and 146 deletions.
115 changes: 71 additions & 44 deletions gcloud/datastore/api.py
@@ -24,22 +24,38 @@
 from gcloud.datastore import helpers


-def _require_dataset_id(dataset_id=None):
+def _require_dataset_id(dataset_id=None, first_key=None):
     """Infer a dataset ID from the environment, if not passed explicitly.

+    Order of precedence:
+
+    - Passed `dataset_id` (if not None).
+    - `dataset_id` of current batch / transaction (if current exists).
+    - `dataset_id` of first key.
+    - `dataset_id` inferred from the environment (if `set_default_dataset_id`
+      has been called).
+
     :type dataset_id: string
     :param dataset_id: Optional.

+    :type first_key: :class:`gcloud.datastore.key.Key` or None
+    :param first_key: Optional: first key being manipulated.
+
     :rtype: string
     :returns: A dataset ID based on the current environment.
     :raises: :class:`EnvironmentError` if ``dataset_id`` is ``None``,
              and cannot be inferred from the environment.
     """
-    if dataset_id is None:
-        if _implicit_environ.DATASET_ID is None:
-            raise EnvironmentError('Dataset ID could not be inferred.')
-        dataset_id = _implicit_environ.DATASET_ID
-    return dataset_id
+    if dataset_id is not None:
+        return dataset_id
+
+    top = Batch.current()
+    if top is not None:
+        return top.dataset_id
+
+    if first_key is not None:
+        return first_key.dataset_id
+
+    if _implicit_environ.DATASET_ID is None:
+        raise EnvironmentError('Dataset ID could not be inferred.')
+
+    return _implicit_environ.DATASET_ID
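To see the new precedence in action, here is a minimal sketch (not part of the diff; it calls the private helper directly, uses illustrative names, and assumes a default dataset ID was registered via `set_default_dataset_id` and no batch is active):

    from gcloud.datastore.api import _require_dataset_id
    from gcloud.datastore.key import Key

    key = Key('Person', 123, dataset_id='s~my-dataset')

    _require_dataset_id('explicit-id', key)  # -> 'explicit-id' (explicit wins)
    _require_dataset_id(first_key=key)       # -> 's~my-dataset' (key's ID)
    _require_dataset_id()                    # -> the registered default ID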


@@ -54,38 +70,17 @@ def _require_connection(connection=None):
              cannot be inferred from the environment.
     """
     if connection is None:
-        if _implicit_environ.CONNECTION is None:
-            raise EnvironmentError('Connection could not be inferred.')
-        connection = _implicit_environ.CONNECTION
+        top = Batch.current()
+        if top is not None:
+            connection = top.connection
+        else:
+            if _implicit_environ.CONNECTION is None:
+                raise EnvironmentError('Connection could not be inferred.')
+            connection = _implicit_environ.CONNECTION
     return connection
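A quick sketch of the new branch: inside an active batch, the batch's connection now wins over the implicit environment. This assumes `connection` is a Connection object built elsewhere and that Batch supports the context-manager protocol:

    from gcloud.datastore.api import _require_connection
    from gcloud.datastore.batch import Batch

    with Batch(dataset_id='s~my-dataset', connection=connection) as batch:
        # The current batch's connection is returned, not the default one.
        assert _require_connection() is connection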


-def _get_dataset_id_from_keys(keys):
-    """Determines dataset ID from a list of keys.
-
-    :type keys: list of :class:`gcloud.datastore.key.Key`
-    :param keys: The keys from the same dataset.
-
-    :rtype: string
-    :returns: The dataset ID of the keys.
-    :raises: :class:`ValueError` if the key dataset IDs don't agree.
-    """
-    if any(key is None for key in keys):
-        raise ValueError('None not allowed')
-
-    dataset_id = keys[0].dataset_id
-    # Rather than creating a list or set of all dataset IDs, we iterate
-    # and check. We could allow the backend to check this for us if IDs
-    # with no prefix worked (GoogleCloudPlatform/google-cloud-datastore#59)
-    # or if we made sure that a prefix s~ or e~ was on each key.
-    for key in keys[1:]:
-        if key.dataset_id != dataset_id:
-            raise ValueError('All keys in get must be from the same dataset.')
-
-    return dataset_id


-def get(keys, missing=None, deferred=None, connection=None):
+def get(keys, missing=None, deferred=None, connection=None, dataset_id=None):
     """Retrieves entities, along with their attributes.

     :type keys: list of :class:`gcloud.datastore.key.Key`
@@ -103,22 +98,35 @@ def get(keys, missing=None, deferred=None, connection=None):
     :type connection: :class:`gcloud.datastore.connection.Connection`
     :param connection: Optional. The connection used to connect to datastore.
                        If not passed, inferred from the environment.

+    :type dataset_id: string
+    :param dataset_id: Optional. The dataset ID to which the keys belong.
+                       If not passed, inferred from the environment.
+
     :rtype: list of :class:`gcloud.datastore.entity.Entity`
     :returns: The requested entities.
+    :raises: EnvironmentError if ``connection`` or ``dataset_id`` not passed,
+             and cannot be inferred from the environment. ValueError if
+             one or more of ``keys`` has a dataset ID which does not match
+             the passed / inferred dataset ID.
     """
     if not keys:
         return []

     connection = _require_connection(connection)
-    dataset_id = _get_dataset_id_from_keys(keys)
+    dataset_id = _require_dataset_id(dataset_id, keys[0])
+
+    if list(set([key.dataset_id for key in keys])) != [dataset_id]:
+        raise ValueError('Keys do not match dataset ID')

     transaction = Transaction.current()

     entity_pbs = connection.lookup(
         dataset_id=dataset_id,
         key_pbs=[k.to_protobuf() for k in keys],
-        missing=missing, deferred=deferred,
+        missing=missing,
+        deferred=deferred,
         transaction_id=transaction and transaction.id,
     )

@@ -139,51 +147,70 @@
     return entities
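The effect of the new guard in `get`, sketched below with illustrative dataset IDs (assumes an implicit connection has been configured; otherwise EnvironmentError fires first): keys naming two different datasets now fail fast on the client instead of reaching the backend.

    from gcloud.datastore import api
    from gcloud.datastore.key import Key

    keys = [Key('Person', 1, dataset_id='s~ds-one'),
            Key('Person', 2, dataset_id='s~ds-two')]
    api.get(keys)   # raises ValueError('Keys do not match dataset ID')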


-def put(entities, connection=None):
+def put(entities, connection=None, dataset_id=None):
     """Save the entities in the Cloud Datastore.

     :type entities: list of :class:`gcloud.datastore.entity.Entity`
     :param entities: The entities to be saved to the datastore.

     :type connection: :class:`gcloud.datastore.connection.Connection`
     :param connection: Optional connection used to connect to datastore.
                        If not passed, inferred from the environment.

+    :type dataset_id: string
+    :param dataset_id: Optional. The dataset ID to which the entities belong.
+                       If not passed, inferred from the environment.
+
+    :raises: EnvironmentError if ``connection`` or ``dataset_id`` not passed,
+             and cannot be inferred from the environment. ValueError if
+             one or more entities has a key with a dataset ID not matching
+             the passed / inferred dataset ID.
     """
     if not entities:
         return

-    connection = connection or _implicit_environ.CONNECTION
+    connection = _require_connection(connection)
+    dataset_id = _require_dataset_id(dataset_id, entities[0].key)

     current = Batch.current()
     in_batch = current is not None
     if not in_batch:
-        keys = [entity.key for entity in entities]
-        dataset_id = _get_dataset_id_from_keys(keys)
         current = Batch(dataset_id=dataset_id, connection=connection)
     for entity in entities:
         current.put(entity)
     if not in_batch:
         current.commit()
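A sketch of the new `put` signature (illustrative names; assumes an implicit connection, and a live backend for the commit to succeed):

    from gcloud.datastore import api
    from gcloud.datastore.entity import Entity
    from gcloud.datastore.key import Key

    entity = Entity(key=Key('Person', 1, dataset_id='s~my-dataset'))
    api.put([entity], dataset_id='s~my-dataset')     # batched and committed
    api.put([entity], dataset_id='s~other-dataset')  # ValueError from Batch.put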


-def delete(keys, connection=None):
+def delete(keys, connection=None, dataset_id=None):
     """Delete the keys in the Cloud Datastore.

     :type keys: list of :class:`gcloud.datastore.key.Key`
     :param keys: The keys to be deleted from the datastore.

     :type connection: :class:`gcloud.datastore.connection.Connection`
     :param connection: Optional connection used to connect to datastore.
                        If not passed, inferred from the environment.

+    :type dataset_id: string
+    :param dataset_id: Optional. The dataset ID to which the keys belong.
+                       If not passed, inferred from the environment.
+
+    :raises: EnvironmentError if ``connection`` or ``dataset_id`` not passed,
+             and cannot be inferred from the environment. ValueError if
+             one or more keys has a dataset ID not matching the passed /
+             inferred dataset ID.
     """
     if not keys:
         return

-    connection = connection or _implicit_environ.CONNECTION
+    connection = _require_connection(connection)
+    dataset_id = _require_dataset_id(dataset_id, keys[0])

     # We allow partial keys to attempt a delete, the backend will fail.
     current = Batch.current()
     in_batch = current is not None
     if not in_batch:
-        dataset_id = _get_dataset_id_from_keys(keys)
         current = Batch(dataset_id=dataset_id, connection=connection)
     for key in keys:
         current.delete(key)
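`delete` resolves its dataset ID the same way; a sketch under the same assumptions as above:

    from gcloud.datastore import api
    from gcloud.datastore.key import Key

    key = Key('Person', 1, dataset_id='s~my-dataset')
    api.delete([key])                                # commits a one-key batch
    api.delete([key], dataset_id='s~other-dataset')  # ValueError from Batch.delete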
14 changes: 12 additions & 2 deletions gcloud/datastore/batch.py
@@ -210,11 +210,16 @@ def put(self, entity):
         :type entity: :class:`gcloud.datastore.entity.Entity`
         :param entity: the entity to be saved.

-        :raises: ValueError if entity has no key assigned.
+        :raises: ValueError if entity has no key assigned, or if the key's
+                 ``dataset_id`` does not match ours.
         """
         if entity.key is None:
             raise ValueError("Entity must have a key")

+        if not helpers._dataset_ids_equal(self._dataset_id,
+                                          entity.key.dataset_id):
+            raise ValueError("Key must be from same dataset as batch")
+
         _assign_entity_to_mutation(
             self.mutation, entity, self._auto_id_entities)
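Sketch of the new check in `Batch.put` (assumes `connection` was built elsewhere); note that the fuzzy comparison still accepts the unprefixed alias of the batch's dataset ID:

    from gcloud.datastore.batch import Batch
    from gcloud.datastore.entity import Entity
    from gcloud.datastore.key import Key

    batch = Batch(dataset_id='s~my-dataset', connection=connection)
    batch.put(Entity(key=Key('Person', 1, dataset_id='my-dataset')))   # OK: alias
    batch.put(Entity(key=Key('Person', 2, dataset_id='e~elsewhere')))  # ValueError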

@@ -224,11 +229,16 @@ def delete(self, key):
         :type key: :class:`gcloud.datastore.key.Key`
         :param key: the key to be deleted.

-        :raises: ValueError if key is not complete.
+        :raises: ValueError if key is not complete, or if the key's
+                 ``dataset_id`` does not match ours.
         """
         if key.is_partial:
             raise ValueError("Key must be complete")

+        if not helpers._dataset_ids_equal(self._dataset_id,
+                                          key.dataset_id):
+            raise ValueError("Key must be from same dataset as batch")
+
         key_pb = key.to_protobuf()
         helpers._add_keys_to_request(self.mutation.delete, [key_pb])
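The same guard applies to `Batch.delete`; a self-contained sketch under the same assumption about `connection`:

    from gcloud.datastore.batch import Batch
    from gcloud.datastore.key import Key

    batch = Batch(dataset_id='s~my-dataset', connection=connection)
    batch.delete(Key('Person', 1, dataset_id='my-dataset'))   # queued: alias matches
    batch.delete(Key('Person', 2, dataset_id='s~elsewhere'))  # ValueError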

Expand Down
52 changes: 52 additions & 0 deletions gcloud/datastore/helpers.py
@@ -319,3 +319,55 @@ def _add_keys_to_request(request_field_pb, key_pbs):
     for key_pb in key_pbs:
         key_pb = _prepare_key_for_request(key_pb)
         request_field_pb.add().CopyFrom(key_pb)
+
+
+def _dataset_ids_equal(dataset_id1, dataset_id2):
+    """Compares two dataset IDs for fuzzy equality.
+
+    Each may be prefixed or unprefixed (but not null, since a dataset ID
+    is required on a key). The only allowed prefixes are 's~' and 'e~'.
+
+    Two identical prefixed IDs match:
+
+    >>> 's~foo' == 's~foo'
+    >>> 'e~bar' == 'e~bar'
+
+    while non-identical prefixed IDs don't:
+
+    >>> 's~foo' != 's~bar'
+    >>> 's~foo' != 'e~foo'
+
+    Non-prefixed IDs can match either other non-prefixed IDs or
+    prefixed ones:
+
+    >>> 'foo' == 'foo'
+    >>> 'foo' == 's~foo'
+    >>> 'foo' == 'e~foo'
+    >>> 'foo' != 'bar'
+    >>> 'foo' != 's~bar'
+
+    (Ties are resolved since 'foo' can only be an alias for one of
+    s~foo or e~foo in the backend.)
+
+    :type dataset_id1: string
+    :param dataset_id1: A dataset ID.
+
+    :type dataset_id2: string
+    :param dataset_id2: A dataset ID.
+
+    :rtype: boolean
+    :returns: Boolean indicating if the IDs are the same.
+    """
+    if dataset_id1 == dataset_id2:
+        return True
+
+    if dataset_id1.startswith('s~') or dataset_id1.startswith('e~'):
+        # If `dataset_id1` is prefixed and not matching, then the only way
+        # they can match is if `dataset_id2` is unprefixed.
+        return dataset_id1[2:] == dataset_id2
+    elif dataset_id2.startswith('s~') or dataset_id2.startswith('e~'):
+        # Here we know `dataset_id1` is unprefixed and `dataset_id2`
+        # is prefixed.
+        return dataset_id1 == dataset_id2[2:]
+
+    return False
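A few concrete checks of the fuzzy comparison, runnable as-is against the helper added above:

    from gcloud.datastore.helpers import _dataset_ids_equal

    assert _dataset_ids_equal('s~foo', 's~foo')        # identical, prefixed
    assert _dataset_ids_equal('foo', 's~foo')          # unprefixed alias
    assert _dataset_ids_equal('e~foo', 'foo')          # alias, other direction
    assert not _dataset_ids_equal('s~foo', 'e~foo')    # different prefixes
    assert not _dataset_ids_equal('foo', 'bar')        # plainly different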
