Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

fix #690 -- blob packing/unpacking of native python bool, int, float, and complex. #709

Merged
merged 16 commits into from
Jan 14, 2020
Merged
Show file tree
Hide file tree
Changes from 5 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
70 changes: 55 additions & 15 deletions datajoint/blob.py
Original file line number Diff line number Diff line change
Expand Up @@ -116,20 +116,24 @@ def read_blob(self, n_bytes=None):
"S": self.read_struct, # matlab struct array
"C": self.read_cell_array, # matlab cell array
# Python-native
"\xFF": self.read_none, # None
"\1": self.read_tuple, # a Sequence
"\2": self.read_list, # a MutableSequence
"\3": self.read_set, # a Set
"\4": self.read_dict, # a Mapping
"\5": self.read_string, # a UTF8-encoded string
"\6": self.read_bytes, # a ByteString
"F": self.read_recarray, # numpy array with fields, including recarrays
"d": self.read_decimal, # a decimal
"t": self.read_datetime, # date, time, or datetime
"u": self.read_uuid, # UUID
"\xFF": self.read_none, # None
"\x01": self.read_tuple, # a Sequence (e.g. tuple)
"\x02": self.read_list, # a MutableSequence (e.g. list)
"\x03": self.read_set, # a Set
"\x04": self.read_dict, # a Mapping (e.g. dict)
"\x05": self.read_string, # a UTF8-encoded string
"\x06": self.read_bytes, # a ByteString
"\x0a": self.read_int, # python-native int
"\x0b": self.read_bool, # python-native bool
"\x0c": self.read_complex, # python-native complex
"\x0d": self.read_float, # python-native float
"F": self.read_recarray, # numpy array with fields, including recarrays
"d": self.read_decimal, # a decimal
"t": self.read_datetime, # date, time, or datetime
"u": self.read_uuid, # UUID
}[data_structure_code]
except KeyError:
raise DataJointError('Unknown data structure code "%s"' % data_structure_code)
raise DataJointError('Unknown data structure code "%s". Upgrade datajoint.' % data_structure_code)
v = call()
if n_bytes is not None and self._pos - start != n_bytes:
raise DataJointError('Blob length check failed! Invalid blob')
Expand All @@ -146,13 +150,21 @@ def pack_blob(self, obj):

# blob types in the expanded dj0 blob format
self.set_dj0()
if not isinstance(obj, (np.ndarray, np.number)):
# python built-in data types
if isinstance(obj, bool):
return self.pack_bool(obj)
if isinstance(obj, int):
return self.pack_int(obj)
if isinstance(obj, complex):
return self.pack_complex(obj)
if isinstance(obj, float):
return self.pack_float(obj)
if isinstance(obj, np.ndarray) and obj.dtype.fields:
return self.pack_recarray(np.array(obj))
if isinstance(obj, np.number):
return self.pack_array(np.array(obj))
if isinstance(obj, (bool, np.bool, np.bool_)):
return self.pack_array(np.array(obj))
if isinstance(obj, (float, int, complex)):
if isinstance(obj, (np.bool, np.bool_)):
return self.pack_array(np.array(obj))
if isinstance(obj, (datetime.datetime, datetime.date, datetime.time)):
return self.pack_datetime(obj)
Expand Down Expand Up @@ -251,6 +263,34 @@ def pack_recarray(self, array):
def read_sparse_array(self):
    """Reject sparse-array blobs.

    Sparse arrays are not supported, so this reader never returns a value.

    :raises DataJointError: always.
    """
    message = 'datajoint-python does not yet support sparse arrays. Issue (#590)'
    raise DataJointError(message)

def read_int(self):
    """Read one value as 'int64' and return it as a built-in ``int``.

    The conversion through ``int()`` ensures the caller receives a
    python-native integer rather than whatever ``read_value`` yields.
    """
    raw = self.read_value('int64')
    return int(raw)

@staticmethod
def pack_int(v):
    """Serialize a python-native int.

    :param v: the integer to pack; it is stored as an int64, so it must fit
        in 64 bits.
    :return: bytes made of the type code ``b"\\x0a"`` followed by the raw
        bytes of the value as a numpy int64 scalar array.
    """
    payload = np.array(v, dtype='int64').tobytes()
    return b"\x0a" + payload
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is there a reason why we did not use decimal packing here? Python ints are essentially unbounded (limited only by available memory). I believe decimal packing would be a closer representation, since the length would be encoded.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

modified to support unbounded int


def read_bool(self):
    """Read one value as 'bool' and return it as a built-in ``bool``."""
    raw = self.read_value('bool')
    return bool(raw)

@staticmethod
def pack_bool(v):
    """Serialize a python-native bool.

    :param v: the boolean to pack.
    :return: bytes made of the type code ``b"\\x0b"`` followed by the raw
        bytes of the value as a numpy bool scalar array.
    """
    payload = np.array(v, dtype='bool').tobytes()
    return b"\x0b" + payload

def read_complex(self):
    """Read one value as 'complex128' and return it as a built-in ``complex``."""
    raw = self.read_value('complex128')
    return complex(raw)

@staticmethod
def pack_complex(v):
    """Serialize a python-native complex number.

    :param v: the complex value to pack.
    :return: bytes made of the type code ``b"\\x0c"`` followed by the raw
        bytes of the value as a numpy complex128 scalar array.
    """
    payload = np.array(v, dtype='complex128').tobytes()
    return b"\x0c" + payload
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We could use decimal packing here, for the same reasons as for float below. Python seems to keep 53 bits of precision for each of the real and imaginary parts.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Here, Python is not doing anything special; it just uses the standard IEEE 754 encoding.


def read_float(self):
    """Read one value as 'float64' and return it as a built-in ``float``."""
    raw = self.read_value('float64')
    return float(raw)

@staticmethod
def pack_float(v):
    """Serialize a python-native float.

    :param v: the float to pack.
    :return: bytes made of the type code ``b"\\x0d"`` followed by the raw
        bytes of the value as a numpy float64 scalar array.
    """
    payload = np.array(v, dtype='float64').tobytes()
    return b"\x0d" + payload
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is there a reason why we did not use decimal packing here? Python floats have 53 bits of precision, which means we would be storing unnecessary additional data.


def read_decimal(self):
    """Read a string via ``read_string`` and return it parsed as a ``Decimal``."""
    text = self.read_string()
    return Decimal(text)

Expand Down
28 changes: 21 additions & 7 deletions tests/test_blob.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
from datetime import datetime
from datajoint.blob import pack, unpack
from numpy.testing import assert_array_equal
from nose.tools import assert_equal, assert_true, \
from nose.tools import assert_equal, assert_true, assert_false, \
assert_list_equal, assert_set_equal, assert_tuple_equal, assert_dict_equal


Expand All @@ -23,23 +23,37 @@ def test_pack():
x = np.random.randn(10)
assert_array_equal(x, unpack(pack(x)), "Arrays do not match!")

x = 7j
assert_equal(x, unpack(pack(x)), "Complex scalar does not match")

x = np.float32(np.random.randn(3, 4, 5))
assert_array_equal(x, unpack(pack(x)), "Arrays do not match!")

x = np.int16(np.random.randn(1, 2, 3))
assert_array_equal(x, unpack(pack(x)), "Arrays do not match!")

x = None
assert_true(x is None, "None did not match")
assert_true(unpack(pack(x)) is None, "None did not match")

x = 7
y = unpack(pack(x))
assert_true(x == y and isinstance(y, int) and not isinstance(y, np.ndarray), "Native int did not match")

x = 7.
y = unpack(pack(x))
assert_true(x == y and isinstance(y, float) and not isinstance(y, np.ndarray), "Native float did not match")

x = 7j
y = unpack(pack(x))
assert_true(x == y and isinstance(y, complex) and not isinstance(y, np.ndarray), "Native complex did not match")

x = True
assert_true(unpack(pack(x)) is True, "Native bool did not match")

x = [None]
assert_list_equal(x, unpack(pack(x)))

x = {'name': 'Anonymous', 'age': 15, 99: datetime.now(), 'range': [110, 190], (11,12): None}
assert_dict_equal(x, unpack(pack(x)), "Dict do not match!")
x = {'name': 'Anonymous', 'age': 15, 99: datetime.now(), 'range': [110, 190], (11, 12): None}
y = unpack(pack(x))
assert_dict_equal(x, y, "Dict do not match!")
assert_false(isinstance(['range'][0], np.ndarray), "Python-native scalars did not match.")

x = uuid.uuid4()
assert_equal(x, unpack(pack(x)), 'UUID did not match')
Expand Down