fix #690 -- blob packing/unpacking of native python bool, int, float, and complex. #709

Merged Jan 14, 2020 (16 commits). Changes from all commits:
20 changes: 13 additions & 7 deletions CHANGELOG.md
@@ -1,16 +1,22 @@
 ## Release notes
 
-## 0.12.3 -- Nov 22, 2019
-* Bugfix #675 (PR #705) networkx 2.4+ is now supported
-* Bugfix #698 and #699 (PR #706) display table definition in doc string and help
-* Bugfix #701 (PR #702) job reservation works with native python datatype support disabled
+### 0.12.4 -- Jan 14, 2020
+* Support for simple scalar datatypes in blobs (#690) PR #709
+* Add support for the `serial` data type in declarations: alias for `bigint unsigned auto_increment` PR #713
+* Improve the log table to avoid primary key collisions PR #713
+* Improve documentation in README PR #713
+
+### 0.12.3 -- Nov 22, 2019
+* Bugfix - networkx 2.4 causes error in diagrams (#675) PR #705
+* Bugfix - include table definition in doc string and help (#698, #699) PR #706
+* Bugfix - job reservation fails when native python datatype support is disabled (#701) PR #702
 
 ### 0.12.2 -- Nov 11, 2019
-* Bugfix - Convoluted error thrown if there is a reference to a non-existent table attribute (#691)
-* Bugfix - Insert into external does not trim leading slash if defined in `dj.config['stores']['<store>']['location']` (#692)
+* Bugfix - Convoluted error thrown if there is a reference to a non-existent table attribute (#691) PR #696
+* Bugfix - Insert into external does not trim leading slash if defined in `dj.config['stores']['<store>']['location']` (#692) PR #693
 
 ### 0.12.1 -- Nov 2, 2019
-* Bugfix - AttributeAdapter converts into a string (#684)
+* Bugfix - AttributeAdapter converts into a string (#684) PR #688
 
 ### 0.12.0 -- Oct 31, 2019
 * Dropped support for Python 3.4
4 changes: 2 additions & 2 deletions LNX-docker-compose.yml
@@ -33,8 +33,8 @@ services:
"
pip install --user nose nose-cov coveralls .;
pip freeze | grep datajoint;
nosetests -vsw tests --with-coverage --cover-package=datajoint;
coveralls;
nosetests -vsw tests --with-coverage --cover-package=datajoint;
# jupyter notebook;
"
# ports:
@@ -92,4 +92,4 @@ services:
- ./tests/nginx/fullchain.pem:/certs/fullchain.pem
- ./tests/nginx/privkey.pem:/certs/privkey.pem
networks:
-    main:
\ No newline at end of file
+    main:
80 changes: 61 additions & 19 deletions datajoint/blob.py
@@ -1,5 +1,5 @@
"""
(De)serialization methods for python datatypes and numpy.ndarrays with provisions for mutual
(De)serialization methods for basic datatypes and numpy.ndarrays with provisions for mutual
compatibility with Matlab-based serialization implemented by mYm.
"""

@@ -115,21 +115,25 @@ def read_blob(self, n_bytes=None):
"P": self.read_sparse_array, # matlab sparse array -- not supported yet
"S": self.read_struct, # matlab struct array
"C": self.read_cell_array, # matlab cell array
# Python-native
"\xFF": self.read_none, # None
"\1": self.read_tuple, # a Sequence
"\2": self.read_list, # a MutableSequence
"\3": self.read_set, # a Set
"\4": self.read_dict, # a Mapping
"\5": self.read_string, # a UTF8-encoded string
"\6": self.read_bytes, # a ByteString
"F": self.read_recarray, # numpy array with fields, including recarrays
"d": self.read_decimal, # a decimal
"t": self.read_datetime, # date, time, or datetime
"u": self.read_uuid, # UUID
# basic data types
"\xFF": self.read_none, # None
"\x01": self.read_tuple, # a Sequence (e.g. tuple)
"\x02": self.read_list, # a MutableSequence (e.g. list)
"\x03": self.read_set, # a Set
"\x04": self.read_dict, # a Mapping (e.g. dict)
"\x05": self.read_string, # a UTF8-encoded string
"\x06": self.read_bytes, # a ByteString
"\x0a": self.read_int, # unbounded scalar int
"\x0b": self.read_bool, # scalar boolean
"\x0c": self.read_complex, # scalar 128-bit complex number
"\x0d": self.read_float, # scalar 64-bit float
"F": self.read_recarray, # numpy array with fields, including recarrays
"d": self.read_decimal, # a decimal
"t": self.read_datetime, # date, time, or datetime
"u": self.read_uuid, # UUID
}[data_structure_code]
except KeyError:
raise DataJointError('Unknown data structure code "%s"' % data_structure_code)
raise DataJointError('Unknown data structure code "%s". Upgrade datajoint.' % data_structure_code)
v = call()
if n_bytes is not None and self._pos - start != n_bytes:
raise DataJointError('Blob length check failed! Invalid blob')
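Note: to make the dispatch above concrete, here is a minimal, self-contained sketch of how the four new one-byte type tags decode. The tag values and payload layouts come from this PR; the `read_scalar` helper itself is illustrative only and is not the real DataJoint wire format, which additionally carries protocol headers, optional compression, and the blob-length check shown above.

```python
# Illustrative sketch of one-byte type-tag dispatch (not the real DataJoint wire format).
import struct

def read_scalar(blob: bytes):
    tag, payload = blob[:1], blob[1:]
    if tag == b"\x0b":                     # scalar bool: a single byte
        return payload[:1] != b"\x00"
    if tag == b"\x0d":                     # scalar float: one IEEE 754 float64
        return struct.unpack('<d', payload[:8])[0]
    if tag == b"\x0c":                     # scalar complex: two float64 (real, imaginary)
        re, im = struct.unpack('<dd', payload[:16])
        return complex(re, im)
    if tag == b"\x0a":                     # unbounded int: uint16 byte count, then payload
        n = struct.unpack('<H', payload[:2])[0]
        return int.from_bytes(payload[2:2 + n], 'little', signed=True)
    raise ValueError('unknown type tag %r' % tag)

assert read_scalar(b"\x0a\x02\x00\x01\xff") == -255
assert read_scalar(b"\x0d" + struct.pack('<d', 7.5)) == 7.5
```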
@@ -146,13 +150,21 @@ def pack_blob(self, obj):

        # blob types in the expanded dj0 blob format
        self.set_dj0()
-       if not isinstance(obj, (np.ndarray, np.number)):
+       # python built-in data types
+       if isinstance(obj, bool):
+           return self.pack_bool(obj)
+       if isinstance(obj, int):
+           return self.pack_int(obj)
+       if isinstance(obj, complex):
+           return self.pack_complex(obj)
+       if isinstance(obj, float):
+           return self.pack_float(obj)
        if isinstance(obj, np.ndarray) and obj.dtype.fields:
            return self.pack_recarray(np.array(obj))
        if isinstance(obj, np.number):
            return self.pack_array(np.array(obj))
-       if isinstance(obj, (bool, np.bool, np.bool_)):
-           return self.pack_array(np.array(obj))
-       if isinstance(obj, (float, int, complex)):
+       if isinstance(obj, (np.bool, np.bool_)):
            return self.pack_array(np.array(obj))
        if isinstance(obj, (datetime.datetime, datetime.date, datetime.time)):
            return self.pack_datetime(obj)
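Note: one subtlety in the new dispatch order: `bool` is a subclass of `int` in Python, so `pack_blob` must test `bool` before `int`, or `True` would be serialized with the integer tag. A quick standalone check of that ordering (the `tag_for` helper is hypothetical, mirroring the branch order above):

```python
import numpy as np

assert isinstance(True, int)      # bool is a subclass of int...
assert not isinstance(1, bool)    # ...but plain ints are not bools

def tag_for(obj):
    # same branch order as pack_blob: bool first, then int, complex, float
    for tag, cls in ((b"\x0b", bool), (b"\x0a", int), (b"\x0c", complex), (b"\x0d", float)):
        if isinstance(obj, cls):
            return tag

assert tag_for(True) == b"\x0b"           # not misclassified as an int
assert tag_for(1) == b"\x0a"
assert tag_for(np.bool_(True)) is None    # numpy bools are a distinct type and
                                          # fall through to the pack_array branch
```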
@@ -209,7 +221,7 @@ def pack_array(self, array):
        if is_complex:
            array, imaginary = np.real(array), np.imag(array)
        type_id = (rev_class_id[array.dtype] if array.dtype.char != 'U'
-                  else rev_class_id[np.dtype('O')])
+                   else rev_class_id[np.dtype('O')])
        if dtype_list[type_id] is None:
            raise DataJointError("Type %s is ambiguous or unknown" % array.dtype)
Expand Down Expand Up @@ -251,6 +263,36 @@ def pack_recarray(self, array):
    def read_sparse_array(self):
        raise DataJointError('datajoint-python does not yet support sparse arrays. Issue (#590)')

+   def read_int(self):
+       return int.from_bytes(self.read_binary(self.read_value('uint16')), byteorder='little', signed=True)
+
+   @staticmethod
+   def pack_int(v):
+       n_bytes = v.bit_length() // 8 + 1
+       assert 0 < n_bytes <= 0xFFFF, 'Integers are limited to 65535 bytes'
+       return b"\x0a" + np.uint16(n_bytes).tobytes() + v.to_bytes(n_bytes, byteorder='little', signed=True)

+   def read_bool(self):
+       return bool(self.read_value('bool'))
+
+   @staticmethod
+   def pack_bool(v):
+       return b"\x0b" + np.array(v, dtype='bool').tobytes()
+
+   def read_complex(self):
+       return complex(self.read_value('complex128'))
+
+   @staticmethod
+   def pack_complex(v):
+       return b"\x0c" + np.array(v, dtype='complex128').tobytes()
Review comment (Collaborator): We could utilize decimal packing here for the same reasons as float below. Python seems to capture the first 53 bits for each of the real and imaginary parts.

Reply (Member, author): Here Python is not doing anything special; it just uses the standard IEEE 754 encoding.

+   def read_float(self):
+       return float(self.read_value('float64'))
+
+   @staticmethod
+   def pack_float(v):
+       return b"\x0d" + np.array(v, dtype='float64').tobytes()
Review comment (Collaborator): Is there a reason why we did not utilize decimal packing here? Python floats have a precision of 53 bits, which means we would be storing unnecessary additional data.

    def read_decimal(self):
        return Decimal(self.read_string())

@@ -269,7 +311,7 @@ def pack_string(s):

def read_bytes(self):
return self.read_binary(self.read_value())

@staticmethod
def pack_bytes(s):
return b"\6" + len_u64(s) + s
2 changes: 1 addition & 1 deletion datajoint/version.py
@@ -1,3 +1,3 @@
__version__ = "0.12.3"
__version__ = "0.12.4"

assert len(__version__) <= 10  # the log table limits version to 10 characters
17 changes: 11 additions & 6 deletions docs-parts/intro/Releases_lang1.rst
@@ -1,18 +1,23 @@
+0.12.4 -- Jan 14, 2020
+* Support for simple scalar datatypes in blobs (#690) PR #709
+* Add support for the `serial` data type in declarations: alias for `bigint unsigned auto_increment` PR #713
+* Improve the log table to avoid primary key collisions PR #713
+* Improve documentation in README PR #713
+
 0.12.3 -- Nov 22, 2019
 ----------------------
 * Bugfix - networkx 2.4 causes error in diagrams (#675) PR #705
-* Bugfix - include definition in doc string and help (#698, #699) PR #706
-* Bugfix - job reservation fails when native python datatype support is disabled (#701) PR #702
-
+* Bugfix - include table definition in doc string and help (#698, #699) PR #706
+* Bugfix - job reservation fails when native python datatype support is disabled (#701) PR #702
 
 0.12.2 -- Nov 11, 2019
 -------------------------
-* Bugfix - Convoluted error thrown if there is a reference to a non-existent table attribute (#691)
-* Bugfix - Insert into external does not trim leading slash if defined in `dj.config['stores']['<store>']['location']` (#692)
+* Bugfix - Convoluted error thrown if there is a reference to a non-existent table attribute (#691) PR #696
+* Bugfix - Insert into external does not trim leading slash if defined in `dj.config['stores']['<store>']['location']` (#692) PR #693
 
 0.12.1 -- Nov 2, 2019
 -------------------------
-* Bugfix - AttributeAdapter converts into a string (#684)
+* Bugfix - AttributeAdapter converts into a string (#684) PR #688
 
 0.12.0 -- Oct 31, 2019
 -------------------------
38 changes: 19 additions & 19 deletions tests/schema_adapted.py
@@ -1,5 +1,6 @@
 import datajoint as dj
 import networkx as nx
+import json
 from pathlib import Path
 import tempfile
 from datajoint import errors
@@ -11,8 +12,8 @@
        S3_CONN_INFO,
        protocol='s3',
        location='adapted/repo',
-       stage=tempfile.mkdtemp())
-}
+       stage=tempfile.mkdtemp())}

dj.config['stores'] = stores_config

schema_name = PREFIX + '_test_custom_datatype'
@@ -53,37 +54,36 @@ class Connectivity(dj.Manual):
errors._switch_filepath_types(True)


-class Filepath2GraphAdapter(dj.AttributeAdapter):
+class LayoutToFilepath(dj.AttributeAdapter):
    """
    An adapted data type that saves a graph layout into fixed filepath
    """

    attribute_type = 'filepath@repo_s3'

    @staticmethod
-   def get(obj):
-       s = open(obj, "r").read()
-       return nx.spring_layout(
-           nx.lollipop_graph(4, 2), seed=int(s))
+   def get(path):
+       with open(path, "r") as f:
+           return json.load(f)

    @staticmethod
-   def put(obj):
-       path = Path(
-           dj.config['stores']['repo_s3']['stage'], 'sample.txt')
-
-       f = open(path, "w")
-       f.write(str(obj*obj))
-       f.close()
-
+   def put(layout):
+       path = Path(dj.config['stores']['repo_s3']['stage'], 'layout.json')
+       with open(str(path), "w") as f:
+           json.dump(layout, f)
        return path


-file2graph = Filepath2GraphAdapter()
+layout_to_filepath = LayoutToFilepath()


@schema
-class Position(dj.Manual):
+class Layout(dj.Manual):
    definition = """
-   pos_id : int
+   # stores graph layout
+   -> Connectivity
    ---
-   seed_root: <file2graph>
+   layout: <layout_to_filepath>
    """

errors._switch_filepath_types(False)
24 changes: 16 additions & 8 deletions tests/test_adapted_attributes.py
@@ -1,9 +1,9 @@
 import datajoint as dj
 import networkx as nx
 from itertools import zip_longest
-from nose.tools import assert_true, assert_equal
+from nose.tools import assert_true, assert_equal, assert_dict_equal
 from . import schema_adapted as adapted
-from .schema_adapted import graph, file2graph
+from .schema_adapted import graph, layout_to_filepath


def test_adapted_type():
@@ -22,17 +22,25 @@ def test_adapted_type():

def test_adapted_filepath_type():
    # https://github.com/datajoint/datajoint-python/issues/684
-
    dj.errors._switch_adapted_types(True)
    dj.errors._switch_filepath_types(True)
-   c = adapted.Position()
-   Position.insert([{'pos_id': 0, 'seed_root': 3}])
-   result = (Position & 'pos_id=0').fetch1('seed_root')
-
-   assert_true(isinstance(result, dict))
-   assert_equal(0.3761992090175474, result[1][0])
-   assert_true(6 == len(result))
+   c = adapted.Connectivity()
+   c.delete()
+   c.insert1((0, nx.lollipop_graph(4, 2)))

+   layout = nx.spring_layout(c.fetch1('conn_graph'))
+   # make json friendly
+   layout = {str(k): [round(r, ndigits=4) for r in v] for k, v in layout.items()}
+   t = adapted.Layout()
+   t.insert1((0, layout))
+   result = t.fetch1('layout')
+   assert_dict_equal(result, layout)
+
+   t.delete()
+   c.delete()

    dj.errors._switch_filepath_types(False)
    dj.errors._switch_adapted_types(False)
32 changes: 25 additions & 7 deletions tests/test_blob.py
@@ -4,7 +4,7 @@
 from datetime import datetime
 from datajoint.blob import pack, unpack
 from numpy.testing import assert_array_equal
-from nose.tools import assert_equal, assert_true, \
+from nose.tools import assert_equal, assert_true, assert_false, \
     assert_list_equal, assert_set_equal, assert_tuple_equal, assert_dict_equal


@@ -23,23 +23,41 @@ def test_pack():
    x = np.random.randn(10)
    assert_array_equal(x, unpack(pack(x)), "Arrays do not match!")

-   x = 7j
-   assert_equal(x, unpack(pack(x)), "Complex scalar does not match")
-
    x = np.float32(np.random.randn(3, 4, 5))
    assert_array_equal(x, unpack(pack(x)), "Arrays do not match!")

    x = np.int16(np.random.randn(1, 2, 3))
    assert_array_equal(x, unpack(pack(x)), "Arrays do not match!")

    x = None
-   assert_true(x is None, "None did not match")
+   assert_true(unpack(pack(x)) is None, "None did not match")

+   x = -255
+   y = unpack(pack(x))
+   assert_true(x == y and isinstance(y, int) and not isinstance(y, np.ndarray), "Scalar int did not match")
+
+   x = -25523987234234287910987234987098245697129798713407812347
+   y = unpack(pack(x))
+   assert_true(x == y and isinstance(y, int) and not isinstance(y, np.ndarray), "Unbounded int did not match")
+
+   x = 7.
+   y = unpack(pack(x))
+   assert_true(x == y and isinstance(y, float) and not isinstance(y, np.ndarray), "Scalar float did not match")
+
+   x = 7j
+   y = unpack(pack(x))
+   assert_true(x == y and isinstance(y, complex) and not isinstance(y, np.ndarray), "Complex scalar did not match")
+
+   x = True
+   assert_true(unpack(pack(x)) is True, "Scalar bool did not match")
+
    x = [None]
    assert_list_equal(x, unpack(pack(x)))

-   x = {'name': 'Anonymous', 'age': 15, 99: datetime.now(), 'range': [110, 190], (11,12): None}
-   assert_dict_equal(x, unpack(pack(x)), "Dict do not match!")
+   x = {'name': 'Anonymous', 'age': 15, 99: datetime.now(), 'range': [110, 190], (11, 12): None}
+   y = unpack(pack(x))
+   assert_dict_equal(x, y, "Dicts do not match!")
+   assert_false(isinstance(y['range'][0], np.ndarray), "Scalar int was coerced into array.")

    x = uuid.uuid4()
    assert_equal(x, unpack(pack(x)), 'UUID did not match')