Skip to content

Commit

Permalink
Infer dtype from data when spec dtype is none or numeric (#143)
Browse files — browse the repository at this point in the history
* Infer dtype from data when spec dtype is none or numeric

* Add tests for ObjectMapper.convert_dtype

* Fix tests for linux numpy precision
  • Loading branch information
rly authored Sep 26, 2019
1 parent 709f1f9 commit 62d487e
Show file tree
Hide file tree
Showing 5 changed files with 170 additions and 12 deletions.
5 changes: 3 additions & 2 deletions src/hdmf/backends/hdf5/h5tools.py
Original file line number Diff line number Diff line change
Expand Up @@ -967,10 +967,11 @@ def __get_ref(self, **kwargs):
def __is_ref(self, dtype):
    """Return True if *dtype* denotes an object or region reference.

    A DtypeSpec (a field of a compound dtype) is unwrapped recursively;
    a RefSpec is always a reference; a string dtype is a reference iff it
    equals one of the DatasetBuilder reference-type constants. Any other
    dtype is not a reference.
    """
    if isinstance(dtype, DtypeSpec):
        # Compound-dtype field: decide based on the wrapped dtype.
        return self.__is_ref(dtype.dtype)
    if isinstance(dtype, RefSpec):
        return True
    if isinstance(dtype, str):
        return dtype == DatasetBuilder.OBJECT_REF_TYPE or dtype == DatasetBuilder.REGION_REF_TYPE
    return False

def __queue_ref(self, func):
'''Set aside filling dset with references
Expand Down
21 changes: 15 additions & 6 deletions src/hdmf/build/map.py
Original file line number Diff line number Diff line change
Expand Up @@ -471,12 +471,21 @@ def __check_edgecases(cls, spec, value):
return value, spec.dtype
if isinstance(value, DataIO):
return value, cls.convert_dtype(spec, value.data)[1]
if spec.dtype is None:
return value, None
if spec.dtype == 'numeric':
return value, None
if type(value) in cls.__no_convert:
return value, None
if spec.dtype is None or spec.dtype == 'numeric' or type(value) in cls.__no_convert:
# infer type from value
if hasattr(value, 'dtype'): # covers numpy types, AbstractDataChunkIterator
return value, value.dtype
if isinstance(value, (list, tuple)):
if len(value) == 0:
msg = "cannot infer dtype of empty list or tuple. Please use numpy array with specified dtype."
raise ValueError(msg)
return value, cls.__check_edgecases(spec, value[0])[1] # infer dtype from first element
ret_dtype = type(value)
if ret_dtype is str:
ret_dtype = 'utf8'
elif ret_dtype is bytes:
ret_dtype = 'ascii'
return value, ret_dtype
if isinstance(spec.dtype, RefSpec):
if not isinstance(value, ReferenceBuilder):
msg = "got RefSpec for value of type %s" % type(value)
Expand Down
4 changes: 2 additions & 2 deletions src/hdmf/spec/spec.py
Original file line number Diff line number Diff line change
Expand Up @@ -305,8 +305,8 @@ def __init__(self, **kwargs):
getargs('name', 'doc', 'parent', 'quantity', 'attributes',
'linkable', 'data_type_def', 'data_type_inc', kwargs)
if name == NAME_WILDCARD and data_type_def is None and data_type_inc is None:
raise ValueError("Cannot create Group or Dataset spec with wildcard name \
without specifying 'data_type_def' and/or 'data_type_inc'")
raise ValueError("Cannot create Group or Dataset spec with wildcard name "
"without specifying 'data_type_def' and/or 'data_type_inc'")
super(BaseStorageSpec, self).__init__(doc, name=name, parent=parent)
default_name = getargs('default_name', kwargs)
if default_name:
Expand Down
149 changes: 148 additions & 1 deletion tests/unit/build_tests/test_io_map.py
Original file line number Diff line number Diff line change
@@ -1,13 +1,16 @@
import unittest2 as unittest
import re

from hdmf.spec import GroupSpec, AttributeSpec, DatasetSpec, SpecCatalog, SpecNamespace, NamespaceCatalog
from hdmf.spec import GroupSpec, AttributeSpec, DatasetSpec, SpecCatalog, SpecNamespace, NamespaceCatalog, RefSpec
from hdmf.build import GroupBuilder, DatasetBuilder, ObjectMapper, BuildManager, TypeMap, LinkBuilder
from hdmf import Container
from hdmf.utils import docval, getargs, get_docval
from hdmf.data_utils import DataChunkIterator
from hdmf.backends.hdf5 import H5DataIO

from abc import ABCMeta
from six import with_metaclass
import numpy as np

from tests.unit.test_utils import CORE_NAMESPACE

Expand Down Expand Up @@ -496,5 +499,149 @@ def test_build_child_link(self):
self.assertDictEqual(bar2_builder, bar2_expected)


class TestConvertDtype(unittest.TestCase):
    """Tests for ObjectMapper.convert_dtype: None values, precision
    promotion against a spec dtype, and dtype inference when the spec
    dtype is None or 'numeric'.
    """

    def test_value_none(self):
        """A None value is returned unchanged with the spec's dtype (reftype for a RefSpec)."""
        spec = DatasetSpec('an example dataset', 'int', name='data')
        self.assertTupleEqual(ObjectMapper.convert_dtype(spec, None), (None, 'int'))

        spec = DatasetSpec('an example dataset', RefSpec(reftype='object', target_type='int'), name='data')
        self.assertTupleEqual(ObjectMapper.convert_dtype(spec, None), (None, 'object'))

    def test_convert_higher_precision(self):
        """Test that passing a data type with a precision <= specified returns the higher precision type"""
        spec_type = 'float64'
        value_types = ['float', 'float32', 'double', 'float64']
        self.convert_higher_precision_helper(spec_type, value_types)

        spec_type = 'int64'
        value_types = ['long', 'int64', 'uint64', 'int', 'int32', 'int16', 'int8']
        self.convert_higher_precision_helper(spec_type, value_types)

        spec_type = 'int32'
        value_types = ['int32', 'int16', 'int8']
        self.convert_higher_precision_helper(spec_type, value_types)

        spec_type = 'int16'
        value_types = ['int16', 'int8']
        self.convert_higher_precision_helper(spec_type, value_types)

        spec_type = 'uint32'
        value_types = ['uint32', 'uint16', 'uint8']
        self.convert_higher_precision_helper(spec_type, value_types)

    def convert_higher_precision_helper(self, spec_type, value_types):
        """Check that each value dtype in *value_types* is promoted to *spec_type*."""
        data = 2
        spec = DatasetSpec('an example dataset', spec_type, name='data')
        match = (np.dtype(spec_type).type(data), np.dtype(spec_type))
        for dtype in value_types:
            value = np.dtype(dtype).type(data)
            with self.subTest(dtype=dtype):
                ret = ObjectMapper.convert_dtype(spec, value)
                self.assertTupleEqual(ret, match)
                self.assertEqual(ret[0].dtype, match[1])

    def test_keep_higher_precision(self):
        """Test that passing a data type with a precision >= specified return the given type"""
        spec_type = 'float'
        value_types = ['double', 'float64']
        self.keep_higher_precision_helper(spec_type, value_types)

        spec_type = 'int'
        value_types = ['int64']
        self.keep_higher_precision_helper(spec_type, value_types)

        spec_type = 'int8'
        value_types = ['long', 'int64', 'int', 'int32', 'int16']
        self.keep_higher_precision_helper(spec_type, value_types)

        spec_type = 'uint'
        value_types = ['uint64']
        self.keep_higher_precision_helper(spec_type, value_types)

        spec_type = 'uint8'
        value_types = ['uint64', 'uint32', 'uint', 'uint16']
        self.keep_higher_precision_helper(spec_type, value_types)

    def keep_higher_precision_helper(self, spec_type, value_types):
        """Check that each value dtype in *value_types* is kept as-is (no demotion to *spec_type*)."""
        data = 2
        spec = DatasetSpec('an example dataset', spec_type, name='data')
        for dtype in value_types:
            value = np.dtype(dtype).type(data)
            match = (value, np.dtype(dtype))
            with self.subTest(dtype=dtype):
                ret = ObjectMapper.convert_dtype(spec, value)
                self.assertTupleEqual(ret, match)
                self.assertEqual(ret[0].dtype, match[1])

    def test_no_spec(self):
        """With no spec dtype, the dtype is inferred from the value itself."""
        spec_type = None
        spec = DatasetSpec('an example dataset', spec_type, name='data')

        # list: dtype inferred from the first element
        value = [1, 2, 3]
        ret = ObjectMapper.convert_dtype(spec, value)
        match = (value, int)
        self.assertTupleEqual(ret, match)
        self.assertEqual(type(ret[0][0]), match[1])

        # numpy scalar: its own dtype is kept
        value = np.uint64(4)
        ret = ObjectMapper.convert_dtype(spec, value)
        match = (value, np.uint64)
        self.assertTupleEqual(ret, match)
        self.assertEqual(type(ret[0]), match[1])

        # str maps to 'utf8'
        value = 'hello'
        ret = ObjectMapper.convert_dtype(spec, value)
        match = (value, 'utf8')
        self.assertTupleEqual(ret, match)
        self.assertEqual(type(ret[0]), str)

        # bytes maps to 'ascii'
        value = bytes('hello', encoding='utf-8')
        ret = ObjectMapper.convert_dtype(spec, value)
        match = (value, 'ascii')
        self.assertTupleEqual(ret, match)
        self.assertEqual(type(ret[0]), bytes)

        # DataChunkIterator carries its own dtype
        value = DataChunkIterator(data=[1, 2, 3])
        ret = ObjectMapper.convert_dtype(spec, value)
        match = (value, np.dtype(int))
        self.assertTupleEqual(ret, match)
        self.assertEqual(ret[0].dtype, match[1])

        value = DataChunkIterator(data=[1., 2., 3.])
        ret = ObjectMapper.convert_dtype(spec, value)
        match = (value, np.dtype(float))
        self.assertTupleEqual(ret, match)
        self.assertEqual(ret[0].dtype, match[1])

        # DataIO wrapper: dtype inferred from the wrapped data
        value = H5DataIO(np.arange(30).reshape(5, 2, 3))
        ret = ObjectMapper.convert_dtype(spec, value)
        match = (value, np.dtype(int))
        self.assertTupleEqual(ret, match)
        self.assertEqual(ret[0].dtype, match[1])

        # NOTE(review): original read H5DataIO(['foo' 'bar']) — implicit string
        # concatenation yielding a one-element list; presumed missing comma.
        value = H5DataIO(['foo', 'bar'])
        ret = ObjectMapper.convert_dtype(spec, value)
        match = (value, 'utf8')
        self.assertTupleEqual(ret, match)
        self.assertEqual(type(ret[0].data[0]), str)

    def test_numeric_spec(self):
        """A 'numeric' spec dtype behaves like no dtype: infer from the value."""
        spec_type = 'numeric'
        spec = DatasetSpec('an example dataset', spec_type, name='data')

        value = np.uint64(4)
        ret = ObjectMapper.convert_dtype(spec, value)
        match = (value, np.uint64)
        self.assertTupleEqual(ret, match)
        self.assertEqual(type(ret[0]), match[1])

        value = DataChunkIterator(data=[1, 2, 3])
        ret = ObjectMapper.convert_dtype(spec, value)
        match = (value, np.dtype(int))
        self.assertTupleEqual(ret, match)
        self.assertEqual(ret[0].dtype, match[1])


# Allow running this test module directly, e.g. `python test_io_map.py`.
if __name__ == '__main__':
    unittest.main()
3 changes: 2 additions & 1 deletion tests/unit/test_io_hdf5_h5tools.py
Original file line number Diff line number Diff line change
Expand Up @@ -258,9 +258,10 @@ def test_write_dataset_iterable_multidimensional_array_compression(self):
#############################################
def test_write_dataset_data_chunk_iterator(self):
    """Write a DataChunkIterator-backed dataset; check values and dtype round-trip."""
    dci = DataChunkIterator(data=np.arange(10), buffer_size=2)
    # Pass dtype explicitly so the builder need not infer it from a
    # not-yet-consumed iterator.
    self.io.write_dataset(self.f, DatasetBuilder('test_dataset', dci, attributes={}, dtype=dci.dtype))
    dset = self.f['test_dataset']
    self.assertListEqual(dset[:].tolist(), list(range(10)))
    self.assertEqual(dset[:].dtype, dci.dtype)

def test_write_dataset_data_chunk_iterator_with_compression(self):
dci = DataChunkIterator(data=np.arange(10), buffer_size=2)
Expand Down

0 comments on commit 62d487e

Please sign in to comment.