Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix incorrect dtype precision upgrade for VectorIndex #631

Merged
merged 2 commits into from
Jun 16, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 7 additions & 2 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,14 +1,19 @@
# HDMF Changelog

## HDMF 2.5.8 (Upcoming)
## HDMF 2.5.8 (June 16, 2021)

### Minor improvements
- Improve Sphinx documentation. @rly (#627)

### Bug fix
- Fix error with representing an indexed table column when the `VectorIndex` dtype precision is upgraded more
than one step, e.g., uint8 to uint32. This can happen when, for example, a single `add_row` call is used to
add more than 65535 elements to an empty indexed column. @rly (#631)

## HDMF 2.5.7 (June 4, 2021)

### Bug fix
- Fix generation of extension classes that extend MultiContainerInterface and use a custom _fieldsname. @rly (#626)
- Fix generation of extension classes that extend `MultiContainerInterface` and use a custom _fieldsname. @rly (#626)

## HDMF 2.5.6 (May 19, 2021)

Expand Down
17 changes: 11 additions & 6 deletions src/hdmf/common/table.py
Original file line number Diff line number Diff line change
Expand Up @@ -107,28 +107,33 @@ def add_vector(self, arg, **kwargs):

def __check_precision(self, idx):
"""
Check precision of current dataset and, if
necessary, adjust precision to accommodate new value.
Check precision of current dataset and, if necessary, adjust precision to accommodate new value.

Returns:
unsigned integer encoding of idx
"""
if idx > self.__maxval:
nbits = (np.log2(self.__maxval + 1) * 2)
while idx > self.__maxval:
nbits = (np.log2(self.__maxval + 1) * 2) # 8->16, 16->32, 32->64
if nbits == 128: # pragma: no cover
msg = ('Cannot store more than 18446744073709551615 elements in a VectorData. Largest dtype '
'allowed for VectorIndex is uint64.')
raise ValueError(msg)
self.__maxval = 2 ** nbits - 1
self.__uint = np.dtype('uint%d' % nbits).type
self.__maxval = 2 ** nbits - 1
self.__adjust_precision(self.__uint)
return self.__uint(idx)

def __adjust_precision(self, uint):
"""
Adjust precision of data to specificied unsigned integer precision
Adjust precision of data to specificied unsigned integer precision.
"""
if isinstance(self.data, list):
for i in range(len(self.data)):
self.data[i] = uint(self.data[i])
elif isinstance(self.data, np.ndarray):
self._VectorIndex__data = self.data.astype(uint)
# use self._Data__data to work around restriction on resetting self.data
self._Data__data = self.data.astype(uint)
else:
raise ValueError("cannot adjust precision of type %s to %s", (type(self.data), uint))

Expand Down
63 changes: 63 additions & 0 deletions tests/unit/common/test_table.py
Original file line number Diff line number Diff line change
Expand Up @@ -1838,3 +1838,66 @@ def test_dtr_references(self):
'y': [read_group1, read_group2]},
index=pd.Index(data=[102, 103], name='id'))
pd.testing.assert_frame_equal(ret, expected)


class TestVectorIndexDtype(TestCase):

def set_up_array_index(self):
data = VectorData(name='data', description='desc')
index = VectorIndex(name='index', data=np.array([]), target=data)
return index

def set_up_list_index(self):
data = VectorData(name='data', description='desc')
index = VectorIndex(name='index', data=[], target=data)
return index

def test_array_inc_precision(self):
index = self.set_up_array_index()
index.add_vector(np.empty((255, )))
self.assertEqual(index.data[0], 255)
self.assertEqual(index.data.dtype, np.uint8)

def test_array_inc_precision_1step(self):
index = self.set_up_array_index()
index.add_vector(np.empty((65535, )))
self.assertEqual(index.data[0], 65535)
self.assertEqual(index.data.dtype, np.uint16)

def test_array_inc_precision_2steps(self):
index = self.set_up_array_index()
index.add_vector(np.empty((65536, )))
self.assertEqual(index.data[0], 65536)
self.assertEqual(index.data.dtype, np.uint32)

def test_array_prev_data_inc_precision_2steps(self):
index = self.set_up_array_index()
index.add_vector(np.empty((255, ))) # dtype is still uint8
index.add_vector(np.empty((65536, )))
self.assertEqual(index.data[0], 255) # make sure the 255 is upgraded
self.assertEqual(index.data.dtype, np.uint32)

def test_list_inc_precision(self):
index = self.set_up_list_index()
index.add_vector(list(range(255)))
self.assertEqual(index.data[0], 255)
self.assertEqual(type(index.data[0]), np.uint8)

def test_list_inc_precision_1step(self):
index = self.set_up_list_index()
index.add_vector(list(range(65535)))
self.assertEqual(index.data[0], 65535)
self.assertEqual(type(index.data[0]), np.uint16)

def test_list_inc_precision_2steps(self):
index = self.set_up_list_index()
index.add_vector(list(range(65536)))
self.assertEqual(index.data[0], 65536)
self.assertEqual(type(index.data[0]), np.uint32)

def test_list_prev_data_inc_precision_2steps(self):
index = self.set_up_list_index()
index.add_vector(list(range(255)))
index.add_vector(list(range(65536 - 255)))
self.assertEqual(index.data[0], 255) # make sure the 255 is upgraded
self.assertEqual(type(index.data[0]), np.uint32)