Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Avoid reading first chunk of DataChunkIterator on init if maxshape and dtype are specified #189

Merged
merged 3 commits into from
Oct 28, 2019
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
73 changes: 37 additions & 36 deletions src/hdmf/data_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -136,7 +136,10 @@ class DataChunkIterator(AbstractDataChunkIterator):

@docval(*__docval_init)
def __init__(self, **kwargs):
"""Initialize the DataChunkIterator"""
"""Initialize the DataChunkIterator.
If 'data' is an iterator and 'dtype' is not specified, then next is called on the iterator in order to determine
the dtype of the data.
"""
# Get the user parameters
self.data, self.__maxshape, self.__dtype, self.buffer_size, self.iter_axis = getargs('data',
'maxshape',
Expand Down Expand Up @@ -177,14 +180,10 @@ def __init__(self, **kwargs):
elif isinstance(self.data, list) or isinstance(self.data, tuple):
self.__maxshape = get_data_shape(self.data, strict_no_data_load=True)

# If we have a data iterator, then read the first chunk
if self.__data_iter is not None: # and(self.__maxshape is None or self.__dtype is None):
# If we have a data iterator and do not know the dtype, then read the first chunk
if self.__data_iter is not None and self.__dtype is None:
self._read_next_chunk()

# If we still don't know the shape then try to determine the shape from the first chunk
if self.__maxshape is None and self.__next_chunk.data is not None:
self._set_maxshape_from_next_chunk()

# Determine the type of the data if possible
if self.__next_chunk.data is not None:
self.__dtype = self.__next_chunk.data.dtype
Expand Down Expand Up @@ -218,7 +217,7 @@ def _read_next_chunk(self):
if stop_index > iter_data_bounds:
stop_index = iter_data_bounds

selection = [slice(None)] * len(self.__maxshape)
selection = [slice(None)] * len(self.maxshape)
selection[self.iter_axis] = slice(start_index, stop_index)
selection = tuple(selection)
self.__next_chunk.data = self.data[selection]
Expand Down Expand Up @@ -261,10 +260,7 @@ def _read_next_chunk(self):
self.__next_chunk.data = np.empty(next_chunk_shape, dtype=iter_pieces[0].dtype)
self.__next_chunk.data = np.stack(iter_pieces, axis=self.iter_axis)

if self.__maxshape is None:
self._set_maxshape_from_next_chunk()

selection = [slice(None)] * len(self.__maxshape)
selection = [slice(None)] * len(self.maxshape)
selection[self.iter_axis] = slice(self.__next_chunk_start + curr_chunk_offset,
self.__next_chunk_start + curr_chunk_offset + next_chunk_size)
self.__next_chunk.selection = tuple(selection)
Expand All @@ -280,26 +276,6 @@ def _read_next_chunk(self):
self.chunk_index += 1
return self.__next_chunk

def _set_maxshape_from_next_chunk(self):
    """
    Internal helper function used to determine the maxshape to be used from
    the self.__next_chunk object. The function initializes self.__maxshape.

    The shape reported by get_data_shape for self.__next_chunk.data is used as the starting point. The entry
    for self.iter_axis is then replaced by the extent of self.data along that axis, or by None if looking up
    that extent raises an AttributeError. The final value is stored as a tuple; nothing is returned.
    """
    # Start from the shape of the current chunk; kept as a list so that a single entry can be overwritten below
    data_shape = get_data_shape(self.__next_chunk.data)
    self.__maxshape = list(data_shape)
    try:
        # Size of self.__next_chunk.data along self.iter_axis is not accurate for maxshape because it is just a
        # chunk. So try to set maxshape along the dimension self.iter_axis based on the shape of self.data if
        # possible. Otherwise, use None to represent an unlimited size
        if hasattr(self.data, '__len__') and self.iter_axis == 0:
            # special case of 1-D array
            # (self.data defines __len__ and we iterate along axis 0, so len() is used instead of .shape)
            self.__maxshape[0] = len(self.data)
        else:
            self.__maxshape[self.iter_axis] = self.data.shape[self.iter_axis]
    except AttributeError:  # from self.data.shape
        self.__maxshape[self.iter_axis] = None
    # Store the result as a tuple rather than the temporary list
    self.__maxshape = tuple(self.__maxshape)

def __next__(self):
r"""Return the next data chunk or raise a StopIteration exception if all chunks have been retrieved.

Expand Down Expand Up @@ -347,18 +323,43 @@ def recommended_chunk_shape(self):
def recommended_data_shape(self):
    """Recommend an initial shape of the data.

    This is useful when progressively writing data and we want to recommend an initial size for the dataset.

    :return: The maxshape if it is set and none of its entries is None, otherwise self.__first_chunk_shape.
    """
    # NOTE(review): the same "maxshape fully known" test appears twice below, once on the private attribute
    # self.__maxshape and once on the self.maxshape property. This looks like the before/after versions of a
    # single check shown together -- confirm which one is meant to remain.
    if self.__maxshape is not None:
        if np.all([i is not None for i in self.__maxshape]):
            return self.__maxshape
    if self.maxshape is not None:
        if np.all([i is not None for i in self.maxshape]):
            return self.maxshape
    # At least one dimension is unknown (or there is no maxshape at all): fall back to the first chunk's shape
    return self.__first_chunk_shape

@property
def maxshape(self):
    """
    Get a shape tuple describing the maximum shape of the array described by this DataChunkIterator. If an iterator
    is provided and no data has been read yet, then the first chunk will be read (i.e., next will be called on the
    iterator) in order to determine the maxshape.

    Once determined, the value is stored in self.__maxshape and returned directly on later accesses.

    :return: Shape tuple. None is used for dimensions where the maximum shape is not known or unlimited.
             None is returned instead of a tuple if there is no chunk data to determine the shape from.
    """
    if self.__maxshape is None:
        # If no data has been read from the iterator yet, read the first chunk and use it to determine the maxshape
        if self.__data_iter is not None and self.__next_chunk.data is None:
            self._read_next_chunk()

        # Determine maxshape from self.__next_chunk
        if self.__next_chunk.data is None:
            # Still no chunk data available, so the shape cannot be determined; nothing is cached in this case
            return None
        data_shape = get_data_shape(self.__next_chunk.data)
        # Kept as a list for now so that the entry for self.iter_axis can be overwritten below
        self.__maxshape = list(data_shape)
        try:
            # Size of self.__next_chunk.data along self.iter_axis is not accurate for maxshape because it is just a
            # chunk. So try to set maxshape along the dimension self.iter_axis based on the shape of self.data if
            # possible. Otherwise, use None to represent an unlimited size
            if hasattr(self.data, '__len__') and self.iter_axis == 0:
                # special case of 1-D array
                self.__maxshape[0] = len(self.data)
            else:
                self.__maxshape[self.iter_axis] = self.data.shape[self.iter_axis]
        except AttributeError:  # from self.data.shape
            self.__maxshape[self.iter_axis] = None
        # Store the result as a tuple rather than the temporary list
        self.__maxshape = tuple(self.__maxshape)

    return self.__maxshape

@property
Expand Down