Avoid reading first chunk of DataChunkIterator on init if maxshape an…

…d dtype are specified (#189) * If iter is given and maxshape or dtype is given, do not read 1st chunk * Add comment and assert statement * Refactor maxshape to set maxshape only when queried
hdmf-dev · Oct 28, 2019 · b247ad6 · b247ad6
1 parent 8ddae22
commit b247ad6
Showing 1 changed file with 37 additions and 36 deletions.
diff --git a/src/hdmf/data_utils.py b/src/hdmf/data_utils.py
@@ -136,7 +136,10 @@ class DataChunkIterator(AbstractDataChunkIterator):
 
     @docval(*__docval_init)
     def __init__(self, **kwargs):
-        """Initialize the DataChunkIterator"""
+        """Initialize the DataChunkIterator.
+        If 'data' is an iterator and 'dtype' is not specified, then next is called on the iterator in order to determine
+        the dtype of the data.
+        """
         # Get the user parameters
         self.data, self.__maxshape, self.__dtype, self.buffer_size, self.iter_axis = getargs('data',
                                                                                              'maxshape',
@@ -177,14 +180,10 @@ def __init__(self, **kwargs):
             elif isinstance(self.data, list) or isinstance(self.data, tuple):
                 self.__maxshape = get_data_shape(self.data, strict_no_data_load=True)
 
-        # If we have a data iterator, then read the first chunk
-        if self.__data_iter is not None:  # and(self.__maxshape is None or self.__dtype is None):
+        # If we have a data iterator and do not know the dtype, then read the first chunk
+        if self.__data_iter is not None and self.__dtype is None:
             self._read_next_chunk()
 
-        # If we still don't know the shape then try to determine the shape from the first chunk
-        if self.__maxshape is None and self.__next_chunk.data is not None:
-            self._set_maxshape_from_next_chunk()
-
         # Determine the type of the data if possible
         if self.__next_chunk.data is not None:
             self.__dtype = self.__next_chunk.data.dtype
@@ -218,7 +217,7 @@ def _read_next_chunk(self):
                 if stop_index > iter_data_bounds:
                     stop_index = iter_data_bounds
 
-                selection = [slice(None)] * len(self.__maxshape)
+                selection = [slice(None)] * len(self.maxshape)
                 selection[self.iter_axis] = slice(start_index, stop_index)
                 selection = tuple(selection)
                 self.__next_chunk.data = self.data[selection]
@@ -261,10 +260,7 @@ def _read_next_chunk(self):
                 self.__next_chunk.data = np.empty(next_chunk_shape, dtype=iter_pieces[0].dtype)
                 self.__next_chunk.data = np.stack(iter_pieces, axis=self.iter_axis)
 
-                if self.__maxshape is None:
-                    self._set_maxshape_from_next_chunk()
-
-                selection = [slice(None)] * len(self.__maxshape)
+                selection = [slice(None)] * len(self.maxshape)
                 selection[self.iter_axis] = slice(self.__next_chunk_start + curr_chunk_offset,
                                                   self.__next_chunk_start + curr_chunk_offset + next_chunk_size)
                 self.__next_chunk.selection = tuple(selection)
@@ -280,26 +276,6 @@ def _read_next_chunk(self):
         self.chunk_index += 1
         return self.__next_chunk
 
-    def _set_maxshape_from_next_chunk(self):
-        """
-        Internal helper function used to determine the maxshape to be used from
-        the self.__next_chunk object. The function initializes self.__maxshape.
-        """
-        data_shape = get_data_shape(self.__next_chunk.data)
-        self.__maxshape = list(data_shape)
-        try:
-            # Size of self.__next_chunk.data along self.iter_axis is not accurate for maxshape because it is just a
-            # chunk. So try to set maxshape along the dimension self.iter_axis based on the shape of self.data if
-            # possible. Otherwise, use None to represent an unlimited size
-            if hasattr(self.data, '__len__') and self.iter_axis == 0:
-                # special case of 1-D array
-                self.__maxshape[0] = len(self.data)
-            else:
-                self.__maxshape[self.iter_axis] = self.data.shape[self.iter_axis]
-        except AttributeError:  # from self.data.shape
-            self.__maxshape[self.iter_axis] = None
-        self.__maxshape = tuple(self.__maxshape)
-
     def __next__(self):
         r"""Return the next data chunk or raise a StopIteration exception if all chunks have been retrieved.
 
@@ -347,18 +323,43 @@ def recommended_chunk_shape(self):
     def recommended_data_shape(self):
         """Recommend an initial shape of the data. This is useful when progressively writing data and
         we want to recommend an initial size for the dataset"""
-        if self.__maxshape is not None:
-            if np.all([i is not None for i in self.__maxshape]):
-                return self.__maxshape
+        if self.maxshape is not None:
+            if np.all([i is not None for i in self.maxshape]):
+                return self.maxshape
         return self.__first_chunk_shape
 
     @property
     def maxshape(self):
         """
-        Get a shape tuple describing the maximum shape of the array described by this DataChunkIterator.
+        Get a shape tuple describing the maximum shape of the array described by this DataChunkIterator. If an iterator
+        is provided and no data has been read yet, then the first chunk will be read (i.e., next will be called on the
+        iterator) in order to determine the maxshape.
 
         :return: Shape tuple. None is used for dimenwions where the maximum shape is not known or unlimited.
         """
+        if self.__maxshape is None:
+            # If no data has been read from the iterator yet, read the first chunk and use it to determine the maxshape
+            if self.__data_iter is not None and self.__next_chunk.data is None:
+                self._read_next_chunk()
+
+            # Determine maxshape from self.__next_chunk
+            if self.__next_chunk.data is None:
+                return None
+            data_shape = get_data_shape(self.__next_chunk.data)
+            self.__maxshape = list(data_shape)
+            try:
+                # Size of self.__next_chunk.data along self.iter_axis is not accurate for maxshape because it is just a
+                # chunk. So try to set maxshape along the dimension self.iter_axis based on the shape of self.data if
+                # possible. Otherwise, use None to represent an unlimited size
+                if hasattr(self.data, '__len__') and self.iter_axis == 0:
+                    # special case of 1-D array
+                    self.__maxshape[0] = len(self.data)
+                else:
+                    self.__maxshape[self.iter_axis] = self.data.shape[self.iter_axis]
+            except AttributeError:  # from self.data.shape
+                self.__maxshape[self.iter_axis] = None
+            self.__maxshape = tuple(self.__maxshape)
+
         return self.__maxshape
 
     @property