Skip to content

Commit

Permalink
Merge pull request #12 from appier/inherit-from-h5py
Browse files Browse the repository at this point in the history
inherit from h5py (resolved #5, resolved #6, resolved #9, resolved #10)
  • Loading branch information
ianlini authored Feb 24, 2019
2 parents f75c97c + a859371 commit d485961
Show file tree
Hide file tree
Showing 4 changed files with 79 additions and 101 deletions.
67 changes: 31 additions & 36 deletions README.rst
Original file line number Diff line number Diff line change
Expand Up @@ -66,97 +66,92 @@ Read dataset
************
.. code:: python
In [5]: h5f = h5sparse.File("test.h5")
In [6]: h5f = h5sparse.File("test.h5")
In [6]: h5f['sparse/matrix'][1:3]
Out[6]:
In [7]: h5f['sparse/matrix'][1:3]
Out[7]:
<2x3 sparse matrix of type '<class 'numpy.float64'>'
with 1 stored elements in Compressed Sparse Row format>
In [7]: h5f['sparse/matrix'][1:3].toarray()
Out[7]:
In [8]: h5f['sparse/matrix'][1:3].toarray()
Out[8]:
array([[ 0., 0., 1.],
[ 0., 0., 0.]])
In [8]: h5f['sparse']['matrix'][1:3].toarray()
Out[8]:
In [9]: h5f['sparse']['matrix'][1:3].toarray()
Out[9]:
array([[ 0., 0., 1.],
[ 0., 0., 0.]])
In [9]: h5f['sparse']['matrix'][2:].toarray()
Out[9]:
In [10]: h5f['sparse']['matrix'][2:].toarray()
Out[10]:
array([[ 0., 0., 0.],
[ 1., 1., 0.]])
In [10]: h5f['sparse']['matrix'][:2].toarray()
Out[10]:
In [11]: h5f['sparse']['matrix'][:2].toarray()
Out[11]:
array([[ 0., 1., 0.],
[ 0., 0., 1.]])
In [11]: h5f['sparse']['matrix'][-2:].toarray()
Out[11]:
In [12]: h5f['sparse']['matrix'][-2:].toarray()
Out[12]:
array([[ 0., 0., 0.],
[ 1., 1., 0.]])
In [12]: h5f['sparse']['matrix'][:-2].toarray()
Out[12]:
In [13]: h5f['sparse']['matrix'][:-2].toarray()
Out[13]:
array([[ 0., 1., 0.],
[ 0., 0., 1.]])
In [13]: h5f['sparse']['matrix'].value.toarray()
Out[13]:
In [14]: h5f['sparse']['matrix'][()].toarray()
Out[14]:
array([[ 0., 1., 0.],
[ 0., 0., 1.],
[ 0., 0., 0.],
[ 1., 1., 0.]])
In [15]: import h5py
In [16]: h5f = h5py.File("test.h5")
In [16]: h5py_h5f = h5py.File("test.h5")
In [18]: h5sparse.Group(h5f)['sparse/matrix'].value
Out[18]:
In [17]: h5sparse.Group(h5py_h5f.id)['sparse/matrix'][()]
Out[17]:
<4x3 sparse matrix of type '<class 'numpy.float64'>'
with 4 stored elements in Compressed Sparse Row format>
In [19]: h5sparse.Group(h5f['sparse'])['matrix'].value
Out[19]:
In [18]: h5sparse.Group(h5py_h5f['sparse'].id)['matrix'][()]
Out[18]:
<4x3 sparse matrix of type '<class 'numpy.float64'>'
with 4 stored elements in Compressed Sparse Row format>
In [21]: h5sparse.Dataset(h5f['sparse/matrix']).value
Out[21]:
In [19]: h5sparse.Dataset(h5py_h5f['sparse/matrix'])[()]
Out[19]:
<4x3 sparse matrix of type '<class 'numpy.float64'>'
with 4 stored elements in Compressed Sparse Row format>
Append dataset
**************
.. code:: python
In [22]: to_append = ss.csr_matrix([[0, 1, 1],
In [20]: to_append = ss.csr_matrix([[0, 1, 1],
...: [1, 0, 0]],
...: dtype=np.float64)
In [23]: h5f.create_dataset('matrix', data=sparse_matrix, chunks=(100000,),
In [21]: h5f.create_dataset('matrix', data=sparse_matrix, chunks=(100000,),
...: maxshape=(None,))
In [24]: h5f['matrix'].append(to_append)
In [22]: h5f['matrix'].append(to_append)
In [25]: h5f['matrix'].value
Out[25]:
In [23]: h5f['matrix'][()]
Out[23]:
<6x3 sparse matrix of type '<class 'numpy.float64'>'
with 7 stored elements in Compressed Sparse Row format>
In [26]: h5f['matrix'].value.toarray()
Out[26]:
In [24]: h5f['matrix'][()].toarray()
Out[24]:
array([[ 0., 1., 0.],
[ 0., 0., 1.],
[ 0., 0., 0.],
[ 1., 1., 0.],
[ 0., 1., 1.],
[ 1., 0., 0.]])
Version scheme
--------------
We use `semantic versioning <https://www.python.org/dev/peps/pep-0440/#semantic-versioning>`_.
92 changes: 32 additions & 60 deletions h5sparse/h5sparse.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,52 +25,38 @@ def get_format_class(format_str):
return format_class


class Group(object):
class Group(h5py.Group):
"""The HDF5 group that can detect and create sparse matrix.
Parameters
==========
h5py_group: h5py.Group
"""

def __init__(self, h5py_group):
self.h5py_group = h5py_group

def __contains__(self, item):
return item in self.h5py_group

def __getitem__(self, key):
h5py_item = self.h5py_group[key]
h5py_item = super(Group, self).__getitem__(key)
if isinstance(h5py_item, h5py.Group):
if 'h5sparse_format' in h5py_item.attrs:
# detect the sparse matrix
return Dataset(h5py_item)
else:
return Group(h5py_item)
return Group(h5py_item.id)
elif isinstance(h5py_item, h5py.Dataset):
return h5py_item
else:
raise ValueError("Unexpected item type.")

def create_dataset(self, name, shape=None, dtype=None, data=None,
format='csr', indptr_dtype=np.int64, indices_dtype=np.int32,
indptr_dtype=np.int64, indices_dtype=np.int32,
**kwargs):
"""Create 4 datasets in a group to represent the sparse array."""
if data is None:
raise NotImplementedError("Only support create_dataset with "
"existed data.")
elif isinstance(data, Dataset):
group = self.h5py_group.create_group(name)
group.attrs['h5sparse_format'] = data.h5py_group.attrs['h5sparse_format']
group.attrs['h5sparse_shape'] = data.h5py_group.attrs['h5sparse_shape']
"""Create 3 datasets in a group to represent the sparse array."""
if isinstance(data, Dataset):
group = self.create_group(name)
group.attrs['h5sparse_format'] = data.attrs['h5sparse_format']
group.attrs['h5sparse_shape'] = data.attrs['h5sparse_shape']
group.create_dataset('data', data=data.h5py_group['data'],
dtype=dtype, **kwargs)
group.create_dataset('indices', data=data.h5py_group['indices'],
dtype=indices_dtype, **kwargs)
group.create_dataset('indptr', data=data.h5py_group['indptr'],
dtype=indptr_dtype, **kwargs)
elif ss.issparse(data):
group = self.h5py_group.create_group(name)
group = self.create_group(name)
group.attrs['h5sparse_format'] = get_format_str(data)
group.attrs['h5sparse_shape'] = data.shape
group.create_dataset('data', data=data.data, dtype=dtype, **kwargs)
Expand All @@ -80,40 +66,29 @@ def create_dataset(self, name, shape=None, dtype=None, data=None,
dtype=indptr_dtype, **kwargs)
else:
# If `data` is not a sparse array, forward the arguments to h5py
return self.h5py_group.create_dataset(name, data=data, shape=shape,
dtype=dtype, **kwargs)
return super(Group, self).create_dataset(
name, data=data, shape=shape, dtype=dtype, **kwargs)
return Dataset(group)


class File(Group):
class File(h5py.File, Group):
"""The HDF5 file object that can detect and create sparse matrix.
Parameters
==========
*args, **kwargs: the parameters from h5py.File
"""
pass

def __init__(self, *args, **kwargs):
self.h5f = h5py.File(*args, **kwargs)
self.h5py_group = self.h5f

def __enter__(self):
return self

def __exit__(self, exc_type, exc_value, traceback):
self.h5f.__exit__(exc_type, exc_value, traceback)


class Dataset(object):
class Dataset(h5py.Group):
"""The HDF5 sparse matrix dataset.
Parameters
==========
h5py_group: h5py.Dataset
"""

def __init__(self, h5py_group):
super(Dataset, self).__init__(h5py_group.id)
self.h5py_group = h5py_group
self.shape = self.attrs['h5sparse_shape']
self.format_str = self.attrs['h5sparse_format']

def __getitem__(self, key):
if isinstance(key, slice):
Expand All @@ -131,30 +106,27 @@ def __getitem__(self, key):
indices = self.h5py_group['indices'][indptr[0]:indptr[-1]]
indptr -= indptr[0]
shape = (indptr.size - 1,
self.h5py_group.attrs['h5sparse_shape'][1])
self.attrs['h5sparse_shape'][1])
elif isinstance(key, tuple) and key == ():
data = self.h5py_group['data'][()]
indices = self.h5py_group['indices'][()]
indptr = self.h5py_group['indptr'][()]
shape = self.attrs['h5sparse_shape']
else:
raise NotImplementedError("Only support one slice as index.")

format_class = get_format_class(self.h5py_group.attrs['h5sparse_format'])
format_class = get_format_class(self.attrs['h5sparse_format'])
return format_class((data, indices, indptr), shape=shape)

@property
def value(self):
data = self.h5py_group['data'].value
indices = self.h5py_group['indices'].value
indptr = self.h5py_group['indptr'].value
shape = self.h5py_group.attrs['h5sparse_shape']
format_class = get_format_class(self.h5py_group.attrs['h5sparse_format'])
return format_class((data, indices, indptr), shape=shape)
return self[()]

def append(self, sparse_matrix):
shape = self.h5py_group.attrs['h5sparse_shape']
format_str = self.h5py_group.attrs['h5sparse_format']

if format_str != get_format_str(sparse_matrix):
if self.format_str != get_format_str(sparse_matrix):
raise ValueError("Format not the same.")

if format_str == 'csr':
if self.format_str == 'csr':
# data
data = self.h5py_group['data']
orig_data_size = data.shape[0]
Expand All @@ -179,9 +151,9 @@ def append(self, sparse_matrix):
indices[orig_data_size:] = sparse_matrix.indices

# shape
self.h5py_group.attrs['h5sparse_shape'] = (
shape[0] + sparse_matrix.shape[0],
max(shape[1], sparse_matrix.shape[1]))
self.attrs['h5sparse_shape'] = (
self.shape[0] + sparse_matrix.shape[0],
max(self.shape[1], sparse_matrix.shape[1]))
else:
raise NotImplementedError("The append method for format {} is not "
"implemented.".format(format_str))
"implemented.".format(self.format_str))
19 changes: 14 additions & 5 deletions h5sparse/tests.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@ def test_create_and_read_dataset():
assert (h5f['sparse']['matrix'][:2] != sparse_matrix[:2]).size == 0
assert (h5f['sparse']['matrix'][-2:] != sparse_matrix[-2:]).size == 0
assert (h5f['sparse']['matrix'][:-2] != sparse_matrix[:-2]).size == 0
assert (h5f['sparse']['matrix'].value != sparse_matrix).size == 0
assert (h5f['sparse']['matrix'][()] != sparse_matrix).size == 0

os.remove(h5_path)

Expand All @@ -45,7 +45,7 @@ def test_create_dataset_from_dataset():
to_h5f.create_dataset('sparse/matrix', data=from_dset)
assert 'sparse' in to_h5f
assert 'matrix' in to_h5f['sparse']
assert (to_h5f['sparse/matrix'].value != sparse_matrix).size == 0
assert (to_h5f['sparse/matrix'][()] != sparse_matrix).size == 0

os.remove(from_h5_path)
os.remove(to_h5_path)
Expand All @@ -67,7 +67,7 @@ def test_dataset_append():
h5f.create_dataset('matrix', data=sparse_matrix, chunks=(100000,),
maxshape=(None,))
h5f['matrix'].append(to_append)
assert (h5f['matrix'].value != appended_matrix).size == 0
assert (h5f['matrix'][()] != appended_matrix).size == 0

os.remove(h5_path)

Expand All @@ -78,7 +78,7 @@ def test_numpy_array():
with h5sparse.File(h5_path) as h5f:
h5f.create_dataset('matrix', data=matrix)
assert 'matrix' in h5f
np.testing.assert_equal(h5f['matrix'].value, matrix)
np.testing.assert_equal(h5f['matrix'][()], matrix)
os.remove(h5_path)


Expand All @@ -89,5 +89,14 @@ def test_bytestring():
with h5sparse.File(h5_path) as h5f:
h5f.create_dataset('strings', data=data)
assert 'strings' in h5f
assert strings == json.loads(h5f['strings'].value.decode('utf8'))
assert strings == json.loads(h5f['strings'][()].decode('utf8'))
os.remove(h5_path)


def test_create_empty_dataset():
    """Create a dataset from a shape only and check the shape after reopening.

    No ``data`` argument is passed to ``create_dataset``; the file is then
    closed, opened again, and the stored dataset's shape is compared with
    the requested ``(100, 200)``.
    """
    # mkstemp returns an already-open OS-level descriptor together with the
    # path. Close it immediately so the descriptor is not leaked while the
    # file is being used through h5sparse.
    fd, h5_path = mkstemp(suffix=".h5")
    os.close(fd)
    try:
        with h5sparse.File(h5_path) as h5f:
            h5f.create_dataset('empty_data', shape=(100, 200))
        with h5sparse.File(h5_path) as h5f:
            assert h5f['empty_data'].shape == (100, 200)
    finally:
        # Remove the temporary file even when an assertion above fails.
        os.remove(h5_path)
2 changes: 2 additions & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -6,3 +6,5 @@ scipy
pylint
tox
flake8
nose
coverage

0 comments on commit d485961

Please sign in to comment.