Skip to content
This repository has been archived by the owner on Jun 21, 2022. It is now read-only.

Commit

Permalink
Merge pull request #131 from scikit-hep/awkward-lazyarrays
Browse files Browse the repository at this point in the history
Awkward lazyarrays
  • Loading branch information
jpivarski authored May 20, 2019
2 parents 1485075 + 1a88f60 commit d20f0d5
Show file tree
Hide file tree
Showing 7 changed files with 69 additions and 25 deletions.
14 changes: 10 additions & 4 deletions awkward/array/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,10 @@

import types
import numbers
try:
from collections.abc import Iterable
except ImportError:
from collections import Iterable

import numpy

Expand Down Expand Up @@ -303,20 +307,20 @@ def _util_toarray(cls, value, defaultdtype, passthrough=None):

@classmethod
def _util_arraystr_draw(cls, x):
    """Render ``x`` as a compact single-line string.

    * tuples are drawn as ``(a, b, ...)`` with comma separators;
    * any other iterable is drawn as ``[a b ...]`` with space separators,
      and when it holds more than six items only the first three and the
      last three are drawn, joined by ``...``;
    * everything else is drawn with ``repr``.

    Text and byte strings are iterable too, but iterating a one-character
    string yields that same string again, so recursing into them would
    never terminate. They are therefore always drawn with ``repr``.
    """
    # type(u"") is ``str`` on Python 3 and ``unicode`` on Python 2, so this
    # pair covers every built-in string type on both.
    stringtypes = (bytes, type(u""))

    if isinstance(x, tuple):
        return "(" + ", ".join(cls._util_arraystr_draw(y) for y in x) + ")"

    if isinstance(x, Iterable) and not isinstance(x, stringtypes):
        draw = cls._util_arraystr_draw
        if len(x) > 6:
            head = " ".join(draw(y) for y in x[:3])
            tail = " ".join(draw(y) for y in x[-3:])
            return "[" + head + " ... " + tail + "]"
        return "[" + " ".join(draw(y) for y in x) + "]"

    return repr(x)

@classmethod
def _util_arraystr(cls, array):
if isinstance(array, cls.numpy.ndarray):
return cls._util_arraystr_draw(array.tolist())
return cls._util_arraystr_draw(array)
elif isinstance(array, AwkwardArray):
return str(array).replace("\n", "")
else:
Expand Down Expand Up @@ -405,6 +409,8 @@ def _concatenate_axis1(cls, arrays):
def _util_isstringslice(cls, where):
if isinstance(where, awkward.util.string):
return True
elif isinstance(where, bytes):
raise TypeError("column selection must be str, not bytes, in Python 3")
elif isinstance(where, tuple):
return False
elif isinstance(where, (cls.numpy.ndarray, AwkwardArray)) and issubclass(where.dtype.type, (numpy.str, numpy.str_)):
Expand Down
49 changes: 42 additions & 7 deletions awkward/array/table.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@

# BSD 3-Clause License; see https://github.com/scikit-hep/awkward-array/blob/master/LICENSE

import numbers
import re
import types
from collections import OrderedDict
Expand All @@ -10,6 +11,8 @@
except ImportError:
from collections import Iterable

import numpy

import awkward.array.base
import awkward.type
import awkward.util
Expand Down Expand Up @@ -38,7 +41,7 @@ def __repr__(self):
elif getattr(self._table, "_showdict", False):
return "<{0} {{{1}}}>".format(self._table._rowname, ", ".join("{0}: {1}".format(repr(n), str(self[n])) for n in self._table._contents))
else:
return "<{0} {1}>".format(self._table._rowname, self._index)
return "<{0} {1}>".format(self._table._rowname, self._index + self._table.rowstart)

def __contains__(self, name):
return name in self._table._contents
Expand Down Expand Up @@ -181,6 +184,7 @@ def __init__(self, columns1={}, *columns2, **columns3):
self._view = None
self._base = None
self.rowname = "Row"
self.rowstart = None
self._contents = OrderedDict()

seen = set()
Expand Down Expand Up @@ -229,6 +233,22 @@ def rowname(self, value):
raise TypeError("rowname must be a string")
self._rowname = value

@property
def rowstart(self):
    """Effective row offset of this table.

    A value stored on this table takes precedence. Otherwise the value is
    taken from the table this one is based on (``_base``), and a table
    with neither reports 0.
    """
    own = self._rowstart
    if own is not None:
        return own
    parent = self._base
    return 0 if parent is None else parent.rowstart

@rowstart.setter
def rowstart(self, value):
    """Store ``value`` as this table's own offset.

    ``None`` clears the stored value so the getter falls back to the base
    table. When ``check_prop_valid`` is true, anything other than ``None``
    or an integer raises ``TypeError``; otherwise the value is stored
    unchecked.
    """
    if self.check_prop_valid:
        is_integer = isinstance(value, (numbers.Integral, numpy.integer))
        if not (value is None or is_integer):
            raise TypeError("rowstart must be None or an integer")
    self._rowstart = value

@classmethod
def fromrec(cls, recarray):
if not isinstance(recarray, cls.numpy.ndarray) or recarray.dtype.names is None:
Expand All @@ -239,10 +259,11 @@ def fromrec(cls, recarray):
return out

@classmethod
def frompairs(cls, pairs, rowstart=None):
    """Build a new instance from ``(name, column)`` pairs.

    ``pairs`` is iterated once, in order, and each column is assigned with
    ``out[name] = column``.

    ``rowstart`` is stored directly as ``_rowstart`` without validation.
    It is optional so that callers written against the earlier
    one-argument form ``frompairs(pairs)`` keep working; leaving it as
    ``None`` leaves the instance with no row offset of its own.
    """
    out = cls()
    for n, x in pairs:
        out[n] = x
    out._rowstart = rowstart
    return out

@classmethod
Expand All @@ -255,12 +276,14 @@ def fromview(cls, view, base):
out = base.copy()
out._view = int(start), int(step), int(length)
out._base = base
out._rowstart = None
return out

elif isinstance(view, cls.numpy.ndarray) and cls._util_isintegertype(view.dtype.type):
out = base.copy()
out._view = view
out._base = base
out._rowstart = None
return out

else:
Expand All @@ -270,6 +293,7 @@ def copy(self, contents=None):
out = self.__class__.__new__(self.__class__)
out._view = self._view
out._base = self._base
out._rowstart = self._rowstart
out._rowname = self._rowname
out._contents = self._contents
if contents is not None and isinstance(contents, dict):
Expand All @@ -285,12 +309,14 @@ def deepcopy(self, contents=None):
out._contents = OrderedDict([(n, self._util_deepcopy(x[out._index()])) for n, x in out._contents.items()])
out._view = None
out._base = None
out._rowstart = None
return out

def empty_like(self, **overrides):
    """Return a fresh, column-less instance of the same class.

    Only the row name is carried over. The result has no view, no base
    table, no row offset of its own and an empty ``OrderedDict`` of
    contents. ``overrides`` is accepted but not consulted here.
    """
    klass = self.__class__
    blank = klass.__new__(klass)  # bypasses __init__
    blank._contents = OrderedDict()
    blank._rowname = self._rowname
    blank._view = blank._base = blank._rowstart = None
    return blank
Expand All @@ -316,7 +342,8 @@ def ones_like(self, **overrides):
def __awkward_persist__(self, ident, fill, prefix, suffix, schemasuffix, storage, compression, **kwargs):
self._valid()
out = {"call": ["awkward", "Table", "frompairs"],
"args": [{"pairs": [[n, fill(x, "Table.contents", prefix, suffix, schemasuffix, storage, compression, **kwargs)] for n, x in self._contents.items()]}]}
"args": [{"pairs": [[n, fill(x, "Table.contents", prefix, suffix, schemasuffix, storage, compression, **kwargs)] for n, x in self._contents.items()]},
{"json": self.rowstart}]}
if isinstance(self._view, tuple):
start, step, length = self._view
out = {"call": ["awkward", "Table", "fromview"],
Expand Down Expand Up @@ -509,10 +536,17 @@ def __iter__(self, checkiter=True):
def __getitem__(self, where):
if self._util_isstringslice(where):
if isinstance(where, awkward.util.string):
try:
return self._contents[where][self._index()]
except KeyError:
raise ValueError("no column named {0}".format(repr(where)))
if self._view is None:
try:
return self._contents[where]
except KeyError:
raise ValueError("no column named {0}".format(repr(where)))
else:
index = self._index()
try:
return self._contents[where][index]
except KeyError:
raise ValueError("no column named {0}".format(repr(where)))
else:
contents = OrderedDict()
for n in where:
Expand Down Expand Up @@ -540,6 +574,7 @@ def __getitem__(self, where):
out = self.copy(contents=self._contents)
out._view = newslice
out._base = self
out._rowstart = None
return out

def __setitem__(self, where, what):
Expand Down
10 changes: 4 additions & 6 deletions awkward/arrow.py
Original file line number Diff line number Diff line change
Expand Up @@ -231,7 +231,7 @@ def popbuffers(tpe, buffers):
pairs = []
for i in range(tpe.num_children - 1, -1, -1):
pairs.insert(0, (tpe[i].name, popbuffers(tpe[i].type, buffers)))
out = awkwardlib.Table.frompairs(pairs)
out = awkwardlib.Table.frompairs(pairs, 0) # FIXME: better rowstart
mask = buffers.pop()
if mask is not None:
mask = awkwardlib.numpy.frombuffer(mask, dtype=ARROW_BITMASKTYPE)
Expand Down Expand Up @@ -446,9 +446,8 @@ def convert(obj, message):
writer.close()

class _ParquetFile(object):
def __init__(self, file, cache=None, metadata=None, common_metadata=None):
def __init__(self, file, metadata=None, common_metadata=None):
self.file = file
self.cache = cache
self.metadata = metadata
self.common_metadata = common_metadata
self._init()
Expand All @@ -463,7 +462,6 @@ def __getstate__(self):

def __setstate__(self, state):
self.file = state["file"]
self.cache = None
self.metadata = state["metadata"]
self.common_metadata = state["common_metadata"]
self._init()
Expand All @@ -477,11 +475,11 @@ def tojson(self):

@classmethod
def fromjson(cls, state):
return cls(state["file"], cache=None, metadata=state["metadata"], common_metadata=state["common_metadata"])
return cls(state["file"], metadata=state["metadata"], common_metadata=state["common_metadata"])

def fromparquet(file, awkwardlib=None, cache=None, persistvirtual=False, metadata=None, common_metadata=None):
awkwardlib = awkward.util.awkwardlib(awkwardlib)
parquetfile = _ParquetFile(file, cache=cache, metadata=metadata, common_metadata=common_metadata)
parquetfile = _ParquetFile(file, metadata=metadata, common_metadata=common_metadata)
columns = parquetfile.type.columns

chunks = []
Expand Down
2 changes: 1 addition & 1 deletion awkward/generate.py
Original file line number Diff line number Diff line change
Expand Up @@ -227,7 +227,7 @@ def append(self, obj, tpe):
return UnionFillable(self, self.awkwardlib).append(obj, tpe)

def finalize(self, **options):
    """Finalize every field and assemble the results into a Table.

    Each field named in ``self.fields`` is finalized through its entry in
    ``self.contents``, with ``options`` forwarded unchanged. Columns are
    emitted in sorted field-name order.

    The pairs are materialized into a list and handed to
    ``Table.frompairs`` together with a row start of 0.
    """
    pairs = [(n, self.contents[n].finalize(**options)) for n in sorted(self.fields)]
    return self.awkwardlib.Table.frompairs(pairs, 0)

class ObjectFillable(Fillable):
__slots__ = ["content", "cls", "awkwardlib"]
Expand Down
3 changes: 3 additions & 0 deletions awkward/persist.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,9 @@
["awkward.persist", "*"],
["awkward.arrow", "_ParquetFile", "fromjson"],
["uproot_methods.classes.*"],
["uproot.tree._LazyFiles"],
["uproot.tree._LazyTree"],
["uproot.tree._LazyBranch"],
]

def frompython(obj):
Expand Down
2 changes: 1 addition & 1 deletion awkward/version.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@

import re

__version__ = "0.9.1"
__version__ = "0.10.0"
version = __version__
version_info = tuple(re.split(r"[-\.]", __version__))

Expand Down
14 changes: 8 additions & 6 deletions tests/test_arrow.py
Original file line number Diff line number Diff line change
Expand Up @@ -414,9 +414,11 @@ def test_arrow_writeparquet2(tmpdir):
assert len(c.chunks) == 1 and len(d.chunks) == 1
assert isinstance(c.chunks[0], awkward.Table) and isinstance(d.chunks[0], awkward.Table)
assert c.chunks[0].columns == d.chunks[0].columns
assert isinstance(c.chunks[0]["x"], awkward.BitMaskedArray) and isinstance(d.chunks[0]["x"], awkward.BitMaskedArray)
assert c.chunks[0]["x"].boolmask().tolist() == d.chunks[0]["x"].boolmask().tolist()
assert isinstance(c.chunks[0]["x"].content, awkward.JaggedArray) and isinstance(d.chunks[0]["x"].content, awkward.JaggedArray)
assert isinstance(c.chunks[0]["x"].content.content, awkward.BitMaskedArray) and isinstance(d.chunks[0]["x"].content.content, awkward.BitMaskedArray)
assert c.chunks[0]["x"].content.content.boolmask().tolist() == d.chunks[0]["x"].content.content.boolmask().tolist()
assert isinstance(c.chunks[0]["x"].content.content.content, numpy.ndarray) and isinstance(d.chunks[0]["x"].content.content.content, numpy.ndarray)
cstuff = c.chunks[0]["x"][:]
dstuff = d.chunks[0]["x"][:]
assert isinstance(cstuff, awkward.BitMaskedArray) and isinstance(dstuff, awkward.BitMaskedArray)
assert cstuff.boolmask().tolist() == dstuff.boolmask().tolist()
assert isinstance(cstuff.content, awkward.JaggedArray) and isinstance(dstuff.content, awkward.JaggedArray)
assert isinstance(cstuff.content.content, awkward.BitMaskedArray) and isinstance(dstuff.content.content, awkward.BitMaskedArray)
assert cstuff.content.content.boolmask().tolist() == dstuff.content.content.boolmask().tolist()
assert isinstance(cstuff.content.content.content, numpy.ndarray) and isinstance(dstuff.content.content.content, numpy.ndarray)

0 comments on commit d20f0d5

Please sign in to comment.