-
Notifications
You must be signed in to change notification settings - Fork 1
/
dal_helper.py
171 lines (138 loc) · 5.22 KB
/
dal_helper.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
"""
This file is shared among most of my export scripts and contains various boilerplate stuff.
If you know how to make any of this easier, please let me know!
"""
# Public names re-exported by this helper module.
__all__ = [
    'PathIsh',
    'pathify',
    'Json',
    'Res',
    'the',
]
import argparse
from datetime import datetime
from glob import glob
from pathlib import Path
from typing import Any, Dict, Union, TypeVar, Optional, Iterator
import warnings
# Anything that can be treated as a filesystem path.
PathIsh = Union[str, Path]
def pathify(path: PathIsh) -> Path:
    """
    Helper mainly to support CPath hack
    See https://github.com/karlicoss/HPI/blob/be21606075cbc15018d1f36c2581ab138e4a44cc/tests/misc.py#L29-L32
    Otherwise if we do Path(CPath(...)), it will ruin the decompression hack
    """
    # if it's already a Path (or a subclass like CPath), hand it back untouched
    # so the subclass behavior survives; only wrap plain strings
    return path if isinstance(path, Path) else Path(path)
Json = Dict[str, Any]  # todo Mapping?
T = TypeVar('T')  # generic type variable shared by the helpers below
Res = Union[T, Exception]  # "result or error": a value on success, an Exception object on failure
def make_parser(single_source=False, package: Optional[str] = None) -> argparse.ArgumentParser:
    """
    Build the CLI parser shared by all DAL entry points.

    :param single_source: when True, hide the glob/multi-export options
                          (for exports that are not cumulative/synthetic)
    :param package: top-level package name used in the epilog examples;
                    defaults to the package this module lives in
    :return: configured argparse.ArgumentParser
    """
    # meh..
    # NOTE: __package__ can be None or '' when the module is run as a plain
    # script, in which case .split would crash -- guard with `or ''`
    pkg = (__package__ or '').split('.')[0] if package is None else package
    p = argparse.ArgumentParser(
        'DAL (Data Access/Abstraction Layer)',
        formatter_class=lambda prog: argparse.RawTextHelpFormatter(prog, width=100),  # type: ignore
    )
    source_help = 'Path to exported data'
    if not single_source:
        source_help += ". Can be single file, or a glob, e.g. '/path/to/exports/*.ext'"
    p.add_argument(
        '--source',
        type=str,
        required=True,
        help=source_help,
    )
    # todo link to exports post why multiple exports could be useful
    if not single_source:
        p.add_argument(
            '--no-glob',
            action='store_true',
            help='Treat path in --source literally'
        )
    p.add_argument('-i', '--interactive', action='store_true', help='Start Ipython session to play with data')
    p.epilog = f"""
You can use ={pkg}.dal= (stands for "Data Access/Abstraction Layer") to access your exported data, even offline.
I elaborate on motivation behind it [[https://beepb00p.xyz/exports.html#dal][here]].
- main usecase is to be imported as python module to allow for *programmatic access* to your data.
  You can find some inspiration in [[https://beepb00p.xyz/mypkg.html][=my.=]] package that I'm using as an API to all my personal data.
- to test it against your export, simply run: ~python3 -m {pkg}.dal --source /path/to/export~
- you can also try it interactively: ~python3 -m {pkg}.dal --source /path/to/export --interactive~
"""
    return p
def main(*, DAL, demo=None, single_source=False) -> None:
    """
    single_source: used when exports are not cumulative/synthetic
    (you can find out more about it here: https://beepb00p.xyz/exports.html#types)
    """
    parser = make_parser(single_source=single_source)
    args = parser.parse_args()

    if single_source:
        dal = DAL(args.source)
    else:
        # resolve --source into the list of export files to pass to the DAL
        if '*' in args.source and not args.no_glob:
            sources = glob(args.source)
        else:
            src = Path(args.source)
            # for a directory, take its (sorted) contents; hopefully, makes sense?
            sources = sorted(src.iterdir()) if src.is_dir() else [src]
        dal = DAL(sources)
        # logger.debug('using %s', sources)

    print(dal)
    # TODO autoreload would be nice... https://github.com/ipython/ipython/issues/1144
    # TODO maybe just launch through ipython in the first place?
    if args.interactive:
        import IPython  # type: ignore
        IPython.embed(header="Feel free to mess with 'dal' object in the interactive shell")
    else:
        assert demo is not None, "No 'demo' in 'dal.py'?"
        demo(dal)
# legacy: logger function used to be in this file
from .logging_helper import logger
from typing import Iterable
# todo rename to only, like in more_itertools?
# although it's not exactly the same, i.e. also checks that they are all equal..
# and turning to a set() isn't always an option because it's a hash set
def the(l: Iterable[T]) -> T:
    """
    Return the single distinct value in *l*.

    :raises RuntimeError: if the iterable is empty
    :raises AssertionError: if not all elements compare equal to the first
    """
    it = iter(l)
    try:
        first = next(it)
    except StopIteration as ee:
        # chain the cause ('ee' used to be captured but never used)
        raise RuntimeError('Empty iterator?') from ee
    assert all(e == first for e in it), f'all elements expected to equal {first!r}'
    return first
# Aliases documenting intended timezone-awareness of datetime values;
# not enforced -- both are plain datetime for now.
datetime_naive = datetime  # for now just an alias
datetime_aware = datetime  # for now just an alias
def json_items(p: Path, key: Optional[str]) -> Iterator[Json]:
    """
    Lazily yield the JSON items contained in file *p*.

    If *key* is None, the file is expected to contain a list at the top level;
    otherwise the top-level object's *key* entry is expected to hold the list.
    Tries the faster optional backends first (ijson streams, orjson parses
    eagerly) and falls back to the stdlib json module.
    """
    # todo perhaps add to setup.py as 'optional' or 'faster'?
    try:
        import ijson  # type: ignore[import]
        # todo would be nice to debug output the backend?
    except ImportError:  # was a bare 'except:' -- only a missing module should trigger the fallback
        warnings.warn("recommended to 'pip install ijson' for faster json processing")
    else:
        extractor = 'item' if key is None else f'{key}.item'
        with p.open(mode='rb') as fo:
            yield from ijson.items(fo, extractor, use_float=True)
        return
    try:
        import orjson
    except ImportError:
        warnings.warn("recommended to 'pip install orjson' for faster json processing")
    else:
        j = orjson.loads(p.read_text())
        if key is not None:
            j = j[key]
        yield from j
        return
    # otherwise just fall back onto regular json
    import json
    j = json.loads(p.read_text())
    if key is not None:
        j = j[key]
    yield from j