-
Notifications
You must be signed in to change notification settings - Fork 2
/
parse.py
executable file
·49 lines (41 loc) · 1.41 KB
/
parse.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
#!/usr/bin/python3
import sys
import json
import itertools
import tempfile
import gzip
import xml.etree.cElementTree as ET
from parseRecord import parseRecord
import logging
# Diagnostics go to stderr through the root logger at INFO level, so they
# never mix with the JSON records this script prints on stdout.
ch = logging.StreamHandler(sys.stderr)
formatter = logging.Formatter('[%(levelname)s] %(asctime)s - %(message)s')
ch.setFormatter(formatter)

logger = logging.getLogger()
logger.setLevel(logging.INFO)
logger.addHandler(ch)
def parseAll(f):
    """Cut a stream of byte lines into XML chunks and print each one as JSON.

    ``f`` is any iterable yielding ``bytes`` lines. Lines that contain
    ``<NewDataSet `` or ``</ICTRP>`` are treated as markers. A run of marker
    lines resets the working buffer to a fixed prefix chosen from the first
    line of that run. A run of ordinary lines is decoded as UTF-8, appended
    to the buffer, parsed with ``ET.fromstring``, handed to ``parseRecord``
    and the result is printed as one JSON document. Ordinary lines seen
    before the first marker are ignored.

    Returns ``None``; the output is written to stdout.

    NOTE(review): ``ET.fromstring`` is applied to prefix + run, so that
    concatenation has to be well-formed XML on its own -- confirm this
    against the layout of the real input files.
    """
    def is_marker(raw):
        return b'<NewDataSet ' in raw or b'</ICTRP>' in raw

    buffer = ''
    for marker_run, lines in itertools.groupby(f, is_marker):
        if marker_run:
            # Only the first line of a marker run decides the new prefix;
            # groupby discards the rest of the run when it advances.
            first = next(lines)
            if b'<NewDataSet' not in first:
                buffer = '</ICTRP>'
            else:
                # The opening tag is replaced so that the msdata/diffgr
                # namespace prefixes are declared for the parser.
                buffer = '<NewDataSet xmlns:msdata="urn:schemas-microsoft-com:xml-msdata" xmlns:diffgr="urn:schemas-microsoft-com:xml-diffgram-v1">'
            continue

        if buffer == '':
            # No marker seen yet: skip the leading lines.
            continue

        buffer += ''.join(raw.decode('utf-8') for raw in lines)
        record = parseRecord(ET.fromstring(buffer))
        print(json.dumps(record))
def main(argv):
    """Parse one gzip-compressed input and print its records as JSON.

    ``argv`` follows the ``sys.argv`` layout:

    * ``[prog, path]`` -- read the gzip file at ``path`` from local disk.
    * ``[prog, '--s3', key]`` -- download ``key`` from the ``ictrp-data``
      S3 bucket into a temporary file and read it from there.

    Raises ``SystemExit`` with a usage message when the required arguments
    are missing, instead of failing with an ``IndexError``.
    """
    prog = argv[0] if argv else 'parse.py'
    missing_path = len(argv) < 2
    missing_key = len(argv) == 2 and argv[1] == '--s3'
    if missing_path or missing_key:
        raise SystemExit(
            'usage: {0} <file.gz>\n       {0} --s3 <key>'.format(prog)
        )

    if argv[1] == '--s3':
        # Imported lazily so local-file runs do not require boto3.
        import boto3
        with tempfile.TemporaryFile() as tmpfile:
            s3 = boto3.client('s3')
            s3.download_fileobj('ictrp-data', argv[2], tmpfile)
            # Rewind: the download leaves the file position at the end.
            tmpfile.seek(0)
            with gzip.GzipFile(filename=argv[2], mode='r', compresslevel=9,
                               fileobj=tmpfile) as f:
                parseAll(f)
    else:
        with gzip.open(argv[1], 'r') as f:
            parseAll(f)


if __name__ == '__main__':
    main(sys.argv)