Skip to content

Commit

Permalink
Merge pull request #862 from jakevdp/tojson
Browse files Browse the repository at this point in the history
ENH: make to_json & to_csv transformers have deterministic filenames
  • Loading branch information
ellisonbg authored May 22, 2018
2 parents 1e56c41 + c0d324b commit c70990b
Show file tree
Hide file tree
Showing 2 changed files with 129 additions and 34 deletions.
86 changes: 53 additions & 33 deletions altair/utils/data.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
import json
import random
import uuid
import hashlib

import pandas as pd
from toolz.curried import curry, pipe # noqa
Expand Down Expand Up @@ -80,41 +80,37 @@ def sample(data, n=None, frac=None):


@curry
def to_json(data, prefix='altair-data'):
"""Write the data model to a .json file and return a url based data model."""
check_data_type(data)
ext = '.json'
filename = _compute_filename(prefix=prefix, ext=ext)
if isinstance(data, pd.DataFrame):
data = sanitize_dataframe(data)
data.to_json(filename, orient='records')
elif isinstance(data, dict):
if 'values' not in data:
raise KeyError('values expected in data dict, but not present.')
values = data['values']
with open(filename) as f:
json.dump(values, f)
def to_json(data, prefix='altair-data', extension='json',
            filename="{prefix}-{hash}.{extension}"):
    """Write the data model to a .json file and return a url based data model.

    The output filename is deterministic: it embeds an MD5 hash of the
    serialized data, so identical data always maps to the same file.

    Parameters
    ----------
    data : DataFrame or dict
        The data model to serialize.
    prefix : string
        Substituted for ``{prefix}`` in *filename*.
    extension : string
        Substituted for ``{extension}`` in *filename*.
    filename : string
        Template for the output filename; may use ``{prefix}``, ``{hash}``
        and ``{extension}`` placeholders.
    """
    serialized = _data_to_json_string(data)
    out_path = filename.format(prefix=prefix,
                               hash=_compute_data_hash(serialized),
                               extension=extension)
    with open(out_path, 'w') as f:
        f.write(serialized)
    return {'url': out_path, 'format': {'type': 'json'}}


@curry
def to_csv(data, prefix='altair-data', extension='csv',
           filename="{prefix}-{hash}.{extension}"):
    """Write the data model to a .csv file and return a url based data model.

    The output filename is deterministic: it embeds an MD5 hash of the
    serialized data, so identical data always maps to the same file.

    Parameters
    ----------
    data : DataFrame or dict
        The data model to serialize.
    prefix : string
        Substituted for ``{prefix}`` in *filename*.
    extension : string
        Substituted for ``{extension}`` in *filename*.
    filename : string
        Template for the output filename; may use ``{prefix}``, ``{hash}``
        and ``{extension}`` placeholders.
    """
    # Serialize first so the hash (and hence the filename) is derived
    # from the exact bytes written to disk.
    data_csv = _data_to_csv_string(data)
    data_hash = _compute_data_hash(data_csv)
    filename = filename.format(prefix=prefix, hash=data_hash,
                               extension=extension)
    with open(filename, 'w') as f:
        f.write(data_csv)
    return {
        'url': filename,
        'format': {'type': 'csv'}
    }


@curry
Expand All @@ -140,11 +136,35 @@ def check_data_type(data):
# Private utilities
# ==============================================================================

def _compute_data_hash(data_str):
return hashlib.md5(data_str.encode()).hexdigest()

def _compute_uuid_filename(prefix, ext):
return prefix + '-' + str(uuid.uuid4()) + ext

def _data_to_json_string(data):
    """Return a JSON string representation of the input data"""
    check_data_type(data)
    if isinstance(data, pd.DataFrame):
        # Sanitize before serializing so the output is Vega-Lite friendly.
        return sanitize_dataframe(data).to_json(orient='records')
    if isinstance(data, dict):
        if 'values' not in data:
            raise KeyError('values expected in data dict, but not present.')
        # sort_keys makes the serialization (and hence the hash) stable.
        return json.dumps(data['values'], sort_keys=True)
    raise NotImplementedError("to_json only works with data expressed as "
                              "a DataFrame or as a dict")

def _compute_filename(prefix='altair-data', ext='.csv'):
    """Return a fresh, unique filename built from *prefix* and *ext*."""
    return _compute_uuid_filename(prefix, ext)

def _data_to_csv_string(data):
    """return a CSV string representation of the input data"""
    check_data_type(data)
    if isinstance(data, pd.DataFrame):
        # Sanitize before serializing so the output is Vega-Lite friendly.
        return sanitize_dataframe(data).to_csv(index=False)
    if isinstance(data, dict):
        if 'values' not in data:
            raise KeyError('values expected in data dict, but not present')
        return pd.DataFrame.from_dict(data['values']).to_csv(index=False)
    raise NotImplementedError("to_csv only works with data expressed as "
                              "a DataFrame or as a dict")
77 changes: 76 additions & 1 deletion altair/utils/tests/test_data.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,11 @@
import os

import pytest
import pandas as pd


from ..data import limit_rows, MaxRowsError, sample, pipe, to_values
from ..data import (limit_rows, MaxRowsError, sample, pipe, to_values,
to_json, to_csv)


def _create_dataframe(N):
Expand Down Expand Up @@ -63,3 +66,75 @@ def test_type_error():
for f in (sample, limit_rows, to_values):
with pytest.raises(TypeError):
pipe(0, f)


def test_dataframe_to_json():
    """Test to_json with DataFrame input
    - make certain the filename is deterministic
    - make certain the file contents match the data
    """
    data = _create_dataframe(10)
    filename = None
    try:
        result1 = pipe(data, to_json)
        result2 = pipe(data, to_json)
        filename = result1['url']
        output = pd.read_json(filename)
    finally:
        # Only clean up if a file was actually produced; otherwise a
        # failure inside the try-block would be masked by a NameError here.
        if filename is not None:
            os.remove(filename)

    assert result1 == result2
    assert output.equals(data)


def test_dict_to_json():
    """Test to_json with dict input
    - make certain the filename is deterministic
    - make certain the file contents match the data
    """
    data = _create_data_with_values(10)
    filename = None
    try:
        result1 = pipe(data, to_json)
        result2 = pipe(data, to_json)
        filename = result1['url']
        output = pd.read_json(filename).to_dict(orient='records')
    finally:
        # Only clean up if a file was actually produced; otherwise a
        # failure inside the try-block would be masked by a NameError here.
        if filename is not None:
            os.remove(filename)

    assert result1 == result2
    assert data == {'values': output}


def test_dataframe_to_csv():
    """Test to_csv with dataframe input
    - make certain the filename is deterministic
    - make certain the file contents match the data
    """
    data = _create_dataframe(10)
    filename = None
    try:
        result1 = pipe(data, to_csv)
        result2 = pipe(data, to_csv)
        filename = result1['url']
        output = pd.read_csv(filename)
    finally:
        # Only clean up if a file was actually produced; otherwise a
        # failure inside the try-block would be masked by a NameError here.
        if filename is not None:
            os.remove(filename)

    assert result1 == result2
    assert output.equals(data)


def test_dict_to_csv():
    """Test to_csv with dict input
    - make certain the filename is deterministic
    - make certain the file contents match the data
    """
    data = _create_data_with_values(10)
    filename = None
    try:
        result1 = pipe(data, to_csv)
        result2 = pipe(data, to_csv)
        filename = result1['url']
        output = pd.read_csv(filename).to_dict(orient='records')
    finally:
        # Only clean up if a file was actually produced; otherwise a
        # failure inside the try-block would be masked by a NameError here.
        if filename is not None:
            os.remove(filename)

    assert result1 == result2
    assert data == {'values': output}

0 comments on commit c70990b

Please sign in to comment.