ENH: make to_json & to_csv transformers have deterministic filenames #862

Merged: 4 commits on May 22, 2018
86 changes: 53 additions & 33 deletions altair/utils/data.py
@@ -1,6 +1,6 @@
 import json
 import random
-import uuid
+import hashlib

 import pandas as pd
 from toolz.curried import curry, pipe # noqa
@@ -80,41 +80,37 @@ def sample(data, n=None, frac=None):


 @curry
-def to_json(data, prefix='altair-data'):
-    """Write the data model to a .json file and return a url based data model."""
-    check_data_type(data)
-    ext = '.json'
-    filename = _compute_filename(prefix=prefix, ext=ext)
-    if isinstance(data, pd.DataFrame):
-        data = sanitize_dataframe(data)
-        data.to_json(filename, orient='records')
-    elif isinstance(data, dict):
-        if 'values' not in data:
-            raise KeyError('values expected in data dict, but not present.')
-        values = data['values']
-        with open(filename) as f:
-            json.dump(values, f)
+def to_json(data, prefix='altair-data', extension='json',
+            filename="{prefix}-{hash}.{extension}"):
+    """
+    Write the data model to a .json file and return a url based data model.
+    """
+    data_json = _data_to_json_string(data)
[Inline review comment from a Collaborator: I tested the logic locally and it works as expected. In particular I made multiple plots of the same dataset (one file) and then mutated the data frame (second file). Very nice!]
+    data_hash = _compute_data_hash(data_json)
+    filename = filename.format(prefix=prefix, hash=data_hash,
+                               extension=extension)
+    with open(filename, 'w') as f:
+        f.write(data_json)
     return {
         'url': filename,
         'format': {'type': 'json'}
     }
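Usage note: a minimal sketch (not part of this diff; the DataFrame contents below are made up) of what the new naming buys you. Identical data serializes to the same JSON string, hashes to the same md5 digest, and therefore lands in the same file, while mutated data gets a fresh one:

import os
import pandas as pd
from altair.utils.data import to_json

df = pd.DataFrame({'x': range(5), 'y': list('abcde')})

spec1 = to_json(df)                    # writes altair-data-<md5>.json, returns a url-based spec
spec2 = to_json(df)                    # same content -> same hash -> same filename
assert spec1['url'] == spec2['url']

df2 = df.copy()
df2.loc[0, 'y'] = 'zzz'                # mutate the data
spec3 = to_json(df2)                   # different content -> different filename
assert spec3['url'] != spec1['url']

for url in {spec1['url'], spec3['url']}:   # clean up the files written by this sketch
    os.remove(url)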


@curry
-def to_csv(data, prefix='altair-data'):
+def to_csv(data, prefix='altair-data', extension='csv',
+           filename="{prefix}-{hash}.{extension}"):
     """Write the data model to a .csv file and return a url based data model."""
-    check_data_type(data)
-    ext = '.csv'
-    filename = _compute_filename(prefix=prefix, ext=ext)
-    if isinstance(data, pd.DataFrame):
-        data = sanitize_dataframe(data)
-        data.to_csv(filename)
-        return {
-            'url': filename,
-            'format': {'type': 'csv'}
-        }
-    elif isinstance(data, dict):
-        raise NotImplementedError('to_csv only works with Pandas DataFrame objects.')
+    data_csv = _data_to_csv_string(data)
+    data_hash = _compute_data_hash(data_csv)
+    filename = filename.format(prefix=prefix, hash=data_hash,
+                               extension=extension)
+    with open(filename, 'w') as f:
+        f.write(data_csv)
+    return {
+        'url': filename,
+        'format': {'type': 'csv'}
+    }
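Usage note: the old to_csv raised NotImplementedError for dict input; with the _data_to_csv_string helper added below, a values-style dict now works too. A minimal sketch with made-up values:

import os
from altair.utils.data import to_csv

data = {'values': [{'a': 1, 'b': 'x'}, {'a': 2, 'b': 'y'}]}

spec = to_csv(data)                    # writes altair-data-<md5>.csv
assert spec['format'] == {'type': 'csv'}
print(spec['url'])                     # e.g. altair-data-<32-hex-digit md5>.csv

os.remove(spec['url'])                 # clean up the example file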


@curry
@@ -140,11 +136,35 @@ def check_data_type(data):
 # Private utilities
 # ==============================================================================

+def _compute_data_hash(data_str):
+    return hashlib.md5(data_str.encode()).hexdigest()

-def _compute_uuid_filename(prefix, ext):
-    return prefix + '-' + str(uuid.uuid4()) + ext

+def _data_to_json_string(data):
+    """Return a JSON string representation of the input data"""
+    check_data_type(data)
+    if isinstance(data, pd.DataFrame):
+        data = sanitize_dataframe(data)
+        return data.to_json(orient='records')
+    elif isinstance(data, dict):
+        if 'values' not in data:
+            raise KeyError('values expected in data dict, but not present.')
+        return json.dumps(data['values'], sort_keys=True)
+    else:
+        raise NotImplementedError("to_json only works with data expressed as "
+                                  "a DataFrame or as a dict")

-def _compute_filename(prefix='altair-data', ext='.csv'):
-    filename = _compute_uuid_filename(prefix, ext)
-    return filename

+def _data_to_csv_string(data):
+    """return a CSV string representation of the input data"""
+    check_data_type(data)
+    if isinstance(data, pd.DataFrame):
+        data = sanitize_dataframe(data)
+        return data.to_csv(index=False)
+    elif isinstance(data, dict):
+        if 'values' not in data:
+            raise KeyError('values expected in data dict, but not present')
+        return pd.DataFrame.from_dict(data['values']).to_csv(index=False)
+    else:
+        raise NotImplementedError("to_csv only works with data expressed as "
+                                  "a DataFrame or as a dict")
77 changes: 76 additions & 1 deletion altair/utils/tests/test_data.py
@@ -1,8 +1,11 @@
+import os
+
 import pytest
 import pandas as pd


-from ..data import limit_rows, MaxRowsError, sample, pipe, to_values
+from ..data import (limit_rows, MaxRowsError, sample, pipe, to_values,
+                    to_json, to_csv)


 def _create_dataframe(N):
@@ -63,3 +66,75 @@ def test_type_error():
     for f in (sample, limit_rows, to_values):
         with pytest.raises(TypeError):
             pipe(0, f)
+
+
+def test_dataframe_to_json():
+    """Test to_json
+    - make certain the filename is deterministic
+    - make certain the file contents match the data
+    """
+    data = _create_dataframe(10)
+    try:
+        result1 = pipe(data, to_json)
+        result2 = pipe(data, to_json)
+        filename = result1['url']
+        output = pd.read_json(filename)
+    finally:
+        os.remove(filename)
+
+    assert result1 == result2
+    assert output.equals(data)
+
+
+def test_dict_to_json():
+    """Test to_json
+    - make certain the filename is deterministic
+    - make certain the file contents match the data
+    """
+    data = _create_data_with_values(10)
+    try:
+        result1 = pipe(data, to_json)
+        result2 = pipe(data, to_json)
+        filename = result1['url']
+        output = pd.read_json(filename).to_dict(orient='records')
+    finally:
+        os.remove(filename)
+
+    assert result1 == result2
+    assert data == {'values': output}
+
+
+def test_dataframe_to_csv():
+    """Test to_csv with dataframe input
+    - make certain the filename is deterministic
+    - make certain the file contents match the data
+    """
+    data = _create_dataframe(10)
+    try:
+        result1 = pipe(data, to_csv)
+        result2 = pipe(data, to_csv)
+        filename = result1['url']
+        output = pd.read_csv(filename)
+    finally:
+        os.remove(filename)
+
+    assert result1 == result2
+    assert output.equals(data)
+
+
+def test_dict_to_csv():
+    """Test to_csv with dict input
+    - make certain the filename is deterministic
+    - make certain the file contents match the data
+    """
+    data = _create_data_with_values(10)
+    try:
+        result1 = pipe(data, to_csv)
+        result2 = pipe(data, to_csv)
+        filename = result1['url']
+        output = pd.read_csv(filename).to_dict(orient='records')
+    finally:
+        os.remove(filename)
+
+    assert result1 == result2
+    assert data == {'values': output}
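Beyond the unit tests, the curried signatures let a caller pre-bind prefix or filename before any data arrives. A hedged sketch of wiring that into Altair's data-transformer registry (register() and enable() are assumed from Altair's plugin-registry API, not shown in this PR):

import altair as alt
from altair.utils.data import to_json

# Partially apply the curried transformer; it waits for the data argument.
json_with_prefix = to_json(prefix='my-chart-data')

# Assumed registry calls: alt.data_transformers.register()/enable().
alt.data_transformers.register('json_hashed', json_with_prefix)
alt.data_transformers.enable('json_hashed')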