Skip to content

Commit

Permalink
Merge pull request #862 from jakevdp/tojson
Browse files Browse the repository at this point in the history
ENH: make to_json & to_csv transformers have deterministic filenames
  • Loading branch information
ellisonbg authored May 22, 2018
2 parents 1e56c41 + c0d324b commit c70990b
Show file tree
Hide file tree
Showing 2 changed files with 129 additions and 34 deletions.
86 changes: 53 additions & 33 deletions altair/utils/data.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
import json
import random
import uuid
import hashlib

import pandas as pd
from toolz.curried import curry, pipe # noqa
Expand Down Expand Up @@ -80,41 +80,37 @@ def sample(data, n=None, frac=None):


@curry
def to_json(data, prefix='altair-data'):
"""Write the data model to a .json file and return a url based data model."""
check_data_type(data)
ext = '.json'
filename = _compute_filename(prefix=prefix, ext=ext)
if isinstance(data, pd.DataFrame):
data = sanitize_dataframe(data)
data.to_json(filename, orient='records')
elif isinstance(data, dict):
if 'values' not in data:
raise KeyError('values expected in data dict, but not present.')
values = data['values']
with open(filename) as f:
json.dump(values, f)
def to_json(data, prefix='altair-data', extension='json',
            filename="{prefix}-{hash}.{extension}"):
    """Write the data model to a .json file and return a url based data model.

    The output filename is deterministic: it embeds an MD5 hash of the
    serialized data, so identical data always maps to the same file.

    Parameters
    ----------
    data : DataFrame or dict
        The data model to serialize.
    prefix : string
        Substituted for ``{prefix}`` in *filename*.
    extension : string
        Substituted for ``{extension}`` in *filename*.
    filename : string
        Template for the output filename; may use ``{prefix}``, ``{hash}``
        and ``{extension}`` placeholders.
    """
    serialized = _data_to_json_string(data)
    out_path = filename.format(prefix=prefix,
                               hash=_compute_data_hash(serialized),
                               extension=extension)
    with open(out_path, 'w') as f:
        f.write(serialized)
    return {'url': out_path, 'format': {'type': 'json'}}


@curry
def to_csv(data, prefix='altair-data', extension='csv',
           filename="{prefix}-{hash}.{extension}"):
    """Write the data model to a .csv file and return a url based data model.

    The output filename is deterministic: it embeds an MD5 hash of the
    serialized data, so identical data always maps to the same file.

    Parameters
    ----------
    data : DataFrame or dict
        The data model to serialize.
    prefix : string
        Substituted for ``{prefix}`` in *filename*.
    extension : string
        Substituted for ``{extension}`` in *filename*.
    filename : string
        Template for the output filename; may use ``{prefix}``, ``{hash}``
        and ``{extension}`` placeholders.
    """
    # Serialize first so the hash (and hence the filename) is derived
    # from the exact bytes written to disk.
    data_csv = _data_to_csv_string(data)
    data_hash = _compute_data_hash(data_csv)
    filename = filename.format(prefix=prefix, hash=data_hash,
                               extension=extension)
    with open(filename, 'w') as f:
        f.write(data_csv)
    return {
        'url': filename,
        'format': {'type': 'csv'}
    }


@curry
Expand All @@ -140,11 +136,35 @@ def check_data_type(data):
# Private utilities
# ==============================================================================

def _compute_data_hash(data_str):
return hashlib.md5(data_str.encode()).hexdigest()

def _compute_uuid_filename(prefix, ext):
return prefix + '-' + str(uuid.uuid4()) + ext

def _data_to_json_string(data):
    """Return a JSON string representation of the input data"""
    check_data_type(data)
    if isinstance(data, pd.DataFrame):
        # Sanitize before serializing so the output is Vega-Lite friendly.
        return sanitize_dataframe(data).to_json(orient='records')
    if isinstance(data, dict):
        if 'values' not in data:
            raise KeyError('values expected in data dict, but not present.')
        # sort_keys makes the serialization (and hence the hash) stable.
        return json.dumps(data['values'], sort_keys=True)
    raise NotImplementedError("to_json only works with data expressed as "
                              "a DataFrame or as a dict")

def _compute_filename(prefix='altair-data', ext='.csv'):
    """Return a fresh, unique filename built from *prefix* and *ext*."""
    return _compute_uuid_filename(prefix, ext)

def _data_to_csv_string(data):
    """return a CSV string representation of the input data"""
    check_data_type(data)
    if isinstance(data, pd.DataFrame):
        # Sanitize before serializing so the output is Vega-Lite friendly.
        return sanitize_dataframe(data).to_csv(index=False)
    if isinstance(data, dict):
        if 'values' not in data:
            raise KeyError('values expected in data dict, but not present')
        return pd.DataFrame.from_dict(data['values']).to_csv(index=False)
    raise NotImplementedError("to_csv only works with data expressed as "
                              "a DataFrame or as a dict")
77 changes: 76 additions & 1 deletion altair/utils/tests/test_data.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,11 @@
import os

import pytest
import pandas as pd


from ..data import limit_rows, MaxRowsError, sample, pipe, to_values
from ..data import (limit_rows, MaxRowsError, sample, pipe, to_values,
to_json, to_csv)


def _create_dataframe(N):
Expand Down Expand Up @@ -63,3 +66,75 @@ def test_type_error():
for f in (sample, limit_rows, to_values):
with pytest.raises(TypeError):
pipe(0, f)


def test_dataframe_to_json():
    """Test to_json with DataFrame input
    - make certain the filename is deterministic
    - make certain the file contents match the data
    """
    data = _create_dataframe(10)
    filename = None
    try:
        result1 = pipe(data, to_json)
        result2 = pipe(data, to_json)
        filename = result1['url']
        output = pd.read_json(filename)
    finally:
        # Only clean up if a file was actually produced; otherwise a
        # failure inside the try-block would be masked by a NameError here.
        if filename is not None:
            os.remove(filename)

    assert result1 == result2
    assert output.equals(data)


def test_dict_to_json():
    """Test to_json with dict input
    - make certain the filename is deterministic
    - make certain the file contents match the data
    """
    data = _create_data_with_values(10)
    filename = None
    try:
        result1 = pipe(data, to_json)
        result2 = pipe(data, to_json)
        filename = result1['url']
        output = pd.read_json(filename).to_dict(orient='records')
    finally:
        # Only clean up if a file was actually produced; otherwise a
        # failure inside the try-block would be masked by a NameError here.
        if filename is not None:
            os.remove(filename)

    assert result1 == result2
    assert data == {'values': output}


def test_dataframe_to_csv():
    """Test to_csv with dataframe input
    - make certain the filename is deterministic
    - make certain the file contents match the data
    """
    data = _create_dataframe(10)
    filename = None
    try:
        result1 = pipe(data, to_csv)
        result2 = pipe(data, to_csv)
        filename = result1['url']
        output = pd.read_csv(filename)
    finally:
        # Only clean up if a file was actually produced; otherwise a
        # failure inside the try-block would be masked by a NameError here.
        if filename is not None:
            os.remove(filename)

    assert result1 == result2
    assert output.equals(data)


def test_dict_to_csv():
    """Test to_csv with dict input
    - make certain the filename is deterministic
    - make certain the file contents match the data
    """
    data = _create_data_with_values(10)
    filename = None
    try:
        result1 = pipe(data, to_csv)
        result2 = pipe(data, to_csv)
        filename = result1['url']
        output = pd.read_csv(filename).to_dict(orient='records')
    finally:
        # Only clean up if a file was actually produced; otherwise a
        # failure inside the try-block would be masked by a NameError here.
        if filename is not None:
            os.remove(filename)

    assert result1 == result2
    assert data == {'values': output}

0 comments on commit c70990b

Please sign in to comment.