Add RDS support.

Merge branch 'release/0.8'
vnmabus · Jun 7, 2022 · 4faa331 · 4faa331
2 parents cfa7cb0 + 4ee7516
commit 4faa331
Show file tree

Hide file tree

Showing 11 changed files with 138 additions and 14 deletions.
diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml
@@ -30,5 +30,9 @@ jobs:
         pip3 install .
         coverage run --source=rdata/ --omit=rdata/tests/ setup.py test;
         
+    - name: Generate coverage XML
+      run: |
+        coverage xml
+        
     - name: Upload coverage to Codecov
-      uses: codecov/codecov-action@v1
+      uses: codecov/codecov-action@v2
diff --git a/.github/workflows/python-publish.yml b/.github/workflows/python-publish.yml
@@ -0,0 +1,39 @@
+# This workflow will upload a Python Package using Twine when a release is created
+# For more information see: https://help.github.com/en/actions/language-and-framework-guides/using-python-with-github-actions#publishing-to-package-registries
+
+# This workflow uses actions that are not certified by GitHub.
+# They are provided by a third-party and are governed by
+# separate terms of service, privacy policy, and support
+# documentation.
+
+name: Upload Python Package
+
+on:
+  release:
+    types: [published]
+
+permissions:
+  contents: read
+
+jobs:
+  deploy:
+
+    runs-on: ubuntu-latest
+
+    steps:
+    - uses: actions/checkout@v3
+    - name: Set up Python
+      uses: actions/setup-python@v3
+      with:
+        python-version: '3.x'
+    - name: Install dependencies
+      run: |
+        python -m pip install --upgrade pip
+        pip install build
+    - name: Build package
+      run: python -m build
+    - name: Publish package
+      uses: pypa/gh-action-pypi-publish@27b31702a0e7fc50959f5ad993c78deac1bdfc29
+      with:
+        user: __token__
+        password: ${{ secrets.PYPI_API_TOKEN }}
diff --git a/CITATION.cff b/CITATION.cff
@@ -8,11 +8,19 @@ authors:
     email: [email protected]
 title: "rdata: Read R datasets from Python"
 date-released: 2022-03-24
+doi: 10.5281/zenodo.6382237
 url: "https://github.com/vnmabus/rdata"
 license: MIT
 keywords:
   - rdata
   - Python
   - R
   - parser
-  - conversion
+  - conversion
+identifiers:
+  - description: "This is the collection of archived snapshots of all versions of rdata"
+    type: doi
+    value: 10.5281/zenodo.6382237
+  - description: "This is the archived snapshot of version 0.7 of rdata"
+    type: doi
+    value: 10.5281/zenodo.6382238
diff --git a/README.rst b/README.rst
@@ -1,7 +1,7 @@
 rdata
 =====
 
-|build-status| |docs| |coverage| |landscape| |pypi|
+|build-status| |docs| |coverage| |landscape| |pypi| |zenodo|
 
 Read R datasets from Python.
 
@@ -130,4 +130,9 @@ Pandas `Categorical` objects:
 .. |pypi| image:: https://badge.fury.io/py/rdata.svg
     :alt: Pypi version
     :scale: 100%
-    :target: https://pypi.python.org/pypi/rdata/
+    :target: https://pypi.python.org/pypi/rdata/
+
+.. |zenodo| image:: https://zenodo.org/badge/DOI/10.5281/zenodo.6382237.svg
+    :alt: Zenodo DOI
+    :scale: 100%
+    :target: https://doi.org/10.5281/zenodo.6382237
diff --git a/docs/conf.py b/docs/conf.py
@@ -22,7 +22,9 @@
 # sys.path.insert(0, '/home/carlos/git/rdata/rdata')
 
 import sys
+
 import pkg_resources
+
 try:
     release = pkg_resources.get_distribution('rdata').version
 except pkg_resources.DistributionNotFound:
@@ -208,3 +210,6 @@
 
 intersphinx_mapping = {'python': ('https://docs.python.org/3', None),
                        'pandas': ('http://pandas.pydata.org/pandas-docs/dev', None)}
+
+autodoc_preserve_defaults = True
+autodoc_typehints = "description"
diff --git a/rdata/VERSION b/rdata/VERSION
@@ -1 +1 @@
-0.7
+0.8
diff --git a/rdata/parser/_parser.py b/rdata/parser/_parser.py
@@ -12,6 +12,7 @@
 from dataclasses import dataclass
 from types import MappingProxyType
 from typing import (
+    TYPE_CHECKING,
     Any,
     BinaryIO,
     Callable,
@@ -722,19 +723,20 @@ def parse_file(
     *,
     expand_altrep: bool = True,
     altrep_constructor_dict: AltRepConstructorMap = DEFAULT_ALTREP_MAP,
+    extension: str | None = None,
 ) -> RData:
     """
     Parse a R file (.rda or .rdata).
 
     Parameters:
-        file_or_path (file-like, str, bytes or path-like): File
-            in the R serialization format.
-        expand_altrep (bool): Wether to translate ALTREPs to normal objects.
+        file_or_path: File in the R serialization format.
+        expand_altrep: Whether to translate ALTREPs to normal objects.
         altrep_constructor_dict: Dictionary mapping each ALTREP to
             its constructor.
+        extension: Extension of the file; inferred from the path if omitted.
 
     Returns:
-        RData: Data contained in the file (versions and object).
+        Data contained in the file (versions and object).
 
     See Also:
         :func:`parse_data`: Similar function that receives the data directly.
@@ -802,6 +804,8 @@ def parse_file(
     """
     if isinstance(file_or_path, (os.PathLike, str)):
         path = pathlib.Path(file_or_path)
+        if extension is None:
+            extension = path.suffix
         data = path.read_bytes()
     else:
         # file is a pre-opened file
@@ -816,6 +820,7 @@ def parse_file(
         data,
         expand_altrep=expand_altrep,
         altrep_constructor_dict=altrep_constructor_dict,
+        extension=extension,
     )
 
 
@@ -824,18 +829,20 @@ def parse_data(
     *,
     expand_altrep: bool = True,
     altrep_constructor_dict: AltRepConstructorMap = DEFAULT_ALTREP_MAP,
+    extension: str | None = None,
 ) -> RData:
     """
     Parse the data of a R file, received as a sequence of bytes.
 
     Parameters:
-        data (bytes): Data extracted of a R file.
-        expand_altrep (bool): Wether to translate ALTREPs to normal objects.
+        data: Data extracted of a R file.
+        expand_altrep: Whether to translate ALTREPs to normal objects.
         altrep_constructor_dict: Dictionary mapping each ALTREP to
             its constructor.
+        extension: Extension of the file (e.g. ``.rds``).
 
     Returns:
-        RData: Data contained in the file (versions and object).
+        Data contained in the file (versions and object).
 
     See Also:
         :func:`parse_file`: Similar function that parses a file directly.
@@ -911,6 +918,7 @@ def parse_data(
         if filetype in {
             FileTypes.rdata_binary_v2,
             FileTypes.rdata_binary_v3,
+            None,
         } else parse_data
     )
 
@@ -921,22 +929,31 @@ def parse_data(
     elif filetype is FileTypes.xz:
         new_data = lzma.decompress(data)
     elif filetype in {FileTypes.rdata_binary_v2, FileTypes.rdata_binary_v3}:
+        if extension == ".rds":
+            warnings.warn(
+                f"Wrong extension {extension} for file in RDATA format",
+            )
+
         view = view[len(magic_dict[filetype]):]
         new_data = view
     else:
-        raise NotImplementedError("Unknown file type")
+        new_data = view
+        if extension != ".rds":
+            warnings.warn("Unknown file type: assumed RDS")
 
     return parse_function(
         new_data,  # type: ignore
         expand_altrep=expand_altrep,
         altrep_constructor_dict=altrep_constructor_dict,
+        extension=extension,
     )
 
 
 def parse_rdata_binary(
     data: memoryview,
     expand_altrep: bool = True,
     altrep_constructor_dict: AltRepConstructorMap = DEFAULT_ALTREP_MAP,
+    extension: str | None = None,
 ) -> RData:
     """Select the appropiate parser and parse all the info."""
     format_type = rdata_format(data)

diff --git a/rdata/tests/data/test_dataframe.rds b/rdata/tests/data/test_dataframe.rds
diff --git a/rdata/tests/data/test_dataframe_v3.rds b/rdata/tests/data/test_dataframe_v3.rds
diff --git a/rdata/tests/data/test_full_named_matrix.rds b/rdata/tests/data/test_full_named_matrix.rds
diff --git a/rdata/tests/test_rdata.py b/rdata/tests/test_rdata.py
@@ -8,9 +8,10 @@
 
 import numpy as np
 import pandas as pd
-import rdata
 import xarray
 
+import rdata
+
 TESTDATA_PATH = rdata.TESTDATA_PATH
 
 
@@ -161,6 +162,29 @@ def test_full_named_matrix(self) -> None:
             reference,
         )
 
+    def test_full_named_matrix_rds(self) -> None:
+        """Test that a named matrix with dim names can be parsed from RDS."""
+        parsed = rdata.parser.parse_file(
+            TESTDATA_PATH / "test_full_named_matrix.rds",
+        )
+        converted = rdata.conversion.convert(parsed)
+        reference = xarray.DataArray(
+            [
+                [1.0, 2.0, 3.0],
+                [4.0, 5.0, 6.0],
+            ],
+            dims=["my_dim_0", "my_dim_1"],
+            coords={
+                "my_dim_0": ["dim0_0", "dim0_1"],
+                "my_dim_1": ["dim1_0", "dim1_1", "dim1_2"],
+            },
+        )
+
+        xarray.testing.assert_identical(
+            converted,
+            reference,
+        )
+
     def test_list(self) -> None:
         """Test that list can be parsed."""
         parsed = rdata.parser.parse_file(TESTDATA_PATH / "test_list.rda")
@@ -241,6 +265,28 @@ def test_dataframe(self) -> None:
                     ),
                 )
 
+    def test_dataframe_rds(self) -> None:
+        """Test dataframe conversion from RDS files."""
+        for f in ("test_dataframe.rds", "test_dataframe_v3.rds"):
+            with self.subTest(file=f):
+                parsed = rdata.parser.parse_file(
+                    TESTDATA_PATH / f,
+                )
+                converted = rdata.conversion.convert(parsed)
+
+                pd.testing.assert_frame_equal(
+                    converted,
+                    pd.DataFrame(
+                        {
+                            "class": pd.Categorical(
+                                ["a", "b", "b"],
+                            ),
+                            "value": [1, 2, 3],
+                        },
+                        index=pd.RangeIndex(start=1, stop=4),
+                    ),
+                )
+
     def test_dataframe_rownames(self) -> None:
         """Test dataframe conversion."""
         parsed = rdata.parser.parse_file(