Skip to content

Commit

Permalink
Visualize Dataset statistics in metadata panel (kedro-org#1472)
Browse files Browse the repository at this point in the history
* initial draft with hooks

* modify test for formatFileSize

* add file size to the api response using fsspec

* update unit test for metadata panel

* remove print statements and update stats file

* update get file size to not consider empty dir

* fixing linting and format errors

* fix format and lint errors

* fix pytest errors

* add test cases and add fix for circle ci builds

* resolve PR comments

* fixing PR comments and add additional support for MemoryDataset

* update stats and modify file_size extraction

* fix lint and format errors

* fix lint errors

* fix lint errors

* fix lint errors

* fix lint errors

* fix lint errors

* fix for PR comments

* add test coverage for transcoded data node

* address PR comments

* fix lint errors

* modify test cases for hooks and utils

* add matplotlib in requirements file for e2e tests

* add design change for overflow

* add design change for overflow

* remove matplotlib from requirements and fix metadata suggestions

* add release notes for visualizing dataset stats

* add release notes for displaying dataset stats

* hooks update based on Nok's comments

* fix lint and format checks

* modify stats based on Nok's comments

* fix lint and format

* fixed failing unit test

* update code based on Nok's suggestion

---------

Co-authored-by: Tynan DeBold <[email protected]>
  • Loading branch information
ravi-kumar-pilla and tynandebold committed Aug 14, 2023
1 parent 6e1d127 commit 3c50980
Show file tree
Hide file tree
Showing 27 changed files with 766 additions and 37 deletions.
5 changes: 5 additions & 0 deletions RELEASE.md
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,11 @@ Please follow the established format:
- Use present tense (e.g. 'Add new feature')
- Include the ID number for the related PR (or PRs) in parentheses
-->
## Major features and improvements

- Add support for displaying dataset statistics in the metadata panel. (#1472)

# Release 6.3.5

## Bug fixes and other changes

Expand Down
24 changes: 24 additions & 0 deletions cypress/tests/ui/flowchart/flowchart.cy.js
Original file line number Diff line number Diff line change
Expand Up @@ -192,4 +192,28 @@ describe('Flowchart DAG', () => {
.should('exist')
.and('have.text', `Oops, there's nothing to see here`);
});

it('verifies that users can open and see the dataset statistics in the metadata panel for datasets. #TC-51', () => {
  const dataNodeText = 'Companies';

  // Assert before action.
  // Fix: the selector was missing its closing quote
  // ('[data-label="Dataset statistics:]'), which is an invalid CSS attribute
  // selector — Cypress throws a SyntaxError on it instead of verifying that
  // the element does not exist yet.
  cy.get('[data-label="Dataset statistics:"]').should('not.exist');

  // Action: click the dataset node in the flowchart to open its metadata panel.
  cy.get('.pipeline-node > .pipeline-node__text')
    .contains(dataNodeText)
    .click({ force: true });

  // Assert after action: the stats section is shown with the expected
  // rows / columns / file size for the demo-project "companies" dataset.
  cy.get('[data-label="Dataset statistics:"]').should('exist');
  cy.get('[data-test=stats-value-rows]')
    .invoke('text')
    .should((rowsValue) => expect(rowsValue).to.be.eq('77,096'));
  cy.get('[data-test=stats-value-columns]')
    .invoke('text')
    .should((colsValue) => expect(parseInt(colsValue)).to.be.eq(5));
  cy.get('[data-test=stats-value-file_size]')
    .invoke('text')
    .should((fileSizeValue) => expect(fileSizeValue).to.be.eq('1.8MB'));
});
});
39 changes: 39 additions & 0 deletions demo-project/stats.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
{
"companies": { "rows": 77096, "columns": 5, "file_size": 1810602 },
"ingestion.int_typed_companies": {
"rows": 77096,
"columns": 5,
"file_size": 550616
},
"reviews": { "rows": 77096, "columns": 10, "file_size": 2937144 },
"ingestion.int_typed_reviews": {
"rows": 55790,
"columns": 11,
"file_size": 1335600
},
"shuttles": { "rows": 77096, "columns": 13, "file_size": 4195290 },
"ingestion.int_typed_shuttles": {
"rows": 77096,
"columns": 13,
"file_size": 1235685
},
"ingestion.prm_agg_companies": { "rows": 50098, "columns": 5 },
"prm_shuttle_company_reviews": {
"rows": 29768,
"columns": 27,
"file_size": 1020356
},
"prm_spine_table": { "rows": 29768, "columns": 3, "file_size": 655994 },
"feature_engineering.feat_derived_features": { "rows": 29768, "columns": 3 },
"feature_importance_output": { "rows": 15, "columns": 2, "file_size": 460 },
"feature_engineering.feat_static_features": { "rows": 29768, "columns": 12 },
"ingestion.prm_spine_table_clone": { "rows": 29768, "columns": 3 },
"reporting.cancellation_policy_breakdown": {
"rows": 21,
"columns": 3,
"file_size": 8744
},
"model_input_table": { "rows": 29768, "columns": 12, "file_size": 787351 },
"X_train": { "rows": 23814, "columns": 11 },
"X_test": { "rows": 5954, "columns": 11 }
}
2 changes: 2 additions & 0 deletions package/kedro_viz/api/rest/responses.py
Original file line number Diff line number Diff line change
Expand Up @@ -114,6 +114,7 @@ class DataNodeMetadataAPIResponse(BaseAPIResponse):
tracking_data: Optional[Dict]
run_command: Optional[str]
preview: Optional[Dict]
stats: Optional[Dict]

class Config:
schema_extra = {
Expand All @@ -130,6 +131,7 @@ class TranscodedDataNodeMetadataAPIReponse(BaseAPIResponse):
original_type: str
transcoded_types: List[str]
run_command: Optional[str]
stats: Optional[Dict]


class ParametersNodeMetadataAPIResponse(BaseAPIResponse):
Expand Down
6 changes: 4 additions & 2 deletions package/kedro_viz/api/rest/router.py
Original file line number Diff line number Diff line change
Expand Up @@ -49,10 +49,12 @@ async def get_single_node_metadata(node_id: str):
return TaskNodeMetadata(node)

if isinstance(node, DataNode):
return DataNodeMetadata(node)
dataset_stats = data_access_manager.get_stats_for_data_node(node)
return DataNodeMetadata(node, dataset_stats)

if isinstance(node, TranscodedDataNode):
return TranscodedDataNodeMetadata(node)
dataset_stats = data_access_manager.get_stats_for_data_node(node)
return TranscodedDataNodeMetadata(node, dataset_stats)

return ParametersNodeMetadata(node)

Expand Down
23 changes: 23 additions & 0 deletions package/kedro_viz/data_access/managers.py
Original file line number Diff line number Diff line change
Expand Up @@ -62,6 +62,7 @@ def __init__(self):
)
self.runs = RunsRepository()
self.tracking_datasets = TrackingDatasetsRepository()
self.dataset_stats = {}

def set_db_session(self, db_session_class: sessionmaker):
"""Set db session on repositories that need it."""
Expand Down Expand Up @@ -91,6 +92,28 @@ def add_pipelines(self, pipelines: Dict[str, KedroPipeline]):
# Add the registered pipeline and its components to their repositories
self.add_pipeline(registered_pipeline_id, pipeline)

def add_dataset_stats(self, stats_dict: Dict):
    """Store the dataset statistics dictionary on the manager.

    The statistics (e.g. rows, columns, file_size) come from the
    stats.json file of a Kedro project and are later surfaced in the
    metadata panel of the viz app.

    Args:
        stats_dict: Dictionary loaded from the project's stats.json file,
            keyed by dataset name.
    """
    self.dataset_stats = stats_dict

def get_stats_for_data_node(
    self, data_node: Union[DataNode, TranscodedDataNode]
) -> Dict:
    """Look up the statistics recorded for a data node.

    Args:
        data_node: The data node whose statistics are requested.

    Returns:
        The stats dictionary stored under the node's name, or an empty
        dictionary when no statistics were recorded for it.
    """
    try:
        return self.dataset_stats[data_node.name]
    except KeyError:
        return {}

def add_pipeline(self, registered_pipeline_id: str, pipeline: KedroPipeline):
"""Iterate through all the nodes and datasets in a "registered" pipeline
and add them to relevant repositories. Take care of extracting other relevant information
Expand Down
54 changes: 44 additions & 10 deletions package/kedro_viz/integrations/kedro/data_loader.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,8 @@
# pylint: disable=missing-function-docstring, no-else-return

import base64
import json
import logging
from pathlib import Path
from typing import Any, Dict, Optional, Tuple

Expand All @@ -14,23 +16,25 @@

try:
from kedro_datasets import ( # isort:skip
json,
json as json_dataset,
matplotlib,
plotly,
tracking,
)
except ImportError:
from kedro.extras.datasets import ( # Safe since ImportErrors are suppressed within kedro.
json,
json as json_dataset,
matplotlib,
plotly,
tracking,
)

from kedro.io import DataCatalog
from kedro.io.core import get_filepath_str
from kedro.pipeline import Pipeline
from semver import VersionInfo

logger = logging.getLogger(__name__)
KEDRO_VERSION = VersionInfo.parse(__version__)


Expand All @@ -54,11 +58,37 @@ def _bootstrap(project_path: Path):
return


def get_dataset_stats(project_path: Path) -> Dict:
    """Load dataset statistics from the project's stats.json file.

    Args:
        project_path: the path where the Kedro project is located.

    Returns:
        The parsed statistics dictionary, or an empty dictionary when the
        file is absent or cannot be read or parsed.
    """
    try:
        stats_path = project_path / "stats.json"
        if not stats_path.exists():
            return {}
        # Stats are optional — any read/parse failure is logged, not raised.
        return json.loads(stats_path.read_text(encoding="utf8"))
    except Exception as exc:  # pylint: disable=broad-exception-caught
        logger.warning(
            "Unable to get dataset statistics from project path %s : %s",
            project_path,
            exc,
        )
        return {}


def load_data(
project_path: Path,
env: Optional[str] = None,
extra_params: Optional[Dict[str, Any]] = None,
) -> Tuple[DataCatalog, Dict[str, Pipeline], BaseSessionStore]:
) -> Tuple[DataCatalog, Dict[str, Pipeline], BaseSessionStore, Dict]:
"""Load data from a Kedro project.
Args:
project_path: the path whether the Kedro project is located.
Expand Down Expand Up @@ -91,7 +121,9 @@ def load_data(
# in case user doesn't have an active session down the line when it's first accessed.
# Useful for users who have `get_current_session` in their `register_pipelines()`.
pipelines_dict = dict(pipelines)
return catalog, pipelines_dict, session_store
stats_dict = get_dataset_stats(project_path)

return catalog, pipelines_dict, session_store, stats_dict
elif KEDRO_VERSION.match(">=0.17.1"):
from kedro.framework.session import KedroSession

Expand All @@ -103,8 +135,9 @@ def load_data(
) as session:
context = session.load_context()
session_store = session._store
stats_dict = get_dataset_stats(project_path)

return context.catalog, context.pipelines, session_store
return context.catalog, context.pipelines, session_store, stats_dict
else:
# Since Viz is only compatible with kedro>=0.17.0, this just matches 0.17.0
from kedro.framework.session import KedroSession
Expand All @@ -120,8 +153,9 @@ def load_data(
) as session:
context = session.load_context()
session_store = session._store
stats_dict = get_dataset_stats(project_path)

return context.catalog, context.pipelines, session_store
return context.catalog, context.pipelines, session_store, stats_dict


# The dataset type is available as an attribute if and only if the import from kedro
Expand All @@ -140,13 +174,13 @@ def matplotlib_writer_load(dataset: matplotlib.MatplotlibWriter) -> str:
matplotlib.MatplotlibWriter._load = matplotlib_writer_load

if hasattr(plotly, "JSONDataSet"):
plotly.JSONDataSet._load = json.JSONDataSet._load
plotly.JSONDataSet._load = json_dataset.JSONDataSet._load

if hasattr(plotly, "PlotlyDataSet"):
plotly.PlotlyDataSet._load = json.JSONDataSet._load
plotly.PlotlyDataSet._load = json_dataset.JSONDataSet._load

if hasattr(tracking, "JSONDataSet"):
tracking.JSONDataSet._load = json.JSONDataSet._load
tracking.JSONDataSet._load = json_dataset.JSONDataSet._load

if hasattr(tracking, "MetricsDataSet"):
tracking.MetricsDataSet._load = json.JSONDataSet._load
tracking.MetricsDataSet._load = json_dataset.JSONDataSet._load
Loading

0 comments on commit 3c50980

Please sign in to comment.