Visualize Dataset statistics in metadata panel #1472

Merged: 40 commits, merged Aug 14, 2023

Commits (40)
f86ac6f
initial draft with hooks
ravi-kumar-pilla Jul 31, 2023
0370d56
modify test for formatFileSize
ravi-kumar-pilla Jul 31, 2023
e6e3898
add file size to the api response using fsspec
ravi-kumar-pilla Aug 1, 2023
6812664
update unit test for metadata panel
ravi-kumar-pilla Aug 2, 2023
ccafa98
remove print statements and update stats file
ravi-kumar-pilla Aug 2, 2023
a37459c
update get file size to not consider empty dir
ravi-kumar-pilla Aug 2, 2023
8c6965f
fixing linting and format errors
ravi-kumar-pilla Aug 2, 2023
83bf822
Merge branch 'main' of https://github.com/kedro-org/kedro-viz into fe…
ravi-kumar-pilla Aug 2, 2023
9466ea6
fix format and lint errors
ravi-kumar-pilla Aug 2, 2023
cf90243
fix pytest errors
ravi-kumar-pilla Aug 2, 2023
5a08448
add test cases and add fix for circle ci builds
ravi-kumar-pilla Aug 3, 2023
bb5a342
resolve PR comments
ravi-kumar-pilla Aug 4, 2023
9d66c92
fixing PR comments and add additional support for MemoryDataset
ravi-kumar-pilla Aug 4, 2023
3103894
update stats and modify file_size extraction
ravi-kumar-pilla Aug 4, 2023
bbbeb7d
fix lint and format errors
ravi-kumar-pilla Aug 4, 2023
72b8c74
fix lint errors
ravi-kumar-pilla Aug 4, 2023
36760b4
fix lint errors
ravi-kumar-pilla Aug 4, 2023
dd65977
fix lint errors
ravi-kumar-pilla Aug 4, 2023
86dd8ac
fix lint errors
ravi-kumar-pilla Aug 4, 2023
37f7059
fix lint errors
ravi-kumar-pilla Aug 4, 2023
3c231a0
Merge branch 'main' into feature/viz-size-datasets
tynandebold Aug 7, 2023
c3ff3e1
fix for PR comments
ravi-kumar-pilla Aug 8, 2023
5b1f7e4
add test coverage for transcoded data node
ravi-kumar-pilla Aug 8, 2023
f7a4dc1
address PR comments
ravi-kumar-pilla Aug 8, 2023
d84f01f
fix lint errors
ravi-kumar-pilla Aug 8, 2023
3b55684
modify test cases for hooks and utils
ravi-kumar-pilla Aug 9, 2023
18d9974
add matplotlib in requirements file for e2e tests
ravi-kumar-pilla Aug 9, 2023
beb1e5e
Merge branch 'main' into feature/viz-size-datasets
ravi-kumar-pilla Aug 9, 2023
4f3e77f
add design change for overflow
ravi-kumar-pilla Aug 10, 2023
b332dc2
Merge branch 'feature/viz-size-datasets' of https://github.com/kedro-…
ravi-kumar-pilla Aug 10, 2023
8ddfffe
add design change for overflow
ravi-kumar-pilla Aug 10, 2023
cf21083
remove matplotlib from requirements and fix metadata suggestions
ravi-kumar-pilla Aug 10, 2023
4213cf7
add release notes for visualizing dataset stats
ravi-kumar-pilla Aug 10, 2023
57c139b
add release notes for displaying dataset stats
ravi-kumar-pilla Aug 10, 2023
f917c22
hooks update based on Nok's comments
ravi-kumar-pilla Aug 11, 2023
7b88fc9
fix lint and format checks
ravi-kumar-pilla Aug 11, 2023
2d823da
modify stats based on Nok's comments
ravi-kumar-pilla Aug 11, 2023
381dfa4
fix lint and format
ravi-kumar-pilla Aug 11, 2023
6dd02ff
fixed failing unit test
ravi-kumar-pilla Aug 12, 2023
ac65d0d
update code based on Nok's suggestion
ravi-kumar-pilla Aug 14, 2023
24 changes: 24 additions & 0 deletions cypress/tests/ui/flowchart/flowchart.cy.js
@@ -192,4 +192,28 @@ describe('Flowchart DAG', () => {
.should('exist')
.and('have.text', `Oops, there's nothing to see here`);
});

it('verifies that users can open and see the dataset statistics in the metadata panel for datasets. #TC-51', () => {
const dataNodeText = 'Companies';

// Assert before action
cy.get('[data-label="Dataset statistics:"]').should('not.exist');

// Action
cy.get('.pipeline-node > .pipeline-node__text')
.contains(dataNodeText)
.click({ force: true });

// Assert after action
cy.get('[data-label="Dataset statistics:"]').should('exist');
cy.get('[data-test=stats-value-rows]')
.invoke('text')
.should((rowsValue) => expect(rowsValue).to.be.eq('77,096'));
cy.get('[data-test=stats-value-columns]')
.invoke('text')
.should((colsValue) => expect(parseInt(colsValue)).to.be.eq(5));
cy.get('[data-test=stats-value-file_size]')
.invoke('text')
.should((fileSizeValue) => expect(fileSizeValue).to.be.eq('1.8MB'));
});
});
17 changes: 17 additions & 0 deletions demo-project/stats.json
@@ -0,0 +1,17 @@
{
"companies": { "rows": 77096, "columns": 5 },
"reviews": { "rows": 77096, "columns": 10 },
"shuttles": { "rows": 77096, "columns": 13 },
"ingestion.int_typed_companies": { "rows": 77096, "columns": 5 },
"ingestion.int_typed_shuttles": { "rows": 77096, "columns": 13 },
"ingestion.prm_agg_companies": { "rows": 50098, "columns": 5 },
"ingestion.int_typed_reviews": { "rows": 55790, "columns": 11 },
"prm_spine_table": { "rows": 29768, "columns": 3 },
"prm_shuttle_company_reviews": { "rows": 29768, "columns": 27 },
"feature_engineering.feat_static_features": { "rows": 29768, "columns": 12 },
"feature_engineering.feat_derived_features": { "rows": 29768, "columns": 3 },
"feature_importance_output": { "rows": 15, "columns": 2 },
"model_input_table": { "rows": 29768, "columns": 12 },
"X_train": { "rows": 23814, "columns": 11 },
"X_test": { "rows": 5954, "columns": 11 }
}
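Each entry in stats.json maps a dataset name to its recorded statistics. As a rough sketch (illustrative names, not the actual frontend code), the display values asserted in the Cypress test above can be reproduced from one such entry:

```python
import json

# Read one stats.json-style entry and format it the way the metadata
# panel displays it: comma-grouped row count, plain column count.
stats = json.loads('{"companies": {"rows": 77096, "columns": 5}}')

rows_display = f"{stats['companies']['rows']:,}"   # comma-grouped, as asserted in the e2e test
cols_display = str(stats['companies']['columns'])
print(rows_display, cols_display)  # 77,096 5
```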
1 change: 1 addition & 0 deletions package/features/steps/lower_requirements.txt
@@ -13,3 +13,4 @@ strawberry-graphql==0.192.0
networkx==2.5
orjson==3.9
secure==0.3.0
matplotlib==3.5
2 changes: 2 additions & 0 deletions package/kedro_viz/api/rest/responses.py
@@ -114,6 +114,7 @@ class DataNodeMetadataAPIResponse(BaseAPIResponse):
tracking_data: Optional[Dict]
run_command: Optional[str]
preview: Optional[Dict]
stats: Optional[Dict]

class Config:
schema_extra = {
@@ -130,6 +131,7 @@ class TranscodedDataNodeMetadataAPIReponse(BaseAPIResponse):
original_type: str
transcoded_types: List[str]
run_command: Optional[str]
stats: Optional[Dict]


class ParametersNodeMetadataAPIResponse(BaseAPIResponse):
6 changes: 4 additions & 2 deletions package/kedro_viz/api/rest/router.py
@@ -49,10 +49,12 @@ async def get_single_node_metadata(node_id: str):
return TaskNodeMetadata(node)

if isinstance(node, DataNode):
return DataNodeMetadata(node)
dataset_stats = data_access_manager.get_stats_for_data_node(node)
return DataNodeMetadata(node, dataset_stats)

if isinstance(node, TranscodedDataNode):
return TranscodedDataNodeMetadata(node)
dataset_stats = data_access_manager.get_stats_for_data_node(node)
return TranscodedDataNodeMetadata(node, dataset_stats)

return ParametersNodeMetadata(node)

23 changes: 23 additions & 0 deletions package/kedro_viz/data_access/managers.py
@@ -62,6 +62,7 @@ def __init__(self):
)
self.runs = RunsRepository()
self.tracking_datasets = TrackingDatasetsRepository()
self.dataset_stats = {}

def set_db_session(self, db_session_class: sessionmaker):
"""Set db session on repositories that need it."""
@@ -91,6 +92,28 @@ def add_pipelines(self, pipelines: Dict[str, KedroPipeline]):
# Add the registered pipeline and its components to their repositories
self.add_pipeline(registered_pipeline_id, pipeline)

def add_dataset_stats(self, stats_dict: Dict):
"""Add dataset statistics (eg. rows, columns, file_size) as a dictionary.
This will help in showing the relevant stats in the metadata panel

Args:
stats_dict: A dictionary object loaded from stats.json file in the kedro project
"""

self.dataset_stats = stats_dict

def get_stats_for_data_node(
self, data_node: Union[DataNode, TranscodedDataNode]
) -> Dict:
"""Returns the dataset statistics for the data node if found else returns an
empty dictionary

Args:
The data node for which we need the statistics
"""

return self.dataset_stats.get(data_node.name, {})
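This lookup pattern can be sketched in isolation — returning an empty dict as the default (rather than None) keeps callers free of None checks (`dataset_stats` and `get_stats_for` below are illustrative names):

```python
# dict.get with an empty-dict default means callers can chain .get()
# safely without guarding against None.
dataset_stats = {"companies": {"rows": 77096, "columns": 5}}

def get_stats_for(name: str) -> dict:
    return dataset_stats.get(name, {})

print(get_stats_for("companies").get("rows"))  # 77096
print(get_stats_for("missing").get("rows"))    # None, never a TypeError
```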
Contributor: Should it be None or {}?

Contributor Author: It should be an empty dict. This method is called in router.py, where I was checking for None and initializing the stats to an empty dict. This way, we always get a dict for a data node.

def add_pipeline(self, registered_pipeline_id: str, pipeline: KedroPipeline):
"""Iterate through all the nodes and datasets in a "registered" pipeline
and add them to relevant repositories. Take care of extracting other relevant information
54 changes: 44 additions & 10 deletions package/kedro_viz/integrations/kedro/data_loader.py
@@ -6,6 +6,8 @@
# pylint: disable=missing-function-docstring, no-else-return

import base64
import json
import logging
from pathlib import Path
from typing import Any, Dict, Optional, Tuple

@@ -14,23 +16,25 @@

try:
from kedro_datasets import ( # isort:skip
json,
json as json_dataset,
matplotlib,
plotly,
tracking,
)
except ImportError:
from kedro.extras.datasets import ( # Safe since ImportErrors are suppressed within kedro.
json,
json as json_dataset,
matplotlib,
plotly,
tracking,
)

from kedro.io import DataCatalog
from kedro.io.core import get_filepath_str
from kedro.pipeline import Pipeline
from semver import VersionInfo

logger = logging.getLogger(__name__)
KEDRO_VERSION = VersionInfo.parse(__version__)


@@ -54,11 +58,37 @@ def _bootstrap(project_path: Path):
return


def get_dataset_stats(project_path: Path) -> Dict:
"""Return the stats saved at stats.json as a dictionary if found.
If not, return an empty dictionary

Args:
project_path: the path where the Kedro project is located.
"""
try:
stats_file_path = project_path / "stats.json"

if not stats_file_path.exists():
return {}

with open(stats_file_path, encoding="utf8") as stats_file:
stats = json.load(stats_file)
return stats

except Exception as exc: # pylint: disable=broad-exception-caught
logger.warning(
"Unable to get dataset statistics from project path %s : %s",
project_path,
exc,
)
return {}
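The defensive read above can be exercised standalone — a temp directory stands in for a Kedro project, so a missing or present stats.json both hit the intended branch (the function is restated here so it runs without Kedro):

```python
import json
import tempfile
from pathlib import Path

def get_dataset_stats(project_path: Path) -> dict:
    # Same fallback behaviour as the function above: missing or
    # unreadable file -> empty dict, never an exception.
    stats_file_path = project_path / "stats.json"
    if not stats_file_path.exists():
        return {}
    try:
        with open(stats_file_path, encoding="utf8") as stats_file:
            return json.load(stats_file)
    except Exception:
        return {}

with tempfile.TemporaryDirectory() as tmp:
    project = Path(tmp)
    print(get_dataset_stats(project))  # {} - no stats.json yet
    (project / "stats.json").write_text('{"companies": {"rows": 77096, "columns": 5}}')
    print(get_dataset_stats(project))  # {'companies': {'rows': 77096, 'columns': 5}}
```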


def load_data(
project_path: Path,
env: Optional[str] = None,
extra_params: Optional[Dict[str, Any]] = None,
) -> Tuple[DataCatalog, Dict[str, Pipeline], BaseSessionStore]:
) -> Tuple[DataCatalog, Dict[str, Pipeline], BaseSessionStore, Dict]:
"""Load data from a Kedro project.
Args:
project_path: the path where the Kedro project is located.
@@ -91,7 +121,9 @@
# in case user doesn't have an active session down the line when it's first accessed.
# Useful for users who have `get_current_session` in their `register_pipelines()`.
pipelines_dict = dict(pipelines)
return catalog, pipelines_dict, session_store
stats_dict = get_dataset_stats(project_path)

return catalog, pipelines_dict, session_store, stats_dict
elif KEDRO_VERSION.match(">=0.17.1"):
from kedro.framework.session import KedroSession

@@ -103,8 +135,9 @@
) as session:
context = session.load_context()
session_store = session._store
stats_dict = get_dataset_stats(project_path)

return context.catalog, context.pipelines, session_store
return context.catalog, context.pipelines, session_store, stats_dict
else:
# Since Viz is only compatible with kedro>=0.17.0, this just matches 0.17.0
from kedro.framework.session import KedroSession
@@ -120,8 +153,9 @@
) as session:
context = session.load_context()
session_store = session._store
stats_dict = get_dataset_stats(project_path)

return context.catalog, context.pipelines, session_store
return context.catalog, context.pipelines, session_store, stats_dict


# The dataset type is available as an attribute if and only if the import from kedro
@@ -140,13 +174,13 @@ def matplotlib_writer_load(dataset: matplotlib.MatplotlibWriter) -> str:
matplotlib.MatplotlibWriter._load = matplotlib_writer_load

if hasattr(plotly, "JSONDataSet"):
plotly.JSONDataSet._load = json.JSONDataSet._load
plotly.JSONDataSet._load = json_dataset.JSONDataSet._load

if hasattr(plotly, "PlotlyDataSet"):
plotly.PlotlyDataSet._load = json.JSONDataSet._load
plotly.PlotlyDataSet._load = json_dataset.JSONDataSet._load

if hasattr(tracking, "JSONDataSet"):
tracking.JSONDataSet._load = json.JSONDataSet._load
tracking.JSONDataSet._load = json_dataset.JSONDataSet._load

if hasattr(tracking, "MetricsDataSet"):
tracking.MetricsDataSet._load = json.JSONDataSet._load
tracking.MetricsDataSet._load = json_dataset.JSONDataSet._load
68 changes: 68 additions & 0 deletions package/kedro_viz/integrations/kedro/hooks.py
@@ -0,0 +1,68 @@
# pylint: disable=broad-exception-caught
"""`kedro_viz.integrations.kedro.hooks` defines hooks to add additional
functionalities for a kedro run."""

import json
import logging
from collections import defaultdict
from typing import Any

import pandas as pd
from kedro.framework.hooks import hook_impl

from kedro_viz.integrations.kedro.utils import get_stats_dataset_name, stats_order

logger = logging.getLogger(__name__)


class DatasetStatsHook:
"""Class to collect dataset statistics during a kedro run
and save it to a JSON file. The class currently supports
(pd.DataFrame) dataset instances"""

def __init__(self):
self._stats = defaultdict(dict)

@hook_impl
def after_dataset_loaded(self, dataset_name: str, data: Any):
"""Hook to be invoked after a dataset is loaded from the catalog.
Once the dataset is loaded, extract the required dataset statistics.
The hook currently supports (pd.DataFrame) dataset instances

Args:
dataset_name: name of the dataset that was loaded from the catalog.
data: the actual data that was loaded from the catalog.
"""
try:
stats_dataset_name = get_stats_dataset_name(dataset_name)
if isinstance(data, pd.DataFrame):
self._stats[stats_dataset_name]["rows"] = int(data.shape[0])
self._stats[stats_dataset_name]["columns"] = int(data.shape[1])

except Exception as exc: # pragma: no cover
logger.warning(
"Unable to create statistics for the dataset %s : %s", dataset_name, exc
)

@hook_impl
def after_pipeline_run(self):
"""Hook to be invoked after a pipeline runs.
Once the pipeline run completes, write the dataset
statistics to stats.json file

"""
try:
with open("stats.json", "w", encoding="utf8") as file:
sorted_stats_data = {
dataset_name: stats_order(stats)
Contributor: Is it necessary to order the JSON file?

Contributor Author (Aug 7, 2023): It helps when reading the JSON in the frontend. It is also helpful to have structured data for readability.

Contributor: I found the name stats_order slightly confusing. Maybe format_json or prettify_json?

Contributor: And can it just be a helper method within the Hook class? I don't see why we want to put it in utils.py, since it is very specific to formatting the JSON produced by the hook itself.

Contributor: The same applies to the other functions. Can we not have the utils.py?

for dataset_name, stats in self._stats.items()
}
json.dump(sorted_stats_data, file)

except Exception as exc: # pragma: no cover
logger.warning(
"Unable to write dataset statistics for the pipeline: %s", exc
)


dataset_stats_hook = DatasetStatsHook()
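The hook's bookkeeping can be illustrated without Kedro or pandas — it only relies on the data object exposing a pandas-style `.shape` tuple (the stub class and function below are illustrative, not the actual hook):

```python
from collections import defaultdict

# Stand-in for a pandas DataFrame; only .shape matters for this sketch.
class FrameStub:
    shape = (77096, 5)

stats = defaultdict(dict)

def after_dataset_loaded(dataset_name, data):
    # Mirrors the rows/columns extraction in DatasetStatsHook above.
    stats[dataset_name]["rows"] = int(data.shape[0])
    stats[dataset_name]["columns"] = int(data.shape[1])

after_dataset_loaded("companies", FrameStub())
print(dict(stats))  # {'companies': {'rows': 77096, 'columns': 5}}
```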
37 changes: 37 additions & 0 deletions package/kedro_viz/integrations/kedro/utils.py
@@ -0,0 +1,37 @@
"""`kedro_viz.integrations.kedro.utils` contains utility
functions used in the `kedro_viz.integrations.kedro` package"""

from kedro.pipeline.pipeline import TRANSCODING_SEPARATOR, _strip_transcoding


def stats_order(stats: dict) -> dict:
"""Sort the stats extracted from the datasets using the sort order

Args:
stats: A dictionary of statistics for a dataset

Returns: A sorted dictionary based on the sort_order
"""
# Custom sort order
sort_order = ["rows", "columns", "file_size"]
return {stat: stats.get(stat) for stat in sort_order if stat in stats}


def get_stats_dataset_name(dataset_name: str):
"""Get the dataset name for assigning stat values in the dictionary.
If the dataset name contains transcoded information, strip the transcoding.

Args:
dataset_name: name of the dataset

Returns: Dataset name without any transcoding information
"""

stats_dataset_name = dataset_name

# Strip transcoding
is_transcoded_dataset = TRANSCODING_SEPARATOR in dataset_name
if is_transcoded_dataset:
stats_dataset_name = _strip_transcoding(dataset_name)

return stats_dataset_name
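A quick worked example of `stats_order` (restated here so it runs standalone): keys come out in the fixed display order, and absent keys are simply dropped:

```python
def stats_order(stats: dict) -> dict:
    # Same logic as the helper above: emit stats in a fixed display order.
    sort_order = ["rows", "columns", "file_size"]
    return {stat: stats.get(stat) for stat in sort_order if stat in stats}

print(stats_order({"file_size": "1.8MB", "rows": 77096}))
# {'rows': 77096, 'file_size': '1.8MB'}
```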