Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Visualize Dataset statistics in metadata panel #1472

Merged
merged 40 commits into from
Aug 14, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
40 commits
Select commit Hold shift + click to select a range
f86ac6f
initial draft with hooks
ravi-kumar-pilla Jul 31, 2023
0370d56
modify test for formatFileSize
ravi-kumar-pilla Jul 31, 2023
e6e3898
add file size to the api response using fsspec
ravi-kumar-pilla Aug 1, 2023
6812664
update unit test for metadata panel
ravi-kumar-pilla Aug 2, 2023
ccafa98
remove print statements and update stats file
ravi-kumar-pilla Aug 2, 2023
a37459c
update get file size to not consider empty dir
ravi-kumar-pilla Aug 2, 2023
8c6965f
fixing linting and format errors
ravi-kumar-pilla Aug 2, 2023
83bf822
Merge branch 'main' of https://github.com/kedro-org/kedro-viz into fe…
ravi-kumar-pilla Aug 2, 2023
9466ea6
fix format and lint errors
ravi-kumar-pilla Aug 2, 2023
cf90243
fix pytest errors
ravi-kumar-pilla Aug 2, 2023
5a08448
add test cases and add fix for circle ci builds
ravi-kumar-pilla Aug 3, 2023
bb5a342
resolve PR comments
ravi-kumar-pilla Aug 4, 2023
9d66c92
fixing PR comments and add additional support for MemoryDataset
ravi-kumar-pilla Aug 4, 2023
3103894
update stats and modify file_size extraction
ravi-kumar-pilla Aug 4, 2023
bbbeb7d
fix lint and format errors
ravi-kumar-pilla Aug 4, 2023
72b8c74
fix lint errors
ravi-kumar-pilla Aug 4, 2023
36760b4
fix lint errors
ravi-kumar-pilla Aug 4, 2023
dd65977
fix lint errors
ravi-kumar-pilla Aug 4, 2023
86dd8ac
fix lint errors
ravi-kumar-pilla Aug 4, 2023
37f7059
fix lint errors
ravi-kumar-pilla Aug 4, 2023
3c231a0
Merge branch 'main' into feature/viz-size-datasets
tynandebold Aug 7, 2023
c3ff3e1
fix for PR comments
ravi-kumar-pilla Aug 8, 2023
5b1f7e4
add test coverage for transcoded data node
ravi-kumar-pilla Aug 8, 2023
f7a4dc1
address PR comments
ravi-kumar-pilla Aug 8, 2023
d84f01f
fix lint errors
ravi-kumar-pilla Aug 8, 2023
3b55684
modify test cases for hooks and utils
ravi-kumar-pilla Aug 9, 2023
18d9974
add matplotlib in requirements file for e2e tests
ravi-kumar-pilla Aug 9, 2023
beb1e5e
Merge branch 'main' into feature/viz-size-datasets
ravi-kumar-pilla Aug 9, 2023
4f3e77f
add design change for overflow
ravi-kumar-pilla Aug 10, 2023
b332dc2
Merge branch 'feature/viz-size-datasets' of https://github.com/kedro-…
ravi-kumar-pilla Aug 10, 2023
8ddfffe
add design change for overflow
ravi-kumar-pilla Aug 10, 2023
cf21083
remove matplotlib from requirements and fix metadata suggestions
ravi-kumar-pilla Aug 10, 2023
4213cf7
add release notes for visualizing dataset stats
ravi-kumar-pilla Aug 10, 2023
57c139b
add release notes for displaying dataset stats
ravi-kumar-pilla Aug 10, 2023
f917c22
hooks update based on Nok's comments
ravi-kumar-pilla Aug 11, 2023
7b88fc9
fix lint and format checks
ravi-kumar-pilla Aug 11, 2023
2d823da
modify stats based on Nok's comments
ravi-kumar-pilla Aug 11, 2023
381dfa4
fix lint and format
ravi-kumar-pilla Aug 11, 2023
6dd02ff
fixed failing unit test
ravi-kumar-pilla Aug 12, 2023
ac65d0d
update code based on Nok's suggestion
ravi-kumar-pilla Aug 14, 2023
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions RELEASE.md
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,11 @@ Please follow the established format:
- Use present tense (e.g. 'Add new feature')
- Include the ID number for the related PR (or PRs) in parentheses
-->
## Major features and improvements

- Add support for displaying dataset statistics in the metadata panel. (#1472)

# Release 6.3.5

## Bug fixes and other changes

Expand Down
24 changes: 24 additions & 0 deletions cypress/tests/ui/flowchart/flowchart.cy.js
Original file line number Diff line number Diff line change
Expand Up @@ -192,4 +192,28 @@ describe('Flowchart DAG', () => {
.should('exist')
.and('have.text', `Oops, there's nothing to see here`);
});

it('verifies that users can open and see the dataset statistics in the metadata panel for datasets. #TC-51', () => {
  // Label of the dataset node that is clicked to open the metadata panel.
  const dataNodeText = 'Companies';

  // Assert before action: the statistics section must not be rendered until
  // a dataset node has been selected. The attribute value needs its closing
  // double quote, otherwise the selector is not a valid CSS attribute selector.
  cy.get('[data-label="Dataset statistics:"]').should('not.exist');

  // Action: select the dataset node in the flowchart.
  cy.get('.pipeline-node > .pipeline-node__text')
    .contains(dataNodeText)
    .click({ force: true });

  // Assert after action: the statistics section is shown with the expected
  // rows, columns and file size values for the selected dataset.
  cy.get('[data-label="Dataset statistics:"]').should('exist');
  cy.get('[data-test=stats-value-rows]')
    .invoke('text')
    .should((rowsValue) => expect(rowsValue).to.be.eq('77,096'));
  cy.get('[data-test=stats-value-columns]')
    .invoke('text')
    // Explicit radix so the text is always parsed as a decimal number.
    .should((colsValue) => expect(parseInt(colsValue, 10)).to.be.eq(5));
  cy.get('[data-test=stats-value-file_size]')
    .invoke('text')
    .should((fileSizeValue) => expect(fileSizeValue).to.be.eq('1.8MB'));
});
});
39 changes: 39 additions & 0 deletions demo-project/stats.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
{
"companies": { "rows": 77096, "columns": 5, "file_size": 1810602 },
"ingestion.int_typed_companies": {
"rows": 77096,
"columns": 5,
"file_size": 550616
},
"reviews": { "rows": 77096, "columns": 10, "file_size": 2937144 },
"ingestion.int_typed_reviews": {
"rows": 55790,
"columns": 11,
"file_size": 1335600
},
"shuttles": { "rows": 77096, "columns": 13, "file_size": 4195290 },
"ingestion.int_typed_shuttles": {
"rows": 77096,
"columns": 13,
"file_size": 1235685
},
"ingestion.prm_agg_companies": { "rows": 50098, "columns": 5 },
"prm_shuttle_company_reviews": {
"rows": 29768,
"columns": 27,
"file_size": 1020356
},
"prm_spine_table": { "rows": 29768, "columns": 3, "file_size": 655994 },
"feature_engineering.feat_derived_features": { "rows": 29768, "columns": 3 },
"feature_importance_output": { "rows": 15, "columns": 2, "file_size": 460 },
"feature_engineering.feat_static_features": { "rows": 29768, "columns": 12 },
"ingestion.prm_spine_table_clone": { "rows": 29768, "columns": 3 },
"reporting.cancellation_policy_breakdown": {
"rows": 21,
"columns": 3,
"file_size": 8744
},
"model_input_table": { "rows": 29768, "columns": 12, "file_size": 787351 },
"X_train": { "rows": 23814, "columns": 11 },
"X_test": { "rows": 5954, "columns": 11 }
}
2 changes: 2 additions & 0 deletions package/kedro_viz/api/rest/responses.py
Original file line number Diff line number Diff line change
Expand Up @@ -114,6 +114,7 @@ class DataNodeMetadataAPIResponse(BaseAPIResponse):
tracking_data: Optional[Dict]
run_command: Optional[str]
preview: Optional[Dict]
stats: Optional[Dict]

class Config:
schema_extra = {
Expand All @@ -130,6 +131,7 @@ class TranscodedDataNodeMetadataAPIReponse(BaseAPIResponse):
original_type: str
transcoded_types: List[str]
run_command: Optional[str]
stats: Optional[Dict]


class ParametersNodeMetadataAPIResponse(BaseAPIResponse):
Expand Down
6 changes: 4 additions & 2 deletions package/kedro_viz/api/rest/router.py
Original file line number Diff line number Diff line change
Expand Up @@ -49,10 +49,12 @@ async def get_single_node_metadata(node_id: str):
return TaskNodeMetadata(node)

if isinstance(node, DataNode):
return DataNodeMetadata(node)
dataset_stats = data_access_manager.get_stats_for_data_node(node)
return DataNodeMetadata(node, dataset_stats)

if isinstance(node, TranscodedDataNode):
return TranscodedDataNodeMetadata(node)
dataset_stats = data_access_manager.get_stats_for_data_node(node)
return TranscodedDataNodeMetadata(node, dataset_stats)

return ParametersNodeMetadata(node)

Expand Down
23 changes: 23 additions & 0 deletions package/kedro_viz/data_access/managers.py
Original file line number Diff line number Diff line change
Expand Up @@ -62,6 +62,7 @@ def __init__(self):
)
self.runs = RunsRepository()
self.tracking_datasets = TrackingDatasetsRepository()
self.dataset_stats = {}

def set_db_session(self, db_session_class: sessionmaker):
"""Set db session on repositories that need it."""
Expand Down Expand Up @@ -91,6 +92,28 @@ def add_pipelines(self, pipelines: Dict[str, KedroPipeline]):
# Add the registered pipeline and its components to their repositories
self.add_pipeline(registered_pipeline_id, pipeline)

def add_dataset_stats(self, stats_dict: Dict):
    """Store the dataset statistics so they can be shown in the metadata panel.

    The given dictionary replaces whatever statistics were stored before.

    Args:
        stats_dict: A dictionary of dataset statistics (eg. rows, columns,
            file_size) loaded from the stats.json file in the kedro project.
    """
    self.dataset_stats = stats_dict

def get_stats_for_data_node(
    self, data_node: Union[DataNode, TranscodedDataNode]
) -> Dict:
    """Look up the stored dataset statistics for a data node by its name.

    Args:
        data_node: The data node for which we need the statistics.

    Returns:
        The statistics stored under the node's name, or an empty dictionary
        when no entry exists for that name.
    """
    node_name = data_node.name
    return self.dataset_stats.get(node_name, {})
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Should it be None or {}?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It should be an empty dict. This method is called in router.py, where I was previously checking for None and initializing the stats to an empty dict. This way, we always get a dict for a data node.


def add_pipeline(self, registered_pipeline_id: str, pipeline: KedroPipeline):
"""Iterate through all the nodes and datasets in a "registered" pipeline
and add them to relevant repositories. Take care of extracting other relevant information
Expand Down
54 changes: 44 additions & 10 deletions package/kedro_viz/integrations/kedro/data_loader.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,8 @@
# pylint: disable=missing-function-docstring, no-else-return

import base64
import json
import logging
from pathlib import Path
from typing import Any, Dict, Optional, Tuple

Expand All @@ -14,23 +16,25 @@

try:
from kedro_datasets import ( # isort:skip
json,
json as json_dataset,
matplotlib,
plotly,
tracking,
)
except ImportError:
from kedro.extras.datasets import ( # Safe since ImportErrors are suppressed within kedro.
json,
json as json_dataset,
matplotlib,
plotly,
tracking,
)

from kedro.io import DataCatalog
from kedro.io.core import get_filepath_str
from kedro.pipeline import Pipeline
from semver import VersionInfo

logger = logging.getLogger(__name__)
KEDRO_VERSION = VersionInfo.parse(__version__)


Expand All @@ -54,11 +58,37 @@ def _bootstrap(project_path: Path):
return


def get_dataset_stats(project_path: Path) -> Dict:
    """Return the stats saved at stats.json as a dictionary if found.
    If the file is missing, cannot be read or parsed, or does not hold a
    JSON object at the top level, return an empty dictionary.

    Args:
        project_path: the path where the Kedro project is located.

    Returns:
        The parsed content of ``<project_path>/stats.json`` when it is a
        dictionary, otherwise an empty dictionary.
    """
    try:
        stats_file_path = project_path / "stats.json"

        if not stats_file_path.exists():
            return {}

        with open(stats_file_path, encoding="utf8") as stats_file:
            stats = json.load(stats_file)

        # json.load happily returns lists, numbers, strings or None for valid
        # JSON; only a top-level object honours the Dict contract of this
        # function, so anything else is ignored instead of being passed on.
        if not isinstance(stats, dict):
            logger.warning(
                "Unable to get dataset statistics from project path %s : "
                "expected a JSON object in stats.json but found %s",
                project_path,
                type(stats).__name__,
            )
            return {}

        return stats

    # Statistics are optional, so any failure is logged and swallowed on purpose.
    except Exception as exc:  # pylint: disable=broad-exception-caught
        logger.warning(
            "Unable to get dataset statistics from project path %s : %s",
            project_path,
            exc,
        )
        return {}


def load_data(
project_path: Path,
env: Optional[str] = None,
extra_params: Optional[Dict[str, Any]] = None,
) -> Tuple[DataCatalog, Dict[str, Pipeline], BaseSessionStore]:
) -> Tuple[DataCatalog, Dict[str, Pipeline], BaseSessionStore, Dict]:
"""Load data from a Kedro project.
Args:
project_path: the path where the Kedro project is located.
Expand Down Expand Up @@ -91,7 +121,9 @@ def load_data(
# in case user doesn't have an active session down the line when it's first accessed.
# Useful for users who have `get_current_session` in their `register_pipelines()`.
pipelines_dict = dict(pipelines)
return catalog, pipelines_dict, session_store
stats_dict = get_dataset_stats(project_path)

return catalog, pipelines_dict, session_store, stats_dict
elif KEDRO_VERSION.match(">=0.17.1"):
from kedro.framework.session import KedroSession

Expand All @@ -103,8 +135,9 @@ def load_data(
) as session:
context = session.load_context()
session_store = session._store
stats_dict = get_dataset_stats(project_path)

return context.catalog, context.pipelines, session_store
return context.catalog, context.pipelines, session_store, stats_dict
else:
# Since Viz is only compatible with kedro>=0.17.0, this just matches 0.17.0
from kedro.framework.session import KedroSession
Expand All @@ -120,8 +153,9 @@ def load_data(
) as session:
context = session.load_context()
session_store = session._store
stats_dict = get_dataset_stats(project_path)

return context.catalog, context.pipelines, session_store
return context.catalog, context.pipelines, session_store, stats_dict


# The dataset type is available as an attribute if and only if the import from kedro
Expand All @@ -140,13 +174,13 @@ def matplotlib_writer_load(dataset: matplotlib.MatplotlibWriter) -> str:
matplotlib.MatplotlibWriter._load = matplotlib_writer_load

if hasattr(plotly, "JSONDataSet"):
plotly.JSONDataSet._load = json.JSONDataSet._load
plotly.JSONDataSet._load = json_dataset.JSONDataSet._load

if hasattr(plotly, "PlotlyDataSet"):
plotly.PlotlyDataSet._load = json.JSONDataSet._load
plotly.PlotlyDataSet._load = json_dataset.JSONDataSet._load

if hasattr(tracking, "JSONDataSet"):
tracking.JSONDataSet._load = json.JSONDataSet._load
tracking.JSONDataSet._load = json_dataset.JSONDataSet._load

if hasattr(tracking, "MetricsDataSet"):
tracking.MetricsDataSet._load = json.JSONDataSet._load
tracking.MetricsDataSet._load = json_dataset.JSONDataSet._load
Loading