diff --git a/external-docs/docs/guides/errors/inspect-command-errors/index.md b/external-docs/docs/guides/errors/inspect-command-errors/index.md index ce309735c..5cddebab2 100644 --- a/external-docs/docs/guides/errors/inspect-command-errors/index.md +++ b/external-docs/docs/guides/errors/inspect-command-errors/index.md @@ -14,3 +14,4 @@ * [An Error Occurred When Processing Dataframe](./error-processing-dataframe) * [Failed to Convert Dataframe to String](./dataframe-to-string-convert-failed.md) * [Feature Not Yet Supported](./feature-not-supported.md) +* [Invalid Observation Value Column Entitled "Value"](./invalid-obs-val-col-entitled-value.md) diff --git a/external-docs/docs/guides/errors/inspect-command-errors/invalid-obs-val-col-entitled-value.md b/external-docs/docs/guides/errors/inspect-command-errors/invalid-obs-val-col-entitled-value.md new file mode 100644 index 000000000..f0d9dae31 --- /dev/null +++ b/external-docs/docs/guides/errors/inspect-command-errors/invalid-obs-val-col-entitled-value.md @@ -0,0 +1,9 @@ +# Error - Invalid observation value column entitled "Value" + +## When it occurs + +In a pivoted shape cube, observation value columns may not be entitled "Value". + +## How to fix + +If your data set is in the pivoted shape, rename the observation value column(s) to something other than "Value". See the [pivoted shape](../../shape-data/pivoted-shape.md) documentation for more details. diff --git a/external-docs/docs/guides/shape-data/pivoted-shape.md b/external-docs/docs/guides/shape-data/pivoted-shape.md index c0f7efdce..e9c865ff9 100644 --- a/external-docs/docs/guides/shape-data/pivoted-shape.md +++ b/external-docs/docs/guides/shape-data/pivoted-shape.md @@ -4,6 +4,10 @@ See [Converting to pivoted shape](./shape-conversion.md#converting-to-the-pivoted-shape) for instructions on how to convert the shape of your data in Python and R. 
+## Observation value column titles + +In a pivoted shape data set, the observation value columns must not be entitled "Value". If you attempt to inspect a data cube built using csvcubed where one or more observation columns are entitled "Value", this will raise an [error](../../guides/errors/inspect-command-errors/invalid-obs-val-col-entitled-value.md). + ## Single Measure The [standard shape](./standard-shape.md) is flexible but has a lot of redundancy which can often be removed by using the more concise pivoted form. Our data set on the distribution of the number of Arthur's Bakes stores can be expressed in the pivoted shape as follows: diff --git a/src/csvcubed/models/csvcubedexception.py b/src/csvcubed/models/csvcubedexception.py index ba10f6425..b5dd96384 100644 --- a/src/csvcubed/models/csvcubedexception.py +++ b/src/csvcubed/models/csvcubedexception.py @@ -68,6 +68,8 @@ class CsvcubedExceptionMsges(Enum): InvalidNumOfDSDComponentsForObsValColTitle = "There should be only 1 component for the observation value column with title '{obs_val_col_title}', but found {num_of_components}." + InvalidObsValColTitle = "The title of an observation value column in a pivoted shape cube cannot be 'Value'. Please rename the column." 
+ class CsvcubedExceptionUrls(Enum): """ @@ -127,6 +129,7 @@ class CsvcubedExceptionUrls(Enum): InvalidNumOfDSDComponentsForObsValColTitle = ( "http://purl.org/csv-cubed/err/invalid-num-of-dsd-comps-for-obs-val-col" ) + InvalidObsValColTitle = "http://purl.org/csv-cubed/err/obs-val-col-entitled-value" class CsvcubedException(Exception, HasErrorUrl, ABC): @@ -418,3 +421,14 @@ def __init__(self, obs_val_col_title: str, num_of_components: int): @classmethod def get_error_url(cls) -> str: return CsvcubedExceptionUrls.InvalidNumOfDSDComponentsForObsValColTitle.value + + +class InvalidObsValColTitleException(CsvcubedException): + """Class representing the InvalidObsValColTitleException model.""" + + def __init__(self): + super().__init__(CsvcubedExceptionMsges.InvalidObsValColTitle.value) + + @classmethod + def get_error_url(cls) -> str: + return CsvcubedExceptionUrls.InvalidObsValColTitle.value diff --git a/src/csvcubed/utils/csvdataset.py b/src/csvcubed/utils/csvdataset.py index ca2b3e1da..6009b5c14 100644 --- a/src/csvcubed/utils/csvdataset.py +++ b/src/csvcubed/utils/csvdataset.py @@ -20,6 +20,7 @@ InvalidNumberOfRecordsException, InvalidNumOfDSDComponentsForObsValColTitleException, InvalidObservationColumnTitle, + InvalidObsValColTitleException, InvalidUnitColumnDefinition, ) from csvcubed.models.cube.cube_shape import CubeShape @@ -306,30 +307,19 @@ def _melt_data_set( ] id_cols = list(set(data_set.columns) - set(value_cols)) - # Checking for any columns with the title "Value" and changing the value_name - # parameter passed to the melt function to a random string so that we don't - # trigger a pandas ValueError. - value_name = "Value" - rand_value_name = f"Value_{str(uuid1())}" - for col_title in value_cols: - if col_title == "Value": - value_name = rand_value_name + # Raise an exception if any observation columns are entitled 'Value' + if "Value" in value_cols: + raise InvalidObsValColTitleException() # Melting the data set using pandas melt function. 
- melted_df = pd.melt( + return pd.melt( data_set, id_vars=id_cols, value_vars=value_cols, - value_name=value_name, + value_name="Value", var_name="Observation Value", ) - # Renaming columns in the returned melted df to their original title "Value" - if value_name == rand_value_name: - melted_df.rename(columns={rand_value_name: "Value"}, inplace=True) - - return melted_df - def _get_unit_measure_col_for_standard_shape_cube( qube_components: List[QubeComponentResult], diff --git a/tests/unit/inspect/test_inspectdatasetmanager.py b/tests/unit/inspect/test_inspectdatasetmanager.py index 42700bfb0..246539554 100644 --- a/tests/unit/inspect/test_inspectdatasetmanager.py +++ b/tests/unit/inspect/test_inspectdatasetmanager.py @@ -840,45 +840,3 @@ def test_get_concepts_hierarchy_info_hierarchy_with_depth_more_than_one(): assert isinstance(result.tree, Tree) assert result.tree.depth() == 2 assert len(result.tree.all_nodes_itr()) == 10 - - -def test_melt_works_when_column_titled_value_exists(): - """ - When a df with a column titled "Value" is passed to the melt function in - pandas 2.0 a ValueError is produced because of the column title. This test - uses a dataset with a "Value" titled column and should be melted when - transform_dataset_to_canonical_shape is called. In this case, the "value_name - parameter of melt() is given a different string (default is "Value") and then - we rename this column back to "Value" in the returned melted df. 
- """ - csvw_metadata_json_path = ( - _test_case_base_dir - / "multi-unit_single-measure" - / "final-uk-greenhouse-gas-emissions-national-statistics-1990-to-2019.csv-metadata.json" - ) - - data_cube_repository = get_data_cube_repository(csvw_metadata_json_path) - - (dataset, qube_components, csv_url) = get_arguments_qb_dataset(data_cube_repository) - - ( - canonical_shape_dataset, - measure_col, - unit_col, - ) = transform_dataset_to_canonical_shape( - data_cube_repository, - dataset, - csv_url, - qube_components, - ) - - result: DatasetObservationsByMeasureUnitInfoResult = get_dataset_val_counts_info( - canonical_shape_dataset, measure_col, unit_col - ) - - assert result is not None - assert set(dataset.columns).issubset(set(canonical_shape_dataset.columns)) - assert_frame_equal( - result.by_measure_and_unit_val_counts_df, - _expected_by_measure_and_unit_val_counts_df_with_column_titled_value, - )