Skip to content

Commit

Permalink
#860 display warning/error when csv input contains an observation valu…
Browse files Browse the repository at this point in the history
…e column titled `value` (#882)

* New exception created and external docs updated

* Purl link added
  • Loading branch information
SarahJohnsonONS authored Aug 30, 2023
1 parent 5361fcf commit a79d05a
Show file tree
Hide file tree
Showing 6 changed files with 34 additions and 58 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -14,3 +14,4 @@
* [An Error Occurred When Processing Dataframe](./error-processing-dataframe)
* [Failed to Convert Dataframe to String](./dataframe-to-string-convert-failed.md)
* [Feature Not Yet Supported](./feature-not-supported.md)
* [Invalid Observation Value Column Entitled "Value"](./invalid-obs-val-col-entitled-value.md)
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
# Error - Invalid observation value column entitled "Value"

## When it occurs

In a pivoted shape cube, observation value columns may not be entitled "Value".

## How to fix

If your data set is in the pivoted shape, rename the observation value column(s) to something other than "Value". See the [pivoted shape](../../shape-data/pivoted-shape.md) documentation for more details.
4 changes: 4 additions & 0 deletions external-docs/docs/guides/shape-data/pivoted-shape.md
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,10 @@
See [Converting to pivoted shape](./shape-conversion.md#converting-to-the-pivoted-shape) for instructions on how to convert the shape of your data in Python and R.

## Observation value column titles

In a pivoted shape data set, observation value columns must not be entitled "Value". If you attempt to inspect a data cube built using csvcubed in which one or more observation value columns are entitled "Value", this will raise an [error](../../guides/errors/inspect-command-errors/invalid-obs-val-col-entitled-value.md).

## Single Measure

The [standard shape](./standard-shape.md) is flexible but has a lot of redundancy which can often be removed by using the more concise pivoted form. Our data set on the distribution of the number of Arthur's Bakes stores can be expressed in the pivoted shape as follows:
Expand Down
14 changes: 14 additions & 0 deletions src/csvcubed/models/csvcubedexception.py
Original file line number Diff line number Diff line change
Expand Up @@ -68,6 +68,8 @@ class CsvcubedExceptionMsges(Enum):

InvalidNumOfDSDComponentsForObsValColTitle = "There should be only 1 component for the observation value column with title '{obs_val_col_title}', but found {num_of_components}."

InvalidObsValColTitle = "The title of an observation value column in a pivoted shape cube cannot be 'Value'. Please rename the column."


class CsvcubedExceptionUrls(Enum):
"""
Expand Down Expand Up @@ -127,6 +129,7 @@ class CsvcubedExceptionUrls(Enum):
InvalidNumOfDSDComponentsForObsValColTitle = (
"http://purl.org/csv-cubed/err/invalid-num-of-dsd-comps-for-obs-val-col"
)
InvalidObsValColTitle = "http://purl.org/csv-cubed/err/obs-val-col-entitled-value"


class CsvcubedException(Exception, HasErrorUrl, ABC):
Expand Down Expand Up @@ -418,3 +421,14 @@ def __init__(self, obs_val_col_title: str, num_of_components: int):
@classmethod
def get_error_url(cls) -> str:
return CsvcubedExceptionUrls.InvalidNumOfDSDComponentsForObsValColTitle.value


class InvalidObsValColTitleException(CsvcubedException):
    """
    Error signalling that an observation value column in a pivoted shape cube
    is titled 'Value'.

    The exception takes no arguments: its message is the fixed
    `CsvcubedExceptionMsges.InvalidObsValColTitle` text.
    """

    def __init__(self):
        # The message has no placeholders, so it is passed through unformatted.
        message = CsvcubedExceptionMsges.InvalidObsValColTitle.value
        super().__init__(message)

    @classmethod
    def get_error_url(cls) -> str:
        """Return the URL registered for this error in `CsvcubedExceptionUrls`."""
        error_url = CsvcubedExceptionUrls.InvalidObsValColTitle.value
        return error_url
22 changes: 6 additions & 16 deletions src/csvcubed/utils/csvdataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@
InvalidNumberOfRecordsException,
InvalidNumOfDSDComponentsForObsValColTitleException,
InvalidObservationColumnTitle,
InvalidObsValColTitleException,
InvalidUnitColumnDefinition,
)
from csvcubed.models.cube.cube_shape import CubeShape
Expand Down Expand Up @@ -306,30 +307,19 @@ def _melt_data_set(
]
id_cols = list(set(data_set.columns) - set(value_cols))

# Checking for any columns with the title "Value" and changing the value_name
# parameter passed to the melt function to a random string so that we don't
# trigger a pandas ValueError.
value_name = "Value"
rand_value_name = f"Value_{str(uuid1())}"
for col_title in value_cols:
if col_title == "Value":
value_name = rand_value_name
# Raise an exception if any observation columns are entitled 'Value'
if "Value" in value_cols:
raise InvalidObsValColTitleException()

# Melting the data set using pandas melt function.
melted_df = pd.melt(
return pd.melt(
data_set,
id_vars=id_cols,
value_vars=value_cols,
value_name=value_name,
value_name="Value",
var_name="Observation Value",
)

# Renaming columns in the returned melted df to their original title "Value"
if value_name == rand_value_name:
melted_df.rename(columns={rand_value_name: "Value"}, inplace=True)

return melted_df


def _get_unit_measure_col_for_standard_shape_cube(
qube_components: List[QubeComponentResult],
Expand Down
42 changes: 0 additions & 42 deletions tests/unit/inspect/test_inspectdatasetmanager.py
Original file line number Diff line number Diff line change
Expand Up @@ -840,45 +840,3 @@ def test_get_concepts_hierarchy_info_hierarchy_with_depth_more_than_one():
assert isinstance(result.tree, Tree)
assert result.tree.depth() == 2
assert len(result.tree.all_nodes_itr()) == 10


def test_melt_works_when_column_titled_value_exists():
    """
    A data set containing a column titled "Value" should still be melted when
    transform_dataset_to_canonical_shape is called.

    Passing a df with a column titled "Value" to the melt function in
    pandas 2.0 produces a ValueError because of the column title. To avoid
    this, the "value_name" parameter of melt() is given a different string
    (default is "Value") and the resulting column is renamed back to "Value"
    in the returned melted df.
    """
    metadata_json_path = (
        _test_case_base_dir
        / "multi-unit_single-measure"
        / "final-uk-greenhouse-gas-emissions-national-statistics-1990-to-2019.csv-metadata.json"
    )
    repository = get_data_cube_repository(metadata_json_path)

    dataset, qube_components, csv_url = get_arguments_qb_dataset(repository)

    canonical_shape_dataset, measure_col, unit_col = (
        transform_dataset_to_canonical_shape(
            repository,
            dataset,
            csv_url,
            qube_components,
        )
    )

    result: DatasetObservationsByMeasureUnitInfoResult = get_dataset_val_counts_info(
        canonical_shape_dataset, measure_col, unit_col
    )

    assert result is not None

    # Every original column must survive the transformation to canonical shape.
    original_columns = set(dataset.columns)
    canonical_columns = set(canonical_shape_dataset.columns)
    assert original_columns.issubset(canonical_columns)

    assert_frame_equal(
        result.by_measure_and_unit_val_counts_df,
        _expected_by_measure_and_unit_val_counts_df_with_column_titled_value,
    )

0 comments on commit a79d05a

Please sign in to comment.