Merge pull request #86 from aai-institute/feature/minor-improvements
Minor improvements
opcode81 committed Feb 29, 2024
2 parents 7d20514 + 1776f62 commit 64231f2
Showing 8 changed files with 99 additions and 32 deletions.
28 changes: 25 additions & 3 deletions src/sensai/columngen.py
@@ -5,7 +5,9 @@
import numpy as np
import pandas as pd

from .util.cache import PersistentKeyValueCache
from .data_transformation import DFTNormalisation
from .featuregen import FeatureGeneratorFromColumnGenerator
from .util.cache import KeyValueCache


log = logging.getLogger(__name__)
@@ -45,6 +47,26 @@ def _generate_column(self, df: pd.DataFrame) -> Union[pd.Series, list, np.ndarra
"""
pass

def to_feature_generator(self,
take_input_column_if_present: bool = False,
normalisation_rule_template: DFTNormalisation.RuleTemplate = None,
is_categorical: bool = False):
"""
Transforms this column generator into a feature generator that can be used as part of a VectorModel.
:param take_input_column_if_present: if True, then if a column whose name corresponds to the column to generate exists
in the input data, simply copy it to generate the output (without using the column generator); if False, always
apply the columnGen to generate the output
:param is_categorical: whether the resulting column is categorical
:param normalisation_rule_template: template for a DFTNormalisation for the resulting column.
This should only be provided if is_categorical is False
:return: the resulting feature generator
"""
return FeatureGeneratorFromColumnGenerator(self,
take_input_column_if_present=take_input_column_if_present,
normalisation_rule_template=normalisation_rule_template,
is_categorical=is_categorical)


class IndexCachedColumnGenerator(ColumnGenerator):
"""
@@ -57,7 +79,7 @@ class IndexCachedColumnGenerator(ColumnGenerator):

log = log.getChild(__qualname__)

def __init__(self, column_generator: ColumnGenerator, cache: PersistentKeyValueCache):
def __init__(self, column_generator: ColumnGenerator, cache: KeyValueCache):
"""
:param column_generator: the column generator with which to generate values for keys not found in the cache
:param cache: the cache in which to store key-value pairs
@@ -92,7 +114,7 @@ class ColumnGeneratorCachedByIndex(ColumnGenerator, ABC):

log = log.getChild(__qualname__)

def __init__(self, generated_column_name: str, cache: Optional[PersistentKeyValueCache], persist_cache=False):
def __init__(self, generated_column_name: str, cache: Optional[KeyValueCache], persist_cache=False):
"""
:param generated_column_name: the name of the column being generated
:param cache: the cache in which to store key-value pairs. If None, caching will be disabled
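Usage sketch for the new to_feature_generator method above — a minimal, illustrative example, not code from the commit: the TitleLengthColumnGenerator subclass, its "title" input column, and the assumption that ColumnGenerator's base constructor takes the generated column's name are hypothetical; only to_feature_generator's parameters as documented above are relied upon.

import pandas as pd

from sensai.columngen import ColumnGenerator


class TitleLengthColumnGenerator(ColumnGenerator):
    """Hypothetical generator deriving a numeric column from a text column of the input frame."""

    def __init__(self):
        super().__init__("title_length")  # assumption: the base constructor takes the generated column name

    def _generate_column(self, df: pd.DataFrame) -> pd.Series:
        return df["title"].str.len()


# wrap the column generator as a feature generator for use within a VectorModel's features
feature_gen = TitleLengthColumnGenerator().to_feature_generator(take_input_column_if_present=True)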
5 changes: 3 additions & 2 deletions src/sensai/data_transformation/dft.py
@@ -10,7 +10,6 @@
from sklearn.preprocessing import OneHotEncoder

from .sklearn_transformer import SkLearnTransformerProtocol
from ..columngen import ColumnGenerator
from ..util import flatten_arguments, count_not_none
from ..util.pandas import DataFrameColumnChangeTracker
from ..util.pickle import setstate
@@ -22,6 +21,8 @@

if TYPE_CHECKING:
from ..featuregen import FeatureGenerator
from ..columngen import ColumnGenerator


log = logging.getLogger(__name__)

@@ -749,7 +750,7 @@ class DFTFromColumnGenerators(RuleBasedDataFrameTransformer):
"""
Extends a data frame with columns generated from ColumnGenerator instances
"""
def __init__(self, column_generators: Sequence[ColumnGenerator], inplace=False):
def __init__(self, column_generators: Sequence['ColumnGenerator'], inplace=False):
super().__init__()
self.columnGenerators = column_generators
self.inplace = inplace
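The ColumnGenerator import in dft.py is moved into the TYPE_CHECKING block and the annotation is quoted, which breaks the runtime import cycle between dft.py and columngen.py while keeping the type information for static analysis. A generic sketch of this standard pattern (module and class names are hypothetical, not sensAI code):

# a.py — needs B only for annotations, while b.py in turn imports from a.py
from typing import TYPE_CHECKING, Sequence

if TYPE_CHECKING:        # evaluated by static type checkers only, never at runtime
    from b import B      # so the a <-> b import cycle cannot occur when a.py is loaded


class Consumer:
    def __init__(self, items: Sequence['B']):  # quoted annotation is not resolved at import time
        self.items = items

With `from __future__ import annotations` (PEP 563) the quotes could be dropped, but quoting the single affected annotation is the more localized change made here.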
17 changes: 9 additions & 8 deletions src/sensai/distance_metric.py
@@ -2,13 +2,13 @@
import math
import os
from abc import abstractmethod, ABC
from typing import Sequence, Tuple, List, Union
from typing import Generic, Sequence, Tuple, List, Union

import numpy as np
import pandas as pd

from .util import cache
from .util.cache import DelayedUpdateHook
from .util.cache import DelayedUpdateHook, TValue
from .util.string import object_repr
from .util.typing import PandasNamedTuple

@@ -42,8 +42,9 @@ def distance(self, named_tuple_a: PandasNamedTuple, named_tuple_b: PandasNamedTu
return self._distance(value_a, value_b)


class DistanceMatrixDFCache(cache.PersistentKeyValueCache):
def __init__(self, pickle_path, save_on_update=True, deferred_save_delay_secs=1.0):
class DistanceMatrixDFCache(cache.PersistentKeyValueCache[Tuple[Union[str, int], Union[str, int]], TValue], Generic[TValue]):
"""A cache for distance matrices, which are stored as dataframes with identifiers as both index and columns"""
def __init__(self, pickle_path: str, save_on_update: bool = True, deferred_save_delay_secs: float = 1.0):
self.deferred_save_delay_secs = deferred_save_delay_secs
self.save_on_update = save_on_update
self.pickle_path = pickle_path
@@ -65,7 +66,7 @@ def shape(self):
def _assert_tuple(key):
assert isinstance(key, tuple) and len(key) == 2, f"Expected a tuple of two identifiers, instead got {key}"

def set(self, key: Tuple[Union[str, int], Union[str, int]], value):
def set(self, key: Tuple[Union[str, int], Union[str, int]], value: TValue):
self._assert_tuple(key)
for identifier in key:
if identifier not in self.distance_df.columns:
@@ -83,15 +84,15 @@ def save(self):
os.makedirs(os.path.dirname(self.pickle_path), exist_ok=True)
self.distance_df.to_pickle(self.pickle_path)

def get(self, key: Tuple[Union[str, int], Union[str, int]]):
def get(self, key: Tuple[Union[str, int], Union[str, int]]) -> TValue:
self._assert_tuple(key)
i1, i2 = key
try:
pos1, pos2 = self.cached_id_to_pos_dict[i1], self.cached_id_to_pos_dict[i2]
except KeyError:
return None
result = self.distance_df.iloc[pos1, pos2]
if result is None or np.isnan(result):
if np.isnan(result):
return None
return result

@@ -108,7 +109,7 @@ class CachedDistanceMetric(DistanceMetric, cache.CachedValueProviderMixin):
value for the given pair of identifiers is not found within the persistent cache
"""

def __init__(self, distance_metric: DistanceMetric, key_value_cache: cache.PersistentKeyValueCache, persist_cache=False):
def __init__(self, distance_metric: DistanceMetric, key_value_cache: cache.KeyValueCache, persist_cache=False):
cache.CachedValueProviderMixin.__init__(self, key_value_cache, persist_cache=persist_cache)
self.metric = distance_metric

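Since CachedDistanceMetric now accepts any cache.KeyValueCache, a non-persistent in-memory cache is a valid backing store as well. A hedged sketch — the AbsoluteAgeDifference metric and its "age" attribute are illustrative, and it assumes distance() is the only abstract method of DistanceMetric; the CachedDistanceMetric constructor is used exactly as shown in the diff above:

from sensai.distance_metric import DistanceMetric, CachedDistanceMetric
from sensai.util.cache import InMemoryKeyValueCache


class AbsoluteAgeDifference(DistanceMetric):
    """Hypothetical metric comparing the 'age' attribute of two named tuples."""

    def distance(self, named_tuple_a, named_tuple_b) -> float:
        return abs(named_tuple_a.age - named_tuple_b.age)


# previously only PersistentKeyValueCache subclasses matched the type annotation
metric = CachedDistanceMetric(AbsoluteAgeDifference(), key_value_cache=InMemoryKeyValueCache())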
18 changes: 12 additions & 6 deletions src/sensai/evaluation/eval_stats/eval_stats_classification.py
@@ -326,6 +326,15 @@ def _compute_value(self, y_true: PredictionArray, y_predicted: PredictionArray,
return f if f is not None else self.zero_value


DEFAULT_MULTICLASS_CLASSIFICATION_METRICS = (ClassificationMetricAccuracy(), ClassificationMetricBalancedAccuracy(),
ClassificationMetricGeometricMeanOfTrueClassProbability())


def create_default_binary_classification_metrics(positive_class_label: Any) -> List[BinaryClassificationMetric]:
return [BinaryClassificationMetricPrecision(positive_class_label), BinaryClassificationMetricRecall(positive_class_label),
BinaryClassificationMetricF1Score(positive_class_label)]


class ClassificationEvalStats(PredictionEvalStats["ClassificationMetric"]):
def __init__(self, y_predicted: Optional[PredictionArray] = None,
y_true: Optional[PredictionArray] = None,
@@ -340,6 +349,7 @@ def __init__(self, y_predicted: Optional[PredictionArray] = None,
:param y_predicted_class_probabilities: a data frame whose columns are the class labels and whose values are probabilities
:param labels: the list of class labels
:param metrics: the metrics to compute for evaluation; if None, use default metrics
(see DEFAULT_MULTICLASS_CLASSIFICATION_METRICS and :func:`create_default_binary_classification_metrics`)
:param additional_metrics: the metrics to additionally compute
:param binary_positive_label: the label of the positive class for the case where it is a binary classification, adding further
binary metrics by default;
@@ -381,13 +391,9 @@ def __init__(self, y_predicted: Optional[PredictionArray] = None,
self.is_binary = binary_positive_label is not None

if metrics is None:
metrics = [ClassificationMetricAccuracy(), ClassificationMetricBalancedAccuracy(),
ClassificationMetricGeometricMeanOfTrueClassProbability()]
metrics = list(DEFAULT_MULTICLASS_CLASSIFICATION_METRICS)
if self.is_binary:
metrics.extend([
BinaryClassificationMetricPrecision(self.binary_positive_label),
BinaryClassificationMetricRecall(self.binary_positive_label),
BinaryClassificationMetricF1Score(self.binary_positive_label)])
metrics.extend(create_default_binary_classification_metrics(self.binary_positive_label))

metrics = list(metrics)
if additional_metrics is not None:
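With the default metric sets now available as module-level definitions, they can be reused and extended instead of being re-declared. A hedged sketch with made-up labels and predictions (all constructor parameters used here appear in the diff above):

import numpy as np

from sensai.evaluation.eval_stats.eval_stats_classification import (
    DEFAULT_MULTICLASS_CLASSIFICATION_METRICS,
    create_default_binary_classification_metrics,
    ClassificationEvalStats,
)

# leaving metrics=None with a binary positive label now resolves to the multi-class
# defaults plus the default binary metrics, per the docstring above
stats = ClassificationEvalStats(
    y_predicted=np.array(["yes", "no", "yes", "yes"]),
    y_true=np.array(["yes", "no", "no", "yes"]),
    labels=["no", "yes"],
    binary_positive_label="yes")

# the same lists can also be built explicitly, e.g. to extend them with further metrics
explicit_metrics = list(DEFAULT_MULTICLASS_CLASSIFICATION_METRICS) \
    + create_default_binary_classification_metrics("yes")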
12 changes: 7 additions & 5 deletions src/sensai/evaluation/eval_stats/eval_stats_regression.py
@@ -112,6 +112,10 @@ def compute_value(cls, y_true: np.ndarray, y_predicted: np.ndarray, model: Vecto
return np.median(cls.compute_abs_errors(y_true, y_predicted))


DEFAULT_REGRESSION_METRICS = (RegressionMetricRRSE(), RegressionMetricR2(), RegressionMetricMAE(),
RegressionMetricMSE(), RegressionMetricRMSE(), RegressionMetricStdDevAE())


class RegressionEvalStats(PredictionEvalStats["RegressionMetric"]):
"""
Collects data for the evaluation of predicted continuous values and computes corresponding metrics
@@ -126,21 +130,19 @@ class RegressionEvalStats(PredictionEvalStats["RegressionMetric"]):
SCATTER_PLOT_POINT_COLOR = (0, 0, 1, 0.05)

def __init__(self, y_predicted: Optional[PredictionArray] = None, y_true: Optional[PredictionArray] = None,
metrics: Sequence["RegressionMetric"] = None, additional_metrics: Sequence["RegressionMetric"] = None,
metrics: Optional[Sequence["RegressionMetric"]] = None, additional_metrics: Sequence["RegressionMetric"] = None,
model: VectorRegressionModel = None, io_data: InputOutputData = None):
"""
:param y_predicted: the predicted values
:param y_true: the true values
:param metrics: the metrics to compute for evaluation; if None, use default metrics
:param metrics: the metrics to compute for evaluation; if None, will use DEFAULT_REGRESSION_METRICS
:param additional_metrics: the metrics to additionally compute
"""
self.model = model
self.ioData = io_data

if metrics is None:
metrics = [RegressionMetricRRSE(), RegressionMetricR2(),
RegressionMetricMAE(), RegressionMetricMSE(), RegressionMetricRMSE(),
RegressionMetricStdDevAE()]
metrics = DEFAULT_REGRESSION_METRICS
metrics = list(metrics)

super().__init__(y_predicted, y_true, metrics, additional_metrics=additional_metrics)
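The same refactoring for regression: leaving metrics=None now resolves to DEFAULT_REGRESSION_METRICS, and the tuple can be reused directly. A hedged sketch with made-up values:

import numpy as np

from sensai.evaluation.eval_stats.eval_stats_regression import DEFAULT_REGRESSION_METRICS, RegressionEvalStats

# equivalent: omit metrics entirely and let the constructor fall back to the defaults
stats = RegressionEvalStats(y_predicted=np.array([2.4, 3.1, 4.8]),
                            y_true=np.array([2.0, 3.0, 5.0]),
                            metrics=list(DEFAULT_REGRESSION_METRICS))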
9 changes: 5 additions & 4 deletions src/sensai/featuregen/feature_generator.py
@@ -8,14 +8,15 @@
import pandas as pd

from .. import util, data_transformation
from ..columngen import ColumnGenerator
from ..data_transformation import DFTNormalisation, DFTFromFeatureGenerator, DataFrameTransformer
from ..util import flatten_arguments
from ..util.string import or_regex_group, ToStringMixin, list_string
from ..util.typing import PandasNamedTuple

if TYPE_CHECKING:
from ..vector_model import VectorModel
from ..columngen import ColumnGenerator


log = logging.getLogger(__name__)

@@ -392,7 +393,7 @@ class FeatureGeneratorFromNamedTuples(FeatureGenerator, ABC):
Generates feature values for one data point at a time, creating a dictionary with
feature values from each named tuple
"""
def __init__(self, cache: util.cache.PersistentKeyValueCache = None, categorical_feature_names: Sequence[str] = (),
def __init__(self, cache: util.cache.KeyValueCache = None, categorical_feature_names: Sequence[str] = (),
normalisation_rules: Sequence[data_transformation.DFTNormalisation.Rule] = (),
normalisation_rule_template: data_transformation.DFTNormalisation.RuleTemplate = None):
super().__init__(categorical_feature_names=categorical_feature_names, normalisation_rules=normalisation_rules,
@@ -532,7 +533,7 @@ class FeatureGeneratorFromColumnGenerator(RuleBasedFeatureGenerator):
"""
log = log.getChild(__qualname__)

def __init__(self, column_gen: ColumnGenerator, take_input_column_if_present=False, is_categorical=False,
def __init__(self, column_gen: 'ColumnGenerator', take_input_column_if_present=False, is_categorical=False,
normalisation_rule_template: data_transformation.DFTNormalisation.RuleTemplate = None):
"""
:param column_gen: the underlying column generator
@@ -541,7 +542,7 @@ def __init__(self, column_gen: 'ColumnGenerator', take_input_column_if_present=Fal
apply the columnGen to generate the output
:param is_categorical: whether the resulting column is categorical
:param normalisation_rule_template: template for a DFTNormalisation for the resulting column.
This should only be provided if isCategorical is False
This should only be provided if is_categorical is False
"""
if is_categorical and normalisation_rule_template is not None:
raise ValueError(f"normalisationRuleTemplate should be None when the generated column is categorical")
2 changes: 1 addition & 1 deletion src/sensai/nearest_neighbors.py
@@ -343,7 +343,7 @@ def __init__(self, num_neighbors: int,
neighbor_attributes: typing.List[str],
distance_metric: DistanceMetric,
neighbor_provider_factory: typing.Callable[[pd.DataFrame], NeighborProvider] = AllNeighborsProvider,
cache: util.cache.PersistentKeyValueCache = None,
cache: util.cache.KeyValueCache = None,
categorical_feature_names: typing.Sequence[str] = (),
normalisation_rules: typing.Sequence[data_transformation.DFTNormalisation.Rule] = ()):
"""
40 changes: 37 additions & 3 deletions src/sensai/util/cache.py
@@ -32,7 +32,7 @@ def __init__(self, value: TValue):
self.value = value


class PersistentKeyValueCache(Generic[TKey, TValue], ABC):
class KeyValueCache(Generic[TKey, TValue], ABC):
@abstractmethod
def set(self, key: TKey, value: TValue):
"""
@@ -55,6 +55,40 @@ def get(self, key: TKey) -> Optional[TValue]:
pass


class InMemoryKeyValueCache(KeyValueCache[TKey, TValue], Generic[TKey, TValue]):
"""A simple in-memory cache (which uses a dictionary internally).
This class can be instantiated directly, but for better typing support, one can instead
inherit from it and provide the types of the key and value as type arguments. For example for
a cache with string keys and integer values:
.. code-block:: python
class MyCache(InMemoryKeyValueCache[str, int]):
pass
"""
def __init__(self):
self.cache = {}

def set(self, key: TKey, value: TValue):
self.cache[key] = value

def get(self, key: TKey) -> Optional[TValue]:
return self.cache.get(key)

def empty(self):
self.cache = {}

def __len__(self):
return len(self.cache)



# mainly kept as a marker and for backwards compatibility, but may be extended in the future
class PersistentKeyValueCache(KeyValueCache[TKey, TValue], Generic[TKey, TValue], ABC):
pass


class PersistentList(Generic[TValue], ABC):
@abstractmethod
def append(self, item: TValue):
@@ -536,8 +570,8 @@ class CachedValueProviderMixin(Generic[TKey, TValue, TData], ABC):
Represents a value provider that can provide values associated with (hashable) keys via a cache or, if
cached values are not yet present, by computing them.
"""
def __init__(self, cache: Optional[PersistentKeyValueCache[TKey, TValue]] = None,
cache_factory: Optional[Callable[[], PersistentKeyValueCache[TKey, TValue]]] = None, persist_cache=False, box_values=False):
def __init__(self, cache: Optional[KeyValueCache[TKey, TValue]] = None,
cache_factory: Optional[Callable[[], KeyValueCache[TKey, TValue]]] = None, persist_cache=False, box_values=False):
"""
:param cache: the cache to use or None. If None, caching will be disabled
:param cache_factory: a factory with which to create the cache (or recreate it after unpickling if `persistCache` is False, in which
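A brief usage sketch for the new InMemoryKeyValueCache, exercising only the methods shown in the diff above (set, get, empty, __len__); besides direct instantiation, the subclassing pattern from its docstring can be used for stricter typing:

from sensai.util.cache import InMemoryKeyValueCache

cache = InMemoryKeyValueCache()
cache.set("model-a", 0.92)
assert cache.get("model-a") == 0.92
assert cache.get("model-b") is None  # missing keys yield None rather than raising
assert len(cache) == 1
cache.empty()                        # clears the underlying dictionary
assert len(cache) == 0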
