Merge pull request #86 from aai-institute/feature/minor-improvements
Minor improvements
opcode81 committed Feb 29, 2024
2 parents 7d20514 + 1776f62 commit 64231f2
Showing 8 changed files with 99 additions and 32 deletions.
28 changes: 25 additions & 3 deletions src/sensai/columngen.py
@@ -5,7 +5,9 @@
import numpy as np
import pandas as pd

from .util.cache import PersistentKeyValueCache
from .data_transformation import DFTNormalisation
from .featuregen import FeatureGeneratorFromColumnGenerator
from .util.cache import KeyValueCache


log = logging.getLogger(__name__)
@@ -45,6 +47,26 @@ def _generate_column(self, df: pd.DataFrame) -> Union[pd.Series, list, np.ndarra
"""
pass

def to_feature_generator(self,
take_input_column_if_present: bool = False,
normalisation_rule_template: DFTNormalisation.RuleTemplate = None,
is_categorical: bool = False):
"""
Transforms this column generator into a feature generator that can be used as part of a VectorModel.
:param take_input_column_if_present: if True, then if a column whose name corresponds to the column to generate exists
in the input data, simply copy it to generate the output (without using the column generator); if False, always
apply the columnGen to generate the output
:param is_categorical: whether the resulting column is categorical
:param normalisation_rule_template: template for a DFTNormalisation for the resulting column.
This should only be provided if is_categorical is False
:return: the resulting feature generator
"""
return FeatureGeneratorFromColumnGenerator(self,
take_input_column_if_present=take_input_column_if_present,
normalisation_rule_template=normalisation_rule_template,
is_categorical=is_categorical)


class IndexCachedColumnGenerator(ColumnGenerator):
"""
@@ -57,7 +79,7 @@ class IndexCachedColumnGenerator(ColumnGenerator):

log = log.getChild(__qualname__)

def __init__(self, column_generator: ColumnGenerator, cache: PersistentKeyValueCache):
def __init__(self, column_generator: ColumnGenerator, cache: KeyValueCache):
"""
:param column_generator: the column generator with which to generate values for keys not found in the cache
:param cache: the cache in which to store key-value pairs
@@ -92,7 +114,7 @@ class ColumnGeneratorCachedByIndex(ColumnGenerator, ABC):

log = log.getChild(__qualname__)

def __init__(self, generated_column_name: str, cache: Optional[PersistentKeyValueCache], persist_cache=False):
def __init__(self, generated_column_name: str, cache: Optional[KeyValueCache], persist_cache=False):
"""
:param generated_column_name: the name of the column being generated
:param cache: the cache in which to store key-value pairs. If None, caching will be disabled
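Usage sketch for the new to_feature_generator method above — a minimal, illustrative example, not code from the commit: the TitleLengthColumnGenerator subclass, its "title" input column, and the assumption that ColumnGenerator's base constructor takes the generated column's name are hypothetical; only to_feature_generator's parameters as documented above are relied upon.

import pandas as pd

from sensai.columngen import ColumnGenerator


class TitleLengthColumnGenerator(ColumnGenerator):
    """Hypothetical generator deriving a numeric column from a text column of the input frame."""

    def __init__(self):
        super().__init__("title_length")  # assumption: the base constructor takes the generated column name

    def _generate_column(self, df: pd.DataFrame) -> pd.Series:
        return df["title"].str.len()


# wrap the column generator as a feature generator for use within a VectorModel's features
feature_gen = TitleLengthColumnGenerator().to_feature_generator(take_input_column_if_present=True)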
5 changes: 3 additions & 2 deletions src/sensai/data_transformation/dft.py
@@ -10,7 +10,6 @@
from sklearn.preprocessing import OneHotEncoder

from .sklearn_transformer import SkLearnTransformerProtocol
from ..columngen import ColumnGenerator
from ..util import flatten_arguments, count_not_none
from ..util.pandas import DataFrameColumnChangeTracker
from ..util.pickle import setstate
@@ -22,6 +21,8 @@

if TYPE_CHECKING:
from ..featuregen import FeatureGenerator
from ..columngen import ColumnGenerator


log = logging.getLogger(__name__)

@@ -749,7 +750,7 @@ class DFTFromColumnGenerators(RuleBasedDataFrameTransformer):
"""
Extends a data frame with columns generated from ColumnGenerator instances
"""
def __init__(self, column_generators: Sequence[ColumnGenerator], inplace=False):
def __init__(self, column_generators: Sequence['ColumnGenerator'], inplace=False):
super().__init__()
self.columnGenerators = column_generators
self.inplace = inplace
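The ColumnGenerator import in dft.py is moved into the TYPE_CHECKING block and the annotation is quoted, which breaks the runtime import cycle between dft.py and columngen.py while keeping the type information for static analysis. A generic sketch of this standard pattern (module and class names are hypothetical, not sensAI code):

# a.py — needs B only for annotations, while b.py in turn imports from a.py
from typing import TYPE_CHECKING, Sequence

if TYPE_CHECKING:        # evaluated by static type checkers only, never at runtime
    from b import B      # so the a <-> b import cycle cannot occur when a.py is loaded


class Consumer:
    def __init__(self, items: Sequence['B']):  # quoted annotation is not resolved at import time
        self.items = items

With `from __future__ import annotations` (PEP 563) the quotes could be dropped, but quoting the single affected annotation is the more localized change made here.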
17 changes: 9 additions & 8 deletions src/sensai/distance_metric.py
@@ -2,13 +2,13 @@
import math
import os
from abc import abstractmethod, ABC
from typing import Sequence, Tuple, List, Union
from typing import Generic, Sequence, Tuple, List, Union

import numpy as np
import pandas as pd

from .util import cache
from .util.cache import DelayedUpdateHook
from .util.cache import DelayedUpdateHook, TValue
from .util.string import object_repr
from .util.typing import PandasNamedTuple

@@ -42,8 +42,9 @@ def distance(self, named_tuple_a: PandasNamedTuple, named_tuple_b: PandasNamedTu
return self._distance(value_a, value_b)


class DistanceMatrixDFCache(cache.PersistentKeyValueCache):
def __init__(self, pickle_path, save_on_update=True, deferred_save_delay_secs=1.0):
class DistanceMatrixDFCache(cache.PersistentKeyValueCache[Tuple[Union[str, int], Union[str, int]], TValue], Generic[TValue]):
"""A cache for distance matrices, which are stored as dataframes with identifiers as both index and columns"""
def __init__(self, pickle_path: str, save_on_update: bool = True, deferred_save_delay_secs: float = 1.0):
self.deferred_save_delay_secs = deferred_save_delay_secs
self.save_on_update = save_on_update
self.pickle_path = pickle_path
@@ -65,7 +66,7 @@ def shape(self):
def _assert_tuple(key):
assert isinstance(key, tuple) and len(key) == 2, f"Expected a tuple of two identifiers, instead got {key}"

def set(self, key: Tuple[Union[str, int], Union[str, int]], value):
def set(self, key: Tuple[Union[str, int], Union[str, int]], value: TValue):
self._assert_tuple(key)
for identifier in key:
if identifier not in self.distance_df.columns:
@@ -83,15 +84,15 @@ def save(self):
os.makedirs(os.path.dirname(self.pickle_path), exist_ok=True)
self.distance_df.to_pickle(self.pickle_path)

def get(self, key: Tuple[Union[str, int], Union[str, int]]):
def get(self, key: Tuple[Union[str, int], Union[str, int]]) -> TValue:
self._assert_tuple(key)
i1, i2 = key
try:
pos1, pos2 = self.cached_id_to_pos_dict[i1], self.cached_id_to_pos_dict[i2]
except KeyError:
return None
result = self.distance_df.iloc[pos1, pos2]
if result is None or np.isnan(result):
if np.isnan(result):
return None
return result

@@ -108,7 +109,7 @@ class CachedDistanceMetric(DistanceMetric, cache.CachedValueProviderMixin):
value for the given pair of identifiers is not found within the persistent cache
"""

def __init__(self, distance_metric: DistanceMetric, key_value_cache: cache.PersistentKeyValueCache, persist_cache=False):
def __init__(self, distance_metric: DistanceMetric, key_value_cache: cache.KeyValueCache, persist_cache=False):
cache.CachedValueProviderMixin.__init__(self, key_value_cache, persist_cache=persist_cache)
self.metric = distance_metric

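Since CachedDistanceMetric now accepts any cache.KeyValueCache, a non-persistent in-memory cache is a valid backing store as well. A hedged sketch — the AbsoluteAgeDifference metric and its "age" attribute are illustrative, and it assumes distance() is the only abstract method of DistanceMetric; the CachedDistanceMetric constructor is used exactly as shown in the diff above:

from sensai.distance_metric import DistanceMetric, CachedDistanceMetric
from sensai.util.cache import InMemoryKeyValueCache


class AbsoluteAgeDifference(DistanceMetric):
    """Hypothetical metric comparing the 'age' attribute of two named tuples."""

    def distance(self, named_tuple_a, named_tuple_b) -> float:
        return abs(named_tuple_a.age - named_tuple_b.age)


# previously only PersistentKeyValueCache subclasses matched the type annotation
metric = CachedDistanceMetric(AbsoluteAgeDifference(), key_value_cache=InMemoryKeyValueCache())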
18 changes: 12 additions & 6 deletions src/sensai/evaluation/eval_stats/eval_stats_classification.py
@@ -326,6 +326,15 @@ def _compute_value(self, y_true: PredictionArray, y_predicted: PredictionArray,
return f if f is not None else self.zero_value


DEFAULT_MULTICLASS_CLASSIFICATION_METRICS = (ClassificationMetricAccuracy(), ClassificationMetricBalancedAccuracy(),
ClassificationMetricGeometricMeanOfTrueClassProbability())


def create_default_binary_classification_metrics(positive_class_label: Any) -> List[BinaryClassificationMetric]:
return [BinaryClassificationMetricPrecision(positive_class_label), BinaryClassificationMetricRecall(positive_class_label),
BinaryClassificationMetricF1Score(positive_class_label)]


class ClassificationEvalStats(PredictionEvalStats["ClassificationMetric"]):
def __init__(self, y_predicted: Optional[PredictionArray] = None,
y_true: Optional[PredictionArray] = None,
@@ -340,6 +349,7 @@ def __init__(self, y_predicted: Optional[PredictionArray] = None,
:param y_predicted_class_probabilities: a data frame whose columns are the class labels and whose values are probabilities
:param labels: the list of class labels
:param metrics: the metrics to compute for evaluation; if None, use default metrics
(see DEFAULT_MULTICLASS_CLASSIFICATION_METRICS and :func:`create_default_binary_classification_metrics`)
:param additional_metrics: the metrics to additionally compute
:param binary_positive_label: the label of the positive class for the case where it is a binary classification, adding further
binary metrics by default;
@@ -381,13 +391,9 @@ def __init__(self, y_predicted: Optional[PredictionArray] = None,
self.is_binary = binary_positive_label is not None

if metrics is None:
metrics = [ClassificationMetricAccuracy(), ClassificationMetricBalancedAccuracy(),
ClassificationMetricGeometricMeanOfTrueClassProbability()]
metrics = list(DEFAULT_MULTICLASS_CLASSIFICATION_METRICS)
if self.is_binary:
metrics.extend([
BinaryClassificationMetricPrecision(self.binary_positive_label),
BinaryClassificationMetricRecall(self.binary_positive_label),
BinaryClassificationMetricF1Score(self.binary_positive_label)])
metrics.extend(create_default_binary_classification_metrics(self.binary_positive_label))

metrics = list(metrics)
if additional_metrics is not None:
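With the default metric sets now available as module-level definitions, they can be reused and extended instead of being re-declared. A hedged sketch with made-up labels and predictions (all constructor parameters used here appear in the diff above):

import numpy as np

from sensai.evaluation.eval_stats.eval_stats_classification import (
    DEFAULT_MULTICLASS_CLASSIFICATION_METRICS,
    create_default_binary_classification_metrics,
    ClassificationEvalStats,
)

# leaving metrics=None with a binary positive label now resolves to the multi-class
# defaults plus the default binary metrics, per the docstring above
stats = ClassificationEvalStats(
    y_predicted=np.array(["yes", "no", "yes", "yes"]),
    y_true=np.array(["yes", "no", "no", "yes"]),
    labels=["no", "yes"],
    binary_positive_label="yes")

# the same lists can also be built explicitly, e.g. to extend them with further metrics
explicit_metrics = list(DEFAULT_MULTICLASS_CLASSIFICATION_METRICS) \
    + create_default_binary_classification_metrics("yes")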
12 changes: 7 additions & 5 deletions src/sensai/evaluation/eval_stats/eval_stats_regression.py
@@ -112,6 +112,10 @@ def compute_value(cls, y_true: np.ndarray, y_predicted: np.ndarray, model: Vecto
return np.median(cls.compute_abs_errors(y_true, y_predicted))


DEFAULT_REGRESSION_METRICS = (RegressionMetricRRSE(), RegressionMetricR2(), RegressionMetricMAE(),
RegressionMetricMSE(), RegressionMetricRMSE(), RegressionMetricStdDevAE())


class RegressionEvalStats(PredictionEvalStats["RegressionMetric"]):
"""
Collects data for the evaluation of predicted continuous values and computes corresponding metrics
@@ -126,21 +130,19 @@ class RegressionEvalStats(PredictionEvalStats["RegressionMetric"]):
SCATTER_PLOT_POINT_COLOR = (0, 0, 1, 0.05)

def __init__(self, y_predicted: Optional[PredictionArray] = None, y_true: Optional[PredictionArray] = None,
metrics: Sequence["RegressionMetric"] = None, additional_metrics: Sequence["RegressionMetric"] = None,
metrics: Optional[Sequence["RegressionMetric"]] = None, additional_metrics: Sequence["RegressionMetric"] = None,
model: VectorRegressionModel = None, io_data: InputOutputData = None):
"""
:param y_predicted: the predicted values
:param y_true: the true values
:param metrics: the metrics to compute for evaluation; if None, use default metrics
:param metrics: the metrics to compute for evaluation; if None, will use DEFAULT_REGRESSION_METRICS
:param additional_metrics: the metrics to additionally compute
"""
self.model = model
self.ioData = io_data

if metrics is None:
metrics = [RegressionMetricRRSE(), RegressionMetricR2(),
RegressionMetricMAE(), RegressionMetricMSE(), RegressionMetricRMSE(),
RegressionMetricStdDevAE()]
metrics = DEFAULT_REGRESSION_METRICS
metrics = list(metrics)

super().__init__(y_predicted, y_true, metrics, additional_metrics=additional_metrics)
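The same refactoring for regression: leaving metrics=None now resolves to DEFAULT_REGRESSION_METRICS, and the tuple can be reused directly. A hedged sketch with made-up values:

import numpy as np

from sensai.evaluation.eval_stats.eval_stats_regression import DEFAULT_REGRESSION_METRICS, RegressionEvalStats

# equivalent: omit metrics entirely and let the constructor fall back to the defaults
stats = RegressionEvalStats(y_predicted=np.array([2.4, 3.1, 4.8]),
                            y_true=np.array([2.0, 3.0, 5.0]),
                            metrics=list(DEFAULT_REGRESSION_METRICS))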
9 changes: 5 additions & 4 deletions src/sensai/featuregen/feature_generator.py
@@ -8,14 +8,15 @@
import pandas as pd

from .. import util, data_transformation
from ..columngen import ColumnGenerator
from ..data_transformation import DFTNormalisation, DFTFromFeatureGenerator, DataFrameTransformer
from ..util import flatten_arguments
from ..util.string import or_regex_group, ToStringMixin, list_string
from ..util.typing import PandasNamedTuple

if TYPE_CHECKING:
from ..vector_model import VectorModel
from ..columngen import ColumnGenerator


log = logging.getLogger(__name__)

@@ -392,7 +393,7 @@ class FeatureGeneratorFromNamedTuples(FeatureGenerator, ABC):
Generates feature values for one data point at a time, creating a dictionary with
feature values from each named tuple
"""
def __init__(self, cache: util.cache.PersistentKeyValueCache = None, categorical_feature_names: Sequence[str] = (),
def __init__(self, cache: util.cache.KeyValueCache = None, categorical_feature_names: Sequence[str] = (),
normalisation_rules: Sequence[data_transformation.DFTNormalisation.Rule] = (),
normalisation_rule_template: data_transformation.DFTNormalisation.RuleTemplate = None):
super().__init__(categorical_feature_names=categorical_feature_names, normalisation_rules=normalisation_rules,
@@ -532,7 +533,7 @@ class FeatureGeneratorFromColumnGenerator(RuleBasedFeatureGenerator):
"""
log = log.getChild(__qualname__)

def __init__(self, column_gen: ColumnGenerator, take_input_column_if_present=False, is_categorical=False,
def __init__(self, column_gen: 'ColumnGenerator', take_input_column_if_present=False, is_categorical=False,
normalisation_rule_template: data_transformation.DFTNormalisation.RuleTemplate = None):
"""
:param column_gen: the underlying column generator
@@ -541,7 +542,7 @@ def __init__(self, column_gen: 'ColumnGenerator', take_input_column_if_present=Fal
apply the columnGen to generate the output
:param is_categorical: whether the resulting column is categorical
:param normalisation_rule_template: template for a DFTNormalisation for the resulting column.
This should only be provided if isCategorical is False
This should only be provided if is_categorical is False
"""
if is_categorical and normalisation_rule_template is not None:
raise ValueError(f"normalisationRuleTemplate should be None when the generated column is categorical")
2 changes: 1 addition & 1 deletion src/sensai/nearest_neighbors.py
@@ -343,7 +343,7 @@ def __init__(self, num_neighbors: int,
neighbor_attributes: typing.List[str],
distance_metric: DistanceMetric,
neighbor_provider_factory: typing.Callable[[pd.DataFrame], NeighborProvider] = AllNeighborsProvider,
cache: util.cache.PersistentKeyValueCache = None,
cache: util.cache.KeyValueCache = None,
categorical_feature_names: typing.Sequence[str] = (),
normalisation_rules: typing.Sequence[data_transformation.DFTNormalisation.Rule] = ()):
"""
40 changes: 37 additions & 3 deletions src/sensai/util/cache.py
@@ -32,7 +32,7 @@ def __init__(self, value: TValue):
self.value = value


class PersistentKeyValueCache(Generic[TKey, TValue], ABC):
class KeyValueCache(Generic[TKey, TValue], ABC):
@abstractmethod
def set(self, key: TKey, value: TValue):
"""
@@ -55,6 +55,40 @@ def get(self, key: TKey) -> Optional[TValue]:
pass


class InMemoryKeyValueCache(KeyValueCache[TKey, TValue], Generic[TKey, TValue]):
"""A simple in-memory cache (which uses a dictionary internally).
This class can be instantiated directly, but for better typing support, one can instead
inherit from it and provide the types of the key and value as type arguments. For example for
a cache with string keys and integer values:
.. code-block:: python
class MyCache(InMemoryKeyValueCache[str, int]):
pass
"""
def __init__(self):
self.cache = {}

def set(self, key: TKey, value: TValue):
self.cache[key] = value

def get(self, key: TKey) -> Optional[TValue]:
return self.cache.get(key)

def empty(self):
self.cache = {}

def __len__(self):
return len(self.cache)



# mainly kept as a marker and for backwards compatibility, but may be extended in the future
class PersistentKeyValueCache(KeyValueCache[TKey, TValue], Generic[TKey, TValue], ABC):
pass


class PersistentList(Generic[TValue], ABC):
@abstractmethod
def append(self, item: TValue):
@@ -536,8 +570,8 @@ class CachedValueProviderMixin(Generic[TKey, TValue, TData], ABC):
Represents a value provider that can provide values associated with (hashable) keys via a cache or, if
cached values are not yet present, by computing them.
"""
def __init__(self, cache: Optional[PersistentKeyValueCache[TKey, TValue]] = None,
cache_factory: Optional[Callable[[], PersistentKeyValueCache[TKey, TValue]]] = None, persist_cache=False, box_values=False):
def __init__(self, cache: Optional[KeyValueCache[TKey, TValue]] = None,
cache_factory: Optional[Callable[[], KeyValueCache[TKey, TValue]]] = None, persist_cache=False, box_values=False):
"""
:param cache: the cache to use or None. If None, caching will be disabled
:param cache_factory: a factory with which to create the cache (or recreate it after unpickling if `persistCache` is False, in which
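A brief usage sketch for the new InMemoryKeyValueCache, exercising only the methods shown in the diff above (set, get, empty, __len__); besides direct instantiation, the subclassing pattern from its docstring can be used for stricter typing:

from sensai.util.cache import InMemoryKeyValueCache

cache = InMemoryKeyValueCache()
cache.set("model-a", 0.92)
assert cache.get("model-a") == 0.92
assert cache.get("model-b") is None  # missing keys yield None rather than raising
assert len(cache) == 1
cache.empty()                        # clears the underlying dictionary
assert len(cache) == 0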
