cbrkit.sim

CBRkit contains a selection of similarity measures for different data types. Besides measures for standard data types like numbers (cbrkit.sim.numbers), strings (cbrkit.sim.strings), lists/collections (cbrkit.sim.collections), and generic data (cbrkit.sim.generic), there is also a measure for attribute-value data. Additionally, the module contains an aggregator to combine multiple local measures into a global score.

 1"""
 2CBRkit contains a selection of similarity measures for different data types.
 3Besides measures for standard data types like
 4numbers (`cbrkit.sim.numbers`),
 5strings (`cbrkit.sim.strings`),
 6lists/collections (`cbrkit.sim.collections`),
 7and generic data (`cbrkit.sim.generic`),
 8there is also a measure for attribute-value data.
 9Additionally, the module contains an aggregator to combine multiple local measures into a global score.
10"""
11
12from . import collections, generic, numbers, strings
13from ._aggregator import PoolingName, aggregator
14from ._attribute_value import AttributeValueData, AttributeValueSim, attribute_value
15
# Public API of the package. Kept as a plain literal list so that tools which
# read `__all__` statically (documentation generators, type checkers) can
# resolve it without executing code.
__all__ = [
    # Submodules with measures for specific data types.
    "collections",
    "generic",
    "numbers",
    "strings",
    # Factory functions re-exported from the private implementation modules.
    "attribute_value",
    "aggregator",
    # Types re-exported alongside the factory functions.
    "PoolingName",
    "AttributeValueData",
    "AttributeValueSim",
]
def attribute_value( attributes: collections.abc.Mapping[str, typing.Union[cbrkit.typing.SimMapFunc[~KeyType, typing.Any, ~SimType], cbrkit.typing.SimSeqFunc[typing.Any, ~SimType], cbrkit.typing.SimPairFunc[typing.Any, ~SimType]]] | None = None, types: collections.abc.Mapping[type[typing.Any], typing.Union[cbrkit.typing.SimMapFunc[~KeyType, typing.Any, ~SimType], cbrkit.typing.SimSeqFunc[typing.Any, ~SimType], cbrkit.typing.SimPairFunc[typing.Any, ~SimType]]] | None = None, types_fallback: Union[cbrkit.typing.SimMapFunc[~KeyType, Any, ~SimType], cbrkit.typing.SimSeqFunc[Any, ~SimType], cbrkit.typing.SimPairFunc[Any, ~SimType], NoneType] = None, aggregator: cbrkit.typing.AggregatorFunc[str, ~SimType] = <function aggregator.<locals>.wrapped_func>, value_getter: collections.abc.Callable[[typing.Any, str], typing.Any] = <function _value_getter>, key_getter: collections.abc.Callable[[typing.Any], collections.abc.Iterator[str]] = <function _key_getter>) -> cbrkit.typing.SimMapFunc[typing.Any, collections.abc.Mapping[typing.Any, typing.Any] | pandas.core.series.Series, AttributeValueSim[~SimType]]:
 54def attribute_value(
 55    attributes: Mapping[str, AnySimFunc[KeyType, Any, SimType]] | None = None,
 56    types: Mapping[type[Any], AnySimFunc[KeyType, Any, SimType]] | None = None,
 57    types_fallback: AnySimFunc[KeyType, Any, SimType] | None = None,
 58    aggregator: AggregatorFunc[str, SimType] = _aggregator,
 59    value_getter: Callable[[Any, str], Any] = _value_getter,
 60    key_getter: Callable[[Any], Iterator[str]] = _key_getter,
 61) -> SimMapFunc[Any, AttributeValueData, AttributeValueSim[SimType]]:
 62    """
 63    Similarity function that computes the attribute value similarity between two cases.
 64
 65    Args:
 66        attributes: A mapping of attribute names to the similarity functions to be used for those attributes. Takes precedence over types.
 67        types: A mapping of attribute types to the similarity functions to be used for those types.
 68        types_fallback: A similarity function to be used as a fallback when no specific similarity function
 69            is defined for an attribute type.
 70        aggregator: A function that aggregates the local similarity scores for each attribute into a single global similarity.
 71        value_getter: A function that retrieves the value of an attribute from a case.
 72        key_getter: A function that retrieves the attribute names from a target case.
 73
 74    Examples:
 75        >>> equality = lambda x, y: 1.0 if x == y else 0.0
 76        >>> sim = attribute_value(
 77        ...     attributes={
 78        ...         "name": equality,
 79        ...         "age": equality,
 80        ...     },
 81        ... )
 82        >>> scores = sim(
 83        ...     {
 84        ...         "a": {"name": "John", "age": 25},
 85        ...         "b": {"name": "Jane", "age": 30},
 86        ...     },
 87        ...     {"name": "John", "age": 30},
 88        ... )
 89        >>> scores["a"]
 90        AttributeValueSim(value=0.5, by_attribute={'age': 0.0, 'name': 1.0})
 91        >>> scores["b"]
 92        AttributeValueSim(value=0.5, by_attribute={'age': 1.0, 'name': 0.0})
 93    """
 94
 95    attributes_map: Mapping[str, AnySimFunc[KeyType, Any, SimType]] = (
 96        {} if attributes is None else attributes
 97    )
 98    types_map: Mapping[type[Any], AnySimFunc[KeyType, Any, SimType]] = (
 99        {} if types is None else types
100    )
101
102    def wrapped_func(
103        x_map: Casebase[KeyType, ValueType], y: ValueType
104    ) -> SimMap[KeyType, AttributeValueSim[SimType]]:
105        local_sims: defaultdict[KeyType, dict[str, SimType]] = defaultdict(dict)
106
107        attribute_names = (
108            set(attributes_map).intersection(key_getter(y))
109            if len(attributes_map) > 0
110            and len(types_map) == 0
111            and types_fallback is None
112            else set(key_getter(y))
113        )
114
115        for attr_name in attribute_names:
116            x_attributes = {
117                key: value_getter(value, attr_name) for key, value in x_map.items()
118            }
119            y_attribute = value_getter(y, attr_name)
120            attr_type = type(y_attribute)
121
122            sim_func = (
123                attributes_map[attr_name]
124                if attr_name in attributes_map
125                else types_map.get(attr_type, types_fallback)
126            )
127
128            assert (
129                sim_func is not None
130            ), f"no similarity function for {attr_name} with type {attr_type}"
131
132            sim_func = sim2map(sim_func)
133            sim_func_result = sim_func(x_attributes, y_attribute)
134
135            for key, sim in sim_func_result.items():
136                local_sims[key][attr_name] = sim
137
138        return {
139            key: AttributeValueSim(aggregator(sims), sims)
140            for key, sims in local_sims.items()
141        }
142
143    return wrapped_func

Similarity function that computes the attribute value similarity between two cases.

Arguments:
  • attributes: A mapping of attribute names to the similarity functions to be used for those attributes. Takes precedence over types.
  • types: A mapping of attribute types to the similarity functions to be used for those types.
  • types_fallback: A similarity function to be used as a fallback when no specific similarity function is defined for an attribute type.
  • aggregator: A function that aggregates the local similarity scores for each attribute into a single global similarity.
  • value_getter: A function that retrieves the value of an attribute from a case.
  • key_getter: A function that retrieves the attribute names from a target case.
Examples:
>>> equality = lambda x, y: 1.0 if x == y else 0.0
>>> sim = attribute_value(
...     attributes={
...         "name": equality,
...         "age": equality,
...     },
... )
>>> scores = sim(
...     {
...         "a": {"name": "John", "age": 25},
...         "b": {"name": "Jane", "age": 30},
...     },
...     {"name": "John", "age": 30},
... )
>>> scores["a"]
AttributeValueSim(value=0.5, by_attribute={'age': 0.0, 'name': 1.0})
>>> scores["b"]
AttributeValueSim(value=0.5, by_attribute={'age': 1.0, 'name': 0.0})
def aggregator( pooling: Union[Literal['mean', 'fmean', 'geometric_mean', 'harmonic_mean', 'median', 'median_low', 'median_high', 'mode', 'min', 'max', 'sum'], cbrkit.typing.PoolingFunc] = 'mean', pooling_weights: collections.abc.Mapping[~KeyType, float] | collections.abc.Sequence[float] | None = None, default_pooling_weight: float = 1.0) -> cbrkit.typing.AggregatorFunc[~KeyType, float | cbrkit.typing.FloatProtocol]:
def aggregator(
    pooling: PoolingName | PoolingFunc = "mean",
    pooling_weights: SimSeqOrMap[KeyType, float] | None = None,
    default_pooling_weight: float = 1.0,
) -> AggregatorFunc[KeyType, AnyFloat]:
    """
    Aggregates local similarities to a global similarity using the specified pooling function.

    When weights are given, every local similarity is multiplied by its weight
    before the pooling function is applied. Similarities and weights must be of
    the same kind (both mappings or both sequences), but they do not need to be
    the exact same class: a list of similarities with a tuple of weights works.

    Args:
        pooling: The pooling function to use. It can be either a string representing the name of the pooling function or a custom pooling function (see `cbrkit.typing.PoolingFunc`).
        pooling_weights: The weights to apply to the similarities during pooling. It can be a sequence or a mapping. If None, every local similarity is weighted equally.
        default_pooling_weight: The default weight to use if a similarity key is not found in the pooling_weights mapping.

    Returns:
        A function that takes a sequence or mapping of local similarities and
        returns the pooled global similarity.

    Raises:
        NotImplementedError: When the returned function is called with
            similarities that are neither a mapping nor a sequence, or whose
            kind does not match the kind of `pooling_weights`.
        ValueError: When the returned function is called with a sequence of
            similarities whose length differs from the sequence of weights.

    Examples:
        >>> agg = aggregator("mean")
        >>> agg([0.5, 0.75, 1.0])
        0.75
    """

    pooling_func = _pooling_funcs[pooling] if isinstance(pooling, str) else pooling

    def wrapped_func(similarities: SimSeqOrMap[KeyType, AnyFloat]) -> float:
        sims: Sequence[float]  # noqa: F821

        # Dispatch on the abstract container kind instead of demanding identical
        # concrete classes, so e.g. list/tuple or dict/other-Mapping pairs work.
        if isinstance(similarities, Mapping) and isinstance(pooling_weights, Mapping):
            sims = [
                unpack_sim(sim) * pooling_weights.get(key, default_pooling_weight)
                for key, sim in similarities.items()
            ]
        elif isinstance(similarities, Sequence) and isinstance(
            pooling_weights, Sequence
        ):
            # strict=True makes a length mismatch an error rather than silently
            # dropping the surplus similarities or weights.
            sims = [
                unpack_sim(s) * w
                for s, w in zip(similarities, pooling_weights, strict=True)
            ]
        elif isinstance(similarities, Sequence) and pooling_weights is None:
            sims = [unpack_sim(s) for s in similarities]
        elif isinstance(similarities, Mapping) and pooling_weights is None:
            sims = [unpack_sim(s) for s in similarities.values()]
        else:
            raise NotImplementedError(
                "similarities must be a mapping or a sequence and match the kind "
                f"of pooling_weights, got {type(similarities).__name__} and "
                f"{type(pooling_weights).__name__}"
            )

        return pooling_func(sims)

    return wrapped_func

Aggregates local similarities to a global similarity using the specified pooling function.

Arguments:
  • pooling: The pooling function to use. It can be either a string representing the name of the pooling function or a custom pooling function (see cbrkit.typing.PoolingFunc).
  • pooling_weights: The weights to apply to the similarities during pooling. It can be a sequence or a mapping. If None, every local similarity is weighted equally.
  • default_pooling_weight: The default weight to use if a similarity key is not found in the pooling_weights mapping.
Examples:
>>> agg = aggregator("mean")
>>> agg([0.5, 0.75, 1.0])
0.75
PoolingName = typing.Literal['mean', 'fmean', 'geometric_mean', 'harmonic_mean', 'median', 'median_low', 'median_high', 'mode', 'min', 'max', 'sum']
AttributeValueData = collections.abc.Mapping[typing.Any, typing.Any] | pandas.core.series.Series
@dataclass(slots=True, frozen=True)
class AttributeValueSim(cbrkit.typing.FloatProtocol, typing.Generic[~SimType]):
@dataclass(frozen=True, slots=True)
class AttributeValueSim(FloatProtocol, Generic[SimType]):
    """
    Immutable result of an attribute-value comparison for a single case.

    Attributes:
        value: The global similarity, i.e. the aggregated local similarities.
        by_attribute: The local similarity of each compared attribute, keyed by attribute name.
    """

    value: float
    by_attribute: Mapping[str, SimType]
by_attribute: collections.abc.Mapping[str, ~SimType]