AllUniqueError

Bases: ValueError

When there are no hierarchical columns because everything within them is unique.

Source code in wt_ml/layers/hier_embedding.py
class AllUniqueError(ValueError):
    """When there are no hierarchical columns because everything within them is unique."""

HierarchyChangedWarning

Bases: UserWarning, ValueError

Passed hierarchy columns have changed.

Source code in wt_ml/layers/hier_embedding.py
class HierarchyChangedWarning(UserWarning, ValueError):
    """Passed hierarchy columns have changed."""

HierchicalEmbedding

Bases: Module

Hierarchical Embedding creates embeddings for a layer as trainable weights at different input hierarchy levels, penalizing deviations that exceed the expected deviation at each level. These trained embeddings are used to calculate the model parameters for the layer.
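
A minimal construction sketch (illustrative only; the encodings mapping, column names, and values below are assumptions, not taken from the library). Each hierarchy column maps to a {label: index} dict, to the string "continuous", or to an integer category count.

from wt_ml.layers.hier_embedding import HierchicalEmbedding

# Hypothetical encodings for three hierarchy columns.
encodings = {
    "brand": {"brand_a": 0, "brand_b": 1, "brand_c": 2},
    "region": {"east": 0, "west": 1},
    "income": "continuous",
}

# Two purely categorical levels plus one mixed categorical/continuous level.
emb = HierchicalEmbedding(
    shape=[4],  # each hierarchy node learns a length-4 parameter vector
    encodings=encodings,
    columns=["brand", "region", ["brand", "income"]],
    use_bias=True,
)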

Source code in wt_ml/layers/hier_embedding.py
class HierchicalEmbedding(Module):
    """Hierarchical Embedding creates embeddings for a layer with different input hierarchy levels
    as trainable weights such that the deviations from the expected deviations are penalized.
    These trained embeddings are used to calculate the model parameters for a layer.
    """

    def __init__(
        self,
        shape: list[int],
        encodings: dict[str, Any],
        columns: list[str | list[str]] | None = None,
        use_bias: bool = True,
        dropped_columns=[],
        initializer: Initializer = 0.0,
        bias_initializer: Initializer = 0.0,
        hyperparameters: Hyperparams | None = None,
        feature_names: list[list[str]] | list[str] | None = None,
        name: str | None = None,
        increase_lr: float | None = None,
    ):
        """Initializes the hierarchical embedding object with hierarchy levels, parameter shape
        and other initializers.

        Args:
            shape (list[int]): Desired dimensions of the model parameters within the final result
                               (the trailing dimensions after the hierarchy dimensions).
            encodings (dict[str, Any]): Encodings of the hierarchy for which embeddings are trained. Each column
                                        maps to a {label: index} dict, the string "continuous", or a category count.
            columns (list[str  |  list[str]] | None, optional): Hierarchy levels to learn embeddings for. Defaults to None.
            use_bias (bool, optional): Whether to include a bias. Defaults to True.
            dropped_columns (list, optional): Columns to exclude from the hierarchy. Defaults to [].
            initializer (Initializer, optional): Initializer for the embeddings (weights). Defaults to 0.0.
            bias_initializer (Initializer, optional): Initializer for the bias. Defaults to 0.0.
            hyperparameters (Hyperparams | None, optional): Dictionary of hyperparameters for building this layer.
                                                            Defaults to None.
            feature_names (list[list[str]] | list[str] | None, optional): Names of the output features; nested lists
                                                                          are flattened. Defaults to None.
            name (str | None, optional): Name of the layer. Defaults to None.
            increase_lr (float | None, optional): Constant multiplier applied to the weights and bias at call time,
                                                  effectively scaling their learning rate. Defaults to None.
        """
        super().__init__(hyperparameters=hyperparameters, name=name)
        self.use_bias = use_bias
        encodings_dropped = {k: v for k, v in encodings.items() if k not in dropped_columns}
        assert encodings_dropped, "No cols in hierarchy."
        self.encodings = encodings_dropped
        self.shape = shape
        self.increase_lr = increase_lr
        self.initializer = initializer
        self.bias_initializer = bias_initializer
        self.created_reg = False
        self.feature_names = feature_names
        if self.feature_names is not None and isinstance(self.feature_names[0], list):
            self.feature_names = [elem for sublist in self.feature_names for elem in sublist]
        if columns is not None:
            self._process_columns(columns)
        else:
            self.columns = list(self.encodings.keys())
            self.used_cols = set(self.columns)

    def _process_columns(self, columns: list[str | list[str]]):
        """
        Process columns, removing duplicates and column(s) that have unique hierarchies.
        Sets the `columns` and `used_cols` attributes.

        Args:
            columns (list[str  |  list[str]]): Hierarchy levels to learn embeddings.
        """
        used_cols = set(tf.nest.flatten(columns))
        missing_cols = used_cols.difference(self.encodings.keys())
        assert not missing_cols, f"Column(s) passed not in hierarchy. {missing_cols}"

        issues: list[str] = []
        new_columns: list[str | tuple[str]] = []
        for column in columns:
            if isinstance(column, (list, tuple)):
                new_column = []
                for sub_col in column:
                    encodings = self.encodings[sub_col]
                    if encodings == "continuous" or len(encodings) > 1:
                        new_column.append(sub_col)
                    else:
                        issues.append(f"{sub_col} in {column} has single encoding.")

                if len(new_column) == 1:
                    # convert to str so duplicates can be detected easily.
                    new_columns.append(new_column[0])
                elif len(new_column) > 1:
                    new_columns.append(tuple(new_column))
                else:
                    issues.append(f"Dropping {column} as it has unique encodings.")
            else:
                encodings = self.encodings[column]
                if encodings == "continuous" or len(encodings) > 1:
                    new_columns.append(column)
                else:
                    issues.append(f"Dropping {column} as it has single encoding.")

        org_col_len = len(new_columns)
        # if any column is duplicated, we need to get rid of it.
        new_columns = list(dict.fromkeys(new_columns))
        assert len(new_columns), "All columns are dropped since they are all unique."
        if len(new_columns) != org_col_len:
            issues.append("Duplicate hierarchies removed.")

        if issues:
            warn_issues(self.name, issues, new_columns, columns)

        self.columns = new_columns
        self.used_cols = set(tf.nest.flatten(self.columns))

    def build(self, input_shapes):  # noqa: U100
        """Builds hyperparamters, deviations, embeddings(weights), bias and other intermediate variables.

        Args:
            input_shapes (InputShapes): The effect and hierarchy shapes.

        Raises:
            AllUniqueError: When there are no hierchical columns because everything is unique within it.
        """
        self.use_l2_squared = self.hyperparameters.get_bool(
            "use_l2_squared",
            default=False,
            help="Use the l2 norm to the fourth power instead of only using it for large values for stability.",
        )
        self.desired_stddev = self.hyperparameters.get_float(
            "desired_stddev",
            default=0.10,
            min=0.01,
            max=100.0,
            help="The desired maximum value for the stddev along the full hierarchy.",
        )
        self.use_inv_sqrt = self.hyperparameters.get_bool(
            "use_inv_sqrt",
            default=True,
            help="Scale the stddev for each category by the inverse square root of the number of unique values.",
        )

        if self.use_bias:
            self.reg_bias = self.hyperparameters.get_float(
                "reg_bias",
                default=0.0,
                min=0.0,
                max=1e4,
                help="The strength of l2 regularization to apply to the bias term.",
            )
        self.offsets = {}
        self.col_counts = {
            k: (
                self.encodings[k]
                if isinstance(self.encodings[k], (float, int))
                else ((max(self.encodings[k].values()) + 1) if not isinstance(self.encodings[k], str) else 1)
            )
            for k in tf.nest.flatten(self.columns)
        }
        var_counts = [
            self.col_counts[col] if isinstance(col, str) else np.prod([self.col_counts[k] for k in col])
            for col in self.columns
        ]

        self.columns = [col for col, count in zip(self.columns, var_counts) if count > 1 or self.is_continuous(col)]
        count = 0
        desired_stddevs = []
        # Scatters is the inverse of gathering from num_regularized_categories + 1 to count of weights
        self.scatters = []
        self.penalty_mults = []
        multipliers = []
        reg_counts = []
        flattened = int(np.prod(self.shape)) if len(self.shape) > 0 else 1
        for col_names in self.columns:
            if isinstance(col_names, str):
                col_names = [col_names]
            cat_cols = [col for col in col_names if self.encodings[col] != "continuous"]
            name = self.stitched_cols(col_names)
            num_cont_cols = len(col_names) - len(cat_cols)
            if num_cont_cols > 1:
                raise ValueError(
                    "You can only have one continuous hierarchical variable within a single hierarchical level"
                )
            number = int(np.prod([self.col_counts[k] for k in col_names]))
            multipliers.append(1 / number)
            self.scatters += [len(multipliers)] * number
            desired_stddevs.append(1 / np.sqrt(number) if self.use_inv_sqrt else 1)
            reg_counts.append(max(1, number - 1))
            self.offsets[name] = count
            count += number
            self.penalty_mults.append(self.get_reg_mult(col_names))

        if count == 0 or len(multipliers) == 0:
            raise AllUniqueError("There are no hierarchical columns; everything is unique.")
        # scatters is shape (count,)
        self.scatters = np.array(self.scatters)
        # multipliers is shape (1 + regularized_counts,)
        self.multipliers = np.array([0] + multipliers, dtype=np.float32)
        self.penalty_mults = np.array(self.penalty_mults, dtype=np.float32)
        self.dense_shape = [len(self.multipliers), flattened]
        # desired_* is shape (regularized_counts,)
        self.desired_stddevs = (
            self.desired_stddev * np.array(desired_stddevs or [1], dtype=np.float32) / np.sqrt(max(1, len(multipliers)))
        )
        self.desired_l2norms = (
            np.array(reg_counts, dtype=np.float32) * self.desired_stddevs**2 / (1 if self.use_l2_squared else 2)
        )
        self.weights = self.create_var(
            "weights", shape=[count, flattened], dtype=tf.float32, trainable=True, initializer=self.initializer
        )
        if self.use_bias:
            self.bias = self.create_var(
                "bias", shape=[flattened], dtype=tf.float32, trainable=True, initializer=self.bias_initializer
            )

    def stitched_cols(self, col_names: str | list[str] | tuple) -> str:
        """Returns a string representation of the columns."""
        return col_names if isinstance(col_names, str) else "-".join(col_names)

    def get_reg_config(self, col_names: tuple[str] | list[str]) -> tuple[str, float]:
        """Creates name used for regularization and default value for the penalty multiplier.
        If all columns are categorical, we can just join their names in order to find penalty.
        Otherwise, when different continuous features are paired with a same categorical column,
        the resulting hierarchical categories share same penalty. Always, suffix the continuous
        string to the end of the name.
        Example: brand-DEM and brand-GOP have the same penalty called reg_brand-continuous.
        Examples of mixed categories: reg_brand-continuous, reg_vehicle-continuous.

        Args:
            col_names (list[str]): Hierarchical column names.

        Returns:
            tuple[str, float]: Regularization penalty name and the default value.
        """
        count = int(np.prod([self.col_counts[k] for k in col_names]))
        if count == 1:
            # Purely continuous features
            default_value = 0.0
        else:
            default_value = 1.0
        names = [name for name in col_names if not self.is_continuous(name)]
        names.append("continuous") if self.is_continuous(col_names) else None
        reg_name = f"reg_{self.stitched_cols(names)}"
        return reg_name, default_value

    def get_reg_mult(self, col_names: list[str] | tuple[str]) -> float:
        """Returns the penalty multiplier for hierarchy level reg loss."""
        reg_name, default = self.get_reg_config(col_names)
        mult = self.hyperparameters.get_float(
            name=reg_name,
            default=default,
            help="Penalty multiplier for hierarchy level reg loss.",
        )
        return mult

    def is_continuous(self, k: str | Iterable[str]) -> bool:
        return is_continuous(k, self.encodings)

    def get_hierarchical_parameters(self, hierarchy: Mapping[str, TensorLike]) -> tuple[tf.Tensor, tf.Tensor]:
        """Returns the model parameters' for every hierarchical level (non-aggregated weights)

        Args:
            hierarchy (dict[str, TensorLike]): Hierarchy placeholder for Hierarchial embedding variable.

        NOTE: this currently does not depend on training flag. Possible we change how things work such that it will.

        Returns:

            tuple[tf.Tensor, tf.Tensor]: weights, indices

                the 1st list[tf.Tensor]=A: A[i] corresponds to the multiplicative data for the continuous aspects
                                                of the hierarchy in self.columns[i]
                the 2nd list[tf.Tensor]=B: B[i] corresponds to the indices in self.weights that corresponds to the
                                                correct learned coefficients of the hierarchy in self.columns[i]
        """
        # Shape is [count, ...] for both of these
        weights = []
        indices = []
        for col_names in self.columns:
            if isinstance(col_names, str):
                # We want to assume col_names is a list of column names
                col_names = [col_names]
            num_cols = len(col_names)
            cat_cols = [col for col in col_names if self.encodings[col] != "continuous"]
            cont_cols = [col for col in col_names if self.encodings[col] == "continuous"]
            num_cat_cols = len(cat_cols)
            num_cont_cols = num_cols - num_cat_cols

            name = self.stitched_cols(col_names)
            # The start of the region for this weight
            start = self.offsets[name]
            if num_cont_cols == 0:
                shape = tf.shape(hierarchy[cat_cols[0]])
                weight = tf.ones(shape, dtype=tf.float32, name=f"{name}_weights")
            else:
                # you can only have 1 cont col in col_names
                weight = hierarchy[cont_cols[0]]

            if num_cat_cols == 0:
                index = tf.cast(
                    tf.fill(tf.shape(weight), start),
                    dtype=tf.int64,
                    name=f"{name}_indices",
                )
                # if no categorical columns, we have the value of continuous column as the index
            else:
                # The standard encoding of left to right indices given base col_counts[col] for each col
                offsets = np.cumprod([1] + [self.col_counts[col] for col in cat_cols[:-1]])

                # The index in weights where we look up the first of the embeddings for this set of columns
                # This lets us concatenate all embeddings into a single weights matrix rather than defining
                # them separately, and deterministically derive the index in this larger weight matrix.
                index = start + tf.math.add_n(
                    [
                        # hierarchy[col] is column of dataframe
                        tf.constant(offset, dtype=tf.int64) * tf.cast(hierarchy[col], dtype=tf.int64)
                        for offset, col in zip(offsets, cat_cols)
                    ],
                    name=f"{name}_indices",
                )

            # store index for hierarchical parameters and the corresponding continuous weightage
            indices.append(index)
            weights.append(weight)

        # len(self.columns), *shape(hierarchy[<any>])
        weights = tf.stack(weights, axis=0, name="weights_stacked")
        indices = tf.stack(indices, axis=0, name="indices")
        return weights, indices

    def __call__(
        self,
        hierarchy: dict[str, TensorLike] | tuple[tf.Tensor, tf.Tensor],
        training: bool = False,  # noqa: U100
        debug: bool = False,  # noqa: U100
        skip_metrics: bool = False,
    ) -> tf.Tensor:
        """Returns the model parameters' embeddings calculated from the weights.
        Adds l2 regularization penalties to loss based on deviations and bias.

        Args:
            hierarchy (dict[str, TensorLike]): Hierarchy placeholder for the hierarchical embedding variable.
            training (bool, optional): Whether this is a training or inference run. Defaults to False.

        Returns:
            tf.Tensor: Model parameters' embeddings.
        """

        if isinstance(hierarchy, tuple):
            weights, indices = hierarchy
        else:
            # get the hierarchical parameters that correspond to the input hierarchy
            # NOTE: weights is the proper multiplicative relationship using continuous hierarchical variables, not
            #       something from self.weights. Probably should change name in future for readability.
            weights, indices = self.get_hierarchical_parameters(hierarchy)
        # Look up embeddings by indices
        # len(self.columns), *shape(hierarchy[<any>]), np.prod(self.shape)

        if self.increase_lr is not None:
            lr_scaled_weights = self.weights * tf.constant(self.increase_lr, dtype=tf.float32)
        else:
            lr_scaled_weights = self.weights

        looked_up = tf.gather(lr_scaled_weights, indices, name="embeds")
        # Optimization and convert to tensor
        # counts,
        scatters = tf.constant(self.scatters, dtype=tf.int64)
        # Do a matrix multiply to sum over columns
        # *shape(hierarchy[<any>]), np.prod(self.shape)
        weighted = tf.einsum("c...f,c...->...f", looked_up, weights, name="weighted")

        # This is num_regularized_categories x flattened using the same scatter trick as for means
        # len(self.multipliers), np.prod(self.shape)
        cur_l2_norm = tf.scatter_nd(
            scatters[:, None],
            tf.math.square(lr_scaled_weights, name="shifted_squared"),
            shape=self.dense_shape,
            name="cur_l2_norm",
        )[1:]
        # We want to apply l2 regularization so that this ratio is pushed to be 1 or less.
        # len(self.multipliers), np.prod(self.shape)
        cur_ratio = cur_l2_norm / tf.constant(self.desired_l2norms[:, None] + EPSILON, dtype=tf.float32)
        # Old negative feedback was roughly cur_ratio ** 2 (in the steady state). This just makes it explicit.
        # We don't care if it is over 0 so we shift down by 1 then up by 1 to get it to be the same scale
        if self.use_l2_squared:
            hier_reg = tf.math.reduce_sum(
                tf.math.square(cur_ratio) * tf.constant(self.penalty_mults[:, None], dtype=tf.float32),
                name="hier_reg",
            )
        else:
            hier_reg = cur_ratio * tf.constant(self.penalty_mults[:, None], dtype=tf.float32)
        if not skip_metrics:
            self.add_loss("hier_reg", hier_reg, category="hier")
        if self.use_bias:
            if self.increase_lr is not None:
                lr_scaled_bias = self.bias * tf.constant(self.increase_lr, dtype=tf.float32)
            else:
                lr_scaled_bias = self.bias

            if self.reg_bias > 0 and not skip_metrics:
                bias_loss = tf.math.reduce_sum(tf.math.square(lr_scaled_bias))
                self.add_loss("reg_bias", bias_loss, category="aux", mult=self.reg_bias)

            result = tf.nn.bias_add(weighted, lr_scaled_bias, name="biased")
        else:
            result = weighted

        # We want to undo the flattening we did for simpler logic.
        initial_shape = [tf.shape(result)[i] for i in range(len(result.shape) - 1)]
        # *shape(hierarchy[<any>]), *self.shape
        return tf.reshape(result, [*initial_shape, *self.shape], name="final_var")

    def get_tensors(
        self, dy_dweights: tf.Tensor | tf.Variable | None = None, dy_dbias: tf.Tensor | tf.Variable | None = None
    ) -> tuple[dict[str, tf.Tensor], dict[str, list[dict] | pd.MultiIndex], list[str | int]]:
        """Get the learned weights for a HierarchicalEmbedding layer"""
        output_tensors: dict[str, tf.Tensor] = {}
        output_indices: dict[str, pd.Index | pd.MultiIndex] = {}

        weights = self.weights if dy_dweights is None else dy_dweights
        feature_names = self._get_feature_names(weights)
        if self.use_bias:
            bias = self.bias if dy_dbias is None else dy_dbias
            self._process_bias(bias, output_tensors, output_indices)
        self._process_columns_in_tensors(output_tensors, output_indices, weights)

        return output_tensors, output_indices, feature_names

    def _get_feature_names(self, weights: tf.Tensor | tf.Variable) -> list[str]:
        n_features = weights.shape[-1]
        feature_names = list(range(n_features)) if self.feature_names is None else self.feature_names
        if len(feature_names) != n_features:
            if n_features % len(feature_names) == 0:
                num_dups = n_features // len(feature_names)
                feature_names = [f"{name}_{i+1}" for name in feature_names for i in range(num_dups)]
            else:
                raise ValueError(
                    f"feature_names must be a list of size {n_features}, but got size {len(feature_names)}"
                )
        return feature_names

    def _process_bias(
        self,
        bias: tf.Tensor | tf.Variable,
        output_tensors: dict[str, tf.Tensor],
        output_indices: dict[str, pd.Index | pd.MultiIndex],
    ):
        output_tensors["bias"] = tf.expand_dims(bias, axis=0)
        output_indices["bias"] = pd.Index(["bias"])

    def _process_columns_in_tensors(
        self,
        output_tensors: dict[str, tf.Tensor],
        output_indices: dict[str, pd.Index | pd.MultiIndex],
        weights: tf.Tensor,
    ):
        n_features = weights.shape[-1]
        for col_names in self.columns:
            if isinstance(col_names, str):
                col_names = [col_names]
            hierarchy, output_index = self._get_hierarchy_and_output_index(col_names)
            learned_weights = self._get_learned_weights(hierarchy, col_names, weights)
            learned_weights = self._reshape_learned_weights_if_needed(learned_weights, n_features)
            output_tensors[self.stitched_cols(col_names)] = learned_weights
            output_indices[self.stitched_cols(col_names)] = output_index

    def _get_hierarchy_and_output_index(
        self, col_names: list[str]
    ) -> tuple[dict[str, NDArray], pd.Index | pd.MultiIndex]:
        cat_cols = [col for col in col_names if self.encodings[col] != "continuous"]
        cont_cols = [col for col in col_names if self.encodings[col] == "continuous"]
        num_cat_cols = len(cat_cols)

        if num_cat_cols == 0:
            hierarchy = {cont_cols[0]: np.asarray([1.0])}
            output_index = pd.Index([cont_cols[0]])
        else:
            midx = pd.MultiIndex.from_product([self.encodings[c].values() for c in cat_cols], names=cat_cols)
            output_index = pd.MultiIndex.from_product([self.encodings[c].keys() for c in cat_cols], names=cat_cols)
            if len(cont_cols) > 0:
                # TODO (@RyanSaxe): why is cont_cols[0] used? Adding a comment would be helpful.
                midx = pd.concat({1.0: pd.DataFrame(index=midx)}, names=[cont_cols[0]]).index
            hierarchy = {h: midx.get_level_values(h).to_numpy() for h in midx.names}

        return hierarchy, output_index

    def _get_learned_weights(
        self, hierarchy: dict[str, NDArray], col_names: list[str], weights: tf.Tensor
    ) -> tf.Tensor:
        name = self.stitched_cols(col_names)
        start = self.offsets[name]
        cat_cols = [col for col in col_names if self.encodings[col] != "continuous"]
        cont_cols = [col for col in col_names if self.encodings[col] == "continuous"]

        if len(cont_cols) == 0:
            shape = tf.shape(hierarchy[list(hierarchy.keys())[0]])
            weight = tf.ones(shape, dtype=tf.float32, name=f"{self.stitched_cols(col_names)}_weights")
        else:
            weight = hierarchy[cont_cols[0]]
        if len(cat_cols) == 0:
            index = tf.cast(
                tf.fill(tf.shape(weight), self.offsets[name]),
                dtype=tf.int64,
                name=f"{name}_indices",
            )
        else:
            offsets = np.cumprod([1] + [self.col_counts[col] for col in cat_cols[:-1]])
            index = start + tf.math.add_n(
                [
                    tf.constant(offset, dtype=tf.int64) * tf.cast(hierarchy[col], dtype=tf.int64)
                    for offset, col in zip(offsets, cat_cols)
                ]
            )

        return tf.gather(weights, index, name="embeds")

    def _reshape_learned_weights_if_needed(self, learned_weights: tf.Tensor, n_features: int) -> tf.Tensor:
        if len(learned_weights.shape) > 2:
            flattened_shape = prod(learned_weights.shape[:-1])
            return tf.reshape(learned_weights, (flattened_shape, n_features))
        return learned_weights

    def get_dfs(
        self, dy_dweights: tf.Tensor | None = None, dy_dbias: tf.Tensor | None = None
    ) -> dict[str, pd.DataFrame]:
        """Get the learned weights for a HierarchicalEmbedding layer as a DataFrame"""
        # NOTE: separated this function so we could more easily differentiate
        output_tensors, output_indices, feature_names = self.get_tensors(dy_dweights=dy_dweights, dy_dbias=dy_dbias)
        return {
            key: pd.DataFrame(tensor, index=output_indices[key], columns=feature_names)
            for key, tensor in output_tensors.items()
        }

    @property
    def dfs(self) -> dict[str, pd.DataFrame]:
        return self.get_dfs()

__call__(hierarchy, training=False, debug=False, skip_metrics=False)

Returns the model parameters' embeddings calculated from the weights. Adds l2 regularization penalties to loss based on deviations and bias.

Parameters:

    hierarchy (dict[str, TensorLike]): Hierarchy placeholder for the hierarchical embedding variable. Required.
    training (bool, optional): Whether this is a training or inference run. Defaults to False.

Returns:

    tf.Tensor: Model parameters' embeddings.
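
A minimal call sketch, reusing the hypothetical emb instance and column names from the construction example above (tensor values are illustrative): categorical columns carry integer indices into their encodings, and the continuous column carries raw values.

import tensorflow as tf

hierarchy = {
    "brand": tf.constant([0, 1, 2, 0, 1], dtype=tf.int64),
    "region": tf.constant([0, 1, 0, 1, 0], dtype=tf.int64),
    "income": tf.constant([0.2, -0.5, 1.3, 0.0, 0.7], dtype=tf.float32),
}
params = emb(hierarchy, training=True)  # shape (5, 4) given shape=[4]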

Source code in wt_ml/layers/hier_embedding.py
def __call__(
    self,
    hierarchy: dict[str, TensorLike] | tuple[tf.Tensor, tf.Tensor],
    training: bool = False,  # noqa: U100
    debug: bool = False,  # noqa: U100
    skip_metrics: bool = False,
) -> tf.Tensor:
    """Returns the model parameters' embeddings calculated from the weights.
    Adds l2 regularization penalties to loss based on deviations and bias.

    Args:
        hierarchy (dict[str, TensorLike]): Hierarchy placeholder for the hierarchical embedding variable.
        training (bool, optional): Whether this is a training or inference run. Defaults to False.

    Returns:
        tf.Tensor: Model parameters' embeddings.
    """

    if isinstance(hierarchy, tuple):
        weights, indices = hierarchy
    else:
        # get the hierarchical parameters that correspond to the input hierarchy
        # NOTE: weights is the proper multiplicative relationship using continuous hierarchical variables, not
        #       something from self.weights. Probably should change name in future for readability.
        weights, indices = self.get_hierarchical_parameters(hierarchy)
    # Look up embeddings by indices
    # len(self.columns), *shape(hierarchy[<any>]), np.prod(self.shape)

    if self.increase_lr is not None:
        lr_scaled_weights = self.weights * tf.constant(self.increase_lr, dtype=tf.float32)
    else:
        lr_scaled_weights = self.weights

    looked_up = tf.gather(lr_scaled_weights, indices, name="embeds")
    # Optimization and convert to tensor
    # counts,
    scatters = tf.constant(self.scatters, dtype=tf.int64)
    # Do a matrix multiply to sum over columns
    # *shape(hierarchy[<any>]), np.prod(self.shape)
    weighted = tf.einsum("c...f,c...->...f", looked_up, weights, name="weighted")

    # This is num_regularized_categories x flattened using the same scatter trick as for means
    # len(self.multipliers), np.prod(self.shape)
    cur_l2_norm = tf.scatter_nd(
        scatters[:, None],
        tf.math.square(lr_scaled_weights, name="shifted_squared"),
        shape=self.dense_shape,
        name="cur_l2_norm",
    )[1:]
    # We want to apply l2 regularization so that this ratio is pushed to be 1 or less.
    # len(self.multipliers), np.prod(self.shape)
    cur_ratio = cur_l2_norm / tf.constant(self.desired_l2norms[:, None] + EPSILON, dtype=tf.float32)
    # Old negative feedback was roughly cur_ratio ** 2 (in the steady state). This just makes it explicit.
    # We don't care if it is over 0 so we shift down by 1 then up by 1 to get it to be the same scale
    if self.use_l2_squared:
        hier_reg = tf.math.reduce_sum(
            tf.math.square(cur_ratio) * tf.constant(self.penalty_mults[:, None], dtype=tf.float32),
            name="hier_reg",
        )
    else:
        hier_reg = cur_ratio * tf.constant(self.penalty_mults[:, None], dtype=tf.float32)
    if not skip_metrics:
        self.add_loss("hier_reg", hier_reg, category="hier")
    if self.use_bias:
        if self.increase_lr is not None:
            lr_scaled_bias = self.bias * tf.constant(self.increase_lr, dtype=tf.float32)
        else:
            lr_scaled_bias = self.bias

        if self.reg_bias > 0 and not skip_metrics:
            bias_loss = tf.math.reduce_sum(tf.math.square(lr_scaled_bias))
            self.add_loss("reg_bias", bias_loss, category="aux", mult=self.reg_bias)

        result = tf.nn.bias_add(weighted, lr_scaled_bias, name="biased")
    else:
        result = weighted

    # We want to undo the flattening we did for simpler logic.
    initial_shape = [tf.shape(result)[i] for i in range(len(result.shape) - 1)]
    # *shape(hierarchy[<any>]), *self.shape
    return tf.reshape(result, [*initial_shape, *self.shape], name="final_var")

__init__(shape, encodings, columns=None, use_bias=True, dropped_columns=[], initializer=0.0, bias_initializer=0.0, hyperparameters=None, feature_names=None, name=None, increase_lr=None)

Initializes the hierarchical embedding object with hierarchy levels, parameter shape and other initializers.

Parameters:

    shape (list[int]): Desired dimensions of the model parameters within the final result. Required.
    encodings (dict[str, Any]): Encodings of the hierarchy for which embeddings are trained; each column maps to a {label: index} dict, the string "continuous", or a category count. Required.
    columns (list[str | list[str]] | None, optional): Hierarchy levels to learn embeddings for. Defaults to None.
    use_bias (bool, optional): Whether to include a bias. Defaults to True.
    dropped_columns (list, optional): Columns to exclude from the hierarchy. Defaults to [].
    initializer (Initializer, optional): Initializer for the embeddings (weights). Defaults to 0.0.
    bias_initializer (Initializer, optional): Initializer for the bias. Defaults to 0.0.
    hyperparameters (Hyperparams | None, optional): Dictionary of hyperparameters for building this layer. Defaults to None.
    feature_names (list[list[str]] | list[str] | None, optional): Names of the output features; nested lists are flattened. Defaults to None.
    name (str | None, optional): Name of the layer. Defaults to None.
    increase_lr (float | None, optional): Constant multiplier applied to the weights and bias at call time, effectively scaling their learning rate. Defaults to None.
Source code in wt_ml/layers/hier_embedding.py
def __init__(
    self,
    shape: list[int],
    encodings: dict[str, Any],
    columns: list[str | list[str]] | None = None,
    use_bias: bool = True,
    dropped_columns=[],
    initializer: Initializer = 0.0,
    bias_initializer: Initializer = 0.0,
    hyperparameters: Hyperparams | None = None,
    feature_names: list[list[str]] | list[str] | None = None,
    name: str | None = None,
    increase_lr: float | None = None,
):
    """Initializes the hierarchical embedding object with hierarchy levels, parameter shape
    and other initializers.

    Args:
        shape (list[int]): Desired dimensions of the model parameters within the final result
                           (the trailing dimensions after the hierarchy dimensions).
        encodings (dict[str, Any]): Encodings of the hierarchy for which embeddings are trained. Each column
                                    maps to a {label: index} dict, the string "continuous", or a category count.
        columns (list[str  |  list[str]] | None, optional): Hierarchy levels to learn embeddings for. Defaults to None.
        use_bias (bool, optional): Whether to include a bias. Defaults to True.
        dropped_columns (list, optional): Columns to exclude from the hierarchy. Defaults to [].
        initializer (Initializer, optional): Initializer for the embeddings (weights). Defaults to 0.0.
        bias_initializer (Initializer, optional): Initializer for the bias. Defaults to 0.0.
        hyperparameters (Hyperparams | None, optional): Dictionary of hyperparameters for building this layer.
                                                        Defaults to None.
        feature_names (list[list[str]] | list[str] | None, optional): Names of the output features; nested lists
                                                                      are flattened. Defaults to None.
        name (str | None, optional): Name of the layer. Defaults to None.
        increase_lr (float | None, optional): Constant multiplier applied to the weights and bias at call time,
                                              effectively scaling their learning rate. Defaults to None.
    """
    super().__init__(hyperparameters=hyperparameters, name=name)
    self.use_bias = use_bias
    encodings_dropped = {k: v for k, v in encodings.items() if k not in dropped_columns}
    assert encodings_dropped, "No cols in hierarchy."
    self.encodings = encodings_dropped
    self.shape = shape
    self.increase_lr = increase_lr
    self.initializer = initializer
    self.bias_initializer = bias_initializer
    self.created_reg = False
    self.feature_names = feature_names
    if self.feature_names is not None and isinstance(self.feature_names[0], list):
        self.feature_names = [elem for sublist in self.feature_names for elem in sublist]
    if columns is not None:
        self._process_columns(columns)
    else:
        self.columns = list(self.encodings.keys())
        self.used_cols = set(self.columns)

build(input_shapes)

Builds hyperparameters, deviations, embeddings (weights), bias, and other intermediate variables.

Parameters:

    input_shapes (InputShapes): The effect and hierarchy shapes. Required.

Raises:

    AllUniqueError: When there are no hierarchical columns because everything within them is unique.
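
An illustrative sketch of the bookkeeping build() performs; the column names and category counts below are assumptions, not values from the library.

col_counts = {"brand": 3, "region": 2}      # hypothetical category counts
columns = ["brand", ("brand", "region")]    # two hierarchy levels
numbers = [3, 3 * 2]                        # rows each level contributes
offsets = {"brand": 0, "brand-region": 3}   # starting row of each level
count = sum(numbers)                        # 9 rows in total
# self.weights is then created with shape [count, prod(shape)], so every level's
# embeddings live in one concatenated matrix indexed via these offsets.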

Source code in wt_ml/layers/hier_embedding.py
def build(self, input_shapes):  # noqa: U100
    """Builds hyperparamters, deviations, embeddings(weights), bias and other intermediate variables.

    Args:
        input_shapes (InputShapes): The effect and hierarchy shapes.

    Raises:
        AllUniqueError: When there are no hierchical columns because everything is unique within it.
    """
    self.use_l2_squared = self.hyperparameters.get_bool(
        "use_l2_squared",
        default=False,
        help="Use the l2 norm to the fourth power instead of only using it for large values for stability.",
    )
    self.desired_stddev = self.hyperparameters.get_float(
        "desired_stddev",
        default=0.10,
        min=0.01,
        max=100.0,
        help="The desired maximum value for the stddev along the full hierarchy.",
    )
    self.use_inv_sqrt = self.hyperparameters.get_bool(
        "use_inv_sqrt",
        default=True,
        help="Scale the stddev for each category by the inverse square root of the number of unique values.",
    )

    if self.use_bias:
        self.reg_bias = self.hyperparameters.get_float(
            "reg_bias",
            default=0.0,
            min=0.0,
            max=1e4,
            help="The strength of l2 regularization to apply to the bias term.",
        )
    self.offsets = {}
    self.col_counts = {
        k: (
            self.encodings[k]
            if isinstance(self.encodings[k], (float, int))
            else ((max(self.encodings[k].values()) + 1) if not isinstance(self.encodings[k], str) else 1)
        )
        for k in tf.nest.flatten(self.columns)
    }
    var_counts = [
        self.col_counts[col] if isinstance(col, str) else np.prod([self.col_counts[k] for k in col])
        for col in self.columns
    ]

    self.columns = [col for col, count in zip(self.columns, var_counts) if count > 1 or self.is_continuous(col)]
    count = 0
    desired_stddevs = []
    # Scatters is the inverse of gathering from num_regularized_categories + 1 to count of weights
    self.scatters = []
    self.penalty_mults = []
    multipliers = []
    reg_counts = []
    flattened = int(np.prod(self.shape)) if len(self.shape) > 0 else 1
    for col_names in self.columns:
        if isinstance(col_names, str):
            col_names = [col_names]
        cat_cols = [col for col in col_names if self.encodings[col] != "continuous"]
        name = self.stitched_cols(col_names)
        num_cont_cols = len(col_names) - len(cat_cols)
        if num_cont_cols > 1:
            raise ValueError(
                "You can only have one continuous hierarchical variable within a single hierarchical level"
            )
        number = int(np.prod([self.col_counts[k] for k in col_names]))
        multipliers.append(1 / number)
        self.scatters += [len(multipliers)] * number
        desired_stddevs.append(1 / np.sqrt(number) if self.use_inv_sqrt else 1)
        reg_counts.append(max(1, number - 1))
        self.offsets[name] = count
        count += number
        self.penalty_mults.append(self.get_reg_mult(col_names))

    if count == 0 or len(multipliers) == 0:
        raise AllUniqueError("There are no hierarchical columns; everything is unique.")
    # scatters is shape (count,)
    self.scatters = np.array(self.scatters)
    # multipliers is shape (1 + regularized_counts,)
    self.multipliers = np.array([0] + multipliers, dtype=np.float32)
    self.penalty_mults = np.array(self.penalty_mults, dtype=np.float32)
    self.dense_shape = [len(self.multipliers), flattened]
    # desired_* is shape (regularized_counts,)
    self.desired_stddevs = (
        self.desired_stddev * np.array(desired_stddevs or [1], dtype=np.float32) / np.sqrt(max(1, len(multipliers)))
    )
    self.desired_l2norms = (
        np.array(reg_counts, dtype=np.float32) * self.desired_stddevs**2 / (1 if self.use_l2_squared else 2)
    )
    self.weights = self.create_var(
        "weights", shape=[count, flattened], dtype=tf.float32, trainable=True, initializer=self.initializer
    )
    if self.use_bias:
        self.bias = self.create_var(
            "bias", shape=[flattened], dtype=tf.float32, trainable=True, initializer=self.bias_initializer
        )

get_dfs(dy_dweights=None, dy_dbias=None)

Get the learned weights for a HierarchicalEmbedding layer as a DataFrame
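
A small inspection sketch; emb and the column names are the hypothetical instance from the earlier examples.

dfs = emb.get_dfs()
# One DataFrame per hierarchy level, plus "bias" when use_bias=True.
# dfs["brand"] is indexed by brand label, with one column per feature name.
print(dfs.keys())
print(dfs["brand"])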

Source code in wt_ml/layers/hier_embedding.py
def get_dfs(
    self, dy_dweights: tf.Tensor | None = None, dy_dbias: tf.Tensor | None = None
) -> dict[str, pd.DataFrame]:
    """Get the learned weights for a HierarchicalEmbedding layer as a DataFrame"""
    # NOTE: separated this function so we could more easily differentiate
    output_tensors, output_indices, feature_names = self.get_tensors(dy_dweights=dy_dweights, dy_dbias=dy_dbias)
    return {
        key: pd.DataFrame(tensor, index=output_indices[key], columns=feature_names)
        for key, tensor in output_tensors.items()
    }

get_hierarchical_parameters(hierarchy)

Returns the model parameters for every hierarchical level (non-aggregated weights).

Parameters:

    hierarchy (dict[str, TensorLike]): Hierarchy placeholder for the hierarchical embedding variable. Required.

NOTE: this currently does not depend on the training flag. Possibly we will change how things work so that it does.

Returns:

    tuple[tf.Tensor, tf.Tensor]: weights, indices

        the 1st tf.Tensor = A: A[i] corresponds to the multiplicative data for the continuous aspects
                               of the hierarchy in self.columns[i]
        the 2nd tf.Tensor = B: B[i] corresponds to the indices in self.weights that correspond to the
                               correct learned coefficients of the hierarchy in self.columns[i]
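
An illustration of how the categorical indices are combined into a single flat index (hypothetical counts; in the layer the factors come from self.col_counts).

import numpy as np

# For a level ["brand", "region"] with 3 brands and 2 regions:
offsets = np.cumprod([1] + [3])  # -> [1, 3]; brand varies fastest
# A row with brand=2, region=1 maps to start + 1*2 + 3*1 = start + 5,
# i.e. the flat position of that (brand, region) pair inside this level's weight block.
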
Source code in wt_ml/layers/hier_embedding.py
def get_hierarchical_parameters(self, hierarchy: Mapping[str, TensorLike]) -> tuple[tf.Tensor, tf.Tensor]:
    """Returns the model parameters' for every hierarchical level (non-aggregated weights)

    Args:
        hierarchy (dict[str, TensorLike]): Hierarchy placeholder for Hierarchial embedding variable.

    NOTE: this currently does not depend on training flag. Possible we change how things work such that it will.

    Returns:

        tuple[tf.Tensor, tf.Tensor]: weights, indices

            the 1st list[tf.Tensor]=A: A[i] corresponds to the multiplicative data for the continuous aspects
                                            of the hierarchy in self.columns[i]
            the 2nd list[tf.Tensor]=B: B[i] corresponds to the indices in self.weights that corresponds to the
                                            correct learned coefficients of the hierarchy in self.columns[i]
    """
    # Shape is [count, ...] for both of these
    weights = []
    indices = []
    for col_names in self.columns:
        if isinstance(col_names, str):
            # We want to assume col_names is a list of column names
            col_names = [col_names]
        num_cols = len(col_names)
        cat_cols = [col for col in col_names if self.encodings[col] != "continuous"]
        cont_cols = [col for col in col_names if self.encodings[col] == "continuous"]
        num_cat_cols = len(cat_cols)
        num_cont_cols = num_cols - num_cat_cols

        name = self.stitched_cols(col_names)
        # The start of the region for this weight
        start = self.offsets[name]
        if num_cont_cols == 0:
            shape = tf.shape(hierarchy[cat_cols[0]])
            weight = tf.ones(shape, dtype=tf.float32, name=f"{name}_weights")
        else:
            # you can only have 1 cont col in col_names
            weight = hierarchy[cont_cols[0]]

        if num_cat_cols == 0:
            index = tf.cast(
                tf.fill(tf.shape(weight), start),
                dtype=tf.int64,
                name=f"{name}_indices",
            )
            # if no categorical columns, we have the value of continuous column as the index
        else:
            # The standard encoding of left to right indices given base col_counts[col] for each col
            offsets = np.cumprod([1] + [self.col_counts[col] for col in cat_cols[:-1]])

            # The index in weights where we look up the first of the embeddings for this set of columns
            # This lets us concatenate all embeddings into a single weights matrix rather than defining
            # them separately, and deterministically derive the index in this larger weight matrix.
            index = start + tf.math.add_n(
                [
                    # hierarchy[col] is column of dataframe
                    tf.constant(offset, dtype=tf.int64) * tf.cast(hierarchy[col], dtype=tf.int64)
                    for offset, col in zip(offsets, cat_cols)
                ],
                name=f"{name}_indices",
            )

        # store index for hierarchical parameters and the corresponding continuous weightage
        indices.append(index)
        weights.append(weight)

    # len(self.columns), *shape(hierarchy[<any>])
    weights = tf.stack(weights, axis=0, name="weights_stacked")
    indices = tf.stack(indices, axis=0, name="indices")
    return weights, indices

get_reg_config(col_names)

Creates the name used for regularization and the default value for the penalty multiplier. If all columns are categorical, we can just join their names in order to find the penalty. Otherwise, when different continuous features are paired with the same categorical column, the resulting hierarchical categories share the same penalty. Always suffix the continuous string to the end of the name. Example: brand-DEM and brand-GOP have the same penalty called reg_brand-continuous. Examples of mixed categories: reg_brand-continuous, reg_vehicle-continuous.

Parameters:

    col_names (list[str]): Hierarchical column names. Required.

Returns:

    tuple[str, float]: Regularization penalty name and the default value.
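
A naming sketch under the assumptions of the earlier examples (the emb instance, "brand", and the continuous "income" column are illustrative); note that get_reg_config relies on self.col_counts, so build() must have run.

name, default = emb.get_reg_config(["brand", "income"])
# name == "reg_brand-continuous"; default == 1.0 because the level has more than one category.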

Source code in wt_ml/layers/hier_embedding.py
def get_reg_config(self, col_names: tuple[str] | list[str]) -> tuple[str, float]:
    """Creates name used for regularization and default value for the penalty multiplier.
    If all columns are categorical, we can just join their names in order to find penalty.
    Otherwise, when different continuous features are paired with a same categorical column,
    the resulting hierarchical categories share same penalty. Always, suffix the continuous
    string to the end of the name.
    Example: brand-DEM and brand-GOP have the same penalty called reg_brand-continuous.
    Examples of mixed categories: reg_brand-continuous, reg_vehicle-continuous.

    Args:
        col_names (list[str]): Hierarchical column names.

    Returns:
        tuple[str, float]: Regularization penalty name and the default value.
    """
    count = int(np.prod([self.col_counts[k] for k in col_names]))
    if count == 1:
        # Purely continuous features
        default_value = 0.0
    else:
        default_value = 1.0
    names = [name for name in col_names if not self.is_continuous(name)]
    names.append("continuous") if self.is_continuous(col_names) else None
    reg_name = f"reg_{self.stitched_cols(names)}"
    return reg_name, default_value

get_reg_mult(col_names)

Returns the penalty multiplier for hierarchy level reg loss.

Source code in wt_ml/layers/hier_embedding.py
def get_reg_mult(self, col_names: list[str] | tuple[str]) -> float:
    """Returns the penalty multiplier for hierarchy level reg loss."""
    reg_name, default = self.get_reg_config(col_names)
    mult = self.hyperparameters.get_float(
        name=reg_name,
        default=default,
        help="Penalty multiplier for hierarchy level reg loss.",
    )
    return mult

get_tensors(dy_dweights=None, dy_dbias=None)

Get the learned weights for a HierarchicalEmbedding layer
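
A quick sketch of the raw-tensor interface, using the same hypothetical emb instance as above.

tensors, indices, feature_names = emb.get_tensors()
# tensors["bias"] has shape (1, n_features); tensors["brand"] has one row per brand label.
# indices[key] is the pandas Index/MultiIndex used when get_dfs() builds the DataFrames.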

Source code in wt_ml/layers/hier_embedding.py
def get_tensors(
    self, dy_dweights: tf.Tensor | tf.Variable | None = None, dy_dbias: tf.Tensor | tf.Variable | None = None
) -> tuple[dict[str, tf.Tensor], dict[str, list[dict] | pd.MultiIndex], list[str | int]]:
    """Get the learned weights for a HierarchicalEmbedding layer"""
    output_tensors: dict[str, tf.Tensor] = {}
    output_indices: dict[str, pd.Index | pd.MultiIndex] = {}

    weights = self.weights if dy_dweights is None else dy_dweights
    feature_names = self._get_feature_names(weights)
    if self.use_bias:
        bias = self.bias if dy_dbias is None else dy_dbias
        self._process_bias(bias, output_tensors, output_indices)
    self._process_columns_in_tensors(output_tensors, output_indices, weights)

    return output_tensors, output_indices, feature_names

stitched_cols(col_names)

Returns a string representation of the columns.
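
For example, using the hypothetical column names from above:

emb.stitched_cols(["brand", "region"])  # -> "brand-region"
emb.stitched_cols("brand")              # -> "brand"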

Source code in wt_ml/layers/hier_embedding.py
def stitched_cols(self, col_names: str | list[str] | tuple) -> str:
    """Returns a string representation of the columns."""
    return col_names if isinstance(col_names, str) else "-".join(col_names)