AllUniqueError

Bases: ValueError

When there are no hierarchical columns because everything is unique within them.

Source code in wt_ml/layers/hier_embedding.py
class AllUniqueError(ValueError):
    """When there are no hierchical columns because everything is unique within it."""

BetaGammaDecay

Bases: Module

Class to learn decayed impacts for the ensuing time periods after a spend in a media vehicle
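
The decay is geometric: on top of the instant impact, a spend keeps contributing with weights beta, beta*gamma, beta*gamma^2, ... over a DECAY_LENGTH-week window, so the total lift equals the instant impact times 1 + beta * (1 - gamma**DECAY_LENGTH) / (1 - gamma), which is the decayed_impact_mult computed in __call__ below. A minimal NumPy sketch of that arithmetic, with purely illustrative values:

import numpy as np

beta, gamma = 0.4, 0.6   # illustrative decay parameters inside [beta_min, beta_max] and [gamma_min, gamma_max]
decay_length = 13        # stand-in for DECAY_LENGTH

# Closed-form multiplier applied to the instant impact (matches decayed_impact_mult below).
total_mult = 1 + beta * (1 - gamma**decay_length) / (1 - gamma)

# Equivalent week-by-week view: the instant impact plus a geometric tail.
instant_impact = 100.0
tail = [instant_impact * beta * gamma**k for k in range(decay_length)]
print(total_mult, instant_impact * total_mult, instant_impact + sum(tail))  # ~1.999, ~199.9, ~199.9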

Source code in wt_ml/layers/beta_gamma_decay.py
class BetaGammaDecay(Module):
    """Class to learn decayed impacts for the ensuing time periods after a spend in a media vehicle"""

    def __init__(
        self,
        encodings: dict[str, Any],
        hierarchy_categories: list[str | list[str]] | None = None,
        hyperparameters: Hyperparams | None = None,
        name: str | None = None,
    ):
        """Creates a betagammadecay object to learn decayed impacts using beta and gamma parameters.

        Args:
            hierarchy (pd.DataFrame): The hierarchy that the impact learns on.
            hyperparameters (Hyperparams | None, optional): Dictionary of hyperparameters for buidling this layer.
                                                            Defaults to None.
            name (str | None, optional): Name of the layer. Defaults to None.
        """
        super().__init__(hyperparameters=hyperparameters, name=name)
        self.encodings = encodings
        self.hierarchy_categories = hierarchy_categories

    def build(self, input_shapes):  # noqa: U100
        """Build the layer parameters needed for calculating decays.

        Args:
            input_shapes (InputShapes): The effect and hierarchy shapes.
        """
        self.gamma_min = self.hyperparameters.get_float(
            "gamma_min",
            default=0.01,
            min=0.00,
            max=1.0,
            help="The minimum possible value to learn for the exponential decay factor.",
        )
        self.gamma_max = self.hyperparameters.get_float(
            "gamma_max",
            default=1.0,
            min=self.gamma_min,
            max=1.0,
            help="The maximum possible value to learn for the exponential decay factor.",
        )
        self.beta_min = self.hyperparameters.get_float(
            "beta_min",
            default=0.01,
            min=0.00,
            max=1.0,
            help="The minimum possible value to learn for the first step of the decay.",
        )
        self.beta_max = self.hyperparameters.get_float(
            "beta_max",
            default=1.0,
            min=self.beta_min,
            max=1.0,
            help="The maximum possible value to learn for the first step of the decay.",
        )
        self.betagamma_emb_layer = self.hyperparameters.get_submodule(
            name="betagamma_hier",
            module_type=HierchicalEmbedding,
            kwargs=dict(
                encodings=self.encodings,
                columns=self.hierarchy_categories,
                dropped_columns=[],
                shape=[2],
                feature_names=["beta", "gamma"],
            ),
            help="The embedding layer for the decay parameters.",
        )

    @property
    def gamma_range(self) -> float:
        return self.gamma_max - self.gamma_min

    @property
    def beta_range(self) -> float:
        return self.beta_max - self.beta_min

    def __call__(
        self, batch: BetaGammaDecayInput, training: bool = False, debug: bool = False, skip_metrics: bool = False
    ) -> BetaGammaDecayIntermediaries:
        """Calculate decays, total impacts using the learned beta gamma parameters

        Args:
            impact_by_signal_instant (TensorLike): Instant impacts
            hierarchy (dict[str, TensorLike]): Hierarchy placeholder for Hierarchial embedding variable.
            training (bool, optional): Whether this is a training or inference run. Defaults to False.

        Returns:
            BetaGammaDecayIntermediaries: Intermediate calculations for beta gamma decay - beta, gamma, impacts etc.
        """
        # batch x vehicles x 2
        betagamma_emb = self.betagamma_emb_layer(
            batch.hierarchy, training=training, skip_metrics=skip_metrics, debug=debug
        )
        beta_emb, gamma_emb = tf.unstack(betagamma_emb, num=2, axis=2)
        if self.gamma_max < 1:
            gamma = 1.0 - 1.0 / (
                transform_softbounded(
                    gamma_emb,
                    max_val=3.0,
                    min_val=-3,
                    name="gamma",
                    add_loss=self.add_loss,
                    mult=0.1,
                    enabled=not skip_metrics,
                )
                * (1 / (1 - self.gamma_max) - 1 / (1 - self.gamma_min))
                + 1 / (1 - self.gamma_min)
            )
        else:
            gamma = 1.0 - 1.0 / (softplus(gamma_emb) + 1 / (1 - self.gamma_min))
        # batch x vehicles
        beta = (
            transform_softbounded(
                beta_emb,
                max_val=3.0,
                min_val=-3,
                name="beta",
                mult=0.1,
                add_loss=self.add_loss,
                enabled=not skip_metrics,
            )
            * self.beta_range
            + self.beta_min
        )
        # batch x 1 x vehicle
        decayed_impact_mult = tf.expand_dims(1 + beta * (1 - tf.math.pow(gamma, DECAY_LENGTH)) / (1 - gamma), 1)
        impact_by_signal_total = batch.impact_by_signal_instant * decayed_impact_mult
        impact_by_signal_decayed = exp_moving_avg(
            batch.impact_by_signal_instant,
            beta,
            gamma,
            name="impact_by_signal_decayed",
            decay_length=DECAY_LENGTH,
        )
        impact = tf.math.reduce_sum(impact_by_signal_decayed, axis=2)
        return BetaGammaDecayIntermediaries(
            beta_emb=beta_emb if debug else None,
            gamma_emb=gamma_emb if debug else None,
            beta=beta,
            gamma=gamma,
            decayed_impact_mult=decayed_impact_mult,
            impact_by_signal_total=impact_by_signal_total,
            impact_by_signal=impact_by_signal_decayed,
            impact=impact,
            signal_names=tf.gather(
                tf.convert_to_tensor(tuple(f"{vehicle}_decayed" for vehicle in get_lookups(self.encodings["vehicle"]))),
                batch.hierarchy["vehicle"][0],
            ),
        )

__call__(batch, training=False, debug=False, skip_metrics=False)

Calculate decays and total impacts using the learned beta and gamma parameters.

Parameters:

    batch (BetaGammaDecayInput): Input holding the instant impacts (impact_by_signal_instant) and the hierarchy
        placeholders for the Hierarchical embedding variable. Required.
    training (bool): Whether this is a training or inference run. Defaults to False.

Returns:

    BetaGammaDecayIntermediaries: Intermediate calculations for beta gamma decay - beta, gamma, impacts etc.

Source code in wt_ml/layers/beta_gamma_decay.py
def __call__(
    self, batch: BetaGammaDecayInput, training: bool = False, debug: bool = False, skip_metrics: bool = False
) -> BetaGammaDecayIntermediaries:
    """Calculate decays, total impacts using the learned beta gamma parameters

    Args:
        impact_by_signal_instant (TensorLike): Instant impacts
        hierarchy (dict[str, TensorLike]): Hierarchy placeholder for Hierarchial embedding variable.
        training (bool, optional): Whether this is a training or inference run. Defaults to False.

    Returns:
        BetaGammaDecayIntermediaries: Intermediate calculations for beta gamma decay - beta, gamma, impacts etc.
    """
    # batch x vehicles x 2
    betagamma_emb = self.betagamma_emb_layer(
        batch.hierarchy, training=training, skip_metrics=skip_metrics, debug=debug
    )
    beta_emb, gamma_emb = tf.unstack(betagamma_emb, num=2, axis=2)
    if self.gamma_max < 1:
        gamma = 1.0 - 1.0 / (
            transform_softbounded(
                gamma_emb,
                max_val=3.0,
                min_val=-3,
                name="gamma",
                add_loss=self.add_loss,
                mult=0.1,
                enabled=not skip_metrics,
            )
            * (1 / (1 - self.gamma_max) - 1 / (1 - self.gamma_min))
            + 1 / (1 - self.gamma_min)
        )
    else:
        gamma = 1.0 - 1.0 / (softplus(gamma_emb) + 1 / (1 - self.gamma_min))
    # batch x vehicles
    beta = (
        transform_softbounded(
            beta_emb,
            max_val=3.0,
            min_val=-3,
            name="beta",
            mult=0.1,
            add_loss=self.add_loss,
            enabled=not skip_metrics,
        )
        * self.beta_range
        + self.beta_min
    )
    # batch x 1 x vehicle
    decayed_impact_mult = tf.expand_dims(1 + beta * (1 - tf.math.pow(gamma, DECAY_LENGTH)) / (1 - gamma), 1)
    impact_by_signal_total = batch.impact_by_signal_instant * decayed_impact_mult
    impact_by_signal_decayed = exp_moving_avg(
        batch.impact_by_signal_instant,
        beta,
        gamma,
        name="impact_by_signal_decayed",
        decay_length=DECAY_LENGTH,
    )
    impact = tf.math.reduce_sum(impact_by_signal_decayed, axis=2)
    return BetaGammaDecayIntermediaries(
        beta_emb=beta_emb if debug else None,
        gamma_emb=gamma_emb if debug else None,
        beta=beta,
        gamma=gamma,
        decayed_impact_mult=decayed_impact_mult,
        impact_by_signal_total=impact_by_signal_total,
        impact_by_signal=impact_by_signal_decayed,
        impact=impact,
        signal_names=tf.gather(
            tf.convert_to_tensor(tuple(f"{vehicle}_decayed" for vehicle in get_lookups(self.encodings["vehicle"]))),
            batch.hierarchy["vehicle"][0],
        ),
    )
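
The transforms above keep the learned parameters inside their configured ranges. Assuming transform_softbounded squashes its input into (0, 1) (a plain sigmoid stands in for it here, since that helper is internal to wt_ml), the gamma expression interpolates between gamma_min and gamma_max, and beta is a linear rescaling into [beta_min, beta_max]. A small sketch of that algebra:

import numpy as np

def sigmoid(x):
    return 1.0 / (1.0 + np.exp(-x))

gamma_min, gamma_max = 0.01, 0.9
beta_min, beta_max = 0.01, 1.0

def gamma_from_emb(gamma_emb):
    # Same algebra as the gamma_max < 1 branch of __call__, with sigmoid standing in
    # for transform_softbounded (assumed to yield a value in (0, 1)).
    t = sigmoid(gamma_emb)
    return 1.0 - 1.0 / (t * (1 / (1 - gamma_max) - 1 / (1 - gamma_min)) + 1 / (1 - gamma_min))

def beta_from_emb(beta_emb):
    t = sigmoid(beta_emb)
    return t * (beta_max - beta_min) + beta_min

# t -> 0 recovers the minimum and t -> 1 the maximum, so both stay inside their bounds.
print(gamma_from_emb(-10.0), gamma_from_emb(10.0))  # ~0.01, ~0.9
print(beta_from_emb(-10.0), beta_from_emb(10.0))    # ~0.01, ~1.0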

__init__(encodings, hierarchy_categories=None, hyperparameters=None, name=None)

Creates a BetaGammaDecay object to learn decayed impacts using beta and gamma parameters.

Parameters:

    encodings (dict[str, Any]): Encodings of the hierarchy columns used by the embedding layer. Required.
    hierarchy_categories (list[str | list[str]] | None): Hierarchy levels to learn the decay parameters on.
        Defaults to None.
    hyperparameters (Hyperparams | None): Dictionary of hyperparameters for building this layer. Defaults to None.
    name (str | None): Name of the layer. Defaults to None.
Source code in wt_ml/layers/beta_gamma_decay.py
def __init__(
    self,
    encodings: dict[str, Any],
    hierarchy_categories: list[str | list[str]] | None = None,
    hyperparameters: Hyperparams | None = None,
    name: str | None = None,
):
    """Creates a betagammadecay object to learn decayed impacts using beta and gamma parameters.

    Args:
        hierarchy (pd.DataFrame): The hierarchy that the impact learns on.
        hyperparameters (Hyperparams | None, optional): Dictionary of hyperparameters for buidling this layer.
                                                        Defaults to None.
        name (str | None, optional): Name of the layer. Defaults to None.
    """
    super().__init__(hyperparameters=hyperparameters, name=name)
    self.encodings = encodings
    self.hierarchy_categories = hierarchy_categories
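
For reference, encodings and hierarchy_categories are consumed by the HierchicalEmbedding documented further below: encodings maps each hierarchy column either to a {value: integer_code} lookup or to the string "continuous", and hierarchy_categories lists the levels (single columns or crossed column groups) that get their own beta/gamma deviations. A hypothetical construction, with illustrative names only:

from wt_ml.layers.beta_gamma_decay import BetaGammaDecay  # assumed import path, matching the source path above

# Hypothetical encodings: categorical columns map raw values to integer codes,
# continuous columns are flagged with the string "continuous".
encodings = {
    "vehicle": {"tv": 0, "search": 1, "social": 2},
    "brand": {"brand_a": 0, "brand_b": 1},
    "price_index": "continuous",
}

# One decay per vehicle plus a vehicle-by-brand correction on top of it.
hierarchy_categories = ["vehicle", ["vehicle", "brand"]]

decay_layer = BetaGammaDecay(encodings=encodings, hierarchy_categories=hierarchy_categories)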

build(input_shapes)

Build the layer parameters needed for calculating decays.

Parameters:

    input_shapes (InputShapes): The effect and hierarchy shapes. Required.
Source code in wt_ml/layers/beta_gamma_decay.py
def build(self, input_shapes):  # noqa: U100
    """Build the layer parameters needed for calculating decays.

    Args:
        input_shapes (InputShapes): The effect and hierarchy shapes.
    """
    self.gamma_min = self.hyperparameters.get_float(
        "gamma_min",
        default=0.01,
        min=0.00,
        max=1.0,
        help="The minimum possible value to learn for the exponential decay factor.",
    )
    self.gamma_max = self.hyperparameters.get_float(
        "gamma_max",
        default=1.0,
        min=self.gamma_min,
        max=1.0,
        help="The maximum possible value to learn for the exponential decay factor.",
    )
    self.beta_min = self.hyperparameters.get_float(
        "beta_min",
        default=0.01,
        min=0.00,
        max=1.0,
        help="The minimum possible value to learn for the first step of the decay.",
    )
    self.beta_max = self.hyperparameters.get_float(
        "beta_max",
        default=1.0,
        min=self.beta_min,
        max=1.0,
        help="The maximum possible value to learn for the first step of the decay.",
    )
    self.betagamma_emb_layer = self.hyperparameters.get_submodule(
        name="betagamma_hier",
        module_type=HierchicalEmbedding,
        kwargs=dict(
            encodings=self.encodings,
            columns=self.hierarchy_categories,
            dropped_columns=[],
            shape=[2],
            feature_names=["beta", "gamma"],
        ),
        help="The embedding layer for the decay parameters.",
    )

HierchicalEmbedding

Bases: Module

Hierarchical Embedding creates embeddings for a layer with different input hierarchy levels as trainable weights such that the deviations from the expected deviations are penalized. These trained embeddings are used to calculate the model parameters for a layer.
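
In other words, every hierarchy level keeps its own embedding rows (plus a shared bias), and an entity's parameter is the sum of the rows its hierarchy selects, so coarse levels carry shared behaviour and finer levels only learn regularized deviations from it. A toy NumPy illustration of that lookup-and-sum (values are made up, not the layer's real weights):

import numpy as np

# Toy embedding tables for two hierarchy levels plus a bias, each with 2 features.
bias = np.array([0.5, 0.0])
brand_emb = {"brand_a": np.array([0.2, -0.1]), "brand_b": np.array([-0.3, 0.4])}
vehicle_emb = {"tv": np.array([0.1, 0.1]), "search": np.array([0.0, -0.2])}

def parameters(brand, vehicle):
    # The final parameter is the bias plus the contribution of every hierarchy level.
    return bias + brand_emb[brand] + vehicle_emb[vehicle]

print(parameters("brand_a", "tv"))      # [0.8, 0.0]
print(parameters("brand_b", "search"))  # [0.2, 0.2]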

Source code in wt_ml/layers/hier_embedding.py
class HierchicalEmbedding(Module):
    """Hierarchical Embedding creates embeddings for a layer with different input hierarchy levels
    as trainable weights such that the deviations from the expected deviations are penalized.
    These trained embeddings are used to calculate the model parameters for a layer.
    """

    def __init__(
        self,
        shape: list[int],
        encodings: dict[str, Any],
        columns: list[str | list[str]] | None = None,
        use_bias: bool = True,
        dropped_columns=[],
        initializer: Initializer = 0.0,
        bias_initializer: Initializer = 0.0,
        hyperparameters: Hyperparams | None = None,
        feature_names: list[list[str]] | list[str] | None = None,
        name: str | None = None,
        increase_lr: float | None = None,
    ):
        """Initializes the hierarchical embedding object with hierarchy levels, parameter shape
        and other initializers.

        Args:
            shape (list[int]): Desired dimensions of model parameters only within final result.
            hierarchy (pd.DataFrame): The hierarchy for which embeddings are trained.
            columns (list[str  |  list[str]] | None, optional): Hierarchy levels to learn embeddings. Defaults to None.
            use_bias (bool, optional): Whether to include bias. Defaults to True.
            dropped_columns (list, optional): Columns to exclude in hierarchy.
                                        Defaults to ["granular", "region", "coastal", "populationdensity", "medianage"].
            initializer (Initializer, optional): Initializer for embeddings(weights). Defaults to 0.0.
            bias_initializer (Initializer, optional): Initializer for bias. Defaults to 0.0.
            hyperparameters (Hyperparams | None, optional): Dictionary of hyperparameters for buidling this layer.
                                                            Defaults to None.
            name (str | None, optional): Name of the layer. Defaults to None.
        """
        super().__init__(hyperparameters=hyperparameters, name=name)
        self.use_bias = use_bias
        encodings_dropped = {k: v for k, v in encodings.items() if k not in dropped_columns}
        assert encodings_dropped, "No cols in hierarchy."
        self.encodings = encodings_dropped
        self.shape = shape
        self.increase_lr = increase_lr
        self.initializer = initializer
        self.bias_initializer = bias_initializer
        self.created_reg = False
        self.feature_names = feature_names
        if self.feature_names is not None and isinstance(self.feature_names[0], list):
            self.feature_names = [elem for sublist in self.feature_names for elem in sublist]
        if columns is not None:
            self._process_columns(columns)
        else:
            self.columns = list(self.encodings.keys())
            self.used_cols = set(self.columns)

    def _process_columns(self, columns: list[str | list[str]]):
        """
        Process columns and remove duplicates or column(s) which have unique hierarchies.
        Sets `columns` and `used_cols` attribute.

        Args:
            columns (list[str  |  list[str]]): Hierarchy levels to learn embeddings.
        """
        used_cols = set(tf.nest.flatten(columns))
        missing_cols = used_cols.difference(self.encodings.keys())
        assert not missing_cols, f"Column(s) passed not in hierarchy. {missing_cols}"

        issues: list[str] = []
        new_columns: list[str | tuple[str]] = []
        for column in columns:
            if isinstance(column, (list, tuple)):
                new_column = []
                for sub_col in column:
                    encodings = self.encodings[sub_col]
                    if encodings == "continuous" or len(encodings) > 1:
                        new_column.append(sub_col)
                    else:
                        issues.append(f"{sub_col} in {column} has single encoding.")

                if len(new_column) == 1:
                    # convert to str so duplicates can be detected easily.
                    new_columns.append(new_column[0])
                elif len(new_column) > 1:
                    new_columns.append(tuple(new_column))
                else:
                    issues.append(f"Dropping {column} as it has unique encodings.")
            else:
                encodings = self.encodings[column]
                if encodings == "continuous" or len(encodings) > 1:
                    new_columns.append(column)
                else:
                    issues.append(f"Dropping {column} as it has single encoding.")

        org_col_len = len(new_columns)
        # if any column is duplicated, we need to get rid of it.
        new_columns = list(dict.fromkeys(new_columns))
        assert len(new_columns), "All columns are dropped since they are all unique."
        if len(new_columns) != org_col_len:
            issues.append("Duplicate hierarchies removed.")

        if issues:
            warn_issues(self.name, issues, new_columns, columns)

        self.columns = new_columns
        self.used_cols = set(tf.nest.flatten(self.columns))

    def build(self, input_shapes):  # noqa: U100
        """Builds hyperparamters, deviations, embeddings(weights), bias and other intermediate variables.

        Args:
            input_shapes (InputShapes): The effect and hierarchy shapes.

        Raises:
            AllUniqueError: When there are no hierarchical columns because everything is unique within them.
        """
        self.use_l2_squared = self.hyperparameters.get_bool(
            "use_l2_squared",
            default=False,
            help="Use the l2 norm to the fourth power instead of only using it for large values for stability.",
        )
        self.desired_stddev = self.hyperparameters.get_float(
            "desired_stddev",
            default=0.10,
            min=0.01,
            max=100.0,
            help="The desired maximum value for the stddev along the full hierarchy.",
        )
        self.use_inv_sqrt = self.hyperparameters.get_bool(
            "use_inv_sqrt",
            default=True,
            help="Scale the stddev for each category by the inverse square root of the number of unique values.",
        )

        if self.use_bias:
            self.reg_bias = self.hyperparameters.get_float(
                "reg_bias",
                default=0.0,
                min=0.0,
                max=1e4,
                help="The strength of l2 regularization to apply to the bias term.",
            )
        self.offsets = {}
        self.col_counts = {
            k: (
                self.encodings[k]
                if isinstance(self.encodings[k], (float, int))
                else ((max(self.encodings[k].values()) + 1) if not isinstance(self.encodings[k], str) else 1)
            )
            for k in tf.nest.flatten(self.columns)
        }
        var_counts = [
            self.col_counts[col] if isinstance(col, str) else np.prod([self.col_counts[k] for k in col])
            for col in self.columns
        ]

        self.columns = [col for col, count in zip(self.columns, var_counts) if count > 1 or self.is_continuous(col)]
        count = 0
        desired_stddevs = []
        # Scatters is the inverse of gathering from num_regularized_categories + 1 to count of weights
        self.scatters = []
        self.penalty_mults = []
        multipliers = []
        reg_counts = []
        flattened = int(np.prod(self.shape)) if len(self.shape) > 0 else 1
        for col_names in self.columns:
            if isinstance(col_names, str):
                col_names = [col_names]
            cat_cols = [col for col in col_names if self.encodings[col] != "continuous"]
            name = self.stitched_cols(col_names)
            num_cont_cols = len(col_names) - len(cat_cols)
            if num_cont_cols > 1:
                raise ValueError(
                    "You can only have one continuous hierarchical variable within a single hierarchical level"
                )
            number = int(np.prod([self.col_counts[k] for k in col_names]))
            multipliers.append(1 / number)
            self.scatters += [len(multipliers)] * number
            desired_stddevs.append(1 / np.sqrt(number) if self.use_inv_sqrt else 1)
            reg_counts.append(max(1, number - 1))
            self.offsets[name] = count
            count += number
            self.penalty_mults.append(self.get_reg_mult(col_names))

        if count == 0 or len(multipliers) == 0:
            raise AllUniqueError("There are no hierarchical columns; everything is unique.")
        # scatters is shape (count,)
        self.scatters = np.array(self.scatters)
        # multipliers is shape (1 + regularized_counts,)
        self.multipliers = np.array([0] + multipliers, dtype=np.float32)
        self.penalty_mults = np.array(self.penalty_mults, dtype=np.float32)
        self.dense_shape = [len(self.multipliers), flattened]
        # desired_* is shape (regularized_counts,)
        self.desired_stddevs = (
            self.desired_stddev * np.array(desired_stddevs or [1], dtype=np.float32) / np.sqrt(max(1, len(multipliers)))
        )
        self.desired_l2norms = (
            np.array(reg_counts, dtype=np.float32) * self.desired_stddevs**2 / (1 if self.use_l2_squared else 2)
        )
        self.weights = self.create_var(
            "weights", shape=[count, flattened], dtype=tf.float32, trainable=True, initializer=self.initializer
        )
        if self.use_bias:
            self.bias = self.create_var(
                "bias", shape=[flattened], dtype=tf.float32, trainable=True, initializer=self.bias_initializer
            )

    def stitched_cols(self, col_names: str | list[str] | tuple) -> str:
        """Returns a string representation of the columns."""
        return col_names if isinstance(col_names, str) else "-".join(col_names)

    def get_reg_config(self, col_names: tuple[str] | list[str]) -> tuple[str, float]:
        """Creates name used for regularization and default value for the penalty multiplier.
        If all columns are categorical, we can just join their names in order to find penalty.
        Otherwise, when different continuous features are paired with a same categorical column,
        the resulting hierarchical categories share same penalty. Always, suffix the continuous
        string to the end of the name.
        Example: brand-DEM and brand-GOP have the same penalty called reg_brand-continuous.
        Examples of mixed categories: reg_brand-continuous, reg_vehicle-continuous.

        Args:
            col_names (list[str]): Hierarchical column names.

        Returns:
            tuple[str, float]: Regularization penalty name and the default value.
        """
        count = int(np.prod([self.col_counts[k] for k in col_names]))
        if count == 1:
            # Purely continuous features
            default_value = 0.0
        else:
            default_value = 1.0
        names = [name for name in col_names if not self.is_continuous(name)]
        names.append("continuous") if self.is_continuous(col_names) else None
        reg_name = f"reg_{self.stitched_cols(names)}"
        return reg_name, default_value

    def get_reg_mult(self, col_names: list[str] | tuple[str]) -> float:
        """Returns the penalty multiplier for hierarchy level reg loss."""
        reg_name, default = self.get_reg_config(col_names)
        mult = self.hyperparameters.get_float(
            name=reg_name,
            default=default,
            help="Penalty multiplier for hierarchy level reg loss.",
        )
        return mult

    def is_continuous(self, k: str | Iterable[str]) -> bool:
        return is_continuous(k, self.encodings)

    def get_hierarchical_parameters(self, hierarchy: Mapping[str, TensorLike]) -> tuple[tf.Tensor, tf.Tensor]:
        """Returns the model parameters' for every hierarchical level (non-aggregated weights)

        Args:
            hierarchy (dict[str, TensorLike]): Hierarchy placeholder for Hierarchical embedding variable.

        NOTE: this currently does not depend on training flag. Possible we change how things work such that it will.

        Returns:

            tuple[tf.Tensor, tf.Tensor]: weights, indices

                the 1st list[tf.Tensor]=A: A[i] corresponds to the multiplicative data for the continuous aspects
                                                of the hierarchy in self.columns[i]
                the 2nd list[tf.Tensor]=B: B[i] corresponds to the indices in self.weights that corresponds to the
                                                correct learned coefficients of the hierarchy in self.columns[i]
        """
        # Shape is [count, ...] for both of these
        weights = []
        indices = []
        for col_names in self.columns:
            if isinstance(col_names, str):
                # We want to assume col_names is a list of column names
                col_names = [col_names]
            num_cols = len(col_names)
            cat_cols = [col for col in col_names if self.encodings[col] != "continuous"]
            cont_cols = [col for col in col_names if self.encodings[col] == "continuous"]
            num_cat_cols = len(cat_cols)
            num_cont_cols = num_cols - num_cat_cols

            name = self.stitched_cols(col_names)
            # The start of the region for this weight
            start = self.offsets[name]
            if num_cont_cols == 0:
                shape = tf.shape(hierarchy[cat_cols[0]])
                weight = tf.ones(shape, dtype=tf.float32, name=f"{name}_weights")
            else:
                # you can only have 1 cont col in col_names
                weight = hierarchy[cont_cols[0]]

            if num_cat_cols == 0:
                index = tf.cast(
                    tf.fill(tf.shape(weight), start),
                    dtype=tf.int64,
                    name=f"{name}_indices",
                )
                # with no categorical columns, every entry points at this level's single row (the continuous value is the weight)
            else:
                # The standard encoding of left to right indices given base col_counts[col] for each col
                offsets = np.cumprod([1] + [self.col_counts[col] for col in cat_cols[:-1]])

                # The index in weights where we look up the first of the embeddings for this set of columns
                # This lets us concatenate all embeddings into a single weights matrix rather than defining
                # them separately, and deterministically derive the index in this larger weight matrix.
                index = start + tf.math.add_n(
                    [
                        # hierarchy[col] is column of dataframe
                        tf.constant(offset, dtype=tf.int64) * tf.cast(hierarchy[col], dtype=tf.int64)
                        for offset, col in zip(offsets, cat_cols)
                    ],
                    name=f"{name}_indices",
                )

            # store index for hierarchical parameters and the corresponding continuous weightage
            indices.append(index)
            weights.append(weight)

        # len(self.columns), *shape(hierarchy[<any>])
        weights = tf.stack(weights, axis=0, name="weights_stacked")
        indices = tf.stack(indices, axis=0, name="indices")
        return weights, indices

    def __call__(
        self,
        hierarchy: dict[str, TensorLike] | tuple[tf.Tensor, tf.Tensor],
        training: bool = False,  # noqa: U100
        debug: bool = False,  # noqa: U100
        skip_metrics: bool = False,
    ) -> tf.Tensor:
        """Returns the model parameters' embeddings calculated from the weights.
        Adds l2 regularization penalties to loss based on deviations and bias.

        Args:
            hierarchy (dict[str, TensorLike]): Hierarchy placeholder for Hierarchical embedding variable.
            training (bool, optional): Whether this is a training or inference run. Defaults to False.

        Returns:
            tf.Tensor: Model parameters' embeddings.
        """

        if isinstance(hierarchy, tuple):
            weights, indices = hierarchy
        else:
            # get the hierarchical parameters that correspond to the input hierarchy
            # NOTE: weights is the proper multiplicative relationship using continuous hierarchical variables, not
            #       something from self.weights. Probably should change name in future for readability.
            weights, indices = self.get_hierarchical_parameters(hierarchy)
        # Look up embeddings by indices
        # len(self.columns), *shape(hierarchy[<any>]), np.prod(self.shape)

        if self.increase_lr is not None:
            lr_scaled_weights = self.weights * tf.constant(self.increase_lr, dtype=tf.float32)
        else:
            lr_scaled_weights = self.weights

        looked_up = tf.gather(lr_scaled_weights, indices, name="embeds")
        # Optimization and convert to tensor
        # counts,
        scatters = tf.constant(self.scatters, dtype=tf.int64)
        # Do a matrix multiply to sum over columns
        # *shape(hierarchy[<any>]), np.prod(self.shape)
        weighted = tf.einsum("c...f,c...->...f", looked_up, weights, name="weighted")

        # This is num_regularized_categories x flattened using the same scatter trick as for means
        # len(self.multipliers), np.prod(self.shape)
        cur_l2_norm = tf.scatter_nd(
            scatters[:, None],
            tf.math.square(lr_scaled_weights, name="shifted_squared"),
            shape=self.dense_shape,
            name="cur_l2_norm",
        )[1:]
        # We want to apply l2 regularization so that this ratio is pushed to be 1 or less.
        # len(self.multipliers), np.prod(self.shape)
        cur_ratio = cur_l2_norm / tf.constant(self.desired_l2norms[:, None] + EPSILON, dtype=tf.float32)
        # Old negative feedback was roughly cur_ratio ** 2 (in the steady state). This just makes it explicit.
        # We don't care if it is over 0 so we shift down by 1 then up by 1 to get it to be the same scale
        if self.use_l2_squared:
            hier_reg = tf.math.reduce_sum(
                tf.math.square(cur_ratio) * tf.constant(self.penalty_mults[:, None], dtype=tf.float32),
                name="hier_reg",
            )
        else:
            hier_reg = cur_ratio * tf.constant(self.penalty_mults[:, None], dtype=tf.float32)
        if not skip_metrics:
            self.add_loss("hier_reg", hier_reg, category="hier")
        if self.use_bias:
            if self.increase_lr is not None:
                lr_scaled_bias = self.bias * tf.constant(self.increase_lr, dtype=tf.float32)
            else:
                lr_scaled_bias = self.bias

            if self.reg_bias > 0 and not skip_metrics:
                bias_loss = tf.math.reduce_sum(tf.math.square(lr_scaled_bias))
                self.add_loss("reg_bias", bias_loss, category="aux", mult=self.reg_bias)

            result = tf.nn.bias_add(weighted, lr_scaled_bias, name="biased")
        else:
            result = weighted

        # We want to undo the flattening we did for simpler logic.
        initial_shape = [tf.shape(result)[i] for i in range(len(result.shape) - 1)]
        # *shape(hierarchy[<any>]), *self.shape
        return tf.reshape(result, [*initial_shape, *self.shape], name="final_var")

    def get_tensors(
        self, dy_dweights: tf.Tensor | tf.Variable | None = None, dy_dbias: tf.Tensor | tf.Variable | None = None
    ) -> tuple[dict[str, tf.Tensor], dict[str, list[dict] | pd.MultiIndex], list[str | int]]:
        """Get the learned weights for a HierarchicalEmbedding layer"""
        output_tensors: dict[str, tf.Tensor] = {}
        output_indices: dict[str, pd.Index | pd.MultiIndex] = {}

        weights = self.weights if dy_dweights is None else dy_dweights
        feature_names = self._get_feature_names(weights)
        if self.use_bias:
            bias = self.bias if dy_dbias is None else dy_dbias
            self._process_bias(bias, output_tensors, output_indices)
        self._process_columns_in_tensors(output_tensors, output_indices, weights)

        return output_tensors, output_indices, feature_names

    def _get_feature_names(self, weights: tf.Tensor | tf.Variable) -> list[str]:
        n_features = weights.shape[-1]
        feature_names = list(range(n_features)) if self.feature_names is None else self.feature_names
        if len(feature_names) != n_features:
            if n_features % len(feature_names) == 0:
                num_dups = n_features // len(feature_names)
                feature_names = [f"{name}_{i+1}" for name in feature_names for i in range(num_dups)]
            else:
                raise ValueError(
                    f"feature_names must be a list of size {n_features}, but got size {len(feature_names)}"
                )
        return feature_names

    def _process_bias(
        self,
        bias: tf.Tensor | tf.Variable,
        output_tensors: dict[str, tf.Tensor],
        output_indices: dict[str, pd.Index | pd.MultiIndex],
    ):
        output_tensors["bias"] = tf.expand_dims(bias, axis=0)
        output_indices["bias"] = pd.Index(["bias"])

    def _process_columns_in_tensors(
        self,
        output_tensors: dict[str, tf.Tensor],
        output_indices: dict[str, pd.Index | pd.MultiIndex],
        weights: tf.Tensor,
    ):
        n_features = weights.shape[-1]
        for col_names in self.columns:
            if isinstance(col_names, str):
                col_names = [col_names]
            hierarchy, output_index = self._get_hierarchy_and_output_index(col_names)
            learned_weights = self._get_learned_weights(hierarchy, col_names, weights)
            learned_weights = self._reshape_learned_weights_if_needed(learned_weights, n_features)
            output_tensors[self.stitched_cols(col_names)] = learned_weights
            output_indices[self.stitched_cols(col_names)] = output_index

    def _get_hierarchy_and_output_index(
        self, col_names: list[str]
    ) -> tuple[dict[str, NDArray], pd.Index | pd.MultiIndex]:
        cat_cols = [col for col in col_names if self.encodings[col] != "continuous"]
        cont_cols = [col for col in col_names if self.encodings[col] == "continuous"]
        num_cat_cols = len(cat_cols)

        if num_cat_cols == 0:
            hierarchy = {cont_cols[0]: np.asarray([1.0])}
            output_index = pd.Index([cont_cols[0]])
        else:
            midx = pd.MultiIndex.from_product([self.encodings[c].values() for c in cat_cols], names=cat_cols)
            output_index = pd.MultiIndex.from_product([self.encodings[c].keys() for c in cat_cols], names=cat_cols)
            if len(cont_cols) > 0:
                # TODO (@RyanSaxe): why is cont_cols[0] is used? adding a comment will be helpful.
                midx = pd.concat({1.0: pd.DataFrame(index=midx)}, names=[cont_cols[0]]).index
            hierarchy = {h: midx.get_level_values(h).to_numpy() for h in midx.names}

        return hierarchy, output_index

    def _get_learned_weights(
        self, hierarchy: dict[str, NDArray], col_names: list[str], weights: tf.Tensor
    ) -> tf.Tensor:
        name = self.stitched_cols(col_names)
        start = self.offsets[name]
        cat_cols = [col for col in col_names if self.encodings[col] != "continuous"]
        cont_cols = [col for col in col_names if self.encodings[col] == "continuous"]

        if len(cont_cols) == 0:
            shape = tf.shape(hierarchy[list(hierarchy.keys())[0]])
            weight = tf.ones(shape, dtype=tf.float32, name=f"{self.stitched_cols(col_names)}_weights")
        else:
            weight = hierarchy[cont_cols[0]]
        if len(cat_cols) == 0:
            index = tf.cast(
                tf.fill(tf.shape(weight), self.offsets[name]),
                dtype=tf.int64,
                name=f"{name}_indices",
            )
        else:
            offsets = np.cumprod([1] + [self.col_counts[col] for col in cat_cols[:-1]])
            index = start + tf.math.add_n(
                [
                    tf.constant(offset, dtype=tf.int64) * tf.cast(hierarchy[col], dtype=tf.int64)
                    for offset, col in zip(offsets, cat_cols)
                ]
            )

        return tf.gather(weights, index, name="embeds")

    def _reshape_learned_weights_if_needed(self, learned_weights: tf.Tensor, n_features: int) -> tf.Tensor:
        if len(learned_weights.shape) > 2:
            flattened_shape = prod(learned_weights.shape[:-1])
            return tf.reshape(learned_weights, (flattened_shape, n_features))
        return learned_weights

    def get_dfs(
        self, dy_dweights: tf.Tensor | None = None, dy_dbias: tf.Tensor | None = None
    ) -> dict[str, pd.DataFrame]:
        """Get the learned weights for a HierarchicalEmbedding layer as a DataFrame"""
        # NOTE: separated this function so we could more easily differentiate
        output_tensors, output_indices, feature_names = self.get_tensors(dy_dweights=dy_dweights, dy_dbias=dy_dbias)
        return {
            key: pd.DataFrame(tensor, index=output_indices[key], columns=feature_names)
            for key, tensor in output_tensors.items()
        }

    @property
    def dfs(self) -> dict[str, pd.DataFrame]:
        return self.get_dfs()

__call__(hierarchy, training=False, debug=False, skip_metrics=False)

Returns the model parameters' embeddings calculated from the weights. Adds l2 regularization penalties to loss based on deviations and bias.

Parameters:

    hierarchy (dict[str, TensorLike] | tuple[tf.Tensor, tf.Tensor]): Hierarchy placeholder for the Hierarchical
        embedding variable. Required.
    training (bool): Whether this is a training or inference run. Defaults to False.

Returns:

    tf.Tensor: Model parameters' embeddings.
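
Internally, the embedding rows selected for each hierarchy level are combined with tf.einsum("c...f,c...->...f", ...): every level's looked-up embedding (axis c) is scaled by that level's continuous weight (1.0 for purely categorical levels) and the levels are summed. A self-contained illustration of that contraction:

import tensorflow as tf

# 2 hierarchy levels (c), 3 entities in the batch (...), 2 features (f).
looked_up = tf.constant(
    [[[0.2, -0.1], [0.2, -0.1], [-0.3, 0.4]],   # level 1 (e.g. brand) embeddings per entity
     [[0.1, 0.1], [0.0, -0.2], [0.1, 0.1]]],    # level 2 (e.g. vehicle) embeddings per entity
    dtype=tf.float32,
)
# Multiplicative weights per level and entity: all ones for categorical levels.
weights = tf.ones([2, 3], dtype=tf.float32)

# Sum over the level axis; result has shape (3, 2): one combined embedding per entity.
weighted = tf.einsum("c...f,c...->...f", looked_up, weights)
print(weighted.numpy())  # first row is [0.3, 0.0], i.e. brand + vehicle contributions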

Source code in wt_ml/layers/hier_embedding.py
def __call__(
    self,
    hierarchy: dict[str, TensorLike] | tuple[tf.Tensor, tf.Tensor],
    training: bool = False,  # noqa: U100
    debug: bool = False,  # noqa: U100
    skip_metrics: bool = False,
) -> tf.Tensor:
    """Returns the model parameters' embeddings calculated from the weights.
    Adds l2 regularization penalties to loss based on deviations and bias.

    Args:
        hierarchy (dict[str, TensorLike]): Hierarchy placeholder for Hierarchical embedding variable.
        training (bool, optional): Whether this is a training or inference run. Defaults to False.

    Returns:
        tf.Tensor: Model parameters' embeddings.
    """

    if isinstance(hierarchy, tuple):
        weights, indices = hierarchy
    else:
        # get the hierarchical parameters that correspond to the input hierarchy
        # NOTE: weights is the proper multiplicative relationship using continuous hierarchical variables, not
        #       something from self.weights. Probably should change name in future for readability.
        weights, indices = self.get_hierarchical_parameters(hierarchy)
    # Look up embeddings by indices
    # len(self.columns), *shape(hierarchy[<any>]), np.prod(self.shape)

    if self.increase_lr is not None:
        lr_scaled_weights = self.weights * tf.constant(self.increase_lr, dtype=tf.float32)
    else:
        lr_scaled_weights = self.weights

    looked_up = tf.gather(lr_scaled_weights, indices, name="embeds")
    # Optimization and convert to tensor
    # counts,
    scatters = tf.constant(self.scatters, dtype=tf.int64)
    # Do a matrix multiply to sum over columns
    # *shape(hierarchy[<any>]), np.prod(self.shape)
    weighted = tf.einsum("c...f,c...->...f", looked_up, weights, name="weighted")

    # This is num_regularized_categories x flattened using the same scatter trick as for means
    # len(self.multipliers), np.prod(self.shape)
    cur_l2_norm = tf.scatter_nd(
        scatters[:, None],
        tf.math.square(lr_scaled_weights, name="shifted_squared"),
        shape=self.dense_shape,
        name="cur_l2_norm",
    )[1:]
    # We want to apply l2 regularization so that this ratio is pushed to be 1 or less.
    # len(self.multipliers), np.prod(self.shape)
    cur_ratio = cur_l2_norm / tf.constant(self.desired_l2norms[:, None] + EPSILON, dtype=tf.float32)
    # Old negative feedback was roughly cur_ratio ** 2 (in the steady state). This just makes it explicit.
    # We don't care if it is over 0 so we shift down by 1 then up by 1 to get it to be the same scale
    if self.use_l2_squared:
        hier_reg = tf.math.reduce_sum(
            tf.math.square(cur_ratio) * tf.constant(self.penalty_mults[:, None], dtype=tf.float32),
            name="hier_reg",
        )
    else:
        hier_reg = cur_ratio * tf.constant(self.penalty_mults[:, None], dtype=tf.float32)
    if not skip_metrics:
        self.add_loss("hier_reg", hier_reg, category="hier")
    if self.use_bias:
        if self.increase_lr is not None:
            lr_scaled_bias = self.bias * tf.constant(self.increase_lr, dtype=tf.float32)
        else:
            lr_scaled_bias = self.bias

        if self.reg_bias > 0 and not skip_metrics:
            bias_loss = tf.math.reduce_sum(tf.math.square(lr_scaled_bias))
            self.add_loss("reg_bias", bias_loss, category="aux", mult=self.reg_bias)

        result = tf.nn.bias_add(weighted, lr_scaled_bias, name="biased")
    else:
        result = weighted

    # We want to undo the flattening we did for simpler logic.
    initial_shape = [tf.shape(result)[i] for i in range(len(result.shape) - 1)]
    # *shape(hierarchy[<any>]), *self.shape
    return tf.reshape(result, [*initial_shape, *self.shape], name="final_var")
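
The hier_reg term above leans on tf.scatter_nd summing updates that land on the same index: self.scatters maps every weight row to its hierarchy-level id (index 0 is a throwaway bucket that gets sliced off), so scattering the squared weights yields one l2 norm per level in a single op. A small sketch of that trick with two levels of 2 and 3 rows:

import tensorflow as tf

# Rows 0-1 belong to level 1, rows 2-4 to level 2 (bucket 0 stays unused).
scatters = tf.constant([1, 1, 2, 2, 2], dtype=tf.int64)
weights = tf.constant([[1.0], [2.0], [1.0], [1.0], [3.0]], dtype=tf.float32)

per_level_l2 = tf.scatter_nd(
    scatters[:, None],       # duplicate indices are summed by scatter_nd
    tf.math.square(weights),
    shape=[3, 1],            # 1 + number of regularized levels, flattened feature dim
)[1:]                        # drop the unused bucket at index 0
print(per_level_l2.numpy())  # [[5.], [11.]] -> sum of squared weights per level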

__init__(shape, encodings, columns=None, use_bias=True, dropped_columns=[], initializer=0.0, bias_initializer=0.0, hyperparameters=None, feature_names=None, name=None, increase_lr=None)

Initializes the hierarchical embedding object with hierarchy levels, parameter shape and other initializers.

Parameters:

    shape (list[int]): Desired dimensions of model parameters only within final result. Required.
    encodings (dict[str, Any]): Encodings of the hierarchy columns for which embeddings are trained. Required.
    columns (list[str | list[str]] | None): Hierarchy levels to learn embeddings. Defaults to None.
    use_bias (bool): Whether to include bias. Defaults to True.
    dropped_columns (list): Columns to exclude from the hierarchy. Defaults to [].
    initializer (Initializer): Initializer for embeddings (weights). Defaults to 0.0.
    bias_initializer (Initializer): Initializer for bias. Defaults to 0.0.
    hyperparameters (Hyperparams | None): Dictionary of hyperparameters for building this layer. Defaults to None.
    feature_names (list[list[str]] | list[str] | None): Names for the features along the embedding's last dimension.
        Defaults to None.
    name (str | None): Name of the layer. Defaults to None.
    increase_lr (float | None): Multiplier applied to the stored weights and bias, effectively scaling their
        learning rate. Defaults to None.
Source code in wt_ml/layers/hier_embedding.py
def __init__(
    self,
    shape: list[int],
    encodings: dict[str, Any],
    columns: list[str | list[str]] | None = None,
    use_bias: bool = True,
    dropped_columns=[],
    initializer: Initializer = 0.0,
    bias_initializer: Initializer = 0.0,
    hyperparameters: Hyperparams | None = None,
    feature_names: list[list[str]] | list[str] | None = None,
    name: str | None = None,
    increase_lr: float | None = None,
):
    """Initializes the hierarchical embedding object with hierarchy levels, parameter shape
    and other initializers.

    Args:
        shape (list[int]): Desired dimensions of model parameters only within final result.
        hierarchy (pd.DataFrame): The hierarchy for which embeddings are trained.
        columns (list[str  |  list[str]] | None, optional): Hierarchy levels to learn embeddings. Defaults to None.
        use_bias (bool, optional): Whether to include bias. Defaults to True.
        dropped_columns (list, optional): Columns to exclude in hierarchy.
                                    Defaults to ["granular", "region", "coastal", "populationdensity", "medianage"].
        initializer (Initializer, optional): Initializer for embeddings(weights). Defaults to 0.0.
        bias_initializer (Initializer, optional): Initializer for bias. Defaults to 0.0.
        hyperparameters (Hyperparams | None, optional): Dictionary of hyperparameters for buidling this layer.
                                                        Defaults to None.
        name (str | None, optional): Name of the layer. Defaults to None.
    """
    super().__init__(hyperparameters=hyperparameters, name=name)
    self.use_bias = use_bias
    encodings_dropped = {k: v for k, v in encodings.items() if k not in dropped_columns}
    assert encodings_dropped, "No cols in hierarchy."
    self.encodings = encodings_dropped
    self.shape = shape
    self.increase_lr = increase_lr
    self.initializer = initializer
    self.bias_initializer = bias_initializer
    self.created_reg = False
    self.feature_names = feature_names
    if self.feature_names is not None and isinstance(self.feature_names[0], list):
        self.feature_names = [elem for sublist in self.feature_names for elem in sublist]
    if columns is not None:
        self._process_columns(columns)
    else:
        self.columns = list(self.encodings.keys())
        self.used_cols = set(self.columns)

build(input_shapes)

Builds hyperparameters, deviations, embeddings (weights), bias and other intermediate variables.

Parameters:

    input_shapes (InputShapes): The effect and hierarchy shapes. Required.

Raises:

    AllUniqueError: When there are no hierarchical columns because everything is unique within them.

Source code in wt_ml/layers/hier_embedding.py
def build(self, input_shapes):  # noqa: U100
    """Builds hyperparamters, deviations, embeddings(weights), bias and other intermediate variables.

    Args:
        input_shapes (InputShapes): The effect and hierarchy shapes.

    Raises:
        AllUniqueError: When there are no hierarchical columns because everything is unique within them.
    """
    self.use_l2_squared = self.hyperparameters.get_bool(
        "use_l2_squared",
        default=False,
        help="Use the l2 norm to the fourth power instead of only using it for large values for stability.",
    )
    self.desired_stddev = self.hyperparameters.get_float(
        "desired_stddev",
        default=0.10,
        min=0.01,
        max=100.0,
        help="The desired maximum value for the stddev along the full hierarchy.",
    )
    self.use_inv_sqrt = self.hyperparameters.get_bool(
        "use_inv_sqrt",
        default=True,
        help="Scale the stddev for each category by the inverse square root of the number of unique values.",
    )

    if self.use_bias:
        self.reg_bias = self.hyperparameters.get_float(
            "reg_bias",
            default=0.0,
            min=0.0,
            max=1e4,
            help="The strength of l2 regularization to apply to the bias term.",
        )
    self.offsets = {}
    self.col_counts = {
        k: (
            self.encodings[k]
            if isinstance(self.encodings[k], (float, int))
            else ((max(self.encodings[k].values()) + 1) if not isinstance(self.encodings[k], str) else 1)
        )
        for k in tf.nest.flatten(self.columns)
    }
    var_counts = [
        self.col_counts[col] if isinstance(col, str) else np.prod([self.col_counts[k] for k in col])
        for col in self.columns
    ]

    self.columns = [col for col, count in zip(self.columns, var_counts) if count > 1 or self.is_continuous(col)]
    count = 0
    desired_stddevs = []
    # Scatters is the inverse of gathering from num_regularized_categories + 1 to count of weights
    self.scatters = []
    self.penalty_mults = []
    multipliers = []
    reg_counts = []
    flattened = int(np.prod(self.shape)) if len(self.shape) > 0 else 1
    for col_names in self.columns:
        if isinstance(col_names, str):
            col_names = [col_names]
        cat_cols = [col for col in col_names if self.encodings[col] != "continuous"]
        name = self.stitched_cols(col_names)
        num_cont_cols = len(col_names) - len(cat_cols)
        if num_cont_cols > 1:
            raise ValueError(
                "You can only have one continuous hierarchical variable within a single hierarchical level"
            )
        number = int(np.prod([self.col_counts[k] for k in col_names]))
        multipliers.append(1 / number)
        self.scatters += [len(multipliers)] * number
        desired_stddevs.append(1 / np.sqrt(number) if self.use_inv_sqrt else 1)
        reg_counts.append(max(1, number - 1))
        self.offsets[name] = count
        count += number
        self.penalty_mults.append(self.get_reg_mult(col_names))

    if count == 0 or len(multipliers) == 0:
        raise AllUniqueError("There are no hierarchical columns; everything is unique.")
    # scatters is shape (count,)
    self.scatters = np.array(self.scatters)
    # multipliers is shape (1 + regularized_counts,)
    self.multipliers = np.array([0] + multipliers, dtype=np.float32)
    self.penalty_mults = np.array(self.penalty_mults, dtype=np.float32)
    self.dense_shape = [len(self.multipliers), flattened]
    # desired_* is shape (regularized_counts,)
    self.desired_stddevs = (
        self.desired_stddev * np.array(desired_stddevs or [1], dtype=np.float32) / np.sqrt(max(1, len(multipliers)))
    )
    self.desired_l2norms = (
        np.array(reg_counts, dtype=np.float32) * self.desired_stddevs**2 / (1 if self.use_l2_squared else 2)
    )
    self.weights = self.create_var(
        "weights", shape=[count, flattened], dtype=tf.float32, trainable=True, initializer=self.initializer
    )
    if self.use_bias:
        self.bias = self.create_var(
            "bias", shape=[flattened], dtype=tf.float32, trainable=True, initializer=self.bias_initializer
        )
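
As a concrete instance of the targets computed above: with the defaults (desired_stddev = 0.1, use_inv_sqrt = True, use_l2_squared = False) and two hierarchy levels of made-up sizes 3 and 12, the regularization budgets work out as follows, mirroring the formulas in build:

import numpy as np

desired_stddev = 0.10
level_sizes = np.array([3, 12], dtype=np.float32)    # `number` per hierarchy level
reg_counts = np.maximum(1, level_sizes - 1)          # degrees of freedom per level

# Each level's stddev budget shrinks with its size and with the number of levels.
desired_stddevs = desired_stddev * (1 / np.sqrt(level_sizes)) / np.sqrt(len(level_sizes))
desired_l2norms = reg_counts * desired_stddevs**2 / 2  # use_l2_squared = False

print(desired_stddevs)   # ~[0.0408, 0.0204]
print(desired_l2norms)   # ~[0.0017, 0.0023]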

get_dfs(dy_dweights=None, dy_dbias=None)

Get the learned weights for a HierarchicalEmbedding layer as a DataFrame

Source code in wt_ml/layers/hier_embedding.py
def get_dfs(
    self, dy_dweights: tf.Tensor | None = None, dy_dbias: tf.Tensor | None = None
) -> dict[str, pd.DataFrame]:
    """Get the learned weights for a HierarchicalEmbedding layer as a DataFrame"""
    # NOTE: separated this function so we could more easily differentiate
    output_tensors, output_indices, feature_names = self.get_tensors(dy_dweights=dy_dweights, dy_dbias=dy_dbias)
    return {
        key: pd.DataFrame(tensor, index=output_indices[key], columns=feature_names)
        for key, tensor in output_tensors.items()
    }

get_hierarchical_parameters(hierarchy)

Returns the model parameters for every hierarchical level (non-aggregated weights)

Parameters:

    hierarchy (dict[str, TensorLike]): Hierarchy placeholder for the Hierarchical embedding variable. Required.

NOTE: this currently does not depend on training flag. Possible we change how things work such that it will.

Returns:

tuple[tf.Tensor, tf.Tensor]: weights, indices

    the 1st list[tf.Tensor]=A: A[i] corresponds to the multiplicative data for the continuous aspects
                                    of the hierarchy in self.columns[i]
    the 2nd list[tf.Tensor]=B: B[i] corresponds to the indices in self.weights that corresponds to the
                                    correct learned coefficients of the hierarchy in self.columns[i]
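
The categorical part of each index is a mixed-radix code: offsets = cumprod([1, count_0, count_1, ...]) gives place values so that every combination of category codes maps to a distinct row inside that level's slice of the shared weights matrix, shifted by the level's start offset. A self-contained sketch with toy sizes:

import numpy as np

# Two crossed categorical columns: 3 vehicles x 2 brands, with this level's slice starting at row 5.
col_counts = {"vehicle": 3, "brand": 2}
cat_cols = ["vehicle", "brand"]
start = 5

# Same construction as in the layer: base-`col_counts` place values, left to right.
offsets = np.cumprod([1] + [col_counts[c] for c in cat_cols[:-1]])  # [1, 3]

vehicle_codes = np.array([0, 1, 2, 0])
brand_codes = np.array([0, 0, 0, 1])
index = start + offsets[0] * vehicle_codes + offsets[1] * brand_codes
print(index)  # [5 6 7 8]; rows 5..10 cover all 6 (vehicle, brand) combinations
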
Source code in wt_ml/layers/hier_embedding.py
def get_hierarchical_parameters(self, hierarchy: Mapping[str, TensorLike]) -> tuple[tf.Tensor, tf.Tensor]:
    """Returns the model parameters' for every hierarchical level (non-aggregated weights)

    Args:
        hierarchy (dict[str, TensorLike]): Hierarchy placeholder for Hierarchial embedding variable.

    NOTE: this currently does not depend on training flag. Possible we change how things work such that it will.

    Returns:

        tuple[tf.Tensor, tf.Tensor]: weights, indices

            the 1st list[tf.Tensor]=A: A[i] corresponds to the multiplicative data for the continuous aspects
                                            of the hierarchy in self.columns[i]
            the 2nd list[tf.Tensor]=B: B[i] corresponds to the indices in self.weights that correspond to the
                                            correct learned coefficients of the hierarchy in self.columns[i]
    """
    # Shape is [count, ...] for both of these
    weights = []
    indices = []
    for col_names in self.columns:
        if isinstance(col_names, str):
            # We want to assume col_names is a list of column names
            col_names = [col_names]
        num_cols = len(col_names)
        cat_cols = [col for col in col_names if self.encodings[col] != "continuous"]
        cont_cols = [col for col in col_names if self.encodings[col] == "continuous"]
        num_cat_cols = len(cat_cols)
        num_cont_cols = num_cols - num_cat_cols

        name = self.stitched_cols(col_names)
        # The start of the region for this weight
        start = self.offsets[name]
        if num_cont_cols == 0:
            shape = tf.shape(hierarchy[cat_cols[0]])
            weight = tf.ones(shape, dtype=tf.float32, name=f"{name}_weights")
        else:
            # you can only have 1 cont col in col_names
            weight = hierarchy[cont_cols[0]]

        if num_cat_cols == 0:
            index = tf.cast(
                tf.fill(tf.shape(weight), start),
                dtype=tf.int64,
                name=f"{name}_indices",
            )
            # if no categorical columns, we have the value of continuous column as the index
        else:
            # The standard encoding of left to right indices given base col_counts[col] for each col
            offsets = np.cumprod([1] + [self.col_counts[col] for col in cat_cols[:-1]])

            # The index in weights where we look up the first of the embeddings for this set of columns
            # This lets us concatenate all embeddings into a single weights matrix rather than defining
            # them separately, while still being able to deterministically derive the index in this larger weight matrix.
            index = start + tf.math.add_n(
                [
                    # hierarchy[col] is column of dataframe
                    tf.constant(offset, dtype=tf.int64) * tf.cast(hierarchy[col], dtype=tf.int64)
                    for offset, col in zip(offsets, cat_cols)
                ],
                name=f"{name}_indices",
            )

        # store the index for hierarchical parameters and the corresponding continuous weighting
        indices.append(index)
        weights.append(weight)

    # len(self.columns), *shape(hierarchy[<any>])
    weights = tf.stack(weights, axis=0, name="weights_stacked")
    indices = tf.stack(indices, axis=0, name="indices")
    return weights, indices
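The index arithmetic above is a mixed-radix encoding: each categorical column contributes its code times the product of the cardinalities of the columns to its left. A small numpy sketch with made-up cardinalities shows the idea:

import numpy as np

col_counts = {"brand": 3, "wholesaler": 4}  # hypothetical cardinalities
cat_cols = ["brand", "wholesaler"]
offsets = np.cumprod([1] + [col_counts[c] for c in cat_cols[:-1]])  # -> [1, 3]

# A batch of (brand, wholesaler) codes.
hierarchy = {"brand": np.array([0, 2, 1]), "wholesaler": np.array([0, 3, 2])}
index = sum(off * hierarchy[c] for off, c in zip(offsets, cat_cols))
print(index)  # [ 0 11  7] -- a unique row per (brand, wholesaler) pair, 12 rows in total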

get_reg_config(col_names)

Creates the name used for regularization and the default value for the penalty multiplier. If all columns are categorical, we can just join their names to find the penalty. Otherwise, when different continuous features are paired with the same categorical column, the resulting hierarchical categories share the same penalty; the continuous string is always suffixed to the end of the name. Example: brand-DEM and brand-GOP share the same penalty, called reg_brand-continuous. Examples of mixed categories: reg_brand-continuous, reg_vehicle-continuous.

Parameters:

    col_names (list[str]): Hierarchical column names. Required.

Returns:

    tuple[str, float]: Regularization penalty name and the default value.

Source code in wt_ml/layers/hier_embedding.py
def get_reg_config(self, col_names: tuple[str] | list[str]) -> tuple[str, float]:
    """Creates name used for regularization and default value for the penalty multiplier.
    If all columns are categorical, we can just join their names in order to find penalty.
    Otherwise, when different continuous features are paired with a same categorical column,
    the resulting hierarchical categories share same penalty. Always, suffix the continuous
    string to the end of the name.
    Example: brand-DEM and brand-GOP have the same penalty called reg_brand-continuous.
    Examples of mixed categories: reg_brand-continuous, reg_vehicle-continuous.

    Args:
        col_names (list[str]): Hierarchical column names.

    Returns:
        tuple[str, float]: Regularization penalty name and the default value.
    """
    count = int(np.prod([self.col_counts[k] for k in col_names]))
    if count == 1:
        # Purely continuous features
        default_value = 0.0
    else:
        default_value = 1.0
    names = [name for name in col_names if not self.is_continuous(name)]
    names.append("continuous") if self.is_continuous(col_names) else None
    reg_name = f"reg_{self.stitched_cols(names)}"
    return reg_name, default_value
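Illustrative calls (a sketch, assuming `emb` is a built instance, "brand" and "vehicle" are categorical columns, and "DEM"/"GOP" are continuous columns as in the docstring; exact defaults depend on col_counts):

# Purely categorical level: the names are simply joined.
emb.get_reg_config(["brand", "vehicle"])   # -> ("reg_brand-vehicle", 1.0)

# Categorical + continuous: the continuous column collapses to the suffix "continuous",
# so brand-DEM and brand-GOP share one penalty.
emb.get_reg_config(["brand", "DEM"])       # -> ("reg_brand-continuous", 1.0)
emb.get_reg_config(["brand", "GOP"])       # -> ("reg_brand-continuous", 1.0)

# Purely continuous level: count == 1, so the default penalty is 0.0.
emb.get_reg_config(["DEM"])                # -> ("reg_continuous", 0.0)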

get_reg_mult(col_names)

Returns the penalty multiplier for hierarchy level reg loss.

Source code in wt_ml/layers/hier_embedding.py
def get_reg_mult(self, col_names: list[str] | tuple[str]) -> float:
    """Returns the penalty multiplier for hierarchy level reg loss."""
    reg_name, default = self.get_reg_config(col_names)
    mult = self.hyperparameters.get_float(
        name=reg_name,
        default=default,
        help="Penalty multiplier for hierarchy level reg loss.",
    )
    return mult

get_tensors(dy_dweights=None, dy_dbias=None)

Get the learned weights for a HierarchicalEmbedding layer

Source code in wt_ml/layers/hier_embedding.py
def get_tensors(
    self, dy_dweights: tf.Tensor | tf.Variable | None = None, dy_dbias: tf.Tensor | tf.Variable | None = None
) -> tuple[dict[str, tf.Tensor], dict[str, list[dict] | pd.MultiIndex], list[str | int]]:
    """Get the learned weights for a HierarchicalEmbedding layer"""
    output_tensors: dict[str, tf.Tensor] = {}
    output_indices: dict[str, pd.Index | pd.MultiIndex] = {}

    weights = self.weights if dy_dweights is None else dy_dweights
    feature_names = self._get_feature_names(weights)
    if self.use_bias:
        bias = self.bias if dy_dbias is None else dy_dbias
        self._process_bias(bias, output_tensors, output_indices)
    self._process_columns_in_tensors(output_tensors, output_indices, weights)

    return output_tensors, output_indices, feature_names

stitched_cols(col_names)

Returns a string representation of the columns.

Source code in wt_ml/layers/hier_embedding.py
def stitched_cols(self, col_names: str | list[str] | tuple) -> str:
    """Returns a string representation of the columns."""
    return col_names if isinstance(col_names, str) else "-".join(col_names)

LinearBaseline

Bases: Module

Source code in wt_ml/layers/linear_baseline.py
class LinearBaseline(Module):
    def __init__(
        self,
        starting_sales: np.ndarray,
        num_starts: int,
        encodings: dict[str, int],
        hyperparameters: Hyperparams | None = None,
        name: str | None = None,
    ):
        """Class initialization to create linear regression lines for calculating baseline, for each granularity.

        Args:
            starting_sales (np.ndarray): Sales at the start of each baseline. shape = num_starts x num_granularity.
            num_starts (int): No. of starting points for each granularity.
            hyperparameters (Hyperparams, optional): All hyperparameters.
            name (str | None, optional): Name of the layer. Defaults to None.
        """
        super().__init__(hyperparameters=hyperparameters, name=name)
        self.starting_sales = starting_sales
        self.num_starts = num_starts
        self.encodings = encodings

    def build(self, input_shapes):
        """Build the layer parameters needed for calculating linear baseline.

        Args:
            input_shapes (Tuple[tf.Tensor, ...]): Tuple of tensor shapes of `*args`(without the defaults)
                                                  passed to `__call__()`.
        """
        self.num_starts = self.num_starts if input_shapes.sales_num_restarts is not ... else 1
        shape = [self.num_starts, len(self.encodings["wholesaler"]), len(self.encodings["brand"])]
        self.use_perfect_adjustment = self.hyperparameters.get_bool(
            "use_perfect_adjustment",
            default=False,
            help="Instead of using gradient descent change directly to the optimal values.",
        )
        if self.use_perfect_adjustment:
            self.lr_scale = 1.0
            if self.num_starts > 1:
                raise ValueError("Perfect adjustments does not support restarts at this time.")
            self.scalar_so_softplus_approaches_0 = 1.0
            self.allow_slope = False
            self.perfection_speed = self.hyperparameters.get_float(
                "perfection_speed",
                default=0.99,
                min=0.01,
                max=1.0,
                help="How close to make the step to the optimal value.",
            )
            self.relative_scale_cap = self.hyperparameters.get_float(
                "relative_scale_cap",
                default=10.0,
                min=1.0,
                max=1000.0,
                help="The maximum ratio it can have to the provided initial value.",
            )
            self.upper_bound = np.tile(
                (
                    (
                        self.relative_scale_cap * self.starting_sales
                        + np.log(
                            -np.expm1(
                                -self.relative_scale_cap * self.starting_sales * self.scalar_so_softplus_approaches_0
                            )
                            + EPSILON
                        )
                        / self.scalar_so_softplus_approaches_0
                    )
                    / self.lr_scale
                ),
                (self.num_starts, 1, 1),
            )
        else:
            self.lr_scale = self.hyperparameters.get_float(
                "lr_scale",
                default=10.0,
                min=1.0,
                max=1000.0,
                help="A factor to multiply the raw weights by so they get larger gradients.",
            )
            self.scalar_so_softplus_approaches_0 = self.hyperparameters.get_float(
                "scalar_so_softplus_approaches_0",
                default=1.0,
                min=1.0,
                max=100.0,
                help="The temperature to apply to softplus to max it better approximate the relu function.",
            )
            self.allow_slope = (
                self.hyperparameters.get_bool(
                    "allow_slope",
                    default=False,
                    help="Whether to allow slope in baseline",
                )
                and input_shapes.dates_since_start is not ...
            )
        self.baseline_intercept = self.create_var(
            # Start a little lower to allow roicurves to start higher.
            "intercept",
            shape=shape,
            dtype=tf.float32,
            initializer=np.tile(
                (
                    (
                        0.8 * self.starting_sales
                        + np.log(-np.expm1(-0.8 * self.starting_sales * self.scalar_so_softplus_approaches_0) + EPSILON)
                        / self.scalar_so_softplus_approaches_0
                    )
                    / self.lr_scale
                ).astype(np.float32),
                (self.num_starts, 1, 1),
            ),
            trainable=not self.use_perfect_adjustment,
        )
        if self.allow_slope:
            self.baseline_slope = self.create_var(
                "slope", shape=shape, dtype=tf.float32, trainable=not self.use_perfect_adjustment
            )
            self.base_under_0_lambda = self.hyperparameters.get_float(
                "base_under_0_lambda",
                default=1.0e-03,
                min=1.0e-08,
                max=1.0,
                help="The weight for the loss applied to the baseline being below -10 before the softplus.",
            )

    def do_perfect_adjustment(self, batch: EconomicModelInput, intermediaries: "EconomicIntermediaries"):
        if not self.use_perfect_adjustment:
            logger.warning("Cannot do perfect adjustment if the hyperparameter is not enabled.")
            return
        y_mask = intermediaries.mask if intermediaries.mask is not None else tf.ones_like(intermediaries.y_smooth)
        multiplicative_impact = prod_n(intermediaries.impacts.multiplicative_impacts)
        additive_impact = tf.math.add_n(intermediaries.impacts.additive_impacts)
        # I thought this needed to be divided by 2, not sure why it doesn't.
        post_softplus_unclipped = tf.einsum(
            "bt,bt,bt->b", y_mask, multiplicative_impact, intermediaries.y_smooth - additive_impact
        ) / (tf.einsum("bt,bt,bt->b", y_mask, multiplicative_impact, multiplicative_impact) + EPSILON)
        post_softplus = tf.math.maximum(post_softplus_unclipped, EPSILON)
        pre_softplus = (
            post_softplus
            + tf.math.log(-tf.math.expm1(-self.scalar_so_softplus_approaches_0 * post_softplus))
            / self.scalar_so_softplus_approaches_0
        ) / self.lr_scale
        gather_indices = tf.squeeze(self.get_indices(batch.wholesaler_index, batch.brand_index, None), 1)
        existing_values = self.baseline_intercept.gather_nd(gather_indices)
        max_values = tf.gather_nd(tf.constant(self.upper_bound, dtype=tf.float32), gather_indices)
        pre_softplus = tf.math.minimum(pre_softplus, max_values)
        self.baseline_intercept.scatter_nd_update(
            gather_indices, self.perfection_speed * pre_softplus + (1 - self.perfection_speed) * existing_values
        )

    def get_indices(self, wholesaler_index: tf.Tensor, brand_index: tf.Tensor, sales_num_restarts: tf.Tensor | None):
        wholesaler_indices = tf.tile(
            tf.cast(wholesaler_index[:, None], dtype=tf.int32),
            [1, tf.shape(sales_num_restarts)[1] if sales_num_restarts is not None else 1],
        )
        brand_indices = tf.tile(
            tf.cast(brand_index[:, None], dtype=tf.int32),
            [1, tf.shape(sales_num_restarts)[1] if sales_num_restarts is not None else 1],
        )
        return tf.cast(
            tf.stack(
                [
                    sales_num_restarts if sales_num_restarts is not None else tf.zeros_like(brand_indices),
                    wholesaler_indices,
                    brand_indices,
                ],
                axis=2,
            ),
            tf.int64,
        )

    def __call__(
        self,
        batch: LinearBaselineInput,
        training=False,  # noqa: U100
        debug=False,
        skip_metrics=False,  # noqa: U100
    ) -> LinearBaselineIntermediaries:
        """Calcuate baseline using slope-intercept form (y=mx+c).

        Args:
            dates_since_start (TensorLike): Number of timestamps since the last restart.
                                            shape = num_time x num_granular.
            sales_num_restarts (TensorLike): Number of restarts that occurred before this point.
                                             shape = num_time x num_granular.
            hierarchy (dict[str, TensorLike]): The lookup tables for categorical values.
            mask (TensorLike): Filter for 0 sales or unrealistic sales.
            training (bool, optional): Whether training the layer parameters or not.
                                       Defaults to False.

        Returns:
            LinearBaselineIntermediaries: Intermediate calculations for baseline like slope, intercept, etc.
        """
        indices = self.get_indices(batch.hierarchy["wholesaler"], batch.hierarchy["brand"], batch.sales_num_restarts)
        # For each granularity, gathering the initial baseline intercept across the time axis
        # The purpose is to use the same intercept for a given baseline across all the data points
        # The shape is converted to `num_time x num_granular` from `num_starts x num_granular`
        # The indices to gather for each baseline are stored in `sales_num_restarts`
        # NOTE: Issue in M1 Macbook version of tensorflow causes gather_nd to break when operating
        #       on a variable. tf.convert_to_tensor solves this. Open github issue can be found here:
        #       https://github.com/tensorflow/tensorflow/issues/57549
        broadcasted_intercept = tf.gather_nd(tf.convert_to_tensor(self.baseline_intercept), indices) * tf.constant(
            self.lr_scale, dtype=tf.float32
        )
        if self.allow_slope:
            broadcasted_slope = tf.gather_nd(tf.convert_to_tensor(self.baseline_slope), indices)
            # to convert, run: dates_since_start = tf.cast(
            # dates_since_start, dtype=tf.float32, name="dates_since_start")
            slope_impact = (
                broadcasted_slope
                * batch.dates_since_start
                * 2.0
                / (tf.math.reduce_max(batch.dates_since_start, axis=1, keepdims=True, name="max_dates") + EPSILON)
            )

            baseline_raw = slope_impact + broadcasted_intercept
        else:
            broadcasted_slope = tf.zeros_like(broadcasted_intercept)
            slope_impact = tf.zeros_like(broadcasted_intercept)
            baseline_raw = broadcasted_intercept
        if self.allow_slope and not skip_metrics:
            mask_weekly = (
                tf.cast(batch.mask, dtype=tf.float32) if batch.mask is not None else tf.ones_like(baseline_raw)
            )
            baseline_raw_masked_for_min = baseline_raw * mask_weekly + (1.0 - mask_weekly) * LARGE_EPSILON
            min_base = tf.reduce_min(baseline_raw_masked_for_min, axis=0, keepdims=True)
            # if our minimum value is very negative, then get a loss
            # this is just a softplus with a scale of .1, per product. if the product has negative min_base,
            # (more neg than -0.5), then this will start to be more and more positive
            min_base_with_0_soft = softplus(-10.0 - min_base, AUX_SCALE, name="min_base_with_0_soft")
            # take the MSE of that signal. above
            base_under_0 = tf.reduce_sum(tf.square(min_base_with_0_soft), name="base_under_0")
            self.add_loss("base_under_0", base_under_0, "aux", self.base_under_0_lambda)
        baseline = softplus(baseline_raw, scale=1 / self.scalar_so_softplus_approaches_0)
        return LinearBaselineIntermediaries(
            intercept=broadcasted_intercept if debug else None,
            slope=broadcasted_slope if debug else None,
            slope_impact=slope_impact if debug else None,
            baseline_raw=baseline_raw if debug else None,
            baseline=baseline,
        )
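The closed-form update in do_perfect_adjustment above can be read as a one-parameter weighted least-squares fit. Assuming the prediction for each week $t$ is $b \cdot m_t + a_t$ (with $b$ the post-softplus baseline level, $m_t$ the product of the multiplicative impacts and $a_t$ the sum of the additive impacts), minimizing the masked squared error over $b$ gives exactly the expression computed for post_softplus_unclipped:

$\min_b \sum_t w_t \left(b\,m_t + a_t - y_t\right)^2 \;\Rightarrow\; b^{*} = \dfrac{\sum_t w_t\, m_t\,(y_t - a_t)}{\sum_t w_t\, m_t^2}$

Here $w_t$ is the mask and $y_t$ the smoothed sales. The factor of 2 from differentiating the square cancels on both sides (which is why no division by 2 is needed), EPSILON only guards against division by zero, and $b^{*}$ is then pushed through the inverse softplus, capped by upper_bound, and blended into the intercept at the rate perfection_speed.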

__call__(batch, training=False, debug=False, skip_metrics=False)

Calculate baseline using slope-intercept form (y=mx+c).

Parameters:

    dates_since_start (TensorLike): Number of timestamps since the last restart. shape = num_time x num_granular. Required.
    sales_num_restarts (TensorLike): Number of restarts that occurred before this point. shape = num_time x num_granular. Required.
    hierarchy (dict[str, TensorLike]): The lookup tables for categorical values. Required.
    mask (TensorLike): Filter for 0 sales or unrealistic sales. Required.
    training (bool, optional): Whether training the layer parameters or not. Defaults to False.

Returns:

    LinearBaselineIntermediaries: Intermediate calculations for baseline like slope, intercept, etc.

Source code in wt_ml/layers/linear_baseline.py
def __call__(
    self,
    batch: LinearBaselineInput,
    training=False,  # noqa: U100
    debug=False,
    skip_metrics=False,  # noqa: U100
) -> LinearBaselineIntermediaries:
    """Calcuate baseline using slope-intercept form (y=mx+c).

    Args:
        dates_since_start (TensorLike): Number of timestamps since the last restart.
                                        shape = num_time x num_granular.
        sales_num_restarts (TensorLike): Number of restarts that occurred before this point.
                                         shape = num_time x num_granular.
        hierarchy (dict[str, TensorLike]): The lookup tables for categorical values.
        mask (TensorLike): Filter for 0 sales or unrealistic sales.
        training (bool, optional): Whether training the layer parameters or not.
                                   Defaults to False.

    Returns:
        LinearBaselineIntermediaries: Intermediate calculations for baseline like slope, intercept, etc.
    """
    indices = self.get_indices(batch.hierarchy["wholesaler"], batch.hierarchy["brand"], batch.sales_num_restarts)
    # For each granularity, gathering the initial baseline intercept across the time axis
    # The purpose is to use the same intercept for a given baseline across all the data points
    # The shape is converted to `num_time x num_granular` from `num_starts x num_granular`
    # The indices to gather for each baseline are stored in `sales_num_restarts`
    # NOTE: Issue in M1 Macbook version of tensorflow causes gather_nd to break when operating
    #       on a variable. tf.convert_to_tensor solves this. Open github issue can be found here:
    #       https://github.com/tensorflow/tensorflow/issues/57549
    broadcasted_intercept = tf.gather_nd(tf.convert_to_tensor(self.baseline_intercept), indices) * tf.constant(
        self.lr_scale, dtype=tf.float32
    )
    if self.allow_slope:
        broadcasted_slope = tf.gather_nd(tf.convert_to_tensor(self.baseline_slope), indices)
        # to convert, run: dates_since_start = tf.cast(
        # dates_since_start, dtype=tf.float32, name="dates_since_start")
        slope_impact = (
            broadcasted_slope
            * batch.dates_since_start
            * 2.0
            / (tf.math.reduce_max(batch.dates_since_start, axis=1, keepdims=True, name="max_dates") + EPSILON)
        )

        baseline_raw = slope_impact + broadcasted_intercept
    else:
        broadcasted_slope = tf.zeros_like(broadcasted_intercept)
        slope_impact = tf.zeros_like(broadcasted_intercept)
        baseline_raw = broadcasted_intercept
    if self.allow_slope and not skip_metrics:
        mask_weekly = (
            tf.cast(batch.mask, dtype=tf.float32) if batch.mask is not None else tf.ones_like(baseline_raw)
        )
        baseline_raw_masked_for_min = baseline_raw * mask_weekly + (1.0 - mask_weekly) * LARGE_EPSILON
        min_base = tf.reduce_min(baseline_raw_masked_for_min, axis=0, keepdims=True)
        # if our minimum value is very negative, then get a loss
        # this is just a softplus with a scale of .1, per product. if the product has negative min_base,
        # (more neg than -0.5), then this will start to be more and more positive
        min_base_with_0_soft = softplus(-10.0 - min_base, AUX_SCALE, name="min_base_with_0_soft")
        # take the MSE of that signal. above
        base_under_0 = tf.reduce_sum(tf.square(min_base_with_0_soft), name="base_under_0")
        self.add_loss("base_under_0", base_under_0, "aux", self.base_under_0_lambda)
    baseline = softplus(baseline_raw, scale=1 / self.scalar_so_softplus_approaches_0)
    return LinearBaselineIntermediaries(
        intercept=broadcasted_intercept if debug else None,
        slope=broadcasted_slope if debug else None,
        slope_impact=slope_impact if debug else None,
        baseline_raw=baseline_raw if debug else None,
        baseline=baseline,
    )
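A quick numeric illustration of the slope term (a sketch, not library code, using a single series rather than the batched axis=1 reduction above): dates_since_start is rescaled so the most recent week contributes roughly twice the learned slope, keeping the slope's magnitude comparable across series of different lengths.

import numpy as np

slope = 0.5                                           # hypothetical learned slope for one series
dates_since_start = np.arange(10, dtype=np.float32)   # 0..9 weeks since the last restart
EPSILON = 1e-6

slope_impact = slope * dates_since_start * 2.0 / (dates_since_start.max() + EPSILON)
print(slope_impact[0], slope_impact[-1])  # 0.0 at the restart, ~1.0 (= 2 * slope) at the last week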

__init__(starting_sales, num_starts, encodings, hyperparameters=None, name=None)

Class initialization to create linear regression lines for calculating baseline, for each granularity.

Parameters:

    starting_sales (ndarray): Sales at the start of each baseline. shape = num_starts x num_granularity. Required.
    num_starts (int): No. of starting points for each granularity. Required.
    hyperparameters (Hyperparams, optional): All hyperparameters. Defaults to None.
    name (str | None, optional): Name of the layer. Defaults to None.
Source code in wt_ml/layers/linear_baseline.py
def __init__(
    self,
    starting_sales: np.ndarray,
    num_starts: int,
    encodings: dict[str, int],
    hyperparameters: Hyperparams | None = None,
    name: str | None = None,
):
    """Class initialization to create linear regression lines for calculating baseline, for each granularity.

    Args:
        starting_sales (np.ndarray): Sales at the start of each baseline. shape = num_starts x num_granularity.
        num_starts (int): No. of starting points for each granularity.
        hyperparameters (Hyperparams, optional): All hyperparameters.
        name (str | None, optional): Name of the layer. Defaults to None.
    """
    super().__init__(hyperparameters=hyperparameters, name=name)
    self.starting_sales = starting_sales
    self.num_starts = num_starts
    self.encodings = encodings

build(input_shapes)

Build the layer parameters needed for calculating linear baseline.

Parameters:

    input_shapes (Tuple[Tensor, ...]): Tuple of tensor shapes of *args (without the defaults) passed to __call__(). Required.
Source code in wt_ml/layers/linear_baseline.py
def build(self, input_shapes):
    """Build the layer parameters needed for calculating linear baseline.

    Args:
        input_shapes (Tuple[tf.Tensor, ...]): Tuple of tensor shapes of `*args`(without the defaults)
                                              passed to `__call__()`.
    """
    self.num_starts = self.num_starts if input_shapes.sales_num_restarts is not ... else 1
    shape = [self.num_starts, len(self.encodings["wholesaler"]), len(self.encodings["brand"])]
    self.use_perfect_adjustment = self.hyperparameters.get_bool(
        "use_perfect_adjustment",
        default=False,
        help="Instead of using gradient descent change directly to the optimal values.",
    )
    if self.use_perfect_adjustment:
        self.lr_scale = 1.0
        if self.num_starts > 1:
            raise ValueError("Perfect adjustments does not support restarts at this time.")
        self.scalar_so_softplus_approaches_0 = 1.0
        self.allow_slope = False
        self.perfection_speed = self.hyperparameters.get_float(
            "perfection_speed",
            default=0.99,
            min=0.01,
            max=1.0,
            help="How close to make the step to the optimal value.",
        )
        self.relative_scale_cap = self.hyperparameters.get_float(
            "relative_scale_cap",
            default=10.0,
            min=1.0,
            max=1000.0,
            help="The maximum ratio it can have to the provided initial value.",
        )
        self.upper_bound = np.tile(
            (
                (
                    self.relative_scale_cap * self.starting_sales
                    + np.log(
                        -np.expm1(
                            -self.relative_scale_cap * self.starting_sales * self.scalar_so_softplus_approaches_0
                        )
                        + EPSILON
                    )
                    / self.scalar_so_softplus_approaches_0
                )
                / self.lr_scale
            ),
            (self.num_starts, 1, 1),
        )
    else:
        self.lr_scale = self.hyperparameters.get_float(
            "lr_scale",
            default=10.0,
            min=1.0,
            max=1000.0,
            help="A factor to multiply the raw weights by so they get larger gradients.",
        )
        self.scalar_so_softplus_approaches_0 = self.hyperparameters.get_float(
            "scalar_so_softplus_approaches_0",
            default=1.0,
            min=1.0,
            max=100.0,
            help="The temperature to apply to softplus to max it better approximate the relu function.",
        )
        self.allow_slope = (
            self.hyperparameters.get_bool(
                "allow_slope",
                default=False,
                help="Whether to allow slope in baseline",
            )
            and input_shapes.dates_since_start is not ...
        )
    self.baseline_intercept = self.create_var(
        # Start a little lower to allow roicurves to start higher.
        "intercept",
        shape=shape,
        dtype=tf.float32,
        initializer=np.tile(
            (
                (
                    0.8 * self.starting_sales
                    + np.log(-np.expm1(-0.8 * self.starting_sales * self.scalar_so_softplus_approaches_0) + EPSILON)
                    / self.scalar_so_softplus_approaches_0
                )
                / self.lr_scale
            ).astype(np.float32),
            (self.num_starts, 1, 1),
        ),
        trainable=not self.use_perfect_adjustment,
    )
    if self.allow_slope:
        self.baseline_slope = self.create_var(
            "slope", shape=shape, dtype=tf.float32, trainable=not self.use_perfect_adjustment
        )
        self.base_under_0_lambda = self.hyperparameters.get_float(
            "base_under_0_lambda",
            default=1.0e-03,
            min=1.0e-08,
            max=1.0,
            help="The weight for the loss applied to the baseline being below -10 before the softplus.",
        )
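The intercept initializer is the inverse of the scaled softplus applied later in __call__, so the baseline starts out at roughly 0.8 × starting_sales. A small numeric check (a sketch; EPSILON and the softplus helper below are stand-ins for the library's own constants and helper, assuming the usual temperature-scaled softplus):

import numpy as np

EPSILON = 1e-6
k = 1.0            # scalar_so_softplus_approaches_0
lr_scale = 10.0
s = 10.0           # hypothetical starting sales for one (wholesaler, brand) cell

def softplus(x, scale):
    # temperature-scaled softplus: scale * log(1 + exp(x / scale))
    return np.log1p(np.exp(x / scale)) * scale

init = (0.8 * s + np.log(-np.expm1(-0.8 * s * k) + EPSILON) / k) / lr_scale
baseline = softplus(init * lr_scale, scale=1 / k)
print(baseline)    # ~8.0, i.e. 0.8 * starting_sales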

MonotonicPositiveUnboundedLayer

Bases: Module, IMixedEffect

Source code in wt_ml/layers/monotonic_positive_unbounded.py
class MonotonicPositiveUnboundedLayer(Module, IMixedEffect):
    def __init__(
        self,
        encodings: Encodings,
        signal_type: str,
        hierarchy_categories: list[str | list[str]] | None = None,
        has_time: bool = False,
        has_signal: bool = False,
        hyperparameters: Hyperparams | None = None,
        non_pos: bool = False,
        non_neg: bool = False,
        non_pos_by_signal: list[bool] | None = None,
        non_neg_by_signal: list[bool] | None = None,
        maximum_strength: float | None = None,
        use_bias: bool | None = None,
        increase_lr: float | None = None,
        name: str | None = None,
    ):
        """Monotonic multiplicative factors affecting sales that also scales ROIs of investments.

        Args:
            hierarchy (pd.DataFrame): The hierarchy that the impact learns on.
            n_instances (int): Number of mixed effect signals. Axis index 2 of effect.
            has_time (bool, optional): Whether the hierarchy is on the time axis. Defaults to False.
            hyperparameters (Hyperparams, optional): Dictionary of hyperparameters for building this layer.
            name (str | None, optional): Name of the mixed effect captured. Module parent class sets
                                         to the name of the class if passed as None.
        """
        super().__init__(hyperparameters=hyperparameters, name=name)
        self.signal_type = signal_type
        self.encodings: Encodings = encodings
        self.has_time = has_time
        self.has_signal = has_signal
        self.hierarchy_categories = hierarchy_categories
        self.non_neg = non_neg
        self.non_pos = non_pos
        self.non_pos_by_signal = non_pos_by_signal
        self.non_neg_by_signal = non_neg_by_signal
        self.maximum_strength = maximum_strength
        self.use_bias = use_bias
        self.increase_lr = increase_lr

    def build(self, input_shapes: InputShapes):
        """Builds the sales_mult hierarchical variable.

        Args:
            input_shapes (InputShapes): The effect and hierarchy shapes.
        """
        self.n_instances = input_shapes.signals[2] if len(input_shapes.signals) > 2 else 1
        if self.has_signal:
            self.n_instances = 1
        shape = [self.n_instances]
        if self.use_bias is None:
            self.use_bias = not (self.has_time or self.has_signal)

        if not self.has_signal:
            signal_enc = self.encodings[self.signal_type]
            if TYPE_CHECKING:
                assert isinstance(signal_enc, Mapping)
            feature_names = tuple(get_lookups(signal_enc))  # pyright: ignore [reportArgumentType]
        else:
            feature_names = ("sales_mult",)

        self.sales_mult = self.hyperparameters.get_submodule(
            "effect_mult",
            module_type=HierchicalEmbedding,
            kwargs=dict(
                encodings=self.encodings,
                columns=self.hierarchy_categories,
                shape=shape,
                use_bias=self.use_bias,
                bias_initializer=0.01,
                increase_lr=self.increase_lr,
                feature_names=feature_names,
            ),
            help="The embedding for the multiplier to apply to each signal before exponentiation.",
        )
        self.use_softplus = self.hyperparameters.get_bool(
            "use_softplus",
            default=True,
            help="Whether to use softplus or exp for the standardization to positive multipliers.",
        )
        self.use_mono = self.hyperparameters.get_bool(
            "use_monotonic", default=False, help="Whether to use a monotonic concave layer to combine signals."
        )
        if self.use_mono:
            self.mono_effect = self.hyperparameters.get_submodule(
                "concave_effect_mult",
                module_type=MonoEffect,
                kwargs=dict(
                    encodings=self.encodings,
                    signal_type=self.signal_type,
                    n_instances=self.n_instances,
                    hierarchy_categories=self.hierarchy_categories,
                    has_signal=self.has_signal,
                ),
                help="Neural Network model that learns the monotonic effect.",
            )

    def __call__(
        self,
        batch: MonotonicPositiveUnboundedInput,
        training: bool = False,
        debug: bool = False,
        skip_metrics: bool = False,
    ) -> MonotonicPositiveUnboundedIntermediaries | DistIntermediaries:
        signals = batch.signals
        if self.use_mono:
            mono_effect_intermediaries = self.mono_effect(
                MonoEffectInput(
                    signals=batch.signals,
                    hierarchy=batch.hierarchy,
                ),
                training=training,
                debug=debug,
                skip_metrics=skip_metrics,
            )
            signals = mono_effect_intermediaries.signals
        # num_gran x num_inst if not has_time and not has_signal
        # num_gran x num_time x num_inst if has_time and not has_signal
        # num_gran x num_inst x 1 if not has_time and has_signal
        # num_gran x num_time x num_inst x 1 if has_time and has_signal
        baseline_sales_effect_raw = self.sales_mult(batch.hierarchy, training=training, skip_metrics=skip_metrics)
        if not self.has_time:
            baseline_sales_effect_raw = tf.expand_dims(baseline_sales_effect_raw, 1)
        if self.has_signal:
            baseline_sales_effect_raw = tf.squeeze(baseline_sales_effect_raw, -1)
        if self.signal_type == "distribution":
            baseline_sales_effect_raw = baseline_sales_effect_raw + tf.constant(3.0, dtype=tf.float32)
        # At this point baseline_sales_effect_raw is always broadcastable to # num_gran x num_time x num_inst
        # learns weightage of each effect signal and applies on it!
        # This is batch x time x n_instances
        # Shifted by -3 to make initialization more sane (before this, impacts were very large in the initial state).
        softplus_baseline_effect_raw = softplus(baseline_sales_effect_raw - tf.constant(3.0, dtype=tf.float32))
        if self.non_neg:
            baseline_sales_effect_raw = softplus_baseline_effect_raw
        if self.non_pos:
            baseline_sales_effect_raw = -softplus_baseline_effect_raw
        if self.non_pos_by_signal:
            non_pos_by_signal: tf.Tensor = tf.gather(
                tf.constant(self.non_pos_by_signal, dtype=tf.float32, name="non_pos_by_signal"), batch.signal_index
            )
            baseline_sales_effect_raw = baseline_sales_effect_raw * (
                tf.constant(1.0, dtype=tf.float32) - non_pos_by_signal
            ) - (non_pos_by_signal * softplus_baseline_effect_raw)
        if self.non_neg_by_signal:
            non_neg_by_signal = tf.gather(
                tf.constant(self.non_neg_by_signal, dtype=tf.float32, name="non_neg_by_signal"), batch.signal_index
            )
            baseline_sales_effect_raw = (
                baseline_sales_effect_raw * (1 - non_neg_by_signal) + non_neg_by_signal * softplus_baseline_effect_raw
            )
        if self.maximum_strength is not None:
            baseline_sales_effect_raw = (
                tf.math.tanh(baseline_sales_effect_raw / self.maximum_strength) * self.maximum_strength
            )
        baseline_sales_effect = tf.math.multiply(baseline_sales_effect_raw, signals, name="sales_effect")
        baseline_sales_effect = tf.grad_pass_through(lambda x: tf.maximum(-16.0, x, "baseline_sales_effect_clipped"))(
            baseline_sales_effect
        )
        if self.use_softplus:
            # Force 0 to map to 1 after softplus.
            baseline_sales_mult_by_signal = softplus(baseline_sales_effect + np.log(np.e - 1), name="mult_by_signal")
        else:
            baseline_sales_mult_by_signal = tf.math.exp(baseline_sales_effect, name="mult_by_signal")
        # batch x time
        baseline_sales_mult = tf.reduce_prod(baseline_sales_mult_by_signal, 2, name="impact")
        return MonotonicPositiveUnboundedIntermediaries(
            baseline_sales_effect_raw=baseline_sales_effect_raw if debug else None,
            baseline_sales_effect=baseline_sales_effect if debug else None,
            impact_by_signal=baseline_sales_mult_by_signal,
            baseline_sales_mult=baseline_sales_mult if debug else None,
            impact=baseline_sales_mult,
            signal_names=tf.gather(
                tf.convert_to_tensor(get_lookups(self.encodings[self.signal_type])), batch.signal_index
            ),
        )
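The `+ np.log(np.e - 1)` shift above is what makes a zero effect neutral: softplus(0 + log(e - 1)) = log(1 + (e - 1)) = 1, so a signal with no effect multiplies sales by exactly 1. A quick check (a sketch using a plain, unscaled softplus):

import numpy as np

def softplus(x):
    return np.log1p(np.exp(x))

for effect in (-1.0, 0.0, 1.0):
    print(effect, softplus(effect + np.log(np.e - 1)))
# 0.0 maps to exactly 1.0; negative effects give multipliers below 1, positive effects above 1.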

__init__(encodings, signal_type, hierarchy_categories=None, has_time=False, has_signal=False, hyperparameters=None, non_pos=False, non_neg=False, non_pos_by_signal=None, non_neg_by_signal=None, maximum_strength=None, use_bias=None, increase_lr=None, name=None)

Monotonic multiplicative factors affecting sales that also scale ROIs of investments.

Parameters:

    hierarchy (DataFrame): The hierarchy that the impact learns on. Required.
    n_instances (int): Number of mixed effect signals. Axis index 2 of effect. Required.
    has_time (bool, optional): Whether the hierarchy is on the time axis. Defaults to False.
    hyperparameters (Hyperparams, optional): Dictionary of hyperparameters for building this layer. Defaults to None.
    name (str | None, optional): Name of the mixed effect captured. Module parent class sets it to the name of the class if passed as None. Defaults to None.
Source code in wt_ml/layers/monotonic_positive_unbounded.py
def __init__(
    self,
    encodings: Encodings,
    signal_type: str,
    hierarchy_categories: list[str | list[str]] | None = None,
    has_time: bool = False,
    has_signal: bool = False,
    hyperparameters: Hyperparams | None = None,
    non_pos: bool = False,
    non_neg: bool = False,
    non_pos_by_signal: list[bool] | None = None,
    non_neg_by_signal: list[bool] | None = None,
    maximum_strength: float | None = None,
    use_bias: bool | None = None,
    increase_lr: float | None = None,
    name: str | None = None,
):
    """Monotonic multiplicative factors affecting sales that also scales ROIs of investments.

    Args:
        hierarchy (pd.DataFrame): The hierarchy that the impact learns on.
        n_instances (int): Number of mixed effect signals. Axis index 2 of effect.
        has_time (bool, optional): Whether the hierarchy is on the time axis. Defaults to False.
        hyperparameters (Hyperparams, optional): Dictionary of hyperparameters for building this layer.
        name (str | None, optional): Name of the mixed effect captured. Module parent class sets
                                     to the name of the class if passed as None.
    """
    super().__init__(hyperparameters=hyperparameters, name=name)
    self.signal_type = signal_type
    self.encodings: Encodings = encodings
    self.has_time = has_time
    self.has_signal = has_signal
    self.hierarchy_categories = hierarchy_categories
    self.non_neg = non_neg
    self.non_pos = non_pos
    self.non_pos_by_signal = non_pos_by_signal
    self.non_neg_by_signal = non_neg_by_signal
    self.maximum_strength = maximum_strength
    self.use_bias = use_bias
    self.increase_lr = increase_lr

build(input_shapes)

Builds the sales_mult hierarchical variable.

Parameters:

    input_shapes (InputShapes): The effect and hierarchy shapes. Required.
Source code in wt_ml/layers/monotonic_positive_unbounded.py
def build(self, input_shapes: InputShapes):
    """Builds the sales_mult hierarchical variable.

    Args:
        input_shapes (InputShapes): The effect and hierarchy shapes.
    """
    self.n_instances = input_shapes.signals[2] if len(input_shapes.signals) > 2 else 1
    if self.has_signal:
        self.n_instances = 1
    shape = [self.n_instances]
    if self.use_bias is None:
        self.use_bias = not (self.has_time or self.has_signal)

    if not self.has_signal:
        signal_enc = self.encodings[self.signal_type]
        if TYPE_CHECKING:
            assert isinstance(signal_enc, Mapping)
        feature_names = tuple(get_lookups(signal_enc))  # pyright: ignore [reportArgumentType]
    else:
        feature_names = ("sales_mult",)

    self.sales_mult = self.hyperparameters.get_submodule(
        "effect_mult",
        module_type=HierchicalEmbedding,
        kwargs=dict(
            encodings=self.encodings,
            columns=self.hierarchy_categories,
            shape=shape,
            use_bias=self.use_bias,
            bias_initializer=0.01,
            increase_lr=self.increase_lr,
            feature_names=feature_names,
        ),
        help="The embedding for the multiplier to apply to each signal before exponentiation.",
    )
    self.use_softplus = self.hyperparameters.get_bool(
        "use_softplus",
        default=True,
        help="Whether to use softplus or exp for the standardization to positive multipliers.",
    )
    self.use_mono = self.hyperparameters.get_bool(
        "use_monotonic", default=False, help="Whether to use a monotonic concave layer to combine signals."
    )
    if self.use_mono:
        self.mono_effect = self.hyperparameters.get_submodule(
            "concave_effect_mult",
            module_type=MonoEffect,
            kwargs=dict(
                encodings=self.encodings,
                signal_type=self.signal_type,
                n_instances=self.n_instances,
                hierarchy_categories=self.hierarchy_categories,
                has_signal=self.has_signal,
            ),
            help="Neural Network model that learns the monotonic effect.",
        )

Pricing

Bases: Module, IMixedEffect

Source code in wt_ml/layers/pricing.py
class Pricing(Module, IMixedEffect):
    def __init__(
        self,
        encodings: dict[str, Any],
        hierarchy_categories: list[str | list[str]] | None = None,
        hyperparameters: Hyperparams | None = None,
        name: str | None = None,
    ):
        """Multiplicative price elasticity factor affecting baseline sales that also scales ROI of investments.

        Args:
            hierarchy (pd.DataFrame): The hierarchy used to build features learnt by the model to generate impacts.
            hyperparameters (Hyperparams | None, optional): An instance of `FileHyperparameterConfig` class
                                                            that stores all the hyperparameters of Pricing layer.
                                                            `Module` parent class sets these hyperparameters if None.
            name (str | None, optional): Name of the Pricing Layer.
                                         `Module` parent class sets name of the class if None.
        """
        super().__init__(hyperparameters=hyperparameters, name=name)
        self.encodings = encodings
        self.hierarchy_categories = hierarchy_categories

    def build(self, input_shapes: InputShapes):  # noqa: U100
        """Builds the `price_params_emb_layer` hierarchical variable
        for generating price elasticity curve for each granularity.
        Shape of the variable: (num_granular, 2). 2 denotes offset and exponent.

        Args:
            input_shapes (InputShapes): A tuple of tensor shapes of `price` and `hierarchy` passed to `__call__`.
        """
        n_instances = input_shapes.prices[2]
        self.pricing_params_emb_layer = self.hyperparameters.get_submodule(
            "pricing_params_emb_layer",
            module_type=HierchicalEmbedding,
            kwargs=dict(
                encodings=self.encodings,
                columns=self.hierarchy_categories,
                # 2 here denotes offset and exponent
                shape=[n_instances, 2],
                # This initializes to a state where any change in price reduces revenue while still making learning
                # other distributions easy. You can roughly think of this as price * (mult / (price + 1) ** 2)
                # Where mult is a specially calculated value so that the result with price = 1 is 1.
                # Order is offset (softplus), exponent (softplus + 1)
                bias_initializer=tf.constant_initializer(np.tile([[0.6, 0.6]], (n_instances, 1)).reshape(-1)),
                feature_names=[
                    [f"{signal}_offset", f"{signal}_exponent"] for signal in get_lookups(self.encodings["price_dev"])
                ],
            ),
            help="The embedding for the parameters for the pricing elasticity curve.",
        )

    def __call__(
        self, batch: PricingInput, training: bool = False, debug: bool = False, skip_metrics: bool = False
    ) -> PricingIntermediaries:
        """Pricing Layer Forward Propagation.
        We take in the mean normalized $price$ signal of shape `(num_time, num_granular, n_sim)`.
        Then we take the $offset$ and $exponent$ learnt by the model, each of shape `(num_granular,)`.
        The impact is calculated as follows:

        $volume = \\frac{normalization\\_mult} {(price + offset) ^ {exponent}}$

        $normalization\\_{mult} = (1 + offset) ^ {exponent}$

        $impact = volume * price$

        This $impact$ is of shape `(num_time, num_granular, n_sim)`

        > NOTE: normalization\\_mult is a factor to neglect the impact of prices which equal the average price.

        Args:
            price (TensorLike): mean normalized price_per_hl for each granularity each week.
                                Shape: (num_time, num_granular, n_sim)
            hierarchy (dict[str, TensorLike]): Hierarchical Placeholder for creating hierarchical variable.
            training (bool, optional): Whether this is a training or inference run. Defaults to False.

        Returns:
            PricingIntermediaries: Intermediate calculations like offset, asymptote, exponent, etc., and final impact.
        """
        params_emb = self.pricing_params_emb_layer(batch.hierarchy, training=training, skip_metrics=skip_metrics)
        offset_emb, exponent_emb = tf.unstack(params_emb, axis=2)
        offset = softplus(offset_emb * 10) + 0.01
        exponent = monotonic_sigmoid(exponent_emb / 4) * 4 + 1
        # Here we introduce a mult which is a normalization parameter
        # the equation for volume by price is: volume = mult / ((price + offset) ** exponent)
        # We want volume = 1 when price = 1 so we need to solve
        # 1 = mult / ((1 + offset) ** exponent)
        # mult = (1 + offset) ** exponent
        normalization_mult = (1 + offset) ** exponent
        volume = normalization_mult[:, None] / ((batch.prices + offset[:, None]) ** exponent[:, None])
        # Impact is the revenue of this volume at that price, so just the product.
        revenue = tf.math.multiply(
            volume, tf.where(batch.prices > 0, batch.prices, tf.math.reciprocal_no_nan(volume)), name="impact"
        )
        impact_by_signal = revenue
        # Reduce over signal axis
        impact = tf.math.reduce_prod(impact_by_signal, axis=2, name="impact")
        return PricingIntermediaries(
            offset_emb=offset_emb if debug else None,
            exponent_emb=exponent_emb if debug else None,
            offset=offset if debug else None,
            exponent=exponent if debug else None,
            volume=volume if debug else None,
            revenue=revenue if debug else None,
            impact_by_signal=impact_by_signal,
            impact=impact,
            signal_names=tf.gather(tf.convert_to_tensor(get_lookups(self.encodings["price_dev"])), batch.signal_index),
        )

__call__(batch, training=False, debug=False, skip_metrics=False)

Pricing Layer Forward Propagation. We take in the mean normalized \(price\) signal of shape (num_time, num_granular, n_sim). Then we take the \(offset\) and \(exponent\) learnt by the model, each of shape (num_granular,). The impact is calculated as follows:

\(volume = \frac{normalization\_mult} {(price + offset) ^ {exponent}}\)

\(normalization\_{mult} = (1 + offset) ^ {exponent}\)

\(impact = volume * price\)

This \(impact\) is of shape (num_time, num_granular, n_sim)

NOTE: normalization_mult is a factor to neglect the impact of prices which equal the average price.
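As a quick numeric illustration (values assumed for clarity; they match the initializer comment's rough picture of price * (mult / (price + 1) ** 2), i.e. offset = 1 and exponent = 2):

import numpy as np

offset, exponent = 1.0, 2.0
normalization_mult = (1 + offset) ** exponent          # 4.0, so that volume == 1 at price == 1

price = np.array([0.8, 1.0, 1.2])                      # mean-normalized prices
volume = normalization_mult / (price + offset) ** exponent
impact = volume * price
print(impact)  # ~[0.988, 1.0, 0.992] -- impact is 1 at the average price and falls off on either side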

Parameters:

    price (TensorLike): Mean normalized price_per_hl for each granularity each week. Shape: (num_time, num_granular, n_sim). Required.
    hierarchy (dict[str, TensorLike]): Hierarchical placeholder for creating the hierarchical variable. Required.
    training (bool, optional): Whether this is a training or inference run. Defaults to False.

Returns:

    PricingIntermediaries: Intermediate calculations like offset, asymptote, exponent, etc., and the final impact.

Source code in wt_ml/layers/pricing.py
def __call__(
    self, batch: PricingInput, training: bool = False, debug: bool = False, skip_metrics: bool = False
) -> PricingIntermediaries:
    """Pricing Layer Forward Propagation.
    We take in the mean normalized $price$ signal of shape `(num_time, num_granular, n_sim)`.
    Then we take the $offset$ and $exponent$ learnt by the model, each of shape `(num_granular,)`.
    The impact is calculated as follows:

    $volume = \\frac{normalization\\_mult} {(price + offset) ^ {exponent}}$

    $normalization\\_{mult} = (1 + offset) ^ {exponent}$

    $impact = volume * price$

    This $impact$ is of shape `(num_time, num_granular, n_sim)`

    > NOTE: normalization\\_mult is a factor to neglect the impact of prices which equal the average price.

    Args:
        price (TensorLike): mean normalized price_per_hl for each granularity each week.
                            Shape: (num_time, num_granular, n_sim)
        hierarchy (dict[str, TensorLike]): Hierarchical Placeholder for creating hierarchical variable.
        training (bool, optional): Whether this is a training or inference run. Defaults to False.

    Returns:
        PricingIntermediaries: Intermediate calculations like offset, asymptote, exponent, etc., and final impact.
    """
    params_emb = self.pricing_params_emb_layer(batch.hierarchy, training=training, skip_metrics=skip_metrics)
    offset_emb, exponent_emb = tf.unstack(params_emb, axis=2)
    offset = softplus(offset_emb * 10) + 0.01
    exponent = monotonic_sigmoid(exponent_emb / 4) * 4 + 1
    # Here we introduce a mult which is a normalization parameter
    # the equation for volume by price is: volume = mult / ((price + offset) ** exponent)
    # We want volume = 1 when price = 1 so we need to solve
    # 1 = mult / ((1 + offset) ** exponent)
    # mult = (1 + offset) ** exponent
    normalization_mult = (1 + offset) ** exponent
    volume = normalization_mult[:, None] / ((batch.prices + offset[:, None]) ** exponent[:, None])
    # Impact is the revenue of this volume at that price, so just the product.
    revenue = tf.math.multiply(
        volume, tf.where(batch.prices > 0, batch.prices, tf.math.reciprocal_no_nan(volume)), name="impact"
    )
    impact_by_signal = revenue
    # Reduce over signal axis
    impact = tf.math.reduce_prod(impact_by_signal, axis=2, name="impact")
    return PricingIntermediaries(
        offset_emb=offset_emb if debug else None,
        exponent_emb=exponent_emb if debug else None,
        offset=offset if debug else None,
        exponent=exponent if debug else None,
        volume=volume if debug else None,
        revenue=revenue if debug else None,
        impact_by_signal=impact_by_signal,
        impact=impact,
        signal_names=tf.gather(tf.convert_to_tensor(get_lookups(self.encodings["price_dev"])), batch.signal_index),
    )
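
The transforms in the forward pass above map unconstrained embedding values into valid curve parameters: a strictly positive offset and an exponent above 1. The sketch below illustrates the idea with standard TensorFlow ops; `tf.nn.softplus` and `tf.sigmoid` stand in for the library's `softplus` and `monotonic_sigmoid` helpers, which may differ in detail, and the embedding values are made up.

```python
# Illustrative-only: squash unconstrained embeddings into curve parameters.
import tensorflow as tf

offset_emb = tf.constant([-0.3, 0.0, 0.6])    # hypothetical embedding values
exponent_emb = tf.constant([-1.0, 0.0, 1.0])

offset = tf.nn.softplus(offset_emb * 10) + 0.01   # strictly positive offset
exponent = tf.sigmoid(exponent_emb / 4) * 4 + 1   # exponent kept in (1, 5)

print(offset.numpy(), exponent.numpy())
```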

__init__(encodings, hierarchy_categories=None, hyperparameters=None, name=None)

Multiplicative price elasticity factor affecting baseline sales that also scales ROI of investments.

Parameters:

| Name | Type | Description | Default |
| ---- | ---- | ----------- | ------- |
| hierarchy | DataFrame | The hierarchy used to build features learnt by the model to generate impacts. | required |
| hyperparameters | Hyperparams \| None | An instance of the FileHyperparameterConfig class that stores all the hyperparameters of the Pricing layer. The Module parent class sets these hyperparameters if None. | None |
| name | str \| None | Name of the Pricing Layer. The Module parent class sets the name of the class if None. | None |
Source code in wt_ml/layers/pricing.py
def __init__(
    self,
    encodings: dict[str, Any],
    hierarchy_categories: list[str | list[str]] | None = None,
    hyperparameters: Hyperparams | None = None,
    name: str | None = None,
):
    """Multiplicative price elasticity factor affecting baseline sales that also scales ROI of investments.

    Args:
        hierarchy (pd.DataFrame): The hierarchy used to build features learnt by the model to generate impacts.
        hyperparameters (Hyperparams | None, optional): An instance of `FileHyperparameterConfig` class
                                                        that stores all the hyperparameters of Pricing layer.
                                                        `Module` parent class sets these hyperparameters if None.
        name (str | None, optional): Name of the Pricing Layer.
                                     `Module` parent class sets name of the class if None.
    """
    super().__init__(hyperparameters=hyperparameters, name=name)
    self.encodings = encodings
    self.hierarchy_categories = hierarchy_categories

build(input_shapes)

Builds the pricing_params_emb_layer hierarchical variable used to generate the price elasticity curve for each granularity. Shape of the variable: (num_granular, 2), where 2 denotes offset and exponent.

Parameters:

| Name | Type | Description | Default |
| ---- | ---- | ----------- | ------- |
| input_shapes | InputShapes | A tuple of tensor shapes of price and hierarchy passed to __call__. | required |
Source code in wt_ml/layers/pricing.py
def build(self, input_shapes: InputShapes):  # noqa: U100
    """Builds the `price_params_emb_layer` hierarchical variable
    for generating price elasticity curve for each granularity.
    Shape of the variable: (num_granular, 2). 2 denotes offset and exponent.

    Args:
        input_shapes (InputShapes): A tuple of tensor shapes of `price` and `hierarchy` passed to `__call__`.
    """
    n_instances = input_shapes.prices[2]
    self.pricing_params_emb_layer = self.hyperparameters.get_submodule(
        "pricing_params_emb_layer",
        module_type=HierchicalEmbedding,
        kwargs=dict(
            encodings=self.encodings,
            columns=self.hierarchy_categories,
            # 2 here denotes offset and exponent
            shape=[n_instances, 2],
            # This initializes to a state where any change in price reduces revenue while still making learning
            # other distributions easy. You can roughly think of this as price * (mult / (price + 1) ** 2)
            # Where mult is a specially calculated value so that the result with price = 1 is 1.
            # Order is offset (softplus), exponent (softplus + 1)
            bias_initializer=tf.constant_initializer(np.tile([[0.6, 0.6]], (n_instances, 1)).reshape(-1)),
            feature_names=[
                [f"{signal}_offset", f"{signal}_exponent"] for signal in get_lookups(self.encodings["price_dev"])
            ],
        ),
        help="The embedding for the parameters for the pricing elasticity curve.",
    )
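
For intuition about the bias_initializer above, the NumPy-only sketch below (with a hypothetical n_instances) shows the flat vector it constructs: every instance starts from the same (offset_emb, exponent_emb) pair of 0.6.

```python
# Show the flat bias vector the initializer above is built from (hypothetical n_instances).
import numpy as np

n_instances = 3
flat_bias = np.tile([[0.6, 0.6]], (n_instances, 1)).reshape(-1)
print(flat_bias)                          # [0.6 0.6 0.6 0.6 0.6 0.6]
print(flat_bias.reshape(n_instances, 2))  # one (offset_emb, exponent_emb) row per instance
```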

apply_impacts(baseline, multiplicative_impacts, additive_impacts)

Apply the impacts on top of the baseline to get yhat.

\[ yhat = \left( baseline \cdot \prod_{m} {multiplicativeEffect}_{m} \right) + \sum_{a} {additiveEffect}_{a} \]

Parameters:

| Name | Type | Description | Default |
| ---- | ---- | ----------- | ------- |
| baseline | Tensor | The baseline impact. This is the starting point that the impacts are applied on. | required |
| multiplicative_impacts | list[Tensor] | These impacts scale the baseline multiplicatively (larger effect). | required |
| additive_impacts | list[Tensor] | These impacts increase the baseline additively (smaller effect). | required |

Returns:

| Type | Description |
| ---- | ----------- |
| Tensor | The yhat after applying all the impacts on the baseline. |

Source code in wt_ml/layers/impact_utils.py
def apply_impacts(
    baseline: tf.Tensor,
    multiplicative_impacts: list[tf.Tensor] | tuple[tf.Tensor, ...],
    additive_impacts: list[tf.Tensor] | tuple[tf.Tensor, ...],
) -> tf.Tensor:
    """Apply the impacts on top of the basline to get yhat.

    $$
    yhat = (baseline*\\underset{}{\\overset{m}{\\prod }}({multiplicativeEffect}_{m}))
        + \\underset{}{\\overset{a}{\\sum }}({additiveEffect}_{a})
    $$

    Args:
        baseline (tf.Tensor): The baseline impact. This will be the starting point where impacts are applied on.
        multiplicative_impacts (list[tf.Tensor]): These impacts scale the baseline multiplicatively (larger effect).
        additive_impacts (list[tf.Tensor]): These impacts increase the baseline additively (smaller effect).

    Returns:
        tf.Tensor: The yhat after applying all the impacts on the baseline.
    """  # noqa: E501
    yhat = baseline
    for impact in multiplicative_impacts:
        yhat = yhat * impact
    for impact in additive_impacts:
        yhat = yhat + impact
    return yhat
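
Assuming apply_impacts is importable from wt_ml.layers.impact_utils (the source path shown above), a toy usage sketch might look like this; the tensors and impact names are purely illustrative:

```python
# Toy usage of apply_impacts; values and impact names are illustrative only.
import tensorflow as tf

from wt_ml.layers.impact_utils import apply_impacts  # path per the docs above

baseline = tf.constant([10.0, 20.0])
pricing_impact = tf.constant([1.1, 0.9])  # multiplicative: scales the baseline
holiday_bump = tf.constant([0.5, 0.0])    # additive: added on top

yhat = apply_impacts(baseline, [pricing_impact], [holiday_bump])
print(yhat.numpy())  # [10*1.1 + 0.5, 20*0.9 + 0.0] -> [11.5, 18.0]
```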

apply_inverse_impacts(y, multiplicative_impacts, additive_impacts)

Remove the impacts from y to get the baseline back: baseline = (yhat - Σ(additive effects)) / ∏(multiplicative effects)

NOTE: These are the inverse impacts, i.e.,

inverse multiplicative effect = 1 / multiplicative effect
inverse additive effect = -1 * additive effect

Parameters:

| Name | Type | Description | Default |
| ---- | ---- | ----------- | ------- |
| y | Tensor | The y total impact. | required |
| multiplicative_impacts | list[Tensor] | Multiplicative impacts; y is divided by these (larger effect). | required |
| additive_impacts | list[Tensor] | Additive impacts; these are subtracted off the y (smaller effect). | required |

Returns:

| Type | Description |
| ---- | ----------- |
| Tensor | The baseline after all the impacts are removed from y. |

Source code in wt_ml/layers/impact_utils.py
def apply_inverse_impacts(
    y: tf.Tensor, multiplicative_impacts: list[tf.Tensor], additive_impacts: list[tf.Tensor]
) -> tf.Tensor:
    """Remove the impacts from `y` to get the baseline back.
    baseline = (yhat - Σ(additive effects)) / ∏(multiplicative effects)

    NOTE: These are the inverse impacts, i.e.,
        inverse multiplicative effect = 1/multiplicative effect
        inverse additive effect = -1 * additive effect

    Args:
        y (tf.Tensor): The `y` total impact.
        multiplicative_impacts (list[tf.Tensor]): Multiplicative impacts; `y` is divided by these (larger effect).
        additive_impacts (list[tf.Tensor]): Additive impacts; these are subtracted off the `y` (smaller effect).

    Returns:
        tf.Tensor: The baseline after all the impacts are removed from `y`.
    """
    baseline = y
    for impact in additive_impacts:
        baseline = baseline - impact
    for impact in multiplicative_impacts:
        baseline = tf.math.divide_no_nan(baseline, impact)
    return baseline
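
As a sanity check, applying apply_impacts and then apply_inverse_impacts with the same impacts should recover the original baseline (toy values; same import assumption as in the sketch above):

```python
# Round trip: apply impacts, then remove them again to recover the baseline.
import tensorflow as tf

from wt_ml.layers.impact_utils import apply_impacts, apply_inverse_impacts

baseline = tf.constant([10.0, 20.0])
mult = [tf.constant([1.1, 0.9])]
add = [tf.constant([0.5, 0.0])]

yhat = apply_impacts(baseline, mult, add)
recovered = apply_inverse_impacts(yhat, mult, add)
print(recovered.numpy())  # ~[10., 20.], up to floating-point error
```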