class HierchicalEmbedding(Module):
"""Hierarchical Embedding creates embeddings for a layer with different input hierarchy levels
as trainable weights such that the deviations from the expected deviations are penalized.
These trained embeddings are used to calculate the model parameters for a layer.
"""
def __init__(
self,
shape: list[int],
encodings: dict[str, Any],
columns: list[str | list[str]] | None = None,
use_bias: bool = True,
dropped_columns=[],
initializer: Initializer = 0.0,
bias_initializer: Initializer = 0.0,
hyperparameters: Hyperparams | None = None,
feature_names: list[list[str]] | list[str] | None = None,
name: str | None = None,
increase_lr: float | None = None,
):
"""Initializes the hierarchical embedding object with hierarchy levels, parameter shape
and other initializers.
Args:
shape (list[int]): Desired dimensions of the model parameters in the final result.
encodings (dict[str, Any]): Encodings of the hierarchy levels for which embeddings are trained.
columns (list[str | list[str]] | None, optional): Hierarchy levels to learn embeddings for. Defaults to None.
use_bias (bool, optional): Whether to include a bias term. Defaults to True.
dropped_columns (list, optional): Columns to exclude from the hierarchy. Defaults to an empty list.
initializer (Initializer, optional): Initializer for the embeddings (weights). Defaults to 0.0.
bias_initializer (Initializer, optional): Initializer for the bias. Defaults to 0.0.
hyperparameters (Hyperparams | None, optional): Dictionary of hyperparameters for building this layer.
Defaults to None.
feature_names (list[list[str]] | list[str] | None, optional): Names of the output features. Defaults to None.
name (str | None, optional): Name of the layer. Defaults to None.
increase_lr (float | None, optional): Optional constant multiplier applied to the weights and bias,
effectively scaling their learning rate. Defaults to None.
"""
super().__init__(hyperparameters=hyperparameters, name=name)
self.use_bias = use_bias
encodings_dropped = {k: v for k, v in encodings.items() if k not in dropped_columns}
assert encodings_dropped, "No columns remain in the hierarchy after dropping."
self.encodings = encodings_dropped
self.shape = shape
self.increase_lr = increase_lr
self.initializer = initializer
self.bias_initializer = bias_initializer
self.created_reg = False
self.feature_names = feature_names
if self.feature_names is not None and isinstance(self.feature_names[0], list):
self.feature_names = [elem for sublist in self.feature_names for elem in sublist]
if columns is not None:
self._process_columns(columns)
else:
self.columns = list(self.encodings.keys())
self.used_cols = set(self.columns)
def _process_columns(self, columns: list[str | list[str]]):
"""
Process columns, removing duplicates and columns whose hierarchies contain only a single value.
Sets the `columns` and `used_cols` attributes.
Args:
columns (list[str | list[str]]): Hierarchy levels to learn embeddings for.
"""
used_cols = set(tf.nest.flatten(columns))
missing_cols = used_cols.difference(self.encodings.keys())
assert not missing_cols, f"Column(s) passed not in hierarchy. {missing_cols}"
issues: list[str] = []
new_columns: list[str | tuple[str]] = []
for column in columns:
if isinstance(column, (list, tuple)):
new_column = []
for sub_col in column:
encodings = self.encodings[sub_col]
if encodings == "continuous" or len(encodings) > 1:
new_column.append(sub_col)
else:
issues.append(f"{sub_col} in {column} has single encoding.")
if len(new_column) == 1:
# convert to str so duplicates can be detected easily.
new_columns.append(new_column[0])
elif len(new_column) > 1:
new_columns.append(tuple(new_column))
else:
issues.append(f"Dropping {column} as it has unique encodings.")
else:
encodings = self.encodings[column]
if encodings == "continuous" or len(encodings) > 1:
new_columns.append(column)
else:
issues.append(f"Dropping {column} as it has single encoding.")
org_col_len = len(new_columns)
# if any column is duplicated, we need to get rid of it.
new_columns = list(dict.fromkeys(new_columns))
assert len(new_columns), "All columns were dropped because each has only a single encoding."
if len(new_columns) != org_col_len:
issues.append("Duplicate hierarchies removed.")
if issues:
warn_issues(self.name, issues, new_columns, columns)
self.columns = new_columns
self.used_cols = set(tf.nest.flatten(self.columns))
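# Illustration of the processing above (hypothetical columns/encodings): if "brand" has 3 encodings,
# "coastal" has a single encoding, and "price" is continuous, then
# columns=["brand", ["brand"], ["coastal", "price"]] is reduced to ["brand", "price"]: the single-element
# list ["brand"] collapses to the string "brand" and is deduplicated, and "coastal" is dropped from the
# pair because it has only one encoding, leaving "price" on its own.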
def build(self, input_shapes): # noqa: U100
"""Builds hyperparamters, deviations, embeddings(weights), bias and other intermediate variables.
Args:
input_shapes (InputShapes): The effect and hierarchy shapes.
Raises:
AllUniqueError: When there are no hierarchical columns because everything in the hierarchy is unique.
"""
self.use_l2_squared = self.hyperparameters.get_bool(
"use_l2_squared",
default=False,
help="Use the l2 norm to the fourth power instead of only using it for large values for stability.",
)
self.desired_stddev = self.hyperparameters.get_float(
"desired_stddev",
default=0.10,
min=0.01,
max=100.0,
help="The desired maximum value for the stddev along the full hierarchy.",
)
self.use_inv_sqrt = self.hyperparameters.get_bool(
"use_inv_sqrt",
default=True,
help="Scale the stddev for each category by the inverse square root of the number of unique values.",
)
if self.use_bias:
self.reg_bias = self.hyperparameters.get_float(
"reg_bias",
default=0.0,
min=0.0,
max=1e4,
help="The strength of l2 regularization to apply to the bias term.",
)
self.offsets = {}
self.col_counts = {
k: (
self.encodings[k]
if isinstance(self.encodings[k], (float, int))
else ((max(self.encodings[k].values()) + 1) if not isinstance(self.encodings[k], str) else 1)
)
for k in tf.nest.flatten(self.columns)
}
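# Example of col_counts (hypothetical encodings): a categorical encoding {"a": 0, "b": 1, "c": 2} yields
# 3 (max value + 1), a "continuous" column yields 1, and a numeric encoding such as 7 is passed through.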
var_counts = [
self.col_counts[col] if isinstance(col, str) else np.prod([self.col_counts[k] for k in col])
for col in self.columns
]
self.columns = [col for col, count in zip(self.columns, var_counts) if count > 1 or self.is_continuous(col)]
count = 0
desired_stddevs = []
# scatters maps each weight row to the 1-based index of its regularized category; it is the inverse of
# gathering from the num_regularized_categories + 1 rows down to the full count of weight rows.
self.scatters = []
self.penalty_mults = []
multipliers = []
reg_counts = []
flattened = int(np.prod(self.shape)) if len(self.shape) > 0 else 1
for col_names in self.columns:
if isinstance(col_names, str):
col_names = [col_names]
cat_cols = [col for col in col_names if self.encodings[col] != "continuous"]
name = self.stitched_cols(col_names)
num_cont_cols = len(col_names) - len(cat_cols)
if num_cont_cols > 1:
raise ValueError(
"You can only have one continuous hierarchical variable within a single hierarchical level"
)
number = int(np.prod([self.col_counts[k] for k in col_names]))
multipliers.append(1 / number)
self.scatters += [len(multipliers)] * number
desired_stddevs.append(1 / np.sqrt(number) if self.use_inv_sqrt else 1)
reg_counts.append(max(1, number - 1))
self.offsets[name] = count
count += number
self.penalty_mults.append(self.get_reg_mult(col_names))
if count == 0 or len(multipliers) == 0:
raise AllUniqueError("There are no hierchical columns everything is unique.")
# scatters is shape (count,)
self.scatters = np.array(self.scatters)
# multipliers is shape (1 + regularized_counts,)
self.multipliers = np.array([0] + multipliers, dtype=np.float32)
self.penalty_mults = np.array(self.penalty_mults, dtype=np.float32)
self.dense_shape = [len(self.multipliers), flattened]
# desired_* is shape (regularized_counts,)
self.desired_stddevs = (
self.desired_stddev * np.array(desired_stddevs or [1], dtype=np.float32) / np.sqrt(max(1, len(multipliers)))
)
self.desired_l2norms = (
np.array(reg_counts, dtype=np.float32) * self.desired_stddevs**2 / (1 if self.use_l2_squared else 2)
)
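# Worked example (hypothetical sizes): with desired_stddev=0.1, use_inv_sqrt=True, use_l2_squared=False and
# two levels of sizes 5 and 3, the per-level stddev targets are 0.1/sqrt(5)/sqrt(2) and 0.1/sqrt(3)/sqrt(2),
# and desired_l2norms = [4, 2] * desired_stddevs**2 / 2 ~= [0.002, 0.0017], i.e. the target sum of squared
# weights per regularized level.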
self.weights = self.create_var(
"weights", shape=[count, flattened], dtype=tf.float32, trainable=True, initializer=self.initializer
)
if self.use_bias:
self.bias = self.create_var(
"bias", shape=[flattened], dtype=tf.float32, trainable=True, initializer=self.bias_initializer
)
def stitched_cols(self, col_names: str | list[str] | tuple) -> str:
"""Returns a string representation of the columns."""
return col_names if isinstance(col_names, str) else "-".join(col_names)
def get_reg_config(self, col_names: tuple[str] | list[str]) -> tuple[str, float]:
"""Creates name used for regularization and default value for the penalty multiplier.
If all columns are categorical, we can just join their names in order to find penalty.
Otherwise, when different continuous features are paired with a same categorical column,
the resulting hierarchical categories share same penalty. Always, suffix the continuous
string to the end of the name.
Example: brand-DEM and brand-GOP have the same penalty called reg_brand-continuous.
Examples of mixed categories: reg_brand-continuous, reg_vehicle-continuous.
Args:
col_names (list[str]): Hierarchical column names.
Returns:
tuple[str, float]: Regularization penalty name and the default value.
"""
count = int(np.prod([self.col_counts[k] for k in col_names]))
if count == 1:
# Purely continuous features
default_value = 0.0
else:
default_value = 1.0
names = [name for name in col_names if not self.is_continuous(name)]
names.append("continuous") if self.is_continuous(col_names) else None
reg_name = f"reg_{self.stitched_cols(names)}"
return reg_name, default_value
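# Examples (hypothetical encodings): get_reg_config(["brand"]) -> ("reg_brand", 1.0) for a purely categorical
# level; get_reg_config(["price"]) with "price" continuous -> ("reg_continuous", 0.0); and a mixed level such
# as ["brand", "price"] -> ("reg_brand-continuous", 1.0), matching the docstring examples above (assuming
# is_continuous treats a mixed list as continuous).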
def get_reg_mult(self, col_names: list[str] | tuple[str]) -> float:
"""Returns the penalty multiplier for hierarchy level reg loss."""
reg_name, default = self.get_reg_config(col_names)
mult = self.hyperparameters.get_float(
name=reg_name,
default=default,
help="Penalty multiplier for hierarchy level reg loss.",
)
return mult
def is_continuous(self, k: str | Iterable[str]) -> bool:
return is_continuous(k, self.encodings)
def get_hierarchical_parameters(self, hierarchy: Mapping[str, TensorLike]) -> tuple[tf.Tensor, tf.Tensor]:
"""Returns the model parameters' for every hierarchical level (non-aggregated weights)
Args:
hierarchy (dict[str, TensorLike]): Hierarchy placeholder for Hierarchial embedding variable.
NOTE: this currently does not depend on training flag. Possible we change how things work such that it will.
Returns:
tuple[tf.Tensor, tf.Tensor]: weights, indices
the 1st list[tf.Tensor]=A: A[i] corresponds to the multiplicative data for the continuous aspects
of the hierarchy in self.columns[i]
the 2nd list[tf.Tensor]=B: B[i] corresponds to the indices in self.weights that corresponds to the
correct learned coefficients of the hierarchy in self.columns[i]
"""
# Shape is [count, ...] for both of these
weights = []
indices = []
for col_names in self.columns:
if isinstance(col_names, str):
# We want to assume col_names is a list of column names
col_names = [col_names]
num_cols = len(col_names)
cat_cols = [col for col in col_names if self.encodings[col] != "continuous"]
cont_cols = [col for col in col_names if self.encodings[col] == "continuous"]
num_cat_cols = len(cat_cols)
num_cont_cols = num_cols - num_cat_cols
name = self.stitched_cols(col_names)
# The start of the region for this weight
start = self.offsets[name]
if num_cont_cols == 0:
shape = tf.shape(hierarchy[cat_cols[0]])
weight = tf.ones(shape, dtype=tf.float32, name=f"{name}_weights")
else:
# you can only have 1 cont col in col_names
weight = hierarchy[cont_cols[0]]
if num_cat_cols == 0:
index = tf.cast(
tf.fill(tf.shape(weight), start),
dtype=tf.int64,
name=f"{name}_indices",
)
# With no categorical columns there is only a single weight row for this level, so every example uses the
# constant index `start` and the continuous value itself becomes the multiplicative weight.
else:
# The standard encoding of left to right indices given base col_counts[col] for each col
offsets = np.cumprod([1] + [self.col_counts[col] for col in cat_cols[:-1]])
# The index in weights where we look up the first of the embeddings for this set of columns
# This lets us concatenate all embeddings into a single weights matrix rather than defining
# them separately, while still being able to deterministically derive the index in this larger weight matrix.
index = start + tf.math.add_n(
[
# hierarchy[col] is column of dataframe
tf.constant(offset, dtype=tf.int64) * tf.cast(hierarchy[col], dtype=tf.int64)
for offset, col in zip(offsets, cat_cols)
],
name=f"{name}_indices",
)
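# Worked example (hypothetical sizes): cat_cols = ["brand", "region"] with col_counts 3 and 4 gives
# offsets = [1, 3]; a row with brand=2 and region=1 therefore looks up weights row start + 2*1 + 1*3 = start + 5.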
# Store the index for the hierarchical parameters and the corresponding continuous weighting.
indices.append(index)
weights.append(weight)
# len(self.columns), *shape(hierarchy[<any>])
weights = tf.stack(weights, axis=0, name="weights_stacked")
indices = tf.stack(indices, axis=0, name="indices")
return weights, indices
def __call__(
self,
hierarchy: dict[str, TensorLike] | tuple[tf.Tensor, tf.Tensor],
training: bool = False, # noqa: U100
debug: bool = False, # noqa: U100
skip_metrics: bool = False,
) -> tf.Tensor:
"""Returns the model parameters' embeddings calculated from the weights.
Adds l2 regularization penalties to loss based on deviations and bias.
Args:
hierarchy (dict[str, TensorLike]): Hierarchy placeholder for Hierarchial embedding variable.
training (bool, optional): Whether this is a training or inference run. Defaults to False.
Returns:
tf.Tensor: Model parameters' embeddings.
"""
if isinstance(hierarchy, tuple):
weights, indices = hierarchy
else:
# get the hierarchical parameters that correspond to the input hierarchy
# NOTE: weights is the proper multiplicative relationship using continuous hierarchical variables, not
# something from self.weights. Probably should change name in future for readability.
weights, indices = self.get_hierarchical_parameters(hierarchy)
# Look up embeddings by indices
# len(self.columns), *shape(hierarchy[<any>]), np.prod(self.shape)
if self.increase_lr is not None:
lr_scaled_weights = self.weights * tf.constant(self.increase_lr, dtype=tf.float32)
else:
lr_scaled_weights = self.weights
looked_up = tf.gather(lr_scaled_weights, indices, name="embeds")
# Convert the precomputed scatters to a constant tensor.
# counts,
scatters = tf.constant(self.scatters, dtype=tf.int64)
# Do a matrix multiply to sum over columns
# *shape(hierarchy[<any>]), np.prod(self.shape)
weighted = tf.einsum("c...f,c...->...f", looked_up, weights, name="weighted")
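# looked_up has shape (len(self.columns), *batch, flattened) and weights has shape (len(self.columns), *batch);
# the einsum multiplies each level's embedding by its continuous weight and sums over the levels, leaving
# shape (*batch, flattened).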
# This is num_regularized_categories x flattened using the same scatter trick as for means
# len(self.multipliers), np.prod(self.shape)
cur_l2_norm = tf.scatter_nd(
scatters[:, None],
tf.math.square(lr_scaled_weights, name="shifted_squared"),
shape=self.dense_shape,
name="cur_l2_norm",
)[1:]
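# Small example (hypothetical sizes): with two levels of sizes 3 and 4, self.scatters = [1, 1, 1, 2, 2, 2, 2],
# so scatter_nd produces shape (3, flattened) with per-level sums of squared weights; the [1:] slice drops the
# unused row 0, leaving one row per regularized level.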
# We want to apply l2 regularization so that this ratio is pushed to be 1 or less.
# len(self.multipliers), np.prod(self.shape)
cur_ratio = cur_l2_norm / tf.constant(self.desired_l2norms[:, None] + EPSILON, dtype=tf.float32)
# The old negative feedback was roughly cur_ratio ** 2 (in the steady state); this just makes that explicit.
# We don't care if it is above 0, so we shift down by 1 and then back up by 1 to keep it on the same scale.
if self.use_l2_squared:
hier_reg = tf.math.reduce_sum(
tf.math.square(cur_ratio) * tf.constant(self.penalty_mults[:, None], dtype=tf.float32),
name="hier_reg",
)
else:
hier_reg = cur_ratio * tf.constant(self.penalty_mults[:, None], dtype=tf.float32)
if not skip_metrics:
self.add_loss("hier_reg", hier_reg, category="hier")
if self.use_bias:
if self.increase_lr is not None:
lr_scaled_bias = self.bias * tf.constant(self.increase_lr, dtype=tf.float32)
else:
lr_scaled_bias = self.bias
if self.reg_bias > 0 and not skip_metrics:
bias_loss = tf.math.reduce_sum(tf.math.square(lr_scaled_bias))
self.add_loss("reg_bias", bias_loss, category="aux", mult=self.reg_bias)
result = tf.nn.bias_add(weighted, lr_scaled_bias, name="biased")
else:
result = weighted
# We want to undo the flattening we did for simpler logic.
initial_shape = [tf.shape(result)[i] for i in range(len(result.shape) - 1)]
# *shape(hierarchy[<any>]), *self.shape
return tf.reshape(result, [*initial_shape, *self.shape], name="final_var")
def get_tensors(
self, dy_dweights: tf.Tensor | tf.Variable | None = None, dy_dbias: tf.Tensor | tf.Variable | None = None
) -> tuple[dict[str, tf.Tensor], dict[str, list[dict] | pd.MultiIndex], list[str | int]]:
"""Get the learned weights for a HierarchicalEmbedding layer"""
output_tensors: dict[str, tf.Tensor] = {}
output_indices: dict[str, pd.Index | pd.MultiIndex] = {}
weights = self.weights if dy_dweights is None else dy_dweights
feature_names = self._get_feature_names(weights)
if self.use_bias:
bias = self.bias if dy_dbias is None else dy_dbias
self._process_bias(bias, output_tensors, output_indices)
self._process_columns_in_tensors(output_tensors, output_indices, weights)
return output_tensors, output_indices, feature_names
def _get_feature_names(self, weights: tf.Tensor | tf.Variable) -> list[str]:
n_features = weights.shape[-1]
feature_names = list(range(n_features)) if self.feature_names is None else self.feature_names
if len(feature_names) != n_features:
if n_features % len(feature_names) == 0:
num_dups = n_features // len(feature_names)
feature_names = [f"{name}_{i+1}" for name in feature_names for i in range(num_dups)]
else:
raise ValueError(
f"feature_names must be a list of size {n_features}, but got size {len(feature_names)}"
)
return feature_names
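# Example (hypothetical names): feature_names=["alpha", "beta"] with n_features=4 is expanded to
# ["alpha_1", "alpha_2", "beta_1", "beta_2"]; any size mismatch that is not an even multiple raises a ValueError.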
def _process_bias(
self,
bias: tf.Tensor | tf.Variable,
output_tensors: dict[str, tf.Tensor],
output_indices: dict[str, pd.Index | pd.MultiIndex],
):
output_tensors["bias"] = tf.expand_dims(bias, axis=0)
output_indices["bias"] = pd.Index(["bias"])
def _process_columns_in_tensors(
self,
output_tensors: dict[str, tf.Tensor],
output_indices: dict[str, pd.Index | pd.MultiIndex],
weights: tf.Tensor,
):
n_features = weights.shape[-1]
for col_names in self.columns:
if isinstance(col_names, str):
col_names = [col_names]
hierarchy, output_index = self._get_hierarchy_and_output_index(col_names)
learned_weights = self._get_learned_weights(hierarchy, col_names, weights)
learned_weights = self._reshape_learned_weights_if_needed(learned_weights, n_features)
output_tensors[self.stitched_cols(col_names)] = learned_weights
output_indices[self.stitched_cols(col_names)] = output_index
def _get_hierarchy_and_output_index(
self, col_names: list[str]
) -> tuple[dict[str, NDArray], pd.Index | pd.MultiIndex]:
cat_cols = [col for col in col_names if self.encodings[col] != "continuous"]
cont_cols = [col for col in col_names if self.encodings[col] == "continuous"]
num_cat_cols = len(cat_cols)
if num_cat_cols == 0:
hierarchy = {cont_cols[0]: np.asarray([1.0])}
output_index = pd.Index([cont_cols[0]])
else:
midx = pd.MultiIndex.from_product([self.encodings[c].values() for c in cat_cols], names=cat_cols)
output_index = pd.MultiIndex.from_product([self.encodings[c].keys() for c in cat_cols], names=cat_cols)
if len(cont_cols) > 0:
# TODO (@RyanSaxe): why is cont_cols[0] used? Adding a comment would be helpful.
midx = pd.concat({1.0: pd.DataFrame(index=midx)}, names=[cont_cols[0]]).index
hierarchy = {h: midx.get_level_values(h).to_numpy() for h in midx.names}
return hierarchy, output_index
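# Example (hypothetical encodings): for cat_cols=["brand"] with encodings {"a": 0, "b": 1, "c": 2} and no
# continuous column, hierarchy == {"brand": array([0, 1, 2])} and output_index is a single-level MultiIndex
# over ["a", "b", "c"] named "brand".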
def _get_learned_weights(
self, hierarchy: dict[str, NDArray], col_names: list[str], weights: tf.Tensor
) -> tf.Tensor:
name = self.stitched_cols(col_names)
start = self.offsets[name]
cat_cols = [col for col in col_names if self.encodings[col] != "continuous"]
cont_cols = [col for col in col_names if self.encodings[col] == "continuous"]
if len(cont_cols) == 0:
shape = tf.shape(hierarchy[list(hierarchy.keys())[0]])
weight = tf.ones(shape, dtype=tf.float32, name=f"{self.stitched_cols(col_names)}_weights")
else:
weight = hierarchy[cont_cols[0]]
if len(cat_cols) == 0:
index = tf.cast(
tf.fill(tf.shape(weight), self.offsets[name]),
dtype=tf.int64,
name=f"{name}_indices",
)
else:
offsets = np.cumprod([1] + [self.col_counts[col] for col in cat_cols[:-1]])
index = start + tf.math.add_n(
[
tf.constant(offset, dtype=tf.int64) * tf.cast(hierarchy[col], dtype=tf.int64)
for offset, col in zip(offsets, cat_cols)
]
)
return tf.gather(weights, index, name="embeds")
def _reshape_learned_weights_if_needed(self, learned_weights: tf.Tensor, n_features: int) -> tf.Tensor:
if len(learned_weights.shape) > 2:
flattened_shape = prod(learned_weights.shape[:-1])
return tf.reshape(learned_weights, (flattened_shape, n_features))
return learned_weights
def get_dfs(
self, dy_dweights: tf.Tensor | None = None, dy_dbias: tf.Tensor | None = None
) -> dict[str, pd.DataFrame]:
"""Get the learned weights for a HierarchicalEmbedding layer as a DataFrame"""
# NOTE: separated this function so we could more easily differentiate
output_tensors, output_indices, feature_names = self.get_tensors(dy_dweights=dy_dweights, dy_dbias=dy_dbias)
return {
key: pd.DataFrame(tensor, index=output_indices[key], columns=feature_names)
for key, tensor in output_tensors.items()
}
@property
def dfs(self) -> dict[str, pd.DataFrame]:
return self.get_dfs()
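# Hedged usage sketch (hypothetical encodings and shapes; assumes `build` is run before the first call,
# e.g. by the surrounding Module framework, and that default hyperparameters are available when None is passed):
#
# emb = HierchicalEmbedding(
#     shape=[8],
#     encodings={"brand": {"a": 0, "b": 1, "c": 2}, "price": "continuous"},
#     columns=["brand", ["brand", "price"]],
# )
# emb.build(input_shapes=None)
# params = emb({"brand": tf.constant([0, 2], dtype=tf.int64), "price": tf.constant([1.5, 0.7])})
# params.shape  # -> (2, 8): one parameter vector of length 8 per input row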