class HierchicalEmbedding(Module):
"""Hierarchical Embedding creates embeddings for a layer with different input hierarchy levels
as trainable weights such that the deviations from the expected deviations are penalized.
These trained embeddings are used to calculate the model parameters for a layer.
"""
def __init__(
self,
shape: list[int],
encodings: dict[str, Any],
columns: list[str | list[str]] | None = None,
use_bias: bool = True,
dropped_columns=[],
initializer: Initializer = 0.0,
bias_initializer: Initializer = 0.0,
hyperparameters: Hyperparams | None = None,
feature_names: list[list[str]] | list[str] | None = None,
name: str | None = None,
increase_lr: float | None = None,
):
"""Initializes the hierarchical embedding object with hierarchy levels, parameter shape
and other initializers.
Args:
shape (list[int]): Desired dimensions of the model parameters in the final result.
encodings (dict[str, Any]): Encodings of the hierarchy levels for which embeddings are trained.
columns (list[str | list[str]] | None, optional): Hierarchy levels to learn embeddings for. Defaults to None.
use_bias (bool, optional): Whether to include a bias term. Defaults to True.
dropped_columns (list, optional): Columns to exclude from the hierarchy. Defaults to an empty list.
initializer (Initializer, optional): Initializer for the embeddings (weights). Defaults to 0.0.
bias_initializer (Initializer, optional): Initializer for the bias. Defaults to 0.0.
hyperparameters (Hyperparams | None, optional): Dictionary of hyperparameters for building this layer.
Defaults to None.
feature_names (list[list[str]] | list[str] | None, optional): Names of the output features. Defaults to None.
name (str | None, optional): Name of the layer. Defaults to None.
increase_lr (float | None, optional): Optional constant multiplier applied to the weights and bias,
effectively scaling their learning rate. Defaults to None.
"""
super().__init__(hyperparameters=hyperparameters, name=name)
self.use_bias = use_bias
encodings_dropped = {k: v for k, v in encodings.items() if k not in dropped_columns}
assert encodings_dropped, "No columns remain in the hierarchy after dropping."
self.encodings = encodings_dropped
self.shape = shape
self.increase_lr = increase_lr
self.initializer = initializer
self.bias_initializer = bias_initializer
self.created_reg = False
self.feature_names = feature_names
if self.feature_names is not None and isinstance(self.feature_names[0], list):
self.feature_names = [elem for sublist in self.feature_names for elem in sublist]
if columns is not None:
self._process_columns(columns)
else:
self.columns = list(self.encodings.keys())
self.used_cols = set(self.columns)
def _process_columns(self, columns: list[str | list[str]]):
"""
Process columns, removing duplicates and columns whose hierarchies contain only a single value.
Sets the `columns` and `used_cols` attributes.
Args:
columns (list[str | list[str]]): Hierarchy levels to learn embeddings for.
"""
used_cols = set(tf.nest.flatten(columns))
missing_cols = used_cols.difference(self.encodings.keys())
assert not missing_cols, f"Column(s) passed not in hierarchy. {missing_cols}"
issues: list[str] = []
new_columns: list[str | tuple[str]] = []
for column in columns:
if isinstance(column, (list, tuple)):
new_column = []
for sub_col in column:
encodings = self.encodings[sub_col]
if encodings == "continuous" or len(encodings) > 1:
new_column.append(sub_col)
else:
issues.append(f"{sub_col} in {column} has single encoding.")
if len(new_column) == 1:
# convert to str so duplicates can be detected easily.
new_columns.append(new_column[0])
elif len(new_column) > 1:
new_columns.append(tuple(new_column))
else:
issues.append(f"Dropping {column} as it has unique encodings.")
else:
encodings = self.encodings[column]
if encodings == "continuous" or len(encodings) > 1:
new_columns.append(column)
else:
issues.append(f"Dropping {column} as it has single encoding.")
org_col_len = len(new_columns)
# if any column is duplicated, we need to get rid of it.
new_columns = list(dict.fromkeys(new_columns))
assert len(new_columns), "All columns were dropped because each has only a single encoding."
if len(new_columns) != org_col_len:
issues.append("Duplicate hierarchies removed.")
if issues:
warn_issues(self.name, issues, new_columns, columns)
self.columns = new_columns
self.used_cols = set(tf.nest.flatten(self.columns))
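# Illustration of the processing above (hypothetical columns/encodings): if "brand" has 3 encodings,
# "coastal" has a single encoding, and "price" is continuous, then
# columns=["brand", ["brand"], ["coastal", "price"]] is reduced to ["brand", "price"]: the single-element
# list ["brand"] collapses to the string "brand" and is deduplicated, and "coastal" is dropped from the
# pair because it has only one encoding, leaving "price" on its own.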
def build(self, input_shapes): # noqa: U100
"""Builds hyperparamters, deviations, embeddings(weights), bias and other intermediate variables.
Args:
input_shapes (InputShapes): The effect and hierarchy shapes.
Raises:
AllUniqueError: When there are no hierarchical columns because everything in the hierarchy is unique.
"""
self.use_l2_squared = self.hyperparameters.get_bool(
"use_l2_squared",
default=False,
help="Use the l2 norm to the fourth power instead of only using it for large values for stability.",
)
self.desired_stddev = self.hyperparameters.get_float(
"desired_stddev",
default=0.10,
min=0.01,
max=100.0,
help="The desired maximum value for the stddev along the full hierarchy.",
)
self.use_inv_sqrt = self.hyperparameters.get_bool(
"use_inv_sqrt",
default=True,
help="Scale the stddev for each category by the inverse square root of the number of unique values.",
)
if self.use_bias:
self.reg_bias = self.hyperparameters.get_float(
"reg_bias",
default=0.0,
min=0.0,
max=1e4,
help="The strength of l2 regularization to apply to the bias term.",
)
self.offsets = {}
self.col_counts = {
k: (
self.encodings[k]
if isinstance(self.encodings[k], (float, int))
else ((max(self.encodings[k].values()) + 1) if not isinstance(self.encodings[k], str) else 1)
)
for k in tf.nest.flatten(self.columns)
}
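# Example of col_counts (hypothetical encodings): a categorical encoding {"a": 0, "b": 1, "c": 2} yields
# 3 (max value + 1), a "continuous" column yields 1, and a numeric encoding such as 7 is passed through.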
var_counts = [
self.col_counts[col] if isinstance(col, str) else np.prod([self.col_counts[k] for k in col])
for col in self.columns
]
self.columns = [col for col, count in zip(self.columns, var_counts) if count > 1 or self.is_continuous(col)]
count = 0
desired_stddevs = []
# scatters maps each weight row to the 1-based index of its regularized category; it is the inverse of
# gathering from the num_regularized_categories + 1 rows down to the full count of weight rows.
self.scatters = []
self.penalty_mults = []
multipliers = []
reg_counts = []
flattened = int(np.prod(self.shape)) if len(self.shape) > 0 else 1
for col_names in self.columns:
if isinstance(col_names, str):
col_names = [col_names]
cat_cols = [col for col in col_names if self.encodings[col] != "continuous"]
name = self.stitched_cols(col_names)
num_cont_cols = len(col_names) - len(cat_cols)
if num_cont_cols > 1:
raise ValueError(
"You can only have one continuous hierarchical variable within a single hierarchical level"
)
number = int(np.prod([self.col_counts[k] for k in col_names]))
multipliers.append(1 / number)
self.scatters += [len(multipliers)] * number
desired_stddevs.append(1 / np.sqrt(number) if self.use_inv_sqrt else 1)
reg_counts.append(max(1, number - 1))
self.offsets[name] = count
count += number
self.penalty_mults.append(self.get_reg_mult(col_names))
if count == 0 or len(multipliers) == 0:
raise AllUniqueError("There are no hierchical columns everything is unique.")
# scatters is shape (count,)
self.scatters = np.array(self.scatters)
# multipliers is shape (1 + regularized_counts,)
self.multipliers = np.array([0] + multipliers, dtype=np.float32)
self.penalty_mults = np.array(self.penalty_mults, dtype=np.float32)
self.dense_shape = [len(self.multipliers), flattened]
# desired_* is shape (regularized_counts,)
self.desired_stddevs = (
self.desired_stddev * np.array(desired_stddevs or [1], dtype=np.float32) / np.sqrt(max(1, len(multipliers)))
)
self.desired_l2norms = (
np.array(reg_counts, dtype=np.float32) * self.desired_stddevs**2 / (1 if self.use_l2_squared else 2)
)
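# Worked example (hypothetical sizes): with desired_stddev=0.1, use_inv_sqrt=True, use_l2_squared=False and
# two levels of sizes 5 and 3, the per-level stddev targets are 0.1/sqrt(5)/sqrt(2) and 0.1/sqrt(3)/sqrt(2),
# and desired_l2norms = [4, 2] * desired_stddevs**2 / 2 ~= [0.002, 0.0017], i.e. the target sum of squared
# weights per regularized level.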
self.weights = self.create_var(
"weights", shape=[count, flattened], dtype=tf.float32, trainable=True, initializer=self.initializer
)
if self.use_bias:
self.bias = self.create_var(
"bias", shape=[flattened], dtype=tf.float32, trainable=True, initializer=self.bias_initializer
)
def stitched_cols(self, col_names: str | list[str] | tuple) -> str:
"""Returns a string representation of the columns."""
return col_names if isinstance(col_names, str) else "-".join(col_names)
def get_reg_config(self, col_names: tuple[str] | list[str]) -> tuple[str, float]:
"""Creates name used for regularization and default value for the penalty multiplier.
If all columns are categorical, we can just join their names in order to find penalty.
Otherwise, when different continuous features are paired with a same categorical column,
the resulting hierarchical categories share same penalty. Always, suffix the continuous
string to the end of the name.
Example: brand-DEM and brand-GOP have the same penalty called reg_brand-continuous.
Examples of mixed categories: reg_brand-continuous, reg_vehicle-continuous.
Args:
col_names (list[str]): Hierarchical column names.
Returns:
tuple[str, float]: Regularization penalty name and the default value.
"""
count = int(np.prod([self.col_counts[k] for k in col_names]))
if count == 1:
# Purely continuous features
default_value = 0.0
else:
default_value = 1.0
names = [name for name in col_names if not self.is_continuous(name)]
names.append("continuous") if self.is_continuous(col_names) else None
reg_name = f"reg_{self.stitched_cols(names)}"
return reg_name, default_value
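# Examples (hypothetical encodings): get_reg_config(["brand"]) -> ("reg_brand", 1.0) for a purely categorical
# level; get_reg_config(["price"]) with "price" continuous -> ("reg_continuous", 0.0); and a mixed level such
# as ["brand", "price"] -> ("reg_brand-continuous", 1.0), matching the docstring examples above (assuming
# is_continuous treats a mixed list as continuous).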
def get_reg_mult(self, col_names: list[str] | tuple[str]) -> float:
"""Returns the penalty multiplier for hierarchy level reg loss."""
reg_name, default = self.get_reg_config(col_names)
mult = self.hyperparameters.get_float(
name=reg_name,
default=default,
help="Penalty multiplier for hierarchy level reg loss.",
)
return mult
def is_continuous(self, k: str | Iterable[str]) -> bool:
return is_continuous(k, self.encodings)
def get_hierarchical_parameters(self, hierarchy: Mapping[str, TensorLike]) -> tuple[tf.Tensor, tf.Tensor]:
"""Returns the model parameters' for every hierarchical level (non-aggregated weights)
Args:
hierarchy (dict[str, TensorLike]): Hierarchy placeholder for Hierarchial embedding variable.
NOTE: this currently does not depend on training flag. Possible we change how things work such that it will.
Returns:
tuple[tf.Tensor, tf.Tensor]: weights, indices
the 1st list[tf.Tensor]=A: A[i] corresponds to the multiplicative data for the continuous aspects
of the hierarchy in self.columns[i]
the 2nd list[tf.Tensor]=B: B[i] corresponds to the indices in self.weights that corresponds to the
correct learned coefficients of the hierarchy in self.columns[i]
"""
# Shape is [count, ...] for both of these
weights = []
indices = []
for col_names in self.columns:
if isinstance(col_names, str):
# We want to assume col_names is a list of column names
col_names = [col_names]
num_cols = len(col_names)
cat_cols = [col for col in col_names if self.encodings[col] != "continuous"]
cont_cols = [col for col in col_names if self.encodings[col] == "continuous"]
num_cat_cols = len(cat_cols)
num_cont_cols = num_cols - num_cat_cols
name = self.stitched_cols(col_names)
# The start of the region for this weight
start = self.offsets[name]
if num_cont_cols == 0:
shape = tf.shape(hierarchy[cat_cols[0]])
weight = tf.ones(shape, dtype=tf.float32, name=f"{name}_weights")
else:
# you can only have 1 cont col in col_names
weight = hierarchy[cont_cols[0]]
if num_cat_cols == 0:
index = tf.cast(
tf.fill(tf.shape(weight), start),
dtype=tf.int64,
name=f"{name}_indices",
)
# With no categorical columns there is only a single weight row for this level, so every example uses the
# constant index `start` and the continuous value itself becomes the multiplicative weight.
else:
# The standard encoding of left to right indices given base col_counts[col] for each col
offsets = np.cumprod([1] + [self.col_counts[col] for col in cat_cols[:-1]])
# The index in weights where we look up the first of the embeddings for this set of columns
# This lets us concatenate all embeddings into a single weights matrix rather than defining
# them separately, while still being able to deterministically derive the index in this larger weight matrix.
index = start + tf.math.add_n(
[
# hierarchy[col] is column of dataframe
tf.constant(offset, dtype=tf.int64) * tf.cast(hierarchy[col], dtype=tf.int64)
for offset, col in zip(offsets, cat_cols)
],
name=f"{name}_indices",
)
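# Worked example (hypothetical sizes): cat_cols = ["brand", "region"] with col_counts 3 and 4 gives
# offsets = [1, 3]; a row with brand=2 and region=1 therefore looks up weights row start + 2*1 + 1*3 = start + 5.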
# Store the index for the hierarchical parameters and the corresponding continuous weighting.
indices.append(index)
weights.append(weight)
# len(self.columns), *shape(hierarchy[<any>])
weights = tf.stack(weights, axis=0, name="weights_stacked")
indices = tf.stack(indices, axis=0, name="indices")
return weights, indices
def __call__(
self,
hierarchy: dict[str, TensorLike] | tuple[tf.Tensor, tf.Tensor],
training: bool = False, # noqa: U100
debug: bool = False, # noqa: U100
skip_metrics: bool = False,
) -> tf.Tensor:
"""Returns the model parameters' embeddings calculated from the weights.
Adds l2 regularization penalties to loss based on deviations and bias.
Args:
hierarchy (dict[str, TensorLike]): Hierarchy placeholder for Hierarchial embedding variable.
training (bool, optional): Whether this is a training or inference run. Defaults to False.
Returns:
tf.Tensor: Model parameters' embeddings.
"""
if isinstance(hierarchy, tuple):
weights, indices = hierarchy
else:
# get the hierarchical parameters that correspond to the input hierarchy
# NOTE: weights is the proper multiplicative relationship using continuous hierarchical variables, not
# something from self.weights. Probably should change name in future for readability.
weights, indices = self.get_hierarchical_parameters(hierarchy)
# Look up embeddings by indices
# len(self.columns), *shape(hierarchy[<any>]), np.prod(self.shape)
if self.increase_lr is not None:
lr_scaled_weights = self.weights * tf.constant(self.increase_lr, dtype=tf.float32)
else:
lr_scaled_weights = self.weights
looked_up = tf.gather(lr_scaled_weights, indices, name="embeds")
# Convert the precomputed scatters to a constant tensor.
# counts,
scatters = tf.constant(self.scatters, dtype=tf.int64)
# Do a matrix multiply to sum over columns
# *shape(hierarchy[<any>]), np.prod(self.shape)
weighted = tf.einsum("c...f,c...->...f", looked_up, weights, name="weighted")
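# looked_up has shape (len(self.columns), *batch, flattened) and weights has shape (len(self.columns), *batch);
# the einsum multiplies each level's embedding by its continuous weight and sums over the levels, leaving
# shape (*batch, flattened).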
# This is num_regularized_categories x flattened using the same scatter trick as for means
# len(self.multipliers), np.prod(self.shape)
cur_l2_norm = tf.scatter_nd(
scatters[:, None],
tf.math.square(lr_scaled_weights, name="shifted_squared"),
shape=self.dense_shape,
name="cur_l2_norm",
)[1:]
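# Small example (hypothetical sizes): with two levels of sizes 3 and 4, self.scatters = [1, 1, 1, 2, 2, 2, 2],
# so scatter_nd produces shape (3, flattened) with per-level sums of squared weights; the [1:] slice drops the
# unused row 0, leaving one row per regularized level.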
# We want to apply l2 regularization so that this ratio is pushed to be 1 or less.
# len(self.multipliers), np.prod(self.shape)
cur_ratio = cur_l2_norm / tf.constant(self.desired_l2norms[:, None] + EPSILON, dtype=tf.float32)
# The old negative feedback was roughly cur_ratio ** 2 (in the steady state); this just makes that explicit.
# We don't care if it is above 0, so we shift down by 1 and then back up by 1 to keep it on the same scale.
if self.use_l2_squared:
hier_reg = tf.math.reduce_sum(
tf.math.square(cur_ratio) * tf.constant(self.penalty_mults[:, None], dtype=tf.float32),
name="hier_reg",
)
else:
hier_reg = cur_ratio * tf.constant(self.penalty_mults[:, None], dtype=tf.float32)
if not skip_metrics:
self.add_loss("hier_reg", hier_reg, category="hier")
if self.use_bias:
if self.increase_lr is not None:
lr_scaled_bias = self.bias * tf.constant(self.increase_lr, dtype=tf.float32)
else:
lr_scaled_bias = self.bias
if self.reg_bias > 0 and not skip_metrics:
bias_loss = tf.math.reduce_sum(tf.math.square(lr_scaled_bias))
self.add_loss("reg_bias", bias_loss, category="aux", mult=self.reg_bias)
result = tf.nn.bias_add(weighted, lr_scaled_bias, name="biased")
else:
result = weighted
# We want to undo the flattening we did for simpler logic.
initial_shape = [tf.shape(result)[i] for i in range(len(result.shape) - 1)]
# *shape(hierarchy[<any>]), *self.shape
return tf.reshape(result, [*initial_shape, *self.shape], name="final_var")
def get_tensors(
self, dy_dweights: tf.Tensor | tf.Variable | None = None, dy_dbias: tf.Tensor | tf.Variable | None = None
) -> tuple[dict[str, tf.Tensor], dict[str, list[dict] | pd.MultiIndex], list[str | int]]:
"""Get the learned weights for a HierarchicalEmbedding layer"""
output_tensors: dict[str, tf.Tensor] = {}
output_indices: dict[str, pd.Index | pd.MultiIndex] = {}
weights = self.weights if dy_dweights is None else dy_dweights
feature_names = self._get_feature_names(weights)
if self.use_bias:
bias = self.bias if dy_dbias is None else dy_dbias
self._process_bias(bias, output_tensors, output_indices)
self._process_columns_in_tensors(output_tensors, output_indices, weights)
return output_tensors, output_indices, feature_names
def _get_feature_names(self, weights: tf.Tensor | tf.Variable) -> list[str]:
n_features = weights.shape[-1]
feature_names = list(range(n_features)) if self.feature_names is None else self.feature_names
if len(feature_names) != n_features:
if n_features % len(feature_names) == 0:
num_dups = n_features // len(feature_names)
feature_names = [f"{name}_{i+1}" for name in feature_names for i in range(num_dups)]
else:
raise ValueError(
f"feature_names must be a list of size {n_features}, but got size {len(feature_names)}"
)
return feature_names
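# Example (hypothetical names): feature_names=["alpha", "beta"] with n_features=4 is expanded to
# ["alpha_1", "alpha_2", "beta_1", "beta_2"]; any size mismatch that is not an even multiple raises a ValueError.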
def _process_bias(
self,
bias: tf.Tensor | tf.Variable,
output_tensors: dict[str, tf.Tensor],
output_indices: dict[str, pd.Index | pd.MultiIndex],
):
output_tensors["bias"] = tf.expand_dims(bias, axis=0)
output_indices["bias"] = pd.Index(["bias"])
def _process_columns_in_tensors(
self,
output_tensors: dict[str, tf.Tensor],
output_indices: dict[str, pd.Index | pd.MultiIndex],
weights: tf.Tensor,
):
n_features = weights.shape[-1]
for col_names in self.columns:
if isinstance(col_names, str):
col_names = [col_names]
hierarchy, output_index = self._get_hierarchy_and_output_index(col_names)
learned_weights = self._get_learned_weights(hierarchy, col_names, weights)
learned_weights = self._reshape_learned_weights_if_needed(learned_weights, n_features)
output_tensors[self.stitched_cols(col_names)] = learned_weights
output_indices[self.stitched_cols(col_names)] = output_index
def _get_hierarchy_and_output_index(
self, col_names: list[str]
) -> tuple[dict[str, NDArray], pd.Index | pd.MultiIndex]:
cat_cols = [col for col in col_names if self.encodings[col] != "continuous"]
cont_cols = [col for col in col_names if self.encodings[col] == "continuous"]
num_cat_cols = len(cat_cols)
if num_cat_cols == 0:
hierarchy = {cont_cols[0]: np.asarray([1.0])}
output_index = pd.Index([cont_cols[0]])
else:
midx = pd.MultiIndex.from_product([self.encodings[c].values() for c in cat_cols], names=cat_cols)
output_index = pd.MultiIndex.from_product([self.encodings[c].keys() for c in cat_cols], names=cat_cols)
if len(cont_cols) > 0:
# TODO (@RyanSaxe): why is cont_cols[0] used? Adding a comment would be helpful.
midx = pd.concat({1.0: pd.DataFrame(index=midx)}, names=[cont_cols[0]]).index
hierarchy = {h: midx.get_level_values(h).to_numpy() for h in midx.names}
return hierarchy, output_index
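# Example (hypothetical encodings): for cat_cols=["brand"] with encodings {"a": 0, "b": 1, "c": 2} and no
# continuous column, hierarchy == {"brand": array([0, 1, 2])} and output_index is a single-level MultiIndex
# over ["a", "b", "c"] named "brand".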
def _get_learned_weights(
self, hierarchy: dict[str, NDArray], col_names: list[str], weights: tf.Tensor
) -> tf.Tensor:
name = self.stitched_cols(col_names)
start = self.offsets[name]
cat_cols = [col for col in col_names if self.encodings[col] != "continuous"]
cont_cols = [col for col in col_names if self.encodings[col] == "continuous"]
if len(cont_cols) == 0:
shape = tf.shape(hierarchy[list(hierarchy.keys())[0]])
weight = tf.ones(shape, dtype=tf.float32, name=f"{self.stitched_cols(col_names)}_weights")
else:
weight = hierarchy[cont_cols[0]]
if len(cat_cols) == 0:
index = tf.cast(
tf.fill(tf.shape(weight), self.offsets[name]),
dtype=tf.int64,
name=f"{name}_indices",
)
else:
offsets = np.cumprod([1] + [self.col_counts[col] for col in cat_cols[:-1]])
index = start + tf.math.add_n(
[
tf.constant(offset, dtype=tf.int64) * tf.cast(hierarchy[col], dtype=tf.int64)
for offset, col in zip(offsets, cat_cols)
]
)
return tf.gather(weights, index, name="embeds")
def _reshape_learned_weights_if_needed(self, learned_weights: tf.Tensor, n_features: int) -> tf.Tensor:
if len(learned_weights.shape) > 2:
flattened_shape = prod(learned_weights.shape[:-1])
return tf.reshape(learned_weights, (flattened_shape, n_features))
return learned_weights
def get_dfs(
self, dy_dweights: tf.Tensor | None = None, dy_dbias: tf.Tensor | None = None
) -> dict[str, pd.DataFrame]:
"""Get the learned weights for a HierarchicalEmbedding layer as a DataFrame"""
# NOTE: separated this function so we could more easily differentiate
output_tensors, output_indices, feature_names = self.get_tensors(dy_dweights=dy_dweights, dy_dbias=dy_dbias)
return {
key: pd.DataFrame(tensor, index=output_indices[key], columns=feature_names)
for key, tensor in output_tensors.items()
}
@property
def dfs(self) -> dict[str, pd.DataFrame]:
return self.get_dfs()
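# Hedged usage sketch (hypothetical encodings and shapes; assumes `build` is run before the first call,
# e.g. by the surrounding Module framework, and that default hyperparameters are available when None is passed):
#
# emb = HierchicalEmbedding(
#     shape=[8],
#     encodings={"brand": {"a": 0, "b": 1, "c": 2}, "price": "continuous"},
#     columns=["brand", ["brand", "price"]],
# )
# emb.build(input_shapes=None)
# params = emb({"brand": tf.constant([0, 2], dtype=tf.int64), "price": tf.constant([1.5, 0.7])})
# params.shape  # -> (2, 8): one parameter vector of length 8 per input row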