Distribution hacks - WatchTower Documentation

`cleanup_distribution_outliers(wholesaler_data)`

Interpolate values where a distribution signal lies more than 3 standard deviations above or below its rolling mean, or falls on a known outlier date. Interpolation is performed only where the distribution value is non-zero.

Source code in wt_ml/dataset/region_hacks/us_hacks/distribution_hacks.py

@register_hack("dataset", "us", "distribution_json")
def cleanup_distribution_outliers(wholesaler_data: tuple[pd.DataFrame, str]) -> tuple[pd.DataFrame, str]:
    """
    Interpolate outlying values of the simple and multiple distribution signals.

    A value is replaced by linear interpolation when it is strictly positive and either
    lies more than 3 rolling standard deviations above or below the 12-row rolling mean,
    or its ``week_date`` is listed in ``DISTRIBUTION_OUTLIER_DATES``.

    Args:
        wholesaler_data: ``(frame, wholesaler code)``. The frame's index carries the levels
            ``brand_code``, ``week_date`` and ``product_code``; its ``simple_distribution``
            and ``multiple_distribution`` columns are overwritten in place.

    Returns:
        The same frame object (with both columns rewritten) paired with the unchanged code.
    """
    window = 12
    n_stds = 3
    wholesaler_df = wholesaler_data[0]
    for dist_series in ("simple_distribution", "multiple_distribution"):
        # One row per week, one column per (brand, product) pair.
        dist_df = wholesaler_df[dist_series].unstack(["brand_code", "product_code"])
        rolling = dist_df.rolling(window)
        # bfill: rows before the first complete window reuse the first available statistic.
        rolling_means = rolling.mean().bfill()
        # Compute the band half-width once; it is shared by the upper and lower bounds.
        band = rolling.std().bfill() * n_stds
        is_extreme_value = (dist_df > rolling_means + band) | (dist_df < rolling_means - band)
        is_non_zero = dist_df > 0
        # Column vector so the per-week flag broadcasts across every (brand, product) column.
        is_outlier = np.expand_dims(dist_df.index.get_level_values("week_date").isin(DISTRIBUTION_OUTLIER_DATES), 1)
        dist_df = dist_df.mask((is_extreme_value | is_outlier) & is_non_zero).interpolate(method="linear")
        wholesaler_df[dist_series] = (
            dist_df.stack(["brand_code", "product_code"])
            .reorder_levels(["brand_code", "week_date", "product_code"])
            .sort_index()
        )
    return (wholesaler_df, wholesaler_data[1])

`clip_distribution_values(wholesaler_data)`

Simple Distribution should not be less than 0 or greater than 1 and Multiple Distribution should not be less than 1.

Source code in wt_ml/dataset/region_hacks/us_hacks/distribution_hacks.py

@register_hack("dataset", "us", "distribution_json")
def clip_distribution_values(wholesaler_data: tuple[pd.DataFrame, str]) -> tuple[pd.DataFrame, str]:
    """
    Rescale the distribution signals into their valid ranges.

    Simple distribution is shifted so its minimum is not below 0 and scaled so its maximum is
    not above 1. Multiple distribution has its non-positive values replaced and is shifted so
    that no value is below 1.

    Args:
        wholesaler_data: ``(frame, wholesaler code)``. The frame's ``simple_distribution`` and
            ``multiple_distribution`` columns are overwritten in place.

    Returns:
        The same ``wholesaler_data`` tuple that was passed in.
    """
    wholesaler_df = wholesaler_data[0]
    simple_distribution = wholesaler_df["simple_distribution"]
    multiple_distribution = wholesaler_df["multiple_distribution"]

    # Shift only when the minimum is negative (an all-NaN column yields a shift of 0).
    simple_shift = min(0, simple_distribution.min())
    # Scale by the maximum *after* the shift; scaling by the unshifted maximum would let
    # values exceed 1 whenever the minimum is negative.
    simple_scale = max(1, simple_distribution.max() - simple_shift)
    wholesaler_df["simple_distribution"] = (simple_distribution - simple_shift) / simple_scale

    # Smallest strictly positive value, capped at 1 (also 1 when there is no positive value).
    multiple_min_pos = min(1, multiple_distribution[multiple_distribution > 0].min())
    # Non-positive entries take the smallest positive value, then everything is shifted so
    # that this value lands exactly on 1. NaN entries are left as NaN.
    wholesaler_df["multiple_distribution"] = multiple_distribution.mask(
        multiple_distribution <= 0, multiple_min_pos
    ) + (1 - multiple_min_pos)
    return wholesaler_data

`fill_distribution_endpoints(wholesaler_data)`

Forward fills simple distribution wherever value exceeds 1. Replaces beginning and end values with their nearest dates for all distribution signals.

Source code in wt_ml/dataset/region_hacks/us_hacks/distribution_hacks.py

@register_hack("dataset", "us", "distribution_json")
def fill_distribution_endpoints(wholesaler_data: tuple[pd.DataFrame, str]) -> tuple[pd.DataFrame, str]:
    """
    Overwrite the first and last weeks of each distribution signal with the nearest retained week.

    For both ``simple_distribution`` and ``multiple_distribution``, the first
    ``START_WEEKS_TO_BFILL`` weeks take the value of the week that immediately follows them,
    and the last ``END_WEEKS_TO_FFILL`` weeks take the value of the week that immediately
    precedes them. The columns of the input frame are replaced in place.

    Returns:
        The same ``wholesaler_data`` tuple that was passed in.
    """
    frame = wholesaler_data[0]
    for signal in ("simple_distribution", "multiple_distribution"):
        # One row per (brand, product), one column per week in chronological order.
        by_week = frame[signal].unstack("week_date").sort_index(axis=1)

        # Single-column slice holding the first week that is kept as-is.
        first_kept = by_week.iloc[:, START_WEEKS_TO_BFILL : START_WEEKS_TO_BFILL + 1]
        by_week.iloc[:, :START_WEEKS_TO_BFILL] = first_kept

        # Single-column slice holding the last week that is kept as-is.
        # NOTE(review): assumes END_WEEKS_TO_FFILL > 0; with 0 the `-0:` slice would select
        # every column - confirm against the constant's definition.
        last_kept = by_week.iloc[:, -END_WEEKS_TO_FFILL - 1 : -END_WEEKS_TO_FFILL]
        by_week.iloc[:, -END_WEEKS_TO_FFILL:] = last_kept

        restacked = by_week.stack("week_date")
        frame[signal] = restacked.reorder_levels(["brand_code", "week_date", "product_code"]).sort_index()
    return wholesaler_data

`impute_distribution_signals_outliers(wholesaler_data)`

Impute missing or outlier values in the distribution signals for the specific brands and dates listed in `WIBBLES_TO_PATCH`, using one of mean imputation, forward fill, backward fill, rolling mean or linear interpolation.

Source code in wt_ml/dataset/region_hacks/us_hacks/distribution_hacks.py

@register_hack("dataset", "us", "distribution_json")
def impute_distribution_signals_outliers(wholesaler_data: tuple[pd.DataFrame, str]) -> tuple[pd.DataFrame, str]:
    """
    Blank out and re-impute distribution signals for the brands and dates configured in WIBBLES_TO_PATCH.

    Each patch entry for the wholesaler names one or more brands, a set of dates, the signal
    columns to reset and the imputation method handed to ``impute_data``. Patches are applied
    one after another on a copy of the input frame.

    Returns:
        The input tuple unchanged when the wholesaler has no entry in WIBBLES_TO_PATCH,
        otherwise ``(patched copy, wholesaler code)``.
    """
    wslr_code = wholesaler_data[1]
    patches = WIBBLES_TO_PATCH.get(wslr_code)
    if patches is None:
        return wholesaler_data

    index_order = ["brand_code", "week_date", "product_code"]
    series_levels = ["brand_code", "product_code"]
    patched = wholesaler_data[0].copy()
    for patch in patches:
        # A single brand may be given bare; normalise it to a list.
        brand_spec = patch["brand"]
        brand_list = brand_spec if isinstance(brand_spec, list) else [brand_spec]
        week_dates = patch["dates"]
        signals = patch["signal_names"]
        method = patch["method"]

        # Reset the targeted cells so the imputation treats them as missing.
        in_brands = patched.index.get_level_values("brand_code").isin(brand_list)
        in_dates = patched.index.get_level_values("week_date").isin(week_dates)
        patched.loc[in_brands & in_dates, signals] = np.nan

        # Impute on a week-indexed layout, then restore the original index layout.
        wide = patched.unstack(series_levels)
        imputed = impute_data(wide, method)
        patched = imputed.stack(series_levels).reorder_levels(index_order).sort_index()
    return (patched, wslr_code)

`impute_zero_distribution_signals(wholesaler_data)`

Impute the simple and multiple distribution signals for weeks where sales are greater than 10K but distribution signal values are missing.

Source code in wt_ml/dataset/region_hacks/us_hacks/distribution_hacks.py

@register_hack("dataset", "us", "distribution_json")
def impute_zero_distribution_signals(wholesaler_data: tuple[pd.DataFrame, str]) -> tuple[pd.DataFrame, str]:
    """
    Impute the simple and multiple distribution signals for rows that have sales but zero simple distribution.

    Among rows with ``sales > 0``, every row whose ``simple_distribution`` is 0 has both
    distribution signals reset and then filled from the same (brand_code, product_code)
    series: forward fill first, backward fill for any leading gap. Rows without positive
    sales are left untouched. The input frame is not modified; a copy is returned.

    NOTE(review): an earlier description of this hack mentions a sales threshold of 10K,
    but the filter applied here is ``sales > 0`` - confirm which is intended.

    Returns:
        ``(imputed copy of the frame, wholesaler code)``.
    """
    group_levels = ["brand_code", "product_code"]
    wslr_code = wholesaler_data[1]
    distribution_data = wholesaler_data[0].copy()
    # Sorting by brand then week keeps each (brand, product) series in week order for the fills.
    positive_sales = distribution_data.loc[distribution_data["sales"] > 0].sort_index(
        level=["brand_code", "week_date"]
    )
    zero_simple_index = positive_sales.loc[positive_sales["simple_distribution"] == 0].index
    positive_sales.loc[zero_simple_index, [SD, MD]] = np.nan

    forward_filled = positive_sales[[SD, MD]].groupby(group_levels).ffill()
    # GroupBy.ffill returns a plain frame, so the backward fill must be grouped again;
    # an ungrouped bfill would pull values from the next row of a different product or brand.
    # NOTE(review): a series with no non-zero observation at all therefore stays NaN.
    positive_sales[[SD, MD]] = forward_filled.groupby(group_levels).bfill()

    distribution_data.loc[positive_sales.index] = positive_sales

    return (distribution_data, wslr_code)