cleanup_distribution_outliers(wholesaler_data)

Interpolate values where distribution signals are more than 3 standard deviations away from the 12-week rolling mean, or fall on a week listed in DISTRIBUTION_OUTLIER_DATES. Interpolation is performed only if distribution is non-zero.

Source code in wt_ml/dataset/region_hacks/us_hacks/distribution_hacks.py
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
@register_hack("dataset", "us", "distribution_json")
def cleanup_distribution_outliers(wholesaler_data: tuple[pd.DataFrame, str]) -> tuple[pd.DataFrame, str]:
    """
    Interpolate outlying values in the simple and multiple distribution signals.

    A value is replaced when it is non-zero (strictly positive) and either
      * lies more than 3 rolling standard deviations above or below the 12-week rolling mean, or
      * belongs to a week listed in DISTRIBUTION_OUTLIER_DATES.
    Replaced values are masked and then linearly interpolated along the week axis.

    Args:
        wholesaler_data: ``(dataframe, wholesaler_code)``. The dataframe is updated in place.

    Returns:
        The same dataframe object together with the unchanged wholesaler code.
    """
    window = 12
    n_stds = 3
    wholesaler_df = wholesaler_data[0]
    for dist_series in ("simple_distribution", "multiple_distribution"):
        # One column per (brand, product); rows are weeks.
        dist_df = wholesaler_df[dist_series].unstack(["brand_code", "product_code"])

        # The first `window - 1` rows of a rolling statistic are NaN; back-fill them from the first
        # complete window so the early weeks are still compared against a band.
        rolling = dist_df.rolling(window)
        rolling_means = rolling.mean().bfill()
        # Computed once and reused for both the upper and the lower bound.
        band_half_width = rolling.std().bfill() * n_stds

        is_extreme_value = (dist_df > rolling_means + band_half_width) | (dist_df < rolling_means - band_half_width)
        is_non_zero = dist_df > 0
        # Column vector (n_weeks, 1) so the per-week flag applies to every brand/product column.
        is_outlier = np.expand_dims(dist_df.index.get_level_values("week_date").isin(DISTRIBUTION_OUTLIER_DATES), 1)

        dist_df = dist_df.mask((is_extreme_value | is_outlier) & is_non_zero).interpolate(method="linear")
        wholesaler_df[dist_series] = (
            dist_df.stack(["brand_code", "product_code"])
            .reorder_levels(["brand_code", "week_date", "product_code"])
            .sort_index()
        )
    return (wholesaler_df, wholesaler_data[1])

clip_distribution_values(wholesaler_data)

Simple Distribution should not be less than 0 or greater than 1 and Multiple Distribution should not be less than 1.

Source code in wt_ml/dataset/region_hacks/us_hacks/distribution_hacks.py
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
@register_hack("dataset", "us", "distribution_json")
def clip_distribution_values(wholesaler_data: tuple[pd.DataFrame, str]) -> tuple[pd.DataFrame, str]:
    """
    Bring the distribution signals into their valid ranges.

    Simple distribution must lie in [0, 1]: if the minimum is negative the series is shifted up so the
    minimum becomes 0, and if the (shifted) maximum exceeds 1 the series is scaled down so the maximum
    becomes 1. Series already inside [0, 1] are left unchanged.

    Multiple distribution must be at least 1: non-positive values are raised to the smallest positive
    value (capped at 1), then the whole series is shifted up so that this value maps to 1.

    Args:
        wholesaler_data: ``(dataframe, wholesaler_code)``. The dataframe is updated in place.

    Returns:
        The same ``wholesaler_data`` tuple.
    """
    wholesaler_df = wholesaler_data[0]
    simple_distribution = wholesaler_df["simple_distribution"]
    multiple_distribution = wholesaler_df["multiple_distribution"]

    # `min(0, x)` / `max(1, x)` return the constant when x is NaN, so an all-NaN series is a no-op.
    simple_shift = min(0, simple_distribution.min())
    # The maximum must be measured after the shift; dividing by the unshifted maximum lets values
    # exceed 1 whenever the series contains negatives.
    simple_scale = max(1, simple_distribution.max() - simple_shift)
    wholesaler_df["simple_distribution"] = (simple_distribution - simple_shift) / simple_scale

    multiple_min_pos = min(1, multiple_distribution[multiple_distribution > 0].min())
    # `mask` builds a new Series instead of writing into the column pulled from the frame.
    floored_multiple = multiple_distribution.mask(multiple_distribution <= 0, multiple_min_pos)
    wholesaler_df["multiple_distribution"] = floored_multiple + (1 - multiple_min_pos)
    return wholesaler_data

fill_distribution_endpoints(wholesaler_data)

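Replaces beginning and end values with their nearest dates for all distribution signals: the first START_WEEKS_TO_BFILL weeks take the value of the first retained week, and the last END_WEEKS_TO_FFILL weeks take the value of the last retained week. Note: the code shown below does not forward-fill simple distribution values that exceed 1.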

Source code in wt_ml/dataset/region_hacks/us_hacks/distribution_hacks.py
79
80
81
82
83
84
85
86
87
88
89
90
91
92
@register_hack("dataset", "us", "distribution_json")
def fill_distribution_endpoints(wholesaler_data: tuple[pd.DataFrame, str]) -> tuple[pd.DataFrame, str]:
    """
    Overwrite the leading and trailing weeks of every distribution signal with the nearest kept week.

    For both simple and multiple distribution, the first START_WEEKS_TO_BFILL weeks receive the value
    of the week immediately after them, and the last END_WEEKS_TO_FFILL weeks receive the value of the
    week immediately before them. The dataframe inside ``wholesaler_data`` is updated in place.
    """
    frame = wholesaler_data[0]
    n_head = START_WEEKS_TO_BFILL
    n_tail = END_WEEKS_TO_FFILL
    for signal in ("simple_distribution", "multiple_distribution"):
        # Rows are (brand, product); columns are weeks in chronological order.
        by_week = frame[signal].unstack("week_date").sort_index(axis=1)

        # Single-column slices: the first week after the head and the last week before the tail.
        first_kept_week = by_week.iloc[:, n_head : n_head + 1]
        by_week.iloc[:, :n_head] = first_kept_week
        last_kept_week = by_week.iloc[:, -n_tail - 1 : -n_tail]
        by_week.iloc[:, -n_tail:] = last_kept_week

        restacked = by_week.stack("week_date")
        restacked = restacked.reorder_levels(["brand_code", "week_date", "product_code"])
        frame[signal] = restacked.sort_index()
    return wholesaler_data

impute_distribution_signals_outliers(wholesaler_data)

Impute missing/outlier values in distribution signals for specific brands and dates as mentioned in WIBBLES_TO_PATCH using either mean imputation, forward fill, backward fill, rolling mean or linear interpolation.

Source code in wt_ml/dataset/region_hacks/us_hacks/distribution_hacks.py
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
@register_hack("dataset", "us", "distribution_json")
def impute_distribution_signals_outliers(wholesaler_data: tuple[pd.DataFrame, str]) -> tuple[pd.DataFrame, str]:
    """
    Blank out and re-impute the distribution signals listed in WIBBLES_TO_PATCH for this wholesaler.

    Each patch entry names one or more brands, a set of dates, the signals to patch and an imputation
    method. The matching cells are set to NaN and the frame is then passed through ``impute_data``
    with that method (documented as mean imputation, forward fill, backward fill, rolling mean or
    linear interpolation -- presumably selected by ``method``; confirm against ``impute_data``).

    Wholesalers without an entry in WIBBLES_TO_PATCH are returned untouched; otherwise a patched copy
    is returned and the input dataframe is not modified.
    """
    wslr_code = wholesaler_data[1]
    patches = WIBBLES_TO_PATCH.get(wslr_code)
    if patches is None:
        return wholesaler_data

    patched = wholesaler_data[0].copy()
    for patch in patches:
        # "brand" may be a single code or a list of codes.
        brand_spec = patch["brand"]
        brands = brand_spec if isinstance(brand_spec, list) else [brand_spec]
        dates = patch["dates"]
        signal_names = patch["signal_names"]
        method = patch["method"]

        in_brands = patched.index.get_level_values("brand_code").isin(brands)
        in_dates = patched.index.get_level_values("week_date").isin(dates)
        patched.loc[in_brands & in_dates, signal_names] = np.nan

        # Impute on the wide layout (weeks as rows), then restore the long layout.
        wide = patched.unstack(["brand_code", "product_code"])
        imputed = impute_data(wide, method)
        restacked = imputed.stack(["brand_code", "product_code"])
        patched = restacked.reorder_levels(["brand_code", "week_date", "product_code"]).sort_index()
    return (patched, wslr_code)

impute_zero_distribution_signals(wholesaler_data)

Impute the simple and multiple distribution signals for weeks where sales are positive but the simple distribution is zero. (The code shown below tests sales > 0, not a 10K threshold.)

Source code in wt_ml/dataset/region_hacks/us_hacks/distribution_hacks.py
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
@register_hack("dataset", "us", "distribution_json")
def impute_zero_distribution_signals(wholesaler_data: tuple[pd.DataFrame, str]) -> tuple[pd.DataFrame, str]:
    """
    Impute the simple and multiple distribution signals for weeks that have positive sales but a
    simple distribution of zero.

    Within the rows with ``sales > 0``, both signals are blanked wherever simple distribution is 0 and
    then filled per (brand_code, product_code): forward fill first, backward fill for leading gaps.
    Cells that still have no value because their own brand/product has no valid observation keep
    their original value rather than borrowing one from a different brand/product.

    NOTE(review): earlier documentation mentioned a 10K sales threshold; the code tests ``sales > 0``.

    Args:
        wholesaler_data: ``(dataframe, wholesaler_code)``. The input dataframe is not modified.

    Returns:
        ``(imputed_copy, wholesaler_code)``.
    """
    wslr_code = wholesaler_data[1]
    distribution_data = wholesaler_data[0].copy()
    distribution_data_positive_sales = distribution_data.loc[distribution_data["sales"] > 0].sort_index(
        level=["brand_code", "week_date"]
    )
    simple_distribution_zero_index = distribution_data_positive_sales.loc[
        distribution_data_positive_sales["simple_distribution"] == 0
    ].index

    # Kept so cells that cannot be imputed from their own group fall back to what they had before.
    original_signals = distribution_data_positive_sales[[SD, MD]].copy()
    distribution_data_positive_sales.loc[simple_distribution_zero_index, [SD, MD]] = np.nan

    # `groupby(...).ffill()` returns a plain DataFrame, so a chained `.bfill()` would run across
    # group boundaries; regroup before back-filling to keep every fill inside one brand/product.
    group_keys = ["brand_code", "product_code"]
    filled_signals = distribution_data_positive_sales[[SD, MD]].groupby(group_keys).ffill()
    filled_signals = filled_signals.groupby(group_keys).bfill()
    distribution_data_positive_sales[[SD, MD]] = filled_signals.fillna(original_signals)

    distribution_data.loc[distribution_data_positive_sales.index] = distribution_data_positive_sales

    return (distribution_data, wslr_code)