Imputing the spiky price data for wholesaler, brands to be either clipped or ffill/bfill and logging them
Source code in wt_ml/dataset/region_hacks/us_hacks/pricing_hacks.py
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55 | @register_hack("dataset", "us", "pricing_json")
def impute_spiky_wlsr_price(wlsr_df: pd.DataFrame) -> pd.DataFrame:
"""Imputing the spiky price data for wholesaler, brands to be either clipped or ffill/bfill and logging them"""
wlsr_df["imputed_price_per_bbl"] = wlsr_df["price_per_bbl"].copy()
wlsr_price = wlsr_df["price_per_bbl"]
wlsr_sales = wlsr_df["sales"]
for brand, prod in wlsr_price.index.droplevel("week_date").drop_duplicates():
brand_prod_price = wlsr_price.loc[pd.IndexSlice[brand, :, prod]]
brand_prod_sales = wlsr_sales.loc[pd.IndexSlice[brand, :, prod]]
brand_prod_index = brand_prod_sales.index.unique("week_date")
price_index, stable_region, _ = get_price_index_per_stable_region(brand_prod_price.values)
price_index_series = pd.Series(price_index, index=brand_prod_index)
stable_region_series = pd.Series(stable_region, index=brand_prod_index)
median_price_index = price_index_series.median()
# Identifyng any discount below 50% or price hike above 130% as problematic
lw_thresh, up_thresh = (
median_price_index * MAX_DISCOUNT_THRESHOLD,
median_price_index * MAX_PRICE_HIKE_THRESHOLD,
)
problematic_weeks = price_index_series[~price_index_series.between(lw_thresh, up_thresh)].index
if len(problematic_weeks) > 0:
sales_diff = brand_prod_sales.diff().loc[problematic_weeks].to_frame(name="sales_diff")
price_diff = brand_prod_price.diff().loc[problematic_weeks].to_frame(name="price_diff")
merged_sales_price = sales_diff.join(price_diff)
# Inverse relation of price and sales is desirable: price increases and sales decreases.
merged_sales_price["sales_price_diff_mult"] = (
merged_sales_price["sales_diff"] * merged_sales_price["price_diff"]
)
# clipping first to ensure that the spiky price weeks which had desirable sales-price relation get fixed
weeks_to_clip = merged_sales_price[merged_sales_price.sales_price_diff_mult < 0].index
brand_prod_price.loc[weeks_to_clip] = brand_prod_price.loc[weeks_to_clip].clip(
(lw_thresh * stable_region_series.loc[weeks_to_clip]).values,
(up_thresh * stable_region_series.loc[weeks_to_clip]).values,
)
# ffill/bfill next to ensure that spiky price which had same direction of sales and price get fixed
# clipping first ensures that these ffill/bfill weeks are imputed properly incase they were adjacent weeks
weeks_to_ffill_bfill = merged_sales_price[merged_sales_price.sales_price_diff_mult >= 0].index
brand_prod_price.loc[weeks_to_ffill_bfill] = np.nan
brand_prod_price = brand_prod_price.ffill().bfill()
wlsr_df.loc[pd.IndexSlice[brand, :, prod], "imputed_price_per_bbl"] = brand_prod_price.values
return wlsr_df
|