VersionDiff

Source code in wt_ml/utils/diff_utils.py
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
class VersionDiff:
    """Load two delivered versions and diff their signal impacts.

    Typical use is ``load_versions()`` followed by
    ``diff_impacts_at_hierarchy_level(level)``. The loaded archives are kept
    on ``zip_a`` / ``zip_b`` as dictionaries; the methods here read their
    ``"signal_hierarchy"``, ``"impact_df"`` and ``"wibble_mappings"`` entries.
    """

    # Hierarchy entries excluded from the diff. Each tuple is the path of
    # display names from the hierarchy root down to the entry to drop.
    _EXCLUDED_SIGNAL_PATHS = (
        ("Residual",),
        ("Execution", "Baseline at Distribution", "Extra Weeks"),
        ("External", "Stockup Effect"),
    )

    def __init__(self, version_a: str, version_b: str, include_dbg: bool):
        """Store the two version names; nothing is downloaded or loaded yet.

        Args:
            version_a: Name of the first version (the minuend of the diff).
            version_b: Name of the second version (the subtrahend of the diff).
            include_dbg: Forwarded to ``delivery_utils.download_version``.
        """
        self.version_a = version_a
        self.version_b = version_b
        self.include_dbg = include_dbg
        # Populated by ``load_versions``; ``None`` means "not loaded yet".
        self.zip_a = None
        self.zip_b = None

    @staticmethod
    def _check_file_exists(file_path: Path | str) -> bool:
        """Return whether ``file_path`` is an existing regular file, logging the outcome."""
        file_path = Path(file_path)
        if file_path.is_file():
            logger.info(f"{file_path} already exists.")
            return True
        logger.info(f"{file_path} not found.")
        return False

    @staticmethod
    def _download_from_blob(version_name, include_dbg):
        """
        Downloads the versions using wt_core.delivery_utils.
        """
        return delivery_utils.download_version(version_name=version_name, include_dbg=include_dbg)

    def _load_version(self, version_name):
        """
        Downloads and loads a version as a dictionary. Download is skipped if already present on local.
        """
        version_full_path = delivery_utils.DEFAULT_DOWNLOAD_PATH / f"{version_name}.zip"
        if not self._check_file_exists(version_full_path):
            self._download_from_blob(version_name, self.include_dbg)

        return load_zip_as_dict(version_full_path)

    def load_versions(self):
        """Load both versions into ``zip_a`` / ``zip_b``, skipping any that is already loaded."""
        version_by_attr = {"zip_a": self.version_a, "zip_b": self.version_b}
        for attr_name, version_name in version_by_attr.items():
            if getattr(self, attr_name) is None:
                setattr(self, attr_name, self._load_version(version_name))
            else:
                logger.info(f"{version_name} is already loaded.")

    def get_all_hierarchies(self, hierarchy: SignalHierarchy, impact_df):
        """Build a mapping from every hierarchy display name to its impact frame.

        Leaves take their values from the ``"signal"`` column level of
        ``impact_df``; a parent's value is the sum (``DataFrame.add``) of its
        children's values.

        Args:
            hierarchy: Signal hierarchy to walk with ``populate_hierarchy``.
            impact_df: Frame whose columns carry a level named ``"signal"``.

        Returns:
            The union of the per-node ``{display_name: DataFrame}`` dictionaries.
        """
        signals_set = set(impact_df.columns.unique("signal"))

        def construct_leaf(
            display_name: str, node: LeafSignal
        ) -> tuple[pd.DataFrame, dict[str, pd.DataFrame]] | None:
            signal = node["signal"]
            if signal not in signals_set:
                logger.info(f"{signal} in hierarchy but not in impact_df signals, ignored.")
                return None
            values = impact_df.xs(signal, axis=1, level="signal")
            return values, {display_name: values}

        def construct_parent(
            display_name: str, node: ParentSignal, children: Sequence[tuple[pd.DataFrame, dict[str, pd.DataFrame]]]
        ):
            # ``construct_leaf`` yields None for signals missing from impact_df.
            # NOTE(review): populate_hierarchy may already drop those; filtering
            # here is a no-op in that case and avoids indexing into None otherwise.
            usable = [child for child in children if child is not None]
            if not usable:
                return None
            value = map_reduce(lambda prev, cur: prev.add(cur), lambda child: child[0], usable)
            combined_dict = map_reduce(operator.or_, lambda child: child[1], usable) | {display_name: value}
            return value, combined_dict

        pop_hier = populate_hierarchy(
            hierarchy=hierarchy, construct_leaf=construct_leaf, construct_parent=construct_parent
        )
        return map_reduce(operator.or_, lambda tup: tup[1], pop_hier)

    def get_all_depth_n(
        self, node: LeafSignal | ParentSignal, depth: int, parent_signal_name: str | list | None = None
    ):
        """Return the nodes located ``depth`` levels below ``node``.

        A node reached at depth 0 that has a truthy ``"signal"`` entry gets that
        entry overwritten *in place* with ``parent_signal_name`` (the key it was
        stored under in its parent's ``"children"``), so pass a copy when the
        hierarchy must stay intact.

        Args:
            node: Hierarchy node; children are read from ``node["children"]``.
            depth: Number of levels to descend. A negative value is logged as a
                warning and then handled like 0.
            parent_signal_name: Key under which ``node`` is stored in its parent.

        Returns:
            A lazy iterable of nodes when ``depth > 0``, else ``[node]``.
        """
        if depth < 0:
            logger.warning(f"Trying to get negative ({depth}) signals which doesn't make sense.")

        if depth > 0:
            return itertools.chain.from_iterable(
                self.get_all_depth_n(child, depth - 1, child_name)
                for child_name, child in node.get("children", {}).items()
            )

        if node.get("signal", False):
            node["signal"] = parent_signal_name
        return [node]

    def get_signals_at_depth(self, level: int):
        """Return ``(nodes_a, nodes_b)``: the hierarchy nodes at ``level`` for both versions.

        Each hierarchy is deep-copied and wrapped in a synthetic root first, so
        the in-place renaming done by ``get_all_depth_n`` never touches the
        loaded data.
        """
        per_version = []
        for loaded in (self.zip_a, self.zip_b):
            root = {"tooltip": "", "children": copy.deepcopy(loaded["signal_hierarchy"])}
            per_version.append(list(self.get_all_depth_n(root, level)))
        return (per_version[0], per_version[1])

    def filter_and_create_impacts_df(self, signal_dicts, final_impact_dict):
        """Concatenate the impact frames of the requested hierarchy nodes.

        For a node with children, the children's display names are looked up;
        for a node without children, its ``"signal"`` entry is looked up. Names
        absent from ``final_impact_dict`` are logged and skipped.

        Args:
            signal_dicts: Hierarchy nodes, e.g. from ``get_signals_at_depth``.
            final_impact_dict: Mapping from display name to impact frame.

        Returns:
            A frame whose columns are a two-level index named
            ``("parent_signal", "wibble")``.

        Raises:
            ValueError: If none of the requested names has an impact frame.
        """
        # Get list of signal names from a given hierarchy
        signal_list = []
        for signal_dict in signal_dicts:
            children = signal_dict.get("children", {})
            signal_list.extend(children.keys())
            # A node with neither children nor a signal contributes nothing.
            # The former `{}` placeholder was unhashable and broke the lookup below.
            if children == {} and "signal" in signal_dict:
                signal_list.append(signal_dict["signal"])

        filtered_impact_dfs = {}
        for signal in signal_list:
            if signal not in final_impact_dict:
                logger.info(f"{signal} not in impacts, skipped.")
                continue
            filtered_impact_dfs[signal] = final_impact_dict[signal]

        if not filtered_impact_dfs:
            raise ValueError("None of the requested signals has an impact frame; nothing to concatenate.")

        return pd.concat(
            list(filtered_impact_dfs.values()),
            axis=1,
            keys=list(filtered_impact_dfs),
            names=["parent_signal", "wibble"],
        )

    def expand_wibble_mapping(self, impacts_df, brand_map, wslr_map, region_map, state_map):
        """Replace the ``wibble`` column level by its brand/wholesaler/state/region.

        ``impacts_df`` is modified in place (its ``columns`` are reassigned) and
        also returned.

        Args:
            impacts_df: Frame with column levels ``"parent_signal"`` and ``"wibble"``.
            brand_map: Mapping from wibble to brand.
            wslr_map: Mapping from wibble to wholesaler.
            region_map: Mapping from wibble to region.
            state_map: Mapping from wibble to state.

        Returns:
            The same frame, with column levels
            ``("brand", "wholesaler", "state", "region", "parent_signal")``.
        """
        wibbles = impacts_df.columns.get_level_values("wibble")
        parent_signals = impacts_df.columns.get_level_values("parent_signal")
        impacts_df.columns = pd.MultiIndex.from_tuples(
            [
                (brand_map[wibble], wslr_map[wibble], state_map[wibble], region_map[wibble], signal)
                for wibble, signal in zip(wibbles, parent_signals)
            ],
            names=["brand", "wholesaler", "state", "region", "parent_signal"],
        )
        return impacts_df

    @staticmethod
    def _drop_from_hierarchy(hierarchy, path):
        """Remove the entry at ``path`` from ``hierarchy`` if every step of the path exists."""
        *parent_names, entry_name = path
        level = hierarchy
        for name in parent_names:
            parent = level.get(name)
            children = parent.get("children") if parent else None
            if children is None:
                return
            level = children
        level.pop(entry_name, None)

    def remove_signals_from_hierarchy(self):
        """Drop the excluded signals from both loaded hierarchies, in place.

        Removal is best-effort: an excluded entry (or any of its ancestors) that
        is absent from a hierarchy is silently ignored, which also makes the
        call safe to repeat.
        """
        for loaded in (self.zip_a, self.zip_b):
            hierarchy = loaded["signal_hierarchy"]
            for path in self._EXCLUDED_SIGNAL_PATHS:
                self._drop_from_hierarchy(hierarchy, path)

    def _impacts_at_nodes(self, loaded, signal_nodes):
        """Build the expanded impact frame of one loaded version for the given nodes."""
        impacts_by_name = self.get_all_hierarchies(loaded["signal_hierarchy"], loaded["impact_df"])
        impacts_df = self.filter_and_create_impacts_df(signal_nodes, impacts_by_name)
        mappings = loaded["wibble_mappings"]
        return self.expand_wibble_mapping(
            impacts_df,
            brand_map=mappings["brand"],
            wslr_map=mappings["wholesaler"],
            region_map=mappings["region"],
            state_map=mappings["state"],
        )

    def diff_impacts_at_hierarchy_level(self, level=1):
        """Diff the two versions' impacts at one hierarchy level.

        Both versions must already be loaded (see ``load_versions``). The
        excluded signals are removed from the loaded hierarchies as a side
        effect.

        Args:
            level: Hierarchy depth at which the signals are compared.

        Returns:
            A frame with columns ``"yoy_delta"`` (version A minus version B of
            ``diff_parent_signal_over_consecutive_1yr``) and ``"abs"`` (its
            absolute value), sorted by ``"abs"`` in descending order.
        """
        # Remove Residuals, Extra Weeks and Stockup Effect from consideration
        self.remove_signals_from_hierarchy()

        # Get list of signals at given depth level
        sig_list_a, sig_list_b = self.get_signals_at_depth(level)

        impacts_df_a = self._impacts_at_nodes(self.zip_a, sig_list_a)
        impacts_df_b = self._impacts_at_nodes(self.zip_b, sig_list_b)

        diff = diff_parent_signal_over_consecutive_1yr(impacts_df_a) - diff_parent_signal_over_consecutive_1yr(
            impacts_df_b
        )
        # Name the columns through `keys` rather than renaming positional 0/1
        # labels, which only exist when the diff Series happens to be unnamed.
        # NOTE(review): assumes `diff` is a Series, as the original rename did.
        return pd.concat([diff, diff.abs()], axis=1, keys=["yoy_delta", "abs"]).sort_values(
            by="abs", ascending=False
        )