check_other_cpnl_mapping(dataloader)

Validates that the mapping of All Other Media / All Other Sponsorship vehicles to their parent vehicles is consistent across parquet, JSON and HDF5.

Source code in wt_ml/dataset/data_validator/checks/check_other_cpnl_mapping.py
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
def _with_all_other_prefix(mapping: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of ``mapping`` whose ``parent_vehicle`` values start with "All Other".

    The prefix is added row by row, so values that already carry it are left untouched even
    when other rows of the same frame lack it (prefixing the whole column would turn them into
    "All Other All Other ..."). Missing or non-string values are left as they are.
    """
    parents = mapping["parent_vehicle"]
    # na=True treats missing/non-string entries as "nothing to fix" so they are not
    # rewritten into strings such as "All Other nan".
    has_prefix = parents.str.startswith("All Other", na=True).astype(bool)
    prefixed = parents.map(lambda x: f"All Other {x}")
    return mapping.assign(parent_vehicle=parents.where(has_prefix, prefixed))


def check_other_cpnl_mapping(dataloader: DataLoader) -> DataStatus:
    """Validate the other_cpnl vehicle -> parent vehicle mapping across parquet, json and hdf5.

    Two checks are performed:

    1. The ``(parent_vehicle, vehicle)`` table read from the parquet file must equal the one
       read from ``other_cpnl_mapping.json``, after the "All Other" prefix has been normalised
       on ``parent_vehicle`` in both.
    2. Every key of ``dataloader.encodings["vehicle"]`` that starts with ``"other_cpnl"`` must
       be present among the vehicles of the json mapping.

    Args:
        dataloader: Provides ``parquet_dir``, ``json_dir`` and ``encodings``.

    Returns:
        A ``DataStatus`` that is PASS only if both checks pass; otherwise FAIL with one message
        line per failed check. A missing parquet file is reported as FAIL instead of raising
        ``IndexError``.
    """
    columns = ["parent_vehicle", "vehicle"]
    statuses: list[StatusType] = []
    messages: list[str] = []

    parquet_dir = dataloader.parquet_dir / "other_cpnl_mapping"
    # Sort so the same file is picked on every run when several parquet files are present.
    parquet_files = sorted(parquet_dir.glob("*.parquet"))
    if not parquet_files:
        return DataStatus(
            status=StatusType.FAIL,
            message=f"No parquet file found for other_cpnl map in {parquet_dir}",
        )

    other_cpnl_parquet = _with_all_other_prefix(
        pd.read_parquet(parquet_files[0]).rename(columns={"other_cpnl": "vehicle"})
    )
    # Project onto the same two columns, in the same order, as the json frame and drop any
    # stored index: DataFrame.equals is sensitive to column order and index values.
    other_cpnl_parquet = other_cpnl_parquet.reindex(columns=columns).reset_index(drop=True)

    other_cpnl_json = _with_all_other_prefix(
        pd.read_json(
            dataloader.json_dir / "other_cpnl_mapping" / "other_cpnl_mapping.json", orient="split"
        ).reset_index(names="vehicle")[columns]
    )

    other_cpnl_vehicles_from_json = set(other_cpnl_json["vehicle"])
    other_cpnl_vehicles_from_hdf5 = {
        k for k in dataloader.encodings["vehicle"].keys() if k.startswith("other_cpnl")
    }

    # Check 1: parquet vs json.
    if other_cpnl_parquet.equals(other_cpnl_json):
        statuses.append(StatusType.PASS)
    else:
        mismatch_data = (
            pd.MultiIndex.from_frame(other_cpnl_json)
            .symmetric_difference(pd.MultiIndex.from_frame(other_cpnl_parquet))
            .values
        )
        statuses.append(StatusType.FAIL)
        if len(mismatch_data):
            messages.append(
                f"Following data mismatch between parquet and json in other_cpnl map : {mismatch_data}"
            )
        else:
            # Same set of pairs on both sides: the frames differ only in a way the symmetric
            # difference cannot show, so say that instead of printing an empty mismatch.
            messages.append(
                "other_cpnl map in parquet and json holds the same (parent_vehicle, vehicle) pairs "
                "but differs in row order, duplicated rows or dtypes"
            )

    # Check 2: hdf5 encodings vs json.
    if other_cpnl_vehicles_from_hdf5.issubset(other_cpnl_vehicles_from_json):
        statuses.append(StatusType.PASS)
    else:
        # Sorted so the reported list is stable between runs.
        mismatch = sorted(other_cpnl_vehicles_from_hdf5.difference(other_cpnl_vehicles_from_json))
        statuses.append(StatusType.FAIL)
        messages.append(f"Following extra vehicles are in hdf5 and not in json for other_cpnl map: {mismatch}")

    return DataStatus(
        status=StatusType.PASS if all(status == StatusType.PASS for status in statuses) else StatusType.FAIL,
        message="\n".join(messages),
    )