Validates that sponsorship leagues and their parent mappings are consistent across parquet, JSON and HDF5
Source code in wt_ml/dataset/data_validator/checks/check_sponsorship_mapping.py
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
def check_sponsorship_mapping(dataloader: DataLoader) -> DataStatus:
    """Validate that the sponsorship mapping agrees across parquet, json and hdf5.

    Two checks are performed:

    1. The parquet sponsorship mapping must equal the json sponsorship mapping
       (``league`` is renamed to ``vehicle`` in both before comparing).
    2. Every ``league_``-prefixed key of ``dataloader.encodings["vehicle"]``
       (the hdf5 side) must be present among the json mapping's vehicles.

    Args:
        dataloader: Supplies ``parquet_dir``, ``json_dir`` and ``encodings``.

    Returns:
        A ``DataStatus`` whose status is ``PASS`` only when every check passes;
        ``message`` contains one line per failed check.
    """
    messages: list[str] = []

    parquet_dir = dataloader.parquet_dir / "sponsorship_mapping"
    # Sort so the file that gets read is deterministic; glob order depends on
    # the filesystem.
    parquet_files = sorted(parquet_dir.glob("*.parquet"))
    if not parquet_files:
        # A missing parquet file is a validation failure, not a reason to crash
        # with a bare IndexError.
        return DataStatus(
            status=StatusType.FAIL,
            message=f"No parquet file found for sponsorship map in {parquet_dir}",
        )
    sponsorship_parquet = pd.read_parquet(parquet_files[0]).rename(columns={"league": "vehicle"})

    sponsorship_json = (
        pd.read_json(dataloader.json_dir / "sponsorship_mapping" / "sponsorship_mapping.json", orient="split")
        .reset_index(names="team")
        .rename(columns={"league": "vehicle"})[["parent_vehicle", "vehicle", "team"]]
    )
    sponsorship_vehicles_from_json = set(sponsorship_json.vehicle)
    sponsorship_vehicles_from_hdf5 = {
        k for k in dataloader.encodings["vehicle"].keys() if k.startswith("league_")
    }

    # Check 1: parquet and json must hold the same mapping.
    if not sponsorship_parquet.equals(sponsorship_json):
        # Rows present in exactly one of the two sources.
        mismatch_data = (
            pd.MultiIndex.from_frame(sponsorship_json)
            .symmetric_difference(pd.MultiIndex.from_frame(sponsorship_parquet))
            .values
        )
        messages.append(f"Following data mismatch between parquet and json in sponsorship map : {mismatch_data}")

    # Check 2: hdf5 must not contain sponsorship vehicles unknown to json.
    if not sponsorship_vehicles_from_hdf5.issubset(sponsorship_vehicles_from_json):
        # Sorted so the reported list is stable between runs.
        mismatch = sorted(sponsorship_vehicles_from_hdf5 - sponsorship_vehicles_from_json)
        messages.append(f"Following extra vehicles are in hdf5 and not in json for sponsorship map: {mismatch}")

    # A message is appended exactly when a check fails, so no messages means PASS.
    return DataStatus(
        status=StatusType.FAIL if messages else StatusType.PASS,
        message="\n".join(messages),
    )
|