get_dates_info(historical_data)

Returns the latest date (the maximum date in the historical data) and the start date for extraction (1 year prior to the latest date).

Source code in wt_ml/dataset/economics/fred_extraction_utils.py
47
48
49
50
51
52
53
54
def get_dates_info(historical_data: pd.DataFrame) -> tuple[str, str]:
    """Return the latest date in the data and the start date for extraction.

    Args:
        historical_data: Frame whose index has a level (or is itself) named
            ``"date"`` holding datetime-like values.

    Returns:
        A ``(latest_date, start_date)`` tuple of ``YYYY-MM-DD`` strings, where
        ``latest_date`` is the maximum date found in the index and
        ``start_date`` is 1 year prior to the latest date.
    """
    latest_date = historical_data.index.unique("date").max()
    # Calendar-year offset (not a fixed 365 days), matching the documented
    # "1 year prior" window; the previous code stepped back two years.
    start_date = latest_date - pd.DateOffset(years=1)
    return latest_date.strftime("%Y-%m-%d"), start_date.strftime("%Y-%m-%d")

impute_missing_data(df)

Impute missing values (recorded as 0) for each state and signal with the mean of the non-missing values for the same date and signal.

Source code in wt_ml/dataset/economics/fred_extraction_utils.py
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
def impute_missing_data(df: pd.DataFrame) -> pd.DataFrame:
    """Impute missing values with the mean for the same date and signal.

    Missing observations are encoded as ``0``. Each zero is replaced by the
    mean of the non-zero values that share its ``date`` and ``signals``
    labels. When no non-zero value exists for that date and signal, the
    entry becomes NaN.

    Args:
        df: Wide frame whose columns axis is named ``"signals"`` and whose
            index has a ``"date"`` level plus one more level (presumably
            ``"state"`` -- confirm against callers).

    Returns:
        A new wide frame with one column per signal and the two index levels
        swapped relative to the merged result. ``df`` is not modified.
    """
    # Long format: one row per (index..., signal) with a single "value" column.
    data = pd.DataFrame(df.stack()).rename(columns={0: "value"})

    # Average over the non-missing observations only, so zeros do not drag
    # the mean down.
    data_available = data.loc[data["value"] != 0]
    mean = pd.DataFrame(data_available.groupby(["date", "signals"])["value"].mean()).rename(
        columns={"value": "national_average"}
    )

    # Index-on-index merge; pandas aligns on the shared level names
    # ("date", "signals") and broadcasts the average over the remaining level.
    data_imputed = data.merge(mean, how="left", left_index=True, right_index=True)
    is_missing = data_imputed["value"] == 0
    data_imputed["value"] = data_imputed["value"].mask(is_missing, data_imputed["national_average"])
    data_imputed = data_imputed.drop(columns="national_average")

    # Back to wide format, then drop the unnamed outer column level ("value")
    # left behind by unstack.
    data_imputed = data_imputed.unstack("signals").swaplevel()
    data_imputed.columns = data_imputed.columns.droplevel(None)

    return data_imputed

trim_historical_data(historical_data, start_date)

Remove the last 1 year of data from the historical data

Source code in wt_ml/dataset/economics/fred_extraction_utils.py
57
58
59
60
def trim_historical_data(historical_data: pd.DataFrame, start_date: str) -> pd.DataFrame:
    """Keep only the rows dated on or before ``start_date``.

    Args:
        historical_data: Frame whose index has a ``"date"`` level.
        start_date: Inclusive cut-off date; rows after it are dropped.

    Returns:
        The subset of ``historical_data`` whose ``"date"`` is ``<= start_date``.
    """
    date_values = historical_data.index.get_level_values("date")
    keep_mask = date_values <= start_date
    return historical_data.loc[keep_mask]