Ext

Source code in wt_ml/dataset/df_wrapper.py
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
@pd.api.extensions.register_dataframe_accessor("ext")
class Ext:
    """DataFrame accessor (``df.ext``) with N-d shape/value helpers and dataset metadata.

    The accessor stores its metadata (``dataset_name``, ``freq``, ``cache_len``)
    in ``DataFrame.attrs``.
    """

    # Largest median gap (in days) between consecutive dates that is still treated as weekly.
    # A weekly gap is 7 days; 10 leaves slack in case some dates are missing.
    _WEEKLY_MAX_GAP_DAYS = 10

    def __init__(self, pandas_obj):
        self._obj = pandas_obj
        # -1 never equals a real length, so the first `freq` access always recomputes.
        self._ensure_attrs()["cache_len"] = -1

    def _ensure_attrs(self):
        """Return the wrapped object's ``attrs`` dict, creating it when missing."""
        if not hasattr(self._obj, "attrs"):
            # NOTE: this is a pandas experimental feature. Making sure it exists always in subsequent versions
            self._obj.attrs = {}
        return self._obj.attrs

    def _has_sim_level(self):
        """Tell whether one of the column levels is named "sim"."""
        return "sim" in self._obj.columns.names

    @property
    def shape_nd(self):
        """Shape with one entry per index level followed by one entry per column level.

        Index entries come from ``levshape`` (the size of each level) when the index
        has levels, otherwise from the plain index shape. Column entries are the
        number of distinct values in each column level, or the plain column shape
        when the columns have no levels.
        """
        index = self._obj.index
        idx_shape = index.levshape if hasattr(index, "levshape") else index.shape
        cols = self._obj.columns
        if hasattr(cols, "levshape"):
            col_shape = [cols.get_level_values(i).nunique() for i in range(len(cols.levshape))]
        else:
            col_shape = cols.shape
        return (*idx_shape, *col_shape)

    @property
    def shape_nd_1(self):
        """
        A useful property to expand the dimensions for shape_nd which can be used in model placeholders.
        If the columns have a level named "sim", it returns shape_nd unchanged (a tuple).
        Else, it returns shape_nd as a list with a trailing "sim" dimension of size 1 appended.
        This way it makes the placeholder shapes consistent.
        """
        shape_nd = self.shape_nd
        if not self._has_sim_level():
            # expands to include "sim" dimension
            shape_nd = list(shape_nd) + [1]
        return shape_nd

    @property
    def shape_1(self):
        """The 2-d ``DataFrame.shape`` as a list, with a trailing 1 when there is no "sim" column level."""
        full_shape = list(self._obj.shape)
        if not self._has_sim_level():
            full_shape.append(1)
        return full_shape

    @property
    def values_nd(self):
        """``DataFrame.values`` reshaped to ``shape_nd``."""
        return self._obj.values.reshape(self.shape_nd)

    @property
    def values_nd_1(self):
        """
        A useful property to expand the dimensions for values_nd which can be used in model placeholders.
        If the columns have a level named "sim", it returns values_nd unchanged.
        Else, it appends a trailing "sim" axis of size 1 to values_nd. This way it makes the placeholder
        shapes consistent.
        """
        if self._has_sim_level():
            return self.values_nd
        return np.expand_dims(self.values_nd, -1)

    @property
    def values_1(self):
        """
        Adds a sim axis to `dataframe.values`

        The result has three axes: the first entry of `shape_nd_1`, a flattened middle axis,
        and the last entry of `shape_nd_1`.
        """
        full_shape = self.shape_nd_1
        return self._obj.values.reshape(full_shape[0], -1, full_shape[-1])

    @property
    def name(self):
        """The name of the dataset"""
        # `.get` yields None both when `attrs` is missing and when no name was stored.
        return getattr(self._obj, "attrs", {}).get("dataset_name")

    @name.setter
    def name(self, value):
        """
        We can save the dataset name into the `attrs` attribute of pandas.
        This is a debug feature which allows us to know which dataset was loaded in the variable.
        NOTE: `attrs` is an experimental feature, meaning it could be removed in subsequent versions of pandas.
        The `attrs` attribute, ensures that the `attrs` dictionary are passed through subsequent pandas chained
        functions. Find more info here,
        https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.attrs.html

        Args:
            value (str): The name of the dataset.
        """
        self._ensure_attrs()["dataset_name"] = value

    def _date_values(self):
        """Locate the dates used to infer the frequency.

        Lookup order: the first ``DatetimeIndex`` level of a multi-level index, the
        index itself when it is a ``DatetimeIndex``, the "date" column, then the
        "timedesc" column.

        Raises:
            AttributeError: If none of those sources is present.
        """
        df = self._obj
        if df.index.nlevels > 1:
            for lev in df.index.levels:
                if isinstance(lev, pd.DatetimeIndex):
                    return lev
        if isinstance(df.index, pd.DatetimeIndex):
            return df.index
        for column in ("date", "timedesc"):
            if column in df.columns:
                return df[column]
        raise AttributeError("DataFrame doesn't have date column.")

    @property
    def freq(self) -> str:
        """Returns the frequency of the dataframe based on date columns/index.

        The result is cached in ``attrs`` together with the dataframe length and is
        recomputed whenever the length no longer matches the cached one.

        Raises:
            AttributeError: Will be raised if date or timedesc is not found in column/index

        Returns:
            str: The frequency. 'W-SAT': weekly frequency. 'M': monthly frequency.
        """
        df = self._obj
        attrs = self._ensure_attrs()
        if "freq" in attrs and len(df) == attrs.get("cache_len", -1):
            return attrs["freq"]

        # Sort and de-duplicate every source (including a plain DatetimeIndex) so that
        # repeated or unordered dates cannot drag the median gap to zero or below.
        dates = self._date_values()
        unique_dates = pd.Series(dates.sort_values().unique(), name="date")
        median_gap_days = unique_dates.diff().median().days
        freq = "W-SAT" if median_gap_days <= self._WEEKLY_MAX_GAP_DAYS else "M"

        # Record the cache only after a successful computation; otherwise a failed lookup
        # would validate a stale "freq" that was already present in attrs.
        attrs["freq"] = freq
        attrs["cache_len"] = len(df)
        return freq

freq: str property

Returns the frequency of the dataframe based on date columns/index.

Raises:

Type Description
AttributeError

Will be raised if date or timedesc is not found in column/index

Returns:

Name Type Description
str str

The frequency. 'W-SAT': weekly frequency. 'M': monthly frequency.

name property writable

The name of the dataset

shape_nd_1 property

A useful property that expands shape_nd for use in model placeholders. If the columns have a level named "sim", it returns shape_nd unchanged. Otherwise, it appends a trailing "sim" dimension of size 1 to shape_nd. This keeps the placeholder shapes consistent.

values_1 property

Adds a sim axis to dataframe.values

values_nd_1 property

A useful property that expands values_nd for use in model placeholders. If the columns have a level named "sim", it returns values_nd unchanged. Otherwise, it appends a trailing "sim" axis of size 1 to values_nd. This keeps the placeholder shapes consistent.