import numpy as np
import pandas as pd


@pd.api.extensions.register_dataframe_accessor("ext")
class Ext:
    def __init__(self, pandas_obj):
        self._obj = pandas_obj
        if not hasattr(self._obj, "attrs"):
            # NOTE: `attrs` is an experimental pandas feature; make sure it exists even if a
            # future pandas version drops it.
            self._obj.attrs = {}
        self._obj.attrs["cache_len"] = -1

    @property
    def shape_nd(self):
        """The n-dimensional shape of the frame: one axis per index level, followed by one axis
        per column level (or a single axis for flat columns)."""
        if hasattr(self._obj.index, "levshape"):
            idx_shape = self._obj.index.levshape
        else:
            idx_shape = self._obj.index.shape
        if hasattr(self._obj.columns, "levshape"):
            cols = self._obj.columns
            col_shape = [cols.get_level_values(i).nunique() for i in range(len(cols.levshape))]
        else:
            col_shape = self._obj.columns.shape
        return (*idx_shape, *col_shape)
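
    # Illustrative sketch of `shape_nd` (not part of the accessor; the frame below is
    # hypothetical). With a MultiIndex of 3 dates x 4 stores and two plain columns, the flat
    # `df.shape` is (12, 2) while `shape_nd` keeps one axis per level:
    #
    #   idx = pd.MultiIndex.from_product(
    #       [pd.date_range("2021-01-02", periods=3, freq="W-SAT"), list("ABCD")],
    #       names=["date", "store"],
    #   )
    #   df = pd.DataFrame(np.zeros((12, 2)), index=idx, columns=["x", "y"])
    #   df.ext.shape_nd  # -> (3, 4, 2)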

    @property
    def shape_nd_1(self):
        """
        Expands `shape_nd` so it can be used for model placeholders, which expect every frame to
        carry a "sim" dimension.

        If the columns already have a "sim" level, `shape_nd` is returned unchanged; otherwise a
        trailing "sim" dimension of length 1 is appended. This keeps placeholder shapes consistent.
        """
        shape_nd = self.shape_nd
        if "sim" not in self._obj.columns.names:
            # expand to include a "sim" dimension of length 1
            shape_nd = list(shape_nd) + [1]
        return shape_nd
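
    # Illustrative sketch of `shape_nd_1` (hypothetical frame, continuing the `shape_nd` example
    # above): the columns ["x", "y"] have no "sim" level, so a trailing dimension of length 1 is
    # appended; a frame whose columns include a "sim" level would get `shape_nd` back unchanged.
    #
    #   df.ext.shape_nd_1  # -> [3, 4, 2, 1]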

    @property
    def shape_1(self):
        """Like `DataFrame.shape`, but with a trailing "sim" axis of length 1 appended when the
        columns have no "sim" level."""
        full_shape = list(self._obj.shape)
        if "sim" not in self._obj.columns.names:
            full_shape.append(1)
        return full_shape

    @property
    def values_nd(self):
        """`DataFrame.values` reshaped to `shape_nd`."""
        return self._obj.values.reshape(self.shape_nd)

    @property
    def values_nd_1(self):
        """
        Expands `values_nd` so it can be used for model placeholders, which expect every frame to
        carry a "sim" dimension.

        If the columns already have a "sim" level, `values_nd` is returned unchanged; otherwise a
        trailing "sim" axis of length 1 is added. This keeps placeholder shapes consistent.
        """
        if "sim" in self._obj.columns.names:
            return self.values_nd
        return np.expand_dims(self.values_nd, -1)
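
    # Illustrative sketch of `values_nd` / `values_nd_1` (hypothetical frame from the `shape_nd`
    # example above):
    #
    #   df.ext.values_nd.shape    # -> (3, 4, 2)
    #   df.ext.values_nd_1.shape  # -> (3, 4, 2, 1), the "sim" axis added by expand_dims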

    @property
    def values_1(self):
        """
        Adds a "sim" axis to `DataFrame.values`: the result is reshaped to 3-D using the first
        and last entries of `shape_nd_1`, flattening everything in between.
        """
        full_shape = self.shape_nd_1
        return self._obj.values.reshape(full_shape[0], -1, full_shape[-1])
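
    # Illustrative sketch of `values_1` (hypothetical frame from the `shape_nd` example above):
    # the 12 x 2 values are reshaped to (first index level, -1, trailing "sim" axis):
    #
    #   df.ext.values_1.shape  # -> (3, 8, 1)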

    @property
    def name(self):
        """The name of the dataset, or None if it has not been set."""
        if hasattr(self._obj, "attrs") and "dataset_name" in self._obj.attrs:
            return self._obj.attrs["dataset_name"]
        return None  # default: no name

    @name.setter
    def name(self, value):
        """
        Saves the dataset name into the DataFrame's `attrs` dictionary.

        This is a debugging aid that lets us see which dataset was loaded into a variable.
        NOTE: `attrs` is an experimental feature, meaning it could be removed in subsequent
        versions of pandas. pandas passes the `attrs` dictionary through subsequent chained
        DataFrame operations. Find more info here:
        https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.attrs.html

        Args:
            value (str): The name of the dataset.
        """
        if not hasattr(self._obj, "attrs"):
            # NOTE: `attrs` is an experimental pandas feature; make sure it exists even if a
            # future pandas version drops it.
            self._obj.attrs = {}
        self._obj.attrs["dataset_name"] = value

    @property
    def freq(self) -> str:
        """Returns the frequency of the DataFrame, inferred from its date column/index.

        Raises:
            AttributeError: Raised if neither a "date" nor a "timedesc" column/index is found.

        Returns:
            str: The frequency: 'W-SAT' for weekly data, 'M' for monthly data.
        """
        df = self._obj
        if hasattr(self._obj, "attrs") and "freq" in df.attrs and len(df) == df.attrs.get("cache_len", -1):
            return self._obj.attrs["freq"]
        if not hasattr(self._obj, "attrs"):
            # NOTE: `attrs` is an experimental pandas feature; make sure it exists even if a
            # future pandas version drops it.
            self._obj.attrs = {}
        self._obj.attrs["cache_len"] = len(df)
        found_date = False
        if df.index.nlevels > 1:
            for lev in df.index.levels:
                if type(lev) is pd.DatetimeIndex:
                    date = pd.Series(lev.sort_values().unique(), name="date")
                    found_date = True
                    break
        if not found_date:
            if type(df.index) is pd.DatetimeIndex:
                date = df.index.to_series(name="date")
            elif "date" in df.columns:
                date = df["date"]
                date = pd.Series(date.sort_values().unique(), name="date")
            elif "timedesc" in df.columns:
                date = df["timedesc"]
                date = pd.Series(date.sort_values().unique(), name="date")
            else:
                raise AttributeError("DataFrame doesn't have a date column.")
        # the median gap should be 7 days for weekly data; we compare against 10 just in case
        # there are missing dates
        freq = {True: "W-SAT", False: "M"}[date.diff().median().days <= 10]
        self._obj.attrs["freq"] = freq
        return freq
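

# Minimal usage sketch, assuming this module has been imported so the "ext" accessor is
# registered. The frame and the dataset name are hypothetical; weekly Saturday dates are used
# so that `freq` resolves to "W-SAT".
if __name__ == "__main__":
    idx = pd.MultiIndex.from_product(
        [pd.date_range("2021-01-02", periods=8, freq="W-SAT"), list("ABCD")],
        names=["date", "store"],
    )
    df = pd.DataFrame(np.arange(32.0).reshape(32, 1), index=idx, columns=["sales"])

    df.ext.name = "toy_weekly_sales"
    print(df.ext.name)            # -> toy_weekly_sales
    print(df.ext.shape_nd)        # -> (8, 4, 1)
    print(df.ext.shape_nd_1)      # -> [8, 4, 1, 1]
    print(df.ext.values_1.shape)  # -> (8, 4, 1)
    print(df.ext.freq)            # -> W-SAT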