-
Notifications
You must be signed in to change notification settings - Fork 0
/
utils.py
235 lines (208 loc) · 8.85 KB
/
utils.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
import random
import numpy as np
import tensorflow as tf
import os
import pandas as pd
import math
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import LabelEncoder
AQS_IDs = ['32-003-0043',
'32-003-0071',
'32-003-0073',
'32-003-0075',
'32-003-0298',
'32-003-0540',
'32-003-0561',
'32-003-1019',
'32-003-1501']
MONTHS = ['', 'JAN', 'FEB', 'MAR', 'APR', 'MAY', 'JUN', 'JUL', 'AUG', 'SEP', 'OCT', 'NOV', 'DEC', ]
WIND_DIRS = ['N', 'NNE', 'NE', 'ENE', 'E', 'ESE', 'SE', 'SSE', 'S', 'SSW', 'SW', 'WSW', 'W', 'WNW', 'NW', 'NNW']
CORR_THRESHOLD = 0.4
def set_seed():
SEED = 123
random.seed(SEED)
np.random.seed(SEED)
tf.random.set_seed(SEED)
os.environ['PYTHONHASHSEED']=str(SEED)
os.environ['TF_DETERMINISTIC_OPS'] = '1'
def get_datetime_index(start='20220603', end='20220604', freq='H'):
full_range = pd.date_range(start=start, end=end, freq=freq)
full_range = pd.DataFrame(full_range, columns = ['DATE'])
full_range = full_range.set_index('DATE')
return full_range
def series_to_supervised(data, n_in=1, n_pm25=1, n_out=1, dropnan=True):
'''
:param data: data built for predicting PM2.5
:param n_in: window size
:param n_pm25: num of PM2.5s to predict
:param n_out: num of future time points to predict, use only 1 in WiDS 2022 experiments
:param dropnan:
:return: pandas.core.frame.DataFrame
'''
n_features = 1 if type(data) is list else data.shape[1]
df = pd.DataFrame(data)
cols, names = list(), list()
# input sequence (t-n, ... t-1)
for i in range(n_in, 0, -1):
cols.append(df.shift(i))
names += [('var%d(t-%d)' % (j+1, i)) for j in range(n_features)]
# forecast sequence (t, t+1, ... t+n)
for i in range(0, n_out):
cols.append(df.shift(-i))
if i == 0:
names += [('var%d(t)' % (j+1)) for j in range(n_features)]
else:
names += [('var%d(t+%d)' % (j+1, i)) for j in range(n_features)]
# put it all together
agg = pd.concat(cols, axis=1)
agg.columns = names
# drop rows with NaN values
if dropnan:
agg = agg.dropna(inplace=False)
# drop the non-target columns for time t, s.t. the last n_pm25 columns are the target columns
agg = agg.drop(agg.columns[[i for i in range(-n_features, -n_pm25)]], axis=1, inplace=False)
return agg
def split(values, n_rows=8760, n_pm25=1):
'''
:param values: numpy.ndarray, spuervised dataset
:param n_rows: num of row to be in training dataset
:param n_pm25: num of PM2.5 (target) columns
:return: train_X, train_y, test_X, test_y, 4 numpy.ndarray instances
'''
train = values[:n_rows, :]
test = values[n_rows:, :]
# split into input and outputs
train_X, train_y = train[:, :-n_pm25], train[:, -n_pm25:]
test_X, test_y = test[:, :-n_pm25], test[:, -n_pm25:]
# reshape input to be 3D [samples, timesteps, features]
train_X = train_X.reshape((train_X.shape[0], 1, train_X.shape[1]))
test_X = test_X.reshape((test_X.shape[0], 1, test_X.shape[1]))
return train_X, train_y, test_X, test_y
def calc_pearson_correlation(data, aqs_id=None):
df = None
site_num = ''
if bool(aqs_id):
_, _, site_num = aqs_id.split('-')
for key, col in data.items():
if site_num in key:
df = col if df is None else df.join(col, how='inner', on='DATE')
if df is not None:
return df.corr()
return None
def get_correlated_cols_one_pm25(corr, col='1501_PM25', threshold=0.5):
df = corr[[col]]
df = df[abs(df[col])>threshold]
return set(list(df.index))
def get_own_correlated_cols_one_pm25(corr, col='1501_PM25', threshold=0.5):
site_num, _ = col.split('_')
a = get_correlated_cols_one_pm25(corr, col=col, threshold=threshold)
return [e for e in a if site_num in e]
def get_correlated_cols_all_pm25(corr, threshold=0.5):
cols = set()
for col in corr.index:
if 'PM25' in col:
tmp = get_correlated_cols_one_pm25(corr, col, threshold=threshold)
# print(col, len(tmp))
cols |= tmp
return cols
def _add_wind_directions(full_data, data, site_num):
wind_dir = f'{site_num}_WindDirection'
if wind_dir in data:
wind_dir_data = data[wind_dir]
wind_dir_data['WindDirCat'] = \
wind_dir_data.apply(lambda x: None if pd.isna(x[wind_dir])
else WIND_DIRS[math.floor(x[wind_dir] % 360 / 22.5)], axis=1)
wind_dir_data = wind_dir_data.drop(columns=[wind_dir])
wind_dir_data = wind_dir_data.rename(columns={'WindDirCat': wind_dir})
full_data = full_data.join(wind_dir_data)
return full_data
def build_data(
data, corr, site_num=None, pm_only=False, current_site_only=False, threshold=0.4, with_wind=False, with_lockdown=False,
byear=2020, eyear=2021, freq='H', encoder_name='LabelEncoder', scalar_name='MinMax'):
# create datetime index as the init of the full data
full_data = get_datetime_index(start=f'{byear}0101', end=f'{eyear}1231',
freq=freq) # init the full data as datetime index
# get correlated columns and target columns
if site_num is None:
correlated_cols = get_correlated_cols_all_pm25(corr, threshold=threshold)
print(correlated_cols)
if pm_only:
correlated_cols = [col for col in correlated_cols if '_PM' in col]
print(correlated_cols)
target_cols = [col for col in correlated_cols if col.endswith('_PM25')]
else:
target_cols = [f'{site_num}_PM25']
correlated_cols = get_correlated_cols_one_pm25(corr, col=target_cols[0], threshold=threshold)
if current_site_only:
correlated_cols = [e for e in correlated_cols if site_num in e]
# get site_numbers
site_numbers = set([col.split('_')[0] for col in correlated_cols])
# add month
full_data['Month'] = full_data.index.month
full_data['Month'] = full_data.apply(lambda x: MONTHS[x['Month']], axis=1)
n_cat_cols = 1 # the Month column
if with_wind:
# add wind directions
for site_num in site_numbers:
n_cat_cols += 1
full_data = _add_wind_directions(full_data, data, site_num)
# add wind speed
for site_num in site_numbers:
wind = f'{site_num}_WindSpeed'
if wind in data:
full_data = full_data.join(data[wind])
if with_lockdown:
full_data = full_data.join(data['CovidDelta'])
# add other columns except the target columns
for col in (set(correlated_cols) - set(target_cols)):
full_data = full_data.join(data[col])
# add target columns
for col in target_cols:
full_data = full_data.join(data[col])
# encoding
if encoder_name == 'LabelEncoder':
encoder = LabelEncoder()
for col in full_data.columns[:n_cat_cols]:
full_data[col] = encoder.fit_transform(full_data[col])
full_data = full_data.astype('float32')
# fill in missing values
full_data = full_data.interpolate()
# scaling
scalar = None
if scalar_name == 'MinMax':
scalar = MinMaxScaler(feature_range=(0, 1))
full_data = scalar.fit_transform(full_data)
# return results
return full_data, scalar, len(target_cols) # numpy.ndarray, MinMaxScaler, num of outputs
def test():
from data_loader import load_lv_data
data = load_lv_data()
corr = calc_pearson_correlation(data)
full_data, scalar, n_pm25 = build_data(data, corr, site_num=None, current_site_only=True, threshold=0.4,
with_wind=False, with_lockdown=False)
print('=' * 100)
print(full_data, scalar, n_pm25)
print(full_data.shape)
full_data, scalar, n_pm25 = build_data(data, corr, site_num=None, pm_only=True, current_site_only=True,
threshold=0.4,
with_wind=False, with_lockdown=False)
print('=' * 100)
print(full_data, scalar, n_pm25)
print(full_data.shape)
full_data, scalar, n_pm25 = build_data(data, corr, site_num='1501', current_site_only=True, threshold=0.4,
with_wind=True, with_lockdown=True)
print('=' * 100)
print(full_data, scalar, n_pm25)
print(full_data.shape)
full_data, scalar, n_pm25 = build_data(data, corr, site_num='1501', current_site_only=False, threshold=0.4,
with_wind=True, with_lockdown=True)
print('=' * 100)
print(full_data, scalar, n_pm25)
print(full_data.shape)
full_data, scalar, n_pm25 = build_data(data, corr, site_num='1501', current_site_only=False, threshold=0.4,
with_wind=False, with_lockdown=True)
print('=' * 100)
print(full_data, scalar, n_pm25)
print(full_data.shape)
# if __name__ == "__main__":
# test()