#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Mon Jan 15 15:23:00 2024
@author: rbouman
"""
import os
import shutil
from sklearn.model_selection import ParameterGrid
from hashlib import sha256
import numpy as np
import pandas as pd
from src.io_functions import load_batch
from src.preprocess import preprocess_per_batch_and_write
raw_data_folder = "raw_data"
processed_data_folder = "data"
dataset = "route_data" #alternatively: OS_data
intermediates_folder = os.path.join(raw_data_folder, dataset+"_preprocessed")
table_folder = os.path.join("Tables", dataset)
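# Event-length bins (in samples) used to stratify the train/val/test split; assuming
# 5-minute samples these would roughly correspond to up to 2 hours, up to a day,
# up to two weeks, and longer.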
all_cutoffs = [(0, 24), (24, 288), (288, 4032), (4032, np.inf)]
#%%
dry_run = False #if True, only compute the split (and table); do not copy any data files
make_table = True #if True, write a LaTeX table of the event length distribution per split
#%%
def split_series(series, counts=None):
    """Greedily divide a series of counts over three parts.

    Items are handled from largest to smallest and each one is assigned to the part
    with the lowest running total, so the three totals end up as balanced as possible.
    Pre-existing totals can be passed in via `counts`, which is updated in place."""
    if counts is None:
        counts = np.zeros((3,))
    # Sort the series in descending order
    sorted_series = series.sort_values(ascending=False)
    # Initialize three empty dictionaries to store the parts
    parts = [{}, {}, {}]
    # Iterate through the sorted series and allocate each item to the part with the lowest running total
    for i, count in enumerate(sorted_series):
        min_counts_part = np.argmin(counts)
        parts[min_counts_part][sorted_series.index[i]] = count
        counts[min_counts_part] += count
    # Create a Series for each part and return their indices (the assigned station names)
    series_part_1 = pd.Series(parts[0], name='Count')
    series_part_2 = pd.Series(parts[1], name='Count')
    series_part_3 = pd.Series(parts[2], name='Count')
    return series_part_1.index, series_part_2.index, series_part_3.index
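# Example with made-up numbers: splitting pd.Series({"a": 10, "b": 8, "c": 5, "d": 2})
# with fresh counts puts "a" in part 1, "b" in part 2, and "c" and "d" in part 3,
# leaving running totals of 10, 8 and 7.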
# %%
if dataset == "route_data":
station_exclude_list = [25, 106, 130, 190] #106 has already been deleted beforehand due to being an invalid file
elif dataset == "OS_data":
station_exclude_list = [] #90?
else:
station_exclude_list = []
station_exclude_list_filenames = [str(station)+".csv" for station in station_exclude_list]
#%% calculate event lengths
X_dfs, y_dfs, file_names = load_batch(raw_data_folder, dataset)
#filter exclude list:
filtered_data = [(X_df, y_df, file_name) for X_df, y_df, file_name in zip(X_dfs, y_dfs, file_names) if file_name not in station_exclude_list_filenames]
filtered_X_dfs, filtered_y_dfs, filtered_file_names = zip(*filtered_data)
filtered_X_dfs, filtered_y_dfs, filtered_file_names = list(filtered_X_dfs), list(filtered_y_dfs), list(filtered_file_names)
if dataset == "OS_data":
all_preprocessing_hyperparameters = {'subsequent_nr': [5], 'lin_fit_quantiles': [(10, 90)], "label_transform_dict": [{0:0, 1:1, 4:5, 5:5}], "remove_uncertain": [False]}
elif dataset == "route_data":
all_preprocessing_hyperparameters = {'subsequent_nr': [5], 'lin_fit_quantiles': [(10, 90)], "label_transform_dict": [{0:0, 1:1, 4:5, 5:5}], "remove_uncertain": [True], "rescale_S_to_kW":[True]}
preprocessing_hyperparameters = list(ParameterGrid(all_preprocessing_hyperparameters))[0]
preprocessing_hyperparameter_string = str(preprocessing_hyperparameters)
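# Hash the hyperparameter settings; presumably preprocess_per_batch_and_write uses this
# hash to locate/cache the matching preprocessed intermediates (preprocessing_overwrite=False below).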
preprocessing_hash = sha256(preprocessing_hyperparameter_string.encode("utf-8")).hexdigest()
_, _, _, event_lengths = preprocess_per_batch_and_write(
    filtered_X_dfs, filtered_y_dfs, intermediates_folder, dataset,
    preprocessing_overwrite=False, write_csv_intermediates=False,
    file_names=filtered_file_names, all_cutoffs=all_cutoffs,
    hyperparameters=preprocessing_hyperparameters,
    hyperparameter_hash=preprocessing_hash, remove_missing=True)
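# event_lengths appears to hold, per station, the length (in samples) of the event each
# sample belongs to, with 0 for samples outside any event; the counts below are therefore
# divided by the length to recover the number of events.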
#%%
normalized_length_count_per_cutoff_dict = {}
length_count_per_cutoff_dict = {}
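# For every station, count its events per cutoff bin. The "normalized" count divides the
# number of event samples by the event length, i.e. it counts events rather than samples.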
for event_length, file_name in zip(event_lengths, filtered_file_names):
    unique_lengths, length_counts = np.unique(event_length, return_counts=True)
    # skip the first entry (length 0, i.e. samples that are not part of any event)
    unique_lengths = unique_lengths[1:]
    length_counts = length_counts[1:]
    # dividing the number of samples by the event length gives the number of events of that length
    normalized_length_counts = []
    for unique_length, length_count in zip(unique_lengths, length_counts):
        normalized_length_counts.append(length_count/unique_length)
    normalized_length_count_per_cutoff = {str(cutoff):0 for cutoff in all_cutoffs}
    length_count_per_cutoff = {str(cutoff):0 for cutoff in all_cutoffs}
    for unique_length, normalized_length_count, length_count in zip(unique_lengths, normalized_length_counts, length_counts):
        for cutoff in all_cutoffs:
            if unique_length >= cutoff[0] and unique_length < cutoff[1]:
                normalized_length_count_per_cutoff[str(cutoff)] += normalized_length_count
                length_count_per_cutoff[str(cutoff)] += length_count
    normalized_length_count_per_cutoff_dict[file_name] = dict(normalized_length_count_per_cutoff)
    length_count_per_cutoff_dict[file_name] = dict(length_count_per_cutoff)
normalized_lengths_df = pd.DataFrame(normalized_length_count_per_cutoff_dict).T
lengths_df = pd.DataFrame(length_count_per_cutoff_dict).T
total_normalized_length_count_per_cutoff = normalized_lengths_df.sum()
total_length_count_per_cutoff = lengths_df.sum()
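# Totals per cutoff bin over all stations; the normalized totals determine the order in
# which the categories are divided below (scarcest first).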
#%% Divide stations over train/val/test, starting with the category with the lowest total normalized length count:
all_train_stations, all_val_stations, all_test_stations, all_considered_stations = [], [], [], []
previous_categories = []
remaining_normalized_lengths_df = normalized_lengths_df
#%% divide over each category, starting with the smallest:
sorted_categories = total_normalized_length_count_per_cutoff.sort_values().index
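# Process categories from scarcest to most common: stations containing events of a rare
# length are placed first so those rare events are balanced across the three splits, and
# the running counts let later, more common categories compensate for earlier assignments.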
for smallest_category in sorted_categories:
stations_in_category_with_events = remaining_normalized_lengths_df[smallest_category].iloc[remaining_normalized_lengths_df[smallest_category].values.nonzero()]
    #get the normalized counts already assigned to train/val/test for this category:
    counts = np.zeros((3,))
    for i, stations in enumerate((all_train_stations, all_val_stations, all_test_stations)):
        counts[i] = (normalized_lengths_df.loc[stations].sum())[smallest_category]
    print(f"Current normalized counts (train, val, test) for {smallest_category}: {counts}")
    #unpack in the same train/val/test order in which `counts` was filled above:
    (train_stations, val_stations, test_stations) = split_series(stations_in_category_with_events, counts)
all_train_stations += list(train_stations)
all_val_stations += list(val_stations)
all_test_stations += list(test_stations)
all_considered_stations = all_train_stations + all_val_stations + all_test_stations
remaining_stations = [file_name for file_name in filtered_file_names if file_name not in all_considered_stations]
remaining_normalized_lengths_df = normalized_lengths_df.loc[remaining_stations]
previous_categories += [smallest_category]
#%% Divide remaining stations (with no events)
#the remaining stations have no events in any category; give each a weight of 1 so they are divided purely by station count
stations_without_events_index = remaining_normalized_lengths_df.index
stations_without_events = pd.Series(np.ones(len(stations_without_events_index)), index=stations_without_events_index)
#seed the counts with the number of stations already assigned to each split, so the overall split sizes stay balanced
counts = np.array([len(all_train_stations), len(all_val_stations), len(all_test_stations)], dtype=float)
(train_stations, val_stations, test_stations) = split_series(stations_without_events, counts)
all_train_stations += list(train_stations)
all_val_stations += list(val_stations)
all_test_stations += list(test_stations)
all_considered_stations = all_train_stations + all_val_stations + all_test_stations
#%% sanity check to see how well division was done:
print("Events in Train, Val, Test:")
for stations in (all_train_stations, all_val_stations, all_test_stations):
print(normalized_lengths_df.loc[stations].sum())
#%% Make table of event length distribution per split:
event_length_distribution_table = pd.DataFrame(dtype=np.int32)
if make_table:
for stations, split_name in zip((all_train_stations, all_val_stations, all_test_stations), ("Train", "Validation", "Test")):
event_length_distribution_table[split_name] = normalized_lengths_df.loc[stations].sum()
event_length_distribution_table = event_length_distribution_table.transpose()
    cutoff_replacement_dict = {"(0, 24)":"1-24", "(24, 288)":"25-288", "(288, 4032)":"289-4032", "(4032, inf)":"4033 and longer"}
event_length_distribution_table.rename(columns=cutoff_replacement_dict, inplace=True)
os.makedirs(table_folder, exist_ok=True)
    event_length_distribution_table.applymap("{0:.0f}".format).to_latex(
        buf=os.path.join(table_folder, "event_length_distribution_table.tex"),
        escape=False, multirow=True)
#%% Save data to folder based on calculated split:
if not dry_run:
stations_per_split = {"Train":all_train_stations, "Validation":all_val_stations, "Test": all_test_stations}
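    # Copy each station's X and y csv from raw_data/<dataset> into data/<dataset>/<split>.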
for split, station_names in stations_per_split.items():
for station_name in station_names:
original_X_file = os.path.join(raw_data_folder, dataset, "X", station_name)
new_X_file = os.path.join(processed_data_folder, dataset, split, "X", station_name)
original_y_file = os.path.join(raw_data_folder, dataset, "y", station_name)
new_y_file = os.path.join(processed_data_folder, dataset, split, "y", station_name)
os.makedirs(os.path.join(processed_data_folder, dataset, split, "X"), exist_ok=True)
os.makedirs(os.path.join(processed_data_folder, dataset, split, "y"), exist_ok=True)
shutil.copy(original_X_file, new_X_file)
shutil.copy(original_y_file, new_y_file)