-
Notifications
You must be signed in to change notification settings - Fork 0
/
smoothing.py
137 lines (123 loc) · 4.8 KB
/
smoothing.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
import pandas as pd
import matplotlib
matplotlib.use("macosx")
import matplotlib.pyplot as plt
from statsmodels.tsa.holtwinters import ExponentialSmoothing
from statsmodels.nonparametric.smoothers_lowess import lowess
from scipy.signal import find_peaks, peak_prominences, peak_widths, find_peaks_cwt, savgol_filter
import numpy as np
from whittaker_eilers import WhittakerSmoother
from pykalman import KalmanFilter
df = pd.read_csv('result_df.csv')
# def split_list(input_list, chunk_size):
# to_cut = len(input_list) % chunk_size
# input_list = input_list[:len(input_list)-to_cut]
# return [pd.Series(input_list[i:i + chunk_size], index=range(i, i + chunk_size))
# for i in range(0, len(input_list), chunk_size)]
#
# chunks = split_list(df['proportion'].values, 50)
#
# def get_chunk_extremums(chunks):
# return [(chunk.idxmax(), chunk.max()) for chunk in chunks]
#
# chunk_max = get_chunk_extremums(chunks)
# print(chunk_max)
# second_values = [t[1] for t in chunk_max]
#
# # Calculate median and maximum
# median_value = np.median(second_values)
# max_value = max(second_values)
#
# print("Median of all second values:", median_value)
# print("Maximum of all second values:", max_value)
#
# pairwise_distances = [abs(chunk_max[i][0] - chunk_max[i+1][0]) for i in range(len(chunk_max) - 1)]
# print(pairwise_distances)
# print(np.mean(pairwise_distances))
# # Create an empty array of zeros with a length of 900
# curve = np.zeros(len(df['proportion'].values))
#
# # Interpolate the given values into the curve
# for tpl in chunk_max:
# index = tpl[0]
# value = tpl[1]
# curve[index] = value
#
# # Perform linear interpolation to fill in the gaps
# indices = np.arange(len(df['proportion'].values))
# curve_interpolated = np.interp(indices, np.where(curve != 0)[0], curve[curve != 0])
#
# # Create pandas series with indexed values of the curve
# series = pd.Series(curve_interpolated, index=indices)
#
# print(series)
# df['smoothed'] = series
# df['smoothed'].plot()
# df['proportion'].plot()
#
#
# peaks, _ = find_peaks(df['smoothed'].values)
# print(peaks)
# plt.scatter(peaks, df.loc[df['index'].isin(peaks), 'proportion'], color='red', zorder=5)
#
# plt.show()
# SMOOTHING
whittaker_smoother = WhittakerSmoother(
lmbda=len(df), order=2, data_length=len(df)
)
smoothed_data = whittaker_smoother.smooth(df['proportion'])
df['whittaker_smoother'] = smoothed_data
print(df.describe())
df['whittaker_smoother'] = df['whittaker_smoother'].apply(lambda x: 0.0 if x < 0 else x)
df['lowess_smoothed'] = lowess(df['proportion'], range(len(df)), frac=0.06)[:, 1]
# peak_indices, bases = find_peaks(df['proportion'].values, width=1, height=0.0001, distance=20)
# print(peak_indices)
# print(bases)
df['savgol'] = savgol_filter(df['proportion'].values, window_length=40, polyorder=2)
# Define Kalman filter model
kf = KalmanFilter(transition_matrices=[1],
observation_matrices=[1],
initial_state_mean=df['proportion'].iloc[0],
initial_state_covariance=1,
observation_covariance=1,
transition_covariance=0.01)
# Fit model and make predictions
state_means, _ = kf.filter(df['proportion'])
state_means = state_means.flatten()
df['kalman'] = state_means
peaks, _ = find_peaks(df['whittaker_smoother'].values)
print(peaks)
prominences = peak_prominences(df['whittaker_smoother'].values, peaks)[0]
results_full = peak_widths(df['whittaker_smoother'].values, peaks, rel_height=1)[0]
avg_prominence = np.median(prominences)
print('PROMINENCES')
print(prominences)
print(avg_prominence)
print('WIDTHS')
print(results_full)
avg_width = np.percentile(results_full, 15)
print(avg_width)
peak_indices_smoothed, bases_smoothed = find_peaks(df['whittaker_smoother'].values,
#wlen=20,
width=avg_width,
# height=0.0001,
distance=20,
prominence=avg_prominence)
#prominence=0.0001)
print(peak_indices_smoothed)
print(bases_smoothed)
# df['savgol'] = savgol_filter(df['proportion'], window_length=10, polyorder=1, mode='nearest')
# peak_indices_savgol, bases_savgol = find_peaks(df['savgol'].values, width=1, height=0.0001,distance=20)
# print(peak_indices_savgol)
# print(bases_savgol)
#
# peak_indices_ = find_peaks_cwt(df['smoothed'].values, np.arange(1,200))
# print(peak_indices_)
df['proportion'].plot()
# df['smoothed'].plot()
df['whittaker_smoother'].plot()
plt.scatter(peak_indices_smoothed, df.loc[df['index'].isin(peak_indices_smoothed), 'proportion'], color='red', zorder=5)
# df['smoothed'].plot()
# df['savgol'].plot()
# df['proportion'].ewm(span=50).mean().plot()
plt.show()