-
Notifications
You must be signed in to change notification settings - Fork 18
/
Time-Series-Analysis-Tutorial.csl
268 lines (231 loc) · 9.42 KB
/
Time-Series-Analysis-Tutorial.csl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
//
// Time Series Analysis operator & functions
//
#connect cluster('demo11.westus').database('ML')
// make-series operator
//
// The main operator used to build a set of time series from tabular data
//
// Demo table contains 2 weeks from Web service telemetry views
demo_make_series1 | take 10
demo_make_series1 | count
// Building time series of service traffic in 1h resolution per operating system
//
let min_t = toscalar(demo_make_series1 | summarize min(TimeStamp));
let max_t = toscalar(demo_make_series1 | summarize max(TimeStamp));
demo_make_series1
| make-series num=count() default=0 on TimeStamp from min_t to max_t step 1h by OsVer
| render timechart
// Series functions
// Demo series showing trend change (typical pattern when monitoring of application failures, memory consumption, IoT sensor reading etc.)
//
demo_series1
| render linechart
// series_stats()
//
// Calculates simple statistics
//
demo_series1
| project (ymin, id1, ymax, id2, avg, stdev, var) = series_stats(y)
// series_fit_line()
//
// Linear regression, fitting the best line to a series
//
demo_series1
| extend series_fit_line(y)
| render linechart with(ycolumns=y, series_fit_line_y_line_fit)
// series_fit_2lines()
//
// 2 segments linear regression - optimal split of the series to 2 segments, fitting best line to each
//
demo_series1
| extend series_fit_2lines(y), series_fit_line(y)
| render linechart with(ycolumns=y, series_fit_2lines_y_line_fit, series_fit_line_y_line_fit)
// 2 segments linear regression - jump example
//
demo_series2
| extend series_fit_2lines(y), series_fit_line(y)
| render linechart | render linechart with(ycolumns=y, series_fit_2lines_y_line_fit, series_fit_line_y_line_fit)
// series_moving_avg_sf()
//
// Defining a user function to calculate moving average using native series_fir() function that calculate FIR (Finite Impulse Response)low pass filter on a series
//
let series_moving_avg_sf = (series:dynamic, size:int, center:bool=false)
{
series_fir(series, repeat(1, size), true, center)
}
;
demo_series1
| extend ma_y = series_moving_avg_sf(y, 8)
| render linechart
// Automatic seasonality detection & validation
// Demo series contains typical events traffic in 1 month, manifesting weekly & daily periods (2h bins)
demo_series3
| render timechart
// series_periods_detect()
//
// Automatic detection of top periods. Return 2 arrays: one with the periods (in number of bins units) and the other and one with the respective scores from 0 (no period) to 1 (pure period with no noise)
// In this query we detect top 2 periods whose duration is between 0 and 8 days (i.e. 8d/2h=192 bins)
//
demo_series3
| project (periods, scores) = series_periods_detect(num, 0., 8d/2h, 2)
| mvexpand periods, scores
| extend days=2h*todouble(periods)/1d
// series_periods_validate()
//
// Test existence of expected periods
// In this query we test for weekly, daily & 8 hours periods. The last one doesn't exist in the demo series
//
demo_series3
| project (periods, scores) = series_periods_validate(num, 8h/2h, 1d/2h, 7d/2h)
| mvexpand periods, scores
| extend days=2h*todouble(periods)/1d
// Using moving average filter to remove daily seasonality
//
let series_moving_avg_sf = (series:dynamic, size:int, center:bool=false)
{
series_fir(series, repeat(1, size), true, center)
}
;
demo_series3
| extend ma_num = series_moving_avg_sf(num, 1d/2h, true)
| render timechart
// Verify daily seasonality was removed
//
demo_series3
| extend ma_num = series_moving_avg_sf(num, 1d/2h)
| extend periods = series_periods_detect(num, 0., 8d/2h, 2)
| extend periods_ma = series_periods_detect(ma_num, 0., 8d/2h, 2)
| project periods, periods_ma
| mvexpand periods, periods_ma
| extend days=2h*todouble(periods)/1d, days_ma=2h*todouble(periods_ma)/1d
//
// Native series functions for Anomaly Detection & Forecasting
//
// series_decompose()
//
// Decomposition of a set of time series to seasonal, trend, residual & baseline (which is just seasonal+trend)
// This function is used for the following series_decompose_anomalies() and series_decompose_forecast()
//
let min_t = datetime(2017-01-05);
let max_t = datetime(2017-02-03 22:00);
let dt = 2h;
demo_make_series2
| make-series num=avg(num) on TimeStamp from min_t to max_t step dt by sid
| where sid == 'TS1' // select a single time series just to get cleaner visualization
| extend (baseline, seasonal, trend, residual) = series_decompose(num, -1, 'linefit')
| render timechart with(title='Web app. traffic of a month, decmposition', ysplit=panels)
//
// series_decompose_anomalies()
//
// Anomaly Detection by decomposing the series using series_decompose() and then calculates custom Tukey's fence test (like in series_outliers()) on the residual component
// Note that in this example we use "| render anomalychart with anomalycolumns=anomalies" just to render the anomalies as bold points on top of the series charts
//
let min_t = datetime(2017-01-05);
let max_t = datetime(2017-02-03 22:00);
let dt = 2h;
demo_make_series2
| make-series num=avg(num) on TimeStamp from min_t to max_t step dt by sid
| where sid == 'TS1' // select a single time series just to get cleaner visualization
| extend (anomalies, score, baseline) = series_decompose_anomalies(num, 1.5, -1, 'linefit')
| render anomalychart with(anomalycolumns=anomalies, title='Web app. traffic of a month, anomalies')
//
// series_decompose_anomalies()
//
// Note that in this example series_decompose_anomalies() is called implicitly with default values by the "| render anomalychart" clause
//
let min_t = datetime(2016-08-29);
let max_t = datetime(2016-09-02);
demo_make_series1
| make-series num=count() on TimeStamp from min_t to max_t step 10m by OsVer
| where OsVer == 'Windows 10'
| render anomalychart with(title='Web app. traffic of 4 days, Point Anomalies by Time Series Decmposition')
//
// series_decompose_forecast()
//
// Forecasting by decomposing the series using series_decompose() and then extrapolate the baseline for the next week
//
let min_t = datetime(2017-01-05);
let max_t = datetime(2017-02-03 22:00);
let dt = 2h;
let horizon=7d;
demo_make_series2
| make-series num=avg(num) on TimeStamp from min_t to max_t+horizon step dt by sid
| where sid == 'TS1' // select a single time series just to get cleaner visualization
| extend forecast = series_decompose_forecast(num, toint(horizon/dt))
| render timechart with(title='Web app. traffic of a month, forecasting the next week by Time Series Decmposition')
//
// Forecast few time series at once
//
let min_t = datetime(2017-01-05);
let max_t = datetime(2017-02-03 22:00);
let dt = 2h;
let horizon=7d;
demo_make_series2
| make-series num=avg(num) on TimeStamp from min_t to max_t+horizon step dt by sid
| extend offset=case(sid=='TS3', 4000000, sid=='TS2', 2000000, 0) // add artificial offset for easy visualization of multiple time series
| extend num=series_add(num, offset)
| extend forecast = series_decompose_forecast(num, toint(horizon/dt))
| render timechart with(title='Web app. traffic of a month, forecasting the next week for 3 time series')
//
// Applying the above function on big set of time series is powerful!
//
// Demo of detection few anomalous time series out of thousands time series
//
// Demo table of internal DB sampled data for 4 days in 9/2016
demo_many_series1
| summarize num=count(), min_t=min(TIMESTAMP), max_t=max(TIMESTAMP)
demo_many_series1
| take 10
// Time series of data read metric in 1h resolution (total 4*24=96 points): root segment (i.e. global aggregation) looks ok
let min_t = toscalar(demo_many_series1 | summarize min(TIMESTAMP));
let max_t = toscalar(demo_many_series1 | summarize max(TIMESTAMP));
demo_many_series1
| make-series reads=avg(DataRead) on TIMESTAMP from min_t to max_t step 1h
| render timechart with(ymin=0)
// Partition the records set by Loc, Op & DB, total 18339 partitions
//
demo_many_series1
| summarize by Loc, Op, DB
| count
// View time series for 5 sample partitions
//
// helper function to filter series that had empty bins (filled by make-series using default 0 value)
//
let series_partial_sf = (series:dynamic, empty_val:real)
{
let se = series_equals(series, repeat(empty_val, array_length(series)));
let s = series_stats_dynamic(se);
let max_val = todouble(s.max);
max_val == 1
}
;
let min_t = toscalar(demo_many_series1 | summarize min(TIMESTAMP));
let max_t = toscalar(demo_many_series1 | summarize max(TIMESTAMP));
demo_many_series1
| make-series reads=avg(DataRead) on TIMESTAMP from min_t to max_t step 1h by Loc, Op, DB
| where series_partial_sf(reads, 0) == false
//| where Op == '41' and DB == '976' // nice ones
| sample 5
| render timechart with(ysplit=axes)
//
// Find top 2 trending down segments (out of 18339 segments)
//
let series_partial_sf = (series:dynamic, empty_val:real)
{
let se = series_equals(series, repeat(empty_val, array_length(series)));
let s = series_stats_dynamic(se);
let max_val = todouble(s.max);
max_val == 1
}
;
let min_t = toscalar(demo_many_series1 | summarize min(TIMESTAMP));
let max_t = toscalar(demo_many_series1 | summarize max(TIMESTAMP));
demo_many_series1
| make-series reads=avg(DataRead) on TIMESTAMP from min_t to max_t step 1h by Loc, Op, DB
| where series_partial_sf(reads, 0) == false
| extend series_fit_line(reads)
| top 2 by series_fit_line_reads_slope asc
| extend series_fit_2lines(reads)
| project Loc, Op, DB, TIMESTAMP, reads
| render timechart with(title='Service Traffic Outage for 2 instances (out of 18339)')