-
Notifications
You must be signed in to change notification settings - Fork 18
/
Python-Plugin-Tutorial.csl
206 lines (186 loc) · 6.12 KB
/
Python-Plugin-Tutorial.csl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
//
// Demo of KQL + inline Python
//
// Client directive: connect to database 'ML' on cluster demo11.westus.kusto.windows.net.
#connect cluster('demo11.westus.kusto.windows.net').database('ML')
// Writing or pasting this simple Python script in KE as-is doesn't work: KQL reports parsing errors.
result = df
result["x2"] = df["x"].pow(2)
result["x3"] = df["x"].pow(3)
// Mark the above lines and press either F2 followed by OK, or Ctrl+K, Ctrl+S, to decorate them. The print below verifies that the decorated text is a legal KQL multi-line string.
// The decorated script: three adjacent string literals, one per Python line,
// which KQL joins into a single string value.
print
    'result = df\n'
    'result["x2"] = df["x"].pow(2)\n'
    'result["x3"] = df["x"].pow(3)\n'
//
// Run it
//
// Preview of the input: a single column x holding 1..10.
range x from 1 to 10 step 1

// Same input piped through the Python plugin. This is a separate query from the
// preview above, hence the blank line. The output schema is every input
// column (*) plus the x2 and x3 columns that the script adds to the dataframe.
let code =
    'result = df\n'
    'result["x2"] = df["x"].pow(2)\n'
    'result["x3"] = df["x"].pow(3)\n'
;
range x from 1 to 10 step 1
| evaluate python(typeof(*, x2:int, x3:int), code)
//
// Demo of fitting time series with high order polynomial using numpy.polyfit
//
// Original time series (Web service telemetry views)
//
// One series per OsVer: record counts in 5-minute bins over the day ending at end_t.
let end_t = datetime(2016-09-03);
let start_t = end_t - 1d;
demo_make_series1
| make-series num = count() on TimeStamp from start_t to end_t step 5m by OsVer
| render timechart
// Linear fit (1st degree polynomial) using native series_fit_line()
//
// Same series as before, extended with the output of series_fit_line() on num.
let end_t = datetime(2016-09-03);
let start_t = end_t - 1d;
demo_make_series1
| make-series num = count() on TimeStamp from start_t to end_t step 5m by OsVer
| extend series_fit_line(num)
| render timechart
//
// Better fit with high order polynomial using numpy.polyfit
//
// Note: we wrap the Python script in a Kusto function (lambda) to pass parameters and improve readability
//
let series_fit_poly=(tbl:(*), in_series:string, out_series:string, degree:int)
{
    // Fits a polynomial of the given degree to the series stored in column
    // in_series of every row (x is the element index 0..len-1) and writes the
    // fitted values to column out_series.
    // The output schema is typeof(*), i.e. identical to the input schema, so
    // out_series is expected to exist in tbl already.
    //
    // Parameters for the script; the Python code reads them from kargs.
    let script_params = pack('in_series', in_series, 'out_series', out_series, 'degree', degree);
    let script =
        '\n'
        'in_series = kargs["in_series"]\n'
        'out_series = kargs["out_series"]\n'
        'degree = kargs["degree"]\n'
        '\n'
        'def fit(s, deg):\n'
        ' x = np.arange(len(s))\n'
        ' coeff = np.polyfit(x, s, deg)\n'
        ' p = np.poly1d(coeff)\n'
        ' z = p(x)\n'
        ' return z\n'
        '\n'
        'result = df\n'
        'result[out_series] = df[in_series].apply(fit, args=(degree,))\n'
    ;
    tbl
    | evaluate python(typeof(*), script, script_params) // new plugin syntax
};
//
// Chart the raw series, its linear fit and a 5th degree polynomial fit together.
let end_t = datetime(2016-09-03);
let start_t = end_t - 1d;
demo_make_series1
| make-series num = count() on TimeStamp from start_t to end_t step 5m by OsVer
| extend series_fit_line(num)
| extend fit_num = dynamic(null) // placeholder column that series_fit_poly fills in
| invoke series_fit_poly('num', 'fit_num', 5)
| render timechart
//
// Demo of K-Means (from Scikit-learn)
//
//
// OccupancyDetection dataset from UCI Repository (https://archive.ics.uci.edu/ml/datasets/Occupancy+Detection+)
//
// Experimental data used for binary classification (room occupancy) from Temperature, Humidity, Light and CO2.
// Ground-truth occupancy was obtained from time stamped pictures that were taken every minute
//
// Look at 20 rows of the dataset.
OccupancyDetection
| take 20

// Separate query (hence the blank line above): total number of rows.
OccupancyDetection
| count
//
// K-Means clustering on the sensor measurements
//
// Append a cluster id to each record
//
let kmeans_sf=(tbl:(*), k:int, features:dynamic, cluster_col:string)
{
    // Runs scikit-learn KMeans with k clusters on the columns listed in features
    // and stores every row's cluster label in column cluster_col.
    // The output schema is typeof(*), i.e. identical to the input schema, so
    // cluster_col is expected to exist in tbl already.
    //
    // Parameters for the script; the Python code reads them from kargs.
    let script_params = pack('k', k, 'features', features, 'cluster_col', cluster_col);
    let script =
        '\n'
        'from sklearn.cluster import KMeans\n'
        '\n'
        'k = kargs["k"]\n'
        'features = kargs["features"]\n'
        'cluster_col = kargs["cluster_col"]\n'
        '\n'
        'km = KMeans(n_clusters=k)\n'
        'df1 = df[features]\n'
        'km.fit(df1)\n'
        'result = df\n'
        'result[cluster_col] = km.labels_\n'
    ;
    tbl
    | evaluate python(typeof(*), script, script_params)
};
//
let feature_cols = pack_array("Temperature", "Humidity", "Light", "CO2", "HumidityRatio");
let num_clusters = 5;
//
// Label every record with its cluster and show 10 random rows of the result.
OccupancyDetection
| extend cluster_id = double(null) // placeholder column that kmeans_sf fills in
| invoke kmeans_sf(num_clusters, feature_cols, "cluster_id")
| sample 10
//
// Same, but output the centroids and sizes of the clusters:
//
let kmeans=(tbl:(*), k:int, features:dynamic)
{
    // Runs scikit-learn KMeans with k clusters on the columns listed in features
    // and returns one row per cluster: the centroid coordinates in the feature
    // columns, followed by cluster_id and num (the number of rows labelled with
    // that cluster).
    // Both extra columns are inserted at position df.shape[1], and the output
    // schema is typeof(*, cluster_id:int, num:long); tbl is therefore expected
    // to contain the feature columns only.
    //
    // Parameters for the script; the Python code reads them from kargs.
    let kwargs = pack('k', k, 'features', features);
    let script =
        '\n'
        'from sklearn.cluster import KMeans\n'
        '\n'
        'k = kargs["k"]\n'
        'features = kargs["features"]\n'
        '\n'
        'km = KMeans(n_clusters=k)\n'
        'df1 = df[features]\n'
        'km.fit(df1)\n'
        'result = pd.DataFrame(km.cluster_centers_, columns=df1.columns)\n'
        'result.insert(df.shape[1], "num", pd.DataFrame(km.labels_, columns=["n"]).groupby("n").size())\n'
        'result.insert(df.shape[1], "cluster_id", range(k))\n'
    ;
    tbl
    | evaluate python(typeof(*, cluster_id:int, num:long), script, kwargs)
};
//
let feature_cols = pack_array("Temperature", "Humidity", "Light", "CO2", "HumidityRatio");
let num_clusters = 5;
//
// Drop the columns that are not clustered on (presumably leaving only the
// feature columns - verify against the table schema), then compute the
// centroid and size of every cluster.
OccupancyDetection
| project-away Timestamp, Occupancy, Test
| invoke kmeans(num_clusters, feature_cols)
//
// Demo of predicting (a.k.a. scoring) by classifier (from Scikit-learn)
//
// Train the classifier in Jupyter (or another Python IDE), persist the resulting model in a Kusto table, and use it to classify new samples
//
// Using the same OccupancyDetection dataset to predict room occupancy
//
let classify_sf=(samples:(*), models_tbl:(name:string, timestamp:datetime, model:string), model_name:string, features_cols:dynamic, pred_col:string)
{
    // Scores samples with a model stored in models_tbl: the script hex-decodes
    // and unpickles the model, calls predict() on the columns listed in
    // features_cols and writes the predictions to column pred_col.
    // The output schema is typeof(*), i.e. identical to the input schema, so
    // pred_col is expected to exist in samples already.
    //
    // Most recent model stored under model_name.
    let serialized_model = toscalar(
        models_tbl
        | where name == model_name
        | top 1 by timestamp desc
        | project model);
    // Parameters for the script; the Python code reads them from kargs.
    let script_params = pack('smodel', serialized_model, 'features_cols', features_cols, 'pred_col', pred_col);
    // NOTE(review): pickle.loads can execute arbitrary code while loading, so
    // models_tbl must only ever contain models from a trusted source.
    let script =
        '\n'
        'import pickle\n'
        'import binascii\n'
        '\n'
        'smodel = kargs["smodel"]\n'
        'features_cols = kargs["features_cols"]\n'
        'pred_col = kargs["pred_col"]\n'
        'bmodel = binascii.unhexlify(smodel)\n'
        'clf1 = pickle.loads(bmodel)\n'
        'df1 = df[features_cols]\n'
        'predictions = clf1.predict(df1)\n'
        '\n'
        'result = df\n'
        'result[pred_col] = pd.DataFrame(predictions, columns=[pred_col])'
        '\n'
    ;
    samples
    | evaluate python(typeof(*), script, script_params)
};
// Score the rows where Test == 1 with the stored 'Occupancy' model and evaluate
// the predictions against the ground-truth Occupancy column.
OccupancyDetection
| where Test == 1
| extend pred_Occupancy=bool(0) // placeholder column that classify_sf fills in
| invoke classify_sf(ML_Models, 'Occupancy', pack_array('Temperature', 'Humidity', 'Light', 'CO2', 'HumidityRatio'), 'pred_Occupancy')
| fork
    (summarize n=count() by Occupancy, pred_Occupancy) // confusion matrix
    // countif() counts only the rows matching the predicate; count() takes no argument.
    (summarize accuracy = 100.0*countif(Occupancy == pred_Occupancy)/count()) // accuracy