-
Notifications
You must be signed in to change notification settings - Fork 0
/
stat_xplore_table.py
389 lines (301 loc) · 17.6 KB
/
stat_xplore_table.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
# Functions to interact with the 'table' end point of the Stat-Xplore API
import json
import pandas as pd
import numpy as np
import requests
import os
import stat_xplore_schema
table_url = 'https://stat-xplore.dwp.gov.uk/webapi/rest/v1/table'
def json_response_to_dataframe(dict_response):
'''Take input sting of JSON formatted data returned by the Stat-Xplore API table end point and
unpack it into a pandas dataframe. The returned dataframe is in a 'long' format with a column for each field (uri and label)
and a column for the data value.
Args:
json_response (dict): Dictionary of data returned by the Stat-Xpore API table end point
Returns:
pandas DataFrame: The Stat-Xplore API data formatted as a DataFrame.
'''
# Unpack field labels and uris (IDs) plus the labels and uris (IDs) of items within each field.
field_items = { 'labels':[],
'uris':[]}
field_headers = { 'labels':[],
'uris':[]}
for field in dict_response['fields']:
# Unpack the labels
field_items['labels'].append(unpack_field_items(field['items'], item_values_to_return = 'labels'))
field_headers['labels'].append(field['label'])
# Unpack the uris
field_items['uris'].append(unpack_field_items(field['items'], item_values_to_return = 'uris'))
field_headers['uris'].append(field['uri'])
measure_uri = dict_response['measures'][0]['uri']
cubes_array = np.array(dict_response['cubes'][ measure_uri]['values'])
# unpack the data, using the field item values' IDs to index values
dict_data = unpack_cube_data(field_items['uris'],field_headers['uris'], cubes_array)
df = pd.DataFrame(dict_data)
# Add in the labels
final_headers = []
for i in range(len(field_headers['uris'])):
# Form lookup dictionary from uri to label
dict_field_lookup = dict(zip(field_items['uris'][i], field_items['labels'][i]))
# Create a new labels column by replacing the uris with their labels
df[field_headers['labels'][i]] = df[field_headers['uris'][i]].replace(dict_field_lookup)
# Add each header to list so that they are ordered by field type
final_headers.append(field_headers['uris'][i])
final_headers.append(field_headers['labels'][i])
# Add the value header
final_headers.append('value')
return df.reindex(columns = final_headers)
def unpack_cube_data(labels, headers, cubes_array):
'''For input lists of the field labels and the array of data, unpak the data assigning the coorect labels to each value.
Function can unpack 1d,2d and 3d data arrays. THis covers all possible dimensions returned by the stat-xplore API.
Data is unpacked into a dictionary of tuples.
Args:
labels (array of str): A 2d array containing lists of the labels to index data values with
headers (list of str): A list of the headers for the fields data is indexed by
cubes_array (multi-dim numpy array): The data values to unpack
Returns:
dict: Dictionary of the labels and the data values.
'''
dict_data = {}
for header in headers:
dict_data[header] = []
dict_data['value'] = []
cube_shape = cubes_array.shape
# Unpacking 1d
if len(cube_shape) == 1:
assert len(labels[0] == len(cubes_array[:]))
dict_data[headers[0]] += (labels[0])
dict_data['value'] += (list(cubes_array[:]))
# Unpacking 2d data
elif len(cube_shape) == 2:
assert len(labels[0]) == cube_shape[0]
assert len(labels[1]) == cube_shape[1]
for x in range(cube_shape[0]):
assert len(labels[1][:]) == len(cubes_array[x,:])
dict_data[headers[0]] += ([labels[0][x]]*cube_shape[1])
dict_data[headers[1]] += (labels[1][:])
dict_data['value'] += (list(cubes_array[x,:]))
# Unpacking 3d data
elif len(cube_shape) == 3:
assert len(labels[0]) == cube_shape[0]
assert len(labels[1]) == cube_shape[1]
assert len(labels[2]) == cube_shape[2]
for x in range(cube_shape[0]):
for y in range(cube_shape[1]):
assert len(labels[2][:]) == len(cubes_array[x,y,:])
dict_data[headers[0]] += ([labels[0][x]]*cube_shape[2])
dict_data[headers[1]] += ([labels[1][y]]*cube_shape[2])
dict_data[headers[2]] += (labels[2][:])
dict_data['value'] += list(cubes_array[x,y,:])
return dict_data
# Could change this function to unpack both ids and labels, return multidimensional array
def unpack_field_items(field_items, item_values_to_return = 'labels'):
'''The Stat-Xplore API returns fie;d values as an array of arrays, ie [ [value1], [value2], ...].
This function unpacks field values into a 1d array so that they can be parsed into a pandas DataFrame.
Args:
field_items (dict): Dictionary of field items. Each field has 'labels' and 'uris' items under which the values are stored
Kwargs:
item_values_to_return (str): Defaults to 'labels' sets which itemm type to unpack the values from.
Returns:
list: The unpacked field values
'''
item_values = []
if item_values_to_return not in ['labels','uris']:
print("unpack_field_items: Failed to unpack items. Unrecognised value type to return. Must be either 'labels' or 'uris'")
# Get the label or uri from each field item value
# Where field items are the 'Total' for all field values, a uri isn't given. Use the label in this instance
for item in field_items:
if item['type'] == 'Total':
item_values.append(item['labels'][0])
else:
item_values.append(item[item_values_to_return][0])
return item_values
def get_stat_xplore_measure_data(table_headers, schema_headers, measure_id, field_ids = None, fields_include_total = None, df_schema = None, geog_folder_label = 'Geography (residence-based)', geog_field_label= 'National - Regional - LA - OAs', geog_level_label = 'Local Authority'):
'''For an input measure ID and field IDs as well as the labels for the geography folder, field and level to get data for
build that dictionary of data to send to the Stat-Xplore table end point to request data.
Args:
table_headers (dict): The headers to uses for the request to the table endpoint of the Stat-Xplore API
schema_headers (dict): The headers to uses for the request to the schema endpoint of the Stat-Xplore API
measure_id (str): The id of the measure to request data for. This is the dataset, such as Attendence Allowance claimants,
that data is returned for
Kwargs:
field_ids (list of str, None): Default None. The field IDs of the fields to in intersect they data by
fields_include_total (str or list of str): The field IDs of the fields which will include the total across all field values in the returned data.
This cannot be the date field.
df_schema (pandas DataFrame, None): Default to None. A DataFrame of the Stat-Xplore schema
geog_folder_label (str): Defaults to 'Geography (residence-based)'. The label of the geography folder to get geography recodes from
geog_field_label (str): Defaults tp 'National - Regional - LA - OAs'. The label of the geography field to get geography recodes from.
geog_level_label (str): Defaults to 'Local Authority'. The label of the level (ie LAs, LSOAs etc) to get recodes for
Returns:
dict: Dictionary with the following items: 'data' - a pandas Data Frame of the request data, or None if request was unsucessfull;
'annotations' - A string of the annotations accoumpanying the data. Contains info on what the data show.
'''
# Build request body
body = build_request_body(table_headers, schema_headers, measure_id, field_ids = field_ids, fields_include_total = fields_include_total, df_schema = df_schema, geog_folder_label = geog_folder_label, geog_field_label = geog_field_label, geog_level_label = geog_level_label)
# Request data
response_dict = request_table(table_headers, json.dumps(body))
if response_dict['success'] == True:
json_data = response_dict['response'].json()
# Format data into dataframe
df_data = json_response_to_dataframe(json_data)
# Get database annotations (footnaotes)
database_annotation_keys = json_data['database']['annotationKeys']
database_annotations = {}
for key in database_annotation_keys:
database_annotations[key] = json_data['annotationMap'][key]
return {'data': df_data, 'annotations':database_annotations}
else:
return {'data':None, 'annotations':None}
def build_request_body(table_headers, schema_headers, measure_id, field_ids = None, fields_include_total = None, df_schema = None, geog_folder_label = 'Geography (residence-based)', geog_field_label= 'National - Regional - LA - OAs', geog_level_label = 'Local Authority'):
'''For an input measure ID and field IDs as well as the labels for the geography folder, field and level to get data for
build that dictionary of data to send to the Stat-Xplore table end point to request data.
Args:
table_headers (dict): The headers to uses for the request to the table endpoint of the Stat-Xplore API
schema_headers (dict): The headers to uses for the request to the schema endpoint of the Stat-Xplore API
measure_id (str): The id of the measure to request data for. This is the dataset, such as Attendence Allowance claimants,
that data is returned for
Kwargs:
field_ids (list of str, None): Default None. The field IDs of the fields to in intersect they data by
fields_include_total (str or list of str): The field IDs of the fields which will include the total across all field values in the returned data
This cannot be the date field.
df_schema (pandas DataFrame, None): Default to None. A DataFrame of the Stat-Xplore schema
geog_folder_label (str): Defaults to 'Geography (residence-based)'. The label of the geography folder to get geography recodes from
geog_field_label (str): Defaults tp 'National - Regional - LA - OAs'. The label of the geography field to get geography recodes from.
geog_level_label (str): Defaults to 'Local Authority'. The label of the level (ie LAs, LSOAs etc) to get recodes for
Returns:
dict: Dictionary with keys 'database', 'measures', 'recodes', 'dimensions'. This dictionary is sent to the stat-xplore API when requesting data
'''
# Get database id
database_id = 'str:database:' + measure_id.split(':')[-2]
database_value = database_id
measures_values = get_measures_request_body(measure_id)
recodes_values = get_geography_recodes_request_body(schema_headers, database_id, geog_folder_label = geog_folder_label, geog_field_label= geog_field_label, geog_level_label = geog_level_label, df_schema = df_schema)
dimensions_values = get_dimensions_body(schema_headers, database_id, field_ids, df_schema = df_schema)
# Add in geography recode field id to the dimensions
dimensions_values = dimensions_values + [[i] for i in list(recodes_values.keys())]
# Add in the field totals to the recodes so that the returned data includes field totals (ie total for all genders)
if fields_include_total is not None:
fields_include_total = [fields_include_total] if isinstance(fields_include_total, str) else fields_include_total
# Ensure that existing recode items are not changed
# The dimension id is the element within the array
for field in fields_include_total:
if field in recodes_values.keys():
continue
recodes_values[field] = {'total':True}
body = {'database':database_value,
'measures':measures_values,
'recodes': recodes_values,
'dimensions':dimensions_values}
return body
def get_dimensions_body(schema_headers, database_id, field_ids, df_schema = None):
'''Format the fields IDs to the required format for the dimensions section of the data request body.
If field IDs is None, get all available fields for the given database.
Args:
schema_headers (dict): The headers of the request.
database_id (str): The ID of the database to get dimension fields for.
field_ids (str or list of str or None): The fields to use as dimensions in the data request
df_schema (pandas DataFrame or None): The stat-xplore schema
Returns:
list: The field ids to use as dimensions properly formatted
'''
all_field_ids_dict = stat_xplore_schema.get_database_fields(schema_headers, database_id, df_schema = df_schema)
all_field_ids = list(all_field_ids_dict.values())
if field_ids is None:
return all_field_ids
field_ids = [field_ids] if isinstance(field_ids, str) else field_ids
dimensions_array = [[i] for i in field_ids if i in all_field_ids]
return dimensions_array
def get_measures_request_body(measure_ids):
'''Given an input measure ID or list of measure IDs return a dictionary of the format:
{'measures':['measure_id_1','measure_id_2', ...]}
Args:
measure_ids (str or list of str): The measure IDs to format
Returns:
list: A list of measure ids
'''
measure_ids = [measure_ids] if isinstance(measure_ids, str) else measure_ids
return measure_ids
def get_geography_recodes_request_body(schema_headers, database_id, geog_folder_label = 'Geography (residence-based)', geog_field_label= 'National - Regional - LA - OAs', geog_level_label = 'Local Authority', df_schema = None, check_cache = False, schema_filename = 'schema.csv'):
'''A wrapped function that gets the requested geography recodes and passes them to a functino that formats tha recodes into the
dictionary format required for requesting data.
Args:
schema_headers (dict): The headers of the request
database_id (str): The ID of the database to get dimension fields for
Kwargs:
geog_folder_label (str): Defaults to 'Geography (residence-based)'. The label of the geography folder to get geography recodes from
geog_field_label (str): Defaults tp 'National - Regional - LA - OAs'. The label of the geography field to get geography recodes from
geog_level_label (str): Defaults to 'Local Authority'. The label of the level (ie LAs, LSOAs etc) to get recodes for
'''
# Get the recodes
recodes_dict = stat_xplore_schema.geography_recodes_for_geog_folder_geog_level(schema_headers, database_id, geog_folder_label, geog_field_label, geog_level_label, df_schema, check_cache, schema_filename)
# Format the recodes
recodes_data = format_recodes_for_api(recodes_dict, include_total = True)
return recodes_data
def format_recodes_for_api(recodes_dict, include_total = True):
'''Takes input dictionary of recode field id:list of desired re code values
and formats this into a valid recodes object for reuesting table data from the API.
A valid recodes objects is a dictionary with recode field ids as keys. Each id looks to
another dictionary oject with a 'map' key and a 'total' key. Desired recodes are set as an array
of values, each in their own array, within the map object.
EG
'field_id':{ 'map':[
['field_value_1'],
['field_value_2'],
['field_value_3']
],
'total': true}
Args:
recodes_dict (dict): A dictionary with the recode field id as a key and a list of recode field
values as the item.
Kwargs:
include_total (bool): Default False. Set whether or not to set the 'total' option to 'true' or 'false'
This controls whether data returned by the request includes a total across all recode field values.
Returns:
dict: Recode field id and field values in the required format
'''
# Assumes there is only one recode field id
field_id = list(recodes_dict.keys())[0]
field_values = recodes_dict[field_id]
# Convert values into an array of arrays
map_values = [ [i] for i in field_values ]
if include_total == False:
total_value = False
else:
total_value = True
# Combine into single dict
recodes_data = {field_id:{
'map': map_values,
'total':total_value
}
}
# Might need to convert into json object (ie just text)
return recodes_data
def gen_xyz(x_max, y_max, z_max):
'''Generator. For input integers x_max, y_max, z_max iterate over each one and yield
a tuple of (x,y,z) coordinates.
Args:
x_max (int)
y_max (int)
z_max (int)
Returns:
tuple: (x,y,z) coordinates
'''
for x in range(x_max):
for y in range(y_max):
for z in range(z_max):
yield x,y,z
def request_table(table_headers, table_data):
'''Send request for table to API. Check request was successful.
Args:
url (str): The url of the request.
table_headers (dict): The headers of the request.
'''
global table_url
table_response = requests.post(table_url, headers = table_headers, data = table_data)
# Check that request was successful. If not print message and exit.
if table_response.raise_for_status() is not None:
print("Unsuccessful request to url:{}\nCheck url and API key.".format(url))
print("Response status:\n{}".format(table_response.raise_for_status()))
return {'success':False, 'response':None}
else:
return {'success':True, 'response':table_response}