generated from byuibigdata/project_safegraph_old
safegraph_functions.py
# https://towardsdatascience.com/cleaning-and-extracting-json-from-pandas-dataframes-f0c15f93cb38
# https://packaging.python.org/tutorials/packaging-projects/
import pandas as pd
import json
import re


def jsonloads(x):
    # parse a JSON string, passing missing values through as None
    if pd.isna(x):
        return None
    else:
        return json.loads(x)


def createlist(x):
    # split a Series of bracketed list strings (e.g. "[1,2,3]") into lists of strings
    try:
        return x.str.strip('][').str.split(',')
    except:
        return None


def rangenumbers(x):
    # number the rows within a group: a single row gets 0, otherwise 1..n
    if x.size == 1:
        return 0
    else:
        return range(1, x.size + 1)
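

# Illustrative sketch (not part of the original file) of how the helpers behave on made-up
# values: jsonloads parses one JSON string, createlist splits a Series of bracketed list
# strings, and rangenumbers numbers the members of a group.
#
#   jsonloads('{"Monday": 10, "Tuesday": 12}')      # -> {'Monday': 10, 'Tuesday': 12}
#   jsonloads(float('nan'))                          # -> None
#   createlist(pd.Series(['[1,2,3]', '[4,5]']))      # -> Series of [['1', '2', '3'], ['4', '5']]
#   rangenumbers(pd.Series(['1', '2', '3']))         # -> range(1, 4)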


def expand_json(var, dat, wide=True):
    # expand a JSON-encoded column into its own columns (wide)
    # or a long placekey/variable/value table (wide=False)
    rowid = dat.placekey
    start_date = dat.date_range_start
    end_date = dat.date_range_end
    parsedat = dat[var]
    loadsdat = parsedat.apply(jsonloads)
    df_wide = pd.json_normalize(loadsdat)
    # clean up store names so they work as column names
    col_names = df_wide.columns
    col_names = [re.sub(r'[^\w\s]', '', x) for x in col_names]  # drop punctuation and other special characters
    col_names = [str(col).lower().replace(" ", "_") for col in col_names]  # replace spaces with underscores
    col_names_long = [var + '-' + col for col in col_names]
    # rename the columns
    df_wide.columns = col_names_long  # add variable name to column names
    # id cols
    id_cols = ["placekey", "startDate", "endDate"]
    df_wide = df_wide.assign(
        placekey=rowid,
        startDate=start_date,
        endDate=end_date)
    out = df_wide.loc[:, id_cols + col_names_long]
    if not wide:
        out = (out
               .melt(id_vars=id_cols)
               .dropna(axis=0, subset=['value'])
               .assign(variable=lambda x: x.variable.str.replace(var + '-', ''))
               )
    return out
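

# Assumed usage sketch (not in the original file): expand_json targets columns whose cells
# hold JSON objects, such as 'bucketed_dwell_times', 'popularity_by_day', or
# 'visitor_home_cbgs' in typical SafeGraph patterns exports (column names assumed here;
# verify them against your data). The input file name is hypothetical.
#
#   patterns = pd.read_csv("patterns.csv")
#   dwell_wide = expand_json("bucketed_dwell_times", patterns)               # one column per bucket
#   dwell_long = expand_json("bucketed_dwell_times", patterns, wide=False)   # placekey/variable/value rows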


def expand_list(var, dat):
    # explode a list-encoded column (e.g. "[1,2,3]") into one row per element,
    # numbering the elements as hour, day, or sequence based on the variable name
    date_df = pd.DataFrame({
        "startDate": dat.date_range_start,
        "endDate": dat.date_range_end,
        "placekey": dat.placekey})
    dat_expand = (dat
                  .assign(lvar=createlist(dat[var]))
                  .filter(["placekey", "lvar"])
                  .explode("lvar")
                  .reset_index(drop=True)
                  .rename(columns={"lvar": var})
                  )
    dat_label = (dat_expand
                 .groupby('placekey', sort=False)
                 .transform(lambda x: rangenumbers(x))
                 .reset_index(drop=True)
                 )
    if var.find("hour") != -1:
        orderName = 'hour'
    elif var.find("day") != -1:
        orderName = 'day'
    else:
        orderName = 'sequence'
    # dat_label.columns = ['sequence']
    dat_label.rename(columns={var: orderName}, inplace=True)
    out = (pd.concat([dat_expand, dat_label], axis=1)
           .merge(date_df, on='placekey')
           )
    out[var] = out[var].astype(float)
    out = out.filter(['placekey', 'startDate', 'endDate', orderName, var], axis=1)
    return out
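

# Hedged demo sketch (not part of the original module): a tiny, made-up patterns-style
# DataFrame run through both expanders. The column names mirror the SafeGraph patterns
# schema the functions expect; the values are invented for illustration.
if __name__ == "__main__":
    demo = pd.DataFrame({
        "placekey": ["pk-1", "pk-2"],
        "date_range_start": ["2020-06-01", "2020-06-01"],
        "date_range_end": ["2020-06-08", "2020-06-08"],
        "visits_by_day": ["[1,2,3]", "[4,5,6]"],
        "bucketed_dwell_times": ['{"<5": 7, "5-20": 3}', '{"<5": 2, "5-20": 9}'],
    })
    # JSON column -> one row per (placekey, bucket) in long format
    print(expand_json("bucketed_dwell_times", demo, wide=False))
    # list column -> one row per (placekey, day) with the day number attached
    print(expand_list("visits_by_day", demo))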