-
Notifications
You must be signed in to change notification settings - Fork 0
/
02-missing_clean.r
90 lines (70 loc) · 2.4 KB
/
02-missing_clean.r
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
source("libraries.R")
source("functions.R")
# parameters --------------------------------------------------------------
# Missing-value cutoff, in percent: a variable whose share of NA values is
# strictly greater than this number is dropped during cleaning.
k_mis_var <- 45
# read table --------------------------------------------------------------
# Full working table; every column group below is carved out of it.
base <- readRDS(file = "data/working/tabla.rds")

# identifier columns (id_*)
base_ids <- base %>%
  select(starts_with("id_"))

# target columns (y_*)
base_y <- base %>%
  select(starts_with("y_"))

# numeric predictors (identifiers and targets excluded)
base_num <- base %>%
  select_if(is.numeric) %>%
  select(-starts_with("id_"), -starts_with("y_"))

# non-numeric predictors (identifiers and targets excluded).
# Without this exclusion a non-numeric id_* or y_* column would sit both here
# and in base_ids / base_y, and would be duplicated once the groups are
# column-bound back together during cleaning.
base_cat <- base %>%
  select_if(function(x) !is.numeric(x)) %>%
  select(-starts_with("id_"), -starts_with("y_"))
# missing analysis --------------------------------------------------------
# Percentage of missing cells per row, rows with most missing first.
# rowSums() over the NA mask replaces a row-wise apply(), which coerced every
# row of the data frame to character and looped in R; the resulting
# percentages are the same (NA count / number of columns * 100).
mis_row <- base %>%
  mutate(misperc = rowSums(is.na(base)) / ncol(base) * 100) %>%
  select(id_tot, misperc) %>%
  arrange(desc(misperc))

# Percentage of missing values per variable, worst variables first.
# Computed directly from the NA mask (NA count / number of rows * 100) so the
# result does not depend on the installed skimr version: the previous
# skim_to_wide() call relied on the `missing` and `n` columns of skimr v1.
mis_col <- dplyr::tibble(
  variable = names(base),
  misperc = unname(colSums(is.na(base))) / nrow(base) * 100
) %>%
  arrange(desc(misperc))
# clean data -----------------------------------------------------
# NOTE: no imputation method is included for non-numeric variables
# (they end up not being used afterwards);
# add one if they are used for outliers and/or GAM.

# variables to drop: missing% strictly above the k_mis_var threshold
vars_drop_mis <- mis_col %>%
  dplyr::filter(misperc > k_mis_var) %>%
  pull(variable)

# The drop list mixes numeric and non-numeric variable names, and it is
# applied twice (before imputation and after binding). Each time it is
# intersected with the columns actually present, because one_of() warns about
# every name it cannot find; the columns removed are the same as before.
base_clean <- base_num %>%
  # drop numeric variables with high missing% before imputing, so the
  # imputation only sees the columns that are kept
  select(-one_of(intersect(vars_drop_mis, names(base_num)))) %>%
  # impute numeric NA with median polish
  # (imp_medpol() is defined outside this file, presumably in functions.R)
  imp_medpol() %>%
  # bind ids, categorical and targets in front of the imputed numeric block
  {
    bind_cols(base_ids, base_cat, base_y, .)
  } %>%
  # drop the remaining (non-numeric) variables with high missing%
  select(-one_of(intersect(vars_drop_mis, names(.)))) %>%
  # drop observations where any target (y_*) is missing (none so far)
  drop_na(starts_with("y_"))
# save table --------------------------------------------------------------
# Write the cleaned table to disk as an RDS file.
saveRDS(object = base_clean, file = "data/working/base_clean_01.rds")
# old ---------------------------------------------------------------------
# base_clean = base_num %>%
# # impute NAs in numeric with median
# mutate_all(
# function(x) ifelse(is.na(x), median(x, na.rm=T), x)
# ) %>%
# {bind_cols(base_ids,
# base_cat,
# .)} %>%
# # drop vars with high perc of missing
# select(-vars_drop_mis)