From 45fb6c1e791dc8db7f15c71f60e1a43893b243bc Mon Sep 17 00:00:00 2001
From: Rowley
Date: Tue, 2 Jan 2024 15:49:06 +0000
Subject: [PATCH] added step to reclassify first_learned data as self-taught

---
Reviewer notes (free-form area, not part of the commit message or the diff):
 * The line structure of this patch was rebuilt from a copy whose newlines
   had been collapsed. Indentation and blank-line placement inside the
   hunks are a best-effort reconstruction, so context lines may need
   whitespace-tolerant matching when the patch is applied.
 * main.R previously ran clean_workplace() before clean_departments();
   clean_data() runs clean_departments() first. TODO confirm the two steps
   do not depend on each other's output.
 * clean_first_learned() tests every first_learned value against the term
   list, not only free-text "other" responses. This presumably relies on
   no preset answer option containing one of the terms - verify.

 NAMESPACE                                     |  2 +
 R/data_cleaning.R                             | 46 +++++++++++++++++++
 R/frequency-tables.R                          |  1 +
 main.R                                        |  3 +-
 man/clean_data.Rd                             | 17 +++++++
 man/clean_first_learned.Rd                    | 17 +++++++
 .../test-summarise_where_learned_code.R       |  9 ++--
 7 files changed, 90 insertions(+), 5 deletions(-)
 create mode 100644 man/clean_data.Rd
 create mode 100644 man/clean_first_learned.Rd

diff --git a/NAMESPACE b/NAMESPACE
index c99188b..7640c9f 100644
--- a/NAMESPACE
+++ b/NAMESPACE
@@ -5,7 +5,9 @@ export(apply_skip_logic)
 export(break_q_names)
 export(calculate_freqs)
 export(check_skip_logic)
+export(clean_data)
 export(clean_departments)
+export(clean_first_learned)
 export(clean_workplace)
 export(compare_models)
 export(create_filtered_pages)
diff --git a/R/data_cleaning.R b/R/data_cleaning.R
index 8f7e1e7..e713deb 100644
--- a/R/data_cleaning.R
+++ b/R/data_cleaning.R
@@ -129,6 +129,25 @@ rename_cols <- function(data) {
   return(data)
 }
 
+#' @title Clean data
+#'
+#' @description Recategorise department, workplace and first_learned data
+#'
+#' @param data cleaned CARS dataset
+#'
+#' @return CARS dataset
+#' @export
+
+clean_data <- function(data){
+
+  data %>%
+    clean_departments() %>%
+    clean_workplace() %>%
+    clean_first_learned()
+
+}
+
+
 #' @title Clean department data
 #'
 #' @description add NHS to department list and merge departments where needed.
@@ -207,3 +226,30 @@ clean_workplace <- function(data) {
   return(data)
 
 }
+
+#' @title Clean first learned data
+#'
+#' @description reclassify 'other' free text responses into self-taught based on common terms used
+#'
+#' @param data cleaned CARS dataset
+#'
+#' @return CARS dataset
+#' @export
+
+clean_first_learned <- function(data) {
+
+  matches <- c("self",
+               "hobby",
+               "personal",
+               "independ",
+               "home",
+               "for fun",
+               "free time",
+               "spare time",
+               "childhood")
+
+  data$first_learned[stringr::str_detect(tolower(data$first_learned), stringr::str_c(matches, collapse = "|"))] <- "Self-taught"
+
+  return(data)
+
+}
diff --git a/R/frequency-tables.R b/R/frequency-tables.R
index 36406c3..a314847 100644
--- a/R/frequency-tables.R
+++ b/R/frequency-tables.R
@@ -167,6 +167,7 @@ summarise_where_learned_code <- function(data){
               "Education",
               "Previous private sector employment",
               "Previous public sector employment",
+              "Self-taught",
               "Other")
 
   data <- data %>%
diff --git a/main.R b/main.R
index b0d2b2b..02b0fb8 100644
--- a/main.R
+++ b/main.R
@@ -3,8 +3,7 @@ library(magrittr)
 data <- CARS::get_tidy_data_file("2023_data.csv") %>%
   CARS::rename_cols() %>%
   CARS::apply_skip_logic() %>%
-  CARS::clean_workplace() %>%
-  CARS::clean_departments() %>%
+  CARS::clean_data() %>%
   CARS::derive_vars()
 
 CARS::create_filtered_pages(data, type = "departments")
diff --git a/man/clean_data.Rd b/man/clean_data.Rd
new file mode 100644
index 0000000..45d510f
--- /dev/null
+++ b/man/clean_data.Rd
@@ -0,0 +1,17 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/data_cleaning.R
+\name{clean_data}
+\alias{clean_data}
+\title{Clean data}
+\usage{
+clean_data(data)
+}
+\arguments{
+\item{data}{cleaned CARS dataset}
+}
+\value{
+CARS dataset
+}
+\description{
+Recategorise department, workplace and first_learned data
+}
diff --git a/man/clean_first_learned.Rd b/man/clean_first_learned.Rd
new file mode 100644
index 0000000..bb09ffe
--- /dev/null
+++ b/man/clean_first_learned.Rd
@@ -0,0 +1,17 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/data_cleaning.R
+\name{clean_first_learned}
+\alias{clean_first_learned}
+\title{Clean first learned data}
+\usage{
+clean_first_learned(data)
+}
+\arguments{
+\item{data}{cleaned CARS dataset}
+}
+\value{
+CARS dataset
+}
+\description{
+reclassify 'other' free text responses into self-taught based on common terms used
+}
diff --git a/tests/testthat/test-summarise_where_learned_code.R b/tests/testthat/test-summarise_where_learned_code.R
index 2122ac8..736dbd2 100644
--- a/tests/testthat/test-summarise_where_learned_code.R
+++ b/tests/testthat/test-summarise_where_learned_code.R
@@ -7,14 +7,14 @@ dummy_data <- data.frame(
     "Sometimes",
     "Regularly",
     "All the time"),
-    each=18),
+    each = 21),
 
   other_coding_experience = rep(c(
     NA,
     "Yes",
     "No"),
     times = 6,
-    each = 6),
+    each = 7),
 
   first_learned = rep(c(
     NA,
@@ -22,6 +22,7 @@ dummy_data <- data.frame(
     "Education",
     "Previous private sector employment",
     "Previous public sector employment",
+    "Self-taught",
     "Other"),
     times = 18)
 
@@ -46,15 +47,17 @@ test_that("summarise_where_learned_code output is as expected", {
       "Education",
       "Previous private sector employment",
       "Previous public sector employment",
+      "Self-taught",
       "Other"),
       levels = c(
         "Current employment",
         "Education",
         "Previous private sector employment",
         "Previous public sector employment",
+        "Self-taught",
         "Other")),
 
-    n = c(19/47, rep(7/47, times=4))
+    n = c(24/64, rep(8/64, times=5))
 
   )
 