From 45fb6c1e791dc8db7f15c71f60e1a43893b243bc Mon Sep 17 00:00:00 2001
From: Rowley <Charlotte.Rowley@ons.gov.uk>
Date: Tue, 2 Jan 2024 15:49:06 +0000
Subject: [PATCH] added step to reclassify first_learned data as self-taught

---
 NAMESPACE                                     |  2 +
 R/data_cleaning.R                             | 46 +++++++++++++++++++
 R/frequency-tables.R                          |  1 +
 main.R                                        |  3 +-
 man/clean_data.Rd                             | 17 +++++++
 man/clean_first_learned.Rd                    | 17 +++++++
 .../test-summarise_where_learned_code.R       |  9 ++--
 7 files changed, 90 insertions(+), 5 deletions(-)
 create mode 100644 man/clean_data.Rd
 create mode 100644 man/clean_first_learned.Rd

diff --git a/NAMESPACE b/NAMESPACE
index c99188b..7640c9f 100644
--- a/NAMESPACE
+++ b/NAMESPACE
@@ -5,7 +5,9 @@ export(apply_skip_logic)
 export(break_q_names)
 export(calculate_freqs)
 export(check_skip_logic)
+export(clean_data)
 export(clean_departments)
+export(clean_first_learned)
 export(clean_workplace)
 export(compare_models)
 export(create_filtered_pages)
diff --git a/R/data_cleaning.R b/R/data_cleaning.R
index 8f7e1e7..e713deb 100644
--- a/R/data_cleaning.R
+++ b/R/data_cleaning.R
@@ -129,6 +129,25 @@ rename_cols <- function(data) {
   return(data)
 }
 
+#' @title Clean data
+#'
+#' @description Recategorise department, workplace and first_learned data
+#'
+#' @param data cleaned CARS dataset
+#'
+#' @return CARS dataset
+#' @export
+
+clean_data <- function(data) {
+  # Run the three recategorisation steps one after another, feeding each
+  # result into the next: departments, workplace, then first_learned
+  departments_done <- clean_departments(data)
+  workplace_done <- clean_workplace(departments_done)
+  fully_cleaned <- clean_first_learned(workplace_done)
+  fully_cleaned
+}
+
+
 #' @title Clean department data
 #'
 #' @description add NHS to department list and merge departments where needed.
@@ -207,3 +226,30 @@ clean_workplace <- function(data) {
   return(data)
 
 }
+
+#' @title Clean first learned data
+#'
+#' @description reclassify 'other' free text responses into self-taught based on common terms used
+#'
+#' @param data cleaned CARS dataset
+#'
+#' @return CARS dataset
+#' @export
+
+clean_first_learned <- function(data) {
+  # Lower-case fragments that flag a response as self-taught
+  self_taught_terms <- c(
+    "self", "hobby", "personal", "independ", "home",
+    "for fun", "free time", "spare time", "childhood"
+  )
+  # One alternation pattern: matches when any fragment is present
+  term_pattern <- stringr::str_c(self_taught_terms, collapse = "|")
+
+  # Lower-case the responses so matching ignores case; NA responses give NA
+  # in the mask and are left as NA by the single-value assignment below
+  responses <- tolower(data$first_learned)
+  is_self_taught <- stringr::str_detect(responses, term_pattern)
+  data$first_learned[is_self_taught] <- "Self-taught"
+
+  data
+}
diff --git a/R/frequency-tables.R b/R/frequency-tables.R
index 36406c3..a314847 100644
--- a/R/frequency-tables.R
+++ b/R/frequency-tables.R
@@ -167,6 +167,7 @@ summarise_where_learned_code <- function(data){
               "Education",
               "Previous private sector employment",
               "Previous public sector employment",
+              "Self-taught",
               "Other")
 
   data <- data %>%
diff --git a/main.R b/main.R
index b0d2b2b..02b0fb8 100644
--- a/main.R
+++ b/main.R
@@ -3,8 +3,7 @@ library(magrittr)
 data <- CARS::get_tidy_data_file("2023_data.csv") %>%
   CARS::rename_cols() %>%
   CARS::apply_skip_logic() %>%
-  CARS::clean_workplace() %>%
-  CARS::clean_departments() %>%
+  CARS::clean_data() %>%
   CARS::derive_vars()
 
 CARS::create_filtered_pages(data, type = "departments")
diff --git a/man/clean_data.Rd b/man/clean_data.Rd
new file mode 100644
index 0000000..45d510f
--- /dev/null
+++ b/man/clean_data.Rd
@@ -0,0 +1,17 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/data_cleaning.R
+\name{clean_data}
+\alias{clean_data}
+\title{Clean data}
+\usage{
+clean_data(data)
+}
+\arguments{
+\item{data}{cleaned CARS dataset}
+}
+\value{
+CARS dataset
+}
+\description{
+Recategorise department, workplace and first_learned data
+}
diff --git a/man/clean_first_learned.Rd b/man/clean_first_learned.Rd
new file mode 100644
index 0000000..bb09ffe
--- /dev/null
+++ b/man/clean_first_learned.Rd
@@ -0,0 +1,17 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/data_cleaning.R
+\name{clean_first_learned}
+\alias{clean_first_learned}
+\title{Clean first learned data}
+\usage{
+clean_first_learned(data)
+}
+\arguments{
+\item{data}{cleaned CARS dataset}
+}
+\value{
+CARS dataset
+}
+\description{
+reclassify 'other' free text responses into self-taught based on common terms used
+}
diff --git a/tests/testthat/test-summarise_where_learned_code.R b/tests/testthat/test-summarise_where_learned_code.R
index 2122ac8..736dbd2 100644
--- a/tests/testthat/test-summarise_where_learned_code.R
+++ b/tests/testthat/test-summarise_where_learned_code.R
@@ -7,14 +7,14 @@ dummy_data <- data.frame(
     "Sometimes",
     "Regularly",
     "All the time"),
-    each=18),
+    each = 21),
 
   other_coding_experience = rep(c(
     NA,
     "Yes",
     "No"),
     times = 6,
-    each = 6),
+    each = 7),
 
   first_learned = rep(c(
     NA,
@@ -22,6 +22,7 @@ dummy_data <- data.frame(
     "Education",
     "Previous private sector employment",
     "Previous public sector employment",
+    "Self-taught",
     "Other"),
     times = 18)
 
@@ -46,15 +47,17 @@ test_that("summarise_where_learned_code output is as expected", {
       "Education",
       "Previous private sector employment",
       "Previous public sector employment",
+      "Self-taught",
       "Other"),
       levels = c(
         "Current employment",
         "Education",
         "Previous private sector employment",
         "Previous public sector employment",
+        "Self-taught",
         "Other")),
 
-    n = c(19/47, rep(7/47, times=4))
+    n = c(24/64, rep(8/64, times=5))
 
   )