Skip to content

Commit

Permalink
Improve docs for data_to_wide() (#506)
Browse files Browse the repository at this point in the history
* Improve docs for data_to_wide

* fix

* fix

* lintr

* update docs, deprecate arg, update test

* update

* update readme

* add examples

* also improve data_to_long

* update test

* wordlist

* update docs

* address comments

* apply suggestions

* docs

* update docs

* address suggestions

* address comments

* typo

* Update NEWS.md

* formatting news

* plural

* by -> id_cols

* news

* fix

* fix warning in test

* typo

* lintr, whitespace

* lintr (simplify else)

---------

Co-authored-by: Etienne Bacher <[email protected]>
  • Loading branch information
strengejacke and etiennebacher authored May 31, 2024
1 parent 2c2a906 commit a7d3c80
Show file tree
Hide file tree
Showing 13 changed files with 521 additions and 171 deletions.
2 changes: 2 additions & 0 deletions NEWS.md
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,8 @@ CHANGES
If you recode into a numeric variable, and one of the recode values is `NA`,
you no longer need to use `NA_real_` for numeric `NA` values.

* Improved documentation for some functions.

BUG FIXES

* `data_to_long()` did not work for data frames where columns had attributes
Expand Down
39 changes: 21 additions & 18 deletions R/data_read.R
Original file line number Diff line number Diff line change
Expand Up @@ -70,7 +70,7 @@
#' factors, where imported value labels will be set as factor levels. If a
#' numeric variable has _no_ value labels or fewer value labels than values, it
#' is not converted to factor. In this case, value labels are preserved as
#' `"labels"` attribute. Character vectors are preserved. Use
#' `"labels"` attribute. Character vectors are preserved. Use
#' `convert_factors = FALSE` to remove the automatic conversion of numeric
#' variables to factors.
#'
Expand Down Expand Up @@ -105,7 +105,7 @@ data_read <- function(path,
por = .read_spss(path, encoding, convert_factors, verbose, ...),
dta = .read_stata(path, encoding, convert_factors, verbose, ...),
sas7bdat = .read_sas(path, path_catalog, encoding, convert_factors, verbose, ...),
.read_unknown(path, convert_factors, verbose, ...)
.read_unknown(path, file_type, convert_factors, verbose, ...)
)

# tell user about empty columns
Expand Down Expand Up @@ -178,20 +178,18 @@ data_read <- function(path,
if (is.character(i)) {
# we need this to drop haven-specific class attributes
i <- as.character(i)
} else {
} else if (!is.null(value_labels) && length(value_labels) == insight::n_unique(i)) {
# if all values are labelled, we assume factor. Use labels as levels
if (!is.null(value_labels) && length(value_labels) == insight::n_unique(i)) {
if (is.numeric(i)) {
i <- factor(i, labels = names(value_labels))
} else {
i <- factor(as.character(i), labels = names(value_labels))
}
value_labels <- NULL
attr(i, "converted_to_factor") <- TRUE
if (is.numeric(i)) {
i <- factor(i, labels = names(value_labels))
} else {
# else, fall back to numeric
i <- as.numeric(i)
i <- factor(as.character(i), labels = names(value_labels))
}
value_labels <- NULL
attr(i, "converted_to_factor") <- TRUE
} else {
# else, fall back to numeric
i <- as.numeric(i)
}

# drop unused value labels
Expand Down Expand Up @@ -290,12 +288,18 @@ data_read <- function(path,
}


.read_unknown <- function(path, convert_factors, verbose, ...) {
insight::check_if_installed("rio", reason = paste0("to read files of type '", .file_ext(path), "'"))
.read_unknown <- function(path, file_type, convert_factors, verbose, ...) {
insight::check_if_installed("rio", reason = paste0("to read files of type '", file_type, "'"))
if (verbose) {
insight::format_alert("Reading data...")
}
out <- rio::import(file = path, ...)
# set up arguments. for RDS, we set trust = TRUE, to avoid warnings
rio_args <- list(file = path)
# check if we have RDS, and if so, add trust = TRUE
if (file_type == "rds") {
rio_args$trust <- TRUE
}
out <- do.call(rio::import, c(rio_args, list(...)))

# for "unknown" data formats (like .RDS), which still can be imported via
# "rio::import()", we must check whether we actually have a data frame or
Expand All @@ -310,9 +314,8 @@ data_read <- function(path,
)
}
return(out)
} else {
out <- tmp
}
out <- tmp
}

.post_process_imported_data(out, convert_factors, verbose)
Expand Down
1 change: 1 addition & 0 deletions R/data_restoretype.R
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
#' Restore the type of columns according to a reference data frame
#'
#' @param data A data frame for which to restore the column types.
#' @inheritParams data_to_long
#' @inheritParams data_rename
#' @param reference A reference data frame from which to find the correct
Expand Down
105 changes: 82 additions & 23 deletions R/data_to_long.R
Original file line number Diff line number Diff line change
Expand Up @@ -4,65 +4,124 @@
#' the number of columns. This is a dependency-free base-R equivalent of
#' `tidyr::pivot_longer()`.
#'
#' @param data A data frame to pivot.
#' @param names_to The name of the new column that will contain the column
#' names.
#' @param data A data frame to convert to long format, so that it has more
#' rows and fewer columns after the operation.
#' @param names_to The name of the new column (variable) that will contain the
#' _names_ from columns in `select` as values, to identify the source of the
#' values. `names_to` can be a character vector with more than one column name,
#' in which case `names_sep` or `names_pattern` must be provided in order to
#' identify which parts of the column names go into newly created columns.
#' See also 'Examples'.
#' @param names_prefix A regular expression used to remove matching text from
#' the start of each variable name.
#' @param names_sep,names_pattern If `names_to` contains multiple values, this
#' argument controls how the column name is broken up.
#' `names_pattern` takes a regular expression containing matching groups, i.e. "()".
#' @param values_to The name of the new column that will contain the values of
#' the pivoted variables.
#' argument controls how the column name is broken up. `names_pattern` takes a
#' regular expression containing matching groups, i.e. "()".
#' @param values_to The name of the new column that will contain the _values_ of
#' the columns in `select`.
#' @param values_drop_na If `TRUE`, will drop rows that contain only `NA` in the
#' `values_to` column. This effectively converts explicit missing values to
#' implicit missing values, and should generally be used only when missing values
#' in data were created by its structure.
#' `values_to` column. This effectively converts explicit missing values to
#' implicit missing values, and should generally be used only when missing values
#' in data were created by its structure.
#' @param rows_to The name of the column that will contain the row names or row
#' numbers from the original data. If `NULL`, will be removed.
#' numbers from the original data. If `NULL`, will be removed.
#' @param ... Currently not used.
#' @inheritParams extract_column_names
#' @param cols Identical to `select`. This argument is here to ensure compatibility
#' with `tidyr::pivot_longer()`. If both `select` and `cols` are provided, `cols`
#' is used.
#' with `tidyr::pivot_longer()`. If both `select` and `cols` are provided, `cols`
#' is used.
#'
#' @details
#' Reshaping data into long format usually means that the input data frame is
#' in _wide_ format, where multiple measurements taken on the same subject are
#' stored in multiple columns (variables). The long format stores the same
#' information in a single column, with each measurement per subject stored in
#' a separate row. The values of all variables that are not in `select` will
#' be repeated.
#'
#' The necessary information for `data_to_long()` is:
#'
#' - The columns that contain the repeated measurements (`select`).
#' - The name of the newly created column that will contain the names of the
#' columns in `select` (`names_to`), to identify the source of the values.
#' `names_to` can also be a character vector with more than one column name,
#' in which case `names_sep` or `names_pattern` must be provided to specify
#' which parts of the column names go into the newly created columns.
#' - The name of the newly created column that contains the values of the
#' columns in `select` (`values_to`).
#'
#' In other words: repeated measurements that are spread across several columns
#' will be gathered into a single column (`values_to`), with the original column
#' names, that identify the source of the gathered values, stored in one or more
#' new columns (`names_to`).
#'
#' @return If a tibble was provided as input, `data_to_long()` also returns a
#' tibble. Otherwise, it returns a data frame.
#'
#' @examplesIf requireNamespace("psych") && requireNamespace("tidyr")
#' wide_data <- data.frame(replicate(5, rnorm(10)))
#' wide_data <- setNames(
#' data.frame(replicate(2, rnorm(8))),
#' c("Time1", "Time2")
#' )
#' wide_data$ID <- 1:8
#' wide_data
#'
#' # Default behaviour (equivalent to tidyr::pivot_longer(wide_data, cols = 1:5))
#' # Default behaviour (equivalent to tidyr::pivot_longer(wide_data, cols = 1:3))
#' # probably doesn't make much sense to mix "time" and "id"
#' data_to_long(wide_data)
#'
#' # Customizing the names
#' data_to_long(wide_data,
#' select = c(1, 2),
#' names_to = "Column",
#' values_to = "Numbers",
#' rows_to = "Row"
#' data_to_long(
#' wide_data,
#' select = c("Time1", "Time2"),
#' names_to = "Timepoint",
#' values_to = "Score"
#' )
#'
#' # Reshape multiple columns into long format.
#' mydat <- data.frame(
#' age = c(20, 30, 40),
#' sex = c("Female", "Male", "Male"),
#' score_t1 = c(30, 35, 32),
#' score_t2 = c(33, 34, 37),
#' score_t3 = c(36, 35, 38),
#' speed_t1 = c(2, 3, 1),
#' speed_t2 = c(3, 4, 5),
#' speed_t3 = c(1, 8, 6)
#' )
#' # The column names are split into two columns: "type" and "time". The
#' # pattern for splitting column names is provided in `names_pattern`. Values
#' # of all "score_*" and "speed_*" columns are gathered into a single column
#' # named "count".
#' data_to_long(
#' mydat,
#' select = 3:8,
#' names_to = c("type", "time"),
#' names_pattern = "(score|speed)_t(\\d+)",
#' values_to = "count"
#' )
#'
#' # Full example
#' # ------------------
#' data <- psych::bfi # Wide format with one row per participant's personality test
#'
#' # Pivot to long format
#' data_to_long(data,
#' very_long_data <- data_to_long(data,
#' select = regex("\\d"), # Select all columns that contain a digit
#' names_to = "Item",
#' values_to = "Score",
#' rows_to = "Participant"
#' )
#' head(very_long_data)
#'
#' data_to_long(
#' even_longer_data <- data_to_long(
#' tidyr::who,
#' select = new_sp_m014:newrel_f65,
#' names_to = c("diagnosis", "gender", "age"),
#' names_pattern = "new_?(.*)_(.)(.*)",
#' values_to = "count"
#' )
#'
#' head(even_longer_data)
#' @inherit data_rename
#' @export
data_to_long <- function(data,
Expand Down
Loading

0 comments on commit a7d3c80

Please sign in to comment.