diff --git a/R/build_geo_ftp_urls.R b/R/build_geo_ftp_urls.R
index f8f42d9..84199fd 100644
--- a/R/build_geo_ftp_urls.R
+++ b/R/build_geo_ftp_urls.R
@@ -9,7 +9,7 @@ build_geo_ftp_url <- function(ids, file_type = "soft", ftp_over_https = FALSE) {
         tolower(file_type),
         c("soft", "soft_full", "annot", "miniml", "suppl", "matrix")
     )
-    super_ids <- sub("\\d{1,3}$", "nnn", ids, perl = TRUE)
+    super_ids <- str_replace(ids, "\\d{1,3}$", "nnn")
     if (ftp_over_https) {
         geo_ftp_site <- geo_ftp_over_https
     } else {
diff --git a/R/download_geo_files.R b/R/download_geo_files.R
index 34bc099..2e78d93 100644
--- a/R/download_geo_files.R
+++ b/R/download_geo_files.R
@@ -6,10 +6,7 @@ download_geo_suppl_or_gse_matrix_files <- function(ids, dest_dir, file_type, pat
         ftp_over_https = ftp_over_https
     )
     if (!is.null(pattern)) {
-        url_list <- lapply(url_list, grep,
-            pattern = pattern,
-            perl = TRUE, value = TRUE
-        )
+        url_list <- lapply(url_list, str_subset, pattern = pattern)
     }
     file_path_list <- lapply(url_list, function(urls) {
         # urls may be NULL or character(0L)
@@ -189,7 +186,7 @@ list_geo_file_url <- function(id, file_type, handle_opts = list(), ftp_over_http
     } else {
         file_names <- readLines(url_connection)
     }
-    file_names <- grep("^G", file_names, perl = TRUE, value = TRUE)
+    file_names <- str_subset(file_names, "^G")
 
     # build urls for all found files ------------------------
     if (length(file_names)) {
diff --git a/R/parsers.R b/R/parsers.R
index e0511a2..fff1a31 100644
--- a/R/parsers.R
+++ b/R/parsers.R
@@ -49,9 +49,9 @@ parse_gse_matrix <- function(file_text, gse_sample_data = NULL) {
         other = meta_data$Series
     )
     # fetch GPL accession
-    gpl_id <- meta_data$Sample[[grep(
-        "platform_id", colnames(meta_data$Sample),
-        ignore.case = TRUE, value = FALSE
+    gpl_id <- meta_data$Sample[[str_which(
+        colnames(meta_data$Sample), "platform_id",
+        ignore.case = TRUE
     )]][[1L]]
 
     list(
@@ -69,9 +69,7 @@ parse_gse_matrix <- function(file_text, gse_sample_data = NULL) {
 #' @noRd
 parse_gse_soft <- function(file_text, entity_type = "all", only_meta = FALSE) {
     if (entity_type == "all") {
-        entity_indices <- grep("^\\^(SAMPLE|PLATFORM)", file_text,
-            perl = TRUE, value = FALSE
-        )
+        entity_indices <- str_which(file_text, "^\\^(SAMPLE|PLATFORM)")
         if (length(entity_indices)) {
             soft_meta <- parse_meta(
                 file_text[seq_len(entity_indices[[1L]] - 1L)]
@@ -90,16 +88,14 @@ parse_gse_soft <- function(file_text, entity_type = "all", only_meta = FALSE) {
                 platform = "PLATFORM"
             )
         )
-        entity_indices <- grep(entity_marker, file_text,
-            perl = TRUE, value = FALSE
-        )
+        entity_indices <- str_which(file_text, entity_marker)
     }
     soft_data_list <- vector(mode = "list", length = length(entity_indices))
     # For every entity data, the data is seperated by "=" into name-value pairs
     # Don't use `data.table::tstrsplit`, as it will split string into three or
     # more element.
     entity <- data.table::transpose(
-        str_split(file_text[entity_indices], "\\s*=\\s*")
+        str_split_fixed(file_text[entity_indices], "\\s*=\\s*")
     )
     names(soft_data_list) <- entity[[2L]]
     seq_line_temp <- c(entity_indices, length(file_text))
@@ -175,10 +171,7 @@ parse_gpl_or_gsm_soft <- function(file_text, only_meta = FALSE) {
 
 #' @importFrom data.table merge.data.table
 parse_gds_soft <- function(file_text, only_meta = FALSE) {
-    subset_lines <- grep(
-        "^!subset", file_text,
-        perl = TRUE, value = FALSE
-    )
+    subset_lines <- str_which(file_text, "^!subset")
     # parse meta data
     meta_data <- parse_meta(file_text[-subset_lines])
     if (only_meta) {
@@ -219,24 +212,17 @@ parse_gse_matrix_meta <- function(file_text) {
     meta_groups <- c("Series", "Sample")
     names(meta_groups) <- meta_groups
     meta_data <- lapply(meta_groups, function(group) {
-        meta_text <- grep(
-            paste0("^!", group, "_"), file_text,
-            value = TRUE, fixed = FALSE, perl = TRUE
-        )
+        meta_text <- str_subset(file_text, paste0("^!", group, "_"))
         meta_data <- parse_meta(meta_text)
         rlang::set_names(
             meta_data,
-            function(x) sub(paste0("^", group, "_"), "", x, perl = TRUE)
+            function(x) str_replace(x, paste0("^", group, "_"), "")
         )
     })
     data.table::setDT(meta_data$Sample)
     for (x in c("sample_id", "pubmed_id", "platform_id")) {
         if (x %chin% names(meta_data$Series)) {
-            meta_data$Series[[x]] <- strsplit(
-                meta_data$Series[[x]],
-                split = ";?+ ", fixed = FALSE,
-                perl = TRUE
-            )[[1L]]
+            meta_data$Series[[x]] <- str_split(meta_data$Series[[x]], ";?+ ")[[1L]]
         }
     }
     meta_data
@@ -252,7 +238,7 @@ parse_gds_subset <- function(subset_file_text) {
     # group by `subset_sample_id`
     subset_data[
         , unlist(
-            strsplit(subset_sample_id, ",", perl = TRUE),
+            str_split(subset_sample_id, ","),
             use.names = FALSE
         ),
         by = c(
@@ -298,7 +284,7 @@ parse_columns <- function(file_text, target_rownames) {
     # than 1L and the last value of which is a blank string ""; after above
     # transformation, a tail "; " will be inserted in this element, So we just
     # remove the tail "; " string.
-    labelDescription <- sub(";\\s*$", "", labelDescription, perl = TRUE)
+    labelDescription <- str_replace(labelDescription, ";\\s*$", "")
     labelDescription <- data.table::fifelse(
         labelDescription == "", NA_character_,
         labelDescription,
@@ -317,10 +303,7 @@ parse_columns <- function(file_text, target_rownames) {
 #' @return a list
 #' @noRd
 parse_meta <- function(file_text) {
-    line_with_equality <- grepl(
-        "^[^\\t]*=", file_text,
-        fixed = FALSE, perl = TRUE
-    )
+    line_with_equality <- str_detect(file_text, "^[^\\t]*=")
     # For lines seperated by "="
     meta_sep_by_equality <- read_meta(file_text[line_with_equality], "equality")
     meta_sep_by_equality <- parse_line_sep_by_equality(meta_sep_by_equality)
@@ -346,11 +329,11 @@ parse_line_sep_by_equality <- function(dt) {
         return(NULL)
     }
     name_value_pairs <- data.table::transpose(
-        str_split(dt[[1L]], "\\s*=\\s*")
+        str_split_fixed(dt[[1L]], "\\s*=\\s*")
     )
     split(
         name_value_pairs[[2L]],
-        factor(sub("^[#!]\\s*+", "", name_value_pairs[[1L]], perl = TRUE))
+        factor(str_replace(name_value_pairs[[1L]], "^[#!]\\s*+", ""))
     )
 }
 
@@ -361,7 +344,7 @@ parse_line_sep_by_table <- function(dt) {
     if (!nrow(dt) || ncol(dt) == 1L) {
         return(NULL)
     }
-    dt[, V1 := factor(sub("^!\\s*+", "", V1, perl = TRUE))]
+    dt[, V1 := factor(str_replace(V1, "^!\\s*+", ""))]
     meta_list <- split(
         dt[, lapply(.SD, paste0, collapse = ""), by = "V1"],
         by = "V1", drop = TRUE,
@@ -374,18 +357,14 @@
 
 read_data_table <- function(file_text) {
     read_text(
-        text = grep("^[\\^!#]", file_text,
-            value = TRUE, fixed = FALSE, perl = TRUE, invert = TRUE
-        ),
+        text = str_subset(file_text, "^[\\^!#]", invert = TRUE),
         sep = "\t", header = TRUE,
         blank.lines.skip = TRUE, check.names = FALSE
     )
 }
 read_meta <- function(file_text, meta_type = "table") {
     read_text(
-        text = grep("^!\\w*", file_text,
-            value = TRUE, fixed = FALSE, perl = TRUE
-        ),
+        text = str_subset(file_text, "^!\\w*"),
         sep = switch(meta_type,
             table = "\t",
             equality = ""
@@ -400,9 +379,7 @@ read_meta <- function(file_text, meta_type = "table") {
 }
 read_column <- function(file_text) {
     read_text(
-        text = grep("^#\\w[^\\t]*=", file_text,
-            value = TRUE, fixed = FALSE, perl = TRUE
-        ),
+        text = str_subset(file_text, "^#\\w[^\\t]*="),
         sep = "", header = FALSE,
         blank.lines.skip = TRUE, colClasses = "character",
         check.names = FALSE
diff --git a/R/phenodata.R b/R/phenodata.R
index ce0ee97..15c9d24 100644
--- a/R/phenodata.R
+++ b/R/phenodata.R
@@ -76,27 +76,22 @@ parse_pdata <- function(data, columns = NULL, sep = ":", split = ";") {
 # `sample_dt` should be a data.table
 parse_gse_matrix_sample_characteristics <- function(sample_dt, characteristics_cols = NULL, sep = ":", split = ";") {
     if (is.null(characteristics_cols)) {
-        characteristics_cols <- grep(
-            "^characteristics_ch",
-            colnames(sample_dt),
-            value = TRUE, perl = TRUE
+        characteristics_cols <- str_subset(
+            colnames(sample_dt), "^characteristics_ch"
         )
     } else {
-        characteristics_cols <- grep(
-            "ch\\d*(\\.\\d*)?$",
-            characteristics_cols,
-            value = TRUE, perl = TRUE
+        characteristics_cols <- str_subset(
+            characteristics_cols, "ch\\d*(\\.\\d*)?$"
         )
     }
     if (length(characteristics_cols)) {
         split <- paste0("(\\s*+)", split, "(\\s*+)")
         for (.characteristic_col in characteristics_cols) {
-            characteristic_list <- strsplit(
-                sample_dt[[as.character(.characteristic_col)]],
-                split = split, perl = TRUE
+            characteristic_list <- str_split(
+                sample_dt[[as.character(.characteristic_col)]], split
             )
             characteristic_list <- lapply(characteristic_list, function(x) {
-                grep(sep, x, perl = TRUE, value = TRUE)
+                str_subset(x, sep)
             })
             have_more_than_one_sep <- vapply(
                 characteristic_list,
@@ -200,7 +195,7 @@ parse_gse_soft_sample_characteristics <- function(gsm_list) {
     )
     data.table::setnames(
         sample_meta_dt,
-        function(x) sub("^Sample_", "", x, perl = TRUE)
+        function(x) str_replace(x, "^Sample_", "")
     )
     # We select columns with names starting with "characteristics_ch" and at
     # least 50% of the elements in the column contains character ":",
@@ -210,7 +205,7 @@ parse_gse_soft_sample_characteristics <- function(gsm_list) {
     )
     column_have_sep <- sample_meta_dt[, vapply(.SD, function(list_col) {
         have_sep <- vapply(list_col, function(x) {
-            all(grepl(":", x, fixed = TRUE), na.rm = TRUE)
+            all(str_detect(x, ":", fixed = TRUE), na.rm = TRUE)
         }, logical(1L))
         mean(have_sep, na.rm = TRUE) >= 0.5
     }, logical(1L)), .SDcols = characteristics_cols]
@@ -301,7 +296,7 @@ parse_name_value_pairs <- function(pair_list, sep = ":") {
         # Don't use `data.table::tstrsplit`, as it will split string into three
         # or more elements.
         name_value_pairs <- data.table::transpose(
-            str_split(x, paste0("(\\s*+)", sep, "(\\s*+)")),
+            str_split_fixed(x, paste0("(\\s*+)", sep, "(\\s*+)")),
             fill = NA_character_
         )
         if (length(name_value_pairs) < 2L) {
diff --git a/R/search_geo.R b/R/search_geo.R
index 2ce2757..165c066 100644
--- a/R/search_geo.R
+++ b/R/search_geo.R
@@ -20,7 +20,7 @@
 #' [entrez_db_searchable][rentrez::entrez_db_searchable].
 #' @param step the number of records to fetch from the database each time. You
 #' may choose a smaller value if failed.
-#' @param interval The time interval (seconds) between each step.
+#' @param interval The time interval (seconds) between each step.
 #' @return a data.frame contains the search results
 #' @examples
 #' rgeo::search_geo("diabetes[ALL] AND Homo sapiens[ORGN] AND GSE[ETYP]")
@@ -44,8 +44,8 @@ search_geo <- function(query, step = 500L, interval = 1L) {
         )
         Sys.sleep(interval)
     }
-    records <- strsplit(
-        gsub("^\\n|\\n$", "", paste0(records, collapse = "")),
+    records <- str_split(
+        str_replace_all(paste0(records, collapse = ""), "^\\n|\\n$", ""),
         "\\n\\n"
     )[[1L]]
     name_value_pairs <- parse_name_value_pairs(preprocess_records(records))
@@ -54,12 +54,13 @@ search_geo <- function(query, step = 500L, interval = 1L) {
             c("Contains", "Datasets", "Series", "Platforms"),
             names(name_value_pairs)
         ),
-        grep("Accession$", names(name_value_pairs), perl = TRUE, value = TRUE)
+        str_subset(names(name_value_pairs), "Accession$")
     )
     data.table::setDT(name_value_pairs)
     data.table::setcolorder(
-        name_value_pairs,
-        tail_col, after = ncol(name_value_pairs)
+        name_value_pairs,
+        tail_col,
+        after = ncol(name_value_pairs)
     )
     data.table::setDF(name_value_pairs)
     name_value_pairs
@@ -68,23 +69,22 @@ search_geo <- function(query, step = 500L, interval = 1L) {
 # this function just processed GEO searched results returned by `entrez_fetch`
 # into key-values paris
 preprocess_records <- function(x) {
-    x <- sub("^\\d+\\.", "Title:", x, perl = TRUE)
-    x <- sub(
-        "(Title:[^\\n]*\\n)(?:\\(Submitter supplied\\))?\\s*",
-        "\\1Summary: ", x,
-        perl = TRUE
+    x <- str_replace(x, "^\\d+\\.", "Title:")
+    x <- str_replace(
+        x, "(Title:[^\\n]*\\n)(?:\\(Submitter supplied\\))?\\s*",
+        "\\1Summary: "
     )
-    x <- gsub(
+    x <- str_replace_all(
+        x,
         "(Platform|Dataset|Serie)s?: *((?:GPL\\d+ *|GDS\\d+ *|GSE\\d+ *)+)",
-        "\\1s: \\2\n", x,
-        perl = TRUE
+        "\\1s: \\2\n"
     )
-    x <- sub("\\tID:\\s*", "\nID: ", x, perl = TRUE)
-    x <- sub(
+    x <- str_replace(x, "\\tID:\\s*", "\nID: ")
+    x <- str_replace(
+        x,
         "\\n((\\d+( Related| related)? (DataSet|Platform|Sample|Serie)s? *)+)\\n",
-        "\nContains: \\1\n", x,
-        perl = TRUE
+        "\nContains: \\1\n"
     )
-    x <- gsub("\\t+", " ", x, perl = TRUE)
-    strsplit(x, "\\n\\n?", perl = TRUE)
+    x <- str_replace_all(x, "\\t+", " ")
+    str_split(x, "\\n\\n?")
 }
diff --git a/R/utils-str.R b/R/utils-str.R
new file mode 100644
index 0000000..6d641ae
--- /dev/null
+++ b/R/utils-str.R
@@ -0,0 +1,94 @@
+# stringr from base R ---------------------------------
+str_which <- function(string, pattern, ..., fixed = FALSE) {
+    grep(
+        pattern = pattern, x = string, ...,
+        perl = !fixed, value = FALSE,
+        fixed = fixed
+    )
+}
+
+str_detect <- function(string, pattern, ..., fixed = FALSE) {
+    grepl(pattern = pattern, x = string, ..., perl = !fixed, fixed = fixed)
+}
+
+str_subset <- function(string, pattern, ..., fixed = FALSE) {
+    grep(
+        pattern = pattern, x = string, ...,
+        perl = !fixed, value = TRUE,
+        fixed = fixed
+    )
+}
+
+str_replace <- function(string, pattern, replacement, ..., fixed = FALSE) {
+    sub(pattern = pattern, replacement = replacement, x = string,
+        perl = !fixed, fixed = fixed, ...
+    )
+}
+
+str_replace_all <- function(string, pattern, replacement, ..., fixed = FALSE) {
+    gsub(pattern = pattern, replacement = replacement, x = string,
+        perl = !fixed, fixed = fixed, ...
+    )
+}
+
+str_extract <- function(string, pattern, ..., fixed = FALSE) {
+    matches <- regexpr(pattern, string, perl = !fixed, ..., fixed = fixed)
+    start <- as.vector(matches)
+    end <- start + attr(matches, "match.length") - 1L
+    start[start == -1L] <- NA_integer_
+    substring(string, start, end)
+}
+str_extract_all <- function(string, pattern, ..., fixed = FALSE) {
+    regmatches(
+        string,
+        m = gregexpr(
+            pattern = pattern, text = string,
+            perl = !fixed, ..., fixed = fixed
+        )
+    )
+}
+
+# Split the string at the first match of the pattern only; returns a list of
+# character vectors, each of length two
+str_split_fixed <- function(string, pattern, ..., fixed = FALSE) {
+    regmatches(
+        string,
+        regexpr(
+            pattern = pattern, text = string,
+            perl = !fixed, ..., fixed = fixed
+        ),
+        invert = TRUE
+    )
+}
+
+str_split <- function(string, pattern, fixed = FALSE) {
+    strsplit(x = string, split = pattern, fixed = fixed, perl = !fixed)
+}
+
+str_match <- function(string, pattern, ..., fixed = FALSE) {
+    out <- regmatches(
+        string,
+        regexec(
+            pattern = pattern, text = string,
+            perl = !fixed, ..., fixed = fixed
+        ),
+        invert = FALSE
+    )
+    out <- lapply(out, function(x) {
+        if (!length(x)) "" else x
+    })
+    out <- do.call("rbind", out)
+    out[out == ""] <- NA_character_
+    out
+}
+
+str_match_all <- function(string, pattern, ..., fixed = FALSE) {
+    regmatches(
+        string,
+        gregexec(
+            pattern = pattern, text = string,
+            perl = !fixed, ..., fixed = fixed
+        ),
+        invert = FALSE
+    )
+}
diff --git a/R/utils.R b/R/utils.R
index 8b752c6..1b0bb28 100644
--- a/R/utils.R
+++ b/R/utils.R
@@ -1,53 +1,3 @@
-str_extract <- function(string, pattern, ignore.case = FALSE) {
-    matches <- regexpr(pattern, string,
-        perl = TRUE, fixed = FALSE,
-        ignore.case = ignore.case
-    )
-    start <- as.vector(matches)
-    end <- start + attr(matches, "match.length") - 1L
-    start[start == -1L] <- NA_integer_
-    substring(string, start, end)
-}
-str_extract_all <- function(string, pattern, ignore.case = FALSE) {
-    regmatches(
-        string,
-        gregexpr(pattern, string,
-            perl = TRUE, fixed = FALSE,
-            ignore.case = ignore.case
-        ),
-        invert = FALSE
-    )
-}
-# split string based on pattern, Only split once, Return a list of character,
-# the length of every element is two
-str_split <- function(string, pattern, ignore.case = FALSE) {
-    regmatches(
-        string,
-        regexpr(pattern, string,
-            perl = TRUE, fixed = FALSE,
-            ignore.case = ignore.case
-        ),
-        invert = TRUE
-    )
-}
-
-str_match <- function(string, pattern, ignore.case = FALSE) {
-    out <- regmatches(
-        string,
-        regexec(pattern, string,
-            perl = TRUE, fixed = FALSE,
-            ignore.case = ignore.case
-        ),
-        invert = FALSE
-    )
-    out <- lapply(out, function(x) {
-        if (!length(x)) "" else x
-    })
-    out <- do.call("rbind", out)
-    out[out == ""] <- NA_character_
-    out
-}
-
 return_object_or_list <- function(x, names = NULL) {
     if (length(x) == 1L) {
         x[[1L]]
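
A short illustration of the split-once behaviour that several hunks above rely on (the "Don't use `data.table::tstrsplit`" comments in R/parsers.R and R/phenodata.R): strsplit() and tstrsplit() split at every match, while the parsers need exactly one name/value pair per line. This sketch is illustrative only and is not part of the patch; str_split_fixed() is copied from R/utils-str.R above, and the input lines are made-up SOFT-style records.

# Copied from R/utils-str.R: split at the first match only, via
# regmatches(..., invert = TRUE) on a single regexpr() match.
str_split_fixed <- function(string, pattern, ..., fixed = FALSE) {
    regmatches(
        string,
        regexpr(
            pattern = pattern, text = string,
            perl = !fixed, ..., fixed = fixed
        ),
        invert = TRUE
    )
}

# Made-up input: the second value itself contains an "=".
lines <- c("^SAMPLE = GSM123", "!Sample_title = GSE: a = b")

# strsplit() breaks the second value apart at every "=":
strsplit(lines, "\\s*=\\s*", perl = TRUE)
#> [[1]] "^SAMPLE"       "GSM123"
#> [[2]] "!Sample_title" "GSE: a" "b"

# str_split_fixed() keeps exactly one name and one value per line:
str_split_fixed(lines, "\\s*=\\s*")
#> [[1]] "^SAMPLE"       "GSM123"
#> [[2]] "!Sample_title" "GSE: a = b"

# which is what data.table::transpose() then turns into parallel name/value
# vectors in parse_gse_soft() and parse_name_value_pairs():
data.table::transpose(str_split_fixed(lines, "\\s*=\\s*"))
#> [[1]] "^SAMPLE" "!Sample_title"
#> [[2]] "GSM123"  "GSE: a = b"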
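As a further sanity check (again illustrative, not part of the patch), the wrappers are intended as drop-in replacements for the base R calls they substitute: PCRE (perl = TRUE) by default, literal matching when fixed = TRUE. The accession strings below are made up; the patterns are the ones removed from R/build_geo_ftp_urls.R and R/download_geo_files.R, and the definitions are copied from R/utils-str.R.

# Wrapper definitions as added in R/utils-str.R.
str_subset <- function(string, pattern, ..., fixed = FALSE) {
    grep(
        pattern = pattern, x = string, ...,
        perl = !fixed, value = TRUE,
        fixed = fixed
    )
}
str_replace <- function(string, pattern, replacement, ..., fixed = FALSE) {
    sub(pattern = pattern, replacement = replacement, x = string,
        perl = !fixed, fixed = fixed, ...
    )
}

ids <- c("GSM1234567", "GPL570", "not-an-accession")

# str_subset() reproduces grep(..., perl = TRUE, value = TRUE), as used for
# file_names in list_geo_file_url():
identical(str_subset(ids, "^G"), grep("^G", ids, perl = TRUE, value = TRUE))
#> [1] TRUE

# str_replace() reproduces sub(..., perl = TRUE), as in the "nnn" rewrite of
# build_geo_ftp_url():
identical(
    str_replace(ids, "\\d{1,3}$", "nnn"),
    sub("\\d{1,3}$", "nnn", ids, perl = TRUE)
)
#> [1] TRUE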