all use perl regex

Yunuuuu · Sep 12, 2023 · 3d016d9 · 3d016d9
1 parent ebd63e1
commit 3d016d9
Show file tree

Hide file tree

Showing 7 changed files with 146 additions and 133 deletions.
diff --git a/R/build_geo_ftp_urls.R b/R/build_geo_ftp_urls.R
@@ -9,7 +9,7 @@ build_geo_ftp_url <- function(ids, file_type = "soft", ftp_over_https = FALSE) {
         tolower(file_type),
         c("soft", "soft_full", "annot", "miniml", "suppl", "matrix")
     )
-    super_ids <- sub("\\d{1,3}$", "nnn", ids, perl = TRUE)
+    super_ids <- str_replace(ids, "\\d{1,3}$", "nnn")
     if (ftp_over_https) {
         geo_ftp_site <- geo_ftp_over_https
     } else {

diff --git a/R/download_geo_files.R b/R/download_geo_files.R
@@ -6,10 +6,7 @@ download_geo_suppl_or_gse_matrix_files <- function(ids, dest_dir, file_type, pat
         ftp_over_https = ftp_over_https
     )
     if (!is.null(pattern)) {
-        url_list <- lapply(url_list, grep,
-            pattern = pattern,
-            perl = TRUE, value = TRUE
-        )
+        url_list <- lapply(url_list, str_subset, pattern = pattern)
     }
     file_path_list <- lapply(url_list, function(urls) {
         # urls may be NULL or character(0L)
@@ -189,7 +186,7 @@ list_geo_file_url <- function(id, file_type, handle_opts = list(), ftp_over_http
     } else {
         file_names <- readLines(url_connection)
     }
-    file_names <- grep("^G", file_names, perl = TRUE, value = TRUE)
+    file_names <- str_subset(file_names, "^G")
 
     # build urls for all found files ------------------------
     if (length(file_names)) {

diff --git a/R/parsers.R b/R/parsers.R
@@ -49,9 +49,9 @@ parse_gse_matrix <- function(file_text, gse_sample_data = NULL) {
         other = meta_data$Series
     )
     # fetch GPL accession
-    gpl_id <- meta_data$Sample[[grep(
-        "platform_id", colnames(meta_data$Sample),
-        ignore.case = TRUE, value = FALSE
+    gpl_id <- meta_data$Sample[[str_which(
+        colnames(meta_data$Sample), "platform_id",
+        ignore.case = TRUE
     )]][[1L]]
 
     list(
@@ -69,9 +69,7 @@ parse_gse_matrix <- function(file_text, gse_sample_data = NULL) {
 #' @noRd
 parse_gse_soft <- function(file_text, entity_type = "all", only_meta = FALSE) {
     if (entity_type == "all") {
-        entity_indices <- grep("^\\^(SAMPLE|PLATFORM)", file_text,
-            perl = TRUE, value = FALSE
-        )
+        entity_indices <- str_which(file_text, "^\\^(SAMPLE|PLATFORM)")
         if (length(entity_indices)) {
             soft_meta <- parse_meta(
                 file_text[seq_len(entity_indices[[1L]] - 1L)]
@@ -90,16 +88,14 @@ parse_gse_soft <- function(file_text, entity_type = "all", only_meta = FALSE) {
                 platform = "PLATFORM"
             )
         )
-        entity_indices <- grep(entity_marker, file_text,
-            perl = TRUE, value = FALSE
-        )
+        entity_indices <- str_which(file_text, entity_marker)
     }
     soft_data_list <- vector(mode = "list", length = length(entity_indices))
     # For every entity data, the data is seperated by "=" into name-value pairs
     # Don't use `data.table::tstrsplit`, as it will split string into three or
     # more element.
     entity <- data.table::transpose(
-        str_split(file_text[entity_indices], "\\s*=\\s*")
+        str_split_fixed(file_text[entity_indices], "\\s*=\\s*")
     )
     names(soft_data_list) <- entity[[2L]]
     seq_line_temp <- c(entity_indices, length(file_text))
@@ -175,10 +171,7 @@ parse_gpl_or_gsm_soft <- function(file_text, only_meta = FALSE) {
 
 #' @importFrom data.table merge.data.table
 parse_gds_soft <- function(file_text, only_meta = FALSE) {
-    subset_lines <- grep(
-        "^!subset", file_text,
-        perl = TRUE, value = FALSE
-    )
+    subset_lines <- str_which(file_text, "^!subset")
     # parse meta data
     meta_data <- parse_meta(file_text[-subset_lines])
     if (only_meta) {
@@ -219,24 +212,17 @@ parse_gse_matrix_meta <- function(file_text) {
     meta_groups <- c("Series", "Sample")
     names(meta_groups) <- meta_groups
     meta_data <- lapply(meta_groups, function(group) {
-        meta_text <- grep(
-            paste0("^!", group, "_"), file_text,
-            value = TRUE, fixed = FALSE, perl = TRUE
-        )
+        meta_text <- str_subset(file_text, paste0("^!", group, "_"))
         meta_data <- parse_meta(meta_text)
         rlang::set_names(
             meta_data,
-            function(x) sub(paste0("^", group, "_"), "", x, perl = TRUE)
+            function(x) str_replace(x, paste0("^", group, "_"), "")
         )
     })
     data.table::setDT(meta_data$Sample)
     for (x in c("sample_id", "pubmed_id", "platform_id")) {
         if (x %chin% names(meta_data$Series)) {
-            meta_data$Series[[x]] <- strsplit(
-                meta_data$Series[[x]],
-                split = ";?+ ", fixed = FALSE,
-                perl = TRUE
-            )[[1L]]
+            meta_data$Series[[x]] <- str_split(meta_data$Series[[x]], ";?+ ")[[1L]]
         }
     }
     meta_data
@@ -252,7 +238,7 @@ parse_gds_subset <- function(subset_file_text) {
     # group by `subset_sample_id`
     subset_data[
         , unlist(
-            strsplit(subset_sample_id, ",", perl = TRUE),
+            str_split(subset_sample_id, ","),
             use.names = FALSE
         ),
         by = c(
@@ -298,7 +284,7 @@ parse_columns <- function(file_text, target_rownames) {
     # than 1L and the last value of which is a blank string ""; after above
     # transformation, a tail "; " will be inserted in this element, So we just
     # remove the tail "; " string.
-    labelDescription <- sub(";\\s*$", "", labelDescription, perl = TRUE)
+    labelDescription <- str_replace(labelDescription, ";\\s*$", "")
     labelDescription <- data.table::fifelse(
         labelDescription == "",
         NA_character_, labelDescription,
@@ -317,10 +303,7 @@ parse_columns <- function(file_text, target_rownames) {
 #' @return a list
 #' @noRd
 parse_meta <- function(file_text) {
-    line_with_equality <- grepl(
-        "^[^\\t]*=", file_text,
-        fixed = FALSE, perl = TRUE
-    )
+    line_with_equality <- str_detect(file_text, "^[^\\t]*=")
     # For lines seperated by "="
     meta_sep_by_equality <- read_meta(file_text[line_with_equality], "equality")
     meta_sep_by_equality <- parse_line_sep_by_equality(meta_sep_by_equality)
@@ -346,11 +329,11 @@ parse_line_sep_by_equality <- function(dt) {
         return(NULL)
     }
     name_value_pairs <- data.table::transpose(
-        str_split(dt[[1L]], "\\s*=\\s*")
+        str_split_fixed(dt[[1L]], "\\s*=\\s*")
     )
     split(
         name_value_pairs[[2L]],
-        factor(sub("^[#!]\\s*+", "", name_value_pairs[[1L]], perl = TRUE))
+        factor(str_replace(name_value_pairs[[1L]], "^[#!]\\s*+", ""))
     )
 }
 
@@ -361,7 +344,7 @@ parse_line_sep_by_table <- function(dt) {
     if (!nrow(dt) || ncol(dt) == 1L) {
         return(NULL)
     }
-    dt[, V1 := factor(sub("^!\\s*+", "", V1, perl = TRUE))]
+    dt[, V1 := factor(str_replace(V1, "^!\\s*+", ""))]
     meta_list <- split(
         dt[, lapply(.SD, paste0, collapse = ""), by = "V1"],
         by = "V1", drop = TRUE,
@@ -374,18 +357,14 @@ parse_line_sep_by_table <- function(dt) {
 
 read_data_table <- function(file_text) {
     read_text(
-        text = grep("^[\\^!#]", file_text,
-            value = TRUE, fixed = FALSE, perl = TRUE, invert = TRUE
-        ),
+        text = str_subset(file_text, "^[\\^!#]", invert = TRUE),
         sep = "\t", header = TRUE, blank.lines.skip = TRUE,
         check.names = FALSE
     )
 }
 read_meta <- function(file_text, meta_type = "table") {
     read_text(
-        text = grep("^!\\w*", file_text,
-            value = TRUE, fixed = FALSE, perl = TRUE
-        ),
+        text = str_subset(file_text, "^!\\w*"),
         sep = switch(meta_type,
             table = "\t",
             equality = ""
@@ -400,9 +379,7 @@ read_meta <- function(file_text, meta_type = "table") {
 }
 read_column <- function(file_text) {
     read_text(
-        text = grep("^#\\w[^\\t]*=", file_text,
-            value = TRUE, fixed = FALSE, perl = TRUE
-        ),
+        text = str_subset(file_text, "^#\\w[^\\t]*="),
         sep = "", header = FALSE, blank.lines.skip = TRUE,
         colClasses = "character",
         check.names = FALSE

diff --git a/R/phenodata.R b/R/phenodata.R
@@ -76,27 +76,22 @@ parse_pdata <- function(data, columns = NULL, sep = ":", split = ";") {
 # `sample_dt` should be a data.table
 parse_gse_matrix_sample_characteristics <- function(sample_dt, characteristics_cols = NULL, sep = ":", split = ";") {
     if (is.null(characteristics_cols)) {
-        characteristics_cols <- grep(
-            "^characteristics_ch",
-            colnames(sample_dt),
-            value = TRUE, perl = TRUE
+        characteristics_cols <- str_subset(
+            colnames(sample_dt), "^characteristics_ch"
         )
     } else {
-        characteristics_cols <- grep(
-            "ch\\d*(\\.\\d*)?$",
-            characteristics_cols,
-            value = TRUE, perl = TRUE
+        characteristics_cols <- str_subset(
+            characteristics_cols, "ch\\d*(\\.\\d*)?$"
         )
     }
     if (length(characteristics_cols)) {
         split <- paste0("(\\s*+)", split, "(\\s*+)")
         for (.characteristic_col in characteristics_cols) {
-            characteristic_list <- strsplit(
-                sample_dt[[as.character(.characteristic_col)]],
-                split = split, perl = TRUE
+            characteristic_list <- str_split(
+                sample_dt[[as.character(.characteristic_col)]], split
             )
             characteristic_list <- lapply(characteristic_list, function(x) {
-                grep(sep, x, perl = TRUE, value = TRUE)
+                str_subset(x, sep)
             })
             have_more_than_one_sep <- vapply(
                 characteristic_list,
@@ -200,7 +195,7 @@ parse_gse_soft_sample_characteristics <- function(gsm_list) {
     )
     data.table::setnames(
         sample_meta_dt,
-        function(x) sub("^Sample_", "", x, perl = TRUE)
+        function(x) str_replace(x, "^Sample_", "")
     )
     # We select columns with names starting with "characteristics_ch" and at
     # least 50% of the elements in the column contains character ":",
@@ -210,7 +205,7 @@ parse_gse_soft_sample_characteristics <- function(gsm_list) {
     )
     column_have_sep <- sample_meta_dt[, vapply(.SD, function(list_col) {
         have_sep <- vapply(list_col, function(x) {
-            all(grepl(":", x, fixed = TRUE), na.rm = TRUE)
+            all(str_detect(x, ":", fixed = TRUE), na.rm = TRUE)
         }, logical(1L))
         mean(have_sep, na.rm = TRUE) >= 0.5
     }, logical(1L)), .SDcols = characteristics_cols]
@@ -301,7 +296,7 @@ parse_name_value_pairs <- function(pair_list, sep = ":") {
         # Don't use `data.table::tstrsplit`, as it will split string into three
         # or more elements.
         name_value_pairs <- data.table::transpose(
-            str_split(x, paste0("(\\s*+)", sep, "(\\s*+)")),
+            str_split_fixed(x, paste0("(\\s*+)", sep, "(\\s*+)")),
             fill = NA_character_
         )
         if (length(name_value_pairs) < 2L) {

diff --git a/R/search_geo.R b/R/search_geo.R
@@ -20,7 +20,7 @@
 #'   [entrez_db_searchable][rentrez::entrez_db_searchable].
 #' @param step the number of records to fetch from the database each time. You
 #' may choose a smaller value if failed.
-#' @param interval The time interval (seconds) between each step. 
+#' @param interval The time interval (seconds) between each step.
 #' @return a data.frame contains the search results
 #' @examples
 #'   rgeo::search_geo("diabetes[ALL] AND Homo sapiens[ORGN] AND GSE[ETYP]")
@@ -44,8 +44,8 @@ search_geo <- function(query, step = 500L, interval = 1L) {
         )
         Sys.sleep(interval)
     }
-    records <- strsplit(
-        gsub("^\\n|\\n$", "", paste0(records, collapse = "")),
+    records <- str_split(
+        str_replace_all(paste0(records, collapse = ""), "^\\n|\\n$", ""),
         "\\n\\n"
     )[[1L]]
     name_value_pairs <- parse_name_value_pairs(preprocess_records(records))
@@ -54,12 +54,13 @@ search_geo <- function(query, step = 500L, interval = 1L) {
             c("Contains", "Datasets", "Series", "Platforms"),
             names(name_value_pairs)
         ),
-        grep("Accession$", names(name_value_pairs), perl = TRUE, value = TRUE)
+        str_subset(names(name_value_pairs), "Accession$")
     )
     data.table::setDT(name_value_pairs)
     data.table::setcolorder(
-        name_value_pairs, 
-        tail_col, after = ncol(name_value_pairs)
+        name_value_pairs,
+        tail_col,
+        after = ncol(name_value_pairs)
     )
     data.table::setDF(name_value_pairs)
     name_value_pairs
@@ -68,23 +69,22 @@ search_geo <- function(query, step = 500L, interval = 1L) {
 # this function just processed GEO searched results returned by `entrez_fetch`
 # into key-values paris
 preprocess_records <- function(x) {
-    x <- sub("^\\d+\\.", "Title:", x, perl = TRUE)
-    x <- sub(
-        "(Title:[^\\n]*\\n)(?:\\(Submitter supplied\\))?\\s*",
-        "\\1Summary: ", x,
-        perl = TRUE
+    x <- str_replace(x, "^\\d+\\.", "Title:")
+    x <- str_replace(
+        x, "(Title:[^\\n]*\\n)(?:\\(Submitter supplied\\))?\\s*",
+        "\\1Summary: "
     )
-    x <- gsub(
+    x <- str_replace_all(
+        x,
         "(Platform|Dataset|Serie)s?: *((?:GPL\\d+ *|GDS\\d+ *|GSE\\d+ *)+)",
-        "\\1s: \\2\n", x,
-        perl = TRUE
+        "\\1s: \\2\n"
     )
-    x <- sub("\\tID:\\s*", "\nID: ", x, perl = TRUE)
-    x <- sub(
+    x <- str_replace(x, "\\tID:\\s*", "\nID: ")
+    x <- str_replace(
+        x,
         "\\n((\\d+( Related| related)? (DataSet|Platform|Sample|Serie)s? *)+)\\n",
-        "\nContains: \\1\n", x,
-        perl = TRUE
+        "\nContains: \\1\n"
     )
-    x <- gsub("\\t+", " ", x, perl = TRUE)
-    strsplit(x, "\\n\\n?", perl = TRUE)
+    x <- str_replace_all(x, "\\t+", " ")
+    str_split(x, "\\n\\n?")
 }