Skip to content

Commit

Permalink
all use perl regex
Browse files Browse the repository at this point in the history
  • Loading branch information
Yunuuuu committed Sep 12, 2023
1 parent ebd63e1 commit 3d016d9
Show file tree
Hide file tree
Showing 7 changed files with 146 additions and 133 deletions.
2 changes: 1 addition & 1 deletion R/build_geo_ftp_urls.R
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@ build_geo_ftp_url <- function(ids, file_type = "soft", ftp_over_https = FALSE) {
tolower(file_type),
c("soft", "soft_full", "annot", "miniml", "suppl", "matrix")
)
super_ids <- sub("\\d{1,3}$", "nnn", ids, perl = TRUE)
super_ids <- str_replace(ids, "\\d{1,3}$", "nnn")
if (ftp_over_https) {
geo_ftp_site <- geo_ftp_over_https
} else {
Expand Down
7 changes: 2 additions & 5 deletions R/download_geo_files.R
Original file line number Diff line number Diff line change
Expand Up @@ -6,10 +6,7 @@ download_geo_suppl_or_gse_matrix_files <- function(ids, dest_dir, file_type, pat
ftp_over_https = ftp_over_https
)
if (!is.null(pattern)) {
url_list <- lapply(url_list, grep,
pattern = pattern,
perl = TRUE, value = TRUE
)
url_list <- lapply(url_list, str_subset, pattern = pattern)
}
file_path_list <- lapply(url_list, function(urls) {
# urls may be NULL or character(0L)
Expand Down Expand Up @@ -189,7 +186,7 @@ list_geo_file_url <- function(id, file_type, handle_opts = list(), ftp_over_http
} else {
file_names <- readLines(url_connection)
}
file_names <- grep("^G", file_names, perl = TRUE, value = TRUE)
file_names <- str_subset(file_names, "^G")

# build urls for all found files ------------------------
if (length(file_names)) {
Expand Down
61 changes: 19 additions & 42 deletions R/parsers.R
Original file line number Diff line number Diff line change
Expand Up @@ -49,9 +49,9 @@ parse_gse_matrix <- function(file_text, gse_sample_data = NULL) {
other = meta_data$Series
)
# fetch GPL accession
gpl_id <- meta_data$Sample[[grep(
"platform_id", colnames(meta_data$Sample),
ignore.case = TRUE, value = FALSE
gpl_id <- meta_data$Sample[[str_which(
colnames(meta_data$Sample), "platform_id",
ignore.case = TRUE
)]][[1L]]

list(
Expand All @@ -69,9 +69,7 @@ parse_gse_matrix <- function(file_text, gse_sample_data = NULL) {
#' @noRd
parse_gse_soft <- function(file_text, entity_type = "all", only_meta = FALSE) {
if (entity_type == "all") {
entity_indices <- grep("^\\^(SAMPLE|PLATFORM)", file_text,
perl = TRUE, value = FALSE
)
entity_indices <- str_which(file_text, "^\\^(SAMPLE|PLATFORM)")
if (length(entity_indices)) {
soft_meta <- parse_meta(
file_text[seq_len(entity_indices[[1L]] - 1L)]
Expand All @@ -90,16 +88,14 @@ parse_gse_soft <- function(file_text, entity_type = "all", only_meta = FALSE) {
platform = "PLATFORM"
)
)
entity_indices <- grep(entity_marker, file_text,
perl = TRUE, value = FALSE
)
entity_indices <- str_which(file_text, entity_marker)
}
soft_data_list <- vector(mode = "list", length = length(entity_indices))
# For every entity data, the data is seperated by "=" into name-value pairs
# Don't use `data.table::tstrsplit`, as it will split string into three or
# more element.
entity <- data.table::transpose(
str_split(file_text[entity_indices], "\\s*=\\s*")
str_split_fixed(file_text[entity_indices], "\\s*=\\s*")
)
names(soft_data_list) <- entity[[2L]]
seq_line_temp <- c(entity_indices, length(file_text))
Expand Down Expand Up @@ -175,10 +171,7 @@ parse_gpl_or_gsm_soft <- function(file_text, only_meta = FALSE) {

#' @importFrom data.table merge.data.table
parse_gds_soft <- function(file_text, only_meta = FALSE) {
subset_lines <- grep(
"^!subset", file_text,
perl = TRUE, value = FALSE
)
subset_lines <- str_which(file_text, "^!subset")
# parse meta data
meta_data <- parse_meta(file_text[-subset_lines])
if (only_meta) {
Expand Down Expand Up @@ -219,24 +212,17 @@ parse_gse_matrix_meta <- function(file_text) {
meta_groups <- c("Series", "Sample")
names(meta_groups) <- meta_groups
meta_data <- lapply(meta_groups, function(group) {
meta_text <- grep(
paste0("^!", group, "_"), file_text,
value = TRUE, fixed = FALSE, perl = TRUE
)
meta_text <- str_subset(file_text, paste0("^!", group, "_"))
meta_data <- parse_meta(meta_text)
rlang::set_names(
meta_data,
function(x) sub(paste0("^", group, "_"), "", x, perl = TRUE)
function(x) str_replace(x, paste0("^", group, "_"), "")
)
})
data.table::setDT(meta_data$Sample)
for (x in c("sample_id", "pubmed_id", "platform_id")) {
if (x %chin% names(meta_data$Series)) {
meta_data$Series[[x]] <- strsplit(
meta_data$Series[[x]],
split = ";?+ ", fixed = FALSE,
perl = TRUE
)[[1L]]
meta_data$Series[[x]] <- str_split(meta_data$Series[[x]], ";?+ ")[[1L]]
}
}
meta_data
Expand All @@ -252,7 +238,7 @@ parse_gds_subset <- function(subset_file_text) {
# group by `subset_sample_id`
subset_data[
, unlist(
strsplit(subset_sample_id, ",", perl = TRUE),
str_split(subset_sample_id, ","),
use.names = FALSE
),
by = c(
Expand Down Expand Up @@ -298,7 +284,7 @@ parse_columns <- function(file_text, target_rownames) {
# than 1L and the last value of which is a blank string ""; after above
# transformation, a tail "; " will be inserted in this element, So we just
# remove the tail "; " string.
labelDescription <- sub(";\\s*$", "", labelDescription, perl = TRUE)
labelDescription <- str_replace(labelDescription, ";\\s*$", "")
labelDescription <- data.table::fifelse(
labelDescription == "",
NA_character_, labelDescription,
Expand All @@ -317,10 +303,7 @@ parse_columns <- function(file_text, target_rownames) {
#' @return a list
#' @noRd
parse_meta <- function(file_text) {
line_with_equality <- grepl(
"^[^\\t]*=", file_text,
fixed = FALSE, perl = TRUE
)
line_with_equality <- str_detect(file_text, "^[^\\t]*=")
# For lines seperated by "="
meta_sep_by_equality <- read_meta(file_text[line_with_equality], "equality")
meta_sep_by_equality <- parse_line_sep_by_equality(meta_sep_by_equality)
Expand All @@ -346,11 +329,11 @@ parse_line_sep_by_equality <- function(dt) {
return(NULL)
}
name_value_pairs <- data.table::transpose(
str_split(dt[[1L]], "\\s*=\\s*")
str_split_fixed(dt[[1L]], "\\s*=\\s*")
)
split(
name_value_pairs[[2L]],
factor(sub("^[#!]\\s*+", "", name_value_pairs[[1L]], perl = TRUE))
factor(str_replace(name_value_pairs[[1L]], "^[#!]\\s*+", ""))
)
}

Expand All @@ -361,7 +344,7 @@ parse_line_sep_by_table <- function(dt) {
if (!nrow(dt) || ncol(dt) == 1L) {
return(NULL)
}
dt[, V1 := factor(sub("^!\\s*+", "", V1, perl = TRUE))]
dt[, V1 := factor(str_replace(V1, "^!\\s*+", ""))]
meta_list <- split(
dt[, lapply(.SD, paste0, collapse = ""), by = "V1"],
by = "V1", drop = TRUE,
Expand All @@ -374,18 +357,14 @@ parse_line_sep_by_table <- function(dt) {

read_data_table <- function(file_text) {
read_text(
text = grep("^[\\^!#]", file_text,
value = TRUE, fixed = FALSE, perl = TRUE, invert = TRUE
),
text = str_subset(file_text, "^[\\^!#]", invert = TRUE),
sep = "\t", header = TRUE, blank.lines.skip = TRUE,
check.names = FALSE
)
}
read_meta <- function(file_text, meta_type = "table") {
read_text(
text = grep("^!\\w*", file_text,
value = TRUE, fixed = FALSE, perl = TRUE
),
text = str_subset(file_text, "^!\\w*"),
sep = switch(meta_type,
table = "\t",
equality = ""
Expand All @@ -400,9 +379,7 @@ read_meta <- function(file_text, meta_type = "table") {
}
read_column <- function(file_text) {
read_text(
text = grep("^#\\w[^\\t]*=", file_text,
value = TRUE, fixed = FALSE, perl = TRUE
),
text = str_subset(file_text, "^#\\w[^\\t]*="),
sep = "", header = FALSE, blank.lines.skip = TRUE,
colClasses = "character",
check.names = FALSE
Expand Down
25 changes: 10 additions & 15 deletions R/phenodata.R
Original file line number Diff line number Diff line change
Expand Up @@ -76,27 +76,22 @@ parse_pdata <- function(data, columns = NULL, sep = ":", split = ";") {
# `sample_dt` should be a data.table
parse_gse_matrix_sample_characteristics <- function(sample_dt, characteristics_cols = NULL, sep = ":", split = ";") {
if (is.null(characteristics_cols)) {
characteristics_cols <- grep(
"^characteristics_ch",
colnames(sample_dt),
value = TRUE, perl = TRUE
characteristics_cols <- str_subset(
colnames(sample_dt), "^characteristics_ch"
)
} else {
characteristics_cols <- grep(
"ch\\d*(\\.\\d*)?$",
characteristics_cols,
value = TRUE, perl = TRUE
characteristics_cols <- str_subset(
characteristics_cols, "ch\\d*(\\.\\d*)?$"
)
}
if (length(characteristics_cols)) {
split <- paste0("(\\s*+)", split, "(\\s*+)")
for (.characteristic_col in characteristics_cols) {
characteristic_list <- strsplit(
sample_dt[[as.character(.characteristic_col)]],
split = split, perl = TRUE
characteristic_list <- str_split(
sample_dt[[as.character(.characteristic_col)]], split
)
characteristic_list <- lapply(characteristic_list, function(x) {
grep(sep, x, perl = TRUE, value = TRUE)
str_subset(x, sep)
})
have_more_than_one_sep <- vapply(
characteristic_list,
Expand Down Expand Up @@ -200,7 +195,7 @@ parse_gse_soft_sample_characteristics <- function(gsm_list) {
)
data.table::setnames(
sample_meta_dt,
function(x) sub("^Sample_", "", x, perl = TRUE)
function(x) str_replace(x, "^Sample_", "")
)
# We select columns with names starting with "characteristics_ch" and at
# least 50% of the elements in the column contains character ":",
Expand All @@ -210,7 +205,7 @@ parse_gse_soft_sample_characteristics <- function(gsm_list) {
)
column_have_sep <- sample_meta_dt[, vapply(.SD, function(list_col) {
have_sep <- vapply(list_col, function(x) {
all(grepl(":", x, fixed = TRUE), na.rm = TRUE)
all(str_detect(x, ":", fixed = TRUE), na.rm = TRUE)
}, logical(1L))
mean(have_sep, na.rm = TRUE) >= 0.5
}, logical(1L)), .SDcols = characteristics_cols]
Expand Down Expand Up @@ -301,7 +296,7 @@ parse_name_value_pairs <- function(pair_list, sep = ":") {
# Don't use `data.table::tstrsplit`, as it will split string into three
# or more elements.
name_value_pairs <- data.table::transpose(
str_split(x, paste0("(\\s*+)", sep, "(\\s*+)")),
str_split_fixed(x, paste0("(\\s*+)", sep, "(\\s*+)")),
fill = NA_character_
)
if (length(name_value_pairs) < 2L) {
Expand Down
40 changes: 20 additions & 20 deletions R/search_geo.R
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@
#' [entrez_db_searchable][rentrez::entrez_db_searchable].
#' @param step the number of records to fetch from the database each time. You
#' may choose a smaller value if failed.
#' @param interval The time interval (seconds) between each step.
#' @param interval The time interval (seconds) between each step.
#' @return a data.frame contains the search results
#' @examples
#' rgeo::search_geo("diabetes[ALL] AND Homo sapiens[ORGN] AND GSE[ETYP]")
Expand All @@ -44,8 +44,8 @@ search_geo <- function(query, step = 500L, interval = 1L) {
)
Sys.sleep(interval)
}
records <- strsplit(
gsub("^\\n|\\n$", "", paste0(records, collapse = "")),
records <- str_split(
str_replace_all(paste0(records, collapse = ""), "^\\n|\\n$", ""),
"\\n\\n"
)[[1L]]
name_value_pairs <- parse_name_value_pairs(preprocess_records(records))
Expand All @@ -54,12 +54,13 @@ search_geo <- function(query, step = 500L, interval = 1L) {
c("Contains", "Datasets", "Series", "Platforms"),
names(name_value_pairs)
),
grep("Accession$", names(name_value_pairs), perl = TRUE, value = TRUE)
str_subset(names(name_value_pairs), "Accession$")
)
data.table::setDT(name_value_pairs)
data.table::setcolorder(
name_value_pairs,
tail_col, after = ncol(name_value_pairs)
name_value_pairs,
tail_col,
after = ncol(name_value_pairs)
)
data.table::setDF(name_value_pairs)
name_value_pairs
Expand All @@ -68,23 +69,22 @@ search_geo <- function(query, step = 500L, interval = 1L) {
# this function just processed GEO searched results returned by `entrez_fetch`
# into key-values paris
preprocess_records <- function(x) {
x <- sub("^\\d+\\.", "Title:", x, perl = TRUE)
x <- sub(
"(Title:[^\\n]*\\n)(?:\\(Submitter supplied\\))?\\s*",
"\\1Summary: ", x,
perl = TRUE
x <- str_replace(x, "^\\d+\\.", "Title:")
x <- str_replace(
x, "(Title:[^\\n]*\\n)(?:\\(Submitter supplied\\))?\\s*",
"\\1Summary: "
)
x <- gsub(
x <- str_replace_all(
x,
"(Platform|Dataset|Serie)s?: *((?:GPL\\d+ *|GDS\\d+ *|GSE\\d+ *)+)",
"\\1s: \\2\n", x,
perl = TRUE
"\\1s: \\2\n"
)
x <- sub("\\tID:\\s*", "\nID: ", x, perl = TRUE)
x <- sub(
x <- str_replace(x, "\\tID:\\s*", "\nID: ")
x <- str_replace(
x,
"\\n((\\d+( Related| related)? (DataSet|Platform|Sample|Serie)s? *)+)\\n",
"\nContains: \\1\n", x,
perl = TRUE
"\nContains: \\1\n"
)
x <- gsub("\\t+", " ", x, perl = TRUE)
strsplit(x, "\\n\\n?", perl = TRUE)
x <- str_replace_all(x, "\\t+", " ")
str_split(x, "\\n\\n?")
}
Loading

0 comments on commit 3d016d9

Please sign in to comment.