diff --git a/R/GEOSoft-helper.R b/R/GEOSoft-helper.R index 8223de6..a665078 100644 --- a/R/GEOSoft-helper.R +++ b/R/GEOSoft-helper.R @@ -1,7 +1,7 @@ get_geo_soft <- function(ids, geo_type, dest_dir, ftp_over_https, handle_opts) { soft_data_list <- download_and_parse_soft( ids = ids, geo_type = geo_type, - dest_dir = dest_dir, + dest_dir = dest_dir, ftp_over_https = ftp_over_https, handle_opts = handle_opts, only_meta = FALSE @@ -22,8 +22,8 @@ new_geo_obj <- function(id, geo_type, soft_data) { GDS = methods::new( "GEOSoft", meta = soft_data$meta, - columns = soft_data$columns, - datatable = soft_data$data_table, + columns = column_to_rownames(soft_data$columns), + datatable = set_rownames(soft_data$data_table), accession = id ), GSE = methods::new( diff --git a/R/GSEMatrix-helper.R b/R/GSEMatrix-helper.R index cd9581b..43556d5 100644 --- a/R/GSEMatrix-helper.R +++ b/R/GSEMatrix-helper.R @@ -81,20 +81,24 @@ download_and_parse_annotation <- function(annotation, assay, dest_dir, ftp_over_ ) gpl_data <- parse_gpl_or_gsm_soft(read_lines(gpl_file_path)) if (nrow(gpl_data$data_table)) { + feature_data <- set_rownames(gpl_data$data_table) # NCBI GEO uses case-insensitive matching between platform # IDs and series ID Refs - feature_data <- gpl_data$data_table[ + feature_data <- feature_data[ data.table::chmatch( tolower(rownames(assay)), - tolower(rownames(gpl_data$data_table)) + tolower(rownames(feature_data)) ), , drop = FALSE ] rownames(feature_data) <- rownames(assay) Biobase::AnnotatedDataFrame(feature_data, - varMetadata = gpl_data$columns + varMetadata = column_to_rownames(gpl_data$columns) ) } else { - Biobase::AnnotatedDataFrame(data.frame(row.names = rownames(assay))) + Biobase::AnnotatedDataFrame( + data.frame(row.names = rownames(assay)), + varMetadata = column_to_rownames(gpl_data$columns) + ) } } diff --git a/R/get_geo_meta.R b/R/get_geo_meta.R index 96f7d3f..78e89c2 100644 --- a/R/get_geo_meta.R +++ b/R/get_geo_meta.R @@ -4,7 +4,7 @@ #' `search_geo()` cannot get all long metadata of GEO identities. #' #' @inheritParams get_geo -#' @return A data.frame contains metadata of all ids. +#' @return A [data.table][data.table] contains metadata of all ids. #' @export get_geo_meta <- function(ids, dest_dir = getwd(), ftp_over_https = TRUE, handle_opts = list(connecttimeout = 60L)) { ids <- toupper(ids) @@ -28,7 +28,5 @@ get_geo_meta <- function(ids, dest_dir = getwd(), ftp_over_https = TRUE, handle_ ) data.table::setDT(meta) }) - out <- data.table::rbindlist(meta_list, use.names = TRUE, fill = TRUE) - data.table::setDF(out) - out + data.table::rbindlist(meta_list, use.names = TRUE, fill = TRUE) } diff --git a/R/parsers.R b/R/parsers.R index 35663bc..069c59e 100644 --- a/R/parsers.R +++ b/R/parsers.R @@ -8,26 +8,18 @@ parse_gse_matrix <- function(file_text, gse_sample_data = NULL) { # fetch phenoData if (is.null(gse_sample_data)) { - parse_gse_matrix_sample_characteristics(meta_data$Sample) - data.table::setDF( - meta_data$Sample, - rownames = as.character(meta_data$Sample[["geo_accession"]]) - ) - pheno_data <- Biobase::AnnotatedDataFrame( - data = meta_data$Sample[colnames(matrix_data), , drop = FALSE] - ) + gse_sample_data <- meta_data$Sample + parse_gse_matrix_sample_characteristics(gse_sample_data) } else { gse_sample_data <- parse_gse_soft_sample_characteristics( gse_sample_data[colnames(matrix_data)] ) - data.table::setDF( - gse_sample_data, - rownames = gse_sample_data[["geo_accession"]] - ) - pheno_data <- Biobase::AnnotatedDataFrame( - data = gse_sample_data[colnames(matrix_data), , drop = FALSE] - ) } + gse_sample_data <- gse_sample_data[colnames(matrix_data), + on = "geo_accession" + ] + set_rownames(gse_sample_data, "geo_accession") + pheno_data <- Biobase::AnnotatedDataFrame(data = gse_sample_data) # fetch experiment data experiment_data <- Biobase::MIAME( @@ -39,8 +31,8 @@ parse_gse_matrix <- function(file_text, gse_sample_data = NULL) { url = if (!is.null(meta_data$Series$web_link)) { meta_data$Series$web_link } else if (!is.null(meta_data$Series$geo_accession)) { - paste0( - "https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=", + sprintf( + "https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=%s", meta_data$Series$geo_accession ) } else { @@ -113,8 +105,8 @@ parse_gse_soft <- function(file_text, entity_type = "all", only_meta = FALSE) { soft_data_list[[i]] <- methods::new( "GEOSoft", meta = entity_data$meta, - columns = entity_data$columns, - datatable = entity_data$data_table, + columns = column_to_rownames(entity_data$columns), + datatable = set_rownames(entity_data$data_table), accession = accession ) cli::cli_progress_update() @@ -155,9 +147,6 @@ parse_gpl_or_gsm_soft <- function(file_text, only_meta = FALSE) { by = names(data_table)[[1L]] ] } - data.table::setDF(data_table, rownames = as.character(data_table[[1L]])) - } else { - data.table::setDF(data_table) } # parse column data @@ -180,26 +169,18 @@ parse_gds_soft <- function(file_text, only_meta = FALSE) { # parse data_table data data_table <- read_data_table(file_text[-subset_lines]) - if (nrow(data_table)) { - data.table::setDF(data_table, rownames = as.character(data_table[[1L]])) - } else { - data.table::setDF(data_table) - } # parse column data column_data <- parse_columns(file_text[-subset_lines], colnames(data_table)) - data.table::setDT(column_data, keep.rownames = "V1") subset_data <- parse_gds_subset(file_text[subset_lines]) # Merge subset data into column data - column_data <- merge( - column_data, - subset_data, + column_data <- merge(column_data, subset_data, by = "V1", all.x = TRUE, sort = FALSE ) column_data <- column_data[colnames(data_table), on = "V1"] - column_data <- as.data.frame(column_data[, !1L], column_data$V1) + # column_data <- as.data.frame(column_data[, !1L], column_data$V1) list( data_table = data_table, meta = meta_data, @@ -246,7 +227,7 @@ parse_gds_subset <- function(subset_file_text) { "subset_description", "subset_type" ) - ][, lapply(.SD, paste0, collapse = "; "), by = V1] + ][, lapply(.SD, paste0, collapse = "; "), by = "V1"] } #' There are four different types of line that are recognized in SOFT. The @@ -266,7 +247,7 @@ parse_gds_subset <- function(subset_file_text) { # names and values; For line seperated by "=", every row represents a item. But # every item in `columns` data should only own a value of length one, so we # collapse it. -#' @return a data.frame +#' @return a data.table #' @noRd parse_columns <- function(file_text, target_rownames) { column_data <- read_column(file_text) @@ -290,9 +271,9 @@ parse_columns <- function(file_text, target_rownames) { NA_character_, labelDescription, na = NA_character_ ) - data.frame( - labelDescription = labelDescription, - row.names = target_rownames + data.table::data.table( + V1 = target_rownames, + labelDescription = labelDescription ) } diff --git a/R/phenodata.R b/R/phenodata.R index 15c9d24..52a16df 100644 --- a/R/phenodata.R +++ b/R/phenodata.R @@ -139,6 +139,7 @@ parse_gse_matrix_sample_characteristics <- function(sample_dt, characteristics_c } } } + sample_dt } #' Parse key-value pairs in GEO series soft file @@ -174,8 +175,7 @@ parse_gsm_list <- function(gsm_list) { ) } res <- parse_gse_soft_sample_characteristics(gsm_list) - data.table::setDF(res, rownames = res[["geo_accession"]]) - res + set_rownames(res, "geo_accession") } parse_gse_soft_sample_characteristics <- function(gsm_list) { @@ -187,7 +187,6 @@ parse_gse_soft_sample_characteristics <- function(gsm_list) { ) } data.table::setDT(sample_meta_data) - sample_meta_data }) sample_meta_dt <- data.table::rbindlist( sample_meta_list, diff --git a/R/search_geo.R b/R/search_geo.R index 165c066..08f5f85 100644 --- a/R/search_geo.R +++ b/R/search_geo.R @@ -21,7 +21,7 @@ #' @param step the number of records to fetch from the database each time. You #' may choose a smaller value if failed. #' @param interval The time interval (seconds) between each step. -#' @return a data.frame contains the search results +#' @return A [data.table][data.table] contains the search results #' @examples #' rgeo::search_geo("diabetes[ALL] AND Homo sapiens[ORGN] AND GSE[ETYP]") #' @export @@ -62,8 +62,6 @@ search_geo <- function(query, step = 500L, interval = 1L) { tail_col, after = ncol(name_value_pairs) ) - data.table::setDF(name_value_pairs) - name_value_pairs } # this function just processed GEO searched results returned by `entrez_fetch` diff --git a/R/utils.R b/R/utils.R index 59ae9a2..d3c4be5 100644 --- a/R/utils.R +++ b/R/utils.R @@ -7,6 +7,20 @@ return_object_or_list <- function(x, names = NULL) { } } +set_rownames <- function(x, var = 1L) { + if (nrow(x)) { + data.table::setDF(x, rownames = as.character(x[[var]])) + } else { + data.table::setDF(x) + } +} + +column_to_rownames <- function(x, var = 1L) { + data.table::setDF(x[, .SD, .SDcols = !var], + rownames = as.character(x[[var]]) + ) +} + read_lines <- function(file) { data.table::fread( file = file, sep = "", header = FALSE, diff --git a/man/get_geo_meta.Rd b/man/get_geo_meta.Rd index 7f19e7b..392235b 100644 --- a/man/get_geo_meta.Rd +++ b/man/get_geo_meta.Rd @@ -28,7 +28,7 @@ will use \url{ftp://ftp.ncbi.nlm.nih.gov/geo} directly.} \link[curl:handle]{handle}.} } \value{ -A data.frame contains metadata of all ids. +A \link{data.table} contains metadata of all ids. } \description{ This is useful to combine with \code{\link[=search_geo]{search_geo()}} and filter results since diff --git a/man/search_geo.Rd b/man/search_geo.Rd index 59318b7..ee80c5a 100644 --- a/man/search_geo.Rd +++ b/man/search_geo.Rd @@ -21,7 +21,7 @@ may choose a smaller value if failed.} \item{interval}{The time interval (seconds) between each step.} } \value{ -a data.frame contains the search results +A \link{data.table} contains the search results } \description{ This function searchs \href{https://www.ncbi.nlm.nih.gov/gds}{GDS} database,