Skip to content

Commit

Permalink
Use data.table as the main object
Browse files Browse the repository at this point in the history
  • Loading branch information
Yunuuuu committed Sep 22, 2023
1 parent 08031da commit e04ac8f
Show file tree
Hide file tree
Showing 9 changed files with 50 additions and 56 deletions.
6 changes: 3 additions & 3 deletions R/GEOSoft-helper.R
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
get_geo_soft <- function(ids, geo_type, dest_dir, ftp_over_https, handle_opts) {
soft_data_list <- download_and_parse_soft(
ids = ids, geo_type = geo_type,
dest_dir = dest_dir,
dest_dir = dest_dir,
ftp_over_https = ftp_over_https,
handle_opts = handle_opts,
only_meta = FALSE
Expand All @@ -22,8 +22,8 @@ new_geo_obj <- function(id, geo_type, soft_data) {
GDS = methods::new(
"GEOSoft",
meta = soft_data$meta,
columns = soft_data$columns,
datatable = soft_data$data_table,
columns = column_to_rownames(soft_data$columns),
datatable = set_rownames(soft_data$data_table),
accession = id
),
GSE = methods::new(
Expand Down
12 changes: 8 additions & 4 deletions R/GSEMatrix-helper.R
Original file line number Diff line number Diff line change
Expand Up @@ -81,20 +81,24 @@ download_and_parse_annotation <- function(annotation, assay, dest_dir, ftp_over_
)
gpl_data <- parse_gpl_or_gsm_soft(read_lines(gpl_file_path))
if (nrow(gpl_data$data_table)) {
feature_data <- set_rownames(gpl_data$data_table)
# NCBI GEO uses case-insensitive matching between platform
# IDs and series ID Refs
feature_data <- gpl_data$data_table[
feature_data <- feature_data[
data.table::chmatch(
tolower(rownames(assay)),
tolower(rownames(gpl_data$data_table))
tolower(rownames(feature_data))
), ,
drop = FALSE
]
rownames(feature_data) <- rownames(assay)
Biobase::AnnotatedDataFrame(feature_data,
varMetadata = gpl_data$columns
varMetadata = column_to_rownames(gpl_data$columns)
)
} else {
Biobase::AnnotatedDataFrame(data.frame(row.names = rownames(assay)))
Biobase::AnnotatedDataFrame(
data.frame(row.names = rownames(assay)),
varMetadata = column_to_rownames(gpl_data$columns)
)
}
}
6 changes: 2 additions & 4 deletions R/get_geo_meta.R
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
#' `search_geo()` cannot get all long metadata of GEO identities.
#'
#' @inheritParams get_geo
#' @return A data.frame contains metadata of all ids.
#' @return A [data.table][data.table] contains metadata of all ids.
#' @export
get_geo_meta <- function(ids, dest_dir = getwd(), ftp_over_https = TRUE, handle_opts = list(connecttimeout = 60L)) {
ids <- toupper(ids)
Expand All @@ -28,7 +28,5 @@ get_geo_meta <- function(ids, dest_dir = getwd(), ftp_over_https = TRUE, handle_
)
data.table::setDT(meta)
})
out <- data.table::rbindlist(meta_list, use.names = TRUE, fill = TRUE)
data.table::setDF(out)
out
data.table::rbindlist(meta_list, use.names = TRUE, fill = TRUE)
}
55 changes: 18 additions & 37 deletions R/parsers.R
Original file line number Diff line number Diff line change
Expand Up @@ -8,26 +8,18 @@ parse_gse_matrix <- function(file_text, gse_sample_data = NULL) {

# fetch phenoData
if (is.null(gse_sample_data)) {
parse_gse_matrix_sample_characteristics(meta_data$Sample)
data.table::setDF(
meta_data$Sample,
rownames = as.character(meta_data$Sample[["geo_accession"]])
)
pheno_data <- Biobase::AnnotatedDataFrame(
data = meta_data$Sample[colnames(matrix_data), , drop = FALSE]
)
gse_sample_data <- meta_data$Sample
parse_gse_matrix_sample_characteristics(gse_sample_data)
} else {
gse_sample_data <- parse_gse_soft_sample_characteristics(
gse_sample_data[colnames(matrix_data)]
)
data.table::setDF(
gse_sample_data,
rownames = gse_sample_data[["geo_accession"]]
)
pheno_data <- Biobase::AnnotatedDataFrame(
data = gse_sample_data[colnames(matrix_data), , drop = FALSE]
)
}
gse_sample_data <- gse_sample_data[colnames(matrix_data),
on = "geo_accession"
]
set_rownames(gse_sample_data, "geo_accession")
pheno_data <- Biobase::AnnotatedDataFrame(data = gse_sample_data)

# fetch experiment data
experiment_data <- Biobase::MIAME(
Expand All @@ -39,8 +31,8 @@ parse_gse_matrix <- function(file_text, gse_sample_data = NULL) {
url = if (!is.null(meta_data$Series$web_link)) {
meta_data$Series$web_link
} else if (!is.null(meta_data$Series$geo_accession)) {
paste0(
"https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=",
sprintf(
"https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=%s",
meta_data$Series$geo_accession
)
} else {
Expand Down Expand Up @@ -113,8 +105,8 @@ parse_gse_soft <- function(file_text, entity_type = "all", only_meta = FALSE) {
soft_data_list[[i]] <- methods::new(
"GEOSoft",
meta = entity_data$meta,
columns = entity_data$columns,
datatable = entity_data$data_table,
columns = column_to_rownames(entity_data$columns),
datatable = set_rownames(entity_data$data_table),
accession = accession
)
cli::cli_progress_update()
Expand Down Expand Up @@ -155,9 +147,6 @@ parse_gpl_or_gsm_soft <- function(file_text, only_meta = FALSE) {
by = names(data_table)[[1L]]
]
}
data.table::setDF(data_table, rownames = as.character(data_table[[1L]]))
} else {
data.table::setDF(data_table)
}

# parse column data
Expand All @@ -180,26 +169,18 @@ parse_gds_soft <- function(file_text, only_meta = FALSE) {

# parse data_table data
data_table <- read_data_table(file_text[-subset_lines])
if (nrow(data_table)) {
data.table::setDF(data_table, rownames = as.character(data_table[[1L]]))
} else {
data.table::setDF(data_table)
}

# parse column data
column_data <- parse_columns(file_text[-subset_lines], colnames(data_table))
data.table::setDT(column_data, keep.rownames = "V1")

subset_data <- parse_gds_subset(file_text[subset_lines])
# Merge subset data into column data
column_data <- merge(
column_data,
subset_data,
column_data <- merge(column_data, subset_data,
by = "V1",
all.x = TRUE, sort = FALSE
)
column_data <- column_data[colnames(data_table), on = "V1"]
column_data <- as.data.frame(column_data[, !1L], column_data$V1)
# column_data <- as.data.frame(column_data[, !1L], column_data$V1)
list(
data_table = data_table,
meta = meta_data,
Expand Down Expand Up @@ -246,7 +227,7 @@ parse_gds_subset <- function(subset_file_text) {
"subset_description",
"subset_type"
)
][, lapply(.SD, paste0, collapse = "; "), by = V1]
][, lapply(.SD, paste0, collapse = "; "), by = "V1"]
}

#' There are four different types of line that are recognized in SOFT. The
Expand All @@ -266,7 +247,7 @@ parse_gds_subset <- function(subset_file_text) {
# names and values; For line seperated by "=", every row represents a item. But
# every item in `columns` data should only own a value of length one, so we
# collapse it.
#' @return a data.frame
#' @return a data.table
#' @noRd
parse_columns <- function(file_text, target_rownames) {
column_data <- read_column(file_text)
Expand All @@ -290,9 +271,9 @@ parse_columns <- function(file_text, target_rownames) {
NA_character_, labelDescription,
na = NA_character_
)
data.frame(
labelDescription = labelDescription,
row.names = target_rownames
data.table::data.table(
V1 = target_rownames,
labelDescription = labelDescription
)
}

Expand Down
5 changes: 2 additions & 3 deletions R/phenodata.R
Original file line number Diff line number Diff line change
Expand Up @@ -139,6 +139,7 @@ parse_gse_matrix_sample_characteristics <- function(sample_dt, characteristics_c
}
}
}
sample_dt
}

#' Parse key-value pairs in GEO series soft file
Expand Down Expand Up @@ -174,8 +175,7 @@ parse_gsm_list <- function(gsm_list) {
)
}
res <- parse_gse_soft_sample_characteristics(gsm_list)
data.table::setDF(res, rownames = res[["geo_accession"]])
res
set_rownames(res, "geo_accession")
}

parse_gse_soft_sample_characteristics <- function(gsm_list) {
Expand All @@ -187,7 +187,6 @@ parse_gse_soft_sample_characteristics <- function(gsm_list) {
)
}
data.table::setDT(sample_meta_data)
sample_meta_data
})
sample_meta_dt <- data.table::rbindlist(
sample_meta_list,
Expand Down
4 changes: 1 addition & 3 deletions R/search_geo.R
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@
#' @param step the number of records to fetch from the database each time. You
#' may choose a smaller value if failed.
#' @param interval The time interval (seconds) between each step.
#' @return a data.frame contains the search results
#' @return A [data.table][data.table] contains the search results
#' @examples
#' rgeo::search_geo("diabetes[ALL] AND Homo sapiens[ORGN] AND GSE[ETYP]")
#' @export
Expand Down Expand Up @@ -62,8 +62,6 @@ search_geo <- function(query, step = 500L, interval = 1L) {
tail_col,
after = ncol(name_value_pairs)
)
data.table::setDF(name_value_pairs)
name_value_pairs
}

# this function just processed GEO searched results returned by `entrez_fetch`
Expand Down
14 changes: 14 additions & 0 deletions R/utils.R
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,20 @@ return_object_or_list <- function(x, names = NULL) {
}
}

set_rownames <- function(x, var = 1L) {
if (nrow(x)) {
data.table::setDF(x, rownames = as.character(x[[var]]))
} else {
data.table::setDF(x)
}
}

column_to_rownames <- function(x, var = 1L) {
data.table::setDF(x[, .SD, .SDcols = !var],
rownames = as.character(x[[var]])
)
}

read_lines <- function(file) {
data.table::fread(
file = file, sep = "", header = FALSE,
Expand Down
2 changes: 1 addition & 1 deletion man/get_geo_meta.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 1 addition & 1 deletion man/search_geo.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

0 comments on commit e04ac8f

Please sign in to comment.