diff --git a/R/clinspacy.R b/R/clinspacy.R index 2c2d561..f95b856 100644 --- a/R/clinspacy.R +++ b/R/clinspacy.R @@ -74,6 +74,8 @@ clinspacy <- function(text) { entity = character(0), lemma = character(0), negated = logical(0), + semantic_type = character(0), + definition = character(0), stringsAsFactors = FALSE) for (entity_num in seq_len(entity_nums)) { @@ -85,6 +87,7 @@ clinspacy <- function(text) { temp_df$entity = parsed_text$ents[[entity_num]]$text temp_df$lemma = parsed_text$ents[[entity_num]]$lemma_ + temp_df = merge(temp_df, cui2vec_definitions, all.x = TRUE) # adds semantic_type and definition temp_df$negated = parsed_text$ents[[entity_num]]$`_`$negex return_df = rbind(return_df, temp_df) @@ -124,3 +127,37 @@ bind_clinspacy <- function(df, text) { cbind(df, as.data.frame(dt)) } +#' This function binds columns containing concept unique identifiers with which scispacy has +#' 99 percent confidence of being present with values containing frequencies. Negated concepts, +#' as identified by negspacy's NegEx implementation, are ignored and do not count towards +#' the frequencies. +#' +#' @param df A data frame. +#' @param text A character string containing the name of the column to process. +#' @return A data frame containing the original data frame as well as additional column names +#' for each UMLS concept unique identifer found with values containing frequencies. +#' +#' @examples +#' data(mtsamples) +#' mtsamples_with_cuis = bind_clinspacy(mtsamples[1:5,], text = 'description') +#' str(mtsamples_with_cuis) +bind_clinspacy_embeddings <- function(df, text, num_embeddings = 500) { + clinspacy_text = text + assertthat::assert_that(assertthat::has_name(df, text)) + assertthat::assert_that(nrow(df) > 0) + df_nrow = nrow(df) + + dt = data.table(df)[, .(clinspacy_id = 1:.N, text = get(clinspacy_text))] + dt = dt[, clinspacy(.SD[,text]), clinspacy_id] + dt = dt[negated == FALSE] + dt[, n := .N, by = .(clinspacy_id, cui)] + dt = merge(dt, cui2vec_embeddings) # inner join on cui + dt = dt[, .(clinspacy_id, cui, n, n*.SD),.SDcols = paste0('emb_',sprintf('%03d', 1:500))] + dt[, n := sum(n), by = clinspacy_id] + dt = dt[, lapply(.SD, function (x) sum(x)/n), by = clinspacy_id, .SDcols = paste0('emb_',sprintf('%03d', 1:500))] + dt = unique(dt) + dt2 = data.table(clinspacy_id = 1:df_nrow) + dt = merge(dt, dt2, all.y=TRUE) + dt[, clinspacy_id := NULL] + cbind(df, as.data.frame(dt)) +} diff --git a/R/cui2vec_data.R b/R/cui2vec_data.R new file mode 100644 index 0000000..a833e3f --- /dev/null +++ b/R/cui2vec_data.R @@ -0,0 +1,41 @@ +#' Cui2vec concept embeddings +#' +#' This dataset contains sample medical transcriptions for various medical specialties. +#' +#' Acknowledgements +#' +#' This data was scraped from mtsamples.com by Tara Boyle and is made available +#' under a CC0: Public Domain license. +#' +#' @format A data frame with 4999 rows and 6 variables: +#' \describe{ +#' \item{note_id}{A unique identifier for each note} +#' \item{description}{A description or chief concern} +#' \item{medical_specialty}{Medical specialty of the note} +#' \item{sample_name}{mtsamples.com note name} +#' \item{transcription}{Transcription of note text} +#' \item{keywords}{Keywords} +#' } +#' @source \url{https://www.kaggle.com/tboyle10/medicaltranscriptions/data} +'cui2vec_embeddings' + +#' Cui2vec concept definitions +#' +#' This dataset contains sample medical transcriptions for various medical specialties. +#' +#' Acknowledgements +#' +#' This data was scraped from mtsamples.com by Tara Boyle and is made available +#' under a CC0: Public Domain license. +#' +#' @format A data frame with 4999 rows and 6 variables: +#' \describe{ +#' \item{note_id}{A unique identifier for each note} +#' \item{description}{A description or chief concern} +#' \item{medical_specialty}{Medical specialty of the note} +#' \item{sample_name}{mtsamples.com note name} +#' \item{transcription}{Transcription of note text} +#' \item{keywords}{Keywords} +#' } +#' @source \url{https://www.kaggle.com/tboyle10/medicaltranscriptions/data} +'cui2vec_definitions' diff --git a/data/cui2vec_definitions.rda b/data/cui2vec_definitions.rda new file mode 100644 index 0000000..1f1e3ca Binary files /dev/null and b/data/cui2vec_definitions.rda differ