2 new datasets, a new bind_clinspacy_embeddings() function, still nee…

…d to finish up documentation.
kdpsingh · Aug 19, 2020 · a873190 · a873190
1 parent 626949d
commit a873190
Show file tree

Hide file tree

Showing 3 changed files with 78 additions and 0 deletions.
diff --git a/R/clinspacy.R b/R/clinspacy.R
@@ -74,6 +74,8 @@ clinspacy <- function(text) {
                          entity = character(0),
                          lemma = character(0),
                          negated = logical(0),
+                         semantic_type = character(0),
+                         definition = character(0),
                          stringsAsFactors = FALSE)
 
   for (entity_num in seq_len(entity_nums)) {
@@ -85,6 +87,7 @@ clinspacy <- function(text) {
 
     temp_df$entity = parsed_text$ents[[entity_num]]$text
     temp_df$lemma = parsed_text$ents[[entity_num]]$lemma_
+    temp_df = merge(temp_df, cui2vec_definitions, all.x = TRUE) # adds semantic_type and definition
     temp_df$negated = parsed_text$ents[[entity_num]]$`_`$negex
 
     return_df = rbind(return_df, temp_df)
@@ -124,3 +127,37 @@ bind_clinspacy <- function(df, text) {
   cbind(df, as.data.frame(dt))
 }
 
+#' Bind averaged cui2vec embeddings to a data frame
+#'
+#' This function runs clinspacy() on a text column, joins the concept unique identifiers
+#' it returns to the cui2vec_embeddings dataset, and binds one column per embedding
+#' dimension containing a weighted average of the matched embeddings for each row.
+#' Negated concepts, as identified by negspacy's NegEx implementation, are ignored and do
+#' not count towards the average. Concepts without an entry in cui2vec_embeddings are
+#' dropped by the join and do not count towards the average either. Within a row, every
+#' entity occurrence is weighted by the number of times its concept occurs in that row.
+#'
+#' @param df A data frame.
+#' @param text A character string containing the name of the column to process.
+#' @param num_embeddings The number of embedding dimensions to bind, taken in order
+#' starting from emb_001. Must be a positive whole number for which the corresponding
+#' columns exist in cui2vec_embeddings. Defaults to 500.
+#' @return A data frame containing the original data frame as well as one additional
+#' column (emb_001, emb_002, ...) for each requested embedding dimension. Rows in which
+#' no non-negated concept with an embedding was found contain NA in these columns.
+#'
+#' @examples
+#' data(mtsamples)
+#' mtsamples_with_embeddings = bind_clinspacy_embeddings(mtsamples[1:5,], text = 'description')
+#' str(mtsamples_with_embeddings)
+bind_clinspacy_embeddings <- function(df, text, num_embeddings = 500) {
+  clinspacy_text = text
+  assertthat::assert_that(assertthat::has_name(df, text))
+  assertthat::assert_that(nrow(df) > 0)
+  assertthat::assert_that(assertthat::is.count(num_embeddings))
+  df_nrow = nrow(df)
+
+  # Honor num_embeddings instead of always selecting all 500 dimensions.
+  emb_cols = paste0('emb_', sprintf('%03d', seq_len(num_embeddings)))
+  assertthat::assert_that(all(emb_cols %in% names(cui2vec_embeddings)))
+
+  # One row per input row, then one row per entity found in that row's text.
+  dt = data.table(df)[, .(clinspacy_id = seq_len(.N), text = get(clinspacy_text))]
+  dt = dt[, clinspacy(.SD[,text]), clinspacy_id]
+  dt = dt[negated == FALSE]
+
+  # n = number of occurrences of this concept within this input row.
+  dt[, n := .N, by = .(clinspacy_id, cui)]
+  dt = merge(dt, cui2vec_embeddings) # inner join on cui
+
+  # Scale each occurrence's embedding by its weight n, in place.
+  dt[, (emb_cols) := lapply(.SD, function(x) x * n), .SDcols = emb_cols]
+
+  # Replace n with the total weight of the row so the weighted sums can be normalized.
+  dt[, n := sum(n), by = clinspacy_id]
+
+  # n is constant within a clinspacy_id, so dividing by its first value yields exactly
+  # one averaged row per input row.
+  dt = dt[, lapply(.SD, function(x) sum(x) / n[1L]), by = clinspacy_id, .SDcols = emb_cols]
+
+  # Re-introduce input rows that had no usable concept (filled with NA) and restore the
+  # original row order before binding.
+  dt2 = data.table(clinspacy_id = seq_len(df_nrow))
+  dt = merge(dt, dt2, by = 'clinspacy_id', all.y = TRUE)
+  dt[, clinspacy_id := NULL]
+  cbind(df, as.data.frame(dt))
+}
diff --git a/R/cui2vec_data.R b/R/cui2vec_data.R
@@ -0,0 +1,41 @@
+#' Cui2vec concept embeddings
+#'
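+#' This dataset contains cui2vec concept embeddings keyed by UMLS concept unique
+#' identifier (the cui column), with the embedding dimensions stored in columns
+#' emb_001 through emb_500. It is joined to clinspacy() output by
+#' bind_clinspacy_embeddings().
+#'
+#' NOTE: The acknowledgements, format, and source sections below were copied from the
+#' mtsamples dataset documentation and have not yet been updated for this dataset.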
+#'
+#' Acknowledgements
+#'
+#' This data was scraped from mtsamples.com by Tara Boyle and is made available
+#' under a CC0: Public Domain license.
+#'
+#' @format A data frame with 4999 rows and 6 variables:
+#' \describe{
+#'   \item{note_id}{A unique identifier for each note}
+#'   \item{description}{A description or chief concern}
+#'   \item{medical_specialty}{Medical specialty of the note}
+#'   \item{sample_name}{mtsamples.com note name}
+#'   \item{transcription}{Transcription of note text}
+#'   \item{keywords}{Keywords}
+#' }
+#' @source \url{https://www.kaggle.com/tboyle10/medicaltranscriptions/data}
+'cui2vec_embeddings'
+
+#' Cui2vec concept definitions
+#'
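+#' This dataset contains the semantic type and definition of concepts. clinspacy()
+#' left-joins it to each recognized entity to add the semantic_type and definition
+#' columns to its output.
+#'
+#' NOTE: The acknowledgements, format, and source sections below were copied from the
+#' mtsamples dataset documentation and have not yet been updated for this dataset.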
+#'
+#' Acknowledgements
+#'
+#' This data was scraped from mtsamples.com by Tara Boyle and is made available
+#' under a CC0: Public Domain license.
+#'
+#' @format A data frame with 4999 rows and 6 variables:
+#' \describe{
+#'   \item{note_id}{A unique identifier for each note}
+#'   \item{description}{A description or chief concern}
+#'   \item{medical_specialty}{Medical specialty of the note}
+#'   \item{sample_name}{mtsamples.com note name}
+#'   \item{transcription}{Transcription of note text}
+#'   \item{keywords}{Keywords}
+#' }
+#' @source \url{https://www.kaggle.com/tboyle10/medicaltranscriptions/data}
+'cui2vec_definitions'
diff --git a/data/cui2vec_definitions.rda b/data/cui2vec_definitions.rda