Skip to content

Commit

Permalink
2 new datasets, a new bind_clinspacy_embeddings() function, still nee…
Browse files Browse the repository at this point in the history
…d to finish up documentation.
  • Loading branch information
Singh authored and Singh committed Aug 19, 2020
1 parent 626949d commit a873190
Show file tree
Hide file tree
Showing 3 changed files with 78 additions and 0 deletions.
37 changes: 37 additions & 0 deletions R/clinspacy.R
Original file line number Diff line number Diff line change
Expand Up @@ -74,6 +74,8 @@ clinspacy <- function(text) {
entity = character(0),
lemma = character(0),
negated = logical(0),
semantic_type = character(0),
definition = character(0),
stringsAsFactors = FALSE)

for (entity_num in seq_len(entity_nums)) {
Expand All @@ -85,6 +87,7 @@ clinspacy <- function(text) {

temp_df$entity = parsed_text$ents[[entity_num]]$text
temp_df$lemma = parsed_text$ents[[entity_num]]$lemma_
temp_df = merge(temp_df, cui2vec_definitions, all.x = TRUE) # adds semantic_type and definition
temp_df$negated = parsed_text$ents[[entity_num]]$`_`$negex

return_df = rbind(return_df, temp_df)
Expand Down Expand Up @@ -124,3 +127,37 @@ bind_clinspacy <- function(df, text) {
cbind(df, as.data.frame(dt))
}

#' Bind mean cui2vec concept embeddings to a data frame
#'
#' For each row of \code{df}, this function runs \code{clinspacy()} on the column named
#' by \code{text}, drops entities flagged as negated, looks up the concept unique
#' identifier (CUI) of every remaining entity in \code{cui2vec_embeddings}, and averages
#' the matching embedding vectors. Every occurrence of a concept contributes once, so
#' the result is a frequency-weighted mean of the concept embeddings found in that row.
#' Entities whose CUI has no entry in \code{cui2vec_embeddings} are ignored.
#'
#' @param df A data frame with at least one row.
#' @param text A character string containing the name of the column to process.
#' @param num_embeddings A single positive whole number: how many embedding dimensions
#' to return. The columns \code{emb_001} up to that number (zero-padded to three digits)
#' must exist in \code{cui2vec_embeddings}. Defaults to 500.
#' @return A data frame containing the original data frame as well as one additional
#' column per requested embedding dimension. Rows in which no non-negated concept with
#' a known embedding was found contain \code{NA} in the embedding columns.
#'
#' @examples
#' data(mtsamples)
#' mtsamples_with_embeddings = bind_clinspacy_embeddings(mtsamples[1:5,], text = 'description')
#' str(mtsamples_with_embeddings)
bind_clinspacy_embeddings <- function(df, text, num_embeddings = 500) {
  assertthat::assert_that(assertthat::has_name(df, text))
  assertthat::assert_that(nrow(df) > 0)
  assertthat::assert_that(assertthat::is.count(num_embeddings))

  # Embedding columns are named emb_001, emb_002, ...; honour num_embeddings
  # instead of hard-coding the full range.
  emb_cols = paste0('emb_', sprintf('%03d', seq_len(num_embeddings)))
  assertthat::assert_that(all(emb_cols %in% names(cui2vec_embeddings)))

  # Copy the column name so get() does not collide with the `text` column
  # created inside the data.table below.
  clinspacy_text = text
  df_nrow = nrow(df)

  # One clinspacy() call per input row; clinspacy_id remembers the source row.
  dt = data.table(df)[, .(clinspacy_id = 1:.N, text = get(clinspacy_text))]
  dt = dt[, clinspacy(.SD[, text]), clinspacy_id]

  # Keep one row per non-negated entity occurrence; only the row id and the
  # CUI are needed for the embedding lookup.
  dt = dt[negated == FALSE, .(clinspacy_id, cui)]

  # Inner join on cui: occurrences without a known embedding drop out.
  dt = merge(dt, cui2vec_embeddings, by = 'cui')

  # A plain mean over occurrence rows weights each concept by how often it
  # occurs in the row. (Multiplying every occurrence row by its count and
  # dividing by the summed counts would weight concepts by count squared.)
  dt = dt[, lapply(.SD, mean), by = clinspacy_id, .SDcols = emb_cols]

  # Restore one output row per input row, in the original order; rows without
  # any usable concept get NA embeddings.
  all_ids = data.table(clinspacy_id = seq_len(df_nrow))
  dt = merge(dt, all_ids, by = 'clinspacy_id', all.y = TRUE)
  dt[, clinspacy_id := NULL]
  cbind(df, as.data.frame(dt))
}
41 changes: 41 additions & 0 deletions R/cui2vec_data.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
# NOTE(review): the text that used to be here was copied from the mtsamples
# dataset documentation (note_id, description, ..., Kaggle source) and did not
# describe this object. The variables listed below are the ones that
# bind_clinspacy_embeddings() reads; the row count, provenance, and license of
# the embeddings still need to be filled in -- TODO confirm with the data source.
#' Cui2vec concept embeddings
#'
#' This dataset contains cui2vec embedding vectors for UMLS concept unique identifiers
#' (CUIs). \code{bind_clinspacy_embeddings()} joins it on \code{cui} to attach embeddings
#' to the concepts found in a text.
#'
#' @format A data frame that includes the following variables:
#' \describe{
#' \item{cui}{A concept unique identifier, used as the join key}
#' \item{emb_001, ..., emb_500}{Numeric embedding dimensions}
#' }
'cui2vec_embeddings'

# NOTE(review): the text that used to be here was copied from the mtsamples
# dataset documentation (note_id, description, ..., Kaggle source) and did not
# describe this object. The variables listed below are the ones clinspacy()
# gains from merging this dataset; the join key is presumably `cui`, and the
# row count, provenance, and license still need to be filled in -- TODO confirm.
#' Cui2vec concept definitions
#'
#' This dataset contains descriptive information about concepts. \code{clinspacy()}
#' merges it onto each extracted entity to add the \code{semantic_type} and
#' \code{definition} columns.
#'
#' @format A data frame that includes the following variables:
#' \describe{
#' \item{semantic_type}{Semantic type of the concept}
#' \item{definition}{Definition of the concept}
#' }
'cui2vec_definitions'
Binary file added data/cui2vec_definitions.rda
Binary file not shown.

0 comments on commit a873190

Please sign in to comment.