diff --git a/R/clinspacy.R b/R/clinspacy.R index f95b856..a0ecbeb 100644 --- a/R/clinspacy.R +++ b/R/clinspacy.R @@ -12,7 +12,53 @@ negex <- NULL linker <- NULL .onLoad <- function(libname, pkgname) { - reticulate::configure_environment(force = TRUE) + # reticulate::configure_environment(force = TRUE) +} + +.onAttach <- function(libname, pkgname) { + packageStartupMessage('Welcome to clinspacy.') + packageStartupMessage('By default, this package will install and use miniconda and create a "clinspacy" conda environment.') + packageStartupMessage('If you want to override this behavior, use clinspacy_init(miniconda = FALSE) and specify an alternative environment using use_python() or use_conda().') +} + + +#' Initializes clinspacy. This function is optional to run but gives you more control over +#' the parameters used by scispacy at initiation. If you do not run this function, it will be +#' run with default parameters the first time that any of the package functions are run. +#' +#' @param miniconda Defaults to TRUE, which results in miniconda being installed (~400 MB) +#' and configured with the "clinspacy" conda environment. If you want to override this behavior, +#' set \code{miniconda} to \code{FALSE} and specify an alternative environment using use_python() +#' or use_conda(). +#' @param linker_threshold Defaults to 0.99. The confidence threshold value used by the scispacy UMLS entity +#' linker. Note: This can be lower than the \code{threshold} from \code{\link{clinspacy}}. +#' The linker_threshold can only be set once per session. +#' @param ... Additional settings available from: \href{https://github.com/allenai/scispacy}{https://github.com/allenai/scispacy}. + +clinspacy_init <- function(miniconda = TRUE, linker_threshold = 0.99, ...) 
{ + + assertthat::assert_that(assertthat::is.flag(miniconda)) + assertthat::assert_that(linker_threshold >= 0.70 & linker_threshold <= 0.99) + + message('Initializing clinspacy using clinspacy_init()...') + + if (miniconda) { + message('Checking if miniconda is installed...') + tryCatch(reticulate::install_miniconda(), + error = function (e) {NULL}) + + # By now, miniconda should be installed. Let's check if the clinspacy environment is configured + is_clinspacy_env_installed = tryCatch(reticulate::use_miniconda(condaenv = 'clinspacy', required = TRUE), + error = function (e) {'not installed'}) + + if (!is.null(is_clinspacy_env_installed)) { # this means the 'clinspacy' condaenv *is not* installed + message('Clinspacy requires the clinspacy conda environment. Attempting to create...') + reticulate::conda_create(envname = 'clinspacy') + } + + # This is intentional -- will throw an error if environment creation failed + reticulate::use_miniconda(condaenv = 'clinspacy', required = TRUE) + } if (!reticulate::py_module_available('spacy')) { packageStartupMessage('Spacy not found. Installing spacy...') @@ -29,30 +75,29 @@ linker <- NULL reticulate::py_install('negspacy', pip = TRUE) } - if (!reticulate::py_module_available('en_core_sci_sm')) { - packageStartupMessage('en_core_sci_sm language model not found. Installing en_core_sci_sm...') - reticulate::py_install('https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.2.5/en_core_sci_sm-0.2.5.tar.gz', pip = TRUE) + if (!reticulate::py_module_available('en_core_sci_lg')) { + packageStartupMessage('en_core_sci_lg language model not found. 
Installing en_core_sci_lg...') + reticulate::py_install('https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.2.5/en_core_sci_lg-0.2.5.tar.gz', pip = TRUE) } - packageStartupMessage('Importing spacy...') - spacy <<- reticulate::import('spacy') - packageStartupMessage('Importing scispacy...') - scispacy <<- reticulate::import('scispacy') - packageStartupMessage('Importing negspacy...') - negspacy <<- reticulate::import('negspacy') - packageStartupMessage('Loading the en_core_sci_sm language model...') - nlp <<- spacy$load("en_core_sci_sm") - packageStartupMessage('Loading NegEx...') + message('Importing spacy...') + spacy <<- reticulate::import('spacy', delay_load = TRUE) + message('Importing scispacy...') + scispacy <<- reticulate::import('scispacy', delay_load = TRUE) + message('Importing negspacy...') + negspacy <<- reticulate::import('negspacy', delay_load = TRUE) + + message('Loading the en_core_sci_lg language model...') + nlp <<- spacy$load("en_core_sci_lg") + message('Loading NegEx...') negex <<- negspacy$negation$Negex(nlp) - packageStartupMessage('Loading the UMLS entity linker... (this may take a while)') - linker <<- scispacy$linking$EntityLinker(resolve_abbreviations=TRUE, name="umls", threshold = 0.99) - packageStartupMessage('Adding the UMLS entity linker and NegEx to the spacy pipeline...') + message('Loading the UMLS entity linker... (this may take a while)') + linker <<- scispacy$linking$EntityLinker(resolve_abbreviations=TRUE, + name="umls", + threshold = linker_threshold, ...) + message('Adding the UMLS entity linker and NegEx to the spacy pipeline...') nlp$add_pipe(linker) - nlp$add_pipe(negex, last=TRUE) -} - -.onAttach <- function(libname, pkgname) { - packageStartupMessage('\nWelcome to clinspacy. 
Take a look at help(clinspacy) to get started.') + nlp$add_pipe(negex) } #' Performs biomedical named entity recognition, Unified Medical Language System (UMLS) @@ -61,12 +106,151 @@ linker <- NULL #' 99 percent confidence of being present. Negation is identified using negspacy's NegEx implementation. #' #' @param text A character string containing medical text that you would like to process. +#' @param threshold Defaults to 0.99. The confidence threshold value used by clinspacy (can be higher than the +#' \code{linker_threshold} from \code{\link{clinspacy_init}}). Note that whereas the +#' linker_threshold can only be set once per session, this threshold can be updated during the R session. +#' @param semantic_types Character vector containing any combination of the following: +#' c("Acquired Abnormality", "Activity", "Age Group", "Amino Acid Sequence", "Amino Acid, Peptide, or Protein", "Amphibian", "Anatomical Abnormality", "Anatomical Structure", "Animal", "Antibiotic", "Archaeon", "Bacterium", "Behavior", "Biologic Function", "Biologically Active Substance", "Biomedical Occupation or Discipline", "Biomedical or Dental Material", "Bird", "Body Location or Region", "Body Part, Organ, or Organ Component", "Body Space or Junction", "Body Substance", "Body System", "Carbohydrate Sequence", "Cell", "Cell Component", "Cell Function", "Cell or Molecular Dysfunction", "Chemical", "Chemical Viewed Functionally", "Chemical Viewed Structurally", "Classification", "Clinical Attribute", "Clinical Drug", "Conceptual Entity", "Congenital Abnormality", "Daily or Recreational Activity", "Diagnostic Procedure", "Disease or Syndrome", "Drug Delivery Device", "Educational Activity", "Element, Ion, or Isotope", "Embryonic Structure", "Entity", "Environmental Effect of Humans", "Enzyme", "Eukaryote", "Event", "Experimental Model of Disease", "Family Group", "Finding", "Fish", "Food", "Fully Formed Anatomical Structure", "Functional Concept", "Fungus", "Gene or Genome", "Genetic 
Function", "Geographic Area", "Governmental or Regulatory Activity", "Group", "Group Attribute", "Hazardous or Poisonous Substance", "Health Care Activity", "Health Care Related Organization", "Hormone", "Human", "Human-caused Phenomenon or Process", "Idea or Concept", "Immunologic Factor", "Indicator, Reagent, or Diagnostic Aid", "Individual Behavior", "Injury or Poisoning", "Inorganic Chemical", "Intellectual Product", "Laboratory or Test Result", "Laboratory Procedure", "Language", "Machine Activity", "Mammal", "Manufactured Object", "Medical Device", "Mental or Behavioral Dysfunction", "Mental Process", "Molecular Biology Research Technique", "Molecular Function", "Molecular Sequence", "Natural Phenomenon or Process", "Neoplastic Process", "Nucleic Acid, Nucleoside, or Nucleotide", "Nucleotide Sequence", "Occupation or Discipline", "Occupational Activity", "Organ or Tissue Function", "Organic Chemical", "Organism", "Organism Attribute", "Organism Function", "Organization", "Pathologic Function", "Patient or Disabled Group", "Pharmacologic Substance", "Phenomenon or Process", "Physical Object", "Physiologic Function", "Plant", "Population Group", "Professional or Occupational Group", "Professional Society", "Qualitative Concept", "Quantitative Concept", "Receptor", "Regulation or Law", "Reptile", "Research Activity", "Research Device", "Self-help or Relief Organization", "Sign or Symptom", "Social Behavior", "Spatial Concept", "Substance", "Temporal Concept", "Therapeutic or Preventive Procedure", "Tissue", "Vertebrate", "Virus", "Vitamin") #' @return A data frame containing the UMLS concept unique identifiers (cui), entities, #' lemmatized entities, and NegEx negation status (\code{TRUE} means negated, \code{FALSE} means *not* negated). 
#' #' @examples #' clinspacy('This patient has diabetes and CKD stage 3 but no HTN.') -clinspacy <- function(text) { +clinspacy <- function(text, threshold = 0.99, + semantic_types = c("Acquired Abnormality", + "Activity", + "Age Group", + "Amino Acid Sequence", + "Amino Acid, Peptide, or Protein", + "Amphibian", + "Anatomical Abnormality", + "Anatomical Structure", + "Animal", + "Antibiotic", + "Archaeon", + "Bacterium", + "Behavior", + "Biologic Function", + "Biologically Active Substance", + "Biomedical Occupation or Discipline", + "Biomedical or Dental Material", + "Bird", + "Body Location or Region", + "Body Part, Organ, or Organ Component", + "Body Space or Junction", + "Body Substance", + "Body System", + "Carbohydrate Sequence", + "Cell", + "Cell Component", + "Cell Function", + "Cell or Molecular Dysfunction", + "Chemical", + "Chemical Viewed Functionally", + "Chemical Viewed Structurally", + "Classification", + "Clinical Attribute", + "Clinical Drug", + "Conceptual Entity", + "Congenital Abnormality", + "Daily or Recreational Activity", + "Diagnostic Procedure", + "Disease or Syndrome", + "Drug Delivery Device", + "Educational Activity", + "Element, Ion, or Isotope", + "Embryonic Structure", + "Entity", + "Environmental Effect of Humans", + "Enzyme", + "Eukaryote", + "Event", + "Experimental Model of Disease", + "Family Group", + "Finding", + "Fish", + "Food", + "Fully Formed Anatomical Structure", + "Functional Concept", + "Fungus", + "Gene or Genome", + "Genetic Function", + "Geographic Area", + "Governmental or Regulatory Activity", + "Group", + "Group Attribute", + "Hazardous or Poisonous Substance", + "Health Care Activity", + "Health Care Related Organization", + "Hormone", + "Human", + "Human-caused Phenomenon or Process", + "Idea or Concept", + "Immunologic Factor", + "Indicator, Reagent, or Diagnostic Aid", + "Individual Behavior", + "Injury or Poisoning", + "Inorganic Chemical", + "Intellectual Product", + "Laboratory or Test Result", + 
"Laboratory Procedure", + "Language", + "Machine Activity", + "Mammal", + "Manufactured Object", + "Medical Device", + "Mental or Behavioral Dysfunction", + "Mental Process", + "Molecular Biology Research Technique", + "Molecular Function", + "Molecular Sequence", + "Natural Phenomenon or Process", + "Neoplastic Process", + "Nucleic Acid, Nucleoside, or Nucleotide", + "Nucleotide Sequence", + "Occupation or Discipline", + "Occupational Activity", + "Organ or Tissue Function", + "Organic Chemical", + "Organism", + "Organism Attribute", + "Organism Function", + "Organization", + "Pathologic Function", + "Patient or Disabled Group", + "Pharmacologic Substance", + "Phenomenon or Process", + "Physical Object", + "Physiologic Function", + "Plant", + "Population Group", + "Professional or Occupational Group", + "Professional Society", + "Qualitative Concept", + "Quantitative Concept", + "Receptor", + "Regulation or Law", + "Reptile", + "Research Activity", + "Research Device", + "Self-help or Relief Organization", + "Sign or Symptom", + "Social Behavior", + "Spatial Concept", + "Substance", + "Temporal Concept", + "Therapeutic or Preventive Procedure", + "Tissue", + "Vertebrate", + "Virus", + "Vitamin")) { + + if (is.null(nlp)) { + clinspacy_init() + } + + assertthat::assert_that(threshold >= 0.70 & threshold <= 0.99) + parsed_text = nlp(text) entity_nums = length(parsed_text$ents) @@ -78,22 +262,39 @@ clinspacy <- function(text) { definition = character(0), stringsAsFactors = FALSE) + return_df_list = list() + for (entity_num in seq_len(entity_nums)) { if (is.null(unlist(parsed_text$ents[[entity_num]]$`_`$kb_ents))) next temp_cuis = parsed_text$ents[[entity_num]]$`_`$kb_ents temp_cuis = unlist(temp_cuis) - temp_df = data.frame(cui = temp_cuis[seq(1, length(temp_cuis), by = 2)], stringsAsFactors = FALSE) + temp_df = data.frame(cui = temp_cuis[seq(1, length(temp_cuis), by = 2)], + confidence = temp_cuis[seq(2, length(temp_cuis), by = 2)], + stringsAsFactors = FALSE) 
temp_df$entity = parsed_text$ents[[entity_num]]$text temp_df$lemma = parsed_text$ents[[entity_num]]$lemma_ temp_df = merge(temp_df, cui2vec_definitions, all.x = TRUE) # adds semantic_type and definition + temp_df$negated = parsed_text$ents[[entity_num]]$`_`$negex - return_df = rbind(return_df, temp_df) + temp_df = temp_df[temp_df$confidence > threshold, ] + temp_df$confidence = NULL + + temp_df = temp_df[temp_df$semantic_type %in% semantic_types, ] + + return_df_list[[entity_num]] = temp_df } - return_df + if (length(return_df_list) > 0) { + return_df = rbindlist(return_df_list, use.names = TRUE, fill = TRUE) + setDF(return_df) + return(return_df) + } else + { + return(return_df) + } } @@ -104,6 +305,7 @@ clinspacy <- function(text) { #' #' @param df A data frame. #' @param text A character string containing the name of the column to process. +#' @param ... Arguments passed down to \code{\link{clinspacy}} #' @return A data frame containing the original data frame as well as additional column names #' for each UMLS concept unique identifer found with values containing frequencies. #' @@ -111,14 +313,14 @@ clinspacy <- function(text) { #' data(mtsamples) #' mtsamples_with_cuis = bind_clinspacy(mtsamples[1:5,], text = 'description') #' str(mtsamples_with_cuis) -bind_clinspacy <- function(df, text) { +bind_clinspacy <- function(df, text, ...) 
{ clinspacy_text = text assertthat::assert_that(assertthat::has_name(df, text)) assertthat::assert_that(nrow(df) > 0) df_nrow = nrow(df) dt = data.table(df)[, .(clinspacy_id = 1:.N, text = get(clinspacy_text))] - dt = dt[,clinspacy(.SD[,text]), clinspacy_id][negated == FALSE, .(clinspacy_id, cui, present = 1)] + dt = dt[,clinspacy(.SD[,text], ...), clinspacy_id][negated == FALSE, .(clinspacy_id, cui, present = 1)] dt = dcast(dt, clinspacy_id ~ cui, value.var = 'present', fun.aggregate = sum) dt2 = data.table(clinspacy_id = 1:df_nrow) dt = merge(dt, dt2, all.y=TRUE) @@ -127,13 +329,31 @@ bind_clinspacy <- function(df, text) { cbind(df, as.data.frame(dt)) } -#' This function binds columns containing concept unique identifiers with which scispacy has +#' This function binds columns containing concept embeddings for concepts with which scispacy has #' 99 percent confidence of being present with values containing frequencies. Negated concepts, #' as identified by negspacy's NegEx implementation, are ignored and do not count towards -#' the frequencies. +#' the embeddings. The concept embeddings are derived from the cui2vec_embeddings dataset +#' included with this package. +#' +#' The embeddings are derived from Andrew Beam's +#' \href{https://github.com/beamandrew/cui2vec}{cui2vec R package}. +#' +#' Citation +#' +#' Beam, A.L., Kompa, B., Schmaltz, A., Fried, I., Griffin, W, Palmer, N.P., Shi, X., +#' Cai, T., and Kohane, I.S.,, 2019. Clinical Concept Embeddings Learned from Massive +#' Sources of Multimodal Medical Data. arXiv preprint arXiv:1804.01486. +#' +#' License +#' +#' This data is made available under a +#' \href{https://creativecommons.org/licenses/by/4.0/}{CC BY 4.0 license}. The only change +#' made to the original dataset is the renaming of columns. #' #' @param df A data frame. #' @param text A character string containing the name of the column to process. +#' @param num_embeddings The number of embeddings to return (must be a number 1 through 500). 
+#' @param ... Arguments passed down to \code{\link{clinspacy}} #' @return A data frame containing the original data frame as well as additional column names #' for each UMLS concept unique identifer found with values containing frequencies. #' @@ -141,14 +361,18 @@ bind_clinspacy <- function(df, text) { #' data(mtsamples) #' mtsamples_with_cuis = bind_clinspacy(mtsamples[1:5,], text = 'description') #' str(mtsamples_with_cuis) -bind_clinspacy_embeddings <- function(df, text, num_embeddings = 500) { +bind_clinspacy_embeddings <- function(df, text, + num_embeddings = 500, ...) { + + assertthat::assert_that(num_embeddings >= 1 & num_embeddings <= 500) + clinspacy_text = text assertthat::assert_that(assertthat::has_name(df, text)) assertthat::assert_that(nrow(df) > 0) df_nrow = nrow(df) dt = data.table(df)[, .(clinspacy_id = 1:.N, text = get(clinspacy_text))] - dt = dt[, clinspacy(.SD[,text]), clinspacy_id] + dt = dt[, clinspacy(.SD[,text], ...), clinspacy_id] dt = dt[negated == FALSE] dt[, n := .N, by = .(clinspacy_id, cui)] dt = merge(dt, cui2vec_embeddings) # inner join on cui @@ -159,5 +383,6 @@ bind_clinspacy_embeddings <- function(df, text, num_embeddings = 500) { dt2 = data.table(clinspacy_id = 1:df_nrow) dt = merge(dt, dt2, all.y=TRUE) dt[, clinspacy_id := NULL] + dt = dt[, 1:num_embeddings] cbind(df, as.data.frame(dt)) } diff --git a/R/cui2vec_data.R b/R/cui2vec_data.R index a833e3f..e91d33d 100644 --- a/R/cui2vec_data.R +++ b/R/cui2vec_data.R @@ -1,41 +1,50 @@ #' Cui2vec concept embeddings #' -#' This dataset contains sample medical transcriptions for various medical specialties. +#' This dataset contains Unified Medical Language System (UMLS) concept embeddings from +#' Andrew Beam's \href{https://github.com/beamandrew/cui2vec}{cui2vec R package}. There are +#' 500 embeddings included for each concept. 
#' -#' Acknowledgements +#' Citation #' -#' This data was scraped from mtsamples.com by Tara Boyle and is made available -#' under a CC0: Public Domain license. +#' Beam, A.L., Kompa, B., Schmaltz, A., Fried, I., Griffin, W., Palmer, N.P., Shi, X., +#' Cai, T., and Kohane, I.S., 2019. Clinical Concept Embeddings Learned from Massive +#' Sources of Multimodal Medical Data. arXiv preprint arXiv:1804.01486. #' -#' @format A data frame with 4999 rows and 6 variables: +#' License +#' +#' This data is made available under a +#' \href{https://creativecommons.org/licenses/by/4.0/}{CC BY 4.0 license}. The only change +#' made to the original dataset is the renaming of columns. +#' +#' @format A data frame with 109053 rows and 501 variables: #' \describe{ -#' \item{note_id}{A unique identifier for each note} -#' \item{description}{A description or chief concern} -#' \item{medical_specialty}{Medical specialty of the note} -#' \item{sample_name}{mtsamples.com note name} -#' \item{transcription}{Transcription of note text} -#' \item{keywords}{Keywords} +#' \item{cui}{A Unified Medical Language System (UMLS) Concept Unique Identifier (CUI)} +#' \item{emb_001}{Concept embedding vector #1} +#' \item{emb_002}{Concept embedding vector #2} +#' \item{...}{...} +#' \item{emb_500}{Concept embedding vector #500} #' } -#' @source \url{https://www.kaggle.com/tboyle10/medicaltranscriptions/data} +#' @source \url{https://figshare.com/s/00d69861786cd0156d81} 'cui2vec_embeddings' #' Cui2vec concept definitions #' -#' This dataset contains sample medical transcriptions for various medical specialties. +#' This dataset contains definitions for the Unified Medical Language System (UMLS) +#' Concept Unique Identifiers (CUIs). These come from Andrew Beam's +#' \href{https://github.com/beamandrew/cui2vec}{cui2vec R package}. #' -#' Acknowledgements +#' License #' -#' This data was scraped from mtsamples.com by Tara Boyle and is made available -#' under a CC0: Public Domain license. 
+#' This data is made available under a +#' \href{https://github.com/beamandrew/cui2vec/blob/master/LICENSE.md}{MIT license}. The data +#' is copyrighted in 2019 by Benjamin Kompa, Andrew Beam, and Allen Schmaltz. The only change +#' made to the original dataset is the renaming of columns. #' -#' @format A data frame with 4999 rows and 6 variables: +#' @format A data frame with 3053795 rows and 3 variables: #' \describe{ -#' \item{note_id}{A unique identifier for each note} -#' \item{description}{A description or chief concern} -#' \item{medical_specialty}{Medical specialty of the note} -#' \item{sample_name}{mtsamples.com note name} -#' \item{transcription}{Transcription of note text} -#' \item{keywords}{Keywords} +#' \item{cui}{A Unified Medical Language System (UMLS) Concept Unique Identifier (CUI)} +#' \item{semantic_type}{Semantic type of the CUI} +#' \item{definition}{Definition of the CUI} #' } -#' @source \url{https://www.kaggle.com/tboyle10/medicaltranscriptions/data} +#' @source \url{https://github.com/beamandrew/cui2vec} 'cui2vec_definitions' diff --git a/R/mtsamples.R b/R/mtsamples.R index 8e74ae2..6b958d2 100644 --- a/R/mtsamples.R +++ b/R/mtsamples.R @@ -4,8 +4,11 @@ #' #' Acknowledgements #' -#' This data was scraped from mtsamples.com by Tara Boyle and is made available -#' under a CC0: Public Domain license. +#' This data was scraped from \href{https://mtsamples.com}{https://mtsamples.com} by Tara Boyle. +#' +#' License +#' This data is made available under a +#' \href{https://creativecommons.org/share-your-work/public-domain/cc0/}{CC0: Public Domain license}. 
#' #' @format A data frame with 4999 rows and 6 variables: #' \describe{ diff --git a/README.Rmd b/README.Rmd index 79db56a..d027dbc 100644 --- a/README.Rmd +++ b/README.Rmd @@ -29,12 +29,18 @@ You can install the GitHub version of clinspacy with: remotes::install_github('ML4LHS/clinspacy', INSTALL_opts = '--no-multiarch') ``` -## Example +## Examples ```{r} library(clinspacy) clinspacy('This patient has diabetes and CKD stage 3 but no HTN.') + +clinspacy('This patient is taking omeprazole, Protonix, and lisinopril 10 mg. He has diabetes.', + semantic_types = 'Disease or Syndrome') + +clinspacy('This patient is taking omeprazole, Protonix, and lisinopril 10 mg. He has diabetes.', + semantic_types = 'Pharmacologic Substance') ``` ## Using the mtsamples dataset @@ -42,7 +48,7 @@ clinspacy('This patient has diabetes and CKD stage 3 but no HTN.') ```{r} data(mtsamples) -str(mtsamples[1:5,]) +mtsamples[1:5,] ``` @@ -51,8 +57,30 @@ str(mtsamples[1:5,]) This function binds columns containing concept unique identifiers with which scispacy has 99% confidence of being present with values containing frequencies. Negated concepts, as identified by negspacy's NegEx implementation, are ignored and do not count towards the frequencies. 
```{r} -mtsamples_with_cuis = bind_clinspacy(mtsamples[1:5,], text = 'description') +bind_clinspacy(mtsamples[1:5, 1:2], + text = 'description') -str(mtsamples_with_cuis) +bind_clinspacy(mtsamples[1:5, 1:2], + text = 'description', + semantic_types = 'Diagnostic Procedure') ``` +## Binding Concept Embeddings to a Data Frame + +```{r} +bind_clinspacy_embeddings(mtsamples[1:5, 1:2], + text = 'description', + num_embeddings = 5) + +bind_clinspacy_embeddings(mtsamples[1:5, 1:2], + text = 'description', + num_embeddings = 5, + semantic_types = 'Diagnostic Procedure') +``` + +# UMLS CUI definitions + +```{r} +data(cui2vec_definitions) +head(cui2vec_definitions) +``` diff --git a/README.html b/README.html index 1119af0..f7d2ff5 100644 --- a/README.html +++ b/README.html @@ -612,51 +612,130 @@
You can install the GitHub version of clinspacy with:
-library(clinspacy)
clinspacy('This patient has diabetes and CKD stage 3 but no HTN.')
-#> cui entity lemma negated
-#> 1 C0030705 patient patient FALSE
-#> 2 C1705908 patient patient FALSE
-#> 3 C1578481 patient patient FALSE
-#> 4 C1578485 patient patient FALSE
-#> 5 C1578486 patient patient FALSE
-#> 6 C0011847 diabetes diabete FALSE
-#> 7 C0011849 diabetes diabete FALSE
-#> 8 C2316787 CKD stage 3 ckd stage 3 FALSE
-#> 9 C0020538 HTN htn TRUE
data(mtsamples)
-str(mtsamples[1:5,])
-#> 'data.frame': 5 obs. of 6 variables:
-#> $ note_id : int 1 2 3 4 5
-#> $ description : chr "A 23-year-old white female presents with complaint of allergies." "Consult for laparoscopic gastric bypass." "Consult for laparoscopic gastric bypass." "2-D M-Mode. Doppler." ...
-#> $ medical_specialty: chr "Allergy / Immunology" "Bariatrics" "Bariatrics" "Cardiovascular / Pulmonary" ...
-#> $ sample_name : chr "Allergic Rhinitis" "Laparoscopic Gastric Bypass Consult - 2" "Laparoscopic Gastric Bypass Consult - 1" "2-D Echocardiogram - 1" ...
-#> $ transcription : chr "SUBJECTIVE:, This 23-year-old white female presents with complaint of allergies. She used to have allergies w"| __truncated__ "PAST MEDICAL HISTORY:, He has difficulty climbing stairs, difficulty with airline seats, tying shoes, used to p"| __truncated__ "HISTORY OF PRESENT ILLNESS: , I have seen ABC today. He is a very pleasant gentleman who is 42 years old, 344 "| __truncated__ "2-D M-MODE: , ,1. Left atrial enlargement with left atrial diameter of 4.7 cm.,2. Normal size right and left "| __truncated__ ...
-#> $ keywords : chr "allergy / immunology, allergic rhinitis, allergies, asthma, nasal sprays, rhinitis, nasal, erythematous, allegr"| __truncated__ "bariatrics, laparoscopic gastric bypass, weight loss programs, gastric bypass, atkin's diet, weight watcher's, "| __truncated__ "bariatrics, laparoscopic gastric bypass, heart attacks, body weight, pulmonary embolism, potential complication"| __truncated__ "cardiovascular / pulmonary, 2-d m-mode, doppler, aortic valve, atrial enlargement, diastolic function, ejection"| __truncated__ ...
This function binds columns containing concept unique identifiers with which scispacy has 99% confidence of being present with values containing frequencies. Negated concepts, as identified by negspacy’s NegEx implementation, are ignored and do not count towards the frequencies.
-mtsamples_with_cuis = bind_clinspacy(mtsamples[1:5,], text = 'description')
-
-str(mtsamples_with_cuis)
-#> 'data.frame': 5 obs. of 14 variables:
-#> $ note_id : int 1 2 3 4 5
-#> $ description : chr "A 23-year-old white female presents with complaint of allergies." "Consult for laparoscopic gastric bypass." "Consult for laparoscopic gastric bypass." "2-D M-Mode. Doppler." ...
-#> $ medical_specialty: chr "Allergy / Immunology" "Bariatrics" "Bariatrics" "Cardiovascular / Pulmonary" ...
-#> $ sample_name : chr "Allergic Rhinitis" "Laparoscopic Gastric Bypass Consult - 2" "Laparoscopic Gastric Bypass Consult - 1" "2-D Echocardiogram - 1" ...
-#> $ transcription : chr "SUBJECTIVE:, This 23-year-old white female presents with complaint of allergies. She used to have allergies w"| __truncated__ "PAST MEDICAL HISTORY:, He has difficulty climbing stairs, difficulty with airline seats, tying shoes, used to p"| __truncated__ "HISTORY OF PRESENT ILLNESS: , I have seen ABC today. He is a very pleasant gentleman who is 42 years old, 344 "| __truncated__ "2-D M-MODE: , ,1. Left atrial enlargement with left atrial diameter of 4.7 cm.,2. Normal size right and left "| __truncated__ ...
-#> $ keywords : chr "allergy / immunology, allergic rhinitis, allergies, asthma, nasal sprays, rhinitis, nasal, erythematous, allegr"| __truncated__ "bariatrics, laparoscopic gastric bypass, weight loss programs, gastric bypass, atkin's diet, weight watcher's, "| __truncated__ "bariatrics, laparoscopic gastric bypass, heart attacks, body weight, pulmonary embolism, potential complication"| __truncated__ "cardiovascular / pulmonary, 2-d m-mode, doppler, aortic valve, atrial enlargement, diastolic function, ejection"| __truncated__ ...
-#> $ C0009818 : num 0 1 1 0 0
-#> $ C0020517 : num 1 0 0 0 0
-#> $ C0277786 : num 1 0 0 0 0
-#> $ C0554756 : num 0 0 0 1 0
-#> $ C1705052 : num 0 0 0 1 0
-#> $ C3864418 : num 1 0 0 0 0
-#> $ C4039248 : num 0 1 1 0 0
-#> $ C4331911 : num 0 0 0 1 0
bind_clinspacy(mtsamples[1:5, 1:2],
+ text = 'description')
+#> note_id description C0009818 C0013516 C0020517
+#> 1 1 A 23-year-old white female presents with complaint of allergies. 0 0 1
+#> 2 2 Consult for laparoscopic gastric bypass. 1 0 0
+#> 3 3 Consult for laparoscopic gastric bypass. 1 0 0
+#> 4 4 2-D M-Mode. Doppler. 0 0 0
+#> 5 5 2-D Echocardiogram 0 1 0
+#> C0554756 C1705052 C2243117 C3864418 C4039248
+#> 1 0 0 0 1 0
+#> 2 0 0 0 0 1
+#> 3 0 0 0 0 1
+#> 4 1 0 0 0 0
+#> 5 0 1 1 0 0
+
+bind_clinspacy(mtsamples[1:5, 1:2],
+ text = 'description',
+ semantic_types = 'Diagnostic Procedure')
+#> note_id description C0013516 C0554756
+#> 1 1 A 23-year-old white female presents with complaint of allergies. 0 0
+#> 2 2 Consult for laparoscopic gastric bypass. 0 0
+#> 3 3 Consult for laparoscopic gastric bypass. 0 0
+#> 4 4 2-D M-Mode. Doppler. 0 1
+#> 5 5 2-D Echocardiogram 1 0
bind_clinspacy_embeddings(mtsamples[1:5, 1:2],
+ text = 'description',
+ num_embeddings = 5)
+#> note_id description emb_001 emb_002
+#> 1 1 A 23-year-old white female presents with complaint of allergies. -0.02252676 0.00981737
+#> 2 2 Consult for laparoscopic gastric bypass. -0.06431815 0.02979208
+#> 3 3 Consult for laparoscopic gastric bypass. -0.06431815 0.02979208
+#> 4 4 2-D M-Mode. Doppler. -0.06111055 0.03059523
+#> 5 5 2-D Echocardiogram -0.08545282 0.03965676
+#> emb_003 emb_004 emb_005
+#> 1 -7.112366e-17 -0.015715369 0.00204883
+#> 2 -1.353084e-16 -0.046832239 0.03387485
+#> 3 -1.353084e-16 -0.046832239 0.03387485
+#> 4 -1.340074e-16 -0.032813400 -0.02400309
+#> 5 -4.336809e-17 -0.008077436 -0.04463792
+
+bind_clinspacy_embeddings(mtsamples[1:5, 1:2],
+ text = 'description',
+ num_embeddings = 5,
+ semantic_types = 'Diagnostic Procedure')
+#> note_id description emb_001 emb_002
+#> 1 1 A 23-year-old white female presents with complaint of allergies. NA NA
+#> 2 2 Consult for laparoscopic gastric bypass. NA NA
+#> 3 3 Consult for laparoscopic gastric bypass. NA NA
+#> 4 4 2-D M-Mode. Doppler. -0.06111055 0.03059523
+#> 5 5 2-D Echocardiogram -0.08545282 0.03965676
+#> emb_003 emb_004 emb_005
+#> 1 NA NA NA
+#> 2 NA NA NA
+#> 3 NA NA NA
+#> 4 -1.340074e-16 -0.032813400 -0.02400309
+#> 5 -4.336809e-17 -0.008077436 -0.04463792
data(cui2vec_definitions)
+head(cui2vec_definitions)
+#> cui semantic_type definition
+#> 1 C0000005 Amino Acid, Peptide, or Protein (131)I-Macroaggregated Albumin
+#> 2 C0000005 Pharmacologic Substance (131)I-Macroaggregated Albumin
+#> 3 C0000005 Indicator, Reagent, or Diagnostic Aid (131)I-Macroaggregated Albumin
+#> 4 C0000039 Organic Chemical 1,2-Dipalmitoylphosphatidylcholine
+#> 5 C0000039 Pharmacologic Substance 1,2-Dipalmitoylphosphatidylcholine
+#> 6 C0000052 Amino Acid, Peptide, or Protein 1,4-alpha-Glucan Branching Enzyme