diff --git a/R/clinspacy.R b/R/clinspacy.R index f95b856..a0ecbeb 100644 --- a/R/clinspacy.R +++ b/R/clinspacy.R @@ -12,7 +12,53 @@ negex <- NULL linker <- NULL .onLoad <- function(libname, pkgname) { - reticulate::configure_environment(force = TRUE) + # reticulate::configure_environment(force = TRUE) +} + +.onAttach <- function(libname, pkgname) { + packageStartupMessage('Welcome to clinspacy.') + packageStartupMessage('By default, this package will install and use miniconda and create a "clinspacy" conda environment.') + packageStartupMessage('If you want to override this behavior, use clinspacy_init(miniconda = FALSE) and specify an alternative environment using use_python() or use_conda().') +} + + +#' Initializes clinspacy. This function is optional to run but gives you more control over +#' the parameters used by scispacy at initiation. If you do not run this function, it will be +#' run with default parameters the first time that any of the package functions are run. +#' +#' @param miniconda Defaults to TRUE, which results in miniconda being installed (~400 MB) +#' and configured with the "clinspacy" conda environment. If you want to override this behavior, +#' set \code{miniconda} to \code{FALSE} and specify an alternative environment using use_python() +#' or use_conda(). +#' @param linker_threshold Defaults to 0.99. The confidence threshold value used by the scispacy UMLS entity +#' linker. Note: This can be lower than the \code{threshold} from \code{\link{clinspacy}}. +#' The linker_threshold can only be set once per session. +#' @param ... Additional settings available from: \href{https://github.com/allenai/scispacy}{https://github.com/allenai/scispacy}. + +clinspacy_init <- function(miniconda = TRUE, linker_threshold = 0.99, ...) 
{ + + assertthat::assert_that(assertthat::is.flag(miniconda)) + assertthat::assert_that(linker_threshold >= 0.70 & linker_threshold <= 0.99) + + message('Initializing clinspacy using clinspacy_init()...') + + if (miniconda) { + message('Checking if miniconda is installed...') + tryCatch(reticulate::install_miniconda(), + error = function (e) {NULL}) + + # By now, miniconda should be installed. Let's check if the clinspacy environment is configured + is_clinspacy_env_installed = tryCatch(reticulate::use_miniconda(condaenv = 'clinspacy', required = TRUE), + error = function (e) {'not installed'}) + + if (!is.null(is_clinspacy_env_installed)) { # this means the 'clinspacy' condaenv *is not* installed + message('Clinspacy requires the clinspacy conda environment. Attempting to create...') + reticulate::conda_create(envname = 'clinspacy') + } + + # This is intentional -- will throw an error if environment creation failed + reticulate::use_miniconda(condaenv = 'clinspacy', required = TRUE) + } if (!reticulate::py_module_available('spacy')) { packageStartupMessage('Spacy not found. Installing spacy...') @@ -29,30 +75,29 @@ linker <- NULL reticulate::py_install('negspacy', pip = TRUE) } - if (!reticulate::py_module_available('en_core_sci_sm')) { - packageStartupMessage('en_core_sci_sm language model not found. Installing en_core_sci_sm...') - reticulate::py_install('https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.2.5/en_core_sci_sm-0.2.5.tar.gz', pip = TRUE) + if (!reticulate::py_module_available('en_core_sci_lg')) { + packageStartupMessage('en_core_sci_lg language model not found. 
Installing en_core_sci_lg...') + reticulate::py_install('https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.2.5/en_core_sci_lg-0.2.5.tar.gz', pip = TRUE) } - packageStartupMessage('Importing spacy...') - spacy <<- reticulate::import('spacy') - packageStartupMessage('Importing scispacy...') - scispacy <<- reticulate::import('scispacy') - packageStartupMessage('Importing negspacy...') - negspacy <<- reticulate::import('negspacy') - packageStartupMessage('Loading the en_core_sci_sm language model...') - nlp <<- spacy$load("en_core_sci_sm") - packageStartupMessage('Loading NegEx...') + message('Importing spacy...') + spacy <<- reticulate::import('spacy', delay_load = TRUE) + message('Importing scispacy...') + scispacy <<- reticulate::import('scispacy', delay_load = TRUE) + message('Importing negspacy...') + negspacy <<- reticulate::import('negspacy', delay_load = TRUE) + + message('Loading the en_core_sci_lg language model...') + nlp <<- spacy$load("en_core_sci_lg") + message('Loading NegEx...') negex <<- negspacy$negation$Negex(nlp) - packageStartupMessage('Loading the UMLS entity linker... (this may take a while)') - linker <<- scispacy$linking$EntityLinker(resolve_abbreviations=TRUE, name="umls", threshold = 0.99) - packageStartupMessage('Adding the UMLS entity linker and NegEx to the spacy pipeline...') + message('Loading the UMLS entity linker... (this may take a while)') + linker <<- scispacy$linking$EntityLinker(resolve_abbreviations=TRUE, + name="umls", + threshold = linker_threshold, ...) + message('Adding the UMLS entity linker and NegEx to the spacy pipeline...') nlp$add_pipe(linker) - nlp$add_pipe(negex, last=TRUE) -} - -.onAttach <- function(libname, pkgname) { - packageStartupMessage('\nWelcome to clinspacy. 
Take a look at help(clinspacy) to get started.') + nlp$add_pipe(negex) } #' Performs biomedical named entity recognition, Unified Medical Language System (UMLS) @@ -61,12 +106,151 @@ linker <- NULL #' 99 percent confidence of being present. Negation is identified using negspacy's NegEx implementation. #' #' @param text A character string containing medical text that you would like to process. +#' @param threshold Defaults to 0.99. The confidence threshold value used by clinspacy (can be higher than the +#' \code{linker_threshold} from \code{\link{clinspacy_init}}). Note that whereas the +#' linker_threshold can only be set once per session, this threshold can be updated during the R session. +#' @param semantic_types Character vector containing any combination of the following: +#' c("Acquired Abnormality", "Activity", "Age Group", "Amino Acid Sequence", "Amino Acid, Peptide, or Protein", "Amphibian", "Anatomical Abnormality", "Anatomical Structure", "Animal", "Antibiotic", "Archaeon", "Bacterium", "Behavior", "Biologic Function", "Biologically Active Substance", "Biomedical Occupation or Discipline", "Biomedical or Dental Material", "Bird", "Body Location or Region", "Body Part, Organ, or Organ Component", "Body Space or Junction", "Body Substance", "Body System", "Carbohydrate Sequence", "Cell", "Cell Component", "Cell Function", "Cell or Molecular Dysfunction", "Chemical", "Chemical Viewed Functionally", "Chemical Viewed Structurally", "Classification", "Clinical Attribute", "Clinical Drug", "Conceptual Entity", "Congenital Abnormality", "Daily or Recreational Activity", "Diagnostic Procedure", "Disease or Syndrome", "Drug Delivery Device", "Educational Activity", "Element, Ion, or Isotope", "Embryonic Structure", "Entity", "Environmental Effect of Humans", "Enzyme", "Eukaryote", "Event", "Experimental Model of Disease", "Family Group", "Finding", "Fish", "Food", "Fully Formed Anatomical Structure", "Functional Concept", "Fungus", "Gene or Genome", "Genetic 
Function", "Geographic Area", "Governmental or Regulatory Activity", "Group", "Group Attribute", "Hazardous or Poisonous Substance", "Health Care Activity", "Health Care Related Organization", "Hormone", "Human", "Human-caused Phenomenon or Process", "Idea or Concept", "Immunologic Factor", "Indicator, Reagent, or Diagnostic Aid", "Individual Behavior", "Injury or Poisoning", "Inorganic Chemical", "Intellectual Product", "Laboratory or Test Result", "Laboratory Procedure", "Language", "Machine Activity", "Mammal", "Manufactured Object", "Medical Device", "Mental or Behavioral Dysfunction", "Mental Process", "Molecular Biology Research Technique", "Molecular Function", "Molecular Sequence", "Natural Phenomenon or Process", "Neoplastic Process", "Nucleic Acid, Nucleoside, or Nucleotide", "Nucleotide Sequence", "Occupation or Discipline", "Occupational Activity", "Organ or Tissue Function", "Organic Chemical", "Organism", "Organism Attribute", "Organism Function", "Organization", "Pathologic Function", "Patient or Disabled Group", "Pharmacologic Substance", "Phenomenon or Process", "Physical Object", "Physiologic Function", "Plant", "Population Group", "Professional or Occupational Group", "Professional Society", "Qualitative Concept", "Quantitative Concept", "Receptor", "Regulation or Law", "Reptile", "Research Activity", "Research Device", "Self-help or Relief Organization", "Sign or Symptom", "Social Behavior", "Spatial Concept", "Substance", "Temporal Concept", "Therapeutic or Preventive Procedure", "Tissue", "Vertebrate", "Virus", "Vitamin") #' @return A data frame containing the UMLS concept unique identifiers (cui), entities, #' lemmatized entities, and NegEx negation status (\code{TRUE} means negated, \code{FALSE} means *not* negated). 
#' #' @examples #' clinspacy('This patient has diabetes and CKD stage 3 but no HTN.') -clinspacy <- function(text) { +clinspacy <- function(text, threshold = 0.99, + semantic_types = c("Acquired Abnormality", + "Activity", + "Age Group", + "Amino Acid Sequence", + "Amino Acid, Peptide, or Protein", + "Amphibian", + "Anatomical Abnormality", + "Anatomical Structure", + "Animal", + "Antibiotic", + "Archaeon", + "Bacterium", + "Behavior", + "Biologic Function", + "Biologically Active Substance", + "Biomedical Occupation or Discipline", + "Biomedical or Dental Material", + "Bird", + "Body Location or Region", + "Body Part, Organ, or Organ Component", + "Body Space or Junction", + "Body Substance", + "Body System", + "Carbohydrate Sequence", + "Cell", + "Cell Component", + "Cell Function", + "Cell or Molecular Dysfunction", + "Chemical", + "Chemical Viewed Functionally", + "Chemical Viewed Structurally", + "Classification", + "Clinical Attribute", + "Clinical Drug", + "Conceptual Entity", + "Congenital Abnormality", + "Daily or Recreational Activity", + "Diagnostic Procedure", + "Disease or Syndrome", + "Drug Delivery Device", + "Educational Activity", + "Element, Ion, or Isotope", + "Embryonic Structure", + "Entity", + "Environmental Effect of Humans", + "Enzyme", + "Eukaryote", + "Event", + "Experimental Model of Disease", + "Family Group", + "Finding", + "Fish", + "Food", + "Fully Formed Anatomical Structure", + "Functional Concept", + "Fungus", + "Gene or Genome", + "Genetic Function", + "Geographic Area", + "Governmental or Regulatory Activity", + "Group", + "Group Attribute", + "Hazardous or Poisonous Substance", + "Health Care Activity", + "Health Care Related Organization", + "Hormone", + "Human", + "Human-caused Phenomenon or Process", + "Idea or Concept", + "Immunologic Factor", + "Indicator, Reagent, or Diagnostic Aid", + "Individual Behavior", + "Injury or Poisoning", + "Inorganic Chemical", + "Intellectual Product", + "Laboratory or Test Result", + 
"Laboratory Procedure", + "Language", + "Machine Activity", + "Mammal", + "Manufactured Object", + "Medical Device", + "Mental or Behavioral Dysfunction", + "Mental Process", + "Molecular Biology Research Technique", + "Molecular Function", + "Molecular Sequence", + "Natural Phenomenon or Process", + "Neoplastic Process", + "Nucleic Acid, Nucleoside, or Nucleotide", + "Nucleotide Sequence", + "Occupation or Discipline", + "Occupational Activity", + "Organ or Tissue Function", + "Organic Chemical", + "Organism", + "Organism Attribute", + "Organism Function", + "Organization", + "Pathologic Function", + "Patient or Disabled Group", + "Pharmacologic Substance", + "Phenomenon or Process", + "Physical Object", + "Physiologic Function", + "Plant", + "Population Group", + "Professional or Occupational Group", + "Professional Society", + "Qualitative Concept", + "Quantitative Concept", + "Receptor", + "Regulation or Law", + "Reptile", + "Research Activity", + "Research Device", + "Self-help or Relief Organization", + "Sign or Symptom", + "Social Behavior", + "Spatial Concept", + "Substance", + "Temporal Concept", + "Therapeutic or Preventive Procedure", + "Tissue", + "Vertebrate", + "Virus", + "Vitamin")) { + + if (is.null(nlp)) { + clinspacy_init() + } + + assertthat::assert_that(threshold >= 0.70 & threshold <= 0.99) + parsed_text = nlp(text) entity_nums = length(parsed_text$ents) @@ -78,22 +262,39 @@ clinspacy <- function(text) { definition = character(0), stringsAsFactors = FALSE) + return_df_list = list() + for (entity_num in seq_len(entity_nums)) { if (is.null(unlist(parsed_text$ents[[entity_num]]$`_`$kb_ents))) next temp_cuis = parsed_text$ents[[entity_num]]$`_`$kb_ents temp_cuis = unlist(temp_cuis) - temp_df = data.frame(cui = temp_cuis[seq(1, length(temp_cuis), by = 2)], stringsAsFactors = FALSE) + temp_df = data.frame(cui = temp_cuis[seq(1, length(temp_cuis), by = 2)], + confidence = temp_cuis[seq(2, length(temp_cuis), by = 2)], + stringsAsFactors = FALSE) 
temp_df$entity = parsed_text$ents[[entity_num]]$text temp_df$lemma = parsed_text$ents[[entity_num]]$lemma_ temp_df = merge(temp_df, cui2vec_definitions, all.x = TRUE) # adds semantic_type and definition + temp_df$negated = parsed_text$ents[[entity_num]]$`_`$negex - return_df = rbind(return_df, temp_df) + temp_df = temp_df[temp_df$confidence > threshold, ] + temp_df$confidence = NULL + + temp_df = temp_df[temp_df$semantic_type %in% semantic_types, ] + + return_df_list[[entity_num]] = temp_df } - return_df + if (length(return_df_list) > 0) { + return_df = rbindlist(return_df_list, use.names = TRUE, fill = TRUE) + setDF(return_df) + return(return_df) + } else + { + return(return_df) + } } @@ -104,6 +305,7 @@ clinspacy <- function(text) { #' #' @param df A data frame. #' @param text A character string containing the name of the column to process. +#' @param ... Arguments passed down to \code{\link{clinspacy}} #' @return A data frame containing the original data frame as well as additional column names #' for each UMLS concept unique identifer found with values containing frequencies. #' @@ -111,14 +313,14 @@ clinspacy <- function(text) { #' data(mtsamples) #' mtsamples_with_cuis = bind_clinspacy(mtsamples[1:5,], text = 'description') #' str(mtsamples_with_cuis) -bind_clinspacy <- function(df, text) { +bind_clinspacy <- function(df, text, ...) 
{ clinspacy_text = text assertthat::assert_that(assertthat::has_name(df, text)) assertthat::assert_that(nrow(df) > 0) df_nrow = nrow(df) dt = data.table(df)[, .(clinspacy_id = 1:.N, text = get(clinspacy_text))] - dt = dt[,clinspacy(.SD[,text]), clinspacy_id][negated == FALSE, .(clinspacy_id, cui, present = 1)] + dt = dt[,clinspacy(.SD[,text], ...), clinspacy_id][negated == FALSE, .(clinspacy_id, cui, present = 1)] dt = dcast(dt, clinspacy_id ~ cui, value.var = 'present', fun.aggregate = sum) dt2 = data.table(clinspacy_id = 1:df_nrow) dt = merge(dt, dt2, all.y=TRUE) @@ -127,13 +329,31 @@ bind_clinspacy <- function(df, text) { cbind(df, as.data.frame(dt)) } -#' This function binds columns containing concept unique identifiers with which scispacy has +#' This function binds columns containing concept embeddings for concepts with which scispacy has #' 99 percent confidence of being present with values containing frequencies. Negated concepts, #' as identified by negspacy's NegEx implementation, are ignored and do not count towards -#' the frequencies. +#' the embeddings. The concept embeddings are derived from the cui2vec_embeddings dataset +#' included with this package. +#' +#' The embeddings are derived from Andrew Beam's +#' \href{https://github.com/beamandrew/cui2vec}{cui2vec R package}. +#' +#' Citation +#' +#' Beam, A.L., Kompa, B., Schmaltz, A., Fried, I., Griffin, W., Palmer, N.P., Shi, X., +#' Cai, T., and Kohane, I.S., 2019. Clinical Concept Embeddings Learned from Massive +#' Sources of Multimodal Medical Data. arXiv preprint arXiv:1804.01486. +#' +#' License +#' +#' This data is made available under a +#' \href{https://creativecommons.org/licenses/by/4.0/}{CC BY 4.0 license}. The only change +#' made to the original dataset is the renaming of columns. #' #' @param df A data frame. #' @param text A character string containing the name of the column to process. +#' @param num_embeddings The number of embeddings to return (must be a number 1 through 500). 
+#' @param ... Arguments passed down to \code{\link{clinspacy}} #' @return A data frame containing the original data frame as well as additional column names #' for each UMLS concept unique identifer found with values containing frequencies. #' @@ -141,14 +361,18 @@ bind_clinspacy <- function(df, text) { #' data(mtsamples) #' mtsamples_with_cuis = bind_clinspacy(mtsamples[1:5,], text = 'description') #' str(mtsamples_with_cuis) -bind_clinspacy_embeddings <- function(df, text, num_embeddings = 500) { +bind_clinspacy_embeddings <- function(df, text, + num_embeddings = 500, ...) { + + assertthat::assert_that(num_embeddings >= 1 & num_embeddings <= 500) + clinspacy_text = text assertthat::assert_that(assertthat::has_name(df, text)) assertthat::assert_that(nrow(df) > 0) df_nrow = nrow(df) dt = data.table(df)[, .(clinspacy_id = 1:.N, text = get(clinspacy_text))] - dt = dt[, clinspacy(.SD[,text]), clinspacy_id] + dt = dt[, clinspacy(.SD[,text], ...), clinspacy_id] dt = dt[negated == FALSE] dt[, n := .N, by = .(clinspacy_id, cui)] dt = merge(dt, cui2vec_embeddings) # inner join on cui @@ -159,5 +383,6 @@ bind_clinspacy_embeddings <- function(df, text, num_embeddings = 500) { dt2 = data.table(clinspacy_id = 1:df_nrow) dt = merge(dt, dt2, all.y=TRUE) dt[, clinspacy_id := NULL] + dt = dt[, 1:num_embeddings] cbind(df, as.data.frame(dt)) } diff --git a/R/cui2vec_data.R b/R/cui2vec_data.R index a833e3f..e91d33d 100644 --- a/R/cui2vec_data.R +++ b/R/cui2vec_data.R @@ -1,41 +1,50 @@ #' Cui2vec concept embeddings #' -#' This dataset contains sample medical transcriptions for various medical specialties. +#' This dataset contains Unified Medical Language System (UMLS) concept embeddings from +#' Andrew Beam's \href{https://github.com/beamandrew/cui2vec}{cui2vec R package}. There are +#' 500 embeddings included for each concept. 
#' -#' Acknowledgements +#' Citation #' -#' This data was scraped from mtsamples.com by Tara Boyle and is made available -#' under a CC0: Public Domain license. +#' Beam, A.L., Kompa, B., Schmaltz, A., Fried, I., Griffin, W., Palmer, N.P., Shi, X., +#' Cai, T., and Kohane, I.S., 2019. Clinical Concept Embeddings Learned from Massive +#' Sources of Multimodal Medical Data. arXiv preprint arXiv:1804.01486. #' -#' @format A data frame with 4999 rows and 6 variables: +#' License +#' +#' This data is made available under a +#' \href{https://creativecommons.org/licenses/by/4.0/}{CC BY 4.0 license}. The only change +#' made to the original dataset is the renaming of columns. +#' +#' @format A data frame with 109053 rows and 501 variables: #' \describe{ -#' \item{note_id}{A unique identifier for each note} -#' \item{description}{A description or chief concern} -#' \item{medical_specialty}{Medical specialty of the note} -#' \item{sample_name}{mtsamples.com note name} -#' \item{transcription}{Transcription of note text} -#' \item{keywords}{Keywords} +#' \item{cui}{A Unified Medical Language System (UMLS) Concept Unique Identifier (CUI)} +#' \item{emb_001}{Concept embedding vector #1} +#' \item{emb_002}{Concept embedding vector #2} +#' \item{...}{...} +#' \item{emb_500}{Concept embedding vector #500} #' } -#' @source \url{https://www.kaggle.com/tboyle10/medicaltranscriptions/data} +#' @source \url{https://figshare.com/s/00d69861786cd0156d81} 'cui2vec_embeddings' #' Cui2vec concept definitions #' -#' This dataset contains sample medical transcriptions for various medical specialties. +#' This dataset contains definitions for the Unified Medical Language System (UMLS) +#' Concept Unique Identifiers (CUIs). These come from Andrew Beam's +#' \href{https://github.com/beamandrew/cui2vec}{cui2vec R package}. #' -#' Acknowledgements +#' License #' -#' This data was scraped from mtsamples.com by Tara Boyle and is made available -#' under a CC0: Public Domain license. 
+#' This data is made available under a +#' \href{https://github.com/beamandrew/cui2vec/blob/master/LICENSE.md}{MIT license}. The data +#' is copyrighted in 2019 by Benjamin Kompa, Andrew Beam, and Allen Schmaltz. The only change +#' made to the original dataset is the renaming of columns. #' -#' @format A data frame with 4999 rows and 6 variables: +#' @format A data frame with 3053795 rows and 3 variables: #' \describe{ -#' \item{note_id}{A unique identifier for each note} -#' \item{description}{A description or chief concern} -#' \item{medical_specialty}{Medical specialty of the note} -#' \item{sample_name}{mtsamples.com note name} -#' \item{transcription}{Transcription of note text} -#' \item{keywords}{Keywords} +#' \item{cui}{A Unified Medical Language System (UMLS) Concept Unique Identifier (CUI)} +#' \item{semantic_type}{Semantic type of the CUI} +#' \item{definition}{Definition of the CUI} #' } -#' @source \url{https://www.kaggle.com/tboyle10/medicaltranscriptions/data} +#' @source \url{https://github.com/beamandrew/cui2vec} 'cui2vec_definitions' diff --git a/R/mtsamples.R b/R/mtsamples.R index 8e74ae2..6b958d2 100644 --- a/R/mtsamples.R +++ b/R/mtsamples.R @@ -4,8 +4,11 @@ #' #' Acknowledgements #' -#' This data was scraped from mtsamples.com by Tara Boyle and is made available -#' under a CC0: Public Domain license. +#' This data was scraped from \href{https://mtsamples.com}{https://mtsamples.com} by Tara Boyle. +#' +#' License +#' This data is made available under a +#' \href{https://creativecommons.org/share-your-work/public-domain/cc0/}{CC0: Public Domain license}. 
#' #' @format A data frame with 4999 rows and 6 variables: #' \describe{ diff --git a/README.Rmd b/README.Rmd index 79db56a..d027dbc 100644 --- a/README.Rmd +++ b/README.Rmd @@ -29,12 +29,18 @@ You can install the GitHub version of clinspacy with: remotes::install_github('ML4LHS/clinspacy', INSTALL_opts = '--no-multiarch') ``` -## Example +## Examples ```{r} library(clinspacy) clinspacy('This patient has diabetes and CKD stage 3 but no HTN.') + +clinspacy('This patient is taking omeprazole, Protonix, and lisinopril 10 mg. He has diabetes.', + semantic_types = 'Disease or Syndrome') + +clinspacy('This patient is taking omeprazole, Protonix, and lisinopril 10 mg. He has diabetes.', + semantic_types = 'Pharmacologic Substance') ``` ## Using the mtsamples dataset @@ -42,7 +48,7 @@ clinspacy('This patient has diabetes and CKD stage 3 but no HTN.') ```{r} data(mtsamples) -str(mtsamples[1:5,]) +mtsamples[1:5,] ``` @@ -51,8 +57,30 @@ str(mtsamples[1:5,]) This function binds columns containing concept unique identifiers with which scispacy has 99% confidence of being present with values containing frequencies. Negated concepts, as identified by negspacy's NegEx implementation, are ignored and do not count towards the frequencies. 
```{r} -mtsamples_with_cuis = bind_clinspacy(mtsamples[1:5,], text = 'description') +bind_clinspacy(mtsamples[1:5, 1:2], + text = 'description') -str(mtsamples_with_cuis) +bind_clinspacy(mtsamples[1:5, 1:2], + text = 'description', + semantic_types = 'Diagnostic Procedure') ``` +## Binding Concept Embeddings to a Data Frame + +```{r} +bind_clinspacy_embeddings(mtsamples[1:5, 1:2], + text = 'description', + num_embeddings = 5) + +bind_clinspacy_embeddings(mtsamples[1:5, 1:2], + text = 'description', + num_embeddings = 5, + semantic_types = 'Diagnostic Procedure') +``` + +## UMLS CUI definitions + +```{r} +data(cui2vec_definitions) +head(cui2vec_definitions) +``` diff --git a/README.html b/README.html index 1119af0..f7d2ff5 100644 --- a/README.html +++ b/README.html @@ -612,51 +612,130 @@

clinspacy

Installation

You can install the GitHub version of clinspacy with:

remotes::install_github('ML4LHS/clinspacy', INSTALL_opts = '--no-multiarch')
-

Example

+

Examples

library(clinspacy)
 
 clinspacy('This patient has diabetes and CKD stage 3 but no HTN.')
-#>        cui      entity       lemma negated
-#> 1 C0030705     patient     patient   FALSE
-#> 2 C1705908     patient     patient   FALSE
-#> 3 C1578481     patient     patient   FALSE
-#> 4 C1578485     patient     patient   FALSE
-#> 5 C1578486     patient     patient   FALSE
-#> 6 C0011847    diabetes     diabete   FALSE
-#> 7 C0011849    diabetes     diabete   FALSE
-#> 8 C2316787 CKD stage 3 ckd stage 3   FALSE
-#> 9 C0020538         HTN         htn    TRUE
+#> cui entity lemma semantic_type definition negated +#> 1 C0030705 patient patient Patient or Disabled Group Patients FALSE +#> 2 C1550655 patient patient Body Substance Specimen Type - Patient FALSE +#> 3 C1578483 patient patient Idea or Concept Report source - Patient FALSE +#> 4 C1578484 patient patient Idea or Concept Relationship modifier - Patient FALSE +#> 5 C1578486 patient patient Intellectual Product Disabled Person Code - Patient FALSE +#> 6 C0011847 diabetes diabetes Disease or Syndrome Diabetes FALSE +#> 7 C0011849 diabetes diabetes Disease or Syndrome Diabetes Mellitus FALSE +#> 8 C2316787 CKD stage 3 ckd stage 3 Disease or Syndrome Chronic kidney disease stage 3 FALSE +#> 9 C0020538 HTN htn Disease or Syndrome Hypertensive disease TRUE + +clinspacy('This patient is taking omeprazole, Protonix, and lisinopril 10 mg. He has diabetes.', + semantic_types = 'Disease or Syndrome') +#> cui entity lemma semantic_type definition negated +#> 1 C0011847 diabetes diabete Disease or Syndrome Diabetes FALSE +#> 2 C0011849 diabetes diabete Disease or Syndrome Diabetes Mellitus FALSE + +clinspacy('This patient is taking omeprazole, Protonix, and lisinopril 10 mg. He has diabetes.', + semantic_types = 'Pharmacologic Substance') +#> cui entity lemma semantic_type definition negated +#> 1 C0028978 omeprazole omeprazole Pharmacologic Substance Omeprazole FALSE +#> 2 C0876139 Protonix Protonix Pharmacologic Substance Protonix FALSE +#> 3 C0065374 lisinopril lisinopril Pharmacologic Substance Lisinopril FALSE

Using the mtsamples dataset

data(mtsamples)
 
-str(mtsamples[1:5,])
-#> 'data.frame':    5 obs. of  6 variables:
-#>  $ note_id          : int  1 2 3 4 5
-#>  $ description      : chr  "A 23-year-old white female presents with complaint of allergies." "Consult for laparoscopic gastric bypass." "Consult for laparoscopic gastric bypass." "2-D M-Mode. Doppler." ...
-#>  $ medical_specialty: chr  "Allergy / Immunology" "Bariatrics" "Bariatrics" "Cardiovascular / Pulmonary" ...
-#>  $ sample_name      : chr  "Allergic Rhinitis" "Laparoscopic Gastric Bypass Consult - 2" "Laparoscopic Gastric Bypass Consult - 1" "2-D Echocardiogram - 1" ...
-#>  $ transcription    : chr  "SUBJECTIVE:,  This 23-year-old white female presents with complaint of allergies.  She used to have allergies w"| __truncated__ "PAST MEDICAL HISTORY:, He has difficulty climbing stairs, difficulty with airline seats, tying shoes, used to p"| __truncated__ "HISTORY OF PRESENT ILLNESS: , I have seen ABC today.  He is a very pleasant gentleman who is 42 years old, 344 "| __truncated__ "2-D M-MODE: , ,1.  Left atrial enlargement with left atrial diameter of 4.7 cm.,2.  Normal size right and left "| __truncated__ ...
-#>  $ keywords         : chr  "allergy / immunology, allergic rhinitis, allergies, asthma, nasal sprays, rhinitis, nasal, erythematous, allegr"| __truncated__ "bariatrics, laparoscopic gastric bypass, weight loss programs, gastric bypass, atkin's diet, weight watcher's, "| __truncated__ "bariatrics, laparoscopic gastric bypass, heart attacks, body weight, pulmonary embolism, potential complication"| __truncated__ "cardiovascular / pulmonary, 2-d m-mode, doppler, aortic valve, atrial enlargement, diastolic function, ejection"| __truncated__ ...
+mtsamples[1:5,] +#> note_id description medical_specialty +#> 1 1 A 23-year-old white female presents with complaint of allergies. Allergy / Immunology +#> 2 2 Consult for laparoscopic gastric bypass. Bariatrics +#> 3 3 Consult for laparoscopic gastric bypass. Bariatrics +#> 4 4 2-D M-Mode. Doppler. Cardiovascular / Pulmonary +#> 5 5 2-D Echocardiogram Cardiovascular / Pulmonary +#> sample_name +#> 1 Allergic Rhinitis +#> 2 Laparoscopic Gastric Bypass Consult - 2 +#> 3 Laparoscopic Gastric Bypass Consult - 1 +#> 4 2-D Echocardiogram - 1 +#> 5 2-D Echocardiogram - 2 +#> transcription +#> 1 SUBJECTIVE:, This 23-year-old white female presents with complaint of allergies. She used to have allergies when she lived in Seattle but she thinks they are worse here. In the past, she has tried Claritin, and Zyrtec. Both worked for short time but then seemed to lose effectiveness. She has used Allegra also. She used that last summer and she began using it again two weeks ago. It does not appear to be working very well. She has used over-the-counter sprays but no prescription nasal sprays. She does have asthma but doest not require daily medication for this and does not think it is flaring up.,MEDICATIONS: , Her only medication currently is Ortho Tri-Cyclen and the Allegra.,ALLERGIES: , She has no known medicine allergies.,OBJECTIVE:,Vitals: Weight was 130 pounds and blood pressure 124/78.,HEENT: Her throat was mildly erythematous without exudate. Nasal mucosa was erythematous and swollen. Only clear drainage was seen. TMs were clear.,Neck: Supple without adenopathy.,Lungs: Clear.,ASSESSMENT:, Allergic rhinitis.,PLAN:,1. She will try Zyrtec instead of Allegra again. Another option will be to use loratadine. She does not think she has prescription coverage so that might be cheaper.,2. Samples of Nasonex two sprays in each nostril given for three weeks. A prescription was written as well. 
+#> 2 PAST MEDICAL HISTORY:, He has difficulty climbing stairs, difficulty with airline seats, tying shoes, used to public seating, and lifting objects off the floor. He exercises three times a week at home and does cardio. He has difficulty walking two blocks or five flights of stairs. Difficulty with snoring. He has muscle and joint pains including knee pain, back pain, foot and ankle pain, and swelling. He has gastroesophageal reflux disease.,PAST SURGICAL HISTORY:, Includes reconstructive surgery on his right hand 13 years ago. ,SOCIAL HISTORY:, He is currently single. He has about ten drinks a year. He had smoked significantly up until several months ago. He now smokes less than three cigarettes a day.,FAMILY HISTORY:, Heart disease in both grandfathers, grandmother with stroke, and a grandmother with diabetes. Denies obesity and hypertension in other family members.,CURRENT MEDICATIONS:, None.,ALLERGIES:, He is allergic to Penicillin.,MISCELLANEOUS/EATING HISTORY:, He has been going to support groups for seven months with Lynn Holmberg in Greenwich and he is from Eastchester, New York and he feels that we are the appropriate program. He had a poor experience with the Greenwich program. Eating history, he is not an emotional eater. Does not like sweets. He likes big portions and carbohydrates. He likes chicken and not steak. He currently weighs 312 pounds. Ideal body weight would be 170 pounds. He is 142 pounds overweight. If ,he lost 60% of his excess body weight that would be 84 pounds and he should weigh about 228.,REVIEW OF SYSTEMS: ,Negative for head, neck, heart, lungs, GI, GU, orthopedic, and skin. 
Specifically denies chest pain, heart attack, coronary artery disease, congestive heart failure, arrhythmia, atrial fibrillation, pacemaker, high cholesterol, pulmonary embolism, high blood pressure, CVA, venous insufficiency, thrombophlebitis, asthma, shortness of breath, COPD, emphysema, sleep apnea, diabetes, leg and foot swelling, osteoarthritis, rheumatoid arthritis, hiatal hernia, peptic ulcer disease, gallstones, infected gallbladder, pancreatitis, fatty liver, hepatitis, hemorrhoids, rectal bleeding, polyps, incontinence of stool, urinary stress incontinence, or cancer. Denies cellulitis, pseudotumor cerebri, meningitis, or encephalitis.,PHYSICAL EXAMINATION:, He is alert and oriented x 3. Cranial nerves II-XII are intact. Afebrile. Vital Signs are stable. +#> 3 HISTORY OF PRESENT ILLNESS: , I have seen ABC today. He is a very pleasant gentleman who is 42 years old, 344 pounds. He is 5'9". He has a BMI of 51. He has been overweight for ten years since the age of 33, at his highest he was 358 pounds, at his lowest 260. He is pursuing surgical attempts of weight loss to feel good, get healthy, and begin to exercise again. He wants to be able to exercise and play volleyball. Physically, he is sluggish. He gets tired quickly. He does not go out often. When he loses weight he always regains it and he gains back more than he lost. His biggest weight loss is 25 pounds and it was three months before he gained it back. He did six months of not drinking alcohol and not taking in many calories. He has been on multiple commercial weight loss programs including Slim Fast for one month one year ago and Atkin's Diet for one month two years ago.,PAST MEDICAL HISTORY: , He has difficulty climbing stairs, difficulty with airline seats, tying shoes, used to public seating, difficulty walking, high cholesterol, and high blood pressure. He has asthma and difficulty walking two blocks or going eight to ten steps. He has sleep apnea and snoring. He is a diabetic, on medication. 
He has joint pain, knee pain, back pain, foot and ankle pain, leg and foot swelling. He has hemorrhoids.,PAST SURGICAL HISTORY: , Includes orthopedic or knee surgery.,SOCIAL HISTORY: , He is currently single. He drinks alcohol ten to twelve drinks a week, but does not drink five days a week and then will binge drink. He smokes one and a half pack a day for 15 years, but he has recently stopped smoking for the past two weeks.,FAMILY HISTORY: , Obesity, heart disease, and diabetes. Family history is negative for hypertension and stroke.,CURRENT MEDICATIONS:, Include Diovan, Crestor, and Tricor.,MISCELLANEOUS/EATING HISTORY: ,He says a couple of friends of his have had heart attacks and have had died. He used to drink everyday, but stopped two years ago. He now only drinks on weekends. He is on his second week of Chantix, which is a medication to come off smoking completely. Eating, he eats bad food. He is single. He eats things like bacon, eggs, and cheese, cheeseburgers, fast food, eats four times a day, seven in the morning, at noon, 9 p.m., and 2 a.m. He currently weighs 344 pounds and 5'9". His ideal body weight is 160 pounds. He is 184 pounds overweight. If he lost 70% of his excess body weight that would be 129 pounds and that would get him down to 215.,REVIEW OF SYSTEMS: , Negative for head, neck, heart, lungs, GI, GU, orthopedic, or skin. He also is positive for gout. He denies chest pain, heart attack, coronary artery disease, congestive heart failure, arrhythmia, atrial fibrillation, pacemaker, pulmonary embolism, or CVA. He denies venous insufficiency or thrombophlebitis. Denies shortness of breath, COPD, or emphysema. Denies thyroid problems, hip pain, osteoarthritis, rheumatoid arthritis, GERD, hiatal hernia, peptic ulcer disease, gallstones, infected gallbladder, pancreatitis, fatty liver, hepatitis, rectal bleeding, polyps, incontinence of stool, urinary stress incontinence, or cancer. 
He denies cellulitis, pseudotumor cerebri, meningitis, or encephalitis.,PHYSICAL EXAMINATION: ,He is alert and oriented x 3. Cranial nerves II-XII are intact. Neck is soft and supple. Lungs: He has positive wheezing bilaterally. Heart is regular rhythm and rate. His abdomen is soft. Extremities: He has 1+ pitting edema.,IMPRESSION/PLAN:, I have explained to him the risks and potential complications of laparoscopic gastric bypass in detail and these include bleeding, infection, deep venous thrombosis, pulmonary embolism, leakage from the gastrojejuno-anastomosis, jejunojejuno-anastomosis, and possible bowel obstruction among other potential complications. He understands. He wants to proceed with workup and evaluation for laparoscopic Roux-en-Y gastric bypass. He will need to get a letter of approval from Dr. XYZ. He will need to see a nutritionist and mental health worker. He will need an upper endoscopy by either Dr. XYZ. He will need to go to Dr. XYZ as he previously had a sleep study. We will need another sleep study. He will need H. pylori testing, thyroid function tests, LFTs, glycosylated hemoglobin, and fasting blood sugar. After this is performed, we will submit him for insurance approval. +#> 4 2-D M-MODE: , ,1. Left atrial enlargement with left atrial diameter of 4.7 cm.,2. Normal size right and left ventricle.,3. Normal LV systolic function with left ventricular ejection fraction of 51%.,4. Normal LV diastolic function.,5. No pericardial effusion.,6. Normal morphology of aortic valve, mitral valve, tricuspid valve, and pulmonary valve.,7. PA systolic pressure is 36 mmHg.,DOPPLER: , ,1. Mild mitral and tricuspid regurgitation.,2. Trace aortic and pulmonary regurgitation. +#> 5 1. The left ventricular cavity size and wall thickness appear normal. The wall motion and left ventricular systolic function appears hyperdynamic with estimated ejection fraction of 70% to 75%. There is near-cavity obliteration seen. 
There also appears to be increased left ventricular outflow tract gradient at the mid cavity level consistent with hyperdynamic left ventricular systolic function. There is abnormal left ventricular relaxation pattern seen as well as elevated left atrial pressures seen by Doppler examination.,2. The left atrium appears mildly dilated.,3. The right atrium and right ventricle appear normal.,4. The aortic root appears normal.,5. The aortic valve appears calcified with mild aortic valve stenosis, calculated aortic valve area is 1.3 cm square with a maximum instantaneous gradient of 34 and a mean gradient of 19 mm.,6. There is mitral annular calcification extending to leaflets and supportive structures with thickening of mitral valve leaflets with mild mitral regurgitation.,7. The tricuspid valve appears normal with trace tricuspid regurgitation with moderate pulmonary artery hypertension. Estimated pulmonary artery systolic pressure is 49 mmHg. Estimated right atrial pressure of 10 mmHg.,8. The pulmonary valve appears normal with trace pulmonary insufficiency.,9. There is no pericardial effusion or intracardiac mass seen.,10. There is a color Doppler suggestive of a patent foramen ovale with lipomatous hypertrophy of the interatrial septum.,11. 
The study was somewhat technically limited and hence subtle abnormalities could be missed from the study., +#> keywords +#> 1 allergy / immunology, allergic rhinitis, allergies, asthma, nasal sprays, rhinitis, nasal, erythematous, allegra, sprays, allergic, +#> 2 bariatrics, laparoscopic gastric bypass, weight loss programs, gastric bypass, atkin's diet, weight watcher's, body weight, laparoscopic gastric, weight loss, pounds, months, weight, laparoscopic, band, loss, diets, overweight, lost +#> 3 bariatrics, laparoscopic gastric bypass, heart attacks, body weight, pulmonary embolism, potential complications, sleep study, weight loss, gastric bypass, anastomosis, loss, sleep, laparoscopic, gastric, bypass, heart, pounds, weight, +#> 4 cardiovascular / pulmonary, 2-d m-mode, doppler, aortic valve, atrial enlargement, diastolic function, ejection fraction, mitral, mitral valve, pericardial effusion, pulmonary valve, regurgitation, systolic function, tricuspid, tricuspid valve, normal lv +#> 5 cardiovascular / pulmonary, 2-d, doppler, echocardiogram, annular, aortic root, aortic valve, atrial, atrium, calcification, cavity, ejection fraction, mitral, obliteration, outflow, regurgitation, relaxation pattern, stenosis, systolic function, tricuspid, valve, ventricular, ventricular cavity, wall motion, pulmonary artery

Binding UMLS Concept Unique Identifiers to a Data Frame

This function binds new columns to a data frame, one for each UMLS concept unique identifier (CUI) that scispacy identifies in the text with 99% confidence. The values in each column are the frequencies with which that concept appears in the corresponding row's text. Negated concepts, as identified by negspacy’s NegEx implementation, are ignored and do not count towards the frequencies.

-
mtsamples_with_cuis = bind_clinspacy(mtsamples[1:5,], text = 'description')
-
-str(mtsamples_with_cuis)
-#> 'data.frame':    5 obs. of  14 variables:
-#>  $ note_id          : int  1 2 3 4 5
-#>  $ description      : chr  "A 23-year-old white female presents with complaint of allergies." "Consult for laparoscopic gastric bypass." "Consult for laparoscopic gastric bypass." "2-D M-Mode. Doppler." ...
-#>  $ medical_specialty: chr  "Allergy / Immunology" "Bariatrics" "Bariatrics" "Cardiovascular / Pulmonary" ...
-#>  $ sample_name      : chr  "Allergic Rhinitis" "Laparoscopic Gastric Bypass Consult - 2" "Laparoscopic Gastric Bypass Consult - 1" "2-D Echocardiogram - 1" ...
-#>  $ transcription    : chr  "SUBJECTIVE:,  This 23-year-old white female presents with complaint of allergies.  She used to have allergies w"| __truncated__ "PAST MEDICAL HISTORY:, He has difficulty climbing stairs, difficulty with airline seats, tying shoes, used to p"| __truncated__ "HISTORY OF PRESENT ILLNESS: , I have seen ABC today.  He is a very pleasant gentleman who is 42 years old, 344 "| __truncated__ "2-D M-MODE: , ,1.  Left atrial enlargement with left atrial diameter of 4.7 cm.,2.  Normal size right and left "| __truncated__ ...
-#>  $ keywords         : chr  "allergy / immunology, allergic rhinitis, allergies, asthma, nasal sprays, rhinitis, nasal, erythematous, allegr"| __truncated__ "bariatrics, laparoscopic gastric bypass, weight loss programs, gastric bypass, atkin's diet, weight watcher's, "| __truncated__ "bariatrics, laparoscopic gastric bypass, heart attacks, body weight, pulmonary embolism, potential complication"| __truncated__ "cardiovascular / pulmonary, 2-d m-mode, doppler, aortic valve, atrial enlargement, diastolic function, ejection"| __truncated__ ...
-#>  $ C0009818         : num  0 1 1 0 0
-#>  $ C0020517         : num  1 0 0 0 0
-#>  $ C0277786         : num  1 0 0 0 0
-#>  $ C0554756         : num  0 0 0 1 0
-#>  $ C1705052         : num  0 0 0 1 0
-#>  $ C3864418         : num  1 0 0 0 0
-#>  $ C4039248         : num  0 1 1 0 0
-#>  $ C4331911         : num  0 0 0 1 0
+
bind_clinspacy(mtsamples[1:5, 1:2],
+               text = 'description')
+#>   note_id                                                      description C0009818 C0013516 C0020517
+#> 1       1 A 23-year-old white female presents with complaint of allergies.        0        0        1
+#> 2       2                         Consult for laparoscopic gastric bypass.        1        0        0
+#> 3       3                         Consult for laparoscopic gastric bypass.        1        0        0
+#> 4       4                                             2-D M-Mode. Doppler.        0        0        0
+#> 5       5                                               2-D Echocardiogram        0        1        0
+#>   C0554756 C1705052 C2243117 C3864418 C4039248
+#> 1        0        0        0        1        0
+#> 2        0        0        0        0        1
+#> 3        0        0        0        0        1
+#> 4        1        0        0        0        0
+#> 5        0        1        1        0        0
+
+bind_clinspacy(mtsamples[1:5, 1:2],
+               text = 'description',
+               semantic_types = 'Diagnostic Procedure')
+#>   note_id                                                      description C0013516 C0554756
+#> 1       1 A 23-year-old white female presents with complaint of allergies.        0        0
+#> 2       2                         Consult for laparoscopic gastric bypass.        0        0
+#> 3       3                         Consult for laparoscopic gastric bypass.        0        0
+#> 4       4                                             2-D M-Mode. Doppler.        0        1
+#> 5       5                                               2-D Echocardiogram        1        0
+

Binding Concept Embeddings to a Data Frame

+
bind_clinspacy_embeddings(mtsamples[1:5, 1:2],
+                          text = 'description',
+                          num_embeddings = 5)
+#>   note_id                                                      description     emb_001    emb_002
+#> 1       1 A 23-year-old white female presents with complaint of allergies. -0.02252676 0.00981737
+#> 2       2                         Consult for laparoscopic gastric bypass. -0.06431815 0.02979208
+#> 3       3                         Consult for laparoscopic gastric bypass. -0.06431815 0.02979208
+#> 4       4                                             2-D M-Mode. Doppler. -0.06111055 0.03059523
+#> 5       5                                               2-D Echocardiogram -0.08545282 0.03965676
+#>         emb_003      emb_004     emb_005
+#> 1 -7.112366e-17 -0.015715369  0.00204883
+#> 2 -1.353084e-16 -0.046832239  0.03387485
+#> 3 -1.353084e-16 -0.046832239  0.03387485
+#> 4 -1.340074e-16 -0.032813400 -0.02400309
+#> 5 -4.336809e-17 -0.008077436 -0.04463792
+
+bind_clinspacy_embeddings(mtsamples[1:5, 1:2],
+                          text = 'description',
+                          num_embeddings = 5,
+                          semantic_types = 'Diagnostic Procedure')
+#>   note_id                                                      description     emb_001    emb_002
+#> 1       1 A 23-year-old white female presents with complaint of allergies.          NA         NA
+#> 2       2                         Consult for laparoscopic gastric bypass.          NA         NA
+#> 3       3                         Consult for laparoscopic gastric bypass.          NA         NA
+#> 4       4                                             2-D M-Mode. Doppler. -0.06111055 0.03059523
+#> 5       5                                               2-D Echocardiogram -0.08545282 0.03965676
+#>         emb_003      emb_004     emb_005
+#> 1            NA           NA          NA
+#> 2            NA           NA          NA
+#> 3            NA           NA          NA
+#> 4 -1.340074e-16 -0.032813400 -0.02400309
+#> 5 -4.336809e-17 -0.008077436 -0.04463792
+

UMLS CUI definitions

+
data(cui2vec_definitions)
+head(cui2vec_definitions)
+#>        cui                         semantic_type                         definition
+#> 1 C0000005       Amino Acid, Peptide, or Protein     (131)I-Macroaggregated Albumin
+#> 2 C0000005               Pharmacologic Substance     (131)I-Macroaggregated Albumin
+#> 3 C0000005 Indicator, Reagent, or Diagnostic Aid     (131)I-Macroaggregated Albumin
+#> 4 C0000039                      Organic Chemical 1,2-Dipalmitoylphosphatidylcholine
+#> 5 C0000039               Pharmacologic Substance 1,2-Dipalmitoylphosphatidylcholine
+#> 6 C0000052       Amino Acid, Peptide, or Protein  1,4-alpha-Glucan Branching Enzyme
diff --git a/README.md b/README.md index 7ad7414..7ec185e 100644 --- a/README.md +++ b/README.md @@ -22,22 +22,35 @@ with: remotes::install_github('ML4LHS/clinspacy', INSTALL_opts = '--no-multiarch') ``` -## Example +## Examples ``` r library(clinspacy) clinspacy('This patient has diabetes and CKD stage 3 but no HTN.') -#> cui entity lemma negated -#> 1 C0030705 patient patient FALSE -#> 2 C1705908 patient patient FALSE -#> 3 C1578481 patient patient FALSE -#> 4 C1578485 patient patient FALSE -#> 5 C1578486 patient patient FALSE -#> 6 C0011847 diabetes diabete FALSE -#> 7 C0011849 diabetes diabete FALSE -#> 8 C2316787 CKD stage 3 ckd stage 3 FALSE -#> 9 C0020538 HTN htn TRUE +#> cui entity lemma semantic_type definition negated +#> 1 C0030705 patient patient Patient or Disabled Group Patients FALSE +#> 2 C1550655 patient patient Body Substance Specimen Type - Patient FALSE +#> 3 C1578483 patient patient Idea or Concept Report source - Patient FALSE +#> 4 C1578484 patient patient Idea or Concept Relationship modifier - Patient FALSE +#> 5 C1578486 patient patient Intellectual Product Disabled Person Code - Patient FALSE +#> 6 C0011847 diabetes diabetes Disease or Syndrome Diabetes FALSE +#> 7 C0011849 diabetes diabetes Disease or Syndrome Diabetes Mellitus FALSE +#> 8 C2316787 CKD stage 3 ckd stage 3 Disease or Syndrome Chronic kidney disease stage 3 FALSE +#> 9 C0020538 HTN htn Disease or Syndrome Hypertensive disease TRUE + +clinspacy('This patient is taking omeprazole, Protonix, and lisinopril 10 mg. He has diabetes.', + semantic_types = 'Disease or Syndrome') +#> cui entity lemma semantic_type definition negated +#> 1 C0011847 diabetes diabete Disease or Syndrome Diabetes FALSE +#> 2 C0011849 diabetes diabete Disease or Syndrome Diabetes Mellitus FALSE + +clinspacy('This patient is taking omeprazole, Protonix, and lisinopril 10 mg. 
He has diabetes.', + semantic_types = 'Pharmacologic Substance') +#> cui entity lemma semantic_type definition negated +#> 1 C0028978 omeprazole omeprazole Pharmacologic Substance Omeprazole FALSE +#> 2 C0876139 Protonix Protonix Pharmacologic Substance Protonix FALSE +#> 3 C0065374 lisinopril lisinopril Pharmacologic Substance Lisinopril FALSE ``` ## Using the mtsamples dataset @@ -45,14 +58,31 @@ clinspacy('This patient has diabetes and CKD stage 3 but no HTN.') ``` r data(mtsamples) -str(mtsamples[1:5,]) -#> 'data.frame': 5 obs. of 6 variables: -#> $ note_id : int 1 2 3 4 5 -#> $ description : chr "A 23-year-old white female presents with complaint of allergies." "Consult for laparoscopic gastric bypass." "Consult for laparoscopic gastric bypass." "2-D M-Mode. Doppler." ... -#> $ medical_specialty: chr "Allergy / Immunology" "Bariatrics" "Bariatrics" "Cardiovascular / Pulmonary" ... -#> $ sample_name : chr "Allergic Rhinitis" "Laparoscopic Gastric Bypass Consult - 2" "Laparoscopic Gastric Bypass Consult - 1" "2-D Echocardiogram - 1" ... -#> $ transcription : chr "SUBJECTIVE:, This 23-year-old white female presents with complaint of allergies. She used to have allergies w"| __truncated__ "PAST MEDICAL HISTORY:, He has difficulty climbing stairs, difficulty with airline seats, tying shoes, used to p"| __truncated__ "HISTORY OF PRESENT ILLNESS: , I have seen ABC today. He is a very pleasant gentleman who is 42 years old, 344 "| __truncated__ "2-D M-MODE: , ,1. Left atrial enlargement with left atrial diameter of 4.7 cm.,2. Normal size right and left "| __truncated__ ... 
-#> $ keywords : chr "allergy / immunology, allergic rhinitis, allergies, asthma, nasal sprays, rhinitis, nasal, erythematous, allegr"| __truncated__ "bariatrics, laparoscopic gastric bypass, weight loss programs, gastric bypass, atkin's diet, weight watcher's, "| __truncated__ "bariatrics, laparoscopic gastric bypass, heart attacks, body weight, pulmonary embolism, potential complication"| __truncated__ "cardiovascular / pulmonary, 2-d m-mode, doppler, aortic valve, atrial enlargement, diastolic function, ejection"| __truncated__ ... +mtsamples[1:5,] +#> note_id description medical_specialty +#> 1 1 A 23-year-old white female presents with complaint of allergies. Allergy / Immunology +#> 2 2 Consult for laparoscopic gastric bypass. Bariatrics +#> 3 3 Consult for laparoscopic gastric bypass. Bariatrics +#> 4 4 2-D M-Mode. Doppler. Cardiovascular / Pulmonary +#> 5 5 2-D Echocardiogram Cardiovascular / Pulmonary +#> sample_name +#> 1 Allergic Rhinitis +#> 2 Laparoscopic Gastric Bypass Consult - 2 +#> 3 Laparoscopic Gastric Bypass Consult - 1 +#> 4 2-D Echocardiogram - 1 +#> 5 2-D Echocardiogram - 2 +#> transcription +#> 1 SUBJECTIVE:, This 23-year-old white female presents with complaint of allergies. She used to have allergies when she lived in Seattle but she thinks they are worse here. In the past, she has tried Claritin, and Zyrtec. Both worked for short time but then seemed to lose effectiveness. She has used Allegra also. She used that last summer and she began using it again two weeks ago. It does not appear to be working very well. She has used over-the-counter sprays but no prescription nasal sprays. She does have asthma but doest not require daily medication for this and does not think it is flaring up.,MEDICATIONS: , Her only medication currently is Ortho Tri-Cyclen and the Allegra.,ALLERGIES: , She has no known medicine allergies.,OBJECTIVE:,Vitals: Weight was 130 pounds and blood pressure 124/78.,HEENT: Her throat was mildly erythematous without exudate. 
Nasal mucosa was erythematous and swollen. Only clear drainage was seen. TMs were clear.,Neck: Supple without adenopathy.,Lungs: Clear.,ASSESSMENT:, Allergic rhinitis.,PLAN:,1. She will try Zyrtec instead of Allegra again. Another option will be to use loratadine. She does not think she has prescription coverage so that might be cheaper.,2. Samples of Nasonex two sprays in each nostril given for three weeks. A prescription was written as well. +#> 2 PAST MEDICAL HISTORY:, He has difficulty climbing stairs, difficulty with airline seats, tying shoes, used to public seating, and lifting objects off the floor. He exercises three times a week at home and does cardio. He has difficulty walking two blocks or five flights of stairs. Difficulty with snoring. He has muscle and joint pains including knee pain, back pain, foot and ankle pain, and swelling. He has gastroesophageal reflux disease.,PAST SURGICAL HISTORY:, Includes reconstructive surgery on his right hand 13 years ago. ,SOCIAL HISTORY:, He is currently single. He has about ten drinks a year. He had smoked significantly up until several months ago. He now smokes less than three cigarettes a day.,FAMILY HISTORY:, Heart disease in both grandfathers, grandmother with stroke, and a grandmother with diabetes. Denies obesity and hypertension in other family members.,CURRENT MEDICATIONS:, None.,ALLERGIES:, He is allergic to Penicillin.,MISCELLANEOUS/EATING HISTORY:, He has been going to support groups for seven months with Lynn Holmberg in Greenwich and he is from Eastchester, New York and he feels that we are the appropriate program. He had a poor experience with the Greenwich program. Eating history, he is not an emotional eater. Does not like sweets. He likes big portions and carbohydrates. He likes chicken and not steak. He currently weighs 312 pounds. Ideal body weight would be 170 pounds. He is 142 pounds overweight. 
If ,he lost 60% of his excess body weight that would be 84 pounds and he should weigh about 228.,REVIEW OF SYSTEMS: ,Negative for head, neck, heart, lungs, GI, GU, orthopedic, and skin. Specifically denies chest pain, heart attack, coronary artery disease, congestive heart failure, arrhythmia, atrial fibrillation, pacemaker, high cholesterol, pulmonary embolism, high blood pressure, CVA, venous insufficiency, thrombophlebitis, asthma, shortness of breath, COPD, emphysema, sleep apnea, diabetes, leg and foot swelling, osteoarthritis, rheumatoid arthritis, hiatal hernia, peptic ulcer disease, gallstones, infected gallbladder, pancreatitis, fatty liver, hepatitis, hemorrhoids, rectal bleeding, polyps, incontinence of stool, urinary stress incontinence, or cancer. Denies cellulitis, pseudotumor cerebri, meningitis, or encephalitis.,PHYSICAL EXAMINATION:, He is alert and oriented x 3. Cranial nerves II-XII are intact. Afebrile. Vital Signs are stable. +#> 3 HISTORY OF PRESENT ILLNESS: , I have seen ABC today. He is a very pleasant gentleman who is 42 years old, 344 pounds. He is 5'9". He has a BMI of 51. He has been overweight for ten years since the age of 33, at his highest he was 358 pounds, at his lowest 260. He is pursuing surgical attempts of weight loss to feel good, get healthy, and begin to exercise again. He wants to be able to exercise and play volleyball. Physically, he is sluggish. He gets tired quickly. He does not go out often. When he loses weight he always regains it and he gains back more than he lost. His biggest weight loss is 25 pounds and it was three months before he gained it back. He did six months of not drinking alcohol and not taking in many calories. 
He has been on multiple commercial weight loss programs including Slim Fast for one month one year ago and Atkin's Diet for one month two years ago.,PAST MEDICAL HISTORY: , He has difficulty climbing stairs, difficulty with airline seats, tying shoes, used to public seating, difficulty walking, high cholesterol, and high blood pressure. He has asthma and difficulty walking two blocks or going eight to ten steps. He has sleep apnea and snoring. He is a diabetic, on medication. He has joint pain, knee pain, back pain, foot and ankle pain, leg and foot swelling. He has hemorrhoids.,PAST SURGICAL HISTORY: , Includes orthopedic or knee surgery.,SOCIAL HISTORY: , He is currently single. He drinks alcohol ten to twelve drinks a week, but does not drink five days a week and then will binge drink. He smokes one and a half pack a day for 15 years, but he has recently stopped smoking for the past two weeks.,FAMILY HISTORY: , Obesity, heart disease, and diabetes. Family history is negative for hypertension and stroke.,CURRENT MEDICATIONS:, Include Diovan, Crestor, and Tricor.,MISCELLANEOUS/EATING HISTORY: ,He says a couple of friends of his have had heart attacks and have had died. He used to drink everyday, but stopped two years ago. He now only drinks on weekends. He is on his second week of Chantix, which is a medication to come off smoking completely. Eating, he eats bad food. He is single. He eats things like bacon, eggs, and cheese, cheeseburgers, fast food, eats four times a day, seven in the morning, at noon, 9 p.m., and 2 a.m. He currently weighs 344 pounds and 5'9". His ideal body weight is 160 pounds. He is 184 pounds overweight. If he lost 70% of his excess body weight that would be 129 pounds and that would get him down to 215.,REVIEW OF SYSTEMS: , Negative for head, neck, heart, lungs, GI, GU, orthopedic, or skin. He also is positive for gout. 
He denies chest pain, heart attack, coronary artery disease, congestive heart failure, arrhythmia, atrial fibrillation, pacemaker, pulmonary embolism, or CVA. He denies venous insufficiency or thrombophlebitis. Denies shortness of breath, COPD, or emphysema. Denies thyroid problems, hip pain, osteoarthritis, rheumatoid arthritis, GERD, hiatal hernia, peptic ulcer disease, gallstones, infected gallbladder, pancreatitis, fatty liver, hepatitis, rectal bleeding, polyps, incontinence of stool, urinary stress incontinence, or cancer. He denies cellulitis, pseudotumor cerebri, meningitis, or encephalitis.,PHYSICAL EXAMINATION: ,He is alert and oriented x 3. Cranial nerves II-XII are intact. Neck is soft and supple. Lungs: He has positive wheezing bilaterally. Heart is regular rhythm and rate. His abdomen is soft. Extremities: He has 1+ pitting edema.,IMPRESSION/PLAN:, I have explained to him the risks and potential complications of laparoscopic gastric bypass in detail and these include bleeding, infection, deep venous thrombosis, pulmonary embolism, leakage from the gastrojejuno-anastomosis, jejunojejuno-anastomosis, and possible bowel obstruction among other potential complications. He understands. He wants to proceed with workup and evaluation for laparoscopic Roux-en-Y gastric bypass. He will need to get a letter of approval from Dr. XYZ. He will need to see a nutritionist and mental health worker. He will need an upper endoscopy by either Dr. XYZ. He will need to go to Dr. XYZ as he previously had a sleep study. We will need another sleep study. He will need H. pylori testing, thyroid function tests, LFTs, glycosylated hemoglobin, and fasting blood sugar. After this is performed, we will submit him for insurance approval. +#> 4 2-D M-MODE: , ,1. Left atrial enlargement with left atrial diameter of 4.7 cm.,2. Normal size right and left ventricle.,3. Normal LV systolic function with left ventricular ejection fraction of 51%.,4. Normal LV diastolic function.,5. 
No pericardial effusion.,6. Normal morphology of aortic valve, mitral valve, tricuspid valve, and pulmonary valve.,7. PA systolic pressure is 36 mmHg.,DOPPLER: , ,1. Mild mitral and tricuspid regurgitation.,2. Trace aortic and pulmonary regurgitation. +#> 5 1. The left ventricular cavity size and wall thickness appear normal. The wall motion and left ventricular systolic function appears hyperdynamic with estimated ejection fraction of 70% to 75%. There is near-cavity obliteration seen. There also appears to be increased left ventricular outflow tract gradient at the mid cavity level consistent with hyperdynamic left ventricular systolic function. There is abnormal left ventricular relaxation pattern seen as well as elevated left atrial pressures seen by Doppler examination.,2. The left atrium appears mildly dilated.,3. The right atrium and right ventricle appear normal.,4. The aortic root appears normal.,5. The aortic valve appears calcified with mild aortic valve stenosis, calculated aortic valve area is 1.3 cm square with a maximum instantaneous gradient of 34 and a mean gradient of 19 mm.,6. There is mitral annular calcification extending to leaflets and supportive structures with thickening of mitral valve leaflets with mild mitral regurgitation.,7. The tricuspid valve appears normal with trace tricuspid regurgitation with moderate pulmonary artery hypertension. Estimated pulmonary artery systolic pressure is 49 mmHg. Estimated right atrial pressure of 10 mmHg.,8. The pulmonary valve appears normal with trace pulmonary insufficiency.,9. There is no pericardial effusion or intracardiac mass seen.,10. There is a color Doppler suggestive of a patent foramen ovale with lipomatous hypertrophy of the interatrial septum.,11. 
The study was somewhat technically limited and hence subtle abnormalities could be missed from the study., +#> keywords +#> 1 allergy / immunology, allergic rhinitis, allergies, asthma, nasal sprays, rhinitis, nasal, erythematous, allegra, sprays, allergic, +#> 2 bariatrics, laparoscopic gastric bypass, weight loss programs, gastric bypass, atkin's diet, weight watcher's, body weight, laparoscopic gastric, weight loss, pounds, months, weight, laparoscopic, band, loss, diets, overweight, lost +#> 3 bariatrics, laparoscopic gastric bypass, heart attacks, body weight, pulmonary embolism, potential complications, sleep study, weight loss, gastric bypass, anastomosis, loss, sleep, laparoscopic, gastric, bypass, heart, pounds, weight, +#> 4 cardiovascular / pulmonary, 2-d m-mode, doppler, aortic valve, atrial enlargement, diastolic function, ejection fraction, mitral, mitral valve, pericardial effusion, pulmonary valve, regurgitation, systolic function, tricuspid, tricuspid valve, normal lv +#> 5 cardiovascular / pulmonary, 2-d, doppler, echocardiogram, annular, aortic root, aortic valve, atrial, atrium, calcification, cavity, ejection fraction, mitral, obliteration, outflow, regurgitation, relaxation pattern, stenosis, systolic function, tricuspid, valve, ventricular, ventricular cavity, wall motion, pulmonary artery ``` ## Binding UMLS Concept Unique Identifiers to a Data Frame @@ -64,22 +94,79 @@ NegEx implementation, are ignored and do not count towards the frequencies. ``` r -mtsamples_with_cuis = bind_clinspacy(mtsamples[1:5,], text = 'description') - -str(mtsamples_with_cuis) -#> 'data.frame': 5 obs. of 14 variables: -#> $ note_id : int 1 2 3 4 5 -#> $ description : chr "A 23-year-old white female presents with complaint of allergies." "Consult for laparoscopic gastric bypass." "Consult for laparoscopic gastric bypass." "2-D M-Mode. Doppler." ... -#> $ medical_specialty: chr "Allergy / Immunology" "Bariatrics" "Bariatrics" "Cardiovascular / Pulmonary" ... 
-#> $ sample_name : chr "Allergic Rhinitis" "Laparoscopic Gastric Bypass Consult - 2" "Laparoscopic Gastric Bypass Consult - 1" "2-D Echocardiogram - 1" ... -#> $ transcription : chr "SUBJECTIVE:, This 23-year-old white female presents with complaint of allergies. She used to have allergies w"| __truncated__ "PAST MEDICAL HISTORY:, He has difficulty climbing stairs, difficulty with airline seats, tying shoes, used to p"| __truncated__ "HISTORY OF PRESENT ILLNESS: , I have seen ABC today. He is a very pleasant gentleman who is 42 years old, 344 "| __truncated__ "2-D M-MODE: , ,1. Left atrial enlargement with left atrial diameter of 4.7 cm.,2. Normal size right and left "| __truncated__ ... -#> $ keywords : chr "allergy / immunology, allergic rhinitis, allergies, asthma, nasal sprays, rhinitis, nasal, erythematous, allegr"| __truncated__ "bariatrics, laparoscopic gastric bypass, weight loss programs, gastric bypass, atkin's diet, weight watcher's, "| __truncated__ "bariatrics, laparoscopic gastric bypass, heart attacks, body weight, pulmonary embolism, potential complication"| __truncated__ "cardiovascular / pulmonary, 2-d m-mode, doppler, aortic valve, atrial enlargement, diastolic function, ejection"| __truncated__ ... -#> $ C0009818 : num 0 1 1 0 0 -#> $ C0020517 : num 1 0 0 0 0 -#> $ C0277786 : num 1 0 0 0 0 -#> $ C0554756 : num 0 0 0 1 0 -#> $ C1705052 : num 0 0 0 1 0 -#> $ C3864418 : num 1 0 0 0 0 -#> $ C4039248 : num 0 1 1 0 0 -#> $ C4331911 : num 0 0 0 1 0 +bind_clinspacy(mtsamples[1:5, 1:2], + text = 'description') +#> note_id description C0009818 C0013516 C0020517 +#> 1 1 A 23-year-old white female presents with complaint of allergies. 0 0 1 +#> 2 2 Consult for laparoscopic gastric bypass. 1 0 0 +#> 3 3 Consult for laparoscopic gastric bypass. 1 0 0 +#> 4 4 2-D M-Mode. Doppler. 
0 0 0 +#> 5 5 2-D Echocardiogram 0 1 0 +#> C0554756 C1705052 C2243117 C3864418 C4039248 +#> 1 0 0 0 1 0 +#> 2 0 0 0 0 1 +#> 3 0 0 0 0 1 +#> 4 1 0 0 0 0 +#> 5 0 1 1 0 0 + +bind_clinspacy(mtsamples[1:5, 1:2], + text = 'description', + semantic_types = 'Diagnostic Procedure') +#> note_id description C0013516 C0554756 +#> 1 1 A 23-year-old white female presents with complaint of allergies. 0 0 +#> 2 2 Consult for laparoscopic gastric bypass. 0 0 +#> 3 3 Consult for laparoscopic gastric bypass. 0 0 +#> 4 4 2-D M-Mode. Doppler. 0 1 +#> 5 5 2-D Echocardiogram 1 0 +``` + +## Binding Concept Embeddings to a Data Frame + +``` r +bind_clinspacy_embeddings(mtsamples[1:5, 1:2], + text = 'description', + num_embeddings = 5) +#> note_id description emb_001 emb_002 +#> 1 1 A 23-year-old white female presents with complaint of allergies. -0.02252676 0.00981737 +#> 2 2 Consult for laparoscopic gastric bypass. -0.06431815 0.02979208 +#> 3 3 Consult for laparoscopic gastric bypass. -0.06431815 0.02979208 +#> 4 4 2-D M-Mode. Doppler. -0.06111055 0.03059523 +#> 5 5 2-D Echocardiogram -0.08545282 0.03965676 +#> emb_003 emb_004 emb_005 +#> 1 -7.112366e-17 -0.015715369 0.00204883 +#> 2 -1.353084e-16 -0.046832239 0.03387485 +#> 3 -1.353084e-16 -0.046832239 0.03387485 +#> 4 -1.340074e-16 -0.032813400 -0.02400309 +#> 5 -4.336809e-17 -0.008077436 -0.04463792 + +bind_clinspacy_embeddings(mtsamples[1:5, 1:2], + text = 'description', + num_embeddings = 5, + semantic_types = 'Diagnostic Procedure') +#> note_id description emb_001 emb_002 +#> 1 1 A 23-year-old white female presents with complaint of allergies. NA NA +#> 2 2 Consult for laparoscopic gastric bypass. NA NA +#> 3 3 Consult for laparoscopic gastric bypass. NA NA +#> 4 4 2-D M-Mode. Doppler. 
-0.06111055 0.03059523 +#> 5 5 2-D Echocardiogram -0.08545282 0.03965676 +#> emb_003 emb_004 emb_005 +#> 1 NA NA NA +#> 2 NA NA NA +#> 3 NA NA NA +#> 4 -1.340074e-16 -0.032813400 -0.02400309 +#> 5 -4.336809e-17 -0.008077436 -0.04463792 +``` + +# UMLS CUI definitions + +``` r +data(cui2vec_definitions) +head(cui2vec_definitions) +#> cui semantic_type definition +#> 1 C0000005 Amino Acid, Peptide, or Protein (131)I-Macroaggregated Albumin +#> 2 C0000005 Pharmacologic Substance (131)I-Macroaggregated Albumin +#> 3 C0000005 Indicator, Reagent, or Diagnostic Aid (131)I-Macroaggregated Albumin +#> 4 C0000039 Organic Chemical 1,2-Dipalmitoylphosphatidylcholine +#> 5 C0000039 Pharmacologic Substance 1,2-Dipalmitoylphosphatidylcholine +#> 6 C0000052 Amino Acid, Peptide, or Protein 1,4-alpha-Glucan Branching Enzyme ``` diff --git a/man/bind_clinspacy.Rd b/man/bind_clinspacy.Rd index fc49aab..06cd5f8 100644 --- a/man/bind_clinspacy.Rd +++ b/man/bind_clinspacy.Rd @@ -7,12 +7,14 @@ as identified by negspacy's NegEx implementation, are ignored and do not count towards the frequencies.} \usage{ -bind_clinspacy(df, text) +bind_clinspacy(df, text, ...) } \arguments{ \item{df}{A data frame.} \item{text}{A character string containing the name of the column to process.} + +\item{...}{Arguments passed down to \code{\link{clinspacy}}} } \value{ A data frame containing the original data frame as well as additional column names diff --git a/man/bind_clinspacy_embeddings.Rd b/man/bind_clinspacy_embeddings.Rd new file mode 100644 index 0000000..0fff15f --- /dev/null +++ b/man/bind_clinspacy_embeddings.Rd @@ -0,0 +1,47 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/clinspacy.R +\name{bind_clinspacy_embeddings} +\alias{bind_clinspacy_embeddings} +\title{This function binds columns containing concept embeddings for concepts with which scispacy has +99 percent confidence of being present with values containing frequencies. 
Negated concepts, +as identified by negspacy's NegEx implementation, are ignored and do not count towards +the embeddings. The concept embeddings are derived from the cui2vec_embeddings dataset +included with this package.} +\usage{ +bind_clinspacy_embeddings(df, text, num_embeddings = 500, ...) +} +\arguments{ +\item{df}{A data frame.} + +\item{text}{A character string containing the name of the column to process.} + +\item{num_embeddings}{The number of embeddings to return (must be a number 1 through 500).} + +\item{...}{Arguments passed down to \code{\link{clinspacy}}} +} +\value{ +A data frame containing the original data frame as well as additional columns +(\code{emb_001}, \code{emb_002}, ...) containing the concept embeddings. +} +\description{ +The embeddings are derived from Andrew Beam's +\href{https://github.com/beamandrew/cui2vec}{cui2vec R package}. +} +\details{ +Citation + +Beam, A.L., Kompa, B., Schmaltz, A., Fried, I., Griffin, W., Palmer, N.P., Shi, X., +Cai, T., and Kohane, I.S., 2019. Clinical Concept Embeddings Learned from Massive +Sources of Multimodal Medical Data. arXiv preprint arXiv:1804.01486. + +License + +This data is made available under a +\href{https://creativecommons.org/licenses/by/4.0/}{CC BY 4.0 license}. The only change +made to the original dataset is the renaming of columns. +} +\examples{ +data(mtsamples) +mtsamples_with_embeddings = bind_clinspacy_embeddings(mtsamples[1:5,], text = 'description', num_embeddings = 5) +str(mtsamples_with_embeddings) +} diff --git a/man/clinspacy.Rd b/man/clinspacy.Rd index ce450cc..7be9534 100644 --- a/man/clinspacy.Rd +++ b/man/clinspacy.Rd @@ -7,10 +7,60 @@ concept mapping, and negation detection using the Python spaCy, scispacy, and ne This function identifies only those concept unique identifiers with with scispacy has 99 percent confidence of being present.
Negation is identified using negspacy's NegEx implementation.} \usage{ -clinspacy(text) +clinspacy( + text, + threshold = 0.99, + semantic_types = c("Acquired Abnormality", "Activity", "Age Group", + "Amino Acid Sequence", "Amino Acid, Peptide, or Protein", "Amphibian", + "Anatomical Abnormality", "Anatomical Structure", "Animal", "Antibiotic", "Archaeon", + "Bacterium", "Behavior", "Biologic Function", "Biologically Active Substance", + "Biomedical Occupation or Discipline", "Biomedical or Dental Material", "Bird", + "Body Location or Region", "Body Part, Organ, or Organ Component", + "Body Space or Junction", "Body Substance", "Body System", "Carbohydrate Sequence", + "Cell", "Cell Component", "Cell Function", "Cell or Molecular Dysfunction", + "Chemical", "Chemical Viewed Functionally", "Chemical Viewed Structurally", + "Classification", "Clinical Attribute", "Clinical Drug", "Conceptual Entity", + "Congenital Abnormality", "Daily or Recreational Activity", "Diagnostic Procedure", + "Disease or Syndrome", "Drug Delivery Device", "Educational Activity", + "Element, Ion, or Isotope", "Embryonic Structure", "Entity", + "Environmental Effect of Humans", "Enzyme", "Eukaryote", "Event", + "Experimental Model of Disease", "Family Group", "Finding", "Fish", "Food", + "Fully Formed Anatomical Structure", "Functional Concept", "Fungus", + "Gene or Genome", "Genetic Function", "Geographic Area", + "Governmental or Regulatory Activity", "Group", "Group Attribute", + "Hazardous or Poisonous Substance", "Health Care Activity", + "Health Care Related Organization", "Hormone", "Human", + "Human-caused Phenomenon or Process", "Idea or Concept", "Immunologic Factor", + "Indicator, Reagent, or Diagnostic Aid", "Individual Behavior", + "Injury or Poisoning", "Inorganic Chemical", "Intellectual Product", + "Laboratory or Test Result", "Laboratory Procedure", "Language", "Machine Activity", + "Mammal", "Manufactured Object", "Medical Device", + "Mental or Behavioral Dysfunction", 
"Mental Process", + "Molecular Biology Research Technique", "Molecular Function", "Molecular Sequence", + "Natural Phenomenon or Process", "Neoplastic Process", + "Nucleic Acid, Nucleoside, or Nucleotide", "Nucleotide Sequence", + "Occupation or Discipline", "Occupational Activity", "Organ or Tissue Function", + "Organic Chemical", "Organism", "Organism Attribute", "Organism Function", + "Organization", "Pathologic Function", "Patient or Disabled Group", + "Pharmacologic Substance", "Phenomenon or Process", "Physical Object", + "Physiologic Function", "Plant", "Population Group", + "Professional or Occupational Group", "Professional Society", "Qualitative Concept", + "Quantitative Concept", "Receptor", "Regulation or Law", "Reptile", + "Research Activity", "Research Device", "Self-help or Relief Organization", + "Sign or Symptom", "Social Behavior", "Spatial Concept", "Substance", + "Temporal Concept", "Therapeutic or Preventive Procedure", "Tissue", "Vertebrate", + "Virus", "Vitamin") +) } \arguments{ \item{text}{A character string containing medical text that you would like to process.} + +\item{threshold}{Defaults to 0.99. The confidence threshold value used by clinspacy (can be higher than the +\code{linker_threshold} from \code{\link{clinspacy_init}}). 
Note that whereas the +linker_threshold can only be set once per session, this threshold can be updated during the R session.} + +\item{semantic_types}{Character vector containing any combination of the following: +c("Acquired Abnormality", "Activity", "Age Group", "Amino Acid Sequence", "Amino Acid, Peptide, or Protein", "Amphibian", "Anatomical Abnormality", "Anatomical Structure", "Animal", "Antibiotic", "Archaeon", "Bacterium", "Behavior", "Biologic Function", "Biologically Active Substance", "Biomedical Occupation or Discipline", "Biomedical or Dental Material", "Bird", "Body Location or Region", "Body Part, Organ, or Organ Component", "Body Space or Junction", "Body Substance", "Body System", "Carbohydrate Sequence", "Cell", "Cell Component", "Cell Function", "Cell or Molecular Dysfunction", "Chemical", "Chemical Viewed Functionally", "Chemical Viewed Structurally", "Classification", "Clinical Attribute", "Clinical Drug", "Conceptual Entity", "Congenital Abnormality", "Daily or Recreational Activity", "Diagnostic Procedure", "Disease or Syndrome", "Drug Delivery Device", "Educational Activity", "Element, Ion, or Isotope", "Embryonic Structure", "Entity", "Environmental Effect of Humans", "E} } \value{ A data frame containing the UMLS concept unique identifiers (cui), entities, diff --git a/man/clinspacy_init.Rd b/man/clinspacy_init.Rd new file mode 100644 index 0000000..b0fc7a0 --- /dev/null +++ b/man/clinspacy_init.Rd @@ -0,0 +1,27 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/clinspacy.R +\name{clinspacy_init} +\alias{clinspacy_init} +\title{Initializes clinspacy. This function is optional to run but gives you more control over +the parameters used by scispacy at initiation. If you do not run this function, it will be +run with default parameters the first time that any of the package functions are run.} +\usage{ +clinspacy_init(miniconda = TRUE, linker_threshold = 0.99, ...) 
+} +\arguments{ +\item{miniconda}{Defaults to TRUE, which results in miniconda being installed (~400 MB) +and configured with the "clinspacy" conda environment. If you want to override this behavior, +set \code{miniconda} to \code{FALSE} and specify an alternative environment using use_python() +or use_condaenv().} + +\item{linker_threshold}{Defaults to 0.99. The confidence threshold value used by the scispacy UMLS entity +linker. Note: This can be lower than the \code{threshold} from \code{\link{clinspacy}}. +The linker_threshold can only be set once per session.} + +\item{...}{Additional settings available from: \href{https://github.com/allenai/scispacy}{https://github.com/allenai/scispacy}.} +} +\description{ +Initializes clinspacy. This function is optional to run but gives you more control over +the parameters used by scispacy at initiation. If you do not run this function, it will be +run with default parameters the first time that any of the package functions are run. +} diff --git a/man/cui2vec_definitions.Rd b/man/cui2vec_definitions.Rd new file mode 100644 index 0000000..d69a1b6 --- /dev/null +++ b/man/cui2vec_definitions.Rd @@ -0,0 +1,34 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/cui2vec_data.R +\docType{data} +\name{cui2vec_definitions} +\alias{cui2vec_definitions} +\title{Cui2vec concept definitions} +\format{ +A data frame with 3053795 rows and 3 variables: +\describe{ + \item{cui}{A Unified Medical Language System (UMLS) Concept Unique Identifier (CUI)} + \item{semantic_type}{Semantic type of the CUI} + \item{definition}{Definition of the CUI} +} +} +\source{ +\url{https://github.com/beamandrew/cui2vec} +} +\usage{ +cui2vec_definitions +} +\description{ +This dataset contains definitions for the Unified Medical Language System (UMLS) +Concept Unique Identifiers (CUIs). These come from Andrew Beam's +\href{https://github.com/beamandrew/cui2vec}{cui2vec R package}.
+} +\details{ +License + +This data is made available under a +\href{https://github.com/beamandrew/cui2vec/blob/master/LICENSE.md}{MIT license}. The data +is copyrighted in 2019 by Benjamin Kompa, Andrew Beam, and Allen Schmaltz. The only change +made to the original dataset is the renaming of columns. +} +\keyword{datasets} diff --git a/man/cui2vec_embeddings.Rd b/man/cui2vec_embeddings.Rd new file mode 100644 index 0000000..7e3735d --- /dev/null +++ b/man/cui2vec_embeddings.Rd @@ -0,0 +1,41 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/cui2vec_data.R +\docType{data} +\name{cui2vec_embeddings} +\alias{cui2vec_embeddings} +\title{Cui2vec concept embeddings} +\format{ +A data frame with 109053 rows and 501 variables: +\describe{ + \item{cui}{A Unified Medical Language System (UMLS) Concept Unique Identifier (CUI)} + \item{emb_001}{Concept embedding vector #1} + \item{emb_002}{Concept embedding vector #2} + \item{...}{...} + \item{emb_500}{Concept embedding vector #500} +} +} +\source{ +\url{https://figshare.com/s/00d69861786cd0156d81} +} +\usage{ +cui2vec_embeddings +} +\description{ +This dataset contains Unified Medical Language System (UMLS) concept embeddings from +Andrew Beam's \href{https://github.com/beamandrew/cui2vec}{cui2vec R package}. There are +500 embeddings included for each concept. +} +\details{ +Citation + +Beam, A.L., Kompa, B., Schmaltz, A., Fried, I., Griffin, W., Palmer, N.P., Shi, X., +Cai, T., and Kohane, I.S., 2019. Clinical Concept Embeddings Learned from Massive +Sources of Multimodal Medical Data. arXiv preprint arXiv:1804.01486. + +License + +This data is made available under a +\href{https://creativecommons.org/licenses/by/4.0/}{CC BY 4.0 license}. The only change +made to the original dataset is the renaming of columns.
+} +\keyword{datasets} diff --git a/man/mtsamples.Rd b/man/mtsamples.Rd index 4f71e0e..40303cd 100644 --- a/man/mtsamples.Rd +++ b/man/mtsamples.Rd @@ -27,7 +27,10 @@ This dataset contains sample medical transcriptions for various medical specialt \details{ Acknowledgements -This data was scraped from mtsamples.com by Tara Boyle and is made available -under a CC0: Public Domain license. +This data was scraped from \href{https://mtsamples.com}{https://mtsamples.com} by Tara Boyle. + +License +This data is made available under a +\href{https://creativecommons.org/share-your-work/public-domain/cc0/}{CC0: Public Domain license}. } \keyword{datasets}