Skip to content

Commit

Permalink
Merge pull request #299 from USEPA/function_names_eh
Browse files Browse the repository at this point in the history
Function names eh
  • Loading branch information
cristinamullin authored Jul 5, 2023
2 parents e89d0a7 + 7b9745e commit 5e7185b
Show file tree
Hide file tree
Showing 72 changed files with 906 additions and 983 deletions.
73 changes: 36 additions & 37 deletions NAMESPACE
Original file line number Diff line number Diff line change
@@ -1,47 +1,46 @@
# Generated by roxygen2: do not edit by hand

export("%>%")
export(AboveNationalWQXUpperThreshold)
export(AggregatedContinuousData)
export(BelowNationalWQXLowerThreshold)
export(GetActivityTypeRef)
export(GetDetCondRef)
export(GetDetLimitRef)
export(GetMeasureUnitRef)
export(GetWQXCharValRef)
export(HarmonizationRefTable)
export(HarmonizeData)
export(InvalidCoordinates)
export(InvalidFraction)
export(InvalidMethod)
export(InvalidResultUnit)
export(InvalidSpeciation)
export(JoinWQPProfiles)
export(PotentialDuplicateRowID)
export(QAPPDocAvailable)
export(QAPPapproved)
export(QualityControlActivity)
export(TADABigdataRetrieval)
export(TADAOverviewMap)
export(TADAReadWQPWebServices)
export(TADA_AutoClean)
export(TADA_BigDataRetrieval)
export(TADA_Boxplot)
export(TADA_ConvertDepthUnits)
export(TADA_ConvertResultUnits)
export(TADA_ConvertSpecialChars)
export(TADA_CreateComparableID)
export(TADA_DataRetrieval)
export(TADA_FieldCounts)
export(TADA_FieldValuesPie)
export(TADA_FieldValuesTable)
export(TADA_FindContinuousData)
export(TADA_FindNearbySites)
export(TADA_FindPotentialDuplicates)
export(TADA_NearbySites)
export(TADA_FindQualityControlData)
export(TADA_FlagCoordinates)
export(TADA_FlagFraction)
export(TADA_FlagMethod)
export(TADA_FlagResultAboveThreshold)
export(TADA_FlagResultBelowThreshold)
export(TADA_FlagResultUnit)
export(TADA_FlagSpeciation)
export(TADA_GetActivityTypeRef)
export(TADA_GetDetCondRef)
export(TADA_GetDetLimitRef)
export(TADA_GetMeasureUnitRef)
export(TADA_GetSynonymRef)
export(TADA_GetTemplate)
export(TADA_GetWQXCharValRef)
export(TADA_HarmonizeSynonyms)
export(TADA_Histogram)
export(TADA_IDCensoredData)
export(TADA_JoinWQPProfiles)
export(TADA_OrderCols)
export(TADA_boxplot)
export(TADA_hist)
export(TADA_stats)
export(TADA_summarizeColumn)
export(TADAdataRetrieval)
export(createComparableId)
export(fieldCounts)
export(fieldValuesPie)
export(fieldValuesTable)
export(getTADATemplate)
export(idCensoredData)
export(simpleCensoredMethods)
export(summarizeCensoredData)
export(TADA_OverviewMap)
export(TADA_QAPPDocAvailable)
export(TADA_QAPPapproved)
export(TADA_ReadWQPWebServices)
export(TADA_SimpleCensoredMethods)
export(TADA_Stats)
export(TADA_SummarizeCensoredData)
export(TADA_SummarizeColumn)
importFrom(magrittr,"%>%")
89 changes: 10 additions & 79 deletions R/CensoredDataSuite.R
Original file line number Diff line number Diff line change
Expand Up @@ -24,14 +24,14 @@
#' @export
#'

idCensoredData <- function(.data){
TADA_IDCensoredData <- function(.data){
# check .data has all of the required columns
expected_cols <- c(
"ResultDetectionConditionText",
"DetectionQuantitationLimitTypeName",
"TADA.ResultMeasureValueDataTypes.Flag"
)
checkColumns(.data, expected_cols)
TADA_CheckColumns(.data, expected_cols)

## First step: identify censored data
cens = .data%>%dplyr::filter(TADA.ResultMeasureValueDataTypes.Flag=="Result Value/Unit Copied from Detection Limit")
Expand All @@ -41,7 +41,7 @@ idCensoredData <- function(.data){

if(dim(cens)[1]>0){
## Bring in det cond reference table
cond.ref = GetDetCondRef()%>%dplyr::rename(ResultDetectionConditionText = Name)%>%dplyr::select(ResultDetectionConditionText, TADA.Detection_Type)
cond.ref = TADA_GetDetCondRef()%>%dplyr::rename(ResultDetectionConditionText = Name)%>%dplyr::select(ResultDetectionConditionText, TADA.Detection_Type)

## Join to censored data
cens = dplyr::left_join(cens, cond.ref, by = "ResultDetectionConditionText")
Expand All @@ -61,7 +61,7 @@ idCensoredData <- function(.data){
}

## Bring in det limit type reference table
limtype.ref = GetDetLimitRef()%>%dplyr::rename(DetectionQuantitationLimitTypeName = Name)%>%dplyr::select(DetectionQuantitationLimitTypeName, TADA.Limit_Type)
limtype.ref = TADA_GetDetLimitRef()%>%dplyr::rename(DetectionQuantitationLimitTypeName = Name)%>%dplyr::select(DetectionQuantitationLimitTypeName, TADA.Limit_Type)

## Join to censored data
cens = dplyr::left_join(cens, limtype.ref, by = "DetectionQuantitationLimitTypeName")
Expand Down Expand Up @@ -130,22 +130,22 @@ idCensoredData <- function(.data){
#' # and in instances where the measurement is non-detect, set the result value
#' # to half of the detection limit value. For over-detect measurements, retain
#' # the detection limit value as the result value as-is.
#' Nutrients_Utah_CensoredFlag = simpleCensoredMethods(Nutrients_Utah, nd_method = "multiplier", nd_multiplier = 0.5, od_method = "as-is", od_multiplier = "null")
#' Nutrients_Utah_CensoredFlag = TADA_SimpleCensoredMethods(Nutrients_Utah, nd_method = "multiplier", nd_multiplier = 0.5, od_method = "as-is", od_multiplier = "null")
#'
#' # Check for agreement between detection condition and detection limit type, and in instances where the measurement is non-detect, set the result value to a random value between 0 and the detection limit value. For over-detect measurements, retain the detection limit value as the result value as-is.
#' Nutrients_Utah_CensoredFlag = simpleCensoredMethods(Nutrients_Utah, nd_method = "randombelowlimit", nd_multiplier = "null", od_method = "as-is", od_multiplier = "null")
#' Nutrients_Utah_CensoredFlag = TADA_SimpleCensoredMethods(Nutrients_Utah, nd_method = "randombelowlimit", nd_multiplier = "null", od_method = "as-is", od_multiplier = "null")
#'



simpleCensoredMethods <- function(.data, nd_method = "multiplier", nd_multiplier = 0.5, od_method = "as-is", od_multiplier = "null"){
TADA_SimpleCensoredMethods <- function(.data, nd_method = "multiplier", nd_multiplier = 0.5, od_method = "as-is", od_multiplier = "null"){
# check .data has all of the required columns
expected_cols <- c(
"ResultDetectionConditionText",
"DetectionQuantitationLimitTypeName",
"TADA.ResultMeasureValueDataTypes.Flag"
)
checkColumns(.data, expected_cols)
TADA_CheckColumns(.data, expected_cols)

# check that multiplier is provided if method = "multiplier"
if(nd_method == "multiplier"&nd_multiplier == "null"){
Expand All @@ -155,9 +155,9 @@ simpleCensoredMethods <- function(.data, nd_method = "multiplier", nd_multiplier
stop("Please provide a multiplier for the upper detection limit handling method of 'multiplier'")
}

# If user has not previously run idCensoredData function, run it here to get required columns
# If user has not previously run TADA_IDCensoredData function, run it here to get required columns
if(!"TADA.CensoredData.Flag"%in%names(.data)){
cens.data = idCensoredData(.data)
cens.data = TADA_IDCensoredData(.data)
}else{
cens.data = .data
}
Expand Down Expand Up @@ -206,72 +206,3 @@ simpleCensoredMethods <- function(.data, nd_method = "multiplier", nd_multiplier
}
return(.data)
}

#' Summarize Censored Data
#'
#' Builds a summary table giving the percentage of censored measurements for
#' each combination of the chosen ID columns. The table is intended to help
#' pick a censored data estimation method, since the suitable methods depend
#' upon the distribution of the dataset.
#'
#' @param .data A TADA dataframe
#' @param spec_cols A vector of column names to be used as aggregating
#'   variables when summarizing censored data information.
#' @return A summary dataframe yielding measurement ncounts, censored data
#'   ncounts, and percent of dataset that is censored, aggregated by the
#'   user-defined grouping variables. Non-detects and over-detects are
#'   summarized separately (each against the uncensored data) and stacked,
#'   with "TADA.CensoredData.Flag" naming the censoring type of each row. The
#'   column "TADA.Censored.Note" identifies when there is sufficient
#'   non-censored data to estimate censored data using statistical methods
#'   including Maximum Likelihood Estimation, Robust ROS and Kaplan Meier.
#'   The decision tree used to identify applicable statistical analyses is
#'   based on the Baseline Assessment of Left-Censored Environmental Data
#'   Using R Tech Note. More info can be found here:
#'   https://www.epa.gov/sites/default/files/2016-05/documents/tech_notes_10_jun2014_r.pdf
#'
#'
#' @export
#'
#' @examples
#' # Load example dataset:
#' data(TADAProfileCleanTP)
#' # TADAProfileCleanTP dataframe is clean, harmonized, and filtered
#' # down to one Comparable Data Identifier
#'
#' # Create summarizeCensoredData table:
#' TADAProfileCleanTP_summarizeCensoredData <- summarizeCensoredData(TADAProfileCleanTP)
#'

summarizeCensoredData <- function(.data, spec_cols = c("TADA.CharacteristicName", "TADA.ResultMeasure.MeasureUnitCode", "TADA.ResultSampleFractionText", "TADA.MethodSpecificationName")) {
  # Warn (but carry on) when some rows have no usable result value at all
  if (any(is.na(.data$TADA.ResultMeasureValue))) {
    warning("Dataset contains data missing both a result value and a detection limit. Suggest removing or handling. See TADA Harmonization vignette for an example.")
  }

  # Derive the censored data flag only when the input does not carry it yet
  if ("TADA.CensoredData.Flag" %in% names(.data)) {
    flagged <- .data
  } else {
    flagged <- idCensoredData(.data)
  }

  # Summarize one censoring type ("Non-Detect" or "Over-Detect") against the
  # uncensored results, keeping only groups that contain censored records
  summarise_one_flag <- function(censored_label) {
    grouped <- dplyr::group_by_at(flagged, spec_cols)
    kept <- dplyr::filter(
      grouped,
      TADA.CensoredData.Flag %in% c(censored_label, "Uncensored")
    )
    counts <- dplyr::summarise(
      kept,
      Measurement_Count = length(unique(ResultIdentifier)),
      Censored_Count = length(TADA.CensoredData.Flag[TADA.CensoredData.Flag == censored_label]),
      Percent_Censored = length(TADA.CensoredData.Flag[TADA.CensoredData.Flag == censored_label]) / length(TADA.CensoredData.Flag) * 100,
      Censoring_Levels = length(unique(TADA.ResultMeasureValue[TADA.CensoredData.Flag == censored_label]))
    )
    with_censoring <- dplyr::filter(counts, Censored_Count > 0)
    dplyr::mutate(with_censoring, "TADA.CensoredData.Flag" = censored_label)
  }

  # Stack the lower-limit and upper-limit summaries into one table
  combined <- plyr::rbind.fill(
    summarise_one_flag("Non-Detect"),
    summarise_one_flag("Over-Detect")
  )

  # Suggest an estimation method; the first matching rule wins
  combined <- dplyr::mutate(
    combined,
    TADA.Censored.Note = dplyr::case_when(
      # greater than 80% censored, cannot estimate
      Percent_Censored > 80 ~ "Percent censored too high for estimation methods",
      # less than 50% censored, and multiple censoring levels (no minimum n)
      Percent_Censored < 50 & Censoring_Levels > 1 ~ "Kaplan-Meier",
      # less than 50% censored and one censoring level (no minimum n?)
      Percent_Censored < 50 ~ "Robust Regression Order Statistics",
      # 50%-80% censored, 50 or more measurements
      Measurement_Count >= 50 ~ "Maximum Likelihood Estimation",
      # 50%-80% censored, less than 50 measures
      Measurement_Count < 50 ~ "Robust Regression Order Statistics"
    )
  )

  if (nrow(combined) == 0) {
    print("No censored data to summarize. Returning empty data frame.")
  }
  return(combined)
}
Loading

0 comments on commit 5e7185b

Please sign in to comment.