Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Function names eh #299

Merged
merged 16 commits into from
Jul 5, 2023
73 changes: 36 additions & 37 deletions NAMESPACE
Original file line number Diff line number Diff line change
@@ -1,47 +1,46 @@
# Generated by roxygen2: do not edit by hand

export("%>%")
export(AboveNationalWQXUpperThreshold)
export(AggregatedContinuousData)
export(BelowNationalWQXLowerThreshold)
export(GetActivityTypeRef)
export(GetDetCondRef)
export(GetDetLimitRef)
export(GetMeasureUnitRef)
export(GetWQXCharValRef)
export(HarmonizationRefTable)
export(HarmonizeData)
export(InvalidCoordinates)
export(InvalidFraction)
export(InvalidMethod)
export(InvalidResultUnit)
export(InvalidSpeciation)
export(JoinWQPProfiles)
export(PotentialDuplicateRowID)
export(QAPPDocAvailable)
export(QAPPapproved)
export(QualityControlActivity)
export(TADABigdataRetrieval)
export(TADAOverviewMap)
export(TADAReadWQPWebServices)
export(TADA_AutoClean)
export(TADA_BigDataRetrieval)
export(TADA_Boxplot)
export(TADA_ConvertDepthUnits)
export(TADA_ConvertResultUnits)
export(TADA_ConvertSpecialChars)
export(TADA_CreateComparableID)
export(TADA_DataRetrieval)
export(TADA_FieldCounts)
export(TADA_FieldValuesPie)
export(TADA_FieldValuesTable)
export(TADA_FindContinuousData)
export(TADA_FindNearbySites)
export(TADA_FindPotentialDuplicates)
export(TADA_NearbySites)
export(TADA_FindQualityControlData)
export(TADA_FlagCoordinates)
export(TADA_FlagFraction)
export(TADA_FlagMethod)
export(TADA_FlagResultAboveThreshold)
export(TADA_FlagResultBelowThreshold)
export(TADA_FlagResultUnit)
export(TADA_FlagSpeciation)
export(TADA_GetActivityTypeRef)
export(TADA_GetDetCondRef)
export(TADA_GetDetLimitRef)
export(TADA_GetMeasureUnitRef)
export(TADA_GetSynonymRef)
export(TADA_GetTemplate)
export(TADA_GetWQXCharValRef)
export(TADA_HarmonizeSynonyms)
export(TADA_Histogram)
export(TADA_IDCensoredData)
export(TADA_JoinWQPProfiles)
export(TADA_OrderCols)
export(TADA_boxplot)
export(TADA_hist)
export(TADA_stats)
export(TADA_summarizeColumn)
export(TADAdataRetrieval)
export(createComparableId)
export(fieldCounts)
export(fieldValuesPie)
export(fieldValuesTable)
export(getTADATemplate)
export(idCensoredData)
export(simpleCensoredMethods)
export(summarizeCensoredData)
export(TADA_OverviewMap)
export(TADA_QAPPDocAvailable)
export(TADA_QAPPapproved)
export(TADA_ReadWQPWebServices)
export(TADA_SimpleCensoredMethods)
export(TADA_Stats)
export(TADA_SummarizeCensoredData)
export(TADA_SummarizeColumn)
importFrom(magrittr,"%>%")
89 changes: 10 additions & 79 deletions R/CensoredDataSuite.R
Original file line number Diff line number Diff line change
Expand Up @@ -24,14 +24,14 @@
#' @export
#'

idCensoredData <- function(.data){
TADA_IDCensoredData <- function(.data){
# check .data has all of the required columns
expected_cols <- c(
"ResultDetectionConditionText",
"DetectionQuantitationLimitTypeName",
"TADA.ResultMeasureValueDataTypes.Flag"
)
checkColumns(.data, expected_cols)
TADA_CheckColumns(.data, expected_cols)

## First step: identify censored data
cens = .data%>%dplyr::filter(TADA.ResultMeasureValueDataTypes.Flag=="Result Value/Unit Copied from Detection Limit")
Expand All @@ -41,7 +41,7 @@ idCensoredData <- function(.data){

if(dim(cens)[1]>0){
## Bring in det cond reference table
cond.ref = GetDetCondRef()%>%dplyr::rename(ResultDetectionConditionText = Name)%>%dplyr::select(ResultDetectionConditionText, TADA.Detection_Type)
cond.ref = TADA_GetDetCondRef()%>%dplyr::rename(ResultDetectionConditionText = Name)%>%dplyr::select(ResultDetectionConditionText, TADA.Detection_Type)

## Join to censored data
cens = dplyr::left_join(cens, cond.ref, by = "ResultDetectionConditionText")
Expand All @@ -61,7 +61,7 @@ idCensoredData <- function(.data){
}

## Bring in det limit type reference table
limtype.ref = GetDetLimitRef()%>%dplyr::rename(DetectionQuantitationLimitTypeName = Name)%>%dplyr::select(DetectionQuantitationLimitTypeName, TADA.Limit_Type)
limtype.ref = TADA_GetDetLimitRef()%>%dplyr::rename(DetectionQuantitationLimitTypeName = Name)%>%dplyr::select(DetectionQuantitationLimitTypeName, TADA.Limit_Type)

## Join to censored data
cens = dplyr::left_join(cens, limtype.ref, by = "DetectionQuantitationLimitTypeName")
Expand Down Expand Up @@ -130,22 +130,22 @@ idCensoredData <- function(.data){
#' # and in instances where the measurement is non-detect, set the result value
#' # to half of the detection limit value. For over-detect measurements, retain
#' # the detection limit value as the result value as-is.
#' Nutrients_Utah_CensoredFlag = simpleCensoredMethods(Nutrients_Utah, nd_method = "multiplier", nd_multiplier = 0.5, od_method = "as-is", od_multiplier = "null")
#' Nutrients_Utah_CensoredFlag = TADA_SimpleCensoredMethods(Nutrients_Utah, nd_method = "multiplier", nd_multiplier = 0.5, od_method = "as-is", od_multiplier = "null")
#'
#' # Check for agreement between detection condition and detection limit type, and in instances where the measurement is non-detect, set the result value to a random value between 0 and the detection limit value. For over-detect measurements, retain the detection limit value as the result value as-is.
#' Nutrients_Utah_CensoredFlag = simpleCensoredMethods(Nutrients_Utah, nd_method = "randombelowlimit", nd_multiplier = "null", od_method = "as-is", od_multiplier = "null")
#' Nutrients_Utah_CensoredFlag = TADA_SimpleCensoredMethods(Nutrients_Utah, nd_method = "randombelowlimit", nd_multiplier = "null", od_method = "as-is", od_multiplier = "null")
#'



simpleCensoredMethods <- function(.data, nd_method = "multiplier", nd_multiplier = 0.5, od_method = "as-is", od_multiplier = "null"){
TADA_SimpleCensoredMethods <- function(.data, nd_method = "multiplier", nd_multiplier = 0.5, od_method = "as-is", od_multiplier = "null"){
# check .data has all of the required columns
expected_cols <- c(
"ResultDetectionConditionText",
"DetectionQuantitationLimitTypeName",
"TADA.ResultMeasureValueDataTypes.Flag"
)
checkColumns(.data, expected_cols)
TADA_CheckColumns(.data, expected_cols)

# check that multiplier is provided if method = "multiplier"
if(nd_method == "multiplier"&nd_multiplier == "null"){
Expand All @@ -155,9 +155,9 @@ simpleCensoredMethods <- function(.data, nd_method = "multiplier", nd_multiplier
stop("Please provide a multiplier for the upper detection limit handling method of 'multiplier'")
}

# If user has not previously run idCensoredData function, run it here to get required columns
# If user has not previously run TADA_IDCensoredData function, run it here to get required columns
if(!"TADA.CensoredData.Flag"%in%names(.data)){
cens.data = idCensoredData(.data)
cens.data = TADA_IDCensoredData(.data)
}else{
cens.data = .data
}
Expand Down Expand Up @@ -206,72 +206,3 @@ simpleCensoredMethods <- function(.data, nd_method = "multiplier", nd_multiplier
}
return(.data)
}

#' Summarize Censored Data
#'
#' This function creates a summary table of the percentage of censored
#' (non-detect and over-detect) results by specified ID columns. It can be used
#' to determine the best method for handling censored data estimation methods
#' that depend upon the distribution of the dataset.
#'
#' Non-detects and over-detects are summarized separately. Each summary is
#' computed over the rows flagged as that censoring type plus the rows flagged
#' "Uncensored", grouped by `spec_cols`. Groups with no censored results are
#' dropped from the output.
#'
#' @param .data A TADA dataframe. Must contain the columns named in `spec_cols`
#' as well as "ResultIdentifier" and "TADA.ResultMeasureValue". If the column
#' "TADA.CensoredData.Flag" is not present, idCensoredData is run on the input
#' to create it.
#' @param spec_cols A vector of column names to be used as aggregating variables
#' when summarizing censored data information.
#'
#' @return A summary dataframe with one row per group and censoring type,
#' containing the grouping columns plus:
#' \itemize{
#'   \item Measurement_Count: number of unique ResultIdentifier values.
#'   \item Censored_Count: number of rows with the censoring flag.
#'   \item Percent_Censored: Censored_Count as a percentage of the rows in the
#'   group (censored plus uncensored).
#'   \item Censoring_Levels: number of unique TADA.ResultMeasureValue values
#'   among the censored rows.
#'   \item TADA.CensoredData.Flag: "Non-Detect" or "Over-Detect".
#'   \item TADA.Censored.Note: identifies when there is sufficient non-censored
#'   data to estimate censored data using statistical methods including Maximum
#'   Likelihood Estimation, Robust ROS and Kaplan Meier.
#' }
#' The decision tree used to identify applicable statistical analyses is based
#' on the Baseline Assessment of Left-Censored Environmental Data Using R Tech Note.
#' More info can be found here: https://www.epa.gov/sites/default/files/2016-05/documents/tech_notes_10_jun2014_r.pdf
#'
#' A warning is raised when any TADA.ResultMeasureValue is NA, and an error is
#' raised when a required column is missing. If there is no censored data to
#' summarize, a notice is printed and an empty data frame is returned.
#'
#' @export
#'
#' @examples
#' # Load example dataset:
#' data(TADAProfileCleanTP)
#' # TADAProfileCleanTP dataframe is clean, harmonized, and filtered
#' # down to one Comparable Data Identifier
#'
#' # Create summarizeCensoredData table:
#' TADAProfileCleanTP_summarizeCensoredData <- summarizeCensoredData(TADAProfileCleanTP)
#'

summarizeCensoredData <- function(.data,
                                  spec_cols = c(
                                    "TADA.CharacteristicName",
                                    "TADA.ResultMeasure.MeasureUnitCode",
                                    "TADA.ResultSampleFractionText",
                                    "TADA.MethodSpecificationName"
                                  )) {
  if (any(is.na(.data$TADA.ResultMeasureValue))) {
    warning("Dataset contains data missing both a result value and a detection limit. Suggest removing or handling. See TADA Harmonization vignette for an example.")
  }

  # Flag censored data here if the user has not already done so.
  if (!"TADA.CensoredData.Flag" %in% names(.data)) {
    cens <- idCensoredData(.data)
  } else {
    cens <- .data
  }

  # Fail early, and by name, on columns the summaries below depend on. The
  # check runs on the flagged data so columns added by the flagging step count.
  required_cols <- unique(c(
    spec_cols,
    "ResultIdentifier",
    "TADA.ResultMeasureValue",
    "TADA.CensoredData.Flag"
  ))
  missing_cols <- setdiff(required_cols, names(cens))
  if (length(missing_cols) > 0) {
    stop(
      "The dataframe is missing the following column(s) required to summarize censored data: ",
      paste(missing_cols, collapse = ", "),
      call. = FALSE
    )
  }

  # Summarize one censoring type ("Non-Detect" or "Over-Detect") against the
  # uncensored results within each group, keeping only groups that contain at
  # least one censored result.
  summarize_censor_type <- function(censor_type_value) {
    cens %>%
      dplyr::group_by_at(spec_cols) %>%
      dplyr::filter(
        TADA.CensoredData.Flag %in% c(censor_type_value, "Uncensored")
      ) %>%
      dplyr::summarise(
        Measurement_Count = length(unique(ResultIdentifier)),
        Censored_Count = length(
          TADA.CensoredData.Flag[TADA.CensoredData.Flag == censor_type_value]
        ),
        Percent_Censored = length(
          TADA.CensoredData.Flag[TADA.CensoredData.Flag == censor_type_value]
        ) / length(TADA.CensoredData.Flag) * 100,
        Censoring_Levels = length(unique(
          TADA.ResultMeasureValue[TADA.CensoredData.Flag == censor_type_value]
        ))
      ) %>%
      dplyr::filter(Censored_Count > 0) %>%
      dplyr::mutate("TADA.CensoredData.Flag" = censor_type_value)
  }

  sum_low <- summarize_censor_type("Non-Detect")
  sum_hi <- summarize_censor_type("Over-Detect")

  sum_all <- plyr::rbind.fill(sum_low, sum_hi)

  # Conditions are evaluated in order, so the last two only apply to groups
  # that are 50%-80% censored.
  sum_all <- sum_all %>%
    dplyr::mutate(TADA.Censored.Note = dplyr::case_when(
      # greater than 80% censored, cannot estimate
      Percent_Censored > 80 ~ "Percent censored too high for estimation methods",
      # less than 50% censored, and multiple censoring levels (no minimum n)
      Percent_Censored < 50 & Censoring_Levels > 1 ~ "Kaplan-Meier",
      # less than 50% censored and one censoring level (no minimum n?)
      Percent_Censored < 50 ~ "Robust Regression Order Statistics",
      # 50%-80% censored, 50 or more measurements
      Measurement_Count >= 50 ~ "Maximum Likelihood Estimation",
      # 50%-80% censored, less than 50 measurements
      Measurement_Count < 50 ~ "Robust Regression Order Statistics"
    ))

  if (nrow(sum_all) == 0) {
    print("No censored data to summarize. Returning empty data frame.")
  }
  return(sum_all)
}
Loading