Merge pull request #64 from leopoldguyot/reference_drop_issue

Issue #63
statOmics · Sep 3, 2024 · 285af81 · 285af81
2 parents 8373e72 + bddff79
commit 285af81
Show file tree

Hide file tree

Showing 21 changed files with 606 additions and 44 deletions.
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -82,7 +82,7 @@ Collate: 'msqrob-framework.R' 'allGenerics.R'
 Encoding: UTF-8
 VignetteBuilder: knitr
 Roxygen: list(markdown=TRUE)
-RoxygenNote: 7.3.1
+RoxygenNote: 7.3.2
 biocViews:
     Proteomics,
     MassSpectrometry,

diff --git a/NAMESPACE b/NAMESPACE
@@ -8,6 +8,7 @@ export(getDF)
 export(getDfPosterior)
 export(getFitMethod)
 export(getModel)
+export(getReferencePresent)
 export(getSigma)
 export(getSigmaPosterior)
 export(getVar)

diff --git a/R/StatModel-methods.R b/R/StatModel-methods.R
@@ -11,6 +11,10 @@
 #'        the linear model coefficients to be tested equal to zero.
 #'        The rownames of the matrix should be equal to the names of
 #'        parameters of the model.
+#' @param acceptDifferentReference `boolean(1)` to indicate if the contrasts that involve
+#'        parameters with modified reference levels are returned. Watch out putting this
+#'        parameter to TRUE can change the interpretation of the logFC. Default is FALSE.
+#'
 #' @examples
 #' data(pe)
 #' # Aggregate peptide intensities in protein expression values
@@ -33,11 +37,14 @@
 
 setMethod(
     "getContrast", "StatModel",
-    function(object, L) {
+    function(object, L, acceptDifferentReference = FALSE) {
         if (!is(L, "matrix")) L <- as.matrix(L)
-        coefs <- getCoef(object)
         out <- matrix(rep(NA, ncol(L)))
         rownames(out) <- colnames(L)
+        if (!referenceContrast(getReferencePresent(object), L, acceptDifferentReference)) {
+            return(out)
+        }
+        coefs <- getCoef(object)
         hlp <- try(t(L) %*% coefs[rownames(L)], silent = TRUE)
         if (!is(hlp, "try-error")) out[] <- hlp
         return(out)
@@ -49,10 +56,13 @@ setMethod(
 
 setMethod(
     "varContrast", "StatModel",
-    function(object, L) {
+    function(object, L, acceptDifferentReference = FALSE) {
         if (!is(L, "matrix")) L <- as.matrix(L)
         out <- matrix(NA, ncol(L), ncol(L))
         rownames(out) <- colnames(out) <- colnames(L)
+        if (!referenceContrast(getReferencePresent(object), L, acceptDifferentReference)) {
+            return(out)
+        }
         vcovTmp <- getVcovUnscaled(object) * object@varPosterior
         hlp <- try(t(L) %*% vcovTmp[rownames(L), rownames(L)] %*% L, silent = TRUE)
         if (!is(hlp, "try-error")) out[] <- hlp

diff --git a/R/accessors.R b/R/accessors.R
@@ -14,6 +14,8 @@
 #'              \item{getSigmaPosterior(object)}{to get the empirical Bayes standard deviation}
 #'              \item{getVcovUnscaled(object)}{to get the unscaled variance covariance matrix
 #'                of the model parameters}
+#'              \item{getReferencePresent(object)}{to get the vector of logicals indicating if the
+#'               reference level associated with a model parameter is present}
 #'              }
 #'
 #' @rdname statModelAccessors
@@ -95,3 +97,9 @@ setMethod("getVcovUnscaled",
     signature = "StatModel",
     definition = function(object) object@params$vcovUnscaled
 )
+
+#' @rdname statModelAccessors
+setMethod("getReferencePresent",
+    signature = "StatModel",
+    definition = function(object) object@params$referencePresent
+)
diff --git a/R/allGenerics.R b/R/allGenerics.R
@@ -19,7 +19,9 @@ setGeneric("getVarPosterior", function(object) standardGeneric("getVarPosterior"
 #' @export
 setGeneric("getSigmaPosterior", function(object) standardGeneric("getSigmaPosterior"))
 #' @export
-setGeneric("getContrast", function(object, L) standardGeneric("getContrast"))
+setGeneric("getReferencePresent", function(object) standardGeneric("getReferencePresent"))
+#' @export
+setGeneric("getContrast", function(object, ...) standardGeneric("getContrast"))
 #' @export
 setGeneric("getVcovUnscaled", function(object) standardGeneric("getVcovUnscaled"))
 #' @export

diff --git a/R/hypothesisTest-methods.R b/R/hypothesisTest-methods.R
@@ -122,16 +122,16 @@ setMethod(
     adjust.method = "BH",
     modelColumn = "msqrobModels",
     resultsColumnNamePrefix = "",
-    overwrite = FALSE) {
+    overwrite = FALSE,
+    acceptDifferentReference = FALSE) {
         if (!(modelColumn %in% colnames(rowData(object)))) stop("There is no column named \'", modelColumn, "\' with stored models of an msqrob fit in the rowData of the SummarizedExperiment object")
         if (is.null(colnames(contrast)) & resultsColumnNamePrefix == "") resultsColumnNamePrefix <- "msqrobResults"
         if (is.null(colnames(contrast)) & ncol(contrast) > 1) colnames(contrast) <- seq_len(ncol(contrast))
         if ((sum(paste0(resultsColumnNamePrefix, colnames(contrast)) %in% colnames(rowData(object))) > 0) & !overwrite) stop("There is/are already column(s) with names starting with\'", resultsColumnNamePrefix, "\' in the rowData of the SummarizedExperiment object, set the argument overwrite=TRUE to replace the column(s) with the new results or use another name for the argument resultsColumnNamePrefix")
         for (j in seq_len(ncol(contrast)))
         {
-            contrHlp <- contrast[, j]
-            names(contrHlp) <- rownames(contrast)
-            rowData(object)[[paste0(resultsColumnNamePrefix, colnames(contrast)[j])]] <- topFeatures(rowData(object)[, modelColumn], contrast = contrHlp, adjust.method = adjust.method, sort = FALSE, alpha = 1)
+            contrHlp <- contrast[, j, drop = FALSE]
+            rowData(object)[[paste0(resultsColumnNamePrefix, colnames(contrast)[j])]] <- topFeatures(rowData(object)[, modelColumn], contrast = contrHlp, adjust.method = adjust.method, sort = FALSE, alpha = 1, acceptDifferentReference = acceptDifferentReference)
         }
         return(object)
     }
@@ -149,7 +149,8 @@ setMethod(
     adjust.method = "BH",
     modelColumn = "msqrobHurdle",
     resultsColumnNamePrefix = "hurdle_",
-    overwrite = FALSE) {
+    overwrite = FALSE,
+    acceptDifferentReference = FALSE) {
         if (sum(paste0(modelColumn, c("Intensity", "Count")) %in% colnames(rowData(object))) != 2) stop("There are no columns for the models of the hurdle components in the rowData of the SummarizedExperiment")
         if (is.null(colnames(contrast)) & resultsColumnNamePrefix == "hurdle_") resultsColumnNamePrefix <- "hurdleResults"
         if (is.null(colnames(contrast)) & ncol(contrast) > 1) colnames(contrast) <- seq_len(ncol(contrast))
@@ -158,8 +159,8 @@ setMethod(
         {
             contrHlp <- contrast[, j]
             names(contrHlp) <- rownames(contrast)
-            intensityComponent <- topFeatures(rowData(object)[, paste0(modelColumn, "Intensity")], contrast = contrHlp, adjust.method = adjust.method, sort = FALSE, alpha = 1)
-            countComponent <- topFeatures(rowData(object)[, paste0(modelColumn, "Count")], contrast = contrHlp, adjust.method = adjust.method, sort = FALSE, alpha = 1)
+            intensityComponent <- topFeatures(rowData(object)[, paste0(modelColumn, "Intensity")], contrast = contrHlp, adjust.method = adjust.method, sort = FALSE, alpha = 1, acceptDifferentReference = acceptDifferentReference)
+            countComponent <- topFeatures(rowData(object)[, paste0(modelColumn, "Count")], contrast = contrHlp, adjust.method = adjust.method, sort = FALSE, alpha = 1, acceptDifferentReference = acceptDifferentReference)
 
             sam <- cbind(
                 intensityComponent[, seq_len(5)],
@@ -205,7 +206,8 @@ setMethod(
     adjust.method = "BH",
     modelColumn = "msqrobModels",
     resultsColumnNamePrefix = "",
-    overwrite = FALSE) {
+    overwrite = FALSE,
+    acceptDifferentReference = FALSE) {
         if (is.null(object[[i]])) stop("QFeatures object does not contain an assay with the name ", i)
         if (!(modelColumn %in% colnames(rowData(object[[i]])))) stop("There is no column named \'", modelColumn, "\' with stored models of an msqrob fit in the rowData of assay ", i, "of the QFeatures object.")
         if (is.null(colnames(contrast)) & resultsColumnNamePrefix == "") resultsColumnNamePrefix <- "msqrobResults"
@@ -215,7 +217,7 @@ setMethod(
         {
             contrHlp <- contrast[, j]
             names(contrHlp) <- rownames(contrast)
-            rowData(object[[i]])[[paste0(resultsColumnNamePrefix, colnames(contrast)[j])]] <- topFeatures(rowData(object[[i]])[, modelColumn], contrast = contrHlp, adjust.method = adjust.method, sort = FALSE, alpha = 1)
+            rowData(object[[i]])[[paste0(resultsColumnNamePrefix, colnames(contrast)[j])]] <- topFeatures(rowData(object[[i]])[, modelColumn], contrast = contrHlp, adjust.method = adjust.method, sort = FALSE, alpha = 1, acceptDifferentReference = acceptDifferentReference)
         }
         return(object)
     }
@@ -232,7 +234,8 @@ setMethod(
     adjust.method = "BH",
     modelColumn = "msqrobHurdle",
     resultsColumnNamePrefix = "hurdle_",
-    overwrite = FALSE) {
+    overwrite = FALSE,
+    acceptDifferentReference = FALSE) {
         if (is.null(object[[i]])) stop("QFeatures object does not contain an assay with the name ", i)
         if (sum(paste0(modelColumn, c("Intensity", "Count")) %in% colnames(rowData(object[[i]]))) != 2) stop("There are no columns for the models of the hurdle components in the rowData of assay ", i, "of the QFeatures object.")
         if (is.null(colnames(contrast)) & resultsColumnNamePrefix == "hurdle_") resultsColumnNamePrefix <- "hurdleResults"
@@ -242,8 +245,8 @@ setMethod(
         {
             contrHlp <- contrast[, j]
             names(contrHlp) <- rownames(contrast)
-            intensityComponent <- topFeatures(rowData(object[[i]])[, paste0(modelColumn, "Intensity")], contrast = contrHlp, adjust.method = adjust.method, sort = FALSE, alpha = 1)
-            countComponent <- topFeatures(rowData(object[[i]])[, paste0(modelColumn, "Count")], contrast = contrHlp, adjust.method = adjust.method, sort = FALSE, alpha = 1)
+            intensityComponent <- topFeatures(rowData(object[[i]])[, paste0(modelColumn, "Intensity")], contrast = contrHlp, adjust.method = adjust.method, sort = FALSE, alpha = 1, acceptDifferentReference = acceptDifferentReference)
+            countComponent <- topFeatures(rowData(object[[i]])[, paste0(modelColumn, "Count")], contrast = contrHlp, adjust.method = adjust.method, sort = FALSE, alpha = 1, acceptDifferentReference = acceptDifferentReference)
 
             sam <- cbind(
                 intensityComponent[, seq_len(5)],

diff --git a/R/msqrob-utils.R b/R/msqrob-utils.R
@@ -86,9 +86,142 @@ makeContrast <- function(contrasts, parameterNames) {
     return(t(.chrlinfct2matrix(contrasts, parameterNames)$K))
 }
 
+
+#' Check for presence of reference levels
+#' @description Check if the reference levels of the factors in the model are present for a specific feature.
+#'
+#' @param y A numeric vector with the feature values for each sample.
+#'
+#' @param data A `DataFrame` with information on the design. It has
+#'        the same number of rows as the number of columns (samples) of
+#'        `y`.
+#' 
+#' @param paramNames A `character` vector with the names of the parameters in the model.
+#' 
+#' @param factorLevels A list with the levels of the factors in the model. The names of the list
+#'       should correspond to the factor variables present in the model.
+#' 
+#' @return A logical vector indicating for each parameter (that comes from a factor variable) in the model 
+#' if his reference level has values for the current feature.
+#' 
+#' @rdname checkReference
+#'
+checkReference <- function(y, data, paramNames, factorLevels) {
+    if (length(factorLevels) == 0) {
+        return(logical(0))
+    }
+    factors <- factorLevels
+    subset <- droplevels(data[!is.na(y), names(factors), drop = FALSE])
+    paramSplit <- list()
+    for (x in names(factors)) {
+        paramSplit[[x]] <- paramNames[grep(x, paramNames)]
+    }
+    factor_status <- unlist(lapply(names(factors), function(x) {
+        factorLevels <- factors[[x]]
+        referenceRef <- factorLevels[1]
+        noIntercept <- all(paste0(x, factorLevels) %in% paramNames)
+        refPres <- referenceRef %in% levels(subset[, x])
+        return(noIntercept || refPres)
+    }))
+    names(factor_status) <- names(factors)
+
+    referencePresent <- propagateFalseStatus(paramSplit, factor_status)
+
+    return(unlist(referencePresent))
+}
+
+#' Get the levels of the factors in the model
+#' 
+#' @description Get the levels of the factors in the model associated with the factors
+#'            names.
+#' 
+#' @param data A `DataFrame` with information on the design. Contains the variables that are used in the model.
+#' 
+#' @param formula A `formula` object that specifies the model.
+#' 
+#' @return A list with the levels of the factors in the model. The names of the list
+#'        correspond to the factor variables present in the model.
+#' 
+#' @rdname getFormulaFactors
+#' 
+getFormulaFactors <- function(formula, dataFrame) {
+    vars <- all.vars(formula)
+    if(length(vars) == 0) {
+        return(character(0))
+    }
+    nam <- vars[sapply(vars, function(x) is.factor(dataFrame[, x]) || is.character(dataFrame[, x]) )]
+    res <- lapply(nam, function(x) {
+        levels(as.factor(dataFrame[, x]))
+    })
+    names(res) <- nam
+    return(res)
+}
+
+#' Check if the reference levels associated with the contrast are present
+#' @description Check if the reference levels associated with the contrast are present
+#'
+#' @param referencePresent A logical vector indicating for each parameter in the model 
+#' if his reference level has values for the current feature.
+#'
+#' @param L A numeric contrast matrix with rownames that equal the model parameters 
+#' that are involved in the contrasts
+#'
+#' @param acceptDifferentReference `boolean(1)` to indicate if the contrasts that involve
+#' parameters with modified reference levels are returned.
+#' 
+#' @return A boolean
+#' 
+#' @rdname referenceContrast
+#' 
+referenceContrast <- function(referencePresent, L, acceptDifferentReference) {
+  acceptDifferentReference || is.null(referencePresent) || !any(!referencePresent[rownames(L)])
+}
+
+checkFullRank <- function(modelMatrix) {
+    return(qr(modelMatrix)$rank == ncol(modelMatrix))
+}
+
+
 ##### None exported functions from multcomp package is included here to
 ##### During R and Bioc checks
 
+propagateFalseStatus <- function(vectors, statuses) {
+    # Initialize a flag to track if any status has changed
+    has_changed <- TRUE
+
+    # Use a while loop that runs until no statuses change
+    while (has_changed) {
+        # Collect all elements from vectors marked as FALSE
+        false_elements <- unique(unlist(vectors[statuses == FALSE]))
+
+        # Initialize new statuses as the current statuses
+        new_statuses <- statuses
+
+        # Check each vector and update status if it contains any false element
+        for (i in seq_along(vectors)) {
+        if (statuses[i] == TRUE && any(vectors[[i]] %in% false_elements)) {
+            new_statuses[i] <- FALSE  # Change status to FALSE
+        }
+        }
+
+        # Check if any status has changed
+        has_changed <- !all(new_statuses == statuses)
+
+        # Update statuses with new values
+        statuses <- new_statuses
+    }
+    # Create a named vector of unique elements with their final statuses
+    all_elements <- unique(unlist(vectors))
+    element_status <- setNames(rep(TRUE, length(all_elements)), all_elements)
+
+    # Set the final status of each element based on the propagated statuses
+    for (i in seq_along(vectors)) {
+        element_status[vectors[[i]]] <- statuses[i]
+    }
+
+    return(element_status)
+}
+
 
 .chrlinfct2matrix <- function(ex, var) {
     if (!is.character(ex)) {
@@ -668,3 +801,4 @@ makeContrast <- function(contrasts, parameterNames) {
         ex, " to character"
     )
 }
+
diff --git a/R/msqrob.R b/R/msqrob.R
@@ -57,22 +57,29 @@ msqrobLm <- function(y,
     robust = TRUE,
     maxitRob = 5) {
     myDesign <- model.matrix(formula, data)
+    missingSamples <- colSums(!is.na(y)) == 0
+    subsettedDesign <- myDesign[!missingSamples, , drop = FALSE]
+    stopifnot("The provided model must be full rank" = checkFullRank(subsettedDesign))
+    paramNames <- colnames(myDesign)
+    factorLevels <- getFormulaFactors(formula, data)
     models <- apply(y, 1,
         function(y, design) {
             ## computatability check
             obs <- is.finite(y)
             type <- "fitError"
+            referencePresent <- checkReference(y, data, paramNames, factorLevels)
             model <- list(
-                coefficients = NA, vcovUnscaled = NA,
-                sigma = NA, df.residual = NA, w = NA
+                referencePresent = referencePresent, coefficients = NA,
+                vcovUnscaled = NA, sigma = NA,
+                df.residual = NA, w = NA
             )
 
             if (sum(obs) > 0) {
                 ## subset to finite observations, attention with R column switching
                 X <- design[obs, , drop = FALSE]
-                X <- X[,colMeans(X == 0) != 1 , drop = FALSE]
+                qrX <- qr(X)
+                X <- X[, qrX$pivot[seq_len(qrX$rank)], drop = FALSE]
                 y <- y[obs]
-                colnames_orig <- colnames(design)
 
                 if (robust) {
                     ## use robust regression from MASS package, "M" estimation is used
@@ -108,16 +115,17 @@ msqrobLm <- function(y,
                 }
 
                 if (type != "fitError") {
-                    coef <- rep(NA, length(colnames_orig))
-                    names(coef) <- colnames_orig
+                    coef <- rep(NA, length(paramNames))
+                    names(coef) <- paramNames
                     coef[names(mod$coef)] <- mod$coef
-                    vcovUnscaled <- matrix(NA, nrow =length(colnames_orig), ncol = length(colnames_orig))
-                    rownames(vcovUnscaled) <- colnames(vcovUnscaled) <-  colnames_orig
+                    vcovUnscaled <- matrix(NA, nrow =length(paramNames), ncol = length(paramNames))
+                    rownames(vcovUnscaled) <- colnames(vcovUnscaled) <-  paramNames
                     vcovUnscaled[names(mod$coef), names(mod$coef)] <- msqrob2:::.vcovUnscaled(mod)
 
                     model <- list(
-                        coefficients = mod$coef,
-                        vcovUnscaled = .vcovUnscaled(mod),
+                        referencePresent = referencePresent,
+                        coefficients = coef,
+                        vcovUnscaled = vcovUnscaled,
                         sigma = sigma,
                         df.residual = df.residual,
                         w = w
@@ -770,3 +778,5 @@ msqrobGlm <- function(y,
 
     return(models)
 }
+
+