Update doc note and comments

apache · Nov 5, 2024 · 1c470cc · 1c470cc
1 parent ac69fba
commit 1c470cc
Show file tree

Hide file tree

Showing 2 changed files with 7 additions and 8 deletions.
diff --git a/r/R/arrow-package.R b/r/R/arrow-package.R
@@ -62,7 +62,10 @@ supported_dplyr_methods <- list(
   relocate = NULL,
   compute = NULL,
   collapse = NULL,
-  distinct = "`.keep_all = TRUE` not supported",
+  distinct = c(
+    "`.keep_all = TRUE` returns a non-missing value if present,",
+    "only returning missing values if all are missing."
+  ),
   left_join = "the `copy` argument is ignored",
   right_join = "the `copy` argument is ignored",
   inner_join = "the `copy` argument is ignored",

diff --git a/r/R/dplyr-distinct.R b/r/R/dplyr-distinct.R
@@ -28,14 +28,10 @@ distinct.arrow_dplyr_query <- function(.data, ..., .keep_all = FALSE) {
   }
 
   if (isTRUE(.keep_all)) {
-    # (TODO) `.keep_all = TRUE` return first row value,
-    # but this implementation do NOT always return the same result
-    # because `hash_one` skips rows if they contain null value.
-    # Skipping null values is happened by each cols,
-    # so this option has possiblity to destory data.
+    # Note: in regular dplyr, `.keep_all = TRUE` returns the first row's value.
+    # However, Acero's `hash_one` prefers non-null values, column by column, so
+    # the shape matches, but one output row may mix values from several rows.
     keeps <- names(.data)[!(names(.data) %in% .data$group_by_vars)]
-    # `one()` is wrapper for calling "hash_one" function (implemented ARROW-13993)
-    # `USAGE: summarize(x = one(x), y = one(y) ...)` for x, y in non-group cols
     exprs <- lapply(keeps, function(x) call2("one", sym(x)))
     names(exprs) <- keeps
   } else {