ARROW-16038: [R] different behavior from dplyr when mutate's .keep option is set

boshek · jonkeane · commit b8299436c8b1 · 2022-04-07T16:07:34.000-05:00
This PR does two things to match some dplyr behaviour around column order: 1) Mimics dplyr implementation of `mutate(..., .keep = "none")` to append new columns after the existing columns (if suggested) as [per](tidyverse/dplyr#6086) 2) As per this [discussion](tidyverse/dplyr#6086), this required a bespoke approach to `transmute` as it not simply a wrapper for `mutate(..., .keep = "none")`. This cascades into needing to catch a couple edge cases. I have also added some tests which will test for this behaviour. Closes #12818 from boshek/mutate-keep Authored-by: SAm Albers <sam.albers@gmail.com> Signed-off-by: Jonathan Keane <jkeane@gmail.com>
diff --git a/r/NAMESPACE b/r/NAMESPACE
@@ -331,6 +331,7 @@ importFrom(rlang,":=")
 importFrom(rlang,.data)
 importFrom(rlang,abort)
 importFrom(rlang,as_function)
+importFrom(rlang,as_label)
 importFrom(rlang,as_quosure)
 importFrom(rlang,call2)
 importFrom(rlang,caller_env)
diff --git a/r/R/arrow-package.R b/r/R/arrow-package.R
@@ -23,7 +23,7 @@
 #' @importFrom rlang eval_tidy new_data_mask syms env new_environment env_bind set_names exec
 #' @importFrom rlang is_bare_character quo_get_expr quo_get_env quo_set_expr .data seq2 is_interactive
 #' @importFrom rlang expr caller_env is_character quo_name is_quosure enexpr enexprs as_quosure
-#' @importFrom rlang is_list call2 is_empty as_function
+#' @importFrom rlang is_list call2 is_empty as_function as_label
 #' @importFrom tidyselect vars_pull vars_rename vars_select eval_select
 #' @useDynLib arrow, .registration = TRUE
 #' @keywords internal
diff --git a/r/R/dplyr-mutate.R b/r/R/dplyr-mutate.R
@@ -94,7 +94,10 @@ mutate.arrow_dplyr_query <- function(.data,
 
   # Respect .keep
   if (.keep == "none") {
-    .data$selected_columns <- .data$selected_columns[new_vars]
+    ## for consistency with dplyr, this appends new columns after existing columns
+    ## by specifying the order
+    new_cols_last <- c(intersect(old_vars, new_vars), setdiff(new_vars, old_vars))
+    .data$selected_columns <- .data$selected_columns[new_cols_last]
   } else if (.keep != "all") {
     # "used" or "unused"
     used_vars <- unlist(lapply(exprs, all.vars), use.names = FALSE)
@@ -112,7 +115,17 @@ mutate.Dataset <- mutate.ArrowTabular <- mutate.RecordBatchReader <- mutate.arro
 
 transmute.arrow_dplyr_query <- function(.data, ...) {
   dots <- check_transmute_args(...)
-  dplyr::mutate(.data, !!!dots, .keep = "none")
+  has_null <- map_lgl(dots, quo_is_null)
+  .data <- dplyr::mutate(.data, !!!dots, .keep = "none")
+  if (is_empty(dots) | any(has_null)) {
+    return(.data)
+  }
+
+  ## keeping with: https://github.com/tidyverse/dplyr/issues/6086
+  cur_exprs <- map_chr(dots, as_label)
+  transmute_order <- names(cur_exprs)
+  transmute_order[!nzchar(transmute_order)] <- cur_exprs[!nzchar(transmute_order)]
+  dplyr::select(.data, all_of(transmute_order))
 }
 transmute.Dataset <- transmute.ArrowTabular <- transmute.RecordBatchReader <- transmute.arrow_dplyr_query
 
diff --git a/r/tests/testthat/test-dplyr-mutate.R b/r/tests/testthat/test-dplyr-mutate.R
@@ -74,6 +74,16 @@ test_that("transmute", {
   )
 })
 
+test_that("transmute respect bespoke dplyr implementation", {
+  ## see: https://github.com/tidyverse/dplyr/issues/6086
+  compare_dplyr_binding(
+    .input %>%
+      transmute(dbl, int = int + 6L) %>%
+      collect(),
+    tbl
+  )
+})
+
 test_that("transmute() with NULL inputs", {
   compare_dplyr_binding(
     .input %>%
@@ -92,6 +102,20 @@ test_that("empty transmute()", {
   )
 })
 
+test_that("transmute with unnamed expressions", {
+  compare_dplyr_binding(
+    .input %>%
+      select(int, padded_strings) %>%
+      transmute(
+        int, # bare column name
+        nchar(padded_strings) # expression
+      ) %>%
+      filter(int > 5) %>%
+      collect(),
+    tbl
+  )
+})
+
 test_that("transmute() with unsupported arguments", {
   expect_error(
     tbl %>%
@@ -329,13 +353,13 @@ test_that("dplyr::mutate's examples", {
   #>   <chr> <chr> <dbl>
   #> 1 a     b         3
   compare_dplyr_binding(
-    .input %>% mutate(z = x + y, .keep = "none") %>% collect(), # same as transmute()
+    .input %>% mutate(z = x + y, x, .keep = "none") %>% collect(),
     df
   )
-  #> # A tibble: 1 x 1
-  #>       z
-  #>   <dbl>
-  #> 1     3
+  #> # A tibble: 1 × 2
+  #>       x     z
+  #>   <dbl> <dbl>
+  #> 1     1     3
 
   # Grouping ----------------------------------------
   # The mutate operation may yield different results on grouped