8 Advanced Operations

8.1 Simple wrapper function

Load the connections and dplyr libraries

library(connections)
library(dplyr)
library(dbplyr)
library(config)

Use connection_open() to open a Database connection

con <- connection_open(
  RPostgres::Postgres(),
  host =  "localhost",
  user = get("user"),
  password = get("pwd"),
  port = 5432,
  dbname = "postgres",
  bigint = "integer"
)

Create a variable that points to the v_orders table

orders <- tbl(con, in_schema("retail", "v_orders"))

Create a simple dplyr call that gets the average of all order totals

orders %>%
  summarise(mean = mean(order_total, na.rm = TRUE))

## # Source:   lazy query [?? x 1]
## # Database: postgres [rstudio_admin@localhost:5432/postgres]
##    mean
##   <dbl>
## 1  38.0

Load the rlang library
```
library(rlang)
```
Create a new function call my_mean() that will take an argument, x, and then returns the results of enquo(x)
```
my_mean <- function(x){
  enquo(x)
}
```
Test the new function. It should return the same variable name, but inside quosure. Use order_total as its argument’s value to test
```
my_mean(order_total)
```
```
## <quosure>
## expr: ^order_total
## env:  global
```
In the function, re-assign x to the result of enquo(x), and then return x
```
my_mean <- function(x){
  x <- enquo(x)
  x
}
```
Test the same way again, the output should match to what it was as before
```
my_mean(order_total)
```
```
## <quosure>
## expr: ^order_total
## env:  global
```
Remove the last line that has x, add the contents of the function with the initial dplyr code from step 3. Then replace order_total with !! x
```
my_mean <- function(x){
  x <- enquo(x)
  orders %>%
    summarise(mean = mean(!! x, na.rm = TRUE))
}
```

Test the new function by passing order_total as x

my_mean(order_total)

## # Source:   lazy query [?? x 1]
## # Database: postgres [rstudio_admin@localhost:5432/postgres]
##    mean
##   <dbl>
## 1  38.0

In the summarise() step, replace the name mean, with !! as_label(x), also replace the = sign, with :=

my_mean <- function(x){
  x <- enquo(x)
  orders %>%
    summarise(!! as_label(x) := mean(!! x, na.rm = TRUE))
}

Run the function again, the name of the column should match the argument value

my_mean(order_total)

## # Source:   lazy query [?? x 1]
## # Database: postgres [rstudio_admin@localhost:5432/postgres]
##   order_total
##         <dbl>
## 1        38.0

Test the function by passing a formula, such as order_total / order_qty

my_mean(order_total / order_qty)

## # Source:   lazy query [?? x 1]
## # Database: postgres [rstudio_admin@localhost:5432/postgres]
##   `order_total/order_qty`
##                     <dbl>
## 1                    6.65

Make the function generic, add a new argument called: .data. Inisde the function, replace orders with .data

my_mean <- function(.data, x){
  x <- enquo(x)
  .data %>%
    summarise(!! as_label(x) := mean(!! x, na.rm = TRUE))
}

The function now behaves more like a dplyr verb. Start with orders and then pipe into the function

orders %>% 
  my_mean(order_total)

## # Source:   lazy query [?? x 1]
## # Database: postgres [rstudio_admin@localhost:5432/postgres]
##   order_total
##         <dbl>
## 1        38.0

Clean up the code by removing the pipe that inside the function

my_mean <- function(.data, x){
  x <- enquo(x)
  summarise(
    .data,
    !! as_label(x) := mean(!! x, na.rm = TRUE)
    )
}

Confirm that there is no change in the behavior of the function

orders %>% 
  my_mean(order_total)

## # Source:   lazy query [?? x 1]
## # Database: postgres [rstudio_admin@localhost:5432/postgres]
##   order_total
##         <dbl>
## 1        38.0

Add a show_query() step to preview the resulting SQL statement

orders %>% 
  my_mean(order_total) %>%
  show_query()

## <SQL>
## SELECT AVG("order_total") AS "order_total"
## FROM retail.v_orders

Try the function with a non-DB backed variable, such as mtcars. Use mpg as the aggregating variable
```
mtcars %>% 
  my_mean(mpg)
```
```
##        mpg
## 1 20.09062
```

8.2 Multiple variables

Create functions that handle a variable number of arguments. The goal of the exercise is to create an anti-select() function.

Load the purrr package
```
library(purrr)
```
Use … as the second argument of a function called de_select(). Inside the function use enquos() to parse it
```
de_select <- function(.data, ...){
  vars <- enquos(...)
  vars
}
```

Test the function using airports

orders %>%
  de_select(order_id, date)

## <list_of<quosure>>
## 
## [[1]]
## <quosure>
## expr: ^order_id
## env:  0x56522573ace8
## 
## [[2]]
## <quosure>
## expr: ^date
## env:  0x56522573ace8

Add a step to the function that iterates through each quosure and prefixes a minus sign to tell select() to drop that specific field. Use map() for the iteration, and quo() to create the prefixed expression.
```
de_select <- function(.data, ...){
  vars <- enquos(...)
  vars <- map(vars, ~ quo(- !! .x))
  vars
}
```

Run the same test to view the new results

orders %>%
  de_select(order_id, date)

## [[1]]
## <quosure>
## expr: ^-^order_id
## env:  0x565225b688f0
## 
## [[2]]
## <quosure>
## expr: ^-^date
## env:  0x565225b6b610

Add the select() step. Use !!! to parse the vars variable inside select()

de_select <- function(.data, ...){
  vars <- enquos(...)
  vars <- map(vars, ~ quo(- !! .x))
  select(.data, !!! vars)
}

Run the test again, this time the operation will take place.

orders %>%
  de_select(order_id, date)

## # Source:   lazy query [?? x 8]
## # Database: postgres [rstudio_admin@localhost:5432/postgres]
##    date_year date_month customer_id customer_name customer_lon customer_lat
##        <int>      <int>       <int> <chr>                <dbl>        <dbl>
##  1      2016          1          22 Dr. Birdie K…        -122.         37.7
##  2      2016          1           6 Meggan Bruen         -122.         37.7
##  3      2016          1          80 Jessee Rodri…        -122.         37.7
##  4      2016          1          55 Kathryn Stehr        -122.         37.8
##  5      2016          1          73 Merlyn Runol…        -122.         37.7
##  6      2016          1          70 Reggie Mills         -122.         37.7
##  7      2016          1          55 Kathryn Stehr        -122.         37.8
##  8      2016          1          40 Dr. Trace Gl…        -122.         37.8
##  9      2016          1          78 Pricilla Goo…        -122.         37.8
## 10      2016          1          35 Mr. Commodor…        -122.         37.7
## # … with more rows, and 2 more variables: order_total <dbl>, order_qty <int>

Add a show_query() step to see the resulting SQL

orders %>%
  de_select(order_id, date) %>%
  show_query()

## <SQL>
## SELECT "date_year", "date_month", "customer_id", "customer_name", "customer_lon", "customer_lat", "order_total", "order_qty"
## FROM retail.v_orders

Test the function with a different data set, such as mtcars

mtcars %>%
  de_select(mpg, wt, am)

##                     cyl  disp  hp drat  qsec vs gear carb
## Mazda RX4             6 160.0 110 3.90 16.46  0    4    4
## Mazda RX4 Wag         6 160.0 110 3.90 17.02  0    4    4
## Datsun 710            4 108.0  93 3.85 18.61  1    4    1
## Hornet 4 Drive        6 258.0 110 3.08 19.44  1    3    1
## Hornet Sportabout     8 360.0 175 3.15 17.02  0    3    2
## Valiant               6 225.0 105 2.76 20.22  1    3    1
## Duster 360            8 360.0 245 3.21 15.84  0    3    4
## Merc 240D             4 146.7  62 3.69 20.00  1    4    2
## Merc 230              4 140.8  95 3.92 22.90  1    4    2
## Merc 280              6 167.6 123 3.92 18.30  1    4    4
## Merc 280C             6 167.6 123 3.92 18.90  1    4    4
## Merc 450SE            8 275.8 180 3.07 17.40  0    3    3
## Merc 450SL            8 275.8 180 3.07 17.60  0    3    3
## Merc 450SLC           8 275.8 180 3.07 18.00  0    3    3
## Cadillac Fleetwood    8 472.0 205 2.93 17.98  0    3    4
## Lincoln Continental   8 460.0 215 3.00 17.82  0    3    4
## Chrysler Imperial     8 440.0 230 3.23 17.42  0    3    4
## Fiat 128              4  78.7  66 4.08 19.47  1    4    1
## Honda Civic           4  75.7  52 4.93 18.52  1    4    2
## Toyota Corolla        4  71.1  65 4.22 19.90  1    4    1
## Toyota Corona         4 120.1  97 3.70 20.01  1    3    1
## Dodge Challenger      8 318.0 150 2.76 16.87  0    3    2
## AMC Javelin           8 304.0 150 3.15 17.30  0    3    2
## Camaro Z28            8 350.0 245 3.73 15.41  0    3    4
## Pontiac Firebird      8 400.0 175 3.08 17.05  0    3    2
## Fiat X1-9             4  79.0  66 4.08 18.90  1    4    1
## Porsche 914-2         4 120.3  91 4.43 16.70  0    5    2
## Lotus Europa          4  95.1 113 3.77 16.90  1    5    2
## Ford Pantera L        8 351.0 264 4.22 14.50  0    5    4
## Ferrari Dino          6 145.0 175 3.62 15.50  0    5    6
## Maserati Bora         8 301.0 335 3.54 14.60  0    5    8
## Volvo 142E            4 121.0 109 4.11 18.60  1    4    2

8.3 Multiple queries

Suggested approach to avoid passing multiple, and similar, queries to the database

Create a simple dplyr piped operation that returns the mean of order_total for the months of January, February and March as a group

orders %>%
  filter(date_month %in% c(1,2,3)) %>%
  summarise(mean = mean(order_total, na.rm = TRUE))

## # Source:   lazy query [?? x 1]
## # Database: postgres [rstudio_admin@localhost:5432/postgres]
##    mean
##   <dbl>
## 1  37.9

Assign the first operation to a variable called a, and create copy of the operation but changing the selected months to January, March and April. Assign the second one to a variable called b.

a <- orders %>%
  filter(date_month %in% c(1,2,3)) %>%
  summarise(mean = mean(order_total, na.rm = TRUE)) 

b <- orders %>%
  filter(date_month %in% c(1,3,4)) %>%
  summarise(mean = mean(order_total, na.rm = TRUE))

Use union() to pass a and b at the same time to the database

union(a, b)

## # Source:   lazy query [?? x 1]
## # Database: postgres [rstudio_admin@localhost:5432/postgres]
##    mean
##   <dbl>
## 1  37.9
## 2  38.0

Pipe the previous instruction to show_query() to confirm that the resulting query is a single one

union(a, b) %>%
  show_query()

## <SQL>
## (SELECT AVG("order_total") AS "mean"
## FROM retail.v_orders
## WHERE ("date_month" IN (1.0, 2.0, 3.0)))
## UNION
## (SELECT AVG("order_total") AS "mean"
## FROM retail.v_orders
## WHERE ("date_month" IN (1.0, 3.0, 4.0)))

Assign to a new variable called months an overlapping set of months
```
months <- list(
  c(1,2,3),
  c(1,3,4),
  c(2,4,6)
)
```

Use map() to cycle through each set of overlapping months. Notice that it returns three separate results, meaning that it went to the database three times

months %>%
  map(
    ~ orders %>%
        filter(date_month %in% .x) %>%
        summarise(mean = mean(order_total, na.rm = TRUE)) 
  )

## [[1]]
## # Source:   lazy query [?? x 1]
## # Database: postgres [rstudio_admin@localhost:5432/postgres]
##    mean
##   <dbl>
## 1  37.9
## 
## [[2]]
## # Source:   lazy query [?? x 1]
## # Database: postgres [rstudio_admin@localhost:5432/postgres]
##    mean
##   <dbl>
## 1  38.0
## 
## [[3]]
## # Source:   lazy query [?? x 1]
## # Database: postgres [rstudio_admin@localhost:5432/postgres]
##    mean
##   <dbl>
## 1  38.2

Add a reduce() operation and use union() command to create a single query

months %>%
  map(
    ~ orders %>%
        filter(date_month %in% .x) %>%
        summarise(mean = mean(order_total, na.rm = TRUE)) 
  ) %>%
  reduce(function(x, y) union(x, y))

## # Source:   lazy query [?? x 1]
## # Database: postgres [rstudio_admin@localhost:5432/postgres]
##    mean
##   <dbl>
## 1  38.2
## 2  38.0
## 3  37.9

Use show_query() to see the resulting single query sent to the database

months %>%
  map(
    ~ orders %>%
        filter(date_month %in% .x) %>%
        summarise(mean = mean(order_total, na.rm = TRUE)) 
  ) %>%
  reduce(function(x, y) union(x, y)) %>%
  show_query()

## <SQL>
## ((SELECT AVG("order_total") AS "mean"
## FROM retail.v_orders
## WHERE ("date_month" IN (1.0, 2.0, 3.0)))
## UNION
## (SELECT AVG("order_total") AS "mean"
## FROM retail.v_orders
## WHERE ("date_month" IN (1.0, 3.0, 4.0))))
## UNION
## (SELECT AVG("order_total") AS "mean"
## FROM retail.v_orders
## WHERE ("date_month" IN (2.0, 4.0, 6.0)))

8.4 Multiple queries with an overlapping range

Create a table with a from and to ranges

ranges <- tribble(
  ~ from, ~to, 
       1,   4,
       2,   5,
       3,   7
)

See how map2() works by passing the two variables as the x and y arguments, and adding them as the function

map2(ranges$from, ranges$to, ~.x + .y)

## [[1]]
## [1] 5
## 
## [[2]]
## [1] 7
## 
## [[3]]
## [1] 10

Replace x + y with the dplyr operation from the previous exercise. In it, re-write the filter to use x and y as the month ranges

map2(
  ranges$from, 
  ranges$to,
   ~ orders %>%
        filter(date_month >= .x & date_month <= .y) %>%
        summarise(mean = mean(order_total, na.rm = TRUE)) 
)

## [[1]]
## # Source:   lazy query [?? x 1]
## # Database: postgres [rstudio_admin@localhost:5432/postgres]
##    mean
##   <dbl>
## 1  38.0
## 
## [[2]]
## # Source:   lazy query [?? x 1]
## # Database: postgres [rstudio_admin@localhost:5432/postgres]
##    mean
##   <dbl>
## 1  38.2
## 
## [[3]]
## # Source:   lazy query [?? x 1]
## # Database: postgres [rstudio_admin@localhost:5432/postgres]
##    mean
##   <dbl>
## 1  38.3

Add the reduce() operation

map2(
  ranges$from, 
  ranges$to,
   ~ orders %>%
        filter(date_month >= .x & date_month <= .y) %>%
        summarise(mean = mean(order_total, na.rm = TRUE)) 
) %>%
reduce(function(x, y) union(x, y))

## # Source:   lazy query [?? x 1]
## # Database: postgres [rstudio_admin@localhost:5432/postgres]
##    mean
##   <dbl>
## 1  38.3
## 2  38.0
## 3  38.2

Add a show_query() step to see how the final query was constructed.

map2(
  ranges$from, 
  ranges$to,
   ~ orders %>%
        filter(date_month >= .x & date_month <= .y) %>%
        summarise(mean = mean(order_total, na.rm = TRUE)) 
) %>%
reduce(function(x, y) union(x, y)) %>%
show_query()

## <SQL>
## ((SELECT AVG("order_total") AS "mean"
## FROM retail.v_orders
## WHERE ("date_month" >= 1.0 AND "date_month" <= 4.0))
## UNION
## (SELECT AVG("order_total") AS "mean"
## FROM retail.v_orders
## WHERE ("date_month" >= 2.0 AND "date_month" <= 5.0)))
## UNION
## (SELECT AVG("order_total") AS "mean"
## FROM retail.v_orders
## WHERE ("date_month" >= 3.0 AND "date_month" <= 7.0))

8.5 Characters to field names

Create two character variables. One with the name of a field in flights and another with a new name to be given to the field
```
my_field <- "new"
orders_field <- "order_total"
```

Add a mutate() step that adds the new field. And then another step selecting just the new field

orders %>%
  mutate(my_field = !! orders_field) %>%
  select(my_field)

## # Source:   lazy query [?? x 1]
## # Database: postgres [rstudio_admin@localhost:5432/postgres]
##    my_field   
##    <chr>      
##  1 order_total
##  2 order_total
##  3 order_total
##  4 order_total
##  5 order_total
##  6 order_total
##  7 order_total
##  8 order_total
##  9 order_total
## 10 order_total
## # … with more rows

Add a mutate() step that adds the new field. And then another step selecting just the new field

orders %>%
  mutate(!! my_field := !! orders_field) %>%
  select(my_field)

## # Source:   lazy query [?? x 1]
## # Database: postgres [rstudio_admin@localhost:5432/postgres]
##    new        
##    <chr>      
##  1 order_total
##  2 order_total
##  3 order_total
##  4 order_total
##  5 order_total
##  6 order_total
##  7 order_total
##  8 order_total
##  9 order_total
## 10 order_total
## # … with more rows

Wrap orders_field inside a sym() function

orders %>%
  mutate(!! my_field := !! sym(orders_field)) %>%
  select(my_field)

## # Source:   lazy query [?? x 1]
## # Database: postgres [rstudio_admin@localhost:5432/postgres]
##      new
##    <dbl>
##  1 27.9 
##  2 24.8 
##  3 42.2 
##  4 16.6 
##  5 32.6 
##  6  6.7 
##  7 46.8 
##  8 15.8 
##  9  9.69
## 10 21.6 
## # … with more rows

Pipe the code into show_query()

orders %>%
  mutate(!! my_field := !! sym(orders_field)) %>%
  select(my_field) %>%
  show_query()

## <SQL>
## SELECT "order_total" AS "new"
## FROM retail.v_orders