for (i in some_vector) {
# Code to do stuff based on i
}
11 Loops and iteration - Part 1
Settling In
Data Storytelling Moment
Learning goals
After this lesson, you should be able to:
- Write a
for
loop in R to handle repeated tasks- Use
seq_along()
andseq_len()
to iterate over vectors or rows of a data frame - Store output from a
for
loop in a list or other storage container
- Use
- Recognize that vectorized functions help us avoid
for
loops in many situations - Use the
across()
function to wrangle multiple variables simultaneously
Iteration
When we want to reuse code, we can use:
- functions to save the steps of the task
- iteration to apply that function to new inputs
. . .
Let’s talk about iteration in R.
- Iterating using a
for
loop & how to do it appropriately in R (today) - Iterating
across
data frame columns (today) - Iterating using
map
andwalk
for iteration, instead of afor
loop (next class)
Purposes of Iteration
Creating or saving a new object
- Performing an action with a return output
- Ex. creating new variables/datasets, fitting models, etc.
- We’ll need to consider storage (memory allocation).
- Next class, we’ll use
map()
for this…
. . .
Side effects
- Performing an action without a R return output
- Ex: saving data using
write_csv()
or saving a plot withpng()
- We don’t need to worry about storage in R.
- We could use
walk()
for this…
for
loops
In R, for
loops have the following general structure:
. . .
some_vector
can be any vector, including:
- An indexing integer vector:
1:3
[Most common!] - A character vector:
c("group1", "group2", "group3")
- A vector of any other class
<- c("group1", "group2", "group3") # a vector to iterate over
groups
for (i in 1:3) { # i: index
print(groups[i])
}
[1] "group1"
[1] "group2"
[1] "group3"
for (g in groups) { # iterating over the vector
print(g)
}
[1] "group1"
[1] "group2"
[1] "group3"
. . .
To iterate over an indexing vector, it is useful to know about the following two functions:
seq_along(v)
: generates an integer sequence from 1 to the length of the vectorv
suppliedseq_len(n)
: generates an integer sequence from 1 ton
for (i in seq_along(groups)) {
print(groups[i])
}
[1] "group1"
[1] "group2"
[1] "group3"
for (i in seq_len(length(groups))) {
print(groups[i])
}
[1] "group1"
[1] "group2"
[1] "group3"
. . .
A nice feature of seq_along()
is that it generates an empty iteration vector if the vector you’re iterating over itself has length 0.
<- c()
no_groups seq_along(no_groups)
integer(0)
for (i in seq_along(no_groups)) {
print(no_groups[i])
}
seq_len()
is useful for iterating over the rows of a data frame because seq_along()
would iterate over columns:
<- tibble(a = 1:2, b = 2:3, c = 4:5)
small_data
for (col in small_data) { # each element is the column
print(col)
}
[1] 1 2
[1] 2 3
[1] 4 5
for (r in seq_len(nrow(small_data))) {
print(r)
}
[1] 1
[1] 2
. . .
Indexing is particularly important when we want to create/save new objects as we need to store output created during a for
loop.
<- vector("character", length = 3)
char_storage <- vector("numeric", length = 3)
num_storage <- vector("list", length = 3)
list_storage
char_storage
[1] "" "" ""
num_storage
[1] 0 0 0
list_storage
[[1]]
NULL
[[2]]
NULL
[[3]]
NULL
for (i in seq_len(3)) {
<- str_c("Number: ", i)
char_storage[i] <- 2*i
num_storage[i] <- i # Note the [[ for subsetting here
list_storage[[i]]
}
char_storage
[1] "Number: 1" "Number: 2" "Number: 3"
num_storage
[1] 2 4 6
list_storage
[[1]]
[1] 1
[[2]]
[1] 2
[[3]]
[1] 3
for
loops and memory allocation
If you don’t create a storage container ahead of time, you will regret it. Let’s bench mark our code to see the difference!
# Example 1: No storage container; append new value to existing vector
<- integer()
x ::mark(
benchfor (i in 1:1e5) {
<- c(x, i)
x
} )
# A tibble: 1 × 6
expression min median `itr/sec` mem_alloc `gc/sec`
<bch:expr> <bch:> <bch:> <dbl> <bch:byt> <dbl>
1 for (i in 1:1e+05) { x <- c(x, i) } 24.7s 24.7s 0.0406 18.6GB 51.0
# Example 2: Using a storage container!
#Notice median (time to run), mem_alloc (how much memory you are using), and gc (garbage collection)
<- vector('integer', length = 1e5)
x ::mark(
benchfor (i in 1:1e5) {
<- i
x[i]
} )
# A tibble: 1 × 6
expression min median `itr/sec` mem_alloc `gc/sec`
<bch:expr> <bch:tm> <bch:t> <dbl> <bch:byt> <dbl>
1 for (i in 1:1e+05) { x[i] <- i } 2.78ms 3.24ms 284. 402KB 4.08
# Example 3: Vectorized function
#Notice median (time to run), mem_alloc (how much memory you are using), and gc (garbage collection)
::mark(
bench<- seq(1, 1e5, by = 1)
x )
# A tibble: 1 × 6
expression min median `itr/sec` mem_alloc `gc/sec`
<bch:expr> <bch:tm> <bch:tm> <dbl> <bch:byt> <dbl>
1 x <- seq(1, 1e+05, by = 1) 348µs 493µs 1299. 1.91MB 41.9
Exercises
Pair programming exercises: Whoever has most recently eaten dessert (broadly interpreted) should be the driver first. Switch after Exercise 2.
Write for
-loops that do each of the following:
- Prints the even numbers from 2:20
- Then, produce the same output with the
seq()
function
- Then, produce the same output with the
# for loop
# vectorized approach with `seq()`
- Iterates over the
month.name
vector and stores a character vector of output containing strings like “Month 1: January”, “Month 2: February”.- Then, produce the same output with
str_c()
only.
- Then, produce the same output with
# for loop
# vectorized approach with `str_c()`
- On the
diamonds
dataset, fit stratified models of price vs. carat, model separately estimated for each value ofcut
, and store the fitted models in a list storage container.
# for loop
Iteration across
data frame columns
Often we will have to perform the same data wrangling on many variables (e.g., rounding numbers)
%>%
diamonds mutate(
carat = round(carat, 1),
x = round(x, 1),
y = round(y, 1),
z = round(z, 1)
)
# A tibble: 53,940 × 10
carat cut color clarity depth table price x y z
<dbl> <ord> <ord> <ord> <dbl> <dbl> <int> <dbl> <dbl> <dbl>
1 0.2 Ideal E SI2 61.5 55 326 4 4 2.4
2 0.2 Premium E SI1 59.8 61 326 3.9 3.8 2.3
3 0.2 Good E VS1 56.9 65 327 4 4.1 2.3
4 0.3 Premium I VS2 62.4 58 334 4.2 4.2 2.6
5 0.3 Good J SI2 63.3 58 335 4.3 4.3 2.8
6 0.2 Very Good J VVS2 62.8 57 336 3.9 4 2.5
7 0.2 Very Good I VVS1 62.3 57 336 4 4 2.5
8 0.3 Very Good H SI1 61.9 55 337 4.1 4.1 2.5
9 0.2 Fair E VS2 65.1 61 337 3.9 3.8 2.5
10 0.2 Very Good H VS1 59.4 61 338 4 4 2.4
# ℹ 53,930 more rows
dplyr
provides the across()
function for performing these repeated function calls:
# Option 1: Create our own named function
<- function(x) {
round_to_one round(x, digits = 1)
}%>%
diamonds mutate(across(.cols = c(carat, x, y, z), .fns = round_to_one))
# A tibble: 53,940 × 10
carat cut color clarity depth table price x y z
<dbl> <ord> <ord> <ord> <dbl> <dbl> <int> <dbl> <dbl> <dbl>
1 0.2 Ideal E SI2 61.5 55 326 4 4 2.4
2 0.2 Premium E SI1 59.8 61 326 3.9 3.8 2.3
3 0.2 Good E VS1 56.9 65 327 4 4.1 2.3
4 0.3 Premium I VS2 62.4 58 334 4.2 4.2 2.6
5 0.3 Good J SI2 63.3 58 335 4.3 4.3 2.8
6 0.2 Very Good J VVS2 62.8 57 336 3.9 4 2.5
7 0.2 Very Good I VVS1 62.3 57 336 4 4 2.5
8 0.3 Very Good H SI1 61.9 55 337 4.1 4.1 2.5
9 0.2 Fair E VS2 65.1 61 337 3.9 3.8 2.5
10 0.2 Very Good H VS1 59.4 61 338 4 4 2.4
# ℹ 53,930 more rows
# Option 2: Use an "anonymous" or "lambda" function that isn't named/saved
%>%
diamonds mutate(across(.cols = c(carat, x, y, z), .fns = function(x) {round(x, digits = 1)} ))
# A tibble: 53,940 × 10
carat cut color clarity depth table price x y z
<dbl> <ord> <ord> <ord> <dbl> <dbl> <int> <dbl> <dbl> <dbl>
1 0.2 Ideal E SI2 61.5 55 326 4 4 2.4
2 0.2 Premium E SI1 59.8 61 326 3.9 3.8 2.3
3 0.2 Good E VS1 56.9 65 327 4 4.1 2.3
4 0.3 Premium I VS2 62.4 58 334 4.2 4.2 2.6
5 0.3 Good J SI2 63.3 58 335 4.3 4.3 2.8
6 0.2 Very Good J VVS2 62.8 57 336 3.9 4 2.5
7 0.2 Very Good I VVS1 62.3 57 336 4 4 2.5
8 0.3 Very Good H SI1 61.9 55 337 4.1 4.1 2.5
9 0.2 Fair E VS2 65.1 61 337 3.9 3.8 2.5
10 0.2 Very Good H VS1 59.4 61 338 4 4 2.4
# ℹ 53,930 more rows
# Anonymous functions can be written in a more concise way
# function(x){ length(unique(x)) } OR ~ length(unique(.x))
# function(x,y){ length(unique(x + y)) } OR ~ length(unique(.x + .y))
. . .
When we look at the documentation for across()
, we see that the .cols
argument specifies which variables we want to transform, and it has a <tidy-select>
tag. This means that the syntax we use for .cols
follows certain rules.
Let’s click this link to explore the possibilities for selecting variables.
- Read through the “Overview of selection features” section to get an overall sense of the many ways to select variables.
- Navigate back to the
across()
documentation and read through the Examples section at the bottom. Click the “Run examples” link to view the output for all of the examples.
. . .
What if we wanted to perform multiple transformations on each of many variables?
Within the different values of diamond cut
, let’s summarize the mean, median, and standard deviation of the numeric variables. When we look at the .fns
argument in the across()
documentation, we see that we can provide a list of functions:
%>%
diamonds group_by(cut) %>%
summarize(across(.cols = where(is.numeric), .fns = list(mean = mean, med = median, sd = sd)))
# A tibble: 5 × 22
cut carat_mean carat_med carat_sd depth_mean depth_med depth_sd table_mean
<ord> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 Fair 1.05 1 0.516 64.0 65 3.64 59.1
2 Good 0.849 0.82 0.454 62.4 63.4 2.17 58.7
3 Very G… 0.806 0.71 0.459 61.8 62.1 1.38 58.0
4 Premium 0.892 0.86 0.515 61.3 61.4 1.16 58.7
5 Ideal 0.703 0.54 0.433 61.7 61.8 0.719 56.0
# ℹ 14 more variables: table_med <dbl>, table_sd <dbl>, price_mean <dbl>,
# price_med <dbl>, price_sd <dbl>, x_mean <dbl>, x_med <dbl>, x_sd <dbl>,
# y_mean <dbl>, y_med <dbl>, y_sd <dbl>, z_mean <dbl>, z_med <dbl>,
# z_sd <dbl>
What does the list of functions look like? What is the structure of this list object?
<- list(mean = mean, med = median, sd = sd)
list_of_fcts list_of_fcts
$mean
function (x, ...)
UseMethod("mean")
<bytecode: 0x1208308d0>
<environment: namespace:base>
$med
function (x, na.rm = FALSE, ...)
UseMethod("median")
<bytecode: 0x107080458>
<environment: namespace:stats>
$sd
function (x, na.rm = FALSE)
sqrt(var(if (is.vector(x) || is.factor(x)) x else as.double(x),
na.rm = na.rm))
<bytecode: 0x1302684e8>
<environment: namespace:stats>
str(list_of_fcts)
List of 3
$ mean:function (x, ...)
$ med :function (x, na.rm = FALSE, ...)
$ sd :function (x, na.rm = FALSE)
Exercises
Using the diamonds
dataset:
- Transform the
x
,y
, andz
columns so that the units of millimeters are displayed (e.g., “4.0 mm”).
- Convert all numeric columns into character columns.
- Hint: type
is.
and hit Tab in the Console. Scroll through the function options. Do the same withas.
- Hint: type
Case study: performing many different versions of an analysis
One of my most common use cases for writing functions and iteration/looping is to perform some exploration or modeling repeatedly for several different “settings”.
For example, our broad goal might be to fit a linear regression model to our data. However, there are often multiple choices that we have to make in practice:
- Keep missing values or fill them in (imputation)?
- Fit the model only on a certain group of cases?
- Filter out outliers in one or more variables?
- Transform certain variables? (e.g., log transformation)
. . .
We can map any number of these choices to arguments in a custom model-fitting function:
impute
: TRUE or FALSEfilter_to
: This could be a string description like “All cases”, “Group 1”, or “Groups 1 and 2”
<- function(data, impute, filter_to) {
fit_model if (impute) {
<- some_imputation_function(data)
data
}
if (filter_to=="Group 1") {
<- data %>%
data filter(group == 1)
else if (filter_to == "Groups 1 and 2") {
} <- data %>%
data filter(group %in% c(1,2))
}
lm(y ~ x1 + x2, data = data)
}
The tidyr
package has a useful function called crossing()
that is useful for generating argument combinations.
For each argument, we specify all possible values for that argument. crossing()
generates all combinations:
<- crossing(
df_arg_combos impute = c(TRUE, FALSE),
filter_to = c("All cases", "Group 1", "Groups 1 and 2")
) df_arg_combos
# A tibble: 6 × 2
impute filter_to
<lgl> <chr>
1 FALSE All cases
2 FALSE Group 1
3 FALSE Groups 1 and 2
4 TRUE All cases
5 TRUE Group 1
6 TRUE Groups 1 and 2
# Another example
crossing(
option1 = 1:3,
option2 = c(TRUE, FALSE),
option3 = c("low", "medium", "high")
)
# A tibble: 18 × 3
option1 option2 option3
<int> <lgl> <chr>
1 1 FALSE high
2 1 FALSE low
3 1 FALSE medium
4 1 TRUE high
5 1 TRUE low
6 1 TRUE medium
7 2 FALSE high
8 2 FALSE low
9 2 FALSE medium
10 2 TRUE high
11 2 TRUE low
12 2 TRUE medium
13 3 FALSE high
14 3 FALSE low
15 3 FALSE medium
16 3 TRUE high
17 3 TRUE low
18 3 TRUE medium
With iteration in our toobox, we can iterate the fit_model()
function over the combinations in df_arg_combos
.
Exercises
Goal: In the diamonds
dataset, we want to understand the relationship between price
and size (carat
). We want to explore variation along two choices:
- The variables included in the model. We’ll explore 3 sets of variables:
- No further variables (just
price
andcarat
) - Adjusting for
cut
- Adjusting for
cut
andclarity
- Adjusting for
cut
,clarity
, andcolor
- No further variables (just
- Whether or not to remove outliers in the
carat
variable. We’ll define outliers as cases whosecarat
is over 3 SDs away from the mean.
Work with your partner on the following exercises. As you work, make note of what is challenging and any helpful thought processes/strategies that arise from the collaboration.
- Use
crossing()
to create the data frame of argument combinations for our analyses. Call itdf_arg_combos
. Note that you can create a list of formula objects in R withc(y ~ x1, y ~ x1 + x2)
. (Something like this will be the right hand side of an argument tocrossing()
.)
- Write a function called
remove_outliers
that removes outliers in a dataset. The user should be able to supply the dataset (data
), the variable to remove outliers in (what_var
), and a threshold on the number of SDs away from the mean used to define outliers (sd_thresh
). Write your function so that it runs as follows:remove_outliers(diamonds, what_var = carat, sd_thresh = 3)
.
- Write a function called
fit_model
that implements the different settings for ourdiamonds
analysis.
<- function(data, mod_formula, remove_outliers) {
fit_model # remove_outliers is a TRUE/FALSE flag of whether or not to remove outliers
# This function implements our specific use case: outliers are cases that are 3 SDs away from the mean for the carat variable
# Use mod_formula as the first argument of lm()
}
- Write a
for
loop that stores the fitted linear models from all versions of the analysis.
Recall that you can pull out the contents of a single data frame column in many ways. For a data frame df
with a variable named x
:
df$x
df %>% pull(x)
df[["x"]]
Note that in df_arg_combos
:
mod_formula
: this column is a list and you can extract the 1st element with[[1]]
remove_outliers
: this column is a vector and you can extract the 1st element with[1]
Solutions
for
loops
Solutions
for (i in seq_len(10)) {
print(2*i)
}
[1] 2
[1] 4
[1] 6
[1] 8
[1] 10
[1] 12
[1] 14
[1] 16
[1] 18
[1] 20
seq(from = 2, to = 20, by = 2)
[1] 2 4 6 8 10 12 14 16 18 20
<- vector("character", length = length(month.name))
month_strings
for (i in seq_along(month.name)) {
<- str_c("Month ", i, ": ", month.name[i])
month_strings[i]
} month_strings
[1] "Month 1: January" "Month 2: February" "Month 3: March"
[4] "Month 4: April" "Month 5: May" "Month 6: June"
[7] "Month 7: July" "Month 8: August" "Month 9: September"
[10] "Month 10: October" "Month 11: November" "Month 12: December"
str_c("Month ", 1:12, ": ", month.name)
[1] "Month 1: January" "Month 2: February" "Month 3: March"
[4] "Month 4: April" "Month 5: May" "Month 6: June"
[7] "Month 7: July" "Month 8: August" "Month 9: September"
[10] "Month 10: October" "Month 11: November" "Month 12: December"
data(diamonds)
# Fit models of price vs. carat separately for each value of cut
<- diamonds %>% pull(cut) %>% levels()
unique_cuts <- vector(mode = "list", length = length(unique_cuts))
lin_mod_results
for (i in seq_along(unique_cuts)) {
<- unique_cuts[i]
this_cut <- diamonds %>%
diamonds_sub filter(cut==this_cut)
# The double square brackets [[i]] accesses the ith element of a list
<- lm(price ~ carat, data = diamonds_sub)
lin_mod_results[[i]] }
across
iteration
Solutions
<- function(x) {
add_mm str_c(x, " mm")
}
%>%
diamonds mutate(across(.cols = c(x, y, z), .fns = add_mm))
# A tibble: 53,940 × 10
carat cut color clarity depth table price x y z
<dbl> <ord> <ord> <ord> <dbl> <dbl> <int> <chr> <chr> <chr>
1 0.23 Ideal E SI2 61.5 55 326 3.95 mm 3.98 mm 2.43 mm
2 0.21 Premium E SI1 59.8 61 326 3.89 mm 3.84 mm 2.31 mm
3 0.23 Good E VS1 56.9 65 327 4.05 mm 4.07 mm 2.31 mm
4 0.29 Premium I VS2 62.4 58 334 4.2 mm 4.23 mm 2.63 mm
5 0.31 Good J SI2 63.3 58 335 4.34 mm 4.35 mm 2.75 mm
6 0.24 Very Good J VVS2 62.8 57 336 3.94 mm 3.96 mm 2.48 mm
7 0.24 Very Good I VVS1 62.3 57 336 3.95 mm 3.98 mm 2.47 mm
8 0.26 Very Good H SI1 61.9 55 337 4.07 mm 4.11 mm 2.53 mm
9 0.22 Fair E VS2 65.1 61 337 3.87 mm 3.78 mm 2.49 mm
10 0.23 Very Good H VS1 59.4 61 338 4 mm 4.05 mm 2.39 mm
# ℹ 53,930 more rows
%>%
diamonds mutate(across(.cols = where(is.numeric), .fns = as.character))
# A tibble: 53,940 × 10
carat cut color clarity depth table price x y z
<chr> <ord> <ord> <ord> <chr> <chr> <chr> <chr> <chr> <chr>
1 0.23 Ideal E SI2 61.5 55 326 3.95 3.98 2.43
2 0.21 Premium E SI1 59.8 61 326 3.89 3.84 2.31
3 0.23 Good E VS1 56.9 65 327 4.05 4.07 2.31
4 0.29 Premium I VS2 62.4 58 334 4.2 4.23 2.63
5 0.31 Good J SI2 63.3 58 335 4.34 4.35 2.75
6 0.24 Very Good J VVS2 62.8 57 336 3.94 3.96 2.48
7 0.24 Very Good I VVS1 62.3 57 336 3.95 3.98 2.47
8 0.26 Very Good H SI1 61.9 55 337 4.07 4.11 2.53
9 0.22 Fair E VS2 65.1 61 337 3.87 3.78 2.49
10 0.23 Very Good H VS1 59.4 61 338 4 4.05 2.39
# ℹ 53,930 more rows
Case Study
- .
Solution
<- crossing(
df_arg_combos mod_formula = c(price ~ carat, price ~ carat + cut, price ~ carat + cut + clarity, price ~ carat + cut + clarity + color),
remove_outliers = c(TRUE, FALSE)
) df_arg_combos
# A tibble: 8 × 2
mod_formula remove_outliers
<list> <lgl>
1 <formula> FALSE
2 <formula> TRUE
3 <formula> FALSE
4 <formula> TRUE
5 <formula> FALSE
6 <formula> TRUE
7 <formula> FALSE
8 <formula> TRUE
- .
Solution
<- function(data, what_var, sd_thresh) {
remove_outliers %>%
data mutate(zscore = ({{ what_var }} - mean({{ what_var }}, na.rm = TRUE))/sd({{ what_var }}, na.rm = TRUE)) %>%
filter(zscore <= sd_thresh)
}
- .
Solution
<- function(data, mod_formula, remove_outliers) {
fit_model if (remove_outliers) {
<- remove_outliers(data, what_var = carat, sd_thresh = 3)
data_clean else {
} <- data
data_clean
}
lm(mod_formula, data = data_clean)
}
- .
Solution
<- vector(mode = "list", length = nrow(df_arg_combos))
lin_mod_res_for
for (i in seq_along(lin_mod_res_for)) {
<- df_arg_combos$mod_formula[[i]] # Double [[ for the **list** of formulas
this_formula <- df_arg_combos$remove_outliers[i] # Single [ for the **atomic vector** of logicals
this_remove_outliers <- fit_model(
lin_mod_res_for[[i]] data = diamonds,
mod_formula = this_formula,
remove_outliers = this_remove_outliers
) }
After Class
- Take a look at the Schedule page to see how to prepare for the next class
- Work on Homework 6, which will be posted later today.