library(data.table)
library(dplyr)
= function() {
refresh_data <<- data.table(
DT V1 = rep(1:2, 5)[-10],
V2 = 1:9,
V3 = c(0.5, 1.0, 1.5),
V4 = rep(LETTERS[1:3], 3)
)
<<- data.frame(
DF V1 = rep(1:2, 5)[-10],
V2 = 1:9,
V3 = c(0.5, 1.0, 1.5),
V4 = rep(LETTERS[1:3], 3)
)
<<- tibble(
TB V1 = rep(1:2, 5)[-10],
V2 = 1:9,
V3 = rep(c(0.5, 1.0, 1.5), 3),
V4 = rep(LETTERS[1:3], 3)
)
}
refresh_data()
This page presents a side-by-side comparison of common data manipulation operations in R in three idioms: data.table
, base
, and dplyr
. This allows you to compare syntax and understand how to accomplish tasks across these popular frameworks.
This reference guide covers everything from basic filtering and sorting to advanced operations like joins and reshaping data. Many of these examples were originally crafted by Atrebas. They were then reorganized and augmented with base
examples by a team of contributors.
To begin, we create example data. The base R
data frame is called DF
, the data.table
table is called DT
, and the dplyr
tibble is called TB
. Data creation is wrapped in a refresh_data()
function, which is called periodically throughout the document to ensure that the data is reset after modifications.
When using the let()
and set*()
functions or :=
operator modifies a data.table
“in place,” which means that it does not copy the object at all. This is more efficient than re-assigning the entire data set. However, when modified in place, the data table will not be printed immediately to the console after modification. You must call the object again to see the changes.
Filter
Integer index
Positive indices keep the specified rows.
3:4,] DT[
V1 V2 V3 V4
<int> <int> <num> <char>
1: 1 3 1.5 C
2: 2 4 0.5 A
3:4,] DF[
V1 V2 V3 V4
3 1 3 1.5 C
4 2 4 0.5 A
|> slice(3:4) TB
# A tibble: 2 × 4
V1 V2 V3 V4
<int> <int> <dbl> <chr>
1 1 3 1.5 C
2 2 4 0.5 A
Negative indices exclude the specified rows.
-(3:7),] DT[
V1 V2 V3 V4
<int> <int> <num> <char>
1: 1 1 0.5 A
2: 2 2 1.0 B
3: 2 8 1.0 B
4: 1 9 1.5 C
-(3:7),] DF[
V1 V2 V3 V4
1 1 1 0.5 A
2 2 2 1.0 B
8 2 8 1.0 B
9 1 9 1.5 C
|> slice(-(3:7)) TB
# A tibble: 4 × 4
V1 V2 V3 V4
<int> <int> <dbl> <chr>
1 1 1 0.5 A
2 2 2 1 B
3 2 8 1 B
4 1 9 1.5 C
Logical index
Filter rows using a logical vector. Keep the rows where the condition is TRUE. %chin%
is a fast version of %in%
, optimized for strings. %like%
is a convenient operator for regular expression matches.
> 5] DT[V2
V1 V2 V3 V4
<int> <int> <num> <char>
1: 2 6 1.5 C
2: 1 7 0.5 A
3: 2 8 1.0 B
4: 1 9 1.5 C
%chin% c("A", "C")] # faster than %in% for strings DT[V4
V1 V2 V3 V4
<int> <int> <num> <char>
1: 1 1 0.5 A
2: 1 3 1.5 C
3: 2 4 0.5 A
4: 2 6 1.5 C
5: 1 7 0.5 A
6: 1 9 1.5 C
%like% c("A|C")] # regular expressions DT[V4
V1 V2 V3 V4
<int> <int> <num> <char>
1: 1 1 0.5 A
2: 1 3 1.5 C
3: 2 4 0.5 A
4: 2 6 1.5 C
5: 1 7 0.5 A
6: 1 9 1.5 C
subset(DF, V2 > 5)
V1 V2 V3 V4
6 2 6 1.5 C
7 1 7 0.5 A
8 2 8 1.0 B
9 1 9 1.5 C
subset(DF, V4 %in% c("A", "C"))
V1 V2 V3 V4
1 1 1 0.5 A
3 1 3 1.5 C
4 2 4 0.5 A
6 2 6 1.5 C
7 1 7 0.5 A
9 1 9 1.5 C
subset(DF, grepl("A|C", V4))
V1 V2 V3 V4
1 1 1 0.5 A
3 1 3 1.5 C
4 2 4 0.5 A
6 2 6 1.5 C
7 1 7 0.5 A
9 1 9 1.5 C
|> filter(V2 > 5) TB
# A tibble: 4 × 4
V1 V2 V3 V4
<int> <int> <dbl> <chr>
1 2 6 1.5 C
2 1 7 0.5 A
3 2 8 1 B
4 1 9 1.5 C
|> filter(V4 %in% c("A", "C")) TB
# A tibble: 6 × 4
V1 V2 V3 V4
<int> <int> <dbl> <chr>
1 1 1 0.5 A
2 1 3 1.5 C
3 2 4 0.5 A
4 2 6 1.5 C
5 1 7 0.5 A
6 1 9 1.5 C
|> filter(grepl("A|C", V4)) TB
# A tibble: 6 × 4
V1 V2 V3 V4
<int> <int> <dbl> <chr>
1 1 1 0.5 A
2 1 3 1.5 C
3 2 4 0.5 A
4 2 6 1.5 C
5 1 7 0.5 A
6 1 9 1.5 C
Filter rows based on multiple conditions.
== 1 & V4 == "A"] DT[V1
V1 V2 V3 V4
<int> <int> <num> <char>
1: 1 1 0.5 A
2: 1 7 0.5 A
subset(DF, V1 == 1 & V4 == "A")
V1 V2 V3 V4
1 1 1 0.5 A
7 1 7 0.5 A
|> filter(V1 == 1, V4 == "A") TB
# A tibble: 2 × 4
V1 V2 V3 V4
<int> <int> <dbl> <chr>
1 1 1 0.5 A
2 1 7 0.5 A
Unique
unique(DT)
Indices: <V4>, <V4__V1>
V1 V2 V3 V4
<int> <int> <num> <char>
1: 1 1 0.5 A
2: 2 2 1.0 B
3: 1 3 1.5 C
4: 2 4 0.5 A
5: 1 5 1.0 B
6: 2 6 1.5 C
7: 1 7 0.5 A
8: 2 8 1.0 B
9: 1 9 1.5 C
unique(DT, by = c("V1", "V4"))
V1 V2 V3 V4
<int> <int> <num> <char>
1: 1 1 0.5 A
2: 2 2 1.0 B
3: 1 3 1.5 C
4: 2 4 0.5 A
5: 1 5 1.0 B
6: 2 6 1.5 C
!duplicated(DF), ] DF[
V1 V2 V3 V4
1 1 1 0.5 A
2 2 2 1.0 B
3 1 3 1.5 C
4 2 4 0.5 A
5 1 5 1.0 B
6 2 6 1.5 C
7 1 7 0.5 A
8 2 8 1.0 B
9 1 9 1.5 C
!duplicated(DF[c("V1", "V4")]), ] DF[
V1 V2 V3 V4
1 1 1 0.5 A
2 2 2 1.0 B
3 1 3 1.5 C
4 2 4 0.5 A
5 1 5 1.0 B
6 2 6 1.5 C
|> distinct() TB
# A tibble: 9 × 4
V1 V2 V3 V4
<int> <int> <dbl> <chr>
1 1 1 0.5 A
2 2 2 1 B
3 1 3 1.5 C
4 2 4 0.5 A
5 1 5 1 B
6 2 6 1.5 C
7 1 7 0.5 A
8 2 8 1 B
9 1 9 1.5 C
|> distinct(V1, V4, .keep_all = TRUE) TB
# A tibble: 6 × 4
V1 V2 V3 V4
<int> <int> <dbl> <chr>
1 1 1 0.5 A
2 2 2 1 B
3 1 3 1.5 C
4 2 4 0.5 A
5 1 5 1 B
6 2 6 1.5 C
Missing values
Drop rows with missing values in specified columns.
na.omit(DT, cols = 1:4)
Indices: <V4>, <V4__V1>
V1 V2 V3 V4
<int> <int> <num> <char>
1: 1 1 0.5 A
2: 2 2 1.0 B
3: 1 3 1.5 C
4: 2 4 0.5 A
5: 1 5 1.0 B
6: 2 6 1.5 C
7: 1 7 0.5 A
8: 2 8 1.0 B
9: 1 9 1.5 C
complete.cases(DF[, 1:4]), ] DF[
V1 V2 V3 V4
1 1 1 0.5 A
2 2 2 1.0 B
3 1 3 1.5 C
4 2 4 0.5 A
5 1 5 1.0 B
6 2 6 1.5 C
7 1 7 0.5 A
8 2 8 1.0 B
9 1 9 1.5 C
|> tidyr::drop_na(1:4) TB
# A tibble: 9 × 4
V1 V2 V3 V4
<int> <int> <dbl> <chr>
1 1 1 0.5 A
2 2 2 1 B
3 1 3 1.5 C
4 2 4 0.5 A
5 1 5 1 B
6 2 6 1.5 C
7 1 7 0.5 A
8 2 8 1 B
9 1 9 1.5 C
Random sample
Draw a random sample of rows.
sample(.N, 3)] DT[
V1 V2 V3 V4
<int> <int> <num> <char>
1: 2 6 1.5 C
2: 2 8 1.0 B
3: 2 4 0.5 A
sample(.N, .N / 2)] DT[
V1 V2 V3 V4
<int> <int> <num> <char>
1: 2 6 1.5 C
2: 2 8 1.0 B
3: 1 7 0.5 A
4: 1 5 1.0 B
sample(nrow(DF), 3), ] DF[
V1 V2 V3 V4
3 1 3 1.5 C
5 1 5 1.0 B
4 2 4 0.5 A
sample(nrow(DF), nrow(DF) / 2), ] DF[
V1 V2 V3 V4
2 2 2 1.0 B
9 1 9 1.5 C
1 1 1 0.5 A
3 1 3 1.5 C
|> slice_sample(n = 3) TB
# A tibble: 3 × 4
V1 V2 V3 V4
<int> <int> <dbl> <chr>
1 1 5 1 B
2 1 7 0.5 A
3 1 3 1.5 C
|> slice_sample(prop = 0.5) TB
# A tibble: 4 × 4
V1 V2 V3 V4
<int> <int> <dbl> <chr>
1 1 1 0.5 A
2 1 9 1.5 C
3 2 6 1.5 C
4 1 3 1.5 C
Other
%between% c(3, 5)]
DT[V2 ::between(V2, 3, 5, incbounds = FALSE)]
DT[data.table%inrange% list(-1:1, 1:3)] DT[V2
subset(DF, grepl("^B", V4))
subset(DF, V2 >= 3 & V2 <= 5)
subset(DF, V2 > 3 & V2 < 5)
subset(DF, V2 %in% c(-1:1, 1:3))
|> filter(grepl("^B", V4))
TB |> filter(dplyr::between(V2, 3, 5))
TB |> filter(V2 > 3 & V2 < 5)
TB |> filter(V2 >= -1:1 & V2 <= 1:3) TB
Sort
Rows
Sort rows in ascending order.
order(V3)] DT[
V1 V2 V3 V4
<int> <int> <num> <char>
1: 1 1 0.5 A
2: 2 4 0.5 A
3: 1 7 0.5 A
4: 2 2 1.0 B
5: 1 5 1.0 B
6: 2 8 1.0 B
7: 1 3 1.5 C
8: 2 6 1.5 C
9: 1 9 1.5 C
sort_by(DF, ~V3)
V1 V2 V3 V4
1 1 1 0.5 A
4 2 4 0.5 A
7 1 7 0.5 A
2 2 2 1.0 B
5 1 5 1.0 B
8 2 8 1.0 B
3 1 3 1.5 C
6 2 6 1.5 C
9 1 9 1.5 C
|> arrange(V3) TB
# A tibble: 9 × 4
V1 V2 V3 V4
<int> <int> <dbl> <chr>
1 1 1 0.5 A
2 2 4 0.5 A
3 1 7 0.5 A
4 2 2 1 B
5 1 5 1 B
6 2 8 1 B
7 1 3 1.5 C
8 2 6 1.5 C
9 1 9 1.5 C
Sort rows in decreasing order.
order(-V3)] DT[
V1 V2 V3 V4
<int> <int> <num> <char>
1: 1 3 1.5 C
2: 2 6 1.5 C
3: 1 9 1.5 C
4: 2 2 1.0 B
5: 1 5 1.0 B
6: 2 8 1.0 B
7: 1 1 0.5 A
8: 2 4 0.5 A
9: 1 7 0.5 A
sort_by(DF, ~list(-V3))
V1 V2 V3 V4
3 1 3 1.5 C
6 2 6 1.5 C
9 1 9 1.5 C
2 2 2 1.0 B
5 1 5 1.0 B
8 2 8 1.0 B
1 1 1 0.5 A
4 2 4 0.5 A
7 1 7 0.5 A
|> arrange(desc(V3)) TB
# A tibble: 9 × 4
V1 V2 V3 V4
<int> <int> <dbl> <chr>
1 1 3 1.5 C
2 2 6 1.5 C
3 1 9 1.5 C
4 2 2 1 B
5 1 5 1 B
6 2 8 1 B
7 1 1 0.5 A
8 2 4 0.5 A
9 1 7 0.5 A
Sort rows by multiple columns.
order(V1, -V2)] DT[
V1 V2 V3 V4
<int> <int> <num> <char>
1: 1 9 1.5 C
2: 1 7 0.5 A
3: 1 5 1.0 B
4: 1 3 1.5 C
5: 1 1 0.5 A
6: 2 8 1.0 B
7: 2 6 1.5 C
8: 2 4 0.5 A
9: 2 2 1.0 B
sort_by(DF, ~list(V1, -V2))
V1 V2 V3 V4
9 1 9 1.5 C
7 1 7 0.5 A
5 1 5 1.0 B
3 1 3 1.5 C
1 1 1 0.5 A
8 2 8 1.0 B
6 2 6 1.5 C
4 2 4 0.5 A
2 2 2 1.0 B
|> arrange(V1, desc(V2)) TB
# A tibble: 9 × 4
V1 V2 V3 V4
<int> <int> <dbl> <chr>
1 1 9 1.5 C
2 1 7 0.5 A
3 1 5 1 B
4 1 3 1.5 C
5 1 1 0.5 A
6 2 8 1 B
7 2 6 1.5 C
8 2 4 0.5 A
9 2 2 1 B
Sort dataset and the column passed in argument becomes key. Output is an object the same type as the object indexed. This code is not executed to avoid sorting the original data set in place.
setorder(DT, V4, -V1)
setorderv(DT, c("V4", "V1"), c(1, -1))
= DF[order(DF$V4, -DF$V1), ] DF
= TB |> arrange(V4, desc(V1)) TB
refresh_data()
Columns
Reorders the columns of a dataset. This code is not executed to avoid reordering columns in the original data set.
setcolorder(DT, c("V4", "V1", "V2"))
DT
V4 V1 V2 V3
<char> <int> <int> <num>
1: A 1 1 0.5
2: B 2 2 1.0
3: C 1 3 1.5
4: A 2 4 0.5
5: B 1 5 1.0
6: C 2 6 1.5
7: A 1 7 0.5
8: B 2 8 1.0
9: C 1 9 1.5
= DF[, c("V4", "V1", "V2")]
DF DF
V4 V1 V2
1 A 1 1
2 B 2 2
3 C 1 3
4 A 2 4
5 B 1 5
6 C 2 6
7 A 1 7
8 B 2 8
9 C 1 9
= TB |> select(V4, V1, V2)
TB TB
# A tibble: 9 × 3
V4 V1 V2
<chr> <int> <int>
1 A 1 1
2 B 2 2
3 C 1 3
4 A 2 4
5 B 1 5
6 C 2 6
7 A 1 7
8 B 2 8
9 C 1 9
refresh_data()
Select
Keep
Extract one column as a vector.
# DT[["V3"]]
# DT[, V3]
3]] DT[[
[1] 0.5 1.0 1.5 0.5 1.0 1.5 0.5 1.0 1.5
# DF[["V3"]]
# DF[, 3, drop = TRUE]
3]] DF[[
[1] 0.5 1.0 1.5 0.5 1.0 1.5 0.5 1.0 1.5
# TB[["V3"]]
# TB |> pull(V3)
3]] TB[[
[1] 0.5 1.0 1.5 0.5 1.0 1.5 0.5 1.0 1.5
Extract one column as a data frame.
# DT[, .SD, .SDcols = "V3"]
"V3"] DT[,
V3
<num>
1: 0.5
2: 1.0
3: 1.5
4: 0.5
5: 1.0
6: 1.5
7: 0.5
8: 1.0
9: 1.5
# DF[, "V3", drop = FALSE]
3, drop = FALSE] DF[,
V3
1 0.5
2 1.0
3 1.5
4 0.5
5 1.0
6 1.5
7 0.5
8 1.0
9 1.5
# TB |> select(V3)
"V3"] TB[,
# A tibble: 9 × 1
V3
<dbl>
1 0.5
2 1
3 1.5
4 0.5
5 1
6 1.5
7 0.5
8 1
9 1.5
Select several columns by column names.
DT[, .(V2, V3, V4)]:V4]
DT[, V2= V2:V4]
DT[, .SD, .SDcols = c("V2", "V3", "V4")]
DT[, .SD, .SDcols = c("V2", "V3")
cols DT[, ..cols]
c("V2", "V3", "V4")]
DF[, subset(DF, select = c("V2", "V3", "V4"))
= c("V2", "V3")
cols
DF[, cols]names(DF) %in% cols] DF[ ,
|> select(V2, V3, V4)
TB |> select(V2:V4)
TB |> select(any_of(c("V2", "V3", "V4")))
TB = c("V2", "V3")
cols |> select(!!cols) DF
Drop
Exclude several columns by column name.
# DT[, .SD, .SDcols = !c("V2", "V3")]
!c("V2", "V3")] DT[,
V1 V4
<int> <char>
1: 1 A
2: 2 B
3: 1 C
4: 2 A
5: 1 B
6: 2 C
7: 1 A
8: 2 B
9: 1 C
!(names(DF) %in% c("V2", "V3"))] DF[ ,
V1 V4
1 1 A
2 2 B
3 1 C
4 2 A
5 1 B
6 2 C
7 1 A
8 2 B
9 1 C
|> select(-V2, -V3) TB
# A tibble: 9 × 2
V1 V4
<int> <chr>
1 1 A
2 2 B
3 1 C
4 2 A
5 1 B
6 2 C
7 1 A
8 2 B
9 1 C
Remove a column from the data set. Using let()
is efficient because it modifies the data set in place. This code is not executed because the V5
column is not present in the data set.
let(V1 = NULL)]
DT[, DT
V2 V3 V4
<int> <num> <char>
1: 1 0.5 A
2: 2 1.0 B
3: 3 1.5 C
4: 4 0.5 A
5: 5 1.0 B
6: 6 1.5 C
7: 7 0.5 A
8: 8 1.0 B
9: 9 1.5 C
= DF[, !names(DF) %in% "V1"]
DF DF
V2 V3 V4
1 1 0.5 A
2 2 1.0 B
3 3 1.5 C
4 4 0.5 A
5 5 1.0 B
6 6 1.5 C
7 7 0.5 A
8 8 1.0 B
9 9 1.5 C
= TB |> select(-V1)
TB TB
# A tibble: 9 × 3
V2 V3 V4
<int> <dbl> <chr>
1 1 0.5 A
2 2 1 B
3 3 1.5 C
4 4 0.5 A
5 5 1 B
6 6 1.5 C
7 7 0.5 A
8 8 1 B
9 9 1.5 C
Remove several columns from the data set. Using :=
is efficient because it modifies the data set in place. This code is not executed because the V6
and V7
columns are not present in the data set.
= c("V2", "V3")
cols := NULL]
DT[, (cols) DT
V4
<char>
1: A
2: B
3: C
4: A
5: B
6: C
7: A
8: B
9: C
= DF[, !(names(DF) %in% c("V2", "V3"))]
DF DF
[1] "A" "B" "C" "A" "B" "C" "A" "B" "C"
= TB |> select(-V2, -V3)
TB TB
# A tibble: 9 × 1
V4
<chr>
1 A
2 B
3 C
4 A
5 B
6 C
7 A
8 B
9 C
refresh_data()
Rename
Select and rename.
X1 = V1, X2 = V2)] DT[, .(
X1 X2
<int> <int>
1: 1 1
2: 2 2
3: 1 3
4: 2 4
5: 1 5
6: 2 6
7: 1 7
8: 2 8
9: 1 9
setNames(
c("V1", "V2")],
DF[, c("X1", "X2"))
X1 X2
1 1 1
2 2 2
3 1 3
4 2 4
5 1 5
6 2 6
7 1 7
8 2 8
9 1 9
|> select(X1 = V1, X2 = V2) TB
# A tibble: 9 × 2
X1 X2
<int> <int>
1 1 1
2 2 2
3 1 3
4 2 4
5 1 5
6 2 6
7 1 7
8 2 8
9 1 9
Using the data.table::setnames()
to rename columns is efficient because it renames column in place. This code is not executed to avoid renaming columns in the original data set.
setnames(DT, old = c("V1", "V2"), new = c("X1", "X2"))
DT
X1 X2 V3 V4
<int> <int> <num> <char>
1: 1 1 0.5 A
2: 2 2 1.0 B
3: 1 3 1.5 C
4: 2 4 0.5 A
5: 1 5 1.0 B
6: 2 6 1.5 C
7: 1 7 0.5 A
8: 2 8 1.0 B
9: 1 9 1.5 C
colnames(DF)[match(c("V1", "V2"), colnames(DF))] = c("X1", "X2")
DF
X1 X2 V3 V4
1 1 1 0.5 A
2 2 2 1.0 B
3 1 3 1.5 C
4 2 4 0.5 A
5 1 5 1.0 B
6 2 6 1.5 C
7 1 7 0.5 A
8 2 8 1.0 B
9 1 9 1.5 C
= TB |> rename(X1 = V1, X2 = V2)
TB TB
# A tibble: 9 × 4
X1 X2 V3 V4
<int> <int> <dbl> <chr>
1 1 1 0.5 A
2 2 2 1 B
3 1 3 1.5 C
4 2 4 0.5 A
5 1 5 1 B
6 2 6 1.5 C
7 1 7 0.5 A
8 2 8 1 B
9 1 9 1.5 C
refresh_data()
Advanced selections
Complex selections using regular expressions or dedicated functions.
= c("V1", "V2")]
DT[, .SD, .SDcols = patterns("^V[1-2]$")]
DT[, .SD, .SDcols = patterns("V")]
DT[, .SD, .SDcols = patterns("3$")]
DT[, .SD, .SDcols = patterns(".2")]
DT[, .SD, .SDcols = patterns("^V1$|^X$")]
DT[, .SD, .SDcols = patterns("^(?!V2)", perl = TRUE)] DT[, .SD, .SDcols
c("V1", "V2")]
DF[, grep("^V[1-2]$", names(DF))]
DF[ , c("V4", setdiff(names(DF), "V4"))]
DF[ , grep("V", names(DF))]
DF[ , grep("3$", names(DF))]
DF[ , grep(".2", names(DF))]
DF[ , c("V1", "X")]
DF[ , !grepl("^V2", names(DF))] DF[ ,
|> select(V1, V2)
TB |> select(num_range("V", 1:2))
TB |> select(contains("V"))
TB |> select(ends_with("3"))
TB |> select(matches(".2"))
TB |> select(one_of(c("V1", "X")))
TB |> select(-starts_with("V2")) TB
Summarize
Single columns
Create a new data frame with a single row and a single column, summarizing the information of one column. Named or unnamed results.
sum(V1)] DT[,
[1] 13
sumV1 = sum(V1))] DT[, .(
sumV1
<int>
1: 13
sum(DF$V1)
[1] 13
data.frame(sumV1 = sum(DF$V1))
sumV1
1 13
|> pull(V1) |> sum() TB
[1] 13
|> summarise(sumV1 = sum(V1)) TB
# A tibble: 1 × 1
sumV1
<int>
1 13
Create a new data frame with a single row and two columns, summarizing the information of two manually specified columns.
sumV1 = sum(V1), sdV3 = sd(V3))] DT[, .(
sumV1 sdV3
<int> <num>
1: 13 0.4330127
data.frame(sumV1 = sum(DF$V1), sdV3 = sd(DF$V3))
sumV1 sdV3
1 13 0.4330127
|> summarise(sumV1 = sum(V1), sdV3 = sd(V3)) TB
# A tibble: 1 × 2
sumV1 sdV3
<int> <dbl>
1 13 0.433
Multiple columns
Apply a function to each column.
lapply(.SD, head, 1)] DT[,
V1 V2 V3 V4
<int> <int> <num> <char>
1: 1 1 0.5 A
data.frame(lapply(DF, head, 1))
V1 V2 V3 V4
1 1 1 0.5 A
|> summarize(across(everything(), \(x) head(x, 1))) TB
# A tibble: 1 × 4
V1 V2 V3 V4
<int> <int> <dbl> <chr>
1 1 1 0.5 A
Apply a function to each column that matches a specific type.
lapply(.SD, mean), .SDcols = is.numeric] DT[,
V1 V2 V3
<num> <num> <num>
1: 1.444444 5 1
data.frame(lapply(DF[sapply(DF, is.numeric)], mean))
V1 V2 V3
1 1.444444 5 1
|> summarize(across(where(is.numeric), mean)) TB
# A tibble: 1 × 3
V1 V2 V3
<dbl> <dbl> <dbl>
1 1.44 5 1
By group
Count the number of observation by group.
= V4] DT[, .N, by
V4 N
<char> <int>
1: A 3
2: B 3
3: C 3
as.data.frame(table(DF$V4))
Var1 Freq
1 A 3
2 B 3
3 C 3
|>
TB group_by(V4) |>
tally()
# A tibble: 3 × 2
V4 n
<chr> <int>
1 A 3
2 B 3
3 C 3
Multiple named summaries
nobs = .N, meanV1 = mean(V1)), by = V4] DT[, .(
V4 nobs meanV1
<char> <int> <num>
1: A 3 1.333333
2: B 3 1.666667
3: C 3 1.333333
do.call(rbind, by(DF, ~V4, \(x) {
data.frame(nobs = nrow(x), meanV1 = mean(x$V1))
} ))
nobs meanV1
A 3 1.333333
B 3 1.666667
C 3 1.333333
|>
TB group_by(V4) |>
summarize(nobs = n(), meanV1 = mean(V1)) |>
ungroup()
# A tibble: 3 × 3
V4 nobs meanV1
<chr> <int> <dbl>
1 A 3 1.33
2 B 3 1.67
3 C 3 1.33
Apply a function to the full data frame in each group. Here, we return the first row in each group using the head()
function.
head(.SD, 1), by = V4] DT[,
V4 V1 V2 V3
<char> <int> <int> <num>
1: A 1 1 0.5
2: B 2 2 1.0
3: C 1 3 1.5
do.call(rbind, by(DF, DF$V4, \(x) head(x, 1)))
V1 V2 V3 V4
A 1 1 0.5 A
B 2 2 1.0 B
C 1 3 1.5 C
|>
TB group_by(V4) |>
summarize(across(everything(), head, 1))
Warning: There was 1 warning in `summarize()`.
ℹ In argument: `across(everything(), head, 1)`.
ℹ In group 1: `V4 = "A"`.
Caused by warning:
! The `...` argument of `across()` is deprecated as of dplyr 1.1.0.
Supply arguments directly to `.fns` through an anonymous function instead.
# Previously
across(a:b, mean, na.rm = TRUE)
# Now
across(a:b, \(x) mean(x, na.rm = TRUE))
# A tibble: 3 × 4
V4 V1 V2 V3
<chr> <int> <int> <dbl>
1 A 1 1 0.5
2 B 2 2 1
3 C 1 3 1.5
Get the row number of first (and last) observation by group. In data.table
, we use the .I
operator, which reports the row number.
1], by = V4] DT[, .I[
V4 V1
<char> <int>
1: A 1
2: B 2
3: C 3
range(.I), by = V4] DT[,
V4 V1
<char> <int>
1: A 1
2: A 7
3: B 2
4: B 8
5: C 3
6: C 9
do.call(rbind, by(DF, ~V4, \(x) x[1,]))
V1 V2 V3 V4
A 1 1 0.5 A
B 2 2 1.0 B
C 1 3 1.5 C
do.call(rbind, by(DF, ~V4, \(x) x[c(1, nrow(x)),]))
V1 V2 V3 V4
A.1 1 1 0.5 A
A.7 1 7 0.5 A
B.2 2 2 1.0 B
B.8 2 8 1.0 B
C.3 1 3 1.5 C
C.9 1 9 1.5 C
|>
TB group_by(V4) |>
summarize(cur_group_rows()[1]) |>
ungroup()
# A tibble: 3 × 2
V4 `cur_group_rows()[1]`
<chr> <int>
1 A 1
2 B 2
3 C 3
|>
TB group_by(V4) |>
reframe(cur_group_rows()[c(1, n())]) |>
ungroup()
# A tibble: 6 × 2
V4 `cur_group_rows()[c(1, n())]`
<chr> <int>
1 A 1
2 A 7
3 B 2
4 B 8
5 C 3
6 C 9
List-columns are columns where each element is a vector, data frame, or other object.
= V4] DT[, .(.(V1)), by
V4 V1
<char> <list>
1: A 1,2,1
2: B 2,1,2
3: C 1,2,1
= V4] DT[, .(.(.SD)), by
V4 V1
<char> <list>
1: A <data.table[3x3]>
2: B <data.table[3x3]>
3: C <data.table[3x3]>
tapply(DF$V1, DF$V4,
function(x) list(x))
$A
[1] 1 2 1
$B
[1] 2 1 2
$C
[1] 1 2 1
split(DF, DF$V4)
$A
V1 V2 V3 V4
1 1 1 0.5 A
4 2 4 0.5 A
7 1 7 0.5 A
$B
V1 V2 V3 V4
2 2 2 1 B
5 1 5 1 B
8 2 8 1 B
$C
V1 V2 V3 V4
3 1 3 1.5 C
6 2 6 1.5 C
9 1 9 1.5 C
|>
TB group_by(V4) |>
summarise(list(V1))
# A tibble: 3 × 2
V4 `list(V1)`
<chr> <list>
1 A <int [3]>
2 B <int [3]>
3 C <int [3]>
|>
TB group_by(V4) |>
group_nest()
# A tibble: 3 × 2
V4 data
<chr> <list<tibble[,3]>>
1 A [3 × 3]
2 B [3 × 3]
3 C [3 × 3]
Modify
Single columns
Create or modify a column.
let(V5 = V1^2)]
DT[, DT
V1 V2 V3 V4 V5
<int> <int> <num> <char> <num>
1: 1 1 0.5 A 1
2: 2 2 1.0 B 4
3: 1 3 1.5 C 1
4: 2 4 0.5 A 4
5: 1 5 1.0 B 1
6: 2 6 1.5 C 4
7: 1 7 0.5 A 1
8: 2 8 1.0 B 4
9: 1 9 1.5 C 1
$V5 = DF$V1^2
DF DF
V1 V2 V3 V4 V5
1 1 1 0.5 A 1
2 2 2 1.0 B 4
3 1 3 1.5 C 1
4 2 4 0.5 A 4
5 1 5 1.0 B 1
6 2 6 1.5 C 4
7 1 7 0.5 A 1
8 2 8 1.0 B 4
9 1 9 1.5 C 1
= TB |> mutate(V5 = V1^2)
TB TB
# A tibble: 9 × 5
V1 V2 V3 V4 V5
<int> <int> <dbl> <chr> <dbl>
1 1 1 0.5 A 1
2 2 2 1 B 4
3 1 3 1.5 C 1
4 2 4 0.5 A 4
5 1 5 1 B 1
6 2 6 1.5 C 4
7 1 7 0.5 A 1
8 2 8 1 B 4
9 1 9 1.5 C 1
Create several new columns, each named explicitly.
let(
DT[, V5 = sqrt(V1),
V6 = "X")]
DT
V1 V2 V3 V4 V5 V6
<int> <int> <num> <char> <num> <char>
1: 1 1 0.5 A 1.000000 X
2: 2 2 1.0 B 1.414214 X
3: 1 3 1.5 C 1.000000 X
4: 2 4 0.5 A 1.414214 X
5: 1 5 1.0 B 1.000000 X
6: 2 6 1.5 C 1.414214 X
7: 1 7 0.5 A 1.000000 X
8: 2 8 1.0 B 1.414214 X
9: 1 9 1.5 C 1.000000 X
$V5 = sqrt(DF$V1)
DF$V6 = "X"
DF DF
V1 V2 V3 V4 V5 V6
1 1 1 0.5 A 1.000000 X
2 2 2 1.0 B 1.414214 X
3 1 3 1.5 C 1.000000 X
4 2 4 0.5 A 1.414214 X
5 1 5 1.0 B 1.000000 X
6 2 6 1.5 C 1.414214 X
7 1 7 0.5 A 1.000000 X
8 2 8 1.0 B 1.414214 X
9 1 9 1.5 C 1.000000 X
= TB |> mutate(
TB V5 = sqrt(V1),
V6 = "X")
TB
# A tibble: 9 × 6
V1 V2 V3 V4 V5 V6
<int> <int> <dbl> <chr> <dbl> <chr>
1 1 1 0.5 A 1 X
2 2 2 1 B 1.41 X
3 1 3 1.5 C 1 X
4 2 4 0.5 A 1.41 X
5 1 5 1 B 1 X
6 2 6 1.5 C 1.41 X
7 1 7 0.5 A 1 X
8 2 8 1 B 1.41 X
9 1 9 1.5 C 1 X
Multiple columns
Take the square of every numeric column.
names(.SD) := lapply(.SD, \(x) x^2), .SDcols = is.numeric]
DT[, DT
V1 V2 V3 V4 V5 V6
<num> <num> <num> <char> <num> <char>
1: 1 1 0.25 A 1 X
2: 4 4 1.00 B 2 X
3: 1 9 2.25 C 1 X
4: 4 16 0.25 A 2 X
5: 1 25 1.00 B 1 X
6: 4 36 2.25 C 2 X
7: 1 49 0.25 A 1 X
8: 4 64 1.00 B 2 X
9: 1 81 2.25 C 1 X
TODO ## dplyr
= TB |>
TB mutate(across(where(is.numeric), \(x) x^2))
TB
# A tibble: 9 × 6
V1 V2 V3 V4 V5 V6
<dbl> <dbl> <dbl> <chr> <dbl> <chr>
1 1 1 0.25 A 1 X
2 4 4 1 B 2 X
3 1 9 2.25 C 1 X
4 4 16 0.25 A 2 X
5 1 25 1 B 1 X
6 4 36 2.25 C 2 X
7 1 49 0.25 A 1 X
8 4 64 1 B 2 X
9 1 81 2.25 C 1 X
refresh_data()
Replace
Replace values in rows that match a condition applied to a column.
<= 4, let(V2 = 0)]
DT[V2 DT
V1 V2 V3 V4
<int> <int> <num> <char>
1: 1 0 0.5 A
2: 2 0 1.0 B
3: 1 0 1.5 C
4: 2 0 0.5 A
5: 1 5 1.0 B
6: 2 6 1.5 C
7: 1 7 0.5 A
8: 2 8 1.0 B
9: 1 9 1.5 C
$V2 = replace(DF$V2, DF$V2 < 4, 0)
DF DF
V1 V2 V3 V4
1 1 0 0.5 A
2 2 0 1.0 B
3 1 0 1.5 C
4 2 4 0.5 A
5 1 5 1.0 B
6 2 6 1.5 C
7 1 7 0.5 A
8 2 8 1.0 B
9 1 9 1.5 C
= DF |>
TB mutate(V2 = base::replace(V2, V2 < 4, 0))
TB
V1 V2 V3 V4
1 1 0 0.5 A
2 2 0 1.0 B
3 1 0 1.5 C
4 2 4 0.5 A
5 1 5 1.0 B
6 2 6 1.5 C
7 1 7 0.5 A
8 2 8 1.0 B
9 1 9 1.5 C
refresh_data()
If, else, case
Fast version of ifelse()
. Return a vector the same length as the vector tested, with specific value for in case where the condition is TRUE
, where it is FALSE
, and where it is NA
.
= c(-3:3, NA)
x fifelse(test = x < 0,
yes = "neg",
no = "pos",
na = "NA")
[1] "neg" "neg" "neg" "pos" "pos" "pos" "pos" "NA"
= c(-3:3, NA)
x = ifelse(is.na(x),
result "NA",
ifelse(x < 0, "neg", "pos"))
result
[1] "neg" "neg" "neg" "pos" "pos" "pos" "pos" "NA"
= c(-3:3, NA)
x if_else(condition = x < 0,
true = "neg",
false = "pos",
missing = "NA")
[1] "neg" "neg" "neg" "pos" "pos" "pos" "pos" "NA"
Recode several cases at once, based on a vector of conditions.
= 1:10
x fcase(
%% 6 == 0, "fizz buzz",
x %% 2 == 0, "fizz",
x %% 3 == 0, "buzz",
x default = as.character(x)
)
[1] "1" "fizz" "buzz" "fizz" "5" "fizz buzz"
[7] "7" "fizz" "buzz" "fizz"
= 1:10
x = ifelse(x %% 6 == 0, "fizz buzz",
result ifelse(x %% 2 == 0, "fizz",
ifelse(x %% 3 == 0, "buzz", as.character(x))))
result
[1] "1" "fizz" "buzz" "fizz" "5" "fizz buzz"
[7] "7" "fizz" "buzz" "fizz"
= 1:10
x case_when(
%% 6 == 0 ~ "fizz buzz",
x %% 2 == 0 ~ "fizz",
x %% 3 == 0 ~ "buzz",
x TRUE ~ as.character(x)
)
[1] "1" "fizz" "buzz" "fizz" "5" "fizz buzz"
[7] "7" "fizz" "buzz" "fizz"
Modify by group
Group summaries of a column in categories.
= "V4", .(sumV2 = sum(V2))] DT[, by
V4 sumV2
<char> <int>
1: A 12
2: B 15
3: C 18
aggregate(V2 ~ V4, data = DF, FUN = sum)
V4 V2
1 A 12
2 B 15
3 C 18
|>
TB group_by(V4) |>
summarise(sumV2 = sum(V2)) |>
ungroup()
# A tibble: 3 × 2
V4 sumV2
<chr> <int>
1 A 12
2 B 15
3 C 18
Group values of a column in groups while applying a function to each category.
DT[,= tolower(V4),
by sumV1 = sum(V1))] .(
tolower sumV1
<char> <int>
1: a 4
2: b 5
3: c 4
aggregate(
~ tolower(V4),
V1 data = DF,
FUN = sum)
tolower(V4) V1
1 a 4
2 b 5
3 c 4
|>
TB group_by(tolower(V4)) |>
summarise(sumV1 = sum(V1))
# A tibble: 3 × 2
`tolower(V4)` sumV1
<chr> <int>
1 a 4
2 b 5
3 c 4
Group values of a column in two categories, TRUE (for rows matching the condition) and FALSE (For rows not matching the condition).
DT[,= V4 == "A",
keyby sum(V1)]
Key: <V4>
V4 V1
<lgcl> <int>
1: FALSE 9
2: TRUE 4
aggregate(V1 ~ groupA,
data = transform(DF, groupA = V4 == "A"),
FUN = sum)
groupA V1
1 FALSE 9
2 TRUE 4
|>
TB group_by(V4 == "A") |>
summarise(sum(V1))
# A tibble: 2 × 2
`V4 == "A"` `sum(V1)`
<lgl> <int>
1 FALSE 9
2 TRUE 4
Group values of a column in several categories with some of the rows of the initial dataset removed.
1:5,
DT[= V4,
by sumV1 = sum(V1))] .(
V4 sumV1
<char> <int>
1: A 3
2: B 3
3: C 1
aggregate(V1 ~ V4,
data = DF[1:5,],
FUN = sum)
V4 V1
1 A 3
2 B 3
3 C 1
|>
TB slice(1:5) |>
group_by(V4) |>
summarise(sumV1 = sum(V1))
# A tibble: 3 × 2
V4 sumV1
<chr> <int>
1 A 3
2 B 3
3 C 1
Add a new column with the number of observations per group.
let(n = .N), by = V1]
DT[, DT
V1 V2 V3 V4 n
<int> <int> <num> <char> <int>
1: 1 1 0.5 A 5
2: 2 2 1.0 B 4
3: 1 3 1.5 C 5
4: 2 4 0.5 A 4
5: 1 5 1.0 B 5
6: 2 6 1.5 C 4
7: 1 7 0.5 A 5
8: 2 8 1.0 B 4
9: 1 9 1.5 C 5
$n <-
DFave(DF$V1,
$V1,
DFFUN = length)
DF
V1 V2 V3 V4 n
1 1 1 0.5 A 5
2 2 2 1.0 B 4
3 1 3 1.5 C 5
4 2 4 0.5 A 4
5 1 5 1.0 B 5
6 2 6 1.5 C 4
7 1 7 0.5 A 5
8 2 8 1.0 B 4
9 1 9 1.5 C 5
= TB |>
TB group_by(V1) |>
add_tally()
TB
# A tibble: 9 × 5
# Groups: V1 [2]
V1 V2 V3 V4 n
<int> <int> <dbl> <chr> <int>
1 1 1 0.5 A 5
2 2 2 1 B 4
3 1 3 1.5 C 5
4 2 4 0.5 A 4
5 1 5 1 B 5
6 2 6 1.5 C 4
7 1 7 0.5 A 5
8 2 8 1 B 4
9 1 9 1.5 C 5
refresh_data()
Advanced
Summarise all the columns, typically using an aggregation function.
lapply(.SD, max)] DT[,
V1 V2 V3 V4
<int> <int> <num> <char>
1: 2 9 1.5 C
apply(DF, 2, max)
V1 V2 V3 V4
"2" "9" "1.5" "C"
|> summarise(across(everything(), max)) TB
# A tibble: 1 × 4
V1 V2 V3 V4
<int> <int> <dbl> <chr>
1 2 9 1.5 C
Summarise several columns, typically using an aggregation function.
lapply(.SD, mean),
DT[, = c("V1", "V2")] .SDcols
V1 V2
<num> <num>
1: 1.444444 5
data.frame(
mean_V1 = mean(DF$V1),
mean_V2 = mean(DF$V2))
mean_V1 mean_V2
1 1.444444 5
TODO
Summarise several columns by groups, typically using an aggregation function.
= V4,
DT[, by lapply(.SD, mean),
= c("V1", "V2")] .SDcols
V4 V1 V2
<char> <num> <num>
1: A 1.333333 4
2: B 1.666667 5
3: C 1.333333 6
= V4,
DT[, by lapply(.SD, mean),
= patterns("V1|V2|Z0")] .SDcols
V4 V1 V2
<char> <num> <num>
1: A 1.333333 4
2: B 1.666667 5
3: C 1.333333 6
= intersect(c("V1", "V2", "Z0"), names(DF))
cols aggregate(DF[cols], by = list(DF$V4), FUN = mean, na.rm = TRUE)
Group.1 V1 V2
1 A 1.333333 4
2 B 1.666667 5
3 C 1.333333 6
|>
TB group_by(V4) |>
summarise(across(c(V1, V2), mean)) |>
ungroup()
# A tibble: 3 × 3
V4 V1 V2
<chr> <dbl> <dbl>
1 A 1.33 4
2 B 1.67 5
3 C 1.33 6
Summarise several columns by group using multiple aggregation functions, grouping by one or more variables.
= V4,
DT[, by c(lapply(.SD, sum),
lapply(.SD, mean))]
V4 V1 V2 V3 V1 V2 V3
<char> <int> <int> <num> <num> <num> <num>
1: A 4 12 1.5 1.333333 4 0.5
2: B 5 15 3.0 1.666667 5 1.0
3: C 4 18 4.5 1.333333 6 1.5
aggregate(cbind(DF$V1, DF$V2, DF$V3) ~ V4,
data = DF,
FUN = function(x) c(sum = sum(x), mean = mean(x)))
V4 V1.sum V1.mean V2.sum V2.mean V3.sum V3.mean
1 A 4.000000 1.333333 12 4 1.5 0.5
2 B 5.000000 1.666667 15 5 3.0 1.0
3 C 4.000000 1.333333 18 6 4.5 1.5
|>
TB group_by(V4) |>
summarise(across(everything(),
list(sum = sum, mean = mean)))
# A tibble: 3 × 7
V4 V1_sum V1_mean V2_sum V2_mean V3_sum V3_mean
<chr> <int> <dbl> <int> <dbl> <dbl> <dbl>
1 A 4 1.33 12 4 1.5 0.5
2 B 5 1.67 15 5 3 1
3 C 4 1.33 18 6 4.5 1.5
Summarise a subset of columns by column type or condition.
lapply(.SD, mean), .SDcols = is.numeric] DT[,
V1 V2 V3
<num> <num> <num>
1: 1.444444 5 1
= function(x) {is.numeric(x) && mean(x) > 3}
foo lapply(.SD, mean), .SDcols = foo] DT[,
V2
<num>
1: 5
sapply(DF[sapply(DF, is.numeric)],
mean)
V1 V2 V3
1.444444 5.000000 1.000000
sapply(DF[sapply(DF, \(x) {
is.numeric(x) && mean(x) > 3
})], mean)
V2
5
|>
TB summarise(across(where(is.numeric),
mean))
# A tibble: 1 × 3
V1 V2 V3
<dbl> <dbl> <dbl>
1 1.44 5 1
|> summarise(across(
TB where(~ is.numeric(.x) && mean(.x) > 3), mean))
# A tibble: 1 × 1
V2
<dbl>
1 5
Modify all the columns using the same function.
lapply(.SD, rev)] DT[,
V1 V2 V3 V4
<int> <int> <num> <char>
1: 1 9 1.5 C
2: 2 8 1.0 B
3: 1 7 0.5 A
4: 2 6 1.5 C
5: 1 5 1.0 B
6: 2 4 0.5 A
7: 1 3 1.5 C
8: 2 2 1.0 B
9: 1 1 0.5 A
data.frame(lapply(DF, rev))
V1 V2 V3 V4
1 1 9 1.5 C
2 2 8 1.0 B
3 1 7 0.5 A
4 2 6 1.5 C
5 1 5 1.0 B
6 2 4 0.5 A
7 1 3 1.5 C
8 2 2 1.0 B
9 1 1 0.5 A
|> mutate(across(everything(), rev)) TB
# A tibble: 9 × 4
V1 V2 V3 V4
<int> <int> <dbl> <chr>
1 1 9 1.5 C
2 2 8 1 B
3 1 7 0.5 A
4 2 6 1.5 C
5 1 5 1 B
6 2 4 0.5 A
7 1 3 1.5 C
8 2 2 1 B
9 1 1 0.5 A
Apply a transformation to each element of the variables selected.
lapply(.SD, sqrt), .SDcols = V1:V2] DT[,
V1 V2
<num> <num>
1: 1.000000 1.000000
2: 1.414214 1.414214
3: 1.000000 1.732051
4: 1.414214 2.000000
5: 1.000000 2.236068
6: 1.414214 2.449490
7: 1.000000 2.645751
8: 1.414214 2.828427
9: 1.000000 3.000000
TODO
TODO
Apply a transformation for each element of the variables selected.
names(.SD) := lapply(.SD, sqrt), .SDcols = V1:V3]
DT[, DT
V1 V2 V3 V4
<num> <num> <num> <char>
1: 1.000000 1.000000 0.7071068 A
2: 1.414214 1.414214 1.0000000 B
3: 1.000000 1.732051 1.2247449 C
4: 1.414214 2.000000 0.7071068 A
5: 1.000000 2.236068 1.0000000 B
6: 1.414214 2.449490 1.2247449 C
7: 1.000000 2.645751 0.7071068 A
8: 1.414214 2.828427 1.0000000 B
9: 1.000000 3.000000 1.2247449 C
TODO ## dplyr
TODO
Apply a transformation to each element of the variables selected by condition.
- 1, .SDcols = is.numeric] DT[, .SD
V1 V2 V3
<num> <num> <num>
1: 0.0000000 0.0000000 -0.2928932
2: 0.4142136 0.4142136 0.0000000
3: 0.0000000 0.7320508 0.2247449
4: 0.4142136 1.0000000 -0.2928932
5: 0.0000000 1.2360680 0.0000000
6: 0.4142136 1.4494897 0.2247449
7: 0.0000000 1.6457513 -0.2928932
8: 0.4142136 1.8284271 0.0000000
9: 0.0000000 2.0000000 0.2247449
data.frame(lapply(DF,
if (is.numeric(x)) x - 1 else x)) \(x)
V1 V2 V3 V4
1 0 0 -0.5 A
2 1 1 0.0 B
3 0 2 0.5 C
4 1 3 -0.5 A
5 0 4 0.0 B
6 1 5 0.5 C
7 0 6 -0.5 A
8 1 7 0.0 B
9 0 8 0.5 C
|>
TB transmute(across(where(is.numeric),
~ '-'(., 1L)))
# A tibble: 9 × 3
V1 V2 V3
<int> <int> <dbl>
1 0 0 -0.5
2 1 1 0
3 0 2 0.5
4 1 3 -0.5
5 0 4 0
6 1 5 0.5
7 0 6 -0.5
8 1 7 0
9 0 8 0.5
Apply a transformation to each element of the variables selected by condition.
names(.SD) := lapply(.SD, as.integer),
DT[, = is.numeric]
.SDcols DT
V1 V2 V3 V4
<int> <int> <int> <char>
1: 1 1 0 A
2: 1 1 1 B
3: 1 1 1 C
4: 1 2 0 A
5: 1 2 1 B
6: 1 2 1 C
7: 1 2 0 A
8: 1 2 1 B
9: 1 3 1 C
sapply(DF, is.numeric)] <-
DF[lapply(DF[sapply(DF, is.numeric)],
as.integer) DF
V1 V2 V3 V4
1 1 1 0 A
2 2 2 1 B
3 1 3 1 C
4 2 4 0 A
5 1 5 1 B
6 2 6 1 C
7 1 7 0 A
8 2 8 1 B
9 1 9 1 C
= TB |>
TB mutate(across(where(is.numeric),
as.integer)) TB
# A tibble: 9 × 4
V1 V2 V3 V4
<int> <int> <int> <chr>
1 1 1 0 A
2 2 2 1 B
3 1 3 1 C
4 2 4 0 A
5 1 5 1 B
6 2 6 1 C
7 1 7 0 A
8 2 8 1 B
9 1 9 1 C
Combine multiple functions in a single statement.
= V4,
DT[, by 1:2], "X")] .(V1[
V4 V1 V2
<char> <int> <char>
1: A 1 X
2: A 1 X
3: B 1 X
4: B 1 X
5: C 1 X
6: C 1 X
= do.call(rbind,
DF by(DF, DF$V4, function(sub) {
head(data.frame(V1 = sub$V1,
V2 = "X"), 2)
}))
|>
TB group_by(V4) |>
slice(1:2) |>
transmute(V1 = V1,
V2 = "X")
# A tibble: 6 × 3
# Groups: V4 [3]
V4 V1 V2
<chr> <int> <chr>
1 A 1 X
2 A 2 X
3 B 2 X
4 B 1 X
5 C 1 X
6 C 2 X
refresh_data()
Chain (pipe)
Expression chaining allows you to perform multiple operations in sequence without creating intermediate objects.
# Chain operations using [][]
= V4, .(V1sum = sum(V1))][
DT[, by > 4] V1sum
V4 V1sum
<char> <int>
1: B 5
# Sort results in descending order
= V4, .(V1sum = sum(V1))][
DT[, by order(-V1sum)]
V4 V1sum
<char> <int>
1: B 5
2: A 4
3: C 4
# Chain using intermediate assignment
subset(aggregate(V1 ~ V4,
data = DF,
FUN = sum), V1 > 4)
V4 V1
2 B 5
# Sort results
# TODO
# Chain using pipe operator
|>
TB group_by(V4) |>
summarise(V1sum = sum(V1)) |>
filter(V1sum > 4)
# A tibble: 1 × 2
V4 V1sum
<chr> <int>
1 B 5
# Sort results
|>
TB group_by(V4) |>
summarise(V1sum = sum(V1)) |>
arrange(desc(V1sum))
# A tibble: 3 × 2
V4 V1sum
<chr> <int>
1 B 5
2 A 4
3 C 4
refresh_data()
Join
First, let’s create example datasets for demonstrating joins.
= data.table(
x Id = c("A", "B", "C", "C"),
X1 = c(1L, 3L, 5L, 7L),
XY = c("x2", "x4", "x6", "x8")
)= data.table(
y Id = c("A", "B", "B", "D"),
Y1 = c(1L, 3L, 5L, 7L),
XY = c("y1", "y3", "y5", "y7")
)
Basic Joins
There are two basic ways to perform joins in data.table
. First, we can use the standard merge()
function that should be familiar to most R users.
Second, we can use the square bracket notation with the on
argument. Here, the logic is that we select every row from the table outside that match rows from the table inside the bracket. The table inside the brackets is used as an “index”, so x[y]
essentially means that we are merging x
into y
. The benefit of this syntax is that we can efficiently combine it with other operations like summaries, sorts, etc.
Note that, in data.table
, the on
argument specifies the joining columns, and the i.
prefix refers to columns from the right table, while the x.
prefix refers to columns from the left table.
# Left join (keep all rows from x)
# merge(x, y, all.x = TRUE, by = "Id")
= "Id"] y[x, on
Id Y1 XY X1 i.XY
<char> <int> <char> <int> <char>
1: A 1 y1 1 x2
2: B 3 y3 3 x4
3: B 5 y5 3 x4
4: C NA <NA> 5 x6
5: C NA <NA> 7 x8
# Right join (keep all rows from y)
# merge(x, y, all.y = TRUE, by = "Id")
= "Id"] x[y, on
Id X1 XY Y1 i.XY
<char> <int> <char> <int> <char>
1: A 1 x2 1 y1
2: B 3 x4 3 y3
3: B 3 x4 5 y5
4: D NA <NA> 7 y7
# Inner join (keep only matching rows)
# merge(x, y, by = "Id"
= "Id", nomatch = NULL] x[y, on
Id X1 XY Y1 i.XY
<char> <int> <char> <int> <char>
1: A 1 x2 1 y1
2: B 3 x4 3 y3
3: B 3 x4 5 y5
# Anti join (keep rows from x with no match in y)
!y, on = "Id"] x[
Id X1 XY
<char> <int> <char>
1: C 5 x6
2: C 7 x8
# Full join (keep all rows)
merge(x, y, by = "Id", all = TRUE)
Key: <Id>
Id X1 XY.x Y1 XY.y
<char> <int> <char> <int> <char>
1: A 1 x2 1 y1
2: B 3 x4 3 y3
3: B 3 x4 5 y5
4: C 5 x6 NA <NA>
5: C 7 x8 NA <NA>
6: D NA <NA> 7 y7
# Left join (keep all rows from x)
merge(x, y, by = "Id", all.x = TRUE)
Key: <Id>
Id X1 XY.x Y1 XY.y
<char> <int> <char> <int> <char>
1: A 1 x2 1 y1
2: B 3 x4 3 y3
3: B 3 x4 5 y5
4: C 5 x6 NA <NA>
5: C 7 x8 NA <NA>
# Right join (keep all rows from y)
merge(x, y, by = "Id", all.y = TRUE)
Key: <Id>
Id X1 XY.x Y1 XY.y
<char> <int> <char> <int> <char>
1: A 1 x2 1 y1
2: B 3 x4 3 y3
3: B 3 x4 5 y5
4: D NA <NA> 7 y7
# Inner join (keep only matching rows)
merge(x, y, by = "Id")
Key: <Id>
Id X1 XY.x Y1 XY.y
<char> <int> <char> <int> <char>
1: A 1 x2 1 y1
2: B 3 x4 3 y3
3: B 3 x4 5 y5
# Full join (keep all rows)
merge(x, y, by = "Id", all = TRUE)
Key: <Id>
Id X1 XY.x Y1 XY.y
<char> <int> <char> <int> <char>
1: A 1 x2 1 y1
2: B 3 x4 3 y3
3: B 3 x4 5 y5
4: C 5 x6 NA <NA>
5: C 7 x8 NA <NA>
6: D NA <NA> 7 y7
# Anti join (more complex in base R)
!(x$Id %in% y$Id), ] x[
Id X1 XY
<char> <int> <char>
1: C 5 x6
2: C 7 x8
# Left join (keep all rows from x)
left_join(x, y, by = "Id")
Id X1 XY.x Y1 XY.y
<char> <int> <char> <int> <char>
1: A 1 x2 1 y1
2: B 3 x4 3 y3
3: B 3 x4 5 y5
4: C 5 x6 NA <NA>
5: C 7 x8 NA <NA>
# Right join (keep all rows from y)
right_join(x, y, by = "Id")
Id X1 XY.x Y1 XY.y
<char> <int> <char> <int> <char>
1: A 1 x2 1 y1
2: B 3 x4 3 y3
3: B 3 x4 5 y5
4: D NA <NA> 7 y7
# Inner join (keep only matching rows)
inner_join(x, y, by = "Id")
Id X1 XY.x Y1 XY.y
<char> <int> <char> <int> <char>
1: A 1 x2 1 y1
2: B 3 x4 3 y3
3: B 3 x4 5 y5
# Full join (keep all rows)
full_join(x, y, by = "Id")
Id X1 XY.x Y1 XY.y
<char> <int> <char> <int> <char>
1: A 1 x2 1 y1
2: B 3 x4 3 y3
3: B 3 x4 5 y5
4: C 5 x6 NA <NA>
5: C 7 x8 NA <NA>
6: D NA <NA> 7 y7
# Anti join
anti_join(x, y, by = "Id")
Id X1 XY
<char> <int> <char>
1: C 5 x6
2: C 7 x8
Reshape
Wide to long
Converting data from wide format (where variables are in columns) to long format (where variables become rows).
melt(DT,
id.vars = "V4",
variable.name = "Variable",
value.name = "Value")
Warning in melt.data.table(DT, id.vars = "V4", variable.name = "Variable", :
'measure.vars' [V1, V2, V3] are not all of the same type. By order of
hierarchy, the molten data value column will be of type 'double'. All measure
variables not of type 'double' will be coerced too. Check DETAILS in
?melt.data.table for more on coercion.
V4 Variable Value
<char> <fctr> <num>
1: A V1 1.0
2: B V1 2.0
3: C V1 1.0
4: A V1 2.0
5: B V1 1.0
6: C V1 2.0
7: A V1 1.0
8: B V1 2.0
9: C V1 1.0
10: A V2 1.0
11: B V2 2.0
12: C V2 3.0
13: A V2 4.0
14: B V2 5.0
15: C V2 6.0
16: A V2 7.0
17: B V2 8.0
18: C V2 9.0
19: A V3 0.5
20: B V3 1.0
21: C V3 1.5
22: A V3 0.5
23: B V3 1.0
24: C V3 1.5
25: A V3 0.5
26: B V3 1.0
27: C V3 1.5
V4 Variable Value
reshape(DF,
varying = setdiff(names(DF), "V4"),
v.names = "value",
timevar = "variable",
times = setdiff(names(DF), "V4"),
direction = "long")
V4 variable value id
1.V1 A V1 1.0 1
2.V1 B V1 2.0 2
3.V1 C V1 1.0 3
4.V1 A V1 2.0 4
5.V1 B V1 1.0 5
6.V1 C V1 2.0 6
7.V1 A V1 1.0 7
8.V1 B V1 2.0 8
9.V1 C V1 1.0 9
1.V2 A V2 1.0 1
2.V2 B V2 2.0 2
3.V2 C V2 3.0 3
4.V2 A V2 4.0 4
5.V2 B V2 5.0 5
6.V2 C V2 6.0 6
7.V2 A V2 7.0 7
8.V2 B V2 8.0 8
9.V2 C V2 9.0 9
1.V3 A V3 0.5 1
2.V3 B V3 1.0 2
3.V3 C V3 1.5 3
4.V3 A V3 0.5 4
5.V3 B V3 1.0 5
6.V3 C V3 1.5 6
7.V3 A V3 0.5 7
8.V3 B V3 1.0 8
9.V3 C V3 1.5 9
|> tidyr::pivot_longer(
TB cols = c("V1", "V2", "V3"),
names_to = "Variable",
values_to = "Value")
# A tibble: 27 × 3
V4 Variable Value
<chr> <chr> <dbl>
1 A V1 1
2 A V2 1
3 A V3 0.5
4 B V1 2
5 B V2 2
6 B V3 1
7 C V1 1
8 C V2 3
9 C V3 1.5
10 A V1 2
# ℹ 17 more rows
Long to wide
Converting data from long format back to wide format.
# Create example long data
= CJ(a = 1:2, b = 1:2, c = c("x", "y"))
long let(d = rnorm(8))]
long[,
dcast(long, a + b ~ c)
Using 'd' as value column. Use 'value.var' to override
Key: <a, b>
a b x y
<int> <int> <num> <num>
1: 1 1 2.68902220 0.5831947
2: 1 2 -2.63981033 -0.9560305
3: 2 1 -0.03325292 0.8866986
4: 2 2 -0.45678554 -1.1640416
# Create example long data
= expand.grid(a = 1:2, b = 1:2, c = c("x", "y"))
long $d = rnorm(8)
long
reshape(long,
idvar = c("a", "b"),
timevar = "c",
direction = "wide")
a b d.x d.y
1 1 1 -0.3285512 -2.21124190
2 2 1 -0.1594908 0.79596831
3 1 2 1.7166315 -0.02878383
4 2 2 -1.0624083 0.16262237
# Create example long data
= tidyr::expand_grid(a = 1:2, b = 1:2, c = c("x", "y"))
long $d = rnorm(8)
long
::pivot_wider(long,
tidyrid_cols = c("a", "b"),
names_from = "c",
values_from = "d")
# A tibble: 4 × 4
a b x y
<int> <int> <dbl> <dbl>
1 1 1 1.07 0.638
2 1 2 -0.160 -0.456
3 2 1 0.423 2.33
4 2 2 -2.08 0.0602
Split rows
Separating data into groups based on a factor.
split(DT, by = "V4")
$A
V1 V2 V3 V4
<int> <int> <num> <char>
1: 1 1 0.5 A
2: 2 4 0.5 A
3: 1 7 0.5 A
$B
V1 V2 V3 V4
<int> <int> <num> <char>
1: 2 2 1 B
2: 1 5 1 B
3: 2 8 1 B
$C
V1 V2 V3 V4
<int> <int> <num> <char>
1: 1 3 1.5 C
2: 2 6 1.5 C
3: 1 9 1.5 C
split(DF, DF$V4)
$A
V1 V2 V3 V4
1 1 1 0.5 A
4 2 4 0.5 A
7 1 7 0.5 A
$B
V1 V2 V3 V4
2 2 2 1 B
5 1 5 1 B
8 2 8 1 B
$C
V1 V2 V3 V4
3 1 3 1.5 C
6 2 6 1.5 C
9 1 9 1.5 C
|> group_split(V4) TB
<list_of<
tbl_df<
V1: integer
V2: integer
V3: double
V4: character
>
>[3]>
[[1]]
# A tibble: 3 × 4
V1 V2 V3 V4
<int> <int> <dbl> <chr>
1 1 1 0.5 A
2 2 4 0.5 A
3 1 7 0.5 A
[[2]]
# A tibble: 3 × 4
V1 V2 V3 V4
<int> <int> <dbl> <chr>
1 2 2 1 B
2 1 5 1 B
3 2 8 1 B
[[3]]
# A tibble: 3 × 4
V1 V2 V3 V4
<int> <int> <dbl> <chr>
1 1 3 1.5 C
2 2 6 1.5 C
3 1 9 1.5 C
Split columns
Separating a single column into multiple columns based on a delimiter.
# Create example data
= data.table(a = c("A:a", "B:b", "C:c"))
tmp
c("w", "z") := tstrsplit(a, split = ":")]
tmp[, tmp
a w z
<char> <char> <char>
1: A:a A a
2: B:b B b
3: C:c C c
# Create example data
= data.frame(a = c("A:a", "B:b", "C:c"))
tmp
$w = sapply(strsplit(as.character(tmp$a), ":"), `[`, 1)
tmp$z = sapply(strsplit(as.character(tmp$a), ":"), `[`, 2)
tmp tmp
a w z
1 A:a A a
2 B:b B b
3 C:c C c
# Create example data
= tibble(a = c("A:a", "B:b", "C:c"))
tmp
= tidyr::separate(tmp, a, c("w", "z"), remove = FALSE)
tmp tmp
# A tibble: 3 × 3
a w z
<chr> <chr> <chr>
1 A:a A a
2 B:b B b
3 C:c C c
Bind rows
Combining multiple datasets by rows. To begin, we create example data.
= data.table(1:3)
x = data.table(4:6)
y = data.table(7:9, 0L) z
# Simple row bind
rbind(x, y)
V1
<int>
1: 1
2: 2
3: 3
4: 4
5: 5
6: 6
# Simple row bind
rbindlist(list(x, y, x))
V1
<int>
1: 1
2: 2
3: 3
4: 4
5: 5
6: 6
7: 1
8: 2
9: 3
# Row bind with different columns
rbind(x, z, fill = TRUE)
V1 V2
<int> <int>
1: 1 NA
2: 2 NA
3: 3 NA
4: 7 0
5: 8 0
6: 9 0
# Bind list of data.tables with ID column
rbindlist(list(x, y), idcol = TRUE)
.id V1
<int> <int>
1: 1 1
2: 1 2
3: 1 3
4: 2 4
5: 2 5
6: 2 6
# Simple row bind
rbind(x, y)
V1
<int>
1: 1
2: 2
3: 3
4: 4
5: 5
6: 6
# Simple row bind
do.call(rbind, list(x, y, x))
V1
<int>
1: 1
2: 2
3: 3
4: 4
5: 5
6: 6
7: 1
8: 2
9: 3
# Bind with ID column
rbind(
cbind(x, id = 1),
cbind(y, id = 2)
)
V1 id
<int> <num>
1: 1 1
2: 2 1
3: 3 1
4: 4 2
5: 5 2
6: 6 2
# Simple row bind
bind_rows(x, y)
V1
<int>
1: 1
2: 2
3: 3
4: 4
5: 5
6: 6
# Simple row bind
bind_rows(list(x, y, x))
V1
<int>
1: 1
2: 2
3: 3
4: 4
5: 5
6: 6
7: 1
8: 2
9: 3
# Row bind with different columns
bind_rows(x, z)
V1 V2
<int> <int>
1: 1 NA
2: 2 NA
3: 3 NA
4: 7 0
5: 8 0
6: 9 0
# Bind list with ID column
bind_rows(list(x, y), .id = "id")
id V1
<char> <int>
1: 1 1
2: 1 2
3: 1 3
4: 2 4
5: 2 5
6: 2 6
Bind columns
# Column bind
::cbind(x, y) base
V1 V1
<int> <int>
1: 1 4
2: 2 5
3: 3 6
# Column bind
cbind(x, y)
V1 V1
<int> <int>
1: 1 4
2: 2 5
3: 3 6
# Column bind
bind_cols(x, y)
New names:
• `V1` -> `V1...1`
• `V1` -> `V1...2`
V1...1 V1...2
<int> <int>
1: 1 4
2: 2 5
3: 3 6
Set Operations
Set operations treat datasets as sets and perform mathematical set operations on them.
<- data.table(c(1, 2, 2, 3, 3))
x <- data.table(c(2, 2, 3, 4, 4)) y
# Intersection (rows present in both x and y)
fintersect(x, y)
V1
<num>
1: 2
2: 3
# Set difference (rows in x but not in y)
fsetdiff(x, y)
V1
<num>
1: 1
# Include duplicates
fsetdiff(x, y, all = TRUE)
V1
<num>
1: 1
2: 3
# Union (combine unique rows)
funion(x, y)
V1
<num>
1: 1
2: 2
3: 3
4: 4
# Include duplicates
funion(x, y, all = TRUE)
V1
<num>
1: 1
2: 2
3: 2
4: 3
5: 3
6: 2
7: 2
8: 3
9: 4
10: 4
# Test for equality
fsetequal(x, x[order(-V1),])
[1] TRUE
all.equal(x, x)
[1] TRUE
# Intersection
intersect(x, y)
V1
<num>
1: 2
2: 3
# Set difference
setdiff(x, y)
V1
<num>
1: 1
# Union (unique rows)
union(x, y)
V1
<num>
1: 1
2: 2
3: 3
4: 4
# With duplicates
unlist(c(x, y))
V11 V12 V13 V14 V15 V11 V12 V13 V14 V15
1 2 2 3 3 2 2 3 4 4
# Test for equality
identical(x, x)
[1] TRUE
all.equal(x, x)
[1] TRUE
# Intersection
::intersect(x, y) dplyr
V1
<num>
1: 2
2: 3
# Set difference
::setdiff(x, y) dplyr
V1
<num>
1: 1
# Union (unique rows)
::union(x, y) dplyr
V1
<num>
1: 1
2: 2
3: 3
4: 4
# With duplicates
union_all(x, y)
V1
<num>
1: 1
2: 2
3: 2
4: 3
5: 3
6: 2
7: 2
8: 3
9: 4
10: 4
# Test for equality
setequal(x, x[order(-V1),])
[1] TRUE
all.equal(x, x)
[1] TRUE
Read and write
# Write to CSV
fwrite(DT, "DT.csv")
# Write tab-delimited
fwrite(DT, "DT.txt", sep = "\t")
# Read CSV or tab-delimited
= fread("DT.csv")
DT1 = fread("DT.txt", sep = "\t")
DT2
# Read specific columns
= fread("DT.csv", select = c("V1", "V4"))
DT3 = fread("DT.csv", drop = "V4")
DT4
# Combine multiple files
rbindlist(lapply(c("DT.csv", "DT.csv"), fread))
V1 V2 V3 V4
<int> <int> <num> <char>
1: 1 1 0.5 A
2: 2 2 1.0 B
3: 1 3 1.5 C
4: 2 4 0.5 A
5: 1 5 1.0 B
6: 2 6 1.5 C
7: 1 7 0.5 A
8: 2 8 1.0 B
9: 1 9 1.5 C
10: 1 1 0.5 A
11: 2 2 1.0 B
12: 1 3 1.5 C
13: 2 4 0.5 A
14: 1 5 1.0 B
15: 2 6 1.5 C
16: 1 7 0.5 A
17: 2 8 1.0 B
18: 1 9 1.5 C
# Write to CSV
write.csv(DF, "DF.csv", row.names = FALSE)
# Write tab-delimited
write.table(DF, "DF.txt", sep = "\t",
row.names = FALSE, col.names = TRUE)
# Read CSV or tab-delimited
= read.csv("DF.csv")
DF1 = read.table("DF.txt", sep = "\t", header = TRUE)
DF2
# Read specific columns
= read.csv("DF.csv")[, c("V1", "V4")]
DF3 = read.csv("DF.csv")[, !(names(read.csv("DF.csv")) %in% "V4")]
DF4
# Combine multiple files
do.call(rbind, lapply(c("DF.csv", "DF.csv"), read.csv))
V1 V2 V3 V4
1 1 1 0.5 A
2 2 2 1.0 B
3 1 3 1.5 C
4 2 4 0.5 A
5 1 5 1.0 B
6 2 6 1.5 C
7 1 7 0.5 A
8 2 8 1.0 B
9 1 9 1.5 C
10 1 1 0.5 A
11 2 2 1.0 B
12 1 3 1.5 C
13 2 4 0.5 A
14 1 5 1.0 B
15 2 6 1.5 C
16 1 7 0.5 A
17 2 8 1.0 B
18 1 9 1.5 C
# Write to CSV
|> readr::write_csv("TB.csv")
TB
# Write tab-delimited
|> readr::write_delim("TB.txt", delim = "\t")
TB
# Read CSV or tab-delimited
= readr::read_csv("TB.csv") TB1
Rows: 9 Columns: 4
── Column specification ────────────────────────────────────────────────────────
Delimiter: ","
chr (1): V4
dbl (3): V1, V2, V3
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
= readr::read_delim("TB.txt", delim = "\t") TB2
Rows: 9 Columns: 4
── Column specification ────────────────────────────────────────────────────────
Delimiter: "\t"
chr (1): V4
dbl (3): V1, V2, V3
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Combine multiple files
c("TB.csv", "TB.csv") |>
::map_dfr(readr::read_csv) purrr
Rows: 9 Columns: 4
── Column specification ────────────────────────────────────────────────────────
Delimiter: ","
chr (1): V4
dbl (3): V1, V2, V3
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
Rows: 9 Columns: 4
── Column specification ────────────────────────────────────────────────────────
Delimiter: ","
chr (1): V4
dbl (3): V1, V2, V3
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# A tibble: 18 × 4
V1 V2 V3 V4
<dbl> <dbl> <dbl> <chr>
1 1 1 0.5 A
2 2 2 1 B
3 1 3 1.5 C
4 2 4 0.5 A
5 1 5 1 B
6 2 6 1.5 C
7 1 7 0.5 A
8 2 8 1 B
9 1 9 1.5 C
10 1 1 0.5 A
11 2 2 1 B
12 1 3 1.5 C
13 2 4 0.5 A
14 1 5 1 B
15 2 6 1.5 C
16 1 7 0.5 A
17 2 8 1 B
18 1 9 1.5 C
Clean up local files.
file.remove(c("DT.csv", "TB.csv", "DF.csv", "DT.txt", "TB.txt", "DF.txt"))
[1] TRUE TRUE TRUE TRUE TRUE TRUE