Setup data frames

Setup data frames
library(data.table)
DT <- data.table(V1 = rep(1:2, 5)[-10],
V2 = 1:9,
V3 = c(0.5, 1.0, 1.5),
V4 = rep(LETTERS[1:3], 3))
class(DT)
DT
DF <- data.frame(
V1 = rep(1:2, 5)[-10],
V2 = 1:9,
V3 = c(0.5, 1.0, 1.5),
V4 = rep(LETTERS[1:3], 3)
)
class(DF)
DF
library(dplyr)
TB <- tibble(V1 = rep(1:2, 5)[-10],
V2 = 1:9,
V3 = rep(c(0.5, 1.0, 1.5), 3),
V4 = rep(LETTERS[1:3], 3))
class(TB)
TB

Filter rows

Filter rows
DT[3:4,] DF[3:4,] TB[3:4,]
Negative indices
DT[!3:7,] DF[-(3:7),] slice(TB, -(3:7))
Logical expression
DT[V2 > 5]
DT[V4 %chin% c("A", "C")]
DF[TB$V2 > 5, ]
DF[TB$V4 %in% c("A", "C"), ]
filter(TB, V2 > 5)
filter(TB, V4 %in% c("A", "C"))
Multiple conditions
DT[V1 == 1 & V4 == "A"] DF[DF$V1 == 1 & DF$V4 == "A"] filter(TB, V1 == 1, V4 == "A")
Unique rows
unique(DT)
unique(DT, by = c("V1", "V4"))
DF[!duplicated(DF), ]
DF[!duplicated(DF[c("V1", "V4")]), ]
distinct(TB)
distinct(TB, across(c(V1, V4)))
Discard rows with missing values
na.omit(DT, cols = 1:4) DF[complete.cases(DF[, 1:4]), ] tidyr::drop_na(TB, names(DF))
Random sample of rows
DT[sample(.N, 3)] # .N = nb of rows in DT
DT[sample(.N, .N / 2)]
DT[frankv(-V1, ties.method = "dense") < 2]
DF[sample(nrow(DF), 3), ]
DF[sample(nrow(DF), nrow(DF) / 2), ]
DF[rank(-DF$V1, ties.method = "dense") < 2, ]
slice_sample(TB, n = 3)
slice_sample(TB, prop = 0.5)
TB %>% slice_max(V1, n = 1)
Other
DT[V4 %like% "^B"]
DT[V2 %between% c(3, 5)]
DT[between(V2, 3, 5, incbounds = FALSE)]
DT[V2 %inrange% list(-1:1, 1:3)]
DF[grepl("^B", DF$V4), ]
DF[DF$V2 >= 3 & DF$V2 <= 5, ]
DF[DF$V2 > 3 & DF$V2 < 5, ]
DF[DF$V2 %in% c(-1:1, 1:3), ]
filter(TB, grepl("^B", V4))
filter(TB, dplyr::between(V2, 3, 5))
filter(TB, V2 > 3 & V2 < 5)
filter(TB, V2 >= -1:1 & V2 <= 1:3)

Select columns

Select one column using an index(not recommended)
DT[[3]]
DT[, 3]
DF[[3]]
DF[3]
TB[[3]]
TB[3]
Select one column using column name
DT[, list(V2)] # data.table
DT[, .(V2)] # data.table
DT[, "V2"] # data.table
DT[, V2] # vector
DT[["V2"]] # vector
DF["V2"]. # data frame
setNames(DF$V2, DFV$V4) # vector
DF[, "V2", drop = FALSE] # data frame
DF[["V2"]] # vector
select(TB, V2) # tibble
pull(TB, V2, name = V4) # vector
TB[, "V2"] # tibble
TB[["V2"]] # vector
Select several columns
DT[, .(V2, V3, V4)]
DT[, list(V2, V3, V4)]
DT[, V2:V4]
DF[c("V2", "V3", "V4")]
DF[, which(names(DF) %in% c("V2", "V3", "V4"))]
DF[, 2:4]
select(TB, V2, V3, V4)
select(TB, V2:V4)
Exclude columns
DT[, !c("V2", "V3")] DF[ , !(names(DF) %in% c("V2", "V3"))] select(TB, -V2, -V3)
Select/Exclude columns using a character vector
cols <- c("V2", "V3")
DT[, ..cols] # .. prefix means 'one-level up'
DT[, !..cols]
cols <- c("V2", "V3")
DF[, cols]
DF[ , !(names(DF) %in% cols)]
cols <- c("V2", "V3")
select(DF, !!cols)
select(DF, -!!cols)
Other selections
cols <- paste0("V", 1:2)
cols <- union("V4", names(DT))
cols <- grep("V", names(DT))
cols <- grep("3$", names(DT))
cols <- grep(".2", names(DT))
cols <- grep("^V1|X$", names(DT))
cols <- grep("^(?!V2)", names(DT), perl = TRUE)
DT[, ..cols]
DF[ , grep("^V[1-2]$", names(DF))]
DF[ , c("V4", setdiff(names(DF), "V4"))]
DF[ , grep("V", names(DF))]
DF[ , grep("3$", names(DF))]
DF[ , grep(".2", names(DF))]
DF[ , c("V1", "X")]
DF[ , !grepl("^V2", names(DF))]
select(TB, num_range("V", 1:2))
select(TB, V4, everything())
select(TB, contains("V"))
select(TB, ends_with("3"))
select(TB, matches(".2"))
select(TB, one_of(c("V1", "X")))
select(TB, -starts_with("V2"))

Sort rows

By column
DT[order(V3)] DF[order(DF$V3)] arrange(TB, V3)
Decreasing order
DT[order(-V3)] DF[order(DF$V3, decreasing = TRUE)] arrange(TB, desc(V3))
Based on several columns
DT[order(V1, -V2)] DF[order(DF$V1, -DF$V2), ] arrange(TB, V1, desc(V2))

Aggregate (summarise)

One column
DT[, sum(V1)] # vector
DT[, .(sum(V1))] # data.table
DT[, .(sumV1 = sum(V1))] # data.table
sum(DF$V1)
data.frame(sumV1 = sum(DF$V1))
summarise(TB, sum(V1))
summarise(TB, sumV1 = sum(V1))
Several columns
DT[, .(sum(V1), sd(V3))] data.frame(sum_V1 = sum(DF$V1), sd_V3 = sd(DF$V3)) summarise(TB, sum(V1), sd(V3))
Several columns with column names
DT[, .(sumv1 = sum(V1),
sdv3 = sd(V3))]
data.frame(sumv1 = sum(V1),
sdv3 = sum(V3))
TB %>%
summarise(sumv1 = sum(V1),
sdv3 = sd(V3))
Subset of rows
DT[1:4, sum(V1)] sum(DF[1:4, "V1"]) TB %>%
slice(1:4) %>%
summarise(sum(V1))
Subset of rows
DT[, data.table::first(V3)]
DT[, data.table::last(V3)]
DT[5, V3]
DT[, uniqueN(V4)]
uniqueN(DT)
DF[1, "V3"]
DF[nrow(DF), "V3"]
DF[5, "V3"]
length(unique(DF$V4))
length(unique(DF))
summarise(TB, dplyr::first(V3))
summarise(TB, dplyr::last(V3))
summarise(TB, nth(V3, 5))
summarise(TB, n_distinct(V4))
n_distinct(TB)
Count number of observations for each group
DT[, .N, by = V4] as.data.frame(table(DF$V4)) TB %>%
group_by(V4) %>%
tally()

Modify columns

Modify a column
DT[, V1 := V1^2]
DT
DF$V1 <- DF$V1^2
DF
TB <- TB %>% mutate(V1 = V1^2)
TB
Add one column
DT[, v5 := log(V1)] DF$v5 <- log(DF$V1)
DF
TB <- mutate(DF, v5 = log(V1))
Add several columns
DT[, c("v6", "v7") := .(sqrt(V1), "X")]
DT[, ':='(v6 = sqrt(V1),
v7 = "X")]
DF$v6 <- sqrt(DF$V1)
DF$v7 <- "X"
DF
TB <- mutate(TB, v6 = sqrt(V1), v7 = "X")
Create one column and remove the others
DT[, .(v8 = V3 + 1)] data.frame(v8 = DF$V3 + 1) transmute(TB, v8 = V3 + 1)
Remove one column
DT[, v5 := NULL] DF <- DF[, !names(DF %in% "v5")] TB <- select(TB, -v5)
Remove several columns
DT[, c("v6", "v7") := NULL] DF <- DF[, !(names(DF) %in% c("v6", "v7"))] TB <- select(TB, -v6, -v7)
Remove columns using a vector of colnames
cols <- c("V3")
DT[, (cols) := NULL] # ! not DT[, cols := NULL]
cols <- c("V3")
DF <- DF[, !(names(DF) %in% cols)]
cols <- c("V3")
TB <- select(TB, -one_of(cols))
Replace values for rows matching a condition
DT[V2 < 4, V2 := 0L]
DT
DF$V2 <- replace(DF$V2, DF$V2 < 4, 0L)
DF
TB <- mutate(DF, V2 = base::replace(V2, V2 < 4, 0L))
TB
By group
DT[, .(sumV2 = sum(V2)), by = "V4"]
DT[, by = V4,
.(sumV2 = sum(V2))]
aggregate(V2 ~ V4, data = DF, FUN = sum) TB %>%
group_by(V4) %>%
summarise(sumV2 = sum(V2))
By several groups
DT[, keyby = .(V4, V1),
.(sumV2 = sum(V2))]
aggregate(V2 ~ V4 + V1,
data = DF, FUN = sum)
TB %>%
group_by(V4, V1) %>%
summarise(sumV2 = sum(V2)) %>%
ungroup()
Calling function in by
DT[, by = tolower(V4),
.(sumV1 = sum(V1))]
aggregate(V! ~ tolower(V4),
data = DF, FUN = sum)
TB %>%
group_by(tolower(V4)) %>%
summarise(sumV1 = sum(V1))
Assigning column name in by
DT[, keyby = .(abc = tolower(V4)),
.(sumV1 = sum(V1))]
aggregate(V1 ~ abc,
data = transform(DF, abc = tolower(V4)),
FUN = sum)
TB %>%
group_by(abc = tolower(V4)) %>%
summarise(sumV1 = sum(V1))
Using a condition in by
DT[, keyby = V4 == "A",
sum(V1)]
aggreggate(V1 ~ groupA,
data = transform(DF, groupA = V4 == "A"),
FUN = sum)
TB %>%
group_by(V4 == "A") %>%
summarise(sum(V1))
By on a subset of rows
DT[1:5, # i
.(sumV1 = sum(V1)), # j
by = V4] # by
aggregate(V1 ~ V4,
data = DF[1:5,], sum)
TB %>%
slice(1:5) %>%
group_by(V4) %>%
summarise(sumV1 = sum(V1))
Add a column with number of observations for each group
DT[, n := .N, by = V1][]
DT[, n := NULL]
DF$tally <-
ave(DF$V1,
DF$V1,
FUN = length)
TB %>%
group_by(V1) %>%
add_tally()
Retrieve the first/last/nth observation for each group
DT[, data.table::first(V2), by = V4]
DT[, data.table::last(V2), by = V4]
DT[, V2[2], by = V4]
aggregate(V2 ~ V4,
data = DF,
FUN = function(X) x[1])
aggregate(V2 ~ V4,
data = DF,
FUN = function(x) x[length(x)])
aggregate(V2 ~ V4,
data = DF,
FUN = function(x) x[2])
TB %>%
group_by(V4) %>%
summarise(dplyr::first(V2))
TB %>%
group_by(V4) %>%
summarise(dplyr::last(V2))
TB %>%
group_by(V4) %>%
summarise(dplyr::nth(V2, 2))

Going further

Advanced columns manipulation

Summarise all the columns
DT[, lapply(.SD, max)] apply(DF, 2, max) TB %>% summarise(across(everything(), max))
Summarise several columns
DT[, lapply(.SD, mean),
.SDcols = c("V1", "V2")]
data.frame(mean_V1 = mean(DF$V1), mean_V2 = mean(DF$V2)) TB %>% summarise(across(c(V1, V2), mean))
Summarise several columns by group
DT[, by = V4,
lapply(.SD, mean),
.SDcols = c("V1", "V2")]
DT[, by = V4,
lapply(.SD, mean),
.SDcols = patterns("V1|V2|Z0")]
cols <- intersect(C("V1", "V2", "Z0"), names(DF))
aggregate(DF[cols], by = LIST(DF$V4), FUN = mean, na.rm = TRUE)
Summarise with more than one function by group
DT[, by = V4,
c(lapply(.SD, sum),
lapply(.SD, mean))]
aggregate(cbind(sumV1 = DF$V1, meanV1 = DF$V1,
sumV2 = DF$V2, meanV2 = DF$V2) ~ V4,
data = DF,
FUN = function(x) c(sum = sum(x), mean = mean(x)))
TB %>%
group_by(V4) %>%
summarise(across(everything(),
list(sum = sum, mean = mean)))
Summarise using a condition
DT[, lapply(.SD, mean),
.SDcols = is.numeric]
foo <- function(x) {is.numeric(x) && mean(x) > 3}
DT[, lapply(.SD, mean),
.SDcols = foo]
sapply(DF[sapplu(DF, is.numeric)],
mean)
sapply(DF[sapply(DF, function(x) is.numeric(x) && mean(x) > 3)], mean)
TB %>%
summarise(across(where(is.numeric),
mean))
TB %>% summarise(across(where(~ is.numeric(.x) && mean(.x) > 3),
mean))
Modify all the columns
DT[, lapply(.SD, rev)] data.frame(lapply(DF, rev)) TB %>% mutate(across(everything(), rev))
Modify several columns (dropping the others)
DT[, lapply(.SD, sqrt),
.SDcols = V1:V2]
DT[, lapply(.SD, exp),
.SDcols = !"V4"]
DF[c("V1", "V2")] <- lapply(DF[c("V1", "V2")], sqrt)
DF[setdiff(names(DF), "V4")] <- lapply(DF[setdiff(names(DF), "V4")], exp)
Modify several columns (keeping the others)
DT[, c("V1", "V2") := lapply(.SD, sqrt),
.SDcols = c("V1", "V2")]
cols <- setdiff(names(DT), "V4")
DT[, (cols) := lapply(.SD, "^", 2L),
.SDcols = cols]
DT
DF[c("V1", "V2")] <- lapply(DF[c("V1", "V2")], sqrt)
cols <- setdiff(names(DF), "V4")
DF[cols] <- lapply(DF[cols], "^", 2L)
TB <-TB %>%
mutate(across(all_of(c("V1", "V2")), sqrt))
TB <- TB %>%
mutate(across(-any_of("V4"),
~ "^"(.x, 2L)))
TB
Modify columns using a condition (dropping the others)
DT[, .SD - 1,
.SDcols = is.numeric]
DT
data.frame(lapply(DF,
function(x) if (is.numeric(x)) x - 1 else x))
DF
TB %>%
transmute(across(where(is.numeric),
~ '-'(., 1L)))
TB
Modify columns using a condition (keeping the others)
DT[, (cols) := lapply(.SD, as.integer),
.SDcols = is.numeric]
DF[sapply(DF, is.numeric)] <-
lapply(DF[sapply(DF, is.numeric)],
as.integer)
DF
TB <- TB %>%
mutate(across(where(is.numeric),
as.integer))
Use a complex expression
DT[, by = V4,
.(V1[1:2], "X")]
DF <- do.call(rbind,
by(DF, DF$V4, function(sub) {
head(data.frame(V1 = sub$V1,
V2 = "X"), 2)
}))
TB %>%
group_by(V4) %>%
slice(1:2) %>%
transmute(V1 = V1,
V2 = "X")
Use multiple expressions (with DT[,{j}])
DT[, {print(V1)
print(summary(V1))
x <- V1 + sum(V2)
.(A = 1:.N, B = x)
}]
print(DF$V1)
print(summary(DF$V1))
x <- DF$V1 + sum(DF$V2)
data.frame(A = seq_along(DF$V1), B = x)
#

Chain expressions

Expression chaining using DT[][] (recommended)
DT[, by = V4,
.(V1sum = sum(V1)) ][
V1sum > 5]
DF <- subset(aggregate(V1 ~ V4,
data = DF,
FUN = sum), V1 > 5)
TB %>%
group_by(V4) %>%
summarise(V1sum = sum(V1)) %>%
filter(V1sum > 5)
Expression chaining using %>%
DT[, by = V4,
.(V1sum = sum(V1))] %>%
.[order(-V1sum)]
subset(aggregate(V1 ~ V4,
data = DF,
FUN = sum),
V1 > 5)
TB %>%
group_by(V4) %>%
summarise(V1sum = sum(V1)) %>%
arrange(desc(V1sum))

Indexing and Keys

Set the key/index
setkey(DT, V4)
setindex(DT, V4)
DT
DF[order(DF$V4), ] TB <- arrange(TB, V4)
TB
Select the matching rows
DT["A", on = "V4"]
DT[c("A", "C"), on = .(V4)]
DF[DF$V4 == "A", ]
DF[DF$V4 %in% c("A", "C"), ]
filter(TB, V4 == "A")
filter(TB, V4 %in% c("A", "C"))
Select the first matching row
DT["B", on = "V4", mult = "first"]
DT[c("B", "C"), on = "V4", mult = "first"]
DF[DF$V4 == "B", ][1, ] TB %>%
filter(V4 == "B") %>%
slice(1)
# ?
Select the last matching row
DT["A", on = "V4", mult = "last"] DF[DF$V4 == "A", ][nrow(DF[DF$V4 == "A", ]), ] TB %>%
filter(V4 == "A") %>%
slice(n())
Nomatch argument
DT[c("A", "D"), on = "V4", nomatch = NA]
DT[c("A", "D"), on = "V4", nomatch = 0]
DF[DF$V4 %in% c("A", "D"), ] filter(TB, V4 %in% c("A", "D"))
Apply a function on the matching rows
DT[c("A", "C"), sum(V1), on = "V4"] sum(DF$V1[DF$V4 %in% c("A", "C")]) TB %>%
filter(V4 %in% c("A", "C")) %>%
summarise(sum(V1))
Modify values for matching rows
DT["A", V1 := 0, on = "V4"]
DT
DF$V1[DF$V4 == "A"] <- 0 TB <- TB %>%
mutate(V1 = base::replace(V1, V4 == "A", 0L)) %>%
arrange(V4)
TB
Use keys in by
DT[!"B", sum(V1), on = "V4", by = .EACHI]
DT[V4 != "B",
by = V4,
sum(V1)]
aggregate(V1 ~ V4,
data = DF[DF$V4 != "B", ],
FUN = sum)
TB %>%
filter(V4 != "B") %>%
group_by(V4) %>%
summarise(sum(V1))
Set keys/indices for multiple columns
setkey(DT, V4, V1)
setindex(DT, V4, V1)
DF[order(DF$V4, DF$V1)] TB <- arrange(DF, V4, V1)
Subset using multiple keys/indices
DT[.("C", 1), on = .(V4, V1)]
DT[.(c("B", "C"), 1), on = .(V4, V1)]
DT[.(c("B", "C"), 1), on = .(V4, V1), which = TRUE]
DF[DF$V4 == "C" & DT$V1 == 1, ]
DF[DF$V4 %in% c("B", "C") & DT$V1 == 1, ]
which(DF$V4 %in% c("B", "C") & DT$V1 == 1)
filter(TB, V1 == 1, V4 == "C")
filter(TB, V1 == 1, V4 %in% c("B", "C"))
Remove keys/indices
setkey(DT, NULL)
setindex(DT, NULL)
DF TB

’set*()’ modifications

Replace values
set(DT, i = 1L, j = 2L, value = 3L)
DT
DF[1, 2] <- 3
DF
TB[1, 2] <- 3L
TB
Reorder rows
setorder(DT, V4, -V1)
setorderv(DT, c("V4", "V1"), c(1, -1))
DF <- DF[order(TB$V4, -DF$V1), ]
DF
TB <- arrange(TB, V4, desc(V1))
Modify colnames
setnames(DT, old = "V2", new = "v2")
setnames(DT, old = -(c(1, 3)), new = "V2")
DT
names(DF)[names(DF) == "V2"] <- "v2"
names(DF)[names(DF) == "v2"] <- "V2"
DF
TB <- rename(DF, v2 = V2)
TB <- rename(DF, V2 = v2)
TB
Reorder columns
setcolorder(DT, c("V4", "V1", "V2"))
DT
DF <- DF[, c("V4", "V1", "V2")]
DF
TB <- select(TB, V4, V1, V2)
TB
Convert data
DF <- DF[, c("V1", "V2", "V4")]
DF
TB %>%
relocate(V4, .after = V2)
Relocate columns

Advanced use of by

Select first/last/.. row by group
DT[, .SD[1], by = V4]
DT[, .SD[c(1, .N)], by = V4]
DT[, tail(.SD, 2), by = V4]
DF[ave(DF$V4,
DF$V4,
FUN = seq_along) == 1, ]
DF[ave(DF$V4,
DF$V4,
FUN = seq_along) %in% c(1, max(ave(DF$V4, DF$V4, FUN = seq_along)))]
do.call(rbind, by(DF, DF$V4, function(x) tail(x, 2)))
TB %>%
group_by(V4) %>%
slice(1)
TB %>%
group_by(V4) %>%
slice(1, n())
TB %>%
group_by(V4) %>%
group_map(~ tail(.x, 2))
Select rows using a nested query
DT[, .SD[which.min(V2)], by = V4] DF[order(DF$V2), ][
ave(DF$V4, DF$V4, FUN = seq_along) == 1,]
TB %>%
group_by(V4) %>%
arrange(V2) %>%
slice(1)
Add a group counter column
DT[, Grp := .GRP, by = .(V4, V1)][]
DT[, Grp := NULL]
ave(seq_along(DF$V4), DF$V4, DF$V1, FUN = function(X) seq_along(x)) TB %>%
group_by(V4, V1) %>%
mutate(Grp = cur_group_id())
Get row number of first (and last) observation by group
DT[, .I, by = V4]
DT[, .I[1], by = V4]
DT[, .I[c(1, .N)], by = V4]
sapply(split(DT, DT$V4), function(group) rownames(groupe))
sapply(split(DT, DT$V4), function(group) rownames(group)[1])
sapply(split(DT, DT$V4), function(group) c(rownames(group)[1],
rownames(group)[nrow(group)]))
TB %>%
group_by(V4) %>%
mutate(cur_group_rows()) %>%
ungroup()
TB %>%
group_by(V4) %>%
summarize(cur_group_rows()[1])
TB %>%
group_by(V4) %>%
summarize(cur_group_rows()[c(1, n())])
Handle list-columns by group
DT[, .(.(V1)), by = V4] # V1 as list
DT[, .(.(.SD)), by = V4] # subsets
tapply(DF$V1, DF$V4,
function(x) list(x))
split(DF, DF$V4)
TB %>%
group_by(V4) %>%
summarise(list(V1))
TB %>%
group_by(V4) %>%
group_nest()
Grouping sets (multiple by at once)
rollup(DT,
.(SumV2 = sum(V2)),
by = c("V1", "V4"))
rollup(DT,
.(SumV2 = sum(V2), .N),
by = c("V1", "V4"),
id = TRUE)
cube(DT,
.(SumV2 = sum(V2), .N),
by = c("V1", "V4"),
id = TRUE)
groupingsets(DT,
.(SumV2 = sum(V2), .N),
by = c("V1", "V4"),
sets = list("V1", c("V1", "V4")),
id = TRUE)
aggregate(V2 ~ V1 + V4,
data = DF,
FUN = sum)
aggregate(cbind(V2, N = 1) ~ V1 + V4,
data = DF,
FUN = function(X) c(sum(x),
length(x)))
aggregate(cbind(V2, N = 1) ~ V1 + V4,
data = DF,
FUN = function(c) c(sum(x),
length(x)),
na.action = NULL)
aggregate(V2 ~ V1, data = DF, FUN = sum)
aggregate(V2 ~ V1 + V4, data = DF, FUN = sum)

Read and write

Write data to a csv file
fwrite(DT, "DT.csv") write.csv(DF, "DF.csv", row.names = FALSE) readr::write_csv(TB, "TB.csv")
Write data to a tab-delimited file
fwrite(DT, "DT.txt", sep = "\t") write.table(DF, "DF.txt", sep = "\t", row.names = FALSE, col.names = TRUE) readr::write_delim(TB, "TB.txt", delim = "\t")
Write list-column data to a csv file
fwrite(setDT(list(0, list(1:5))), "DT2.csv") DT2 <- data.frame(V1 = 0, V2 = list(1:5))
write.csv(DT2, "DT2.csv", row.names = FALSE)
Read a csv / tab-delimited filed
fread("DT.csv")
fread("DT.txt", sep = "\t")
read.csv("DF.csv")
read.table("DF.txt", sep = "\t", header = TRUE)
readr::read_csv("TB.csv")
readr::read_delim("TB.txt", delim = "\t")
Read a csv file selecting / dropping columns
fread("DT.csv", select = c("V1", "V4"))
fread("DT.csv", drop = "V4")
read.csv("DF.csv")[, c("V1", "V4")]
read.csv("DF.csv")[, !(names(read.csv("DF.csv")) %in% "V4")]
Read and rbind several files
rbindlist(lapply(c("DT.csv", "DT.csv"), fread)) do.call(rbind, lapply(c("DF.csv", "DF.csv"), read.csv)) c("TB.csv", "TB.csv") %>%
purrr::map_dfr(readr::read_csv)

Reshape

Melt data (from wide to long)
melt(DT, id.vars = "V4")
mDT <- melt(DT,
id.vars = "V4",
measure.vars = c("V1", "V2"),
variable.name = "Variable",
value.name = "Value")
reshape(DF,
varying = setdiff(names(DF), "V4"),
v.names = "value",
timevar = "variable",
times = setdiff(names(DF), "V4"),
direction = "long")
reshape(DF,
varying = setdiff(names(DF), "V4"),
v.names = "Value",
timevar = "Variable",
times = setdiff(names(DF), "V4"),
direction = "long")
tidyr::gather(TB, variable, value, -V4)
mTB <- tidyr::gather(TB,
key = Variable,
value = Value,
-V4)
Cast data (from long to wide)
dcast(mDT, V4 ~ Variable)
dcast(mDT, V4 ~ Variable, fun.aggregate = sum)
dcast(mDT, V4 ~ Value > 5)
as.data.frame.matrix(replace(table(DF$V4,
DF$Variable),
is.na(table(DF$V4,
DF$Variable)), 0))
tidyr::spread(data = count(mTB, V4, Variable),
key = Variable,
value = n,
fill = 0)
Split
split(DT, by = "V4") split(DF, DF$V4) group_split(TB, V4)
Split and transpose a vector/column
vec <- c("A:a", "B:b", "C:c")
tstrsplit(vec, split = ":", keep = 2L)
setDT(tstrsplit(vec, split = ":"))[]
do.call(rbind, strsplit(vec, " ")) |> as.data.frame() vec <- c("A:a", "B:b", "C:c")
tidyr::separate(tibble(vec), vec, c("V1", "V2"))

Other

Check package installation
List data.tables/tibbles
tables() # #
Get/Set number of threads when parallelized
getDTthreads()
setDTthreads()
# #
Lead/Lag
shift(1:10, n = 1, fill = NA, type = "lag")
shift(1:10, n = 1:2, fill = NA, type = "lag")
shift(1:10, n = 1, fill = NA, type = "lead")
c(NA, 1:9)
sapply(1:2, function(n) c(rep(NA, n), seq(1, 10)[1:(length(seq(1, 10)) - n)]))
c(1:9, NA)
lag(1:10, n = 1, default = NA)
purrr::map(1:2, ~lag(1:10, n = .x))
lead(1:10, n = 1, default = NA)
Generate run-length ids
rleid(rep(c("a", "b", "a"), each = 3))
rleid(rep(c("a", "b", "a"), each = 3), prefix = "G")
unlist(lapply(rle(rep(c("a", "b", "a"), each = 3))$lengths, seq_along))
paste0("G", unlist(lapply(rle(rep(c("a", "b", "a"), each = 3))$lengths, seq_along)))
#
Fast version of `ifelse()'
x <- c(-3:3, NA)
fifelse(test = x < 0,
yes = "neg",
no = "pos",
na = "NA")
x <- c(-3:3, NA)
result <- ifelse(is.na(x),
"NA",
ifelse(x < 0, "neg", "pos"))
result
x <- c(-3:3, NA)
if_else(condition = x < 0,
true = "neg",
false = "pos",
missing = "NA")
Vectorised 'ifelse' statements
x <- 1:10
fcase(
x %% 6 == 0, "fizz buzz",
x %% 2 == 0, "fizz",
x %% 3 == 0, "buzz",
default = NA_character_
)
x <- 1:10
result <- ifelse(x %% 6 == 0, "fizz buzz",
ifelse(x %% 2 == 0, "fizz",
ifelse(x %% 3 == 0, "buzz", as.character(x))))
result
x <- 1:10
case_when(
x %% 6 == 0 ~ "fizz buzz",
x %% 2 == 0 ~ "fizz",
x %% 3 == 0 ~ "buzz",
TRUE ~ as.character(x)
)
Rolling functions

Join

Join matching rows from y to x
y[x, on = "Id"]
merge(x, y, all.x = TRUE, by = "Id")
y[x] # requires keys
merge(x, y, by = "Id", all.x = TRUE) left_join(x, y, by = "Id")
Join matching rows from x to y
x[y, on = "Id"]
merge(x, y, all.y = TRUE, by = "Id")
x[y] # requires keys
merge(x, y, by = "Id", all.y = TRUE) right_join(x, y, by = "Id")
Join matching rows from both x and y
x[y, on = "Id", nomatch = 0]
merge(x, y)
x[y, nomatch = 0] # requires keys
merge(x, y, by = "Id") inner_join(x, y, by = "Id")
Join keeping all the rows
merge(x, y, all = TRUE, by = "Id") merge(x, y, by = "Id", all = TRUE) full_join(x, y, by = "Id")
Return rows from x matching y
unique(x[y$Id, on = "Id", nomatch = 0])
unique(x[y$Id, nomatch = 0]) # requires keys
merge(x, y, by = "Id", all.x = TRUE)[!is.na(Id), ] semi_join(x, y, by = "Id")
Return rows from x not matching y
x[!y, on = "Id"]
x[!y] # requires keys
merge(x, y, by = "Id",
all.x = TRUE, all.y = FALSE)[is.na(y$Id), ]
anti_join(x, y, by = "Id")

More joins

Select columns while joining
## i. prefix refers to cols in y
x[y, .(Id, X1, i.XY)]
# x. prefix refers to cols in x
x[y, .(Id, x.XY, i.XY)]
merge(select(x, Id, X1),
select(y, Id, XY), by = "Id",
all.x = FALSE, all.y = TRUE)
merge(select(x, Id, XY),
select(y, Id, XY), by = "Id",
all.x = FALSE, all.y = TRUE)
right_join(select(x, Id, X1),
select(y, Id, XY),
by = "Id")
right_join(select(x, Id, XY),
select(y, Id, XY),
by = "Id")
Aggregate columns while joining
y[x, .(X1Y1 = sum(Y1) * X1), by = .EACHI] merge(aggregate(Y1 ~ Id, data = y, FUN = sum),
x, by = "Id", all.x = FALSE, all.y = TRUE) %>%
transform(X1Y1 = SumY1 * X1)[, c("Id", "X1Y1")]
y %>%
group_by(Id) %>%
summarise(SumY1 = sum(Y1)) %>%
right_join(x) %>%
mutate(X1Y1 = SumY1 * X1) %>%
select(Id, X1Y1)
Update columns while joining
y[x, SqX1 := i.X1^2]
y[, SqX1 := x[.BY, X1^2, on = "Id"], by = Id]
y[, SqX1 := NULL]
merge(
transform(x[, c("Id", "X1")], SqX1 = X1^2),
y, by = "Id", all.x = FALSE, all.y = TRUE)[, c(names(y), "SqX1")]
x %>%
select(Id, X1) %>%
mutate(SqX1 = X1^2) %>%
right_join(y, by = "Id") %>%
select(names(y), SqX1)
Adds a list column with rows from y matching x (nest-join)
x[, y := .(.(y[.BY, on = "Id"])), by = Id]
x[, y := NULL]
merge(x, y, by = "Id", all = TRUE) |>
transform(Nested = split(., .$Id))
nest_join(x, y, by = "Id")
Update columns while joining (using vectors of colnames)
cols <- c("NewXY", "NewX1")
icols <- paste0("i.", c("XY", "X1"))
y[x, (cols) := mget(icols)]
y[, (cols) := NULL]
cols <- c("NewXY", "NewX1")
icols <- paste0("i.", c("XY", "X1"))
y[, cols] <- mget(icols)
y[, cols] <- NULL
# ?
Join passing columns to match in the 'on' argument
x[z, on = "X1 == Z1"]
x[z, on = .(X1 == Z1)]
x[z, on = .(Id == ID, X1 == Z1)]
merge(x, z, by.x = "X1", by.y = "Z1", all.y = TRUE)
merge(x, z, by.x = c("Id", "X1"), by.y = c("ID", "Z1"), all.y = TRUE)
right_join(x, z, by = c("X1" = "Z1"))
right_join(x, z, by = c("Id" = "ID", "X1" = "Z1"))
Non-equi joins
x[z, on = .(Id == ID, X1 <= Z1)]
x[z, on = .(Id == ID, X1 > Z1)]
x[z, on = .(X1 < Z1), allow.cartesian = TRUE]
subset(merge(x, z, by.x = "Id", by.y = "ID", all.x = TRUE), X1 <= Z1)
subset(merge(x, z, by.x = "Id", by.y = "ID", all.x = TRUE), X1 > Z1)
merge(x, z, by.x = "Id", by.y = "ID", all.x = TRUE)[X1 < Z1, ]
#
Rolling joins/subsets (performed on the last numeric column)
x[z, on = .(Id == ID, X1 == Z1), roll = "nearest"]
setkey(x, Id, X1)
x[.("C", 5:9), roll = "nearest"]
merge(x, z, by.x = c("Id", "X1"), by.y = c("ID", "Z1"), all.x = TRUE)
x[order(x$Id, x$X1)]
x[.("C", 5:9), roll = "nearest"]
#
x[.("C", 5:9), roll = Inf]
x[.("C", 5:9), roll = 0.5]
x[.("C", 5:9), roll = Inf, rollends = c(FALSE, TRUE)]
x[.("C", 5:9), roll = Inf, rollends = c(FALSE, FALSE)]
x[x$V1 == "C" & x$V2 %in% 5:9, ]
x[x$V1 == "C" & x$V2 >= 5 & x$V2 <= 9, ]
x[x$V1 == "C" & x$V2 >= 5 & x$V2 <= 9, ]
x[x$V1 == "C" & x$V2 > 5 & x$V2 < 9, ]
#
x[.("C", 5:9), roll = -Inf]
x[.("C", 5:9), roll = -0.5]
x[.("C", 5:9), roll = -Inf, rollends = c(TRUE, FALSE)]
x[.("C", 5:9), roll = -Inf, rollends = c(TRUE, TRUE)]
x[x$V1 == "C" & x$V2 <= 9 & x$V2 >= 5, ]
x[x$V1 == "C" & x$V2 >= 5 & x$V2 <= 9, ]
x[x$V1 == "C" & x$V2 > 5 & x$V2 <= 9, ]
x[x$V1 == "C" & x$V2 >= 5 & x$V2 <= 9, ]
#
Cross join ('CJ' ~ 'expand.grid'
CJ(c(2, 1, 1), 3:2)
CJ(c(2, 1, 1), 3:2, sorted = FALSE, unique = TRUE)
expand.grid(c(2, 1, 1), 3:2)
unique(expand.grid(c(2, 1, 1), 3:2))
expand.grid(c(2, 1, 1), 3:2)
#

Bind

Bind rows
rbind(x, y)
rbind(x, z, fill = TRUE)
rbind(x, y)
rbind(x, z)
bind_rows(x, y)
bind_rows(x, z)
Bind rows using a list
rbindlist(list(x, y), idcol = TRUE) rbind(cbind(x, id = 1), cbind(y, id = 2)) bind_rows(list(x, y), .id = "id")
Bind columns
base::cbind(x, y) cbind(x, y) bind_cols(x, y)

Set operations

Intersection
fsetdiff(x, y)
fsetdiff(x, y, all = TRUE)
setdiff(x, y) dplyr::setdiff(x, y)
Difference
funion(x, y)
funion(x, y, all = TRUE)
union(x, y)
c(x, y)
dplyr::union(x, y)
union_all(x, y)
Union
fsetequal(x, x[order(-V1),])
all.equal(x, x)
identical(x, x)
all.equal(x, x)
setequal(x, x[order(-V1),])
all_equal(x, x)
Equality