Setup data frames

Setup data frames
`library(data.table)` `DT <- data.table(V1 = rep(1:2, 5)[-10],` `V2 = 1:9,` `V3 = c(0.5, 1.0, 1.5),` `V4 = rep(LETTERS[1:3], 3))` `class(DT)` `DT`	`DF <- data.frame(` `V1 = rep(1:2, 5)[-10],` `V2 = 1:9,` `V3 = c(0.5, 1.0, 1.5),` `V4 = rep(LETTERS[1:3], 3)` `)` `class(DF)` `DF`	`library(dplyr)` `TB <- tibble(V1 = rep(1:2, 5)[-10],` `V2 = 1:9,` `V3 = rep(c(0.5, 1.0, 1.5), 3),` `V4 = rep(LETTERS[1:3], 3))` `class(TB)` `TB`

Filter rows

Filter rows
`DT[3:4,]`	`DF[3:4,]`	`TB[3:4,]`

Negative indices
`DT[!3:7,]`	`DF[-(3:7),]`	`slice(TB, -(3:7))`

Logical expression
`DT[V2 > 5]` `DT[V4 %chin% c("A", "C")]`	`DF[TB$V2 > 5, ]` `DF[TB$V4 %in% c("A", "C"), ]`	`filter(TB, V2 > 5)` `filter(TB, V4 %in% c("A", "C"))`

Multiple conditions
`DT[V1 == 1 & V4 == "A"]`	`DF[DF$V1 == 1 & DF$V4 == "A"]`	`filter(TB, V1 == 1, V4 == "A")`

Unique rows
`unique(DT)` `unique(DT, by = c("V1", "V4"))`	`DF[!duplicated(DF), ]` `DF[!duplicated(DF[c("V1", "V4")]), ]`	`distinct(TB)` `distinct(TB, across(c(V1, V4)))`

Discard rows with missing values
`na.omit(DT, cols = 1:4)`	`DF[complete.cases(DF[, 1:4]), ]`	`tidyr::drop_na(TB, names(DF))`

Random sample of rows
`DT[sample(.N, 3)] # .N = nb of rows in DT` `DT[sample(.N, .N / 2)]` `DT[frankv(-V1, ties.method = "dense") < 2]`	`DF[sample(nrow(DF), 3), ]` `DF[sample(nrow(DF), nrow(DF) / 2), ]` `DF[rank(-DF$V1, ties.method = "dense") < 2, ]`	`slice_sample(TB, n = 3)` `slice_sample(TB, prop = 0.5)` `TB %>% slice_max(V1, n = 1)`

Other
`DT[V4 %like% "^B"]` `DT[V2 %between% c(3, 5)]` `DT[between(V2, 3, 5, incbounds = FALSE)]` `DT[V2 %inrange% list(-1:1, 1:3)]`	`DF[grepl("^B", DF$V4), ]` `DF[DF$V2 >= 3 & DF$V2 <= 5, ]` `DF[DF$V2 > 3 & DF$V2 < 5, ]` `DF[DF$V2 %in% c(-1:1, 1:3), ]`	`filter(TB, grepl("^B", V4))` `filter(TB, dplyr::between(V2, 3, 5))` `filter(TB, V2 > 3 & V2 < 5)` `filter(TB, V2 >= -1:1 & V2 <= 1:3)`

Select columns

Select one column using an index(not recommended)
`DT[[3]]` `DT[, 3]`	`DF[[3]]` `DF[3]`	`TB[[3]]` `TB[3]`

Select one column using column name
`DT[, list(V2)] # data.table` `DT[, .(V2)] # data.table` `DT[, "V2"] # data.table` `DT[, V2] # vector` `DT[["V2"]] # vector`	`DF["V2"]. # data frame` `setNames(DF$V2, DFV$V4) # vector` `DF[, "V2", drop = FALSE] # data frame` `DF[["V2"]] # vector`	`select(TB, V2) # tibble` `pull(TB, V2, name = V4) # vector` `TB[, "V2"] # tibble` `TB[["V2"]] # vector`

Select several columns
`DT[, .(V2, V3, V4)]` `DT[, list(V2, V3, V4)]` `DT[, V2:V4]`	`DF[c("V2", "V3", "V4")]` `DF[, which(names(DF) %in% c("V2", "V3", "V4"))]` `DF[, 2:4]`	`select(TB, V2, V3, V4)` `select(TB, V2:V4)`

Exclude columns
`DT[, !c("V2", "V3")]`	`DF[ , !(names(DF) %in% c("V2", "V3"))]`	`select(TB, -V2, -V3)`

Select/Exclude columns using a character vector
`cols <- c("V2", "V3")` `DT[, ..cols] # .. prefix means 'one-level up'` `DT[, !..cols]`	`cols <- c("V2", "V3")` `DF[, cols]` `DF[ , !(names(DF) %in% cols)]`	`cols <- c("V2", "V3")` `select(DF, !!cols)` `select(DF, -!!cols)`

Other selections
`cols <- paste0("V", 1:2)` `cols <- union("V4", names(DT))` `cols <- grep("V", names(DT))` `cols <- grep("3$", names(DT))` `cols <- grep(".2", names(DT))` `cols <- grep("^V1\|X$", names(DT))` `cols <- grep("^(?!V2)", names(DT), perl = TRUE)` `DT[, ..cols]`	`DF[ , grep("^V[1-2]$", names(DF))]` `DF[ , c("V4", setdiff(names(DF), "V4"))]` `DF[ , grep("V", names(DF))]` `DF[ , grep("3$", names(DF))]` `DF[ , grep(".2", names(DF))]` `DF[ , c("V1", "X")]` `DF[ , !grepl("^V2", names(DF))]`	`select(TB, num_range("V", 1:2))` `select(TB, V4, everything())` `select(TB, contains("V"))` `select(TB, ends_with("3"))` `select(TB, matches(".2"))` `select(TB, one_of(c("V1", "X")))` `select(TB, -starts_with("V2"))`

Sort rows

By column
`DT[order(V3)]`	`DF[order(DF$V3)]`	`arrange(TB, V3)`

Decreasing order
`DT[order(-V3)]`	`DF[order(DF$V3, decreasing = TRUE)]`	`arrange(TB, desc(V3))`

Based on several columns
`DT[order(V1, -V2)]`	`DF[order(DF$V1, -DF$V2), ]`	`arrange(TB, V1, desc(V2))`

Aggregate (summarise)

One column
`DT[, sum(V1)] # vector` `DT[, .(sum(V1))] # data.table` `DT[, .(sumV1 = sum(V1))] # data.table`	`sum(DF$V1)` `data.frame(sumV1 = sum(DF$V1))`	`summarise(TB, sum(V1))` `summarise(TB, sumV1 = sum(V1))`

Several columns
`DT[, .(sum(V1), sd(V3))]`	`data.frame(sum_V1 = sum(DF$V1), sd_V3 = sd(DF$V3))`	`summarise(TB, sum(V1), sd(V3))`

Several columns with column names
`DT[, .(sumv1 = sum(V1),` `sdv3 = sd(V3))]`	`data.frame(sumv1 = sum(V1),` `sdv3 = sum(V3))`	`TB %>%` `summarise(sumv1 = sum(V1),` `sdv3 = sd(V3))`

Subset of rows
`DT[1:4, sum(V1)]`	`sum(DF[1:4, "V1"])`	`TB %>%` `slice(1:4) %>%` `summarise(sum(V1))`

Subset of rows
`DT[, data.table::first(V3)]` `DT[, data.table::last(V3)]` `DT[5, V3]` `DT[, uniqueN(V4)]` `uniqueN(DT)`	`DF[1, "V3"]` `DF[nrow(DF), "V3"]` `DF[5, "V3"]` `length(unique(DF$V4))` `length(unique(DF))`	`summarise(TB, dplyr::first(V3))` `summarise(TB, dplyr::last(V3))` `summarise(TB, nth(V3, 5))` `summarise(TB, n_distinct(V4))` `n_distinct(TB)`

Count number of observations for each group
`DT[, .N, by = V4]`	`as.data.frame(table(DF$V4))`	`TB %>%` `group_by(V4) %>%` `tally()`

Modify columns

Modify a column
`DT[, V1 := V1^2]` `DT`	`DF$V1 <- DF$V1^2` `DF`	`TB <- TB %>% mutate(V1 = V1^2)` `TB`

Add one column
`DT[, v5 := log(V1)]`	`DF$v5 <- log(DF$V1)` `DF`	`TB <- mutate(DF, v5 = log(V1))`

Add several columns
`DT[, c("v6", "v7") := .(sqrt(V1), "X")]` `DT[, ':='(v6 = sqrt(V1),` `v7 = "X")]`	`DF$v6 <- sqrt(DF$V1)` `DF$v7 <- "X"` `DF`	`TB <- mutate(TB, v6 = sqrt(V1), v7 = "X")`

Create one column and remove the others
`DT[, .(v8 = V3 + 1)]`	`data.frame(v8 = DF$V3 + 1)`	`transmute(TB, v8 = V3 + 1)`

Remove one column
`DT[, v5 := NULL]`	`DF <- DF[, !names(DF %in% "v5")]`	`TB <- select(TB, -v5)`

Remove several columns
`DT[, c("v6", "v7") := NULL]`	`DF <- DF[, !(names(DF) %in% c("v6", "v7"))]`	`TB <- select(TB, -v6, -v7)`

Remove columns using a vector of colnames
`cols <- c("V3")` `DT[, (cols) := NULL] # ! not DT[, cols := NULL]`	`cols <- c("V3")` `DF <- DF[, !(names(DF) %in% cols)]`	`cols <- c("V3")` `TB <- select(TB, -one_of(cols))`

Replace values for rows matching a condition
`DT[V2 < 4, V2 := 0L]` `DT`	`DF$V2 <- replace(DF$V2, DF$V2 < 4, 0L)` `DF`	`TB <- mutate(DF, V2 = base::replace(V2, V2 < 4, 0L))` `TB`

By group
`DT[, .(sumV2 = sum(V2)), by = "V4"]` `DT[, by = V4,` `.(sumV2 = sum(V2))]`	`aggregate(V2 ~ V4, data = DF, FUN = sum)`	`TB %>%` `group_by(V4) %>%` `summarise(sumV2 = sum(V2))`

By several groups
`DT[, keyby = .(V4, V1),` `.(sumV2 = sum(V2))]`	`aggregate(V2 ~ V4 + V1,` `data = DF, FUN = sum)`	`TB %>%` `group_by(V4, V1) %>%` `summarise(sumV2 = sum(V2)) %>%` `ungroup()`

Calling function in by
`DT[, by = tolower(V4),` `.(sumV1 = sum(V1))]`	`aggregate(V! ~ tolower(V4),` `data = DF, FUN = sum)`	`TB %>%` `group_by(tolower(V4)) %>%` `summarise(sumV1 = sum(V1))`

Assigning column name in by
`DT[, keyby = .(abc = tolower(V4)),` `.(sumV1 = sum(V1))]`	`aggregate(V1 ~ abc,` `data = transform(DF, abc = tolower(V4)),` `FUN = sum)`	`TB %>%` `group_by(abc = tolower(V4)) %>%` `summarise(sumV1 = sum(V1))`

Using a condition in by
`DT[, keyby = V4 == "A",` `sum(V1)]`	`aggreggate(V1 ~ groupA,` `data = transform(DF, groupA = V4 == "A"),` `FUN = sum)`	`TB %>%` `group_by(V4 == "A") %>%` `summarise(sum(V1))`

By on a subset of rows
`DT[1:5, # i` `.(sumV1 = sum(V1)), # j` `by = V4] # by`	`aggregate(V1 ~ V4,` `data = DF[1:5,], sum)`	`TB %>%` `slice(1:5) %>%` `group_by(V4) %>%` `summarise(sumV1 = sum(V1))`

Add a column with number of observations for each group
`DT[, n := .N, by = V1][]` `DT[, n := NULL]`	`DF$tally <-` `ave(DF$V1,` `DF$V1,` `FUN = length)`	`TB %>%` `group_by(V1) %>%` `add_tally()`

Retrieve the first/last/nth observation for each group
`DT[, data.table::first(V2), by = V4]` `DT[, data.table::last(V2), by = V4]` `DT[, V2[2], by = V4]`	`aggregate(V2 ~ V4,` `data = DF,` `FUN = function(X) x[1])` `aggregate(V2 ~ V4,` `data = DF,` `FUN = function(x) x[length(x)])` `aggregate(V2 ~ V4,` `data = DF,` `FUN = function(x) x[2])`	`TB %>%` `group_by(V4) %>%` `summarise(dplyr::first(V2))` `TB %>%` `group_by(V4) %>%` `summarise(dplyr::last(V2))` `TB %>%` `group_by(V4) %>%` `summarise(dplyr::nth(V2, 2))`

Going further

Advanced columns manipulation

Summarise all the columns
`DT[, lapply(.SD, max)]`	`apply(DF, 2, max)`	`TB %>% summarise(across(everything(), max))`

Summarise several columns
`DT[, lapply(.SD, mean),` `.SDcols = c("V1", "V2")]`	`data.frame(mean_V1 = mean(DF$V1), mean_V2 = mean(DF$V2))`	`TB %>% summarise(across(c(V1, V2), mean))`

Summarise several columns by group
`DT[, by = V4,` `lapply(.SD, mean),` `.SDcols = c("V1", "V2")]` `DT[, by = V4,` `lapply(.SD, mean),` `.SDcols = patterns("V1\|V2\|Z0")]`	`cols <- intersect(C("V1", "V2", "Z0"), names(DF))` `aggregate(DF[cols], by = LIST(DF$V4), FUN = mean, na.rm = TRUE)`

Summarise with more than one function by group
`DT[, by = V4,` `c(lapply(.SD, sum),` `lapply(.SD, mean))]`	`aggregate(cbind(sumV1 = DF$V1, meanV1 = DF$V1,` `sumV2 = DF$V2, meanV2 = DF$V2) ~ V4,` `data = DF,` `FUN = function(x) c(sum = sum(x), mean = mean(x)))`	`TB %>%` `group_by(V4) %>%` `summarise(across(everything(),` `list(sum = sum, mean = mean)))`

Summarise using a condition
`DT[, lapply(.SD, mean),` `.SDcols = is.numeric]` `foo <- function(x) {is.numeric(x) && mean(x) > 3}` `DT[, lapply(.SD, mean),` `.SDcols = foo]`	`sapply(DF[sapplu(DF, is.numeric)],` `mean)` `sapply(DF[sapply(DF, function(x) is.numeric(x) && mean(x) > 3)], mean)`	`TB %>%` `summarise(across(where(is.numeric),` `mean))` `TB %>% summarise(across(where(~ is.numeric(.x) && mean(.x) > 3),` `mean))`

Modify all the columns
`DT[, lapply(.SD, rev)]`	`data.frame(lapply(DF, rev))`	`TB %>% mutate(across(everything(), rev))`

Modify several columns (dropping the others)
`DT[, lapply(.SD, sqrt),` `.SDcols = V1:V2]` `DT[, lapply(.SD, exp),` `.SDcols = !"V4"]`	`DF[c("V1", "V2")] <- lapply(DF[c("V1", "V2")], sqrt)` `DF[setdiff(names(DF), "V4")] <- lapply(DF[setdiff(names(DF), "V4")], exp)`

Modify several columns (keeping the others)
`DT[, c("V1", "V2") := lapply(.SD, sqrt),` `.SDcols = c("V1", "V2")]` `cols <- setdiff(names(DT), "V4")` `DT[, (cols) := lapply(.SD, "^", 2L),` `.SDcols = cols]` `DT`	`DF[c("V1", "V2")] <- lapply(DF[c("V1", "V2")], sqrt)` `cols <- setdiff(names(DF), "V4")` `DF[cols] <- lapply(DF[cols], "^", 2L)`	`TB <-TB %>%` `mutate(across(all_of(c("V1", "V2")), sqrt))` `TB <- TB %>%` `mutate(across(-any_of("V4"),` `~ "^"(.x, 2L)))` `TB`

Modify columns using a condition (dropping the others)
`DT[, .SD - 1,` `.SDcols = is.numeric]` `DT`	`data.frame(lapply(DF,` `function(x) if (is.numeric(x)) x - 1 else x))` `DF`	`TB %>%` `transmute(across(where(is.numeric),` `~ '-'(., 1L)))` `TB`

Modify columns using a condition (keeping the others)
`DT[, (cols) := lapply(.SD, as.integer),` `.SDcols = is.numeric]`	`DF[sapply(DF, is.numeric)] <-` `lapply(DF[sapply(DF, is.numeric)],` `as.integer)` `DF`	`TB <- TB %>%` `mutate(across(where(is.numeric),` `as.integer))`

Use a complex expression
`DT[, by = V4,` `.(V1[1:2], "X")]`	`DF <- do.call(rbind,` `by(DF, DF$V4, function(sub) {` `head(data.frame(V1 = sub$V1,` `V2 = "X"), 2)` `}))`	`TB %>%` `group_by(V4) %>%` `slice(1:2) %>%` `transmute(V1 = V1,` `V2 = "X")`

Use multiple expressions (with DT[,{j}])
`DT[, {print(V1)` `print(summary(V1))` `x <- V1 + sum(V2)` `.(A = 1:.N, B = x)` `}]`	`print(DF$V1)` `print(summary(DF$V1))` `x <- DF$V1 + sum(DF$V2)` `data.frame(A = seq_along(DF$V1), B = x)`	`#`

Chain expressions

Expression chaining using DT[][] (recommended)
`DT[, by = V4,` `.(V1sum = sum(V1)) ][` `V1sum > 5]`	`DF <- subset(aggregate(V1 ~ V4,` `data = DF,` `FUN = sum), V1 > 5)`	`TB %>%` `group_by(V4) %>%` `summarise(V1sum = sum(V1)) %>%` `filter(V1sum > 5)`

Expression chaining using %>%
`DT[, by = V4,` `.(V1sum = sum(V1))] %>%` `.[order(-V1sum)]`	`subset(aggregate(V1 ~ V4,` `data = DF,` `FUN = sum),` `V1 > 5)`	`TB %>%` `group_by(V4) %>%` `summarise(V1sum = sum(V1)) %>%` `arrange(desc(V1sum))`

Indexing and Keys

Set the key/index
`setkey(DT, V4)` `setindex(DT, V4)` `DT`	`DF[order(DF$V4), ]`	`TB <- arrange(TB, V4)` `TB`

Select the matching rows
`DT["A", on = "V4"]` `DT[c("A", "C"), on = .(V4)]`	`DF[DF$V4 == "A", ]` `DF[DF$V4 %in% c("A", "C"), ]`	`filter(TB, V4 == "A")` `filter(TB, V4 %in% c("A", "C"))`

Select the first matching row
`DT["B", on = "V4", mult = "first"]` `DT[c("B", "C"), on = "V4", mult = "first"]`	`DF[DF$V4 == "B", ][1, ]`	`TB %>%` `filter(V4 == "B") %>%` `slice(1)` `# ?`

Select the last matching row
`DT["A", on = "V4", mult = "last"]`	`DF[DF$V4 == "A", ][nrow(DF[DF$V4 == "A", ]), ]`	`TB %>%` `filter(V4 == "A") %>%` `slice(n())`

Nomatch argument
`DT[c("A", "D"), on = "V4", nomatch = NA]` `DT[c("A", "D"), on = "V4", nomatch = 0]`	`DF[DF$V4 %in% c("A", "D"), ]`	`filter(TB, V4 %in% c("A", "D"))`

Apply a function on the matching rows
`DT[c("A", "C"), sum(V1), on = "V4"]`	`sum(DF$V1[DF$V4 %in% c("A", "C")])`	`TB %>%` `filter(V4 %in% c("A", "C")) %>%` `summarise(sum(V1))`

Modify values for matching rows
`DT["A", V1 := 0, on = "V4"]` `DT`	`DF$V1[DF$V4 == "A"] <- 0`	`TB <- TB %>%` `mutate(V1 = base::replace(V1, V4 == "A", 0L)) %>%` `arrange(V4)` `TB`

Use keys in by
`DT[!"B", sum(V1), on = "V4", by = .EACHI]` `DT[V4 != "B",` `by = V4,` `sum(V1)]`	`aggregate(V1 ~ V4,` `data = DF[DF$V4 != "B", ],` `FUN = sum)`	`TB %>%` `filter(V4 != "B") %>%` `group_by(V4) %>%` `summarise(sum(V1))`

Set keys/indices for multiple columns
`setkey(DT, V4, V1)` `setindex(DT, V4, V1)`	`DF[order(DF$V4, DF$V1)]`	`TB <- arrange(DF, V4, V1)`

Subset using multiple keys/indices
`DT[.("C", 1), on = .(V4, V1)]` `DT[.(c("B", "C"), 1), on = .(V4, V1)]` `DT[.(c("B", "C"), 1), on = .(V4, V1), which = TRUE]`	`DF[DF$V4 == "C" & DT$V1 == 1, ]` `DF[DF$V4 %in% c("B", "C") & DT$V1 == 1, ]` `which(DF$V4 %in% c("B", "C") & DT$V1 == 1)`	`filter(TB, V1 == 1, V4 == "C")` `filter(TB, V1 == 1, V4 %in% c("B", "C"))`

Remove keys/indices
`setkey(DT, NULL)` `setindex(DT, NULL)`	`DF`	`TB`

’set*()’ modifications

Replace values
`set(DT, i = 1L, j = 2L, value = 3L)` `DT`	`DF[1, 2] <- 3` `DF`	`TB[1, 2] <- 3L` `TB`

Reorder rows
`setorder(DT, V4, -V1)` `setorderv(DT, c("V4", "V1"), c(1, -1))`	`DF <- DF[order(TB$V4, -DF$V1), ]` `DF`	`TB <- arrange(TB, V4, desc(V1))`

Modify colnames
`setnames(DT, old = "V2", new = "v2")` `setnames(DT, old = -(c(1, 3)), new = "V2")` `DT`	`names(DF)[names(DF) == "V2"] <- "v2"` `names(DF)[names(DF) == "v2"] <- "V2"` `DF`	`TB <- rename(DF, v2 = V2)` `TB <- rename(DF, V2 = v2)` `TB`

Reorder columns
`setcolorder(DT, c("V4", "V1", "V2"))` `DT`	`DF <- DF[, c("V4", "V1", "V2")]` `DF`	`TB <- select(TB, V4, V1, V2)` `TB`

Convert data
	`DF <- DF[, c("V1", "V2", "V4")]` `DF`	`TB %>%` `relocate(V4, .after = V2)`

Relocate columns

Advanced use of by

Select first/last/.. row by group
`DT[, .SD[1], by = V4]` `DT[, .SD[c(1, .N)], by = V4]` `DT[, tail(.SD, 2), by = V4]`	`DF[ave(DF$V4,` `DF$V4,` `FUN = seq_along) == 1, ]` `DF[ave(DF$V4,` `DF$V4,` `FUN = seq_along) %in% c(1, max(ave(DF$V4, DF$V4, FUN = seq_along)))]` `do.call(rbind, by(DF, DF$V4, function(x) tail(x, 2)))`	`TB %>%` `group_by(V4) %>%` `slice(1)` `TB %>%` `group_by(V4) %>%` `slice(1, n())` `TB %>%` `group_by(V4) %>%` `group_map(~ tail(.x, 2))`

Select rows using a nested query
`DT[, .SD[which.min(V2)], by = V4]`	`DF[order(DF$V2), ][` `ave(DF$V4, DF$V4, FUN = seq_along) == 1,]`	`TB %>%` `group_by(V4) %>%` `arrange(V2) %>%` `slice(1)`

Add a group counter column
`DT[, Grp := .GRP, by = .(V4, V1)][]` `DT[, Grp := NULL]`	`ave(seq_along(DF$V4), DF$V4, DF$V1, FUN = function(X) seq_along(x))`	`TB %>%` `group_by(V4, V1) %>%` `mutate(Grp = cur_group_id())`

Get row number of first (and last) observation by group
`DT[, .I, by = V4]` `DT[, .I[1], by = V4]` `DT[, .I[c(1, .N)], by = V4]`	`sapply(split(DT, DT$V4), function(group) rownames(groupe))` `sapply(split(DT, DT$V4), function(group) rownames(group)[1])` `sapply(split(DT, DT$V4), function(group) c(rownames(group)[1],` `rownames(group)[nrow(group)]))`	`TB %>%` `group_by(V4) %>%` `mutate(cur_group_rows()) %>%` `ungroup()` `TB %>%` `group_by(V4) %>%` `summarize(cur_group_rows()[1])` `TB %>%` `group_by(V4) %>%` `summarize(cur_group_rows()[c(1, n())])`

Handle list-columns by group
`DT[, .(.(V1)), by = V4] # V1 as list` `DT[, .(.(.SD)), by = V4] # subsets`	`tapply(DF$V1, DF$V4,` `function(x) list(x))` `split(DF, DF$V4)`	`TB %>%` `group_by(V4) %>%` `summarise(list(V1))` `TB %>%` `group_by(V4) %>%` `group_nest()`

Grouping sets (multiple by at once)
`rollup(DT,` `.(SumV2 = sum(V2)),` `by = c("V1", "V4"))` `rollup(DT,` `.(SumV2 = sum(V2), .N),` `by = c("V1", "V4"),` `id = TRUE)` `cube(DT,` `.(SumV2 = sum(V2), .N),` `by = c("V1", "V4"),` `id = TRUE)` `groupingsets(DT,` `.(SumV2 = sum(V2), .N),` `by = c("V1", "V4"),` `sets = list("V1", c("V1", "V4")),` `id = TRUE)`	`aggregate(V2 ~ V1 + V4,` `data = DF,` `FUN = sum)` `aggregate(cbind(V2, N = 1) ~ V1 + V4,` `data = DF,` `FUN = function(X) c(sum(x),` `length(x)))` `aggregate(cbind(V2, N = 1) ~ V1 + V4,` `data = DF,` `FUN = function(c) c(sum(x),` `length(x)),` `na.action = NULL)` `aggregate(V2 ~ V1, data = DF, FUN = sum)` `aggregate(V2 ~ V1 + V4, data = DF, FUN = sum)`

Read and write

Write data to a csv file
`fwrite(DT, "DT.csv")`	`write.csv(DF, "DF.csv", row.names = FALSE)`	`readr::write_csv(TB, "TB.csv")`

Write data to a tab-delimited file
`fwrite(DT, "DT.txt", sep = "\t")`	`write.table(DF, "DF.txt", sep = "\t", row.names = FALSE, col.names = TRUE)`	`readr::write_delim(TB, "TB.txt", delim = "\t")`

Write list-column data to a csv file
`fwrite(setDT(list(0, list(1:5))), "DT2.csv")`	`DT2 <- data.frame(V1 = 0, V2 = list(1:5))` `write.csv(DT2, "DT2.csv", row.names = FALSE)`

Read a csv / tab-delimited filed
`fread("DT.csv")` `fread("DT.txt", sep = "\t")`	`read.csv("DF.csv")` `read.table("DF.txt", sep = "\t", header = TRUE)`	`readr::read_csv("TB.csv")` `readr::read_delim("TB.txt", delim = "\t")`

Read a csv file selecting / dropping columns
`fread("DT.csv", select = c("V1", "V4"))` `fread("DT.csv", drop = "V4")`	`read.csv("DF.csv")[, c("V1", "V4")]` `read.csv("DF.csv")[, !(names(read.csv("DF.csv")) %in% "V4")]`

Read and rbind several files
`rbindlist(lapply(c("DT.csv", "DT.csv"), fread))`	`do.call(rbind, lapply(c("DF.csv", "DF.csv"), read.csv))`	`c("TB.csv", "TB.csv") %>%` `purrr::map_dfr(readr::read_csv)`

Reshape

Melt data (from wide to long)
`melt(DT, id.vars = "V4")` `mDT <- melt(DT,` `id.vars = "V4",` `measure.vars = c("V1", "V2"),` `variable.name = "Variable",` `value.name = "Value")`	`reshape(DF,` `varying = setdiff(names(DF), "V4"),` `v.names = "value",` `timevar = "variable",` `times = setdiff(names(DF), "V4"),` `direction = "long")` `reshape(DF,` `varying = setdiff(names(DF), "V4"),` `v.names = "Value",` `timevar = "Variable",` `times = setdiff(names(DF), "V4"),` `direction = "long")`	`tidyr::gather(TB, variable, value, -V4)` `mTB <- tidyr::gather(TB,` `key = Variable,` `value = Value,` `-V4)`

Cast data (from long to wide)
`dcast(mDT, V4 ~ Variable)` `dcast(mDT, V4 ~ Variable, fun.aggregate = sum)` `dcast(mDT, V4 ~ Value > 5)`	`as.data.frame.matrix(replace(table(DF$V4,` `DF$Variable),` `is.na(table(DF$V4,` `DF$Variable)), 0))`	`tidyr::spread(data = count(mTB, V4, Variable),` `key = Variable,` `value = n,` `fill = 0)`

Split
`split(DT, by = "V4")`	`split(DF, DF$V4)`	`group_split(TB, V4)`

Split and transpose a vector/column
`vec <- c("A:a", "B:b", "C:c")` `tstrsplit(vec, split = ":", keep = 2L)` `setDT(tstrsplit(vec, split = ":"))[]`	`do.call(rbind, strsplit(vec, " ")) \|> as.data.frame()`	`vec <- c("A:a", "B:b", "C:c")` `tidyr::separate(tibble(vec), vec, c("V1", "V2"))`

Other

Check package installation

List data.tables/tibbles
`tables()`	`#`	`#`

Get/Set number of threads when parallelized
`getDTthreads()` `setDTthreads()`	`#`	`#`

Lead/Lag
`shift(1:10, n = 1, fill = NA, type = "lag")` `shift(1:10, n = 1:2, fill = NA, type = "lag")` `shift(1:10, n = 1, fill = NA, type = "lead")`	`c(NA, 1:9)` `sapply(1:2, function(n) c(rep(NA, n), seq(1, 10)[1:(length(seq(1, 10)) - n)]))` `c(1:9, NA)`	`lag(1:10, n = 1, default = NA)` `purrr::map(1:2, ~lag(1:10, n = .x))` `lead(1:10, n = 1, default = NA)`

Generate run-length ids
`rleid(rep(c("a", "b", "a"), each = 3))` `rleid(rep(c("a", "b", "a"), each = 3), prefix = "G")`	`unlist(lapply(rle(rep(c("a", "b", "a"), each = 3))$lengths, seq_along))` `paste0("G", unlist(lapply(rle(rep(c("a", "b", "a"), each = 3))$lengths, seq_along)))`	`#`

Fast version of `ifelse()'
`x <- c(-3:3, NA)` `fifelse(test = x < 0,` `yes = "neg",` `no = "pos",` `na = "NA")`	`x <- c(-3:3, NA)` `result <- ifelse(is.na(x),` `"NA",` `ifelse(x < 0, "neg", "pos"))` `result`	`x <- c(-3:3, NA)` `if_else(condition = x < 0,` `true = "neg",` `false = "pos",` `missing = "NA")`

Vectorised 'ifelse' statements
`x <- 1:10` `fcase(` `x %% 6 == 0, "fizz buzz",` `x %% 2 == 0, "fizz",` `x %% 3 == 0, "buzz",` `default = NA_character_` `)`	`x <- 1:10` `result <- ifelse(x %% 6 == 0, "fizz buzz",` `ifelse(x %% 2 == 0, "fizz",` `ifelse(x %% 3 == 0, "buzz", as.character(x))))` `result`	`x <- 1:10` `case_when(` `x %% 6 == 0 ~ "fizz buzz",` `x %% 2 == 0 ~ "fizz",` `x %% 3 == 0 ~ "buzz",` `TRUE ~ as.character(x)` `)`

Rolling functions

Join

Join matching rows from y to x
`y[x, on = "Id"]` `merge(x, y, all.x = TRUE, by = "Id")` `y[x] # requires keys`	`merge(x, y, by = "Id", all.x = TRUE)`	`left_join(x, y, by = "Id")`

Join matching rows from x to y
`x[y, on = "Id"]` `merge(x, y, all.y = TRUE, by = "Id")` `x[y] # requires keys`	`merge(x, y, by = "Id", all.y = TRUE)`	`right_join(x, y, by = "Id")`

Join matching rows from both x and y
`x[y, on = "Id", nomatch = 0]` `merge(x, y)` `x[y, nomatch = 0] # requires keys`	`merge(x, y, by = "Id")`	`inner_join(x, y, by = "Id")`

Join keeping all the rows
`merge(x, y, all = TRUE, by = "Id")`	`merge(x, y, by = "Id", all = TRUE)`	`full_join(x, y, by = "Id")`

Return rows from x matching y
`unique(x[y$Id, on = "Id", nomatch = 0])` `unique(x[y$Id, nomatch = 0]) # requires keys`	`merge(x, y, by = "Id", all.x = TRUE)[!is.na(Id), ]`	`semi_join(x, y, by = "Id")`

Return rows from x not matching y
`x[!y, on = "Id"]` `x[!y] # requires keys`	`merge(x, y, by = "Id",` `all.x = TRUE, all.y = FALSE)[is.na(y$Id), ]`	`anti_join(x, y, by = "Id")`

More joins

Select columns while joining
`## i. prefix refers to cols in y` `x[y, .(Id, X1, i.XY)]` `# x. prefix refers to cols in x` `x[y, .(Id, x.XY, i.XY)]`	`merge(select(x, Id, X1),` `select(y, Id, XY), by = "Id",` `all.x = FALSE, all.y = TRUE)` `merge(select(x, Id, XY),` `select(y, Id, XY), by = "Id",` `all.x = FALSE, all.y = TRUE)`	`right_join(select(x, Id, X1),` `select(y, Id, XY),` `by = "Id")` `right_join(select(x, Id, XY),` `select(y, Id, XY),` `by = "Id")`

Aggregate columns while joining
`y[x, .(X1Y1 = sum(Y1) * X1), by = .EACHI]`	`merge(aggregate(Y1 ~ Id, data = y, FUN = sum),` `x, by = "Id", all.x = FALSE, all.y = TRUE) %>%` `transform(X1Y1 = SumY1 * X1)[, c("Id", "X1Y1")]`	`y %>%` `group_by(Id) %>%` `summarise(SumY1 = sum(Y1)) %>%` `right_join(x) %>%` `mutate(X1Y1 = SumY1 * X1) %>%` `select(Id, X1Y1)`

Update columns while joining
`y[x, SqX1 := i.X1^2]` `y[, SqX1 := x[.BY, X1^2, on = "Id"], by = Id]` `y[, SqX1 := NULL]`	`merge(` `transform(x[, c("Id", "X1")], SqX1 = X1^2),` `y, by = "Id", all.x = FALSE, all.y = TRUE)[, c(names(y), "SqX1")]`	`x %>%` `select(Id, X1) %>%` `mutate(SqX1 = X1^2) %>%` `right_join(y, by = "Id") %>%` `select(names(y), SqX1)`

Adds a list column with rows from y matching x (nest-join)
`x[, y := .(.(y[.BY, on = "Id"])), by = Id]` `x[, y := NULL]`	`merge(x, y, by = "Id", all = TRUE) \|>` `transform(Nested = split(., .$Id))`	`nest_join(x, y, by = "Id")`

Update columns while joining (using vectors of colnames)
`cols <- c("NewXY", "NewX1")` `icols <- paste0("i.", c("XY", "X1"))` `y[x, (cols) := mget(icols)]` `y[, (cols) := NULL]`	`cols <- c("NewXY", "NewX1")` `icols <- paste0("i.", c("XY", "X1"))` `y[, cols] <- mget(icols)` `y[, cols] <- NULL`	`# ?`

Join passing columns to match in the 'on' argument
`x[z, on = "X1 == Z1"]` `x[z, on = .(X1 == Z1)]` `x[z, on = .(Id == ID, X1 == Z1)]`	`merge(x, z, by.x = "X1", by.y = "Z1", all.y = TRUE)` `merge(x, z, by.x = c("Id", "X1"), by.y = c("ID", "Z1"), all.y = TRUE)`	`right_join(x, z, by = c("X1" = "Z1"))` `right_join(x, z, by = c("Id" = "ID", "X1" = "Z1"))`

Non-equi joins
`x[z, on = .(Id == ID, X1 <= Z1)]` `x[z, on = .(Id == ID, X1 > Z1)]` `x[z, on = .(X1 < Z1), allow.cartesian = TRUE]`	`subset(merge(x, z, by.x = "Id", by.y = "ID", all.x = TRUE), X1 <= Z1)` `subset(merge(x, z, by.x = "Id", by.y = "ID", all.x = TRUE), X1 > Z1)` `merge(x, z, by.x = "Id", by.y = "ID", all.x = TRUE)[X1 < Z1, ]`	`#`

Rolling joins/subsets (performed on the last numeric column)
`x[z, on = .(Id == ID, X1 == Z1), roll = "nearest"]` `setkey(x, Id, X1)` `x[.("C", 5:9), roll = "nearest"]`	`merge(x, z, by.x = c("Id", "X1"), by.y = c("ID", "Z1"), all.x = TRUE)` `x[order(x$Id, x$X1)]` `x[.("C", 5:9), roll = "nearest"]`	`#`


`x[.("C", 5:9), roll = Inf]` `x[.("C", 5:9), roll = 0.5]` `x[.("C", 5:9), roll = Inf, rollends = c(FALSE, TRUE)]` `x[.("C", 5:9), roll = Inf, rollends = c(FALSE, FALSE)]`	`x[x$V1 == "C" & x$V2 %in% 5:9, ]` `x[x$V1 == "C" & x$V2 >= 5 & x$V2 <= 9, ]` `x[x$V1 == "C" & x$V2 >= 5 & x$V2 <= 9, ]` `x[x$V1 == "C" & x$V2 > 5 & x$V2 < 9, ]`	`#`


`x[.("C", 5:9), roll = -Inf]` `x[.("C", 5:9), roll = -0.5]` `x[.("C", 5:9), roll = -Inf, rollends = c(TRUE, FALSE)]` `x[.("C", 5:9), roll = -Inf, rollends = c(TRUE, TRUE)]`	`x[x$V1 == "C" & x$V2 <= 9 & x$V2 >= 5, ]` `x[x$V1 == "C" & x$V2 >= 5 & x$V2 <= 9, ]` `x[x$V1 == "C" & x$V2 > 5 & x$V2 <= 9, ]` `x[x$V1 == "C" & x$V2 >= 5 & x$V2 <= 9, ]`	`#`

Cross join ('CJ' ~ 'expand.grid'
`CJ(c(2, 1, 1), 3:2)` `CJ(c(2, 1, 1), 3:2, sorted = FALSE, unique = TRUE)`	`expand.grid(c(2, 1, 1), 3:2)` `unique(expand.grid(c(2, 1, 1), 3:2))`	`expand.grid(c(2, 1, 1), 3:2)` `#`

Bind

Bind rows
`rbind(x, y)` `rbind(x, z, fill = TRUE)`	`rbind(x, y)` `rbind(x, z)`	`bind_rows(x, y)` `bind_rows(x, z)`

Bind rows using a list
`rbindlist(list(x, y), idcol = TRUE)`	`rbind(cbind(x, id = 1), cbind(y, id = 2))`	`bind_rows(list(x, y), .id = "id")`

Bind columns
`base::cbind(x, y)`	`cbind(x, y)`	`bind_cols(x, y)`

Set operations

Intersection
`fsetdiff(x, y)` `fsetdiff(x, y, all = TRUE)`	`setdiff(x, y)`	`dplyr::setdiff(x, y)`

Difference
`funion(x, y)` `funion(x, y, all = TRUE)`	`union(x, y)` `c(x, y)`	`dplyr::union(x, y)` `union_all(x, y)`

Union
`fsetequal(x, x[order(-V1),])` `all.equal(x, x)`	`identical(x, x)` `all.equal(x, x)`	`setequal(x, x[order(-V1),])` `all_equal(x, x)`

Equality