Yet Another Blog in Statistical Computing

I can calculate the motion of heavenly bodies but not the madness of people. -Isaac Newton

Fastest Way to Add New Variables to A Large Data.Frame

pkgs <- list("hflights", "doParallel", "foreach", "dplyr", "rbenchmark", "data.table")
lapply(pkgs, require, character.only = T)

data(hflights)

benchmark(replications = 10, order = "user.self", relative = "user.self",
  transform = {
    ### THE GENERIC FUNCTION MODIFYING THE DATA.FRAME, SIMILAR TO DATA.FRAME() ###
    transform(hflights, wday = ifelse(DayOfWeek %in% c(6, 7), 'weekend', 'weekday'), delay = ArrDelay + DepDelay)
  },
  within    = {
    ### EVALUATE THE EXPRESSION WITHIN THE LOCAL ENVIRONMENT ###
    within(hflights, {wday = ifelse(DayOfWeek %in% c(6, 7), 'weekend', 'weekday'); delay = ArrDelay + DepDelay})
  },
  mutate   = {
    ### THE SPECIFIC FUNCTION IN DPLYR PACKAGE TO ADD VARIABLES ###
    mutate(hflights, wday = ifelse(DayOfWeek %in% c(6, 7), 'weekend', 'weekday'), delay = ArrDelay + DepDelay)
  },
  foreach = {
    ### SPLIT AND THEN COMBINE IN PARALLEL ###
    registerDoParallel(cores = 2)
    v <- c(names(hflights), 'wday', 'delay')
    f <- expression(ifelse(hflights$DayOfWeek %in% c(6, 7), 'weekend', 'weekday'),
                    hflights$ArrDelay + hflights$DepDelay)
    df <- foreach(fn = iter(f), .combine = mutate, .init = hflights) %dopar% {
      eval(fn)
    }
    names(df) <- v
  },
  data.table = {
    ### DATA.TABLE ###
    data.table(hflights)[, c("wday", "delay") := list(ifelse(hflights$DayOfWeek %in% c(6, 7), 'weekend', 'weekday'), hflights$ArrDelay + hflights$DepDelay)]
  }
)

#         test replications elapsed relative user.self sys.self user.child
# 4    foreach           10   1.442    1.000     0.240    0.144      0.848
# 2     within           10   0.667    2.783     0.668    0.000      0.000
# 3     mutate           10   0.679    2.833     0.680    0.000      0.000
# 5 data.table           10   0.955    3.983     0.956    0.000      0.000
# 1  transform           10   1.732    7.200     1.728    0.000      0.000
Advertisements

Written by statcompute

October 31, 2016 at 12:14 am

Posted in Big Data, S+/R

Tagged with ,

%d bloggers like this: