Yet Another Blog in Statistical Computing

I can calculate the motion of heavenly bodies but not the madness of people. -Isaac Newton

R Interface to Spark

SparkR

# Load SparkR from the R/lib directory under SPARK_HOME. Fail early with a
# clear message when SPARK_HOME is unset, instead of searching "/R/lib".
spark_home <- Sys.getenv("SPARK_HOME")
if (!nzchar(spark_home)) {
  stop("SPARK_HOME is not set; cannot locate the SparkR library.", call. = FALSE)
}
library(SparkR, lib.loc = file.path(spark_home, "R", "lib"))

# Start a Spark session against a local master and read the CSV file into a
# Spark DataFrame, taking column names from the header row and inferring types.
sc <- sparkR.session(master = "local")
df1 <- read.df(
  "nycflights13.csv",
  source = "csv",
  header = "true",
  inferSchema = "true"
)

### SUMMARY TABLE WITH SQL
# Expose the DataFrame under the name "tbl1" so the SQL below can refer to it.
createOrReplaceTempView(x = df1, viewName = "tbl1")

# Average departure and arrival times per month, restricted to months 1, 3, 5.
summ <- sql(
  sqlQuery = "select month, avg(dep_time) as avg_dep, avg(arr_time) as avg_arr from tbl1 where month in (1, 3, 5) group by month"
)
head(summ)
#   month  avg_dep  avg_arr
# 1     1 1347.210 1523.155
# 2     3 1359.500 1509.743
# 3     5 1351.168 1502.685

### SUMMARY TABLE WITH AGG()
# Same summary as the SQL version, expressed with the DataFrame API:
# keep months 1, 3 and 5, group by month, then average the two time columns.
selected_months <- filter(df1, "month in (1, 3, 5)")
grp <- groupBy(selected_months, "month")
summ <- agg(
  grp,
  avg_dep = avg(df1$dep_time),
  avg_arr = avg(df1$arr_time)
)
head(summ)
#   month  avg_dep  avg_arr
# 1     1 1347.210 1523.155
# 2     3 1359.500 1509.743
# 3     5 1351.168 1502.685

sparklyr

library(sparklyr)

# Connect to a local Spark master, then read the CSV file into Spark and
# register it under the table name "tbl1"; df1 is the R-side handle to it.
sc <- spark_connect(master = "local")
df1 <- spark_read_csv(
  sc,
  name = "tbl1",
  path = "nycflights13.csv",
  header = TRUE,
  infer_schema = TRUE
)

### SUMMARY TABLE WITH SQL
library(DBI)

# Send the query through the DBI interface, using the Spark connection as the
# DBI connection; "tbl1" is the table name registered by spark_read_csv().
summ <- dbGetQuery(
  conn = sc,
  statement = "select month, avg(dep_time) as avg_dep, avg(arr_time) as avg_arr from tbl1 where month in (1, 3, 5) group by month"
)
head(summ)
#   month  avg_dep  avg_arr
# 1     5 1351.168 1502.685
# 2     1 1347.210 1523.155
# 3     3 1359.500 1509.743

### SUMMARY TABLE WITH DPLYR
library(dplyr)

# Same summary as the SQL version, written with dplyr verbs on the Spark table.
# na.rm = TRUE makes explicit that missing values are excluded from the
# averages (the SQL translation drops them regardless) and avoids the
# "Missing values are always removed in SQL" warning.
summ <- df1 %>%
  filter(month %in% c(1, 3, 5)) %>%
  group_by(month) %>%
  summarize(
    avg_dep = mean(dep_time, na.rm = TRUE),
    avg_arr = mean(arr_time, na.rm = TRUE)
  )
head(summ)
#   month  avg_dep  avg_arr
#   <int>    <dbl>    <dbl>
# 1     5 1351.168 1502.685
# 2     1 1347.210 1523.155
# 3     3 1359.500 1509.743
Advertisements

Written by statcompute

June 9, 2017 at 12:30 am

Posted in Big Data, S+/R, Spark

Tagged with , , ,

Bloggers like this: