Yet Another Blog in Statistical Computing

I can calculate the motion of heavenly bodies but not the madness of people. -Isaac Newton

Aggregation by Group in R

> df <- read.csv('credit_count.csv')
> 
> # METHOD 1: USING AGGREGAGE() 
> summ1 <- aggregate(df[c('INCOME', 'BAD')], df[c('SELFEMPL', 'OWNRENT')], mean)
> print(summ1)
  SELFEMPL OWNRENT   INCOME        BAD
1        0       0 2133.314 0.08470957
2        1       0 2742.247 0.06896552
3        0       1 2881.201 0.06293210
4        1       1 3487.910 0.05316973
> 
> # METHOD 2: USING BY() 
> temp2 <- by(df[c('INCOME', 'BAD')], df[c('SELFEMPL', 'OWNRENT')], colMeans)
> summ2 <- cbind(expand.grid(dimnames(temp2)), do.call(rbind, temp2))
> print(summ2)
  SELFEMPL OWNRENT   INCOME        BAD
1        0       0 2133.314 0.08470957
2        1       0 2742.247 0.06896552
3        0       1 2881.201 0.06293210
4        1       1 3487.910 0.05316973
> 
> # METHOD 3: USING SQLDF() 
> library(sqldf)
Loading required package: DBI
Loading required package: gsubfn
Loading required package: proto
Loading required namespace: tcltk
Loading Tcl/Tk interface ... done
Loading required package: chron
Loading required package: RSQLite
Loading required package: RSQLite.extfuns
> summ3 <- sqldf("select SELFEMPL, OWNRENT, avg(INCOME) as INCOME, avg(BAD) from df
+                 group by SELFEMPL, OWNRENT")
Loading required package: tcltk
> print(summ3)
  SELFEMPL OWNRENT   INCOME   avg(BAD)
1        0       0 2133.314 0.08470957
2        0       1 2881.201 0.06293210
3        1       0 2742.247 0.06896552
4        1       1 3487.910 0.05316973
> 
> # METHOD 4: USING SQL.SELECT()
> source("http://sqlselect.googlecode.com/svn/trunk/sql.select.R")
Creating a generic function for ‘as.data.frame’ from package ‘base’ in the global environment
> summ4 <- sql.select("select SELFEMPL, OWNRENT, `mean(INCOME)` as INCOME, `mean(BAD)` as BAD 
+                      from df group by SELFEMPL, OWNRENT")
> print(summ4)
  SELFEMPL OWNRENT   INCOME        BAD
1        0       0 2133.314 0.08470957
2        0       1 2881.201 0.06293210
3        1       1 3487.910 0.05316973
4        1       0 2742.247 0.06896552

Efficiency Comparison among 4 Methods above

> test1 <- function(n){
+   for (i in 1:n){
+     summ1 <- aggregate(df[c('INCOME', 'BAD')], df[c('SELFEMPL', 'OWNRENT')], mean)
+   }
+ }
> system.time(test1(10))
   user  system elapsed 
  0.404   0.036   0.513 
> 
> test2 <- function(n){
+   for (i in 1:n){
+     temp2 <- by(df[c('INCOME', 'BAD')], df[c('SELFEMPL', 'OWNRENT')], colMeans)
+     summ2 <- cbind(expand.grid(dimnames(temp2)), do.call(rbind, temp2))
+   }
+ }
> system.time(test2(10))
   user  system elapsed 
  0.244   0.020   0.309 
> 
> test3 <- function(n){
+   for (i in 1:n){
+     summ3 <- sqldf("select SELFEMPL, OWNRENT, avg(INCOME) as INCOME, avg(BAD) from df
+                     group by SELFEMPL, OWNRENT")
+   }
+ }
> system.time(test3(10))
   user  system elapsed 
  0.956   0.112   1.178 
> 
> test4 <- function(n){
+   for (i in 1:n){
+     summ4 <- sql.select("select SELFEMPL, OWNRENT, `mean(INCOME)` as INCOME, `mean(BAD)` as BAD 
+                          from df group by SELFEMPL, OWNRENT")
+   }
+ }
> system.time(test4(10))
   user  system elapsed 
  0.432   0.112   0.601 
Advertisements

Written by statcompute

December 24, 2012 at 12:22 am

Posted in S+/R

Tagged with

%d bloggers like this: