## Archive for **February 2014**

## My First Learning Session in Pig Latin

-- load data A = load '/home/liuwensui/Documents/data/test_pig.txt' using PigStorage(',') as (id: chararray, x: int, y: float); dump A; /* (A,1,1.1) (A,2,2.2) (A,2,3.3) (B,1,1.1) (B,1,2.2) */ -- group data B = group A by id; dump B; /* (A,{(A,1,1.1),(A,2,2.2),(A,2,3.3)}) (B,{(B,1,1.1),(B,1,2.2)}) */ -- summarize data by group C1 = foreach B generate group as id1, MAX(A.x) as max_x; dump C1; /* (A,2) (B,1) */ C2 = foreach B generate group as id2, MIN(A.y) as min_y; dump C2; /* (A,1.1) (B,1.1) */ -- select data by criterion C3 = filter C2 by id2 == 'A'; dump C3; /* (A,1.1) */ -- inner join C4 = join C1 by id1, C3 by id2; dump C4; /* (A,2,A,1.1) */ -- full join C5 = join C1 by id1 full, C3 by id2; dump C5; /* (A,2,A,1.1) (B,1,,) */ -- union C6 = union C4, C5; dump C6; /* (A,2,A,1.1) (B,1,,) (A,2,A,1.1) */

## Efficiency of Importing Large CSV Files in R

### size of csv file: 689.4MB (7,009,728 rows * 29 columns) ### system.time(read.csv('../data/2008.csv', header = T)) # user system elapsed # 88.301 2.416 90.716 library(data.table) system.time(fread('../data/2008.csv', header = T, sep = ',')) # user system elapsed # 4.740 0.048 4.785 library(bigmemory) system.time(read.big.matrix('../data/2008.csv', header = T)) # user system elapsed # 59.544 0.764 60.308 library(ff) system.time(read.csv.ffdf(file = '../data/2008.csv', header = T)) # user system elapsed # 60.028 1.280 61.335 library(sqldf) system.time(read.csv.sql('../data/2008.csv')) # user system elapsed # 87.461 3.880 91.447

## Julia and SQLite

Similar to R and Pandas in Python, Julia provides a simple yet efficient interface with SQLite database. In addition, it is extremely handy to use sqldf() function, which is almost identical to the sqldf package in R, in SQLite package for data munging.

julia> # LOADING SQLITE PACKAGE julia> using SQLite julia> # CONNECT TO THE SQLITE DB FILE julia> db = SQLite.connect("/home/liuwensui/Documents/db/sqlitedb/csdata.db") julia> # SHOW TABLES IN THE DB julia> query("select name from sqlite_master where type = 'table'") 1x1 DataFrame |-------|-----------| | Row # | name | | 1 | tblcsdata | julia> # PULL DATA FROM THE TABLE julia> # THE DATA WOULD BE AUTOMATICALLY SAVED AS A DATAFRAME julia> df1 = query("select * from tblcsdata"); julia> head(df1, 2) 6x12 DataFrame |-------|---------|----------|-----------|---------|-----------|----------|-----|-----------|-------|-------|-------|-------| | Row # | LEV_LT3 | TAX_NDEB | COLLAT1 | SIZE1 | PROF2 | GROWTH2 | AGE | LIQ | IND2A | IND3A | IND4A | IND5A | | 1 | 0.0 | 0.530298 | 0.0791719 | 13.132 | 0.0820164 | 1.16649 | 53 | 0.385779 | 0 | 0 | 1 | 0 | | 2 | 0.0 | 0.370025 | 0.0407454 | 12.1326 | 0.0826154 | 11.092 | 54 | 0.224123 | 1 | 0 | 0 | 0 | julia> # SELECT DATA FROM THE TABLE WITH SQLDF() FUNCTION julia> df2 = sqldf("select * from df1 where AGE between 25 and 30"); julia> # SUMMARIZE DATA WITH SQLDF() FUNCTION julia> df3 = sqldf("select age, avg(LEV_LT3) as avg_lev from df2 group by age") 6x2 DataFrame |-------|-----|-----------| | Row # | AGE | avg_lev | | 1 | 25 | 0.0923202 | | 2 | 26 | 0.0915009 | | 3 | 27 | 0.0579876 | | 4 | 28 | 0.104191 | | 5 | 29 | 0.0764582 | | 6 | 30 | 0.0806471 |

## Simplex Model in R

**R CODE**

library(simplexreg) library(foreign) ### http://fmwww.bc.edu/repec/bocode/k/k401.dta ### data <- read.dta("/home/liuwensui/Documents/data/k401.dta") mdl <- simplexreg(prate ~ mrate + totemp + age + sole|mrate + totemp + age + sole, type = "hetero", link = "logit", data = data, subset = prate < 1) summary(mdl)

**R OUTPUT**

simplexreg(formula = prate ~ mrate + totemp + age + sole | mrate + totemp + age + sole, data = data, subset = prate < 1, type = "hetero", link = "logit") standard Pearson residuals: Min 1Q Median 3Q Max -6.1724 -0.5369 0.0681 0.5379 2.2987 Coefficients (mean model with logit link): Estimate Std. Error z value Pr(>|z|) (Intercept) 8.817e-01 4.036e-02 21.848 < 2e-16 *** mrate 2.710e-01 4.880e-02 5.553 2.81e-08 *** totemp -8.454e-06 1.164e-06 -7.266 3.70e-13 *** age 2.762e-02 2.702e-03 10.225 < 2e-16 *** sole 1.079e-01 4.684e-02 2.304 0.0212 * Coefficients (dispersion model with log link): Estimate Std. Error z value Pr(>|z|) (Intercept) 1.668e+00 5.395e-02 30.918 < 2e-16 *** mrate 8.775e-01 4.472e-02 19.621 < 2e-16 *** totemp 7.432e-06 1.434e-06 5.182 2.2e-07 *** age 2.816e-02 3.242e-03 8.688 < 2e-16 *** sole 7.744e-01 5.966e-02 12.981 < 2e-16 *** --- Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1 Log-likelihood: 2370, p-value: 0.4693177 Deviance: 2711 Number of Fisher Scoring iterations: 20

**SAS CODE & OUTPUT FOR COMPARISON**

proc nlmixed data = one tech = trureg maxiter = 100; parms b0 = 0 b1 = 0 b2 = 0 b3 = 0 b4 = 0 c0 = 2 c1 = 0 c2 = 0 c3 = 0 c4 = 0 ; xb = b0 + b1 * mrate + b2 * totemp + b3 * age + b4 * sole; xc = c0 + c1 * mrate + c2 * totemp + c3 * age + c4 * sole; mu_xb = 1 / (1 + exp(-xb)); s2 = exp(xc); v = (prate * (1 - prate)) ** 3; d = (prate - mu_xb) ** 2 / (prate * (1 - prate) * mu_xb ** 2 * (1 - mu_xb) ** 2); lh = (2 * constant('pi') * s2 * v) ** (-0.5) * exp(-(2 * s2) ** (-1) * d); ll = log(lh); model prate ~ general(ll); run; /* Standard Parameter Estimate Error DF t Value Pr > |t| Alpha b0 0.8817 0.03843 2711 22.94 <.0001 0.05 b1 0.2710 0.04540 2711 5.97 <.0001 0.05 b2 -8.45E-6 1.35E-6 2711 -6.26 <.0001 0.05 b3 0.02762 0.002588 2711 10.67 <.0001 0.05 b4 0.1079 0.04792 2711 2.25 0.0244 0.05 c0 1.6680 0.05490 2711 30.38 <.0001 0.05 c1 0.8775 0.07370 2711 11.91 <.0001 0.05 c2 7.431E-6 1.935E-6 2711 3.84 0.0001 0.05 c3 0.02816 0.003224 2711 8.73 <.0001 0.05 c4 0.7744 0.06194 2711 12.50 <.0001 0.05 */