Yet Another Blog in Statistical Computing

I can calculate the motion of heavenly bodies but not the madness of people. -Isaac Newton

Archive for March 2016

Python Prototype of Grid Search for SVM Parameters

"""Grid search over SVM (C, gamma) pairs, scored by pooled 3-fold CV AUC.

Fixes vs. original: `sklearn.cross_validation` was removed in sklearn 0.20
(use `sklearn.model_selection`), `DataFrame.append` was removed in pandas 2.0
(use `concat`), and `read_table` is deprecated in favor of `read_csv`.
"""
from itertools import product

from pandas import DataFrame, concat, read_csv
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import KFold
from sklearn.svm import SVC

df = read_csv('credit_count.txt')
# Restrict the sample to cardholders only (CARDHLDR == 1).
Y = df[df.CARDHLDR == 1].DEFAULT
X = df[df.CARDHLDR == 1][['AGE', 'ADEPCNT', 'MAJORDRG', 'MINORDRG', 'INCOME', 'OWNRENT', 'SELFEMPL']]

# Cartesian product of candidate costs and gammas.
parms = list(product([1, 10], [0.01, 0.001]))
# Materialize the fold indices once so every (C, gamma) pair is evaluated
# on identical splits; fixed random_state keeps the search reproducible.
folds = list(KFold(n_splits=3, shuffle=True, random_state=0).split(X))

rows = []
for cost, gamma in parms:
    mdl = SVC(C=cost, gamma=gamma, probability=True, random_state=0)
    # Collect out-of-fold predicted probabilities across all folds,
    # then compute one pooled AUC per parameter pair.
    oof = []
    for train_idx, test_idx in folds:
        mdl.fit(X.iloc[train_idx], Y.iloc[train_idx])
        pred = mdl.predict_proba(X.iloc[test_idx])[:, 1]
        oof.append(DataFrame({'pred': pred, 'y': Y.iloc[test_idx]}))
    result = concat(oof, ignore_index=True)
    rows.append(DataFrame({'Cost': [cost], 'Gamma': [gamma],
                           'AUC': [roc_auc_score(result.y, result.pred)]}))

# One row per (Cost, Gamma) pair — build once instead of append-in-loop.
final = concat(rows, ignore_index=True)

Written by statcompute

March 27, 2016 at 2:40 pm

Improve SVM Tuning through Parallelism

As pointed out in Chapter 10 of “The Elements of Statistical Learning”, ANN and SVM (support vector machines) share similar pros and cons, e.g. lack of interpretability and good predictive power. However, in contrast to ANN, which often suffers from local-minimum solutions, SVM is always able to converge to a global optimum. In addition, SVM is less prone to over-fitting given a good choice of free parameters, which usually can be identified through cross-validation.

In the R package “e1071”, the tune() function can be used to search for SVM parameters but is extremely inefficient due to its sequential rather than parallel execution. In the code snippet below, a parallelism-based algorithm performs the grid search for SVM parameters through K-fold cross-validation.

pkgs <- c('foreach', 'doParallel')
lapply(pkgs, require, character.only = T)
registerDoParallel(cores = 4)
### PREPARE FOR THE DATA ###
df1 <- read.csv("credit_count.txt")
df2 <- df1[df1$CARDHLDR == 1, ]
x <- paste("AGE + ACADMOS + ADEPCNT + MAJORDRG + MINORDRG + OWNRENT + INCOME + SELFEMPL + INCPER + EXP_INC")
fml <- as.formula(paste("as.factor(DEFAULT) ~ ", x))
### SPLIT DATA INTO K FOLDS ###
set.seed(2016)
df2$fold <- caret::createFolds(1:nrow(df2), k = 4, list = FALSE)
### PARAMETER LIST ###
cost <- c(10, 100)
gamma <- c(1, 2)
parms <- expand.grid(cost = cost, gamma = gamma)
### LOOP THROUGH PARAMETER VALUES ###
### OUTER LOOP IS SEQUENTIAL (%do%) SO THE INNER FOLD LOOP GETS THE WORKERS ###
result <- foreach(i = 1:nrow(parms), .combine = rbind) %do% {
  ### RENAMED FROM c/g: SINGLE-LETTER NAMES MASKED base::c AND base::gamma ###
  cst <- parms[i, ]$cost
  gma <- parms[i, ]$gamma
  ### K-FOLD VALIDATION, PARALLEL ACROSS FOLDS ###
  out <- foreach(j = 1:max(df2$fold), .combine = rbind, .inorder = FALSE) %dopar% {
    deve <- df2[df2$fold != j, ]
    test <- df2[df2$fold == j, ]
    mdl <- e1071::svm(fml, data = deve, type = "C-classification", kernel = "radial", cost = cst, gamma = gma, probability = TRUE)
    pred <- predict(mdl, test, decision.values = TRUE, probability = TRUE)
    data.frame(y = test$DEFAULT, prob = attributes(pred)$probabilities[, 2])
  }
  ### CALCULATE SVM PERFORMANCE: POOLED OUT-OF-FOLD AUC PER (COST, GAMMA) ###
  roc <- pROC::roc(as.factor(out$y), out$prob)
  data.frame(parms[i, ], roc = roc$auc[1])
}

Written by statcompute

March 19, 2016 at 8:57 pm

SAS Macro Calculating LOO Predictions for GLM

Last year, I wrote an R function calculating the Leave-One-Out (LOO) prediction in GLM (https://statcompute.wordpress.com/2015/12/13/calculate-leave-one-out-prediction-for-glm). Below is a SAS macro implementing the same function, with Gamma regression as the example. However, it is trivial to extend it to models with other distributional assumptions.

%macro loo_gamma(data = , y = , x = , out = , out_var = _loo);
**********************************************************************;
* SAS MACRO CALCULATING LEAVE-ONE-OUT PREDICTIONS WITH THE TRAINING  *;
* SAMPLE AND PRESENTING DISTRIBUTIONS OF LOO PARAMETER ESTIMATES     *;
* ================================================================== *;
* INPUT PARAMETERS:                                                  *;
*  DATA   : INPUT SAS DATA TABLE                                     *;
*  Y      : DEPENDENT VARIABLE IN THE GAMMA MODEL                    *;
*  X      : NUMERIC INDEPENDENT VARIABLES IN THE MODEL               *;
*  OUT    : OUTPUT SAS DATA TABLE WITH LOO PREDICTIONS               *;
*  OUT_VAR: VARIABLE NAME OF LOO PREDICTIONS                         *;
* ================================================================== *;
* OUTPUTS:                                                           *;
* 1. A TABLE SHOWING DISTRIBUTIONS OF LOO PARAMETER ESTIMATES        *;
* 2. A SAS DATA TABLE WITH LOO PREDICTIONS                           *;
**********************************************************************;
options nocenter nonumber nodate mprint mlogic symbolgen;

/* STEP 1: KEEP ONLY MODEL VARIABLES, DROP OBS WITH MISSING Y, ADD AN */
/* EXPLICIT INTERCEPT COLUMN AND A 1-BASED ROW INDEX _i.              */
data _1;
  retain &x &y;
  set &data (keep = &x &y);
  where &y ~= .;
  Intercept = 1;
  _i + 1;
run;

/* STEP 2: TRANSPOSE TO LONG FORMAT — ONE ROW PER (OBS, VARIABLE) PAIR */
/* SO DESIGN VALUES CAN LATER BE JOINED TO ESTIMATES BY VARIABLE NAME. */
data _2;
  set _1 (keep = _i &x Intercept);
  array _x Intercept &x;
  do over _x;
    _name = upcase(vname(_x));
    _value = _x;
    output;
  end;
run;

/* NUMBER OF OBSERVATIONS => NUMBER OF LOO ITERATIONS */
proc sql noprint;
  select max(_i) into :nobs from _1;
quit;

/* STEP 3: FOR EACH OBS I, FIT THE MODEL ON ALL OTHER OBS AND SCORE I. */
%do i = 1 %to &nobs;

/* TRAINING SAMPLE WITH OBSERVATION &i HELD OUT */
data _3;
  set _1;
  where _i ~= &i;
run;

/* FIT GAMMA REGRESSION (LOG LINK); CAPTURE COEFFICIENTS VIA ODS */
ods listing close;
ods output ParameterEstimates = _est;
proc glimmix data = _last_;
  model &y = &x / solution dist = gamma link = log; 
run; 
ods listing;

/* LINEAR-PREDICTOR TERMS FOR THE HELD-OUT OBS: _xb = X * BETA BY NAME */
proc sql;
create table
  _pred1 as
select
  a._i                  as _i,
  upcase(a._name)       as _name,
  a._value              as _value,
  b.estimate            as estimate,
  a._value * b.estimate as _xb
from
  _2 as a, _est as b
where
  a._i = &i and upcase(a._name) = upcase(b.effect); quit;

/* ACCUMULATE PER-ITERATION TERMS ACROSS THE LOO LOOP */
%if &i = 1 %then %do;
  data _pred2;
    set _pred1;
  run;
%end;
%else %do;
  data _pred2;
    set _pred2 _pred1;
  run;
%end;

%end;

/* STEP 4: SUMMARIZE THE DISTRIBUTION OF EACH COEFFICIENT ACROSS THE   */
/* &nobs LEAVE-ONE-OUT FITS.                                           */
proc summary data = _pred2 nway;
  class _name;
  output out = _eff (drop = _freq_ _type_)
  min(estimate)    = beta_min
  p5(estimate)     = beta_p05
  p10(estimate)    = beta_p10
  median(estimate) = beta_med
  p90(estimate)    = beta_p90
  p95(estimate)    = beta_p95
  max(estimate)    = beta_max
  mean(estimate)   = beta_avg
  stddev(estimate) = beta_std;
run;

/* OUTPUT 1: COEFFICIENT-DISTRIBUTION REPORT (INTERCEPT EXCLUDED) */
title;
proc report data = _eff spacing = 1 headline nowindows split = "*";
  column(" * DISTRIBUTIONS OF LEAVE-ONE-OUT COEFFICIENTS *
             ESTIMATED FROM GAMMA REGRESSIONS * "
          _name beta_:);
  where upcase(_name) ~= 'INTERCEPT';
  define _name    / "BETA"    width = 20;
  define beta_min / "MIN"     width = 10 format = 10.4;
  define beta_p05 / '5%ILE'   width = 10 format = 10.4;
  define beta_p10 / '10%ILE'  width = 10 format = 10.4;
  define beta_med / 'MEDIAN'  width = 10 format = 10.4;
  define beta_p90 / '90%ILE'  width = 10 format = 10.4;
  define beta_p95 / '95%ILE'  width = 10 format = 10.4;
  define beta_max / "MAX"     width = 10 format = 10.4;
  define beta_avg / "AVERAGE" width = 10 format = 10.4;
  define beta_std / "STD DEV" width = 10 format = 10.4; 
run;

/* OUTPUT 2: PREDICTIONS — SUM THE _xb TERMS PER OBS TO GET THE LINEAR */
/* PREDICTOR, THEN EXP() INVERTS THE LOG LINK TO THE RESPONSE SCALE.   */
proc sql;
create table
  &out as
select
  a.*,
  b.out_var      as _xb,
  exp(b.out_var) as &out_var
from
  _1 (drop = intercept) as a,
  (select _i, sum(_xb) as out_var from _pred2 group by _i) as b where
  a._i = b._i;
quit;

%mend loo_gamma;

Written by statcompute

March 12, 2016 at 2:38 pm

Posted in SAS

Tagged with