## Archive for **March 2016**

## Python Prototype of Grid Search for SVM Parameters

from itertools import product from pandas import read_table, DataFrame from sklearn.cross_validation import KFold as kfold from sklearn.svm import SVC as svc from sklearn.metrics import roc_auc_score as auc df = read_table('credit_count.txt', sep = ',') Y = df[df.CARDHLDR == 1].DEFAULT X = df[df.CARDHLDR == 1][['AGE', 'ADEPCNT', 'MAJORDRG', 'MINORDRG', 'INCOME', 'OWNRENT', 'SELFEMPL']] c = [1, 10] g = [0.01, 0.001] parms = [i for i in product(c, g)] kf = [i for i in kfold(Y.count(), n_folds = 3, shuffle = True, random_state = 0)] final = DataFrame() for i in parms: result = DataFrame() mdl = svc(C = i[0], gamma = i[1], probability = True, random_state = 0) for j in kf: X1 = X.iloc[j[0]] Y1 = Y.iloc[j[0]] X2 = X.iloc[j[1]] Y2 = Y.iloc[j[1]] mdl.fit(X1, Y1) pred = mdl.predict_proba(X2)[:, 1] out = DataFrame({'pred': pred, 'y': Y2}) result = result.append(out) perf = DataFrame({'Cost': i[0], 'Gamma': i[1], 'AUC': [auc(result.y, result.pred)]}) final = final.append(perf)

## Improve SVM Tuning through Parallelism

As pointed out in the chapter 10 of “The Elements of Statistical Learning”, ANN and SVM (support vector machines) share similar pros and cons, e.g. lack of interpretability and good predictive power. However, in contrast to ANN usually suffering from local minima solutions, SVM is always able to converge globally. In addition, SVM is less prone to over-fitting given a good choice of free parameters, which usually can be identified through cross-validations.

In the R package “e1071”, tune() function can be used to search for SVM parameters but is extremely inefficient due to the sequential instead of parallel executions. In the code snippet below, a parallelism-based algorithm performs the grid search for SVM parameters through the K-fold cross validation.

pkgs <- c('foreach', 'doParallel') lapply(pkgs, require, character.only = T) registerDoParallel(cores = 4) ### PREPARE FOR THE DATA ### df1 <- read.csv("credit_count.txt") df2 <- df1[df1$CARDHLDR == 1, ] x <- paste("AGE + ACADMOS + ADEPCNT + MAJORDRG + MINORDRG + OWNRENT + INCOME + SELFEMPL + INCPER + EXP_INC") fml <- as.formula(paste("as.factor(DEFAULT) ~ ", x)) ### SPLIT DATA INTO K FOLDS ### set.seed(2016) df2$fold <- caret::createFolds(1:nrow(df2), k = 4, list = FALSE) ### PARAMETER LIST ### cost <- c(10, 100) gamma <- c(1, 2) parms <- expand.grid(cost = cost, gamma = gamma) ### LOOP THROUGH PARAMETER VALUES ### result <- foreach(i = 1:nrow(parms), .combine = rbind) %do% { c <- parms[i, ]$cost g <- parms[i, ]$gamma ### K-FOLD VALIDATION ### out <- foreach(j = 1:max(df2$fold), .combine = rbind, .inorder = FALSE) %dopar% { deve <- df2[df2$fold != j, ] test <- df2[df2$fold == j, ] mdl <- e1071::svm(fml, data = deve, type = "C-classification", kernel = "radial", cost = c, gamma = g, probability = TRUE) pred <- predict(mdl, test, decision.values = TRUE, probability = TRUE) data.frame(y = test$DEFAULT, prob = attributes(pred)$probabilities[, 2]) } ### CALCULATE SVM PERFORMANCE ### roc <- pROC::roc(as.factor(out$y), out$prob) data.frame(parms[i, ], roc = roc$auc[1]) }

## SAS Macro Calculating LOO Predictions for GLM

Back in last year, I wrote a R function calculating the Leave-One-Out (LOO) prediction in GLM (https://statcompute.wordpress.com/2015/12/13/calculate-leave-one-out-prediction-for-glm). Below is a SAS macro implementing the same function with Gamma regression as the example. However, it is trivial to extend to models with other distributional assumptions.

%macro loo_gamma(data = , y = , x = , out = , out_var = _loo); **********************************************************************; * SAS MACRO CALCULATING LEAVE-ONE-OUT PREDICTIONS WITH THE TRAINING *; * SAMPLE AND PRESENTING DISTRIBUTIONS OF LOO PARAMETER ESTIMATES *; * ================================================================== *; * INPUT PARAMETERS: *; * DATA : INPUT SAS DATA TABLE *; * Y : DEPENDENT VARIABLE IN THE GAMMA MODEL *; * X : NUMERIC INDEPENDENT VARIABLES IN THE MODEL *; * OUT : OUTPUT SAS DATA TABLE WITH LOO PREDICTIONS *; * OUT_VAR: VARIABLE NAME OF LOO PREDICTIONS *; * ================================================================== *; * OUTPUTS: *; * 1. A TABLE SHOWING DISTRIBUTIONS OF LOO PARAMETER ESTIMATES *; * 2. A SAS DATA TABLE WITH LOO PREDICTIONS *; **********************************************************************; options nocenter nonumber nodate mprint mlogic symbolgen; data _1; retain &x &y; set &data (keep = &x &y); where &y ~= .; Intercept = 1; _i + 1; run; data _2; set _1 (keep = _i &x Intercept); array _x Intercept &x; do over _x; _name = upcase(vname(_x)); _value = _x; output; end; run; proc sql noprint; select max(_i) into :nobs from _1; quit; %do i = 1 %to &nobs; data _3; set _1; where _i ~= &i; run; ods listing close; ods output ParameterEstimates = _est; proc glimmix data = _last_; model &y = &x / solution dist = gamma link = log; run; ods listing; proc sql; create table _pred1 as select a._i as _i, upcase(a._name) as _name, a._value as _value, b.estimate as estimate, a._value * b.estimate as _xb from _2 as a, _est as b where a._i = &i and upcase(a._name) = upcase(b.effect); quit; %if &i = 1 %then %do; data _pred2; set _pred1; run; %end; %else %do; data _pred2; set _pred2 _pred1; run; %end; %end; proc summary data = _pred2 nway; class _name; output out = _eff (drop = _freq_ _type_) min(estimate) = beta_min p5(estimate) = beta_p05 p10(estimate) = beta_p10 median(estimate) = beta_med p90(estimate) = beta_p90 p95(estimate) = beta_p95 max(estimate) = beta_max mean(estimate) = beta_avg stddev(estimate) = beta_std; run; title; proc report data = _eff spacing = 1 headline nowindows split = "*"; column(" * DISTRIBUTIONS OF LEAVE-ONE-OUT COEFFICIENTS * ESTIMATED FROM GAMMA REGRESSIONS * " _name beta_:); where upcase(_name) ~= 'INTERCEPT'; define _name / "BETA" width = 20; define beta_min / "MIN" width = 10 format = 10.4; define beta_p05 / '5%ILE' width = 10 format = 10.4; define beta_p10 / '10%ILE' width = 10 format = 10.4; define beta_med / 'MEDIAN' width = 10 format = 10.4; define beta_p90 / '90%ILE' width = 10 format = 10.4; define beta_p95 / '95%ILE' width = 10 format = 10.4; define beta_max / "MAX" width = 10 format = 10.4; define beta_avg / "AVERAGE" width = 10 format = 10.4; define beta_std / "STD DEV" width = 10 format = 10.4; run; proc sql; create table &out as select a.*, b.out_var as _xb, exp(b.out_var) as &out_var from _1 (drop = intercept) as a, (select _i, sum(_xb) as out_var from _pred2 group by _i) as b where a._i = b._i; quit; %mend loo_gamma;