## A More Flexible Ljung-Box Test in SAS

The Ljung-Box test is an important diagnostic to check whether residuals from a time series model are independently distributed. In the SAS/ETS module, it is easy to perform the Ljung-Box test with the ARIMA procedure. However, test outputs are only provided for lags 6, 12, 18, and so on, which cannot be changed by any option.

/* Simulate 100 draws from Uniform(0, 1) with seed 1, i.e. a series
   with no serial correlation, to demonstrate the white-noise test. */
data one;
  do i = 1 to 100;
    x = uniform(1);
    output;
  end;
run;

/* Ljung-Box white-noise check via PROC ARIMA. Note in the output
   below that results are only reported at lags 6, 12, 18, and 24 --
   no PROC ARIMA option changes this grouping. */
proc arima data = one;
  identify var = x whitenoise = ignoremiss;
run;
quit;

/*
Autocorrelation Check for White Noise

        To     Chi-           Pr >
       Lag   Square    DF    ChiSq   --------------------Autocorrelations--------------------
         6     5.49     6   0.4832    0.051  -0.132   0.076  -0.024  -0.146   0.064
        12     6.78    12   0.8719    0.050   0.076  -0.046  -0.025  -0.016  -0.018
        18    10.43    18   0.9169    0.104  -0.053   0.063   0.038  -0.085  -0.065
        24    21.51    24   0.6083    0.007   0.178   0.113  -0.046   0.180   0.079
*/

The SAS macro below is a more flexible way to perform the Ljung-Box test for any number of lags. As shown in the output, test results for lags 6 and 12 are identical to those produced directly by the ARIMA procedure.

%macro LBtest(data = , var = , lags = 4);
***********************************************************;
* SAS MACRO PERFORMING LJUNG-BOX TEST FOR INDEPENDENCE    *;
* ======================================================= *;
* INPUT PARAMETERS:                                       *;
*  DATA : INPUT SAS DATA TABLE                            *;
*  VAR  : THE TIME SERIES TO TEST FOR INDEPENDENCE        *;
*  LAGS : THE NUMBER OF LAGS BEING TESTED                 *;
* ======================================================= *;
* AUTHOR: WENSUI.LIU@53.COM                               *;
***********************************************************;

/* Declare the loop index local as well (the original localized only
   NLAG), so the %do loop cannot clobber a macro variable named I in
   the calling environment. */
%local nlag i;

/* Count the observations: at end-of-file _n_ equals the record count,
   so &nlag resolves to nobs - 1, the maximum computable lag. The %let
   fires immediately through CALL EXECUTE during the data step. */
data _1 (keep = &var);
  set &data end = eof;
  if eof then do;
    call execute('%let nlag = '||put(_n_ - 1, 8.)||';');
  end;
run;

/* Autocorrelations for every lag; OUTCOV= saves LAG, N, and CORR. */
proc arima data = _last_;
  identify var = &var nlag = &nlag outcov = _2 noprint;
run;
quit;

%do i = 1 %to &lags;
  /* Autocorrelations up to the current test lag &i. */
  data _3;
    set _2;
    where lag > 0 and lag <= &i;
  run;

  /* Ljung-Box statistic Q = n * (n + 2) * sum(r(k)**2 / (n - k)):
     (&nlag + 1) equals the series length n and (&nlag + 3) equals
     n + 2, while the N column from OUTCOV= supplies the per-lag
     sample size in the denominator. */
  proc sql noprint;
  create table _4 as
  select
    sum(corr * corr / n) * (&nlag + 1) * (&nlag + 3) as _chisq,
    1 - probchi(calculated _chisq, &i.)              as _p_chisq,
    &i                                               as _df
  from _last_;
  quit;

  /* Stack the result for lag &i under the previous lags. */
  %if &i = 1 %then %do;
    data _5; set _4; run;
  %end;
  %else %do;
    data _5; set _5 _4; run;
  %end;
%end;

title;
proc report data = _5 spacing = 1 headline nowindows split = "*";
  column(" * LJUNG-BOX TEST FOR WHITE NOISE * * H0: RESIDUALS ARE INDEPENDENTLY DISTRIBUTED UPTO LAG &lags * " _chisq _df _p_chisq);
  define _chisq / "CHI-SQUARE" width = 20 format = 15.10;
  define _df / "DF" width = 10 order;
  define _p_chisq / "P-VALUE" width = 20 format = 15.10;
run;
%mend LBtest;

%LBtest(data = one, var = x, lags = 12);
/*
LJUNG-BOX TEST FOR WHITE NOISE
H0: RESIDUALS ARE INDEPENDENTLY DISTRIBUTED UPTO LAG 12

          CHI-SQUARE         DF              P-VALUE
------------------------------------------------------
        0.2644425904          1         0.6070843322
        2.0812769288          2         0.3532290858
        2.6839655476          3         0.4429590625
        2.7428168168          4         0.6017432831
        5.0425834917          5         0.4107053939
        5.4851972398          6         0.4832476224
        5.7586229652          7         0.5681994829
        6.4067856029          8         0.6017645131
        6.6410385135          9         0.6744356312
        6.7142471241         10         0.7521182318
        6.7427585395         11         0.8195164211
        6.7783018413         12         0.8719097622
*/

## SAS Macro Performing Breusch–Godfrey Test for Serial Correlation

%macro bgtest(data = , r = , x = , order = 1);
********************************************************************;
* SAS MACRO PERFORMING BREUSCH-GODFREY TEST FOR SERIAL CORRELATION *;
* BY FOLLOWING THE LOGIC OF BGTEST() IN R LMTEST PACKAGE           *;
* ================================================================ *;
* INPUT PARAMETERS:                                                *;
*  DATA : INPUT SAS DATA TABLE                                     *;
*  R : RESIDUALS TO TEST SERIAL CORRELATION                        *;
*  X : INDEPENDENT VARIABLES IN THE ORIGINAL REGRESSION MODEL      *;
*  ORDER : THE ORDER OF SERIAL CORRELATION                         *;
* ================================================================ *;
* AUTHOR: WENSUI.LIU@53.COM                                        *;
********************************************************************;

/* Keep the loop index out of the caller's macro-variable scope. */
%local i;

/* Build lagged residuals _lag1_&r ... _lag&order._&r and drop the
   first &order rows, where the lags are necessarily missing.
   NOTE: the published version of this code had "&order.;" mangled
   into the HTML entity glyph U+2134 in two places; restored here. */
data _1 (drop = _i);
  set &data (keep = &r &x);
  %do i = 1 %to &order.;
    _lag&i._&r = lag&i.(&r);
  %end;
  _i + 1;
  _index = _i - &order.;
  if _index > 0 then output;
run;

/* Auxiliary regression of the residual on the original regressors
   plus its own lags (the _lag: variable-list wildcard). */
ods listing close;
proc reg data = _last_;
  model &r = &x _lag:;
  output out = _2 p = yhat;
run;
ods listing;

/* LM statistic = n * R-square of the auxiliary regression, which is
   chi-square with &order degrees of freedom under the null of no
   serial correlation. */
proc sql noprint;
create table _result as
select
  (select count(*) from _2) * sum(yhat ** 2) / sum(&r ** 2) as _chisq,
  1 - probchi(calculated _chisq, &order.)                   as _p_chisq,
  &order                                                    as _df
from _2;
quit;

title;
proc report data = _last_ spacing = 1 headline nowindows split = "*";
  column(" * BREUSCH-GODFREY TEST FOR SERIAL CORRELATION * H0: THERE IS NO SERIAL CORRELATION OF ANY ORDER UP TO &order * " _chisq _df _p_chisq);
  define _chisq / "CHI-SQUARE" width = 20 format = 15.10;
  define _df / "DF" width = 10;
  define _p_chisq / "P-VALUE" width = 20 format = 15.10;
run;
%mend bgtest;

## Python Prototype of Grid Search for SVM Parameters

"""Grid search for SVM (C, gamma) via 3-fold cross-validation, scored by AUC."""
from itertools import product

from pandas import concat, read_csv, DataFrame
# NOTE: sklearn.cross_validation was removed in scikit-learn 0.20;
# model_selection is the supported module and KFold now takes
# n_splits and yields splits via .split().
from sklearn.model_selection import KFold
from sklearn.svm import SVC as svc
from sklearn.metrics import roc_auc_score as auc

df = read_csv('credit_count.txt', sep = ',')
# Restrict to card holders; DEFAULT is the binary target.
Y = df[df.CARDHLDR == 1].DEFAULT
X = df[df.CARDHLDR == 1][['AGE', 'ADEPCNT', 'MAJORDRG', 'MINORDRG', 'INCOME', 'OWNRENT', 'SELFEMPL']]

# Cartesian grid of cost and gamma values to search over.
c = [1, 10]
g = [0.01, 0.001]
parms = [i for i in product(c, g)]

# Materialize the 3-fold (train, test) index pairs once so that every
# parameter combination is evaluated on identical splits.
kf = list(KFold(n_splits = 3, shuffle = True, random_state = 0).split(X))

final = DataFrame()
for i in parms:
    result = DataFrame()
    mdl = svc(C = i[0], gamma = i[1], probability = True, random_state = 0)
    for train_idx, test_idx in kf:
        X1 = X.iloc[train_idx]
        Y1 = Y.iloc[train_idx]
        X2 = X.iloc[test_idx]
        Y2 = Y.iloc[test_idx]
        mdl.fit(X1, Y1)
        # Probability of the positive class on the held-out fold.
        pred = mdl.predict_proba(X2)[:, 1]
        out = DataFrame({'pred': pred, 'y': Y2})
        # DataFrame.append was removed in pandas 2.0; concat replaces it.
        result = concat([result, out], ignore_index = True)
    # Pooled out-of-fold AUC for this (C, gamma) pair.
    perf = DataFrame({'Cost': i[0], 'Gamma': i[1], 'AUC': [auc(result.y, result.pred)]})
    final = concat([final, perf], ignore_index = True)

## Improve SVM Tuning through Parallelism

As pointed out in the chapter 10 of “The Elements of Statistical Learning”, ANN and SVM (support vector machines) share similar pros and cons, e.g. lack of interpretability and good predictive power. However, in contrast to ANN usually suffering from local minima solutions, SVM is always able to converge globally. In addition, SVM is less prone to over-fitting given a good choice of free parameters, which usually can be identified through cross-validations.

In the R package “e1071”, tune() function can be used to search for SVM parameters but is extremely inefficient due to the sequential instead of parallel executions. In the code snippet below, a parallelism-based algorithm performs the grid search for SVM parameters through the K-fold cross validation.

### LOAD PACKAGES AND REGISTER A 4-CORE PARALLEL BACKEND ###
pkgs <- c('foreach', 'doParallel')
lapply(pkgs, require, character.only = T)
registerDoParallel(cores = 4)

### PREPARE FOR THE DATA ###
df1 <- read.csv("credit_count.txt")
df2 <- df1[df1$CARDHLDR == 1, ]
x <- paste("AGE + ACADMOS + ADEPCNT + MAJORDRG + MINORDRG + OWNRENT + INCOME + SELFEMPL + INCPER + EXP_INC")
fml <- as.formula(paste("as.factor(DEFAULT) ~ ", x))

### SPLIT DATA INTO K FOLDS ###
set.seed(2016)
df2$fold <- caret::createFolds(1:nrow(df2), k = 4, list = FALSE)

### PARAMETER LIST ###
cost <- c(10, 100)
gamma <- c(1, 2)
parms <- expand.grid(cost = cost, gamma = gamma)

### LOOP THROUGH PARAMETER VALUES ###
result <- foreach(i = seq_len(nrow(parms)), .combine = rbind) %do% {
  # pull the current (cost, gamma) pair; names chosen to avoid
  # masking base::c and base::gamma
  cost_i <- parms[i, ]$cost
  gamma_i <- parms[i, ]$gamma
  ### K-FOLD VALIDATION, FOLDS SCORED IN PARALLEL ###
  out <- foreach(j = seq_len(max(df2$fold)), .combine = rbind, .inorder = FALSE) %dopar% {
    deve <- df2[df2$fold != j, ]
    test <- df2[df2$fold == j, ]
    mdl <- e1071::svm(fml, data = deve, type = "C-classification", kernel = "radial",
                      cost = cost_i, gamma = gamma_i, probability = TRUE)
    pred <- predict(mdl, test, decision.values = TRUE, probability = TRUE)
    data.frame(y = test$DEFAULT, prob = attributes(pred)$probabilities[, 2])
  }
  ### CALCULATE SVM PERFORMANCE ###
  roc <- pROC::roc(as.factor(out$y), out$prob)
  data.frame(parms[i, ], roc = roc$auc[1])
}

## SAS Macro Calculating LOO Predictions for GLM

Late last year, I wrote an R function calculating the Leave-One-Out (LOO) prediction in GLM (https://statcompute.wordpress.com/2015/12/13/calculate-leave-one-out-prediction-for-glm). Below is a SAS macro implementing the same function with Gamma regression as the example. However, it is trivial to extend it to models with other distributional assumptions.

%macro loo_gamma(data = , y = , x = , out = , out_var = _loo);
**********************************************************************;
* SAS MACRO CALCULATING LEAVE-ONE-OUT PREDICTIONS WITH THE TRAINING  *;
* SAMPLE AND PRESENTING DISTRIBUTIONS OF LOO PARAMETER ESTIMATES     *;
* ================================================================== *;
* INPUT PARAMETERS:                                                  *;
*  DATA   : INPUT SAS DATA TABLE                                     *;
*  Y      : DEPENDENT VARIABLE IN THE GAMMA MODEL                    *;
*  X      : NUMERIC INDEPENDENT VARIABLES IN THE MODEL               *;
*  OUT    : OUTPUT SAS DATA TABLE WITH LOO PREDICTIONS               *;
*  OUT_VAR: VARIABLE NAME OF LOO PREDICTIONS                         *;
* ================================================================== *;
* OUTPUTS:                                                           *;
*  1. A TABLE SHOWING DISTRIBUTIONS OF LOO PARAMETER ESTIMATES       *;
*  2. A SAS DATA TABLE WITH LOO PREDICTIONS                          *;
**********************************************************************;
options nocenter nonumber nodate mprint mlogic symbolgen;

/* Keep the loop index and observation count local to this macro so
   they cannot clobber same-named macro variables in the caller. */
%local i nobs;

/* _1: modeling sample with non-missing &y, a running row id _i, and
   an explicit Intercept column used later to score the intercept. */
data _1;
  retain &x &y;
  set &data (keep = &x &y);
  where &y ~= .;
  Intercept = 1;
  _i + 1;
run;

/* _2: long layout with one record per (observation, variable) pair,
   so predictions can be assembled by joining on the effect name. */
data _2;
  set _1 (keep = _i &x Intercept);
  array _x Intercept &x;
  do over _x;
    _name = upcase(vname(_x));
    _value = _x;
    output;
  end;
run;

proc sql noprint;
  select max(_i) into :nobs from _1;
quit;

/* Refit the Gamma regression &nobs times, each round leaving out
   observation &i, and score the held-out row with those estimates. */
%do i = 1 %to &nobs;
  data _3;
    set _1;
    where _i ~= &i;
  run;

  ods listing close;
  ods output ParameterEstimates = _est;
  proc glimmix data = _last_;
    model &y = &x / solution dist = gamma link = log;
  run;
  ods listing;

  /* Linear-predictor contributions x * beta for the held-out row. */
  proc sql;
  create table _pred1 as
  select
    a._i                  as _i,
    upcase(a._name)       as _name,
    a._value              as _value,
    b.estimate            as estimate,
    a._value * b.estimate as _xb
  from _2 as a, _est as b
  where a._i = &i and upcase(a._name) = upcase(b.effect);
  quit;

  %if &i = 1 %then %do;
    data _pred2; set _pred1; run;
  %end;
  %else %do;
    data _pred2; set _pred2 _pred1; run;
  %end;
%end;

/* Distribution of each coefficient across the &nobs LOO refits. */
proc summary data = _pred2 nway;
  class _name;
  output out = _eff (drop = _freq_ _type_)
    min(estimate)    = beta_min
    p5(estimate)     = beta_p05
    p10(estimate)    = beta_p10
    median(estimate) = beta_med
    p90(estimate)    = beta_p90
    p95(estimate)    = beta_p95
    max(estimate)    = beta_max
    mean(estimate)   = beta_avg
    stddev(estimate) = beta_std;
run;

title;
proc report data = _eff spacing = 1 headline nowindows split = "*";
  column(" * DISTRIBUTIONS OF LEAVE-ONE-OUT COEFFICIENTS * ESTIMATED FROM GAMMA REGRESSIONS * " _name beta_:);
  where upcase(_name) ~= 'INTERCEPT';
  define _name / "BETA" width = 20;
  define beta_min / "MIN" width = 10 format = 10.4;
  define beta_p05 / '5%ILE' width = 10 format = 10.4;
  define beta_p10 / '10%ILE' width = 10 format = 10.4;
  define beta_med / 'MEDIAN' width = 10 format = 10.4;
  define beta_p90 / '90%ILE' width = 10 format = 10.4;
  define beta_p95 / '95%ILE' width = 10 format = 10.4;
  define beta_max / "MAX" width = 10 format = 10.4;
  define beta_avg / "AVERAGE" width = 10 format = 10.4;
  define beta_std / "STD DEV" width = 10 format = 10.4;
run;

/* LOO prediction: sum x * beta per row, then invert the log link. */
proc sql;
create table &out as
select a.*, b.out_var as _xb, exp(b.out_var) as &out_var
from
  _1 (drop = intercept) as a,
  (select _i, sum(_xb) as out_var from _pred2 group by _i) as b
where a._i = b._i;
quit;
%mend loo_gamma;

## Where Bagging Might Work Better Than Boosting

In the previous post (https://statcompute.wordpress.com/2016/01/01/the-power-of-decision-stumps), it was shown that the boosting algorithm performs extremely well even with a simple 1-level stump as the base learner and provides a better performance lift than the bagging algorithm does. However, this observation shouldn’t be generalized, which would be demonstrated in the following example.

First of all, we developed a rule-based PART model as below. Albeit pruned, this model will still tend to over-fit the data, as shown in the highlighted output.

# R = TRUE AND N = 10 FOR 10-FOLD CV PRUNING
# M = 5 SPECIFYING MINIMUM NUMBER OF CASES PER LEAF
part_control <- Weka_control(R = TRUE, N = 10, M = 5, Q = 2016)
# NOTE(review): the model is fit on df but evaluated on train and test
# below -- presumably data = train was intended (see the companion
# train/test split); confirm against the data-preparation step.
part <- PART(fml, data = df, control = part_control)
# In-sample AUC on the training partition.
roc(as.factor(train$DEFAULT), predict(part, newdata = train, type = "probability")[, 2])
# Area under the curve: 0.6839
# Out-of-sample AUC on the testing partition -- the gap versus the
# training AUC above is the over-fitting highlighted in the text.
roc(as.factor(test$DEFAULT), predict(part, newdata = test, type = "probability")[, 2])
# Area under the curve: 0.6082

Next, we applied the boosting to the PART model. As shown in the highlighted result below, AUC of the boosting on the testing data is even lower than AUC of the base model.

# BASE LEARNER: PART WITH THE SAME CONTROL OPTIONS AS THE STANDALONE MODEL
wlist <- list(PART, R = TRUE, N = 10, M = 5, Q = 2016)
# I = 100 SPECIFYING NUMBER OF ITERATIONS
# Q = TRUE SPECIFYING RESAMPLING USED IN THE BOOSTING
boost_control <- Weka_control(I = 100, S = 2016, Q = TRUE, P = 100, W = wlist)
boosting <- AdaBoostM1(fml, data = train, control = boost_control)
# SCORE THE HOLD-OUT SAMPLE AND MEASURE AUC
pred_boost <- predict(boosting, newdata = test, type = "probability")[, 2]
roc(as.factor(test$DEFAULT), pred_boost)
# Area under the curve: 0.592

However, if employing the bagging, we are able to achieve more than 11% performance lift in terms of AUC.

# NUM-SLOTS = 0 AND I = 100 FOR PARALLELISM
# P = 50 SPECIFYING THE SIZE OF EACH BAG
bag_control <- Weka_control("num-slots" = 0, I = 100, S = 2016, P = 50, W = wlist)
bagging <- Bagging(fml, data = train, control = bag_control)
# SCORE THE HOLD-OUT SAMPLE AND MEASURE AUC
pred_bag <- predict(bagging, newdata = test, type = "probability")[, 2]
roc(as.factor(test$DEFAULT), pred_bag)
# Area under the curve: 0.6778

From examples demonstrated today and yesterday, an important lesson to learn is that ensemble methods are powerful machine learning tools only when they are used appropriately. Empirically speaking, while the boosting works well to improve the performance of an under-fitted base model such as the decision stump, the bagging might be able to perform better in the case of an over-fitted base model with high variance and low bias.

## The Power of Decision Stumps

A decision stump is the weak classification model with the simple tree structure consisting of one split, which can also be considered a one-level decision tree. Due to its simplicity, the stump often demonstrates a low predictive performance. As shown in the example below, the AUC measure of a stump is even lower than the one of a single attribute in a separate testing dataset.

### LOAD REQUIRED PACKAGES ###
pkgs <- c('pROC', 'RWeka')
lapply(pkgs, require, character.only = T)

### PREPARE THE DATA AND A 50/50 TRAIN-TEST SPLIT ###
df1 <- read.csv("credit_count.txt")
df2 <- df1[df1$CARDHLDR == 1, ]
set.seed(2016)
n <- nrow(df2)
# store the drawn row indices in idx instead of masking base::sample
idx <- sample(seq(n), size = n / 2, replace = FALSE)
train <- df2[idx, ]
test <- df2[-idx, ]
x <- paste("AGE + ACADMOS + ADEPCNT + MAJORDRG + MINORDRG + OWNRENT + INCOME + SELFEMPL + INCPER + EXP_INC")
fml <- as.formula(paste("as.factor(DEFAULT) ~ ", x))

### IDENTIFY THE MOST PREDICTIVE ATTRIBUTE ###
imp <- InfoGainAttributeEval(fml, data = train)
imp_x <- test[, names(imp[imp == max(imp)])]
roc(as.factor(test$DEFAULT), imp_x)
# Area under the curve: 0.6243

### CONSTRUCT A WEAK CLASSIFIER OF DECISION STUMP ###
stump <- DecisionStump(fml, data = train)
print(stump)
roc(as.factor(test$DEFAULT), predict(stump, newdata = test, type = "probability")[, 2])
# Area under the curve: 0.5953

Albeit weak by itself, the decision stump can be used as a base model in many machine learning ensemble methods, such as bagging and boosting. For instance, the bagging classifier with 1,000 stumps combined outperforms the single stump by ~7% in terms of AUC (0.6346 vs. 0.5953). Moreover, AdaBoost with stumps can further improve the predictive performance over the single stump by ~11% (0.6585 vs. 0.5953) and also over the logistic regression benchmark by ~2% (0.6585 vs. 0.6473).

### BUILD A BAGGING CLASSIFIER WITH 1,000 STUMPS IN PARALLEL ###
bag_ctrl <- Weka_control("num-slots" = 0, I = 1000, W = "DecisionStump", S = 2016, P = 50)
bagging <- Bagging(fml, data = train, control = bag_ctrl)
roc(as.factor(test$DEFAULT), predict(bagging, newdata = test, type = "probability")[, 2])
# Area under the curve: 0.6346

### BUILD A BOOSTING CLASSIFIER WITH STUMPS ###
boost_ctrl <- Weka_control(I = 100, W = "DecisionStump", S = 2016)
boosting <- AdaBoostM1(fml, data = train, control = boost_ctrl)
roc(as.factor(test$DEFAULT), predict(boosting, newdata = test, type = "probability")[, 2])
# Area under the curve: 0.6585

### DEVELOP A LOGIT MODEL FOR THE BENCHMARK ###
logit <- Logistic(fml, data = train)
roc(as.factor(test$DEFAULT), predict(logit, newdata = test, type = "probability")[, 2])
# Area under the curve: 0.6473