I can calculate the motion of heavenly bodies but not the madness of people. -Isaac Newton

## A SAS Macro for Bootstrap Aggregating (Bagging)

Proposed by Breiman (1996), bagging is the acronym of “bootstrap aggregating” and is a machine learning method to improve the prediction accuracy by simply averaging over predictions from multiple classifiers developed with bootstrapped samples out of the original training set.

Regardless of its statistical elegance, bagging is attractive in modern machine learning in that the construction of each decision tree in bagging is very computationally efficient and is completely independent from each other, which makes bagging ideal in parallel computing.

The SAS macro demonstrated below is an attempt to test bagging algorithm on a consumer banking dataset.

```%macro bagging(data = , y = , numx = , catx = , ntrees = 50);
***********************************************************;
* THIS SAS MACRO IS AN ATTEMPT TO IMPLEMENT BAGGING       *;
* PROPOSED BY LEO BREIMAN (1996)                          *;
* ======================================================= *;
* PAMAMETERS:                                             *;
*  DATA   : INPUT SAS DATA TABLE                          *;
*  Y      : RESPONSE VARIABLE WITH 0/1 VALUE              *;
*  NUMX   : A LIST OF NUMERIC ATTRIBUTES                  *;
*  CATX   : A LIST OF CATEGORICAL ATTRIBUTES              *;
*  NTREES : # OF TREES TO DO THE BAGGING                  *;
* ======================================================= *;
* OUTPUTS:                                                *;
*  1. A SAS CATALOG FILE NAMED "TREEFILES" IN THE WORKING *;
*     DIRECTORY CONTAINING ALL SCORING FILES IN BAGGING   *;
*  2. A LST FILE SHOWING ks STATISTICS OF THE BAGGING     *;
*     CLASSIFIER AND EACH TREE CLASSIFIER                 *;
* ======================================================= *;
* CONTACT:                                                *;
*  WENSUI.LIU@53.COM, LOSS FORECASTING & RISK MODELING    *;
***********************************************************;

options mprint mlogic nocenter nodate nonumber;

*** a random seed value subject to change ***;
%let seed = 20110613;

*** assign a library to the working folder ***;
libname _path '';

*** generate a series of random seeds ***;
data _null_;
do i = 1 to &ntrees;
random = put(ranuni(&seed) * (10 ** 8), 8.);
name   = compress("random"||put(i, 3.), ' ');
call symput(name, random);
end;
run;

*** clean up catalog files in the library ***;
proc datasets library = _path nolist;
delete TreeFiles tmp / memtype = catalog;
run;
quit;

proc sql noprint;
select count(*) into :nobs from &data where &y in (1, 0);
quit;

data _tmp1 (keep = &y &numx &catx _id_);
set &data;
_id_ + 1;
run;

%do i = 1 %to &ntrees;
%put &&random&i;

*** generate bootstrap samples for bagging ***;
proc surveyselect data = _tmp1 method = urs n = &nobs seed = &&random&i
out = sample&i(rename = (NumberHits = _hits)) noprint;
run;

*** generate data mining datasets for sas e-miner ***;
proc dmdb data = sample&i out = db_sample&i dmdbcat = cl_sample&i;
class &y &catx;
var &numx;
target &y;
freq _hits;
run;

*** create a sas temporary catalog to contain sas output ***;
filename out_tree catalog "_path.tmp.out_tree.source";

*** create decision tree mimicking CART ***;
proc split data = db_sample&i dmdbcat = cl_sample&i
criterion    = gini
assess       = impurity
maxbranch    = 2
splitsize    = 100
subtree      = assessment
exhaustive   = 0
nsurrs       = 0;
code file    = out_tree;
input &numx   / level = interval;
input &catx   / level = nominal;
target &y     / level = binary;
freq _hits;
run;

*** create a perminant sas catalog to contain all tree outputs ***;
filename in_tree catalog "_path.TreeFiles.tree&i..source";

data _null_;
infile out_tree;
input;
file in_tree;
if _n_ > 3 then put _infile_;
run;

*** score the original data by each tree output file ***;
data _score&i (keep = p_&y.1 p_&y.0 &y _id_);
set _tmp1;
%include in_tree;
run;

*** calculate KS stat ***;
proc printto new print = lst_out;
run;

ods output kolsmir2stats = _kstmp(where = (label1 = 'KS'));
proc npar1way wilcoxon edf data = _score&i;
class &y.;
var p_&y.1;
run;

proc printto;
run;

%if &i = 1 %then %do;
data _tmp2;
set _score&i;
run;

data _ks;
set _kstmp (keep = nvalue2);
tree_id = &i;
seed    = &&random&i;
ks      = round(nvalue2 * 100, 0.0001);
run;
%end;
%else %do;
data _tmp2;
set _tmp2 _score&i;
run;

data _ks;
set _ks _kstmp(in = a keep = nvalue2);
if a then do;
tree_id = &i;
seed    = &&random&i;
ks      = round(nvalue2 * 100, 0.0001);
end;
run;
%end;

%end;

*** aggregate predictions from all trees in the bag ***;
proc summary data = _tmp2 nway;
class _id_;
output out = _tmp3(drop = _type_ rename = (_freq_ = freq))
mean(p_&y.1) =  mean(p_&y.0) =  mean(&y) = ;
run;

*** calculate bagging KS stat ***;
proc printto new print = lst_out;
run;

ods output kolsmir2stats = _kstmp(where = (label1 = 'KS'));
proc npar1way wilcoxon edf data = _tmp3;
class &y;
var p_&y.1;
run;

proc printto;
run;

data _ks;
set _ks _kstmp (in = a keep = nvalue2);
if a then do;
tree_id = 0;
seed    = &seed;
ks      = round(nvalue2 * 100, 0.0001);
end;
run;

proc sort data = _ks;
by tree_id;
run;

proc sql noprint;
select max(ks) into :max_ks from _ks where tree_id > 0;

select min(ks) into :min_ks from _ks where tree_id > 0;

select ks into :bag_ks from _ks where tree_id = 0;
quit;

*** summarize the performance of bagging classifier and each tree in the bag ***;
title "MAX KS = &max_ks, MIN KS = &min_ks, BAGGING KS = &bag_ks";
proc print data = _ks noobs;
var tree_id seed ks;
run;
title;

proc datasets library = _path nolist;
delete tmp / memtype = catalog;
run;
quit;

%mend bagging;

%let x1 = tot_derog tot_tr age_oldest_tr tot_open_tr tot_rev_tr tot_rev_debt
tot_rev_line rev_util bureau_score ltv tot_income;

%let x2 = purpose;

libname data 'D:\SAS_CODE\bagging';

%bagging(data = data.accepts, y = bad, numx = &x1, catx = &x2, ntrees = 10);
```

The table below is to show the result of bagging estimated out of 10 bootstrap samples. As seen, bagging prediction outperforms the prediction from the best decision tree by at least 10%. And this performance of bagging is very robust with multiple iterations and experiments.

```MAX KS =  41.9205, MIN KS =  37.9653, BAGGING KS =  47.9446

tree_id      seed         ks
0      20110613    47.9446
1      66117721    38.0739
2      73612659    41.9205
3      88775645    37.9653
4      76989116    39.7305
5      78326288    41.8533
6      67052887    39.7698
7       1826834    38.9471
8      47292499    39.2977
9      39078123    40.2813
10      15798916    40.6123
```

Written by statcompute

July 14, 2012 at 11:33 pm

Posted in Machine Learning, SAS

## A SAS Macro Implementing Bi-variate Granger Causality Test

In loss forecasting, it is often of the interest to know: 1) if a time series, e.g. macro-economic variables, is useful to predict another, e.g. portfolio loss; 2) the number of lags to contribute such predictive power. Granger (1969) proposed that A time series X is said to Granger-cause Y if lagged values of X are able to provide statistically significant information to predict the future values of Y.

A SAS macro below is showing how to implement Granger Causality test in a bi-variate sense.

```%macro causal(data = , y = , drivers = , max_lags = );
***********************************************************;
* THIS SAS MACRO IS AN IMPLEMENTATION OF BI-VARIATE       *;
* GRANGER CAUSALITY TEST PROPOSED BY GRANGER (1969)       *;
* ======================================================= *;
* PAMAMETERS:                                             *;
*  DATA     : INPUT SAS DATA TABLE                        *;
*  Y        : A CONTINUOUS TIME SERIES RESPONSE VARIABLE  *;
*  DRIVERS  : A LIST OF TIME SERIES PREDICTORS            *;
*  MAX_LAGS : MAX # OF LAGS TO SEARCH FOR CAUSAL          *;
*             RELATIONSHIPS                               *;
* ======================================================= *;
* CONTACT:                                                *;
*  WENSUI.LIU@53.COM, LOSS FORECASTING & RISK MODELING    *;
***********************************************************;

options nocenter nonumber nodate mprint mlogic symbolgen orientation = landscape
ls = 150 formchar = "|----|+|---+=|-/\<>*";

%macro granger(data = , dep = , indep = , nlag = );

%let lag_dep = ;
%let lag_indep = ;

data _tmp1;
set &data (keep = &dep &indep);

%do i = 1 %to &nlag;
lag&i._&dep = lag&i.(&dep);
lag&i._&indep = lag&i.(&indep);

%let lag_dep = &lag_dep lag&i._&dep;
%let lag_indep = &lag_indep lag&i._&indep;
%end;
run;

proc corr data = _tmp1 noprint outp = _corr1(rename = (&dep = value) where = (_type_ = 'CORR')) nosimple;
var &dep;
with lag&nlag._&indep;
run;

proc corr data = _tmp1 noprint outp = _corr2(rename = (&dep = value) where = (_type_ = 'CORR')) nosimple;
var &dep;
with lag&nlag._&indep;
partial lag&nlag._&dep;
run;

proc reg data = _tmp1 noprint;
model &dep = &lag_dep;
output out = _rest1 r = rest_e;
run;

proc reg data = _tmp1 noprint;
model &dep = &lag_dep &lag_indep;
output out = _full1 r = full_e;
run;

proc sql noprint;
select sum(full_e * full_e) into :full_sse1 from _full1;

select sum(rest_e * rest_e) into :rest_sse1 from _rest1;

select count(*) into :n from _full1;

select value into :cor1 from _corr1;

select value into :cor2 from _corr2;
quit;

data _result;
format dep \$20. ind \$20.;
dep   = "&dep";
ind   = "%upcase(&indep)";
nlag = &nlag;

corr1 = &cor1;
corr2 = &cor2;

f_test1  = ((&rest_sse1 - &full_sse1) / &nlag) / (&full_sse1 / (&n -  2 * &nlag - 1));
p_ftest1 = 1 - probf(f_test1, &nlag, &n -  2 * &nlag - 1);

chisq_test1 = (&n * (&rest_sse1 - &full_sse1)) / &full_sse1;
p_chisq1    = 1 - probchi(chisq_test1, &nlag);

format flag1 \$3.;
if max(p_ftest1, p_chisq1) < 0.01 then flag1 = "***";
else if max(p_ftest1, p_chisq1) < 0.05 then flag1 = " **";
else if max(p_ftest1, p_chisq1) < 0.1 then flag1 = "  *";
else flag1 = "   ";
run;

%mend granger;

data _in1;
set &data (keep = &y &drivers);
run;

%let var_loop = 1;

%do %while (%scan(&drivers, &var_loop) ne %str());

%let driver = %scan(&drivers, &var_loop);

%do lag_loop = 1 %to &max_lags;

%granger(data = _in1, dep = &y, indep = &driver, nlag = &lag_loop);

%if &var_loop = 1 & &lag_loop = 1 %then %do;
data _final;
set _result;
run;
%end;
%else %do;
data _final;
set _final _result;
run;
%end;
%end;

%let var_loop = %eval(&var_loop + 1);
%end;

title;
proc report data  = _last_ box spacing = 1 split = "/" nowd;
column("GRANGER CAUSALITY TEST FOR %UPCASE(&y) UPTO &MAX_LAGS LAGS"
ind nlag corr1 corr2 f_test1 chisq_test1 flag1);

define ind         / "DRIVERS"                width = 20 center group order order = data;
define nlag        / "LAG"                    width = 3  format = 3.   center order order = data;
define corr1       / "PEARSON/CORRELATION"    width = 12 format = 8.4  center;
define corr2       / "PARTIAL/CORRELATION"    width = 12 format = 8.4  center;
define f_test1     / "CAUSAL/F-STAT"          width = 12 format = 10.4 center;
define chisq_test1 / "CAUSAL/CHISQ-STAT"      width = 12 format = 10.4 center;
define flag1       / "CAUSAL/FLAG"            width = 8  right;
run;

%mend causal;

%causal(data = sashelp.citimon, y = RTRR, drivers = CCIUTC LHUR FSPCON, max_lags = 6);
```

Based upon the output shown below, it is tentative to conclude that LHUR (UNEMPLOYMENT RATE) 3 months early might be helpful to predict RTRR (RETAIL SALES).

``` ---------------------------------------------------------------------------------------
|                     GRANGER CAUSALITY TEST FOR RTRR UPTO 6 LAGS                     |
|                           PEARSON      PARTIAL       CAUSAL       CAUSAL      CAUSAL|
|      DRIVERS        LAG CORRELATION  CORRELATION     F-STAT     CHISQ-STAT      FLAG|
|-------------------------------------------------------------------------------------|
|       CCIUTC       |  1|    0.9852  |    0.1374  |     2.7339 |     2.7917 |        |
|                    |---+------------+------------+------------+------------+--------|
|                    |  2|    0.9842  |    0.1114  |     0.7660 |     1.5867 |        |
|                    |---+------------+------------+------------+------------+--------|
|                    |  3|    0.9834  |    0.0778  |     0.8186 |     2.5803 |        |
|                    |---+------------+------------+------------+------------+--------|
|                    |  4|    0.9829  |    0.1047  |     0.7308 |     3.1165 |        |
|                    |---+------------+------------+------------+------------+--------|
|                    |  5|    0.9825  |    0.0926  |     0.7771 |     4.2043 |        |
|                    |---+------------+------------+------------+------------+--------|
|                    |  6|    0.9819  |    0.0868  |     0.7085 |     4.6695 |        |
|--------------------+---+------------+------------+------------+------------+--------|
|        LHUR        |  1|   -0.7236  |    0.0011  |     0.0000 |     0.0000 |        |
|                    |---+------------+------------+------------+------------+--------|
|                    |  2|   -0.7250  |    0.0364  |     1.4136 |     2.9282 |        |
|                    |---+------------+------------+------------+------------+--------|
|                    |  3|   -0.7268  |    0.0759  |     2.4246 |     7.6428 |       *|
|                    |---+------------+------------+------------+------------+--------|
|                    |  4|   -0.7293  |    0.0751  |     2.1621 |     9.2208 |       *|
|                    |---+------------+------------+------------+------------+--------|
|                    |  5|   -0.7312  |    0.1045  |     2.1148 |    11.4422 |       *|
|                    |---+------------+------------+------------+------------+--------|
|                    |  6|   -0.7326  |    0.1365  |     1.9614 |    12.9277 |       *|
|--------------------+---+------------+------------+------------+------------+--------|
|       FSPCON       |  1|    0.9484  |    0.0431  |     0.2631 |     0.2687 |        |
|                    |---+------------+------------+------------+------------+--------|
|                    |  2|    0.9481  |    0.0029  |     0.6758 |     1.3998 |        |
|                    |---+------------+------------+------------+------------+--------|
|                    |  3|    0.9483  |   -0.0266  |     0.4383 |     1.3817 |        |
|                    |---+------------+------------+------------+------------+--------|
|                    |  4|    0.9484  |   -0.0360  |     0.9219 |     3.9315 |        |
|                    |---+------------+------------+------------+------------+--------|
|                    |  5|    0.9494  |   -0.0793  |     0.9008 |     4.8739 |        |
|                    |---+------------+------------+------------+------------+--------|
|                    |  6|    0.9492  |   -0.0999  |     0.9167 |     6.0421 |        |
---------------------------------------------------------------------------------------
```

Written by statcompute

July 8, 2012 at 2:13 pm