# Yet Another Blog in Statistical Computing

I can calculate the motion of heavenly bodies but not the madness of people. -Isaac Newton

## Bumping: A Stochastic Search for the Best Model

Breiman (1996) showed how to use the bootstrap sampling technique to improve the prediction accuracy in the bagging algorithm, which has shown successful use cases in subset selection and decision trees. However, a major drawback of bagging is that it destroys the simple structure of the original model. For instance, the bagging of decision trees is not presented in a tree structure any more. As a result, bagging improves the model prediction accuracy at the cost of interpretability.

Tibshirani (1997) proposed another use case of bootstrap sampling, which was named Bumping. In the bumping algorithm, bootstrap sampling is used to estimate candidate models with the purpose to do a stochastic search for a best single model throughout the whole model space. As such, the simple structure of the original model, such as the one presented in a decision tree, will be well preserved.

A SAS macro below is showing how to implement the bumping algorithm.

```%macro bumping(data = , y = , numx = , catx = x, ntrees = 100);
***********************************************************;
* THIS SAS MACRO IS AN ATTEMPT TO IMPLEMENT BUMPING       *;
* PROPOSED BY TIBSHIRANI AND KNIGHT (1997)                *;
* ======================================================= *;
* PAMAMETERS:                                             *;
*  DATA   : INPUT SAS DATA TABLE                          *;
*  Y      : RESPONSE VARIABLE WITH 0/1 VALUE              *;
*  NUMX   : A LIST OF NUMERIC ATTRIBUTES                  *;
*  CATX   : A LIST OF CATEGORICAL ATTRIBUTES              *;
*  NTREES : # OF TREES TO DO THE BUMPING SEARCH           *;
* ======================================================= *;
* OUTPUTS:                                                *;
*  BESTTREE.TXT: A TEXT FILE USED TO SCORE THE BEST TREE  *;
*                THROUGH THE BUMPING SEARCH               *;
* ======================================================= *;
* CONTACT:                                                *;
*  WENSUI.LIU@53.COM                                      *;
***********************************************************;

options mprint mlogic nocenter nodate nonumber;

*** a random seed value subject to change ***;
%let seed = 1;

data _null_;
do i = 1 to &ntrees;
random = put(ranuni(&seed) * (10 ** 8), 8.);
name   = compress("random"||put(i, 3.), ' ');
call symput(name, random);
end;
run;

proc datasets library = data nolist;
delete catalog / memtype = catalog;
run;
quit;

proc sql noprint;
select count(*) into :nobs from &data where &y in (1, 0);
quit;

%do i = 1 %to &ntrees;
%put &&random&i;

proc surveyselect data = &data method = urs n = &nobs seed = &&random&i
out = sample&i(rename = (NumberHits = _hits)) noprint;
run;

proc dmdb data = sample&i out = db_sample&i dmdbcat = cl_sample&i;
class &y &catx;
var &numx;
target &y;
freq _hits;
run;

filename out_tree catalog "data.catalog.out_tree.source";

proc split data = db_sample&i dmdbcat = cl_sample&i
criterion    = gini
assess       = impurity
maxbranch    = 2
splitsize    = 100
subtree      = assessment
exhaustive   = 0
nsurrs       = 0;
code file    = out_tree;
input &numx   / level = interval;
input &catx   / level = nominal;
target &y     / level = binary;
freq _hits;
run;

filename in_tree catalog "data.catalog.tree&i..source";

data _null_;
infile out_tree;
input;
file in_tree;
if _n_ > 3 then put _infile_;
run;

data _tmp1(keep = p_&y.1 p_&y.0 &y);
set &data;
%include in_tree;
run;

proc printto new print = lst_out;
run;

ods output kolsmir2stats = _kstmp(where = (label1 = 'KS'));
proc npar1way wilcoxon edf data = _tmp1;
class &y;
var p_&y.1;
run;

proc printto;
run;

%if &i = 1 %then %do;
data _ks;
set _kstmp (keep = nvalue2);
tree_id = &i;
seed    = &&random&i;
ks      = round(nvalue2 * 100, 0.0001);
run;
%end;
%else %do;
data _ks;
set _ks _kstmp(in = a keep = nvalue2);
if a then do;
tree_id = &i;
seed    = &&random&i;
ks      = round(nvalue2 * 100, 0.0001);
end;
run;
%end;
%end;

proc sql noprint;
select max(ks) into :ks from _ks;

select tree_id into :best from _ks where round(ks, 0.0001) = round(&ks., 0.0001);
quit;

filename best catalog "data.catalog.tree%trim(&best).source";
filename output "BestTree.txt";

data _null_;
infile best;
input;
file output;
if _n_ = 1 then do;
put " ******************************************************; ";
put " ***** BEST TREE: TREE %trim(&best) WITH KS = &KS *****; ";
put " ******************************************************; ";
end;
put _infile_;
run;

data _out;
set _ks;

if round(ks, 0.0001) = round(&ks., 0.0001) then flag = '***';
run;

proc print data = _out noobs;
var tree_id seed ks flag;
run;

%mend bumping;

libname data 'D:\SAS_CODE\bagging';

%let x1 = tot_derog tot_tr age_oldest_tr tot_open_tr tot_rev_tr tot_rev_debt
tot_rev_line rev_util bureau_score ltv tot_income;

%let x2 = purpose;

%bumping(data = data.accepts, y = bad, numx = &x1, catx = &x2, ntrees = 50);
```

The table below is to show the result of a stochastic search for the best decision tree out of 50 trees estimated from bootstrapped samples. In the result table, the best tree has been flagged by “***”. With the related seed value, any bootstrap sample and decision trees should be replicated.

```tree_id      seed         ks      flag
1      18496257    41.1210
2      97008872    41.2568
3      39982431    39.2714
4      25939865    38.7901
5      92160258    40.9343
6      96927735    40.6441
7      54297917    41.2460
8      53169172    40.5881
9       4979403    40.9662
10       6656655    41.2006
11      81931857    41.1540
12      52387052    40.1930
13      85339431    36.7912
14       6718458    39.9277
15      95702386    39.0264
16      29719396    39.8790
17      27261179    40.1256
18      68992963    40.7699
19      97676486    37.7472
20      22650752    39.6255
21      68823655    40.3759
22      41276387    41.2282
23      55855411    41.5945    ***
24      28722561    40.6127
25      47578931    40.2973
26      84498698    38.6929
27      63452412    41.0329
28      59036467    39.1822
29      58258153    40.5223
30      37701337    40.2190
31      72836156    40.1872
32      50660353    38.5086
33      93121359    39.9043
34      92912005    40.0265
35      58966034    38.8403
36      29722285    39.7879
37      39104243    38.4006
38      47242918    39.5534
39      67952575    39.2817
40      16808835    40.4024
41      16652610    40.5237
42      87110489    39.9251
43      29878953    39.6106
44      93464176    40.5942
45      90047083    40.4422
46      56878347    40.6057
47       4954566    39.7689
48      13558826    38.7292
49      51131788    41.0891
50      43320456    41.0566
```

At last, the text file used to score the best tree is attached below, in which you should be able to see the structure of the best decision tree selected out of 50 trees.

``` ******************************************************;
***** BEST TREE: TREE 23 WITH KS =  41.5945 *****;
******************************************************;

******         LENGTHS OF NEW CHARACTER VARIABLES         ******;
LENGTH I_bad  \$   12;
LENGTH _WARN_  \$    4;

******              LABELS FOR NEW VARIABLES              ******;
LABEL _NODE_  = 'Node' ;
LABEL _LEAF_  = 'Leaf' ;
LABEL U_bad  = 'Unnormalized Into: bad' ;
LABEL _WARN_  = 'Warnings' ;

******      TEMPORARY VARIABLES FOR FORMATTED VALUES      ******;
LENGTH _ARBFMT_2 \$     12; DROP _ARBFMT_2;
_ARBFMT_2 = ' '; /* Initialize to avoid warning. */
LENGTH _ARBFMT_15 \$      5; DROP _ARBFMT_15;
_ARBFMT_15 = ' '; /* Initialize to avoid warning. */

******             ASSIGN OBSERVATION TO NODE             ******;
IF  NOT MISSING(bureau_score ) AND
662.5 <= bureau_score  THEN DO;
IF  NOT MISSING(bureau_score ) AND
721.5 <= bureau_score  THEN DO;
IF  NOT MISSING(tot_derog ) AND
tot_derog  <                  3.5 THEN DO;
IF  NOT MISSING(ltv ) AND
99.5 <= ltv  THEN DO;
IF  NOT MISSING(tot_rev_line ) AND
tot_rev_line  <                16671 THEN DO;
_ARBFMT_15 = PUT( purpose , \$5.);
%DMNORMIP( _ARBFMT_15);
IF _ARBFMT_15 IN ('LEASE' ) THEN DO;
_NODE_  =                   62;
_LEAF_  =                   29;
I_bad  = '0' ;
END;
ELSE DO;
_NODE_  =                   63;
_LEAF_  =                   30;
I_bad  = '0' ;
END;
END;
ELSE DO;
IF  NOT MISSING(ltv ) AND
131.5 <= ltv  THEN DO;
_NODE_  =                   65;
_LEAF_  =                   32;
I_bad  = '0' ;
END;
ELSE DO;
_NODE_  =                   64;
_LEAF_  =                   31;
I_bad  = '0' ;
END;
END;
END;
ELSE DO;
IF  NOT MISSING(tot_open_tr ) AND
tot_open_tr  <                  1.5 THEN DO;
_NODE_  =                   42;
_LEAF_  =                   26;
I_bad  = '0' ;
END;
ELSE DO;
IF  NOT MISSING(tot_derog ) AND
1.5 <= tot_derog  THEN DO;
_NODE_  =                   61;
_LEAF_  =                   28;
I_bad  = '0' ;
END;
ELSE DO;
_NODE_  =                   60;
_LEAF_  =                   27;
I_bad  = '0' ;
END;
END;
END;
END;
ELSE DO;
_NODE_  =                   15;
_LEAF_  =                   33;
I_bad  = '0' ;
END;
END;
ELSE DO;
IF  NOT MISSING(tot_rev_line ) AND
6453.5 <= tot_rev_line  THEN DO;
IF  NOT MISSING(ltv ) AND
122.5 <= ltv  THEN DO;
_NODE_  =                   27;
_LEAF_  =                   25;
I_bad  = '0' ;
END;
ELSE DO;
IF  NOT MISSING(tot_derog ) AND
9.5 <= tot_derog  THEN DO;
_NODE_  =                   41;
_LEAF_  =                   24;
I_bad  = '0' ;
END;
ELSE DO;
IF  NOT MISSING(tot_rev_line ) AND
tot_rev_line  <                16694 THEN DO;
_NODE_  =                   58;
_LEAF_  =                   22;
I_bad  = '0' ;
END;
ELSE DO;
_NODE_  =                   59;
_LEAF_  =                   23;
I_bad  = '0' ;
END;
END;
END;
END;
ELSE DO;
IF  NOT MISSING(ltv ) AND
ltv  <                133.5 THEN DO;
IF  NOT MISSING(tot_income ) AND
tot_income  <               2377.2 THEN DO;
IF  NOT MISSING(age_oldest_tr ) AND
age_oldest_tr  <                   57 THEN DO;
_NODE_  =                   54;
_LEAF_  =                   17;
I_bad  = '0' ;
END;
ELSE DO;
_NODE_  =                   55;
_LEAF_  =                   18;
I_bad  = '0' ;
END;
END;
ELSE DO;
IF  NOT MISSING(ltv ) AND
ltv  <                 94.5 THEN DO;
_NODE_  =                   56;
_LEAF_  =                   19;
I_bad  = '0' ;
END;
ELSE DO;
_NODE_  =                   57;
_LEAF_  =                   20;
I_bad  = '0' ;
END;
END;
END;
ELSE DO;
_NODE_  =                   25;
_LEAF_  =                   21;
I_bad  = '1' ;
END;
END;
END;
END;
ELSE DO;
IF  NOT MISSING(ltv ) AND
ltv  <                 97.5 THEN DO;
IF  NOT MISSING(bureau_score ) AND
bureau_score  <                639.5 THEN DO;
IF  NOT MISSING(tot_open_tr ) AND
3.5 <= tot_open_tr  THEN DO;
IF  NOT MISSING(tot_income ) AND
tot_income  <             2604.165 THEN DO;
_NODE_  =                   32;
_LEAF_  =                    3;
I_bad  = '0' ;
END;
ELSE DO;
IF  NOT MISSING(tot_income ) AND
7375 <= tot_income  THEN DO;
_NODE_  =                   47;
_LEAF_  =                    5;
I_bad  = '0' ;
END;
ELSE DO;
_NODE_  =                   46;
_LEAF_  =                    4;
I_bad  = '0' ;
END;
END;
END;
ELSE DO;
IF  NOT MISSING(tot_rev_line ) AND
tot_rev_line  <                 2460 THEN DO;
_NODE_  =                   30;
_LEAF_  =                    1;
I_bad  = '0' ;
END;
ELSE DO;
_NODE_  =                   31;
_LEAF_  =                    2;
I_bad  = '1' ;
END;
END;
END;
ELSE DO;
IF  NOT MISSING(tot_income ) AND
tot_income  <             9291.835 THEN DO;
IF  NOT MISSING(tot_tr ) AND
13.5 <= tot_tr  THEN DO;
_NODE_  =                   35;
_LEAF_  =                    8;
I_bad  = '0' ;
END;
ELSE DO;
IF  NOT MISSING(bureau_score ) AND
bureau_score  <                646.5 THEN DO;
_NODE_  =                   48;
_LEAF_  =                    6;
I_bad  = '0' ;
END;
ELSE DO;
_NODE_  =                   49;
_LEAF_  =                    7;
I_bad  = '0' ;
END;
END;
END;
ELSE DO;
_NODE_  =                   19;
_LEAF_  =                    9;
I_bad  = '1' ;
END;
END;
END;
ELSE DO;
IF  NOT MISSING(tot_rev_line ) AND
tot_rev_line  <               1218.5 THEN DO;
IF  NOT MISSING(age_oldest_tr ) AND
115 <= age_oldest_tr  THEN DO;
_NODE_  =                   21;
_LEAF_  =                   11;
I_bad  = '0' ;
END;
ELSE DO;
_NODE_  =                   20;
_LEAF_  =                   10;
I_bad  = '1' ;
END;
END;
ELSE DO;
IF  NOT MISSING(bureau_score ) AND
bureau_score  <                  566 THEN DO;
_NODE_  =                   22;
_LEAF_  =                   12;
I_bad  = '1' ;
END;
ELSE DO;
IF  NOT MISSING(tot_rev_line ) AND
13717.5 <= tot_rev_line  THEN DO;
IF  NOT MISSING(tot_rev_debt ) AND
tot_rev_debt  <                11884 THEN DO;
_NODE_  =                   52;
_LEAF_  =                   15;
I_bad  = '0' ;
END;
ELSE DO;
_NODE_  =                   53;
_LEAF_  =                   16;
I_bad  = '0' ;
END;
END;
ELSE DO;
IF  NOT MISSING(ltv ) AND
ltv  <                 99.5 THEN DO;
_NODE_  =                   50;
_LEAF_  =                   13;
I_bad  = '1' ;
END;
ELSE DO;
_NODE_  =                   51;
_LEAF_  =                   14;
I_bad  = '0' ;
END;
END;
END;
END;
END;
END;

****************************************************************;
******          END OF DECISION TREE SCORING CODE         ******;
****************************************************************;
```