Yet Another Blog in Statistical Computing

I can calculate the motion of heavenly bodies but not the madness of people. -Isaac Newton

Bumping: A Stochastic Search for the Best Model

Breiman (1996) showed how to use the bootstrap sampling technique to improve the prediction accuracy in the bagging algorithm, which has shown successful use cases in subset selection and decision trees. However, a major drawback of bagging is that it destroys the simple structure of the original model. For instance, the bagging of decision trees is not presented in a tree structure any more. As a result, bagging improves the model prediction accuracy at the cost of interpretability.

Tibshirani (1997) proposed another use case of bootstrap sampling, which was named Bumping. In the bumping algorithm, bootstrap sampling is used to estimate candidate models with the purpose to do a stochastic search for a best single model throughout the whole model space. As such, the simple structure of the original model, such as the one presented in a decision tree, will be well preserved.

A SAS macro below is showing how to implement the bumping algorithm.

%macro bumping(data = , y = , numx = , catx = x, ntrees = 100);
***********************************************************;
* THIS SAS MACRO IS AN ATTEMPT TO IMPLEMENT BUMPING       *;
* PROPOSED BY TIBSHIRANI AND KNIGHT (1997)                *;
* ======================================================= *;
* PAMAMETERS:                                             *;
*  DATA   : INPUT SAS DATA TABLE                          *;
*  Y      : RESPONSE VARIABLE WITH 0/1 VALUE              *;
*  NUMX   : A LIST OF NUMERIC ATTRIBUTES                  *;
*  CATX   : A LIST OF CATEGORICAL ATTRIBUTES              *;
*  NTREES : # OF TREES TO DO THE BUMPING SEARCH           *;
* ======================================================= *;
* OUTPUTS:                                                *;
*  BESTTREE.TXT: A TEXT FILE USED TO SCORE THE BEST TREE  *;
*                THROUGH THE BUMPING SEARCH               *;
* ======================================================= *;
* CONTACT:                                                *;
*  WENSUI.LIU@53.COM                                      *;
***********************************************************;

options mprint mlogic nocenter nodate nonumber;

*** a random seed value subject to change ***;
%let seed = 1;

data _null_;
  do i = 1 to &ntrees;
    random = put(ranuni(&seed) * (10 ** 8), 8.);
    name   = compress("random"||put(i, 3.), ' ');
    call symput(name, random);
  end;
run;    

proc datasets library = data nolist;
  delete catalog / memtype = catalog;
run;
quit;

proc sql noprint;
  select count(*) into :nobs from &data where &y in (1, 0);
quit;

%do i = 1 %to &ntrees;
  %put &&random&i;

  proc surveyselect data = &data method = urs n = &nobs seed = &&random&i
    out = sample&i(rename = (NumberHits = _hits)) noprint;
  run;
  
  proc dmdb data = sample&i out = db_sample&i dmdbcat = cl_sample&i;
    class &y &catx;
    var &numx;
    target &y;
    freq _hits;
  run;

  filename out_tree catalog "data.catalog.out_tree.source";
  
  proc split data = db_sample&i dmdbcat = cl_sample&i
    criterion    = gini
    assess       = impurity
    maxbranch    = 2
    splitsize    = 100
    subtree      = assessment
    exhaustive   = 0 
    nsurrs       = 0;
    code file    = out_tree;
    input &numx   / level = interval;
    input &catx   / level = nominal;
    target &y     / level = binary;
    freq _hits;
  run;  

  filename in_tree catalog "data.catalog.tree&i..source";

  data _null_;
    infile out_tree;
    input;
    file in_tree;
    if _n_ > 3 then put _infile_;
  run;

  data _tmp1(keep = p_&y.1 p_&y.0 &y);
    set &data;
    %include in_tree;
  run;

  proc printto new print = lst_out;
  run;

  ods output kolsmir2stats = _kstmp(where = (label1 = 'KS'));
  proc npar1way wilcoxon edf data = _tmp1;
    class &y;
    var p_&y.1;
  run;

  proc printto;
  run;

  %if &i = 1 %then %do;
    data _ks;
      set _kstmp (keep = nvalue2);
      tree_id = &i;
      seed    = &&random&i;
      ks      = round(nvalue2 * 100, 0.0001);
    run;
  %end;    
  %else %do;
    data _ks;
      set _ks _kstmp(in = a keep = nvalue2);
      if a then do;
        tree_id = &i;
        seed    = &&random&i;
        ks      = round(nvalue2 * 100, 0.0001);
      end;
    run;
  %end;  
%end;

proc sql noprint;
  select max(ks) into :ks from _ks;
  
  select tree_id into :best from _ks where round(ks, 0.0001) = round(&ks., 0.0001);
quit;

filename best catalog "data.catalog.tree%trim(&best).source";
filename output "BestTree.txt";

data _null_;
  infile best;
  input;
  file output;
  if _n_ = 1 then do;
    put " ******************************************************; ";
    put " ***** BEST TREE: TREE %trim(&best) WITH KS = &KS *****; ";
    put " ******************************************************; ";    
  end;
  put _infile_;
run;

data _out;
  set _ks;

  if round(ks, 0.0001) = round(&ks., 0.0001) then flag = '***';
run;

proc print data = _out noobs;
  var tree_id seed ks flag;
run;

%mend bumping;

libname data 'D:\SAS_CODE\bagging';

%let x1 = tot_derog tot_tr age_oldest_tr tot_open_tr tot_rev_tr tot_rev_debt
            tot_rev_line rev_util bureau_score ltv tot_income;

%let x2 = purpose;

%bumping(data = data.accepts, y = bad, numx = &x1, catx = &x2, ntrees = 50);

The table below is to show the result of a stochastic search for the best decision tree out of 50 trees estimated from bootstrapped samples. In the result table, the best tree has been flagged by “***”. With the related seed value, any bootstrap sample and decision trees should be replicated.

tree_id      seed         ks      flag
    1      18496257    41.1210        
    2      97008872    41.2568        
    3      39982431    39.2714        
    4      25939865    38.7901        
    5      92160258    40.9343        
    6      96927735    40.6441        
    7      54297917    41.2460        
    8      53169172    40.5881        
    9       4979403    40.9662        
   10       6656655    41.2006        
   11      81931857    41.1540        
   12      52387052    40.1930        
   13      85339431    36.7912        
   14       6718458    39.9277        
   15      95702386    39.0264        
   16      29719396    39.8790        
   17      27261179    40.1256        
   18      68992963    40.7699        
   19      97676486    37.7472        
   20      22650752    39.6255        
   21      68823655    40.3759        
   22      41276387    41.2282        
   23      55855411    41.5945    *** 
   24      28722561    40.6127        
   25      47578931    40.2973        
   26      84498698    38.6929        
   27      63452412    41.0329        
   28      59036467    39.1822        
   29      58258153    40.5223        
   30      37701337    40.2190        
   31      72836156    40.1872        
   32      50660353    38.5086        
   33      93121359    39.9043        
   34      92912005    40.0265        
   35      58966034    38.8403        
   36      29722285    39.7879        
   37      39104243    38.4006        
   38      47242918    39.5534        
   39      67952575    39.2817        
   40      16808835    40.4024        
   41      16652610    40.5237        
   42      87110489    39.9251        
   43      29878953    39.6106        
   44      93464176    40.5942        
   45      90047083    40.4422        
   46      56878347    40.6057        
   47       4954566    39.7689        
   48      13558826    38.7292        
   49      51131788    41.0891        
   50      43320456    41.0566        

At last, the text file used to score the best tree is attached below, in which you should be able to see the structure of the best decision tree selected out of 50 trees.

 ******************************************************; 
 ***** BEST TREE: TREE 23 WITH KS =  41.5945 *****; 
 ******************************************************; 
 
 ******         LENGTHS OF NEW CHARACTER VARIABLES         ******;
 LENGTH I_bad  $   12; 
 LENGTH _WARN_  $    4; 
 
 ******              LABELS FOR NEW VARIABLES              ******;
 LABEL _NODE_  = 'Node' ;
 LABEL _LEAF_  = 'Leaf' ;
 LABEL P_bad0  = 'Predicted: bad=0' ;
 LABEL P_bad1  = 'Predicted: bad=1' ;
 LABEL I_bad  = 'Into: bad' ;
 LABEL U_bad  = 'Unnormalized Into: bad' ;
 LABEL _WARN_  = 'Warnings' ;
 
 
 ******      TEMPORARY VARIABLES FOR FORMATTED VALUES      ******;
 LENGTH _ARBFMT_2 $     12; DROP _ARBFMT_2; 
 _ARBFMT_2 = ' '; /* Initialize to avoid warning. */
 LENGTH _ARBFMT_15 $      5; DROP _ARBFMT_15; 
 _ARBFMT_15 = ' '; /* Initialize to avoid warning. */
 
 
 ******             ASSIGN OBSERVATION TO NODE             ******;
 IF  NOT MISSING(bureau_score ) AND 
                  662.5 <= bureau_score  THEN DO;
   IF  NOT MISSING(bureau_score ) AND 
                    721.5 <= bureau_score  THEN DO;
     IF  NOT MISSING(tot_derog ) AND 
       tot_derog  <                  3.5 THEN DO;
       IF  NOT MISSING(ltv ) AND 
                         99.5 <= ltv  THEN DO;
         IF  NOT MISSING(tot_rev_line ) AND 
           tot_rev_line  <                16671 THEN DO;
           _ARBFMT_15 = PUT( purpose , $5.);
            %DMNORMIP( _ARBFMT_15);
           IF _ARBFMT_15 IN ('LEASE' ) THEN DO;
             _NODE_  =                   62;
             _LEAF_  =                   29;
             P_bad0  =     0.77884615384615;
             P_bad1  =     0.22115384615384;
             I_bad  = '0' ;
             U_bad  =                    0;
             END;
           ELSE DO;
             _NODE_  =                   63;
             _LEAF_  =                   30;
             P_bad0  =     0.91479820627802;
             P_bad1  =     0.08520179372197;
             I_bad  = '0' ;
             U_bad  =                    0;
             END;
           END;
         ELSE DO;
           IF  NOT MISSING(ltv ) AND 
                            131.5 <= ltv  THEN DO;
             _NODE_  =                   65;
             _LEAF_  =                   32;
             P_bad0  =     0.79166666666666;
             P_bad1  =     0.20833333333333;
             I_bad  = '0' ;
             U_bad  =                    0;
             END;
           ELSE DO;
             _NODE_  =                   64;
             _LEAF_  =                   31;
             P_bad0  =     0.96962025316455;
             P_bad1  =     0.03037974683544;
             I_bad  = '0' ;
             U_bad  =                    0;
             END;
           END;
         END;
       ELSE DO;
         IF  NOT MISSING(tot_open_tr ) AND 
           tot_open_tr  <                  1.5 THEN DO;
           _NODE_  =                   42;
           _LEAF_  =                   26;
           P_bad0  =                  0.9;
           P_bad1  =                  0.1;
           I_bad  = '0' ;
           U_bad  =                    0;
           END;
         ELSE DO;
           IF  NOT MISSING(tot_derog ) AND 
                              1.5 <= tot_derog  THEN DO;
             _NODE_  =                   61;
             _LEAF_  =                   28;
             P_bad0  =      0.9047619047619;
             P_bad1  =     0.09523809523809;
             I_bad  = '0' ;
             U_bad  =                    0;
             END;
           ELSE DO;
             _NODE_  =                   60;
             _LEAF_  =                   27;
             P_bad0  =     0.98779134295227;
             P_bad1  =     0.01220865704772;
             I_bad  = '0' ;
             U_bad  =                    0;
             END;
           END;
         END;
       END;
     ELSE DO;
       _NODE_  =                   15;
       _LEAF_  =                   33;
       P_bad0  =     0.75925925925925;
       P_bad1  =     0.24074074074074;
       I_bad  = '0' ;
       U_bad  =                    0;
       END;
     END;
   ELSE DO;
     IF  NOT MISSING(tot_rev_line ) AND 
                     6453.5 <= tot_rev_line  THEN DO;
       IF  NOT MISSING(ltv ) AND 
                        122.5 <= ltv  THEN DO;
         _NODE_  =                   27;
         _LEAF_  =                   25;
         P_bad0  =      0.7471264367816;
         P_bad1  =     0.25287356321839;
         I_bad  = '0' ;
         U_bad  =                    0;
         END;
       ELSE DO;
         IF  NOT MISSING(tot_derog ) AND 
                            9.5 <= tot_derog  THEN DO;
           _NODE_  =                   41;
           _LEAF_  =                   24;
           P_bad0  =     0.53846153846153;
           P_bad1  =     0.46153846153846;
           I_bad  = '0' ;
           U_bad  =                    0;
           END;
         ELSE DO;
           IF  NOT MISSING(tot_rev_line ) AND 
             tot_rev_line  <                16694 THEN DO;
             _NODE_  =                   58;
             _LEAF_  =                   22;
             P_bad0  =     0.84235294117647;
             P_bad1  =     0.15764705882352;
             I_bad  = '0' ;
             U_bad  =                    0;
             END;
           ELSE DO;
             _NODE_  =                   59;
             _LEAF_  =                   23;
             P_bad0  =     0.92375366568914;
             P_bad1  =     0.07624633431085;
             I_bad  = '0' ;
             U_bad  =                    0;
             END;
           END;
         END;
       END;
     ELSE DO;
       IF  NOT MISSING(ltv ) AND 
         ltv  <                133.5 THEN DO;
         IF  NOT MISSING(tot_income ) AND 
           tot_income  <               2377.2 THEN DO;
           IF  NOT MISSING(age_oldest_tr ) AND 
             age_oldest_tr  <                   57 THEN DO;
             _NODE_  =                   54;
             _LEAF_  =                   17;
             P_bad0  =                 0.75;
             P_bad1  =                 0.25;
             I_bad  = '0' ;
             U_bad  =                    0;
             END;
           ELSE DO;
             _NODE_  =                   55;
             _LEAF_  =                   18;
             P_bad0  =     0.93150684931506;
             P_bad1  =     0.06849315068493;
             I_bad  = '0' ;
             U_bad  =                    0;
             END;
           END;
         ELSE DO;
           IF  NOT MISSING(ltv ) AND 
             ltv  <                 94.5 THEN DO;
             _NODE_  =                   56;
             _LEAF_  =                   19;
             P_bad0  =     0.86301369863013;
             P_bad1  =     0.13698630136986;
             I_bad  = '0' ;
             U_bad  =                    0;
             END;
           ELSE DO;
             _NODE_  =                   57;
             _LEAF_  =                   20;
             P_bad0  =     0.67766497461928;
             P_bad1  =     0.32233502538071;
             I_bad  = '0' ;
             U_bad  =                    0;
             END;
           END;
         END;
       ELSE DO;
         _NODE_  =                   25;
         _LEAF_  =                   21;
         P_bad0  =      0.4090909090909;
         P_bad1  =     0.59090909090909;
         I_bad  = '1' ;
         U_bad  =                    1;
         END;
       END;
     END;
   END;
 ELSE DO;
   IF  NOT MISSING(ltv ) AND 
     ltv  <                 97.5 THEN DO;
     IF  NOT MISSING(bureau_score ) AND 
       bureau_score  <                639.5 THEN DO;
       IF  NOT MISSING(tot_open_tr ) AND 
                          3.5 <= tot_open_tr  THEN DO;
         IF  NOT MISSING(tot_income ) AND 
           tot_income  <             2604.165 THEN DO;
           _NODE_  =                   32;
           _LEAF_  =                    3;
           P_bad0  =     0.54237288135593;
           P_bad1  =     0.45762711864406;
           I_bad  = '0' ;
           U_bad  =                    0;
           END;
         ELSE DO;
           IF  NOT MISSING(tot_income ) AND 
                             7375 <= tot_income  THEN DO;
             _NODE_  =                   47;
             _LEAF_  =                    5;
             P_bad0  =     0.57575757575757;
             P_bad1  =     0.42424242424242;
             I_bad  = '0' ;
             U_bad  =                    0;
             END;
           ELSE DO;
             _NODE_  =                   46;
             _LEAF_  =                    4;
             P_bad0  =     0.81102362204724;
             P_bad1  =     0.18897637795275;
             I_bad  = '0' ;
             U_bad  =                    0;
             END;
           END;
         END;
       ELSE DO;
         IF  NOT MISSING(tot_rev_line ) AND 
           tot_rev_line  <                 2460 THEN DO;
           _NODE_  =                   30;
           _LEAF_  =                    1;
           P_bad0  =     0.69411764705882;
           P_bad1  =     0.30588235294117;
           I_bad  = '0' ;
           U_bad  =                    0;
           END;
         ELSE DO;
           _NODE_  =                   31;
           _LEAF_  =                    2;
           P_bad0  =     0.34343434343434;
           P_bad1  =     0.65656565656565;
           I_bad  = '1' ;
           U_bad  =                    1;
           END;
         END;
       END;
     ELSE DO;
       IF  NOT MISSING(tot_income ) AND 
         tot_income  <             9291.835 THEN DO;
         IF  NOT MISSING(tot_tr ) AND 
                           13.5 <= tot_tr  THEN DO;
           _NODE_  =                   35;
           _LEAF_  =                    8;
           P_bad0  =     0.94039735099337;
           P_bad1  =     0.05960264900662;
           I_bad  = '0' ;
           U_bad  =                    0;
           END;
         ELSE DO;
           IF  NOT MISSING(bureau_score ) AND 
             bureau_score  <                646.5 THEN DO;
             _NODE_  =                   48;
             _LEAF_  =                    6;
             P_bad0  =                    1;
             P_bad1  =                    0;
             I_bad  = '0' ;
             U_bad  =                    0;
             END;
           ELSE DO;
             _NODE_  =                   49;
             _LEAF_  =                    7;
             P_bad0  =     0.73604060913705;
             P_bad1  =     0.26395939086294;
             I_bad  = '0' ;
             U_bad  =                    0;
             END;
           END;
         END;
       ELSE DO;
         _NODE_  =                   19;
         _LEAF_  =                    9;
         P_bad0  =     0.35714285714285;
         P_bad1  =     0.64285714285714;
         I_bad  = '1' ;
         U_bad  =                    1;
         END;
       END;
     END;
   ELSE DO;
     IF  NOT MISSING(tot_rev_line ) AND 
       tot_rev_line  <               1218.5 THEN DO;
       IF  NOT MISSING(age_oldest_tr ) AND 
                          115 <= age_oldest_tr  THEN DO;
         _NODE_  =                   21;
         _LEAF_  =                   11;
         P_bad0  =     0.60273972602739;
         P_bad1  =      0.3972602739726;
         I_bad  = '0' ;
         U_bad  =                    0;
         END;
       ELSE DO;
         _NODE_  =                   20;
         _LEAF_  =                   10;
         P_bad0  =     0.28776978417266;
         P_bad1  =     0.71223021582733;
         I_bad  = '1' ;
         U_bad  =                    1;
         END;
       END;
     ELSE DO;
       IF  NOT MISSING(bureau_score ) AND 
         bureau_score  <                  566 THEN DO;
         _NODE_  =                   22;
         _LEAF_  =                   12;
         P_bad0  =     0.15384615384615;
         P_bad1  =     0.84615384615384;
         I_bad  = '1' ;
         U_bad  =                    1;
         END;
       ELSE DO;
         IF  NOT MISSING(tot_rev_line ) AND 
                        13717.5 <= tot_rev_line  THEN DO;
           IF  NOT MISSING(tot_rev_debt ) AND 
             tot_rev_debt  <                11884 THEN DO;
             _NODE_  =                   52;
             _LEAF_  =                   15;
             P_bad0  =     0.85869565217391;
             P_bad1  =     0.14130434782608;
             I_bad  = '0' ;
             U_bad  =                    0;
             END;
           ELSE DO;
             _NODE_  =                   53;
             _LEAF_  =                   16;
             P_bad0  =     0.65972222222222;
             P_bad1  =     0.34027777777777;
             I_bad  = '0' ;
             U_bad  =                    0;
             END;
           END;
         ELSE DO;
           IF  NOT MISSING(ltv ) AND 
             ltv  <                 99.5 THEN DO;
             _NODE_  =                   50;
             _LEAF_  =                   13;
             P_bad0  =     0.41489361702127;
             P_bad1  =     0.58510638297872;
             I_bad  = '1' ;
             U_bad  =                    1;
             END;
           ELSE DO;
             _NODE_  =                   51;
             _LEAF_  =                   14;
             P_bad0  =     0.61769352290679;
             P_bad1  =      0.3823064770932;
             I_bad  = '0' ;
             U_bad  =                    0;
             END;
           END;
         END;
       END;
     END;
   END;
 
 ****************************************************************;
 ******          END OF DECISION TREE SCORING CODE         ******;
 ****************************************************************;
Advertisements

Written by statcompute

June 30, 2012 at 1:32 pm

%d bloggers like this: