Yet Another Blog in Statistical Computing

I can calculate the motion of heavenly bodies but not the madness of people. -Isaac Newton

A SAS Macro for Bootstrap Aggregating (Bagging)

Proposed by Breiman (1996), bagging is short for “bootstrap aggregating”, a machine learning method that improves prediction accuracy by simply averaging the predictions of multiple classifiers, each developed on a bootstrap sample drawn from the original training set.

Beyond its statistical elegance, bagging is attractive in modern machine learning because each decision tree in the bag is computationally cheap to construct and is built completely independently of the others, which makes bagging ideal for parallel computing.

The SAS macro demonstrated below is an attempt to test the bagging algorithm on a consumer banking dataset.

%macro bagging(data = , y = , numx = , catx = , ntrees = 50);
***********************************************************;
* THIS SAS MACRO IS AN ATTEMPT TO IMPLEMENT BAGGING       *;
* PROPOSED BY LEO BREIMAN (1996)                          *;
* ======================================================= *;
* PARAMETERS:                                             *;
*  DATA   : INPUT SAS DATA TABLE                          *;
*  Y      : RESPONSE VARIABLE WITH 0/1 VALUE              *;
*  NUMX   : A LIST OF NUMERIC ATTRIBUTES                  *;
*  CATX   : A LIST OF CATEGORICAL ATTRIBUTES              *;
*  NTREES : # OF TREES TO DO THE BAGGING                  *;
* ======================================================= *;
* NOTES:                                                  *;
*  1. ROWS WHERE Y IS NOT 0 OR 1 ARE DROPPED UP FRONT     *;
*  2. NUMX OR CATX MAY BE EMPTY, BUT NOT BOTH             *;
* ======================================================= *;
* OUTPUTS:                                                *;
*  1. A SAS CATALOG FILE NAMED "TREEFILES" IN THE WORKING *;
*     DIRECTORY CONTAINING ALL SCORING FILES IN BAGGING   *;
*  2. A LST FILE SHOWING KS STATISTICS OF THE BAGGING     *;
*     CLASSIFIER AND EACH TREE CLASSIFIER                 *;
* ======================================================= *;
* CONTACT:                                                *;
*  WENSUI.LIU@53.COM, LOSS FORECASTING & RISK MODELING    *;
***********************************************************;

/* keep working macro variables out of the caller symbol table */
%local i seed nobs max_ks min_ks bag_ks;

/* fail fast on unusable arguments instead of generating invalid steps */
%if %length(&data) = 0 or %length(&y) = 0 %then %do;
  %put ERROR: BAGGING requires both DATA= and Y= to be specified.;
  %return;
%end;

%if %length(&numx) = 0 and %length(&catx) = 0 %then %do;
  %put ERROR: BAGGING requires at least one attribute in NUMX= or CATX=.;
  %return;
%end;

%if %length(&ntrees) = 0 %then %do;
  %put ERROR: BAGGING requires NTREES= to be a positive integer.;
  %return;
%end;
%else %if %sysevalf(&ntrees < 1) %then %do;
  %put ERROR: BAGGING requires NTREES= to be a positive integer.;
  %return;
%end;

options mprint mlogic nocenter nodate nonumber;

*** a random seed value subject to change ***;
%let seed = 20110613;

*** assign a library to the working folder ***;
libname _path '';

*** generate a series of random seeds, one macro variable per tree ***;
data _null_;
  do i = 1 to &ntrees;
    random = put(ranuni(&seed) * (10 ** 8), 8.);
    /* an 8 wide format keeps the names valid beyond 999 trees */
    name   = compress("random"||put(i, 8.), ' ');
    call symput(name, random);
  end;
run;

*** clean up catalog files in the library ***;
proc datasets library = _path nolist;
  delete TreeFiles tmp / memtype = catalog;
run;
quit;

*** size of each bootstrap sample = number of rows with a valid target ***;
proc sql noprint;
  select count(*) into :nobs from &data where &y in (1, 0);
quit;

*** modeling table restricted to the same rows that were counted above ***;
*** so that the sample size and the sampled population agree           ***;
data _tmp1 (keep = &y &numx &catx _id_);
  set &data;
  where &y in (1, 0);
  _id_ + 1;
run;

%do i = 1 %to &ntrees;
  %put &&random&i;

  *** generate bootstrap samples for bagging ***;
  *** urs = sampling with replacement and _hits = times a row was drawn ***;
  proc surveyselect data = _tmp1 method = urs n = &nobs seed = &&random&i
    out = sample&i(rename = (NumberHits = _hits)) noprint;
  run;

  *** generate data mining datasets for sas e-miner ***;
  proc dmdb data = sample&i out = db_sample&i dmdbcat = cl_sample&i;
    class &y &catx;
    %if %length(&numx) > 0 %then %do;
    var &numx;
    %end;
    target &y;
    freq _hits;
  run;

  *** create a sas temporary catalog to contain sas output ***;
  filename out_tree catalog "_path.tmp.out_tree.source";

  *** create decision tree mimicking CART ***;
  proc split data = db_sample&i dmdbcat = cl_sample&i
    criterion    = gini
    assess       = impurity
    maxbranch    = 2
    splitsize    = 100
    subtree      = assessment
    exhaustive   = 0
    nsurrs       = 0;
    code file    = out_tree;
    /* emit an input statement only for a non-empty attribute list */
    %if %length(&numx) > 0 %then %do;
    input &numx   / level = interval;
    %end;
    %if %length(&catx) > 0 %then %do;
    input &catx   / level = nominal;
    %end;
    target &y     / level = binary;
    freq _hits;
  run;

  *** create a permanent sas catalog to contain all tree outputs ***;
  filename in_tree catalog "_path.TreeFiles.tree&i..source";

  *** copy the scoring code and skip its first three lines ***;
  data _null_;
    infile out_tree;
    input;
    file in_tree;
    if _n_ > 3 then put _infile_;
  run;

  *** score the original data by each tree output file ***;
  data _score&i (keep = p_&y.1 p_&y.0 &y _id_);
    set _tmp1;
    %include in_tree;
  run;

  *** calculate KS stat ***;
  proc printto new print = lst_out;
  run;

  ods output kolsmir2stats = _kstmp(where = (label1 = 'KS'));
  proc npar1way wilcoxon edf data = _score&i;
    class &y.;
    var p_&y.1;
  run;

  proc printto;
  run;

  *** first tree creates the accumulators and later trees append to them ***;
  %if &i = 1 %then %do;
    data _tmp2;
      set _score&i;
    run;

    data _ks;
      set _kstmp (keep = nvalue2);
      tree_id = &i;
      seed    = &&random&i;
      ks      = round(nvalue2 * 100, 0.0001);
    run;
  %end;
  %else %do;
    data _tmp2;
      set _tmp2 _score&i;
    run;

    data _ks;
      set _ks _kstmp(in = a keep = nvalue2);
      if a then do;
        tree_id = &i;
        seed    = &&random&i;
        ks      = round(nvalue2 * 100, 0.0001);
      end;
    run;
  %end;

%end;

*** aggregate predictions from all trees in the bag ***;
*** the target is constant within _id_ so its mean is the target itself ***;
proc summary data = _tmp2 nway;
  class _id_;
  output out = _tmp3(drop = _type_ rename = (_freq_ = freq))
  mean(p_&y.1) =  mean(p_&y.0) =  mean(&y) = ;
run;

*** calculate bagging KS stat ***;
proc printto new print = lst_out;
run;

ods output kolsmir2stats = _kstmp(where = (label1 = 'KS'));
proc npar1way wilcoxon edf data = _tmp3;
  class &y;
  var p_&y.1;
run;

proc printto;
run;

*** tree_id 0 marks the bagged classifier in the summary table ***;
data _ks;
  set _ks _kstmp (in = a keep = nvalue2);
  if a then do;
    tree_id = 0;
    seed    = &seed;
    ks      = round(nvalue2 * 100, 0.0001);
  end;
run;

proc sort data = _ks;
  by tree_id;
run;

proc sql noprint;
  select max(ks) into :max_ks from _ks where tree_id > 0;

  select min(ks) into :min_ks from _ks where tree_id > 0;

  select ks into :bag_ks from _ks where tree_id = 0;
quit;

*** summarize the performance of bagging classifier and each tree in the bag ***;
title "MAX KS = &max_ks, MIN KS = &min_ks, BAGGING KS = &bag_ks";
proc print data = _ks noobs;
  var tree_id seed ks;
run;
title;

*** drop the temporary catalog and keep TreeFiles for later scoring ***;
proc datasets library = _path nolist;
  delete tmp / memtype = catalog;
run;
quit;

%mend bagging;

*** library holding the consumer banking table used in the demo ***;
libname data 'D:\SAS_CODE\bagging';

*** numeric attributes passed to the macro ***;
%let x1 = tot_derog tot_tr age_oldest_tr tot_open_tr tot_rev_tr tot_rev_debt tot_rev_line rev_util bureau_score ltv tot_income;

*** categorical attributes passed to the macro ***;
%let x2 = purpose;

*** grow a bag of 10 trees on the accepts table with bad as the 0/1 target ***;
%bagging(
  data   = data.accepts,
  y      = bad,
  numx   = &x1,
  catx   = &x2,
  ntrees = 10
);

The table below shows the result of bagging estimated from 10 bootstrap samples. As seen, the bagging prediction outperforms the prediction from the best single decision tree by at least 10% in terms of KS. This performance of bagging is also very robust across multiple iterations and experiments.

MAX KS =  41.9205, MIN KS =  37.9653, BAGGING KS =  47.9446

tree_id      seed         ks
    0      20110613    47.9446
    1      66117721    38.0739
    2      73612659    41.9205
    3      88775645    37.9653
    4      76989116    39.7305
    5      78326288    41.8533
    6      67052887    39.7698
    7       1826834    38.9471
    8      47292499    39.2977
    9      39078123    40.2813
   10      15798916    40.6123
Advertisements

Written by statcompute

July 14, 2012 at 11:33 pm

Posted in Machine Learning, SAS

%d bloggers like this: