Monotonic WoE Binning for LGD Models

While the monotonic binning algorithm has been widely used in scorecard and PD model (Probability of Default) developments, the similar idea can be generalized to LGD (Loss Given Default) models. In the post below, two SAS macros performing the monotonic binning for LGD are demonstrated.

The first one tends to generate relatively coarse bins based on iterative grouping, which requires a longer computing time.


%macro lgd_bin1(data = , y = , x = );

%let maxbin = 20;

data _tmp1 (keep = x y);
  set &data;
  y = min(1, max(0, &y));
  x = &x;
run;

proc sql noprint;
  select
    count(distinct x) into :xflg
  from
    _last_;
quit;

%let nbin = %sysfunc(min(&maxbin, &xflg));

%if &nbin > 2 %then %do;
  %do j = &nbin %to 2 %by -1;
    proc rank data = _tmp1 groups = &j out = _data_ (keep = x rank y);
      var x;
      ranks rank;
    run;

    proc summary data = _last_ nway;
      class rank;
      output out = _tmp2 (drop = _type_ rename = (_freq_ = freq))
      sum(y) = bads  mean(y) = bad_rate 
      min(x) = minx  max(x)  = maxx;
    run;

    proc sql noprint;
      select
        case when min(bad_rate) > 0 then 1 else 0 end into :minflg
      from
        _tmp2;
 
      select
        case when max(bad_rate) < 1 then 1 else 0 end into :maxflg
      from
        _tmp2;              
    quit;

    %if &minflg = 1 & &maxflg = 1 %then %do;
      proc corr data = _tmp2 spearman noprint outs = _corr;
        var minx;
        with bad_rate;
      run;
      
      proc sql noprint;
        select
          case when abs(minx) = 1 then 1 else 0 end into :cor
        from
          _corr
        where
          _type_ = 'CORR';
      quit;
 
      %if &cor = 1 %then %goto loopout;
    %end;
  %end;
%end;

%loopout:

proc sql noprint;
create table
  _tmp3 as
select
  a.rank + 1                                           as bin,
  a.minx                                               as minx,
  a.maxx                                               as maxx,
  a.freq                                               as freq,
  a.freq / b.freq                                      as dist,
  a.bad_rate                                           as avg_lgd,
  a.bads / b.bads                                      as bpct,
  (a.freq - a.bads) / (b.freq - b.bads)                as gpct,
  log(calculated bpct / calculated gpct)               as woe,
  (calculated bpct - calculated gpct) / calculated woe as iv 
from
  _tmp2 as a, (select sum(freq) as freq, sum(bads) as bads from _tmp2) as b;
quit;

proc print data = _last_ noobs label;
  var minx maxx freq dist avg_lgd woe;
  format freq comma8. dist percent10.2;
  label
    minx    = "Lower Limit"
    maxx    = "Upper Limit"
    freq    = "Freq"
    dist    = "Dist"
    avg_lgd = "Average LGD"
    woe     = "WoE";
  sum freq dist;
run; 

%mend lgd_bin1;

The second one can generate much finer bins based on the idea of isotonic regressions and is more computationally efficient.


%macro lgd_bin2(data = , y = , x = );

data _data_ (keep = x y);
  set &data;
  y = min(1, max(0, &y));
  x = &x;
run;

proc transreg data = _last_ noprint;
  model identity(y) = monotone(x);
  output out = _tmp1 tip = _t;
run;
 
proc summary data = _last_ nway;
  class _tx;
  output out = _data_ (drop = _freq_ _type_) mean(y) = lgd;
run;

proc sort data = _last_;
  by lgd;
run;
 
data _tmp2;
  set _last_;
  by lgd;
  _idx = _n_;
  if lgd = 0 then _idx = _idx + 1;
  if lgd = 1 then _idx = _idx - 1;
run;

proc sql noprint;
create table
  _tmp3 as
select
  a.*,
  b._idx
from
  _tmp1 as a inner join _tmp2 as b
on
  a._tx = b._tx;

create table
  _tmp4 as
select
  min(a.x)                                             as minx,
  max(a.x)                                             as maxx,
  sum(a.y)                                             as bads,
  count(a.y)                                           as freq,
  count(a.y) / b.freq                                  as dist,
  mean(a.y)                                            as avg_lgd,
  sum(a.y) / b.bads                                    as bpct,
  sum(1 - a.y) / (b.freq - b.bads)                     as gpct,
  log(calculated bpct / calculated gpct)               as woe,
  (calculated bpct - calculated gpct) * calculated woe as iv
from
  _tmp3 as a, (select count(*) as freq, sum(y) as bads from _tmp3) as b
group by
  a._idx;
quit;

proc print data = _last_ noobs label;
  var minx maxx freq dist avg_lgd woe; 
  format freq comma8. dist percent10.2;
  label
    minx    = "Lower Limit"
    maxx    = "Upper Limit"
    freq    = "Freq"
    dist    = "Dist"
    avg_lgd = "Average LGD"
    woe     = "WoE";
  sum freq dist;
run; 

%mend lgd_bin2;

Below is the output comparison between two macros with the testing data downloaded from http://www.creditriskanalytics.net/datasets-private.html. Should you have any feedback, please feel free to leave me a message.

lgd1