************************************************************************** ************************************************************************** PROGRAM: MFBUS.SAS DATE: 9/13/10 CODER: J.D. Opdyke Managing Director, Quantitative Strategies DataMineIt PURPOSE: Run and compare different SAS bootstrap algorithms including OPDY, PSS, DA, Out-SM, and A4.8. See "Much Faster Bootstraps Using SAS" by J.D. Opdyke for detailed explanations of the different algorithms. INPUTS: Each macro is completely modular and accepts 5 macro parameters: bsmp_size = number of observations in each of the bootstrap samples num_bsmps = number of bootstrap samples indata = the input dataset (including libname) byvars = the "by variables" defining the strata bootvar = the variable to be bootstrapped OUTPUTS: A uniquely named SAS dataset, the name of which contains the name of the algorithm, bsmp_size, and num_bsmps. Variables in the output dataset include the mean of the bootstrap statistics, the standard deviation of the bootstrap statistics, and the 2.5th and 97.5th percentiles of the bootstrap statistics. Additional or different bootstrap statistics are easily added. CAVEATS: The "by variables" defining the strata in the input datasets are, in OPDY and DA, assumed to be character variables. The directory c:\MFBUS must be created before the program is run. ************************************************************************** ************************************************************************** ***; options label symbolgen fullstimer yearcutoff=1950 nocenter ls = 256 ps = 51 msymtabmax=max mprint mlogic minoperator mindelimiter=' ' cleanup ; libname MFBUS "c:\MFBUS"; %macro makedata(strata_size=, numsegs=, numgeogs=); %let numstrata = %eval(&numsegs.*&numgeogs.); *** For the price variable, multiplying the random variates by the loop counter dramatically skews the values of the sample space, thus ensuring that any erroneous non-random sampling will be spotted quickly and easily. ***; data MFBUS.price_data_&numstrata.strata_&strata_size.(keep=geography segment price sortedby=geography segment); format segment geography $8.; array seg{3} $ _TEMPORARY_ ('segment1' 'segment2' 'segment3'); array geog{4} $ _TEMPORARY_ ('geog1' 'geog2' 'geog3' 'geog4'); strata_size = 1* &strata_size.; do x=1 to &numgeogs.; geography=geog{x}; do j=1 to &numsegs.; segment=seg{j}; if j=1 then do i=1 to strata_size; price=rand('UNIFORM')*10*i; output; end; else if j=2 then do i=1 to strata_size; price=rand('NORMAL')*10*i; output; end; else if j=3 then do i=1 to strata_size; price=rand('LOGNORMAL')*10*i; output; end; end; end; run; %mend makedata; %makedata(strata_size=10000, numsegs=2, numgeogs=1); %makedata(strata_size=10000, numsegs=2, numgeogs=3); %makedata(strata_size=10000, numsegs=3, numgeogs=4); %makedata(strata_size=100000, numsegs=2, numgeogs=1); %makedata(strata_size=100000, numsegs=2, numgeogs=3); %makedata(strata_size=100000, numsegs=3, numgeogs=4); %makedata(strata_size=1000000, numsegs=2, numgeogs=1); %makedata(strata_size=1000000, numsegs=2, numgeogs=3); %makedata(strata_size=1000000, numsegs=3, numgeogs=4); %makedata(strata_size=10000000, numsegs=2, numgeogs=1); %makedata(strata_size=10000000, numsegs=2, numgeogs=3); %makedata(strata_size=10000000, numsegs=3, numgeogs=4); *** OPDY_Boot ***; *** OPDY_Boot ***; *** OPDY_Boot ***; %macro OPDY_Boot(bsmp_size=, num_bsmps=, indata=, byvars=, bootvar=); *** the only assumption made within this macro is that the byvars are all character variables; *** obtain last byvar, count byvars, and assign each byvar into macro variables for easy access/processing; %let last_byvar = %scan(&byvars.,-1); %let num_byvars = %sysfunc(countw(&byvars.)); %do i=1 %to &num_byvars.; %let byvar&i. = %scan(&byvars.,&i.); %end; *** macro obtains number of observations in a dataset; %macro nobs(dset); %if %sysfunc(exist(&dset)) %then %do; %let dsid=%sysfunc(open(&dset)); %let nobs=%sysfunc(attrn(&dsid,nobs)); %let dsid=%sysfunc(close(&dsid)); %end; %else %let nobs=0; &nobs %mend nobs; *** initialize macro variables used later; %let bmean =; %let bstd =; %let b975 =; %let b025 =; *** obtain counts and cumulated counts for each strata; proc summary data=&indata. nway; class &byvars.; var &bootvar.; output out=byvar_nobs(keep=_FREQ_ &byvars.) n=junk; run; %let n_byvals = %nobs(byvar_nobs); data cum_temp(keep=_FREQ_ cum_prev_freq); set byvar_nobs(keep=_FREQ_); retain cum_prev_freq 0; prev_freq = lag(_FREQ_); if _n_=1 then prev_freq = 0; cum_prev_freq = sum(cum_prev_freq, prev_freq); run; *** put counts, cumulated counts, and byvar values into macro strings; proc sql noprint; select cum_prev_freq into :cum_prev_freqs separated by ' ' from cum_temp; quit; proc sql noprint; select _freq_ into :freqs separated by ' ' from cum_temp; quit; %do i=1 %to &num_byvars.; proc sql noprint; select &&byvar&i. into :byvals&i. separated by ' ' from byvar_nobs; quit; %end; *** get size of largest stratum; proc summary data=byvar_nobs(keep=_FREQ_) nway; var _FREQ_; output out=byvar_nobs(keep=max_freq) max=max_freq; run; data _null_; set byvar_nobs; call symputx('max_freq',max_freq); run; *** save results of each stratum in cumulated macro variables instead of outputting to a dataset on the data step to lessen intermediate memory requirements ***; data _null_; set &indata.(keep=&byvars. &bootvar.); by &byvars.; * Modified 01-03-11 -- change can double speed of execution for large datasets.; * array bmeans{&num_bsmps.} bm1-bm&num_bsmps.; array bmeans{&num_bsmps.} _TEMPORARY_; array temp{&max_freq.} _TEMPORARY_; retain byval_counter 0 cum_prev_freq 0; temp[_n_-cum_prev_freq]=&bootvar.; if last.&last_byvar. then do; byval_counter+1; freq = 1* scan("&freqs.", byval_counter,' '); num_bsmps = &num_bsmps.*1; bsmp_size = &bsmp_size.*1; do m=1 to num_bsmps; x=0; do n=1 to bsmp_size; x = temp[floor(ranuni(-1)*freq) + 1] + x; end; bmeans[m] = x/bsmp_size; end; * Modified 01-03-11 -- change can double speed of execution for large datasets.; * bmean = mean(of bm1-bm&num_bsmps.); * bstd = std(of bm1-bm&num_bsmps.); * b975 = pctl(97.5, of bm1-bm&num_bsmps.); * b025 = pctl(2.5, of bm1-bm&num_bsmps.); bmean = mean(of bmeans[*]); bstd = std(of bmeans[*]); b975 = pctl(97.5, of bmeans[*]); b025 = pctl(2.5, of bmeans[*]); call symput('bmean',symget('bmean')||" "||compress(bmean)); call symput('bstd',symget('bstd')||" "||compress(bstd)); call symput('b975',symget('b975')||" "||compress(b975)); call symput('b025',symget('b025')||" "||compress(b025)); cum_prev_freq = 1*scan("&cum_prev_freqs.",byval_counter+1,' '); end; run; *** obtain and assign the format of each byvar, all of which are assumed to be character variables; data lens(keep=lens); set &indata.(keep=&byvars. firstobs=1 obs=1); do i=1 to &num_byvars.; lens = vlengthx(scan("&byvars.",i)); output; end; run; proc sql noprint; select lens into :alllens separated by ' ' from lens; quit; %macro assign_formats; %do i=1 %to &num_byvars.; &&byvar&i. $%scan(&alllens.,&i.). %end; %mend assign_formats; *** assign each byvar value for each stratum; %macro assign_byvar_vals(which_strata=); %do j=1 %to &num_byvars.; &&byvar&j. = scan("&&byvals&j.",&which_strata.,' '); %end; %mend assign_byvar_vals; *** unwind and assign all the cumulated macro variables; data MFBUS.OPDY_boots_&bsmp_size._&num_bsmps.(sortedby=&byvars. drop=n_byvals i); n_byvals = 1*&n_byvals.; format %assign_formats; do i=1 to n_byvals; bmean = 1*scan("&bmean.",i,' '); bstd = 1*scan("&bstd.",i,' '); b025 = 1*scan("&b025.",i,' '); b975 = 1*scan("&b975.",i,' '); %assign_byvar_vals(which_strata = i) output; end; run; *** optional; proc datasets lib=work memtype=data kill nodetails; run; %mend OPDY_Boot; %OPDY_Boot(bsmp_size=500, num_bsmps=500, indata=MFBUS.price_data_6strata_100000, byvars=geography segment, bootvar=price ); *** PSS ***; *** PSS ***; *** PSS ***; %macro PSS(bsmp_size=, num_bsmps=, indata=, byvars=, bootvar=); proc surveyselect data=&indata. method=urs sampsize=&bsmp_size. rep=&num_bsmps. seed=-1 out=Boot_PSS_Samps(drop=expectedhits samplingweight) noprint; strata &byvars.; run; proc summary data=Boot_PSS_Samps nway; class &byvars. replicate; weight numberhits; var &bootvar.; output out=Boot_PSS_avgs(sortedby=&byvars. keep=&byvars. &bootvar.) mean=; run; proc univariate data=Boot_PSS_avgs noprint; by &byvars.; var &bootvar.; output out=MFBUS.Boot_PSS_&bsmp_size._&num_bsmps. mean=bmean std=bstd pctlpts = 2.5 97.5 pctlpre=b ; run; *** optional; proc datasets lib=work memtype=data kill nodetails; run; %mend PSS; %PSS(bsmp_size=500, num_bsmps=500, indata=MFBUS.price_data_6strata_100000, byvars=geography segment, bootvar=price ); *** Boot_DA ***; *** Boot_DA ***; *** Boot_DA ***; %macro Boot_DA(bsmp_size=, num_bsmps=, indata=, byvars=, bootvar=); *** the only assumption made within this macro is that the byvars are all character variables; *** obtain last byvar, count byvars, and assign each byvar into macro variables for easy access/processing; %let last_byvar = %scan(&byvars.,-1); %let num_byvars = %sysfunc(countw(&byvars.)); %do i=1 %to &num_byvars.; %let byvar&i. = %scan(&byvars.,&i.); %end; *** macro obtains number of observations in a dataset; %macro nobs(dset); %if %sysfunc(exist(&dset)) %then %do; %let dsid=%sysfunc(open(&dset)); %let nobs=%sysfunc(attrn(&dsid,nobs)); %let dsid=%sysfunc(close(&dsid)); %end; %else %let nobs=0; &nobs %mend nobs; *** obtain counts and cumulated counts for each strata; proc summary data=&indata. nway; class &byvars.; var &bootvar.; output out=byvar_nobs(keep=_FREQ_ &byvars.) n=junk; run; %let n_byvals = %nobs(byvar_nobs); data cum_temp(keep=_FREQ_ cum_prev_freq); set byvar_nobs(keep=_FREQ_); retain cum_prev_freq 0; prev_freq = lag(_FREQ_); if _n_=1 then prev_freq = 0; cum_prev_freq = sum(cum_prev_freq, prev_freq); run; *** put counts, cumulated counts, and byvar values into macro strings; proc sql noprint; select cum_prev_freq into :cum_prev_freqs separated by ' ' from cum_temp; quit; proc sql noprint; select _freq_ into :freqs separated by ' ' from cum_temp; quit; %do i=1 %to &num_byvars.; proc sql noprint; select &&byvar&i. into :byvals&i. separated by ' ' from byvar_nobs; quit; %end; *** obtain and assign the format of each byvar, all of which are assumed to be character variables; data lens(keep=lens); set &indata.(keep=&byvars. firstobs=1 obs=1); do i=1 to &num_byvars.; lens = vlengthx(scan("&byvars.",i)); output; end; run; proc sql noprint; select lens into :alllens separated by ' ' from lens; quit; %macro assign_formats; %do i=1 %to &num_byvars.; &&byvar&i. $%scan(&alllens.,&i.). %end; %mend assign_formats; *** assign each byvar value for each stratum; %macro assign_byvar_vals(which_strata=); %do j=1 %to &num_byvars.; &&byvar&j. = scan("&&byvals&j.",&which_strata.,' '); %end; %mend assign_byvar_vals; data MFBUS.boot_da_&bsmp_size._&num_bsmps.(keep=&byvars. bmean bstd b975 b025); n_byvals=&n_byvals.; bsmp_size = 1* &bsmp_size.; num_bsmps = 1* &num_bsmps.; format %assign_formats; do byval_counter=1 to n_byvals; freq = 1* scan("&freqs.", byval_counter,' '); cum_prev_freq = 1* scan("&cum_prev_freqs.", byval_counter,' '); %assign_byvar_vals(which_strata = byval_counter) array bmeans{&num_bsmps.} bm1-bm&num_bsmps. (&num_bsmps.*0); do bsample=1 to num_bsmps; xsum=0; do obs=1 to bsmp_size; obsnum = floor(freq*ranuni(-1))+1+cum_prev_freq; set &indata.(keep=&bootvar.) point=obsnum; xsum = xsum + &bootvar.; end; bmeans[bsample] = xsum/bsmp_size; end; bmean = mean(of bm1-bm&num_bsmps.); bstd = std(of bm1-bm&num_bsmps.); b025 = pctl(2.5, of bm1-bm&num_bsmps.); b975 = pctl(97.5, of bm1-bm&num_bsmps.); output; end; stop; run; *** optional; proc datasets lib=work memtype=data kill nodetails; run; %mend Boot_DA; %Boot_DA(bsmp_size=500, num_bsmps=500, indata=MFBUS.price_data_6strata_100000, byvars=geography segment, bootvar=price ); *** Boot_SM ***; *** Boot_SM ***; *** Boot_SM ***; %macro Boot_SM(bsmp_size=, num_bsmps=, indata=, byvars=, bootvar=); *** obtain counts by strata; proc summary data=&indata. nway; class &byvars.; var &bootvar.; output out=byvar_nobs(keep=_FREQ_ &byvars.) n=junk; run; *** output bootstrap observations to sample in a nested loop; data bsmp; set byvar_nobs(keep=&byvars. _FREQ_); num_bsmps = 1*&num_bsmps.; bsmp_size = 1*&bsmp_size.; do sample=1 to num_bsmps; do k=1 to bsmp_size; obsnum = floor(_FREQ_*ranuni(-1))+1; output; end; end; run; proc sort data=bsmp; by &byvars. obsnum; run; proc sort data=&indata. out=price_data; by &byvars.; run; *** create record counter on input dataset; %let last_byvar = %scan(&byvars.,-1); data boot; set price_data; retain obsnum 0; by &byvars.; if first.&last_byvar. then obsnum=1; else obsnum+1; run; proc sort data=boot; by &byvars. obsnum; run; *** merge bootstrap sample observations with input dataset to obtain bootstrap samples; data boot error ; merge boot(in=inboot) bsmp(in=inbsmp) ; by &byvars. obsnum; if inboot & inbsmp then output boot; else if inboot~=1 then output error; run; *** summarize bootstrap samples; proc summary data=boot(keep=&byvars. sample &bootvar.) nway; class &byvars. sample; var &bootvar.; output out=boot_means(keep=&byvars. &bootvar. sortedby=&byvars.) mean=; run; proc univariate data=boot_means(keep=&byvars. &bootvar.) noprint; by &byvars.; var &bootvar.; output out=MFBUS.boot_Out_SM_&bsmp_size._&num_bsmps. mean = b_mean std = b_std pctlpts = 2.5 97.5 pctlpre=b ; run; *** optional; proc datasets lib=work memtype=data kill nodetails; run; %mend Boot_SM; %Boot_SM(bsmp_size=500, num_bsmps=500, indata=MFBUS.price_data_6strata_100000, byvars=geography segment, bootvar=price ); *** ALGO4p8 ***; *** ALGO4p8 ***; *** ALGO4p8 ***; %macro Algo4p8(bsmp_size=, num_bsmps=, indata=, byvars=, bootvar=); *** obtain counts by strata and merge onto input dataset; proc summary data=&indata. nway; class &byvars.; var &bootvar.; output out=byvar_nobs(keep=_FREQ_ &byvars.) n=junk; run; proc sort data=&indata. out=price_data; by &byvars.; run; data price_data error ; merge byvar_nobs(in=innobs keep=&byvars. _FREQ_) &indata.(in=infull) ; by &byvars.; if innobs & infull then output price_data; else output error; run; *** simultaneously create all bootstrap samples and summarize results of each as the end of stratum is reached; %let last_byvar = %scan(&byvars.,-1); data MFBUS.boot_algo4p8_&bsmp_size._&num_bsmps.(keep=&byvars. bstd bmean b025 b975); set price_data end=lastrec; by &byvars.; num_bsmps = &num_bsmps.; array bsummeans{&num_bsmps.} bsummean_1-bsummean_&num_bsmps.; array bcds{&num_bsmps.} bcd_1-bcd_&num_bsmps.; retain bsummean_1-bsummean_&num_bsmps. 0 bcd_1-bcd_&num_bsmps. &bsmp_size. counter 0; counter+1; p = 1/(_FREQ_-counter+1); do i=1 to num_bsmps; if bcds[i]>0 then do; x = rand('BINOMIAL',p,bcds[i]); bsummeans[i]=x*&bootvar. + bsummeans[i]; bcds[i]=bcds[i]-x; end; end; if last.&last_byvar. then do; bsmp_size = 1*&bsmp_size.; do h=1 to num_bsmps; bsummeans[h] = bsummeans[h]/bsmp_size; end; bmean = mean(of bsummean_1-bsummean_&num_bsmps.); bstd = std(of bsummean_1-bsummean_&num_bsmps.); b025 = pctl(2.5, of bsummean_1-bsummean_&num_bsmps.); b975 = pctl(97.5, of bsummean_1-bsummean_&num_bsmps.); output; if lastrec~=1 then do; counter = 0; do x=1 to num_bsmps; bsummeans[x]=0; bcds[x]=bsmp_size; end; end; end; run; *** optional; proc datasets lib=work memtype=data kill nodetails; run; %mend Algo4p8; %Algo4p8(bsmp_size=500, num_bsmps=500, indata=MFBUS.price_data_6strata_100000, byvars=geography segment, bootvar=price );