************************************************************************** ************************************************************************** PROGRAM: MFPTUS.SAS DATE: 12/30/10 CODER: J.D. Opdyke Managing Director, Quantitative Strategies DataMineIt PURPOSE: Run and compare 6 different SAS Permutation Tests algorithms including OPDN, OPDN_alt, PROCSS, PROCMT, PROCNPAR, and BEBB_SIM. See "Permutation Tests (and Sampling with Replacement) Orders of Magnitude Faster Using SAS" by J.D. Opdyke for detailed explanations of the different algorithms. INPUTS: Each macro is completely modular and accepts 6 macro parameter values (PROCMT accepts 7): indata = the input dataset (including libname) outdata = the input dataset (including libname) byvars = the "by variables" defining the strata (these are character variables) samp2var = the (character) variable defining the two samples in each stratum: TEST ("T") and CONTROL ("C"), with those values permvar = the variable to be tested with permutation tests num_psmps = number of Permutation Test samples seed = and optional random number seed (must be an integer or blank) left_or_right = "left" or "right" depending on whether the user wants lower or upper one-tailed p-values, respectively, in addition to the two-tailed p-value (only for PROCMT) OUTPUTS: A SAS dataset, named by the user via the macro variable outdata, which contains the following variables: 1) the "by variables" defining the strata 2) the name of the permuted variable 3) the size (# of records) of the permutation samples (this should always be the smaller of the two samples (TEST v. CONTROL)) 4) the number of permutation samples 5) the left, right, and two-tailed p-values corresponding to the following hypotheses: p_left = Pr(x>X | Ho: Test>=Control) p_right = Pr(xX | Ho: Test>=Control) p_right = Pr(xpsums[num_psmps] THEN DO; p_left=num_psmps; p_right=0; p_both=0; END; ELSE DO; *** For non-extreme cases, start with shorter tail for less looping.; if pmed>=sumpvar then do; do z=1 to num_psmps; if sumpvar>=psums[z] then p_left+1; else do; lastbinnum = z-1; distance_left = pmean - psums[z-1]; leave; end; end; *** Avoid loop for other (larger) p-value. If sumpvar equals last bin, p_right = 1 - p_left + lastbinsize. Otherwise, p_right = 1 - p_left. ***; if sumpvar = psums[lastbinnum] then do; lastbinsize=1; do k=lastbinnum to 1 by -1; if psums[k]=psums[k-1] then lastbinsize+1; leave; end; p_right = num_psmps - p_left + lastbinsize; end; else p_right = num_psmps - p_left; end; else do; do z=num_psmps to 1 by -1; if sumpvar<=psums[z] then p_right+1; else do; lastbinnum = z+1; distance_right = psums[z+1] - pmean; leave; end; end; *** Avoid loop for other (larger) p-value. If psum equals last bin, p_left = 1 - p_right + lastbinsize. Otherwise, p_left = 1 - p_right. ***; if sumpvar = psums[lastbinnum] then do; lastbinsize=1; do k=lastbinnum to num_psmps; if psums[k]=psums[k+1] then lastbinsize+1; else leave; end; p_left = num_psmps - p_right + lastbinsize; end; else p_left = num_psmps - p_right; end; *** Base 2-sided p-value on distance from mean of last (i.e. least extreme) bin of smaller p-value. This is common practice. ***; if p_left= distance_left then p_both+1; else leave; end; end; else if p_left>p_right then do; p_both = p_right; do z=1 to num_psmps; if (pmean - psums[z]) >= distance_right then p_both+1; else leave; end; end; else p_both=num_psmps; *** Account for possibility, due to psum=a particular bin value, that p_both>num_psmps. ***; p_both = min(p_both,num_psmps); END; *** If CONTROL sample is smaller than TEST (which is atypical), reverse *** p-values, as empirical distribution is mirror of itself.; if &samp2var.="C" then do; hold = p_left; p_left = p_right; p_right = hold; end; p_left = p_left / num_psmps; p_right = p_right / num_psmps; p_both = p_both / num_psmps; length permvar $32; retain permvar "&permvar"; output; run; data &outdata.; set &outdata.(keep=&byvars. permvar num_psmps n_psamp p_left p_right p_both); label permvar = "Permuted Variable" n_psamp = "Size of Permutation Samples" num_psmps = "# of Permutation Samples" p_left = "Left p-value" p_right = "Right p-value" p_both = "Two-Tailed p-value" ; run; *** Optional.; proc datasets lib=work memtype=data kill nodetails; run; %MEND OPDN_alt; %OPDN_alt(num_psmps = 1000, indata = MFPTUS.pricing_data_2strata_100000, outdata = MFPTUS.OPDN_alt_100000_2strata, byvars = geography segment, samp2var = cntrl_test, permvar = price, seed = ); *** OPDN ***; *** OPDN ***; *** OPDN ***; %MACRO OPDN(num_psmps=, indata=, outdata=, byvars=, samp2var=, permvar=, seed= ); *** The only assumption made within this macro is that the byvars are all character variables and the input dataset is sorted by the "by variables" that define the strata. Also, the "TEST" and "CONTROL" samples are defined by a variable "cntrl_test" formatted as $1. containing "T" and "C" character strings, respectively, as values. ***; *** Obtain the last byvar, count the byvars, and assign each byvar into numbered macro variables for easy access/processing. ***; %let last_byvar = %scan(&byvars.,-1); %let num_byvars = %sysfunc(countw(&byvars.)); %do i=1 %to &num_byvars.; %let byvar&i. = %scan(&byvars.,&i.); %end; *** If user does not pass a value to the optional macro variable "seed," use -1 based on the time of day. ***; %if %sysevalf(%superq(seed)=,boolean) %then %let seed=-1; *** Obtain counts and cumulated counts for each strata.; proc summary data=&indata. nway; class &byvars. &samp2var.; var &permvar.; output out=byvar_sum(keep = _FREQ_ &byvars. &samp2var. sumpvar) sum = sumpvar ; run; *** Identify and keep the smaller of the two samples for more efficient sampling. Below, invert the empirical permutation distribution if CONTROL sample is smaller that TEST sample (which is not typical). That will remain consistent with output variables corresponding to: p_left = Pr(x>X | Ho: Test>=Control) p_right = Pr(xpsums[num_psmps] THEN DO; p_left=num_psmps; p_right=0; p_both=0; END; ELSE DO; *** For non-extreme cases, start with shorter tail for less looping.; if pmed>=psum then do; do z=1 to num_psmps; if psum>=psums[z] then p_left+1; else do; lastbinnum = z-1; distance_left = pmean - psums[z-1]; leave; end; end; *** Avoid loop for other (larger) p-value. If psum equals last bin, p_right = 1 - p_left + lastbinsize. Otherwise, p_right = 1 - p_left. ***; if psum = psums[lastbinnum] then do; lastbinsize=1; do k=lastbinnum to 1 by -1; if psums[k]=psums[k-1] then lastbinsize+1; leave; end; p_right = num_psmps - p_left + lastbinsize; end; else p_right = num_psmps - p_left; end; else do; do z=num_psmps to 1 by -1; if psum<=psums[z] then p_right+1; else do; lastbinnum = z+1; distance_right = psums[z+1] - pmean; leave; end; end; *** Avoid loop for other (larger) p-value. If psum equals last bin, p_left = 1 - p_right + lastbinsize. Otherwise, p_left = 1 - p_right. ***; if psum = psums[lastbinnum] then do; lastbinsize=1; do k=lastbinnum to num_psmps; if psums[k]=psums[k+1] then lastbinsize+1; else leave; end; p_left = num_psmps - p_right + lastbinsize; end; else p_left = num_psmps - p_right; end; *** Base 2-sided p-value on distance from mean of last (i.e. least extreme) bin of smaller p-value. This is common practice. ***; if p_left= distance_left then p_both+1; else leave; end; end; else if p_left>p_right then do; p_both = p_right; do z=1 to num_psmps; if (pmean - psums[z]) >= distance_right then p_both+1; else leave; end; end; else p_both=num_psmps; *** Account for possibility, due to psum=a particular bin value, that p_both>num_psmps. ***; p_both = min(p_both,num_psmps); END; p_left = p_left / num_psmps; p_right = p_right / num_psmps; p_both = p_both / num_psmps; *** If CONTROL sample is smaller than TEST (which is atypical), reverse *** p-values, as empirical distribution is mirror of itself.; if "C"=COMPRESS(UPCASE(scan("&samp2var.", byval_counter,' ')),' ') then do; hold = p_left; p_left = p_right; p_right = hold; end; *** Cumulate key macro variables to save results.; call symput('p_left',symget('p_left')||" "||compress(p_left)); call symput('p_right',symget('p_right')||" "||compress(p_right)); call symput('p_both',symget('p_both')||" "||compress(p_both)); cum_prev_freq = 1*scan("&cum_prev_freqs.",byval_counter+1,' '); end; run; *** Obtain and assign the format of each byvar, all of which are assumed to be character variables. ***; data lens(keep=lens); set &indata.(keep=&byvars. firstobs=1 obs=1); do i=1 to &num_byvars.; lens = vlengthx(scan("&byvars.",i)); output; end; run; proc sql noprint; select lens into :alllens separated by ' ' from lens; quit; %macro assign_formats; %do i=1 %to &num_byvars.; &&byvar&i. $%scan(&alllens.,&i.). %end; %mend assign_formats; *** Assign each byvar value for each stratum.; %macro assign_byvar_vals(which_strata=); %do j=1 %to &num_byvars.; &&byvar&j. = scan("&&byvals&j.",&which_strata.,' '); %end; %mend assign_byvar_vals; *** Unwind and assign all the cumulated macro variables.; data &outdata.(sortedby=&byvars. drop=n_byvals i); n_byvals = 1*&n_byvals.; format %assign_formats; do i=1 to n_byvals; length permvar $32; permvar = "&permvar."; n_psamp = 1 * scan("&freqs.", i,' '); num_psmps = &num_psmps.; p_left = 1*scan("&p_left.",i,' '); p_right = 1*scan("&p_right.",i,' '); p_both = 1*scan("&p_both.",i,' '); %assign_byvar_vals(which_strata = i) label permvar = "Permuted Variable" n_psamp = "Size of Permutation Samples" num_psmps = "# of Permutation Samples" p_left = "Left p-value" p_right = "Right p-value" p_both = "Two-Tailed p-value" ; output; end; run; *** Optional.; proc datasets lib=work memtype=data kill nodetails; run; %MEND OPDN; %OPDN(num_psmps = 1000, indata = MFPTUS.pricing_data_2strata_100000, outdata = MFPTUS.OPDN_100000_2strata, byvars = geography segment, samp2var = cntrl_test, permvar = price, seed = ); *** PROC_SS ***; *** PROC_SS ***; *** PROC_SS ***; %MACRO PROCSS(num_psmps=, indata=, outdata=, byvars=, samp2var=, permvar=, seed= ); *** If user does not pass a value to the optional macro variable "seed," use -1 based on the time of day. ***; %if %sysevalf(%superq(seed)=,boolean) %then %let seed=-1; *** Obtain counts and cumulated counts for each strata.; proc summary data=&indata. nway; class &byvars. &samp2var.; var &permvar.; output out=byvar_sum(keep = _FREQ_ &byvars. &samp2var. sumpvar) sum = sumpvar ; run; *** Identify and keep the smaller of the two samples for more efficient sampling. Below, invert the empirical permutation distribution if CONTROL sample is smaller that TEST sample (which is not typical). That will remain consistent with output variables corresponding to: p_left = Pr(x>X | Ho: Test>=Control) p_right = Pr(xpsums[num_psmps] THEN DO; p_left=num_psmps; p_right=0; p_both=0; END; ELSE DO; *** For non-extreme cases, start with shorter tail for less looping.; if pmed>=sumpvar then do; do z=1 to num_psmps; if sumpvar>=psums[z] then p_left+1; else do; lastbinnum = z-1; distance_left = pmean - psums[z-1]; leave; end; end; *** Avoid loop for other (larger) p-value. If sumpvar equals last bin, p_right = 1 - p_left + lastbinsize. Otherwise, p_right = 1 - p_left. ***; if sumpvar = psums[lastbinnum] then do; lastbinsize=1; do k=lastbinnum to 1 by -1; if psums[k]=psums[k-1] then lastbinsize+1; leave; end; p_right = num_psmps - p_left + lastbinsize; end; else p_right = num_psmps - p_left; end; else do; do z=num_psmps to 1 by -1; if sumpvar<=psums[z] then p_right+1; else do; lastbinnum = z+1; distance_right = psums[z+1] - pmean; leave; end; end; *** Avoid loop for other (larger) p-value. If psum equals last bin, p_left = 1 - p_right + lastbinsize. Otherwise, p_left = 1 - p_right. ***; if sumpvar = psums[lastbinnum] then do; lastbinsize=1; do k=lastbinnum to num_psmps; if psums[k]=psums[k+1] then lastbinsize+1; else leave; end; p_left = num_psmps - p_right + lastbinsize; end; else p_left = num_psmps - p_right; end; *** Base 2-sided p-value on distance from mean of last (i.e. least extreme) bin of smaller p-value. This is common practice. ***; if p_left= distance_left then p_both+1; else leave; end; end; else if p_left>p_right then do; p_both = p_right; do z=1 to num_psmps; if (pmean - psums[z]) >= distance_right then p_both+1; else leave; end; end; else p_both=num_psmps; *** Account for possibility, due to psum=a particular bin value, that p_both>num_psmps. ***; p_both = min(p_both,num_psmps); END; p_left = p_left / num_psmps; p_right = p_right / num_psmps; p_both = p_both / num_psmps; *** If CONTROL sample is smaller than TEST (which is atypical), reverse *** p-values, as empirical distribution is mirror of itself.; if &samp2var.="C" then do; hold = p_left; p_left = p_right; p_right = hold; end; label permvar = "Permuted Variable" n_psamp = "Size of Permutation Samples" num_psmps = "# of Permutation Samples" p_left = "Left p-value" p_right = "Right p-value" p_both = "Two-Tailed p-value" ; output &outdata.; end; else output error; run; proc datasets lib=work memtype=data kill nodetails; run; %MEND PROCSS; %PROCSS(num_psmps = 1000, indata = MFPTUS.pricing_data_2strata_100000, outdata = MFPTUS.PROCSS_100000_2strata, byvars = geography segment, samp2var = cntrl_test, permvar = price, seed = ); *** PROC_MT ***; *** PROC_MT ***; *** PROC_MT ***; %MACRO PROCMT(num_psmps=, indata=, outdata=, byvars=, samp2var=, permvar=, seed=, left_or_right= ); *** If user does not pass a value to the optional macro variable "seed," generate a random seed and use it for all three proc multtests below (although SAS OnlineDoc says seed=-1 should work, it does not). ***; %if %sysevalf(%superq(seed)=,boolean) %then %let seed=%sysfunc(ceil(%sysevalf(1000000000*%sysfunc(ranuni(-1))))); *** Un/comment test statements below to perform permutation tests based on different assumptions about the variance structure of the samples.; ***; proc multtest data = &indata. nsample = &num_psmps. seed = &seed. out = mt_output_results_2t(keep=&byvars. perm_p rename=(perm_p=xp_both)) permutation noprint; by &byvars.; class &samp2var.; * test mean (&permvar. / DDFM=SATTERTHWAITE); test mean (&permvar.); run; *** To make runtime results comparable to PROC NPAR1WAY, which provides only two-tailed p-value and the smaller of the right or left p-values, run two of the three PROC MULTTESTs and calculate the second tail as one minus the given tail, which will usually be very close to the actual value unless the data is highly discretized. ***; proc multtest data = &indata. nsample = &num_psmps. seed = &seed. %if %UPCASE(%sysfunc(compress(&left_or_right.)))=RIGHT %then %do; out = mt_output_results_up(keep=&byvars. perm_p rename=(perm_p=xp_right)) %end; %if %UPCASE(%sysfunc(compress(&left_or_right.)))=LEFT %then %do; out = mt_output_results_low(keep=&byvars. perm_p rename=(perm_p=xp_left)) %end; permutation noprint; by &byvars.; class &samp2var.; %if %UPCASE(%sysfunc(compress(&left_or_right.)))=RIGHT %then %do; * test mean (&permvar. / upper DDFM=SATTERTHWAITE); test mean (&permvar. / upper); %end; %if %UPCASE(%sysfunc(compress(&left_or_right.)))=LEFT %then %do; * test mean (&permvar. / lower DDFM=SATTERTHWAITE); test mean (&permvar. / lower); %end; run; proc summary data=&indata. nway; class &byvars. &samp2var.; var &permvar.; output out=byvar_frq(keep = _FREQ_ &byvars.) n = toss ; run; %let last_byvar = %scan(&byvars.,-1); data byvar_frq(keep=&byvars. xn_psamp sortedby=&byvars.); set byvar_frq(rename=(_FREQ_=xn_psamp)); by &byvars.; retain lag_FREQ; if first.&last_byvar. then lag_FREQ = xn_psamp; else do; if xn_psamp<=lag_FREQ then output; else do; xn_psamp = lag_FREQ; output; end; end; run; data &outdata.(drop=xn_psamp xp_left xp_both) error ; %if %UPCASE(%sysfunc(compress(&left_or_right.)))=LEFT %then %do; merge mt_output_results_low(in=inlow) mt_output_results_2t(in=in2t) byvar_frq(in=infrq) ; if in2t & inlow & infrq then do; format permvar $32.; permvar = "&permvar."; n_psamp = xn_psamp; num_psmps = 1*&num_psmps.; p_left = xp_left; p_right = .; %end; %if %UPCASE(%sysfunc(compress(&left_or_right.)))=RIGHT %then %do; merge mt_output_results_up(in=inup) mt_output_results_2t(in=in2t) byvar_frq(in=infrq) ; if in2t & inup & infrq then do; format permvar $32.; permvar = "&permvar."; n_psamp = xn_psamp; num_psmps = 1*&num_psmps.; p_right = xp_right; p_left = .; %end; p_both = xp_both; label permvar = "Permuted Variable" n_psamp = "Size of Permutation Samples" num_psmps = "# of Permutation Samples" p_left = "Left p-value" p_right = "Right p-value" p_both = "Two-Tailed p-value" ; output &outdata.; end; else output error; run; proc datasets lib=work memtype=data kill nodetails; run; %MEND PROCMT; %PROCMT(num_psmps = 1000, indata = MFPTUS.pricing_data_2strata_100000, outdata = MFPTUS.PROCMT_100000_2strata, byvars = geography segment, samp2var = cntrl_test, permvar = price, seed = , left_or_right = left ); *** PROCNPAR ***; *** PROCNPAR ***; *** PROCNPAR ***; %MACRO PROCNPAR(num_psmps=, indata=, outdata=, byvars=, samp2var=, permvar=, seed= ); *** If user does not pass a value to the optional macro variable "seed," use -1 based on the time of day. ***; %if %sysevalf(%superq(seed)=,boolean) %then %let seed=-1; ods listing close; proc npar1way data = &indata. scores = data ; var &permvar.; by &byvars.; class &samp2var.; exact scores=data / n=&num_psmps. seed=&seed.; ods output DataScoresMC = hold(keep = &byvars. Name1 Label1 nValue1 where = (Label1 = "Estimate") ); run; ods listing; proc transpose data = hold(drop=Label1) out = &outdata.(drop = _NAME_); by &byvars.; id Name1; var nValue1; run; proc summary data=&indata. nway; class &byvars. &samp2var.; var &permvar.; output out=byvar_frq(keep = _FREQ_ &byvars. &samp2var.) n = toss ; run; %let last_byvar = %scan(&byvars.,-1); data byvar_frq(keep=&byvars. permvar n_psamp num_psmps &samp2var. sortedby=&byvars.); set byvar_frq; by &byvars.; format permvar $32.; retain permvar "&permvar." lag_FREQ lag_samp2 ; n_psamp = _FREQ_; num_psmps = 1*&num_psmps.; if first.&last_byvar. then do; lag_FREQ = n_psamp; lag_samp2 = &samp2var.; end; else do; if n_psamp<=lag_FREQ then output; else do; n_psamp = lag_FREQ; &samp2var. = lag_samp2; output; end; end; run; data &outdata.(drop=hold mcpl_data mcpr_data mcp2_data &samp2var.) error ; merge byvar_frq(in=infrq) &outdata.(in=inresults) ; by &byvars.; if inresults & infrq then do; p_left = mcpl_data; p_right = mcpr_data; p_both = mcp2_data; label permvar = "Permuted Variable" n_psamp = "Size of Permutation Samples" num_psmps = "# of Permutation Samples" p_left = "Left p-value" p_right = "Right p-value" p_both = "Two-Tailed p-value" ; *** If CONTROL sample is smaller than TEST (which is atypical), reverse *** p-values, as empirical distribution is mirror of itself.; if &samp2var.="C" then do; hold = p_left; p_left = p_right; p_right = hold; end; output &outdata.; end; else output error; run; %MEND PROCNPAR; %PROCNPAR(num_psmps = 1000, indata = MFPTUS.pricing_data_2strata_100000, outdata = MFPTUS.PROCNPAR_100000_2strata, byvars = geography segment, samp2var = cntrl_test, permvar = price, seed = ); *** BEBB_SIM ***; *** BEBB_SIM ***; *** BEBB_SIM ***; %MACRO BEBB_SIM(num_psmps=, indata=, outdata=, byvars=, samp2var=, permvar=, seed= ); *** If user does not pass a value to the optional macro variable "seed," use -1 based on the time of day. ***; %if %sysevalf(%superq(seed)=,boolean) %then %let seed=-1; *** Obtain counts for each strata.; proc summary data=&indata. nway; class &byvars. &samp2var.; var &permvar.; output out=byvar_sum(keep = _FREQ_ &byvars. &samp2var. sumpvar) sum = sumpvar ; run; %let last_byvar = %scan(&byvars.,-1); data byvar_sum_min(keep=tot_FREQ _FREQ_ &byvars. &samp2var. sumpvar sortedby=&byvars.); set byvar_sum; by &byvars.; retain lag_FREQ lag_sum lag_samp2; if first.&last_byvar. then do; lag_FREQ = _FREQ_; lag_sum = sumpvar; lag_samp2 = &samp2var.; end; else do; tot_FREQ = sum(lag_FREQ,_FREQ_); if _FREQ_<=lag_FREQ then output; else do; _FREQ_ = lag_FREQ; sumpvar = lag_sum; &samp2var. = lag_samp2; output; end; end; run; *** Slightly faster to use macro variable value strings and scan() than to merge _FREQ_ etc. onto the "indata" dataset, especially for large "indata" datasets. *** ; proc sql noprint; select _freq_ into :freqs separated by ' ' from byvar_sum_min; quit; proc sql noprint; select tot_FREQ into :tot_FREQs separated by ' ' from byvar_sum_min; quit; proc sql noprint; select sumpvar into :sumpvar separated by ' ' from byvar_sum_min; quit; proc sql noprint; select &samp2var. into :samp2var separated by ' ' from byvar_sum_min; quit; *** simultaneously create all permstrap samples and summarize results of each as the end of stratum is reached ***; %let last_byvar = %scan(&byvars.,-1); data &outdata.(keep=&byvars. permvar n_psamp num_psmps p_left p_right p_both); set &indata.(keep=&byvars. &permvar.) end=lastrec; by &byvars.; retain permvar "&permvar" n_psamp 0 num_psmps &num_psmps.; array smalln_counter{&num_psmps.} _TEMPORARY_; array psums{&num_psmps.} _TEMPORARY_; if first.&last_byvar. then do; byval_counter+1; _freq_ = 1*scan("&freqs.",byval_counter,' '); n_psamp = _FREQ_; tot_FREQ = 1*scan("&tot_FREQs.",byval_counter,' '); bigN_counter = tot_FREQ+1; do i=1 to num_psmps; smalln_counter[i] = _FREQ_; psums[i] = 0; end; end; bigN_counter+(-1); min_mac_res_inloop = &permvar.; seed=1*&seed.; if last.&last_byvar.~=1 then do i=1 to num_psmps; if ranuni(seed) <= smalln_counter[i]/bigN_counter then do; psums[i] = min_mac_res_inloop + psums[i]; smalln_counter[i]+(-1); end; end; else do i=1 to num_psmps; if smalln_counter[i]>0 then psums[i] = min_mac_res_inloop + psums[i]; end; if last.&last_byvar. then DO; *** If CONTROL sample is smaller than TEST (which is atypical), reverse *** order of empirical distribution.; sumpvar = 1*scan("&sumpvar.",byval_counter,' '); p_left = 0; p_right = 0; p_both = 0; call sortn(of psums[*]); pmed = median(of psums[*]); pmean = mean(of psums[*]); *** Efficiently handle extreme test sample values.; IF sumpvarpsums[num_psmps] THEN DO; p_left=num_psmps; p_right=0; p_both=0; END; ELSE DO; *** For non-extreme cases, start with shorter tail for less looping.; if pmed>=sumpvar then do; do z=1 to num_psmps; if sumpvar>=psums[z] then p_left+1; else do; lastbinnum = z-1; distance_left = pmean - psums[z-1]; leave; end; end; *** Avoid loop for other (larger) p-value. If sumpvar equals last bin, p_right = 1 - p_left + lastbinsize. Otherwise, p_right = 1 - p_left. ***; if sumpvar = psums[lastbinnum] then do; lastbinsize=1; do k=lastbinnum to 1 by -1; if psums[k]=psums[k-1] then lastbinsize+1; leave; end; p_right = num_psmps - p_left + lastbinsize; end; else p_right = num_psmps - p_left; end; else do; do z=num_psmps to 1 by -1; if sumpvar<=psums[z] then p_right+1; else do; lastbinnum = z+1; distance_right = psums[z+1] - pmean; leave; end; end; *** Avoid loop for other (larger) p-value. If psum equals last bin, p_left = 1 - p_right + lastbinsize. Otherwise, p_left = 1 - p_right. ***; if sumpvar = psums[lastbinnum] then do; lastbinsize=1; do k=lastbinnum to num_psmps; if psums[k]=psums[k+1] then lastbinsize+1; else leave; end; p_left = num_psmps - p_right + lastbinsize; end; else p_left = num_psmps - p_right; end; *** Base 2-sided p-value on distance from mean of last (i.e. least extreme) bin of smaller p-value. This is common practice. ***; if p_left= distance_left then p_both+1; else leave; end; end; else if p_left>p_right then do; p_both = p_right; do z=1 to num_psmps; if (pmean - psums[z]) >= distance_right then p_both+1; else leave; end; end; else p_both=num_psmps; *** Account for possibility, due to psum=a particular bin value, that p_both>num_psmps. ***; p_both = min(p_both,num_psmps); END; p_left = p_left / num_psmps; p_right = p_right / num_psmps; p_both = p_both / num_psmps; *** If CONTROL sample is smaller than TEST (which is atypical), reverse *** p-values, as empirical distribution is mirror of itself.; if "C"=COMPRESS(UPCASE(scan("&samp2var.", byval_counter,' ')),' ') then do; hold = p_left; p_left = p_right; p_right = hold; end; label permvar = "Permuted Variable" n_psamp = "Size of Permutation Samples" num_psmps = "# of Permutation Samples" p_left = "Left p-value" p_right = "Right p-value" p_both = "Two-Tailed p-value" ; output &outdata.; end; run; proc datasets lib=work memtype=data kill nodetails; run; %MEND BEBB_SIM; %BEBB_SIM(num_psmps = 1000, indata = MFPTUS.pricing_data_2strata_100000, outdata = MFPTUS.BEBB_SIM_100000_2strata, byvars = geography segment, samp2var=cntrl_test, permvar = price, seed = );