Wednesday, June 30, 2010

Bug in SAS' SELECT DISTINCT implementation

Running the following code on SAS version 9.2 (TS1M0) on the XP_PRO and NET_SRV platforms, you get erroneous results on the second SELECT DISTINCT. Dataset with_index should only have a single record in it, but on the above-named platforms you get 2 records. It seems that SAS is ignoring the DISTINCT.



%let startdt = 01jan2009;
%let enddt = 31dec2009;

data enrollment ;
input
@1 mrn $char8.
@9 enr_start date9.
@21 enr_end date9.
;
format
enr_: mmddyy10. ;
;
datalines ;
roy 01jan2008 30jun2009
roy 01jul2009 30jun2010
;

proc sql ;
create table without_index as
select distinct mrn
from enrollment as e
where e.enr_start <= "&enddt"d and
e.enr_end >= "&startdt"d
;
quit ;


proc sql ;
** This guy queers the DISTINCT result ;
create index pk on enrollment (mrn, enr_start, enr_end) ;
quit ;

proc sql ;
create table with_index as
select distinct mrn
from enrollment as e
where e.enr_start <= "&enddt"d and
e.enr_end >= "&startdt"d
;
quit ;

I'm not sure exactly what the conditions are for this bug, but the above definitely ain't right.