/* Introduction ----------------------------------------- Inverted Member Matching with Hash Tables ----------------------------------------- Dan Atkins, Capsaicin LLC, 952-941-1300 Dan@DANalytics.net */ /* UltraEdit Notes... Program almost exclusively in UltraEdit Add an UltraEdit SAS button using: "Advanced", "Tool Configuration..." Command line parameters... "C:\Program Files\SAS\SAS 9.1\sas.exe" -CONFIG "C:\Program Files\SAS\SAS 9.1\nls\en\SASV9.CFG" -BATCH -SYSIN %n ...and working directory %p */ Options Compress=Yes MsgLevel=I STimer FullSTimer MacroGen SymbolGen Source Source2 MPrint MLogic Ls=150 PS=55 Errors=1 SPOOL SUMSIZE=MAX SASTRACELOC=saslog set=sasdatefmt date ; Run; Libname Libby "C:\_Ingenix\Definity\Sleep\2009" ; **** PREPARE MASTER DATASET *******************************************************************; **** Simple - Create one record per Member with ALL the enollment whether it continous or not ;;; Data Libby.Enrollment (Drop=Mbr_Cov_Month Year ) ; Retain Med_2005 Med_2006 Med_2007 Med_2008 Med_2009 . ; Set Libby.mem_jan2005_dec2009 ; By Mbr_ID ; Array Med_ {5} Med_2005-Med_2009 ; Year = Year(Mbr_Cov_Month) - 2004 ; If First.Mbr_ID Then Do i = 1 to 5 ; Med_{i}= 0 ; End ; If '01Jan2005'D <= Mbr_Cov_Month <= '31Dec2009'D Then Med_{Year}=Bor(Med_{Year},2**(Month(Mbr_Cov_Month)-0)) ; If Last.Mbr_ID Then Output ; ** comment out for Testing ; Run ; Title "Sample of MASTER Enrollment Dataset" ; Run ; Proc Print Data=Libby.Enrollment(Where=(Mbr_Id in('803060904','803269732','802984002','800453683','800461106','803907415','228749828'))) ; Var SUB_SSN MBR_SSN MBR_FNAME MBR_MI MBR_LNAME /*Mbr_Cov_Month*/ Mbr_Dob Med: ; /* This is how the Years populate as it runs through the data Month Med_2005 Med_2006 Med_2007 Med_2008 Med_2009 2005-01-01 2 0 0 0 0 2005-02-01 6 0 0 0 0 2005-03-01 14 0 0 0 0 2005-04-01 30 0 0 0 0 2005-05-01 62 0 0 0 0 2005-06-01 126 0 0 0 0 2005-07-01 254 0 0 0 0 2005-08-01 510 0 0 0 0 2005-09-01 1022 0 0 0 0 2005-10-01 2046 0 0 0 0 2005-11-01 4094 0 0 0 0 2005-12-01 8190 0 0 0 0 2006-06-01 8190 64 0 0 0 Note break in enrollment 2006-07-01 8190 192 0 0 0 2006-08-01 8190 448 0 0 0 2006-09-01 8190 960 0 0 0 2006-10-01 8190 1984 0 0 0 2006-11-01 8190 4032 0 0 0 2006-12-01 8190 8128 0 0 0 2007-01-01 8190 8128 2 0 0 2007-02-01 8190 8128 6 0 0 2007-03-01 8190 8128 14 0 0 2007-04-01 8190 8128 30 0 0 2007-05-01 8190 8128 62 0 0 2007-06-01 8190 8128 126 0 0 2007-07-01 8190 8128 254 0 0 2007-08-01 8190 8128 510 0 0 2007-09-01 8190 8128 1022 0 0 2007-10-01 8190 8128 2046 0 0 2007-11-01 8190 8128 4094 0 0 2007-12-01 8190 8128 8190 0 0 2008-01-01 8190 8128 8190 2 0 2008-02-01 8190 8128 8190 6 0 2008-03-01 8190 8128 8190 14 0 2008-04-01 8190 8128 8190 30 0 2008-05-01 8190 8128 8190 62 0 2008-06-01 8190 8128 8190 126 0 2008-07-01 8190 8128 8190 254 0 2008-08-01 8190 8128 8190 510 0 2008-09-01 8190 8128 8190 1022 0 2008-10-01 8190 8128 8190 2046 0 2008-11-01 8190 8128 8190 4094 0 2008-12-01 8190 8128 8190 8190 0 <== This is the single record it ouputs */ ** Read in SMALL file sent by client *************************************************************************************; Data Libby.Small_Data (Keep=Driver Names DOB MBR_SSNs Intervention_Date Mbr_SSN Dr_ID Lst_Nm Fst_Nm MI SUB_SSN Sub_SSNs) ; Length Lst_Nm $30. Fst_Nm $20. MI $1. ; Infile "C:\_Ingenix\Definity\Sleep\2009\Newest_Data_Thank_You.txt" TruncOver DSD Dlm='09'x FirstObs=2; Input Driver : 8. Names : $30. Board : 5. LOB : $12. Status : $4. Pos_Neg : $8. DOB : AnyDtDte10. MBR_SSNs : $11. AHI : 3. Intervention_Date : AnyDtDte10. SUB_SSNs : $11. Hire_Date_Disp : AnyDtDte10. Term_Date_Disp : AnyDtDte10. ; ; ; ; Dr_ID = _n_ ; *** Modify values so variable names matches master file and format and case is similar ; Mbr_SSN = Put( Input(Compress(MBR_SSNs,'-'),9.) , Z9.); Sub_SSN = Put( Input(Compress(Sub_SSNs,'-'),9.) , Z9.); ; Lst_Nm = Scan( UpCase(CompBl(Scan(Names , 1, ',' ) ) ) , 1, ' ' ) ; MI = UpCase(CompBl(Scan( UpCase(CompBl(Scan(Names , 2, ',' ) ) ) , 2, ', ' ) ) ) ; Fst_Nm = UpCase(CompBl(Scan( UpCase(CompBl(Scan(Names , 2, ',' ) ) ) , 1, ' ' ) ) ) ; Format DOB Intervention_Date Hire_Date_Disp Term_Date_Disp YyMmDd10. ; Run ; Title "Sample of SMALL Dataset" ; Run ; *** Identify Duplicate Driver Keys ****; Proc Sort Data=Libby.Small_Data Out=Dupos ; By Driver ; Data Dupos ; Set Dupos (Where=(Dr_ID NE 520 ) ) ; By Driver ; If First.Driver and Last.Driver then delete ; Proc Print Data=Dupos ; * Note- manually renumbered duplicates "Neal, Deanna " and "Bailey, Tiffany " ; *%Macro Process; Data Libby.Matched (Drop=_lu:); Length Names $35. MBR_SSNs Sub_SSNs $11. ; * Hash will not like you if you dont define character lookups ahead of time ; If _N_ = 1 Then Do ; Declare Hash LU01(Dataset:'Libby.Small_Data'); LU01.defineKey('Mbr_SSN','Lst_Nm','Fst_Nm','MI','DOB'); LU01.defineData('DR_ID'); LU01.defineDone(); Declare Hash LU02(Dataset:'Libby.Small_Data'); LU02.defineKey('Mbr_SSN','Lst_Nm','Fst_Nm','DOB'); LU02.defineData('DR_ID'); LU02.defineDone(); Declare Hash LU03(Dataset:'Libby.Small_Data'); LU03.defineKey('Mbr_SSN','Lst_Nm','DOB'); LU03.defineData('DR_ID'); LU03.defineDone(); Declare Hash LU04(Dataset:'Libby.Small_Data'); LU04.defineKey('Mbr_SSN','Fst_Nm'); LU04.defineData('DR_ID'); LU04.defineDone(); Declare Hash LU05(Dataset:'Libby.Small_Data'); LU05.defineKey('Sub_SSN','Lst_Nm','DOB'); LU05.defineData('DR_ID'); LU05.defineDone(); Declare Hash LU06(Dataset:'Libby.Small_Data'); LU06.defineKey('Sub_SSN','Fst_Nm'); LU06.defineData('DR_ID'); LU06.defineDone(); Declare Hash LU07(Dataset:'Libby.Small_Data'); LU07.defineKey('Lst_Nm','Fst_Nm','MI','DOB'); LU07.defineData('DR_ID'); LU07.defineDone(); Declare Hash LU08(Dataset:'Libby.Small_Data'); LU08.defineKey('Lst_Nm','Fst_Nm','DOB'); LU08.defineData('DR_ID'); LU08.defineDone(); *** May not really expect to get a lot of matches below but want to match up close ones to see where I can tweak things ; Declare Hash LU10(Dataset:'Libby.Small_Data'); LU10.defineKey('Lst_Nm','DOB'); LU10.defineData('DR_ID'); LU10.defineDone(); Declare Hash LU11(Dataset:'Libby.Small_Data'); LU11.defineKey('Fst_Nm','DOB'); LU11.defineData('DR_ID'); LU11.defineDone(); Declare Hash LU12(Dataset:'Libby.Small_Data'); LU12.defineKey('Fst_Nm','Lst_Nm'); LU12.defineData('DR_ID'); LU12.defineDone(); Declare Hash LU13(Dataset:'Libby.Small_Data'); LU13.defineKey('Lst_Nm'); LU13.defineData('DR_ID'); LU13.defineDone(); * This is a simple way of attaching all the values from the small data set to the big one without having to do it every time above ; Declare Hash LU99(Dataset:'Libby.Small_Data'); LU99.defineKey('DR_ID'); LU99.defineData('Intervention_Date','Names','DOB','MBR_SSNs','Sub_SSNs'); LU99.defineDone(); Call Missing( Intervention_Date , Names , DOB , MBR_SSNs , DR_ID , Sub_SSNs ); * avoids warning messages ; End ; Set Libby.Enrollment ; *This is the BIG MASTER FILE - Sort Order does not Matter ;;;; Array _lu {13} _lu01-_lu13 ; * You Have to Make the Big Dataset look just like your little dataset - Names have to be identical in order to do lookup; Fst_Nm = Upcase(mbr_fname) ; MI = Upcase(mbr_mi) ; Lst_Nm = Upcase(mbr_lname) ; DOB = Mbr_DOB ; *** For my benefit I create variable just like the Small file names so it is easy to compare side to side ; Names_Big = CompBl(Compress(mbr_lname !! ',') !! ' ' !! mbr_fname !! ' ' !! MI ) ; If Length(Mbr_SSN) => 9 Then Do ; * You dont even want to look at values where you risk SSN being blank ; _lu01 = LU01.find() = 0 ; If not Sum(of _lu:) then _lu02 = LU02.find() = 0 ; If not Sum(of _lu:) then _lu03 = LU03.find() = 0 ; If not Sum(of _lu:) then _lu04 = LU04.find() = 0 ; End ; If not Sum(of _lu:) and Length(SUB_SSN) => 9 Then Do ; If not Sum(of _lu:) then _lu05 = LU05.find() = 0 ; If not Sum(of _lu:) then _lu06 = LU06.find() = 0 ; End ; If not Sum(of _lu:) then _lu07 = LU07.find() = 0 ; If not Sum(of _lu:) then _lu08 = LU08.find() = 0 ; If not Sum(of _lu:) then _lu10 = LU10.find() = 0 ; If not Sum(of _lu:) then _lu11 = LU11.find() = 0 ; * I am not happy matching any from this point without a hard look ; If not Sum(of _lu:) then _lu12 = LU12.find() = 0 ; If not Sum(of _lu:) then _lu13 = LU13.find() = 0 ; /* Running it on 11+ produces more matches so you end up with a lot of BAD matches NOTE: There were 778 observations read from the data set LIBBY.SMALL_DATA. NOTE: There were 778 observations read from the data set LIBBY.SMALL_DATA. NOTE: There were 778 observations read from the data set LIBBY.SMALL_DATA. NOTE: There were 778 observations read from the data set LIBBY.SMALL_DATA. NOTE: There were 778 observations read from the data set LIBBY.SMALL_DATA. NOTE: There were 778 observations read from the data set LIBBY.SMALL_DATA. NOTE: There were 778 observations read from the data set LIBBY.SMALL_DATA. NOTE: There were 778 observations read from the data set LIBBY.SMALL_DATA. NOTE: There were 778 observations read from the data set LIBBY.SMALL_DATA. NOTE: There were 778 observations read from the data set LIBBY.SMALL_DATA. NOTE: There were 778 observations read from the data set LIBBY.SMALL_DATA. NOTE: There were 778 observations read from the data set LIBBY.SMALL_DATA. NOTE: There were 778 observations read from the data set LIBBY.SMALL_DATA. NOTE: There were 75935 observations read from the data set LIBBY.ENROLLMENT. NOTE: The data set LIBBY.MATCHED has 946 observations and 34 variables. NOTE: Compressing data set LIBBY.MATCHED decreased size by 41.67 percent. Compressed is 14 pages; un-compressed would require 24 pages. NOTE: DATA statement used (Total process time): real time 1.93 seconds user cpu time 0.76 seconds system cpu time 0.23 seconds Memory 863k */ If Sum(of _lu:) Then Do ; Do i = 1 to 13 until (_lu{i}) ; * Attach a single score to each record ; Score = i ; End ; _lu99= LU99.find() = 0 ; * Now pull in every variable desired from small dataset ; Output ; End ; Format DOB Intervention_Date YyMmDd10. ; Run ; Proc Print Data=Libby.Matched(Obs=7) ; /* The sample above is small but this can be run on huge datasets - all you need is memory - see log from another hash match below... 31 Data Common.Member_Master ; 33 If _N_ = 1 Then Do; 34 Declare Hash X1_Lookup(Dataset:'Common.X_Member_Name_n_Address'); 35 X1_Lookup.defineKey('SBSCR_NBR','BTH_DT','GDR_CD'); 36 X1_Lookup.defineData('INDV_ID'); 37 X1_Lookup.defineDone(); 38 End; 39 Set Common.Sbscrbr_ClMnt_Membership ( Rename=(DEPN_NBR=Clmnt_Depn_Nbr) ) ; 40 If INDV_ID = . Then RC_X1_Lookup = X1_Lookup.find() = 0 ; 42 Run ; NOTE: There were 91,407,690 observations read from the data set COMMON.X_MEMBER_NAME_N_ADDRESS. NOTE: There were 67,741,973 observations read from the data set COMMON.SBSCRBR_CLMNT_MEMBERSHIP. NOTE: The data set COMMON.MEMBER_MASTER has 67,741,973 observations and 38 variables. NOTE: Compressing data set COMMON.MEMBER_MASTER decreased size by 26.44 percent. Compressed is 732817 pages; un-compressed would require 996206 pages. NOTE: DATA statement used (Total process time): real time 21:01.41 user cpu time 13:53.38 system cpu time 1:39.17 Memory 5768033k */ * Take a look at your output ; Title "MATCHED DATA" ; Run ; Proc Sort Data=Libby.Matched ; By Dr_ID Score ; * Now look at multiple matches and determine why - Member may been in twice or you have bad matches ; Data A ; Set Libby.Matched (Where=(Score < 10 )) ; By Dr_ID ; If first.Dr_ID and last.Dr_ID then delete ; Title "Potentially Good MULTIPLE Matches" ; Run ; Proc Print Data=A Uniform ; Var Dr_ID MBR_ID Name: DOB Mbr_Dob MBR_SSN: SUB_SSN: Med: ; Run ; Proc Sort Data=Libby.Matched NoDupKey ; By Dr_ID ; Title "Potentially Good Poor Score Matches" ; Run ; Proc Print Data=Libby.Matched (Obs=1111 Where=(Score >= 10 ) ) NoObs Uniform ; Var Dr_ID MBR_ID Name: DOB: Mbr_Dob MBR_SSN: SUB_SSN: Score ; Run ; Title "Good Score Matches" ; Run ; Proc Print Data=Libby.Matched (Obs=1111 Where=(Score < 10 )) NoObs Uniform ; Var Dr_ID MBR_ID Name: DOB Mbr_Dob MBR_SSN: SUB_SSN: Score ; Run ; * What about the small dataset memberr that didn't match at all? Use another Hash ; Data WhereOutThou ; If _N_ = 1 Then Do ; Declare Hash LU(Dataset:'Libby.Matched'); LU.defineKey('DR_ID'); LU.defineDone(); End ; Set Libby.Small_Data ; RC = LU.find() = 0 ; If NOT RC ; Title "WhereOutThou" ; Run ; Proc Print Data=WhereOutThou ; Run ; * Demonsrate the other end of the Bit BAND - one possible way ;;;; Data Libby.Matched_Plus ; Set Libby.Matched (Where=( Score < 10 )); Array Y {48} _Temporary_ ; If Med_2005 Then Do xx = 1 to 12 ; If Band(Med_2005, 2 **( (xx) ) ) Then Y{XX} = 1 ; End ; If Med_2006 Then Do xx = 13 to 24 ; If Band(Med_2006, 2 **( (xx-12) ) ) Then Y{XX} = 1 ; End ; If Med_2007 Then Do xx = 25 to 36 ; If Band(Med_2007, 2 **( (xx-24) ) ) Then Y{XX} = 1 ; End ; If Med_2008 Then Do xx = 37 to 48 ; If Band(Med_2008, 2 **( (xx-36) ) ) Then Y{XX} = 1 ; End ; Intervention_Month = Sum( Month(Intervention_Date) , (Year(Intervention_Date)-2005)*12 ) ; Do i = 1 to Intervention_Month ; Months_Prior_Enrollment = Sum(Months_Prior_Enrollment, Y{i}) ; End ; Do k = Intervention_Month to 48 ; Months_Post_Enrollment = Sum(Months_Post_Enrollment, Y{k}) ; End ; Active = Sum(Months_Prior_Enrollment, Months_Post_Enrollment ) ; If Months_Post_Enrollment >= 11 and Months_Prior_Enrollment >= 11 Then Good = 1 ; Else If Months_Post_Enrollment >= 11 or Months_Prior_Enrollment >= 11 Then Good = 2 ; Age = Int(YrDif(Mbr_DOB, Intervention_Date, 'ACT/ACT')); Run ; Proc Print Data=Libby.Matched_Plus ; Var Intervention_Date Active Y: Med_: ; Proc Freq Data=Libby.Matched_Plus; Tables Active / NoCol NoRow NoCum Missing ; Run ; ENDSAS ;