*************************************************************************; * PROGRAM NAME: ErrorCheckDedupeAppend.sas *; * DATE CREATED: 09/23/03 *; * LAST UPDATED: 10/27/03 *; * *; * PROGRAMMERS: NYC DOHMH Bureau of Communicable Disease *; * PURPOSE: Sample program demonstrating basic error checks and *; * deduplication of ReadAll (new data) and appending to *; * archive. *; ************************************************************************; ************************************************************************; * This program will only run after ReadinData.sas *; ************************************************************************; * Include NYCSyndromicMacros.sas (this is the equivalent of opening and runhning it); filename syndcode "c:\SyndromicWorkshop\SAS\NYCSyndromicMacros.sas"; %include syndcode; * Error checks; data errorcheck1; set readall; * Create numeric age variable; %Age; %ValidZip; * Verification variables=1 if data look correct, 0 otherwise; if agenum ne . and agenum <120 then age1=1; else age1=0; if id ne "" then id1=1; else id1=0; sex=upcase(sex); if sex in ('M','F','U') then sex1=1; else sex1=0; if cc ne "" then cc1=1; else cc1=0; if zip ne "" then zip1=1; else zip1=0; if DT_stamp ne . then DT_stamp1=1; else DT_stamp1=0; format zip1 age1 id1 sex1 cc1 1.; run; proc sql; create table errorcheck2 as select edcode, date, sum(age1) as age, sum(id1) as id, sum(zip1) as zip, sum(sex1) as sex, sum(cc1) as cc, sum(DT_stamp1) as reference from errorcheck1 group by edcode, date; data errorcheck3; set errorcheck2; Page=(reference-age)/reference; Pzip=(reference-zip)/reference; Psex=(reference-sex)/reference; Pcc= (reference-cc) /reference; format page pzip psex pcc 4.2; if age>2 and page >0.05 then flgage=1; else flgage=0; if zip>2 and pzip >0.05 then flgzip=1; else flgzip=0; if sex>2 and psex >0.05 then flgsex=1; else flgsex=0; if cc>2 and pcc >0.05 then flgcc=1; else flgcc=0; attrib flagage flagzip flagsex flagcc length=$4.; if flgage=1 then flagage =put(page,4.2); else flagage=flgage; if flgzip=1 then flagzip =put(pzip,4.2); else flagzip=flgzip; if flgsex=1 then flagsex =put(psex,4.2); else flagsex=flgsex; if flgcc =1 then flagcc =put(pcc ,4.2); else flagcc =flgcc; run; * Merge with complete list of EDs so that today's missing EDs show up; data EDlist; informat edcode $3.; input edcode @@; date=mdy(7,25,03); datalines; ED1 ED2 ED3 ED4 ED5 ; run; proc sort data=errorcheck3; by date edcode; proc sort data=edlist; by date edcode; data errorcheck4; merge errorcheck3 (in=in1) edlist (in=in2); by date edcode; if in1 or (in2 and not in1); run; proc print data=errorcheck4; var date edcode reference flagage flagzip flagsex flagcc; title1 'PRINTOUT OF FLAGGED ERRORS: shows proportion of possibly incorrect or missing data when > 5%'; title2 'PLEASE INVESTIGATE FLAGS (Scan Log, trace back through Readall, individual hospital dataset'; title3 'and original text file if necessary. Reference=number of records added today.)'; format edcode $3. date mmddyy8.; run; * Recommended enhancements to error checks: Instead of using the same thresholds for all EDs, calculate mean % missing for each ED and only flag when unusual for that ED (could use CuSum to identify aberrations); * Deduplicate new data: remove duplicates from READALL (today's data) and output to NEWDATA *Method 1: Use PROC SQL to select unique combinations of the listed variables Any variable that you want in the output dataset must be included in the PROC SQL statement; proc sql; create table newdata as select distinct date, edcode, time, age, sex, zip, id, cc, DT_stamp from readall; * To make sure that all EDs are listed in PROC FREQ table merge with complete list of EDs; data chknew; merge newdata (in=in1) edlist (in=in2); by date edcode; if in1 or (in2 and not in1); run; proc freq data=chknew formchar(1,2,7)=' '; tables edcode*date/missing missprint nocol norow nopercent; format date mmddyy8.; title1 "Frequencies of new data by ED and visit date"; title2 "'1' in yesterday's data indicates missing file"; run; *Method 2: Use PROC SORT NODUPKEY (Acknowledgement: Paul Grant, SAS) This method allows you to carry forward variables without necessarily matching on them; proc sort data=readall out=newdata1 nodupkey; by date edcode time age sex zip id cc; run; * Deduplicate with respect to archive: Make sure new records have not already been added to archive; data newdata; set newdata; key=put(date,mmddyy6.)||edcode||time||left(age)||SUBSTR(CC,1,15); proc sort data=newdata; by key; run; data archiveold; set data.NYCfakearchive; attrib key length=$68.; key=put(date,mmddyy6.)||edcode||time||left(age)||SUBSTR(CC,1,15); proc sort data=archiveold; by key; run; data newdata2 chkarchivenew notmerged; merge archiveold(in=in1) newdata(in=in2); by key; drop key; if in2 and not in1 then output newdata2; if in1 or (in2 and not in1) then output chkarchivenew; if in1 and in2 then output notmerged; run; proc print data=notmerged; var date edcode time age sex zip cc id DT_stamp; title1 'Line list of todays records not merged into archive because'; title2 'they matched with a previous record by date, hospcode, time, age and cc'; run; * Verify CHKARCHIVENEW, a temp version of what the new archive will look like; options ls=120; proc freq data=chkarchivenew formchar(1,2,7)=' ';* headings=v; tables edcode*date/missing missprint nocol norow nopercent; *where date>date()-20; format date mmddyy5. edcode $3.; title1 'CHKARCHIVENEW: new data combined with archive by ED and date of ER visit'; run; * If everything looks good then proceed to append new data to archive; ******************************************************************************; ******************************************************************************; ******************************************************************************; ************************** PERMANENT APPEND **********************************; * Append new data to archive and save today's new data to new072503; proc append base=data.NYCfakearchive data=newdata2; run; data data.new072503; set newdata2; run; * Copy archive to backup location; libname backup "c:\temp"; data backup.archive; set data.NYCfakearchive; run; ******************************************************************************; ******************************************************************************; ******************************************************************************; ******************************************************************************; * If you want to delete the new data just added so that you can run this program again for demonstration purposes run the two lines below; /* data data.NYCfakearchive; set data.NYCfakearchive; if datepart(dt_stamp)>mdy(7,25,03) then delete; run; */