# setwd("C:/cwru/classes/epbi473_05/lec3(SEER.Abomb)")

# This code converts big SEER text files (fixed-width, with no header or field
# spacings) into smaller text files (~33% reduced) with tab spacing and a
# header row.
# The original version processed one record per readLines()/writeLines() call
# and took about an hour to run; records are now processed in chunks, which
# produces the same output with far fewer calls.
# It should only need to be done once for a given SEER dataset ... make sure
# all the fields you want are here!

cancers <- c("breast", "digothr", "malegen", "femgen", "other", "respir",
             "colrect", "lymyleuk", "urinary",
             "test")  # test was for debugging

# Header row of every output file; the column order must match the order of
# the fields pasted together in format_seer_records().
seer_header <- paste("region", "personID", "bYear", "race", "sex", "sequen",
                     "yDiag", "morph", "BT", "rad", "ageRC", "stage", "NHL",
                     "surv", "ICD9", "COD", "numPrim", sep = "\t")

# Turn fixed-width SEER records into tab-separated output lines.
#
# s: character vector of raw records, one record per element.
# Returns a character vector of the same length. Each element holds the 17
# selected fields joined by tabs, in the column order of seer_header.
# substr() yields "" for columns past the end of a short record, so short
# records produce empty fields rather than an error.
format_seer_records <- function(s) {
  region   <- substr(s, 1, 2)
  personID <- substr(s, 3, 10)
  bYear    <- substr(s, 20, 23)
  race     <- substr(s, 27, 28)
  sex      <- substr(s, 30, 30)
  sequen   <- substr(s, 32, 33)
  yDiag    <- substr(s, 36, 39)
  morph    <- substr(s, 45, 48)
  # bh     <- substr(s, 49, 49)
  BT       <- substr(s, 50, 50)    # 5 = T, 6 = B
  rad      <- substr(s, 67, 67)    # 0 or 7 = no radiation, 8,9 = unknown
  ageRC    <- substr(s, 84, 85)
  stage    <- substr(s, 86, 86)
  NHL      <- substr(s, 89, 90)
  srvy     <- substr(s, 91, 92)
  srvm     <- substr(s, 93, 94)
  ICD9     <- substr(s, 95, 98)
  COD      <- substr(s, 137, 141)
  numPrim  <- substr(s, 143, 144)

  # Months of survival. A non-numeric years or months field becomes NA and is
  # written as the text "NA".
  surv <- as.character(12 * as.numeric(srvy) + as.numeric(srvm))

  paste(region, personID, bYear, race, sex, sequen, yDiag, morph, BT, rad,
        ageRC, stage, NHL, surv, ICD9, COD, numPrim, sep = "\t")
}

# Convert one SEER fixed-width file into a tab-delimited file with a header.
#
# iname:      path of the input file.
# oname:      path of the output file (overwritten if it exists).
# chunk_size: number of records read, converted and written per iteration;
#             bounds memory use without going back to one call per record.
# Returns oname invisibly. Both connections are closed on exit, including
# when opening, reading or writing fails part-way through.
convert_seer_file <- function(iname, oname, chunk_size = 50000L) {
  fid <- file(iname, "r")
  on.exit(close(fid), add = TRUE)
  fid2 <- file(oname, "w")
  on.exit(close(fid2), add = TRUE)

  writeLines(seer_header, con = fid2)
  repeat {
    s <- readLines(fid, n = chunk_size, ok = TRUE)
    # print(s)  # for debugging
    if (length(s) == 0) break
    writeLines(format_seer_records(s), con = fid2)
  }
  invisible(oname)
}

for (pre in c("yr1992_2002.sj_la_rg_ak/", "yr1973_2002.seer9/")) {
  # for (pre in c("yr1992_2002.sj_la_rg_ak/"))  # for debugging
  for (k in 1:9) {
    # for (k in 10:10)  # for debugging
    iname <- paste0("c:/SEER02/", pre, cancers[k], ".txt")
    oname <- paste0("c:/SEER02/", pre, cancers[k], ".stx")
    convert_seer_file(iname, oname)
  }
}