* ADA2_01.sas ; * SAS code from Chapter 01 ; * Stat 428/528 Advanced Data Analysis II ; * ; * Prof. Erik Erhardt ; * University of New Mexico ; * Spring 2012 ; *******************************************************************************; * rabbit - first example; options ls=79 nodate nocenter; data rabbit; input dose nres ntotal; datalines; 50 1 5 200 4 10 250 2 5 400 6 10 2000 4 5 ; run; proc print; run; *******************************************************************************; * speeding tickets - using labels; * text between an asterisk and a semicolon is a comment; options ls=79 nodate nocenter; * define sas options; data tickets; input state $ amt @@; label state = 'STATE' amt = 'AMOUNT OF SPEEDING TICKET'; datalines; al 60 hi 35 de 31.5 il 20 ak 20 ct 60 ar 47 ia 33 fl 44.5 nm 20 az 15 ca 50 dc . ; run; title 'cost of speeding ticket for selected states'; proc print data=tickets label; * use labels rather than variable names; var state amt; run; *******************************************************************************; * debugging example; options ls=79 nodate nocenter; data d1; input pulse height sex; datalines; 62 72 0 66 70 1 70 66 1 49 63 1 76 76 0 ; run; proc prnt; var pulse sex; run; proc print var pulse sex; run; *******************************************************************************; * variable creation, case selection and deletion; options ls=79 nodate nocenter; data d1; input pulse height sex; if height le 69 then delete; * delete short people; htft = height/12; * height in feet; pulpht = pulse + height; * nonsense variable; lacko = .25*(height + height**2)/(pulpht - 12); * another; datalines; 62 72 0 66 70 1 70 66 1 . 78 0 86 70 0 49 63 1 76 76 0 ; run; proc print; run; *******************************************************************************; * if statement for conditional variable assignment; options ls=79 nodate nocenter; data d1; input pulse height sex; if pulse ne .; * delete missing values; htft = height/12; * height in feet; loght = log(htft); * log of height; if htft < 5.5 then vertchal='yes'; * short; else vertchal='no'; * not short; if (vertchal = 'yes') or (pulse > 75) then agh = 6; * short or fast pulse, agh=6; else agh = loght; * otherwise, agh=log of height; datalines; 62 72 0 66 70 1 70 66 1 . 78 0 86 70 0 49 63 1 76 76 0 ; run; proc print; run; *******************************************************************************; * this is a good comment style for individual comment lines; /* this style is better for big blocks of lines of comments or for commenting out blocks or sections of code */ *******************************************************************************; * turkey example - sort, by, univariate, ttest, plot, freq, chart, boxplot, gplot, gchart; options ls=79 nodate nocenter; data turkey; input age weight orig $; * Note orig is character $ var ; if age gt 25 then yt = 1; * older than 25; else yt = 0; * not older than 25; label age = 'age at death' weight = 'weight at death' orig = 'site of farm' yt = 'coded age'; datalines; 28 13.3 va 20 8.9 va 32 15.1 va 25 13.8 wi 23 13.1 wi 22 10.4 va 29 13.1 va 27 12.4 va 28 13.2 va 26 11.8 va 21 11.5 wi 31 16.6 wi 27 14.2 wi 29 15.4 wi 30 15.9 wi ; run; * Print entire data set with title; title 'print input data set with created variables'; proc print data=turkey; run; * Sort data by orig in ascending order ; * (could use BY ORIG WT if needed) ; * proc sort "quietly" sorts the data, producing no output; proc sort data=turkey; by orig; run; * Separate print for each origin. Data must be presorted by orig. ; title 'data by state of origin'; proc print data=turkey; var age weight; by orig; * using "by" requires pre-sort; run; * Plot option gives pictures + default summaries. ; * specify /normal option to get a formal test of normality. ; title 'summary of weight data by state'; proc univariate data=turkey plot; var weight; by orig; * this creates the side-by-side boxplots; run; * 2-sample t-tests on both age and wt. ; * Class statement identifies indicator variable for 2 samples. ; title '2 sample t-tests for age and weight'; proc ttest data=turkey; var age weight; class orig; run; * First plot weight by age, ; * then use orig (wv or va) as plotting symbol. ; * Vpos and Hpos control plot size. ; title 'illustration of plot features'; proc plot data=turkey; plot weight*age /vpos=16 hpos=32; plot weight*age=orig /vpos=16 hpos=32; run; * Contingency table of origin by categorized age ; title 'frequency distribution for orig by age'; proc freq data=turkey; tables orig*yt/chisq; run; * a horizontal bar chart ; proc chart data=turkey; hbar orig/type=percent; run; title 'side-by-side box plots'; proc boxplot data=turkey ; plot weight*orig=orig /boxstyle=schematic; run; symbol v=plus; title 'Normal Q-Q Plot for turkey age'; proc univariate data=turkey noprint; qqplot age / normal square vaxis=axis1; axis1 label=(a=90 r=0); run; title 'Scatterplot - WV/VA'; symbol1 v=circle c=black i=none; * define v=symbol, c=color, ; symbol2 v=star c=red i=none; * and i=interpolation ; *symbol1 v=circle c=blue i=r; * i=r makes a regression line; proc gplot data=turkey; plot weight*age=orig; * weight vert, age horiz, orig symbol; run; title 'horizontal bar chart with discrete'; proc gchart data=turkey; hbar orig /type=percent discrete; run; *******************************************************************************; * creating an output dataset from proc univariate; data d1; input ht wt sex; datalines; 4 5 1 3 7 2 5 7 1 3 2 2 2 5 2 ; run; proc sort data=d1; by sex; run; proc univariate data=d1 noprint; var ht wt; by sex; output out = showit mean = mht mwt median = medht medwt std = stdht stdwt; run; proc print data=showit; run; *******************************************************************************; * using @@ for multiple observations on a line; * donut - anova analysis; options ls=79 nodate nocenter; title 'donut fat absorbsion example'; data donut; input fat $ abs @@; * fat type is a character string $ ; label fat = 'type of fat' abs = 'amt of fat absorbed in grams'; datalines; f1 64 f1 72 f1 68 f1 77 f1 90 f1 76 f2 78 f2 91 f2 97 f2 82 f2 85 f2 77 f3 75 f3 86 f3 78 f3 71 f3 63 f3 76 f4 55 f4 66 f4 49 f4 64 f4 70 f4 68 ; run; proc print data=donut; run; proc boxplot data=donut; plot abs*fat=fat /boxstyle=schematic; run; * ANOVA of absorbsions by fat type; proc glm data=donut; class fat; * Identifies samples. The class var could be numeric ; model abs = fat; means fat / bon lsd; run; * ANOVA of absorbsions by fat type; proc glm data=donut; class fat; * Identifies samples. The class var could be numeric ; model abs = fat; *means fat / bon lsd cldiff; means fat / bon lsd snk; run; *******************************************************************************; * using "infile" to read data from a file; options ls=79 nodate; data donut; infile "ADA2_01_fats.dat"; * Infile with data file location (eg c:\fats.dat); input fat $ abs @@; * string $ ; label fat = 'type of fat' abs = 'amt of fat absorbed in grams'; * End of data step ; run; proc print; run; *******************************************************************************; * turkey - format, drop/keep, output dataset; title 'Further analysis of the turkey data set'; options ls=79 nodate nocenter; data turkey; infile "ADA2_01_turkey.dat"; input age weight orig $; * Note orig is character $ var ; if age gt 25 then yt = 1; * older than 25; else yt = 0; * not older than 25; label age = 'age at death' weight = 'weight at death' orig = 'site of farm' yt = 'coded age'; run; proc sort data=turkey; by orig; run; * Illustrate the use of format - note form of output; proc format; value ytfmt 0='young' 1='old'; value weightfmt 1-12='light' 12.1-40='heavy'; run; proc print data=turkey; var age weight orig yt; format yt ytfmt. weight weightfmt. ; * How would output look without format? ; run; proc freq data=turkey; tables weight; format weight weightfmt.; * What would happen without the format? ; run; * Create a new data set turkey2 from turkey ; data turkey2; set turkey; if weight > 16 then delete; * Delete heaviest turkey ; drop age yt; * Drop age and yt vars ; run; proc print data = turkey2; format weight weightfmt. ; run; * Create output data set tsumm with mean and median weight ; * Also, suppress printing of all output with NOPRINT option ; proc univariate data=turkey2 noprint; var weight; by orig; output out=tsumm mean=meanwt median=medwt; run; * Print out summarized data contained in tsumm ; proc print data = tsumm; run; *******************************************************************************; * bloodloss - correlation; title 'example: blood loss'; options ls=79 nodate nocenter; data bloodloss; input wt ti bl; label wt='weight in kg' ti='time of operation' bl='blood loss in ml'; datalines; 44.3 105 503 40.6 80 490 69.0 86 471 43.7 112 505 50.3 109 482 50.2 100 490 35.4 96 513 52.2 120 464 ; run; proc gplot data=bloodloss; plot bl*wt bl*ti; * Multiple plots per statement, same as bl*(wt ti); run; proc corr data=bloodloss pearson spearman nosimple; var wt ti bl; * Pearson and Spearman corr for ; * each pair of vars in var list ; * nosimple suppresses simple summary statistic output. ; run; *******************************************************************************; * bloodloss - regression, fitting new observations, CIs and PIs; options ls=79 nodate nocenter; title 'blood loss regression bl=wt'; data bloodloss2; input wt ti bl; label wt='weight in kg' ti='time of operation' bl='blood loss in ml'; datalines; 44.3 105 503 40.6 80 490 69.0 86 471 43.7 112 505 50.3 109 482 50.2 100 490 35.4 96 513 52.2 120 464 60.0 . . 45.0 . . ; run; proc reg data=bloodloss2; model bl = wt /p r cli clm; * pred, resid, PI new resp, CI mean resp; run; *******************************************************************************; * rocket - infile, gplot, output residuals, plot residuals and QQ plot; options ls=79 nodate nocenter; title 'example: rocket'; data rocket; infile "ADA2_01_rocket.dat"; input v1 v2; label v1='shear strength psi' v2='age of propellant in weeks'; run; title 'regression of shear strength vs age of propellant'; symbol1 v=circle c=black i=r; * i=r makes a regression line; proc gplot data=rocket; plot v1*v2; run; proc reg data=rocket; model v1=v2/p r; * diagnostic plots can be created directly in proc reg; plot student.*predicted. nqq.*student. cookd.*obs.; output out=rocket2 p=pv1 student=str; run; proc rank data=rocket2 out=rocket3 normal=blom; var str; ranks rankstr; run; symbol1 v=circle c=black i=none; title 'std resid plot'; proc gplot data=rocket3; plot str*pv1; run; title 'QQ plot'; proc gplot data=rocket3; plot str*rankstr; run; quit; *******************************************************************************; * rocket - delete two observations; options ls=79 nodate nocenter; title 'example: rocket'; data rocket; infile "ADA2_01_rocket.dat"; input v1 v2; label v1='shear strength psi' v2='age of propellant in weeks'; if ( _n_ = 5 ) or ( _n_ = 6 ) then delete; run; title 'regression of shear strength vs age of propellant'; symbol1 v=circle c=black i=r; * i=r makes a regression line; proc gplot data=rocket; plot v1*v2; run; proc reg data=rocket; model v1=v2/p r; * diagnostic plots can be created directly in proc reg; plot student.*predicted. nqq.*student. cookd.*obs.; run; quit; *******************************************************************************;