###input dataset: library(car) #############1.Linear regression ###a.Fit linear regression model summary(Prestige) pairs(Prestige) reg1<-lm(prestige~education+log2(income)+women,data=Prestige) summary(reg1) attributes(reg1) reg1$coefficients reg1$df.residual ###b.test normality library(MASS) sresid <- studres(reg1) par(mfrow=c(1,2)) qqPlot(reg1, main="QQ Plot") hist(sresid, freq=FALSE,main="Distribution of Studentized Residuals") xfit<-seq(min(sresid),max(sresid),length=40) yfit<-dnorm(xfit) lines(xfit, yfit) shapiro.test(reg1$residuals) library(nortest) ad.test(reg1$residuals) ###c.examine independence par(mfrow=c(1,2)) plot(reg1$residuals,main='Residuals vs Index',xlab='Index',ylab='residuals') plot(reg1$residuals[1:(length(reg1$residuals)-1)],reg1$residuals[2:length(reg1$residuals)], main='Residuals vs Lag 1 Residuals',xlab='Lag 1 residuals',ylab='residuals') durbinWatsonTest(reg1) ###d.examine heteroscedasticity attach(Prestige) par(mfrow=c(2,2)) plot(reg1$residuals~reg1$fitted.values,main='Residuals vs Fitted Values',xlab='Fitted Values',ylab='Residuals') plot(reg1$residuals~education,main='Residuals vs Education',xlab='Fitted Values',ylab='Education') plot(reg1$residuals~log2(income),main='Residuals vs log2(income)',xlab='Fitted Values',ylab='log2(income)') plot(reg1$residuals~women,main='Residuals vs Women',xlab='Fitted Values',ylab='Women') library(lmtest) gqtest(reg1) ncvTest(reg1) ###e.examine outliers outlierTest(reg1) # Bonferonni p-value for most extreme obs lm.influence(reg1)$hat lm.influence(reg1)$hat[lm.influence(reg1)$hat>3*(3+1)/102] ###f.examine influence points influence.measures(reg1) #Cook’s D plot (identify D values>4/(n-k-1)): cutoff<-4/((nrow(Prestige)-length(reg1$coefficients)-2)) plot(reg1, which=4, cook.levels=cutoff) #Influence Plot: influencePlot(reg1, id.method="identify", main="Influence Plot", sub="Circle size is proportial to Cook's Distance" ) ###g.examine colinearity #Added variable plots: avPlots(reg1) #Variance inflation factors: vif(reg1) #Problem? sqrt(vif(reg1)) > 2 ###h.model selection edu2<-education^2 loginc2<-log2(income)^2 edulogin<-education*log2(income) reg2<-lm(prestige~education+edu2+log2(income)+loginc2+edulogin+women,data=Prestige) step(reg2,direction='both') ###i.model comparison anova(reg1,reg2) ######2.logistic regression library(HSAUR) head(plasma) attach(plasma) ###a.model fitting fit<-glm(ESR~fibrinogen+globulin,data=plasma,family=binomial('logit')) summary(fit) attributes(fit) attach(plasma) ESR2<-rep(1,dim(plasma)[1]) ESR2[ESR=='ESR > 20']<-0 fit<-glm(ESR2~fibrinogen,data=plasma,family=binomial('logit')) plot(fibrinogen, ESR2) lines(fibrinogen[order(fibrinogen)],fit$fitted.values[order(fibrinogen)]) ###b.Odds ratio and confidence intervals exp(coef(fit)) vcov(fit) confint.default(fit) exp(confint.default(fit)) ###c. model selection fit2<-glm(ESR2~fibrinogen+globulin,data=plasma,family=binomial('logit')) step(fit2) ###d. Hosmer-Lemeshow Goodness of Fit (GOF) Test. library(ResourceSelection) hoslem.test(ESR2,fit2$fitted.values)