# Read data: insul=read.table("insulgas.dat",header=T) insul names(insul) # Plot gas vs temp: plot(gas~temp,data=insul) # Fit linear regression model with response = gas, regression variables = temp: fit1=lm(gas~temp,data=insul) summary(fit1) abline(fit1) insulres=resid(fit1) hist(insulres) plot(density(insulres)) # Bimodal! Residuals seem to come from a mixture of two distributions. plot(insulres~temp,data=insul) boxplot(insulres~insulate,data=insul) # Indicates that we need to account for the effect of insul. plot(gas~temp,data=insul,col=insulate) # Same conclusion. # So let us fit a model with two regression lines with the same slope: fit2=lm(gas~temp+factor(insulate),data=insul) summary(fit2) #NB: the intercept estimate is the estimated intercept for the first group #The factor(insulate)2 is the difference between the intercepts of group 2 and 1. abline(c(6.55,-0.33)) abline(c(6.55-1.56,-0.33),col="red") # Indeed this looks as a better fit. insulres2=resid(fit2) hist(insulres2) plot(insulres2~temp,data=insul) boxplot(insulres2~insulate,data=insul) # Looks good. AIC(fit1) # AIC for the fitted regression model gas~temp. AIC(fit2) # AIC for the fitted model with two regression lines (same slope). # AIC confirms the better fit of the model gas~temp+factor(insulate). # Cross validation using random validation sample of length 6: indices=c(1:length(insul$gas)) V=sample(indices,6) indices V gasTrain=insul$gas[-V] tempTrain=insul$temp[-V] gasNew=insul$gas[V] tempNew=insul$temp[V] # First for the linear regression model: fit1Train=lm(gasTrain~tempTrain) gasNewHat1=predict(fit1Train,newdata=list(tempTrain=tempNew)) CV1=sum((gasNew-gasNewHat1)^2) CV1 # Second for the two regression lines (same slope): insulateTrain=insul$insulate[-V] fit2Train=lm(gasTrain~tempTrain+factor(insulateTrain)) insulateNew=insul$insulate[V] gasNewHat2=predict(fit2Train,newdata=list(tempTrain=tempNew,insulateTrain=insulateNew)) CV2=sum((gasNew-gasNewHat2)^2) CV2 #we can average over several CV scores: nrep=10 CV1=CV2=rep(0,nrep) for (i in 1:10){ V=sample(indices,6) gasTrain=insul$gas[-V] tempTrain=insul$temp[-V] gasNew=insul$gas[V] tempNew=insul$temp[V] # First for the linear regression model: fit1Train=lm(gasTrain~tempTrain) gasNewHat1=predict(fit1Train,newdata=list(tempTrain=tempNew)) CV1[i]=sum((gasNew-gasNewHat1)^2) insulateTrain=insul$insulate[-V] fit2Train=lm(gasTrain~tempTrain+factor(insulateTrain)) insulateNew=insul$insulate[V] gasNewHat2=predict(fit2Train,newdata=list(tempTrain=tempNew,insulateTrain=insulateNew)) CV2[i]=sum((gasNew-gasNewHat2)^2) } mean(CV1) mean(CV2)