--- title: "Informal tests of surveyCV using the Auto dataset" author: "Cole Guerin, Thomas McMahon, Jerzy Wieczorek" output: rmarkdown::html_vignette vignette: > %\VignetteIndexEntry{test-Auto} %\VignetteEngine{knitr::rmarkdown} %\VignetteEncoding{UTF-8} --- Informally test our various CV functions on the `ISLR::Auto` dataset using SRS-CV, clustered-CV, and stratified-CV. (We are arbitrarily using year as either clusterID or stratumID.) Just run the functions on a somewhat arbitrary dataset to ensure that the output format is what we're looking for, and that the output is roughly consistent across the different functions that are doing the same thing. * TODO: Add tests for a case with both clusters AND strata simultaneously. * TODO: Add tests for cases with sampling weights and/or fpc's. * TODO: Add tests for transformed responses within formulae, eg "log(y) ~ ..." * TODO: Turn these into formal tests in the `tests` directory, not a vignette. * TODO: Replace this vignette of informal tests with a practical demo vignette: how can you *use* `surveyCV` appropriately on a real complex-survey dataset? ```{r, message=FALSE} library(surveyCV) library(survey) library(ISLR) ``` ```{r} data(Auto) with(Auto, plot(horsepower, mpg, pch = '.')) ``` ## `cv.svy()` ```{r} # Test the general cv.svy() function # The first line's results are usually around: # linear lm: mean=24.3, se=1.45 # quadratic lm: mean=19.3, se=1.38 cv.svy(Auto, c("mpg~poly(horsepower,1, raw = TRUE)", "mpg~poly(horsepower,2, raw = TRUE)"), nfolds = 10) cv.svy(Auto, c("mpg~poly(horsepower,1,raw=TRUE)", "mpg~poly(horsepower,2,raw=TRUE)"), nfolds = 10, clusterID = "year") cv.svy(Auto, c("mpg~poly(horsepower,1, raw = TRUE)", "mpg~poly(horsepower,2, raw = TRUE)"), nfolds = 10, strataID = "year") ``` ## `cv.svydesign()` ```{r} # Test the cv.svydesign() function, which takes a svydesign object and formulas auto.srs.svy <- svydesign(ids = ~0, data = Auto) auto.clus.svy <- svydesign(ids = ~year, data = Auto) auto.strat.svy <- svydesign(ids = ~0, strata = ~year, data = Auto) cv.svydesign(formulae = c("mpg~poly(horsepower,1, raw = TRUE)", "mpg~poly(horsepower,2, raw = TRUE)", "mpg~poly(horsepower,3, raw = TRUE)"), design_object = auto.srs.svy, nfolds = 10) cv.svydesign(formulae = c("mpg~poly(horsepower,1, raw = TRUE)", "mpg~poly(horsepower,2, raw = TRUE)", "mpg~poly(horsepower,3, raw = TRUE)"), design_object = auto.clus.svy, nfolds = 10) cv.svydesign(formulae = c("mpg~poly(horsepower,1, raw = TRUE)", "mpg~poly(horsepower,2, raw = TRUE)", "mpg~poly(horsepower,3, raw = TRUE)"), design_object = auto.strat.svy, nfolds = 10) ``` ## `cv.svyglm()` ```{r} # Test the cv.svyglm() function, which takes one svyglm object srs.model <- svyglm(mpg~horsepower+I(horsepower^2)+I(horsepower^3), design = auto.srs.svy) clus.model <- svyglm(mpg~horsepower+I(horsepower^2)+I(horsepower^3), design = auto.clus.svy) strat.model <- svyglm(mpg~horsepower+I(horsepower^2)+I(horsepower^3), design = auto.strat.svy) cv.svyglm(glm = srs.model, nfolds = 10) cv.svyglm(glm = clus.model, nfolds = 10) cv.svyglm(glm = strat.model, nfolds = 10) ``` ## Test for equivalence of the 3 functions at same random seed ```{r} seed = 20210708 set.seed(seed) cv.svy(Auto, "mpg~horsepower+I(horsepower^2)+I(horsepower^3)", nfolds = 10) set.seed(seed) cv.svydesign(formulae = "mpg~horsepower+I(horsepower^2)+I(horsepower^3)", design_object = auto.srs.svy, nfolds = 10) srs.model <- svyglm(mpg~horsepower+I(horsepower^2)+I(horsepower^3), design = auto.srs.svy) set.seed(seed) cv.svyglm(glm = srs.model, nfolds = 10) ``` ## Test of logistic regression ```{r} Auto$isAmerican <- Auto$origin == 1 with(Auto, plot(horsepower, isAmerican)) cv.svy(Auto, c("isAmerican~horsepower", "isAmerican~horsepower+I(horsepower^2)", "isAmerican~horsepower+I(horsepower^2)+I(horsepower^3)"), nfolds = 10, method = "logistic") ``` ## Tests that we expect should fail ```{r} # Should stop early because method isn't linear or logistic try(cv.svy(Auto, "mpg~horsepower+I(horsepower^2)+I(horsepower^3)", nfolds = 10, method = "abcde")) # Should try to run but crash because response variable isn't 0/1 try(cv.svy(Auto, "mpg~horsepower+I(horsepower^2)+I(horsepower^3)", nfolds = 10, method = "logistic")) ```