surveyCV
Stat paper based on SDSS presentationWriting cleaner code for only the simulations needed for our submission to Stat (the special issue for SDSS 2021 talks).
Based on the earlier sims (now on GitHub in data-raw/plot_generation.R
and data-raw/early-results.html
)
– but instead of having similar functions repeatedly defined separately
as in that file, use our functions such as cv.svy()
directly from the package.
library(surveyCV)
rerunSims <- FALSE # when we *do* rerun sims, remember to reinstall package & restart R
knitr::opts_chunk$set(dpi=300, out.extra ='WIDTH="95%"')
# Generate a cubic-ish artificial population with 1000 units
set.seed(47)
N <- 1000
nrStrata <- 10
nrClusters <- 100
clusSizes <- N/nrClusters
x1 = stats::runif(1:(N/2), min = 26, max = 38)
y1 = (x1-29)^3 - 13*(x1-29)^2 + 0*(x1-29) + 900
x2 = stats::runif(1:(N/2), min = 38, max = 50)
y2 = (x2-36)^3 - 10*(x2-36)^2 + 2*(x2-36) + 600
z1 = jitter(y1, 15000)
z2 = jitter(y2, 15000)
ds1 <- data.frame(Response = z1, Predictor = x1)
ds2 <- data.frame(Response = z2, Predictor = x2)
ds12 <- rbind(ds1, ds2)
b12 <- data.frame(ID = c(1:1000))
splinepop.df <- cbind(b12, ds12)
splinepop.df <- splinepop.df %>%
dplyr::arrange(Predictor) %>%
# Create Cluster and Stratum variables so that
# we can (separately) simulate both kinds of sampling
dplyr::mutate(Stratum = dplyr::row_number(),
Cluster = dplyr::row_number()) %>%
dplyr::mutate(Stratum = cut(Stratum, nrStrata, 1:nrStrata),
Cluster = cut(Cluster, nrClusters, 1:nrClusters)) %>%
# Create fpc variables for each sampling type:
# full pop has 1000 units; 100 units/stratum; 100 clusters (of 10 units each);
# & if we added a combined Strat+Clus sim, we'd need different fpc's for that
dplyr::mutate(fpcSRS = N,
fpcStratum = N/nrStrata,
fpcCluster = nrClusters) %>%
dplyr::arrange(ID)
# Create unequal sampling weights that are intentionally
# NOT well matched to the true shape of the population,
# so we can see how the use of weights impacts
# model-training step vs test-error estimation step
# in a case where naively ignoring weights will confidently pick wrong model
lm_quad <- stats::lm(Response ~ Predictor + I(Predictor^2),
data = splinepop.df)
splinepop.df$samp_prob_quad <-
(1/(abs(lm_quad$residuals))) / sum(1/(abs(lm_quad$residuals)))
splinepop.df$samp_wt_quad <- 1/splinepop.df$samp_prob_quad
stopifnot(all.equal(sum(1/splinepop.df$samp_wt_quad), 1))
n <- 100
srs.df <- dplyr::sample_n(splinepop.df, n)
stratcounts <- rep(n/nrStrata, nrStrata)
names(stratcounts) <- 1:nrStrata
s <- stratsample(splinepop.df$Stratum, stratcounts)
strat.df <- splinepop.df[s,]
c <- unique(splinepop.df[["Cluster"]])
clus.df <- splinepop.df[splinepop.df[["Cluster"]] %in% sample(c, n/clusSizes),]
a <- ggplot(splinepop.df, aes(x = Predictor, y = Response)) +
geom_point(shape = 1) +
labs(title = "Artificial Pop.", x = "Predictor", y = "Response") +
ylim(100,1650) + xlim(25, 51)
b <- ggplot(srs.df, aes(x = Predictor, y = Response)) +
geom_point(shape = 8, color = "darkgreen") +
labs(title = "SRS", x = "Predictor", y = "Response") +
ylim(100,1650) + xlim(25, 51)
c <- ggplot(strat.df, aes(x = Predictor, y = Response)) +
geom_point(shape = 3, color = "darkred") +
labs(title = "Stratified Sample", x = "Predictor", y = "Response") +
ylim(100,1650) + xlim(25, 51)
d <- ggplot(clus.df, aes(x = Predictor, y = Response)) +
geom_point(shape = 4, color = "steelblue") +
labs(title = "Cluster Sample", x = "Predictor", y = "Response") +
ylim(100,1650) + xlim(25, 51)
gridExtra::grid.arrange(a, b, c, d, ncol = 4,
top = "Simulated Data and Examples of How It Was Sampled")
# Use SRS samples and make SRS folds
srssrsds <- data.frame(df = c(), MSE = c())
# Use SRS samples and evaluate on full pop
srspopds <- data.frame(df = c(), MSE = c())
# Use Cluster samples and make Cluster folds
clusclusds <- data.frame(df = c(), MSE = c())
# Use Cluster samples and make SRS folds
clussrsds <- data.frame(df = c(), MSE = c())
# Use Cluster samples and evaluate on full pop
cluspopds <- data.frame(df = c(), MSE = c())
# Use Strat samples and make Strat folds
stratstratds <- data.frame(df = c(), MSE = c())
# Use Strat samples and make SRS folds
stratsrsds <- data.frame(df = c(), MSE = c())
# Use Strat samples and evaluate on full pop
stratpopds <- data.frame(df = c(), MSE = c())
modelsToFit <- c("Response~splines::ns(Predictor, df=1)",
"Response~splines::ns(Predictor, df=2)",
"Response~splines::ns(Predictor, df=3)",
"Response~splines::ns(Predictor, df=4)",
"Response~splines::ns(Predictor, df=5)",
"Response~splines::ns(Predictor, df=6)")
# Making as many simple random samples as we specify for 'loops'
for (i in 1:loops) {
# Take a SRS
sim.srs <- dplyr::sample_n(splinepop.df, n)
# Using our SRS function on SRS samples
srs.CV.out <- cv.svy(sim.srs, modelsToFit,
nfolds = 5, fpcID = "fpcSRS") %>% as.vector()
srssrsds2 <- data.frame(df = 1:6, MSE = srs.CV.out)
srssrsds <- rbind(srssrsds, srssrsds2)
# Eval SRS samples on full pop
sub.srs <- dplyr::sample_n(sim.srs, n*.8)
srs.des <- svydesign(ids = ~1, fpc = ~fpcSRS, data = sub.srs)
srs.pop.out <- sapply(1:6,
function(ii) {
yhat <- predict(svyglm(modelsToFit[ii], srs.des),
newdata = splinepop.df)
return(mean((yhat - splinepop.df$Response)^2))
})
srspopds2 <- data.frame(df = 1:6, MSE = srs.pop.out)
srspopds <- rbind(srspopds, srspopds2)
}
# Making as many cluster samples as we specify for 'loops'
for (i in 1:loops) {
# Take a Cluster sample
c <- unique(splinepop.df[["Cluster"]])
sim.clus <- splinepop.df[splinepop.df[["Cluster"]] %in% sample(c, n/clusSizes),]
# Using our Cluster function on Cluster samples
clus.CV.out <- cv.svy(sim.clus, modelsToFit,
clusterID = "Cluster", nfolds = 5, fpcID = "fpcCluster") %>% as.vector()
clusclusds2 <- data.frame(df = 1:6, MSE = clus.CV.out)
clusclusds <- rbind(clusclusds, clusclusds2)
# Using our SRS function on Cluster samples
srs.CV.out <- cv.svy(sim.clus, modelsToFit,
nfolds = 5, fpcID = "fpcSRS") %>% as.vector()
clussrsds2 <- data.frame(df = 1:6, MSE = srs.CV.out)
clussrsds <- rbind(clussrsds, clussrsds2)
# Eval 80% Cluster samples on full pop
sub.c <- unique(sim.clus[["Cluster"]])
sub.clus <- sim.clus[sim.clus[["Cluster"]] %in% sample(sub.c, (n/clusSizes)*.8),]
clus.des <- svydesign(ids = ~Cluster, fpc = ~fpcCluster, data = sub.clus)
clus.pop.out <- sapply(1:6,
function(ii) {
yhat <- predict(svyglm(modelsToFit[ii], clus.des),
newdata = splinepop.df)
return(mean((yhat - splinepop.df$Response)^2))
})
cluspopds2 <- data.frame(df = 1:6, MSE = clus.pop.out)
cluspopds <- rbind(cluspopds, cluspopds2)
}
# Making as many stratified samples as we specify for 'loops'
for (i in 1:loops) {
# Take a Strat sample
stratcounts <- rep(n/nrStrata, nrStrata)
names(stratcounts) <- 1:nrStrata
s <- stratsample(splinepop.df$Stratum, stratcounts)
sim.strat <- splinepop.df[s,]
# Using our Strat function on Strat samples
strat.CV.out <- cv.svy(sim.strat, modelsToFit,
strataID = "Stratum", nfolds = 5, fpcID = "fpcStratum") %>% as.vector()
stratstratds2 <- data.frame(df = 1:6, MSE = strat.CV.out)
stratstratds <- rbind(stratstratds, stratstratds2)
# Using our SRS function on Strat samples
srs.CV.out <- cv.svy(sim.strat, modelsToFit,
nfolds = 5, fpcID = "fpcSRS") %>% as.vector()
stratsrsds2 <- data.frame(df = 1:6, MSE = srs.CV.out)
stratsrsds <- rbind(stratsrsds, stratsrsds2)
# Eval 80% Strat samples on full pop
sub.stratcounts <- rep((n/nrStrata)*.8, nrStrata)
names(sub.stratcounts) <- 1:nrStrata
sub.s <- survey::stratsample(sim.strat$Stratum, sub.stratcounts)
sub.strat <- sim.strat[sub.s,]
strat.des <- svydesign(ids = ~1, strata = ~Stratum, fpc = ~fpcStratum, data = sub.strat)
strat.pop.out <- sapply(1:6,
function(ii) {
yhat <- predict(svyglm(modelsToFit[ii], strat.des),
newdata = splinepop.df)
return(mean((yhat - splinepop.df$Response)^2))
})
stratpopds2 <- data.frame(df = 1:6, MSE = strat.pop.out)
stratpopds <- rbind(stratpopds, stratpopds2)
}
# Making the degrees of freedom variable a factor variable
srssrsds$df <- as.factor(srssrsds$df)
srspopds$df <- as.factor(srspopds$df)
clusclusds$df <- as.factor(clusclusds$df)
clussrsds$df <- as.factor(clussrsds$df)
cluspopds$df <- as.factor(cluspopds$df)
stratstratds$df <- as.factor(stratstratds$df)
stratsrsds$df <- as.factor(stratsrsds$df)
stratpopds$df <- as.factor(stratpopds$df)
usethis::use_data(srssrsds, srspopds,
clusclusds, clussrsds, cluspopds,
stratstratds, stratsrsds, stratpopds,
internal = FALSE, overwrite = TRUE)
# TODO: for now setting internal=FALSE
# so that we don't overwrite the sysdata.Rda for intro.Rmd;
# but eventually when we're ready to replace that old vignette,
# we can return to internal=TRUE here
# if that's indeed better for R packages
# ...
# OK, now we've written those dataframes,
# AND the 4 below, all into R/sysdata.Rda together
if(!rerunSims) {
srssrsds <- surveyCV:::srssrsds
clusclusds <- surveyCV:::clusclusds
clussrsds <- surveyCV:::clussrsds
stratstratds <- surveyCV:::stratstratds
stratsrsds <- surveyCV:::stratsrsds
srspopds <- surveyCV:::srspopds
cluspopds <- surveyCV:::cluspopds
stratpopds <- surveyCV:::stratpopds
}
# Find the y-ranges of MSEs
yminMSE <- min(min(srssrsds$MSE),
min(clusclusds$MSE), min(clussrsds$MSE),
min(stratstratds$MSE), min(stratsrsds$MSE))
ymaxMSE <- max(max(srssrsds$MSE),
max(clusclusds$MSE), max(clussrsds$MSE),
max(stratstratds$MSE), max(stratsrsds$MSE))
srspopds <- dplyr::group_by(srspopds, df) %>%
dplyr::summarise(MSE = mean(MSE)) %>%
dplyr::mutate(df = as.numeric(df))
cluspopds <- dplyr::group_by(cluspopds, df) %>%
dplyr::summarise(MSE = mean(MSE)) %>%
dplyr::mutate(df = as.numeric(df))
stratpopds <- dplyr::group_by(stratpopds, df) %>%
dplyr::summarise(MSE = mean(MSE)) %>%
dplyr::mutate(df = as.numeric(df))
# Making ggplot objects for the MSEs and AICs
plot.srssrs <- ggplot(data = srssrsds, mapping = aes(x = df, y = MSE/1e4)) +
geom_boxplot() +
geom_line(data = srspopds, mapping = aes(x = df, y = MSE/1e4)) +
ggtitle("CV: SRS folds,\nSRS sample") +
scale_y_log10(limits = c(yminMSE, ymaxMSE)/1e4)
plot.blank <- rectGrob(width = 0, height = 0) # empty spot here -- no corresponding sim
plot.clusclus <- ggplot(data = clusclusds, mapping = aes(x = df, y = MSE/1e4)) +
geom_boxplot() +
geom_line(data = cluspopds) +
ggtitle("CV: Cluster folds,\nCluster sample") +
scale_y_log10(limits = c(yminMSE, ymaxMSE)/1e4)
plot.clussrs <- ggplot(data = clussrsds, mapping = aes(x = df, y = MSE/1e4)) +
geom_boxplot() +
geom_line(data = cluspopds) +
ggtitle("CV: SRS folds,\nCluster sample") +
scale_y_log10(limits = c(yminMSE, ymaxMSE)/1e4)
plot.stratstrat <- ggplot(data = stratstratds, mapping = aes(x = df, y = MSE/1e4)) +
geom_boxplot() +
geom_line(data = stratpopds) +
ggtitle("CV: Strat folds,\nStrat sample") +
scale_y_log10(limits = c(yminMSE, ymaxMSE)/1e4)
plot.stratsrs <- ggplot(data = stratsrsds, mapping = aes(x = df, y = MSE/1e4)) +
geom_boxplot() +
geom_line(data = stratpopds) +
ggtitle("CV: SRS folds,\nStrat sample") +
scale_y_log10(limits = c(yminMSE, ymaxMSE)/1e4)
gridExtra::grid.arrange(plot.srssrs, plot.clussrs, plot.stratsrs,
plot.blank, plot.clusclus, plot.stratstrat,
ncol = 3,
top = paste0("Simulated Data (Sample Sizes = ", n,
", Clusters = ", n/clusSizes,
" or Strata = ", nrStrata,
", Loops = ", loops, ", Folds = 5)"))
ggplot(splinepop.df,
aes(x = Predictor, y = Response)) +
geom_point(aes(size = samp_prob_quad), alpha = 0.1) +
stat_smooth(method = "lm", formula = y ~ x + I(x^2), se = FALSE) +
labs(title = "Artificial Population with Sampling Probabilities",
subtitle = "Sampling prob. is higher for points nearer the solid curve",
x = "Predictor", y = "Response",
size = "Sampling Probability")
# In all the following cases,
# we are only checking the effect of using sampling **weights**
# during model-fitting on training sets, and testing on test sets;
# we did not use cluster or stratified sampling,
# so all CV folds will (sensibly) be SRS (regardless of samp weights)
# Use weighted models, and calculate MSEs using weighted design
AllW <- data.frame(df = c(), MSE = c())
# Use SRS models, and calculate MSEs using SRS design
NoW <- data.frame(df = c(), MSE = c())
# Use weighted models, and calculate MSEs using SRS design
ModW <- data.frame(df = c(), MSE = c())
# Use SRS models, and calculate MSEs using weighted design
MSEW <- data.frame(df = c(), MSE = c())
for (i in 1:loops) {
# Take a sample of size n, using the sampling probabilities instead of SRS
# (using 1/samp_wt as the samp_prob per unit,
# or n/samp_wt as overall samp_prob to get a sample of size n)
in.sample <- sampling::UPtille(n / splinepop.df[[weights]])
splinesamp.df <- splinepop.df[in.sample > 0, ]
modelsToFit <- c("Response~ns(Predictor, df=1)", "Response~ns(Predictor, df=2)",
"Response~ns(Predictor, df=3)", "Response~ns(Predictor, df=4)",
"Response~ns(Predictor, df=5)", "Response~ns(Predictor, df=6)")
AllWdat <- cv.svy(splinesamp.df, modelsToFit,
nfolds = 5, weightsID = weights) %>% as.vector()
NoWdat <- cv.svy(splinesamp.df, modelsToFit,
nfolds = 5) %>% as.vector()
ModWdat <- cv.svy(splinesamp.df, modelsToFit,
nfolds = 5, weightsID = weights,
useSvyForLoss = FALSE) %>% as.vector()
MSEWdat <- cv.svy(splinesamp.df, modelsToFit,
nfolds = 5, weightsID = weights,
useSvyForFits = FALSE) %>% as.vector()
# compiling one data frame
AllW2 <- data.frame(df = 1:6, MSE = AllWdat, sample = rep(i, 6))
AllW <- rbind(AllW, AllW2)
NoW2 <- data.frame(df = 1:6, MSE = NoWdat, sample = rep(i, 6))
NoW <- rbind(NoW, NoW2)
ModW2 <- data.frame(df = 1:6, MSE = ModWdat, sample = rep(i, 6))
ModW <- rbind(ModW, ModW2)
MSEW2 <- data.frame(df = 1:6, MSE = MSEWdat, sample = rep(i, 6))
MSEW <- rbind(MSEW, MSEW2)
}
# Making the degrees of freedom variable a factor variable
AllW$df <- as.factor(AllW$df)
NoW$df <- as.factor(NoW$df)
MSEW$df <- as.factor(MSEW$df)
ModW$df <- as.factor(ModW$df)
usethis::use_data(AllW, NoW,
MSEW, ModW,
internal = FALSE, overwrite = TRUE)
# TODO: as above, for now setting internal=FALSE
# so that we don't overwrite the sysdata.Rda for intro.Rmd;
# but eventually when we're ready to replace that old vignette,
# we can return to internal=TRUE here
# if that's indeed better for R packages
# ...
# OK, now we've written those dataframes,
# AND the 8 above, all into R/sysdata.Rda together
if(!rerunSims) {
AllW <- surveyCV:::AllW
NoW <- surveyCV:::NoW
MSEW <- surveyCV:::MSEW
ModW <- surveyCV:::ModW
}
# Find the y-range of MSEs
ymin <- min(min(AllW$MSE), min(NoW$MSE), min(MSEW$MSE), min(ModW$MSE))
ymax <- max(max(AllW$MSE), max(NoW$MSE), max(MSEW$MSE), max(ModW$MSE))
# Making a ggplot object for the MSEs collected when using
# SRS folds, SRS models, and SRS error calculations
pAllW <- ggplot(data = AllW, mapping = aes(x = df, y = MSE/1e5)) +
ggtitle("Weights for both training and testing") +
scale_y_log10(limits = c(ymin, ymax)/1e5) + geom_boxplot()
# Making a ggplot object for the MSEs collected when using
# SRS folds, Clus models, and SRS error calculations
pNoW <- ggplot(data = NoW, mapping = aes(x = df, y = MSE/1e5)) +
ggtitle("No weights") +
scale_y_log10(limits = c(ymin, ymax)/1e5) + geom_boxplot()
# Making a ggplot object for the MSEs collected when using
# Clus folds, Clus models, and Clus error calculations
pModW <- ggplot(data = ModW, mapping = aes(x = df, y = MSE/1e5)) +
ggtitle("Weights when training models") +
scale_y_log10(limits = c(ymin, ymax)/1e5) + geom_boxplot()
# Making a ggplot object for the MSEs collected when using
# Clus folds, SRS models, and Clus error calculations
pMSEW <- ggplot(data = MSEW, mapping = aes(x = df, y = MSE/1e5)) +
ggtitle("Weights when estimating test MSE") +
scale_y_log10(limits = c(ymin, ymax)/1e5) + geom_boxplot()
# Making a grid display of the four plot objects above
lay <- matrix(c(NA, 1, 1, NA,
NA, 1, 1, NA,
2, 2, 3, 3,
2, 2, 3, 3,
NA, 4, 4, NA,
NA, 4, 4, NA), byrow = TRUE, ncol = 4)
gridExtra::grid.arrange(pAllW, pModW, pMSEW, pNoW, ncol = 2,
top = paste0("Simulated Data (Sample Sizes = ", n,
", Loops = ", loops, ", 5 Folds, Samp. Wts from Quad. Fit)"),
layout_matrix = lay)