library(tidyverse) library(dplyr) library(psych) library(mvtnorm) library(MASS) set.seed(123456) N=10000 n=100 #educ=rnorm(N,10,3) #age=rnorm(N,49,7) cutoff= 34.3 # change this values to change response - if 31.6 then 60% if 34.3 then 75% # educ = sample(8:24, N, replace = TRUE) age = sample(18:64, N, replace = TRUE) e12i<-mvrnorm(N,mu=c(0,0),Sigma=matrix(c(1,0.6,0.6,1),2,2)) BMI_1star=18-0.15*educ+0.35*age+e12i[,1] BMI_2star=18-0.15*educ+0.35*age+ e12i[,1] summary(BMI_1star) SD(BMI_1star) quantile(BMI_1star, 0.6) # need to put dataset in front of re-code# y1i=NULL y1i[BMI_1star < cutoff] <-1 y1i[BMI_1star >= cutoff] <-0 BMI= BMI_2star*y1i z=rnorm(N,mean=1+BMI_2star, sd=0.5) k=1/(1+exp(2.5-0.5*z)) #poisson sampling Pi=n*k/sum(k) #inclusion probablities di=1/Pi indicator=y1i summary(y1i) mydata=cbind( age, educ, y1i, BMI_1star, BMI, BMI_2star, k, z, Pi,di, indicator) #sim.pop=data.frame(mydata) sim.pop.60=data.frame(mydata) #Plot 1A - Population #plot by Observed and Unobserved attach(sim.pop.60) plot(age, BMI_1star, pch = 19, col = factor(indicator)) # Legend legend("topleft", legend = c("Unobserved", "Observed"), pch = 19, col = factor(levels(factor(indicator)))) #Plot 1B - Population #plot by Observed and Unobserved plot(educ, BMI_1star, pch = 19, col = factor(indicator)) # Legend legend("topleft", legend = c("Unobserved", "Observed"), pch = 19, col = factor(levels(factor(indicator)))) ############################# # Creating Final Dataset ############################### #making values missing based on indicator #sim.pop$BMI=ifelse(sim.pop$indicator == 0, NA, sim.pop.60$BMI) sim.pop.60$BMI=ifelse(sim.pop.60$indicator == 0, NA, sim.pop.60$BMI) #set.seed(1234) #samp100=sample_n(sim.pop, 100) #samp250=sample_n(sim.pop, 250) #samp500=sample_n(sim.pop, 500) library(openxlsx) samp100.60=sample_n(sim.pop.60, 100) samp250.60=sample_n(sim.pop.60, 250) samp500.60=sample_n(sim.pop.60, 500) write.xlsx(samp100.60, 'C:/Users/MMACHIOR/OneDrive - University of Oklahoma/OUHSC Research/OSCTR/Heckman Model - OSCTR/Data/samp.100.60.xlsx') write.xlsx(samp250.60, 'C:/Users/MMACHIOR/OneDrive - University of Oklahoma/OUHSC Research/OSCTR/Heckman Model - OSCTR/Data/samp.250.60.xlsx') write.xlsx(samp500.60, 'C:/Users/MMACHIOR/OneDrive - University of Oklahoma/OUHSC Research/OSCTR/Heckman Model - OSCTR/Data/samp.500.60.xlsx') # Checking values summary(samp100.60$indicator) summary(samp250.60$indicator) summary(samp500.60$indicator) #Examine linear regression approaches if desired. # lm based on full POPULATION data lm_all = lm(BMI_2star ~ educ + age, data=pop) summary(lm_all) # Sample size 100 # lm based on if the data was fully observed data at sample 100 lm_100 = lm(BMI ~ educ + age, data=samp100) summary(lm_100) # Sample size 250 # lm based on observed data lm_250 = lm(BMI ~ educ + age, data=samp250) summary(lm_250) # Sample size 500 # lm based on observed data lm_500 = lm(BMI ~ educ + age, data=samp500) summary(lm_500) ############################### # Heckman by Hand ############################## #sample size 100 probit = glm(indicator ~ educ + age, data = samp100, family = binomial(link = 'probit')) summary(probit) # Create the IVMR samp100$probit_lp = predict(probit) samp100$mills100=dnorm(samp100$probit_lp)/pnorm(samp100$probit_lp) summary(samp100$mills100) samp100$imr = samp100$mills100 ggplot2::qplot(samp100$imr, geom = 'histogram') lm_HM_select = lm(BMI ~ educ + age + imr, data = samp100) summary(lm_HM_select) #sample size 250 probit = glm(indicator ~ educ + age, data = samp250, family = binomial(link = 'probit')) summary(probit) # Create the IVMR samp250$probit_lp = predict(probit) samp250$mills250=dnorm(samp250$probit_lp)/pnorm(samp250$probit_lp) summary(samp250$mills250) samp250$imr = samp250$mills250 ggplot2::qplot(samp250$imr, geom = 'histogram') lm_HM_select = lm(BMI ~ educ + age + imr, data = samp250) summary(lm_HM_select) #sample size 500 probit = glm(indicator ~ educ + age, data = samp500, family = binomial(link = 'probit')) summary(probit) # Create the IVMR samp500$probit_lp = predict(probit) samp500$mills500=dnorm(samp100$probit_lp)/pnorm(samp100$probit_lp) summary(samp500$mills500) samp500$imr = samp500$mills500 ggplot2::qplot(samp500$imr, geom = 'histogram') lm_HM_select = lm(BMI ~ educ + age + imr, data = samp500) summary(lm_HM_select) ########################## # USING SAMPLE SELECTIOn ########################## library(sampleSelection) selection_pop = selection(indicator ~ educ + age , BMI ~ educ + age, method = '2step', data=pop) summary(selection_pop) library(sampleSelection) selection_samp100 = selection(indicator ~ educ + age , BMI ~ educ + age, method = '2step', data=samp100) summary(selection_samp100) selection_samp250 = selection(indicator ~ educ + age , BMI ~ educ + age, method = '2step', data=samp250) summary(selection_samp250) selection_samp500 = selection(indicator ~ educ + age , BMI ~ educ + age, method = '2step', data=samp500) summary(selection_samp500) # 60% response selection_samp100.60 = selection(indicator ~ educ + age , BMI ~ educ + age, method = '2step', data=samp100.60) summary(selection_samp100.60) selection_samp250.60 = selection(indicator ~ educ + age , BMI ~ educ + age, method = '2step', data=samp250.60) summary(selection_samp250.60) selection_samp500.60 = selection(indicator ~ educ + age , BMI ~ educ + age, method = '2step', data=samp500.60) summary(selection_samp500.60) ######################### # Multiple Imputation - DEFAULT=PMM ######################## library(mice) # sample 100 imp.data.100<- mice(samp100, m = 5) fit.model.100=fitm <- with(imp.data.100, lm(BMI ~ educ + age)) summary(pool(fit.model.100)) # sample 250 imp.data.250<- mice(samp250, m = 5) fit.model.250=fitm <- with(imp.data.250, lm(BMI ~ educ + age)) summary(pool(fit.model.250)) # sample 500 imp.data.500<- mice(samp500, m = 5) fit.model.500=fitm <- with(imp.data.500, lm(BMI ~ educ + age)) summary(pool(fit.model.500)) # NHANES library(sas7bdat) #Read in the SAS data NHdata <- read.sas7bdat("C:/Users/MMACHIOR/OneDrive - University of Oklahoma/OUHSC Research/OSCTR/Heckman Model - OSCTR/Data/merged3.sas7bdat") names(NHdata) NHdata$missdrink=ifelse(NHdata$avgdrinks =="NaN", 0, 1) plot(NHdata$RIDAGEYR, NHdata$avgdrinks, pch = 19) selection_nhanes = selection( missdrink ~ age + gender +race+currentwt , avgdrinks ~ age + gender +race+currentwt, method = '2step', data=NHdata) summary(selection_nhanes)