library(tidyverse)
library(dplyr)
library(psych)
library(mvtnorm)
library(MASS)

set.seed(123456)

N=10000
n=100
#educ=rnorm(N,10,3)
#age=rnorm(N,49,7)

cutoff= 34.3 # change this values to change response - if 31.6 then 60% if 34.3 then 75% #
educ = sample(8:24, N, replace = TRUE)
age  = sample(18:64, N, replace = TRUE)

e12i<-mvrnorm(N,mu=c(0,0),Sigma=matrix(c(1,0.6,0.6,1),2,2))
BMI_1star=18-0.15*educ+0.35*age+e12i[,1]
BMI_2star=18-0.15*educ+0.35*age+ e12i[,1]

summary(BMI_1star)
SD(BMI_1star)
quantile(BMI_1star, 0.6)

# need to put dataset in front of re-code#
y1i=NULL
y1i[BMI_1star < cutoff] <-1
y1i[BMI_1star >= cutoff] <-0

BMI= BMI_2star*y1i

z=rnorm(N,mean=1+BMI_2star, sd=0.5)
k=1/(1+exp(2.5-0.5*z))

#poisson sampling
Pi=n*k/sum(k) #inclusion probablities
di=1/Pi

indicator=y1i
summary(y1i)

mydata=cbind( age, educ, y1i, BMI_1star, BMI, BMI_2star, k, z, Pi,di, indicator)
#sim.pop=data.frame(mydata)
sim.pop.60=data.frame(mydata)

#Plot 1A - Population
#plot by Observed and Unobserved
attach(sim.pop.60)
plot(age, BMI_1star, 
     pch = 19,
     col = factor(indicator))

# Legend
legend("topleft",
       legend = c("Unobserved", "Observed"),
       pch = 19,
       col = factor(levels(factor(indicator))))

#Plot 1B - Population
#plot by Observed and Unobserved
plot(educ, BMI_1star,
     pch = 19,
     col = factor(indicator))

# Legend
legend("topleft",
       legend = c("Unobserved", "Observed"),
       pch = 19,
       col = factor(levels(factor(indicator))))





#############################
# Creating Final Dataset
###############################
#making values missing based on indicator
#sim.pop$BMI=ifelse(sim.pop$indicator == 0, NA, sim.pop.60$BMI)
sim.pop.60$BMI=ifelse(sim.pop.60$indicator == 0, NA, sim.pop.60$BMI)

#set.seed(1234)
#samp100=sample_n(sim.pop, 100)
#samp250=sample_n(sim.pop, 250)
#samp500=sample_n(sim.pop, 500)

library(openxlsx)

samp100.60=sample_n(sim.pop.60, 100)
samp250.60=sample_n(sim.pop.60, 250)
samp500.60=sample_n(sim.pop.60, 500)

write.xlsx(samp100.60, 'C:/Users/MMACHIOR/OneDrive - University of Oklahoma/OUHSC Research/OSCTR/Heckman Model - OSCTR/Data/samp.100.60.xlsx')
write.xlsx(samp250.60, 'C:/Users/MMACHIOR/OneDrive - University of Oklahoma/OUHSC Research/OSCTR/Heckman Model - OSCTR/Data/samp.250.60.xlsx')
write.xlsx(samp500.60, 'C:/Users/MMACHIOR/OneDrive - University of Oklahoma/OUHSC Research/OSCTR/Heckman Model - OSCTR/Data/samp.500.60.xlsx')

# Checking values
summary(samp100.60$indicator)
summary(samp250.60$indicator)
summary(samp500.60$indicator)

#Examine linear regression approaches if desired.
# lm based on full POPULATION data
lm_all = lm(BMI_2star ~ educ + age, data=pop)
summary(lm_all)


# Sample size 100
# lm based on if the data was fully observed data at sample 100
lm_100 = lm(BMI ~ educ + age, data=samp100)
summary(lm_100)


# Sample size 250
# lm based on observed data
lm_250 = lm(BMI ~ educ + age, data=samp250)
summary(lm_250)

# Sample size 500
# lm based on observed data
lm_500 = lm(BMI ~ educ + age, data=samp500)
summary(lm_500)




###############################
# Heckman by Hand
##############################

#sample size 100
probit = glm(indicator ~ educ + age,
             data   = samp100,
             family = binomial(link = 'probit'))

summary(probit)

# Create the IVMR
samp100$probit_lp = predict(probit)
samp100$mills100=dnorm(samp100$probit_lp)/pnorm(samp100$probit_lp)
summary(samp100$mills100)

samp100$imr = samp100$mills100
ggplot2::qplot(samp100$imr, geom = 'histogram')

lm_HM_select = lm(BMI ~ educ + age + imr, data = samp100)

summary(lm_HM_select)


#sample size 250
probit = glm(indicator ~ educ + age,
             data   = samp250,
             family = binomial(link = 'probit'))

summary(probit)

# Create the IVMR
samp250$probit_lp = predict(probit)
samp250$mills250=dnorm(samp250$probit_lp)/pnorm(samp250$probit_lp)
summary(samp250$mills250)

samp250$imr = samp250$mills250
ggplot2::qplot(samp250$imr, geom = 'histogram')

lm_HM_select = lm(BMI ~ educ + age + imr, data = samp250)

summary(lm_HM_select)



#sample size 500
probit = glm(indicator ~ educ + age,
             data   = samp500,
             family = binomial(link = 'probit'))

summary(probit)

# Create the IVMR
samp500$probit_lp = predict(probit)
samp500$mills500=dnorm(samp100$probit_lp)/pnorm(samp100$probit_lp)
summary(samp500$mills500)

samp500$imr = samp500$mills500
ggplot2::qplot(samp500$imr, geom = 'histogram')

lm_HM_select = lm(BMI ~ educ + age + imr, data = samp500)

summary(lm_HM_select)

##########################
# USING SAMPLE SELECTIOn
##########################

library(sampleSelection)

selection_pop = selection(indicator ~ educ + age , BMI ~ educ + age, 
                               method = '2step', data=pop)

summary(selection_pop)


library(sampleSelection)

selection_samp100 = selection(indicator ~ educ + age , BMI ~ educ + age, 
                          method = '2step', data=samp100)
summary(selection_samp100)


selection_samp250 = selection(indicator ~ educ + age , BMI ~ educ + age, 
                              method = '2step', data=samp250)
summary(selection_samp250)


selection_samp500 = selection(indicator ~ educ + age , BMI ~ educ + age, 
                              method = '2step', data=samp500)
summary(selection_samp500)

# 60% response
selection_samp100.60 = selection(indicator ~ educ + age , BMI ~ educ + age, 
                              method = '2step', data=samp100.60)
summary(selection_samp100.60)


selection_samp250.60 = selection(indicator ~ educ + age , BMI ~ educ + age, 
                              method = '2step', data=samp250.60)
summary(selection_samp250.60)


selection_samp500.60 = selection(indicator ~ educ + age , BMI ~ educ + age, 
                              method = '2step', data=samp500.60)
summary(selection_samp500.60)


#########################
# Multiple Imputation - DEFAULT=PMM
########################
library(mice)

# sample 100
imp.data.100<- mice(samp100, m = 5)

fit.model.100=fitm <- with(imp.data.100, lm(BMI ~ educ + age))
summary(pool(fit.model.100))



# sample 250
imp.data.250<- mice(samp250, m = 5)

fit.model.250=fitm <- with(imp.data.250, lm(BMI ~ educ + age))
summary(pool(fit.model.250))



# sample 500
imp.data.500<- mice(samp500, m = 5)

fit.model.500=fitm <- with(imp.data.500, lm(BMI ~ educ + age))
summary(pool(fit.model.500))


# NHANES

library(sas7bdat)
#Read in the SAS data
NHdata <- read.sas7bdat("C:/Users/MMACHIOR/OneDrive - University of Oklahoma/OUHSC Research/OSCTR/Heckman Model - OSCTR/Data/merged3.sas7bdat")

names(NHdata)
NHdata$missdrink=ifelse(NHdata$avgdrinks =="NaN", 0, 1)

plot(NHdata$RIDAGEYR, NHdata$avgdrinks,
     pch = 19)

selection_nhanes = selection( missdrink ~ age + gender +race+currentwt , avgdrinks ~ age + gender +race+currentwt, 
                                 method = '2step', data=NHdata)
summary(selection_nhanes)