sampA<-readRDS(file='M:/teaching/short course/data integration/real data applications/KNHANES and NHISS/survey_sub.rds') sampB<-readRDS(file='M:/teaching/short course/data integration/real data applications/KNHANES and NHISS/blood_test_results.rds') N<-dim(sampB)[1] n<-5000 id_B<-sample(1:N,n) sampB2<-sampB[id_B,] sampA2<-sampA[is.na(sampA$HGB)=='FALSE' & is.na(sampA$TG)=='FALSE' & is.na(sampA$TCHOL)=='FALSE' & is.na(sampA$HDL)=='FALSE' & is.na(sampA$ANE)=='FALSE',] #############functions for point estimation: library(mgcv) library(np) ###rearrange data to have the following orders: ###TCHOL, SEX, ANE, AGE_G, HGB, TG, HDL, wt_hs for sample A ###TCHOL, SEX, ANE, AGE_G, HGB, TG, HDL for sample B sampA3<-sampA2[,c(4,1,6,11,2,3,5,7)] sampA3$AGE_G<-as.numeric(sampA3$AGE_G) sampA3$SEX<-2-as.numeric(sampA3$SEX) sampA3$ANE<-as.numeric(sampA3$ANE)-1 sampB3<-sampB2[,c(4,1,7,2,3,5,6)] sampB3$AGE_G<-as.numeric(sampB3$AGE_G) sampB3$SEX<-2-as.numeric(sampB3$SEX) sampB3$ANE<-as.numeric(sampB3$ANE)-1 ###Comparing the distributions of predictors: ##from A: m1A<-sum(sampA3$wt_hs[sampA3$SEX==1])/sum(sampA3$wt_hs) m2A<-sum(sampA3$wt_hs[sampA3$ANE==1])/sum(sampA3$wt_hs) m3A<-sum(sampA3$AGE_G*sampA3$wt_hs)/sum(sampA3$wt_hs) m4A<-sum(sampA3$HGB*sampA3$wt_hs)/sum(sampA3$wt_hs) m5A<-sum(sampA3$TG*sampA3$wt_hs)/sum(sampA3$wt_hs) m6A<-sum(sampA3$HDL*sampA3$wt_hs)/sum(sampA3$wt_hs) mA<-c(m1A,m2A,m3A,m4A,m5A,m6A) ##from B: nB<-dim(sampB3)[1] wtB<-rep(1,nB) sampB4<-cbind(sampB3,wtB) m1B<-sum(sampB4$wtB[sampB4$SEX==1])/sum(sampB4$wtB) m2B<-sum(sampB4$wtB[sampB4$ANE==1])/sum(sampB4$wtB) m3B<-sum(sampB4$AGE_G*sampB4$wtB)/sum(sampB4$wtB) m4B<-sum(sampB4$HGB*sampB4$wtB)/sum(sampB4$wtB) m5B<-sum(sampB4$TG*sampB4$wtB)/sum(sampB4$wtB) m6B<-sum(sampB4$HDL*sampB4$wtB)/sum(sampB4$wtB) mB<-c(m1B,m2B,m3B,m4B,m5B,m6B) MX<-cbind(mA,mB) rownames(MX)<-c('SEX(Male)','ANE','AGE','HGB','TG','HDL') colnames(MX)<-c('Mean A','Mean B') write.csv(MX,file='M:/teaching/short course/data integration/real data applications/KNHANES and NHISS/KNHANES and NHISS/MX.csv') ###1. The sample mean from sample A: fM_A<-function(datA,datB){ etheta1<-sum(datA$TCHOL*datA$wt_hs)/sum(datA$wt_hs) etheta2<-sum(datA$TCHOL[datA$SEX==1]*datA$wt_hs[datA$SEX==1])/sum(datA$wt_hs[datA$SEX==1]) etheta<-c(etheta1,etheta2) return(etheta) } ###2. The naive estimator (sample mean) from sample B: fM_B<-function(datA,datB){ etheta1<-mean(datB$TCHOL) etheta2<-mean(datB$TCHOL[datB$SEX==1]) etheta<-c(etheta1,etheta2) return(etheta) } ######################################Calibration methods: ###3.Chi-square distance: fC<-function(datA,datB){ nB<-dim(datB)[1] nA<-dim(datA)[1] XB<-cbind(rep(1,nB),datB[,c(2:7)]) XA<-cbind(rep(1,nA),datA[,c(2:7)]) XB<-as.matrix(XB) XA<-as.matrix(XA) fwts<-function(a){ return(sum(a*datA$wt_hs)) } elambda<-solve(t(XB)%*%XB)%*%as.matrix(apply(XA,2,fwts)-apply(XB,2,sum)) wB<-as.numeric(1+t(elambda)%*%t(XB)) etheta1<-sum(datB$TCHOL*wB)/sum(wB) etheta2<-sum(datB$TCHOL[datB$SEX==1]*wB[datB$SEX==1])/sum(wB[datB$SEX==1]) sum(datB$SEX*wB) etheta<-c(etheta1,etheta2) return(etheta) } ######################################mass imputation methods: ###4. The parametric mass imputation estimator (PMIE) from sample A by using linear model: fP_A<-function(datA,datB){ XA<-datA[,2:7] XA<-as.data.frame(XA) ###fit model by using sample B: m<-lm(TCHOL~SEX+ANE+AGE_G+HGB+TG+HDL,data=datB) iyA<-predict(m,XA) etheta1<-sum(iyA*datA$wt_hs)/sum(datA$wt_hs) etheta2<-sum(iyA[datA$SEX==1]*datA$wt_hs[datA$SEX==1])/sum(datA$wt_hs[datA$SEX==1]) etheta<-c(etheta1,etheta2) return(etheta) } ###5. The nonparametric mass imputation estimator using generalized additive model (GAM): fGAM<-function(datA,datB){ XA<-datA[,2:7] XA<-as.data.frame(XA) ###fit model by using sample B: fit<-gam(TCHOL~SEX+ANE+s(AGE_G)+s(HGB)+s(TG)+s(HDL),data=datB) iyA<-predict(fit,XA) etheta1<-sum(iyA*datA$wt_hs)/sum(datA$wt_hs) etheta2<-sum(iyA[datA$SEX==1]*datA$wt_hs[datA$SEX==1])/sum(datA$wt_hs[datA$SEX==1]) etheta<-c(etheta1,etheta2) return(etheta) } ##############################################propensity score weighting methods: ###6. Rescaled design weight (RDW) method (Valliant and Dever, 2011): fRDW<-function(datA,datB){ nB<-dim(datB)[1] nA<-dim(datA)[1] eN<-sum(datA$wt_hs) XYB<-cbind(datB,rep(1,nB),rep(1,nB)) colnames(XYB)[c(8,9)]<-c('wt','r') XYA<-cbind(datA[,1:7],datA$wt_hs*(eN-nB)/eN,rep(0,nA)) colnames(XYA)[c(8,9)]<-c('wt','r') XY<-rbind(XYB,XYA) ###fit weighted logistic regression model by using the combined data file: wtlogit<-glm(r ~ SEX+ANE+AGE_G+HGB+TG+HDL, data = XY, family = "binomial",weights=wt) #ea<-wtlogit$coefficients #epB<-exp(ea[1]+ea[2]*XYB$SEX+ea[3]*XYB$ANE+ea[4]*XYB$AGE_G+ea[5]*XYB$HGB+ea[6]*XYB$TG+ea[7]*XYB$HDL)/(1+exp(ea[1]+ea[2]*XYB$SEX+ea[3]*XYB$ANE+ea[4]*XYB$AGE_G+ea[5]*XYB$HGB+ea[6]*XYB$TG+ea[7]*XYB$HDL)) epB<-wtlogit$fitted.values[XY$r==1] ###fit model by using sample B: etheta1<-sum(XYB[,1]/epB)/sum(1/epB) etheta2<-sum(XYB[XYB$SEX==1,1]/epB[XYB$SEX==1])/sum(1/epB[XYB$SEX==1]) etheta<-c(etheta1,etheta2) Etheta<-list(etheta,epB) return(Etheta) } ###7. Log-likelihood estimating equation (LEE) method (Chen, Li, and Wu, 2019) library(nleqslv) fLEE<-function(datA,datB){ nB<-dim(datB)[1] ###estimate propensity score: fp_1<-function(alpha0){ p0<-exp(alpha0[1]+alpha0[2]*datA$SEX+alpha0[3]*datA$ANE+alpha0[4]*datA$AGE_G+alpha0[5]*datA$HGB+alpha0[6]*datA$TG +alpha0[7]*datA$HDL)/(1+exp(alpha0[1]+alpha0[2]*datA$SEX+alpha0[3]*datA$ANE+alpha0[4]*datA$AGE_G+alpha0[5]*datA$HGB+alpha0[6]*datA$TG +alpha0[7]*datA$HDL)) res0_1<-sum(datA$wt_hs*p0)-nB res0_2<-sum(datA$wt_hs*p0*datA$SEX)-sum(datB$SEX) res0_3<-sum(datA$wt_hs*p0*datA$ANE)-sum(datB$ANE) res0_4<-sum(datA$wt_hs*p0*datA$AGE_G)-sum(datB$AGE_G) res0_5<-sum(datA$wt_hs*p0*datA$HGB)-sum(datB$HGB) res0_6<-sum(datA$wt_hs*p0*datA$TG)-sum(datB$TG) res0_7<-sum(datA$wt_hs*p0*datA$HDL)-sum(datB$HDL) return(c(res0_1,res0_2,res0_3,res0_4,res0_5,res0_6,res0_7)) } ealpha<-nleqslv(rep(0,7),fp_1,method=c("Newton"))$x epB<-exp(ealpha[1]+ealpha[2]*datB$SEX+ealpha[3]*datB$ANE+ealpha[4]*datB$AGE_G+ealpha[5]*datB$HGB+ealpha[6]*datB$TG +ealpha[7]*datB$HDL)/(1+exp(ealpha[1]+ealpha[2]*datB$SEX+ealpha[3]*datB$ANE+ealpha[4]*datB$AGE_G+ealpha[5]*datB$HGB+ealpha[6]*datB$TG +ealpha[7]*datB$HDL)) etheta1<-sum(datB$TCHOL/epB)/sum(1/epB) etheta2<-sum(datB$TCHOL[datB$SEX==1]/epB[datB$SEX==1])/sum(1/epB[datB$SEX==1]) etheta<-c(etheta1,etheta2) Etheta<-list(etheta,epB) return(Etheta) } ###8.Adjusted logistic propensity weighting (ALP) methods (Wang et al, 2021) fALP<-function(datA,datB){ nB<-dim(datB)[1] nA<-dim(datA)[1] eN<-sum(datA$wt_hs) XYB<-cbind(datB,rep(1,nB),rep(1,nB)) colnames(XYB)[c(8,9)]<-c('wt','r') XYA<-cbind(datA[,1:7],datA$wt_hs,rep(0,nA)) colnames(XYA)[c(8,9)]<-c('wt','r') XY<-rbind(XYB,XYA) ###fit weighted logistic regression model by using the combined data file: wtlogit<-glm(r ~ SEX+ANE+AGE_G+HGB+TG+HDL, data = XY, family = "binomial",weights=wt) #ea<-wtlogit$coefficients #epB<-exp(ea[1]+ea[2]*XYB$SEX+ea[3]*XYB$ANE+ea[4]*XYB$AGE_G+ea[5]*XYB$HGB+ea[6]*XYB$TG+ea[7]*XYB$HDL)/(1+exp(ea[1]+ea[2]*XYB$SEX+ea[3]*XYB$ANE+ea[4]*XYB$AGE_G+ea[5]*XYB$HGB+ea[6]*XYB$TG+ea[7]*XYB$HDL)) eqB<-wtlogit$fitted.values[XY$r==1] epB<-eqB/(1-eqB) ###fit model by using sample B: etheta1<-sum(XYB[,1]/epB)/sum(1/epB) etheta2<-sum(XYB[XYB$SEX==1,1]/epB[XYB$SEX==1])/sum(1/epB[XYB$SEX==1]) etheta<-c(etheta1,etheta2) Etheta<-list(etheta,epB) return(Etheta) } RES_P_A<-fM_A(sampA3,sampB3) RES_P_B<-fM_B(sampA3,sampB3) RES_P_C<-fC(sampA3,sampB3) RES_P_PMIE<-fP_A(sampA3,sampB3) RES_P_NPMIEG<-fGAM(sampA3,sampB3) RES_P_RDW<-fRDW(sampA3,sampB3)[[1]] RES_P_LEE<-fLEE(sampA3,sampB3)[[1]] RES_P_ALP<-fALP(sampA3,sampB3)[[1]] RES_P<-rbind(RES_P_B-RES_P_A,RES_P_C-RES_P_A,RES_P_PMIE-RES_P_A,RES_P_NPMIEG-RES_P_A, RES_P_RDW-RES_P_A,RES_P_LEE-RES_P_A,RES_P_ALP-RES_P_A) colnames(RES_P)<-c('Bias_Mean','Bias_Domain mean') rownames(RES_P)<-c('Mean B','Calibration','LM','GAM','RDW','LEE','ALP') write.csv(RES_P,file='M:/teaching/short course/data integration/real data applications/KNHANES and NHISS/KNHANES and NHISS/res_point.csv') #########################################Variance estimation: ###Bootstrap variance estimation: library(survey) fboot<-function(datA,datB,theta0,B_B){ nA<-dim(datA)[1] nB<-dim(datB)[1] d=datA$wt_hs #design weights eN=sum(d) # pop size prob=d/sum(d) etheta0_C<-fC(datA,datB) etheta0_P_A<-fP_A(datA,datB) etheta0_GAM<-fGAM(datA,datB) etheta0_RDW<-fRDW(datA,datB)[[1]] etheta0_LEE<-fLEE(datA,datB)[[1]] etheta0_ALP<-fALP(datA,datB)[[1]] etheta0<-c(etheta0_C,etheta0_P_A,etheta0_GAM,etheta0_RDW,etheta0_LEE,etheta0_ALP) ###bootstrap procedure: S_etheta<-NULL iter<-0 repeat{ iter<-iter+1 ##bootstrap sample from B: s_id_B<-sample(1:nB,nB,replace=T) datBb<-datB[s_id_B,] ##bootstrap sample from A: #1 select boostrap pseudo population #cat("b=",b,"\n") U=sample(1:nA, eN, prob=prob, replace=T) #2 choose boostrap sample Pi=1/d[U] #inclusion probabilites Ub.in=rbinom(eN,1,Pi) #inclusion indicators Ub=U[Ub.in==1] #selecet bootstrap sample datAb<-datA[Ub,] s_etheta_C<-fC(datAb,datBb) s_etheta_P_A<-fP_A(datAb,datBb) s_etheta_GAM<-fGAM(datAb,datBb) s_etheta_RDW<-fRDW(datAb,datBb)[[1]] s_etheta_LEE<-fLEE(datAb,datBb)[[1]] s_etheta_ALP<-fALP(datAb,datBb)[[1]] s_etheta<-c(s_etheta_C,s_etheta_P_A,s_etheta_GAM,s_etheta_RDW,s_etheta_LEE,s_etheta_ALP) S_etheta<-cbind(S_etheta,s_etheta) if(iter==B_B){break} } eV<-apply((S_etheta-etheta0)^{2},1,mean,na.rm=T) qz<-qnorm(0.975) LB<-etheta0-qz*sqrt(eV) UB<-etheta0+qz*sqrt(eV) AL<-UB-LB CI<-as.numeric(LB