# Install and load necessary libraries library(readxl) library(dplyr) library(ggplot2) library(broom) library(stargazer) # Load data data <- read_excel("DiD_School_FreeLunch_MockData.xlsx") %>% arrange(student_id, school, free_lunch, year) # Descriptive statistics summary_table <- data %>% group_by(school, year) %>% summarise( N = n(), Avg_Attendance = round(mean(attendance_rate), 2), .groups = 'drop' ) print(summary_table) # Convert variables to factors data <- data %>% mutate( school = factor(school), year = factor(year), treatment_post = ifelse(school == "A" & year == 2023, 1, 0) ) # Estimate Difference-in-Differences model did_model <- lm(attendance_rate ~ school + year + treatment_post, data = data) # Stargazer output for LaTeX or HTML stargazer(did_model, type = "text", title = "Difference-in-Differences Regression Results", digits = 3, out = "did_model_results.txt") # Plot average attendance by school and year (keep only 2022 and 2023) avg_attendance <- data %>% filter(year %in% c(2022, 2023)) %>% group_by(school, year) %>% summarise(mean_attendance = mean(attendance_rate), .groups = 'drop') %>% arrange(school, year) %>% mutate( year_num = as.numeric(as.character(year)), line_label = paste("School", school) ) # Counterfactual is calculated using the DiD logic: # Counterfactual_Treatment_Post = Treatment_Pre + (Control_Post - Control_Pre) # This assumes that in the absence of the treatment, the treated group would have followed the same trend as the control group. # Here, values are derived from observed averages: Treatment_Pre (School A 2022): 90.134 # Control change: (School B 2023 - School B 2022): (91.262 - 90.134) = 1.128 # Therefore, Counterfactual_Treatment_Post = 90.134 + 1.128 = 91.262 years_cf <- c(2022, 2023) cf_line <- data.frame( year_num = years_cf, counterfactual = c(90.134, 91.262), line_label = "Counterfactual" ) # Combine both lines plot_df <- bind_rows( avg_attendance %>% select(year_num, y = mean_attendance, line_label), cf_line %>% rename(y = counterfactual) ) # Plot only two lines, with names ggplot(plot_df, aes(x = year_num, y = y, group = line_label, color = line_label, linetype = line_label)) + geom_line(size = 1.2) + geom_point(size = 3) + scale_linetype_manual(values = c("School A" = "solid", "School B" = "solid", "Counterfactual" = "dashed")) + scale_x_continuous(breaks = c(2022, 2023), labels = c("2022", "2023")) + labs( title = "Difference-in-Differences: Free Lunch Policy Effect on Attendance", y = "Average Attendance Rate (%)", color = "Legend", linetype = "Legend" ) + theme_minimal() + ylim(85, 95)