# Install and load necessary libraries
library(readxl)
library(dplyr)
library(ggplot2)
library(broom)
library(stargazer)

# Load data
data <- read_excel("DiD_School_FreeLunch_MockData.xlsx") %>%
  arrange(student_id, school, free_lunch, year)

# Descriptive statistics
summary_table <- data %>%
  group_by(school, year) %>%
  summarise(
    N = n(),
    Avg_Attendance = round(mean(attendance_rate), 2),
    .groups = 'drop'
  )
print(summary_table)

# Convert variables to factors
data <- data %>%
  mutate(
    school = factor(school),
    year = factor(year),
    treatment_post = ifelse(school == "A" & year == 2023, 1, 0)
  )

# Estimate Difference-in-Differences model
did_model <- lm(attendance_rate ~ school + year + treatment_post, data = data)

# Stargazer output for LaTeX or HTML
stargazer(did_model, type = "text", title = "Difference-in-Differences Regression Results", digits = 3, out = "did_model_results.txt")

# Plot average attendance by school and year (keep only 2022 and 2023)
avg_attendance <- data %>%
  filter(year %in% c(2022, 2023)) %>%
  group_by(school, year) %>%
  summarise(mean_attendance = mean(attendance_rate), .groups = 'drop') %>%
  arrange(school, year) %>%
  mutate(
    year_num = as.numeric(as.character(year)),
    line_label = paste("School", school)
  )

# Counterfactual is calculated using the DiD logic: # Counterfactual_Treatment_Post = Treatment_Pre + (Control_Post - Control_Pre)
# This assumes that in the absence of the treatment, the treated group would have followed the same trend as the control group.
# Here, values are derived from observed averages: Treatment_Pre (School A 2022): 90.134
# Control change: (School B 2023 - School B 2022): (91.262 - 90.134) = 1.128
# Therefore, Counterfactual_Treatment_Post = 90.134 + 1.128 = 91.262
years_cf <- c(2022, 2023)
cf_line <- data.frame(
  year_num = years_cf,
  counterfactual = c(90.134, 91.262),
  line_label = "Counterfactual"
)

# Combine both lines
plot_df <- bind_rows(
  avg_attendance %>% select(year_num, y = mean_attendance, line_label),
  cf_line %>% rename(y = counterfactual)
)

# Plot only two lines, with names
ggplot(plot_df, aes(x = year_num, y = y, group = line_label, color = line_label, linetype = line_label)) +
  geom_line(size = 1.2) +
  geom_point(size = 3) +
  
  scale_linetype_manual(values = c("School A" = "solid", "School B" = "solid", "Counterfactual" = "dashed")) +
  scale_x_continuous(breaks = c(2022, 2023), labels = c("2022", "2023")) +
  labs(
    title = "Difference-in-Differences: Free Lunch Policy Effect on Attendance",
    y = "Average Attendance Rate (%)",
    color = "Legend",
    linetype = "Legend"
  ) +
  theme_minimal() +
  ylim(85, 95)