cs2.qmd

# Questions

1.  How do we get confidence intervals for the random intercepts?
2.  Are the fixed-effect coefficients interpreted the same way as in binomial logistic regression?

# Load

```{r, include=FALSE}
library(tidyverse)
library(lme4)
library(pps)
library(cowplot)
library(stargazer)

# Tab-delimited input files: registered-voter counts and voting-history counts.
vote <- read.delim("voter_stats_20201103.txt")
history <- read.delim("history_stats_20201103.txt")
```

# 1. Cleaning

```{r}

# Columns that together identify one demographic cell within a county.
# Used both for aggregation and as the join key.
cell_keys = c("county_desc", "party", "race_code", "ethnic_code",
              "sex_code", "age")

# Collapse party_cd into Democrat / Republican / "Other Party".
label_party = function(df) {
  df |>
    mutate(party = case_when(party_cd == "DEM" ~ 'Democrat',
                             party_cd == "REP" ~ 'Republican',
                             TRUE ~ "Other Party"))
}

# history: people who actually voted, summed within each cell
history = label_party(history) |>
  group_by(across(all_of(cell_keys))) |>
  summarize(actual_voters = sum(total_voters))

# vote: registered voters, summed within each cell
vote = label_party(vote) |>
  group_by(across(all_of(cell_keys))) |>
  summarize(reg_voters = sum(total_voters))

# Keep every registered-voter cell; cells with no match in history get 0
# actual voters.
total_voting = left_join(vote, history, by = cell_keys) |>
  mutate(
    # human-readable race label; unmatched codes fall into "Undesig. Race"
    race = case_when(race_code == "A" ~ "Asian",
                     race_code == "B" ~ "Black",
                     race_code == "I" ~ "Native American",
                     race_code == "M" ~ "two_or_more",
                     race_code == "O" ~ "Other Race",
                     race_code == "W" ~ "White",
                     race_code == "P" ~ "Pacific Islander",
                     TRUE ~ "Undesig. Race"),
    # human-readable ethnicity label; codes other than HL/NL/UN become NA
    ethnicity = case_when(ethnic_code == "HL" ~ 'Hispanic Latino',
                          ethnic_code == "NL" ~ 'Not Hispanic Latino',
                          ethnic_code == "UN" ~ 'Undesig. Eth.'),
    actual_voters = replace_na(actual_voters, 0),
    turnout = actual_voters / reg_voters
  ) |>
  ungroup() |>
  select(-c(race_code, ethnic_code)) |>
  # drop cells that report more voters than registrations
  filter(turnout <= 1)
```

# 2. Sample the counties

```{r}
set.seed(1780)

# Size measure for PPS sampling: total registered voters per county.
aggregated = total_voting |>
  group_by(county_desc) |>
  summarize(total_registered = sum(reg_voters)) |>
  ungroup()

# 37 draws with probability proportional to county size.
county_index = ppss(aggregated$total_registered, 37)

# The same county can be drawn more than once, so keep distinct names only,
# then show how many and which counties ended up in the sample.
samp_counties = unique(aggregated$county_desc[county_index])
length(samp_counties)
print(samp_counties)

# All demographic cells belonging to a sampled county.
sampled_counties = total_voting |>
  filter(county_desc %in% samp_counties)
```

# 3. Plots

First aggregate data together to plot later.

```{r}
# Mean turnout per demographic level, stacked into one long table for Plot 1.
# Every piece has the same two columns: `id` (level label) and `mean_turnout`
# (unweighted mean of the cell-level turnout rates in the sampled counties).
agg_char = list(

  # --- single characteristics ---
  sampled_counties |>
    group_by(race) |>
    summarize(mean_turnout = mean(turnout)) |>
    transmute(id = race, mean_turnout),

  sampled_counties |>
    group_by(party) |>
    summarize(mean_turnout = mean(turnout)) |>
    ungroup() |>
    transmute(id = party, mean_turnout),

  sampled_counties |>
    group_by(age) |>
    summarize(mean_turnout = mean(turnout)) |>
    ungroup() |>
    transmute(id = age, mean_turnout),

  sampled_counties |>
    group_by(ethnicity) |>
    summarize(mean_turnout = mean(turnout)) |>
    ungroup() |>
    transmute(id = ethnicity, mean_turnout),

  # --- two-way combinations with party; label is "<level> <party>" ---
  sampled_counties |>
    group_by(sex_code, party) |>
    summarize(mean_turnout = mean(turnout)) |>
    ungroup() |>
    transmute(id = paste(sex_code, party), mean_turnout),

  sampled_counties |>
    group_by(age, party) |>
    summarize(mean_turnout = mean(turnout)) |>
    ungroup() |>
    transmute(id = paste(age, party), mean_turnout)
) |>
  bind_rows() |>
  # order the labels by turnout so the bars come out sorted
  mutate(id = fct_reorder(id, mean_turnout))
```

## Plot 1

Corresponds to question 1

```{r}
# Horizontal bar chart of mean turnout for every demographic label in
# `agg_char`, with a dashed reference line at 0.66.

# The number of sampled counties depends on the PPS draw (duplicates are
# collapsed), so compute it from the data instead of hard-coding it in the
# title.
n_sampled_counties = n_distinct(sampled_counties$county_desc)

agg_char |> 
  ggplot(aes(x = mean_turnout, y = id, fill = mean_turnout)) + 
  geom_col(width = 0.7) + 
  scale_fill_gradient(low = "cadetblue3", high = "darkorchid3") + 
  scale_x_continuous(breaks = seq(0, 1, by = 0.1)) + 
  geom_vline(xintercept = 0.66, linewidth = 1, linetype = 'dashed') + 
  labs(
    title = paste("Mean Voter Turnout by Demographics in",
                  n_sampled_counties, "Sampled Counties"),
    y = "",
    x = "Mean Turnout"
  ) + 
  # label for the dashed reference line
  annotate(
    "text",
    x = 0.8,
    y = 3,
    label = "2020 U.S. \nAverage Turnout",
    color = "black",
    size = 3,
    vjust = 0.5  # Adjust vertical position
  )

ggsave("images/voter_turnout_demographics.png", 
       plot = last_plot(), width = 8, height = 9, units = 'in')
```

Boxplot for question 2

```{r}
# Distribution of cell-level turnout within each sampled county.

# The sample size is data-dependent (duplicate PPS draws are collapsed), so
# derive the count for the title rather than hard-coding it.
n_sampled_counties = n_distinct(sampled_counties$county_desc)

sampled_counties |> 
  ggplot(aes(x = county_desc, y = turnout)) + 
  geom_boxplot(fill = 'steelblue') + 
  # county names are long; rotate them so they do not overlap
  theme(axis.text.x = element_text(angle = 90, hjust = 1)) + 
  labs(
    title = paste("Voter Turnout by", n_sampled_counties, "Sampled Counties"),
    x = "County", 
    y = "Voter Turnout"
  )

ggsave("images/voter_turnout_county.png", 
       plot = last_plot(), width = 6.5, height = 4, units = 'in')
```

Checking for random slopes

```{r}

# Exploratory check for county-level random slopes: one panel per candidate
# predictor, showing cell-level turnout by county, coloured by that predictor,
# with a per-level linear trend overlaid.

# random slope for party? 
p1 = sampled_counties |> 
  ggplot(aes(x = county_desc, y = turnout, color = party)) +
  geom_jitter(size = 0.3) + 
  geom_smooth(method = "lm", se = FALSE, aes(group = party)) + 
  theme(axis.text.x = element_text(angle = 90, hjust = 1)) + 
  labs(x = "")

# random slope for sex? 
p2 = sampled_counties |> 
  ggplot(aes(x = county_desc, y = turnout, color = sex_code)) +
  geom_jitter(size = 0.3) + 
  geom_smooth(method = "lm", se = FALSE, aes(group = sex_code)) + 
  theme(axis.text.x = element_text(angle = 90, hjust = 1)) + 
  labs(x = "")

# random slope for race? 
p3 = sampled_counties |> 
  ggplot(aes(x = county_desc, y = turnout, color = race)) +
  geom_jitter(size = 0.3) + 
  geom_smooth(method = "lm", se = FALSE, aes(group = race)) + 
  theme(axis.text.x = element_text(angle = 90, hjust = 1)) + 
  labs(x = "")

# random slope for ethnicity? 
p4 = sampled_counties |> 
  ggplot(aes(x = county_desc, y = turnout, color = ethnicity)) +
  geom_jitter(size = 0.3) + 
  geom_smooth(method = "lm", se = FALSE, aes(group = ethnicity)) + 
  theme(axis.text.x = element_text(angle = 90, hjust = 1)) + 
  labs(x = "")

# 2 x 2 grid of the four panels
combined_plots = plot_grid(p1, p2, p3, p4)

# Save the grid explicitly so the output does not depend on which plot
# ggplot2 happened to record as the "last" one.
ggsave("images/random_slopes.png", 
       plot = combined_plots, width = 12, height = 6, units = 'in')
```

# 4. Modeling

```{r}

# Dot-and-whisker plot of a fitted mixed model's random effects.
#
# model: a fitted model object for which ranef() can be turned into a data
#        frame with columns grp, condval and condsd.
# Returns the ggplot object: one dot per group at its conditional value, with
# a condval +/- 1.96 * condsd error bar and a dashed reference line at 0.
custom_dotplot = function(model) {

  re_df = data.frame(ranef(model))

  ggplot(re_df, aes(x = condval, y = grp)) +
    geom_dotplot(
      binaxis = 'y',
      stackdir = 'center',
      dotsize = 0.7,
      fill = 'steelblue'
    ) +
    # x and y are inherited from the plot; only the interval ends are new
    geom_errorbar(aes(xmin = condval - 1.96 * condsd,
                      xmax = condval + 1.96 * condsd)) +
    geom_vline(xintercept = 0, linetype = 'dashed') +
    scale_x_continuous(breaks = seq(-1, 1, by = 0.1)) +
    labs(title = "Sampled Counties", y = "County")

}
```

## 4.1) GLMM Models

Interpretation:

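-   Same as binary logistic regression, but conditional on county: $\beta_1 = -0.33$ indicates that, holding the other predictors fixed and within the same county, "Other Party" registered voters have 0.71x the odds of voting compared to the baseline (Democrat).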

```{r}
# Main-effects binomial GLMM on the sampled counties.
# Response is (successes, failures) = (actual voters, registered non-voters);
# (1 | county_desc) gives each county its own random intercept.
m1 = glmer(cbind(actual_voters, reg_voters - actual_voters) ~ (1 | county_desc) +
           party + sex_code + age + race + ethnicity, 
           data = sampled_counties, 
           family = binomial)

summary(m1)
# dot-and-whisker plot of the county random intercepts
custom_dotplot(m1)
```

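`summary()` does not report confidence intervals for the random-intercept standard deviation, so they have to be computed with `confint()`; shown here on an intercept-only model.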

```{r}
# Intercept-only model: no fixed-effect predictors, just a random intercept
# per county.
m_test = glmer(cbind(actual_voters, reg_voters - actual_voters) ~
                 (1 | county_desc), 
           data = sampled_counties, 
           family = binomial)

# Confidence intervals for the model parameters.
# NOTE(review): oldNames = FALSE presumably requests descriptive parameter
# names instead of the legacy .sigNN labels -- confirm against lme4 docs.
confint(m_test, oldNames = FALSE)
```

SRS Approach - maybe include this plot at the end

```{r}
# SRS comparison: refit the main-effects model on a simple random sample of
# counties and plot the county random intercepts.
#
# `new_counties` is not created anywhere else in this document, so this chunk
# used to fail with "object 'new_counties' not found". Build it here when it
# is missing; an existing `new_counties` is used unchanged.
# NOTE(review): assumes the intended comparison is an equal-probability sample
# with as many counties as the PPS sample -- confirm.
if (!exists("new_counties")) {
  set.seed(1780)
  srs_county_names = sample(unique(total_voting$county_desc),
                            size = length(samp_counties))
  new_counties = total_voting |> 
    filter(county_desc %in% srs_county_names)
}

srs_model = glmer(cbind(actual_voters, reg_voters - actual_voters) ~ 
                    (1 | county_desc) +
           party + sex_code + age + race + ethnicity, 
           data = new_counties, 
           family = binomial)

custom_dotplot(srs_model)
```

### 4.1.1) Final Model

```{r}
# Final model: county random intercept plus party-by-sex and party-by-age
# interactions (party*x expands to both main effects and their interaction),
# with race and ethnicity as main effects.
m2 = glmer(cbind(actual_voters, reg_voters - actual_voters) ~ (1 | county_desc) +
           party*sex_code + party*age + race + ethnicity, 
           data = sampled_counties, 
           family = binomial)

# alias used by the table and plot chunks further down
final_model = m2

summary(m2)
```

Tried random slopes by county, fails to converge

```{r}
# Random slopes by county.
#
# `(x | county_desc)` already expands to `(1 + x | county_desc)`, i.e. a county
# random intercept correlated with the random slope(s). Keeping a separate
# `(1 | county_desc)` alongside it would put two random intercepts on the same
# grouping factor, which the data cannot tell apart, so it is omitted here.

# county-specific party effects
m3 = glmer(cbind(actual_voters, reg_voters - actual_voters) ~ 
           (party | county_desc) + party*sex_code + party*age + race + ethnicity, 
           data = sampled_counties, 
           family = binomial)

# county-specific age effects
m4 = glmer(cbind(actual_voters, reg_voters - actual_voters) ~ 
           (age | county_desc) + party*sex_code + party*age + race + ethnicity, 
           data = sampled_counties, 
           family = binomial)

summary(m3)
summary(m4)
```

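Random intercepts for the party-by-sex and party-by-age combinations in place of the fixed interactions; the AIC is similar to that of the interaction model.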

```{r}
# Alternative to the fixed interactions in the final model: a random intercept
# for every party-by-sex and party-by-age combination, alongside the county
# random intercept. race and ethnicity stay as fixed main effects.
m5 = glmer(cbind(actual_voters, reg_voters - actual_voters) ~ (1 | county_desc) +
           (1 | party:sex_code) + (1 | party:age) + race + ethnicity, 
           data = sampled_counties, 
           family = binomial)

summary(m5)
```

# 5. Getting tables out

```{r}
# Random-intercept plot for the final model: display it, then write the same
# plot object to disk.
county_dotplot = custom_dotplot(final_model)
county_dotplot

ggsave(
  "images/dotplot.png",
  plot = county_dotplot,
  width = 8,
  height = 6,
  units = 'in'
)
```

Fixed effects table

```{r}
# Fixed-effects table on the odds-ratio scale.
#
# Columns:
#   Estimate    odds ratio, exp(coefficient)
#   Std..Error  standard error of the coefficient, left on the log-odds scale
#   lb_2.5% / ub_97.5%  95% Wald interval for the odds ratio
#
# The interval has to be built on the log-odds scale and then exponentiated:
# exp(est +/- 1.96 * se). Exponentiating the standard error and adding it to
# the odds ratio does not give a valid interval.
confint_table = data.frame(summary(final_model)$coefficients) |> 
  select(Estimate, Std..Error) |> 
  mutate(
    # bounds first, while Estimate is still the log-odds coefficient
    `lb_2.5%` = exp(Estimate - 1.96 * Std..Error),
    `ub_97.5%` = exp(Estimate + 1.96 * Std..Error),
    Estimate = exp(Estimate)
  )

stargazer(confint_table, summary = FALSE)
```

Random effects table

```{r}
# Random-effects table on the odds-ratio scale, one row per group level.
#
# Columns:
#   grp       group level
#   estimate  exp(conditional value), a multiplicative effect on the odds
#   std_dev   conditional standard deviation, left on the log-odds scale
#   lb_2.5% / ub_97.5%  interval exp(condval +/- 1.96 * condsd)
#
# As with the fixed effects, the interval is formed on the log-odds scale and
# then exponentiated; exp(sd) is not a standard deviation on the odds scale.
ranef_coeffs = data.frame(ranef(final_model)) |> 
  select(grp, condval, condsd) |> 
  rename(estimate = condval, 
         std_dev = condsd) |> 
  mutate(
    # bounds first, while estimate is still on the log-odds scale
    `lb_2.5%` = exp(estimate - 1.96 * std_dev),
    `ub_97.5%` = exp(estimate + 1.96 * std_dev),
    estimate = exp(estimate)
  )

stargazer(ranef_coeffs, summary = FALSE)
```