BU-Spark · albertf123 · Oct 29, 2020 · Nov 24, 2020 · Nov 24, 2020 · Nov 24, 2020
diff --git a/.DS_Store b/.DS_Store
diff --git a/vaccination_viruses/Deliverable 1/.DS_Store b/vaccination_viruses/Deliverable 1/.DS_Store
diff --git a/vaccination_viruses/Deliverable 1/CS506 Final Project - Deliverable 1.pdf b/vaccination_viruses/Deliverable 1/CS506 Final Project - Deliverable 1.pdf
diff --git a/vaccination_viruses/Deliverable 1/code/deliverable_1.Rmd b/vaccination_viruses/Deliverable 1/code/deliverable_1.Rmd
@@ -0,0 +1,47 @@
+---
+title: "Deliverable 1"
+output: html_document
+---
+
+```{r setup}
+library(tidyverse)
+```
+
+# For the writeup, see CS506 Final Project - Deliverable 1.pdf
+
+## Cleaning Data
+```{r clean_data}
+# Vaccination-rate / exemption table.
+data <- read.csv(file = '../../data/vaccinationrate_and_exemption.csv')
+
+# The health-expenditure CSV is read in two passes with different `skip`
+# offsets: the one-row frame from the first pass is used as the column names
+# of the 51 rows read in the second pass.
+spending_data_headers <- read.csv(
+  file = '../../data/healthExpendituresState.csv',
+  skip = 1, nrows = 1, as.is = TRUE
+)
+spending_data <- read.csv(
+  file = '../../data/healthExpendituresState.csv',
+  skip = 3, nrows = 51, as.is = TRUE
+)
+colnames(spending_data) <- spending_data_headers
+
+# Strip "$" and "," so the spending figures parse as numbers, and rename the
+# key column so it matches the other tables.
+spending_data <- spending_data %>%
+  mutate(
+    `Total Health Spending` = as.numeric(
+      gsub("[$,]", "", `Total Health Spending`)
+    )
+  ) %>%
+  rename(States = Location)
+
+# Population table: sort by state and drop thousands separators.
+population_data <- read.csv(file = '../../data/population_by_state.csv')
+population_data <- population_data[order(population_data$States), ]
+population_data$Population <- as.numeric(
+  gsub(",", "", population_data$Population, fixed = TRUE)
+)
+
+# Join the three tables on the state name, give the rate column a readable
+# name, and derive per-capita spending.
+# NOTE(review): the 1000000 factor presumably converts spending reported in
+# millions of dollars to dollars per person; confirm against the data source.
+all_data <- data %>%
+  merge(spending_data, by = "States") %>%
+  merge(population_data, by = "States") %>%
+  rename(VaxRate = `X.`) %>%
+  mutate(
+    spending_per_capita = (`Total Health Spending` / Population) * 1000000
+  )
+```
+## Exemptions
+
+```{r make_simple_graph}
+# Box plot of state vaccination rates, one box per exemption category.
+ggplot(all_data, aes(x = Exemption, y = VaxRate)) +
+  geom_boxplot() +
+  labs(
+    y = "Vaccination Rate",
+    title = "Vaccination Rates in States with Various Exemptions"
+  )
+
+```
+
+## Spending per Capita by State
+
+```{r graphs2}
+# Simple linear regression of vaccination rate on per-capita health spending;
+# summary() prints the coefficient table and fit statistics.
+model <- lm(VaxRate ~ spending_per_capita, data = all_data)
+summary(model)
+```
diff --git a/vaccination_viruses/Deliverable 1/code/deliverable_1.html b/vaccination_viruses/Deliverable 1/code/deliverable_1.html
diff --git a/vaccination_viruses/Deliverable 2/.DS_Store b/vaccination_viruses/Deliverable 2/.DS_Store
diff --git a/vaccination_viruses/Deliverable 2/Deliverable 2 Write-Up.pdf b/vaccination_viruses/Deliverable 2/Deliverable 2 Write-Up.pdf
diff --git a/vaccination_viruses/Deliverable 2/README.md b/vaccination_viruses/Deliverable 2/README.md
@@ -0,0 +1,118 @@
+This file will cover all the labels/features in our dataset. 
+
+States:
+
+- This label simply represents each state within the USA. We are primarily using state-level data.
+There are 50 states plus the District of Columbia (DC), which totals to 51 observations. Since
+we're looking at state data, we found data and created this final dataset by merging the 
+observations with the state name.
+
+- The year labels that follow (2007-2017) each represent the percentage of a state's
+population that has received the MMR vaccine (measles, mumps, and rubella) in that year.
+The MMR vaccine is one of the most commonly recommended vaccines for individuals and is
+typically taken at a young age in two doses. This vaccine is very commonly required by law
+for children to attend school. 2017 is the primary vaccination rate that we used. However,
+we also used the other years to gain additional insight into trends over time.
+
+2007
+
+- Vaccination rates by state for 2007. 
+
+2008
+
+- Vaccination rates by state for 2008. 
+
+2009
+
+- Vaccination rates by state for 2009. 
+
+2010
+
+- Vaccination rates by state for 2010. 
+
+2011
+
+- Vaccination rates by state for 2011. 
+
+2012
+
+- Vaccination rates by state for 2012. 
+
+2013
+
+- Vaccination rates by state for 2013.
+
+2014
+
+- Vaccination rates by state for 2014.
+
+2015
+
+- Vaccination rates by state for 2015.
+
+2016
+
+- Vaccination rates by state for 2016.
+
+2017
+
+- Vaccination rates by state for 2017.
+
+Exemption:
+
+- This variable represents the exemptions that each state allows for all mandatory vaccines.
+There are three types of exemptions: religious exemption, medical exemption only, and personal
+belief exemptions. Religious and medical exemptions are self-explanatory. Personal belief 
+exemptions are very broad and may overlap with other exemptions such as religious exemptions.
+From strictest vaccination legislation to the most lax: medical exemptions only, religious
+exemptions, personal belief exemptions.
+
+Total Health Spending 
+
+- This feature represents the total health spending that each state spends per year (represented
+in millions of dollars). This includes “spending for all privately and publicly funded personal 
+health care services and products (hospital care, physician services, nursing home care, 
+prescription drugs, etc.) by state of residence.” It should be noted, however, that we decided it
+would be prudent to compute spending per capita, since states have widely varying populations. Therefore,
+just examining the total health spending per state would fail to take into account that the
+population may play a role (states with lower populations likely spend much less and vice versa).
+
+Population
+
+- Represents the population of each state. This feature can be used for a myriad of things. For 
+example, it was used in conjunction with the "Total Health Spending" feature to find the total
+health spending per capita in each state. 
+
+spending_per_capita
+
+- This feature represents the total health spending per capita for each state. This was a variable
+that we created by dividing the "Total Health Spending" observations by the "Population" 
+observations. This feature will give us more reliable readings on how much each state spends on
+health by removing any skewing of the data that might affect "Total Health Spending" due to 
+population size differences.
+
+population_density
+
+- Represents the number of people per square mile for each state. This feature may have multiple
+uses. It provides insight into the geography and demographics of different states.
+
+Median_income
+
+- Represents the median household income in each state. The values are in dollars.
+
+HighSchool_Plus
+
+- Represents the percentage of residents in each state that have completed at least 
+high school. This percentage also includes those who have completed studies after high school 
+such as undergraduate degree (bachelor's).
+
+Bachelors_Plus
+
+-  Represents the percentage of residents in each state that have completed at least a 
+bachelor's (undergraduate degree). This also includes those who have pursued further
+education (postgraduate education, etc). 
+
+spending_per_pupil
+
+- Represents the number of dollars spent per elementary and secondary (K-12) student by
+state.
diff --git a/vaccination_viruses/Deliverable 2/code/analyze_data.Rmd b/vaccination_viruses/Deliverable 2/code/analyze_data.Rmd
@@ -0,0 +1,72 @@
+---
+title: "analyze_data"
+output: pdf_document
+---
+Setup and import data. Rename the year columns because read.csv prefixes column names that start with a digit with "X".
+```{r setup, include=FALSE}
+library(tidyverse)
+library(olsrr)
+library(reshape2)
+library(broom)
+# Merged per-state table.
+data <- read.csv(file = "../../data/full_data.csv")
+
+# Map each X<year> column (X2007 ... X2017) back to its bare year label.
+# all_of() makes the rename fail loudly if any of the columns is missing.
+year_columns <- setNames(paste0("X", 2007:2017), as.character(2007:2017))
+data <- data %>% rename(all_of(year_columns))
+```
+
+Select only columns with vaccination rate data (2007-2017) and use regression to find the rate of change from year to year.
+Then plot the rate of change and do a regression on the rate of change against spending per capita, population density, etc. to see if any of these factors has any correlation with rate of change of vaccination rate.
+```{r plot1}
+# Reshape the yearly vaccination-rate columns into long form: one row per
+# state and year. The year is parsed from the column label itself, so it does
+# not depend on the order in which the year columns appear.
+year_labels <- as.character(2007:2017)
+data2 <- data %>%
+  select(States, all_of(year_labels)) %>%
+  pivot_longer(-States, names_to = "Year", values_to = "value") %>%
+  mutate(Year = as.numeric(Year))
+
+# Fit value ~ Year separately for every state and collect the tidied
+# coefficient table of each fit (one row per model term).
+state_fits <- data2 %>%
+  group_by(States) %>%
+  group_modify(function(state_rows, state_key) {
+    tidy(lm(value ~ Year, data = state_rows))
+  }) %>%
+  ungroup()
+
+# One row per state: intercept, yearly slope and their p-values, sorted by
+# the slope (Year_coefficient).
+trend <- state_fits %>%
+  select(States, term, estimate, p.value) %>%
+  pivot_wider(names_from = term, values_from = c(estimate, p.value)) %>%
+  rename(
+    Intercept = `estimate_(Intercept)`,
+    Year_coefficient = `estimate_Year`,
+    p.value_Intercept = `p.value_(Intercept)`
+  ) %>%
+  arrange(Year_coefficient)
+trend
+
+# Per-state slope, with states ordered along the x axis by that slope.
+ggplot(data = trend, aes(x = reorder(States, Year_coefficient), y = Year_coefficient)) +
+  geom_point() +
+  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
+  xlab("State") +
+  ylab("Change in vaccination rate (%)") +
+  ggtitle("Plot of States' change in vaccination rate")
+
+#ggsave(file="../figures/test.png", width=8, height=4, dpi=500)
+
+# Regress the per-state slope on the candidate predictors, then print the
+# model summary and the collinearity diagnostics.
+all_data <- merge(data, trend, by = "States")
+coeffModel <- lm(
+  Year_coefficient ~ spending_per_capita + population_density + Median_income +
+    HighSchool_Plus + Bachelors_Plus + spending_per_pupil + Exemption,
+  data = all_data
+)
+summary(coeffModel)
+ols_coll_diag(coeffModel)
+```
+
+Regress vaccination rate against many factors. Prints out summary and Tolerance/VIF to verify.
+```{r regression against vaccination rate}
+# Full model: 2017 vaccination rate against all candidate predictors plus the
+# exemption category.
+model3 <- lm(
+  `2017` ~ spending_per_capita + population_density + Median_income +
+    HighSchool_Plus + Bachelors_Plus + spending_per_pupil + Exemption,
+  data = data
+)
+
+# Coefficient table first, then the collinearity diagnostics.
+summary(model3)
+ols_coll_diag(model3)
+
+
+```
+
+Regresses vaccination rate against only spending per capita and % of population with a bachelor's degree.
+```{r regression2}
+# Reduced model: 2017 vaccination rate against per-capita health spending and
+# the share of the population with a bachelor's degree.
+# The column is spelled `Bachelors_Plus` (capital P) in every other model in
+# this file; R names are case-sensitive, so a lowercase "plus" would make
+# lm() fail with "object not found".
+model2 <- lm(`2017` ~ spending_per_capita + Bachelors_Plus, data = data)
+summary(model2)
+ols_coll_diag(model2)
+#pairs(data %>% select(spending_per_capita, population_density, Median_income, HighSchool_Plus, Bachelors_Plus, spending_per_pupil))
+```
diff --git a/vaccination_viruses/Deliverable 2/code/clean_data.Rmd b/vaccination_viruses/Deliverable 2/code/clean_data.Rmd
@@ -0,0 +1,95 @@
+---
+title: "Clean_data"
+output: pdf_document
+---
+Setup
+```{r setup, include=FALSE}
+library(tidyverse)
+```
+
+Import and Clean Health Expenditures Data
+```{r clean_healthExpenditures}
+# data from 2014
+# The file is read in two passes with different `skip` offsets: the one-row
+# frame from the first pass supplies the column names for the 51 rows read in
+# the second pass.
+healthcareSpending_headers <- read.csv(
+  file = '../../data/healthExpendituresState.csv',
+  skip = 1, nrows = 1, as.is = TRUE
+)
+healthcareSpending <- read.csv(
+  file = '../../data/healthExpendituresState.csv',
+  skip = 3, nrows = 51, as.is = TRUE
+)
+colnames(healthcareSpending) <- healthcareSpending_headers
+
+# Remove "$" and "," so the totals parse as numbers; rename the key column to
+# the name used by the merge step.
+healthcareSpending <- healthcareSpending %>%
+  mutate(
+    `Total Health Spending` = as.numeric(
+      gsub("[$,]", "", `Total Health Spending`)
+    )
+  ) %>%
+  rename(States = Location)
+```
+
+Import and Clean Population Data
+```{r clean_population_by_state}
+# data from 2019
+population <- read.csv(file = '../../data/population_by_state.csv')
+
+# Sort rows by state name, then drop thousands separators so Population is
+# numeric.
+population <- population[order(population$States), ]
+population$Population <- as.numeric(
+  gsub(",", "", population$Population, fixed = TRUE)
+)
+```
+
+Import and Clean Vaccination Rate Data
+```{r vaccination rate}
+# data from 2007-2017
+vaccinationRate <- read.csv(file = '../../data/vaccination_19_35months_exemption.csv')
+
+# Map each X<year> column (X2007 ... X2017) back to its bare year label.
+# all_of() makes the rename fail loudly if any of the columns is missing.
+year_columns <- setNames(paste0("X", 2007:2017), as.character(2007:2017))
+vaccinationRate <- vaccinationRate %>% rename(all_of(year_columns))
+```
+
+Import and Clean Population Density Data
+```{r populationDensity}
+# data from 2020
+# Keep only the state name and the density, exposing the latter under the
+# column name used in the merged table.
+populationDensity <- read.csv(file = '../../data/populationDensity.csv') %>%
+  select(States, population_density = Density)
+```
+
+Import and Clean Median Income Data
+```{r medianIncome}
+# data from 2017
+# Keep the state name and income columns; strip thousands separators so the
+# income parses as a number.
+medianIncome <- read.csv(file = '../../data/medianHouseholdIncome.csv') %>%
+  select(States, Median_income) %>%
+  mutate(Median_income = as.numeric(gsub(",", "", Median_income, fixed = TRUE)))
+```
+
+Import Educational Attainment Data
+```{r educational_attainment}
+# data from 2020
+# Loaded without further transformation in this chunk.
+educational_attainment <- read.csv("../../data/educational_attainment.csv")
+```
+
+Import and Clean Educational Spending Data
+```{r educationSpending}
+# data from 2016
+# Values are formatted like "$9,236 "; remove the currency symbol and the
+# thousands separator before converting to numeric.
+educationSpending <- read.csv(file = '../../data/EducationSpending.csv') %>%
+  mutate(spending_per_pupil = as.numeric(gsub("[$,]", "", spending_per_pupil)))
+```
+
+Merge and Export all Data to .csv
+```{r merge_all_data and export}
+# Vaccination rates cover 2007-2017, Exemption is from 2020, healthcare
+# spending is from 2014 and population is from 2019, so spending_per_capita
+# combines figures from different reference years.
+
+# Join every source on the state name. merge() keeps only states present in
+# both inputs, so a state missing from any source drops out of the result.
+# NOTE(review): the 1000000 factor presumably converts spending reported in
+# millions of dollars to dollars per person; confirm against the data source.
+data <- vaccinationRate %>%
+  merge(healthcareSpending, by = "States") %>%
+  merge(population, by = "States") %>%
+  mutate(
+    spending_per_capita = (`Total Health Spending` / Population) * 1000000
+  ) %>%
+  merge(populationDensity, by = "States") %>%
+  merge(medianIncome, by = "States") %>%
+  merge(educational_attainment, by = "States") %>%
+  merge(educationSpending, by = "States")
+
+# Write next to the input CSVs, which is where analyze_data.Rmd reads
+# full_data.csv from. row.names = FALSE keeps the row index out of the file so
+# it does not come back as an extra column on the next read.
+write.csv(data, "../../data/full_data.csv", row.names = FALSE)
+```
+
+
+
+
diff --git a/vaccination_viruses/Deliverable 2/code/test.png b/vaccination_viruses/Deliverable 2/code/test.png
diff --git a/vaccination_viruses/Final Deliverable/CS506 Final Project - Final Deliverable.pdf b/vaccination_viruses/Final Deliverable/CS506 Final Project - Final Deliverable.pdf
diff --git a/vaccination_viruses/data/.DS_Store b/vaccination_viruses/data/.DS_Store
diff --git a/vaccination_viruses/data/EducationSpending.csv b/vaccination_viruses/data/EducationSpending.csv
@@ -0,0 +1,52 @@
+States,spending_per_pupil
+Alabama,"$9,236 "
+Alaska,"$17,510 "
+Arizona,"$7,613 "
+Arkansas,"$9,846 "
+California,"$11,495 "
+Colorado,"$9,575 "
+Connecticut,"$18,958 "
+Delaware,"$14,713 "
+District of Columbia,"$19,159 "
+Florida,"$8,920 "
+Georgia,"$9,769 "
+Hawaii,"$13,748 "
+Idaho,"$7,157 "
+Illinois,"$14,180 "
+Indiana,"$9,856 "
+Iowa,"$11,150 "
+Kansas,"$9,960 "
+Kentucky,"$9,863 "
+Louisiana,"$11,038 "
+Maine,"$13,278 "
+Maryland,"$14,206 "
+Massachusetts,"$15,593 "
+Michigan,"$11,668 "
+Minnesota,"$12,382 "
+Mississippi,"$8,702 "
+Missouri,"$10,313 "
+Montana,"$11,348 "
+Nebraska,"$12,299 "
+Nevada,"$8,960 "
+New Hampshire,"$15,340 "
+New Jersey,"$18,402 "
+New Mexico,"$9,693 "
+New York,"$22,366 "
+North Carolina,"$8,792 "
+North Dakota,"$13,373 "
+Ohio,"$12,102 "
+Oklahoma,"$8,097 "
+Oregon,"$10,842 "
+Pennsylvania,"$15,418 "
+Rhode Island,"$15,532 "
+South Carolina,"$10,249 "
+South Dakota,"$9,176 "
+Tennessee,"$8,810 "
+Texas,"$9,016 "
+Utah,"$6,953 "
+Vermont,"$17,873 "
+Virginia,"$11,432 "
+Washington,"$11,534 "
+West Virginia,"$11,291 "
+Wisconsin,"$11,456 "
+Wyoming,"$16,442 "
diff --git a/vaccination_viruses/data/cb12-33table1states-1.xls b/vaccination_viruses/data/cb12-33table1states-1.xls
diff --git a/vaccination_viruses/data/data_for_introduction/reduction_cases.xlsx b/vaccination_viruses/data/data_for_introduction/reduction_cases.xlsx
diff --git a/vaccination_viruses/data/data_for_introduction/reduction_deaths.xlsx b/vaccination_viruses/data/data_for_introduction/reduction_deaths.xlsx