analysis.Rmd

---
title: "MAG Representation"
author: "Marian L Schmidt & Ian Morelan"
date: "`r format(Sys.time(), '%d %B, %Y')`"
output:
  html_document:
    code_folding: show
    highlight: default
    keep_md: no
    theme: journal
    toc: yes
    toc_float:
      collapsed: no
      smooth_scroll: yes
      toc_depth: 3
editor_options: 
  chunk_output_type: console
---

# Setup 
```{r setup, include=FALSE}
knitr::opts_chunk$set(eval = TRUE, 
                      echo = TRUE, 
                      cache = FALSE,
                      include = TRUE,
                      warning = FALSE,
                      collapse = FALSE,
                      message = FALSE,
                      engine = "R", # Chunks will always have R code, unless noted
                      error = TRUE,
                      fig.height = 7, fig.width = 7,
                      fig.path="./Figures/")  # Set the figure options
```


# Load Packages & Functions
```{r load-packages-funs}
# Load functions and packages that are necessary
source("functions.R")
library(plotly)

# We'd like to plot with pretty colors based on national park posters :)
# install.packages("devtools")
#devtools::install_github("katiejolly/nationalparkcolors")

library(nationalparkcolors)
# Assign the 4 colors for the 4 different depths 
colors4 <- park_palette("Saguaro", 4)
```


# Load Data

```{r import-data} 
# Import the tax data
tax_physeq <- import_gtdbtk_taxonomy_and_checkm(
  taxonomy_filename = "Tara_Oceans_Med/TOBG-MED-READCOUNTMATCH.bac120.tsv",
  checkm_filename = "Tara_Oceans_Med/TOBG-MED_qa.txt")

# Import the metadata
meta_physeq <- import_metadata(province_filename = "Tara_Oceans_Med/Sample-Province.tsv", 
                           sizeFraction_filename = "Tara_Oceans_Med/Sample-SizeFraction.tsv") %>%
  mutate(names = rownames(.)) %>%
  separate(., col = names, into = c("station", "fraction", "depth"), sep = "_") %>%
  mutate(r_names = paste(station, fraction, depth, sep = "_")) %>%
  column_to_rownames(var = "r_names") %>%
  dplyr::select(-c(station, fraction)) %>% 
  sample_data()

# Import the readcounts
mag_table <- import_readcounts(readcounts_filename = "Tara_Oceans_Med/TOBG-MED-TOTAL.readcounts")

# Import the tree
mag_tree <- import_tree(tree_filename = "Tara_Oceans_Med/GToTree_output.newick")

# Put into phyloseq object 
tara_physeq <- phyloseq(meta_physeq, tax_physeq, mag_table, mag_tree)
```

# Normalization
```{r data-normalized, fig.height=10, fig.width=12}
# Normalizing the matrix 
# Dividing rows by the genome size for the bin
# Genome size is different from Bin size
# Genome size is BinSize * BinCompletion

# Read in checkm data to calculate the expected 
checkm <- read.csv("Tara_Oceans_Med/TOBG-MED_qa.txt", sep = "\t", as.is = TRUE) %>%
  dplyr::select("Bin.Id", "Completeness", "Genome.size..bp.") %>%
  dplyr::rename(est_genome_size = "Genome.size..bp.") %>%
  # Make a new column for the expected genome size based on est_genome_size * completeness
  mutate(exp_genome_size = round(est_genome_size/(Completeness/100)))

ordered_checkm <- data.frame(Bin.Id = rownames(mag_table)) %>%
  left_join(checkm, by = "Bin.Id")

#Sanity check 
stopifnot(rownames(mag_table) == ordered_checkm$Bin.Id)

# Do the normalization
t_mat <- t(as.matrix(mag_table))
# divide the matrix columns by the genome size of each MAG
norm_mat <- t_mat/ordered_checkm$exp_genome_size
t_norm_mat <- t(norm_mat)

# Combine into a normalized phyloseq object
tara_norm_physeq <- phyloseq(meta_physeq, tax_physeq, mag_tree, 
                             otu_table(t_norm_mat, taxa_are_rows = TRUE))
```


## Heatmap

```{r heatmap, fig.height=8, fig.width=10}
# A clustered heatmap
heatmap(otu_table(tara_norm_physeq))

# Visualize only the top 50 taxa
top_50MAGs <- names(sort(taxa_sums(tara_norm_physeq), decreasing = TRUE))[1:50]
top_50MAGs_physeq <- prune_taxa(top_50MAGs, tara_norm_physeq)

# Subset only the bacterial samples
girus_top50MAGs <- subset_samples(top_50MAGs_physeq, size_fraction =="girus")

# Melt into long format and fix zeros to avoid infinity values
otu_long_50_girus <- psmelt(otu_table(girus_top50MAGs)) %>%
  mutate(log_abund = log2(Abundance + 0.0000001))

# Make a "heatmap" with geom_tile that works with plotly :) 
heat_plot <- otu_long_50_girus %>%
  ggplot(aes(x=Sample, y=OTU)) +
  geom_tile(aes(fill = log_abund)) +
  scale_fill_distiller(palette = "YlGnBu") +
  labs(title = "Top 50 MAGs") +
  theme(axis.text.x = element_text(angle = 30, vjust = 1, hjust = 1))

# Plot the plotly plot!
ggplotly(heat_plot)
```

## Beta diversity plots
```{r betadiv, fig.width = 9}
# Calculate the bray curtis distances for all samples
tara_norm.ord <- ordinate(tara_norm_physeq, method = "PCoA", distance = "bray")

# Make a faceted plot by size fraction
sizeFrac_PCoA_prov <- 
  plot_ordination(tara_norm_physeq, tara_norm.ord, color = "province") +
  facet_wrap(~size_fraction) +
  theme_minimal() +
  scale_color_brewer(palette = "Paired")

# Make it a plotly plot
ggplotly(sizeFrac_PCoA_prov)
```

```{r betadiv2, fig.height = 4, fig.width=5}
# Subset the bacterial samples and color by depth 
bact <- subset_samples(tara_norm_physeq, size_fraction == "bact")
# Calculate the bray curtis distances for a PCoA
bact.ord <- ordinate(bact, method = "PCoA", distance = "bray")
# Make the plot
bact_PCoA_depth <- plot_ordination(bact, bact.ord, color = "depth")+
  theme_minimal() + 
  geom_point(size = 2) + 
  scale_color_manual(values = colors4)

# Make it a plotly plot
ggplotly(bact_PCoA_depth)
```

## Abundance plots
```{r abundance plots}
# Select the top 4 taxa 
top_taxa <- names(sort(taxa_sums(bact),decreasing = TRUE))[1:4]
top_bact <- prune_taxa(top_taxa, bact)
# Melt the data to the long format
top_bact_df <- psmelt(top_bact)

# Plot it! :) 
abundplot <- ggplot(top_bact_df, aes(x=province, y= Abundance, color = depth)) +
  facet_wrap(~OTU, scales = "free_y") + 
  geom_jitter(size = 2) + theme_minimal()+ 
  labs(y = "Normalized MAG Abundance") + 
  theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 1),
        axis.title.x = element_blank()) +
  scale_color_manual(values = colors4)

# Make it interactive 
ggplotly(abundplot)
```