explore_ft_data.Rmd

Flaveria trinervia eXpress data exploration
========================================================

Load the data

```{r}
library(ggplot2)
library(gplots)
library(reshape2)
# Work from the directory holding the eXpress output so the relative file
# names below resolve.
setwd('~/hydrogen/flaveria/Ft/Ft_2/')
# Both tables are tab-separated with a header row. `header = TRUE` is written
# out in full: `head=` only works through partial argument matching, and `T`
# is an ordinary variable that can be reassigned.
data_tpm <- read.csv('Ft_express.tpm', sep = "\t", header = TRUE)
data_counts <- read.csv('Ft_express.eff_count', sep = "\t", header = TRUE)
```

## TPM

Correlation matrix

```{r tpm_correlation, fig.width=12, fig.height=12}
# Keep only the per-sample TPM columns (columns 3 to 20 of the table).
tpms <- data_tpm[, 3:20]
# Swap the digits on either side of the dot in each column name
# (e.g. "1.2" becomes "2.1").
names(tpms) <- gsub(
  pattern = "([0-9])\\.([0-9])",
  replacement = "\\2.\\1",
  x = names(tpms)
)

# Pairwise correlation between samples, reshaped to one row per sample pair.
tpm_cor_long <- melt(cor(tpms))
tpm_cor_axes <- names(tpm_cor_long)[1:2]

# Tile plot of the correlation matrix with each coefficient printed in its cell.
ggplot(
  data = tpm_cor_long,
  aes_string(x = tpm_cor_axes[1], y = tpm_cor_axes[2], fill = "value")
) +
  geom_tile() +
  geom_text(aes(label = round(value, 2))) +
  ggtitle("TPM Correlation Matrix") +
  xlab('') +
  ylab('')
# With no plot argument ggsave() writes the most recently built plot.
ggsave("Ft_correlation.tpm.pdf")

```

Expression Density Plot

```{r tpm_density, fig.width=12, fig.height=12}
# Long format: one row per (sample column, TPM value).
tpms_melt <- melt(tpms)

# Treat anything below 0.01 TPM as zero; log(0) is -Inf, so these values
# do not contribute a finite point to the density.
below_floor <- tpms_melt$value < 0.01
tpms_melt$value[below_floor] <- 0

# Strip the ".<digit>" part of the sample name to get a grouping label.
tpms_melt$stage <- gsub(
  pattern = "\\.[0-9]",
  replacement = "",
  x = tpms_melt$variable
)

# One log-TPM density curve per stage.
ggplot(
  tpms_melt,
  aes(x = log(value), group = stage, colour = stage)
) +
  geom_density()
ggsave("Ft_density.tpm.pdf")
```

Heatmap on TPMs

```{r heatmap, fig.width=12, fig.height=12}
# Draw a clustered heatmap of the pairwise distances between the columns of
# `data`.
#
# data: a table or matrix with one column per sample.
plotheatmap <- function(data) {
  # dist() works on rows, so transpose to put samples in rows first.
  by_sample <- t(as.matrix(data))
  sample_dist <- as.matrix(dist(by_sample))
  heatmap.2(sample_dist, trace = "none")
}
# The commented-out pdf()/dev.off() pair would send the heatmap to
# Ft_heatmap.pdf if re-enabled.
#pdf("Ft_heatmap.pdf")
plotheatmap(tpms)
#dev.off()
```

## Effective counts

```{r eff_corr, fig.width=12, fig.height=12}
library(EBSeq)
library(DESeq)
# Remove any rows that consist entirely of zeros.
#
# df: a data frame (or matrix) of numeric values.
# Returns `df` restricted to rows with at least one entry that is not zero.
# Rows containing NA are kept, since they cannot be shown to be all-zero.
# The result keeps its two-dimensional shape even when `df` has one column.
remove_zero_rows <- function(df) {
  not_zero <- as.matrix(df) != 0
  # NA comparisons count as "not known to be zero" so such rows are retained
  # rather than turning into all-NA phantom rows.
  keep <- rowSums(not_zero | is.na(not_zero)) > 0
  df[keep, , drop = FALSE]
}

# Divide each column of `counts` by its normalisation factor and round.
#
# counts:      table or matrix with one column per sample.
# normfactors: one factor per column of `counts`.
# Returns a matrix of rounded, scaled counts.
normalise_counts <- function(counts, normfactors) {
  # Transposing puts samples in rows, so the division recycles the factors
  # sample by sample; transpose back afterwards.
  samples_in_rows <- t(counts)
  scaled <- samples_in_rows / normfactors
  round(t(scaled))
}

# Effective-count columns for the samples (columns 3 to 20), keyed by contig.
counts <- data_counts[, 3:20]
rownames(counts) <- data_counts$contig
# Round to whole counts, then drop contigs that are zero in every sample.
counts <- remove_zero_rows(round(counts))

# Size-factor normalised counts via DESeq. All 18 samples are given the same
# dummy condition label. NOTE(review): `cds` is not referenced again in the
# visible part of this file; the plots below use the un-normalised `counts`.
cds <- newCountDataSet(counts, condition = rep('a', 18))
cds <- estimateSizeFactors(cds)
# `normalized` is spelled out in full instead of relying on partial matching.
cds <- counts(cds, normalized = TRUE)
rownames(cds) <- rownames(counts)

# Swap the digits on either side of the dot in each sample name. The dot is
# escaped so that only a literal "." is matched, as in the TPM section.
colnames(counts) <- gsub(colnames(counts), pattern = "([0-9])\\.([0-9])", replacement = "\\2.\\1")
# Plot the pairwise correlation between the columns of `counts` as a tile
# grid, with each coefficient printed in its cell.
#
# counts: table or matrix with one column per sample.
# The plot is printed explicitly so that it reaches the active graphics
# device when called from inside a function.
plotcountcor <- function(counts) {
  cor_long <- melt(cor(counts))
  axes <- names(cor_long)
  tiles <- ggplot(
    data = cor_long,
    aes_string(x = axes[1], y = axes[2], fill = "value")
  ) +
    geom_tile() +
    geom_text(aes(label = round(value, 2))) +
    ggtitle("Effective Count Correlation matrix") +
    xlab('') +
    ylab('')
  print(tiles)
}
# Send the correlation plot to a PDF file, then close the device.
pdf("Ft_correlation.eff.pdf")
plotcountcor(counts)
dev.off()
```

log count distributions

```{r log_count, fig.width=12, fig.height=12}
# Long format of the effective counts: one row per (contig, sample, count).
cf <- as.data.frame(counts)
cf$contig <- rownames(counts)
counts_melt <- melt(cf, id.vars = 'contig')
# One log-count density curve per sample. The `+` must end the first line:
# a line that *starts* with `+` is parsed as a separate statement, so the
# density layer would never be added to the plot.
ggplot(counts_melt, aes(x = log(value), colour = variable)) +
  geom_density()
ggsave("Ft_density.eff.pdf")
```

Box plot with outliers

```{r boxplot1, fig.width=12, fig.height=12}
# One box per sample over all effective counts, outliers included.
ggplot(
  counts_melt,
  aes(x = variable, y = value, colour = variable)
) +
  geom_boxplot()
ggsave("Ft_boxplot.eff.pdf")
```

Plot distribution of counts over 100,000

```{r boxplot2, fig.width=12, fig.height=12}
# Keep only observations with an effective count above 100,000.
# (The variable and file names say "50k", but the threshold used is 100000.)
is_high <- counts_melt$value > 100000
counts_50k <- counts_melt[is_high, ]

# One box per sample for the high-count observations only.
ggplot(
  counts_50k,
  aes(x = variable, y = value, colour = variable)
) +
  geom_boxplot()
ggsave("Ft_boxplot50k.eff.pdf")
```

There are some high counts (>2e+05) distorting the distributions. Let's remove those rows so we can check how good replication is for the majority of genes.
```{r fig.width=12, fig.height=12}
# Contigs with an effective count above 2e+05 in at least one sample.
high_contigs <- unique(counts_50k[which(counts_50k$value > 2e+05), ]$contig)

# Remove the highly expressed contigs. A logical mask is used instead of
# `-which(...)`: when nothing matches, `-which()` yields an empty index and
# would silently drop every row instead of none.
fixed_tpm <- data_tpm[!(data_tpm$contig %in% high_contigs), ]
# Swap the digits on either side of the dot in each column name. The dot is
# escaped so that only a literal "." is matched.
names(fixed_tpm) <- gsub(names(fixed_tpm), pattern = "([0-9])\\.([0-9])", replacement = "\\2.\\1")
tpms <- fixed_tpm[, 3:20]
rownames(tpms) <- fixed_tpm$contig

# Correlation between samples on the filtered TPMs, one row per sample pair.
tpm_cor_long <- melt(cor(tpms))
tpm_cor_axes <- names(tpm_cor_long)[1:2]
ggplot(
  data = tpm_cor_long,
  aes_string(x = tpm_cor_axes[1], y = tpm_cor_axes[2], fill = "value")
) +
  geom_tile() +
  ggtitle("TPM Correlation Matrix with high expression removed") +
  geom_text(aes(label = round(value, 2))) +
  xlab('') +
  ylab('')
ggsave("Ft_correlation_no_outliers.tpm.pdf")

```

We can also use a correlation metric less sensitive to outliers
```{r fig.width=12, fig.height=12}
# Rank-based (Spearman) correlation between samples, one row per sample pair.
spearman_long <- melt(cor(counts, method = "spearman"))

# Remove the entries equal to 1 so that the colour gradient is spread over the
# remaining correlations. A logical mask is used instead of `-which(...)`,
# which would drop every row if no entry were exactly 1. NA entries are kept.
is_perfect <- !is.na(spearman_long$value) & spearman_long$value == 1
spearman_long <- spearman_long[!is_perfect, ]

spearman_axes <- names(spearman_long)[1:2]
ggplot(
  data = spearman_long,
  aes_string(x = spearman_axes[1], y = spearman_axes[2], fill = "value")
) +
  geom_tile() +
  geom_text(aes(label = round(value, 2))) +
  xlab('') +
  ylab('') +
  ggtitle("Effective count Spearman (ranked) correlation matrix")
ggsave("Ft_spearman.eff.pdf")
```

Just re-checking the distributions with outliers removed and aggregated by sample 
```{r fig.width=12, fig.height=12}
# Long format of the filtered TPMs: one row per (sample column, TPM value).
tpms_melt <- melt(tpms)

# Treat anything below 0.01 TPM as zero before taking logs.
below_floor <- tpms_melt$value < 0.01
tpms_melt$value[below_floor] <- 0

# Strip the ".<digit>" part of the sample name to get a grouping label.
tpms_melt$stage <- gsub(
  pattern = "\\.[0-9]",
  replacement = "",
  x = tpms_melt$variable
)

# One log-TPM density curve per stage.
ggplot(
  tpms_melt,
  aes(x = log(value), group = stage, colour = stage)
) +
  ggtitle("TPM Density plot by section") +
  geom_density()
ggsave("Ft_density_no_outliers.tpm.pdf")
```

and not aggregated...
```{r fig.width=12, fig.height=12}
# Same log-TPM densities, but with one curve per individual sample column.
ggplot(
  tpms_melt,
  aes(x = log(value), colour = variable)
) +
  ggtitle("TPM Density plot all samples") +
  geom_density()
```

We should probably re-normalise the TPMs within each column, since removing the most highly expressed rows means the columns no longer sum to one million.
```{r fig.width=12, fig.height=12}
#print(apply(tpms, 2, sum))
# Rescale a numeric vector so that its elements sum to one million.
#
# x: numeric vector (e.g. one sample's TPM column).
# Returns a vector the same length as `x`, each element divided by the total
# of `x` and multiplied by 1e6. The value is returned visibly; the previous
# body ended in an assignment, which returns its result invisibly.
renorm <- function(x) {
  (x / sum(x)) * 1e6
}
# Rescale every sample column with renorm() and go back to a data frame.
renormalised <- apply(tpms, 2, renorm)
tpms_rn <- as.data.frame(renormalised)
#print(apply(tpms_rn, 2, sum))

# Long format: one row per (sample column, renormalised TPM value).
tpms_rn_melt <- melt(tpms_rn)
# Treat anything below 0.01 as zero before taking logs.
below_floor <- tpms_rn_melt$value < 0.01
tpms_rn_melt$value[below_floor] <- 0
# Strip the ".<digit>" part of the sample name to get a grouping label.
tpms_rn_melt$stage <- gsub(
  pattern = "\\.[0-9]",
  replacement = "",
  x = tpms_rn_melt$variable
)
# One log-TPM density curve per stage.
ggplot(
  tpms_rn_melt,
  aes(x = log(value), group = stage, colour = stage)
) +
  ggtitle("Renormalised TPM Density plot") +
  geom_density()
```

Plot the TPM correlation matrices again with outliers removed and then renormalised
```{r fig.width=12, fig.height=12}

# Default cor() correlation between samples on the renormalised TPMs.
tpm_cor_long <- melt(cor(tpms_rn))
tpm_cor_axes <- names(tpm_cor_long)[1:2]
ggplot(
  data = tpm_cor_long,
  aes_string(x = tpm_cor_axes[1], y = tpm_cor_axes[2], fill = "value")
) +
  geom_tile() +
  ggtitle("TPM Correlation matrix") +
  geom_text(aes(label = round(value, 2))) +
  xlab('') +
  ylab('')

# Rank-based (Spearman) correlation on the same data.
spearman_long <- melt(cor(tpms_rn, method = "spearman"))
# Remove the entries equal to 1 so that the colour gradient is spread over the
# remaining correlations. A logical mask is used instead of `-which(...)`,
# which would drop every row if no entry were exactly 1. NA entries are kept.
is_perfect <- !is.na(spearman_long$value) & spearman_long$value == 1
spearman_long <- spearman_long[!is_perfect, ]
spearman_axes <- names(spearman_long)[1:2]
ggplot(
  data = spearman_long,
  aes_string(x = spearman_axes[1], y = spearman_axes[2], fill = "value")
) +
  geom_tile() +
  geom_text(aes(label = round(value, 2))) +
  xlab('') +
  ylab('') +
  ggtitle("TPM Spearman (ranked) correlation matrix")
```

Chloroplast genes could be causing problems: if chloroplast genes were more highly expressed, or chloroplast transcripts were captured at a higher rate, in one replicate's extraction, that could lead to broken correlations.
```{r fig.width=12, fig.height=12}
setwd('~/hydrogen/flaveria/Ft/Ft_2/')
# List of chloroplast contig names; the file has no header row.
chloro <- read.csv('Ft_chloro_contigs.txt', header = FALSE)
names(chloro)[1] <- 'contig'

# Remove chloroplast contigs. A logical mask is used instead of `-which(...)`:
# when no contig matches, `-which()` yields an empty index and would silently
# drop every row instead of none.
nc_tpms <- fixed_tpm[!(fixed_tpm$contig %in% chloro$contig), ]
tpms <- nc_tpms[, 3:20]
rownames(tpms) <- nc_tpms$contig
# Rescale each sample column after the rows were removed.
tpms_rn <- as.data.frame(apply(tpms, 2, renorm))

# Correlation between samples, one row per sample pair.
tpm_cor_long <- melt(cor(tpms_rn))
tpm_cor_axes <- names(tpm_cor_long)[1:2]
ggplot(
  data = tpm_cor_long,
  aes_string(x = tpm_cor_axes[1], y = tpm_cor_axes[2], fill = "value")
) +
  geom_tile() +
  ggtitle("TPM Correlation Matrix no outliers, no chloroplast") +
  geom_text(aes(label = round(value, 2))) +
  xlab('') +
  ylab('')


# Log-TPM density per stage on the filtered, renormalised data.
tpms_rn_melt <- melt(tpms_rn)
# Treat anything below 0.01 as zero before taking logs.
tpms_rn_melt$value[tpms_rn_melt$value < 0.01] <- 0
# Strip the ".<digit>" part of the sample name to get a grouping label.
tpms_rn_melt$stage <- gsub(tpms_rn_melt$variable, pattern = "\\.[0-9]", replacement = "")
ggplot(tpms_rn_melt, aes(x = log(value), group = stage, colour = stage)) +
  geom_density() +
  ggtitle("TPM Density plot by section")
```

Annotation
```{r fig.width=12, fig.height=12}
setwd('~/hydrogen/flaveria/Ft/Ft_2/')

# Contig annotation table (tab-separated, no header row); columns are named
# species, contig and annotation below.
anno <- read.csv('Ft_annotation.txt', header = FALSE, sep = '\t')
names(anno) <- c("species", "contig", "annotation")

# Key identifiers of interest with a description for each (no header row).
key <- read.csv('key_agi.txt', header = FALSE, sep = '\t')
names(key) <- c("agi", "desc")

# Attach the annotation to the TPM table, keeping unmatched rows from both.
anno_tpm <- merge(data_tpm, anno, by = "contig", all = TRUE)

library(plyr)
# Drop columns by position.
# NOTE(review): the positions 1, 2, 21 and 22 assume a particular column
# layout of the merged table, and `annotation` must survive this step for the
# grouping below to work - confirm against the actual input files.
anno_tpm <- anno_tpm[, -c(1, 2, 21, 22)]
# Sum every numeric column within each annotation.
anno_tpm <- ddply(anno_tpm, .(annotation), numcolwise(sum))

# Keep only the annotations listed in the key table, then add descriptions.
in_key <- anno_tpm$annotation %in% key$agi
key_tpm <- anno_tpm[in_key, ]
desc_tpm <- merge(key_tpm, key, by.x = "annotation", by.y = "agi", all = FALSE)

# Write the result as a tab-separated table without row names or quoting.
write.table(
  desc_tpm,
  file = "Ft_key_agi.tpm.csv",
  row.names = FALSE,
  quote = FALSE,
  sep = "\t"
)
```