From 8587b3703eee7058fa53dd9281bdecab6506d3d5 Mon Sep 17 00:00:00 2001
From: Nicolas HOMBERG <nicolashomberg@gricad-nh.u-ga.fr>
Date: Mon, 2 Dec 2024 16:18:17 +0100
Subject: [PATCH] final challenge ?

---
 .../scoring_program/detailed_results copy.Rmd | 476 ------------------
 .../scoring_program/detailed_results.Rmd      |   8 +-
 2 files changed, 3 insertions(+), 481 deletions(-)
 delete mode 100644 phase-1_2_3/scoring_program/detailed_results copy.Rmd

diff --git a/phase-1_2_3/scoring_program/detailed_results copy.Rmd b/phase-1_2_3/scoring_program/detailed_results copy.Rmd
deleted file mode 100644
index 0161642..0000000
--- a/phase-1_2_3/scoring_program/detailed_results copy.Rmd	
+++ /dev/null
@@ -1,476 +0,0 @@
----
-title: "Visualize Results"
-author: "Elise Amblard, Hugo Barbot, Florent Chuffart and Magali Richard"
-date: "`r Sys.Date()`"
-output: 
-  # prettydoc::html_pretty: # create a styles.css file and snakemake doesn't like it
-    # self_contained: true
-    # theme: cayman
-    # highlight: github
-    # toc: true
-    # toc_float: true
-    # toc_depth: 3
-    # number_sections: true
-    # keep_tex: yes
-  rmarkdown::html_document:
-    theme: flatly
-    toc: true
-    # toc_float: true
-    toc_depth: 3
-    number_sections: true
----
-
-
-```{r label = "header", echo = FALSE, eval = TRUE,include=FALSE}
-knitr::opts_chunk$set(collapse=TRUE, comment = "#>", fig.width=9, fig.height=6, eval=TRUE, echo=FALSE, fig.align = 'center')#, results="hide")
-options(knitr.duplicate.label = "allow")
-```
-
-
-```{r}
-
-
-library(ggplot2)
-
-dic_datasets2short <- list(
-  invitro = "VITR" , 
-  invivo = "VIVO" , 
-  insilicopseudobulk = "SBN5" , 
-  insilicodirichletNoDep = "SDN5" , 
-  insilicodirichletNoDep4CTsource = "SDN4" , 
-  insilicodirichletNoDep6CTsource = "SDN6" , 
-  insilicodirichletEMFA = "SDE5" , 
-  insilicodirichletEMFAImmuneLowProp = "SDEL" , 
-  insilicodirichletCopule =  "SDC5" 
-)
-
-# input = "test_output/"
-# output = "test_output/"
-
-
-datasets = c("VITR", "VIVO", "SBN5", "SDN5", "SDN4", "SDN6", "SDE5", "SDEL", "SDC5")
-
-
-dir_name = paste0(input, .Platform$file.sep, "ref", .Platform$file.sep)
-groundtruh_list = list.files(dir_name,pattern="groundtruth*")
-
-# print(groundtruh_list)
-
-
-dic_longnames2short =list()
-
-ground_truth= list()
-score = list()
-for (groundthruth_name in groundtruh_list){
-  
-  gt_list = unlist(strsplit(groundthruth_name, "_"))
-  methods_name =  gt_list[2]
-  phase =  substr(gt_list[1], nchar(gt_list[1]), nchar(gt_list[1]))
-
-  dataset_name = paste0("mixes",phase,"_",methods_name,'_pdac.rds')
-
-  dic_longnames2short[[dataset_name]] = dic_datasets2short[[methods_name]]
-  # dic_longnames2short_gt[[groundthruth_name]] = dic_datasets2short[[methods_name]]
-
-  gt_file =  readRDS(file = paste0(dir_name,.Platform$file.sep, groundthruth_name) )
-
-  ground_truth[[dic_datasets2short[[methods_name]]]] = gt_file
-
-
-  score_file = paste0(output, .Platform$file.sep, "scores_",dataset_name)
-  score_data = readRDS(file = score_file )
-  score[[dic_datasets2short[[methods_name]] ]] = score_data
-}
-
-prediction <- readRDS(paste0(input,.Platform$file.sep,"res",.Platform$file.sep,"prediction.rds"))
-time = readRDS(file = paste0(input, .Platform$file.sep, "res", .Platform$file.sep, "Rprof.rds") )
-
-stopifnot(identical(names(prediction), names(time)))
-
-
-for (name in names(prediction)) {
-  names(time)[which(name == names(prediction))] = dic_longnames2short[name]
-  names(prediction)[which(name == names(prediction))] = dic_longnames2short[name]
-}
-
-
-datasets <- datasets[datasets %in%  unlist(dic_longnames2short)]
-prediction <- prediction[datasets]
-time <- time[datasets]
-ground_truth <- ground_truth[datasets]
-score <- score[datasets]
-
-```
-
-
-# Visualisations of your method
-
-## Table of scores for all datasets
-
-```{r Display_scores}
-score_df = data.frame(matrix(as.numeric(cbind(do.call(rbind,score),matrix(time,ncol=1))),nrow=length(datasets)))
-rownames(score_df) = datasets
-colnames(score_df) = c(names(score[[1]]), "time_exec")
-knitr::kable(
-  score_df, 
-  # format = "html",
-  table.attr = "style='overflow-x:auto; display:block; white-space:nowrap;'"
-)
-# kableExtra::scroll_box(score_table, width = "100%", height = "400px")
-```
-
-We have 9 different datasets: 
-
-- `VITR` : *in vitro* mixtures of pure cell types
-- `VIVO` : real *in vivo* bulk samples
-- `SBN5` : *in silico simulation* of pseudo-bulk from simulated single-cell data, for 5 cell-types
-- `SDN5` : *in silico simulation*, using a Dirichlet distribution, without correlation between features, for 5 cell-types
-- `SDN4` : *in silico simulation*, using a Dirichlet distribution, without correlation between features, for 4 cell-types
-- `SDN6` : *in silico simulation*, using a Dirichlet distribution, without correlation between features, for 6 cell-types
-- `SDE5` : *in silico simulation*, using a Dirichlet distribution, with EMFA dependence, for 5 cell-types
-- `SDEL` : *in silico simulation*, using a Dirichlet distribution, with EMFA dependence, for 5 cell-types including one with very low proportions
-- `SDC5` : *in silico simulation*, using a Dirichlet distribution, with copula dependence, for 5 cell-types
-
-```{r Heatmap_pred, eval=FALSE}
-## Heatmap of predictions
-for (dataset in seq_along(prediction)) {
-  if (is.null(colnames(prediction[[dataset]]))) {
-  colnames(prediction[[dataset]]) = paste0("Sample",
-                             format(1:ncol(prediction[[dataset]]),
-                                    digits = ceiling(log10(ncol(prediction[[dataset]])))))
-  }
-}
-
-heatmaps = lapply(seq_along(prediction), function(x)
-  ComplexHeatmap::Heatmap(
-  prediction[[x]],
-  col = circlize::colorRamp2(seq(0, 1, length.out = 32), rev(heat.colors(32))),
-  column_title = datasets[x],
-  cluster_rows = FALSE,
-  cluster_columns = FALSE,
-  heatmap_legend_param = list(title = "Proportions"),
-  column_names_rot = 45                                                         # HB: do we even show them ?
-))
-heatmaps[[1]] + heatmaps[[2]] #+ heatmaps[[3]]
-#heatmaps[[4]] + heatmaps[[5]] + heatmaps[[6]]
-#heatmaps[[7]] + heatmaps[[8]] + heatmaps[[9]]
-```
-
-## Boxplot of the absolute error
-
-```{r abs_bias, warning=FALSE}
-color_pallet = RColorBrewer::brewer.pal(9, "Set1")
-names(color_pallet) = datasets
-
-ground_truth_corrected = ground_truth
-prediction_corrected = prediction
-for (data in seq_along(datasets)) {
-  if (nrow(prediction[[data]])==(nrow(ground_truth[[data]])-1)) { # we have 1 missing cell type in the ground truth
-    if (all(rownames(prediction[[data]]) %in% rownames(ground_truth[[data]]))) {
-      ground_truth_corrected[[data]] = ground_truth_corrected[[data]][rownames(prediction[[data]]),]
-    }
-    else {stop("Cell types names do not match")}
-  }
-  if (nrow(prediction[[data]])==(nrow(ground_truth[[data]])+1)) { # we have 1 missing cell type in the reference
-    if (all(rownames(ground_truth[[data]]) %in% rownames(prediction[[data]]))) {
-      extra_type = matrix(0,ncol=ncol(ground_truth[[data]]),nrow=1)
-      rownames(extra_type) = setdiff(rownames(prediction[[data]]),rownames(ground_truth[[data]]))
-      ground_truth_corrected[[data]] = rbind(ground_truth_corrected[[data]],extra_type)
-      prediction_corrected[[data]] = prediction_corrected[[data]][rownames(ground_truth_corrected[[data]]),]
-    }
-    else {stop("Cell types names do not match")}
-  }
-  prediction_corrected[[data]] = prediction_corrected[[data]][rownames(ground_truth_corrected[[data]]),]
-}
-
-cell_types = rownames(prediction[[1]])
-abs_bias = lapply(seq_along(prediction), function(x) abs(prediction_corrected[[x]][rownames(ground_truth_corrected[[x]]),] - ground_truth_corrected[[x]][rownames(ground_truth_corrected[[x]]),]))
-abs_bias = data.frame(
-  Error = unlist(lapply(abs_bias,c)),
-  CellType = unlist(lapply(abs_bias, function(x) rep(rownames(x),ncol(x)))),
-  Dataset = unlist(lapply(seq_along(abs_bias), function(x) rep(datasets[x],ncol(abs_bias[[x]])*nrow(abs_bias[[x]]))))
-)
-abs_bias$Dataset = factor(abs_bias$Dataset, levels = datasets)
-abs_bias[abs_bias$Dataset == "VIVO", "Error"] = NA_real_
-
-# boxplots = lapply(cell_types, function(CT)
-#   ggplot(abs_bias, aes(x=Dataset, y=Error, color=Dataset)) +
-#     geom_boxplot() +
-#     geom_point() +
-#     xlab("") )
-#     #see::theme_modern(axis.text.angle = 40, axis.text.size = 5))
-# #ggpubr::ggarrange(plotlist = boxplots, common.legend = T, labels = cell_types)
-# #boxplots[[1]]
-
-boxplots = ggplot(abs_bias, aes(x=CellType, y=Error, fill=Dataset)) +
-  geom_boxplot(position = position_dodge2(padding = 0.2)) + # 
-  labs(
-    title = paste("Absolute bias among each cell type by your method on each dataset"),
-    x = "Cell types",
-    y = "Absolut bias"
-  ) +
-  theme_light() +
-  theme(
-    plot.title = element_text(hjust = 0.5),
-    legend.position = "bottom"
-  ) +
-  scale_fill_manual(
-    values = color_pallet[-which(names(color_pallet) == "VIVO")]
-  )
-plotly::layout(
-  plotly::ggplotly(boxplots), 
-  boxmode = "group",
-  legend = list(
-    x = 0,
-    y = -0.3,
-    xanchor = 'left',
-    yanchor = 'bottom',
-    orientation = 'h')
-)
-```
-
-<!-- Boxplot of absolute error between estimated proportions and ground truth proportions $|\hat{\beta}_{prop} - \beta_{true\_prop}|$ for each cell type. -->
-Boxplot of absolute error between estimated proportions $\hat{\beta}$ and ground truth proportions $\beta$ for each cell type. For the sample $j$ and the cell type $i$,  $\text{Absolut error}_{i,j} = |\hat{\beta}_{i,j} - \beta_{i,j}|$
-
-## Boxplot of each metric
-
-```{r scores_samples_all_metrics, warning=FALSE}
-correlationP_1col = function(Ai_real, Ai_pred) {
-  # Ai_pred = Ai_pred[names(Ai_real)]
-  return(cor(Ai_real, Ai_pred, method = "pearson"))
-}
-correlationS_1col = function(Ai_real, Ai_pred) {
-  # Ai_pred = Ai_pred[names(Ai_real)]
-  return(cor(Ai_real, Ai_pred, method = "spearman"))
-}
-eval_Aitchison_1col = function(Ai_real, Ai_pred, min = 1e-9) {
-  # Ai_pred = Ai_pred[names(Ai_real)]
-  # Aitchison dist doesn't like 0 
-  Ai_real[Ai_real < min] = min
-  Ai_pred[Ai_pred < min] = min
-  return(coda.base::dist(rbind(Ai_real, Ai_pred), method = "aitchison")[1])
-}
-eval_Aitchison = function(A_real, A_pred, min = 1e-9) {
-  # A_pred = A_pred[rownames(A_real),]
-  # Aitchison dist doesn't like 0 
-  A_real[A_real < min] = min
-  A_pred[A_pred < min] = min
-  res = c()
-  for (i in seq(ncol(A_real))) {
-    res[i] = coda.base::dist(rbind(t(A_real[,i]), t(A_pred[,i])), method = "aitchison")[1]
-  }
-  res = res[!is.na(res)]
-  return(mean(res))
-}
-eval_RMSE = function(A_real, A_pred) {
-  return(sqrt(mean((A_real - A_pred)^2)))
-}
-eval_MAE = function (A_real, A_pred){
-  return(mean(abs(A_real - A_pred)))
-}
-
-
-
-
-df_metric = do.call(
-  what = rbind,
-  args = lapply(seq_along(datasets), function(data) {
-    
-    stopifnot(identical(rownames(prediction_corrected[[data]]), rownames(ground_truth_corrected[[data]])))
-    prediction_corrected[[data]][rownames(ground_truth_corrected[[data]]),]
-
-    data.frame(
-      "RMSE" = sapply(seq(ncol(ground_truth_corrected[[data]])), function(sample) {
-        eval_RMSE(ground_truth_corrected[[data]][,sample], prediction_corrected[[data]][,sample])
-      }),
-      "MAE" = sapply(seq(ncol(ground_truth_corrected[[data]])), function(sample) {
-        eval_MAE(ground_truth_corrected[[data]][,sample], prediction_corrected[[data]][,sample])
-      }),
-      "Aitchison" = sapply(seq(ncol(ground_truth_corrected[[data]])), function(sample) {
-        eval_Aitchison_1col(ground_truth_corrected[[data]][,sample], prediction_corrected[[data]][,sample])
-      }),
-      "Pearson" = sapply(seq(ncol(ground_truth_corrected[[data]])), function(sample) {
-        correlationP_1col(ground_truth_corrected[[data]][,sample], prediction_corrected[[data]][,sample])
-      }),
-      "Spearman" = sapply(seq(ncol(ground_truth_corrected[[data]])), function(sample) {
-        correlationS_1col(ground_truth_corrected[[data]][,sample], prediction_corrected[[data]][,sample])
-      }),
-      "Dataset" = datasets[data]
-    )
-  })
-)
-  
-df_metric = tidyr::pivot_longer(
-  data = df_metric,
-  cols = c("RMSE", "MAE", "Aitchison", "Pearson", "Spearman"),
-  names_to = "metric_name",
-  values_to = "metric_score"
-) 
-df_metric$Dataset = factor(df_metric$Dataset, levels = datasets)
-df_metric$metric_name = factor(df_metric$metric_name, levels = c("RMSE", "MAE", "Aitchison", "Pearson", "Spearman"))
-df_metric[(df_metric$Dataset == "VIVO") & (df_metric$metric_name %in% c("RMSE", "MAE", "Aitchison")), "metric_score"] = NA_real_
-
-# boxplots_metric = lapply(c("RMSE","MAE","Aitchison","Pearson","Spearman"), function(metric)
-#   ggplot(df_metric, aes(x=Dataset, y=get(metric), color=Dataset)) +
-#     geom_boxplot() +
-#     geom_point() +
-#     xlab("") +
-#     ylab(metric) )
-#     #see::theme_modern(axis.text.angle = 40, axis.text.size = 5))
-# #ggpubr::ggarrange(plotlist = boxplots_metric, common.legend = T)
-# boxplots_metric[[1]]
-# boxplots_metric[[2]]
-
-boxplots_metric = ggplot(df_metric, aes(x = Dataset, y = metric_score, fill = Dataset)) +
-  facet_wrap( ~ metric_name, scales = "free") +
-  geom_boxplot(position=position_dodge2(padding = 0.2)) +
-  labs(
-    title = paste("Samples' metrics results of your method on all datasets"),
-    x = "Datasets",
-    y = "Scores"
-  ) +
-  theme_light() +
-  theme(
-    plot.title = element_text(hjust = 0.5),
-    legend.position = "bottom",
-    axis.text.x = element_blank() # element_text(angle = 45, hjust = 1)
-  ) +
-  scale_fill_manual(
-    values = color_pallet
-  )
-plotly::layout(
-  plotly::ggplotly(boxplots_metric), 
-  legend = list(
-    x = 0,
-    y = -0.3,
-    xanchor = 'left',
-    yanchor = 'bottom',
-    orientation = 'h')
-)
-```
-
-
-Boxplot of 15 or 30 samples for five different metrics : between estimated proportions and ground truth proportions for each cell type
-
-- Root Mean Squared Error: for the sample $j$, $\text{RMSE}_j = \sqrt{\frac{\sum_{i=1}^{N} \big(\hat{\beta}_{i,j}-\beta_{i,j}\big)^2}{N}}$.
-- Mean Absolut Error: for the sample $j$, $\text{MAE}_j = \frac{\sum_{i=1}^{N} \big|\hat{\beta}_{i,j}-\beta_{i,j}\big|}{N}$.
-- Aitchison distance: this distance use a isomorphism to transform a vector of proportions, true and estimated vectors for each samples, from the unity simplex to the real space (see https://en.wikipedia.org/wiki/Compositional_data#Aitchison_geometry). Then, an Euclidean distance is calculated between the transformed estimated vector of proportions and the transformed ground truth vector of proportions.
-- Pearson correlation: for the sample $j$, the Pearson correlation is calculated between the estimated vector of proportions $\hat{\beta}_j$ and the ground truth vector of proportions $\beta_j$.
-- Spearman correlation: for the sample $j$, the Spearman correlation is calculated between the estimated vector of proportions $\hat{\beta}_j$ and the ground truth vector of proportions $\beta_j$.
-
-
-
-## Scatter plots between truth and prediction
-
-```{r scatter_plot, warning=FALSE}
-df_scatter = do.call(
-  what = rbind,
-  args = lapply(seq_along(datasets), function(data) { # we remove in vivo
-    
-    prediction_data = t(prediction_corrected[[data]])
-    prediction_data = tidyr::pivot_longer(
-      as.data.frame(prediction_data),
-      cols = colnames(prediction_data),
-      names_to = "cell_type",
-      values_to = "pred_prop"
-    )
-    
-    ground_truth_data = t(ground_truth_corrected[[data]])
-    ground_truth_data = tidyr::pivot_longer(
-      as.data.frame(ground_truth_data),
-      cols = colnames(ground_truth_data),
-      names_to = "cell_type",
-      values_to = "true_prop"
-    )
-    
-    prop_data = cbind(prediction_data, ground_truth_data[,"true_prop"], Dataset = datasets[data])
-  })
-)
-df_scatter$Dataset = factor(df_scatter$Dataset, levels = datasets)
-df_scatter$cell_type = factor(df_scatter$cell_type, levels = rownames(ground_truth[[1]]))
-
-
-scatter_prop = ggplot(df_scatter) +
-  facet_wrap( ~ Dataset) +
-  geom_abline(intercept = 0, slope = 1, linetype = 3, linewidth = 0.25) +
-  geom_point(aes(x = true_prop, y = pred_prop, col = cell_type)) +
-  coord_cartesian(
-    xlim = c(0,1),
-    ylim = c(0,1)
-  ) +
-  labs(
-    x = "Ground truth proportion",
-    y = "Prediction",
-    title = paste("Proportions predicted by your method vs true ones for all datasets")
-  ) +
-  scale_x_continuous(
-    labels = scales::percent
-  ) +
-  scale_y_continuous(
-    labels = scales::percent
-  ) +
-  scale_color_discrete(
-    name = "Cell type"
-  ) +
-  theme_light() +
-  theme(plot.title = element_text(hjust = 0.5)) +
-  scale_fill_manual(
-    values = color_pallet
-  )
-plotly::ggplotly(scatter_prop)
-```
-
-## Radar charts
-
-```{r scores_global_radarchart, fig.width=20, fig.height=11.25, warning=FALSE, message=FALSE, results='hide'}
-par(mfrow = c(3,3))
-for (data in seq_along(datasets)) {
-  ground_truth_data = ground_truth[[data]]
-  
-  fake_worst_pred = apply(ground_truth_data, 2, function(prop) { 
-    tmp = rep(1e-9, length(prop))
-    tmp[which.min(prop)] = 1
-    return(tmp)
-  })
-  rownames(fake_worst_pred) = rownames(ground_truth_data)
-  
-  worst_RMSE = eval_RMSE(ground_truth_data, fake_worst_pred)
-  worst_MAE = eval_MAE(ground_truth_data, fake_worst_pred)
-  worst_Aitchison = eval_Aitchison(ground_truth_data, fake_worst_pred)
-  
-  
-  score_data = score[[data]]
-  
-  # We don't take the global score for the radar chart since it's a proxy of the area
-  df_radarplot = as.data.frame(rbind(
-    # rmse,       mae,       aitchison,        pearson_tot, pearson_col, pearson_row, spearman_tot, spearman_col, spearman_row
-    c(0,          0,         0,                1,           1,           1,           1,            1,            1), # the max/best value for each var
-    c(worst_RMSE, worst_MAE, worst_Aitchison, -1,          -1,          -1,          -1,           -1,           -1), # the min/worst value for each var
-    score_data[-1]
-  ))
-  
-  fmsb::radarchart(
-    df_radarplot, 
-    axistype=2, # add the max/best value of each axe (but I don't know how to add the min/worst)
-    
-    title = paste("Global metrics scores for your method\non dataset", datasets[data]),
-    #custom polygon
-    pcol=rgb(0.2,0.5,0.5,0.9), pfcol=rgb(0.2,0.5,0.5,0.5), plwd=4,
-   
-    #custom the grid
-    cglcol="grey", cglty=1, axislabcol="grey50", cglwd=0.8,
-   
-    #custom labels
-    vlcex=0.8
-  )
-  
-  # Comment HB: the visualization by a radar/spider plot is intrinsically linked to the aggregated score for defining the scope (min and max) for each variable.
-}
-par(mfrow = c(1,1))
-```
-
-# Session Information
-
-```{r, results="verbatim"}
-sessionInfo()
-```
diff --git a/phase-1_2_3/scoring_program/detailed_results.Rmd b/phase-1_2_3/scoring_program/detailed_results.Rmd
index d6509f5..d322354 100644
--- a/phase-1_2_3/scoring_program/detailed_results.Rmd
+++ b/phase-1_2_3/scoring_program/detailed_results.Rmd
@@ -109,7 +109,6 @@ max_time <- 86400
 
 
 if (!  (    any(names(dic_longnames2short) %in% names(time))   )) {
-  # Initialize `score` if it doesn't already exist
   time <- list()
   # Loop through each dataset and set the corresponding score
   for (name in datasets) {
@@ -117,14 +116,13 @@ if (!  (    any(names(dic_longnames2short) %in% names(time))   )) {
   }
 }else{
   stopifnot(identical(names(prediction), names(time)))
+  for (name in names(prediction)) {
+    names(time)[which(name == names(prediction))] = dic_longnames2short[name]
+  }
 }
 
-
-
 for (name in names(prediction)) {
-  names(time)[which(name == names(prediction))] = dic_longnames2short[name]
   names(prediction)[which(name == names(prediction))] = dic_longnames2short[name]
-  
 }