From 37c6d6978bbeb1c1285dcd226664f91a802440e2 Mon Sep 17 00:00:00 2001
From: Antonio Gonzalez <antgonza@gmail.com>
Date: Wed, 1 Nov 2023 13:14:21 -0600
Subject: [PATCH 1/2] updates-after-2023.10-release

---
 notebooks/resource-allocation/102023.1.ipynb  | 499 ++++++++++++
 notebooks/resource-allocation/112023.ipynb    | 744 ++++++++++++++++++
 .../generate-allocation-summary.py            |   4 -
 qiita_db/processing_job.py                    |  10 +-
 4 files changed, 1249 insertions(+), 8 deletions(-)
 create mode 100644 notebooks/resource-allocation/102023.1.ipynb
 create mode 100644 notebooks/resource-allocation/112023.ipynb
diff --git a/notebooks/resource-allocation/102023.1.ipynb b/notebooks/resource-allocation/102023.1.ipynb
new file mode 100644
index 000000000..fce3cfd46
--- /dev/null
+++ b/notebooks/resource-allocation/102023.1.ipynb
@@ -0,0 +1,499 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "metadata": {
+    "scrolled": false
+   },
+   "outputs": [],
+   "source": [
+    "import pandas as pd\n",
+    "import seaborn as sns\n",
+    "from datetime import datetime, timedelta, date\n",
+    "from humanize import naturalsize\n",
+    "import matplotlib.pyplot as plt\n",
+    "import numpy as np\n",
+    "\n",
+    "%matplotlib inline"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Qiita's resource allocation - quick update from previous version\n",
+    "\n",
+    "After the 2023.10 release we noticed that:\n",
+    "1. The `job-output-folder` `VALIDATE` command didn't have a valid request because those jobs do not have sample/column values\n",
+    "2. The default unit for time in a resource allocation is minutes, but the calculations were done in seconds"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Loading data\n",
+    "\n",
+    "First you will need to run `generate-allocation-summary.py` in Qiita as the qiita user (or whichever user runs Qiita on your system). The resulting file will be named `jobs_[date].tsv.gz`.\n",
+    "\n",
+    "The generated file will have these columns: `['JobID', 'ElapsedRaw', 'MaxRSS', 'Submit', 'Start', 'MaxRSS.1', 'CPUTimeRAW', 'ReqMem', 'AllocCPUS', 'AveVMSize', 'QiitaID', 'external_id', 'sId', 'sName', 'sVersion', 'cId', 'cName', 'samples', 'columns', 'input_size', 'extra_info']`."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "metadata": {
+    "scrolled": true
+   },
+   "outputs": [],
+   "source": [
+    "m1g = 2**30\n",
+    "df = pd.read_csv('jobs_2023-10-04.tsv.gz', sep='\\t', dtype={'extra_info': str})\n",
+    "df['ElapsedRawTime'] = pd.to_timedelta(df.ElapsedRawTime)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "'There are 101147 successful jobs since we moved to barnacle2 and the largest external_id is: 1581986'"
+      ]
+     },
+     "execution_count": 3,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "# for reference for the next iteration of this notebook\n",
+    "f'There are {df.shape[0]} successful jobs since we moved to barnacle2 and the largest external_id is: {df.external_id.max()}'"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# 1. Getting the default values for `job-output-folder` `VALIDATE`"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead tr th {\n",
+       "        text-align: left;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead tr:last-of-type th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr>\n",
+       "      <th></th>\n",
+       "      <th></th>\n",
+       "      <th colspan=\"3\" halign=\"left\">ElapsedRawTime</th>\n",
+       "      <th colspan=\"3\" halign=\"left\">MaxRSSRaw</th>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th></th>\n",
+       "      <th></th>\n",
+       "      <th>count</th>\n",
+       "      <th>min</th>\n",
+       "      <th>max</th>\n",
+       "      <th>count</th>\n",
+       "      <th>min</th>\n",
+       "      <th>max</th>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>cName</th>\n",
+       "      <th>sName</th>\n",
+       "      <th></th>\n",
+       "      <th></th>\n",
+       "      <th></th>\n",
+       "      <th></th>\n",
+       "      <th></th>\n",
+       "      <th></th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th rowspan=\"12\" valign=\"top\">Validate</th>\n",
+       "      <th>BIOM type - BIOM</th>\n",
+       "      <td>687</td>\n",
+       "      <td>0 days 00:00:55</td>\n",
+       "      <td>0 days 01:03:49</td>\n",
+       "      <td>687</td>\n",
+       "      <td>222.8 MB</td>\n",
+       "      <td>82.0 GB</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>Diversity types - FeatureData</th>\n",
+       "      <td>6</td>\n",
+       "      <td>0 days 00:01:20</td>\n",
+       "      <td>0 days 00:02:49</td>\n",
+       "      <td>6</td>\n",
+       "      <td>331.4 MB</td>\n",
+       "      <td>384.3 MB</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>Diversity types - alpha_vector</th>\n",
+       "      <td>123</td>\n",
+       "      <td>0 days 00:01:12</td>\n",
+       "      <td>3 days 04:36:54</td>\n",
+       "      <td>123</td>\n",
+       "      <td>289.3 MB</td>\n",
+       "      <td>101.5 GB</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>Diversity types - distance_matrix</th>\n",
+       "      <td>117</td>\n",
+       "      <td>0 days 00:00:37</td>\n",
+       "      <td>0 days 00:03:55</td>\n",
+       "      <td>117</td>\n",
+       "      <td>122.7 MB</td>\n",
+       "      <td>12.5 GB</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>Diversity types - ordination_results</th>\n",
+       "      <td>107</td>\n",
+       "      <td>0 days 00:00:39</td>\n",
+       "      <td>0 days 00:03:19</td>\n",
+       "      <td>107</td>\n",
+       "      <td>117.2 MB</td>\n",
+       "      <td>2.9 GB</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>Sequencing Data Type - Demultiplexed</th>\n",
+       "      <td>43</td>\n",
+       "      <td>0 days 00:00:35</td>\n",
+       "      <td>0 days 00:12:23</td>\n",
+       "      <td>43</td>\n",
+       "      <td>83.4 MB</td>\n",
+       "      <td>517.4 MB</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>Sequencing Data Type - FASTA</th>\n",
+       "      <td>2</td>\n",
+       "      <td>0 days 00:00:56</td>\n",
+       "      <td>0 days 00:02:23</td>\n",
+       "      <td>2</td>\n",
+       "      <td>79.8 MB</td>\n",
+       "      <td>83.6 MB</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>Sequencing Data Type - FASTQ</th>\n",
+       "      <td>32</td>\n",
+       "      <td>0 days 00:00:41</td>\n",
+       "      <td>0 days 01:50:44</td>\n",
+       "      <td>32</td>\n",
+       "      <td>78.7 MB</td>\n",
+       "      <td>84.4 MB</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>Sequencing Data Type - SFF</th>\n",
+       "      <td>1</td>\n",
+       "      <td>0 days 00:01:09</td>\n",
+       "      <td>0 days 00:01:09</td>\n",
+       "      <td>1</td>\n",
+       "      <td>79.6 MB</td>\n",
+       "      <td>79.6 MB</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>Sequencing Data Type - per_sample_FASTQ</th>\n",
+       "      <td>73</td>\n",
+       "      <td>0 days 00:00:36</td>\n",
+       "      <td>0 days 18:13:21</td>\n",
+       "      <td>73</td>\n",
+       "      <td>77.6 MB</td>\n",
+       "      <td>83.6 MB</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>Visualization types - q2_visualization</th>\n",
+       "      <td>133</td>\n",
+       "      <td>0 days 00:00:36</td>\n",
+       "      <td>0 days 00:24:56</td>\n",
+       "      <td>133</td>\n",
+       "      <td>51.5 MB</td>\n",
+       "      <td>67.5 MB</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>qtp-job-output-folder - job-output-folder</th>\n",
+       "      <td>228</td>\n",
+       "      <td>0 days 00:00:31</td>\n",
+       "      <td>0 days 00:04:06</td>\n",
+       "      <td>228</td>\n",
+       "      <td>18.3 MB</td>\n",
+       "      <td>46.7 MB</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "                                                   ElapsedRawTime  \\\n",
+       "                                                            count   \n",
+       "cName    sName                                                      \n",
+       "Validate BIOM type - BIOM                                     687   \n",
+       "         Diversity types - FeatureData                          6   \n",
+       "         Diversity types - alpha_vector                       123   \n",
+       "         Diversity types - distance_matrix                    117   \n",
+       "         Diversity types - ordination_results                 107   \n",
+       "         Sequencing Data Type - Demultiplexed                  43   \n",
+       "         Sequencing Data Type - FASTA                           2   \n",
+       "         Sequencing Data Type - FASTQ                          32   \n",
+       "         Sequencing Data Type - SFF                             1   \n",
+       "         Sequencing Data Type - per_sample_FASTQ               73   \n",
+       "         Visualization types - q2_visualization               133   \n",
+       "         qtp-job-output-folder - job-output-folder            228   \n",
+       "\n",
+       "                                                                    \\\n",
+       "                                                               min   \n",
+       "cName    sName                                                       \n",
+       "Validate BIOM type - BIOM                          0 days 00:00:55   \n",
+       "         Diversity types - FeatureData             0 days 00:01:20   \n",
+       "         Diversity types - alpha_vector            0 days 00:01:12   \n",
+       "         Diversity types - distance_matrix         0 days 00:00:37   \n",
+       "         Diversity types - ordination_results      0 days 00:00:39   \n",
+       "         Sequencing Data Type - Demultiplexed      0 days 00:00:35   \n",
+       "         Sequencing Data Type - FASTA              0 days 00:00:56   \n",
+       "         Sequencing Data Type - FASTQ              0 days 00:00:41   \n",
+       "         Sequencing Data Type - SFF                0 days 00:01:09   \n",
+       "         Sequencing Data Type - per_sample_FASTQ   0 days 00:00:36   \n",
+       "         Visualization types - q2_visualization    0 days 00:00:36   \n",
+       "         qtp-job-output-folder - job-output-folder 0 days 00:00:31   \n",
+       "\n",
+       "                                                                   MaxRSSRaw  \\\n",
+       "                                                               max     count   \n",
+       "cName    sName                                                                 \n",
+       "Validate BIOM type - BIOM                          0 days 01:03:49       687   \n",
+       "         Diversity types - FeatureData             0 days 00:02:49         6   \n",
+       "         Diversity types - alpha_vector            3 days 04:36:54       123   \n",
+       "         Diversity types - distance_matrix         0 days 00:03:55       117   \n",
+       "         Diversity types - ordination_results      0 days 00:03:19       107   \n",
+       "         Sequencing Data Type - Demultiplexed      0 days 00:12:23        43   \n",
+       "         Sequencing Data Type - FASTA              0 days 00:02:23         2   \n",
+       "         Sequencing Data Type - FASTQ              0 days 01:50:44        32   \n",
+       "         Sequencing Data Type - SFF                0 days 00:01:09         1   \n",
+       "         Sequencing Data Type - per_sample_FASTQ   0 days 18:13:21        73   \n",
+       "         Visualization types - q2_visualization    0 days 00:24:56       133   \n",
+       "         qtp-job-output-folder - job-output-folder 0 days 00:04:06       228   \n",
+       "\n",
+       "                                                                        \n",
+       "                                                         min       max  \n",
+       "cName    sName                                                          \n",
+       "Validate BIOM type - BIOM                           222.8 MB   82.0 GB  \n",
+       "         Diversity types - FeatureData              331.4 MB  384.3 MB  \n",
+       "         Diversity types - alpha_vector             289.3 MB  101.5 GB  \n",
+       "         Diversity types - distance_matrix          122.7 MB   12.5 GB  \n",
+       "         Diversity types - ordination_results       117.2 MB    2.9 GB  \n",
+       "         Sequencing Data Type - Demultiplexed        83.4 MB  517.4 MB  \n",
+       "         Sequencing Data Type - FASTA                79.8 MB   83.6 MB  \n",
+       "         Sequencing Data Type - FASTQ                78.7 MB   84.4 MB  \n",
+       "         Sequencing Data Type - SFF                  79.6 MB   79.6 MB  \n",
+       "         Sequencing Data Type - per_sample_FASTQ     77.6 MB   83.6 MB  \n",
+       "         Visualization types - q2_visualization      51.5 MB   67.5 MB  \n",
+       "         qtp-job-output-folder - job-output-folder   18.3 MB   46.7 MB  "
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    }
+   ],
+   "source": [
+    "cname = 'Validate'\n",
+    "_df = df[(df.cName == cname)].copy()\n",
+    "\n",
+    "summary = _df[_df['samples'].isnull() & _df['columns'].isnull()].groupby(\n",
+    "    ['cName', 'sName'])[['ElapsedRawTime', 'MaxRSSRaw']].agg(['count', 'min', 'max']).copy()\n",
+    "\n",
+    "summary[('MaxRSSRaw', 'min')] = summary[('MaxRSSRaw', 'min')].apply(naturalsize)\n",
+    "summary[('MaxRSSRaw', 'max')] = summary[('MaxRSSRaw', 'max')].apply(naturalsize)\n",
+    "\n",
+    "display(summary)\n",
+    "\n",
+    "# New allocation: -p qiita -N 1 -n 1 --mem 100mb --time 00:40:00"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# 2. Updates for the seconds-to-minutes confusion"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "\n",
+    "=============\n",
+    "=============\n",
+    "     allocation = '-p qiita -N 1 -n 1 '\n",
+    "            || '--mem (2**30)+({samples}*150000) '\n",
+    "            || '--time 240'\n",
+    "\n",
+    "UPDATE qiita.processing_job_resource_allocation set\n",
+    "    allocation = '-p qiita -N 1 -n 1 '\n",
+    "           || '--mem (2**30)+({samples}*150000) '\n",
+    "           || '--time 4'\n",
+    "    WHERE job_type = 'RESOURCE_PARAMS_COMMAND' and\n",
+    "    name = 'delete_sample_or_column';\n",
+    "\n",
+    "=============\n",
+    "=============\n",
+    "     allocation = '-p qiita -N 1 -n 1 '\n",
+    "            || '--mem {samples}*10000000'\n",
+    "            || '--time 61200'\n",
+    "\n",
+    "UPDATE qiita.processing_job_resource_allocation set\n",
+    "    allocation = '-p qiita -N 1 -n 1 '\n",
+    "           || '--mem {samples}*10000000 '\n",
+    "           || '--time 1020'\n",
+    "    WHERE job_type = 'RESOURCE_PARAMS_COMMAND' and\n",
+    "    name = 'Sequence Processing Pipeline';\n",
+    "\n",
+    "=============\n",
+    "=============\n",
+    "     allocation = '-p qiita -N 1 -n 1 --mem 4g --time 900'\n",
+    "\n",
+    "UPDATE qiita.processing_job_resource_allocation set\n",
+    "    allocation = '-p qiita -N 1 -n 1 --mem 4g --time 15'\n",
+    "    WHERE job_type = 'RESOURCE_PARAMS_COMMAND' and\n",
+    "    name = 'Filter samples from table [filter_samples]';\n",
+    "\n",
+    "=============\n",
+    "=============\n",
+    "     allocation = '-p qiita -N 1 -n 1 '\n",
+    "            || '--mem (2**31)+({input_size}*6) if\n",
+    "(2**31)+({input_size}*6) < 13958643712 else 13958643712 '\n",
+    "            || '--time 2400'\n",
+    "\n",
+    "UPDATE qiita.processing_job_resource_allocation set\n",
+    "    allocation = '-p qiita -N 1 -n 1 '\n",
+    "           || '--mem (2**31)+({input_size}*6) if\n",
+    "(2**31)+({input_size}*6) < 13958643712 else 13958643712 '\n",
+    "           || '--time 40'\n",
+    "    WHERE job_type = 'RESOURCE_PARAMS_COMMAND' and\n",
+    "    name = 'Rarefy table [rarefy]';\n",
+    "\n",
+    "=============\n",
+    "=============\n",
+    "     allocation = '-p qiita -N 1 -n 1 '\n",
+    "            || '--mem 14g'\n",
+    "            || '--time 360'\n",
+    "\n",
+    "UPDATE qiita.processing_job_resource_allocation set\n",
+    "    allocation = '-p qiita -N 1 -n 1 '\n",
+    "           || '--mem 14g '\n",
+    "           || '--time 6'\n",
+    "    WHERE job_type = 'RESOURCE_PARAMS_COMMAND' and\n",
+    "    name = 'Alpha diversity (phylogenetic) [alpha_phylogenetic]';\n",
+    "\n",
+    "\n",
+    "=============\n",
+    "=============\n",
+    "     allocation = '-p qiita -N 1 -n 1 '\n",
+    "            || '--mem (2**33)+(2**30)+(({samples}*{columns}*{input_size})/4500000)'\n",
+    "            || '--time 1800'\n",
+    "\n",
+    "UPDATE qiita.processing_job_resource_allocation set\n",
+    "    allocation = '-p qiita -N 1 -n 1 '\n",
+    "           || '--mem\n",
+    "(2**33)+(2**30)+(({samples}*{columns}*{input_size})/4500000) '\n",
+    "           || '--time 30'\n",
+    "    WHERE job_type = 'RESOURCE_PARAMS_COMMAND' and\n",
+    "    name = 'Visualize and Interact with Principal Coordinates Analysis\n",
+    "Plots [plot]';\n",
+    "\n",
+    "=============\n",
+    "=============\n",
+    "     allocation = '-p qiita -N 1 -n 1 '\n",
+    "            || '--mem (2**32)+(({samples}*{columns}*{input_size})/20000)'\n",
+    "            || '--time 90000'\n",
+    "\n",
+    "UPDATE qiita.processing_job_resource_allocation set\n",
+    "    allocation = '-p qiita -N 1 -n 1 '\n",
+    "           || '--mem (2**32)+(({samples}*{columns}*{input_size})/20000) '\n",
+    "           || '--time 1500'\n",
+    "    WHERE job_type = 'RESOURCE_PARAMS_COMMAND' and\n",
+    "    name = 'Alpha rarefaction curves [alpha_rarefaction]';\n",
+    "\n",
+    "=============\n",
+    "=============\n",
+    "     allocation = '-p qiita -N 1 -n 1 '\n",
+    "            || '--mem 2*(2**30)+{input_size} if 2*(2**30)+{input_size} < 16*(2**30) else 16*(2**30)'\n",
+    "            || '--time 36000'\n",
+    "\n",
+    "UPDATE qiita.processing_job_resource_allocation set\n",
+    "    allocation = '-p qiita -N 1 -n 1 '\n",
+    "           || '--mem 2*(2**30)+{input_size} if 2*(2**30)+{input_size}\n",
+    "< 16*(2**30) else 16*(2**30) '\n",
+    "           || '--time 600'\n",
+    "    WHERE job_type = 'RESOURCE_PARAMS_COMMAND' and\n",
+    "    name = 'Trimming';\n",
+    "\n",
+    "=============\n",
+    "=============\n",
+    "     allocation = '-p qiita -N 1 -n 1 '\n",
+    "            || '--mem (2**30)+({samples}*{columns}*2000)'\n",
+    "            || '--time 2300'\n",
+    "\n",
+    "UPDATE qiita.processing_job_resource_allocation set\n",
+    "    allocation = '-p qiita -N 1 -n 1 '\n",
+    "           || '--mem (2**30)+({samples}*{columns}*2000) '\n",
+    "           || '--time 39'\n",
+    "    WHERE job_type = 'RESOURCE_PARAMS_COMMAND' and\n",
+    "    name = 'update_sample_template';"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3 (ipykernel)",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.9.7"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 4
+}
diff --git a/notebooks/resource-allocation/112023.ipynb b/notebooks/resource-allocation/112023.ipynb
new file mode 100644
index 000000000..08b2d9a25
--- /dev/null
+++ b/notebooks/resource-allocation/112023.ipynb
@@ -0,0 +1,744 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "metadata": {
+    "scrolled": false
+   },
+   "outputs": [],
+   "source": [
+    "import pandas as pd\n",
+    "import seaborn as sns\n",
+    "from datetime import datetime, timedelta, date\n",
+    "from humanize import naturalsize\n",
+    "import matplotlib.pyplot as plt\n",
+    "import numpy as np\n",
+    "\n",
+    "%matplotlib inline"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Qiita's resource allocation\n",
+    "\n",
+    "This notebook walks through how to load & parse the job stats from Qiita. It additionally tries to split the different commands by their resource utilization so that the resources requested for each of them are as accurate/fair as possible. Here the resource allocations are mainly walltime (`ElapsedRawTime`), memory (`MaxRSSRaw`) and the time a job took to start running (`WaitTime`: Start - Submit)."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Loading data\n",
+    "\n",
+    "First you will need to run `generate-allocation-summary.py` in Qiita as the qiita user (or whichever user runs Qiita on your system). The resulting file will be named `jobs_[date].tsv.gz`.\n",
+    "\n",
+    "The generated file will have these columns: `['JobID', 'ElapsedRaw', 'MaxRSS', 'Submit', 'Start', 'MaxRSS.1', 'CPUTimeRAW', 'ReqMem', 'AllocCPUS', 'AveVMSize', 'QiitaID', 'external_id', 'sId', 'sName', 'sVersion', 'cId', 'cName', 'samples', 'columns', 'input_size', 'extra_info']`."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "metadata": {
+    "scrolled": true
+   },
+   "outputs": [],
+   "source": [
+    "m1g = 2**30\n",
+    "df = pd.read_csv('jobs_2023-10-31.tsv.gz', sep='\\t', dtype={'extra_info': str})\n",
+    "df['ElapsedRawTime'] = pd.to_timedelta(df.ElapsedRawTime)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "'There are 106548 successful jobs since we moved to barnacle2 and the largest external_id is: 1614116'"
+      ]
+     },
+     "execution_count": 3,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "# for reference for the next iteration of this notebook\n",
+    "f'There are {df.shape[0]} successful jobs since we moved to barnacle2 and the largest external_id is: {df.external_id.max()}'"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Deciding what to optimize and what to leave with a default value\n",
+    "\n",
+    "In the previous versions (072023, 102023, 102023.1) we decided to only optimize commands that use more than 4gb or 4hrs; now we want to review the commands that are below 4gb and 4hrs so we can add specific parameters for them."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "qiita: 10\n"
+     ]
+    },
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead tr th {\n",
+       "        text-align: left;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead tr:last-of-type th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr>\n",
+       "      <th></th>\n",
+       "      <th></th>\n",
+       "      <th colspan=\"3\" halign=\"left\">ElapsedRawTime</th>\n",
+       "      <th colspan=\"2\" halign=\"left\">MaxRSSRaw</th>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th></th>\n",
+       "      <th></th>\n",
+       "      <th>count</th>\n",
+       "      <th>min</th>\n",
+       "      <th>max</th>\n",
+       "      <th>min</th>\n",
+       "      <th>max</th>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>cName</th>\n",
+       "      <th>sName</th>\n",
+       "      <th></th>\n",
+       "      <th></th>\n",
+       "      <th></th>\n",
+       "      <th></th>\n",
+       "      <th></th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>delete_artifact</th>\n",
+       "      <th>Qiita</th>\n",
+       "      <td>1534</td>\n",
+       "      <td>0 days 00:00:03</td>\n",
+       "      <td>0 days 02:48:08</td>\n",
+       "      <td>0 Bytes</td>\n",
+       "      <td>122.2 MB</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>create_sample_template</th>\n",
+       "      <th>Qiita</th>\n",
+       "      <td>569</td>\n",
+       "      <td>0 days 00:00:03</td>\n",
+       "      <td>0 days 00:11:22</td>\n",
+       "      <td>0 Bytes</td>\n",
+       "      <td>415.8 MB</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>delete_analysis</th>\n",
+       "      <th>Qiita</th>\n",
+       "      <td>320</td>\n",
+       "      <td>0 days 00:00:03</td>\n",
+       "      <td>0 days 00:06:13</td>\n",
+       "      <td>0 Bytes</td>\n",
+       "      <td>120.8 MB</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>download_remote_files</th>\n",
+       "      <th>Qiita</th>\n",
+       "      <td>194</td>\n",
+       "      <td>0 days 00:00:07</td>\n",
+       "      <td>0 days 03:29:36</td>\n",
+       "      <td>0 Bytes</td>\n",
+       "      <td>128.9 MB</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>delete_sample_template</th>\n",
+       "      <th>Qiita</th>\n",
+       "      <td>181</td>\n",
+       "      <td>0 days 00:00:04</td>\n",
+       "      <td>0 days 00:19:31</td>\n",
+       "      <td>0 Bytes</td>\n",
+       "      <td>120.6 MB</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>delete_study</th>\n",
+       "      <th>Qiita</th>\n",
+       "      <td>136</td>\n",
+       "      <td>0 days 00:00:03</td>\n",
+       "      <td>0 days 00:16:09</td>\n",
+       "      <td>0 Bytes</td>\n",
+       "      <td>125.5 MB</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>update_prep_template</th>\n",
+       "      <th>Qiita</th>\n",
+       "      <td>126</td>\n",
+       "      <td>0 days 00:00:03</td>\n",
+       "      <td>0 days 00:02:25</td>\n",
+       "      <td>0 Bytes</td>\n",
+       "      <td>125.3 MB</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>copy_artifact</th>\n",
+       "      <th>Qiita</th>\n",
+       "      <td>101</td>\n",
+       "      <td>0 days 00:00:06</td>\n",
+       "      <td>0 days 00:33:16</td>\n",
+       "      <td>0 Bytes</td>\n",
+       "      <td>124.1 MB</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>Generate HTML summary</th>\n",
+       "      <th>Sequencing Data Type</th>\n",
+       "      <td>78</td>\n",
+       "      <td>0 days 00:00:35</td>\n",
+       "      <td>0 days 02:18:54</td>\n",
+       "      <td>56.6 MB</td>\n",
+       "      <td>85.7 MB</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>list_remote_files</th>\n",
+       "      <th>Qiita</th>\n",
+       "      <td>47</td>\n",
+       "      <td>0 days 00:00:05</td>\n",
+       "      <td>0 days 00:02:21</td>\n",
+       "      <td>0 Bytes</td>\n",
+       "      <td>121.7 MB</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "                                            ElapsedRawTime                  \\\n",
+       "                                                     count             min   \n",
+       "cName                  sName                                                 \n",
+       "delete_artifact        Qiita                          1534 0 days 00:00:03   \n",
+       "create_sample_template Qiita                           569 0 days 00:00:03   \n",
+       "delete_analysis        Qiita                           320 0 days 00:00:03   \n",
+       "download_remote_files  Qiita                           194 0 days 00:00:07   \n",
+       "delete_sample_template Qiita                           181 0 days 00:00:04   \n",
+       "delete_study           Qiita                           136 0 days 00:00:03   \n",
+       "update_prep_template   Qiita                           126 0 days 00:00:03   \n",
+       "copy_artifact          Qiita                           101 0 days 00:00:06   \n",
+       "Generate HTML summary  Sequencing Data Type             78 0 days 00:00:35   \n",
+       "list_remote_files      Qiita                            47 0 days 00:00:05   \n",
+       "\n",
+       "                                                            MaxRSSRaw  \\\n",
+       "                                                        max       min   \n",
+       "cName                  sName                                            \n",
+       "delete_artifact        Qiita                0 days 02:48:08   0 Bytes   \n",
+       "create_sample_template Qiita                0 days 00:11:22   0 Bytes   \n",
+       "delete_analysis        Qiita                0 days 00:06:13   0 Bytes   \n",
+       "download_remote_files  Qiita                0 days 03:29:36   0 Bytes   \n",
+       "delete_sample_template Qiita                0 days 00:19:31   0 Bytes   \n",
+       "delete_study           Qiita                0 days 00:16:09   0 Bytes   \n",
+       "update_prep_template   Qiita                0 days 00:02:25   0 Bytes   \n",
+       "copy_artifact          Qiita                0 days 00:33:16   0 Bytes   \n",
+       "Generate HTML summary  Sequencing Data Type 0 days 02:18:54   56.6 MB   \n",
+       "list_remote_files      Qiita                0 days 00:02:21   0 Bytes   \n",
+       "\n",
+       "                                                       \n",
+       "                                                  max  \n",
+       "cName                  sName                           \n",
+       "delete_artifact        Qiita                 122.2 MB  \n",
+       "create_sample_template Qiita                 415.8 MB  \n",
+       "delete_analysis        Qiita                 120.8 MB  \n",
+       "download_remote_files  Qiita                 128.9 MB  \n",
+       "delete_sample_template Qiita                 120.6 MB  \n",
+       "delete_study           Qiita                 125.5 MB  \n",
+       "update_prep_template   Qiita                 125.3 MB  \n",
+       "copy_artifact          Qiita                 124.1 MB  \n",
+       "Generate HTML summary  Sequencing Data Type   85.7 MB  \n",
+       "list_remote_files      Qiita                 121.7 MB  "
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "qiime2: 8\n"
+     ]
+    },
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead tr th {\n",
+       "        text-align: left;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead tr:last-of-type th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr>\n",
+       "      <th></th>\n",
+       "      <th></th>\n",
+       "      <th colspan=\"3\" halign=\"left\">ElapsedRawTime</th>\n",
+       "      <th colspan=\"2\" halign=\"left\">MaxRSSRaw</th>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th></th>\n",
+       "      <th></th>\n",
+       "      <th>count</th>\n",
+       "      <th>min</th>\n",
+       "      <th>max</th>\n",
+       "      <th>min</th>\n",
+       "      <th>max</th>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>cName</th>\n",
+       "      <th>sName</th>\n",
+       "      <th></th>\n",
+       "      <th></th>\n",
+       "      <th></th>\n",
+       "      <th></th>\n",
+       "      <th></th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>adonis PERMANOVA test for beta group significance [adonis]</th>\n",
+       "      <th>qiime2</th>\n",
+       "      <td>552</td>\n",
+       "      <td>0 days 00:00:57</td>\n",
+       "      <td>0 days 00:39:12</td>\n",
+       "      <td>147.4 MB</td>\n",
+       "      <td>3.5 GB</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>Core diversity metrics (non-phylogenetic) [core_metrics]</th>\n",
+       "      <th>qiime2</th>\n",
+       "      <td>100</td>\n",
+       "      <td>0 days 00:02:17</td>\n",
+       "      <td>0 days 00:25:31</td>\n",
+       "      <td>213.5 MB</td>\n",
+       "      <td>4.3 GB</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>Taxonomy-based feature table filter. [filter_table]</th>\n",
+       "      <th>qiime2</th>\n",
+       "      <td>74</td>\n",
+       "      <td>0 days 00:00:52</td>\n",
+       "      <td>0 days 00:19:37</td>\n",
+       "      <td>214.9 MB</td>\n",
+       "      <td>2.6 GB</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>Summarize table [summarize]</th>\n",
+       "      <th>qiime2</th>\n",
+       "      <td>64</td>\n",
+       "      <td>0 days 00:00:56</td>\n",
+       "      <td>0 days 00:05:54</td>\n",
+       "      <td>229.8 MB</td>\n",
+       "      <td>3.0 GB</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>Add pseudocount to table. [add_pseudocount]</th>\n",
+       "      <th>qiime2</th>\n",
+       "      <td>55</td>\n",
+       "      <td>0 days 00:01:04</td>\n",
+       "      <td>0 days 00:06:14</td>\n",
+       "      <td>242.5 MB</td>\n",
+       "      <td>2.9 GB</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>Filter features from a table based on abundance and prevalence [filter_features_conditionally]</th>\n",
+       "      <th>qiime2</th>\n",
+       "      <td>53</td>\n",
+       "      <td>0 days 00:00:53</td>\n",
+       "      <td>0 days 00:02:33</td>\n",
+       "      <td>212.4 MB</td>\n",
+       "      <td>553.3 MB</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>Identify core features in table [core_features]</th>\n",
+       "      <th>qiime2</th>\n",
+       "      <td>49</td>\n",
+       "      <td>0 days 00:01:03</td>\n",
+       "      <td>0 days 00:59:29</td>\n",
+       "      <td>212.9 MB</td>\n",
+       "      <td>2.6 GB</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>Filter features from table [filter_features]</th>\n",
+       "      <th>qiime2</th>\n",
+       "      <td>48</td>\n",
+       "      <td>0 days 00:00:47</td>\n",
+       "      <td>0 days 00:03:34</td>\n",
+       "      <td>208.3 MB</td>\n",
+       "      <td>398.4 MB</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "                                                          ElapsedRawTime  \\\n",
+       "                                                                   count   \n",
+       "cName                                              sName                   \n",
+       "adonis PERMANOVA test for beta group significan... qiime2            552   \n",
+       "Core diversity metrics (non-phylogenetic) [core... qiime2            100   \n",
+       "Taxonomy-based feature table filter. [filter_ta... qiime2             74   \n",
+       "Summarize table [summarize]                        qiime2             64   \n",
+       "Add pseudocount to table. [add_pseudocount]        qiime2             55   \n",
+       "Filter features from a table based on abundance... qiime2             53   \n",
+       "Identify core features in table [core_features]    qiime2             49   \n",
+       "Filter features from table [filter_features]       qiime2             48   \n",
+       "\n",
+       "                                                                           \\\n",
+       "                                                                      min   \n",
+       "cName                                              sName                    \n",
+       "adonis PERMANOVA test for beta group significan... qiime2 0 days 00:00:57   \n",
+       "Core diversity metrics (non-phylogenetic) [core... qiime2 0 days 00:02:17   \n",
+       "Taxonomy-based feature table filter. [filter_ta... qiime2 0 days 00:00:52   \n",
+       "Summarize table [summarize]                        qiime2 0 days 00:00:56   \n",
+       "Add pseudocount to table. [add_pseudocount]        qiime2 0 days 00:01:04   \n",
+       "Filter features from a table based on abundance... qiime2 0 days 00:00:53   \n",
+       "Identify core features in table [core_features]    qiime2 0 days 00:01:03   \n",
+       "Filter features from table [filter_features]       qiime2 0 days 00:00:47   \n",
+       "\n",
+       "                                                                           \\\n",
+       "                                                                      max   \n",
+       "cName                                              sName                    \n",
+       "adonis PERMANOVA test for beta group significan... qiime2 0 days 00:39:12   \n",
+       "Core diversity metrics (non-phylogenetic) [core... qiime2 0 days 00:25:31   \n",
+       "Taxonomy-based feature table filter. [filter_ta... qiime2 0 days 00:19:37   \n",
+       "Summarize table [summarize]                        qiime2 0 days 00:05:54   \n",
+       "Add pseudocount to table. [add_pseudocount]        qiime2 0 days 00:06:14   \n",
+       "Filter features from a table based on abundance... qiime2 0 days 00:02:33   \n",
+       "Identify core features in table [core_features]    qiime2 0 days 00:59:29   \n",
+       "Filter features from table [filter_features]       qiime2 0 days 00:03:34   \n",
+       "\n",
+       "                                                          MaxRSSRaw            \n",
+       "                                                                min       max  \n",
+       "cName                                              sName                       \n",
+       "adonis PERMANOVA test for beta group significan... qiime2  147.4 MB    3.5 GB  \n",
+       "Core diversity metrics (non-phylogenetic) [core... qiime2  213.5 MB    4.3 GB  \n",
+       "Taxonomy-based feature table filter. [filter_ta... qiime2  214.9 MB    2.6 GB  \n",
+       "Summarize table [summarize]                        qiime2  229.8 MB    3.0 GB  \n",
+       "Add pseudocount to table. [add_pseudocount]        qiime2  242.5 MB    2.9 GB  \n",
+       "Filter features from a table based on abundance... qiime2  212.4 MB  553.3 MB  \n",
+       "Identify core features in table [core_features]    qiime2  212.9 MB    2.6 GB  \n",
+       "Filter features from table [filter_features]       qiime2  208.3 MB  398.4 MB  "
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    }
+   ],
+   "source": [
+    "summary = df.groupby(['cName', 'sName'])[\n",
+    "        ['ElapsedRawTime', 'MaxRSSRaw']].agg(['count', 'min', 'max']).copy()\n",
+    "\n",
+    "# We are going to focus on jobs that use less than 4gb of memory and take less than 4 hrs.\n",
+    "summary = summary[(summary[('MaxRSSRaw', 'max')] < 4*m1g) & \n",
+    "                      (summary[('ElapsedRawTime', 'max')] < timedelta(hours=4))]\n",
+    "\n",
+    "summary.sort_values(('MaxRSSRaw', 'count'), inplace=True, ascending=False)\n",
+    "summary.drop(columns=[('MaxRSSRaw', 'count')], inplace=True)\n",
+    "\n",
+    "# ignore commands with 40 or fewer jobs to avoid overfitting early\n",
+    "summary = summary[summary[('ElapsedRawTime', 'count')] > 40]\n",
+    "\n",
+    "# ignore commands that were optimized on the previous notebooks - as part of larger sets\n",
+    "# summary = summary[]\n",
+    "summary = summary[summary.index.get_level_values('cName') != 'Validate']\n",
+    "\n",
+    "summary[('MaxRSSRaw', 'min')] = summary[('MaxRSSRaw', 'min')].apply(naturalsize)\n",
+    "summary[('MaxRSSRaw', 'max')] = summary[('MaxRSSRaw', 'max')].apply(naturalsize)\n",
+    "\n",
+    "_df = summary[summary.index.get_level_values('sName') != 'qiime2']\n",
+    "print (\"qiita:\", _df.shape[0])\n",
+    "display(_df)\n",
+    "\n",
+    "_df = summary[summary.index.get_level_values('sName') == 'qiime2']\n",
+    "print (\"qiime2:\", _df.shape[0])\n",
+    "display(_df)\n",
+    "\n",
+    "# *** RESOURCE ALLOCATION ***\n",
+    "\n",
+    "# Qiita jobs \n",
+    "# INSERT INTO qiita.processing_job_resource_allocation (name, job_type, allocation) \n",
+    "#     VALUES \n",
+    "#     ('delete_artifact', 'RESOURCE_PARAMS_COMMAND', '-p qiita -N 1 -n 1 --mem 200mb --time 3:00:00'),\n",
+    "#     ('create_sample_template', 'RESOURCE_PARAMS_COMMAND', '-p qiita -N 1 -n 1 --mem 600mb --time 00:20:00'),\n",
+    "#     ('delete_analysis', 'RESOURCE_PARAMS_COMMAND', '-p qiita -N 1 -n 1 --mem 200mb --time 00:10:00'),\n",
+    "#     ('download_remote_files', 'RESOURCE_PARAMS_COMMAND', '-p qiita -N 1 -n 1 --mem 200mb --time 4:00:00'),\n",
+    "#     ('delete_sample_template', 'RESOURCE_PARAMS_COMMAND', '-p qiita -N 1 -n 1 --mem 200mb --time 00:30:00'),\n",
+    "#     ('delete_study', 'RESOURCE_PARAMS_COMMAND', '-p qiita -N 1 -n 1 --mem 200mb --time 00:30:00'),\n",
+    "#     ('update_prep_template', 'RESOURCE_PARAMS_COMMAND', '-p qiita -N 1 -n 1 --mem 200mb --time 00:05:00'),\n",
+    "#     ('copy_artifact', 'RESOURCE_PARAMS_COMMAND', '-p qiita -N 1 -n 1 --mem 200mb --time 1:00:00'),\n",
+    "#     ('list_remote_files', 'RESOURCE_PARAMS_COMMAND', '-p qiita -N 1 -n 1 --mem 200mb --time 00:05:00');\n",
+    "\n",
+    "# Q2 jobs\n",
+    "# INSERT INTO qiita.processing_job_resource_allocation (name, job_type, allocation) \n",
+    "#     VALUES \n",
+    "#     ('adonis PERMANOVA test for beta group significance [adonis]', 'RESOURCE_PARAMS_COMMAND', \n",
+    "#      '-p qiita -N 1 -n 1 --mem 4gb --time 4:00:00'),\n",
+    "#     ('Core diversity metrics (non-phylogenetic) [core_metrics]', 'RESOURCE_PARAMS_COMMAND', \n",
+    "#      '-p qiita -N 1 -n 1 --mem 6gb --time 1:00:00'),\n",
+    "#     ('Taxonomy-based feature table filter. [filter_table]', 'RESOURCE_PARAMS_COMMAND', \n",
+    "#      '-p qiita -N 1 -n 1 --mem 4gb --time 00:20:00'),\n",
+    "#     ('Summarize table [summarize]', 'RESOURCE_PARAMS_COMMAND', \n",
+    "#      '-p qiita -N 1 -n 1 --mem 4gb --time 00:10:00'),\n",
+    "#     ('Add pseudocount to table. [add_pseudocount]', 'RESOURCE_PARAMS_COMMAND', \n",
+    "#      '-p qiita -N 1 -n 1 --mem 3.5gb --time 00:15:00'),\n",
+    "#     ('Filter features from a table based on abundance and prevalence [filter_features_conditionally]', \n",
+    "#      'RESOURCE_PARAMS_COMMAND', '-p qiita -N 1 -n 1 --mem 1gb --time 00:10:00'),\n",
+    "#     ('Identify core features in table [core_features]', 'RESOURCE_PARAMS_COMMAND', \n",
+    "#      '-p qiita -N 1 -n 1 --mem 3.5gb --time 2:00:00'),\n",
+    "#     ('Filter features from table [filter_features]', 'RESOURCE_PARAMS_COMMAND', \n",
+    "#      '-p qiita -N 1 -n 1 --mem 500mb --time 00:10:00');    "
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Optimizing Qiita processing jobs.\n",
+    "\n",
+    "As a reminder, we can use:\n",
+    "- 'samples'\n",
+    "- 'columns'\n",
+    "- 'input_size'\n",
+    "- 'extra_info': this is when the current method doesn't provide the required info or we need to update it; this info comes from `job_stats_generation.py`\n",
+    "\n",
+    "In addition to the list of commands above, we should take a close look at `Generate HTML summary`"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "#### Generate HTML summary"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead tr th {\n",
+       "        text-align: left;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead tr:last-of-type th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr>\n",
+       "      <th></th>\n",
+       "      <th></th>\n",
+       "      <th></th>\n",
+       "      <th colspan=\"3\" halign=\"left\">ElapsedRawTime</th>\n",
+       "      <th colspan=\"2\" halign=\"left\">MaxRSSRaw</th>\n",
+       "      <th colspan=\"2\" halign=\"left\">WaitTime</th>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th></th>\n",
+       "      <th></th>\n",
+       "      <th></th>\n",
+       "      <th>count</th>\n",
+       "      <th>min</th>\n",
+       "      <th>max</th>\n",
+       "      <th>min</th>\n",
+       "      <th>max</th>\n",
+       "      <th>min</th>\n",
+       "      <th>max</th>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>cName</th>\n",
+       "      <th>sName</th>\n",
+       "      <th>extra_info</th>\n",
+       "      <th></th>\n",
+       "      <th></th>\n",
+       "      <th></th>\n",
+       "      <th></th>\n",
+       "      <th></th>\n",
+       "      <th></th>\n",
+       "      <th></th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th rowspan=\"2\" valign=\"top\">Generate HTML summary</th>\n",
+       "      <th>Sequencing Data Type</th>\n",
+       "      <th>NaN</th>\n",
+       "      <td>78</td>\n",
+       "      <td>0 days 00:00:35</td>\n",
+       "      <td>0 days 02:18:54</td>\n",
+       "      <td>56.6 MB</td>\n",
+       "      <td>85.7 MB</td>\n",
+       "      <td>0 days 00:00:00</td>\n",
+       "      <td>0 days 06:22:26</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>BIOM type</th>\n",
+       "      <th>NaN</th>\n",
+       "      <td>2</td>\n",
+       "      <td>0 days 00:01:43</td>\n",
+       "      <td>0 days 00:02:23</td>\n",
+       "      <td>278.1 MB</td>\n",
+       "      <td>315.8 MB</td>\n",
+       "      <td>0 days 00:00:00</td>\n",
+       "      <td>0 days 00:00:01</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "                                                      ElapsedRawTime  \\\n",
+       "                                                               count   \n",
+       "cName                 sName                extra_info                  \n",
+       "Generate HTML summary Sequencing Data Type NaN                    78   \n",
+       "                      BIOM type            NaN                     2   \n",
+       "\n",
+       "                                                                       \\\n",
+       "                                                                  min   \n",
+       "cName                 sName                extra_info                   \n",
+       "Generate HTML summary Sequencing Data Type NaN        0 days 00:00:35   \n",
+       "                      BIOM type            NaN        0 days 00:01:43   \n",
+       "\n",
+       "                                                                       \\\n",
+       "                                                                  max   \n",
+       "cName                 sName                extra_info                   \n",
+       "Generate HTML summary Sequencing Data Type NaN        0 days 02:18:54   \n",
+       "                      BIOM type            NaN        0 days 00:02:23   \n",
+       "\n",
+       "                                                      MaxRSSRaw            \\\n",
+       "                                                            min       max   \n",
+       "cName                 sName                extra_info                       \n",
+       "Generate HTML summary Sequencing Data Type NaN          56.6 MB   85.7 MB   \n",
+       "                      BIOM type            NaN         278.1 MB  315.8 MB   \n",
+       "\n",
+       "                                                              WaitTime  \\\n",
+       "                                                                   min   \n",
+       "cName                 sName                extra_info                    \n",
+       "Generate HTML summary Sequencing Data Type NaN         0 days 00:00:00   \n",
+       "                      BIOM type            NaN         0 days 00:00:00   \n",
+       "\n",
+       "                                                                        \n",
+       "                                                                   max  \n",
+       "cName                 sName                extra_info                   \n",
+       "Generate HTML summary Sequencing Data Type NaN         0 days 06:22:26  \n",
+       "                      BIOM type            NaN         0 days 00:00:01  "
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    }
+   ],
+   "source": [
+    "# Generate HTML summary\n",
+    "cmd = 'Generate HTML summary'\n",
+    "summary = df[df.cName == cmd].groupby(\n",
+    "    ['cName', 'sName', 'extra_info'], dropna=False)[\n",
+    "    ['ElapsedRawTime', 'MaxRSSRaw', 'WaitTime']].agg(['count', 'min', 'max']).copy()\n",
+    "summary[('MaxRSSRaw', 'min')] = summary[('MaxRSSRaw', 'min')].apply(naturalsize)\n",
+    "summary[('MaxRSSRaw', 'max')] = summary[('MaxRSSRaw', 'max')].apply(naturalsize)\n",
+    "summary.drop(columns=[('MaxRSSRaw', 'count')], inplace=True)\n",
+    "summary.drop(columns=[('WaitTime', 'count')], inplace=True)\n",
+    "summary.sort_values(('ElapsedRawTime', 'max'), inplace=True, ascending=False)\n",
+    "\n",
+    "display(summary)\n",
+    "\n",
+    "# As a little background: in multiple cases the `Generate HTML summary` command is run as part of the\n",
+    "#                         Validate command\n",
+    "# Note: there is no special case (like for `Validate`) for `Generate HTML summary` but the jobs are small \n",
+    "#       enough to be binned together\n",
+    "\n",
+    "# *** RESOURCE ALLOCATION ***\n",
+    "\n",
+    "# INSERT INTO qiita.processing_job_resource_allocation (name, job_type, allocation) \n",
+    "#     VALUES ('Generate HTML summary', 'RESOURCE_PARAMS_COMMAND', '-p qiita -N 1 -n 1 --mem 500mb --time 3:00:00');"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "#### Rest of Qiita jobs"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3 (ipykernel)",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.9.7"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 4
+}
diff --git a/notebooks/resource-allocation/generate-allocation-summary.py b/notebooks/resource-allocation/generate-allocation-summary.py
index 1c3e081b4..e081a5d12 100644
--- a/notebooks/resource-allocation/generate-allocation-summary.py
+++ b/notebooks/resource-allocation/generate-allocation-summary.py
@@ -68,10 +68,6 @@
         extra_info = j.parameters.values[
             ('The number of rarefaction depths to include between min_depth '
              'and max_depth. (steps)')]
-    elif cmd.name == 'build_analysis_files':
-        extra_info = j.parameters.values[
-            ('The number of rarefaction depths to include between min_depth '
-             'and max_depth. (steps)')]
 
     _d['external_id'] = eid
     _d['sId'] = s.id
diff --git a/qiita_db/processing_job.py b/qiita_db/processing_job.py
index fdb30db94..8c87fceff 100644
--- a/qiita_db/processing_job.py
+++ b/qiita_db/processing_job.py
@@ -1756,10 +1756,12 @@ def _update_and_launch_children(self, mapping):
         ready = self._update_children(mapping)
         # Submit all the children that already have all the input parameters
         for c in ready:
-            c.submit()
-            # some jobs create several children jobs/validators and this can
-            # clog the submission process; giving it a second to avoid this
-            sleep(1)
+            if c.status in {'in_construction', 'waiting'}:
+                c.submit()
+                # some jobs create several children jobs/validators and this
+                # can clog the submission process; giving it a second to
+                # avoid this
+                sleep(1)
 
     @property
     def outputs(self):

From 780354fb12f51c69b34c8d74eeb0d9bcbc5b641b Mon Sep 17 00:00:00 2001
From: Antonio Gonzalez <antgonza@gmail.com>
Date: Wed, 8 Nov 2023 13:04:33 -0700
Subject: [PATCH 2/2] fix bug in has_human

---
 qiita_db/artifact.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/qiita_db/artifact.py b/qiita_db/artifact.py
index 79cf64de0..2604ee6ef 100644
--- a/qiita_db/artifact.py
+++ b/qiita_db/artifact.py
@@ -1566,7 +1566,8 @@ def has_human(self):
                 with qdb.sql_connection.TRN:
                     qdb.sql_connection.TRN.add(sql)
                     for v in qdb.sql_connection.TRN.execute_fetchflatten():
-                        if v.startswith('human-'):
+                        # str is needed as v could be None
+                        if str(v).startswith('human-'):
                             has_human = True
                             break
 

		ElapsedRawTime	MaxRSSRaw
		count	min	max	count	min	max
cName	sName
Validate	BIOM type - BIOM	687	0 days 00:00:55	0 days 01:03:49	687	222.8 MB	82.0 GB
Diversity types - FeatureData	6	0 days 00:01:20	0 days 00:02:49	6	331.4 MB	384.3 MB
Diversity types - alpha_vector	123	0 days 00:01:12	3 days 04:36:54	123	289.3 MB	101.5 GB
Diversity types - distance_matrix	117	0 days 00:00:37	0 days 00:03:55	117	122.7 MB	12.5 GB
Diversity types - ordination_results	107	0 days 00:00:39	0 days 00:03:19	107	117.2 MB	2.9 GB
Sequencing Data Type - Demultiplexed	43	0 days 00:00:35	0 days 00:12:23	43	83.4 MB	517.4 MB
Sequencing Data Type - FASTA	2	0 days 00:00:56	0 days 00:02:23	2	79.8 MB	83.6 MB
Sequencing Data Type - FASTQ	32	0 days 00:00:41	0 days 01:50:44	32	78.7 MB	84.4 MB
Sequencing Data Type - SFF	1	0 days 00:01:09	0 days 00:01:09	1	79.6 MB	79.6 MB
Sequencing Data Type - per_sample_FASTQ	73	0 days 00:00:36	0 days 18:13:21	73	77.6 MB	83.6 MB
Visualization types - q2_visualization	133	0 days 00:00:36	0 days 00:24:56	133	51.5 MB	67.5 MB
qtp-job-output-folder - job-output-folder	228	0 days 00:00:31	0 days 00:04:06	228	18.3 MB	46.7 MB