From d96a6cf048f9184de4afd68290a1373434b212de Mon Sep 17 00:00:00 2001 From: Kory Stiger Date: Fri, 4 Jun 2021 08:22:49 -0700 Subject: [PATCH 01/12] adding print of known 400 errors --- cli/cli.py | 32 +++++++++++++++++++++++++++++++- 1 file changed, 31 insertions(+), 1 deletion(-) diff --git a/cli/cli.py b/cli/cli.py index a214f570..517fda4a 100644 --- a/cli/cli.py +++ b/cli/cli.py @@ -89,7 +89,7 @@ def set_project(project_uuid): @project.command("clear") -def set_project(): +def clear_project(): """Clear global PROJECT uuid.""" config = read_config() config.pop("PROJECT") @@ -180,6 +180,8 @@ def list_datasets(filters, project=None): click.echo("Fetched datasets successfully.") except requests.exceptions.HTTPError as e: click.secho(f"Failed to fetch datasets {e}.", fg="red", err=True) + if e.response.status_code == 400: + click.secho(str(e.response.json()), fg="red", err=True) return tbl = TableLogger( @@ -228,6 +230,8 @@ def list_sims(filters, project=None): click.echo("Fetched sims successfully.") except requests.exceptions.HTTPError as e: click.secho(f"Failed to fetch sims {e}.", fg="red", err=True) + if e.response.status_code == 400: + click.secho(str(e.response.json()), fg="red", err=True) return tbl = TableLogger( @@ -275,6 +279,8 @@ def list_projects(filters): click.echo("Fetched projects successfully.") except requests.exceptions.HTTPError as e: click.secho(f"Failed to fetch projects {e}.", fg="red", err=True) + if e.response.status_code == 400: + click.secho(str(e.response.json()), fg="red", err=True) return tbl = TableLogger( @@ -316,6 +322,8 @@ def list_accounts(filters): click.echo("Fetched accounts successfully.") except requests.exceptions.HTTPError as e: click.secho(f"Failed to fetch accounts {e}.", fg="red", err=True) + if e.response.status_code == 400: + click.secho(str(e.response.json()), fg="red", err=True) return tbl = TableLogger( @@ -361,6 +369,8 @@ def list_jobs(filters, project=None): click.echo("Fetched jobs successfully.") except 
requests.exceptions.HTTPError as e: click.secho(f"Failed to fetch jobs {e}.", fg="red", err=True) + if e.response.status_code == 400: + click.secho(str(e.response.json()), fg="red", err=True) return tbl = TableLogger( @@ -413,6 +423,8 @@ def get_dataset(name, dtype, path): click.echo(f"Downloaded {dtype} dataset '{name}' to {output_path}") except requests.exceptions.HTTPError as e: click.secho(f"Failed to download dataset: {e}", fg="red", err=True) + if e.response.status_code == 400: + click.secho(str(e.response.json()), fg="red", err=True) except NameError as e: click.secho(f"Failed to download dataset: {e}", fg="yellow", err=True) @@ -439,6 +451,8 @@ def get_sim(name, path): click.echo(f"Downloaded sim '{name}' to {output_path}") except requests.exceptions.HTTPError as e: click.secho(f"Failed to download sim: {e}", fg="red", err=True) + if e.response.status_code == 400: + click.secho(str(e.response.json()), fg="red", err=True) except NameError as e: click.secho(f"Failed to download sim: {e}", fg="yellow", err=True) @@ -479,6 +493,8 @@ def upload_sim(name, path, project=None): click.secho(f"Uploaded sim {path} with name '{name}'", fg="green") except requests.exceptions.HTTPError as e: click.secho(f"Failed to upload sim: {e}", fg="red", err=True) + if e.response.status_code == 400: + click.secho(str(e.response.json()), fg="red", err=True) @upload.command("dataset") @@ -505,6 +521,8 @@ def upload_dataset(name, path, project=None): click.secho(f"Uploaded dataset {path} with name '{name}'", fg="green") except requests.exceptions.HTTPError as e: click.secho(f"Failed to upload dataset: {e}", fg="red", err=True) + if e.response.status_code == 400: + click.secho(str(e.response.json()), fg="red", err=True) # ------- CREATE @@ -534,6 +552,8 @@ def create_project(account, name): click.secho(f"Created project '{name}'", fg="green") except requests.exceptions.HTTPError as e: click.secho(f"Failed to create project: {e}", fg="red", err=True) + if e.response.status_code == 400: + 
click.secho(str(e.response.json()), fg="red", err=True) @create.command("dataset") @@ -568,6 +588,8 @@ def create_dataset(name, sim, args, project=None): ) except requests.exceptions.HTTPError as e: click.secho(f"Failed to create dataset: {e}", fg="red", err=True) + if e.response.status_code == 400: + click.secho(str(e.response.json()), fg="red", err=True) except NameError as e: click.secho(f"Failed to create dataset: {e}", fg="yellow", err=True) @@ -610,6 +632,8 @@ def create_sweep(name, sim, number, args, project=None): ) except requests.exceptions.HTTPError as e: click.secho(f"Failed to create dataset: {e}", fg="red", err=True) + if e.response.status_code == 400: + click.secho(str(e.response.json()), fg="red", err=True) except NameError as e: click.secho(f"Failed to create dataset: {e}", fg="yellow", err=True) return @@ -691,6 +715,8 @@ def create_job(name, operation, filters, configfile, sweepfile, project=None): ) except requests.exceptions.HTTPError as e: click.secho(f"Failed to create job: {e}", fg="red", err=True) + if e.response.status_code == 400: + click.secho(str(e.response.json()), fg="red", err=True) click.echo(f"Finished creating {len(job_configs)} jobs with name '{name}'") @@ -729,6 +755,8 @@ def logs_dataset(name, path): click.echo(f"Downloaded {path}/[info/debug/error].log from '{name}'.") except requests.exceptions.HTTPError as e: click.secho(f"Failed to fetch logs: {e}", fg="red", err=True) + if e.response.status_code == 400: + click.secho(str(e.response.json()), fg="red", err=True) except NameError as e: click.secho(f"Failed to fetch logs: {e}", fg="yellow", err=True) @@ -755,5 +783,7 @@ def logs_job(name, path): click.echo(f"Downloaded {path}/[info/debug/error].log from '{name}'.") except requests.exceptions.HTTPError as e: click.secho(f"Failed to fetch logs: {e}", fg="red", err=True) + if e.response.status_code == 400: + click.secho(str(e.response.json()), fg="red", err=True) except NameError as e: click.secho(f"Failed to fetch logs: {e}", 
fg="yellow", err=True) From 173d0d6fb536efca2b3f32a90f3cea1c5d3e9a18 Mon Sep 17 00:00:00 2001 From: Kory Stiger Date: Fri, 4 Jun 2021 08:29:29 -0700 Subject: [PATCH 02/12] fix project filter in create job --- cli/datasets.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cli/datasets.py b/cli/datasets.py index 5a8d2dbc..7ae37982 100644 --- a/cli/datasets.py +++ b/cli/datasets.py @@ -30,7 +30,7 @@ def filter_datasets(dfilter, project, url, auth_headers): for dataset_type in DATASET_TYPES: endpoint = f"{url}/api/v1/{dataset_type}/" params = { - **params, + 'project': project, f"{field}__{pattern}": regex, } From 5d143a2e5fd2bbc33d062ce69170aafdc273ba4f Mon Sep 17 00:00:00 2001 From: Lint Bot Date: Fri, 4 Jun 2021 15:29:59 +0000 Subject: [PATCH 03/12] Fix code style issues with Black --- cli/datasets.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cli/datasets.py b/cli/datasets.py index 7ae37982..fb43a72a 100644 --- a/cli/datasets.py +++ b/cli/datasets.py @@ -30,7 +30,7 @@ def filter_datasets(dfilter, project, url, auth_headers): for dataset_type in DATASET_TYPES: endpoint = f"{url}/api/v1/{dataset_type}/" params = { - 'project': project, + "project": project, f"{field}__{pattern}": regex, } From 56a409be0cceb0a42f36429b7cc2072f0525512d Mon Sep 17 00:00:00 2001 From: Kory Stiger Date: Mon, 7 Jun 2021 15:09:26 -0700 Subject: [PATCH 04/12] - rename functions to fix 'var mirrors name in outer scope' issue - fix data set filtering for jobs - add helper function to print list of strings as columns --- cli/cli.py | 97 +++++++++++++++++++++++++++---------------------- cli/datasets.py | 30 +++++++++++---- cli/utils.py | 26 +++++++++++++ 3 files changed, 101 insertions(+), 52 deletions(-) diff --git a/cli/cli.py b/cli/cli.py index 517fda4a..d40de991 100644 --- a/cli/cli.py +++ b/cli/cli.py @@ -1,4 +1,5 @@ import json +import math import click import requests @@ -7,7 +8,7 @@ from cli.config import initialize_config, read_config, 
write_config, get_endpoint from cli.loader import Loader -from cli.utils import parse_args, resolve_sweep, use_project +from cli.utils import parse_args, resolve_sweep, use_project, print_list_as_columns from zpy.files import read_json, to_pathlib_path SMALL_WIDTH = 12 @@ -28,7 +29,7 @@ def cli(): @cli.command("help") -def help(): +def cli_help(): """display help This will display help in order to provide users with more information @@ -70,13 +71,13 @@ def set_env(env): click.echo("zpy login to fetch token") -@cli.group() -def project(): +@cli.group("project") +def cli_project(): """Manage global project workspace.""" pass -@project.command("set") +@cli_project.command("set") @click.argument("project_uuid", type=click.UUID) def set_project(project_uuid): """Set global PROJECT uuid.""" @@ -88,7 +89,7 @@ def set_project(project_uuid): click.echo(f" {old_project_uuid} -> {config['PROJECT']}") -@project.command("clear") +@cli_project.command("clear") def clear_project(): """Clear global PROJECT uuid.""" config = read_config() @@ -124,7 +125,7 @@ def login(username, password): @cli.command("config") -def config(): +def cli_config(): """display config Display current configuration file to developer. @@ -147,8 +148,8 @@ def version(): # ------- LIST -@cli.group() -def list(): +@cli.group("list") +def cli_list(): """List objects. List group is used for list commands on backend objects. 
@@ -156,7 +157,7 @@ def list(): pass -@list.command("datasets") +@cli_list.command("datasets") @click.argument("filters", nargs=-1) @use_project() def list_datasets(filters, project=None): @@ -206,7 +207,7 @@ def list_datasets(filters, project=None): ) -@list.command("sims") +@cli_list.command("sims") @click.argument("filters", nargs=-1) @use_project() def list_sims(filters, project=None): @@ -258,7 +259,7 @@ def list_sims(filters, project=None): ) -@list.command("projects") +@cli_list.command("projects") @click.argument("filters", nargs=-1) def list_projects(filters): """list projects @@ -270,7 +271,7 @@ def list_projects(filters): try: filters = parse_args(filters) except Exception: - click.secho("Failed to parse filters: {args}", fg="yellow", err=True) + click.secho(f"Failed to parse filters: {filters}", fg="yellow", err=True) return try: @@ -301,7 +302,7 @@ def list_projects(filters): ) -@list.command("accounts") +@cli_list.command("accounts") @click.argument("filters", nargs=-1) def list_accounts(filters): """list accounts @@ -313,7 +314,7 @@ def list_accounts(filters): try: filters = parse_args(filters) except Exception: - click.secho("Failed to parse filters: {args}", fg="yellow", err=True) + click.secho(f"Failed to parse filters: {filters}", fg="yellow", err=True) return try: @@ -344,7 +345,7 @@ def list_accounts(filters): ) -@list.command("jobs") +@cli_list.command("jobs") @click.argument("filters", nargs=-1) @use_project() def list_jobs(filters, project=None): @@ -360,7 +361,7 @@ def list_jobs(filters, project=None): if project: filters["project"] = project except Exception: - click.secho("Failed to parse filters: {args}", fg="yellow", err=True) + click.secho(f"Failed to parse filters: {filters}", fg="yellow", err=True) return try: @@ -390,7 +391,7 @@ def list_jobs(filters, project=None): # ------- GET -@cli.group() +@cli.group("get") def get(): """get object @@ -460,7 +461,7 @@ def get_sim(name, path): # ------- UPLOAD -@cli.group() +@cli.group("upload") 
def upload(): """upload object @@ -528,7 +529,7 @@ def upload_dataset(name, path, project=None): # ------- CREATE -@cli.group() +@cli.group("create") def create(): """create object @@ -578,7 +579,7 @@ def create_dataset(name, sim, args, project=None): try: dataset_config = parse_args(args) except Exception: - click.secho("Failed to parse args: {args}", fg="yellow", err=True) + click.secho(f"Failed to parse args: {args}", fg="yellow", err=True) return try: create_generated_dataset(name, sim, parse_args(args), project) @@ -619,7 +620,7 @@ def create_sweep(name, sim, number, args, project=None): try: dataset_config = parse_args(args) except Exception: - click.secho("Failed to parse args: {args}", fg="yellow", err=True) + click.secho(f"Failed to parse args: {args}", fg="yellow", err=True) return for i in range(int(number)): dataset_name = f"{name} seed{i}" @@ -642,48 +643,56 @@ def create_sweep(name, sim, number, args, project=None): @create.command("job") @click.argument("name") -@click.argument("operation") -@click.option("filters", "-f", multiple=True) +@click.argument("operation", type=click.Choice(["package", "tvt", "train"])) +@click.option( + "filters", + "-f", + multiple=True, + help="Key/value pairs separated by spaces. Passed as query params in the API call to filter data sets." +) @click.option( "configfile", "--configfile", type=click.Path(exists=True, dir_okay=False, resolve_path=True), + help="Path to json file" ) @click.option( "sweepfile", "--sweepfile", type=click.Path(exists=True, dir_okay=False, resolve_path=True), + help="Path to json file" ) @use_project(required=True) def create_job(name, operation, filters, configfile, sweepfile, project=None): """create job - Create a job object in backend that will trigger an operation on - datasets filtered by the filters. Requires PROJECT set via `zpy project`. 
- - Args: - name (str): name of new job - operation (str): name of operation to run on datasets - filters (str): string filters for dataset names to run job on - configfile (str): json configuration for the job - sweepfile (str): sweep json to launch a suite of jobs - project (str): project uuid + Create a job called NAME within PROJECT to perform OPERATION on a group of datasets defined by the FILTERS + provided by -f. Requires PROJECT set via `zpy project`. """ from cli.datasets import filter_datasets from cli.jobs import create_new_job - datasets = [] + filtered_datasets = [] for dfilter in filters: try: with Loader(f"Filtering datasets by '{dfilter}'..."): - filtered_datasets = filter_datasets(dfilter, project) - filtered_datasets_names = [*filtered_datasets.keys()] - click.echo( - f"Filtered datasets by filter '{dfilter}':\n{filtered_datasets_names}" - ) - datasets.append(filtered_datasets.values()) + datasets_by_type = filter_datasets(dfilter, project) + + for [dataset_type, datasets] in datasets_by_type.items(): + count = len(datasets) + click.secho(f"Found {count} of type<{dataset_type}>") + + if count == 0: + continue + + dataset_names = list(datasets.values()) + print_list_as_columns(dataset_names) + + filtered_datasets_ids = [data_set_id for data_sets in datasets_by_type.values() for data_set_id in + data_sets.keys()] + filtered_datasets.extend(filtered_datasets_ids) except requests.exceptions.HTTPError as e: - click.secho(f"Failed to filter datsets {e}", fg="red", err=True) + click.secho(f"Failed to filter datasets {e}", fg="red", err=True) job_configs = [] if configfile: @@ -709,7 +718,7 @@ def create_job(name, operation, filters, configfile, sweepfile, project=None): for i, config in enumerate(job_configs): job_name = name if i == 0 else f"{name} {i}" try: - create_new_job(job_name, operation, config, datasets, project) + create_new_job(job_name, operation, config, filtered_datasets, project) click.secho( f"Created {operation} job '{job_name}' with 
config {config}", fg="green" ) @@ -724,7 +733,7 @@ def create_job(name, operation, filters, configfile, sweepfile, project=None): # ------- LOGS -@cli.group() +@cli.group("logs") def logs(): """logs diff --git a/cli/datasets.py b/cli/datasets.py index fb43a72a..350bbb1a 100644 --- a/cli/datasets.py +++ b/cli/datasets.py @@ -23,9 +23,14 @@ def filter_datasets(dfilter, project, url, auth_headers): auth_headers: authentication for backend Return: - dict: filtered datasets by dfilter {'name': 'id'} + dict: filtered datasets by dfilter + { + 'uploaded-data-sets': {'id': 'name'}, + 'generated-data-sets': {'id': 'name'}, + 'job-data-sets': {'id': 'name'}, + } """ - filtered_datasets = {} + filtered_datasets = {key: {} for key in DATASET_TYPES} field, pattern, regex = parse_filter(dfilter) for dataset_type in DATASET_TYPES: endpoint = f"{url}/api/v1/{dataset_type}/" @@ -34,14 +39,23 @@ def filter_datasets(dfilter, project, url, auth_headers): f"{field}__{pattern}": regex, } - while endpoint is not None: - r = requests.get(endpoint, params=params, headers=auth_headers) + # Do initial request + r = requests.get(endpoint, params=params, headers=auth_headers) + if r.status_code != 200: + r.raise_for_status() + body = json.loads(r.text) + for data_set in body["results"]: + filtered_datasets[dataset_type][data_set["id"]] = data_set["name"] + + # Traverse the next links until we've gotten all of the data sets + while body["next"] is not None: + r = requests.get(body["next"], headers=auth_headers) if r.status_code != 200: r.raise_for_status() - response = json.loads(r.text) - for r in response["results"]: - filtered_datasets[r["name"]] = r["id"] - endpoint = response["next"] + body = json.loads(r.text) + for data_set in body["results"]: + filtered_datasets[dataset_type][data_set["id"]] = data_set["name"] + return filtered_datasets diff --git a/cli/utils.py b/cli/utils.py index 2f465b72..2df022cb 100644 --- a/cli/utils.py +++ b/cli/utils.py @@ -1,4 +1,5 @@ import functools +import 
math from copy import deepcopy from itertools import product from urllib.request import urlopen @@ -169,3 +170,28 @@ def wrapper(*args, **kwargs): return wrapper return use_project_inner + + +def print_list_as_columns(list_of_strings, num_cols=5, indent_prefix=" "): + """Format and echo a list of strings into nicely formatted columns. + + Args: + list_of_strings (list of str): A list of similar strings to format into columns. + num_cols (int): Desired number of columns. + indent_prefix (str): String to attach to the beginning of every printed line. + Returns: + None + """ + count = len(list_of_strings) + col_width = max(len(string) for string in list_of_strings) + num_rows = math.ceil(count / num_cols) + for i in range(num_rows): + start_index = i * num_cols + end_index = (i + 1) * num_cols + if end_index > len(list_of_strings): + end_index = len(list_of_strings) + row = list_of_strings[start_index: end_index] + + format_string = indent_prefix + " ".join(["{{:<{}}}".format(col_width) for _ in row]) + + click.echo(format_string.format(*row)) From d35f41751dba39b6d25426d106e64ceb3898d7ca Mon Sep 17 00:00:00 2001 From: Lint Bot Date: Mon, 7 Jun 2021 22:09:57 +0000 Subject: [PATCH 05/12] Fix code style issues with Black --- cli/cli.py | 13 ++++++++----- cli/utils.py | 6 ++++-- 2 files changed, 12 insertions(+), 7 deletions(-) diff --git a/cli/cli.py b/cli/cli.py index d40de991..d104e778 100644 --- a/cli/cli.py +++ b/cli/cli.py @@ -648,19 +648,19 @@ def create_sweep(name, sim, number, args, project=None): "filters", "-f", multiple=True, - help="Key/value pairs separated by spaces. Passed as query params in the API call to filter data sets." + help="Key/value pairs separated by spaces. 
Passed as query params in the API call to filter data sets.", ) @click.option( "configfile", "--configfile", type=click.Path(exists=True, dir_okay=False, resolve_path=True), - help="Path to json file" + help="Path to json file", ) @click.option( "sweepfile", "--sweepfile", type=click.Path(exists=True, dir_okay=False, resolve_path=True), - help="Path to json file" + help="Path to json file", ) @use_project(required=True) def create_job(name, operation, filters, configfile, sweepfile, project=None): @@ -688,8 +688,11 @@ def create_job(name, operation, filters, configfile, sweepfile, project=None): dataset_names = list(datasets.values()) print_list_as_columns(dataset_names) - filtered_datasets_ids = [data_set_id for data_sets in datasets_by_type.values() for data_set_id in - data_sets.keys()] + filtered_datasets_ids = [ + data_set_id + for data_sets in datasets_by_type.values() + for data_set_id in data_sets.keys() + ] filtered_datasets.extend(filtered_datasets_ids) except requests.exceptions.HTTPError as e: click.secho(f"Failed to filter datasets {e}", fg="red", err=True) diff --git a/cli/utils.py b/cli/utils.py index 2df022cb..513649f6 100644 --- a/cli/utils.py +++ b/cli/utils.py @@ -190,8 +190,10 @@ def print_list_as_columns(list_of_strings, num_cols=5, indent_prefix=" "): end_index = (i + 1) * num_cols if end_index > len(list_of_strings): end_index = len(list_of_strings) - row = list_of_strings[start_index: end_index] + row = list_of_strings[start_index:end_index] - format_string = indent_prefix + " ".join(["{{:<{}}}".format(col_width) for _ in row]) + format_string = indent_prefix + " ".join( + ["{{:<{}}}".format(col_width) for _ in row] + ) click.echo(format_string.format(*row)) From 2a4dbc0b3bc4925ebcc9681fcc28e5c3c9409355 Mon Sep 17 00:00:00 2001 From: HugoCMU Date: Wed, 2 Jun 2021 12:39:52 -0700 Subject: [PATCH 06/12] Start of literature docs --- docs/overview/literature.md | 55 +++++++++++++++++++++++++++---------- docs/overview/what.md | 2 +- 2 files 
changed, 41 insertions(+), 16 deletions(-) diff --git a/docs/overview/literature.md b/docs/overview/literature.md index 2d158500..9b578d5a 100644 --- a/docs/overview/literature.md +++ b/docs/overview/literature.md @@ -1,22 +1,47 @@ -## Papers +# Synthetic Data Literature -Many papers have been written about synthetic data over the years. This is a short list of some of the important ones: +Many papers have been written about synthetic data over the years. If academic papers aren't your jam, we [publish articles](https://www.zumolabs.ai/blog) to explain synthetic data as simply as we can. Below are some key papers **organized by ...** -- [Using Synthetic Data to Train Neural Networks is Model-Based Reasoning](https://arxiv.org/pdf/1703.00868.pdf) -- [Multi Modal Semantic Segmentation using Synthetic Data](https://arxiv.org/pdf/1910.13676.pdf) -- [Semantic Understanding of Foggy Scenes with Purely Synthetic Data](https://arxiv.org/pdf/1910.03997.pdf) -- [Deep Drone Racing: From Simulation to Reality with Domain Randomization](https://arxiv.org/pdf/1905.09727.pdf) -- [Learning from Synthetic Humans](https://arxiv.org/pdf/1701.01370.pdf) -- [Sim-to-Real via Sim-to-Sim: Data-efficient Robotic Grasping via Randomized-to-Canonical Adaptation Networks](https://arxiv.org/pdf/1812.07252.pdf) +**... usecase:** + +- Robotics: +- Autonomous Vehicles: +- Humans: +- Space: +- ML Theory: +- Review: + +**... 
year:** + +- 2019: +- 2020: +- 2021: + +# Paper Sumarries + +### [Sim-to-Real via Sim-to-Sim: Data-efficient Robotic Grasping via Randomized-to-Canonical Adaptation Networks](https://arxiv.org/pdf/1812.07252.pdf) + +**Usecase** Robotic Grasping + +**Year** 2019 + +**TLDR** Randomized to-Canonical Adaptation Networks (RCANs): learns to translate randomized rendered images into their equivalent non-randomized, canonical versions.Achieving 91% grasp performance with just 5,000 real-world grasps attaining comparable performance to a state-of-the-art system +trained with 580,000 real-world grasps, resulting in a reduction of real-world data by more than 99%. + + +- [Learning from Simulated and Unsupervised Images through Adversarial Training](https://arxiv.org/pdf/1612.07828.pdf) - [Domain Randomization for Transferring Deep Neural Networks from Simulation to the Real World](https://arxiv.org/pdf/1703.06907.pdf) +- [Deep Drone Racing: From Simulation to Reality with Domain Randomization](https://arxiv.org/pdf/1905.09727.pdf) - [Structured Domain Randomization: Bridging the Reality Gap by Context-Aware Synthetic Data](https://arxiv.org/pdf/1810.10093.pdf) -- [Learning from Simulated and Unsupervised Images through Adversarial Training](https://arxiv.org/pdf/1612.07828.pdf) -## Blogs +## -We publish artticles to explain synthetic data: +- [Using Synthetic Data to Train Neural Networks is Model-Based Reasoning](https://arxiv.org/pdf/1703.00868.pdf) + +# Synthetic Humans + +- [Learning from Synthetic Humans](https://arxiv.org/pdf/1701.01370.pdf) +- [Multi Modal Semantic Segmentation using Synthetic Data](https://arxiv.org/pdf/1910.13676.pdf) +- [Semantic Understanding of Foggy Scenes with Purely Synthetic Data](https://arxiv.org/pdf/1910.03997.pdf) -- [Synthetic Data is Dynamic Data](https://www.zumolabs.ai/post/five-big-problems-with-labeled-data) -- [Patrick vs Squidward: Training Vote Detection AI with Synthetic 
Data](https://www.zumolabs.ai/post/patrick-vs-squidward-training-vote-detection-ai-with-synthetic-data) -- [Five Big Problems With Labeled Data](https://www.zumolabs.ai/post/five-big-problems-with-labeled-data) -- [Synthetic Data: Useful, Privacy-Risk-Free Data](https://www.zumolabs.ai/post/synthetic-data-useful-privacy-risk-free-data) \ No newline at end of file +## Summary Papers diff --git a/docs/overview/what.md b/docs/overview/what.md index 714d6830..d19aaba7 100644 --- a/docs/overview/what.md +++ b/docs/overview/what.md @@ -1 +1 @@ -Synthetic data is data that is *created* as opposed to *collected*. Synthetic data can be used in any of the flavors of Machine Learning. \ No newline at end of file +Synthetic data is data that is *created* as opposed to *collected*. Synthetic data can be used in any of the flavors of Machine Learning (natural language, computer vision, tabular data). \ No newline at end of file From de59163c440fa8aff9447112dbb1df570155b9cb Mon Sep 17 00:00:00 2001 From: HugoCMU Date: Thu, 3 Jun 2021 10:56:30 -0700 Subject: [PATCH 07/12] link format for papers --- docs/overview/literature.md | 102 +++++++++++++++++++++++++++++++----- 1 file changed, 88 insertions(+), 14 deletions(-) diff --git a/docs/overview/literature.md b/docs/overview/literature.md index 9b578d5a..e5edf7cd 100644 --- a/docs/overview/literature.md +++ b/docs/overview/literature.md @@ -4,7 +4,7 @@ Many papers have been written about synthetic data over the years. If academic p **... usecase:** -- Robotics: +- Robotics: [1](#ref1), - Autonomous Vehicles: - Humans: - Space: @@ -13,13 +13,14 @@ Many papers have been written about synthetic data over the years. If academic p **... 
year:** -- 2019: +- 2017: +- 2019: [1](#ref1), - 2020: - 2021: # Paper Sumarries -### [Sim-to-Real via Sim-to-Sim: Data-efficient Robotic Grasping via Randomized-to-Canonical Adaptation Networks](https://arxiv.org/pdf/1812.07252.pdf) +### [Sim-to-Real via Sim-to-Sim: Data-efficient Robotic Grasping via Randomized-to-Canonical Adaptation Networks](https://arxiv.org/pdf/1812.07252.pdf) **Usecase** Robotic Grasping @@ -28,20 +29,93 @@ Many papers have been written about synthetic data over the years. If academic p **TLDR** Randomized to-Canonical Adaptation Networks (RCANs): learns to translate randomized rendered images into their equivalent non-randomized, canonical versions.Achieving 91% grasp performance with just 5,000 real-world grasps attaining comparable performance to a state-of-the-art system trained with 580,000 real-world grasps, resulting in a reduction of real-world data by more than 99%. +--- -- [Learning from Simulated and Unsupervised Images through Adversarial Training](https://arxiv.org/pdf/1612.07828.pdf) -- [Domain Randomization for Transferring Deep Neural Networks from Simulation to the Real World](https://arxiv.org/pdf/1703.06907.pdf) -- [Deep Drone Racing: From Simulation to Reality with Domain Randomization](https://arxiv.org/pdf/1905.09727.pdf) -- [Structured Domain Randomization: Bridging the Reality Gap by Context-Aware Synthetic Data](https://arxiv.org/pdf/1810.10093.pdf) +### [Learning from Simulated and Unsupervised Images through Adversarial Training](https://arxiv.org/pdf/1612.07828.pdf) -## +**Usecase** -- [Using Synthetic Data to Train Neural Networks is Model-Based Reasoning](https://arxiv.org/pdf/1703.00868.pdf) +**Year** -# Synthetic Humans +**TLDR** -- [Learning from Synthetic Humans](https://arxiv.org/pdf/1701.01370.pdf) -- [Multi Modal Semantic Segmentation using Synthetic Data](https://arxiv.org/pdf/1910.13676.pdf) -- [Semantic Understanding of Foggy Scenes with Purely Synthetic Data](https://arxiv.org/pdf/1910.03997.pdf) 
+--- -## Summary Papers +### [Domain Randomization for Transferring Deep Neural Networks from Simulation to the Real World](https://arxiv.org/pdf/1703.06907.pdf) + +**Usecase** + +**Year** + +**TLDR** + +--- + +### [Deep Drone Racing: From Simulation to Reality with Domain Randomization](https://arxiv.org/pdf/1905.09727.pdf) + +**Usecase** + +**Year** + +**TLDR** + +--- +### [Structured Domain Randomization: Bridging the Reality Gap by Context-Aware Synthetic Data](https://arxiv.org/pdf/1810.10093.pdf) + +**Usecase** + +**Year** + +**TLDR** + +--- + +### [Using Synthetic Data to Train Neural Networks is Model-Based Reasoning](https://arxiv.org/pdf/1703.00868.pdf) + +**Usecase** + +**Year** + +**TLDR** + +--- + +### [Learning from Synthetic Humans](https://arxiv.org/pdf/1701.01370.pdf) + +**Usecase** + +**Year** + +**TLDR** + +--- + +### [Multi Modal Semantic Segmentation using Synthetic Data](https://arxiv.org/pdf/1910.13676.pdf) + +**Usecase** + +**Year** + +**TLDR** + +--- + +### [Semantic Understanding of Foggy Scenes with Purely Synthetic Data](https://arxiv.org/pdf/1910.03997.pdf) + +**Usecase** + +**Year** + +**TLDR** + +--- + +### [Synthetic Data for Deep Learning](https://arxiv.org/pdf/1909.11512.pdf) + +**Usecase** + +**Year** + +**TLDR** + +--- \ No newline at end of file From b36374e79d6dddd83a6175890e70522e2a8738ac Mon Sep 17 00:00:00 2001 From: HugoCMU Date: Thu, 3 Jun 2021 11:06:46 -0700 Subject: [PATCH 08/12] abstract over tldr --- docs/overview/literature.md | 35 ++++++++++++++++++----------------- 1 file changed, 18 insertions(+), 17 deletions(-) diff --git a/docs/overview/literature.md b/docs/overview/literature.md index e5edf7cd..c876300f 100644 --- a/docs/overview/literature.md +++ b/docs/overview/literature.md @@ -6,19 +6,21 @@ Many papers have been written about synthetic data over the years. If academic p - Robotics: [1](#ref1), - Autonomous Vehicles: -- Humans: +- Humans: [2](#ref2), - Space: - ML Theory: - Review: **... 
year:** -- 2017: +- 2017: [2](#ref2), - 2019: [1](#ref1), - 2020: - 2021: -# Paper Sumarries +The abstracts are also included with the paper links, so a good way to use this document is to `ctrl-F` the key words relevant to your usecase. + +# Papers ### [Sim-to-Real via Sim-to-Sim: Data-efficient Robotic Grasping via Randomized-to-Canonical Adaptation Networks](https://arxiv.org/pdf/1812.07252.pdf) @@ -26,18 +28,17 @@ Many papers have been written about synthetic data over the years. If academic p **Year** 2019 -**TLDR** Randomized to-Canonical Adaptation Networks (RCANs): learns to translate randomized rendered images into their equivalent non-randomized, canonical versions.Achieving 91% grasp performance with just 5,000 real-world grasps attaining comparable performance to a state-of-the-art system -trained with 580,000 real-world grasps, resulting in a reduction of real-world data by more than 99%. +**Abstract** Real world data, especially in the domain of robotics, is notoriously costly to collect. One way to circumvent this can be to leverage the power of simulation to produce large amounts of labelled data. However, training models on simulated images does not readily transfer to realworld ones. Using domain adaptation methods to cross this “reality gap” requires a large amount of unlabelled realworld data, whilst domain randomization alone can waste modeling power. In this paper, we present Randomizedto-Canonical Adaptation Networks (RCANs), a novel approach to crossing the visual reality gap that uses no realworld data. Our method learns to translate randomized rendered images into their equivalent non-randomized, canonical versions. This in turn allows for real images to also be translated into canonical sim images. 
We demonstrate the effectiveness of this sim-to-real approach by training a vision-based closed-loop grasping reinforcement learning agent in simulation, and then transferring it to the real world to attain 70% zero-shot grasp success on unseen objects, a result that almost doubles the success of learning the same task directly on domain randomization alone. Additionally, by joint finetuning in the real-world with only 5,000 real-world grasps, our method achieves 91%, attaining comparable performance to a state-of-the-art system trained with 580,000 real-world grasps, resulting in a reduction of real-world data by more than 99%. --- -### [Learning from Simulated and Unsupervised Images through Adversarial Training](https://arxiv.org/pdf/1612.07828.pdf) +### [Learning from Simulated and Unsupervised Images through Adversarial Training](https://arxiv.org/pdf/1612.07828.pdf) -**Usecase** +**Usecase** Human Gaze Estimation -**Year** +**Year** 2017 -**TLDR** +**Abstract** With recent progress in graphics, it has become more tractable to train models on synthetic images, potentially avoiding the need for expensive annotations. However, learning from synthetic images may not achieve the desired performance due to a gap between synthetic and real image distributions. To reduce this gap, we propose Simulated+Unsupervised (S+U) learning, where the task is to learn a model to improve the realism of a simulator’s output using unlabeled real data, while preserving the annotation information from the simulator. We develop a method for S+U learning that uses an adversarial network similar to Generative Adversarial Networks (GANs), but with synthetic images as inputs instead of random vectors. We make several key modifications to the standard GAN algorithm to preserve annotations, avoid artifacts, and stabilize training: (i) a ‘self-regularization’ term, (ii) a local adversarial loss, and (iii) updating the discriminator using a history of refined images. 
We show that this enables generation of highly realistic images, which we demonstrate both qualitatively and with a user study. We quantitatively evaluate the generated images by training models for gaze estimation and hand pose estimation. We show a significant improvement over using synthetic images, and achieve state-of-the-art results on the MPIIGaze dataset without any labeled real data. --- @@ -47,7 +48,7 @@ trained with 580,000 real-world grasps, resulting in a reduction of real-world d **Year** -**TLDR** +**Abstract** --- @@ -57,7 +58,7 @@ trained with 580,000 real-world grasps, resulting in a reduction of real-world d **Year** -**TLDR** +**Abstract** --- ### [Structured Domain Randomization: Bridging the Reality Gap by Context-Aware Synthetic Data](https://arxiv.org/pdf/1810.10093.pdf) @@ -66,7 +67,7 @@ trained with 580,000 real-world grasps, resulting in a reduction of real-world d **Year** -**TLDR** +**Abstract** --- @@ -76,7 +77,7 @@ trained with 580,000 real-world grasps, resulting in a reduction of real-world d **Year** -**TLDR** +**Abstract** --- @@ -86,7 +87,7 @@ trained with 580,000 real-world grasps, resulting in a reduction of real-world d **Year** -**TLDR** +**Abstract** --- @@ -96,7 +97,7 @@ trained with 580,000 real-world grasps, resulting in a reduction of real-world d **Year** -**TLDR** +**Abstract** --- @@ -106,7 +107,7 @@ trained with 580,000 real-world grasps, resulting in a reduction of real-world d **Year** -**TLDR** +**Abstract** --- @@ -116,6 +117,6 @@ trained with 580,000 real-world grasps, resulting in a reduction of real-world d **Year** -**TLDR** +**Abstract** --- \ No newline at end of file From c8722ec4cc6a305d537c84027ee2ab4533487998 Mon Sep 17 00:00:00 2001 From: HugoCMU Date: Thu, 3 Jun 2021 11:21:46 -0700 Subject: [PATCH 09/12] Edits to literature page --- docs/overview/literature.md | 87 +++++++++++++++++++------------------ 1 file changed, 44 insertions(+), 43 deletions(-) diff --git a/docs/overview/literature.md 
b/docs/overview/literature.md index c876300f..5ca58127 100644 --- a/docs/overview/literature.md +++ b/docs/overview/literature.md @@ -4,25 +4,26 @@ Many papers have been written about synthetic data over the years. If academic p **... usecase:** -- Robotics: [1](#ref1), -- Autonomous Vehicles: -- Humans: [2](#ref2), +- Robotics: [1](#ref1), [3](#ref3), [4](#ref4), +- Autonomous Vehicles: [5](#ref5), [8](#ref8), [9](#ref9), +- Humans: [2](#ref2), [7](#ref7), - Space: -- ML Theory: -- Review: +- ML Theory: [6](#ref6), +- Overview: [10](#ref10), **... year:** -- 2017: [2](#ref2), -- 2019: [1](#ref1), -- 2020: +- 2017: [2](#ref2), [3](#ref3), [6](#ref6), +- 2018: [7](#ref7), +- 2019: [1](#ref1), [4](#ref4), [8](#ref8), +- 2020: [5](#ref5), [9](#ref9), - 2021: -The abstracts are also included with the paper links, so a good way to use this document is to `ctrl-F` the key words relevant to your usecase. +**TIP** The abstracts are also included with the paper links, so a good way to use this document is to `ctrl-F` the key words relevant to your usecase. 
# Papers -### [Sim-to-Real via Sim-to-Sim: Data-efficient Robotic Grasping via Randomized-to-Canonical Adaptation Networks](https://arxiv.org/pdf/1812.07252.pdf) +## [Sim-to-Real via Sim-to-Sim: Data-efficient Robotic Grasping via Randomized-to-Canonical Adaptation Networks](https://arxiv.org/pdf/1812.07252.pdf) **Usecase** Robotic Grasping @@ -32,7 +33,7 @@ The abstracts are also included with the paper links, so a good way to use this --- -### [Learning from Simulated and Unsupervised Images through Adversarial Training](https://arxiv.org/pdf/1612.07828.pdf) +## [Learning from Simulated and Unsupervised Images through Adversarial Training](https://arxiv.org/pdf/1612.07828.pdf) **Usecase** Human Gaze Estimation @@ -42,81 +43,81 @@ The abstracts are also included with the paper links, so a good way to use this --- -### [Domain Randomization for Transferring Deep Neural Networks from Simulation to the Real World](https://arxiv.org/pdf/1703.06907.pdf) +## [Domain Randomization for Transferring Deep Neural Networks from Simulation to the Real World](https://arxiv.org/pdf/1703.06907.pdf) -**Usecase** +**Usecase** Robotic Grasping -**Year** +**Year** 2017 -**Abstract** +**Abstract** Bridging the ‘reality gap’ that separates simulated robotics from experiments on hardware could accelerate robotic research through improved data availability. This paper explores domain randomization, a simple technique for training models on simulated images that transfer to real images by randomizing rendering in the simulator. With enough variability in the simulator, the real world may appear to the model as just another variation. We focus on the task of object localization, which is a stepping stone to general robotic manipulation skills. We find that it is possible to train a real-world object detector that is accurate to 1.5 cm and robust to distractors and partial occlusions using only data from a simulator with non-realistic random textures. 
To demonstrate the capabilities of our detectors, we show they can be used to perform grasping in a cluttered environment. To our knowledge, this is the first successful transfer of a deep neural network trained only on simulated RGB images (without pre-training on real images) to the real world for the purpose of robotic control. --- -### [Deep Drone Racing: From Simulation to Reality with Domain Randomization](https://arxiv.org/pdf/1905.09727.pdf) +## [Deep Drone Racing: From Simulation to Reality with Domain Randomization](https://arxiv.org/pdf/1905.09727.pdf) -**Usecase** +**Usecase** Drone Racing -**Year** +**Year** 2019 -**Abstract** +**Abstract** Dynamically changing environments, unreliable state estimation, and operation under severe resource constraints are fundamental challenges that limit the deployment of small autonomous drones. We address these challenges in the context of autonomous, vision-based drone racing in dynamic environments. A racing drone must traverse a track with possibly moving gates at high speed. We enable this functionality by combining the performance of a state-of-the-art planning and control system with the perceptual awareness of a convolutional neural network (CNN). The resulting modular system is both platform- and domain-independent: it is trained in simulation and deployed on a physical quadrotor without any fine-tuning. The abundance of simulated data, generated via domain randomization, makes our system robust to changes of illumination and gate appearance. To the best of our knowledge, our approach is the first to demonstrate zero-shot sim-to-real transfer on the task of agile drone flight. We extensively test the precision and robustness of our system, both in simulation and on a physical platform, and show significant improvements over the state of the art. 
--- -### [Structured Domain Randomization: Bridging the Reality Gap by Context-Aware Synthetic Data](https://arxiv.org/pdf/1810.10093.pdf) +## [Structured Domain Randomization: Bridging the Reality Gap by Context-Aware Synthetic Data](https://arxiv.org/pdf/1810.10093.pdf) -**Usecase** +**Usecase** Autonomous Vehicles -**Year** +**Year** 2020 -**Abstract** +**Abstract** We present structured domain randomization (SDR), a variant of domain randomization (DR) that takes into account the structure and context of the scene. In contrast to DR, which places objects and distractors randomly according to a uniform probability distribution, SDR places objects and distractors randomly according to probability distributions that arise from the specific problem at hand. In this manner, SDRgenerated imagery enables the neural network to take the context around an object into consideration during detection. We demonstrate the power of SDR for the problem of 2D bounding box car detection, achieving competitive results on real data after training only on synthetic data. On the KITTI easy, moderate, and hard tasks, we show that SDR outperforms other approaches to generating synthetic data (VKITTI, Sim 200k, or DR), as well as real data collected in a different domain (BDD100K). Moreover, synthetic SDR data combined with real KITTI data outperforms real KITTI data alone. --- -### [Using Synthetic Data to Train Neural Networks is Model-Based Reasoning](https://arxiv.org/pdf/1703.00868.pdf) +## [Using Synthetic Data to Train Neural Networks is Model-Based Reasoning](https://arxiv.org/pdf/1703.00868.pdf) -**Usecase** +**Usecase** ML Theory -**Year** +**Year** 2017 -**Abstract** +**Abstract** We draw a formal connection between using synthetic training data to optimize neural network parameters and approximate, Bayesian, model-based reasoning. 
In particular, training a neural network using synthetic data can be viewed as learning a proposal distribution generator for approximate inference in the synthetic-data generative model. We demonstrate this connection in a recognition task where we develop a novel Captcha-breaking architecture and train it using synthetic data, demonstrating both state-of-the-art performance and a way of computing task-specific posterior uncertainty. Using a neural network trained this way, we also demonstrate successful breaking of real-world Captchas currently used by Facebook and Wikipedia. Reasoning from these empirical results and drawing connections with Bayesian modeling, we discuss the robustness of synthetic data results and suggest important considerations for ensuring good neural network generalization when training with synthetic data. --- -### [Learning from Synthetic Humans](https://arxiv.org/pdf/1701.01370.pdf) +## [Learning from Synthetic Humans](https://arxiv.org/pdf/1701.01370.pdf) -**Usecase** +**Usecase** Human Pose Detection -**Year** +**Year** 2018 -**Abstract** +**Abstract** Estimating human pose, shape, and motion from images and videos are fundamental challenges with many applications. Recent advances in 2D human pose estimation use large amounts of manually-labeled training data for learning convolutional neural networks (CNNs). Such data is time consuming to acquire and difficult to extend. Moreover, manual labeling of 3D pose, depth and motion is impractical. In this work we present SURREAL (Synthetic hUmans foR REAL tasks): a new large-scale dataset with synthetically-generated but realistic images of people rendered from 3D sequences of human motion capture data. We generate more than 6 million frames together with ground truth pose, depth maps, and segmentation masks. We show that CNNs trained on our synthetic dataset allow for accurate human depth estimation and human part segmentation in real RGB images. 
Our results and the new dataset open up new possibilities for advancing person analysis using cheap and large-scale synthetic data. --- -### [Multi Modal Semantic Segmentation using Synthetic Data](https://arxiv.org/pdf/1910.13676.pdf) +## [Multi Modal Semantic Segmentation using Synthetic Data](https://arxiv.org/pdf/1910.13676.pdf) -**Usecase** +**Usecase** Autonomous Vehicles -**Year** +**Year** 2019 -**Abstract** +**Abstract** Semantic understanding of scenes in threedimensional space (3D) is a quintessential part of robotics oriented applications such as autonomous driving as it provides geometric cues such as size, orientation and true distance of separation to objects which are crucial for taking mission critical decisions. As a first step, in this work we investigate the possibility of semantically classifying different parts of a given scene in 3D by learning the underlying geometric context in addition to the texture cues BUT in the absence of labelled real-world datasets. To this end we generate a large number of synthetic scenes, their pixel-wise labels and corresponding 3D representations using CARLA software framework. We then build a deep neural network that learns underlying category specific 3D representation and texture cues from color information of the rendered synthetic scenes. Further on we apply the learned model on different real world datasets to evaluate its performance. Our preliminary investigation of results show that the neural network is able to learn the geometric context from synthetic scenes and effectively apply this knowledge to classify each point of a 3D representation of a scene in real-world. 
--- -### [Semantic Understanding of Foggy Scenes with Purely Synthetic Data](https://arxiv.org/pdf/1910.03997.pdf) +## [Semantic Understanding of Foggy Scenes with Purely Synthetic Data](https://arxiv.org/pdf/1910.03997.pdf) -**Usecase** +**Usecase** Autonomous Vehicles -**Year** +**Year** 2020 -**Abstract** +**Abstract** This work addresses the problem of semantic scene understanding under foggy road conditions. Although marked progress has been made in semantic scene understanding over the recent years, it is mainly concentrated on clear weather outdoor scenes. Extending semantic segmentation methods to adverse weather conditions like fog is crucially important for outdoor applications such as self-driving cars. In this paper, we propose a novel method, which uses purely synthetic data to improve the performance on unseen realworld foggy scenes captured in the streets of Zurich and its surroundings. Our results highlight the potential and power of photo-realistic synthetic images for training and especially fine-tuning deep neural nets. Our contributions are threefold, 1) we created a purely synthetic, high-quality foggy dataset of 25,000 unique outdoor scenes, that we call Foggy Synscapes and plan to release publicly 2) we show that with this data we outperform previous approaches on real-world foggy test data 3) we show that a combination of our data and previously used data can even further improve the performance on real-world foggy data. --- -### [Synthetic Data for Deep Learning](https://arxiv.org/pdf/1909.11512.pdf) +## [Synthetic Data for Deep Learning](https://arxiv.org/pdf/1909.11512.pdf) -**Usecase** +**Usecase** Overview -**Year** +**Year** 2019 -**Abstract** +**Abstract** Synthetic data is an increasingly popular tool for training deep learning models, especially in computer vision but also in other areas. In this work, we attempt to provide a comprehensive survey of the various directions in the development and application of synthetic data. 
First, we discuss synthetic datasets for basic computer vision problems, both low-level (e.g., optical flow estimation) and high-level (e.g., semantic segmentation), synthetic environments and datasets for outdoor and urban scenes (autonomous driving), indoor scenes (indoor navigation), aerial navigation, simulation environments for robotics, applications of synthetic data outside computer vision (in neural programming, bioinformatics, NLP, and more); we also survey the work on improving synthetic data development and alternative ways to produce it such as GANs. Second, we discuss in detail the synthetic-to-real domain adaptation problem that inevitably arises in applications of synthetic data, including syntheticto-real refinement with GAN-based models and domain adaptation at the feature/model level without explicit data transformations. Third, we turn to privacy-related applications of synthetic data and review the work on generating synthetic datasets with differential privacy guarantees. We conclude by highlighting the most promising directions for further work in synthetic data studies. --- \ No newline at end of file From 0bc0db7111cc330c9d11b4004bad5f8dfb3aae47 Mon Sep 17 00:00:00 2001 From: HugoCMU Date: Thu, 3 Jun 2021 12:01:09 -0700 Subject: [PATCH 10/12] climate paper --- docs/overview/literature.md | 97 +++++++++++++++++++++++++++++++++++-- 1 file changed, 94 insertions(+), 3 deletions(-) diff --git a/docs/overview/literature.md b/docs/overview/literature.md index 5ca58127..a5be2741 100644 --- a/docs/overview/literature.md +++ b/docs/overview/literature.md @@ -7,16 +7,17 @@ Many papers have been written about synthetic data over the years. If academic p - Robotics: [1](#ref1), [3](#ref3), [4](#ref4), - Autonomous Vehicles: [5](#ref5), [8](#ref8), [9](#ref9), - Humans: [2](#ref2), [7](#ref7), -- Space: +- Climate: [11](#ref11), - ML Theory: [6](#ref6), - Overview: [10](#ref10), + **... 
year:** - 2017: [2](#ref2), [3](#ref3), [6](#ref6), - 2018: [7](#ref7), -- 2019: [1](#ref1), [4](#ref4), [8](#ref8), -- 2020: [5](#ref5), [9](#ref9), +- 2019: [1](#ref1), [4](#ref4), [8](#ref8), [10](#ref10), +- 2020: [5](#ref5), [9](#ref9), [11](#ref11), - 2021: **TIP** The abstracts are also included with the paper links, so a good way to use this document is to `ctrl-F` the key words relevant to your usecase. @@ -120,4 +121,94 @@ Many papers have been written about synthetic data over the years. If academic p **Abstract** Synthetic data is an increasingly popular tool for training deep learning models, especially in computer vision but also in other areas. In this work, we attempt to provide a comprehensive survey of the various directions in the development and application of synthetic data. First, we discuss synthetic datasets for basic computer vision problems, both low-level (e.g., optical flow estimation) and high-level (e.g., semantic segmentation), synthetic environments and datasets for outdoor and urban scenes (autonomous driving), indoor scenes (indoor navigation), aerial navigation, simulation environments for robotics, applications of synthetic data outside computer vision (in neural programming, bioinformatics, NLP, and more); we also survey the work on improving synthetic data development and alternative ways to produce it such as GANs. Second, we discuss in detail the synthetic-to-real domain adaptation problem that inevitably arises in applications of synthetic data, including syntheticto-real refinement with GAN-based models and domain adaptation at the feature/model level without explicit data transformations. Third, we turn to privacy-related applications of synthetic data and review the work on generating synthetic datasets with differential privacy guarantees. We conclude by highlighting the most promising directions for further work in synthetic data studies. 
+ +--- + +## [Learning color space adaptation from synthetic to real images of cirrus clouds](https://arxiv.org/pdf/1810.10286v2.pdf) + +**Usecase** Cloud Detection + +**Year** 2020 + +**Abstract** Cloud segmentation plays a crucial role in image analysis for climate modeling. Manually labeling the training data for cloud segmentation is time-consuming and error-prone. We explore to train segmentation networks with synthetic data due to the natural acquisition of pixel-level labels. Nevertheless, the domain gap between synthetic and real images significantly degrades the performance of the trained model. We propose a color space adaptation method to bridge the gap, by training a color-sensitive generator and discriminator to adapt synthetic data to real images in color space. Instead of transforming images by general convolutional kernels, we adopt a set of closed-form operations to make color-space adjustments while preserving the labels. We also construct a synthetic-to-real cirrus cloud dataset SynCloud and demonstrate the adaptation efficacy on the semantic segmentation task of cirrus clouds. With our adapted synthetic data for training the semantic segmentation, we achieve an improvement of 6.59% when applied to real images, superior to alternative methods. 
+ +--- + +## [Title](arxivlink) + +**Usecase** + +**Year** + +**Abstract** + +--- + +## [Title](arxivlink) + +**Usecase** + +**Year** + +**Abstract** + +--- + +## [Title](arxivlink) + +**Usecase** + +**Year** + +**Abstract** + +--- + +## [Title](arxivlink) + +**Usecase** + +**Year** + +**Abstract** + +--- + +## [Title](arxivlink) + +**Usecase** + +**Year** + +**Abstract** + +--- + +## [Title](arxivlink) + +**Usecase** + +**Year** + +**Abstract** + +--- + +## [Title](arxivlink) + +**Usecase** + +**Year** + +**Abstract** + +--- + +## [Title](arxivlink) + +**Usecase** + +**Year** + +**Abstract** + --- \ No newline at end of file From 014b3fe4b4345075dc512fe7e47ec200c2433335 Mon Sep 17 00:00:00 2001 From: HugoCMU Date: Thu, 3 Jun 2021 12:06:45 -0700 Subject: [PATCH 11/12] more papers --- docs/overview/literature.md | 22 ++++++++++++---------- 1 file changed, 12 insertions(+), 10 deletions(-) diff --git a/docs/overview/literature.md b/docs/overview/literature.md index a5be2741..874335f4 100644 --- a/docs/overview/literature.md +++ b/docs/overview/literature.md @@ -5,20 +5,22 @@ Many papers have been written about synthetic data over the years. If academic p **... usecase:** - Robotics: [1](#ref1), [3](#ref3), [4](#ref4), -- Autonomous Vehicles: [5](#ref5), [8](#ref8), [9](#ref9), +- Autonomous Vehicles: [5](#ref5), [8](#ref8), [9](#ref9), [13](#ref13), - Humans: [2](#ref2), [7](#ref7), - Climate: [11](#ref11), - ML Theory: [6](#ref6), - Overview: [10](#ref10), +- Frameworks: [12](#ref12), **... year:** +- 2016: [13](#ref13), - 2017: [2](#ref2), [3](#ref3), [6](#ref6), - 2018: [7](#ref7), - 2019: [1](#ref1), [4](#ref4), [8](#ref8), [10](#ref10), - 2020: [5](#ref5), [9](#ref9), [11](#ref11), -- 2021: +- 2021: [12](#ref12), **TIP** The abstracts are also included with the paper links, so a good way to use this document is to `ctrl-F` the key words relevant to your usecase. @@ -133,23 +135,23 @@ Many papers have been written about synthetic data over the years. 
If academic p --- -## [Title](arxivlink) +## [UnrealROX+: An Improved Tool for Acquiring Synthetic Data from Virtual 3D Environments](https://arxiv.org/pdf/2104.11776v1.pdf) -**Usecase** +**Usecase** Framework -**Year** +**Year** 2021 -**Abstract** +**Abstract** Synthetic data generation has become essential in last years for feeding data-driven algorithms, which surpassed traditional techniques performance in almost every computer vision problem. Gathering and labelling the amount of data needed for these data-hungry models in the real world may become unfeasible and error-prone, while synthetic data give us the possibility of generating huge amounts of data with pixel-perfect annotations. However, most synthetic datasets lack from enough realism in their rendered images. In that context UnrealROX generation tool was presented in 2019, allowing to generate highly realistic data, at high resolutions and framerates, with an efficient pipeline based on Unreal Engine, a cutting-edge videogame engine. UnrealROX enabled robotic vision researchers to generate realistic and visually plausible data with full ground truth for a wide variety of problems such as class and instance semantic segmentation, object detection, depth estimation, visual grasping, and navigation. Nevertheless, its workflow was very tied to generate image sequences from a robotic on-board camera, making hard to generate data for other purposes. In this work, we present UnrealROX+, an improved version of UnrealROX where its decoupled and easy-to-use data acquisition system allows to quickly design and generate data in a much more flexible and customizable way. Moreover, it is packaged as an Unreal plug-in, which makes it more comfortable to use with already existing Unreal projects, and it also includes new features such as generating albedo or a Python API for interacting with the virtual environment from Deep Learning frameworks. 
--- -## [Title](arxivlink) +## [The SYNTHIA Dataset: A Large Collection of Synthetic Images for Semantic Segmentation of Urban Scenes](https://openaccess.thecvf.com/content_cvpr_2016/papers/Ros_The_SYNTHIA_Dataset_CVPR_2016_paper.pdf) -**Usecase** +**Usecase** Autonomous Vehicles -**Year** +**Year** 2016 -**Abstract** +**Abstract** Vision-based semantic segmentation in urban scenarios is a key functionality for autonomous driving. Recent revolutionary results of deep convolutional neural networks (DCNNs) foreshadow the advent of reliable classifiers to perform such visual tasks. However, DCNNs require learning of many parameters from raw images; thus, having a sufficient amount of diverse images with class annotations is needed. These annotations are obtained via cumbersome, human labour which is particularly challenging for semantic segmentation since pixel-level annotations are required. In this paper, we propose to use a virtual world to automatically generate realistic synthetic images with pixel-level annotations. Then, we address the question of how useful such data can be for semantic segmentation – in particular, when using a DCNN paradigm. In order to answer this question we have generated a synthetic collection of diverse urban images, named SYNTHIA, with automatically generated class annotations. We use SYNTHIA in combination with publicly available real-world urban images with manually provided annotations. Then, we conduct experiments with DCNNs that show how the inclusion of SYNTHIA in the training stage significantly improves performance on the semantic segmentation task. --- From f0d2c7b3461a1af69ad9c5d85d2c3eb90b85c03c Mon Sep 17 00:00:00 2001 From: Kory Stiger Date: Mon, 7 Jun 2021 15:14:44 -0700 Subject: [PATCH 12/12] removing unused import --- cli/cli.py | 1 - 1 file changed, 1 deletion(-) diff --git a/cli/cli.py b/cli/cli.py index d104e778..6256ff73 100644 --- a/cli/cli.py +++ b/cli/cli.py @@ -1,5 +1,4 @@ import json -import math import click import requests