
Commit

Feature/gradio demo (#190)
* [gradio] added demo app

* polish
FrankLeeeee authored Mar 22, 2024
1 parent 3875053 commit 0050800
Showing 9 changed files with 281 additions and 56 deletions.
1 change: 1 addition & 0 deletions .gitignore
@@ -175,3 +175,4 @@ pretrained_models/
 
 # Secret files
 hostfile
+gradio_cached_examples/
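
Gradio writes cached example outputs to `gradio_cached_examples/`, which is why the directory joins the ignore list. A minimal sketch of how that directory comes about (the `echo` handler and example prompt are hypothetical, not taken from this commit):

```python
# Hedged sketch: with cache_examples=True, Gradio pre-computes the example
# outputs at startup and stores them under ./gradio_cached_examples/.
import gradio as gr

def echo(prompt: str) -> str:
    return prompt  # hypothetical stand-in for a real generation handler

demo = gr.Interface(
    fn=echo,
    inputs=gr.Textbox(label="Prompt"),
    outputs=gr.Textbox(label="Output"),
    examples=[["a serene waterfall in a forest"]],
    cache_examples=True,  # populates gradio_cached_examples/ at startup
)
```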
10 changes: 9 additions & 1 deletion README.md
@@ -134,7 +134,15 @@ Our model's weight is partially initialized from [PixArt-α](https://github.com/
 
 ## Inference
 
-To run inference with our provided weights, first download [T5](https://huggingface.co/DeepFloyd/t5-v1_1-xxl/tree/main) weights into `pretrained_models/t5_ckpts/t5-v1_1-xxl`. Then download the model weights from [huggingface](https://huggingface.co/hpcai-tech/Open-Sora/tree/main). Run the following commands to generate samples. To change sampling prompts, modify the txt file passed to `--prompt-path`. See [here](docs/structure.md#inference-config-demos) to customize the configuration.
+We provide a Gradio application in this repository. Use the following command to start an interactive web application and experience video generation with Open-Sora.
+
+```bash
+python scripts/demo.py
+```
+
+This launches a Gradio application on your localhost.
+
+We also provide an offline inference script. To run inference with our provided weights, first download [T5](https://huggingface.co/DeepFloyd/t5-v1_1-xxl/tree/main) weights into `pretrained_models/t5_ckpts/t5-v1_1-xxl`. Then download the model weights from [huggingface](https://huggingface.co/hpcai-tech/Open-Sora/tree/main). Run the following commands to generate samples. To change the sampling prompts, modify the txt file passed to `--prompt-path`. See [here](docs/structure.md#inference-config-demos) to customize the configuration.
 
 ```bash
 # Sample 16x256x256 (5s/sample, 100 time steps, 22 GB memory)
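
For orientation, here is a minimal sketch of a Gradio text-to-video app in the spirit of `scripts/demo.py` (whose full diff is not rendered on this page); `generate_video` is a hypothetical placeholder for the Open-Sora sampling pipeline, not the actual implementation:

```python
# Hedged sketch of a text-to-video Gradio app; only the Gradio wiring is shown.
import gradio as gr

def generate_video(prompt: str) -> str:
    # The real demo would run Open-Sora sampling here and return the path of
    # the saved video file (cf. the save_sample change in this commit).
    raise NotImplementedError("replace with the Open-Sora inference call")

demo = gr.Interface(
    fn=generate_video,
    inputs=gr.Textbox(label="Prompt"),
    outputs=gr.Video(label="Generated video"),
    title="Open-Sora",
)

if __name__ == "__main__":
    demo.launch()  # serves the app on http://localhost:7860 by default
```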
2 changes: 1 addition & 1 deletion configs/opensora/inference/16x256x256.py
@@ -18,7 +18,7 @@
 )
 text_encoder = dict(
     type="t5",
-    from_pretrained="./pretrained_models/t5_ckpts",
+    from_pretrained="DeepFloyd/t5-v1_1-xxl",
     model_max_length=120,
 )
 scheduler = dict(
2 changes: 1 addition & 1 deletion configs/opensora/inference/16x512x512.py
@@ -18,7 +18,7 @@
 )
 text_encoder = dict(
     type="t5",
-    from_pretrained="./pretrained_models/t5_ckpts",
+    from_pretrained="DeepFloyd/t5-v1_1-xxl",
     model_max_length=120,
 )
 scheduler = dict(
2 changes: 1 addition & 1 deletion configs/opensora/inference/64x512x512.py
@@ -18,7 +18,7 @@
 )
 text_encoder = dict(
     type="t5",
-    from_pretrained="./pretrained_models/t5_ckpts",
+    from_pretrained="DeepFloyd/t5-v1_1-xxl",
     model_max_length=120,
 )
 scheduler = dict(
2 changes: 1 addition & 1 deletion configs/opensora/train/16x256x256.py
@@ -29,7 +29,7 @@
 )
 text_encoder = dict(
     type="t5",
-    from_pretrained="./pretrained_models/t5_ckpts",
+    from_pretrained="DeepFloyd/t5-v1_1-xxl",
     model_max_length=120,
     shardformer=True,
 )
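
All four configs above make the same one-line swap: the text encoder now points at the Hugging Face Hub id instead of a local checkpoint directory, so the T5 files are downloaded and cached automatically. A quick way to see the effect, assuming `transformers` is installed and the Hub is reachable:

```python
# With the new config value, the tokenizer and encoder resolve straight from
# the Hub rather than ./pretrained_models/t5_ckpts; files land in the HF cache.
from transformers import AutoTokenizer, T5EncoderModel

tokenizer = AutoTokenizer.from_pretrained("DeepFloyd/t5-v1_1-xxl")
model = T5EncoderModel.from_pretrained("DeepFloyd/t5-v1_1-xxl", low_cpu_mem_usage=True)
```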
1 change: 1 addition & 0 deletions opensora/datasets/utils.py
@@ -33,6 +33,7 @@ def save_sample(x, fps=8, save_path=None, normalize=True, value_range=(-1, 1)):
     x = x.mul(255).add_(0.5).clamp_(0, 255).permute(1, 2, 3, 0).to("cpu", torch.uint8)
     write_video(save_path, x, fps=fps, video_codec="h264")
     print(f"Saved to {save_path}")
+    return save_path
 
 
 class StatefulDistributedSampler(DistributedSampler):
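
Returning the path from `save_sample` lets a caller, such as a Gradio handler, pass the written file straight to a video output. A hedged usage sketch (the random tensor stands in for a real sampled clip, and the import path is assumed from this file's location):

```python
# Hypothetical callback: the path returned by save_sample is what a gr.Video
# output expects, so the saved clip can be displayed without extra glue code.
import os
import torch
from opensora.datasets.utils import save_sample

def on_generate(prompt: str) -> str:
    os.makedirs("outputs", exist_ok=True)
    video = torch.rand(3, 16, 256, 256) * 2 - 1  # stand-in clip in [-1, 1], shape (C, T, H, W)
    path = save_sample(video, fps=8, save_path="outputs/sample.mp4")
    return path  # final video location, as reported by save_sample
```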
63 changes: 12 additions & 51 deletions opensora/models/text_encoder/t5.py
@@ -37,17 +37,16 @@
 
 
 class T5Embedder:
-    available_models = ["t5-v1_1-xxl"]
+    available_models = ["DeepFloyd/t5-v1_1-xxl"]
     bad_punct_regex = re.compile(
         r"[" + "#®•©™&@·º½¾¿¡§~" + "\)" + "\(" + "\]" + "\[" + "\}" + "\{" + "\|" + "\\" + "\/" + "\*" + r"]{1,}"
     )  # noqa
 
     def __init__(
         self,
         device,
-        dir_or_name="t5-v1_1-xxl",
+        from_pretrained=None,
         *,
-        local_cache=False,
         cache_dir=None,
         hf_token=None,
         use_text_preprocessing=True,
@@ -58,8 +57,11 @@ def __init__(
     ):
         self.device = torch.device(device)
         self.torch_dtype = torch_dtype or torch.bfloat16
+        self.cache_dir = cache_dir
+
         if t5_model_kwargs is None:
             t5_model_kwargs = {"low_cpu_mem_usage": True, "torch_dtype": self.torch_dtype}
+
             if use_offload_folder is not None:
                 t5_model_kwargs["offload_folder"] = use_offload_folder
                 t5_model_kwargs["device_map"] = {
@@ -97,51 +99,10 @@ def __init__(
 
         self.use_text_preprocessing = use_text_preprocessing
         self.hf_token = hf_token
-        self.cache_dir = cache_dir or os.path.expanduser("~/.cache/IF_")
-        self.dir_or_name = dir_or_name
-        tokenizer_path, path = dir_or_name, dir_or_name
-        if local_cache:
-            cache_dir = os.path.join(self.cache_dir, dir_or_name)
-            tokenizer_path, path = cache_dir, cache_dir
-        elif dir_or_name in self.available_models:
-            cache_dir = os.path.join(self.cache_dir, dir_or_name)
-            for filename in [
-                "config.json",
-                "special_tokens_map.json",
-                "spiece.model",
-                "tokenizer_config.json",
-                "pytorch_model.bin.index.json",
-                "pytorch_model-00001-of-00002.bin",
-                "pytorch_model-00002-of-00002.bin",
-            ]:
-                hf_hub_download(
-                    repo_id=f"DeepFloyd/{dir_or_name}",
-                    filename=filename,
-                    cache_dir=cache_dir,
-                    force_filename=filename,
-                    token=self.hf_token,
-                )
-            tokenizer_path, path = cache_dir, cache_dir
-        else:
-            cache_dir = os.path.join(self.cache_dir, "t5-v1_1-xxl")
-            for filename in [
-                "config.json",
-                "special_tokens_map.json",
-                "spiece.model",
-                "tokenizer_config.json",
-            ]:
-                hf_hub_download(
-                    repo_id="DeepFloyd/t5-v1_1-xxl",
-                    filename=filename,
-                    cache_dir=cache_dir,
-                    force_filename=filename,
-                    token=self.hf_token,
-                )
-            tokenizer_path = cache_dir
-
-        print(tokenizer_path)
-        self.tokenizer = AutoTokenizer.from_pretrained(tokenizer_path)
-        self.model = T5EncoderModel.from_pretrained(path, **t5_model_kwargs).eval()
+
+        assert from_pretrained in self.available_models
+        self.tokenizer = AutoTokenizer.from_pretrained(from_pretrained, cache_dir=cache_dir)
+        self.model = T5EncoderModel.from_pretrained(from_pretrained, cache_dir=cache_dir, **t5_model_kwargs).eval()
         self.model_max_length = model_max_length
 
     def get_text_embeddings(self, texts):
@@ -304,16 +265,16 @@ def __init__(
         model_max_length=120,
         device="cuda",
         dtype=torch.float,
-        local_cache=True,
+        cache_dir=None,
         shardformer=False,
     ):
         assert from_pretrained is not None, "Please specify the path to the T5 model"
 
         self.t5 = T5Embedder(
             device=device,
             torch_dtype=dtype,
-            local_cache=local_cache,
-            cache_dir=from_pretrained,
+            from_pretrained=from_pretrained,
+            cache_dir=cache_dir,
             model_max_length=model_max_length,
         )
         self.t5.model.to(dtype=dtype)
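
With the refactor above, the embedder is built from a Hub id instead of a local directory name: `dir_or_name` and `local_cache` are gone, and `from_pretrained` plus `cache_dir` take their place. A sketch of the new call path, using only names visible in this diff (the module import path is assumed from the file location):

```python
# Hedged construction example for the refactored T5Embedder.
import torch
from opensora.models.text_encoder.t5 import T5Embedder

embedder = T5Embedder(
    device="cuda" if torch.cuda.is_available() else "cpu",
    from_pretrained="DeepFloyd/t5-v1_1-xxl",  # must appear in available_models
    cache_dir=None,  # None falls through to the default Hugging Face cache
    model_max_length=120,
)
embeddings = embedder.get_text_embeddings(["a drone shot of coastal cliffs"])
```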