Commit

Update gh-pages (#2654)
kaiyux authored Jan 3, 2025
1 parent f11aeed commit 3537f75
Showing 74 changed files with 205 additions and 13,467 deletions.
2 changes: 1 addition & 1 deletion _cpp_gen/executor.html
Original file line number Diff line number Diff line change
@@ -8480,7 +8480,7 @@ <h2>version.h<a class="headerlink" href="#version-h" title="Link to this heading
<hr/>

<div role="contentinfo">
<jinja2.runtime.BlockReference object at 0x7da74e6b5be0>
<jinja2.runtime.BlockReference object at 0x74a60e3c2030>

<div class="footer">
<p>
2 changes: 1 addition & 1 deletion _cpp_gen/runtime.html
@@ -12360,7 +12360,7 @@ <h2>worldConfig.h<a class="headerlink" href="#worldconfig-h" title="Link to this
<hr/>

<div role="contentinfo">
<jinja2.runtime.BlockReference object at 0x7da74ae393a0>
<jinja2.runtime.BlockReference object at 0x74a60d0c5160>

<div class="footer">
<p>
67 changes: 2 additions & 65 deletions _modules/index.html
@@ -161,70 +161,7 @@
<h1>All modules for which code is available</h1>
<ul><li><a href="tensorrt_llm/bindings.html">tensorrt_llm.bindings</a></li>
<ul><li><a href="tensorrt_llm/bindings/executor.html">tensorrt_llm.bindings.executor</a></li>
</ul><li><a href="tensorrt_llm/builder.html">tensorrt_llm.builder</a></li>
<li><a href="tensorrt_llm/executor.html">tensorrt_llm.executor</a></li>
<li><a href="tensorrt_llm/functional.html">tensorrt_llm.functional</a></li>
<li><a href="tensorrt_llm/layers/activation.html">tensorrt_llm.layers.activation</a></li>
<li><a href="tensorrt_llm/layers/attention.html">tensorrt_llm.layers.attention</a></li>
<li><a href="tensorrt_llm/layers/cast.html">tensorrt_llm.layers.cast</a></li>
<li><a href="tensorrt_llm/layers/conv.html">tensorrt_llm.layers.conv</a></li>
<li><a href="tensorrt_llm/layers/embedding.html">tensorrt_llm.layers.embedding</a></li>
<li><a href="tensorrt_llm/layers/linear.html">tensorrt_llm.layers.linear</a></li>
<li><a href="tensorrt_llm/layers/mlp.html">tensorrt_llm.layers.mlp</a></li>
<li><a href="tensorrt_llm/layers/normalization.html">tensorrt_llm.layers.normalization</a></li>
<li><a href="tensorrt_llm/layers/pooling.html">tensorrt_llm.layers.pooling</a></li>
<li><a href="tensorrt_llm/llmapi/build_cache.html">tensorrt_llm.llmapi.build_cache</a></li>
<li><a href="tensorrt_llm/llmapi/llm.html">tensorrt_llm.llmapi.llm</a></li>
<li><a href="tensorrt_llm/llmapi/llm_utils.html">tensorrt_llm.llmapi.llm_utils</a></li>
<li><a href="tensorrt_llm/models/baichuan/model.html">tensorrt_llm.models.baichuan.model</a></li>
<li><a href="tensorrt_llm/models/bert/model.html">tensorrt_llm.models.bert.model</a></li>
<li><a href="tensorrt_llm/models/bloom/model.html">tensorrt_llm.models.bloom.model</a></li>
<li><a href="tensorrt_llm/models/chatglm/config.html">tensorrt_llm.models.chatglm.config</a></li>
<li><a href="tensorrt_llm/models/chatglm/model.html">tensorrt_llm.models.chatglm.model</a></li>
<li><a href="tensorrt_llm/models/cogvlm/config.html">tensorrt_llm.models.cogvlm.config</a></li>
<li><a href="tensorrt_llm/models/cogvlm/model.html">tensorrt_llm.models.cogvlm.model</a></li>
<li><a href="tensorrt_llm/models/commandr/model.html">tensorrt_llm.models.commandr.model</a></li>
<li><a href="tensorrt_llm/models/dbrx/config.html">tensorrt_llm.models.dbrx.config</a></li>
<li><a href="tensorrt_llm/models/dbrx/model.html">tensorrt_llm.models.dbrx.model</a></li>
<li><a href="tensorrt_llm/models/deepseek_v1/model.html">tensorrt_llm.models.deepseek_v1.model</a></li>
<li><a href="tensorrt_llm/models/deepseek_v2/model.html">tensorrt_llm.models.deepseek_v2.model</a></li>
<li><a href="tensorrt_llm/models/dit/model.html">tensorrt_llm.models.dit.model</a></li>
<li><a href="tensorrt_llm/models/eagle/model.html">tensorrt_llm.models.eagle.model</a></li>
<li><a href="tensorrt_llm/models/enc_dec/model.html">tensorrt_llm.models.enc_dec.model</a></li>
<li><a href="tensorrt_llm/models/falcon/config.html">tensorrt_llm.models.falcon.config</a></li>
<li><a href="tensorrt_llm/models/falcon/model.html">tensorrt_llm.models.falcon.model</a></li>
<li><a href="tensorrt_llm/models/gemma/config.html">tensorrt_llm.models.gemma.config</a></li>
<li><a href="tensorrt_llm/models/gemma/model.html">tensorrt_llm.models.gemma.model</a></li>
<li><a href="tensorrt_llm/models/gpt/config.html">tensorrt_llm.models.gpt.config</a></li>
<li><a href="tensorrt_llm/models/gpt/model.html">tensorrt_llm.models.gpt.model</a></li>
<li><a href="tensorrt_llm/models/gptj/config.html">tensorrt_llm.models.gptj.config</a></li>
<li><a href="tensorrt_llm/models/gptj/model.html">tensorrt_llm.models.gptj.model</a></li>
<li><a href="tensorrt_llm/models/gptneox/model.html">tensorrt_llm.models.gptneox.model</a></li>
<li><a href="tensorrt_llm/models/llama/config.html">tensorrt_llm.models.llama.config</a></li>
<li><a href="tensorrt_llm/models/llama/model.html">tensorrt_llm.models.llama.model</a></li>
<li><a href="tensorrt_llm/models/mamba/model.html">tensorrt_llm.models.mamba.model</a></li>
<li><a href="tensorrt_llm/models/medusa/config.html">tensorrt_llm.models.medusa.config</a></li>
<li><a href="tensorrt_llm/models/medusa/model.html">tensorrt_llm.models.medusa.model</a></li>
<li><a href="tensorrt_llm/models/mllama/model.html">tensorrt_llm.models.mllama.model</a></li>
<li><a href="tensorrt_llm/models/modeling_utils.html">tensorrt_llm.models.modeling_utils</a></li>
<li><a href="tensorrt_llm/models/mpt/model.html">tensorrt_llm.models.mpt.model</a></li>
<li><a href="tensorrt_llm/models/opt/model.html">tensorrt_llm.models.opt.model</a></li>
<li><a href="tensorrt_llm/models/phi/model.html">tensorrt_llm.models.phi.model</a></li>
<li><a href="tensorrt_llm/models/phi3/model.html">tensorrt_llm.models.phi3.model</a></li>
<li><a href="tensorrt_llm/models/recurrentgemma/model.html">tensorrt_llm.models.recurrentgemma.model</a></li>
<li><a href="tensorrt_llm/models/redrafter/model.html">tensorrt_llm.models.redrafter.model</a></li>
<li><a href="tensorrt_llm/plugin/plugin.html">tensorrt_llm.plugin.plugin</a></li>
<li><a href="tensorrt_llm/quantization/mode.html">tensorrt_llm.quantization.mode</a></li>
<li><a href="tensorrt_llm/quantization/quantize_by_modelopt.html">tensorrt_llm.quantization.quantize_by_modelopt</a></li>
<li><a href="tensorrt_llm/runtime/enc_dec_model_runner.html">tensorrt_llm.runtime.enc_dec_model_runner</a></li>
<li><a href="tensorrt_llm/runtime/generation.html">tensorrt_llm.runtime.generation</a></li>
<li><a href="tensorrt_llm/runtime/kv_cache_manager.html">tensorrt_llm.runtime.kv_cache_manager</a></li>
<li><a href="tensorrt_llm/runtime/model_runner.html">tensorrt_llm.runtime.model_runner</a></li>
<li><a href="tensorrt_llm/runtime/model_runner_cpp.html">tensorrt_llm.runtime.model_runner_cpp</a></li>
<li><a href="tensorrt_llm/runtime/multimodal_model_runner.html">tensorrt_llm.runtime.multimodal_model_runner</a></li>
<li><a href="tensorrt_llm/runtime/session.html">tensorrt_llm.runtime.session</a></li>
<li><a href="tensorrt_llm/sampling_params.html">tensorrt_llm.sampling_params</a></li>
</ul>
</ul></ul>

</div>
</div>
@@ -233,7 +170,7 @@ <h1>All modules for which code is available</h1>
<hr/>

<div role="contentinfo">
<jinja2.runtime.BlockReference object at 0x7da791a96e40>
<jinja2.runtime.BlockReference object at 0x74a6148aee10>

<div class="footer">
<p>
40 changes: 23 additions & 17 deletions _sources/installation/linux.md.txt
@@ -13,23 +13,29 @@
```python3
from tensorrt_llm import LLM, SamplingParams

prompts = [
    "Hello, my name is",
    "The president of the United States is",
    "The capital of France is",
    "The future of AI is",
]
sampling_params = SamplingParams(temperature=0.8, top_p=0.95)

llm = LLM(model="TinyLlama/TinyLlama-1.1B-Chat-v1.0")

outputs = llm.generate(prompts, sampling_params)

# Print the outputs.
for output in outputs:
    prompt = output.prompt
    generated_text = output.outputs[0].text
    print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
def main():

    prompts = [
        "Hello, my name is",
        "The president of the United States is",
        "The capital of France is",
        "The future of AI is",
    ]
    sampling_params = SamplingParams(temperature=0.8, top_p=0.95)

    llm = LLM(model="TinyLlama/TinyLlama-1.1B-Chat-v1.0")

    outputs = llm.generate(prompts, sampling_params)

    # Print the outputs.
    for output in outputs:
        prompt = output.prompt
        generated_text = output.outputs[0].text
        print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")

# The entry point of the program needs to be protected for spawning processes.
if __name__ == '__main__':
    main()
```
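The entry-point guard in the snippet above matters because the LLM API may spawn worker processes, and under the spawn start method each child re-imports the main module. The same pattern can be sketched with only the standard library; the `square()` worker below is a hypothetical stand-in, not part of TensorRT-LLM:

```python
import multiprocessing as mp

def square(x):
    # Hypothetical worker; stands in for work done in a spawned process.
    return x * x

def main():
    # "spawn" starts fresh interpreters that re-import this module.
    ctx = mp.get_context("spawn")
    with ctx.Pool(2) as pool:
        print(pool.map(square, [1, 2, 3, 4]))  # -> [1, 4, 9, 16]

# Without this guard, each spawned child would re-run the module-level
# code and try to create its own pool, recursing indefinitely.
if __name__ == "__main__":
    main()
```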

**Known limitations**
2 changes: 1 addition & 1 deletion advanced/disaggregated-service.html
@@ -270,7 +270,7 @@ <h3>Debugging FAQs<a class="headerlink" href="#debugging-faqs" title="Link to th
<hr/>

<div role="contentinfo">
<jinja2.runtime.BlockReference object at 0x7da747e8d0a0>
<jinja2.runtime.BlockReference object at 0x74a60f079a00>

<div class="footer">
<p>
12 changes: 6 additions & 6 deletions advanced/executor.html
@@ -182,7 +182,7 @@
<p>TensorRT-LLM includes a high-level C++ API called the Executor API which allows you to execute requests
asynchronously, with in-flight batching, and without the need to define callbacks.</p>
<p>A software component (referred to as “the client” in the text that follows) can interact
with the executor using the API defined in the <a class="reference external" href="https://github.com/NVIDIA/TensorRT-LLM/tree/user/kaiyu/update_disagg_doc/cpp/include/tensorrt_llm/executor/executor.h"><code class="docutils literal notranslate"><span class="pre">executor.h</span></code></a> file.
with the executor using the API defined in the <a class="reference external" href="https://github.com/NVIDIA/TensorRT-LLM/tree/rel/cpp/include/tensorrt_llm/executor/executor.h"><code class="docutils literal notranslate"><span class="pre">executor.h</span></code></a> file.
For details about the API, refer to the <span class="xref std std-ref">_cpp_gen/executor.rst</span>.</p>
<p>The following sections provide an overview of the main classes defined in the Executor API.</p>
<section id="api">
@@ -250,7 +250,7 @@ <h3>Structured output with guided decoding<a class="headerlink" href="#structure
<span class="n">stop_token_ids</span> <span class="o">=</span> <span class="p">[</span><span class="n">tokenizer</span><span class="o">.</span><span class="n">eos_token_id</span><span class="p">]</span>
</pre></div>
</div>
<p>Refer to <a class="reference external" href="https://github.com/NVIDIA/TensorRT-LLM/tree/user/kaiyu/update_disagg_doc/tensorrt_llm/llmapi/tokenizer.py"><code class="docutils literal notranslate"><span class="pre">tensorrt_llm/llmapi/tokenizer.py</span></code></a> for more details. You may dump these materials to disk, and reload them to C++ runtime for use.</p>
<p>Refer to <a class="reference external" href="https://github.com/NVIDIA/TensorRT-LLM/tree/rel/tensorrt_llm/llmapi/tokenizer.py"><code class="docutils literal notranslate"><span class="pre">tensorrt_llm/llmapi/tokenizer.py</span></code></a> for more details. You may dump these materials to disk, and reload them to C++ runtime for use.</p>
<p>Each request can be optionally specified with a <code class="docutils literal notranslate"><span class="pre">GuidedDecodingParams</span></code>, which defines the desired structured format. Currently, it supports four types:</p>
<ul class="simple">
<li><p><code class="docutils literal notranslate"><span class="pre">GuidedDecodingParams::GuideType::kJSON</span></code>: The generated text is amenable to JSON format;</p></li>
@@ -263,12 +263,12 @@ <h3>Structured output with guided decoding<a class="headerlink" href="#structure
</section>
<section id="c-executor-api-example">
<h2>C++ Executor API Example<a class="headerlink" href="#c-executor-api-example" title="Link to this heading"></a></h2>
<p>Two C++ examples are provided that shows how to use the Executor API and can be found in the <a class="reference external" href="https://github.com/NVIDIA/TensorRT-LLM/tree/user/kaiyu/update_disagg_doc/examples/cpp/executor/"><code class="docutils literal notranslate"><span class="pre">examples/cpp/executor</span></code></a> folder.</p>
<p>Two C++ examples are provided that shows how to use the Executor API and can be found in the <a class="reference external" href="https://github.com/NVIDIA/TensorRT-LLM/tree/rel/examples/cpp/executor/"><code class="docutils literal notranslate"><span class="pre">examples/cpp/executor</span></code></a> folder.</p>
</section>
<section id="python-bindings-for-the-executor-api">
<h2>Python Bindings for the Executor API<a class="headerlink" href="#python-bindings-for-the-executor-api" title="Link to this heading"></a></h2>
<p>Python bindings for the Executor API are also available to use the Executor API from Python. The Python bindings are defined in <a class="reference external" href="https://github.com/NVIDIA/TensorRT-LLM/tree/user/kaiyu/update_disagg_doc/cpp/tensorrt_llm/pybind/executor/bindings.cpp">bindings.cpp</a> and once built, are available in package <code class="docutils literal notranslate"><span class="pre">tensorrt_llm.bindings.executor</span></code>. Running <code class="docutils literal notranslate"><span class="pre">'help('tensorrt_llm.bindings.executor')</span></code> in a Python interpreter will provide an overview of the classes available.</p>
<p>In addition, three Python examples are provided to demonstrate how to use the Python bindings to the Executor API for single and multi-GPU models. They can be found in <a class="reference external" href="https://github.com/NVIDIA/TensorRT-LLM/tree/user/kaiyu/update_disagg_doc/examples/bindings"><code class="docutils literal notranslate"><span class="pre">examples/bindings</span></code></a>.</p>
<p>Python bindings for the Executor API are also available to use the Executor API from Python. The Python bindings are defined in <a class="reference external" href="https://github.com/NVIDIA/TensorRT-LLM/tree/rel/cpp/tensorrt_llm/pybind/executor/bindings.cpp">bindings.cpp</a> and once built, are available in package <code class="docutils literal notranslate"><span class="pre">tensorrt_llm.bindings.executor</span></code>. Running <code class="docutils literal notranslate"><span class="pre">'help('tensorrt_llm.bindings.executor')</span></code> in a Python interpreter will provide an overview of the classes available.</p>
<p>In addition, three Python examples are provided to demonstrate how to use the Python bindings to the Executor API for single and multi-GPU models. They can be found in <a class="reference external" href="https://github.com/NVIDIA/TensorRT-LLM/tree/rel/examples/bindings"><code class="docutils literal notranslate"><span class="pre">examples/bindings</span></code></a>.</p>
</section>
<section id="in-flight-batching-with-the-triton-inference-server">
<h2>In-flight Batching with the Triton Inference Server<a class="headerlink" href="#in-flight-batching-with-the-triton-inference-server" title="Link to this heading"></a></h2>
Expand All @@ -290,7 +290,7 @@ <h2>In-flight Batching with the Triton Inference Server<a class="headerlink" hre
<hr/>

<div role="contentinfo">
<jinja2.runtime.BlockReference object at 0x7da747e36000>
<jinja2.runtime.BlockReference object at 0x74a61b2777a0>

<div class="footer">
<p>
2 changes: 1 addition & 1 deletion advanced/expert-parallelism.html
@@ -202,7 +202,7 @@ <h2>How to Enable<a class="headerlink" href="#how-to-enable" title="Link to this
<hr/>

<div role="contentinfo">
<jinja2.runtime.BlockReference object at 0x7da747e9ea80>
<jinja2.runtime.BlockReference object at 0x74a61acb05f0>

<div class="footer">
<p>
6 changes: 3 additions & 3 deletions advanced/gpt-attention.html
@@ -373,9 +373,9 @@ <h3>Paged KV Cache<a class="headerlink" href="#paged-kv-cache" title="Link to th
the different requests by a cache manager during processing. That cache manager
keeps track of the sequences, allocate new blocks from a pool and recycle those
blocks when required. See the simplified implementation of
<a class="reference external" href="https://github.com/NVIDIA/TensorRT-LLM/tree/user/kaiyu/update_disagg_doc/tensorrt_llm/runtime/kv_cache_manager.py"><code class="docutils literal notranslate"><span class="pre">tensorrt_llm.runtime.KVCacheManager</span></code></a>.
<a class="reference external" href="https://github.com/NVIDIA/TensorRT-LLM/tree/rel/tensorrt_llm/runtime/kv_cache_manager.py"><code class="docutils literal notranslate"><span class="pre">tensorrt_llm.runtime.KVCacheManager</span></code></a>.
A more efficient C++ implementation is included in the
<a class="reference external" href="https://github.com/NVIDIA/TensorRT-LLM/tree/user/kaiyu/update_disagg_doc/cpp/include/tensorrt_llm/batch_manager">Batch Manager</a>.</p>
<a class="reference external" href="https://github.com/NVIDIA/TensorRT-LLM/tree/rel/cpp/include/tensorrt_llm/batch_manager">Batch Manager</a>.</p>
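The block-pool behavior described above (allocate on demand, recycle on completion) can be sketched in a few lines. `ToyBlockPool` and its methods are hypothetical names, not the actual `KVCacheManager` interface:

```python
class ToyBlockPool:
    """Fixed-size KV cache blocks live in a shared pool; sequences take
    blocks as they grow and return them when they finish."""

    def __init__(self, num_blocks, block_size):
        self.block_size = block_size
        self.free_blocks = list(range(num_blocks))
        self.seq_blocks = {}  # sequence id -> list of block ids

    def append_token(self, seq_id, num_tokens_so_far):
        # Allocate a new block whenever the sequence crosses a block boundary.
        if num_tokens_so_far % self.block_size == 0:
            if not self.free_blocks:
                raise RuntimeError("KV cache pool exhausted")
            self.seq_blocks.setdefault(seq_id, []).append(self.free_blocks.pop())

    def free_sequence(self, seq_id):
        # Recycle the sequence's blocks back into the pool.
        self.free_blocks.extend(self.seq_blocks.pop(seq_id, []))

pool = ToyBlockPool(num_blocks=4, block_size=2)
for t in range(3):               # a 3-token sequence spans 2 blocks
    pool.append_token("req0", t)
print(len(pool.free_blocks))     # -> 2
pool.free_sequence("req0")
print(len(pool.free_blocks))     # -> 4
```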
</section>
</section>
<section id="int8-fp8-kv-caches">
@@ -522,7 +522,7 @@ <h3>Relative Attention Bias (RAB)<a class="headerlink" href="#relative-attention
<hr/>

<div role="contentinfo">
<jinja2.runtime.BlockReference object at 0x7da747d4d790>
<jinja2.runtime.BlockReference object at 0x74a60fe348c0>

<div class="footer">
<p>
2 changes: 1 addition & 1 deletion advanced/gpt-runtime.html
@@ -575,7 +575,7 @@ <h2>Know Issues and Future Changes<a class="headerlink" href="#know-issues-and-f
<hr/>

<div role="contentinfo">
<jinja2.runtime.BlockReference object at 0x7da747ed2c30>
<jinja2.runtime.BlockReference object at 0x74a61120b4a0>

<div class="footer">
<p>
2 changes: 1 addition & 1 deletion advanced/graph-rewriting.html
@@ -381,7 +381,7 @@ <h2>Classical Workflow<a class="headerlink" href="#classical-workflow" title="Li
<hr/>

<div role="contentinfo">
<jinja2.runtime.BlockReference object at 0x7da747c66870>
<jinja2.runtime.BlockReference object at 0x74a61ade8fe0>

<div class="footer">
<p>
2 changes: 1 addition & 1 deletion advanced/inference-request.html
@@ -397,7 +397,7 @@ <h1>Responses<a class="headerlink" href="#responses" title="Link to this heading
<hr/>

<div role="contentinfo">
<jinja2.runtime.BlockReference object at 0x7da747c6e0f0>
<jinja2.runtime.BlockReference object at 0x74a61b98bec0>

<div class="footer">
<p>
2 changes: 1 addition & 1 deletion advanced/kv-cache-reuse.html
@@ -285,7 +285,7 @@ <h2>Offloading to host memory<a class="headerlink" href="#offloading-to-host-mem
<hr/>

<div role="contentinfo">
<jinja2.runtime.BlockReference object at 0x7da747d4c5f0>
<jinja2.runtime.BlockReference object at 0x74a61b117560>

<div class="footer">
<p>
2 changes: 1 addition & 1 deletion advanced/lora.html
@@ -359,7 +359,7 @@ <h3>LoRA with tensor parallel<a class="headerlink" href="#lora-with-tensor-paral
<hr/>

<div role="contentinfo">
<jinja2.runtime.BlockReference object at 0x7da747c21dc0>
<jinja2.runtime.BlockReference object at 0x74a61b98ae70>

<div class="footer">
<p>
2 changes: 1 addition & 1 deletion advanced/speculative-decoding.html
@@ -619,7 +619,7 @@ <h2>Lookahead Decoding<a class="headerlink" href="#lookahead-decoding" title="Li
<hr/>

<div role="contentinfo">
<jinja2.runtime.BlockReference object at 0x7da747c04d40>
<jinja2.runtime.BlockReference object at 0x74a61ab41b80>

<div class="footer">
<p>
2 changes: 1 addition & 1 deletion advanced/weight-streaming.html
@@ -233,7 +233,7 @@ <h2>API Changes<a class="headerlink" href="#api-changes" title="Link to this hea
<hr/>

<div role="contentinfo">
<jinja2.runtime.BlockReference object at 0x7da747ba1f70>
<jinja2.runtime.BlockReference object at 0x74a61ab86ab0>

<div class="footer">
<p>
2 changes: 1 addition & 1 deletion architecture/add-model.html
@@ -272,7 +272,7 @@ <h2>Reference<a class="headerlink" href="#reference" title="Link to this heading
<hr/>

<div role="contentinfo">
<jinja2.runtime.BlockReference object at 0x7da747b2fc80>
<jinja2.runtime.BlockReference object at 0x74a61b159b20>

<div class="footer">
<p>
2 changes: 1 addition & 1 deletion architecture/checkpoint.html
@@ -537,7 +537,7 @@ <h2>Make Evaluation<a class="headerlink" href="#make-evaluation" title="Link to
<hr/>

<div role="contentinfo">
<jinja2.runtime.BlockReference object at 0x7da747bb12e0>
<jinja2.runtime.BlockReference object at 0x74a61ac6e900>

<div class="footer">
<p>
(Remaining changed files in this commit are not shown.)