From 4389288216c972bb02465b96ff006db27bd6918f Mon Sep 17 00:00:00 2001
From: Appu Shaji
Date: Thu, 16 Nov 2023 16:08:01 +0100
Subject: [PATCH] adding selectors

---
 index.html | 2631 +++++++++++++++++++++++++++-------------------------
 1 file changed, 1384 insertions(+), 1247 deletions(-)

diff --git a/index.html b/index.html
index 50f612f..daff35e 100644
--- a/index.html
+++ b/index.html
HQQ quantization
+
+

Half-Quadratic Quantization of Large Machine Learning Models

+ +
+
+

Hicham Badri, Appu Shaji

+

Mobius Labs GmbH

+
+

Large Language Models (LLMs) have revolutionized various subfields of machine learning such as natural language processing, speech recognition and computer vision, enabling machines to understand and generate outputs with unprecedented accuracy and fluency. However, one of the most critical challenges in deploying LLMs is their high memory requirement, for both training and inference. Quantization methods such as bitsandbytes, GPTQ and AWQ have made it possible to use large models such as the popular LLama2 with significantly less memory, enabling the machine learning community to conduct remarkable research using a single consumer-grade GPU.

+

In this article, we propose a new quantization technique called Half-Quadratic Quantization (HQQ). Our approach, which requires no calibration data, significantly speeds up the quantization of large models while offering compression quality competitive with that of calibration-based methods. For instance, HQQ takes less than 8 minutes to process the colossal LLama2-70B, which is 27x faster than the widely adopted GPTQ, while significantly outperforming it for extreme low-bit quantization.

+ -
+
+
+ +
+
+

Table of Contents

  - Introduction
  - Half-Quadratic Quantization
  - Processing Time
  - Benchmark
  - Conclusion
+
+

Introduction

+

Model quantization is a crucial step for deploying large models with limited resources and saving costs, and it is particularly relevant to LLMs for both training and inference. Software packages such as bitsandbytes have made it possible to utilize large models on consumer-grade GPUs, which has been a game-changer for the machine learning community.

+ +

When it comes to weight-only quantization, there are two classes of approaches: calibration-free techniques such as bitsandbytes, which use only the weights and no external data, and calibration-based methods such as GPTQ and AWQ, which rely on an external dataset to adjust the quantization parameters. While calibration-based methods offer better quantization quality, they suffer from two main issues:

+
    +
  1. Calibration data bias: the quality of quantization can be negatively affected if incorrect calibration data is provided.
  2. Quantization time: calibration can be a heavy computational process, especially for very large models, which makes it difficult to test and deploy multiple models.
+

Wouldn't it be great if we could achieve the quality of calibration-based methods at the speed of calibration-free quantization? That's exactly what we propose with our method, Half-Quadratic Quantization (HQQ).

+ + +

Half-Quadratic Quantization

+ +

Basic quantization often results in a loss of model accuracy, especially in Large Language Models (LLMs). This is because the weights in these models can have a wide range of values that can be significantly altered after the quantization process. Weights that deviate notably (known as outliers) pose a particular challenge. Group-wise Precision Tuning Quantization (GPTQ) and Activation-Aware Layer Quantization (AWQ) are algorithms that try to compensate for the outliers by relying on calibration data to minimize the error on layer outputs.

+

Unlike these approaches, our method focuses specifically on minimizing errors in the weights rather than the layer activation error. Additionally, by incorporating a sparsity-promoting loss, such as the \( {l_{p<1}} \)-norm, we effectively model outliers through a hyper-Laplacian distribution. This distribution more accurately captures the heavy-tailed nature of outlier errors compared to the squared error, resulting in a more nuanced representation of the error distribution.

+

We propose a robust optimization formulation to find the quantization parameters (zero-point \( z \) and scaling \( s \)). More specifically, we use a sparsity-promoting loss function \( \phi() \) such as the \( {l_{p}} \) norm between the original weights \( W \) and their dequantized version:

$$\underset{z,s}{\text{argmin}}\,\phi\left(W-Q_{z,s}^{-1}(Q_{z,s}(W))\right),$$

where \( Q_{z,s}() \) is the quantization operator, which depends on the \( z \) and \( s \) parameters and generates the quantized weights \( W_{q} \), and \( Q_{z,s}^{-1}() \) is the de-quantization operator:

$$\begin{array}{c}
Q_{z,s}(W)=\text{round}(W/s+z)=W_{q}\\
Q_{z,s}^{-1}(W_{q})=s(W_{q}-z)
\end{array}$$
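To make the two operators concrete, here is a minimal sketch in PyTorch (our own illustrative code, not the HQQ implementation). It assumes the weights have already been reshaped into groups, and it adds an explicit clamp to the b-bit range, which storing \( W_{q} \) in b bits leaves implicit in the formulas above:

import torch

def quantize(W, s, z, nbits=8):
    # Q_{z,s}(W) = round(W / s + z); values are clamped to the b-bit range
    # implied by storing W_q in nbits bits (left implicit in the formulas).
    return torch.round(W / s + z).clamp_(0, 2**nbits - 1)

def dequantize(W_q, s, z):
    # Q_{z,s}^{-1}(W_q) = s * (W_q - z)
    return s * (W_q - z)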

The use of the \( {l_{p<1}} \)-norm makes the problem non-convex. To find a solution, we adopt a Half-Quadratic solver by introducing an extra variable \( W_{e} \). This additional parameter allows us to split the main problem into sub-problems that are easier to solve. Moreover, to make the problem simpler, we fix the scaling \( s \) parameter and only optimize for the zero-point \( z \).

$$\underset{z,W_{e}}{\text{argmin}}\,\phi(W_{e})+\frac{\beta}{2}||W_{e}-(W-Q_{z}^{-1}(Q_{z}(W)))||_{2}^{2}$$

We then form sub-problems which are solved via alternate optimization:

$$\begin{array}{cc}
\text{(sp}_{1}) & W_{e}^{(t+1)}\leftarrow\underset{W_{e}}{\text{argmin}}\,\phi(W_{e})+\frac{\beta^{(t)}}{2}||W_{e}-(W-Q_{z}^{-1}(Q_{z}(W)))||_{2}^{2}\\
\text{(sp}_{2}) & z^{(t+1)}\leftarrow\underset{z}{\text{argmin}}\,\frac{1}{2}||Q_{z}^{-1}(Q_{z}(W))-(W-W_{e}^{(t+1)})||_{2}^{2}\\
 & \beta^{(t+1)}\leftarrow\kappa\beta^{(t)},
\end{array}$$

where \( \beta \) and \( \kappa \) are strictly positive parameters.

Sub-problem \( \text{(sp}_{1}) \)

This problem takes the form of a Proximal Operator. When \( \phi() \) is the \( l_{1} \) norm, the solution is the soft-thresholding operator. There exists a more general thresholding solution for the \( l_{p} \)-norm with \( 0 \le p \le 1 \) that we adopt, known as the generalized soft-thresholding operator:

$$\begin{array}{c}
W_{e}^{(t+1)}\leftarrow\text{shrink}_{l_{p}}\left(W-Q_{z}^{-1}(Q_{z}(W)),\beta\right)\\
\text{shrink}_{l_{p}}(x,\beta)=\text{sign}(x)\,\text{relu}\left(|x|-\frac{|x|^{p-1}}{\beta}\right)
\end{array}$$
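Written out directly from the formula, the generalized soft-thresholding step is a one-liner; the sketch below uses our own naming and is not the library's code:

import torch

def shrink_lp(x, beta, p=0.7):
    # shrink_lp(x, beta) = sign(x) * relu(|x| - |x|^(p-1) / beta).
    # With p < 1 the threshold |x|^(p-1) / beta grows as |x| -> 0, so small
    # errors are zeroed out while large (outlier) errors pass through almost untouched.
    return torch.sign(x) * torch.relu(torch.abs(x) - torch.abs(x).pow(p - 1) / beta)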

Sub-problem \( \text{(sp}_{2}) \)

The second sub-problem can be rewritten as follows:

$$\begin{array}{c}
z^{(t+1)}\leftarrow\underset{z}{\text{argmin}}\,\frac{1}{2}||z-\left(W_{q}^{(t+1)}-\frac{(W-W_{e}^{(t+1)})}{s}\right)||_{2}^{2}\\
W_{q}^{(t+1)}=\text{round}(W/s+z^{(t)})
\end{array}$$

The solution is simply the average taken along the axis over which the quantization grouping is performed:

$$z^{(t+1)}\leftarrow\langle W_{q}^{(t+1)}-\frac{(W-W_{e}^{(t+1)})}{s}\rangle$$

In our implementation, we work with the inverse of the scale, \( 1/s \), instead of \( s \), which we found to be slightly more stable with half-precision calculations.
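In code, this closed-form update is just a mean reduction over the grouping axis; a minimal sketch, assuming the groups live along the last dimension (names are ours):

import torch

def update_zero_point(W, W_e, W_q, s):
    # Closed-form solution of (sp2): the group-wise average of W_q - (W - W_e) / s,
    # taken along the axis used for quantization grouping (here the last one).
    return torch.mean(W_q - (W - W_e) / s, dim=-1, keepdim=True)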

Note that, contrary to using gradient descent with autograd, the solution we propose relies on closed-form updates, which means that no gradients are calculated. This allows us to run all the calculations in inference mode with half-precision. Moreover, it only takes a few iterations for the solver to converge. Conversely, using the AdamW optimizer with PyTorch's autograd takes thousands of iterations to achieve good results, and it also fails with \( p \le 1 \), which is what we actually use to promote sparsity. Thanks to the Half-Quadratic solution, our quantization method achieves a significant speed-up (over 100x faster than autograd when quantizing LLama2-7B) and can process even the largest models in only a few minutes!
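Putting the pieces together, the whole solver is a short loop of closed-form updates. The sketch below is our own reconstruction of the procedure described above, not the code from the HQQ repository; it assumes per-group quantization along the last axis and a fixed scale \( s \):

import torch

@torch.inference_mode()  # closed-form updates only: no gradients are ever computed
def optimize_zero_point(W, s, z, nbits=4, p=0.7, beta=1.0, kappa=1.01, iters=20):
    # W: weight groups (e.g. float16), s: fixed scale, z: initial zero-point.
    best_err, best_z = float("inf"), z
    for _ in range(iters):
        W_q = torch.round(W / s + z).clamp_(0, 2**nbits - 1)       # quantize with the current z
        W_r = s * (W_q - z)                                        # de-quantize
        x = W - W_r                                                # weight error
        # (sp1): generalized soft-thresholding of the error
        W_e = torch.sign(x) * torch.relu(torch.abs(x) - torch.abs(x).pow(p - 1) / beta)
        # (sp2): closed-form zero-point update (group-wise mean along the last axis)
        z = torch.mean(W_q - (W - W_e) / s, dim=-1, keepdim=True)
        beta *= kappa                                              # anneal the penalty weight
        err = torch.abs(x).mean().item()
        if err >= best_err:                                        # simple early stopping
            break
        best_err, best_z = err, z
    return best_z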

Processing Time

+

We report the processing time to quantize the Llama2 models. We noticed that the processing time for GPTQ and AWQ drastically changes from one machine to another. GPTQ heavily relies on the CPU, which creates issues on virtual machines, so we limit the number of threads to those available in the virtual machine (32) to avoid the process hanging for hours. Our method performs the whole quantization on the GPU with half-precision and only uses the CPU to transfer data to the GPU once the solver is finished.

[Figure: processing time to quantize the Llama2 models with each method.]

Benchmark

+ +

To measure the quantization quality of our method, we use the perplexity metric (PPL) on the widely adopted wikitext2 dataset. We also report the runtime GPU memory in GB (MEM) that the session requires to run the quantized model (additional memory is needed for prediction, depending on the sequence length). We compare against the approaches widely used by the community: BNB (bitsandbytes), GPTQ via AutoGPTQ and AWQ via AutoAWQ.

+ +
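As a reminder, PPL is the exponential of the average next-token negative log-likelihood; a minimal sketch of the metric itself (ours, not the exact evaluation script used for these numbers):

import torch
import torch.nn.functional as F

def perplexity(logits, input_ids):
    # logits: (seq_len, vocab_size) from a causal LM, input_ids: (seq_len,).
    # Position t predicts token t+1, so logits and labels are shifted by one.
    nll = F.cross_entropy(logits[:-1], input_ids[1:], reduction="mean")
    return torch.exp(nll)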

Regarding the parameters, we fix the Half-Quadratic solver settings to the following: p=0.7, beta=1, kappa=1.01, iterations=20. Additionally, we use early stopping to exit the solver when the error stops improving. We haven't experimented much with these parameters, so different settings might yield even better results. Similar to the other approaches, we use grouping to quantize the weights into buffers (_g128 means we use a group-size of 128). We also quantize the zero-point to 8-bit, without grouping or optimization.

+ +
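For reference, these settings map directly onto the solver sketch given earlier; a hypothetical call with toy shapes, a common min/max initialization of the scale and zero-point, and parameter names of our own choosing would look like this:

import torch

device = "cuda" if torch.cuda.is_available() else "cpu"
dtype = torch.float16 if device == "cuda" else torch.float32  # half precision on GPU, as in the article

group_size, nbits = 128, 4                                     # the 4-bit, _g128 setting
W = torch.randn(1024, group_size, dtype=dtype, device=device)  # toy weight groups
W_min = W.min(dim=-1, keepdim=True).values
W_max = W.max(dim=-1, keepdim=True).values
s = (W_max - W_min) / (2**nbits - 1)                           # common min/max initialization
z = -W_min / s                                                 # (an assumption, not necessarily HQQ's init)
z_opt = optimize_zero_point(W, s, z, nbits=nbits, p=0.7, beta=1.0, kappa=1.01, iters=20)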
| Method     | nBits | LLama2-7B PPL ↓ | LLama2-7B MEM ↓ | LLama2-13B PPL ↓ | LLama2-13B MEM ↓ | LLama2-70B PPL ↓ | LLama2-70B MEM ↓ |
|------------|-------|-----------------|-----------------|------------------|------------------|------------------|------------------|
| FP16       | 16    | 5.18            | 13.5            | 4.63             | 25.6             | OOM              | OOM              |
| bnb        | 8     | 5.22            | 7.9             | 4.67             | 14.4             | OOM              | OOM              |
| GPTQ_g128  | 8     | 5.19            | 7.8             | 4.63             | 14.8             | 3.12             | 74.87            |
| HQQ_g128   | 8     | 5.19            | 7.6             | 4.63             | 14               | 3.12             | 69.32            |
| bnb_g64    | 4     | 5.43            | 4.7             | 4.79             | 8.2              | 3.29             | 39.11            |
| GPTQ_g128  | 4     | 5.41            | 5               | 4.74             | 8.9              | 3.24             | 40               |
| GPTQ_g64   | 4     | 5.38            | 5               | 4.73             | 9.1              | 3.23             | 41.13            |
| AWQ_g128   | 4     | 5.32            | 4.6             | 4.71             | 8.2              | 3.21             | 35.78            |
| AWQ_g64    | 4     | 5.28            | 4.6             | 4.7              | 8.5              | 3.2              | 37.08            |
| HQQ_g128   | 4     | 5.35            | 4.6             | 4.74             | 7.9              | 3.21             | 35.97            |
| HQQ_g64    | 4     | 5.3             | 4.6             | 4.7              | 8.2              | 3.19             | 37.52            |
| GPTQ_g128  | 3     | 6.3             | 3.9             | 5.25             | 7                | 3.85             | 33.7             |
| GPTQ_g64   | 3     | 6.1             | 4               | 5.16             | 7.3              | 3.7              | 33.47            |
| HQQ_g128   | 3     | 6.2             | 3.8             | 5.15             | 6.8              | 3.58             | 30.11            |
| HQQ_g64    | 3     | 5.82            | 4.5             | 4.98             | 7.4              | 3.45             | 33.46            |
| GPTQ_g64   | 2     | nan             | 3.5             | 13               | 6                | 9.44             | 24.5             |
| HQQ_g32    | 2     | 15.61           | 3.5             | 7.63             | 5.9              | 4.82             | 24.2             |
| HQQ_g16    | 2     | 7.3             | 4.1             | 6.36             | 6.9              | 4.12             | 30.27            |
| HQQ_g16_s* | 2     | 7.54            | 3.7             | 6.4              | 6.1              | 4.13             | 26.37            |

*: the scaling is also quantized to 8-bit with a group-size of 128. PPL: perplexity on wikitext2 (lower is better); MEM: runtime GPU memory in GB (lower is better).

As the table above shows, our method performs very well despite not using any calibration data, matching the leading method AWQ for both Llama2-13B and 70B. For extreme low-bit quantization (3 and 2 bits), our method outperforms GPTQ by a large margin at the same GPU memory requirements.

+ +

The interactive graph below summarizes the various data points into a scatter plot. Hover or click on a bubble to display the details.

[Interactive scatter plot: perplexity (PPL) vs. runtime GPU memory (GB) for each method, bit-width and group-size.]

Conclusion

+ +

This article demonstrates that calibration-free quantization with our proposed HQQ method can achieve quality competitive with popular data-dependent methods like GPTQ and AWQ. We have shown the effectiveness of HQQ even for extreme low-bit quantization, across different model sizes. Moreover, by leveraging efficient optimization techniques such as Half-Quadratic splitting, our method cuts the quantization time to only a few minutes, even for the largest available models such as Llama2-70B.

+ +

We provide the code to reproduce all the results presented in this article: https://github.com/mobiusml/hqq/tree/main/code

+ + +
+
+

Please feel free to contact us at hicham@mobiuslabs.com.
