From aebc0183310fa7f8dd4598d2d19985734c5113d3 Mon Sep 17 00:00:00 2001 From: Zhihan Jiang <68881590+nvzhihanj@users.noreply.github.com> Date: Tue, 17 Dec 2024 12:27:19 -0800 Subject: [PATCH 01/10] More minor fixes for llama3.1-405b (#1983) * More minor fixes * Fix indentation for stats report --- language/llama3-405b/SUT_VLLM.py | 22 +++++++++++----------- language/llama3-405b/dataset.py | 2 +- language/llama3-405b/evaluate-accuracy.py | 2 +- 3 files changed, 13 insertions(+), 13 deletions(-) diff --git a/language/llama3-405b/SUT_VLLM.py b/language/llama3-405b/SUT_VLLM.py index e64999d09..f5a802021 100644 --- a/language/llama3-405b/SUT_VLLM.py +++ b/language/llama3-405b/SUT_VLLM.py @@ -31,7 +31,7 @@ def __init__( model_path=None, dtype="bfloat16", batch_size=None, - total_sample_count=8312, + total_sample_count=8313, dataset_path=None, use_cached_outputs=False, # Set this to True *only for test accuracy runs* in case your prior @@ -140,16 +140,16 @@ def process_queries(self): n_tokens)] lg.QuerySamplesComplete(response) - tok = time.time() + tok = time.time() - with self.sample_counter_lock: - self.sample_counter += len(qitem) - log.info(f"Samples run: {self.sample_counter}") - if tik1: - log.info(f"\tBatchMaker time: {tik2 - tik1}") - log.info(f"\tInference time: {tik3 - tik2}") - log.info(f"\tPostprocess time: {tok - tik3}") - log.info(f"\t==== Total time: {tok - tik1}") + with self.sample_counter_lock: + self.sample_counter += len(qitem) + log.info(f"Samples run: {self.sample_counter}") + if tik1: + log.info(f"\tBatchMaker time: {tik2 - tik1}") + log.info(f"\tInference time: {tik3 - tik2}") + log.info(f"\tPostprocess time: {tok - tik3}") + log.info(f"\t==== Total time: {tok - tik1}") def load_model(self): log.info("Loading model...") @@ -194,7 +194,7 @@ def __init__( self, model_path=None, dtype="bfloat16", - total_sample_count=8312, + total_sample_count=8313, dataset_path=None, batch_size=None, workers=1, diff --git a/language/llama3-405b/dataset.py b/language/llama3-405b/dataset.py index 04fe9c4b2..084f13208 100644 --- a/language/llama3-405b/dataset.py +++ b/language/llama3-405b/dataset.py @@ -24,7 +24,7 @@ class Dataset: def __init__( self, model_name=None, - total_sample_count=8312, + total_sample_count=8313, perf_count_override=None, dataset_path=None, dtype="bfloat16" diff --git a/language/llama3-405b/evaluate-accuracy.py b/language/llama3-405b/evaluate-accuracy.py index ccc87f71f..f5677820e 100644 --- a/language/llama3-405b/evaluate-accuracy.py +++ b/language/llama3-405b/evaluate-accuracy.py @@ -141,7 +141,7 @@ def main(): tokenizer = AutoTokenizer.from_pretrained( checkpoint_path, - model_max_length=2048, + model_max_length=22000, padding_side="left", use_fast=False, ) From 3ae2b2a20b01fda1236e6950d089f2bc6eac91aa Mon Sep 17 00:00:00 2001 From: Pablo Gonzalez Date: Wed, 18 Dec 2024 12:28:57 -0500 Subject: [PATCH 02/10] Remove unused rgat files (#1961) Co-authored-by: Miro --- graph/R-GAT/igbh/tiny/models/dataloader.py | 82 ------ graph/R-GAT/igbh/tiny/models/gnn.py | 296 --------------------- graph/R-GAT/igbh/tiny/models/main.py | 79 ------ graph/R-GAT/igbh/tiny/models/utils.py | 224 ---------------- 4 files changed, 681 deletions(-) delete mode 100644 graph/R-GAT/igbh/tiny/models/dataloader.py delete mode 100644 graph/R-GAT/igbh/tiny/models/gnn.py delete mode 100644 graph/R-GAT/igbh/tiny/models/main.py delete mode 100644 graph/R-GAT/igbh/tiny/models/utils.py diff --git a/graph/R-GAT/igbh/tiny/models/dataloader.py b/graph/R-GAT/igbh/tiny/models/dataloader.py deleted file mode 
100644 index cc64d1466..000000000 --- a/graph/R-GAT/igbh/tiny/models/dataloader.py +++ /dev/null @@ -1,82 +0,0 @@ -import torch -from torch_geometric.data import InMemoryDataset, Data -from dgl.data import DGLDataset - -from utils import IGL260MDataset - -# TODO: Make a PyG dataloader for large datasets - - -class IGL260M_PyG(InMemoryDataset): - def __init__(self, args): - super().__init__(root, transform, pre_transform, pre_filter) - - def process(self): - dataset = IGL260MDataset(root=self.dir, size=args.dataset_size, - in_memory=args.in_memory, classes=args.type_classes, synthetic=args.synthetic) - node_features = torch.from_numpy(dataset.paper_feat) - node_edges = torch.from_numpy(dataset.paper_edge).T - node_labels = torch.from_numpy(dataset.paper_label).to(torch.long) - data = Data(x=node_features, edge_index=node_edges, y=node_labels) - - n_nodes = node_features.shape[0] - - n_train = int(n_nodes * 0.6) - n_val = int(n_nodes * 0.2) - - train_mask = torch.zeros(n_nodes, dtype=torch.bool) - val_mask = torch.zeros(n_nodes, dtype=torch.bool) - test_mask = torch.zeros(n_nodes, dtype=torch.bool) - - train_mask[:n_train] = True - val_mask[n_train:n_train + n_val] = True - test_mask[n_train + n_val:] = True - - data.train_mask = train_mask - data.val_mask = val_mask - data.test_mask = test_mask - - -class IGL260M_DGL(DGLDataset): - def __init__(self, args): - self.dir = args.path - super().__init__(name='IGB260M') - - def process(self): - dataset = IGL260MDataset(root=self.dir, size=args.dataset_size, - in_memory=args.in_memory, classes=args.type_classes, synthetic=args.synthetic) - node_features = torch.from_numpy(dataset.paper_feat) - node_edges = torch.from_numpy(dataset.paper_edge) - node_labels = torch.from_numpy(dataset.paper_label).to(torch.long) - - self.graph = dgl.graph( - (node_edges[:, 0], node_edges[:, 1]), num_nodes=node_features.shape[0]) - - self.graph.ndata['feat'] = node_features - self.graph.ndata['label'] = node_labels - - self.graph = dgl.remove_self_loop(self.graph) - self.graph = dgl.add_self_loop(self.graph) - - n_nodes = node_features.shape[0] - - n_train = int(n_nodes * 0.6) - n_val = int(n_nodes * 0.2) - - train_mask = torch.zeros(n_nodes, dtype=torch.bool) - val_mask = torch.zeros(n_nodes, dtype=torch.bool) - test_mask = torch.zeros(n_nodes, dtype=torch.bool) - - train_mask[:n_train] = True - val_mask[n_train:n_train + n_val] = True - test_mask[n_train + n_val:] = True - - self.graph.ndata['train_mask'] = train_mask - self.graph.ndata['val_mask'] = val_mask - self.graph.ndata['test_mask'] = test_mask - - def __getitem__(self, i): - return self.graph - - def __len__(self): - return 1 diff --git a/graph/R-GAT/igbh/tiny/models/gnn.py b/graph/R-GAT/igbh/tiny/models/gnn.py deleted file mode 100644 index 20d5ecd72..000000000 --- a/graph/R-GAT/igbh/tiny/models/gnn.py +++ /dev/null @@ -1,296 +0,0 @@ -from utils import IGL260MDataset -import warnings -from tqdm import tqdm -import numpy as np -import time -import torch.nn.functional as F -import torch.optim as optim -import torch.nn as nn -import dgl -from dgl.data import DGLDataset -import dgl.nn.pytorch as dglnn -from dgl.nn.pytorch import GATConv, GraphConv, SAGEConv -import os.path as osp -from sys import getsizeof - - -import torch -torch.manual_seed(0) -dgl.seed(0) -warnings.filterwarnings("ignore") - - -class GCN(nn.Module): - def __init__(self, - in_feats, - n_hidden, - n_classes, - n_layers, - activation, - dropout): - super(GCN, self).__init__() - self.layers = nn.ModuleList() - self.n_layers = n_layers - 
self.n_hidden = n_hidden - self.n_classes = n_classes - # input layer - self.layers.append( - GraphConv( - in_feats, - n_hidden, - activation=activation)) - # hidden layers - for i in range(n_layers - 1): - self.layers.append( - GraphConv( - n_hidden, - n_hidden, - activation=activation)) - # output layer - self.layers.append(GraphConv(n_hidden, n_classes)) - self.dropout = nn.Dropout(p=dropout) - self.activation = activation - - def forward(self, blocks, x): - h = x - for l, (layer, block) in enumerate(zip(self.layers, blocks)): - if l != len(self.layers) - 1: - # h = self.activation(h) - h = self.dropout(h) - h = layer(block, h) - return h - - def inference(self, g, x, batch_size, device): - """ - Inference with the GraphSAGE model on full neighbors (i.e. without neighbor sampling). - g : the entire graph. - x : the input of entire node set. - The inference code is written in a fashion that it could handle any number of nodes and - layers. - """ - # During inference with sampling, multi-layer blocks are very inefficient because - # lots of computations in the first few layers are repeated. - # Therefore, we compute the representation of all nodes layer by layer. The nodes - # on each layer are of course splitted in batches. - # TODO: can we standardize this? - for l, layer in enumerate(self.layers): - y = torch.zeros(g.number_of_nodes(), self.n_hidden if l != - len(self.layers) - 1 else self.n_classes) - - sampler = dgl.dataloading.MultiLayerFullNeighborSampler(1) - dataloader = dgl.dataloading.NodeDataLoader( - g, - torch.arange(g.number_of_nodes()), - sampler, - batch_size=batch_size, - shuffle=True, - drop_last=False, - num_workers=4) - - for input_nodes, output_nodes, blocks in dataloader: - block = blocks[0] - - block = block.int().to(device) - h = x[input_nodes].to(device) - h = layer(block, h) - if l != len(self.layers) - 1: - h = self.activation(h) - h = self.dropout(h) - - y[output_nodes] = h.cpu() - - x = y - return y - - -class GAT(nn.Module): - def __init__( - self, in_feats, n_hidden, n_classes, n_layers, num_heads, activation - ): - super().__init__() - self.n_layers = n_layers - self.n_hidden = n_hidden - self.n_classes = n_classes - self.layers = nn.ModuleList() - self.layers.append( - dglnn.GATConv( - (in_feats, in_feats), - n_hidden, - num_heads=num_heads, - activation=activation, - ) - ) - for i in range(1, n_layers - 1): - self.layers.append( - dglnn.GATConv( - (n_hidden * num_heads, n_hidden * num_heads), - n_hidden, - num_heads=num_heads, - activation=activation, - ) - ) - self.layers.append( - dglnn.GATConv( - (n_hidden * num_heads, n_hidden * num_heads), - n_classes, - num_heads=num_heads, - activation=None, - ) - ) - - def forward(self, blocks, x): - h = x - for l, (layer, block) in enumerate(zip(self.layers, blocks)): - # We need to first copy the representation of nodes on the RHS from the - # appropriate nodes on the LHS. - # Note that the shape of h is (num_nodes_LHS, D) and the shape of h_dst - # would be (num_nodes_RHS, D) - h_dst = h[: block.num_dst_nodes()] - # Then we compute the updated representation on the RHS. - # The shape of h now becomes (num_nodes_RHS, D) - if l < self.n_layers - 1: - h = layer(block, (h, h_dst)).flatten(1) - else: - h = layer(block, (h, h_dst)) - h = h.mean(1) - return h.log_softmax(dim=-1) - - def inference(self, g, x, batch_size, device): - """ - Inference with the GAT model on full neighbors (i.e. without neighbor sampling). - g : the entire graph. - x : the input of entire node set. 
- The inference code is written in a fashion that it could handle any number of nodes and - layers. - """ - # During inference with sampling, multi-layer blocks are very inefficient because - # lots of computations in the first few layers are repeated. - # Therefore, we compute the representation of all nodes layer by layer. The nodes - # on each layer are of course splitted in batches. - # TODO: can we standardize this? - # TODO: make thiw into a variable - num_heads = 2 - for l, layer in enumerate(self.layers): - if l < self.n_layers - 1: - y = torch.zeros( - g.num_nodes(), - self.n_hidden * num_heads - if l != len(self.layers) - 1 - else self.n_classes, - ) - else: - y = torch.zeros( - g.num_nodes(), - self.n_hidden - if l != len(self.layers) - 1 - else self.n_classes, - ) - - sampler = dgl.dataloading.MultiLayerFullNeighborSampler(1) - dataloader = dgl.dataloading.DataLoader( - g, - torch.arange(g.num_nodes()), - sampler, - batch_size=batch_size, - shuffle=True, - drop_last=False, - num_workers=4, - ) - - for input_nodes, output_nodes, blocks in dataloader: - block = blocks[0].int().to(device) - - h = x[input_nodes].to(device) - h_dst = h[: block.num_dst_nodes()] - if l < self.n_layers - 1: - h = layer(block, (h, h_dst)).flatten(1) - else: - h = layer(block, (h, h_dst)) - h = h.mean(1) - h = h.log_softmax(dim=-1) - - y[output_nodes] = h.cpu() - - x = y - return y - - -class SAGE(nn.Module): - def __init__(self, - in_feats, - n_hidden, - n_classes, - n_layers, - activation, - dropout, - aggregator_type): - super().__init__() - self.n_layers = n_layers - self.n_hidden = n_hidden - self.n_classes = n_classes - self.layers = nn.ModuleList() - self.layers.append(dglnn.SAGEConv(in_feats, n_hidden, aggregator_type)) - for i in range(1, n_layers - 1): - self.layers.append( - dglnn.SAGEConv( - n_hidden, - n_hidden, - aggregator_type)) - self.layers.append( - dglnn.SAGEConv( - n_hidden, - n_classes, - aggregator_type)) - self.dropout = nn.Dropout(dropout) - self.activation = activation - - def forward(self, blocks, x): - h = x - for l, (layer, block) in enumerate(zip(self.layers, blocks)): - h = layer(block, h) - if l != len(self.layers) - 1: - h = self.activation(h) - h = self.dropout(h) - return h - - def inference(self, g, x, batch_size, device): - """ - Inference with the GraphSAGE model on full neighbors (i.e. without neighbor sampling). - g : the entire graph. - x : the input of entire node set. - The inference code is written in a fashion that it could handle any number of nodes and - layers. - """ - # During inference with sampling, multi-layer blocks are very inefficient because - # lots of computations in the first few layers are repeated. - # Therefore, we compute the representation of all nodes layer by layer. The nodes - # on each layer are of course splitted in batches. - # TODO: can we standardize this? 
- for l, layer in enumerate(self.layers): - y = torch.zeros(g.number_of_nodes(), self.n_hidden if l != - len(self.layers) - 1 else self.n_classes) - - sampler = dgl.dataloading.MultiLayerFullNeighborSampler(1) - dataloader = dgl.dataloading.NodeDataLoader( - g, - torch.arange(g.number_of_nodes()), - sampler, - batch_size=batch_size, - shuffle=True, - drop_last=False, - num_workers=4) - - for input_nodes, output_nodes, blocks in dataloader: - block = blocks[0] - - block = block.int().to(device) - h = x[input_nodes].to(device) - h = layer(block, h) - if l != len(self.layers) - 1: - h = self.activation(h) - h = self.dropout(h) - - y[output_nodes] = h.cpu() - - x = y - return y diff --git a/graph/R-GAT/igbh/tiny/models/main.py b/graph/R-GAT/igbh/tiny/models/main.py deleted file mode 100644 index 4ab22eb75..000000000 --- a/graph/R-GAT/igbh/tiny/models/main.py +++ /dev/null @@ -1,79 +0,0 @@ -import argparse - - -def main(): - parser = argparse.ArgumentParser() - - # Input/output paths - parser.add_argument('--path', type=str, default='/gnndataset/') - parser.add_argument('--modelpath', type=str, default='gcn_19.pt') - - # Dataset selection - parser.add_argument( - '--dataset_size', - type=str, - default='experimental', - choices=[ - 'experimental', - 'small', - 'medium', - 'large', - 'full']) - parser.add_argument( - '--type_classes', - type=int, - default=19, - choices=[ - 19, - 292, - 2983]) - - # Hyperparameters - parser.add_argument('--hidden_channels', type=int, default=16) - parser.add_argument('--fan_out', type=str, default='5,10') - parser.add_argument('--num_layers', type=int, default=2) - parser.add_argument('--learning_rate', type=int, default=0.01) - parser.add_argument('--decay', type=int, default=0.001) - parser.add_argument('--num_workers', type=int, default=4) - parser.add_argument('--batch_size', type=int, default=2048 * 16) - parser.add_argument('--dropout', type=float, default=0.2) - parser.add_argument('--epochs', type=int, default=20) - parser.add_argument( - '--model_type', - type=str, - default='gcn', - choices=[ - 'gat', - 'sage', - 'gcn']) - parser.add_argument('--in_memory', type=int, default=0) - parser.add_argument('--synthetic', type=int, default=0) - parser.add_argument('--device', type=str, default='1') - args = parser.parse_args() - - print("Dataset_size: " + args.dataset_size) - print("Model : " + args.model) - print("Num_classes : " + str(args.num_classes)) - print() - - device = f'cuda:' + args.device if torch.cuda.is_available() else 'cpu' - - dataset = IGL260M_DGL(args) - g = dataset[0] - - best_test_acc, train_acc, test_acc = track_acc(g, args) - - print( - f"Train accuracy: {np.mean(train_acc):.2f} \u00B1 {np.std(train_acc):.2f} \t Best: {np.max(train_acc) * 100:.4f}%") - print( - f"Test accuracy: {np.mean(test_acc):.2f} \u00B1 {np.std(test_acc):.2f} \t Best: {np.max(test_acc) * 100:.4f}%") - print() - print(" -------- For debugging --------- ") - print("Parameters: ", args) - print(g) - print("Train accuracy: ", train_acc) - print("Test accuracy: ", test_acc) - - -if __name__ == '__main__': - main() diff --git a/graph/R-GAT/igbh/tiny/models/utils.py b/graph/R-GAT/igbh/tiny/models/utils.py deleted file mode 100644 index 5e9e1a25d..000000000 --- a/graph/R-GAT/igbh/tiny/models/utils.py +++ /dev/null @@ -1,224 +0,0 @@ -import numpy as np -import torch - - -class IGL260MDataset(object): - def __init__(self, root: str, size: str, in_memory: int, - classes: int, synthetic: int): - self.dir = root - self.size = size - self.synthetic = synthetic - self.in_memory = 
in_memory - self.num_classes = classes - self.__meta__ = torch.load(osp.join(self.dir, self.size, 'meta.pt')) - - self.num_features = self.__meta__['paper']['emb_dim'] - self.num_nodes = self.__meta__['paper']['num_node'] - self.num_edges = self.__meta__['cites']['num_edge'] - - @property - def paper_feat(self) -> np.ndarray: - if self.synthetic: - return np.random((self.num_nodes, self.num_edges)) - - path = osp.join( - self.dir, - self.size, - 'processed', - 'paper', - 'node_feat.npy') - if self.in_memory: - return np.load(path) - else: - return np.load(path, mmap_mode='r') - - @property - def paper_label(self) -> np.ndarray: - if self.num_classes == 19: - path = osp.join( - self.dir, - self.size, - 'processed', - 'paper', - 'node_label_19.npy') - else: - path = osp.join( - self.dir, - self.size, - 'processed', - 'paper', - 'node_label_2K.npy') - if self.in_memory: - return np.load(path) - else: - return np.load(path, mmap_mode='r') - - @property - def paper_edge(self) -> np.ndarray: - path = osp.join( - self.dir, - self.size, - 'processed', - 'paper__cites__paper', - 'edge_index.npy') - if self.in_memory: - return np.load(path) - else: - return np.load(path, mmap_mode='r') - - -def compute_acc(pred, labels): - """ - Compute the accuracy of prediction given the labels. - """ - labels = labels.long() - return (torch.argmax(pred, dim=1) == labels).float().sum() / len(pred) - - -def evaluate(model, g, inputs, labels, val_nid, batch_size, device): - """ - Evaluate the model on the validation set specified by ``val_nid``. - g : The entire graph. - inputs : The features of all the nodes. - labels : The labels of all the nodes. - val_nid : the node Ids for validation. - batch_size : Number of nodes to compute at the same time. - device : The GPU device to evaluate on. - """ - model.eval() - with torch.no_grad(): - pred = model.inference(g, inputs, batch_size, device) - model.train() - return compute_acc(pred[val_nid], labels[val_nid]) - - -def load_subtensor(g, seeds, input_nodes, device): - """ - Copys features and labels of a set of nodes onto GPU. - """ - batch_inputs = g.ndata['features'][input_nodes].to(device) - batch_labels = g.ndata['labels'][seeds].to(device) - return batch_inputs, batch_labels - - -def track_acc(g, args): - train_accuracy = [] - test_accuracy = [] - g.ndata['features'] = g.ndata['feat'] - g.ndata['labels'] = g.ndata['label'] - in_feats = g.ndata['features'].shape[1] - n_classes = args.num_classes - - # Create csr/coo/csc formats before launching training processes with multi-gpu. - # This avoids creating certain formats in each sub-process, which saves - # momory and CPU. 
- g.create_formats_() - - num_epochs = args.epochs - num_hidden = args.hidden_channels - num_layers = args.num_layers - fan_out = args.fan_out - batch_size = args.batch_size - lr = args.learning_rate - dropout = args.dropout - num_workers = args.num_workers - - train_nid = torch.nonzero(g.ndata['train_mask'], as_tuple=True)[0] - - # Create PyTorch DataLoader for constructing blocks - sampler = dgl.dataloading.MultiLayerNeighborSampler( - [int(fanout) for fanout in fan_out.split(',')]) - - dataloader = dgl.dataloading.NodeDataLoader( - g, - train_nid, - sampler, - batch_size=batch_size, - shuffle=True, - drop_last=False, - num_workers=num_workers) - - # Define model and optimizer - if args.model_type == 'gcn': - model = GCN(in_feats, num_hidden, n_classes, 1, F.relu, dropout) - if args.model_type == 'sage': - model = SAGE( - in_feats, - num_hidden, - n_classes, - num_layers, - F.relu, - dropout, - 'gcn') - if args.model_type == 'gat': - model = GAT(in_feats, num_hidden, n_classes, num_layers, 2, F.relu) - - model = model.to(device) - loss_fcn = nn.CrossEntropyLoss() - loss_fcn = loss_fcn.to(device) - optimizer = optim.Adam(model.parameters(), lr=lr, weight_decay=args.decay) - - # Training loop - avg = 0 - best_test_acc = 0 - log_every = 1 - training_start = time.time() - for epoch in (range(num_epochs)): - # Loop over the dataloader to sample the computation dependency graph as a list of - # blocks. - epoch_loss = 0 - gpu_mem_alloc = 0 - epoch_start = time.time() - for step, (input_nodes, seeds, blocks) in (enumerate(dataloader)): - # Load the input features as well as output labels - # batch_inputs, batch_labels = load_subtensor(g, seeds, input_nodes, device) - blocks = [block.int().to(device) for block in blocks] - batch_inputs = blocks[0].srcdata['features'] - batch_labels = blocks[-1].dstdata['labels'] - - # Compute loss and prediction - batch_pred = model(blocks, batch_inputs) - loss = loss_fcn(batch_pred, batch_labels) - optimizer.zero_grad() - loss.backward() - optimizer.step() - - epoch_loss += loss.detach() - - gpu_mem_alloc += ( - torch.cuda.max_memory_allocated() / 1000000 - if torch.cuda.is_available() - else 0 - ) - - train_g = g - train_nid = torch.nonzero( - train_g.ndata['train_mask'], as_tuple=True)[0] - train_acc = evaluate( - model, train_g, train_g.ndata['features'], train_g.ndata['labels'], train_nid, batch_size, device) - - test_g = g - test_nid = torch.nonzero( - test_g.ndata['test_mask'], as_tuple=True)[0] - test_acc = evaluate( - model, test_g, test_g.ndata['features'], test_g.ndata['labels'], test_nid, batch_size, device) - - if test_acc.item() > best_test_acc: - best_test_acc = test_acc.item() - tqdm.write( - "Epoch {:05d} | Loss {:.4f} | Train Acc {:.4f} | Test Acc {:.4f} | Time {:.2f}s | GPU {:.1f} MB".format( - epoch, - epoch_loss, - train_acc.item(), - test_acc.item(), - time.time() - epoch_start, - gpu_mem_alloc - ) - ) - test_accuracy.append(test_acc.item()) - train_accuracy.append(train_acc.item()) - torch.save(model.state_dict(), args.modelpath) - print() - print("Total time taken: ", time.time() - training_start) - - return best_test_acc, train_accuracy, test_accuracy From 03c96663dc2bd47cc5e8f5fbb0fc4079ae2c784d Mon Sep 17 00:00:00 2001 From: Pablo Gonzalez Date: Wed, 18 Dec 2024 12:31:23 -0500 Subject: [PATCH 03/10] Update docker GPU, avoid long build time (#1966) Co-authored-by: Miro --- graph/R-GAT/README.md | 5 +++-- graph/R-GAT/dockerfile.gpu | 6 ++---- 2 files changed, 5 insertions(+), 6 deletions(-) diff --git a/graph/R-GAT/README.md 
b/graph/R-GAT/README.md index 569233ac6..aecf7ffe9 100644 --- a/graph/R-GAT/README.md +++ b/graph/R-GAT/README.md @@ -181,9 +181,10 @@ docker build . -f dockerfile.gpu -t rgat-gpu ``` Run docker container: ```bash -docker run --rm -it -v $(pwd):/root --gpus all rgat-gpu +docker run --rm -it -v $(pwd):/workspace/root --gpus all rgat-gpu ``` -Run benchmark inside the docker container: +Go inside the root folder and run benchmark inside the docker container: ```bash +cd root python3 main.py --dataset igbh-dgl --dataset-path igbh/ --profile rgat-dgl-full --device gpu [--model-path ] [--in-memory] [--dtype ] [--scenario ] ``` diff --git a/graph/R-GAT/dockerfile.gpu b/graph/R-GAT/dockerfile.gpu index fae65081f..f600028fe 100644 --- a/graph/R-GAT/dockerfile.gpu +++ b/graph/R-GAT/dockerfile.gpu @@ -26,6 +26,8 @@ RUN apt install -y --no-install-recommends rsync # Upgrade pip RUN python3 -m pip install --upgrade pip +RUN pip install torch-geometric torch-scatter torch-sparse -f https://pytorch-geometric.com/whl/torch-2.1.0+cu121.html +RUN pip install dgl -f https://data.dgl.ai/wheels/torch-2.1/cu121/repo.html COPY requirements.txt requirements.txt RUN pip install -r requirements.txt @@ -35,10 +37,6 @@ RUN cd /tmp && \ pip install pybind11 && \ CFLAGS="-std=c++14" python3 setup.py install -RUN export TORCH_VERSION=$(python -c "import torch; print(torch.__version__)") -RUN pip install torch-geometric torch-scatter torch-sparse -f https://data.pyg.org/whl/torch-${TORCH_VERSION}.html -RUN pip install dgl -f https://data.dgl.ai/wheels/torch-2.1/cu121/repo.html - # Clean up RUN rm -rf mlperf \ rm requirements.txt \ No newline at end of file From 867def46417627eaa9de8f926721bf88167009ba Mon Sep 17 00:00:00 2001 From: Pablo Gonzalez Date: Wed, 18 Dec 2024 13:15:33 -0500 Subject: [PATCH 04/10] Require equal issue mode for R-GAT (#1968) * Require equal issue mode for R-GAT * Add equal issue note in readme --------- Co-authored-by: Miro --- graph/R-GAT/README.md | 2 ++ loadgen/mlperf.conf | 3 +++ tools/submission/submission_checker.py | 1 + 3 files changed, 6 insertions(+) diff --git a/graph/R-GAT/README.md b/graph/R-GAT/README.md index aecf7ffe9..69883c0d1 100644 --- a/graph/R-GAT/README.md +++ b/graph/R-GAT/README.md @@ -188,3 +188,5 @@ Go inside the root folder and run benchmark inside the docker container: cd root python3 main.py --dataset igbh-dgl --dataset-path igbh/ --profile rgat-dgl-full --device gpu [--model-path ] [--in-memory] [--dtype ] [--scenario ] ``` + +**NOTE:** For official submissions, this benchmark is required to run in equal issue mode. Please make sure that the flag `rgat.*.sample_concatenate_permutation` is set to one in the [mlperf.conf](../../loadgen/mlperf.conf) file when loadgen is built. 
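As an aside on the equal issue note above, a minimal harness-side sketch (illustrative only, not part of this patch; it assumes the standard `mlperf_loadgen` Python bindings and a local `user.conf`) of how the R-GAT settings are typically picked up — the `rgat.*.sample_concatenate_permutation` flag itself comes from the `mlperf.conf` that is baked into loadgen at build time, so the harness only applies user-level overrides:

```python
import mlperf_loadgen as lg

# Illustrative sketch: mlperf.conf (which carries
# rgat.*.sample_concatenate_permutation = 1) is loaded automatically by
# loadgen; user.conf supplies only the user-tunable overrides.
settings = lg.TestSettings()
settings.scenario = lg.TestScenario.Offline
settings.mode = lg.TestMode.PerformanceOnly
settings.FromConfig("user.conf", "rgat", "Offline")
```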
diff --git a/loadgen/mlperf.conf b/loadgen/mlperf.conf index 1fe202253..95cc08351 100644 --- a/loadgen/mlperf.conf +++ b/loadgen/mlperf.conf @@ -42,6 +42,9 @@ retinanet.MultiStream.target_latency = 528 # 3D-UNet uses equal issue mode because it has non-uniform inputs 3d-unet.*.sample_concatenate_permutation = 1 +# R-GAT uses equal issue mode because it may have non-uniform inputs +rgat.*.sample_concatenate_permutation = 1 + # LLM benchmarks have non-uniform inputs and outputs, and use equal issue mode for all latency scenario gptj.*.sample_concatenate_permutation = 1 llama2-70b.*.sample_concatenate_permutation = 1 diff --git a/tools/submission/submission_checker.py b/tools/submission/submission_checker.py index 4a463f304..dcdad1180 100755 --- a/tools/submission/submission_checker.py +++ b/tools/submission/submission_checker.py @@ -957,6 +957,7 @@ def requires_equal_issue(self, model, division): "llama2-70b-99.9", "mixtral-8x7b", "llama3-405b", + "rgat", ] and self.version not in ["v4.0", "v4.1"] ) From b3e1e8e636908a6989a3d04c4c09c21b756f3a4a Mon Sep 17 00:00:00 2001 From: mrmhodak Date: Wed, 18 Dec 2024 18:15:47 +0000 Subject: [PATCH 05/10] Increment version to 5.0.3 --- loadgen/VERSION.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/loadgen/VERSION.txt b/loadgen/VERSION.txt index a1ef0cae1..50e2274e6 100644 --- a/loadgen/VERSION.txt +++ b/loadgen/VERSION.txt @@ -1 +1 @@ -5.0.2 +5.0.3 From 8397bec7447afc2eba5a0b630594981cc4dfed27 Mon Sep 17 00:00:00 2001 From: Arjun Suresh Date: Wed, 18 Dec 2024 18:17:07 +0000 Subject: [PATCH 06/10] Docs update for r-gat (#1969) * Fixes #1648, restrict loadgen uncommitted error message to within the loadgen directory * Update test-rnnt.yml (#1688) Stopping the github action for rnnt * Added docs init Added github action for website publish Update benchmark documentation Update publish.yaml Update publish.yaml Update benchmark documentation Improved the submission documentation Fix taskname Removed unused images * Fix benchmark URLs * Fix links * Add _full variation to run commands * Added script flow diagram * Added docker setup command for CM, extra run options * Added support for docker options in the docs * Added --quiet to the CM run_cmds in docs * Fix the test query count for cm commands * Support ctuning-cpp implementation * Added commands for mobilenet models * Docs cleanup * Docs cleanup * Added separate files for dataset and models in the docs * Remove redundant tab in the docs * Fixes some WIP models in the docs * Use the official docs page for CM installation * Fix the deadlink in docs * Fix indendation issue in docs * Added dockerinfo for nvidia implementation * Added run options for gptj * Added execution environment tabs * Cleanup of the docs * Cleanup of the docs * Reordered the sections of the docs page * Removed an unnecessary heading in the docs * Fixes the commands for datacenter * Fix the build --sdist for loadgen * Fixes #1761, llama2 and mixtral runtime error on CPU systems * Added mixtral to the benchmark list, improved benchmark docs * Update docs for MLPerf inference v4.1 * Update docs for MLPerf inference v4.1 * Fix typo * Gave direct link to implementation readmes * Added tables detailing implementations * Update vision README.md, split the frameworks into separate rows * Update README.md * pointed links to specific frameworks * pointed links to specific frameworks * Update Submission_Guidelines.md * Update Submission_Guidelines.md * Update Submission_Guidelines.md * api support llama2 * Added request 
module and reduced max token len * Fix for llama2 api server * Update SUT_API offline to work for OpenAI * Update SUT_API.py * Minor fixes * Fix json import in SUT_API.py * Fix llama2 token length * Added model name verification with server * clean temp files * support num_workers in LLAMA2 SUTs * Remove batching from Offline SUT_API.py * Update SUT_API.py * Minor fixes for llama2 API * Fix for llama2 API * removed table of contents * enabled llama2-nvidia + vllm-NM : WIP * enabled dlrm for intel * lower cased implementation * added raw data input * corrected data download commands * renamed filename * changes for bert and vllm * documentation to work on custom repo and branch * benchmark index page update * enabled sdxl for nvidia and intel * updated vllm server run cmd * benchmark page information addition * fix indendation issue * Added submission categories * update submission page - generate submission with or w/o using CM for benchmarking * Updated kits dataset documentation * Updated model parameters * updation of information * updated non cm based benchmark * added info about hf password * added links to model and access tokens * Updated reference results structuree tree * submission docs cleanup * Some cleanups for benchmark info * Some cleanups for benchmark info * Some cleanups for benchmark info * added generic stubs deepsparse * Some cleanups for benchmark info * Some cleanups for benchmark info * Some cleanups for benchmark info * Some cleanups for benchmark info (FID and CLIP data added) * typo fix for bert deepsparse framework * added min system requirements for models * fixed code version * changes for displaying reference and intel implementation tip * added reference to installation page * updated neural magic documentation * Added links to the install page, redirect benchmarks page * added tips about batch size and dataset for nvidia llama2 * fix conditions logic * modified tips and additional run cmds * sentence corrections * Minor fix for the documentation * fixed bug in deepsparse generic model stubs + styling * added more information to stubs * Added SCC24 readme, support reproducibility in the docs * Made clear the custom CM repo URL format * Support conditional implementation, setup and run tips * Support rocm for sdxl * Fix _short tag support * Fix install URL * Expose bfloat16 and float16 options for sdxl * Expose download model to host option for sdxl * IndySCC24 documentation added * Improve the SCC24 docs * Improve the support of short variation * Improved the indyscc24 documentation * Updated scc run commands * removed test_query_count option for scc * Remove scc24 in the main docs * Remove scc24 in the main docs * Fix docs: indendation issue on the submission page * generalised code for skipping test query count * Fixes for SCC24 docs * Fix scenario text in main.py * Fix links for scc24 * Fix links for scc24 * Improve the general docs * Fix links for scc24 * Use float16 in scc24 doc * Improve scc24 docs * Improve scc24 docs * Use float16 in scc24 doc * fixed command bug * Fix typo in docs * Fix typo in docs * Remove unnecessary indendation in docs * initial commit for tip - native run CUDA * Updated tip * added docker_cm_repo_branch to more run option - docker * Update docs for IndySCC24 * Support custom repo branch and owner for final report generation * enabled amd implementation for llama2 * updations for amd - docs * Fix scenarios in docs page * formatted the files to pass the gh action * scenarios -> fixed_scenarios in docs * [Automated Commit] Format 
Codebase * Update indyscc24-bert.md * Update scc24.md * updated tip for reference implementation (#1912) * [Automated Commit] Format Codebase * fix for run suffix (#1913) * [Automated Commit] Format Codebase * Updation for adding submission flow diagram * Added submission flow diagram * Update scc24.md * changes in submission documentation (#1946) * update results category (#1947) * changes for adding rgat to docs (#1965) * Update index.md | Added R-GAT details (WIP) * Update index.md * Create system_requirements.yml * Update system_requirements.yml * Update system_requirements.yml * Update system_requirements.yml --------- Co-authored-by: anandhu-eng Co-authored-by: ANANDHU S <71482562+anandhu-eng@users.noreply.github.com> Co-authored-by: Michael Goin Co-authored-by: arjunsuresh Co-authored-by: Pablo Gonzalez Co-authored-by: Mitchelle Rasquinha <80070689+mrasquinha-g@users.noreply.github.com> Co-authored-by: Miro --- docs/benchmarks/graph/get-rgat-data.md | 39 ++++++ docs/benchmarks/graph/rgat.md | 13 ++ docs/index.md | 19 ++- docs/submission/index.md | 160 +++++++++++++------------ docs/system_requirements.yml | 50 ++++++++ main.py | 9 +- mkdocs.yml | 2 + 7 files changed, 211 insertions(+), 81 deletions(-) create mode 100644 docs/benchmarks/graph/get-rgat-data.md create mode 100644 docs/benchmarks/graph/rgat.md create mode 100644 docs/system_requirements.yml diff --git a/docs/benchmarks/graph/get-rgat-data.md b/docs/benchmarks/graph/get-rgat-data.md new file mode 100644 index 000000000..189c25b87 --- /dev/null +++ b/docs/benchmarks/graph/get-rgat-data.md @@ -0,0 +1,39 @@ +--- +hide: + - toc +--- + +# Graph Neural Network using R-GAT + +## Dataset + +The benchmark implementation run command will automatically download the validation and calibration datasets and do the necessary preprocessing. In case you want to download only the datasets, you can use the below commands. + +=== "Full Dataset" + R-GAT validation run uses the IGBH dataset consisting of 547,306,935 nodes and 5,812,005,639 edges. + + ### Get Full Dataset + ``` + cm run script --tags=get,dataset,igbh,_full -j + ``` + +=== "Debug Dataset" + R-GAT debug run uses the IGBH debug dataset (tiny). + + ### Get Debug Dataset + ``` + cm run script --tags=get,dataset,igbh,_debug -j + ``` + +## Model +The benchmark implementation run command will automatically download the required model and do the necessary conversions. In case you want to only download the official model, you can use the below commands. + +Get the Official MLPerf R-GAT Model + +=== "PyTorch" + + ### PyTorch + ``` + cm run script --tags=get,ml-model,rgat -j + ``` + diff --git a/docs/benchmarks/graph/rgat.md b/docs/benchmarks/graph/rgat.md new file mode 100644 index 000000000..ffff467a4 --- /dev/null +++ b/docs/benchmarks/graph/rgat.md @@ -0,0 +1,13 @@ +--- +hide: + - toc +--- + + +# Graph Neural Network using R-GAT + + +=== "MLCommons-Python" + ## MLPerf Reference Implementation in Python + +{{ mlperf_inference_implementation_readme (4, "rgat", "reference", devices = ["CPU", "CUDA"]) }} \ No newline at end of file diff --git a/docs/index.md b/docs/index.md index 11f2a52c2..b46d4c274 100644 --- a/docs/index.md +++ b/docs/index.md @@ -1,7 +1,7 @@ # MLPerf Inference Benchmarks ## Overview -The currently valid [MLPerf Inference Benchmarks](index_gh.md) as of MLPerf inference v4.0 round are listed below, categorized by tasks. Under each model you can find its details like the dataset used, reference accuracy, server latency constraints etc. 
+The currently valid [MLPerf Inference Benchmarks](index_gh.md) as of MLPerf inference v5.0 round are listed below, categorized by tasks. Under each model you can find its details like the dataset used, reference accuracy, server latency constraints etc. --- @@ -80,7 +80,7 @@ The currently valid [MLPerf Inference Benchmarks](index_gh.md) as of MLPerf infe - **Server Scenario Latency Constraint**: 130ms - **Equal Issue mode**: False - **High accuracy variant**: yes -- **Submission Category**: Datacenter, Edge +- **Submission Category**: Edge #### [LLAMA2-70B](benchmarks/language/llama2-70b.md) - **Dataset**: OpenORCA (GPT-4 split, max_seq_len=1024) @@ -157,11 +157,22 @@ The currently valid [MLPerf Inference Benchmarks](index_gh.md) as of MLPerf infe - **High accuracy variant**: Yes - **Submission Category**: Datacenter +## Graph Neural Networks +### [R-GAT](benchmarks/graph/rgat.md) +- **Dataset**: Illinois Graph Benchmark Heterogeneous validation dataset + - **Dataset Size**: 788,379 + - **QSL Size**: 788,379 +- **Number of Parameters**: +- **Reference Model Accuracy**: ACC = ? +- **Server Scenario Latency Constraint**: N/A +- **Equal Issue mode**: True +- **High accuracy variant**: No +- **Submission Category**: Datacenter --- ## Submission Categories -- **Datacenter Category**: All the current inference benchmarks are applicable to the datacenter category. -- **Edge Category**: All benchmarks except DLRMv2, LLAMA2-70B, and Mixtral-8x7B are applicable to the edge category. +- **Datacenter Category**: All benchmarks except bert are applicable to the datacenter category for inference v5.0. +- **Edge Category**: All benchmarks except DLRMv2, LLAMA2-70B, Mixtral-8x7B and R-GAT are applicable to the edge category for v5.0. ## High Accuracy Variants - **Benchmarks**: `bert`, `llama2-70b`, `gpt-j`, `dlrm_v2`, and `3d-unet` have a normal accuracy variant as well as a high accuracy variant. diff --git a/docs/submission/index.md b/docs/submission/index.md index c99802420..1050f5fb0 100644 --- a/docs/submission/index.md +++ b/docs/submission/index.md @@ -13,13 +13,15 @@ hide: Click [here](https://youtu.be/eI1Hoecc3ho) to view the recording of the workshop: Streamlining your MLPerf Inference results using CM. +Click [here](https://docs.google.com/presentation/d/1cmbpZUpVr78EIrhzyMBnnWnjJrD-mZ2vmSb-yETkTA8/edit?usp=sharing) to view the proposal slide for Common Automation for MLPerf Inference Submission Generation through CM. + -=== "CM based benchmark" +=== "CM based results" If you have followed the `cm run` commands under the individual model pages in the [benchmarks](../index.md) directory, all the valid results will get aggregated to the `cm cache` folder. The following command could be used to browse the structure of inference results folder generated by CM. ### Get results folder structure ```bash cm find cache --tags=get,mlperf,inference,results,dir | xargs tree ``` -=== "Non CM based benchmark" +=== "Non CM based results" If you have not followed the `cm run` commands under the individual model pages in the [benchmarks](../index.md) directory, please make sure that the result directory is structured in the following way. 
``` └── System description ID(SUT Name) @@ -35,18 +37,20 @@ Click [here](https://youtu.be/eI1Hoecc3ho) to view the recording of the workshop | ├── mlperf_log_detail.txt | ├── mlperf_log_accuracy.json | └── accuracy.txt - └── Compliance_Test_ID - ├── Performance - | └── run_x/#1 run for all scenarios - | ├── mlperf_log_summary.txt - | └── mlperf_log_detail.txt - ├── Accuracy - | ├── baseline_accuracy.txt - | ├── compliance_accuracy.txt - | ├── mlperf_log_accuracy.json - | └── accuracy.txt - ├── verify_performance.txt - └── verify_accuracy.txt #for TEST01 only + |── Compliance_Test_ID + | ├── Performance + | | └── run_x/#1 run for all scenarios + | | ├── mlperf_log_summary.txt + | | └── mlperf_log_detail.txt + | ├── Accuracy + | | ├── baseline_accuracy.txt + | | ├── compliance_accuracy.txt + | | ├── mlperf_log_accuracy.json + | | └── accuracy.txt + | ├── verify_performance.txt + | └── verify_accuracy.txt #for TEST01 only + |── user.conf + └── measurements.json ```
@@ -67,67 +71,69 @@ Once all the results across all the models are ready you can use the following c ## Generate actual submission tree -=== "Closed Edge" - ### Closed Edge Submission - ```bash - cm run script --tags=generate,inference,submission \ - --clean \ - --preprocess_submission=yes \ - --run-checker \ - --submitter=MLCommons \ - --tar=yes \ - --env.CM_TAR_OUTFILE=submission.tar.gz \ - --division=closed \ - --category=edge \ - --env.CM_DETERMINE_MEMORY_CONFIGURATION=yes \ - --quiet - ``` - -=== "Closed Datacenter" - ### Closed Datacenter Submission - ```bash - cm run script --tags=generate,inference,submission \ - --clean \ - --preprocess_submission=yes \ - --run-checker \ - --submitter=MLCommons \ - --tar=yes \ - --env.CM_TAR_OUTFILE=submission.tar.gz \ - --division=closed \ - --category=datacenter \ - --env.CM_DETERMINE_MEMORY_CONFIGURATION=yes \ - --quiet - ``` -=== "Open Edge" - ### Open Edge Submission - ```bash - cm run script --tags=generate,inference,submission \ - --clean \ - --preprocess_submission=yes \ - --run-checker \ - --submitter=MLCommons \ - --tar=yes \ - --env.CM_TAR_OUTFILE=submission.tar.gz \ - --division=open \ - --category=edge \ - --env.CM_DETERMINE_MEMORY_CONFIGURATION=yes \ - --quiet - ``` -=== "Open Datacenter" - ### Closed Datacenter Submission - ```bash - cm run script --tags=generate,inference,submission \ - --clean \ - --preprocess_submission=yes \ - --run-checker \ - --submitter=MLCommons \ - --tar=yes \ - --env.CM_TAR_OUTFILE=submission.tar.gz \ - --division=open \ - --category=datacenter \ - --env.CM_DETERMINE_MEMORY_CONFIGURATION=yes \ - --quiet - ``` +=== "Docker run" + ### Docker run + === "Closed" + ### Closed Submission + ```bash + cm docker script --tags=generate,inference,submission \ + --clean \ + --preprocess_submission=yes \ + --run-checker \ + --submitter=MLCommons \ + --tar=yes \ + --env.CM_TAR_OUTFILE=submission.tar.gz \ + --division=closed \ + --env.CM_DETERMINE_MEMORY_CONFIGURATION=yes \ + --quiet + ``` + + === "Open" + ### Open Submission + ```bash + cm docker script --tags=generate,inference,submission \ + --clean \ + --preprocess_submission=yes \ + --run-checker \ + --submitter=MLCommons \ + --tar=yes \ + --env.CM_TAR_OUTFILE=submission.tar.gz \ + --division=open \ + --env.CM_DETERMINE_MEMORY_CONFIGURATION=yes \ + --quiet + ``` + +=== "Native run" + ### Native run + === "Closed" + ### Closed Submission + ```bash + cm run script --tags=generate,inference,submission \ + --clean \ + --preprocess_submission=yes \ + --run-checker \ + --submitter=MLCommons \ + --tar=yes \ + --env.CM_TAR_OUTFILE=submission.tar.gz \ + --division=closed \ + --env.CM_DETERMINE_MEMORY_CONFIGURATION=yes \ + --quiet + ``` + + === "Open" + ### Open Submission + ```bash + cm run script --tags=generate,inference,submission \ + --clean \ + --preprocess_submission=yes \ + --run-checker \ + --submitter=MLCommons \ + --tar=yes \ + --env.CM_TAR_OUTFILE=submission.tar.gz \ + --division=open \ + --env.CM_DETERMINE_MEMORY_CONFIGURATION=yes \ + --quiet + ``` * Use `--hw_name="My system name"` to give a meaningful system name. Examples can be seen [here](https://github.com/mlcommons/inference_results_v3.0/tree/main/open/cTuning/systems) @@ -137,6 +143,10 @@ Once all the results across all the models are ready you can use the following c * Use `--results_dir` option to specify the results folder for Non CM based benchmarks +* Use `--category` option to specify the category for which submission is generated(datacenter/edge). 
By default, the category is taken from `system_meta.json` file located in the SUT root directory. + +* Use `--submission_base_dir` to specify the directory to which outputs from preprocess submission script and final submission is to be dumped. No need to provide `--submission_dir` along with this. For `docker run`, use `--submission_base_dir` instead of `--submission_dir`. + The above command should generate "submission.tar.gz" if there are no submission checker issues and you can upload it to the [MLCommons Submission UI](https://submissions-ui.mlcommons.org/submission). ## Aggregate Results in GitHub diff --git a/docs/system_requirements.yml b/docs/system_requirements.yml new file mode 100644 index 000000000..5dfec202a --- /dev/null +++ b/docs/system_requirements.yml @@ -0,0 +1,50 @@ +# All memory requirements in GB +resnet: + reference: + fp32: + system_memory: 8 + accelerator_memory: 4 + disk_storage: 25 + nvidia: + int8: + system_memory: 8 + accelerator_memory: 4 + disk_storage: 100 + intel: + int8: + system_memory: 8 + accelerator_memory: 0 + disk_storage: 50 + qualcomm: + int8: + system_memory: 8 + accelerator_memory: 8 + disk_storage: 50 +retinanet: + reference: + fp32: + system_memory: 8 + accelerator_memory: 8 + disk_storage: 200 + nvidia: + int8: + system_memory: 8 + accelerator_memory: 8 + disk_storage: 200 + intel: + int8: + system_memory: 8 + accelerator_memory: 0 + disk_storage: 200 + qualcomm: + int8: + system_memory: 8 + accelerator_memory: 8 + disk_storage: 200 +rgat: + reference: + fp32: + system_memory: 768 + accelerator_memory: 8 + disk_storage: 2300 + diff --git a/main.py b/main.py index c8c64b8c3..c5b22a705 100755 --- a/main.py +++ b/main.py @@ -239,7 +239,8 @@ def mlperf_inference_implementation_readme( common_info = get_common_info( spaces + 16, - implementation + implementation, + model.lower() ) if ( @@ -488,7 +489,7 @@ def get_venv_command(spaces): # contains run command information which is common to both docker and # native runs - def get_common_info(spaces, implementation): + def get_common_info(spaces, implementation, model): info = "" pre_space = "" for i in range(1, spaces): @@ -496,7 +497,11 @@ def get_common_info(spaces, implementation): pre_space += " " # pre_space = " " info += f"\n{pre_space}!!! tip\n\n" + info += f"{pre_space} - Number of threads could be adjusted using `--threads=#`, where `#` is the desired number of threads. This option works only if the implementation in use supports threading.\n\n" info += f"{pre_space} - Batch size could be adjusted using `--batch_size=#`, where `#` is the desired batch size. This option works only if the implementation in use is supporting the given batch size.\n\n" + if model == "rgat": + info += f"{pre_space} - Add `--env.CM_DATASET_IGBH_PATH=` if you have already downloaded the dataset. The path will be automatically mounted when using docker run.\n\n" + info += f"{pre_space} - Add `--env.CM_ML_MODEL_RGAT_CHECKPOINT_PATH=` if you have already downloaded the model. 
The path will be automatically mounted when using docker run.\n\n" if implementation.lower() == "reference": info += f"{pre_space} - Add `--adr.mlperf-implementation.tags=_branch.master,_repo.` if you are modifying the official MLPerf Inference implementation in a custom fork.\n\n" info += f"{pre_space} - Add `--adr.inference-src.tags=_repo.` if you are modifying the model config accuracy script in the submission checker within a custom fork.\n\n" diff --git a/mkdocs.yml b/mkdocs.yml index 95dfb6e86..96bcfb758 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -42,6 +42,8 @@ nav: - MIXTRAL-8x7B: benchmarks/language/mixtral-8x7b.md - Recommendation: - DLRM-v2: benchmarks/recommendation/dlrm-v2.md + - Graph Neural Networks: + - R-GAT: benchmarks/graph/rgat.md - Install CM: - install/index.md - Submission: From 647f9f84ff91394eb865ed9eaf5de688a1d37448 Mon Sep 17 00:00:00 2001 From: mlcommons-bot Date: Wed, 18 Dec 2024 18:17:31 +0000 Subject: [PATCH 07/10] [Automated Commit] Format Codebase --- main.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/main.py b/main.py index c5b22a705..6a34587dd 100755 --- a/main.py +++ b/main.py @@ -501,7 +501,7 @@ def get_common_info(spaces, implementation, model): info += f"{pre_space} - Batch size could be adjusted using `--batch_size=#`, where `#` is the desired batch size. This option works only if the implementation in use is supporting the given batch size.\n\n" if model == "rgat": info += f"{pre_space} - Add `--env.CM_DATASET_IGBH_PATH=` if you have already downloaded the dataset. The path will be automatically mounted when using docker run.\n\n" - info += f"{pre_space} - Add `--env.CM_ML_MODEL_RGAT_CHECKPOINT_PATH=` if you have already downloaded the model. The path will be automatically mounted when using docker run.\n\n" + info += f"{pre_space} - Add `--env.CM_ML_MODEL_RGAT_CHECKPOINT_PATH=` if you have already downloaded the model. The path will be automatically mounted when using docker run.\n\n" if implementation.lower() == "reference": info += f"{pre_space} - Add `--adr.mlperf-implementation.tags=_branch.master,_repo.` if you are modifying the official MLPerf Inference implementation in a custom fork.\n\n" info += f"{pre_space} - Add `--adr.inference-src.tags=_repo.` if you are modifying the model config accuracy script in the submission checker within a custom fork.\n\n" From e6069aa91c2d8e75f62b00fbdd566b399e77f3f8 Mon Sep 17 00:00:00 2001 From: ANANDHU S <71482562+anandhu-eng@users.noreply.github.com> Date: Wed, 18 Dec 2024 23:48:11 +0530 Subject: [PATCH 08/10] Update automated run command section - R-GAT (#1970) * Update automated run command section * add cm commands for model and dataset downloads * Update README.md * Update cm run cmds --------- Co-authored-by: Miro --- graph/R-GAT/README.md | 53 ++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 52 insertions(+), 1 deletion(-) diff --git a/graph/R-GAT/README.md b/graph/R-GAT/README.md index 69883c0d1..fbfca4709 100644 --- a/graph/R-GAT/README.md +++ b/graph/R-GAT/README.md @@ -19,7 +19,7 @@ This is the reference implementation for MLPerf Inference Graph Neural Network. 
## Automated command to run the benchmark via MLCommons CM -TODO +Please check the official inference documentation [here](https://docs.mlcommons.org/inference/benchmarks/graph/rgat/) ## Setup Set the following helper variables @@ -95,6 +95,12 @@ You can then navigate in the terminal to your desired download directory and run rclone copy mlc-inference:mlcommons-inference-wg-public/R-GAT/RGAT.pt $MODEL_PATH -P ``` +### Download model through CM (Collective Mind) + +``` +cm run script --tags=get,ml-model,rgat -j +``` + ### Download and setup dataset #### Debug Dataset @@ -110,6 +116,10 @@ cd $GRAPH_FOLDER python3 tools/split_seeds.py --path igbh --dataset_size tiny ``` +**CM Command** +``` +cm run script --tags=get,dataset,igbh,_debug -j +``` #### Full Dataset **Warning:** This script will download 2.2TB of data @@ -124,6 +134,11 @@ cd $GRAPH_FOLDER python3 tools/split_seeds.py --path igbh --dataset_size full ``` +**CM Command** +``` +cm run script --tags=get,dataset,igbh,_full -j +``` + #### Calibration dataset @@ -140,6 +155,21 @@ cd $GRAPH_FOLDER python3 main.py --dataset igbh-dgl-tiny --dataset-path igbh/ --profile debug-dgl [--model-path ] [--in-memory] [--device ] [--dtype ] [--scenario ] ``` +##### Debug Run using CM +``` +cm run script --tags=run-mlperf,inference,_submission,_short,_r5.0-dev \ + --model=rgat \ + --implementation=reference \ + --framework=pytorch \ + --category=edge \ + --scenario=Offline \ + --execution_mode=test \ + --device=<cpu or cuda> \ + --quiet \ + --test_query_count=10 \ + --docker +``` + #### Local run ```bash # Go to the benchmark folder cd $GRAPH_FOLDER @@ -148,6 +178,27 @@ cd $GRAPH_FOLDER # Run the benchmark DGL python3 main.py --dataset igbh-dgl --dataset-path igbh/ --profile rgat-dgl-full [--model-path ] [--in-memory] [--device ] [--dtype ] [--scenario ] ``` + +##### Local Run using CM +``` +cm run script --tags=run-mlperf,inference,_submission,_full,_r5.0-dev \ + --model=rgat \ + --implementation=reference \ + --framework=pytorch \ + --category=edge \ + --scenario=Offline \ + --execution_mode=test \ + --device=<cpu or cuda> \ + --quiet \ + --test_query_count=10 \ + --docker +``` + +- Number of threads could be adjusted using `--threads=#`, where # is the desired number of threads. This option works only if the implementation in use supports threading. - Batch size could be adjusted using `--batch_size=#`, where # is the desired batch size. This option works only if the implementation in use is supporting the given batch size. - Add `--env.CM_DATASET_IGBH_PATH=` if you have already downloaded the dataset. The path will be automatically mounted when using docker run. - Add `--env.CM_ML_MODEL_RGAT_CHECKPOINT_PATH=` if you have already downloaded the model. The path will be automatically mounted when using docker run. 
+ #### Run using docker Not implemented yet From 00945c3b0378a48fc385a0fd2851f02f40f49863 Mon Sep 17 00:00:00 2001 From: Pablo Gonzalez Date: Thu, 19 Dec 2024 01:36:42 -0500 Subject: [PATCH 09/10] Unify llama3 names to llama3.1-405b (#1982) * Unify llama3 names to llama3.1-405b * Set mlperf.conf name to llama3_1-405b --- .../{llama3-405b => llama3.1-405b}/Dockerfile | 2 +- .../{llama3-405b => llama3.1-405b}/README.md | 20 +++++------ .../SUT_VLLM.py | 0 .../{llama3-405b => llama3.1-405b}/build.sh | 0 .../{llama3-405b => llama3.1-405b}/dataset.py | 0 .../evaluate-accuracy.py | 2 +- .../launch_docker.sh | 0 .../{llama3-405b => llama3.1-405b}/main.py | 4 +-- .../requirements.txt | 0 .../run_accuracy.sh | 0 .../run_offline.sh | 0 .../run_server.sh | 0 .../{llama3-405b => llama3.1-405b}/user.conf | 2 +- .../with_the_same_user | 0 loadgen/mlperf.conf | 16 ++++----- tools/submission/generate_final_report.py | 2 +- tools/submission/submission_checker.py | 35 ++++++++++--------- 17 files changed, 42 insertions(+), 41 deletions(-) rename language/{llama3-405b => llama3.1-405b}/Dockerfile (97%) rename language/{llama3-405b => llama3.1-405b}/README.md (87%) rename language/{llama3-405b => llama3.1-405b}/SUT_VLLM.py (100%) rename language/{llama3-405b => llama3.1-405b}/build.sh (100%) rename language/{llama3-405b => llama3.1-405b}/dataset.py (100%) rename language/{llama3-405b => llama3.1-405b}/evaluate-accuracy.py (98%) rename language/{llama3-405b => llama3.1-405b}/launch_docker.sh (100%) rename language/{llama3-405b => llama3.1-405b}/main.py (97%) rename language/{llama3-405b => llama3.1-405b}/requirements.txt (100%) rename language/{llama3-405b => llama3.1-405b}/run_accuracy.sh (100%) rename language/{llama3-405b => llama3.1-405b}/run_offline.sh (100%) rename language/{llama3-405b => llama3.1-405b}/run_server.sh (100%) rename language/{llama3-405b => llama3.1-405b}/user.conf (87%) rename language/{llama3-405b => llama3.1-405b}/with_the_same_user (100%) diff --git a/language/llama3-405b/Dockerfile b/language/llama3.1-405b/Dockerfile similarity index 97% rename from language/llama3-405b/Dockerfile rename to language/llama3.1-405b/Dockerfile index 67edcc46b..14d0a202d 100644 --- a/language/llama3-405b/Dockerfile +++ b/language/llama3.1-405b/Dockerfile @@ -44,7 +44,7 @@ WORKDIR /tmp RUN wget https://repo.anaconda.com/miniconda/Miniconda3-py310_23.5.2-0-Linux-x86_64.sh \ && bash Miniconda3-* -b -p /opt/miniconda3 ENV PATH="$PATH:/opt/miniconda3/bin" -RUN conda create -n llama3-405b python=3.10 +RUN conda create -n llama3.1-405b python=3.10 RUN chmod -R 777 /opt/miniconda3 # Set the env variable for vLLM diff --git a/language/llama3-405b/README.md b/language/llama3.1-405b/README.md similarity index 87% rename from language/llama3-405b/README.md rename to language/llama3.1-405b/README.md index 8df2a81f1..d1dd5ad4f 100644 --- a/language/llama3-405b/README.md +++ b/language/llama3.1-405b/README.md @@ -1,13 +1,13 @@ -# Reference Implementation for llama3-405b +# Reference Implementation for llama3.1-405b -**Basic implementation for llama3-405b. Few noteworthy items:** +**Basic implementation for llama3.1-405b. Few noteworthy items:** + Streamer for communicating with loadgen has quite some overhead. This is only meant to provide functional implementation + For custom/optimized implementations of this benchmark it is important to include the : - For server scenario, it is necessary to call `lg.FirstTokenComplete(response)` for each query. 
This way the first token will be reported and its latency will be measured. - For all scenarios, when calling `lg.QuerySamplesComplete(response)`, it is necessary that each of the elements in response is a `lg.QuerySampleResponse` that contains the number of tokens (can be created this way: `lg.QuerySampleResponse(qitem.id, bi[0], bi[1], n_tokens)`). The number of tokens reported should match the number of tokens in your answer and this will be checked in [TEST06](../../compliance/nvidia/TEST06/) -Please see the [new docs site](https://docs.mlcommons.org/inference/benchmarks/language/llama3-405b) for an automated way to run this benchmark across different available implementations and do an end-to-end submission with or without docker. +Please see the [new docs site](https://docs.mlcommons.org/inference/benchmarks/language/llama3.1-405b) for an automated way to run this benchmark across different available implementations and do an end-to-end submission with or without docker. ## Prepare environment @@ -33,9 +33,9 @@ rm ~/miniconda3/miniconda.sh - Set the following helper variables ```bash export ROOT=$PWD/inference -export LLAMA_FOLDER=$PWD/inference/language/llama3-405b +export LLAMA_FOLDER=$PWD/inference/language/llama3.1-405b export LOADGEN_FOLDER=$PWD/inference/loadgen -export DATASET_FOLDER=$PWD/inference/language/llama3-405b/dataset +export DATASET_FOLDER=$PWD/inference/language/llama3.1-405b/dataset ``` - Clone the inference repository: @@ -46,8 +46,8 @@ git clone --recurse-submodules https://github.com/mlcommons/inference.git \ - Create a conda environment: ```bash -conda create -y -n llama3-405b python=3.10 -conda activate llama3-405b +conda create -y -n llama3.1-405b python=3.10 +conda activate llama3.1-405b conda install -y -c conda-forge libstdcxx-ng=12 ``` @@ -100,7 +100,7 @@ TODO: Host model and grant access to submitters ### External Download -+ First go to [llama3-request-link](https://ai.meta.com/resources/models-and-libraries/llama-downloads/) and make a request, sign in to HuggingFace (if you don't have account, you'll need to create one). **Please note your authentication credentials** as you may be required to provide them when cloning below. ++ First go to [llama3.1-request-link](https://ai.meta.com/resources/models-and-libraries/llama-downloads/) and make a request, sign in to HuggingFace (if you don't have an account, you'll need to create one). **Please note your authentication credentials** as you may be required to provide them when cloning below.
+ Requires Git Large Files Storage ``` export CHECKPOINT_PATH=Meta-Llama-3.1-405B-Instruct @@ -127,13 +127,13 @@ rclone config create mlc-inference s3 provider=Cloudflare access_key_id=f65ba5ee You can then navigate in the terminal to your desired download directory and run the following command to download the dataset: ``` -rclone copy mlc-inference:mlcommons-inference-wg-public/llama3_405b/mlperf_llama3.1_405b_dataset_8313_processed_fp16_eval.pkl ./ -P +rclone copy mlc-inference:mlcommons-inference-wg-public/llama3.1_405b/mlperf_llama3.1_405b_dataset_8313_processed_fp16_eval.pkl ./ -P ``` You can also download the calibration dataset from the Cloudflare R2 bucket by running the following command: ``` -rclone copy mlc-inference:mlcommons-inference-wg-public/llama3_405b/mlperf_llama3.1_405b_calibration_dataset_512_processed_fp16_eval.pkl ./ -P +rclone copy mlc-inference:mlcommons-inference-wg-public/llama3.1_405b/mlperf_llama3.1_405b_calibration_dataset_512_processed_fp16_eval.pkl ./ -P ``` ## Run Performance Benchmarks diff --git a/language/llama3-405b/SUT_VLLM.py b/language/llama3.1-405b/SUT_VLLM.py similarity index 100% rename from language/llama3-405b/SUT_VLLM.py rename to language/llama3.1-405b/SUT_VLLM.py diff --git a/language/llama3-405b/build.sh b/language/llama3.1-405b/build.sh similarity index 100% rename from language/llama3-405b/build.sh rename to language/llama3.1-405b/build.sh diff --git a/language/llama3-405b/dataset.py b/language/llama3.1-405b/dataset.py similarity index 100% rename from language/llama3-405b/dataset.py rename to language/llama3.1-405b/dataset.py diff --git a/language/llama3-405b/evaluate-accuracy.py b/language/llama3.1-405b/evaluate-accuracy.py similarity index 98% rename from language/llama3-405b/evaluate-accuracy.py rename to language/llama3.1-405b/evaluate-accuracy.py index f5677820e..7c803e1ca 100644 --- a/language/llama3-405b/evaluate-accuracy.py +++ b/language/llama3.1-405b/evaluate-accuracy.py @@ -15,7 +15,7 @@ def get_args(): parser.add_argument( "--checkpoint-path", default="meta-llama/Meta-Llama-3-8B", - help="Path to Llama3-405b-hf-chat checkpoint" + help="Path to Llama3.1-405b-hf-chat checkpoint" ) parser.add_argument( "--mlperf-accuracy-file", required=True, help="path to mlperf_log_accuracy.json" diff --git a/language/llama3-405b/launch_docker.sh b/language/llama3.1-405b/launch_docker.sh similarity index 100% rename from language/llama3-405b/launch_docker.sh rename to language/llama3.1-405b/launch_docker.sh diff --git a/language/llama3-405b/main.py b/language/llama3.1-405b/main.py similarity index 97% rename from language/llama3-405b/main.py rename to language/llama3.1-405b/main.py index f7802687e..32f80060c 100644 --- a/language/llama3-405b/main.py +++ b/language/llama3.1-405b/main.py @@ -136,8 +136,8 @@ def main(): settings = lg.TestSettings() settings.scenario = scenario_map[args.scenario.lower()] # mlperf.conf is automatically loaded by the loadgen - # settings.FromConfig(args.mlperf_conf, "llama3-405b", args.scenario) - settings.FromConfig(args.user_conf, "llama3-405b", args.scenario) + # settings.FromConfig(args.mlperf_conf, "llama3_1-405b", args.scenario) + settings.FromConfig(args.user_conf, "llama3_1-405b", args.scenario) if args.accuracy: settings.mode = lg.TestMode.AccuracyOnly diff --git a/language/llama3-405b/requirements.txt b/language/llama3.1-405b/requirements.txt similarity index 100% rename from language/llama3-405b/requirements.txt rename to language/llama3.1-405b/requirements.txt diff --git 
a/language/llama3-405b/run_accuracy.sh b/language/llama3.1-405b/run_accuracy.sh similarity index 100% rename from language/llama3-405b/run_accuracy.sh rename to language/llama3.1-405b/run_accuracy.sh diff --git a/language/llama3-405b/run_offline.sh b/language/llama3.1-405b/run_offline.sh similarity index 100% rename from language/llama3-405b/run_offline.sh rename to language/llama3.1-405b/run_offline.sh diff --git a/language/llama3-405b/run_server.sh b/language/llama3.1-405b/run_server.sh similarity index 100% rename from language/llama3-405b/run_server.sh rename to language/llama3.1-405b/run_server.sh diff --git a/language/llama3-405b/user.conf b/language/llama3.1-405b/user.conf similarity index 87% rename from language/llama3-405b/user.conf rename to language/llama3.1-405b/user.conf index 9f4eb5f9a..30681302c 100644 --- a/language/llama3-405b/user.conf +++ b/language/llama3.1-405b/user.conf @@ -10,4 +10,4 @@ *.Server.min_duration = 120000 *.Server.min_query_count = 100 -llama3-405b.Server.sample_concatenate_permutation = 1 \ No newline at end of file +llama3_1-405b.Server.sample_concatenate_permutation = 1 \ No newline at end of file diff --git a/language/llama3-405b/with_the_same_user b/language/llama3.1-405b/with_the_same_user similarity index 100% rename from language/llama3-405b/with_the_same_user rename to language/llama3.1-405b/with_the_same_user diff --git a/loadgen/mlperf.conf b/loadgen/mlperf.conf index 95cc08351..1d036f4b4 100644 --- a/loadgen/mlperf.conf +++ b/loadgen/mlperf.conf @@ -14,7 +14,7 @@ dlrm-v2.*.performance_sample_count_override = 204800 rnnt.*.performance_sample_count_override = 2513 gptj.*.performance_sample_count_override = 13368 llama2-70b.*.performance_sample_count_override = 24576 -llama3-405b.*.performance_sample_count_override = 8313 +llama3_1-405b.*.performance_sample_count_override = 8313 stable-diffusion-xl.*.performance_sample_count_override = 5000 rgat.*.performance_sample_count_override = 788379 # set to 0 to let entire sample set to be performance sample @@ -49,7 +49,7 @@ rgat.*.sample_concatenate_permutation = 1 gptj.*.sample_concatenate_permutation = 1 llama2-70b.*.sample_concatenate_permutation = 1 mixtral-8x7b.*.sample_concatenate_permutation = 1 -llama3-405b.*.sample_concatenate_permutation = 1 +llama3_1-405b.*.sample_concatenate_permutation = 1 *.Server.target_latency = 10 *.Server.target_latency_percentile = 99 @@ -66,11 +66,11 @@ stable-diffusion-xl.Server.target_latency = 20000 # Benchmarks that measure token latencies llama2-70b.*.use_token_latencies = 1 mixtral-8x7b.*.use_token_latencies = 1 -llama3-405b.*.use_token_latencies = 1 +llama3_1-405b.*.use_token_latencies = 1 # gptj benchmark infers token latencies gptj.*.infer_token_latencies = 1 gptj.*.token_latency_scaling_factor = 69 -# Only ttft and tpot are tracked for the llama2-70b, mixtral-8x7B & llama3-405b benchmark therefore target_latency = 0 +# Only ttft and tpot are tracked for the llama2-70b, mixtral-8x7B & llama3_1-405b benchmark therefore target_latency = 0 llama2-70b.Server.target_latency = 0 llama2-70b.Server.ttft_latency = 2000 llama2-70b.Server.tpot_latency = 200 @@ -79,9 +79,9 @@ mixtral-8x7b.Server.target_latency = 0 mixtral-8x7b.Server.ttft_latency = 2000 mixtral-8x7b.Server.tpot_latency = 200 -llama3-405b.Server.target_latency = 0 -llama3-405b.Server.ttft_latency = 6000 -llama3-405b.Server.tpot_latency = 175 +llama3_1-405b.Server.target_latency = 0 +llama3_1-405b.Server.ttft_latency = 6000 +llama3_1-405b.Server.tpot_latency = 175 *.Offline.target_latency_percentile = 90 
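For reference, the new Server-scenario limits in the mlperf.conf hunk above (target_latency = 0, ttft_latency = 6000 ms, tpot_latency = 175 ms for llama3_1-405b) amount to a simple bound check. The sketch below is illustrative only, with assumed argument names and an assumed 99th-percentile input; the authoritative check is the submission checker's `extra_check_llm` path touched later in this patch.

```python
# Illustrative sketch of the llama3_1-405b Server latency bounds set above
# (nanoseconds, matching the submission checker's constants).
TTFT_LIMIT_NS = 6000 * 1_000_000   # ttft_latency = 6000 ms
TPOT_LIMIT_NS = 175 * 1_000_000    # tpot_latency = 175 ms

def server_latencies_ok(first_token_ns: float, tpot_ns: float) -> bool:
    # Only TTFT and TPOT are bounded for this benchmark; target_latency is 0.
    # Inputs are assumed to be the reported percentile latencies from the run.
    return first_token_ns <= TTFT_LIMIT_NS and tpot_ns <= TPOT_LIMIT_NS
```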
*.Offline.min_duration = 600000 @@ -100,7 +100,7 @@ rnnt.Offline.min_query_count = 2513 3d-unet.Offline.min_query_count = 43 stable-diffusion-xl.Offline.min_query_count = 5000 llama2-70b.Offline.min_query_count = 24576 -llama3-405b.Offline.min_query_count = 8313 +llama3_1-405b.Offline.min_query_count = 8313 mixtral-8x7b.Offline.min_query_count = 15000 rgat.Offline.min_query_count = 788379 diff --git a/tools/submission/generate_final_report.py b/tools/submission/generate_final_report.py index 34ae82fb1..aa5b36983 100644 --- a/tools/submission/generate_final_report.py +++ b/tools/submission/generate_final_report.py @@ -211,7 +211,7 @@ def main(): "llama2-70b-99.9": ["Server", "Offline"], "mixtral-8x7b": ["Server", "Offline"], "rgat": ["Offline"], - "llama3-405b": ["Offline", "Server"] + "llama3.1-405b": ["Offline", "Server"] }, "edge": { "resnet": ["SingleStream", "MultiStream", "Offline"], diff --git a/tools/submission/submission_checker.py b/tools/submission/submission_checker.py index dcdad1180..26d5212f9 100755 --- a/tools/submission/submission_checker.py +++ b/tools/submission/submission_checker.py @@ -194,6 +194,7 @@ "ssd-resnet34": "retinanet", "mobilenet": "resnet", "resnet50": "resnet", + "llama3_1-405b": "llama3.1-405b" }, "seeds": { "qsl_rng_seed": 3066443479025735752, @@ -267,7 +268,7 @@ "llama2-70b-99.9", "stable-diffusion-xl", "mixtral-8x7b", - "llama3-405b", + "llama3.1-405b", "rgat", # TODO: add automotive? ], @@ -284,7 +285,7 @@ "llama2-70b-99.9": ["Server", "Offline"], "stable-diffusion-xl": ["Server", "Offline"], "mixtral-8x7b": ["Server", "Offline"], - "llama3-405b": ["Server", "Offline"], + "llama3.1-405b": ["Server", "Offline"], "rgat": ["Offline"], }, "optional-scenarios-datacenter": {}, @@ -315,7 +316,7 @@ "llama2-70b-99.9": ["Server", "Offline"], "stable-diffusion-xl": ["SingleStream", "Offline", "Server"], "mixtral-8x7b": ["Server", "Offline"], - "llama3-405b": ["Server", "Offline"], + "llama3.1-405b": ["Server", "Offline"], "rgat": ["Offline"], }, "optional-scenarios-datacenter-edge": {}, @@ -389,7 +390,7 @@ "mbxp_accuracy", 60.12 * 0.99, ), - "llama3-405b": ( + "llama3.1-405b": ( "ROUGEL", 21.6666 * 0.99, "exact_match", @@ -409,7 +410,7 @@ "llama2-70b-99": ("TOKENS_PER_SAMPLE", 294.45 * 1.1), "llama2-70b-99.9": ("TOKENS_PER_SAMPLE", 294.45 * 1.1), "mixtral-8x7b": ("TOKENS_PER_SAMPLE", 145.9 * 1.1), - "llama3-405b": ("TOKENS_PER_SAMPLE", 684.68 * 1.1), + "llama3.1-405b": ("TOKENS_PER_SAMPLE", 684.68 * 1.1), }, "accuracy-delta-perc": { "stable-diffusion-xl": {"CLIP_SCORE": 1, "FID_SCORE": 2} @@ -429,7 +430,7 @@ "llama2-70b-99.9": 24576, "stable-diffusion-xl": 5000, "mixtral-8x7b": 15000, - "llama3-405b": 8313, + "llama3.1-405b": 8313, "rgat": 788379 }, @@ -459,7 +460,7 @@ "llama2-70b-99": {"Server": 20000000000}, "llama2-70b-99.9": {"Server": 20000000000}, "mixtral-8x7b": {"Server": 20000000000}, - "llama3-405b": {"Server": 60000000000} + "llama3.1-405b": {"Server": 60000000000} }, "min-queries": { "resnet": { @@ -490,7 +491,7 @@ "Offline": 1, }, "mixtral-8x7b": {"SingleStream": 1024, "Server": 270336, "Offline": 1}, - "llama3-405b": {"SingleStream": 1024, "Server": 270336, "Offline": 1}, + "llama3.1-405b": {"SingleStream": 1024, "Server": 270336, "Offline": 1}, "rgat": {"SingleStream": 1024, "Offline": 1} }, }, @@ -579,7 +580,7 @@ "llama2-70b-99.9": 24576, "stable-diffusion-xl": 5000, "mixtral-8x7b": 15000, - "llama3-405b": 8313, + "llama3.1-405b": 8313, "rgat": 788379, } @@ -656,7 +657,7 @@ "Offline": "result_tokens_per_second", "Server": 
"result_completed_tokens_per_second", }, - "llama3-405b": { + "llama3.1-405b": { "Offline": "result_tokens_per_second", "Server": "result_completed_tokens_per_second", }, @@ -671,7 +672,7 @@ "conversational": {"ttft": 2000 * 1000000, "tpot": 200 * 1000000} }, "mixtral-8x7b": {"conversational": {"ttft": 2000 * 1000000, "tpot": 200 * 1000000}}, - "llama3-405b": { + "llama3.1-405b": { "conversational": {"ttft": 6000 * 1000000, "tpot": 175 * 1000000} }, } @@ -956,7 +957,7 @@ def requires_equal_issue(self, model, division): "llama2-70b-99", "llama2-70b-99.9", "mixtral-8x7b", - "llama3-405b", + "llama3.1-405b", "rgat", ] and self.version not in ["v4.0", "v4.1"] @@ -1325,7 +1326,7 @@ def check_performance_dir( ) if model in ["llama2-70b-99", "llama2-70b-99.9", - "mixtral-8x7b", "llama3-405b"]: + "mixtral-8x7b", "llama3.1-405b"]: llama_constraint, is_valid = extra_check_llm( mlperf_log, scenario_fixed, model) @@ -1865,7 +1866,7 @@ def log_result( "Offline": "Tokens/s", "Server": "Tokens/s", }, - "llama3-405b": { + "llama3.1-405b": { "SingleStream": "Latency (ms)", "MultiStream": "Latency (ms)", "Offline": "Tokens/s", @@ -2950,7 +2951,7 @@ def check_compliance_dir( "llama2-70b-99", "llama2-70b-99.9", "mixtral-8x7b", - "llama3-405b", + "llama3.1-405b", "rgat", ]: test_list.remove("TEST04") @@ -2971,7 +2972,7 @@ def check_compliance_dir( "llama2-70b-99", "llama2-70b-99.9", "mixtral-8x7b", - "llama3-405b", + "llama3.1-405b", ]: test_list.remove("TEST01") @@ -2980,7 +2981,7 @@ def check_compliance_dir( test_list.remove("TEST04") if model in ["llama2-70b-99", "llama2-70b-99.9", - "mixtral-8x7b", "llama3-405b"]: + "mixtral-8x7b", "llama3.1-405b"]: test_list.append("TEST06") if test_list and not os.path.exists(compliance_dir): From 6af0288cd09c3abac9a1ba397c5b35df0d46045c Mon Sep 17 00:00:00 2001 From: mrmhodak Date: Thu, 19 Dec 2024 06:36:54 +0000 Subject: [PATCH 10/10] Increment version to 5.0.4 --- loadgen/VERSION.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/loadgen/VERSION.txt b/loadgen/VERSION.txt index 50e2274e6..2d6c0bcf1 100644 --- a/loadgen/VERSION.txt +++ b/loadgen/VERSION.txt @@ -1 +1 @@ -5.0.3 +5.0.4