diff --git a/.gitattributes b/.gitattributes new file mode 100644 index 0000000..dfe0770 --- /dev/null +++ b/.gitattributes @@ -0,0 +1,2 @@ +# Auto detect text files and perform LF normalization +* text=auto diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..73ddf6d --- /dev/null +++ b/.gitignore @@ -0,0 +1,19 @@ +node_modules +artifacts +cache +.*.swp +venv +.idea +*.log + +examples/mnist/trainning/data/MNIST/raw + +mlgo +mlgo.bin + +examples/llama/llama + +ml_mips/ml_mips +ml_mips/ml_mips.bin + +examples/llama/data \ No newline at end of file diff --git a/README.md b/README.md new file mode 100644 index 0000000..7007ecc --- /dev/null +++ b/README.md @@ -0,0 +1,14 @@ +# MLGO + +MLGO is tensor library for machine learning in pure Golang that can run on MIPS. + +The machine learning part of this project refers to the legendary [ggml.cpp](https://github.com/ggerganov/ggml) framework. + + +## MNIST + +1. Train the AI model. See `examples/mnist/trainning/mnist.ipynb` +2. Convert the AI model into GGML using `examples/mnist/convert-h5-to-ggml.py` +3. Build the AI inference engine for MIPS +`cd examples/mnist_mips && ./build` +`` \ No newline at end of file diff --git a/build.sh b/build.sh new file mode 100755 index 0000000..cfe73d1 --- /dev/null +++ b/build.sh @@ -0,0 +1,15 @@ +#!/usr/bin/env bash +set -e + +export GOOS=linux +export GOARCH=mips +export GOMIPS=softfloat +go build -o ./mlgo + +file mlgo + +if [[ ! -d venv ]]; then + python3 -m venv venv +fi + +./compile.py mlgo diff --git a/common/utils.go b/common/utils.go new file mode 100644 index 0000000..a87ec5f --- /dev/null +++ b/common/utils.go @@ -0,0 +1,51 @@ +package common + +import ( + "math" + "os" + "unsafe" +) + +// NB! INT = 32 bits +func ReadInt32FromFile(file *os.File) uint32 { + buf := make([]byte, 4) + if count, err := file.Read(buf); err != nil || count != 4 { + return 0 + } + return uint32(buf[3])<<24 | uint32(buf[2])<<16 | uint32(buf[1])<<8 | uint32(buf[0]) +} + +func ReadStringFromFile(file *os.File, len uint32) string { + buf := make([]byte, len) + if count, err := file.Read(buf); err != nil || count != int(len) { + return "" + } + return string(buf) +} + + +func ReadFP32FromFile(file *os.File) float32 { + buf := make([]byte, 4) + if count, err := file.Read(buf); err != nil || count != 4 { + return 0.0 + } + bits := uint32(buf[3])<<24 | uint32(buf[2])<<16 | uint32(buf[1])<<8 | uint32(buf[0]) + return math.Float32frombits(bits) +} + +func min(a, b int) int { + if a <= b { + return a + } + return b +} + + + +func DecodeFloat32List(bs []byte) []float32 { + return unsafe.Slice((*float32)(unsafe.Pointer(&bs[0])), len(bs)/4) +} + +func EncodeFloat32List(fs []float32) []byte { + return unsafe.Slice((*byte)(unsafe.Pointer(&fs[0])), len(fs)*4) +} \ No newline at end of file diff --git a/common/vmutils.go b/common/vmutils.go new file mode 100644 index 0000000..bc37813 --- /dev/null +++ b/common/vmutils.go @@ -0,0 +1,140 @@ +package common + +import ( + "bytes" + "encoding/binary" + "os" + "reflect" + "unsafe" +) + +// vm only =================================================================================== + +// memory layout in MIPS +const ( + INPUT_ADDR = 0x31000000 + OUTPUT_ADDR = 0x32000000 + MODEL_ADDR = 0x33000000 + MAGIC_ADDR = 0x30000800 +) + +func ByteAt(addr uint64, length int) []byte { + var ret []byte + bh := (*reflect.SliceHeader)(unsafe.Pointer(&ret)) + bh.Data = uintptr(addr) + bh.Len = length + bh.Cap = length + return ret +} + +// reading bytes from bigEndian or littleEndian +func 
ReadBytes(addr uint64, isBigEndian bool) []byte { + rawSize := CopyBytes(ByteAt(addr, 4)) + size := BytesToInt32(rawSize, isBigEndian) + ret := ByteAt(addr + 4, int(size)) + //shoud we copy here? may not for saving memory + return ret +} + +func Halt() { + //os.Stderr.WriteString("THIS SHOULD BE PATCHED OUT\n") + // the exit syscall is a jump to 0x5ead0000 now + os.Exit(0) +} + +func Output(output []byte, isBigEndian bool) { + size := len(output) + rawSize := IntToBytes(size,isBigEndian) + mSize := ByteAt(OUTPUT_ADDR, 4) + copy(mSize, rawSize) + mData := ByteAt(OUTPUT_ADDR + 4, size) + copy(mData, output) + // magic code => have written the result + magic := ByteAt(MAGIC_ADDR, 4) + copy(magic, []byte{0x12, 0x34, 0x56, 0x78}) + // stop everything + Halt() +} + + +func IntToBytes(n int, isBigEndian bool) []byte { + x := int32(n) + + bytesBuffer := bytes.NewBuffer([]byte{}) + if isBigEndian { + binary.Write(bytesBuffer, binary.BigEndian, x) + } else { + binary.Write(bytesBuffer, binary.LittleEndian, x) + } + return bytesBuffer.Bytes() +} + +func BytesToInt32(b []byte, isBigEndian bool) int32 { + bytesBuffer := bytes.NewBuffer(b) + + var x int32 + if isBigEndian { + binary.Read(bytesBuffer, binary.BigEndian, &x) + } else { + binary.Read(bytesBuffer, binary.LittleEndian, &x) + } + + + return x +} + +func Float32ToBytes(x float32, isBigEndian bool) []byte { + bytesBuffer := bytes.NewBuffer([]byte{}) + if isBigEndian { + binary.Write(bytesBuffer, binary.BigEndian, x) + } else { + binary.Write(bytesBuffer, binary.LittleEndian, x) + } + return bytesBuffer.Bytes() +} + +func BytesToFloat32(b []byte, isBigEndian bool) float32 { + byteBuffer := bytes.NewBuffer(b) + var x float32 + if isBigEndian { + binary.Read(byteBuffer, binary.BigEndian, &x) + } else { + binary.Read(byteBuffer, binary.LittleEndian, &x) + } + + return x +} + +// CopyBytes returns an exact copy of the provided bytes. +func CopyBytes(b []byte) (copiedBytes []byte) { + if b == nil { + return nil + } + copiedBytes = make([]byte, len(b)) + copy(copiedBytes, b) + + return +} + +// read from index then return the result and the next index +func ReadInt32FromBytes(data []byte, index *int, isBigEndian bool) (uint32) { + if (*index + 4 > len(data)) { + *index = len(data) + return 0 + } + buf := CopyBytes(data[*index:*index+4]) + ret := BytesToInt32(buf, isBigEndian) + *index = *index + 4 + return uint32(ret) +} + +func ReadFP32FromBytes(data []byte, index *int, isBigEndian bool) (float32) { + if (*index + 4 > len(data)) { + *index = len(data) + return 0 + } + buf := CopyBytes(data[*index:*index+4]) + ret := BytesToFloat32(buf, isBigEndian) + *index = *index + 4 + return ret +} \ No newline at end of file diff --git a/common/vmutils_test.go b/common/vmutils_test.go new file mode 100644 index 0000000..27045b7 --- /dev/null +++ b/common/vmutils_test.go @@ -0,0 +1,49 @@ +package common + +import ( + "fmt" + "testing" + "unsafe" +) + +func TestByteFloat(t *testing.T){ + a := 1.234 + ab := Float32ToBytes(float32(a), true) + aa := BytesToFloat32(ab, true) + fmt.Println(a, ab, aa) +} + +func byteSliceToFloat32Slice(src []byte) []float32 { + if len(src) == 0 { + return nil + } + + l := len(src) / 4 + ptr := unsafe.Pointer(&src[0]) + // It is important to keep in mind that the Go garbage collector + // will not interact with this data, and that if src if freed, + // the behavior of any Go code using the slice is nondeterministic. 
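+	// Since Go 1.17 the same conversion can also be written with unsafe.Slice
+	// (as decodeUnsafe below does), which avoids the arbitrary 1<<26 array bound.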
+ // Reference: https://github.com/golang/go/wiki/cgo#turning-c-arrays-into-go-slices + return (*[1 << 26]float32)((*[1 << 26]float32)(ptr))[:l:l] +} + +func encodeUnsafe(fs []float32) []byte { + return unsafe.Slice((*byte)(unsafe.Pointer(&fs[0])), len(fs)*4) +} + +func decodeUnsafe(bs []byte) []float32 { + return unsafe.Slice((*float32)(unsafe.Pointer(&bs[0])), len(bs)/4) +} + +func TestByteSliceToFloat32Slice(t *testing.T) { + as := []float32{1.234, 2.345} + asBytes := make([]byte, 0) + for i := 0; i < len(as); i++ { + asBytes = append(asBytes, Float32ToBytes(as[i], false)...) + } + fmt.Println(asBytes) + fmt.Println(byteSliceToFloat32Slice(asBytes)) + fmt.Println(encodeUnsafe(as)) + fmt.Println(decodeUnsafe(encodeUnsafe(as))) + fmt.Println(decodeUnsafe(asBytes)) +} \ No newline at end of file diff --git a/compile.py b/compile.py new file mode 100755 index 0000000..699557d --- /dev/null +++ b/compile.py @@ -0,0 +1,73 @@ +#!/usr/bin/env python3 +import os +import sys +import struct +import hashlib +from rangetree import RangeTree +from elftools.elf.elffile import ELFFile + +def load_minigeth(fn="mlgo"): + elf = open(fn, "rb") + data = elf.read() + elf.seek(0) + + elffile = ELFFile(elf) + + end_addr = 0 + for seg in elffile.iter_segments(): + end_addr = max(end_addr, seg.header.p_vaddr + seg.header.p_memsz) + + # program memory (16 MB) + prog_size = (end_addr+0xFFF) & ~0xFFF + prog_dat = bytearray(prog_size) + print("malloced 0x%x for program" % prog_size) + + for seg in elffile.iter_segments(): + print(seg.header, hex(seg.header.p_vaddr)) + prog_dat[seg.header.p_vaddr:seg.header.p_vaddr+len(seg.data())] = seg.data() + + entry = elffile.header.e_entry + print("entrypoint: 0x%x" % entry) + + # moved to MIPS + sf = os.path.join(os.path.dirname(os.path.abspath(__file__)), "startup", "startup.bin") + start = open(sf, "rb").read() + struct.pack(">I", entry) + prog_dat[:len(start)] = start + entry = 0 + + r = RangeTree() + found = 0 + for section in elffile.iter_sections(): + try: + for nsym, symbol in enumerate(section.iter_symbols()): + ss = symbol['st_value'] + se = ss+symbol['st_size'] + if ss != se: + try: + r[ss:se] = symbol.name + except KeyError: + continue + #print(nsym, symbol.name, symbol['st_value'], symbol['st_size']) + if symbol.name == "runtime.gcenable": + print(nsym, symbol.name) + # nop gcenable + prog_dat[symbol['st_value']:symbol['st_value']+8] = b"\x03\xe0\x00\x08\x00\x00\x00\x00" + found += 1 + except Exception: + #traceback.print_exc() + pass + + #assert(found == 2) + return prog_dat, prog_size, r + + +if __name__ == "__main__": + fn = "minigeth" + if len(sys.argv) > 1: + fn = sys.argv[1] + + prog_dat, prog_size, _ = load_minigeth(fn) + print("compiled %d bytes with md5 %s" % (prog_size, hashlib.md5(prog_dat).hexdigest())) + + with open(fn+".bin", "wb") as f: + f.write(prog_dat) \ No newline at end of file diff --git a/examples/gpt-2/gpt2.go b/examples/gpt-2/gpt2.go new file mode 100644 index 0000000..c1cbd97 --- /dev/null +++ b/examples/gpt-2/gpt2.go @@ -0,0 +1,356 @@ +package gpt2 + +import ( + "errors" + "fmt" + "math" + "mlgo/ml" + "os" + "strconv" +) + +// default hparams (GPT-2 117M) +/* + int32_t n_vocab = 50257; + int32_t n_ctx = 1024; + int32_t n_embd = 768; + int32_t n_head = 12; + int32_t n_layer = 12; + int32_t ftype = 1; +*/ +type gpt2_hparams struct { + n_vocab int32; + n_ctx int32; + n_embd int32; + n_head int32; + n_layer int32; + ftype int32; + +}; + +type gpt2_layer struct { + // normalization + ln_1_g *ml.Tensor; + ln_1_b *ml.Tensor; + + ln_2_g 
*ml.Tensor; + ln_2_b *ml.Tensor; + + c_attn_attn_w *ml.Tensor; + c_attn_attn_b *ml.Tensor; + + c_attn_proj_w *ml.Tensor; + c_attn_proj_b *ml.Tensor; + + c_mlp_fc_w *ml.Tensor; + c_mlp_fc_b *ml.Tensor; + + c_mlp_proj_w *ml.Tensor; + c_mlp_proj_b *ml.Tensor; +} + +type gpt2_model struct { + hparams gpt2_hparams ; + + ln_f_g *ml.Tensor; + ln_f_b *ml.Tensor; + + wte *ml.Tensor; + wpe *ml.Tensor; + lm_head *ml.Tensor; + + layers []gpt2_layer; + + memory_k *ml.Tensor; + memory_v *ml.Tensor; + + tensors map[string]*ml.Tensor; +} + +func gpt2_model_load(fname string, model *gpt2_model, vocab *gpt_vocab) error { + + file, err := os.Open(fname) + if err != nil { + return err + } + defer file.Close() + + { + magic := readInt(file) + if magic != 0x67676d6c { + return errors.New("invalid model file (bad magic)") + } + } + + // load hparams + { + model.hparams.n_vocab = int32(readInt(file)) + model.hparams.n_ctx = int32(readInt(file)) + model.hparams.n_embd = int32(readInt(file)) + model.hparams.n_head = int32(readInt(file)) + model.hparams.n_layer = int32(readInt(file)) + model.hparams.ftype = int32(readInt(file)) + + fmt.Printf("hparams: %v\n", model.hparams) + } + + // load vocab + { + n_vocab := readInt(file) + if n_vocab != uint32(model.hparams.n_vocab) { + return errors.New(fmt.Sprintf("n_vocan: %v, model.hparams.n_vocan: %v", n_vocab, model.hparams.n_vocab)) + } + + for i := uint32(0); i < (n_vocab); i++ { + len := readInt(file) + word := readString(file, len) + vocab.token_to_id[word] = i + vocab.id_to_token[i] = word + } + } + + wtype := ml.TYPE_F32 + dtype := ml.TYPE_F32 + + // weights + { + n_embd := uint32(model.hparams.n_embd) + n_layer := uint32(model.hparams.n_layer) + n_ctx := uint32(model.hparams.n_ctx) + n_vocab := uint32(model.hparams.n_vocab) + + model.layers = make([]gpt2_layer, n_layer) + model.tensors = make(map[string]*ml.Tensor) + + model.ln_f_g = ml.NewTensor1D(nil, dtype, uint32(n_embd)) + model.ln_f_b = ml.NewTensor1D(nil, dtype, uint32(n_embd)) + + model.wte = ml.NewTensor2D(nil, wtype, uint32(n_embd), uint32(n_vocab)) + model.wpe = ml.NewTensor2D(nil, dtype, uint32(n_embd), uint32(n_ctx)) + model.lm_head = ml.NewTensor2D(nil, wtype, uint32(n_embd), uint32(n_vocab)) + + // map by name + model.tensors["model/ln_f/g"] = model.ln_f_g; + model.tensors["model/ln_f/b"] = model.ln_f_b; + + model.tensors["model/wte"] = model.wte; + model.tensors["model/wpe"] = model.wpe; + model.tensors["model/lm_head"] = model.lm_head; + + for i := 0; i < int(n_layer); i++ { + layer := &model.layers[i]; + + layer.ln_1_g = ml.NewTensor1D(nil, dtype, n_embd); + layer.ln_1_b = ml.NewTensor1D(nil, dtype, n_embd); + + layer.ln_2_g = ml.NewTensor1D(nil, dtype, n_embd); + layer.ln_2_b = ml.NewTensor1D(nil, dtype, n_embd); + + layer.c_attn_attn_w = ml.NewTensor2D(nil, wtype, n_embd, 3*n_embd); + layer.c_attn_attn_b = ml.NewTensor1D(nil, dtype, 3*n_embd); + + layer.c_attn_proj_w = ml.NewTensor2D(nil, wtype, n_embd, n_embd); + layer.c_attn_proj_b = ml.NewTensor1D(nil, dtype, n_embd); + + layer.c_mlp_fc_w = ml.NewTensor2D(nil, wtype, n_embd, 4*n_embd); + layer.c_mlp_fc_b = ml.NewTensor1D(nil, dtype, 4*n_embd); + + layer.c_mlp_proj_w = ml.NewTensor2D(nil, wtype, 4*n_embd, n_embd); + layer.c_mlp_proj_b = ml.NewTensor1D(nil, dtype, n_embd); + + // map by name + model.tensors["model/h" + strconv.Itoa(i) + "/ln_1/g"] = layer.ln_1_g; + model.tensors["model/h" + strconv.Itoa(i) + "/ln_1/b"] = layer.ln_1_b; + + model.tensors["model/h" + strconv.Itoa(i) + "/ln_2/g"] = layer.ln_2_g; + model.tensors["model/h" + 
strconv.Itoa(i) + "/ln_2/b"] = layer.ln_2_b; + + model.tensors["model/h" + strconv.Itoa(i) + "/attn/c_attn/w"] = layer.c_attn_attn_w; + model.tensors["model/h" + strconv.Itoa(i) + "/attn/c_attn/b"] = layer.c_attn_attn_b; + + model.tensors["model/h" + strconv.Itoa(i) + "/attn/c_proj/w"] = layer.c_attn_proj_w; + model.tensors["model/h" + strconv.Itoa(i) + "/attn/c_proj/b"] = layer.c_attn_proj_b; + + model.tensors["model/h" + strconv.Itoa(i) + "/mlp/c_fc/w"] = layer.c_mlp_fc_w; + model.tensors["model/h" + strconv.Itoa(i) + "/mlp/c_fc/b"] = layer.c_mlp_fc_b; + + model.tensors["model/h" + strconv.Itoa(i) + "/mlp/c_proj/w"] = layer.c_mlp_proj_w; + model.tensors["model/h" + strconv.Itoa(i) + "/mlp/c_proj/b"] = layer.c_mlp_proj_b; + } + } + + // key + value + { + n_mem := model.hparams.n_layer * model.hparams.n_ctx + n_element := model.hparams.n_embd * n_mem + + model.memory_k = ml.NewTensor1D(nil, dtype, uint32(n_element)) + model.memory_v = ml.NewTensor1D(nil, dtype, uint32(n_element)) + + fmt.Println("n_element in key+value: ", n_element) + } + + // load weights + { + total_size := 0 + has_lm_head := false + + for { + n_dim := readInt(file) + length := readInt(file) + ttype := readInt(file) + + if n_dim | length | ttype == 0 { + // eof + break + } + + nelements := 1 + ne := make([]int32, 2) + for i := 0; i < int(n_dim); i++ { + ne[i] = int32(readInt(file)) + nelements *= int(ne[i]) + } + + // read name len + name := readString(file, length) + if _, ok := model.tensors[name]; !ok { + return errors.New(fmt.Sprintf("unknow tensor: %s", name)) + } + tensor := model.tensors[name] + + // read data + for i := 0; i < len(tensor.Data); i++{ + tensor.Data[i] = readFP32(file) + } + + // GPT-2 models share the WTE tensor as the LM head + if name == "model/wte" && !has_lm_head { + copy(tensor.Data, model.lm_head.Data) + } + + if name == "model/lm_head" { + has_lm_head = true + } + + total_size += len(tensor.Data) * 4 + + } + } + + return nil +} + +// evaluate the transformer +// +// - model: the model +// - n_threads: number of threads to use +// - n_past: the context size so far +// - embd_inp: the embeddings of the tokens in the context +// - embd_w: the predicted logits for the next token +// +// func gpt2_eval(model *gpt2_model, n_thread int, n_past int, embd_inp []uint32, embd_w []float32, mem_per_token uint32) { +// N := len(embd_inp) + +// n_embd := model.hparams.n_embd +// n_layer := model.hparams.n_layer +// n_ctx := model.hparams.n_ctx +// n_head := model.hparams.n_head +// n_vocab := model.hparams.n_vocab + +// gf := ml.Graph{ThreadsCount: n_thread} +// embd := ml.NewTensor1D(nil, ml.TYPE_F32, uint32(N)) +// for i := 0; i < N; i++ { +// embd.Data[i] = float32(embd_inp[i]) +// } + +// position := ml.NewTensor1D(nil, ml.TYPE_F32, uint32(N)) +// for i := 0; i < N; i++ { +// position.Data[i] = float32(n_past + 1) +// } + +// inpL := ml.Add(nil, ml.GetRows(nil, model.wte, embd), ml.GetRows(nil, model.wpe, position)) + +// for il := 0; il < int(n_layer); il++ { +// // TODO: replace with ggml_norm +// cur := ml.RMSNorm(nil, inpL) +// cur = ml.Add(nil, ml.Mul(nil, ml.Repeat(nil, model.layers[il].ln_1_g, cur), cur), ml.Repeat(nil, model.layers[il].ln_1_b, cur)) + +// cur = ml.MulMat(nil, model.layers[il].c_attn_attn_w, cur) +// cur = ml.Add(nil, ml.Repeat(nil, model.layers[il].c_attn_attn_b, cur), cur) + +// // self-attention +// { +// Qcur := ml.View1D() +// } +// } + +// } + +func readInt(file *os.File) uint32 { + buf := make([]byte, 4) + if count, err := file.Read(buf); err != nil || count != 4 { + 
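+		// NB! A short read or I/O error is folded into a zero return value,
+		// which gpt2_model_load interprets as end-of-file when scanning tensors;
+		// io.ReadFull would be the stricter alternative here.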
return 0 + } + return uint32(buf[3])<<24 | uint32(buf[2])<<16 | uint32(buf[1])<<8 | uint32(buf[0]) +} + +func readString(file *os.File, len uint32) string { + buf := make([]byte, len) + if count, err := file.Read(buf); err != nil || count != int(len) { + return "" + } + return string(buf) +} + + +func readFP32(file *os.File) float32 { + buf := make([]byte, 4) + if count, err := file.Read(buf); err != nil || count != 4 { + return 0.0 + } + bits := uint32(buf[3])<<24 | uint32(buf[2])<<16 | uint32(buf[1])<<8 | uint32(buf[0]) + return math.Float32frombits(bits) +} + +func min(a, b int) int { + if a <= b { + return a + } + return b +} + +func printTensor(tensor *ml.Tensor, name string) { + var dt string + if tensor.Type == ml.TYPE_F16 { + dt = "FP16" + } + if tensor.Type == ml.TYPE_F32 { + dt = "FP32" + } + if tensor.Type == ml.TYPE_Q4_0 { + dt = "INT4" + } + + fmt.Printf("\n\n=== [ %s | %s | %d:%d:%d ] ===\n", + name, dt, tensor.NE[0], tensor.NE[1], tensor.NE[2]) + + for nn := 0; nn < min(12, int(tensor.NE[1])); nn++ { + fmt.Printf("\n %d x %d ...\t", nn, tensor.NE[0]) + for ii := 0; ii < min(12, int(tensor.NE[0])); ii++ { + fmt.Printf("%.3f\t", tensor.Data[nn*int(tensor.NE[0])+ii]) + } + } +} + +type gpt_vocab struct { + token_to_id map[string]uint32 + id_to_token map[uint32]string +} + +func NewVocab() *gpt_vocab { + return &gpt_vocab{ + token_to_id: make(map[string]uint32), + id_to_token: make(map[uint32]string), + } +} \ No newline at end of file diff --git a/examples/llama/README.md b/examples/llama/README.md new file mode 100644 index 0000000..5d30483 --- /dev/null +++ b/examples/llama/README.md @@ -0,0 +1,71 @@ +# LLaMA.go + +![](./assets/images/terminal.png?raw=true) + +[![Coverage](https://img.shields.io/badge/Coverage-0-red)](https://github.com/gotzmann/llama.go/actions/workflows/coverage.yml) + +Part of this code is borrowed from [llama.go](github.com/gotzmann/llama.go) + +## The Goal + +We dream of a world where ML hackers are able to grok with **REALLY BIG GPT** models without having GPU clusters consuming a shit tons of **$$$** - using only machines in their own homelabs. + +The code of the project is based on the legendary **[ggml.cpp](https://github.com/ggerganov/llama.cpp)** framework of Georgi Gerganov written in C++ + +We hope using our beloved Golang instead of *soo-powerful* but *too-low-level* language will allow much greater adoption of the **NoGPU** ideas. + +**NB!** The V1 supports only FP32 math, so you'll need at least 32GB RAM to work even with the smallest **LLaMA-7B** model. As a preliminary step you should have binary files converted from original LLaMA model locally. + +## V0 Roadmap + +- [x] Move FP32 tensor math from C++ to pure Golang package GoML +- [x] Implement LLaMA neural net architecture and model loading in Golang +- [x] Support smaller LLaMA-7B model +- [x] Be sure Go inference works EXACT SAME way as C++ for static prompts +- [x] Let Go shine! 
Enable multi-threading and boost performance + +## V1 Roadmap + +- [x] Check cross-patform compatibility with Mac and Windows +- [x] Release first stable version for ML hackers +- [x] Support bigger LLaMA models: 13B, 30B, 60B +- [ ] Enable interactive mode for real-time chat with GPT +- [ ] Allow automatic download converted model weights from the Internet +- [ ] Implement metrics for RAM and CPU usage +- [ ] x8 performance boost with AVX2 support +- [ ] INT8 quantization to allow x4 bigger models fit the same memory +- [ ] Server Mode for use in clouds as part of microservice architecture + +## V2 Roadmap + +- [ ] x2 performance boost with AVX512 support +- [ ] ARM NEON support on Mac machines and ARM servers +- [ ] FP16 and BF16 support where possible +- [ ] Support INT4 and GPTQ quantization + +## How to Run + +```shell +go run main.go --threads 8 --model /home/iiislab/project/web3_dl/reference/models/llama-7b-fp32.bin.2 --temp 0.80 --context 128 --predict 128 --prompt "Why Golang is so popular?" +``` + +Or edit the Makefile and compile and run: + +```shell +make +./llama --threads 8 --model /home/iiislab/project/web3_dl/reference/models/llama-7b-fp32.bin.2 --temp 0.80 --context 128 --predict 128 --prompt "Why Golang is so popular?" +``` + +## FAQ + +**1] Where might I get original LLaMA model files?** + +Contact Meta directly or look around for some torrent alternatives + +**2] How to convert original LLaMA files into supported format?** + +Youl'll need original FP16 files placed into **models** directory, then convert with command: + +```shell +python3 ./scripts/convert.py ~/models/LLaMA/7B/ 0 +``` diff --git a/examples/llama/VERSION b/examples/llama/VERSION new file mode 100644 index 0000000..7dea76e --- /dev/null +++ b/examples/llama/VERSION @@ -0,0 +1 @@ +1.0.1 diff --git a/examples/llama/llama_go/llama.go b/examples/llama/llama_go/llama.go new file mode 100644 index 0000000..fa37868 --- /dev/null +++ b/examples/llama/llama_go/llama.go @@ -0,0 +1,1374 @@ +package llama + +import ( + "container/ring" + "fmt" + "io" + "math" + "math/rand" + "os" + "reflect" + "runtime" + "sort" + "time" + "unsafe" + + "github.com/mattn/go-colorable" + "github.com/mitchellh/colorstring" + "github.com/schollz/progressbar/v3" + "github.com/x448/float16" + "golang.org/x/exp/slices" + + "mlgo/ml" +) + +const ( + LLAMA_FILE_VERSION = 1 + LLAMA_FILE_MAGIC = 0x67676a74 // 'ggjt' in hex + LLAMA_FILE_MAGIC_OLD = 0x67676d66 // 'ggmf' in hex + LLAMA_FILE_MAGIC_UNVERSIONED = 0x67676d6c // 'ggml' pre-versioned files + + SPLIT_NONE = 0 + SPLIT_BY_COLUMNS = 1 + SPLIT_BY_ROWS = 2 +) + +var ( + // determine number of model parts based on the dimension + LLAMA_N_PARTS = map[uint32]int{ + 4096: 1, + 5120: 2, + 6656: 4, + 8192: 8, + } +) + +type pair struct { + first float32 + second uint32 +} + +type Context struct { + Model *Model + Vocab *ml.Vocab + + // decode output (2-dimensional array: [n_tokens][n_vocab]) + Logits []float32 + LogitsAll bool + + // input embedding (1-dimensional array: [n_embd]) + Embedding []float32 +} + +func NewContext() *Context { + return &Context{ + Model: NewModel(), + Vocab: ml.NewVocab(0), + Logits: make([]float32, 0, 0), // NewFloatSlice(0, 0), + Embedding: make([]float32, 0, 0), // NewFloatSlice(0, 0), + } +} + +// struct llama_context_params { +type ContextParams struct { + CtxSize uint32 // text context + PartsCount int // -1 for default + Seed int // RNG seed, 0 for random + LogitsAll bool // the llama_eval() call computes all logits, not just the last one + VocabOnly bool // only 
load the vocabulary, no weights + UseLock bool // force system to keep model in RAM + Embedding bool // embedding mode only +} + +type Layer struct { + + // normalization + attentionNorm *ml.Tensor + + // attention + wq *ml.Tensor + wk *ml.Tensor + wv *ml.Tensor + wo *ml.Tensor + + // normalization + ffn_norm *ml.Tensor + + // ff + w1 *ml.Tensor + w2 *ml.Tensor + w3 *ml.Tensor +} + +// default hparams (LLaMA 7B) +type HParams struct { + ctxSize uint32 // 512 + vocabSize uint32 // 32000 + embdSize uint32 // 4096 + multSize uint32 // 256 + headsCount uint32 // 32 + layersCount uint32 // 32 + rotCount uint32 // 64 + f16 uint32 // 1 +} + +type ModelType uint8 + +// available llama models +const ( + MODEL_UNKNOWN ModelType = iota + MODEL_7B + MODEL_13B + MODEL_30B + MODEL_65B +) + +type KVCache struct { + K *ml.Tensor + V *ml.Tensor + + N uint32 // number of tokens currently in the cache +} + +type Model struct { + Type ModelType + ctx *ml.Context + hparams HParams + + tokEmbeddings *ml.Tensor + norm *ml.Tensor + output *ml.Tensor + + layers []Layer + kvSelf KVCache // key + value cache for the self attention + + loadedCount uint32 + tensors map[string]*ml.Tensor +} + +func NewModel() *Model { + return &Model{ + hparams: HParams{ + ctxSize: 512, + vocabSize: 32000, + embdSize: 4096, + multSize: 256, + headsCount: 32, + layersCount: 32, + rotCount: 64, + f16: 1, + }, + layers: make([]Layer, 0), + tensors: make(map[string]*ml.Tensor), + kvSelf: KVCache{ + K: &ml.Tensor{}, + V: &ml.Tensor{}, + }, + } +} + +func min(a, b int) int { + if a <= b { + return a + } + return b +} + +// Resize() (safe) for using instead of C++ std::vector:resize() +// https://go.dev/play/p/VlQ7N75E5AD +func Resize(slice []float32, size int) []float32 { + newSlice := make([]float32, size) + for i := 0; i < min(size, len(slice)); i++ { + newSlice[i] = slice[i] + } + return newSlice +} + +// NB! 
This do not clear the underlying array when resizing +// https://go.dev/play/p/DbK4dFqwrZn +func ResizeInplace(slice *[]float32, size int) { + if len(*slice) == size { + return + } else if size < len(*slice) { + *slice = (*slice)[:size] + } else { + *slice = slices.Grow(*slice, size) + *slice = (*slice)[:size] + } +} + +// evaluate the transformer +// +// - lctx: llama context +// - tokens: new batch of tokens to process +// - n_past: the context size so far +// - n_threads: number of threads to use +// + +func Eval( + + lctx *Context, + tokens []uint32, + tokensCount uint32, + pastCount uint32, + threadsCount int) error { + + N := tokensCount + model := lctx.Model + kvSelf := model.kvSelf + + embdSize := model.hparams.embdSize + layersCount := model.hparams.layersCount + ctxSize := model.hparams.ctxSize + headsCount := model.hparams.headsCount + vocabSize := model.hparams.vocabSize + rotCount := model.hparams.embdSize / model.hparams.headsCount + + ctx0 := &ml.Context{} //ctx0 := ml.Init(ml.InitParams{}) + + // for big prompts, if BLAS is enabled, it is better to use only one thread + // otherwise, the threads are spin-lock waiting for the BLAS calls and are degrading the performance + graph := ml.Graph{ThreadsCount: threadsCount} + + // Convert the tokens to a []float32 slice + tokensFloat32 := make([]float32, len(tokens)) + for i, token := range tokens { + tokensFloat32[i] = float32(token) + } + + // Initialize the embd tensor with the tokensFloat32 data + embd := ml.NewTensor(ctx0, ml.TYPE_F32, 1, uint32(len(tokens)), 1, 1, 1, tokensFloat32) + inpL := ml.GetRows(ctx0, model.tokEmbeddings, embd) + + for il := uint32(0); il < layersCount; il++ { + + //if il > 0 { + // break // DEBUG + //} + + inpSA := inpL + cur := &ml.Tensor{} + + // norm + cur = ml.RMSNorm(ctx0, inpL) + + // cur = attention_norm*cur + rep := ml.Repeat(ctx0, model.layers[il].attentionNorm, cur) + + cur = ml.Mul(ctx0, rep, cur) + + // self-attention + { + Qcur := ml.MulMat(ctx0, model.layers[il].wq, cur) + Kcur := ml.MulMat(ctx0, model.layers[il].wk, cur) + Vcur := ml.MulMat(ctx0, model.layers[il].wv, cur) + + // store key and value to memory + if N >= 1 { + + ////struct ggml_tensor * k = ggml_view_1d(ctx0, kv_self.k, N*n_embd, (ggml_element_size(kv_self.k)*n_embd)*(il*n_ctx + n_past)); + ////struct ggml_tensor * v = ggml_view_1d(ctx0, kv_self.v, N*n_embd, (ggml_element_size(kv_self.v)*n_embd)*(il*n_ctx + n_past)); + + ////ggml_build_forward_expand(&graph, ggml_cpy(ctx0, Kcur, k)); + ////ggml_build_forward_expand(&graph, ggml_cpy(ctx0, Vcur, v)); + + // NB! 
ggml_element_size(kv_self.k) = 2 for FP16 + k := ml.View1D(ctx0, kvSelf.K, N*embdSize, embdSize*(il*ctxSize+pastCount)) + v := ml.View1D(ctx0, kvSelf.V, N*embdSize, embdSize*(il*ctxSize+pastCount)) + + ml.BuildForwardExpand(&graph, ml.Copy(ctx0, Kcur, k)) + ml.BuildForwardExpand(&graph, ml.Copy(ctx0, Vcur, v)) + } + + // Q = Qcur.contiguous().view(n_embd/n_head, n_head, N).permute(0, 2, 1, 3) + Q := + ml.Permute(ctx0, + ml.Rope(ctx0, + ml.Copy(ctx0, + Qcur, + ml.NewTensor3D(ctx0, ml.TYPE_F32, embdSize/headsCount, headsCount, N)), + pastCount, rotCount, 0), + 0, 2, 1, 3) + + // K = Kmem.view(n_embd/n_head, n_head, n_past + N).permute(0, 2, 1, 3) + K := + ml.Permute(ctx0, + ml.Rope(ctx0, + ml.Reshape3D(ctx0, + ////ggml_view_1d(ctx0, kv_self.k, (n_past + N)*n_embd, il*n_ctx*ggml_element_size(kv_self.k)*n_embd), + ////n_embd/n_head, n_head, n_past + N), + ml.View1D(ctx0, kvSelf.K, (pastCount+N)*embdSize, il*ctxSize*embdSize), + embdSize/headsCount, headsCount, pastCount+N), + pastCount, rotCount, 1), + 0, 2, 1, 3) + + // K * Q + ////struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q); + KQ := ml.MulMat(ctx0, K, Q) + + // KQ_scaled = KQ / sqrt(n_embd/n_head) + KQScaled := + ml.Scale(ctx0, + KQ, + ml.NewFP32(ctx0, float32(1.0/math.Sqrt(float64(embdSize)/float64(headsCount)))), + ) + + // KQ_masked = mask_past(KQ_scaled) + ////struct ggml_tensor * KQ_masked = ggml_diag_mask_inf(ctx0, KQ_scaled, n_past); + KQMasked := ml.DiagMaskInf(ctx0, KQScaled, pastCount) + + // KQ = soft_max(KQ_masked) + ////struct ggml_tensor * KQ_soft_max = ggml_soft_max(ctx0, KQ_masked); + KQSoftMax := ml.SoftMax(ctx0, KQMasked) + + // V_trans = Vmem.view(n_embd/n_head, n_head, n_past + N).permute(1, 2, 0, 3).contiguous() + VTrans := + ml.Copy(ctx0, + ml.Permute(ctx0, + ml.Reshape3D(ctx0, + ml.View1D(ctx0, kvSelf.V, (pastCount+N)*embdSize, il*ctxSize*embdSize), + embdSize/headsCount, headsCount, pastCount+N), + 1, 2, 0, 3), + ml.NewTensor3D(ctx0, ml.TYPE_F32 /* kv_self.v->type */, pastCount+N, embdSize/headsCount, headsCount)) + + // KQV = transpose(V) * KQ_soft_max + KQV := ml.MulMat(ctx0, VTrans, KQSoftMax) + + // KQV_merged = KQV.permute(0, 2, 1, 3) + KQVMerged := ml.Permute(ctx0, KQV, 0, 2, 1, 3) + + // cur = KQV_merged.contiguous().view(n_embd, N) + cur = ml.Copy(ctx0, + KQVMerged, + ml.NewTensor2D(ctx0, ml.TYPE_F32, embdSize, N)) + + // projection (no bias) + cur = ml.MulMat(ctx0, + model.layers[il].wo, + cur) + } + + inpFF := ml.Add(ctx0, cur, inpSA) + + // feed-forward network + { + // norm + { + cur = ml.RMSNorm(ctx0, inpFF) + + // cur = ffn_norm*cur + cur = ml.Mul(ctx0, + ml.Repeat(ctx0, model.layers[il].ffn_norm, cur), + cur) + } + + tmp := ml.MulMat(ctx0, + model.layers[il].w3, + cur) + + cur = ml.MulMat(ctx0, + model.layers[il].w1, + cur) + + // SILU activation + cur = ml.Silu(ctx0, cur) + + cur = ml.Mul(ctx0, cur, tmp) + + cur = ml.MulMat(ctx0, + model.layers[il].w2, + cur) + } + + cur = ml.Add(ctx0, cur, inpFF) + + // input for next layer + inpL = cur + + } + + // used at the end to optionally extract the embeddings + ////var embeddings *ml.Tensor + + // --- norm + + inpL = ml.RMSNorm(ctx0, inpL) + + // inpL = norm*inpL + inpL = ml.Mul(ctx0, + ml.Repeat(ctx0, model.norm, inpL), + inpL) + + embeddings := inpL + + // lm_head + inpL = ml.MulMat(ctx0, model.output, inpL) + + // logits -> probs + // COMMENTED inpL = ggml_soft_max(ctx0, inpL); + + // run the computation + ml.BuildForwardExpand(&graph, inpL) + + ml.GraphCompute(ctx0, &graph) + + // --- extract logits + + //fmt.Printf("\n\n=== INPL 09 === 
[%d,%d,%d,%d] ===\n", inpL.NE[0], inpL.NE[1], inpL.NE[2], inpL.NE[3]) // DEBUG + //for ii := 0; ii < 12; ii++ { + // fmt.Printf("%.4f ", inpL.Data[ii]) + //} + + if lctx.LogitsAll { + fmt.Print("\n[HALT] Not Expected: lctx.LogitsAll == true") + os.Exit(1) + + /* + // Copy inpL.Data to lctx.Logits + for i := uint32(0); i < vocabSize*N; i++ { + if i >= uint32(len(lctx.Logits)) || i >= uint32(len(inpL.Data)) { + fmt.Println("Error: Index out of bounds during Logits copy") + os.Exit(1) + } + lctx.Logits[i] = inpL.Data[i] + } + */ + } else { + // Copy only the relevant part of inpL.Data to lctx.Logits + for i := uint32(0); i < vocabSize; i++ { + srcIndex := vocabSize*(N-1) + i + if i >= uint32(len(lctx.Logits)) || srcIndex >= uint32(len(inpL.Data)) { + fmt.Println("Error: Index out of bounds during Logits copy") + os.Exit(1) + } + lctx.Logits[i] = inpL.Data[srcIndex] + } + } + + if ml.DEBUG { + printTensor(inpL, "INPL") + + fmt.Printf("\n\n=== LOGITS === %d ===\n", len(lctx.Logits)) // DEBUG + for ii := 0; ii < 13; ii++ { + fmt.Printf("%.4f ", lctx.Logits[ii]) + } + } + + // --- extract embeddings + + if len(lctx.Embedding) > 0 { + ////memcpy(embedding_out.data(), (float *) ggml_get_data(embeddings) + (n_embd*(N - 1)), sizeof(float)*n_embd); + for i := uint32(0); i < embdSize; i++ { + lctx.Embedding[i] = embeddings.Data[(embdSize*(N-1))+i] // FIXME ASAP + } + } + + return nil +} + +func printTensor(tensor *ml.Tensor, name string) { + var dt string + if tensor.Type == ml.TYPE_F16 { + dt = "FP16" + } + if tensor.Type == ml.TYPE_F32 { + dt = "FP32" + } + if tensor.Type == ml.TYPE_Q4_0 { + dt = "INT4" + } + + fmt.Printf("\n\n=== [ %s | %s | %d:%d:%d ] ===\n", + name, dt, tensor.NE[0], tensor.NE[1], tensor.NE[2]) + + for nn := 0; nn < min(12, int(tensor.NE[1])); nn++ { + fmt.Printf("\n %d x %d ...\t", nn, tensor.NE[0]) + for ii := 0; ii < min(12, int(tensor.NE[0])); ii++ { + fmt.Printf("%.3f\t", tensor.Data[nn*int(tensor.NE[0])+ii]) + } + } +} + +func sampleTopK(logitsID []pair, topK uint32) []pair { + // find the top K tokens + + // std::partial_sort + // Rearranges elements such that the range [first, middle) contains + // the sorted middle − first smallest elements in the range [first, last). + // The order of equal elements is not guaranteed to be preserved. + // The order of the remaining elements in the range [middle, last) is unspecified. + + /*std::partial_sort( + logits_id.begin(), + logits_id.begin() + top_k, logits_id.end(), + [](const std::pair & a, const std::pair & b) { + return a.first > b.first; + });*/ + + //keys := make([]double, 0, len(logitsID)) + //for k := range logitsID { + // keys = append(keys, k) + //} + //sort.Float64s(keys) + + sort.Slice( + logitsID[:topK], + func(i, j int) bool { + return logitsID[i].first < logitsID[j].first // FIXME ASAP We need bigger elements first + }) + + // logits_id.resize(top_k); + //for i := uint32(0); i < len(keys)-topK; i++ { + //delete(logitsID, keys[i]) + //} + + ret := make([]pair, 0, topK) + copy(ret, logitsID) + + return ret +} + +// llama_sample_top_p_top_k +// sample next token given probabilities for each embedding +// +// - consider only the top K tokens +// - from them, consider only the top tokens with cumulative probability > P +// + +// std::mt19937 = A Mersenne Twister pseudo-random generator of 32-bit numbers with a state size of 19937 bits. 
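+// In short: the logits are scaled by 1/temp (with a repetition penalty applied
+// to tokens already present in the last-N ring buffer), sorted in descending
+// order, cut down to the top K entries, soft-maxed into probabilities,
+// optionally cut again to the smallest prefix whose cumulative probability
+// reaches topP, and one token id is then drawn from what remains.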
+func SampleTopPTopK( + lctx *Context, + // lastNTokens []uint32, + lastNTokens *ring.Ring, + lastNTokensSize uint32, // FIXME Remove + topK uint32, + topP float32, + temp float32, + repeatPenalty float32, +) uint32 { + + ////auto & rng = lctx.rng; + ////logitsCount := uint32(len(vocab.ID2Token)) + logitsCount := lctx.Model.hparams.vocabSize + logits := lctx.Logits + + if ml.DEBUG { + fmt.Printf("\n\n>>> SampleTopPTopK <<<\n") + fmt.Printf("\n=== LOGITS | %d ===\n", len(logits)) + for i := 0; i < 8; i++ { + fmt.Printf("%.4f ", logits[i]) + } + fmt.Printf(" ... ") + for i := int(len(logits)) - 1; i >= int(len(logits))-8; i-- { + fmt.Printf("%.4f ", logits[i]) + } + /* + fmt.Printf("\n=== LAST N TOKENS | %d ===\n", len(lastNTokens)) + for i := 0; i < int(lastNTokensSize); i++ { + fmt.Printf("%d ", lastNTokens[i]) + } + */ + extractedTokens := ExtractTokens(lastNTokens.Move(-int(lastNTokensSize)), int(lastNTokensSize)) + fmt.Printf("\n=== LAST N TOKENS | %d ===\n", len(extractedTokens)) + for i := 0; i < int(lastNTokensSize); i++ { + fmt.Printf("%d ", extractedTokens[i]) + } + } + + ////if (temp <= 0) { + //// // select the token with the highest logit directly + //// float max_logit = plogits[0]; + //// llama_vocab::id max_id = 0; + //// + //// for (int i = 1; i < n_logits; ++i) { + //// if (plogits[i] > max_logit) { + //// max_logit = plogits[i]; + //// max_id = i; + //// } + //// } + //// return max_id; + ////} + + ////const auto * plogits = logits.data() + logits.size() - n_logits; + //plogits := logits[len(logits)-int(logitsCount):] // FIXME ASAP + plogits := logits[:] + + ////std::vector> logits_id; + ////logits_id.reserve(n_logits); + logitsID := make([]pair, 0, logitsCount) // FIXME LEN vs CAP + + { + scale := float32(1.0 / temp) + for i := uint32(0); i < logitsCount; i++ { + + // Repetition penalty from ctrl paper (https://arxiv.org/abs/1909.05858) + // Credit https://github.com/facebookresearch/llama/compare/main...shawwn:llama:main + + // Check if the i-th token is present in the last_n_tokens ring buffer + tokenExists := false + lastNTokens.Do(func(p interface{}) { + if p.(uint32) == i { + tokenExists = true + } + }) + + // If lastNTokens already contains i-th token, append it with repeat penalty + if tokenExists { + // If score < 0, then repetition penalty has to be multiplied to reduce the previous token probability + if plogits[i] < 0.0 { + logitsID = append(logitsID, pair{plogits[i] * scale * repeatPenalty, i}) + } else { + logitsID = append(logitsID, pair{plogits[i] * scale / repeatPenalty, i}) + } + // Else append pair to logitsID, scaling probability + } else { + logitsID = append(logitsID, pair{plogits[i] * scale, i}) + } + } + } + + if ml.DEBUG { + fmt.Printf("\n=== LOGITS ID AFTER | %d ===\n", len(logitsID)) + for i := 0; i < min(6, len(logitsID)); i++ { + fmt.Printf("{ %.3f | %d }", logitsID[i].first, logitsID[i].second) + } + fmt.Printf(" ... ") + for i := len(logitsID) - 6; i < len(logitsID)-1; i++ { + fmt.Printf("{ %.3f | %d } ", logitsID[i].first, logitsID[i].second) + } + } + + // sort logitsID slice and return only top K elements + //// sampleTopK(logitsID, topK) + + // NB! 
Inline logic for [sampleTopK] right here + + //// std::partial_sort( + //// logits_id.begin(), + //// logits_id.begin() + top_k, logits_id.end(), + //// [](const std::pair & a, const std::pair & b) { + //// return a.first > b.first; + //// }); + //// logits_id.resize(top_k); + + sort.Slice( + logitsID, // logitsID[:topK], + func(a, b int) bool { + return logitsID[a].first > logitsID[b].first + }) + + if ml.DEBUG { + fmt.Printf("\n=== LOGITS ID SORTED | TOP K = %d ===\n", topK) + for i := 0; i < min(6, len(logitsID)); i++ { + fmt.Printf("{ %.3f | %d }", logitsID[i].first, logitsID[i].second) + } + fmt.Printf(" ... ") + for i := len(logitsID) - 6; i < len(logitsID)-1; i++ { + fmt.Printf("{ %.3f | %d } ", logitsID[i].first, logitsID[i].second) + } + } + + logitsID = logitsID[:topK] + + if ml.DEBUG { + fmt.Printf("\n=== LOGITS ID RESIZED | %d ===\n", len(logitsID)) + for i := 0; i < min(6, len(logitsID)); i++ { + fmt.Printf("{ %.3f | %d }", logitsID[i].first, logitsID[i].second) + } + fmt.Printf(" ... ") + for i := len(logitsID) - 6; i < len(logitsID)-1; i++ { + fmt.Printf("{ %.3f | %d } ", logitsID[i].first, logitsID[i].second) + } + } + + // FIXME Why loop? We've already SORTED logitsID and the MAX is just the FIRST element + ////double maxl = -INFINITY; + maxl := float32(math.Inf(-1)) + for _, kv := range logitsID { + //// maxl = std::max(maxl, kv.first); + maxl = max(maxl, kv.first) + } + + // compute probs for the top k tokens + ////probs.reserve(logits_id.size()); + probs := make([]float32, 0, len(logitsID)) // FIXME LEN vs CAP + + sum := float64(0.0) + for _, kv := range logitsID { + p := math.Exp(float64(kv.first - maxl)) + probs = append(probs, float32(p)) + sum += p + } + + if ml.DEBUG { + fmt.Printf("\n=== PROBS | %d ===\n", len(probs)) + for i := 0; i < min(6, len(probs)); i++ { + fmt.Printf("%.3f ", probs[i]) + } + fmt.Printf(" ... ") + for i := len(logitsID) - 6; i < len(probs)-1; i++ { + fmt.Printf("%.3f ", probs[i]) + } + } + + // normalize the probs + for i := range probs { + probs[i] /= float32(sum) + } + + if ml.DEBUG { + fmt.Printf("\n=== PROBS NORM | %d ===\n", len(probs)) + for i := 0; i < min(6, len(probs)); i++ { + fmt.Printf("%.3f ", probs[i]) + } + fmt.Printf(" ... ") + for i := len(logitsID) - 6; i < len(probs)-1; i++ { + fmt.Printf("%.3f ", probs[i]) + } + } + + if topP < 1.0 { + + cumsum := float32(0.0) // TODO float64 for better math? + for i := uint32(0); i < uint32(len(probs)); i++ { + cumsum += probs[i] + if cumsum >= topP { + probs = probs[:i+1] + logitsID = logitsID[:i+1] + break + } + } + + cumsum = 1.0 / cumsum + for i := uint32(0); i < uint32(len(probs)); i++ { + probs[i] *= cumsum + } + } + + if ml.DEBUG { + if len(probs) > 6 { + fmt.Printf("\n=== PROBS POST | %d ===\n", len(probs)) + for i := 0; i < min(6, len(probs)); i++ { + fmt.Printf("%.3f ", probs[i]) + } + fmt.Printf(" ... ") + for i := len(logitsID) - 6; i < len(probs)-1; i++ { + fmt.Printf("%.3f ", probs[i]) + } + } + } + + ////std::discrete_distribution<> dist(probs.begin(), probs.end()); + ////int idx = dist(rng); + ////return logits_id[idx].second; + + // --- discrete distribution + // TODO Do we need something better than hand-crafted math here? 
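+	// A closer port of std::discrete_distribution would draw one uniform
+	// number and return the first index whose cumulative probability covers
+	// it, e.g. (sketch only; probs has already been normalized above):
+	//
+	//	r := rand.New(rand.NewSource(time.Now().UnixNano())).Float32()
+	//	cum := float32(0.0)
+	//	for i := range probs {
+	//		cum += probs[i]
+	//		if r < cum {
+	//			return logitsID[i].second
+	//		}
+	//	}
+	//
+	// The hand-crafted math below instead re-weights each probability by a
+	// random factor and takes the argmax, which is not the same distribution.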
+ + seed := time.Now().UnixNano() + source := rand.NewSource(seed) + + for i := 0; i < len(probs); i++ { + f := float32(source.Int63()) / (1 << 63) + probs[i] = probs[i] * probs[i] * f * f + } + + idx := 0 + maxProb := probs[0] + for i := 1; i < len(probs); i++ { + if probs[i] > maxProb { + idx = i + maxProb = probs[i] + } + } + + if ml.DEBUG { + fmt.Printf("\nidx = %d", idx) + fmt.Printf("\nlogitsID = %d | weight = %f", logitsID[idx].second, logitsID[idx].first) + } + + return logitsID[idx].second +} + +// evaluate the transformer calculated by NodeID and return graphs +// +// - lctx: llama context +// - tokens: new batch of tokens to process +// - n_past: the context size so far +// - n_threads: number of threads to use +// +func ExpandGraph( + + lctx *Context, + tokens []uint32, + tokensCount uint32, + pastCount uint32, + threadsCount int) (*ml.Graph, *ml.Context, error) { + + N := tokensCount + model := lctx.Model + kvSelf := model.kvSelf + + embdSize := model.hparams.embdSize + layersCount := model.hparams.layersCount + ctxSize := model.hparams.ctxSize + headsCount := model.hparams.headsCount + rotCount := model.hparams.embdSize / model.hparams.headsCount + + ctx0 := &ml.Context{} //ctx0 := ml.Init(ml.InitParams{}) + + // for big prompts, if BLAS is enabled, it is better to use only one thread + // otherwise, the threads are spin-lock waiting for the BLAS calls and are degrading the performance + graph := ml.Graph{ThreadsCount: threadsCount} + + // Convert the tokens to a []float32 slice + tokensFloat32 := make([]float32, len(tokens)) + for i, token := range tokens { + tokensFloat32[i] = float32(token) + } + + // Initialize the embd tensor with the tokensFloat32 data + embd := ml.NewTensor(ctx0, ml.TYPE_F32, 1, uint32(len(tokens)), 1, 1, 1, tokensFloat32) + inpL := ml.GetRows(ctx0, model.tokEmbeddings, embd) + + for il := uint32(0); il < layersCount; il++ { + + //if il > 0 { + // break // DEBUG + //} + + inpSA := inpL + cur := &ml.Tensor{} + + // norm + cur = ml.RMSNorm(ctx0, inpL) + + // cur = attention_norm*cur + rep := ml.Repeat(ctx0, model.layers[il].attentionNorm, cur) + + cur = ml.Mul(ctx0, rep, cur) + + // self-attention + { + Qcur := ml.MulMat(ctx0, model.layers[il].wq, cur) + Kcur := ml.MulMat(ctx0, model.layers[il].wk, cur) + Vcur := ml.MulMat(ctx0, model.layers[il].wv, cur) + + // store key and value to memory + if N >= 1 { + + ////struct ggml_tensor * k = ggml_view_1d(ctx0, kv_self.k, N*n_embd, (ggml_element_size(kv_self.k)*n_embd)*(il*n_ctx + n_past)); + ////struct ggml_tensor * v = ggml_view_1d(ctx0, kv_self.v, N*n_embd, (ggml_element_size(kv_self.v)*n_embd)*(il*n_ctx + n_past)); + + ////ggml_build_forward_expand(&graph, ggml_cpy(ctx0, Kcur, k)); + ////ggml_build_forward_expand(&graph, ggml_cpy(ctx0, Vcur, v)); + + // NB! 
ggml_element_size(kv_self.k) = 2 for FP16 + k := ml.View1D(ctx0, kvSelf.K, N*embdSize, embdSize*(il*ctxSize+pastCount)) + v := ml.View1D(ctx0, kvSelf.V, N*embdSize, embdSize*(il*ctxSize+pastCount)) + + ml.BuildForwardExpand(&graph, ml.Copy(ctx0, Kcur, k)) + ml.BuildForwardExpand(&graph, ml.Copy(ctx0, Vcur, v)) + } + + // Q = Qcur.contiguous().view(n_embd/n_head, n_head, N).permute(0, 2, 1, 3) + Q := + ml.Permute(ctx0, + ml.Rope(ctx0, + ml.Copy(ctx0, + Qcur, + ml.NewTensor3D(ctx0, ml.TYPE_F32, embdSize/headsCount, headsCount, N)), + pastCount, rotCount, 0), + 0, 2, 1, 3) + + // K = Kmem.view(n_embd/n_head, n_head, n_past + N).permute(0, 2, 1, 3) + K := + ml.Permute(ctx0, + ml.Rope(ctx0, + ml.Reshape3D(ctx0, + ////ggml_view_1d(ctx0, kv_self.k, (n_past + N)*n_embd, il*n_ctx*ggml_element_size(kv_self.k)*n_embd), + ////n_embd/n_head, n_head, n_past + N), + ml.View1D(ctx0, kvSelf.K, (pastCount+N)*embdSize, il*ctxSize*embdSize), + embdSize/headsCount, headsCount, pastCount+N), + pastCount, rotCount, 1), + 0, 2, 1, 3) + + // K * Q + ////struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q); + KQ := ml.MulMat(ctx0, K, Q) + + // KQ_scaled = KQ / sqrt(n_embd/n_head) + KQScaled := + ml.Scale(ctx0, + KQ, + ml.NewFP32(ctx0, float32(1.0/math.Sqrt(float64(embdSize)/float64(headsCount)))), + ) + + // KQ_masked = mask_past(KQ_scaled) + ////struct ggml_tensor * KQ_masked = ggml_diag_mask_inf(ctx0, KQ_scaled, n_past); + KQMasked := ml.DiagMaskInf(ctx0, KQScaled, pastCount) + + // KQ = soft_max(KQ_masked) + ////struct ggml_tensor * KQ_soft_max = ggml_soft_max(ctx0, KQ_masked); + KQSoftMax := ml.SoftMax(ctx0, KQMasked) + + // V_trans = Vmem.view(n_embd/n_head, n_head, n_past + N).permute(1, 2, 0, 3).contiguous() + VTrans := + ml.Copy(ctx0, + ml.Permute(ctx0, + ml.Reshape3D(ctx0, + ml.View1D(ctx0, kvSelf.V, (pastCount+N)*embdSize, il*ctxSize*embdSize), + embdSize/headsCount, headsCount, pastCount+N), + 1, 2, 0, 3), + ml.NewTensor3D(ctx0, ml.TYPE_F32 /* kv_self.v->type */, pastCount+N, embdSize/headsCount, headsCount)) + + // KQV = transpose(V) * KQ_soft_max + KQV := ml.MulMat(ctx0, VTrans, KQSoftMax) + + // KQV_merged = KQV.permute(0, 2, 1, 3) + KQVMerged := ml.Permute(ctx0, KQV, 0, 2, 1, 3) + + // cur = KQV_merged.contiguous().view(n_embd, N) + cur = ml.Copy(ctx0, + KQVMerged, + ml.NewTensor2D(ctx0, ml.TYPE_F32, embdSize, N)) + + // projection (no bias) + cur = ml.MulMat(ctx0, + model.layers[il].wo, + cur) + } + + inpFF := ml.Add(ctx0, cur, inpSA) + + // feed-forward network + { + // norm + { + cur = ml.RMSNorm(ctx0, inpFF) + + // cur = ffn_norm*cur + cur = ml.Mul(ctx0, + ml.Repeat(ctx0, model.layers[il].ffn_norm, cur), + cur) + } + + tmp := ml.MulMat(ctx0, + model.layers[il].w3, + cur) + + cur = ml.MulMat(ctx0, + model.layers[il].w1, + cur) + + // SILU activation + cur = ml.Silu(ctx0, cur) + + cur = ml.Mul(ctx0, cur, tmp) + + cur = ml.MulMat(ctx0, + model.layers[il].w2, + cur) + } + + cur = ml.Add(ctx0, cur, inpFF) + + // input for next layer + inpL = cur + + } + + // used at the end to optionally extract the embeddings + ////var embeddings *ml.Tensor + + // --- norm + + inpL = ml.RMSNorm(ctx0, inpL) + + // inpL = norm*inpL + inpL = ml.Mul(ctx0, + ml.Repeat(ctx0, model.norm, inpL), + inpL) + + + // lm_head + inpL = ml.MulMat(ctx0, model.output, inpL) + + // add an mock op here + zeroTensor := ml.NewTensor2D(ctx0, inpL.Type, inpL.NE[0], inpL.NE[1]) + inpL = ml.Add(ctx0, inpL, zeroTensor) + + // logits -> probs + // COMMENTED inpL = ggml_soft_max(ctx0, inpL); + + // run the computation + 
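+	// (Here only the graph is built: unlike Eval, ExpandGraph returns without
+	// computing anything, and the caller decides how far to run it, e.g. via
+	// ml.GraphComputeByNodes with a chosen node id as llama_test.go does.)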
ml.BuildForwardExpand(&graph, inpL) + + // ml.GraphComputeByNodes(ctx0, &graph, nodeID) + + return &graph, ctx0, nil +} + + +// llama_model_load +// load the model's weights from a file +// see convert-pth-to-ggml.py for details on format + +func LoadModel( + fileName string, + //partsCount int, + silent bool, +) (*Context, error) { + + lctx := NewContext() + + file, err := os.Open(fileName) + if err != nil { + return nil, err + } + defer file.Close() + + // --- check header magic and format version + + magic := readInt(file) + + if magic == LLAMA_FILE_MAGIC_UNVERSIONED || magic == LLAMA_FILE_MAGIC_OLD { + fmt.Printf("\n[ERROR] Invalid model file '%s'! Too old, regenerate!", fileName) + return nil, fmt.Errorf("invalid model file") + } + + if magic != LLAMA_FILE_MAGIC { + fmt.Printf("\n[ERROR] Invalid model file '%s'! Wrong MAGIC in header", fileName) + return nil, fmt.Errorf("invalid model file") + } + + version := readInt(file) + + if version != LLAMA_FILE_VERSION { + fmt.Printf("\n[ERROR] Invalid model file '%s'! Unsupported version", fileName) + return nil, fmt.Errorf("invalid model file") + } + + // --- load hparams + + vocabSize := readInt(file) // vocab_size + embdSize := readInt(file) // dim + multSize := readInt(file) // multiple_of + headsCount := readInt(file) // n_heads + layersCount := readInt(file) // n_layers + rotCount := readInt(file) // rot = dim // n_heads [obsolete] + f16 := readInt(file) // ftype + + model := lctx.Model + + model.hparams.vocabSize = vocabSize + model.hparams.embdSize = embdSize + model.hparams.multSize = multSize + model.hparams.headsCount = headsCount + model.hparams.layersCount = layersCount + model.hparams.rotCount = rotCount + model.hparams.f16 = f16 + + // --- init cache + //KVCacheInit(&lctx.Model.hparams, &lctx.Model.kvSelf, ml.TYPE_F32) + dt := ml.TYPE_F32 + size := embdSize * layersCount * 512 /*ctxSize*/ // FIXME ctxSize + lctx.Model.kvSelf.K = ml.NewTensor1D(nil, dt, size) + lctx.Model.kvSelf.V = ml.NewTensor1D(nil, dt, size) + + // NB! Do not try to resize / relocate secondary pointers + lctx.Vocab = ml.NewVocab(vocabSize) + vocab := lctx.Vocab + + // FIXME Reserve extra space for tokensCount (N) = 8 (as with LogitsAll == true) + //lctx.Logits = make([]float32, vocabSize*8, vocabSize*8) // NewFloatSlice(vocabSize, vocabSize) // FIXME ASAP + lctx.Logits = make([]float32, vocabSize, vocabSize) // use just vocab size as CPP version does by default + + if ml.DEBUG { + fmt.Printf("\nvocab = %d", vocabSize) + fmt.Printf("\nembd = %d", embdSize) + fmt.Printf("\nmult = %d", multSize) + fmt.Printf("\nheads = %d", headsCount) + fmt.Printf("\nlayers = %d", layersCount) + fmt.Printf("\nrot = %d", rotCount) + fmt.Printf("\nf16 = %d", f16) + } + + //fmt.Printf("\nctx = %d", hparamsCtx) + //fmt.Printf("\nn_ff = %d", n_ff) + + n_ff := ((2*(4*embdSize)/3 + multSize - 1) / multSize) * multSize + + // --- load vocab + + if !silent && runtime.GOOS == "windows" { + Colorize("[magenta][ INIT ][white] Loading vocab...") + } + + vocabBar := progressbar.NewOptions( + int(vocabSize), + progressbar.OptionFullWidth(), + //progressbar.OptionSetWidth(40), + progressbar.OptionEnableColorCodes(true), + progressbar.OptionSetPredictTime(false), + progressbar.OptionSetElapsedTime(false), + progressbar.OptionSetDescription("[light_magenta][ INIT ][light_blue] Loading model vocab... 
[light_cyan]"), + progressbar.OptionSetTheme(progressbar.Theme{ + Saucer: "[light_magenta]▒[reset]", + SaucerHead: "[white]▒[reset]", + SaucerPadding: "[dark_gray]▒[reset]", + BarStart: "[dark_gray]║[reset]", + BarEnd: "[dark_gray]║[reset]", + })) + + for i := uint32(0); i < vocabSize; i++ { + + if !silent && runtime.GOOS != "windows" && i%100 == 0 { + vocabBar.Set(int(i)) + } + + length := readInt(file) + token := readString(file, length) + score := readFP32(file) + + vocab.Token2ID[token] = i + vocab.ID2Token[i] = ml.TokenScore{Token: token, Score: score} + } + + if !silent && runtime.GOOS != "windows" { + vocabBar.Finish() + fmt.Printf("\n") + } + + ctx := model.ctx + + // --- prepare memory for the weights + { + model.tokEmbeddings = ml.NewTensor2D(ctx, ml.TYPE_F32 /*wtype*/, embdSize, vocabSize) + + model.norm = ml.NewTensor1D(ctx, ml.TYPE_F32, embdSize) + model.output = ml.NewTensor2D(ctx, ml.TYPE_F32 /*wtype*/, embdSize, vocabSize) + + // map by name + model.tensors["tok_embeddings.weight"] = model.tokEmbeddings + + model.tensors["norm.weight"] = model.norm + model.tensors["output.weight"] = model.output + + model.layers = make([]Layer, layersCount) + for i := uint32(0); i < layersCount; i++ { + //auto & layer = model.layers[i]; + + model.layers[i].attentionNorm = ml.NewTensor1D(ctx, ml.TYPE_F32, embdSize) + + model.layers[i].wq = ml.NewTensor2D(ctx, ml.TYPE_F32 /*wtype*/, embdSize, embdSize) + model.layers[i].wk = ml.NewTensor2D(ctx, ml.TYPE_F32 /*wtype*/, embdSize, embdSize) + model.layers[i].wv = ml.NewTensor2D(ctx, ml.TYPE_F32 /*wtype*/, embdSize, embdSize) + model.layers[i].wo = ml.NewTensor2D(ctx, ml.TYPE_F32 /*wtype*/, embdSize, embdSize) + + model.layers[i].ffn_norm = ml.NewTensor1D(ctx, ml.TYPE_F32, embdSize) + + model.layers[i].w1 = ml.NewTensor2D(ctx, ml.TYPE_F32 /*wtype*/, embdSize, n_ff) + model.layers[i].w2 = ml.NewTensor2D(ctx, ml.TYPE_F32 /*wtype*/, n_ff, embdSize) + model.layers[i].w3 = ml.NewTensor2D(ctx, ml.TYPE_F32 /*wtype*/, embdSize, n_ff) + + // map by name + prefix := fmt.Sprintf("layers.%d.", i) + + model.tensors[prefix+"attention_norm.weight"] = model.layers[i].attentionNorm + + model.tensors[prefix+"attention.wq.weight"] = model.layers[i].wq + model.tensors[prefix+"attention.wk.weight"] = model.layers[i].wk + model.tensors[prefix+"attention.wv.weight"] = model.layers[i].wv + model.tensors[prefix+"attention.wo.weight"] = model.layers[i].wo + + model.tensors[prefix+"ffn_norm.weight"] = model.layers[i].ffn_norm + + model.tensors[prefix+"feed_forward.w1.weight"] = model.layers[i].w1 + model.tensors[prefix+"feed_forward.w2.weight"] = model.layers[i].w2 + model.tensors[prefix+"feed_forward.w3.weight"] = model.layers[i].w3 + } + } + + if !silent && runtime.GOOS == "windows" { + Colorize("\n[magenta][ INIT ][white] Loading model - please wait ...") + } + + // https://pkg.go.dev/github.com/schollz/progressbar/v3#Option + bar := progressbar.NewOptions(int(layersCount*9), + progressbar.OptionFullWidth(), + //progressbar.OptionSetWidth(40), + progressbar.OptionEnableColorCodes(true), + progressbar.OptionSetPredictTime(false), + progressbar.OptionSetElapsedTime(false), + progressbar.OptionSetDescription("[light_magenta][ INIT ][light_blue] Loading model weights...[light_cyan]"), + progressbar.OptionSetTheme(progressbar.Theme{ + Saucer: "[light_magenta]▒[reset]", + SaucerHead: "[white]▒[reset]", + SaucerPadding: "[dark_gray]▒[reset]", + BarStart: "[dark_gray]║[reset]", + BarEnd: "[dark_gray]║[reset]", + })) + + // --- load weights + + var tensorsCount uint32 + for { + + 
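+		// Each GGJT tensor record is laid out as: uint32 n_dims, uint32 name
+		// length, uint32 data type, then n_dims x uint32 shape values, the
+		// name bytes, padding up to the next 32-byte boundary, and finally the
+		// raw tensor data (FP16 or FP32).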
dims := readInt(file) + if dims < 1 || dims > 2 { // TODO Check for EOF + break + } + + nameLength := readInt(file) + shardType := ml.DType(readInt(file)) + + nelements := 1 + ne := [2]uint32{1, 1} + for i := 0; i < int(dims); i++ { + ne[i] = readInt(file) + nelements *= int(ne[i]) + } + + name := readString(file, nameLength) + if _, ok := model.tensors[name]; !ok { + fmt.Printf("\n[ERROR] Unknown tensor '%s' in model file", name) + os.Exit(1) + } + + if ml.DEBUG { + typeStr := "FP32" + if shardType == ml.TYPE_F16 { + typeStr = "FP16" + } + memStr := fmt.Sprintf("%dM", nelements*4/1024/1024) + fmt.Printf("\n=== LAYER #%d === %s | %s | %s ===", tensorsCount, typeStr, name, memStr) + } + + /* The latest GGJT format is always ONE-PART-NO-SPLIT-TENSORS binary file, so the parsing is really streamlined + + partsCount := LLAMA_N_PARTS[embdSize] + splitType := SPLIT_NONE + if partsCount > 1 && dims > 1 { + splitType = SPLIT_BY_COLUMNS + if strings.Contains(name, "output") { + splitType = SPLIT_NONE + } else if strings.Contains(name, "layers") && + !strings.Contains(name, "attention.wo.weight") && + !strings.Contains(name, "feed_forward.w2.weight") { + splitType = SPLIT_NONE + } + } + */ + + tensor := model.tensors[name] + tensorSize := tensor.Nelements() + + // --- all tensors in file are aligned for 32 bytes + + alignment := int64(32) + offset, _ := file.Seek(0, io.SeekCurrent) + for ; offset%alignment != 0; offset++ { + } + file.Seek(offset, io.SeekStart) + + // --- read tensor into memory + + if shardType == ml.TYPE_F16 { + // FIXME Single-dimension tensors always presented as FP32 + // after conversion from PyTorch even for FP16 models + for n := uint32(0); n < tensorSize; n++ { + tensor.Data[n] = readFP16ToFP32(file) + } + } else if shardType == ml.TYPE_F32 { + var fake []byte + fakeHeader := (*reflect.SliceHeader)(unsafe.Pointer(&fake)) + // NB! unsafe.Pointer(tensor.Data) for *Data VS unsafe.Pointer(&tensor.Data) for Data + dataHeader := (*reflect.SliceHeader)(unsafe.Pointer(&tensor.Data)) + + fakeHeader.Data = dataHeader.Data + fakeHeader.Len = int(tensorSize * 4) + fakeHeader.Cap = int(tensorSize * 4) + + //fmt.Printf("\n== FAKE []BYTE LEN = %d", len(fake)) + if count, err := io.ReadFull(file, fake); err != nil || count != int(tensorSize*4) { + fmt.Printf("\n[ERROR] Failed to read BIG FP32 chunk from model!") + fmt.Printf("\n[ERROR] COUNT = %d | ERR = %s", count, err.Error()) + os.Exit(1) + } + } else { + fmt.Printf("\n[ERROR] Tensor data type is not supported yet!") + os.Exit(0) + } + + tensorsCount++ + model.loadedCount++ + if !silent && runtime.GOOS != "windows" { + bar.Add(1) + } + } + + if !silent && runtime.GOOS != "windows" { + bar.Finish() + } + + return lctx, nil +} + +func max(a, b float32) float32 { + if a >= b { + return a + } + return b +} + +// NB! 
INT = 32 bits +func readInt(file *os.File) uint32 { + buf := make([]byte, 4) + if count, err := file.Read(buf); err != nil || count != 4 { + return 0 + } + return uint32(buf[3])<<24 | uint32(buf[2])<<16 | uint32(buf[1])<<8 | uint32(buf[0]) +} + +func readString(file *os.File, len uint32) string { + buf := make([]byte, len) + if count, err := file.Read(buf); err != nil || count != int(len) { + return "" + } + return string(buf) +} + +func readFP16ToFP32(file *os.File) float32 { + buf := make([]byte, 2) + if count, err := file.Read(buf); err != nil || count != 2 { + return 0.0 + } + bits := uint16(buf[1])<<8 | uint16(buf[0]) + f16 := float16.Frombits(bits) + return f16.Float32() +} + +func readFP32(file *os.File) float32 { + buf := make([]byte, 4) + if count, err := file.Read(buf); err != nil || count != 4 { + return 0.0 + } + bits := uint32(buf[3])<<24 | uint32(buf[2])<<16 | uint32(buf[1])<<8 | uint32(buf[0]) + return math.Float32frombits(bits) +} + +// ExtractTokens is a function to extract a slice of tokens from the ring buffer +func ExtractTokens(r *ring.Ring, count int) []uint32 { + tokens := make([]uint32, count) + for i := 0; i < count; i++ { + tokens[i] = r.Value.(uint32) + r = r.Next() + } + return tokens +} + +func Colorize(format string, opts ...interface{}) (n int, err error) { + var DefaultOutput = colorable.NewColorableStdout() + return fmt.Fprintf(DefaultOutput, colorstring.Color(format), opts...) +} diff --git a/examples/llama/llama_go/llama_test.go b/examples/llama/llama_go/llama_test.go new file mode 100644 index 0000000..f29a7e8 --- /dev/null +++ b/examples/llama/llama_go/llama_test.go @@ -0,0 +1,91 @@ +package llama + +import ( + "fmt" + "mlgo/ml" + "os" + "reflect" + "testing" +) + +func TestLLaMA(t *testing.T) { + modelFile := "/home/iiislab/project/web3_dl/reference/models/llama-7b-fp32.bin.2" + prompt := "Why Golang is so popular?" + threadCount := 32 + ctx, err := LoadModel(modelFile, true) + fmt.Println("Load Model Finish") + if err != nil { + fmt.Println("load model error: ", err) + return + } + embd := ml.Tokenize(ctx.Vocab, prompt, true) + err = Eval(ctx, embd, uint32(len(embd)), 0, threadCount) + fmt.Println("Eval Model Finish") +} + +func TestLLaMAEvalGraph(t *testing.T) { + modelFile := "/home/iiislab/project/web3_dl/reference/models/llama-7b-fp32.bin.2" + prompt := "Why Golang is so popular?" 
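+	// This test exercises the save/decode round trip for one graph node: expand
+	// the compute graph, run it up to the last node, serialize that node's
+	// environment to bytes (also written to ../data/node_<id>), decode the bytes
+	// back, rebuild the tensors from the decoded list and recompute the node,
+	// printing the tensor before and after so the two paths can be compared.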
+ threadCount := 32 + ctx, err := LoadModel(modelFile, true) + fmt.Println("Load Model Finish") + if err != nil { + fmt.Println("load model error: ", err) + return + } + embd := ml.Tokenize(ctx.Vocab, prompt, true) + graph, mlctx, err := ExpandGraph(ctx, embd, uint32(len(embd)), 0, threadCount) + nodeID := int(graph.NodesCount) - 1 + ml.GraphComputeByNodes(mlctx, graph, nodeID) + ml.PrintTensor(graph.Nodes[nodeID], "before") + + envBytes := ml.SaveComputeNodeEnvToBytes(uint32(nodeID), graph.Nodes[nodeID], graph, true) + nodeID_, tensorGraphList_ , err := ml.DecodeComputeNodeEnv(envBytes, true, false) + // save bytes from mips + { + fout, err := os.Create(fmt.Sprintf("../data/node_%v", nodeID)) + if err != nil { + fmt.Println(err) + return + } + defer fout.Close() + _, err = fout.Write(envBytes) + if err != nil { + fmt.Println(err) + return + } + } + // save => tensorOnGraph[] + tensorGraphList := ml.SaveComputeNodeEnv(graph.Nodes[nodeID], graph) + fmt.Println("nodeID Equal: ", nodeID_ == uint32(nodeID)) + fmt.Println("tensorGraphList: ", reflect.DeepEqual(tensorGraphList_, tensorGraphList)) + + // reconstruct + tensorList := make([]*ml.Tensor, 0) + tensorMap := make(map[uint32]*ml.Tensor) + for i := 0; i < len(tensorGraphList); i++ { + tensor := tensorGraphList[i].ToTensor(nil) + tensorMap[tensorGraphList[i].NodeID] = tensor + tensorList = append(tensorList, tensor) + } + // fill in the nodeid + for i := 0; i < len(tensorList); i++ { + tensor := tensorList[i] + tensorG := tensorGraphList[i] + if src0, ok := tensorMap[tensorG.Src0NodeID]; ok { + tensor.Src0 = src0 + } + if src1, ok := tensorMap[tensorG.Src1NodeID]; ok { + tensor.Src1 = src1 + } + } + + // compute + ml.ComputeNodeForward(tensorMap[uint32(nodeID)]) + + // ml.ComputeNodeForward(graph.Nodes[nodeID]) + ml.PrintTensor(tensorMap[uint32(nodeID)], "after") + + fmt.Println("graph node number: ", graph.NodesCount) + fmt.Println("Eval Model Finish") +} \ No newline at end of file diff --git a/examples/llama/main.go b/examples/llama/main.go new file mode 100644 index 0000000..894f156 --- /dev/null +++ b/examples/llama/main.go @@ -0,0 +1,351 @@ +package main + +import ( + "container/ring" + "fmt" + "os" + "runtime" + "strings" + + "github.com/jessevdk/go-flags" + + "mlgo/ml" + "mlgo/examples/llama/llama_go" +) + +type ModelParams struct { + seed int + threadsCount int + predictCount uint32 // new tokens to predict + repeatLastN uint32 // last n tokens to penalize + partsCount int // amount of model parts (-1 = determine from model dimensions) + ctxSize uint32 // context size + batchSize uint32 // batch size for prompt processing + keepCount uint32 + + // --- sampling parameters + + topK uint32 // 40 + topP float32 // 0.95 + temp float32 // 0.80 + repeatPenalty float32 // 1.10 + + model string // model path + prompt string + inputPrefix string // string to prefix user inputs with + + antiprompt []string // string upon seeing which more user input is prompted + + memoryFP16 bool // use f16 instead of f32 for memory kv + randomPrompt bool // do not randomize prompt if none provided + useColor bool // use color to distinguish generations and inputs + interactive bool // interactive mode + + embedding bool // get only sentence embedding + interactiveStart bool // wait for user input immediately + + instruct bool // instruction mode (used for Alpaca models) + ignoreEOS bool // do not stop generating after eos + perplexity bool // compute perplexity over the prompt + use_mlock bool // use mlock to keep model in memory + memTest bool // compute 
maximum memory usage + + verbosePrompt bool +} + +func main() { + + // --- Parse command line args and set default parameters + + var opts struct { + Prompt string `long:"prompt" description:"Text prompt from user to feed the model input"` + Model string `long:"model" description:"Path and file name of converted .bin LLaMA model"` + Threads int `long:"threads" description:"Adjust to the number of CPU cores you want to use [ all cores by default ]"` + Predict uint32 `long:"predict" description:"Number of tokens to predict [ 128 by default ]"` + Context uint32 `long:"context" description:"Context size in tokens [ 512 by default ]"` + Temp float32 `long:"temp" description:"Model temperature hyper parameter [ 0.8 by default ]"` + Silent bool `long:"silent" description:"Hide welcome logo and other output [ show by default ]"` + Chat bool `long:"chat" description:"Chat with user in interactive mode instead of compute over static prompt"` + } + + _, err := flags.Parse(&opts) + if err != nil { + return + } + + prompt := " " + opts.Prompt // add a space to match LLaMA tokenizer behavior + final := "" // accumulate model output + + // Allow to use ALL cores for the program itself and user-specified number for tensor math + // TODO Optimize default settings for CPUs with P and E cores like M1 Pro = 8 performant and 2 energy cores + runtime.GOMAXPROCS(runtime.NumCPU()) + if opts.Threads == 0 { + opts.Threads = runtime.NumCPU() + } + + if opts.Context == 0 { + opts.Context = 512 + } + + if opts.Predict == 0 { + opts.Predict = 128 + } + + if opts.Temp == 0 { + opts.Temp = 0.8 + } + + repeatLastN := uint32(64) + if repeatLastN > opts.Context { + repeatLastN = opts.Context + } + + if !opts.Silent { + showLogo() + } + + if opts.Prompt == "" || opts.Model == "" { + llama.Colorize("\n[magenta][ ERROR ][white] Please specify correct model path and prompt with [light_magenta]--model[white] and [light_magenta]--prompt[white] parameters\n\n") + os.Exit(0) + } + + params := ModelParams{ + model: opts.Model, + interactive: opts.Chat, + + ctxSize: opts.Context, + seed: -1, + threadsCount: opts.Threads, + predictCount: opts.Predict, + repeatLastN: repeatLastN, + partsCount: -1, + batchSize: 8, + + topK: 40, + topP: 0.95, + temp: opts.Temp, + repeatPenalty: 1.10, + + memoryFP16: true, + } + + // --- load the model + + ctx, err := llama.LoadModel(params.model, opts.Silent) + if err != nil { + _, err := llama.Colorize("\n[magenta][ ERROR ][white] Failed to load model [light_magenta]\"%s\"\n\n", params.model) + if err != nil { + return + } + os.Exit(0) + } + + // tokenize the prompt + embdInp := ml.Tokenize(ctx.Vocab, prompt, true) + tokenNewline := ml.Tokenize(ctx.Vocab, "\n", false)[0] + + var embd []uint32 + + // Initialize the ring buffer + lastNTokens := ring.New(int(params.ctxSize)) + + for i := 0; i < int(params.ctxSize); i++ { + lastNTokens.Value = uint32(0) + lastNTokens = lastNTokens.Next() + } + + // A function to append a token to the ring buffer + appendToken := func(token uint32) { + lastNTokens.Value = token + lastNTokens = lastNTokens.Next() + } + + inputNoEcho := false + pastCount := uint32(0) + remainCount := params.predictCount + consumedCount := uint32(0) + + for remainCount != 0 || params.interactive { + + // --- predict + + if len(embd) > 0 { + + // infinite text generation via context swapping + // if we run out of context: + // - take the n_keep first tokens from the original prompt (via n_past) + // - take half of the last (n_ctx - n_keep) tokens and recompute the logits in a batch + + if 
pastCount+uint32(len(embd)) > params.ctxSize { + leftCount := pastCount - params.keepCount + pastCount = params.keepCount + + // insert n_left/2 tokens at the start of embd from last_n_tokens + //embd = append(lastNTokens[:leftCount/2], embd...) + embd = append(llama.ExtractTokens(lastNTokens.Move(-int(leftCount/2)), int(leftCount/2)), embd...) + } + + if err := llama.Eval(ctx, embd, uint32(len(embd)), pastCount, params.threadsCount); err != nil { + fmt.Printf("\n[ERROR] Failed to eval") + os.Exit(1) + } + } + + pastCount += uint32(len(embd)) + embd = []uint32{} + + if len(embdInp) <= int(consumedCount) { // && !isInteracting { + + if params.ignoreEOS { + ctx.Logits[ml.TOKEN_EOS] = 0 + } + + /* + id := llama.SampleTopPTopK(ctx, + lastNTokens[params.ctxSize-params.repeatLastN:], params.repeatLastN, + params.topK, params.topP, params.temp, params.repeatPenalty) + + lastNTokens = lastNTokens[1:] ////last_n_tokens.erase(last_n_tokens.begin()); + lastNTokens = append(lastNTokens, id) + + */ + id := llama.SampleTopPTopK(ctx, + lastNTokens, params.repeatLastN, + params.topK, params.topP, params.temp, params.repeatPenalty) + + appendToken(id) + + // replace end of text token with newline token when in interactive mode + if id == ml.TOKEN_EOS && params.interactive && !params.instruct { + id = tokenNewline + } + + // add it to the context + embd = append(embd, id) + + // echo this to console + inputNoEcho = false + + // decrement remaining sampling budget + remainCount-- + + } else { + + // some user input remains from prompt or interaction, forward it to processing + /* + for len(embdInp) > int(consumedCount) { + embd = append(embd, embdInp[consumedCount]) + if len(lastNTokens) > 0 { + lastNTokens = lastNTokens[1:] + } + lastNTokens = append(lastNTokens, embdInp[consumedCount]) + consumedCount++ + if len(embd) >= int(params.batchSize) { + break + } + } + */ + for len(embdInp) > int(consumedCount) { + embd = append(embd, embdInp[consumedCount]) + appendToken(embdInp[consumedCount]) + consumedCount++ + if len(embd) >= int(params.batchSize) { + break + } + } + } + + // --- display text + + if !inputNoEcho { + for _, id := range embd { + + token := ml.Token2Str(ctx.Vocab, id) + final += token + + if len(strings.TrimSpace(final)) < len(strings.TrimSpace(prompt)) { + continue + } + + out := strings.Split(final, prompt) + + if len(out) == 2 && token == "\n" { + continue + } + + if len(strings.TrimSpace(final)) == len(strings.TrimSpace(prompt)) && (token != "\n") && (len(out) == 2) { + _, err := llama.Colorize("\n\n[magenta]▒▒▒ [light_yellow]" + strings.TrimSpace(prompt) + "\n[light_blue]▒▒▒ ") + if err != nil { + return + } + continue + } + + _, err := llama.Colorize("[white]" + token) + if err != nil { + return + } + + } + } + } +} + +// Colorize is a wrapper for colorstring.Color() and fmt.Fprintf() +// Join colorstring and go-colorable to allow colors both on Mac and Windows +// TODO: Implement as a small library +// func Colorize(format string, opts ...interface{}) (n int, err error) { +// var DefaultOutput = colorable.NewColorableStdout() +// return fmt.Fprintf(DefaultOutput, colorstring.Color(format), opts...) 
+// } + +func showLogo() { + // Read the version from the 'VERSION' file + version, err := os.ReadFile("VERSION") + if err != nil { + fmt.Printf("[ERROR] Failed to read VERSION file") + os.Exit(1) + } + versionStr := strings.TrimSpace(string(version)) + + // https://patorjk.com/software/taag/#p=display&f=3-D&t=llama.go%0A%0ALLaMA.go + // Isometric 1, Modular, Rectangles, Rozzo, Small Isometric 1, 3-D + + logo := ` + /88 /88 /888/888 /88/8888/88 /888/888 /8888/88 /888/888 + /888 /888 /888/ /888 /888/8888/888 /888/ /888 /8888 // /8888//888 + /8888/88 /8888/88 /8888/8888 /888/8888/888 /8888/8888 /88 /8888/8888 /888 /8888 + /8888/888 /8888/888 /888 /8888 /888//88 /888 /888 /8888 /888//8888/88 //888/888 + //// /// //// /// /// //// /// // /// /// //// /// //// // /// ///` + + logoColored := "" + prevColor := "" + color := "" + line := 0 + colors := []string{"[black]", "[light_blue]", "[magenta]", "[light_magenta]", "[light_blue]"} + + for _, char := range logo { + if char == '\n' { + line++ + } else if char == '/' { + color = "[blue]" + } else if char == '8' { + color = colors[line] + char = '▒' + } + if color == prevColor { + logoColored += string(char) + } else { + logoColored += color + string(char) + } + } + + _, err = llama.Colorize(logoColored) + if err != nil { + return + } + _, err = llama.Colorize("\n\n [magenta]▒▒▒▒[light_magenta] [ LLaMA.go v" + versionStr + " ] [light_blue][ LLaMA GPT in pure Golang - based on LLaMA C++ ] [magenta]▒▒▒▒\n\n") + if err != nil { + return + } +} diff --git a/examples/mnist/README.md b/examples/mnist/README.md new file mode 100644 index 0000000..ed0821f --- /dev/null +++ b/examples/mnist/README.md @@ -0,0 +1,29 @@ +# MNIST Example for GGML + +This is a simple example of how to use GGML for inferencing. + +## Training the Model + +A notebook for training a simple two-layer network to recognize digits is located at `trainning/mnist.ipynb`. You can +use this to save a pytorch model to be converted to ggml format. + + + +## GGML Format Conversion + +GGML "format" is whatever you choose for efficient loading. In our case, we just save the hyperparameters used +plus the model weights and biases. Run convert-h5-to-ggml.py to convert your pytorch model. The output format is: + +- magic constant (int32) +- repeated list of tensors +- number of dimensions of tensor (int32) +- tensor dimension (int32 repeated) +- values of tensor (int32) + +Run ```convert-h5-to-ggml.py mnist_model.state_dict``` where `mnist_model.state_dict` is the saved pytorch model from the notebook. + +## MNIST Network + +The MNIST recognizer network is extremely simple. A fully connected layer + relu, followed by a fully connected layer + softmax. This +version of the MNIST network doesn't use convolutions. + diff --git a/examples/mnist/convert-h5-to-ggml.py b/examples/mnist/convert-h5-to-ggml.py new file mode 100644 index 0000000..083a933 --- /dev/null +++ b/examples/mnist/convert-h5-to-ggml.py @@ -0,0 +1,75 @@ +# Convert MNIS h5 transformer model to ggml format +# +# Load the (state_dict) saved model using PyTorch +# Iterate over all variables and write them to a binary file. 
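+#
+# Byte-order note: with IS_BIGENDIAN = True below, header ints are packed with
+# the "!i" (big-endian) struct format and tensor data is written as ">f4"
+# big-endian float32 -- the layout the MIPS-side loader (READ_FROM_BIDENDIAN)
+# reads. With IS_BIGENDIAN = False the native (typically little-endian)
+# layout is used instead.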
+# +# For each variable, write the following: +# - Number of dimensions (int) +# - Name length (int) +# - Dimensions (int[n_dims]) +# - Name (char[name_length]) +# - Data (float[n_dims]) +# +# At the start of the ggml file we write the model parameters + +import sys +import struct +import json +import numpy as np +import re + + +import torch +import torch.nn as nn +import torchvision.datasets as dsets +import torchvision.transforms as transforms +from torch.autograd import Variable + +IS_BIGENDIAN = True + +pack_fmt = "i" + +if len(sys.argv) != 2: + print("Usage: convert-h5-to-ggml.py model\n") + sys.exit(1) + +state_dict_file = sys.argv[1] + +fname_out = "models/mnist/ggml-model-small-f32.bin" +if IS_BIGENDIAN: + fname_out = "models/mnist/ggml-model-small-f32-big-endian.bin" + pack_fmt = "!i" + +state_dict = torch.load(state_dict_file, map_location=torch.device('cpu')) +#print (model) + +list_vars = state_dict +print (list_vars) + +fout = open(fname_out, "wb") + +fout.write(struct.pack(pack_fmt, 0x67676d6c)) # magic: ggml in hex + + +for name in list_vars.keys(): + data = list_vars[name].squeeze().numpy() + print("Processing variable: " + name + " with shape: ", data.shape) + n_dims = len(data.shape) + + fout.write(struct.pack(pack_fmt, n_dims)) + + data = data.astype(np.float32) + for i in range(n_dims): + fout.write(struct.pack(pack_fmt, data.shape[n_dims - 1 - i])) + + # data + if not IS_BIGENDIAN: + data.tofile(fout) + else: + data = data.astype(">f4") + data.tofile(fout) + +fout.close() + +print("Done. Output file: " + fname_out) +print("") diff --git a/examples/mnist/mnist.go b/examples/mnist/mnist.go new file mode 100644 index 0000000..1afbcda --- /dev/null +++ b/examples/mnist/mnist.go @@ -0,0 +1,218 @@ +package mnist + +import ( + "errors" + "fmt" + "math" + "mlgo/ml" + "os" +) + +type mnist_hparams struct{ + n_input int32; + n_hidden int32; + n_classes int32; +} + +type mnist_model struct { + hparams mnist_hparams; + + fc1_weight *ml.Tensor; + fc1_bias *ml.Tensor; + + fc2_weight *ml.Tensor; + fc2_bias *ml.Tensor; + +} + +func mnist_model_load(fname string, model *mnist_model) error { + + file, err := os.Open(fname) + if err != nil { + return err + } + defer file.Close() + + + // verify magic + { + magic := readInt(file) + if magic != 0x67676d6c { + return errors.New("invalid model file (bad magic)") + } + } + + // Read FC1 layer 1 + { + n_dims := int32(readInt(file)) + ne_weight := make([]int32, 0) + for i := int32(0); i < n_dims; i++ { + ne_weight = append(ne_weight, int32(readInt(file))) + } + // FC1 dimensions taken from file, eg. 768x500 + model.hparams.n_input = ne_weight[0] + model.hparams.n_hidden = ne_weight[1] + + model.fc1_weight = ml.NewTensor2D(nil, ml.TYPE_F32, uint32(model.hparams.n_input), uint32(model.hparams.n_hidden)) + for i := 0; i < len(model.fc1_weight.Data); i++{ + model.fc1_weight.Data[i] = readFP32(file) + } + + ne_bias := make([]int32, 0) + for i := 0; i < int(n_dims); i++ { + ne_bias = append(ne_bias, int32(readInt(file))) + } + + model.fc1_bias = ml.NewTensor1D(nil, ml.TYPE_F32, uint32(model.hparams.n_hidden)) + for i := 0; i < len(model.fc1_bias.Data); i++ { + model.fc1_bias.Data[i] = readFP32(file) + } + } + + // Read Fc2 layer 2 + { + n_dims := int32(readInt(file)) + ne_weight := make([]int32, 0) + for i := 0; i < int(n_dims); i++ { + ne_weight = append(ne_weight, int32(readInt(file))) + } + + // FC1 dimensions taken from file, eg. 
10x500 + model.hparams.n_classes = ne_weight[1] + + model.fc2_weight = ml.NewTensor2D(nil, ml.TYPE_F32, uint32(model.hparams.n_hidden), uint32(model.hparams.n_classes)) + for i := 0; i < len(model.fc2_weight.Data); i++{ + model.fc2_weight.Data[i] = readFP32(file) + } + + ne_bias := make([]int32, 0) + for i := 0; i < int(n_dims); i++ { + ne_bias = append(ne_bias, int32(readInt(file))) + } + + model.fc2_bias = ml.NewTensor1D(nil, ml.TYPE_F32, uint32(model.hparams.n_classes)) + for i := 0; i < len(model.fc2_bias.Data); i++ { + model.fc2_bias.Data[i] = readFP32(file) + } + printTensor(model.fc2_bias, "model.fc2_bias") + + } + + return nil +} + +func mnist_eval(model *mnist_model, threadCount int, digit []float32) int { + + ctx0 := &ml.Context{} + graph := ml.Graph{ThreadsCount: threadCount} + + input := ml.NewTensor1D(ctx0, ml.TYPE_F32, uint32(model.hparams.n_input)) + copy(input.Data, digit) + + // fc1 MLP = Ax + b + fc1 := ml.Add(ctx0, ml.MulMat(ctx0, model.fc1_weight, input), model.fc1_bias) + fc2 := ml.Add(ctx0, ml.MulMat(ctx0, model.fc2_weight, ml.Relu(ctx0, fc1)), model.fc2_bias) + + // softmax + final := ml.SoftMax(ctx0, fc2) + + // run the computation + ml.BuildForwardExpand(&graph, final) + ml.GraphCompute(ctx0, &graph) + + printTensor(final, "final tensor") + + maxIndex := 0 + for i := 0; i < 10; i++{ + if final.Data[i] > final.Data[maxIndex] { + maxIndex = i + } + } + return maxIndex +} + +func ExpandGraph(model *mnist_model, threadCount int, digit []float32) (*ml.Graph, *ml.Context) { + ctx0 := &ml.Context{} + graph := ml.Graph{ThreadsCount: threadCount} + + input := ml.NewTensor1D(ctx0, ml.TYPE_F32, uint32(model.hparams.n_input)) + copy(input.Data, digit) + + // fc1 MLP = Ax + b + fc1 := ml.Add(ctx0, ml.MulMat(ctx0, model.fc1_weight, input), model.fc1_bias) + fc2 := ml.Add(ctx0, ml.MulMat(ctx0, model.fc2_weight, ml.Relu(ctx0, fc1)), model.fc2_bias) + + // softmax + final := ml.SoftMax(ctx0, fc2) + + // run the computation + ml.BuildForwardExpand(&graph, final) + return &graph, ctx0 +} + +func LoadModel(modeFile string) (*mnist_model, error) { + model := new(mnist_model) + err := mnist_model_load(modeFile, model) + return model, err +} + +// NB! 
INT = 32 bits +func readInt(file *os.File) uint32 { + buf := make([]byte, 4) + if count, err := file.Read(buf); err != nil || count != 4 { + return 0 + } + return uint32(buf[3])<<24 | uint32(buf[2])<<16 | uint32(buf[1])<<8 | uint32(buf[0]) +} + +func readString(file *os.File, len uint32) string { + buf := make([]byte, len) + if count, err := file.Read(buf); err != nil || count != int(len) { + return "" + } + return string(buf) +} + + +func readFP32(file *os.File) float32 { + buf := make([]byte, 4) + if count, err := file.Read(buf); err != nil || count != 4 { + return 0.0 + } + bits := uint32(buf[3])<<24 | uint32(buf[2])<<16 | uint32(buf[1])<<8 | uint32(buf[0]) + return math.Float32frombits(bits) +} + +func min(a, b int) int { + if a <= b { + return a + } + return b +} + +func printTensor(tensor *ml.Tensor, name string) { + var dt string + if tensor.Type == ml.TYPE_F16 { + dt = "FP16" + } + if tensor.Type == ml.TYPE_F32 { + dt = "FP32" + } + if tensor.Type == ml.TYPE_Q4_0 { + dt = "INT4" + } + + fmt.Printf("\n\n=== [ %s | %s | %d:%d:%d ] ===\n", + name, dt, tensor.NE[0], tensor.NE[1], tensor.NE[2]) + + for nn := 0; nn < min(12, int(tensor.NE[1])); nn++ { + fmt.Printf("\n %d x %d ...\t", nn, tensor.NE[0]) + for ii := 0; ii < min(12, int(tensor.NE[0])); ii++ { + fmt.Printf("%.3f\t", tensor.Data[nn*int(tensor.NE[0])+ii]) + } + } +} + +func main(){ + fmt.Println("hello world") +} \ No newline at end of file diff --git a/examples/mnist/mnist_test.go b/examples/mnist/mnist_test.go new file mode 100644 index 0000000..5194f34 --- /dev/null +++ b/examples/mnist/mnist_test.go @@ -0,0 +1,277 @@ +package mnist + +import ( + "bytes" + "encoding/binary" + "fmt" + "math/rand" + "mlgo/ml" + "os" + "reflect" + "testing" + "time" +) + +func TestMNIST(t *testing.T) { + modelFile := "models/mnist/ggml-model-f32.bin" + digitFile := "models/mnist/t10k-images.idx3-ubyte" + + ml.SINGLE_THREAD = true + model := new(mnist_model) + if err := mnist_model_load(modelFile, model); err != nil { + fmt.Println(err) + return + } + + // load a random test digit + fin, err := os.Open(digitFile) + if err != nil { + fmt.Println(err) + return + } + // Seek to a random digit: 16-byte header + 28*28 * (random 0 - 10000) + rand.Seed(time.Now().UnixNano()) + fin.Seek(int64(16 + 784 * (rand.Int() % 10000)), 0) + buf := make([]byte, 784) + digits := make([]float32, 784) + if count, err := fin.Read(buf); err != nil || count != int(len(buf)) { + fmt.Println(err, count) + return + } + + // render the digit in ASCII + for row := 0; row < 28; row++{ + for col := 0; col < 28; col++ { + digits[row*28 + col] = float32(buf[row*28 + col]) + var c string + if buf[row*28 + col] > 230 { + c = "*" + } else { + c = "_" + } + fmt.Printf(c) + } + fmt.Println("") + } + fmt.Println("") + + res := mnist_eval(model, 1, digits) + fmt.Println("Predicted digit is ", res) +} + + +func IntToBytes(n int) []byte { + x := int32(n) + + bytesBuffer := bytes.NewBuffer([]byte{}) + binary.Write(bytesBuffer, binary.BigEndian, x) + return bytesBuffer.Bytes() +} + +func BytesToInt(b []byte) int { + bytesBuffer := bytes.NewBuffer(b) + + var x int32 + binary.Read(bytesBuffer, binary.BigEndian, &x) + + return int(x) +} + +func TestByteInt(t *testing.T){ + a := int(0x67676d6c) + aBytes := IntToBytes(a) + aInt := BytesToInt(aBytes) + aInt2 := (int(aBytes[0]) << 24) | (int(aBytes[1]) << 16) | (int(aBytes[2]) << 8) | int(aBytes[3]) + fmt.Println("a ", a); + fmt.Println("aBytes ", aBytes) + fmt.Println("aInt ", aInt) + fmt.Println("aInt2 ", aInt2) +} + +func add(a *int) { + *a = 
*a + 1 +} + +func TestSlice(t *testing.T){ + a := 2 + { + add(&a) + } + fmt.Println(a) +} + +func TestSaveInput(t *testing.T) { + digitFile := "models/mnist/t10k-images.idx3-ubyte" + // load a random test digit + fin, err := os.Open(digitFile) + if err != nil { + fmt.Println(err) + return + } + fin.Seek(int64(16 + 784 * 0), 0) + buf := make([]byte, 784) + digits := make([]float32, 784) + if count, err := fin.Read(buf); err != nil || count != int(len(buf)) { + fmt.Println(err, count) + return + } + + // render the digit in ASCII + for row := 0; row < 28; row++{ + for col := 0; col < 28; col++ { + digits[row*28 + col] = float32(buf[row*28 + col]) + var c string + if buf[row*28 + col] > 230 { + c = "*" + } else { + c = "_" + } + fmt.Printf(c) + } + fmt.Println("") + } + fmt.Println("") + + fout, err := os.Create("models/mnist/input_7") + if err != nil { + fmt.Println(err) + return + } + defer fout.Close() + _, err = fout.Write(buf) + if err != nil { + fmt.Println(err) + return + } + +} + +func TestMNISTConvert(t *testing.T) { + modelFile := "models/mnist/ggml-model-f32.bin" + digitFile := "models/mnist/t10k-images.idx3-ubyte" + + ml.SINGLE_THREAD = true + model := new(mnist_model) + if err := mnist_model_load(modelFile, model); err != nil { + fmt.Println(err) + return + } + + // load a random test digit + fin, err := os.Open(digitFile) + if err != nil { + fmt.Println(err) + return + } + // Seek to a random digit: 16-byte header + 28*28 * (random 0 - 10000) + rand.Seed(time.Now().UnixNano()) + fin.Seek(int64(16 + 784 * (rand.Int() % 10000)), 0) + buf := make([]byte, 784) + digits := make([]float32, 784) + if count, err := fin.Read(buf); err != nil || count != int(len(buf)) { + fmt.Println(err, count) + return + } + + // render the digit in ASCII + for row := 0; row < 28; row++{ + for col := 0; col < 28; col++ { + digits[row*28 + col] = float32(buf[row*28 + col]) + var c string + if buf[row*28 + col] > 230 { + c = "*" + } else { + c = "_" + } + fmt.Printf(c) + } + fmt.Println("") + } + fmt.Println("") + + ctx0 := &ml.Context{} + graph := ml.Graph{ThreadsCount: 1} + + input := ml.NewTensor1D(ctx0, ml.TYPE_F32, uint32(model.hparams.n_input)) + copy(input.Data, digits) + + // fc1 MLP = Ax + b + fc1 := ml.Add(ctx0, ml.MulMat(ctx0, model.fc1_weight, input), model.fc1_bias) + fc2 := ml.Add(ctx0, ml.MulMat(ctx0, model.fc2_weight, ml.Relu(ctx0, fc1)), model.fc2_bias) + + // softmax + final := ml.SoftMax(ctx0, fc2) + + // run the computation + ml.BuildForwardExpand(&graph, final) + // stop here + nodeID := 5 + ml.GraphComputeByNodes(ctx0, &graph, nodeID) + + ml.PrintTensor(graph.Nodes[nodeID], "final_before") + + // continue + // ml.ComputeNodeForward(graph.Nodes[5]) + + // ml.PrintTensor(final, "final_after") + + // test coding and encoding + envBytes := ml.SaveComputeNodeEnvToBytes(uint32(nodeID), graph.Nodes[nodeID], &graph, true) + nodeID_, tensorGraphList_ , err := ml.DecodeComputeNodeEnv(envBytes, true, false) + + // save bytes from mips test + { + fout, err := os.Create("models/mnist/node_5") + if err != nil { + fmt.Println(err) + return + } + defer fout.Close() + _, err = fout.Write(envBytes) + if err != nil { + fmt.Println(err) + return + } + } + + // save => tensorOnGraph[] + tensorGraphList := ml.SaveComputeNodeEnv(graph.Nodes[5], &graph) + + fmt.Println("nodeID Equal: ", nodeID_ == uint32(nodeID)) + fmt.Println("tensorGraphList: ", reflect.DeepEqual(tensorGraphList_, tensorGraphList)) + + // reconstruct + tensorList := make([]*ml.Tensor, 0) + tensorMap := make(map[uint32]*ml.Tensor) + for i 
:= 0; i < len(tensorGraphList); i++ { + tensor := tensorGraphList[i].ToTensor(nil) + tensorMap[tensorGraphList[i].NodeID] = tensor + tensorList = append(tensorList, tensor) + } + // fill in the nodeid + for i := 0; i < len(tensorList); i++ { + tensor := tensorList[i] + tensorG := tensorGraphList[i] + if src0, ok := tensorMap[tensorG.Src0NodeID]; ok { + tensor.Src0 = src0 + } + if src1, ok := tensorMap[tensorG.Src1NodeID]; ok { + tensor.Src1 = src1 + } + } + + // compute + ml.ComputeNodeForward(tensorMap[uint32(nodeID)]) + + ml.PrintTensor(final, "final_after") + + tensor := final + tensorOnGraph := tensor.ToTensorOnGraph(&graph) + tensorOnGraphBytes := tensorOnGraph.Encoding(false) + // bytesLen := common.BytesToInt32(tensorOnGraphBytes[:4], false) + // fmt.Println(int(bytesLen) == len(tensorOnGraphBytes) - 4) + tensorOnGraph2 := ml.DecodeTensorOnGraph(tensorOnGraphBytes, false, false) + fmt.Println(reflect.DeepEqual(tensor.Data, tensorOnGraph.Data)) + fmt.Println(reflect.DeepEqual(tensorOnGraph, tensorOnGraph2)) + fmt.Println(tensorOnGraph.Src0NodeID, tensorOnGraph.Src1NodeID) +} \ No newline at end of file diff --git a/examples/mnist/models/mnist/ggml-model-f32-big-endian.bin b/examples/mnist/models/mnist/ggml-model-f32-big-endian.bin new file mode 100644 index 0000000..941e8e2 Binary files /dev/null and b/examples/mnist/models/mnist/ggml-model-f32-big-endian.bin differ diff --git a/examples/mnist/models/mnist/ggml-model-f32.bin b/examples/mnist/models/mnist/ggml-model-f32.bin new file mode 100644 index 0000000..1459a3b Binary files /dev/null and b/examples/mnist/models/mnist/ggml-model-f32.bin differ diff --git a/examples/mnist/models/mnist/ggml-model-small-f32-big-endian.bin b/examples/mnist/models/mnist/ggml-model-small-f32-big-endian.bin new file mode 100644 index 0000000..6de927b Binary files /dev/null and b/examples/mnist/models/mnist/ggml-model-small-f32-big-endian.bin differ diff --git a/examples/mnist/models/mnist/ggml-model-small-f32.bin b/examples/mnist/models/mnist/ggml-model-small-f32.bin new file mode 100644 index 0000000..9b4cde8 Binary files /dev/null and b/examples/mnist/models/mnist/ggml-model-small-f32.bin differ diff --git a/examples/mnist/models/mnist/input_7 b/examples/mnist/models/mnist/input_7 new file mode 100644 index 0000000..6e67157 Binary files /dev/null and b/examples/mnist/models/mnist/input_7 differ diff --git a/examples/mnist/models/mnist/mnist-small.state_dict b/examples/mnist/models/mnist/mnist-small.state_dict new file mode 100644 index 0000000..5121cdf Binary files /dev/null and b/examples/mnist/models/mnist/mnist-small.state_dict differ diff --git a/examples/mnist/models/mnist/mnist_model.state_dict b/examples/mnist/models/mnist/mnist_model.state_dict new file mode 100644 index 0000000..dfb609b Binary files /dev/null and b/examples/mnist/models/mnist/mnist_model.state_dict differ diff --git a/examples/mnist/models/mnist/node_5 b/examples/mnist/models/mnist/node_5 new file mode 100644 index 0000000..2df09b6 Binary files /dev/null and b/examples/mnist/models/mnist/node_5 differ diff --git a/examples/mnist/models/mnist/t10k-images.idx3-ubyte b/examples/mnist/models/mnist/t10k-images.idx3-ubyte new file mode 100644 index 0000000..1170b2c Binary files /dev/null and b/examples/mnist/models/mnist/t10k-images.idx3-ubyte differ diff --git a/examples/mnist/trainning/mnist.ipynb b/examples/mnist/trainning/mnist.ipynb new file mode 100644 index 0000000..9b715a2 --- /dev/null +++ b/examples/mnist/trainning/mnist.ipynb @@ -0,0 +1,438 @@ +{ + "cells": [ + { + 
"cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "#@title Import Dependencies\n", + "\n", + "import torch\n", + "import torch.nn as nn\n", + "import torchvision.datasets as dsets\n", + "import torchvision.transforms as transforms\n", + "from torch.autograd import Variable" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [], + "source": [ + "#@title Define Hyperparameters\n", + "\n", + "input_size = 784 # img_size = (28,28) ---> 28*28=784 in total\n", + "hidden_size = 20 # number of nodes at hidden layer\n", + "num_classes = 10 # number of output classes discrete range [0,9]\n", + "num_epochs = 20 # number of times which the entire dataset is passed throughout the model\n", + "batch_size = 100 # the size of input data took for one iteration\n", + "lr = 1e-3 # size of step " + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Downloading http://yann.lecun.com/exdb/mnist/train-images-idx3-ubyte.gz\n", + "Downloading http://yann.lecun.com/exdb/mnist/train-images-idx3-ubyte.gz to ./data/MNIST/raw/train-images-idx3-ubyte.gz\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "100%|██████████| 9912422/9912422 [00:00<00:00, 28402590.00it/s]\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Extracting ./data/MNIST/raw/train-images-idx3-ubyte.gz to ./data/MNIST/raw\n", + "\n", + "Downloading http://yann.lecun.com/exdb/mnist/train-labels-idx1-ubyte.gz\n", + "Downloading http://yann.lecun.com/exdb/mnist/train-labels-idx1-ubyte.gz to ./data/MNIST/raw/train-labels-idx1-ubyte.gz\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "100%|██████████| 28881/28881 [00:00<00:00, 6406584.19it/s]\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Extracting ./data/MNIST/raw/train-labels-idx1-ubyte.gz to ./data/MNIST/raw\n", + "\n", + "Downloading http://yann.lecun.com/exdb/mnist/t10k-images-idx3-ubyte.gz\n", + "Downloading http://yann.lecun.com/exdb/mnist/t10k-images-idx3-ubyte.gz to ./data/MNIST/raw/t10k-images-idx3-ubyte.gz\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "100%|██████████| 1648877/1648877 [00:00<00:00, 7325341.35it/s]\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Extracting ./data/MNIST/raw/t10k-images-idx3-ubyte.gz to ./data/MNIST/raw\n", + "\n", + "Downloading http://yann.lecun.com/exdb/mnist/t10k-labels-idx1-ubyte.gz\n", + "Downloading http://yann.lecun.com/exdb/mnist/t10k-labels-idx1-ubyte.gz to ./data/MNIST/raw/t10k-labels-idx1-ubyte.gz\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "100%|██████████| 4542/4542 [00:00<00:00, 12057296.69it/s]" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Extracting ./data/MNIST/raw/t10k-labels-idx1-ubyte.gz to ./data/MNIST/raw\n", + "\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\n" + ] + } + ], + "source": [ + "#@title Downloading MNIST data\n", + "\n", + "train_data = dsets.MNIST(root = './data', train = True,\n", + " transform = transforms.ToTensor(), download = True)\n", + "\n", + "test_data = dsets.MNIST(root = './data', train = False,\n", + " transform = transforms.ToTensor())" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [], + "source": [ + "#@title Loading 
the data\n", + "\n", + "train_gen = torch.utils.data.DataLoader(dataset = train_data,\n", + " batch_size = batch_size,\n", + " shuffle = True)\n", + "\n", + "test_gen = torch.utils.data.DataLoader(dataset = test_data,\n", + " batch_size = batch_size, \n", + " shuffle = False)" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [], + "source": [ + "#@title Define model class\n", + "\n", + "class Net(nn.Module):\n", + " def __init__(self, input_size, hidden_size, num_classes):\n", + " super(Net,self).__init__()\n", + " self.fc1 = nn.Linear(input_size, hidden_size)\n", + " self.relu = nn.ReLU()\n", + " self.fc2 = nn.Linear(hidden_size, num_classes)\n", + " \n", + " def forward(self,x):\n", + " out = self.fc1(x)\n", + " out = self.relu(out)\n", + " out = self.fc2(out)\n", + " return out" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": {}, + "outputs": [], + "source": [ + "#@title Build the model\n", + "\n", + "net = Net(input_size, hidden_size, num_classes)\n", + "if torch.cuda.is_available():\n", + " net.cuda()" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "metadata": {}, + "outputs": [], + "source": [ + "#@title Define loss-function & optimizer\n", + "\n", + "loss_function = nn.CrossEntropyLoss()\n", + "optimizer = torch.optim.Adam( net.parameters(), lr=lr)" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Epoch [1/20], Step [100/600], Loss: 0.7586\n", + "Epoch [1/20], Step [200/600], Loss: 0.6578\n", + "Epoch [1/20], Step [300/600], Loss: 0.4132\n", + "Epoch [1/20], Step [400/600], Loss: 0.2653\n", + "Epoch [1/20], Step [500/600], Loss: 0.3760\n", + "Epoch [1/20], Step [600/600], Loss: 0.2321\n", + "Epoch [2/20], Step [100/600], Loss: 0.2010\n", + "Epoch [2/20], Step [200/600], Loss: 0.2502\n", + "Epoch [2/20], Step [300/600], Loss: 0.2265\n", + "Epoch [2/20], Step [400/600], Loss: 0.1894\n", + "Epoch [2/20], Step [500/600], Loss: 0.3196\n", + "Epoch [2/20], Step [600/600], Loss: 0.1802\n", + "Epoch [3/20], Step [100/600], Loss: 0.3340\n", + "Epoch [3/20], Step [200/600], Loss: 0.2048\n", + "Epoch [3/20], Step [300/600], Loss: 0.1734\n", + "Epoch [3/20], Step [400/600], Loss: 0.3398\n", + "Epoch [3/20], Step [500/600], Loss: 0.1892\n", + "Epoch [3/20], Step [600/600], Loss: 0.2637\n", + "Epoch [4/20], Step [100/600], Loss: 0.2435\n", + "Epoch [4/20], Step [200/600], Loss: 0.3686\n", + "Epoch [4/20], Step [300/600], Loss: 0.3716\n", + "Epoch [4/20], Step [400/600], Loss: 0.3624\n", + "Epoch [4/20], Step [500/600], Loss: 0.2705\n", + "Epoch [4/20], Step [600/600], Loss: 0.2090\n", + "Epoch [5/20], Step [100/600], Loss: 0.1542\n", + "Epoch [5/20], Step [200/600], Loss: 0.1493\n", + "Epoch [5/20], Step [300/600], Loss: 0.2130\n", + "Epoch [5/20], Step [400/600], Loss: 0.1685\n", + "Epoch [5/20], Step [500/600], Loss: 0.2899\n", + "Epoch [5/20], Step [600/600], Loss: 0.1895\n", + "Epoch [6/20], Step [100/600], Loss: 0.2628\n", + "Epoch [6/20], Step [200/600], Loss: 0.2071\n", + "Epoch [6/20], Step [300/600], Loss: 0.0898\n", + "Epoch [6/20], Step [400/600], Loss: 0.1123\n", + "Epoch [6/20], Step [500/600], Loss: 0.1715\n", + "Epoch [6/20], Step [600/600], Loss: 0.2295\n", + "Epoch [7/20], Step [100/600], Loss: 0.1155\n", + "Epoch [7/20], Step [200/600], Loss: 0.1513\n", + "Epoch [7/20], Step [300/600], Loss: 0.1155\n", + "Epoch [7/20], Step [400/600], Loss: 0.1920\n", + "Epoch [7/20], Step [500/600], 
Loss: 0.2464\n", + "Epoch [7/20], Step [600/600], Loss: 0.0735\n", + "Epoch [8/20], Step [100/600], Loss: 0.1250\n", + "Epoch [8/20], Step [200/600], Loss: 0.1276\n", + "Epoch [8/20], Step [300/600], Loss: 0.1443\n", + "Epoch [8/20], Step [400/600], Loss: 0.0967\n", + "Epoch [8/20], Step [500/600], Loss: 0.1119\n", + "Epoch [8/20], Step [600/600], Loss: 0.1230\n", + "Epoch [9/20], Step [100/600], Loss: 0.1142\n", + "Epoch [9/20], Step [200/600], Loss: 0.1825\n", + "Epoch [9/20], Step [300/600], Loss: 0.1516\n", + "Epoch [9/20], Step [400/600], Loss: 0.2317\n", + "Epoch [9/20], Step [500/600], Loss: 0.1516\n", + "Epoch [9/20], Step [600/600], Loss: 0.0816\n", + "Epoch [10/20], Step [100/600], Loss: 0.1645\n", + "Epoch [10/20], Step [200/600], Loss: 0.1152\n", + "Epoch [10/20], Step [300/600], Loss: 0.1192\n", + "Epoch [10/20], Step [400/600], Loss: 0.1058\n", + "Epoch [10/20], Step [500/600], Loss: 0.2072\n", + "Epoch [10/20], Step [600/600], Loss: 0.1733\n", + "Epoch [11/20], Step [100/600], Loss: 0.1161\n", + "Epoch [11/20], Step [200/600], Loss: 0.1378\n", + "Epoch [11/20], Step [300/600], Loss: 0.1265\n", + "Epoch [11/20], Step [400/600], Loss: 0.2290\n", + "Epoch [11/20], Step [500/600], Loss: 0.1156\n", + "Epoch [11/20], Step [600/600], Loss: 0.0995\n", + "Epoch [12/20], Step [100/600], Loss: 0.1722\n", + "Epoch [12/20], Step [200/600], Loss: 0.0980\n", + "Epoch [12/20], Step [300/600], Loss: 0.1267\n", + "Epoch [12/20], Step [400/600], Loss: 0.0467\n", + "Epoch [12/20], Step [500/600], Loss: 0.1382\n", + "Epoch [12/20], Step [600/600], Loss: 0.1339\n", + "Epoch [13/20], Step [100/600], Loss: 0.1389\n", + "Epoch [13/20], Step [200/600], Loss: 0.0930\n", + "Epoch [13/20], Step [300/600], Loss: 0.0770\n", + "Epoch [13/20], Step [400/600], Loss: 0.0875\n", + "Epoch [13/20], Step [500/600], Loss: 0.0931\n", + "Epoch [13/20], Step [600/600], Loss: 0.1588\n", + "Epoch [14/20], Step [100/600], Loss: 0.0850\n", + "Epoch [14/20], Step [200/600], Loss: 0.2115\n", + "Epoch [14/20], Step [300/600], Loss: 0.0677\n", + "Epoch [14/20], Step [400/600], Loss: 0.1456\n", + "Epoch [14/20], Step [500/600], Loss: 0.1269\n", + "Epoch [14/20], Step [600/600], Loss: 0.1360\n", + "Epoch [15/20], Step [100/600], Loss: 0.2047\n", + "Epoch [15/20], Step [200/600], Loss: 0.1644\n", + "Epoch [15/20], Step [300/600], Loss: 0.0949\n", + "Epoch [15/20], Step [400/600], Loss: 0.0733\n", + "Epoch [15/20], Step [500/600], Loss: 0.0711\n", + "Epoch [15/20], Step [600/600], Loss: 0.1456\n", + "Epoch [16/20], Step [100/600], Loss: 0.0946\n", + "Epoch [16/20], Step [200/600], Loss: 0.1493\n", + "Epoch [16/20], Step [300/600], Loss: 0.1525\n", + "Epoch [16/20], Step [400/600], Loss: 0.0556\n", + "Epoch [16/20], Step [500/600], Loss: 0.2276\n", + "Epoch [16/20], Step [600/600], Loss: 0.1088\n", + "Epoch [17/20], Step [100/600], Loss: 0.0487\n", + "Epoch [17/20], Step [200/600], Loss: 0.0929\n", + "Epoch [17/20], Step [300/600], Loss: 0.0809\n", + "Epoch [17/20], Step [400/600], Loss: 0.1210\n", + "Epoch [17/20], Step [500/600], Loss: 0.0739\n", + "Epoch [17/20], Step [600/600], Loss: 0.1376\n", + "Epoch [18/20], Step [100/600], Loss: 0.1401\n", + "Epoch [18/20], Step [200/600], Loss: 0.1457\n", + "Epoch [18/20], Step [300/600], Loss: 0.0723\n", + "Epoch [18/20], Step [400/600], Loss: 0.2226\n", + "Epoch [18/20], Step [500/600], Loss: 0.0641\n", + "Epoch [18/20], Step [600/600], Loss: 0.1450\n", + "Epoch [19/20], Step [100/600], Loss: 0.1496\n", + "Epoch [19/20], Step [200/600], Loss: 0.1327\n", + "Epoch [19/20], Step 
[300/600], Loss: 0.0711\n", + "Epoch [19/20], Step [400/600], Loss: 0.1269\n", + "Epoch [19/20], Step [500/600], Loss: 0.0667\n", + "Epoch [19/20], Step [600/600], Loss: 0.0898\n", + "Epoch [20/20], Step [100/600], Loss: 0.0569\n", + "Epoch [20/20], Step [200/600], Loss: 0.1008\n", + "Epoch [20/20], Step [300/600], Loss: 0.0970\n", + "Epoch [20/20], Step [400/600], Loss: 0.1094\n", + "Epoch [20/20], Step [500/600], Loss: 0.0969\n", + "Epoch [20/20], Step [600/600], Loss: 0.0764\n" + ] + } + ], + "source": [ + "#@title Training the model\n", + "\n", + "for epoch in range(num_epochs):\n", + " for i ,(images,labels) in enumerate(train_gen):\n", + " images = Variable(images.view(-1,28*28)).cuda()\n", + " labels = Variable(labels).cuda()\n", + " \n", + " optimizer.zero_grad()\n", + " outputs = net(images)\n", + " loss = loss_function(outputs, labels)\n", + " loss.backward()\n", + " optimizer.step()\n", + " \n", + " if (i+1) % 100 == 0:\n", + " print('Epoch [%d/%d], Step [%d/%d], Loss: %.4f'\n", + " %(epoch+1, num_epochs, i+1, len(train_data)//batch_size, loss.data))" + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Accuracy of the model: 96.100 %\n" + ] + } + ], + "source": [ + "#@title Evaluating the accuracy of the model\n", + "\n", + "correct = 0\n", + "total = 0\n", + "for images,labels in test_gen:\n", + " images = Variable(images.view(-1,28*28)).cuda()\n", + " labels = labels.cuda()\n", + " \n", + " output = net(images)\n", + " _, predicted = torch.max(output,1)\n", + " correct += (predicted == labels).sum()\n", + " total += labels.size(0)\n", + "\n", + "print('Accuracy of the model: %.3f %%' %((100*correct)/(total+1)))" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Model's state_dict:\n", + "fc1.weight \t torch.Size([20, 784])\n", + "fc1.bias \t torch.Size([20])\n", + "fc2.weight \t torch.Size([10, 20])\n", + "fc2.bias \t torch.Size([10])\n" + ] + } + ], + "source": [ + "print(\"Model's state_dict:\")\n", + "for param_tensor in net.state_dict():\n", + " print(param_tensor, \"\\t\", net.state_dict()[param_tensor].size())" + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "metadata": {}, + "outputs": [], + "source": [ + "torch.save(net.state_dict(), \"../models/mnist/mnist-small.state_dict\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "base", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.7" + }, + "orig_nbformat": 4 + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/examples/mnist_mips/build.sh b/examples/mnist_mips/build.sh new file mode 100755 index 0000000..c5a3a09 --- /dev/null +++ b/examples/mnist_mips/build.sh @@ -0,0 +1,15 @@ +#!/usr/bin/env bash +set -e + +export GOOS=linux +export GOARCH=mips +export GOMIPS=softfloat +go build -o ./mlgo + +file mlgo + +if [[ ! 
-d venv ]]; then + python3 -m venv venv +fi + +../../compile.py mlgo diff --git a/examples/mnist_mips/main.go b/examples/mnist_mips/main.go new file mode 100644 index 0000000..6891d87 --- /dev/null +++ b/examples/mnist_mips/main.go @@ -0,0 +1,6 @@ +package main + +func main() { + MIPS_MNIST() +} + diff --git a/examples/mnist_mips/mips_mnist.go b/examples/mnist_mips/mips_mnist.go new file mode 100644 index 0000000..4146ce3 --- /dev/null +++ b/examples/mnist_mips/mips_mnist.go @@ -0,0 +1,223 @@ +package main + +import ( + "errors" + "fmt" + "mlgo/common" + "mlgo/ml" +) + +type mnist_hparams struct{ + n_input int32; + n_hidden int32; + n_classes int32; +} + +type mnist_model struct { + hparams mnist_hparams; + + fc1_weight *ml.Tensor; + fc1_bias *ml.Tensor; + + fc2_weight *ml.Tensor; + fc2_bias *ml.Tensor; + +} + +const ( + READ_FROM_BIDENDIAN = true + OUTPUT_TO_BIDENDIAN = true +) + +func MIPS_mnist_model_load(model *mnist_model) error { + fmt.Println("start MIPS_mnist_model_load") + model_bytes := common.ReadBytes(common.MODEL_ADDR, READ_FROM_BIDENDIAN) + index := 0 + fmt.Println("model_bytes len: ", len(model_bytes)) + + // verify magic + { + magic := common.ReadInt32FromBytes(model_bytes, &index, READ_FROM_BIDENDIAN) + fmt.Printf("magic: %x\n", magic) + if magic != 0x67676d6c { + return errors.New("invalid model file (bad magic)") + } + } + + // Read FC1 layer 1 + { + fmt.Println("reading fc1") + n_dims := int32(common.ReadInt32FromBytes(model_bytes, &index, READ_FROM_BIDENDIAN)) + fmt.Println("n_dims: ", n_dims) + ne_weight := make([]int32, 0) + for i := int32(0); i < n_dims; i++ { + ne_weight = append(ne_weight, int32(common.ReadInt32FromBytes(model_bytes, &index, READ_FROM_BIDENDIAN))) + } + fmt.Println("ne_weight: ", ne_weight) + // FC1 dimensions taken from file, eg. 
768x500 + model.hparams.n_input = ne_weight[0] + model.hparams.n_hidden = ne_weight[1] + + if READ_FROM_BIDENDIAN { + fc1_weight_data_size := model.hparams.n_input * model.hparams.n_hidden + fc1_weight_data := common.DecodeFloat32List(model_bytes[index:index + 4 * int(fc1_weight_data_size)]) + index += 4 * int(fc1_weight_data_size) + model.fc1_weight = ml.NewTensor2DWithData(nil, ml.TYPE_F32, uint32(model.hparams.n_input), uint32(model.hparams.n_hidden), fc1_weight_data) + } else { + model.fc1_weight = ml.NewTensor2D(nil, ml.TYPE_F32, uint32(model.hparams.n_input), uint32(model.hparams.n_hidden)) + fmt.Println("len(model.fc1_weight.Data): ", len(model.fc1_weight.Data)) + for i := 0; i < len(model.fc1_weight.Data); i++{ + model.fc1_weight.Data[i] = common.ReadFP32FromBytes(model_bytes, &index, READ_FROM_BIDENDIAN) + if i % 10000 == 0 { + fmt.Println("loading fc1_weight: ", i) + } + } + } + + fmt.Println("index: ", index) + + ne_bias := make([]int32, 0) + for i := 0; i < int(n_dims); i++ { + ne_bias = append(ne_bias, int32(common.ReadInt32FromBytes(model_bytes, &index, READ_FROM_BIDENDIAN))) + } + + if READ_FROM_BIDENDIAN { + fc1_bias_data_size := int(model.hparams.n_hidden) + fc1_bias_data := common.DecodeFloat32List(model_bytes[index:index + 4*fc1_bias_data_size]) + index += 4*fc1_bias_data_size + model.fc1_bias = ml.NewTensor1DWithData(nil, ml.TYPE_F32, uint32(model.hparams.n_hidden), fc1_bias_data) + } else { + model.fc1_bias = ml.NewTensor1D(nil, ml.TYPE_F32, uint32(model.hparams.n_hidden)) + fmt.Println("len(model.fc1_bias.Data): ", len(model.fc1_bias.Data)) + for i := 0; i < len(model.fc1_bias.Data); i++ { + model.fc1_bias.Data[i] = common.ReadFP32FromBytes(model_bytes, &index, READ_FROM_BIDENDIAN) + if i % 10000 == 0 { + fmt.Println("loading fc1_bias: ", i) + } + } + } + + } + + // Read Fc2 layer 2 + { + fmt.Println("reading fc2") + n_dims := int32(common.ReadInt32FromBytes(model_bytes, &index, READ_FROM_BIDENDIAN)) + ne_weight := make([]int32, 0) + for i := 0; i < int(n_dims); i++ { + ne_weight = append(ne_weight, int32(common.ReadInt32FromBytes(model_bytes, &index, READ_FROM_BIDENDIAN))) + } + + // FC1 dimensions taken from file, eg. 
10x500 + model.hparams.n_classes = ne_weight[1] + + if READ_FROM_BIDENDIAN { + fc2_weight_data_size := int(model.hparams.n_hidden * model.hparams.n_classes) + fc2_weight_data := common.DecodeFloat32List(model_bytes[index:index + 4*fc2_weight_data_size]) + index += 4*fc2_weight_data_size + model.fc2_weight = ml.NewTensor2DWithData(nil, ml.TYPE_F32, uint32(model.hparams.n_hidden), uint32(model.hparams.n_classes), fc2_weight_data) + } else { + model.fc2_weight = ml.NewTensor2D(nil, ml.TYPE_F32, uint32(model.hparams.n_hidden), uint32(model.hparams.n_classes)) + for i := 0; i < len(model.fc2_weight.Data); i++{ + model.fc2_weight.Data[i] = common.ReadFP32FromBytes(model_bytes, &index, READ_FROM_BIDENDIAN) + } + } + + ne_bias := make([]int32, 0) + for i := 0; i < int(n_dims); i++ { + ne_bias = append(ne_bias, int32(common.ReadInt32FromBytes(model_bytes, &index, READ_FROM_BIDENDIAN))) + } + + if READ_FROM_BIDENDIAN { + fc2_bias_data_size := int(model.hparams.n_classes) + fc2_bias_data := common.DecodeFloat32List(model_bytes[index:index + 4*fc2_bias_data_size]) + index += 4*fc2_bias_data_size + model.fc2_bias = ml.NewTensor1DWithData(nil, ml.TYPE_F32, uint32(model.hparams.n_classes), fc2_bias_data) + } else { + model.fc2_bias = ml.NewTensor1D(nil, ml.TYPE_F32, uint32(model.hparams.n_classes)) + for i := 0; i < len(model.fc2_bias.Data); i++ { + model.fc2_bias.Data[i] = common.ReadFP32FromBytes(model_bytes, &index, READ_FROM_BIDENDIAN) + } + } + + ml.PrintTensor(model.fc2_bias, "model.fc2_bias") + } + + fmt.Println("current index: ", index) + + return nil +} + +// input is 784 bytes +func MIPS_InputProcess() []float32 { + fmt.Println("start MIPS_InputProcess") + buf := common.ReadBytes(common.INPUT_ADDR, READ_FROM_BIDENDIAN) + fmt.Println("buf len: ", len(buf)) + digits := make([]float32, 784) + + // render the digit in ASCII + var c string + for row := 0; row < 28; row++{ + for col := 0; col < 28; col++ { + digits[row*28 + col] = float32(buf[row*28 + col]) + if buf[row*28 + col] > 230 { + c += "*" + } else { + c += "_" + } + } + c += "\n" + } + fmt.Println(c) + + return digits +} + +func MIPS_mnist_eval(model *mnist_model, digit []float32) int { + fmt.Println("start MIPS_mnist_eval") + ctx0 := &ml.Context{} + graph := ml.Graph{ThreadsCount: 1} + + input := ml.NewTensor1D(ctx0, ml.TYPE_F32, uint32(model.hparams.n_input)) + copy(input.Data, digit) + + // fc1 MLP = Ax + b + fc1 := ml.Add(ctx0, ml.MulMat(ctx0, model.fc1_weight, input), model.fc1_bias) + fc2 := ml.Add(ctx0, ml.MulMat(ctx0, model.fc2_weight, ml.Relu(ctx0, fc1)), model.fc2_bias) + + // softmax + final := ml.SoftMax(ctx0, fc2) + + // run the computation + ml.BuildForwardExpand(&graph, final) + ml.GraphCompute(ctx0, &graph) + + ml.PrintTensor(final, "final tensor") + + maxIndex := 0 + for i := 0; i < 10; i++{ + if final.Data[i] > final.Data[maxIndex] { + maxIndex = i + } + } + return maxIndex +} + +func MIPS_StoreInMemory(ret int) { + retBytes := common.IntToBytes(ret, OUTPUT_TO_BIDENDIAN) + common.Output(retBytes, OUTPUT_TO_BIDENDIAN) +} + +func MIPS_MNIST() { + fmt.Println("Start MIPS MNIST") + input := MIPS_InputProcess() + model := new(mnist_model) + err := MIPS_mnist_model_load(model) + if err != nil { + fmt.Println(err) + common.Halt() + } + ret := MIPS_mnist_eval(model, input) + fmt.Println("Predicted digit is ", ret) + MIPS_StoreInMemory(ret) +} \ No newline at end of file diff --git a/go.mod b/go.mod new file mode 100644 index 0000000..ca2b1a4 --- /dev/null +++ b/go.mod @@ -0,0 +1,20 @@ +module mlgo + +go 1.20 + +require ( + 
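+	// Notable direct dependencies: go-flags (CLI parsing), go-colorable and
+	// colorstring (cross-platform terminal colors), progressbar/v3 (model
+	// loading bars), float16 (FP16 weight decoding).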
github.com/jessevdk/go-flags v1.5.0 + github.com/mattn/go-colorable v0.1.13 + github.com/mitchellh/colorstring v0.0.0-20190213212951-d06e56a500db + github.com/schollz/progressbar/v3 v3.13.1 + github.com/x448/float16 v0.8.4 + golang.org/x/exp v0.0.0-20230321023759-10a507213a29 +) + +require ( + github.com/mattn/go-isatty v0.0.17 // indirect + github.com/mattn/go-runewidth v0.0.14 // indirect + github.com/rivo/uniseg v0.2.0 // indirect + golang.org/x/sys v0.6.0 // indirect + golang.org/x/term v0.6.0 // indirect +) diff --git a/go.sum b/go.sum new file mode 100644 index 0000000..020213d --- /dev/null +++ b/go.sum @@ -0,0 +1,31 @@ +github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= +github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= +github.com/jessevdk/go-flags v1.5.0 h1:1jKYvbxEjfUl0fmqTCOfonvskHHXMjBySTLW4y9LFvc= +github.com/jessevdk/go-flags v1.5.0/go.mod h1:Fw0T6WPc1dYxT4mKEZRfG5kJhaTDP9pj1c2EWnYs/m4= +github.com/k0kubun/go-ansi v0.0.0-20180517002512-3bf9e2903213/go.mod h1:vNUNkEQ1e29fT/6vq2aBdFsgNPmy8qMdSay1npru+Sw= +github.com/mattn/go-colorable v0.1.13 h1:fFA4WZxdEF4tXPZVKMLwD8oUnCTTo08duU7wxecdEvA= +github.com/mattn/go-colorable v0.1.13/go.mod h1:7S9/ev0klgBDR4GtXTXX8a3vIGJpMovkB8vQcUbaXHg= +github.com/mattn/go-isatty v0.0.16/go.mod h1:kYGgaQfpe5nmfYZH+SKPsOc2e4SrIfOl2e/yFXSvRLM= +github.com/mattn/go-isatty v0.0.17 h1:BTarxUcIeDqL27Mc+vyvdWYSL28zpIhv3RoTdsLMPng= +github.com/mattn/go-isatty v0.0.17/go.mod h1:kYGgaQfpe5nmfYZH+SKPsOc2e4SrIfOl2e/yFXSvRLM= +github.com/mattn/go-runewidth v0.0.14 h1:+xnbZSEeDbOIg5/mE6JF0w6n9duR1l3/WmbinWVwUuU= +github.com/mattn/go-runewidth v0.0.14/go.mod h1:Jdepj2loyihRzMpdS35Xk/zdY8IAYHsh153qUoGf23w= +github.com/mitchellh/colorstring v0.0.0-20190213212951-d06e56a500db h1:62I3jR2EmQ4l5rM/4FEfDWcRD+abF5XlKShorW5LRoQ= +github.com/mitchellh/colorstring v0.0.0-20190213212951-d06e56a500db/go.mod h1:l0dey0ia/Uv7NcFFVbCLtqEBQbrT4OCwCSKTEv6enCw= +github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= +github.com/rivo/uniseg v0.2.0 h1:S1pD9weZBuJdFmowNwbpi7BJ8TNftyUImj/0WQi72jY= +github.com/rivo/uniseg v0.2.0/go.mod h1:J6wj4VEh+S6ZtnVlnTBMWIodfgj8LQOQFoIToxlJtxc= +github.com/schollz/progressbar/v3 v3.13.1 h1:o8rySDYiQ59Mwzy2FELeHY5ZARXZTVJC7iHD6PEFUiE= +github.com/schollz/progressbar/v3 v3.13.1/go.mod h1:xvrbki8kfT1fzWzBT/UZd9L6GA+jdL7HAgq2RFnO6fQ= +github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME= +github.com/stretchr/testify v1.3.0/go.mod h1:M5WIy9Dh21IEIfnGCwXGc5bZfKNJtfHm1UVUgZn+9EI= +github.com/x448/float16 v0.8.4 h1:qLwI1I70+NjRFUR3zs1JPUCgaCXSh3SW62uAKT1mSBM= +github.com/x448/float16 v0.8.4/go.mod h1:14CWIYCyZA/cWjXOioeEpHeN/83MdbZDRQHoFcYsOfg= +golang.org/x/exp v0.0.0-20230321023759-10a507213a29 h1:ooxPy7fPvB4kwsA2h+iBNHkAbp/4JxTSwCmvdjEYmug= +golang.org/x/exp v0.0.0-20230321023759-10a507213a29/go.mod h1:CxIveKay+FTh1D0yPZemJVgC/95VzuuOLq5Qi4xnoYc= +golang.org/x/sys v0.0.0-20210320140829-1e4c9ba3b0c4/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= +golang.org/x/sys v0.0.0-20220811171246-fbc7d0a398ab/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= +golang.org/x/sys v0.6.0 h1:MVltZSvRTcU2ljQOhs94SXPftV6DCNnZViHeQps87pQ= +golang.org/x/sys v0.6.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= +golang.org/x/term v0.6.0 h1:clScbb1cHjoCkyRbWwBEUZ5H/tIFu5TAXIqaZD0Gcjw= +golang.org/x/term v0.6.0/go.mod h1:m6U89DPEgQRMq3DNkDClhWw02AUbt2daBVO4cn4Hv9U= diff --git a/ml/ml.go b/ml/ml.go new file mode 
100644 index 0000000..dc8ee67 --- /dev/null +++ b/ml/ml.go @@ -0,0 +1,3005 @@ +package ml + +import ( + "fmt" + "math" + "os" + "sync" +) + +const ( + DEBUG = false + + MAX_DIMS = 4 + MAX_NODES = 4096 + MAX_PARAMS = 16 + MAX_OPT = 4 + + QK = 32 // quantization + + TOKEN_BOS = 1 + TOKEN_EOS = 2 +) + +type DType uint8 + +// Data types are the same as in llama.cpp so full compatibility there +const ( + TYPE_F32 DType = 0 + TYPE_F16 DType = 1 + TYPE_Q4_0 DType = 2 + TYPE_Q4_1 DType = 3 + TYPE_I8 DType = 4 + TYPE_I16 DType = 5 + TYPE_I32 DType = 6 + TYPE_COUNT DType = 8 +) + +// compute in Single thread +var ( + SINGLE_THREAD = false +) + +func printTensor(tensor *Tensor, name string) { + + var dt string + switch tensor.Type { + case TYPE_F16: + dt = "FP16" + case TYPE_F32: + dt = "FP32" + case TYPE_Q4_0: + dt = "INT4" + } + + fmt.Printf("\n\n=== [ %s | %s | %d:%d:%d ] ===\n", + name, dt, tensor.NE[0], tensor.NE[1], tensor.NE[2]) + + for nn := 0; nn < min(12, int(tensor.NE[1])); nn++ { + fmt.Printf("\n %d x %d ...\t", nn, tensor.NE[0]) + for ii := 0; ii < min(12, int(tensor.NE[0])); ii++ { + fmt.Printf("%.3f\t", tensor.Data[nn*int(tensor.NE[0])+ii]) + } + } +} + +// precomputed exp table for f16 (128 KB) +// static ggml_fp16_t table_exp_f16[1 << 16]; +// var TableExpFP16 [1 << 16]float16.Float16 + +var BLCK_SIZE [TYPE_COUNT]uint32 = [TYPE_COUNT]uint32{1, 1, QK, QK, 1, 1, 1, 0} +var TYPE_SIZE [TYPE_COUNT]uint32 = [TYPE_COUNT]uint32{4, 2, 4 + QK/2, 4*2 + QK/2, 1, 2, 4, 0} + +func TypeSizeFloat(dt DType) float32 { + return float32(TYPE_SIZE[dt]) / float32(BLCK_SIZE[dt]) +} + +// available tensor operations +type optype uint8 + +const ( + OP_NONE optype = iota + OP_DUP + OP_ADD + OP_SUB + OP_MUL + OP_DIV + OP_SQR + OP_SQRT + OP_SUM + OP_MEAN + OP_REPEAT + OP_ABS + OP_SGN + OP_NEG + OP_STEP + OP_RELU + OP_GELU + OP_SILU + OP_NORM + OP_RMS_NORM + + OP_MUL_MAT + + OP_SCALE + OP_CPY + OP_RESHAPE + OP_VIEW + OP_PERMUTE + OP_TRANSPOSE + OP_GET_ROWS + OP_DIAG_MASK_INF + OP_SOFT_MAX + OP_ROPE + OP_CONV_1D_1S + OP_CONV_1D_2S + + OP_FLASH_ATTN + OP_FLASH_FF + + OP_COUNT +) + +// n-dimensional tensor +type Tensor struct { + Type DType + + Dims uint32 + NE [MAX_DIMS]uint32 // number of elements + NB [MAX_DIMS]uint32 // stride in bytes + + op optype + + isParam bool + + grad *Tensor + Src0 *Tensor + Src1 *Tensor + opt [MAX_OPT]*Tensor // FIXME Do we need this? 
+ + TasksCount int + + // performance + //perfRuns uint32 + //perfCycles uint32 + //perfTime uint64 + + Data []float32 + //padding [8]byte +} + +// static inline bool ggml_is_contiguous(const struct ggml_tensor * tensor) { +func (tensor *Tensor) IsContiguous() bool { + // static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function"); + // + return tensor.NB[0] == TYPE_SIZE[tensor.Type] && + tensor.NB[1] == tensor.NB[0]*tensor.NE[0]/BLCK_SIZE[tensor.Type] && + tensor.NB[2] == tensor.NB[1]*tensor.NE[1] && + tensor.NB[3] == tensor.NB[2]*tensor.NE[2] +} + +func AreSameShape(a, b *Tensor) bool { + ////static_assert(MAX_DIMS == 4, "MAX_DIMS is not 4 - update this function"); + return (a.NE[0] == b.NE[0]) && (a.NE[1] == b.NE[1]) && (a.NE[2] == b.NE[2]) && (a.NE[3] == b.NE[3]) +} + +func (t *Tensor) Nelements() uint32 { + ////static_assert(MAX_DIMS == 4, "MAX_DIMS is not 4 - update this function"); + return t.NE[0] * t.NE[1] * t.NE[2] * t.NE[3] +} + +func (t *Tensor) Nrows() uint32 { + ////static_assert(MAX_DIMS == 4, "MAX_DIMS is not 4 - update this function"); + return t.NE[1] * t.NE[2] * t.NE[3] +} + +// size_t ggml_nbytes(const struct ggml_tensor * tensor) { +func (t *Tensor) Nbytes() uint32 { + ////static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function"); + return (t.Nelements() * TYPE_SIZE[t.Type]) / BLCK_SIZE[t.Type] +} + +// struct ggml_tensor * ggml_view_tensor( +func ViewTensor(ctx *Context, src *Tensor) *Tensor { + return NewTensor(ctx, src.Type, src.Dims, src.NE[0], src.NE[1], src.NE[2], src.NE[3], src.Data) +} + +// ggml.c : ggml_dup_tensor +func DupTensor(ctx *Context, src *Tensor) *Tensor { + return NewTensor(ctx, src.Type, src.Dims, src.NE[0], src.NE[1], src.NE[2], src.NE[3], nil) +} + +// struct ggml_tensor * Mul( +func Mul(ctx *Context, a, b *Tensor) *Tensor { + return MulImpl(ctx, a, b, false) +} + +// struct ggml_tensor * Mul_inplace( +func MulInplace(ctx *Context, a, b *Tensor) *Tensor { + return MulImpl(ctx, a, b, true) +} + +// struct ggml_tensor * Mul_impl( +func MulImpl(ctx *Context, a, b *Tensor, inplace bool) *Tensor { + ////ASSERT(ggml_are_same_shape(a, b)); + + if !AreSameShape(a, b) { + fmt.Printf("\n[STOP] MulImpl - tensors of different shapes!") + os.Exit(1) + } + + isNode := false + + if inplace && (a.grad != nil || b.grad != nil) { + isNode = true + } + + if inplace { + ////ASSERT(is_node == false); + } + + var result *Tensor + if inplace { + result = ViewTensor(ctx, a) + } else { + result = DupTensor(ctx, a) + } + + result.op = OP_MUL + result.Src0 = a + result.Src1 = b + + if isNode { + result.grad = DupTensor(ctx, result) + } else { + result.grad = nil + } + + return result +} + +// ggml_can_mul_mat +func CanMulMat(t0, t1 *Tensor) bool { + ////static_assert(MAX_DIMS == 4, "MAX_DIMS is not 4 - update this function"); + return (t0.NE[0] == t1.NE[0]) && (t0.NE[2] == t1.NE[2]) && (t0.NE[3] == t1.NE[3]) +} + +// ggml_mul_mat +func MulMat(ctx *Context, a, b *Tensor) *Tensor { + ////ASSERT(ggml_can_mul_mat(a, b)); + ////GGML_ASSERT(!ggml_is_transposed(a)); + + isNode := false + + if a.grad != nil || b.grad != nil { + isNode = true + } + + result := NewTensor(ctx, TYPE_F32, min32(a.Dims, b.Dims), a.NE[1], b.NE[1], a.NE[2], b.NE[3], nil) + + result.op = OP_MUL_MAT + result.Src0 = a + result.Src1 = b + + if isNode { + result.grad = DupTensor(ctx, result) + } else { + result.grad = nil + } + + return result +} + +// ggml_add + +func AddImpl(ctx *Context, a, b *Tensor, inplace bool) *Tensor { + 
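The stride (NB) and MulMat shape arithmetic above is easy to get wrong when porting, so here is the same arithmetic written out for plain FP32 tensors (BLCK_SIZE = 1, TYPE_SIZE = 4 bytes), using the 784x500 MNIST weight as an assumed example. For quantized Q4_0 data the same tables give 20 bytes per 32-element block, i.e. TypeSizeFloat of 0.625 bytes per value.

```go
package main

import "fmt"

// Reproduces the stride and shape arithmetic from NewTensor and MulMat for
// plain FP32 tensors. Purely illustrative, no ml dependency.
func main() {
	const typeSize = 4 // bytes per FP32 element, BLCK_SIZE = 1

	// fc1_weight in the MNIST example: NE = [784, 500, 1, 1]
	ne := [4]uint32{784, 500, 1, 1}
	var nb [4]uint32
	nb[0] = typeSize
	nb[1] = nb[0] * ne[0] // 3136 bytes per row
	nb[2] = nb[1] * ne[1]
	nb[3] = nb[2] * ne[2]
	fmt.Println("strides:", nb) // [4 3136 1568000 1568000]

	// MulMat(a, b) allocates dst with NE = [a.NE[1], b.NE[1], a.NE[2], b.NE[3]]:
	// a 784x500 weight times a 784-element input yields a 500-element result.
	input := [4]uint32{784, 1, 1, 1}
	dst := [4]uint32{ne[1], input[1], ne[2], input[3]}
	fmt.Println("mulmat result NE:", dst) // [500 1 1 1]
}
```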
////ASSERT(ggml_are_same_shape(a, b)); + + //bool is_node = false; + + ////if (!inplace && (a.grad || b.grad)) { + ////is_node = true; + ////} + + ////struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a); + var result *Tensor + if inplace { + result = ViewTensor(ctx, a) + } else { + result = DupTensor(ctx, a) + } + + result.op = OP_ADD + ////result.grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; + result.grad = nil + result.Src0 = a + result.Src1 = b + + return result +} + +func Add(ctx *Context, a, b *Tensor) *Tensor { + return AddImpl(ctx, a, b, false) +} + +func AddInplace(ctx *Context, a, b *Tensor) *Tensor { + return AddImpl(ctx, a, b, true) +} + +// ggml_sum + +func Sum(ctx *Context, a *Tensor) *Tensor { + isNode := false + + if a.grad != nil { + isNode = true + } + + result := NewTensor1D(ctx, a.Type, 1) + + result.op = OP_SUM + result.Src0 = a + result.Src1 = nil + + if isNode { + result.grad = DupTensor(ctx, result) + } else { + result.grad = nil + } + + return result +} + +// ggml_sub + +func SubImpl(ctx *Context, a, b *Tensor, inplace bool) *Tensor { + ////ASSERT(ggml_are_same_shape(a, b)); + + ////bool is_node = false; + + ////if (!inplace && (a.grad || b.grad)) { + ////is_node = true; + ////} + + var result *Tensor + if inplace { + result = ViewTensor(ctx, a) + } else { + result = DupTensor(ctx, a) + } + + result.op = OP_SUB + ////result.grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; + result.grad = nil + result.Src0 = a + result.Src1 = b + + return result +} + +func Sub(ctx *Context, a, b *Tensor) *Tensor { + return SubImpl(ctx, a, b, false) +} + +func SubInplace(ctx *Context, a, b *Tensor) *Tensor { + return SubImpl(ctx, a, b, true) +} + +// ggml_div + +func DivImpl(ctx *Context, a, b *Tensor, inplace bool) *Tensor { + ////ASSERT(ggml_are_same_shape(a, b)); + + ////bool is_node = false; + + ////if (!inplace && (a->grad || b->grad)) { + ////is_node = true; + ////} + + ////if (inplace) { + ////ASSERT(is_node == false); + ////} + + var result *Tensor + if inplace { + result = ViewTensor(ctx, a) + } else { + result = DupTensor(ctx, a) + } + + result.op = OP_DIV + ////result->grad = is_node ? 
ggml_dup_tensor(ctx, result) : NULL; + result.grad = nil + result.Src0 = a + result.Src1 = b + + return result +} + +func Div(ctx *Context, a, b *Tensor) *Tensor { + return DivImpl(ctx, a, b, false) +} + +func DivInplace(ctx *Context, a, b *Tensor, inplace bool) *Tensor { + return DivImpl(ctx, a, b, true) +} + +// ggml_sgn + +func SgnImpl(ctx *Context, a *Tensor, inplace bool) *Tensor { + isNode := false + + if !inplace && a.grad != nil { + isNode = true + } + + var result *Tensor + if inplace { + result = ViewTensor(ctx, a) + } else { + result = DupTensor(ctx, a) + } + + result.op = OP_SGN + result.Src0 = a + result.Src1 = nil + + if isNode { + result.grad = DupTensor(ctx, result) + } else { + result.grad = nil + } + + return result +} + +func Sgn(ctx *Context, a *Tensor) *Tensor { + return SgnImpl(ctx, a, false) +} + +func SgnInplace(ctx *Context, a *Tensor) *Tensor { + return SgnImpl(ctx, a, true) +} + +// ggml_relu + +func ReluImpl(ctx *Context, a *Tensor, inplace bool) *Tensor { + isNode := false + + if !inplace && a.grad != nil { + isNode = true + } + + var result *Tensor + if inplace { + result = ViewTensor(ctx, a) + } else { + result = DupTensor(ctx, a) + } + + result.op = OP_RELU + result.Src0 = a + result.Src1 = nil + + if isNode { + result.grad = DupTensor(ctx, result) + } else { + result.grad = nil + } + + return result +} + +func Relu(ctx *Context, a *Tensor) *Tensor { + return ReluImpl(ctx, a, false) +} + +func ReluInplace(ctx *Context, a *Tensor) *Tensor { + return ReluImpl(ctx, a, true) +} + +// Repeat + +// struct ggml_tensor * Repeat( +func Repeat(ctx *Context, a, b *Tensor) *Tensor { + ////ASSERT(ggml_can_repeat(a, b)); + + isNode := false + + if a.grad != nil { + isNode = true + } + + if AreSameShape(a, b) && !isNode { + return a + } + + //struct ggml_tensor * result = ggml_new_tensor(ctx, a.type, b.n_dims, b.ne); + result := NewTensor(ctx, a.Type, b.Dims, b.NE[0], b.NE[1], b.NE[2], b.NE[3], nil) + + result.op = OP_REPEAT + result.Src0 = a + result.Src1 = b + + if isNode { + result.grad = DupTensor(ctx, result) + } else { + result.grad = nil + } + + return result +} + +func IsScalar(tensor *Tensor) bool { + ////static_assert(MAX_DIMS == 4, "MAX_DIMS is not 4 - update this function"); + return tensor.NE[0] == 1 && tensor.NE[1] == 1 && tensor.NE[2] == 1 && tensor.NE[3] == 1 +} + +func IsVector(tensor *Tensor) bool { + ////static_assert(MAX_DIMS == 4, "MAX_DIMS is not 4 - update this function"); + return tensor.NE[1] == 1 && tensor.NE[2] == 1 && tensor.NE[3] == 1 +} + +func IsMatrix(tensor *Tensor) bool { + ////static_assert(MAX_DIMS == 4, "MAX_DIMS is not 4 - update this function"); + return tensor.NE[2] == 1 && tensor.NE[3] == 1 +} + +// ggml_get_rows +func GetRows(ctx *Context, a, b *Tensor) *Tensor { + ////ASSERT(ggml_is_matrix(a) && ggml_is_vector(b) && b.type == TYPE_I32); + if !IsMatrix(a) || !IsVector(b) /* || b.Type != TYPE_I32 */ { + fmt.Printf("\n[ERROR] GetRows fail basic assertions") + os.Exit(1) + } + + isNode := false + + if a.grad != nil || b.grad != nil { + ////ASSERT(false); // TODO: implement backward + isNode = true + fmt.Printf("\n[STOP] ml.GetRows") + os.Exit(1) + } + + // TODO: implement non F32 return + //struct ggml_tensor * result = ggml_new_tensor_2d(ctx, a.type, a.ne[0], b.ne[0]); + result := NewTensor2D(ctx, TYPE_F32, a.NE[0], b.NE[0]) + + result.op = OP_GET_ROWS + if isNode { + result.grad = DupTensor(ctx, result) + } else { + result.grad = nil + } + + result.Src0 = a + result.Src1 = b + + return result +} + +func RMSNorm(ctx *Context, a 
*Tensor) *Tensor { + return RMSNormImpl(ctx, a, false) +} + +func RMSNormInplace(ctx *Context, a *Tensor) *Tensor { + return RMSNormImpl(ctx, a, true) +} + +// //struct ggml_tensor * ggml_rms_norm_impl( +func RMSNormImpl(ctx *Context, a *Tensor, inplace bool) *Tensor { + isNode := false + + if !inplace && a.grad != nil { + ////ASSERT(false); // TODO: implement backward + isNode = true + fmt.Printf("\n[STOP] ml.GetRows") + os.Exit(1) + } + + ////struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a); + var result *Tensor + if inplace { + result = ViewTensor(ctx, a) + } else { + result = DupTensor(ctx, a) + } + + result.op = OP_RMS_NORM + result.Src0 = a + result.Src1 = nil // TODO: maybe store epsilon here? + + if isNode { + result.grad = DupTensor(ctx, result) + } else { + result.grad = nil + } + + return result +} + +// ggml_view_1d +// NB! Originally offset in bytes, but here in floats (4-bytes) +func View1D(ctx *Context, a *Tensor, ne0 uint32, offset uint32) *Tensor { + ////if a.grad != nil { + //// ////ASSERT(false); // gradient propagation is not supported + //// fmt.Printf("\n[STOP] View1D : gradient propagation is not supported") + //// os.Exit(1) + ////} + + slice := a.Data[offset:] + result := NewTensor(ctx, a.Type, 1, ne0, 1, 1, 1, slice) + + result.op = OP_VIEW + result.grad = nil + result.Src0 = a + result.Src1 = nil // TODO: maybe store the offset here? + + return result +} + +// ggml_build_forward_impl +func BuildForwardImpl(graph *Graph, tensor *Tensor, expand bool) { + + if !expand { + graph.NodesCount = 0 + graph.LeafsCount = 0 + } + + n0 := graph.NodesCount + VisitParents(graph, tensor) + n_new := graph.NodesCount - n0 + + if n_new > 0 { + // the last added node should always be starting point + ////ASSERT(cgraph.nodes[cgraph.n_nodes - 1] == tensor); + if !(graph.Nodes[graph.NodesCount-1] == tensor) { + fmt.Printf("\n[STOP] BuildForwardImpl : the last added node should always be starting point!") + os.Exit(1) + } + } +} + +// ggml_build_forward_expand +func BuildForwardExpand(graph *Graph, tensor *Tensor) { + BuildForwardImpl(graph, tensor, true) + // construct the tensor => NodeID mapping + ConstructTensor2NodeIDMapping(graph) +} + +func ConstructTensor2NodeIDMapping(graph *Graph) { + if graph.Tensor2NodeID == nil { + graph.Tensor2NodeID = make(map[*Tensor]uint32) + } + cnt := uint32(0) + for i := uint32(0); i < graph.NodesCount; i++ { + node := graph.Nodes[i] + graph.Tensor2NodeID[node] = cnt + cnt++ + } + // add leaves + for i := uint32(0); i < graph.LeafsCount; i++ { + node := graph.Leafs[i] + graph.Tensor2NodeID[node] = cnt + cnt++ + } +} + +// ggml_visit_parents +func VisitParents(graph *Graph, node *Tensor) { + + if node.grad == nil { + // this usually happens when we generate intermediate nodes from constants in the backward pass + // it can also happen during forward pass, if the user performs computations with constants + if node.op != OP_NONE { + //PRINT_DEBUG("%s: warning: node %p has no grad, but op %d\n", __func__, (void *) node, node.op); + } + } + + // check if already visited + for i := uint32(0); i < graph.NodesCount; i++ { + if graph.Nodes[i] == node { + return + } + } + + for i := uint32(0); i < graph.LeafsCount; i++ { + if graph.Leafs[i] == node { + return + } + } + + if node.Src0 != nil { + VisitParents(graph, node.Src0) + } + + if node.Src1 != nil { + VisitParents(graph, node.Src1) + } + + for i := 0; i < MAX_OPT; i++ { + if node.opt[i] != nil { + VisitParents(graph, node.opt[i]) + } + } + + if node.op == OP_NONE 
&& node.grad == nil { + // reached a leaf node, not part of the gradient graph (e.g. a constant) + ////ASSERT(cgraph.n_leafs < MAX_NODES); + + graph.Leafs[graph.LeafsCount] = node + graph.LeafsCount++ + } else { + ////ASSERT(cgraph.n_nodes < MAX_NODES); + + graph.Nodes[graph.NodesCount] = node + graph.Grads[graph.NodesCount] = node.grad + graph.NodesCount++ + } +} + +// ggml_cpy +func CopyImpl(ctx *Context, a, b *Tensor, inplace bool) *Tensor { + + ////ASSERT(ggml_nelements(a) == ggml_nelements(b)); + if a.Nelements() != b.Nelements() { + fmt.Printf("\n[HALT] Copy tensors of different dimensions!") + os.Exit(1) + } + + isNode := false + + if !inplace && (a.grad != nil || b.grad != nil) { + ////ASSERT(false); // TODO: implement backward + isNode = true + fmt.Printf("\n[STOP] cpyImpl") + os.Exit(1) + } + + // make a view of the destination + result := ViewTensor(ctx, b) + + result.op = OP_CPY + result.Src0 = a + result.Src1 = b + + if isNode { + result.grad = DupTensor(ctx, result) + } else { + result.grad = nil + } + + return result +} + +func Copy(ctx *Context, a, b *Tensor) *Tensor { + return CopyImpl(ctx, a, b, false) +} + +func CopyInplace(ctx *Context, a, b *Tensor) *Tensor { + return CopyImpl(ctx, a, b, true) +} + +// computation graph +type Graph struct { + NodesCount uint32 + LeafsCount uint32 + ThreadsCount int + + Jobs chan *ComputeParams + + Nodes [MAX_NODES]*Tensor + Grads [MAX_NODES]*Tensor + Leafs [MAX_NODES]*Tensor + + Tensor2NodeID map[*Tensor]uint32 // *tensor => NodeID + // performance + //perfRuns uint64 + //perfCycles uint64 + ////int64_t perf_time_us; +} + +type InitParams struct { +} + +// ml/ggml.c:2248 +// TODO Do we need this? +type Context struct { +} + +// ggml_new_tensor_1d +func NewTensor1D(ctx *Context, dt DType, ne0 uint32) *Tensor { + return NewTensor(ctx, dt, 1, ne0, 1, 1, 1, nil) +} + +func NewTensor1DWithData(ctx *Context, dt DType, ne0 uint32, data []float32) *Tensor { + return NewTensor(ctx, dt, 1, ne0, 1, 1, 1, data) +} + +// ggml_new_tensor_2d +func NewTensor2D(ctx *Context, dt DType, ne0, ne1 uint32) *Tensor { + return NewTensor(ctx, dt, 2, ne0, ne1, 1, 1, nil) +} + +func NewTensor2DWithData(ctx *Context, dt DType, ne0, ne1 uint32, data []float32) *Tensor { + return NewTensor(ctx, dt, 2, ne0, ne1, 1, 1, data) +} + +func NewTensor3D(ctx *Context, dt DType, ne0, ne1, ne2 uint32) *Tensor { + return NewTensor(ctx, dt, 3, ne0, ne1, ne2, 1, nil) +} + +func NewTensor4D(ctx *Context, dt DType, ne0, ne1, ne2, ne3 uint32) *Tensor { + return NewTensor(ctx, dt, 4, ne0, ne1, ne2, ne3, nil) +} + +// ggml_new_tensor_impl +func NewTensor(ctx *Context, dt DType, dims uint32, ne0, ne1, ne2, ne3 uint32, data []float32) *Tensor { + + if dt != TYPE_F32 && dt != TYPE_I32 { + fmt.Printf("\n[ERROR] NewTensorImpl got not supported type : %d", dt) + os.Exit(1) + } + + ////ggml_assert_aligned(result); + + result := Tensor{ + Type: dt, + Dims: dims, + NE: [4]uint32{ne0, ne1, ne2, ne3}, + op: OP_NONE, + } + + ////result->nb[0] = GGML_TYPE_SIZE[type]; + ////result->nb[1] = result->nb[0]*(result->ne[0]/GGML_BLCK_SIZE[type]); + ////for (int i = 2; i < GGML_MAX_DIMS; i++) { + //// result->nb[i] = result->nb[i - 1]*result->ne[i - 1]; + ////} + + result.NB[0] = TYPE_SIZE[dt] + result.NB[1] = TYPE_SIZE[dt] * (result.NE[0] / BLCK_SIZE[dt]) + result.NB[2] = result.NB[1] * result.NE[1] + result.NB[3] = result.NB[2] * result.NE[2] + + total := ne0 * ne1 * ne2 * ne3 + + if data == nil { + result.Data = make([]float32, total, total) // &newData + } else { + result.Data = data + } + + return 
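BuildForwardExpand walks the expression tree depth-first (VisitParents) and files every tensor either as a leaf (op == OP_NONE, no grad) or as an op node, then fills the Tensor2NodeID map with nodes first and leaves after. A small usage sketch; the mlgo/ml import path is an assumption based on this module's layout.

```go
package main

import (
	"fmt"

	"mlgo/ml"
)

func main() {
	ctx := &ml.Context{}

	w := ml.NewTensor2D(ctx, ml.TYPE_F32, 4, 3) // NE = [4, 3]
	x := ml.NewTensor1D(ctx, ml.TYPE_F32, 4)
	b := ml.NewTensor1D(ctx, ml.TYPE_F32, 3)

	y := ml.Add(ctx, ml.MulMat(ctx, w, x), b)

	graph := ml.Graph{ThreadsCount: 1}
	ml.BuildForwardExpand(&graph, y)

	// Two op nodes (MUL_MAT, ADD) and three constant leafs (w, x, b).
	fmt.Println(graph.NodesCount, graph.LeafsCount) // 2 3
	fmt.Println(len(graph.Tensor2NodeID))           // 5
}
```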
&result +} + +// ggml_permute +func Permute(ctx *Context, a *Tensor, axis0, axis1, axis2, axis3 uint32) *Tensor { + + ////ASSERT(axis0 >= 0 && axis0 < MAX_DIMS); + ////ASSERT(axis1 >= 0 && axis1 < MAX_DIMS); + ////ASSERT(axis2 >= 0 && axis2 < MAX_DIMS); + ////ASSERT(axis3 >= 0 && axis3 < MAX_DIMS); + + ////ASSERT(axis0 != axis1); + ////ASSERT(axis0 != axis2); + ////ASSERT(axis0 != axis3); + ////ASSERT(axis1 != axis2); + ////ASSERT(axis1 != axis3); + ////ASSERT(axis2 != axis3); + + isNode := false + + if a.grad != nil { + ////ASSERT(false); // TODO: implement backward + isNode = true + fmt.Printf("\n[STOP] Permute error") + os.Exit(1) + } + + result := ViewTensor(ctx, a) + + var ne [MAX_DIMS]uint32 + var nb [MAX_DIMS]uint32 + + ne[axis0] = a.NE[0] + ne[axis1] = a.NE[1] + ne[axis2] = a.NE[2] + ne[axis3] = a.NE[3] + + nb[axis0] = a.NB[0] + nb[axis1] = a.NB[1] + nb[axis2] = a.NB[2] + nb[axis3] = a.NB[3] + + result.NE[0] = ne[0] + result.NE[1] = ne[1] + result.NE[2] = ne[2] + result.NE[3] = ne[3] + + result.NB[0] = nb[0] + result.NB[1] = nb[1] + result.NB[2] = nb[2] + result.NB[3] = nb[3] + + result.op = OP_PERMUTE + result.Src0 = a + result.Src1 = nil // TODO: maybe store the permutation here? + + if isNode { + result.grad = DupTensor(ctx, result) + } else { + result.grad = nil + } + + return result +} + +// ggml_rope +func Rope(ctx *Context, a *Tensor, past, dims, mode uint32) *Tensor { + ////ASSERT(n_past >= 0); + + isNode := false + + if a.grad != nil { + ////ASSERT(false); // TODO: implement backward + isNode = true + fmt.Printf("\n[STOP] Rope error") + os.Exit(1) + } + + // TODO: when implement backward, fix this: + //struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a); + result := ViewTensor(ctx, a) + + b := NewTensor1D(ctx, TYPE_I32, 3) + b.Data[0] = float32(past) + b.Data[1] = float32(dims) + b.Data[2] = float32(mode) + + result.op = OP_ROPE + result.Src0 = a + result.Src1 = b + + if isNode { + result.grad = DupTensor(ctx, result) + } else { + result.grad = nil + } + + return result +} + +func Reshape3D(ctx *Context, a *Tensor, ne0, ne1, ne2 uint32) *Tensor { + ////ASSERT(ggml_is_contiguous(a)); + ////ASSERT(ggml_nelements(a) == ne0*ne1*ne2); + + if !a.IsContiguous() { + fmt.Printf("\n[STOP] Reshape3D : tensor is NOT contiguous!") + os.Exit(1) + } + + if a.Nelements() != ne0*ne1*ne2 { + fmt.Printf("\n[STOP] Reshape3D : different elements number!") + os.Exit(1) + } + + ////bool is_node = false; + + ////if (a.grad) { + //// //// ASSERT(false); // TODO: implement backward + //// is_node = true; + ////} + + //ne := [3]uint32{ ne0, ne1, ne2 } + result := NewTensor(ctx, a.Type, 3, ne0, ne1, ne2, 1, a.Data) + + result.op = OP_RESHAPE + ////result.grad = is_node ? 
ggml_dup_tensor(ctx, result) : NULL; + result.grad = nil + result.Src0 = a + result.Src1 = nil + + return result +} + +// ggml_new_f32 +func NewFP32(ctx *Context, value float32) *Tensor { + result := NewTensor1D(ctx, TYPE_F32, 1) + SetFP32(result, value) + return result +} + +// ggml_set_f32 +func SetFP32(tensor *Tensor, value float32) *Tensor { + // FIXME Optimize with mem zeroing + n := tensor.Nelements() + for i := uint32(0); i < n; i++ { + ////ggml_vec_set_f32(nc, (float *)(data + i*n1), value); + tensor.Data[i] = value + } + return tensor +} + +// ggml_scale +func ScaleImpl(ctx *Context, a, b *Tensor, inplace bool) *Tensor { + ////ASSERT(ggml_is_scalar(b)); + ////ASSERT(ggml_is_padded_1d(a)); + + ////bool is_node = false; + + if !inplace && (a.grad != nil || b.grad != nil) { + ////ASSERT(false); // TODO: implement backward + ////is_node = true; + fmt.Printf("\n[STOP] ScaleImpl : assertion failed") + os.Exit(1) + } + + // TODO: when implement backward, fix this: + //struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a); + result := ViewTensor(ctx, a) + + result.op = OP_SCALE + ////result.grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; + result.grad = nil + result.Src0 = a + result.Src1 = b + + return result +} + +func Scale(ctx *Context, a, b *Tensor) *Tensor { + return ScaleImpl(ctx, a, b, false) +} + +func ScaleInplace(ctx *Context, a, b *Tensor) *Tensor { + return ScaleImpl(ctx, a, b, true) +} + +// ggml_diag_mask_inf +func DiagMaskInf(ctx *Context, a *Tensor, past uint32) *Tensor { + ////bool is_node = false; + + if a.grad != nil { + ////ASSERT(false); // TODO: implement backward + ////is_node = true; + fmt.Printf("\n[STOP] DiagMaskInf : assertion failed") + os.Exit(1) + } + + // TODO: when implement backward, fix this: + //struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a); + result := ViewTensor(ctx, a) + b := NewFP32(ctx, float32(past)) // FIXME NewI32(ctx, past) + + result.op = OP_DIAG_MASK_INF + ////result.grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; + result.grad = nil + result.Src0 = a + result.Src1 = b + + return result +} + +// ggml_soft_max +func SoftMax(ctx *Context, a *Tensor) *Tensor { + ////bool is_node = false; + + if a.grad != nil { + ////ASSERT(false); // TODO: implement backward + ////is_node = true; + fmt.Printf("\n[STOP] SoftMax : assertion failed") + os.Exit(1) + } + + // TODO: when implement backward, fix this: + //struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a); + result := ViewTensor(ctx, a) + + result.op = OP_SOFT_MAX + ////result.grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; + result.grad = nil + result.Src0 = a + result.Src1 = nil + + return result +} + +// ggml_silu + +func SiluImpl(ctx *Context, a *Tensor, inplace bool) *Tensor { + ////bool is_node = false; + + ////if (!inplace && (a.grad)) { + ////is_node = true; + ////} + + var result *Tensor + if inplace { + result = ViewTensor(ctx, a) + } else { + result = DupTensor(ctx, a) + } + + result.op = OP_SILU + ////result.grad = is_node ? 
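Reshape3D above creates a new header over the same backing slice (a.Data is passed straight into NewTensor), so no data is copied and writes through either tensor are visible in both. A short sketch; the mlgo/ml import path is assumed from this module's layout.

```go
package main

import (
	"fmt"

	"mlgo/ml"
)

func main() {
	ctx := &ml.Context{}

	a := ml.NewTensor1D(ctx, ml.TYPE_F32, 6)
	for i := range a.Data {
		a.Data[i] = float32(i)
	}

	// Reshape3D reuses a.Data, so the reshaped tensor aliases the original.
	r := ml.Reshape3D(ctx, a, 3, 2, 1)
	r.Data[0] = 42
	fmt.Println(a.Data[0])        // 42
	fmt.Println(r.NE[0], r.NE[1]) // 3 2
}
```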
ggml_dup_tensor(ctx, result) : NULL; + result.grad = nil + result.Src0 = a + result.Src1 = nil + + return result +} + +func Silu(ctx *Context, a *Tensor) *Tensor { + return SiluImpl(ctx, a, false) +} + +func SiluInplace(ctx *Context, a *Tensor) *Tensor { + return SiluImpl(ctx, a, true) +} + +// ggml_step + +func StepImpl(ctx *Context, a *Tensor, inplace bool) *Tensor { + isNode := false + + if !inplace && a.grad != nil { + isNode = true + } + + var result *Tensor + if inplace { + result = ViewTensor(ctx, a) + } else { + result = DupTensor(ctx, a) + } + + result.op = OP_STEP + result.Src0 = a + result.Src1 = nil + + if isNode { + result.grad = DupTensor(ctx, result) + } else { + result.grad = nil + } + + return result +} + +func Step(ctx *Context, a *Tensor) *Tensor { + return StepImpl(ctx, a, false) +} + +func StepInplace(ctx *Context, a *Tensor) *Tensor { + return StepImpl(ctx, a, true) +} + +// ggml_transpose + +func Transpose(ctx *Context, a *Tensor) *Tensor { + ////isNode := false + + if a.grad != nil { + ////ASSERT(false); // TODO: implement backward + ////is_node = true; + } + + result := ViewTensor(ctx, a) + + result.NE[0] = a.NE[1] + result.NE[1] = a.NE[0] + + result.NB[0] = a.NB[1] + result.NB[1] = a.NB[0] + + result.op = OP_TRANSPOSE + ////result.grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; + result.grad = nil + result.Src0 = a + result.Src1 = nil + + return result +} + +func BuildForward(tensor *Tensor) *Graph { + result := Graph{} + BuildForwardImpl(&result, tensor, false) + return &result +} + +func BuildBackward(ctx *Context, gf *Graph, keep bool) Graph { + + result := *gf + ////ASSERT(gf.n_nodes > 0); + + // if we are keeping the gradient graph, we have to detach the gradient nodes from the original graph + if keep { + for i := uint32(0); i < gf.NodesCount; i++ { + node := gf.Nodes[i] + + if node.grad != nil { + node.grad = DupTensor(ctx, node) + gf.Grads[i] = node.grad + } + } + } + + for i := gf.NodesCount - 1; i >= 0; i-- { + node := gf.Nodes[i] + + // because we detached the grad nodes from the original graph, we can afford inplace operations + if node.grad != nil { + ComputeBackward(ctx, node, keep) + } + } + + for i := gf.NodesCount - 1; i >= 0; i-- { + node := gf.Nodes[i] + + if node.isParam { + ////PRINT_DEBUG("%s: found root node %p\n", __func__, (void *) node); + BuildForwardImpl(&result, node.grad, true) + } + } + + return result +} + +//////////////////////////////////////////////////////////////////////////////// + +func ComputeBackward(ctx *Context, tensor *Tensor, inplace bool) { + + src0 := tensor.Src0 + src1 := tensor.Src1 + + switch tensor.op { + + case OP_DUP: + if src0.grad != nil { + src0.grad = AddImpl(ctx, src0.grad, tensor.grad, inplace) + } + case OP_ADD: + if src0.grad != nil { + src0.grad = AddImpl(ctx, src0.grad, tensor.grad, inplace) + } + if src1.grad != nil { + src1.grad = AddImpl(ctx, src1.grad, tensor.grad, inplace) + } + case OP_SUB: + if src0.grad != nil { + src0.grad = AddImpl(ctx, src0.grad, tensor.grad, inplace) + } + if src1.grad != nil { + src1.grad = SubImpl(ctx, src1.grad, tensor.grad, inplace) + } + case OP_MUL: + if src0.grad != nil { + src0.grad = + AddImpl(ctx, + src0.grad, + Mul(ctx, src1, tensor.grad), + inplace) + } + if src1.grad != nil { + src1.grad = + AddImpl(ctx, + src1.grad, + Mul(ctx, src0, tensor.grad), + inplace) + } + case OP_DIV: + if src0.grad != nil { + src0.grad = + AddImpl(ctx, + src0.grad, + Div(ctx, tensor.grad, src1), + inplace) + } + if src1.grad != nil { + src1.grad = + SubImpl(ctx, + src1.grad, + 
Mul(ctx, + tensor.grad, + Div(ctx, tensor, src1)), + inplace) + } + case OP_SQR: + if src0.grad != nil { + src0.grad = + AddImpl(ctx, + src0.grad, + Mul(ctx, + Mul(ctx, src0, tensor.grad), + Repeat(ctx, NewFP32(ctx, 2.0), src0)), + inplace) + } + case OP_SQRT: + if src0.grad != nil { + src0.grad = + AddImpl(ctx, + src0.grad, + Div(ctx, + Repeat(ctx, NewFP32(ctx, 0.5), tensor), + tensor), + inplace) + } + case OP_SUM: + if src0.grad != nil { + src0.grad = + AddImpl(ctx, + src0.grad, + Repeat(ctx, tensor.grad, src0.grad), + inplace) + } + case OP_MEAN: + //// ASSERT(false); // TODO: implement + case OP_REPEAT: + if src0.grad != nil { + src0.grad = + AddImpl(ctx, + src0.grad, + Sum(ctx, tensor.grad), + inplace) + } + case OP_ABS: + if src0.grad != nil { + src0.grad = + AddImpl(ctx, + src0.grad, + Mul(ctx, + Sgn(ctx, src0), + tensor.grad), + inplace) + } + case OP_SGN: + if src0.grad != nil { + // noop + } + case OP_NEG: + if src0.grad != nil { + src0.grad = SubImpl(ctx, src0.grad, tensor.grad, inplace) + } + case OP_STEP: + if src0.grad != nil { + // noop + } + case OP_RELU: + if src0.grad != nil { + src0.grad = SubImpl(ctx, + src0.grad, + Mul(ctx, + Step(ctx, src0), + tensor.grad), + inplace) + } + case OP_GELU: + //// ASSERT(false); // TODO: not implemented + case OP_SILU: + //// ASSERT(false); // TODO: not implemented + case OP_NORM: + //// ASSERT(false); // TODO: not implemented + case OP_RMS_NORM: + //// ASSERT(false); // TODO: not implemented + case OP_MUL_MAT: + if src0.grad != nil { + // TODO: this requires outer product - ggml_out_prod(ctx, src1, tensor.grad); + //// ASSERT(false); + fmt.Printf("\n[HALT] ComputeBackward : OP_MUL_MAT with src0.grad!") + os.Exit(1) + } + if src1.grad != nil { + src1.grad = + AddImpl(ctx, + src1.grad, + // TODO: fix transpose, the node will break the graph connections + MulMat(ctx, Transpose(ctx, src0), tensor.grad), + inplace) + } + case OP_SCALE: + //// ASSERT(false); // TODO: not implemented + case OP_CPY: + //// ASSERT(false); // TODO: not implemented + case OP_RESHAPE: + //// ASSERT(false); // TODO: not implemented + case OP_VIEW: + //// ASSERT(false); // not supported + case OP_PERMUTE: + //// ASSERT(false); // TODO: not implemented + case OP_TRANSPOSE: + //// ASSERT(false); // TODO: not implemented + case OP_GET_ROWS: + //// ASSERT(false); // TODO: not implemented + case OP_DIAG_MASK_INF: + //// ASSERT(false); // TODO: not implemented + case OP_SOFT_MAX: + //// ASSERT(false); // TODO: not implemented + case OP_ROPE: + //// ASSERT(false); // TODO: not implemented + case OP_CONV_1D_1S: + //// ASSERT(false); // TODO: not implemented + case OP_CONV_1D_2S: + //// ASSERT(false); // TODO: not implemented + case OP_FLASH_ATTN: + //// ASSERT(false); // not supported + case OP_FLASH_FF: + //// ASSERT(false); // not supported + case OP_NONE: + // nop + case OP_COUNT: + //// ASSERT(false); + } +} + +// --- + +type TaskType uint8 + +const ( + TASK_INIT TaskType = 0 + TASK_COMPUTE TaskType = 1 + TASK_FINALIZE TaskType = 2 +) + +type ComputeParams struct { + Type TaskType + + ith uint32 + nth uint32 + + tensor *Tensor + + wg *sync.WaitGroup +} + +// Golang doesn’t have unary Bitwise NOT(~) like other programming languages +// Here, you have to use Bitwise XOR(^) operator as Bitwise NOT +func up32(n uint32) uint32 { // FIXME Not needed ? + return uint32(n+31) & ^uint32(31) +} + +func up(n, m uint32) uint32 { // FIXME Not needed ? 
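The OP_MUL case of ComputeBackward above applies the element-wise product rule: for c = a*b, the gradient reaching a is b*dL/dc and the gradient reaching b is a*dL/dc, each accumulated into the existing grad. A dependency-free numeric sketch of that accumulation:

```go
package main

import "fmt"

// mulBackward mirrors the OP_MUL case: given the upstream gradient dc for
// c = a*b (element-wise), accumulate da += b*dc and db += a*dc.
func mulBackward(a, b, dc, da, db []float32) {
	for i := range a {
		da[i] += b[i] * dc[i]
		db[i] += a[i] * dc[i]
	}
}

func main() {
	a := []float32{1, 2, 3}
	b := []float32{4, 5, 6}
	dc := []float32{1, 1, 1} // pretend L = sum(c), so dL/dc = 1 everywhere
	da := make([]float32, 3)
	db := make([]float32, 3)
	mulBackward(a, b, dc, da, db)
	fmt.Println(da) // [4 5 6], equals b
	fmt.Println(db) // [1 2 3], equals a
}
```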
+ // assert m is a power of 2 + ////GGML_ASSERT((m & (m - 1)) == 0); + return uint32(n+m-1) & ^uint32(m-1) +} + +func max(a, b int) int { // FIXME Not needed ? + if a >= b { + return a + } + return b +} + +// Job is goroutine existing while the computation loop is active +// The main purpose of the Job is to perform some part +// of time consuming matrix multiplications +func Job(listen <-chan *ComputeParams) { + //fmt.Printf("\nJOB STARTED...") + for params := range listen { + + //fmt.Printf("\n...JOB SIGNAL") + ComputeForwardMulMatFP32( + params, + params.tensor.Src0, + params.tensor.Src1, + params.tensor) + + // DEBUG MULTI_THREAD + //if params.nth > 1 { + // defer params.wg.Done() + //defer fmt.Printf("\nTHREAD #%d ... defer Done()", params.ith) + //} + + //fmt.Printf("\n...JOB DONE") + params.wg.Done() + } + //fmt.Printf("\nJOB FINISHED...") +} + +func GraphCompute(ctx *Context, graph *Graph) { + + maxThreads := graph.ThreadsCount + + // --- init N job goroutines and channel to send tasks for them + + graph.Jobs = make(chan *ComputeParams, maxThreads) // TODO Right place to init? + defer close(graph.Jobs) + + // TODO Investigate https://pkg.go.dev/runtime#LockOSThread + for i := 0; i < maxThreads; i++ { + go Job(graph.Jobs) + } + + // --- initialize tasks + + { + // thread scheduling for the different operations + // TasksCount might be 0, 1, or ThreadsCount + for i := uint32(0); i < graph.NodesCount; i++ { + + ////struct ggml_tensor * node = cgraph->nodes[i]; + node := graph.Nodes[i] + + if DEBUG { + fmt.Printf("\n\n### STEP #%d ### %d - %d [ %d:%d:%d:%d ]", i, node.op, node.Type, node.NE[0], node.NE[1], node.NE[2], node.NE[3]) + } + + switch node.op { + + case OP_DUP: + node.TasksCount = 1 + case OP_ADD: + node.TasksCount = 1 // TODO threads + case OP_SUB: + case OP_MUL: + case OP_DIV: + case OP_SQR: + case OP_SQRT: + case OP_SUM: + case OP_MEAN: + case OP_REPEAT: + case OP_ABS: + case OP_SGN: + case OP_NEG: + case OP_STEP: + case OP_RELU: + node.TasksCount = 1 + case OP_GELU: + node.TasksCount = 1 // TODO threads + case OP_SILU: + node.TasksCount = 1 // TODO threads + case OP_NORM: + case OP_RMS_NORM: + node.TasksCount = 1 // TODO threads + case OP_MUL_MAT: + node.TasksCount = maxThreads + // TODO: use different scheduling for different matrix sizes + case OP_SCALE: + node.TasksCount = 1 // TODO threads + case OP_CPY: + case OP_RESHAPE: + case OP_VIEW: + case OP_PERMUTE: + case OP_TRANSPOSE: + case OP_GET_ROWS: + case OP_DIAG_MASK_INF: + node.TasksCount = 1 + case OP_SOFT_MAX: + node.TasksCount = 1 // TODO threads + case OP_ROPE: + ////node.TasksCount = 1 + case OP_CONV_1D_1S: + case OP_CONV_1D_2S: + node.TasksCount = 1 // TODO threads + ////ASSERT(node->src0->ne[3] == 1); + ////ASSERT(node->src1->ne[2] == 1); + ////ASSERT(node->src1->ne[3] == 1); + case OP_FLASH_ATTN: + node.TasksCount = 1 // TODO threads + case OP_FLASH_FF: + node.TasksCount = 1 // TODO threads + case OP_NONE: + node.TasksCount = 1 + case OP_COUNT: + fmt.Printf("\n[HALT] Something wrong with compute graph!") + os.Exit(1) + } + } + } + + for i := uint32(0); i < graph.NodesCount; i++ { + + node := graph.Nodes[i] + + if DEBUG { + fmt.Printf("\n\n### STEP #%d ### %d - %d [ %d:%d:%d:%d ]", i, node.op, node.Type, node.NE[0], node.NE[1], node.NE[2], node.NE[3]) + } + + params := ComputeParams{ + Type: TASK_INIT, + ith: 0, + nth: uint32(node.TasksCount), + } + + ComputeForward(graph, ¶ms, node) // TASK_INIT + + // --- COMPUTE + + // BREAKPOINT DEBUG + //if i > 1300 { + // fmt.Printf("\n\n=== HALT #%d ===", i) + // 
os.Exit(0) + //} + + params.Type = TASK_COMPUTE + ComputeForward(graph, ¶ms, node) + + // --- FINALIZE + + params.Type = TASK_FINALIZE + ComputeForward(graph, ¶ms, node) + } + +} + + + +// ======================================================================= + +func ComputeForward(graph *Graph, params *ComputeParams, tensor *Tensor) { + + switch tensor.op { + + case OP_DUP: + ////ggml_compute_forward_dup(params, tensor->src0, tensor); + fmt.Printf("\n[HALT] Please implement : ggml_compute_forward_dup") + os.Exit(1) + case OP_ADD: + ComputeForwardAddFP32(params, tensor.Src0, tensor.Src1, tensor) + case OP_SUB: + ////ggml_compute_forward_sub(params, tensor->src0, tensor->src1, tensor); + fmt.Printf("\n[HALT] Please implement : ggml_compute_forward_sub") + os.Exit(1) + case OP_MUL: + ComputeForwardMulFP32(params, tensor.Src0, tensor.Src1, tensor) + case OP_DIV: + ////ggml_compute_forward_div(params, tensor->src0, tensor->src1, tensor); + fmt.Printf("\n[HALT] Please implement : ggml_compute_forward_div") + os.Exit(1) + case OP_SQR: + ////ggml_compute_forward_sqr(params, tensor->src0, tensor); + fmt.Printf("\n[HALT] Please implement : ggml_compute_forward_sqr") + os.Exit(1) + case OP_SQRT: + ////ggml_compute_forward_sqrt(params, tensor->src0, tensor); + fmt.Printf("\n[HALT] Please implement : ggml_compute_forward_sqrt") + os.Exit(1) + case OP_SUM: + ////ggml_compute_forward_sum(params, tensor->src0, tensor); + fmt.Printf("\n[HALT] Please implement : ggml_compute_forward_sum") + os.Exit(1) + case OP_MEAN: + ////ggml_compute_forward_mean(params, tensor->src0, tensor); + fmt.Printf("\n[HALT] Please implement : ggml_compute_forward_mean") + os.Exit(1) + case OP_REPEAT: + ComputeForwardRepeatFP32(params, tensor.Src0, tensor) + case OP_ABS: + ////ggml_compute_forward_abs(params, tensor->src0, tensor); + fmt.Printf("\n[HALT] Please implement : ggml_compute_forward_abs") + os.Exit(1) + case OP_SGN: + ////ggml_compute_forward_sgn(params, tensor->src0, tensor); + fmt.Printf("\n[HALT] Please implement : ggml_compute_forward_sgn") + os.Exit(1) + case OP_NEG: + ////ggml_compute_forward_neg(params, tensor->src0, tensor); + fmt.Printf("\n[HALT] Please implement : ggml_compute_forward_neg") + os.Exit(1) + case OP_STEP: + ////ggml_compute_forward_step(params, tensor->src0, tensor); + fmt.Printf("\n[HALT] Please implement : ggml_compute_forward_step") + os.Exit(1) + case OP_RELU: + ////ggml_compute_forward_relu(params, tensor->src0, tensor); + ComputeForwardReluFP32(params, tensor.Src0, tensor) + case OP_GELU: + ////ggml_compute_forward_gelu(params, tensor->src0, tensor); + fmt.Printf("\n[HALT] Please implement : ggml_compute_forward_gelu") + os.Exit(1) + case OP_SILU: + ComputeForwardSiluFP32(params, tensor.Src0, tensor) + case OP_NORM: + ////ggml_compute_forward_norm(params, tensor->src0, tensor); + fmt.Printf("\n[HALT] Please implement : ggml_compute_forward_norm") + os.Exit(1) + case OP_RMS_NORM: + ComputeForwardRMSNormFP32(params, tensor.Src0, tensor) + case OP_MUL_MAT: + + if SINGLE_THREAD { + ComputeForwardMulMatFP32(params, tensor.Src0, tensor.Src1, tensor) + } else { + // TODO Optimize this + if params.Type == TASK_INIT || params.Type == TASK_FINALIZE { + return + } + + //ComputeForwardMulMatFP32(params, tensor.src0, tensor.src1, tensor) + //return + + wg := new(sync.WaitGroup) + wg.Add(graph.ThreadsCount) + + for i := 0; i < graph.ThreadsCount; i++ { + graph.Jobs <- &ComputeParams{ + Type: TASK_COMPUTE, + ith: uint32(i), + nth: uint32(graph.ThreadsCount), + tensor: tensor, + wg: wg, + } + } + + 
wg.Wait() + } + + case OP_SCALE: + ComputeForwardScaleFP32(params, tensor.Src0, tensor.Src1, tensor) + case OP_CPY: + ComputeForwardDupFP32(params, tensor.Src0, tensor) + case OP_RESHAPE: + ComputeForwardReshape(params, tensor.Src0, tensor) // NOP + case OP_VIEW: + ComputeForwardView(params, tensor.Src0) // NOP + case OP_PERMUTE: + ComputeForwardPermute(params, tensor.Src0) // NOP + case OP_TRANSPOSE: + ////ggml_compute_forward_transpose(params, tensor->src0); + fmt.Printf("\n[HALT] Please implement : ggml_compute_forward_transpose") + os.Exit(1) + case OP_GET_ROWS: + ComputeForwardGetRows(params, tensor.Src0, tensor.Src1, tensor) + case OP_DIAG_MASK_INF: + ComputeForwardDiagMaskInfFP32(params, tensor.Src0, tensor.Src1, tensor) + case OP_SOFT_MAX: + ComputeForwardSoftMaxFP32(params, tensor.Src0, tensor) + case OP_ROPE: + ComputeForwardRopeFP32(params, tensor.Src0, tensor.Src1, tensor) + case OP_CONV_1D_1S: + ////ggml_compute_forward_conv_1d_1s(params, tensor->src0, tensor->src1, tensor); + fmt.Printf("\n[HALT] Please implement : ggml_compute_forward_conv_1d_1s") + os.Exit(1) + case OP_CONV_1D_2S: + ////ggml_compute_forward_conv_1d_2s(params, tensor->src0, tensor->src1, tensor); + fmt.Printf("\n[HALT] Please implement : ggml_compute_forward_conv_1d_2s") + os.Exit(1) + case OP_FLASH_ATTN: + ////int32_t t = ggml_get_i32_1d(tensor->opt[1], 0); + ////ASSERT(t == 0 || t == 1); + ////bool masked = t != 0; + ////ggml_compute_forward_flash_attn(params, tensor->src0, tensor->src1, tensor->opt[0], masked, tensor); + fmt.Printf("\n[HALT] Please implement : ggml_compute_forward_flash_attn") + os.Exit(1) + case OP_FLASH_FF: + ////ggml_compute_forward_flash_ff(params, tensor->src0, tensor->src1, tensor->opt[0], tensor->opt[1], tensor->opt[2], tensor); + fmt.Printf("\n[HALT] Please implement : ggml_compute_forward_flash_ff") + os.Exit(1) + case OP_NONE: + // nop + case OP_COUNT: + ////ASSERT(false); + fmt.Printf("\n[HALT] ComputeForward got OP_COUNT method!") + os.Exit(1) + } +} + +func VecCopyFP32(n uint32, y, x []float32) { + for i := uint32(0); i < n; i++ { + y[i] = x[i] + } +} + +// ggml_compute_forward_get_rows_f32 +func ComputeForwardGetRows(params *ComputeParams, src0, src1, dst *Tensor) { + + ////assert(params->ith == 0); + + if params.Type == TASK_INIT || params.Type == TASK_FINALIZE { + return + } + + nc := src0.NE[0] + nr := src1.Nelements() + + ////assert( dst->ne[0] == nc); + ////assert( dst->ne[1] == nr); + ////assert(src0->nb[0] == sizeof(float)); + + if dst.NE[0] != nc || dst.NE[1] != nr || src0.NB[0] != TYPE_SIZE[TYPE_F32] /*TYPE_SIZE[TYPE_I32]*/ { + fmt.Printf("[HALT]ComputeForwardGetRows : wrong dimensions!") + os.Exit(1) + } + + // FIXME Speed-up + ////for row := uint32(0); row < nr; row++ { + //// for column := uint32(0); column < nc; column++ { + //// (*dst.Data)[row*nr+column] = (*src0.Data)[row*nr+column] + //// } + ////} + + for i := uint32(0); i < nr; i++ { + r := uint32(src1.Data[i]) + + ////ggml_vec_cpy_f32(nc, + //// (float *) ((char *) dst->data + i*dst->nb[1]), + //// (float *) ((char *) src0->data + r*src0->nb[1])); + + // FIXME ASAP and double check! 
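The OP_MUL_MAT branch above is the only place work is fanned out: GraphCompute starts ThreadsCount Job goroutines reading from the Jobs channel, and each heavy node enqueues one ComputeParams per worker, then blocks on a WaitGroup. The same pattern in isolation, with a hypothetical work-item type standing in for ComputeParams:

```go
package main

import (
	"fmt"
	"sync"
)

// job stands in for ComputeParams: it tells a worker which share of the
// rows it owns and carries the WaitGroup used to signal completion.
type job struct {
	ith, nth int
	wg       *sync.WaitGroup
}

func worker(jobs <-chan job) {
	for j := range jobs {
		// ... compute this worker's share of the rows here ...
		fmt.Printf("worker %d/%d done\n", j.ith, j.nth)
		j.wg.Done()
	}
}

func main() {
	const threads = 4
	jobs := make(chan job, threads)
	defer close(jobs)
	for i := 0; i < threads; i++ {
		go worker(jobs)
	}

	// Per heavy op: enqueue one job per worker and block until every worker
	// calls Done, just like the wg.Wait() in the OP_MUL_MAT branch above.
	var wg sync.WaitGroup
	wg.Add(threads)
	for i := 0; i < threads; i++ {
		jobs <- job{ith: i, nth: threads, wg: &wg}
	}
	wg.Wait()
}
```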
+ // VecCopyFP32(nc, (*dst.Data)[i*dst.NE[0]:], (*src0.Data)[uint32(r)*src0.NE[0]:]) + // VecCopyFP32(nc, dst.Data[i*dst.NB[1]/4:], src0.Data[r*src0.NB[1]/4:]) + VecCopyFP32(nc, dst.Data[i*dst.NE[0]:], src0.Data[r*src0.NE[0]:]) + } +} + +// ggml_compute_forward_rms_norm_f32 +func ComputeForwardRMSNormFP32(params *ComputeParams, src0, dst *Tensor) { + + ////GGML_ASSERT(ggml_are_same_shape(src0, dst)); + ////GGML_ASSERT(src0->nb[0] == sizeof(float)); + + if params.Type == TASK_INIT || params.Type == TASK_FINALIZE { + return + } + + ith := params.ith + nth := params.nth + + ne00 := src0.NE[0] + ne01 := src0.NE[1] + ne02 := src0.NE[2] + ne03 := src0.NE[3] + + nb01 := src0.NB[1] + nb02 := src0.NB[2] + nb03 := src0.NB[3] + + nb1 := dst.NB[1] + nb2 := dst.NB[2] + nb3 := dst.NB[3] + + eps := 1e-5 // TODO: make this a parameter + + // TODO: optimize + for i03 := uint32(0); i03 < ne03; i03++ { + for i02 := uint32(0); i02 < ne02; i02++ { + for i01 := uint32(ith); i01 < ne01; i01 += nth { + + ////const float * x = (float *) ((char *) src0->data + i01*nb01 + i02*nb02 + i03*nb03); + x := src0.Data[i01*nb01/4+i02*nb02/4+i03*nb03/4:] + + mean := 0.0 + // TODO Simplify to directly access [src] + for i00 := uint32(0); i00 < ne00; i00++ { + ////mean += x[i00] * x[i00]; + mean += float64(x[i00] * x[i00]) + } + + mean /= float64(ne00) + + scale := float32(1.0 / math.Sqrt(mean+eps)) + + // TODO Simplify to directly update [dst] + ////float * y = (float *) ((char *) dst->data + i01*nb1 + i02*nb2 + i03*nb3); + y := dst.Data[i01*nb1/4+i02*nb2/4+i03*nb3/4:] + + ////memcpy(y, x, ne00 * sizeof(float)); + //VecScaleFP32(ne00, y, float32(scale)) + + for i := uint32(0); i < ne00; i++ { + y[i] = x[i] * scale + } + } + } + } +} + +// ggml_vec_scale_f32 +func VecScaleFP32(n uint32, y []float32, v float32) { + for i := uint32(0); i < n; i++ { + y[i] *= v + } +} + +// ggml_compute_forward_repeat +func ComputeForwardRepeatFP32(params *ComputeParams, src0, dst *Tensor) { + + ////assert(params->ith == 0); + ////assert(ggml_can_repeat(src0, dst)); + + if params.Type == TASK_INIT || params.Type == TASK_FINALIZE { + return + } + + // TODO: implement support for rank > 2 tensors + ////assert(src0->ne[2] == 1); + ////assert(src0->ne[3] == 1); + ////assert( dst->ne[2] == 1); + ////assert( dst->ne[3] == 1); + + nc := dst.NE[0] + nr := dst.NE[1] + nc0 := src0.NE[0] + nr0 := src0.NE[1] + ncr := nc / nc0 // guaranteed to be an integer due to the check in ggml_can_repeat + nrr := nr / nr0 // guaranteed to be an integer due to the check in ggml_can_repeat + + // TODO: support for transposed / permuted tensors + ////assert( dst->nb[0] == sizeof(float)); + ////assert(src0->nb[0] == sizeof(float)); + + // TODO: maybe this is not optimal? + for i := uint32(0); i < nrr; i++ { + for j := uint32(0); j < ncr; j++ { + for k := uint32(0); k < nr0; k++ { + + ////ggml_vec_cpy_f32(nc0, + ////(float *) ((char *) dst->data + (i*nr0 + k)*( dst->nb[1]) + j*nc0*( dst->nb[0])), + ////(float *) ((char *) src0->data + ( k)*(src0->nb[1]))); + + VecCopyFP32(nc0, + dst.Data[(i*nr0+k)*dst.NB[1]/4+j*nc0*dst.NB[0]/4:], + src0.Data[k*src0.NB[1]/4:]) + } + } + } + + if DEBUG { + printTensor(src0, "REPEAT SRC0") + printTensor(dst, "REPEAT DST") + } +} + +// ggml_compute_forward_relu + +// inline static void ggml_vec_relu_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = (x[i] > 0.f) ? 
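ComputeForwardRMSNormFP32 above normalizes each row by the root of its mean square, y[i] = x[i] / sqrt(mean(x^2) + eps), with eps fixed at 1e-5. A single-row sketch of the same math:

```go
package main

import (
	"fmt"
	"math"
)

// rmsNormRow mirrors the per-row computation of ComputeForwardRMSNormFP32.
func rmsNormRow(x []float32, eps float64) []float32 {
	mean := 0.0
	for _, v := range x {
		mean += float64(v) * float64(v)
	}
	mean /= float64(len(x))
	scale := float32(1.0 / math.Sqrt(mean+eps))

	y := make([]float32, len(x))
	for i, v := range x {
		y[i] = v * scale
	}
	return y
}

func main() {
	fmt.Println(rmsNormRow([]float32{1, 2, 3, 4}, 1e-5))
}
```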
x[i] : 0.f; } + +func VecReluFP32(n uint32, y, x []float32) { + for i := uint32(0); i < n; i++ { + if x[i] > 0 { + y[i] = x[i] + } else { + y[i] = 0 + } + } +} + +func ComputeForwardReluFP32(params *ComputeParams, src0, dst *Tensor) { + // assert(params->ith == 0); + // assert(ggml_are_same_shape(src0, dst)); + if !AreSameShape(src0, dst) { + fmt.Printf("\n[HALT] ComputeForwardReluFP32 : different shapes!") + os.Exit(1) + } + + if params.Type == TASK_INIT || params.Type == TASK_FINALIZE { + return + } + + n := src0.Nrows() + nc := src0.NE[0] + + // assert(dst->nb[0] == sizeof(float)); + // assert(src0->nb[0] == sizeof(float)); + + for i := uint32(0); i < n; i++{ + // ggml_vec_relu_f32(nc, + // (float *) ((char *) dst->data + i*( dst->nb[1])), + // (float *) ((char *) src0->data + i*(src0->nb[1]))); + VecReluFP32(nc, dst.Data[i*dst.NE[0]:], src0.Data[i*src0.NE[0]:]) + } +} + +func VecMulFP32(n uint32, z, x, y []float32) { + for i := uint32(0); i < n; i++ { + z[i] = x[i] * y[i] + } +} + +// ggml_compute_forward_mul +func ComputeForwardMulFP32(params *ComputeParams, src0, src1, dst *Tensor) { + + ////assert(params->ith == 0); + ////assert(ggml_are_same_shape(src0, src1) && ggml_are_same_shape(src0, dst)); + + if !AreSameShape(src0, src1) || !AreSameShape(src0, dst) { + fmt.Printf("\n[HALT] ComputeForwardMulFP32 : different shapes!") + os.Exit(1) + } + + if params.Type == TASK_INIT || params.Type == TASK_FINALIZE { + return + } + + n := src0.Nrows() + nc := src0.NE[0] + + ////assert( dst->nb[0] == sizeof(float)); + ////assert(src0->nb[0] == sizeof(float)); + ////assert(src1->nb[0] == sizeof(float)); + + for i := uint32(0); i < n; i++ { + + ////ggml_vec_mul_f32(nc, + ////(float *) ((char *) dst->data + i*( dst->nb[1])), + ////(float *) ((char *) src0->data + i*(src0->nb[1])), + ////(float *) ((char *) src1->data + i*(src1->nb[1]))); + + // FIXME NE vs NB + VecMulFP32(nc, dst.Data[i*dst.NE[0]:], src0.Data[i*src0.NE[0]:], src1.Data[i*src1.NE[0]:]) + } + + if DEBUG { + printTensor(src0, "MUL SRC0") + printTensor(src1, "MUL SRC1") + printTensor(dst, "MUL DST") + } +} + +// ggml_vec_dot_f32 +func VecDotFP32(n uint32, x, y []float32) float32 { + sumf := float32(0.0) + for i := uint32(0); i < n; i++ { + sumf += x[i] * y[i] + } + return sumf +} + +// ggml_vec_mad_f32 +func VecMadFP32(n uint32, y, x []float32, v float32) { + for i := uint32(0); i < n; i++ { + y[i] += x[i] * v + } +} + +// ggml_vec_acc_f32 +func VecAccFP32(n uint32, y, x []float32) { + for i := uint32(0); i < n; i++ { + y[i] += x[i] + } +} + +// ggml_compute_forward_mul_mat_f32 +func ComputeForwardMulMatFP32(params *ComputeParams, src0, src1, dst *Tensor) { + + if params.Type == TASK_INIT || params.Type == TASK_FINALIZE { + return + } + + ith := params.ith + nth := params.nth + + ne00 := src0.NE[0] + ne01 := src0.NE[1] + ne02 := src0.NE[2] + ne03 := src0.NE[3] + + //ne10 := src1.NE[0] // for BLAS only + ne11 := src1.NE[1] + //ne12 := src1.NE[2] + //ne13 := src1.NE[3] + + //ne0 := dst.NE[0] + //ne1 := dst.NE[1] + //ne2 := dst.NE[2] + //ne3 := dst.NE[3] + //ne := ne0 * ne1 * ne2 * ne3 + + //nb00 := src0.NB[0] + nb01 := src0.NB[1] / 4 + nb02 := src0.NB[2] / 4 + nb03 := src0.NB[3] / 4 + + //nb10 := src1.NB[0] + nb11 := src1.NB[1] / 4 + nb12 := src1.NB[2] / 4 + nb13 := src1.NB[3] / 4 + + nb0 := dst.NB[0] / 4 + nb1 := dst.NB[1] / 4 + nb2 := dst.NB[2] / 4 + nb3 := dst.NB[3] / 4 + + ////assert(ne02 == ne12); + ////assert(ne03 == ne13); + ////assert(ne2 == ne12); + ////assert(ne3 == ne13); + + // TODO: we don't support permuted src0 + 
////assert(nb00 == sizeof(float) || nb01 == sizeof(float)); + + // dst cannot be transposed or permuted + ////assert(nb0 == sizeof(float)); + ////assert(nb0 <= nb1); + ////assert(nb1 <= nb2); + ////assert(nb2 <= nb3); + + ////assert(ne0 == ne01); + ////assert(ne1 == ne11); + ////assert(ne2 == ne02); + ////assert(ne3 == ne03); + + // nb01 >= nb00 - src0 is not transposed + // compute by src0 rows + + /* + ////#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) + + ////if (ggml_compute_forward_mul_mat_use_blas(src0, src1, dst)) { + ////GGML_ASSERT(nb10 == sizeof(float)); + + if params.ith != 0 { + return + } + + if params.Type == TASK_INIT { + return + } + + if params.Type == TASK_FINALIZE { + return + } + + for i03 := uint32(0); i03 < ne03; i03++ { + for i02 := uint32(0); i02 < ne02; i02++ { + + const float * x = (float *) (src0->data); + + ////const float * y = (float *) ((char *) src1->data + i02*nb12 + i03*nb13); + + ////float * d = (float *) ((char *) dst->data + i02*nb2 + i03*nb3); + + // zT = y * xT + ////{ + ////cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasTrans, + ////ne11, ne01, ne10, + ////1.0f, y, ne10, + //// x, ne10, + ////0.0f, d, ne01); + ////} + ////} + ////} + + //printf("CBLAS F32 = %f ms, %d x %d x %d x %d\n", (ggml_perf_time_us() - t0)/1000.0, ne0, ne1, ne2, ne3); + + ////return; + ////} + ////#endif + */ + + // TODO: do not support transposed src1 + ////assert(nb10 == sizeof(float)); + ////if nb10 == 4 { + //// fmt.Printf("\n[HALT] Do not support transposed src1") + //// os.Exit(1) + ////} + + // parallelize by src0 rows using ggml_vec_dot_f32 + + // total rows in src0 + nr := ne01 * ne02 * ne03 + + // rows per thread + dr := (nr + nth - 1) / nth + + // row range for this thread + ir0 := dr * ith + ir1 := min32(ir0+dr, nr) + + for ir := uint32(ir0); ir < ir1; ir++ { + + // src0 indices + i03 := ir / (ne02 * ne01) + i02 := (ir - i03*ne02*ne01) / ne01 + i01 := (ir - i03*ne02*ne01 - i02*ne01) + + // src1 indices + i13 := i03 + i12 := i02 + //i11 := ic + + // dst indices + i0 := i01 + //i1 := i11 + i2 := i02 + i3 := i03 + + for ic := uint32(0); ic < ne11; ic++ { + + //dst.Data[i0*nb0+ic*nb1+i2*nb2+i3*nb3] = + // VecDotFP32(ne00, + // src0.Data[i01*nb01+i02*nb02+i03*nb03:], + // src1.Data[ic*nb11+i12*nb12+i13*nb13:]) + + // --- inline VecDotFP32 + + src0Ptr := src0.Data[i01*nb01+i02*nb02+i03*nb03:] + src1Ptr := src1.Data[ic*nb11+i12*nb12+i13*nb13:] + + sum := float32(0.0) + for i := uint32(0); i < ne00; i++ { + sum += src0Ptr[i] * src1Ptr[i] + } + + dst.Data[i0*nb0+ic*nb1+i2*nb2+i3*nb3] = sum + } + } + + if DEBUG { + fmt.Printf("\n\n>>> ComputeForwardMulMatFP32 OUT <<<\n") + printTensor(dst, "DST") + } + +} + +// ggml_compute_forward_view +func ComputeForwardView(params *ComputeParams, src0 *Tensor) { + // NOP +} + +func ComputeForwardCopy(params *ComputeParams, src0, dst *Tensor) { + ComputeForwardDupFP32(params, src0, dst) +} + +// ggml_compute_forward_dup_f32 +func ComputeForwardDupFP32(params *ComputeParams, src0, dst *Tensor) { + + ////GGML_ASSERT(params->ith == 0); + ////GGML_ASSERT(ggml_is_contiguous(dst)); + ////GGML_ASSERT(ggml_nelements(dst) == ggml_nelements(src0)); + + if !dst.IsContiguous() { + fmt.Printf("[HALT] ComputeForwardDupFP32 : [dst] is NOT contiguous!") + os.Exit(1) + } + + if dst.Nelements() != src0.Nelements() { + fmt.Printf("[HALT] ComputeForwardDupFP32 : [dst] and [src0] capacities are different!") + os.Exit(1) + } + + if params.Type == TASK_INIT || params.Type == TASK_FINALIZE { + return + } + + ne00 := src0.NE[0] + ne01 := src0.NE[1] 
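ComputeForwardMulMatFP32 above parallelizes over src0 rows: each thread receives a contiguous chunk of dr = ceil(nr/nth) rows and computes all dot products for that range. For example, 500 rows across 4 threads split like this:

```go
package main

import "fmt"

// Same row partitioning as ComputeForwardMulMatFP32: dr rows per thread,
// thread ith owns the half-open range [ith*dr, min(ith*dr+dr, nr)).
func main() {
	nr, nth := uint32(500), uint32(4)
	dr := (nr + nth - 1) / nth // 125 rows per thread
	for ith := uint32(0); ith < nth; ith++ {
		ir0 := dr * ith
		ir1 := ir0 + dr
		if ir1 > nr {
			ir1 = nr
		}
		fmt.Printf("thread %d: rows [%d, %d)\n", ith, ir0, ir1)
	}
}
```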
+ ne02 := src0.NE[2] + ne03 := src0.NE[3] + + nb00 := src0.NB[0] / 4 + nb01 := src0.NB[1] / 4 + nb02 := src0.NB[2] / 4 + nb03 := src0.NB[3] / 4 + + ////if (ggml_is_contiguous(src0) && src0->type == dst->type) { + if src0.IsContiguous() && src0.Type == dst.Type { + ////memcpy(dst->data, src0->data, ggml_nelements(dst) * GGML_TYPE_SIZE[src0->type]); + copy(dst.Data, src0.Data) + return + } + + // --- src0 is NOT contigious + // --- supporting only 4-bytes data for [src0] and FP32 for [dst] + + if src0.NB[0] == TYPE_SIZE[TYPE_F32] { + if dst.Type == TYPE_F32 { + + id := uint32(0) + rs := ne00 * nb00 + + for i03 := uint32(0); i03 < ne03; i03++ { + for i02 := uint32(0); i02 < ne02; i02++ { + for i01 := uint32(0); i01 < ne01; i01++ { + + ////const char * src0_ptr = (char *) src0->data + i01*nb01 + i02*nb02 + i03*nb03; + src0Ptr := src0.Data[i01*nb01+i02*nb02+i03*nb03 : i01*nb01+i02*nb02+i03*nb03+rs] + ////char * dst_ptr = (char *) dst->data + id*rs; + dstPtr := dst.Data[id*rs : id*rs+rs] + ////memcpy(dst_ptr, src0_ptr, rs); + copy(dstPtr, src0Ptr) + + id++ + } + } + } + ////} else if (dst->type == GGML_TYPE_F16) { + //// int id = 0; + //// ggml_fp16_t * dst_ptr = (ggml_fp16_t *) dst->data; + + //// for (int i03 = 0; i03 < ne03; i03++) { + //// for (int i02 = 0; i02 < ne02; i02++) { + //// for (int i01 = 0; i01 < ne01; i01++) { + //// for (int i00 = 0; i00 < ne00; i00++) { + //// const float * src0_ptr = (float *) ((char *) src0->data + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03); + + //// dst_ptr[id] = GGML_FP32_TO_FP16(*src0_ptr); + //// id++; + //// } + //// } + //// } + //// } + } else { + ////GGML_ASSERT(false); // TODO: implement + fmt.Printf("[HALT] ComputeForwardDupFP32 : not supported tensor type!") + os.Exit(1) + } + } else { + + if dst.Type == TYPE_F32 { + + id := 0 + ////dstPtr = (float *) dst->data; + + for i03 := uint32(0); i03 < ne03; i03++ { + for i02 := uint32(0); i02 < ne02; i02++ { + for i01 := uint32(0); i01 < ne01; i01++ { + for i00 := uint32(0); i00 < ne00; i00++ { + + //src0Ptr := src0.Data[i00*nb00/4 + i01*nb01/4 + i02*nb02/4 + i03*nb03/4:] + //dstPtr[id] = *src0_ptr; + + dst.Data[id] = src0.Data[i00*nb00+i01*nb01+i02*nb02+i03*nb03] + + id++ + } + } + } + } + ////} else if (dst->type == GGML_TYPE_F16) { + //// int id = 0; + //// ggml_fp16_t * dst_ptr = (ggml_fp16_t *) dst->data; + + //// for (int i03 = 0; i03 < ne03; i03++) { + //// for (int i02 = 0; i02 < ne02; i02++) { + //// for (int i01 = 0; i01 < ne01; i01++) { + //// for (int i00 = 0; i00 < ne00; i00++) { + //// const float * src0_ptr = (float *) ((char *) src0->data + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03); + + //// dst_ptr[id] = GGML_FP32_TO_FP16(*src0_ptr); + //// id++; + //// } + //// } + //// } + //// } + } else { + ////GGML_ASSERT(false) // TODO: implement + fmt.Printf("[HALT] ComputeForwardDupFP32 : not supported tensor type!") + os.Exit(1) + } + } + + if DEBUG { + fmt.Printf("\n\n>>> ComputeForwardDupFP32 OUT <<<\n") + } +} + +// ggml_compute_forward_reshape +func ComputeForwardReshape(params *ComputeParams, src0, dst *Tensor) { + // NOP +} + +// ggml_compute_forward_permute +func ComputeForwardPermute(params *ComputeParams, src0 *Tensor) { + // NOP +} + +// ggml_compute_forward_rope +func ComputeForwardRopeFP32(params *ComputeParams, src0, src1, dst *Tensor) { + + ////assert(params->ith == 0); + ////assert(src1->type == GGML_TYPE_I32); + ////assert(ggml_nelements(src1) == 3); + + if src1.Nelements() != 3 { + fmt.Printf("\n[HALT] ComputeForwardRopeFP32 : src1 has NOT EXACT 3 elements!") + os.Exit(1) + } + 
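The loop that follows rotates every consecutive (x0, x1) pair by the angle p*theta, with theta = 10000^(-i0/dims) and p the token position; this is the standard rotary position embedding. A single-pair sketch of the rotation:

```go
package main

import (
	"fmt"
	"math"
)

// ropePair rotates one (x0, x1) pair the same way the inner loop of
// ComputeForwardRopeFP32 does, for position p and pair offset i0.
func ropePair(x0, x1 float32, p, i0, dims int) (float32, float32) {
	theta := math.Pow(10000.0, float64(-i0)/float64(dims))
	cosT := math.Cos(float64(p) * theta)
	sinT := math.Sin(float64(p) * theta)
	return float32(float64(x0)*cosT - float64(x1)*sinT),
		float32(float64(x0)*sinT + float64(x1)*cosT)
}

func main() {
	// Position 3, first pair (i0 = 0) of a 64-dim head: theta = 1, angle = 3 rad.
	y0, y1 := ropePair(1, 0, 3, 0, 64)
	fmt.Println(y0, y1) // approx -0.99, 0.141
}
```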
+ if params.Type == TASK_INIT || params.Type == TASK_FINALIZE { + return + } + + pastCount := uint32(src1.Data[0]) + dims := uint32(src1.Data[1]) + mode := uint32(src1.Data[2]) + + //const int ne0 = src0->ne[0]; + ne1 := src0.NE[1] + ne2 := src0.NE[2] + ne3 := src0.NE[3] + + nb0 := src0.NB[0] + nb1 := src0.NB[1] + nb2 := src0.NB[2] + nb3 := src0.NB[3] + + ////assert(nb0 == sizeof(float)); + + var modeCount uint32 + if mode == 0 { + modeCount = 0 + } else { + modeCount = pastCount + } + + // TODO: optimize + for i3 := uint32(0); i3 < ne3; i3++ { + for i2 := modeCount; i2 < ne2; i2++ { + + ////const int p = (mode == 0 ? n_past + i2 : i2); + var p uint32 + if mode == 0 { + p = pastCount + i2 + } else { + p = i2 + } + + for i1 := uint32(0); i1 < ne1; i1++ { + for i0 := 0; i0 < int(dims); i0 += 2 { + + ////const double theta = pow(10000.0, ((double)-i0)/n_dims); + theta := math.Pow(10000.0, float64(-i0)/float64(dims)) + + cosTheta := math.Cos(float64(p) * theta) + sinTheta := math.Sin(float64(p) * theta) + + ////const float * const src = (float *)((char *) src0->data + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0); + offset := i3*nb3/4 + i2*nb2/4 + i1*nb1/4 + uint32(i0)*nb0/4 + src := src0.Data[offset:] + //// float * dst_data = (float *)((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0); + dstData := dst.Data[offset:] + + x0 := float64(src[0]) + x1 := float64(src[1]) + + dstData[0] = float32(x0*cosTheta - x1*sinTheta) + dstData[1] = float32(x0*sinTheta + x1*cosTheta) + } + } + } + } + +} + +// ggml_compute_forward_scale_f32 +func ComputeForwardScaleFP32(params *ComputeParams, src0, src1, dst *Tensor) { + + ////GGML_ASSERT(ggml_is_contiguous(src0)); + ////GGML_ASSERT(ggml_is_contiguous(dst)); + ////GGML_ASSERT(ggml_are_same_shape(src0, dst)); + ////GGML_ASSERT(ggml_is_scalar(src1)); + + if !src0.IsContiguous() { + fmt.Printf("[HALT] ComputeForwardScaleFP32 : [src0] is NOT contiguous!") + os.Exit(1) + } + + if !dst.IsContiguous() { + fmt.Printf("[HALT] ComputeForwardScaleFP32 : [dst] is NOT contiguous!") + os.Exit(1) + } + + if params.Type == TASK_INIT || params.Type == TASK_FINALIZE { + return + } + + // scale factor + v := src1.Data[0] + + ith := params.ith + nth := params.nth + + nc := src0.NE[0] + nr := src0.Nrows() + + // rows per thread + dr := (nr + nth - 1) / nth + + // row range for this thread + ir0 := dr * ith + ir1 := min(int(ir0)+int(dr), int(nr)) + + for i1 := ir0; int(i1) < ir1; i1++ { + ////ggml_vec_scale_f32(nc, (float *) ((char *) dst->data + i1*(dst->nb[1])), v); + ////VecScaleFP32(nc, (*dst.Data)[i1*dst.NE[0]:], v) + VecScaleFP32(nc, dst.Data[i1*dst.NB[1]/4:], v) + } + +} + +// ggml_compute_forward_diag_mask_inf +func ComputeForwardDiagMaskInfFP32(params *ComputeParams, src0, src1, dst *Tensor) { + + ////assert(params->ith == 0); + ////assert(src1->type == GGML_TYPE_I32); + ////assert(ggml_nelements(src1) == 1); + + if params.Type == TASK_INIT || params.Type == TASK_FINALIZE { + return + } + + pastCount := uint32(src1.Data[0]) + + // TODO: handle transposed/permuted matrices + + n := src0.Nrows() + nc := src0.NE[0] + nr := src0.NE[1] + nz := n / nr + + ////assert( dst->nb[0] == sizeof(float)); + ////assert(src0->nb[0] == sizeof(float)); + + for k := uint32(0); k < nz; k++ { + for j := uint32(0); j < nr; j++ { + for i := pastCount; i < nc; i++ { + if i > pastCount+j { + ////*(float *)((char *) dst->data + k*dst->nb[2] + j*dst->nb[1] + i*dst->nb[0]) = -INFINITY; + dst.Data[k*dst.NB[2]/4+j*dst.NB[1]/4+i*dst.NB[0]/4] = float32(math.Inf(-1)) // TODO Use const + } + } + } + } + + if 
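ComputeForwardDiagMaskInfFP32 above writes -Inf wherever the column index i exceeds past + j (the row index), which is the causal attention mask. With past = 0 on a 4x4 score matrix, only the lower triangle survives:

```go
package main

import (
	"fmt"
	"math"
)

func main() {
	const n, past = 4, 0
	scores := make([]float32, n*n) // row-major, all zeros for illustration

	// Same masking condition as ComputeForwardDiagMaskInfFP32: i > past+j.
	for j := 0; j < n; j++ {
		for i := past; i < n; i++ {
			if i > past+j {
				scores[j*n+i] = float32(math.Inf(-1))
			}
		}
	}
	for j := 0; j < n; j++ {
		fmt.Println(scores[j*n : (j+1)*n])
	}
	// [0 -Inf -Inf -Inf]
	// [0 0 -Inf -Inf]
	// [0 0 0 -Inf]
	// [0 0 0 0]
}
```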
DEBUG { + fmt.Printf("\n\n>>> ComputeForwardDiagMaskInfFP32 OUT <<<\n") + } + +} + +func maxFloat(x, y float32) float32 { + if x >= y { + return x + } + return y +} + +func VecMaxFP32(n uint32, x []float32) float32 { + max := float32(math.Inf(-1)) // TODO use constant + for i := uint32(0); i < n; i++ { + max = maxFloat(max, x[i]) + } + return max +} + +// ggml_compute_forward_soft_max +func ComputeForwardSoftMaxFP32(params *ComputeParams, src0, dst *Tensor) { + + ////GGML_ASSERT(ggml_is_contiguous(src0)); + ////GGML_ASSERT(ggml_is_contiguous(dst)); + ////GGML_ASSERT(ggml_are_same_shape(src0, dst)); + + if !src0.IsContiguous() { + fmt.Printf("[HALT] ComputeForwardSoftMaxFP32 : [src0] is NOT contiguous!") + os.Exit(1) + } + + if !dst.IsContiguous() { + fmt.Printf("[HALT] ComputeForwardSoftMaxFP32 : [dst] is NOT contiguous!") + os.Exit(1) + } + + if params.Type == TASK_INIT || params.Type == TASK_FINALIZE { + return + } + + negInf := float32(math.Inf(-1)) // TODO use constant + + // TODO: handle transposed/permuted matrices + + ith := params.ith + nth := params.nth + + nc := src0.NE[0] + nr := src0.Nrows() + + // rows per thread + dr := (nr + nth - 1) / nth + + // row range for this thread + ir0 := dr * ith + ir1 := min(int(ir0+dr), int(nr)) + + for i1 := ir0; int(i1) < ir1; i1++ { + ////float *p = (float *)((char *) dst->data + i1*dst->nb[1]); + p := dst.Data[i1*dst.NB[1]/4:] + max := VecMaxFP32(nc, p) + sum := float32(0.0) + //var bits uint16 + for i := 0; i < int(nc); i++ { + if p[i] == negInf { // TODO use constant + p[i] = 0.0 + } else { + //const float val = (p[i] == -INFINITY) ? 0.0 : exp(p[i] - max); + + ////ggml_fp16_t s = GGML_FP32_TO_FP16(p[i] - max); + //s := FP32_TO_FP16(p[i] - max) + ////memcpy(&scvt, &s, sizeof(scvt)); + ////const float val = GGML_FP16_TO_FP32(table_exp_f16[scvt]); + + //////////////////////////fp16 := float16.Fromfloat32(p[i] - max) + //////////////////////////bits := fp16.Bits() + //////////////////////////exp := TableExpFP16[bits] // FIXME table_exp_f16 ASAP Initialize first! + //////////////////////////val := exp.Float32() + + val := float32(math.Exp(float64(p[i] - max))) + sum += val + p[i] = val + } + } + + ////assert(sum > 0.0f); + sum = 1.0 / sum + VecScaleFP32(nc, p, sum) + } + + if DEBUG { + fmt.Printf("\n\n>>> ComputeForwardSoftMaxFP32 OUT <<<\n") + } +} + +// inline static void ggml_vec_add_f32 (const int n, float * z, const float * x, const float * y) { for (int i = 0; i < n; ++i) z[i] = x[i] + y[i]; } +func VecAddFP32(n uint32, z, x, y []float32) { + for i := uint32(0); i < n; i++ { + z[i] = x[i] + y[i] + } +} + +// ggml_compute_forward_add +func ComputeForwardAddFP32(params *ComputeParams, src0, src1, dst *Tensor) { + + ////GGML_ASSERT(ggml_are_same_shape(src0, src1) && ggml_are_same_shape(src0, dst)); + + if params.Type == TASK_INIT || params.Type == TASK_FINALIZE { + return + } + + if src1.NB[0] != TYPE_SIZE[TYPE_F32] { + fmt.Printf("[HALT] ComputeForwardAddFP32 : [src1] is NOT contiguous!") + os.Exit(1) + } + + ith := params.ith + nth := params.nth + + n := src0.Nrows() + nc := src0.NE[0] + + //nb00 := src0.NB[0] + nb01 := src0.NB[1] + + nb10 := src1.NB[0] + nb11 := src1.NB[1] + + //nb0 := dst.NB[0] + nb1 := dst.NB[1] + + ////GGML_ASSERT( nb0 == sizeof(float)); + ////GGML_ASSERT(nb00 == sizeof(float)); + + if nb10 == TYPE_SIZE[TYPE_F32] { + j0 := (n / nth) * ith + + // j1 := ith == nth - 1 ? 
n : (n/nth)*(ith + 1) + var j1 uint32 + if ith == nth-1 { + j1 = n + } else { + j1 = (n / nth) * (ith + 1) + } + + for j := j0; j < j1; j++ { + + ////ggml_vec_add_f32(nc, + //// (float *) ((char *) dst->data + j*nb1), + //// (float *) ((char *) src0->data + j*nb01), + //// (float *) ((char *) src1->data + j*nb11)); + + VecAddFP32(nc, dst.Data[j*nb1/4:], src0.Data[j*nb01/4:], src1.Data[j*nb11/4:]) + } + + } else { // src1 is not contiguous + for j := ith; j < n; j += nth { + ////float * dst_ptr = (float *) ((char *) dst->data + j*nb1); + dstPtr := dst.Data[j*nb1/4:] + ////float * src0_ptr = (float *) ((char *) src0->data + j*nb01); + src0Ptr := src0.Data[j*nb01/4:] + for i := uint32(0); i < nc; i++ { + ////float * src1_ptr = (float *) ((char *) src1->data + j*nb11 + i*nb10); + src1Ptr := src1.Data[j*nb11/4+i*nb10/4] + dstPtr[i] = src0Ptr[i] + src1Ptr + } + } + } + + if DEBUG { + fmt.Printf("\n\n>>> OUT <<< ComputeForwardAddFP32 <<<") + } +} + +// Sigmoid Linear Unit (SiLU) function +func SiluFP32(x float32) float32 { + return x / float32(1.0+math.Exp(float64(-x))) +} + +// inline static void ggml_vec_silu_f32(const int n, float * y, const float * x) { +func VecSiluFP32(n uint32, y, x []float32) { + for i := uint32(0); i < n; i++ { + y[i] = SiluFP32(x[i]) // ggml_silu_f32 + } +} + +// ggml_compute_forward_silu +func ComputeForwardSiluFP32(params *ComputeParams, src0, dst *Tensor) { + + ////GGML_ASSERT(ggml_is_contiguous(src0)); + ////GGML_ASSERT(ggml_is_contiguous(dst)); + ////GGML_ASSERT(ggml_are_same_shape(src0, dst)); + + if !src0.IsContiguous() { + fmt.Printf("[HALT] ComputeForwardSiluFP32 : [src0] is NOT contiguous!") + os.Exit(1) + } + + if !dst.IsContiguous() { + fmt.Printf("[HALT] ComputeForwardSiluFP32 : [dst] is NOT contiguous!") + os.Exit(1) + } + + if params.Type == TASK_INIT || params.Type == TASK_FINALIZE { + return + } + + ith := params.ith + nth := params.nth + + nc := src0.NE[0] + nr := src0.Nrows() + + // rows per thread + dr := (nr + nth - 1) / nth + + // row range for this thread + ir0 := dr * ith + ir1 := uint32(min(int(ir0+dr), int(nr))) + + for i1 := ir0; i1 < ir1; i1++ { + ////ggml_vec_silu_f32(nc, + //// (float *) ((char *) dst->data + i1*( dst->nb[1])), + //// (float *) ((char *) src0->data + i1*(src0->nb[1]))); + + VecSiluFP32(nc, dst.Data[i1*dst.NB[1]/4:], src0.Data[i1*src0.NB[1]/4:]) + } + + if DEBUG { + printTensor(src0, "SRC SILI") + printTensor(dst, "DST SILI") + } +} + +// --- + +type TokenScore struct { + Token string + Score float32 +} + +type Vocab struct { + Token2ID map[string]uint32 + ID2Token []TokenScore +} + +func NewVocab(size uint32) *Vocab { + return &Vocab{ + Token2ID: make(map[string]uint32, size), + ID2Token: make([]TokenScore, size, size), + } +} + +func min(a, b int) int { + if a <= b { + return a + } + return b +} + +func min32(a, b uint32) uint32 { + if a <= b { + return a + } + return b +} + +// ---- SentencePiece Tokenizer + +// struct llama_sp_symbol { +type Symbol struct { + ////using index = int; + + // NB! Allow -1 + Prev int + Next int + + Text string + N uint32 +} + +// struct llama_sp_bigram { +type Bigram struct { + + // NB! 
Allow -1 + Left int + Right int + + Score float32 + Size uint32 +} + +func utf8Len(src byte) uint32 { + lookup := []uint32{1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 3, 4} + highbits := uint8(src) >> 4 + return lookup[highbits] +} + +func Token2Str(vocab *Vocab, token uint32) string { + if int(token) >= len(vocab.ID2Token) { + return "" + } + + return vocab.ID2Token[token].Token +} + +func PopMax(queue *[]Bigram) Bigram { + + max := 0 // index of max score element in queue + for cur := 1; cur < len(*queue); cur++ { + if ((*queue)[max].Score < (*queue)[cur].Score) || + ((*queue)[max].Score == (*queue)[cur].Score && + (*queue)[max].Left > (*queue)[cur].Left) { + max = cur + } + } + + pop := (*queue)[max] + + // replace max element with last and shrink slice (if max == last, then just remove it) + (*queue)[max] = (*queue)[len(*queue)-1] + *queue = (*queue)[:len(*queue)-1] + + return pop +} + +func TryAddBigram(vocab *Vocab, symbols []Symbol, workQueue *[]Bigram, left, right int) { + + if left == -1 || right == -1 { + return + } + + token := symbols[left].Text[:symbols[left].N+symbols[right].N] + id, ok := vocab.Token2ID[token] + + if !ok || int(id) >= len(vocab.ID2Token) { + return + } + + tokenScore := vocab.ID2Token[id] + + bigram := Bigram{Left: left, Right: right, Score: tokenScore.Score, Size: uint32(len(token))} + *workQueue = append(*workQueue, bigram) +} + +// void tokenize(const std::string & text, std::vector & output) { +func Tokenize(vocab *Vocab, text string, bos bool) []uint32 { + + output := make([]uint32, 0) + symbols := make([]Symbol, 0) // std::vector symbols_; + workQueue := make([]Bigram, 0) // llama_sp_bigram::queue work_queue_; // std::priority_queue; + + if bos { + output = append(output, 1) // TODO: replace with vocab.bos + } + + // --- split string into utf8 chars + + index := 0 + offs := 0 + for offs < len(text) { + var sym Symbol + charLen := min(len(text)-offs, int(utf8Len(text[offs]))) + sym.Text = text[offs:] + sym.N = uint32(charLen) + offs += charLen + sym.Prev = index - 1 + if offs == len(text) { + sym.Next = -1 + } else { + sym.Next = index + 1 + } + index++ + symbols = append(symbols, sym) + } + + // seed the work queue with all possible 2-character tokens + for i := 1; i < len(symbols); i++ { + TryAddBigram(vocab, symbols, &workQueue, i-1, i) + } + + // keep substituting the highest frequency pairs for as long as we can + for len(workQueue) > 0 { + bigram := PopMax(&workQueue) + + leftSym := &symbols[bigram.Left] + rightSym := &symbols[bigram.Right] + + // if one of the symbols already got merged, skip it + if leftSym.N == 0 || rightSym.N == 0 || leftSym.N+rightSym.N != bigram.Size { + continue + } + + // merge the right sym into the left one + leftSym.N += rightSym.N + rightSym.N = 0 + + // remove the right sym from the chain + leftSym.Next = rightSym.Next + if rightSym.Next >= 0 { + symbols[rightSym.Next].Prev = bigram.Left + } + + // find more substitutions + TryAddBigram(vocab, symbols, &workQueue, leftSym.Prev, bigram.Left) + TryAddBigram(vocab, symbols, &workQueue, bigram.Left, leftSym.Next) + } + + for i := 0; i != -1; i = symbols[i].Next { + symbol := symbols[i] + id, ok := vocab.Token2ID[symbol.Text[:symbol.N]] + + if !ok { + // output any symbols that did not form tokens as bytes. 
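+			// Byte fallback: an unmatched byte b is emitted as token id b+3,
+			// since ids 0..2 are reserved for special tokens and the raw byte
+			// tokens follow them (mirrors llama.cpp's tokenizer).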
+				for j := uint32(0); j < symbol.N; j++ {
+					////llama_vocab::id token_id = static_cast(symbol.text[j]) + 3;
+					tokenID := uint32(symbol.Text[j]) + 3
+					output = append(output, tokenID)
+				}
+			} else {
+				output = append(output, id)
+			}
+	}
+
+	if DEBUG {
+		fmt.Printf("\n\n=== TOKENIZER ===\n\n%+v", output)
+		for i := 0; i < len(output); i++ {
+			fmt.Printf("%d:'%s' ", output[i], Token2Str(vocab, output[i]))
+		}
+	}
+
+	return output
+
+}
+
+// TODO Do we need this?
+func Init(params InitParams) {
+
+	// ---- initialize GELU, SILU and EXP F32 tables
+
+	////const uint64_t t_start = ggml_time_us(); UNUSED(t_start);
+
+	/////////////////////////////////////////var ii uint16
+	/////////////////////////////////////////for i := uint32(0); i < (1 << 16); i++ {
+	/////////////////////////////////////////ui := uint16(i)
+
+	////memcpy(&ii, &ui, sizeof(ii));
+	////const float f = table_f32_f16[i] = COMPUTE_FP16_TO_FP32(ii);
+	/////////////////////////////////////////fp32 := float32()
+
+	////table_gelu_f16[i] = FP32_TO_FP16(ggml_gelu_f32(f));
+	////table_silu_f16[i] = FP32_TO_FP16(ggml_silu_f32(f));
+
+	////TableExpFP16[i] = FP32_TO_FP16(exp(f));
+	/////////////////////////////////////////exp := float32(math.Exp(fp32))
+	/////////////////////////////////////////TableExpFP16[i] = float16.Fromfloat32(exp)
+
+	/////////////////////////////////////////}
+
+	////const uint64_t t_end = ggml_time_us(); UNUSED(t_end);
+
+}
+
+
+func PrintTensor(tensor *Tensor, name string) {
+	var dt string
+	if tensor.Type == TYPE_F16 {
+		dt = "FP16"
+	}
+	if tensor.Type == TYPE_F32 {
+		dt = "FP32"
+	}
+	if tensor.Type == TYPE_Q4_0 {
+		dt = "INT4"
+	}
+
+	fmt.Printf("\n\n=== [ %s | %s | %d:%d:%d ] ===\n",
+		name, dt, tensor.NE[0], tensor.NE[1], tensor.NE[2])
+
+	for nn := 0; nn < min(12, int(tensor.NE[1])); nn++ {
+		fmt.Printf("\n %d x %d ...\t", nn, tensor.NE[0])
+		for ii := 0; ii < min(12, int(tensor.NE[0])); ii++ {
+			fmt.Printf("%.3f\t", tensor.Data[nn*int(tensor.NE[0])+ii])
+		}
+	}
+	fmt.Println("")
+}
diff --git a/ml/utils.go b/ml/utils.go
new file mode 100644
index 0000000..70c7d92
--- /dev/null
+++ b/ml/utils.go
@@ -0,0 +1,411 @@
+package ml
+
+import (
+	"errors"
+	"fmt"
+	"math"
+	"mlgo/common"
+	"os"
+)
+
+// TensorOnGraph is a serializable snapshot of a graph tensor (for storage and transfer)
+type TensorOnGraph struct {
+	Type DType
+
+	NodeID uint32 // math.MaxUint32 when the tensor is not a node of the graph (see tensor2NodeID)
+
+	Dims uint32
+	NE   [MAX_DIMS]uint32 // number of elements
+	NB   [MAX_DIMS]uint32 // stride in bytes
+
+	Op optype
+
+	// isParam bool // no need here?
+
+	// GradTensorID uint32 // no need for forward compute?
+	Src0NodeID uint32
+	Src1NodeID uint32
+
+	// grad *Tensor
+	// src0 *Tensor
+	// src1 *Tensor
+	// opt [MAX_OPT]*Tensor // FIXME Do we need this?
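+	// Everything below (together with the IDs above) is what Encoding()
+	// serializes, so a single compute node can be shipped to the MIPS VM and
+	// re-linked into a regular Tensor via ToTensor().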
+ + TasksCount int + + // performance + //perfRuns uint32 + //perfCycles uint32 + //perfTime uint64 + + Data []float32 + //padding [8]byte +} + +func (tensor * Tensor) ToTensorOnGraph(graph *Graph) *TensorOnGraph { + if tensor == nil || graph == nil || graph.Tensor2NodeID == nil { + return nil + } + t := &TensorOnGraph{ + Type: tensor.Type, + Dims: tensor.Dims, + NE: tensor.NE, + NB: tensor.NB, + Op: tensor.op, + TasksCount: tensor.TasksCount, + Data: tensor.Data, + } + t.NodeID = tensor2NodeID(tensor, graph) + t.Src0NodeID = tensor2NodeID(tensor.Src0, graph) + t.Src1NodeID = tensor2NodeID(tensor.Src1, graph) + return t +} + +func (tensor *TensorOnGraph) ToTensor(tensorMap map[uint32]*Tensor) *Tensor { + t := &Tensor{ + Type: tensor.Type, + Dims: tensor.Dims, + NE: tensor.NE, + NB: tensor.NB, + op: tensor.Op, + TasksCount: tensor.TasksCount, + Data: tensor.Data, + } + if tensorMap != nil { + t.Src0 = tensorMap[tensor.Src0NodeID] + t.Src1 = tensorMap[tensor.Src1NodeID] + } + return t +} + +func tensor2NodeID(tensor *Tensor, graph *Graph) uint32 { + if id, ok := graph.Tensor2NodeID[tensor]; ok { + return id + } else { + return math.MaxUint32 + } +} + +func (tensor *TensorOnGraph) Encoding(toBigEndian bool) []byte { + data := make([]byte, 0) + data = append(data, common.IntToBytes(int(tensor.Type), toBigEndian)...) // Type + data = append(data, common.IntToBytes(int(tensor.NodeID), toBigEndian)...) // NodeID + data = append(data, common.IntToBytes(int(tensor.Dims), toBigEndian)...) // Dims + data = append(data, common.IntToBytes(int(tensor.Op), toBigEndian)...) // Op + data = append(data, common.IntToBytes(int(tensor.Src0NodeID), toBigEndian)...) // Src0NodeID + data = append(data, common.IntToBytes(int(tensor.Src1NodeID), toBigEndian)...) // Src1NodeID + data = append(data, common.IntToBytes(int(tensor.TasksCount), toBigEndian)...) // TasksCount + + // encoding list + // NE + data = append(data, common.IntToBytes(MAX_DIMS, toBigEndian)...) + for i := 0; i < MAX_DIMS; i++ { + data = append(data, common.IntToBytes(int(tensor.NE[i]), toBigEndian)...) + } + // NB + data = append(data, common.IntToBytes(MAX_DIMS, toBigEndian)...) + for i := 0; i < MAX_DIMS; i++ { + data = append(data, common.IntToBytes(int(tensor.NB[i]), toBigEndian)...) + } + // Data + data = append(data, common.IntToBytes(len(tensor.Data), toBigEndian)...) + for i := 0; i < len(tensor.Data); i++ { + data = append(data, common.Float32ToBytes(tensor.Data[i], toBigEndian)...) + } + // append the data size ahead + // data = append(common.IntToBytes(len(data), toBigEndian), data...) 
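+	// resulting layout (all scalars 32-bit):
+	//   [Type][NodeID][Dims][Op][Src0NodeID][Src1NodeID][TasksCount]
+	//   [len(NE)][NE...][len(NB)][NB...][len(Data)][Data...]
+	// the per-tensor byte length is prepended by SaveComputeNodeEnvToBytes.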
+ return data +} + +func DecodeTensorOnGraph(data []byte, fromBigEndian bool, currentBigEndian bool) *TensorOnGraph { + if (len(data) == 0) { + return nil + } + t := 0 + tensorType := common.BytesToInt32(data[t:t+4], fromBigEndian) + t += 4 + + nodeId := common.BytesToInt32(data[t:t+4], fromBigEndian) + t += 4 + + dims := common.BytesToInt32(data[t:t+4], fromBigEndian) + t += 4 + + op := common.BytesToInt32(data[t:t+4], fromBigEndian) + t += 4 + + src0NodeID := common.BytesToInt32(data[t:t+4], fromBigEndian) + t += 4 + + src1NodeID := common.BytesToInt32(data[t:t+4], fromBigEndian) + t += 4 + + tasksCount := common.BytesToInt32(data[t:t+4], fromBigEndian) + t += 4 + + //NE + neSize := common.BytesToInt32(data[t:t+4], fromBigEndian) + t += 4 + ne := [4]uint32{0, 0, 0, 0} + for i := 0; i < int(neSize); i++ { + ne[i] = uint32(common.BytesToInt32(data[t:t+4], fromBigEndian)) + t += 4 + } + + // NB + nbSize := common.BytesToInt32(data[t:t+4], fromBigEndian) + t += 4 + nb := [4]uint32{0, 0, 0, 0} + for i := 0; i < int(nbSize); i++ { + nb[i] = uint32(common.BytesToInt32(data[t:t+4], fromBigEndian)) + t += 4 + } + + // Data + dataSize := common.BytesToInt32(data[t:t+4], fromBigEndian) + t += 4 + tensorData := make([]float32, 0) + if currentBigEndian && fromBigEndian { + // this code should be only used in MIPS! + tensorData = common.DecodeFloat32List(data[t:t+4*int(dataSize)]) + t += 4*int(dataSize) + } else { + tensorData = make([]float32, dataSize) + for i := 0; i < int(dataSize); i++ { + tensorData[i] = common.BytesToFloat32(data[t:t+4], fromBigEndian) + t += 4 + } + } + + + tensor := &TensorOnGraph{ + Type: DType(tensorType), + NodeID: uint32(nodeId), + Dims: uint32(dims), + Op: optype(op), + Src0NodeID: uint32(src0NodeID), + Src1NodeID: uint32(src1NodeID), + TasksCount: int(tasksCount), + NE: ne, + NB: nb, + Data: tensorData, + } + + return tensor +} + +func ComputeNodeForward(node *Tensor) { + if node == nil { + return + } + node.TasksCount = 1 + params := ComputeParams{ + Type: TASK_COMPUTE, + ith: 0, + nth: uint32(node.TasksCount), + } + ComputeForward(nil, ¶ms, node) +} + +// ======================================================================= + +// compute [0, nodeID) +func GraphComputeByNodes(ctx *Context, graph *Graph, nodeID int) { + + maxThreads := graph.ThreadsCount + + // --- init N job goroutines and channel to send tasks for them + + graph.Jobs = make(chan *ComputeParams, maxThreads) // TODO Right place to init? 
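+	// worker goroutines started below pull ComputeParams from this channel;
+	// it is closed via the deferred close when this function returns.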
+ defer close(graph.Jobs) + + // TODO Investigate https://pkg.go.dev/runtime#LockOSThread + for i := 0; i < maxThreads; i++ { + go Job(graph.Jobs) + } + + // --- initialize tasks + + { + // thread scheduling for the different operations + // TasksCount might be 0, 1, or ThreadsCount + for i := uint32(0); i < graph.NodesCount; i++ { + + ////struct ggml_tensor * node = cgraph->nodes[i]; + node := graph.Nodes[i] + + if DEBUG { + fmt.Printf("\n\n### STEP #%d ### %d - %d [ %d:%d:%d:%d ]", i, node.op, node.Type, node.NE[0], node.NE[1], node.NE[2], node.NE[3]) + } + + switch node.op { + + case OP_DUP: + node.TasksCount = 1 + case OP_ADD: + node.TasksCount = 1 // TODO threads + case OP_SUB: + case OP_MUL: + case OP_DIV: + case OP_SQR: + case OP_SQRT: + case OP_SUM: + case OP_MEAN: + case OP_REPEAT: + case OP_ABS: + case OP_SGN: + case OP_NEG: + case OP_STEP: + case OP_RELU: + node.TasksCount = 1 + case OP_GELU: + node.TasksCount = 1 // TODO threads + case OP_SILU: + node.TasksCount = 1 // TODO threads + case OP_NORM: + case OP_RMS_NORM: + node.TasksCount = 1 // TODO threads + case OP_MUL_MAT: + node.TasksCount = maxThreads + // TODO: use different scheduling for different matrix sizes + case OP_SCALE: + node.TasksCount = 1 // TODO threads + case OP_CPY: + case OP_RESHAPE: + case OP_VIEW: + case OP_PERMUTE: + case OP_TRANSPOSE: + case OP_GET_ROWS: + case OP_DIAG_MASK_INF: + node.TasksCount = 1 + case OP_SOFT_MAX: + node.TasksCount = 1 // TODO threads + case OP_ROPE: + ////node.TasksCount = 1 + case OP_CONV_1D_1S: + case OP_CONV_1D_2S: + node.TasksCount = 1 // TODO threads + ////ASSERT(node->src0->ne[3] == 1); + ////ASSERT(node->src1->ne[2] == 1); + ////ASSERT(node->src1->ne[3] == 1); + case OP_FLASH_ATTN: + node.TasksCount = 1 // TODO threads + case OP_FLASH_FF: + node.TasksCount = 1 // TODO threads + case OP_NONE: + node.TasksCount = 1 + case OP_COUNT: + fmt.Printf("\n[HALT] Something wrong with compute graph!") + os.Exit(1) + } + } + } + + nodeID = min(nodeID, int(graph.NodesCount)) + + for i := uint32(0); i < uint32(nodeID); i++ { + + node := graph.Nodes[i] + + if DEBUG { + fmt.Printf("\n\n### STEP #%d ### %d - %d [ %d:%d:%d:%d ]", i, node.op, node.Type, node.NE[0], node.NE[1], node.NE[2], node.NE[3]) + } + + params := ComputeParams{ + Type: TASK_INIT, + ith: 0, + nth: uint32(node.TasksCount), + } + + ComputeForward(graph, ¶ms, node) // TASK_INIT + + // --- COMPUTE + + // BREAKPOINT DEBUG + //if i > 1300 { + // fmt.Printf("\n\n=== HALT #%d ===", i) + // os.Exit(0) + //} + + params.Type = TASK_COMPUTE + ComputeForward(graph, ¶ms, node) + + // --- FINALIZE + + params.Type = TASK_FINALIZE + ComputeForward(graph, ¶ms, node) + } + +} + +func SaveComputeNodeEnv(node *Tensor, graph *Graph) []*TensorOnGraph{ + tensorOnGraphList := make([]*TensorOnGraph, 0) + tensorOnGraphList = append(tensorOnGraphList, node.ToTensorOnGraph(graph)) + if node.Src0 != nil { + tensorOnGraphList = append(tensorOnGraphList, node.Src0.ToTensorOnGraph(graph)) + } + if node.Src1 != nil { + tensorOnGraphList = append(tensorOnGraphList, node.Src1.ToTensorOnGraph(graph)) + } + return tensorOnGraphList +} + +// total_bytes_len +// nodeID +// tensorGraph num +// [len, tensor] +func SaveComputeNodeEnvToBytes(nodeID uint32, node *Tensor, graph *Graph, toBigEndian bool) []byte { + tensorGraphList := SaveComputeNodeEnv(node, graph) + if len(tensorGraphList) == 0 { + return nil + } + data := make([]byte, 0) + // nodeID + data = append(data, common.IntToBytes(int(nodeID), toBigEndian)...) 
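+	// tensorGraphList holds the target node first, then its Src0/Src1 operands
+	// (when present), so the MIPS side can rebuild and recompute just this node.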
+	// tensorGraph num
+	data = append(data, common.IntToBytes(len(tensorGraphList), toBigEndian)...)
+	// tensor
+	for i := 0; i < len(tensorGraphList); i++ {
+		tensor := tensorGraphList[i]
+		bytes := tensor.Encoding(toBigEndian)
+		// append size ahead of content
+		bytes = append(common.IntToBytes(len(bytes), toBigEndian), bytes...)
+		// append into data
+		data = append(data, bytes...)
+	}
+	// total bytes len
+	data = append(common.IntToBytes(len(data), toBigEndian), data...)
+	return data
+}
+
+func DecodeComputeNodeEnv(data []byte, fromBigEndian bool, currentBigEndian bool) (uint32, []*TensorOnGraph, error) {
+	t := 0
+	totalSize := common.BytesToInt32(data[:4], fromBigEndian)
+	t += 4
+	if int(totalSize) > len(data) - 4 {
+		return 0, nil, errors.New("not enough data")
+	}
+
+	// nodeID
+	nodeID := common.BytesToInt32(data[t:t+4], fromBigEndian)
+	t += 4
+
+	// tensorNum
+	tensorNum := common.BytesToInt32(data[t:t+4], fromBigEndian)
+	t += 4
+
+	tensorOnGraphList := make([]*TensorOnGraph, tensorNum)
+
+	for i := 0; i < int(tensorNum); i++ {
+		// size
+		size := common.BytesToInt32(data[t:t+4], fromBigEndian)
+		t += 4
+		// tensorOnGraph
+		tensor := DecodeTensorOnGraph(data[t:t+int(size)], fromBigEndian, currentBigEndian)
+		t += int(size)
+
+		tensorOnGraphList[i] = tensor
+	}
+
+	return uint32(nodeID), tensorOnGraphList, nil
+}
\ No newline at end of file
diff --git a/ml_mips/build.sh b/ml_mips/build.sh
new file mode 100755
index 0000000..b6b4302
--- /dev/null
+++ b/ml_mips/build.sh
@@ -0,0 +1,15 @@
+#!/usr/bin/env bash
+set -e
+
+export GOOS=linux
+export GOARCH=mips
+export GOMIPS=softfloat
+go build -o ./ml_mips
+
+file ml_mips
+
+if [[ ! -d venv ]]; then
+  python3 -m venv venv
+fi
+
+../compile.py ml_mips
diff --git a/ml_mips/main.go b/ml_mips/main.go
new file mode 100644
index 0000000..169e026
--- /dev/null
+++ b/ml_mips/main.go
@@ -0,0 +1,53 @@
+package main
+
+import (
+	"fmt"
+	"mlgo/common"
+	"mlgo/ml"
+)
+
+const (
+	READ_FROM_BIDENDIAN = true
+	OUTPUT_TO_BIDENDIAN = true
+)
+
+// reads the serialized compute-node environment from INPUT_ADDR, laid out as [size][envData]
+// output: nodeID, tensorOnGraph, error
+func ReadTensorGraph() (uint32, []*ml.TensorOnGraph, error) {
+	fmt.Println("Start Read Tensor Graph")
+	dataBytes := common.ReadBytes(common.INPUT_ADDR, READ_FROM_BIDENDIAN)
+	nodeID, tensorGraphList, err := ml.DecodeComputeNodeEnv(dataBytes, READ_FROM_BIDENDIAN, true)
+	return nodeID, tensorGraphList, err
+}
+
+func ComputeTensorGraph(nodeID uint32, tensorGraphList []*ml.TensorOnGraph) {
+	fmt.Println("Start Compute Tensor Graph")
+	tensorList := make([]*ml.Tensor, 0)
+	tensorMap := make(map[uint32]*ml.Tensor)
+	for i := 0; i < len(tensorGraphList); i++ {
+		tensor := tensorGraphList[i].ToTensor(nil)
+		tensorMap[tensorGraphList[i].NodeID] = tensor
+		tensorList = append(tensorList, tensor)
+	}
+	// fill in the nodeid
+	for i := 0; i < len(tensorList); i++ {
+		tensor := tensorList[i]
+		tensorG := tensorGraphList[i]
+		if src0, ok := tensorMap[tensorG.Src0NodeID]; ok {
+			tensor.Src0 = src0
+		}
+		if src1, ok := tensorMap[tensorG.Src1NodeID]; ok {
+			tensor.Src1 = src1
+		}
+	}
+	ml.ComputeNodeForward(tensorMap[uint32(nodeID)])
+	ml.PrintTensor(tensorMap[uint32(nodeID)], "final_after")
+}
+
+func main() {
+	nodeID, tensorGraphList, err := ReadTensorGraph()
+	if err != nil {
+		return
+	}
+	ComputeTensorGraph(nodeID, tensorGraphList)
+}
\ No newline at end of file
diff --git a/requirements.txt b/requirements.txt
new file mode 100644
index 0000000..9e1dd8b
--- /dev/null
+++ b/requirements.txt
@@ -0,0 +1,5 @@
+pyelftools==0.27
+hexdump==3.3
+termcolor==1.1.0 +capstone==4.0.2 +rangetree==1.0 \ No newline at end of file diff --git a/startup/startup.bin b/startup/startup.bin new file mode 100644 index 0000000..6a80327 Binary files /dev/null and b/startup/startup.bin differ diff --git a/startup/startup.s b/startup/startup.s new file mode 100644 index 0000000..4289472 --- /dev/null +++ b/startup/startup.s @@ -0,0 +1,24 @@ + .section .test, "x" + .balign 4 + .set noreorder + .global test + .ent test +test: + +lui $sp, 0x7fff +ori $sp, 0xd000 + +# http://articles.manugarg.com/aboutelfauxiliaryvectors.html +# _AT_PAGESZ = 6 +ori $t0, $0, 6 +sw $t0, 0xC($sp) +ori $t0, $0, 0x1000 +sw $t0, 0x10($sp) + +lw $ra, dat($0) +jr $ra +nop + +dat: + +.end test diff --git a/startup/startup.sh b/startup/startup.sh new file mode 100644 index 0000000..8ea4595 --- /dev/null +++ b/startup/startup.sh @@ -0,0 +1,4 @@ +#!/usr/bin/env bash +set -e + +../../mipsevm/maketests.py startup.s startup.bin