diff --git a/.gitattributes b/.gitattributes new file mode 100644 index 0000000..dfe0770 --- /dev/null +++ b/.gitattributes @@ -0,0 +1,2 @@ +# Auto detect text files and perform LF normalization +* text=auto diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..73ddf6d --- /dev/null +++ b/.gitignore @@ -0,0 +1,19 @@ +node_modules +artifacts +cache +.*.swp +venv +.idea +*.log + +examples/mnist/trainning/data/MNIST/raw + +mlgo +mlgo.bin + +examples/llama/llama + +ml_mips/ml_mips +ml_mips/ml_mips.bin + +examples/llama/data \ No newline at end of file diff --git a/README.md b/README.md new file mode 100644 index 0000000..7007ecc --- /dev/null +++ b/README.md @@ -0,0 +1,14 @@ +# MLGO + +MLGO is tensor library for machine learning in pure Golang that can run on MIPS. + +The machine learning part of this project refers to the legendary [ggml.cpp](https://github.com/ggerganov/ggml) framework. + + +## MNIST + +1. Train the AI model. See `examples/mnist/trainning/mnist.ipynb` +2. Convert the AI model into GGML using `examples/mnist/convert-h5-to-ggml.py` +3. Build the AI inference engine for MIPS +`cd examples/mnist_mips && ./build` +`` \ No newline at end of file diff --git a/build.sh b/build.sh new file mode 100755 index 0000000..cfe73d1 --- /dev/null +++ b/build.sh @@ -0,0 +1,15 @@ +#!/usr/bin/env bash +set -e + +export GOOS=linux +export GOARCH=mips +export GOMIPS=softfloat +go build -o ./mlgo + +file mlgo + +if [[ ! -d venv ]]; then + python3 -m venv venv +fi + +./compile.py mlgo diff --git a/common/utils.go b/common/utils.go new file mode 100644 index 0000000..a87ec5f --- /dev/null +++ b/common/utils.go @@ -0,0 +1,51 @@ +package common + +import ( + "math" + "os" + "unsafe" +) + +// NB! INT = 32 bits +func ReadInt32FromFile(file *os.File) uint32 { + buf := make([]byte, 4) + if count, err := file.Read(buf); err != nil || count != 4 { + return 0 + } + return uint32(buf[3])<<24 | uint32(buf[2])<<16 | uint32(buf[1])<<8 | uint32(buf[0]) +} + +func ReadStringFromFile(file *os.File, len uint32) string { + buf := make([]byte, len) + if count, err := file.Read(buf); err != nil || count != int(len) { + return "" + } + return string(buf) +} + + +func ReadFP32FromFile(file *os.File) float32 { + buf := make([]byte, 4) + if count, err := file.Read(buf); err != nil || count != 4 { + return 0.0 + } + bits := uint32(buf[3])<<24 | uint32(buf[2])<<16 | uint32(buf[1])<<8 | uint32(buf[0]) + return math.Float32frombits(bits) +} + +func min(a, b int) int { + if a <= b { + return a + } + return b +} + + + +func DecodeFloat32List(bs []byte) []float32 { + return unsafe.Slice((*float32)(unsafe.Pointer(&bs[0])), len(bs)/4) +} + +func EncodeFloat32List(fs []float32) []byte { + return unsafe.Slice((*byte)(unsafe.Pointer(&fs[0])), len(fs)*4) +} \ No newline at end of file diff --git a/common/vmutils.go b/common/vmutils.go new file mode 100644 index 0000000..bc37813 --- /dev/null +++ b/common/vmutils.go @@ -0,0 +1,140 @@ +package common + +import ( + "bytes" + "encoding/binary" + "os" + "reflect" + "unsafe" +) + +// vm only =================================================================================== + +// memory layout in MIPS +const ( + INPUT_ADDR = 0x31000000 + OUTPUT_ADDR = 0x32000000 + MODEL_ADDR = 0x33000000 + MAGIC_ADDR = 0x30000800 +) + +func ByteAt(addr uint64, length int) []byte { + var ret []byte + bh := (*reflect.SliceHeader)(unsafe.Pointer(&ret)) + bh.Data = uintptr(addr) + bh.Len = length + bh.Cap = length + return ret +} + +// reading bytes from bigEndian or littleEndian +func 
ReadBytes(addr uint64, isBigEndian bool) []byte { + rawSize := CopyBytes(ByteAt(addr, 4)) + size := BytesToInt32(rawSize, isBigEndian) + ret := ByteAt(addr + 4, int(size)) + //shoud we copy here? may not for saving memory + return ret +} + +func Halt() { + //os.Stderr.WriteString("THIS SHOULD BE PATCHED OUT\n") + // the exit syscall is a jump to 0x5ead0000 now + os.Exit(0) +} + +func Output(output []byte, isBigEndian bool) { + size := len(output) + rawSize := IntToBytes(size,isBigEndian) + mSize := ByteAt(OUTPUT_ADDR, 4) + copy(mSize, rawSize) + mData := ByteAt(OUTPUT_ADDR + 4, size) + copy(mData, output) + // magic code => have written the result + magic := ByteAt(MAGIC_ADDR, 4) + copy(magic, []byte{0x12, 0x34, 0x56, 0x78}) + // stop everything + Halt() +} + + +func IntToBytes(n int, isBigEndian bool) []byte { + x := int32(n) + + bytesBuffer := bytes.NewBuffer([]byte{}) + if isBigEndian { + binary.Write(bytesBuffer, binary.BigEndian, x) + } else { + binary.Write(bytesBuffer, binary.LittleEndian, x) + } + return bytesBuffer.Bytes() +} + +func BytesToInt32(b []byte, isBigEndian bool) int32 { + bytesBuffer := bytes.NewBuffer(b) + + var x int32 + if isBigEndian { + binary.Read(bytesBuffer, binary.BigEndian, &x) + } else { + binary.Read(bytesBuffer, binary.LittleEndian, &x) + } + + + return x +} + +func Float32ToBytes(x float32, isBigEndian bool) []byte { + bytesBuffer := bytes.NewBuffer([]byte{}) + if isBigEndian { + binary.Write(bytesBuffer, binary.BigEndian, x) + } else { + binary.Write(bytesBuffer, binary.LittleEndian, x) + } + return bytesBuffer.Bytes() +} + +func BytesToFloat32(b []byte, isBigEndian bool) float32 { + byteBuffer := bytes.NewBuffer(b) + var x float32 + if isBigEndian { + binary.Read(byteBuffer, binary.BigEndian, &x) + } else { + binary.Read(byteBuffer, binary.LittleEndian, &x) + } + + return x +} + +// CopyBytes returns an exact copy of the provided bytes. +func CopyBytes(b []byte) (copiedBytes []byte) { + if b == nil { + return nil + } + copiedBytes = make([]byte, len(b)) + copy(copiedBytes, b) + + return +} + +// read from index then return the result and the next index +func ReadInt32FromBytes(data []byte, index *int, isBigEndian bool) (uint32) { + if (*index + 4 > len(data)) { + *index = len(data) + return 0 + } + buf := CopyBytes(data[*index:*index+4]) + ret := BytesToInt32(buf, isBigEndian) + *index = *index + 4 + return uint32(ret) +} + +func ReadFP32FromBytes(data []byte, index *int, isBigEndian bool) (float32) { + if (*index + 4 > len(data)) { + *index = len(data) + return 0 + } + buf := CopyBytes(data[*index:*index+4]) + ret := BytesToFloat32(buf, isBigEndian) + *index = *index + 4 + return ret +} \ No newline at end of file diff --git a/common/vmutils_test.go b/common/vmutils_test.go new file mode 100644 index 0000000..27045b7 --- /dev/null +++ b/common/vmutils_test.go @@ -0,0 +1,49 @@ +package common + +import ( + "fmt" + "testing" + "unsafe" +) + +func TestByteFloat(t *testing.T){ + a := 1.234 + ab := Float32ToBytes(float32(a), true) + aa := BytesToFloat32(ab, true) + fmt.Println(a, ab, aa) +} + +func byteSliceToFloat32Slice(src []byte) []float32 { + if len(src) == 0 { + return nil + } + + l := len(src) / 4 + ptr := unsafe.Pointer(&src[0]) + // It is important to keep in mind that the Go garbage collector + // will not interact with this data, and that if src if freed, + // the behavior of any Go code using the slice is nondeterministic. 
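+	// Since Go 1.17 the same conversion can also be written with unsafe.Slice
+	// (as decodeUnsafe below does), which avoids the arbitrary 1<<26 array bound.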
+ // Reference: https://github.com/golang/go/wiki/cgo#turning-c-arrays-into-go-slices + return (*[1 << 26]float32)((*[1 << 26]float32)(ptr))[:l:l] +} + +func encodeUnsafe(fs []float32) []byte { + return unsafe.Slice((*byte)(unsafe.Pointer(&fs[0])), len(fs)*4) +} + +func decodeUnsafe(bs []byte) []float32 { + return unsafe.Slice((*float32)(unsafe.Pointer(&bs[0])), len(bs)/4) +} + +func TestByteSliceToFloat32Slice(t *testing.T) { + as := []float32{1.234, 2.345} + asBytes := make([]byte, 0) + for i := 0; i < len(as); i++ { + asBytes = append(asBytes, Float32ToBytes(as[i], false)...) + } + fmt.Println(asBytes) + fmt.Println(byteSliceToFloat32Slice(asBytes)) + fmt.Println(encodeUnsafe(as)) + fmt.Println(decodeUnsafe(encodeUnsafe(as))) + fmt.Println(decodeUnsafe(asBytes)) +} \ No newline at end of file diff --git a/compile.py b/compile.py new file mode 100755 index 0000000..699557d --- /dev/null +++ b/compile.py @@ -0,0 +1,73 @@ +#!/usr/bin/env python3 +import os +import sys +import struct +import hashlib +from rangetree import RangeTree +from elftools.elf.elffile import ELFFile + +def load_minigeth(fn="mlgo"): + elf = open(fn, "rb") + data = elf.read() + elf.seek(0) + + elffile = ELFFile(elf) + + end_addr = 0 + for seg in elffile.iter_segments(): + end_addr = max(end_addr, seg.header.p_vaddr + seg.header.p_memsz) + + # program memory (16 MB) + prog_size = (end_addr+0xFFF) & ~0xFFF + prog_dat = bytearray(prog_size) + print("malloced 0x%x for program" % prog_size) + + for seg in elffile.iter_segments(): + print(seg.header, hex(seg.header.p_vaddr)) + prog_dat[seg.header.p_vaddr:seg.header.p_vaddr+len(seg.data())] = seg.data() + + entry = elffile.header.e_entry + print("entrypoint: 0x%x" % entry) + + # moved to MIPS + sf = os.path.join(os.path.dirname(os.path.abspath(__file__)), "startup", "startup.bin") + start = open(sf, "rb").read() + struct.pack(">I", entry) + prog_dat[:len(start)] = start + entry = 0 + + r = RangeTree() + found = 0 + for section in elffile.iter_sections(): + try: + for nsym, symbol in enumerate(section.iter_symbols()): + ss = symbol['st_value'] + se = ss+symbol['st_size'] + if ss != se: + try: + r[ss:se] = symbol.name + except KeyError: + continue + #print(nsym, symbol.name, symbol['st_value'], symbol['st_size']) + if symbol.name == "runtime.gcenable": + print(nsym, symbol.name) + # nop gcenable + prog_dat[symbol['st_value']:symbol['st_value']+8] = b"\x03\xe0\x00\x08\x00\x00\x00\x00" + found += 1 + except Exception: + #traceback.print_exc() + pass + + #assert(found == 2) + return prog_dat, prog_size, r + + +if __name__ == "__main__": + fn = "minigeth" + if len(sys.argv) > 1: + fn = sys.argv[1] + + prog_dat, prog_size, _ = load_minigeth(fn) + print("compiled %d bytes with md5 %s" % (prog_size, hashlib.md5(prog_dat).hexdigest())) + + with open(fn+".bin", "wb") as f: + f.write(prog_dat) \ No newline at end of file diff --git a/examples/gpt-2/gpt2.go b/examples/gpt-2/gpt2.go new file mode 100644 index 0000000..c1cbd97 --- /dev/null +++ b/examples/gpt-2/gpt2.go @@ -0,0 +1,356 @@ +package gpt2 + +import ( + "errors" + "fmt" + "math" + "mlgo/ml" + "os" + "strconv" +) + +// default hparams (GPT-2 117M) +/* + int32_t n_vocab = 50257; + int32_t n_ctx = 1024; + int32_t n_embd = 768; + int32_t n_head = 12; + int32_t n_layer = 12; + int32_t ftype = 1; +*/ +type gpt2_hparams struct { + n_vocab int32; + n_ctx int32; + n_embd int32; + n_head int32; + n_layer int32; + ftype int32; + +}; + +type gpt2_layer struct { + // normalization + ln_1_g *ml.Tensor; + ln_1_b *ml.Tensor; + + ln_2_g 
*ml.Tensor; + ln_2_b *ml.Tensor; + + c_attn_attn_w *ml.Tensor; + c_attn_attn_b *ml.Tensor; + + c_attn_proj_w *ml.Tensor; + c_attn_proj_b *ml.Tensor; + + c_mlp_fc_w *ml.Tensor; + c_mlp_fc_b *ml.Tensor; + + c_mlp_proj_w *ml.Tensor; + c_mlp_proj_b *ml.Tensor; +} + +type gpt2_model struct { + hparams gpt2_hparams ; + + ln_f_g *ml.Tensor; + ln_f_b *ml.Tensor; + + wte *ml.Tensor; + wpe *ml.Tensor; + lm_head *ml.Tensor; + + layers []gpt2_layer; + + memory_k *ml.Tensor; + memory_v *ml.Tensor; + + tensors map[string]*ml.Tensor; +} + +func gpt2_model_load(fname string, model *gpt2_model, vocab *gpt_vocab) error { + + file, err := os.Open(fname) + if err != nil { + return err + } + defer file.Close() + + { + magic := readInt(file) + if magic != 0x67676d6c { + return errors.New("invalid model file (bad magic)") + } + } + + // load hparams + { + model.hparams.n_vocab = int32(readInt(file)) + model.hparams.n_ctx = int32(readInt(file)) + model.hparams.n_embd = int32(readInt(file)) + model.hparams.n_head = int32(readInt(file)) + model.hparams.n_layer = int32(readInt(file)) + model.hparams.ftype = int32(readInt(file)) + + fmt.Printf("hparams: %v\n", model.hparams) + } + + // load vocab + { + n_vocab := readInt(file) + if n_vocab != uint32(model.hparams.n_vocab) { + return errors.New(fmt.Sprintf("n_vocan: %v, model.hparams.n_vocan: %v", n_vocab, model.hparams.n_vocab)) + } + + for i := uint32(0); i < (n_vocab); i++ { + len := readInt(file) + word := readString(file, len) + vocab.token_to_id[word] = i + vocab.id_to_token[i] = word + } + } + + wtype := ml.TYPE_F32 + dtype := ml.TYPE_F32 + + // weights + { + n_embd := uint32(model.hparams.n_embd) + n_layer := uint32(model.hparams.n_layer) + n_ctx := uint32(model.hparams.n_ctx) + n_vocab := uint32(model.hparams.n_vocab) + + model.layers = make([]gpt2_layer, n_layer) + model.tensors = make(map[string]*ml.Tensor) + + model.ln_f_g = ml.NewTensor1D(nil, dtype, uint32(n_embd)) + model.ln_f_b = ml.NewTensor1D(nil, dtype, uint32(n_embd)) + + model.wte = ml.NewTensor2D(nil, wtype, uint32(n_embd), uint32(n_vocab)) + model.wpe = ml.NewTensor2D(nil, dtype, uint32(n_embd), uint32(n_ctx)) + model.lm_head = ml.NewTensor2D(nil, wtype, uint32(n_embd), uint32(n_vocab)) + + // map by name + model.tensors["model/ln_f/g"] = model.ln_f_g; + model.tensors["model/ln_f/b"] = model.ln_f_b; + + model.tensors["model/wte"] = model.wte; + model.tensors["model/wpe"] = model.wpe; + model.tensors["model/lm_head"] = model.lm_head; + + for i := 0; i < int(n_layer); i++ { + layer := &model.layers[i]; + + layer.ln_1_g = ml.NewTensor1D(nil, dtype, n_embd); + layer.ln_1_b = ml.NewTensor1D(nil, dtype, n_embd); + + layer.ln_2_g = ml.NewTensor1D(nil, dtype, n_embd); + layer.ln_2_b = ml.NewTensor1D(nil, dtype, n_embd); + + layer.c_attn_attn_w = ml.NewTensor2D(nil, wtype, n_embd, 3*n_embd); + layer.c_attn_attn_b = ml.NewTensor1D(nil, dtype, 3*n_embd); + + layer.c_attn_proj_w = ml.NewTensor2D(nil, wtype, n_embd, n_embd); + layer.c_attn_proj_b = ml.NewTensor1D(nil, dtype, n_embd); + + layer.c_mlp_fc_w = ml.NewTensor2D(nil, wtype, n_embd, 4*n_embd); + layer.c_mlp_fc_b = ml.NewTensor1D(nil, dtype, 4*n_embd); + + layer.c_mlp_proj_w = ml.NewTensor2D(nil, wtype, 4*n_embd, n_embd); + layer.c_mlp_proj_b = ml.NewTensor1D(nil, dtype, n_embd); + + // map by name + model.tensors["model/h" + strconv.Itoa(i) + "/ln_1/g"] = layer.ln_1_g; + model.tensors["model/h" + strconv.Itoa(i) + "/ln_1/b"] = layer.ln_1_b; + + model.tensors["model/h" + strconv.Itoa(i) + "/ln_2/g"] = layer.ln_2_g; + model.tensors["model/h" + 
strconv.Itoa(i) + "/ln_2/b"] = layer.ln_2_b; + + model.tensors["model/h" + strconv.Itoa(i) + "/attn/c_attn/w"] = layer.c_attn_attn_w; + model.tensors["model/h" + strconv.Itoa(i) + "/attn/c_attn/b"] = layer.c_attn_attn_b; + + model.tensors["model/h" + strconv.Itoa(i) + "/attn/c_proj/w"] = layer.c_attn_proj_w; + model.tensors["model/h" + strconv.Itoa(i) + "/attn/c_proj/b"] = layer.c_attn_proj_b; + + model.tensors["model/h" + strconv.Itoa(i) + "/mlp/c_fc/w"] = layer.c_mlp_fc_w; + model.tensors["model/h" + strconv.Itoa(i) + "/mlp/c_fc/b"] = layer.c_mlp_fc_b; + + model.tensors["model/h" + strconv.Itoa(i) + "/mlp/c_proj/w"] = layer.c_mlp_proj_w; + model.tensors["model/h" + strconv.Itoa(i) + "/mlp/c_proj/b"] = layer.c_mlp_proj_b; + } + } + + // key + value + { + n_mem := model.hparams.n_layer * model.hparams.n_ctx + n_element := model.hparams.n_embd * n_mem + + model.memory_k = ml.NewTensor1D(nil, dtype, uint32(n_element)) + model.memory_v = ml.NewTensor1D(nil, dtype, uint32(n_element)) + + fmt.Println("n_element in key+value: ", n_element) + } + + // load weights + { + total_size := 0 + has_lm_head := false + + for { + n_dim := readInt(file) + length := readInt(file) + ttype := readInt(file) + + if n_dim | length | ttype == 0 { + // eof + break + } + + nelements := 1 + ne := make([]int32, 2) + for i := 0; i < int(n_dim); i++ { + ne[i] = int32(readInt(file)) + nelements *= int(ne[i]) + } + + // read name len + name := readString(file, length) + if _, ok := model.tensors[name]; !ok { + return errors.New(fmt.Sprintf("unknow tensor: %s", name)) + } + tensor := model.tensors[name] + + // read data + for i := 0; i < len(tensor.Data); i++{ + tensor.Data[i] = readFP32(file) + } + + // GPT-2 models share the WTE tensor as the LM head + if name == "model/wte" && !has_lm_head { + copy(tensor.Data, model.lm_head.Data) + } + + if name == "model/lm_head" { + has_lm_head = true + } + + total_size += len(tensor.Data) * 4 + + } + } + + return nil +} + +// evaluate the transformer +// +// - model: the model +// - n_threads: number of threads to use +// - n_past: the context size so far +// - embd_inp: the embeddings of the tokens in the context +// - embd_w: the predicted logits for the next token +// +// func gpt2_eval(model *gpt2_model, n_thread int, n_past int, embd_inp []uint32, embd_w []float32, mem_per_token uint32) { +// N := len(embd_inp) + +// n_embd := model.hparams.n_embd +// n_layer := model.hparams.n_layer +// n_ctx := model.hparams.n_ctx +// n_head := model.hparams.n_head +// n_vocab := model.hparams.n_vocab + +// gf := ml.Graph{ThreadsCount: n_thread} +// embd := ml.NewTensor1D(nil, ml.TYPE_F32, uint32(N)) +// for i := 0; i < N; i++ { +// embd.Data[i] = float32(embd_inp[i]) +// } + +// position := ml.NewTensor1D(nil, ml.TYPE_F32, uint32(N)) +// for i := 0; i < N; i++ { +// position.Data[i] = float32(n_past + 1) +// } + +// inpL := ml.Add(nil, ml.GetRows(nil, model.wte, embd), ml.GetRows(nil, model.wpe, position)) + +// for il := 0; il < int(n_layer); il++ { +// // TODO: replace with ggml_norm +// cur := ml.RMSNorm(nil, inpL) +// cur = ml.Add(nil, ml.Mul(nil, ml.Repeat(nil, model.layers[il].ln_1_g, cur), cur), ml.Repeat(nil, model.layers[il].ln_1_b, cur)) + +// cur = ml.MulMat(nil, model.layers[il].c_attn_attn_w, cur) +// cur = ml.Add(nil, ml.Repeat(nil, model.layers[il].c_attn_attn_b, cur), cur) + +// // self-attention +// { +// Qcur := ml.View1D() +// } +// } + +// } + +func readInt(file *os.File) uint32 { + buf := make([]byte, 4) + if count, err := file.Read(buf); err != nil || count != 4 { + 
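+		// NB! A short read or I/O error is folded into a zero return value,
+		// which gpt2_model_load interprets as end-of-file when scanning tensors;
+		// io.ReadFull would be the stricter alternative here.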
return 0 + } + return uint32(buf[3])<<24 | uint32(buf[2])<<16 | uint32(buf[1])<<8 | uint32(buf[0]) +} + +func readString(file *os.File, len uint32) string { + buf := make([]byte, len) + if count, err := file.Read(buf); err != nil || count != int(len) { + return "" + } + return string(buf) +} + + +func readFP32(file *os.File) float32 { + buf := make([]byte, 4) + if count, err := file.Read(buf); err != nil || count != 4 { + return 0.0 + } + bits := uint32(buf[3])<<24 | uint32(buf[2])<<16 | uint32(buf[1])<<8 | uint32(buf[0]) + return math.Float32frombits(bits) +} + +func min(a, b int) int { + if a <= b { + return a + } + return b +} + +func printTensor(tensor *ml.Tensor, name string) { + var dt string + if tensor.Type == ml.TYPE_F16 { + dt = "FP16" + } + if tensor.Type == ml.TYPE_F32 { + dt = "FP32" + } + if tensor.Type == ml.TYPE_Q4_0 { + dt = "INT4" + } + + fmt.Printf("\n\n=== [ %s | %s | %d:%d:%d ] ===\n", + name, dt, tensor.NE[0], tensor.NE[1], tensor.NE[2]) + + for nn := 0; nn < min(12, int(tensor.NE[1])); nn++ { + fmt.Printf("\n %d x %d ...\t", nn, tensor.NE[0]) + for ii := 0; ii < min(12, int(tensor.NE[0])); ii++ { + fmt.Printf("%.3f\t", tensor.Data[nn*int(tensor.NE[0])+ii]) + } + } +} + +type gpt_vocab struct { + token_to_id map[string]uint32 + id_to_token map[uint32]string +} + +func NewVocab() *gpt_vocab { + return &gpt_vocab{ + token_to_id: make(map[string]uint32), + id_to_token: make(map[uint32]string), + } +} \ No newline at end of file diff --git a/examples/llama/README.md b/examples/llama/README.md new file mode 100644 index 0000000..5d30483 --- /dev/null +++ b/examples/llama/README.md @@ -0,0 +1,71 @@ +# LLaMA.go + +![](./assets/images/terminal.png?raw=true) + +[![Coverage](https://img.shields.io/badge/Coverage-0-red)](https://github.com/gotzmann/llama.go/actions/workflows/coverage.yml) + +Part of this code is borrowed from [llama.go](github.com/gotzmann/llama.go) + +## The Goal + +We dream of a world where ML hackers are able to grok with **REALLY BIG GPT** models without having GPU clusters consuming a shit tons of **$$$** - using only machines in their own homelabs. + +The code of the project is based on the legendary **[ggml.cpp](https://github.com/ggerganov/llama.cpp)** framework of Georgi Gerganov written in C++ + +We hope using our beloved Golang instead of *soo-powerful* but *too-low-level* language will allow much greater adoption of the **NoGPU** ideas. + +**NB!** The V1 supports only FP32 math, so you'll need at least 32GB RAM to work even with the smallest **LLaMA-7B** model. As a preliminary step you should have binary files converted from original LLaMA model locally. + +## V0 Roadmap + +- [x] Move FP32 tensor math from C++ to pure Golang package GoML +- [x] Implement LLaMA neural net architecture and model loading in Golang +- [x] Support smaller LLaMA-7B model +- [x] Be sure Go inference works EXACT SAME way as C++ for static prompts +- [x] Let Go shine! 
Enable multi-threading and boost performance + +## V1 Roadmap + +- [x] Check cross-patform compatibility with Mac and Windows +- [x] Release first stable version for ML hackers +- [x] Support bigger LLaMA models: 13B, 30B, 60B +- [ ] Enable interactive mode for real-time chat with GPT +- [ ] Allow automatic download converted model weights from the Internet +- [ ] Implement metrics for RAM and CPU usage +- [ ] x8 performance boost with AVX2 support +- [ ] INT8 quantization to allow x4 bigger models fit the same memory +- [ ] Server Mode for use in clouds as part of microservice architecture + +## V2 Roadmap + +- [ ] x2 performance boost with AVX512 support +- [ ] ARM NEON support on Mac machines and ARM servers +- [ ] FP16 and BF16 support where possible +- [ ] Support INT4 and GPTQ quantization + +## How to Run + +```shell +go run main.go --threads 8 --model /home/iiislab/project/web3_dl/reference/models/llama-7b-fp32.bin.2 --temp 0.80 --context 128 --predict 128 --prompt "Why Golang is so popular?" +``` + +Or edit the Makefile and compile and run: + +```shell +make +./llama --threads 8 --model /home/iiislab/project/web3_dl/reference/models/llama-7b-fp32.bin.2 --temp 0.80 --context 128 --predict 128 --prompt "Why Golang is so popular?" +``` + +## FAQ + +**1] Where might I get original LLaMA model files?** + +Contact Meta directly or look around for some torrent alternatives + +**2] How to convert original LLaMA files into supported format?** + +Youl'll need original FP16 files placed into **models** directory, then convert with command: + +```shell +python3 ./scripts/convert.py ~/models/LLaMA/7B/ 0 +``` diff --git a/examples/llama/VERSION b/examples/llama/VERSION new file mode 100644 index 0000000..7dea76e --- /dev/null +++ b/examples/llama/VERSION @@ -0,0 +1 @@ +1.0.1 diff --git a/examples/llama/llama_go/llama.go b/examples/llama/llama_go/llama.go new file mode 100644 index 0000000..fa37868 --- /dev/null +++ b/examples/llama/llama_go/llama.go @@ -0,0 +1,1374 @@ +package llama + +import ( + "container/ring" + "fmt" + "io" + "math" + "math/rand" + "os" + "reflect" + "runtime" + "sort" + "time" + "unsafe" + + "github.com/mattn/go-colorable" + "github.com/mitchellh/colorstring" + "github.com/schollz/progressbar/v3" + "github.com/x448/float16" + "golang.org/x/exp/slices" + + "mlgo/ml" +) + +const ( + LLAMA_FILE_VERSION = 1 + LLAMA_FILE_MAGIC = 0x67676a74 // 'ggjt' in hex + LLAMA_FILE_MAGIC_OLD = 0x67676d66 // 'ggmf' in hex + LLAMA_FILE_MAGIC_UNVERSIONED = 0x67676d6c // 'ggml' pre-versioned files + + SPLIT_NONE = 0 + SPLIT_BY_COLUMNS = 1 + SPLIT_BY_ROWS = 2 +) + +var ( + // determine number of model parts based on the dimension + LLAMA_N_PARTS = map[uint32]int{ + 4096: 1, + 5120: 2, + 6656: 4, + 8192: 8, + } +) + +type pair struct { + first float32 + second uint32 +} + +type Context struct { + Model *Model + Vocab *ml.Vocab + + // decode output (2-dimensional array: [n_tokens][n_vocab]) + Logits []float32 + LogitsAll bool + + // input embedding (1-dimensional array: [n_embd]) + Embedding []float32 +} + +func NewContext() *Context { + return &Context{ + Model: NewModel(), + Vocab: ml.NewVocab(0), + Logits: make([]float32, 0, 0), // NewFloatSlice(0, 0), + Embedding: make([]float32, 0, 0), // NewFloatSlice(0, 0), + } +} + +// struct llama_context_params { +type ContextParams struct { + CtxSize uint32 // text context + PartsCount int // -1 for default + Seed int // RNG seed, 0 for random + LogitsAll bool // the llama_eval() call computes all logits, not just the last one + VocabOnly bool // only 
load the vocabulary, no weights + UseLock bool // force system to keep model in RAM + Embedding bool // embedding mode only +} + +type Layer struct { + + // normalization + attentionNorm *ml.Tensor + + // attention + wq *ml.Tensor + wk *ml.Tensor + wv *ml.Tensor + wo *ml.Tensor + + // normalization + ffn_norm *ml.Tensor + + // ff + w1 *ml.Tensor + w2 *ml.Tensor + w3 *ml.Tensor +} + +// default hparams (LLaMA 7B) +type HParams struct { + ctxSize uint32 // 512 + vocabSize uint32 // 32000 + embdSize uint32 // 4096 + multSize uint32 // 256 + headsCount uint32 // 32 + layersCount uint32 // 32 + rotCount uint32 // 64 + f16 uint32 // 1 +} + +type ModelType uint8 + +// available llama models +const ( + MODEL_UNKNOWN ModelType = iota + MODEL_7B + MODEL_13B + MODEL_30B + MODEL_65B +) + +type KVCache struct { + K *ml.Tensor + V *ml.Tensor + + N uint32 // number of tokens currently in the cache +} + +type Model struct { + Type ModelType + ctx *ml.Context + hparams HParams + + tokEmbeddings *ml.Tensor + norm *ml.Tensor + output *ml.Tensor + + layers []Layer + kvSelf KVCache // key + value cache for the self attention + + loadedCount uint32 + tensors map[string]*ml.Tensor +} + +func NewModel() *Model { + return &Model{ + hparams: HParams{ + ctxSize: 512, + vocabSize: 32000, + embdSize: 4096, + multSize: 256, + headsCount: 32, + layersCount: 32, + rotCount: 64, + f16: 1, + }, + layers: make([]Layer, 0), + tensors: make(map[string]*ml.Tensor), + kvSelf: KVCache{ + K: &ml.Tensor{}, + V: &ml.Tensor{}, + }, + } +} + +func min(a, b int) int { + if a <= b { + return a + } + return b +} + +// Resize() (safe) for using instead of C++ std::vector:resize() +// https://go.dev/play/p/VlQ7N75E5AD +func Resize(slice []float32, size int) []float32 { + newSlice := make([]float32, size) + for i := 0; i < min(size, len(slice)); i++ { + newSlice[i] = slice[i] + } + return newSlice +} + +// NB! 
This do not clear the underlying array when resizing +// https://go.dev/play/p/DbK4dFqwrZn +func ResizeInplace(slice *[]float32, size int) { + if len(*slice) == size { + return + } else if size < len(*slice) { + *slice = (*slice)[:size] + } else { + *slice = slices.Grow(*slice, size) + *slice = (*slice)[:size] + } +} + +// evaluate the transformer +// +// - lctx: llama context +// - tokens: new batch of tokens to process +// - n_past: the context size so far +// - n_threads: number of threads to use +// + +func Eval( + + lctx *Context, + tokens []uint32, + tokensCount uint32, + pastCount uint32, + threadsCount int) error { + + N := tokensCount + model := lctx.Model + kvSelf := model.kvSelf + + embdSize := model.hparams.embdSize + layersCount := model.hparams.layersCount + ctxSize := model.hparams.ctxSize + headsCount := model.hparams.headsCount + vocabSize := model.hparams.vocabSize + rotCount := model.hparams.embdSize / model.hparams.headsCount + + ctx0 := &ml.Context{} //ctx0 := ml.Init(ml.InitParams{}) + + // for big prompts, if BLAS is enabled, it is better to use only one thread + // otherwise, the threads are spin-lock waiting for the BLAS calls and are degrading the performance + graph := ml.Graph{ThreadsCount: threadsCount} + + // Convert the tokens to a []float32 slice + tokensFloat32 := make([]float32, len(tokens)) + for i, token := range tokens { + tokensFloat32[i] = float32(token) + } + + // Initialize the embd tensor with the tokensFloat32 data + embd := ml.NewTensor(ctx0, ml.TYPE_F32, 1, uint32(len(tokens)), 1, 1, 1, tokensFloat32) + inpL := ml.GetRows(ctx0, model.tokEmbeddings, embd) + + for il := uint32(0); il < layersCount; il++ { + + //if il > 0 { + // break // DEBUG + //} + + inpSA := inpL + cur := &ml.Tensor{} + + // norm + cur = ml.RMSNorm(ctx0, inpL) + + // cur = attention_norm*cur + rep := ml.Repeat(ctx0, model.layers[il].attentionNorm, cur) + + cur = ml.Mul(ctx0, rep, cur) + + // self-attention + { + Qcur := ml.MulMat(ctx0, model.layers[il].wq, cur) + Kcur := ml.MulMat(ctx0, model.layers[il].wk, cur) + Vcur := ml.MulMat(ctx0, model.layers[il].wv, cur) + + // store key and value to memory + if N >= 1 { + + ////struct ggml_tensor * k = ggml_view_1d(ctx0, kv_self.k, N*n_embd, (ggml_element_size(kv_self.k)*n_embd)*(il*n_ctx + n_past)); + ////struct ggml_tensor * v = ggml_view_1d(ctx0, kv_self.v, N*n_embd, (ggml_element_size(kv_self.v)*n_embd)*(il*n_ctx + n_past)); + + ////ggml_build_forward_expand(&graph, ggml_cpy(ctx0, Kcur, k)); + ////ggml_build_forward_expand(&graph, ggml_cpy(ctx0, Vcur, v)); + + // NB! 
ggml_element_size(kv_self.k) = 2 for FP16 + k := ml.View1D(ctx0, kvSelf.K, N*embdSize, embdSize*(il*ctxSize+pastCount)) + v := ml.View1D(ctx0, kvSelf.V, N*embdSize, embdSize*(il*ctxSize+pastCount)) + + ml.BuildForwardExpand(&graph, ml.Copy(ctx0, Kcur, k)) + ml.BuildForwardExpand(&graph, ml.Copy(ctx0, Vcur, v)) + } + + // Q = Qcur.contiguous().view(n_embd/n_head, n_head, N).permute(0, 2, 1, 3) + Q := + ml.Permute(ctx0, + ml.Rope(ctx0, + ml.Copy(ctx0, + Qcur, + ml.NewTensor3D(ctx0, ml.TYPE_F32, embdSize/headsCount, headsCount, N)), + pastCount, rotCount, 0), + 0, 2, 1, 3) + + // K = Kmem.view(n_embd/n_head, n_head, n_past + N).permute(0, 2, 1, 3) + K := + ml.Permute(ctx0, + ml.Rope(ctx0, + ml.Reshape3D(ctx0, + ////ggml_view_1d(ctx0, kv_self.k, (n_past + N)*n_embd, il*n_ctx*ggml_element_size(kv_self.k)*n_embd), + ////n_embd/n_head, n_head, n_past + N), + ml.View1D(ctx0, kvSelf.K, (pastCount+N)*embdSize, il*ctxSize*embdSize), + embdSize/headsCount, headsCount, pastCount+N), + pastCount, rotCount, 1), + 0, 2, 1, 3) + + // K * Q + ////struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q); + KQ := ml.MulMat(ctx0, K, Q) + + // KQ_scaled = KQ / sqrt(n_embd/n_head) + KQScaled := + ml.Scale(ctx0, + KQ, + ml.NewFP32(ctx0, float32(1.0/math.Sqrt(float64(embdSize)/float64(headsCount)))), + ) + + // KQ_masked = mask_past(KQ_scaled) + ////struct ggml_tensor * KQ_masked = ggml_diag_mask_inf(ctx0, KQ_scaled, n_past); + KQMasked := ml.DiagMaskInf(ctx0, KQScaled, pastCount) + + // KQ = soft_max(KQ_masked) + ////struct ggml_tensor * KQ_soft_max = ggml_soft_max(ctx0, KQ_masked); + KQSoftMax := ml.SoftMax(ctx0, KQMasked) + + // V_trans = Vmem.view(n_embd/n_head, n_head, n_past + N).permute(1, 2, 0, 3).contiguous() + VTrans := + ml.Copy(ctx0, + ml.Permute(ctx0, + ml.Reshape3D(ctx0, + ml.View1D(ctx0, kvSelf.V, (pastCount+N)*embdSize, il*ctxSize*embdSize), + embdSize/headsCount, headsCount, pastCount+N), + 1, 2, 0, 3), + ml.NewTensor3D(ctx0, ml.TYPE_F32 /* kv_self.v->type */, pastCount+N, embdSize/headsCount, headsCount)) + + // KQV = transpose(V) * KQ_soft_max + KQV := ml.MulMat(ctx0, VTrans, KQSoftMax) + + // KQV_merged = KQV.permute(0, 2, 1, 3) + KQVMerged := ml.Permute(ctx0, KQV, 0, 2, 1, 3) + + // cur = KQV_merged.contiguous().view(n_embd, N) + cur = ml.Copy(ctx0, + KQVMerged, + ml.NewTensor2D(ctx0, ml.TYPE_F32, embdSize, N)) + + // projection (no bias) + cur = ml.MulMat(ctx0, + model.layers[il].wo, + cur) + } + + inpFF := ml.Add(ctx0, cur, inpSA) + + // feed-forward network + { + // norm + { + cur = ml.RMSNorm(ctx0, inpFF) + + // cur = ffn_norm*cur + cur = ml.Mul(ctx0, + ml.Repeat(ctx0, model.layers[il].ffn_norm, cur), + cur) + } + + tmp := ml.MulMat(ctx0, + model.layers[il].w3, + cur) + + cur = ml.MulMat(ctx0, + model.layers[il].w1, + cur) + + // SILU activation + cur = ml.Silu(ctx0, cur) + + cur = ml.Mul(ctx0, cur, tmp) + + cur = ml.MulMat(ctx0, + model.layers[il].w2, + cur) + } + + cur = ml.Add(ctx0, cur, inpFF) + + // input for next layer + inpL = cur + + } + + // used at the end to optionally extract the embeddings + ////var embeddings *ml.Tensor + + // --- norm + + inpL = ml.RMSNorm(ctx0, inpL) + + // inpL = norm*inpL + inpL = ml.Mul(ctx0, + ml.Repeat(ctx0, model.norm, inpL), + inpL) + + embeddings := inpL + + // lm_head + inpL = ml.MulMat(ctx0, model.output, inpL) + + // logits -> probs + // COMMENTED inpL = ggml_soft_max(ctx0, inpL); + + // run the computation + ml.BuildForwardExpand(&graph, inpL) + + ml.GraphCompute(ctx0, &graph) + + // --- extract logits + + //fmt.Printf("\n\n=== INPL 09 === 
[%d,%d,%d,%d] ===\n", inpL.NE[0], inpL.NE[1], inpL.NE[2], inpL.NE[3]) // DEBUG + //for ii := 0; ii < 12; ii++ { + // fmt.Printf("%.4f ", inpL.Data[ii]) + //} + + if lctx.LogitsAll { + fmt.Print("\n[HALT] Not Expected: lctx.LogitsAll == true") + os.Exit(1) + + /* + // Copy inpL.Data to lctx.Logits + for i := uint32(0); i < vocabSize*N; i++ { + if i >= uint32(len(lctx.Logits)) || i >= uint32(len(inpL.Data)) { + fmt.Println("Error: Index out of bounds during Logits copy") + os.Exit(1) + } + lctx.Logits[i] = inpL.Data[i] + } + */ + } else { + // Copy only the relevant part of inpL.Data to lctx.Logits + for i := uint32(0); i < vocabSize; i++ { + srcIndex := vocabSize*(N-1) + i + if i >= uint32(len(lctx.Logits)) || srcIndex >= uint32(len(inpL.Data)) { + fmt.Println("Error: Index out of bounds during Logits copy") + os.Exit(1) + } + lctx.Logits[i] = inpL.Data[srcIndex] + } + } + + if ml.DEBUG { + printTensor(inpL, "INPL") + + fmt.Printf("\n\n=== LOGITS === %d ===\n", len(lctx.Logits)) // DEBUG + for ii := 0; ii < 13; ii++ { + fmt.Printf("%.4f ", lctx.Logits[ii]) + } + } + + // --- extract embeddings + + if len(lctx.Embedding) > 0 { + ////memcpy(embedding_out.data(), (float *) ggml_get_data(embeddings) + (n_embd*(N - 1)), sizeof(float)*n_embd); + for i := uint32(0); i < embdSize; i++ { + lctx.Embedding[i] = embeddings.Data[(embdSize*(N-1))+i] // FIXME ASAP + } + } + + return nil +} + +func printTensor(tensor *ml.Tensor, name string) { + var dt string + if tensor.Type == ml.TYPE_F16 { + dt = "FP16" + } + if tensor.Type == ml.TYPE_F32 { + dt = "FP32" + } + if tensor.Type == ml.TYPE_Q4_0 { + dt = "INT4" + } + + fmt.Printf("\n\n=== [ %s | %s | %d:%d:%d ] ===\n", + name, dt, tensor.NE[0], tensor.NE[1], tensor.NE[2]) + + for nn := 0; nn < min(12, int(tensor.NE[1])); nn++ { + fmt.Printf("\n %d x %d ...\t", nn, tensor.NE[0]) + for ii := 0; ii < min(12, int(tensor.NE[0])); ii++ { + fmt.Printf("%.3f\t", tensor.Data[nn*int(tensor.NE[0])+ii]) + } + } +} + +func sampleTopK(logitsID []pair, topK uint32) []pair { + // find the top K tokens + + // std::partial_sort + // Rearranges elements such that the range [first, middle) contains + // the sorted middle − first smallest elements in the range [first, last). + // The order of equal elements is not guaranteed to be preserved. + // The order of the remaining elements in the range [middle, last) is unspecified. + + /*std::partial_sort( + logits_id.begin(), + logits_id.begin() + top_k, logits_id.end(), + [](const std::pair & a, const std::pair & b) { + return a.first > b.first; + });*/ + + //keys := make([]double, 0, len(logitsID)) + //for k := range logitsID { + // keys = append(keys, k) + //} + //sort.Float64s(keys) + + sort.Slice( + logitsID[:topK], + func(i, j int) bool { + return logitsID[i].first < logitsID[j].first // FIXME ASAP We need bigger elements first + }) + + // logits_id.resize(top_k); + //for i := uint32(0); i < len(keys)-topK; i++ { + //delete(logitsID, keys[i]) + //} + + ret := make([]pair, 0, topK) + copy(ret, logitsID) + + return ret +} + +// llama_sample_top_p_top_k +// sample next token given probabilities for each embedding +// +// - consider only the top K tokens +// - from them, consider only the top tokens with cumulative probability > P +// + +// std::mt19937 = A Mersenne Twister pseudo-random generator of 32-bit numbers with a state size of 19937 bits. 
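+// In short: the logits are scaled by 1/temp (with a repetition penalty applied
+// to tokens already present in the last-N ring buffer), sorted in descending
+// order, cut down to the top K entries, soft-maxed into probabilities,
+// optionally cut again to the smallest prefix whose cumulative probability
+// reaches topP, and one token id is then drawn from what remains.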
+func SampleTopPTopK( + lctx *Context, + // lastNTokens []uint32, + lastNTokens *ring.Ring, + lastNTokensSize uint32, // FIXME Remove + topK uint32, + topP float32, + temp float32, + repeatPenalty float32, +) uint32 { + + ////auto & rng = lctx.rng; + ////logitsCount := uint32(len(vocab.ID2Token)) + logitsCount := lctx.Model.hparams.vocabSize + logits := lctx.Logits + + if ml.DEBUG { + fmt.Printf("\n\n>>> SampleTopPTopK <<<\n") + fmt.Printf("\n=== LOGITS | %d ===\n", len(logits)) + for i := 0; i < 8; i++ { + fmt.Printf("%.4f ", logits[i]) + } + fmt.Printf(" ... ") + for i := int(len(logits)) - 1; i >= int(len(logits))-8; i-- { + fmt.Printf("%.4f ", logits[i]) + } + /* + fmt.Printf("\n=== LAST N TOKENS | %d ===\n", len(lastNTokens)) + for i := 0; i < int(lastNTokensSize); i++ { + fmt.Printf("%d ", lastNTokens[i]) + } + */ + extractedTokens := ExtractTokens(lastNTokens.Move(-int(lastNTokensSize)), int(lastNTokensSize)) + fmt.Printf("\n=== LAST N TOKENS | %d ===\n", len(extractedTokens)) + for i := 0; i < int(lastNTokensSize); i++ { + fmt.Printf("%d ", extractedTokens[i]) + } + } + + ////if (temp <= 0) { + //// // select the token with the highest logit directly + //// float max_logit = plogits[0]; + //// llama_vocab::id max_id = 0; + //// + //// for (int i = 1; i < n_logits; ++i) { + //// if (plogits[i] > max_logit) { + //// max_logit = plogits[i]; + //// max_id = i; + //// } + //// } + //// return max_id; + ////} + + ////const auto * plogits = logits.data() + logits.size() - n_logits; + //plogits := logits[len(logits)-int(logitsCount):] // FIXME ASAP + plogits := logits[:] + + ////std::vector> logits_id; + ////logits_id.reserve(n_logits); + logitsID := make([]pair, 0, logitsCount) // FIXME LEN vs CAP + + { + scale := float32(1.0 / temp) + for i := uint32(0); i < logitsCount; i++ { + + // Repetition penalty from ctrl paper (https://arxiv.org/abs/1909.05858) + // Credit https://github.com/facebookresearch/llama/compare/main...shawwn:llama:main + + // Check if the i-th token is present in the last_n_tokens ring buffer + tokenExists := false + lastNTokens.Do(func(p interface{}) { + if p.(uint32) == i { + tokenExists = true + } + }) + + // If lastNTokens already contains i-th token, append it with repeat penalty + if tokenExists { + // If score < 0, then repetition penalty has to be multiplied to reduce the previous token probability + if plogits[i] < 0.0 { + logitsID = append(logitsID, pair{plogits[i] * scale * repeatPenalty, i}) + } else { + logitsID = append(logitsID, pair{plogits[i] * scale / repeatPenalty, i}) + } + // Else append pair to logitsID, scaling probability + } else { + logitsID = append(logitsID, pair{plogits[i] * scale, i}) + } + } + } + + if ml.DEBUG { + fmt.Printf("\n=== LOGITS ID AFTER | %d ===\n", len(logitsID)) + for i := 0; i < min(6, len(logitsID)); i++ { + fmt.Printf("{ %.3f | %d }", logitsID[i].first, logitsID[i].second) + } + fmt.Printf(" ... ") + for i := len(logitsID) - 6; i < len(logitsID)-1; i++ { + fmt.Printf("{ %.3f | %d } ", logitsID[i].first, logitsID[i].second) + } + } + + // sort logitsID slice and return only top K elements + //// sampleTopK(logitsID, topK) + + // NB! 
Inline logic for [sampleTopK] right here + + //// std::partial_sort( + //// logits_id.begin(), + //// logits_id.begin() + top_k, logits_id.end(), + //// [](const std::pair & a, const std::pair & b) { + //// return a.first > b.first; + //// }); + //// logits_id.resize(top_k); + + sort.Slice( + logitsID, // logitsID[:topK], + func(a, b int) bool { + return logitsID[a].first > logitsID[b].first + }) + + if ml.DEBUG { + fmt.Printf("\n=== LOGITS ID SORTED | TOP K = %d ===\n", topK) + for i := 0; i < min(6, len(logitsID)); i++ { + fmt.Printf("{ %.3f | %d }", logitsID[i].first, logitsID[i].second) + } + fmt.Printf(" ... ") + for i := len(logitsID) - 6; i < len(logitsID)-1; i++ { + fmt.Printf("{ %.3f | %d } ", logitsID[i].first, logitsID[i].second) + } + } + + logitsID = logitsID[:topK] + + if ml.DEBUG { + fmt.Printf("\n=== LOGITS ID RESIZED | %d ===\n", len(logitsID)) + for i := 0; i < min(6, len(logitsID)); i++ { + fmt.Printf("{ %.3f | %d }", logitsID[i].first, logitsID[i].second) + } + fmt.Printf(" ... ") + for i := len(logitsID) - 6; i < len(logitsID)-1; i++ { + fmt.Printf("{ %.3f | %d } ", logitsID[i].first, logitsID[i].second) + } + } + + // FIXME Why loop? We've already SORTED logitsID and the MAX is just the FIRST element + ////double maxl = -INFINITY; + maxl := float32(math.Inf(-1)) + for _, kv := range logitsID { + //// maxl = std::max(maxl, kv.first); + maxl = max(maxl, kv.first) + } + + // compute probs for the top k tokens + ////probs.reserve(logits_id.size()); + probs := make([]float32, 0, len(logitsID)) // FIXME LEN vs CAP + + sum := float64(0.0) + for _, kv := range logitsID { + p := math.Exp(float64(kv.first - maxl)) + probs = append(probs, float32(p)) + sum += p + } + + if ml.DEBUG { + fmt.Printf("\n=== PROBS | %d ===\n", len(probs)) + for i := 0; i < min(6, len(probs)); i++ { + fmt.Printf("%.3f ", probs[i]) + } + fmt.Printf(" ... ") + for i := len(logitsID) - 6; i < len(probs)-1; i++ { + fmt.Printf("%.3f ", probs[i]) + } + } + + // normalize the probs + for i := range probs { + probs[i] /= float32(sum) + } + + if ml.DEBUG { + fmt.Printf("\n=== PROBS NORM | %d ===\n", len(probs)) + for i := 0; i < min(6, len(probs)); i++ { + fmt.Printf("%.3f ", probs[i]) + } + fmt.Printf(" ... ") + for i := len(logitsID) - 6; i < len(probs)-1; i++ { + fmt.Printf("%.3f ", probs[i]) + } + } + + if topP < 1.0 { + + cumsum := float32(0.0) // TODO float64 for better math? + for i := uint32(0); i < uint32(len(probs)); i++ { + cumsum += probs[i] + if cumsum >= topP { + probs = probs[:i+1] + logitsID = logitsID[:i+1] + break + } + } + + cumsum = 1.0 / cumsum + for i := uint32(0); i < uint32(len(probs)); i++ { + probs[i] *= cumsum + } + } + + if ml.DEBUG { + if len(probs) > 6 { + fmt.Printf("\n=== PROBS POST | %d ===\n", len(probs)) + for i := 0; i < min(6, len(probs)); i++ { + fmt.Printf("%.3f ", probs[i]) + } + fmt.Printf(" ... ") + for i := len(logitsID) - 6; i < len(probs)-1; i++ { + fmt.Printf("%.3f ", probs[i]) + } + } + } + + ////std::discrete_distribution<> dist(probs.begin(), probs.end()); + ////int idx = dist(rng); + ////return logits_id[idx].second; + + // --- discrete distribution + // TODO Do we need something better than hand-crafted math here? 
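+	// A closer port of std::discrete_distribution would draw one uniform
+	// number and return the first index whose cumulative probability covers
+	// it, e.g. (sketch only; probs has already been normalized above):
+	//
+	//	r := rand.New(rand.NewSource(time.Now().UnixNano())).Float32()
+	//	cum := float32(0.0)
+	//	for i := range probs {
+	//		cum += probs[i]
+	//		if r < cum {
+	//			return logitsID[i].second
+	//		}
+	//	}
+	//
+	// The hand-crafted math below instead re-weights each probability by a
+	// random factor and takes the argmax, which is not the same distribution.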
+ + seed := time.Now().UnixNano() + source := rand.NewSource(seed) + + for i := 0; i < len(probs); i++ { + f := float32(source.Int63()) / (1 << 63) + probs[i] = probs[i] * probs[i] * f * f + } + + idx := 0 + maxProb := probs[0] + for i := 1; i < len(probs); i++ { + if probs[i] > maxProb { + idx = i + maxProb = probs[i] + } + } + + if ml.DEBUG { + fmt.Printf("\nidx = %d", idx) + fmt.Printf("\nlogitsID = %d | weight = %f", logitsID[idx].second, logitsID[idx].first) + } + + return logitsID[idx].second +} + +// evaluate the transformer calculated by NodeID and return graphs +// +// - lctx: llama context +// - tokens: new batch of tokens to process +// - n_past: the context size so far +// - n_threads: number of threads to use +// +func ExpandGraph( + + lctx *Context, + tokens []uint32, + tokensCount uint32, + pastCount uint32, + threadsCount int) (*ml.Graph, *ml.Context, error) { + + N := tokensCount + model := lctx.Model + kvSelf := model.kvSelf + + embdSize := model.hparams.embdSize + layersCount := model.hparams.layersCount + ctxSize := model.hparams.ctxSize + headsCount := model.hparams.headsCount + rotCount := model.hparams.embdSize / model.hparams.headsCount + + ctx0 := &ml.Context{} //ctx0 := ml.Init(ml.InitParams{}) + + // for big prompts, if BLAS is enabled, it is better to use only one thread + // otherwise, the threads are spin-lock waiting for the BLAS calls and are degrading the performance + graph := ml.Graph{ThreadsCount: threadsCount} + + // Convert the tokens to a []float32 slice + tokensFloat32 := make([]float32, len(tokens)) + for i, token := range tokens { + tokensFloat32[i] = float32(token) + } + + // Initialize the embd tensor with the tokensFloat32 data + embd := ml.NewTensor(ctx0, ml.TYPE_F32, 1, uint32(len(tokens)), 1, 1, 1, tokensFloat32) + inpL := ml.GetRows(ctx0, model.tokEmbeddings, embd) + + for il := uint32(0); il < layersCount; il++ { + + //if il > 0 { + // break // DEBUG + //} + + inpSA := inpL + cur := &ml.Tensor{} + + // norm + cur = ml.RMSNorm(ctx0, inpL) + + // cur = attention_norm*cur + rep := ml.Repeat(ctx0, model.layers[il].attentionNorm, cur) + + cur = ml.Mul(ctx0, rep, cur) + + // self-attention + { + Qcur := ml.MulMat(ctx0, model.layers[il].wq, cur) + Kcur := ml.MulMat(ctx0, model.layers[il].wk, cur) + Vcur := ml.MulMat(ctx0, model.layers[il].wv, cur) + + // store key and value to memory + if N >= 1 { + + ////struct ggml_tensor * k = ggml_view_1d(ctx0, kv_self.k, N*n_embd, (ggml_element_size(kv_self.k)*n_embd)*(il*n_ctx + n_past)); + ////struct ggml_tensor * v = ggml_view_1d(ctx0, kv_self.v, N*n_embd, (ggml_element_size(kv_self.v)*n_embd)*(il*n_ctx + n_past)); + + ////ggml_build_forward_expand(&graph, ggml_cpy(ctx0, Kcur, k)); + ////ggml_build_forward_expand(&graph, ggml_cpy(ctx0, Vcur, v)); + + // NB! 
ggml_element_size(kv_self.k) = 2 for FP16 + k := ml.View1D(ctx0, kvSelf.K, N*embdSize, embdSize*(il*ctxSize+pastCount)) + v := ml.View1D(ctx0, kvSelf.V, N*embdSize, embdSize*(il*ctxSize+pastCount)) + + ml.BuildForwardExpand(&graph, ml.Copy(ctx0, Kcur, k)) + ml.BuildForwardExpand(&graph, ml.Copy(ctx0, Vcur, v)) + } + + // Q = Qcur.contiguous().view(n_embd/n_head, n_head, N).permute(0, 2, 1, 3) + Q := + ml.Permute(ctx0, + ml.Rope(ctx0, + ml.Copy(ctx0, + Qcur, + ml.NewTensor3D(ctx0, ml.TYPE_F32, embdSize/headsCount, headsCount, N)), + pastCount, rotCount, 0), + 0, 2, 1, 3) + + // K = Kmem.view(n_embd/n_head, n_head, n_past + N).permute(0, 2, 1, 3) + K := + ml.Permute(ctx0, + ml.Rope(ctx0, + ml.Reshape3D(ctx0, + ////ggml_view_1d(ctx0, kv_self.k, (n_past + N)*n_embd, il*n_ctx*ggml_element_size(kv_self.k)*n_embd), + ////n_embd/n_head, n_head, n_past + N), + ml.View1D(ctx0, kvSelf.K, (pastCount+N)*embdSize, il*ctxSize*embdSize), + embdSize/headsCount, headsCount, pastCount+N), + pastCount, rotCount, 1), + 0, 2, 1, 3) + + // K * Q + ////struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q); + KQ := ml.MulMat(ctx0, K, Q) + + // KQ_scaled = KQ / sqrt(n_embd/n_head) + KQScaled := + ml.Scale(ctx0, + KQ, + ml.NewFP32(ctx0, float32(1.0/math.Sqrt(float64(embdSize)/float64(headsCount)))), + ) + + // KQ_masked = mask_past(KQ_scaled) + ////struct ggml_tensor * KQ_masked = ggml_diag_mask_inf(ctx0, KQ_scaled, n_past); + KQMasked := ml.DiagMaskInf(ctx0, KQScaled, pastCount) + + // KQ = soft_max(KQ_masked) + ////struct ggml_tensor * KQ_soft_max = ggml_soft_max(ctx0, KQ_masked); + KQSoftMax := ml.SoftMax(ctx0, KQMasked) + + // V_trans = Vmem.view(n_embd/n_head, n_head, n_past + N).permute(1, 2, 0, 3).contiguous() + VTrans := + ml.Copy(ctx0, + ml.Permute(ctx0, + ml.Reshape3D(ctx0, + ml.View1D(ctx0, kvSelf.V, (pastCount+N)*embdSize, il*ctxSize*embdSize), + embdSize/headsCount, headsCount, pastCount+N), + 1, 2, 0, 3), + ml.NewTensor3D(ctx0, ml.TYPE_F32 /* kv_self.v->type */, pastCount+N, embdSize/headsCount, headsCount)) + + // KQV = transpose(V) * KQ_soft_max + KQV := ml.MulMat(ctx0, VTrans, KQSoftMax) + + // KQV_merged = KQV.permute(0, 2, 1, 3) + KQVMerged := ml.Permute(ctx0, KQV, 0, 2, 1, 3) + + // cur = KQV_merged.contiguous().view(n_embd, N) + cur = ml.Copy(ctx0, + KQVMerged, + ml.NewTensor2D(ctx0, ml.TYPE_F32, embdSize, N)) + + // projection (no bias) + cur = ml.MulMat(ctx0, + model.layers[il].wo, + cur) + } + + inpFF := ml.Add(ctx0, cur, inpSA) + + // feed-forward network + { + // norm + { + cur = ml.RMSNorm(ctx0, inpFF) + + // cur = ffn_norm*cur + cur = ml.Mul(ctx0, + ml.Repeat(ctx0, model.layers[il].ffn_norm, cur), + cur) + } + + tmp := ml.MulMat(ctx0, + model.layers[il].w3, + cur) + + cur = ml.MulMat(ctx0, + model.layers[il].w1, + cur) + + // SILU activation + cur = ml.Silu(ctx0, cur) + + cur = ml.Mul(ctx0, cur, tmp) + + cur = ml.MulMat(ctx0, + model.layers[il].w2, + cur) + } + + cur = ml.Add(ctx0, cur, inpFF) + + // input for next layer + inpL = cur + + } + + // used at the end to optionally extract the embeddings + ////var embeddings *ml.Tensor + + // --- norm + + inpL = ml.RMSNorm(ctx0, inpL) + + // inpL = norm*inpL + inpL = ml.Mul(ctx0, + ml.Repeat(ctx0, model.norm, inpL), + inpL) + + + // lm_head + inpL = ml.MulMat(ctx0, model.output, inpL) + + // add an mock op here + zeroTensor := ml.NewTensor2D(ctx0, inpL.Type, inpL.NE[0], inpL.NE[1]) + inpL = ml.Add(ctx0, inpL, zeroTensor) + + // logits -> probs + // COMMENTED inpL = ggml_soft_max(ctx0, inpL); + + // run the computation + 
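+	// (Here only the graph is built: unlike Eval, ExpandGraph returns without
+	// computing anything, and the caller decides how far to run it, e.g. via
+	// ml.GraphComputeByNodes with a chosen node id as llama_test.go does.)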
ml.BuildForwardExpand(&graph, inpL) + + // ml.GraphComputeByNodes(ctx0, &graph, nodeID) + + return &graph, ctx0, nil +} + + +// llama_model_load +// load the model's weights from a file +// see convert-pth-to-ggml.py for details on format + +func LoadModel( + fileName string, + //partsCount int, + silent bool, +) (*Context, error) { + + lctx := NewContext() + + file, err := os.Open(fileName) + if err != nil { + return nil, err + } + defer file.Close() + + // --- check header magic and format version + + magic := readInt(file) + + if magic == LLAMA_FILE_MAGIC_UNVERSIONED || magic == LLAMA_FILE_MAGIC_OLD { + fmt.Printf("\n[ERROR] Invalid model file '%s'! Too old, regenerate!", fileName) + return nil, fmt.Errorf("invalid model file") + } + + if magic != LLAMA_FILE_MAGIC { + fmt.Printf("\n[ERROR] Invalid model file '%s'! Wrong MAGIC in header", fileName) + return nil, fmt.Errorf("invalid model file") + } + + version := readInt(file) + + if version != LLAMA_FILE_VERSION { + fmt.Printf("\n[ERROR] Invalid model file '%s'! Unsupported version", fileName) + return nil, fmt.Errorf("invalid model file") + } + + // --- load hparams + + vocabSize := readInt(file) // vocab_size + embdSize := readInt(file) // dim + multSize := readInt(file) // multiple_of + headsCount := readInt(file) // n_heads + layersCount := readInt(file) // n_layers + rotCount := readInt(file) // rot = dim // n_heads [obsolete] + f16 := readInt(file) // ftype + + model := lctx.Model + + model.hparams.vocabSize = vocabSize + model.hparams.embdSize = embdSize + model.hparams.multSize = multSize + model.hparams.headsCount = headsCount + model.hparams.layersCount = layersCount + model.hparams.rotCount = rotCount + model.hparams.f16 = f16 + + // --- init cache + //KVCacheInit(&lctx.Model.hparams, &lctx.Model.kvSelf, ml.TYPE_F32) + dt := ml.TYPE_F32 + size := embdSize * layersCount * 512 /*ctxSize*/ // FIXME ctxSize + lctx.Model.kvSelf.K = ml.NewTensor1D(nil, dt, size) + lctx.Model.kvSelf.V = ml.NewTensor1D(nil, dt, size) + + // NB! Do not try to resize / relocate secondary pointers + lctx.Vocab = ml.NewVocab(vocabSize) + vocab := lctx.Vocab + + // FIXME Reserve extra space for tokensCount (N) = 8 (as with LogitsAll == true) + //lctx.Logits = make([]float32, vocabSize*8, vocabSize*8) // NewFloatSlice(vocabSize, vocabSize) // FIXME ASAP + lctx.Logits = make([]float32, vocabSize, vocabSize) // use just vocab size as CPP version does by default + + if ml.DEBUG { + fmt.Printf("\nvocab = %d", vocabSize) + fmt.Printf("\nembd = %d", embdSize) + fmt.Printf("\nmult = %d", multSize) + fmt.Printf("\nheads = %d", headsCount) + fmt.Printf("\nlayers = %d", layersCount) + fmt.Printf("\nrot = %d", rotCount) + fmt.Printf("\nf16 = %d", f16) + } + + //fmt.Printf("\nctx = %d", hparamsCtx) + //fmt.Printf("\nn_ff = %d", n_ff) + + n_ff := ((2*(4*embdSize)/3 + multSize - 1) / multSize) * multSize + + // --- load vocab + + if !silent && runtime.GOOS == "windows" { + Colorize("[magenta][ INIT ][white] Loading vocab...") + } + + vocabBar := progressbar.NewOptions( + int(vocabSize), + progressbar.OptionFullWidth(), + //progressbar.OptionSetWidth(40), + progressbar.OptionEnableColorCodes(true), + progressbar.OptionSetPredictTime(false), + progressbar.OptionSetElapsedTime(false), + progressbar.OptionSetDescription("[light_magenta][ INIT ][light_blue] Loading model vocab... 
[light_cyan]"), + progressbar.OptionSetTheme(progressbar.Theme{ + Saucer: "[light_magenta]▒[reset]", + SaucerHead: "[white]▒[reset]", + SaucerPadding: "[dark_gray]▒[reset]", + BarStart: "[dark_gray]║[reset]", + BarEnd: "[dark_gray]║[reset]", + })) + + for i := uint32(0); i < vocabSize; i++ { + + if !silent && runtime.GOOS != "windows" && i%100 == 0 { + vocabBar.Set(int(i)) + } + + length := readInt(file) + token := readString(file, length) + score := readFP32(file) + + vocab.Token2ID[token] = i + vocab.ID2Token[i] = ml.TokenScore{Token: token, Score: score} + } + + if !silent && runtime.GOOS != "windows" { + vocabBar.Finish() + fmt.Printf("\n") + } + + ctx := model.ctx + + // --- prepare memory for the weights + { + model.tokEmbeddings = ml.NewTensor2D(ctx, ml.TYPE_F32 /*wtype*/, embdSize, vocabSize) + + model.norm = ml.NewTensor1D(ctx, ml.TYPE_F32, embdSize) + model.output = ml.NewTensor2D(ctx, ml.TYPE_F32 /*wtype*/, embdSize, vocabSize) + + // map by name + model.tensors["tok_embeddings.weight"] = model.tokEmbeddings + + model.tensors["norm.weight"] = model.norm + model.tensors["output.weight"] = model.output + + model.layers = make([]Layer, layersCount) + for i := uint32(0); i < layersCount; i++ { + //auto & layer = model.layers[i]; + + model.layers[i].attentionNorm = ml.NewTensor1D(ctx, ml.TYPE_F32, embdSize) + + model.layers[i].wq = ml.NewTensor2D(ctx, ml.TYPE_F32 /*wtype*/, embdSize, embdSize) + model.layers[i].wk = ml.NewTensor2D(ctx, ml.TYPE_F32 /*wtype*/, embdSize, embdSize) + model.layers[i].wv = ml.NewTensor2D(ctx, ml.TYPE_F32 /*wtype*/, embdSize, embdSize) + model.layers[i].wo = ml.NewTensor2D(ctx, ml.TYPE_F32 /*wtype*/, embdSize, embdSize) + + model.layers[i].ffn_norm = ml.NewTensor1D(ctx, ml.TYPE_F32, embdSize) + + model.layers[i].w1 = ml.NewTensor2D(ctx, ml.TYPE_F32 /*wtype*/, embdSize, n_ff) + model.layers[i].w2 = ml.NewTensor2D(ctx, ml.TYPE_F32 /*wtype*/, n_ff, embdSize) + model.layers[i].w3 = ml.NewTensor2D(ctx, ml.TYPE_F32 /*wtype*/, embdSize, n_ff) + + // map by name + prefix := fmt.Sprintf("layers.%d.", i) + + model.tensors[prefix+"attention_norm.weight"] = model.layers[i].attentionNorm + + model.tensors[prefix+"attention.wq.weight"] = model.layers[i].wq + model.tensors[prefix+"attention.wk.weight"] = model.layers[i].wk + model.tensors[prefix+"attention.wv.weight"] = model.layers[i].wv + model.tensors[prefix+"attention.wo.weight"] = model.layers[i].wo + + model.tensors[prefix+"ffn_norm.weight"] = model.layers[i].ffn_norm + + model.tensors[prefix+"feed_forward.w1.weight"] = model.layers[i].w1 + model.tensors[prefix+"feed_forward.w2.weight"] = model.layers[i].w2 + model.tensors[prefix+"feed_forward.w3.weight"] = model.layers[i].w3 + } + } + + if !silent && runtime.GOOS == "windows" { + Colorize("\n[magenta][ INIT ][white] Loading model - please wait ...") + } + + // https://pkg.go.dev/github.com/schollz/progressbar/v3#Option + bar := progressbar.NewOptions(int(layersCount*9), + progressbar.OptionFullWidth(), + //progressbar.OptionSetWidth(40), + progressbar.OptionEnableColorCodes(true), + progressbar.OptionSetPredictTime(false), + progressbar.OptionSetElapsedTime(false), + progressbar.OptionSetDescription("[light_magenta][ INIT ][light_blue] Loading model weights...[light_cyan]"), + progressbar.OptionSetTheme(progressbar.Theme{ + Saucer: "[light_magenta]▒[reset]", + SaucerHead: "[white]▒[reset]", + SaucerPadding: "[dark_gray]▒[reset]", + BarStart: "[dark_gray]║[reset]", + BarEnd: "[dark_gray]║[reset]", + })) + + // --- load weights + + var tensorsCount uint32 + for { + + 
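+		// Each GGJT tensor record is laid out as: uint32 n_dims, uint32 name
+		// length, uint32 data type, then n_dims x uint32 shape values, the
+		// name bytes, padding up to the next 32-byte boundary, and finally the
+		// raw tensor data (FP16 or FP32).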
dims := readInt(file) + if dims < 1 || dims > 2 { // TODO Check for EOF + break + } + + nameLength := readInt(file) + shardType := ml.DType(readInt(file)) + + nelements := 1 + ne := [2]uint32{1, 1} + for i := 0; i < int(dims); i++ { + ne[i] = readInt(file) + nelements *= int(ne[i]) + } + + name := readString(file, nameLength) + if _, ok := model.tensors[name]; !ok { + fmt.Printf("\n[ERROR] Unknown tensor '%s' in model file", name) + os.Exit(1) + } + + if ml.DEBUG { + typeStr := "FP32" + if shardType == ml.TYPE_F16 { + typeStr = "FP16" + } + memStr := fmt.Sprintf("%dM", nelements*4/1024/1024) + fmt.Printf("\n=== LAYER #%d === %s | %s | %s ===", tensorsCount, typeStr, name, memStr) + } + + /* The latest GGJT format is always ONE-PART-NO-SPLIT-TENSORS binary file, so the parsing is really streamlined + + partsCount := LLAMA_N_PARTS[embdSize] + splitType := SPLIT_NONE + if partsCount > 1 && dims > 1 { + splitType = SPLIT_BY_COLUMNS + if strings.Contains(name, "output") { + splitType = SPLIT_NONE + } else if strings.Contains(name, "layers") && + !strings.Contains(name, "attention.wo.weight") && + !strings.Contains(name, "feed_forward.w2.weight") { + splitType = SPLIT_NONE + } + } + */ + + tensor := model.tensors[name] + tensorSize := tensor.Nelements() + + // --- all tensors in file are aligned for 32 bytes + + alignment := int64(32) + offset, _ := file.Seek(0, io.SeekCurrent) + for ; offset%alignment != 0; offset++ { + } + file.Seek(offset, io.SeekStart) + + // --- read tensor into memory + + if shardType == ml.TYPE_F16 { + // FIXME Single-dimension tensors always presented as FP32 + // after conversion from PyTorch even for FP16 models + for n := uint32(0); n < tensorSize; n++ { + tensor.Data[n] = readFP16ToFP32(file) + } + } else if shardType == ml.TYPE_F32 { + var fake []byte + fakeHeader := (*reflect.SliceHeader)(unsafe.Pointer(&fake)) + // NB! unsafe.Pointer(tensor.Data) for *Data VS unsafe.Pointer(&tensor.Data) for Data + dataHeader := (*reflect.SliceHeader)(unsafe.Pointer(&tensor.Data)) + + fakeHeader.Data = dataHeader.Data + fakeHeader.Len = int(tensorSize * 4) + fakeHeader.Cap = int(tensorSize * 4) + + //fmt.Printf("\n== FAKE []BYTE LEN = %d", len(fake)) + if count, err := io.ReadFull(file, fake); err != nil || count != int(tensorSize*4) { + fmt.Printf("\n[ERROR] Failed to read BIG FP32 chunk from model!") + fmt.Printf("\n[ERROR] COUNT = %d | ERR = %s", count, err.Error()) + os.Exit(1) + } + } else { + fmt.Printf("\n[ERROR] Tensor data type is not supported yet!") + os.Exit(0) + } + + tensorsCount++ + model.loadedCount++ + if !silent && runtime.GOOS != "windows" { + bar.Add(1) + } + } + + if !silent && runtime.GOOS != "windows" { + bar.Finish() + } + + return lctx, nil +} + +func max(a, b float32) float32 { + if a >= b { + return a + } + return b +} + +// NB! 
INT = 32 bits +func readInt(file *os.File) uint32 { + buf := make([]byte, 4) + if count, err := file.Read(buf); err != nil || count != 4 { + return 0 + } + return uint32(buf[3])<<24 | uint32(buf[2])<<16 | uint32(buf[1])<<8 | uint32(buf[0]) +} + +func readString(file *os.File, len uint32) string { + buf := make([]byte, len) + if count, err := file.Read(buf); err != nil || count != int(len) { + return "" + } + return string(buf) +} + +func readFP16ToFP32(file *os.File) float32 { + buf := make([]byte, 2) + if count, err := file.Read(buf); err != nil || count != 2 { + return 0.0 + } + bits := uint16(buf[1])<<8 | uint16(buf[0]) + f16 := float16.Frombits(bits) + return f16.Float32() +} + +func readFP32(file *os.File) float32 { + buf := make([]byte, 4) + if count, err := file.Read(buf); err != nil || count != 4 { + return 0.0 + } + bits := uint32(buf[3])<<24 | uint32(buf[2])<<16 | uint32(buf[1])<<8 | uint32(buf[0]) + return math.Float32frombits(bits) +} + +// ExtractTokens is a function to extract a slice of tokens from the ring buffer +func ExtractTokens(r *ring.Ring, count int) []uint32 { + tokens := make([]uint32, count) + for i := 0; i < count; i++ { + tokens[i] = r.Value.(uint32) + r = r.Next() + } + return tokens +} + +func Colorize(format string, opts ...interface{}) (n int, err error) { + var DefaultOutput = colorable.NewColorableStdout() + return fmt.Fprintf(DefaultOutput, colorstring.Color(format), opts...) +} diff --git a/examples/llama/llama_go/llama_test.go b/examples/llama/llama_go/llama_test.go new file mode 100644 index 0000000..f29a7e8 --- /dev/null +++ b/examples/llama/llama_go/llama_test.go @@ -0,0 +1,91 @@ +package llama + +import ( + "fmt" + "mlgo/ml" + "os" + "reflect" + "testing" +) + +func TestLLaMA(t *testing.T) { + modelFile := "/home/iiislab/project/web3_dl/reference/models/llama-7b-fp32.bin.2" + prompt := "Why Golang is so popular?" + threadCount := 32 + ctx, err := LoadModel(modelFile, true) + fmt.Println("Load Model Finish") + if err != nil { + fmt.Println("load model error: ", err) + return + } + embd := ml.Tokenize(ctx.Vocab, prompt, true) + err = Eval(ctx, embd, uint32(len(embd)), 0, threadCount) + fmt.Println("Eval Model Finish") +} + +func TestLLaMAEvalGraph(t *testing.T) { + modelFile := "/home/iiislab/project/web3_dl/reference/models/llama-7b-fp32.bin.2" + prompt := "Why Golang is so popular?" 
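+	// This test exercises the save/decode round trip for one graph node: expand
+	// the compute graph, run it up to the last node, serialize that node's
+	// environment to bytes (also written to ../data/node_<id>), decode the bytes
+	// back, rebuild the tensors from the decoded list and recompute the node,
+	// printing the tensor before and after so the two paths can be compared.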
+ threadCount := 32 + ctx, err := LoadModel(modelFile, true) + fmt.Println("Load Model Finish") + if err != nil { + fmt.Println("load model error: ", err) + return + } + embd := ml.Tokenize(ctx.Vocab, prompt, true) + graph, mlctx, err := ExpandGraph(ctx, embd, uint32(len(embd)), 0, threadCount) + nodeID := int(graph.NodesCount) - 1 + ml.GraphComputeByNodes(mlctx, graph, nodeID) + ml.PrintTensor(graph.Nodes[nodeID], "before") + + envBytes := ml.SaveComputeNodeEnvToBytes(uint32(nodeID), graph.Nodes[nodeID], graph, true) + nodeID_, tensorGraphList_ , err := ml.DecodeComputeNodeEnv(envBytes, true, false) + // save bytes from mips + { + fout, err := os.Create(fmt.Sprintf("../data/node_%v", nodeID)) + if err != nil { + fmt.Println(err) + return + } + defer fout.Close() + _, err = fout.Write(envBytes) + if err != nil { + fmt.Println(err) + return + } + } + // save => tensorOnGraph[] + tensorGraphList := ml.SaveComputeNodeEnv(graph.Nodes[nodeID], graph) + fmt.Println("nodeID Equal: ", nodeID_ == uint32(nodeID)) + fmt.Println("tensorGraphList: ", reflect.DeepEqual(tensorGraphList_, tensorGraphList)) + + // reconstruct + tensorList := make([]*ml.Tensor, 0) + tensorMap := make(map[uint32]*ml.Tensor) + for i := 0; i < len(tensorGraphList); i++ { + tensor := tensorGraphList[i].ToTensor(nil) + tensorMap[tensorGraphList[i].NodeID] = tensor + tensorList = append(tensorList, tensor) + } + // fill in the nodeid + for i := 0; i < len(tensorList); i++ { + tensor := tensorList[i] + tensorG := tensorGraphList[i] + if src0, ok := tensorMap[tensorG.Src0NodeID]; ok { + tensor.Src0 = src0 + } + if src1, ok := tensorMap[tensorG.Src1NodeID]; ok { + tensor.Src1 = src1 + } + } + + // compute + ml.ComputeNodeForward(tensorMap[uint32(nodeID)]) + + // ml.ComputeNodeForward(graph.Nodes[nodeID]) + ml.PrintTensor(tensorMap[uint32(nodeID)], "after") + + fmt.Println("graph node number: ", graph.NodesCount) + fmt.Println("Eval Model Finish") +} \ No newline at end of file diff --git a/examples/llama/main.go b/examples/llama/main.go new file mode 100644 index 0000000..894f156 --- /dev/null +++ b/examples/llama/main.go @@ -0,0 +1,351 @@ +package main + +import ( + "container/ring" + "fmt" + "os" + "runtime" + "strings" + + "github.com/jessevdk/go-flags" + + "mlgo/ml" + "mlgo/examples/llama/llama_go" +) + +type ModelParams struct { + seed int + threadsCount int + predictCount uint32 // new tokens to predict + repeatLastN uint32 // last n tokens to penalize + partsCount int // amount of model parts (-1 = determine from model dimensions) + ctxSize uint32 // context size + batchSize uint32 // batch size for prompt processing + keepCount uint32 + + // --- sampling parameters + + topK uint32 // 40 + topP float32 // 0.95 + temp float32 // 0.80 + repeatPenalty float32 // 1.10 + + model string // model path + prompt string + inputPrefix string // string to prefix user inputs with + + antiprompt []string // string upon seeing which more user input is prompted + + memoryFP16 bool // use f16 instead of f32 for memory kv + randomPrompt bool // do not randomize prompt if none provided + useColor bool // use color to distinguish generations and inputs + interactive bool // interactive mode + + embedding bool // get only sentence embedding + interactiveStart bool // wait for user input immediately + + instruct bool // instruction mode (used for Alpaca models) + ignoreEOS bool // do not stop generating after eos + perplexity bool // compute perplexity over the prompt + use_mlock bool // use mlock to keep model in memory + memTest bool // compute 
maximum memory usage + + verbosePrompt bool +} + +func main() { + + // --- Parse command line args and set default parameters + + var opts struct { + Prompt string `long:"prompt" description:"Text prompt from user to feed the model input"` + Model string `long:"model" description:"Path and file name of converted .bin LLaMA model"` + Threads int `long:"threads" description:"Adjust to the number of CPU cores you want to use [ all cores by default ]"` + Predict uint32 `long:"predict" description:"Number of tokens to predict [ 128 by default ]"` + Context uint32 `long:"context" description:"Context size in tokens [ 512 by default ]"` + Temp float32 `long:"temp" description:"Model temperature hyper parameter [ 0.8 by default ]"` + Silent bool `long:"silent" description:"Hide welcome logo and other output [ show by default ]"` + Chat bool `long:"chat" description:"Chat with user in interactive mode instead of compute over static prompt"` + } + + _, err := flags.Parse(&opts) + if err != nil { + return + } + + prompt := " " + opts.Prompt // add a space to match LLaMA tokenizer behavior + final := "" // accumulate model output + + // Allow to use ALL cores for the program itself and user-specified number for tensor math + // TODO Optimize default settings for CPUs with P and E cores like M1 Pro = 8 performant and 2 energy cores + runtime.GOMAXPROCS(runtime.NumCPU()) + if opts.Threads == 0 { + opts.Threads = runtime.NumCPU() + } + + if opts.Context == 0 { + opts.Context = 512 + } + + if opts.Predict == 0 { + opts.Predict = 128 + } + + if opts.Temp == 0 { + opts.Temp = 0.8 + } + + repeatLastN := uint32(64) + if repeatLastN > opts.Context { + repeatLastN = opts.Context + } + + if !opts.Silent { + showLogo() + } + + if opts.Prompt == "" || opts.Model == "" { + llama.Colorize("\n[magenta][ ERROR ][white] Please specify correct model path and prompt with [light_magenta]--model[white] and [light_magenta]--prompt[white] parameters\n\n") + os.Exit(0) + } + + params := ModelParams{ + model: opts.Model, + interactive: opts.Chat, + + ctxSize: opts.Context, + seed: -1, + threadsCount: opts.Threads, + predictCount: opts.Predict, + repeatLastN: repeatLastN, + partsCount: -1, + batchSize: 8, + + topK: 40, + topP: 0.95, + temp: opts.Temp, + repeatPenalty: 1.10, + + memoryFP16: true, + } + + // --- load the model + + ctx, err := llama.LoadModel(params.model, opts.Silent) + if err != nil { + _, err := llama.Colorize("\n[magenta][ ERROR ][white] Failed to load model [light_magenta]\"%s\"\n\n", params.model) + if err != nil { + return + } + os.Exit(0) + } + + // tokenize the prompt + embdInp := ml.Tokenize(ctx.Vocab, prompt, true) + tokenNewline := ml.Tokenize(ctx.Vocab, "\n", false)[0] + + var embd []uint32 + + // Initialize the ring buffer + lastNTokens := ring.New(int(params.ctxSize)) + + for i := 0; i < int(params.ctxSize); i++ { + lastNTokens.Value = uint32(0) + lastNTokens = lastNTokens.Next() + } + + // A function to append a token to the ring buffer + appendToken := func(token uint32) { + lastNTokens.Value = token + lastNTokens = lastNTokens.Next() + } + + inputNoEcho := false + pastCount := uint32(0) + remainCount := params.predictCount + consumedCount := uint32(0) + + for remainCount != 0 || params.interactive { + + // --- predict + + if len(embd) > 0 { + + // infinite text generation via context swapping + // if we run out of context: + // - take the n_keep first tokens from the original prompt (via n_past) + // - take half of the last (n_ctx - n_keep) tokens and recompute the logits in a batch + + if 
pastCount+uint32(len(embd)) > params.ctxSize { + leftCount := pastCount - params.keepCount + pastCount = params.keepCount + + // insert n_left/2 tokens at the start of embd from last_n_tokens + //embd = append(lastNTokens[:leftCount/2], embd...) + embd = append(llama.ExtractTokens(lastNTokens.Move(-int(leftCount/2)), int(leftCount/2)), embd...) + } + + if err := llama.Eval(ctx, embd, uint32(len(embd)), pastCount, params.threadsCount); err != nil { + fmt.Printf("\n[ERROR] Failed to eval") + os.Exit(1) + } + } + + pastCount += uint32(len(embd)) + embd = []uint32{} + + if len(embdInp) <= int(consumedCount) { // && !isInteracting { + + if params.ignoreEOS { + ctx.Logits[ml.TOKEN_EOS] = 0 + } + + /* + id := llama.SampleTopPTopK(ctx, + lastNTokens[params.ctxSize-params.repeatLastN:], params.repeatLastN, + params.topK, params.topP, params.temp, params.repeatPenalty) + + lastNTokens = lastNTokens[1:] ////last_n_tokens.erase(last_n_tokens.begin()); + lastNTokens = append(lastNTokens, id) + + */ + id := llama.SampleTopPTopK(ctx, + lastNTokens, params.repeatLastN, + params.topK, params.topP, params.temp, params.repeatPenalty) + + appendToken(id) + + // replace end of text token with newline token when in interactive mode + if id == ml.TOKEN_EOS && params.interactive && !params.instruct { + id = tokenNewline + } + + // add it to the context + embd = append(embd, id) + + // echo this to console + inputNoEcho = false + + // decrement remaining sampling budget + remainCount-- + + } else { + + // some user input remains from prompt or interaction, forward it to processing + /* + for len(embdInp) > int(consumedCount) { + embd = append(embd, embdInp[consumedCount]) + if len(lastNTokens) > 0 { + lastNTokens = lastNTokens[1:] + } + lastNTokens = append(lastNTokens, embdInp[consumedCount]) + consumedCount++ + if len(embd) >= int(params.batchSize) { + break + } + } + */ + for len(embdInp) > int(consumedCount) { + embd = append(embd, embdInp[consumedCount]) + appendToken(embdInp[consumedCount]) + consumedCount++ + if len(embd) >= int(params.batchSize) { + break + } + } + } + + // --- display text + + if !inputNoEcho { + for _, id := range embd { + + token := ml.Token2Str(ctx.Vocab, id) + final += token + + if len(strings.TrimSpace(final)) < len(strings.TrimSpace(prompt)) { + continue + } + + out := strings.Split(final, prompt) + + if len(out) == 2 && token == "\n" { + continue + } + + if len(strings.TrimSpace(final)) == len(strings.TrimSpace(prompt)) && (token != "\n") && (len(out) == 2) { + _, err := llama.Colorize("\n\n[magenta]▒▒▒ [light_yellow]" + strings.TrimSpace(prompt) + "\n[light_blue]▒▒▒ ") + if err != nil { + return + } + continue + } + + _, err := llama.Colorize("[white]" + token) + if err != nil { + return + } + + } + } + } +} + +// Colorize is a wrapper for colorstring.Color() and fmt.Fprintf() +// Join colorstring and go-colorable to allow colors both on Mac and Windows +// TODO: Implement as a small library +// func Colorize(format string, opts ...interface{}) (n int, err error) { +// var DefaultOutput = colorable.NewColorableStdout() +// return fmt.Fprintf(DefaultOutput, colorstring.Color(format), opts...) 
+// } + +func showLogo() { + // Read the version from the 'VERSION' file + version, err := os.ReadFile("VERSION") + if err != nil { + fmt.Printf("[ERROR] Failed to read VERSION file") + os.Exit(1) + } + versionStr := strings.TrimSpace(string(version)) + + // https://patorjk.com/software/taag/#p=display&f=3-D&t=llama.go%0A%0ALLaMA.go + // Isometric 1, Modular, Rectangles, Rozzo, Small Isometric 1, 3-D + + logo := ` + /88 /88 /888/888 /88/8888/88 /888/888 /8888/88 /888/888 + /888 /888 /888/ /888 /888/8888/888 /888/ /888 /8888 // /8888//888 + /8888/88 /8888/88 /8888/8888 /888/8888/888 /8888/8888 /88 /8888/8888 /888 /8888 + /8888/888 /8888/888 /888 /8888 /888//88 /888 /888 /8888 /888//8888/88 //888/888 + //// /// //// /// /// //// /// // /// /// //// /// //// // /// ///` + + logoColored := "" + prevColor := "" + color := "" + line := 0 + colors := []string{"[black]", "[light_blue]", "[magenta]", "[light_magenta]", "[light_blue]"} + + for _, char := range logo { + if char == '\n' { + line++ + } else if char == '/' { + color = "[blue]" + } else if char == '8' { + color = colors[line] + char = '▒' + } + if color == prevColor { + logoColored += string(char) + } else { + logoColored += color + string(char) + } + } + + _, err = llama.Colorize(logoColored) + if err != nil { + return + } + _, err = llama.Colorize("\n\n [magenta]▒▒▒▒[light_magenta] [ LLaMA.go v" + versionStr + " ] [light_blue][ LLaMA GPT in pure Golang - based on LLaMA C++ ] [magenta]▒▒▒▒\n\n") + if err != nil { + return + } +} diff --git a/examples/mnist/README.md b/examples/mnist/README.md new file mode 100644 index 0000000..ed0821f --- /dev/null +++ b/examples/mnist/README.md @@ -0,0 +1,29 @@ +# MNIST Example for GGML + +This is a simple example of how to use GGML for inferencing. + +## Training the Model + +A notebook for training a simple two-layer network to recognize digits is located at `trainning/mnist.ipynb`. You can +use this to save a pytorch model to be converted to ggml format. + + + +## GGML Format Conversion + +GGML "format" is whatever you choose for efficient loading. In our case, we just save the hyperparameters used +plus the model weights and biases. Run convert-h5-to-ggml.py to convert your pytorch model. The output format is: + +- magic constant (int32) +- repeated list of tensors +- number of dimensions of tensor (int32) +- tensor dimension (int32 repeated) +- values of tensor (int32) + +Run ```convert-h5-to-ggml.py mnist_model.state_dict``` where `mnist_model.state_dict` is the saved pytorch model from the notebook. + +## MNIST Network + +The MNIST recognizer network is extremely simple. A fully connected layer + relu, followed by a fully connected layer + softmax. This +version of the MNIST network doesn't use convolutions. + diff --git a/examples/mnist/convert-h5-to-ggml.py b/examples/mnist/convert-h5-to-ggml.py new file mode 100644 index 0000000..083a933 --- /dev/null +++ b/examples/mnist/convert-h5-to-ggml.py @@ -0,0 +1,75 @@ +# Convert MNIS h5 transformer model to ggml format +# +# Load the (state_dict) saved model using PyTorch +# Iterate over all variables and write them to a binary file. 
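+#
+# Byte-order note: with IS_BIGENDIAN = True below, header ints are packed with
+# the "!i" (big-endian) struct format and tensor data is written as ">f4"
+# big-endian float32 -- the layout the MIPS-side loader (READ_FROM_BIDENDIAN)
+# reads. With IS_BIGENDIAN = False the native (typically little-endian)
+# layout is used instead.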
+# +# For each variable, write the following: +# - Number of dimensions (int) +# - Name length (int) +# - Dimensions (int[n_dims]) +# - Name (char[name_length]) +# - Data (float[n_dims]) +# +# At the start of the ggml file we write the model parameters + +import sys +import struct +import json +import numpy as np +import re + + +import torch +import torch.nn as nn +import torchvision.datasets as dsets +import torchvision.transforms as transforms +from torch.autograd import Variable + +IS_BIGENDIAN = True + +pack_fmt = "i" + +if len(sys.argv) != 2: + print("Usage: convert-h5-to-ggml.py model\n") + sys.exit(1) + +state_dict_file = sys.argv[1] + +fname_out = "models/mnist/ggml-model-small-f32.bin" +if IS_BIGENDIAN: + fname_out = "models/mnist/ggml-model-small-f32-big-endian.bin" + pack_fmt = "!i" + +state_dict = torch.load(state_dict_file, map_location=torch.device('cpu')) +#print (model) + +list_vars = state_dict +print (list_vars) + +fout = open(fname_out, "wb") + +fout.write(struct.pack(pack_fmt, 0x67676d6c)) # magic: ggml in hex + + +for name in list_vars.keys(): + data = list_vars[name].squeeze().numpy() + print("Processing variable: " + name + " with shape: ", data.shape) + n_dims = len(data.shape) + + fout.write(struct.pack(pack_fmt, n_dims)) + + data = data.astype(np.float32) + for i in range(n_dims): + fout.write(struct.pack(pack_fmt, data.shape[n_dims - 1 - i])) + + # data + if not IS_BIGENDIAN: + data.tofile(fout) + else: + data = data.astype(">f4") + data.tofile(fout) + +fout.close() + +print("Done. Output file: " + fname_out) +print("") diff --git a/examples/mnist/mnist.go b/examples/mnist/mnist.go new file mode 100644 index 0000000..1afbcda --- /dev/null +++ b/examples/mnist/mnist.go @@ -0,0 +1,218 @@ +package mnist + +import ( + "errors" + "fmt" + "math" + "mlgo/ml" + "os" +) + +type mnist_hparams struct{ + n_input int32; + n_hidden int32; + n_classes int32; +} + +type mnist_model struct { + hparams mnist_hparams; + + fc1_weight *ml.Tensor; + fc1_bias *ml.Tensor; + + fc2_weight *ml.Tensor; + fc2_bias *ml.Tensor; + +} + +func mnist_model_load(fname string, model *mnist_model) error { + + file, err := os.Open(fname) + if err != nil { + return err + } + defer file.Close() + + + // verify magic + { + magic := readInt(file) + if magic != 0x67676d6c { + return errors.New("invalid model file (bad magic)") + } + } + + // Read FC1 layer 1 + { + n_dims := int32(readInt(file)) + ne_weight := make([]int32, 0) + for i := int32(0); i < n_dims; i++ { + ne_weight = append(ne_weight, int32(readInt(file))) + } + // FC1 dimensions taken from file, eg. 768x500 + model.hparams.n_input = ne_weight[0] + model.hparams.n_hidden = ne_weight[1] + + model.fc1_weight = ml.NewTensor2D(nil, ml.TYPE_F32, uint32(model.hparams.n_input), uint32(model.hparams.n_hidden)) + for i := 0; i < len(model.fc1_weight.Data); i++{ + model.fc1_weight.Data[i] = readFP32(file) + } + + ne_bias := make([]int32, 0) + for i := 0; i < int(n_dims); i++ { + ne_bias = append(ne_bias, int32(readInt(file))) + } + + model.fc1_bias = ml.NewTensor1D(nil, ml.TYPE_F32, uint32(model.hparams.n_hidden)) + for i := 0; i < len(model.fc1_bias.Data); i++ { + model.fc1_bias.Data[i] = readFP32(file) + } + } + + // Read Fc2 layer 2 + { + n_dims := int32(readInt(file)) + ne_weight := make([]int32, 0) + for i := 0; i < int(n_dims); i++ { + ne_weight = append(ne_weight, int32(readInt(file))) + } + + // FC1 dimensions taken from file, eg. 
10x500 + model.hparams.n_classes = ne_weight[1] + + model.fc2_weight = ml.NewTensor2D(nil, ml.TYPE_F32, uint32(model.hparams.n_hidden), uint32(model.hparams.n_classes)) + for i := 0; i < len(model.fc2_weight.Data); i++{ + model.fc2_weight.Data[i] = readFP32(file) + } + + ne_bias := make([]int32, 0) + for i := 0; i < int(n_dims); i++ { + ne_bias = append(ne_bias, int32(readInt(file))) + } + + model.fc2_bias = ml.NewTensor1D(nil, ml.TYPE_F32, uint32(model.hparams.n_classes)) + for i := 0; i < len(model.fc2_bias.Data); i++ { + model.fc2_bias.Data[i] = readFP32(file) + } + printTensor(model.fc2_bias, "model.fc2_bias") + + } + + return nil +} + +func mnist_eval(model *mnist_model, threadCount int, digit []float32) int { + + ctx0 := &ml.Context{} + graph := ml.Graph{ThreadsCount: threadCount} + + input := ml.NewTensor1D(ctx0, ml.TYPE_F32, uint32(model.hparams.n_input)) + copy(input.Data, digit) + + // fc1 MLP = Ax + b + fc1 := ml.Add(ctx0, ml.MulMat(ctx0, model.fc1_weight, input), model.fc1_bias) + fc2 := ml.Add(ctx0, ml.MulMat(ctx0, model.fc2_weight, ml.Relu(ctx0, fc1)), model.fc2_bias) + + // softmax + final := ml.SoftMax(ctx0, fc2) + + // run the computation + ml.BuildForwardExpand(&graph, final) + ml.GraphCompute(ctx0, &graph) + + printTensor(final, "final tensor") + + maxIndex := 0 + for i := 0; i < 10; i++{ + if final.Data[i] > final.Data[maxIndex] { + maxIndex = i + } + } + return maxIndex +} + +func ExpandGraph(model *mnist_model, threadCount int, digit []float32) (*ml.Graph, *ml.Context) { + ctx0 := &ml.Context{} + graph := ml.Graph{ThreadsCount: threadCount} + + input := ml.NewTensor1D(ctx0, ml.TYPE_F32, uint32(model.hparams.n_input)) + copy(input.Data, digit) + + // fc1 MLP = Ax + b + fc1 := ml.Add(ctx0, ml.MulMat(ctx0, model.fc1_weight, input), model.fc1_bias) + fc2 := ml.Add(ctx0, ml.MulMat(ctx0, model.fc2_weight, ml.Relu(ctx0, fc1)), model.fc2_bias) + + // softmax + final := ml.SoftMax(ctx0, fc2) + + // run the computation + ml.BuildForwardExpand(&graph, final) + return &graph, ctx0 +} + +func LoadModel(modeFile string) (*mnist_model, error) { + model := new(mnist_model) + err := mnist_model_load(modeFile, model) + return model, err +} + +// NB! 
INT = 32 bits +func readInt(file *os.File) uint32 { + buf := make([]byte, 4) + if count, err := file.Read(buf); err != nil || count != 4 { + return 0 + } + return uint32(buf[3])<<24 | uint32(buf[2])<<16 | uint32(buf[1])<<8 | uint32(buf[0]) +} + +func readString(file *os.File, len uint32) string { + buf := make([]byte, len) + if count, err := file.Read(buf); err != nil || count != int(len) { + return "" + } + return string(buf) +} + + +func readFP32(file *os.File) float32 { + buf := make([]byte, 4) + if count, err := file.Read(buf); err != nil || count != 4 { + return 0.0 + } + bits := uint32(buf[3])<<24 | uint32(buf[2])<<16 | uint32(buf[1])<<8 | uint32(buf[0]) + return math.Float32frombits(bits) +} + +func min(a, b int) int { + if a <= b { + return a + } + return b +} + +func printTensor(tensor *ml.Tensor, name string) { + var dt string + if tensor.Type == ml.TYPE_F16 { + dt = "FP16" + } + if tensor.Type == ml.TYPE_F32 { + dt = "FP32" + } + if tensor.Type == ml.TYPE_Q4_0 { + dt = "INT4" + } + + fmt.Printf("\n\n=== [ %s | %s | %d:%d:%d ] ===\n", + name, dt, tensor.NE[0], tensor.NE[1], tensor.NE[2]) + + for nn := 0; nn < min(12, int(tensor.NE[1])); nn++ { + fmt.Printf("\n %d x %d ...\t", nn, tensor.NE[0]) + for ii := 0; ii < min(12, int(tensor.NE[0])); ii++ { + fmt.Printf("%.3f\t", tensor.Data[nn*int(tensor.NE[0])+ii]) + } + } +} + +func main(){ + fmt.Println("hello world") +} \ No newline at end of file diff --git a/examples/mnist/mnist_test.go b/examples/mnist/mnist_test.go new file mode 100644 index 0000000..5194f34 --- /dev/null +++ b/examples/mnist/mnist_test.go @@ -0,0 +1,277 @@ +package mnist + +import ( + "bytes" + "encoding/binary" + "fmt" + "math/rand" + "mlgo/ml" + "os" + "reflect" + "testing" + "time" +) + +func TestMNIST(t *testing.T) { + modelFile := "models/mnist/ggml-model-f32.bin" + digitFile := "models/mnist/t10k-images.idx3-ubyte" + + ml.SINGLE_THREAD = true + model := new(mnist_model) + if err := mnist_model_load(modelFile, model); err != nil { + fmt.Println(err) + return + } + + // load a random test digit + fin, err := os.Open(digitFile) + if err != nil { + fmt.Println(err) + return + } + // Seek to a random digit: 16-byte header + 28*28 * (random 0 - 10000) + rand.Seed(time.Now().UnixNano()) + fin.Seek(int64(16 + 784 * (rand.Int() % 10000)), 0) + buf := make([]byte, 784) + digits := make([]float32, 784) + if count, err := fin.Read(buf); err != nil || count != int(len(buf)) { + fmt.Println(err, count) + return + } + + // render the digit in ASCII + for row := 0; row < 28; row++{ + for col := 0; col < 28; col++ { + digits[row*28 + col] = float32(buf[row*28 + col]) + var c string + if buf[row*28 + col] > 230 { + c = "*" + } else { + c = "_" + } + fmt.Printf(c) + } + fmt.Println("") + } + fmt.Println("") + + res := mnist_eval(model, 1, digits) + fmt.Println("Predicted digit is ", res) +} + + +func IntToBytes(n int) []byte { + x := int32(n) + + bytesBuffer := bytes.NewBuffer([]byte{}) + binary.Write(bytesBuffer, binary.BigEndian, x) + return bytesBuffer.Bytes() +} + +func BytesToInt(b []byte) int { + bytesBuffer := bytes.NewBuffer(b) + + var x int32 + binary.Read(bytesBuffer, binary.BigEndian, &x) + + return int(x) +} + +func TestByteInt(t *testing.T){ + a := int(0x67676d6c) + aBytes := IntToBytes(a) + aInt := BytesToInt(aBytes) + aInt2 := (int(aBytes[0]) << 24) | (int(aBytes[1]) << 16) | (int(aBytes[2]) << 8) | int(aBytes[3]) + fmt.Println("a ", a); + fmt.Println("aBytes ", aBytes) + fmt.Println("aInt ", aInt) + fmt.Println("aInt2 ", aInt2) +} + +func add(a *int) { + *a = 
*a + 1 +} + +func TestSlice(t *testing.T){ + a := 2 + { + add(&a) + } + fmt.Println(a) +} + +func TestSaveInput(t *testing.T) { + digitFile := "models/mnist/t10k-images.idx3-ubyte" + // load a random test digit + fin, err := os.Open(digitFile) + if err != nil { + fmt.Println(err) + return + } + fin.Seek(int64(16 + 784 * 0), 0) + buf := make([]byte, 784) + digits := make([]float32, 784) + if count, err := fin.Read(buf); err != nil || count != int(len(buf)) { + fmt.Println(err, count) + return + } + + // render the digit in ASCII + for row := 0; row < 28; row++{ + for col := 0; col < 28; col++ { + digits[row*28 + col] = float32(buf[row*28 + col]) + var c string + if buf[row*28 + col] > 230 { + c = "*" + } else { + c = "_" + } + fmt.Printf(c) + } + fmt.Println("") + } + fmt.Println("") + + fout, err := os.Create("models/mnist/input_7") + if err != nil { + fmt.Println(err) + return + } + defer fout.Close() + _, err = fout.Write(buf) + if err != nil { + fmt.Println(err) + return + } + +} + +func TestMNISTConvert(t *testing.T) { + modelFile := "models/mnist/ggml-model-f32.bin" + digitFile := "models/mnist/t10k-images.idx3-ubyte" + + ml.SINGLE_THREAD = true + model := new(mnist_model) + if err := mnist_model_load(modelFile, model); err != nil { + fmt.Println(err) + return + } + + // load a random test digit + fin, err := os.Open(digitFile) + if err != nil { + fmt.Println(err) + return + } + // Seek to a random digit: 16-byte header + 28*28 * (random 0 - 10000) + rand.Seed(time.Now().UnixNano()) + fin.Seek(int64(16 + 784 * (rand.Int() % 10000)), 0) + buf := make([]byte, 784) + digits := make([]float32, 784) + if count, err := fin.Read(buf); err != nil || count != int(len(buf)) { + fmt.Println(err, count) + return + } + + // render the digit in ASCII + for row := 0; row < 28; row++{ + for col := 0; col < 28; col++ { + digits[row*28 + col] = float32(buf[row*28 + col]) + var c string + if buf[row*28 + col] > 230 { + c = "*" + } else { + c = "_" + } + fmt.Printf(c) + } + fmt.Println("") + } + fmt.Println("") + + ctx0 := &ml.Context{} + graph := ml.Graph{ThreadsCount: 1} + + input := ml.NewTensor1D(ctx0, ml.TYPE_F32, uint32(model.hparams.n_input)) + copy(input.Data, digits) + + // fc1 MLP = Ax + b + fc1 := ml.Add(ctx0, ml.MulMat(ctx0, model.fc1_weight, input), model.fc1_bias) + fc2 := ml.Add(ctx0, ml.MulMat(ctx0, model.fc2_weight, ml.Relu(ctx0, fc1)), model.fc2_bias) + + // softmax + final := ml.SoftMax(ctx0, fc2) + + // run the computation + ml.BuildForwardExpand(&graph, final) + // stop here + nodeID := 5 + ml.GraphComputeByNodes(ctx0, &graph, nodeID) + + ml.PrintTensor(graph.Nodes[nodeID], "final_before") + + // continue + // ml.ComputeNodeForward(graph.Nodes[5]) + + // ml.PrintTensor(final, "final_after") + + // test coding and encoding + envBytes := ml.SaveComputeNodeEnvToBytes(uint32(nodeID), graph.Nodes[nodeID], &graph, true) + nodeID_, tensorGraphList_ , err := ml.DecodeComputeNodeEnv(envBytes, true, false) + + // save bytes from mips test + { + fout, err := os.Create("models/mnist/node_5") + if err != nil { + fmt.Println(err) + return + } + defer fout.Close() + _, err = fout.Write(envBytes) + if err != nil { + fmt.Println(err) + return + } + } + + // save => tensorOnGraph[] + tensorGraphList := ml.SaveComputeNodeEnv(graph.Nodes[5], &graph) + + fmt.Println("nodeID Equal: ", nodeID_ == uint32(nodeID)) + fmt.Println("tensorGraphList: ", reflect.DeepEqual(tensorGraphList_, tensorGraphList)) + + // reconstruct + tensorList := make([]*ml.Tensor, 0) + tensorMap := make(map[uint32]*ml.Tensor) + for i 
:= 0; i < len(tensorGraphList); i++ { + tensor := tensorGraphList[i].ToTensor(nil) + tensorMap[tensorGraphList[i].NodeID] = tensor + tensorList = append(tensorList, tensor) + } + // fill in the nodeid + for i := 0; i < len(tensorList); i++ { + tensor := tensorList[i] + tensorG := tensorGraphList[i] + if src0, ok := tensorMap[tensorG.Src0NodeID]; ok { + tensor.Src0 = src0 + } + if src1, ok := tensorMap[tensorG.Src1NodeID]; ok { + tensor.Src1 = src1 + } + } + + // compute + ml.ComputeNodeForward(tensorMap[uint32(nodeID)]) + + ml.PrintTensor(final, "final_after") + + tensor := final + tensorOnGraph := tensor.ToTensorOnGraph(&graph) + tensorOnGraphBytes := tensorOnGraph.Encoding(false) + // bytesLen := common.BytesToInt32(tensorOnGraphBytes[:4], false) + // fmt.Println(int(bytesLen) == len(tensorOnGraphBytes) - 4) + tensorOnGraph2 := ml.DecodeTensorOnGraph(tensorOnGraphBytes, false, false) + fmt.Println(reflect.DeepEqual(tensor.Data, tensorOnGraph.Data)) + fmt.Println(reflect.DeepEqual(tensorOnGraph, tensorOnGraph2)) + fmt.Println(tensorOnGraph.Src0NodeID, tensorOnGraph.Src1NodeID) +} \ No newline at end of file diff --git a/examples/mnist/models/mnist/ggml-model-f32-big-endian.bin b/examples/mnist/models/mnist/ggml-model-f32-big-endian.bin new file mode 100644 index 0000000..941e8e2 Binary files /dev/null and b/examples/mnist/models/mnist/ggml-model-f32-big-endian.bin differ diff --git a/examples/mnist/models/mnist/ggml-model-f32.bin b/examples/mnist/models/mnist/ggml-model-f32.bin new file mode 100644 index 0000000..1459a3b Binary files /dev/null and b/examples/mnist/models/mnist/ggml-model-f32.bin differ diff --git a/examples/mnist/models/mnist/ggml-model-small-f32-big-endian.bin b/examples/mnist/models/mnist/ggml-model-small-f32-big-endian.bin new file mode 100644 index 0000000..6de927b Binary files /dev/null and b/examples/mnist/models/mnist/ggml-model-small-f32-big-endian.bin differ diff --git a/examples/mnist/models/mnist/ggml-model-small-f32.bin b/examples/mnist/models/mnist/ggml-model-small-f32.bin new file mode 100644 index 0000000..9b4cde8 Binary files /dev/null and b/examples/mnist/models/mnist/ggml-model-small-f32.bin differ diff --git a/examples/mnist/models/mnist/input_7 b/examples/mnist/models/mnist/input_7 new file mode 100644 index 0000000..6e67157 Binary files /dev/null and b/examples/mnist/models/mnist/input_7 differ diff --git a/examples/mnist/models/mnist/mnist-small.state_dict b/examples/mnist/models/mnist/mnist-small.state_dict new file mode 100644 index 0000000..5121cdf Binary files /dev/null and b/examples/mnist/models/mnist/mnist-small.state_dict differ diff --git a/examples/mnist/models/mnist/mnist_model.state_dict b/examples/mnist/models/mnist/mnist_model.state_dict new file mode 100644 index 0000000..dfb609b Binary files /dev/null and b/examples/mnist/models/mnist/mnist_model.state_dict differ diff --git a/examples/mnist/models/mnist/node_5 b/examples/mnist/models/mnist/node_5 new file mode 100644 index 0000000..2df09b6 Binary files /dev/null and b/examples/mnist/models/mnist/node_5 differ diff --git a/examples/mnist/models/mnist/t10k-images.idx3-ubyte b/examples/mnist/models/mnist/t10k-images.idx3-ubyte new file mode 100644 index 0000000..1170b2c Binary files /dev/null and b/examples/mnist/models/mnist/t10k-images.idx3-ubyte differ diff --git a/examples/mnist/trainning/mnist.ipynb b/examples/mnist/trainning/mnist.ipynb new file mode 100644 index 0000000..9b715a2 --- /dev/null +++ b/examples/mnist/trainning/mnist.ipynb @@ -0,0 +1,438 @@ +{ + "cells": [ + { + 
"cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "#@title Import Dependencies\n", + "\n", + "import torch\n", + "import torch.nn as nn\n", + "import torchvision.datasets as dsets\n", + "import torchvision.transforms as transforms\n", + "from torch.autograd import Variable" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [], + "source": [ + "#@title Define Hyperparameters\n", + "\n", + "input_size = 784 # img_size = (28,28) ---> 28*28=784 in total\n", + "hidden_size = 20 # number of nodes at hidden layer\n", + "num_classes = 10 # number of output classes discrete range [0,9]\n", + "num_epochs = 20 # number of times which the entire dataset is passed throughout the model\n", + "batch_size = 100 # the size of input data took for one iteration\n", + "lr = 1e-3 # size of step " + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Downloading http://yann.lecun.com/exdb/mnist/train-images-idx3-ubyte.gz\n", + "Downloading http://yann.lecun.com/exdb/mnist/train-images-idx3-ubyte.gz to ./data/MNIST/raw/train-images-idx3-ubyte.gz\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "100%|██████████| 9912422/9912422 [00:00<00:00, 28402590.00it/s]\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Extracting ./data/MNIST/raw/train-images-idx3-ubyte.gz to ./data/MNIST/raw\n", + "\n", + "Downloading http://yann.lecun.com/exdb/mnist/train-labels-idx1-ubyte.gz\n", + "Downloading http://yann.lecun.com/exdb/mnist/train-labels-idx1-ubyte.gz to ./data/MNIST/raw/train-labels-idx1-ubyte.gz\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "100%|██████████| 28881/28881 [00:00<00:00, 6406584.19it/s]\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Extracting ./data/MNIST/raw/train-labels-idx1-ubyte.gz to ./data/MNIST/raw\n", + "\n", + "Downloading http://yann.lecun.com/exdb/mnist/t10k-images-idx3-ubyte.gz\n", + "Downloading http://yann.lecun.com/exdb/mnist/t10k-images-idx3-ubyte.gz to ./data/MNIST/raw/t10k-images-idx3-ubyte.gz\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "100%|██████████| 1648877/1648877 [00:00<00:00, 7325341.35it/s]\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Extracting ./data/MNIST/raw/t10k-images-idx3-ubyte.gz to ./data/MNIST/raw\n", + "\n", + "Downloading http://yann.lecun.com/exdb/mnist/t10k-labels-idx1-ubyte.gz\n", + "Downloading http://yann.lecun.com/exdb/mnist/t10k-labels-idx1-ubyte.gz to ./data/MNIST/raw/t10k-labels-idx1-ubyte.gz\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "100%|██████████| 4542/4542 [00:00<00:00, 12057296.69it/s]" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Extracting ./data/MNIST/raw/t10k-labels-idx1-ubyte.gz to ./data/MNIST/raw\n", + "\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\n" + ] + } + ], + "source": [ + "#@title Downloading MNIST data\n", + "\n", + "train_data = dsets.MNIST(root = './data', train = True,\n", + " transform = transforms.ToTensor(), download = True)\n", + "\n", + "test_data = dsets.MNIST(root = './data', train = False,\n", + " transform = transforms.ToTensor())" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [], + "source": [ + "#@title Loading 
the data\n", + "\n", + "train_gen = torch.utils.data.DataLoader(dataset = train_data,\n", + " batch_size = batch_size,\n", + " shuffle = True)\n", + "\n", + "test_gen = torch.utils.data.DataLoader(dataset = test_data,\n", + " batch_size = batch_size, \n", + " shuffle = False)" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [], + "source": [ + "#@title Define model class\n", + "\n", + "class Net(nn.Module):\n", + " def __init__(self, input_size, hidden_size, num_classes):\n", + " super(Net,self).__init__()\n", + " self.fc1 = nn.Linear(input_size, hidden_size)\n", + " self.relu = nn.ReLU()\n", + " self.fc2 = nn.Linear(hidden_size, num_classes)\n", + " \n", + " def forward(self,x):\n", + " out = self.fc1(x)\n", + " out = self.relu(out)\n", + " out = self.fc2(out)\n", + " return out" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": {}, + "outputs": [], + "source": [ + "#@title Build the model\n", + "\n", + "net = Net(input_size, hidden_size, num_classes)\n", + "if torch.cuda.is_available():\n", + " net.cuda()" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "metadata": {}, + "outputs": [], + "source": [ + "#@title Define loss-function & optimizer\n", + "\n", + "loss_function = nn.CrossEntropyLoss()\n", + "optimizer = torch.optim.Adam( net.parameters(), lr=lr)" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Epoch [1/20], Step [100/600], Loss: 0.7586\n", + "Epoch [1/20], Step [200/600], Loss: 0.6578\n", + "Epoch [1/20], Step [300/600], Loss: 0.4132\n", + "Epoch [1/20], Step [400/600], Loss: 0.2653\n", + "Epoch [1/20], Step [500/600], Loss: 0.3760\n", + "Epoch [1/20], Step [600/600], Loss: 0.2321\n", + "Epoch [2/20], Step [100/600], Loss: 0.2010\n", + "Epoch [2/20], Step [200/600], Loss: 0.2502\n", + "Epoch [2/20], Step [300/600], Loss: 0.2265\n", + "Epoch [2/20], Step [400/600], Loss: 0.1894\n", + "Epoch [2/20], Step [500/600], Loss: 0.3196\n", + "Epoch [2/20], Step [600/600], Loss: 0.1802\n", + "Epoch [3/20], Step [100/600], Loss: 0.3340\n", + "Epoch [3/20], Step [200/600], Loss: 0.2048\n", + "Epoch [3/20], Step [300/600], Loss: 0.1734\n", + "Epoch [3/20], Step [400/600], Loss: 0.3398\n", + "Epoch [3/20], Step [500/600], Loss: 0.1892\n", + "Epoch [3/20], Step [600/600], Loss: 0.2637\n", + "Epoch [4/20], Step [100/600], Loss: 0.2435\n", + "Epoch [4/20], Step [200/600], Loss: 0.3686\n", + "Epoch [4/20], Step [300/600], Loss: 0.3716\n", + "Epoch [4/20], Step [400/600], Loss: 0.3624\n", + "Epoch [4/20], Step [500/600], Loss: 0.2705\n", + "Epoch [4/20], Step [600/600], Loss: 0.2090\n", + "Epoch [5/20], Step [100/600], Loss: 0.1542\n", + "Epoch [5/20], Step [200/600], Loss: 0.1493\n", + "Epoch [5/20], Step [300/600], Loss: 0.2130\n", + "Epoch [5/20], Step [400/600], Loss: 0.1685\n", + "Epoch [5/20], Step [500/600], Loss: 0.2899\n", + "Epoch [5/20], Step [600/600], Loss: 0.1895\n", + "Epoch [6/20], Step [100/600], Loss: 0.2628\n", + "Epoch [6/20], Step [200/600], Loss: 0.2071\n", + "Epoch [6/20], Step [300/600], Loss: 0.0898\n", + "Epoch [6/20], Step [400/600], Loss: 0.1123\n", + "Epoch [6/20], Step [500/600], Loss: 0.1715\n", + "Epoch [6/20], Step [600/600], Loss: 0.2295\n", + "Epoch [7/20], Step [100/600], Loss: 0.1155\n", + "Epoch [7/20], Step [200/600], Loss: 0.1513\n", + "Epoch [7/20], Step [300/600], Loss: 0.1155\n", + "Epoch [7/20], Step [400/600], Loss: 0.1920\n", + "Epoch [7/20], Step [500/600], 
Loss: 0.2464\n", + "Epoch [7/20], Step [600/600], Loss: 0.0735\n", + "Epoch [8/20], Step [100/600], Loss: 0.1250\n", + "Epoch [8/20], Step [200/600], Loss: 0.1276\n", + "Epoch [8/20], Step [300/600], Loss: 0.1443\n", + "Epoch [8/20], Step [400/600], Loss: 0.0967\n", + "Epoch [8/20], Step [500/600], Loss: 0.1119\n", + "Epoch [8/20], Step [600/600], Loss: 0.1230\n", + "Epoch [9/20], Step [100/600], Loss: 0.1142\n", + "Epoch [9/20], Step [200/600], Loss: 0.1825\n", + "Epoch [9/20], Step [300/600], Loss: 0.1516\n", + "Epoch [9/20], Step [400/600], Loss: 0.2317\n", + "Epoch [9/20], Step [500/600], Loss: 0.1516\n", + "Epoch [9/20], Step [600/600], Loss: 0.0816\n", + "Epoch [10/20], Step [100/600], Loss: 0.1645\n", + "Epoch [10/20], Step [200/600], Loss: 0.1152\n", + "Epoch [10/20], Step [300/600], Loss: 0.1192\n", + "Epoch [10/20], Step [400/600], Loss: 0.1058\n", + "Epoch [10/20], Step [500/600], Loss: 0.2072\n", + "Epoch [10/20], Step [600/600], Loss: 0.1733\n", + "Epoch [11/20], Step [100/600], Loss: 0.1161\n", + "Epoch [11/20], Step [200/600], Loss: 0.1378\n", + "Epoch [11/20], Step [300/600], Loss: 0.1265\n", + "Epoch [11/20], Step [400/600], Loss: 0.2290\n", + "Epoch [11/20], Step [500/600], Loss: 0.1156\n", + "Epoch [11/20], Step [600/600], Loss: 0.0995\n", + "Epoch [12/20], Step [100/600], Loss: 0.1722\n", + "Epoch [12/20], Step [200/600], Loss: 0.0980\n", + "Epoch [12/20], Step [300/600], Loss: 0.1267\n", + "Epoch [12/20], Step [400/600], Loss: 0.0467\n", + "Epoch [12/20], Step [500/600], Loss: 0.1382\n", + "Epoch [12/20], Step [600/600], Loss: 0.1339\n", + "Epoch [13/20], Step [100/600], Loss: 0.1389\n", + "Epoch [13/20], Step [200/600], Loss: 0.0930\n", + "Epoch [13/20], Step [300/600], Loss: 0.0770\n", + "Epoch [13/20], Step [400/600], Loss: 0.0875\n", + "Epoch [13/20], Step [500/600], Loss: 0.0931\n", + "Epoch [13/20], Step [600/600], Loss: 0.1588\n", + "Epoch [14/20], Step [100/600], Loss: 0.0850\n", + "Epoch [14/20], Step [200/600], Loss: 0.2115\n", + "Epoch [14/20], Step [300/600], Loss: 0.0677\n", + "Epoch [14/20], Step [400/600], Loss: 0.1456\n", + "Epoch [14/20], Step [500/600], Loss: 0.1269\n", + "Epoch [14/20], Step [600/600], Loss: 0.1360\n", + "Epoch [15/20], Step [100/600], Loss: 0.2047\n", + "Epoch [15/20], Step [200/600], Loss: 0.1644\n", + "Epoch [15/20], Step [300/600], Loss: 0.0949\n", + "Epoch [15/20], Step [400/600], Loss: 0.0733\n", + "Epoch [15/20], Step [500/600], Loss: 0.0711\n", + "Epoch [15/20], Step [600/600], Loss: 0.1456\n", + "Epoch [16/20], Step [100/600], Loss: 0.0946\n", + "Epoch [16/20], Step [200/600], Loss: 0.1493\n", + "Epoch [16/20], Step [300/600], Loss: 0.1525\n", + "Epoch [16/20], Step [400/600], Loss: 0.0556\n", + "Epoch [16/20], Step [500/600], Loss: 0.2276\n", + "Epoch [16/20], Step [600/600], Loss: 0.1088\n", + "Epoch [17/20], Step [100/600], Loss: 0.0487\n", + "Epoch [17/20], Step [200/600], Loss: 0.0929\n", + "Epoch [17/20], Step [300/600], Loss: 0.0809\n", + "Epoch [17/20], Step [400/600], Loss: 0.1210\n", + "Epoch [17/20], Step [500/600], Loss: 0.0739\n", + "Epoch [17/20], Step [600/600], Loss: 0.1376\n", + "Epoch [18/20], Step [100/600], Loss: 0.1401\n", + "Epoch [18/20], Step [200/600], Loss: 0.1457\n", + "Epoch [18/20], Step [300/600], Loss: 0.0723\n", + "Epoch [18/20], Step [400/600], Loss: 0.2226\n", + "Epoch [18/20], Step [500/600], Loss: 0.0641\n", + "Epoch [18/20], Step [600/600], Loss: 0.1450\n", + "Epoch [19/20], Step [100/600], Loss: 0.1496\n", + "Epoch [19/20], Step [200/600], Loss: 0.1327\n", + "Epoch [19/20], Step 
[300/600], Loss: 0.0711\n", + "Epoch [19/20], Step [400/600], Loss: 0.1269\n", + "Epoch [19/20], Step [500/600], Loss: 0.0667\n", + "Epoch [19/20], Step [600/600], Loss: 0.0898\n", + "Epoch [20/20], Step [100/600], Loss: 0.0569\n", + "Epoch [20/20], Step [200/600], Loss: 0.1008\n", + "Epoch [20/20], Step [300/600], Loss: 0.0970\n", + "Epoch [20/20], Step [400/600], Loss: 0.1094\n", + "Epoch [20/20], Step [500/600], Loss: 0.0969\n", + "Epoch [20/20], Step [600/600], Loss: 0.0764\n" + ] + } + ], + "source": [ + "#@title Training the model\n", + "\n", + "for epoch in range(num_epochs):\n", + " for i ,(images,labels) in enumerate(train_gen):\n", + " images = Variable(images.view(-1,28*28)).cuda()\n", + " labels = Variable(labels).cuda()\n", + " \n", + " optimizer.zero_grad()\n", + " outputs = net(images)\n", + " loss = loss_function(outputs, labels)\n", + " loss.backward()\n", + " optimizer.step()\n", + " \n", + " if (i+1) % 100 == 0:\n", + " print('Epoch [%d/%d], Step [%d/%d], Loss: %.4f'\n", + " %(epoch+1, num_epochs, i+1, len(train_data)//batch_size, loss.data))" + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Accuracy of the model: 96.100 %\n" + ] + } + ], + "source": [ + "#@title Evaluating the accuracy of the model\n", + "\n", + "correct = 0\n", + "total = 0\n", + "for images,labels in test_gen:\n", + " images = Variable(images.view(-1,28*28)).cuda()\n", + " labels = labels.cuda()\n", + " \n", + " output = net(images)\n", + " _, predicted = torch.max(output,1)\n", + " correct += (predicted == labels).sum()\n", + " total += labels.size(0)\n", + "\n", + "print('Accuracy of the model: %.3f %%' %((100*correct)/(total+1)))" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Model's state_dict:\n", + "fc1.weight \t torch.Size([20, 784])\n", + "fc1.bias \t torch.Size([20])\n", + "fc2.weight \t torch.Size([10, 20])\n", + "fc2.bias \t torch.Size([10])\n" + ] + } + ], + "source": [ + "print(\"Model's state_dict:\")\n", + "for param_tensor in net.state_dict():\n", + " print(param_tensor, \"\\t\", net.state_dict()[param_tensor].size())" + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "metadata": {}, + "outputs": [], + "source": [ + "torch.save(net.state_dict(), \"../models/mnist/mnist-small.state_dict\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "base", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.7" + }, + "orig_nbformat": 4 + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/examples/mnist_mips/build.sh b/examples/mnist_mips/build.sh new file mode 100755 index 0000000..c5a3a09 --- /dev/null +++ b/examples/mnist_mips/build.sh @@ -0,0 +1,15 @@ +#!/usr/bin/env bash +set -e + +export GOOS=linux +export GOARCH=mips +export GOMIPS=softfloat +go build -o ./mlgo + +file mlgo + +if [[ ! 
-d venv ]]; then + python3 -m venv venv +fi + +../../compile.py mlgo diff --git a/examples/mnist_mips/main.go b/examples/mnist_mips/main.go new file mode 100644 index 0000000..6891d87 --- /dev/null +++ b/examples/mnist_mips/main.go @@ -0,0 +1,6 @@ +package main + +func main() { + MIPS_MNIST() +} + diff --git a/examples/mnist_mips/mips_mnist.go b/examples/mnist_mips/mips_mnist.go new file mode 100644 index 0000000..4146ce3 --- /dev/null +++ b/examples/mnist_mips/mips_mnist.go @@ -0,0 +1,223 @@ +package main + +import ( + "errors" + "fmt" + "mlgo/common" + "mlgo/ml" +) + +type mnist_hparams struct{ + n_input int32; + n_hidden int32; + n_classes int32; +} + +type mnist_model struct { + hparams mnist_hparams; + + fc1_weight *ml.Tensor; + fc1_bias *ml.Tensor; + + fc2_weight *ml.Tensor; + fc2_bias *ml.Tensor; + +} + +const ( + READ_FROM_BIDENDIAN = true + OUTPUT_TO_BIDENDIAN = true +) + +func MIPS_mnist_model_load(model *mnist_model) error { + fmt.Println("start MIPS_mnist_model_load") + model_bytes := common.ReadBytes(common.MODEL_ADDR, READ_FROM_BIDENDIAN) + index := 0 + fmt.Println("model_bytes len: ", len(model_bytes)) + + // verify magic + { + magic := common.ReadInt32FromBytes(model_bytes, &index, READ_FROM_BIDENDIAN) + fmt.Printf("magic: %x\n", magic) + if magic != 0x67676d6c { + return errors.New("invalid model file (bad magic)") + } + } + + // Read FC1 layer 1 + { + fmt.Println("reading fc1") + n_dims := int32(common.ReadInt32FromBytes(model_bytes, &index, READ_FROM_BIDENDIAN)) + fmt.Println("n_dims: ", n_dims) + ne_weight := make([]int32, 0) + for i := int32(0); i < n_dims; i++ { + ne_weight = append(ne_weight, int32(common.ReadInt32FromBytes(model_bytes, &index, READ_FROM_BIDENDIAN))) + } + fmt.Println("ne_weight: ", ne_weight) + // FC1 dimensions taken from file, eg. 
768x500 + model.hparams.n_input = ne_weight[0] + model.hparams.n_hidden = ne_weight[1] + + if READ_FROM_BIDENDIAN { + fc1_weight_data_size := model.hparams.n_input * model.hparams.n_hidden + fc1_weight_data := common.DecodeFloat32List(model_bytes[index:index + 4 * int(fc1_weight_data_size)]) + index += 4 * int(fc1_weight_data_size) + model.fc1_weight = ml.NewTensor2DWithData(nil, ml.TYPE_F32, uint32(model.hparams.n_input), uint32(model.hparams.n_hidden), fc1_weight_data) + } else { + model.fc1_weight = ml.NewTensor2D(nil, ml.TYPE_F32, uint32(model.hparams.n_input), uint32(model.hparams.n_hidden)) + fmt.Println("len(model.fc1_weight.Data): ", len(model.fc1_weight.Data)) + for i := 0; i < len(model.fc1_weight.Data); i++{ + model.fc1_weight.Data[i] = common.ReadFP32FromBytes(model_bytes, &index, READ_FROM_BIDENDIAN) + if i % 10000 == 0 { + fmt.Println("loading fc1_weight: ", i) + } + } + } + + fmt.Println("index: ", index) + + ne_bias := make([]int32, 0) + for i := 0; i < int(n_dims); i++ { + ne_bias = append(ne_bias, int32(common.ReadInt32FromBytes(model_bytes, &index, READ_FROM_BIDENDIAN))) + } + + if READ_FROM_BIDENDIAN { + fc1_bias_data_size := int(model.hparams.n_hidden) + fc1_bias_data := common.DecodeFloat32List(model_bytes[index:index + 4*fc1_bias_data_size]) + index += 4*fc1_bias_data_size + model.fc1_bias = ml.NewTensor1DWithData(nil, ml.TYPE_F32, uint32(model.hparams.n_hidden), fc1_bias_data) + } else { + model.fc1_bias = ml.NewTensor1D(nil, ml.TYPE_F32, uint32(model.hparams.n_hidden)) + fmt.Println("len(model.fc1_bias.Data): ", len(model.fc1_bias.Data)) + for i := 0; i < len(model.fc1_bias.Data); i++ { + model.fc1_bias.Data[i] = common.ReadFP32FromBytes(model_bytes, &index, READ_FROM_BIDENDIAN) + if i % 10000 == 0 { + fmt.Println("loading fc1_bias: ", i) + } + } + } + + } + + // Read Fc2 layer 2 + { + fmt.Println("reading fc2") + n_dims := int32(common.ReadInt32FromBytes(model_bytes, &index, READ_FROM_BIDENDIAN)) + ne_weight := make([]int32, 0) + for i := 0; i < int(n_dims); i++ { + ne_weight = append(ne_weight, int32(common.ReadInt32FromBytes(model_bytes, &index, READ_FROM_BIDENDIAN))) + } + + // FC1 dimensions taken from file, eg. 
10x500 + model.hparams.n_classes = ne_weight[1] + + if READ_FROM_BIDENDIAN { + fc2_weight_data_size := int(model.hparams.n_hidden * model.hparams.n_classes) + fc2_weight_data := common.DecodeFloat32List(model_bytes[index:index + 4*fc2_weight_data_size]) + index += 4*fc2_weight_data_size + model.fc2_weight = ml.NewTensor2DWithData(nil, ml.TYPE_F32, uint32(model.hparams.n_hidden), uint32(model.hparams.n_classes), fc2_weight_data) + } else { + model.fc2_weight = ml.NewTensor2D(nil, ml.TYPE_F32, uint32(model.hparams.n_hidden), uint32(model.hparams.n_classes)) + for i := 0; i < len(model.fc2_weight.Data); i++{ + model.fc2_weight.Data[i] = common.ReadFP32FromBytes(model_bytes, &index, READ_FROM_BIDENDIAN) + } + } + + ne_bias := make([]int32, 0) + for i := 0; i < int(n_dims); i++ { + ne_bias = append(ne_bias, int32(common.ReadInt32FromBytes(model_bytes, &index, READ_FROM_BIDENDIAN))) + } + + if READ_FROM_BIDENDIAN { + fc2_bias_data_size := int(model.hparams.n_classes) + fc2_bias_data := common.DecodeFloat32List(model_bytes[index:index + 4*fc2_bias_data_size]) + index += 4*fc2_bias_data_size + model.fc2_bias = ml.NewTensor1DWithData(nil, ml.TYPE_F32, uint32(model.hparams.n_classes), fc2_bias_data) + } else { + model.fc2_bias = ml.NewTensor1D(nil, ml.TYPE_F32, uint32(model.hparams.n_classes)) + for i := 0; i < len(model.fc2_bias.Data); i++ { + model.fc2_bias.Data[i] = common.ReadFP32FromBytes(model_bytes, &index, READ_FROM_BIDENDIAN) + } + } + + ml.PrintTensor(model.fc2_bias, "model.fc2_bias") + } + + fmt.Println("current index: ", index) + + return nil +} + +// input is 784 bytes +func MIPS_InputProcess() []float32 { + fmt.Println("start MIPS_InputProcess") + buf := common.ReadBytes(common.INPUT_ADDR, READ_FROM_BIDENDIAN) + fmt.Println("buf len: ", len(buf)) + digits := make([]float32, 784) + + // render the digit in ASCII + var c string + for row := 0; row < 28; row++{ + for col := 0; col < 28; col++ { + digits[row*28 + col] = float32(buf[row*28 + col]) + if buf[row*28 + col] > 230 { + c += "*" + } else { + c += "_" + } + } + c += "\n" + } + fmt.Println(c) + + return digits +} + +func MIPS_mnist_eval(model *mnist_model, digit []float32) int { + fmt.Println("start MIPS_mnist_eval") + ctx0 := &ml.Context{} + graph := ml.Graph{ThreadsCount: 1} + + input := ml.NewTensor1D(ctx0, ml.TYPE_F32, uint32(model.hparams.n_input)) + copy(input.Data, digit) + + // fc1 MLP = Ax + b + fc1 := ml.Add(ctx0, ml.MulMat(ctx0, model.fc1_weight, input), model.fc1_bias) + fc2 := ml.Add(ctx0, ml.MulMat(ctx0, model.fc2_weight, ml.Relu(ctx0, fc1)), model.fc2_bias) + + // softmax + final := ml.SoftMax(ctx0, fc2) + + // run the computation + ml.BuildForwardExpand(&graph, final) + ml.GraphCompute(ctx0, &graph) + + ml.PrintTensor(final, "final tensor") + + maxIndex := 0 + for i := 0; i < 10; i++{ + if final.Data[i] > final.Data[maxIndex] { + maxIndex = i + } + } + return maxIndex +} + +func MIPS_StoreInMemory(ret int) { + retBytes := common.IntToBytes(ret, OUTPUT_TO_BIDENDIAN) + common.Output(retBytes, OUTPUT_TO_BIDENDIAN) +} + +func MIPS_MNIST() { + fmt.Println("Start MIPS MNIST") + input := MIPS_InputProcess() + model := new(mnist_model) + err := MIPS_mnist_model_load(model) + if err != nil { + fmt.Println(err) + common.Halt() + } + ret := MIPS_mnist_eval(model, input) + fmt.Println("Predicted digit is ", ret) + MIPS_StoreInMemory(ret) +} \ No newline at end of file diff --git a/go.mod b/go.mod new file mode 100644 index 0000000..ca2b1a4 --- /dev/null +++ b/go.mod @@ -0,0 +1,20 @@ +module mlgo + +go 1.20 + +require ( + 
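+	// Notable direct dependencies: go-flags (CLI parsing), go-colorable and
+	// colorstring (cross-platform terminal colors), progressbar/v3 (model
+	// loading bars), float16 (FP16 weight decoding).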
github.com/jessevdk/go-flags v1.5.0 + github.com/mattn/go-colorable v0.1.13 + github.com/mitchellh/colorstring v0.0.0-20190213212951-d06e56a500db + github.com/schollz/progressbar/v3 v3.13.1 + github.com/x448/float16 v0.8.4 + golang.org/x/exp v0.0.0-20230321023759-10a507213a29 +) + +require ( + github.com/mattn/go-isatty v0.0.17 // indirect + github.com/mattn/go-runewidth v0.0.14 // indirect + github.com/rivo/uniseg v0.2.0 // indirect + golang.org/x/sys v0.6.0 // indirect + golang.org/x/term v0.6.0 // indirect +) diff --git a/go.sum b/go.sum new file mode 100644 index 0000000..020213d --- /dev/null +++ b/go.sum @@ -0,0 +1,31 @@ +github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= +github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= +github.com/jessevdk/go-flags v1.5.0 h1:1jKYvbxEjfUl0fmqTCOfonvskHHXMjBySTLW4y9LFvc= +github.com/jessevdk/go-flags v1.5.0/go.mod h1:Fw0T6WPc1dYxT4mKEZRfG5kJhaTDP9pj1c2EWnYs/m4= +github.com/k0kubun/go-ansi v0.0.0-20180517002512-3bf9e2903213/go.mod h1:vNUNkEQ1e29fT/6vq2aBdFsgNPmy8qMdSay1npru+Sw= +github.com/mattn/go-colorable v0.1.13 h1:fFA4WZxdEF4tXPZVKMLwD8oUnCTTo08duU7wxecdEvA= +github.com/mattn/go-colorable v0.1.13/go.mod h1:7S9/ev0klgBDR4GtXTXX8a3vIGJpMovkB8vQcUbaXHg= +github.com/mattn/go-isatty v0.0.16/go.mod h1:kYGgaQfpe5nmfYZH+SKPsOc2e4SrIfOl2e/yFXSvRLM= +github.com/mattn/go-isatty v0.0.17 h1:BTarxUcIeDqL27Mc+vyvdWYSL28zpIhv3RoTdsLMPng= +github.com/mattn/go-isatty v0.0.17/go.mod h1:kYGgaQfpe5nmfYZH+SKPsOc2e4SrIfOl2e/yFXSvRLM= +github.com/mattn/go-runewidth v0.0.14 h1:+xnbZSEeDbOIg5/mE6JF0w6n9duR1l3/WmbinWVwUuU= +github.com/mattn/go-runewidth v0.0.14/go.mod h1:Jdepj2loyihRzMpdS35Xk/zdY8IAYHsh153qUoGf23w= +github.com/mitchellh/colorstring v0.0.0-20190213212951-d06e56a500db h1:62I3jR2EmQ4l5rM/4FEfDWcRD+abF5XlKShorW5LRoQ= +github.com/mitchellh/colorstring v0.0.0-20190213212951-d06e56a500db/go.mod h1:l0dey0ia/Uv7NcFFVbCLtqEBQbrT4OCwCSKTEv6enCw= +github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= +github.com/rivo/uniseg v0.2.0 h1:S1pD9weZBuJdFmowNwbpi7BJ8TNftyUImj/0WQi72jY= +github.com/rivo/uniseg v0.2.0/go.mod h1:J6wj4VEh+S6ZtnVlnTBMWIodfgj8LQOQFoIToxlJtxc= +github.com/schollz/progressbar/v3 v3.13.1 h1:o8rySDYiQ59Mwzy2FELeHY5ZARXZTVJC7iHD6PEFUiE= +github.com/schollz/progressbar/v3 v3.13.1/go.mod h1:xvrbki8kfT1fzWzBT/UZd9L6GA+jdL7HAgq2RFnO6fQ= +github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME= +github.com/stretchr/testify v1.3.0/go.mod h1:M5WIy9Dh21IEIfnGCwXGc5bZfKNJtfHm1UVUgZn+9EI= +github.com/x448/float16 v0.8.4 h1:qLwI1I70+NjRFUR3zs1JPUCgaCXSh3SW62uAKT1mSBM= +github.com/x448/float16 v0.8.4/go.mod h1:14CWIYCyZA/cWjXOioeEpHeN/83MdbZDRQHoFcYsOfg= +golang.org/x/exp v0.0.0-20230321023759-10a507213a29 h1:ooxPy7fPvB4kwsA2h+iBNHkAbp/4JxTSwCmvdjEYmug= +golang.org/x/exp v0.0.0-20230321023759-10a507213a29/go.mod h1:CxIveKay+FTh1D0yPZemJVgC/95VzuuOLq5Qi4xnoYc= +golang.org/x/sys v0.0.0-20210320140829-1e4c9ba3b0c4/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= +golang.org/x/sys v0.0.0-20220811171246-fbc7d0a398ab/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= +golang.org/x/sys v0.6.0 h1:MVltZSvRTcU2ljQOhs94SXPftV6DCNnZViHeQps87pQ= +golang.org/x/sys v0.6.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= +golang.org/x/term v0.6.0 h1:clScbb1cHjoCkyRbWwBEUZ5H/tIFu5TAXIqaZD0Gcjw= +golang.org/x/term v0.6.0/go.mod h1:m6U89DPEgQRMq3DNkDClhWw02AUbt2daBVO4cn4Hv9U= diff --git a/ml/ml.go b/ml/ml.go new file mode 
100644 index 0000000..dc8ee67 --- /dev/null +++ b/ml/ml.go @@ -0,0 +1,3005 @@ +package ml + +import ( + "fmt" + "math" + "os" + "sync" +) + +const ( + DEBUG = false + + MAX_DIMS = 4 + MAX_NODES = 4096 + MAX_PARAMS = 16 + MAX_OPT = 4 + + QK = 32 // quantization + + TOKEN_BOS = 1 + TOKEN_EOS = 2 +) + +type DType uint8 + +// Data types are the same as in llama.cpp so full compatibility there +const ( + TYPE_F32 DType = 0 + TYPE_F16 DType = 1 + TYPE_Q4_0 DType = 2 + TYPE_Q4_1 DType = 3 + TYPE_I8 DType = 4 + TYPE_I16 DType = 5 + TYPE_I32 DType = 6 + TYPE_COUNT DType = 8 +) + +// compute in Single thread +var ( + SINGLE_THREAD = false +) + +func printTensor(tensor *Tensor, name string) { + + var dt string + switch tensor.Type { + case TYPE_F16: + dt = "FP16" + case TYPE_F32: + dt = "FP32" + case TYPE_Q4_0: + dt = "INT4" + } + + fmt.Printf("\n\n=== [ %s | %s | %d:%d:%d ] ===\n", + name, dt, tensor.NE[0], tensor.NE[1], tensor.NE[2]) + + for nn := 0; nn < min(12, int(tensor.NE[1])); nn++ { + fmt.Printf("\n %d x %d ...\t", nn, tensor.NE[0]) + for ii := 0; ii < min(12, int(tensor.NE[0])); ii++ { + fmt.Printf("%.3f\t", tensor.Data[nn*int(tensor.NE[0])+ii]) + } + } +} + +// precomputed exp table for f16 (128 KB) +// static ggml_fp16_t table_exp_f16[1 << 16]; +// var TableExpFP16 [1 << 16]float16.Float16 + +var BLCK_SIZE [TYPE_COUNT]uint32 = [TYPE_COUNT]uint32{1, 1, QK, QK, 1, 1, 1, 0} +var TYPE_SIZE [TYPE_COUNT]uint32 = [TYPE_COUNT]uint32{4, 2, 4 + QK/2, 4*2 + QK/2, 1, 2, 4, 0} + +func TypeSizeFloat(dt DType) float32 { + return float32(TYPE_SIZE[dt]) / float32(BLCK_SIZE[dt]) +} + +// available tensor operations +type optype uint8 + +const ( + OP_NONE optype = iota + OP_DUP + OP_ADD + OP_SUB + OP_MUL + OP_DIV + OP_SQR + OP_SQRT + OP_SUM + OP_MEAN + OP_REPEAT + OP_ABS + OP_SGN + OP_NEG + OP_STEP + OP_RELU + OP_GELU + OP_SILU + OP_NORM + OP_RMS_NORM + + OP_MUL_MAT + + OP_SCALE + OP_CPY + OP_RESHAPE + OP_VIEW + OP_PERMUTE + OP_TRANSPOSE + OP_GET_ROWS + OP_DIAG_MASK_INF + OP_SOFT_MAX + OP_ROPE + OP_CONV_1D_1S + OP_CONV_1D_2S + + OP_FLASH_ATTN + OP_FLASH_FF + + OP_COUNT +) + +// n-dimensional tensor +type Tensor struct { + Type DType + + Dims uint32 + NE [MAX_DIMS]uint32 // number of elements + NB [MAX_DIMS]uint32 // stride in bytes + + op optype + + isParam bool + + grad *Tensor + Src0 *Tensor + Src1 *Tensor + opt [MAX_OPT]*Tensor // FIXME Do we need this? 
+ + TasksCount int + + // performance + //perfRuns uint32 + //perfCycles uint32 + //perfTime uint64 + + Data []float32 + //padding [8]byte +} + +// static inline bool ggml_is_contiguous(const struct ggml_tensor * tensor) { +func (tensor *Tensor) IsContiguous() bool { + // static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function"); + // + return tensor.NB[0] == TYPE_SIZE[tensor.Type] && + tensor.NB[1] == tensor.NB[0]*tensor.NE[0]/BLCK_SIZE[tensor.Type] && + tensor.NB[2] == tensor.NB[1]*tensor.NE[1] && + tensor.NB[3] == tensor.NB[2]*tensor.NE[2] +} + +func AreSameShape(a, b *Tensor) bool { + ////static_assert(MAX_DIMS == 4, "MAX_DIMS is not 4 - update this function"); + return (a.NE[0] == b.NE[0]) && (a.NE[1] == b.NE[1]) && (a.NE[2] == b.NE[2]) && (a.NE[3] == b.NE[3]) +} + +func (t *Tensor) Nelements() uint32 { + ////static_assert(MAX_DIMS == 4, "MAX_DIMS is not 4 - update this function"); + return t.NE[0] * t.NE[1] * t.NE[2] * t.NE[3] +} + +func (t *Tensor) Nrows() uint32 { + ////static_assert(MAX_DIMS == 4, "MAX_DIMS is not 4 - update this function"); + return t.NE[1] * t.NE[2] * t.NE[3] +} + +// size_t ggml_nbytes(const struct ggml_tensor * tensor) { +func (t *Tensor) Nbytes() uint32 { + ////static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function"); + return (t.Nelements() * TYPE_SIZE[t.Type]) / BLCK_SIZE[t.Type] +} + +// struct ggml_tensor * ggml_view_tensor( +func ViewTensor(ctx *Context, src *Tensor) *Tensor { + return NewTensor(ctx, src.Type, src.Dims, src.NE[0], src.NE[1], src.NE[2], src.NE[3], src.Data) +} + +// ggml.c : ggml_dup_tensor +func DupTensor(ctx *Context, src *Tensor) *Tensor { + return NewTensor(ctx, src.Type, src.Dims, src.NE[0], src.NE[1], src.NE[2], src.NE[3], nil) +} + +// struct ggml_tensor * Mul( +func Mul(ctx *Context, a, b *Tensor) *Tensor { + return MulImpl(ctx, a, b, false) +} + +// struct ggml_tensor * Mul_inplace( +func MulInplace(ctx *Context, a, b *Tensor) *Tensor { + return MulImpl(ctx, a, b, true) +} + +// struct ggml_tensor * Mul_impl( +func MulImpl(ctx *Context, a, b *Tensor, inplace bool) *Tensor { + ////ASSERT(ggml_are_same_shape(a, b)); + + if !AreSameShape(a, b) { + fmt.Printf("\n[STOP] MulImpl - tensors of different shapes!") + os.Exit(1) + } + + isNode := false + + if inplace && (a.grad != nil || b.grad != nil) { + isNode = true + } + + if inplace { + ////ASSERT(is_node == false); + } + + var result *Tensor + if inplace { + result = ViewTensor(ctx, a) + } else { + result = DupTensor(ctx, a) + } + + result.op = OP_MUL + result.Src0 = a + result.Src1 = b + + if isNode { + result.grad = DupTensor(ctx, result) + } else { + result.grad = nil + } + + return result +} + +// ggml_can_mul_mat +func CanMulMat(t0, t1 *Tensor) bool { + ////static_assert(MAX_DIMS == 4, "MAX_DIMS is not 4 - update this function"); + return (t0.NE[0] == t1.NE[0]) && (t0.NE[2] == t1.NE[2]) && (t0.NE[3] == t1.NE[3]) +} + +// ggml_mul_mat +func MulMat(ctx *Context, a, b *Tensor) *Tensor { + ////ASSERT(ggml_can_mul_mat(a, b)); + ////GGML_ASSERT(!ggml_is_transposed(a)); + + isNode := false + + if a.grad != nil || b.grad != nil { + isNode = true + } + + result := NewTensor(ctx, TYPE_F32, min32(a.Dims, b.Dims), a.NE[1], b.NE[1], a.NE[2], b.NE[3], nil) + + result.op = OP_MUL_MAT + result.Src0 = a + result.Src1 = b + + if isNode { + result.grad = DupTensor(ctx, result) + } else { + result.grad = nil + } + + return result +} + +// ggml_add + +func AddImpl(ctx *Context, a, b *Tensor, inplace bool) *Tensor { + 
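The stride (NB) and MulMat shape arithmetic above is easy to get wrong when porting, so here is the same arithmetic written out for plain FP32 tensors (BLCK_SIZE = 1, TYPE_SIZE = 4 bytes), using the 784x500 MNIST weight as an assumed example. For quantized Q4_0 data the same tables give 20 bytes per 32-element block, i.e. TypeSizeFloat of 0.625 bytes per value.

```go
package main

import "fmt"

// Reproduces the stride and shape arithmetic from NewTensor and MulMat for
// plain FP32 tensors. Purely illustrative, no ml dependency.
func main() {
	const typeSize = 4 // bytes per FP32 element, BLCK_SIZE = 1

	// fc1_weight in the MNIST example: NE = [784, 500, 1, 1]
	ne := [4]uint32{784, 500, 1, 1}
	var nb [4]uint32
	nb[0] = typeSize
	nb[1] = nb[0] * ne[0] // 3136 bytes per row
	nb[2] = nb[1] * ne[1]
	nb[3] = nb[2] * ne[2]
	fmt.Println("strides:", nb) // [4 3136 1568000 1568000]

	// MulMat(a, b) allocates dst with NE = [a.NE[1], b.NE[1], a.NE[2], b.NE[3]]:
	// a 784x500 weight times a 784-element input yields a 500-element result.
	input := [4]uint32{784, 1, 1, 1}
	dst := [4]uint32{ne[1], input[1], ne[2], input[3]}
	fmt.Println("mulmat result NE:", dst) // [500 1 1 1]
}
```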
////ASSERT(ggml_are_same_shape(a, b)); + + //bool is_node = false; + + ////if (!inplace && (a.grad || b.grad)) { + ////is_node = true; + ////} + + ////struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a); + var result *Tensor + if inplace { + result = ViewTensor(ctx, a) + } else { + result = DupTensor(ctx, a) + } + + result.op = OP_ADD + ////result.grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; + result.grad = nil + result.Src0 = a + result.Src1 = b + + return result +} + +func Add(ctx *Context, a, b *Tensor) *Tensor { + return AddImpl(ctx, a, b, false) +} + +func AddInplace(ctx *Context, a, b *Tensor) *Tensor { + return AddImpl(ctx, a, b, true) +} + +// ggml_sum + +func Sum(ctx *Context, a *Tensor) *Tensor { + isNode := false + + if a.grad != nil { + isNode = true + } + + result := NewTensor1D(ctx, a.Type, 1) + + result.op = OP_SUM + result.Src0 = a + result.Src1 = nil + + if isNode { + result.grad = DupTensor(ctx, result) + } else { + result.grad = nil + } + + return result +} + +// ggml_sub + +func SubImpl(ctx *Context, a, b *Tensor, inplace bool) *Tensor { + ////ASSERT(ggml_are_same_shape(a, b)); + + ////bool is_node = false; + + ////if (!inplace && (a.grad || b.grad)) { + ////is_node = true; + ////} + + var result *Tensor + if inplace { + result = ViewTensor(ctx, a) + } else { + result = DupTensor(ctx, a) + } + + result.op = OP_SUB + ////result.grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; + result.grad = nil + result.Src0 = a + result.Src1 = b + + return result +} + +func Sub(ctx *Context, a, b *Tensor) *Tensor { + return SubImpl(ctx, a, b, false) +} + +func SubInplace(ctx *Context, a, b *Tensor) *Tensor { + return SubImpl(ctx, a, b, true) +} + +// ggml_div + +func DivImpl(ctx *Context, a, b *Tensor, inplace bool) *Tensor { + ////ASSERT(ggml_are_same_shape(a, b)); + + ////bool is_node = false; + + ////if (!inplace && (a->grad || b->grad)) { + ////is_node = true; + ////} + + ////if (inplace) { + ////ASSERT(is_node == false); + ////} + + var result *Tensor + if inplace { + result = ViewTensor(ctx, a) + } else { + result = DupTensor(ctx, a) + } + + result.op = OP_DIV + ////result->grad = is_node ? 
ggml_dup_tensor(ctx, result) : NULL; + result.grad = nil + result.Src0 = a + result.Src1 = b + + return result +} + +func Div(ctx *Context, a, b *Tensor) *Tensor { + return DivImpl(ctx, a, b, false) +} + +func DivInplace(ctx *Context, a, b *Tensor, inplace bool) *Tensor { + return DivImpl(ctx, a, b, true) +} + +// ggml_sgn + +func SgnImpl(ctx *Context, a *Tensor, inplace bool) *Tensor { + isNode := false + + if !inplace && a.grad != nil { + isNode = true + } + + var result *Tensor + if inplace { + result = ViewTensor(ctx, a) + } else { + result = DupTensor(ctx, a) + } + + result.op = OP_SGN + result.Src0 = a + result.Src1 = nil + + if isNode { + result.grad = DupTensor(ctx, result) + } else { + result.grad = nil + } + + return result +} + +func Sgn(ctx *Context, a *Tensor) *Tensor { + return SgnImpl(ctx, a, false) +} + +func SgnInplace(ctx *Context, a *Tensor) *Tensor { + return SgnImpl(ctx, a, true) +} + +// ggml_relu + +func ReluImpl(ctx *Context, a *Tensor, inplace bool) *Tensor { + isNode := false + + if !inplace && a.grad != nil { + isNode = true + } + + var result *Tensor + if inplace { + result = ViewTensor(ctx, a) + } else { + result = DupTensor(ctx, a) + } + + result.op = OP_RELU + result.Src0 = a + result.Src1 = nil + + if isNode { + result.grad = DupTensor(ctx, result) + } else { + result.grad = nil + } + + return result +} + +func Relu(ctx *Context, a *Tensor) *Tensor { + return ReluImpl(ctx, a, false) +} + +func ReluInplace(ctx *Context, a *Tensor) *Tensor { + return ReluImpl(ctx, a, true) +} + +// Repeat + +// struct ggml_tensor * Repeat( +func Repeat(ctx *Context, a, b *Tensor) *Tensor { + ////ASSERT(ggml_can_repeat(a, b)); + + isNode := false + + if a.grad != nil { + isNode = true + } + + if AreSameShape(a, b) && !isNode { + return a + } + + //struct ggml_tensor * result = ggml_new_tensor(ctx, a.type, b.n_dims, b.ne); + result := NewTensor(ctx, a.Type, b.Dims, b.NE[0], b.NE[1], b.NE[2], b.NE[3], nil) + + result.op = OP_REPEAT + result.Src0 = a + result.Src1 = b + + if isNode { + result.grad = DupTensor(ctx, result) + } else { + result.grad = nil + } + + return result +} + +func IsScalar(tensor *Tensor) bool { + ////static_assert(MAX_DIMS == 4, "MAX_DIMS is not 4 - update this function"); + return tensor.NE[0] == 1 && tensor.NE[1] == 1 && tensor.NE[2] == 1 && tensor.NE[3] == 1 +} + +func IsVector(tensor *Tensor) bool { + ////static_assert(MAX_DIMS == 4, "MAX_DIMS is not 4 - update this function"); + return tensor.NE[1] == 1 && tensor.NE[2] == 1 && tensor.NE[3] == 1 +} + +func IsMatrix(tensor *Tensor) bool { + ////static_assert(MAX_DIMS == 4, "MAX_DIMS is not 4 - update this function"); + return tensor.NE[2] == 1 && tensor.NE[3] == 1 +} + +// ggml_get_rows +func GetRows(ctx *Context, a, b *Tensor) *Tensor { + ////ASSERT(ggml_is_matrix(a) && ggml_is_vector(b) && b.type == TYPE_I32); + if !IsMatrix(a) || !IsVector(b) /* || b.Type != TYPE_I32 */ { + fmt.Printf("\n[ERROR] GetRows fail basic assertions") + os.Exit(1) + } + + isNode := false + + if a.grad != nil || b.grad != nil { + ////ASSERT(false); // TODO: implement backward + isNode = true + fmt.Printf("\n[STOP] ml.GetRows") + os.Exit(1) + } + + // TODO: implement non F32 return + //struct ggml_tensor * result = ggml_new_tensor_2d(ctx, a.type, a.ne[0], b.ne[0]); + result := NewTensor2D(ctx, TYPE_F32, a.NE[0], b.NE[0]) + + result.op = OP_GET_ROWS + if isNode { + result.grad = DupTensor(ctx, result) + } else { + result.grad = nil + } + + result.Src0 = a + result.Src1 = b + + return result +} + +func RMSNorm(ctx *Context, a 
*Tensor) *Tensor { + return RMSNormImpl(ctx, a, false) +} + +func RMSNormInplace(ctx *Context, a *Tensor) *Tensor { + return RMSNormImpl(ctx, a, true) +} + +// //struct ggml_tensor * ggml_rms_norm_impl( +func RMSNormImpl(ctx *Context, a *Tensor, inplace bool) *Tensor { + isNode := false + + if !inplace && a.grad != nil { + ////ASSERT(false); // TODO: implement backward + isNode = true + fmt.Printf("\n[STOP] ml.GetRows") + os.Exit(1) + } + + ////struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a); + var result *Tensor + if inplace { + result = ViewTensor(ctx, a) + } else { + result = DupTensor(ctx, a) + } + + result.op = OP_RMS_NORM + result.Src0 = a + result.Src1 = nil // TODO: maybe store epsilon here? + + if isNode { + result.grad = DupTensor(ctx, result) + } else { + result.grad = nil + } + + return result +} + +// ggml_view_1d +// NB! Originally offset in bytes, but here in floats (4-bytes) +func View1D(ctx *Context, a *Tensor, ne0 uint32, offset uint32) *Tensor { + ////if a.grad != nil { + //// ////ASSERT(false); // gradient propagation is not supported + //// fmt.Printf("\n[STOP] View1D : gradient propagation is not supported") + //// os.Exit(1) + ////} + + slice := a.Data[offset:] + result := NewTensor(ctx, a.Type, 1, ne0, 1, 1, 1, slice) + + result.op = OP_VIEW + result.grad = nil + result.Src0 = a + result.Src1 = nil // TODO: maybe store the offset here? + + return result +} + +// ggml_build_forward_impl +func BuildForwardImpl(graph *Graph, tensor *Tensor, expand bool) { + + if !expand { + graph.NodesCount = 0 + graph.LeafsCount = 0 + } + + n0 := graph.NodesCount + VisitParents(graph, tensor) + n_new := graph.NodesCount - n0 + + if n_new > 0 { + // the last added node should always be starting point + ////ASSERT(cgraph.nodes[cgraph.n_nodes - 1] == tensor); + if !(graph.Nodes[graph.NodesCount-1] == tensor) { + fmt.Printf("\n[STOP] BuildForwardImpl : the last added node should always be starting point!") + os.Exit(1) + } + } +} + +// ggml_build_forward_expand +func BuildForwardExpand(graph *Graph, tensor *Tensor) { + BuildForwardImpl(graph, tensor, true) + // construct the tensor => NodeID mapping + ConstructTensor2NodeIDMapping(graph) +} + +func ConstructTensor2NodeIDMapping(graph *Graph) { + if graph.Tensor2NodeID == nil { + graph.Tensor2NodeID = make(map[*Tensor]uint32) + } + cnt := uint32(0) + for i := uint32(0); i < graph.NodesCount; i++ { + node := graph.Nodes[i] + graph.Tensor2NodeID[node] = cnt + cnt++ + } + // add leaves + for i := uint32(0); i < graph.LeafsCount; i++ { + node := graph.Leafs[i] + graph.Tensor2NodeID[node] = cnt + cnt++ + } +} + +// ggml_visit_parents +func VisitParents(graph *Graph, node *Tensor) { + + if node.grad == nil { + // this usually happens when we generate intermediate nodes from constants in the backward pass + // it can also happen during forward pass, if the user performs computations with constants + if node.op != OP_NONE { + //PRINT_DEBUG("%s: warning: node %p has no grad, but op %d\n", __func__, (void *) node, node.op); + } + } + + // check if already visited + for i := uint32(0); i < graph.NodesCount; i++ { + if graph.Nodes[i] == node { + return + } + } + + for i := uint32(0); i < graph.LeafsCount; i++ { + if graph.Leafs[i] == node { + return + } + } + + if node.Src0 != nil { + VisitParents(graph, node.Src0) + } + + if node.Src1 != nil { + VisitParents(graph, node.Src1) + } + + for i := 0; i < MAX_OPT; i++ { + if node.opt[i] != nil { + VisitParents(graph, node.opt[i]) + } + } + + if node.op == OP_NONE 
&& node.grad == nil { + // reached a leaf node, not part of the gradient graph (e.g. a constant) + ////ASSERT(cgraph.n_leafs < MAX_NODES); + + graph.Leafs[graph.LeafsCount] = node + graph.LeafsCount++ + } else { + ////ASSERT(cgraph.n_nodes < MAX_NODES); + + graph.Nodes[graph.NodesCount] = node + graph.Grads[graph.NodesCount] = node.grad + graph.NodesCount++ + } +} + +// ggml_cpy +func CopyImpl(ctx *Context, a, b *Tensor, inplace bool) *Tensor { + + ////ASSERT(ggml_nelements(a) == ggml_nelements(b)); + if a.Nelements() != b.Nelements() { + fmt.Printf("\n[HALT] Copy tensors of different dimensions!") + os.Exit(1) + } + + isNode := false + + if !inplace && (a.grad != nil || b.grad != nil) { + ////ASSERT(false); // TODO: implement backward + isNode = true + fmt.Printf("\n[STOP] cpyImpl") + os.Exit(1) + } + + // make a view of the destination + result := ViewTensor(ctx, b) + + result.op = OP_CPY + result.Src0 = a + result.Src1 = b + + if isNode { + result.grad = DupTensor(ctx, result) + } else { + result.grad = nil + } + + return result +} + +func Copy(ctx *Context, a, b *Tensor) *Tensor { + return CopyImpl(ctx, a, b, false) +} + +func CopyInplace(ctx *Context, a, b *Tensor) *Tensor { + return CopyImpl(ctx, a, b, true) +} + +// computation graph +type Graph struct { + NodesCount uint32 + LeafsCount uint32 + ThreadsCount int + + Jobs chan *ComputeParams + + Nodes [MAX_NODES]*Tensor + Grads [MAX_NODES]*Tensor + Leafs [MAX_NODES]*Tensor + + Tensor2NodeID map[*Tensor]uint32 // *tensor => NodeID + // performance + //perfRuns uint64 + //perfCycles uint64 + ////int64_t perf_time_us; +} + +type InitParams struct { +} + +// ml/ggml.c:2248 +// TODO Do we need this? +type Context struct { +} + +// ggml_new_tensor_1d +func NewTensor1D(ctx *Context, dt DType, ne0 uint32) *Tensor { + return NewTensor(ctx, dt, 1, ne0, 1, 1, 1, nil) +} + +func NewTensor1DWithData(ctx *Context, dt DType, ne0 uint32, data []float32) *Tensor { + return NewTensor(ctx, dt, 1, ne0, 1, 1, 1, data) +} + +// ggml_new_tensor_2d +func NewTensor2D(ctx *Context, dt DType, ne0, ne1 uint32) *Tensor { + return NewTensor(ctx, dt, 2, ne0, ne1, 1, 1, nil) +} + +func NewTensor2DWithData(ctx *Context, dt DType, ne0, ne1 uint32, data []float32) *Tensor { + return NewTensor(ctx, dt, 2, ne0, ne1, 1, 1, data) +} + +func NewTensor3D(ctx *Context, dt DType, ne0, ne1, ne2 uint32) *Tensor { + return NewTensor(ctx, dt, 3, ne0, ne1, ne2, 1, nil) +} + +func NewTensor4D(ctx *Context, dt DType, ne0, ne1, ne2, ne3 uint32) *Tensor { + return NewTensor(ctx, dt, 4, ne0, ne1, ne2, ne3, nil) +} + +// ggml_new_tensor_impl +func NewTensor(ctx *Context, dt DType, dims uint32, ne0, ne1, ne2, ne3 uint32, data []float32) *Tensor { + + if dt != TYPE_F32 && dt != TYPE_I32 { + fmt.Printf("\n[ERROR] NewTensorImpl got not supported type : %d", dt) + os.Exit(1) + } + + ////ggml_assert_aligned(result); + + result := Tensor{ + Type: dt, + Dims: dims, + NE: [4]uint32{ne0, ne1, ne2, ne3}, + op: OP_NONE, + } + + ////result->nb[0] = GGML_TYPE_SIZE[type]; + ////result->nb[1] = result->nb[0]*(result->ne[0]/GGML_BLCK_SIZE[type]); + ////for (int i = 2; i < GGML_MAX_DIMS; i++) { + //// result->nb[i] = result->nb[i - 1]*result->ne[i - 1]; + ////} + + result.NB[0] = TYPE_SIZE[dt] + result.NB[1] = TYPE_SIZE[dt] * (result.NE[0] / BLCK_SIZE[dt]) + result.NB[2] = result.NB[1] * result.NE[1] + result.NB[3] = result.NB[2] * result.NE[2] + + total := ne0 * ne1 * ne2 * ne3 + + if data == nil { + result.Data = make([]float32, total, total) // &newData + } else { + result.Data = data + } + + return 
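BuildForwardExpand walks the expression tree depth-first (VisitParents) and files every tensor either as a leaf (op == OP_NONE, no grad) or as an op node, then fills the Tensor2NodeID map with nodes first and leaves after. A small usage sketch; the mlgo/ml import path is an assumption based on this module's layout.

```go
package main

import (
	"fmt"

	"mlgo/ml"
)

func main() {
	ctx := &ml.Context{}

	w := ml.NewTensor2D(ctx, ml.TYPE_F32, 4, 3) // NE = [4, 3]
	x := ml.NewTensor1D(ctx, ml.TYPE_F32, 4)
	b := ml.NewTensor1D(ctx, ml.TYPE_F32, 3)

	y := ml.Add(ctx, ml.MulMat(ctx, w, x), b)

	graph := ml.Graph{ThreadsCount: 1}
	ml.BuildForwardExpand(&graph, y)

	// Two op nodes (MUL_MAT, ADD) and three constant leafs (w, x, b).
	fmt.Println(graph.NodesCount, graph.LeafsCount) // 2 3
	fmt.Println(len(graph.Tensor2NodeID))           // 5
}
```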
&result +} + +// ggml_permute +func Permute(ctx *Context, a *Tensor, axis0, axis1, axis2, axis3 uint32) *Tensor { + + ////ASSERT(axis0 >= 0 && axis0 < MAX_DIMS); + ////ASSERT(axis1 >= 0 && axis1 < MAX_DIMS); + ////ASSERT(axis2 >= 0 && axis2 < MAX_DIMS); + ////ASSERT(axis3 >= 0 && axis3 < MAX_DIMS); + + ////ASSERT(axis0 != axis1); + ////ASSERT(axis0 != axis2); + ////ASSERT(axis0 != axis3); + ////ASSERT(axis1 != axis2); + ////ASSERT(axis1 != axis3); + ////ASSERT(axis2 != axis3); + + isNode := false + + if a.grad != nil { + ////ASSERT(false); // TODO: implement backward + isNode = true + fmt.Printf("\n[STOP] Permute error") + os.Exit(1) + } + + result := ViewTensor(ctx, a) + + var ne [MAX_DIMS]uint32 + var nb [MAX_DIMS]uint32 + + ne[axis0] = a.NE[0] + ne[axis1] = a.NE[1] + ne[axis2] = a.NE[2] + ne[axis3] = a.NE[3] + + nb[axis0] = a.NB[0] + nb[axis1] = a.NB[1] + nb[axis2] = a.NB[2] + nb[axis3] = a.NB[3] + + result.NE[0] = ne[0] + result.NE[1] = ne[1] + result.NE[2] = ne[2] + result.NE[3] = ne[3] + + result.NB[0] = nb[0] + result.NB[1] = nb[1] + result.NB[2] = nb[2] + result.NB[3] = nb[3] + + result.op = OP_PERMUTE + result.Src0 = a + result.Src1 = nil // TODO: maybe store the permutation here? + + if isNode { + result.grad = DupTensor(ctx, result) + } else { + result.grad = nil + } + + return result +} + +// ggml_rope +func Rope(ctx *Context, a *Tensor, past, dims, mode uint32) *Tensor { + ////ASSERT(n_past >= 0); + + isNode := false + + if a.grad != nil { + ////ASSERT(false); // TODO: implement backward + isNode = true + fmt.Printf("\n[STOP] Rope error") + os.Exit(1) + } + + // TODO: when implement backward, fix this: + //struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a); + result := ViewTensor(ctx, a) + + b := NewTensor1D(ctx, TYPE_I32, 3) + b.Data[0] = float32(past) + b.Data[1] = float32(dims) + b.Data[2] = float32(mode) + + result.op = OP_ROPE + result.Src0 = a + result.Src1 = b + + if isNode { + result.grad = DupTensor(ctx, result) + } else { + result.grad = nil + } + + return result +} + +func Reshape3D(ctx *Context, a *Tensor, ne0, ne1, ne2 uint32) *Tensor { + ////ASSERT(ggml_is_contiguous(a)); + ////ASSERT(ggml_nelements(a) == ne0*ne1*ne2); + + if !a.IsContiguous() { + fmt.Printf("\n[STOP] Reshape3D : tensor is NOT contiguous!") + os.Exit(1) + } + + if a.Nelements() != ne0*ne1*ne2 { + fmt.Printf("\n[STOP] Reshape3D : different elements number!") + os.Exit(1) + } + + ////bool is_node = false; + + ////if (a.grad) { + //// //// ASSERT(false); // TODO: implement backward + //// is_node = true; + ////} + + //ne := [3]uint32{ ne0, ne1, ne2 } + result := NewTensor(ctx, a.Type, 3, ne0, ne1, ne2, 1, a.Data) + + result.op = OP_RESHAPE + ////result.grad = is_node ? 
ggml_dup_tensor(ctx, result) : NULL; + result.grad = nil + result.Src0 = a + result.Src1 = nil + + return result +} + +// ggml_new_f32 +func NewFP32(ctx *Context, value float32) *Tensor { + result := NewTensor1D(ctx, TYPE_F32, 1) + SetFP32(result, value) + return result +} + +// ggml_set_f32 +func SetFP32(tensor *Tensor, value float32) *Tensor { + // FIXME Optimize with mem zeroing + n := tensor.Nelements() + for i := uint32(0); i < n; i++ { + ////ggml_vec_set_f32(nc, (float *)(data + i*n1), value); + tensor.Data[i] = value + } + return tensor +} + +// ggml_scale +func ScaleImpl(ctx *Context, a, b *Tensor, inplace bool) *Tensor { + ////ASSERT(ggml_is_scalar(b)); + ////ASSERT(ggml_is_padded_1d(a)); + + ////bool is_node = false; + + if !inplace && (a.grad != nil || b.grad != nil) { + ////ASSERT(false); // TODO: implement backward + ////is_node = true; + fmt.Printf("\n[STOP] ScaleImpl : assertion failed") + os.Exit(1) + } + + // TODO: when implement backward, fix this: + //struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a); + result := ViewTensor(ctx, a) + + result.op = OP_SCALE + ////result.grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; + result.grad = nil + result.Src0 = a + result.Src1 = b + + return result +} + +func Scale(ctx *Context, a, b *Tensor) *Tensor { + return ScaleImpl(ctx, a, b, false) +} + +func ScaleInplace(ctx *Context, a, b *Tensor) *Tensor { + return ScaleImpl(ctx, a, b, true) +} + +// ggml_diag_mask_inf +func DiagMaskInf(ctx *Context, a *Tensor, past uint32) *Tensor { + ////bool is_node = false; + + if a.grad != nil { + ////ASSERT(false); // TODO: implement backward + ////is_node = true; + fmt.Printf("\n[STOP] DiagMaskInf : assertion failed") + os.Exit(1) + } + + // TODO: when implement backward, fix this: + //struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a); + result := ViewTensor(ctx, a) + b := NewFP32(ctx, float32(past)) // FIXME NewI32(ctx, past) + + result.op = OP_DIAG_MASK_INF + ////result.grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; + result.grad = nil + result.Src0 = a + result.Src1 = b + + return result +} + +// ggml_soft_max +func SoftMax(ctx *Context, a *Tensor) *Tensor { + ////bool is_node = false; + + if a.grad != nil { + ////ASSERT(false); // TODO: implement backward + ////is_node = true; + fmt.Printf("\n[STOP] SoftMax : assertion failed") + os.Exit(1) + } + + // TODO: when implement backward, fix this: + //struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a); + result := ViewTensor(ctx, a) + + result.op = OP_SOFT_MAX + ////result.grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; + result.grad = nil + result.Src0 = a + result.Src1 = nil + + return result +} + +// ggml_silu + +func SiluImpl(ctx *Context, a *Tensor, inplace bool) *Tensor { + ////bool is_node = false; + + ////if (!inplace && (a.grad)) { + ////is_node = true; + ////} + + var result *Tensor + if inplace { + result = ViewTensor(ctx, a) + } else { + result = DupTensor(ctx, a) + } + + result.op = OP_SILU + ////result.grad = is_node ? 
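Reshape3D above creates a new header over the same backing slice (a.Data is passed straight into NewTensor), so no data is copied and writes through either tensor are visible in both. A short sketch; the mlgo/ml import path is assumed from this module's layout.

```go
package main

import (
	"fmt"

	"mlgo/ml"
)

func main() {
	ctx := &ml.Context{}

	a := ml.NewTensor1D(ctx, ml.TYPE_F32, 6)
	for i := range a.Data {
		a.Data[i] = float32(i)
	}

	// Reshape3D reuses a.Data, so the reshaped tensor aliases the original.
	r := ml.Reshape3D(ctx, a, 3, 2, 1)
	r.Data[0] = 42
	fmt.Println(a.Data[0])        // 42
	fmt.Println(r.NE[0], r.NE[1]) // 3 2
}
```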
ggml_dup_tensor(ctx, result) : NULL; + result.grad = nil + result.Src0 = a + result.Src1 = nil + + return result +} + +func Silu(ctx *Context, a *Tensor) *Tensor { + return SiluImpl(ctx, a, false) +} + +func SiluInplace(ctx *Context, a *Tensor) *Tensor { + return SiluImpl(ctx, a, true) +} + +// ggml_step + +func StepImpl(ctx *Context, a *Tensor, inplace bool) *Tensor { + isNode := false + + if !inplace && a.grad != nil { + isNode = true + } + + var result *Tensor + if inplace { + result = ViewTensor(ctx, a) + } else { + result = DupTensor(ctx, a) + } + + result.op = OP_STEP + result.Src0 = a + result.Src1 = nil + + if isNode { + result.grad = DupTensor(ctx, result) + } else { + result.grad = nil + } + + return result +} + +func Step(ctx *Context, a *Tensor) *Tensor { + return StepImpl(ctx, a, false) +} + +func StepInplace(ctx *Context, a *Tensor) *Tensor { + return StepImpl(ctx, a, true) +} + +// ggml_transpose + +func Transpose(ctx *Context, a *Tensor) *Tensor { + ////isNode := false + + if a.grad != nil { + ////ASSERT(false); // TODO: implement backward + ////is_node = true; + } + + result := ViewTensor(ctx, a) + + result.NE[0] = a.NE[1] + result.NE[1] = a.NE[0] + + result.NB[0] = a.NB[1] + result.NB[1] = a.NB[0] + + result.op = OP_TRANSPOSE + ////result.grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; + result.grad = nil + result.Src0 = a + result.Src1 = nil + + return result +} + +func BuildForward(tensor *Tensor) *Graph { + result := Graph{} + BuildForwardImpl(&result, tensor, false) + return &result +} + +func BuildBackward(ctx *Context, gf *Graph, keep bool) Graph { + + result := *gf + ////ASSERT(gf.n_nodes > 0); + + // if we are keeping the gradient graph, we have to detach the gradient nodes from the original graph + if keep { + for i := uint32(0); i < gf.NodesCount; i++ { + node := gf.Nodes[i] + + if node.grad != nil { + node.grad = DupTensor(ctx, node) + gf.Grads[i] = node.grad + } + } + } + + for i := gf.NodesCount - 1; i >= 0; i-- { + node := gf.Nodes[i] + + // because we detached the grad nodes from the original graph, we can afford inplace operations + if node.grad != nil { + ComputeBackward(ctx, node, keep) + } + } + + for i := gf.NodesCount - 1; i >= 0; i-- { + node := gf.Nodes[i] + + if node.isParam { + ////PRINT_DEBUG("%s: found root node %p\n", __func__, (void *) node); + BuildForwardImpl(&result, node.grad, true) + } + } + + return result +} + +//////////////////////////////////////////////////////////////////////////////// + +func ComputeBackward(ctx *Context, tensor *Tensor, inplace bool) { + + src0 := tensor.Src0 + src1 := tensor.Src1 + + switch tensor.op { + + case OP_DUP: + if src0.grad != nil { + src0.grad = AddImpl(ctx, src0.grad, tensor.grad, inplace) + } + case OP_ADD: + if src0.grad != nil { + src0.grad = AddImpl(ctx, src0.grad, tensor.grad, inplace) + } + if src1.grad != nil { + src1.grad = AddImpl(ctx, src1.grad, tensor.grad, inplace) + } + case OP_SUB: + if src0.grad != nil { + src0.grad = AddImpl(ctx, src0.grad, tensor.grad, inplace) + } + if src1.grad != nil { + src1.grad = SubImpl(ctx, src1.grad, tensor.grad, inplace) + } + case OP_MUL: + if src0.grad != nil { + src0.grad = + AddImpl(ctx, + src0.grad, + Mul(ctx, src1, tensor.grad), + inplace) + } + if src1.grad != nil { + src1.grad = + AddImpl(ctx, + src1.grad, + Mul(ctx, src0, tensor.grad), + inplace) + } + case OP_DIV: + if src0.grad != nil { + src0.grad = + AddImpl(ctx, + src0.grad, + Div(ctx, tensor.grad, src1), + inplace) + } + if src1.grad != nil { + src1.grad = + SubImpl(ctx, + src1.grad, + 
Mul(ctx, + tensor.grad, + Div(ctx, tensor, src1)), + inplace) + } + case OP_SQR: + if src0.grad != nil { + src0.grad = + AddImpl(ctx, + src0.grad, + Mul(ctx, + Mul(ctx, src0, tensor.grad), + Repeat(ctx, NewFP32(ctx, 2.0), src0)), + inplace) + } + case OP_SQRT: + if src0.grad != nil { + src0.grad = + AddImpl(ctx, + src0.grad, + Div(ctx, + Repeat(ctx, NewFP32(ctx, 0.5), tensor), + tensor), + inplace) + } + case OP_SUM: + if src0.grad != nil { + src0.grad = + AddImpl(ctx, + src0.grad, + Repeat(ctx, tensor.grad, src0.grad), + inplace) + } + case OP_MEAN: + //// ASSERT(false); // TODO: implement + case OP_REPEAT: + if src0.grad != nil { + src0.grad = + AddImpl(ctx, + src0.grad, + Sum(ctx, tensor.grad), + inplace) + } + case OP_ABS: + if src0.grad != nil { + src0.grad = + AddImpl(ctx, + src0.grad, + Mul(ctx, + Sgn(ctx, src0), + tensor.grad), + inplace) + } + case OP_SGN: + if src0.grad != nil { + // noop + } + case OP_NEG: + if src0.grad != nil { + src0.grad = SubImpl(ctx, src0.grad, tensor.grad, inplace) + } + case OP_STEP: + if src0.grad != nil { + // noop + } + case OP_RELU: + if src0.grad != nil { + src0.grad = SubImpl(ctx, + src0.grad, + Mul(ctx, + Step(ctx, src0), + tensor.grad), + inplace) + } + case OP_GELU: + //// ASSERT(false); // TODO: not implemented + case OP_SILU: + //// ASSERT(false); // TODO: not implemented + case OP_NORM: + //// ASSERT(false); // TODO: not implemented + case OP_RMS_NORM: + //// ASSERT(false); // TODO: not implemented + case OP_MUL_MAT: + if src0.grad != nil { + // TODO: this requires outer product - ggml_out_prod(ctx, src1, tensor.grad); + //// ASSERT(false); + fmt.Printf("\n[HALT] ComputeBackward : OP_MUL_MAT with src0.grad!") + os.Exit(1) + } + if src1.grad != nil { + src1.grad = + AddImpl(ctx, + src1.grad, + // TODO: fix transpose, the node will break the graph connections + MulMat(ctx, Transpose(ctx, src0), tensor.grad), + inplace) + } + case OP_SCALE: + //// ASSERT(false); // TODO: not implemented + case OP_CPY: + //// ASSERT(false); // TODO: not implemented + case OP_RESHAPE: + //// ASSERT(false); // TODO: not implemented + case OP_VIEW: + //// ASSERT(false); // not supported + case OP_PERMUTE: + //// ASSERT(false); // TODO: not implemented + case OP_TRANSPOSE: + //// ASSERT(false); // TODO: not implemented + case OP_GET_ROWS: + //// ASSERT(false); // TODO: not implemented + case OP_DIAG_MASK_INF: + //// ASSERT(false); // TODO: not implemented + case OP_SOFT_MAX: + //// ASSERT(false); // TODO: not implemented + case OP_ROPE: + //// ASSERT(false); // TODO: not implemented + case OP_CONV_1D_1S: + //// ASSERT(false); // TODO: not implemented + case OP_CONV_1D_2S: + //// ASSERT(false); // TODO: not implemented + case OP_FLASH_ATTN: + //// ASSERT(false); // not supported + case OP_FLASH_FF: + //// ASSERT(false); // not supported + case OP_NONE: + // nop + case OP_COUNT: + //// ASSERT(false); + } +} + +// --- + +type TaskType uint8 + +const ( + TASK_INIT TaskType = 0 + TASK_COMPUTE TaskType = 1 + TASK_FINALIZE TaskType = 2 +) + +type ComputeParams struct { + Type TaskType + + ith uint32 + nth uint32 + + tensor *Tensor + + wg *sync.WaitGroup +} + +// Golang doesn’t have unary Bitwise NOT(~) like other programming languages +// Here, you have to use Bitwise XOR(^) operator as Bitwise NOT +func up32(n uint32) uint32 { // FIXME Not needed ? + return uint32(n+31) & ^uint32(31) +} + +func up(n, m uint32) uint32 { // FIXME Not needed ? 
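The OP_MUL case of ComputeBackward above applies the element-wise product rule: for c = a*b, the gradient reaching a is b*dL/dc and the gradient reaching b is a*dL/dc, each accumulated into the existing grad. A dependency-free numeric sketch of that accumulation:

```go
package main

import "fmt"

// mulBackward mirrors the OP_MUL case: given the upstream gradient dc for
// c = a*b (element-wise), accumulate da += b*dc and db += a*dc.
func mulBackward(a, b, dc, da, db []float32) {
	for i := range a {
		da[i] += b[i] * dc[i]
		db[i] += a[i] * dc[i]
	}
}

func main() {
	a := []float32{1, 2, 3}
	b := []float32{4, 5, 6}
	dc := []float32{1, 1, 1} // pretend L = sum(c), so dL/dc = 1 everywhere
	da := make([]float32, 3)
	db := make([]float32, 3)
	mulBackward(a, b, dc, da, db)
	fmt.Println(da) // [4 5 6], equals b
	fmt.Println(db) // [1 2 3], equals a
}
```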
+ // assert m is a power of 2 + ////GGML_ASSERT((m & (m - 1)) == 0); + return uint32(n+m-1) & ^uint32(m-1) +} + +func max(a, b int) int { // FIXME Not needed ? + if a >= b { + return a + } + return b +} + +// Job is goroutine existing while the computation loop is active +// The main purpose of the Job is to perform some part +// of time consuming matrix multiplications +func Job(listen <-chan *ComputeParams) { + //fmt.Printf("\nJOB STARTED...") + for params := range listen { + + //fmt.Printf("\n...JOB SIGNAL") + ComputeForwardMulMatFP32( + params, + params.tensor.Src0, + params.tensor.Src1, + params.tensor) + + // DEBUG MULTI_THREAD + //if params.nth > 1 { + // defer params.wg.Done() + //defer fmt.Printf("\nTHREAD #%d ... defer Done()", params.ith) + //} + + //fmt.Printf("\n...JOB DONE") + params.wg.Done() + } + //fmt.Printf("\nJOB FINISHED...") +} + +func GraphCompute(ctx *Context, graph *Graph) { + + maxThreads := graph.ThreadsCount + + // --- init N job goroutines and channel to send tasks for them + + graph.Jobs = make(chan *ComputeParams, maxThreads) // TODO Right place to init? + defer close(graph.Jobs) + + // TODO Investigate https://pkg.go.dev/runtime#LockOSThread + for i := 0; i < maxThreads; i++ { + go Job(graph.Jobs) + } + + // --- initialize tasks + + { + // thread scheduling for the different operations + // TasksCount might be 0, 1, or ThreadsCount + for i := uint32(0); i < graph.NodesCount; i++ { + + ////struct ggml_tensor * node = cgraph->nodes[i]; + node := graph.Nodes[i] + + if DEBUG { + fmt.Printf("\n\n### STEP #%d ### %d - %d [ %d:%d:%d:%d ]", i, node.op, node.Type, node.NE[0], node.NE[1], node.NE[2], node.NE[3]) + } + + switch node.op { + + case OP_DUP: + node.TasksCount = 1 + case OP_ADD: + node.TasksCount = 1 // TODO threads + case OP_SUB: + case OP_MUL: + case OP_DIV: + case OP_SQR: + case OP_SQRT: + case OP_SUM: + case OP_MEAN: + case OP_REPEAT: + case OP_ABS: + case OP_SGN: + case OP_NEG: + case OP_STEP: + case OP_RELU: + node.TasksCount = 1 + case OP_GELU: + node.TasksCount = 1 // TODO threads + case OP_SILU: + node.TasksCount = 1 // TODO threads + case OP_NORM: + case OP_RMS_NORM: + node.TasksCount = 1 // TODO threads + case OP_MUL_MAT: + node.TasksCount = maxThreads + // TODO: use different scheduling for different matrix sizes + case OP_SCALE: + node.TasksCount = 1 // TODO threads + case OP_CPY: + case OP_RESHAPE: + case OP_VIEW: + case OP_PERMUTE: + case OP_TRANSPOSE: + case OP_GET_ROWS: + case OP_DIAG_MASK_INF: + node.TasksCount = 1 + case OP_SOFT_MAX: + node.TasksCount = 1 // TODO threads + case OP_ROPE: + ////node.TasksCount = 1 + case OP_CONV_1D_1S: + case OP_CONV_1D_2S: + node.TasksCount = 1 // TODO threads + ////ASSERT(node->src0->ne[3] == 1); + ////ASSERT(node->src1->ne[2] == 1); + ////ASSERT(node->src1->ne[3] == 1); + case OP_FLASH_ATTN: + node.TasksCount = 1 // TODO threads + case OP_FLASH_FF: + node.TasksCount = 1 // TODO threads + case OP_NONE: + node.TasksCount = 1 + case OP_COUNT: + fmt.Printf("\n[HALT] Something wrong with compute graph!") + os.Exit(1) + } + } + } + + for i := uint32(0); i < graph.NodesCount; i++ { + + node := graph.Nodes[i] + + if DEBUG { + fmt.Printf("\n\n### STEP #%d ### %d - %d [ %d:%d:%d:%d ]", i, node.op, node.Type, node.NE[0], node.NE[1], node.NE[2], node.NE[3]) + } + + params := ComputeParams{ + Type: TASK_INIT, + ith: 0, + nth: uint32(node.TasksCount), + } + + ComputeForward(graph, ¶ms, node) // TASK_INIT + + // --- COMPUTE + + // BREAKPOINT DEBUG + //if i > 1300 { + // fmt.Printf("\n\n=== HALT #%d ===", i) + // 
os.Exit(0) + //} + + params.Type = TASK_COMPUTE + ComputeForward(graph, ¶ms, node) + + // --- FINALIZE + + params.Type = TASK_FINALIZE + ComputeForward(graph, ¶ms, node) + } + +} + + + +// ======================================================================= + +func ComputeForward(graph *Graph, params *ComputeParams, tensor *Tensor) { + + switch tensor.op { + + case OP_DUP: + ////ggml_compute_forward_dup(params, tensor->src0, tensor); + fmt.Printf("\n[HALT] Please implement : ggml_compute_forward_dup") + os.Exit(1) + case OP_ADD: + ComputeForwardAddFP32(params, tensor.Src0, tensor.Src1, tensor) + case OP_SUB: + ////ggml_compute_forward_sub(params, tensor->src0, tensor->src1, tensor); + fmt.Printf("\n[HALT] Please implement : ggml_compute_forward_sub") + os.Exit(1) + case OP_MUL: + ComputeForwardMulFP32(params, tensor.Src0, tensor.Src1, tensor) + case OP_DIV: + ////ggml_compute_forward_div(params, tensor->src0, tensor->src1, tensor); + fmt.Printf("\n[HALT] Please implement : ggml_compute_forward_div") + os.Exit(1) + case OP_SQR: + ////ggml_compute_forward_sqr(params, tensor->src0, tensor); + fmt.Printf("\n[HALT] Please implement : ggml_compute_forward_sqr") + os.Exit(1) + case OP_SQRT: + ////ggml_compute_forward_sqrt(params, tensor->src0, tensor); + fmt.Printf("\n[HALT] Please implement : ggml_compute_forward_sqrt") + os.Exit(1) + case OP_SUM: + ////ggml_compute_forward_sum(params, tensor->src0, tensor); + fmt.Printf("\n[HALT] Please implement : ggml_compute_forward_sum") + os.Exit(1) + case OP_MEAN: + ////ggml_compute_forward_mean(params, tensor->src0, tensor); + fmt.Printf("\n[HALT] Please implement : ggml_compute_forward_mean") + os.Exit(1) + case OP_REPEAT: + ComputeForwardRepeatFP32(params, tensor.Src0, tensor) + case OP_ABS: + ////ggml_compute_forward_abs(params, tensor->src0, tensor); + fmt.Printf("\n[HALT] Please implement : ggml_compute_forward_abs") + os.Exit(1) + case OP_SGN: + ////ggml_compute_forward_sgn(params, tensor->src0, tensor); + fmt.Printf("\n[HALT] Please implement : ggml_compute_forward_sgn") + os.Exit(1) + case OP_NEG: + ////ggml_compute_forward_neg(params, tensor->src0, tensor); + fmt.Printf("\n[HALT] Please implement : ggml_compute_forward_neg") + os.Exit(1) + case OP_STEP: + ////ggml_compute_forward_step(params, tensor->src0, tensor); + fmt.Printf("\n[HALT] Please implement : ggml_compute_forward_step") + os.Exit(1) + case OP_RELU: + ////ggml_compute_forward_relu(params, tensor->src0, tensor); + ComputeForwardReluFP32(params, tensor.Src0, tensor) + case OP_GELU: + ////ggml_compute_forward_gelu(params, tensor->src0, tensor); + fmt.Printf("\n[HALT] Please implement : ggml_compute_forward_gelu") + os.Exit(1) + case OP_SILU: + ComputeForwardSiluFP32(params, tensor.Src0, tensor) + case OP_NORM: + ////ggml_compute_forward_norm(params, tensor->src0, tensor); + fmt.Printf("\n[HALT] Please implement : ggml_compute_forward_norm") + os.Exit(1) + case OP_RMS_NORM: + ComputeForwardRMSNormFP32(params, tensor.Src0, tensor) + case OP_MUL_MAT: + + if SINGLE_THREAD { + ComputeForwardMulMatFP32(params, tensor.Src0, tensor.Src1, tensor) + } else { + // TODO Optimize this + if params.Type == TASK_INIT || params.Type == TASK_FINALIZE { + return + } + + //ComputeForwardMulMatFP32(params, tensor.src0, tensor.src1, tensor) + //return + + wg := new(sync.WaitGroup) + wg.Add(graph.ThreadsCount) + + for i := 0; i < graph.ThreadsCount; i++ { + graph.Jobs <- &ComputeParams{ + Type: TASK_COMPUTE, + ith: uint32(i), + nth: uint32(graph.ThreadsCount), + tensor: tensor, + wg: wg, + } + } + + 
wg.Wait() + } + + case OP_SCALE: + ComputeForwardScaleFP32(params, tensor.Src0, tensor.Src1, tensor) + case OP_CPY: + ComputeForwardDupFP32(params, tensor.Src0, tensor) + case OP_RESHAPE: + ComputeForwardReshape(params, tensor.Src0, tensor) // NOP + case OP_VIEW: + ComputeForwardView(params, tensor.Src0) // NOP + case OP_PERMUTE: + ComputeForwardPermute(params, tensor.Src0) // NOP + case OP_TRANSPOSE: + ////ggml_compute_forward_transpose(params, tensor->src0); + fmt.Printf("\n[HALT] Please implement : ggml_compute_forward_transpose") + os.Exit(1) + case OP_GET_ROWS: + ComputeForwardGetRows(params, tensor.Src0, tensor.Src1, tensor) + case OP_DIAG_MASK_INF: + ComputeForwardDiagMaskInfFP32(params, tensor.Src0, tensor.Src1, tensor) + case OP_SOFT_MAX: + ComputeForwardSoftMaxFP32(params, tensor.Src0, tensor) + case OP_ROPE: + ComputeForwardRopeFP32(params, tensor.Src0, tensor.Src1, tensor) + case OP_CONV_1D_1S: + ////ggml_compute_forward_conv_1d_1s(params, tensor->src0, tensor->src1, tensor); + fmt.Printf("\n[HALT] Please implement : ggml_compute_forward_conv_1d_1s") + os.Exit(1) + case OP_CONV_1D_2S: + ////ggml_compute_forward_conv_1d_2s(params, tensor->src0, tensor->src1, tensor); + fmt.Printf("\n[HALT] Please implement : ggml_compute_forward_conv_1d_2s") + os.Exit(1) + case OP_FLASH_ATTN: + ////int32_t t = ggml_get_i32_1d(tensor->opt[1], 0); + ////ASSERT(t == 0 || t == 1); + ////bool masked = t != 0; + ////ggml_compute_forward_flash_attn(params, tensor->src0, tensor->src1, tensor->opt[0], masked, tensor); + fmt.Printf("\n[HALT] Please implement : ggml_compute_forward_flash_attn") + os.Exit(1) + case OP_FLASH_FF: + ////ggml_compute_forward_flash_ff(params, tensor->src0, tensor->src1, tensor->opt[0], tensor->opt[1], tensor->opt[2], tensor); + fmt.Printf("\n[HALT] Please implement : ggml_compute_forward_flash_ff") + os.Exit(1) + case OP_NONE: + // nop + case OP_COUNT: + ////ASSERT(false); + fmt.Printf("\n[HALT] ComputeForward got OP_COUNT method!") + os.Exit(1) + } +} + +func VecCopyFP32(n uint32, y, x []float32) { + for i := uint32(0); i < n; i++ { + y[i] = x[i] + } +} + +// ggml_compute_forward_get_rows_f32 +func ComputeForwardGetRows(params *ComputeParams, src0, src1, dst *Tensor) { + + ////assert(params->ith == 0); + + if params.Type == TASK_INIT || params.Type == TASK_FINALIZE { + return + } + + nc := src0.NE[0] + nr := src1.Nelements() + + ////assert( dst->ne[0] == nc); + ////assert( dst->ne[1] == nr); + ////assert(src0->nb[0] == sizeof(float)); + + if dst.NE[0] != nc || dst.NE[1] != nr || src0.NB[0] != TYPE_SIZE[TYPE_F32] /*TYPE_SIZE[TYPE_I32]*/ { + fmt.Printf("[HALT]ComputeForwardGetRows : wrong dimensions!") + os.Exit(1) + } + + // FIXME Speed-up + ////for row := uint32(0); row < nr; row++ { + //// for column := uint32(0); column < nc; column++ { + //// (*dst.Data)[row*nr+column] = (*src0.Data)[row*nr+column] + //// } + ////} + + for i := uint32(0); i < nr; i++ { + r := uint32(src1.Data[i]) + + ////ggml_vec_cpy_f32(nc, + //// (float *) ((char *) dst->data + i*dst->nb[1]), + //// (float *) ((char *) src0->data + r*src0->nb[1])); + + // FIXME ASAP and double check! 
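The OP_MUL_MAT branch above is the only place work is fanned out: GraphCompute starts ThreadsCount Job goroutines reading from the Jobs channel, and each heavy node enqueues one ComputeParams per worker, then blocks on a WaitGroup. The same pattern in isolation, with a hypothetical work-item type standing in for ComputeParams:

```go
package main

import (
	"fmt"
	"sync"
)

// job stands in for ComputeParams: it tells a worker which share of the
// rows it owns and carries the WaitGroup used to signal completion.
type job struct {
	ith, nth int
	wg       *sync.WaitGroup
}

func worker(jobs <-chan job) {
	for j := range jobs {
		// ... compute this worker's share of the rows here ...
		fmt.Printf("worker %d/%d done\n", j.ith, j.nth)
		j.wg.Done()
	}
}

func main() {
	const threads = 4
	jobs := make(chan job, threads)
	defer close(jobs)
	for i := 0; i < threads; i++ {
		go worker(jobs)
	}

	// Per heavy op: enqueue one job per worker and block until every worker
	// calls Done, just like the wg.Wait() in the OP_MUL_MAT branch above.
	var wg sync.WaitGroup
	wg.Add(threads)
	for i := 0; i < threads; i++ {
		jobs <- job{ith: i, nth: threads, wg: &wg}
	}
	wg.Wait()
}
```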
+ // VecCopyFP32(nc, (*dst.Data)[i*dst.NE[0]:], (*src0.Data)[uint32(r)*src0.NE[0]:]) + // VecCopyFP32(nc, dst.Data[i*dst.NB[1]/4:], src0.Data[r*src0.NB[1]/4:]) + VecCopyFP32(nc, dst.Data[i*dst.NE[0]:], src0.Data[r*src0.NE[0]:]) + } +} + +// ggml_compute_forward_rms_norm_f32 +func ComputeForwardRMSNormFP32(params *ComputeParams, src0, dst *Tensor) { + + ////GGML_ASSERT(ggml_are_same_shape(src0, dst)); + ////GGML_ASSERT(src0->nb[0] == sizeof(float)); + + if params.Type == TASK_INIT || params.Type == TASK_FINALIZE { + return + } + + ith := params.ith + nth := params.nth + + ne00 := src0.NE[0] + ne01 := src0.NE[1] + ne02 := src0.NE[2] + ne03 := src0.NE[3] + + nb01 := src0.NB[1] + nb02 := src0.NB[2] + nb03 := src0.NB[3] + + nb1 := dst.NB[1] + nb2 := dst.NB[2] + nb3 := dst.NB[3] + + eps := 1e-5 // TODO: make this a parameter + + // TODO: optimize + for i03 := uint32(0); i03 < ne03; i03++ { + for i02 := uint32(0); i02 < ne02; i02++ { + for i01 := uint32(ith); i01 < ne01; i01 += nth { + + ////const float * x = (float *) ((char *) src0->data + i01*nb01 + i02*nb02 + i03*nb03); + x := src0.Data[i01*nb01/4+i02*nb02/4+i03*nb03/4:] + + mean := 0.0 + // TODO Simplify to directly access [src] + for i00 := uint32(0); i00 < ne00; i00++ { + ////mean += x[i00] * x[i00]; + mean += float64(x[i00] * x[i00]) + } + + mean /= float64(ne00) + + scale := float32(1.0 / math.Sqrt(mean+eps)) + + // TODO Simplify to directly update [dst] + ////float * y = (float *) ((char *) dst->data + i01*nb1 + i02*nb2 + i03*nb3); + y := dst.Data[i01*nb1/4+i02*nb2/4+i03*nb3/4:] + + ////memcpy(y, x, ne00 * sizeof(float)); + //VecScaleFP32(ne00, y, float32(scale)) + + for i := uint32(0); i < ne00; i++ { + y[i] = x[i] * scale + } + } + } + } +} + +// ggml_vec_scale_f32 +func VecScaleFP32(n uint32, y []float32, v float32) { + for i := uint32(0); i < n; i++ { + y[i] *= v + } +} + +// ggml_compute_forward_repeat +func ComputeForwardRepeatFP32(params *ComputeParams, src0, dst *Tensor) { + + ////assert(params->ith == 0); + ////assert(ggml_can_repeat(src0, dst)); + + if params.Type == TASK_INIT || params.Type == TASK_FINALIZE { + return + } + + // TODO: implement support for rank > 2 tensors + ////assert(src0->ne[2] == 1); + ////assert(src0->ne[3] == 1); + ////assert( dst->ne[2] == 1); + ////assert( dst->ne[3] == 1); + + nc := dst.NE[0] + nr := dst.NE[1] + nc0 := src0.NE[0] + nr0 := src0.NE[1] + ncr := nc / nc0 // guaranteed to be an integer due to the check in ggml_can_repeat + nrr := nr / nr0 // guaranteed to be an integer due to the check in ggml_can_repeat + + // TODO: support for transposed / permuted tensors + ////assert( dst->nb[0] == sizeof(float)); + ////assert(src0->nb[0] == sizeof(float)); + + // TODO: maybe this is not optimal? + for i := uint32(0); i < nrr; i++ { + for j := uint32(0); j < ncr; j++ { + for k := uint32(0); k < nr0; k++ { + + ////ggml_vec_cpy_f32(nc0, + ////(float *) ((char *) dst->data + (i*nr0 + k)*( dst->nb[1]) + j*nc0*( dst->nb[0])), + ////(float *) ((char *) src0->data + ( k)*(src0->nb[1]))); + + VecCopyFP32(nc0, + dst.Data[(i*nr0+k)*dst.NB[1]/4+j*nc0*dst.NB[0]/4:], + src0.Data[k*src0.NB[1]/4:]) + } + } + } + + if DEBUG { + printTensor(src0, "REPEAT SRC0") + printTensor(dst, "REPEAT DST") + } +} + +// ggml_compute_forward_relu + +// inline static void ggml_vec_relu_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = (x[i] > 0.f) ? 
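ComputeForwardRMSNormFP32 above normalizes each row by the root of its mean square, y[i] = x[i] / sqrt(mean(x^2) + eps), with eps fixed at 1e-5. A single-row sketch of the same math:

```go
package main

import (
	"fmt"
	"math"
)

// rmsNormRow mirrors the per-row computation of ComputeForwardRMSNormFP32.
func rmsNormRow(x []float32, eps float64) []float32 {
	mean := 0.0
	for _, v := range x {
		mean += float64(v) * float64(v)
	}
	mean /= float64(len(x))
	scale := float32(1.0 / math.Sqrt(mean+eps))

	y := make([]float32, len(x))
	for i, v := range x {
		y[i] = v * scale
	}
	return y
}

func main() {
	fmt.Println(rmsNormRow([]float32{1, 2, 3, 4}, 1e-5))
}
```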
x[i] : 0.f; } + +func VecReluFP32(n uint32, y, x []float32) { + for i := uint32(0); i < n; i++ { + if x[i] > 0 { + y[i] = x[i] + } else { + y[i] = 0 + } + } +} + +func ComputeForwardReluFP32(params *ComputeParams, src0, dst *Tensor) { + // assert(params->ith == 0); + // assert(ggml_are_same_shape(src0, dst)); + if !AreSameShape(src0, dst) { + fmt.Printf("\n[HALT] ComputeForwardReluFP32 : different shapes!") + os.Exit(1) + } + + if params.Type == TASK_INIT || params.Type == TASK_FINALIZE { + return + } + + n := src0.Nrows() + nc := src0.NE[0] + + // assert(dst->nb[0] == sizeof(float)); + // assert(src0->nb[0] == sizeof(float)); + + for i := uint32(0); i < n; i++{ + // ggml_vec_relu_f32(nc, + // (float *) ((char *) dst->data + i*( dst->nb[1])), + // (float *) ((char *) src0->data + i*(src0->nb[1]))); + VecReluFP32(nc, dst.Data[i*dst.NE[0]:], src0.Data[i*src0.NE[0]:]) + } +} + +func VecMulFP32(n uint32, z, x, y []float32) { + for i := uint32(0); i < n; i++ { + z[i] = x[i] * y[i] + } +} + +// ggml_compute_forward_mul +func ComputeForwardMulFP32(params *ComputeParams, src0, src1, dst *Tensor) { + + ////assert(params->ith == 0); + ////assert(ggml_are_same_shape(src0, src1) && ggml_are_same_shape(src0, dst)); + + if !AreSameShape(src0, src1) || !AreSameShape(src0, dst) { + fmt.Printf("\n[HALT] ComputeForwardMulFP32 : different shapes!") + os.Exit(1) + } + + if params.Type == TASK_INIT || params.Type == TASK_FINALIZE { + return + } + + n := src0.Nrows() + nc := src0.NE[0] + + ////assert( dst->nb[0] == sizeof(float)); + ////assert(src0->nb[0] == sizeof(float)); + ////assert(src1->nb[0] == sizeof(float)); + + for i := uint32(0); i < n; i++ { + + ////ggml_vec_mul_f32(nc, + ////(float *) ((char *) dst->data + i*( dst->nb[1])), + ////(float *) ((char *) src0->data + i*(src0->nb[1])), + ////(float *) ((char *) src1->data + i*(src1->nb[1]))); + + // FIXME NE vs NB + VecMulFP32(nc, dst.Data[i*dst.NE[0]:], src0.Data[i*src0.NE[0]:], src1.Data[i*src1.NE[0]:]) + } + + if DEBUG { + printTensor(src0, "MUL SRC0") + printTensor(src1, "MUL SRC1") + printTensor(dst, "MUL DST") + } +} + +// ggml_vec_dot_f32 +func VecDotFP32(n uint32, x, y []float32) float32 { + sumf := float32(0.0) + for i := uint32(0); i < n; i++ { + sumf += x[i] * y[i] + } + return sumf +} + +// ggml_vec_mad_f32 +func VecMadFP32(n uint32, y, x []float32, v float32) { + for i := uint32(0); i < n; i++ { + y[i] += x[i] * v + } +} + +// ggml_vec_acc_f32 +func VecAccFP32(n uint32, y, x []float32) { + for i := uint32(0); i < n; i++ { + y[i] += x[i] + } +} + +// ggml_compute_forward_mul_mat_f32 +func ComputeForwardMulMatFP32(params *ComputeParams, src0, src1, dst *Tensor) { + + if params.Type == TASK_INIT || params.Type == TASK_FINALIZE { + return + } + + ith := params.ith + nth := params.nth + + ne00 := src0.NE[0] + ne01 := src0.NE[1] + ne02 := src0.NE[2] + ne03 := src0.NE[3] + + //ne10 := src1.NE[0] // for BLAS only + ne11 := src1.NE[1] + //ne12 := src1.NE[2] + //ne13 := src1.NE[3] + + //ne0 := dst.NE[0] + //ne1 := dst.NE[1] + //ne2 := dst.NE[2] + //ne3 := dst.NE[3] + //ne := ne0 * ne1 * ne2 * ne3 + + //nb00 := src0.NB[0] + nb01 := src0.NB[1] / 4 + nb02 := src0.NB[2] / 4 + nb03 := src0.NB[3] / 4 + + //nb10 := src1.NB[0] + nb11 := src1.NB[1] / 4 + nb12 := src1.NB[2] / 4 + nb13 := src1.NB[3] / 4 + + nb0 := dst.NB[0] / 4 + nb1 := dst.NB[1] / 4 + nb2 := dst.NB[2] / 4 + nb3 := dst.NB[3] / 4 + + ////assert(ne02 == ne12); + ////assert(ne03 == ne13); + ////assert(ne2 == ne12); + ////assert(ne3 == ne13); + + // TODO: we don't support permuted src0 + 
////assert(nb00 == sizeof(float) || nb01 == sizeof(float)); + + // dst cannot be transposed or permuted + ////assert(nb0 == sizeof(float)); + ////assert(nb0 <= nb1); + ////assert(nb1 <= nb2); + ////assert(nb2 <= nb3); + + ////assert(ne0 == ne01); + ////assert(ne1 == ne11); + ////assert(ne2 == ne02); + ////assert(ne3 == ne03); + + // nb01 >= nb00 - src0 is not transposed + // compute by src0 rows + + /* + ////#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) + + ////if (ggml_compute_forward_mul_mat_use_blas(src0, src1, dst)) { + ////GGML_ASSERT(nb10 == sizeof(float)); + + if params.ith != 0 { + return + } + + if params.Type == TASK_INIT { + return + } + + if params.Type == TASK_FINALIZE { + return + } + + for i03 := uint32(0); i03 < ne03; i03++ { + for i02 := uint32(0); i02 < ne02; i02++ { + + const float * x = (float *) (src0->data); + + ////const float * y = (float *) ((char *) src1->data + i02*nb12 + i03*nb13); + + ////float * d = (float *) ((char *) dst->data + i02*nb2 + i03*nb3); + + // zT = y * xT + ////{ + ////cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasTrans, + ////ne11, ne01, ne10, + ////1.0f, y, ne10, + //// x, ne10, + ////0.0f, d, ne01); + ////} + ////} + ////} + + //printf("CBLAS F32 = %f ms, %d x %d x %d x %d\n", (ggml_perf_time_us() - t0)/1000.0, ne0, ne1, ne2, ne3); + + ////return; + ////} + ////#endif + */ + + // TODO: do not support transposed src1 + ////assert(nb10 == sizeof(float)); + ////if nb10 == 4 { + //// fmt.Printf("\n[HALT] Do not support transposed src1") + //// os.Exit(1) + ////} + + // parallelize by src0 rows using ggml_vec_dot_f32 + + // total rows in src0 + nr := ne01 * ne02 * ne03 + + // rows per thread + dr := (nr + nth - 1) / nth + + // row range for this thread + ir0 := dr * ith + ir1 := min32(ir0+dr, nr) + + for ir := uint32(ir0); ir < ir1; ir++ { + + // src0 indices + i03 := ir / (ne02 * ne01) + i02 := (ir - i03*ne02*ne01) / ne01 + i01 := (ir - i03*ne02*ne01 - i02*ne01) + + // src1 indices + i13 := i03 + i12 := i02 + //i11 := ic + + // dst indices + i0 := i01 + //i1 := i11 + i2 := i02 + i3 := i03 + + for ic := uint32(0); ic < ne11; ic++ { + + //dst.Data[i0*nb0+ic*nb1+i2*nb2+i3*nb3] = + // VecDotFP32(ne00, + // src0.Data[i01*nb01+i02*nb02+i03*nb03:], + // src1.Data[ic*nb11+i12*nb12+i13*nb13:]) + + // --- inline VecDotFP32 + + src0Ptr := src0.Data[i01*nb01+i02*nb02+i03*nb03:] + src1Ptr := src1.Data[ic*nb11+i12*nb12+i13*nb13:] + + sum := float32(0.0) + for i := uint32(0); i < ne00; i++ { + sum += src0Ptr[i] * src1Ptr[i] + } + + dst.Data[i0*nb0+ic*nb1+i2*nb2+i3*nb3] = sum + } + } + + if DEBUG { + fmt.Printf("\n\n>>> ComputeForwardMulMatFP32 OUT <<<\n") + printTensor(dst, "DST") + } + +} + +// ggml_compute_forward_view +func ComputeForwardView(params *ComputeParams, src0 *Tensor) { + // NOP +} + +func ComputeForwardCopy(params *ComputeParams, src0, dst *Tensor) { + ComputeForwardDupFP32(params, src0, dst) +} + +// ggml_compute_forward_dup_f32 +func ComputeForwardDupFP32(params *ComputeParams, src0, dst *Tensor) { + + ////GGML_ASSERT(params->ith == 0); + ////GGML_ASSERT(ggml_is_contiguous(dst)); + ////GGML_ASSERT(ggml_nelements(dst) == ggml_nelements(src0)); + + if !dst.IsContiguous() { + fmt.Printf("[HALT] ComputeForwardDupFP32 : [dst] is NOT contiguous!") + os.Exit(1) + } + + if dst.Nelements() != src0.Nelements() { + fmt.Printf("[HALT] ComputeForwardDupFP32 : [dst] and [src0] capacities are different!") + os.Exit(1) + } + + if params.Type == TASK_INIT || params.Type == TASK_FINALIZE { + return + } + + ne00 := src0.NE[0] + ne01 := src0.NE[1] 
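ComputeForwardMulMatFP32 above parallelizes over src0 rows: each thread receives a contiguous chunk of dr = ceil(nr/nth) rows and computes all dot products for that range. For example, 500 rows across 4 threads split like this:

```go
package main

import "fmt"

// Same row partitioning as ComputeForwardMulMatFP32: dr rows per thread,
// thread ith owns the half-open range [ith*dr, min(ith*dr+dr, nr)).
func main() {
	nr, nth := uint32(500), uint32(4)
	dr := (nr + nth - 1) / nth // 125 rows per thread
	for ith := uint32(0); ith < nth; ith++ {
		ir0 := dr * ith
		ir1 := ir0 + dr
		if ir1 > nr {
			ir1 = nr
		}
		fmt.Printf("thread %d: rows [%d, %d)\n", ith, ir0, ir1)
	}
}
```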
+ ne02 := src0.NE[2] + ne03 := src0.NE[3] + + nb00 := src0.NB[0] / 4 + nb01 := src0.NB[1] / 4 + nb02 := src0.NB[2] / 4 + nb03 := src0.NB[3] / 4 + + ////if (ggml_is_contiguous(src0) && src0->type == dst->type) { + if src0.IsContiguous() && src0.Type == dst.Type { + ////memcpy(dst->data, src0->data, ggml_nelements(dst) * GGML_TYPE_SIZE[src0->type]); + copy(dst.Data, src0.Data) + return + } + + // --- src0 is NOT contigious + // --- supporting only 4-bytes data for [src0] and FP32 for [dst] + + if src0.NB[0] == TYPE_SIZE[TYPE_F32] { + if dst.Type == TYPE_F32 { + + id := uint32(0) + rs := ne00 * nb00 + + for i03 := uint32(0); i03 < ne03; i03++ { + for i02 := uint32(0); i02 < ne02; i02++ { + for i01 := uint32(0); i01 < ne01; i01++ { + + ////const char * src0_ptr = (char *) src0->data + i01*nb01 + i02*nb02 + i03*nb03; + src0Ptr := src0.Data[i01*nb01+i02*nb02+i03*nb03 : i01*nb01+i02*nb02+i03*nb03+rs] + ////char * dst_ptr = (char *) dst->data + id*rs; + dstPtr := dst.Data[id*rs : id*rs+rs] + ////memcpy(dst_ptr, src0_ptr, rs); + copy(dstPtr, src0Ptr) + + id++ + } + } + } + ////} else if (dst->type == GGML_TYPE_F16) { + //// int id = 0; + //// ggml_fp16_t * dst_ptr = (ggml_fp16_t *) dst->data; + + //// for (int i03 = 0; i03 < ne03; i03++) { + //// for (int i02 = 0; i02 < ne02; i02++) { + //// for (int i01 = 0; i01 < ne01; i01++) { + //// for (int i00 = 0; i00 < ne00; i00++) { + //// const float * src0_ptr = (float *) ((char *) src0->data + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03); + + //// dst_ptr[id] = GGML_FP32_TO_FP16(*src0_ptr); + //// id++; + //// } + //// } + //// } + //// } + } else { + ////GGML_ASSERT(false); // TODO: implement + fmt.Printf("[HALT] ComputeForwardDupFP32 : not supported tensor type!") + os.Exit(1) + } + } else { + + if dst.Type == TYPE_F32 { + + id := 0 + ////dstPtr = (float *) dst->data; + + for i03 := uint32(0); i03 < ne03; i03++ { + for i02 := uint32(0); i02 < ne02; i02++ { + for i01 := uint32(0); i01 < ne01; i01++ { + for i00 := uint32(0); i00 < ne00; i00++ { + + //src0Ptr := src0.Data[i00*nb00/4 + i01*nb01/4 + i02*nb02/4 + i03*nb03/4:] + //dstPtr[id] = *src0_ptr; + + dst.Data[id] = src0.Data[i00*nb00+i01*nb01+i02*nb02+i03*nb03] + + id++ + } + } + } + } + ////} else if (dst->type == GGML_TYPE_F16) { + //// int id = 0; + //// ggml_fp16_t * dst_ptr = (ggml_fp16_t *) dst->data; + + //// for (int i03 = 0; i03 < ne03; i03++) { + //// for (int i02 = 0; i02 < ne02; i02++) { + //// for (int i01 = 0; i01 < ne01; i01++) { + //// for (int i00 = 0; i00 < ne00; i00++) { + //// const float * src0_ptr = (float *) ((char *) src0->data + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03); + + //// dst_ptr[id] = GGML_FP32_TO_FP16(*src0_ptr); + //// id++; + //// } + //// } + //// } + //// } + } else { + ////GGML_ASSERT(false) // TODO: implement + fmt.Printf("[HALT] ComputeForwardDupFP32 : not supported tensor type!") + os.Exit(1) + } + } + + if DEBUG { + fmt.Printf("\n\n>>> ComputeForwardDupFP32 OUT <<<\n") + } +} + +// ggml_compute_forward_reshape +func ComputeForwardReshape(params *ComputeParams, src0, dst *Tensor) { + // NOP +} + +// ggml_compute_forward_permute +func ComputeForwardPermute(params *ComputeParams, src0 *Tensor) { + // NOP +} + +// ggml_compute_forward_rope +func ComputeForwardRopeFP32(params *ComputeParams, src0, src1, dst *Tensor) { + + ////assert(params->ith == 0); + ////assert(src1->type == GGML_TYPE_I32); + ////assert(ggml_nelements(src1) == 3); + + if src1.Nelements() != 3 { + fmt.Printf("\n[HALT] ComputeForwardRopeFP32 : src1 has NOT EXACT 3 elements!") + os.Exit(1) + } + 
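The loop that follows rotates every consecutive (x0, x1) pair by the angle p*theta, with theta = 10000^(-i0/dims) and p the token position; this is the standard rotary position embedding. A single-pair sketch of the rotation:

```go
package main

import (
	"fmt"
	"math"
)

// ropePair rotates one (x0, x1) pair the same way the inner loop of
// ComputeForwardRopeFP32 does, for position p and pair offset i0.
func ropePair(x0, x1 float32, p, i0, dims int) (float32, float32) {
	theta := math.Pow(10000.0, float64(-i0)/float64(dims))
	cosT := math.Cos(float64(p) * theta)
	sinT := math.Sin(float64(p) * theta)
	return float32(float64(x0)*cosT - float64(x1)*sinT),
		float32(float64(x0)*sinT + float64(x1)*cosT)
}

func main() {
	// Position 3, first pair (i0 = 0) of a 64-dim head: theta = 1, angle = 3 rad.
	y0, y1 := ropePair(1, 0, 3, 0, 64)
	fmt.Println(y0, y1) // approx -0.99, 0.141
}
```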
+ if params.Type == TASK_INIT || params.Type == TASK_FINALIZE { + return + } + + pastCount := uint32(src1.Data[0]) + dims := uint32(src1.Data[1]) + mode := uint32(src1.Data[2]) + + //const int ne0 = src0->ne[0]; + ne1 := src0.NE[1] + ne2 := src0.NE[2] + ne3 := src0.NE[3] + + nb0 := src0.NB[0] + nb1 := src0.NB[1] + nb2 := src0.NB[2] + nb3 := src0.NB[3] + + ////assert(nb0 == sizeof(float)); + + var modeCount uint32 + if mode == 0 { + modeCount = 0 + } else { + modeCount = pastCount + } + + // TODO: optimize + for i3 := uint32(0); i3 < ne3; i3++ { + for i2 := modeCount; i2 < ne2; i2++ { + + ////const int p = (mode == 0 ? n_past + i2 : i2); + var p uint32 + if mode == 0 { + p = pastCount + i2 + } else { + p = i2 + } + + for i1 := uint32(0); i1 < ne1; i1++ { + for i0 := 0; i0 < int(dims); i0 += 2 { + + ////const double theta = pow(10000.0, ((double)-i0)/n_dims); + theta := math.Pow(10000.0, float64(-i0)/float64(dims)) + + cosTheta := math.Cos(float64(p) * theta) + sinTheta := math.Sin(float64(p) * theta) + + ////const float * const src = (float *)((char *) src0->data + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0); + offset := i3*nb3/4 + i2*nb2/4 + i1*nb1/4 + uint32(i0)*nb0/4 + src := src0.Data[offset:] + //// float * dst_data = (float *)((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0); + dstData := dst.Data[offset:] + + x0 := float64(src[0]) + x1 := float64(src[1]) + + dstData[0] = float32(x0*cosTheta - x1*sinTheta) + dstData[1] = float32(x0*sinTheta + x1*cosTheta) + } + } + } + } + +} + +// ggml_compute_forward_scale_f32 +func ComputeForwardScaleFP32(params *ComputeParams, src0, src1, dst *Tensor) { + + ////GGML_ASSERT(ggml_is_contiguous(src0)); + ////GGML_ASSERT(ggml_is_contiguous(dst)); + ////GGML_ASSERT(ggml_are_same_shape(src0, dst)); + ////GGML_ASSERT(ggml_is_scalar(src1)); + + if !src0.IsContiguous() { + fmt.Printf("[HALT] ComputeForwardScaleFP32 : [src0] is NOT contiguous!") + os.Exit(1) + } + + if !dst.IsContiguous() { + fmt.Printf("[HALT] ComputeForwardScaleFP32 : [dst] is NOT contiguous!") + os.Exit(1) + } + + if params.Type == TASK_INIT || params.Type == TASK_FINALIZE { + return + } + + // scale factor + v := src1.Data[0] + + ith := params.ith + nth := params.nth + + nc := src0.NE[0] + nr := src0.Nrows() + + // rows per thread + dr := (nr + nth - 1) / nth + + // row range for this thread + ir0 := dr * ith + ir1 := min(int(ir0)+int(dr), int(nr)) + + for i1 := ir0; int(i1) < ir1; i1++ { + ////ggml_vec_scale_f32(nc, (float *) ((char *) dst->data + i1*(dst->nb[1])), v); + ////VecScaleFP32(nc, (*dst.Data)[i1*dst.NE[0]:], v) + VecScaleFP32(nc, dst.Data[i1*dst.NB[1]/4:], v) + } + +} + +// ggml_compute_forward_diag_mask_inf +func ComputeForwardDiagMaskInfFP32(params *ComputeParams, src0, src1, dst *Tensor) { + + ////assert(params->ith == 0); + ////assert(src1->type == GGML_TYPE_I32); + ////assert(ggml_nelements(src1) == 1); + + if params.Type == TASK_INIT || params.Type == TASK_FINALIZE { + return + } + + pastCount := uint32(src1.Data[0]) + + // TODO: handle transposed/permuted matrices + + n := src0.Nrows() + nc := src0.NE[0] + nr := src0.NE[1] + nz := n / nr + + ////assert( dst->nb[0] == sizeof(float)); + ////assert(src0->nb[0] == sizeof(float)); + + for k := uint32(0); k < nz; k++ { + for j := uint32(0); j < nr; j++ { + for i := pastCount; i < nc; i++ { + if i > pastCount+j { + ////*(float *)((char *) dst->data + k*dst->nb[2] + j*dst->nb[1] + i*dst->nb[0]) = -INFINITY; + dst.Data[k*dst.NB[2]/4+j*dst.NB[1]/4+i*dst.NB[0]/4] = float32(math.Inf(-1)) // TODO Use const + } + } + } + } + + if 
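ComputeForwardDiagMaskInfFP32 above writes -Inf wherever the column index i exceeds past + j (the row index), which is the causal attention mask. With past = 0 on a 4x4 score matrix, only the lower triangle survives:

```go
package main

import (
	"fmt"
	"math"
)

func main() {
	const n, past = 4, 0
	scores := make([]float32, n*n) // row-major, all zeros for illustration

	// Same masking condition as ComputeForwardDiagMaskInfFP32: i > past+j.
	for j := 0; j < n; j++ {
		for i := past; i < n; i++ {
			if i > past+j {
				scores[j*n+i] = float32(math.Inf(-1))
			}
		}
	}
	for j := 0; j < n; j++ {
		fmt.Println(scores[j*n : (j+1)*n])
	}
	// [0 -Inf -Inf -Inf]
	// [0 0 -Inf -Inf]
	// [0 0 0 -Inf]
	// [0 0 0 0]
}
```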
DEBUG { + fmt.Printf("\n\n>>> ComputeForwardDiagMaskInfFP32 OUT <<<\n") + } + +} + +func maxFloat(x, y float32) float32 { + if x >= y { + return x + } + return y +} + +func VecMaxFP32(n uint32, x []float32) float32 { + max := float32(math.Inf(-1)) // TODO use constant + for i := uint32(0); i < n; i++ { + max = maxFloat(max, x[i]) + } + return max +} + +// ggml_compute_forward_soft_max +func ComputeForwardSoftMaxFP32(params *ComputeParams, src0, dst *Tensor) { + + ////GGML_ASSERT(ggml_is_contiguous(src0)); + ////GGML_ASSERT(ggml_is_contiguous(dst)); + ////GGML_ASSERT(ggml_are_same_shape(src0, dst)); + + if !src0.IsContiguous() { + fmt.Printf("[HALT] ComputeForwardSoftMaxFP32 : [src0] is NOT contiguous!") + os.Exit(1) + } + + if !dst.IsContiguous() { + fmt.Printf("[HALT] ComputeForwardSoftMaxFP32 : [dst] is NOT contiguous!") + os.Exit(1) + } + + if params.Type == TASK_INIT || params.Type == TASK_FINALIZE { + return + } + + negInf := float32(math.Inf(-1)) // TODO use constant + + // TODO: handle transposed/permuted matrices + + ith := params.ith + nth := params.nth + + nc := src0.NE[0] + nr := src0.Nrows() + + // rows per thread + dr := (nr + nth - 1) / nth + + // row range for this thread + ir0 := dr * ith + ir1 := min(int(ir0+dr), int(nr)) + + for i1 := ir0; int(i1) < ir1; i1++ { + ////float *p = (float *)((char *) dst->data + i1*dst->nb[1]); + p := dst.Data[i1*dst.NB[1]/4:] + max := VecMaxFP32(nc, p) + sum := float32(0.0) + //var bits uint16 + for i := 0; i < int(nc); i++ { + if p[i] == negInf { // TODO use constant + p[i] = 0.0 + } else { + //const float val = (p[i] == -INFINITY) ? 0.0 : exp(p[i] - max); + + ////ggml_fp16_t s = GGML_FP32_TO_FP16(p[i] - max); + //s := FP32_TO_FP16(p[i] - max) + ////memcpy(&scvt, &s, sizeof(scvt)); + ////const float val = GGML_FP16_TO_FP32(table_exp_f16[scvt]); + + //////////////////////////fp16 := float16.Fromfloat32(p[i] - max) + //////////////////////////bits := fp16.Bits() + //////////////////////////exp := TableExpFP16[bits] // FIXME table_exp_f16 ASAP Initialize first! + //////////////////////////val := exp.Float32() + + val := float32(math.Exp(float64(p[i] - max))) + sum += val + p[i] = val + } + } + + ////assert(sum > 0.0f); + sum = 1.0 / sum + VecScaleFP32(nc, p, sum) + } + + if DEBUG { + fmt.Printf("\n\n>>> ComputeForwardSoftMaxFP32 OUT <<<\n") + } +} + +// inline static void ggml_vec_add_f32 (const int n, float * z, const float * x, const float * y) { for (int i = 0; i < n; ++i) z[i] = x[i] + y[i]; } +func VecAddFP32(n uint32, z, x, y []float32) { + for i := uint32(0); i < n; i++ { + z[i] = x[i] + y[i] + } +} + +// ggml_compute_forward_add +func ComputeForwardAddFP32(params *ComputeParams, src0, src1, dst *Tensor) { + + ////GGML_ASSERT(ggml_are_same_shape(src0, src1) && ggml_are_same_shape(src0, dst)); + + if params.Type == TASK_INIT || params.Type == TASK_FINALIZE { + return + } + + if src1.NB[0] != TYPE_SIZE[TYPE_F32] { + fmt.Printf("[HALT] ComputeForwardAddFP32 : [src1] is NOT contiguous!") + os.Exit(1) + } + + ith := params.ith + nth := params.nth + + n := src0.Nrows() + nc := src0.NE[0] + + //nb00 := src0.NB[0] + nb01 := src0.NB[1] + + nb10 := src1.NB[0] + nb11 := src1.NB[1] + + //nb0 := dst.NB[0] + nb1 := dst.NB[1] + + ////GGML_ASSERT( nb0 == sizeof(float)); + ////GGML_ASSERT(nb00 == sizeof(float)); + + if nb10 == TYPE_SIZE[TYPE_F32] { + j0 := (n / nth) * ith + + // j1 := ith == nth - 1 ? 
n : (n/nth)*(ith + 1) + var j1 uint32 + if ith == nth-1 { + j1 = n + } else { + j1 = (n / nth) * (ith + 1) + } + + for j := j0; j < j1; j++ { + + ////ggml_vec_add_f32(nc, + //// (float *) ((char *) dst->data + j*nb1), + //// (float *) ((char *) src0->data + j*nb01), + //// (float *) ((char *) src1->data + j*nb11)); + + VecAddFP32(nc, dst.Data[j*nb1/4:], src0.Data[j*nb01/4:], src1.Data[j*nb11/4:]) + } + + } else { // src1 is not contiguous + for j := ith; j < n; j += nth { + ////float * dst_ptr = (float *) ((char *) dst->data + j*nb1); + dstPtr := dst.Data[j*nb1/4:] + ////float * src0_ptr = (float *) ((char *) src0->data + j*nb01); + src0Ptr := src0.Data[j*nb01/4:] + for i := uint32(0); i < nc; i++ { + ////float * src1_ptr = (float *) ((char *) src1->data + j*nb11 + i*nb10); + src1Ptr := src1.Data[j*nb11/4+i*nb10/4] + dstPtr[i] = src0Ptr[i] + src1Ptr + } + } + } + + if DEBUG { + fmt.Printf("\n\n>>> OUT <<< ComputeForwardAddFP32 <<<") + } +} + +// Sigmoid Linear Unit (SiLU) function +func SiluFP32(x float32) float32 { + return x / float32(1.0+math.Exp(float64(-x))) +} + +// inline static void ggml_vec_silu_f32(const int n, float * y, const float * x) { +func VecSiluFP32(n uint32, y, x []float32) { + for i := uint32(0); i < n; i++ { + y[i] = SiluFP32(x[i]) // ggml_silu_f32 + } +} + +// ggml_compute_forward_silu +func ComputeForwardSiluFP32(params *ComputeParams, src0, dst *Tensor) { + + ////GGML_ASSERT(ggml_is_contiguous(src0)); + ////GGML_ASSERT(ggml_is_contiguous(dst)); + ////GGML_ASSERT(ggml_are_same_shape(src0, dst)); + + if !src0.IsContiguous() { + fmt.Printf("[HALT] ComputeForwardSiluFP32 : [src0] is NOT contiguous!") + os.Exit(1) + } + + if !dst.IsContiguous() { + fmt.Printf("[HALT] ComputeForwardSiluFP32 : [dst] is NOT contiguous!") + os.Exit(1) + } + + if params.Type == TASK_INIT || params.Type == TASK_FINALIZE { + return + } + + ith := params.ith + nth := params.nth + + nc := src0.NE[0] + nr := src0.Nrows() + + // rows per thread + dr := (nr + nth - 1) / nth + + // row range for this thread + ir0 := dr * ith + ir1 := uint32(min(int(ir0+dr), int(nr))) + + for i1 := ir0; i1 < ir1; i1++ { + ////ggml_vec_silu_f32(nc, + //// (float *) ((char *) dst->data + i1*( dst->nb[1])), + //// (float *) ((char *) src0->data + i1*(src0->nb[1]))); + + VecSiluFP32(nc, dst.Data[i1*dst.NB[1]/4:], src0.Data[i1*src0.NB[1]/4:]) + } + + if DEBUG { + printTensor(src0, "SRC SILI") + printTensor(dst, "DST SILI") + } +} + +// --- + +type TokenScore struct { + Token string + Score float32 +} + +type Vocab struct { + Token2ID map[string]uint32 + ID2Token []TokenScore +} + +func NewVocab(size uint32) *Vocab { + return &Vocab{ + Token2ID: make(map[string]uint32, size), + ID2Token: make([]TokenScore, size, size), + } +} + +func min(a, b int) int { + if a <= b { + return a + } + return b +} + +func min32(a, b uint32) uint32 { + if a <= b { + return a + } + return b +} + +// ---- SentencePiece Tokenizer + +// struct llama_sp_symbol { +type Symbol struct { + ////using index = int; + + // NB! Allow -1 + Prev int + Next int + + Text string + N uint32 +} + +// struct llama_sp_bigram { +type Bigram struct { + + // NB! 
Allow -1 + Left int + Right int + + Score float32 + Size uint32 +} + +func utf8Len(src byte) uint32 { + lookup := []uint32{1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 3, 4} + highbits := uint8(src) >> 4 + return lookup[highbits] +} + +func Token2Str(vocab *Vocab, token uint32) string { + if int(token) >= len(vocab.ID2Token) { + return "" + } + + return vocab.ID2Token[token].Token +} + +func PopMax(queue *[]Bigram) Bigram { + + max := 0 // index of max score element in queue + for cur := 1; cur < len(*queue); cur++ { + if ((*queue)[max].Score < (*queue)[cur].Score) || + ((*queue)[max].Score == (*queue)[cur].Score && + (*queue)[max].Left > (*queue)[cur].Left) { + max = cur + } + } + + pop := (*queue)[max] + + // replace max element with last and shrink slice (if max == last, then just remove it) + (*queue)[max] = (*queue)[len(*queue)-1] + *queue = (*queue)[:len(*queue)-1] + + return pop +} + +func TryAddBigram(vocab *Vocab, symbols []Symbol, workQueue *[]Bigram, left, right int) { + + if left == -1 || right == -1 { + return + } + + token := symbols[left].Text[:symbols[left].N+symbols[right].N] + id, ok := vocab.Token2ID[token] + + if !ok || int(id) >= len(vocab.ID2Token) { + return + } + + tokenScore := vocab.ID2Token[id] + + bigram := Bigram{Left: left, Right: right, Score: tokenScore.Score, Size: uint32(len(token))} + *workQueue = append(*workQueue, bigram) +} + +// void tokenize(const std::string & text, std::vector & output) { +func Tokenize(vocab *Vocab, text string, bos bool) []uint32 { + + output := make([]uint32, 0) + symbols := make([]Symbol, 0) // std::vector symbols_; + workQueue := make([]Bigram, 0) // llama_sp_bigram::queue work_queue_; // std::priority_queue; + + if bos { + output = append(output, 1) // TODO: replace with vocab.bos + } + + // --- split string into utf8 chars + + index := 0 + offs := 0 + for offs < len(text) { + var sym Symbol + charLen := min(len(text)-offs, int(utf8Len(text[offs]))) + sym.Text = text[offs:] + sym.N = uint32(charLen) + offs += charLen + sym.Prev = index - 1 + if offs == len(text) { + sym.Next = -1 + } else { + sym.Next = index + 1 + } + index++ + symbols = append(symbols, sym) + } + + // seed the work queue with all possible 2-character tokens + for i := 1; i < len(symbols); i++ { + TryAddBigram(vocab, symbols, &workQueue, i-1, i) + } + + // keep substituting the highest frequency pairs for as long as we can + for len(workQueue) > 0 { + bigram := PopMax(&workQueue) + + leftSym := &symbols[bigram.Left] + rightSym := &symbols[bigram.Right] + + // if one of the symbols already got merged, skip it + if leftSym.N == 0 || rightSym.N == 0 || leftSym.N+rightSym.N != bigram.Size { + continue + } + + // merge the right sym into the left one + leftSym.N += rightSym.N + rightSym.N = 0 + + // remove the right sym from the chain + leftSym.Next = rightSym.Next + if rightSym.Next >= 0 { + symbols[rightSym.Next].Prev = bigram.Left + } + + // find more substitutions + TryAddBigram(vocab, symbols, &workQueue, leftSym.Prev, bigram.Left) + TryAddBigram(vocab, symbols, &workQueue, bigram.Left, leftSym.Next) + } + + for i := 0; i != -1; i = symbols[i].Next { + symbol := symbols[i] + id, ok := vocab.Token2ID[symbol.Text[:symbol.N]] + + if !ok { + // output any symbols that did not form tokens as bytes. 
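+			// Byte fallback: an unmatched byte b is emitted as token id b+3,
+			// since ids 0..2 are reserved for special tokens and the raw byte
+			// tokens follow them (mirrors llama.cpp's tokenizer).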
+				for j := uint32(0); j < symbol.N; j++ {
+					////llama_vocab::id token_id = static_cast(symbol.text[j]) + 3;
+					tokenID := uint32(symbol.Text[j]) + 3
+					output = append(output, tokenID)
+				}
+			} else {
+				output = append(output, id)
+			}
+	}
+
+	if DEBUG {
+		fmt.Printf("\n\n=== TOKENIZER ===\n\n%+v", output)
+		for i := 0; i < len(output); i++ {
+			fmt.Printf("%d:'%s' ", output[i], Token2Str(vocab, output[i]))
+		}
+	}
+
+	return output
+
+}
+
+// TODO Do we need this?
+func Init(params InitParams) {
+
+	// ---- initialize GELU, SILU and EXP F32 tables
+
+	////const uint64_t t_start = ggml_time_us(); UNUSED(t_start);
+
+	/////////////////////////////////////////var ii uint16
+	/////////////////////////////////////////for i := uint32(0); i < (1 << 16); i++ {
+	/////////////////////////////////////////ui := uint16(i)
+
+	////memcpy(&ii, &ui, sizeof(ii));
+	////const float f = table_f32_f16[i] = COMPUTE_FP16_TO_FP32(ii);
+	/////////////////////////////////////////fp32 := float32()
+
+	////table_gelu_f16[i] = FP32_TO_FP16(ggml_gelu_f32(f));
+	////table_silu_f16[i] = FP32_TO_FP16(ggml_silu_f32(f));
+
+	////TableExpFP16[i] = FP32_TO_FP16(exp(f));
+	/////////////////////////////////////////exp := float32(math.Exp(fp32))
+	/////////////////////////////////////////TableExpFP16[i] = float16.Fromfloat32(exp)
+
+	/////////////////////////////////////////}
+
+	////const uint64_t t_end = ggml_time_us(); UNUSED(t_end);
+
+}
+
+
+func PrintTensor(tensor *Tensor, name string) {
+	var dt string
+	if tensor.Type == TYPE_F16 {
+		dt = "FP16"
+	}
+	if tensor.Type == TYPE_F32 {
+		dt = "FP32"
+	}
+	if tensor.Type == TYPE_Q4_0 {
+		dt = "INT4"
+	}
+
+	fmt.Printf("\n\n=== [ %s | %s | %d:%d:%d ] ===\n",
+		name, dt, tensor.NE[0], tensor.NE[1], tensor.NE[2])
+
+	for nn := 0; nn < min(12, int(tensor.NE[1])); nn++ {
+		fmt.Printf("\n %d x %d ...\t", nn, tensor.NE[0])
+		for ii := 0; ii < min(12, int(tensor.NE[0])); ii++ {
+			fmt.Printf("%.3f\t", tensor.Data[nn*int(tensor.NE[0])+ii])
+		}
+	}
+	fmt.Println("")
+}
diff --git a/ml/utils.go b/ml/utils.go
new file mode 100644
index 0000000..70c7d92
--- /dev/null
+++ b/ml/utils.go
@@ -0,0 +1,411 @@
+package ml
+
+import (
+	"errors"
+	"fmt"
+	"math"
+	"mlgo/common"
+	"os"
+)
+
+// TensorOnGraph is a serializable snapshot of a graph tensor (for storage and transfer)
+type TensorOnGraph struct {
+	Type DType
+
+	NodeID uint32 // math.MaxUint32 when the tensor is not a node of the graph (see tensor2NodeID)
+
+	Dims uint32
+	NE   [MAX_DIMS]uint32 // number of elements
+	NB   [MAX_DIMS]uint32 // stride in bytes
+
+	Op optype
+
+	// isParam bool // no need here?
+
+	// GradTensorID uint32 // no need for forward compute?
+	Src0NodeID uint32
+	Src1NodeID uint32
+
+	// grad *Tensor
+	// src0 *Tensor
+	// src1 *Tensor
+	// opt [MAX_OPT]*Tensor // FIXME Do we need this?
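+	// Everything below (together with the IDs above) is what Encoding()
+	// serializes, so a single compute node can be shipped to the MIPS VM and
+	// re-linked into a regular Tensor via ToTensor().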
+ + TasksCount int + + // performance + //perfRuns uint32 + //perfCycles uint32 + //perfTime uint64 + + Data []float32 + //padding [8]byte +} + +func (tensor * Tensor) ToTensorOnGraph(graph *Graph) *TensorOnGraph { + if tensor == nil || graph == nil || graph.Tensor2NodeID == nil { + return nil + } + t := &TensorOnGraph{ + Type: tensor.Type, + Dims: tensor.Dims, + NE: tensor.NE, + NB: tensor.NB, + Op: tensor.op, + TasksCount: tensor.TasksCount, + Data: tensor.Data, + } + t.NodeID = tensor2NodeID(tensor, graph) + t.Src0NodeID = tensor2NodeID(tensor.Src0, graph) + t.Src1NodeID = tensor2NodeID(tensor.Src1, graph) + return t +} + +func (tensor *TensorOnGraph) ToTensor(tensorMap map[uint32]*Tensor) *Tensor { + t := &Tensor{ + Type: tensor.Type, + Dims: tensor.Dims, + NE: tensor.NE, + NB: tensor.NB, + op: tensor.Op, + TasksCount: tensor.TasksCount, + Data: tensor.Data, + } + if tensorMap != nil { + t.Src0 = tensorMap[tensor.Src0NodeID] + t.Src1 = tensorMap[tensor.Src1NodeID] + } + return t +} + +func tensor2NodeID(tensor *Tensor, graph *Graph) uint32 { + if id, ok := graph.Tensor2NodeID[tensor]; ok { + return id + } else { + return math.MaxUint32 + } +} + +func (tensor *TensorOnGraph) Encoding(toBigEndian bool) []byte { + data := make([]byte, 0) + data = append(data, common.IntToBytes(int(tensor.Type), toBigEndian)...) // Type + data = append(data, common.IntToBytes(int(tensor.NodeID), toBigEndian)...) // NodeID + data = append(data, common.IntToBytes(int(tensor.Dims), toBigEndian)...) // Dims + data = append(data, common.IntToBytes(int(tensor.Op), toBigEndian)...) // Op + data = append(data, common.IntToBytes(int(tensor.Src0NodeID), toBigEndian)...) // Src0NodeID + data = append(data, common.IntToBytes(int(tensor.Src1NodeID), toBigEndian)...) // Src1NodeID + data = append(data, common.IntToBytes(int(tensor.TasksCount), toBigEndian)...) // TasksCount + + // encoding list + // NE + data = append(data, common.IntToBytes(MAX_DIMS, toBigEndian)...) + for i := 0; i < MAX_DIMS; i++ { + data = append(data, common.IntToBytes(int(tensor.NE[i]), toBigEndian)...) + } + // NB + data = append(data, common.IntToBytes(MAX_DIMS, toBigEndian)...) + for i := 0; i < MAX_DIMS; i++ { + data = append(data, common.IntToBytes(int(tensor.NB[i]), toBigEndian)...) + } + // Data + data = append(data, common.IntToBytes(len(tensor.Data), toBigEndian)...) + for i := 0; i < len(tensor.Data); i++ { + data = append(data, common.Float32ToBytes(tensor.Data[i], toBigEndian)...) + } + // append the data size ahead + // data = append(common.IntToBytes(len(data), toBigEndian), data...) 
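+	// resulting layout (all scalars 32-bit):
+	//   [Type][NodeID][Dims][Op][Src0NodeID][Src1NodeID][TasksCount]
+	//   [len(NE)][NE...][len(NB)][NB...][len(Data)][Data...]
+	// the per-tensor byte length is prepended by SaveComputeNodeEnvToBytes.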
+ return data +} + +func DecodeTensorOnGraph(data []byte, fromBigEndian bool, currentBigEndian bool) *TensorOnGraph { + if (len(data) == 0) { + return nil + } + t := 0 + tensorType := common.BytesToInt32(data[t:t+4], fromBigEndian) + t += 4 + + nodeId := common.BytesToInt32(data[t:t+4], fromBigEndian) + t += 4 + + dims := common.BytesToInt32(data[t:t+4], fromBigEndian) + t += 4 + + op := common.BytesToInt32(data[t:t+4], fromBigEndian) + t += 4 + + src0NodeID := common.BytesToInt32(data[t:t+4], fromBigEndian) + t += 4 + + src1NodeID := common.BytesToInt32(data[t:t+4], fromBigEndian) + t += 4 + + tasksCount := common.BytesToInt32(data[t:t+4], fromBigEndian) + t += 4 + + //NE + neSize := common.BytesToInt32(data[t:t+4], fromBigEndian) + t += 4 + ne := [4]uint32{0, 0, 0, 0} + for i := 0; i < int(neSize); i++ { + ne[i] = uint32(common.BytesToInt32(data[t:t+4], fromBigEndian)) + t += 4 + } + + // NB + nbSize := common.BytesToInt32(data[t:t+4], fromBigEndian) + t += 4 + nb := [4]uint32{0, 0, 0, 0} + for i := 0; i < int(nbSize); i++ { + nb[i] = uint32(common.BytesToInt32(data[t:t+4], fromBigEndian)) + t += 4 + } + + // Data + dataSize := common.BytesToInt32(data[t:t+4], fromBigEndian) + t += 4 + tensorData := make([]float32, 0) + if currentBigEndian && fromBigEndian { + // this code should be only used in MIPS! + tensorData = common.DecodeFloat32List(data[t:t+4*int(dataSize)]) + t += 4*int(dataSize) + } else { + tensorData = make([]float32, dataSize) + for i := 0; i < int(dataSize); i++ { + tensorData[i] = common.BytesToFloat32(data[t:t+4], fromBigEndian) + t += 4 + } + } + + + tensor := &TensorOnGraph{ + Type: DType(tensorType), + NodeID: uint32(nodeId), + Dims: uint32(dims), + Op: optype(op), + Src0NodeID: uint32(src0NodeID), + Src1NodeID: uint32(src1NodeID), + TasksCount: int(tasksCount), + NE: ne, + NB: nb, + Data: tensorData, + } + + return tensor +} + +func ComputeNodeForward(node *Tensor) { + if node == nil { + return + } + node.TasksCount = 1 + params := ComputeParams{ + Type: TASK_COMPUTE, + ith: 0, + nth: uint32(node.TasksCount), + } + ComputeForward(nil, ¶ms, node) +} + +// ======================================================================= + +// compute [0, nodeID) +func GraphComputeByNodes(ctx *Context, graph *Graph, nodeID int) { + + maxThreads := graph.ThreadsCount + + // --- init N job goroutines and channel to send tasks for them + + graph.Jobs = make(chan *ComputeParams, maxThreads) // TODO Right place to init? 
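+	// worker goroutines started below pull ComputeParams from this channel;
+	// it is closed via the deferred close when this function returns.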
+ defer close(graph.Jobs) + + // TODO Investigate https://pkg.go.dev/runtime#LockOSThread + for i := 0; i < maxThreads; i++ { + go Job(graph.Jobs) + } + + // --- initialize tasks + + { + // thread scheduling for the different operations + // TasksCount might be 0, 1, or ThreadsCount + for i := uint32(0); i < graph.NodesCount; i++ { + + ////struct ggml_tensor * node = cgraph->nodes[i]; + node := graph.Nodes[i] + + if DEBUG { + fmt.Printf("\n\n### STEP #%d ### %d - %d [ %d:%d:%d:%d ]", i, node.op, node.Type, node.NE[0], node.NE[1], node.NE[2], node.NE[3]) + } + + switch node.op { + + case OP_DUP: + node.TasksCount = 1 + case OP_ADD: + node.TasksCount = 1 // TODO threads + case OP_SUB: + case OP_MUL: + case OP_DIV: + case OP_SQR: + case OP_SQRT: + case OP_SUM: + case OP_MEAN: + case OP_REPEAT: + case OP_ABS: + case OP_SGN: + case OP_NEG: + case OP_STEP: + case OP_RELU: + node.TasksCount = 1 + case OP_GELU: + node.TasksCount = 1 // TODO threads + case OP_SILU: + node.TasksCount = 1 // TODO threads + case OP_NORM: + case OP_RMS_NORM: + node.TasksCount = 1 // TODO threads + case OP_MUL_MAT: + node.TasksCount = maxThreads + // TODO: use different scheduling for different matrix sizes + case OP_SCALE: + node.TasksCount = 1 // TODO threads + case OP_CPY: + case OP_RESHAPE: + case OP_VIEW: + case OP_PERMUTE: + case OP_TRANSPOSE: + case OP_GET_ROWS: + case OP_DIAG_MASK_INF: + node.TasksCount = 1 + case OP_SOFT_MAX: + node.TasksCount = 1 // TODO threads + case OP_ROPE: + ////node.TasksCount = 1 + case OP_CONV_1D_1S: + case OP_CONV_1D_2S: + node.TasksCount = 1 // TODO threads + ////ASSERT(node->src0->ne[3] == 1); + ////ASSERT(node->src1->ne[2] == 1); + ////ASSERT(node->src1->ne[3] == 1); + case OP_FLASH_ATTN: + node.TasksCount = 1 // TODO threads + case OP_FLASH_FF: + node.TasksCount = 1 // TODO threads + case OP_NONE: + node.TasksCount = 1 + case OP_COUNT: + fmt.Printf("\n[HALT] Something wrong with compute graph!") + os.Exit(1) + } + } + } + + nodeID = min(nodeID, int(graph.NodesCount)) + + for i := uint32(0); i < uint32(nodeID); i++ { + + node := graph.Nodes[i] + + if DEBUG { + fmt.Printf("\n\n### STEP #%d ### %d - %d [ %d:%d:%d:%d ]", i, node.op, node.Type, node.NE[0], node.NE[1], node.NE[2], node.NE[3]) + } + + params := ComputeParams{ + Type: TASK_INIT, + ith: 0, + nth: uint32(node.TasksCount), + } + + ComputeForward(graph, ¶ms, node) // TASK_INIT + + // --- COMPUTE + + // BREAKPOINT DEBUG + //if i > 1300 { + // fmt.Printf("\n\n=== HALT #%d ===", i) + // os.Exit(0) + //} + + params.Type = TASK_COMPUTE + ComputeForward(graph, ¶ms, node) + + // --- FINALIZE + + params.Type = TASK_FINALIZE + ComputeForward(graph, ¶ms, node) + } + +} + +func SaveComputeNodeEnv(node *Tensor, graph *Graph) []*TensorOnGraph{ + tensorOnGraphList := make([]*TensorOnGraph, 0) + tensorOnGraphList = append(tensorOnGraphList, node.ToTensorOnGraph(graph)) + if node.Src0 != nil { + tensorOnGraphList = append(tensorOnGraphList, node.Src0.ToTensorOnGraph(graph)) + } + if node.Src1 != nil { + tensorOnGraphList = append(tensorOnGraphList, node.Src1.ToTensorOnGraph(graph)) + } + return tensorOnGraphList +} + +// total_bytes_len +// nodeID +// tensorGraph num +// [len, tensor] +func SaveComputeNodeEnvToBytes(nodeID uint32, node *Tensor, graph *Graph, toBigEndian bool) []byte { + tensorGraphList := SaveComputeNodeEnv(node, graph) + if len(tensorGraphList) == 0 { + return nil + } + data := make([]byte, 0) + // nodeID + data = append(data, common.IntToBytes(int(nodeID), toBigEndian)...) 
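+	// tensorGraphList holds the target node first, then its Src0/Src1 operands
+	// (when present), so the MIPS side can rebuild and recompute just this node.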
+	// tensorGraph num
+	data = append(data, common.IntToBytes(len(tensorGraphList), toBigEndian)...)
+	// tensor
+	for i := 0; i < len(tensorGraphList); i++ {
+		tensor := tensorGraphList[i]
+		bytes := tensor.Encoding(toBigEndian)
+		// append size ahead of content
+		bytes = append(common.IntToBytes(len(bytes), toBigEndian), bytes...)
+		// append into data
+		data = append(data, bytes...)
+	}
+	// total bytes len
+	data = append(common.IntToBytes(len(data), toBigEndian), data...)
+	return data
+}
+
+func DecodeComputeNodeEnv(data []byte, fromBigEndian bool, currentBigEndian bool) (uint32, []*TensorOnGraph, error) {
+	t := 0
+	totalSize := common.BytesToInt32(data[:4], fromBigEndian)
+	t += 4
+	if int(totalSize) > len(data) - 4 {
+		return 0, nil, errors.New("not enough data")
+	}
+
+	// nodeID
+	nodeID := common.BytesToInt32(data[t:t+4], fromBigEndian)
+	t += 4
+
+	// tensorNum
+	tensorNum := common.BytesToInt32(data[t:t+4], fromBigEndian)
+	t += 4
+
+	tensorOnGraphList := make([]*TensorOnGraph, tensorNum)
+
+	for i := 0; i < int(tensorNum); i++ {
+		// size
+		size := common.BytesToInt32(data[t:t+4], fromBigEndian)
+		t += 4
+		// tensorOnGraph
+		tensor := DecodeTensorOnGraph(data[t:t+int(size)], fromBigEndian, currentBigEndian)
+		t += int(size)
+
+		tensorOnGraphList[i] = tensor
+	}
+
+	return uint32(nodeID), tensorOnGraphList, nil
+}
\ No newline at end of file
diff --git a/ml_mips/build.sh b/ml_mips/build.sh
new file mode 100755
index 0000000..b6b4302
--- /dev/null
+++ b/ml_mips/build.sh
@@ -0,0 +1,15 @@
+#!/usr/bin/env bash
+set -e
+
+export GOOS=linux
+export GOARCH=mips
+export GOMIPS=softfloat
+go build -o ./ml_mips
+
+file ml_mips
+
+if [[ ! -d venv ]]; then
+  python3 -m venv venv
+fi
+
+../compile.py ml_mips
diff --git a/ml_mips/main.go b/ml_mips/main.go
new file mode 100644
index 0000000..169e026
--- /dev/null
+++ b/ml_mips/main.go
@@ -0,0 +1,53 @@
+package main
+
+import (
+	"fmt"
+	"mlgo/common"
+	"mlgo/ml"
+)
+
+const (
+	READ_FROM_BIDENDIAN = true
+	OUTPUT_TO_BIDENDIAN = true
+)
+
+// reads the serialized compute-node environment from INPUT_ADDR, laid out as [size][envData]
+// output: nodeID, tensorOnGraph, error
+func ReadTensorGraph() (uint32, []*ml.TensorOnGraph, error) {
+	fmt.Println("Start Read Tensor Graph")
+	dataBytes := common.ReadBytes(common.INPUT_ADDR, READ_FROM_BIDENDIAN)
+	nodeID, tensorGraphList, err := ml.DecodeComputeNodeEnv(dataBytes, READ_FROM_BIDENDIAN, true)
+	return nodeID, tensorGraphList, err
+}
+
+func ComputeTensorGraph(nodeID uint32, tensorGraphList []*ml.TensorOnGraph) {
+	fmt.Println("Start Compute Tensor Graph")
+	tensorList := make([]*ml.Tensor, 0)
+	tensorMap := make(map[uint32]*ml.Tensor)
+	for i := 0; i < len(tensorGraphList); i++ {
+		tensor := tensorGraphList[i].ToTensor(nil)
+		tensorMap[tensorGraphList[i].NodeID] = tensor
+		tensorList = append(tensorList, tensor)
+	}
+	// fill in the nodeid
+	for i := 0; i < len(tensorList); i++ {
+		tensor := tensorList[i]
+		tensorG := tensorGraphList[i]
+		if src0, ok := tensorMap[tensorG.Src0NodeID]; ok {
+			tensor.Src0 = src0
+		}
+		if src1, ok := tensorMap[tensorG.Src1NodeID]; ok {
+			tensor.Src1 = src1
+		}
+	}
+	ml.ComputeNodeForward(tensorMap[uint32(nodeID)])
+	ml.PrintTensor(tensorMap[uint32(nodeID)], "final_after")
+}
+
+func main() {
+	nodeID, tensorGraphList, err := ReadTensorGraph()
+	if err != nil {
+		return
+	}
+	ComputeTensorGraph(nodeID, tensorGraphList)
+}
\ No newline at end of file
diff --git a/requirements.txt b/requirements.txt
new file mode 100644
index 0000000..9e1dd8b
--- /dev/null
+++ b/requirements.txt
@@ -0,0 +1,5 @@
+pyelftools==0.27
+hexdump==3.3
+termcolor==1.1.0 +capstone==4.0.2 +rangetree==1.0 \ No newline at end of file diff --git a/startup/startup.bin b/startup/startup.bin new file mode 100644 index 0000000..6a80327 Binary files /dev/null and b/startup/startup.bin differ diff --git a/startup/startup.s b/startup/startup.s new file mode 100644 index 0000000..4289472 --- /dev/null +++ b/startup/startup.s @@ -0,0 +1,24 @@ + .section .test, "x" + .balign 4 + .set noreorder + .global test + .ent test +test: + +lui $sp, 0x7fff +ori $sp, 0xd000 + +# http://articles.manugarg.com/aboutelfauxiliaryvectors.html +# _AT_PAGESZ = 6 +ori $t0, $0, 6 +sw $t0, 0xC($sp) +ori $t0, $0, 0x1000 +sw $t0, 0x10($sp) + +lw $ra, dat($0) +jr $ra +nop + +dat: + +.end test diff --git a/startup/startup.sh b/startup/startup.sh new file mode 100644 index 0000000..8ea4595 --- /dev/null +++ b/startup/startup.sh @@ -0,0 +1,4 @@ +#!/usr/bin/env bash +set -e + +../../mipsevm/maketests.py startup.s startup.bin