-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathbenchmark.cpp
103 lines (65 loc) · 2.68 KB
/
benchmark.cpp
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
#include <algorithm>
#include <chrono>
#include <cstddef>
#include <fstream>
#include <iostream>
#include <string>
#include <vector>

#include "rwkv.h"
// #include "sampler/sample.hpp"
#include "tokenizer/tokenizer.hpp"
/// Simple greedy-decoding benchmark for the RWKV model.
///
/// Loads the tokenizer vocabulary and model weights from hard-coded paths,
/// runs the prompt through the model, then greedily generates 100 tokens and
/// reports wall-clock throughput. Passing any extra command-line argument
/// moves the model to CUDA before inference.
int main( int argc, char** argv ){
    std::cout << "Hello World" << std::endl;

    // NOTE(review): model/vocab paths are hard-coded; consider taking them
    // from argv so the benchmark is portable across machines.
    std::string path = "../../../../CUDAMAX/1b5model.safetensors";
    RWKVTokenizer worldTokenizer("rwkv_vocab_v20230424.txt");

    auto tokens = worldTokenizer.encode("\n\nUser: please create a long harry potter fanfiction. \n\nAssistant:");
    // Round-trip the prompt through the tokenizer as a sanity check.
    std::cout << worldTokenizer.decode(tokens) << std::endl;

    std::cout << "Loading model" << std::endl;
    RWKV model(path);

    // Any extra CLI argument enables the CUDA backend.
    if (argc > 1)
    {
        model.cuda();
    }

    std::cout << "Model loaded" << std::endl;
    std::cout << "Layers:" << model.layers << std::endl;
    std::cout << "Embed:" << model.emb1.weight.shape[0] << "," << model.emb1.weight.shape[1] << std::endl;
    std::cout << "Head:" << model.blocks[0].att.ln_x.heads << std::endl;

    // Prime the model with the full prompt; subsequent calls feed one token
    // at a time and rely on the model's internal recurrent state.
    auto logits = model({tokens});
    std::cout << logits << std::endl;

    const auto start = std::chrono::high_resolution_clock::now();
    const size_t tokenstogen = 100;
    std::vector<size_t> generated;
    generated.reserve(tokenstogen);

    for (size_t i = 0; i < tokenstogen; i++)
    {
        // Logits for the last position of the (single) batch entry, pulled
        // back to host memory as float32.
        float* logs = (float*)(logits[0][logits.shape[1] - 1]).cpu().float32().data;
        const std::ptrdiff_t vocab = (std::ptrdiff_t)logits.shape[2];

        // Greedy (argmax) sampling — no temperature/top-k.
        const size_t sample = (size_t)std::distance(logs, std::max_element(logs, logs + vocab));

        generated.push_back(sample);
        // Print the token, then flush so output appears incrementally.
        std::cout << worldTokenizer.decode({sample});
        std::cout.flush();

        logits = model({{sample}});
    }

    const auto end = std::chrono::high_resolution_clock::now();
    const auto elapsedMs = std::chrono::duration_cast<std::chrono::milliseconds>(end - start).count();

    std::cout << std::endl;
    std::cout << "Generated " << tokenstogen << " tokens in " << elapsedMs << "ms" << std::endl;
    // Guard against a sub-millisecond measurement to avoid dividing by zero.
    if (elapsedMs > 0)
    {
        std::cout << "tokens per second: " << (tokenstogen / (elapsedMs / 1000.0)) << std::endl;
    }
    return 0;
}