[MNN::Bugfix] Some bugfix sync.
wangzhaode committed Dec 11, 2023
1 parent 1ea55f4 commit 635738b
Showing 37 changed files with 1,266 additions and 431 deletions.
64 changes: 64 additions & 0 deletions llm/cli_demo.cpp
@@ -0,0 +1,64 @@
//
// cli_demo.cpp
//
// Created by MNN on 2023/03/24.
// ZhaodeWang
//

#include "llm.hpp"
#include <fstream>
#include <stdlib.h>

void benchmark(Llm* llm, std::string prompt_file) {
std::cout << "prompt file is " << prompt_file << std::endl;
std::ifstream prompt_fs(prompt_file);
std::vector<std::string> prompts;
std::string prompt;
while (std::getline(prompt_fs, prompt)) {
// prompts starting with '#' are ignored
if (prompt.substr(0, 1) == "#") {
continue;
}
prompts.push_back(prompt);
}
int prompt_len = 0;
int decode_len = 0;
int64_t prefill_time = 0;
int64_t decode_time = 0;
// llm->warmup();
for (int i = 0; i < prompts.size(); i++) {
llm->response(prompts[i]);
prompt_len += llm->prompt_len_;
decode_len += llm->gen_seq_len_;
prefill_time += llm->prefill_us_;
decode_time += llm->decode_us_;
llm->reset();
}
float prefill_s = prefill_time / 1e6;
float decode_s = decode_time / 1e6;
printf("\n#################################\n");
printf("prompt tokens num = %d\n", prompt_len);
printf("decode tokens num = %d\n", decode_len);
printf("prefill time = %.2f s\n", prefill_s);
printf(" decode time = %.2f s\n", decode_s);
printf("prefill speed = %.2f tok/s\n", prompt_len / prefill_s);
printf(" decode speed = %.2f tok/s\n", decode_len / decode_s);
printf("##################################\n");
}

int main(int argc, const char* argv[]) {
if (argc < 2) {
std::cout << "Usage: " << argv[0] << " model_dir <prompt.txt>" << std::endl;
return 0;
}
std::string model_dir = argv[1];
std::cout << "model path is " << model_dir << std::endl;
std::unique_ptr<Llm> llm(Llm::createLLM(model_dir));
llm->load(model_dir);
if (argc < 3) {
// interactive mode: return here so argv[2] is never read when no prompt file is given
llm->chat();
return 0;
}
std::string prompt_file = argv[2];
benchmark(llm.get(), prompt_file);
return 0;
}
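
For context, the benchmark() helper above reads one prompt per line and skips lines that begin with '#'. A minimal prompt file might look like this (file name and contents are illustrative, not part of the commit):

# prompts for the throughput benchmark
你好
Write a short poem about autumn.

A typical invocation would then be something along the lines of ./cli_demo <model_dir> prompt.txt; when no prompt file is given, the demo falls back to interactive chat().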
39 changes: 30 additions & 9 deletions llm/include/llm.hpp
@@ -23,22 +23,37 @@

using namespace MNN;
using namespace Express;
class Tokenizer;

class MNN_PUBLIC Llm {
public:
Llm() {
// default tokenizer is sentencepiece
tokenizer_.reset(new Sentencepiece);
}
static Llm* createLLM(const std::string& path);
VARP gen_embedding(const std::vector<int>& input_ids);
virtual ~Llm() = default;
static Llm* createLLM(const std::string& path, std::string model_type = "auto");
VARP disk_embedding(const std::vector<int>& input_ids);
void load(const std::string& model_dir);
int forward(const std::vector<int>& input_ids);
std::vector<int> tokenizer_encode(const std::string& input_str);
std::string decode(int id);
std::string response(const std::string& input_str, std::ostream* os = &std::cout);
void chat();
void warmup();
std::string response(const std::string& input_str, std::ostream* os = &std::cout, const char* end_with = nullptr);
float load_progress() { return load_progress_; }
void reset();
void print_speed();
public:
std::vector<int> history_;
// forward info
int max_seq_len_ = 1024;
int prompt_len_ = 0;
int gen_seq_len_ = 0;
int all_seq_len_ = 0;
// time
int64_t prefill_us_ = 0;
int64_t decode_us_ = 0;
private:
virtual std::vector<int> tokenizer(const std::string& query) = 0;
virtual VARP gen_attention_mask(int seq_len) = 0;
@@ -52,9 +67,6 @@ class MNN_PUBLIC Llm {
std::vector<int> key_value_shape_ = {};
std::string model_name_ = "";
// gen info
int gen_seq_len_ = 0;
int all_seq_len_ = 0;
int max_seq_len_ = 256;
float load_progress_ = 0.f;
// tokenizer
std::unique_ptr<Tokenizer> tokenizer_;
@@ -65,9 +77,6 @@ class MNN_PUBLIC Llm {
std::vector<VARP> past_key_values_;
// model dir
std::string model_dir_;
// tokenizer
std::vector<std::string> word_decoder_;
std::unordered_map<std::string, int> word_encoder_;
};

// some llm models
@@ -107,6 +116,7 @@ class Qwen_7b : public Llm {
model_name_ = "Qwen_7b";
layer_nums_ = 32;
key_value_shape_ = {2, 1, 0, 32, 128};
hidden_size_ = 4096;
tokenizer_.reset(new Tiktoken);
}
private:
@@ -116,6 +126,17 @@ class Qwen_7b : public Llm {
virtual bool is_stop(int token_id) override;
};

class Qwen_1_8b : public Qwen_7b {
public:
Qwen_1_8b() {
model_name_ = "Qwen_1.8b";
layer_nums_ = 24;
key_value_shape_ = {2, 1, 0, 16, 128};
hidden_size_ = 2048;
tokenizer_.reset(new Tiktoken);
}
};

class Llama2_7b : public Llm {
public:
Llama2_7b() {
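
In both Qwen variants above, the last two entries of key_value_shape_ multiply out to hidden_size_ (32 × 128 = 4096 and 16 × 128 = 2048), which presumably encode the attention head count and per-head dimension.

With prompt_len_, gen_seq_len_, prefill_us_ and decode_us_ now public, callers can aggregate timing themselves, which is what the new benchmark() helpers in the demos do. A minimal consumer sketch using only the interface declared above (model path and prompt are illustrative):

#include "llm.hpp"
#include <iostream>
#include <memory>

int main() {
// model_type defaults to "auto"; the factory picks the concrete subclass
std::unique_ptr<Llm> llm(Llm::createLLM("./qwen-1.8b"));
llm->load("./qwen-1.8b");
llm->response("你好"); // streams the reply to std::cout by default
std::cout << "prefill " << llm->prefill_us_ << " us, decode " << llm->decode_us_
<< " us, generated " << llm->gen_seq_len_ << " tokens" << std::endl;
llm->reset(); // clear history_ before the next independent prompt
return 0;
}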
1 change: 1 addition & 0 deletions llm/include/tokenizer.hpp
@@ -18,6 +18,7 @@
class Tokenizer {
public:
Tokenizer() = default;
virtual ~Tokenizer() = default;
virtual bool load(const std::string& filename) = 0;
virtual std::vector<int> encode(const std::string& str) = 0;
virtual std::string decode(int id) = 0;
50 changes: 46 additions & 4 deletions llm/llm_demo.cpp
@@ -6,17 +6,59 @@
//

#include "llm.hpp"
#include <iostream>
#include <fstream>
#include <stdlib.h>

void benchmark(Llm* llm, std::string prompt_file) {
std::cout << "prompt file is " << prompt_file << std::endl;
std::ifstream prompt_fs(prompt_file);
std::vector<std::string> prompts;
std::string prompt;
while (std::getline(prompt_fs, prompt)) {
// prompts starting with '#' are ignored
if (prompt.substr(0, 1) == "#") {
continue;
}
prompts.push_back(prompt);
}
int prompt_len = 0;
int decode_len = 0;
int64_t prefill_time = 0;
int64_t decode_time = 0;
// llm->warmup();
for (int i = 0; i < prompts.size(); i++) {
llm->response(prompts[i]);
prompt_len += llm->prompt_len_;
decode_len += llm->gen_seq_len_;
prefill_time += llm->prefill_us_;
decode_time += llm->decode_us_;
llm->reset();
}
float prefill_s = prefill_time / 1e6;
float decode_s = decode_time / 1e6;
printf("\n#################################\n");
printf("prompt tokens num = %d\n", prompt_len);
printf("decode tokens num = %d\n", decode_len);
printf("prefill time = %.2f s\n", prefill_s);
printf(" decode time = %.2f s\n", decode_s);
printf("prefill speed = %.2f tok/s\n", prompt_len / prefill_s);
printf(" decode speed = %.2f tok/s\n", decode_len / decode_s);
printf("##################################\n");
}

int main(int argc, const char* argv[]) {
if (argc < 2) {
std::cout << "Usage: ./llm_demo.out <model_path>" << std::endl;
std::cout << "Usage: " << argv[0] << " model_dir <prompt.txt>" << std::endl;
return 0;
}
std::string model_dir = argv[1];
std::cout << "model path is " << model_dir << std::endl;
std::unique_ptr<Llm> llm(Llm::createLLM(model_dir));
llm->load(model_dir);
llm->response("你好");
if (argc < 3) {
// interactive mode: return here so argv[2] is never read when no prompt file is given
llm->chat();
return 0;
}
std::string prompt_file = argv[2];
benchmark(llm.get(), prompt_file);
return 0;
}