
Commit

Both main and speculative mostly working, add latencies to speculative, change timing to thread cycles
AutonomicPerfectionist committed Jan 19, 2024
1 parent cfe3120 commit d3baaf7
Showing 7 changed files with 122 additions and 18 deletions.
8 changes: 6 additions & 2 deletions examples/main/main.cpp
@@ -235,7 +235,7 @@ int main(int argc, char ** argv) {
LOG("add_bos: %d\n", add_bos);

std::vector<llama_token> embd_inp;

int n_past = 0;
if (params.interactive_first || params.instruct || params.chatml || !params.prompt.empty() || session_tokens.empty()) {
LOG("tokenize the prompt\n");
if (params.chatml) {
@@ -249,6 +249,10 @@ int main(int argc, char ** argv) {

}
llama_decode(ctx, batch);
llama_token last = embd_inp.back();
n_past = embd_inp.size()-2;
embd_inp.clear();
embd_inp.push_back(last);
} else {
LOG("use session tokens\n");
embd_inp = session_tokens;
@@ -465,7 +469,7 @@ int main(int argc, char ** argv) {
bool input_echo = true;
bool need_to_save_session = !path_session.empty() && n_matching_session_tokens < embd_inp.size();

int n_past = 0;

int n_remain = params.n_predict;
int n_consumed = 0;
int n_session_consumed = 0;
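Note on the main.cpp change: the prompt is now decoded in a single llama_decode call up front, after which embd_inp is reduced to its final token and n_past is set to the number of positions already in the KV cache, so the interactive loop only re-feeds that last token. A minimal sketch of the pattern, assuming the llama.cpp C API and an already-filled batch; the helper name prime_prompt is illustrative, not part of the commit:

#include "llama.h"
#include <vector>

// Sketch only: evaluate the whole prompt once, then keep just its last token so the
// generation loop resumes from the cached prefix. Returns the new n_past.
static int prime_prompt(llama_context * ctx, llama_batch batch,
                        std::vector<llama_token> & embd_inp) {
    llama_decode(ctx, batch);                 // prompt tokens now live in the KV cache
    llama_token last   = embd_inp.back();     // remember the final prompt token
    int         n_past = (int) embd_inp.size() - 2;  // cached positions, matching the diff above
    embd_inp.clear();
    embd_inp.push_back(last);                 // only this token is fed to the loop again
    return n_past;
}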
56 changes: 49 additions & 7 deletions examples/speculative/speculative.cpp
@@ -299,6 +299,12 @@ int main(int argc, char ** argv) {
int spec_past_tgt = n_past_tgt;
int spec_past_dft = n_past_dft;

long ttft = ggml_time_us();
std::vector<uint64_t > inter_token_times;
int64_t itt_start;
bool first_token = false;
bool has_run_first_token = false;

bool first_run = true;
llama_token id;
while (true) {
@@ -456,6 +462,18 @@ int main(int argc, char ** argv) {
if (current_run.speculative) {
n_accept++;
}

if (has_run_first_token) {
if (first_token) {
ttft = ggml_time_us() - ttft;
LOG("\nTTFT: %ld\n", ttft);
first_token = false;
} else {
inter_token_times.push_back(ggml_time_us() - itt_start);
}

itt_start = ggml_time_us();
}
llama_sampling_accept(ctx_sampling, ctx_tgt, id, true);

// Root of WORLD
@@ -607,6 +625,12 @@ int main(int argc, char ** argv) {
begin_non_spec_run(params, n_seq_dft, ctx_dft, max_seq, drafts, id, batch_id, n_past_dft, n_past_dft, dft_cgraphs,
kvc_view_dft);

if (!has_run_first_token) {

has_run_first_token = true;
first_token = true;
}

seq_async_run dft_run = dft_cgraphs.back();
dft_cgraphs.pop_back();
llama_finish_async_decode(*ctx_dft, dft_run.batch, dft_run.cgraph);
@@ -648,11 +672,21 @@ int main(int argc, char ** argv) {

auto t_dec_end = ggml_time_us();

uint64_t avg_itt = 0;
for (auto latency : inter_token_times) {
avg_itt += latency;
}

avg_itt = avg_itt / inter_token_times.size();

LOG_TEE("\n\n");

LOG_TEE("encoded %4d tokens in %8.3f seconds, speed: %8.3f t/s\n", n_input, (t_enc_end - t_enc_start) / 1e6f, inp.size() / ((t_enc_end - t_enc_start) / 1e6f));
LOG_TEE("decoded %4d tokens in %8.3f seconds, speed: %8.3f t/s\n", n_predict, (t_dec_end - t_dec_start) / 1e6f, n_predict / ((t_dec_end - t_dec_start) / 1e6f));

LOG_TEE("Average inter-token latency: %f seconds\n", avg_itt / 1e6f);
LOG_TEE("Time-to-first-token: %f seconds\n", ttft / 1e6f);


LOG_TEE("\n");
LOG_TEE("n_draft = %d\n", n_draft);
LOG_TEE("n_predict = %d\n", n_predict);
@@ -671,7 +705,17 @@ int main(int argc, char ** argv) {
llama_sampling_free(drafts[s].ctx_sampling);
}

llama_batch_free(batch_dft);
if (llama_node_id(ctx_tgt) == 0) {
for (size_t i = tgt_cgraphs.size() - 1; i >= 0; i--) {
const auto &run = tgt_cgraphs[i];
llama_finish_async_decode(*ctx_tgt, run.batch, run.cgraph);
}
}

for (size_t i = dft_cgraphs.size()-1; i >= 0; i--) {
const auto& run = dft_cgraphs[i];
llama_finish_async_decode(*ctx_dft, run.batch, run.cgraph);
}

llama_free(ctx_tgt);
llama_free_model(model_tgt);
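The shutdown path above now drains any still-queued asynchronous decode runs before the contexts are freed. One caveat worth noting when walking those queues newest-to-oldest: a descending size_t index never goes negative, so an `i >= 0` condition cannot terminate the loop once it wraps. A sketch of the same drain written with reverse iterators, assuming tgt_cgraphs and dft_cgraphs are std::vector<seq_async_run> as in the surrounding code; the rank-0 guard is omitted for brevity:

// Sketch only; llama_finish_async_decode and the run fields are taken from the code above.
for (auto it = tgt_cgraphs.rbegin(); it != tgt_cgraphs.rend(); ++it) {
    llama_finish_async_decode(*ctx_tgt, it->batch, it->cgraph);   // newest run first
}
for (auto it = dft_cgraphs.rbegin(); it != dft_cgraphs.rend(); ++it) {
    llama_finish_async_decode(*ctx_dft, it->batch, it->cgraph);
}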
@@ -852,10 +896,6 @@ bool start_async_spec_run(const gpt_params &params, llama_context *ctx_tgt, llam
}


if (n_predict > params.n_predict || has_eos) {
return true;
}

llama_sampling_cp(ctx_sampling, drafts[0].ctx_sampling);

int n_seq_cur = 0;
@@ -1078,7 +1118,9 @@ bool start_async_spec_run(const gpt_params &params, llama_context *ctx_tgt, llam
//drafts[s].tokens.erase(drafts[s].tokens.begin());
}


if (first_run) {
++n_drafted;
}

begin_async_run(params.sparams, params.n_parallel, ctx_tgt, max_seq, n_past_dft, drafts, tgt_cgraphs,
batch_id, spec_past_tgt, kvc, true, batch_tgt, spec_past_tgt + drafts[0].tokens.size(), prefix_n_past, current_run.seq_offset);
39 changes: 38 additions & 1 deletion ggml-mpi.c
@@ -108,7 +108,7 @@ void ggml_mpi_free(struct ggml_mpi_context * ctx) {

ggml_mpi_sync_pipelined(ctx, NULL, 0, MPI_INT8_T, GGML_MPI_SHUTDOWN);
int buffer_size = 128*1024*1024;
MPI_Buffer_detach(ctx->send_buffer, &buffer_size);
MPI_Buffer_detach(&ctx->send_buffer, &buffer_size);
MPI_Comm_free(&(ctx->comm));
free(ctx);
}
@@ -253,7 +253,44 @@ bool ggml_mpi_eval_init(


ggml_mpi_sync_pipelined(ctx_mpi, n_tokens, 1, MPI_INT, GGML_MPI_N_TOKENS);
int8_t* temp_logits = (int8_t*) calloc(*n_tokens, sizeof(int8_t));

if (ctx_mpi->rank == 0 && *logits != NULL) {
ggml_mpi_sync_pipelined(ctx_mpi, *logits, *n_tokens, MPI_INT8_T, GGML_MPI_BATCH_LOGITS);
} else {
ggml_mpi_sync_pipelined(ctx_mpi, temp_logits, *n_tokens, MPI_INT8_T, GGML_MPI_BATCH_LOGITS);
}







if (ctx_mpi->rank != 0) {
bool should_set_batch_logits = false;
for (int i = 0; i < *n_tokens; i++) {
if (temp_logits[i]) {
should_set_batch_logits = true;
break;
}
}
if (should_set_batch_logits) {
if (*logits != NULL) {
free(*logits);
*logits = NULL;
}
*logits = temp_logits;
} else {
if (*logits != NULL) {
free(*logits);
*logits = NULL;
}
free(temp_logits);
}
} else {
free(temp_logits);
}

// For now, we assume that the pos, seq_ids, tokens, etc have been
// pre-allocated for the largest possible sizes, even on worker nodes.
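The ggml_mpi_eval_init change above shares the batch's per-token logits flags across ranks: rank 0 sends its array, each worker receives into a temporary buffer and adopts it only if at least one flag is set. A self-contained sketch of the same idea using a plain MPI_Bcast rather than the project's ggml_mpi_sync_pipelined helper; buffer names here are illustrative:

// Sketch only; compile with mpicxx and run with mpirun -np 2 ./a.out
#include <mpi.h>
#include <cstdint>
#include <cstdio>
#include <vector>

int main(int argc, char ** argv) {
    MPI_Init(&argc, &argv);
    int rank;
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);

    int n_tokens = 4;
    std::vector<int8_t> flags(n_tokens, 0);
    if (rank == 0) flags[n_tokens - 1] = 1;   // rank 0 requests logits for the last token only

    // after the broadcast every rank holds rank 0's per-token logits flags
    MPI_Bcast(flags.data(), n_tokens, MPI_INT8_T, 0, MPI_COMM_WORLD);

    bool any = false;
    for (int8_t f : flags) any = any || (f != 0);
    std::printf("rank %d: some token needs logits: %s\n", rank, any ? "yes" : "no");

    MPI_Finalize();
    return 0;
}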
4 changes: 4 additions & 0 deletions ggml-mpi.h
@@ -49,6 +49,10 @@ extern "C" {

#define GGML_MPI_BEGIN_TRANSACTION 18

#define GGML_MPI_MAX_N_SEQ 19

#define GGML_MPI_BATCH_LOGITS 20

/**
* The context used for MPI operations,
* a program may make use of more than one
4 changes: 2 additions & 2 deletions ggml.c
@@ -362,13 +362,13 @@ int64_t ggml_time_us(void) {
void ggml_time_init(void) {}
int64_t ggml_time_ms(void) {
struct timespec ts;
clock_gettime(CLOCK_MONOTONIC, &ts);
clock_gettime(CLOCK_THREAD_CPUTIME_ID, &ts);
return (int64_t)ts.tv_sec*1000 + (int64_t)ts.tv_nsec/1000000;
}

int64_t ggml_time_us(void) {
struct timespec ts;
clock_gettime(CLOCK_MONOTONIC, &ts);
clock_gettime(CLOCK_THREAD_CPUTIME_ID, &ts);
return (int64_t)ts.tv_sec*1000000 + (int64_t)ts.tv_nsec/1000;
}
#endif
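The ggml.c change retargets ggml_time_ms and ggml_time_us from CLOCK_MONOTONIC (elapsed wall time) to CLOCK_THREAD_CPUTIME_ID (CPU time consumed by the calling thread), so time spent blocked, for example waiting on MPI, no longer counts toward the reported timings. A small sketch contrasting the two clocks, assuming POSIX clock_gettime on Linux:

#include <cstdint>
#include <cstdio>
#include <time.h>
#include <unistd.h>

static int64_t now_us(clockid_t id) {
    struct timespec ts;
    clock_gettime(id, &ts);
    return (int64_t) ts.tv_sec * 1000000 + (int64_t) ts.tv_nsec / 1000;
}

int main() {
    int64_t wall0 = now_us(CLOCK_MONOTONIC);
    int64_t cpu0  = now_us(CLOCK_THREAD_CPUTIME_ID);

    usleep(100 * 1000);                     // blocked: wall time advances, thread CPU time barely moves
    volatile long sink = 0;
    for (long i = 0; i < 50L * 1000 * 1000; i++) sink += i;   // busy work: both clocks advance

    std::printf("wall: %lld us, thread cpu: %lld us\n",
                (long long) (now_us(CLOCK_MONOTONIC) - wall0),
                (long long) (now_us(CLOCK_THREAD_CPUTIME_ID) - cpu0));
    return 0;
}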
26 changes: 21 additions & 5 deletions llama.cpp
@@ -5463,6 +5463,10 @@ static struct ggml_cgraph * llama_decode_internal_phased(
llama_batch & batch,
uint8_t phase,
ggml_cgraph * cgraph) {
if (ggml_mpi_rank(lctx.ctx_mpi) < 0) {
return nullptr;
}

if (phase == 0) {
if (ggml_mpi_rank(lctx.ctx_mpi) == 0 && ggml_mpi_size(lctx.ctx_mpi) > 1) {
int transaction_type = GGML_MPI_DECODE;
@@ -5471,6 +5475,8 @@ static struct ggml_cgraph * llama_decode_internal_phased(
ggml_mpi_sync_ints_pipelined(lctx.ctx_mpi, &batch.batch_id, 1, GGML_MPI_BATCH_ID);

ggml_mpi_sync_ints_pipelined(lctx.ctx_mpi, &batch.n_tokens, 1, GGML_MPI_N_TOKENS);

ggml_mpi_sync_ints_pipelined(lctx.ctx_mpi, &batch.max_n_seq, 1, GGML_MPI_MAX_N_SEQ);
}
uint32_t n_tokens = batch.n_tokens;
if (n_tokens == 0) {
@@ -5487,7 +5493,7 @@ static struct ggml_cgraph * llama_decode_internal_phased(
GGML_ASSERT(n_tokens <= n_batch);

int n_threads = n_tokens == 1 ? cparams.n_threads : cparams.n_threads_batch;
GGML_ASSERT((!batch.token && batch.embd) || (batch.token && !batch.embd)); // NOLINT


const int64_t t_start_us = ggml_time_us();

@@ -5523,7 +5529,7 @@ static struct ggml_cgraph * llama_decode_internal_phased(
seq_id_arr.resize(n_tokens);
for (uint32_t i = 0; i < n_tokens; i++) {
n_seq_id[i] = 1;
seq_id[i].resize(1);
seq_id[i].resize(batch.max_n_seq);
seq_id[i][0] = batch.all_seq_id;
seq_id_arr[i] = seq_id[i].data();
}
@@ -5548,6 +5554,8 @@ }
}
n_tokens = batch.n_tokens;
#endif

GGML_ASSERT((!batch.token && batch.embd) || (batch.token && !batch.embd)); // NOLINT
if (!llama_kv_cache_find_slot(kv_self, batch)) {
printf("Cannot find cache slot\n");
return nullptr;
@@ -9837,12 +9845,12 @@ struct llama_batch llama_batch_get_one(
/*all_pos_0 =*/ pos_0,
/*all_pos_1 =*/ 1,
/*all_seq_id =*/ seq_id,
0
0, 1
};
}

struct llama_batch llama_batch_init(int32_t n_tokens, int32_t embd, int32_t n_seq_max) {
llama_batch batch = { 0, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, 0, 0, 0, 0,};
llama_batch batch = { 0, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, 0, 0, 0, 0, n_seq_max};

if (embd) {
batch.embd = (float *) malloc(sizeof(float) * n_tokens * embd);
@@ -9861,7 +9869,7 @@ struct llama_batch llama_batch_init(int32_t n_tokens, int32_t embd, int32_t n_se
return batch;
}

void llama_batch_free(struct llama_batch batch) {
void llama_batch_free(struct llama_batch & batch) {
if (batch.token) free(batch.token);
if (batch.embd) free(batch.embd);
if (batch.pos) free(batch.pos);
@@ -9873,6 +9881,13 @@ void llama_batch_free(struct llama_batch batch) {
free(batch.seq_id);
}
if (batch.logits) free(batch.logits);

batch.token = nullptr;
batch.embd = nullptr;
batch.pos = nullptr;
batch.n_seq_id = nullptr;
batch.seq_id = nullptr;
batch.logits = nullptr;
}

#ifdef GGML_USE_MPI
@@ -9887,6 +9902,7 @@ int llama_process_mpi_transaction(

switch (tag) {
case GGML_MPI_DECODE:
// llama_batch_free(batch);
return llama_decode_internal(*ctx, batch);
break;
case GGML_MPI_KV_CLEAR:
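llama_batch_free now takes the batch by reference and nulls each pointer after freeing it, so a repeated free of the same batch, such as the commented-out call in llama_process_mpi_transaction, becomes a harmless no-op. A generic sketch of the free-and-null pattern, using a simplified stand-in for llama_batch:

#include <cstdint>
#include <cstdlib>

// Simplified stand-in for llama_batch; the real struct has more members.
struct toy_batch {
    int32_t * token  = nullptr;
    float   * embd   = nullptr;
    int8_t  * logits = nullptr;
};

// Free-and-null: free(nullptr) is a no-op, so calling this twice on the same batch is safe.
static void toy_batch_free(toy_batch & batch) {
    free(batch.token);  batch.token  = nullptr;
    free(batch.embd);   batch.embd   = nullptr;
    free(batch.logits); batch.logits = nullptr;
}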
3 changes: 2 additions & 1 deletion llama.h
@@ -158,6 +158,7 @@ extern "C" {
llama_pos all_pos_1; // used if pos == NULL
llama_seq_id all_seq_id; // used if seq_id == NULL
int32_t batch_id;
int32_t max_n_seq;
} llama_batch;

struct llama_model_params {
Expand Down Expand Up @@ -581,7 +582,7 @@ LLAMA_API void llama_kv_cache_seq_cp_sync_bi(
int32_t n_seq_max);

// Frees a batch of tokens allocated with llama_batch_init()
LLAMA_API void llama_batch_free(struct llama_batch batch);
LLAMA_API void llama_batch_free(struct llama_batch & batch);

// Positive return values does not mean a fatal error, but rather a warning.
// 0 - success
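A short usage sketch of the updated header API: the new max_n_seq field is filled in by llama_batch_init, and the batch is now passed to llama_batch_free by reference, so calling code otherwise looks the same:

#include "llama.h"

void batch_lifecycle_example(void) {
    // 512 token slots, token-based batch (embd = 0), up to 4 sequence ids per token
    llama_batch batch = llama_batch_init(512, 0, 4);
    // ... fill the batch and call llama_decode ...
    llama_batch_free(batch);   // internal pointers are freed and reset to nullptr
}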
