
Commit

Both main and speculative mostly working, add latencies to speculative, change timing to thread cycles
AutonomicPerfectionist committed Jan 19, 2024
1 parent cfe3120 commit d3baaf7
Showing 7 changed files with 122 additions and 18 deletions.
8 changes: 6 additions & 2 deletions examples/main/main.cpp
@@ -235,7 +235,7 @@ int main(int argc, char ** argv) {
LOG("add_bos: %d\n", add_bos);

std::vector<llama_token> embd_inp;

int n_past = 0;
if (params.interactive_first || params.instruct || params.chatml || !params.prompt.empty() || session_tokens.empty()) {
LOG("tokenize the prompt\n");
if (params.chatml) {
@@ -249,6 +249,10 @@ int main(int argc, char ** argv) {

}
llama_decode(ctx, batch);
llama_token last = embd_inp.back();
n_past = embd_inp.size()-2;
embd_inp.clear();
embd_inp.push_back(last);
} else {
LOG("use session tokens\n");
embd_inp = session_tokens;
@@ -465,7 +469,7 @@ int main(int argc, char ** argv) {
bool input_echo = true;
bool need_to_save_session = !path_session.empty() && n_matching_session_tokens < embd_inp.size();

int n_past = 0;

int n_remain = params.n_predict;
int n_consumed = 0;
int n_session_consumed = 0;
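Note on the main.cpp change: the prompt is now decoded in a single llama_decode call up front, after which embd_inp is reduced to its final token and n_past is set to the number of positions already in the KV cache, so the interactive loop only re-feeds that last token. A minimal sketch of the pattern, assuming the llama.cpp C API and an already-filled batch; the helper name prime_prompt is illustrative, not part of the commit:

#include "llama.h"
#include <vector>

// Sketch only: evaluate the whole prompt once, then keep just its last token so the
// generation loop resumes from the cached prefix. Returns the new n_past.
static int prime_prompt(llama_context * ctx, llama_batch batch,
                        std::vector<llama_token> & embd_inp) {
    llama_decode(ctx, batch);                 // prompt tokens now live in the KV cache
    llama_token last   = embd_inp.back();     // remember the final prompt token
    int         n_past = (int) embd_inp.size() - 2;  // cached positions, matching the diff above
    embd_inp.clear();
    embd_inp.push_back(last);                 // only this token is fed to the loop again
    return n_past;
}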
56 changes: 49 additions & 7 deletions examples/speculative/speculative.cpp
@@ -299,6 +299,12 @@ int main(int argc, char ** argv) {
int spec_past_tgt = n_past_tgt;
int spec_past_dft = n_past_dft;

long ttft = ggml_time_us();
std::vector<uint64_t > inter_token_times;
int64_t itt_start;
bool first_token = false;
bool has_run_first_token = false;

bool first_run = true;
llama_token id;
while (true) {
@@ -456,6 +462,18 @@ int main(int argc, char ** argv) {
if (current_run.speculative) {
n_accept++;
}

if (has_run_first_token) {
if (first_token) {
ttft = ggml_time_us() - ttft;
LOG("\nTTFT: %ld\n", ttft);
first_token = false;
} else {
inter_token_times.push_back(ggml_time_us() - itt_start);
}

itt_start = ggml_time_us();
}
llama_sampling_accept(ctx_sampling, ctx_tgt, id, true);

// Root of WORLD
@@ -607,6 +625,12 @@ int main(int argc, char ** argv) {
begin_non_spec_run(params, n_seq_dft, ctx_dft, max_seq, drafts, id, batch_id, n_past_dft, n_past_dft, dft_cgraphs,
kvc_view_dft);

if (!has_run_first_token) {

has_run_first_token = true;
first_token = true;
}

seq_async_run dft_run = dft_cgraphs.back();
dft_cgraphs.pop_back();
llama_finish_async_decode(*ctx_dft, dft_run.batch, dft_run.cgraph);
@@ -648,11 +672,21 @@ int main(int argc, char ** argv) {

auto t_dec_end = ggml_time_us();

uint64_t avg_itt = 0;
for (auto latency : inter_token_times) {
avg_itt += latency;
}

avg_itt = avg_itt / inter_token_times.size();

LOG_TEE("\n\n");

LOG_TEE("encoded %4d tokens in %8.3f seconds, speed: %8.3f t/s\n", n_input, (t_enc_end - t_enc_start) / 1e6f, inp.size() / ((t_enc_end - t_enc_start) / 1e6f));
LOG_TEE("decoded %4d tokens in %8.3f seconds, speed: %8.3f t/s\n", n_predict, (t_dec_end - t_dec_start) / 1e6f, n_predict / ((t_dec_end - t_dec_start) / 1e6f));

LOG_TEE("Average inter-token latency: %f seconds\n", avg_itt / 1e6f);
LOG_TEE("Time-to-first-token: %f seconds\n", ttft / 1e6f);


LOG_TEE("\n");
LOG_TEE("n_draft = %d\n", n_draft);
LOG_TEE("n_predict = %d\n", n_predict);
@@ -671,7 +705,17 @@ int main(int argc, char ** argv) {
llama_sampling_free(drafts[s].ctx_sampling);
}

llama_batch_free(batch_dft);
if (llama_node_id(ctx_tgt) == 0) {
for (size_t i = tgt_cgraphs.size() - 1; i >= 0; i--) {
const auto &run = tgt_cgraphs[i];
llama_finish_async_decode(*ctx_tgt, run.batch, run.cgraph);
}
}

for (size_t i = dft_cgraphs.size()-1; i >= 0; i--) {
const auto& run = dft_cgraphs[i];
llama_finish_async_decode(*ctx_dft, run.batch, run.cgraph);
}

llama_free(ctx_tgt);
llama_free_model(model_tgt);
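The shutdown path above now drains any still-queued asynchronous decode runs before the contexts are freed. One caveat worth noting when walking those queues newest-to-oldest: a descending size_t index never goes negative, so an `i >= 0` condition cannot terminate the loop once it wraps. A sketch of the same drain written with reverse iterators, assuming tgt_cgraphs and dft_cgraphs are std::vector<seq_async_run> as in the surrounding code; the rank-0 guard is omitted for brevity:

// Sketch only; llama_finish_async_decode and the run fields are taken from the code above.
for (auto it = tgt_cgraphs.rbegin(); it != tgt_cgraphs.rend(); ++it) {
    llama_finish_async_decode(*ctx_tgt, it->batch, it->cgraph);   // newest run first
}
for (auto it = dft_cgraphs.rbegin(); it != dft_cgraphs.rend(); ++it) {
    llama_finish_async_decode(*ctx_dft, it->batch, it->cgraph);
}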
@@ -852,10 +896,6 @@ bool start_async_spec_run(const gpt_params &params, llama_context *ctx_tgt, llam
}


if (n_predict > params.n_predict || has_eos) {
return true;
}

llama_sampling_cp(ctx_sampling, drafts[0].ctx_sampling);

int n_seq_cur = 0;
@@ -1078,7 +1118,9 @@ bool start_async_spec_run(const gpt_params &params, llama_context *ctx_tgt, llam
//drafts[s].tokens.erase(drafts[s].tokens.begin());
}


if (first_run) {
++n_drafted;
}

begin_async_run(params.sparams, params.n_parallel, ctx_tgt, max_seq, n_past_dft, drafts, tgt_cgraphs,
batch_id, spec_past_tgt, kvc, true, batch_tgt, spec_past_tgt + drafts[0].tokens.size(), prefix_n_past, current_run.seq_offset);
39 changes: 38 additions & 1 deletion ggml-mpi.c
@@ -108,7 +108,7 @@ void ggml_mpi_free(struct ggml_mpi_context * ctx) {

ggml_mpi_sync_pipelined(ctx, NULL, 0, MPI_INT8_T, GGML_MPI_SHUTDOWN);
int buffer_size = 128*1024*1024;
MPI_Buffer_detach(ctx->send_buffer, &buffer_size);
MPI_Buffer_detach(&ctx->send_buffer, &buffer_size);
MPI_Comm_free(&(ctx->comm));
free(ctx);
}
@@ -253,7 +253,44 @@ bool ggml_mpi_eval_init(


ggml_mpi_sync_pipelined(ctx_mpi, n_tokens, 1, MPI_INT, GGML_MPI_N_TOKENS);
int8_t* temp_logits = (int8_t*) calloc(*n_tokens, sizeof(int8_t));

if (ctx_mpi->rank == 0 && *logits != NULL) {
ggml_mpi_sync_pipelined(ctx_mpi, *logits, *n_tokens, MPI_INT8_T, GGML_MPI_BATCH_LOGITS);
} else {
ggml_mpi_sync_pipelined(ctx_mpi, temp_logits, *n_tokens, MPI_INT8_T, GGML_MPI_BATCH_LOGITS);
}







if (ctx_mpi->rank != 0) {
bool should_set_batch_logits = false;
for (int i = 0; i < *n_tokens; i++) {
if (temp_logits[i]) {
should_set_batch_logits = true;
break;
}
}
if (should_set_batch_logits) {
if (*logits != NULL) {
free(*logits);
*logits = NULL;
}
*logits = temp_logits;
} else {
if (*logits != NULL) {
free(*logits);
*logits = NULL;
}
free(temp_logits);
}
} else {
free(temp_logits);
}

// For now, we assume that the pos, seq_ids, tokens, etc have been
// pre-allocated for the largest possible sizes, even on worker nodes.
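The ggml_mpi_eval_init change above shares the batch's per-token logits flags across ranks: rank 0 sends its array, each worker receives into a temporary buffer and adopts it only if at least one flag is set. A self-contained sketch of the same idea using a plain MPI_Bcast rather than the project's ggml_mpi_sync_pipelined helper; buffer names here are illustrative:

// Sketch only; compile with mpicxx and run with mpirun -np 2 ./a.out
#include <mpi.h>
#include <cstdint>
#include <cstdio>
#include <vector>

int main(int argc, char ** argv) {
    MPI_Init(&argc, &argv);
    int rank;
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);

    int n_tokens = 4;
    std::vector<int8_t> flags(n_tokens, 0);
    if (rank == 0) flags[n_tokens - 1] = 1;   // rank 0 requests logits for the last token only

    // after the broadcast every rank holds rank 0's per-token logits flags
    MPI_Bcast(flags.data(), n_tokens, MPI_INT8_T, 0, MPI_COMM_WORLD);

    bool any = false;
    for (int8_t f : flags) any = any || (f != 0);
    std::printf("rank %d: some token needs logits: %s\n", rank, any ? "yes" : "no");

    MPI_Finalize();
    return 0;
}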
4 changes: 4 additions & 0 deletions ggml-mpi.h
@@ -49,6 +49,10 @@ extern "C" {

#define GGML_MPI_BEGIN_TRANSACTION 18

#define GGML_MPI_MAX_N_SEQ 19

#define GGML_MPI_BATCH_LOGITS 20

/**
* The context used for MPI operations,
* a program may make use of more than one
4 changes: 2 additions & 2 deletions ggml.c
@@ -362,13 +362,13 @@ int64_t ggml_time_us(void) {
void ggml_time_init(void) {}
int64_t ggml_time_ms(void) {
struct timespec ts;
clock_gettime(CLOCK_MONOTONIC, &ts);
clock_gettime(CLOCK_THREAD_CPUTIME_ID, &ts);
return (int64_t)ts.tv_sec*1000 + (int64_t)ts.tv_nsec/1000000;
}

int64_t ggml_time_us(void) {
struct timespec ts;
clock_gettime(CLOCK_MONOTONIC, &ts);
clock_gettime(CLOCK_THREAD_CPUTIME_ID, &ts);
return (int64_t)ts.tv_sec*1000000 + (int64_t)ts.tv_nsec/1000;
}
#endif
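The ggml.c change retargets ggml_time_ms and ggml_time_us from CLOCK_MONOTONIC (elapsed wall time) to CLOCK_THREAD_CPUTIME_ID (CPU time consumed by the calling thread), so time spent blocked, for example waiting on MPI, no longer counts toward the reported timings. A small sketch contrasting the two clocks, assuming POSIX clock_gettime on Linux:

#include <cstdint>
#include <cstdio>
#include <time.h>
#include <unistd.h>

static int64_t now_us(clockid_t id) {
    struct timespec ts;
    clock_gettime(id, &ts);
    return (int64_t) ts.tv_sec * 1000000 + (int64_t) ts.tv_nsec / 1000;
}

int main() {
    int64_t wall0 = now_us(CLOCK_MONOTONIC);
    int64_t cpu0  = now_us(CLOCK_THREAD_CPUTIME_ID);

    usleep(100 * 1000);                     // blocked: wall time advances, thread CPU time barely moves
    volatile long sink = 0;
    for (long i = 0; i < 50L * 1000 * 1000; i++) sink += i;   // busy work: both clocks advance

    std::printf("wall: %lld us, thread cpu: %lld us\n",
                (long long) (now_us(CLOCK_MONOTONIC) - wall0),
                (long long) (now_us(CLOCK_THREAD_CPUTIME_ID) - cpu0));
    return 0;
}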
26 changes: 21 additions & 5 deletions llama.cpp
@@ -5463,6 +5463,10 @@ static struct ggml_cgraph * llama_decode_internal_phased(
llama_batch & batch,
uint8_t phase,
ggml_cgraph * cgraph) {
if (ggml_mpi_rank(lctx.ctx_mpi) < 0) {
return nullptr;
}

if (phase == 0) {
if (ggml_mpi_rank(lctx.ctx_mpi) == 0 && ggml_mpi_size(lctx.ctx_mpi) > 1) {
int transaction_type = GGML_MPI_DECODE;
@@ -5471,6 +5475,8 @@ static struct ggml_cgraph * llama_decode_internal_phased(
ggml_mpi_sync_ints_pipelined(lctx.ctx_mpi, &batch.batch_id, 1, GGML_MPI_BATCH_ID);

ggml_mpi_sync_ints_pipelined(lctx.ctx_mpi, &batch.n_tokens, 1, GGML_MPI_N_TOKENS);

ggml_mpi_sync_ints_pipelined(lctx.ctx_mpi, &batch.max_n_seq, 1, GGML_MPI_MAX_N_SEQ);
}
uint32_t n_tokens = batch.n_tokens;
if (n_tokens == 0) {
@@ -5487,7 +5493,7 @@ static struct ggml_cgraph * llama_decode_internal_phased(
GGML_ASSERT(n_tokens <= n_batch);

int n_threads = n_tokens == 1 ? cparams.n_threads : cparams.n_threads_batch;
GGML_ASSERT((!batch.token && batch.embd) || (batch.token && !batch.embd)); // NOLINT


const int64_t t_start_us = ggml_time_us();

@@ -5523,7 +5529,7 @@ static struct ggml_cgraph * llama_decode_internal_phased(
seq_id_arr.resize(n_tokens);
for (uint32_t i = 0; i < n_tokens; i++) {
n_seq_id[i] = 1;
seq_id[i].resize(1);
seq_id[i].resize(batch.max_n_seq);
seq_id[i][0] = batch.all_seq_id;
seq_id_arr[i] = seq_id[i].data();
}
@@ -5548,6 +5554,8 @@ }
}
n_tokens = batch.n_tokens;
#endif

GGML_ASSERT((!batch.token && batch.embd) || (batch.token && !batch.embd)); // NOLINT
if (!llama_kv_cache_find_slot(kv_self, batch)) {
printf("Cannot find cache slot\n");
return nullptr;
@@ -9837,12 +9845,12 @@ struct llama_batch llama_batch_get_one(
/*all_pos_0 =*/ pos_0,
/*all_pos_1 =*/ 1,
/*all_seq_id =*/ seq_id,
0
0, 1
};
}

struct llama_batch llama_batch_init(int32_t n_tokens, int32_t embd, int32_t n_seq_max) {
llama_batch batch = { 0, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, 0, 0, 0, 0,};
llama_batch batch = { 0, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, 0, 0, 0, 0, n_seq_max};

if (embd) {
batch.embd = (float *) malloc(sizeof(float) * n_tokens * embd);
@@ -9861,7 +9869,7 @@ struct llama_batch llama_batch_init(int32_t n_tokens, int32_t embd, int32_t n_se
return batch;
}

void llama_batch_free(struct llama_batch batch) {
void llama_batch_free(struct llama_batch & batch) {
if (batch.token) free(batch.token);
if (batch.embd) free(batch.embd);
if (batch.pos) free(batch.pos);
@@ -9873,6 +9881,13 @@ void llama_batch_free(struct llama_batch batch) {
free(batch.seq_id);
}
if (batch.logits) free(batch.logits);

batch.token = nullptr;
batch.embd = nullptr;
batch.pos = nullptr;
batch.n_seq_id = nullptr;
batch.seq_id = nullptr;
batch.logits = nullptr;
}

#ifdef GGML_USE_MPI
@@ -9887,6 +9902,7 @@ int llama_process_mpi_transaction(

switch (tag) {
case GGML_MPI_DECODE:
// llama_batch_free(batch);
return llama_decode_internal(*ctx, batch);
break;
case GGML_MPI_KV_CLEAR:
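llama_batch_free now takes the batch by reference and nulls each pointer after freeing it, so a repeated free of the same batch, such as the commented-out call in llama_process_mpi_transaction, becomes a harmless no-op. A generic sketch of the free-and-null pattern, using a simplified stand-in for llama_batch:

#include <cstdint>
#include <cstdlib>

// Simplified stand-in for llama_batch; the real struct has more members.
struct toy_batch {
    int32_t * token  = nullptr;
    float   * embd   = nullptr;
    int8_t  * logits = nullptr;
};

// Free-and-null: free(nullptr) is a no-op, so calling this twice on the same batch is safe.
static void toy_batch_free(toy_batch & batch) {
    free(batch.token);  batch.token  = nullptr;
    free(batch.embd);   batch.embd   = nullptr;
    free(batch.logits); batch.logits = nullptr;
}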
3 changes: 2 additions & 1 deletion llama.h
@@ -158,6 +158,7 @@ extern "C" {
llama_pos all_pos_1; // used if pos == NULL
llama_seq_id all_seq_id; // used if seq_id == NULL
int32_t batch_id;
int32_t max_n_seq;
} llama_batch;

struct llama_model_params {
Expand Down Expand Up @@ -581,7 +582,7 @@ LLAMA_API void llama_kv_cache_seq_cp_sync_bi(
int32_t n_seq_max);

// Frees a batch of tokens allocated with llama_batch_init()
LLAMA_API void llama_batch_free(struct llama_batch batch);
LLAMA_API void llama_batch_free(struct llama_batch & batch);

// Positive return values does not mean a fatal error, but rather a warning.
// 0 - success
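A short usage sketch of the updated header API: the new max_n_seq field is filled in by llama_batch_init, and the batch is now passed to llama_batch_free by reference, so calling code otherwise looks the same:

#include "llama.h"

void batch_lifecycle_example(void) {
    // 512 token slots, token-based batch (embd = 0), up to 4 sequence ids per token
    llama_batch batch = llama_batch_init(512, 0, 4);
    // ... fill the batch and call llama_decode ...
    llama_batch_free(batch);   // internal pointers are freed and reset to nullptr
}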
