From 036ba9e6d8901d2b4991c5dc1ec2ace538947c93 Mon Sep 17 00:00:00 2001 From: leejet Date: Sun, 14 Apr 2024 16:36:08 +0800 Subject: [PATCH] feat: enable controlnet and photo maker for img2img mode --- examples/cli/main.cpp | 80 ++++---- ggml_extend.hpp | 7 +- stable-diffusion.cpp | 419 ++++++++++++++++++++---------------------- stable-diffusion.h | 7 +- tae.hpp | 2 +- 5 files changed, 257 insertions(+), 258 deletions(-) diff --git a/examples/cli/main.cpp b/examples/cli/main.cpp index a3dc042c..0f26644b 100644 --- a/examples/cli/main.cpp +++ b/examples/cli/main.cpp @@ -656,13 +656,16 @@ int main(int argc, const char* argv[]) { return 1; } - bool vae_decode_only = true; - uint8_t* input_image_buffer = NULL; + bool vae_decode_only = true; + uint8_t* input_image_buffer = NULL; + uint8_t* control_image_buffer = NULL; if (params.mode == IMG2IMG || params.mode == IMG2VID) { vae_decode_only = false; int c = 0; - input_image_buffer = stbi_load(params.input_path.c_str(), ¶ms.width, ¶ms.height, &c, 3); + int width = 0; + int height = 0; + input_image_buffer = stbi_load(params.input_path.c_str(), &width, &height, &c, 3); if (input_image_buffer == NULL) { fprintf(stderr, "load image from '%s' failed\n", params.input_path.c_str()); return 1; @@ -672,21 +675,22 @@ int main(int argc, const char* argv[]) { free(input_image_buffer); return 1; } - if (params.width <= 0) { + if (width <= 0) { fprintf(stderr, "error: the width of image must be greater than 0\n"); free(input_image_buffer); return 1; } - if (params.height <= 0) { + if (height <= 0) { fprintf(stderr, "error: the height of image must be greater than 0\n"); free(input_image_buffer); return 1; } // Resize input image ... - if (params.height % 64 != 0 || params.width % 64 != 0) { - int resized_height = params.height + (64 - params.height % 64); - int resized_width = params.width + (64 - params.width % 64); + if (params.height != height || params.width != width) { + printf("resize input image from %dx%d to %dx%d\n", width, height, params.width, params.height); + int resized_height = params.height; + int resized_width = params.width; uint8_t* resized_image_buffer = (uint8_t*)malloc(resized_height * resized_width * 3); if (resized_image_buffer == NULL) { @@ -694,7 +698,7 @@ int main(int argc, const char* argv[]) { free(input_image_buffer); return 1; } - stbir_resize(input_image_buffer, params.width, params.height, 0, + stbir_resize(input_image_buffer, width, height, 0, resized_image_buffer, resized_width, resized_height, 0, STBIR_TYPE_UINT8, 3 /*RGB channel*/, STBIR_ALPHA_CHANNEL_NONE, 0, STBIR_EDGE_CLAMP, STBIR_EDGE_CLAMP, @@ -704,8 +708,6 @@ int main(int argc, const char* argv[]) { // Save resized result free(input_image_buffer); input_image_buffer = resized_image_buffer; - params.height = resized_height; - params.width = resized_width; } } @@ -732,31 +734,32 @@ int main(int argc, const char* argv[]) { return 1; } + sd_image_t* control_image = NULL; + if (params.controlnet_path.size() > 0 && params.control_image_path.size() > 0) { + int c = 0; + control_image_buffer = stbi_load(params.control_image_path.c_str(), ¶ms.width, ¶ms.height, &c, 3); + if (control_image_buffer == NULL) { + fprintf(stderr, "load image from '%s' failed\n", params.control_image_path.c_str()); + return 1; + } + control_image = new sd_image_t{(uint32_t)params.width, + (uint32_t)params.height, + 3, + control_image_buffer}; + if (params.canny_preprocess) { // apply preprocessor + control_image->data = preprocess_canny(control_image->data, + control_image->width, + control_image->height, + 0.08f, + 0.08f, + 0.8f, + 1.0f, + false); + } + } + sd_image_t* results; if (params.mode == TXT2IMG) { - sd_image_t* control_image = NULL; - if (params.controlnet_path.size() > 0 && params.control_image_path.size() > 0) { - int c = 0; - input_image_buffer = stbi_load(params.control_image_path.c_str(), ¶ms.width, ¶ms.height, &c, 3); - if (input_image_buffer == NULL) { - fprintf(stderr, "load image from '%s' failed\n", params.control_image_path.c_str()); - return 1; - } - control_image = new sd_image_t{(uint32_t)params.width, - (uint32_t)params.height, - 3, - input_image_buffer}; - if (params.canny_preprocess) { // apply preprocessor - control_image->data = preprocess_canny(control_image->data, - control_image->width, - control_image->height, - 0.08f, - 0.08f, - 0.8f, - 1.0f, - false); - } - } results = txt2img(sd_ctx, params.prompt.c_str(), params.negative_prompt.c_str(), @@ -828,7 +831,12 @@ int main(int argc, const char* argv[]) { params.sample_steps, params.strength, params.seed, - params.batch_count); + params.batch_count, + control_image, + params.control_strength, + params.style_ratio, + params.normalize_input, + params.input_id_images_path.c_str()); } } @@ -881,6 +889,8 @@ int main(int argc, const char* argv[]) { } free(results); free_sd_ctx(sd_ctx); + free(control_image_buffer); + free(input_image_buffer); return 0; } diff --git a/ggml_extend.hpp b/ggml_extend.hpp index 82aa203f..6dbbfaa9 100644 --- a/ggml_extend.hpp +++ b/ggml_extend.hpp @@ -752,10 +752,9 @@ __STATIC_INLINE__ struct ggml_tensor* ggml_nn_timestep_embedding( return ggml_timestep_embedding(ctx, timesteps, dim, max_period); } - -__STATIC_INLINE__ size_t ggml_tensor_num(ggml_context * ctx) { +__STATIC_INLINE__ size_t ggml_tensor_num(ggml_context* ctx) { size_t num = 0; - for (ggml_tensor * t = ggml_get_first_tensor(ctx); t != nullptr; t = ggml_get_next_tensor(ctx, t)) { + for (ggml_tensor* t = ggml_get_first_tensor(ctx); t != nullptr; t = ggml_get_next_tensor(ctx, t)) { num++; } return num; @@ -851,7 +850,7 @@ struct GGMLModule { } public: - virtual std::string get_desc() = 0; + virtual std::string get_desc() = 0; GGMLModule(ggml_backend_t backend, ggml_type wtype = GGML_TYPE_F32) : backend(backend), wtype(wtype) { diff --git a/stable-diffusion.cpp b/stable-diffusion.cpp index 2fbbafea..abaae693 100644 --- a/stable-diffusion.cpp +++ b/stable-diffusion.cpp @@ -852,7 +852,6 @@ class StableDiffusionGGML { copy_ggml_tensor(x, x_t); struct ggml_tensor* noised_input = ggml_dup_tensor(work_ctx, x_t); - struct ggml_tensor* guided_hint = NULL; bool has_unconditioned = cfg_scale != 1.0 && uc != NULL; @@ -1536,60 +1535,35 @@ void free_sd_ctx(sd_ctx_t* sd_ctx) { free(sd_ctx); } -sd_image_t* txt2img(sd_ctx_t* sd_ctx, - const char* prompt_c_str, - const char* negative_prompt_c_str, - int clip_skip, - float cfg_scale, - int width, - int height, - enum sample_method_t sample_method, - int sample_steps, - int64_t seed, - int batch_count, - const sd_image_t* control_cond, - float control_strength, - float style_ratio, - bool normalize_input, - const char* input_id_images_path_c_str) { - LOG_DEBUG("txt2img %dx%d", width, height); - if (sd_ctx == NULL) { - return NULL; - } - // LOG_DEBUG("%s %s %f %d %d %d", prompt_c_str, negative_prompt_c_str, cfg_scale, sample_steps, seed, batch_count); - std::string prompt(prompt_c_str); - std::string negative_prompt(negative_prompt_c_str); - std::string input_id_images_path(input_id_images_path_c_str); - - // preprocess input id images - std::vector input_id_images; - if (sd_ctx->sd->pmid_model && input_id_images_path.size() > 0) { - std::vector img_files = get_files_from_dir(input_id_images_path); - for (std::string img_file : img_files) { - int c = 0; - int width, height; - uint8_t* input_image_buffer = stbi_load(img_file.c_str(), &width, &height, &c, 3); - if (input_image_buffer == NULL) { - LOG_ERROR("PhotoMaker load image from '%s' failed", img_file.c_str()); - continue; - } else { - LOG_INFO("PhotoMaker loaded image from '%s'", img_file.c_str()); - } - sd_image_t* input_image = NULL; - input_image = new sd_image_t{(uint32_t)width, - (uint32_t)height, - 3, - input_image_buffer}; - input_image = preprocess_id_image(input_image); - if (input_image == NULL) { - LOG_ERROR("preprocess input id image from '%s' failed", img_file.c_str()); - continue; - } - input_id_images.push_back(input_image); - } +sd_image_t* generate_image(sd_ctx_t* sd_ctx, + struct ggml_context* work_ctx, + ggml_tensor* init_latent, + std::string prompt, + std::string negative_prompt, + int clip_skip, + float cfg_scale, + int width, + int height, + enum sample_method_t sample_method, + const std::vector& sigmas, + int64_t seed, + int batch_count, + const sd_image_t* control_cond, + float control_strength, + float style_ratio, + bool normalize_input, + std::string input_id_images_path) { + if (seed < 0) { + // Generally, when using the provided command line, the seed is always >0. + // However, to prevent potential issues if 'stable-diffusion.cpp' is invoked as a library + // by a third party with a seed <0, let's incorporate randomization here. + srand((int)time(NULL)); + seed = rand(); } - // extract and remove lora + int sample_steps = sigmas.size() - 1; + + // Apply lora auto result_pair = extract_and_remove_lora(prompt); std::unordered_map lora_f2m = result_pair.first; // lora_name -> multiplier @@ -1605,49 +1579,50 @@ sd_image_t* txt2img(sd_ctx_t* sd_ctx, int64_t t1 = ggml_time_ms(); LOG_INFO("apply_loras completed, taking %.2fs", (t1 - t0) * 1.0f / 1000); - if (sd_ctx->sd->stacked_id && !sd_ctx->sd->pmid_lora->applied) { - t0 = ggml_time_ms(); - sd_ctx->sd->pmid_lora->apply(sd_ctx->sd->tensors, sd_ctx->sd->n_threads); - t1 = ggml_time_ms(); - sd_ctx->sd->pmid_lora->applied = true; - LOG_INFO("pmid_lora apply completed, taking %.2fs", (t1 - t0) * 1.0f / 1000); - if (sd_ctx->sd->free_params_immediately) { - sd_ctx->sd->pmid_lora->free_params_buffer(); - } - } - - struct ggml_init_params params; - params.mem_size = static_cast(10 * 1024 * 1024); // 10 MB - if (sd_ctx->sd->stacked_id) { - params.mem_size += static_cast(10 * 1024 * 1024); // 10 MB - } - params.mem_size += width * height * 3 * sizeof(float); - params.mem_size *= batch_count; - params.mem_buffer = NULL; - params.no_alloc = false; - // LOG_DEBUG("mem_size %u ", params.mem_size); - - struct ggml_context* work_ctx = ggml_init(params); - if (!work_ctx) { - LOG_ERROR("ggml_init() failed"); - return NULL; - } - - if (seed < 0) { - // Generally, when using the provided command line, the seed is always >0. - // However, to prevent potential issues if 'stable-diffusion.cpp' is invoked as a library - // by a third party with a seed <0, let's incorporate randomization here. - srand((int)time(NULL)); - seed = rand(); - } - + // Photo Maker std::string prompt_text_only; ggml_tensor* init_img = NULL; ggml_tensor* prompts_embeds = NULL; ggml_tensor* pooled_prompts_embeds = NULL; - // ggml_tensor* class_tokens_mask = NULL; std::vector class_tokens_mask; if (sd_ctx->sd->stacked_id) { + if (!sd_ctx->sd->pmid_lora->applied) { + t0 = ggml_time_ms(); + sd_ctx->sd->pmid_lora->apply(sd_ctx->sd->tensors, sd_ctx->sd->n_threads); + t1 = ggml_time_ms(); + sd_ctx->sd->pmid_lora->applied = true; + LOG_INFO("pmid_lora apply completed, taking %.2fs", (t1 - t0) * 1.0f / 1000); + if (sd_ctx->sd->free_params_immediately) { + sd_ctx->sd->pmid_lora->free_params_buffer(); + } + } + // preprocess input id images + std::vector input_id_images; + if (sd_ctx->sd->pmid_model && input_id_images_path.size() > 0) { + std::vector img_files = get_files_from_dir(input_id_images_path); + for (std::string img_file : img_files) { + int c = 0; + int width, height; + uint8_t* input_image_buffer = stbi_load(img_file.c_str(), &width, &height, &c, 3); + if (input_image_buffer == NULL) { + LOG_ERROR("PhotoMaker load image from '%s' failed", img_file.c_str()); + continue; + } else { + LOG_INFO("PhotoMaker loaded image from '%s'", img_file.c_str()); + } + sd_image_t* input_image = NULL; + input_image = new sd_image_t{(uint32_t)width, + (uint32_t)height, + 3, + input_image_buffer}; + input_image = preprocess_id_image(input_image); + if (input_image == NULL) { + LOG_ERROR("preprocess input id image from '%s' failed", img_file.c_str()); + continue; + } + input_id_images.push_back(input_image); + } + } if (input_id_images.size() > 0) { sd_ctx->sd->pmid_model->style_strength = style_ratio; int32_t w = input_id_images[0]->width; @@ -1682,21 +1657,22 @@ sd_image_t* txt2img(sd_ctx_t* sd_ctx, prompt_text_only = sd_ctx->sd->remove_trigger_from_prompt(work_ctx, prompt); // printf("%s || %s \n", prompt.c_str(), prompt_text_only.c_str()); prompt = prompt_text_only; // - if (sample_steps < 50) { - LOG_INFO("sampling steps increases from %d to 50 for PHOTOMAKER", sample_steps); - sample_steps = 50; - } + // if (sample_steps < 50) { + // LOG_INFO("sampling steps increases from %d to 50 for PHOTOMAKER", sample_steps); + // sample_steps = 50; + // } } else { LOG_WARN("Provided PhotoMaker model file, but NO input ID images"); LOG_WARN("Turn off PhotoMaker"); sd_ctx->sd->stacked_id = false; } + for (sd_image_t* img : input_id_images) { + free(img->data); + } + input_id_images.clear(); } - for (sd_image_t* img : input_id_images) { - free(img->data); - } - input_id_images.clear(); + // Get learned condition t0 = ggml_time_ms(); auto cond_pair = sd_ctx->sd->get_learned_condition(work_ctx, prompt, clip_skip, width, height); ggml_tensor* c = cond_pair.first; @@ -1720,12 +1696,14 @@ sd_image_t* txt2img(sd_ctx_t* sd_ctx, sd_ctx->sd->cond_stage_model->free_params_buffer(); } + // Control net hint struct ggml_tensor* image_hint = NULL; if (control_cond != NULL) { image_hint = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, width, height, 3, 1); sd_image_to_tensor(control_cond->data, image_hint); } + // Sample std::vector final_latents; // collect latents to decode int C = 4; int W = width / 8; @@ -1737,22 +1715,28 @@ sd_image_t* txt2img(sd_ctx_t* sd_ctx, LOG_INFO("generating image: %i/%i - seed %i", b + 1, batch_count, cur_seed); sd_ctx->sd->rng->manual_seed(cur_seed); - struct ggml_tensor* x_t = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, W, H, C, 1); - ggml_tensor_set_f32_randn(x_t, sd_ctx->sd->rng); - - std::vector sigmas = sd_ctx->sd->denoiser->schedule->get_sigmas(sample_steps); + struct ggml_tensor* x_t = NULL; + struct ggml_tensor* noise = NULL; + if (init_latent == NULL) { + x_t = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, W, H, C, 1); + ggml_tensor_set_f32_randn(x_t, sd_ctx->sd->rng); + } else { + x_t = init_latent; + noise = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, W, H, C, 1); + ggml_tensor_set_f32_randn(noise, sd_ctx->sd->rng); + } int start_merge_step = -1; if (sd_ctx->sd->stacked_id) { start_merge_step = int(sd_ctx->sd->pmid_model->style_strength / 100.f * sample_steps); - if (start_merge_step > 30) - start_merge_step = 30; + // if (start_merge_step > 30) + // start_merge_step = 30; LOG_INFO("PHOTOMAKER: start_merge_step: %d", start_merge_step); } struct ggml_tensor* x_0 = sd_ctx->sd->sample(work_ctx, x_t, - NULL, + noise, c, NULL, c_vector, @@ -1781,6 +1765,7 @@ sd_image_t* txt2img(sd_ctx_t* sd_ctx, int64_t t3 = ggml_time_ms(); LOG_INFO("generating %" PRId64 " latent images completed, taking %.2fs", final_latents.size(), (t3 - t1) * 1.0f / 1000); + // Decode to image LOG_INFO("decoding %zu latents", final_latents.size()); std::vector decoded_images; // collect decoded images for (size_t i = 0; i < final_latents.size(); i++) { @@ -1812,9 +1797,74 @@ sd_image_t* txt2img(sd_ctx_t* sd_ctx, result_images[i].data = sd_tensor_to_image(decoded_images[i]); } ggml_free(work_ctx); - LOG_INFO( - "txt2img completed in %.2fs", - (t4 - t0) * 1.0f / 1000); + + return result_images; +} + +sd_image_t* txt2img(sd_ctx_t* sd_ctx, + const char* prompt_c_str, + const char* negative_prompt_c_str, + int clip_skip, + float cfg_scale, + int width, + int height, + enum sample_method_t sample_method, + int sample_steps, + int64_t seed, + int batch_count, + const sd_image_t* control_cond, + float control_strength, + float style_ratio, + bool normalize_input, + const char* input_id_images_path_c_str) { + LOG_DEBUG("txt2img %dx%d", width, height); + if (sd_ctx == NULL) { + return NULL; + } + + struct ggml_init_params params; + params.mem_size = static_cast(10 * 1024 * 1024); // 10 MB + if (sd_ctx->sd->stacked_id) { + params.mem_size += static_cast(10 * 1024 * 1024); // 10 MB + } + params.mem_size += width * height * 3 * sizeof(float); + params.mem_size *= batch_count; + params.mem_buffer = NULL; + params.no_alloc = false; + // LOG_DEBUG("mem_size %u ", params.mem_size); + + struct ggml_context* work_ctx = ggml_init(params); + if (!work_ctx) { + LOG_ERROR("ggml_init() failed"); + return NULL; + } + + size_t t0 = ggml_time_ms(); + + std::vector sigmas = sd_ctx->sd->denoiser->schedule->get_sigmas(sample_steps); + + sd_image_t* result_images = generate_image(sd_ctx, + work_ctx, + NULL, + prompt_c_str, + negative_prompt_c_str, + clip_skip, + cfg_scale, + width, + height, + sample_method, + sigmas, + seed, + batch_count, + control_cond, + control_strength, + style_ratio, + normalize_input, + input_id_images_path_c_str); + + size_t t1 = ggml_time_ms(); + + LOG_INFO("txt2img completed in %.2fs", (t1 - t0) * 1.0f / 1000); return result_images; } @@ -1831,59 +1881,44 @@ sd_image_t* img2img(sd_ctx_t* sd_ctx, int sample_steps, float strength, int64_t seed, - int batch_count) { + int batch_count, + const sd_image_t* control_cond, + float control_strength, + float style_ratio, + bool normalize_input, + const char* input_id_images_path_c_str) { + LOG_DEBUG("img2img %dx%d", width, height); if (sd_ctx == NULL) { return NULL; } - std::string prompt(prompt_c_str); - std::string negative_prompt(negative_prompt_c_str); - - LOG_INFO("img2img %dx%d", width, height); - - std::vector sigmas = sd_ctx->sd->denoiser->schedule->get_sigmas(sample_steps); - size_t t_enc = static_cast(sample_steps * strength); - LOG_INFO("target t_enc is %zu steps", t_enc); - std::vector sigma_sched; - sigma_sched.assign(sigmas.begin() + sample_steps - t_enc - 1, sigmas.end()); struct ggml_init_params params; - params.mem_size = static_cast(10 * 1024) * 1024; // 10 MB + params.mem_size = static_cast(10 * 1024 * 1024); // 10 MB + if (sd_ctx->sd->stacked_id) { + params.mem_size += static_cast(10 * 1024 * 1024); // 10 MB + } params.mem_size += width * height * 3 * sizeof(float) * 2; + params.mem_size *= batch_count; params.mem_buffer = NULL; params.no_alloc = false; // LOG_DEBUG("mem_size %u ", params.mem_size); - // draft context struct ggml_context* work_ctx = ggml_init(params); if (!work_ctx) { LOG_ERROR("ggml_init() failed"); return NULL; } + size_t t0 = ggml_time_ms(); + if (seed < 0) { - seed = (int)time(NULL); + srand((int)time(NULL)); + seed = rand(); } - sd_ctx->sd->rng->manual_seed(seed); - // extract and remove lora - auto result_pair = extract_and_remove_lora(prompt); - std::unordered_map lora_f2m = result_pair.first; // lora_name -> multiplier - for (auto& kv : lora_f2m) { - LOG_DEBUG("lora %s:%.2f", kv.first.c_str(), kv.second); - } - prompt = result_pair.second; - LOG_DEBUG("prompt after extract and remove lora: \"%s\"", prompt.c_str()); - - // load lora from file - int64_t t0 = ggml_time_ms(); - sd_ctx->sd->apply_loras(lora_f2m); - int64_t t1 = ggml_time_ms(); - LOG_INFO("apply_loras completed, taking %.2fs", (t1 - t0) * 1.0f / 1000); - ggml_tensor* init_img = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, width, height, 3, 1); sd_image_to_tensor(init_image.data, init_img); - t0 = ggml_time_ms(); ggml_tensor* init_latent = NULL; if (!sd_ctx->sd->use_tiny_autoencoder) { ggml_tensor* moments = sd_ctx->sd->encode_first_stage(work_ctx, init_img); @@ -1892,87 +1927,37 @@ sd_image_t* img2img(sd_ctx_t* sd_ctx, init_latent = sd_ctx->sd->encode_first_stage(work_ctx, init_img); } // print_ggml_tensor(init_latent); - t1 = ggml_time_ms(); + size_t t1 = ggml_time_ms(); LOG_INFO("encode_first_stage completed, taking %.2fs", (t1 - t0) * 1.0f / 1000); - auto cond_pair = sd_ctx->sd->get_learned_condition(work_ctx, prompt, clip_skip, width, height); - ggml_tensor* c = cond_pair.first; - ggml_tensor* c_vector = cond_pair.second; // [adm_in_channels, ] - struct ggml_tensor* uc = NULL; - struct ggml_tensor* uc_vector = NULL; - if (cfg_scale != 1.0) { - bool force_zero_embeddings = false; - if (sd_ctx->sd->version == VERSION_XL && negative_prompt.size() == 0) { - force_zero_embeddings = true; - } - auto uncond_pair = sd_ctx->sd->get_learned_condition(work_ctx, negative_prompt, clip_skip, width, height, force_zero_embeddings); - uc = uncond_pair.first; - uc_vector = uncond_pair.second; // [adm_in_channels, ] - } - int64_t t2 = ggml_time_ms(); - LOG_INFO("get_learned_condition completed, taking %" PRId64 " ms", t2 - t1); - if (sd_ctx->sd->free_params_immediately) { - sd_ctx->sd->cond_stage_model->free_params_buffer(); - } - - sd_ctx->sd->rng->manual_seed(seed); - struct ggml_tensor* noise = ggml_dup_tensor(work_ctx, init_latent); - ggml_tensor_set_f32_randn(noise, sd_ctx->sd->rng); - - LOG_INFO("sampling using %s method", sampling_methods_str[sample_method]); - struct ggml_tensor* x_0 = sd_ctx->sd->sample(work_ctx, - init_latent, - noise, - c, - NULL, - c_vector, - uc, - NULL, - uc_vector, - {}, - 0.f, - cfg_scale, - cfg_scale, - sample_method, - sigma_sched, - -1, - NULL, - NULL); - // struct ggml_tensor *x_0 = load_tensor_from_file(ctx, "samples_ddim.bin"); - // print_ggml_tensor(x_0); - int64_t t3 = ggml_time_ms(); - LOG_INFO("sampling completed, taking %.2fs", (t3 - t2) * 1.0f / 1000); - if (sd_ctx->sd->free_params_immediately) { - sd_ctx->sd->diffusion_model->free_params_buffer(); - } - - struct ggml_tensor* img = sd_ctx->sd->decode_first_stage(work_ctx, x_0); - if (sd_ctx->sd->free_params_immediately && !sd_ctx->sd->use_tiny_autoencoder) { - sd_ctx->sd->first_stage_model->free_params_buffer(); - } - if (img == NULL) { - ggml_free(work_ctx); - return NULL; - } - - sd_image_t* result_images = (sd_image_t*)calloc(1, sizeof(sd_image_t)); - if (result_images == NULL) { - ggml_free(work_ctx); - return NULL; - } - - for (size_t i = 0; i < 1; i++) { - result_images[i].width = width; - result_images[i].height = height; - result_images[i].channel = 3; - result_images[i].data = sd_tensor_to_image(img); - } - ggml_free(work_ctx); - - int64_t t4 = ggml_time_ms(); - LOG_INFO("decode_first_stage completed, taking %.2fs", (t4 - t3) * 1.0f / 1000); + std::vector sigmas = sd_ctx->sd->denoiser->schedule->get_sigmas(sample_steps); + size_t t_enc = static_cast(sample_steps * strength); + LOG_INFO("target t_enc is %zu steps", t_enc); + std::vector sigma_sched; + sigma_sched.assign(sigmas.begin() + sample_steps - t_enc - 1, sigmas.end()); - LOG_INFO("img2img completed in %.2fs", (t4 - t0) * 1.0f / 1000); + sd_image_t* result_images = generate_image(sd_ctx, + work_ctx, + init_latent, + prompt_c_str, + negative_prompt_c_str, + clip_skip, + cfg_scale, + width, + height, + sample_method, + sigma_sched, + seed, + batch_count, + control_cond, + control_strength, + style_ratio, + normalize_input, + input_id_images_path_c_str); + + size_t t2 = ggml_time_ms(); + + LOG_INFO("img2img completed in %.2fs", (t1 - t0) * 1.0f / 1000); return result_images; } diff --git a/stable-diffusion.h b/stable-diffusion.h index 36960041..0de17ae2 100644 --- a/stable-diffusion.h +++ b/stable-diffusion.h @@ -160,7 +160,12 @@ SD_API sd_image_t* img2img(sd_ctx_t* sd_ctx, int sample_steps, float strength, int64_t seed, - int batch_count); + int batch_count, + const sd_image_t* control_cond, + float control_strength, + float style_strength, + bool normalize_input, + const char* input_id_images_path); SD_API sd_image_t* img2vid(sd_ctx_t* sd_ctx, sd_image_t init_image, diff --git a/tae.hpp b/tae.hpp index dd25a36d..5a4d8562 100644 --- a/tae.hpp +++ b/tae.hpp @@ -201,7 +201,7 @@ struct TinyAutoEncoder : public GGMLModule { } bool load_from_file(const std::string& file_path) { - LOG_INFO("loading taesd from '%s'", file_path.c_str()); + LOG_INFO("loading taesd from '%s', decode_only = %s", file_path.c_str(), decode_only ? "true" : "false"); alloc_params_buffer(); std::map taesd_tensors; taesd.get_param_tensors(taesd_tensors);