diff --git a/ggml b/ggml index 3a0b87bd..eec8d3ca 160000 --- a/ggml +++ b/ggml @@ -1 +1 @@ -Subproject commit 3a0b87bde946d2d5d4896f1e700272e54d07968a +Subproject commit eec8d3ca1258d1317a26fdbf561682a7dad4eb01 diff --git a/stable-diffusion.cpp b/stable-diffusion.cpp index 094326d3..e63d65a4 100644 --- a/stable-diffusion.cpp +++ b/stable-diffusion.cpp @@ -251,6 +251,11 @@ void image_vec_to_ggml(const std::vector& vec, } } +struct ggml_tensor * ggml_group_norm_32(struct ggml_context * ctx, + struct ggml_tensor * a) { + return ggml_group_norm(ctx, a, 32); +} + /*================================================== CLIPTokenizer ===================================================*/ const std::string UNK_TOKEN = "<|endoftext|>"; @@ -899,7 +904,7 @@ struct ResBlock { // in_layers // group norm 32 - auto h = ggml_group_norm(ctx, x); + auto h = ggml_group_norm_32(ctx, x); h = ggml_add(ctx, ggml_mul(ctx, ggml_repeat(ctx, @@ -929,7 +934,7 @@ struct ResBlock { // out_layers h = ggml_add(ctx, h, emb_out); // group norm 32 - h = ggml_group_norm_inplace(ctx, h); + h = ggml_group_norm_inplace(ctx, h, 32); h = ggml_add(ctx, ggml_mul(ctx, ggml_repeat(ctx, ggml_reshape_4d(ctx, out_layer_0_w, 1, 1, out_layer_0_w->ne[0], 1), h), h), ggml_repeat(ctx, ggml_reshape_4d(ctx, out_layer_0_b, 1, 1, out_layer_0_b->ne[0], 1), h)); @@ -1122,7 +1127,7 @@ struct SpatialTransformer { auto x_in = x; // group norm 32 - x = ggml_group_norm(ctx, x); + x = ggml_group_norm_32(ctx, x); x = ggml_add(ctx, ggml_mul(ctx, ggml_repeat(ctx, ggml_reshape_4d(ctx, norm_w, 1, 1, norm_w->ne[0], 1), x), x), ggml_repeat(ctx, ggml_reshape_4d(ctx, norm_b, 1, 1, norm_b->ne[0], 1), x)); @@ -1424,7 +1429,7 @@ struct UpSample { struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* x) { // x: [N, channels, h, w] - x = ggml_upscale(ctx, x); // [N, channels, h*2, w*2] + x = ggml_upscale(ctx, x, 2); // [N, channels, h*2, w*2] x = ggml_conv_2d(ctx, conv_w, x, 1, 1, 1, 1, 1, 1); x = ggml_add(ctx, @@ -1815,7 +1820,7 @@ struct UNetModel { // out // group norm 32 - h = ggml_group_norm(ctx, h); + h = ggml_group_norm_32(ctx, h); h = ggml_add(ctx, ggml_mul(ctx, ggml_repeat(ctx, @@ -1919,7 +1924,7 @@ struct ResnetBlock { // z: [N, in_channels, h, w] // group norm 32 - auto h = ggml_group_norm(ctx, z); + auto h = ggml_group_norm_32(ctx, z); h = ggml_mul(ctx, ggml_repeat(ctx, ggml_reshape_4d(ctx, norm1_w, 1, 1, norm1_w->ne[0], 1), @@ -1941,7 +1946,7 @@ struct ResnetBlock { h)); // [N, out_channels, h, w] // group norm 32 - h = ggml_group_norm(ctx, h); + h = ggml_group_norm_32(ctx, h); h = ggml_add(ctx, ggml_mul(ctx, ggml_repeat(ctx, ggml_reshape_4d(ctx, norm2_w, 1, 1, norm2_w->ne[0], 1), h), h), ggml_repeat(ctx, ggml_reshape_4d(ctx, norm2_b, 1, 1, norm2_b->ne[0], 1), h)); @@ -2028,7 +2033,7 @@ struct AttnBlock { // x: [N, in_channels, h, w] // group norm 32 - auto h_ = ggml_group_norm(ctx, x); + auto h_ = ggml_group_norm_32(ctx, x); h_ = ggml_add(ctx, ggml_mul(ctx, ggml_repeat(ctx, ggml_reshape_4d(ctx, norm_w, 1, 1, norm_w->ne[0], 1), h_), h_), ggml_repeat(ctx, ggml_reshape_4d(ctx, norm_b, 1, 1, norm_b->ne[0], 1), h_)); @@ -2253,7 +2258,7 @@ struct Encoder { h = mid.block_2.forward(ctx, h); // [N, block_in, h, w] // group norm 32 - h = ggml_group_norm(ctx, h); + h = ggml_group_norm_32(ctx, h); h = ggml_add(ctx, ggml_mul(ctx, ggml_repeat(ctx, ggml_reshape_4d(ctx, norm_out_w, 1, 1, norm_out_w->ne[0], 1), h), h), ggml_repeat(ctx, ggml_reshape_4d(ctx, norm_out_b, 1, 1, norm_out_b->ne[0], 1), h)); @@ -2435,7 +2440,7 @@ struct Decoder { } // group norm 32 - h = ggml_group_norm(ctx, h); + h = ggml_group_norm_32(ctx, h); h = ggml_add(ctx, ggml_mul(ctx, ggml_repeat(ctx, ggml_reshape_4d(ctx, norm_out_w, 1, 1, norm_out_w->ne[0], 1), h), h), ggml_repeat(ctx, ggml_reshape_4d(ctx, norm_out_b, 1, 1, norm_out_b->ne[0], 1), h));