@@ -301,7 +301,7 @@ class StableDiffusionGGML {
301
301
// TODO: shift_factor
302
302
}
303
303
304
- if ( version == VERSION_FLEX_2) {
304
+ if ( sd_version_is_control ( version)) {
305
305
// Might need vae encode for control cond
306
306
vae_decode_only = false ;
307
307
}
@@ -815,15 +815,15 @@ class StableDiffusionGGML {
815
815
const std::vector<float >& sigmas,
816
816
int start_merge_step,
817
817
SDCondition id_cond,
818
- ggml_tensor* denoise_mask = NULL ) {
818
+ ggml_tensor* denoise_mask = NULL ) {
819
819
std::vector<int > skip_layers (guidance.slg .layers , guidance.slg .layers + guidance.slg .layer_count );
820
820
821
821
// TODO (Pix2Pix): separate image guidance params (right now it's reusing distilled guidance)
822
822
823
- float cfg_scale = guidance.txt_cfg ;
823
+ float cfg_scale = guidance.txt_cfg ;
824
824
float img_cfg_scale = guidance.img_cfg ;
825
- float slg_scale = guidance.slg .scale ;
826
-
825
+ float slg_scale = guidance.slg .scale ;
826
+
827
827
float min_cfg = guidance.min_cfg ;
828
828
829
829
if (img_cfg_scale != cfg_scale && !sd_version_use_concat (version)) {
@@ -1475,6 +1475,17 @@ sd_image_t* generate_image(sd_ctx_t* sd_ctx,
1475
1475
int W = width / 8 ;
1476
1476
int H = height / 8 ;
1477
1477
LOG_INFO (" sampling using %s method" , sampling_methods_str[sample_method]);
1478
+
1479
+ struct ggml_tensor * control_latent = NULL ;
1480
+ if (sd_version_is_control (sd_ctx->sd ->version ) && image_hint != NULL ) {
1481
+ if (!sd_ctx->sd ->use_tiny_autoencoder ) {
1482
+ struct ggml_tensor * control_moments = sd_ctx->sd ->encode_first_stage (work_ctx, image_hint);
1483
+ control_latent = sd_ctx->sd ->get_first_stage_encoding (work_ctx, control_moments);
1484
+ } else {
1485
+ control_latent = sd_ctx->sd ->encode_first_stage (work_ctx, image_hint);
1486
+ }
1487
+ }
1488
+
1478
1489
if (sd_version_is_inpaint (sd_ctx->sd ->version )) {
1479
1490
int64_t mask_channels = 1 ;
1480
1491
if (sd_ctx->sd ->version == VERSION_FLUX_FILL) {
@@ -1507,50 +1518,44 @@ sd_image_t* generate_image(sd_ctx_t* sd_ctx,
1507
1518
}
1508
1519
}
1509
1520
}
1510
- if (sd_ctx->sd ->version == VERSION_FLEX_2 && image_hint != NULL && sd_ctx->sd ->control_net == NULL ) {
1521
+
1522
+ if (sd_ctx->sd ->version == VERSION_FLEX_2 && control_latent != NULL && sd_ctx->sd ->control_net == NULL ) {
1511
1523
bool no_inpaint = concat_latent == NULL ;
1512
1524
if (no_inpaint) {
1513
1525
concat_latent = ggml_new_tensor_4d (work_ctx, GGML_TYPE_F32, init_latent->ne [0 ], init_latent->ne [1 ], mask_channels + init_latent->ne [2 ], 1 );
1514
1526
}
1515
1527
// fill in the control image here
1516
- struct ggml_tensor * control_latents = NULL ;
1517
- if (!sd_ctx->sd ->use_tiny_autoencoder ) {
1518
- struct ggml_tensor * control_moments = sd_ctx->sd ->encode_first_stage (work_ctx, image_hint);
1519
- control_latents = sd_ctx->sd ->get_first_stage_encoding (work_ctx, control_moments);
1520
- } else {
1521
- control_latents = sd_ctx->sd ->encode_first_stage (work_ctx, image_hint);
1522
- }
1523
- for (int64_t x = 0 ; x < concat_latent->ne [0 ]; x++) {
1524
- for (int64_t y = 0 ; y < concat_latent->ne [1 ]; y++) {
1528
+ for (int64_t x = 0 ; x < control_latent->ne [0 ]; x++) {
1529
+ for (int64_t y = 0 ; y < control_latent->ne [1 ]; y++) {
1525
1530
if (no_inpaint) {
1526
- for (int64_t c = 0 ; c < concat_latent->ne [2 ] - control_latents ->ne [2 ]; c++) {
1531
+ for (int64_t c = 0 ; c < concat_latent->ne [2 ] - control_latent ->ne [2 ]; c++) {
1527
1532
// 0x16,1x1,0x16
1528
1533
ggml_tensor_set_f32 (concat_latent, c == init_latent->ne [2 ], x, y, c);
1529
1534
}
1530
1535
}
1531
- for (int64_t c = 0 ; c < control_latents ->ne [2 ]; c++) {
1532
- float v = ggml_tensor_get_f32 (control_latents , x, y, c);
1533
- ggml_tensor_set_f32 (concat_latent, v, x, y, concat_latent->ne [2 ] - control_latents ->ne [2 ] + c);
1536
+ for (int64_t c = 0 ; c < control_latent ->ne [2 ]; c++) {
1537
+ float v = ggml_tensor_get_f32 (control_latent , x, y, c);
1538
+ ggml_tensor_set_f32 (concat_latent, v, x, y, concat_latent->ne [2 ] - control_latent ->ne [2 ] + c);
1534
1539
}
1535
1540
}
1536
1541
}
1537
- // Disable controlnet
1538
- image_hint = NULL ;
1539
1542
} else if (concat_latent == NULL ) {
1540
1543
concat_latent = empty_latent;
1541
1544
}
1542
1545
cond.c_concat = concat_latent;
1543
1546
uncond.c_concat = empty_latent;
1544
1547
denoise_mask = NULL ;
1545
- } else if (sd_version_is_edit (sd_ctx->sd ->version )) {
1548
+ } else if (sd_version_is_edit (sd_ctx->sd ->version ) || sd_version_is_control (sd_ctx-> sd -> version ) ) {
1546
1549
auto empty_latent = ggml_new_tensor_4d (work_ctx, GGML_TYPE_F32, init_latent->ne [0 ], init_latent->ne [1 ], init_latent->ne [2 ], init_latent->ne [3 ]);
1547
1550
ggml_set_f32 (empty_latent, 0 );
1548
1551
uncond.c_concat = empty_latent;
1552
+ if (sd_version_is_control (sd_ctx->sd ->version ) && control_latent != NULL && sd_ctx->sd ->control_net == NULL ) {
1553
+ concat_latent = control_latent;
1554
+ }
1549
1555
if (concat_latent == NULL ) {
1550
1556
concat_latent = empty_latent;
1551
1557
}
1552
- cond.c_concat = concat_latent;
1553
-
1558
+ cond.c_concat = concat_latent;
1554
1559
}
1555
1560
for (int b = 0 ; b < batch_count; b++) {
1556
1561
int64_t sampling_start = ggml_time_ms ();
@@ -1823,7 +1828,7 @@ sd_image_t* img2img(sd_ctx_t* sd_ctx,
1823
1828
ggml_tensor* masked_latent = NULL ;
1824
1829
if (!sd_ctx->sd ->use_tiny_autoencoder ) {
1825
1830
ggml_tensor* moments = sd_ctx->sd ->encode_first_stage (work_ctx, masked_img);
1826
- masked_latent = sd_ctx->sd ->get_first_stage_encoding (work_ctx, moments);
1831
+ masked_latent = sd_ctx->sd ->get_first_stage_encoding (work_ctx, moments);
1827
1832
} else {
1828
1833
masked_latent = sd_ctx->sd ->encode_first_stage (work_ctx, masked_img);
1829
1834
}
@@ -1894,8 +1899,8 @@ sd_image_t* img2img(sd_ctx_t* sd_ctx,
1894
1899
} else {
1895
1900
concat_latent = init_latent;
1896
1901
}
1897
- }
1898
-
1902
+ }
1903
+
1899
1904
{
1900
1905
// LOG_WARN("Inpainting with a base model is not great");
1901
1906
denoise_mask = ggml_new_tensor_4d (work_ctx, GGML_TYPE_F32, width / 8 , height / 8 , 1 , 1 );
0 commit comments