Skip to content

Commit 2e0c46a

Browse files
committed
fix: adjust timestep calculations for DDIM and TCD
On img2img, the number of steps corresponds to the last precalculated sigma values, but the internal alphas_cumprod and compvis_sigmas were being computed over the entire step range. This commit also tweaks the prev_timestep calculation on DDIM to better match the current timestep (like on TCD), to avoid inconsistencies due to rounding.
1 parent eed97a5 commit 2e0c46a

File tree

2 files changed

+24
-9
lines changed

2 files changed

+24
-9
lines changed

denoiser.hpp

Lines changed: 14 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -497,6 +497,7 @@ static void sample_k_diffusion(sample_method_t method,
497497
ggml_context* work_ctx,
498498
ggml_tensor* x,
499499
std::vector<float> sigmas,
500+
int initial_step,
500501
std::shared_ptr<RNG> rng,
501502
float eta) {
502503
size_t steps = sigmas.size() - 1;
@@ -1083,12 +1084,13 @@ static void sample_k_diffusion(sample_method_t method,
10831084
// - pred_sample_direction -> "direction pointing to
10841085
// x_t"
10851086
// - pred_prev_sample -> "x_t-1"
1086-
int timestep =
1087-
roundf(TIMESTEPS -
1088-
i * ((float)TIMESTEPS / steps)) -
1089-
1;
1087+
int timestep = TIMESTEPS - 1 -
1088+
(int)roundf((initial_step + i) *
1089+
(TIMESTEPS / float(initial_step + steps)));
10901090
// 1. get previous step value (=t-1)
1091-
int prev_timestep = timestep - TIMESTEPS / steps;
1091+
int prev_timestep = TIMESTEPS - 1 -
1092+
(int)roundf((initial_step + i + 1) *
1093+
(TIMESTEPS / float(initial_step + steps)));
10921094
// The sigma here is chosen to cause the
10931095
// CompVisDenoiser to produce t = timestep
10941096
float sigma = compvis_sigmas[timestep];
@@ -1260,9 +1262,14 @@ static void sample_k_diffusion(sample_method_t method,
12601262
// Analytic form for TCD timesteps
12611263
int timestep = TIMESTEPS - 1 -
12621264
(TIMESTEPS / original_steps) *
1263-
(int)floor(i * ((float)original_steps / steps));
1265+
(int)floor((initial_step + i) *
1266+
((float)original_steps / (initial_step + steps)));
12641267
// 1. get previous step value
1265-
int prev_timestep = i >= steps - 1 ? 0 : TIMESTEPS - 1 - (TIMESTEPS / original_steps) * (int)floor((i + 1) * ((float)original_steps / steps));
1268+
int prev_timestep = i >= steps - 1 ? 0 :
1269+
TIMESTEPS - 1 -
1270+
(TIMESTEPS / original_steps) *
1271+
(int)floor((initial_step + i + 1) *
1272+
((float)original_steps / (initial_step + steps)));
12661273
// Here timestep_s is tau_n' in Algorithm 4. The _s
12671274
// notation appears to be that from C. Lu,
12681275
// "DPM-Solver: A Fast ODE Solver for Diffusion

stable-diffusion.cpp

Lines changed: 10 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -845,6 +845,7 @@ class StableDiffusionGGML {
845845
float eta,
846846
sample_method_t method,
847847
const std::vector<float>& sigmas,
848+
int initial_step,
848849
int start_merge_step,
849850
SDCondition id_cond,
850851
std::vector<ggml_tensor*> ref_latents = {},
@@ -1083,7 +1084,7 @@ class StableDiffusionGGML {
10831084
return denoised;
10841085
};
10851086

1086-
sample_k_diffusion(method, denoise, work_ctx, x, sigmas, rng, eta);
1087+
sample_k_diffusion(method, denoise, work_ctx, x, sigmas, initial_step, rng, eta);
10871088

10881089
x = denoiser->inverse_noise_scaling(sigmas[sigmas.size() - 1], x);
10891090

@@ -1520,6 +1521,7 @@ sd_image_t* generate_image_internal(sd_ctx_t* sd_ctx,
15201521
int height,
15211522
enum sample_method_t sample_method,
15221523
const std::vector<float>& sigmas,
1524+
int initial_step,
15231525
int64_t seed,
15241526
int batch_count,
15251527
const sd_image_t* control_cond,
@@ -1530,6 +1532,7 @@ sd_image_t* generate_image_internal(sd_ctx_t* sd_ctx,
15301532
std::vector<ggml_tensor*> ref_latents,
15311533
ggml_tensor* concat_latent = NULL,
15321534
ggml_tensor* denoise_mask = NULL) {
1535+
15331536
if (seed < 0) {
15341537
// Generally, when using the provided command line, the seed is always >0.
15351538
// However, to prevent potential issues if 'stable-diffusion.cpp' is invoked as a library
@@ -1795,6 +1798,7 @@ sd_image_t* generate_image_internal(sd_ctx_t* sd_ctx,
17951798
eta,
17961799
sample_method,
17971800
sigmas,
1801+
initial_step,
17981802
start_merge_step,
17991803
id_cond,
18001804
ref_latents,
@@ -1917,6 +1921,7 @@ sd_image_t* generate_image(sd_ctx_t* sd_ctx, const sd_img_gen_params_t* sd_img_g
19171921
ggml_tensor* concat_latent = NULL;
19181922
ggml_tensor* denoise_mask = NULL;
19191923
std::vector<float> sigmas = sd_ctx->sd->denoiser->get_sigmas(sd_img_gen_params->sample_steps);
1924+
int initial_step = 0;
19201925

19211926
if (sd_img_gen_params->init_image.data) {
19221927
LOG_INFO("IMG2IMG");
@@ -1926,7 +1931,8 @@ sd_image_t* generate_image(sd_ctx_t* sd_ctx, const sd_img_gen_params_t* sd_img_g
19261931
t_enc--;
19271932
LOG_INFO("target t_enc is %zu steps", t_enc);
19281933
std::vector<float> sigma_sched;
1929-
sigma_sched.assign(sigmas.begin() + sd_img_gen_params->sample_steps - t_enc - 1, sigmas.end());
1934+
initial_step = sd_img_gen_params->sample_steps - t_enc - 1;
1935+
sigma_sched.assign(sigmas.begin() + initial_step, sigmas.end());
19301936
sigmas = sigma_sched;
19311937

19321938
ggml_tensor* init_img = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, width, height, 3, 1);
@@ -2063,6 +2069,7 @@ sd_image_t* generate_image(sd_ctx_t* sd_ctx, const sd_img_gen_params_t* sd_img_g
20632069
height,
20642070
sd_img_gen_params->sample_method,
20652071
sigmas,
2072+
initial_step,
20662073
seed,
20672074
sd_img_gen_params->batch_count,
20682075
sd_img_gen_params->control_cond,
@@ -2162,6 +2169,7 @@ SD_API sd_image_t* generate_video(sd_ctx_t* sd_ctx, const sd_vid_gen_params_t* s
21622169
0.f,
21632170
sd_vid_gen_params->sample_method,
21642171
sigmas,
2172+
0,
21652173
-1,
21662174
SDCondition(NULL, NULL, NULL));
21672175

0 commit comments

Comments
 (0)