-
Notifications
You must be signed in to change notification settings - Fork 681
feat: automatically fall back to VAE tiling when an untiled decode exceeds the backend buffer limit #1621
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: master
Are you sure you want to change the base?
feat: automatically fall back to VAE tiling when an untiled decode exceeds the backend buffer limit #1621
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -958,7 +958,7 @@ ArgOptions SDGenerationParams::get_options() { | |
| &extra_sample_args}, | ||
| {"", | ||
| "--extra-tiling-args", | ||
| "extra VAE tiling args, key=value list. LTX video VAE supports temporal_tile_frames (default: 4), temporal_tile_overlap (default: 1)", | ||
| "extra VAE tiling args, key=value list. max_buffer_size (bytes) forces the auto fallback to tile when an untiled VAE compute buffer would exceed it. LTX video VAE supports temporal_tile_frames (default: 4), temporal_tile_overlap (default: 1)", | ||
| (int)',', | ||
| &extra_tiling_args}, | ||
| }; | ||
|
|
@@ -1148,6 +1148,12 @@ ArgOptions SDGenerationParams::get_options() { | |
| "process vae in tiles to reduce memory usage", | ||
| true, | ||
| &vae_tiling_params.enabled}, | ||
|
Owner
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I think all parameters related to tiling fallback should be placed in […] |
||
| {"", | ||
| "--no-vae-tiling-fallback", | ||
| "disable the automatic fallback to VAE tiling when an untiled decode would exceed the " | ||
| "backend's max buffer size (fail instead of tiling)", | ||
| false, | ||
| &vae_tiling_params.auto_tile}, | ||
| {"", | ||
| "--temporal-tiling", | ||
| "enable temporal tiling for LTX video VAE decode", | ||
|
|
@@ -1892,6 +1898,9 @@ bool SDGenerationParams::from_json_str( | |
| if (tiling_json.contains("enabled") && tiling_json["enabled"].is_boolean()) { | ||
| vae_tiling_params.enabled = tiling_json["enabled"]; | ||
| } | ||
| if (tiling_json.contains("auto_tile") && tiling_json["auto_tile"].is_boolean()) { | ||
| vae_tiling_params.auto_tile = tiling_json["auto_tile"]; | ||
| } | ||
| if (tiling_json.contains("temporal_tiling") && tiling_json["temporal_tiling"].is_boolean()) { | ||
| vae_tiling_params.temporal_tiling = tiling_json["temporal_tiling"]; | ||
| } | ||
|
|
@@ -2711,10 +2720,12 @@ std::string build_sdcpp_image_metadata_json(const SDContextParams& ctx_params, | |
| } | ||
|
|
||
| if (gen_params.vae_tiling_params.enabled || | ||
| !gen_params.vae_tiling_params.auto_tile || | ||
| gen_params.vae_tiling_params.temporal_tiling || | ||
| !gen_params.extra_tiling_args.empty()) { | ||
| root["vae_tiling"] = { | ||
| {"enabled", gen_params.vae_tiling_params.enabled}, | ||
| {"auto_tile", gen_params.vae_tiling_params.auto_tile}, | ||
| {"temporal_tiling", gen_params.vae_tiling_params.temporal_tiling}, | ||
| {"tile_size_x", gen_params.vae_tiling_params.tile_size_x}, | ||
| {"tile_size_y", gen_params.vae_tiling_params.tile_size_y}, | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -1705,11 +1705,18 @@ struct GGMLRunner { | |
|
|
||
| ggml_context* compute_ctx = nullptr; | ||
| ggml_gallocr* compute_allocr = nullptr; | ||
| // Set when alloc_compute_buffer deferred to tiling on purpose (not a failure). | ||
| bool compute_buffer_deferred_to_tiling = false; | ||
|
|
||
| size_t max_graph_vram_bytes = 0; | ||
| bool stream_layers_enabled = false; | ||
| size_t observed_max_effective_budget_ = 0; | ||
|
|
||
| // When set, alloc_compute_buffer declines a too-large untiled decode so VAE AUTO can tile. | ||
| bool probe_compute_buffer_fits_ = false; | ||
| // Optional user cap (bytes) to force tiling; 0 = no cap. | ||
| size_t probe_max_bytes_ = 0; | ||
|
|
||
| std::shared_ptr<WeightAdapter> weight_adapter = nullptr; | ||
| std::weak_ptr<RunnerWeightManager> weight_manager; | ||
| std::unordered_set<const ggml_tensor*> kept_compute_param_tensor_set; | ||
|
|
@@ -1978,10 +1985,77 @@ struct GGMLRunner { | |
| } | ||
|
|
||
| bool alloc_compute_buffer(ggml_cgraph* gf) { | ||
|
Owner
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I think this mixes allocation, graph memory estimation, and VAE auto-tiling policy in one place.
Could we split this into a query-style helper on […]? This helper can use the temporary gallocr probe to estimate the compute buffer size, and also report the graph params that need to be prepared for this graph. Then […]
That would keep […] |
||
| compute_buffer_deferred_to_tiling = false; | ||
| if (compute_allocr != nullptr) { | ||
| return true; | ||
| } | ||
| compute_allocr = ggml_gallocr_new(ggml_backend_get_default_buffer_type(runtime_backend)); | ||
| ggml_backend_buffer_type_t buft = ggml_backend_get_default_buffer_type(runtime_backend); | ||
|
|
||
| if (probe_compute_buffer_fits_) { | ||
| // Measure the planned untiled compute buffer once (no allocation), then defer to | ||
| // tiling before the real reserve hits a raw backend error. | ||
| ggml_gallocr* probe = ggml_gallocr_new(buft); | ||
| size_t sizes[1] = {0}; | ||
| ggml_gallocr_reserve_n_size(probe, gf, nullptr, nullptr, sizes); | ||
| ggml_gallocr_free(probe); | ||
| size_t planned = sizes[0]; | ||
|
|
||
| // User cap (extra_tiling_args max_buffer_size), any backend. | ||
| if (probe_max_bytes_ > 0 && planned > probe_max_bytes_) { | ||
| LOG_DEBUG("%s: untiled compute buffer %.2f MB exceeds requested max_buffer_size %.2f MB; deferring to tiling", | ||
| get_desc().c_str(), | ||
| planned / 1024.0 / 1024.0, | ||
| probe_max_bytes_ / 1024.0 / 1024.0); | ||
| compute_buffer_deferred_to_tiling = true; | ||
| return false; | ||
| } | ||
|
|
||
| // Free VRAM, any non-CPU backend: a decode can fit every op's per-buffer cap yet | ||
| // still exceed total free VRAM. Margin covers the scratch pool the reserve omits. | ||
| ggml_backend_dev_t dev = ggml_backend_get_device(runtime_backend); | ||
| if (dev != nullptr && ggml_backend_dev_type(dev) != GGML_BACKEND_DEVICE_TYPE_CPU) { | ||
| size_t free_vram = 0, total_vram = 0; | ||
| ggml_backend_dev_memory(dev, &free_vram, &total_vram); | ||
| size_t margin = planned / 3; | ||
| if (margin < 512ull * 1024 * 1024) { | ||
| margin = 512ull * 1024 * 1024; | ||
| } | ||
| if (free_vram > 0 && free_vram < planned + margin) { | ||
| LOG_DEBUG("%s: untiled compute buffer %.2f MB won't fit free VRAM; deferring to tiling", | ||
| get_desc().c_str(), | ||
| planned / 1024.0 / 1024.0); | ||
| compute_buffer_deferred_to_tiling = true; | ||
| return false; | ||
| } | ||
| } | ||
|
|
||
| // Per-buffer cap: Vulkan via supports_op (the real limit; buft_get_max_size only | ||
| // reports the suballocation block there), other backends via buft_get_max_size. | ||
| if (sd_backend_is(runtime_backend, "Vulkan")) { | ||
| for (int i = 0; i < ggml_graph_n_nodes(gf); ++i) { | ||
| ggml_tensor* op = ggml_graph_node(gf, i); | ||
| if (!ggml_backend_supports_op(runtime_backend, op)) { | ||
| LOG_DEBUG("%s: untiled compute op %.2f MB exceeds backend support; deferring to tiling", | ||
| get_desc().c_str(), | ||
| ggml_nbytes(op) / 1024.0 / 1024.0); | ||
| compute_buffer_deferred_to_tiling = true; | ||
| return false; | ||
| } | ||
| } | ||
| } else { | ||
| size_t max_size = ggml_backend_buft_get_max_size(buft); | ||
| if (max_size > 0 && planned > max_size) { | ||
| LOG_DEBUG("%s: untiled compute buffer %.2f MB exceeds backend max single buffer %.2f MB; deferring to tiling", | ||
| get_desc().c_str(), | ||
| planned / 1024.0 / 1024.0, | ||
| max_size / 1024.0 / 1024.0); | ||
| compute_buffer_deferred_to_tiling = true; | ||
| return false; | ||
| } | ||
| } | ||
| } | ||
|
|
||
| compute_allocr = ggml_gallocr_new(buft); | ||
|
|
||
| if (!ggml_gallocr_reserve(compute_allocr, gf)) { | ||
| // failed to allocate the compute buffer | ||
|
|
@@ -2432,7 +2506,9 @@ struct GGMLRunner { | |
| GraphWeightDoneGuard graph_weight_done_guard(this, &params_to_prepare); | ||
|
|
||
| if (!alloc_compute_buffer(gf)) { | ||
| LOG_ERROR("%s alloc compute buffer failed", get_desc().c_str()); | ||
| if (!compute_buffer_deferred_to_tiling) { | ||
| LOG_ERROR("%s alloc compute buffer failed", get_desc().c_str()); | ||
| } | ||
| return std::nullopt; | ||
| } | ||
| struct ComputeBufferGuard { | ||
|
|
@@ -2822,6 +2898,11 @@ struct GGMLRunner { | |
| void set_stream_layers_enabled(bool enabled) { | ||
| stream_layers_enabled = enabled; | ||
| } | ||
|
|
||
| void set_probe_compute_buffer_fits(bool enabled, size_t max_bytes = 0) { | ||
| probe_compute_buffer_fits_ = enabled; | ||
| probe_max_bytes_ = enabled ? max_bytes : 0; | ||
| } | ||
| }; | ||
|
|
||
| class GGMLBlock { | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I don't think `extra_tiling_args=max_buffer_size` is necessary here. We already have `--max-vram` as the user-facing VRAM budget knob, including per-backend/device assignment. Adding a second VAE-specific memory cap in bytes creates overlapping semantics and makes it unclear which memory budget should be authoritative.

For auto VAE tiling, I think the preflight should use the existing runner/backend VRAM budget instead:
- if `--max-vram` is set for this backend, compare the estimated VAE graph memory against that budget;
- […] `supports_op` or `ggml_backend_buft_get_max_size`.

That would avoid introducing another hidden memory-limit parameter under `extra_tiling_args`.