Spaces: Running on Zero

Update app.py

app.py CHANGED
@@ -1,19 +1,14 @@
-# app.py
 import os
 import re
 import gc
-import uuid
-import time
-import math
 import traceback
-import random
-from typing import Iterable, Optional
-
 import gradio as gr
 import numpy as np
 import spaces
 import torch
 from PIL import Image, ImageDraw
 
 from transformers import (
     AutoProcessor,
@@ -29,133 +24,6 @@ from safetensors.torch import load_file as safetensors_load_file
 from gradio.themes import Soft
 from gradio.themes.utils import colors, fonts, sizes
 
-
-# ============================================================
-# Process-unique temp dir (helps avoid /tmp collisions)
-# ============================================================
-
-def _ensure_unique_gradio_tmp():
-    """
-    ZeroGPU/Spaces can serve multiple users across recycled containers.
-    Gradio may use /tmp/gradio by default. We force a unique directory per process.
-    """
-    if os.environ.get("GRADIO_TEMP_DIR"):
-        print(f"GRADIO_TEMP_DIR = {os.environ['GRADIO_TEMP_DIR']}")
-        return
-    pid = os.getpid()
-    tmp = f"/tmp/gradio_{pid}_{uuid.uuid4().hex[:8]}"
-    os.environ["GRADIO_TEMP_DIR"] = tmp
-    try:
-        os.makedirs(tmp, exist_ok=True)
-    except Exception:
-        pass
-    print(f"GRADIO_TEMP_DIR = {tmp}")
-
-_ensure_unique_gradio_tmp()
-
-
-# ============================================================
-# Patch: Qwen2.5-VL RoPE (avoid cublas batched GEMM; preserve shapes)
-# ============================================================
-
-def patch_qwen25vl_rope_no_gemm():
-    """
-    Patch Qwen2.5-VL rotary embedding to avoid the matmul that can trigger:
-        CUBLAS_STATUS_INVALID_VALUE (cublasSgemmStridedBatched)
-    on some ZeroGPU/H200 MIG configurations.
-
-    CRITICAL: Preserve exact output shapes used by apply_multimodal_rotary_pos_emb,
-    otherwise you get split_with_sizes mismatches.
-    """
-    if os.environ.get("DISABLE_ROPE_PATCH", "").strip() == "1":
-        print("[patch][rope] DISABLE_ROPE_PATCH=1 -> skipping patch.")
-        return
-
-    try:
-        from transformers.models.qwen2_5_vl import modeling_qwen2_5_vl as qvl
-    except Exception as e:
-        print(f"[patch][rope] could not import qwen2_5_vl modeling: {e}")
-        return
-
-    Rotary = None
-    for name in ["Qwen2_5_VLRotaryEmbedding", "Qwen2_5RotaryEmbedding", "RotaryEmbedding"]:
-        Rotary = getattr(qvl, name, None)
-        if Rotary is not None:
-            break
-
-    if Rotary is None:
-        print("[patch][rope] rotary embedding class not found; no patch applied.")
-        return
-
-    orig_forward = Rotary.forward
-
-    def forward_no_gemm(self, x, position_ids):
-        # Fallback to original if structure differs
-        if not hasattr(self, "inv_freq") or position_ids is None:
-            return orig_forward(self, x, position_ids)
-
-        # Determine rotary dim from module config (NOT x.shape[-1])
-        if hasattr(self, "dim") and isinstance(self.dim, int):
-            rope_dim = int(self.dim)
-        else:
-            rope_dim = int(self.inv_freq.numel() * 2)
-
-        # Normalize position_ids to (bs, seq)
-        if position_ids.ndim > 2:
-            pos = position_ids.reshape(position_ids.shape[0], -1)
-        else:
-            pos = position_ids
-
-        # Compute on the same device as inv_freq/x
-        dev = self.inv_freq.device
-        pos = pos.to(device=dev)
-
-        # Broadcast multiply instead of matmul:
-        # inv: (1,1,dim/2,1), pos: (bs,1,1,seq) -> freqs: (bs,1,dim/2,seq)
-        inv = self.inv_freq[None, None, :, None].float()
-        posf = pos[:, None, None, :].float()
-        freqs = (inv * posf).transpose(2, 3)  # (bs,1,seq,dim/2)
-
-        # Double to full rotary dim
-        emb = torch.cat((freqs, freqs), dim=-1)  # (bs,1,seq,dim)
-
-        # Enforce exact expected rotary dim
-        if emb.shape[-1] != rope_dim:
-            emb = emb[..., :rope_dim]
-
-        cos = emb.cos()
-        sin = emb.sin()
-
-        # Respect attention scaling if present
-        attn_scale = getattr(self, "attention_scaling", None)
-        if attn_scale is not None:
-            cos = cos * attn_scale
-            sin = sin * attn_scale
-
-        # Match dtype expectations (upstream typically returns same dtype as x)
-        cos = cos.to(dtype=x.dtype)
-        sin = sin.to(dtype=x.dtype)
-
-        # Optional debug (enable by env)
-        if os.environ.get("DEBUG_ROPE", "").strip() == "1":
-            ms = getattr(self, "mrope_section", None)
-            if ms is not None:
-                try:
-                    ms_list = list(ms)
-                    print(f"[DEBUG][rope] rope_dim={rope_dim} cos_last={cos.shape[-1]} mrope_sum={sum(ms_list)} mrope={ms_list}")
-                except Exception:
-                    print(f"[DEBUG][rope] rope_dim={rope_dim} cos_last={cos.shape[-1]} (mrope_section unreadable)")
-            else:
-                print(f"[DEBUG][rope] rope_dim={rope_dim} cos_last={cos.shape[-1]} (no mrope_section attr)")
-
-        return cos, sin
-
-    Rotary.forward = forward_no_gemm
-    print("[patch] Patched Qwen2.5-VL RoPE matmul -> broadcast multiply (shape-preserving).")
-
-patch_qwen25vl_rope_no_gemm()
-
-
 # ============================================================
 # Theme
 # ============================================================
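The hunk above drops the custom RoPE patch. For reference, here is a minimal standalone check (not part of app.py; tensor sizes are illustrative) showing why the broadcast-multiply form used by the removed patch is value- and shape-equivalent to the batched outer-product matmul it replaced:

```python
import torch

inv_freq = torch.rand(64)                                  # hypothetical (head_dim/2,) inverse frequencies
pos = torch.arange(128, dtype=torch.float32)[None, :]      # hypothetical (bs=1, seq) position ids

# Outer product via batched matmul (the op the patch avoided on ZeroGPU)
freqs_matmul = (inv_freq[None, :, None] @ pos[:, None, :]).transpose(1, 2)              # (1, seq, dim/2)

# Broadcast multiply, as in the removed forward_no_gemm
freqs_bcast = (inv_freq[None, None, :, None] * pos[:, None, None, :]).transpose(2, 3)   # (1, 1, seq, dim/2)

torch.testing.assert_close(freqs_matmul, freqs_bcast.squeeze(1))
print("broadcast multiply matches matmul:", tuple(freqs_matmul.shape))
```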
@@ -175,6 +43,7 @@ colors.orange_red = colors.Color(
     c950="#802200",
 )
 
 class OrangeRedTheme(Soft):
     def __init__(
         self,
@@ -230,11 +99,11 @@ class OrangeRedTheme(Soft):
             block_label_background_fill="*primary_200",
         )
 
-orange_red_theme = OrangeRedTheme()
 
 
 # ============================================================
-# Device
 # ============================================================
 
 device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
@@ -249,14 +118,6 @@ if torch.cuda.is_available():
     print("device name:", torch.cuda.get_device_name(torch.cuda.current_device()))
 print("Using device:", device)
 
-# Optional: make matmul a bit more stable (doesn't change correctness)
-try:
-    torch.backends.cuda.matmul.allow_tf32 = True
-    torch.backends.cudnn.allow_tf32 = True
-except Exception:
-    pass
-
-
 # ============================================================
 # AIO version (Space variable)
 # ============================================================
@@ -267,6 +128,7 @@ DEFAULT_AIO_VERSION = "v19"
 _VER_RE = re.compile(r"^v\d+$")
 _DIGITS_RE = re.compile(r"^\d+$")
 
 def _normalize_version(raw: str) -> Optional[str]:
     if raw is None:
         return None
@@ -275,10 +137,12 @@ def _normalize_version(raw: str) -> Optional[str]:
         return None
     if _VER_RE.fullmatch(s):
         return s
     if _DIGITS_RE.fullmatch(s):
         return f"v{s}"
     return None
 
 _AIO_ENV_RAW = os.environ.get("AIO_VERSION", "")
 _AIO_ENV_NORM = _normalize_version(_AIO_ENV_RAW)
 
@@ -289,7 +153,6 @@ print(f"AIO_VERSION (env raw) = {_AIO_ENV_RAW!r}")
 print(f"AIO_VERSION (normalized) = {_AIO_ENV_NORM!r}")
 print(f"Using AIO_VERSION = {AIO_VERSION} ({AIO_VERSION_SOURCE})")
 
-
 # ============================================================
 # Pipeline
 # ============================================================
@@ -301,6 +164,7 @@ from qwenimage.qwen_fa3_processor import QwenDoubleStreamAttnProcessorFA3
 
 dtype = torch.bfloat16
 
 def _load_pipe_with_version(version: str) -> QwenImageEditPlusPipeline:
     sub = f"{version}/transformer"
     print(f"📦 Loading AIO transformer: {AIO_REPO_ID} / {sub}")
@@ -316,9 +180,11 @@ def _load_pipe_with_version(version: str) -> QwenImageEditPlusPipeline:
     ).to(device)
     return p
 
 try:
     pipe = _load_pipe_with_version(AIO_VERSION)
-except Exception:
     print("❌ Failed to load requested AIO_VERSION. Falling back to v19.")
     print("---- exception ----")
     print(traceback.format_exc())
@@ -336,39 +202,47 @@ except Exception as e:
 
 MAX_SEED = np.iinfo(np.int32).max
 
-
 # ============================================================
-# Derived conditioning (Transformers): Pose + Depth
 # ============================================================
 
 POSE_MODEL_ID = "usyd-community/vitpose-base-simple"
 POSE_DETECTOR_ID = "PekingU/rtdetr_r50vd_coco_o365"
 DEPTH_MODEL_ID = "depth-anything/Depth-Anything-V2-Small-hf"
 
 _POSE_CACHE = {}
 _DEPTH_CACHE = {}
 
 COCO17_EDGES = [
-    (0, 1), (0, 2), (1, 3), (2, 4),
-    (5, 6),
-    (5, 7), (7, 9),
-    (6, 8), (8, 10),
-    (5, 11), (6, 12), (11, 12),
-    (11, 13), (13, 15),
-    (12, 14), (14, 16),
 ]
 
 def _derived_device(use_gpu: bool) -> torch.device:
     return torch.device("cuda" if (use_gpu and torch.cuda.is_available()) else "cpu")
 
 def _load_pose_models(dev: torch.device):
     key = str(dev)
     if key in _POSE_CACHE:
         return _POSE_CACHE[key]
 
     det_proc = AutoProcessor.from_pretrained(POSE_DETECTOR_ID)
     det_model = RTDetrForObjectDetection.from_pretrained(POSE_DETECTOR_ID).to(dev)
 
     pose_proc = AutoProcessor.from_pretrained(POSE_MODEL_ID)
     pose_model = VitPoseForPoseEstimation.from_pretrained(POSE_MODEL_ID).to(dev)
 
@@ -378,6 +252,7 @@ def _load_pose_models(dev: torch.device):
     _POSE_CACHE[key] = (det_proc, det_model, pose_proc, pose_model)
     return _POSE_CACHE[key]
 
 def _load_depth_models(dev: torch.device):
     key = str(dev)
     if key in _DEPTH_CACHE:
@@ -390,6 +265,7 @@ def _load_depth_models(dev: torch.device):
     _DEPTH_CACHE[key] = (proc, model)
     return _DEPTH_CACHE[key]
 
 def _draw_skeleton_on_blank(
     size: tuple[int, int],
     persons_keypoints: list[np.ndarray],
@@ -403,6 +279,7 @@ def _draw_skeleton_on_blank(
     draw = ImageDraw.Draw(canvas)
 
     for kps, sc in zip(persons_keypoints, persons_scores):
         for a, b in COCO17_EDGES:
             if a >= len(sc) or b >= len(sc):
                 continue
@@ -412,13 +289,20 @@
             xb, yb = float(kps[b, 0]), float(kps[b, 1])
             draw.line([(xa, ya), (xb, yb)], fill=(255, 255, 255), width=line_w)
 
         for i in range(min(len(sc), len(kps))):
             if sc[i] < kp_thresh:
                 continue
             x, y = float(kps[i, 0]), float(kps[i, 1])
-            draw.ellipse(
     return canvas
 
 def make_pose_map(
     img: Image.Image,
     *,
@@ -427,6 +311,12 @@ def make_pose_map(
     det_thresh: float = 0.30,
     max_people: int = 4,
 ) -> Image.Image:
     img = img.convert("RGB")
     dev = _derived_device(use_gpu)
     det_proc, det_model, pose_proc, pose_model = _load_pose_models(dev)
@@ -434,8 +324,10 @@
     w, h = img.size
 
     if mode == "fast":
         boxes = np.array([[0.0, 0.0, float(w), float(h)]], dtype=np.float32)
     else:
         inputs = det_proc(images=img, return_tensors="pt").to(dev)
         with torch.no_grad():
             outputs = det_model(**inputs)
@@ -446,11 +338,14 @@
             threshold=det_thresh,
         )[0]
 
         person_boxes = results["boxes"][results["labels"] == 0].detach().cpu().numpy()
 
         if person_boxes.size == 0:
             boxes = np.array([[0.0, 0.0, float(w), float(h)]], dtype=np.float32)
         else:
             person_boxes[:, 2] = person_boxes[:, 2] - person_boxes[:, 0]
             person_boxes[:, 3] = person_boxes[:, 3] - person_boxes[:, 1]
             boxes = person_boxes.astype(np.float32)
@@ -464,17 +359,23 @@
 
     pose_results = pose_proc.post_process_pose_estimation(pose_outputs, boxes=[boxes])[0]
 
-    persons_kps
     for pr in pose_results:
-
-
 
     if not persons_kps:
         return Image.new("RGB", img.size, (0, 0, 0))
 
     return _draw_skeleton_on_blank(img.size, persons_kps, persons_sc)
 
 def make_depth_map(img: Image.Image, *, use_gpu: bool) -> Image.Image:
     img = img.convert("RGB")
     dev = _derived_device(use_gpu)
     proc, model = _load_depth_models(dev)
@@ -485,7 +386,10 @@ def make_depth_map(img: Image.Image, *, use_gpu: bool) -> Image.Image:
     with torch.no_grad():
         out = model(**inputs)
 
     pred = out.predicted_depth
     pred = torch.nn.functional.interpolate(
         pred.unsqueeze(1),
         size=(img.height, img.width),
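The next hunk keeps the depth post-processing, which min-max normalizes the predicted depth into an 8-bit grayscale map. A standalone illustration of that normalization step (depth_to_image is a hypothetical helper, not part of app.py):

```python
import numpy as np
from PIL import Image

def depth_to_image(pred: np.ndarray) -> Image.Image:
    # Hypothetical stand-in for the normalization done in make_depth_map:
    # scale to [0, 1], convert to 8-bit grayscale, then promote to RGB.
    arr = pred.astype(np.float32)
    denom = max(float(arr.max() - arr.min()), 1e-6)
    arr = (arr - arr.min()) / denom
    depth8 = (arr * 255.0).clip(0, 255).astype(np.uint8)
    return Image.fromarray(depth8, mode="L").convert("RGB")

print(depth_to_image(np.random.rand(64, 64)).size)  # (64, 64)
```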
@@ -499,55 +403,9 @@
     arr = arr / denom
 
     depth8 = (arr * 255.0).clip(0, 255).astype(np.uint8)
-
-
-
-# ============================================================
-# Helpers: gallery normalization + debug-friendly PIL conversion
-# ============================================================
-
-def _to_pil_rgb(x) -> Optional[Image.Image]:
-    """
-    Accepts PIL / numpy / (image, caption) tuples / gradio dicts and returns PIL RGB.
-    Also safely ignores broken temp paths.
-    """
-    if x is None:
-        return None
-
-    # Gallery often returns (image, caption)
-    if isinstance(x, tuple) and len(x) >= 1:
-        x = x[0]
-        if x is None:
-            return None
-
-    # Some gradio versions can return dict with a temp file path
-    if isinstance(x, dict):
-        # common keys: 'name' or 'path'
-        p = x.get("name") or x.get("path")
-        if isinstance(p, str):
-            if not os.path.exists(p):
-                print(f"[WARN] extra image path missing, skipping: {p}")
-                return None
-            try:
-                return Image.open(p).convert("RGB")
-            except Exception as e:
-                print(f"[WARN] failed to open extra image path {p}: {e}")
-                return None
-
-    if isinstance(x, Image.Image):
-        return x.convert("RGB")
-
-    if isinstance(x, np.ndarray):
-        try:
-            return Image.fromarray(x).convert("RGB")
-        except Exception:
-            return None
-
-    # last resort
-    try:
-        return Image.fromarray(np.array(x)).convert("RGB")
-    except Exception:
-        return None
 
 def _append_to_gallery(existing, new_img: Image.Image):
     items = []
@@ -559,29 +417,8 @@ def _append_to_gallery(existing, new_img: Image.Image):
     items.append(new_img)
     return items
 
-def build_labeled_images(
-    img1: Image.Image,
-    img2: Optional[Image.Image],
-    extra_imgs: Optional[list[Image.Image]],
-) -> dict[str, Image.Image]:
-    labeled: dict[str, Image.Image] = {}
-    idx = 1
-    labeled[f"image_{idx}"] = img1
-    idx += 1
-    if img2 is not None:
-        labeled[f"image_{idx}"] = img2
-        idx += 1
-    if extra_imgs:
-        for im in extra_imgs:
-            if im is None:
-                continue
-            labeled[f"image_{idx}"] = im
-            idx += 1
-    return labeled
-
-
 # ============================================================
-# LoRA adapters + presets
 # ============================================================
 
 NONE_LORA = "None"
@@ -649,7 +486,7 @@ ADAPTER_SPECS = {
         "weights": "bfs_head_v5_2511_original.safetensors",
         "adapter_name": "BFS-Best-Faceswap",
         "strength": 1.0,
-        "needs_alpha_fix": True,
     },
     "BFS-Best-FaceSwap-merge": {
         "type": "single",
@@ -659,7 +496,7 @@ ADAPTER_SPECS = {
         "weights": "bfs_head_v5_2511_merged_version_rank_32_fp32.safetensors",
         "adapter_name": "BFS-Best-Faceswap-merge",
         "strength": 1.1,
-        "needs_alpha_fix": True,
     },
     "F2P": {
         "type": "single",
@@ -745,13 +582,16 @@ LORA_PRESET_PROMPTS = {
     "BFS-Best-FaceSwap-merge": "head_swap: start with Picture 1 as the base image, keeping its lighting, environment, and background. remove the head from Picture 1 completely and replace it with the head from Picture 2, strictly preserving the hair, eye color, and nose structure of Picture 2. copy the eye direction, head rotation, and micro-expressions from Picture 1. high quality, sharp details, 4k",
 }
 
 LOADED_ADAPTERS = set()
 
-
 # ============================================================
-# Helpers: resolution
 # ============================================================
 
 def _round_to_multiple(x: int, m: int) -> int:
     return max(m, (int(x) // m) * m)
 
@@ -760,9 +600,16 @@ def compute_canvas_dimensions_from_area(
     target_area: int,
     multiple_of: int,
 ) -> tuple[int, int]:
     w, h = image.size
     aspect = w / h if h else 1.0
     from qwenimage.pipeline_qwenimage_edit_plus import calculate_dimensions
     width, height = calculate_dimensions(int(target_area), float(aspect))
     width = _round_to_multiple(int(width), int(multiple_of))
     height = _round_to_multiple(int(height), int(multiple_of))
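For readers comparing the area-based sizing in compute_canvas_dimensions_from_area: a rough standalone sketch of the same idea (area_to_canvas and its internals are hypothetical, not the pipeline's calculate_dimensions):

```python
import math

def area_to_canvas(aspect: float, target_area: int, multiple_of: int = 16) -> tuple[int, int]:
    # Pick w*h ≈ target_area with w/h ≈ aspect, then floor both to a multiple.
    height = math.sqrt(target_area / aspect)
    width = height * aspect

    def snap(v: float) -> int:
        return max(multiple_of, (int(v) // multiple_of) * multiple_of)

    return snap(width), snap(height)

print(area_to_canvas(16 / 9, int(1.0 * 1024 * 1024)))  # (1360, 768)
```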
@@ -773,18 +620,29 @@ def get_target_area_for_lora(
     lora_adapter: str,
     user_target_megapixels: float,
 ) -> int:
     spec = ADAPTER_SPECS.get(lora_adapter, {})
     if "target_area" in spec:
         try:
             return int(spec["target_area"])
         except Exception:
             pass
     if "target_megapixels" in spec:
         try:
             mp = float(spec["target_megapixels"])
             return int(mp * 1024 * 1024)
         except Exception:
             pass
     if "target_long_edge" in spec:
         try:
             long_edge = int(spec["target_long_edge"])
@@ -798,21 +656,99 @@
             return int(new_w * new_h)
         except Exception:
             pass
-    return int(float(user_target_megapixels) * 1024 * 1024)
 
 
 # ============================================================
-# Helpers:
 # ============================================================
 
 def lora_requires_two_images(lora_adapter: str) -> bool:
     return bool(ADAPTER_SPECS.get(lora_adapter, {}).get("requires_two_images", False))
 
 def image2_label_for_lora(lora_adapter: str) -> str:
     return str(ADAPTER_SPECS.get(lora_adapter, {}).get("image2_label", "Upload Reference (Image 2)"))
 
 def _inject_missing_alpha_keys(state_dict: dict) -> dict:
     bases = {}
     for k, v in state_dict.items():
         if not isinstance(v, torch.Tensor):
             continue
@@ -833,9 +769,27 @@ def _inject_missing_alpha_keys(state_dict: dict) -> dict:
         stripped_alpha = f"{stripped_base}.alpha"
         if stripped_alpha not in state_dict:
             state_dict[stripped_alpha] = alpha_tensor
     return state_dict
 
 def _filter_to_diffusers_lora_keys(state_dict: dict) -> tuple[dict, dict]:
     keep_suffixes = (
         ".lora_up.weight",
         ".lora_down.weight",
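As context for the alpha-key handling above: a toy sketch of what injecting neutral alpha keys amounts to (the state-dict keys and shapes here are made up; the real helper also mirrors keys with the diffusion_model. prefix stripped):

```python
import torch

# Hypothetical ComfyUI-style LoRA state dict that ships without alpha keys
sd = {
    "diffusion_model.img_in.lora_down.weight": torch.zeros(32, 64),  # rank 32
    "diffusion_model.img_in.lora_up.weight": torch.zeros(64, 32),
}

# alpha == rank gives a neutral LoRA scale (alpha / rank == 1.0)
for key, tensor in list(sd.items()):
    if key.endswith(".lora_down.weight"):
        base = key[: -len(".lora_down.weight")]
        sd.setdefault(f"{base}.alpha", torch.tensor(float(tensor.shape[0])))

print(sorted(sd))  # 'diffusion_model.img_in.alpha' is now present
```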
@@ -843,6 +797,7 @@ def _filter_to_diffusers_lora_keys(state_dict: dict) -> tuple[dict, dict]:
         ".alpha",
         ".lora_alpha",
     )
     dropped_patch = 0
     dropped_other = 0
     kept = 0
@@ -851,21 +806,28 @@
     out: dict[str, torch.Tensor] = {}
     for k, v in state_dict.items():
         if not isinstance(v, torch.Tensor):
             dropped_other += 1
             continue
         if k.endswith(".diff") or k.endswith(".diff_b"):
             dropped_patch += 1
             continue
         if not k.endswith(keep_suffixes):
             dropped_other += 1
             continue
         if k.endswith(".lora_alpha"):
             base = k[: -len(".lora_alpha")]
             k2 = f"{base}.alpha"
             out[k2] = v.float() if v.dtype != torch.float32 else v
             normalized_alpha += 1
             kept += 1
             continue
         out[k] = v
         kept += 1
 
@@ -877,7 +839,15 @@
     }
     return out, stats
 
 def _duplicate_stripped_prefix_keys(state_dict: dict, prefix: str = "diffusion_model.") -> dict:
     out = dict(state_dict)
     for k, v in list(state_dict.items()):
         if not k.startswith(prefix):
@@ -887,11 +857,18 @@ def _duplicate_stripped_prefix_keys(state_dict: dict, prefix: str = "diffusion_m
         out[stripped] = v
     return out
 
 def _load_lora_weights_with_fallback(repo: str, weight_name: str, adapter_name: str, needs_alpha_fix: bool = False):
     try:
         pipe.load_lora_weights(repo, weight_name=weight_name, adapter_name=adapter_name)
         return
     except (KeyError, ValueError) as e:
         if not needs_alpha_fix:
             raise
 
@@ -903,8 +880,13 @@ def _load_lora_weights_with_fallback(repo: str, weight_name: str, adapter_name:
     local_path = hf_hub_download(repo_id=repo, filename=weight_name)
     sd = safetensors_load_file(local_path)
 
     sd = _inject_missing_alpha_keys(sd)
     sd, stats = _filter_to_diffusers_lora_keys(sd)
     sd = _duplicate_stripped_prefix_keys(sd)
 
     print(
@@ -912,9 +894,16 @@ def _load_lora_weights_with_fallback(repo: str, weight_name: str, adapter_name:
         f"kept={stats['kept']} dropped_patch={stats['dropped_patch']} "
         f"dropped_other={stats['dropped_other']} normalized_alpha={stats['normalized_alpha']}"
     )
     pipe.load_lora_weights(sd, adapter_name=adapter_name)
     return
 
 def _ensure_loaded_and_get_active_adapters(selected_lora: str):
     spec = ADAPTER_SPECS.get(selected_lora)
     if not spec:
@@ -952,6 +941,7 @@ def _ensure_loaded_and_get_active_adapters(selected_lora: str):
 
             adapter_names.append(adapter_name)
             adapter_weights.append(strength)
     else:
         repo = spec["repo"]
         weights = spec["weights"]
@@ -984,7 +974,9 @@ def _ensure_loaded_and_get_active_adapters(selected_lora: str):
 # UI handlers
 # ============================================================
 
 def on_lora_change_ui(selected_lora, current_prompt, current_extras_condition_only):
     if selected_lora != NONE_LORA:
         preset = LORA_PRESET_PROMPTS.get(selected_lora, "")
         if preset and (current_prompt is None or str(current_prompt).strip() == ""):
@@ -994,19 +986,20 @@ def on_lora_change_ui(selected_lora, current_prompt, current_extras_condition_on
     else:
         prompt_update = gr.update(value=current_prompt)
 
     if lora_requires_two_images(selected_lora):
         img2_update = gr.update(visible=True, label=image2_label_for_lora(selected_lora))
     else:
         img2_update = gr.update(visible=False, value=None, label="Upload Reference (Image 2)")
 
     if selected_lora in ("BFS-Best-FaceSwap", "BFS-Best-FaceSwap-merge", "AnyPose"):
         extras_update = gr.update(value=True)
     else:
         extras_update = gr.update(value=current_extras_condition_only)
 
     return prompt_update, img2_update, extras_update
-
-
 # ============================================================
 # UI helpers: output routing + derived conditioning
 # ============================================================
@@ -1016,16 +1009,19 @@ def set_output_as_image1(last):
         raise gr.Error("No output available yet.")
     return gr.update(value=last)
 
 def set_output_as_image2(last):
     if last is None:
         raise gr.Error("No output available yet.")
     return gr.update(value=last)
 
 def set_output_as_extra(last, existing_extra):
     if last is None:
         raise gr.Error("No output available yet.")
     return _append_to_gallery(existing_extra, last)
 
 @spaces.GPU
 def add_derived_ref(img1, existing_extra, derived_type, derived_use_gpu, derived_max_people):
     if img1 is None:
@@ -1054,36 +1050,17 @@ def add_derived_ref(img1, existing_extra, derived_type, derived_use_gpu, derived
     return gr.update(value=new_gallery), gr.update(visible=True, value=derived)
 
 
-# ============================================================
-# Debug helpers (CUDA mem + token count)
-# ============================================================
-
-def _cuda_mem(prefix: str):
-    if not torch.cuda.is_available():
-        return
-    try:
-        free, total = torch.cuda.mem_get_info()
-        print(f"[DEBUG][cuda][{prefix}] mem free={free/1e9:.2f}GB total={total/1e9:.2f}GB")
-    except Exception:
-        pass
-
-def _approx_token_count(text: str) -> int:
-    # Lightweight: we avoid forcing tokenizer calls here; this is only for debug.
-    # Rule-of-thumb: ~4 chars per token in English-ish text.
-    if not text:
-        return 0
-    return max(1, int(math.ceil(len(text) / 4.0)))
-
 
 # ============================================================
 # Inference
 # ============================================================
 
 @spaces.GPU
 def infer(
     input_image_1,
     input_image_2,
-    input_images_extra,
     prompt,
     lora_adapter,
     seed,
@@ -1099,8 +1076,6 @@ def infer(
     if torch.cuda.is_available():
         torch.cuda.empty_cache()
 
-    print("[DEBUG][infer] input types:", type(input_image_1), type(input_image_2), type(input_images_extra))
-
     if input_image_1 is None:
         raise gr.Error("Please upload Image 1.")
 
@@ -1119,23 +1094,15 @@
         seed = random.randint(0, MAX_SEED)
 
     generator = torch.Generator(device=device).manual_seed(seed)
-
     negative_prompt = (
         "worst quality, low quality, bad anatomy, bad hands, text, error, missing fingers, "
         "extra digit, fewer digits, cropped, jpeg artifacts, signature, watermark, username, blurry"
     )
 
-    img1 = input_image_1.convert("RGB")
-    if
-        raise gr.Error("Image 1 could not be read (unexpected input type/path).")
-
-    img2 = None
-    if input_image_2 is not None:
-        img2 = input_image_2.convert("RGB") if isinstance(input_image_2, Image.Image) else _to_pil_rgb(input_image_2)
-        if img2 is None:
-            raise gr.Error("Image 2 could not be read (unexpected input type/path).")
 
-    # Normalize extra images (Gallery)
     extra_imgs: list[Image.Image] = []
     if input_images_extra:
         for item in input_images_extra:
@@ -1143,16 +1110,20 @@
             if pil is not None:
                 extra_imgs.append(pil)
 
-    # Enforce 2-image LoRA behavior
     if lora_requires_two_images(lora_adapter) and img2 is None:
         raise gr.Error("This LoRA needs two images. Please upload Image 2 as well.")
 
     labeled = build_labeled_images(img1, img2, extra_imgs)
 
     pipe_images = list(labeled.values())
     if len(pipe_images) == 1:
         pipe_images = pipe_images[0]
 
     target_area = get_target_area_for_lora(img1, lora_adapter, float(target_megapixels))
     width, height = compute_canvas_dimensions_from_area(
         img1,
@@ -1160,48 +1131,33 @@
         multiple_of=int(pipe.vae_scale_factor * 2),
     )
 
     vae_image_indices = None
     if extras_condition_only:
         if isinstance(pipe_images, list) and len(pipe_images) > 2:
             vae_image_indices = [0, 1] if len(pipe_images) >= 2 else [0]
 
-    # Debug summary
-    n_images = len(pipe_images) if isinstance(pipe_images, list) else 1
-    tok_est = _approx_token_count(prompt or "")
-    print(
-        "[DEBUG][infer] submitting request | "
-        f"lora_adapter={lora_adapter!r} seed={seed} prompt_len={len(prompt or '')} "
-        f"steps={steps} true_cfg_scale={guidance_scale} target_mp={target_megapixels} "
-        f"canvas=({width}x{height}) n_images={n_images} vae_image_indices={vae_image_indices} "
-        f"pad_to_canvas={bool(pad_to_canvas)}"
-    )
-    print(f"[DEBUG][infer] image_1 size: {img1.size} image_2 size: {img2.size if img2 else None}")
-    print(f"[DEBUG][infer] prompt token_estimate: {tok_est}")
-    _cuda_mem("before")
-
     try:
         result = pipe(
             image=pipe_images,
             prompt=prompt,
            negative_prompt=negative_prompt,
            height=height,
            width=width,
-            num_inference_steps=
            generator=generator,
-            true_cfg_scale=
            vae_image_indices=vae_image_indices,
            pad_to_canvas=bool(pad_to_canvas),
        ).images[0]
        return result, seed, result
-
-    except Exception as e:
-        print("---- [ERROR][infer] exception ----")
-        print(traceback.format_exc())
-        print("---------------------------------")
-        raise
-
     finally:
-        _cuda_mem("after")
        gc.collect()
        if torch.cuda.is_available():
            torch.cuda.empty_cache()
@@ -1214,13 +1170,8 @@ def infer_example(input_image, prompt, lora_adapter):
     input_pil = input_image.convert("RGB")
     guidance_scale = 1.0
     steps = 4
-
-
-        prompt, lora_adapter,
-        0, True,
-        guidance_scale, steps, 1.0,
-        True, True
-    )
     return result, seed, last
 
 
@@ -1247,7 +1198,7 @@ with gr.Blocks() as demo:
     gr.Markdown(
         "Perform diverse image edits using specialized "
         "[LoRA](https://huggingface.co/models?other=base_model:adapter:Qwen/Qwen-Image-Edit-2511) adapters for the "
-        "[Qwen-Image-Edit](https://huggingface.co/Qwen/Qwen-Image-Edit-2511) model. Uses a Diffusers compatible extraction of the transformers from Phr00t's Rapid AIO merge."
     )
     gr.Markdown(aio_status_line)
 
@@ -1341,6 +1292,7 @@
         value=True,
     )
 
     lora_adapter.change(
         fn=on_lora_change_ui,
         inputs=[lora_adapter, prompt, extras_condition_only],
@@ -1400,16 +1352,18 @@
         outputs=[output_image, seed, last_output],
     )
 
     btn_out_to_img1.click(fn=set_output_as_image1, inputs=[last_output], outputs=[input_image_1])
     btn_out_to_img2.click(fn=set_output_as_image2, inputs=[last_output], outputs=[input_image_2])
     btn_out_to_extra.click(fn=set_output_as_extra, inputs=[last_output, input_images_extra], outputs=[input_images_extra])
-
     add_derived_btn.click(
         fn=add_derived_ref,
         inputs=[input_image_1, input_images_extra, derived_type, derived_use_gpu, derived_max_people],
         outputs=[input_images_extra, derived_preview],
     )
-
 if __name__ == "__main__":
     demo.queue(max_size=30).launch(
         css=css,
@@ -1,19 +1,14 @@ (new file side)
 import os
 import re
 import gc
 import traceback
 import gradio as gr
 import numpy as np
 import spaces
 import torch
+import random
 from PIL import Image, ImageDraw
+from typing import Iterable, Optional
 
 from transformers import (
     AutoProcessor,
@@ -29,133 +24,6 @@ from safetensors.torch import load_file as safetensors_load_file (new file side)
 from gradio.themes import Soft
 from gradio.themes.utils import colors, fonts, sizes
 
| 27 |
# ============================================================
|
| 28 |
# Theme
|
| 29 |
# ============================================================
|
|
|
|
| 43 |
c950="#802200",
|
| 44 |
)
|
| 45 |
|
| 46 |
+
|
| 47 |
class OrangeRedTheme(Soft):
|
| 48 |
def __init__(
|
| 49 |
self,
|
|
|
|
| 99 |
block_label_background_fill="*primary_200",
|
| 100 |
)
|
| 101 |
|
|
|
|
| 102 |
|
| 103 |
+
orange_red_theme = OrangeRedTheme()
|
| 104 |
|
| 105 |
# ============================================================
|
| 106 |
+
# Device
|
| 107 |
# ============================================================
|
| 108 |
|
| 109 |
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
|
|
|
|
| 118 |
print("device name:", torch.cuda.get_device_name(torch.cuda.current_device()))
|
| 119 |
print("Using device:", device)
|
| 120 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 121 |
# ============================================================
|
| 122 |
# AIO version (Space variable)
|
| 123 |
# ============================================================
|
|
|
|
| 128 |
_VER_RE = re.compile(r"^v\d+$")
|
| 129 |
_DIGITS_RE = re.compile(r"^\d+$")
|
| 130 |
|
| 131 |
+
|
| 132 |
def _normalize_version(raw: str) -> Optional[str]:
|
| 133 |
if raw is None:
|
| 134 |
return None
|
|
|
|
| 137 |
return None
|
| 138 |
if _VER_RE.fullmatch(s):
|
| 139 |
return s
|
| 140 |
+
# forgiving: allow "21" -> "v21"
|
| 141 |
if _DIGITS_RE.fullmatch(s):
|
| 142 |
return f"v{s}"
|
| 143 |
return None
|
| 144 |
|
| 145 |
+
|
| 146 |
_AIO_ENV_RAW = os.environ.get("AIO_VERSION", "")
|
| 147 |
_AIO_ENV_NORM = _normalize_version(_AIO_ENV_RAW)
|
| 148 |
|
|
|
|
| 153 |
print(f"AIO_VERSION (normalized) = {_AIO_ENV_NORM!r}")
|
| 154 |
print(f"Using AIO_VERSION = {AIO_VERSION} ({AIO_VERSION_SOURCE})")
|
| 155 |
|
|
|
|
| 156 |
# ============================================================
|
| 157 |
# Pipeline
|
| 158 |
# ============================================================
|
|
|
|
| 164 |
|
| 165 |
dtype = torch.bfloat16
|
| 166 |
|
| 167 |
+
|
| 168 |
def _load_pipe_with_version(version: str) -> QwenImageEditPlusPipeline:
|
| 169 |
sub = f"{version}/transformer"
|
| 170 |
print(f"📦 Loading AIO transformer: {AIO_REPO_ID} / {sub}")
|
|
|
|
| 180 |
).to(device)
|
| 181 |
return p
|
| 182 |
|
| 183 |
+
|
| 184 |
+
# Forgiving load: try env/default version, fallback to v19 if it fails
|
| 185 |
try:
|
| 186 |
pipe = _load_pipe_with_version(AIO_VERSION)
|
| 187 |
+
except Exception as e:
|
| 188 |
print("❌ Failed to load requested AIO_VERSION. Falling back to v19.")
|
| 189 |
print("---- exception ----")
|
| 190 |
print(traceback.format_exc())
|
|
|
|
| 202 |
|
| 203 |
MAX_SEED = np.iinfo(np.int32).max
|
| 204 |
|
|
|
|
| 205 |
# ============================================================
|
| 206 |
+
# Derived conditioning (Transformers): Pose + Depth
|
| 207 |
# ============================================================
|
| 208 |
+
# Pose estimation uses ViTPose (top-down). Official docs show RT-DETR -> ViTPose flow:
|
| 209 |
+
# https://huggingface.co/docs/transformers/model_doc/vitpose
|
| 210 |
+
# Depth uses Depth Anything V2 Small (Transformers-compatible):
|
| 211 |
+
# https://huggingface.co/depth-anything/Depth-Anything-V2-Small-hf
|
| 212 |
|
| 213 |
POSE_MODEL_ID = "usyd-community/vitpose-base-simple"
|
| 214 |
POSE_DETECTOR_ID = "PekingU/rtdetr_r50vd_coco_o365"
|
| 215 |
DEPTH_MODEL_ID = "depth-anything/Depth-Anything-V2-Small-hf"
|
| 216 |
|
| 217 |
+
# Lazy caches keyed by device string ("cpu" / "cuda")
|
| 218 |
_POSE_CACHE = {}
|
| 219 |
_DEPTH_CACHE = {}
|
| 220 |
|
| 221 |
+
# COCO-17 skeleton connections (approx "OpenPose-like" stick figure)
|
| 222 |
COCO17_EDGES = [
|
| 223 |
+
(0, 1), (0, 2), (1, 3), (2, 4), # head
|
| 224 |
+
(5, 6), # shoulders
|
| 225 |
+
(5, 7), (7, 9), # left arm
|
| 226 |
+
(6, 8), (8, 10), # right arm
|
| 227 |
+
(5, 11), (6, 12), (11, 12), # torso
|
| 228 |
+
(11, 13), (13, 15), # left leg
|
| 229 |
+
(12, 14), (14, 16), # right leg
|
| 230 |
]
|
| 231 |
|
| 232 |
def _derived_device(use_gpu: bool) -> torch.device:
|
| 233 |
return torch.device("cuda" if (use_gpu and torch.cuda.is_available()) else "cpu")
|
| 234 |
|
| 235 |
+
|
| 236 |
def _load_pose_models(dev: torch.device):
|
| 237 |
key = str(dev)
|
| 238 |
if key in _POSE_CACHE:
|
| 239 |
return _POSE_CACHE[key]
|
| 240 |
|
| 241 |
+
# Detector (optional but used for multi-person boxes)
|
| 242 |
det_proc = AutoProcessor.from_pretrained(POSE_DETECTOR_ID)
|
| 243 |
det_model = RTDetrForObjectDetection.from_pretrained(POSE_DETECTOR_ID).to(dev)
|
| 244 |
|
| 245 |
+
# Pose model
|
| 246 |
pose_proc = AutoProcessor.from_pretrained(POSE_MODEL_ID)
|
| 247 |
pose_model = VitPoseForPoseEstimation.from_pretrained(POSE_MODEL_ID).to(dev)
|
| 248 |
|
|
|
|
| 252 |
_POSE_CACHE[key] = (det_proc, det_model, pose_proc, pose_model)
|
| 253 |
return _POSE_CACHE[key]
|
| 254 |
|
| 255 |
+
|
| 256 |
def _load_depth_models(dev: torch.device):
|
| 257 |
key = str(dev)
|
| 258 |
if key in _DEPTH_CACHE:
|
|
|
|
| 265 |
_DEPTH_CACHE[key] = (proc, model)
|
| 266 |
return _DEPTH_CACHE[key]
|
| 267 |
|
| 268 |
+
|
| 269 |
def _draw_skeleton_on_blank(
|
| 270 |
size: tuple[int, int],
|
| 271 |
persons_keypoints: list[np.ndarray],
|
|
|
|
| 279 |
draw = ImageDraw.Draw(canvas)
|
| 280 |
|
| 281 |
for kps, sc in zip(persons_keypoints, persons_scores):
|
| 282 |
+
# Draw edges
|
| 283 |
for a, b in COCO17_EDGES:
|
| 284 |
if a >= len(sc) or b >= len(sc):
|
| 285 |
continue
|
|
|
|
| 289 |
xb, yb = float(kps[b, 0]), float(kps[b, 1])
|
| 290 |
draw.line([(xa, ya), (xb, yb)], fill=(255, 255, 255), width=line_w)
|
| 291 |
|
| 292 |
+
# Draw keypoints
|
| 293 |
for i in range(min(len(sc), len(kps))):
|
| 294 |
if sc[i] < kp_thresh:
|
| 295 |
continue
|
| 296 |
x, y = float(kps[i, 0]), float(kps[i, 1])
|
| 297 |
+
draw.ellipse(
|
| 298 |
+
[(x - point_r, y - point_r), (x + point_r, y + point_r)],
|
| 299 |
+
fill=(255, 255, 255),
|
| 300 |
+
outline=None,
|
| 301 |
+
)
|
| 302 |
+
|
| 303 |
return canvas
|
| 304 |
|
| 305 |
+
|
| 306 |
def make_pose_map(
|
| 307 |
img: Image.Image,
|
| 308 |
*,
|
|
|
|
| 311 |
det_thresh: float = 0.30,
|
| 312 |
max_people: int = 4,
|
| 313 |
) -> Image.Image:
|
| 314 |
+
"""Return an OpenPose-like skeleton map (RGB) using Transformers models.
|
| 315 |
+
|
| 316 |
+
mode:
|
| 317 |
+
- "fast": full-frame box (no detector). Good when Image 1 is already a single subject.
|
| 318 |
+
- "detect": RT-DETR person boxes -> ViTPose. Better for multi-person scenes.
|
| 319 |
+
"""
|
| 320 |
img = img.convert("RGB")
|
| 321 |
dev = _derived_device(use_gpu)
|
| 322 |
det_proc, det_model, pose_proc, pose_model = _load_pose_models(dev)
|
|
|
|
| 324 |
w, h = img.size
|
| 325 |
|
| 326 |
if mode == "fast":
|
| 327 |
+
# Single box covering whole image, COCO format [x, y, w, h]
|
| 328 |
boxes = np.array([[0.0, 0.0, float(w), float(h)]], dtype=np.float32)
|
| 329 |
else:
|
| 330 |
+
# Detect people
|
| 331 |
inputs = det_proc(images=img, return_tensors="pt").to(dev)
|
| 332 |
with torch.no_grad():
|
| 333 |
outputs = det_model(**inputs)
|
|
|
|
| 338 |
threshold=det_thresh,
|
| 339 |
)[0]
|
| 340 |
|
| 341 |
+
# COCO label 0 is "person" for COCO-trained detectors
|
| 342 |
person_boxes = results["boxes"][results["labels"] == 0].detach().cpu().numpy()
|
| 343 |
|
| 344 |
if person_boxes.size == 0:
|
| 345 |
+
# Fallback to full-frame
|
| 346 |
boxes = np.array([[0.0, 0.0, float(w), float(h)]], dtype=np.float32)
|
| 347 |
else:
|
| 348 |
+
# Convert VOC x1,y1,x2,y2 to COCO x,y,w,h
|
| 349 |
person_boxes[:, 2] = person_boxes[:, 2] - person_boxes[:, 0]
|
| 350 |
person_boxes[:, 3] = person_boxes[:, 3] - person_boxes[:, 1]
|
| 351 |
boxes = person_boxes.astype(np.float32)
|
|
|
|
| 359 |
|
| 360 |
pose_results = pose_proc.post_process_pose_estimation(pose_outputs, boxes=[boxes])[0]
|
| 361 |
|
| 362 |
+
persons_kps = []
|
| 363 |
+
persons_sc = []
|
| 364 |
for pr in pose_results:
|
| 365 |
+
kps = pr["keypoints"].detach().cpu().numpy()
|
| 366 |
+
sc = pr["scores"].detach().cpu().numpy()
|
| 367 |
+
persons_kps.append(kps)
|
| 368 |
+
persons_sc.append(sc)
|
| 369 |
|
| 370 |
if not persons_kps:
|
| 371 |
+
# No pose found; return black canvas
|
| 372 |
return Image.new("RGB", img.size, (0, 0, 0))
|
| 373 |
|
| 374 |
return _draw_skeleton_on_blank(img.size, persons_kps, persons_sc)
|
| 375 |
|
| 376 |
+
|
| 377 |
def make_depth_map(img: Image.Image, *, use_gpu: bool) -> Image.Image:
|
| 378 |
+
"""Return a grayscale (RGB) depth map using Depth Anything V2 Small."""
|
| 379 |
img = img.convert("RGB")
|
| 380 |
dev = _derived_device(use_gpu)
|
| 381 |
proc, model = _load_depth_models(dev)
|
|
|
|
| 386 |
with torch.no_grad():
|
| 387 |
out = model(**inputs)
|
| 388 |
|
| 389 |
+
# predicted_depth: (B, H, W)
|
| 390 |
pred = out.predicted_depth
|
| 391 |
+
|
| 392 |
+
# Upsample to original image size
|
| 393 |
pred = torch.nn.functional.interpolate(
|
| 394 |
pred.unsqueeze(1),
|
| 395 |
size=(img.height, img.width),
|
|
|
|
| 403 |
arr = arr / denom
|
| 404 |
|
| 405 |
depth8 = (arr * 255.0).clip(0, 255).astype(np.uint8)
|
| 406 |
+
depth_img = Image.fromarray(depth8, mode="L").convert("RGB")
|
| 407 |
+
return depth_img
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 408 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 409 |
|
| 410 |
def _append_to_gallery(existing, new_img: Image.Image):
|
| 411 |
items = []
|
|
|
|
| 417 |
items.append(new_img)
|
| 418 |
return items
|
| 419 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 420 |
# ============================================================
|
| 421 |
+
# LoRA adapters + presets
|
| 422 |
# ============================================================
|
| 423 |
|
| 424 |
NONE_LORA = "None"
|
|
|
|
| 486 |
"weights": "bfs_head_v5_2511_original.safetensors",
|
| 487 |
"adapter_name": "BFS-Best-Faceswap",
|
| 488 |
"strength": 1.0,
|
| 489 |
+
"needs_alpha_fix": True, # <-- fixes KeyError 'img_in.alpha'
|
| 490 |
},
|
| 491 |
"BFS-Best-FaceSwap-merge": {
|
| 492 |
"type": "single",
|
|
|
|
| 496 |
"weights": "bfs_head_v5_2511_merged_version_rank_32_fp32.safetensors",
|
| 497 |
"adapter_name": "BFS-Best-Faceswap-merge",
|
| 498 |
"strength": 1.1,
|
| 499 |
+
"needs_alpha_fix": True, # <-- fixes KeyError 'img_in.alpha'
|
| 500 |
},
|
| 501 |
"F2P": {
|
| 502 |
"type": "single",
|
|
|
|
| 582 |
"BFS-Best-FaceSwap-merge": "head_swap: start with Picture 1 as the base image, keeping its lighting, environment, and background. remove the head from Picture 1 completely and replace it with the head from Picture 2, strictly preserving the hair, eye color, and nose structure of Picture 2. copy the eye direction, head rotation, and micro-expressions from Picture 1. high quality, sharp details, 4k",
|
| 583 |
}
|
| 584 |
|
| 585 |
+
# Track what is currently loaded in memory (adapter_name values)
|
| 586 |
LOADED_ADAPTERS = set()
|
| 587 |
|
|
|
|
| 588 |
# ============================================================
|
| 589 |
+
# Helpers: resolution
|
| 590 |
# ============================================================
|
| 591 |
|
| 592 |
+
# We prefer *area-based* sizing (≈ megapixels) over long-edge sizing.
|
| 593 |
+
# This aligns better with Qwen-Image-Edit's internal assumptions and reduces FOV drift.
|
| 594 |
+
|
| 595 |
def _round_to_multiple(x: int, m: int) -> int:
|
| 596 |
return max(m, (int(x) // m) * m)
|
| 597 |
|
|
|
|
| 600 |
target_area: int,
|
| 601 |
multiple_of: int,
|
| 602 |
) -> tuple[int, int]:
|
| 603 |
+
"""Compute (width, height) that matches image aspect ratio and approximates target_area.
|
| 604 |
+
|
| 605 |
+
The result is floored to be divisible by multiple_of (typically vae_scale_factor*2).
|
| 606 |
+
"""
|
| 607 |
w, h = image.size
|
| 608 |
aspect = w / h if h else 1.0
|
| 609 |
+
|
| 610 |
+
# Use the pipeline's own area->(w,h) helper for consistency.
|
| 611 |
from qwenimage.pipeline_qwenimage_edit_plus import calculate_dimensions
|
| 612 |
+
|
| 613 |
width, height = calculate_dimensions(int(target_area), float(aspect))
|
| 614 |
width = _round_to_multiple(int(width), int(multiple_of))
|
| 615 |
height = _round_to_multiple(int(height), int(multiple_of))
|
|
|
|
| 620 |
lora_adapter: str,
|
| 621 |
user_target_megapixels: float,
|
| 622 |
) -> int:
|
| 623 |
+
"""Return target pixel area for the canvas.
|
| 624 |
+
|
| 625 |
+
Priority:
|
| 626 |
+
1) Adapter spec: target_area (pixels) or target_megapixels
|
| 627 |
+
2) Adapter spec: target_long_edge (legacy) -> converted to area using image aspect
|
| 628 |
+
3) User slider target megapixels
|
| 629 |
+
"""
|
| 630 |
spec = ADAPTER_SPECS.get(lora_adapter, {})
|
| 631 |
+
|
| 632 |
if "target_area" in spec:
|
| 633 |
try:
|
| 634 |
return int(spec["target_area"])
|
| 635 |
except Exception:
|
| 636 |
pass
|
| 637 |
+
|
| 638 |
if "target_megapixels" in spec:
|
| 639 |
try:
|
| 640 |
mp = float(spec["target_megapixels"])
|
| 641 |
return int(mp * 1024 * 1024)
|
| 642 |
except Exception:
|
| 643 |
pass
|
| 644 |
+
|
| 645 |
+
# Legacy support (e.g. Upscale2K)
|
| 646 |
if "target_long_edge" in spec:
|
| 647 |
try:
|
| 648 |
long_edge = int(spec["target_long_edge"])
|
|
|
|
| 656 |
return int(new_w * new_h)
|
| 657 |
except Exception:
|
| 658 |
pass
|
|
|
|
| 659 |
|
| 660 |
+
# User default
|
| 661 |
+
return int(float(user_target_megapixels) * 1024 * 1024)
|
| 662 |
|
| 663 |
# ============================================================
|
| 664 |
+
# Helpers: multi-input routing + gallery normalization
|
| 665 |
# ============================================================
|
| 666 |
|
| 667 |
+
|
| 668 |
def lora_requires_two_images(lora_adapter: str) -> bool:
|
| 669 |
return bool(ADAPTER_SPECS.get(lora_adapter, {}).get("requires_two_images", False))
|
| 670 |
|
| 671 |
+
|
| 672 |
def image2_label_for_lora(lora_adapter: str) -> str:
|
| 673 |
return str(ADAPTER_SPECS.get(lora_adapter, {}).get("image2_label", "Upload Reference (Image 2)"))
|
| 674 |
|
| 675 |
+
|
| 676 |
+
def _to_pil_rgb(x) -> Optional[Image.Image]:
|
| 677 |
+
"""
|
| 678 |
+
Accepts PIL / numpy / (image, caption) tuples from gr.Gallery and returns PIL RGB.
|
| 679 |
+
Gradio Gallery commonly yields tuples like (image, caption).
|
| 680 |
+
"""
|
| 681 |
+
if x is None:
|
| 682 |
+
return None
|
| 683 |
+
|
| 684 |
+
# Gallery often returns (image, caption)
|
| 685 |
+
if isinstance(x, tuple) and len(x) >= 1:
|
| 686 |
+
x = x[0]
|
| 687 |
+
if x is None:
|
| 688 |
+
return None
|
| 689 |
+
|
| 690 |
+
if isinstance(x, Image.Image):
|
| 691 |
+
return x.convert("RGB")
|
| 692 |
+
|
| 693 |
+
if isinstance(x, np.ndarray):
|
| 694 |
+
return Image.fromarray(x).convert("RGB")
|
| 695 |
+
|
| 696 |
+
# Best-effort fallback
|
| 697 |
+
try:
|
| 698 |
+
return Image.fromarray(np.array(x)).convert("RGB")
|
| 699 |
+
except Exception:
|
| 700 |
+
return None
|
| 701 |
+
|
| 702 |
+
|
| 703 |
+
def build_labeled_images(
|
| 704 |
+
img1: Image.Image,
|
| 705 |
+
img2: Optional[Image.Image],
|
| 706 |
+
extra_imgs: Optional[list[Image.Image]],
|
| 707 |
+
) -> dict[str, Image.Image]:
|
| 708 |
+
"""
|
| 709 |
+
Creates labels image_1, image_2, image_3... based on what is actually uploaded:
|
| 710 |
+
- img1 is always image_1
|
| 711 |
+
- img2 becomes image_2 only if present
|
| 712 |
+
- extras start immediately after the last present base box
|
| 713 |
+
The pipeline receives images in this exact order.
|
| 714 |
+
"""
|
| 715 |
+
labeled: dict[str, Image.Image] = {}
|
| 716 |
+
idx = 1
|
| 717 |
+
|
| 718 |
+
labeled[f"image_{idx}"] = img1
|
| 719 |
+
idx += 1
|
| 720 |
+
|
| 721 |
+
if img2 is not None:
|
| 722 |
+
labeled[f"image_{idx}"] = img2
|
| 723 |
+
idx += 1
|
| 724 |
+
|
| 725 |
+
if extra_imgs:
|
| 726 |
+
for im in extra_imgs:
|
| 727 |
+
if im is None:
|
| 728 |
+
continue
|
| 729 |
+
labeled[f"image_{idx}"] = im
|
| 730 |
+
idx += 1
|
| 731 |
+
|
| 732 |
+
return labeled
|
| 733 |
+
|
| 734 |
+
|
| 735 |
+
# ============================================================
|
| 736 |
+
# Helpers: BFS alpha key fix
|
| 737 |
+
# ============================================================
|
| 738 |
+
|
| 739 |
+
|
| 740 |
def _inject_missing_alpha_keys(state_dict: dict) -> dict:
|
| 741 |
+
"""
|
| 742 |
+
Diffusers' Qwen LoRA converter expects '<module>.alpha' keys.
|
| 743 |
+
BFS safetensors omits them. We inject alpha = rank (neutral scaling).
|
| 744 |
+
|
| 745 |
+
IMPORTANT: diffusers may strip 'diffusion_model.' before lookup, so we
|
| 746 |
+
inject BOTH:
|
| 747 |
+
- diffusion_model.xxx.alpha
|
| 748 |
+
- xxx.alpha
|
| 749 |
+
"""
|
| 750 |
bases = {}
|
| 751 |
+
|
| 752 |
for k, v in state_dict.items():
|
| 753 |
if not isinstance(v, torch.Tensor):
|
| 754 |
continue
|
|
|
|
| 769 |
stripped_alpha = f"{stripped_base}.alpha"
|
| 770 |
if stripped_alpha not in state_dict:
|
| 771 |
state_dict[stripped_alpha] = alpha_tensor
|
| 772 |
+
|
| 773 |
return state_dict
|
| 774 |
|
| 775 |
+
|
| 776 |
def _filter_to_diffusers_lora_keys(state_dict: dict) -> tuple[dict, dict]:
|
| 777 |
+
"""Return (filtered_state_dict, stats).
|
| 778 |
+
|
| 779 |
+
Some ComfyUI/Qwen safetensors (especially "merged" variants) include non-LoRA
|
| 780 |
+
delta/patch keys like `*.diff` and `*.diff_b` alongside real LoRA tensors.
|
| 781 |
+
Diffusers' internal Qwen LoRA converter is strict: any leftover keys cause an
|
| 782 |
+
error (`state_dict should be empty...`).
|
| 783 |
+
|
| 784 |
+
This helper keeps only the keys Diffusers can consume as a LoRA:
|
| 785 |
+
- `*.lora_up.weight`
|
| 786 |
+
- `*.lora_down.weight`
|
| 787 |
+
- (rare) `*.lora_mid.weight`
|
| 788 |
+
- alpha keys: `*.alpha` (or `*.lora_alpha` which we normalize to `*.alpha`)
|
| 789 |
+
|
| 790 |
+
It also drops known patch keys (`*.diff`, `*.diff_b`) and everything else.
|
| 791 |
+
"""
|
| 792 |
+
|
| 793 |
keep_suffixes = (
|
| 794 |
".lora_up.weight",
|
| 795 |
".lora_down.weight",
|
|
|
|
| 797 |
".alpha",
|
| 798 |
".lora_alpha",
|
| 799 |
)
|
| 800 |
+
|
| 801 |
dropped_patch = 0
|
| 802 |
dropped_other = 0
|
| 803 |
kept = 0
|
|
|
|
| 806 |
out: dict[str, torch.Tensor] = {}
|
| 807 |
for k, v in state_dict.items():
|
| 808 |
if not isinstance(v, torch.Tensor):
|
| 809 |
+
# Ignore non-tensor entries if any.
|
| 810 |
dropped_other += 1
|
| 811 |
continue
|
| 812 |
+
|
| 813 |
+
# Drop ComfyUI "delta" keys that Diffusers' LoRA loader will never consume.
|
| 814 |
if k.endswith(".diff") or k.endswith(".diff_b"):
|
| 815 |
dropped_patch += 1
|
| 816 |
continue
|
| 817 |
+
|
| 818 |
if not k.endswith(keep_suffixes):
|
| 819 |
dropped_other += 1
|
| 820 |
continue
|
| 821 |
+
|
| 822 |
if k.endswith(".lora_alpha"):
|
| 823 |
+
# Normalize common alt name to what Diffusers expects.
|
| 824 |
base = k[: -len(".lora_alpha")]
|
| 825 |
k2 = f"{base}.alpha"
|
| 826 |
out[k2] = v.float() if v.dtype != torch.float32 else v
|
| 827 |
normalized_alpha += 1
|
| 828 |
kept += 1
|
| 829 |
continue
|
| 830 |
+
|
| 831 |
out[k] = v
|
| 832 |
kept += 1
|
| 833 |
|
|
|
|
| 839 |
}
|
| 840 |
return out, stats
|
| 841 |
|
| 842 |
+
|
| 843 |


def _duplicate_stripped_prefix_keys(state_dict: dict, prefix: str = "diffusion_model.") -> dict:
    """Ensure both prefixed and unprefixed variants exist for LoRA-related keys.

    Diffusers' Qwen LoRA conversion may strip `diffusion_model.` when looking up
    modules. Some exports only include prefixed keys. To be maximally compatible,
    we duplicate LoRA keys (and alpha) in stripped form when missing.
    """

    out = dict(state_dict)
    for k, v in list(state_dict.items()):
        if not k.startswith(prefix):
            continue
        stripped = k[len(prefix):]
        if stripped not in out:
            out[stripped] = v
    return out


def _load_lora_weights_with_fallback(repo: str, weight_name: str, adapter_name: str, needs_alpha_fix: bool = False):
    """
    Normal path: pipe.load_lora_weights(repo, weight_name=..., adapter_name=...)
    BFS fallback: download safetensors, inject missing alpha keys, then load from dict.
    """
    try:
        pipe.load_lora_weights(repo, weight_name=weight_name, adapter_name=adapter_name)
        return
    except (KeyError, ValueError) as e:
        # KeyError: missing required alpha keys (common in BFS)
        # ValueError: Diffusers Qwen converter found leftover keys (e.g. .diff/.diff_b)
        if not needs_alpha_fix:
            raise
        # … (unchanged lines not shown in the diff) …

    local_path = hf_hub_download(repo_id=repo, filename=weight_name)
    sd = safetensors_load_file(local_path)

    # 1) Inject required `<module>.alpha` keys (neutral scaling alpha=rank).
    sd = _inject_missing_alpha_keys(sd)

    # 2) Keep only LoRA + alpha keys; drop ComfyUI patch/delta keys.
    sd, stats = _filter_to_diffusers_lora_keys(sd)

    # 3) Duplicate stripped keys (remove `diffusion_model.`) for compatibility.
    sd = _duplicate_stripped_prefix_keys(sd)

    print(
        # … (unchanged line not shown in the diff) …
        f"kept={stats['kept']} dropped_patch={stats['dropped_patch']} "
        f"dropped_other={stats['dropped_other']} normalized_alpha={stats['normalized_alpha']}"
    )

    pipe.load_lora_weights(sd, adapter_name=adapter_name)
    return
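

# Hypothetical call (the repo id and file name below are placeholders, not real
# entries from this app's adapter table):
#
#   _load_lora_weights_with_fallback(
#       "some-user/bfs-face-swap-lora",   # assumed repo id
#       "bfs_face_swap.safetensors",      # assumed file name
#       adapter_name="bfs",
#       needs_alpha_fix=True,             # enables the sanitize-and-retry path
#   )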


# ============================================================
# LoRA loader: single/package + strengths
# ============================================================


def _ensure_loaded_and_get_active_adapters(selected_lora: str):
    spec = ADAPTER_SPECS.get(selected_lora)
    if not spec:
        # … (unchanged lines not shown in the diff) …

            adapter_names.append(adapter_name)
            adapter_weights.append(strength)
    else:
        repo = spec["repo"]
        weights = spec["weights"]
        # … (unchanged lines not shown in the diff) …
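
    # (The remainder of this helper is not shown in the diff. A typical way to
    # apply the collected names/strengths with diffusers' PEFT integration,
    # which is an assumption rather than something visible here, would be:
    #     pipe.set_adapters(adapter_names, adapter_weights=adapter_weights)
    # and then returning the list of active adapters.)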


# ============================================================
# UI handlers
# ============================================================


def on_lora_change_ui(selected_lora, current_prompt, current_extras_condition_only):
    # Preset prompt (fill only if empty)
    if selected_lora != NONE_LORA:
        preset = LORA_PRESET_PROMPTS.get(selected_lora, "")
        if preset and (current_prompt is None or str(current_prompt).strip() == ""):
            prompt_update = gr.update(value=preset)
        else:
            prompt_update = gr.update(value=current_prompt)
    else:
        prompt_update = gr.update(value=current_prompt)

    # Image 2 visibility/label
    if lora_requires_two_images(selected_lora):
        img2_update = gr.update(visible=True, label=image2_label_for_lora(selected_lora))
    else:
        img2_update = gr.update(visible=False, value=None, label="Upload Reference (Image 2)")

    # Extra references routing default:
    # For BFS/AnyPose-like adapters, it's usually safer to keep extra refs as conditioning-only.
    if selected_lora in ("BFS-Best-FaceSwap", "BFS-Best-FaceSwap-merge", "AnyPose"):
        extras_update = gr.update(value=True)
    else:
        extras_update = gr.update(value=current_extras_condition_only)

    return prompt_update, img2_update, extras_update


# ============================================================
# UI helpers: output routing + derived conditioning
# ============================================================


def set_output_as_image1(last):
    if last is None:
        raise gr.Error("No output available yet.")
    return gr.update(value=last)


def set_output_as_image2(last):
    if last is None:
        raise gr.Error("No output available yet.")
    return gr.update(value=last)


def set_output_as_extra(last, existing_extra):
    if last is None:
        raise gr.Error("No output available yet.")
    return _append_to_gallery(existing_extra, last)


@spaces.GPU
def add_derived_ref(img1, existing_extra, derived_type, derived_use_gpu, derived_max_people):
    if img1 is None:
        # … (unchanged lines not shown in the diff) …

    return gr.update(value=new_gallery), gr.update(visible=True, value=derived)


# ============================================================
# Inference
# ============================================================


@spaces.GPU
def infer(
    input_image_1,
    input_image_2,
    input_images_extra,  # gallery multi-image box
    prompt,
    lora_adapter,
    seed,
    # … (remaining parameters not shown in the diff) …
):
    if torch.cuda.is_available():
        torch.cuda.empty_cache()

    if input_image_1 is None:
        raise gr.Error("Please upload Image 1.")

    # … (unchanged lines not shown in the diff) …
        seed = random.randint(0, MAX_SEED)

    generator = torch.Generator(device=device).manual_seed(seed)

    negative_prompt = (
        "worst quality, low quality, bad anatomy, bad hands, text, error, missing fingers, "
        "extra digit, fewer digits, cropped, jpeg artifacts, signature, watermark, username, blurry"
    )

    img1 = input_image_1.convert("RGB")
    img2 = input_image_2.convert("RGB") if input_image_2 is not None else None

    # Normalize extra images (Gallery) to PIL RGB (handles tuples from Gallery)
    extra_imgs: list[Image.Image] = []
    if input_images_extra:
        for item in input_images_extra:
            # … (unchanged line not shown in the diff) …
            if pil is not None:
                extra_imgs.append(pil)

    # Enforce existing 2-image LoRA behavior (image_1 + image_2 required)
    if lora_requires_two_images(lora_adapter) and img2 is None:
        raise gr.Error("This LoRA needs two images. Please upload Image 2 as well.")

    # Label images as image_1, image_2, image_3...
    labeled = build_labeled_images(img1, img2, extra_imgs)

    # Pass to pipeline in labeled order. Keep single-image call when only one is present.
    pipe_images = list(labeled.values())
    if len(pipe_images) == 1:
        pipe_images = pipe_images[0]

    # Resolution derived from Image 1 (base/body/target)
    # Use target *area* (≈ megapixels) rather than long-edge sizing to reduce FOV drift.
    target_area = get_target_area_for_lora(img1, lora_adapter, float(target_megapixels))
    width, height = compute_canvas_dimensions_from_area(
        img1,
        # … (unchanged line not shown in the diff) …
        multiple_of=int(pipe.vae_scale_factor * 2),
    )
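
    # Rough worked example (illustrative numbers; the exact rounding lives in
    # compute_canvas_dimensions_from_area): with a 3:2 input such as 1536x1024
    # and a 1.0 MP target area, the canvas aims for
    #   width  ≈ sqrt(1_000_000 * 3 / 2) ≈ 1224
    #   height ≈ sqrt(1_000_000 * 2 / 3) ≈ 816
    # and both are snapped to a multiple of `pipe.vae_scale_factor * 2`, so the
    # output keeps Image 1's aspect ratio at roughly the requested area.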

    # Decide which images participate in the VAE latent stream.
    # If enabled, extra references beyond (Img_1, Img_2) become conditioning-only.
    vae_image_indices = None
    if extras_condition_only:
        if isinstance(pipe_images, list) and len(pipe_images) > 2:
            vae_image_indices = [0, 1] if len(pipe_images) >= 2 else [0]
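
    # For example (hypothetical counts): with Image 1, Image 2 and two extra
    # references, pipe_images has 4 entries and vae_image_indices becomes [0, 1],
    # so only Image 1 and Image 2 are encoded into the VAE latent stream while
    # the extras act as conditioning-only references.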

    try:
        print(
            "[DEBUG][infer] submitting request | "
            f"lora_adapter={lora_adapter!r} seed={seed} prompt={prompt!r}"
        )

        result = pipe(
            image=pipe_images,
            prompt=prompt,
            negative_prompt=negative_prompt,
            height=height,
            width=width,
            num_inference_steps=steps,
            generator=generator,
            true_cfg_scale=guidance_scale,
            vae_image_indices=vae_image_indices,
            pad_to_canvas=bool(pad_to_canvas),
        ).images[0]
        return result, seed, result
    finally:
        gc.collect()
        if torch.cuda.is_available():
            torch.cuda.empty_cache()


# … (unchanged lines not shown in the diff) …
    input_pil = input_image.convert("RGB")
    guidance_scale = 1.0
    steps = 4
    # Examples don't supply Image 2 or extra images, and the example list doesn't include AnyPose/BFS.
    result, seed, last = infer(input_pil, None, None, prompt, lora_adapter, 0, True, guidance_scale, steps, 1.0, True, True)
    return result, seed, last


# … (unchanged lines not shown in the diff) …
    gr.Markdown(
        "Perform diverse image edits using specialized "
        "[LoRA](https://huggingface.co/models?other=base_model:adapter:Qwen/Qwen-Image-Edit-2511) adapters for the "
        "[Qwen-Image-Edit](https://huggingface.co/Qwen/Qwen-Image-Edit-2511) model. Uses a Diffusers-compatible "
        "extraction of the transformer from Phr00t's Rapid AIO merge. If a different AIO version is desired, "
        "duplicate the Space and set the Space variable to change the version."
    )
    gr.Markdown(aio_status_line)

    # … (unchanged lines not shown in the diff) …
        value=True,
    )

    # On LoRA selection: preset prompt + toggle Image 2
    lora_adapter.change(
        fn=on_lora_change_ui,
        inputs=[lora_adapter, prompt, extras_condition_only],
        # … (unchanged lines not shown in the diff) …
        outputs=[output_image, seed, last_output],
    )

    # Output routing buttons
    btn_out_to_img1.click(fn=set_output_as_image1, inputs=[last_output], outputs=[input_image_1])
    btn_out_to_img2.click(fn=set_output_as_image2, inputs=[last_output], outputs=[input_image_2])
    btn_out_to_extra.click(fn=set_output_as_extra, inputs=[last_output, input_images_extra], outputs=[input_images_extra])

    # Derived conditioning: append pose/depth map as extra ref (UI shows preview)
    add_derived_btn.click(
        fn=add_derived_ref,
        inputs=[input_image_1, input_images_extra, derived_type, derived_use_gpu, derived_max_people],
        outputs=[input_images_extra, derived_preview],
    )


if __name__ == "__main__":
    demo.queue(max_size=30).launch(
        css=css,