dual use text encoder/kv fast edits #13853

kwal559 · 2026-06-01T19:39:49Z

kwal559
Jun 1, 2026

This script explores FLUX.2 klein 9B-KV. Pass a prompt directly to the text encoder to enhance it, allow it to think, and capture its final output. Then we create variation prompts and feed them back to the text encoder for embeds. One component, dual use = saved memory. Quantize it if you want to save time. We include the initial image and pile on the variation prompts, allowing batch generation for speed. You receive a grid of consistent characters in different poses/image challenges. If you want to see magic, load up an SVDQ or similar small transformer and set the image count to 100. On an RTX 4090, 100 pics (128x128) generate in less than 10 seconds; each image is unique and the character remains.

import torch,diffusers,gc,time,psutil,random
from PIL import Image

def flush():
    """Free cached memory and print the current VRAM / RAM usage.

    Runs the Python garbage collector, empties the CUDA caching allocator,
    then prints how much memory PyTorch still reserves, how much device
    memory is in use, and how much host RAM is in use (all in GiB).
    """
    gc.collect()
    torch.cuda.empty_cache()

    gib = 1024**3  # bytes per GiB
    free_bytes, total_bytes = torch.cuda.mem_get_info()
    print(f"🧹✂️ {torch.cuda.memory_reserved()/gib:.1f}GB")
    # Used VRAM is derived from the device's reported total, so the number is
    # correct on any card rather than only on a 24 GB one.
    print(f"VRAM: {(total_bytes - free_bytes)/gib:.2f}GB | RAM: {psutil.virtual_memory().used/gib:.1f}GB")

# Hugging Face repo the pipelines are loaded from.
model_id = "black-forest-labs/FLUX.2-klein-9B"
# Hugging Face repo whose "transformer" subfolder supplies the KV transformer.
kv_tran = "black-forest-labs/FLUX.2-klein-9b-kv"

def enhance_and_embed(user_concept, num_prompts=20):
    """Enhance a concept with the text encoder, then embed prompt variations.

    The pipeline is loaded without transformer, VAE or scheduler. Its text
    encoder is first used generatively (chat template + ``generate``) to
    expand ``user_concept`` into a detailed prompt, then used through
    ``encode_prompt`` to embed that prompt and ``num_prompts`` randomized
    variations of it. The pipeline is released before returning.

    Args:
        user_concept: Short text idea to be expanded into a full prompt.
        num_prompts: Number of variation prompts to build and embed.

    Returns:
        A tuple ``(init_embeddings, prompt_embeddings)`` of CPU tensors: the
        embedding of the enhanced prompt, and the embeddings of all variation
        prompts concatenated along dim 0.
    """
    time_1 = time.time()
    print("🧠 Text Encode + Enhance")
    pipe = diffusers.DiffusionPipeline.from_pretrained(
        model_id,
        transformer=None,
        vae=None,
        scheduler=None,
        torch_dtype=torch.bfloat16,
    ).to("cuda")

    system_prompt = "You are a creative assistant. Take the user's simple concept and write a highly detailed, descriptive prompt for an image generator. Wrap your final, ready to input enhanced prompt in quotation marks."

    messages = [{"role": "system", "content": system_prompt}, {"role": "user", "content": user_concept}]
    text_input = pipe.tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    inputs = pipe.tokenizer(text_input, return_tensors="pt").to("cuda")

    print(f"\nStarting enhancement for: '{user_concept}'")
    flush()

    with torch.no_grad():
        outputs = pipe.text_encoder.generate(
            **inputs,
            top_p=0.95,
            top_k=20,
            repetition_penalty=1.0,
            temperature=1.0,
            max_new_tokens=1024,
            pad_token_id=pipe.tokenizer.pad_token_id,
            eos_token_id=pipe.tokenizer.eos_token_id,
        )

    # Decode only the newly generated tokens (everything after the prompt).
    raw_response = pipe.tokenizer.decode(outputs[0][inputs.input_ids.shape[1]:], skip_special_tokens=False)

    think_end, tool_tag, eos_tag = "</think>", "<tool_call>", "<|im_end|>"
    if think_end in raw_response:
        # rpartition tolerates several closing tags: the text after the last
        # one is the answer, everything before it is reasoning.
        thinking, _, final_prompt = raw_response.rpartition(think_end)
        final_prompt = final_prompt.replace(eos_tag, "").strip()
    elif tool_tag in raw_response:
        thinking, _, final_prompt = raw_response.partition(tool_tag)
        final_prompt = final_prompt.replace(tool_tag, "").replace(eos_tag, "").strip()
    else:
        # No reasoning section was emitted; keep `thinking` defined so the
        # report below cannot fail on an unbound name.
        thinking = ""
        final_prompt = raw_response.replace(eos_tag, "").strip()
    print(f"hmmm..\n{thinking}\n")
    print(f"✨ ENHANCED PROMPT (took {time.time() - time_1:.2f}s):\n{final_prompt}\n")

    color = ["white", "black", "brown", "neon blue", "pastel green", "red", "orange", "neon yellow"]
    feels = ["holds a cat", "holds a pig", "teeth braces big smile", "growls", "laughs", "waves at viewer", "doubles in size", "drools", "shrinks in size"]
    extra = ["eye patch", "sunglasses", "devil horns", "baseball cap", "western cowboy hat", "giant wings", "balloon", "swimsuit"]
    where = ["storm clouds", "spotlight", "holding a pencil drawing of this image", "holding a charcoal sketch of the character"]
    style = ["4bit game", "comic book", "claymation", "sand sculpture", "80's cartoon", "cosmic horror graphic novel"]
    looks = ["looks to the left", "looks to the right", "looks up", "looks down", "obtains the rear view", "looks at reflection in mirror"]

    print("🎨 Mixing variation prompts...")
    variant_prompts = []
    for _ in range(num_prompts):
        # One random pick per category, drawn in a fixed order so a seeded
        # `random` yields reproducible prompts.
        c, f, e, h, s, l = random.choice(color), random.choice(feels), random.choice(extra), random.choice(where), random.choice(style), random.choice(looks)
        variant_prompts.append(
            f"Enhance this concept: {final_prompt}. Add a {c} {e} and this character {f}.. {l}, {h} in a {s} style."
        )

    print("🔠 Embedding prompts...")
    time2 = time.time()
    with torch.no_grad():
        # Embeddings are moved to the CPU right away so they outlive the
        # pipeline that is released below.
        init_embeddings = pipe.encode_prompt(prompt=final_prompt)[0].to("cpu")
        prompt_embeddings = torch.cat(
            [pipe.encode_prompt(prompt=p)[0].to("cpu") for p in variant_prompts],
            dim=0,
        )
    print(f"Embeddings done.. {time.time() - time2:.1f} sec")

    # Drop every reference to the text encoder before the caller loads the
    # image-generation models.
    pipe.text_encoder = None
    pipe.tokenizer = None
    del pipe
    flush()

    return init_embeddings, prompt_embeddings

def generate_images(init_embeddings, prompt_embeddings, num_prompts=20):
    """Render a base image, then a batch of variations, and show them as a grid.

    Loads the VAE and the KV transformer into a ``Flux2Pipeline`` that has no
    text encoder or tokenizer, generates one 1024x1024 image from
    ``init_embeddings``, downsizes it to 256x256, and passes it as ``image``
    together with ``prompt_embeddings`` to produce the 256x256 variations.
    Both the base image and the final grid are shown with ``display``.

    Args:
        init_embeddings: Prompt embedding used for the base image.
        prompt_embeddings: Prompt embeddings used for the variations.
        num_prompts: Variation count; only used in the progress messages.
    """
    # NOTE(review): `display` is not defined in this file - presumably the
    # IPython/Jupyter builtin; confirm before running as a plain script.
    print("\n🚀 Loading Image Generation Models...")
    decoder = diffusers.AutoencoderKLFlux2.from_pretrained(
        "black-forest-labs/FLUX.2-small-decoder",
        torch_dtype=torch.bfloat16,
    )
    kv_transformer = diffusers.AutoModel.from_pretrained(
        kv_tran,
        subfolder="transformer",
        torch_dtype=torch.bfloat16,
    )
    kv_transformer.enable_layerwise_casting(storage_dtype=torch.float8_e4m3fn)

    pipe = diffusers.Flux2Pipeline.from_pretrained(
        model_id,
        transformer=kv_transformer,
        text_encoder=None,
        tokenizer=None,
        vae=decoder,
        torch_dtype=torch.bfloat16,
    ).to("cuda")
    pipe.transformer.to(memory_format=torch.channels_last)
    pipe.vae.to(memory_format=torch.channels_last)
    flush()

    # Sampler settings shared by the base render and the variation batch.
    sampler_kwargs = {"num_inference_steps": 4, "guidance_scale": 1}

    print("\n🖼️ Generating Base Image...")
    init_embeddings = init_embeddings.to("cuda", dtype=torch.bfloat16)
    base_started = time.time()

    with torch.inference_mode():
        base_result = pipe(prompt_embeds=init_embeddings, width=1024, height=1024, **sampler_kwargs)
        kvimg = base_result.images[0]

    display(kvimg)
    print(f"Base image generated in: {time.time() - base_started:.1f} sec")
    flush()

    # The downsized base image is what the variation pass receives as `image`.
    init_image = kvimg.resize((256, 256), Image.LANCZOS)
    del kvimg
    flush()

    print(f"\n🧬 Generating {num_prompts} Variations...")
    prompt_embeddings = prompt_embeddings.to("cuda", dtype=torch.bfloat16)
    variants_started = time.time()

    with torch.inference_mode():
        variant_result = pipe(
            prompt_embeds=prompt_embeddings,
            image=init_image,
            width=256,
            height=256,
            **sampler_kwargs,
        )
        images = variant_result.images

    print(f"{num_prompts} variant images generated in: {time.time() - variants_started:.1f} sec")
    flush()

    # Lay the tiles out row by row, at most `max_cols` per row.
    tile_w, tile_h = images[0].size
    max_cols = 4
    count = len(images)
    cols = min(max_cols, count)
    rows = -(-count // max_cols)  # ceiling division
    grid = Image.new('RGB', (cols * tile_w, rows * tile_h))
    for idx, tile in enumerate(images):
        row, col = divmod(idx, cols)
        grid.paste(tile, (col * tile_w, row * tile_h))
    display(grid)

# EXECUTE PIPELINE

if __name__ == "__main__":
    # Concept to enhance and the number of variation images to generate.
    USER_CONCEPT = "Portrait of a ghoul"
    NUM_VARIATIONS = 20

    flush()

    # Stage 1: the text encoder enhances the concept and embeds all prompts.
    init_emb, prompt_emb = enhance_and_embed(USER_CONCEPT, num_prompts=NUM_VARIATIONS)
    # Stage 2: the transformer + VAE render the base image and the variations.
    generate_images(init_emb, prompt_emb, num_prompts=NUM_VARIATIONS)

false200 · 2026-06-22T09:42:49Z

false200
Jun 22, 2026

Hi @kwal559, this is a solid workflow. The part that actually makes it work on one 24GB card is loading only the text encoder first, using it as an LLM for prompt expansion and then for embeddings, and dumping it before the transformer and VAE come in. That matches how diffusers already thinks about offload order anyway, you’re just doing it harder. Since Klein’s text encoder is Qwen3, calling generate and then encode_prompt on the same weights isn’t a hack. That’s the model. The KV transformer plus the resized base image as reference is what keeps the character consistent across the grid. You pay for enhancement once, lock the look at 1024, then batch cheap edits at 256.

One thing I’d try is batching encode_prompt with the full variant list instead of looping one prompt at a time. Should help a lot once you crank variations up to 50 or 100. Also your system prompt asks for the final line in quotes but the parser mostly looks for thinking tags. If those don’t show up you might get extra junk in the embed. Pulling the last quoted string with a simple regex fallback would probably make that step more reliable.

Did you try Flux2KleinPipeline instead of Flux2Pipeline here? It might be a cleaner fit, since you're already on distilled 4-step settings and passing prompt_embeds plus image directly.

0 replies

kwal559 · 2026-06-25T03:15:31Z

kwal559
Jun 25, 2026
Author

I shared that 'harder' method because it performs 3 times faster than the recommended enable cpu_offload. I also show how to enhance a prompt with the default 'text encoder' in natural language, with reasoning mode enabled, and have it encode to embeds by telling diffusers what not to load. I didn't know that these LLM/encoders, and the prompt helpers that sometimes accompany them, are the same default models we download on HF, just with a custom prompt. Here are some benchmarks for BF16 Klein on an RTX 4090; the point is that 'harder' ways are also improved ways, and I figure enthusiasts likely read these, the type who look to benefit the hobby.

5 images cpu offload enabled: 37.61s
5 images cpu offload enabled: 37.79s
5 images all on gpu: 11.38s
5 images all on gpu: 11.42s
5 images text encoder block lvl 10 offload: 23.35s
5 images text encoder block lvl 10 offload: 23.39s
5 images text encoder embeds 5 individual prompts, delete pipe, load transformer + vae: 11.04s
5 images text encoder embeds 5 individual prompts, delete pipe, load transformer + vae: 10.86s

1 reply

kwal559 Jun 25, 2026
Author

Oops, forgot this. I also show how inefficient this is, with no loops and single encodes, leaving even more gains open to those who try.
# Benchmark: embed five prompts one at a time with the text encoder only,
# release it, then load the transformer + VAE and time five generations.
import contextlib
import time


@contextlib.contextmanager
def time_it(label):
    """Print ``label`` and the wall-clock seconds spent inside the block.

    NOTE(review): the snippet calls ``time_it`` without defining it; this
    minimal version reproduces the "<label>: <seconds>s" lines quoted in the
    benchmark results - confirm it matches the helper actually used.
    """
    started = time.perf_counter()
    try:
        yield
    finally:
        print(f"{label}: {time.perf_counter() - started:.2f}s")


model_id = "black-forest-labs/FLUX.2-klein-4B"
# Text-encoder-only pipeline: transformer, VAE and scheduler are not loaded.
pipe = diffusers.DiffusionPipeline.from_pretrained(
    model_id,
    transformer=None,
    vae=None,
    scheduler=None,
    torch_dtype=torch.bfloat16,
).to("cuda")

# Deliberately five separate encode calls (no loop, no batching).
with torch.no_grad():
    prompt_embed1 = pipe.encode_prompt(prompt="an ancient dragon")[0]
    prompt_embed2 = pipe.encode_prompt(prompt="a powerful papa smurf")[0]
    prompt_embed3 = pipe.encode_prompt(prompt="hummingbird with real strawberry for skin")[0]
    prompt_embed4 = pipe.encode_prompt(prompt="waterfall with mirror reflection")[0]
    prompt_embed5 = pipe.encode_prompt(prompt="add the strawberry skinned hummingbird reflecting on waterfall")[0]

# Release the text encoder, then reload the pipeline without it.
pipe.text_encoder, pipe.tokenizer = None, None
del pipe
pipe = diffusers.DiffusionPipeline.from_pretrained(
    model_id,
    text_encoder=None,
    tokenizer=None,
    torch_dtype=torch.bfloat16,
).to("cuda")
gen_params = {"num_inference_steps": 4, "guidance_scale": 1}

# NOTE(review): `display` is presumably the IPython/Jupyter builtin.
with torch.inference_mode():
    with time_it("5 images text encoder embeds 5 seperate prompts, delete pipe, load transformer + vae"):
        display(pipe(prompt_embeds=prompt_embed1, **gen_params).images[0])
        display(pipe(prompt_embeds=prompt_embed2, **gen_params).images[0])
        unoimg = pipe(prompt_embeds=prompt_embed3, **gen_params).images[0]
        dosimg = pipe(prompt_embeds=prompt_embed4, **gen_params).images[0]
        # The fifth prompt receives both previous results as `image` inputs.
        treimg = pipe(prompt_embeds=prompt_embed5, image=([unoimg, dosimg]), **gen_params).images[0]

# NOTE(review): the grid is assembled after the timed block on the assumption
# that the timer covers only the five generations - confirm.
end_grid = diffusers.utils.make_image_grid(images=[unoimg, dosimg, treimg], rows=1, cols=3)
display(end_grid)

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Uh oh!

dual use text encoder/kv fast edits #13853

Uh oh!

{{title}}

Uh oh!

Replies: 2 comments 1 reply

Uh oh!

{{title}}

Uh oh!

Uh oh!

{{title}}

Uh oh!

Uh oh!

{{title}}

Uh oh!

Select a reply

Uh oh!

Uh oh!

dual use text encoder/kv fast edits #13853

Uh oh!

kwal559 Jun 1, 2026

EXECUTE PIPELINE

Replies: 2 comments · 1 reply

Uh oh!

false200 Jun 22, 2026

Uh oh!

kwal559 Jun 25, 2026 Author

Uh oh!

kwal559 Jun 25, 2026 Author

kwal559
Jun 1, 2026

Replies: 2 comments 1 reply

false200
Jun 22, 2026

kwal559
Jun 25, 2026
Author

kwal559 Jun 25, 2026
Author