| --- |
| license: mit |
| base_model: |
| - Qwen/Qwen3-VL-8B-Instruct |
| pipeline_tag: image-text-to-text |
| library_name: transformers |
| --- |
| |
| # Code2World-8B |
|
|
| Given a current GUI observation and an action, Code2World predicts the next screenshot via **renderable code generation**. |
|
|
|  |
|
|
| ## Quickstart |
|
|
| The demo script below runs a single example case to show how to use Code2World with 🤗 Transformers. |
|
|
| To keep the demo clear and reusable, it relies on the following components: |
| - `prompt_builder.py`: builds the text prompt from the task instruction and action. |
| - `visual_hint.py`: adds visual action hints (e.g. click circles or swipe arrows) to the input screenshot. |
| - `render_utils.py`: post-processes generated HTML, renders it into an image, and saves outputs. |
|
|
|
|
| Code2World is built on Qwen3-VL, whose model code is included in recent Hugging Face transformers releases. We advise you to install a compatible version with the following command: |
| ``` |
| pip install transformers==4.57.0 |
| ``` |
|
|
|
|
| ```python |
| import importlib.util |
| |
| import torch |
| from PIL import Image |
| from transformers import AutoProcessor, Qwen3VLForConditionalGeneration |
| |
| from prompt_builder import SYSTEM_PROMPT, build_user_prompt |
| from render_utils import extract_clean_html, render_html_to_image, save_demo_outputs |
| from visual_hint import build_visual_hint |
| |
| |
| # ============================================================ |
| # 1. Load model |
| # ============================================================ |
| |
| MODEL_NAME = "GD-ML/Code2World" |
| |
| model = Qwen3VLForConditionalGeneration.from_pretrained( |
| MODEL_NAME, |
| dtype=torch.bfloat16, |
| attn_implementation="flash_attention_2", |
| device_map="auto", |
| ) |
| |
| processor = AutoProcessor.from_pretrained(MODEL_NAME) |
| |
| |
| # ============================================================ |
| # 2. Helper functions |
| # ============================================================ |
| |
| def build_messages(image, instruction, action): |
| user_prompt = build_user_prompt( |
| instruction_str=instruction, |
| action=action, |
| ) |
| |
| messages = [ |
| { |
| "role": "system", |
| "content": [{"type": "text", "text": SYSTEM_PROMPT}], |
| }, |
| { |
| "role": "user", |
| "content": [ |
| {"type": "image", "image": image.convert("RGB")}, |
| {"type": "text", "text": user_prompt}, |
| ], |
| }, |
| ] |
| return messages |
| |
| |
| @torch.inference_mode() |
| def generate_html(image, instruction, action, max_new_tokens=8192): |
| messages = build_messages( |
| image=image, |
| instruction=instruction, |
| action=action, |
| ) |
| |
| inputs = processor.apply_chat_template( |
| messages, |
| add_generation_prompt=True, |
| tokenize=True, |
| return_dict=True, |
| return_tensors="pt", |
| ) |
| inputs = inputs.to(model.device) |
| |
| generated_ids = model.generate( |
| **inputs, |
| max_new_tokens=max_new_tokens, |
| ) |
| |
| generated_ids_trimmed = [ |
| out_ids[len(in_ids):] |
| for in_ids, out_ids in zip(inputs.input_ids, generated_ids) |
| ] |
| |
| output_text = processor.batch_decode( |
| generated_ids_trimmed, |
| skip_special_tokens=True, |
| clean_up_tokenization_spaces=False, |
| )[0] |
| |
| html = extract_clean_html(output_text) |
| return html |
| |
| |
| def run_demo(case_data, output_dir="./demo_outputs"): |
| """ |
| case_data: |
| - images[0] |
| - instruction |
| - action |
| """ |
| image_path = case_data["images"][0] |
| instruction = case_data["instruction"] |
| action = case_data["action"] |
| |
| image = Image.open(image_path).convert("RGB") |
| hinted_image = build_visual_hint(image, action) |
| |
| html = generate_html( |
| image=hinted_image, |
| instruction=instruction, |
| action=action, |
| ) |
| |
| rendered_image = render_html_to_image(html) |
| |
| save_demo_outputs( |
| output_dir=output_dir, |
| hinted_image=hinted_image, |
| html=html, |
| rendered_image=rendered_image, |
| ) |
| |
| return hinted_image, html, rendered_image |
| |
| |
| # ============================================================ |
| # 3. Example case |
| # ============================================================ |
| |
| if __name__ == "__main__": |
| case_data = { |
| "images": [ |
| "demo_case.png" |
| ], |
| "instruction": "Click on the Search Omio button.", |
| "action": { |
| "action_type": "click", |
| "x": 540, |
| "y": 1470 |
| } |
| } |
| |
| run_demo(case_data, output_dir="./demo_outputs") |
| ``` |
|
|
| ## Citation |
|
|
| If you find our work helpful, please consider citing our paper: |
|
|
| ``` |
| @article{zheng2026code2world, |
| title={Code2World: A GUI World Model via Renderable Code Generation}, |
| author={Zheng, Yuhao and Zhong, Li'an and Wang, Yi and Dai, Rui and Liu, Kaikui and Chu, Xiangxiang and Lv, Linyuan and Torr, Philip and Lin, Kevin Qinghong}, |
| journal={arXiv preprint arXiv:2602.09856}, |
| year={2026} |
| } |
| ``` |