Spaces:

multimodalart
/

LLaDA

Running on Zero

App Files Files Community

multimodalart HF Staff commited on Feb 26

Commit

b41aa9b

verified ·

1 Parent(s): f6d8cac

Create app.py

Browse files

Files changed (1) hide show

app.py +439 -0

app.py ADDED Viewed

	@@ -0,0 +1,439 @@

+import torch
+import numpy as np
+import gradio as gr
+import spaces
+from transformers import AutoTokenizer, AutoModel
+import time
+import re
+device = 'cuda' if torch.cuda.is_available() else 'cpu'
+print(f"Using device: {device}")
+# Load model and tokenizer
+tokenizer = AutoTokenizer.from_pretrained('GSAI-ML/LLaDA-8B-Instruct', trust_remote_code=True)
+model = AutoModel.from_pretrained('GSAI-ML/LLaDA-8B-Instruct', trust_remote_code=True,
+                                  torch_dtype=torch.bfloat16).to(device).eval()
+# Constants
+MASK_TOKEN = "[MASK]"
+MASK_ID = 126336  # The token ID of [MASK] in LLaDA
+def parse_constraints(constraints_text):
+    """Parse constraints in format: 'position:word, position:word, ...'"""
+    constraints = {}
+    if not constraints_text:
+        return constraints
+    parts = constraints_text.split(',')
+    for part in parts:
+        if ':' not in part:
+            continue
+        pos_str, word = part.split(':', 1)
+        try:
+            pos = int(pos_str.strip())
+            word = word.strip()
+            if word and pos >= 0:
+                constraints[pos] = word
+        except ValueError:
+            continue
+    return constraints
+def format_chat_history(history):
+    """
+    Format chat history for the LLaDA model
+    Args:
+        history: List of [user_message, assistant_message] pairs
+    Returns:
+        Formatted conversation for the model
+    """
+    messages = []
+    for user_msg, assistant_msg in history:
+        messages.append({"role": "user", "content": user_msg})
+        if assistant_msg:  # Skip if None (for the latest user message)
+            messages.append({"role": "assistant", "content": assistant_msg})
+    return messages
+@spaces.GPU
+def generate_response_with_visualization(model, tokenizer, device, messages, gen_length=64, steps=32, constraints=None):
+    """
+    Generate text with LLaDA model with visualization of the denoising process
+    Args:
+        messages: List of message dictionaries with 'role' and 'content'
+    Returns:
+        List of visualization states showing the progression and final text
+    """
+    # Set random seed for reproducibility
+    torch.manual_seed(42)
+    # Process constraints
+    if constraints is None:
+        constraints = {}
+    # Convert any string constraints to token IDs
+    processed_constraints = {}
+    for pos, word in constraints.items():
+        tokens = tokenizer.encode(" " + word, add_special_tokens=False)
+        for i, token_id in enumerate(tokens):
+            processed_constraints[pos + i] = token_id
+    # Prepare the prompt using chat template
+    chat_input = tokenizer.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
+    input_ids = tokenizer(chat_input)['input_ids']
+    input_ids = torch.tensor(input_ids).to(device).unsqueeze(0)
+    # For generation
+    prompt_length = input_ids.shape[1]
+    # Initialize the sequence with masks for the response part
+    x = torch.full((1, prompt_length + gen_length), MASK_ID, dtype=torch.long).to(device)
+    x[:, :prompt_length] = input_ids.clone()
+    # Initialize visualization states for just the response part
+    visualization_states = []
+    # Add initial state (all masked) - only for the response part
+    initial_state = [(MASK_TOKEN, "#444444") for _ in range(gen_length)]
+    visualization_states.append(initial_state)
+    # Apply constraints to the initial state
+    for pos, token_id in processed_constraints.items():
+        absolute_pos = prompt_length + pos
+        if absolute_pos < x.shape[1]:
+            x[:, absolute_pos] = token_id
+    # Calculate timesteps
+    timesteps = torch.linspace(1.0, 0.0, steps + 1)[:-1]
+    # Keep track of already revealed tokens
+    revealed_tokens = torch.zeros(1, gen_length, dtype=torch.bool).to(device)
+    for step, t in enumerate(timesteps):
+        # Current t to next t
+        s = t - 1.0 / steps if step < steps - 1 else 0
+        # Get all mask positions in the current sequence
+        mask_indices = (x == MASK_ID)
+        # Skip if no masks
+        if not mask_indices.any():
+            break
+        # Get logits from the model
+        logits = model(x).logits
+        # Get the top predictions
+        x0 = torch.argmax(logits, dim=-1)
+        # Get probabilities for visualization
+        probs = torch.softmax(logits, dim=-1)
+        top_probs = torch.max(probs, dim=-1)[0]
+        # Apply the predictions where we have masks
+        x_old = x.clone()
+        x = torch.where(mask_indices, x0, x)
+        # Calculate how many tokens should remain masked at next step
+        total_len = gen_length
+        current_t_value = float(t)
+        next_t_value = float(s)
+        # Linear schedule: t=1 → all masked, t=0 → none masked
+        current_masks_expected = int(current_t_value * total_len)
+        next_masks_expected = int(next_t_value * total_len)
+        # How many to unmask in this step
+        tokens_to_unmask = current_masks_expected - next_masks_expected
+        if tokens_to_unmask > 0 and mask_indices.any():
+            # Get confidence scores for currently masked tokens
+            confidence_scores = top_probs[mask_indices]
+            # Sort confidence scores
+            sorted_indices = torch.argsort(confidence_scores, descending=True)
+            # Select which tokens to keep masked (the lowest confidence ones)
+            indices_to_remask = sorted_indices[tokens_to_unmask:]
+            # Get the actual indices in the sequence
+            mask_positions = torch.where(mask_indices)[1]
+            positions_to_remask = mask_positions[indices_to_remask]
+            # Remask these positions
+            x[:, positions_to_remask] = MASK_ID
+        # Ensure constraints are maintained
+        for pos, token_id in processed_constraints.items():
+            absolute_pos = prompt_length + pos
+            if absolute_pos < x.shape[1]:
+                x[:, absolute_pos] = token_id
+        # Create visualization state ONLY for the response part
+        current_state = []
+        # Update which tokens are newly revealed in this step
+        for i in range(gen_length):
+            pos = prompt_length + i  # Absolute position in the sequence
+            if x[0, pos] == MASK_ID:
+                # Still masked
+                current_state.append((MASK_TOKEN, "#444444"))  # Dark gray for masks
+            elif x_old[0, pos] == MASK_ID:
+                # Newly revealed in this step
+                token = tokenizer.decode([x[0, pos].item()], skip_special_tokens=True)
+                confidence = float(top_probs[0, pos].cpu())
+                # Color based on confidence: red (low) to green (high)
+                if confidence < 0.3:
+                    color = "#FF6666"  # Light red
+                elif confidence < 0.7:
+                    color = "#FFAA33"  # Orange
+                else:
+                    color = "#66CC66"  # Light green
+                current_state.append((token, color))
+                revealed_tokens[0, i] = True
+            else:
+                # Previously revealed
+                token = tokenizer.decode([x[0, pos].item()], skip_special_tokens=True)
+                current_state.append((token, "#6699CC"))  # Light blue
+        visualization_states.append(current_state)
+    # Extract final text (just the assistant's response)
+    response_tokens = x[0, prompt_length:]
+    response_text = tokenizer.decode(response_tokens, skip_special_tokens=True)
+    # Clean the response text
+    final_text = clean_output_text(response_text)
+    return visualization_states, final_text
+def clean_output_text(text):
+    """Clean the output text to remove special tokens and fix spacing"""
+    # Remove any remaining [MASK] tokens
+    text = text.replace(MASK_TOKEN, "")
+    # Fix common spacing issues with tokenization
+    text = re.sub(r'\s+', ' ', text)  # Remove multiple spaces
+    text = re.sub(r' \.', '.', text)  # Fix spacing before periods
+    text = re.sub(r' ,', ',', text)   # Fix spacing before commas
+    text = re.sub(r' !', '!', text)   # Fix spacing before exclamation marks
+    text = re.sub(r' \?', '?', text)  # Fix spacing before question marks
+    text = re.sub(r' ;', ';', text)   # Fix spacing before semicolons
+    text = re.sub(r' :', ':', text)   # Fix spacing before colons
+    # Fix beginning and end spacing
+    text = text.strip()
+    return text
+css = '''
+.category-legend{display:none}
+'''
+def create_chatbot_demo():
+    with gr.Blocks(css=css) as demo:
+        gr.Markdown("# LLaDA - Large Language Diffusion Model demo")
+        gr.Markdown("[model](https://huggingface.co/GSAI-ML/LLaDA-8B-Instruct), [project page](https://ml-gsai.github.io/LLaDA-demo/)")
+        # STATE MANAGEMENT - IMPORTANT
+        # We use a dedicated state to track the full conversation history
+        chat_history = gr.State([])
+        # UI COMPONENTS
+        # Chatbot for displaying messages
+        with gr.Row():
+            with gr.Column(scale=3):
+                chatbot_ui = gr.Chatbot(label="Conversation", height=500)
+                # Message input
+                with gr.Group():
+                    with gr.Row():
+                        user_input = gr.Textbox(
+                            label="Your Message",
+                            placeholder="Type your message here...",
+                            show_label=False
+                        )
+                        send_btn = gr.Button("Send")
+                constraints_input = gr.Textbox(
+                    label="Word Constraints",
+                    info="This model allows for placing specific words at specific positions using 'position:word' format. Example: 1st word once, 6th word 'upon' and 11th word 'time', would be: '0:Once, 5:upon, 10:time",
+                    placeholder="0:Once, 5:upon, 10:time",
+                    value=""
+                )
+            with gr.Column(scale=2):
+                output_vis = gr.HighlightedText(
+                    label="Denoising Process Visualization",
+                    combine_adjacent=False,
+                    show_legend=True,
+                )
+        # Visualization and response components
+        with gr.Accordion("Generation Settings", open=False):
+            with gr.Row():
+                gen_length = gr.Slider(
+                    minimum=16, maximum=128, value=64, step=8,
+                    label="Generation Length"
+                )
+                steps = gr.Slider(
+                    minimum=8, maximum=64, value=32, step=4,
+                    label="Denoising Steps"
+                )
+            visualization_delay = gr.Slider(
+                minimum=0.0, maximum=1.0, value=0.1, step=0.1, visible=False,
+                label="Visualization Delay (seconds)"
+            )
+        # Current response text box
+        current_response = gr.Textbox(
+            label="Current Response",
+            placeholder="The assistant's response will appear here...",
+            lines=3,
+            visible=False
+        )
+        # Clear button
+        clear_btn = gr.Button("Clear Conversation")
+        # Example inputs
+        gr.Examples(
+            [
+                ["Tell me a short joke", 64, 32, ""],
+                ["Write a short story", 64, 32, "0:Once, 5:upon, 10:time"],
+                ["Explain quantum computing", 64, 32, ""],
+            ],
+            [user_input, gen_length, steps, constraints_input],
+        )
+        # HELPER FUNCTIONS
+        def add_message(history, message, response):
+            """Add a message pair to the history and return the updated history"""
+            history = history.copy()
+            history.append([message, response])
+            return history
+        def user_message_submitted(message, history, gen_length, steps, constraints, delay):
+            """Process a submitted user message"""
+            # Skip empty messages
+            if not message.strip():
+                # Return current state unchanged
+                history_for_display = history.copy()
+                return history, history_for_display, "", [], ""
+            # Add user message to history
+            history = add_message(history, message, None)
+            # Format for display - temporarily show user message with empty response
+            history_for_display = history.copy()
+            # Clear the input
+            message_out = ""
+            # Return immediately to update UI with user message
+            return history, history_for_display, message_out, [], ""
+        def bot_response(history, gen_length, steps, constraints, delay):
+            """Generate bot response for the latest message"""
+            if not history:
+                return history, [], ""
+            # Get the last user message
+            last_user_message = history[-1][0]
+            try:
+                # Format all messages except the last one (which has no response yet)
+                messages = format_chat_history(history[:-1])
+                # Add the last user message
+                messages.append({"role": "user", "content": last_user_message})
+                # Parse constraints
+                parsed_constraints = parse_constraints(constraints)
+                # Generate response with visualization
+                vis_states, response_text = generate_response_with_visualization(
+                    model, tokenizer, device,
+                    messages,
+                    gen_length=gen_length,
+                    steps=steps,
+                    constraints=parsed_constraints
+                )
+                # Update history with the assistant's response
+                history[-1][1] = response_text
+                # Return the initial state immediately
+                yield history, vis_states[0], response_text
+                # Then animate through visualization states
+                for state in vis_states[1:]:
+                    time.sleep(delay)
+                    yield history, state, response_text
+            except Exception as e:
+                error_msg = f"Error: {str(e)}"
+                print(error_msg)
+                # Show error in visualization
+                error_vis = [(error_msg, "red")]
+                # Don't update history with error
+                yield history, error_vis, error_msg
+        def clear_conversation():
+            """Clear the conversation history"""
+            return [], [], "", []
+        # EVENT HANDLERS
+        # Clear button handler
+        clear_btn.click(
+            fn=clear_conversation,
+            inputs=[],
+            outputs=[chat_history, chatbot_ui, current_response, output_vis]
+        )
+        # User message submission flow (2-step process)
+        # Step 1: Add user message to history and update UI
+        msg_submit = user_input.submit(
+            fn=user_message_submitted,
+            inputs=[user_input, chat_history, gen_length, steps, constraints_input, visualization_delay],
+            outputs=[chat_history, chatbot_ui, user_input, output_vis, current_response]
+        )
+        # Also connect the send button
+        send_click = send_btn.click(
+            fn=user_message_submitted,
+            inputs=[user_input, chat_history, gen_length, steps, constraints_input, visualization_delay],
+            outputs=[chat_history, chatbot_ui, user_input, output_vis, current_response]
+        )
+        # Step 2: Generate bot response
+        # This happens after the user message is displayed
+        msg_submit.then(
+            fn=bot_response,
+            inputs=[chat_history, gen_length, steps, constraints_input, visualization_delay],
+            outputs=[chatbot_ui, output_vis, current_response]
+        )
+        send_click.then(
+            fn=bot_response,
+            inputs=[chat_history, gen_length, steps, constraints_input, visualization_delay],
+            outputs=[chatbot_ui, output_vis, current_response]
+        )
+    return demo
+# Launch the demo
+if __name__ == "__main__":
+    demo = create_chatbot_demo()
+    demo.queue().launch(share=True)