import gradio as gr
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
import spaces
import re
from typing import List, Dict, Tuple, Optional
import sys
from pathlib import Path

# Add rag-db to the import path
sys.path.append(str(Path(__file__).parent / "rag-db"))

from retriever import create_retriever, GprMaxRAGRetriever

# Initialize the model and tokenizer
MODEL_NAME = "jfang/gprmax-ft-Qwen3-4B-Instruct"
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

print(f"Loading model: {MODEL_NAME}")
print(f"Using device: {DEVICE}")

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    torch_dtype=torch.float16 if DEVICE == "cuda" else torch.float32,
    device_map="auto",
    trust_remote_code=True
)
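# Illustrative sketch (comment only, not executed): a minimal smoke test of the
# loaded model using the same chat-template flow the app uses further below.
# The prompt text is an arbitrary example, not part of the application.
#
#   _msgs = [{"role": "user", "content": "What does the #domain command do?"}]
#   _prompt = tokenizer.apply_chat_template(_msgs, tokenize=False, add_generation_prompt=True)
#   _inputs = tokenizer([_prompt], return_tensors="pt").to(model.device)
#   _out = model.generate(**_inputs, max_new_tokens=64)
#   print(tokenizer.decode(_out[0][_inputs["input_ids"].shape[1]:], skip_special_tokens=True))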
# Initialize the RAG retriever
RAG_DB_PATH = Path(__file__).parent / "rag-db" / "chroma_db"
retriever: Optional[GprMaxRAGRetriever] = None


def generate_database_if_needed():
    """Generate the RAG database if it doesn't exist yet."""
    if not RAG_DB_PATH.exists():
        print("=" * 60)
        print("RAG database not found. Generating database...")
        print("This is a one-time process and may take a few minutes.")
        print("=" * 60)
        import subprocess
        try:
            # Run the generation script
            result = subprocess.run(
                ["python", str(Path(__file__).parent / "rag-db" / "generate_db.py")],
                capture_output=True,
                text=True,
                check=True
            )
            print(result.stdout)
            print("✅ Database generated successfully!")
            return True
        except subprocess.CalledProcessError as e:
            print(f"❌ Failed to generate database: {e}")
            if e.stderr:
                print(f"Error output: {e.stderr}")
            return False
    return True
# Generate the database if needed, then load the retriever
if generate_database_if_needed():
    try:
        print(f"Loading RAG database from {RAG_DB_PATH}")
        retriever = create_retriever(db_path=RAG_DB_PATH)
        print("RAG database loaded successfully")
    except Exception as e:
        print(f"Error loading RAG database: {e}")
        print("RAG features will be disabled.")
        retriever = None
else:
    print("RAG features will be disabled due to database generation failure.")
    retriever = None
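# Illustrative sketch (comment only, not executed): querying the retriever
# directly. This assumes the GprMaxRAGRetriever interface used later in this
# file: a .search(query, k) method returning hits with .text, .metadata, .score.
#
#   if retriever is not None:
#       for hit in retriever.search("ricker wavelet source", k=3):
#           print(hit.score, hit.metadata.get("source", "Unknown"), hit.text[:80])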
# The `spaces` import above is only used by the decorator below. Assumption:
# this Space runs on ZeroGPU ("Running on Zero"), where GPU access has to be
# requested per call via @spaces.GPU.
@spaces.GPU
def generate_response_stream(
    message: str,
    history: List[Dict[str, str]],
    system_message: str,
    max_tokens: int,
    temperature: float,
    top_p: float,
):
| """ | |
| Generate streaming response using the fine-tuned Qwen3 model. | |
| Returns both thinking content and main response separately. | |
| Args: | |
| message: User's input message | |
| history: Conversation history | |
| system_message: System prompt | |
| max_tokens: Maximum tokens to generate | |
| temperature: Sampling temperature | |
| top_p: Nucleus sampling parameter | |
| Yields: | |
| Tuple of (thinking_content, response_content) | |
| """ | |
    # Construct messages in the Qwen3 chat format
    messages = []
    if system_message:
        messages.append({"role": "system", "content": system_message})

    # Add conversation history
    for msg in history:
        if msg.get("role") and msg.get("content"):
            messages.append({"role": msg["role"], "content": msg["content"]})

    # Add the current user message with a thinking instruction
    thinking_instruction = "Please think step by step about this problem, showing your reasoning process."
    messages.append({"role": "user", "content": f"{thinking_instruction}\n\n{message}"})

    # Prepare the input using Qwen's chat template
    text = tokenizer.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True
    )
    inputs = tokenizer([text], return_tensors="pt").to(model.device)

    # Set up streaming
    from transformers import TextIteratorStreamer
    from threading import Thread

    streamer = TextIteratorStreamer(
        tokenizer,
        skip_prompt=True,
        skip_special_tokens=True,
        timeout=60.0
    )

    # Generation kwargs
    generation_kwargs = dict(
        **inputs,
        max_new_tokens=max_tokens,
        temperature=temperature,
        top_p=top_p,
        do_sample=temperature > 0,
        pad_token_id=tokenizer.eos_token_id,
        eos_token_id=tokenizer.eos_token_id,
        streamer=streamer
    )
    # Start generation in a separate thread
    thread = Thread(target=model.generate, kwargs=generation_kwargs)
    thread.start()

    # Collect and yield tokens
    full_response = ""
    thinking_buffer = ""
    main_buffer = ""
    in_thinking = False
    thinking_complete = False
    think_start_seen = False

    for new_text in streamer:
        full_response += new_text

        # Check whether a <think> block is starting
        if "<think>" in full_response and not think_start_seen:
            think_start_seen = True
            think_start_idx = full_response.find("<think>")
            # Any content before <think> is main content
            main_buffer = full_response[:think_start_idx]
            in_thinking = True
            # Start capturing everything after <think>
            thinking_buffer = full_response[think_start_idx + 7:]  # Skip "<think>" itself
            # Yield what we have so far
            yield ("", main_buffer)
            continue
        # If we're inside the thinking block
        if in_thinking and not thinking_complete:
            # Update the thinking buffer with the latest content
            current_pos = full_response.find("<think>") + 7
            thinking_buffer = full_response[current_pos:]

            # Check whether thinking is complete
            if "</think>" in thinking_buffer:
                # Extract the content before </think>
                end_idx = thinking_buffer.find("</think>")
                final_thinking = thinking_buffer[:end_idx].strip()
                # Content after </think> belongs to the main response
                after_thinking = thinking_buffer[end_idx + 8:]  # Skip "</think>"
                main_buffer = main_buffer + after_thinking
                thinking_complete = True
                in_thinking = False
                # Yield the final thinking and the updated main content
                yield (final_thinking, main_buffer)
            else:
                # Still accumulating thinking content - stream it in real time,
                # but hide any partially generated closing tag (e.g. "</thi") at the end
                display_thinking = thinking_buffer
                partial_close_tags = ("</", "</t", "</th", "</thi", "</thin", "</think")
                if display_thinking.endswith(partial_close_tags):
                    display_thinking = display_thinking[:display_thinking.rfind("<")]
                yield (display_thinking.strip(), main_buffer)
            continue
        # Normal streaming after thinking is complete, or if there was no thinking at all
        if thinking_complete or not think_start_seen:
            if thinking_complete:
                # Remove the entire thinking block and stream the rest
                clean_response = re.sub(r'<think>.*?</think>', '', full_response, flags=re.DOTALL)
                main_buffer = clean_response
            else:
                # No thinking tags seen yet, everything is main content
                main_buffer = full_response

            # Re-extract the final thinking content if it exists
            if thinking_complete:
                think_match = re.search(r'<think>(.*?)</think>', full_response, re.DOTALL)
                if think_match:
                    final_thinking = think_match.group(1).strip()
                    yield (final_thinking, main_buffer)
                else:
                    yield ("", main_buffer)
            else:
                yield ("", main_buffer)

    # Final cleanup - handle an incomplete thinking block
    if in_thinking:
        # Generation ended while still in thinking mode - likely hit the token limit
        incomplete_thinking_msg = thinking_buffer.strip() if thinking_buffer else ""
        if incomplete_thinking_msg:
            # Add a warning about the incomplete thinking
            incomplete_thinking_msg += "\n\n⚠️ **Thinking was cut off due to the token limit. Try increasing 'Max New Tokens' in settings.**"
        # Show the incomplete-thinking warning in the main response too
        error_msg = "⚠️ *The AI's thinking process was interrupted due to the token limit. Try increasing 'Max New Tokens' and retry.*"
        yield (incomplete_thinking_msg, error_msg)

    thread.join()
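# Illustrative sketch (comment only, not executed): how the streaming generator
# above is consumed. Each iteration yields the latest (thinking, response) pair,
# so a caller only needs to keep the most recent values; respond() below does
# exactly this. get_default_system_prompt() is defined further down in this file.
#
#   final_thinking, final_response = "", ""
#   for thinking, response in generate_response_stream(
#       message="How do I add a Ricker wavelet source?",
#       history=[],
#       system_message=get_default_system_prompt(),
#       max_tokens=512,
#       temperature=0.7,
#       top_p=0.95,
#   ):
#       final_thinking, final_response = thinking or final_thinking, response
#   print(final_response)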
# Tool definitions in Qwen3 format
TOOLS = [
    {
        "type": "function",
        "function": {
            "name": "search_documentation",
            "description": "Search gprMax documentation for relevant information about commands, syntax, parameters, or usage",
            "parameters": {
                "type": "object",
                "properties": {
                    "query": {
                        "type": "string",
                        "description": "The search query to find relevant documentation"
                    },
                    "num_results": {
                        "type": "integer",
                        "description": "Number of results to return",
                        "default": 10
                    }
                },
                "required": ["query"]
            }
        }
    }
]


def format_tools_prompt() -> str:
    """Format the tool definitions for inclusion in the system prompt."""
    import json
    return json.dumps(TOOLS, indent=2)
def perform_rag_search(query: str, k: int = 10) -> Tuple[str, List[Dict]]:
    """
    Perform a RAG search and return formatted context and sources.

    Returns:
        Tuple of (context_for_llm, source_list_for_display)
    """
    if not retriever:
        print("[DEBUG] Retriever is None!")
        return "", []

    try:
        print(f"[DEBUG] Searching for: '{query}' with k={k}")
        # Search for relevant documents
        results = retriever.search(query, k=k)
        print(f"[DEBUG] Search returned {len(results) if results else 0} results")

        if not results:
            return "", []

        # Format context for the LLM and collect display metadata
        context_parts = []
        source_list = []

        for i, result in enumerate(results, 1):
            # Pass the full document text to the LLM context
            context_parts.append(f"[Document {i}]: {result.text}")
            # Add to the source list for display (limited preview)
            source_list.append({
                "index": i,
                "source": result.metadata.get("source", "Unknown"),
                "score": result.score,
                "preview": result.text[:150] + "..." if len(result.text) > 150 else result.text
            })

        context = "\n\n".join(context_parts)
        return context, source_list

    except Exception as e:
        print(f"[DEBUG] RAG search error: {e}")
        import traceback
        traceback.print_exc()
        return "", []
def respond(
    message: str,
    history: List[Dict[str, str]],
    system_message: str,
    max_tokens: int,
    temperature: float,
    top_p: float,
):
    """
    Top-level response function with Qwen3-style tool calling.
    """
    import json
    sources_content = ""

    try:
        # Use the system message as-is (it already includes the tool definitions)
        system_with_tools = system_message

        # First, get an initial response from the model to see whether it wants to use a tool
        tool_call = None
        accumulated_response = ""
        final_thinking = ""
        is_complete = False

        # Collect the full response (thinking + potential tool call)
        for thinking, response in generate_response_stream(
            message=message,
            history=history,
            system_message=system_with_tools,
            max_tokens=max_tokens,
            temperature=temperature,
            top_p=top_p,
        ):
            final_thinking = thinking if thinking else final_thinking
            accumulated_response = response
            # Show thinking progress only
            if thinking:
                yield thinking, "⏳ *AI is analyzing your request...*", sources_content

        # After streaming completes, check what we got
        if accumulated_response and accumulated_response.strip():
            # Check whether the complete response is a JSON tool call
            if accumulated_response.strip().startswith('{'):
                try:
                    # Try to parse the entire response as JSON
                    response_json = json.loads(accumulated_response.strip())
                    if "tool_call" in response_json:
                        tool_call = response_json["tool_call"]
                        # Show a status while the tool call is processed
                        yield final_thinking, "🔍 *Processing documentation search request...*", sources_content
                        is_complete = True
                except json.JSONDecodeError:
                    # Invalid JSON, treat it as a normal response
                    yield final_thinking, accumulated_response, sources_content
                    is_complete = True
                except Exception:
                    yield final_thinking, accumulated_response, sources_content
                    is_complete = True
            else:
                # It's a normal text response, not a tool call
                yield final_thinking, accumulated_response, sources_content
                is_complete = True
        # If a tool was called, execute it
        if tool_call and retriever:
            tool_name = tool_call.get("name")
            print(f"[DEBUG] Tool called: {tool_name}")
            print(f"[DEBUG] Tool call details: {tool_call}")

            if tool_name == "search_documentation":
                # Update status
                yield "🔍 *Searching documentation...*", "⏳ *Preparing to search...*", "🔍 *Retrieving relevant documents...*"

                # Get the search query
                query = tool_call.get("arguments", {}).get("query", message)
                num_results = tool_call.get("arguments", {}).get("num_results", 10)
                print(f"[DEBUG] Query extracted: '{query}', num_results: {num_results}")

                # Perform the search
                context, sources_list = perform_rag_search(query, k=num_results)
                print(f"[DEBUG] Search results - Context length: {len(context)}, Sources: {len(sources_list)}")

                if context:
                    # Format sources for display
                    if sources_list:
                        sources_parts = ["## 📚 Documentation Sources\n"]
                        for source in sources_list:
                            sources_parts.append(
                                f"**[{source['index']}] {source['source']}** (Score: {source['score']:.3f})\n"
                                f"```\n{source['preview']}\n```\n"
                            )
                        sources_content = "\n".join(sources_parts)
                    else:
                        sources_content = "*No relevant documentation found*"

                    yield "✅ *Documentation retrieved*", "⏳ *Generating response with context...*", sources_content

                    # Now generate a response with the retrieved context
                    augmented_message = f"""Tool call result for search_documentation:
{context}
Original question: {message}
Please provide a comprehensive answer based on the documentation above."""
                    # Generate the final response with the retrieved context
                    for thinking, response in generate_response_stream(
                        message=augmented_message,
                        history=history,
                        system_message=system_message,  # Use the original system message for the final response
                        max_tokens=max_tokens,
                        temperature=temperature,
                        top_p=top_p,
                    ):
                        yield thinking, response, sources_content
                else:
                    sources_content = "*No relevant documentation found*"
                    yield final_thinking, "⚠️ *Unable to retrieve documentation. Providing a general answer...*", sources_content

                    # Generate a response without documentation context
                    fallback_message = f"""The user asked about: {message}
No relevant documentation was found in the database. Please provide a helpful answer based on your general knowledge of gprMax."""

                    for thinking, response in generate_response_stream(
                        message=fallback_message,
                        history=history,
                        system_message=system_message,
                        max_tokens=max_tokens,
                        temperature=temperature,
                        top_p=top_p,
                    ):
                        yield thinking, response, sources_content

        # If a tool was called but the retriever is not available
        elif tool_call and not retriever:
            yield final_thinking, "⚠️ *Documentation search is not available. Providing an answer based on general knowledge...*", ""

            # Generate a response without RAG
            for thinking, response in generate_response_stream(
                message=message,
                history=history,
                system_message=system_message,
                max_tokens=max_tokens,
                temperature=temperature,
                top_p=top_p,
            ):
                yield thinking, response, ""

        # If there was no tool call and the response wasn't already yielded
        elif not tool_call and not is_complete:
            # This shouldn't happen, but handle it just in case
            if accumulated_response and not accumulated_response.strip().startswith('{'):
                yield final_thinking, accumulated_response, sources_content

    except Exception as e:
        error_message = f"❌ Error generating response: {str(e)}"
        yield "", error_message, ""
# Default system prompt for gprMax assistance
def get_default_system_prompt():
    """Get the default system prompt with the tool definitions included."""
    tools_json = format_tools_prompt()
    return f"""You are a helpful assistant specialized in gprMax, an open-source software package that simulates electromagnetic wave propagation. You help users with:
1. Creating gprMax input files (.in files)
2. Understanding gprMax commands and syntax
3. Setting up simulations for GPR (Ground Penetrating Radar) and other EM applications
4. Troubleshooting simulation issues
5. Optimizing simulation parameters
You have access to the following tools:
{tools_json}
When you need to search documentation, respond with a tool call in this JSON format:
{{
    "thought": "I need to search the documentation for...",
    "tool_call": {{
        "name": "search_documentation",
        "arguments": {{
            "query": "your search query here"
        }}
    }}
}}
After receiving tool results, provide a comprehensive answer based on the documentation.
If you give code blocks, make sure to enclose them in ```.
There is no need to always give a full input file; be sure to understand what the user needs and intends to do. Sometimes a single line of code is enough, and sometimes the user wants an explanation rather than code.
Provide clear, accurate, and practical guidance for gprMax users."""
# Create the custom interface with a collapsible thinking section
with gr.Blocks(title="gprMax Support", theme=gr.themes.Ocean()) as demo:
    gr.Markdown(
        """
# 📡 gprMax Support Assistant
Welcome to the gprMax Support Assistant, powered by a Qwen3-4B model fine-tuned specifically for gprMax assistance.
### Features:
- 💬 **Expert Guidance**: Get help with gprMax input files, commands, and simulations
- 🧠 **Transparent Reasoning**: See the AI's thinking process in a dedicated collapsible section
- 📚 **Documentation Search**: RAG-powered retrieval of relevant gprMax documentation
- 🤖 **Agent Mode**: Coming soon - Automated workflow assistance
        """
    )
    with gr.Row():
        with gr.Column(scale=3):
            # Chat interface
            chatbot = gr.Chatbot(
                label="Chat",
                type="messages",
                height=500,
                show_copy_button=True,
            )
            with gr.Row():
                msg = gr.Textbox(
                    label="Message",
                    placeholder="Ask about gprMax commands, simulations, or troubleshooting...",
                    lines=2,
                    scale=4,
                )
                submit_btn = gr.Button("Send", variant="primary", scale=1)
            with gr.Row():
                clear_btn = gr.Button("🗑️ Clear Chat", scale=1)

            # Example questions
            gr.Examples(
                examples=[
                    "How do I create a basic gprMax input file for a simple GPR simulation?",
                    "What's the difference between #domain and #dx_dy_dz commands?",
                    "How can I model a heterogeneous soil with different dielectric properties?",
                    "My simulation is taking too long. How can I optimize it?",
                    "How do I add a Ricker wavelet source to my model?",
                ],
                inputs=msg,
                label="Example Questions",
            )

        with gr.Column(scale=2):
            # Thinking process in a collapsible accordion
            with gr.Accordion("🧠 AI Thinking Process", open=False) as thinking_accordion:
                thinking_display = gr.Markdown(
                    value="*Thinking process will appear here when the AI is reasoning through your question...*",
                    label="Thinking",
                    height=300,
                )

            # Documentation sources in a collapsible accordion
            with gr.Accordion("📚 Documentation Sources", open=False) as sources_accordion:
                sources_display = gr.Markdown(
                    value="*Documentation sources will appear here when RAG search is performed...*",
                    label="Sources",
                    height=300,
                )
            # Settings
            with gr.Accordion("⚙️ Settings", open=True):
                system_message = gr.Textbox(
                    value=get_default_system_prompt(),
                    label="System Message",
                    lines=5,
                    info="Customize the assistant's behavior"
                )
                max_tokens = gr.Slider(
                    minimum=1,
                    maximum=4096,
                    value=1536,
                    step=1,
                    label="Max New Tokens",
                    info="Maximum length of the generated response"
                )
                temperature = gr.Slider(
                    minimum=0.0,
                    maximum=2.0,
                    value=0.7,
                    step=0.1,
                    label="Temperature",
                    info="Controls randomness (0 = deterministic, higher = more creative)"
                )
                top_p = gr.Slider(
                    minimum=0.1,
                    maximum=1.0,
                    value=0.95,
                    step=0.05,
                    label="Top-p (Nucleus Sampling)",
                    info="Controls diversity of responses"
                )
    # Chat functionality
    def user_submit(message, history):
        if not message:
            return "", history
        history = history + [{"role": "user", "content": message}]
        return "", history

    def bot_respond(history, system_msg, max_tok, temp, top_p_val):
        if not history or history[-1]["role"] != "user":
            yield history, "*No thinking process*", "*No sources*"
            return

        user_message = history[-1]["content"]
        history_for_model = history[:-1]  # Exclude the last user message

        # Add a placeholder for the assistant response
        history = history + [{"role": "assistant", "content": ""}]

        thinking_text = ""
        sources_text = ""
        is_thinking = False
        has_main_content = False
        is_searching = False

        for thinking, response, sources in respond(
            user_message,
            history_for_model,
            system_msg,
            max_tok,
            temp,
            top_p_val
        ):
            # Update the thinking display
            if thinking:
                if "Searching documentation" in thinking:
                    thinking_text = thinking
                    is_searching = True
                elif "Documentation retrieved" in thinking:
                    thinking_text = thinking
                    is_searching = False
                else:
                    thinking_text = f"## Reasoning Process\n\n{thinking}"
                    is_thinking = True
            elif not thinking and not is_searching:
                thinking_text = "*Waiting for response...*"

            # Update the sources display
            if sources:
                sources_text = sources

            # Update the chat response
            if response and response.strip():
                if "Preparing to search" in response or "Generating response" in response:
                    # Transient status message
                    history[-1]["content"] = response
                else:
                    # Actual response content
                    history[-1]["content"] = response
                    has_main_content = True
            elif is_thinking and not has_main_content:
                # Still thinking, no main response yet
                history[-1]["content"] = "🤔 *AI is thinking... Check the right pane for thinking details*"
            elif is_searching:
                history[-1]["content"] = "🔍 *Searching documentation...*"
            elif not response:
                # No response yet and no thinking detected
                history[-1]["content"] = "⏳ *Generating response...*"

            yield history, thinking_text, sources_text
    # Event handlers
    msg.submit(user_submit, [msg, chatbot], [msg, chatbot]).then(
        bot_respond,
        [chatbot, system_message, max_tokens, temperature, top_p],
        [chatbot, thinking_display, sources_display]
    )

    submit_btn.click(user_submit, [msg, chatbot], [msg, chatbot]).then(
        bot_respond,
        [chatbot, system_message, max_tokens, temperature, top_p],
        [chatbot, thinking_display, sources_display]
    )

    clear_btn.click(
        lambda: (
            [],
            "*Thinking process will appear here when the AI is reasoning through your question...*",
            "*Documentation sources will appear here when RAG search is performed...*"
        ),
        outputs=[chatbot, thinking_display, sources_display]
    )
    # RAG status indicator
    rag_status = "✅ Documentation search enabled" if retriever else "⚠️ Documentation search disabled (run generate_db.py)"
    gr.Markdown(
        f"""
---
### About
This assistant uses `jfang/gprmax-ft-Qwen3-4B-Instruct`, a model fine-tuned specifically for gprMax support.

**RAG Status**: {rag_status}

**Note**: For best results, be specific about your gprMax version and simulation requirements.
        """
    )
if __name__ == "__main__":
    demo.launch()
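# Note (assumption, not part of this Space): for local debugging one could pass
# standard Gradio launch options instead of the defaults, e.g.
# demo.launch(share=True) for a temporary public URL, or
# demo.launch(server_port=7860) to pin the port.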