Spaces:

kayfahaarukku
/

thorscribe

Sleeping

File size: 9,289 Bytes

import gradio as gr
from PIL import Image
import torch
import os
import sys
from huggingface_hub import login
from transformers import AutoConfig, AutoProcessor, AutoModelForCausalLM
import uvicorn

# Import spaces module for ZeroGPU support
try:
    import spaces
    has_spaces = True
    print("ZeroGPU support enabled via spaces module")
except ImportError:
    has_spaces = False
    print("spaces module not found, ZeroGPU features will be disabled")

# Create examples directory if it doesn't exist
os.makedirs("examples", exist_ok=True)

# Authenticate with Hugging Face Hub using environment variable
hf_token = os.environ.get("HF_TOKEN")
if hf_token:
    login(token=hf_token)
else:
    print("Warning: HF_TOKEN environment variable not set. Some features may not work.")

# Model and device setup
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using device: {device}")

# Model identifier - hardcode the correct model path instead of using environment variables
model_id = "thorscribe/thorscribe-model-3"
print(f"Using model: {model_id}")

# Determine dtype based on available hardware
if device == "cuda":
    if torch.cuda.is_bf16_supported():
        torch_dtype = torch.bfloat16
        print("Using bfloat16 precision")
    else:
        torch_dtype = torch.float16
        print("Using float16 precision")
else:
    torch_dtype = torch.float32
    print("Using float32 precision (CPU mode)")

# Calculate target dimensions - using fixed dimensions
target_size = 1024  # Use a fixed size that works well with the model
print(f"Using fixed image resolution of {target_size}x{target_size}")

def pad_to_square(image, background_color=(0, 0, 0)):
    """Pad image to square with black background"""
    if image is None:
        return None
        
    width, height = image.size
    if width == height:
        return image
    
    new_size = max(width, height)
    new_image = Image.new('RGB', (new_size, new_size), background_color)
    
    # Paste the original image centered in the square
    paste_x = (new_size - width) // 2
    paste_y = (new_size - height) // 2
    new_image.paste(image, (paste_x, paste_y))
    
    return new_image

def process_image(image, size=1024):
    """Process image to be suitable for the model"""
    if image is None:
        return None
    
    # First make the image square by padding
    image = pad_to_square(image)
    
    # Then resize to the target size
    image = image.resize((size, size), Image.LANCZOS)
    
    print(f"Processed image to {image.size[0]}x{image.size[1]}")
    return image

# Load processor first (lower memory requirements)
print(f"Loading processor from {model_id}...")
try:
    processor = AutoProcessor.from_pretrained(model_id, trust_remote_code=True)
    print("Processor loaded successfully!")
except Exception as e:
    print(f"Error loading processor: {str(e)}")
    sys.exit(1)

# Load and inspect model config via AutoConfig
try:
    cfg = AutoConfig.from_pretrained(model_id, trust_remote_code=True)
    print("Vision config - patch_size:", cfg.vision_config.patch_size)
    print("Vision config - patch_stride:", cfg.vision_config.patch_stride)
    print("Vision config - patch_padding:", cfg.vision_config.patch_padding)
except Exception as e:
    print(f"Error loading model config: {str(e)}")
    sys.exit(1)

# Load model with explicit config
try:
    print(f"Loading model from {model_id}...")
    model = AutoModelForCausalLM.from_pretrained(
        model_id,
        config=cfg,
        torch_dtype=torch_dtype,
        low_cpu_mem_usage=True,
        trust_remote_code=True
    )
    
    # Only move model to GPU when we're actually using it
    # Will be handled by the @spaces.GPU decorator
    if not has_spaces and device == "cuda":
        model.to(device)
        print("Model moved to CUDA device")
    
    print("Model loaded successfully with explicit config!")
except Exception as e:
    print(f"Error loading model: {str(e)}")
    import traceback
    print(traceback.format_exc())
    sys.exit(1)

# Default prompt to use (hidden from UI)
DEFAULT_PROMPT = "<THORSCRIBE> What does this figure show?"

# Define the generation function with ZeroGPU decorator if available
if has_spaces:
    @spaces.GPU(duration=60)  # Set appropriate duration based on your model's generation time
    def generate_caption(image):
        if image is None:
            return "Please upload an image."
        try:
            # Move model to GPU when using ZeroGPU
            model.to(device)
            
            # Process the image to be suitable for the model
            processed_image = process_image(image, size=target_size)
            
            # Process text and image separately
            pixel_values = processor.image_processor(images=processed_image, return_tensors="pt").pixel_values
            
            # Process the text with controlled parameters
            input_ids = processor.tokenizer(
                DEFAULT_PROMPT,
                return_tensors="pt",
                padding="max_length",
                max_length=77,  # Use a safe, reasonable value
                truncation=True
            ).input_ids
            
            # Build inputs dictionary
            inputs = {
                "pixel_values": pixel_values.to(device, dtype=torch_dtype),
                "input_ids": input_ids.to(device)
            }
            
            # Generate with conservative settings
            with torch.no_grad():
                outputs = model.generate(
                    **inputs,
                    max_new_tokens=50,
                    num_beams=1,
                    do_sample=False
                )

            # Decode and truncate
            text = processor.tokenizer.batch_decode(outputs, skip_special_tokens=True)[0]
            return text
        except Exception as e:
            import traceback
            trace = traceback.format_exc()
            print(f"Error: {str(e)}")
            print(trace)
            return f"Error processing image: {str(e)[:200]}. Check console for full traceback."
else:
    # Regular function without ZeroGPU
    def generate_caption(image):
        if image is None:
            return "Please upload an image."
        try:
            # Process the image to be suitable for the model
            processed_image = process_image(image, size=target_size)
            
            # Process text and image separately
            pixel_values = processor.image_processor(images=processed_image, return_tensors="pt").pixel_values
            
            # Process the text with controlled parameters
            input_ids = processor.tokenizer(
                DEFAULT_PROMPT,
                return_tensors="pt",
                padding="max_length",
                max_length=77,  # Use a safe, reasonable value
                truncation=True
            ).input_ids
            
            # Build inputs dictionary
            inputs = {
                "pixel_values": pixel_values.to(device, dtype=torch_dtype),
                "input_ids": input_ids.to(device)
            }
            
            # Generate with conservative settings
            with torch.no_grad():
                outputs = model.generate(
                    **inputs,
                    max_new_tokens=50,
                    num_beams=1,
                    do_sample=False
                )

            # Decode and truncate
            text = processor.tokenizer.batch_decode(outputs, skip_special_tokens=True)[0]
            return text
        except Exception as e:
            import traceback
            trace = traceback.format_exc()
            print(f"Error: {str(e)}")
            print(trace)
            return f"Error processing image: {str(e)[:200]}. Check console for full traceback."

# Create a simple Gradio interface without FastAPI integration
demo = gr.Interface(
    fn=generate_caption,
    inputs=gr.Image(type="pil", label="Upload Thoracic MRI/X-ray Image"),
    outputs=gr.Textbox(label="Generated Caption", lines=5, max_lines=5, show_copy_button=True),
    title="THORSCRIBE: AI-Powered Thoracic Image Captioning",
    description="THORSCRIBE is an advanced AI model that generates detailed captions for MRI and X-ray images of the thorax area. Upload your medical image to receive an informative caption." + (" (with ZeroGPU)" if has_spaces else ""),
    allow_flagging="never",
    theme=gr.themes.Monochrome(),
    examples=["examples/example1.jpg", "examples/example2.jpg", "examples/example3.jpg", "examples/example4.jpg"] if os.path.exists("examples/example1.jpg") else None,
    article="<div style='text-align: center; max-width: 800px; margin: 0 auto;'><h3>About THORSCRIBE</h3><p>THORSCRIBE is specialized in analyzing thoracic medical imagery, providing accurate descriptions of findings in MRI and X-ray images. This tool is designed to assist medical professionals in their diagnostic workflows.</p><p><small>Powered by model: thorscribe/thorscribe-model-3</small></p></div>"
)

# Launch the app - Use 7860 which is the standard port for Hugging Face Spaces
if __name__ == "__main__":
    demo.launch(
        server_name="0.0.0.0",
        server_port=7860,
        share=False,
        debug=False,
        show_error=True,
    )