# Minimal loader that uses transformers to load a multimodal model if one is available.
# This is a thin adapter: it expects model checkpoints on the Hugging Face Hub that are
# compatible with the transformers Auto classes (AutoTokenizer / AutoModelForCausalLM /
# AutoProcessor). For full TinyLLaVA upstream functionality, replace this with the full repo.

from transformers import AutoTokenizer, AutoModelForCausalLM, AutoProcessor
import torch

def load_pretrained_model(model_path: str, model_base=None, model_name: str=None):
    """
    Minimal loader:
    - tokenizer: AutoTokenizer.from_pretrained(model_path)
    - model: AutoModelForCausalLM.from_pretrained(model_path, device_map="auto" if cuda else None)
    - image_processor: AutoProcessor.from_pretrained(model_path) or AutoProcessor from a known vision model
    Returns: tokenizer, model, image_processor, context_len
    """
    # model_base is accepted only for call-signature compatibility with the upstream
    # loader and is ignored here; model_name defaults to the last path component.
    if model_name is None:
        model_name = model_path.split("/")[-1]
    # Load tokenizer
    tokenizer = AutoTokenizer.from_pretrained(model_path, use_fast=True)

    # Try to load an image processor / processor; fallback to using a BLIP processor if available
    try:
        image_processor = AutoProcessor.from_pretrained(model_path)
    except Exception:
        # fallback: try a common image processor (BLIP)
        try:
            image_processor = AutoProcessor.from_pretrained("Salesforce/blip-image-captioning-large")
        except Exception:
            image_processor = None

    # Load causal LM; place it on GPU automatically when CUDA is available, as the
    # docstring describes (device_map="auto" requires the `accelerate` package).
    model = AutoModelForCausalLM.from_pretrained(
        model_path,
        torch_dtype=torch.float32,
        low_cpu_mem_usage=True,
        device_map="auto" if torch.cuda.is_available() else None,
    )
    # Context length: use the tokenizer's model_max_length if it is set to a sane value.
    # Unconfigured tokenizers report a huge sentinel (~1e30), so fall back to 2048 then.
    context_len = getattr(tokenizer, "model_max_length", 2048)
    if context_len is None or context_len > 1_000_000:
        context_len = 2048
    return tokenizer, model, image_processor, context_len
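

# Usage sketch (assumes a transformers-compatible checkpoint; the repo id below is a
# placeholder, not a checkpoint this adapter is known to ship with).
if __name__ == "__main__":
    tokenizer, model, image_processor, context_len = load_pretrained_model(
        "your-org/your-model"  # hypothetical checkpoint id; replace with a real one
    )
    print(type(model).__name__, "loaded with context length", context_len)
    print("image processor available:", image_processor is not None)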