#!/usr/bin/env python3
"""
Test script for full fine-tuned model inference
"""
import logging

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

# Set up logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)


def test_full_fine_tuned_model():
    """Test the full fine-tuned model loading and generation"""
    model_id = "Tonic/petite-elle-L-aime-3-sft"
    device = "cuda" if torch.cuda.is_available() else "cpu"
    logger.info(f"Testing full fine-tuned model on device: {device}")
    try:
        # Load tokenizer
        logger.info("Loading tokenizer...")
        tokenizer = AutoTokenizer.from_pretrained(model_id)
        # Fall back to the EOS token for padding if no pad token is defined
        if tokenizer.pad_token_id is None:
            tokenizer.pad_token_id = tokenizer.eos_token_id
        # Load full fine-tuned model
        logger.info("Loading full fine-tuned model...")
        model_kwargs = {
            "device_map": "auto" if device == "cuda" else "cpu",
            "torch_dtype": torch.float16 if device == "cuda" else torch.float32,
            "trust_remote_code": True,
            "low_cpu_mem_usage": True,
        }
        model = AutoModelForCausalLM.from_pretrained(model_id, **model_kwargs)
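        # Note: device_map="auto" lets Accelerate place the weights on the available
        # GPU(s), and low_cpu_mem_usage reduces peak host RAM while the checkpoint is
        # loaded; both are optional and can be dropped for a plain single-device load.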
        # Test generation
        test_prompt = "Bonjour, comment allez-vous?"
        inputs = tokenizer(test_prompt, return_tensors="pt")
        if device == "cuda":
            inputs = {k: v.cuda() for k, v in inputs.items()}
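        # Note: .cuda() moves the inputs to GPU 0, which matches the usual single-GPU
        # case; with device_map="auto" on a multi-GPU machine the inputs should instead
        # be moved to the device that holds the embedding layer.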
logger.info("Generating response...")
with torch.no_grad():
output_ids = model.generate(
inputs['input_ids'],
max_new_tokens=50,
temperature=0.7,
top_p=0.95,
do_sample=True,
attention_mask=inputs['attention_mask'],
pad_token_id=tokenizer.eos_token_id,
eos_token_id=tokenizer.eos_token_id,
)
response = tokenizer.decode(output_ids[0], skip_special_tokens=True)
assistant_response = response[len(test_prompt):].strip()
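        # Sketch of an alternative (assuming a batch size of 1): decode only the newly
        # generated token ids rather than slicing the decoded string, which is more
        # robust if the decoded text does not start with the prompt verbatim:
        #   new_ids = output_ids[0][inputs["input_ids"].shape[1]:]
        #   assistant_response = tokenizer.decode(new_ids, skip_special_tokens=True).strip()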
logger.info("βœ… Full fine-tuned model test successful!")
logger.info(f"Input: {test_prompt}")
logger.info(f"Output: {assistant_response}")
        # Check model precision status
        logger.info("Checking model precision status...")
        float16_layers = 0
        float32_layers = 0
        total_layers = 0
        for name, module in model.named_modules():
            # Only count modules that actually carry a weight tensor
            if hasattr(module, "weight") and module.weight is not None:
                total_layers += 1
                if module.weight.dtype == torch.float16:
                    float16_layers += 1
                elif module.weight.dtype == torch.float32:
                    float32_layers += 1
                    logger.info(f"Float32 layer: {name} - {module.weight.dtype}")
        logger.info(f"Float16 layers: {float16_layers}/{total_layers}")
        logger.info(f"Float32 layers: {float32_layers}/{total_layers}")
        # Clean up
        del model
        if device == "cuda":
            torch.cuda.empty_cache()
    except Exception as e:
        logger.error(f"❌ Full fine-tuned model test failed: {e}")
        import traceback
        traceback.print_exc()


if __name__ == "__main__":
    test_full_fine_tuned_model()