# Requires: pip install transformers accelerate qwen-vl-utils
from transformers import AutoProcessor, AutoModelForCausalLM
from qwen_vl_utils import process_vision_info

model_path = "lmms-lab/LLaVA-One-Vision-1.5-8B-Instruct"

# Load the model with automatic dtype selection and device placement;
# trust_remote_code=True is needed because the checkpoint ships custom code.
model = AutoModelForCausalLM.from_pretrained(
    model_path, torch_dtype="auto", device_map="auto", trust_remote_code=True
)
processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)

# A single-turn conversation: one image (given by URL) plus a text prompt.
messages = [
    {
        "role": "user",
        "content": [
            {
                "type": "image",
                "image": "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen-VL/assets/demo.jpeg",
            },
            {"type": "text", "text": "Describe this image."},
        ],
    }
]

# Render the chat template to a prompt string and extract the vision inputs.
text = processor.apply_chat_template(
    messages, tokenize=False, add_generation_prompt=True
)
image_inputs, video_inputs = process_vision_info(messages)
inputs = processor(
    text=[text],
    images=image_inputs,
    videos=video_inputs,
    padding=True,
    return_tensors="pt",
)
inputs = inputs.to("cuda")

# Generate, then slice the prompt tokens off each sequence so that only the
# newly generated completion is decoded.
generated_ids = model.generate(**inputs, max_new_tokens=1024)
generated_ids_trimmed = [
    out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
]
output_text = processor.batch_decode(
    generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
)
print(output_text)
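
# For offline use, qwen_vl_utils also accepts local file paths (and PIL
# images) in place of URLs. A minimal sketch, assuming the file:// form
# documented for qwen_vl_utils; the path below is a placeholder, not a file
# shipped with this repo:
messages = [
    {
        "role": "user",
        "content": [
            {"type": "image", "image": "file:///path/to/demo.jpeg"},  # placeholder path
            {"type": "text", "text": "Describe this image."},
        ],
    }
]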