import os

# Select the GPU before torch initializes CUDA, hence before the torch import.
os.environ["CUDA_VISIBLE_DEVICES"] = "0"

import torch
from transformers import Qwen2VLForConditionalGeneration, AutoProcessor
from PIL import Image
import gradio as gr
from qwen_vl_utils import process_vision_info

def load_model():
    """
    Load the merged (fine-tuned) model and its processor.
    """
    print("Loading merged model...")
    model = Qwen2VLForConditionalGeneration.from_pretrained(
        "AIBunCho/AI_bokete", torch_dtype="auto", device_map="auto",
    )
    processor = AutoProcessor.from_pretrained("AIBunCho/AI_bokete")
    print("Finished loading merged model.")
    return model, processor

def perform_inference(model, processor, image, prompt):
    """
    Run inference on a single image with the given prompt.
    """
    # Downscale to a fixed width of 512 px while preserving the aspect ratio,
    # and make sure the image is 3-channel RGB before it goes into the message.
    target_width = 512
    width_percent = target_width / float(image.size[0])
    target_height = int(float(image.size[1]) * width_percent)
    image = image.resize((target_width, target_height), Image.Resampling.LANCZOS)
    image = image.convert("RGB")

    messages = [
        {
            "role": "user",
            "content": [
                {
                    "type": "image",
                    "image": image,
                },
                {"type": "text", "text": prompt},
            ],
        }
    ]

    # Extract the image (and any video) inputs from the chat messages and
    # render the text side of the prompt with the chat template.
    image_inputs, video_inputs = process_vision_info(messages)
    text = processor.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )

    inputs = processor(
        text=[text],
        images=image_inputs,
        videos=video_inputs,
        padding=True,
        return_tensors="pt",
    )

    # from_pretrained() already placed the model via device_map="auto", so the
    # only thing left to do is to move the input tensors onto the same device.
    device = model.device
    inputs = {k: v.to(device) for k, v in inputs.items()}

    with torch.no_grad():
        generated_ids = model.generate(**inputs, max_new_tokens=128)

    # Strip the prompt tokens so that only the newly generated tokens are decoded.
    generated_ids_trimmed = [
        out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs["input_ids"], generated_ids)
    ]
    output_text = processor.batch_decode(
        generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
    )

    return output_text[0]

def main():
    model, processor = load_model()

    # Japanese prompt, roughly: "Look at the image and say something surreal
    # and funny. If there is a blank, answer by filling it in."
    prompt = "<image>画像を見てシュールで面白いことを言ってください。空欄がある場合はそれを埋めるように答えてください。"

    iface = gr.Interface(
        fn=lambda image: perform_inference(model, processor, image, prompt),
        inputs=gr.Image(type="pil"),
        outputs="text",
        title="Qwen2-VL-7B-Instruct Bokete Inference",
        description="Upload an image and a surreal, funny caption will be generated... maybe?",
        examples=[["./images/0.jpg"]],
    )

    iface.launch()

if __name__ == "__main__":
    main()
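# ---------------------------------------------------------------------------
# Minimal usage sketch without the Gradio UI (assumption: an image exists at
# ./images/0.jpg, the example path used above). Handy as a quick smoke test
# of load_model() / perform_inference() from a Python shell:
#
#   model, processor = load_model()
#   image = Image.open("./images/0.jpg")
#   print(perform_inference(
#       model, processor, image,
#       "<image>画像を見てシュールで面白いことを言ってください。空欄がある場合はそれを埋めるように答えてください。",
#   ))
# ---------------------------------------------------------------------------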