Update README.md
README.md

````diff
@@ -44,9 +44,11 @@ pip install -r requirements.txt
 
 Then you can enter the directory to run the following command.
 ```python
-from transformers import MllamaForConditionalGeneration, AutoProcessor
 import torch
+import requests
+
 from PIL import Image
+from transformers import MllamaForConditionalGeneration, AutoProcessor
 
 # Pooling and Normalization
 def last_pooling(last_hidden_state, attention_mask, normalize=True):
@@ -70,8 +72,8 @@ model = MllamaForConditionalGeneration.from_pretrained(
 model.eval()
 
 # Image + Text -> Text
-
-
+image = Image.open(requests.get('https://github.com/haon-chen/mmE5/blob/main/figures/example.jpg?raw=true', stream=True).raw)
+inputs = processor(text='<|image|><|begin_of_text|> Represent the given image with the following question: What is in the image', images=[image], return_tensors="pt").to("cuda")
 qry_output = last_pooling(model(**inputs, return_dict=True, output_hidden_states=True).hidden_states[-1], inputs['attention_mask'])
 
 string = 'A cat and a dog'
````
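The diff shows only the signature of `last_pooling`. As a reference, here is a minimal sketch of what a helper with this name and signature typically does, assuming standard last-token pooling with optional L2 normalization; the actual body is not part of this diff:

```python
import torch
import torch.nn.functional as F

def last_pooling(last_hidden_state, attention_mask, normalize=True):
    # Position of the last non-padded token in each sequence (assumes right padding).
    sequence_lengths = attention_mask.sum(dim=1) - 1
    batch_indices = torch.arange(last_hidden_state.shape[0], device=last_hidden_state.device)
    # Use that token's hidden state as the sequence embedding.
    reps = last_hidden_state[batch_indices, sequence_lengths]
    if normalize:
        # L2-normalize so that dot products become cosine similarities.
        reps = F.normalize(reps, p=2, dim=-1)
    return reps
```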
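The second hunk's context line shows the model being created with `MllamaForConditionalGeneration.from_pretrained(` before `model.eval()`. A sketch of that setup under stated assumptions; the checkpoint identifier and `torch_dtype` below are placeholders, not values taken from the README:

```python
import torch
from transformers import MllamaForConditionalGeneration, AutoProcessor

# Placeholder checkpoint id; substitute the checkpoint named in the README.
CHECKPOINT = "path-or-hub-id-of-the-mmE5-checkpoint"

processor = AutoProcessor.from_pretrained(CHECKPOINT)
model = MllamaForConditionalGeneration.from_pretrained(
    CHECKPOINT,
    torch_dtype=torch.bfloat16,  # assumed; the README's actual arguments are not visible in this hunk
).to("cuda")
model.eval()
```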
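The snippet ends at `string = 'A cat and a dog'`. A hypothetical continuation, showing how the pooled query embedding might be scored against that text candidate; the text-only call to `processor` and the dot-product scoring are assumptions, not lines from the diff:

```python
# Continuation of the snippet above: embed the candidate text with the same
# model and pooling, then score it against the query embedding.
string = 'A cat and a dog'
tgt_inputs = processor(text=string, return_tensors="pt").to("cuda")
tgt_output = last_pooling(
    model(**tgt_inputs, return_dict=True, output_hidden_states=True).hidden_states[-1],
    tgt_inputs['attention_mask'],
)

# Both embeddings are L2-normalized, so the dot product equals cosine similarity.
score = (qry_output * tgt_output).sum(dim=-1)
print(string, '=', score.item())
```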