Eighth commit
app.py
CHANGED
@@ -78,9 +78,9 @@ processor = ViTImageProcessor.from_pretrained('microsoft/swin-tiny-patch4-window
 
 def m1(que, image):
     processor3 = BlipProcessor.from_pretrained("Salesforce/blip-vqa-capfilt-large")
-    model3 = BlipForQuestionAnswering.from_pretrained("Salesforce/blip-vqa-capfilt-large")
+    model3 = BlipForQuestionAnswering.from_pretrained("Salesforce/blip-vqa-capfilt-large")
 
-    inputs = processor3(image, que, return_tensors="pt")
+    inputs = processor3(image, que, return_tensors="pt")
 
     out = model3.generate(**inputs)
     return processor3.decode(out[0], skip_special_tokens=True)
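For reference, the m1 path follows the standard BLIP VQA recipe from transformers. Below is a minimal, self-contained sketch of the same flow; the image path and question are placeholders, not values taken from the Space.

# Minimal BLIP VQA sketch mirroring m1 (image path and question are placeholders).
from PIL import Image
from transformers import BlipProcessor, BlipForQuestionAnswering

processor3 = BlipProcessor.from_pretrained("Salesforce/blip-vqa-capfilt-large")
model3 = BlipForQuestionAnswering.from_pretrained("Salesforce/blip-vqa-capfilt-large")

image = Image.open("example.jpg").convert("RGB")  # placeholder input image
que = "What is shown in the picture?"             # placeholder question

# Same call chain as m1: preprocess, generate, decode.
inputs = processor3(image, que, return_tensors="pt")
out = model3.generate(**inputs)
print(processor3.decode(out[0], skip_special_tokens=True))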
@@ -102,7 +102,6 @@ def m3(que, image):
     processor3 = DonutProcessor.from_pretrained("naver-clova-ix/donut-base-finetuned-docvqa")
     model3 = VisionEncoderDecoderModel.from_pretrained("naver-clova-ix/donut-base-finetuned-docvqa")
 
-    device = "cuda" if torch.cuda.is_available() else "cpu"
     model3.to(device)
 
     prompt = "<s_docvqa><s_question>{que}</s_question><s_answer>"
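The m3 path follows the Donut DocVQA recipe. Since this commit drops the in-function device assignment while keeping model3.to(device), the sketch below assumes device is defined at module scope; the question and document image are placeholders.

# Donut DocVQA sketch (device handling, image path, and question are assumptions,
# not taken verbatim from app.py).
import re
import torch
from PIL import Image
from transformers import DonutProcessor, VisionEncoderDecoderModel

device = "cuda" if torch.cuda.is_available() else "cpu"

processor3 = DonutProcessor.from_pretrained("naver-clova-ix/donut-base-finetuned-docvqa")
model3 = VisionEncoderDecoderModel.from_pretrained("naver-clova-ix/donut-base-finetuned-docvqa")
model3.to(device)

que = "What is the invoice total?"                 # placeholder question
image = Image.open("document.png").convert("RGB")  # placeholder document image

# Build the DocVQA task prompt and the model inputs.
prompt = f"<s_docvqa><s_question>{que}</s_question><s_answer>"
decoder_input_ids = processor3.tokenizer(prompt, add_special_tokens=False, return_tensors="pt").input_ids
pixel_values = processor3(image, return_tensors="pt").pixel_values

outputs = model3.generate(
    pixel_values.to(device),
    decoder_input_ids=decoder_input_ids.to(device),
    max_length=model3.decoder.config.max_position_embeddings,
    pad_token_id=processor3.tokenizer.pad_token_id,
    eos_token_id=processor3.tokenizer.eos_token_id,
)

# Strip special tokens and the task start token, then parse the answer tags.
sequence = processor3.batch_decode(outputs)[0]
sequence = sequence.replace(processor3.tokenizer.eos_token, "").replace(processor3.tokenizer.pad_token, "")
sequence = re.sub(r"<.*?>", "", sequence, count=1).strip()
print(processor3.token2json(sequence))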
@@ -139,7 +138,7 @@ def m5(que, image):
     processor3 = AutoProcessor.from_pretrained("google/pix2struct-ocrvqa-large")
     model3 = AutoModelForSeq2SeqLM.from_pretrained("google/pix2struct-ocrvqa-large")
 
-    inputs = processor3(images=image, text=que, return_tensors="pt")
+    inputs = processor3(images=image, text=que, return_tensors="pt")
 
     predictions = model3.generate(**inputs)
     return processor3.decode(predictions[0], skip_special_tokens=True)
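m5 is the standard Pix2Struct OCR-VQA flow. Here is a minimal sketch of it, using the explicit Pix2Struct classes from the model card rather than the Auto classes in app.py; the image path and question are placeholders.

# Pix2Struct OCR-VQA sketch (explicit Pix2Struct classes; inputs are placeholders).
from PIL import Image
from transformers import Pix2StructProcessor, Pix2StructForConditionalGeneration

processor3 = Pix2StructProcessor.from_pretrained("google/pix2struct-ocrvqa-large")
model3 = Pix2StructForConditionalGeneration.from_pretrained("google/pix2struct-ocrvqa-large")

image = Image.open("book_cover.jpg").convert("RGB")  # placeholder image
que = "Who is the author of this book?"              # placeholder question

# Same call chain as m5: preprocess image + question, generate, decode.
inputs = processor3(images=image, text=que, return_tensors="pt")
predictions = model3.generate(**inputs)
print(processor3.decode(predictions[0], skip_special_tokens=True))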
@@ -148,7 +147,7 @@ def m6(que, image):
     processor3 = AutoProcessor.from_pretrained("google/pix2struct-infographics-vqa-large")
     model3 = AutoModelForSeq2SeqLM.from_pretrained("google/pix2struct-infographics-vqa-large")
 
-    inputs = processor3(images=image, text=que, return_tensors="pt")
+    inputs = processor3(images=image, text=que, return_tensors="pt")
 
     predictions = model3.generate(**inputs)
     return processor3.decode(predictions[0], skip_special_tokens=True)
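m6 is the same Pix2Struct flow with the infographics checkpoint swapped in, so the sketch above applies unchanged once "google/pix2struct-ocrvqa-large" is replaced with "google/pix2struct-infographics-vqa-large".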