MiniCPMV-RAG-PDFQA

Running on Zero

App Files Files Community

bokesyo committed on Aug 16, 2024

Commit

1b2ece1

verified ·

1 Parent(s): 9584a7d

Update app.py

Browse files

Files changed (1) hide show

app.py +44 -5

app.py CHANGED Viewed

@@ -24,6 +24,7 @@ import torch
 import os
 import numpy as np
 import json
 cache_dir = '/data/kb_cache'
 os.makedirs(cache_dir, exist_ok=True)
@@ -43,7 +44,8 @@ def calculate_md5_from_binary(binary_data):
 @spaces.GPU(duration=100)
 def add_pdf_gradio(pdf_file_binary, progress=gr.Progress()):
     global model, tokenizer
     knowledge_base_name = calculate_md5_from_binary(pdf_file_binary)
     this_cache_dir = os.path.join(cache_dir, knowledge_base_name)
@@ -88,6 +90,8 @@ def add_pdf_gradio(pdf_file_binary, progress=gr.Progress()):
 def retrieve_gradio(knowledge_base: str, query: str, topk: int):
     global model, tokenizer
     target_cache_dir = os.path.join(cache_dir, knowledge_base)
     if not os.path.exists(target_cache_dir):
@@ -180,9 +184,36 @@ device = 'cuda'
 model_path = 'RhapsodyAI/minicpm-visual-embedding-v0' # replace with your local model path
 tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
 model = AutoModel.from_pretrained(model_path, trust_remote_code=True)
 model.to(device)
 with gr.Blocks() as app:
     gr.Markdown("# Memex: OCR-free Visual Document Embedding Model as Your Personal Librarian")
     gr.Markdown("""The model only takes images as document-side inputs and produce vectors representing document pages. Memex is trained with over 200k query-visual document pairs, including textual document, visual document, arxiv figures, plots, charts, industry documents, textbooks, ebooks, and openly-available PDFs, etc. Its performance is on a par with our ablation text embedding model on text-oriented documents, and an advantages on visually-intensive documents.
@@ -214,10 +245,6 @@ Our model is capable of:
         topk_input = inputs=gr.Number(value=3, minimum=1, maximum=5, step=1, label="Number of pages to retrieve")
         retrieve_button = gr.Button("Retrieve")
-    with gr.Row():
-        downvote_button = gr.Button("🤣Downvote")
-        upvote_button = gr.Button("🤗Upvote")
     with gr.Row():
         images_output = gr.Gallery(label="Retrieved Pages")
@@ -228,6 +255,18 @@ Our model is capable of:
     gr.Markdown("By using this demo, you agree to share your use data with us for research purpose, to help improve user experience.")
 app.launch()

 import os
 import numpy as np
 import json
+from io import Bytes
 cache_dir = '/data/kb_cache'
 os.makedirs(cache_dir, exist_ok=True)
 @spaces.GPU(duration=100)
 def add_pdf_gradio(pdf_file_binary, progress=gr.Progress()):
     global model, tokenizer
+    model.eval()
     knowledge_base_name = calculate_md5_from_binary(pdf_file_binary)
     this_cache_dir = os.path.join(cache_dir, knowledge_base_name)
 def retrieve_gradio(knowledge_base: str, query: str, topk: int):
     global model, tokenizer
+    model.eval()
     target_cache_dir = os.path.join(cache_dir, knowledge_base)
     if not os.path.exists(target_cache_dir):
 model_path = 'RhapsodyAI/minicpm-visual-embedding-v0' # replace with your local model path
 tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
 model = AutoModel.from_pretrained(model_path, trust_remote_code=True)
+model.eval()
 model.to(device)
+def answer_question(images, question):
+    print("model load begin...")
+    gen_model_path = 'openbmb/MiniCPM-V-2_6'
+    gen_tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
+    gen_model =  AutoModel.from_pretrained(model_path, trust_remote_code=True, attn_implementation='sdpa', torch_dtype=torch.bfloat16)
+    gen_model.eval()
+    gen_model.to(device)
+    print("model load success!")
+    images_ = [image.convert('RGB') for image in images]
+    msgs = [{'role': 'user', 'content': [*images_, question]}]
+    answer = gen_model.chat(
+        image=None,
+        msgs=msgs,
+        tokenizer=gen_tokenizer
+    )
+    print(answer)
+    return answer
 with gr.Blocks() as app:
     gr.Markdown("# Memex: OCR-free Visual Document Embedding Model as Your Personal Librarian")
     gr.Markdown("""The model only takes images as document-side inputs and produce vectors representing document pages. Memex is trained with over 200k query-visual document pairs, including textual document, visual document, arxiv figures, plots, charts, industry documents, textbooks, ebooks, and openly-available PDFs, etc. Its performance is on a par with our ablation text embedding model on text-oriented documents, and an advantages on visually-intensive documents.
         topk_input = inputs=gr.Number(value=3, minimum=1, maximum=5, step=1, label="Number of pages to retrieve")
         retrieve_button = gr.Button("Retrieve")
     with gr.Row():
         images_output = gr.Gallery(label="Retrieved Pages")
     gr.Markdown("By using this demo, you agree to share your use data with us for research purpose, to help improve user experience.")
+    with gr.Row():
+        button = gr.Button("Answer Question with Retrieved Pages")
+        gen_model_response = gr.Textbox(label="MiniCPM-V-2.6's Answer")
+        button.click(fn=answer_question, inputs=[images_output, query_input], outputs=gen_model_response)
+    with gr.Row():
+        downvote_button = gr.Button("🤣Downvote")
+        upvote_button = gr.Button("🤗Upvote")
 app.launch()