Spaces:
Running
Running
| import datetime | |
| import json | |
| import cv2 | |
| import httpx | |
| from app.config import env | |
| from app.utils import image_w_box, encode_image | |
| from agents import RunContextWrapper, function_tool | |
| from app.memory import Memory,Snapshot | |
def task(name, image):
    """Upload an encoded frame to the remote task endpoint and return its result.

    Args:
        name: Value sent as the ``name`` form field of the request.
        image: Object exposing ``tobytes()``; its bytes are uploaded as the
            multipart file ``frame.jpg`` with content type ``image/jpeg``.

    Returns:
        The ``result`` entry of the endpoint's JSON response.

    Raises:
        httpx.HTTPStatusError: If the endpoint answers with a 4xx/5xx status.
    """
    url = f"{env.end_task}"
    form_fields = {"name": name}
    upload = {"file": ("frame.jpg", image.tobytes(), "image/jpeg")}
    auth_headers = {"Authorization": env.api_key}

    reply = httpx.post(
        url,
        data=form_fields,
        files=upload,
        timeout=10,
        headers=auth_headers,
    )
    # Surface HTTP errors before trying to decode the body.
    reply.raise_for_status()
    payload = reply.json()
    return payload['result']
def completion(messages, model):
    """Run a chat completion and return the text of the first choice.

    Args:
        messages: Chat messages forwarded unchanged to the client.
        model: Model identifier forwarded unchanged to the client.

    Returns:
        The ``message.content`` of the first returned choice.
    """
    chat_api = env.client.chat.completions
    reply = chat_api.create(model=model, messages=messages)
    first_choice = reply.choices[0]
    return first_choice.message.content
def completion_image(images, prompt, model):
    """Ask the model about one or more images using the same text prompt.

    One user message is built per image; every message carries the prompt
    text followed by that image embedded as a base64 data URL.

    Args:
        images: Iterable of images accepted by ``encode_image``.
        prompt: Text instruction repeated in every message.
        model: Model identifier passed through to ``completion``.

    Returns:
        Whatever ``completion`` returns for the assembled messages.
    """
    messages = []
    for image in images:
        # encode_image yields the base64 payload first, then its MIME type.
        b64, mime = encode_image(image)
        data_url = f"data:{mime};base64,{b64}"
        messages.append(
            {
                "role": "user",
                "content": [
                    {"type": "text", "text": prompt},
                    {"type": "image_url", "image_url": {"url": data_url}},
                ],
            }
        )
    return completion(messages, model=model)
| # ------------------------ Function Tools ------------------------ | |
def caption(wrapper: RunContextWrapper[Memory]) -> str:
    """
    Generate a descriptive caption for the most recent frame, record it as a snapshot, and return it.

    Returns:
        str:
            The generated caption for the current view (i.e., the latest frame),
            or a short notice when no frame has been captured yet.
    """
    mem = wrapper.context
    # Without this guard frames[-1] raises IndexError on an empty buffer;
    # the video tools in this module answer with a message in that case.
    if len(mem.frames) == 0:
        return "No frames available for caption."
    prompt = "Describe the image with rich details but in a concise manner."
    result = completion_image([mem.frames[-1]], prompt, env.model_mllm)
    mem.snapshots.append(Snapshot(sender='caption', data=result))
    return result
def ocr(wrapper: RunContextWrapper[Memory]) -> str:
    """
    Perform OCR on the most recent frame, record it as a snapshot, and return the extracted text.

    Returns:
        str:
            The extracted text from the current view (i.e., the latest frame),
            or a short notice when no frame has been captured yet.
    """
    mem = wrapper.context
    # Without this guard frames[-1] raises IndexError on an empty buffer;
    # the video tools in this module answer with a message in that case.
    if len(mem.frames) == 0:
        return "No frames available for OCR."
    prompt = "Extract all text from image/payslip without miss anything."
    result = completion_image([mem.frames[-1]], prompt, env.model_mllm)
    mem.snapshots.append(Snapshot(sender='ocr', data=result))
    return result
def qa(wrapper: RunContextWrapper[Memory], question: str) -> str:
    """
    Answer a question based on the most recent frame, record it as a snapshot, and return the answer.

    Args:
        question (str): The question to be answered.

    Returns:
        str:
            The answer to the question based on the current view (i.e., the latest frame),
            or a short notice when no frame has been captured yet.
    """
    mem = wrapper.context
    # Without this guard frames[-1] raises IndexError on an empty buffer;
    # the video tools in this module answer with a message in that case.
    if len(mem.frames) == 0:
        return "No frames available for Q&A."
    prompt = f"Answer the question based on the image. Question: {question}"
    result = completion_image([mem.frames[-1]], prompt, env.model_mllm)
    mem.snapshots.append(Snapshot(sender='qa', data=result))
    return result
def localize(wrapper: RunContextWrapper[Memory]) -> str:
    """
    Localize all objects in the most recent frame, record the annotated frame as a snapshot,
    and return the detections.

    Returns:
        str:
            The localization result for the current view (i.e., the latest frame).
            the format is {name:list of bboxes}, serialized as JSON.
            A short notice is returned instead when no frame has been captured yet.

    Raises:
        RuntimeError: If the latest frame cannot be JPEG-encoded.
    """
    mem = wrapper.context
    # Without this guard frames[-1] raises IndexError on an empty buffer;
    # the video tools in this module answer with a message in that case.
    if len(mem.frames) == 0:
        return "No frames available for localization."
    frame = mem.frames[-1]
    # imencode reports success through its first return value; do not upload
    # a buffer that was never produced.
    ok, img = cv2.imencode('.jpg', frame)
    if not ok:
        raise RuntimeError("Failed to JPEG-encode the latest frame for localization.")
    objxbox = task(env.model_loc, img)
    # The snapshot stores what image_w_box builds from the frame and the
    # detections, not the raw JSON.
    mem.snapshots.append(Snapshot(sender='localize', data=image_w_box(frame, objxbox)))
    return json.dumps(objxbox, indent=2)
def time(wrapper: RunContextWrapper[Memory]) -> str:
    """
    Get the current time, record it as a snapshot, and return the time.

    Returns:
        str:
            The current time, formatted as ``YYYY-MM-DD HH:MM:SS``.
    """
    memory = wrapper.context
    now = datetime.datetime.now()
    stamp = now.strftime("%Y-%m-%d %H:%M:%S")
    snapshot = Snapshot(sender='time', data=stamp)
    memory.snapshots.append(snapshot)
    return stamp
def sample_frames(mem: Memory, n: int) -> list:
    """
    Sample frames from the past n seconds of video.

    The window is the last ``n * env.fps`` frames (capped at what is buffered),
    and every ``env.fps // 2``-th frame of that window is kept.

    Args:
        mem (Memory): The memory context containing frames.
        n (int): Number of seconds to look back for video frames.

    Returns:
        list: Sampled frames from the video sequence; empty when there are no
        frames or when n is not positive.
    """
    # A non-positive n would make the slice below ``frames[-0:]`` (the whole
    # buffer) or a positive start index, so treat it as "nothing requested".
    if len(mem.frames) == 0 or n <= 0:
        return []
    # int() keeps the slice index valid if a fractional n is passed in.
    window = min(int(n * env.fps), len(mem.frames))
    if window <= 0:
        return []
    recent_frames = mem.frames[-window:]
    # fps // 2 is 0 when fps is 1, and a zero slice step raises ValueError.
    step = max(1, int(env.fps) // 2)
    return recent_frames[::step]
def video_caption(wrapper: RunContextWrapper[Memory], n: int = 2) -> str:
    """
    Generate a descriptive caption for a video sequence from the past n seconds of frames.
    The n specifies how many seconds of video frames to consider; it defaults to 2 when omitted.

    Args:
        n (int): Number of seconds to look back for video frames.

    Returns:
        str:
            The generated caption for the video sequence from the past n seconds,
            or a short notice when no frames are available.
    """
    mem = wrapper.context
    sampled_frames = sample_frames(mem, n)
    if len(sampled_frames) == 0:
        return "No frames available for video caption."
    prompt = "Describe this video sequence focusing on any changes or actions that occur over time."
    result = completion_image(sampled_frames, prompt, env.model_mllm)
    mem.snapshots.append(Snapshot(sender='video caption', data=result))
    return result
def video_qa(wrapper: RunContextWrapper[Memory], question: str, n: int = 2) -> str:
    """
    Answer a question based on a video sequence from the past n seconds of frames.

    Args:
        question (str): The question to be answered.
        n (int): Number of seconds to look back for video frames. Defaults to 2.

    Returns:
        str:
            The answer to the question based on the video sequence from the past n seconds,
            or a short notice when no frames are available.
    """
    mem = wrapper.context
    sampled_frames = sample_frames(mem, n)
    if len(sampled_frames) == 0:
        return "No frames available for video Q&A."
    prompt = f"Answer the question based on this video sequence. Question: {question}"
    result = completion_image(sampled_frames, prompt, env.model_mllm)
    mem.snapshots.append(Snapshot(sender='video qa', data=result))
    return result