yingzhac committed
Commit 838e8f6 · 1 Parent(s): 6092598

🎨 Add smart text resizing functionality


- Integrate text resizing core functionality (OCR + AI parsing)
- Add beautiful Gradio interface with dual-column layout
- Support natural language commands (e.g., 'enlarge Hello by 50%')
- Add OpenAI API integration for intelligent prompt parsing
- Support both AI parsing and fallback percentage parsing (see the usage sketch below)
- Add comprehensive styling with gradient backgrounds
- Remove example images to reduce repo size
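
For context, here is a minimal sketch of how the new modules fit together outside the Gradio UI, using only functions added in this commit; the image path, prompt, and output path are placeholders, not files in the repo:

    from core import TextResizer
    from utils import load_image, save_image, parse_percentage_to_scale_factor

    resizer = TextResizer(languages=['en', 'ch_sim'], gpu=False)
    image = load_image("example.png")                  # placeholder input path
    ocr_results = resizer.read_text(image)             # [(bbox, text, confidence), ...]

    # Fallback path (no OpenAI key): derive the scale factor from the prompt text
    scale = parse_percentage_to_scale_factor("enlarge 'Hello' by 50%")  # -> 1.5
    target = ocr_results[0][1].strip()                 # first detected text, as app.py does

    result = resizer.resize_text(image, target, scale) # inpaint, rescale, re-blend
    save_image(result, "example_resized.png")          # placeholder output path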

Files changed (6)
  1. .gitignore +91 -0
  2. app.py +305 -133
  3. core.py +200 -0
  4. prompt_handler.py +149 -0
  5. requirements.txt +8 -6
  6. utils.py +160 -0
.gitignore ADDED
@@ -0,0 +1,91 @@
+ # Python
+ __pycache__/
+ *.py[cod]
+ *$py.class
+ *.so
+ .Python
+ build/
+ develop-eggs/
+ dist/
+ downloads/
+ eggs/
+ .eggs/
+ lib/
+ lib64/
+ parts/
+ sdist/
+ var/
+ wheels/
+ *.egg-info/
+ .installed.cfg
+ *.egg
+ MANIFEST
+
+ # PyInstaller
+ *.manifest
+ *.spec
+
+ # Gradio
+ .gradio/
+ gradio_cached_examples/
+ flagged/
+
+ # Jupyter Notebook
+ .ipynb_checkpoints
+
+ # IPython
+ profile_default/
+ ipython_config.py
+
+ # pyenv
+ .python-version
+
+ # pipenv
+ Pipfile.lock
+
+ # PEP 582
+ __pypackages__/
+
+ # Celery stuff
+ celerybeat-schedule
+ celerybeat.pid
+
+ # SageMath parsed files
+ *.sage.py
+
+ # Environments
+ .env
+ .venv
+ env/
+ venv/
+ ENV/
+ env.bak/
+ venv.bak/
+
+ # Spyder project settings
+ .spyderproject
+ .spyproject
+
+ # Rope project settings
+ .ropeproject
+
+ # mkdocs documentation
+ /site
+
+ # mypy
+ .mypy_cache/
+ .dmypy.json
+ dmypy.json
+
+ # Pyre type checker
+ .pyre/
+
+ # macOS
+ .DS_Store
+ .AppleDouble
+ .LSOverride
+
+ # Temporary files
+ *.tmp
+ *.temp
+ *~
app.py CHANGED
@@ -1,154 +1,326 @@
  import gradio as gr
  import numpy as np
- import random

- # import spaces #[uncomment to use ZeroGPU]
- from diffusers import DiffusionPipeline
- import torch

- device = "cuda" if torch.cuda.is_available() else "cpu"
- model_repo_id = "stabilityai/sdxl-turbo" # Replace to the model you would like to use

- if torch.cuda.is_available():
-     torch_dtype = torch.float16
- else:
-     torch_dtype = torch.float32

- pipe = DiffusionPipeline.from_pretrained(model_repo_id, torch_dtype=torch_dtype)
- pipe = pipe.to(device)
-
- MAX_SEED = np.iinfo(np.int32).max
- MAX_IMAGE_SIZE = 1024
-
-
- # @spaces.GPU #[uncomment to use ZeroGPU]
- def infer(
-     prompt,
-     negative_prompt,
-     seed,
-     randomize_seed,
-     width,
-     height,
-     guidance_scale,
-     num_inference_steps,
-     progress=gr.Progress(track_tqdm=True),
- ):
-     if randomize_seed:
-         seed = random.randint(0, MAX_SEED)
-
-     generator = torch.Generator().manual_seed(seed)
-
-     image = pipe(
-         prompt=prompt,
-         negative_prompt=negative_prompt,
-         guidance_scale=guidance_scale,
-         num_inference_steps=num_inference_steps,
-         width=width,
-         height=height,
-         generator=generator,
-     ).images[0]
-
-     return image, seed
-
-
- examples = [
-     "Astronaut in a jungle, cold color palette, muted colors, detailed, 8k",
-     "An astronaut riding a green horse",
-     "A delicious ceviche cheesecake slice",
- ]

  css = """
  #col-container {
      margin: 0 auto;
-     max-width: 640px;
  }
- """
-
- with gr.Blocks(css=css) as demo:
-     with gr.Column(elem_id="col-container"):
-         gr.Markdown(" # Text-to-Image Gradio Template")
-
-         with gr.Row():
-             prompt = gr.Text(
-                 label="Prompt",
-                 show_label=False,
-                 max_lines=1,
-                 placeholder="Enter your prompt",
-                 container=False,
-             )

-             run_button = gr.Button("Run", scale=0, variant="primary")
-
-         result = gr.Image(label="Result", show_label=False)
-
-         with gr.Accordion("Advanced Settings", open=False):
-             negative_prompt = gr.Text(
-                 label="Negative prompt",
-                 max_lines=1,
-                 placeholder="Enter a negative prompt",
-                 visible=False,
-             )

-             seed = gr.Slider(
-                 label="Seed",
-                 minimum=0,
-                 maximum=MAX_SEED,
-                 step=1,
-                 value=0,
-             )

-             randomize_seed = gr.Checkbox(label="Randomize seed", value=True)

-             with gr.Row():
-                 width = gr.Slider(
-                     label="Width",
-                     minimum=256,
-                     maximum=MAX_IMAGE_SIZE,
-                     step=32,
-                     value=1024, # Replace with defaults that work for your model
-                 )

-                 height = gr.Slider(
-                     label="Height",
-                     minimum=256,
-                     maximum=MAX_IMAGE_SIZE,
-                     step=32,
-                     value=1024, # Replace with defaults that work for your model
-                 )

-             with gr.Row():
-                 guidance_scale = gr.Slider(
-                     label="Guidance scale",
-                     minimum=0.0,
-                     maximum=10.0,
-                     step=0.1,
-                     value=0.0, # Replace with defaults that work for your model
-                 )

-                 num_inference_steps = gr.Slider(
-                     label="Number of inference steps",
-                     minimum=1,
-                     maximum=50,
-                     step=1,
-                     value=2, # Replace with defaults that work for your model
-                 )

-         gr.Examples(examples=examples, inputs=[prompt])
-     gr.on(
-         triggers=[run_button.click, prompt.submit],
-         fn=infer,
-         inputs=[
-             prompt,
-             negative_prompt,
-             seed,
-             randomize_seed,
-             width,
-             height,
-             guidance_scale,
-             num_inference_steps,
-         ],
-         outputs=[result, seed],
-     )

  if __name__ == "__main__":
-     demo.launch()
  import gradio as gr
  import numpy as np
+ import os
+ from PIL import Image
+ import cv2

+ # Import our custom modules
+ from core import TextResizer
+ from prompt_handler import PromptHandler
+ from utils import (
+     load_image,
+     save_image,
+     validate_scale_factor,
+     parse_percentage_to_scale_factor,
+     create_output_filename
+ )

+ # Initialize the text resizer
+ text_resizer = TextResizer(languages=['en', 'ch_sim'], gpu=False)

+ def process_image(input_image, user_prompt, use_ai_parsing=True, api_key=None):
+     """
+     Process image with text resizing based on user prompt
+     """
+     try:
+         if input_image is None:
+             return None, "❌ Error: please upload an image"
+
+         # Convert PIL to RGB numpy array
+         image_rgb = np.array(input_image.convert('RGB'))
+
+         # Perform OCR
+         ocr_results = text_resizer.read_text(image_rgb)
+
+         if not ocr_results:
+             return None, "❌ Error: no text was detected in the image"
+
+         # Parse user prompt
+         try:
+             if use_ai_parsing and api_key:
+                 # Use OpenAI API parsing
+                 prompt_handler = PromptHandler(api_key=api_key)
+                 parsed_result = prompt_handler.parse_user_request(ocr_results, user_prompt)
+
+                 if not prompt_handler.validate_parsed_result(parsed_result, ocr_results):
+                     raise Exception("Validation of the AI-parsed result failed")
+
+                 target_text = parsed_result["target_text"]
+                 scale_factor = validate_scale_factor(parsed_result["scale_factor"])
+                 status_msg = f"✅ AI parsing succeeded: target text='{target_text}', scale factor={scale_factor}"
+
+             else:
+                 # Use fallback parsing
+                 scale_factor = parse_percentage_to_scale_factor(user_prompt)
+                 if scale_factor == 1.0:
+                     return None, "❌ Error: could not extract any scaling information from the instruction"
+
+                 # Use the first detected text as target
+                 target_text = ocr_results[0][1].strip()
+                 status_msg = f"✅ Fallback parsing: target text='{target_text}', scale factor={scale_factor}"
+
+         except Exception as e:
+             return None, f"❌ Error: failed to parse the instruction: {str(e)}"
+
+         # Process the image
+         try:
+             result_image = text_resizer.resize_text(image_rgb, target_text, scale_factor)
+
+             # Convert back to PIL Image
+             result_pil = Image.fromarray(result_image)
+
+             return result_pil, status_msg
+
+         except ValueError as e:
+             # Show available texts
+             available_texts = [text.strip() for _, text, _ in ocr_results]
+             error_msg = f"❌ Error: {str(e)}\n\n📝 Available texts: {available_texts}"
+             return None, error_msg
+
+     except Exception as e:
+         return None, f"❌ An error occurred during processing: {str(e)}"

+ def get_ocr_info(input_image):
+     """
+     Get OCR information from the image
+     """
+     if input_image is None:
+         return "Please upload an image first"
+
+     try:
+         # Convert PIL to RGB numpy array
+         image_rgb = np.array(input_image.convert('RGB'))
+
+         # Perform OCR
+         ocr_results = text_resizer.read_text(image_rgb)
+
+         if not ocr_results:
+             return "No text was detected"
+
+         # Format results
+         info = f"📝 Detected {len(ocr_results)} text regions:\n"
+         info += "=" * 50 + "\n"
+         for i, (bbox, text, conf) in enumerate(ocr_results):
+             info += f"{i+1:2d}. '{text}' (confidence: {conf:.2f})\n"
+         info += "=" * 50
+
+         return info
+
+     except Exception as e:
+         return f"❌ OCR failed: {str(e)}"

+ # Define CSS for styling
  css = """
  #col-container {
      margin: 0 auto;
+     max-width: 1000px;
  }

+ #input-section {
+     background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
+     padding: 20px;
+     border-radius: 15px;
+     margin-bottom: 20px;
+ }

+ #output-section {
+     background: linear-gradient(135deg, #f093fb 0%, #f5576c 100%);
+     padding: 20px;
+     border-radius: 15px;
+ }

+ .gradio-container {
+     background: linear-gradient(135deg, #ffecd2 0%, #fcb69f 100%);
+ }

+ #title {
+     text-align: center;
+     background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
+     -webkit-background-clip: text;
+     -webkit-text-fill-color: transparent;
+     font-size: 2.5em;
+     font-weight: bold;
+     margin-bottom: 20px;
+ }

+ #description {
+     text-align: center;
+     color: #666;
+     font-size: 1.1em;
+     margin-bottom: 30px;
+     line-height: 1.6;
+ }

+ .process-button {
+     background: linear-gradient(135deg, #4CAF50 0%, #45a049 100%);
+     color: white;
+     border: none;
+     padding: 15px 30px;
+     font-size: 16px;
+     border-radius: 10px;
+     cursor: pointer;
+     transition: all 0.3s ease;
+ }

+ .process-button:hover {
+     transform: translateY(-2px);
+     box-shadow: 0 5px 15px rgba(0,0,0,0.2);
+ }
+ """

+ # Create the Gradio interface
+ with gr.Blocks(css=css, title="Smart Text Resizing Tool") as demo:
+     with gr.Column(elem_id="col-container"):
+         gr.Markdown("# 🎨 Smart Text Resizing Tool", elem_id="title")
+         gr.Markdown(
+             """
+             🚀 **Resize text inside images with the help of AI**
+
+             📝 Natural-language instructions are supported, e.g.:
+             - `enlarge 'Hello' by 50%`
+             - `make the title bigger`
+             - `shrink the footer text`
+
+             🎯 **How to use**:
+             1. Upload an image that contains text
+             2. Enter a text-adjustment instruction
+             3. Click the process button
+             4. Check the result
+             """,
+             elem_id="description"
+         )
+
+         with gr.Row():
+             with gr.Column(scale=1):
+                 with gr.Group(elem_id="input-section"):
+                     gr.Markdown("### 📤 Input")
+
+                     # Image input
+                     input_image = gr.Image(
+                         label="Upload image",
+                         type="pil",
+                         height=300,
+                         sources=["upload", "clipboard", "webcam"]
+                     )
+
+                     # Prompt input
+                     user_prompt = gr.Textbox(
+                         label="Text-adjustment instruction",
+                         placeholder="e.g. enlarge 'Hello' by 50%",
+                         lines=2,
+                         info="Natural language is supported, e.g. 'make XX bigger' or 'enlarge XX by 50%'"
+                     )
+
+                     # OCR info button
+                     ocr_button = gr.Button(
+                         "🔍 Show text found in the image",
+                         variant="secondary",
+                         size="sm"
+                     )
+
+                     # Advanced settings
+                     with gr.Accordion("⚙️ Advanced settings", open=False):
+                         use_ai_parsing = gr.Checkbox(
+                             label="🤖 Use AI parsing (requires an OpenAI API key)",
+                             value=False,
+                             info="Use a GPT model to interpret natural-language instructions"
+                         )
+
+                         api_key = gr.Textbox(
+                             label="🔑 OpenAI API key (optional)",
+                             placeholder="sk-...",
+                             type="password",
+                             info="Only needed when AI parsing is enabled"
+                         )
+
+                     # Process button
+                     process_button = gr.Button(
+                         "🎯 Start processing",
+                         variant="primary",
+                         size="lg",
+                         elem_classes="process-button"
+                     )
+
+             with gr.Column(scale=1):
+                 with gr.Group(elem_id="output-section"):
+                     gr.Markdown("### 📤 Result")
+
+                     # Output image
+                     output_image = gr.Image(
+                         label="Processed image",
+                         height=300,
+                         show_download_button=True
+                     )
+
+                     # Status message
+                     status_message = gr.Textbox(
+                         label="💬 Status",
+                         lines=4,
+                         max_lines=8,
+                         interactive=False
+                     )
+
+                     # OCR info display
+                     ocr_info = gr.Textbox(
+                         label="📝 OCR results",
+                         lines=6,
+                         max_lines=10,
+                         interactive=False
+                     )
+
+         # Examples section
+         gr.Markdown("### 📚 Example usage")
+         gr.Markdown(
+             """
+             **Example instruction formats:**
+
+             🔍 **Specific text plus an explicit percentage:**
+             - `enlarge 'Hello' by 50%`
+             - `shrink 'Title' by 30%`
+
+             🎯 **Natural-language descriptions:**
+             - `make the title bigger`
+             - `make the text smaller`
+             - `enlarge the heading`
+
+             💡 **Tips:**
+             1. Upload an image that contains text
+             2. Click "Show text found in the image" first to see which texts are available
+             3. Enter your instruction
+             4. Click "Start processing"
+             """
+         )
+
+         # Event handlers
+         process_button.click(
+             fn=process_image,
+             inputs=[input_image, user_prompt, use_ai_parsing, api_key],
+             outputs=[output_image, status_message]
+         )
+
+         ocr_button.click(
+             fn=get_ocr_info,
+             inputs=[input_image],
+             outputs=[ocr_info]
+         )
+
+         # Auto-run OCR when image is uploaded
+         input_image.change(
+             fn=get_ocr_info,
+             inputs=[input_image],
+             outputs=[ocr_info]
+         )
+
+         # Footer
+         gr.Markdown(
+             """
+             ---
+
+             🎨 **Smart Text Resizing Tool** | Intelligent in-image text editing based on OCR and AI
+
+             📧 For questions or suggestions, please contact the developer
+             """
+         )

  if __name__ == "__main__":
+     demo.launch(share=True, server_name="0.0.0.0", server_port=7860)
core.py ADDED
@@ -0,0 +1,200 @@
+ import cv2
+ import numpy as np
+ import easyocr
+ from typing import List, Tuple, Optional
+
+
+ class TextResizer:
+     def __init__(self, languages=['en', 'ch_sim'], gpu=False):
+         """
+         Initialize the text resizer.
+
+         Args:
+             languages: languages supported by the OCR reader
+             gpu: whether to use the GPU
+         """
+         self.reader = easyocr.Reader(languages, gpu=gpu)
+
+     def read_text(self, image: np.ndarray) -> List[Tuple]:
+         """
+         Detect text in an image.
+
+         Args:
+             image: image array in RGB format
+
+         Returns:
+             List of OCR results, each element being (bbox, text, confidence)
+         """
+         return self.reader.readtext(image)
+
+     def extract_text_mask_by_content(self, image: np.ndarray, results: List[Tuple], target_text: str) -> np.ndarray:
+         """
+         Build a text mask for the regions whose OCR text matches the target text.
+
+         Args:
+             image: image array in RGB format
+             results: OCR results
+             target_text: target text content
+
+         Returns:
+             Text mask; white pixels mark text regions
+         """
+         h, w = image.shape[:2]
+         mask = np.zeros((h, w), dtype=np.uint8)
+
+         for (bbox, text, _) in results:
+             if text.strip() != target_text:
+                 continue
+
+             x_min = int(min([pt[0] for pt in bbox]))
+             x_max = int(max([pt[0] for pt in bbox]))
+             y_min = int(min([pt[1] for pt in bbox]))
+             y_max = int(max([pt[1] for pt in bbox]))
+
+             roi = image[y_min:y_max, x_min:x_max]
+             gray = cv2.cvtColor(roi, cv2.COLOR_RGB2GRAY)
+             thresh = cv2.adaptiveThreshold(gray, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
+                                            cv2.THRESH_BINARY_INV, 11, 2)
+             contours, _ = cv2.findContours(thresh, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
+             mask_roi = np.zeros_like(thresh)
+             cv2.drawContours(mask_roi, contours, -1, 255, -1)
+             mask[y_min:y_max, x_min:x_max] = np.maximum(mask[y_min:y_max, x_min:x_max], mask_roi)
+
+         return mask
+
+     def inpaint_image(self, image: np.ndarray, mask: np.ndarray) -> np.ndarray:
+         """
+         Inpaint the masked regions of the image.
+
+         Args:
+             image: image array in RGB format
+             mask: mask of the regions to repair
+
+         Returns:
+             Inpainted image
+         """
+         return cv2.inpaint(image, mask, 3, cv2.INPAINT_TELEA)
+
+     def find_target_bbox(self, results: List[Tuple], target_text: str) -> Optional[List]:
+         """
+         Find the bounding box of the target text.
+
+         Args:
+             results: OCR results
+             target_text: target text content
+
+         Returns:
+             Bounding box of the target text, or None if it is not found
+         """
+         for (bbox, text, _) in results:
+             if text.strip() == target_text:
+                 return bbox
+         return None
+
+     def create_resized_text_patch(self, image: np.ndarray, bbox: List, scale_factor: float) -> Tuple[np.ndarray, int, int]:
+         """
+         Create a resized text patch.
+
+         Args:
+             image: image array in RGB format
+             bbox: text bounding box
+             scale_factor: scale factor
+
+         Returns:
+             (resized text patch in RGBA format, original center x, original center y)
+         """
+         # Extract the ROI
+         x_min = int(min(pt[0] for pt in bbox))
+         x_max = int(max(pt[0] for pt in bbox))
+         y_min = int(min(pt[1] for pt in bbox))
+         y_max = int(max(pt[1] for pt in bbox))
+
+         roi = image[y_min:y_max, x_min:x_max]
+
+         # Build the text mask
+         gray = cv2.cvtColor(roi, cv2.COLOR_RGB2GRAY)
+         thresh = cv2.adaptiveThreshold(gray, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
+                                        cv2.THRESH_BINARY_INV, 11, 2)
+         contours, _ = cv2.findContours(thresh, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
+         mask_roi = np.zeros_like(thresh)
+         cv2.drawContours(mask_roi, contours, -1, 255, -1)
+
+         # Build the RGBA patch
+         rgba_patch = cv2.cvtColor(roi, cv2.COLOR_RGB2RGBA)
+         rgba_patch[:, :, 3] = mask_roi
+
+         # Resize
+         h, w = rgba_patch.shape[:2]
+         new_size = (int(w * scale_factor), int(h * scale_factor))
+         resized_patch = cv2.resize(rgba_patch, new_size, interpolation=cv2.INTER_LINEAR)
+
+         # Compute the original center point
+         cx = (x_min + x_max) // 2
+         cy = (y_min + y_max) // 2
+
+         return resized_patch, cx, cy
+
+     def blend_text_patch(self, canvas: np.ndarray, patch: np.ndarray, center_x: int, center_y: int) -> np.ndarray:
+         """
+         Blend a text patch onto the canvas.
+
+         Args:
+             canvas: target canvas (RGB format)
+             patch: text patch in RGBA format
+             center_x: x coordinate of the placement center
+             center_y: y coordinate of the placement center
+
+         Returns:
+             Blended image
+         """
+         result = canvas.copy()
+         new_h, new_w = patch.shape[:2]
+         top_left_x = max(0, center_x - new_w // 2)
+         top_left_y = max(0, center_y - new_h // 2)
+
+         for y in range(new_h):
+             for x in range(new_w):
+                 if patch[y, x, 3] > 0:  # if alpha > 0
+                     yy = top_left_y + y
+                     xx = top_left_x + x
+                     if 0 <= yy < result.shape[0] and 0 <= xx < result.shape[1]:
+                         alpha = patch[y, x, 3] / 255.0
+                         result[yy, xx] = (
+                             (1 - alpha) * result[yy, xx] + alpha * patch[y, x, :3]
+                         ).astype(np.uint8)
+
+         return result
+
+     def resize_text(self, image: np.ndarray, target_text: str, scale_factor: float) -> np.ndarray:
+         """
+         Full text-resizing pipeline.
+
+         Args:
+             image: image array in RGB format
+             target_text: target text content
+             scale_factor: scale factor
+
+         Returns:
+             Processed image
+         """
+         # 1. Run OCR
+         results = self.read_text(image)
+
+         # 2. Find the target text
+         target_bbox = self.find_target_bbox(results, target_text)
+         if target_bbox is None:
+             raise ValueError(f"Target text not found: {target_text}")
+
+         # 3. Extract the text mask
+         text_mask = self.extract_text_mask_by_content(image, results, target_text)
+
+         # 4. Inpaint the original text away
+         inpainted = self.inpaint_image(image, text_mask)
+
+         # 5. Create the resized text patch
+         resized_patch, cx, cy = self.create_resized_text_patch(image, target_bbox, scale_factor)
+
+         # 6. Blend the text patch back in
+         result = self.blend_text_patch(inpainted, resized_patch, cx, cy)
+
+         return result
prompt_handler.py ADDED
@@ -0,0 +1,149 @@
+ import os
+ from openai import OpenAI
+ from typing import List, Tuple, Dict, Any
+ from utils import format_ocr_results_for_prompt, robust_parse_reply
+
+
+ class PromptHandler:
+     def __init__(self, api_key: str = None, model: str = "gpt-4o-mini"):
+         """
+         Initialize the prompt handler.
+
+         Args:
+             api_key: OpenAI API key; if not provided, it is read from the environment
+             model: name of the model to use
+         """
+         if api_key:
+             os.environ["OPENAI_API_KEY"] = api_key
+
+         self.client = OpenAI()
+         self.model = model
+
+     def create_system_prompt(self) -> str:
+         """
+         Build the system prompt.
+
+         Returns:
+             The system prompt string
+         """
+         return (
+             "You are a helpful assistant. "
+             "You are given a list of OCR results in the form [(bbox, text, score)], "
+             "and a user prompt that describes what text to enlarge and how much to scale it. "
+             "Your job is to:\n"
+             "1. Match the user input text to the actual text in OCR results as best as possible, even if it's fuzzy or missing punctuation.\n"
+             "2. Estimate a scale_factor (float > 0) based on qualitative user intent like 'a bit', 'a lot', 'shrink slightly', etc.\n"
+             "3. Output only two fields:\n"
+             "   target_text: the exact string from OCR result you chose\n"
+             "   scale_factor: a float number\n\n"
+             "Your output must be strictly in JSON format like:\n"
+             "{\n  \"target_text\": \"Tools\",\n  \"scale_factor\": 1.2\n}"
+         )
+
+     def create_user_prompt(self, ocr_results: List[Tuple], user_request: str) -> str:
+         """
+         Build the user prompt.
+
+         Args:
+             ocr_results: list of OCR results
+             user_request: the user's original request
+
+         Returns:
+             The user prompt string
+         """
+         formatted_results = format_ocr_results_for_prompt(ocr_results)
+
+         return f"""
+         Here are the OCR results:
+         {formatted_results}
+
+         User prompt:
+         "{user_request}"
+         """
+
+     def parse_user_request(self, ocr_results: List[Tuple], user_request: str) -> Dict[str, Any]:
+         """
+         Parse the user request with the LLM.
+
+         Args:
+             ocr_results: list of OCR results
+             user_request: the user's original request
+
+         Returns:
+             Dictionary containing target_text and scale_factor
+
+         Raises:
+             Exception: if the API call or parsing fails
+         """
+         # Build the messages
+         messages = [
+             {"role": "system", "content": self.create_system_prompt()},
+             {"role": "user", "content": self.create_user_prompt(ocr_results, user_request)}
+         ]
+
+         try:
+             # Call the OpenAI API
+             response = self.client.chat.completions.create(
+                 model=self.model,
+                 messages=messages,
+                 temperature=0.3,
+                 max_tokens=300,
+             )
+
+             # Get the reply
+             reply = response.choices[0].message.content
+
+             # Parse the reply
+             parsed_result = robust_parse_reply(reply)
+
+             return parsed_result
+
+         except Exception as e:
+             raise Exception(f"LLM parsing failed: {str(e)}")
+
+     def validate_parsed_result(self, parsed_result: Dict[str, Any], ocr_results: List[Tuple]) -> bool:
+         """
+         Validate the parsed result.
+
+         Args:
+             parsed_result: the parsed result dictionary
+             ocr_results: list of OCR results
+
+         Returns:
+             Whether validation passed
+         """
+         target_text = parsed_result.get("target_text", "")
+         scale_factor = parsed_result.get("scale_factor", 0)
+
+         # Check that the target text appears in the OCR results
+         ocr_texts = [text.strip() for _, text, _ in ocr_results]
+         if target_text not in ocr_texts:
+             print(f"Warning: target text '{target_text}' was not found in the OCR results")
+             print(f"Available texts: {ocr_texts}")
+             return False
+
+         # Check that the scale factor is reasonable
+         if not isinstance(scale_factor, (int, float)) or scale_factor <= 0:
+             print(f"Error: scale factor {scale_factor} is invalid")
+             return False
+
+         return True
+
+
+ def get_api_key_from_env() -> str:
+     """
+     Read the OpenAI API key from the environment.
+
+     Returns:
+         The API key string
+
+     Raises:
+         ValueError: if no API key can be found
+     """
+     api_key = os.getenv("OPENAI_API_KEY")
+     if not api_key:
+         raise ValueError(
+             "No OpenAI API key found. Set the OPENAI_API_KEY environment variable, "
+             "or pass api_key when creating PromptHandler."
+         )
+     return api_key
requirements.txt CHANGED
@@ -1,6 +1,8 @@
- accelerate
- diffusers
- invisible_watermark
- torch
- transformers
- xformers
+ opencv-python>=4.5.0
+ numpy>=1.21.0
+ easyocr>=1.6.0
+ openai>=1.0.0
+ matplotlib>=3.3.0
+ scikit-image>=0.18.0
+ Pillow>=8.0.0
+ gradio>=4.0.0
utils.py ADDED
@@ -0,0 +1,160 @@
+ import json
+ import re
+ import cv2
+ import numpy as np
+ from typing import Dict, Any
+
+
+ def robust_parse_reply(reply: str) -> Dict[str, Any]:
+     """
+     Extract a JSON object from the raw string returned by the LLM.
+
+     Args:
+         reply: raw string returned by the LLM
+
+     Returns:
+         Parsed dictionary containing target_text and scale_factor
+
+     Raises:
+         ValueError: if the JSON cannot be parsed or required fields are missing
+     """
+     # Strip Markdown code-fence markers (e.g. ```json or ```)
+     cleaned = re.sub(r"```(?:json)?", "", reply, flags=re.IGNORECASE).strip("` \n")
+
+     # Extract the most likely JSON segment (of the form {...})
+     match = re.search(r"\{.*?\}", cleaned, flags=re.DOTALL)
+     if not match:
+         raise ValueError("No JSON object found")
+
+     json_str = match.group(0)
+
+     try:
+         parsed = json.loads(json_str)
+     except json.JSONDecodeError as e:
+         raise ValueError(f"Failed to parse JSON: {e}")
+
+     # Check that the required fields are present
+     if "target_text" not in parsed or "scale_factor" not in parsed:
+         raise ValueError("JSON is missing the required fields target_text or scale_factor")
+
+     return parsed
+
+
+ def load_image(image_path: str) -> np.ndarray:
+     """
+     Load an image and convert it to RGB.
+
+     Args:
+         image_path: path to the image file
+
+     Returns:
+         Image array in RGB format
+
+     Raises:
+         ValueError: if the image cannot be loaded
+     """
+     image = cv2.imread(image_path)
+     if image is None:
+         raise ValueError(f"Failed to load image: {image_path}")
+
+     # Convert to RGB
+     image_rgb = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
+     return image_rgb
+
+
+ def save_image(image: np.ndarray, output_path: str) -> None:
+     """
+     Save an RGB image.
+
+     Args:
+         image: image array in RGB format
+         output_path: output file path
+     """
+     # Convert to BGR so OpenCV can save it
+     image_bgr = cv2.cvtColor(image, cv2.COLOR_RGB2BGR)
+     cv2.imwrite(output_path, image_bgr)
+
+
+ def validate_scale_factor(scale_factor: float) -> float:
+     """
+     Validate and normalize a scale factor.
+
+     Args:
+         scale_factor: raw scale factor
+
+     Returns:
+         Validated scale factor
+
+     Raises:
+         ValueError: if the scale factor is invalid
+     """
+     if not isinstance(scale_factor, (int, float)):
+         raise ValueError("The scale factor must be a number")
+
+     if scale_factor <= 0:
+         raise ValueError("The scale factor must be greater than 0")
+
+     if scale_factor > 10:
+         print(f"Warning: scale factor {scale_factor} is very large and may slow down processing")
+
+     return float(scale_factor)
+
+
+ def format_ocr_results_for_prompt(results: list) -> str:
+     """
+     Format OCR results for use in the LLM prompt.
+
+     Args:
+         results: list of OCR results
+
+     Returns:
+         Formatted string listing the detected texts
+     """
+     text_list = [text for _, text, _ in results]
+     return str(text_list)
+
+
+ def parse_percentage_to_scale_factor(text: str) -> float:
+     """
+     Convert a percentage expression into a scale factor.
+
+     Args:
+         text: text containing a percentage, e.g. "enlarge by 50%" or "shrink by 25%"
+
+     Returns:
+         The corresponding scale factor
+     """
+     # Look for a percentage number
+     percentage_match = re.search(r'(\d+(?:\.\d+)?)%', text.lower())
+     if not percentage_match:
+         return 1.0  # no scaling by default
+
+     percentage = float(percentage_match.group(1))
+
+     # Decide whether to enlarge or shrink
+     if 'enlarge' in text.lower() or 'increase' in text.lower() or 'bigger' in text.lower():
+         return 1 + (percentage / 100)
+     elif 'shrink' in text.lower() or 'reduce' in text.lower() or 'smaller' in text.lower():
+         return 1 - (percentage / 100)
+     else:
+         # Treat as enlargement by default
+         return 1 + (percentage / 100)
+
+
+ def create_output_filename(input_path: str, suffix: str = "_resized") -> str:
+     """
+     Build an output filename from the input file path.
+
+     Args:
+         input_path: input file path
+         suffix: suffix to append
+
+     Returns:
+         Output file path
+     """
+     import os
+
+     base_name = os.path.splitext(input_path)[0]
+     extension = os.path.splitext(input_path)[1]
+
+     return f"{base_name}{suffix}{extension}"
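
As a quick sanity check on the fallback percentage parsing added above (a sketch, not part of the commit), the helper maps the example instructions from app.py to scale factors as follows:

    from utils import parse_percentage_to_scale_factor

    parse_percentage_to_scale_factor("enlarge 'Hello' by 50%")   # 1.0 + 0.50 -> 1.5
    parse_percentage_to_scale_factor("shrink 'Title' by 30%")    # 1.0 - 0.30 -> 0.7
    parse_percentage_to_scale_factor("make the title bigger")    # no '%' found -> 1.0, which app.py reports as unparsable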