yingzhac committed on
Commit
e219479
·
1 Parent(s): 986650d

🔧 Fix major issues in text processing

Browse files

1. 🌍 OCR: Switch to English-only recognition
2. 🤖 AI: Use gpt-4.1-nano model instead of gpt-4o-mini
3. 🎯 Smart Text Selection: Fix fallback parsing that always picked first result
- Add intelligent text matching from user prompts
- Support quoted text detection ('text' or "text")
- Add fuzzy matching for better text selection
- Provide clear error messages when the text is not found
4. 🔄 UI: Enable AI parsing by default for a better user experience

Files changed (2) hide show
  1. app.py +54 -7
  2. prompt_handler.py +1 -1
app.py CHANGED
@@ -16,8 +16,50 @@ from utils import (
16
  create_output_filename
17
  )
18
 
19
- # Initialize the text resizer with GPU support
20
- text_resizer = TextResizer(languages=['en', 'ch_sim'], gpu=True)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
21
 
22
  @spaces.GPU
23
  def process_image(input_image, user_prompt, use_ai_parsing=True, api_key=None):
@@ -57,8 +99,13 @@ def process_image(input_image, user_prompt, use_ai_parsing=True, api_key=None):
57
  if scale_factor == 1.0:
58
  return None, "❌ 错误: 无法从用户指令中解析出缩放信息"
59
 
60
- # Use the first detected text as target
61
- target_text = ocr_results[0][1].strip()
 
 
 
 
 
62
  status_msg = f"✅ 备用解析: 目标文字='{target_text}', 缩放因子={scale_factor}"
63
 
64
  except Exception as e:
@@ -224,9 +271,9 @@ with gr.Blocks(css=css, title="智能文字缩放工具") as demo:
224
  # Advanced settings
225
  with gr.Accordion("⚙️ 高级设置", open=False):
226
  use_ai_parsing = gr.Checkbox(
227
- label="🤖 使用AI解析 (需要OpenAI API密钥)",
228
- value=False,
229
- info="使用GPT模型理解自然语言指令"
230
  )
231
 
232
  api_key = gr.Textbox(
 
16
  create_output_filename
17
  )
18
 
19
+ # Initialize the text resizer with GPU support (English only)
20
+ text_resizer = TextResizer(languages=['en'], gpu=True)
21
+
22
def find_target_text_in_prompt(user_prompt, ocr_results):
    """
    Pick the OCR-detected text that the user's instruction refers to.

    Matching is case-insensitive and tried in decreasing order of
    confidence:

    1. Text the user wrapped in single or double quotes. For each quoted
       span an exact match against any OCR text is preferred; only if no
       OCR text matches exactly is a substring match (in either
       direction) accepted.
    2. An OCR text that appears verbatim inside the prompt.
    3. A prompt word longer than two characters that appears inside an
       OCR text.

    Args:
        user_prompt: The instruction typed by the user.
        ocr_results: Iterable of 3-item entries whose middle item is the
            recognised text string (the other two items are ignored).

    Returns:
        The matching OCR text (whitespace-stripped, original casing), or
        None when nothing in the prompt can be tied to an OCR text.
    """
    import re

    # Blank OCR entries are dropped: an empty string is a substring of
    # every prompt, so it would win the containment checks below and hide
    # real matches behind a falsy "" result.
    stripped_texts = [text.strip() for _, text, _ in ocr_results]
    candidates = [(text, text.lower()) for text in stripped_texts if text]
    if not candidates:
        return None

    # 1. Quoted text (single or double quotes) is the strongest signal.
    for quoted_text in re.findall(r'["\']([^"\']+)["\']', user_prompt):
        needle = quoted_text.strip().lower()
        if not needle:
            continue
        # Exact matches are checked across ALL OCR texts first, so that
        # "Hello" is not shadowed by an earlier "Hello World".
        for original, lowered in candidates:
            if needle == lowered:
                return original
        for original, lowered in candidates:
            if needle in lowered or lowered in needle:
                return original

    # 2. No usable quotes: look for an OCR text contained in the prompt.
    user_prompt_lower = user_prompt.lower()
    for original, lowered in candidates:
        if lowered in user_prompt_lower:
            return original

    # 3. Last resort: a single prompt word found inside an OCR text.
    for word in re.findall(r'\b\w+\b', user_prompt_lower):
        if len(word) <= 2:  # very short words match almost anything
            continue
        for original, lowered in candidates:
            if word in lowered:
                return original

    return None
63
 
64
  @spaces.GPU
65
  def process_image(input_image, user_prompt, use_ai_parsing=True, api_key=None):
 
99
  if scale_factor == 1.0:
100
  return None, "❌ 错误: 无法从用户指令中解析出缩放信息"
101
 
102
+ # Try to find target text from user prompt
103
+ target_text = find_target_text_in_prompt(user_prompt, ocr_results)
104
+ if not target_text:
105
+ # If no specific text found in prompt, ask user to specify
106
+ available_texts = [text.strip() for _, text, _ in ocr_results]
107
+ return None, f"❌ 错误: 无法确定要调整的文字。请在指令中明确指定文字,如 'enlarge \"具体文字\" by 50%'\n\n📝 可用的文字: {available_texts}"
108
+
109
  status_msg = f"✅ 备用解析: 目标文字='{target_text}', 缩放因子={scale_factor}"
110
 
111
  except Exception as e:
 
271
  # Advanced settings
272
  with gr.Accordion("⚙️ 高级设置", open=False):
273
  use_ai_parsing = gr.Checkbox(
274
+ label="🤖 使用AI解析 (推荐,需要OpenAI API密钥)",
275
+ value=True,
276
+ info="使用GPT-4.1-nano模型智能理解自然语言指令"
277
  )
278
 
279
  api_key = gr.Textbox(
prompt_handler.py CHANGED
@@ -5,7 +5,7 @@ from utils import format_ocr_results_for_prompt, robust_parse_reply
5
 
6
 
7
  class PromptHandler:
8
- def __init__(self, api_key: str = None, model: str = "gpt-4o-mini"):
9
  """
10
  初始化Prompt处理器
11
 
 
5
 
6
 
7
  class PromptHandler:
8
+ def __init__(self, api_key: str = None, model: str = "gpt-4.1-nano"):
9
  """
10
  初始化Prompt处理器
11