Spaces:

Tonic
/

GOT-OCR

Running

Tonic committed on Sep 15, 2024

Commit

d1a7b5b

verified ·

1 Parent(s): eeaf186

improve latex parsing

Files changed (1) hide show

app.py CHANGED Viewed

@@ -133,34 +133,43 @@ def update_inputs(task):
         ]
 def parse_latex_output(res):
     """Wrap LaTeX-looking and key/value lines of *res* in ``$$`` fences.

     The text is processed line by line. Each line is stripped and blank
     lines are dropped. A line is treated as math when it contains one of
     the characters ``{ } [ ] \\ $ _ ^`` or when it looks like a
     ``key: value`` pair (contains ``:`` and neither starts with ``{`` nor
     ends with ``}``). Consecutive math lines share a single ``$$`` ...
     ``$$`` block; all other lines are emitted unchanged.

     Args:
         res: Raw text to post-process.

     Returns:
         The processed lines joined with ``'\\n'``.
     """
     # The caret sits inside a character class so it is a literal '^'. As a
     # bare pattern it is the start-of-string anchor, which matches every
     # line and would classify all text as LaTeX.
     latex_markup = re.compile(r'[{}\[\]\\$_^]')
     parsed_lines = []
     in_latex = False
     for line in res.split('\n'):
         line = line.strip()
         if not line:
             continue
         is_key_value = ':' in line and not line.startswith('{') and not line.endswith('}')
         needs_math = bool(latex_markup.search(line)) or is_key_value
         # A fence is emitted whenever we cross a math/plain boundary.
         if needs_math != in_latex:
             parsed_lines.append('$$')
             in_latex = needs_math
         parsed_lines.append(line)
     if in_latex:
         # Close a block that runs to the end of the input.
         parsed_lines.append('$$')
     return '\n'.join(parsed_lines)
 def ocr_demo(image, task, ocr_type, ocr_box, ocr_color):
     res, html_content, unique_id = process_image(image, task, ocr_type, ocr_box, ocr_color)

         ]
 def parse_latex_output(res):
     """Wrap runs of LaTeX-looking lines of *res* in ``$$`` fences.

     The text is processed line by line:

     * Lines that contain ``$$``, and every line between an opening and a
       closing ``$$``, are copied through verbatim so that math which is
       already delimited is not wrapped a second time.
     * Other lines are stripped. A line containing one of the characters
       ``{ } [ ] \\ $ _ ^ "`` is treated as LaTeX; consecutive LaTeX lines
       are collected and emitted as one ``$$`` ... ``$$`` block.
     * Blank lines are preserved. Blank lines between two LaTeX lines stay
       inside the block; blank lines that trail a block are emitted after
       its closing fence.
     * All remaining lines are emitted unchanged.

     Args:
         res: Raw text to post-process.

     Returns:
         The processed lines joined with ``'\\n'``.
     """
     # The caret sits inside a character class so it is a literal '^'. As a
     # bare pattern it is the start-of-string anchor, which matches every
     # line and would classify all text as LaTeX.
     # NOTE(review): '"' is kept from the original pattern list; confirm
     # that quoted plain text really should be rendered as math.
     latex_markup = re.compile(r'[{}\[\]\\$_^"]')
     parsed_lines = []
     latex_buffer = []
     in_existing_block = False

     def flush_latex():
         # Emit the pending block, moving trailing blank lines outside it.
         trailing_blanks = 0
         while latex_buffer and not latex_buffer[-1]:
             latex_buffer.pop()
             trailing_blanks += 1
         if latex_buffer:
             parsed_lines.append('$$')
             parsed_lines.extend(latex_buffer)
             parsed_lines.append('$$')
             latex_buffer.clear()
         parsed_lines.extend([''] * trailing_blanks)

     for raw_line in res.split('\n'):
         fence_count = raw_line.count('$$')
         if in_existing_block or fence_count:
             # Already-delimited math: pass through untouched.
             flush_latex()
             parsed_lines.append(raw_line)
             if fence_count % 2:
                 # An odd number of fences opens or closes a multi-line block.
                 in_existing_block = not in_existing_block
             continue
         line = raw_line.strip()
         if not line:
             # Keep the blank line with whatever is currently being built.
             if latex_buffer:
                 latex_buffer.append(line)
             else:
                 parsed_lines.append(line)
         elif latex_markup.search(line):
             latex_buffer.append(line)
         else:
             flush_latex()
             parsed_lines.append(line)
     flush_latex()
     return '\n'.join(parsed_lines)
 def ocr_demo(image, task, ocr_type, ocr_box, ocr_color):
     res, html_content, unique_id = process_image(image, task, ocr_type, ocr_box, ocr_color)