Spaces:
Running
on
Zero
Running
on
Zero
Update app.py
Browse files
app.py
CHANGED
|
@@ -99,7 +99,6 @@ def model_inference(
|
|
| 99 |
"pixel_values": inputs.pixel_values,
|
| 100 |
"attention_mask": inputs.attention_mask,
|
| 101 |
"num_return_sequences": 1,
|
| 102 |
-
"no_repeat_ngram_size": 10,
|
| 103 |
"max_new_tokens": 8192,
|
| 104 |
}
|
| 105 |
|
|
@@ -111,24 +110,30 @@ def model_inference(
|
|
| 111 |
|
| 112 |
yield "..."
|
| 113 |
buffer = ""
|
| 114 |
-
|
| 115 |
|
| 116 |
for new_text in streamer:
|
| 117 |
-
|
| 118 |
-
|
| 119 |
-
doctag_output += new_text
|
| 120 |
yield buffer
|
| 121 |
|
| 122 |
-
|
| 123 |
-
|
| 124 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 125 |
doc = DoclingDocument(name="Document")
|
| 126 |
if "<chart>" in doctag_output:
|
| 127 |
doctag_output = doctag_output.replace("<chart>", "<otsl>").replace("</chart>", "</otsl>")
|
| 128 |
doctag_output = re.sub(r'(<loc_500>)(?!.*<loc_500>)<[^>]+>', r'\1', doctag_output)
|
| 129 |
-
|
|
|
|
|
|
|
| 130 |
doctags_doc = DocTagsDocument.from_doctags_and_image_pairs([doctag_output], images)
|
| 131 |
doc.load_from_doctags(doctags_doc)
|
|
|
|
| 132 |
yield f"**MD Output:**\n\n{doc.export_to_markdown()}"
|
| 133 |
|
| 134 |
examples=[[{"text": "Convert this page to docling.", "files": ["example_images/2d0fbcc50e88065a040a537b717620e964fb4453314b71d83f3ed3425addcef6.png"]}],
|
|
|
|
| 99 |
"pixel_values": inputs.pixel_values,
|
| 100 |
"attention_mask": inputs.attention_mask,
|
| 101 |
"num_return_sequences": 1,
|
|
|
|
| 102 |
"max_new_tokens": 8192,
|
| 103 |
}
|
| 104 |
|
|
|
|
| 110 |
|
| 111 |
yield "..."
|
| 112 |
buffer = ""
|
| 113 |
+
full_output = ""
|
| 114 |
|
| 115 |
for new_text in streamer:
|
| 116 |
+
full_output += new_text
|
| 117 |
+
buffer += html.escape(new_text)
|
|
|
|
| 118 |
yield buffer
|
| 119 |
|
| 120 |
+
cleaned_output = full_output.replace("<end_of_utterance>", "").strip()
|
| 121 |
+
|
| 122 |
+
if cleaned_output:
|
| 123 |
+
doctag_output = cleaned_output
|
| 124 |
+
yield cleaned_output
|
| 125 |
+
|
| 126 |
+
if any(tag in doctag_output for tag in ["<doctag>", "<otsl>", "<code>", "<chart>", "<formula>"]):
|
| 127 |
doc = DoclingDocument(name="Document")
|
| 128 |
if "<chart>" in doctag_output:
|
| 129 |
doctag_output = doctag_output.replace("<chart>", "<otsl>").replace("</chart>", "</otsl>")
|
| 130 |
doctag_output = re.sub(r'(<loc_500>)(?!.*<loc_500>)<[^>]+>', r'\1', doctag_output)
|
| 131 |
+
|
| 132 |
+
print(doctag_output)
|
| 133 |
+
|
| 134 |
doctags_doc = DocTagsDocument.from_doctags_and_image_pairs([doctag_output], images)
|
| 135 |
doc.load_from_doctags(doctags_doc)
|
| 136 |
+
print(doc)
|
| 137 |
yield f"**MD Output:**\n\n{doc.export_to_markdown()}"
|
| 138 |
|
| 139 |
examples=[[{"text": "Convert this page to docling.", "files": ["example_images/2d0fbcc50e88065a040a537b717620e964fb4453314b71d83f3ed3425addcef6.png"]}],
|