HTML-to-Markdown

Paused

App Files Files Community

maxiw committed on Sep 12, 2024

Commit

8a04aff

1 Parent(s): 7df7460

clean up and add markdownify

Browse files

Files changed (2) hide show

app.py +20 -13
requirements.txt +2 -1

app.py CHANGED Viewed

@@ -2,19 +2,23 @@ import gradio as gr
 from transformers import AutoTokenizer, AutoModelForCausalLM
 import spaces
 import re
 models = {
-    "jinaai/reader-lm-0.5b": AutoModelForCausalLM.from_pretrained("jinaai/reader-lm-0.5b", trust_remote_code=True).to("cuda").eval(),
 }
 tokenizers = {
     "jinaai/reader-lm-0.5b": AutoTokenizer.from_pretrained("jinaai/reader-lm-0.5b", trust_remote_code=True),
 }
 @spaces.GPU
-def run_example(html_content, model_id="jinaai/reader-lm-0.5b"):
     model = models[model_id]
     tokenizer = tokenizers[model_id]
     messages = [{"role": "user", "content": html_content}]
@@ -23,7 +27,9 @@ def run_example(html_content, model_id="jinaai/reader-lm-0.5b"):
     outputs = model.generate(inputs, max_new_tokens=1024, temperature=0, do_sample=False, repetition_penalty=1.08)
     pattern = r"<\|im_start\|>assistant(.*?)<\|im_end\|>"
     assistant_response = re.findall(pattern, tokenizer.decode(outputs[0]), re.DOTALL)
-    return assistant_response[0]
 css = """
@@ -37,16 +43,17 @@ css = """
 with gr.Blocks(css=css) as demo:
     gr.Markdown("""
     # HTML-to-Markdown
     """)
-    with gr.Tab(label="Main"):
-        with gr.Row():
-            with gr.Column():
-                model_selector = gr.Dropdown(choices=list(models.keys()), label="Model", value="jinaai/reader-lm-0.5b")
-                html_content = gr.Textbox(label="HTML")
-                submit_btn = gr.Button(value="Submit")
-            with gr.Column():
-                output_text = gr.Textbox(label="Markdown")
-        submit_btn.click(run_example, [html_content, model_selector], [output_text])
 demo.launch(debug=True)

 from transformers import AutoTokenizer, AutoModelForCausalLM
 import spaces
 import re
+from markdownify import markdownify
 models = {
+    "jinaai/reader-lm-0.5b": AutoModelForCausalLM.from_pretrained("jinaai/reader-lm-0.5b", trust_remote_code=True).eval().to("cuda"),
+    "jinaai/reader-lm-1.5b": AutoModelForCausalLM.from_pretrained("jinaai/reader-lm-1.5b", trust_remote_code=True).eval().to("cuda")
 }
 tokenizers = {
     "jinaai/reader-lm-0.5b": AutoTokenizer.from_pretrained("jinaai/reader-lm-0.5b", trust_remote_code=True),
+    "jinaai/reader-lm-1.5b": AutoTokenizer.from_pretrained("jinaai/reader-lm-1.5b", trust_remote_code=True),
 }
 @spaces.GPU
+def run_example(html_content, model_id):
+    print("Start Model Processing")
     model = models[model_id]
     tokenizer = tokenizers[model_id]
     messages = [{"role": "user", "content": html_content}]
     outputs = model.generate(inputs, max_new_tokens=1024, temperature=0, do_sample=False, repetition_penalty=1.08)
     pattern = r"<\|im_start\|>assistant(.*?)<\|im_end\|>"
     assistant_response = re.findall(pattern, tokenizer.decode(outputs[0]), re.DOTALL)
+    print("Start Markdownify Processing")
+    markdownify_output = markdownify(html_content)
+    return assistant_response[0], markdownify_output
 css = """
 with gr.Blocks(css=css) as demo:
     gr.Markdown("""
     # HTML-to-Markdown
+    Try out model based HTML-to-Markdown with [Reader LM](https://huggingface.co/jinaai/reader-lm-1.5b) and rule based with [Markdownify](https://github.com/matthewwithanm/python-markdownify).
     """)
+    with gr.Row():
+        with gr.Column():
+            model_selector = gr.Dropdown(choices=list(models.keys()), label="Model", value="jinaai/reader-lm-0.5b")
+            html_content = gr.Textbox(label="HTML")
+            submit_btn = gr.Button(value="Submit")
+        with gr.Column():
+            model_output_text = gr.Textbox(label="Reader LM Output")
+            markdownify_output = gr.Textbox(label="Markdownify Output")
+    submit_btn.click(run_example, [html_content, model_selector], [model_output_text, markdownify_output])
 demo.launch(debug=True)

requirements.txt CHANGED Viewed

	@@ -1 +1,2 @@
1	- transformers<=4.43.4


1	+ transformers<=4.43.4
2	+ markdownify==0.13.1