Spaces:
Running
Running
| from huggingface_hub import hf_hub_download | |
| import streamlit as st | |
| import pandas as pd | |
| import matplotlib.pyplot as plt | |
| import os | |
| import zipfile | |
| import shutil | |
# Use Streamlit's wide page layout for the whole app.
st.set_page_config(layout="wide")

# Download the benchmark archive from the Hugging Face Hub. `results` holds the
# local path of the zip file, which unzip_dataset() opens.
with st.spinner("Downloading dataset"):
    results = hf_hub_download(
        filename="rd-tablebench.zip",
        repo_id="reducto/rd-tablebench",
        repo_type="dataset",
    )
def unzip_dataset():
    """Extract the downloaded archive once and return the dataset root.

    The zip file at the module-level ``results`` path is extracted into a
    staging directory and only renamed to ``unzipped_dataset`` after the
    extraction has finished. An interrupted extraction therefore never leaves
    behind a half-filled ``unzipped_dataset`` that later runs would mistake
    for a complete one.

    Must be called while ``results`` still refers to the zip archive.

    Returns:
        Relative path of the extracted ``rd-tablebench`` folder.
    """
    target_dir = "unzipped_dataset"
    if not os.path.exists(target_dir):
        staging_dir = f"{target_dir}.partial"
        # A leftover staging directory comes from an extraction that did not
        # finish; start from scratch.
        shutil.rmtree(staging_dir, ignore_errors=True)
        os.makedirs(staging_dir)
        with st.spinner("Unzipping dataset"):
            with zipfile.ZipFile(results, "r") as zip_ref:
                zip_ref.extractall(staging_dir)
        # Publish the result only once every member has been written.
        os.rename(staging_dir, target_dir)
    return f"{target_dir}/rd-tablebench"
# Let the user throw away the extracted copy so that it is extracted again.
if st.button("Redo Unzip"):
    if os.path.exists("unzipped_dataset"):
        shutil.rmtree("unzipped_dataset")
    st.rerun()

dataset = unzip_dataset()

# From here on `results` is the path of the score table, not the zip archive.
results = f"{dataset}/providers/scores.csv"
if not os.path.exists(results):
    # Explicit check instead of `assert`: asserts are stripped under
    # `python -O` and fail without any explanation for the user.
    st.error(
        f"Scores file not found at {results}. "
        "Use 'Redo Unzip' to extract the dataset again."
    )
    st.stop()
# Page-wide CSS for every rendered <table>: collapsed borders, whitespace kept
# as written, and bordered, left-aligned, normal-weight cells.
table_css = """
<style>
table {
font-family: arial, sans-serif;
border-collapse: collapse;
white-space: pre;
}
td, th {
border: 1px solid #dddddd;
text-align: left;
padding: 8px;
font-weight: normal;
}
</style>
"""
st.html(table_css)
# Load the score table; one row is shown at a time.
df = pd.read_csv(results)

if df.empty:
    # With no rows the index picker would get max_value=-1 (below min_value)
    # and the later row lookup would fail, so stop with a readable message.
    st.error("The scores file contains no rows.")
    st.stop()

last_index = len(df) - 1

# `current_index` lives in session state so the selection survives reruns.
if "current_index" not in st.session_state:
    st.session_state.current_index = 0

col1, col2, col3 = st.columns([2, 5, 2])

# Previous button in col1
with col1:
    # Spacer; presumably lines the button up with the number input in col2.
    st.html("<br/>")
    if st.button("⬅️ Previous", use_container_width=True):
        if st.session_state.current_index > 0:
            st.session_state.current_index -= 1
            st.rerun()

# Search box and Go button in col2
with col2:
    index_input = st.number_input(
        "Index",
        label_visibility="hidden",
        min_value=0,
        max_value=last_index,
        value=st.session_state.current_index,
        step=1,
    )
    if st.button("Go", use_container_width=True):
        st.session_state.current_index = int(index_input)
        st.rerun()

# Next button in col3
with col3:
    st.html("<br/>")
    if st.button("Next ➡️", use_container_width=True):
        if st.session_state.current_index < last_index:
            st.session_state.current_index += 1
            st.rerun()
col1, col2 = st.columns([1, 2])

# Providers to compare; each one is looked up as a "<name>_score" column of
# the score table.
providers = [
    "reducto",
    "azure",
    "textract",
    "gcloud",
    "unstructured",
    "gpt4o",
    "chunkr",
]

with col1:
    row = df.iloc[st.session_state.current_index]

    # Extract scores. pandas reads empty CSV cells as NaN, which is never
    # `None`, so use pd.isna to make a missing score plot as 0.
    raw_scores = (row[f"{p}_score"] for p in providers)
    scores = [0 if pd.isna(score) else score for score in raw_scores]

    fig, ax = plt.subplots(figsize=(6, 10))
    # barh draws the first entry at the bottom; reverse both lists so the
    # first provider ends up at the top.
    bars = ax.barh(providers[::-1], scores[::-1])

    # Customize plot
    ax.set_title("Provider Scores Comparison")
    ax.set_ylabel("Providers")
    ax.set_xlabel("Scores")
    # Fixed axis range; presumably scores lie in [0, 1] and the extra 0.1
    # leaves room for the value labels.
    ax.set_xlim(0, 1.1)

    # Write each score next to the end of its bar.
    for bar in bars:
        width = bar.get_width()
        ax.text(
            width,
            bar.get_y() + bar.get_height() / 2.0,
            f"{width:.3f}",
            ha="left",
            va="center",
        )

    plt.tight_layout()
    st.pyplot(fig)
    # pyplot keeps every figure alive until it is closed; without this each
    # rerun of the script would leave another figure in memory.
    plt.close(fig)
with col2:
    # The image and HTML files reuse the PDF's relative path with another
    # extension.
    pdf_path = row["pdf_path"]
    image_name = pdf_path.replace(".pdf", ".jpg")
    html_name = pdf_path.replace(".pdf", ".html")

    st.image(f"{dataset}/_images/{image_name}", use_column_width=True)
    st.write(row)

    # st.html is given the path of the HTML file, not the markup itself.
    st.subheader("Groundtruth")
    st.html(f"{dataset}/groundtruth/{html_name}")

    # One expander per provider, with an error where no output file exists.
    st.subheader("Provider Outputs")
    for p in providers:
        with st.expander(p):
            provider_html = f"{dataset}/providers/{p}/{html_name}"
            if not os.path.exists(provider_html):
                st.error(f"{p} failed to produce a table output for this image")
            else:
                st.html(provider_html)