taprosoft
committed on
Commit
·
007293f
1
Parent(s):
1ae7633
fix: use ZERO
Browse files
- Dockerfile +0 -42
- README.md +4 -1
- app.py +7 -0
- backends/smoldocling.py +2 -0
- requirements.txt +2 -1
Dockerfile
DELETED
|
@@ -1,42 +0,0 @@
|
|
| 1 |
-
FROM nvidia/cuda:12.1.1-cudnn8-devel-ubuntu22.04
|
| 2 |
-
|
| 3 |
-
ARG DEBIAN_FRONTEND=noninteractive
|
| 4 |
-
|
| 5 |
-
ENV PYTHONUNBUFFERED=1
|
| 6 |
-
|
| 7 |
-
RUN apt-get update && apt-get install --no-install-recommends -y \
|
| 8 |
-
build-essential \
|
| 9 |
-
python3.10-dev \
|
| 10 |
-
python3-pip \
|
| 11 |
-
wget \
|
| 12 |
-
git \
|
| 13 |
-
ffmpeg \
|
| 14 |
-
poppler-utils \
|
| 15 |
-
libpoppler-dev \
|
| 16 |
-
tesseract-ocr \
|
| 17 |
-
&& apt-get clean && rm -rf /var/lib/apt/lists/*
|
| 18 |
-
|
| 19 |
-
WORKDIR /code
|
| 20 |
-
|
| 21 |
-
COPY ./requirements.txt /code/requirements.txt
|
| 22 |
-
|
| 23 |
-
# Set up a new user named "user" with user ID 1000
|
| 24 |
-
RUN useradd -m -u 1000 user
|
| 25 |
-
# Switch to the "user" user
|
| 26 |
-
USER user
|
| 27 |
-
# Set home to the user's home directory
|
| 28 |
-
ENV HOME=/home/user \
|
| 29 |
-
PATH=/home/user/.local/bin:$PATH \
|
| 30 |
-
PYTHONPATH=$HOME/app \
|
| 31 |
-
PYTHONUNBUFFERED=1 \
|
| 32 |
-
GRADIO_SERVER_NAME=0.0.0.0
|
| 33 |
-
|
| 34 |
-
RUN pip3 install --no-cache-dir --upgrade -r /code/requirements.txt
|
| 35 |
-
|
| 36 |
-
# Set the working directory to the user's home directory
|
| 37 |
-
WORKDIR $HOME/app
|
| 38 |
-
|
| 39 |
-
# Copy the current directory contents into the container at $HOME/app setting the owner to the user
|
| 40 |
-
COPY --chown=user . $HOME/app
|
| 41 |
-
|
| 42 |
-
CMD ["python3", "app.py"]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
README.md
CHANGED
|
@@ -3,7 +3,10 @@ title: SmolDoclingPreview
|
|
| 3 |
emoji: 🐢
|
| 4 |
colorFrom: blue
|
| 5 |
colorTo: green
|
| 6 |
-
sdk:
|
|
|
|
|
|
|
|
|
|
| 7 |
pinned: false
|
| 8 |
header: mini
|
| 9 |
short_description: Convert PDFs to Markdown with SmolDoclingPreview
|
|
|
|
| 3 |
emoji: 🐢
|
| 4 |
colorFrom: blue
|
| 5 |
colorTo: green
|
| 6 |
+
sdk: gradio
|
| 7 |
+
sdk_version: 5.12.0
|
| 8 |
+
app_file: app.py
|
| 9 |
+
license: apache-2.0
|
| 10 |
pinned: false
|
| 11 |
header: mini
|
| 12 |
short_description: Convert PDFs to Markdown with SmolDoclingPreview
|
app.py
CHANGED
|
@@ -26,6 +26,13 @@ MAX_SELECTED_METHODS = int(os.getenv("MAX_SELECTED_METHODS", "6"))
|
|
| 26 |
MAX_PAGES = int(os.getenv("MAX_PAGES", "2"))
|
| 27 |
|
| 28 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 29 |
def convert_document(path, method, start_page=0, enabled=True):
|
| 30 |
if enabled:
|
| 31 |
print("Processing file", path, "with method", method)
|
|
|
|
| 26 |
MAX_PAGES = int(os.getenv("MAX_PAGES", "2"))
|
| 27 |
|
| 28 |
|
| 29 |
+
# Install poppler-utils and tesseract-ocr
|
| 30 |
+
import os
|
| 31 |
+
|
| 32 |
+
os.system("apt-get update")
|
| 33 |
+
os.system("apt-get install --no-install-recommends -y poppler-utils tesseract-ocr")
|
| 34 |
+
|
| 35 |
+
|
| 36 |
def convert_document(path, method, start_page=0, enabled=True):
|
| 37 |
if enabled:
|
| 38 |
print("Processing file", path, "with method", method)
|
backends/smoldocling.py
CHANGED
|
@@ -3,6 +3,7 @@
|
|
| 3 |
# pip install docling_core
|
| 4 |
# pip install transformers
|
| 5 |
|
|
|
|
| 6 |
import torch
|
| 7 |
from docling_core.types.doc import DoclingDocument
|
| 8 |
from docling_core.types.doc.document import DocTagsDocument
|
|
@@ -32,6 +33,7 @@ messages = [
|
|
| 32 |
]
|
| 33 |
|
| 34 |
|
|
|
|
| 35 |
def convert_smoldocling(path: str, file_name: str):
|
| 36 |
doc = PDF(path)
|
| 37 |
output_md = ""
|
|
|
|
| 3 |
# pip install docling_core
|
| 4 |
# pip install transformers
|
| 5 |
|
| 6 |
+
import spaces
|
| 7 |
import torch
|
| 8 |
from docling_core.types.doc import DoclingDocument
|
| 9 |
from docling_core.types.doc.document import DocTagsDocument
|
|
|
|
| 33 |
]
|
| 34 |
|
| 35 |
|
| 36 |
+
@spaces.GPU(duration=120)
|
| 37 |
def convert_smoldocling(path: str, file_name: str):
|
| 38 |
doc = PDF(path)
|
| 39 |
output_md = ""
|
requirements.txt
CHANGED
|
@@ -2,7 +2,6 @@ gradio-pdf>=0.0.21
|
|
| 2 |
PyMuPDF>=1.24.9,<1.24.14
|
| 3 |
pymupdf4llm
|
| 4 |
unstructured[pdf]
|
| 5 |
-
ultralytics>=8.3.48
|
| 6 |
openai
|
| 7 |
img2table
|
| 8 |
gmft
|
|
@@ -10,3 +9,5 @@ transformers<5.0.0,>=4.45.2
|
|
| 10 |
pypdf
|
| 11 |
docling_core
|
| 12 |
opencv-contrib-python
|
|
|
|
|
|
|
|
|
| 2 |
PyMuPDF>=1.24.9,<1.24.14
|
| 3 |
pymupdf4llm
|
| 4 |
unstructured[pdf]
|
|
|
|
| 5 |
openai
|
| 6 |
img2table
|
| 7 |
gmft
|
|
|
|
| 9 |
pypdf
|
| 10 |
docling_core
|
| 11 |
opencv-contrib-python
|
| 12 |
+
huggingface_hub
|
| 13 |
+
spaces
|