Instantnewdesign committed on
Commit
338ecf0
·
verified ·
1 Parent(s): ce41633

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +15 -5
app.py CHANGED
@@ -6,28 +6,38 @@ import os
6
 
7
  # Init client
8
  model_path = "opendatalab/MinerU2.5-2509-1.2B"
9
- client = MinerUClient(backend="transformers", model_path=model_path)
 
 
 
 
 
 
 
10
 
11
- def extract_from_file(file):
12
  # Vérifier si PDF ou image
13
  ext = os.path.splitext(file.name)[-1].lower()
14
  images = []
15
 
16
  if ext == ".pdf":
17
  doc = fitz.open(file.name)
18
- for page in doc:
 
19
  pix = page.get_pixmap()
20
  img = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
21
  images.append(img)
 
22
  else:
23
  images.append(Image.open(file.name))
24
 
25
  results = []
26
- for img in images:
 
27
  blocks = client.two_step_extract(img)
28
  text_blocks = [b.text for b in blocks if hasattr(b, "text")]
29
  results.append("\n".join(text_blocks))
30
 
 
31
  return "\n\n--- PAGE ---\n\n".join(results)
32
 
33
  demo = gr.Interface(
@@ -35,7 +45,7 @@ demo = gr.Interface(
35
  inputs=gr.File(type="filepath", label="Upload PDF or Image"),
36
  outputs=gr.Textbox(label="Extracted Text", lines=20),
37
  title="MinerU2.5 Document Extractor",
38
- description="Upload a PDF or Image to extract structured text using MinerU2.5."
39
  )
40
 
41
  demo.launch()
 
6
 
7
  # Init client
8
  model_path = "opendatalab/MinerU2.5-2509-1.2B"
9
+ client = MinerUClient(
10
+ backend="transformers",
11
+ model_path=model_path,
12
+ device="cuda" # Utilisation GPU obligatoire
13
+ )
14
+
15
+ def extract_from_file(file, progress=gr.Progress()):
16
+ progress(0, desc="Analyse du fichier...")
17
 
 
18
  # Vérifier si PDF ou image
19
  ext = os.path.splitext(file.name)[-1].lower()
20
  images = []
21
 
22
  if ext == ".pdf":
23
  doc = fitz.open(file.name)
24
+ total_pages = len(doc)
25
+ for i, page in enumerate(doc):
26
  pix = page.get_pixmap()
27
  img = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
28
  images.append(img)
29
+ progress((i+1)/total_pages, desc=f"Conversion page {i+1}/{total_pages}")
30
  else:
31
  images.append(Image.open(file.name))
32
 
33
  results = []
34
+ for i, img in enumerate(images):
35
+ progress(i/len(images), desc=f"Extraction page {i+1}/{len(images)}")
36
  blocks = client.two_step_extract(img)
37
  text_blocks = [b.text for b in blocks if hasattr(b, "text")]
38
  results.append("\n".join(text_blocks))
39
 
40
+ progress(1, desc="Extraction terminée ✅")
41
  return "\n\n--- PAGE ---\n\n".join(results)
42
 
43
  demo = gr.Interface(
 
45
  inputs=gr.File(type="filepath", label="Upload PDF or Image"),
46
  outputs=gr.Textbox(label="Extracted Text", lines=20),
47
  title="MinerU2.5 Document Extractor",
48
+ description="Upload a PDF or Image to extract structured text using MinerU2.5 with GPU."
49
  )
50
 
51
  demo.launch()