File size: 3,143 Bytes
b83ba6e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
# %% [markdown]
# ### Step 1: Reading PDF Files
# %%
import os

# Setup directories: where the source PDFs live and where page images go.
pdf_directory = r"F:\Preprocessing"
output_directory = r"F:\Images"
os.makedirs(output_directory, exist_ok=True)

# NOTE: no conversion is run at this point. A bare
# `convert_from_path(pdf_path, dpi=dpi)` call here fails with NameError,
# because nothing has defined `pdf_path` or `dpi` yet; the conversion is done
# per file by `pdf_to_images` in Step 2.

# %% [markdown]
# ### Step 2: Convert PDF files to Images

# %%
import os
import cv2
import numpy as np
from pdf2image import convert_from_path
import glob

# Convert one PDF into per-page JPEG images
def pdf_to_images(pdf_path, output_dir, dpi=300):
    """Render every page of a PDF and save each page as a JPEG file.

    Pages are written to ``output_dir`` as ``<pdf stem>_page_<n>.jpg``, with
    ``n`` starting at 1.

    Args:
        pdf_path: Path of the PDF file to convert.
        output_dir: Directory that receives the JPEG files.
        dpi: Rendering resolution passed to ``convert_from_path``.

    Returns:
        The number of images actually written. Errors are reported on stdout
        instead of being raised; the count then covers only the pages saved
        before the failure (0 if the PDF could not be rendered at all).
    """
    saved = 0
    try:
        # The stem is identical for every page, so compute it once.
        stem = os.path.splitext(os.path.basename(pdf_path))[0]
        pages = convert_from_path(pdf_path, dpi=dpi)
        for page_number, page in enumerate(pages, start=1):
            image_path = os.path.join(output_dir, f"{stem}_page_{page_number}.jpg")
            page.save(image_path, "JPEG", quality=95)
            # Count only after a successful save so the total stays accurate
            # even when a later page fails.
            saved += 1
    except Exception as e:
        # Best effort: one broken PDF must not abort a batch run.
        print(f"✗ Error processing {pdf_path}: {e}")
    return saved

# Convert every PDF found in the input directory
def process_all_pdfs():
    """Convert all ``*.pdf`` files in ``pdf_directory`` and print a summary.

    Each file is handed to ``pdf_to_images`` with ``output_directory`` as the
    destination. When the directory contains no PDF, a message is printed and
    the function returns without converting anything.
    """
    pattern = os.path.join(pdf_directory, "*.pdf")
    found = glob.glob(pattern)

    if not found:
        print(f"No PDF files found in {pdf_directory}")
        return

    # Add up the per-file image counts reported by pdf_to_images.
    image_count = sum(pdf_to_images(path, output_directory) for path in found)

    print(f"\n✓ Tổng số file PDF: {len(found)}")
    print(f"✓ Tổng số ảnh đã chuyển đổi: {image_count}")

# MAIN EXECUTION
# Entry point: print the configuration, check the input directory, then
# convert every PDF it contains.
if __name__ == "__main__":
    print("PDF TO IMAGES CONVERTER")
    print(f"Input directory: {pdf_directory}")
    print(f"Output directory: {output_directory}")
    print()

    if not os.path.exists(pdf_directory):
        print(f"✗ Input directory does not exist: {pdf_directory}")
        # `exit` is a convenience helper installed by the `site` module for
        # interactive sessions and is not guaranteed to exist (e.g. under
        # `python -S`); raising SystemExit ends the run with the same status
        # code without depending on it.
        raise SystemExit(1)

    process_all_pdfs()
    print("\n✓ Processing completed!")

# %% [markdown]
# ### Step 3: Image Preprocessing

# %%
import os
import cv2
import numpy as np
from PIL import Image

def preprocess_image(image_path):
    """Load an image and return a contrast-enhanced, binarized version of it.

    Pipeline: RGB -> grayscale, CLAHE contrast equalization, Otsu
    thresholding, then a dilation pass.

    Args:
        image_path: Path of the image file; it is opened with PIL.

    Returns:
        A 2-D ``uint8`` NumPy array whose pixels are either 0 or 255.
    """
    # Use a context manager so the file handle is released right away, and
    # normalize to 3-channel RGB first: grayscale, palette or CMYK files would
    # otherwise reach cvtColor with a channel layout that COLOR_RGB2GRAY
    # either rejects or misinterprets.
    with Image.open(image_path) as pil_img:
        img = np.array(pil_img.convert("RGB"))

    gray = cv2.cvtColor(img, cv2.COLOR_RGB2GRAY)

    # Local (tile-based) histogram equalization to even out contrast.
    clahe = cv2.createCLAHE(clipLimit=2.0, tileGridSize=(8, 8))
    contrast_img = clahe.apply(gray)

    # The threshold value 0 is a placeholder: THRESH_OTSU picks it
    # automatically.
    _, binary = cv2.threshold(contrast_img, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)

    # NOTE(review): with a 1x1 kernel each pixel's neighbourhood is only the
    # pixel itself, so this dilation should leave `binary` unchanged. Use a
    # larger kernel if the strokes are meant to be altered; confirm intent.
    kernel = np.ones((1, 1), np.uint8)
    bold_img = cv2.dilate(binary, kernel, iterations=1)

    return bold_img

# Input and output folders
input_folder = r"F:\Images"
output_folder = r"F:\Images_Processed"
os.makedirs(output_folder, exist_ok=True)

# File extensions treated as images (matched case-insensitively)
IMAGE_EXTENSIONS = (".jpg", ".jpeg", ".png", ".bmp")

# Walk through every entry of the input folder
for filename in os.listdir(input_folder):
    # Skip anything that is not an image file
    if not filename.lower().endswith(IMAGE_EXTENSIONS):
        continue

    input_path = os.path.join(input_folder, filename)
    output_path = os.path.join(output_folder, filename)

    try:
        processed_img = preprocess_image(input_path)

        # Convert back to a PIL image so it can be saved to a Unicode path
        pil_result = Image.fromarray(processed_img)
        pil_result.save(output_path)

    except Exception as e:
        # Report the failure and carry on with the next file
        print(f"❌ Lỗi xử lý {filename}: {e}")