Spaces:
Sleeping
Sleeping
| import streamlit as st | |
| import fitz # PyMuPDF | |
| import docx | |
| from difflib import HtmlDiff, SequenceMatcher | |
| import os | |
# Directory to save uploaded files
UPLOAD_DIR = "uploaded_files"
# exist_ok=True makes the creation idempotent and removes the
# check-then-create race that an explicit os.path.exists() test has when
# the script is executed for several sessions at the same time.
os.makedirs(UPLOAD_DIR, exist_ok=True)
| # Functions to save, extract text, and metadata | |
def save_uploaded_file(uploaded_file):
    """Write an uploaded file into UPLOAD_DIR and return the saved path.

    Args:
        uploaded_file: Object exposing ``name`` and ``getbuffer()``, such as
            the value returned by ``st.file_uploader``.

    Returns:
        The path of the file that was written. The original file extension
        is preserved at the end of the path.
    """
    import hashlib  # local import: only this function needs it

    data = uploaded_file.getbuffer()
    # The name comes from the client; keep only its final component so it
    # cannot point outside UPLOAD_DIR.
    safe_name = os.path.basename(uploaded_file.name)
    # Prefix with a content digest: the original and the edited document
    # frequently share a filename, and without the prefix the second upload
    # would overwrite the first, making the app compare a file with itself.
    # A digest (rather than a random id) keeps repeated saves of the same
    # upload from piling up duplicate files.
    digest = hashlib.sha256(data).hexdigest()[:16]
    file_path = os.path.join(UPLOAD_DIR, f"{digest}_{safe_name}")
    with open(file_path, "wb") as f:
        f.write(data)
    return file_path
def extract_text_pdf(file_path):
    """Return the concatenated text of every page of a PDF.

    Args:
        file_path: Path of the PDF file to open with PyMuPDF.

    Returns:
        The page texts joined in page order, with no extra separator.
    """
    # Open as a context manager so the document is closed even if text
    # extraction raises; previously it was never closed.
    with fitz.open(file_path) as doc:
        return "".join(page.get_text() for page in doc)
def extract_text_word(file_path):
    """Return the text of a Word document's paragraphs, one per line.

    Args:
        file_path: Path of the .docx file to open with python-docx.

    Returns:
        The ``text`` of each entry in ``Document.paragraphs``, joined with
        newline characters.
    """
    paragraphs = docx.Document(file_path).paragraphs
    return "\n".join(paragraph.text for paragraph in paragraphs)
def extract_metadata_pdf(file_path):
    """Return the metadata of a PDF as exposed by PyMuPDF.

    Args:
        file_path: Path of the PDF file to open.

    Returns:
        The document's ``metadata`` attribute.
    """
    # Context manager closes the document once the metadata has been read;
    # previously the handle was left open.
    with fitz.open(file_path) as doc:
        return doc.metadata
def extract_metadata_word(file_path):
    """Return selected core properties of a Word document.

    Args:
        file_path: Path of the .docx file to open with python-docx.

    Returns:
        A dict with the keys ``author``, ``created`` and ``modified``, each
        holding the core property of the same name.
    """
    properties = docx.Document(file_path).core_properties
    wanted = ("author", "created", "modified")
    return {field: getattr(properties, field) for field in wanted}
| # Function to compare text and return highlighted HTML differences | |
def compare_texts(text1, text2):
    """Build an HTML page highlighting the line differences of two texts.

    Args:
        text1: The "from" text.
        text2: The "to" text.

    Returns:
        A complete HTML document produced by ``difflib.HtmlDiff``, showing
        only changed regions with two lines of surrounding context.
    """
    from_lines = text1.splitlines()
    to_lines = text2.splitlines()
    return HtmlDiff().make_file(
        from_lines,
        to_lines,
        context=True,
        numlines=2,
    )
| # Function to calculate similarity score | |
def calculate_similarity(text1, text2):
    """Return the character-level similarity of two texts.

    Args:
        text1: First text.
        text2: Second text.

    Returns:
        ``SequenceMatcher.ratio()`` for the two texts: a float in [0, 1],
        where 1.0 means the texts are identical.
    """
    # autojunk must be off: with the default heuristic, once the second text
    # is 200+ characters long every character that makes up more than 1% of
    # it is ignored when looking for matches. For ordinary prose that is
    # nearly every character, so almost-identical documents could score
    # close to 0. Trade-off: matching is slower on very large texts.
    matcher = SequenceMatcher(None, text1, text2, autojunk=False)
    return matcher.ratio()
# Streamlit App Interface
st.title("Document Edit Detection POC")
st.write("Upload both the original and edited documents below:")

# File upload
original_file = st.file_uploader("Upload Original Document", type=["pdf", "docx"])
edited_file = st.file_uploader("Upload Edited Document", type=["pdf", "docx"])

# Process if both files are uploaded
if original_file and edited_file:
    # Save uploaded files
    original_file_path = save_uploaded_file(original_file)
    edited_file_path = save_uploaded_file(edited_file)

    # Identify file types. Extensions are lower-cased so that "A.PDF" and
    # "b.pdf" count as the same type and an upper-case ".PDF" is not sent
    # to the Word branch below.
    original_ext = os.path.splitext(original_file.name)[1].lower()
    edited_ext = os.path.splitext(edited_file.name)[1].lower()

    # Check if both files are of the same type
    if original_ext != edited_ext:
        st.error("Both documents must be of the same type (PDF or DOCX).")
    else:
        # Extract text and metadata
        if original_ext == ".pdf":
            original_text = extract_text_pdf(original_file_path)
            edited_text = extract_text_pdf(edited_file_path)
            original_metadata = extract_metadata_pdf(original_file_path)
            edited_metadata = extract_metadata_pdf(edited_file_path)
        else:
            original_text = extract_text_word(original_file_path)
            edited_text = extract_text_word(edited_file_path)
            original_metadata = extract_metadata_word(original_file_path)
            edited_metadata = extract_metadata_word(edited_file_path)

        # Display Metadata
        st.subheader("Metadata Comparison")
        metadata_match = original_metadata == edited_metadata
        st.write("Metadata Match:", metadata_match)
        st.write("Original Document Metadata:")
        st.write(original_metadata)
        st.write("Edited Document Metadata:")
        st.write(edited_metadata)

        # Compare text
        st.subheader("Text Comparison")
        text_diff_html = compare_texts(original_text, edited_text)
        similarity_score = calculate_similarity(original_text, edited_text)
        st.write("Similarity Score:", round(similarity_score * 100, 2), "%")
        # Compare the texts directly rather than testing a float for
        # equality with 1.0; the answer is the same and the intent is clear.
        text_match = original_text == edited_text
        st.write("Text Match:", text_match)

        # Display highlighted text differences
        st.write("Differences:")
        st.components.v1.html(text_diff_html, height=400, scrolling=True)

        # Report Generation
        st.subheader("Report Summary")
        st.write("Metadata Match:", metadata_match)
        st.write("Text Match:", text_match)
        st.write("Similarity Score:", round(similarity_score * 100, 2), "%")
else:
    st.info("Please upload both the original and edited documents to proceed.")