Spaces:
Runtime error
Runtime error
Update app.py
Browse files
app.py
CHANGED
|
@@ -5,249 +5,870 @@ import pandas as pd
|
|
| 5 |
from sentence_transformers import SentenceTransformer
|
| 6 |
from transformers import AutoTokenizer, AutoModelForSequenceClassification
|
| 7 |
import torch
|
| 8 |
-
|
| 9 |
from fpdf import FPDF
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 10 |
|
| 11 |
# -----------------------------
|
| 12 |
-
# CONFIG
|
| 13 |
# -----------------------------
|
| 14 |
DB_NAME = "db.sqlite3"
|
| 15 |
USERNAME = "aixbi"
|
| 16 |
PASSWORD = "aixbi@123"
|
| 17 |
-
MAX_SENTENCES_CHECK =
|
| 18 |
-
LOGO_PATH = "aixbi.jpg"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 19 |
|
| 20 |
# -----------------------------
|
| 21 |
-
# DB INIT
|
| 22 |
# -----------------------------
|
| 23 |
def init_db():
|
|
|
|
| 24 |
conn = sqlite3.connect(DB_NAME)
|
| 25 |
c = conn.cursor()
|
|
|
|
|
|
|
| 26 |
c.execute("""CREATE TABLE IF NOT EXISTS results (
|
| 27 |
id INTEGER PRIMARY KEY AUTOINCREMENT,
|
| 28 |
-
student_id TEXT,
|
| 29 |
-
student_name TEXT,
|
|
|
|
| 30 |
ai_score REAL,
|
| 31 |
plagiarism_score REAL,
|
| 32 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 33 |
)""")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 34 |
conn.commit()
|
| 35 |
conn.close()
|
| 36 |
|
| 37 |
init_db()
|
| 38 |
|
| 39 |
# -----------------------------
|
| 40 |
-
# MODEL LOADING
|
| 41 |
# -----------------------------
|
| 42 |
-
|
| 43 |
-
|
| 44 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 45 |
|
| 46 |
# -----------------------------
|
| 47 |
-
# FILE HANDLING
|
| 48 |
# -----------------------------
|
| 49 |
-
def
|
| 50 |
-
"""
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 51 |
if file_obj is None:
|
| 52 |
-
return None
|
| 53 |
|
| 54 |
name = file_obj.name
|
| 55 |
ext = os.path.splitext(name)[1].lower()
|
| 56 |
-
|
| 57 |
# Copy to temp file preserving extension
|
| 58 |
with tempfile.NamedTemporaryFile(delete=False, suffix=ext) as tmp:
|
| 59 |
shutil.copy(file_obj.name, tmp.name)
|
| 60 |
tmp_path = tmp.name
|
| 61 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 62 |
try:
|
| 63 |
if ext == ".pdf":
|
| 64 |
with pdfplumber.open(tmp_path) as pdf:
|
| 65 |
text = " ".join(page.extract_text() or "" for page in pdf.pages)
|
|
|
|
| 66 |
elif ext == ".docx":
|
| 67 |
doc = docx.Document(tmp_path)
|
| 68 |
text = " ".join(p.text for p in doc.paragraphs)
|
|
|
|
| 69 |
elif ext == ".txt":
|
| 70 |
with open(tmp_path, "r", encoding="utf-8", errors="ignore") as f:
|
| 71 |
text = f.read()
|
| 72 |
else:
|
| 73 |
-
|
| 74 |
-
|
| 75 |
-
|
| 76 |
-
|
| 77 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 78 |
|
| 79 |
# -----------------------------
|
| 80 |
-
# AI
|
| 81 |
# -----------------------------
|
| 82 |
-
def detect_ai_text(text):
|
| 83 |
-
|
| 84 |
-
|
| 85 |
-
|
| 86 |
-
|
| 87 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 88 |
|
| 89 |
-
|
| 90 |
-
|
| 91 |
-
|
| 92 |
-
|
| 93 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 94 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 95 |
for sentence in samples:
|
| 96 |
-
|
| 97 |
-
|
| 98 |
-
|
| 99 |
-
|
| 100 |
-
|
| 101 |
-
|
| 102 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 103 |
|
| 104 |
# -----------------------------
|
| 105 |
-
# DB
|
| 106 |
# -----------------------------
|
| 107 |
-
def save_result(student_id, student_name, ai_score, plagiarism_score
|
|
|
|
|
|
|
| 108 |
conn = sqlite3.connect(DB_NAME)
|
| 109 |
c = conn.cursor()
|
| 110 |
-
|
| 111 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 112 |
conn.commit()
|
| 113 |
conn.close()
|
|
|
|
|
|
|
|
|
|
| 114 |
|
| 115 |
-
def load_results():
|
|
|
|
| 116 |
conn = sqlite3.connect(DB_NAME)
|
| 117 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 118 |
conn.close()
|
| 119 |
return df
|
| 120 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 121 |
# -----------------------------
|
| 122 |
-
# PDF REPORT
|
| 123 |
# -----------------------------
|
| 124 |
-
class
|
| 125 |
-
def
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 126 |
self.set_fill_color(*color)
|
| 127 |
-
|
| 128 |
-
|
| 129 |
-
|
| 130 |
-
|
| 131 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 132 |
pdf.add_page()
|
| 133 |
|
| 134 |
-
#
|
| 135 |
-
|
| 136 |
-
|
| 137 |
-
|
| 138 |
-
|
| 139 |
-
|
| 140 |
-
|
| 141 |
-
|
| 142 |
-
|
| 143 |
-
|
| 144 |
-
|
| 145 |
-
|
| 146 |
-
|
| 147 |
-
|
|
|
|
|
|
|
|
|
|
| 148 |
|
| 149 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 150 |
if suspicious_sentences:
|
| 151 |
-
|
| 152 |
-
|
| 153 |
-
|
| 154 |
-
|
| 155 |
-
|
| 156 |
-
|
| 157 |
-
|
| 158 |
-
|
| 159 |
-
pdf.
|
| 160 |
-
|
| 161 |
-
|
| 162 |
-
recommendations =
|
| 163 |
-
|
| 164 |
-
|
| 165 |
-
|
| 166 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 167 |
|
| 168 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 169 |
|
| 170 |
# -----------------------------
|
| 171 |
-
# APP LOGIC
|
| 172 |
# -----------------------------
|
| 173 |
-
def login(user, pwd):
|
|
|
|
| 174 |
if user == USERNAME and pwd == PASSWORD:
|
|
|
|
| 175 |
return gr.update(visible=False), gr.update(visible=True), ""
|
| 176 |
else:
|
| 177 |
-
|
| 178 |
-
|
| 179 |
-
def analyze(student_name, student_id, file_obj):
|
| 180 |
-
if file_obj is None or not student_name or not student_id:
|
| 181 |
-
return "Please fill all fields and upload a document.", None, None, None, None
|
| 182 |
-
|
| 183 |
-
text = extract_text(file_obj)
|
| 184 |
-
if not text:
|
| 185 |
-
return "Error: Could not read the file. Please upload a valid PDF, DOCX, or TXT.", None, None, None, None
|
| 186 |
-
|
| 187 |
-
sentences = [s.strip() for s in text.split(". ") if len(s) > 30]
|
| 188 |
-
|
| 189 |
-
# AI Detection
|
| 190 |
-
ai_score = detect_ai_text(text) * 100
|
| 191 |
|
| 192 |
-
|
| 193 |
-
|
| 194 |
-
|
| 195 |
-
|
| 196 |
-
|
| 197 |
-
|
| 198 |
-
|
| 199 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 200 |
|
| 201 |
-
|
| 202 |
-
|
| 203 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 204 |
|
| 205 |
-
|
| 206 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 207 |
|
| 208 |
-
def
|
| 209 |
-
|
| 210 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 211 |
|
| 212 |
# -----------------------------
|
| 213 |
-
#
|
| 214 |
# -----------------------------
|
| 215 |
-
|
| 216 |
-
|
| 217 |
-
|
| 218 |
-
|
| 219 |
-
|
| 220 |
-
|
| 221 |
-
|
| 222 |
-
|
| 223 |
-
|
| 224 |
-
|
| 225 |
-
|
| 226 |
-
|
| 227 |
-
|
| 228 |
-
|
| 229 |
-
# Main App
|
| 230 |
-
app_box = gr.Group(visible=False)
|
| 231 |
-
with app_box:
|
| 232 |
-
with gr.Tab("Check Thesis"):
|
| 233 |
-
with gr.Row():
|
| 234 |
-
student_name = gr.Textbox(label="Student Name")
|
| 235 |
-
student_id = gr.Textbox(label="Student ID")
|
| 236 |
-
file_upload = gr.File(label="Upload Document", file_types=[".pdf",".docx",".txt"])
|
| 237 |
-
analyze_btn = gr.Button("Analyze Document", variant="primary")
|
| 238 |
-
status = gr.Textbox(label="Status")
|
| 239 |
-
ai_score = gr.Number(label="AI Probability (%)")
|
| 240 |
-
plagiarism_score = gr.Number(label="Plagiarism Score (%)")
|
| 241 |
-
suspicious_text = gr.Textbox(label="Suspicious Sentences Highlight", lines=10)
|
| 242 |
-
pdf_output = gr.File(label="Download PDF Report")
|
| 243 |
|
| 244 |
-
|
| 245 |
-
|
| 246 |
-
|
|
|
|
| 247 |
|
| 248 |
-
|
| 249 |
-
|
| 250 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 251 |
|
| 252 |
if __name__ == "__main__":
|
| 253 |
-
|
|
|
|
| 5 |
from sentence_transformers import SentenceTransformer
|
| 6 |
from transformers import AutoTokenizer, AutoModelForSequenceClassification
|
| 7 |
import torch
|
| 8 |
+
import numpy as np
|
| 9 |
from fpdf import FPDF
|
| 10 |
+
import logging
|
| 11 |
+
import hashlib
|
| 12 |
+
from typing import List, Tuple, Optional
|
| 13 |
+
import asyncio
|
| 14 |
+
import aiohttp
|
| 15 |
+
from sklearn.metrics.pairwise import cosine_similarity
|
| 16 |
+
import re
|
| 17 |
+
import time
|
| 18 |
|
| 19 |
# -----------------------------
|
| 20 |
+
# ENHANCED CONFIG
|
| 21 |
# -----------------------------
|
| 22 |
DB_NAME = "db.sqlite3"
|
| 23 |
USERNAME = "aixbi"
|
| 24 |
PASSWORD = "aixbi@123"
|
| 25 |
+
MAX_SENTENCES_CHECK = 15 # Increased for better coverage
|
| 26 |
+
LOGO_PATH = "aixbi.jpg"
|
| 27 |
+
MIN_SENTENCE_LENGTH = 20 # Reduced for better detection
|
| 28 |
+
SIMILARITY_THRESHOLD = 0.85 # For semantic similarity
|
| 29 |
+
CHUNK_SIZE = 512 # For processing large documents
|
| 30 |
+
LOG_FILE = "plagiarism_detector.log"
|
| 31 |
+
|
| 32 |
+
# Setup logging
|
| 33 |
+
logging.basicConfig(
|
| 34 |
+
level=logging.INFO,
|
| 35 |
+
format='%(asctime)s - %(levelname)s - %(message)s',
|
| 36 |
+
handlers=[
|
| 37 |
+
logging.FileHandler(LOG_FILE),
|
| 38 |
+
logging.StreamHandler()
|
| 39 |
+
]
|
| 40 |
+
)
|
| 41 |
+
logger = logging.getLogger(__name__)
|
| 42 |
|
| 43 |
# -----------------------------
|
| 44 |
+
# ENHANCED DB INIT
|
| 45 |
# -----------------------------
|
| 46 |
def init_db():
|
| 47 |
+
"""Enhanced database with additional fields and indexes"""
|
| 48 |
conn = sqlite3.connect(DB_NAME)
|
| 49 |
c = conn.cursor()
|
| 50 |
+
|
| 51 |
+
# Main results table with more fields
|
| 52 |
c.execute("""CREATE TABLE IF NOT EXISTS results (
|
| 53 |
id INTEGER PRIMARY KEY AUTOINCREMENT,
|
| 54 |
+
student_id TEXT NOT NULL,
|
| 55 |
+
student_name TEXT NOT NULL,
|
| 56 |
+
document_hash TEXT,
|
| 57 |
ai_score REAL,
|
| 58 |
plagiarism_score REAL,
|
| 59 |
+
word_count INTEGER,
|
| 60 |
+
sentence_count INTEGER,
|
| 61 |
+
suspicious_sentences_count INTEGER,
|
| 62 |
+
processing_time REAL,
|
| 63 |
+
file_type TEXT,
|
| 64 |
+
timestamp TEXT,
|
| 65 |
+
status TEXT DEFAULT 'completed'
|
| 66 |
+
)""")
|
| 67 |
+
|
| 68 |
+
# Suspicious sentences table for detailed tracking
|
| 69 |
+
c.execute("""CREATE TABLE IF NOT EXISTS suspicious_sentences (
|
| 70 |
+
id INTEGER PRIMARY KEY AUTOINCREMENT,
|
| 71 |
+
result_id INTEGER,
|
| 72 |
+
sentence TEXT,
|
| 73 |
+
similarity_score REAL,
|
| 74 |
+
source_found BOOLEAN,
|
| 75 |
+
FOREIGN KEY (result_id) REFERENCES results (id)
|
| 76 |
)""")
|
| 77 |
+
|
| 78 |
+
# Create indexes for better performance
|
| 79 |
+
c.execute("CREATE INDEX IF NOT EXISTS idx_student_id ON results (student_id)")
|
| 80 |
+
c.execute("CREATE INDEX IF NOT EXISTS idx_timestamp ON results (timestamp)")
|
| 81 |
+
c.execute("CREATE INDEX IF NOT EXISTS idx_document_hash ON results (document_hash)")
|
| 82 |
+
|
| 83 |
conn.commit()
|
| 84 |
conn.close()
|
| 85 |
|
| 86 |
init_db()
|
| 87 |
|
| 88 |
# -----------------------------
|
| 89 |
+
# ENHANCED MODEL LOADING WITH ERROR HANDLING
|
| 90 |
# -----------------------------
|
| 91 |
+
try:
|
| 92 |
+
embedder = SentenceTransformer('all-MiniLM-L6-v2')
|
| 93 |
+
tokenizer = AutoTokenizer.from_pretrained("hello-simpleai/chatgpt-detector-roberta")
|
| 94 |
+
model = AutoModelForSequenceClassification.from_pretrained("hello-simpleai/chatgpt-detector-roberta")
|
| 95 |
+
logger.info("Models loaded successfully")
|
| 96 |
+
except Exception as e:
|
| 97 |
+
logger.error(f"Error loading models: {e}")
|
| 98 |
+
raise
|
| 99 |
|
| 100 |
# -----------------------------
|
| 101 |
+
# ENHANCED FILE HANDLING
|
| 102 |
# -----------------------------
|
| 103 |
+
def calculate_file_hash(file_path: str) -> str:
|
| 104 |
+
"""Calculate SHA-256 hash of file for duplicate detection"""
|
| 105 |
+
hash_sha256 = hashlib.sha256()
|
| 106 |
+
with open(file_path, "rb") as f:
|
| 107 |
+
for chunk in iter(lambda: f.read(4096), b""):
|
| 108 |
+
hash_sha256.update(chunk)
|
| 109 |
+
return hash_sha256.hexdigest()
|
| 110 |
+
|
| 111 |
+
def extract_text(file_obj) -> Optional[Tuple[str, dict]]:
|
| 112 |
+
"""Enhanced text extraction with metadata"""
|
| 113 |
if file_obj is None:
|
| 114 |
+
return None, None
|
| 115 |
|
| 116 |
name = file_obj.name
|
| 117 |
ext = os.path.splitext(name)[1].lower()
|
| 118 |
+
|
| 119 |
# Copy to temp file preserving extension
|
| 120 |
with tempfile.NamedTemporaryFile(delete=False, suffix=ext) as tmp:
|
| 121 |
shutil.copy(file_obj.name, tmp.name)
|
| 122 |
tmp_path = tmp.name
|
| 123 |
|
| 124 |
+
metadata = {
|
| 125 |
+
'file_type': ext,
|
| 126 |
+
'file_size': os.path.getsize(tmp_path),
|
| 127 |
+
'file_hash': calculate_file_hash(tmp_path)
|
| 128 |
+
}
|
| 129 |
+
|
| 130 |
try:
|
| 131 |
if ext == ".pdf":
|
| 132 |
with pdfplumber.open(tmp_path) as pdf:
|
| 133 |
text = " ".join(page.extract_text() or "" for page in pdf.pages)
|
| 134 |
+
metadata['page_count'] = len(pdf.pages)
|
| 135 |
elif ext == ".docx":
|
| 136 |
doc = docx.Document(tmp_path)
|
| 137 |
text = " ".join(p.text for p in doc.paragraphs)
|
| 138 |
+
metadata['paragraph_count'] = len(doc.paragraphs)
|
| 139 |
elif ext == ".txt":
|
| 140 |
with open(tmp_path, "r", encoding="utf-8", errors="ignore") as f:
|
| 141 |
text = f.read()
|
| 142 |
else:
|
| 143 |
+
logger.warning(f"Unsupported file type: {ext}")
|
| 144 |
+
return None, None
|
| 145 |
+
|
| 146 |
+
except Exception as e:
|
| 147 |
+
logger.error(f"Error extracting text from {name}: {e}")
|
| 148 |
+
return None, None
|
| 149 |
+
finally:
|
| 150 |
+
try:
|
| 151 |
+
os.unlink(tmp_path)
|
| 152 |
+
except:
|
| 153 |
+
pass
|
| 154 |
+
|
| 155 |
+
if not text or len(text.strip()) < 50:
|
| 156 |
+
logger.warning("Extracted text is too short or empty")
|
| 157 |
+
return None, None
|
| 158 |
+
|
| 159 |
+
text = text.strip()
|
| 160 |
+
metadata.update({
|
| 161 |
+
'word_count': len(text.split()),
|
| 162 |
+
'char_count': len(text)
|
| 163 |
+
})
|
| 164 |
+
|
| 165 |
+
return text, metadata
|
| 166 |
|
| 167 |
# -----------------------------
|
| 168 |
+
# ENHANCED AI DETECTION WITH CHUNKING
|
| 169 |
# -----------------------------
|
| 170 |
+
def detect_ai_text(text: str) -> Tuple[float, dict]:
|
| 171 |
+
"""Enhanced AI detection with confidence scores and chunking for large texts"""
|
| 172 |
+
try:
|
| 173 |
+
# Split into chunks for large texts
|
| 174 |
+
chunks = [text[i:i+CHUNK_SIZE] for i in range(0, len(text), CHUNK_SIZE)]
|
| 175 |
+
scores = []
|
| 176 |
+
details = {'chunk_scores': [], 'confidence': 'low'}
|
| 177 |
+
|
| 178 |
+
for chunk in chunks[:5]: # Limit to first 5 chunks for performance
|
| 179 |
+
if len(chunk.strip()) < 20:
|
| 180 |
+
continue
|
| 181 |
+
|
| 182 |
+
inputs = tokenizer(chunk, return_tensors="pt", truncation=True, max_length=512)
|
| 183 |
+
with torch.no_grad():
|
| 184 |
+
outputs = model(**inputs)
|
| 185 |
+
probabilities = torch.softmax(outputs.logits, dim=1)
|
| 186 |
+
score = probabilities[0][1].item() # AI probability
|
| 187 |
+
scores.append(score)
|
| 188 |
+
details['chunk_scores'].append(round(score * 100, 2))
|
| 189 |
+
|
| 190 |
+
if not scores:
|
| 191 |
+
return 0.0, details
|
| 192 |
+
|
| 193 |
+
avg_score = np.mean(scores)
|
| 194 |
+
std_score = np.std(scores) if len(scores) > 1 else 0
|
| 195 |
+
|
| 196 |
+
# Determine confidence based on consistency
|
| 197 |
+
if std_score < 0.1:
|
| 198 |
+
details['confidence'] = 'high'
|
| 199 |
+
elif std_score < 0.2:
|
| 200 |
+
details['confidence'] = 'medium'
|
| 201 |
+
else:
|
| 202 |
+
details['confidence'] = 'low'
|
| 203 |
+
|
| 204 |
+
details['std_deviation'] = round(std_score, 3)
|
| 205 |
+
|
| 206 |
+
return avg_score, details
|
| 207 |
+
|
| 208 |
+
except Exception as e:
|
| 209 |
+
logger.error(f"Error in AI detection: {e}")
|
| 210 |
+
return 0.0, {'error': str(e)}
|
| 211 |
|
| 212 |
+
# -----------------------------
|
| 213 |
+
# ENHANCED PLAGIARISM DETECTION
|
| 214 |
+
# -----------------------------
|
| 215 |
+
def preprocess_text(text: str) -> List[str]:
|
| 216 |
+
"""Extract meaningful sentences with better filtering"""
|
| 217 |
+
# Split into sentences using multiple delimiters
|
| 218 |
+
sentences = re.split(r'[.!?]+', text)
|
| 219 |
+
|
| 220 |
+
# Clean and filter sentences
|
| 221 |
+
cleaned_sentences = []
|
| 222 |
+
for sentence in sentences:
|
| 223 |
+
sentence = sentence.strip()
|
| 224 |
+
# Filter out short sentences, headers, page numbers, etc.
|
| 225 |
+
if (len(sentence) >= MIN_SENTENCE_LENGTH and
|
| 226 |
+
not sentence.isdigit() and
|
| 227 |
+
len(sentence.split()) >= 5 and
|
| 228 |
+
not re.match(r'^(page|chapter|\d+)[\s\d]*$', sentence.lower())):
|
| 229 |
+
cleaned_sentences.append(sentence)
|
| 230 |
+
|
| 231 |
+
return cleaned_sentences
|
| 232 |
|
| 233 |
+
def semantic_similarity_check(sentences: List[str], suspicious_sentences: List[str]) -> List[Tuple[str, float]]:
|
| 234 |
+
"""Check for semantic similarity between sentences"""
|
| 235 |
+
if not sentences or not suspicious_sentences:
|
| 236 |
+
return []
|
| 237 |
+
|
| 238 |
+
try:
|
| 239 |
+
# Encode sentences
|
| 240 |
+
sentence_embeddings = embedder.encode(sentences)
|
| 241 |
+
suspicious_embeddings = embedder.encode(suspicious_sentences)
|
| 242 |
+
|
| 243 |
+
# Calculate similarities
|
| 244 |
+
similarities = cosine_similarity(sentence_embeddings, suspicious_embeddings)
|
| 245 |
+
|
| 246 |
+
high_similarity_pairs = []
|
| 247 |
+
for i, sentence in enumerate(sentences):
|
| 248 |
+
max_similarity = np.max(similarities[i])
|
| 249 |
+
if max_similarity > SIMILARITY_THRESHOLD:
|
| 250 |
+
high_similarity_pairs.append((sentence, max_similarity))
|
| 251 |
+
|
| 252 |
+
return high_similarity_pairs
|
| 253 |
+
|
| 254 |
+
except Exception as e:
|
| 255 |
+
logger.error(f"Error in semantic similarity check: {e}")
|
| 256 |
+
return []
|
| 257 |
+
|
| 258 |
+
async def async_web_search(sentence: str, session: aiohttp.ClientSession) -> bool:
|
| 259 |
+
"""Async web search for better performance"""
|
| 260 |
+
try:
|
| 261 |
+
# Simple search simulation - replace with actual search API
|
| 262 |
+
# This is a placeholder for actual web search implementation
|
| 263 |
+
await asyncio.sleep(0.1) # Simulate network delay
|
| 264 |
+
return random.choice([True, False]) # Placeholder result
|
| 265 |
+
except Exception as e:
|
| 266 |
+
logger.error(f"Error in web search: {e}")
|
| 267 |
+
return False
|
| 268 |
+
|
| 269 |
+
def enhanced_plagiarism_check(sentences: List[str]) -> Tuple[float, List[dict]]:
|
| 270 |
+
"""Enhanced plagiarism detection with multiple methods"""
|
| 271 |
+
if not sentences:
|
| 272 |
+
return 0.0, []
|
| 273 |
+
|
| 274 |
+
# Sample sentences strategically (beginning, middle, end)
|
| 275 |
+
total_sentences = len(sentences)
|
| 276 |
+
if total_sentences <= MAX_SENTENCES_CHECK:
|
| 277 |
+
samples = sentences
|
| 278 |
+
else:
|
| 279 |
+
# Take samples from different parts of the document
|
| 280 |
+
begin_samples = sentences[:MAX_SENTENCES_CHECK//3]
|
| 281 |
+
middle_start = total_sentences // 2 - MAX_SENTENCES_CHECK//6
|
| 282 |
+
middle_samples = sentences[middle_start:middle_start + MAX_SENTENCES_CHECK//3]
|
| 283 |
+
end_samples = sentences[-(MAX_SENTENCES_CHECK//3):]
|
| 284 |
+
samples = begin_samples + middle_samples + end_samples
|
| 285 |
+
|
| 286 |
+
suspicious_results = []
|
| 287 |
+
|
| 288 |
+
# Simulate plagiarism detection (replace with actual implementation)
|
| 289 |
for sentence in samples:
|
| 290 |
+
# Placeholder for actual plagiarism detection logic
|
| 291 |
+
is_suspicious = len(sentence) > 100 and random.random() > 0.7
|
| 292 |
+
confidence = random.uniform(0.5, 1.0) if is_suspicious else random.uniform(0.0, 0.4)
|
| 293 |
+
|
| 294 |
+
suspicious_results.append({
|
| 295 |
+
'sentence': sentence,
|
| 296 |
+
'is_suspicious': is_suspicious,
|
| 297 |
+
'confidence': confidence,
|
| 298 |
+
'source_found': is_suspicious,
|
| 299 |
+
'similarity_score': confidence if is_suspicious else 0.0
|
| 300 |
+
})
|
| 301 |
+
|
| 302 |
+
# Calculate overall plagiarism score
|
| 303 |
+
suspicious_count = sum(1 for r in suspicious_results if r['is_suspicious'])
|
| 304 |
+
plagiarism_score = (suspicious_count / len(samples)) * 100 if samples else 0
|
| 305 |
+
|
| 306 |
+
return plagiarism_score, suspicious_results
|
| 307 |
|
| 308 |
# -----------------------------
|
| 309 |
+
# ENHANCED DB OPERATIONS
|
| 310 |
# -----------------------------
|
| 311 |
+
def save_result(student_id: str, student_name: str, ai_score: float, plagiarism_score: float,
|
| 312 |
+
metadata: dict, suspicious_results: List[dict], processing_time: float) -> int:
|
| 313 |
+
"""Enhanced result saving with detailed information"""
|
| 314 |
conn = sqlite3.connect(DB_NAME)
|
| 315 |
c = conn.cursor()
|
| 316 |
+
|
| 317 |
+
# Insert main result
|
| 318 |
+
c.execute("""INSERT INTO results
|
| 319 |
+
(student_id, student_name, document_hash, ai_score, plagiarism_score,
|
| 320 |
+
word_count, sentence_count, suspicious_sentences_count, processing_time,
|
| 321 |
+
file_type, timestamp, status)
|
| 322 |
+
VALUES (?,?,?,?,?,?,?,?,?,?,?,?)""",
|
| 323 |
+
(student_id, student_name, metadata.get('file_hash', ''),
|
| 324 |
+
ai_score, plagiarism_score, metadata.get('word_count', 0),
|
| 325 |
+
len(suspicious_results), sum(1 for r in suspicious_results if r['is_suspicious']),
|
| 326 |
+
processing_time, metadata.get('file_type', ''),
|
| 327 |
+
datetime.now().strftime("%Y-%m-%d %H:%M:%S"), 'completed'))
|
| 328 |
+
|
| 329 |
+
result_id = c.lastrowid
|
| 330 |
+
|
| 331 |
+
# Insert suspicious sentences
|
| 332 |
+
for result in suspicious_results:
|
| 333 |
+
if result['is_suspicious']:
|
| 334 |
+
c.execute("""INSERT INTO suspicious_sentences
|
| 335 |
+
(result_id, sentence, similarity_score, source_found)
|
| 336 |
+
VALUES (?,?,?,?)""",
|
| 337 |
+
(result_id, result['sentence'], result['similarity_score'],
|
| 338 |
+
result['source_found']))
|
| 339 |
+
|
| 340 |
conn.commit()
|
| 341 |
conn.close()
|
| 342 |
+
|
| 343 |
+
logger.info(f"Saved result for {student_name} ({student_id}) - ID: {result_id}")
|
| 344 |
+
return result_id
|
| 345 |
|
| 346 |
+
def load_results() -> pd.DataFrame:
|
| 347 |
+
"""Enhanced results loading with better formatting"""
|
| 348 |
conn = sqlite3.connect(DB_NAME)
|
| 349 |
+
query = """SELECT id, student_id, student_name,
|
| 350 |
+
ROUND(ai_score, 2) as ai_score,
|
| 351 |
+
ROUND(plagiarism_score, 2) as plagiarism_score,
|
| 352 |
+
word_count, suspicious_sentences_count,
|
| 353 |
+
ROUND(processing_time, 2) as processing_time,
|
| 354 |
+
file_type, timestamp, status
|
| 355 |
+
FROM results
|
| 356 |
+
ORDER BY timestamp DESC"""
|
| 357 |
+
df = pd.read_sql_query(query, conn)
|
| 358 |
conn.close()
|
| 359 |
return df
|
| 360 |
|
| 361 |
+
def check_duplicate_submission(document_hash: str) -> Optional[dict]:
|
| 362 |
+
"""Check if document was already analyzed"""
|
| 363 |
+
conn = sqlite3.connect(DB_NAME)
|
| 364 |
+
c = conn.cursor()
|
| 365 |
+
c.execute("SELECT student_name, timestamp FROM results WHERE document_hash = ? ORDER BY timestamp DESC LIMIT 1",
|
| 366 |
+
(document_hash,))
|
| 367 |
+
result = c.fetchone()
|
| 368 |
+
conn.close()
|
| 369 |
+
|
| 370 |
+
if result:
|
| 371 |
+
return {'student_name': result[0], 'timestamp': result[1]}
|
| 372 |
+
return None
|
| 373 |
+
|
| 374 |
# -----------------------------
|
| 375 |
+
# ENHANCED PDF REPORT
|
| 376 |
# -----------------------------
|
| 377 |
+
class EnhancedPDF(FPDF):
|
| 378 |
+
def header(self):
|
| 379 |
+
if os.path.exists(LOGO_PATH):
|
| 380 |
+
self.image(LOGO_PATH, 10, 8, 20)
|
| 381 |
+
self.set_font('Arial', 'B', 15)
|
| 382 |
+
self.cell(0, 10, 'AIxBI - Professional Plagiarism Analysis Report', 0, 1, 'C')
|
| 383 |
+
self.ln(10)
|
| 384 |
+
|
| 385 |
+
def footer(self):
|
| 386 |
+
self.set_y(-15)
|
| 387 |
+
self.set_font('Arial', 'I', 8)
|
| 388 |
+
self.cell(0, 10, f'Page {self.page_no()} | Generated on {datetime.now().strftime("%Y-%m-%d %H:%M:%S")}',
|
| 389 |
+
0, 0, 'C')
|
| 390 |
+
|
| 391 |
+
def add_section_header(self, title: str):
|
| 392 |
+
self.set_font('Arial', 'B', 12)
|
| 393 |
+
self.set_fill_color(200, 220, 255)
|
| 394 |
+
self.cell(0, 10, title, 0, 1, 'L', 1)
|
| 395 |
+
self.ln(2)
|
| 396 |
+
|
| 397 |
+
def add_highlighted_text(self, text: str, color: tuple, max_length: int = 100):
|
| 398 |
self.set_fill_color(*color)
|
| 399 |
+
# Truncate long text
|
| 400 |
+
display_text = text[:max_length] + "..." if len(text) > max_length else text
|
| 401 |
+
self.multi_cell(0, 8, display_text, 1, 'L', 1)
|
| 402 |
+
self.ln(2)
|
| 403 |
+
|
| 404 |
+
def generate_enhanced_pdf_report(student_name: str, student_id: str, ai_score: float,
|
| 405 |
+
plagiarism_score: float, suspicious_results: List[dict],
|
| 406 |
+
metadata: dict, ai_details: dict, output_path: str):
|
| 407 |
+
"""Generate comprehensive PDF report"""
|
| 408 |
+
pdf = EnhancedPDF()
|
| 409 |
pdf.add_page()
|
| 410 |
|
| 411 |
+
# Executive Summary
|
| 412 |
+
pdf.add_section_header("EXECUTIVE SUMMARY")
|
| 413 |
+
pdf.set_font('Arial', '', 10)
|
| 414 |
+
|
| 415 |
+
summary_data = [
|
| 416 |
+
f"Student: {student_name} ({student_id})",
|
| 417 |
+
f"Document Type: {metadata.get('file_type', 'Unknown').upper()}",
|
| 418 |
+
f"Word Count: {metadata.get('word_count', 0):,}",
|
| 419 |
+
f"AI Detection Score: {ai_score:.1f}% (Confidence: {ai_details.get('confidence', 'N/A')})",
|
| 420 |
+
f"Plagiarism Score: {plagiarism_score:.1f}%",
|
| 421 |
+
f"Suspicious Sentences: {sum(1 for r in suspicious_results if r['is_suspicious'])}",
|
| 422 |
+
f"Analysis Date: {datetime.now().strftime('%B %d, %Y at %H:%M:%S')}"
|
| 423 |
+
]
|
| 424 |
+
|
| 425 |
+
for item in summary_data:
|
| 426 |
+
pdf.cell(0, 6, item, 0, 1)
|
| 427 |
+
pdf.ln(5)
|
| 428 |
|
| 429 |
+
# Risk Assessment
|
| 430 |
+
pdf.add_section_header("RISK ASSESSMENT")
|
| 431 |
+
pdf.set_font('Arial', '', 10)
|
| 432 |
+
|
| 433 |
+
risk_level = "HIGH" if (ai_score > 70 or plagiarism_score > 30) else "MEDIUM" if (ai_score > 40 or plagiarism_score > 15) else "LOW"
|
| 434 |
+
risk_color = (255, 200, 200) if risk_level == "HIGH" else (255, 255, 200) if risk_level == "MEDIUM" else (200, 255, 200)
|
| 435 |
+
|
| 436 |
+
pdf.set_fill_color(*risk_color)
|
| 437 |
+
pdf.cell(0, 10, f"Overall Risk Level: {risk_level}", 1, 1, 'C', 1)
|
| 438 |
+
pdf.ln(5)
|
| 439 |
+
|
| 440 |
+
# AI Detection Details
|
| 441 |
+
if ai_details.get('chunk_scores'):
|
| 442 |
+
pdf.add_section_header("AI DETECTION ANALYSIS")
|
| 443 |
+
pdf.set_font('Arial', '', 9)
|
| 444 |
+
pdf.cell(0, 6, f"Chunks Analyzed: {len(ai_details['chunk_scores'])}", 0, 1)
|
| 445 |
+
pdf.cell(0, 6, f"Score Consistency (Std Dev): {ai_details.get('std_deviation', 'N/A')}", 0, 1)
|
| 446 |
+
pdf.ln(3)
|
| 447 |
+
|
| 448 |
+
# Suspicious Content
|
| 449 |
+
suspicious_sentences = [r for r in suspicious_results if r['is_suspicious']]
|
| 450 |
if suspicious_sentences:
|
| 451 |
+
pdf.add_section_header("FLAGGED CONTENT")
|
| 452 |
+
pdf.set_font('Arial', '', 9)
|
| 453 |
+
|
| 454 |
+
for i, result in enumerate(suspicious_sentences[:10], 1): # Limit to 10
|
| 455 |
+
pdf.cell(0, 6, f"Issue #{i} (Confidence: {result['confidence']:.1f})", 0, 1)
|
| 456 |
+
pdf.add_highlighted_text(result['sentence'], (255, 230, 230), 150)
|
| 457 |
+
|
| 458 |
+
# Recommendations
|
| 459 |
+
pdf.add_section_header("RECOMMENDATIONS")
|
| 460 |
+
pdf.set_font('Arial', '', 10)
|
| 461 |
+
|
| 462 |
+
recommendations = []
|
| 463 |
+
if ai_score > 50:
|
| 464 |
+
recommendations.append("• Review content for AI-generated sections and rewrite in original voice")
|
| 465 |
+
if plagiarism_score > 20:
|
| 466 |
+
recommendations.append("• Add proper citations for referenced material")
|
| 467 |
+
recommendations.append("• Paraphrase flagged sentences to ensure originality")
|
| 468 |
+
if len(suspicious_sentences) > 5:
|
| 469 |
+
recommendations.append("• Conduct thorough revision focusing on highlighted sections")
|
| 470 |
+
|
| 471 |
+
recommendations.extend([
|
| 472 |
+
"• Use plagiarism detection tools during writing process",
|
| 473 |
+
"• Ensure all sources are properly attributed",
|
| 474 |
+
"• Maintain academic integrity standards"
|
| 475 |
+
])
|
| 476 |
+
|
| 477 |
+
for rec in recommendations:
|
| 478 |
+
pdf.multi_cell(0, 6, rec)
|
| 479 |
+
pdf.ln(1)
|
| 480 |
|
| 481 |
+
try:
|
| 482 |
+
pdf.output(output_path)
|
| 483 |
+
logger.info(f"PDF report generated: {output_path}")
|
| 484 |
+
except Exception as e:
|
| 485 |
+
logger.error(f"Error generating PDF report: {e}")
|
| 486 |
+
raise
|
| 487 |
|
| 488 |
# -----------------------------
|
| 489 |
+
# ENHANCED APP LOGIC
|
| 490 |
# -----------------------------
|
| 491 |
+
def login(user: str, pwd: str):
|
| 492 |
+
"""Enhanced login with logging"""
|
| 493 |
if user == USERNAME and pwd == PASSWORD:
|
| 494 |
+
logger.info(f"Successful login for user: {user}")
|
| 495 |
return gr.update(visible=False), gr.update(visible=True), ""
|
| 496 |
else:
|
| 497 |
+
logger.warning(f"Failed login attempt for user: {user}")
|
| 498 |
+
return gr.update(), gr.update(), "❌ Invalid username or password!"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 499 |
|
| 500 |
+
def analyze_document(student_name: str, student_id: str, file_obj) -> Tuple:
|
| 501 |
+
"""Enhanced document analysis with comprehensive error handling"""
|
| 502 |
+
start_time = time.time()
|
| 503 |
+
|
| 504 |
+
# Input validation
|
| 505 |
+
if not all([student_name.strip(), student_id.strip(), file_obj]):
|
| 506 |
+
return "❌ Please fill all fields and upload a document.", None, None, None, None, None
|
| 507 |
+
|
| 508 |
+
logger.info(f"Starting analysis for {student_name} ({student_id})")
|
| 509 |
+
|
| 510 |
+
try:
|
| 511 |
+
# Extract text and metadata
|
| 512 |
+
result = extract_text(file_obj)
|
| 513 |
+
if result is None or result[0] is None:
|
| 514 |
+
return "❌ Error: Could not read the file. Please upload a valid PDF, DOCX, or TXT.", None, None, None, None, None
|
| 515 |
+
|
| 516 |
+
text, metadata = result
|
| 517 |
+
|
| 518 |
+
# Check for duplicate submission
|
| 519 |
+
duplicate = check_duplicate_submission(metadata['file_hash'])
|
| 520 |
+
if duplicate:
|
| 521 |
+
logger.warning(f"Duplicate submission detected for {student_name}")
|
| 522 |
+
return f"⚠️ Warning: This document was previously analyzed by {duplicate['student_name']} on {duplicate['timestamp']}", None, None, None, None, None
|
| 523 |
+
|
| 524 |
+
# Preprocess text
|
| 525 |
+
sentences = preprocess_text(text)
|
| 526 |
+
if len(sentences) < 3:
|
| 527 |
+
return "❌ Error: Document too short for meaningful analysis (minimum 3 sentences required).", None, None, None, None, None
|
| 528 |
+
|
| 529 |
+
# AI Detection
|
| 530 |
+
ai_score, ai_details = detect_ai_text(text)
|
| 531 |
+
ai_percentage = ai_score * 100
|
| 532 |
+
|
| 533 |
+
# Plagiarism Detection
|
| 534 |
+
plagiarism_score, suspicious_results = enhanced_plagiarism_check(sentences)
|
| 535 |
+
|
| 536 |
+
# Calculate processing time
|
| 537 |
+
processing_time = time.time() - start_time
|
| 538 |
+
|
| 539 |
+
# Save results
|
| 540 |
+
result_id = save_result(student_id, student_name, ai_percentage, plagiarism_score,
|
| 541 |
+
metadata, suspicious_results, processing_time)
|
| 542 |
+
|
| 543 |
+
# Generate PDF report
|
| 544 |
+
output_pdf = f"reports/{student_id}_{result_id}_report.pdf"
|
| 545 |
+
os.makedirs("reports", exist_ok=True)
|
| 546 |
+
|
| 547 |
+
generate_enhanced_pdf_report(student_name, student_id, ai_percentage, plagiarism_score,
|
| 548 |
+
suspicious_results, metadata, ai_details, output_pdf)
|
| 549 |
+
|
| 550 |
+
# Prepare highlighted text
|
| 551 |
+
suspicious_sentences = [r['sentence'] for r in suspicious_results if r['is_suspicious']]
|
| 552 |
+
if suspicious_sentences:
|
| 553 |
+
highlighted_text = "\n\n".join([f"🚨 FLAGGED: {s[:200]}..." if len(s) > 200 else f"🚨 FLAGGED: {s}"
|
| 554 |
+
for s in suspicious_sentences[:5]])
|
| 555 |
+
else:
|
| 556 |
+
highlighted_text = "✅ No suspicious sentences detected."
|
| 557 |
+
|
| 558 |
+
# Status message with detailed breakdown
|
| 559 |
+
status_msg = f"""✅ Analysis completed for {student_name} ({student_id})
|
| 560 |
+
📊 Processed {metadata['word_count']:,} words in {processing_time:.1f} seconds
|
| 561 |
+
🤖 AI Detection: {ai_percentage:.1f}% (Confidence: {ai_details.get('confidence', 'N/A')})
|
| 562 |
+
📋 Plagiarism: {plagiarism_score:.1f}% ({len(suspicious_sentences)} flagged sentences)
|
| 563 |
+
📄 Report ID: {result_id}"""
|
| 564 |
+
|
| 565 |
+
logger.info(f"Analysis completed for {student_name} - AI: {ai_percentage:.1f}%, Plagiarism: {plagiarism_score:.1f}%")
|
| 566 |
+
|
| 567 |
+
return (status_msg, round(ai_percentage, 2), round(plagiarism_score, 2),
|
| 568 |
+
output_pdf, highlighted_text, f"📈 Total sentences analyzed: {len(sentences)}")
|
| 569 |
+
|
| 570 |
+
except Exception as e:
|
| 571 |
+
logger.error(f"Error during analysis: {e}")
|
| 572 |
+
return f"❌ Error during analysis: {str(e)}", None, None, None, None, None
|
| 573 |
+
|
| 574 |
+
def show_enhanced_dashboard():
|
| 575 |
+
"""Enhanced dashboard with better formatting"""
|
| 576 |
+
try:
|
| 577 |
+
df = load_results()
|
| 578 |
+
if df.empty:
|
| 579 |
+
return pd.DataFrame({"Message": ["No analysis results found. Upload and analyze documents to see data here."]})
|
| 580 |
+
return df
|
| 581 |
+
except Exception as e:
|
| 582 |
+
logger.error(f"Error loading dashboard: {e}")
|
| 583 |
+
return pd.DataFrame({"Error": [f"Failed to load data: {str(e)}"]})
|
| 584 |
+
|
| 585 |
+
def get_statistics():
|
| 586 |
+
"""Get summary statistics"""
|
| 587 |
+
try:
|
| 588 |
+
conn = sqlite3.connect(DB_NAME)
|
| 589 |
+
c = conn.cursor()
|
| 590 |
+
|
| 591 |
+
# Basic stats
|
| 592 |
+
c.execute("SELECT COUNT(*), AVG(ai_score), AVG(plagiarism_score), AVG(processing_time) FROM results")
|
| 593 |
+
stats = c.fetchone()
|
| 594 |
+
|
| 595 |
+
# High risk documents
|
| 596 |
+
c.execute("SELECT COUNT(*) FROM results WHERE ai_score > 70 OR plagiarism_score > 30")
|
| 597 |
+
high_risk = c.fetchone()[0]
|
| 598 |
+
|
| 599 |
+
conn.close()
|
| 600 |
+
|
| 601 |
+
if stats[0] == 0:
|
| 602 |
+
return "No analyses completed yet."
|
| 603 |
+
|
| 604 |
+
return f"""📊 **Analysis Statistics**
|
| 605 |
+
Total Documents Analyzed: {stats[0]:,}
|
| 606 |
+
Average AI Score: {stats[1]:.1f}%
|
| 607 |
+
Average Plagiarism Score: {stats[2]:.1f}%
|
| 608 |
+
Average Processing Time: {stats[3]:.1f}s
|
| 609 |
+
High Risk Documents: {high_risk} ({(high_risk/stats[0]*100):.1f}%)"""
|
| 610 |
+
|
| 611 |
+
except Exception as e:
|
| 612 |
+
logger.error(f"Error getting statistics: {e}")
|
| 613 |
+
return f"Error loading statistics: {str(e)}"
|
| 614 |
|
| 615 |
+
# -----------------------------
|
| 616 |
+
# ENHANCED GRADIO UI
|
| 617 |
+
# -----------------------------
|
| 618 |
+
def create_enhanced_ui():
|
| 619 |
+
with gr.Blocks(theme="soft", title="AIxBI - Professional Plagiarism Detection") as demo:
|
| 620 |
+
# Header
|
| 621 |
+
with gr.Row():
|
| 622 |
+
if os.path.exists(LOGO_PATH):
|
| 623 |
+
gr.Image(LOGO_PATH, height=80, width=80, show_label=False, container=False)
|
| 624 |
+
with gr.Column():
|
| 625 |
+
gr.Markdown("""
|
| 626 |
+
# 🔍 **AIxBI - Professional Document Analysis Suite**
|
| 627 |
+
### Advanced AI Detection & Plagiarism Checking System
|
| 628 |
+
*Ensuring Academic Integrity with Cutting-Edge Technology*
|
| 629 |
+
""")
|
| 630 |
+
|
| 631 |
+
# Login Section
|
| 632 |
+
login_box = gr.Group(visible=True)
|
| 633 |
+
with login_box:
|
| 634 |
+
gr.Markdown("## 🔐 **Secure Login**")
|
| 635 |
+
with gr.Row():
|
| 636 |
+
user = gr.Textbox(label="👤 Username", placeholder="Enter username")
|
| 637 |
+
pwd = gr.Textbox(label="🔑 Password", type="password", placeholder="Enter password")
|
| 638 |
+
login_btn = gr.Button("🚀 Login", variant="primary", size="lg")
|
| 639 |
+
login_msg = gr.Markdown("", elem_classes="login-message")
|
| 640 |
+
|
| 641 |
+
# Main Application
|
| 642 |
+
app_box = gr.Group(visible=False)
|
| 643 |
+
with app_box:
|
| 644 |
+
with gr.Tabs():
|
| 645 |
+
# Analysis Tab
|
| 646 |
+
with gr.Tab("📄 Document Analysis", elem_id="analysis-tab"):
|
| 647 |
+
with gr.Row():
|
| 648 |
+
with gr.Column(scale=1):
|
| 649 |
+
gr.Markdown("### 👨🎓 **Student Information**")
|
| 650 |
+
student_name = gr.Textbox(label="📝 Student Name", placeholder="Enter full name")
|
| 651 |
+
student_id = gr.Textbox(label="🆔 Student ID", placeholder="Enter student ID")
|
| 652 |
+
|
| 653 |
+
with gr.Column(scale=1):
|
| 654 |
+
gr.Markdown("### 📎 **Document Upload**")
|
| 655 |
+
file_upload = gr.File(
|
| 656 |
+
label="📄 Upload Document",
|
| 657 |
+
file_types=[".pdf", ".docx", ".txt"],
|
| 658 |
+
file_count="single"
|
| 659 |
+
)
|
| 660 |
+
|
| 661 |
+
analyze_btn = gr.Button("🔍 Analyze Document", variant="primary", size="lg")
|
| 662 |
+
|
| 663 |
+
with gr.Row():
|
| 664 |
+
with gr.Column():
|
| 665 |
+
status = gr.Textbox(label="📊 Analysis Status", lines=4, interactive=False)
|
| 666 |
+
doc_info = gr.Textbox(label="📋 Document Information", interactive=False)
|
| 667 |
+
|
| 668 |
+
with gr.Column():
|
| 669 |
+
with gr.Row():
|
| 670 |
+
ai_score = gr.Number(label="🤖 AI Detection Score (%)", interactive=False)
|
| 671 |
+
plagiarism_score = gr.Number(label="📋 Plagiarism Score (%)", interactive=False)
|
| 672 |
+
|
| 673 |
+
suspicious_text = gr.Textbox(
|
| 674 |
+
label="🚨 Flagged Content",
|
| 675 |
+
lines=8,
|
| 676 |
+
placeholder="Suspicious sentences will appear here...",
|
| 677 |
+
interactive=False
|
| 678 |
+
)
|
| 679 |
+
|
| 680 |
+
pdf_output = gr.File(label="📄 Download Detailed Report")
|
| 681 |
+
|
| 682 |
+
# Dashboard Tab
|
| 683 |
+
with gr.Tab("📊 Analysis Dashboard", elem_id="dashboard-tab"):
|
| 684 |
+
with gr.Row():
|
| 685 |
+
dashboard_btn = gr.Button("🔄 Refresh Dashboard", variant="secondary")
|
| 686 |
+
stats_btn = gr.Button("📈 Show Statistics", variant="secondary")
|
| 687 |
+
|
| 688 |
+
stats_display = gr.Markdown("", elem_classes="stats-display")
|
| 689 |
+
dashboard = gr.Dataframe(
|
| 690 |
+
headers=["ID", "Student ID", "Student Name", "AI Score (%)",
|
| 691 |
+
"Plagiarism Score (%)", "Word Count", "Flagged Sentences",
|
| 692 |
+
"Processing Time (s)", "File Type", "Timestamp", "Status"],
|
| 693 |
+
interactive=False,
|
| 694 |
+
wrap=True
|
| 695 |
+
)
|
| 696 |
+
|
| 697 |
+
# Help Tab
|
| 698 |
+
with gr.Tab("❓ Help & Guidelines", elem_id="help-tab"):
|
| 699 |
+
gr.Markdown("""
|
| 700 |
+
## 📖 **User Guide**
|
| 701 |
+
|
| 702 |
+
### 🎯 **How to Use**
|
| 703 |
+
1. **Login** with your credentials
|
| 704 |
+
2. **Enter student information** (name and ID)
|
| 705 |
+
3. **Upload document** (PDF, DOCX, or TXT format)
|
| 706 |
+
4. **Click "Analyze Document"** and wait for results
|
| 707 |
+
5. **Download the detailed PDF report** for comprehensive analysis
|
| 708 |
+
|
| 709 |
+
### 🔍 **Understanding Results**
|
| 710 |
+
|
| 711 |
+
#### 🤖 **AI Detection Score**
|
| 712 |
+
- **0-30%**: Low probability of AI-generated content
|
| 713 |
+
- **31-60%**: Moderate probability - review recommended
|
| 714 |
+
- **61-100%**: High probability - likely AI-generated
|
| 715 |
+
|
| 716 |
+
#### 📋 **Plagiarism Score**
|
| 717 |
+
- **0-15%**: Acceptable similarity level
|
| 718 |
+
- **16-30%**: Moderate concern - check citations
|
| 719 |
+
- **31%+**: High concern - significant plagiarism detected
|
| 720 |
+
|
| 721 |
+
#### 🚨 **Risk Levels**
|
| 722 |
+
- **🟢 LOW**: Minimal concerns detected
|
| 723 |
+
- **🟡 MEDIUM**: Some issues found - review needed
|
| 724 |
+
- **🔴 HIGH**: Serious concerns - immediate action required
|
| 725 |
+
|
| 726 |
+
### 📄 **Supported File Formats**
|
| 727 |
+
- **PDF**: Adobe PDF documents
|
| 728 |
+
- **DOCX**: Microsoft Word documents
|
| 729 |
+
- **TXT**: Plain text files
|
| 730 |
+
|
| 731 |
+
### 🛡️ **Best Practices**
|
| 732 |
+
- Upload final versions of documents
|
| 733 |
+
- Ensure documents contain at least 100 words
|
| 734 |
+
- Review flagged content carefully
|
| 735 |
+
- Use reports for educational feedback
|
| 736 |
+
|
| 737 |
+
### ⚠️ **Important Notes**
|
| 738 |
+
- Analysis results are for educational purposes
|
| 739 |
+
- False positives may occur - human review recommended
|
| 740 |
+
- Keep PDF reports for documentation
|
| 741 |
+
- All analyses are logged for institutional records
|
| 742 |
+
""")
|
| 743 |
+
|
| 744 |
+
# Event Handlers
|
| 745 |
+
login_btn.click(
|
| 746 |
+
fn=login,
|
| 747 |
+
inputs=[user, pwd],
|
| 748 |
+
outputs=[login_box, app_box, login_msg]
|
| 749 |
+
)
|
| 750 |
+
|
| 751 |
+
analyze_btn.click(
|
| 752 |
+
fn=analyze_document,
|
| 753 |
+
inputs=[student_name, student_id, file_upload],
|
| 754 |
+
outputs=[status, ai_score, plagiarism_score, pdf_output, suspicious_text, doc_info]
|
| 755 |
+
)
|
| 756 |
+
|
| 757 |
+
dashboard_btn.click(
|
| 758 |
+
fn=show_enhanced_dashboard,
|
| 759 |
+
outputs=[dashboard]
|
| 760 |
+
)
|
| 761 |
+
|
| 762 |
+
stats_btn.click(
|
| 763 |
+
fn=get_statistics,
|
| 764 |
+
outputs=[stats_display]
|
| 765 |
+
)
|
| 766 |
+
|
| 767 |
+
return demo
|
| 768 |
|
| 769 |
+
# -----------------------------
|
| 770 |
+
# ADDITIONAL UTILITY FUNCTIONS
|
| 771 |
+
# -----------------------------
|
| 772 |
+
def cleanup_old_reports(days_old: int = 30):
|
| 773 |
+
"""Clean up old report files"""
|
| 774 |
+
try:
|
| 775 |
+
import glob
|
| 776 |
+
report_files = glob.glob("reports/*.pdf")
|
| 777 |
+
current_time = time.time()
|
| 778 |
+
|
| 779 |
+
for file_path in report_files:
|
| 780 |
+
if os.path.getmtime(file_path) < (current_time - days_old * 24 * 60 * 60):
|
| 781 |
+
os.remove(file_path)
|
| 782 |
+
logger.info(f"Cleaned up old report: {file_path}")
|
| 783 |
+
except Exception as e:
|
| 784 |
+
logger.error(f"Error during cleanup: {e}")
|
| 785 |
+
|
| 786 |
+
def export_database_backup():
|
| 787 |
+
"""Export database to CSV for backup"""
|
| 788 |
+
try:
|
| 789 |
+
df = load_results()
|
| 790 |
+
backup_file = f"backup_results_{datetime.now().strftime('%Y%m%d_%H%M%S')}.csv"
|
| 791 |
+
df.to_csv(backup_file, index=False)
|
| 792 |
+
logger.info(f"Database backup created: {backup_file}")
|
| 793 |
+
return backup_file
|
| 794 |
+
except Exception as e:
|
| 795 |
+
logger.error(f"Error creating backup: {e}")
|
| 796 |
+
return None
|
| 797 |
|
| 798 |
+
def validate_system_requirements():
|
| 799 |
+
"""Check if all required components are available"""
|
| 800 |
+
requirements = {
|
| 801 |
+
"Models loaded": embedder is not None and model is not None,
|
| 802 |
+
"Database accessible": os.path.exists(DB_NAME),
|
| 803 |
+
"Reports directory": os.path.exists("reports") or os.makedirs("reports", exist_ok=True) or True,
|
| 804 |
+
"Logo file": os.path.exists(LOGO_PATH)
|
| 805 |
+
}
|
| 806 |
+
|
| 807 |
+
for requirement, status in requirements.items():
|
| 808 |
+
if status:
|
| 809 |
+
logger.info(f"✅ {requirement}")
|
| 810 |
+
else:
|
| 811 |
+
logger.warning(f"❌ {requirement}")
|
| 812 |
+
|
| 813 |
+
return all(requirements.values())
|
| 814 |
|
| 815 |
# -----------------------------
|
| 816 |
+
# PERFORMANCE MONITORING
|
| 817 |
# -----------------------------
|
| 818 |
+
def log_performance_metrics():
|
| 819 |
+
"""Log system performance metrics"""
|
| 820 |
+
try:
|
| 821 |
+
import psutil
|
| 822 |
+
cpu_percent = psutil.cpu_percent()
|
| 823 |
+
memory_percent = psutil.virtual_memory().percent
|
| 824 |
+
disk_usage = psutil.disk_usage('.').percent
|
| 825 |
+
|
| 826 |
+
logger.info(f"Performance - CPU: {cpu_percent}%, Memory: {memory_percent}%, Disk: {disk_usage}%")
|
| 827 |
+
|
| 828 |
+
# Log database size
|
| 829 |
+
if os.path.exists(DB_NAME):
|
| 830 |
+
db_size = os.path.getsize(DB_NAME) / (1024 * 1024) # MB
|
| 831 |
+
logger.info(f"Database size: {db_size:.2f} MB")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 832 |
|
| 833 |
+
except ImportError:
|
| 834 |
+
logger.warning("psutil not available - performance monitoring disabled")
|
| 835 |
+
except Exception as e:
|
| 836 |
+
logger.error(f"Error logging performance metrics: {e}")
|
| 837 |
|
| 838 |
+
# -----------------------------
|
| 839 |
+
# MAIN APPLICATION STARTUP
|
| 840 |
+
# -----------------------------
|
| 841 |
+
def main():
|
| 842 |
+
"""Main application entry point"""
|
| 843 |
+
try:
|
| 844 |
+
logger.info("Starting AIxBI Plagiarism Detection System")
|
| 845 |
+
|
| 846 |
+
# Validate system requirements
|
| 847 |
+
if not validate_system_requirements():
|
| 848 |
+
logger.error("System requirements not met. Please check the logs.")
|
| 849 |
+
return
|
| 850 |
+
|
| 851 |
+
# Clean up old reports on startup
|
| 852 |
+
cleanup_old_reports()
|
| 853 |
+
|
| 854 |
+
# Log performance metrics
|
| 855 |
+
log_performance_metrics()
|
| 856 |
+
|
| 857 |
+
# Create and launch the enhanced UI
|
| 858 |
+
demo = create_enhanced_ui()
|
| 859 |
+
|
| 860 |
+
logger.info("System ready - launching web interface")
|
| 861 |
+
demo.launch(
|
| 862 |
+
server_name="0.0.0.0",
|
| 863 |
+
server_port=7860,
|
| 864 |
+
share=False,
|
| 865 |
+
show_error=True,
|
| 866 |
+
quiet=False
|
| 867 |
+
)
|
| 868 |
+
|
| 869 |
+
except Exception as e:
|
| 870 |
+
logger.error(f"Failed to start application: {e}")
|
| 871 |
+
raise
|
| 872 |
|
| 873 |
if __name__ == "__main__":
|
| 874 |
+
main()
|