mohitrulzz committed on
Commit
da3ff3f
·
verified ·
1 Parent(s): 27c8f12

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +72 -27
app.py CHANGED
@@ -108,59 +108,104 @@ def calculate_file_hash(file_path: str) -> str:
108
  hash_sha256.update(chunk)
109
  return hash_sha256.hexdigest()
110
 
111
def extract_text(file_obj) -> Tuple[Optional[str], Optional[dict]]:
    """Extract text and basic metadata from an uploaded PDF, DOCX, or TXT file.

    Args:
        file_obj: Object exposing a ``name`` attribute holding the path of the
            uploaded file, or ``None``.

    Returns:
        A ``(text, metadata)`` pair. ``text`` is the stripped document text and
        ``metadata`` holds ``file_type``, ``file_size``, ``file_hash``,
        ``word_count`` and ``char_count`` (plus ``page_count`` for PDFs or
        ``paragraph_count`` for DOCX files). ``(None, None)`` is returned when
        ``file_obj`` is ``None``, the extension is unsupported, any step of the
        extraction fails, or the stripped text is shorter than 50 characters.
    """
    if file_obj is None:
        return None, None

    name = file_obj.name
    ext = os.path.splitext(name)[1].lower()

    # Reject unsupported types up front, before any temp file is created.
    if ext not in (".pdf", ".docx", ".txt"):
        logger.warning(f"Unsupported file type: {ext}")
        return None, None

    metadata = {'file_type': ext}
    text = None
    tmp_path = None

    try:
        # Reserve a temp path that preserves the extension, then close the
        # handle before copying so the copy does not write to an open file.
        with tempfile.NamedTemporaryFile(delete=False, suffix=ext) as tmp:
            tmp_path = tmp.name
        shutil.copy(name, tmp_path)

        # Size/hash are computed inside the try so a failure here is reported
        # like any other extraction error and the temp file is still removed.
        metadata['file_size'] = os.path.getsize(tmp_path)
        metadata['file_hash'] = calculate_file_hash(tmp_path)

        if ext == ".pdf":
            with pdfplumber.open(tmp_path) as pdf:
                # extract_text() may return None for a page; treat as empty.
                text = " ".join(page.extract_text() or "" for page in pdf.pages)
                metadata['page_count'] = len(pdf.pages)
        elif ext == ".docx":
            doc = docx.Document(tmp_path)
            text = " ".join(p.text for p in doc.paragraphs)
            metadata['paragraph_count'] = len(doc.paragraphs)
        else:  # ".txt"
            with open(tmp_path, "r", encoding="utf-8", errors="ignore") as f:
                text = f.read()
    except Exception as e:
        logger.error(f"Error extracting text from {name}: {e}")
        return None, None
    finally:
        # Best-effort cleanup: a failed unlink must not mask the real result.
        if tmp_path is not None:
            try:
                os.unlink(tmp_path)
            except OSError:
                pass

    if not text or len(text.strip()) < 50:
        logger.warning("Extracted text is too short or empty")
        return None, None

    text = text.strip()
    metadata.update({
        'word_count': len(text.split()),
        'char_count': len(text),
    })

    return text, metadata
166
 
@@ -508,8 +553,8 @@ def analyze_document(student_name: str, student_id: str, file_obj) -> Tuple:
508
  logger.info(f"Starting analysis for {student_name} ({student_id})")
509
 
510
  try:
511
- # Extract text and metadata
512
- result = extract_text(file_obj)
513
  if result is None or result[0] is None:
514
  return "❌ Error: Could not read the file. Please upload a valid PDF, DOCX, or TXT.", None, None, None, None, None
515
 
 
108
  hash_sha256.update(chunk)
109
  return hash_sha256.hexdigest()
110
 
111
def extract_text(file_obj):
    """Extract plain text from an uploaded PDF, DOCX, or TXT file.

    Args:
        file_obj: Object exposing a ``name`` attribute holding the path of the
            uploaded file, or ``None``.

    Returns:
        The extracted text with surrounding whitespace stripped, or ``None``
        when ``file_obj`` is ``None``, the extension is not one of
        ``.pdf``/``.docx``/``.txt``, extraction fails, or the document
        contains no non-whitespace text.
    """
    if file_obj is None:
        return None

    name = file_obj.name
    ext = os.path.splitext(name)[1].lower()

    # Reject unsupported types before touching the filesystem.
    if ext not in (".pdf", ".docx", ".txt"):
        return None

    text = None
    tmp_path = None
    try:
        # Reserve a temp path that preserves the extension, then close the
        # handle before copying so the copy does not write to an open file.
        with tempfile.NamedTemporaryFile(delete=False, suffix=ext) as tmp:
            tmp_path = tmp.name
        shutil.copy(name, tmp_path)

        if ext == ".pdf":
            with pdfplumber.open(tmp_path) as pdf:
                # extract_text() may return None for a page; treat as empty.
                text = " ".join(page.extract_text() or "" for page in pdf.pages)
        elif ext == ".docx":
            doc = docx.Document(tmp_path)
            text = " ".join(p.text for p in doc.paragraphs)
        else:  # ".txt"
            with open(tmp_path, "r", encoding="utf-8", errors="ignore") as f:
                text = f.read()
    except Exception as e:
        # Report the failure instead of swallowing it silently; callers still
        # receive None, as before.
        logger.error(f"Error extracting text from {name}: {e}")
        return None
    finally:
        # Best-effort cleanup: a failed unlink must not mask the real result.
        if tmp_path is not None:
            try:
                os.unlink(tmp_path)
            except OSError:
                pass

    # Whitespace-only content counts as "no text" and yields None, not "".
    stripped = text.strip() if text else ""
    return stripped or None
146
+
147
def extract_text_with_metadata(file_obj) -> Tuple[Optional[str], Optional[dict]]:
    """Extract text via ``extract_text`` and attach file metadata.

    Args:
        file_obj: Object exposing a ``name`` attribute holding the path of the
            uploaded file, or ``None``.

    Returns:
        A ``(text, metadata)`` pair, or ``(None, None)`` when ``file_obj`` is
        ``None``, ``extract_text`` yields no text, or the stripped text is
        shorter than 50 characters. ``metadata`` always contains
        ``file_type``, ``file_size``, ``file_hash``, ``word_count`` and
        ``char_count``; ``file_size``/``file_hash`` fall back to ``0``/``''``
        when they cannot be computed. PDFs add ``page_count`` and DOCX files
        add ``paragraph_count`` (``'Unknown'`` if the document cannot be
        re-opened).
    """
    if file_obj is None:
        return None, None

    text = extract_text(file_obj)
    if text is None:
        return None, None

    # Validate first: no point copying, hashing and re-parsing a document
    # whose text is going to be rejected anyway.
    if len(text.strip()) < 50:
        logger.warning("Extracted text is too short for meaningful analysis")
        return None, None

    name = file_obj.name
    ext = os.path.splitext(name)[1].lower()

    # Start from the minimal metadata so a failure below still leaves a
    # complete, well-formed dict.
    metadata = {
        'file_type': ext,
        'file_size': 0,
        'file_hash': '',
        'word_count': len(text.split()),
        'char_count': len(text),
    }

    tmp_path = None
    try:
        # Work on a temp copy that preserves the extension; the handle is
        # closed before copying so the copy does not write to an open file.
        with tempfile.NamedTemporaryFile(delete=False, suffix=ext) as tmp:
            tmp_path = tmp.name
        shutil.copy(name, tmp_path)

        # Compute both values before storing either, so a hashing failure
        # leaves the size/hash fallbacks untouched together.
        size = os.path.getsize(tmp_path)
        digest = calculate_file_hash(tmp_path)
        metadata['file_size'] = size
        metadata['file_hash'] = digest

        # Type-specific counts are optional extras: failure degrades to
        # 'Unknown' rather than discarding the rest of the metadata.
        if ext == ".pdf":
            try:
                with pdfplumber.open(tmp_path) as pdf:
                    metadata['page_count'] = len(pdf.pages)
            except Exception:
                metadata['page_count'] = 'Unknown'
        elif ext == ".docx":
            try:
                doc = docx.Document(tmp_path)
                metadata['paragraph_count'] = len(doc.paragraphs)
            except Exception:
                metadata['paragraph_count'] = 'Unknown'
    except Exception as e:
        # Text was extracted successfully, so return it with minimal metadata.
        logger.error(f"Error gathering metadata from {name}: {e}")
    finally:
        # Best-effort cleanup: a failed unlink must not mask the real result.
        if tmp_path is not None:
            try:
                os.unlink(tmp_path)
            except OSError:
                pass

    return text, metadata
211
 
 
553
  logger.info(f"Starting analysis for {student_name} ({student_id})")
554
 
555
  try:
556
+ # Extract text and metadata using the working function
557
+ result = extract_text_with_metadata(file_obj)
558
  if result is None or result[0] is None:
559
  return "❌ Error: Could not read the file. Please upload a valid PDF, DOCX, or TXT.", None, None, None, None, None
560