Commit 50da96f · Parent: 954da5f
Refine initial setup for ingestion_js #16 (upd pdf-ext)
Files changed:
- ingestion_js/app/api/upload/route.ts   +21 -0
- ingestion_js/lib/chunker.ts            +78 -20
- ingestion_js/lib/parser.ts             +47 -11
- ingestion_js/package-lock.json         +0 -19
- ingestion_js/package.json              +2 -1
- ingestion_js/test_simple.sh            +27 -39
- ingestion_js/test_upload_js.sh         +5 -3
ingestion_js/app/api/upload/route.ts
CHANGED
@@ -3,6 +3,7 @@ import { randomUUID } from 'crypto'
 import { extractPages } from '@/lib/parser'
 import { buildCardsFromPages } from '@/lib/chunker'
 import { embedRemote } from '@/lib/embedder'
+import { captionImage, normalizeCaption } from '@/lib/captioner'
 import { deleteFileData, storeCards, upsertFileSummary } from '@/lib/mongo'
 import { cheapSummarize } from '@/lib/summarizer'
 import { createJob, updateJob } from '@/lib/jobs'

@@ -55,6 +56,7 @@ export async function POST(req: NextRequest) {
   // Start processing immediately
   try {
     await processAll(job_id, user_id, project_id, preloaded, replaceSet)
+    await updateJob(job_id, { status: 'completed' })
     return NextResponse.json({ job_id, status: 'completed', total_files: preloaded.length })
   } catch (e) {
     console.error(`[UPLOAD_DEBUG] Processing failed for job ${job_id}:`, e)

@@ -80,6 +82,25 @@ async function processAll(job_id: string, user_id: string, project_id: string, f
     const pages = await extractPages(fname, buf)
     console.log(`[UPLOAD_DEBUG] Extracted ${pages.length} pages`)

+    // Process images with captions (best effort - images not extracted in current parser)
+    // This matches Python behavior where captions are appended to page text
+    for (const page of pages) {
+      if (page.images && page.images.length > 0) {
+        const captions: string[] = []
+        for (const img of page.images) {
+          try {
+            const caption = await captionImage(img)
+            if (caption) captions.push(normalizeCaption(caption))
+          } catch (e) {
+            console.warn(`[${job_id}] Caption error in ${fname}: ${e}`)
+          }
+        }
+        if (captions.length > 0) {
+          page.text = (page.text + '\n\n' + captions.map(c => `[Image] ${c}`).join('\n')).trim()
+        }
+      }
+    }
+
     console.log(`[UPLOAD_DEBUG] Building cards from pages`)
     const cards = await buildCardsFromPages(pages, fname, user_id, project_id)
     console.log(`[UPLOAD_DEBUG] Built ${cards.length} cards`)
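For orientation, here is a minimal client sketch of how this route is exercised, mirroring the multipart fields the test scripts below send (user_id, project_id, files) and the JSON the handler returns ({ job_id, status, total_files }). The base URL, file path and MIME type are placeholder assumptions; this is not part of the commit.

// Hypothetical client for POST /api/upload (Node 18+, global fetch/FormData/Blob).
import { readFile } from 'node:fs/promises'

const BASE_URL = 'http://localhost:3000/api' // assumption: local Next.js dev server

async function uploadOne(path: string, userId: string, projectId: string) {
  const buf = await readFile(path)
  const form = new FormData()
  form.append('user_id', userId)
  form.append('project_id', projectId)
  // One multipart part per file, as in test_simple.sh (-F "files=@...").
  form.append('files', new Blob([buf], { type: 'application/pdf' }), 'Lecture5_ML.pdf')

  const res = await fetch(`${BASE_URL}/upload`, { method: 'POST', body: form })
  if (!res.ok) throw new Error(`Upload failed: ${res.status}`)
  return res.json() as Promise<{ job_id: string; status: string; total_files: number }>
}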
ingestion_js/lib/chunker.ts
CHANGED
@@ -2,36 +2,94 @@ import slugify from 'slugify'
 import type { Page } from './parser'
 import { cheapSummarize, cleanChunkText } from './summarizer'

+const MAX_WORDS = 500
+const MIN_WORDS = 150
+const OVERLAP_WORDS = 50

 function byHeadings(text: string): string[] {
+  // Enhanced patterns matching Python logic
+  const patterns = [
+    /^(#{1,6}\s.*)\s*$/gm, // Markdown headers
+    /^([0-9]+\.\s+[^\n]+)\s*$/gm, // Numbered sections
+    /^([A-Z][A-Za-z0-9\s\-]{2,}\n[-=]{3,})\s*$/gm, // Underlined headers
+    /^(Chapter\s+\d+.*|Section\s+\d+.*)\s*$/gm, // Chapter/Section headers
+    /^(Abstract|Introduction|Conclusion|References|Bibliography)\s*$/gm, // Common academic sections
+  ]
+
   const parts: string[] = []
+  let last = 0
+  const allMatches: Array<{start: number, end: number, header: string}> = []
+
+  // Find all matches from all patterns
+  for (const pattern of patterns) {
+    let match
+    while ((match = pattern.exec(text)) !== null) {
+      allMatches.push({
+        start: match.index,
+        end: match.index + match[0].length,
+        header: match[1].trim()
+      })
+    }
+  }
+
+  // Sort matches by position
+  allMatches.sort((a, b) => a.start - b.start)
+
+  // Split text based on matches
+  for (const { start, end, header } of allMatches) {
+    if (start > last) {
+      parts.push(text.slice(last, start))
+    }
+    parts.push(text.slice(start, end))
+    last = end
   }
+
+  if (last < text.length) {
+    parts.push(text.slice(last))
+  }
+
+  if (parts.length === 0) {
+    parts.push(text)
+  }
+
   return parts.filter(p => p.trim().length > 0)
 }

 function createOverlappingChunks(blocks: string[]): string[] {
+  const chunks: string[] = []
+
+  for (let i = 0; i < blocks.length; i++) {
+    const block = blocks[i]
+    const words = block.split(/\s+/).filter(w => w.length > 0)
+
+    if (words.length === 0) continue
+
+    // If block is small enough, use as-is
+    if (words.length <= MAX_WORDS) {
+      chunks.push(block)
+      continue
+    }
+
+    // Split large blocks with overlap
+    let start = 0
+    while (start < words.length) {
+      const end = Math.min(start + MAX_WORDS, words.length)
+      let chunkWords = words.slice(start, end)
+
+      // Add overlap from previous chunk if available
+      if (start > 0 && chunks.length > 0) {
+        const prevWords = chunks[chunks.length - 1].split(/\s+/).filter(w => w.length > 0)
+        const overlapStart = Math.max(0, prevWords.length - OVERLAP_WORDS)
+        const overlapWords = prevWords.slice(overlapStart)
+        chunkWords = [...overlapWords, ...chunkWords]
+      }
+
+      chunks.push(chunkWords.join(' '))
+      start = end - OVERLAP_WORDS // Overlap with next chunk
     }
   }
+
+  return chunks
 }

 export async function buildCardsFromPages(pages: Page[], filename: string, user_id: string, project_id: string) {
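One thing worth noting about the new chunker, plus a small standalone sketch (the module only exports buildCardsFromPages; byHeadings and createOverlappingChunks are internal): in the committed while loop, start = end - OVERLAP_WORDS can never reach words.length once end hits the end of the block, so for blocks longer than MAX_WORDS the loop does not appear to terminate as written. The sketch below shows the intended sliding-window arithmetic with an explicit stop at the end of the block; the constants come from the diff, everything else is illustrative and not part of the commit.

// Illustrative sliding-window chunking over a word array; constants as in chunker.ts.
const MAX_WORDS = 500
const OVERLAP_WORDS = 50

function windowWords(words: string[]): string[][] {
  const windows: string[][] = []
  let start = 0
  while (start < words.length) {
    const end = Math.min(start + MAX_WORDS, words.length)
    windows.push(words.slice(start, end))
    if (end === words.length) break   // stop once the block is exhausted
    start = end - OVERLAP_WORDS       // next window re-reads the last 50 words
  }
  return windows
}

// A 1200-word block yields windows over words [0,500), [450,950), [900,1200).
const sample = Array.from({ length: 1200 }, (_, i) => `w${i}`)
console.log(windowWords(sample).map(w => w.length)) // [500, 500, 300]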
ingestion_js/lib/parser.ts
CHANGED
@@ -1,3 +1,4 @@
+import * as pdfjs from 'pdfjs-dist/legacy/build/pdf'
 import mammoth from 'mammoth'

 export type Page = { page_num: number; text: string; images: Buffer[] }

@@ -5,24 +6,57 @@ export type Page = { page_num: number; text: string; images: Buffer[] }
 export async function parsePdfBytes(buf: Buffer): Promise<Page[]> {
   console.log(`[PARSER_DEBUG] Parsing PDF with ${buf.length} bytes`)

-  // For now, return a simple text extraction as a fallback
-  // This avoids the pdf-text-extract Buffer issue
   try {
+    // Convert Buffer to Uint8Array for pdfjs-dist
+    const uint8Array = new Uint8Array(buf)
+    const loadingTask = pdfjs.getDocument({ data: uint8Array })
+    const pdf = await loadingTask.promise
+    const pages: Page[] = []
+
+    console.log(`[PARSER_DEBUG] PDF has ${pdf.numPages} pages`)
+
+    for (let i = 1; i <= pdf.numPages; i++) {
+      console.log(`[PARSER_DEBUG] Processing page ${i}`)
+
+      const page = await pdf.getPage(i)
+      const textContent = await page.getTextContent()
+
+      // Extract text like Python PyMuPDF does
+      const text = textContent.items
+        .map((item: any) => item.str || '')
+        .join(' ')
+        .trim()
+
+      console.log(`[PARSER_DEBUG] Page ${i} extracted ${text.length} characters`)
+
+      // For now, we don't extract images from PDF in serverless (complex)
+      // This matches the current limitation but we could add image extraction later
+      pages.push({
+        page_num: i,
+        text: text || `[Page ${i} - No text content extracted]`,
+        images: [] // Images not extracted in current implementation
+      })
+    }
+
+    console.log(`[PARSER_DEBUG] Parsed PDF with ${pages.length} pages`)
+    return pages
   } catch (error) {
     console.error('[PARSER_DEBUG] PDF parsing error:', error)
+    // Fallback to simple text representation
+    return [{ page_num: 1, text: `[PDF Content - ${buf.length} bytes - Parse error: ${error}]`, images: [] }]
   }
 }

 export async function parseDocxBytes(buf: Buffer): Promise<Page[]> {
+  try {
+    const { value } = await mammoth.extractRawText({ buffer: buf })
+    const text = value || ''
+    console.log(`[PARSER_DEBUG] DOCX extracted ${text.length} characters`)
+    return [{ page_num: 1, text, images: [] }]
+  } catch (error) {
+    console.error('[PARSER_DEBUG] DOCX parsing error:', error)
+    return [{ page_num: 1, text: `[DOCX Parse Error: ${error}]`, images: [] }]
+  }
 }

 export function inferMime(filename: string): string {

@@ -35,6 +69,8 @@ export function inferMime(filename: string): string {

 export async function extractPages(filename: string, file: Buffer): Promise<Page[]> {
   const mime = inferMime(filename)
+  console.log(`[PARSER_DEBUG] Processing ${filename} as ${mime}`)
+
   if (mime === 'application/pdf') return parsePdfBytes(file)
   if (mime === 'application/vnd.openxmlformats-officedocument.wordprocessingml.document') return parseDocxBytes(file)
   throw new Error(`Unsupported file type: ${filename}`)
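A small smoke-test sketch for the new parser, run from inside ingestion_js so the relative import resolves (inside the Next.js app the same module is imported as '@/lib/parser'); the sample file name comes from test_simple.sh, and everything else is assumed for illustration rather than taken from the commit.

// Hypothetical local check of the pdfjs-dist based extraction.
import { readFile } from 'node:fs/promises'
import { extractPages } from './lib/parser'

async function main() {
  const buf = await readFile('../exefiles/Lecture5_ML.pdf') // sample file used by test_simple.sh
  const pages = await extractPages('Lecture5_ML.pdf', buf)
  for (const page of pages) {
    const words = page.text.split(/\s+/).filter(w => w.length > 0)
    console.log(`page ${page.page_num}: ${words.length} words, ${page.images.length} images`)
  }
}

main().catch(err => {
  console.error(err)
  process.exit(1)
})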
ingestion_js/package-lock.json
CHANGED
@@ -11,7 +11,6 @@
         "mammoth": "^1.6.0",
         "mongodb": "^6.8.0",
         "next": "^14.2.5",
-        "pdf-text-extract": "^1.0.0",
         "react": "^18.3.1",
         "react-dom": "^18.3.1",
         "slugify": "^1.6.6"

@@ -610,18 +609,6 @@
         "node": ">=0.10.0"
       }
     },
-    "node_modules/pdf-text-extract": {
-      "version": "1.5.0",
-      "resolved": "https://registry.npmjs.org/pdf-text-extract/-/pdf-text-extract-1.5.0.tgz",
-      "integrity": "sha512-5zpNQljVf4h0b9sY8KGKDHxYoTYqDjahvkxmpHwpxBe3p92AWnscpWausl5/OaedOgnS8Pw53DOQx7bqtYcpow==",
-      "license": "BSD",
-      "dependencies": {
-        "yargs": "^1.2.5"
-      },
-      "bin": {
-        "pdf-text-extract": "bin/pdf-text-extract.js"
-      }
-    },
     "node_modules/picocolors": {
       "version": "1.1.1",
       "resolved": "https://registry.npmjs.org/picocolors/-/picocolors-1.1.1.tgz",

@@ -886,12 +873,6 @@
       "engines": {
         "node": ">=4.0"
       }
-    },
-    "node_modules/yargs": {
-      "version": "1.3.3",
-      "resolved": "https://registry.npmjs.org/yargs/-/yargs-1.3.3.tgz",
-      "integrity": "sha512-7OGt4xXoWJQh5ulgZ78rKaqY7dNWbjfK+UKxGcIlaM2j7C4fqGchyv8CPvEWdRPrHp6Ula/YU8yGRpYGOHrI+g==",
-      "license": "MIT/X11"
     }
   }
 }
ingestion_js/package.json
CHANGED
@@ -12,11 +12,12 @@
     "start": "next start"
   },
   "dependencies": {
+    "mammoth": "^1.6.0",
     "mongodb": "^6.8.0",
     "next": "^14.2.5",
+    "pdfjs-dist": "^3.11.174",
     "react": "^18.3.1",
     "react-dom": "^18.3.1",
-    "mammoth": "^1.6.0",
     "slugify": "^1.6.6"
   },
   "devDependencies": {
ingestion_js/test_simple.sh
CHANGED
@@ -10,9 +10,9 @@ BACKEND_URL="https://study-buddy-ingestion1.vercel.app/api"
 USER_ID="44e65346-8eaa-4f95-b17a-f6219953e7a8"
 PROJECT_ID="496e2fad-ec7e-4562-b06a-ea2491f2460"

-# Test
+# Test file
 SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)"
-FILE1="$SCRIPT_DIR/../exefiles/
+FILE1="$SCRIPT_DIR/../exefiles/Lecture5_ML.pdf"

 echo "📋 Configuration:"
 echo " Backend URL: $BACKEND_URL"

@@ -26,8 +26,7 @@ if [ ! -f "$FILE1" ]; then echo "❌ Missing file: $FILE1"; exit 26; fi

 echo "🏥 Step 1: Health Check"
 echo "------------------------"
-curl -sS "$BACKEND_URL/health"
-echo ""
+curl -sS -X GET "$BACKEND_URL/health" -H "Accept: application/json" | jq '.' || echo "Health check failed"
 echo ""

 echo "📁 Step 2: Upload File"

@@ -35,52 +34,41 @@ echo "----------------------"
 UPLOAD_RESPONSE=$(curl -sS -X POST "$BACKEND_URL/upload" \
   -F "user_id=$USER_ID" \
   -F "project_id=$PROJECT_ID" \
-  -F "files=@$FILE1"
+  -F "files=@$FILE1" \
+  -w "\nHTTP_STATUS:%{http_code}")

-echo "$UPLOAD_RESPONSE"
-echo ""
+HTTP_STATUS=$(echo "$UPLOAD_RESPONSE" | grep "HTTP_STATUS:" | cut -d: -f2)
+RESPONSE_BODY=$(echo "$UPLOAD_RESPONSE" | grep -v "HTTP_STATUS:")

+echo "HTTP Status: $HTTP_STATUS"
+echo "Response:"
+echo "$RESPONSE_BODY" | jq '.' || echo "$RESPONSE_BODY"

-if [
-  echo "❌
+if [ "$HTTP_STATUS" != "200" ]; then
+  echo "❌ Upload failed with status $HTTP_STATUS"
   exit 1
 fi

-for i in {1..10}; do
-  echo "Check $i/10..."
-  STATUS_RESPONSE=$(curl -sS "$BACKEND_URL/upload/status?job_id=$JOB_ID")
-  echo "Status: $STATUS_RESPONSE"
-  if echo "$STATUS_RESPONSE" | grep -q '"status":"completed"'; then
-    echo "✅ Upload completed!"
-    break
-  elif echo "$STATUS_RESPONSE" | grep -q '"status":"failed"'; then
-    echo "❌ Upload failed!"
-    break
-  fi
-  sleep 100
-done
+JOB_ID=$(echo "$RESPONSE_BODY" | jq -r '.job_id // empty')
+if [ -z "$JOB_ID" ]; then
+  echo "❌ No job_id in response"
+  exit 1
+fi

 echo ""
-curl -sS "$BACKEND_URL/debug?job_id=$JOB_ID"
+echo "✅ Upload initiated successfully!"
+echo " Job ID: $JOB_ID"
 echo ""
+
+echo "📊 Step 3: Check Status"
+echo "-----------------------"
+curl -sS -X GET "$BACKEND_URL/upload/status?job_id=$JOB_ID" -H "Accept: application/json" | jq '.' || echo "Status check failed"
 echo ""

-echo "📋 Step
+echo "📋 Step 4: List Files"
 echo "---------------------"
-curl -sS "$BACKEND_URL/files?user_id=$USER_ID&project_id=$PROJECT_ID"
-echo ""
+curl -sS -X GET "$BACKEND_URL/files?user_id=$USER_ID&project_id=$PROJECT_ID" -H "Accept: application/json" | jq '.' || echo "List files failed"
 echo ""

-echo "🎉
+echo "🎉 Simple test completed!"
+echo "========================"
ingestion_js/test_upload_js.sh
CHANGED
@@ -124,7 +124,9 @@ for i in {1..12}; do
   if echo "$STATUS_RESPONSE" | grep -q '"status":"completed"'; then
     echo "✅ Upload completed successfully!"; break
   elif echo "$STATUS_RESPONSE" | grep -q '"status":"processing"'; then
-    echo "⏳ Still processing... waiting
+    echo "⏳ Still processing... waiting 20 seconds"; sleep 20
+  elif echo "$STATUS_RESPONSE" | grep -q '"status":"failed"'; then
+    echo "❌ Upload failed: $STATUS_RESPONSE"; break
   else
     echo "❌ Upload failed or unknown status: $STATUS_RESPONSE"; break
   fi

@@ -186,7 +188,7 @@ echo "-------------------------------------"
 for i in {1..12}; do
   echo "Checking progress (attempt $i/12)..."
   STATUS_RESPONSE=$(curl -L --http1.1 --fail-with-body -sS \
-    --connect-timeout
+    --connect-timeout 1800 --retry 3 --retry-delay 4 --retry-connrefused \
    -H "Accept: application/json" \
    "$BACKEND_URL/upload/status?job_id=$JOB_ID2" 2>/dev/null || echo '{"status":"error"}')

@@ -195,7 +197,7 @@ for i in {1..12}; do
   if echo "$STATUS_RESPONSE" | grep -q '"status":"completed"'; then
     echo "✅ Upload 2 completed successfully!"; break
   elif echo "$STATUS_RESPONSE" | grep -q '"status":"processing"'; then
-    echo "⏳ Still processing... waiting
+    echo "⏳ Still processing... waiting 120 seconds"; sleep 120
   else
     echo "❌ Upload 2 failed or unknown status: $STATUS_RESPONSE"; break
   fi
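The polling loop both shell scripts implement, sketched once in TypeScript for reference; the endpoint, query parameter, and status values (completed / failed / processing) come from the scripts and the upload route above, while the base URL, attempt count, and delay are placeholder assumptions.

// Hypothetical TypeScript equivalent of the status-polling loops in the test scripts.
const BASE_URL = 'http://localhost:3000/api' // assumption: local dev server

async function waitForJob(jobId: string, attempts = 12, delayMs = 20_000): Promise<string> {
  for (let i = 1; i <= attempts; i++) {
    const res = await fetch(`${BASE_URL}/upload/status?job_id=${jobId}`, {
      headers: { Accept: 'application/json' },
    })
    const { status } = (await res.json()) as { status?: string }
    console.log(`Check ${i}/${attempts}: ${status}`)
    if (status === 'completed' || status === 'failed') return status
    await new Promise(resolve => setTimeout(resolve, delayMs)) // mirrors the script's sleep
  }
  return 'timeout'
}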