Commit 50da96f · Parent: 954da5f · Author: LiamKhoaLe

Initial setups refine for ingestion_js #16 (upd pdf-ext)

ingestion_js/app/api/upload/route.ts CHANGED
@@ -3,6 +3,7 @@ import { randomUUID } from 'crypto'
  import { extractPages } from '@/lib/parser'
  import { buildCardsFromPages } from '@/lib/chunker'
  import { embedRemote } from '@/lib/embedder'
+ import { captionImage, normalizeCaption } from '@/lib/captioner'
  import { deleteFileData, storeCards, upsertFileSummary } from '@/lib/mongo'
  import { cheapSummarize } from '@/lib/summarizer'
  import { createJob, updateJob } from '@/lib/jobs'
@@ -55,6 +56,7 @@ export async function POST(req: NextRequest) {
    // Start processing immediately
    try {
      await processAll(job_id, user_id, project_id, preloaded, replaceSet)
+     await updateJob(job_id, { status: 'completed' })
      return NextResponse.json({ job_id, status: 'completed', total_files: preloaded.length })
    } catch (e) {
      console.error(`[UPLOAD_DEBUG] Processing failed for job ${job_id}:`, e)
@@ -80,6 +82,25 @@ async function processAll(job_id: string, user_id: string, project_id: string, f
      const pages = await extractPages(fname, buf)
      console.log(`[UPLOAD_DEBUG] Extracted ${pages.length} pages`)

+     // Process images with captions (best effort - images not extracted in current parser)
+     // This matches Python behavior where captions are appended to page text
+     for (const page of pages) {
+       if (page.images && page.images.length > 0) {
+         const captions: string[] = []
+         for (const img of page.images) {
+           try {
+             const caption = await captionImage(img)
+             if (caption) captions.push(normalizeCaption(caption))
+           } catch (e) {
+             console.warn(`[${job_id}] Caption error in ${fname}: ${e}`)
+           }
+         }
+         if (captions.length > 0) {
+           page.text = (page.text + '\n\n' + captions.map(c => `[Image] ${c}`).join('\n')).trim()
+         }
+       }
+     }
+
      console.log(`[UPLOAD_DEBUG] Building cards from pages`)
      const cards = await buildCardsFromPages(pages, fname, user_id, project_id)
      console.log(`[UPLOAD_DEBUG] Built ${cards.length} cards`)
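
Note: '@/lib/captioner' itself is not part of this diff. A minimal sketch of the interface the route assumes, for orientation only; the endpoint, env var, and response shape below are hypothetical and not from the commit:

  // lib/captioner.ts: hypothetical sketch; only the two exported names come from the import above
  export async function captionImage(img: Buffer): Promise<string | null> {
    const url = process.env.CAPTION_API_URL // assumed configuration, not defined in this commit
    if (!url) return null
    const res = await fetch(url, {
      method: 'POST',
      headers: { 'Content-Type': 'application/octet-stream' },
      body: img,
    })
    if (!res.ok) return null
    const data = (await res.json()) as { caption?: string }
    return data.caption ?? null
  }

  export function normalizeCaption(caption: string): string {
    // Collapse whitespace and trim; the real module may do more
    return caption.replace(/\s+/g, ' ').trim()
  }
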
ingestion_js/lib/chunker.ts CHANGED
@@ -2,36 +2,94 @@ import slugify from 'slugify'
  import type { Page } from './parser'
  import { cheapSummarize, cleanChunkText } from './summarizer'

- const MAX_WORDS = 220
- const OVERLAP_WORDS = 40
+ const MAX_WORDS = 500
+ const MIN_WORDS = 150
+ const OVERLAP_WORDS = 50

  function byHeadings(text: string): string[] {
-   const lines = text.split('\n')
+   // Enhanced patterns matching Python logic
+   const patterns = [
+     /^(#{1,6}\s.*)\s*$/gm, // Markdown headers
+     /^([0-9]+\.\s+[^\n]+)\s*$/gm, // Numbered sections
+     /^([A-Z][A-Za-z0-9\s\-]{2,}\n[-=]{3,})\s*$/gm, // Underlined headers
+     /^(Chapter\s+\d+.*|Section\s+\d+.*)\s*$/gm, // Chapter/Section headers
+     /^(Abstract|Introduction|Conclusion|References|Bibliography)\s*$/gm, // Common academic sections
+   ]
+
    const parts: string[] = []
-   let current: string[] = []
-   const flush = () => { if (current.length) { parts.push(current.join('\n')); current = [] } }
-   const headingRe = /^(#+\s+|\d+\.|[A-Z][A-Za-z\s\-]{0,40}:?|^\s*\[[A-Za-z ]+\]\s*$)/
-   for (const ln of lines) {
-     if (headingRe.test(ln)) flush()
-     current.push(ln)
+   let last = 0
+   const allMatches: Array<{ start: number, end: number, header: string }> = []
+
+   // Find all matches from all patterns
+   for (const pattern of patterns) {
+     let match
+     while ((match = pattern.exec(text)) !== null) {
+       allMatches.push({
+         start: match.index,
+         end: match.index + match[0].length,
+         header: match[1].trim()
+       })
+     }
    }
-   flush()
+
+   // Sort matches by position
+   allMatches.sort((a, b) => a.start - b.start)
+
+   // Split text based on matches
+   for (const { start, end } of allMatches) {
+     if (start > last) {
+       parts.push(text.slice(last, start))
+     }
+     parts.push(text.slice(start, end))
+     last = end
+   }
+
+   if (last < text.length) {
+     parts.push(text.slice(last))
+   }
+
+   if (parts.length === 0) {
+     parts.push(text)
+   }
+
    return parts.filter(p => p.trim().length > 0)
  }

  function createOverlappingChunks(blocks: string[]): string[] {
-   const out: string[] = []
-   let words: string[] = []
-   for (const b of blocks) {
-     words.push(...b.split(/\s+/))
-     while (words.length > MAX_WORDS) {
-       const chunk = words.slice(0, MAX_WORDS).join(' ')
-       out.push(chunk)
-       words = words.slice(MAX_WORDS - OVERLAP_WORDS)
+   const chunks: string[] = []
+
+   for (let i = 0; i < blocks.length; i++) {
+     const block = blocks[i]
+     const words = block.split(/\s+/).filter(w => w.length > 0)
+
+     if (words.length === 0) continue
+
+     // If block is small enough, use as-is
+     if (words.length <= MAX_WORDS) {
+       chunks.push(block)
+       continue
+     }
+
+     // Split large blocks with overlap
+     let start = 0
+     while (start < words.length) {
+       const end = Math.min(start + MAX_WORDS, words.length)
+       let chunkWords = words.slice(start, end)
+
+       // Add overlap from previous chunk if available
+       if (start > 0 && chunks.length > 0) {
+         const prevWords = chunks[chunks.length - 1].split(/\s+/).filter(w => w.length > 0)
+         const overlapStart = Math.max(0, prevWords.length - OVERLAP_WORDS)
+         const overlapWords = prevWords.slice(overlapStart)
+         chunkWords = [...overlapWords, ...chunkWords]
+       }
+
+       chunks.push(chunkWords.join(' '))
+       if (end >= words.length) break // guard: otherwise start never reaches words.length and the loop never ends
+       start = end - OVERLAP_WORDS // Overlap with next chunk
      }
    }
-   if (words.length) out.push(words.join(' '))
-   return out
+
+   return chunks
  }

  export async function buildCardsFromPages(pages: Page[], filename: string, user_id: string, project_id: string) {
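
For orientation, a small usage sketch of the two helpers above (the sample text and logging are illustrative, not from the commit):

  // Hypothetical check of the new chunking helpers on a tiny markdown-like sample
  const sample = [
    '# Overview',
    'Gradient descent updates parameters iteratively.',
    '## Details',
    'Each step moves against the gradient of the loss.',
  ].join('\n')

  const blocks = byHeadings(sample)              // split on the header patterns above
  const chunks = createOverlappingChunks(blocks) // re-pack blocks into chunks of at most MAX_WORDS words
  console.log(blocks.length, chunks.length)      // every block here fits within MAX_WORDS, so the counts match
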
ingestion_js/lib/parser.ts CHANGED
@@ -1,3 +1,4 @@
+ import * as pdfjs from 'pdfjs-dist/legacy/build/pdf'
  import mammoth from 'mammoth'

  export type Page = { page_num: number; text: string; images: Buffer[] }
@@ -5,24 +6,57 @@ export type Page = { page_num: number; text: string; images: Buffer[] }
  export async function parsePdfBytes(buf: Buffer): Promise<Page[]> {
    console.log(`[PARSER_DEBUG] Parsing PDF with ${buf.length} bytes`)

-   // For now, return a simple text extraction as a fallback
-   // This avoids the pdf-text-extract Buffer issue
    try {
-     // Simple fallback: return the PDF as a single page with placeholder text
-     // In production, you'd want to use a proper PDF parser
-     const text = `[PDF Content - ${buf.length} bytes]`
-     console.log(`[PARSER_DEBUG] Using fallback PDF parsing`)
-     return [{ page_num: 1, text, images: [] }]
+     // Convert Buffer to Uint8Array for pdfjs-dist
+     const uint8Array = new Uint8Array(buf)
+     const loadingTask = pdfjs.getDocument({ data: uint8Array })
+     const pdf = await loadingTask.promise
+     const pages: Page[] = []
+
+     console.log(`[PARSER_DEBUG] PDF has ${pdf.numPages} pages`)
+
+     for (let i = 1; i <= pdf.numPages; i++) {
+       console.log(`[PARSER_DEBUG] Processing page ${i}`)
+
+       const page = await pdf.getPage(i)
+       const textContent = await page.getTextContent()
+
+       // Extract text like Python PyMuPDF does
+       const text = textContent.items
+         .map((item: any) => item.str || '')
+         .join(' ')
+         .trim()
+
+       console.log(`[PARSER_DEBUG] Page ${i} extracted ${text.length} characters`)
+
+       // For now, we don't extract images from PDF in serverless (complex)
+       // This matches the current limitation but we could add image extraction later
+       pages.push({
+         page_num: i,
+         text: text || `[Page ${i} - No text content extracted]`,
+         images: [] // Images not extracted in current implementation
+       })
+     }
+
+     console.log(`[PARSER_DEBUG] Parsed PDF with ${pages.length} pages`)
+     return pages
    } catch (error) {
      console.error('[PARSER_DEBUG] PDF parsing error:', error)
-     throw error
+     // Fallback to simple text representation
+     return [{ page_num: 1, text: `[PDF Content - ${buf.length} bytes - Parse error: ${error}]`, images: [] }]
    }
  }

  export async function parseDocxBytes(buf: Buffer): Promise<Page[]> {
-   const { value } = await mammoth.extractRawText({ buffer: buf })
-   const text = value || ''
-   return [{ page_num: 1, text, images: [] }]
+   try {
+     const { value } = await mammoth.extractRawText({ buffer: buf })
+     const text = value || ''
+     console.log(`[PARSER_DEBUG] DOCX extracted ${text.length} characters`)
+     return [{ page_num: 1, text, images: [] }]
+   } catch (error) {
+     console.error('[PARSER_DEBUG] DOCX parsing error:', error)
+     return [{ page_num: 1, text: `[DOCX Parse Error: ${error}]`, images: [] }]
+   }
  }

  export function inferMime(filename: string): string {
@@ -35,6 +69,8 @@ export function inferMime(filename: string): string {

  export async function extractPages(filename: string, file: Buffer): Promise<Page[]> {
    const mime = inferMime(filename)
+   console.log(`[PARSER_DEBUG] Processing ${filename} as ${mime}`)
+
    if (mime === 'application/pdf') return parsePdfBytes(file)
    if (mime === 'application/vnd.openxmlformats-officedocument.wordprocessingml.document') return parseDocxBytes(file)
    throw new Error(`Unsupported file type: ${filename}`)
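
As a sanity check for the pdfjs-dist path the new parser takes, a minimal standalone sketch (the file path and the 200-character preview are illustrative, not from the commit):

  // Hypothetical: extract the text of page 1 from a local PDF via the same legacy pdfjs-dist build
  import { readFileSync } from 'fs'
  import * as pdfjs from 'pdfjs-dist/legacy/build/pdf'

  async function firstPageText(path: string): Promise<string> {
    const data = new Uint8Array(readFileSync(path)) // pdfjs-dist expects a Uint8Array, not a Buffer
    const pdf = await pdfjs.getDocument({ data }).promise
    const page = await pdf.getPage(1)
    const content = await page.getTextContent()
    return content.items.map((item: any) => item.str || '').join(' ').trim()
  }

  firstPageText('./sample.pdf').then(t => console.log(t.slice(0, 200)))
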
ingestion_js/package-lock.json CHANGED
@@ -11,7 +11,6 @@
        "mammoth": "^1.6.0",
        "mongodb": "^6.8.0",
        "next": "^14.2.5",
-       "pdf-text-extract": "^1.0.0",
        "react": "^18.3.1",
        "react-dom": "^18.3.1",
        "slugify": "^1.6.6"
@@ -610,18 +609,6 @@
        "node": ">=0.10.0"
      }
    },
-   "node_modules/pdf-text-extract": {
-     "version": "1.5.0",
-     "resolved": "https://registry.npmjs.org/pdf-text-extract/-/pdf-text-extract-1.5.0.tgz",
-     "integrity": "sha512-5zpNQljVf4h0b9sY8KGKDHxYoTYqDjahvkxmpHwpxBe3p92AWnscpWausl5/OaedOgnS8Pw53DOQx7bqtYcpow==",
-     "license": "BSD",
-     "dependencies": {
-       "yargs": "^1.2.5"
-     },
-     "bin": {
-       "pdf-text-extract": "bin/pdf-text-extract.js"
-     }
-   },
    "node_modules/picocolors": {
      "version": "1.1.1",
      "resolved": "https://registry.npmjs.org/picocolors/-/picocolors-1.1.1.tgz",
@@ -886,12 +873,6 @@
      "engines": {
        "node": ">=4.0"
      }
-   },
-   "node_modules/yargs": {
-     "version": "1.3.3",
-     "resolved": "https://registry.npmjs.org/yargs/-/yargs-1.3.3.tgz",
-     "integrity": "sha512-7OGt4xXoWJQh5ulgZ78rKaqY7dNWbjfK+UKxGcIlaM2j7C4fqGchyv8CPvEWdRPrHp6Ula/YU8yGRpYGOHrI+g==",
-     "license": "MIT/X11"
    }
  }
}
ingestion_js/package.json CHANGED
@@ -12,11 +12,12 @@
      "start": "next start"
    },
    "dependencies": {
+     "mammoth": "^1.6.0",
      "mongodb": "^6.8.0",
      "next": "^14.2.5",
+     "pdfjs-dist": "^3.11.174",
      "react": "^18.3.1",
      "react-dom": "^18.3.1",
-     "mammoth": "^1.6.0",
      "slugify": "^1.6.6"
    },
    "devDependencies": {
ingestion_js/test_simple.sh CHANGED
@@ -10,9 +10,9 @@ BACKEND_URL="https://study-buddy-ingestion1.vercel.app/api"
  USER_ID="44e65346-8eaa-4f95-b17a-f6219953e7a8"
  PROJECT_ID="496e2fad-ec7e-4562-b06a-ea2491f2460"

- # Test files - use smaller files
+ # Test file
  SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)"
- FILE1="$SCRIPT_DIR/../exefiles/Tut5.pdf" # Smaller file
+ FILE1="$SCRIPT_DIR/../exefiles/Lecture5_ML.pdf"

  echo "📋 Configuration:"
  echo " Backend URL: $BACKEND_URL"
@@ -26,8 +26,7 @@ if [ ! -f "$FILE1" ]; then echo "❌ Missing file: $FILE1"; exit 26; fi

  echo "🏥 Step 1: Health Check"
  echo "------------------------"
- curl -sS "$BACKEND_URL/health"
- echo ""
+ curl -sS -X GET "$BACKEND_URL/health" -H "Accept: application/json" | jq '.' || echo "Health check failed"
  echo ""

  echo "📁 Step 2: Upload File"
@@ -35,52 +34,41 @@ echo "----------------------"
  UPLOAD_RESPONSE=$(curl -sS -X POST "$BACKEND_URL/upload" \
    -F "user_id=$USER_ID" \
    -F "project_id=$PROJECT_ID" \
-   -F "files=@$FILE1")
+   -F "files=@$FILE1" \
+   -w "\nHTTP_STATUS:%{http_code}")

- echo "Upload response:"
- echo "$UPLOAD_RESPONSE"
- echo ""
+ HTTP_STATUS=$(echo "$UPLOAD_RESPONSE" | grep "HTTP_STATUS:" | cut -d: -f2)
+ RESPONSE_BODY=$(echo "$UPLOAD_RESPONSE" | grep -v "HTTP_STATUS:")

- # Extract job_id using grep and sed
- JOB_ID=$(echo "$UPLOAD_RESPONSE" | grep -o '"job_id":"[^"]*"' | sed 's/"job_id":"\([^"]*\)"/\1/')
+ echo "HTTP Status: $HTTP_STATUS"
+ echo "Response:"
+ echo "$RESPONSE_BODY" | jq '.' || echo "$RESPONSE_BODY"

- if [ -z "$JOB_ID" ]; then
-   echo "❌ Failed to extract job_id"
+ if [ "$HTTP_STATUS" != "200" ]; then
+   echo "❌ Upload failed with status $HTTP_STATUS"
    exit 1
  fi

- echo " Upload initiated! Job ID: $JOB_ID"
- echo ""
-
- echo "📊 Step 3: Monitor Progress"
- echo "---------------------------"
- for i in {1..10}; do
-   echo "Check $i/10..."
-   STATUS_RESPONSE=$(curl -sS "$BACKEND_URL/upload/status?job_id=$JOB_ID")
-   echo "Status: $STATUS_RESPONSE"
-
-   if echo "$STATUS_RESPONSE" | grep -q '"status":"completed"'; then
-     echo "✅ Upload completed!"
-     break
-   elif echo "$STATUS_RESPONSE" | grep -q '"status":"failed"'; then
-     echo "❌ Upload failed!"
-     break
-   fi
-
-   sleep 100
- done
+ JOB_ID=$(echo "$RESPONSE_BODY" | jq -r '.job_id // empty')
+ if [ -z "$JOB_ID" ]; then
+   echo "❌ No job_id in response"
+   exit 1
+ fi

  echo ""
- echo "🔍 Step 4: Debug Job"
- echo "--------------------"
- curl -sS "$BACKEND_URL/debug?job_id=$JOB_ID"
+ echo " Upload initiated successfully!"
+ echo " Job ID: $JOB_ID"
  echo ""
+
+ echo "📊 Step 3: Check Status"
+ echo "-----------------------"
+ curl -sS -X GET "$BACKEND_URL/upload/status?job_id=$JOB_ID" -H "Accept: application/json" | jq '.' || echo "Status check failed"
  echo ""

- echo "📋 Step 5: List Files"
+ echo "📋 Step 4: List Files"
  echo "---------------------"
- curl -sS "$BACKEND_URL/files?user_id=$USER_ID&project_id=$PROJECT_ID"
- echo ""
+ curl -sS -X GET "$BACKEND_URL/files?user_id=$USER_ID&project_id=$PROJECT_ID" -H "Accept: application/json" | jq '.' || echo "List files failed"
  echo ""

- echo "🎉 Test completed!"
+ echo "🎉 Simple test completed!"
+ echo "========================"
ingestion_js/test_upload_js.sh CHANGED
@@ -124,7 +124,9 @@ for i in {1..12}; do
    if echo "$STATUS_RESPONSE" | grep -q '"status":"completed"'; then
      echo "✅ Upload completed successfully!"; break
    elif echo "$STATUS_RESPONSE" | grep -q '"status":"processing"'; then
-     echo "⏳ Still processing... waiting 120 seconds"; sleep 120
+     echo "⏳ Still processing... waiting 20 seconds"; sleep 20
+   elif echo "$STATUS_RESPONSE" | grep -q '"status":"failed"'; then
+     echo "❌ Upload failed: $STATUS_RESPONSE"; break
    else
      echo "❌ Upload failed or unknown status: $STATUS_RESPONSE"; break
    fi
@@ -186,7 +188,7 @@ echo "-------------------------------------"
  for i in {1..12}; do
    echo "Checking progress (attempt $i/12)..."
    STATUS_RESPONSE=$(curl -L --http1.1 --fail-with-body -sS \
-     --connect-timeout 60 --retry 3 --retry-delay 4 --retry-connrefused \
+     --connect-timeout 1800 --retry 3 --retry-delay 4 --retry-connrefused \
      -H "Accept: application/json" \
      "$BACKEND_URL/upload/status?job_id=$JOB_ID2" 2>/dev/null || echo '{"status":"error"}')

@@ -195,7 +197,7 @@ for i in {1..12}; do
    if echo "$STATUS_RESPONSE" | grep -q '"status":"completed"'; then
      echo "✅ Upload 2 completed successfully!"; break
    elif echo "$STATUS_RESPONSE" | grep -q '"status":"processing"'; then
-     echo "⏳ Still processing... waiting 20 seconds"; sleep 20
+     echo "⏳ Still processing... waiting 120 seconds"; sleep 120
    else
      echo "❌ Upload 2 failed or unknown status: $STATUS_RESPONSE"; break
    fi
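
The polling loops in both test scripts follow the same pattern; a hedged TypeScript sketch of that status-check flow (the endpoint paths come from the scripts, everything else is illustrative):

  // Hypothetical client: poll /upload/status until the job completes or fails
  async function waitForJob(baseUrl: string, jobId: string, attempts = 12, delayMs = 20_000): Promise<string> {
    for (let i = 0; i < attempts; i++) {
      const res = await fetch(`${baseUrl}/upload/status?job_id=${jobId}`, { headers: { Accept: 'application/json' } })
      const { status } = (await res.json()) as { status?: string }
      if (status === 'completed' || status === 'failed') return status
      await new Promise(r => setTimeout(r, delayMs)) // still processing: wait, then retry
    }
    return 'timeout'
  }
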