Commit 50da96f · Parent: 954da5f
Refine initial setup for ingestion_js #16 (upd pdf-ext)
Files changed:
- ingestion_js/app/api/upload/route.ts   +21 -0
- ingestion_js/lib/chunker.ts            +78 -20
- ingestion_js/lib/parser.ts             +47 -11
- ingestion_js/package-lock.json         +0 -19
- ingestion_js/package.json              +2 -1
- ingestion_js/test_simple.sh            +27 -39
- ingestion_js/test_upload_js.sh         +5 -3
ingestion_js/app/api/upload/route.ts
CHANGED
@@ -3,6 +3,7 @@ import { randomUUID } from 'crypto'
 import { extractPages } from '@/lib/parser'
 import { buildCardsFromPages } from '@/lib/chunker'
 import { embedRemote } from '@/lib/embedder'
+import { captionImage, normalizeCaption } from '@/lib/captioner'
 import { deleteFileData, storeCards, upsertFileSummary } from '@/lib/mongo'
 import { cheapSummarize } from '@/lib/summarizer'
 import { createJob, updateJob } from '@/lib/jobs'

@@ -55,6 +56,7 @@ export async function POST(req: NextRequest) {
   // Start processing immediately
   try {
     await processAll(job_id, user_id, project_id, preloaded, replaceSet)
+    await updateJob(job_id, { status: 'completed' })
     return NextResponse.json({ job_id, status: 'completed', total_files: preloaded.length })
   } catch (e) {
     console.error(`[UPLOAD_DEBUG] Processing failed for job ${job_id}:`, e)

@@ -80,6 +82,25 @@ async function processAll(job_id: string, user_id: string, project_id: string, f
     const pages = await extractPages(fname, buf)
     console.log(`[UPLOAD_DEBUG] Extracted ${pages.length} pages`)

+    // Process images with captions (best effort - images not extracted in current parser)
+    // This matches Python behavior where captions are appended to page text
+    for (const page of pages) {
+      if (page.images && page.images.length > 0) {
+        const captions: string[] = []
+        for (const img of page.images) {
+          try {
+            const caption = await captionImage(img)
+            if (caption) captions.push(normalizeCaption(caption))
+          } catch (e) {
+            console.warn(`[${job_id}] Caption error in ${fname}: ${e}`)
+          }
+        }
+        if (captions.length > 0) {
+          page.text = (page.text + '\n\n' + captions.map(c => `[Image] ${c}`).join('\n')).trim()
+        }
+      }
+    }
+
     console.log(`[UPLOAD_DEBUG] Building cards from pages`)
     const cards = await buildCardsFromPages(pages, fname, user_id, project_id)
     console.log(`[UPLOAD_DEBUG] Built ${cards.length} cards`)
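For orientation, here is a minimal client sketch of how this route is exercised, mirroring the multipart fields the test scripts below send (user_id, project_id, files) and the JSON the handler returns ({ job_id, status, total_files }). The base URL, file path and MIME type are placeholder assumptions; this is not part of the commit.

// Hypothetical client for POST /api/upload (Node 18+, global fetch/FormData/Blob).
import { readFile } from 'node:fs/promises'

const BASE_URL = 'http://localhost:3000/api' // assumption: local Next.js dev server

async function uploadOne(path: string, userId: string, projectId: string) {
  const buf = await readFile(path)
  const form = new FormData()
  form.append('user_id', userId)
  form.append('project_id', projectId)
  // One multipart part per file, as in test_simple.sh (-F "files=@...").
  form.append('files', new Blob([buf], { type: 'application/pdf' }), 'Lecture5_ML.pdf')

  const res = await fetch(`${BASE_URL}/upload`, { method: 'POST', body: form })
  if (!res.ok) throw new Error(`Upload failed: ${res.status}`)
  return res.json() as Promise<{ job_id: string; status: string; total_files: number }>
}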
ingestion_js/lib/chunker.ts
CHANGED
@@ -2,36 +2,94 @@ import slugify from 'slugify'
 import type { Page } from './parser'
 import { cheapSummarize, cleanChunkText } from './summarizer'

+const MAX_WORDS = 500
+const MIN_WORDS = 150
+const OVERLAP_WORDS = 50

 function byHeadings(text: string): string[] {
+  // Enhanced patterns matching Python logic
+  const patterns = [
+    /^(#{1,6}\s.*)\s*$/gm, // Markdown headers
+    /^([0-9]+\.\s+[^\n]+)\s*$/gm, // Numbered sections
+    /^([A-Z][A-Za-z0-9\s\-]{2,}\n[-=]{3,})\s*$/gm, // Underlined headers
+    /^(Chapter\s+\d+.*|Section\s+\d+.*)\s*$/gm, // Chapter/Section headers
+    /^(Abstract|Introduction|Conclusion|References|Bibliography)\s*$/gm, // Common academic sections
+  ]
+
   const parts: string[] = []
+  let last = 0
+  const allMatches: Array<{start: number, end: number, header: string}> = []
+
+  // Find all matches from all patterns
+  for (const pattern of patterns) {
+    let match
+    while ((match = pattern.exec(text)) !== null) {
+      allMatches.push({
+        start: match.index,
+        end: match.index + match[0].length,
+        header: match[1].trim()
+      })
+    }
+  }
+
+  // Sort matches by position
+  allMatches.sort((a, b) => a.start - b.start)
+
+  // Split text based on matches
+  for (const { start, end, header } of allMatches) {
+    if (start > last) {
+      parts.push(text.slice(last, start))
+    }
+    parts.push(text.slice(start, end))
+    last = end
   }
+
+  if (last < text.length) {
+    parts.push(text.slice(last))
+  }
+
+  if (parts.length === 0) {
+    parts.push(text)
+  }
+
   return parts.filter(p => p.trim().length > 0)
 }

 function createOverlappingChunks(blocks: string[]): string[] {
+  const chunks: string[] = []
+
+  for (let i = 0; i < blocks.length; i++) {
+    const block = blocks[i]
+    const words = block.split(/\s+/).filter(w => w.length > 0)
+
+    if (words.length === 0) continue
+
+    // If block is small enough, use as-is
+    if (words.length <= MAX_WORDS) {
+      chunks.push(block)
+      continue
+    }
+
+    // Split large blocks with overlap
+    let start = 0
+    while (start < words.length) {
+      const end = Math.min(start + MAX_WORDS, words.length)
+      let chunkWords = words.slice(start, end)
+
+      // Add overlap from previous chunk if available
+      if (start > 0 && chunks.length > 0) {
+        const prevWords = chunks[chunks.length - 1].split(/\s+/).filter(w => w.length > 0)
+        const overlapStart = Math.max(0, prevWords.length - OVERLAP_WORDS)
+        const overlapWords = prevWords.slice(overlapStart)
+        chunkWords = [...overlapWords, ...chunkWords]
+      }
+
+      chunks.push(chunkWords.join(' '))
+      start = end - OVERLAP_WORDS // Overlap with next chunk
     }
   }
+
+  return chunks
 }

 export async function buildCardsFromPages(pages: Page[], filename: string, user_id: string, project_id: string) {
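One thing worth noting about the new chunker, plus a small standalone sketch (the module only exports buildCardsFromPages; byHeadings and createOverlappingChunks are internal): in the committed while loop, start = end - OVERLAP_WORDS can never reach words.length once end hits the end of the block, so for blocks longer than MAX_WORDS the loop does not appear to terminate as written. The sketch below shows the intended sliding-window arithmetic with an explicit stop at the end of the block; the constants come from the diff, everything else is illustrative and not part of the commit.

// Illustrative sliding-window chunking over a word array; constants as in chunker.ts.
const MAX_WORDS = 500
const OVERLAP_WORDS = 50

function windowWords(words: string[]): string[][] {
  const windows: string[][] = []
  let start = 0
  while (start < words.length) {
    const end = Math.min(start + MAX_WORDS, words.length)
    windows.push(words.slice(start, end))
    if (end === words.length) break   // stop once the block is exhausted
    start = end - OVERLAP_WORDS       // next window re-reads the last 50 words
  }
  return windows
}

// A 1200-word block yields windows over words [0,500), [450,950), [900,1200).
const sample = Array.from({ length: 1200 }, (_, i) => `w${i}`)
console.log(windowWords(sample).map(w => w.length)) // [500, 500, 300]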
ingestion_js/lib/parser.ts
CHANGED
@@ -1,3 +1,4 @@
+import * as pdfjs from 'pdfjs-dist/legacy/build/pdf'
 import mammoth from 'mammoth'

 export type Page = { page_num: number; text: string; images: Buffer[] }

@@ -5,24 +6,57 @@ export type Page = { page_num: number; text: string; images: Buffer[] }
 export async function parsePdfBytes(buf: Buffer): Promise<Page[]> {
   console.log(`[PARSER_DEBUG] Parsing PDF with ${buf.length} bytes`)

-  // For now, return a simple text extraction as a fallback
-  // This avoids the pdf-text-extract Buffer issue
   try {
+    // Convert Buffer to Uint8Array for pdfjs-dist
+    const uint8Array = new Uint8Array(buf)
+    const loadingTask = pdfjs.getDocument({ data: uint8Array })
+    const pdf = await loadingTask.promise
+    const pages: Page[] = []
+
+    console.log(`[PARSER_DEBUG] PDF has ${pdf.numPages} pages`)
+
+    for (let i = 1; i <= pdf.numPages; i++) {
+      console.log(`[PARSER_DEBUG] Processing page ${i}`)
+
+      const page = await pdf.getPage(i)
+      const textContent = await page.getTextContent()
+
+      // Extract text like Python PyMuPDF does
+      const text = textContent.items
+        .map((item: any) => item.str || '')
+        .join(' ')
+        .trim()
+
+      console.log(`[PARSER_DEBUG] Page ${i} extracted ${text.length} characters`)
+
+      // For now, we don't extract images from PDF in serverless (complex)
+      // This matches the current limitation but we could add image extraction later
+      pages.push({
+        page_num: i,
+        text: text || `[Page ${i} - No text content extracted]`,
+        images: [] // Images not extracted in current implementation
+      })
+    }
+
+    console.log(`[PARSER_DEBUG] Parsed PDF with ${pages.length} pages`)
+    return pages
   } catch (error) {
     console.error('[PARSER_DEBUG] PDF parsing error:', error)
+    // Fallback to simple text representation
+    return [{ page_num: 1, text: `[PDF Content - ${buf.length} bytes - Parse error: ${error}]`, images: [] }]
   }
 }

 export async function parseDocxBytes(buf: Buffer): Promise<Page[]> {
+  try {
+    const { value } = await mammoth.extractRawText({ buffer: buf })
+    const text = value || ''
+    console.log(`[PARSER_DEBUG] DOCX extracted ${text.length} characters`)
+    return [{ page_num: 1, text, images: [] }]
+  } catch (error) {
+    console.error('[PARSER_DEBUG] DOCX parsing error:', error)
+    return [{ page_num: 1, text: `[DOCX Parse Error: ${error}]`, images: [] }]
+  }
 }

 export function inferMime(filename: string): string {

@@ -35,6 +69,8 @@ export function inferMime(filename: string): string {

 export async function extractPages(filename: string, file: Buffer): Promise<Page[]> {
   const mime = inferMime(filename)
+  console.log(`[PARSER_DEBUG] Processing ${filename} as ${mime}`)
+
   if (mime === 'application/pdf') return parsePdfBytes(file)
   if (mime === 'application/vnd.openxmlformats-officedocument.wordprocessingml.document') return parseDocxBytes(file)
   throw new Error(`Unsupported file type: ${filename}`)
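A small smoke-test sketch for the new parser, run from inside ingestion_js so the relative import resolves (inside the Next.js app the same module is imported as '@/lib/parser'); the sample file name comes from test_simple.sh, and everything else is assumed for illustration rather than taken from the commit.

// Hypothetical local check of the pdfjs-dist based extraction.
import { readFile } from 'node:fs/promises'
import { extractPages } from './lib/parser'

async function main() {
  const buf = await readFile('../exefiles/Lecture5_ML.pdf') // sample file used by test_simple.sh
  const pages = await extractPages('Lecture5_ML.pdf', buf)
  for (const page of pages) {
    const words = page.text.split(/\s+/).filter(w => w.length > 0)
    console.log(`page ${page.page_num}: ${words.length} words, ${page.images.length} images`)
  }
}

main().catch(err => {
  console.error(err)
  process.exit(1)
})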
ingestion_js/package-lock.json
CHANGED
@@ -11,7 +11,6 @@
         "mammoth": "^1.6.0",
         "mongodb": "^6.8.0",
         "next": "^14.2.5",
-        "pdf-text-extract": "^1.0.0",
         "react": "^18.3.1",
         "react-dom": "^18.3.1",
         "slugify": "^1.6.6"

@@ -610,18 +609,6 @@
         "node": ">=0.10.0"
       }
     },
-    "node_modules/pdf-text-extract": {
-      "version": "1.5.0",
-      "resolved": "https://registry.npmjs.org/pdf-text-extract/-/pdf-text-extract-1.5.0.tgz",
-      "integrity": "sha512-5zpNQljVf4h0b9sY8KGKDHxYoTYqDjahvkxmpHwpxBe3p92AWnscpWausl5/OaedOgnS8Pw53DOQx7bqtYcpow==",
-      "license": "BSD",
-      "dependencies": {
-        "yargs": "^1.2.5"
-      },
-      "bin": {
-        "pdf-text-extract": "bin/pdf-text-extract.js"
-      }
-    },
     "node_modules/picocolors": {
       "version": "1.1.1",
       "resolved": "https://registry.npmjs.org/picocolors/-/picocolors-1.1.1.tgz",

@@ -886,12 +873,6 @@
       "engines": {
         "node": ">=4.0"
       }
-    },
-    "node_modules/yargs": {
-      "version": "1.3.3",
-      "resolved": "https://registry.npmjs.org/yargs/-/yargs-1.3.3.tgz",
-      "integrity": "sha512-7OGt4xXoWJQh5ulgZ78rKaqY7dNWbjfK+UKxGcIlaM2j7C4fqGchyv8CPvEWdRPrHp6Ula/YU8yGRpYGOHrI+g==",
-      "license": "MIT/X11"
     }
   }
 }
ingestion_js/package.json
CHANGED
@@ -12,11 +12,12 @@
     "start": "next start"
   },
   "dependencies": {
+    "mammoth": "^1.6.0",
     "mongodb": "^6.8.0",
     "next": "^14.2.5",
+    "pdfjs-dist": "^3.11.174",
     "react": "^18.3.1",
     "react-dom": "^18.3.1",
-    "mammoth": "^1.6.0",
     "slugify": "^1.6.6"
   },
   "devDependencies": {
ingestion_js/test_simple.sh
CHANGED
@@ -10,9 +10,9 @@ BACKEND_URL="https://study-buddy-ingestion1.vercel.app/api"
 USER_ID="44e65346-8eaa-4f95-b17a-f6219953e7a8"
 PROJECT_ID="496e2fad-ec7e-4562-b06a-ea2491f2460"

-# Test
+# Test file
 SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)"
-FILE1="$SCRIPT_DIR/../exefiles/
+FILE1="$SCRIPT_DIR/../exefiles/Lecture5_ML.pdf"

 echo "📋 Configuration:"
 echo " Backend URL: $BACKEND_URL"

@@ -26,8 +26,7 @@ if [ ! -f "$FILE1" ]; then echo "❌ Missing file: $FILE1"; exit 26; fi

 echo "🏥 Step 1: Health Check"
 echo "------------------------"
-curl -sS "$BACKEND_URL/health"
-echo ""
+curl -sS -X GET "$BACKEND_URL/health" -H "Accept: application/json" | jq '.' || echo "Health check failed"
 echo ""

 echo "📁 Step 2: Upload File"

@@ -35,52 +34,41 @@ echo "----------------------"
 UPLOAD_RESPONSE=$(curl -sS -X POST "$BACKEND_URL/upload" \
   -F "user_id=$USER_ID" \
   -F "project_id=$PROJECT_ID" \
-  -F "files=@$FILE1"
+  -F "files=@$FILE1" \
+  -w "\nHTTP_STATUS:%{http_code}")

-echo "$UPLOAD_RESPONSE"
-echo ""
+HTTP_STATUS=$(echo "$UPLOAD_RESPONSE" | grep "HTTP_STATUS:" | cut -d: -f2)
+RESPONSE_BODY=$(echo "$UPLOAD_RESPONSE" | grep -v "HTTP_STATUS:")

+echo "HTTP Status: $HTTP_STATUS"
+echo "Response:"
+echo "$RESPONSE_BODY" | jq '.' || echo "$RESPONSE_BODY"

-if [
-  echo "❌
+if [ "$HTTP_STATUS" != "200" ]; then
+  echo "❌ Upload failed with status $HTTP_STATUS"
   exit 1
 fi

-for i in {1..10}; do
-  echo "Check $i/10..."
-  STATUS_RESPONSE=$(curl -sS "$BACKEND_URL/upload/status?job_id=$JOB_ID")
-  echo "Status: $STATUS_RESPONSE"
-  if echo "$STATUS_RESPONSE" | grep -q '"status":"completed"'; then
-    echo "✅ Upload completed!"
-    break
-  elif echo "$STATUS_RESPONSE" | grep -q '"status":"failed"'; then
-    echo "❌ Upload failed!"
-    break
-  fi
-  sleep 100
-done
+JOB_ID=$(echo "$RESPONSE_BODY" | jq -r '.job_id // empty')
+if [ -z "$JOB_ID" ]; then
+  echo "❌ No job_id in response"
+  exit 1
+fi

 echo ""
-curl -sS "$BACKEND_URL/debug?job_id=$JOB_ID"
+echo "✅ Upload initiated successfully!"
+echo " Job ID: $JOB_ID"
 echo ""
+
+echo "📊 Step 3: Check Status"
+echo "-----------------------"
+curl -sS -X GET "$BACKEND_URL/upload/status?job_id=$JOB_ID" -H "Accept: application/json" | jq '.' || echo "Status check failed"
 echo ""

-echo "📋 Step
+echo "📋 Step 4: List Files"
 echo "---------------------"
-curl -sS "$BACKEND_URL/files?user_id=$USER_ID&project_id=$PROJECT_ID"
-echo ""
+curl -sS -X GET "$BACKEND_URL/files?user_id=$USER_ID&project_id=$PROJECT_ID" -H "Accept: application/json" | jq '.' || echo "List files failed"
 echo ""

-echo "🎉
+echo "🎉 Simple test completed!"
+echo "========================"
ingestion_js/test_upload_js.sh
CHANGED
@@ -124,7 +124,9 @@ for i in {1..12}; do
   if echo "$STATUS_RESPONSE" | grep -q '"status":"completed"'; then
     echo "✅ Upload completed successfully!"; break
   elif echo "$STATUS_RESPONSE" | grep -q '"status":"processing"'; then
-    echo "⏳ Still processing... waiting
+    echo "⏳ Still processing... waiting 20 seconds"; sleep 20
+  elif echo "$STATUS_RESPONSE" | grep -q '"status":"failed"'; then
+    echo "❌ Upload failed: $STATUS_RESPONSE"; break
   else
     echo "❌ Upload failed or unknown status: $STATUS_RESPONSE"; break
   fi

@@ -186,7 +188,7 @@ echo "-------------------------------------"
 for i in {1..12}; do
   echo "Checking progress (attempt $i/12)..."
   STATUS_RESPONSE=$(curl -L --http1.1 --fail-with-body -sS \
-    --connect-timeout
+    --connect-timeout 1800 --retry 3 --retry-delay 4 --retry-connrefused \
    -H "Accept: application/json" \
    "$BACKEND_URL/upload/status?job_id=$JOB_ID2" 2>/dev/null || echo '{"status":"error"}')

@@ -195,7 +197,7 @@ for i in {1..12}; do
   if echo "$STATUS_RESPONSE" | grep -q '"status":"completed"'; then
     echo "✅ Upload 2 completed successfully!"; break
   elif echo "$STATUS_RESPONSE" | grep -q '"status":"processing"'; then
-    echo "⏳ Still processing... waiting
+    echo "⏳ Still processing... waiting 120 seconds"; sleep 120
   else
     echo "❌ Upload 2 failed or unknown status: $STATUS_RESPONSE"; break
   fi
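The polling loop both shell scripts implement, sketched once in TypeScript for reference; the endpoint, query parameter, and status values (completed / failed / processing) come from the scripts and the upload route above, while the base URL, attempt count, and delay are placeholder assumptions.

// Hypothetical TypeScript equivalent of the status-polling loops in the test scripts.
const BASE_URL = 'http://localhost:3000/api' // assumption: local dev server

async function waitForJob(jobId: string, attempts = 12, delayMs = 20_000): Promise<string> {
  for (let i = 1; i <= attempts; i++) {
    const res = await fetch(`${BASE_URL}/upload/status?job_id=${jobId}`, {
      headers: { Accept: 'application/json' },
    })
    const { status } = (await res.json()) as { status?: string }
    console.log(`Check ${i}/${attempts}: ${status}`)
    if (status === 'completed' || status === 'failed') return status
    await new Promise(resolve => setTimeout(resolve, delayMs)) // mirrors the script's sleep
  }
  return 'timeout'
}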