LiamKhoaLe committed
Commit 55180c4 · 1 Parent(s): 09431f4

Upd ingest js local loadup #1

.gitignore CHANGED
@@ -14,3 +14,6 @@ node_modules
 
 # Resources
 exefiles
+
+# Local Netlify folder
+.netlify
ingestion_js/.gitignore CHANGED
@@ -1 +1,2 @@
 .vercel
+.env
ingestion_js/app/api/debug/route.ts CHANGED
@@ -15,7 +15,7 @@ export async function GET(req: NextRequest) {
 
   switch (action) {
     case 'status':
-      if (!job_id) {
+      if (!job_id) {
         return NextResponse.json({ error: 'job_id required for status check' }, { status: 400 })
       }
       return await debugJobStatus(job_id)
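
A minimal sketch of exercising this endpoint from a script. The route path and the query parameters (action, job_id) come from the handler above; the base URL, env variable name, and error handling are illustrative:

// Sketch: polling the debug status endpoint (Node 18+, built-in fetch).
// INGEST_BASE_URL is a hypothetical env var; adjust to your setup.
const BASE = process.env.INGEST_BASE_URL || 'http://localhost:3001'

async function checkJobStatus(jobId: string): Promise<unknown> {
  const url = `${BASE}/api/debug?action=status&job_id=${encodeURIComponent(jobId)}`
  const res = await fetch(url)
  if (!res.ok) {
    // A missing job_id yields 400 with { error: 'job_id required for status check' }
    throw new Error(`status check failed: HTTP ${res.status}`)
  }
  return res.json()
}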
ingestion_js/app/api/upload/route.ts CHANGED
@@ -14,6 +14,9 @@ export async function POST(req: NextRequest) {
     return NextResponse.json({ error: 'Server not configured: EMBED_BASE_URL missing' }, { status: 500 })
   }
 
+  // Treat anything not running on Vercel production as local/dev
+  const isLocal = (process.env.VERCEL !== '1') && (process.env.NODE_ENV !== 'production')
+
   const job_id = randomUUID()
   const formData = await req.formData()
 
@@ -33,11 +36,14 @@ export async function POST(req: NextRequest) {
   const MAX_FILES_PER_UPLOAD = parseInt(process.env.MAX_FILES_PER_UPLOAD || '15')
   const MAX_FILE_MB = parseInt(process.env.MAX_FILE_MB || '50')
 
-  if (files.length > MAX_FILES_PER_UPLOAD) {
-    return NextResponse.json(
-      { error: `Too many files. Max ${MAX_FILES_PER_UPLOAD} allowed per upload.` },
-      { status: 400 }
-    )
+  // Skip per-upload count limits when running locally to aid debugging
+  if (!isLocal) {
+    if (files.length > MAX_FILES_PER_UPLOAD) {
+      return NextResponse.json(
+        { error: `Too many files. Max ${MAX_FILES_PER_UPLOAD} allowed per upload.` },
+        { status: 400 }
+      )
+    }
   }
 
   let replace_set = new Set<string>()
@@ -61,11 +67,14 @@ export async function POST(req: NextRequest) {
   const preloaded_files: { fname: string; buf: Buffer }[] = []
   for (const file of files) {
     const raw = Buffer.from(await file.arrayBuffer())
-    if (raw.length > MAX_FILE_MB * 1024 * 1024) {
-      return NextResponse.json(
-        { error: `${file.name} exceeds ${MAX_FILE_MB} MB limit` },
-        { status: 400 }
-      )
+    // Skip per-file size limits when running locally to aid debugging
+    if (!isLocal) {
+      if (raw.length > MAX_FILE_MB * 1024 * 1024) {
+        return NextResponse.json(
+          { error: `${file.name} exceeds ${MAX_FILE_MB} MB limit` },
+          { status: 400 }
+        )
+      }
     }
     const eff_name = rename_map[file.name] || file.name
     preloaded_files.push({ fname: eff_name, buf: raw })
@@ -145,21 +154,32 @@ async function processFilesInBackground(
     const cards = await imports.buildCardsFromPages(pages, fname, user_id, project_id)
     console.log(`[UPLOAD_DEBUG] Built ${cards.length} cards`)
 
-    console.log(`[UPLOAD_DEBUG] Generating embeddings for ${cards.length} cards`)
-    const vectors = await imports.embedRemote(cards.map((c: any) => c.content))
-    if (vectors.length !== cards.length) {
-      throw new Error(`Embedding mismatch: got ${vectors.length} for ${cards.length} cards`)
-    }
-    for (let j = 0; j < cards.length; j++) {
-      cards[j].embedding = vectors[j]
+    // Batch embeddings to avoid high memory usage
+    console.log(`[UPLOAD_DEBUG] Generating embeddings for ${cards.length} cards in batches`)
+    const BATCH = 32
+    for (let start = 0; start < cards.length; start += BATCH) {
+      const end = Math.min(start + BATCH, cards.length)
+      const batch = cards.slice(start, end)
+      const vectors = await imports.embedRemote(batch.map((c: any) => c.content))
+      if (vectors.length !== batch.length) {
+        throw new Error(`Embedding mismatch: got ${vectors.length} for ${batch.length} cards`)
+      }
+      for (let j = 0; j < batch.length; j++) {
+        batch[j].embedding = vectors[j]
+      }
     }
-    console.log(`[UPLOAD_DEBUG] Generated embeddings`)
+    console.log(`[UPLOAD_DEBUG] Generated embeddings (batched)`)
 
     console.log(`[UPLOAD_DEBUG] Storing ${cards.length} cards in MongoDB`)
    await imports.storeCards(cards)
     console.log(`[UPLOAD_DEBUG] Stored cards`)
 
-    const full_text = pages.map((p: any) => p.text).join('\n\n')
+    // Cap text length to reduce memory pressure for summarization
+    const full_text_raw = pages.map((p: any) => p.text).join('\n\n')
+    const MAX_SUMMARY_CHARS = 200_000
+    const full_text = full_text_raw.length > MAX_SUMMARY_CHARS
+      ? full_text_raw.slice(0, MAX_SUMMARY_CHARS)
+      : full_text_raw
    const file_summary = await imports.cheapSummarize(full_text, 6)
     await imports.upsertFileSummary(user_id, project_id, fname, file_summary)
     console.log(`[UPLOAD_DEBUG] Upserted file summary for ${fname}`)
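
The batched loop above bounds peak memory by embedding fixed-size windows instead of every card at once, and it works by mutation: Array.prototype.slice copies the array but not the card objects, so writing to batch[j] updates the same cards the caller holds. A standalone sketch of that pattern, with a hypothetical embed callback standing in for imports.embedRemote and an illustrative Card type:

// Sketch of the batched-embedding pattern introduced in this commit.
// `embed` stands in for imports.embedRemote; Card is illustrative.
type Card = { content: string; embedding?: number[] }

async function embedInBatches(
  cards: Card[],
  embed: (texts: string[]) => Promise<number[][]>,
  batchSize = 32
): Promise<void> {
  for (let start = 0; start < cards.length; start += batchSize) {
    const batch = cards.slice(start, start + batchSize) // slice clamps past the end
    const vectors = await embed(batch.map((c) => c.content))
    if (vectors.length !== batch.length) {
      throw new Error(`Embedding mismatch: got ${vectors.length} for ${batch.length} cards`)
    }
    // slice() copies the array, not the objects, so this mutates
    // the same cards held by the caller.
    batch.forEach((card, j) => { card.embedding = vectors[j] })
  }
}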
ingestion_js/lib/embedder.ts CHANGED
@@ -1,36 +1,49 @@
 export async function embedRemote(texts: string[]): Promise<number[][]> {
-  console.log(`[EMBEDDER_DEBUG] Embedding ${texts.length} texts`)
-
+  const total = texts?.length || 0
+  console.log(`[EMBEDDER_DEBUG] Embedding ${total} texts`)
+
   if (!texts || texts.length === 0) return []
   const base = (process.env.EMBED_BASE_URL || '').replace(/\/$/, '')
   if (!base) {
     console.error('[EMBEDDER_DEBUG] EMBED_BASE_URL is required')
     throw new Error('EMBED_BASE_URL is required')
   }
-
-  console.log(`[EMBEDDER_DEBUG] Calling ${base}/embed`)
-
-  const res = await fetch(`${base}/embed`, {
-    method: 'POST',
-    headers: { 'Content-Type': 'application/json' },
-    body: JSON.stringify({ texts }),
-    // 60s like Python client
-    next: { revalidate: 0 }
-  })
-
-  if (!res.ok) {
-    console.warn(`[EMBEDDER_DEBUG] Embedding failed with status ${res.status}, using zero vectors`)
-    // Fail closed with zeros to avoid crashes (parity with Python fallback)
-    return Array.from({ length: texts.length }, () => Array(384).fill(0))
+
+  // Memory-safe batching to avoid large payloads in Node/Vercel
+  const batchSize = Math.max(1, parseInt(process.env.EMBED_BATCH_SIZE || '32', 10))
+  const results: number[][] = []
+
+  for (let start = 0; start < texts.length; start += batchSize) {
+    const end = Math.min(start + batchSize, texts.length)
+    const batch = texts.slice(start, end)
+    console.log(`[EMBEDDER_DEBUG] Calling ${base}/embed for batch ${start}..${end - 1} (size=${batch.length})`)
+
+    const res = await fetch(`${base}/embed`, {
+      method: 'POST',
+      headers: { 'Content-Type': 'application/json' },
+      body: JSON.stringify({ texts: batch }),
+      next: { revalidate: 0 }
+    })
+
+    if (!res.ok) {
+      console.warn(`[EMBEDDER_DEBUG] Batch embedding failed with status ${res.status}, using zero vectors`)
+      const zeros = Array.from({ length: batch.length }, () => Array(384).fill(0))
+      results.push(...zeros)
+      continue
+    }
+
+    const data = (await res.json()) as any
+    const vectors = Array.isArray(data?.vectors) ? data.vectors : []
+    if (!Array.isArray(vectors) || vectors.length !== batch.length) {
+      console.warn('[EMBEDDER_DEBUG] Invalid vectors format/length, using zero vectors for this batch')
+      const zeros = Array.from({ length: batch.length }, () => Array(384).fill(0))
+      results.push(...zeros)
+      continue
+    }
+
+    results.push(...vectors)
   }
-
-  const data = await res.json() as any
-  const vectors = Array.isArray(data?.vectors) ? data.vectors : []
-  if (!Array.isArray(vectors)) {
-    console.warn('[EMBEDDER_DEBUG] Invalid vectors format, using zero vectors')
-    return Array.from({ length: texts.length }, () => Array(384).fill(0))
-  }
-
-  console.log(`[EMBEDDER_DEBUG] Successfully embedded ${vectors.length} vectors`)
-  return vectors
+
+  console.log(`[EMBEDDER_DEBUG] Successfully embedded ${results.length}/${total} vectors`)
+  return results
 }
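
The reworked embedRemote now degrades per batch rather than per call: a failed or malformed batch contributes zero vectors while the remaining batches still embed, so the caller always gets one vector per input. A usage sketch under stated assumptions — EMBED_BASE_URL points at a service exposing POST /embed that returns { vectors: number[][] }, and the URL and batch size here are illustrative:

// Usage sketch for the batched embedder (Node 18+, ESM top-level await).
import { embedRemote } from './embedder'

// embedRemote reads these at call time:
process.env.EMBED_BASE_URL = 'http://localhost:8000' // POST /embed -> { vectors: number[][] }
process.env.EMBED_BATCH_SIZE = '16'                  // defaults to 32 when unset

const texts = Array.from({ length: 40 }, (_, i) => `chunk ${i}`)
const vectors = await embedRemote(texts)
// One vector per input, in order; a failed batch comes back as
// 384-dimensional zero vectors instead of throwing.
console.log(vectors.length) // 40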
ingestion_js/lib/parser.ts CHANGED
@@ -31,41 +31,40 @@ function extractTextFromPdfBuffer(buffer: Buffer): string {
 export async function parsePdfBytes(buf: Buffer): Promise<Page[]> {
   console.log(`[PARSER_DEBUG] Parsing PDF with ${buf.length} bytes`)
 
-  // First, try robust per-page extraction using pdfjs-dist (Node-safe, no canvas rendering)
-  try {
-    const pdfjs: any = await import('pdfjs-dist/legacy/build/pdf')
-    // In Node, we don't use a worker
-    if (pdfjs.GlobalWorkerOptions) {
-      pdfjs.GlobalWorkerOptions.workerSrc = undefined
-    }
-
-    const loadingTask = pdfjs.getDocument({
-      data: new Uint8Array(buf),
-      disableFontFace: true,
-      useSystemFonts: true,
-      isEvalSupported: false
-    })
-    const pdf = await loadingTask.promise
-    const pageCount: number = pdf.numPages
-    console.log(`[PARSER_DEBUG] pdfjs-dist loaded. Pages: ${pageCount}`)
+  // Optional heavy parser (guarded by env to avoid OOM locally)
+  if (process.env.PARSER_USE_PDFJS === '1') {
+    try {
+      const pdfjs: any = await import('pdfjs-dist/legacy/build/pdf')
+      if (pdfjs.GlobalWorkerOptions) {
+        pdfjs.GlobalWorkerOptions.workerSrc = undefined
+      }
+      const loadingTask = pdfjs.getDocument({
+        data: new Uint8Array(buf),
+        disableFontFace: true,
+        useSystemFonts: true,
+        isEvalSupported: false
+      })
+      const pdf = await loadingTask.promise
+      const pageCount: number = pdf.numPages
+      console.log(`[PARSER_DEBUG] pdfjs-dist loaded. Pages: ${pageCount}`)
 
-    const pages: Page[] = []
-    for (let i = 1; i <= pageCount; i++) {
-      const page = await pdf.getPage(i)
-      const textContent = await page.getTextContent()
-      const text = (textContent.items || [])
-        .map((it: any) => (typeof it.str === 'string' ? it.str : ''))
-        .join(' ')
-        .replace(/\s+/g, ' ')
-        .trim()
+      const pages: Page[] = []
+      for (let i = 1; i <= pageCount; i++) {
+        const page = await pdf.getPage(i)
+        const textContent = await page.getTextContent()
+        const text = (textContent.items || [])
+          .map((it: any) => (typeof it.str === 'string' ? it.str : ''))
+          .join(' ')
+          .replace(/\s+/g, ' ')
+          .trim()
 
-      pages.push({ page_num: i, text, images: [] })
+        pages.push({ page_num: i, text, images: [] })
+      }
+      console.log(`[PARSER_DEBUG] Parsed PDF with ${pages.length} pages via pdfjs-dist`)
+      return pages
+    } catch (err) {
+      console.warn('[PARSER_DEBUG] pdfjs-dist extraction failed, falling back to basic extractor:', err)
     }
-
-    console.log(`[PARSER_DEBUG] Parsed PDF with ${pages.length} pages via pdfjs-dist`)
-    return pages
-  } catch (err) {
-    console.warn('[PARSER_DEBUG] pdfjs-dist extraction failed, falling back to basic extractor:', err)
   }
 
   // Fallback: use lightweight extractor and page-count from pdf-lib
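
With the guard in place, the heavier pdfjs-dist path only runs when explicitly opted in; otherwise parsePdfBytes goes straight to the lightweight fallback. A usage sketch — PARSER_USE_PDFJS is the env flag added above, and the sample file path is illustrative:

// Usage sketch: opting in to the pdfjs-dist path (Node 18+, ESM).
import { readFile } from 'node:fs/promises'
import { parsePdfBytes } from './parser'

process.env.PARSER_USE_PDFJS = '1' // leave unset to use only the lightweight extractor

const buf = await readFile('./sample.pdf') // illustrative path
const pages = await parsePdfBytes(buf)
console.log(`${pages.length} pages; page 1 starts: ${pages[0]?.text.slice(0, 80)}`)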
ingestion_js/test_upload_js.sh CHANGED
@@ -6,7 +6,8 @@ echo "🚀 Testing Ingestion JS API on Vercel"
 echo "======================================"
 
 # Configuration
-BACKEND_URL="https://study-buddy-ingestion1.vercel.app/api"
+# BACKEND_URL="https://study-buddy-ingestion1.vercel.app/api"
+BACKEND_URL="http://localhost:3001/api"
 USER_ID="44e65346-8eaa-4f95-b17a-f6219953e7a8"
 PROJECT_ID="496e2fad-ec7e-4562-b06a-ea2491f2460"
 