LiamKhoaLe committed on
Commit
3fafddc
·
1 Parent(s): fc03a78

Initial setups refine for ingestion_js #11 (check buffer)

Browse files
ingestion_js/lib/chunker.ts CHANGED
@@ -35,19 +35,29 @@ function createOverlappingChunks(blocks: string[]): string[] {
35
  }
36
 
37
  export async function buildCardsFromPages(pages: Page[], filename: string, user_id: string, project_id: string) {
 
 
38
  let full = ''
39
  for (const p of pages) full += `\n\n[[Page ${p.page_num}]]\n${(p.text || '').trim()}\n`
 
 
40
  const coarse = byHeadings(full)
 
 
41
  const chunks = createOverlappingChunks(coarse)
 
42
 
43
  const out: any[] = []
44
  for (let i = 0; i < chunks.length; i++) {
 
 
45
  const cleaned = await cleanChunkText(chunks[i])
46
  const topic = (await cheapSummarize(cleaned, 1)) || (cleaned.slice(0, 80) + '...')
47
  const summary = await cheapSummarize(cleaned, 3)
48
  const firstPage = pages[0]?.page_num ?? 1
49
  const lastPage = pages[pages.length - 1]?.page_num ?? 1
50
- out.push({
 
51
  user_id,
52
  project_id,
53
  filename,
@@ -55,8 +65,13 @@ export async function buildCardsFromPages(pages: Page[], filename: string, user_
55
  summary,
56
  content: cleaned,
57
  page_span: [firstPage, lastPage],
58
- card_id: `${slugify(filename)}-c${String(i + 1).padStart(4, '0')}`
59
- })
 
 
 
60
  }
 
 
61
  return out
62
  }
 
35
  }
36
 
37
  export async function buildCardsFromPages(pages: Page[], filename: string, user_id: string, project_id: string) {
38
+ console.log(`[CHUNKER_DEBUG] Building cards from ${pages.length} pages for ${filename}`)
39
+
40
  let full = ''
41
  for (const p of pages) full += `\n\n[[Page ${p.page_num}]]\n${(p.text || '').trim()}\n`
42
+ console.log(`[CHUNKER_DEBUG] Full text length: ${full.length}`)
43
+
44
  const coarse = byHeadings(full)
45
+ console.log(`[CHUNKER_DEBUG] Split into ${coarse.length} heading blocks`)
46
+
47
  const chunks = createOverlappingChunks(coarse)
48
+ console.log(`[CHUNKER_DEBUG] Created ${chunks.length} overlapping chunks`)
49
 
50
  const out: any[] = []
51
  for (let i = 0; i < chunks.length; i++) {
52
+ console.log(`[CHUNKER_DEBUG] Processing chunk ${i + 1}/${chunks.length}`)
53
+
54
  const cleaned = await cleanChunkText(chunks[i])
55
  const topic = (await cheapSummarize(cleaned, 1)) || (cleaned.slice(0, 80) + '...')
56
  const summary = await cheapSummarize(cleaned, 3)
57
  const firstPage = pages[0]?.page_num ?? 1
58
  const lastPage = pages[pages.length - 1]?.page_num ?? 1
59
+
60
+ const card = {
61
  user_id,
62
  project_id,
63
  filename,
 
65
  summary,
66
  content: cleaned,
67
  page_span: [firstPage, lastPage],
68
+ card_id: `${slugify(String(filename))}-c${String(i + 1).padStart(4, '0')}`
69
+ }
70
+
71
+ console.log(`[CHUNKER_DEBUG] Created card ${card.card_id} with content length ${cleaned.length}`)
72
+ out.push(card)
73
  }
74
+
75
+ console.log(`[CHUNKER_DEBUG] Built ${out.length} cards total`)
76
  return out
77
  }
ingestion_js/lib/embedder.ts CHANGED
@@ -1,7 +1,15 @@
1
  export async function embedRemote(texts: string[]): Promise<number[][]> {
 
 
2
  if (!texts || texts.length === 0) return []
3
  const base = (process.env.EMBED_BASE_URL || '').replace(/\/$/, '')
4
- if (!base) throw new Error('EMBED_BASE_URL is required')
 
 
 
 
 
 
5
  const res = await fetch(`${base}/embed`, {
6
  method: 'POST',
7
  headers: { 'Content-Type': 'application/json' },
@@ -9,14 +17,20 @@ export async function embedRemote(texts: string[]): Promise<number[][]> {
9
  // 60s like Python client
10
  next: { revalidate: 0 }
11
  })
 
12
  if (!res.ok) {
 
13
  // Fail closed with zeros to avoid crashes (parity with Python fallback)
14
  return Array.from({ length: texts.length }, () => Array(384).fill(0))
15
  }
 
16
  const data = await res.json() as any
17
  const vectors = Array.isArray(data?.vectors) ? data.vectors : []
18
  if (!Array.isArray(vectors)) {
 
19
  return Array.from({ length: texts.length }, () => Array(384).fill(0))
20
  }
 
 
21
  return vectors
22
  }
 
1
  export async function embedRemote(texts: string[]): Promise<number[][]> {
2
+ console.log(`[EMBEDDER_DEBUG] Embedding ${texts.length} texts`)
3
+
4
  if (!texts || texts.length === 0) return []
5
  const base = (process.env.EMBED_BASE_URL || '').replace(/\/$/, '')
6
+ if (!base) {
7
+ console.error('[EMBEDDER_DEBUG] EMBED_BASE_URL is required')
8
+ throw new Error('EMBED_BASE_URL is required')
9
+ }
10
+
11
+ console.log(`[EMBEDDER_DEBUG] Calling ${base}/embed`)
12
+
13
  const res = await fetch(`${base}/embed`, {
14
  method: 'POST',
15
  headers: { 'Content-Type': 'application/json' },
 
17
  // 60s like Python client
18
  next: { revalidate: 0 }
19
  })
20
+
21
  if (!res.ok) {
22
+ console.warn(`[EMBEDDER_DEBUG] Embedding failed with status ${res.status}, using zero vectors`)
23
  // Fail closed with zeros to avoid crashes (parity with Python fallback)
24
  return Array.from({ length: texts.length }, () => Array(384).fill(0))
25
  }
26
+
27
  const data = await res.json() as any
28
  const vectors = Array.isArray(data?.vectors) ? data.vectors : []
29
  if (!Array.isArray(vectors)) {
30
+ console.warn('[EMBEDDER_DEBUG] Invalid vectors format, using zero vectors')
31
  return Array.from({ length: texts.length }, () => Array(384).fill(0))
32
  }
33
+
34
+ console.log(`[EMBEDDER_DEBUG] Successfully embedded ${vectors.length} vectors`)
35
  return vectors
36
  }