LiamKhoaLe committed on
Commit
ec9f00b
·
1 Parent(s): ee39cc9

Push submodules dir

Browse files
ingestion_js/app/api/health/route.ts ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import { NextResponse } from 'next/server'
2
+ import { getMongo } from '@/lib/mongo'
3
+
4
// Route segment config: always render this handler dynamically so each request runs a live probe.
export const dynamic = 'force-dynamic'

/**
 * Health-check endpoint for the ingestion service.
 *
 * Always responds with a JSON body `{ ok, mongodb_connected, service }`.
 * `mongodb_connected` is true only when a connection could be obtained AND a
 * `ping` command succeeded; any failure along the way is reported as `false`
 * instead of failing the request.
 */
export async function GET() {
  let mongodb_connected = false
  try {
    // Obtaining the connection is part of the probe: getMongo() rejects when
    // MONGO_URI is missing or the server is unreachable, and that must surface
    // as "not connected" rather than as an unhandled error.
    const mongo = await getMongo()
    await mongo.db.command({ ping: 1 })
    mongodb_connected = true
  } catch {
    mongodb_connected = false
  }
  return NextResponse.json({ ok: true, mongodb_connected, service: 'ingestion_pipeline' })
}
ingestion_js/lib/captioner.ts ADDED
@@ -0,0 +1,73 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
/** Image payload shapes accepted by the captioner helpers. */
type ImageLike = { data: Buffer } | Blob | ArrayBuffer | string

/**
 * Encode an image payload as a base64 string.
 *
 * Despite the name, no JPEG transcoding happens here: the bytes are
 * base64-encoded exactly as received.
 *
 * @param input Image bytes. Strings are returned unchanged (they are treated as
 *   already-encoded base64); Buffers, ArrayBuffers, Uint8Arrays, Blobs and
 *   `{ data: Buffer }` wrappers are encoded from their raw bytes.
 * @returns The base64 representation of the image bytes.
 */
async function imageToJpegBase64(input: ImageLike | Buffer | Uint8Array): Promise<string> {
  if (typeof input === 'string') return input
  if (Buffer.isBuffer(input)) return input.toString('base64')
  // Blob must be read asynchronously; Buffer.from() cannot consume it directly.
  if (typeof Blob !== 'undefined' && input instanceof Blob) {
    return Buffer.from(await input.arrayBuffer()).toString('base64')
  }
  // `{ data: Buffer }` wrapper variant of ImageLike.
  if ('data' in input && Buffer.isBuffer(input.data)) {
    return input.data.toString('base64')
  }
  return Buffer.from(input as ArrayBuffer).toString('base64')
}
9
+
10
/**
 * Resolve the NVIDIA API key from the environment.
 *
 * `NVIDIA_API` takes precedence; otherwise `NVIDIA_API_1` … `NVIDIA_API_5` are
 * consulted in ascending order. Empty values are skipped.
 *
 * @returns The first non-empty key found, or `null` when none is configured.
 */
function getNvidiaKey(): string | null {
  const names = ['NVIDIA_API', ...Array.from({ length: 5 }, (_, idx) => `NVIDIA_API_${idx + 1}`)]
  const found = names.map((name) => process.env[name]).find((value) => Boolean(value))
  return found || null
}
19
+
20
/**
 * Ask the NVIDIA chat-completions endpoint for a caption of an image.
 *
 * Captioning is best-effort: the function resolves to an empty string when no
 * API key is configured, when the request fails (network error, non-2xx
 * status, unparsable body) or when the response carries no textual content.
 *
 * @param imageBuffer Raw image bytes; sent as a base64 `data:image/jpeg` URL.
 * @returns The normalized caption, or `''` when none could be produced.
 */
export async function captionImage(imageBuffer: Buffer): Promise<string> {
  const key = getNvidiaKey()
  if (!key) return ''

  try {
    const imgB64 = await imageToJpegBase64(imageBuffer)
    const system_prompt =
      'You are an expert vision captioner. Produce a precise, information-dense caption of the image. Do not include conversational phrases or meta commentary.'
    const user_prompt = 'Caption this image at the finest level of detail. Return only the caption text.'
    const payload = {
      model: process.env.NVIDIA_MAVERICK_MODEL || 'meta/llama-4-maverick-17b-128e-instruct',
      messages: [
        { role: 'system', content: system_prompt },
        {
          role: 'user',
          content: [
            { type: 'text', text: user_prompt },
            { type: 'image_url', image_url: { url: `data:image/jpeg;base64,${imgB64}` } }
          ]
        }
      ],
      max_tokens: 512,
      temperature: 0.2,
      stream: false
    }
    const res = await fetch('https://integrate.api.nvidia.com/v1/chat/completions', {
      method: 'POST',
      headers: {
        'Authorization': `Bearer ${key}`,
        'Content-Type': 'application/json'
      },
      body: JSON.stringify(payload)
    })
    if (!res.ok) return ''
    const data = (await res.json()) as any
    const text = data?.choices?.[0]?.message?.content
    // Only plain-string content can be normalized; anything else counts as "no caption".
    return typeof text === 'string' ? normalizeCaption(text) : ''
  } catch (err) {
    // Keep the same "empty caption" contract as the other failure paths instead of rejecting.
    console.warn('captionImage: caption request failed', err)
    return ''
  }
}
56
+
57
/**
 * Clean up a model-produced caption.
 *
 * Steps: trim, drop one wrapping quote character at each end, remove at most
 * one leading boilerplate phrase (case-insensitive), then drop wrapping quotes
 * again and trim.
 *
 * A boilerplate phrase that ends in a letter is only removed when it is not
 * immediately followed by another letter or digit, so real words such as
 * "Surely" are left intact.
 *
 * @param text Raw caption text; falsy input yields `''`.
 * @returns The cleaned caption.
 */
export function normalizeCaption(text: string): string {
  if (!text) return ''
  const stripQuotes = (s: string): string => s.replace(/^['"]|['"]$/g, '').trim()
  const banned = [
    'sure,', 'sure.', 'sure', 'here is', 'here are', 'this image', 'the image', 'image shows',
    'the picture', 'the photo', 'the text describes', 'the text describe', 'it shows', 'it depicts',
    'caption:', 'description:', 'output:', 'result:', 'answer:', 'analysis:', 'observation:'
  ]
  // Unquote first so a caption wrapped in quotes still has its prefix detected.
  let t = stripQuotes(text.trim())
  const lower = t.toLowerCase()
  for (const p of banned) {
    if (!lower.startsWith(p)) continue
    // Word-boundary guard: "sure" must not eat the start of "surely".
    const splitsWord = /[a-z0-9]$/.test(p) && /[a-z0-9]/.test(lower.charAt(p.length))
    if (splitsWord) continue
    t = t.slice(p.length).trimStart()
    break
  }
  return stripQuotes(t)
}
ingestion_js/lib/embedder.ts ADDED
@@ -0,0 +1,22 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
/**
 * Embed a batch of texts through the remote embedding service.
 *
 * POSTs `{ texts }` to `${EMBED_BASE_URL}/embed` and expects a JSON body of
 * the form `{ vectors: number[][] }` with one vector per input text.
 *
 * The call fails closed: on a network error, a 60 second timeout, a non-2xx
 * status, an unparsable body, or a `vectors` field that is missing or does not
 * contain exactly one entry per text, it resolves to all-zero vectors of
 * dimension 384 (one per text) instead of rejecting.
 *
 * @param texts Texts to embed; an empty or missing list yields `[]` without a request.
 * @returns One embedding per input text.
 * @throws Error when `EMBED_BASE_URL` is not configured.
 */
export async function embedRemote(texts: string[]): Promise<number[][]> {
  if (!texts || texts.length === 0) return []
  const base = (process.env.EMBED_BASE_URL || '').replace(/\/$/, '')
  if (!base) throw new Error('EMBED_BASE_URL is required')

  const FALLBACK_DIM = 384
  const TIMEOUT_MS = 60000 // 60s like Python client
  const zeroVectors = (): number[][] =>
    Array.from({ length: texts.length }, () => Array<number>(FALLBACK_DIM).fill(0))

  const controller = new AbortController()
  const timer = setTimeout(() => controller.abort(), TIMEOUT_MS)
  try {
    const init = {
      method: 'POST',
      headers: { 'Content-Type': 'application/json' },
      body: JSON.stringify({ texts }),
      signal: controller.signal,
      // Next.js fetch extension: never serve this request from the data cache.
      next: { revalidate: 0 }
    }
    const res = await fetch(`${base}/embed`, init)
    if (!res.ok) {
      // Fail closed with zeros to avoid crashes (parity with Python fallback)
      return zeroVectors()
    }
    const data = (await res.json()) as any
    const vectors = data?.vectors
    // Callers pair vectors with texts by index, so a count mismatch is as bad as a missing field.
    if (!Array.isArray(vectors) || vectors.length !== texts.length) {
      return zeroVectors()
    }
    return vectors
  } catch (err) {
    console.warn('embedRemote: embedding request failed, returning zero vectors', err)
    return zeroVectors()
  } finally {
    clearTimeout(timer)
  }
}
ingestion_js/lib/mongo.ts ADDED
@@ -0,0 +1,72 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import { MongoClient, Db } from 'mongodb'
2
+
3
/** A connected client together with the selected database handle. */
type MongoConnection = { client: MongoClient; db: Db }

// Module-level cache of the (possibly still pending) connection. Caching the
// promise rather than the resolved values lets concurrent first callers share
// one connection attempt instead of each opening its own client.
let connection: Promise<MongoConnection> | null = null

/** Open a new client using MONGO_URI and select the MONGO_DB database (default 'studybuddy'). */
async function connectMongo(): Promise<MongoConnection> {
  const uri = process.env.MONGO_URI
  const dbName = process.env.MONGO_DB || 'studybuddy'
  if (!uri) throw new Error('MONGO_URI is required')
  const client = new MongoClient(uri)
  try {
    await client.connect()
  } catch (err) {
    // Release whatever the failed attempt allocated before propagating the error.
    await client.close().catch(() => undefined)
    throw err
  }
  return { client, db: client.db(dbName) }
}

/**
 * Return the shared MongoDB client and database handle, connecting on first use.
 *
 * @returns `{ client, db }` for the configured database.
 * @throws Error when `MONGO_URI` is not set, or whatever error the driver
 *   raises while connecting. A failed attempt is not cached, so the next call retries.
 */
export async function getMongo(): Promise<MongoConnection> {
  if (!connection) {
    connection = connectMongo().catch((err) => {
      connection = null
      throw err
    })
  }
  return connection
}
16
+
17
/** Required length of every stored embedding vector. */
export const VECTOR_DIM = 384

/**
 * Insert chunk documents ("cards") into the `chunks` collection.
 *
 * Input is validated before any database access, so an empty batch or a batch
 * with a bad embedding never opens a connection. The insert is unordered
 * (`ordered: false`).
 *
 * @param cards Documents to insert; each must carry an `embedding` array of
 *   length `VECTOR_DIM`. An empty or missing list is a no-op.
 * @throws Error when any card has a missing or wrongly sized embedding.
 */
export async function storeCards(cards: any[]): Promise<void> {
  if (!cards || !cards.length) return
  for (const c of cards) {
    if (!c.embedding || c.embedding.length !== VECTOR_DIM) {
      throw new Error(`Invalid embedding length; expected ${VECTOR_DIM}`)
    }
  }
  const { db } = await getMongo()
  await db.collection('chunks').insertMany(cards, { ordered: false })
}
29
+
30
/**
 * Create or update the summary stored for a file.
 *
 * The `files` document is matched on `(user_id, project_id, filename)`; its
 * `summary` field is set, and the document is inserted when no match exists.
 */
export async function upsertFileSummary(user_id: string, project_id: string, filename: string, summary: string) {
  const fileKey = { user_id, project_id, filename }
  const change = { $set: { summary } }
  const mongo = await getMongo()
  await mongo.db.collection('files').updateOne(fileKey, change, { upsert: true })
}
38
+
39
/**
 * List the files recorded for a user's project.
 *
 * @returns Documents from the `files` collection reduced to `filename` and
 *   `summary` (no `_id`), sorted by `filename` ascending.
 */
export async function listFiles(user_id: string, project_id: string) {
  const mongo = await getMongo()
  const projection = { _id: 0, filename: 1, summary: 1 }
  return mongo.db
    .collection('files')
    .find({ user_id, project_id }, { projection })
    .sort({ filename: 1 })
    .toArray()
}
44
+
45
/**
 * Fetch up to `limit` chunk documents of one file.
 *
 * Each document is copied into a plain object: `_id` is converted with
 * `String()`, any object value exposing a `toISOString` method (e.g. a Date)
 * is replaced by its ISO string, and every other value is passed through.
 *
 * @param limit Value passed to the cursor's `limit()` (default 20).
 */
export async function getFileChunks(user_id: string, project_id: string, filename: string, limit = 20) {
  const mongo = await getMongo()
  const matches = mongo.db.collection('chunks').find({ user_id, project_id, filename }).limit(limit)

  // Convert a single field value according to the rules described above.
  const toPlainValue = (key: string, value: any): any => {
    if (key === '_id') return String(value)
    const hasIsoMethod = Boolean(value) && typeof value === 'object' && typeof value.toISOString === 'function'
    return hasIsoMethod ? value.toISOString() : value
  }

  const results: any[] = []
  for await (const raw of matches) {
    const plain: any = {}
    Object.entries(raw as any).forEach(([key, value]) => {
      plain[key] = toPlainValue(key, value)
    })
    results.push(plain)
  }
  return results
}
61
+
62
/**
 * Remove everything stored for one file: first its documents in `chunks`,
 * then its documents in `files`, both matched on `(user_id, project_id, filename)`.
 */
export async function deleteFileData(user_id: string, project_id: string, filename: string) {
  const mongo = await getMongo()
  // Sequential on purpose: chunks are removed before the file record.
  for (const collectionName of ['chunks', 'files']) {
    await mongo.db.collection(collectionName).deleteMany({ user_id, project_id, filename })
  }
}
67
+
68
/**
 * Create the compound index `{ user_id: 1, project_id: 1, filename: 1 }` on
 * the `chunks` collection and then on the `files` collection.
 */
export async function ensureIndexes() {
  const mongo = await getMongo()
  for (const collectionName of ['chunks', 'files']) {
    await mongo.db.collection(collectionName).createIndex({ user_id: 1, project_id: 1, filename: 1 })
  }
}
ingestion_js/lib/parser.ts ADDED
@@ -0,0 +1,38 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pdfParse from 'pdf-parse'
2
+ import mammoth from 'mammoth'
3
+
4
/** One extracted page: 1-based page number, its text, and any extracted images. */
export type Page = { page_num: number; text: string; images: Buffer[] }

/**
 * Extract per-page text from a PDF.
 *
 * pdf-parse only returns the whole document as one string in `data.text`, with
 * pages joined by blank lines, so page boundaries cannot be recovered from it
 * afterwards. The text of each page is therefore captured through the
 * `pagerender` hook as pdf-parse walks the document.
 * NOTE(review): this relies on pdf-parse invoking `pagerender` once per page in
 * page order — confirm against the installed version.
 *
 * Images are not extracted; `images` is always empty.
 *
 * @param buf Raw PDF bytes.
 * @returns One entry per page; always at least one entry.
 */
export async function parsePdfBytes(buf: Buffer): Promise<Page[]> {
  const pageTexts: string[] = []

  // Builds page text the way pdf-parse's built-in renderer does (a new line
  // whenever the vertical position changes) and records it per page.
  const pagerender = async (pageData: any): Promise<string> => {
    const content = await pageData.getTextContent({
      normalizeWhitespace: false,
      disableCombineTextItems: false
    })
    let lastY: number | undefined
    let text = ''
    for (const item of content.items) {
      const y = item.transform[5]
      text += !lastY || lastY === y ? item.str : '\n' + item.str
      lastY = y
    }
    pageTexts.push(text)
    return text
  }

  const data = await pdfParse(buf, { pagerender })

  // Fallback when the hook never ran: split the combined text on form feeds,
  // which yields a single page when none are present.
  const texts = pageTexts.length > 0 ? pageTexts : String(data.text || '').split('\f')
  return texts.map((text, i) => ({ page_num: i + 1, text: text || '', images: [] }))
}
18
+
19
/**
 * Extract the raw text of a DOCX document with mammoth.
 *
 * @param buf Raw DOCX bytes.
 * @returns A single page (page 1) holding the whole text and no images.
 */
export async function parseDocxBytes(buf: Buffer): Promise<Page[]> {
  const extracted = await mammoth.extractRawText({ buffer: buf })
  const onlyPage: Page = { page_num: 1, text: extracted.value || '', images: [] }
  return [onlyPage]
}
24
+
25
/**
 * Guess a MIME type from a filename's extension (case-insensitive).
 *
 * @returns The PDF or DOCX MIME type for `.pdf` / `.docx` names, otherwise
 *   `application/octet-stream`.
 */
export function inferMime(filename: string): string {
  const mimeBySuffix: Array<[string, string]> = [
    ['.pdf', 'application/pdf'],
    ['.docx', 'application/vnd.openxmlformats-officedocument.wordprocessingml.document']
  ]
  const normalized = filename.toLowerCase()
  const hit = mimeBySuffix.find(([suffix]) => normalized.endsWith(suffix))
  return hit ? hit[1] : 'application/octet-stream'
}
32
+
33
/**
 * Parse an uploaded file into pages, choosing the parser from the filename.
 *
 * @param filename Used only to infer the file type from its extension.
 * @param file Raw file bytes.
 * @throws Error when the file is neither a PDF nor a DOCX.
 */
export async function extractPages(filename: string, file: Buffer): Promise<Page[]> {
  switch (inferMime(filename)) {
    case 'application/pdf':
      return parsePdfBytes(file)
    case 'application/vnd.openxmlformats-officedocument.wordprocessingml.document':
      return parseDocxBytes(file)
    default:
      throw new Error(`Unsupported file type: ${filename}`)
  }
}
ingestion_js/next.config.js ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
// Body size limit applied to Server Actions.
const serverActions = { bodySizeLimit: '50mb' };

/** @type {import('next').NextConfig} */
const nextConfig = { experimental: { serverActions } };

module.exports = nextConfig;
ingestion_js/package.json ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "name": "ingestion-js",
3
+ "private": true,
4
+ "version": "0.1.0",
5
+ "scripts": {
6
+ "dev": "next dev",
7
+ "build": "next build",
8
+ "start": "next start"
9
+ },
10
+ "dependencies": {
11
+ "mongodb": "^6.8.0",
12
+ "next": "^14.2.5",
13
+ "react": "^18.3.1",
14
+ "react-dom": "^18.3.1",
15
+ "pdf-parse": "^1.1.1",
16
+ "mammoth": "^1.6.0",
17
+ "slugify": "^1.6.6"
18
+ },
19
+ "devDependencies": {
20
+ "@types/node": "^20.11.30",
21
+ "@types/react": "^18.2.66",
22
+ "typescript": "^5.4.5"
23
+ }
24
+ }
ingestion_js/tsconfig.json ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "compilerOptions": {
3
+ "target": "ES2022",
4
+ "lib": ["dom", "dom.iterable", "es2022"],
5
+ "allowJs": true,
6
+ "skipLibCheck": true,
7
+ "strict": false,
8
+ "forceConsistentCasingInFileNames": true,
9
+ "noEmit": true,
10
+ "esModuleInterop": true,
11
+ "module": "esnext",
12
+ "moduleResolution": "bundler",
13
+ "resolveJsonModule": true,
14
+ "isolatedModules": true,
15
+ "jsx": "preserve",
16
+ "incremental": true,
17
+ "types": ["node"]
18
+ },
19
+ "include": ["next-env.d.ts", "**/*.ts", "**/*.tsx"],
20
+ "exclude": ["node_modules"]
21
+ }