File size: 2,555 Bytes
ec9f00b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
/**
 * Union of the shapes an image payload can take: an object carrying a Buffer
 * in its `data` field, a Blob, a raw ArrayBuffer, or a string.
 */
type ImageLike =
  | { data: Buffer }
  | Blob
  | ArrayBuffer
  | string

/**
 * Converts an image payload into a bare base64 string (no `data:` prefix).
 *
 * NOTE: despite the name, no re-encoding to JPEG happens here; the bytes are
 * base64-encoded exactly as received.
 *
 * Accepted inputs:
 * - string: treated as already-encoded base64; a leading `data:<mime>;base64,`
 *   prefix is removed so callers that wrap the result in their own data URL
 *   do not end up with a doubled prefix
 * - Buffer, ArrayBuffer, or any typed-array / DataView (raw bytes are used)
 * - Blob
 * - an object whose `data` field is a Buffer or other typed-array view
 * - anything else is handed to `Buffer.from`, which throws a TypeError for
 *   unsupported values
 *
 * @param input - The image payload to encode.
 * @returns The base64 encoding of the payload's bytes.
 */
async function imageToJpegBase64(input: unknown): Promise<string> {
  // Encode exactly the bytes a view covers, regardless of its element type.
  const viewToBase64 = (view: ArrayBufferView): string =>
    Buffer.from(view.buffer, view.byteOffset, view.byteLength).toString('base64')

  if (typeof input === 'string') {
    return input.replace(/^data:[^,]*;base64,/i, '')
  }
  if (Buffer.isBuffer(input)) return input.toString('base64')
  if (input instanceof ArrayBuffer) return Buffer.from(input).toString('base64')
  if (ArrayBuffer.isView(input)) return viewToBase64(input)
  if (typeof Blob !== 'undefined' && input instanceof Blob) {
    return Buffer.from(await input.arrayBuffer()).toString('base64')
  }
  if (typeof input === 'object' && input !== null) {
    // `{ data: Buffer }` wrapper: Buffer.from() cannot unwrap this shape itself.
    const wrapped = (input as { data?: unknown }).data
    if (ArrayBuffer.isView(wrapped)) return viewToBase64(wrapped)
  }
  // Remaining shapes (plain byte arrays, serialized `{ type: 'Buffer', data: [...] }`)
  // keep the previous behaviour of going straight through Buffer.from.
  return Buffer.from(input as ArrayLike<number>).toString('base64')
}

/**
 * Looks up the NVIDIA API key in the environment.
 *
 * `NVIDIA_API` is consulted first, followed by `NVIDIA_API_1` through
 * `NVIDIA_API_5` in ascending order. Unset and empty values are skipped.
 *
 * @returns The first non-empty value found, or `null` when none is set.
 */
function getNvidiaKey(): string | null {
  const candidateNames = [
    'NVIDIA_API',
    ...Array.from({ length: 5 }, (_, idx) => `NVIDIA_API_${idx + 1}`)
  ]
  for (const name of candidateNames) {
    const value = process.env[name]
    if (value) {
      return value
    }
  }
  return null
}

/**
 * Requests a detailed caption for an image from NVIDIA's chat-completions
 * endpoint and returns it after passing it through `normalizeCaption`.
 *
 * This is a best-effort call: it resolves to an empty string (and never
 * rejects) when no API key is configured, when the request fails at the
 * network level, when the server answers with a non-2xx status, when the
 * response body is not valid JSON, or when the first choice carries no
 * string content. Failures after the key check are reported via
 * `console.warn`; the API key is never logged.
 *
 * The model defaults to `meta/llama-4-maverick-17b-128e-instruct` and can be
 * overridden with the `NVIDIA_MAVERICK_MODEL` environment variable.
 *
 * @param imageBuffer - Image bytes; sent inline as a base64 data URL that is
 *   labelled `image/jpeg`.
 * @returns The normalized caption, or `''` on any failure.
 */
export async function captionImage(imageBuffer: Buffer): Promise<string> {
  const key = getNvidiaKey()
  if (!key) return ''

  try {
    const imgB64 = await imageToJpegBase64(imageBuffer)
    const systemPrompt =
      'You are an expert vision captioner. Produce a precise, information-dense caption of the image. Do not include conversational phrases or meta commentary.'
    const userPrompt = 'Caption this image at the finest level of detail. Return only the caption text.'
    const payload = {
      model: process.env.NVIDIA_MAVERICK_MODEL || 'meta/llama-4-maverick-17b-128e-instruct',
      messages: [
        { role: 'system', content: systemPrompt },
        {
          role: 'user',
          content: [
            { type: 'text', text: userPrompt },
            { type: 'image_url', image_url: { url: `data:image/jpeg;base64,${imgB64}` } }
          ]
        }
      ],
      max_tokens: 512,
      temperature: 0.2,
      stream: false
    }

    // fetch rejects on network-level failures; the surrounding try/catch turns
    // that into the same '' result used for every other failure mode.
    const res = await fetch('https://integrate.api.nvidia.com/v1/chat/completions', {
      method: 'POST',
      headers: {
        'Authorization': `Bearer ${key}`,
        'Content-Type': 'application/json'
      },
      body: JSON.stringify(payload)
    })
    if (!res.ok) {
      console.warn(`captionImage: caption request failed with HTTP ${res.status}`)
      return ''
    }

    const data = (await res.json()) as {
      choices?: Array<{ message?: { content?: unknown } }>
    } | null
    const content = data?.choices?.[0]?.message?.content
    // Only plain-string content can be normalized; anything else counts as "no caption".
    return typeof content === 'string' ? normalizeCaption(content) : ''
  } catch (err: unknown) {
    console.warn('captionImage: caption request failed:', err instanceof Error ? err.message : err)
    return ''
  }
}

/**
 * Cleans up a model-generated caption.
 *
 * Steps, in order:
 * 1. Trim surrounding whitespace.
 * 2. Remove at most one leading boilerplate phrase (case-insensitive), e.g.
 *    "Sure,", "Here is", "Caption:". A phrase that ends in a letter is only
 *    removed when it ends on a word boundary, so "The photograph ..." is not
 *    mangled by the "the photo" entry and "Surely ..." is not mangled by
 *    "sure".
 * 3. Remove one leading and one trailing single/double quote character.
 *
 * @param text - Raw caption text from the model.
 * @returns The cleaned caption, or `''` when `text` is empty or not a string.
 */
export function normalizeCaption(text: string): string {
  if (typeof text !== 'string' || !text) return ''
  let caption = text.trim()
  // Order matters: the first entry that matches wins, so longer variants
  // ('sure,') must precede their shorter prefixes ('sure').
  const banned = [
    'sure,', 'sure.', 'sure', 'here is', 'here are', 'this image', 'the image', 'image shows',
    'the picture', 'the photo', 'the text describes', 'the text describe', 'it shows', 'it depicts',
    'caption:', 'description:', 'output:', 'result:', 'answer:', 'analysis:', 'observation:'
  ]
  const lowered = caption.toLowerCase()
  const wordChar = /[\p{L}\p{N}_]/u
  for (const phrase of banned) {
    if (!lowered.startsWith(phrase)) continue
    // Skip matches that stop in the middle of a word (phrase ends in a word
    // character and the caption continues with another one).
    const phraseEndsInWord = wordChar.test(phrase.charAt(phrase.length - 1))
    if (phraseEndsInWord && wordChar.test(lowered.charAt(phrase.length))) continue
    caption = caption.slice(phrase.length).trimStart()
    break
  }
  return caption.replace(/^['\"]|['\"]$/g, '').trim()
}