# ────────────────────────────── memo/nvidia.py ──────────────────────────────
"""
NVIDIA Integration

Functions for interacting with NVIDIA's API for summarization and analysis.
"""

import os
import json
from typing import List, Dict, Any

from utils.logger import get_logger
from utils.api.rotator import robust_post_json
from utils.api.router import qwen_chat_completion

logger = get_logger("NVIDIA_INTEGRATION", __name__)

NVIDIA_SMALL = os.getenv("NVIDIA_SMALL", "meta/llama-3.1-8b-instruct")
NVIDIA_MEDIUM = os.getenv("NVIDIA_MEDIUM", "qwen/qwen3-next-80b-a3b-thinking")
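
# Both model slugs are overridable via environment variables, e.g.:
#   export NVIDIA_SMALL="meta/llama-3.1-8b-instruct"
#   export NVIDIA_MEDIUM="qwen/qwen3-next-80b-a3b-thinking"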

async def nvidia_chat(system_prompt: str, user_prompt: str, nvidia_key: str, rotator) -> str:
    """
    Minimal NVIDIA Chat call that enforces no-comment concise outputs.
    """
    url = "https://integrate.api.nvidia.com/v1/chat/completions"
    payload = {
        "model": NVIDIA_SMALL,
        "temperature": 0.0,
        "messages": [
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": user_prompt},
        ]
    }
    headers = {"Content-Type": "application/json", "Authorization": f"Bearer {nvidia_key or ''}"}
    data = None
    try:
        data = await robust_post_json(url, headers, payload, rotator)
        return data["choices"][0]["message"]["content"]
    except Exception as e:
        logger.warning(f"NVIDIA chat error: {e} • response: {data}")
        return ""

async def qwen_chat(system_prompt: str, user_prompt: str, rotator) -> str:
    """
    Qwen chat call for medium complexity tasks with thinking mode.
    """
    try:
        return await qwen_chat_completion(system_prompt, user_prompt, rotator)
    except Exception as e:
        logger.warning(f"Qwen chat error: {e}")
        return ""

def safe_json(s: str) -> Any:
    """Safely parse JSON string"""
    try:
        return json.loads(s)
    except Exception:
        # Try to extract a JSON object from text
        start = s.find("{")
        end = s.rfind("}")
        if start != -1 and end != -1 and end > start:
            try:
                return json.loads(s[start:end+1])
            except Exception:
                return {}
        return {}
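
# A few illustrative inputs for safe_json (results follow from the fallback
# extraction above):
#   safe_json('{"a": 1}')                     -> {"a": 1}
#   safe_json('Here you go: {"a": 1} Done.')  -> {"a": 1}
#   safe_json('no json at all')               -> {}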

async def summarize_qa(question: str, answer: str, rotator) -> str:
    """
    Returns a single line block:
    q: <concise>\na: <concise>
    No extra commentary.
    """
    sys = "You are a terse summarizer. Output exactly two lines:\nq: <short question summary>\na: <short answer summary>\nNo extra text."
    user = f"Question:\n{question}\n\nAnswer:\n{answer}"
    key = rotator.get_key()
    out = await nvidia_chat(sys, user, key, rotator)
    
    # Basic guard if the model returns extra prose
    lines = [ln.strip() for ln in out.splitlines() if ln.strip()]
    ql = next((ln for ln in lines if ln.lower().startswith("q:")), None)
    al = next((ln for ln in lines if ln.lower().startswith("a:")), None)
    
    if not ql or not al:
        # Fallback: truncate the raw question/answer to keep the two-line shape
        ql = "q: " + (question.strip()[:160] + ("…" if len(question.strip()) > 160 else ""))
        al = "a: " + (answer.strip()[:220] + ("…" if len(answer.strip()) > 220 else ""))
    
    return f"{ql}\n{al}"

async def files_relevance(question: str, file_summaries: List[Dict[str, str]], rotator) -> Dict[str, bool]:
    """
    Ask Qwen model to mark each file as relevant (true) or not (false) for the question.
    Returns {filename: bool}
    """
    sys = "You classify file relevance. Return STRICT JSON only with shape {\"relevance\":[{\"filename\":\"...\",\"relevant\":true|false}]}."
    items = [{"filename": f["filename"], "summary": f.get("summary","")} for f in file_summaries]
    user = f"Question: {question}\n\nFiles:\n{json.dumps(items, ensure_ascii=False)}\n\nReturn JSON only."
    
    # Use Qwen for better JSON parsing and reasoning
    out = await qwen_chat(sys, user, rotator)
    
    data = safe_json(out) or {}
    rels = {}
    for row in data.get("relevance", []):
        fn = row.get("filename")
        rv = row.get("relevant")
        if isinstance(fn, str) and isinstance(rv, bool):
            rels[fn] = rv
    
    # If parsing failed, default to considering all files possibly relevant
    if not rels and file_summaries:
        rels = {f["filename"]: True for f in file_summaries}
    
    return rels
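
# The strict-JSON contract requested from the model (shape shown with
# hypothetical filenames):
#   {"relevance": [{"filename": "a.py", "relevant": true},
#                  {"filename": "b.md", "relevant": false}]}
# which files_relevance reduces to {"a.py": True, "b.md": False}.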

async def related_recent_context(question: str, recent_memories: List[str], rotator) -> str:
    """
    Use Qwen to select related items from recent memories.
    Enhanced function for better context memory ability.
    """
    if not recent_memories:
        return ""
    
    sys = "Pick only items that directly relate to the new question. Output the selected items verbatim, no commentary. If none, output nothing."
    numbered = [{"id": i+1, "text": s} for i, s in enumerate(recent_memories)]
    user = f"Question: {question}\nCandidates:\n{json.dumps(numbered, ensure_ascii=False)}\nSelect any related items and output ONLY their 'text' lines concatenated."
    
    try:
        # Use Qwen for better reasoning and context selection
        out = await qwen_chat(sys, user, rotator)
        return out.strip()
    except Exception as e:
        logger.warning(f"Recent-related Qwen error: {e}")
        return ""