import os
import asyncio
from typing import List

from utils.logger import get_logger
from utils.api.rotator import robust_post_json, APIKeyRotator
from utils.api.router import qwen_chat_completion, nvidia_large_chat_completion

logger = get_logger("SUM", __name__)

# Create a module-level NVIDIA API key rotator (uses NVIDIA_API_1..N)
ROTATOR = APIKeyRotator(prefix="NVIDIA_API_", max_slots=5)


async def llama_chat(messages, temperature: float = 0.2) -> str:
    model = os.getenv("NVIDIA_SMALL", "meta/llama-3.1-8b-instruct")
    # Get key via rotator (supports rotation/retries in robust_post_json)
    key = ROTATOR.get_key()
    if not key:
        raise RuntimeError("NVIDIA API key not set (NVIDIA_API_*)")
    url = "https://integrate.api.nvidia.com/v1/chat/completions"
    headers = {"Content-Type": "application/json", "Authorization": f"Bearer {key}"}
    payload = {"model": model, "temperature": temperature, "messages": messages}
    data = await robust_post_json(url, headers, payload, ROTATOR)
    return data["choices"][0]["message"]["content"].strip()


async def llama_summarize(text: str, max_sentences: int = 3) -> str:
    """Flexible summarization using NVIDIA Small (Llama) for short text, NVIDIA Large for long context."""
    text = (text or "").strip()
    if not text:
        return ""
    # Use NVIDIA Large for long context (>1500 chars), NVIDIA Small for short context
    if len(text) > 1500:
        logger.info(f"[SUMMARIZER] Using NVIDIA Large for long context ({len(text)} chars)")
        return await nvidia_large_summarize(text, max_sentences)
    else:
        logger.info(f"[SUMMARIZER] Using NVIDIA Small for short context ({len(text)} chars)")
        return await nvidia_small_summarize(text, max_sentences)


async def nvidia_small_summarize(text: str, max_sentences: int = 3) -> str:
    """Summarization using NVIDIA Small (Llama) for short text."""
    system = (
        "You are a precise summarizer. Produce a clear, faithful summary of the user's text. "
        f"Return ~{max_sentences} sentences, no comments, no preface, no markdown."
    )
    user = f"Summarize this text:\n\n{text}"
    try:
        return await llama_chat([
            {"role": "system", "content": system},
            {"role": "user", "content": user},
        ])
    except Exception as e:
        logger.warning(f"NVIDIA Small summarization failed: {e}; using fallback")
        return naive_fallback(text, max_sentences)


async def nvidia_large_summarize(text: str, max_sentences: int = 3) -> str:
    """Summarization using NVIDIA Large (GPT-OSS) for long context."""
    system = (
        "You are a precise summarizer. Produce a clear, faithful summary of the user's text. "
        f"Return ~{max_sentences} sentences, no comments, no preface, no markdown."
    )
    user = f"Summarize this text:\n\n{text}"
    try:
        return await nvidia_large_chat_completion(system, user, ROTATOR)
    except Exception as e:
        logger.warning(f"NVIDIA Large summarization failed: {e}; using fallback")
        return naive_fallback(text, max_sentences)


def naive_fallback(text: str, max_sentences: int = 3) -> str:
    parts = [p.strip() for p in text.split('. ') if p.strip()]
    return '. '.join(parts[:max_sentences])


async def summarize_text(text: str, max_sentences: int = 6, chunk_size: int = 2500) -> str:
    """Hierarchical summarization for long texts using flexible model selection."""
    if not text:
        return ""
    if len(text) <= chunk_size:
        return await llama_summarize(text, max_sentences=max_sentences)

    # Split into chunks on paragraph boundaries if possible
    paragraphs = text.split('\n\n')
    chunks: List[str] = []
    buf = []
    total = 0
    for p in paragraphs:
        if total + len(p) > chunk_size and buf:
            chunks.append('\n\n'.join(buf))
            buf, total = [], 0
        buf.append(p)
        total += len(p)
    if buf:
        chunks.append('\n\n'.join(buf))

    # Process chunks with flexible model selection
    partials = []
    for ch in chunks:
        partials.append(await llama_summarize(ch, max_sentences=3))
        await asyncio.sleep(0)

    # Combine and summarize with flexible model selection
    combined = '\n'.join(partials)
    return await llama_summarize(combined, max_sentences=max_sentences)


async def clean_chunk_text(text: str) -> str:
    """Use Qwen LLM to remove headers/footers and personally identifying/institution boilerplate.

    Keep the core academic content intact. Do not remove page numbers or section titles.
    """
    content = (text or "").strip()
    if not content:
        return content
    system = (
        "You are a content cleaner. Remove boilerplate headers/footers like institution names, course codes, student IDs, "
        "emails, author IDs, document footers/headers repeated across pages. Keep headings and the main body content. "
        "Preserve meaningful section titles. Keep pagination references in the natural text if present. Return only cleaned text."
    )
    user = f"Clean this content by removing headers/footers and IDs, keep core content:\n\n{content}"
    try:
        # Use Qwen for better content cleaning
        return await qwen_chat_completion(system, user, ROTATOR)
    except Exception as e:
        logger.warning(f"Qwen cleaning failed: {e}; returning original text")
        return content


async def qwen_summarize(text: str, max_sentences: int = 3) -> str:
    """Use Qwen for better summarization with thinking mode."""
    text = (text or "").strip()
    if not text:
        return ""
    system = (
        "You are a precise summarizer. Produce a clear, faithful summary of the user's text. "
        f"Return ~{max_sentences} sentences, no comments, no preface, no markdown."
    )
    user = f"Summarize this text:\n\n{text}"
    try:
        return await qwen_chat_completion(system, user, ROTATOR)
    except Exception as e:
        logger.warning(f"Qwen summarization failed: {e}; using fallback")
        return naive_fallback(text, max_sentences)


# Backward-compatible name used by app.py
async def cheap_summarize(text: str, max_sentences: int = 3) -> str:
    """Backward-compatible summarization with flexible model selection."""
    return await llama_summarize(text, max_sentences)
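

# ---------------------------------------------------------------------------
# Minimal usage sketch (illustrative only, not part of the module's API):
# shows one way clean_chunk_text and summarize_text might be driven from a
# script. The sample text is an assumption; real use requires at least one
# NVIDIA_API_* key in the environment (otherwise llama_chat raises
# RuntimeError and the naive fallback is returned by the callers above).
# ---------------------------------------------------------------------------
if __name__ == "__main__":

    async def _demo() -> None:
        # Hypothetical input text, short enough to route to NVIDIA Small.
        sample = (
            "Transformers process tokens in parallel using self-attention. "
            "Each layer mixes information across positions, while feed-forward "
            "blocks transform each position independently."
        )
        cleaned = await clean_chunk_text(sample)
        summary = await summarize_text(cleaned, max_sentences=2)
        print(summary)

    asyncio.run(_demo())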