Spaces:
Sleeping
Sleeping
File size: 38,056 Bytes
53cf39f 2d67168 53cf39f 2364a33 53cf39f 0d58cb1 019d6a8 1791aa5 019d6a8 0d58cb1 8d133dc 0d58cb1 74bc070 8d133dc 74bc070 0d58cb1 8d133dc 0d58cb1 74bc070 8d133dc 634c315 16ba0a7 634c315 019d6a8 1791aa5 019d6a8 16ba0a7 019d6a8 16ba0a7 019d6a8 74bc070 019d6a8 74bc070 019d6a8 74bc070 019d6a8 74bc070 019d6a8 74bc070 048b397 019d6a8 74bc070 019d6a8 634c315 2908396 634c315 53f192f 634c315 2908396 634c315 2908396 634c315 2908396 634c315 2908396 634c315 2908396 634c315 2908396 53f192f 2908396 634c315 2908396 634c315 2908396 634c315 53f192f 2908396 634c315 2908396 634c315 2908396 634c315 2908396 634c315 2364a33 8d133dc 74bc070 8d133dc 74bc070 8d133dc 74bc070 8d133dc 74bc070 8d133dc 74bc070 8d133dc 2364a33 53cf39f 2d67168 53cf39f 2d67168 53cf39f 2d67168 2908396 ba0d5f8 2d67168 ba0d5f8 2d67168 2908396 2d67168 1791aa5 2d67168 0d58cb1 2d67168 019d6a8 0d58cb1 2d67168 2364a33 048b397 2364a33 2d67168 53cf39f 2d67168 53cf39f 048b397 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 
528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 |
"""
PDF generation utilities for StudyBuddy
"""
import os
import tempfile
import markdown
import re
from datetime import datetime
from typing import List, Dict
from fastapi import HTTPException
from utils.logger import get_logger
logger = get_logger("PDF", __name__)
async def _parse_markdown_content(content: str, heading1_style, heading2_style, heading3_style, normal_style, code_style):
    """
    Convert markdown text into a list of reportlab flowables.

    The parser is line based. It recognises ATX headers, fenced code blocks
    (mermaid fences are rendered to an image when possible), bullet and
    numbered lists, blockquotes, horizontal rules and plain paragraphs.
    Inline formatting is delegated to ``_format_inline_markdown`` and code
    colouring to ``_apply_syntax_highlight``.

    Args:
        content: Markdown source text.
        heading1_style, heading2_style, heading3_style: Paragraph styles for
            header levels 1-3 (deeper levels use ``normal_style``).
        normal_style: Paragraph style for body text, lists and quotes.
        code_style: Paragraph style for code blocks and their language label.

    Returns:
        A list of reportlab flowables in document order.
    """
    import io
    from reportlab.lib.units import inch
    from reportlab.platypus import Image, Paragraph, Spacer, XPreformatted

    numbered_item = re.compile(r'^(\d+)\.\s+(.*)$')
    rule_line = re.compile(r'^(?:-{3,}|\*{3,})$')
    heading_styles = {1: heading1_style, 2: heading2_style, 3: heading3_style}

    def _xml_escape(text):
        # '&' must be replaced first so the entities added below survive.
        return text.replace('&', '&amp;').replace('<', '&lt;').replace('>', '&gt;')

    def _clean_code(code_lines):
        # Normalise line endings, expand tabs to tab stops (keeps the code's
        # indentation), drop characters the Courier font cannot draw, then
        # escape for reportlab's XML-like markup.
        raw = '\n'.join(code_lines).replace('\r\n', '\n').replace('\r', '\n')
        raw = raw.expandtabs(4)
        raw = re.sub(r'[^\x0A\x20-\x7E]', '', raw)
        return _xml_escape(raw)

    def _list_indent(raw_line):
        # Nesting depth comes from the *unstripped* line (two columns per
        # level). Non-breaking spaces are used because reportlab collapses
        # ordinary leading whitespace inside a Paragraph.
        expanded = raw_line.expandtabs(4)
        depth = (len(expanded) - len(expanded.lstrip())) // 2
        return '&nbsp;' * (4 * depth)

    def _starts_block(text):
        # True when a (stripped) line opens something other than paragraph text.
        return (
            text.startswith(('#', '```', '- ', '* ', '> '))
            or numbered_item.match(text) is not None
            or rule_line.match(text) is not None
        )

    story = []
    lines = content.split('\n')
    i = 0
    while i < len(lines):
        raw_line = lines[i]
        line = raw_line.strip()

        if not line:
            story.append(Spacer(1, 6))
            i += 1
            continue

        numbered = numbered_item.match(line)

        # Headers
        if line.startswith('#'):
            level = len(line) - len(line.lstrip('#'))
            header_text = _format_inline_markdown(line.lstrip('# ').strip())
            story.append(Paragraph(header_text, heading_styles.get(level, normal_style)))

        # Fenced code blocks with language detection
        elif line.startswith('```'):
            language = line[3:].strip() if len(line) > 3 else 'text'
            if language == 'text':
                language = _detect_language_from_content(lines, i)

            code_lines = []
            i += 1
            while i < len(lines) and not lines[i].strip().startswith('```'):
                code_lines.append(lines[i])
                i += 1
            # ``i`` now sits on the closing fence (or past the end if the
            # fence was never closed); the shared ``i += 1`` below skips it.

            if code_lines and language.lower() == 'mermaid':
                diagram = None
                try:
                    # Retry logic lives in helpers.diagram.
                    from helpers.diagram import _render_mermaid_with_retry
                    img_bytes = await _render_mermaid_with_retry('\n'.join(code_lines))
                    if img_bytes:
                        diagram = Image(io.BytesIO(img_bytes))
                        # Fit within page width (~6 inches after margins).
                        max_width = 6.0 * inch
                        if diagram.drawWidth > max_width:
                            scale = max_width / float(diagram.drawWidth)
                            diagram.drawWidth = max_width
                            diagram.drawHeight = diagram.drawHeight * scale
                    else:
                        logger.warning("[PDF] Mermaid render returned empty image after retries, falling back to code block")
                except Exception as me:
                    diagram = None
                    logger.warning(f"[PDF] Mermaid render failed after retries, falling back to code block: {me}")

                if diagram is not None:
                    story.append(diagram)
                    story.append(Spacer(1, 12))
                else:
                    # Fallback: show the mermaid source as a plain code block.
                    story.append(Paragraph("<font color='#9aa5b1' size='8'>[MERMAID DIAGRAM]</font>", code_style))
                    story.append(XPreformatted(_clean_code(code_lines), code_style))

            elif code_lines:
                highlighted = _apply_syntax_highlight(_clean_code(code_lines), language)
                # The label comes from user markdown, so it is escaped too.
                lang_label = _xml_escape(language.upper())
                story.append(Paragraph(f"<font color='#9aa5b1' size='8'>[{lang_label}]</font>", code_style))
                # XPreformatted preserves spacing while still honouring markup.
                story.append(XPreformatted(highlighted, code_style))

        # Bullet lists (including nested)
        elif line.startswith(('- ', '* ')):
            list_text = _format_inline_markdown(line[2:].strip())
            story.append(Paragraph(f"{_list_indent(raw_line)}• {list_text}", normal_style))

        # Numbered lists (including nested); the original number is kept
        elif numbered:
            list_text = _format_inline_markdown(numbered.group(2))
            story.append(Paragraph(f"{_list_indent(raw_line)}{numbered.group(1)}. {list_text}", normal_style))

        # Blockquotes
        elif line.startswith('> '):
            quote_text = _format_inline_markdown(line[2:].strip())
            story.append(Paragraph(f"<i>{quote_text}</i>", normal_style))

        # Horizontal rules: only lines made solely of dashes or asterisks, so
        # text such as "***important***" is not swallowed.
        elif rule_line.match(line):
            story.append(Spacer(1, 12))
            story.append(Paragraph("_" * 50, normal_style))
            story.append(Spacer(1, 12))

        # Regular paragraphs: join continuation lines into one paragraph
        else:
            paragraph_lines = [line]
            i += 1
            while i < len(lines):
                next_line = lines[i].strip()
                if not next_line or _starts_block(next_line):
                    break
                paragraph_lines.append(next_line)
                i += 1
            story.append(Paragraph(_format_inline_markdown(' '.join(paragraph_lines)), normal_style))
            continue  # ``i`` already points at the next unprocessed line

        i += 1

    return story
def _detect_language_from_content(lines: list, start_index: int) -> str:
"""
Auto-detect programming language from code content
"""
# Look at the next few lines to detect language
sample_lines = []
for i in range(start_index + 1, min(start_index + 10, len(lines))):
if lines[i].strip().startswith('```'):
break
sample_lines.append(lines[i])
sample_text = '\n'.join(sample_lines)
# Python detection
if (re.search(r'\bdef\s+\w+', sample_text) or
re.search(r'\bclass\s+\w+', sample_text) or
re.search(r'\bimport\s+\w+', sample_text) or
re.search(r'\bfrom\s+\w+', sample_text)):
return 'python'
# JavaScript detection
if (re.search(r'\bfunction\s+\w+', sample_text) or
re.search(r'\bvar\s+\w+', sample_text) or
re.search(r'\blet\s+\w+', sample_text) or
re.search(r'\bconst\s+\w+', sample_text) or
re.search(r'=>', sample_text)):
return 'javascript'
# Java detection
if (re.search(r'\bpublic\s+class', sample_text) or
re.search(r'\bprivate\s+\w+', sample_text) or
re.search(r'\bSystem\.out\.print', sample_text) or
re.search(r'\bimport\s+java\.', sample_text)):
return 'java'
# JSON detection
if (re.search(r'^\s*[{}]', sample_text) or
re.search(r'"[^"]*"\s*:', sample_text) or
re.search(r'\btrue\b|\bfalse\b|\bnull\b', sample_text)):
return 'json'
# XML/HTML detection
if (re.search(r'<[^>]+>', sample_text) or
re.search(r'<[^>]+>', sample_text)):
return 'xml'
# SQL detection
if (re.search(r'\bSELECT\b', sample_text, re.IGNORECASE) or
re.search(r'\bFROM\b', sample_text, re.IGNORECASE) or
re.search(r'\bWHERE\b', sample_text, re.IGNORECASE) or
re.search(r'\bINSERT\b', sample_text, re.IGNORECASE)):
return 'sql'
# YAML detection
if (re.search(r'^\s*\w+:', sample_text) or
re.search(r'^\s*-\s+', sample_text)):
return 'yaml'
# Bash detection
if (re.search(r'^\s*#!', sample_text) or
re.search(r'\$\w+', sample_text) or
re.search(r'^\s*\w+.*\|', sample_text)):
return 'bash'
return 'text'
def _format_code_block(code_text: str, language: str) -> str:
"""
Deprecated: We now render code blocks with Preformatted to avoid paragraph parser errors.
Kept for compatibility if referenced elsewhere; returns escaped plain text.
"""
code_text = code_text.replace('&', '&').replace('<', '<').replace('>', '>')
return f"<font name='Courier' size='9'>{code_text}</font>"
def _highlight_python(code: str) -> str:
"""Python syntax highlighting"""
# Keywords
keywords = ['def', 'class', 'if', 'else', 'elif', 'for', 'while', 'try', 'except', 'finally',
'import', 'from', 'as', 'with', 'return', 'yield', 'lambda', 'and', 'or', 'not',
'in', 'is', 'True', 'False', 'None', 'pass', 'break', 'continue', 'raise', 'assert']
# Built-in functions
builtins = ['print', 'len', 'str', 'int', 'float', 'list', 'dict', 'tuple', 'set', 'range',
'enumerate', 'zip', 'map', 'filter', 'sorted', 'reversed', 'open', 'input']
# String literals
code = re.sub(r'("""[\s\S]*?""")', r'<font color="#008000">\1</font>', code) # Triple quotes
code = re.sub(r'(".*?")', r'<font color="#008000">\1</font>', code) # Double quotes
code = re.sub(r"('''[\s\S]*?''')", r'<font color="#008000">\1</font>', code) # Triple single quotes
code = re.sub(r"('.*?')", r'<font color="#008000">\1</font>', code) # Single quotes
# Comments
code = re.sub(r'(#.*?)$', r'<font color="#808080">\1</font>', code, flags=re.MULTILINE)
# Keywords
for keyword in keywords:
code = re.sub(r'\b(' + keyword + r')\b', r'<font color="#0000FF"><b>\1</b></font>', code)
# Built-in functions
for builtin in builtins:
code = re.sub(r'\b(' + builtin + r')\b', r'<font color="#800080">\1</font>', code)
# Numbers
code = re.sub(r'\b(\d+\.?\d*)\b', r'<font color="#FF0000">\1</font>', code)
return f"<font name='Courier' size='9'>{code}</font>"
def _highlight_json(code: str) -> str:
"""JSON syntax highlighting"""
# Strings
code = re.sub(r'(".*?")', r'<font color="#008000">\1</font>', code)
# Numbers
code = re.sub(r'\b(\d+\.?\d*)\b', r'<font color="#FF0000">\1</font>', code)
# Keywords
code = re.sub(r'\b(true|false|null)\b', r'<font color="#0000FF"><b>\1</b></font>', code)
# Punctuation
code = re.sub(r'([{}[\]])', r'<font color="#800080"><b>\1</b></font>', code)
code = re.sub(r'([,])', r'<font color="#800080">\1</font>', code)
return f"<font name='Courier' size='9'>{code}</font>"
def _highlight_xml(code: str) -> str:
"""XML/HTML syntax highlighting"""
# Tags
code = re.sub(r'(<[^>]*>)', r'<font color="#0000FF"><b>\1</b></font>', code)
# Attributes
code = re.sub(r'(\w+)=', r'<font color="#800080">\1</font>=', code)
# Attribute values
code = re.sub(r'="([^"]*)"', r'="<font color="#008000">\1</font>"', code)
# Comments
code = re.sub(r'(<!--[\s\S]*?-->)', r'<font color="#808080">\1</font>', code)
return f"<font name='Courier' size='9'>{code}</font>"
def _highlight_java(code: str) -> str:
"""Java syntax highlighting"""
# Keywords
keywords = ['public', 'private', 'protected', 'static', 'final', 'class', 'interface', 'extends',
'implements', 'if', 'else', 'for', 'while', 'do', 'switch', 'case', 'break', 'continue',
'return', 'try', 'catch', 'finally', 'throw', 'throws', 'new', 'this', 'super', 'import',
'package', 'void', 'int', 'long', 'float', 'double', 'boolean', 'char', 'byte', 'short',
'true', 'false', 'null']
# String literals
code = re.sub(r'(".*?")', r'<font color="#008000">\1</font>', code)
code = re.sub(r"('.*?')", r'<font color="#008000">\1</font>', code)
# Comments
code = re.sub(r'(//.*?)$', r'<font color="#808080">\1</font>', code, flags=re.MULTILINE)
code = re.sub(r'(/\*[\s\S]*?\*/)', r'<font color="#808080">\1</font>', code)
# Keywords
for keyword in keywords:
code = re.sub(r'\b(' + keyword + r')\b', r'<font color="#0000FF"><b>\1</b></font>', code)
# Numbers
code = re.sub(r'\b(\d+\.?\d*[fFdDlL]?)\b', r'<font color="#FF0000">\1</font>', code)
return f"<font name='Courier' size='9'>{code}</font>"
def _highlight_javascript(code: str) -> str:
"""JavaScript syntax highlighting"""
# Keywords
keywords = ['function', 'var', 'let', 'const', 'if', 'else', 'for', 'while', 'do', 'switch',
'case', 'break', 'continue', 'return', 'try', 'catch', 'finally', 'throw', 'new',
'this', 'typeof', 'instanceof', 'true', 'false', 'null', 'undefined', 'async', 'await']
# String literals
code = re.sub(r'(".*?")', r'<font color="#008000">\1</font>', code)
code = re.sub(r"('.*?')", r'<font color="#008000">\1</font>', code)
code = re.sub(r'(`.*?`)', r'<font color="#008000">\1</font>', code) # Template literals
# Comments
code = re.sub(r'(//.*?)$', r'<font color="#808080">\1</font>', code, flags=re.MULTILINE)
code = re.sub(r'(/\*[\s\S]*?\*/)', r'<font color="#808080">\1</font>', code)
# Keywords
for keyword in keywords:
code = re.sub(r'\b(' + keyword + r')\b', r'<font color="#0000FF"><b>\1</b></font>', code)
# Numbers
code = re.sub(r'\b(\d+\.?\d*)\b', r'<font color="#FF0000">\1</font>', code)
return f"<font name='Courier' size='9'>{code}</font>"
def _highlight_sql(code: str) -> str:
"""SQL syntax highlighting"""
# Keywords
keywords = ['SELECT', 'FROM', 'WHERE', 'INSERT', 'UPDATE', 'DELETE', 'CREATE', 'DROP', 'ALTER',
'TABLE', 'INDEX', 'VIEW', 'DATABASE', 'SCHEMA', 'JOIN', 'LEFT', 'RIGHT', 'INNER', 'OUTER',
'ON', 'GROUP', 'BY', 'ORDER', 'HAVING', 'UNION', 'DISTINCT', 'COUNT', 'SUM', 'AVG', 'MAX', 'MIN',
'AND', 'OR', 'NOT', 'IN', 'BETWEEN', 'LIKE', 'IS', 'NULL', 'ASC', 'DESC', 'LIMIT', 'OFFSET']
# String literals
code = re.sub(r"('.*?')", r'<font color="#008000">\1</font>', code)
# Comments
code = re.sub(r'(--.*?)$', r'<font color="#808080">\1</font>', code, flags=re.MULTILINE)
code = re.sub(r'(/\*[\s\S]*?\*/)', r'<font color="#808080">\1</font>', code)
# Keywords (case insensitive)
for keyword in keywords:
code = re.sub(r'\b(' + keyword + r')\b', r'<font color="#0000FF"><b>\1</b></font>', code, flags=re.IGNORECASE)
# Numbers
code = re.sub(r'\b(\d+\.?\d*)\b', r'<font color="#FF0000">\1</font>', code)
return f"<font name='Courier' size='9'>{code}</font>"
def _highlight_yaml(code: str) -> str:
"""YAML syntax highlighting"""
# Keys
code = re.sub(r'^(\s*)([^:]+):', r'\1<font color="#0000FF"><b>\2</b></font>:', code, flags=re.MULTILINE)
# String values
code = re.sub(r'(".*?")', r'<font color="#008000">\1</font>', code)
code = re.sub(r"('.*?')", r'<font color="#008000">\1</font>', code)
# Numbers
code = re.sub(r'\b(\d+\.?\d*)\b', r'<font color="#FF0000">\1</font>', code)
# Booleans
code = re.sub(r'\b(true|false|yes|no|on|off)\b', r'<font color="#800080"><b>\1</b></font>', code)
# Comments
code = re.sub(r'(#.*?)$', r'<font color="#808080">\1</font>', code, flags=re.MULTILINE)
return f"<font name='Courier' size='9'>{code}</font>"
def _highlight_bash(code: str) -> str:
"""Bash/Shell syntax highlighting"""
# Comments
code = re.sub(r'(#.*?)$', r'<font color="#808080">\1</font>', code, flags=re.MULTILINE)
# Commands (first word on line)
code = re.sub(r'^(\s*)([a-zA-Z_][a-zA-Z0-9_]*)', r'\1<font color="#0000FF"><b>\2</b></font>', code, flags=re.MULTILINE)
# Variables
code = re.sub(r'(\$[a-zA-Z_][a-zA-Z0-9_]*)', r'<font color="#800080">\1</font>', code)
code = re.sub(r'(\$\{[^}]+\})', r'<font color="#800080">\1</font>', code)
# Strings
code = re.sub(r'(".*?")', r'<font color="#008000">\1</font>', code)
code = re.sub(r"('.*?')", r'<font color="#008000">\1</font>', code)
# Redirections and pipes
code = re.sub(r'([<>|&])', r'<font color="#FF0000"><b>\1</b></font>', code)
return f"<font name='Courier' size='9'>{code}</font>"
def _format_inline_markdown(text: str) -> str:
"""
Format inline markdown elements (bold, italic, code, links)
"""
# Escape HTML characters first
text = text.replace('&', '&')
text = text.replace('<', '<')
text = text.replace('>', '>')
# Process in order of precedence to avoid nested tag conflicts
# 1. Inline code (`code`) - highest precedence, no nested formatting
text = re.sub(r'`([^`]+)`', r'<font name="Courier" size="9">\1</font>', text)
# 2. Bold text (**text** or __text__) - but not inside code blocks
text = re.sub(r'(?<!`)\*\*([^*]+)\*\*(?!`)', r'<b>\1</b>', text)
text = re.sub(r'(?<!`)__(?!_)([^_]+)__(?!`)', r'<b>\1</b>', text)
# 3. Italic text (*text* or _text_) - but not inside code blocks or bold
text = re.sub(r'(?<!`)(?<!\*)\*([^*]+)\*(?!\*)(?!`)', r'<i>\1</i>', text)
text = re.sub(r'(?<!`)(?<!_)_([^_]+)_(?!_)(?!`)', r'<i>\1</i>', text)
# 4. Strikethrough (~~text~~) - but not inside other formatting
text = re.sub(r'~~([^~]+)~~', r'<strike>\1</strike>', text)
# 5. Links [text](url) - convert to clickable text
text = re.sub(r'\[([^\]]+)\]\(([^)]+)\)', r'<link href="\2">\1</link>', text)
# 6. Line breaks
text = text.replace('\n', '<br/>')
return text
def _apply_syntax_highlight(escaped_code: str, language: str) -> str:
"""
Apply professional IDE-like syntax highlighting on XML-escaped code text.
Works with escaped entities (< > &), so regexes should not rely on raw quotes.
"""
def sub_outside_tags(pattern, repl, text, flags=0):
parts = re.split(r'(</?[^>]+>)', text)
for idx in range(0, len(parts)):
if idx % 2 == 0: # outside tags
parts[idx] = re.sub(pattern, repl, parts[idx], flags=flags)
return ''.join(parts)
out = escaped_code
lang = (language or 'text').lower()
if lang in ('python', 'py'):
# Comments first (gray)
out = sub_outside_tags(r"(#[^\n]*)", r"<font color='#6a737d'>\1</font>", out)
# Docstrings (green)
out = sub_outside_tags(r'("""[\s\S]*?""")', r"<font color='#28a745'>\1</font>", out)
out = sub_outside_tags(r"('''[\s\S]*?''')", r"<font color='#28a745'>\1</font>", out)
# Keywords (purple)
keywords = (
'def|class|if|else|elif|for|while|try|except|finally|import|from|as|with|return|yield|lambda|and|or|not|in|is|True|False|None|pass|break|continue|raise|assert|global|nonlocal'
)
out = sub_outside_tags(rf"\b({keywords})\b", r"<font color='#6f42c1'><b>\1</b></font>", out)
# Built-in functions (blue)
builtins = (
'print|len|str|int|float|list|dict|tuple|set|range|enumerate|zip|map|filter|sorted|reversed|open|input|type|isinstance|hasattr|getattr|setattr|delattr'
)
out = sub_outside_tags(rf"\b({builtins})\b", r"<font color='#005cc5'>\1</font>", out)
elif lang in ('javascript', 'js', 'typescript', 'ts'):
# Comments (gray)
out = sub_outside_tags(r"(//[^\n]*)", r"<font color='#6a737d'>\1</font>", out)
out = sub_outside_tags(r"/\*[\s\S]*?\*/", lambda m: f"<font color='#6a737d'>{m.group(0)}</font>", out)
# Keywords (purple)
keywords = (
'function|var|let|const|if|else|for|while|do|switch|case|break|continue|return|try|catch|finally|throw|new|this|typeof|instanceof|true|false|null|undefined|async|await|class|extends|implements|interface|type|namespace|module|export|import|default|public|private|protected|static|abstract|readonly'
)
out = sub_outside_tags(rf"\b({keywords})\b", r"<font color='#6f42c1'><b>\1</b></font>", out)
# Built-in objects (blue)
builtins = (
'console|window|document|Array|Object|String|Number|Boolean|Date|Math|JSON|Promise|Set|Map|WeakSet|WeakMap|Symbol|Proxy|Reflect'
)
out = sub_outside_tags(rf"\b({builtins})\b", r"<font color='#005cc5'>\1</font>", out)
elif lang in ('json',):
# Boolean and null values (blue)
out = sub_outside_tags(r"\b(true|false|null)\b", r"<font color='#005cc5'><b>\1</b></font>", out)
# Keys (purple)
out = sub_outside_tags(r"("[^&]*?")(\s*:)", r"<font color='#6f42c1'>\1</font>\2", out)
elif lang in ('bash', 'sh', 'shell'):
# Comments (gray)
out = sub_outside_tags(r"(#[^\n]*)", r"<font color='#6a737d'>\1</font>", out)
# Commands (purple)
out = sub_outside_tags(r"(^|\n)(\s*)([a-zA-Z_][a-zA-Z0-9_-]*)", r"\1\2<font color='#6f42c1'><b>\3</b></font>", out)
# Variables (blue)
out = sub_outside_tags(r"(\$[a-zA-Z_][a-zA-Z0-9_]*)", r"<font color='#005cc5'>\1</font>", out)
out = sub_outside_tags(r"(\$\{[^}]+\})", r"<font color='#005cc5'>\1</font>", out)
elif lang in ('yaml', 'yml'):
# Keys (purple)
out = sub_outside_tags(r"(^|\n)(\s*)([^:\n]+)(:)", r"\1\2<font color='#6f42c1'>\3</font>\4", out)
# Boolean values (blue)
out = sub_outside_tags(r"\b(true|false|yes|no|on|off)\b", r"<font color='#005cc5'><b>\1</b></font>", out, flags=re.IGNORECASE)
elif lang in ('sql',):
# Keywords (purple)
keywords = (
'SELECT|FROM|WHERE|INSERT|UPDATE|DELETE|CREATE|DROP|ALTER|TABLE|INDEX|VIEW|DATABASE|SCHEMA|JOIN|LEFT|RIGHT|INNER|OUTER|ON|GROUP|BY|ORDER|HAVING|UNION|DISTINCT|COUNT|SUM|AVG|MAX|MIN|AND|OR|NOT|IN|BETWEEN|LIKE|IS|NULL|ASC|DESC|LIMIT|OFFSET|CASE|WHEN|THEN|ELSE|END|EXISTS|ALL|ANY|SOME'
)
out = sub_outside_tags(rf"\b({keywords})\b", r"<font color='#6f42c1'><b>\1</b></font>", out, flags=re.IGNORECASE)
elif lang in ('java',):
# Comments (gray)
out = sub_outside_tags(r"(//[^\n]*)", r"<font color='#6a737d'>\1</font>", out)
out = sub_outside_tags(r"/\*[\s\S]*?\*/", lambda m: f"<font color='#6a737d'>{m.group(0)}</font>", out)
# Keywords (purple)
keywords = (
'public|private|protected|static|final|class|interface|extends|implements|if|else|for|while|do|switch|case|break|continue|return|try|catch|finally|throw|throws|new|this|super|import|package|void|int|long|float|double|boolean|char|byte|short|true|false|null|abstract|native|synchronized|volatile|transient|strictfp'
)
out = sub_outside_tags(rf"\b({keywords})\b", r"<font color='#6f42c1'><b>\1</b></font>", out)
# Built-in classes (blue)
builtins = (
'String|Object|Integer|Long|Float|Double|Boolean|Character|Byte|Short|System|Math|ArrayList|HashMap|HashSet|LinkedList|Vector|Collections|Arrays'
)
out = sub_outside_tags(rf"\b({builtins})\b", r"<font color='#005cc5'>\1</font>", out)
elif lang in ('css',):
# Selectors (purple)
out = sub_outside_tags(r"([.#]?[a-zA-Z][a-zA-Z0-9_-]*)(\s*\{)", r"<font color='#6f42c1'>\1</font>\2", out)
# Properties (blue)
out = sub_outside_tags(r"([a-zA-Z-]+)(\s*:)", r"<font color='#005cc5'>\1</font>\2", out)
# Values (green)
out = sub_outside_tags(r"(\s*:\s*)([^;]+)(;)", r"\1<font color='#28a745'>\2</font>\3", out)
elif lang in ('html', 'xml'):
# Tags (purple)
out = sub_outside_tags(r"(<[^>]*>)", r"<font color='#6f42c1'><b>\1</b></font>", out)
# Attributes (blue)
out = sub_outside_tags(r"(\w+)=("[^&]*?")", r"<font color='#005cc5'>\1</font>=\2", out)
# Strings (green) - apply to all languages
out = sub_outside_tags(r"(".*?")", r"<font color='#28a745'>\1</font>", out)
out = sub_outside_tags(r"('.*?')", r"<font color='#28a745'>\1</font>", out)
out = sub_outside_tags(r"(`.*?`)", r"<font color='#28a745'>\1</font>", out)
# Numbers (orange) - apply to all languages
out = sub_outside_tags(r"\b(\d+\.?\d*)\b", r"<font color='#e36209'>\1</font>", out)
return out
def _render_mermaid_png(mermaid_text: str) -> bytes:
    """
    Render mermaid code to PNG via the Kroki service (no local mermaid-cli
    dependency).

    The diagram is pre-validated by checking that its first meaningful line
    starts with a known diagram keyword. Leading ``%%`` comment/directive
    lines are skipped and the keyword is compared case-insensitively, so
    ``gitGraph`` and diagrams that start with an ``%%{init: ...}%%``
    directive are accepted.

    Args:
        mermaid_text: Mermaid diagram source.

    Returns:
        PNG bytes on success; empty bytes on any validation, HTTP or
        connection failure (failures are logged as warnings, never raised).
    """
    import json
    import urllib.error
    import urllib.request

    # Lower-case prefixes; "statediagram" also covers "stateDiagram-v2".
    known_diagram_types = (
        'graph', 'flowchart', 'sequencediagram', 'classdiagram', 'statediagram',
        'erdiagram', 'journey', 'gantt', 'pie', 'gitgraph',
        'mindmap', 'timeline', 'quadrantchart', 'requirementdiagram',
    )

    try:
        if not mermaid_text or not mermaid_text.strip():
            logger.warning("[PDF] Empty mermaid content")
            return b""

        cleaned_text = mermaid_text.strip()

        # First line that is neither blank nor a %% comment/directive.
        first_line = ''
        for candidate in cleaned_text.splitlines():
            candidate = candidate.strip()
            if candidate and not candidate.startswith('%%'):
                first_line = candidate
                break

        if not first_line.lower().startswith(known_diagram_types):
            logger.warning(f"[PDF] Invalid mermaid diagram type: {cleaned_text[:50]}...")
            return b""

        # Kroki POST API for mermaid -> png
        data = json.dumps({"diagram_source": cleaned_text}).encode("utf-8")
        req = urllib.request.Request(
            url="https://kroki.io/mermaid/png",
            data=data,
            headers={"Content-Type": "application/json"},
            method="POST"
        )
        with urllib.request.urlopen(req, timeout=15) as resp:
            if resp.status == 200:
                return resp.read()
            logger.warning(f"[PDF] Kroki returned status {resp.status}")
            return b""
    # HTTPError is a subclass of URLError, so it must be handled first.
    except urllib.error.HTTPError as e:
        if e.code == 400:
            logger.warning(f"[PDF] Kroki mermaid syntax error (400): {e.reason}")
        else:
            logger.warning(f"[PDF] Kroki HTTP error {e.code}: {e.reason}")
    except urllib.error.URLError as e:
        logger.warning(f"[PDF] Kroki connection error: {e.reason}")
    except Exception as e:
        logger.warning(f"[PDF] Kroki mermaid render error: {e}")
    return b""
async def _format_references_ieee(sources: List[Dict]) -> List[str]:
"""Format sources in IEEE citation style using NVIDIA API."""
try:
from utils.api.router import generate_answer_with_model
from helpers.setup import nvidia_rotator
if not sources or not nvidia_rotator:
return []
# Prepare source data for formatting
source_data = []
for i, source in enumerate(sources, 1):
source_info = {
"number": i,
"filename": source.get("filename", "Unknown"),
"url": source.get("url", ""),
"topic_name": source.get("topic_name", ""),
"kind": source.get("kind", "document")
}
source_data.append(source_info)
sys_prompt = """You are an expert at formatting academic references in IEEE style.
Format the provided sources as IEEE-style references. Each reference should be numbered and formatted according to IEEE standards.
For web sources: [1] Author/Organization, "Title," Website Name, URL, accessed: Date.
For documents: [1] Author, "Title," Document Type, Filename, Year.
Return only the formatted references, one per line, numbered sequentially."""
user_prompt = f"Format these sources in IEEE style:\n\n{source_data}"
selection = {"provider": "nvidia", "model": "meta/llama-3.1-8b-instruct"}
response = await generate_answer_with_model(selection, sys_prompt, user_prompt, None, nvidia_rotator)
# Parse the response into individual references
references = [line.strip() for line in response.split('\n') if line.strip() and line.strip().startswith('[')]
# If NVIDIA formatting fails, create basic IEEE format
if not references:
references = []
for i, source in enumerate(sources, 1):
if source.get("kind") == "web":
ref = f"[{i}] {source.get('topic_name', 'Unknown')}, \"{source.get('filename', 'Web Source')}\", {source.get('url', '')}, accessed: {datetime.now().strftime('%B %d, %Y')}."
else:
ref = f"[{i}] {source.get('topic_name', 'Unknown')}, \"{source.get('filename', 'Document')}\", Document, {datetime.now().year}."
references.append(ref)
return references
except Exception as e:
logger.warning(f"[PDF] IEEE reference formatting failed: {e}")
# Fallback to basic formatting
references = []
for i, source in enumerate(sources, 1):
if source.get("kind") == "web":
ref = f"[{i}] {source.get('topic_name', 'Unknown')}, \"{source.get('filename', 'Web Source')}\", {source.get('url', '')}, accessed: {datetime.now().strftime('%B %d, %Y')}."
else:
ref = f"[{i}] {source.get('topic_name', 'Unknown')}, \"{source.get('filename', 'Document')}\", Document, {datetime.now().year}."
references.append(ref)
return references
async def generate_report_pdf(report_content: str, user_id: str, project_id: str, sources: List[Dict] = None) -> bytes:
    """
    Generate a PDF from report content using reportlab

    Args:
        report_content: Markdown content of the report
        user_id: User ID for logging
        project_id: Project ID for logging
        sources: Optional list of source dicts; when given, a "References"
            page in IEEE style is appended. ``None`` or empty adds nothing.

    Returns:
        PDF content as bytes

    Raises:
        HTTPException: If PDF generation fails
    """
    try:
        from io import BytesIO
        from reportlab.lib import colors
        from reportlab.lib.pagesizes import A4
        from reportlab.lib.styles import getSampleStyleSheet, ParagraphStyle
        from reportlab.platypus import SimpleDocTemplate, Paragraph, Spacer, PageBreak

        logger.info(f"[PDF] Generating PDF for user {user_id}, project {project_id}")

        # The document is assembled in memory and returned as bytes.
        buffer = BytesIO()
        doc = SimpleDocTemplate(
            buffer,
            pagesize=A4,
            rightMargin=72,
            leftMargin=72,
            topMargin=72,
            bottomMargin=18
        )

        styles = getSampleStyleSheet()
        dark_text = colors.HexColor('#2c3e50')

        title_style = ParagraphStyle(
            'CustomTitle',
            parent=styles['Heading1'],
            fontSize=24,
            spaceAfter=30,
            textColor=dark_text,
            borderWidth=1,
            borderColor=colors.HexColor('#3498db'),
            borderPadding=10
        )
        heading1_style = ParagraphStyle(
            'CustomHeading1',
            parent=styles['Heading1'],
            fontSize=18,
            spaceAfter=12,
            spaceBefore=20,
            textColor=dark_text
        )
        heading2_style = ParagraphStyle(
            'CustomHeading2',
            parent=styles['Heading2'],
            fontSize=16,
            spaceAfter=10,
            spaceBefore=16,
            textColor=dark_text
        )
        heading3_style = ParagraphStyle(
            'CustomHeading3',
            parent=styles['Heading3'],
            fontSize=14,
            spaceAfter=8,
            spaceBefore=12,
            textColor=dark_text
        )
        normal_style = ParagraphStyle(
            'CustomNormal',
            parent=styles['Normal'],
            fontSize=11,
            spaceAfter=6,
            leading=14
        )

        # IDE-like code styling: bordered box, no background fill.
        base_code_parent = styles['Code'] if 'Code' in styles.byName else styles['Normal']
        code_style = ParagraphStyle(
            'Code',
            parent=base_code_parent,
            fontSize=9,
            fontName='Courier',
            textColor=dark_text,
            backColor=None,
            borderColor=colors.HexColor('#e1e8ed'),
            borderWidth=1,
            borderPadding=8,
            leftIndent=12,
            rightIndent=12,
            spaceBefore=6,
            spaceAfter=6,
            leading=11
        )

        # Title block
        story = [
            Paragraph("StudyBuddy Report", title_style),
            Paragraph(f"<i>Generated on {datetime.now().strftime('%B %d, %Y at %I:%M %p')}</i>", normal_style),
            Spacer(1, 20),
        ]

        # Report body
        story.extend(await _parse_markdown_content(report_content, heading1_style, heading2_style, heading3_style, normal_style, code_style))

        # References section, only when sources were provided
        if sources:
            story.append(PageBreak())
            story.append(Paragraph("References", heading1_style))
            story.append(Spacer(1, 12))

            try:
                ieee_references = await _format_references_ieee(sources)
            except Exception as _ie:
                logger.warning(f"[PDF] Reference formatting failed, falling back: {_ie}")
                ieee_references = []
                for i, source in enumerate(sources, 1):
                    if source.get("kind") == "web":
                        ref = f"[{i}] {source.get('topic_name', 'Unknown')}, \"{source.get('filename', 'Web Source')}\", {source.get('url', '')}, accessed: {datetime.now().strftime('%B %d, %Y')}."
                    else:
                        ref = f"[{i}] {source.get('topic_name', 'Unknown')}, \"{source.get('filename', 'Document')}\", Document, {datetime.now().year}."
                    ieee_references.append(ref)

            for ref in ieee_references:
                # Reference text comes from filenames, URLs or model output.
                # Paragraph parses its text as XML-like markup, so a raw '&'
                # or '<' (e.g. "R&D.pdf") would otherwise abort the build.
                safe_ref = str(ref).replace('&', '&amp;').replace('<', '&lt;').replace('>', '&gt;')
                story.append(Paragraph(safe_ref, normal_style))
                story.append(Spacer(1, 6))

        doc.build(story)

        pdf_content = buffer.getvalue()
        buffer.close()

        logger.info(f"[PDF] Successfully generated PDF ({len(pdf_content)} bytes) for user {user_id}, project {project_id}")
        return pdf_content

    except ImportError:
        logger.error("[PDF] reportlab not installed. Install with: pip install reportlab")
        raise HTTPException(500, detail="PDF generation not available. Please install reportlab.")
    except Exception as e:
        logger.error(f"[PDF] Failed to generate PDF: {e}")
        # Keep error generic for client; avoid leaking internals
        raise HTTPException(500, detail="Failed to generate PDF") from e
|