#!/usr/bin/env node

import { readFileSync, writeFileSync, existsSync, readdirSync } from 'fs';
import { join, dirname } from 'path';
import { fileURLToPath, pathToFileURL } from 'url';

const __filename = fileURLToPath(import.meta.url);
const __dirname = dirname(__filename);

/**
 * Post-processor for cleaning Markdown content from LaTeX conversion.
 * Each function handles a specific type of cleanup for maintainability.
 * Every step takes the Markdown text and returns a new string; the only
 * side effect of a step is a progress message on stdout.
 */

/** Old-style matrix macros that are rewritten as \begin{…}…\end{…} environments. */
const MATRIX_ENVIRONMENTS = ['pmatrix', 'bmatrix', 'vmatrix'];

/** Code-fence language name -> snippet file extension (other languages are used as-is). */
const SNIPPET_EXTENSIONS = new Map([
  ['python', 'py'],
  ['javascript', 'js'],
  ['typescript', 'ts'],
  ['bash', 'sh'],
  ['shell', 'sh'],
]);

/**
 * Remove TeX low-level grouping commands that break KaTeX
 * @param {string} content - Markdown content
 * @returns {string} - Cleaned content
 */
function removeTexGroupingCommands(content) {
  console.log(' 🧹 Removing TeX grouping commands...');

  return content
    // The compound forms go first so no stray \mathopen{}\mathclose / \aftergroup is left behind
    .replace(/\\mathopen\{\}\\mathclose\\bgroup/g, '')
    .replace(/\\aftergroup\\egroup/g, '')
    .replace(/\\bgroup/g, '')
    .replace(/\\egroup/g, '');
}

/**
 * Simplify LaTeX delimiter constructions: `\left[` becomes `[` and `\right]`
 * becomes `]`, together with the whitespace just inside the bracket.
 * @param {string} content - Markdown content
 * @returns {string} - Cleaned content
 */
function simplifyLatexDelimiters(content) {
  console.log(' 🔧 Simplifying LaTeX delimiters...');

  // NOTE(review): mixed pairs such as \left[ ... \right) lose only one half here - confirm
  // that the converter never emits them.
  return content
    .replace(/\\left\[\s*/g, '[')
    .replace(/\s*\\right\]/g, ']');
}

/**
 * Remove orphaned LaTeX labels
 * @param {string} content - Markdown content
 * @returns {string} - Cleaned content
 */
function removeOrphanedLabels(content) {
  console.log(' 🏷️ Removing orphaned labels...');

  return content
    // Lines holding nothing but a label: drop the surrounding whitespace as well
    .replace(/^\s*\\label\{[^}]+\}\s*$/gm, '')
    // Any label still embedded in other text
    .replace(/\\label\{[^}]+\}/g, '');
}

/**
 * Fix KaTeX-incompatible math commands
 * @param {string} content - Markdown content
 * @returns {string} - Cleaned content
 */
function fixMathCommands(content) {
  console.log(' 📐 Fixing KaTeX-incompatible math commands...');

  // Replace \hdots with \ldots. The lookahead keeps longer command names that merely
  // start with "\hdots" untouched. Add more math command fixes here as needed.
  return content.replace(/\\hdots(?![a-zA-Z])/g, '\\ldots');
}

/**
 * Convert LaTeX matrix commands to KaTeX-compatible environments
 * (\pmatrix{...}, \bmatrix{...}, \vmatrix{...} -> \begin{X}...\end{X}).
 * The argument may contain at most one level of nested braces.
 * @param {string} content - Markdown content
 * @returns {string} - Content with fixed matrix commands
 */
function fixMatrixCommands(content) {
  console.log(' 🔢 Converting matrix commands to KaTeX format...');

  let fixedCount = 0;
  let result = content;

  // One pass per macro, in a fixed order, so a matrix nested inside another
  // macro's argument is still converted by its own (later) pass.
  for (const env of MATRIX_ENVIRONMENTS) {
    const pattern = new RegExp(String.raw`\\${env}\{([^{}]*(?:\{[^{}]*\}[^{}]*)*)\}`, 'g');

    result = result.replace(pattern, (_match, matrixContent) => {
      fixedCount++;
      // Rows are separated by "\\"; empty rows (e.g. after a trailing "\\") are dropped
      const rows = matrixContent
        .split('\\\\')
        .map((row) => row.trim())
        .filter((row) => row);
      return `\\begin{${env}}\n${rows.join(' \\\\\n')}\n\\end{${env}}`;
    });
  }

  if (fixedCount > 0) {
    console.log(` ✅ Fixed ${fixedCount} matrix command(s)`);
  }

  return result;
}

/**
 * Fix Unicode characters that break MDX/JSX parsing
 *
 * Characters are written as \u escapes so the patterns survive a re-encoding
 * of this source file.
 * @param {string} content - Markdown content
 * @returns {string} - Cleaned content
 */
function fixUnicodeIssues(content) {
  console.log(' 🌐 Fixing Unicode characters for MDX compatibility...');

  return content
    // Walk the math spans left to right ($$...$$ before $...$) so that prose between two
    // formulas is never taken for math, and replace every middle dot inside a span.
    // The trailing space keeps "\cdot" from fusing with a following letter.
    .replace(/\$\$[^$]*\$\$|\$[^$]*\$/g, (mathSpan) => mathSpan.replace(/\u00B7/g, '\\cdot '))
    // Replace other problematic Unicode characters
    .replace(/[\u201C\u201D]/g, '"') // Smart quotes to regular quotes
    .replace(/[\u2018\u2019]/g, "'") // Smart apostrophes to regular apostrophes
    .replace(/\u2026/g, '...') // Ellipsis to three dots
    .replace(/\u2013/g, '-') // En dash to hyphen
    .replace(/\u2014/g, '--'); // Em dash to double hyphen
}

/**
 * Fix multiline math expressions for MDX compatibility
 * @param {string} content - Markdown content
 * @returns {string} - Cleaned content
 */
function fixMultilineMath(content) {
  console.log(' 📏 Fixing multiline math expressions for MDX...');

  return content
    // Inline math written on one source line but containing a "\\" row break
    // becomes a display math block with one row per line.
    .replace(/\$([^$\n]*\\\\[^$\n]*)\$/g, (match, mathContent) => {
      // Only convert when the span also contains something that looks like math
      if (!/[=+\-*\/^_{}]/.test(mathContent)) {
        return match;
      }

      const cleanedMath = mathContent
        .trim()
        .replace(/\s*\\\\\s*/g, '\\\\\n ');
      return `$$\n${cleanedMath}\n$$`;
    })
    // Ensure display math blocks are properly separated
    .replace(/\$\$\s*\n\s*([^$]+?)\s*\n\s*\$\$/g, (_match, mathContent) => {
      return `\n$$\n${mathContent.trim()}\n$$\n`;
    });
}

/**
 * Inject code snippets into empty code blocks
 *
 * Every empty fenced block that declares a language is filled with the first
 * file in `<inputDir>/snippets` whose extension matches that language.
 * @param {string} content - Markdown content
 * @param {string} inputDir - Directory containing the LaTeX source and snippets
 * @returns {string} - Content with injected code snippets
 */
function injectCodeSnippets(content, inputDir = null) {
  console.log(' 💻 Injecting code snippets...');

  if (!inputDir) {
    console.log(' ⚠️ No input directory provided, skipping code injection');
    return content;
  }

  const snippetsDir = join(inputDir, 'snippets');
  if (!existsSync(snippetsDir)) {
    console.log(' ⚠️ Snippets directory not found, skipping code injection');
    return content;
  }

  // Get all available snippet files
  let availableSnippets;
  try {
    availableSnippets = readdirSync(snippetsDir);
    console.log(` 📁 Found ${availableSnippets.length} snippet file(s): ${availableSnippets.join(', ')}`);
  } catch (error) {
    console.log(` ❌ Error reading snippets directory: ${error.message}`);
    return content;
  }

  // Find all empty code blocks
  const emptyCodeBlockPattern = /```\s*(\w+)\s*\n\s*```/g;
  let injectionCount = 0;

  const processedContent = content.replace(emptyCodeBlockPattern, (match, language) => {
    // A Map lookup cannot hit inherited keys such as "constructor"
    const fileExtension = SNIPPET_EXTENSIONS.get(language) ?? language;

    // Use the first matching file (could be made smarter with context analysis)
    const selectedFile = availableSnippets.find((file) => file.endsWith(`.${fileExtension}`));
    if (selectedFile === undefined) {
      console.log(` ⚠️ No ${language} snippet found (looking for .${fileExtension})`);
      return match;
    }

    try {
      const snippetContent = readFileSync(join(snippetsDir, selectedFile), 'utf8');
      injectionCount++;
      console.log(` ✅ Injected: ${selectedFile}`);
      return `\`\`\`${language}\n${snippetContent.trim()}\n\`\`\``;
    } catch (error) {
      // Best effort: keep the empty block when the snippet cannot be read
      console.log(` ❌ Error reading ${selectedFile}: ${error.message}`);
      return match;
    }
  });

  if (injectionCount > 0) {
    console.log(` 📊 Injected ${injectionCount} code snippet(s)`);
  }

  return processedContent;
}

/**
 * Fix all attributes that still contain colons (href, data-reference, id)
 *
 * Every colon in the attribute value becomes a hyphen. An href that points to
 * an external resource (`scheme://…`, `//host…`, `mailto:`, `tel:`) is left
 * untouched so that real URLs are not corrupted.
 * @param {string} content - Markdown content
 * @returns {string} - Cleaned content
 */
function fixAllAttributes(content) {
  console.log(' 🔗 Fixing all attributes with colons...');

  const isExternalUrl = (value) =>
    /^(?:[a-z][a-z0-9+.-]*:)?\/\//i.test(value) || /^(?:mailto|tel):/i.test(value);

  let fixedCount = 0;

  const result = content.replace(/(href|data-reference|id)="([^"]*)"/g, (match, name, value) => {
    if (!value.includes(':')) {
      return match;
    }
    if (name === 'href' && isExternalUrl(value)) {
      return match;
    }
    fixedCount++;
    return `${name}="${value.replace(/:/g, '-')}"`;
  });

  if (fixedCount > 0) {
    console.log(` ✅ Fixed ${fixedCount} attribute(s) with colons`);
  }

  return result;
}

/**
 * Fix link text content that still contains colons
 * @param {string} content - Markdown content
 * @returns {string} - Cleaned content
 */
function fixLinkTextContent(content) {
  console.log(' 📝 Fixing link text content with colons...');

  let fixedCount = 0;

  // Fix text content within links that contain references with colons
  // Pattern: <a ...>[text:content]</a>
  const cleanedContent = content.replace(
    /<a([^>]*)>\[([^\]]*:[^\]]*)\]<\/a>/g,
    (_match, attributes, linkText) => {
      fixedCount++;
      return `<a${attributes}>[${linkText.replace(/:/g, '-')}]</a>`;
    },
  );

  if (fixedCount > 0) {
    console.log(` ✅ Fixed ${fixedCount} link text(s) with colons`);
  }

  return cleanedContent;
}

/**
 * Convert align anchor markers to proper HTML spans outside math blocks
 *
 * A math fence that starts with `%%ALIGN_ANCHOR_ID{id}%%` loses that marker
 * line and gets an empty `<span id="id">` placed in front of the fence.
 * @param {string} content - Markdown content
 * @returns {string} - Content with converted anchor spans
 */
function convertAlignAnchors(content) {
  console.log(' 🏷️ Converting align anchor markers to HTML spans...');

  let convertedCount = 0;

  // Find and replace align anchor markers with proper spans outside math blocks
  const result = content.replace(
    /``` math\n%%ALIGN_ANCHOR_ID\{([^}]+)\}%%\n([\s\S]*?)\n```/g,
    (_match, anchorId, mathContent) => {
      convertedCount++;
      return `<span id="${anchorId}"></span>\n\n\`\`\` math\n${mathContent}\n\`\`\``;
    },
  );

  if (convertedCount > 0) {
    console.log(` ✅ Converted ${convertedCount} align anchor marker(s) to spans`);
  }

  return result;
}

/**
 * Main post-processing function that applies all cleanup steps
 * @param {string} content - Raw Markdown content from Pandoc
 * @param {string} inputDir - Optional: Directory containing LaTeX source for code injection
 * @returns {string} - Cleaned Markdown content
 */
export function postProcessMarkdown(content, inputDir = null) {
  console.log('🔧 Post-processing for KaTeX compatibility...');

  // Order matters: anchors are turned into spans before attribute ids are
  // cleaned, and Unicode fixes run before the math blocks are reshaped.
  const cleanupSteps = [
    removeTexGroupingCommands,
    simplifyLatexDelimiters,
    removeOrphanedLabels,
    convertAlignAnchors,
    fixMathCommands,
    fixMatrixCommands,
    fixUnicodeIssues,
    fixMultilineMath,
    fixAllAttributes,
    fixLinkTextContent,
  ];

  // Apply each cleanup step sequentially
  const cleaned = cleanupSteps.reduce((text, step) => step(text), content);

  // Inject code snippets if input directory is provided
  return inputDir ? injectCodeSnippets(cleaned, inputDir) : cleaned;
}

/**
 * CLI interface for standalone usage
 *
 * Recognised options: --input=PATH, --output=PATH, --verbose, --help / -h.
 * Unknown arguments are ignored. `--help` prints the usage text and exits.
 * @returns {{input: string, output: string, verbose: boolean}} - Parsed configuration
 */
function parseArgs() {
  const args = process.argv.slice(2);
  const config = {
    input: join(__dirname, 'output', 'main.md'),
    output: null, // Will default to input if not specified
    verbose: false,
  };

  for (const arg of args) {
    if (arg.startsWith('--input=')) {
      config.input = arg.substring('--input='.length);
    } else if (arg.startsWith('--output=')) {
      config.output = arg.substring('--output='.length);
    } else if (arg === '--verbose') {
      config.verbose = true;
    } else if (arg === '--help' || arg === '-h') {
      console.log(`
🔧 Markdown Post-Processor

Usage:
  node post-processor.mjs [options]

Options:
  --input=PATH    Input Markdown file (default: output/main.md)
  --output=PATH   Output file (default: overwrites input)
  --verbose       Verbose output
  --help, -h      Show this help

Examples:
  # Process main.md in-place
  node post-processor.mjs

  # Process with custom paths
  node post-processor.mjs --input=raw.md --output=clean.md
`);
      process.exit(0);
    }
  }

  // Default output to input if not specified
  if (!config.output) {
    config.output = config.input;
  }

  return config;
}

/**
 * CLI entry point: read the input file, post-process it and write the result.
 * Any failure is reported on stderr and ends the process with exit code 1.
 */
function main() {
  const config = parseArgs();

  console.log('🔧 Markdown Post-Processor');
  console.log(`📁 Input: ${config.input}`);
  console.log(`📁 Output: ${config.output}`);

  try {
    const content = readFileSync(config.input, 'utf8');
    const processedContent = postProcessMarkdown(content);

    writeFileSync(config.output, processedContent);
    console.log(`✅ Post-processing completed: ${config.output}`);

    // Show stats if verbose
    if (config.verbose) {
      const originalLines = content.split('\n').length;
      const processedLines = processedContent.split('\n').length;
      console.log(`📊 Lines: ${originalLines} → ${processedLines}`);
    }
  } catch (error) {
    console.error('❌ Post-processing failed:');
    console.error(error.message);
    process.exit(1);
  }
}

// Run CLI if called directly. pathToFileURL applies the same encoding as
// import.meta.url, so paths with spaces or non-ASCII characters (and Windows
// paths) compare equal.
const invokedPath = process.argv[1];
if (invokedPath && import.meta.url === pathToFileURL(invokedPath).href) {
  main();
}