Molbap's picture
Molbap HF Staff
push a bunch of updates
e903a32
raw
history blame
4.97 kB
/**
* LaTeX Metadata Extractor
* Extracts document metadata from LaTeX files for frontmatter generation
*/
/**
 * Affiliation macros that may appear inside an `\authorOne{...}` argument.
 * `index` is the 1-based position of the affiliation in `metadata.affiliations`.
 * The negative lookahead keeps longer macros (e.g. `\hfill`) from being
 * mistaken for `\hf`.
 */
const AFFILIATION_MACROS = [
  { pattern: /\\ensps(?![A-Za-z])/g, index: 1 }, // École Normale Supérieure Paris-Saclay
  { pattern: /\\hf(?![A-Za-z])/g, index: 2 }, // Hugging Face
];

/**
 * Read the brace-balanced group that opens at `openIndex`.
 * @param {string} text - Text being scanned
 * @param {number} openIndex - Index of the opening `{`
 * @returns {string|null} - Group content without the outer braces, or null if unbalanced
 */
function readBracedGroup(text, openIndex) {
  let depth = 0;
  for (let i = openIndex; i < text.length; i += 1) {
    const ch = text[i];
    if (ch === '\\') {
      // Skip the escaped character so `\{`, `\}` and `\\` never affect nesting.
      i += 1;
    } else if (ch === '{') {
      depth += 1;
    } else if (ch === '}') {
      depth -= 1;
      if (depth === 0) {
        return text.slice(openIndex + 1, i);
      }
    }
  }
  return null;
}

/**
 * Return the first non-empty argument of a command, with whitespace collapsed.
 * @param {string} text - Text being scanned
 * @param {RegExp} commandPattern - Global regex whose match ends on the argument's opening `{`
 * @returns {string|undefined} - Cleaned argument, or undefined when none is found
 */
function firstCommandArgument(text, commandPattern) {
  for (const match of text.matchAll(commandPattern)) {
    const argument = readBracedGroup(text, match.index + match[0].length - 1);
    const cleaned = argument?.replace(/\s+/g, ' ').trim();
    if (cleaned) {
      return cleaned;
    }
  }
  return undefined;
}

/**
 * Extract metadata from LaTeX content.
 *
 * Reads `\title{...}`, every `\authorOne[...]{...}` entry and `\date{...}`
 * (or `\newcommand{\date}{...}`). Arguments are read with brace matching, so
 * nested groups such as `\emph{...}` are kept intact.
 *
 * @param {string} latexContent - Raw LaTeX content (null/undefined is treated as empty)
 * @returns {object} - Metadata with optional `title` and `authors`, plus
 *   `affiliations` (always the same two entries) and `published` (the
 *   document date, or today's date in en-US short form when none is found)
 */
export function extractLatexMetadata(latexContent) {
  const source = String(latexContent ?? '');
  const metadata = {};

  const title = firstCommandArgument(source, /\\title\s*\{/g);
  if (title) {
    metadata.title = title;
  }

  // Authors: each \authorOne argument holds a name plus optional affiliation macros.
  const authors = [];
  for (const match of source.matchAll(/\\authorOne\[[^\]]*\]\{/g)) {
    const rawAuthor = readBracedGroup(source, match.index + match[0].length - 1);
    if (rawAuthor === null) {
      continue;
    }

    let name = rawAuthor;
    const affiliations = [];
    for (const { pattern, index } of AFFILIATION_MACROS) {
      const stripped = name.replace(pattern, '');
      if (stripped !== name) {
        affiliations.push(index);
      }
      name = stripped;
    }
    name = name.replace(/\s+/g, ' ').trim();

    // Skip empty authors or placeholder entries.
    if (name && name !== '...') {
      authors.push({
        name,
        affiliations: affiliations.length > 0 ? affiliations : [2], // Default to HF if no macro
      });
    }
  }
  if (authors.length > 0) {
    metadata.authors = authors;
  }

  // The two affiliations are fixed; authors reference them by 1-based index.
  metadata.affiliations = [
    { name: 'École Normale Supérieure Paris-Saclay' },
    { name: 'Hugging Face' },
  ];

  const published =
    firstCommandArgument(source, /\\date\s*\{/g) ??
    firstCommandArgument(source, /\\newcommand\s*\{\\date\}\s*\{/g);

  // Fall back to the current date when the document declares none.
  metadata.published =
    published ??
    new Date().toLocaleDateString('en-US', {
      year: 'numeric',
      month: 'short',
      day: '2-digit',
    });

  return metadata;
}
/**
 * Render a value as a YAML double-quoted scalar.
 * JSON string syntax is valid YAML, so JSON.stringify supplies the escaping of
 * quotes, backslashes and control characters.
 * @param {*} value - Value to render (coerced to string)
 * @returns {string} - Quoted, escaped scalar
 */
function yamlQuote(value) {
  return JSON.stringify(String(value));
}

/**
 * Generate YAML frontmatter from metadata object.
 *
 * Emits, in order and only when present: `title`, `authors`, `affiliations`,
 * `published`, `doi`, `description`, `licence` (folded block scalar) and
 * `tags`, followed by the fixed `tableOfContentsAutoCollapse: true` setting.
 * String values are escaped so quotes and backslashes (frequent in text taken
 * from LaTeX) cannot break the document.
 *
 * @param {object} metadata - Metadata object (null/undefined is treated as empty)
 * @returns {string} - YAML frontmatter string, delimited by `---` lines and
 *   terminated by a blank line
 */
export function generateFrontmatter(metadata) {
  const meta = metadata ?? {};
  const lines = ['---'];

  if (meta.title) {
    lines.push(`title: ${yamlQuote(meta.title)}`);
  }

  if (meta.authors && meta.authors.length > 0) {
    lines.push('authors:');
    for (const author of meta.authors) {
      // Keys after the first must be indented past the dash to stay in the same list item.
      lines.push(`  - name: ${yamlQuote(author.name)}`);
      if (author.url) {
        lines.push(`    url: ${yamlQuote(author.url)}`);
      }
      if (Array.isArray(author.affiliations) && author.affiliations.length > 0) {
        lines.push(`    affiliations: [${author.affiliations.join(', ')}]`);
      }
    }
  }

  if (meta.affiliations && meta.affiliations.length > 0) {
    lines.push('affiliations:');
    for (const affiliation of meta.affiliations) {
      lines.push(`  - name: ${yamlQuote(affiliation.name)}`);
      if (affiliation.url) {
        lines.push(`    url: ${yamlQuote(affiliation.url)}`);
      }
    }
  }

  if (meta.published) {
    lines.push(`published: ${yamlQuote(meta.published)}`);
  }

  if (meta.doi) {
    lines.push(`doi: ${yamlQuote(meta.doi)}`);
  }

  if (meta.description) {
    lines.push(`description: ${yamlQuote(meta.description)}`);
  }

  if (meta.licence) {
    // Every line of a folded scalar needs the same indentation, or the text
    // after the first newline would be parsed as new top-level YAML.
    const licenceLines = String(meta.licence)
      .trim()
      .split(/\r?\n/)
      .map((line) => `  ${line.trim()}`.trimEnd());
    lines.push('licence: >', ...licenceLines);
  }

  if (meta.tags && meta.tags.length > 0) {
    lines.push('tags:');
    for (const tag of meta.tags) {
      // Quoted so tags such as "c#" or "yes" are read as plain strings.
      lines.push(`  - ${yamlQuote(tag)}`);
    }
  }

  // Default Astro configuration
  lines.push('tableOfContentsAutoCollapse: true');
  lines.push('---');

  return `${lines.join('\n')}\n\n`;
}
/**
 * One-step helper: parse the LaTeX source for metadata and render it
 * straight to frontmatter.
 * @param {string} latexContent - Raw LaTeX content
 * @returns {string} - Complete YAML frontmatter
 */
export function extractAndGenerateFrontmatter(latexContent) {
  return generateFrontmatter(extractLatexMetadata(latexContent));
}