File size: 4,970 Bytes
e903a32
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
/**
 * LaTeX Metadata Extractor
 * Extracts document metadata from LaTeX files for frontmatter generation
 */

/**
 * Author-affiliation macros recognised inside `\authorOne[...]{...}` and the
 * 1-based affiliation index each one maps to (see the fixed affiliation list
 * built in `extractLatexMetadata`).
 *
 * The negative lookahead makes each pattern match the whole control word only,
 * so `\hf` does not fire on longer macros such as `\hfill`.
 */
const AFFILIATION_MACROS = [
    { pattern: /\\ensps(?![A-Za-z])/g, index: 1 }, // École Normale Supérieure
    { pattern: /\\hf(?![A-Za-z])/g, index: 2 },    // Hugging Face
];

/**
 * Read the contents of a brace-delimited group, honouring nested braces.
 * @param {string} text - Text containing the group
 * @param {number} openIndex - Index of the opening `{`
 * @returns {string|null} - Text between the matching braces, or null if the
 *   group is never closed
 */
function readBracedGroup(text, openIndex) {
    let depth = 0;
    for (let i = openIndex; i < text.length; i += 1) {
        const ch = text[i];
        if (ch === '\\') {
            // Skip the escaped character so `\{` and `\}` do not affect depth.
            i += 1;
        } else if (ch === '{') {
            depth += 1;
        } else if (ch === '}') {
            depth -= 1;
            if (depth === 0) {
                return text.slice(openIndex + 1, i);
            }
        }
    }
    return null;
}

/**
 * Extract metadata from LaTeX content
 *
 * The result always contains `affiliations` (a fixed two-entry list) and
 * `published` (the `\date{...}` value, or today's date formatted with the
 * `en-US` locale when no date is found). `title` and `authors` are only set
 * when found in the content.
 *
 * @param {string} latexContent - Raw LaTeX content; null/undefined is treated
 *   as empty content
 * @returns {object} - Extracted metadata object
 */
export function extractLatexMetadata(latexContent) {
    const source = String(latexContent ?? '');
    const metadata = {};

    // Extract title: first non-empty `\title{...}` group. Braces are matched
    // with a depth counter so nested commands like `\emph{...}` are kept whole.
    for (const match of source.matchAll(/\\title\s*\{/g)) {
        const rawTitle = readBracedGroup(source, match.index + match[0].length - 1);
        const title = rawTitle?.replace(/\s+/g, ' ').trim();
        if (title) {
            metadata.title = title;
            break;
        }
    }

    // Extract authors with their specific affiliations
    const authors = [];
    const authorMatches = source.matchAll(/\\authorOne\[[^\]]*\]\{([^}]+)\}/g);

    for (const match of authorMatches) {
        let authorName = match[1];
        const affiliations = [];

        // Strip each affiliation macro; if stripping changed the text, the
        // macro was present and the author gets that affiliation.
        for (const { pattern, index } of AFFILIATION_MACROS) {
            const stripped = authorName.replace(pattern, '');
            if (stripped !== authorName) {
                affiliations.push(index);
                authorName = stripped;
            }
        }

        authorName = authorName.replace(/\s+/g, ' ').trim();

        // Skip empty authors or placeholder entries
        if (authorName && authorName !== '...') {
            authors.push({
                name: authorName,
                affiliations: affiliations.length > 0 ? affiliations : [2], // Default to HF if no macro
            });
        }
    }

    if (authors.length > 0) {
        metadata.authors = authors;
    }

    // Affiliations are fixed; author `affiliations` entries are 1-based
    // indices into this list.
    metadata.affiliations = [
        {
            name: "École Normale Supérieure Paris-Saclay"
        },
        {
            name: "Hugging Face"
        }
    ];

    // Extract date if available (common LaTeX patterns); first pattern wins.
    const datePatterns = [
        /\\date\s*\{([^}]+)\}/,
        /\\newcommand\s*\{\\date\}\s*\{([^}]+)\}/,
    ];

    for (const pattern of datePatterns) {
        const dateMatch = source.match(pattern);
        if (dateMatch) {
            metadata.published = dateMatch[1].trim();
            break;
        }
    }

    // Fallback to current date if no date found
    if (!metadata.published) {
        metadata.published = new Date().toLocaleDateString('en-US', {
            year: 'numeric',
            month: 'short',
            day: '2-digit'
        });
    }

    return metadata;
}

/**
 * Render a value as a YAML double-quoted scalar.
 *
 * JSON string syntax is used because it escapes `"`, `\` and control
 * characters in a form YAML double-quoted scalars accept; LaTeX-derived text
 * routinely contains backslashes that would otherwise be read as YAML escapes.
 * @param {*} value - Value to render (coerced to string)
 * @returns {string} - Quoted, escaped scalar including the surrounding quotes
 */
function yamlQuote(value) {
    return JSON.stringify(String(value));
}

/**
 * Generate YAML frontmatter from metadata object
 *
 * Emits, in order and only when present: `title`, `authors` (with optional
 * `url` and an `affiliations` index list), `affiliations`, `published`, `doi`,
 * `description`, `licence` (as a folded block) and `tags`. The
 * `tableOfContentsAutoCollapse: true` line is always appended.
 *
 * @param {object} metadata - Metadata object
 * @returns {string} - YAML frontmatter string, delimited by `---` lines and
 *   followed by a blank line
 */
export function generateFrontmatter(metadata) {
    const lines = ['---'];

    // Title
    if (metadata.title) {
        lines.push(`title: ${yamlQuote(metadata.title)}`);
    }

    // Authors
    if (metadata.authors && metadata.authors.length > 0) {
        lines.push('authors:');
        for (const author of metadata.authors) {
            lines.push(`  - name: ${yamlQuote(author.name)}`);
            if (author.url) {
                lines.push(`    url: ${yamlQuote(author.url)}`);
            }
            // An author without an affiliation list renders as an empty list
            // rather than throwing.
            lines.push(`    affiliations: [${(author.affiliations ?? []).join(', ')}]`);
        }
    }

    // Affiliations
    if (metadata.affiliations && metadata.affiliations.length > 0) {
        lines.push('affiliations:');
        for (const affiliation of metadata.affiliations) {
            lines.push(`  - name: ${yamlQuote(affiliation.name)}`);
            if (affiliation.url) {
                lines.push(`    url: ${yamlQuote(affiliation.url)}`);
            }
        }
    }

    // Publication date
    if (metadata.published) {
        lines.push(`published: ${yamlQuote(metadata.published)}`);
    }

    // Additional metadata
    if (metadata.doi) {
        lines.push(`doi: ${yamlQuote(metadata.doi)}`);
    }

    if (metadata.description) {
        lines.push(`description: ${yamlQuote(metadata.description)}`);
    }

    if (metadata.licence) {
        // Folded block scalar: every content line must carry the block's
        // indentation, so indent each line of a multi-line licence.
        lines.push('licence: >');
        for (const licenceLine of String(metadata.licence).split(/\r?\n/)) {
            lines.push(`  ${licenceLine}`);
        }
    }

    if (metadata.tags && metadata.tags.length > 0) {
        lines.push('tags:');
        for (const tag of metadata.tags) {
            lines.push(`  - ${tag}`);
        }
    }

    // Default Astro configuration
    lines.push('tableOfContentsAutoCollapse: true');
    lines.push('---');

    return `${lines.join('\n')}\n\n`;
}

/**
 * Convenience pipeline: parse metadata out of raw LaTeX and render it
 * straight to YAML frontmatter.
 * @param {string} latexContent - Raw LaTeX content
 * @returns {string} - Complete YAML frontmatter
 */
export function extractAndGenerateFrontmatter(latexContent) {
    // Extraction feeds directly into rendering; no intermediate processing.
    return generateFrontmatter(extractLatexMetadata(latexContent));
}