|
|
<div class="d3-benchmark"></div> |
|
|
<style>
  /* Host element: positioning context for the absolutely positioned tooltip. */
  .d3-benchmark { position: relative; }

  /* Controls row: label + select, laid out horizontally. */
  .d3-benchmark .controls {
    display: flex;
    align-items: center;
    gap: 12px;
    margin-bottom: 10px;
  }
  .d3-benchmark .controls label {
    font-size: 12px;
    color: var(--muted-color);
  }

  /* Select with the native arrow removed and an inline SVG chevron drawn instead. */
  .d3-benchmark .controls select {
    -webkit-appearance: none;
    -moz-appearance: none;
    appearance: none;
    border: 1px solid var(--border-color);
    border-radius: 8px;
    padding: 6px 28px 6px 10px;
    background-color: var(--surface-bg);
    color: var(--text-color);
    font-size: 13px;
    line-height: 1.2;
    background-image: url("data:image/svg+xml,%3Csvg width='12' height='8' viewBox='0 0 12 8' xmlns='http://www.w3.org/2000/svg'%3E%3Cpath d='M1.41 1.59L6 6.17l4.59-4.58L12 3 6 9 0 3z' fill='%23999'/%3E%3C/svg%3E");
    background-repeat: no-repeat;
    background-position: right 8px center;
  }
  .d3-benchmark .controls select:focus-visible {
    outline: 2px solid var(--primary-color);
    outline-offset: 2px;
  }

  /* Legend: a title stacked above a wrapping row of swatch + label items. */
  .d3-benchmark .legend {
    display: flex;
    flex-direction: column;
    align-items: flex-start;
    gap: 6px;
    margin: 8px 0 0;
  }
  .d3-benchmark .legend .legend-title {
    font-size: 12px;
    font-weight: 700;
    color: var(--text-color);
  }
  .d3-benchmark .legend .items {
    display: flex;
    flex-wrap: wrap;
    gap: 8px 14px;
  }
  .d3-benchmark .legend .item {
    display: inline-flex;
    align-items: center;
    gap: 8px;
    font-size: 12px;
    color: var(--muted-color);
    cursor: pointer;
  }
  .d3-benchmark .legend .swatch {
    width: 14px;
    height: 14px;
    border-radius: 3px;
    border: 1px solid var(--border-color);
  }

  /* Applied by the script to bars, value labels and legend items that are not highlighted. */
  .d3-benchmark .ghost { opacity: 0.25; }

  /* Tooltip: parked off-screen via transform until the script moves it under the pointer. */
  .d3-benchmark .d3-tooltip {
    position: absolute;
    top: 0;
    left: 0;
    transform: translate(-9999px, -9999px);
    pointer-events: none;
    padding: 8px 10px;
    border-radius: 8px;
    font-size: 12px;
    line-height: 1.35;
    border: 1px solid var(--border-color);
    background: var(--surface-bg);
    color: var(--text-color);
    box-shadow: 0 4px 24px rgba(0, 0, 0, 0.18);
    opacity: 0;
    transition: opacity 0.12s ease;
    text-align: left;
  }

  /* Card that frames the SVG chart. */
  .d3-benchmark .chart-card {
    background: var(--surface-bg);
    border: 1px solid var(--border-color);
    border-radius: 10px;
    padding: 8px;
  }
</style>
|
|
<script> |
|
|
(() => { |
|
|
/**
 * Runs `cb` once the global D3 library is usable, loading it from the CDN if needed.
 *
 * When `window.d3.select` already exists, `cb` is invoked synchronously and its result is
 * returned. Otherwise the shared `<script id="d3-cdn-script">` tag is reused (or created and
 * appended to `<head>`), and `cb` is invoked from that tag's `load` event.
 *
 * @param {Function} cb - Called with no arguments once D3 is available.
 * @returns {*} Whatever `cb()` returns when D3 was already present; otherwise `undefined`.
 */
const ensureD3 = (cb) => {
  const isD3Ready = () => Boolean(window.d3) && typeof window.d3.select === 'function';
  if (isD3Ready()) return cb();

  let s = document.getElementById('d3-cdn-script');
  if (!s) {
    s = document.createElement('script');
    s.id = 'd3-cdn-script';
    s.src = 'https://cdn.jsdelivr.net/npm/d3@7/dist/d3.min.js';
    document.head.appendChild(s);
  }

  // The readiness re-check guards against a `load` event that fired without defining `d3.select`.
  s.addEventListener('load', () => { if (isD3Ready()) cb(); }, { once: true });

  // Surface a failed download instead of leaving the chart silently empty.
  s.addEventListener('error', () => {
    console.warn(`d3-benchmark: failed to load D3 from ${s.src}`);
  }, { once: true });
};
|
|
|
|
|
const bootstrap = () => { |
|
|
const scriptEl = document.currentScript; |
|
|
let container = scriptEl ? scriptEl.previousElementSibling : null; |
|
|
if (!(container && container.classList && container.classList.contains('d3-benchmark'))){ |
|
|
const cs = Array.from(document.querySelectorAll('.d3-benchmark')).filter(el => !(el.dataset && el.dataset.mounted==='true')); |
|
|
container = cs[cs.length-1] || null; |
|
|
} |
|
|
if (!container) return; |
|
|
if (container.dataset) { if (container.dataset.mounted==='true') return; container.dataset.mounted='true'; } |
|
|
|
|
|
container.style.position = container.style.position || 'relative'; |
|
|
let tip = container.querySelector('.d3-tooltip'); let tipInner; |
|
|
if (!tip) { |
|
|
tip = document.createElement('div'); tip.className = 'd3-tooltip'; |
|
|
tipInner = document.createElement('div'); tipInner.className = 'd3-tooltip__inner'; tip.appendChild(tipInner); |
|
|
container.appendChild(tip); |
|
|
} else { tipInner = tip.querySelector('.d3-tooltip__inner') || tip; } |
|
|
|
|
|
|
|
|
const header = document.createElement('div'); header.className = 'chart-header'; |
|
|
|
|
|
/**
 * Builds (or refreshes) the legend inside `header`.
 *
 * The `.legend`, `.legend-title` and `.items` elements are created on first use and reused
 * afterwards; the item list is emptied and rebuilt on every call. Hovering an item sets
 * `state.highlightModel` to that series and calls `updateHighlight()`; leaving clears it.
 *
 * @param {string[]} series - Series names, one legend item each, in order.
 * @param {(name: string) => string} colorBySeries - Returns the swatch background for a series.
 */
const makeLegend = (series, colorBySeries) => {
  // Returns the first descendant of `parent` with `className`, creating a <div> if absent.
  const findOrCreate = (parent, className, setup) => {
    let node = parent.querySelector(`.${className}`);
    if (!node) {
      node = document.createElement('div');
      node.className = className;
      if (setup) setup(node);
      parent.appendChild(node);
    }
    return node;
  };

  const legend = findOrCreate(header, 'legend');
  findOrCreate(legend, 'legend-title', (node) => { node.textContent = 'Legend'; });
  const items = findOrCreate(legend, 'items');
  items.innerHTML = '';

  const setHighlight = (value) => {
    state.highlightModel = value;
    updateHighlight();
  };

  series.forEach((name) => {
    const item = document.createElement('div');
    item.className = 'item';

    const swatch = document.createElement('span');
    swatch.className = 'swatch';
    swatch.style.background = colorBySeries(name);

    const label = document.createElement('span');
    label.textContent = name;

    item.appendChild(swatch);
    item.appendChild(label);
    items.appendChild(item);

    item.addEventListener('mouseenter', () => setHighlight(name));
    item.addEventListener('mouseleave', () => setHighlight(null));
  });
};
|
|
|
|
|
|
|
|
// The chart card is appended before the header, so the legend follows the plot in the DOM.
const card = document.createElement('div');
card.className = 'chart-card';
for (const child of [card, header]) {
  container.appendChild(child);
}

// Root <svg> plus the <g> that render() translates by the chart margins.
const svg = d3.select(card).append('svg');
svg.attr('width', '100%');
svg.style('display', 'block');
const gRoot = svg.append('g');
|
|
|
|
|
|
|
|
|
|
|
|
|
|
/**
 * Tries each URL in order and resolves with the first payload that can be parsed.
 *
 * A response body is parsed as JSON first; if that fails and `d3.csvParse` is available,
 * it is parsed as CSV. Any failure for a URL (network error, non-OK status, parse error)
 * is deliberately ignored so the next candidate is tried.
 *
 * @param {string[]} paths - Candidate URLs, in priority order.
 * @returns {Promise<*>} The parsed payload, or `null` when no candidate produced one.
 */
const fetchFirstAvailable = async (paths) => {
  // Loads one URL; resolves to { found, value } or rejects on any failure.
  const loadOne = async (url) => {
    const response = await fetch(url, { cache:'no-cache' });
    if (!response.ok) throw new Error('HTTP '+response.status);
    const body = await response.text();

    let parsed;
    let isJson = true;
    try { parsed = JSON.parse(body); } catch (e) { isJson = false; }
    if (isJson) return { found: true, value: parsed };

    if (window.d3 && d3.csvParse) return { found: true, value: d3.csvParse(body) };
    return { found: false, value: undefined };
  };

  for (const url of paths) {
    let outcome;
    try {
      outcome = await loadOne(url);
    } catch (e) {
      // Best-effort probing: a missing or unreadable candidate just means "try the next one".
      continue;
    }
    if (outcome.found) return outcome.value;
  }
  return null;
};
|
|
|
|
|
|
|
|
// Built-in dataset used until (and unless) an external file replaces it.
// Each row lists one benchmark followed by its scores, in the column order of `models` below.
const inlineData = [
  ['MMLU',       88, 80, 73, 76],
  ['GSM8K',      94, 83, 79, 81],
  ['HellaSwag',  95, 89, 86, 87],
  ['TruthfulQA', 64, 56, 51, 53],
  ['ARC-C',      79, 72, 68, 70],
].flatMap(([benchmark, ...scores]) => {
  const models = ['GPT-4o', 'Llama 3 70B', 'Mixtral 8x7B', 'Gemma 2 27B'];
  return scores.map((score, column) => ({ benchmark, model: models[column], score }));
});

// Mutable chart state shared by the legend, highlight and render code.
const state = {
  // Rows of { benchmark, model, score } currently drawn.
  data: inlineData,
  // Colour lookup function; null means "recompute on the next render".
  colorsByModel: null,
  // Model currently emphasised by hover, or null for none.
  highlightModel: null,
};
|
|
|
|
|
// Space reserved around the plot area, in pixels.
const margin = { top: 12, right: 28, bottom: 24, left: 56 };

// Outer SVG size; overwritten by updateSize() on every render.
let width = 800;
let height = 360;

// x0 positions one band per benchmark; x1 positions one band per model inside it.
const x0 = d3.scaleBand();
x0.paddingInner(0.2);
x0.paddingOuter(0.05);

const x1 = d3.scaleBand();
x1.padding(0.12);

const y = d3.scaleLinear();

// Axis generators bound to the scales above.
const xAxis = d3.axisBottom(x0);
xAxis.tickSizeOuter(0);

const yAxis = d3.axisLeft(y);
yAxis.ticks(6);
yAxis.tickSizeOuter(0);

// Added to the top of the y domain by render().
const yTopPadding = 2;
|
|
|
|
|
/**
 * Resolves the theme's primary colour.
 *
 * Order of preference: `window.ColorPalettes.getPrimary()` when that helper exists, then the
 * `--primary-color` custom property on the root element, then the literal `#6D4AFF`.
 *
 * @returns {string} A CSS colour string.
 */
function getPrimaryColor(){
  try {
    if (window.ColorPalettes && typeof window.ColorPalettes.getPrimary === 'function') {
      return window.ColorPalettes.getPrimary();
    }
  } catch (e) {
    // The palette helper is optional; any failure falls through to the CSS variable.
  }

  // Custom property values can carry the whitespace they were authored with, so trim before
  // testing for emptiness; otherwise a blank value would defeat the fallback.
  const cssValue = getComputedStyle(document.documentElement).getPropertyValue('--primary-color').trim();
  return cssValue || '#6D4AFF';
}
|
|
/**
 * Returns a categorical palette of `n` colours.
 *
 * Uses `window.ColorPalettes.getColors('categorical', n)` when that helper exists and yields
 * a non-empty array; otherwise generates `n` evenly spaced hues as `hsl(h, 60%, 55%)`.
 *
 * @param {number} n - Number of colours requested.
 * @returns {string[]} The palette; empty when `n` is 0 and no theme palette is available.
 */
function getCategoricalColors(n){
  try {
    if (window.ColorPalettes && typeof window.ColorPalettes.getColors === 'function') {
      const themed = window.ColorPalettes.getColors('categorical', n);
      // Callers index into the result, so only accept something that is actually usable.
      if (Array.isArray(themed) && themed.length > 0) return themed;
    }
  } catch (e) {
    // The palette helper is optional; any failure falls through to the generated hues.
  }

  const colors = [];
  for (let i = 0; i < n; i++) {
    const hue = Math.round((360 / n) * i);
    colors.push(`hsl(${hue}, 60%, 55%)`);
  }
  return colors;
}
|
|
|
|
|
/**
 * Assigns one palette colour to each model and returns a lookup function.
 *
 * Colours come from `getCategoricalColors(models.length)` and wrap around when the palette
 * is shorter than the model list. Unknown models (or models whose assigned colour is falsy)
 * resolve to `getPrimaryColor()`.
 *
 * @param {string[]} models - Model names in display order.
 * @returns {(model: string) => string} Colour lookup by model name.
 */
function computeSeriesColors(models){
  const palette = getCategoricalColors(models.length);
  const colorOf = new Map();
  models.forEach((name, index) => {
    colorOf.set(name, palette[index % palette.length]);
  });
  return (model) => colorOf.get(model) || getPrimaryColor();
}
|
|
|
|
|
/**
 * Lists the distinct `model` values in `data`, in order of first appearance.
 *
 * @param {Array<{model: *}>} data - Chart rows.
 * @returns {Array} Unique model values.
 */
function getModels(data){
  const seen = new Set();
  data.forEach((row) => { seen.add(row.model); });
  return [...seen];
}
|
|
/**
 * Lists the distinct `benchmark` values in `data`, in order of first appearance.
 *
 * @param {Array<{benchmark: *}>} data - Chart rows.
 * @returns {Array} Unique benchmark values.
 */
function getBenchmarks(data){
  const seen = new Set();
  data.forEach((row) => { seen.add(row.benchmark); });
  return [...seen];
}
|
|
|
|
|
/**
 * Re-measures the container and resizes the SVG to match.
 *
 * Updates the shared `width`/`height` variables (width falls back to 800 when the container
 * reports 0; height is width / 3.4, never below 240), applies them to `svg`, and offsets
 * `gRoot` by the left/top margins.
 *
 * @returns {{innerWidth: number, innerHeight: number}} Plot-area size inside the margins.
 */
function updateSize(){
  const measured = container.clientWidth;
  width = measured || 800;
  height = Math.max(240, Math.round(width / 3.4));

  svg.attr('width', width);
  svg.attr('height', height);

  const { top, right, bottom, left } = margin;
  gRoot.attr('transform', `translate(${left},${top})`);

  const innerWidth = width - left - right;
  const innerHeight = height - top - bottom;
  return { innerWidth, innerHeight };
}
|
|
|
|
|
/**
 * Shows the tooltip 12px right of and below the given point and sets its markup.
 *
 * @param {string} html - Markup assigned to the tooltip's inner element via `innerHTML`.
 * @param {number} x - Horizontal position, in the tooltip's positioning context.
 * @param {number} y - Vertical position, in the tooltip's positioning context.
 */
function showTip(html, x, y){
  const offset = 12;
  const left = x + offset;
  const top = y + offset;
  tip.style.transform = `translate(${left}px, ${top}px)`;
  tip.style.opacity = '1';

  // Write into the inner wrapper when present, otherwise into the tooltip itself.
  const wrapper = tip.querySelector('.d3-tooltip__inner');
  const target = wrapper || tip;
  target.innerHTML = html;
}
|
|
/**
 * Hides the tooltip: fades it out and parks it far off-screen.
 */
function hideTip(){
  const { style } = tip;
  style.opacity = '0';
  style.transform = 'translate(-9999px, -9999px)';
}
|
|
|
|
|
/**
 * Applies the current `state.highlightModel` to the chart.
 *
 * With a model highlighted, every bar, value label and legend item belonging to another
 * model gets the `ghost` class (legend items are matched by their trimmed text). With no
 * highlight, `ghost` is cleared everywhere.
 */
function updateHighlight(){
  const active = state.highlightModel;
  const hasActive = Boolean(active);

  // d3's classed() accepts either a constant or a per-datum predicate.
  const ghosted = hasActive ? (d) => d.model !== active : false;
  gRoot.selectAll('rect.bar').classed('ghost', ghosted);
  gRoot.selectAll('text.value').classed('ghost', ghosted);

  container.querySelectorAll('.legend .item').forEach((el) => {
    const dim = hasActive && el.textContent.trim() !== active;
    el.classList.toggle('ghost', dim);
  });
}
|
|
|
|
|
/**
 * Escapes a value for interpolation into tooltip markup.
 *
 * Tooltip content is assigned through `innerHTML`, and model/benchmark names may come from a
 * fetched data file, so the five HTML-significant characters are replaced by entities.
 *
 * @param {*} value - Any value; it is converted with `String()` first.
 * @returns {string} The escaped text.
 */
function escapeHtml(value){
  const entities = { '&': '&amp;', '<': '&lt;', '>': '&gt;', '"': '&quot;', "'": '&#39;' };
  return String(value).replace(/[&<>"']/g, (ch) => entities[ch]);
}

/**
 * Draws (or redraws) the grouped bar chart for `state.data`.
 *
 * Steps: resize the SVG, (re)build the legend, update the band and linear scales, draw both
 * axes and the horizontal grid, position one group per benchmark, then join one bar and one
 * value label per model inside each group. A model with no row for a benchmark is drawn
 * with score 0. Bars carry the hover handlers that drive the highlight and the tooltip.
 */
function render(){
  const { innerWidth, innerHeight } = updateSize();
  const models = getModels(state.data);
  const benchmarks = getBenchmarks(state.data);
  if (!state.colorsByModel) state.colorsByModel = computeSeriesColors(models);
  makeLegend(models, state.colorsByModel);

  // x0 spreads benchmarks across the plot; x1 spreads models inside one benchmark band.
  x0.domain(benchmarks).range([0, innerWidth]);
  x1.domain(models).range([0, x0.bandwidth()]);

  // Fixed 0-100 score axis plus `yTopPadding`, then rounded outward by nice().
  const yMaxRaw = 100;
  const yMax = yMaxRaw + yTopPadding;
  y.domain([0, yMax]).range([innerHeight, 0]).nice();

  // ---- Axes ----
  const styleAxis = (g) => {
    g.selectAll('path, line').attr('stroke', 'var(--axis-color)');
    g.selectAll('text').attr('fill', 'var(--tick-color)').style('font-size','12px');
  };

  gRoot
    .selectAll('.axis-x')
    .data([0])
    .join('g')
    .attr('class','axis-x')
    .attr('transform',`translate(0,${innerHeight})`)
    .call(xAxis)
    .call(styleAxis);

  gRoot
    .selectAll('.axis-y')
    .data([0])
    .join('g')
    .attr('class','axis-y')
    .call(yAxis)
    .call(styleAxis);

  // ---- Horizontal grid: a tick-only left axis stretched across the plot ----
  gRoot
    .selectAll('.grid-y')
    .data([0])
    .join('g')
    .attr('class','grid-y')
    .call(d3.axisLeft(y).ticks(6).tickSize(-innerWidth).tickFormat(''))
    .call(g => g.select('.domain').remove())
    .call(g => g.selectAll('.tick line').attr('stroke','var(--grid-color)').attr('stroke-opacity',1))
    // Hide the grid line of the last tick.
    .call(g => g.selectAll('.tick').filter((d, i, nodes) => i === nodes.length - 1).select('line').attr('stroke-opacity', 0));

  // ---- One <g class="group"> per benchmark ----
  const groups = gRoot.selectAll('.group').data(benchmarks, d => d);
  const groupsEnter = groups.enter().append('g').attr('class','group');
  groupsEnter.merge(groups).attr('transform', d => `translate(${x0(d)},0)`);
  groups.exit().remove();

  const nested = d3.group(state.data, d => d.benchmark);

  const barTop = d => y(d.score);
  const barHeight = d => Math.max(0, innerHeight - y(d.score));
  const tooltipHtml = d =>
    `<strong>${escapeHtml(d.model)}</strong><br/>${escapeHtml(d.bench)}: <strong>${escapeHtml(d.score)}</strong>`;

  gRoot.selectAll('.group').each(function(bench){
    // First row wins when a benchmark lists the same model more than once.
    const scoreByModel = new Map();
    for (const row of nested.get(bench) || []) {
      if (!scoreByModel.has(row.model)) scoreByModel.set(row.model, row.score);
    }
    const rows = models.map(model => ({
      bench,
      model,
      score: scoreByModel.has(model) ? scoreByModel.get(model) : 0,
    }));

    const group = d3.select(this);

    // ---- Bars ----
    // Each join callback uses .call() so it returns the selection (not the transition),
    // which lets the hover handlers be attached once to the merged enter + update selection.
    group.selectAll('rect.bar').data(rows, d => d.model)
      .join(
        enter => enter.append('rect')
          .attr('class','bar')
          .attr('x', d => x1(d.model))
          .attr('y', innerHeight)
          .attr('width', x1.bandwidth())
          .attr('height', 0)
          .attr('fill', d => state.colorsByModel(d.model))
          .call(sel => sel.transition().duration(160)
            .attr('y', barTop)
            .attr('height', barHeight)),
        update => update
          .call(sel => sel.transition().duration(160)
            .attr('x', d => x1(d.model))
            .attr('y', barTop)
            .attr('width', x1.bandwidth())
            .attr('height', barHeight)
            .attr('fill', d => state.colorsByModel(d.model))),
        exit => exit.transition().duration(120).attr('y', innerHeight).attr('height', 0).remove()
      )
      .on('mouseenter', (event, d) => { state.highlightModel = d.model; updateHighlight(); })
      .on('mousemove', (event, d) => {
        const [mx, my] = d3.pointer(event, container);
        showTip(tooltipHtml(d), mx, my);
      })
      .on('mouseleave', () => { hideTip(); state.highlightModel = null; updateHighlight(); });

    // ---- Value labels above the bars ----
    group.selectAll('text.value').data(rows, d => d.model)
      .join(
        enter => enter.append('text')
          .attr('class','value')
          .attr('x', d => x1(d.model) + x1.bandwidth()/2)
          .attr('y', d => y(d.score) - 4)
          .attr('text-anchor','middle')
          .attr('fill','var(--text-color)')
          .attr('opacity',0.9)
          .attr('font-size',10)
          .text(d => d.score),
        update => update
          .transition().duration(160)
          .attr('x', d => x1(d.model) + x1.bandwidth()/2)
          .attr('y', d => y(d.score) - 4)
          .text(d => d.score),
        exit => exit.remove()
      );
  });

  // ---- Rotated y-axis title ----
  gRoot.selectAll('.y-label').data([0]).join('text').attr('class','y-label')
    .attr('transform', `rotate(-90)`)
    .attr('x', -innerHeight / 2)
    .attr('y', -margin.left + 24)
    .attr('text-anchor','middle')
    .attr('fill','var(--text-color)')
    .attr('font-size',12)
    .attr('font-weight',700)
    .text('score');
}
|
|
|
|
|
|
|
|
// First paint with whatever data is in `state` right now.
render();

// Redraw whenever the container's size changes; fall back to window resize events when
// ResizeObserver is not available.
const rerender = () => render();
if (!window.ResizeObserver) {
  window.addEventListener('resize', rerender);
} else {
  const observer = new ResizeObserver(() => rerender());
  observer.observe(container);
}
|
|
|
|
|
|
|
|
/**
 * Converts a payload from `fetchFirstAvailable` into validated chart rows.
 *
 * Accepts either a JSON array of row objects or a `d3.csvParse` result (an array carrying a
 * `columns` array). A row is kept only when `benchmark` and `model` are truthy and the score
 * is a finite number; CSV scores are converted from their string form and blank ones are
 * dropped. This keeps an unrelated document that happened to parse (for example an HTML
 * fallback page read as CSV) from replacing the chart data with empty rows.
 *
 * @param {*} raw - Parsed payload, or `null` when nothing could be loaded.
 * @returns {Array<{benchmark: *, model: *, score: number}>} Usable rows; empty when none.
 */
function normalizeBenchmarkRows(raw){
  if (!Array.isArray(raw)) return [];
  const fromCsv = Array.isArray(raw.columns);

  const rows = [];
  for (const entry of raw) {
    if (entry === null || typeof entry !== 'object') continue;
    const { benchmark, model } = entry;
    if (!benchmark || !model) continue;

    let score = entry.score;
    if (fromCsv) {
      // CSV cells are strings; a blank cell must not be coerced to a score of 0.
      if (typeof score !== 'string' || score.trim() === '') continue;
      score = Number(score);
    }
    if (typeof score !== 'number' || !Number.isFinite(score)) continue;

    rows.push({ benchmark, model, score });
  }
  return rows;
}

// Try to replace the inline data with an external file; the inline data stays in place
// when no candidate path yields at least one valid row.
(async () => {
  const maybe = await fetchFirstAvailable([
    '/data/llm_benchmarks.json',
    './assets/data/llm_benchmarks.json',
    '../assets/data/llm_benchmarks.json'
  ]);

  const rows = normalizeBenchmarkRows(maybe);
  if (rows.length === 0) return;

  state.data = rows;
  // The model list may have changed, so colours are recomputed on the next render.
  state.colorsByModel = null;
  render();
})().catch((err) => {
  // The chart already shows data at this point; report the failure instead of hiding it.
  console.warn('d3-benchmark: failed to apply external benchmark data', err);
});
|
|
}; |
|
|
|
|
|
// Start once the document has been parsed: immediately if parsing is already over,
// otherwise on DOMContentLoaded.
const start = () => ensureD3(bootstrap);
if (document.readyState !== 'loading') {
  start();
} else {
  document.addEventListener('DOMContentLoaded', () => start(), { once: true });
}
|
|
})(); |
|
|
</script> |
|
|
|
|
|
|
|
|
|