Top LLM Models
v3 · Published · top LLM leaderboards + scoring of models
›Output & API
Author's sample data

| date | 2026-06-11 |
|---|---|
| status | ✅ Prometheus Scraper completed — routing map + visualization data ready for In-Tune Layer at 2026-06-11T23:01:28.328Z |
| pipeline | |
| top_models | |
| routing_map | |
| visualization_data | |
›Marketplace
0 subscribers · 0 runs in 14d · published 5h ago
›Versions
managed by author
v3 · manual update · approved · current · 4h ago
v2 · manual update · approved · 5h ago
v1 · built · rejected · 10h ago
How this script collects data
import Firecrawl from "@mendable/firecrawl-js";
import * as cheerio from "cheerio";
import { z } from "zod";
// ===========================================================================
// Daily LLM leaderboard aggregation snapshot
// - normalized scores from 7 leaderboards (6 live, HF archived)
// - weighted composite rankings
// - task-routing map (emphasis on coding / agentic work)
// - visualization data (rank deltas, radar, correlations, time-series seed)
// All network access goes through the Firecrawl SDK. Output is JSON on stdout;
// all diagnostics go to stderr.
// ===========================================================================
// Date stamped on the snapshot and on every time-series seed point.
// Derived from the current UTC date (YYYY-MM-DD) so that each daily run is
// labelled with the day it actually ran instead of a fixed literal.
const SNAPSHOT_DATE = new Date().toISOString().slice(0, 10);

// The Firecrawl key is read from the environment. Without it no source can be
// fetched, so report the problem on stderr and exit with status 1.
const apiKey = process.env.FIRECRAWL_API_KEY;
if (!apiKey) {
  process.stderr.write("FIRECRAWL_API_KEY is not set\n");
  process.exit(1);
}

// Single SDK client shared by every fetch in this script.
const firecrawl = new Firecrawl({ apiKey });
/**
 * Writes one diagnostic line to stderr, appending the trailing newline.
 * stdout is left untouched so it only ever carries the JSON payload.
 */
function log(msg: string): void {
  process.stderr.write(`${msg}\n`);
}
// ---------------------------------------------------------------------------
// Types
// ---------------------------------------------------------------------------
// Scoring dimension a benchmark contributes to. Every raw score carries exactly
// one category; category weights and task profiles are keyed by these names.
type Category =
| "human_pref"
| "reasoning"
| "coding"
| "agentic"
| "math"
| "efficiency"
| "academic"
| "safety";
// One scraped data point: a single model's value on a single benchmark of a
// single leaderboard, before any normalization.
type RawScore = {
leaderboard: string;
benchmark: string;
category: Category;
// model name as scraped; it is re-canonicalized with normKey() when needed
model_raw: string;
organization: string | null;
// numeric value parsed from the cell
value: number;
display: string; // raw value as shown on the site
// false for rank-like metrics where a smaller number is better
higherIsBetter: boolean;
};
// Fetch outcome for one leaderboard, reported in the output's `leaderboards` list.
type LbStatus = {
leaderboard: string;
url: string;
status: "live" | "archived_no_fresh_data" | "error";
benchmarks_fetched: number;
rows_fetched: number;
// optional explanation (error message, "no rows parsed", archive notice)
note?: string;
};
// ---------------------------------------------------------------------------
// Model name canonicalization (fuzzy matching + alias table)
// ---------------------------------------------------------------------------
// Strip harness/effort qualifiers, dates and punctuation; split version dots;
// sort tokens so word order doesn't matter ("Claude 4.8 Opus" == "Claude Opus 4.8")
// Tokens dropped from a model name before it is turned into a key.
// Hyphens are converted to spaces before tokenization, so the hyphenated
// "non-thinking" qualifier is removed by normKey() itself; the entry is kept
// here for completeness.
const STRIP_TOKENS = new Set([
  "thinking",
  "non-thinking",
  "nonthinking",
  "effort",
  "xhigh",
  "high",
  "medium",
  "low",
  "max",
  "preview",
  "new",
  "unreleased",
  "latest",
  "experimental",
  "chat",
  "instruct",
]);

// alias key (normalized token-sorted) -> canonical normalized key
const ALIASES: Record<string, string> = {
  "5 fable": "5 claude fable", // Scale: "Fable-5 (Claude Code)"
  "4 7 opus": "4 7 claude opus", // Scale: "Opus-4.7 (Claude Code)"
  "4 8 opus": "4 8 claude opus", // Scale: "Opus 4.8 (Claude Code)"
  "5 mythos": "5 claude mythos",
};

/**
 * Canonicalizes a scraped model name into an order-independent lookup key.
 *
 * The name is lowercased; backslashes, asterisks, parenthesised qualifiers and
 * apostrophes are removed; version dots and separators become spaces; date
 * stamps are dropped; STRIP_TOKENS are filtered out; the remaining tokens are
 * sorted and joined with single spaces. A matching ALIASES entry then replaces
 * the key.
 *
 * @param raw model name as shown by a leaderboard
 * @returns the canonical key, or "" when nothing meaningful remains
 */
function normKey(raw: string): string {
  const s = raw
    .toLowerCase()
    .replace(/\\/g, "")
    .replace(/\*+/g, "")
    .replace(/\([^)]*\)/g, " ")
    .replace(/['’]/g, "")
    // Remove the whole "non-thinking" qualifier before hyphens are split;
    // otherwise only "thinking" is stripped and a stray "non" token remains.
    .replace(/\bnon[-_\s]+thinking\b/g, " ")
    // Lookahead keeps the second digit available for the next match, so that
    // "1.2.3" becomes "1 2 3" rather than "1 2.3".
    .replace(/(\d)\.(?=\d)/g, "$1 ") // 4.8 -> "4 8"
    .replace(/[-_/,:]/g, " ")
    .replace(/\b\d{8}\b/g, " ") // dates like 20250929
    .replace(/\b\d{4} \d{2} \d{2}\b/g, " "); // dates like 2025 12 11
  const tokens = s
    .split(/\s+/)
    .filter((t) => t && !STRIP_TOKENS.has(t))
    .sort();
  const key = tokens.join(" ");
  // Own-property check so keys such as "constructor" never resolve to
  // members inherited from Object.prototype.
  const alias = Object.prototype.hasOwnProperty.call(ALIASES, key) ? ALIASES[key] : undefined;
  return alias ?? key;
}
// Registry of canonical display names: key -> preferred display + provider
// canonical key -> display name currently preferred for that model
const displayNames = new Map<string, string>();
// canonical key -> priority of the source that supplied the current display name
const displayPriority = new Map<string, number>();
// canonical key -> provider name (scraped organization or inferred from the key)
const providers = new Map<string, string>();
// canonical key -> every cleaned display string that mapped onto that key
const aliasLog = new Map<string, Set<string>>();
// Lowercased, trimmed organization string as scraped -> canonical provider name.
// Organizations missing from this table are kept as scraped (trimmed).
const PROVIDER_CANON: Record<string, string> = {
anthropic: "Anthropic",
openai: "OpenAI",
google: "Google",
"google deepmind": "Google",
meta: "Meta",
xai: "xAI",
deepseek: "DeepSeek",
alibaba: "Alibaba / Qwen",
"alibaba cloud / qwen team": "Alibaba / Qwen",
qwen: "Alibaba / Qwen",
"moonshot ai": "Moonshot AI",
moonshotai: "Moonshot AI",
mistral: "Mistral",
microsoft: "Microsoft",
tencent: "Tencent",
bytedance: "ByteDance",
zhipu: "Zhipu AI",
"zhipu ai": "Zhipu AI",
minimax: "MiniMax",
nvidia: "NVIDIA",
amazon: "Amazon",
cohere: "Cohere",
"z.ai": "Zhipu AI",
zai: "Zhipu AI",
};
// Ordered (pattern, provider) rules; the first pattern that matches wins.
const PROVIDER_INFERENCE_RULES: ReadonlyArray<readonly [RegExp, string]> = [
  [/\bclaude\b|\bfable\b|\bmythos\b/, "Anthropic"],
  [/\bgpt\b|^o\d| o\d\b|\boss\b/, "OpenAI"],
  [/\bgemini\b|\bgemma\b/, "Google"],
  [/\bgrok\b/, "xAI"],
  [/\bdeepseek\b/, "DeepSeek"],
  [/\bqwen\b/, "Alibaba / Qwen"],
  [/\bkimi\b/, "Moonshot AI"],
  [/\bllama\b|\bmuse\b/, "Meta"],
  [/\bmistral\b|\bmagistral\b/, "Mistral"],
  [/\bminimax\b/, "MiniMax"],
  [/\bglm\b/, "Zhipu AI"],
  [/\bseed\b/, "ByteDance"],
  [/\bnemotron\b/, "NVIDIA"],
  [/\bnova\b/, "Amazon"],
  [/\bmimo\b/, "Xiaomi"],
];

/**
 * Guesses the provider of a model from family names found in its canonical key.
 *
 * @param key canonical model key (lowercase, space-separated tokens)
 * @returns the provider of the first matching rule, or null when none matches
 */
function inferProvider(key: string): string | null {
  for (const [pattern, provider] of PROVIDER_INFERENCE_RULES) {
    if (pattern.test(key)) return provider;
  }
  return null;
}
/**
 * Records a scraped model name in the shared registries and returns its key.
 *
 * Side effects, all keyed by the canonical key from normKey():
 *  - displayNames / displayPriority: the cleaned display name is stored when
 *    the key is new or when `sourcePriority` is strictly lower than the stored
 *    priority (lower number wins, first-come within the same priority);
 *  - providers: set only once per key, from `org` (canonicalized through
 *    PROVIDER_CANON) or, when `org` is empty, from inferProvider();
 *  - aliasLog: the cleaned display string is added to the key's alias set.
 *
 * @param raw model name as scraped
 * @param org organization as scraped, or null when the source has none
 * @param sourcePriority priority of the calling source (lower is preferred)
 * @returns the canonical key; "" (and no side effects) when the name is empty
 */
function registerModel(raw: string, org: string | null, sourcePriority: number): string {
  const key = normKey(raw);
  if (!key) return key;

  // Drop markdown residue, then parenthesised notes that only describe the
  // harness or effort level; other parenthesised text is kept.
  const withoutMarkup = raw.replace(/\\/g, "").replace(/\*+/g, "");
  const withoutQualifierNotes = withoutMarkup.replace(/\(([^)]*)\)/g, (full, inner) => {
    if (/fallback|claude code|thinking|x?high|x?low|medium|max\b|effort/i.test(inner)) {
      return "";
    }
    return full;
  });
  const cleanedDisplay = withoutQualifierNotes
    .replace(/\b(non-)?thinking\b/gi, "")
    .replace(/\bx?(high|low|medium)\b(\s+effort)?/gi, "")
    .replace(/\beffort\b/gi, "")
    .replace(/\s+/g, " ")
    .trim();
  const label = cleanedDisplay || raw.trim();

  // An unseen key behaves as if it had priority 99.
  const existing = displayNames.get(key);
  const existingPriority = existing ? displayPriority.get(key)! : 99;
  if (!existing || sourcePriority < existingPriority) {
    displayNames.set(key, label);
    displayPriority.set(key, sourcePriority);
  }

  // The first provider recorded for a key is never overwritten.
  if (!providers.has(key)) {
    if (org) {
      providers.set(key, PROVIDER_CANON[org.toLowerCase().trim()] ?? org.trim());
    } else {
      const inferred = inferProvider(key);
      if (inferred) providers.set(key, inferred);
    }
  }

  const knownLabels = aliasLog.get(key) ?? new Set<string>();
  knownLabels.add(label);
  aliasLog.set(key, knownLabels);
  return key;
}
// ---------------------------------------------------------------------------
// Generic helpers
// ---------------------------------------------------------------------------
// Every raw data point collected so far; push() appends to it.
const scores: RawScore[] = [];
// One entry per leaderboard attempted, appended by runSource() and main().
const statuses: LbStatus[] = [];
/**
 * Rounds `x` to `d` decimal places (two by default) by scaling, applying
 * Math.round, and scaling back.
 */
function round(x: number, d = 2): number {
  const scale = 10 ** d;
  return Math.round(x * scale) / scale;
}
/**
 * Extracts the first number from a table cell.
 *
 * Commas and spaces are removed first (so "1,234.5" reads as 1234.5), then the
 * first optionally-negative decimal is parsed. A value written without a
 * leading zero (".5") is read as 0.5; previously the dot was skipped and the
 * cell parsed as 5. A dot that directly follows a letter or digit is not
 * treated as a decimal point, so text such as "No.1" still yields 1.
 *
 * @param s cell text
 * @returns the parsed finite number, or null when the text contains none
 */
function toNumber(s: string): number | null {
  const compact = String(s).replace(/[, ]/g, "");
  const m = compact.match(/-?(?:\d+(?:\.\d+)?|(?<![A-Za-z\d])\.\d+)/);
  if (!m) return null;
  const n = Number(m[0]);
  return Number.isFinite(n) ? n : null;
}
// Normalizes scraped cell text: removes pictographic characters, collapses
// every whitespace run to a single space, and trims both ends.
function cleanText(s: string): string {
return s
// Character class covers code points U+1F000..U+1FFFF and the range from
// U+2600 (☀) to U+27BF (➿). NOTE(review): the class appears to end with an
// invisible U+FE0F variation selector after ➿ — confirm before editing it.
.replace(/[\u{1F000}-\u{1FFFF}☀-➿️]/gu, "") // emoji / symbols
.replace(/\s+/g, " ")
.trim();
}
/**
 * Scrapes `url` through the shared Firecrawl client, requesting the "html"
 * format for the whole page (onlyMainContent is off).
 *
 * @returns the `html` field of the scrape result, or "" when it is absent
 */
async function getHtml(url: string): Promise<string> {
  const response = await firecrawl.scrape(url, {
    formats: ["html"],
    onlyMainContent: false,
    integration: "prometheus",
  });
  const { html } = response as { html?: string };
  return html ?? "";
}
// Known provider display prefixes that some sites glue onto the model id
// [prefix as displayed, canonical provider] pairs, scanned top to bottom by
// splitGluedOrg(), which stops at the first prefix the text starts with.
// Longer variants must therefore stay above their shorter forms
// ("Google DeepMind" before "Google", "Moonshot AI" before "Moonshot",
// "Zhipu AI" before "Zhipu").
const ORG_PREFIXES: Array<[string, string]> = [
["Google DeepMind", "Google"],
["Anthropic", "Anthropic"],
["OpenAI", "OpenAI"],
["Google", "Google"],
["Meta", "Meta"],
["xAI", "xAI"],
["DeepSeek", "DeepSeek"],
["Alibaba", "Alibaba / Qwen"],
["Qwen", "Alibaba / Qwen"],
["Moonshot AI", "Moonshot AI"],
["Moonshot", "Moonshot AI"],
["Mistral", "Mistral"],
["Microsoft", "Microsoft"],
["Tencent", "Tencent"],
["ByteDance", "ByteDance"],
["Zhipu AI", "Zhipu AI"],
["Zhipu", "Zhipu AI"],
["MiniMax", "MiniMax"],
["NVIDIA", "NVIDIA"],
["Amazon", "Amazon"],
["Cohere", "Cohere"],
["Z.ai", "Zhipu AI"],
["Xiaomi", "Xiaomi"],
["Baidu", "Baidu"],
];
/**
 * Splits a cell in which the provider name is glued onto the front of the
 * model name.
 *
 * The cleaned text is compared case-insensitively against ORG_PREFIXES in
 * table order; the first prefix it starts with (and is longer than) is cut off.
 *
 * NOTE(review): a model name that itself begins with a listed prefix (for
 * example one starting with "Mistral" or "Qwen") loses that prefix as well —
 * confirm the calling site always glues the organization in front.
 *
 * @returns the canonical provider and the remaining model text, or
 *          `org: null` with the whole cleaned text when no prefix matches
 */
function splitGluedOrg(text: string): { org: string | null; model: string } {
  const cleaned = cleanText(text);
  const lowered = cleaned.toLowerCase();
  const hit = ORG_PREFIXES.find(
    ([prefix]) => cleaned.length > prefix.length && lowered.startsWith(prefix.toLowerCase())
  );
  if (!hit) return { org: null, model: cleaned };
  const [prefix, canon] = hit;
  return { org: canon, model: cleaned.slice(prefix.length).trim() };
}
/**
 * Appends one raw data point to the shared `scores` list.
 *
 * @param leaderboard    source name
 * @param benchmark      benchmark label within that source
 * @param category       scoring dimension the benchmark feeds
 * @param modelRaw       model name as scraped
 * @param organization   organization as scraped, or null
 * @param value          parsed numeric value
 * @param display        the value exactly as shown on the site
 * @param higherIsBetter false for rank-like metrics
 */
function push(
  leaderboard: string,
  benchmark: string,
  category: Category,
  modelRaw: string,
  organization: string | null,
  value: number,
  display: string,
  higherIsBetter: boolean
): void {
  const row: RawScore = {
    leaderboard,
    benchmark,
    category,
    model_raw: modelRaw,
    organization,
    value,
    display,
    higherIsBetter,
  };
  scores.push(row);
}
// ---------------------------------------------------------------------------
// Per-leaderboard parsers (one per source). Each is wrapped by runSource()
// so a single failure never aborts the snapshot.
// ---------------------------------------------------------------------------
// Description of one leaderboard source handled by runSource().
type SourceSpec = {
name: string;
url: string;
// display-name priority handed to registerModel (lower number wins)
priority: number;
// fetches and parses the source; resolves to how much was collected
run: () => Promise<{ benchmarks: number; rows: number }>;
};
// Upper bound on the number of rows each parser accepts from a single source.
const MAX_ROWS = 60;
// 1) LMArena (Chatbot Arena) -- human preference (rank-based)
/**
 * Parses the first table of the LMArena page.
 *
 * Columns are located through header text ("overall", "coding"); when no
 * overall header is found, column 1 is used. Cell 0 is expected to hold the
 * organization glued onto the model name. Both metrics are recorded with
 * higherIsBetter = false.
 *
 * @returns the number of distinct benchmarks and of accepted rows
 */
async function parseLmArena(name: string, url: string, priority: number) {
  const $ = cheerio.load(await getHtml(url));
  const table = $("table").first();
  const headerRow = table.find("thead th, tr").first();
  const headers = headerRow
    .find("th, td")
    .map((_i, el) => cleanText($(el).text()))
    .get();

  const detectedOverall = headers.findIndex((h) => /overall/i.test(h));
  const overallIdx = detectedOverall < 0 ? 1 : detectedOverall;
  const codingIdx = headers.findIndex((h) => /coding/i.test(h));

  const OVERALL = "Arena Overall (rank)";
  const CODING = "Arena Coding (rank)";
  const benchmarks = new Set<string>();
  let rows = 0;

  for (const tr of table.find("tbody tr").get()) {
    if (rows >= MAX_ROWS) break;
    const cells = $(tr)
      .find("td, th")
      .map((_j, cell) => cleanText($(cell).text()))
      .get();
    if (cells.length <= overallIdx) continue;

    const { org, model } = splitGluedOrg(cells[0]);
    if (!model) continue;
    const overall = toNumber(cells[overallIdx]);
    if (overall === null) continue;

    registerModel(model, org, priority);
    push(name, OVERALL, "human_pref", model, org, overall, cells[overallIdx], false);
    benchmarks.add(OVERALL);

    // The coding column is optional; index 0 is the model cell, never a score.
    if (codingIdx > 0 && cells.length > codingIdx) {
      const coding = toNumber(cells[codingIdx]);
      if (coding !== null) {
        push(name, CODING, "coding", model, org, coding, cells[codingIdx], false);
        benchmarks.add(CODING);
      }
    }
    rows += 1;
  }
  return { benchmarks: benchmarks.size, rows };
}
// 2) Artificial Analysis -- intelligence index + output speed
/**
 * Parses the first table of the Artificial Analysis page using fixed column
 * positions: 0 = model, 2 = organization, 3 = intelligence index,
 * 5 = output speed. Rows with fewer than six cells, no model, or no
 * intelligence value are skipped; the speed score is optional.
 *
 * @returns the number of distinct benchmarks and of accepted rows
 */
async function parseArtificialAnalysis(name: string, url: string, priority: number) {
  const $ = cheerio.load(await getHtml(url));
  const INTELLIGENCE = "Intelligence Index";
  const SPEED = "Output Speed (tok/s)";
  const benchmarks = new Set<string>();
  let rows = 0;

  for (const tr of $("table").first().find("tbody tr").get()) {
    if (rows >= MAX_ROWS) break;
    const cells = $(tr)
      .find("td, th")
      .map((_j, cell) => cleanText($(cell).text()))
      .get();
    if (cells.length < 6) continue;

    const [model, , orgCell, intelligenceCell, , speedCell] = cells;
    const org = orgCell || null;
    const intelligence = toNumber(intelligenceCell);
    const speed = toNumber(speedCell);
    if (!model || intelligence === null) continue;

    registerModel(model, org, priority);
    push(name, INTELLIGENCE, "reasoning", model, org, intelligence, intelligenceCell, true);
    benchmarks.add(INTELLIGENCE);
    if (speed !== null) {
      push(name, SPEED, "efficiency", model, org, speed, speedCell, true);
      benchmarks.add(SPEED);
    }
    rows += 1;
  }
  return { benchmarks: benchmarks.size, rows };
}
// 3) LiveBench -- multi-category averages
/**
 * Parses the first table of the LiveBench page.
 *
 * Score columns are discovered from header text; each header is assigned to
 * the first rule it matches ("agentic" is tested before "coding"). When no
 * header matches at all, a fixed layout (columns 3-6) is assumed. A row needs
 * a model in cell 0 and a number in cell 2; it counts only if at least one
 * mapped column produced a score.
 *
 * @returns the number of distinct benchmarks and of counted rows
 */
async function parseLiveBench(name: string, url: string, priority: number) {
  type Column = { idx: number; benchmark: string; category: Category };

  const $ = cheerio.load(await getHtml(url));
  const table = $("table").first();
  const headers = table
    .find("thead th, tr")
    .first()
    .find("th, td")
    .map((_i, el) => cleanText($(el).text()))
    .get();

  const headerRules: Array<{ pattern: RegExp; benchmark: string; category: Category }> = [
    { pattern: /reasoning/i, benchmark: "Reasoning Average", category: "reasoning" },
    { pattern: /agentic/i, benchmark: "Agentic Coding Average", category: "agentic" },
    { pattern: /coding/i, benchmark: "Coding Average", category: "coding" },
    { pattern: /math/i, benchmark: "Mathematics Average", category: "math" },
    { pattern: /data analysis/i, benchmark: "Data Analysis Average", category: "academic" },
    { pattern: /\bif\b|instruction/i, benchmark: "Instruction Following Average", category: "academic" },
  ];

  const colMap: Column[] = [];
  headers.forEach((header, idx) => {
    const rule = headerRules.find((r) => r.pattern.test(header));
    if (rule) colMap.push({ idx, benchmark: rule.benchmark, category: rule.category });
  });

  // Fallback to known LiveBench column order if header detection failed
  if (colMap.length === 0) {
    colMap.push(
      { idx: 3, benchmark: "Reasoning Average", category: "reasoning" },
      { idx: 4, benchmark: "Coding Average", category: "coding" },
      { idx: 5, benchmark: "Agentic Coding Average", category: "agentic" },
      { idx: 6, benchmark: "Mathematics Average", category: "math" }
    );
  }

  const benchmarks = new Set<string>();
  let rows = 0;
  for (const tr of table.find("tbody tr").get()) {
    if (rows >= MAX_ROWS) break;
    const cells = $(tr)
      .find("td, th")
      .map((_j, cell) => cleanText($(cell).text()))
      .get();
    if (cells.length < 4) continue;

    const model = cells[0].split("**")[0].trim();
    const org = cells[1] || null;
    if (!model || toNumber(cells[2]) === null) continue;

    // The model is registered even if none of its columns parse below.
    registerModel(model, org, priority);
    let scored = false;
    for (const { idx, benchmark, category } of colMap) {
      if (cells.length <= idx) continue;
      const value = toNumber(cells[idx]);
      if (value === null) continue;
      push(name, benchmark, category, model, org, value, cells[idx], true);
      benchmarks.add(benchmark);
      scored = true;
    }
    if (scored) rows += 1;
  }
  return { benchmarks: benchmarks.size, rows };
}
// 4) Aider polyglot -- coding (percent correct)
/**
 * Parses the Aider polyglot leaderboard.
 *
 * Every table row on the page is inspected. The first cell that looks like a
 * percentage is taken as the score and the cell immediately before it as the
 * model name. Each canonical model key is accepted once; later duplicates are
 * ignored. Scanning stops as soon as MAX_ROWS rows have been accepted.
 *
 * @returns the number of distinct benchmarks (0 or 1) and of accepted rows
 */
async function parseAider(name: string, url: string, priority: number) {
  const html = await getHtml(url);
  const $ = cheerio.load(html);
  const BENCHMARK = "Polyglot Percent Correct";
  const PERCENT_CELL = /^\$?\d+(\.\d+)?%$/;
  const seen = new Set<string>();
  let rows = 0;

  for (const tr of $("table tbody tr, table tr").get()) {
    if (rows >= MAX_ROWS) break;
    const cells = $(tr)
      .find("td, th")
      .map((_j, c) => cleanText($(c).text()))
      .get();
    // Real rows look like: [▶, model, "88.0%", "$29.08", command, "91.6%", "diff"]
    const pctIdx = cells.findIndex((c) => PERCENT_CELL.test(c));
    // Index 0 is rejected too: there would be no preceding cell for the model.
    if (pctIdx < 1) continue;

    const model = cleanText(cells[pctIdx - 1]);
    const pct = toNumber(cells[pctIdx]);
    if (!model || model.length > 80 || pct === null) continue;

    const key = normKey(model);
    if (!key || seen.has(key)) continue;
    seen.add(key);

    registerModel(model, null, priority);
    push(name, BENCHMARK, "coding", model, null, pct, cells[pctIdx], true);
    rows += 1;
  }
  return { benchmarks: rows > 0 ? 1 : 0, rows };
}
// 5) SWE-bench -- agentic software engineering (percent resolved)
/**
 * Parses the first table of the SWE-bench page.
 *
 * The model, resolved-percentage and organization columns are located through
 * header text. Missing model / resolved headers fall back to columns 1 and 2;
 * the organization column is optional.
 *
 * @returns the number of distinct benchmarks (0 or 1) and of accepted rows
 */
async function parseSweBench(name: string, url: string, priority: number) {
  const html = await getHtml(url);
  const $ = cheerio.load(html);
  const table = $("table").first();
  const headers = table
    .find("thead th, tr")
    .first()
    .find("th, td")
    .map((_i, e) => cleanText($(e).text()))
    .get();

  const modelHeader = headers.findIndex((h) => /model/i.test(h));
  const resolvedHeader = headers.findIndex((h) => /resolved|%/i.test(h));
  const modelIdx = modelHeader < 0 ? 1 : modelHeader;
  const resolvedIdx = resolvedHeader < 0 ? 2 : resolvedHeader;
  const orgIdx = headers.findIndex((h) => /^org/i.test(h));

  const BENCHMARK = "SWE-bench % Resolved";
  let rows = 0;
  for (const tr of table.find("tbody tr").get()) {
    if (rows >= MAX_ROWS) break;
    const cells = $(tr)
      .find("td, th")
      .map((_j, c) => cleanText($(c).text()))
      .get();
    if (cells.length <= resolvedIdx) continue;

    const model = cleanText(cells[modelIdx] || "");
    const resolved = toNumber(cells[resolvedIdx]);
    if (!model || resolved === null) continue;

    // findIndex reports "not found" as -1, so column 0 is a valid position
    // for the organization and must not be discarded.
    const org = orgIdx >= 0 && cells[orgIdx] ? cells[orgIdx] : null;
    registerModel(model, org, priority);
    push(name, BENCHMARK, "agentic", model, org, resolved, cells[resolvedIdx], true);
    rows += 1;
  }
  return { benchmarks: rows > 0 ? 1 : 0, rows };
}
// 6) Scale SEAL -- coding leaderboard (client-rendered; LLM-backed JSON mode)
// Shape requested from (and validated against) the JSON-mode scrape of the
// Scale SEAL page: a list of entries with a required model name and an
// organization / score that may each be missing or null.
const scaleSchema = z.object({
models: z.array(
z.object({
model: z.string(),
organization: z.string().nullable().optional(),
score: z.number().nullable().optional(),
})
),
});
/**
 * Collects the Scale SEAL coding leaderboard through Firecrawl's JSON format,
 * passing scaleSchema and an extraction prompt instead of parsing HTML.
 *
 * The result is validated against scaleSchema; an invalid payload yields zero
 * rows. Only the first MAX_ROWS entries are considered, and entries without a
 * model name or a numeric score are skipped.
 *
 * @returns the number of distinct benchmarks (0 or 1) and of accepted rows
 */
async function parseScale(name: string, url: string, priority: number) {
  const response = await firecrawl.scrape(url, {
    formats: [
      {
        type: "json",
        prompt:
          "Extract the leaderboard ranking table. Return an array 'models' where each entry has the model name, its organization/creator if shown, and its numeric score (the primary ranking metric).",
        schema: scaleSchema,
      },
    ],
    onlyMainContent: false,
    integration: "prometheus",
  });

  const { json } = response as { json?: unknown };
  const validated = scaleSchema.safeParse(json);
  if (!validated.success) return { benchmarks: 0, rows: 0 };

  let rows = 0;
  validated.data.models.slice(0, MAX_ROWS).forEach((entry) => {
    const { model, organization, score } = entry;
    if (!model || typeof score !== "number") return;
    const org = organization ?? null;
    registerModel(model, org, priority);
    push(name, "SEAL Coding", "coding", model, org, score, String(score), true);
    rows += 1;
  });
  return { benchmarks: rows > 0 ? 1 : 0, rows };
}
// ---------------------------------------------------------------------------
// Source registry (7 leaderboards: 6 live + HF archived)
// ---------------------------------------------------------------------------
// Signature shared by all per-leaderboard parsers.
type LeaderboardParser = (
  name: string,
  url: string,
  priority: number
) => Promise<{ benchmarks: number; rows: number }>;

// Builds a SourceSpec whose run() invokes `parser` with the spec's own
// name, url and priority, so each value is written only once.
function defineSource(
  name: string,
  url: string,
  priority: number,
  parser: LeaderboardParser
): SourceSpec {
  return { name, url, priority, run: () => parser(name, url, priority) };
}

// Live sources, in the order they are fetched.
const SOURCES: SourceSpec[] = [
  defineSource("LMArena (Chatbot Arena)", "https://lmarena.ai/leaderboard", 2, parseLmArena),
  defineSource(
    "Artificial Analysis",
    "https://artificialanalysis.ai/leaderboards/models",
    1,
    parseArtificialAnalysis
  ),
  defineSource("LiveBench", "https://livebench.ai/", 1, parseLiveBench),
  defineSource("Aider Polyglot", "https://aider.chat/docs/leaderboards/", 1, parseAider),
  defineSource("SWE-bench", "https://www.swebench.com/", 1, parseSweBench),
  defineSource("Scale SEAL", "https://scale.com/leaderboard/coding", 3, parseScale),
];

// Archived source: it is listed in the status report but never fetched.
const HF_SOURCE = {
  name: "HuggingFace Open LLM Leaderboard",
  url: "https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard",
};
/**
 * Runs one source and records its outcome in `statuses`; never rejects.
 *
 * A run that parses at least one row is reported as "live"; a run with zero
 * rows, or one that throws, is reported as "error". Thrown messages are
 * truncated to 200 characters in the status note and logged in full.
 */
async function runSource(spec: SourceSpec): Promise<void> {
  const { name, url } = spec;
  try {
    log(`Fetching ${name} ...`);
    const { rows, benchmarks } = await spec.run();
    const parsedAny = rows > 0;
    const entry: LbStatus = {
      leaderboard: name,
      url,
      status: parsedAny ? "live" : "error",
      benchmarks_fetched: benchmarks,
      rows_fetched: rows,
      note: parsedAny ? undefined : "no rows parsed",
    };
    statuses.push(entry);
    log(` ${name}: ${rows} rows / ${benchmarks} benchmarks`);
  } catch (e) {
    const msg = e instanceof Error ? e.message : String(e);
    const failure: LbStatus = {
      leaderboard: name,
      url,
      status: "error",
      benchmarks_fetched: 0,
      rows_fetched: 0,
      note: msg.slice(0, 200),
    };
    statuses.push(failure);
    log(` ${name}: ERROR ${msg}`);
  }
}
// ---------------------------------------------------------------------------
// Aggregation
// ---------------------------------------------------------------------------
// Weight of each category in the composite score. The eight weights add up to
// 1.00; compositeOf() re-normalizes over the categories a model actually has.
const CATEGORY_WEIGHTS: Record<Category, number> = {
coding: 0.22,
agentic: 0.22,
reasoning: 0.18,
human_pref: 0.12,
math: 0.1,
academic: 0.08,
efficiency: 0.05,
safety: 0.03,
};
// Every category in a fixed order; this is the iteration order used for
// composite scores, radar axes and correlation pairs.
const ALL_CATEGORIES: Category[] = [
"human_pref",
"reasoning",
"coding",
"agentic",
"math",
"efficiency",
"academic",
"safety",
];
// A raw score after scaling to 0..100 within its leaderboard + benchmark group.
type NormScore = {
// canonical model key from normKey()
key: string;
category: Category;
// 0..100, oriented so that larger is always better
normalized: number;
leaderboard: string;
benchmark: string;
};
// Normalize every raw score within its (leaderboard + benchmark) group to 0..100.
/**
 * Min-max scales every raw score to 0..100 within its
 * (leaderboard, benchmark) group.
 *
 * Lower-is-better metrics are inverted so that 100 is always the best value in
 * the group. A group whose values are all equal maps every member to 50. Rows
 * whose model name yields an empty key are dropped. Output keeps group
 * insertion order and, inside a group, the order of `scores`.
 */
function normalizeScores(): NormScore[] {
  const groups = new Map<string, RawScore[]>();
  for (const score of scores) {
    const groupId = `${score.leaderboard}::${score.benchmark}`;
    const members = groups.get(groupId);
    if (members) members.push(score);
    else groups.set(groupId, [score]);
  }

  const out: NormScore[] = [];
  groups.forEach((members) => {
    const values = members.map((m) => m.value);
    const lowest = Math.min(...values);
    const highest = Math.max(...values);
    const span = highest - lowest;

    for (const member of members) {
      const key = normKey(member.model_raw);
      if (!key) continue;

      let scaled = 50;
      if (span !== 0) {
        const distance = member.higherIsBetter ? member.value - lowest : highest - member.value;
        scaled = (distance / span) * 100;
      }
      out.push({
        key,
        category: member.category,
        normalized: round(scaled, 2),
        leaderboard: member.leaderboard,
        benchmark: member.benchmark,
      });
    }
  });
  return out;
}
// Per-model roll-up produced by aggregate().
type ModelAgg = {
// canonical model key from normKey()
key: string;
// preferred display name, or the key itself when none was registered
model: string;
provider: string | null;
// mean normalized score per category the model has data for
categoryScores: Map<Category, number>;
// names of the leaderboards the model appears on
leaderboards: Set<string>;
// "leaderboard::benchmark" identifiers the model has a score for
benchmarks: Set<string>;
};
/**
 * Rolls normalized scores up into one record per model.
 *
 * For each model this collects the leaderboards and benchmarks it appears on
 * and the mean normalized score per category (rounded to two decimals).
 * Models are returned in order of first appearance in `norm`.
 */
function aggregate(norm: NormScore[]): ModelAgg[] {
  const models = new Map<string, ModelAgg>();
  const samples = new Map<string, Map<Category, number[]>>();

  for (const { key, category, normalized, leaderboard, benchmark } of norm) {
    let entry = models.get(key);
    if (entry === undefined) {
      entry = {
        key,
        model: displayNames.get(key) ?? key,
        provider: providers.get(key) ?? inferProvider(key),
        categoryScores: new Map(),
        leaderboards: new Set(),
        benchmarks: new Set(),
      };
      models.set(key, entry);
      samples.set(key, new Map());
    }
    entry.leaderboards.add(leaderboard);
    entry.benchmarks.add(`${leaderboard}::${benchmark}`);

    const perCategory = samples.get(key)!;
    const bucket = perCategory.get(category);
    if (bucket) bucket.push(normalized);
    else perCategory.set(category, [normalized]);
  }

  // Second pass: collapse each category's samples into their mean.
  for (const [key, perCategory] of samples) {
    const entry = models.get(key)!;
    perCategory.forEach((values, category) => {
      let total = 0;
      for (const v of values) total += v;
      entry.categoryScores.set(category, round(total / values.length, 2));
    });
  }
  return Array.from(models.values());
}
/**
 * Weighted mean of a model's category scores.
 *
 * Only categories the model has a score for take part, and the weights of
 * those categories are re-normalized, so missing categories do not pull the
 * composite down.
 *
 * @returns the composite rounded to two decimals, or 0 when the total weight
 *          of the covered categories is 0
 */
function compositeOf(agg: ModelAgg, weights: Record<Category, number>): number {
  let weightedTotal = 0;
  let weightTotal = 0;
  ALL_CATEGORIES.forEach((cat) => {
    const score = agg.categoryScores.get(cat);
    if (score === undefined) return;
    const weight = weights[cat] ?? 0;
    weightedTotal += score * weight;
    weightTotal += weight;
  });
  return weightTotal === 0 ? 0 : round(weightedTotal / weightTotal, 2);
}
// ---------------------------------------------------------------------------
// Task routing (emphasis on coding / agentic work)
// ---------------------------------------------------------------------------
// Category weights per routing task. Each profile lists only the categories
// that matter for that task; scoreForProfile() re-normalizes over the ones a
// model actually has scores for.
const TASK_PROFILES: Record<string, Partial<Record<Category, number>>> = {
coding: { coding: 0.6, agentic: 0.25, reasoning: 0.15 },
agentic_workflows: { agentic: 0.6, coding: 0.25, reasoning: 0.15 },
reasoning_research: { reasoning: 0.5, math: 0.3, academic: 0.2 },
math: { math: 0.7, reasoning: 0.3 },
general_chat: { human_pref: 0.6, reasoning: 0.2, coding: 0.2 },
cost_efficient_coding: { efficiency: 0.5, coding: 0.3, agentic: 0.2 },
};
/**
 * Scores a model against one task profile.
 *
 * Computes the weighted mean of the model's category scores over the profile
 * categories it has data for, re-normalizing the weights of those categories.
 *
 * @returns the score rounded to two decimals, or null when the model covers
 *          none of the profile's categories (or their weights sum to 0)
 */
function scoreForProfile(agg: ModelAgg, profile: Partial<Record<Category, number>>): number | null {
  let weightedTotal = 0;
  let weightTotal = 0;
  let coveredAny = false;
  for (const [cat, weight] of Object.entries(profile) as Array<[Category, number | undefined]>) {
    const score = agg.categoryScores.get(cat);
    if (score === undefined) continue;
    const w = weight ?? 0;
    weightedTotal += score * w;
    weightTotal += w;
    coveredAny = true;
  }
  if (!coveredAny || weightTotal === 0) return null;
  return round(weightedTotal / weightTotal, 2);
}
// ---------------------------------------------------------------------------
// Visualization helpers
// ---------------------------------------------------------------------------
/**
 * Pearson correlation coefficient of a list of (x, y) pairs.
 *
 * The coefficient is computed from mean-centred sums, which avoids the
 * cancellation error of the raw sum-of-squares formula, and is clamped to
 * [-1, 1] before rounding.
 *
 * @param pairs the paired observations
 * @returns r rounded to three decimals, or null when there are fewer than
 *          three pairs or when either variable is constant
 */
function pearson(pairs: Array<[number, number]>): number | null {
  const n = pairs.length;
  if (n < 3) return null;
  const first = pairs[0];
  if (first === undefined) return null;

  // A constant variable has zero variance and no defined correlation. Testing
  // this exactly keeps rounding residue from being mistaken for variance.
  const [x0, y0] = first;
  const xConstant = pairs.every(([x]) => x === x0);
  const yConstant = pairs.every(([, y]) => y === y0);
  if (xConstant || yConstant) return null;

  let sumX = 0;
  let sumY = 0;
  for (const [x, y] of pairs) {
    sumX += x;
    sumY += y;
  }
  const meanX = sumX / n;
  const meanY = sumY / n;

  let sxx = 0;
  let syy = 0;
  let sxy = 0;
  for (const [x, y] of pairs) {
    const dx = x - meanX;
    const dy = y - meanY;
    sxx += dx * dx;
    syy += dy * dy;
    sxy += dx * dy;
  }

  const denom = Math.sqrt(sxx * syy);
  // Also rejects NaN, which a plain `=== 0` comparison would let through.
  if (!(denom > 0)) return null;

  const r = Math.max(-1, Math.min(1, sxy / denom));
  return Math.round(r * 1000) / 1000;
}
// ---------------------------------------------------------------------------
// Main
// ---------------------------------------------------------------------------
/**
 * Entry point: fetches every source, aggregates the scores and writes the
 * snapshot as a single JSON document on stdout.
 *
 * Steps: run each entry of SOURCES, record the archived HuggingFace source,
 * normalize and aggregate the raw scores, build the composite ranking (models
 * seen on at least two leaderboards), the per-task routing lists and the
 * visualization payload, then serialize everything.
 */
async function main(): Promise<void> {
  // Sources run one at a time, in SOURCES order. Model registration is
  // first-come within a priority level, so this order influences display names.
  for (const spec of SOURCES) {
    await runSource(spec);
  }

  // HuggingFace Open LLM Leaderboard: archived, no fresh data published.
  statuses.push({
    leaderboard: HF_SOURCE.name,
    url: HF_SOURCE.url,
    status: "archived_no_fresh_data",
    benchmarks_fetched: 0,
    rows_fetched: 0,
    note: "Open LLM Leaderboard was archived; no fresh evaluations are published.",
  });

  const norm = normalizeScores();
  const aggs = aggregate(norm);

  // Composite ranking: models present on at least 2 leaderboards.
  // Ties are broken by key so the ranking is deterministic.
  const ranked = aggs
    .filter((a) => a.leaderboards.size >= 2)
    .map((a) => ({
      agg: a,
      composite: compositeOf(a, CATEGORY_WEIGHTS),
    }))
    .sort((a, b) => b.composite - a.composite || a.agg.key.localeCompare(b.agg.key));

  const models = ranked.map((r, i) => {
    const catScores: Record<string, number> = {};
    for (const cat of ALL_CATEGORIES) {
      const s = r.agg.categoryScores.get(cat);
      if (s !== undefined) catScores[cat] = s;
    }
    return {
      rank: i + 1,
      model: r.agg.model,
      provider: r.agg.provider,
      key: r.agg.key,
      composite: r.composite,
      coverage: {
        leaderboards: r.agg.leaderboards.size,
        benchmarks: r.agg.benchmarks.size,
        categories: r.agg.categoryScores.size,
      },
      category_scores: catScores,
    };
  });

  // Task routing: best models per task profile (top 8 each). Unlike the
  // composite ranking, a single leaderboard appearance is enough here.
  const taskRouting: Record<
    string,
    Array<{ rank: number; model: string; provider: string | null; score: number }>
  > = {};
  for (const [task, profile] of Object.entries(TASK_PROFILES)) {
    const scored = aggs
      .map((a) => ({ a, s: scoreForProfile(a, profile) }))
      .filter((x): x is { a: ModelAgg; s: number } => x.s !== null && x.a.leaderboards.size >= 1)
      .sort((x, y) => y.s - x.s || x.a.key.localeCompare(y.a.key))
      .slice(0, 8);
    taskRouting[task] = scored.map((x, i) => ({
      rank: i + 1,
      model: x.a.model,
      provider: x.a.provider,
      score: x.s,
    }));
  }

  // --- Visualization data ---

  // Rank deltas: composite rank vs human-preference rank.
  // The human-preference rank is taken over every model with a human_pref
  // score, not only over the models in the composite ranking.
  const humanPref = aggs
    .filter((a) => a.categoryScores.has("human_pref"))
    .sort(
      (a, b) =>
        (b.categoryScores.get("human_pref") ?? 0) - (a.categoryScores.get("human_pref") ?? 0) ||
        a.key.localeCompare(b.key)
    );
  const humanPrefRankByKey = new Map<string, number>();
  humanPref.forEach((a, i) => humanPrefRankByKey.set(a.key, i + 1));

  const rankDeltas = models
    .filter((m) => humanPrefRankByKey.has(m.key))
    .map((m) => {
      const hp = humanPrefRankByKey.get(m.key)!;
      return {
        model: m.model,
        provider: m.provider,
        composite_rank: m.rank,
        human_pref_rank: hp,
        delta: hp - m.rank, // positive: composite ranks higher than human preference
      };
    })
    .sort((a, b) => Math.abs(b.delta) - Math.abs(a.delta) || a.model.localeCompare(b.model))
    .slice(0, 20);

  // Radar: per-category normalized score for the top models; a category
  // without data is emitted as null.
  const radar = models.slice(0, 10).map((m) => {
    const axes: Record<string, number | null> = {};
    for (const cat of ALL_CATEGORIES) {
      axes[cat] = m.category_scores[cat] ?? null;
    }
    return { model: m.model, provider: m.provider, axes };
  });

  // Correlations: Pearson between category score vectors across shared models.
  const catVecByModel = new Map<string, Map<Category, number>>();
  for (const a of aggs) catVecByModel.set(a.key, a.categoryScores);
  const correlations: Array<{ a: Category; b: Category; r: number; n: number }> = [];
  for (let i = 0; i < ALL_CATEGORIES.length; i++) {
    for (let j = i + 1; j < ALL_CATEGORIES.length; j++) {
      const ca = ALL_CATEGORIES[i];
      const cb = ALL_CATEGORIES[j];
      const pairs: Array<[number, number]> = [];
      for (const [, cv] of catVecByModel) {
        const va = cv.get(ca);
        const vb = cv.get(cb);
        if (va !== undefined && vb !== undefined) pairs.push([va, vb]);
      }
      const r = pearson(pairs);
      if (r !== null) correlations.push({ a: ca, b: cb, r, n: pairs.length });
    }
  }
  correlations.sort((x, y) => Math.abs(y.r) - Math.abs(x.r));

  // Time-series seed: today's composite for the top models (first data point).
  const timeSeriesSeed = models.slice(0, 15).map((m) => ({
    date: SNAPSHOT_DATE,
    model: m.model,
    provider: m.provider,
    composite: m.composite,
  }));

  // Normalized raw scores for transparency.
  // normalizeScores() keeps the relative order of `scores` inside each
  // (leaderboard, benchmark) group, so the normalized values queued per
  // (key, leaderboard, benchmark) can be consumed in `scores` order. This pairs
  // every raw row with its own value even when several raw names collapse onto
  // the same key, and replaces a linear search per row.
  const normQueues = new Map<string, number[]>();
  for (const n of norm) {
    const id = `${n.key}::${n.leaderboard}::${n.benchmark}`;
    const queue = normQueues.get(id);
    if (queue) queue.push(n.normalized);
    else normQueues.set(id, [n.normalized]);
  }
  const normCursor = new Map<string, number>();
  const normalizedScores = scores.map((s) => {
    const key = normKey(s.model_raw);
    const id = `${key}::${s.leaderboard}::${s.benchmark}`;
    const position = normCursor.get(id) ?? 0;
    normCursor.set(id, position + 1);
    // Rows with an empty key were never normalized and therefore stay null.
    const normalized = normQueues.get(id)?.[position] ?? null;
    return {
      leaderboard: s.leaderboard,
      benchmark: s.benchmark,
      category: s.category,
      model: displayNames.get(key) ?? s.model_raw,
      key,
      provider: providers.get(key) ?? inferProvider(key),
      value: s.value,
      display: s.display,
      higher_is_better: s.higherIsBetter,
      normalized,
    };
  });

  const liveCount = statuses.filter((s) => s.status === "live").length;
  const output = {
    snapshot_date: SNAPSHOT_DATE,
    summary: {
      leaderboards_total: statuses.length,
      leaderboards_live: liveCount,
      leaderboards_archived: statuses.filter((s) => s.status === "archived_no_fresh_data").length,
      leaderboards_error: statuses.filter((s) => s.status === "error").length,
      raw_score_rows: normalizedScores.length,
      ranked_models: models.length,
    },
    category_weights: CATEGORY_WEIGHTS,
    leaderboards: statuses,
    normalized_scores: normalizedScores,
    composite_rankings: models,
    task_routing: taskRouting,
    visualization: {
      rank_deltas: rankDeltas,
      radar,
      correlations,
      time_series_seed: timeSeriesSeed,
    },
  };
  process.stdout.write(JSON.stringify(output));
}
// Last-resort handler: logs the failure and still emits an envelope with the
// same top-level shape as a successful run, with all computed sections empty.
main().catch((e: unknown) => {
  const msg = e instanceof Error ? e.stack || e.message : String(e);
  log("FATAL " + msg);

  // Count statuses as they were actually recorded, so the summary agrees with
  // the `leaderboards` array emitted next to it.
  const countWithStatus = (status: LbStatus["status"]): number =>
    statuses.filter((s) => s.status === status).length;

  // Emit a minimal valid JSON envelope so downstream consumers never break.
  process.stdout.write(
    JSON.stringify({
      snapshot_date: SNAPSHOT_DATE,
      summary: {
        leaderboards_total: statuses.length,
        leaderboards_live: countWithStatus("live"),
        leaderboards_archived: countWithStatus("archived_no_fresh_data"),
        leaderboards_error: countWithStatus("error"),
        raw_score_rows: 0,
        ranked_models: 0,
      },
      category_weights: CATEGORY_WEIGHTS,
      leaderboards: statuses,
      normalized_scores: [],
      composite_rankings: [],
      task_routing: {},
      visualization: { rank_deltas: [], radar: [], correlations: [], time_series_seed: [] },
    })
  );
});
Deploy this collector to unlock schedules, the API endpoint, and destinations.