// v0.0.2 — updated 7 days ago (stray registry metadata; kept as a comment so the module parses)
// deno-lint-ignore-file no-unversioned-import no-import-prefix
import ollama from "npm:ollama";

import { SYSTEM_PROMPT } from "./system_prompt.ts";
import { QUESTIONS } from "./questions.ts";
import { get_vram, get_vram_auto } from "./vram.ts";

// const LOG_FULL = Deno.args.includes('--full-output');

// region Helpers

// Number of questions fully answered so far; drives the Q# labels and the final QPM score.
let questions_answered = 0;
// System VRAM reported by vram.ts; multiplied into the final score.
// NOTE(review): units are presumably GB (the report text mentions "8GB") — confirm against vram.ts.
let SYSTEM_VRAM: number;

// Timestamp (ms) when the first answer stream began; null until the first chat call resolves.
let first_answer_time: number | null = null;
// Handle of the 5-minute benchmark cutoff timer; null until the first chat call resolves.
let end_timeout: number | null = null;

// Arms the one-shot benchmark state the first time an answer starts streaming:
// schedules the 5-minute cutoff timer and records the timestamp used for the
// QPM math. Safe to call on every question — each branch fires at most once.
const first_run = () => {
  const FIVE_MINUTES_MS = 1000 * 60 * 5;

  if (end_timeout === null) {
    console.log('OLLAMA initialized. Starting benchmark timer of 5 minutes');
    console.log('');
    end_timeout = setTimeout(end_benchmark, FIVE_MINUTES_MS);
  }

  if (first_answer_time === null) {
    // +1 because the current question has already been shift()ed off QUESTIONS.
    console.log(`Running through list of ${QUESTIONS.length + 1} questions`);
    console.log('');
    first_answer_time = Date.now();
  }
}

const end_benchmark = () => {
  const time_taken_seconds = (Date.now() - first_answer_time!) / 1000;
  const qpm = questions_answered / (time_taken_seconds / 60);
  console.log(`\n\nBenchmark completed. Answered %c${questions_answered}%c questions in %c${time_taken_seconds.toFixed(2)}%c seconds.`, 'color: orange', '', 'color: orange', '');
  console.log(`Questions per minute: ${qpm}`);
  console.log('');
  console.log('This benchmark only determines the performance of a single LLM instance. Each instance runs at full speed with 8GB of VRAM.');
  console.log(`The score of the benchmark on this system will be ${(qpm * SYSTEM_VRAM).toFixed(2)} (Assuming VRAM = ${SYSTEM_VRAM})`);
  console.log('%cScore = average questions per minute, including thinking and responding time', 'color: gray');
  console.log('%cScore is raw LLM power, and does not use external tools for more accurate benchmark metrics', 'color: gray');
  Deno.exit(0);
}

// Entry point: resolves system VRAM (auto-detect first, interactive fallback
// in a retry loop), then drains the question list and prints the report.
const run_benchmark = async () => {
  SYSTEM_VRAM = (await get_vram_auto())!;

  let warned = false;
  while (!SYSTEM_VRAM) {
    console.clear();
    console.log(`Steward LLM Benchmark v0.1\n`);
    if (!warned) {
      console.log('Could not automatically get system VRAM information.');
      warned = true;
    }
    SYSTEM_VRAM = get_vram()!;
  }

  console.log('');

  // QUESTIONS is consumed (shift()ed) inside next_question, so this drains it.
  while (QUESTIONS.length > 0) {
    await next_question();
  }
  end_benchmark();
}

// region LLM Setup

// Model under test. NOTE(review): presumably chosen to fit the 8GB VRAM
// budget mentioned in the final report — confirm.
const LLM_MODEL = 'qwen3:8b';

// Per-question streaming phases. A const object + literal-union type replaces
// the former numeric enum (numeric enums carry a surprising reverse mapping);
// member names and values are unchanged, so all existing uses still compile.
const QuestionState = {
  STARTED: 0,
  THINKING: 1,
  RESPONDING: 2,
  DONE: 3,
} as const;
type QuestionState = typeof QuestionState[keyof typeof QuestionState];

// Wall-clock start (ms) of the current timing step. Module-level on purpose:
// the next call closes out the previous question's trailing "CLEANING..." line.
let step_start = 0;
// Streams one question (shift()ed off QUESTIONS) through the model, printing
// THINKING / RESPONDING progress with per-step timings.
// NOTE(review): the Deno.stdout.write() promises below are never awaited;
// short status strings appear to interleave fine, but ordering is not
// strictly guaranteed — confirm this is acceptable.
const next_question = async () => {
  const question = QUESTIONS.shift()!;

  const stream = await ollama.chat({
    model: LLM_MODEL,
    messages: [{
      role: 'system',
      content: SYSTEM_PROMPT
    }, {
      role: 'user',
      content: question
    }],
    tools: [],
    think: true, // request a separate "thinking" stream from the model
    keep_alive: 0, // unload the model as soon as the reply completes
    stream: true,
    options: {
      seed: 0, // fixed seed + zero temperature for reproducible runs
      temperature: 0,
    }
  })

  // Arms the 5-minute timer / start timestamp; no-op after the first question.
  first_run();

  let state: QuestionState = QuestionState.STARTED;
  let doneThinking = false;

  const encoder = new TextEncoder();

  if(questions_answered > 0) {
    // Closes out the previous question's "CLEANING..." line. step_start still
    // holds the previous question's THINKING start, so "seconds total" spans
    // thinking + responding + cleanup (plus this question's chat setup).
    // NOTE(review): confirm that is the intended meaning of "total".
    const step_time = Date.now() - step_start;
    Deno.stdout.write(encoder.encode(`  DONE (${(step_time/1000).toFixed(2)} seconds total)\n`))
  }

  // Zero-padded question counter, e.g. "Q#001: ...".
  console.log(`Q#${(questions_answered+1).toString().padStart(3, '0')}: ${question}`);

  for await (const chunk of stream) {
    if(chunk.message.thinking) {
      // First thinking token: start the step timer and print the phase label.
      if(state !== QuestionState.THINKING) {
        step_start = Date.now();
        Deno.stdout.write(encoder.encode("       '- THINKING... "));
        // if(LOG_FULL) console.log('');
        state = QuestionState.THINKING;
      }
      // if(LOG_FULL) Deno.stdout.write(encoder.encode(chunk.message.thinking));
    }
    if(chunk.message.content) {
      if(state !== QuestionState.RESPONDING) {
        if(state == QuestionState.THINKING) {
          // Close the THINKING line with its elapsed time.
          const step_time = Date.now() - step_start;
          Deno.stdout.write(encoder.encode(`  DONE (${(step_time/1000).toFixed(2)} seconds elapsed)\n`))
        }

        // NOTE(review): step_start is NOT reset here, so the RESPONDING
        // "DONE (... seconds elapsed)" below measures time since THINKING
        // began, not responding time alone — confirm intended.
        Deno.stdout.write(encoder.encode("       '- RESPONDING... "));
        // if(LOG_FULL) console.log('');
        state = QuestionState.RESPONDING;
      }
      // NOTE(review): redundant — the branch above already set state to
      // RESPONDING, and doneThinking is only read here. Looks like dead logic.
      if(!doneThinking) {
        doneThinking = true;
        state = QuestionState.RESPONDING;
      }
      // if(LOG_FULL) Deno.stdout.write(encoder.encode(chunk.message.content));
    }
  }
  if(state == QuestionState.RESPONDING) {
    // Close the RESPONDING line (see NOTE above about what this measures).
    const step_time = Date.now() - step_start;
    Deno.stdout.write(encoder.encode(`DONE (${(step_time/1000).toFixed(2)} seconds elapsed)\n`))
  }

  state = QuestionState.DONE; // NOTE(review): assigned but never read afterwards
  questions_answered++;

  // Left open on purpose: the next call (or process exit) closes this line.
  Deno.stdout.write(encoder.encode("       '- CLEANING... "));
}

// region Run

// Kick off the benchmark (top-level await; never returns — end_benchmark exits).
await run_benchmark();