(EAI-897): Run full NL-to-mongosh benchmark + prompt maxxing #677

Status: Open. Wants to merge 73 commits into base: main.

Commits (73)
5568f43
init draft
Feb 24, 2025
686f3c6
add db node
Feb 24, 2025
775a727
Merge remote-tracking branch 'upstream/main' into tree_of_generation
Feb 26, 2025
d9a7ae7
NL generation hack
Feb 26, 2025
6262034
modularize prompt components
Feb 26, 2025
c001709
it compiles...but does it run? to be continued...
Feb 28, 2025
45cf169
start on annotating db
Mar 3, 2025
153c6a5
checkpoint before merge
Mar 4, 2025
e4f175f
Merge remote-tracking branch 'upstream/main' into EAI-894
Mar 4, 2025
d290074
push models to core
Mar 4, 2025
1d26140
checkpoint functional
Mar 4, 2025
03a18e2
working
Mar 4, 2025
3ecfaa5
sample outputs
Mar 4, 2025
5a06c83
checkpoint much working
Mar 5, 2025
dfad31f
tests passing for makeGenerateChildrenWithOpenAi
Mar 5, 2025
65b96fd
kinda works. nuff for today
Mar 5, 2025
f31896c
code execution nodes
Mar 6, 2025
9e3a9f5
working well e2e
Mar 7, 2025
8349446
clean up PR code
Mar 7, 2025
e322a1b
remove artifact outputs
Mar 7, 2025
7510882
update gen script
Mar 10, 2025
47dc9c5
skip tests in CI
Mar 10, 2025
29f518d
Fix build errr
Mar 11, 2025
c145269
fix test build err redux
Mar 11, 2025
cb5aeec
mostly works!
Mar 13, 2025
8e40f2d
Add metadata to queries
Mar 14, 2025
53fba7b
most of fuzzy match clustering
Mar 14, 2025
47688e6
clustering and filtering logic
Mar 24, 2025
c111508
standardize dataset entry
Mar 24, 2025
6ea7c3d
truncated arrays include values at end
Mar 25, 2025
a433a6d
better handling db output for LLM
Mar 25, 2025
ceb561f
truncate + agent mode
Mar 25, 2025
e1a2e82
working better
Mar 25, 2025
038a734
table res
Mar 25, 2025
73df5a5
upload to BT & prompt refinements
Mar 26, 2025
7bebe94
Fix ts err
Mar 26, 2025
9845a16
aggregate the good ones from the jump
Mar 26, 2025
29b2a29
var name update
Mar 26, 2025
d393bf0
dataset analysis
Mar 27, 2025
8ad5dec
implement nl feedback
Mar 28, 2025
1fafbef
Merge remote-tracking branch 'upstream/main' into EAI-895
Mar 28, 2025
585cccc
Merge remote-tracking branch 'upstream/main' into EAI-895
Mar 28, 2025
1843988
fix build err
Mar 28, 2025
58bb298
remove unused file
Mar 28, 2025
41f00a2
fix build errs
Mar 28, 2025
5c4578b
Remove filter datasets
Mar 28, 2025
0bb5e82
set repo up for multi nl to code evals
Apr 2, 2025
9cac4ef
working agentic task
Apr 2, 2025
665a6a6
add is reasonable check
Apr 4, 2025
6166249
working better yet!
Apr 4, 2025
5a08676
Add simplified document schema helper
Apr 7, 2025
6f4701a
Add simplified document schema helper
Apr 7, 2025
11e6ba7
Remove console log
Apr 7, 2025
f80998f
filter down dataset
Apr 8, 2025
16d2346
most things mostly work
Apr 8, 2025
8af5322
upload curr dataset to braintrust
Apr 8, 2025
94cd461
concurrent experiment execution
Apr 8, 2025
d52704f
daze work
Apr 9, 2025
fbaa330
upload filtered
Apr 10, 2025
31ad0c9
support gemini in benchmark
Apr 10, 2025
fce1fec
Merge remote-tracking branch 'upstream/main' into EAI-896
Apr 10, 2025
22ac523
Fix build errs
Apr 10, 2025
7329353
fix build err in test
Apr 10, 2025
02a209e
fix build err again
Apr 10, 2025
db4e836
agentic edits
Apr 10, 2025
cf0b8d2
more prompt maxxing
Apr 11, 2025
704d30f
modularize experiments
Apr 14, 2025
163933f
prep official benchmark
Apr 14, 2025
a8b6751
latest
Apr 15, 2025
7df269d
add gpt 4.1 models
Apr 15, 2025
da0c98e
Fix closure on tool call
Apr 16, 2025
a438a6f
Merge remote-tracking branch 'upstream/main' into EAI-897
Apr 17, 2025
77ef8b8
fix build errs
Apr 17, 2025
@@ -1,10 +1,10 @@
-import { models } from "mongodb-rag-core/models";
+import { getOpenAiEndpointAndApiKey, models } from "mongodb-rag-core/models";
 import "dotenv/config";
 import PromisePool from "@supercharge/promise-pool";
 import { runQuizQuestionEval } from "./QuizQuestionEval";
 import { getQuizQuestionEvalCasesFromBraintrust } from "./getQuizQuestionEvalCasesFromBraintrust";
 import { mongoDbQuizQuestionExamples } from "./mongoDbQuizQuestionExamples";
-import { openAiClientFactory } from "../openAiClients";
+import { OpenAI } from "mongodb-rag-core/openai";

 async function main() {
   const DEFAULT_MAX_CONCURRENCY = 15;
@@ -46,7 +46,9 @@ async function main() {
   await runQuizQuestionEval({
     projectName,
     model: modelInfo.deployment,
-    openaiClient: openAiClientFactory.makeOpenAiClient(modelInfo),
+    openaiClient: new OpenAI({
+      ...(await getOpenAiEndpointAndApiKey(modelInfo)),
+    }),
     experimentName,
     additionalMetadata: {
       ...modelInfo,
@@ -0,0 +1,15 @@
import { assertEnvVars, BRAINTRUST_ENV_VARS } from "mongodb-rag-core";
import { getBraintrustExperimentSummary } from "./getBraintrustExperimentSummary";

describe.skip("getBraintrustExperimentSummary", () => {
  it("should return the experiment summary", async () => {
    const { BRAINTRUST_API_KEY } = assertEnvVars(BRAINTRUST_ENV_VARS);
    const result = await getBraintrustExperimentSummary({
      experimentName:
        "mongosh-benchmark-official?experimentType=agentic&model=gemini-2.5-pro-preview-03-25",
      projectName: "natural-language-to-mongosh",
      apiKey: BRAINTRUST_API_KEY,
    });
    expect(result).toBeDefined();
  });
});
packages/benchmarks/src/reporting/getBraintrustExperimentSummary.ts (237 additions, 0 deletions)
@@ -0,0 +1,237 @@
import { init } from "mongodb-rag-core/braintrust";

export interface GetBraintrustExperimentSummary {
  experimentName: string;
  projectName: string;
  apiKey: string;
}

export async function getBraintrustExperimentSummary({
  projectName,
  experimentName,
  apiKey,
}: GetBraintrustExperimentSummary): Promise<unknown> {
  const experiment = await init(projectName, {
    experiment: experimentName,
    apiKey,
    open: true,
  });
  const id = await experiment.id;

  const metadata = (await fetch(
    `https://api.braintrust.dev/v1/experiment/${id}`,
    {
      headers: {
        Authorization: `Bearer ${apiKey}`,
      },
    }
  ).then((res) => res.json())) as GetExperimentMetadataResponse;

  const summary = (await fetch(
    `https://api.braintrust.dev/v1/experiment/${id}/summarize?summarize_scores=true&comparison_experiment_id=${id}`,
    {
      headers: {
        Authorization: `Bearer ${apiKey}`,
      },
    }
  ).then((res) => res.json())) as GetExperimentSummaryResponse;

  return { metadata, summary };
}

// ---
// Types from the Braintrust API docs
// ---

/**
  Metadata about the state of the repo when the experiment was created
 */
export type RepoInfo = {
  /**
    SHA of most recent commit
   */
  commit?: string | null;
  /**
    Name of the branch the most recent commit belongs to
   */
  branch?: string | null;
  /**
    Name of the tag on the most recent commit
   */
  tag?: string | null;
  /**
    Whether or not the repo had uncommitted changes when snapshotted
   */
  dirty?: boolean | null;
  /**
    Name of the author of the most recent commit
   */
  author_name?: string | null;
  /**
    Email of the author of the most recent commit
   */
  author_email?: string | null;
  /**
    Most recent commit message
   */
  commit_message?: string | null;
  /**
    Time of the most recent commit
   */
  commit_time?: string | null;
  /**
    If the repo was dirty when run, this includes the diff between the current state of the repo and the most recent commit.
   */
  git_diff?: string | null;
};

export interface GetExperimentMetadataResponse {
  /**
    Unique identifier for the experiment
   */
  id: string;
  /**
    Unique identifier for the project that the experiment belongs under
   */
  project_id: string;
  /**
    Name of the experiment. Within a project, experiment names are unique
   */
  name: string;
  /**
    Textual description of the experiment
   */
  description?: string | null;
  /**
    Date of experiment creation
   */
  created?: string | null;
  repo_info?: RepoInfo;
  /**
    Commit, taken directly from `repo_info.commit`
   */
  commit?: string | null;
  /**
    Id of default base experiment to compare against when viewing this experiment
   */
  base_exp_id?: string | null;
  /**
    Date of experiment deletion, or null if the experiment is still active
   */
  deleted_at?: string | null;
  /**
    Identifier of the linked dataset, or null if the experiment is not linked to a dataset
   */
  dataset_id?: string | null;
  /**
    Version number of the linked dataset the experiment was run against. This can be used to reproduce the experiment after the dataset has been modified.
   */
  dataset_version?: string | null;
  /**
    Whether or not the experiment is public. Public experiments can be viewed by anybody inside or outside the organization
   */
  public: boolean;
  /**
    Identifies the user who created the experiment
   */
  user_id?: string | null;
  /**
    User-controlled metadata about the experiment
   */
  metadata?: {
    [k: string]: {
      [k: string]: unknown;
    };
  } | null;
}

/**
  Summary of an experiment
 */
export interface GetExperimentSummaryResponse {
  /**
    Name of the project that the experiment belongs to
   */
  project_name: string;
  /**
    Name of the experiment
   */
  experiment_name: string;
  /**
    URL to the project's page in the Braintrust app
   */
  project_url: string;
  /**
    URL to the experiment's page in the Braintrust app
   */
  experiment_url: string;
  /**
    The experiment which scores are baselined against
   */
  comparison_experiment_name?: string | null;
  /**
    Summary of the experiment's scores
   */
  scores?: {
    [k: string]: ScoreSummary;
  } | null;
  /**
    Summary of the experiment's metrics
   */
  metrics?: {
    [k: string]: MetricSummary;
  } | null;
}

/**
  Summary of a score's performance
 */
export interface ScoreSummary {
  /**
    Name of the score
   */
  name: string;
  /**
    Average score across all examples
   */
  score: number;
  /**
    Difference in score between the current and comparison experiment
   */
  diff?: number;
  /**
    Number of improvements in the score
   */
  improvements: number;
  /**
    Number of regressions in the score
   */
  regressions: number;
}

/**
  Summary of a metric's performance
 */
export interface MetricSummary {
  /**
    Name of the metric
   */
  name: string;
  /**
    Average metric across all examples
   */
  metric: number;
  /**
    Unit label for the metric
   */
  unit: string;
  /**
    Difference in metric between the current and comparison experiment
   */
  diff?: number;
  /**
    Number of improvements in the metric
   */
  improvements: number;
  /**
    Number of regressions in the metric
   */
  regressions: number;
}
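As a usage sketch: the `ScoreSummary` map returned under `scores` could be rendered into a short benchmark report. The helper and the score name `CorrectOutputFuzzy` below are hypothetical (not part of this PR), shown only to illustrate consuming the added types:

```typescript
// Hypothetical helper: format Braintrust score summaries as report lines.
// Mirrors the ScoreSummary shape added in this PR.
interface ScoreSummary {
  name: string;
  score: number; // average score across all examples, 0..1
  diff?: number; // delta vs. comparison experiment, 0..1
  improvements: number;
  regressions: number;
}

function formatScores(scores: Record<string, ScoreSummary>): string[] {
  return Object.values(scores).map((s) => {
    // Render the 0..1 average as a right-aligned percentage.
    const pct = (s.score * 100).toFixed(1).padStart(5);
    // Only show the delta when a comparison experiment was set.
    const diff =
      s.diff !== undefined
        ? ` (${s.diff >= 0 ? "+" : ""}${(s.diff * 100).toFixed(1)}%)`
        : "";
    return `${s.name}: ${pct}%${diff} [+${s.improvements}/-${s.regressions}]`;
  });
}

// Example with a made-up scorer name and values:
const lines = formatScores({
  CorrectOutputFuzzy: {
    name: "CorrectOutputFuzzy",
    score: 0.82,
    diff: 0.05,
    improvements: 12,
    regressions: 3,
  },
});
```

Here `lines[0]` comes out as `"CorrectOutputFuzzy:  82.0% (+5.0%) [+12/-3]"`, one line per scorer, which is convenient to log after each benchmark run.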