Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 7 additions & 0 deletions lib/braintrust/api.rb
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@

require_relative "api/datasets"
require_relative "api/functions"
require_relative "api/btql"

module Braintrust
# API client for Braintrust REST API
Expand Down Expand Up @@ -42,5 +43,11 @@ def login
def object_permalink(object_type:, object_id:)
@state.object_permalink(object_type: object_type, object_id: object_id)
end

# Access to BTQL API
# @return [API::BTQL]
def btql
@btql ||= API::BTQL.new(self)
end
end
end
86 changes: 86 additions & 0 deletions lib/braintrust/api/btql.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,86 @@
# frozen_string_literal: true

require "net/http"
require "json"
require "uri"
require_relative "../logger"

module Braintrust
  class API
    # BTQL API namespace
    # Provides methods for querying spans and other data using BTQL.
    # Requests are authenticated with the API key held by the client's state.
    class BTQL
      # @param api [Braintrust::API] parent API client; its state supplies
      #   the base URL and API key used for every request
      def initialize(api)
        @api = api
        @state = api.state
      end

      # Query spans using BTQL
      # POST /btql
      # @param query [Hash] AST-based query filter
      # @param object_type [String] Type of object (e.g., "experiment")
      # @param object_id [String] Object ID
      # @param fmt [String] Response format (default: "jsonl")
      # @return [Hash] Response with :body (raw response body string) and
      #   :freshness_state (from the x-bt-freshness-state header, defaulting
      #   to "complete" when the header is absent)
      def query(query:, object_type:, object_id:, fmt: "jsonl")
        payload = {
          query: query,
          object_type: object_type,
          object_id: object_id,
          fmt: fmt
        }

        response = http_post_json_raw("/btql", payload)

        {
          body: response.body,
          freshness_state: response["x-bt-freshness-state"] || "complete"
        }
      end

      private

      # Core HTTP request method (copied from datasets.rb pattern).
      # @param method [Symbol] :get or :post
      # @param path [String] path appended to the base URL
      # @param params [Hash] query-string parameters
      # @param payload [Object, nil] JSON-serializable request body (POST only)
      # @param base_url [String, nil] override for @state.api_url
      # @param parse_json [Boolean] when true, return the parsed body; when
      #   false, return the raw Net::HTTPResponse
      # @raise [Error] on any non-2xx response
      def http_request(method, path, params: {}, payload: nil, base_url: nil, parse_json: true)
        base = base_url || @state.api_url
        uri = URI("#{base}#{path}")
        uri.query = URI.encode_www_form(params) unless params.empty?

        request = build_request(method, uri, payload)
        request["Authorization"] = "Bearer #{@state.api_key}"

        # Monotonic clock: immune to wall-clock adjustments while timing.
        start_time = Process.clock_gettime(Process::CLOCK_MONOTONIC)
        Log.debug("[API] #{method.upcase} #{uri}")

        http = Net::HTTP.new(uri.host, uri.port)
        http.use_ssl = (uri.scheme == "https")
        response = http.request(request)

        duration_ms = ((Process.clock_gettime(Process::CLOCK_MONOTONIC) - start_time) * 1000).round(2)
        # Some responses (e.g. 204 No Content, HEAD) have a nil body; guard
        # before calling bytesize so logging never raises.
        body_bytes = response.body ? response.body.bytesize : 0
        Log.debug("[API] #{method.upcase} #{uri} -> #{response.code} (#{duration_ms}ms, #{body_bytes} bytes)")

        unless response.is_a?(Net::HTTPSuccess)
          Log.debug("[API] Error response body: #{response.body}")
          raise Error, "HTTP #{response.code} for #{method.upcase} #{uri}: #{response.body}"
        end

        parse_json ? JSON.parse(response.body) : response
      end

      # Build the Net::HTTP request object for the given verb.
      # POST requests are sent as JSON; other verbs besides :get are rejected.
      def build_request(method, uri, payload)
        case method
        when :get
          Net::HTTP::Get.new(uri)
        when :post
          req = Net::HTTP::Post.new(uri)
          req["Content-Type"] = "application/json"
          req.body = JSON.dump(payload) if payload
          req
        else
          raise ArgumentError, "Unsupported HTTP method: #{method}"
        end
      end

      # POST JSON and return the raw Net::HTTPResponse (no body parsing).
      def http_post_json_raw(path, payload)
        http_request(:post, path, payload: payload, parse_json: false)
      end
    end
  end
end
40 changes: 25 additions & 15 deletions lib/braintrust/eval.rb
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@

require_relative "eval/scorer"
require_relative "eval/runner"
require_relative "eval/context"
require_relative "api/internal/projects"
require_relative "api/internal/experiments"
require_relative "dataset"
Expand Down Expand Up @@ -249,23 +250,32 @@ def run(project:, experiment:, task:, scorers:,
project_id = project_result["id"]
project_name = project_result["name"]

# Instantiate Runner and run evaluation
runner = Runner.new(
experiment_id: experiment_id,
experiment_name: experiment,
project_id: project_id,
project_name: project_name,
task: task,
scorers: scorers,
api: api,
tracer_provider: tracer_provider
)
result = runner.run(cases, parallelism: parallelism)
# Create evaluation context
eval_context = Eval::Context.new(experiment_id: experiment_id)

begin
# Instantiate Runner and run evaluation
runner = Runner.new(
experiment_id: experiment_id,
experiment_name: experiment,
project_id: project_id,
project_name: project_name,
task: task,
scorers: scorers,
api: api,
tracer_provider: tracer_provider,
eval_context: eval_context
)
result = runner.run(cases, parallelism: parallelism)

# Print result summary unless quiet
print_result(result) unless quiet
# Print result summary unless quiet
print_result(result) unless quiet

result
result
ensure
# Dispose evaluation context
eval_context&.dispose
end
end

private
Expand Down
23 changes: 23 additions & 0 deletions lib/braintrust/eval/context.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
# frozen_string_literal: true

module Braintrust
  module Eval
    # Scoped context for a single evaluation run.
    # Owns the lifecycle of eval-specific resources: created when an eval
    # starts and disposed when it finishes.
    class Context
      # ID of the experiment this evaluation run belongs to.
      attr_reader :experiment_id

      # @param experiment_id [String] The experiment ID
      def initialize(experiment_id:)
        @experiment_id = experiment_id
      end

      # Release any resources held by this context (eager cleanup hook).
      # Intentionally a no-op today; retained so callers have a stable
      # cleanup entry point as resources are added in the future.
      def dispose
        # nothing to release yet
      end
    end
  end
end
35 changes: 30 additions & 5 deletions lib/braintrust/eval/runner.rb
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
require_relative "result"
require_relative "summary"
require_relative "../internal/thread_pool"
require_relative "../trace_context"

require "opentelemetry/sdk"
require "json"
Expand All @@ -18,7 +19,7 @@ class Runner
MAX_PARALLELISM = Internal::ThreadPool::MAX_PARALLELISM

def initialize(experiment_id:, experiment_name:, project_id:, project_name:,
task:, scorers:, api:, tracer_provider: nil)
task:, scorers:, api:, tracer_provider: nil, eval_context: nil)
@experiment_id = experiment_id
@experiment_name = experiment_name
@project_id = project_id
Expand All @@ -29,6 +30,7 @@ def initialize(experiment_id:, experiment_name:, project_id:, project_name:,
@tracer_provider = tracer_provider || OpenTelemetry.tracer_provider
@tracer = @tracer_provider.tracer("braintrust-eval")
@parent_attr = "experiment_id:#{experiment_id}"
@eval_context = eval_context

# Mutex for thread-safe score collection
@score_mutex = Mutex.new
Expand Down Expand Up @@ -103,8 +105,11 @@ def run_case(test_case, errors)
end

# Run scorers
# Create TraceContext for scorers (if scorers exist)
trace = scorers.empty? ? nil : create_trace_context(eval_span)
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

When would scorers not exist? (I would think that if you run an Eval you would want to score it?)

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Might not be using local code scorers though, right? Could just be getting scored in our backend.


begin
run_scorers(test_case, output)
run_scorers(test_case, output, trace)
rescue => e
# Error already recorded on score span, set eval span status
eval_span.status = OpenTelemetry::Trace::Status.error(e.message)
Expand Down Expand Up @@ -149,15 +154,16 @@ def run_task(test_case)
# Creates single score span for all scorers
# @param test_case [Case] The test case
# @param output [Object] Task output
def run_scorers(test_case, output)
# @param trace [TraceContext, nil] Optional trace context for scorers
def run_scorers(test_case, output, trace = nil)
tracer.in_span("score") do |score_span|
score_span.set_attribute("braintrust.parent", parent_attr)
set_json_attr(score_span, "braintrust.span_attributes", {type: "score"})
set_json_attr(score_span, "braintrust.span_attributes", {type: "score", purpose: "scorer"})

scores = {}
scorer_error = nil
scorers.each do |scorer|
score_value = scorer.call(test_case.input, test_case.expected, output, test_case.metadata || {})
score_value = scorer.call(test_case.input, test_case.expected, output, test_case.metadata || {}, trace)
scores[scorer.name] = score_value

# Collect raw score for summary (thread-safe)
Expand Down Expand Up @@ -239,6 +245,25 @@ def collect_score(name, value)
(@scores[name] ||= []) << value
end
end

# Create a TraceContext for scorers to access span data
# @param eval_span [OpenTelemetry::Trace::Span] The eval span
# @return [TraceContext, nil] TraceContext if eval_context present, nil otherwise
def create_trace_context(eval_span)
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I don't think this belongs here: Eval::Runner concerns running evals, not building trace contexts.

# Skip if no eval_context (e.g., in tests)
return nil unless @eval_context

# Extract root_span_id from the eval span's trace_id
root_span_id = eval_span.context.trace_id.unpack1("H*")

TraceContext.new(
object_type: "experiment",
object_id: experiment_id,
root_span_id: root_span_id,
state: @api.state,
ensure_spans_flushed: -> { @tracer_provider.force_flush }
)
end
end
end
end
28 changes: 18 additions & 10 deletions lib/braintrust/eval/scorer.rb
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,8 @@
module Braintrust
module Eval
# Scorer wraps a scoring function that evaluates task output against expected values
# Scorers can accept 3 params (input, expected, output) or 4 params (input, expected, output, metadata)
# Scorers can accept 3 params (input, expected, output), 4 params (input, expected, output, metadata),
# or 5 params (input, expected, output, metadata, trace)
# They can return a float, hash, or array of hashes
class Scorer
attr_reader :name
Expand Down Expand Up @@ -43,9 +44,10 @@ def initialize(name_or_callable = nil, callable = nil, &block)
# @param expected [Object] The expected output
# @param output [Object] The actual output from the task
# @param metadata [Hash] Optional metadata
# @param trace [TraceContext, nil] Optional trace context
# @return [Float, Hash, Array] Score value(s)
def call(input, expected, output, metadata = {})
@wrapped_callable.call(input, expected, output, metadata)
def call(input, expected, output, metadata = {}, trace = nil)
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why does a scorer take a trace context?

@wrapped_callable.call(input, expected, output, metadata, trace)
end

private
Expand All @@ -68,25 +70,31 @@ def detect_name(callable)
"scorer"
end

# Wrap the callable to always accept 4 parameters
# Wrap the callable to always accept 5 parameters
# @param callable [#call] The callable to wrap
# @return [Proc] Wrapped callable that accepts 4 params
# @return [Proc] Wrapped callable that accepts 5 params
def wrap_callable(callable)
arity = callable_arity(callable)

case arity
when 3
# Callable takes 3 params - wrap to ignore metadata
->(input, expected, output, metadata) {
# Callable takes 3 params - wrap to ignore metadata and trace
->(input, expected, output, metadata, trace) {
callable.call(input, expected, output)
}
when 4, -4, -1
# Callable takes 4 params (or variadic with 4+)
when 4, -4
# Callable takes 4 params - wrap to ignore trace
# -4 means optional 4th param
->(input, expected, output, metadata, trace) {
callable.call(input, expected, output, metadata)
}
when 5, -5, -1
# Callable takes 5 params (or variadic with 5+)
# -5 means optional 5th param
# -1 means variadic (*args)
callable
else
raise ArgumentError, "Scorer must accept 3 or 4 parameters (got arity #{arity})"
raise ArgumentError, "Scorer must accept 3, 4, or 5 parameters (got arity #{arity})"
end
end

Expand Down
Loading