Genkit MCP

by firebase
evaluator.go (11.4 kB)
// Copyright 2025 Google LLC
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//	http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
// SPDX-License-Identifier: Apache-2.0

package ai

import (
	"context"
	"fmt"

	"github.com/firebase/genkit/go/core"
	"github.com/firebase/genkit/go/core/api"
	"github.com/firebase/genkit/go/core/logger"
	"github.com/firebase/genkit/go/core/tracing"
	"github.com/google/uuid"
	"go.opentelemetry.io/otel/trace"
)

// EvaluatorFunc is the function type for evaluator implementations.
type EvaluatorFunc = func(context.Context, *EvaluatorCallbackRequest) (*EvaluatorCallbackResponse, error)

// BatchEvaluatorFunc is the function type for batch evaluator implementations.
type BatchEvaluatorFunc = func(context.Context, *EvaluatorRequest) (*EvaluatorResponse, error)

// Evaluator represents an evaluator action.
type Evaluator interface {
	// Name returns the name of the evaluator.
	Name() string
	// Evaluate evaluates a dataset.
	Evaluate(ctx context.Context, req *EvaluatorRequest) (*EvaluatorResponse, error)
	// Register registers the evaluator with the given registry.
	Register(r api.Registry)
}

// EvaluatorArg is the interface for evaluator arguments. It can either be
// the evaluator action itself or a reference to be looked up.
type EvaluatorArg interface {
	Name() string
}

// EvaluatorRef is a struct to hold evaluator name and configuration.
type EvaluatorRef struct {
	name   string
	config any
}

// NewEvaluatorRef creates a new EvaluatorRef with the given name and configuration.
func NewEvaluatorRef(name string, config any) EvaluatorRef {
	return EvaluatorRef{name: name, config: config}
}

// Name returns the name of the evaluator.
func (e EvaluatorRef) Name() string {
	return e.name
}

// Config returns the configuration to use by default for this evaluator.
func (e EvaluatorRef) Config() any {
	return e.config
}

// evaluator is an action with functions specific to evaluating a dataset.
type evaluator struct {
	core.ActionDef[*EvaluatorRequest, *EvaluatorResponse, struct{}]
}

// Example is a single example that requires evaluation.
type Example struct {
	TestCaseId string   `json:"testCaseId,omitempty"`
	Input      any      `json:"input"`
	Output     any      `json:"output,omitempty"`
	Context    []any    `json:"context,omitempty"`
	Reference  any      `json:"reference,omitempty"`
	TraceIds   []string `json:"traceIds,omitempty"`
}

// EvaluatorRequest is the data we pass to evaluate a dataset.
// The Options field is specific to the actual evaluator implementation.
type EvaluatorRequest struct {
	Dataset      []*Example `json:"dataset"`
	EvaluationId string     `json:"evalRunId"`
	Options      any        `json:"options,omitempty"`
}

// ScoreStatus is an enum used to indicate if a Score has passed or failed. This
// drives additional features in tooling / the Dev UI.
type ScoreStatus int

const (
	ScoreStatusUnknown ScoreStatus = iota
	ScoreStatusFail
	ScoreStatusPass
)

var statusName = map[ScoreStatus]string{
	ScoreStatusUnknown: "UNKNOWN",
	ScoreStatusFail:    "FAIL",
	ScoreStatusPass:    "PASS",
}

func (ss ScoreStatus) String() string {
	return statusName[ss]
}

// Score is the evaluation score that represents the result of an evaluator.
// This struct includes information such as the score (numeric, string or other
// types), the reasoning provided for this score (if any), the score status (if
// any) and other details.
type Score struct {
	Id      string         `json:"id,omitempty"`
	Score   any            `json:"score,omitempty"`
	Status  string         `json:"status,omitempty" jsonschema:"enum=UNKNOWN,enum=FAIL,enum=PASS"`
	Error   string         `json:"error,omitempty"`
	Details map[string]any `json:"details,omitempty"`
}

// EvaluationResult is the result of running the evaluator on a single Example.
// An evaluator may provide multiple scores simultaneously (e.g. if it is using
// an API to score on multiple criteria).
type EvaluationResult struct {
	TestCaseId string  `json:"testCaseId"`
	TraceID    string  `json:"traceId,omitempty"`
	SpanID     string  `json:"spanId,omitempty"`
	Evaluation []Score `json:"evaluation"`
}

// EvaluatorResponse is a collection of [EvaluationResult] structs; it
// represents the result for the entire input dataset.
type EvaluatorResponse = []EvaluationResult

// EvaluatorOptions configures a new evaluator.
type EvaluatorOptions struct {
	// ConfigSchema is the JSON schema for the evaluator's config.
	ConfigSchema map[string]any `json:"configSchema,omitempty"`
	// DisplayName is the name of the evaluator as it appears in the UI.
	DisplayName string `json:"displayName"`
	// Definition is the definition of the evaluator.
	Definition string `json:"definition"`
	// IsBilled is a flag indicating if the evaluator is billed.
	IsBilled bool `json:"isBilled,omitempty"`
}

// EvaluatorCallbackRequest is the data we pass to the callback function
// provided in [DefineEvaluator]. The Options field is specific to the actual
// evaluator implementation.
type EvaluatorCallbackRequest struct {
	Input   Example `json:"input"`
	Options any     `json:"options,omitempty"`
}

// EvaluatorCallbackResponse is the result of evaluating a single [Example].
type EvaluatorCallbackResponse = EvaluationResult

// NewEvaluator creates a new [Evaluator].
// This method processes the input dataset one-by-one.
func NewEvaluator(name string, opts *EvaluatorOptions, fn EvaluatorFunc) Evaluator {
	if name == "" {
		panic("ai.NewEvaluator: evaluator name is required")
	}
	if opts == nil {
		opts = &EvaluatorOptions{}
	}
	// TODO(ssbushi): Set this on `evaluator` key on action metadata
	metadata := map[string]any{
		"type": api.ActionTypeEvaluator,
		"evaluator": map[string]any{
			"evaluatorIsBilled":    opts.IsBilled,
			"evaluatorDisplayName": opts.DisplayName,
			"evaluatorDefinition":  opts.Definition,
		},
	}
	inputSchema := core.InferSchemaMap(EvaluatorRequest{})
	if inputSchema != nil && opts.ConfigSchema != nil {
		if props, ok := inputSchema["properties"].(map[string]any); ok {
			props["options"] = opts.ConfigSchema
		}
	}
	return &evaluator{
		ActionDef: *core.NewAction(name, api.ActionTypeEvaluator, metadata, inputSchema,
			func(ctx context.Context, req *EvaluatorRequest) (output *EvaluatorResponse, err error) {
				var results []EvaluationResult
				for _, datapoint := range req.Dataset {
					if datapoint.TestCaseId == "" {
						datapoint.TestCaseId = uuid.New().String()
					}
					spanMetadata := &tracing.SpanMetadata{
						Name:    fmt.Sprintf("TestCase %s", datapoint.TestCaseId),
						Type:    "evaluator",
						Subtype: "evaluator",
					}
					_, err := tracing.RunInNewSpan(ctx, spanMetadata, datapoint,
						func(ctx context.Context, input *Example) (*EvaluatorCallbackResponse, error) {
							traceId := trace.SpanContextFromContext(ctx).TraceID().String()
							spanId := trace.SpanContextFromContext(ctx).SpanID().String()
							callbackRequest := EvaluatorCallbackRequest{
								Input:   *input,
								Options: req.Options,
							}
							result, err := fn(ctx, &callbackRequest)
							if err != nil {
								failedScore := Score{
									Status: ScoreStatusFail.String(),
									Error:  fmt.Sprintf("Evaluation of test case %s failed: \n %s", input.TestCaseId, err.Error()),
								}
								failedResult := EvaluationResult{
									TestCaseId: input.TestCaseId,
									Evaluation: []Score{failedScore},
									TraceID:    traceId,
									SpanID:     spanId,
								}
								results = append(results, failedResult)
								// return error to mark span as failed
								return nil, err
							}
							result.TraceID = traceId
							result.SpanID = spanId
							results = append(results, *result)
							return result, nil
						})
					if err != nil {
						logger.FromContext(ctx).Debug("EvaluatorAction", "err", err)
						continue
					}
				}
				return &results, nil
			}),
	}
}

// DefineEvaluator creates a new [Evaluator] and registers it.
// This method processes the input dataset one-by-one.
func DefineEvaluator(r api.Registry, name string, opts *EvaluatorOptions, fn EvaluatorFunc) Evaluator {
	e := NewEvaluator(name, opts, fn)
	e.Register(r)
	return e
}

// NewBatchEvaluator creates a new [Evaluator].
// This method provides the full [EvaluatorRequest] to the callback function,
// giving more flexibility to the user for processing the data, such as
// batching or parallelization.
func NewBatchEvaluator(name string, opts *EvaluatorOptions, fn BatchEvaluatorFunc) Evaluator {
	if name == "" {
		panic("ai.NewBatchEvaluator: batch evaluator name is required")
	}
	if opts == nil {
		opts = &EvaluatorOptions{}
	}
	metadata := map[string]any{
		"type": api.ActionTypeEvaluator,
		"evaluator": map[string]any{
			"evaluatorIsBilled":    opts.IsBilled,
			"evaluatorDisplayName": opts.DisplayName,
			"evaluatorDefinition":  opts.Definition,
		},
	}
	return &evaluator{
		ActionDef: *core.NewAction(name, api.ActionTypeEvaluator, metadata, nil, fn),
	}
}

// DefineBatchEvaluator creates a new [Evaluator] and registers it.
// This method provides the full [EvaluatorRequest] to the callback function,
// giving more flexibility to the user for processing the data, such as
// batching or parallelization.
func DefineBatchEvaluator(r api.Registry, name string, opts *EvaluatorOptions, fn BatchEvaluatorFunc) Evaluator {
	e := NewBatchEvaluator(name, opts, fn)
	e.Register(r)
	return e
}

// LookupEvaluator looks up an [Evaluator] registered by [DefineEvaluator].
// It returns nil if the evaluator was not defined.
func LookupEvaluator(r api.Registry, name string) Evaluator {
	action := core.ResolveActionFor[*EvaluatorRequest, *EvaluatorResponse, struct{}](r, api.ActionTypeEvaluator, name)
	if action == nil {
		return nil
	}
	return &evaluator{
		ActionDef: *action,
	}
}

// Evaluate runs the given [Evaluator].
func (e *evaluator) Evaluate(ctx context.Context, req *EvaluatorRequest) (*EvaluatorResponse, error) {
	if e == nil {
		return nil, core.NewError(core.INVALID_ARGUMENT, "Evaluator.Evaluate: evaluator called on a nil evaluator; check that all evaluators are defined")
	}
	return e.Run(ctx, req, nil)
}

// Evaluate runs the evaluator selected by the provided options.
func Evaluate(ctx context.Context, r api.Registry, opts ...EvaluatorOption) (*EvaluatorResponse, error) {
	evalOpts := &evaluatorOptions{}
	for _, opt := range opts {
		if err := opt.applyEvaluator(evalOpts); err != nil {
			return nil, err
		}
	}
	if evalOpts.Evaluator == nil {
		return nil, fmt.Errorf("ai.Evaluate: evaluator must be set")
	}
	e, ok := evalOpts.Evaluator.(Evaluator)
	if !ok {
		e = LookupEvaluator(r, evalOpts.Evaluator.Name())
	}
	if e == nil {
		return nil, fmt.Errorf("ai.Evaluate: evaluator not found: %s", evalOpts.Evaluator.Name())
	}
	if evalRef, ok := evalOpts.Evaluator.(EvaluatorRef); ok && evalOpts.Config == nil {
		evalOpts.Config = evalRef.Config()
	}
	req := &EvaluatorRequest{
		Dataset:      evalOpts.Dataset,
		EvaluationId: evalOpts.ID,
		Options:      evalOpts.Config,
	}
	return e.Evaluate(ctx, req)
}
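
For context, here is a minimal sketch of how the per-example API above might be used. It builds an evaluator with NewEvaluator and runs it directly via its Evaluate method; the evaluator name, the exact-match scoring logic, and the sample dataset are illustrative, not part of this file, and running the action outside a full Genkit initialization is assumed to work here for demonstration purposes.

package main

import (
	"context"
	"fmt"
	"log"
	"reflect"

	"github.com/firebase/genkit/go/ai"
)

func main() {
	ctx := context.Background()

	// Per-example evaluator: the framework walks the dataset and invokes
	// this callback once per Example, wrapping each call in its own span
	// and filling in missing TestCaseIds.
	exactMatch := ai.NewEvaluator("custom/exact_match", // hypothetical name
		&ai.EvaluatorOptions{
			DisplayName: "Exact Match",
			Definition:  "Passes when the output deep-equals the reference.",
		},
		func(ctx context.Context, req *ai.EvaluatorCallbackRequest) (*ai.EvaluatorCallbackResponse, error) {
			status := ai.ScoreStatusFail
			if reflect.DeepEqual(req.Input.Output, req.Input.Reference) {
				status = ai.ScoreStatusPass
			}
			// TraceID and SpanID are attached by the wrapper; the callback
			// only fills in the test case id and its scores.
			return &ai.EvaluatorCallbackResponse{
				TestCaseId: req.Input.TestCaseId,
				Evaluation: []ai.Score{{
					Score:  status == ai.ScoreStatusPass,
					Status: status.String(),
				}},
			}, nil
		})

	resp, err := exactMatch.Evaluate(ctx, &ai.EvaluatorRequest{
		Dataset: []*ai.Example{
			{Input: "2+2", Output: "4", Reference: "4"},
			{Input: "3+3", Output: "7", Reference: "6"},
		},
		EvaluationId: "demo-run",
	})
	if err != nil {
		log.Fatal(err)
	}
	for _, result := range *resp {
		fmt.Println(result.TestCaseId, result.Evaluation[0].Status)
	}
}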
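The batch variant hands the whole EvaluatorRequest to the callback, so the implementation can score the dataset in one pass or fan work out concurrently. Again a hedged sketch under the same assumptions; the evaluator name and scoring rule are made up for illustration.

package example

import (
	"context"

	"github.com/firebase/genkit/go/ai"
)

// defineLengthCheck builds a batch evaluator: the callback receives the full
// EvaluatorRequest and iterates (or parallelizes over) the dataset itself.
func defineLengthCheck() ai.Evaluator {
	return ai.NewBatchEvaluator("custom/length_check", // hypothetical name
		&ai.EvaluatorOptions{
			DisplayName: "Length Check",
			Definition:  "Fails examples whose output is empty.",
		},
		func(ctx context.Context, req *ai.EvaluatorRequest) (*ai.EvaluatorResponse, error) {
			var results ai.EvaluatorResponse
			for _, ex := range req.Dataset {
				status := ai.ScoreStatusPass
				if s, ok := ex.Output.(string); !ok || s == "" {
					status = ai.ScoreStatusFail
				}
				results = append(results, ai.EvaluationResult{
					TestCaseId: ex.TestCaseId,
					Evaluation: []ai.Score{{Status: status.String()}},
				})
			}
			return &results, nil
		})
}

Note that, per the source above, the batch path passes the callback straight through to the action: unlike the per-example path, it does not generate missing TestCaseIds or open per-test-case spans, so a batch implementation is responsible for that bookkeeping itself.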
