Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 9 additions & 0 deletions core/application/application.go
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@ import (
"github.com/mudler/LocalAI/core/services/nodes"
"github.com/mudler/LocalAI/core/services/routing/admission"
"github.com/mudler/LocalAI/core/services/routing/billing"
"github.com/mudler/LocalAI/core/services/routing/corpus"
"github.com/mudler/LocalAI/core/services/routing/pii"
"github.com/mudler/LocalAI/core/services/routing/piidetector"
"github.com/mudler/LocalAI/core/services/routing/router"
Expand Down Expand Up @@ -74,6 +75,8 @@ type Application struct {
mitmHostConflicts atomic.Pointer[map[string][]string]
routerDecisions router.DecisionStore
routerRegistry *router.Registry
routerCorpus *corpus.Manager
routerCorpusOnce sync.Once
admissionLimiter *admission.Limiter
watchdogMutex sync.Mutex
watchdogStop chan bool
Expand Down Expand Up @@ -524,6 +527,12 @@ func (a *Application) start() error {
assistantClient.PIIRedactor = a.piiRedactor
assistantClient.PIIEvents = a.piiEvents
assistantClient.RouterDecisions = a.routerDecisions
// Router corpus tools — same factories the RouteModel middleware
// uses, so the assistant and the request path agree on store
// namespaces and model resolution.
assistantClient.RouterCorpus = a.RouterCorpus()
assistantClient.RouterEmbedder = a.Embedder
assistantClient.RouterVectorStore = a.VectorStore
if err := holder.Initialize(a.applicationConfig.Context, assistantClient, localaitools.Options{}); err != nil {
// Why log+continue instead of fail: the assistant is an optional
// feature; a failure here must not take down the whole server.
Expand Down
14 changes: 14 additions & 0 deletions core/application/router_factories.go
Original file line number Diff line number Diff line change
@@ -1,11 +1,14 @@
package application

import (
"cmp"
"context"
"fmt"
"path/filepath"

"github.com/mudler/LocalAI/core/backend"
"github.com/mudler/LocalAI/core/config"
"github.com/mudler/LocalAI/core/services/routing/corpus"
)

// adapterConfig resolves a model name to its runtime ModelConfig, or nil when
Expand Down Expand Up @@ -118,3 +121,14 @@ func (l *lazyEmbedder) Embed(ctx context.Context, text string) ([]float32, error
func (a *Application) VectorStore(storeName string) backend.VectorStore {
	// Build the store from this application's model loader and config;
	// storeName is passed through as the store's namespace.
	vs := backend.NewVectorStore(a.modelLoader, a.applicationConfig, storeName)
	return vs
}

// RouterCorpus hands out the single corpus manager owned by this
// Application, creating it on first use. The manager is rooted at
// <state dir>/router-corpus, where the state dir is the first non-empty
// of DataPath, DynamicConfigsDir and "." (the current directory).
func (a *Application) RouterCorpus() *corpus.Manager {
	build := func() {
		cfg := a.applicationConfig
		// cmp.Or yields the first non-zero argument, giving the
		// DataPath → DynamicConfigsDir → "." precedence.
		stateDir := cmp.Or(cfg.DataPath, cfg.DynamicConfigsDir, ".")
		corpusDir := filepath.Join(stateDir, "router-corpus")
		a.routerCorpus = corpus.NewManager(corpusDir)
	}
	// The Once guard means every caller receives the same manager.
	a.routerCorpusOnce.Do(build)
	return a.routerCorpus
}
93 changes: 91 additions & 2 deletions core/backend/stores.go
Original file line number Diff line number Diff line change
Expand Up @@ -14,13 +14,23 @@ import (
)

// VectorStore is the narrowed KNN store used by the router's embedding
// cache and the KNN classifier.
type VectorStore interface {
// Search returns the top-1 match: its cosine similarity (in [-1, 1])
// and the serialised payload, or ok=false on a clean miss.
Search(ctx context.Context, vec []float32) (similarity float64, payload []byte, ok bool, err error)
// SearchK returns up to k nearest neighbours ordered by descending
// similarity; an empty slice is a clean miss.
SearchK(ctx context.Context, vec []float32, k int) ([]Neighbor, error)
// Insert adds vec together with its serialised payload to the store.
Insert(ctx context.Context, vec []float32, payload []byte) error
}

// Neighbor is one SearchK result — the stored payload and its cosine
// similarity to the query vector.
type Neighbor struct {
// Similarity is the cosine similarity between the stored entry and
// the query vector.
Similarity float64
// Payload is the serialised payload stored alongside the entry.
Payload []byte
}

// NewVectorStore returns a VectorStore backed by the local-store
// gRPC backend, namespaced by storeName so two routers don't collide.
func NewVectorStore(loader *model.ModelLoader, appConfig *config.ApplicationConfig, storeName string) VectorStore {
Expand Down Expand Up @@ -63,6 +73,35 @@ func (s *localVectorStore) Search(ctx context.Context, vec []float32) (sim float
return float64(similarities[0]), values[0], true, nil
}

// SearchK asks the backing store for up to k entries nearest to vec and
// returns them as Neighbor values, in the order the backend reports them.
//
// It returns (nil, nil) on a clean miss (the backend found no values) and
// a wrapped error when the backend cannot be loaded, the find call fails,
// or the backend reports fewer similarities than values. Every call is
// traced under the "search" operation with the outcome and, on a hit, the
// similarity of the first neighbour.
func (s *localVectorStore) SearchK(ctx context.Context, vec []float32, k int) (neighbors []Neighbor, err error) {
	start := time.Now()
	outcome := "hit"
	sim := 0.0
	// err is the named result, so the deferred trace sees whichever error
	// the function ends up returning.
	defer func() {
		s.recordTrace(start, "search", len(vec), sim, outcome, err)
	}()

	be, berr := s.backend(ctx)
	if berr != nil {
		outcome = "backend_load_error"
		return nil, fmt.Errorf("vector store load: %w", berr)
	}

	_, values, similarities, ferr := store.Find(ctx, be, vec, k)
	if ferr != nil {
		outcome = "find_error"
		return nil, fmt.Errorf("vector store find: %w", ferr)
	}
	if len(values) == 0 {
		outcome = "miss"
		return nil, nil
	}
	// Each value is paired with the similarity at the same index. A short
	// similarity slice would otherwise panic with an index-out-of-range
	// in the loop below, so surface it as a find error instead.
	if len(similarities) < len(values) {
		outcome = "find_error"
		return nil, fmt.Errorf("vector store find: got %d similarities for %d values", len(similarities), len(values))
	}

	neighbors = make([]Neighbor, 0, len(values))
	for i, v := range values {
		neighbors = append(neighbors, Neighbor{Similarity: float64(similarities[i]), Payload: v})
	}
	sim = neighbors[0].Similarity
	return neighbors, nil
}

func (s *localVectorStore) Insert(ctx context.Context, vec []float32, payload []byte) (err error) {
start := time.Now()
outcome := "ok"
Expand All @@ -81,6 +120,56 @@ func (s *localVectorStore) Insert(ctx context.Context, vec []float32, payload []
return nil
}

// InsertBatch writes all vecs with their payloads through a single
// store.SetCols call (one gRPC round-trip). It is deliberately not part
// of the VectorStore interface: the corpus manager type-asserts for it
// and falls back to per-entry Insert on stores that lack it.
//
// The call is traced as "insert_batch"; the traced dimension is the
// length of the first vector, or 0 for an empty batch.
func (s *localVectorStore) InsertBatch(ctx context.Context, vecs [][]float32, payloads [][]byte) (err error) {
	var (
		began  = time.Now()
		result = "ok"
		width  int
	)
	if len(vecs) != 0 {
		width = len(vecs[0])
	}
	// err is the named result, so the trace records the returned error.
	defer func() {
		s.recordTrace(began, "insert_batch", width, 0, result, err)
	}()

	client, loadErr := s.backend(ctx)
	if loadErr != nil {
		result = "backend_load_error"
		return fmt.Errorf("vector store load: %w", loadErr)
	}

	setErr := store.SetCols(ctx, client, vecs, payloads)
	if setErr != nil {
		result = "insert_error"
		return setErr
	}
	return nil
}

// Delete drops the entries keyed by vecs from the store via
// store.DeleteCols. Like InsertBatch it is an optional capability
// outside the VectorStore interface; the corpus manager's Clear uses it
// so a wiped corpus also leaves the live index.
//
// The call is traced as "delete"; the traced dimension is the length of
// the first vector, or 0 when vecs is empty.
func (s *localVectorStore) Delete(ctx context.Context, vecs [][]float32) (err error) {
	began := time.Now()
	result := "ok"
	var width int
	if len(vecs) != 0 {
		width = len(vecs[0])
	}
	// err is the named result, so the trace records the returned error.
	defer func() {
		s.recordTrace(began, "delete", width, 0, result, err)
	}()

	client, loadErr := s.backend(ctx)
	if loadErr != nil {
		result = "backend_load_error"
		return fmt.Errorf("vector store load: %w", loadErr)
	}

	// The backend's delete error is passed through unwrapped.
	if err = store.DeleteCols(ctx, client, vecs); err != nil {
		result = "delete_error"
	}
	return err
}

// recordTrace surfaces vector-store calls in /api/backend-traces, including
// the backend-load-failure path that otherwise vanishes into an xlog.Warn.
// modelName uses the store namespace (e.g. "router-cache-smart-router") so
Expand Down
49 changes: 47 additions & 2 deletions core/config/meta/registry.go
Original file line number Diff line number Diff line change
Expand Up @@ -801,17 +801,19 @@ func DefaultRegistry() map[string]FieldMetaOverride {
"router.classifier": {
Section: "router",
Label: "Classifier",
Description: "Picks a candidate by scoring every policy label against the prompt. Only \"score\" is shipped today; it asks the classifier_model to rank each label and reads off the softmax. Empty defaults to \"score\".",
Description: "How the router picks labels for a prompt. \"score\" asks the classifier_model to rank each policy label and reads off the softmax; \"colbert\" reranks policy descriptions against the prompt via a reranker model; \"knn\" votes over a curated corpus of labelled example prompts (seeded via the corpus API) and routes to the fallback when the prompt is unlike all corpus entries. Empty defaults to \"score\".",
Component: "select",
Options: []FieldOption{
{Value: "score", Label: "Score (Arch-Router-style)"},
{Value: "colbert", Label: "Colbert (reranker)"},
{Value: "knn", Label: "KNN (labelled corpus)"},
},
Order: 230,
},
"router.classifier_model": {
Section: "router",
Label: "Classifier Model",
Description: "Loaded LocalAI model the score classifier asks to rank each policy label as a continuation. Must support the Score gRPC primitive (today: llama-cpp, vLLM) and use the ChatML template. Arch-Router-1.5B Q4_K_M is the canonical choice; any small ChatML instruct model also works at a higher activation_threshold.",
Description: "Loaded LocalAI model the score classifier asks to rank each policy label as a continuation (for colbert: the reranker model). Must support the Score gRPC primitive (today: llama-cpp, vLLM) and use the ChatML template. Arch-Router-1.5B Q4_K_M is the canonical choice; any small ChatML instruct model also works at a higher activation_threshold. Not used by the knn classifier.",
Component: "model-select",
AutocompleteProvider: ProviderModelsScore,
Order: 231,
Expand Down Expand Up @@ -903,5 +905,48 @@ func DefaultRegistry() map[string]FieldMetaOverride {
Component: "input",
Order: 240,
},
"router.knn.embedding_model": {
Section: "router",
Label: "KNN: Embedding Model",
Description: "Embedding model the knn classifier uses for corpus entries and incoming prompts. Required when classifier is \"knn\". Changing it invalidates stored vectors — entries recorded under a different embedder are re-embedded on load. nomic-embed-text-v1.5 is the recommended default.",
Component: "model-select",
AutocompleteProvider: ProviderModels,
Order: 241,
},
"router.knn.k": {
Section: "router",
Label: "KNN: Neighbours (K)",
Description: "How many nearest corpus entries vote on a prompt. 0 picks the default (3). K=1 routes on the single nearest example; larger K tolerates a mislabelled exemplar but needs denser corpus coverage per label.",
Component: "number",
Min: f64(0),
Order: 242,
},
"router.knn.similarity_threshold": {
Section: "router",
Label: "KNN: Similarity Threshold",
Description: "Cosine-similarity floor a corpus entry must clear to vote. When no entry clears it the router uses the fallback model — a prompt unlike all labelled examples is treated as undecidable rather than guessed. 0 picks the default (0.80).",
Component: "slider",
Min: f64(0),
Max: f64(1),
Step: f64(0.01),
Order: 243,
},
"router.knn.vote_threshold": {
Section: "router",
Label: "KNN: Vote Threshold",
Description: "Similarity-weighted vote share a label needs to activate. 0 picks the default (0.5, a weighted majority). Lower values allow multi-label activations from minority neighbours; higher values demand near-unanimous neighbourhoods.",
Component: "slider",
Min: f64(0),
Max: f64(1),
Step: f64(0.05),
Order: 244,
},
"router.knn.store_name": {
Section: "router",
Label: "KNN: Store Name",
Description: "Optional override for the local-store collection holding the corpus vectors. Empty defaults to \"router-corpus-<router-model-name>\".",
Component: "input",
Order: 245,
},
}
}
48 changes: 48 additions & 0 deletions core/config/model_config.go
Original file line number Diff line number Diff line change
Expand Up @@ -337,7 +337,17 @@ type RouterConfig struct {
// embeddings to past decisions, so semantically-similar prompts
// reuse a classification instead of re-running the classifier
// model. Omit the block to disable. See router/embedding_cache.go.
// Ignored (with a warning) for the knn classifier — that IS a
// KNN lookup already; wrapping it in another would embed twice
// for no additional information.
EmbeddingCache *EmbeddingCacheConfig `yaml:"embedding_cache,omitempty" json:"embedding_cache,omitempty"`

// KNN configures the "knn" classifier: nearest-neighbour voting
// over a curated corpus of labelled example prompts. Required when
// classifier is "knn", ignored otherwise. The corpus is seeded and
// curated through the router corpus API (never through the UI);
// see router/knn.go for the decision semantics.
KNN *RouterKNNConfig `yaml:"knn,omitempty" json:"knn,omitempty"`
}

// EmbeddingCacheConfig configures the L2 embedding-similarity decision
Expand Down Expand Up @@ -371,6 +381,44 @@ type EmbeddingCacheConfig struct {
StoreName string `yaml:"store_name,omitempty" json:"store_name,omitempty"`
}

// RouterKNNConfig configures the knn classifier. It shares the
// embedding + local-store plumbing with EmbeddingCacheConfig but the
// two are deliberately separate blocks: the cache stores another
// classifier's decisions opportunistically, while the KNN corpus is
// explicit labelled ground truth — different lifecycle, different
// store namespace, different failure story.
//
// For the numeric fields (K, SimilarityThreshold, VoteThreshold) a zero
// value means "use the package default", so an omitted key and an
// explicit 0 are equivalent.
type RouterKNNConfig struct {
// EmbeddingModel names the loaded LocalAI model used to embed
// both corpus entries and incoming probes. Required. Changing it
// invalidates the stored vectors — the corpus loader re-embeds
// entries recorded under a different embedder fingerprint.
// Serialised as "embedding_model"; unlike the other fields it has
// no omitempty, so the key is always written.
EmbeddingModel string `yaml:"embedding_model" json:"embedding_model"`

// K is how many nearest corpus entries vote on a probe. 0 picks
// the package default (3). K=1 reproduces exact nearest-entry
// routing; larger K tolerates mislabelled exemplars at the cost
// of needing denser corpus coverage per label region.
K int `yaml:"k,omitempty" json:"k,omitempty"`

// SimilarityThreshold is the epistemic gate: corpus entries less
// similar than this to the probe cannot vote, and when none clear
// it the router uses the fallback model — a probe unlike all
// labelled experience is undecidable, not a guess. 0 picks the
// package default (0.80).
SimilarityThreshold float64 `yaml:"similarity_threshold,omitempty" json:"similarity_threshold,omitempty"`

// VoteThreshold is the similarity-weighted vote share a label
// needs to activate. 0 picks the package default (0.5, a weighted
// majority). Lower values let minority-label neighbours activate
// additional labels (multi-label routing); higher values demand
// near-unanimous neighbourhoods.
VoteThreshold float64 `yaml:"vote_threshold,omitempty" json:"vote_threshold,omitempty"`

// StoreName overrides the local-store collection holding the
// corpus vectors. Empty defaults to "router-corpus-<router>".
StoreName string `yaml:"store_name,omitempty" json:"store_name,omitempty"`
}

// RouterPolicy is one entry in the label vocabulary. The label string
// is what the classifier model emits and what candidates reference in
// their Labels field; the description is the natural-language hint
Expand Down
2 changes: 1 addition & 1 deletion core/http/endpoints/localai/api_instructions.go
Original file line number Diff line number Diff line change
Expand Up @@ -114,7 +114,7 @@ var instructionDefs = []instructionDef{
Name: "intelligent-routing",
Description: "Per-model `router:` configuration that classifies requests and rewrites the served model",
Tags: []string{"router"},
Intro: "Add a `router:` block to a ModelConfig to turn it into a routing model. The block declares a classifier (today: `feature` — handcrafted rules over prompt length and code-fence presence), a list of candidates (label + downstream model + optional rule), and a fallback. When a client addresses the routing model, the RouteModel middleware invokes the classifier, picks a candidate, and rewrites input.Model — the standard model-resolution path then runs ACL, disabled-state, and per-model PII against the chosen target. Depth-1 invariant: candidates must NOT themselves carry a `router:` block; runtime check returns 500 on violation. Decisions are logged to GET /api/router/decisions and surfaced in the /app/middleware Routing tab. POST /api/router/decide is the programmatic decision-oracle: external routers (e.g. an organisation-wide router service) send `{router, input}` and receive the classifier's label set + candidate model WITHOUT LocalAI rewriting, forwarding, or recording the call. Shares the classifier cache with the in-band path so warm-up costs are paid once.",
Intro: "Add a `router:` block to a ModelConfig to turn it into a routing model. The block declares a classifier (`score` — a small model ranks each policy label, Arch-Router-style; `colbert` — a reranker scores policy descriptions against the prompt; `knn` — similarity-weighted vote over a curated corpus of labelled example prompts), `policies` (the label vocabulary), `candidates` (downstream model + labels it serves; first candidate whose labels cover the active set wins, so order small → large), and a `fallback`. The knn classifier needs a `knn: { embedding_model }` block instead of a classifier_model, and reads a persisted corpus seeded via POST /api/router/{name}/corpus with `{entries: [{text, labels}]}` (admin-only; texts are embedded server-side, persisted under the state dir, and NEVER returned by any endpoint — GET /api/router/{name}/corpus/stats reports label counts only, DELETE /api/router/{name}/corpus wipes it). knn routes to the fallback whenever the prompt is less similar than knn.similarity_threshold to every corpus entry — out-of-corpus prompts are treated as undecidable rather than guessed. When a client addresses the routing model, the RouteModel middleware invokes the classifier, picks a candidate, and rewrites input.Model — the standard model-resolution path then runs ACL, disabled-state, and per-model PII against the chosen target. Depth-1 invariant: candidates must NOT themselves carry a `router:` block; runtime check returns 500 on violation. Decisions are logged to GET /api/router/decisions and surfaced in the /app/middleware Routing tab. POST /api/router/decide is the programmatic decision-oracle: external routers (e.g. an organisation-wide router service) send `{router, input}` and receive the classifier's label set + candidate model WITHOUT LocalAI rewriting, forwarding, or recording the call. Shares the classifier cache with the in-band path so warm-up costs are paid once.",
},
}

Expand Down
Loading
Loading