KdaiP · FuShang114 · Mar 22, 2026
diff --git a/.env b/.env
@@ -0,0 +1,61 @@
+# OpenAI-compatible LLM settings (文本对话 - DeepSeek)
+LLM_API_KEY=your_llm_api_key_here
+LLM_MODEL=deepseek-chat
+LLM_BASE_URL=https://api.deepseek.com
+LLM_TIMEOUT=60
+# Optional: extra fields merged into every request body (JSON object)
+# Example: LLM_EXTRA_BODY={"chat_template_kwargs": {"enable_thinking": false}}
+LLM_EXTRA_BODY=
+
+# Vision model settings - 图像描述专用 (Qwen-VL via DashScope)
+VISION_API_KEY=your_vision_api_key_here
+VISION_MODEL=qwen-vl-plus
+VISION_BASE_URL=https://dashscope.aliyuncs.com/compatible-mode/v1
+
+# Lightweight orchestration max_tokens budget
+# Used by the decision layer and short roleplay replies
+ECHOBOT_LIGHTWEIGHT_MAX_TOKENS=4096
+
+# Single-turn agent tool/skill loop step limit
+ECHOBOT_AGENT_MAX_STEPS=50
+
+# Whether to send a short "I'm checking" acknowledgement before delegating
+# to the full background agent
+ECHOBOT_DELEGATED_ACK_ENABLED=true
+
+# Runtime logging
+REME_LOG_LEVEL=WARNING
+AGENTSCOPE_LOG_LEVEL=WARNING
+
+# Web Live2D
+ECHOBOT_WEB_LIVE2D_MODEL=
+
+# Web TTS
+ECHOBOT_TTS_KOKORO_AUTO_DOWNLOAD=true
+ECHOBOT_TTS_KOKORO_MODEL_DIR=
+ECHOBOT_TTS_KOKORO_PROVIDER=cpu
+ECHOBOT_TTS_KOKORO_NUM_THREADS=2
+ECHOBOT_TTS_KOKORO_DEFAULT_VOICE=zf_001
+ECHOBOT_TTS_KOKORO_DOWNLOAD_TIMEOUT_SECONDS=600
+ECHOBOT_TTS_KOKORO_URL=
+ECHOBOT_TTS_KOKORO_LENGTH_SCALE=1.0
+ECHOBOT_TTS_KOKORO_LANG=
+
+# Web ASR / VAD
+ECHOBOT_ASR_AUTO_DOWNLOAD=true
+ECHOBOT_ASR_MODEL_DIR=
+ECHOBOT_ASR_PROVIDER=cpu
+ECHOBOT_ASR_NUM_THREADS=2
+ECHOBOT_ASR_LANGUAGE=auto
+ECHOBOT_ASR_USE_ITN=false
+ECHOBOT_ASR_SAMPLE_RATE=16000
+ECHOBOT_ASR_DOWNLOAD_TIMEOUT_SECONDS=600
+ECHOBOT_ASR_SENSEVOICE_URL=
+ECHOBOT_ASR_VAD_URL=
+
+# Video Call Plugin
+ECHOBOT_ENABLE_VIDEO_CALL=true
+ECHOBOT_VIDEO_FRAME_RATE=0.25
+ECHOBOT_VIDEO_MAX_FRAME_SIZE=1280x720
+ECHOBOT_VIDEO_FACE_CONFIDENCE_THRESHOLD=0.8
+ECHOBOT_VIDEO_FEATURE_MATCH_THRESHOLD=0.6
diff --git a/.gitignore b/.gitignore
@@ -26,7 +26,7 @@ htmlcov/
 .hypothesis/
 
 # Local environment files
-.env
+# .env  # Intentionally tracked: committed with placeholder values only - fill in your own keys locally, never commit real keys
 .env.*
 !.env.example
 
@@ -43,3 +43,12 @@ example_projects/
 
 .echobot
 logs/
+
+# Node
+node_modules/
+package-lock.json
+
+# Generated docs
+VIDEO_CALL_FINAL_GUIDE.md
+VIDEO_CALL_INTEGRATION_GUIDE.md
+VIDEO_CALL_PLUGIN_SUMMARY.md
diff --git a/VIDEO_CALL_README.md b/VIDEO_CALL_README.md
@@ -0,0 +1,153 @@
+# EchoBot 视频通话插件 - 架构与使用说明
+
+## 功能概述
+
+视频通话插件（`video_call`）为 EchoBot 提供实时视觉感知能力：
+
+- **实时摄像头画面处理**：通过 WebSocket 接收前端推送的视频帧
+- **图像语义描述**：调用视觉模型对每帧画面生成自然语言描述
+- **人脸识别与绑定**：用 InsightFace 提取 512 维特征向量，与已知人脸库做余弦相似度匹配
+- **视觉上下文自动注入**：每次对话时，coordinator 自动将当前视觉信息追加为临时系统消息，模型无需工具调用即可感知画面
+- **主动记忆陌生人**：检测到未识别人脸时，模型会主动询问姓名并通过工具绑定，重启后仍可识别
+
+---
+
+## 架构概览
+
+```
+前端摄像头
+    │  WebSocket /api/web/video/stream
+    ▼
+VisionProcessingService
+    ├── ImageDescriptionService   # 视觉模型：图像 → 自然语言描述
+    └── FaceRecognitionService    # InsightFace：检测 + 512 维特征提取 + 余弦匹配
+    │
+    ▼
+VisionContextProvider（内存缓存，最近 80 帧）
+    │
+    │  on_startup 注入
+    ▼
+ConversationCoordinator._vision_context_provider
+    │  每次对话自动调用 _build_vision_message()
+    ▼
+transient_system_messages → RoleplayEngine / AgentRunner
+    │
+    ▼
+大模型（感知视觉 + 主动询问陌生人）
+    │  用户回答名字 → decision engine 路由到 agent
+    ▼
+BindFaceToNameTool（BaseTool）
+    ├── FaceRecognitionService.add_known_face_from_frame()  # 内存
+    └── FaceFeatureInterceptor.add_or_update_feature()      # 持久化到 .echobot/face_features.json
+```
+
+### 工具注入方式（非侵入）
+
+插件工具通过 `create_app.py` 里包装 `tool_registry_factory` 注入，不修改框架代码。下面是简化示意；实际实现还会优先使用调用方传入的 `context_builder`，在 registry 为 `None` 时原样返回，并跳过已注册的同名工具：
+
+```python
+# create_app.py
+def _plugin_context_builder(opts):
+    ctx = build_runtime_context(opts, load_session_state=False)
+    original_factory = ctx.tool_registry_factory
+
+    def wrapped_factory(session_name, scheduled_context):
+        registry = original_factory(session_name, scheduled_context)
+        for tool in video_plugin.get_tool_instances():
+            registry.register(tool)
+        return registry
+
+    ctx.tool_registry_factory = wrapped_factory
+    return ctx
+```
+
+### 视觉上下文注入方式
+
+插件在 `on_startup` 时调用：
+
+```python
+coordinator.set_vision_context_provider(self.vision_provider)
+```
+
+Coordinator 在每次 `handle_user_turn_stream` 时自动调用 `_build_vision_message()`，将视觉信息作为 `transient_system_messages` 注入，**不写入会话历史**。
+
+---
+
+## 目录结构
+
+```
+echobot/plugins/video_call/
+├── __init__.py                  # VideoCallPlugin：插件入口，on_startup/on_shutdown
+├── models.py                    # 数据模型：Face, VisionContext
+├── vision_provider.py           # VisionContextProvider：帧缓存与合并
+├── interceptors/
+│   └── __init__.py              # FaceFeatureInterceptor：特征持久化（JSON）
+├── routers/
+│   └── __init__.py              # API 路由：WebSocket 视频流、人脸绑定接口
+├── services/
+│   ├── __init__.py              # VisionProcessingService：双链路并行处理
+│   ├── face_recognition.py      # FaceRecognitionService：InsightFace 封装
+│   └── image_description.py     # ImageDescriptionService：视觉描述
+└── tools/
+    ├── __init__.py              # VISION_TOOLS：旧格式工具定义（兼容用）
+    ├── face_tools.py            # BaseTool 实现：bind_face_to_name / list_known_faces / forget_face
+    └── handlers.py              # VisionToolHandler：工具调用处理器
+```
+
+---
+
+## 可用工具（Agent 路径）
+
+| 工具名 | 触发场景 | 功能 |
+|--------|----------|------|
+| `bind_face_to_name` | "我叫XXX" / "记住我" / "这是XXX" | 从当前摄像头帧提取人脸特征并绑定姓名 |
+| `list_known_faces` | "你认识哪些人" | 列出所有已绑定的人脸姓名 |
+| `forget_face` | "忘掉XXX" | 删除某人的人脸绑定记录 |
+
+---
+
+## API 接口
+
+| 方法 | 路径 | 说明 |
+|------|------|------|
+| WS | `/api/web/video/stream` | 视频帧推送（JPEG bytes） |
+| GET | `/api/web/video/context` | 获取当前视觉上下文列表 |
+| GET | `/api/web/video/snapshot` | 获取最新一帧快照 |
+| POST | `/api/web/video/face-bind` | 通过特征向量绑定人脸 |
+| POST | `/api/web/video/face-bind-frame` | 通过图像帧绑定人脸 |
+| GET | `/api/web/video/face-list` | 查询已绑定人脸列表 |
+
+---
+
+## 启用方式
+
+在 `.env` 中设置（仅当取值为 `true` 时启用，不区分大小写；未设置时默认关闭）：
+
+```env
+ECHOBOT_ENABLE_VIDEO_CALL=true
+```
+
+前端摄像头页面：`http://localhost:8000/web/camera`
+
+---
+
+## 人脸数据持久化
+
+绑定的人脸特征向量（512 维，InsightFace buffalo_sc）保存在：
+
+```
+.echobot/face_features.json
+```
+
+重启后自动加载，无需重新绑定。
+
+---
+
+## 依赖
+
+```
+insightface
+onnxruntime
+Pillow
+numpy
+```
diff --git a/echobot/app/create_app.py b/echobot/app/create_app.py
@@ -1,12 +1,23 @@
 from __future__ import annotations
 
+import os
 from contextlib import asynccontextmanager
 from pathlib import Path
 
 from fastapi import FastAPI
 from fastapi.responses import FileResponse
 from fastapi.staticfiles import StaticFiles
+from loguru import logger
 
+# Load settings from a .env file when python-dotenv is available
+try:
+    from dotenv import load_dotenv
+    load_dotenv()
+except ImportError:
+    pass  # python-dotenv is not installed: rely on the existing process environment
+
+from ..plugins import PluginRegistry
+from ..plugins.video_call import VideoCallPlugin
 from ..runtime.bootstrap import RuntimeOptions
 from .routers import chat, channels, cron, health, heartbeat, roles, sessions, web
 from .runtime import ASRServiceBuilder, AppRuntime, RuntimeContextBuilder, TTSServiceBuilder
@@ -24,10 +35,47 @@ def create_app(
     asr_service_builder: ASRServiceBuilder | None = None,
 ) -> FastAPI:
     options = runtime_options or RuntimeOptions()
+
+    # Set up the plugin system before the runtime is built so plugin tools can be injected
+    plugin_registry = PluginRegistry()
+    video_plugin = VideoCallPlugin()
+    plugin_registry.register(video_plugin)
+    video_enabled = os.getenv("ECHOBOT_ENABLE_VIDEO_CALL", "false").lower() == "true"  # only "true" (any case) enables; unset means disabled
+    if video_enabled:
+        plugin_registry.enable("video_call")
+        logger.info("VideoCall plugin enabled")
+
+    # Wrap the context builder so plugin tools are injected into the tool registries
+    def _plugin_context_builder(opts: RuntimeOptions):
+        """Build the runtime context and, if video call is enabled, add plugin tools.
+
+        Delegates to the caller-supplied ``context_builder`` when one was passed
+        to ``create_app``; otherwise calls ``build_runtime_context`` with
+        ``load_session_state=False``. When the video-call plugin is enabled, the
+        context's ``tool_registry_factory`` is replaced by a wrapper that
+        registers the plugin's tools on every registry the original factory
+        returns.
+
+        Returns the (possibly wrapped) runtime context.
+        """
+        from ..runtime.bootstrap import build_runtime_context
+        # Call the original builder (caller-supplied or the default)
+        if context_builder is not None:
+            ctx = context_builder(opts)
+        else:
+            ctx = build_runtime_context(opts, load_session_state=False)
+
+        if not video_enabled:
+            return ctx
+
+        # Wrap tool_registry_factory so the plugin tools are appended
+        original_factory = ctx.tool_registry_factory
+
+        def wrapped_factory(session_name: str, scheduled_context: bool):
+            """Create the registry via the original factory, then add plugin tools."""
+            registry = original_factory(session_name, scheduled_context)
+            if registry is None:
+                # The original factory produced no registry: nothing to extend
+                return registry
+            for tool in video_plugin.get_tool_instances():
+                try:
+                    registry.register(tool)
+                except ValueError as exc:
+                    # Best-effort: presumably raised when a tool with this name is
+                    # already registered (TODO confirm). Skip it, but leave a trace
+                    # instead of hiding the conflict entirely.
+                    logger.debug("Skipped registering video_call plugin tool: {}", exc)
+            return registry
+
+        ctx.tool_registry_factory = wrapped_factory
+        return ctx
+
     runtime = AppRuntime(
         runtime_options=options,
         channel_config_path=channel_config_path,
-        context_builder=context_builder,
+        context_builder=_plugin_context_builder,
         tts_service_builder=tts_service_builder,
         asr_service_builder=asr_service_builder,
     )
@@ -36,9 +84,19 @@ def create_app(
     async def lifespan(app: FastAPI):
+        """Start the runtime and the enabled plugins, and always tear them down.
+
+        Plugin startup runs inside the ``try`` so that a plugin failing in
+        ``on_startup`` still leads to ``runtime.stop()``. On exit, only plugins
+        whose startup completed are shut down, in reverse start order, and a
+        failure in one ``on_shutdown`` is logged rather than allowed to skip
+        the remaining plugins or the runtime shutdown.
+        """
         await runtime.start()
         app.state.runtime = runtime
+        app.state.plugin_registry = plugin_registry
+
+        # Plugins whose on_startup completed; only these are shut down later
+        started_plugins = []
         try:
+            # Start every enabled plugin
+            for plugin in plugin_registry.get_enabled_plugins():
+                await plugin.on_startup(app, runtime)
+                started_plugins.append(plugin)
+
             yield
         finally:
+            # Shut plugins down in reverse start order; keep going on failure
+            for plugin in reversed(started_plugins):
+                try:
+                    await plugin.on_shutdown()
+                except Exception:
+                    logger.exception("Plugin shutdown failed: {}", plugin.name)
+
             await runtime.stop()
 
     app = FastAPI(
@@ -58,6 +116,10 @@ async def root() -> dict[str, str]:
     async def web_console() -> FileResponse:
         return FileResponse(WEB_ASSETS_DIR / "index.html")
 
+    @app.get("/web/camera", include_in_schema=False)
+    async def web_camera() -> FileResponse:  # serves camera.html from the web assets directory
+        return FileResponse(WEB_ASSETS_DIR / "camera.html")  # NOTE(review): route is registered even when the video-call plugin is disabled - confirm intended
+
     @app.get("/favicon.ico", include_in_schema=False)
     async def favicon() -> FileResponse:
         return FileResponse(
@@ -79,4 +141,11 @@ async def favicon() -> FileResponse:
     app.include_router(roles.router, prefix="/api")
     app.include_router(channels.router, prefix="/api")
     app.include_router(web.router, prefix="/api")
+
+    # Mount the routers of every enabled plugin under the /api prefix
+    for plugin in plugin_registry.get_enabled_plugins():
+        for router in plugin.get_routers():
+            app.include_router(router, prefix="/api")
+            logger.info(f"Loaded router from plugin: {plugin.name}")
+
     return app
diff --git a/echobot/app/schemas.py b/echobot/app/schemas.py
@@ -71,6 +71,7 @@ class ChatRequest(BaseModel):
     temperature: float | None = None
     max_tokens: int | None = None
     images: list["ChatImageInput"] = Field(default_factory=list)
+    vision_context: list[dict] = Field(default_factory=list)
 
 
 class ChatImageInput(BaseModel):

diff --git a/echobot/app/services/chat.py b/echobot/app/services/chat.py
@@ -36,13 +36,15 @@ async def run_prompt(
         image_urls: list[str] | None = None,
         role_name: str | None = None,
         route_mode: RouteMode | None = None,
+        transient_system_messages: list[str] | None = None,
     ) -> OrchestratedTurnResult:
         result = await self._coordinator.handle_user_turn(
             session_name,
             prompt,
             image_urls=image_urls,
             role_name=role_name,
             route_mode=route_mode,
+            transient_system_messages=transient_system_messages,
         )
         await self._session_service.set_current_session(result.session.name)
         return result
@@ -56,6 +58,7 @@ async def run_prompt_stream(
         role_name: str | None = None,
         route_mode: RouteMode | None = None,
         on_chunk: StreamCallback | None = None,
+        transient_system_messages: list[str] | None = None,
     ) -> OrchestratedTurnResult:
         result = await self._coordinator.handle_user_turn_stream(
             session_name,
@@ -64,6 +67,7 @@ async def run_prompt_stream(
             role_name=role_name,
             route_mode=route_mode,
             on_chunk=on_chunk,
+            transient_system_messages=transient_system_messages,
         )
         await self._session_service.set_current_session(result.session.name)
         return result