Abstract

The Agent Collaboration Protocol (ACP) is the core mechanism by which OpenClaw automates complex tasks through multi-agent collaboration. This article analyzes ACP's architecture, task assignment mechanism, state synchronization protocol, and conflict resolution strategies in detail, and provides complete practical examples along with performance optimization techniques.

Keywords: OpenClaw, multi-agent collaboration, ACP, task assignment, state synchronization


1. Introduction

1.1 Background

A single AI agent has limited capabilities; complex tasks require multiple agents working together. For example:

  • Data analysis: a data collection agent, an analysis agent, and a visualization agent cooperate
  • Customer service: a reception agent, a technical support agent, and a complaint handling agent cooperate
  • Software development: a requirements analysis agent, a coding agent, and a testing agent cooperate

1.2 Design Goals

The ACP protocol is designed to achieve:

  1. Efficient collaboration: agents cooperate seamlessly and avoid duplicated work
  2. State consistency: all agents share a unified view of task state
  3. Fault tolerance: the failure of a single agent does not derail the overall task
  4. Extensibility: agents can be added or removed dynamically
  5. Observability: task execution can be traced and debugged

2. Architecture

2.1 Overall Architecture

┌─────────────────────────────────────────────────────┐
│                     Coordinator                     │
│  ┌─────────────────────────────────────────────┐    │
│  │  - Task decomposition                       │    │
│  │  - Agent selection                          │    │
│  │  - Task assignment                          │    │
│  │  - Progress tracking                        │    │
│  └─────────────────────────────────────────────┘    │
└─────────────────┬───────────────────────────────────┘
                  │
        ┌─────────┼─────────┐
        │         │         │
┌───────▼───┐ ┌──▼────────┐ ┌▼──────────┐
│  Agent 1  │ │  Agent 2  │ │  Agent 3  │
│ (Collect) │ │ (Analyze) │ │(Visualize)│
└─────┬─────┘ └────┬──────┘ └─────┬─────┘
      │            │              │
      └────────────┼──────────────┘
                   │
┌──────────────────┼──────────────────────────┐
│              Message Bus                    │
│  ┌──────────────┴──────────────┐            │
│  │  - Publish/Subscribe        │            │
│  │  - Message routing          │            │
│  │  - Message persistence      │            │
│  └─────────────────────────────┘            │
└─────────────────────────────────────────────┘

2.2 Core Components

import json
import time
import uuid
from dataclasses import dataclass
from enum import Enum
from typing import Any, Callable, Dict, List, Optional

class AgentRole(Enum):
    """Agent roles"""
    COORDINATOR = "coordinator"  # Coordinator
    WORKER = "worker"           # Worker agent
    OBSERVER = "observer"       # Observer
    SUPERVISOR = "supervisor"   # Supervisor

class TaskStatus(Enum):
    """Task status"""
    PENDING = "pending"         # Waiting to run
    RUNNING = "running"         # In progress
    COMPLETED = "completed"     # Finished
    FAILED = "failed"           # Failed
    CANCELLED = "cancelled"     # Cancelled

@dataclass
class Task:
    """Task definition"""
    id: str
    name: str
    description: str
    role: AgentRole
    status: TaskStatus
    input_data: Dict[str, Any]
    output_data: Optional[Dict[str, Any]]
    assigned_to: Optional[str]  # Agent ID
    dependencies: List[str]     # IDs of tasks this task depends on
    created_at: float
    started_at: Optional[float]
    completed_at: Optional[float]

@dataclass
class AgentState:
    """Agent state"""
    id: str
    name: str
    role: AgentRole
    capabilities: List[str]
    current_task: Optional[str]
    status: str  # idle, busy, offline
    last_heartbeat: float

class MessageBus:
    """Message bus"""
    
    def __init__(self):
        self.subscribers: Dict[str, List[Callable]] = {}
        self.message_store: Dict[str, List[dict]] = {}
    
    def publish(self, topic: str, message: dict):
        """Publish a message"""
        # Persist the message
        if topic not in self.message_store:
            self.message_store[topic] = []
        self.message_store[topic].append(message)
        
        # Notify subscribers
        if topic in self.subscribers:
            for callback in self.subscribers[topic]:
                callback(message)
    
    def subscribe(self, topic: str, callback: Callable):
        """Subscribe to a topic"""
        if topic not in self.subscribers:
            self.subscribers[topic] = []
        self.subscribers[topic].append(callback)
    
    def get_messages(self, topic: str, since: float = 0) -> List[dict]:
        """Fetch message history (messages are expected to carry a 'timestamp' field)"""
        messages = self.message_store.get(topic, [])
        return [m for m in messages if m['timestamp'] > since]
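As a quick sanity check of the publish/subscribe flow, the condensed stand-in below mirrors the MessageBus above (same publish/subscribe/get_messages semantics); `MiniBus` is renamed only to keep the example self-contained:

```python
# Condensed stand-in for the MessageBus above (same semantics, fewer lines).
class MiniBus:
    def __init__(self):
        self.subscribers = {}
        self.message_store = {}

    def publish(self, topic, message):
        self.message_store.setdefault(topic, []).append(message)
        for callback in self.subscribers.get(topic, []):
            callback(message)

    def subscribe(self, topic, callback):
        self.subscribers.setdefault(topic, []).append(callback)

    def get_messages(self, topic, since=0):
        return [m for m in self.message_store.get(topic, []) if m['timestamp'] > since]

bus = MiniBus()
received = []
bus.subscribe('tasks.created', received.append)

bus.publish('tasks.created', {'task_ids': ['t1'], 'timestamp': 100.0})
bus.publish('tasks.created', {'task_ids': ['t2'], 'timestamp': 101.0})

assert len(received) == 2                                        # both delivered to the live subscriber
assert len(bus.get_messages('tasks.created', since=100.0)) == 1  # replay is strictly after 'since'
```

Note that `get_messages` filters with a strict `>`, so a consumer that remembers the timestamp of its last message can resume without duplicates.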

3. Task Assignment

3.1 Task Decomposition

class TaskDecomposer:
    """Task decomposer"""
    
    def __init__(self, llm_client):
        self.llm = llm_client
    
    def decompose(self, task_description: str) -> List[Task]:
        """
        Break a complex task into subtasks.
        
        Args:
            task_description: description of the task
        
        Returns:
            List[Task]: list of subtasks
        """
        prompt = f"""Decompose the following task into executable subtasks:

Task: {task_description}

Requirements:
1. Each subtask must be independent and executable
2. Make the dependencies between subtasks explicit
3. Specify the agent role required for each subtask
4. Define the input and output of each subtask

Output JSON in the following format:
{{
    "subtasks": [
        {{
            "name": "subtask 1 name",
            "description": "detailed description",
            "role": "required agent role",
            "dependencies": [],  # names of subtasks this one depends on
            "input": {{}},       # input data
            "output": {{}}       # expected output
        }},
        ...
    ]
}}"""
        
        response = self.llm.generate(prompt)
        tasks_data = json.loads(response)
        
        # Build Task objects
        tasks = []
        for subtask in tasks_data['subtasks']:
            task = Task(
                id=str(uuid.uuid4()),
                name=subtask['name'],
                description=subtask['description'],
                role=AgentRole(subtask['role']),
                status=TaskStatus.PENDING,
                input_data=subtask.get('input', {}),
                output_data=None,
                assigned_to=None,
                dependencies=[],  # filled in below
                created_at=time.time(),
                started_at=None,
                completed_at=None
            )
            tasks.append(task)
        
        # Wire up dependencies
        self._build_dependencies(tasks, tasks_data['subtasks'])
        
        return tasks
    
    def _build_dependencies(self, tasks: List[Task], subtasks_data: List[dict]):
        """Resolve dependency names to task IDs"""
        name_to_task = {t.name: t for t in tasks}
        
        for task, subtask in zip(tasks, subtasks_data):
            for dep_name in subtask.get('dependencies', []):
                if dep_name in name_to_task:
                    task.dependencies.append(name_to_task[dep_name].id)
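To see the decomposition round-trip without a live model, a stubbed LLM client can return canned JSON; the dependency-resolution step then maps subtask names to generated IDs, exactly as `_build_dependencies` does. `StubLLM` is a hypothetical stand-in (not part of OpenClaw), and plain dicts stand in for the Task dataclass to keep the sketch short:

```python
import json
import uuid

# Hypothetical stub standing in for the real LLM client.
class StubLLM:
    def generate(self, prompt: str) -> str:
        return json.dumps({"subtasks": [
            {"name": "collect", "role": "worker", "dependencies": []},
            {"name": "analyze", "role": "worker", "dependencies": ["collect"]},
        ]})

def decompose(llm) -> list:
    data = json.loads(llm.generate("analyze sales data"))
    tasks = [{"id": str(uuid.uuid4()), "name": s["name"], "deps": []}
             for s in data["subtasks"]]
    by_name = {t["name"]: t for t in tasks}
    # Resolve dependency names to the generated task IDs
    for task, spec in zip(tasks, data["subtasks"]):
        task["deps"] = [by_name[d]["id"] for d in spec["dependencies"] if d in by_name]
    return tasks

tasks = decompose(StubLLM())
assert tasks[1]["deps"] == [tasks[0]["id"]]  # "analyze" depends on "collect"
```

The same pattern works for testing the real TaskDecomposer: inject any object with a `generate(prompt) -> str` method.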

3.2 Agent Selection

class AgentSelector:
    """Agent selector"""
    
    def __init__(self, agent_registry: Dict[str, AgentState]):
        self.registry = agent_registry
    
    def select(self, task: Task) -> Optional[str]:
        """
        Pick a suitable agent for a task.
        
        Args:
            task: the task
        
        Returns:
            str: ID of the chosen agent, or None if no agent fits
        """
        candidates = []
        
        for agent_id, agent in self.registry.items():
            # Agent must be idle
            if agent.status != 'idle':
                continue
            
            # Role must match (worker tasks may go to any role)
            if agent.role != task.role and task.role != AgentRole.WORKER:
                continue
            
            # Agent must have the required capabilities
            if not self._has_required_capabilities(agent, task):
                continue
            
            # Score the match
            score = self._calculate_match_score(agent, task)
            candidates.append((agent_id, score))
        
        if not candidates:
            return None
        
        # Pick the highest-scoring candidate
        candidates.sort(key=lambda x: x[1], reverse=True)
        return candidates[0][0]
    
    def _has_required_capabilities(self, agent: AgentState, task: Task) -> bool:
        """Check whether the agent has the required capabilities"""
        required_caps = self._get_required_capabilities(task)
        return all(cap in agent.capabilities for cap in required_caps)
    
    def _get_required_capabilities(self, task: Task) -> List[str]:
        """Capabilities required by a task"""
        # Map task types to required capabilities
        capability_map = {
            'data collection': ['web_scraping', 'api_integration'],
            'data analysis': ['statistics', 'data_processing'],
            'visualization': ['chart_generation', 'report_writing'],
            # ... more mappings
        }
        return capability_map.get(task.name, [])
    
    def _calculate_match_score(self, agent: AgentState, task: Task) -> float:
        """Compute a match score"""
        score = 0.0
        
        # Role match
        if agent.role == task.role:
            score += 50
        
        # Capability match
        required_caps = self._get_required_capabilities(task)
        matched_caps = sum(1 for cap in required_caps if cap in agent.capabilities)
        score += (matched_caps / len(required_caps)) * 30 if required_caps else 0
        
        # Load balancing: prefer agents that have been idle longer
        idle_time = time.time() - agent.last_heartbeat
        score += min(idle_time / 60, 20)  # capped at 20 points
        
        return score
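The scoring scheme above (50 points for a role match, up to 30 for capability overlap, up to 20 for idle time) can be checked in isolation; the function below is a minimal restatement of `_calculate_match_score` over plain dicts, not the full selector:

```python
def match_score(agent: dict, required_caps: list, role_matches: bool,
                idle_seconds: float) -> float:
    score = 50.0 if role_matches else 0.0                 # role match: 50 points
    if required_caps:
        overlap = sum(1 for c in required_caps if c in agent['capabilities'])
        score += (overlap / len(required_caps)) * 30      # capability overlap: up to 30
    score += min(idle_seconds / 60, 20)                   # idle-time bonus: capped at 20
    return score

analyst = {'capabilities': ['statistics', 'data_processing']}
scraper = {'capabilities': ['web_scraping']}
caps = ['statistics', 'data_processing']

s_analyst = match_score(analyst, caps, role_matches=True, idle_seconds=120)  # 50 + 30 + 2 = 82
s_scraper = match_score(scraper, caps, role_matches=True, idle_seconds=120)  # 50 + 0 + 2 = 52
assert s_analyst > s_scraper
```

The capped idle bonus means load balancing can break ties between equally capable agents but never outweighs a capability mismatch.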

3.3 Task Scheduling

class TaskScheduler:
    """Task scheduler"""
    
    def __init__(self, task_decomposer: TaskDecomposer, 
                 agent_selector: AgentSelector,
                 message_bus: MessageBus):
        self.decomposer = task_decomposer
        self.selector = agent_selector
        self.bus = message_bus
        self.tasks: Dict[str, Task] = {}
        self.task_queue: List[str] = []  # IDs of tasks waiting to run
    
    def submit(self, task_description: str) -> Optional[str]:
        """
        Submit a task.
        
        Args:
            task_description: description of the task
        
        Returns:
            str: ID of the first subtask, or None if decomposition produced nothing
        """
        # Decompose the task
        subtasks = self.decomposer.decompose(task_description)
        
        # Register the subtasks
        for task in subtasks:
            self.tasks[task.id] = task
        
        # Publish a task-created event
        self.bus.publish('tasks.created', {
            'task_ids': [t.id for t in subtasks],
            'timestamp': time.time()
        })
        
        # Start scheduling
        self._schedule()
        
        return subtasks[0].id if subtasks else None
    
    def _schedule(self):
        """Schedule tasks"""
        # Find tasks whose dependencies are satisfied
        ready_tasks = self._find_ready_tasks()
        
        for task_id in ready_tasks:
            task = self.tasks[task_id]
            
            # Select an agent
            agent_id = self.selector.select(task)
            
            if agent_id:
                # Assign the task
                task.assigned_to = agent_id
                task.status = TaskStatus.RUNNING
                task.started_at = time.time()
                
                # Send the task to the agent
                self.bus.publish(f'agent.{agent_id}.task', {
                    'task_id': task.id,
                    'task_data': self._serialize_task(task)
                })
                
                # Remove it from the queue
                if task_id in self.task_queue:
                    self.task_queue.remove(task_id)
    
    def _find_ready_tasks(self) -> List[str]:
        """Find tasks that are ready to run"""
        ready = []
        
        for task_id, task in self.tasks.items():
            if task.status != TaskStatus.PENDING:
                continue
            
            # All dependencies must be registered and completed
            deps_satisfied = all(
                dep_id in self.tasks
                and self.tasks[dep_id].status == TaskStatus.COMPLETED
                for dep_id in task.dependencies
            )
            
            if deps_satisfied:
                ready.append(task_id)
        
        return ready
    
    def _serialize_task(self, task: Task) -> dict:
        """Serialize a task for transport"""
        return {
            'id': task.id,
            'name': task.name,
            'description': task.description,
            'input_data': task.input_data,
            'role': task.role.value
        }
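The dependency gate in `_find_ready_tasks` reduces to a small predicate over task statuses. A toy task table illustrates all three cases (ready, blocked on a pending dependency, blocked on an unknown dependency); plain dicts stand in for the Task dataclass:

```python
COMPLETED, PENDING = "completed", "pending"

tasks = {
    "t1": {"status": COMPLETED, "deps": []},
    "t2": {"status": PENDING,   "deps": ["t1"]},       # ready: t1 is done
    "t3": {"status": PENDING,   "deps": ["t2"]},       # blocked: t2 still pending
    "t4": {"status": PENDING,   "deps": ["missing"]},  # blocked: unknown dependency
}

def find_ready(tasks: dict) -> list:
    return [
        tid for tid, t in tasks.items()
        if t["status"] == PENDING
        and all(d in tasks and tasks[d]["status"] == COMPLETED for d in t["deps"])
    ]

assert find_ready(tasks) == ["t2"]
```

Treating an unregistered dependency as unsatisfied (the `d in tasks` check) keeps a task from running just because its dependency record was lost.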

4. State Synchronization Protocol

4.1 State Synchronization

class StateSynchronizer:
    """State synchronizer"""
    
    def __init__(self, message_bus: MessageBus):
        self.bus = message_bus
        self.state_store: Dict[str, Any] = {}
        self.version_vector: Dict[str, int] = {}
        
        # Subscribe to state updates
        self.bus.subscribe('state.update', self._handle_state_update)
    
    def update_state(self, key: str, value: Any, agent_id: str):
        """
        Update a piece of state.
        
        Args:
            key: state key
            value: state value
            agent_id: ID of the updating agent
        """
        # Bump the version
        if key not in self.version_vector:
            self.version_vector[key] = 0
        self.version_vector[key] += 1
        
        # Build the update message
        update = {
            'key': key,
            'value': value,
            'version': self.version_vector[key],
            'agent_id': agent_id,
            'timestamp': time.time()
        }
        
        # Store locally
        self.state_store[key] = value
        
        # Broadcast
        self.bus.publish('state.update', update)
    
    def _handle_state_update(self, update: dict):
        """Handle an incoming state update"""
        key = update['key']
        version = update['version']
        
        # Only accept strictly newer versions
        current_version = self.version_vector.get(key, 0)
        if version > current_version:
            self.version_vector[key] = version
            self.state_store[key] = update['value']
    
    def get_state(self, key: str) -> Any:
        """Read one state value"""
        return self.state_store.get(key)
    
    def get_all_states(self) -> Dict[str, Any]:
        """Read all state values"""
        return self.state_store.copy()
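The "accept only strictly newer versions" rule is the entire consistency story here: stale or duplicated broadcasts are silently dropped. A minimal replica applying the same gate as `_handle_state_update`:

```python
# Minimal replica applying the same version gate as _handle_state_update.
store, versions = {}, {}

def apply_update(update: dict):
    key = update['key']
    if update['version'] > versions.get(key, 0):  # strictly newer wins
        versions[key] = update['version']
        store[key] = update['value']

apply_update({'key': 'progress', 'value': 0.5, 'version': 2})
apply_update({'key': 'progress', 'value': 0.1, 'version': 1})  # stale, ignored
apply_update({'key': 'progress', 'value': 0.9, 'version': 3})

assert store['progress'] == 0.9
assert versions['progress'] == 3
```

Because the gate is idempotent, redelivering the same update (same version) is harmless, which matters on an at-least-once message bus.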

4.2 Heartbeat Mechanism

class HeartbeatMonitor:
    """Heartbeat monitor"""
    
    def __init__(self, message_bus: MessageBus, timeout: int = 30):
        self.bus = message_bus
        self.timeout = timeout  # timeout in seconds
        self.agent_heartbeats: Dict[str, float] = {}
        
        # Subscribe to heartbeats
        self.bus.subscribe('heartbeat', self._handle_heartbeat)
        
        # Start the monitoring thread
        self._start_monitoring()
    
    def send_heartbeat(self, agent_id: str, metadata: dict = None):
        """Send a heartbeat"""
        message = {
            'agent_id': agent_id,
            'timestamp': time.time(),
            'metadata': metadata or {}
        }
        self.bus.publish('heartbeat', message)
    
    def _handle_heartbeat(self, message: dict):
        """Record a heartbeat"""
        agent_id = message['agent_id']
        self.agent_heartbeats[agent_id] = message['timestamp']
    
    def check_alive(self, agent_id: str) -> bool:
        """Check whether an agent is alive"""
        last_heartbeat = self.agent_heartbeats.get(agent_id, 0)
        return (time.time() - last_heartbeat) < self.timeout
    
    def get_alive_agents(self) -> List[str]:
        """List all live agents"""
        now = time.time()
        return [
            agent_id for agent_id, last_time in self.agent_heartbeats.items()
            if (now - last_time) < self.timeout
        ]
    
    def _start_monitoring(self):
        """Start the monitoring thread"""
        import threading
        
        def monitor_loop():
            while True:
                time.sleep(self.timeout / 2)
                
                # Collect timed-out agents (snapshot the dict: heartbeats
                # arrive on other threads)
                dead_agents = []
                for agent_id, last_time in list(self.agent_heartbeats.items()):
                    if (time.time() - last_time) > self.timeout:
                        dead_agents.append(agent_id)
                
                # Publish offline events
                for agent_id in dead_agents:
                    self.bus.publish('agent.offline', {
                        'agent_id': agent_id,
                        'timestamp': time.time()
                    })
                    del self.agent_heartbeats[agent_id]
        
        thread = threading.Thread(target=monitor_loop, daemon=True)
        thread.start()
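The liveness rule is simply "last heartbeat within `timeout` seconds". Injecting the clock as a parameter makes the rule testable without sleeping or threads; this restates `get_alive_agents` in isolation:

```python
def alive_agents(heartbeats: dict, now: float, timeout: float = 30.0) -> list:
    """Agents whose last heartbeat arrived within `timeout` seconds of `now`."""
    return [agent for agent, last in heartbeats.items() if (now - last) < timeout]

heartbeats = {'collector': 100.0, 'analyst': 95.0, 'visualizer': 60.0}

# At t=110, collector (10s) and analyst (15s) are fresh; visualizer (50s) has timed out.
assert alive_agents(heartbeats, now=110.0) == ['collector', 'analyst']
```

The monitoring thread above wakes every `timeout / 2` seconds, so an agent is declared offline at most 1.5 timeouts after its last heartbeat.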

5. Conflict Resolution Strategies

5.1 Resource Conflicts

class ResourceLock:
    """Resource lock"""
    
    def __init__(self, message_bus: MessageBus):
        self.bus = message_bus
        self.locks: Dict[str, str] = {}  # resource -> agent_id
        self.lock_requests: Dict[str, List[str]] = {}  # resource -> [agent_id]
    
    def acquire(self, resource: str, agent_id: str, timeout: int = 10) -> bool:
        """
        Acquire a lock.
        
        Args:
            resource: resource name
            agent_id: ID of the requesting agent
            timeout: timeout in seconds
        
        Returns:
            bool: whether the lock was acquired
        """
        # Is the lock free?
        if resource not in self.locks:
            # Unowned: take it immediately
            self.locks[resource] = agent_id
            self._broadcast_lock_status(resource)
            return True
        
        # Owned: join the wait queue
        if resource not in self.lock_requests:
            self.lock_requests[resource] = []
        self.lock_requests[resource].append(agent_id)
        
        # Wait for the lock to be released
        start_time = time.time()
        while time.time() - start_time < timeout:
            if resource not in self.locks:
                # Released: try to take it
                self.locks[resource] = agent_id
                if agent_id in self.lock_requests.get(resource, []):
                    self.lock_requests[resource].remove(agent_id)
                self._broadcast_lock_status(resource)
                return True
            time.sleep(0.1)
        
        # Timed out
        if agent_id in self.lock_requests.get(resource, []):
            self.lock_requests[resource].remove(agent_id)
        
        return False
    
    def release(self, resource: str, agent_id: str):
        """Release a lock"""
        if self.locks.get(resource) == agent_id:
            del self.locks[resource]
            self._broadcast_lock_status(resource)
            
            # Notify the next waiter, if any
            if resource in self.lock_requests and self.lock_requests[resource]:
                next_agent = self.lock_requests[resource][0]
                self.bus.publish(f'lock.available.{resource}', {
                    'agent_id': next_agent,
                    'timestamp': time.time()
                })
    
    def _broadcast_lock_status(self, resource: str):
        """Broadcast the lock's status"""
        self.bus.publish('lock.status', {
            'resource': resource,
            'holder': self.locks.get(resource),
            'waiters': self.lock_requests.get(resource, []),
            'timestamp': time.time()
        })
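Stripped of broadcasting and the polling wait, the ownership rules above reduce to a few dictionary operations. A compact hypothetical sketch with try-lock semantics (no message bus, no queue):

```python
class MiniLock:
    """Try-lock semantics of ResourceLock, without the bus or the polling wait."""
    def __init__(self):
        self.locks = {}  # resource -> holder agent_id

    def try_acquire(self, resource: str, agent_id: str) -> bool:
        if resource not in self.locks:
            self.locks[resource] = agent_id
            return True
        return False  # already held; a real caller would queue and wait

    def release(self, resource: str, agent_id: str):
        # Only the current holder may release
        if self.locks.get(resource) == agent_id:
            del self.locks[resource]

lock = MiniLock()
assert lock.try_acquire('report.md', 'writer') is True
assert lock.try_acquire('report.md', 'analyst') is False  # contended
lock.release('report.md', 'analyst')                      # no-op: not the holder
assert lock.try_acquire('report.md', 'analyst') is False
lock.release('report.md', 'writer')
assert lock.try_acquire('report.md', 'analyst') is True
```

The holder check in `release` is the important invariant: an agent can never steal or accidentally drop another agent's lock.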

5.2 Data Consistency

class ConflictFreeReplicatedDataType:
    """Conflict-free replicated data type (CRDT): a last-writer-wins map with vector clocks"""
    
    def __init__(self, agent_id: str):
        self.agent_id = agent_id
        self.vector_clock: Dict[str, int] = {}
        self.data: Dict[str, Any] = {}
    
    def update(self, key: str, value: Any):
        """Update a value locally"""
        # Advance the local clock
        if self.agent_id not in self.vector_clock:
            self.vector_clock[self.agent_id] = 0
        self.vector_clock[self.agent_id] += 1
        
        # Record the value with its causal context
        self.data[key] = {
            'value': value,
            'timestamp': time.time(),
            'vector_clock': self.vector_clock.copy()
        }
    
    def merge(self, remote_data: dict, remote_clock: dict) -> bool:
        """
        Merge remote data into the local replica.
        
        Args:
            remote_data: remote data
            remote_clock: remote vector clock
        
        Returns:
            bool: whether any remote data was accepted
        """
        # Compare vector clocks
        comparison = self._compare_clocks(self.vector_clock, remote_clock)
        
        if comparison == 'before':
            # Remote is strictly newer: accept it
            self.data.update(remote_data)
            self.vector_clock = self._merge_clocks(self.vector_clock, remote_clock)
            return True
        
        elif comparison == 'concurrent':
            # Concurrent updates: last writer wins (LWW)
            for key, remote_entry in remote_data.items():
                local_entry = self.data.get(key)
                
                if not local_entry or remote_entry['timestamp'] > local_entry['timestamp']:
                    self.data[key] = remote_entry
            
            self.vector_clock = self._merge_clocks(self.vector_clock, remote_clock)
            return True
        
        # comparison == 'after': local is newer, ignore the remote copy
        return False
    
    def _compare_clocks(self, clock1: dict, clock2: dict) -> str:
        """Compare two vector clocks"""
        all_keys = set(clock1.keys()) | set(clock2.keys())
        
        clock1_greater = False
        clock2_greater = False
        
        for key in all_keys:
            v1 = clock1.get(key, 0)
            v2 = clock2.get(key, 0)
            
            if v1 > v2:
                clock1_greater = True
            elif v2 > v1:
                clock2_greater = True
        
        if clock1_greater and not clock2_greater:
            return 'after'
        elif clock2_greater and not clock1_greater:
            return 'before'
        else:
            return 'concurrent'
    
    def _merge_clocks(self, clock1: dict, clock2: dict) -> dict:
        """Merge two vector clocks (pointwise maximum)"""
        merged = {}
        all_keys = set(clock1.keys()) | set(clock2.keys())
        
        for key in all_keys:
            merged[key] = max(clock1.get(key, 0), clock2.get(key, 0))
        
        return merged
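Vector-clock comparison and pointwise-max merging are worth verifying on their own; the two helpers below restate `_compare_clocks` and `_merge_clocks` and exercise the three possible orderings:

```python
def compare_clocks(c1: dict, c2: dict) -> str:
    keys = set(c1) | set(c2)
    g1 = any(c1.get(k, 0) > c2.get(k, 0) for k in keys)
    g2 = any(c2.get(k, 0) > c1.get(k, 0) for k in keys)
    if g1 and not g2:
        return 'after'       # c1 causally follows c2
    if g2 and not g1:
        return 'before'      # c1 causally precedes c2
    return 'concurrent'      # neither dominates the other

def merge_clocks(c1: dict, c2: dict) -> dict:
    # Pointwise maximum over all agents seen by either clock
    return {k: max(c1.get(k, 0), c2.get(k, 0)) for k in set(c1) | set(c2)}

assert compare_clocks({'a': 1}, {'a': 2}) == 'before'
assert compare_clocks({'a': 2, 'b': 1}, {'a': 2}) == 'after'
assert compare_clocks({'a': 2}, {'a': 1, 'b': 1}) == 'concurrent'
assert merge_clocks({'a': 2}, {'a': 1, 'b': 1}) == {'a': 2, 'b': 1}
```

Only the 'concurrent' case needs a tiebreaker; that is exactly where the class above falls back to wall-clock LWW, which trades causal precision for simplicity.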

6. Practical Examples

6.1 Data Analysis Workflow

# Wiring up multi-agent collaboration
class DataAnalysisWorkflow:
    """Data analysis workflow"""
    
    def __init__(self):
        self.bus = MessageBus()
        self.scheduler = TaskScheduler(
            task_decomposer=TaskDecomposer(llm_client),    # llm_client initialized elsewhere
            agent_selector=AgentSelector(agent_registry),  # agent_registry initialized elsewhere
            message_bus=self.bus
        )
        
        # Register agents
        self._register_agents()
    
    def _register_agents(self):
        """Register agents"""
        self.agents = {
            'data_collector': AgentState(
                id='data_collector',
                name='Data collection agent',
                role=AgentRole.WORKER,
                capabilities=['web_scraping', 'api_integration', 'database_query'],
                current_task=None,
                status='idle',
                last_heartbeat=time.time()
            ),
            'data_analyst': AgentState(
                id='data_analyst',
                name='Data analysis agent',
                role=AgentRole.WORKER,
                capabilities=['statistics', 'data_processing', 'ml_modeling'],
                current_task=None,
                status='idle',
                last_heartbeat=time.time()
            ),
            'visualizer': AgentState(
                id='visualizer',
                name='Visualization agent',
                role=AgentRole.WORKER,
                capabilities=['chart_generation', 'dashboard_creation'],
                current_task=None,
                status='idle',
                last_heartbeat=time.time()
            ),
            'report_writer': AgentState(
                id='report_writer',
                name='Report writing agent',
                role=AgentRole.WORKER,
                capabilities=['report_writing', 'summarization'],
                current_task=None,
                status='idle',
                last_heartbeat=time.time()
            )
        }
    
    def execute(self, analysis_request: str) -> str:
        """
        Run a data analysis job.
        
        Args:
            analysis_request: the analysis request
        
        Returns:
            str: task ID
        """
        # Submit the task
        task_id = self.scheduler.submit(analysis_request)
        
        # Monitor progress
        self._monitor_progress(task_id)
        
        return task_id
    
    def _monitor_progress(self, task_id: str):
        """Monitor progress"""
        # Subscribe to task-completed events
        def on_task_completed(message):
            print(f"Task completed: {message['task_id']}")
        
        self.bus.subscribe('task.completed', on_task_completed)
6.2 Configuration Example

# multi_agent_config.yaml
multi_agent:
  # Coordinator settings
  coordinator:
    enabled: true
    max_concurrent_tasks: 10
    task_timeout: 300  # 5 minutes
    
  # Agent settings
  agents:
    - id: "data_collector"
      name: "Data collection agent"
      role: "worker"
      capabilities:
        - web_scraping
        - api_integration
        - database_query
      resources:
        max_memory_mb: 512
        max_cpu_percent: 50
    
    - id: "data_analyst"
      name: "Data analysis agent"
      role: "worker"
      capabilities:
        - statistics
        - data_processing
        - ml_modeling
      resources:
        max_memory_mb: 1024
        max_cpu_percent: 80
    
    - id: "visualizer"
      name: "Visualization agent"
      role: "worker"
      capabilities:
        - chart_generation
        - dashboard_creation
      resources:
        max_memory_mb: 256
        max_cpu_percent: 30
  
  # Message bus settings
  message_bus:
    type: "redis"  # redis, rabbitmq, kafka
    host: "localhost"
    port: 6379
    channel_prefix: "openclaw:acp:"
  
  # State synchronization settings
  state_sync:
    enabled: true
    sync_interval: 5  # seconds
    conflict_resolution: "lww"  # lww, crdt, manual
  
  # Monitoring settings
  monitoring:
    enabled: true
    heartbeat_interval: 10  # seconds
    agent_timeout: 30  # seconds
    alert_on_failure: true

7. Performance Optimization

7.1 Message Queue Optimization

class OptimizedMessageBus:
    """Optimized message bus"""
    
    def __init__(self, config: dict):
        self.config = config
        self.batch_size = config.get('batch_size', 100)
        self.flush_interval = config.get('flush_interval', 0.1)  # seconds
        
        self.message_buffer: Dict[str, List[dict]] = {}
        self.last_flush: Dict[str, float] = {}
    
    def publish(self, topic: str, message: dict, batch: bool = True):
        """Publish a message (optionally batched)"""
        if batch:
            # Add to the buffer
            if topic not in self.message_buffer:
                self.message_buffer[topic] = []
            self.message_buffer[topic].append(message)
            
            # Flush if the batch is full or the interval has elapsed
            now = time.time()
            if (len(self.message_buffer[topic]) >= self.batch_size or
                now - self.last_flush.get(topic, 0) > self.flush_interval):
                self._flush(topic)
        else:
            # Send immediately
            self._send_to_broker(topic, message)
    
    def _flush(self, topic: str):
        """Flush the buffer"""
        if topic in self.message_buffer and self.message_buffer[topic]:
            messages = self.message_buffer[topic]
            self._send_to_broker(topic, {'batch': messages})
            self.message_buffer[topic] = []
            self.last_flush[topic] = time.time()
    
    def _send_to_broker(self, topic: str, payload: dict):
        """Hand the payload to the underlying broker (Redis/RabbitMQ/Kafka); elided here"""
        ...
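The size-triggered half of the batching policy can be exercised with a recording sink standing in for the broker. This condensed stand-in omits the time-interval trigger (which needs a clock) and keeps only the "flush when the buffer reaches `batch_size`" rule:

```python
class BatchingBus:
    """Flush when the buffer reaches batch_size; 'sink' records what a broker would receive."""
    def __init__(self, batch_size: int, sink: list):
        self.batch_size = batch_size
        self.buffer = []
        self.sink = sink

    def publish(self, message: dict):
        self.buffer.append(message)
        if len(self.buffer) >= self.batch_size:
            self.sink.append({'batch': self.buffer})  # one broker round-trip per batch
            self.buffer = []

sent = []
bus = BatchingBus(batch_size=3, sink=sent)
for i in range(7):
    bus.publish({'seq': i})

assert len(sent) == 2                # two full batches went to the broker
assert len(bus.buffer) == 1          # the seventh message is still buffered
assert [m['seq'] for m in sent[0]['batch']] == [0, 1, 2]
```

Batching turns seven broker round-trips into two plus a pending remainder; the interval trigger in the class above bounds how long that remainder can sit in the buffer.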

7.2 Task Parallelism

from concurrent.futures import Future, ThreadPoolExecutor

class ParallelTaskExecutor:
    """Parallel task executor"""
    
    def __init__(self, max_workers: int = 10):
        self.max_workers = max_workers
        self.executor = ThreadPoolExecutor(max_workers=max_workers)
        self.futures: Dict[str, Future] = {}
    
    def execute_parallel(self, tasks: List[Task], 
                        agent_map: Dict[str, str]) -> Dict[str, Any]:
        """
        Execute tasks in parallel.
        
        Args:
            tasks: list of tasks
            agent_map: task ID -> agent ID mapping
        
        Returns:
            Dict: execution results
        """
        # Group tasks that can run in parallel
        parallel_groups = self._find_parallel_groups(tasks)
        
        results = {}
        for group in parallel_groups:
            # Submit the group's tasks
            futures = []
            for task in group:
                agent_id = agent_map[task.id]
                future = self.executor.submit(
                    self._execute_single_task,  # dispatches one task to its agent (not shown)
                    task, agent_id
                )
                futures.append((task.id, future))
            
            # Wait for the whole group to finish
            for task_id, future in futures:
                results[task_id] = future.result()
        
        return results
    
    def _find_parallel_groups(self, tasks: List[Task]) -> List[List[Task]]:
        """Group tasks by dependency level (topological layers)"""
        groups = []
        remaining = set(t.id for t in tasks)
        completed = set()
        
        while remaining:
            # Tasks whose dependencies are all completed
            ready = []
            for task_id in remaining:
                task = next(t for t in tasks if t.id == task_id)
                if all(dep in completed for dep in task.dependencies):
                    ready.append(task)
            
            if not ready:
                raise ValueError("Circular dependency detected")
            
            groups.append(ready)
            for task in ready:
                remaining.remove(task.id)
                completed.add(task.id)
        
        return groups
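The layered topological grouping is the piece worth testing on its own. With plain (id, deps) pairs, a diamond-shaped dependency graph splits into three layers, and a cycle is rejected, mirroring `_find_parallel_groups`:

```python
def parallel_groups(tasks: list) -> list:
    """tasks: list of (task_id, [dep_ids]); returns layers of IDs that may run concurrently."""
    remaining = {tid for tid, _ in tasks}
    deps = dict(tasks)
    completed, groups = set(), []
    while remaining:
        # Everything whose dependencies are already completed forms the next layer
        ready = sorted(t for t in remaining if all(d in completed for d in deps[t]))
        if not ready:
            raise ValueError("Circular dependency detected")
        groups.append(ready)
        remaining -= set(ready)
        completed |= set(ready)
    return groups

# Diamond: collect -> {analyze, visualize} -> report
diamond = [("collect", []), ("analyze", ["collect"]),
           ("visualize", ["collect"]), ("report", ["analyze", "visualize"])]
assert parallel_groups(diamond) == [["collect"], ["analyze", "visualize"], ["report"]]

try:
    parallel_groups([("a", ["b"]), ("b", ["a"])])
except ValueError:
    pass  # cycle detected, as expected
```

The middle layer (["analyze", "visualize"]) is exactly the parallelism the executor exploits: both tasks go to the thread pool at once, and the group barrier waits for both before "report" starts.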

8. Monitoring and Debugging

8.1 Monitoring Metrics

monitoring:
  metrics:
    - name: "task_throughput"
      type: "counter"
      description: "Task throughput (tasks/second)"
      labels: ["task_type", "status"]
    
    - name: "agent_utilization"
      type: "gauge"
      description: "Agent utilization"
      labels: ["agent_id"]
    
    - name: "task_latency"
      type: "histogram"
      description: "Task latency distribution"
      labels: ["task_type"]
      buckets: [0.1, 0.5, 1, 5, 10, 30, 60, 300]
    
    - name: "message_queue_size"
      type: "gauge"
      description: "Message queue size"
      labels: ["topic"]
    
    - name: "conflict_rate"
      type: "gauge"
      description: "Conflict rate"
      labels: ["conflict_type"]

8.2 Debugging Tools

class CollaborationDebugger:
    """Collaboration debugger"""
    
    def __init__(self, message_bus: MessageBus):
        self.bus = message_bus
        self.event_log: List[dict] = []
        
        # Subscribe to all events ('+' assumes an MQTT-style wildcard topic;
        # the in-process MessageBus above matches topics exactly)
        self.bus.subscribe('+', self._log_event)
    
    def _log_event(self, event: dict):
        """Record an event"""
        self.event_log.append({
            'timestamp': time.time(),
            'event': event
        })
    
    def replay(self, task_id: str) -> str:
        """
        Replay a task's execution history.
        
        Args:
            task_id: task ID
        
        Returns:
            str: execution trace
        """
        # Filter the relevant events
        task_events = [
            e for e in self.event_log
            if e['event'].get('task_id') == task_id
        ]
        
        # Render the trace
        trajectory = []
        for event in sorted(task_events, key=lambda x: x['timestamp']):
            trajectory.append(
                f"[{event['timestamp']:.3f}] {event['event']['type']}: {event['event']}"
            )
        
        return '\n'.join(trajectory)
    
    def visualize(self, task_id: str) -> str:
        """Render the task's execution flow as a Mermaid flowchart"""
        graph = ["graph TD"]
        
        task_events = [
            e for e in self.event_log
            if e['event'].get('task_id') == task_id
        ]
        
        for event in task_events:
            event_type = event['event']['type']
            if event_type == 'task.created':
                graph.append(f"    {task_id}[Task Created]")
            elif event_type == 'task.assigned':
                agent = event['event']['agent_id']
                graph.append(f"    {task_id} --> {agent}[Assigned to {agent}]")
            elif event_type == 'task.completed':
                graph.append(f"    {task_id} --> {task_id}_done[Completed]")
        
        return '\n'.join(graph)

9. Summary

The ACP protocol is the core mechanism by which OpenClaw automates complex tasks, built on:

Task assignment

  • Intelligent task decomposition
  • Capability-based agent selection
  • Dependency-aware scheduling

State synchronization

  • Real-time state broadcast
  • Vector clocks for consistency
  • CRDTs for resolving concurrent conflicts

Conflict resolution

  • Resource locking
  • Last-writer-wins (LWW)
  • Configurable conflict resolution strategies

Performance optimization

  • Batched message processing
  • Parallel task execution
  • Better resource utilization

Observability

  • Comprehensive monitoring metrics
  • Event logs
  • Debugging tools

Future directions

  • More collaboration patterns (bidding, auctions, etc.)
  • Adaptive task assignment
  • Cross-region distributed collaboration

