Day 5: Production Multi-Agent Systems: State, Error Handling, and Observability

Today's Objective

Day 5 covers what separates a demo multi-agent system from a production one: state management, graceful error handling, retry logic, and logging to understand what your agents are doing.

Agent State Management

Real multi-agent systems need to track state: what each agent did, what its output was, and whether it succeeded. This enables retry, debugging, and audit trails.

agent-runtime.js

import Anthropic from '@anthropic-ai/sdk';
import * as dotenv from 'dotenv';
// Run dotenv's config step before the client is constructed.
// NOTE(review): presumably this loads a local .env file into process.env — confirm against the dotenv docs.
dotenv.config();

// Single SDK client shared by runAgent for every model call in this file.
// NOTE(review): constructed with no explicit options, so credentials presumably
// come from the environment (e.g. ANTHROPIC_API_KEY) — confirm against the SDK docs.
const client = new Anthropic();

/**
 * Tracks the state of one workflow run: its goal, a log of agent steps,
 * and whether it is still running, completed, or failed.
 *
 * A run starts with status 'running'; complete() and fail() move it to
 * 'complete' or 'failed' and stamp the total elapsed time.
 */
class AgentRun {
  /** Maximum number of characters of each step's input/output kept in the log. */
  static previewLimit = 200;

  /**
   * @param {string} goal - What the workflow is trying to accomplish.
   */
  constructor(goal) {
    // Uses the global `crypto` object; no crypto import is visible in this file.
    this.id = crypto.randomUUID();
    this.goal = goal;
    this.steps = [];
    this.status = 'running';
    this.startTime = Date.now();
  }

  /**
   * Turns any value into a short string suitable for the step log.
   * Non-strings are coerced (null/undefined become '') so that logging a
   * step can never throw.
   *
   * @param {unknown} value
   * @returns {string} At most `AgentRun.previewLimit` characters.
   */
  static #preview(value) {
    const text = typeof value === 'string' ? value : String(value ?? '');
    return text.slice(0, AgentRun.previewLimit);
  }

  /**
   * Appends one agent step to the log. Input and output are truncated to
   * `AgentRun.previewLimit` characters; the full text is not retained.
   *
   * @param {string} agentName - Name of the agent that ran.
   * @param {unknown} input - What the agent was given.
   * @param {unknown} output - What the agent produced.
   * @param {number} durationMs - How long the step took, in milliseconds.
   */
  addStep(agentName, input, output, durationMs) {
    this.steps.push({
      agentName,
      input: AgentRun.#preview(input),
      output: AgentRun.#preview(output),
      durationMs,
      timestamp: new Date().toISOString(),
    });
  }

  /**
   * Marks the run as successfully finished.
   *
   * @param {*} finalOutput - Result of the workflow, stored untruncated.
   * @returns {AgentRun} This run, for chaining.
   */
  complete(finalOutput) {
    this.status = 'complete';
    this.finalOutput = finalOutput;
    this.totalMs = Date.now() - this.startTime;
    return this;
  }

  /**
   * Marks the run as failed and records why.
   *
   * @param {unknown} error - The thrown value. Its `message` is recorded when
   *   present; anything else (a thrown string, null, ...) is stringified
   *   instead of crashing or recording `undefined`.
   * @returns {AgentRun} This run, for chaining.
   */
  fail(error) {
    this.status = 'failed';
    this.error = error?.message ?? String(error);
    this.totalMs = Date.now() - this.startTime;
    return this;
  }

  /**
   * Compact overview of the run for logging.
   *
   * @returns {{id: string, status: string, totalMs: (number|undefined), steps: number, stepNames: string[]}}
   *   `totalMs` is undefined until complete() or fail() has been called.
   */
  summary() {
    return {
      id: this.id,
      status: this.status,
      totalMs: this.totalMs,
      steps: this.steps.length,
      stepNames: this.steps.map((step) => step.agentName),
    };
  }
}

/**
 * Decides whether a failed model call is worth retrying.
 *
 * Errors without a numeric `status` (e.g. connection failures) are treated as
 * transient. With a status, only 408, 409, 429 and 5xx are retried; other
 * 4xx responses (bad request, auth, not found, ...) will fail identically on
 * every attempt.
 * NOTE(review): assumes SDK API errors expose the HTTP code as `error.status` —
 * confirm against the installed @anthropic-ai/sdk version.
 *
 * @param {unknown} error - The value thrown by the API call.
 * @returns {boolean} True when another attempt may succeed.
 */
function isRetryableError(error) {
  const status = error?.status;
  if (typeof status !== 'number') return true;
  return status === 408 || status === 409 || status === 429 || status >= 500;
}

/**
 * Runs one agent: sends `message` to the model under the `system` prompt,
 * retrying transient failures with a linearly growing delay, and records the
 * successful step on `run`.
 *
 * Only the API call itself is retried. Extracting the text and logging the
 * step happen after the retry loop, so a problem there cannot trigger extra
 * (billable) model calls.
 *
 * @param {string} name - Agent name used in the step log and warnings.
 * @param {string} system - System prompt for this agent.
 * @param {string} message - User message sent to the model.
 * @param {{addStep: Function}} run - Run tracker; `addStep(name, input, output, durationMs)` is called once on success.
 * @param {object} [options]
 * @param {number} [options.maxAttempts=3] - Total attempts before giving up (at least one call is always made).
 * @param {number} [options.baseDelayMs=1000] - Delay before retry k is `baseDelayMs * k`.
 * @param {string} [options.model='claude-opus-4-5'] - Model identifier.
 * @param {number} [options.maxTokens=2048] - Value sent as `max_tokens`.
 * @returns {Promise<string>} The concatenated text of all text blocks in the response.
 * @throws The last API error once attempts are exhausted or the error is not
 *   retryable; an Error if the response contains no text block.
 */
async function runAgent(name, system, message, run, options = {}) {
  const {
    maxAttempts = 3,
    baseDelayMs = 1000,
    model = 'claude-opus-4-5',
    maxTokens = 2048,
  } = options;

  // Started before the loop on purpose: the recorded duration covers failed
  // attempts and backoff waits, i.e. the wall-clock cost of the step.
  const start = Date.now();

  let response;
  for (let attempt = 1; ; attempt++) {
    try {
      response = await client.messages.create({
        model,
        max_tokens: maxTokens,
        system,
        messages: [{ role: 'user', content: message }],
      });
      break;
    } catch (error) {
      if (attempt >= maxAttempts || !isRetryableError(error)) throw error;

      const delayMs = baseDelayMs * attempt;
      // Make retries visible instead of swallowing the failure silently.
      console.warn(
        `[${name}] attempt ${attempt}/${maxAttempts} failed (${error?.message ?? error}); retrying in ${delayMs}ms`
      );
      await new Promise((resolve) => setTimeout(resolve, delayMs));
    }
  }

  // Do not assume the first content block is text; collect every text block.
  const textBlocks = (response?.content ?? []).filter((block) => block?.type === 'text');
  if (textBlocks.length === 0) {
    throw new Error(`Agent "${name}" returned no text content`);
  }
  const output = textBlocks.map((block) => block.text).join('');

  run.addStep(name, message, output, Date.now() - start);
  return output;
}

/**
 * Runs the research → write → edit pipeline for a goal, tracking every step
 * on a fresh AgentRun.
 *
 * Never rejects because of an agent failure: any error thrown by a stage is
 * captured on the run via fail().
 *
 * @param {string} goal - Topic the pipeline should produce content about.
 * @returns {Promise<AgentRun>} The run, completed with the editor's output or failed with the error.
 */
async function runWorkflow(goal) {
  const run = new AgentRun(goal);

  // Each stage builds its prompt lazily from the outputs gathered so far.
  const stages = [
    {
      name: 'researcher',
      system: 'Research topics thoroughly. Provide facts and examples.',
      prompt: () => `Research: ${goal}`,
    },
    {
      name: 'writer',
      system: 'Write clear, engaging content from research notes.',
      prompt: (outputs) => `Write about: ${goal}\n\nResearch: ${outputs.researcher}`,
    },
    {
      name: 'editor',
      system: 'Edit for clarity and impact. Return the improved version.',
      prompt: (outputs) => `Edit: ${outputs.writer}`,
    },
  ];

  try {
    const outputs = {};
    // Stages are strictly sequential: each one consumes the previous result.
    for (const { name, system, prompt } of stages) {
      outputs[name] = await runAgent(name, system, prompt(outputs), run);
    }
    return run.complete(outputs.editor);
  } catch (error) {
    return run.fail(error);
  }
}

// Entry point: run the demo workflow once and report what happened.
const result = await runWorkflow('Best practices for remote team communication');
console.log('Run summary:', result.summary());

if (result.status === 'failed') {
  // A failed run carries `error` instead of `finalOutput`; show that rather
  // than printing "undefined" as the final output.
  console.error('Run failed:', result.error);
} else {
  // Newlines are written as escapes: a quoted string literal cannot span
  // multiple source lines.
  console.log('\nFinal output:\n', result.finalOutput);
}

Finished!

Back to course overview

→

Production Multi-Agent Systems: State, Error Handling, and Observability

Today's Objective

Agent State Management

Supporting References & Reading

Go deeper with these external resources.

Day 5 Checkpoint