This guide covers training RL-powered agents in Azcore: basic training loops, training with a real agent, evaluation strategies, hyperparameter tuning, learning-curve tracking, and advanced techniques such as early stopping and learning-rate scheduling.
🎯 Training Overview
Training an RL agent involves repeatedly:
- Selecting tools based on current policy
- Executing those tools
- Receiving reward feedback
- Updating Q-values
- Adjusting exploration over time
The goal is to learn which tools work best for which types of queries through experience.
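Under the hood, each update nudges the Q-value of the chosen (state, tool) pair toward the observed reward, scaled by the learning rate and, in multi-step settings, the discounted value of the next state. The sketch below illustrates the standard tabular Q-learning rule these parameters refer to; it is a conceptual illustration, not the RLManager implementation.
# Illustrative tabular Q-learning update (conceptual sketch, not Azcore internals)
def q_update(q_table, state, tool, reward, next_state=None,
             learning_rate=0.1, discount_factor=0.99):
    current_q = q_table.get(state, {}).get(tool, 0.0)
    # Bootstrapped future value; 0.0 for single-step (bandit-style) tool selection
    future_q = max(q_table.get(next_state, {}).values(), default=0.0) if next_state else 0.0
    target = reward + discount_factor * future_q
    q_table.setdefault(state, {})[tool] = current_q + learning_rate * (target - current_q)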
🏋️ Basic Training Loop
Simple Training Structure
from azcore.rl.rl_manager import RLManager
from azcore.rl.rewards import HeuristicRewardCalculator
from langchain_core.messages import HumanMessage
# Setup RL components
rl_manager = RLManager(
tool_names=["search", "calculate", "weather", "email"],
q_table_path="rl_data/training.pkl",
exploration_rate=0.2,
learning_rate=0.1,
discount_factor=0.99,
use_embeddings=True
)
reward_calculator = HeuristicRewardCalculator(
success_reward=1.0,
failure_penalty=-0.5
)
# Training data: (query, correct_tools)
training_data = [
("Calculate 15 * 23", ["calculate"]),
("What's 50 plus 25?", ["calculate"]),
("Weather in NYC", ["weather"]),
("Temperature in London", ["weather"]),
("Search for Python tutorials", ["search"]),
("Find information on AI", ["search"]),
("Send email to Bob", ["email"]),
]
# Training loop
num_epochs = 10
for epoch in range(num_epochs):
epoch_rewards = []
for query, correct_tools in training_data:
# 1. Select tools using current policy
selected_tools, state_key = rl_manager.select_tools(
query,
top_n=2
)
# 2. Simulate execution and compute reward
# In production, this would be actual agent execution
if any(tool in correct_tools for tool in selected_tools):
reward = 1.0 # Success
else:
reward = -0.5 # Failure
epoch_rewards.append(reward)
# 3. Update Q-values for each selected tool
for tool in selected_tools:
rl_manager.update(state_key, tool, reward)
# 4. Decay exploration after each epoch
rl_manager.anneal_exploration(decay_rate=0.95)
# 5. Log progress
avg_reward = sum(epoch_rewards) / len(epoch_rewards)
stats = rl_manager.get_statistics()
print(f"Epoch {epoch + 1}/{num_epochs}")
print(f" Avg Reward: {avg_reward:.3f}")
print(f" Exploration Rate: {stats['exploration_rate']:.3f}")
print(f" Total States: {stats['total_states']}")
# 6. Save checkpoint
if (epoch + 1) % 5 == 0:
rl_manager.force_persist()
print(f" ✓ Checkpoint saved")
print()
# Final save
rl_manager.force_persist()
print("Training complete!")
Example Output
Exact numbers will vary from run to run because exploration is random; the pattern to look for is a rising average reward alongside a decaying exploration rate.
Epoch 1/10
Avg Reward: 0.143
Exploration Rate: 0.190
Total States: 7
Epoch 2/10
Avg Reward: 0.429
Exploration Rate: 0.181
Total States: 7
Epoch 3/10
Avg Reward: 0.571
Exploration Rate: 0.172
Total States: 7
...
Epoch 10/10
Avg Reward: 0.857
Exploration Rate: 0.120
Total States: 7
✓ Checkpoint saved
Training complete!
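After training, the persisted Q-table can be reused for inference. The sketch below assumes that RLManager reloads an existing file found at q_table_path (consistent with the persistence calls above); exploration is set to zero so the agent purely exploits what it learned.
# Reuse the trained policy (assumes RLManager reloads the Q-table at q_table_path)
inference_rl = RLManager(
    tool_names=["search", "calculate", "weather", "email"],
    q_table_path="rl_data/training.pkl",
    exploration_rate=0.0  # pure exploitation at inference time
)
selected_tools, state_key = inference_rl.select_tools("What's 12 * 7?", top_n=1)
print(selected_tools)  # expected to favor "calculate" after training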
🎓 Training with a Real Agent
Full Integration Example
from azcore.agents.agent_factory import AgentFactory
from azcore.rl.rl_manager import RLManager, ExplorationStrategy
from azcore.rl.rewards import HeuristicRewardCalculator
from langchain_openai import ChatOpenAI
from langchain_core.tools import tool
from langchain_core.messages import HumanMessage
# Define tools
@tool
def search_web(query: str) -> str:
"""Search the web for information."""
return f"Search results for: {query}"
@tool
def calculate(expression: str) -> str:
"""Evaluate a mathematical expression."""
try:
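        # NOTE: eval() executes arbitrary code; for untrusted input, replace it
        # with a restricted math parser in production.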
result = eval(expression)
return f"Result: {result}"
except Exception as e:
return f"Error: {str(e)}"
@tool
def fetch_weather(location: str) -> str:
"""Get weather for a location."""
return f"Weather in {location}: 72°F, Sunny"
tools = [search_web, calculate, fetch_weather]
# Setup LLM
llm = ChatOpenAI(model="gpt-4", temperature=0)
# Create RL components
rl_manager = RLManager(
tool_names=[t.name for t in tools],
q_table_path="rl_data/real_agent_training.pkl",
exploration_strategy=ExplorationStrategy.EPSILON_DECAY,
exploration_rate=0.3,
epsilon_decay_rate=0.98,
min_exploration_rate=0.05,
use_embeddings=True
)
reward_calc = HeuristicRewardCalculator(
success_reward=1.0,
failure_penalty=-0.5,
empty_penalty=-0.3
)
# Create RL-enabled agent
factory = AgentFactory(default_llm=llm)
agent = factory.create_react_agent(
name="training_agent",
tools=tools,
rl_enabled=True,
rl_manager=rl_manager,
reward_calculator=reward_calc,
prompt="You are a helpful assistant. Use tools to answer queries accurately."
)
# Training queries
training_queries = [
"What is 25 * 4?",
"Calculate 100 / 5",
"What's the square root of 144?",
"Weather in San Francisco?",
"What's the temperature in Tokyo?",
"Search for machine learning tutorials",
"Find information about Python",
]
# Training loop
num_epochs = 5
for epoch in range(num_epochs):
print(f"\n{'='*60}")
print(f"Epoch {epoch + 1}/{num_epochs}")
print(f"{'='*60}\n")
epoch_rewards = []
for i, query in enumerate(training_queries, 1):
print(f"Query {i}/{len(training_queries)}: {query}")
# Invoke agent (RL happens automatically)
state = {"messages": [HumanMessage(content=query)]}
result = agent.invoke(state)
# Extract response
response = result["messages"][-1].content
print(f"Response: {response[:100]}...")
        # Reward is calculated automatically by the agent; read it from the
        # returned state for monitoring
        rl_metadata = result.get("rl_metadata", {})
if "last_reward" in rl_metadata:
epoch_rewards.append(rl_metadata["last_reward"])
print()
# Epoch summary
if epoch_rewards:
avg_reward = sum(epoch_rewards) / len(epoch_rewards)
print(f"\nEpoch {epoch + 1} Summary:")
print(f" Average Reward: {avg_reward:.3f}")
stats = rl_manager.get_statistics()
print(f" Exploration Rate: {stats['exploration_rate']:.3f}")
print(f" Total States: {stats['total_states']}")
print(f" Q-Table Size: {len(rl_manager.q_table)}")
# Checkpoint
if (epoch + 1) % 2 == 0:
rl_manager.force_persist()
print(f" ✓ Checkpoint saved")
print("\n" + "="*60)
print("Training Complete!")
print("="*60)
# Final statistics
final_stats = rl_manager.get_statistics()
print(f"\nFinal Statistics:")
print(f" Total States Learned: {final_stats['total_states']}")
print(f" Total State Visits: {final_stats['total_state_visits']}")
print(f" Final Exploration Rate: {final_stats['exploration_rate']:.3f}")
print(f" Non-Zero Q-Values: {final_stats['non_zero_q_values']}")
# Show top performing tools
top_tools = rl_manager.get_top_performing_tools(top_n=3)
print(f"\nTop Performing Tools:")
for tool_name, avg_q in top_tools:
    print(f" {tool_name}: {avg_q:.3f}")
📊 Evaluation
Separate Train/Test Split
# Split data
from sklearn.model_selection import train_test_split
all_queries = [
("Calculate 15 * 23", ["calculate"]),
("Weather in NYC", ["weather"]),
("Search for Python", ["search"]),
# ... more queries
]
train_data, test_data = train_test_split(
all_queries,
test_size=0.2,
random_state=42
)
print(f"Training samples: {len(train_data)}")
print(f"Test samples: {len(test_data)}")
Evaluation Function
def evaluate_rl_agent(rl_manager, test_data, verbose=True):
"""
Evaluate RL agent on test data.
Args:
rl_manager: RLManager instance
test_data: List of (query, correct_tools) tuples
verbose: Print detailed results
Returns:
Dictionary with evaluation metrics
"""
# Switch to pure exploitation
original_exploration = rl_manager.exploration_rate
rl_manager.exploration_rate = 0.0
correct = 0
total = len(test_data)
tool_accuracy = {}
for query, expected_tools in test_data:
# Select tools
selected_tools, state_key = rl_manager.select_tools(query, top_n=1)
# Check if correct
is_correct = selected_tools[0] in expected_tools
if is_correct:
correct += 1
# Track per-tool accuracy
for expected_tool in expected_tools:
if expected_tool not in tool_accuracy:
tool_accuracy[expected_tool] = {"correct": 0, "total": 0}
tool_accuracy[expected_tool]["total"] += 1
if is_correct and selected_tools[0] == expected_tool:
tool_accuracy[expected_tool]["correct"] += 1
if verbose:
status = "✓" if is_correct else "✗"
print(f"{status} Query: {query}")
print(f" Expected: {expected_tools}")
print(f" Selected: {selected_tools[0]}")
print()
# Restore exploration rate
rl_manager.exploration_rate = original_exploration
# Compute metrics
accuracy = correct / total
# Per-tool accuracy
tool_accuracies = {}
for tool, counts in tool_accuracy.items():
tool_accuracies[tool] = counts["correct"] / counts["total"]
results = {
"accuracy": accuracy,
"correct": correct,
"total": total,
"tool_accuracies": tool_accuracies
}
return results
# Usage
print("="*60)
print("EVALUATION")
print("="*60)
results = evaluate_rl_agent(rl_manager, test_data, verbose=True)
print(f"\nOverall Accuracy: {results['accuracy']:.2%}")
print(f"Correct: {results['correct']}/{results['total']}")
print(f"\nPer-Tool Accuracy:")
for tool, acc in results['tool_accuracies'].items():
print(f" {tool}: {acc:.2%}")
Cross-Validation
from sklearn.model_selection import KFold
import numpy as np
def cross_validate_rl(data, n_splits=5):
"""Perform k-fold cross-validation."""
kf = KFold(n_splits=n_splits, shuffle=True, random_state=42)
fold_results = []
for fold, (train_idx, test_idx) in enumerate(kf.split(data), 1):
print(f"\nFold {fold}/{n_splits}")
print("-" * 40)
# Split data
train_fold = [data[i] for i in train_idx]
test_fold = [data[i] for i in test_idx]
# Create fresh RL manager
rl = RLManager(
tool_names=["search", "calculate", "weather"],
exploration_rate=0.2
)
# Train
print("Training...")
train_rl_agent(rl, train_fold, epochs=10)
# Evaluate
print("Evaluating...")
results = evaluate_rl_agent(rl, test_fold, verbose=False)
fold_results.append(results['accuracy'])
print(f"Fold {fold} Accuracy: {results['accuracy']:.2%}")
# Summary
mean_acc = np.mean(fold_results)
std_acc = np.std(fold_results)
print(f"\n{'='*40}")
print(f"Cross-Validation Results")
print(f"{'='*40}")
print(f"Mean Accuracy: {mean_acc:.2%} ± {std_acc:.2%}")
print(f"Fold Accuracies: {[f'{acc:.2%}' for acc in fold_results]}")
return fold_results
# Run cross-validation
cv_results = cross_validate_rl(all_queries, n_splits=5)
⚙️ Hyperparameter Tuning
Grid Search
from itertools import product
def grid_search_rl_hyperparameters(train_data, test_data):
"""
Perform grid search over RL hyperparameters.
Returns best configuration and results.
"""
# Define hyperparameter grid
param_grid = {
'exploration_rate': [0.1, 0.15, 0.2, 0.3],
'learning_rate': [0.05, 0.1, 0.2],
'discount_factor': [0.9, 0.95, 0.99],
'exploration_strategy': [
ExplorationStrategy.EPSILON_GREEDY,
ExplorationStrategy.EPSILON_DECAY
]
}
# Generate all combinations
keys = param_grid.keys()
values = param_grid.values()
combinations = [dict(zip(keys, v)) for v in product(*values)]
print(f"Testing {len(combinations)} hyperparameter combinations...\n")
results = []
for i, params in enumerate(combinations, 1):
print(f"Configuration {i}/{len(combinations)}")
print(f" Params: {params}")
# Create RL manager with these params
rl = RLManager(
tool_names=["search", "calculate", "weather"],
q_table_path=f"rl_data/grid_search_{i}.pkl",
**params
)
# Train
train_rl_agent(rl, train_data, epochs=10)
# Evaluate
eval_results = evaluate_rl_agent(rl, test_data, verbose=False)
accuracy = eval_results['accuracy']
print(f" Accuracy: {accuracy:.2%}\n")
results.append({
'params': params,
'accuracy': accuracy,
'eval_results': eval_results
})
# Sort by accuracy
results.sort(key=lambda x: x['accuracy'], reverse=True)
# Print top 5
print("\n" + "="*60)
print("Top 5 Configurations")
print("="*60)
for i, result in enumerate(results[:5], 1):
print(f"\n{i}. Accuracy: {result['accuracy']:.2%}")
print(f" Parameters:")
for key, value in result['params'].items():
if isinstance(value, ExplorationStrategy):
value = value.value
print(f" {key}: {value}")
return results[0] # Return best configuration
# Usage
best_config = grid_search_rl_hyperparameters(train_data, test_data)
print(f"\n{'='*60}")
print(f"Best Configuration Found")
print(f"{'='*60}")
print(f"Accuracy: {best_config['accuracy']:.2%}")
print(f"Parameters: {best_config['params']}")
Random Search
import random
def random_search_rl_hyperparameters(train_data, test_data, n_iter=20):
"""
Perform random search over hyperparameters.
More efficient than grid search for large parameter spaces.
"""
results = []
for i in range(n_iter):
# Sample random hyperparameters
params = {
'exploration_rate': random.uniform(0.05, 0.4),
'learning_rate': random.uniform(0.01, 0.3),
'discount_factor': random.uniform(0.85, 0.99),
'exploration_strategy': random.choice([
ExplorationStrategy.EPSILON_GREEDY,
ExplorationStrategy.EPSILON_DECAY,
ExplorationStrategy.UCB
])
}
print(f"\nIteration {i + 1}/{n_iter}")
print(f"Testing: {params}")
# Create and train
rl = RLManager(
tool_names=["search", "calculate", "weather"],
**params
)
train_rl_agent(rl, train_data, epochs=10)
eval_results = evaluate_rl_agent(rl, test_data, verbose=False)
results.append({
'params': params,
'accuracy': eval_results['accuracy']
})
print(f"Accuracy: {eval_results['accuracy']:.2%}")
# Find best
best = max(results, key=lambda x: x['accuracy'])
print(f"\n{'='*60}")
print("Best Configuration from Random Search")
print(f"{'='*60}")
print(f"Accuracy: {best['accuracy']:.2%}")
print(f"Parameters: {best['params']}")
return best
# Usage
best_random = random_search_rl_hyperparameters(train_data, test_data, n_iter=20)
📈 Learning Curves
Track Training Progress
def train_with_learning_curve(rl_manager, train_data, test_data, epochs=20):
"""
Train while tracking learning curves.
Returns training and validation metrics over time.
"""
train_accuracies = []
test_accuracies = []
exploration_rates = []
avg_q_values = []
for epoch in range(epochs):
# Training
train_rl_agent(rl_manager, train_data, epochs=1)
# Evaluate on train set
train_results = evaluate_rl_agent(rl_manager, train_data, verbose=False)
train_accuracies.append(train_results['accuracy'])
# Evaluate on test set
test_results = evaluate_rl_agent(rl_manager, test_data, verbose=False)
test_accuracies.append(test_results['accuracy'])
# Track exploration rate
exploration_rates.append(rl_manager.exploration_rate)
# Track average Q-value
all_q_values = []
for state_actions in rl_manager.q_table.values():
all_q_values.extend(state_actions.values())
avg_q = sum(all_q_values) / max(len(all_q_values), 1)
avg_q_values.append(avg_q)
print(f"Epoch {epoch + 1}/{epochs}")
print(f" Train Acc: {train_results['accuracy']:.2%}")
print(f" Test Acc: {test_results['accuracy']:.2%}")
print(f" Exploration: {rl_manager.exploration_rate:.3f}")
print(f" Avg Q-Value: {avg_q:.3f}\n")
return {
'train_accuracies': train_accuracies,
'test_accuracies': test_accuracies,
'exploration_rates': exploration_rates,
'avg_q_values': avg_q_values
}
# Usage
print("Training with learning curve tracking...\n")
metrics = train_with_learning_curve(rl_manager, train_data, test_data, epochs=20)
# Plot learning curves
import matplotlib.pyplot as plt
fig, ((ax1, ax2), (ax3, ax4)) = plt.subplots(2, 2, figsize=(14, 10))
# Accuracy plot
ax1.plot(metrics['train_accuracies'], label='Train', marker='o')
ax1.plot(metrics['test_accuracies'], label='Test', marker='s')
ax1.set_xlabel('Epoch')
ax1.set_ylabel('Accuracy')
ax1.set_title('Learning Curves')
ax1.legend()
ax1.grid(True)
# Exploration rate
ax2.plot(metrics['exploration_rates'], color='orange', marker='o')
ax2.set_xlabel('Epoch')
ax2.set_ylabel('Exploration Rate')
ax2.set_title('Exploration Rate Decay')
ax2.grid(True)
# Average Q-value
ax3.plot(metrics['avg_q_values'], color='green', marker='o')
ax3.set_xlabel('Epoch')
ax3.set_ylabel('Average Q-Value')
ax3.set_title('Q-Value Evolution')
ax3.grid(True)
# Overfitting gap
gap = [train - test for train, test in zip(
metrics['train_accuracies'],
metrics['test_accuracies']
)]
ax4.plot(gap, color='red', marker='o')
ax4.axhline(y=0, color='black', linestyle='--', alpha=0.3)
ax4.set_xlabel('Epoch')
ax4.set_ylabel('Train - Test Accuracy')
ax4.set_title('Overfitting Gap')
ax4.grid(True)
plt.tight_layout()
plt.savefig('learning_curves.png', dpi=150)
print("Learning curves saved to 'learning_curves.png'")
🎯 Advanced Training Techniques
Early Stopping
from collections import defaultdict

def train_with_early_stopping(
rl_manager,
train_data,
test_data,
max_epochs=100,
patience=5,
min_delta=0.01
):
"""
Train with early stopping based on test accuracy.
Stops if test accuracy doesn't improve for 'patience' epochs.
"""
best_test_acc = 0.0
epochs_without_improvement = 0
best_q_table = None
for epoch in range(max_epochs):
# Train one epoch
train_rl_agent(rl_manager, train_data, epochs=1)
# Evaluate
test_results = evaluate_rl_agent(rl_manager, test_data, verbose=False)
test_acc = test_results['accuracy']
print(f"Epoch {epoch + 1}: Test Accuracy = {test_acc:.2%}")
# Check for improvement
if test_acc > best_test_acc + min_delta:
best_test_acc = test_acc
epochs_without_improvement = 0
# Save best Q-table
best_q_table = {
state: dict(actions)
for state, actions in rl_manager.q_table.items()
}
print(f" ✓ New best accuracy!")
else:
epochs_without_improvement += 1
print(f" No improvement ({epochs_without_improvement}/{patience})")
# Early stopping check
if epochs_without_improvement >= patience:
print(f"\nEarly stopping after {epoch + 1} epochs")
print(f"Best test accuracy: {best_test_acc:.2%}")
# Restore best Q-table
if best_q_table:
rl_manager.q_table.clear()
for state, actions in best_q_table.items():
rl_manager.q_table[state] = defaultdict(float, actions)
print("Restored best Q-table")
break
return best_test_acc
# Usage
final_acc = train_with_early_stopping(
rl_manager,
train_data,
test_data,
max_epochs=50,
patience=5
)
Learning Rate Scheduling
import math

def train_with_lr_schedule(rl_manager, train_data, epochs=20):
"""Train with learning rate scheduling."""
initial_lr = rl_manager.learning_rate
for epoch in range(epochs):
# Cosine annealing schedule
progress = epoch / epochs
rl_manager.learning_rate = initial_lr * (
0.5 * (1 + math.cos(math.pi * progress))
)
# Train
train_rl_agent(rl_manager, train_data, epochs=1)
print(f"Epoch {epoch + 1}: LR = {rl_manager.learning_rate:.4f}")
# Restore original
rl_manager.learning_rate = initial_lr
# Usage
train_with_lr_schedule(rl_manager, train_data, epochs=20)
🎓 Best Practices
1. Start with High Exploration
# ✅ GOOD: High initial exploration
rl_manager = RLManager(
tool_names=tools,
exploration_rate=0.3, # 30% exploration initially
exploration_strategy=ExplorationStrategy.EPSILON_DECAY
)
# ❌ BAD: Too low initial exploration
rl_manager = RLManager(
tool_names=tools,
exploration_rate=0.05 # May miss good strategies
)
2. Use Train/Test Splits
# ✅ GOOD: Separate evaluation
train_data, test_data = train_test_split(data, test_size=0.2)
# ❌ BAD: Training and evaluating on same data
train_and_evaluate_on_all_data()
3. Monitor Multiple Metrics
# ✅ GOOD: Track various metrics
metrics_to_track = [
'accuracy',
'exploration_rate',
'avg_q_value',
'num_states',
'tool_distribution'
]
# ❌ BAD: Only track accuracy
only_track_accuracy()
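A small helper (a sketch, not part of Azcore) that bundles these metrics into one record per epoch, using the get_statistics() keys shown earlier and the Q-table structure used throughout this guide:
from collections import Counter

def collect_epoch_metrics(rl_manager, epoch, accuracy=None):
    """Bundle the tracked metrics into one record for logging or plotting."""
    stats = rl_manager.get_statistics()
    all_q = [q for actions in rl_manager.q_table.values() for q in actions.values()]
    return {
        "epoch": epoch,
        "accuracy": accuracy,
        "exploration_rate": stats["exploration_rate"],
        "avg_q_value": sum(all_q) / max(len(all_q), 1),
        "num_states": stats["total_states"],
        # Which tool currently has the highest Q-value in each state
        "tool_distribution": dict(Counter(
            max(actions, key=actions.get)
            for actions in rl_manager.q_table.values() if actions
        )),
    }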
4. Save Checkpoints
import shutil

# ✅ GOOD: Regular checkpoints
if epoch % 5 == 0:
rl_manager.force_persist()
shutil.copy(
"rl_data/training.pkl",
f"rl_data/checkpoint_epoch_{epoch}.pkl"
)
5. Use Semantic Embeddings
# ✅ GOOD: Enable for better generalization
rl_manager = RLManager(
tool_names=tools,
use_embeddings=True
)
# ❌ BAD: Disabling embeddings without a specific reason (worse generalization)
rl_manager = RLManager(
tool_names=tools,
use_embeddings=False
)
🚀 Complete Training Pipeline
def complete_training_pipeline(
tools,
training_data,
test_data,
output_dir="rl_training"
):
"""
Complete end-to-end training pipeline.
Returns trained RLManager and evaluation results.
"""
    import os
    import json
    from datetime import datetime
    import matplotlib.pyplot as plt
# Create output directory
os.makedirs(output_dir, exist_ok=True)
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
print("="*60)
print("RL TRAINING PIPELINE")
print("="*60)
# 1. Initialize RL Manager
print("\n[1/6] Initializing RL Manager...")
rl_manager = RLManager(
tool_names=[t.name for t in tools],
q_table_path=f"{output_dir}/qtable_{timestamp}.pkl",
exploration_strategy=ExplorationStrategy.EPSILON_DECAY,
exploration_rate=0.3,
epsilon_decay_rate=0.98,
learning_rate=0.1,
use_embeddings=True
)
print(f"✓ Initialized with {len(tools)} tools")
# 2. Train with learning curves
print("\n[2/6] Training...")
metrics = train_with_learning_curve(
rl_manager,
training_data,
test_data,
epochs=20
)
print("✓ Training complete")
# 3. Evaluate
print("\n[3/6] Final Evaluation...")
eval_results = evaluate_rl_agent(rl_manager, test_data, verbose=False)
print(f"✓ Test Accuracy: {eval_results['accuracy']:.2%}")
# 4. Save plots
print("\n[4/6] Generating visualizations...")
# [Plot code from learning curves section]
plt.savefig(f"{output_dir}/learning_curves_{timestamp}.png")
print(f"✓ Saved learning curves")
# 5. Export results
print("\n[5/6] Exporting results...")
results = {
"timestamp": timestamp,
"configuration": {
"tools": [t.name for t in tools],
"exploration_strategy": rl_manager.exploration_strategy.value,
"initial_exploration_rate": 0.3
},
"training_metrics": {
"train_accuracies": metrics['train_accuracies'],
"test_accuracies": metrics['test_accuracies']
},
"final_evaluation": {
"accuracy": eval_results['accuracy'],
"tool_accuracies": eval_results['tool_accuracies']
},
"statistics": rl_manager.get_statistics()
}
with open(f"{output_dir}/results_{timestamp}.json", "w") as f:
json.dump(results, f, indent=2)
rl_manager.export_readable(f"{output_dir}/qtable_readable_{timestamp}.txt")
print(f"✓ Results exported to {output_dir}/")
# 6. Summary
print("\n[6/6] Training Summary")
print("="*60)
print(f"Final Test Accuracy: {eval_results['accuracy']:.2%}")
print(f"Total States Learned: {len(rl_manager.q_table)}")
print(f"Total Training Samples: {len(training_data)}")
print(f"Total Test Samples: {len(test_data)}")
top_tools = rl_manager.get_top_performing_tools(3)
print(f"\nTop 3 Tools:")
for tool, avg_q in top_tools:
print(f" {tool}: {avg_q:.3f}")
print(f"\nAll outputs saved to: {output_dir}/")
print("="*60)
return rl_manager, results
# Usage
trained_rl, results = complete_training_pipeline(
tools=tools,
training_data=train_data,
test_data=test_data,
output_dir="rl_training_output"
)
🎓 Summary
Effective RL training in Azcore requires:
- Structured Training Loops: Systematic tool selection, execution, and Q-value updates
- Proper Evaluation: Separate train/test splits, cross-validation
- Hyperparameter Tuning: Grid search or random search for optimal parameters
- Progress Monitoring: Track learning curves, Q-values, exploration rates
- Advanced Techniques: Early stopping, learning rate scheduling
- Best Practices: High initial exploration, semantic embeddings, regular checkpoints
With these techniques, you can train robust, high-performing RL agents that continuously improve through experience.