Implement intelligent agent learning from Knowledge Graph execution history with per-task-type expertise tracking, recency bias, and learning curves.

## Phase 5.3 Implementation

### Learning Infrastructure (✅ Complete)

- LearningProfileService with per-task-type expertise metrics
- TaskTypeExpertise model tracking success_rate, confidence, learning curves
- Recency bias weighting: recent 7 days weighted 3x higher (exponential decay)
- Confidence scoring prevents overfitting: min(1.0, executions / 20)
- Learning curves computed from daily execution windows

### Agent Scoring Service (✅ Complete)

- Unified AgentScore combining SwarmCoordinator + learning profiles
- Scoring formula: 0.3*base + 0.5*expertise + 0.2*confidence
- Rank agents by combined score for intelligent assignment
- Support for recency-biased scoring (recent_success_rate)
- Methods: rank_agents, select_best, rank_agents_with_recency

### KG Integration (✅ Complete)

- KGPersistence::get_executions_for_task_type() - query by agent + task type
- KGPersistence::get_agent_executions() - all executions for agent
- Coordinator::load_learning_profile_from_kg() - core KG→Learning integration
- Coordinator::load_all_learning_profiles() - batch load for multiple agents
- Convert PersistedExecution → ExecutionData for learning calculations

### Agent Assignment Integration (✅ Complete)

- AgentCoordinator uses learning profiles for task assignment
- extract_task_type() infers task type from title/description
- assign_task() scores candidates using AgentScoringService
- Fallback to load-based selection if no learning data available
- Learning profiles stored in coordinator.learning_profiles RwLock

### Profile Adapter Enhancements (✅ Complete)

- create_learning_profile() - initialize empty profiles
- add_task_type_expertise() - set task-type expertise
- update_profile_with_learning() - update swarm profiles from learning

## Files Modified

### vapora-knowledge-graph/src/persistence.rs (+30 lines)

- get_executions_for_task_type(agent_id, task_type, limit)
- get_agent_executions(agent_id, limit)

### vapora-agents/src/coordinator.rs (+100 lines)

- load_learning_profile_from_kg() - core KG integration method
- load_all_learning_profiles() - batch loading for agents
- assign_task() already uses learning-based scoring via AgentScoringService

### Existing Complete Implementation

- vapora-knowledge-graph/src/learning.rs - calculation functions
- vapora-agents/src/learning_profile.rs - data structures and expertise
- vapora-agents/src/scoring.rs - unified scoring service
- vapora-agents/src/profile_adapter.rs - adapter methods

## Tests Passing

- learning_profile: 7 tests ✅
- scoring: 5 tests ✅
- profile_adapter: 6 tests ✅
- coordinator: learning-specific tests ✅

## Data Flow

1. Task arrives → AgentCoordinator::assign_task()
2. Extract task_type from description
3. Query KG for task-type executions (load_learning_profile_from_kg)
4. Calculate expertise with recency bias
5. Score candidates (SwarmCoordinator + learning)
6. Assign to top-scored agent
7. Execution result → KG → Update learning profiles

## Key Design Decisions

- ✅ Recency bias: 7-day half-life with 3x weight for recent performance
- ✅ Confidence scoring: min(1.0, total_executions / 20) prevents overfitting
- ✅ Hierarchical scoring: 30% base load, 50% expertise, 20% confidence
- ✅ KG query limit: 100 recent executions per task-type for performance
- ✅ Async loading: load_learning_profile_from_kg supports concurrent loads

## Next: Phase 5.4 - Cost Optimization

Ready to implement budget enforcement and cost-aware provider selection.
301 lines
9.4 KiB
Rust
301 lines
9.4 KiB
Rust
use crate::error::{AnalyticsError, Result};
|
|
use crate::events::*;
|
|
use chrono::{Duration, Utc};
|
|
use dashmap::DashMap;
|
|
use std::collections::VecDeque;
|
|
use std::sync::Arc;
|
|
use tokio::sync::mpsc;
|
|
use tracing::debug;
|
|
|
|
/// Streaming pipeline for event processing
|
|
#[derive(Clone)]
|
|
pub struct EventPipeline {
|
|
event_tx: mpsc::UnboundedSender<AgentEvent>,
|
|
event_rx: Arc<tokio::sync::Mutex<mpsc::UnboundedReceiver<AgentEvent>>>,
|
|
alerts_tx: mpsc::UnboundedSender<Alert>,
|
|
time_windows: Arc<DashMap<String, VecDeque<AgentEvent>>>,
|
|
}
|
|
|
|
impl EventPipeline {
|
|
/// Create new event pipeline
|
|
pub fn new(external_alert_tx: mpsc::UnboundedSender<Alert>) -> (Self, mpsc::UnboundedSender<Alert>) {
|
|
let (event_tx, event_rx) = mpsc::unbounded_channel();
|
|
|
|
let pipeline = Self {
|
|
event_tx,
|
|
event_rx: Arc::new(tokio::sync::Mutex::new(event_rx)),
|
|
alerts_tx: external_alert_tx.clone(),
|
|
time_windows: Arc::new(DashMap::new()),
|
|
};
|
|
|
|
(pipeline, external_alert_tx)
|
|
}
|
|
|
|
/// Emit an event into the pipeline
|
|
pub async fn emit_event(&self, event: AgentEvent) -> Result<()> {
|
|
self.event_tx.send(event).map_err(|e| {
|
|
AnalyticsError::ChannelError(format!("Failed to emit event: {}", e))
|
|
})?;
|
|
Ok(())
|
|
}
|
|
|
|
/// Start processing events from the pipeline
|
|
pub async fn run(&self, window_duration_secs: u64) -> Result<()> {
|
|
let mut rx = self.event_rx.lock().await;
|
|
let time_windows = self.time_windows.clone();
|
|
let alerts_tx = self.alerts_tx.clone();
|
|
|
|
while let Some(event) = rx.recv().await {
|
|
debug!("Processing event: {:?}", event.event_type);
|
|
|
|
// Store in time window
|
|
let window_key = format!(
|
|
"{}_{}",
|
|
event.event_type.as_str(),
|
|
event.timestamp.timestamp() / (window_duration_secs as i64)
|
|
);
|
|
|
|
time_windows
|
|
.entry(window_key.clone())
|
|
.or_insert_with(VecDeque::new)
|
|
.push_back(event.clone());
|
|
|
|
// Check for alerts
|
|
if event.event_type.is_error() {
|
|
let alert = Alert {
|
|
id: uuid::Uuid::new_v4().to_string(),
|
|
level: AlertLevel::Warning,
|
|
message: format!(
|
|
"Error in agent {}: {}",
|
|
event.agent_id,
|
|
event.error.clone().unwrap_or_default()
|
|
),
|
|
affected_agents: vec![event.agent_id.clone()],
|
|
affected_tasks: event.task_id.clone().into_iter().collect(),
|
|
triggered_at: Utc::now(),
|
|
resolution: None,
|
|
};
|
|
|
|
alerts_tx.send(alert).ok();
|
|
}
|
|
|
|
// Check for performance degradation
|
|
if let Some(duration) = event.duration_ms {
|
|
if duration > 30_000 {
|
|
let alert = Alert {
|
|
id: uuid::Uuid::new_v4().to_string(),
|
|
level: AlertLevel::Warning,
|
|
message: format!(
|
|
"Slow task execution: {} took {}ms",
|
|
event.agent_id, duration
|
|
),
|
|
affected_agents: vec![event.agent_id.clone()],
|
|
affected_tasks: event.task_id.clone().into_iter().collect(),
|
|
triggered_at: Utc::now(),
|
|
resolution: Some("Consider scaling or optimization".to_string()),
|
|
};
|
|
|
|
alerts_tx.send(alert).ok();
|
|
}
|
|
}
|
|
}
|
|
|
|
Ok(())
|
|
}
|
|
|
|
/// Get aggregated statistics for a time window
|
|
pub async fn get_window_stats(
|
|
&self,
|
|
event_type: EventType,
|
|
window_secs: u64,
|
|
) -> Result<EventAggregation> {
|
|
let now = Utc::now();
|
|
let window_start = now - Duration::seconds(window_secs as i64);
|
|
|
|
let mut total_events = 0u64;
|
|
let mut agents = std::collections::HashSet::new();
|
|
let mut durations = Vec::new();
|
|
let mut error_count = 0u64;
|
|
let mut success_count = 0u64;
|
|
|
|
for entry in self.time_windows.iter() {
|
|
for event in entry.value().iter() {
|
|
if event.event_type == event_type && event.timestamp > window_start {
|
|
total_events += 1;
|
|
agents.insert(event.agent_id.clone());
|
|
|
|
if let Some(duration) = event.duration_ms {
|
|
durations.push(duration);
|
|
}
|
|
|
|
if event.event_type.is_error() {
|
|
error_count += 1;
|
|
} else if event.event_type.is_success() {
|
|
success_count += 1;
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
let avg_duration = if !durations.is_empty() {
|
|
durations.iter().sum::<u64>() as f64 / durations.len() as f64
|
|
} else {
|
|
0.0
|
|
};
|
|
|
|
Ok(EventAggregation {
|
|
window_start,
|
|
window_end: now,
|
|
event_type,
|
|
total_events,
|
|
distinct_agents: agents.len() as u32,
|
|
avg_duration_ms: avg_duration,
|
|
error_count,
|
|
success_count,
|
|
})
|
|
}
|
|
|
|
/// Filter events by criteria
|
|
pub fn filter_events<F>(&self, predicate: F) -> Vec<AgentEvent>
|
|
where
|
|
F: Fn(&AgentEvent) -> bool,
|
|
{
|
|
self.time_windows
|
|
.iter()
|
|
.flat_map(|entry| {
|
|
entry
|
|
.value()
|
|
.iter()
|
|
.filter(|event| predicate(event))
|
|
.cloned()
|
|
.collect::<Vec<_>>()
|
|
})
|
|
.collect()
|
|
}
|
|
|
|
/// Get error rate in last N seconds
|
|
pub async fn get_error_rate(&self, window_secs: u64) -> Result<f64> {
|
|
let now = Utc::now();
|
|
let window_start = now - Duration::seconds(window_secs as i64);
|
|
|
|
let mut total = 0u64;
|
|
let mut errors = 0u64;
|
|
|
|
for entry in self.time_windows.iter() {
|
|
for event in entry.value().iter() {
|
|
if event.timestamp > window_start {
|
|
total += 1;
|
|
if event.event_type.is_error() {
|
|
errors += 1;
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
if total == 0 {
|
|
Ok(0.0)
|
|
} else {
|
|
Ok(errors as f64 / total as f64)
|
|
}
|
|
}
|
|
|
|
/// Get throughput (events per second)
|
|
pub async fn get_throughput(&self, window_secs: u64) -> Result<f64> {
|
|
let now = Utc::now();
|
|
let window_start = now - Duration::seconds(window_secs as i64);
|
|
|
|
let mut count = 0u64;
|
|
|
|
for entry in self.time_windows.iter() {
|
|
for event in entry.value().iter() {
|
|
if event.timestamp > window_start {
|
|
count += 1;
|
|
}
|
|
}
|
|
}
|
|
|
|
Ok(count as f64 / window_secs as f64)
|
|
}
|
|
|
|
/// Get top N agents by task completion
|
|
pub async fn get_top_agents(&self, limit: usize) -> Result<Vec<(String, u64)>> {
|
|
let mut agent_counts: std::collections::HashMap<String, u64> =
|
|
std::collections::HashMap::new();
|
|
|
|
for entry in self.time_windows.iter() {
|
|
for event in entry.value().iter() {
|
|
if event.event_type.is_success() {
|
|
*agent_counts.entry(event.agent_id.clone()).or_insert(0) += 1;
|
|
}
|
|
}
|
|
}
|
|
|
|
let mut agents: Vec<_> = agent_counts.into_iter().collect();
|
|
agents.sort_by(|a, b| b.1.cmp(&a.1));
|
|
|
|
Ok(agents.into_iter().take(limit).collect())
|
|
}
|
|
}
|
|
|
|
#[cfg(test)]
mod tests {
    use super::*;

    /// A freshly built pipeline has not published any alert.
    #[tokio::test]
    async fn test_pipeline_creation() {
        let (alert_tx, alert_rx) = mpsc::unbounded_channel();
        let (_pipeline, _alerts) = EventPipeline::new(alert_tx);
        assert!(alert_rx.is_empty());
    }

    /// Emitting into a live pipeline succeeds.
    #[tokio::test]
    async fn test_emit_event() {
        let (alert_tx, _alert_rx) = mpsc::unbounded_channel();
        let (pipeline, _alerts) = EventPipeline::new(alert_tx);

        let event = AgentEvent::new_task_completed(
            "agent-1".to_string(),
            "task-1".to_string(),
            1000,
            100,
            50,
        );

        assert!(pipeline.emit_event(event).await.is_ok());
    }

    /// After processing one completed and one failed task, filtering on error
    /// events yields exactly the failed one.
    #[tokio::test]
    async fn test_filter_events() {
        let (alert_tx, _alert_rx) = mpsc::unbounded_channel();
        let (pipeline, _alerts) = EventPipeline::new(alert_tx);

        // Spawn pipeline processor in background
        let pipeline_clone = pipeline.clone();
        tokio::spawn(async move {
            pipeline_clone.run(60).await.ok();
        });

        let event1 = AgentEvent::new_task_completed(
            "agent-1".to_string(),
            "task-1".to_string(),
            1000,
            100,
            50,
        );
        let event2 = AgentEvent::new_task_failed(
            "agent-2".to_string(),
            "task-2".to_string(),
            "error".to_string(),
        );

        assert!(pipeline.emit_event(event1).await.is_ok());
        assert!(pipeline.emit_event(event2).await.is_ok());

        // Poll with an upper bound instead of sleeping for a fixed time: the
        // failed event is emitted last, so once it is visible both events have
        // been processed, however slowly the background task gets scheduled.
        let filtered = tokio::time::timeout(tokio::time::Duration::from_secs(2), async {
            loop {
                let matched = pipeline.filter_events(|e| e.event_type.is_error());
                if !matched.is_empty() {
                    break matched;
                }
                tokio::time::sleep(tokio::time::Duration::from_millis(5)).await;
            }
        })
        .await
        .expect("pipeline should process the emitted events within 2s");

        assert_eq!(filtered.len(), 1);
    }
}
|