prvng_platform/crates/rag/examples/storage_integration.rs

146 lines
5.1 KiB
Rust
Raw Normal View History

//! Complete RAG workflow: Chunking -> Embedding -> Storage -> Retrieval
//!
//! This example demonstrates the full RAG pipeline including document
//! persistence in SurrealDB and similarity-based retrieval.
//!
//! Run with: `cargo run --example storage_integration`
#![allow(clippy::field_reassign_with_default)]
use provisioning_rag::{
config::{EmbeddingConfig, IngestionConfig, VectorDbConfig},
db::DbConnection,
embeddings::EmbeddingEngine,
ingestion::DocumentIngester,
};
/// Runs the storage integration demo end to end, printing progress to stdout.
///
/// The steps, in order: connect to the database and initialize its schema,
/// build the embedding engine and document ingester, chunk a sample markdown
/// document, embed and store the chunks, record a deployment event, print
/// statistics, run a similarity search seeded with the first embedded
/// document, and finally fetch that same document back by ID.
///
/// Steps 8 and 9 depend on at least one embedded document existing. When none
/// exists, or when the stored document cannot be fetched back, the step is
/// reported explicitly instead of being skipped silently.
///
/// # Errors
///
/// Returns the first error propagated by any `?` below (database connection,
/// schema initialization, engine construction, chunking, embedding, storage,
/// statistics, search, or lookup).
#[tokio::main]
async fn main() -> Result<(), Box<dyn std::error::Error>> {
    // Initialize logging
    tracing_subscriber::fmt::init();

    println!("🚀 Provisioning RAG System - Storage Integration Example\n");

    // Step 1: Setup database connection
    println!("Step 1: Connecting to SurrealDB...");
    let mut db_config = VectorDbConfig::default();
    db_config.url = "memory".to_string(); // Use in-memory for demo
    db_config.database = "rag_demo".to_string();

    let db = DbConnection::new(db_config).await?;
    db.initialize_schema().await?;
    println!("✓ Connected and schema initialized\n");

    // Step 2: Create embedding engine (local for demo)
    println!("Step 2: Creating embedding engine...");
    let mut embedding_config = EmbeddingConfig::default();
    embedding_config.provider = "local".to_string();

    let embedding_engine = EmbeddingEngine::new(embedding_config)?;
    println!("✓ Embedding engine ready (local mode)\n");

    // Step 3: Create document ingester
    println!("Step 3: Creating document ingester...");
    let ingestion_config = IngestionConfig::default();
    let ingester = DocumentIngester::new(ingestion_config, embedding_engine);
    println!("✓ Document ingester created\n");

    // Step 4: Prepare sample documents
    println!("Step 4: Processing sample documents...");
    // The raw string body stays at column 0 so the markdown headings are not
    // prefixed with indentation.
    let sample_markdown = r#"
# Provisioning Platform Architecture
## Overview
The provisioning platform provides unified infrastructure automation across multiple cloud providers.
## Key Components
### Orchestrator
Central task coordination service that manages all infrastructure operations.
### Control Center
Web-based management interface for monitoring and control.
### MCP Server
Model Context Protocol integration for AI-powered operations.
## Security
Enterprise-grade security with JWT authentication, Cedar authorization, and MFA support.
"#;

    // Chunk the document
    let chunks = ingester
        .chunking_engine()
        .chunk_markdown(sample_markdown, "architecture.md")?;
    println!("✓ Document chunked into {} chunks\n", chunks.len());

    // Step 5: Embed and store documents
    println!("Step 5: Embedding and storing documents...");
    let embedded_docs = ingester.embedding_engine().embed_batch(&chunks).await?;
    let stored_count = db.store_documents(&embedded_docs).await?;
    println!("✓ Stored {} documents\n", stored_count);

    // Step 6: Store deployment event
    println!("Step 6: Recording deployment event...");
    db.store_deployment_event(
        "librecloud",      // workspace
        "aws-prod",        // infrastructure
        "taskserv_create", // event_type
        "success",         // status
        "kubernetes",      // resource_name
        "aws",             // provider
    )
    .await?;
    println!("✓ Deployment event recorded\n");

    // Step 7: Get system statistics
    println!("Step 7: Retrieving system statistics...");
    let stats = db.get_statistics().await?;
    println!(
        "📊 RAG Statistics:\n Documents: {}\n Deployments: {}\n",
        stats.total_documents, stats.total_deployments
    );

    // Step 8: Demonstrate retrieval (mock similarity search)
    println!("Step 8: Performing similarity search...");
    if let Some(first_doc) = embedded_docs.first() {
        // Search for documents similar to the first document
        let similar_docs = db.search_similar(&first_doc.embedding, 5, 0.5).await?;
        println!("✓ Found {} similar documents", similar_docs.len());

        if let Some(result) = similar_docs.first() {
            println!(
                "\n📄 Top Result:\n ID: {}\n Type: {}\n Size: {} chars\n",
                result.id,
                result.doc_type,
                result.content.len()
            );
        }
    } else {
        // Without a seed embedding there is nothing to search with; say so
        // rather than leaving the step header with no outcome.
        println!("⚠ No embedded documents available; similarity search skipped");
    }

    // Step 9: Demonstrate retrieval by ID
    println!("\nStep 9: Retrieving document by ID...");
    if let Some(first_doc) = embedded_docs.first() {
        if let Some(retrieved) = db.get_document(&first_doc.id).await? {
            println!(
                "✓ Retrieved: {} ({} bytes)",
                retrieved.id,
                retrieved.content.len()
            );
        } else {
            // The document was passed to `store_documents` in step 5, so a
            // miss here deserves a visible message.
            println!("⚠ Stored document could not be retrieved by ID");
        }
    } else {
        println!("⚠ No embedded documents available; retrieval by ID skipped");
    }

    println!("\n✅ Complete integration example finished!\n");
    println!("Key features demonstrated:");
    println!(" • Document chunking (heading-aware)");
    println!(" • Embedding generation");
    println!(" • Batch document storage");
    println!(" • Deployment event tracking");
    println!(" • Vector similarity search");
    println!(" • Document retrieval by ID");
    println!(" • System statistics");

    Ok(())
}