//! Complete RAG workflow: Chunking -> Embedding -> Storage -> Retrieval
//!
//! This example demonstrates the full RAG pipeline including document
//! persistence in SurrealDB and similarity-based retrieval.
//!
//! Run with: `cargo run --example storage_integration`

#![allow(clippy::field_reassign_with_default)]
|
||
|
|
|
||
|
|
use provisioning_rag::{
|
||
|
|
config::{EmbeddingConfig, IngestionConfig, VectorDbConfig},
|
||
|
|
db::DbConnection,
|
||
|
|
embeddings::EmbeddingEngine,
|
||
|
|
ingestion::DocumentIngester,
|
||
|
|
};
|
||
|
|
|
||
|
|
#[tokio::main]
|
||
|
|
async fn main() -> Result<(), Box<dyn std::error::Error>> {
|
||
|
|
// Initialize logging
|
||
|
|
tracing_subscriber::fmt::init();
|
||
|
|
|
||
|
|
println!("🚀 Provisioning RAG System - Storage Integration Example\n");
|
||
|
|
|
||
|
|
// Step 1: Setup database connection
|
||
|
|
println!("Step 1: Connecting to SurrealDB...");
|
||
|
|
let mut db_config = VectorDbConfig::default();
|
||
|
|
db_config.url = "memory".to_string(); // Use in-memory for demo
|
||
|
|
db_config.database = "rag_demo".to_string();
|
||
|
|
|
||
|
|
let db = DbConnection::new(db_config).await?;
|
||
|
|
db.initialize_schema().await?;
|
||
|
|
println!("✓ Connected and schema initialized\n");
|
||
|
|
|
||
|
|
// Step 2: Create embedding engine (local for demo)
|
||
|
|
println!("Step 2: Creating embedding engine...");
|
||
|
|
let mut embedding_config = EmbeddingConfig::default();
|
||
|
|
embedding_config.provider = "local".to_string();
|
||
|
|
let embedding_engine = EmbeddingEngine::new(embedding_config)?;
|
||
|
|
println!("✓ Embedding engine ready (local mode)\n");
|
||
|
|
|
||
|
|
// Step 3: Create document ingester
|
||
|
|
println!("Step 3: Creating document ingester...");
|
||
|
|
let ingestion_config = IngestionConfig::default();
|
||
|
|
let ingester = DocumentIngester::new(ingestion_config, embedding_engine);
|
||
|
|
println!("✓ Document ingester created\n");
|
||
|
|
|
||
|
|
// Step 4: Prepare sample documents
|
||
|
|
println!("Step 4: Processing sample documents...");
|
||
|
|
let sample_markdown = r#"
|
||
|
|
# Provisioning Platform Architecture
|
||
|
|
|
||
|
|
## Overview
|
||
|
|
The provisioning platform provides unified infrastructure automation across multiple cloud providers.
|
||
|
|
|
||
|
|
## Key Components
|
||
|
|
|
||
|
|
### Orchestrator
|
||
|
|
Central task coordination service that manages all infrastructure operations.
|
||
|
|
|
||
|
|
### Control Center
|
||
|
|
Web-based management interface for monitoring and control.
|
||
|
|
|
||
|
|
### MCP Server
|
||
|
|
Model Context Protocol integration for AI-powered operations.
|
||
|
|
|
||
|
|
## Security
|
||
|
|
Enterprise-grade security with JWT authentication, Cedar authorization, and MFA support.
|
||
|
|
"#;
|
||
|
|
|
||
|
|
// Chunk the document
|
||
|
|
let chunks = ingester
|
||
|
|
.chunking_engine()
|
||
|
|
.chunk_markdown(sample_markdown, "architecture.md")?;
|
||
|
|
|
||
|
|
println!("✓ Document chunked into {} chunks\n", chunks.len());
|
||
|
|
|
||
|
|
// Step 5: Embed and store documents
|
||
|
|
println!("Step 5: Embedding and storing documents...");
|
||
|
|
let embedded_docs = ingester.embedding_engine().embed_batch(&chunks).await?;
|
||
|
|
let stored_count = db.store_documents(&embedded_docs).await?;
|
||
|
|
println!("✓ Stored {} documents\n", stored_count);
|
||
|
|
|
||
|
|
// Step 6: Store deployment event
|
||
|
|
println!("Step 6: Recording deployment event...");
|
||
|
|
db.store_deployment_event(
|
||
|
|
"librecloud", // workspace
|
||
|
|
"aws-prod", // infrastructure
|
||
|
|
"taskserv_create", // event_type
|
||
|
|
"success", // status
|
||
|
|
"kubernetes", // resource_name
|
||
|
|
"aws", // provider
|
||
|
|
)
|
||
|
|
.await?;
|
||
|
|
println!("✓ Deployment event recorded\n");
|
||
|
|
|
||
|
|
// Step 7: Get system statistics
|
||
|
|
println!("Step 7: Retrieving system statistics...");
|
||
|
|
let stats = db.get_statistics().await?;
|
||
|
|
println!(
|
||
|
|
"📊 RAG Statistics:\n Documents: {}\n Deployments: {}\n",
|
||
|
|
stats.total_documents, stats.total_deployments
|
||
|
|
);
|
||
|
|
|
||
|
|
// Step 8: Demonstrate retrieval (mock similarity search)
|
||
|
|
println!("Step 8: Performing similarity search...");
|
||
|
|
if let Some(first_doc) = embedded_docs.first() {
|
||
|
|
// Search for documents similar to the first document
|
||
|
|
let similar_docs = db.search_similar(&first_doc.embedding, 5, 0.5).await?;
|
||
|
|
|
||
|
|
println!("✓ Found {} similar documents", similar_docs.len());
|
||
|
|
|
||
|
|
if let Some(result) = similar_docs.first() {
|
||
|
|
println!(
|
||
|
|
"\n📄 Top Result:\n ID: {}\n Type: {}\n Size: {} chars\n",
|
||
|
|
result.id,
|
||
|
|
result.doc_type,
|
||
|
|
result.content.len()
|
||
|
|
);
|
||
|
|
}
|
||
|
|
}
|
||
|
|
|
||
|
|
// Step 9: Demonstrate retrieval by ID
|
||
|
|
println!("\nStep 9: Retrieving document by ID...");
|
||
|
|
if let Some(first_doc) = embedded_docs.first() {
|
||
|
|
if let Some(retrieved) = db.get_document(&first_doc.id).await? {
|
||
|
|
println!(
|
||
|
|
"✓ Retrieved: {} ({} bytes)",
|
||
|
|
retrieved.id,
|
||
|
|
retrieved.content.len()
|
||
|
|
);
|
||
|
|
}
|
||
|
|
}
|
||
|
|
|
||
|
|
println!("\n✅ Complete integration example finished!\n");
|
||
|
|
println!("Key features demonstrated:");
|
||
|
|
println!(" • Document chunking (heading-aware)");
|
||
|
|
println!(" • Embedding generation");
|
||
|
|
println!(" • Batch document storage");
|
||
|
|
println!(" • Deployment event tracking");
|
||
|
|
println!(" • Vector similarity search");
|
||
|
|
println!(" • Document retrieval by ID");
|
||
|
|
println!(" • System statistics");
|
||
|
|
|
||
|
|
Ok(())
|
||
|
|
}
|