prvng_platform/crates/rag/examples/basic_ingestion.rs

116 lines
3.7 KiB
Rust
Raw Normal View History

//! Basic RAG system example: Document ingestion and embedding
//!
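//! This example demonstrates, in the order the code performs them:
//! 1. Creating an embedding engine (using the "local" provider)
//! 2. Creating a document ingester around that engine
//! 3. Chunking a sample markdown document with a chunking engine
//! 4. Embedding the resulting chunks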
//!
//! Run with: `cargo run --example basic_ingestion`
#![allow(clippy::field_reassign_with_default)]
use provisioning_rag::{
chunking::ChunkingEngine,
config::{EmbeddingConfig, IngestionConfig},
embeddings::EmbeddingEngine,
ingestion::DocumentIngester,
};
/// Runs the ingestion walkthrough end to end: builds an embedding engine and a
/// document ingester, chunks an inline markdown sample, embeds the chunks, and
/// prints a summary of every stage to stdout.
///
/// # Errors
///
/// Propagates any error returned by `EmbeddingEngine::new`,
/// `ChunkingEngine::chunk_markdown`, or `embed_batch`.
#[tokio::main]
async fn main() -> Result<(), Box<dyn std::error::Error>> {
    // Initialize tracing for logging
    tracing_subscriber::fmt::init();

    println!("🚀 Provisioning RAG System - Basic Ingestion Example\n");

    // Step 1: Create embedding engine
    println!("Step 1: Creating embedding engine...");
    let mut embedding_config = EmbeddingConfig::default();
    embedding_config.provider = "local".to_string(); // Use local to avoid API costs
    let embedding_engine = EmbeddingEngine::new(embedding_config)?;
    println!("✓ Embedding engine created (local model)\n");

    // Step 2: Create document ingester (takes ownership of the embedding engine)
    println!("Step 2: Creating document ingester...");
    let ingestion_config = IngestionConfig::default();
    let ingester = DocumentIngester::new(ingestion_config, embedding_engine);
    println!("✓ Document ingester created\n");

    // Step 3: Chunk a sample markdown document
    println!("Step 3: Chunking sample markdown document...");
    let markdown_content = r#"
# Provisioning Platform
## Overview
The provisioning platform is a unified infrastructure automation system
built with Rust and Nushell for managing cloud resources across multiple providers.
### Key Features
- Multi-cloud support (AWS, UpCloud, local)
- KCL configuration language integration
- Kubernetes cluster management
- Comprehensive security system
- REST API and MCP integration
## Architecture
### Core Components
1. **Orchestrator**: Central task coordination service
2. **Control Center**: Web-based management interface
3. **MCP Server**: Model Context Protocol integration
4. **Platform Services**: Additional ecosystem services
### Data Flow
Documents flow through:
1. Configuration (KCL schemas)
2. Validation (type checking)
3. Execution (provider operations)
4. Monitoring (health checks)
"#;
    let chunking_engine = ChunkingEngine::new(1024, 100);
    let chunks = chunking_engine.chunk_markdown(markdown_content, "README.md")?;
    println!("✓ Document chunked into {} chunks\n", chunks.len());

    for (i, chunk) in chunks.iter().enumerate() {
        println!("Chunk {}:", i + 1);
        println!(" ID: {}", chunk.id);
        // Count characters rather than bytes so the figure matches the
        // "chars" label even when the content is not pure ASCII.
        println!(" Size: {} chars", chunk.content.chars().count());
        if let Some(heading) = chunk.metadata.get("heading_path") {
            println!(" Path: {}", heading);
        }
        println!();
    }

    // Step 4: Embed the chunks, reusing the engine owned by the ingester
    println!("Step 4: Embedding chunks...");
    let embedded_docs = ingester.embedding_engine().embed_batch(&chunks).await?;
    println!("✓ Embedded {} documents\n", embedded_docs.len());

    for (i, doc) in embedded_docs.iter().enumerate() {
        println!("Embedded document {}:", i + 1);
        println!(" ID: {}", doc.id);
        println!(" Vector dimension: {}", doc.embedding.len());
        // Build the preview from the first 50 characters. Slicing by byte
        // index (`content[..50]`) panics when byte 50 falls inside a
        // multi-byte UTF-8 sequence.
        let preview: String = doc.content.chars().take(50).collect();
        println!(" Content preview: {}...", preview);
        println!();
    }

    println!("✅ Example completed successfully!");
    println!("\nNext steps:");
    println!("1. Store embedded documents in SurrealDB");
    println!("2. Implement RAG agent for semantic search");
    println!("3. Add LLM integration for response generation");

    Ok(())
}