116 lines
3.7 KiB
Rust
116 lines
3.7 KiB
Rust
|
|
//! Basic RAG system example: Document ingestion and embedding
|
||
|
|
//!
|
||
|
|
//! This example demonstrates:
|
||
|
|
//! 1. Creating a chunking engine
|
||
|
|
//! 2. Creating an embedding engine
|
||
|
|
//! 3. Creating a document ingester
|
||
|
|
//! 4. Chunking and embedding documents
|
||
|
|
//!
|
||
|
|
//! Run with: `cargo run --example basic_ingestion`
|
||
|
|
|
||
|
|
#![allow(clippy::field_reassign_with_default)]
|
||
|
|
|
||
|
|
use provisioning_rag::{
|
||
|
|
chunking::ChunkingEngine,
|
||
|
|
config::{EmbeddingConfig, IngestionConfig},
|
||
|
|
embeddings::EmbeddingEngine,
|
||
|
|
ingestion::DocumentIngester,
|
||
|
|
};
|
||
|
|
|
||
|
|
#[tokio::main]
|
||
|
|
async fn main() -> Result<(), Box<dyn std::error::Error>> {
|
||
|
|
// Initialize tracing for logging
|
||
|
|
tracing_subscriber::fmt::init();
|
||
|
|
|
||
|
|
println!("🚀 Provisioning RAG System - Basic Ingestion Example\n");
|
||
|
|
|
||
|
|
// Step 1: Create embedding engine
|
||
|
|
println!("Step 1: Creating embedding engine...");
|
||
|
|
let mut embedding_config = EmbeddingConfig::default();
|
||
|
|
embedding_config.provider = "local".to_string(); // Use local to avoid API costs
|
||
|
|
let embedding_engine = EmbeddingEngine::new(embedding_config)?;
|
||
|
|
println!("✓ Embedding engine created (local model)\n");
|
||
|
|
|
||
|
|
// Step 2: Create document ingester
|
||
|
|
println!("Step 2: Creating document ingester...");
|
||
|
|
let ingestion_config = IngestionConfig::default();
|
||
|
|
let ingester = DocumentIngester::new(ingestion_config, embedding_engine);
|
||
|
|
println!("✓ Document ingester created\n");
|
||
|
|
|
||
|
|
// Step 3: Chunk a sample markdown document
|
||
|
|
println!("Step 3: Chunking sample markdown document...");
|
||
|
|
let markdown_content = r#"
|
||
|
|
# Provisioning Platform
|
||
|
|
|
||
|
|
## Overview
|
||
|
|
|
||
|
|
The provisioning platform is a unified infrastructure automation system
|
||
|
|
built with Rust and Nushell for managing cloud resources across multiple providers.
|
||
|
|
|
||
|
|
### Key Features
|
||
|
|
|
||
|
|
- Multi-cloud support (AWS, UpCloud, local)
|
||
|
|
- KCL configuration language integration
|
||
|
|
- Kubernetes cluster management
|
||
|
|
- Comprehensive security system
|
||
|
|
- REST API and MCP integration
|
||
|
|
|
||
|
|
## Architecture
|
||
|
|
|
||
|
|
### Core Components
|
||
|
|
|
||
|
|
1. **Orchestrator**: Central task coordination service
|
||
|
|
2. **Control Center**: Web-based management interface
|
||
|
|
3. **MCP Server**: Model Context Protocol integration
|
||
|
|
4. **Platform Services**: Additional ecosystem services
|
||
|
|
|
||
|
|
### Data Flow
|
||
|
|
|
||
|
|
Documents flow through:
|
||
|
|
1. Configuration (KCL schemas)
|
||
|
|
2. Validation (type checking)
|
||
|
|
3. Execution (provider operations)
|
||
|
|
4. Monitoring (health checks)
|
||
|
|
"#;
|
||
|
|
|
||
|
|
let chunking_engine = ChunkingEngine::new(1024, 100);
|
||
|
|
let chunks = chunking_engine.chunk_markdown(markdown_content, "README.md")?;
|
||
|
|
|
||
|
|
println!("✓ Document chunked into {} chunks\n", chunks.len());
|
||
|
|
|
||
|
|
for (i, chunk) in chunks.iter().enumerate() {
|
||
|
|
println!("Chunk {}:", i + 1);
|
||
|
|
println!(" ID: {}", chunk.id);
|
||
|
|
println!(" Size: {} chars", chunk.content.len());
|
||
|
|
if let Some(heading) = chunk.metadata.get("heading_path") {
|
||
|
|
println!(" Path: {}", heading);
|
||
|
|
}
|
||
|
|
println!();
|
||
|
|
}
|
||
|
|
|
||
|
|
// Step 4: Embed the chunks
|
||
|
|
println!("Step 4: Embedding chunks...");
|
||
|
|
let embedded_docs = ingester.embedding_engine().embed_batch(&chunks).await?;
|
||
|
|
|
||
|
|
println!("✓ Embedded {} documents\n", embedded_docs.len());
|
||
|
|
|
||
|
|
for (i, doc) in embedded_docs.iter().enumerate() {
|
||
|
|
println!("Embedded document {}:", i + 1);
|
||
|
|
println!(" ID: {}", doc.id);
|
||
|
|
println!(" Vector dimension: {}", doc.embedding.len());
|
||
|
|
println!(
|
||
|
|
" Content preview: {}...",
|
||
|
|
&doc.content[..doc.content.len().min(50)]
|
||
|
|
);
|
||
|
|
println!();
|
||
|
|
}
|
||
|
|
|
||
|
|
println!("✅ Example completed successfully!");
|
||
|
|
println!("\nNext steps:");
|
||
|
|
println!("1. Store embedded documents in SurrealDB");
|
||
|
|
println!("2. Implement RAG agent for semantic search");
|
||
|
|
println!("3. Add LLM integration for response generation");
|
||
|
|
|
||
|
|
Ok(())
|
||
|
|
}
|