# RLM Production Setup Guide

This guide shows how to configure vapora-rlm for production use with LLM clients and embeddings.

## Prerequisites

1. **SurrealDB** running on port 8000 (a startup snippet follows this list)
2. **LLM Provider** (choose one):
   - OpenAI (cloud, requires API key)
   - Anthropic Claude (cloud, requires API key)
   - Ollama (local, free)
3. **Optional**: Docker, for the Docker sandbox tier
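
SurrealDB only needs to be reachable on port 8000 for the examples in this guide. One way to start it locally is the official Docker image or the `surreal` CLI; the flags below (root credentials, in-memory storage) are illustrative development defaults, not project requirements:

```bash
# Option A: Docker (data is discarded when the container stops)
docker run --rm -p 8000:8000 surrealdb/surrealdb:latest start --user root --pass root memory

# Option B: surreal CLI installed locally
surreal start --user root --pass root memory
```

For production, point SurrealDB at persistent storage instead of `memory`.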
## Quick Start

### Option 1: Cloud (OpenAI)

```bash
# Set API key
export OPENAI_API_KEY="sk-..."

# Run example
cargo run --example production_setup
```

### Option 2: Local (Ollama)

```bash
# Install and start Ollama (macOS via Homebrew)
brew install ollama
ollama serve

# Pull model
ollama pull llama3.2

# Run example
cargo run --example local_ollama
```

## Production Configuration

### 1. Create RLM Engine with LLM Client

```rust
use std::sync::Arc;
use vapora_llm_router::providers::OpenAIClient;
use vapora_rlm::RLMEngine;

// Setup LLM client
let llm_client = Arc::new(OpenAIClient::new(
    api_key,
    "gpt-4".to_string(),
    4096, // max_tokens
    0.7,  // temperature
    5.0,  // cost per 1M input tokens
    15.0, // cost per 1M output tokens
)?);

// Create engine with LLM
let engine = RLMEngine::with_llm_client(
    storage,
    bm25_index,
    llm_client,
    Some(config),
)?;
```
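
The snippet above assumes `api_key`, `storage`, `bm25_index`, and `config` are already in scope (`config` is built in step 2). A minimal sketch for the key itself, matching the environment variable used in the Quick Start:

```rust
use std::env;

// Read the key exported in the Quick Start; fail fast if it is missing.
let api_key = env::var("OPENAI_API_KEY")
    .expect("OPENAI_API_KEY must be set for the OpenAI client");
```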

### 2. Configure Chunking Strategy

```rust
use vapora_rlm::chunking::{ChunkingConfig, ChunkingStrategy};
use vapora_rlm::embeddings::EmbeddingConfig;
use vapora_rlm::engine::RLMEngineConfig;

let config = RLMEngineConfig {
    chunking: ChunkingConfig {
        strategy: ChunkingStrategy::Semantic, // or Fixed, Code
        chunk_size: 1000,
        overlap: 200,
    },
    embedding: Some(EmbeddingConfig::openai_small()),
    auto_rebuild_bm25: true,
    max_chunks_per_doc: 10_000,
};
```
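
The `Code` strategy mentioned in the comment (and in the Production Checklist below) targets source files; the field values in this sketch are illustrative, not tuned defaults:

```rust
// Illustrative: chunk source code rather than prose.
let code_config = ChunkingConfig {
    strategy: ChunkingStrategy::Code,
    chunk_size: 1500,
    overlap: 150,
};
```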

### 3. Configure Embeddings

```rust
use vapora_rlm::embeddings::EmbeddingConfig;

// OpenAI (1536 dimensions)
let embedding_config = EmbeddingConfig::openai_small();

// OpenAI (3072 dimensions)
let embedding_config = EmbeddingConfig::openai_large();

// Ollama (local)
let embedding_config = EmbeddingConfig::ollama("llama3.2");
```
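
For a fully local setup, the Ollama embedding config can be dropped into the same `RLMEngineConfig` shown in step 2; a sketch using defaults for everything else:

```rust
let config = RLMEngineConfig {
    embedding: Some(EmbeddingConfig::ollama("llama3.2")),
    ..Default::default()
};
```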

### 4. Use RLM in Production

```rust
// Load document
let chunk_count = engine.load_document(doc_id, content, None).await?;

// Query with hybrid search (BM25 + semantic + RRF)
let results = engine.query(doc_id, "your query", None, 5).await?;

// Dispatch to LLM for distributed reasoning
let response = engine
    .dispatch_subtask(doc_id, "Analyze this code", None, 5)
    .await?;

println!("LLM Response: {}", response.text);
println!("Tokens: {} in, {} out",
    response.total_input_tokens,
    response.total_output_tokens
);
```
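
These calls are all async, so they need an async runtime. A minimal harness sketch, assuming the `tokio` and `anyhow` crates (neither is prescribed by this guide):

```rust
// The engine construction and calls are left as comments because `storage`,
// `bm25_index`, and `config` come from steps 1-3 of this guide.
#[tokio::main]
async fn main() -> anyhow::Result<()> {
    // let engine = RLMEngine::with_llm_client(storage, bm25_index, llm_client, Some(config))?;
    // let chunk_count = engine.load_document(doc_id, content, None).await?;
    // let results = engine.query(doc_id, "your query", None, 5).await?;
    Ok(())
}
```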

## LLM Provider Options

### OpenAI

```rust
use std::sync::Arc;
use vapora_llm_router::providers::OpenAIClient;

let client = Arc::new(OpenAIClient::new(
    api_key,
    "gpt-4".to_string(),
    4096, 0.7, 5.0, 15.0,
)?);
```

**Models:**

- `gpt-4` - Most capable
- `gpt-4-turbo` - Faster, cheaper
- `gpt-3.5-turbo` - Fast, cheapest

### Anthropic Claude

```rust
use std::sync::Arc;
use vapora_llm_router::providers::ClaudeClient;

let client = Arc::new(ClaudeClient::new(
    api_key,
    "claude-3-opus-20240229".to_string(),
    4096, 0.7, 15.0, 75.0,
)?);
```

**Models:**

- `claude-3-opus` - Most capable
- `claude-3-sonnet` - Balanced
- `claude-3-haiku` - Fast, cheap

### Ollama (Local)

```rust
use std::sync::Arc;
use vapora_llm_router::providers::OllamaClient;

let client = Arc::new(OllamaClient::new(
    "http://localhost:11434".to_string(),
    "llama3.2".to_string(),
    4096, 0.7,
)?);
```

**Popular models:**

- `llama3.2` - Meta's Llama 3.2
- `mistral` - Fast, capable
- `codellama` - Code-focused
- `mixtral` - Large, powerful

## Performance Tuning

### Chunk Size Optimization

```rust
// Small chunks (500 chars) - Better precision, more chunks
let precise = ChunkingConfig {
    strategy: ChunkingStrategy::Fixed,
    chunk_size: 500,
    overlap: 100,
};

// Large chunks (2000 chars) - More context, fewer chunks
let contextual = ChunkingConfig {
    strategy: ChunkingStrategy::Fixed,
    chunk_size: 2000,
    overlap: 400,
};
```

### BM25 Index Tuning

```rust
let config = RLMEngineConfig {
    auto_rebuild_bm25: true, // Rebuild after loading
    ..Default::default()
};
```
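
When bulk-loading many documents, it can be cheaper to switch the automatic rebuild off and rebuild once at the end; only the config side is sketched here, since the manual rebuild call depends on the crate's API:

```rust
// Bulk-load mode: skip the per-document BM25 rebuild.
let bulk_config = RLMEngineConfig {
    auto_rebuild_bm25: false,
    ..Default::default()
};
```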

### Max Chunks Per Document

```rust
let config = RLMEngineConfig {
    max_chunks_per_doc: 10_000, // Safety limit
    ..Default::default()
};
```

## Production Checklist

- [ ] LLM client configured with valid API key
- [ ] Embedding provider configured
- [ ] SurrealDB schema applied: `bash tests/test_setup.sh`
- [ ] Chunking strategy selected (Semantic for prose, Code for code)
- [ ] Max chunks per doc set appropriately
- [ ] Prometheus metrics endpoint exposed
- [ ] Error handling and retries in place (see the sketch after this list)
- [ ] Cost tracking enabled (for cloud providers)
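
The retry item is deliberately open-ended; one minimal retry-with-backoff sketch around `dispatch_subtask` is shown below. The attempt count, delay, and use of `tokio::time::sleep` are illustrative assumptions, not project defaults:

```rust
use std::time::Duration;

// Retry the dispatch up to 3 times, pausing between attempts.
let mut response = None;
for attempt in 1u64..=3 {
    match engine.dispatch_subtask(doc_id, "Analyze this code", None, 5).await {
        Ok(r) => {
            response = Some(r);
            break;
        }
        Err(e) => {
            eprintln!("dispatch attempt {attempt} failed: {e}");
            tokio::time::sleep(Duration::from_secs(2 * attempt)).await;
        }
    }
}
// How to handle `response` still being `None` after three failures is an application decision.
```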

## Troubleshooting

### "No LLM client configured"

```rust
// Don't use RLMEngine::new() - it has no LLM client
let engine = RLMEngine::new(storage, bm25_index)?; // ❌

// Use with_llm_client() instead
let engine = RLMEngine::with_llm_client(
    storage, bm25_index, llm_client, Some(config)
)?; // ✅
```

### "Embedding generation failed"

```rust
// Make sure the embedding config matches your provider
let config = RLMEngineConfig {
    embedding: Some(EmbeddingConfig::openai_small()), // ✅
    ..Default::default()
};
```

### "SurrealDB schema error"

```bash
# Apply the schema
cd crates/vapora-rlm/tests
bash test_setup.sh
```

## Examples

See the `examples/` directory:

- `production_setup.rs` - OpenAI production setup
- `local_ollama.rs` - Local development with Ollama

Run them with:

```bash
cargo run --example production_setup
cargo run --example local_ollama
```

## Cost Optimization

### Use Local Ollama for Development

```rust
// Free, local, no API keys
let client = Arc::new(OllamaClient::new(
    "http://localhost:11434".to_string(),
    "llama3.2".to_string(),
    4096, 0.7,
)?);
```

### Choose Cheaper Models for Production

```rust
// Instead of gpt-4 ($5/$15 per 1M tokens)
OpenAIClient::new(api_key, "gpt-4".to_string(), ...)

// Use gpt-3.5-turbo ($0.50/$1.50 per 1M tokens)
OpenAIClient::new(api_key, "gpt-3.5-turbo".to_string(), ...)
```
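
Filling in the elided arguments with the parameter order used in step 1 (max_tokens, temperature, then the two per-1M-token rates quoted in the comment above), a cheaper client might look like this sketch:

```rust
let cheap_client = Arc::new(OpenAIClient::new(
    api_key,
    "gpt-3.5-turbo".to_string(),
    4096, // max_tokens
    0.7,  // temperature
    0.50, // cost per 1M input tokens
    1.50, // cost per 1M output tokens
)?);
```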

### Track Costs with Metrics

```rust
// RLM automatically tracks token usage; the rates below are the gpt-4
// example rates ($5 in / $15 out per 1M tokens) used earlier in this guide.
let response = engine.dispatch_subtask(...).await?;
println!("Cost: ${:.4}",
    (response.total_input_tokens as f64 * 5.0 / 1_000_000.0) +
    (response.total_output_tokens as f64 * 15.0 / 1_000_000.0)
);
```
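
If the rates are used in more than one place, a small helper keeps them consistent; this is plain arithmetic over the token counts returned above, not part of the vapora-rlm API:

```rust
// Illustrative helper: estimate USD cost from token counts and per-1M-token rates.
fn estimate_cost_usd(input_tokens: u64, output_tokens: u64, input_rate: f64, output_rate: f64) -> f64 {
    (input_tokens as f64 * input_rate + output_tokens as f64 * output_rate) / 1_000_000.0
}

// gpt-4 example rates from this guide:
let cost = estimate_cost_usd(
    response.total_input_tokens as u64,
    response.total_output_tokens as u64,
    5.0,
    15.0,
);
println!("Cost: ${cost:.4}");
```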

## Next Steps

1. Review examples: `cargo run --example local_ollama`
2. Run tests: `cargo test -p vapora-rlm`
3. Check metrics: See `src/metrics.rs`
4. Integrate with backend: See `vapora-backend` integration patterns