kogral/scripts/kogral-reindex.nu
Jesús Pérez 9ea04852a8
Some checks failed
Rust CI / Security Audit (push) Has been cancelled
Rust CI / Check + Test + Lint (nightly) (push) Has been cancelled
Rust CI / Check + Test + Lint (stable) (push) Has been cancelled
Nickel Type Check / Nickel Type Checking (push) Has been cancelled
chore: add schemas and just recipes
2026-01-23 16:12:50 +00:00

212 lines
6.5 KiB
Plaintext

#!/usr/bin/env nu
# Rebuild embeddings index for KOGRAL
#
# Usage: nu kogral-reindex.nu [--provider <openai|claude|ollama|fastembed>] [--batch-size <n>] [--kogral-dir <path>] [--dry-run] [--force]
def main [
    --provider: string = "fastembed"    # Embedding provider
    --batch-size: int = 10              # Number of nodes to process at once
    --dry-run                           # Show what would be indexed without making changes
    --kogral-dir: string = ".kogral"    # KOGRAL directory
    --force                             # Force reindex even if embeddings exist
] {
    # Flow: print the settings, validate the inputs, load config.toml, list
    # the markdown files, report them per type, then hand them to
    # process_batch in consecutive, non-overlapping batches.
    # Exits 1 on invalid input or missing directory/config (and when
    # embeddings are disabled without --force); exits 0 when there is nothing
    # to index or when --dry-run is given.
    print $"(ansi green_bold)KOGRAL Reindexing(ansi reset)"
    print $"Provider: ($provider)"
    print $"Batch size: ($batch_size)"
    print $"KOGRAL Directory: ($kogral_dir)"
    if $dry_run {
        print $"(ansi yellow)DRY RUN MODE - No changes will be made(ansi reset)"
    }

    # A batch size below 1 would divide by zero when counting batches and
    # cannot be used to split the file list.
    if $batch_size < 1 {
        print $"(ansi red)Error: --batch-size must be at least 1, got ($batch_size)(ansi reset)"
        exit 1
    }

    # Check if .kogral directory exists
    if not ($kogral_dir | path exists) {
        print $"(ansi red)Error: KOGRAL directory not found: ($kogral_dir)(ansi reset)"
        exit 1
    }

    # Load configuration
    let config_path = $"($kogral_dir)/config.toml"
    if not ($config_path | path exists) {
        print $"(ansi red)Error: Config file not found: ($config_path)(ansi reset)"
        exit 1
    }
    # `open` already parses *.toml by extension, so read the raw text and
    # parse it explicitly instead of piping a parsed record into `from toml`.
    let config = open --raw $config_path | from toml

    # Check if embeddings are enabled; a missing key counts as disabled.
    if not ($config.embeddings?.enabled? | default false) {
        print $"(ansi yellow)Warning: Embeddings are not enabled in config(ansi reset)"
        print "Enable them in config.toml:"
        print "[embeddings]"
        print "enabled = true"
        print $"provider = \"($provider)\""
        if not $force {
            print $"\nUse --force to reindex anyway"
            exit 1
        }
    }

    # Count markdown files
    print $"\n(ansi cyan_bold)Scanning files...(ansi reset)"
    let files = find_markdown_files $kogral_dir
    let total_files = $files | length
    print $"Found ($total_files) markdown files"
    if $total_files == 0 {
        print $"\n(ansi yellow)No files to index(ansi reset)"
        exit 0
    }

    # Group files by type
    let by_type = $files | group-by type | transpose type files
    print $"\n(ansi cyan_bold)Files by type:(ansi reset)"
    for group in $by_type {
        let count = $group.files | length
        print $" ($group.type): ($count)"
    }

    # Calculate batches
    let num_batches = ($total_files / $batch_size | math ceil | into int)
    print $"\nWill process in ($num_batches) batch(es) of ($batch_size)"
    if $dry_run {
        print $"\n(ansi yellow)[DRY RUN] Would process ($total_files) files(ansi reset)"
        exit 0
    }

    # Process embeddings
    print $"\n(ansi cyan_bold)Generating embeddings...(ansi reset)"
    # A stride equal to the window size yields disjoint batches (the default
    # stride of 1 would produce overlapping sliding windows), and --remainder
    # keeps the final, shorter batch so no file is dropped.
    let batches = $files | window $batch_size --stride $batch_size --remainder
    for entry in ($batches | enumerate) {
        print $"\nBatch ($entry.index + 1)/($num_batches):"
        process_batch $entry.item $provider
    }

    print $"\n(ansi green_bold)✓ Reindexing completed(ansi reset)"
    print $"Processed ($total_files) files"
}
# Collect every markdown file under the known KOGRAL content folders.
#
# Returns a list of records `{ path, type }`, where `type` is the label
# associated with the folder the file was found in. Results are ordered by
# folder: notes, decisions, guidelines, patterns, journal.
def find_markdown_files [kogral_dir: string] {
    # Folder name -> type label, in the order results are returned.
    let sources = [
        [folder, label];
        ["notes", "note"]
        ["decisions", "decision"]
        ["guidelines", "guideline"]
        ["patterns", "pattern"]
        ["journal", "journal"]
    ]

    mut found = []
    for source in $sources {
        let matches = (
            glob $"($kogral_dir)/($source.folder)/**/*.md"
            | each { |md| { path: $md, type: $source.label } }
        )
        $found = ($found | append $matches)
    }
    $found
}
# Process one batch of files: read each file, extract its title, and request
# an embedding for it through generate_embedding_for_file.
#
# Parameters:
#   batch    - list of records with `path` and `type` fields
#   provider - embedding provider name, passed through unchanged
#
# A failure on one file (unreadable file, failing command) is reported and
# counted, but does not stop the remaining files in the batch.
def process_batch [batch: list, provider: string] {
    mut processed = 0
    mut succeeded = 0
    for file in $batch {
        let filename = $file.path | path basename
        print $" Processing ($filename) [($file.type)]..."

        # The try block yields a boolean so the counters are only mutated
        # outside of it.
        let ok = try {
            # Phase 1: Load the markdown file as raw text and split into lines
            let lines = open --raw $file.path | lines
            let title = extract_title_from_lines $lines
            # Phase 2: Generate embedding via kogral CLI
            generate_embedding_for_file $file.path $title $provider
            true
        } catch { |err|
            print $" (ansi red)✗ Failed to process ($filename): ($err.msg)(ansi reset)"
            false
        }

        $processed = $processed + 1
        if $ok {
            $succeeded = $succeeded + 1
        }

        # Rate limiting: short delay between provider calls
        sleep 50ms
    }
    print $" (ansi green)✓ Batch completed: ($succeeded)/($processed) succeeded(ansi reset)"
}
# Return the title declared by the first line that starts with `title:`.
#
# Parameters:
#   lines - the lines of a markdown file (e.g. `title: Example Title`)
#
# Returns the text after `title:` with surrounding whitespace removed, or
# "Unknown" when no line starts with `title:`.
def extract_title_from_lines [lines: list] {
    for line in $lines {
        if ($line =~ '^title:') {
            # --regex is required: without it `str replace` looks for the
            # pattern as a literal substring and the prefix is never removed.
            return ($line | str replace --regex '^title:\s*' '' | str trim)
        }
    }
    "Unknown"
}
# Run the kogral CLI for one file's title after checking that the selected
# provider is usable.
#
# Parameters:
#   file_path - path of the file being indexed (currently not used here)
#   title     - title passed to `kogral search`
#   provider  - one of "fastembed", "openai", "claude", "ollama"
#
# Prints a warning and does nothing else when the provider's precondition is
# not met or the provider is unknown. The success line is printed only after
# the kogral command has actually been run.
def generate_embedding_for_file [file_path: string, title: string, provider: string] {
    # Phase 1: Provider-specific readiness check
    let ready = match $provider {
        "fastembed" => {
            # Local fastembed model: no credentials to check
            true
        },
        "openai" => {
            # OpenAI API requires credentials
            if ($env.OPENAI_API_KEY? | is-empty) {
                print $" (ansi yellow)⚠ OpenAI: OPENAI_API_KEY not set(ansi reset)"
                false
            } else {
                true
            }
        },
        "claude" => {
            # Claude API requires credentials
            if ($env.ANTHROPIC_API_KEY? | is-empty) {
                print $" (ansi yellow)⚠ Claude: ANTHROPIC_API_KEY not set(ansi reset)"
                false
            } else {
                true
            }
        },
        "ollama" => {
            # Ollama local server
            if (not (check_ollama_available)) {
                print $" (ansi yellow)⚠ Ollama: Server not available at localhost:11434(ansi reset)"
                false
            } else {
                true
            }
        },
        _ => {
            print $" (ansi red)✗ Unknown provider: ($provider)(ansi reset)"
            false
        }
    }

    # Phase 2: Run the CLI and report success only when it was actually run
    if $ready {
        kogral search $title --limit 1 | ignore
        print $" (ansi green)✓ Embedding generated via ($provider)(ansi reset)"
    }
}
# Report whether the Ollama server can be used.
#
# Placeholder: no connection attempt is made; this always returns true.
# A real health check against the Ollama endpoint is still to be implemented.
def check_ollama_available [] {
    return true
}