212 lines
6.5 KiB
Plaintext
212 lines
6.5 KiB
Plaintext
|
|
#!/usr/bin/env nu
|
||
|
|
# Rebuild embeddings index for KOGRAL
|
||
|
|
#
|
||
|
|
# Usage: nu kogral-reindex.nu [--provider <openai|claude|ollama|fastembed>] [--batch-size <n>] [--kogral-dir <path>] [--dry-run] [--force]
|
||
|
|
|
||
|
|
# Entry point: rebuild the embeddings index for a KOGRAL directory.
#
# Steps: validate the directory and its config.toml, check that embeddings are
# enabled (unless --force), scan for markdown files, report them grouped by
# type, then hand them to `process_batch` in non-overlapping batches.
# With --dry-run the script stops after reporting what would be processed.
#
# Exits with status 1 on invalid arguments, a missing directory/config, or
# disabled embeddings without --force; exits 0 when there is nothing to index
# or after a dry run.
def main [
    --provider: string = "fastembed"  # Embedding provider
    --batch-size: int = 10            # Number of nodes to process at once
    --dry-run                         # Show what would be indexed without making changes
    --kogral-dir: string = ".kogral"  # KOGRAL directory
    --force                           # Force reindex even if embeddings exist
] {
    print $"(ansi green_bold)KOGRAL Reindexing(ansi reset)"
    print $"Provider: ($provider)"
    print $"Batch size: ($batch_size)"
    print $"KOGRAL Directory: ($kogral_dir)"

    if $dry_run {
        print $"(ansi yellow)DRY RUN MODE - No changes will be made(ansi reset)"
    }

    # A batch size below 1 would divide by zero below and cannot form batches.
    if $batch_size < 1 {
        print $"(ansi red)Error: --batch-size must be at least 1, got ($batch_size)(ansi reset)"
        exit 1
    }

    # Check if .kogral directory exists
    if not ($kogral_dir | path exists) {
        print $"(ansi red)Error: KOGRAL directory not found: ($kogral_dir)(ansi reset)"
        exit 1
    }

    # Load configuration
    let config_path = $"($kogral_dir)/config.toml"
    if not ($config_path | path exists) {
        print $"(ansi red)Error: Config file not found: ($config_path)(ansi reset)"
        exit 1
    }

    # Read the file raw and parse it explicitly: a plain `open` already parses
    # .toml files into a record, which `from toml` would then reject.
    let config = open --raw $config_path | from toml

    # Check if embeddings are enabled; --force lets the run continue anyway
    if not ($config.embeddings?.enabled? | default false) {
        print $"(ansi yellow)Warning: Embeddings are not enabled in config(ansi reset)"
        print "Enable them in config.toml:"
        print "[embeddings]"
        print "enabled = true"
        print $"provider = \"($provider)\""

        if not $force {
            print $"\nUse --force to reindex anyway"
            exit 1
        }
    }

    # Count markdown files
    print $"\n(ansi cyan_bold)Scanning files...(ansi reset)"
    let files = find_markdown_files $kogral_dir

    let total_files = $files | length
    print $"Found ($total_files) markdown files"

    if $total_files == 0 {
        print $"\n(ansi yellow)No files to index(ansi reset)"
        exit 0
    }

    # Group files by type
    let by_type = $files | group-by type | transpose type files

    print $"\n(ansi cyan_bold)Files by type:(ansi reset)"
    for group in $by_type {
        let count = $group.files | length
        print $" ($group.type): ($count)"
    }

    # Calculate batches
    let num_batches = ($total_files / $batch_size | math ceil | into int)
    # Parentheses are escaped so "(es)" is printed instead of being evaluated.
    print $"\nWill process in ($num_batches) batch\(es\) of ($batch_size)"

    if $dry_run {
        print $"\n(ansi yellow)[DRY RUN] Would process ($total_files) files(ansi reset)"
        exit 0
    }

    # Process embeddings
    print $"\n(ansi cyan_bold)Generating embeddings...(ansi reset)"

    # `window` slides by one element by default; a stride equal to the batch
    # size yields disjoint batches and --remainder keeps the short final one.
    let batches = $files | window $batch_size --stride $batch_size --remainder

    for entry in ($batches | enumerate) {
        print $"\nBatch ($entry.index + 1)/($num_batches):"
        process_batch $entry.item $provider
    }

    print $"\n(ansi green_bold)✓ Reindexing completed(ansi reset)"
    print $"Processed ($total_files) files"
}
|
||
|
|
|
||
|
|
# Collect the markdown files found under the KOGRAL content folders.
#
# Parameters:
#   kogral_dir - root KOGRAL directory to search in
#
# Returns a list of records `{ path, type }`. Files are searched recursively
# in notes/, decisions/, guidelines/, patterns/ and journal/ (in that order)
# and tagged with the matching singular type name.
def find_markdown_files [kogral_dir: string] {
    # Folder name on disk paired with the type label attached to its files.
    let sources = [
        { folder: "notes", kind: "note" }
        { folder: "decisions", kind: "decision" }
        { folder: "guidelines", kind: "guideline" }
        { folder: "patterns", kind: "pattern" }
        { folder: "journal", kind: "journal" }
    ]

    (
        $sources
        | each { |source|
            glob $"($kogral_dir)/($source.folder)/**/*.md"
            | each { |md_path| { path: $md_path, type: $source.kind } }
        }
        | flatten
    )
}
|
||
|
|
|
||
|
|
# Process one batch of files: read each file, extract its title and request
# an embedding for it.
#
# Parameters:
#   batch    - list of records with `path` and `type` fields
#   provider - embedding provider name, forwarded to generate_embedding_for_file
#
# A failure on a single file (for example an unreadable file or an error
# raised while generating the embedding) is reported and counted as failed;
# the remaining files in the batch are still processed. Prints a summary
# line with the number of files that succeeded out of those attempted.
def process_batch [batch: list, provider: string] {
    mut processed = 0
    mut succeeded = 0

    for file in $batch {
        let filename = $file.path | path basename
        print $" Processing ($filename) [($file.type)]..."

        # The try block only yields a success flag; the mutable counters are
        # updated outside of it so no closure has to capture them.
        let ok = (try {
            # Phase 1: load the markdown file and extract its title
            let title = extract_title_from_lines (open --raw $file.path | lines)

            # Phase 2: generate the embedding via the kogral CLI
            generate_embedding_for_file $file.path $title $provider
            true
        } catch { |err|
            print $" (ansi red)✗ Failed: ($err.msg)(ansi reset)"
            false
        })

        $processed = $processed + 1
        if $ok {
            $succeeded = $succeeded + 1
        }

        # Rate limiting: short delay between provider calls
        sleep 50ms
    }

    print $" (ansi green)✓ Batch completed: ($succeeded)/($processed) succeeded(ansi reset)"
}
|
||
|
|
|
||
|
|
# Extract the title from the lines of a markdown file.
#
# Parameters:
#   lines - the file content split into lines
#
# Returns the text after the first line that starts with `title:`
# (format: `title: Example Title`), trimmed of surrounding whitespace,
# or "Unknown" when no such line exists.
# NOTE(review): the scan is not limited to the frontmatter block, so a
# `title:` line in the body would also match — confirm this is acceptable.
def extract_title_from_lines [lines: list] {
    for line in $lines {
        if ($line =~ '^title:') {
            # --regex is required: without it `str replace` treats the
            # pattern as literal text and the "title:" prefix is never removed.
            let title = $line | str replace --regex '^title:\s*' ''
            return ($title | str trim)
        }
    }
    "Unknown"
}
|
||
|
|
|
||
|
|
# Trigger embedding generation for one file through the kogral CLI.
#
# Parameters:
#   file_path - path of the markdown file (currently not used by the body)
#   title     - title of the file; passed to `kogral search` as the query
#   provider  - one of "fastembed", "openai", "claude", "ollama"
#
# When the provider cannot be used (missing API key, Ollama not reachable,
# unknown provider name) a warning is printed and the function returns
# without calling kogral and without reporting success.
# NOTE(review): the only CLI call made here is `kogral search <title> --limit 1`;
# presumably that forces the embedding to be computed — confirm against the
# kogral CLI, since neither the file path nor the provider is passed to it.
def generate_embedding_for_file [file_path: string, title: string, provider: string] {
    # Phase 1: provider-specific precondition check.
    # A non-empty message means the provider must be skipped.
    let skip_message = match $provider {
        "fastembed" => {
            # Local fastembed model: no credentials or server needed
            null
        },
        "openai" => {
            if ($env.OPENAI_API_KEY? | is-empty) {
                $" (ansi yellow)⚠ OpenAI: OPENAI_API_KEY not set(ansi reset)"
            } else {
                null
            }
        },
        "claude" => {
            if ($env.ANTHROPIC_API_KEY? | is-empty) {
                $" (ansi yellow)⚠ Claude: ANTHROPIC_API_KEY not set(ansi reset)"
            } else {
                null
            }
        },
        "ollama" => {
            if (not (check_ollama_available)) {
                $" (ansi yellow)⚠ Ollama: Server not available at localhost:11434(ansi reset)"
            } else {
                null
            }
        },
        _ => {
            $" (ansi red)✗ Unknown provider: ($provider)(ansi reset)"
        }
    }

    if not ($skip_message | is-empty) {
        print $skip_message
        # Nothing was generated, so do not fall through to the success line.
        return
    }

    # Phase 2: invoke the kogral CLI (same call for every provider)
    ^kogral search $title --limit 1 | ignore

    print $" (ansi green)✓ Embedding generated via ($provider)(ansi reset)"
}
|
||
|
|
|
||
|
|
# Check whether an Ollama server answers on localhost:11434.
#
# Returns true when an HTTP GET to the server's version endpoint succeeds,
# false when the request fails for any reason (for example connection refused).
# NOTE(review): the host and port are fixed to match the warning printed by
# the caller; a differently configured Ollama endpoint is not detected here.
def check_ollama_available [] {
    try {
        http get "http://localhost:11434/api/version" | ignore
        true
    } catch {
        false
    }
}
|