provisioning/tools/catalog/ingest-knowledge-base.nu
Jesús Pérez 44648e3206
chore: complete nickel migration and consolidate legacy configs
- Remove KCL ecosystem (~220 files deleted)
- Migrate all infrastructure to Nickel schema system
- Consolidate documentation: legacy docs → provisioning/docs/src/
- Add CI/CD workflows (.github/) and Rust build config (.cargo/)
- Update core system for Nickel schema parsing
- Update README.md and CHANGES.md for v5.0.0 release
- Fix pre-commit hooks: end-of-file, trailing-whitespace
- Breaking changes: KCL workspaces require migration
- Migration bridge available in docs/src/development/
2026-01-08 09:55:37 +00:00

346 lines
14 KiB
Plaintext

#!/usr/bin/env nu
# Ingest Phase 1 (best practices) and Phase 2 (extension metadata) into SurrealDB
# Creates knowledge documents with embeddings for RAG semantic search
#
# Usage:
# nu ingest-knowledge-base.nu --surrealdb-url ws://localhost:8000 --namespace rag --database provisioning
# nu ingest-knowledge-base.nu # Uses defaults
#
def main [
    --surrealdb-url: string = "ws://localhost:8000" # SurrealDB connection URL
    --namespace: string = "rag" # Database namespace
    --database: string = "provisioning" # Database name
    --best-practices-file: path = "config/best-practices.json" # Phase 1 best practices data
    --extensions-dag-file: path = "config/extensions-dag.json" # Phase 2 extension DAG
    --verbose # Enable verbose output
] {
    # Entry point: reads the two input JSON files, builds knowledge documents and
    # relationships, and writes them (plus schema.sql and manifest.json) under
    # config/knowledge-base. It does not connect to SurrealDB; the connection
    # flags are only echoed in verbose output and in the printed next steps.
    #
    # Returns 0 after writing the outputs, 1 when an input file is missing.
    # NOTE(review): these are returned values, not process exit statuses —
    # confirm whether callers expect `exit 1` instead.
    print "🔌 Ingesting Knowledge Base into SurrealDB..."
    if $verbose {
        print $" SurrealDB: ($surrealdb_url)"
        print $" Namespace: ($namespace)"
        print $" Database: ($database)"
    }

    # Verify input files exist. The messages must be interpolated strings
    # ($"...(expr)") for the offending path to appear in the output.
    if not ($best_practices_file | path exists) {
        print $"❌ Best practices file not found: ($best_practices_file)"
        return 1
    }
    if not ($extensions_dag_file | path exists) {
        print $"❌ Extensions DAG file not found: ($extensions_dag_file)"
        return 1
    }

    # Load best practices (expects a top-level `practices` key)
    print "📚 Loading best practices..."
    let best_practices = (open $best_practices_file | get practices)
    print $" ✅ Loaded ($best_practices | length) best practices"

    # Load extensions (expects `extensions` and `initialization_order` keys)
    print "📦 Loading extensions metadata..."
    let extensions_data = (open $extensions_dag_file)
    let extensions = $extensions_data.extensions
    let init_order = $extensions_data.initialization_order
    print $" ✅ Loaded ($extensions | length) extensions"

    # Create knowledge documents from best practices
    print "\n🔄 Creating best practice documents..."
    let bp_docs = (create_best_practice_docs $best_practices)
    print $" ✅ Created ($bp_docs | length) best practice documents"

    # Create knowledge documents from extensions
    print "🔄 Creating extension metadata documents..."
    let ext_docs = (create_extension_docs $extensions $init_order $bp_docs)
    print $" ✅ Created ($ext_docs | length) extension documents"

    # Create document relationships
    print "🔄 Creating document relationships..."
    let relationships = (create_relationships $bp_docs $ext_docs)
    print $" ✅ Created ($relationships | length) relationships"

    # Generate database initialization SQL
    print "\n📝 Generating SurrealDB schema..."
    let schema_sql = (generate_schema_sql)

    # Build the ingestion manifest. The timestamp is converted to UTC first so
    # that the literal "Z" suffix in the format string is truthful.
    let manifest = {
        generated_at: (date now | date to-timezone UTC | format date "%Y-%m-%dT%H:%M:%SZ"),
        source_best_practices: $best_practices_file,
        source_extensions_dag: $extensions_dag_file,
        statistics: {
            total_documents: (($bp_docs | length) + ($ext_docs | length)),
            best_practice_docs: ($bp_docs | length),
            extension_docs: ($ext_docs | length),
            relationships: ($relationships | length),
            by_category: (group_by_category $bp_docs $ext_docs),
        },
    }

    let output_dir = "config/knowledge-base"
    if not ($output_dir | path exists) {
        mkdir $output_dir
    }

    # Save documents (existing files are overwritten)
    $bp_docs | to json --indent 2 | save --force $"($output_dir)/best-practices-docs.json"
    $ext_docs | to json --indent 2 | save --force $"($output_dir)/extension-docs.json"
    $relationships | to json --indent 2 | save --force $"($output_dir)/relationships.json"
    $schema_sql | save --force $"($output_dir)/schema.sql"
    $manifest | to json --indent 2 | save --force $"($output_dir)/manifest.json"

    print "\n✅ Knowledge base ingestion manifest created"
    print $" Output directory: ($output_dir)"
    print $" Best practices docs: ($bp_docs | length)"
    print $" Extension docs: ($ext_docs | length)"
    print $" Relationships: ($relationships | length)"

    # Display next steps
    print "\n📋 Next Steps:"
    print " 1. Start SurrealDB server:"
    print " surreal start --bind 0.0.0.0:8000 file:./data.db"
    print ""
    print " 2. Initialize schema:"
    print $" surreal sql --namespace ($namespace) --database ($database) < ($output_dir)/schema.sql"
    print ""
    print " 3. Ingest documents (requires embedding integration):"
    print " Implement vector embedding generation via OpenAI API or local embeddings"
    print ""
    print " 4. Test RAG queries:"
    print " curl -X POST http://localhost:8083/api/v1/ai/ask \\"
    print " -H 'Content-Type: application/json' \\"
    # Single braces: Nushell interpolation is driven by parentheses, so doubled
    # braces would be printed verbatim and yield an invalid JSON payload.
    print " -d '{\"question\": \"What best practices apply to Kubernetes deployments?\"}'"
    0
}
# Create best practice documents for knowledge base
#
# Turns each raw practice record into a knowledge document. Every input record
# must provide: id, title, description, category, relevance, tags, source.
# Returns one document record per practice, in input order. `embedding` is left
# null and `indexed_at` is an ISO-8601 UTC timestamp shared by the whole batch.
def create_best_practice_docs [practices: list] {
    # Computed once and converted to UTC so the trailing "Z" is accurate and
    # all documents of a run carry the same indexing time.
    let indexed_at = (date now | date to-timezone UTC | format date "%Y-%m-%dT%H:%M:%SZ")

    $practices | each { |bp|
        {
            id: $bp.id,
            type: "best_practice",
            title: $bp.title,
            description: $bp.description,
            category: $bp.category,
            relevance: $bp.relevance,
            tags: $bp.tags,
            source: $bp.source,
            # Flattened text form of the record
            content: $"($bp.title)\n\n($bp.description)\n\nCategory: ($bp.category)\nSource: ($bp.source)",
            embedding: null, # Will be populated by RAG ingestion pipeline
            source_path: "config/best-practices.json",
            indexed_at: $indexed_at,
            metadata: {
                doc_type: "best_practice",
                version: "1.0",
            }
        }
    }
}
# Create extension metadata documents for knowledge base
#
# Turns each extension record into a knowledge document. Every extension must
# provide: name, version, category, description, dependencies, tags,
# best_practices. `init_order` is a list of extension names; the position of an
# extension's name in it becomes `initialization_order`. `bp_docs` is accepted
# for call compatibility but is not read.
# Raises an error when an extension's name is absent from `init_order`.
def create_extension_docs [extensions: list, init_order: list, bp_docs: list] {
    # Converted to UTC so the trailing "Z" is accurate; shared by the batch.
    let indexed_at = (date now | date to-timezone UTC | format date "%Y-%m-%dT%H:%M:%SZ")
    # Pair every name with its position once instead of once per extension.
    let ordered = ($init_order | enumerate)

    $extensions | each { |ext|
        let matches = ($ordered | where { |entry| $entry.item == $ext.name })
        if ($matches | is-empty) {
            error make { msg: $"Extension '($ext.name)' is missing from initialization_order" }
        }
        let order_idx = ($matches | first | get index)

        # Join with a double-quoted "\n": a single-quoted '\n' is a raw string
        # and would insert a literal backslash-n between the items.
        let deps_text = ($ext.dependencies | str join "\n")
        let tags_text = ($ext.tags | str join "\n")

        {
            id: $"ext_($ext.name)",
            type: "extension_metadata",
            name: $ext.name,
            version: $ext.version,
            category: $ext.category,
            description: $ext.description,
            dependencies: $ext.dependencies,
            tags: $ext.tags,
            best_practices: $ext.best_practices,
            content: $"($ext.name) v($ext.version)\n\n($ext.description)\n\nCategory: ($ext.category)\n\nDependencies:\n($deps_text)\n\nTags:\n($tags_text)",
            embedding: null, # Will be populated by RAG ingestion pipeline
            # Path is derived by appending "s" to the category name
            source_path: $"extensions/($ext.category)s/($ext.name)/metadata.ncl",
            indexed_at: $indexed_at,
            dependency_count: ($ext.dependencies | length),
            initialization_order: $order_idx,
            metadata: {
                doc_type: "extension_metadata",
                version: "1.0",
            }
        }
    }
}
# Create relationships between documents
#
# Produces one flat list of relationship records
# (source_id, target_id, relationship_type, strength, created_at):
#   - "relates_to": one per unordered pair of best practices sharing a tag
#   - "implements": extension -> each id in its `best_practices` (strength 0.9)
#   - "depends_on": extension -> "ext_<dependency>" for each dependency (strength 1.0)
# `bp_docs` records need `id` and `tags`; `ext_docs` records need `id`,
# `best_practices` and `dependencies`. Referenced target ids are not validated.
def create_relationships [bp_docs: list, ext_docs: list] {
    # Converted to UTC so the trailing "Z" is accurate; shared by every record.
    let created_at = (date now | date to-timezone UTC | format date "%Y-%m-%dT%H:%M:%SZ")
    let indexed_bps = ($bp_docs | enumerate)

    # Best practice relationships (based on overlapping tags)
    let bp_rels = (
        $indexed_bps
        | each { |left|
            $indexed_bps
            # Only pair with later entries so each pair is emitted once
            | where { |right| $right.index > $left.index }
            | each { |right|
                let shared = (
                    $left.item.tags
                    | where { |tag| $tag in $right.item.tags }
                    | length
                )
                if $shared > 0 {
                    {
                        source_id: $left.item.id,
                        target_id: $right.item.id,
                        relationship_type: "relates_to",
                        # NOTE(review): shared / (len1 + len2) tops out at 0.5
                        # for identical tag lists — confirm this scale is intended.
                        strength: ($shared / (($left.item.tags | length) + ($right.item.tags | length))),
                        created_at: $created_at,
                    }
                } else {
                    null
                }
            }
            | compact
        }
        | flatten
    )

    # Extension to best practice relationships
    let ext_bp_rels = (
        $ext_docs
        | each { |ext|
            $ext.best_practices
            | each { |bp_id|
                {
                    source_id: $ext.id,
                    target_id: $bp_id,
                    relationship_type: "implements",
                    strength: 0.9,
                    created_at: $created_at,
                }
            }
        }
        | flatten
    )

    # Extension dependency relationships
    let ext_dep_rels = (
        $ext_docs
        | each { |ext|
            $ext.dependencies
            | each { |dep|
                {
                    source_id: $ext.id,
                    target_id: $"ext_($dep)",
                    relationship_type: "depends_on",
                    strength: 1.0,
                    created_at: $created_at,
                }
            }
        }
        | flatten
    )

    [$bp_rels, $ext_bp_rels, $ext_dep_rels] | flatten
}
# Generate SurrealDB schema SQL
#
# Takes no arguments and returns a single constant string of DEFINE statements
# for four SCHEMAFULL tables:
#   - best_practice_docs and extension_docs: document fields plus an index on
#     category and an HNSW index on `embedding` (dimension 1536, cosine distance);
#     extension_docs additionally gets a unique index on name
#   - doc_relationships: source/target/type/strength with indexes on the first three
#   - kb_metadata: bookkeeping fields
# The text is fixed: nothing from the script's flags is substituted into it.
def generate_schema_sql [] {
# The whole schema is one double-quoted string literal and is the block's
# return value. `\"` and `\$` are escapes, so the emitted SQL contains plain
# `"` and `$value`.
# NOTE(review): both `embedding` fields assert a length of exactly 1536, while
# the documents this script writes carry `embedding: null` — embeddings
# presumably have to be filled in before rows are inserted; confirm.
"-- Knowledge Base Schema for RAG Integration
-- Creates tables for documents and relationships with HNSW vector indexing
-- Best practice documents table
DEFINE TABLE best_practice_docs SCHEMAFULL;
DEFINE FIELD id ON best_practice_docs TYPE string ASSERT string::len(\$value) > 0;
DEFINE FIELD type ON best_practice_docs TYPE string VALUE \"best_practice\" ASSERT \$value == \"best_practice\";
DEFINE FIELD title ON best_practice_docs TYPE string;
DEFINE FIELD description ON best_practice_docs TYPE string;
DEFINE FIELD category ON best_practice_docs TYPE string;
DEFINE FIELD relevance ON best_practice_docs TYPE number ASSERT \$value >= 0 AND \$value <= 100;
DEFINE FIELD tags ON best_practice_docs TYPE array;
DEFINE FIELD source ON best_practice_docs TYPE string;
DEFINE FIELD content ON best_practice_docs TYPE string;
DEFINE FIELD embedding ON best_practice_docs TYPE array ASSERT array::len(\$value) == 1536;
DEFINE FIELD source_path ON best_practice_docs TYPE string;
DEFINE FIELD indexed_at ON best_practice_docs TYPE datetime;
DEFINE FIELD metadata ON best_practice_docs TYPE object;
DEFINE INDEX idx_category ON best_practice_docs FIELDS category;
DEFINE INDEX idx_embedding ON best_practice_docs FIELDS embedding HNSW DIMENSION 1536 DIST COSINE;
-- Extension metadata documents table
DEFINE TABLE extension_docs SCHEMAFULL;
DEFINE FIELD id ON extension_docs TYPE string ASSERT string::len(\$value) > 0;
DEFINE FIELD type ON extension_docs TYPE string VALUE \"extension_metadata\" ASSERT \$value == \"extension_metadata\";
DEFINE FIELD name ON extension_docs TYPE string;
DEFINE FIELD version ON extension_docs TYPE string;
DEFINE FIELD category ON extension_docs TYPE string;
DEFINE FIELD description ON extension_docs TYPE string;
DEFINE FIELD dependencies ON extension_docs TYPE array;
DEFINE FIELD tags ON extension_docs TYPE array;
DEFINE FIELD best_practices ON extension_docs TYPE array;
DEFINE FIELD content ON extension_docs TYPE string;
DEFINE FIELD embedding ON extension_docs TYPE array ASSERT array::len(\$value) == 1536;
DEFINE FIELD source_path ON extension_docs TYPE string;
DEFINE FIELD indexed_at ON extension_docs TYPE datetime;
DEFINE FIELD dependency_count ON extension_docs TYPE number;
DEFINE FIELD initialization_order ON extension_docs TYPE number;
DEFINE FIELD metadata ON extension_docs TYPE object;
DEFINE INDEX idx_category ON extension_docs FIELDS category;
DEFINE INDEX idx_name ON extension_docs FIELDS name UNIQUE;
DEFINE INDEX idx_embedding ON extension_docs FIELDS embedding HNSW DIMENSION 1536 DIST COSINE;
-- Document relationships table (for graph traversal)
DEFINE TABLE doc_relationships SCHEMAFULL;
DEFINE FIELD source_id ON doc_relationships TYPE string;
DEFINE FIELD target_id ON doc_relationships TYPE string;
DEFINE FIELD relationship_type ON doc_relationships TYPE string;
DEFINE FIELD strength ON doc_relationships TYPE number ASSERT \$value >= 0 AND \$value <= 1;
DEFINE FIELD created_at ON doc_relationships TYPE datetime;
DEFINE INDEX idx_source ON doc_relationships FIELDS source_id;
DEFINE INDEX idx_target ON doc_relationships FIELDS target_id;
DEFINE INDEX idx_relationship ON doc_relationships FIELDS relationship_type;
-- Knowledge base metadata table
DEFINE TABLE kb_metadata SCHEMAFULL;
DEFINE FIELD namespace ON kb_metadata TYPE string;
DEFINE FIELD database ON kb_metadata TYPE string;
DEFINE FIELD total_documents ON kb_metadata TYPE number;
DEFINE FIELD embedding_dimension ON kb_metadata TYPE number;
DEFINE FIELD total_relationships ON kb_metadata TYPE number;
DEFINE FIELD last_indexed_at ON kb_metadata TYPE datetime;
DEFINE FIELD vector_index_status ON kb_metadata TYPE string;
DEFINE FIELD metadata ON kb_metadata TYPE object;"
}
# Group documents by category for statistics
#
# Returns one record per distinct category found in either list, sorted by
# category, holding how many best-practice and extension documents use it.
def group_by_category [bp_docs: list, ext_docs: list] {
    # Gather the category of every document from both lists, then reduce the
    # combined list to its sorted distinct values in a single pass.
    let from_bp = ($bp_docs | each { |d| $d.category })
    let from_ext = ($ext_docs | each { |d| $d.category })
    let categories = ($from_bp | append $from_ext | sort | uniq)

    $categories | each { |name|
        {
            category: $name,
            best_practice_count: ($bp_docs | where { |d| $d.category == $name } | length),
            extension_count: ($ext_docs | where { |d| $d.category == $name } | length),
        }
    }
}
# No explicit `main` call here: when this file is run as a script
# (`nu ingest-knowledge-base.nu ...`), Nushell invokes `main` itself after the
# top-level code and passes it the command-line flags. Calling it here as well
# would run the ingestion twice, the first time with default values only.