#!/usr/bin/env nu
# Ingest Phase 1 (best practices) and Phase 2 (extension metadata) into SurrealDB
# Creates knowledge documents with embeddings for RAG semantic search
#
# Usage:
#   nu ingest-knowledge-base.nu --surrealdb-url ws://localhost:8000 --namespace rag --database provisioning
#   nu ingest-knowledge-base.nu  # Uses defaults
#
# Outputs (written to config/knowledge-base/):
#   best-practices-docs.json, extension-docs.json, relationships.json,
#   schema.sql, manifest.json
#
# NOTE: Nushell invokes `main` automatically when the script is run with
# `nu <file>`, so there is intentionally no explicit `main` call at the end
# of this file (an explicit call would run the ingestion twice and the first
# run would ignore the command-line flags).

# Entry point: load both input files, build documents and relationships,
# and write the generated artifacts plus a manifest to disk.
#
# Exits with status 1 when either input file is missing.
def main [
    --surrealdb-url: string = "ws://localhost:8000"              # SurrealDB connection URL
    --namespace: string = "rag"                                  # Database namespace
    --database: string = "provisioning"                          # Database name
    --best-practices-file: path = "config/best-practices.json"   # Phase 1 best practices data
    --extensions-dag-file: path = "config/extensions-dag.json"   # Phase 2 extension DAG
    --verbose                                                    # Enable verbose output
] {
    print "🔌 Ingesting Knowledge Base into SurrealDB..."

    if $verbose {
        print $"   SurrealDB: ($surrealdb_url)"
        print $"   Namespace: ($namespace)"
        print $"   Database: ($database)"
    }

    # Verify input files exist. The messages must be interpolated strings
    # ($"...") for the path to be rendered, and `exit 1` is what actually
    # sets a failing process status (a bare `return 1` only emits a value).
    if not ($best_practices_file | path exists) {
        print $"❌ Best practices file not found: ($best_practices_file)"
        exit 1
    }

    if not ($extensions_dag_file | path exists) {
        print $"❌ Extensions DAG file not found: ($extensions_dag_file)"
        exit 1
    }

    # Load best practices
    print "📚 Loading best practices..."
    let best_practices = (open $best_practices_file | get practices)
    print $"   ✅ Loaded ($best_practices | length) best practices"

    # Load extensions
    print "📦 Loading extensions metadata..."
    let extensions_data = (open $extensions_dag_file)
    let extensions = $extensions_data.extensions
    let init_order = $extensions_data.initialization_order
    print $"   ✅ Loaded ($extensions | length) extensions"

    # Create knowledge documents from best practices
    print "\n🔄 Creating best practice documents..."
    let bp_docs = (create_best_practice_docs $best_practices)
    print $"   ✅ Created ($bp_docs | length) best practice documents"

    # Create knowledge documents from extensions
    print "🔄 Creating extension metadata documents..."
    let ext_docs = (create_extension_docs $extensions $init_order $bp_docs)
    print $"   ✅ Created ($ext_docs | length) extension documents"

    # Create document relationships
    print "🔄 Creating document relationships..."
    let relationships = (create_relationships $bp_docs $ext_docs)
    print $"   ✅ Created ($relationships | length) relationships"

    # Generate database initialization SQL
    print "\n📝 Generating SurrealDB schema..."
    let schema_sql = (generate_schema_sql)

    # Build ingestion manifest
    let manifest = {
        generated_at: (utc_timestamp),
        source_best_practices: $best_practices_file,
        source_extensions_dag: $extensions_dag_file,
        statistics: {
            total_documents: (($bp_docs | length) + ($ext_docs | length)),
            best_practice_docs: ($bp_docs | length),
            extension_docs: ($ext_docs | length),
            relationships: ($relationships | length),
            by_category: (group_by_category $bp_docs $ext_docs),
        },
    }

    # `mkdir` creates missing parents and is a no-op when the directory exists.
    let output_dir = "config/knowledge-base"
    mkdir $output_dir

    # Save documents
    $bp_docs | to json --indent 2 | save --force $"($output_dir)/best-practices-docs.json"
    $ext_docs | to json --indent 2 | save --force $"($output_dir)/extension-docs.json"
    $relationships | to json --indent 2 | save --force $"($output_dir)/relationships.json"
    $schema_sql | save --force $"($output_dir)/schema.sql"
    $manifest | to json --indent 2 | save --force $"($output_dir)/manifest.json"

    print "\n✅ Knowledge base ingestion manifest created"
    print $"   Output directory: ($output_dir)"
    print $"   Best practices docs: ($bp_docs | length)"
    print $"   Extension docs: ($ext_docs | length)"
    print $"   Relationships: ($relationships | length)"

    # Display next steps
    print "\n📋 Next Steps:"
    print "   1. Start SurrealDB server:"
    print "      surreal start --bind 0.0.0.0:8000 file:./data.db"
    print ""
    print "   2. Initialize schema:"
    print $"      surreal sql --namespace ($namespace) --database ($database) < ($output_dir)/schema.sql"
    print ""
    print "   3. Ingest documents (requires embedding integration):"
    print "      Implement vector embedding generation via OpenAI API or local embeddings"
    print ""
    print "   4. Test RAG queries:"
    # Plain (non-interpolated) strings: braces are literal in Nushell, so the
    # JSON payload is printed with single braces and can be copy-pasted.
    print "      curl -X POST http://localhost:8083/api/v1/ai/ask \\"
    print "        -H 'Content-Type: application/json' \\"
    print "        -d '{\"question\": \"What best practices apply to Kubernetes deployments?\"}'"
}

# Current time rendered as an ISO-8601 string in UTC.
# The value is converted to UTC before formatting so the trailing `Z`
# designator is truthful regardless of the machine's local time zone.
def utc_timestamp [] {
    date now | date to-timezone "UTC" | format date "%Y-%m-%dT%H:%M:%SZ"
}

# Create best practice documents for knowledge base.
#
# Each input record must provide: id, title, description, category,
# relevance, tags, source. Returns one document record per practice with
# `embedding` left null for a later pipeline stage to fill in.
def create_best_practice_docs [practices: list] {
    # One timestamp for the whole batch keeps the documents consistent.
    let indexed_at = (utc_timestamp)

    $practices | each { |bp|
        {
            id: $bp.id,
            type: "best_practice",
            title: $bp.title,
            description: $bp.description,
            category: $bp.category,
            relevance: $bp.relevance,
            tags: $bp.tags,
            source: $bp.source,
            content: $"($bp.title)\n\n($bp.description)\n\nCategory: ($bp.category)\nSource: ($bp.source)",
            # Will be populated by RAG ingestion pipeline
            embedding: null,
            source_path: "config/best-practices.json",
            indexed_at: $indexed_at,
            metadata: {
                doc_type: "best_practice",
                version: "1.0",
            }
        }
    }
}

# Create extension metadata documents for knowledge base.
#
# Each extension record must provide: name, version, category, description,
# dependencies, tags, best_practices. `init_order` is the list of extension
# names in initialization order; an extension that does not appear in it gets
# `initialization_order: null` instead of aborting the run.
# `bp_docs` is accepted for interface compatibility and is currently unused.
def create_extension_docs [extensions: list, init_order: list, bp_docs: list] {
    let indexed_at = (utc_timestamp)

    $extensions | each { |ext|
        let order_matches = (
            $init_order
            | enumerate
            | where { |entry| $entry.item == $ext.name }
        )
        let order_idx = if ($order_matches | is-empty) {
            null
        } else {
            $order_matches | first | get index
        }

        # Double-quoted "\n" is a real newline; a single-quoted '\n' would be
        # the two literal characters backslash and n.
        let deps_text = ($ext.dependencies | str join "\n")
        let tags_text = ($ext.tags | str join "\n")

        {
            id: $"ext_($ext.name)",
            type: "extension_metadata",
            name: $ext.name,
            version: $ext.version,
            category: $ext.category,
            description: $ext.description,
            dependencies: $ext.dependencies,
            tags: $ext.tags,
            best_practices: $ext.best_practices,
            content: $"($ext.name) v($ext.version)\n\n($ext.description)\n\nCategory: ($ext.category)\n\nDependencies:\n($deps_text)\n\nTags:\n($tags_text)",
            # Will be populated by RAG ingestion pipeline
            embedding: null,
            source_path: $"extensions/($ext.category)s/($ext.name)/metadata.ncl",
            indexed_at: $indexed_at,
            dependency_count: ($ext.dependencies | length),
            initialization_order: $order_idx,
            metadata: {
                doc_type: "extension_metadata",
                version: "1.0",
            }
        }
    }
}

# Create relationships between documents.
#
# Produces three kinds of edges, returned as one flat list:
#   - "relates_to":  between each unordered pair of best practices that
#                    share at least one tag
#   - "implements":  from an extension to each best practice id it lists
#   - "depends_on":  from an extension to each of its dependencies
def create_relationships [bp_docs: list, ext_docs: list] {
    let created_at = (utc_timestamp)

    # Best practice relationships (based on overlapping tags)
    let bp_rels = (
        $bp_docs
        | enumerate
        | each { |left|
            let bp1 = $left.item

            $bp_docs
            | enumerate
            # Only look at later entries so each pair is emitted once.
            | where { |right| $right.index > $left.index }
            | each { |right|
                let bp2 = $right.item
                let common_tags = (
                    $bp1.tags
                    | where { |tag| $tag in $bp2.tags }
                    | length
                )

                if $common_tags > 0 {
                    {
                        source_id: $bp1.id,
                        target_id: $bp2.id,
                        relationship_type: "relates_to",
                        # NOTE(review): shared / (|a| + |b|) peaks at 0.5 for
                        # identical tag sets; confirm whether a Jaccard or
                        # Dice score (max 1.0) was intended.
                        strength: ($common_tags / (($bp1.tags | length) + ($bp2.tags | length))),
                        created_at: $created_at,
                    }
                } else {
                    null
                }
            }
            | compact
        }
        | flatten
    )

    # Extension to best practice relationships
    let ext_bp_rels = (
        $ext_docs
        | each { |ext|
            $ext.best_practices | each { |bp_id|
                {
                    source_id: $ext.id,
                    target_id: $bp_id,
                    relationship_type: "implements",
                    strength: 0.9,
                    created_at: $created_at,
                }
            }
        }
        | flatten
    )

    # Extension dependency relationships
    let ext_dep_rels = (
        $ext_docs
        | each { |ext|
            $ext.dependencies | each { |dep|
                {
                    source_id: $ext.id,
                    # Extension document ids are the name prefixed with "ext_".
                    target_id: $"ext_($dep)",
                    relationship_type: "depends_on",
                    strength: 1.0,
                    created_at: $created_at,
                }
            }
        }
        | flatten
    )

    [$bp_rels, $ext_bp_rels, $ext_dep_rels] | flatten
}

# Generate SurrealDB schema SQL.
#
# Returns the schema as a single string. Every statement and every `--`
# comment sits on its own line; SurrealQL line comments run to end of line,
# so the line breaks are significant.
def generate_schema_sql [] {
    "-- Knowledge Base Schema for RAG Integration
-- Creates tables for documents and relationships with HNSW vector indexing

-- Best practice documents table
DEFINE TABLE best_practice_docs SCHEMAFULL;
DEFINE FIELD id ON best_practice_docs TYPE string ASSERT string::len(\$value) > 0;
DEFINE FIELD type ON best_practice_docs TYPE string VALUE \"best_practice\" ASSERT \$value == \"best_practice\";
DEFINE FIELD title ON best_practice_docs TYPE string;
DEFINE FIELD description ON best_practice_docs TYPE string;
DEFINE FIELD category ON best_practice_docs TYPE string;
DEFINE FIELD relevance ON best_practice_docs TYPE number ASSERT \$value >= 0 AND \$value <= 100;
DEFINE FIELD tags ON best_practice_docs TYPE array;
DEFINE FIELD source ON best_practice_docs TYPE string;
DEFINE FIELD content ON best_practice_docs TYPE string;
DEFINE FIELD embedding ON best_practice_docs TYPE array ASSERT array::len(\$value) == 1536;
DEFINE FIELD source_path ON best_practice_docs TYPE string;
DEFINE FIELD indexed_at ON best_practice_docs TYPE datetime;
DEFINE FIELD metadata ON best_practice_docs TYPE object;

DEFINE INDEX idx_category ON best_practice_docs FIELDS category;
DEFINE INDEX idx_embedding ON best_practice_docs FIELDS embedding HNSW DIMENSION 1536 DIST COSINE;

-- Extension metadata documents table
DEFINE TABLE extension_docs SCHEMAFULL;
DEFINE FIELD id ON extension_docs TYPE string ASSERT string::len(\$value) > 0;
DEFINE FIELD type ON extension_docs TYPE string VALUE \"extension_metadata\" ASSERT \$value == \"extension_metadata\";
DEFINE FIELD name ON extension_docs TYPE string;
DEFINE FIELD version ON extension_docs TYPE string;
DEFINE FIELD category ON extension_docs TYPE string;
DEFINE FIELD description ON extension_docs TYPE string;
DEFINE FIELD dependencies ON extension_docs TYPE array;
DEFINE FIELD tags ON extension_docs TYPE array;
DEFINE FIELD best_practices ON extension_docs TYPE array;
DEFINE FIELD content ON extension_docs TYPE string;
DEFINE FIELD embedding ON extension_docs TYPE array ASSERT array::len(\$value) == 1536;
DEFINE FIELD source_path ON extension_docs TYPE string;
DEFINE FIELD indexed_at ON extension_docs TYPE datetime;
DEFINE FIELD dependency_count ON extension_docs TYPE number;
DEFINE FIELD initialization_order ON extension_docs TYPE number;
DEFINE FIELD metadata ON extension_docs TYPE object;

DEFINE INDEX idx_category ON extension_docs FIELDS category;
DEFINE INDEX idx_name ON extension_docs FIELDS name UNIQUE;
DEFINE INDEX idx_embedding ON extension_docs FIELDS embedding HNSW DIMENSION 1536 DIST COSINE;

-- Document relationships table (for graph traversal)
DEFINE TABLE doc_relationships SCHEMAFULL;
DEFINE FIELD source_id ON doc_relationships TYPE string;
DEFINE FIELD target_id ON doc_relationships TYPE string;
DEFINE FIELD relationship_type ON doc_relationships TYPE string;
DEFINE FIELD strength ON doc_relationships TYPE number ASSERT \$value >= 0 AND \$value <= 1;
DEFINE FIELD created_at ON doc_relationships TYPE datetime;

DEFINE INDEX idx_source ON doc_relationships FIELDS source_id;
DEFINE INDEX idx_target ON doc_relationships FIELDS target_id;
DEFINE INDEX idx_relationship ON doc_relationships FIELDS relationship_type;

-- Knowledge base metadata table
DEFINE TABLE kb_metadata SCHEMAFULL;
DEFINE FIELD namespace ON kb_metadata TYPE string;
DEFINE FIELD database ON kb_metadata TYPE string;
DEFINE FIELD total_documents ON kb_metadata TYPE number;
DEFINE FIELD embedding_dimension ON kb_metadata TYPE number;
DEFINE FIELD total_relationships ON kb_metadata TYPE number;
DEFINE FIELD last_indexed_at ON kb_metadata TYPE datetime;
DEFINE FIELD vector_index_status ON kb_metadata TYPE string;
DEFINE FIELD metadata ON kb_metadata TYPE object;"
}

# Group documents by category for statistics.
#
# Returns one record per distinct category (sorted), with the number of
# best practice documents and extension documents in that category.
def group_by_category [bp_docs: list, ext_docs: list] {
    let bp_cats = ($bp_docs | each { |doc| $doc.category } | sort | uniq)
    let ext_cats = ($ext_docs | each { |doc| $doc.category } | sort | uniq)
    let all_cats = ([$bp_cats, $ext_cats] | flatten | sort | uniq)

    $all_cats | each { |cat|
        {
            category: $cat,
            best_practice_count: ($bp_docs | where category == $cat | length),
            extension_count: ($ext_docs | where category == $cat | length),
        }
    }
}