# frozen_string_literal: true

module ClaudeMemory
  module Commands
    # Shows embedding configuration, lists available models, and validates setup.
    #
    # Subcommands:
    #   claude-memory embeddings        # Show current config
    #   claude-memory embeddings list   # List available models
    #   claude-memory embeddings check  # Validate current setup
    #
    # The provider reported here follows the same precedence as
    # Embeddings.resolve: CLAUDE_MEMORY_EMBEDDING_PROVIDER, then the provider
    # implied by a registry-known CLAUDE_MEMORY_EMBEDDING_MODEL, then "tfidf".
    class EmbeddingsCommand < BaseCommand
      DEFAULT_PROVIDER = "tfidf"

      # Dispatches to the subcommand named by the first argument.
      #
      # @param args [Array<String>] CLI arguments following "embeddings"
      # @return [Integer] exit status; 0 on success, 1 when `check` finds a
      #   problem. Unknown subcommands return the result of BaseCommand#failure.
      def call(args)
        # NOTE(review): parse_options comes from BaseCommand (not visible here);
        # it is assumed to return nil when option parsing fails.
        opts = parse_options(args, {}) do |_o|
          OptionParser.new do |parser|
            parser.banner = "Usage: claude-memory embeddings [list|check]"
          end
        end
        return 1 if opts.nil?

        subcommand = args.first

        case subcommand
        when "list" then list_models
        when "check" then check_setup
        when nil then show_config
        else
          failure("Unknown subcommand: #{subcommand}. Use: list, check")
        end
      end

      private

      def inspector
        @inspector ||= Embeddings::Inspector.new
      end

      # Provider that will actually be used for the given model setting.
      #
      # @param model [String, nil] value of CLAUDE_MEMORY_EMBEDDING_MODEL
      # @return [String] explicit env provider, else the provider the registry
      #   associates with the model, else "tfidf"
      def effective_provider(model)
        ENV["CLAUDE_MEMORY_EMBEDDING_PROVIDER"] ||
          (model && Embeddings::ModelRegistry.find(model)&.provider) ||
          DEFAULT_PROVIDER
      end

      # Prints the active provider/model, stored DB metadata and the ENV knobs.
      #
      # @return [Integer] always 0
      def show_config
        model = ENV["CLAUDE_MEMORY_EMBEDDING_MODEL"]
        provider = effective_provider(model)
        api_url = ENV["CLAUDE_MEMORY_EMBEDDING_API_URL"]

        stdout.puts "Embedding Configuration"
        stdout.puts "======================"
        stdout.puts "Provider: #{provider}"
        stdout.puts "Model: #{model || "(default)"}"

        if model
          info = Embeddings::ModelRegistry.find(model)
          if info
            stdout.puts "Dimensions: #{info.dimensions}"
            stdout.puts "Description: #{info.description}"
          else
            stdout.puts "Dimensions: (unknown - will be discovered at runtime)"
          end
        else
          info = Embeddings::ModelRegistry.default_for_provider(provider)
          if info
            stdout.puts "Default model: #{info.name}"
            stdout.puts "Dimensions: #{info.dimensions}"
          end
        end

        stdout.puts "API URL: #{api_url}" if api_url && provider == "api"

        inspector.database_states.each do |state|
          stdout.puts ""
          stdout.puts "#{state.label.capitalize} DB: provider=#{state.provider || "unknown"}, dimensions=#{state.dimensions || "unknown"}"
        end

        stdout.puts ""
        stdout.puts "ENV variables:"
        stdout.puts " CLAUDE_MEMORY_EMBEDDING_PROVIDER Provider (tfidf, fastembed, api)"
        stdout.puts " CLAUDE_MEMORY_EMBEDDING_MODEL Model name"
        stdout.puts " CLAUDE_MEMORY_EMBEDDING_API_KEY API key (for api provider)"
        stdout.puts " CLAUDE_MEMORY_EMBEDDING_API_URL API endpoint (for api provider)"
        0
      end

      # Prints every registry model grouped by provider.
      #
      # @return [Integer] always 0
      def list_models
        Embeddings::ModelRegistry.providers.each do |provider|
          stdout.puts ""
          stdout.puts "#{provider_label(provider)}:"
          stdout.puts "-" * 40

          Embeddings::ModelRegistry.models_for_provider(provider).each do |model|
            # Only list the facts that exist, so a model without a token limit
            # does not end its line with a dangling " | " separator.
            details = ["#{model.dimensions}-dim", model.size_mb ? "#{model.size_mb}MB" : "cloud"]
            details << "#{model.max_tokens} tokens" if model.max_tokens

            stdout.puts " #{model.name}"
            stdout.puts " #{details.join(" | ")}"
            stdout.puts " #{model.description}"
          end
        end

        stdout.puts ""
        stdout.puts "Custom models: Set CLAUDE_MEMORY_EMBEDDING_MODEL to any model"
        stdout.puts "supported by your provider. Dimensions are auto-detected."
        0
      end

      # Runs provider, model and stored-dimension checks and prints a summary.
      #
      # @return [Integer] 0 when every check passed, 1 otherwise
      def check_setup
        model_name = ENV["CLAUDE_MEMORY_EMBEDDING_MODEL"]
        provider_name = effective_provider(model_name)

        stdout.puts "Checking embedding setup..."
        stdout.puts ""

        # `&=` (not `&&=`) so every check still runs and prints after a failure.
        ok = true
        ok &= check_provider(provider_name)
        ok &= check_model(provider_name, model_name) if model_name
        ok &= render_dimension_checks(provider_name, model_name)

        stdout.puts ""
        stdout.puts ok ? "All checks passed." : "Some checks failed. See above."
        ok ? 0 : 1
      end

      # @param name [String] provider name
      # @return [Boolean] whether the provider is known and usable
      def check_provider(name)
        case name
        when "fastembed"
          check_fastembed
        when "api"
          check_api_config
        when "tfidf"
          stdout.puts " [OK] tfidf provider (built-in, always available)"
          true
        else
          stdout.puts " [FAIL] Unknown provider: #{name}"
          false
        end
      end

      # Reports whether the model is known and matches the provider.
      # Advisory only: a mismatch or unknown model warns but never fails.
      #
      # @return [true]
      def check_model(provider_name, model_name)
        info = Embeddings::ModelRegistry.find(model_name)
        if info
          if info.provider != provider_name
            stdout.puts " [WARN] Model '#{model_name}' is for '#{info.provider}' provider, but '#{provider_name}' is selected"
            stdout.puts " Set CLAUDE_MEMORY_EMBEDDING_PROVIDER=#{info.provider}"
          else
            stdout.puts " [OK] Model '#{model_name}' (#{info.dimensions}-dim)"
          end
        else
          stdout.puts " [INFO] Model '#{model_name}' not in registry (dimensions will be auto-detected)"
        end
        true
      end

      # Prints one line per database comparing stored and current dimensions.
      #
      # @return [Boolean] false when any database reports a mismatch
      def render_dimension_checks(provider_name, model_name)
        ok = true

        inspector.dimension_checks(provider_name, model_name).each do |check|
          case check.status
          when :mismatch
            stdout.puts " [WARN] #{check.label}: Dimension mismatch (stored: #{check.stored_dims}, current: #{check.current_dims})"
            stdout.puts " Re-index with: claude-memory index --force --scope #{check.label}"
            ok = false
          when :match
            stdout.puts " [OK] #{check.label}: #{check.stored_dims}-dim (provider: #{check.stored_provider || "unknown"})"
          when :fresh
            stdout.puts " [INFO] #{check.label}: No embeddings indexed yet"
          end
        end

        ok
      end

      # @return [Boolean] whether the optional fastembed gem can be loaded
      def check_fastembed
        require "fastembed" # deliberate lazy load: the gem is optional
        stdout.puts " [OK] fastembed gem available"
        true
      rescue LoadError
        stdout.puts " [FAIL] fastembed gem not installed"
        stdout.puts " Add `gem 'fastembed'` to your Gemfile"
        false
      end

      # Uses the same key precedence as ApiAdapter. A blank value is reported as
      # missing because an empty key cannot authenticate.
      #
      # @return [Boolean] whether a non-blank API key is configured
      def check_api_config
        key = ENV["CLAUDE_MEMORY_EMBEDDING_API_KEY"] || ENV["OPENAI_API_KEY"]
        if key.nil? || key.strip.empty?
          stdout.puts " [FAIL] No API key found"
          stdout.puts " Set CLAUDE_MEMORY_EMBEDDING_API_KEY or OPENAI_API_KEY"
          false
        else
          stdout.puts " [OK] API key configured"
          true
        end
      end

      # @param provider [String] provider name from the registry
      # @return [String] human-readable heading; falls back to the raw name so a
      #   provider without a label never renders as an empty heading
      def provider_label(provider)
        case provider
        when "fastembed" then "fastembed (local ONNX, no API key)"
        when "api" then "api (OpenAI-compatible endpoints, requires API key)"
        when "tfidf" then "tfidf (built-in, no dependencies)"
        else provider.to_s
        end
      end
    end
  end
end
diff --git a/lib/claude_memory/embeddings/fastembed_adapter.rb b/lib/claude_memory/embeddings/fastembed_adapter.rb index 5fa6036..04bd032 100644 --- a/lib/claude_memory/embeddings/fastembed_adapter.rb +++ b/lib/claude_memory/embeddings/fastembed_adapter.rb @@ -2,37 +2,50 @@ module ClaudeMemory module Embeddings - # Adapter wrapping fastembed-rb for high-quality local embeddings - # Uses BAAI/bge-small-en-v1.5 by default (384-dim, ~67MB ONNX model) + # Adapter wrapping fastembed-rb for high-quality local embeddings. + # Supports any model available in fastembed-rb's SUPPORTED_MODELS. # - # Implements the same generate(text) interface as Generator for DI compatibility. - # Supports asymmetric query/passage encoding for better retrieval accuracy. + # Model selection (in priority order): + # 1. Explicit model_name parameter + # 2. CLAUDE_MEMORY_EMBEDDING_MODEL env var + # 3. Default: BAAI/bge-small-en-v1.5 (384-dim, ~67MB ONNX) + # + # Dimensions are resolved from the ModelRegistry for known models, + # or probed from fastembed's ModelInfo for unknown models. 
# # Usage: # adapter = FastembedAdapter.new # query_vec = adapter.generate("What database?") # query encoding # passage_vec = adapter.generate_passage("Uses PostgreSQL") # passage encoding # + # # Use a larger model: + # adapter = FastembedAdapter.new(model_name: "BAAI/bge-base-en-v1.5") + # adapter.dimensions # => 768 + # class FastembedAdapter - EMBEDDING_DIM = 384 DEFAULT_MODEL = "BAAI/bge-small-en-v1.5" + attr_reader :model_name, :dimensions + def name = "fastembed" - def dimensions = EMBEDDING_DIM + def initialize(model_name: nil, env: ENV) + @model_name = model_name || env["CLAUDE_MEMORY_EMBEDDING_MODEL"] || DEFAULT_MODEL + @dimensions = resolve_dimensions(@model_name) - def initialize(model_name: DEFAULT_MODEL) require "fastembed" - @model = Fastembed::TextEmbedding.new(model_name: model_name) + @model = Fastembed::TextEmbedding.new(model_name: @model_name) + + # If dimensions weren't known from registry, probe from fastembed + @dimensions ||= probe_dimensions_from_fastembed rescue LoadError raise LoadError, "fastembed gem is required for FastembedAdapter. Add `gem 'fastembed'` to your Gemfile." end # Generate query embedding (optimized for search queries) - # Compatible with Recall's embedding_generator interface # @param text [String] query text to embed - # @return [Array] normalized 384-dimensional vector + # @return [Array] normalized embedding vector def generate(text) return zero_vector if text.nil? || text.empty? @@ -40,9 +53,8 @@ def generate(text) end # Generate passage embedding (optimized for document/fact indexing) - # Use this when storing embeddings for facts # @param text [String] passage text to embed - # @return [Array] normalized 384-dimensional vector + # @return [Array] normalized embedding vector def generate_passage(text) return zero_vector if text.nil? || text.empty? @@ -51,8 +63,26 @@ def generate_passage(text) private + # Resolve dimensions from the model registry (fast, no I/O). + # Returns nil if the model isn't in the registry. 
+ def resolve_dimensions(model) + ModelRegistry.dimensions_for(model) + end + + # Fallback: probe fastembed's SUPPORTED_MODELS for dimension info. + # This handles models added to fastembed-rb but not yet in our registry. + def probe_dimensions_from_fastembed + if defined?(Fastembed::SUPPORTED_MODELS) + info = Fastembed::SUPPORTED_MODELS[@model_name] + return info.dim if info + end + + # Last resort: generate a test embedding and measure its size + @model.query_embed("dimension probe").first.size + end + def zero_vector - Array.new(EMBEDDING_DIM, 0.0) + Array.new(@dimensions, 0.0) end end end diff --git a/lib/claude_memory/embeddings/inspector.rb b/lib/claude_memory/embeddings/inspector.rb new file mode 100644 index 0000000..00bc053 --- /dev/null +++ b/lib/claude_memory/embeddings/inspector.rb @@ -0,0 +1,91 @@ +# frozen_string_literal: true + +module ClaudeMemory + module Embeddings + # Reads embedding metadata from global and project databases. + # Returns structured data — no I/O formatting or stdout output. + # + # Used by EmbeddingsCommand to separate DB concerns from presentation. 
module ClaudeMemory
  module Embeddings
    # Reads embedding metadata from the global and project databases and
    # returns it as plain value objects. Performs no formatting or output, so
    # presentation stays in the calling command.
    class Inspector
      DatabaseState = Data.define(:label, :provider, :dimensions)
      DimensionResult = Data.define(:label, :status, :stored_dims, :stored_provider, :current_dims)

      # Embedding metadata recorded in each existing database.
      # Databases with neither a provider nor a dimensions entry are omitted.
      #
      # @return [Array<DatabaseState>] values exactly as returned by get_meta
      def database_states
        states = []

        with_each_store do |label, store|
          provider = store.get_meta("embedding_provider")
          dims = store.get_meta("embedding_dimensions")
          next unless provider || dims

          states << DatabaseState.new(label: label, provider: provider, dimensions: dims)
        end

        states
      end

      # Compares the dimensions stored in each existing database with the
      # dimensions the given provider/model would produce.
      #
      # @param provider_name [String] provider used when no model is given
      # @param model_name [String, nil] explicit model name, if any
      # @return [Array<DimensionResult>] one result per existing database with
      #   status :fresh (nothing stored), :mismatch, or :match. Unknown current
      #   dimensions count as :match because no conflict can be shown.
      def dimension_checks(provider_name, model_name)
        # Independent of the store being inspected, so resolve it once.
        current_dims = resolve_current_dimensions(provider_name, model_name)
        results = []

        with_each_store do |label, store|
          stored_dims = store.get_meta("embedding_dimensions")&.to_i
          stored_provider = store.get_meta("embedding_provider")

          results << dimension_result(label, stored_dims, stored_provider, current_dims)
        end

        results
      end

      private

      # Builds the result for one database from its stored metadata.
      def dimension_result(label, stored_dims, stored_provider, current_dims)
        unless stored_dims
          return DimensionResult.new(
            label: label, status: :fresh, stored_dims: nil, stored_provider: nil, current_dims: nil
          )
        end

        status = (current_dims && current_dims != stored_dims) ? :mismatch : :match

        DimensionResult.new(
          label: label,
          status: status,
          stored_dims: stored_dims,
          stored_provider: stored_provider,
          current_dims: current_dims
        )
      end

      # @return [Integer, nil] registry dimensions for the model, or for the
      #   provider's default model when no model is given; nil when unknown
      def resolve_current_dimensions(provider_name, model_name)
        if model_name
          ModelRegistry.dimensions_for(model_name)
        else
          ModelRegistry.default_for_provider(provider_name)&.dimensions
        end
      end

      # Yields ("global" | "project", store) for each database file that exists,
      # closing the store even when the block raises.
      def with_each_store
        config = Configuration.new

        [["global", config.global_db_path], ["project", config.project_db_path]].each do |label, path|
          # An unset path is skipped like a missing file; File.exist?(nil) would raise.
          next unless path && File.exist?(path)

          store = Store::SQLiteStore.new(path)
          begin
            yield label, store
          ensure
            store.close
          end
        end
      end
    end
  end
end
# frozen_string_literal: true

module ClaudeMemory
  module Embeddings
    # Registry of known embedding models with their properties.
    # Enables model validation, dimension lookup, and discoverability.
    #
    # Models are registered by canonical name (e.g., "BAAI/bge-small-en-v1.5")
    # with provider type, dimensions, and description.
    #
    # Usage:
    #   info = ModelRegistry.find("BAAI/bge-small-en-v1.5")
    #   # => #<data ModelInfo name="BAAI/bge-small-en-v1.5", provider="fastembed", dimensions=384, ...>
    #   info.dimensions # => 384
    #
    #   ModelRegistry.models_for_provider("fastembed")
    #   # => [#<data ModelInfo ...>, ...]
    #
    class ModelRegistry
      ModelInfo = Data.define(:name, :provider, :dimensions, :description, :size_mb, :max_tokens)

      # Known models with validated dimensions.
      # Fastembed models sourced from fastembed-rb SUPPORTED_MODELS.
      # API models sourced from provider documentation.
      #
      # Columns: name, provider, dimensions, description, size_mb, max_tokens.
      # size_mb is nil for hosted models; max_tokens is nil when not applicable.
      MODELS = [
        # --- fastembed: local ONNX models (no API key needed) ---
        ["BAAI/bge-small-en-v1.5", "fastembed", 384, "Fast English embedding (default)", 67, 512],
        ["BAAI/bge-base-en-v1.5", "fastembed", 768, "Balanced English embedding, higher accuracy", 210, 512],
        ["BAAI/bge-large-en-v1.5", "fastembed", 1024, "High accuracy English embedding", 1200, 512],
        ["sentence-transformers/all-MiniLM-L6-v2", "fastembed", 384, "Lightweight general-purpose sentence embedding", 90, 512],
        ["intfloat/multilingual-e5-small", "fastembed", 384, "Multilingual embedding, 100+ languages", 450, 512],
        ["intfloat/multilingual-e5-base", "fastembed", 768, "Larger multilingual embedding", 1110, 512],
        ["nomic-ai/nomic-embed-text-v1.5", "fastembed", 768, "Long context (8192 tokens) with Matryoshka support", 520, 8192],
        ["jinaai/jina-embeddings-v2-small-en", "fastembed", 512, "Small English embedding, 8192 token context", 60, 8192],
        ["jinaai/jina-embeddings-v2-base-en", "fastembed", 768, "Base English embedding, 8192 token context", 520, 8192],

        # --- api: OpenAI-compatible endpoints ---
        ["text-embedding-3-small", "api", 1536, "OpenAI small embedding (default API model)", nil, 8191],
        ["text-embedding-3-large", "api", 3072, "OpenAI large embedding, highest accuracy", nil, 8191],
        ["text-embedding-ada-002", "api", 1536, "OpenAI legacy embedding", nil, 8191],
        ["voyage-3", "api", 1024, "Voyage AI general-purpose embedding", nil, 32000],
        ["voyage-3-lite", "api", 512, "Voyage AI lightweight embedding", nil, 32000],
        ["voyage-code-3", "api", 1024, "Voyage AI code-optimized embedding", nil, 32000],

        # --- tfidf: built-in, no dependencies ---
        ["tfidf", "tfidf", 384, "Built-in TF-IDF embedding (no dependencies)", 0, nil]
      ].map { |row| ModelInfo.new(*row) }.freeze

      # Lookup indexes built once at load time so queries never rescan MODELS.
      MODELS_BY_NAME = MODELS.each_with_object({}) { |m, h| h[m.name] = m }.freeze
      MODELS_BY_PROVIDER = MODELS.group_by(&:provider).transform_values(&:freeze).freeze
      PROVIDERS = MODELS_BY_PROVIDER.keys.freeze

      DEFAULTS = {
        "fastembed" => "BAAI/bge-small-en-v1.5",
        "api" => "text-embedding-3-small",
        "tfidf" => "tfidf"
      }.freeze

      # Find a model by name.
      # @param name [String] model name (e.g., "BAAI/bge-small-en-v1.5")
      # @return [ModelInfo, nil] nil when the model is not registered
      def self.find(name)
        MODELS_BY_NAME[name]
      end

      # List all models for a given provider, in registration order.
      # @param provider [String] "fastembed", "api", or "tfidf"
      # @return [Array<ModelInfo>] a fresh array; empty for an unknown provider
      def self.models_for_provider(provider)
        (MODELS_BY_PROVIDER[provider] || []).dup
      end

      # All known model names, in registration order.
      # @return [Array<String>]
      def self.model_names
        MODELS_BY_NAME.keys
      end

      # All provider names, in order of first appearance in MODELS.
      # @return [Array<String>] a fresh array the caller may modify
      def self.providers
        PROVIDERS.dup
      end

      # Look up dimensions for a model name. Returns nil if unknown.
      # @param name [String] model name
      # @return [Integer, nil]
      def self.dimensions_for(name)
        find(name)&.dimensions
      end

      # Return the default ModelInfo for a provider.
      # @param provider [String] "fastembed", "api", or "tfidf"
      # @return [ModelInfo, nil] nil for a provider without a default
      def self.default_for_provider(provider)
        default_name = DEFAULTS[provider]
        find(default_name) if default_name
      end
    end
  end
end
module ClaudeMemory
  module Embeddings
    # Resolves an embedding provider by name, model, or ENV.
    #
    # Provider selection (in priority order):
    #   1. Explicit name parameter (String or Symbol)
    #   2. CLAUDE_MEMORY_EMBEDDING_PROVIDER env var
    #   3. Provider implied by a registry-known model, e.g.
    #      "BAAI/bge-small-en-v1.5" → fastembed, "text-embedding-3-small" → api
    #   4. Default: "tfidf"
    #
    # The model comes from the model parameter, falling back to
    # CLAUDE_MEMORY_EMBEDDING_MODEL, and is forwarded to the chosen adapter.
    #
    # Examples:
    #   Embeddings.resolve                                   # tfidf, unless ENV selects otherwise
    #   Embeddings.resolve("fastembed")                      # fastembed with default model
    #   Embeddings.resolve(:fastembed)                       # symbols are accepted too
    #   Embeddings.resolve("fastembed", model: "BAAI/bge-base-en-v1.5")
    #   Embeddings.resolve(model: "text-embedding-3-small")  # auto-detects api provider
    #
    # @param name [String, Symbol, nil] explicit provider name
    # @param model [String, nil] explicit model name
    # @param env [#[]] environment lookup (defaults to ENV)
    # @return [Generator, FastembedAdapter, ApiAdapter]
    # @raise [ArgumentError] when the provider name is not recognised
    def self.resolve(name = nil, model: nil, env: ENV)
      model ||= env["CLAUDE_MEMORY_EMBEDDING_MODEL"]
      provider = name || env["CLAUDE_MEMORY_EMBEDDING_PROVIDER"] || infer_provider(model) || "tfidf"

      # to_s lets callers pass :fastembed; without it a Symbol never matches the
      # String branches and raises "Unknown embedding provider: fastembed".
      case provider.to_s
      when "tfidf" then Generator.new
      when "fastembed" then FastembedAdapter.new(model_name: model, env: env)
      when "api" then ApiAdapter.new(model: model, env: env)
      else
        raise ArgumentError, "Unknown embedding provider: #{provider}. Available: tfidf, fastembed, api"
      end
    end

    # Infer provider from a model name using the registry.
    # Returns nil if no model is given or the model is unknown.
    def self.infer_provider(model)
      return nil unless model

      ModelRegistry.find(model)&.provider
    end
    private_class_method :infer_provider
  end
end
include("dimensions=384") + end + end + + describe "#call with 'list'" do + it "lists available models by provider" do + exit_code = command.call(["list"]) + expect(exit_code).to eq(0) + expect(stdout.string).to include("fastembed") + expect(stdout.string).to include("api") + expect(stdout.string).to include("tfidf") + expect(stdout.string).to include("BAAI/bge-small-en-v1.5") + expect(stdout.string).to include("text-embedding-3-small") + expect(stdout.string).to include("384-dim") + expect(stdout.string).to include("1536-dim") + end + end + + describe "#call with 'check'" do + it "validates tfidf setup (always passes)" do + exit_code = command.call(["check"]) + expect(exit_code).to eq(0) + expect(stdout.string).to include("tfidf provider") + expect(stdout.string).to include("All checks passed") + end + + it "detects dimension mismatch in existing database" do + # Create a database with 768-dim embeddings (simulating a provider switch) + store = ClaudeMemory::Store::SQLiteStore.new(project_db_path) + store.set_meta("embedding_dimensions", "768") + store.set_meta("embedding_provider", "fastembed") + store.close + + command.call(["check"]) + # tfidf default is 384 but DB has 768 → mismatch warning + expect(stdout.string).to include("Dimension mismatch") + expect(stdout.string).to include("stored: 768") + expect(stdout.string).to include("current: 384") + end + + it "reports OK when dimensions match" do + store = ClaudeMemory::Store::SQLiteStore.new(project_db_path) + store.set_meta("embedding_dimensions", "384") + store.set_meta("embedding_provider", "tfidf") + store.close + + exit_code = command.call(["check"]) + expect(exit_code).to eq(0) + expect(stdout.string).to include("[OK] project: 384-dim") + end + end + + describe "#call with unknown subcommand" do + it "returns error" do + exit_code = command.call(["bogus"]) + expect(exit_code).to eq(1) + expect(stderr.string).to include("Unknown subcommand: bogus") + end + end +end diff --git 
a/spec/claude_memory/embeddings/fastembed_adapter_spec.rb b/spec/claude_memory/embeddings/fastembed_adapter_spec.rb new file mode 100644 index 0000000..dd09398 --- /dev/null +++ b/spec/claude_memory/embeddings/fastembed_adapter_spec.rb @@ -0,0 +1,89 @@ +# frozen_string_literal: true + +require "spec_helper" +require_relative "../../support/shared_examples/embedding_provider" + +RSpec.describe ClaudeMemory::Embeddings::FastembedAdapter do + # Use real fastembed; skip if gem unavailable (e.g., model download fails in CI) + def build_adapter(**kwargs) + described_class.new(**kwargs) + rescue LoadError, StandardError => e + skip "fastembed not available: #{e.message}" + end + + subject { build_adapter(env: {}) } + + it_behaves_like "an embedding provider" + + describe "#initialize" do + it "uses default model when no config is given" do + adapter = build_adapter(env: {}) + expect(adapter.model_name).to eq("BAAI/bge-small-en-v1.5") + expect(adapter.dimensions).to eq(384) + end + + it "reads model from CLAUDE_MEMORY_EMBEDDING_MODEL env" do + # bge-base is a known registry model with different dimensions + adapter = build_adapter(env: {"CLAUDE_MEMORY_EMBEDDING_MODEL" => "BAAI/bge-base-en-v1.5"}) + expect(adapter.model_name).to eq("BAAI/bge-base-en-v1.5") + expect(adapter.dimensions).to eq(768) + end + + it "prefers explicit model_name over env" do + adapter = build_adapter( + model_name: "BAAI/bge-large-en-v1.5", + env: {"CLAUDE_MEMORY_EMBEDDING_MODEL" => "BAAI/bge-small-en-v1.5"} + ) + expect(adapter.model_name).to eq("BAAI/bge-large-en-v1.5") + expect(adapter.dimensions).to eq(1024) + end + + it "raises LoadError when fastembed gem is missing" do + # Stub require to raise LoadError before Fastembed constant is accessed + allow_any_instance_of(described_class).to receive(:require).with("fastembed").and_raise(LoadError) # rubocop:disable RSpec/AnyInstance + + expect { + described_class.new(env: {}) + }.to raise_error(LoadError, /fastembed gem is required/) + end + end + + 
describe "#name" do + it "returns 'fastembed'" do + adapter = build_adapter(env: {}) + expect(adapter.name).to eq("fastembed") + end + end + + describe "#generate" do + it "returns embedding vector with correct dimensions" do + adapter = build_adapter(env: {}) + result = adapter.generate("What database does this project use?") + expect(result).to be_an(Array) + expect(result.size).to eq(384) + expect(result).to all(be_a(Float)) + end + + it "returns zero vector for nil input" do + adapter = build_adapter(env: {}) + result = adapter.generate(nil) + expect(result).to eq(Array.new(adapter.dimensions, 0.0)) + end + + it "returns zero vector for empty input" do + adapter = build_adapter(env: {}) + result = adapter.generate("") + expect(result).to eq(Array.new(adapter.dimensions, 0.0)) + end + end + + describe "#generate_passage" do + it "returns passage embedding with correct dimensions" do + adapter = build_adapter(env: {}) + result = adapter.generate_passage("This project uses PostgreSQL for the main database") + expect(result).to be_an(Array) + expect(result.size).to eq(384) + expect(result).to all(be_a(Float)) + end + end +end diff --git a/spec/claude_memory/embeddings/inspector_spec.rb b/spec/claude_memory/embeddings/inspector_spec.rb new file mode 100644 index 0000000..5d594fb --- /dev/null +++ b/spec/claude_memory/embeddings/inspector_spec.rb @@ -0,0 +1,104 @@ +# frozen_string_literal: true + +require "spec_helper" +require "tmpdir" +require "fileutils" + +RSpec.describe ClaudeMemory::Embeddings::Inspector do + let(:global_db_path) { File.join(Dir.tmpdir, "inspector_global_#{Process.pid}.sqlite3") } + let(:project_db_path) { File.join(Dir.tmpdir, "inspector_project_#{Process.pid}.sqlite3") } + + before do + config = instance_double(ClaudeMemory::Configuration, + global_db_path: global_db_path, + project_db_path: project_db_path) + allow(ClaudeMemory::Configuration).to receive(:new).and_return(config) + end + + after do + FileUtils.rm_f(global_db_path) + 
FileUtils.rm_f(project_db_path) + end + + describe "#database_states" do + it "returns empty array when no databases exist" do + expect(described_class.new.database_states).to eq([]) + end + + it "returns metadata for databases with embedding info" do + store = ClaudeMemory::Store::SQLiteStore.new(global_db_path) + store.set_meta("embedding_provider", "tfidf") + store.set_meta("embedding_dimensions", "384") + store.close + + states = described_class.new.database_states + expect(states.size).to eq(1) + expect(states.first).to have_attributes( + label: "global", + provider: "tfidf", + dimensions: "384" + ) + end + + it "skips databases without embedding metadata" do + store = ClaudeMemory::Store::SQLiteStore.new(global_db_path) + store.set_meta("some_other_key", "value") + store.close + + expect(described_class.new.database_states).to eq([]) + end + + it "closes store even on error" do + store = ClaudeMemory::Store::SQLiteStore.new(project_db_path) + store.set_meta("embedding_provider", "tfidf") + store.close + + spy_store = ClaudeMemory::Store::SQLiteStore.new(project_db_path) + allow(spy_store).to receive(:get_meta).and_raise(RuntimeError, "db error") + allow(spy_store).to receive(:close).and_call_original + allow(ClaudeMemory::Store::SQLiteStore).to receive(:new).with(project_db_path).and_return(spy_store) + + expect { described_class.new.database_states }.to raise_error(RuntimeError, "db error") + expect(spy_store).to have_received(:close) + end + end + + describe "#dimension_checks" do + it "returns :fresh for databases without embeddings" do + store = ClaudeMemory::Store::SQLiteStore.new(project_db_path) + store.close + + checks = described_class.new.dimension_checks("tfidf", nil) + expect(checks.size).to eq(1) + expect(checks.first).to have_attributes(label: "project", status: :fresh) + end + + it "returns :match when dimensions agree" do + store = ClaudeMemory::Store::SQLiteStore.new(project_db_path) + store.set_meta("embedding_dimensions", "384") + 
store.set_meta("embedding_provider", "tfidf") + store.close + + checks = described_class.new.dimension_checks("tfidf", nil) + expect(checks.first).to have_attributes( + status: :match, + stored_dims: 384, + stored_provider: "tfidf" + ) + end + + it "returns :mismatch when dimensions differ" do + store = ClaudeMemory::Store::SQLiteStore.new(project_db_path) + store.set_meta("embedding_dimensions", "768") + store.set_meta("embedding_provider", "fastembed") + store.close + + checks = described_class.new.dimension_checks("tfidf", nil) + expect(checks.first).to have_attributes( + status: :mismatch, + stored_dims: 768, + current_dims: 384 + ) + end + end +end diff --git a/spec/claude_memory/embeddings/model_registry_spec.rb b/spec/claude_memory/embeddings/model_registry_spec.rb new file mode 100644 index 0000000..0f9a099 --- /dev/null +++ b/spec/claude_memory/embeddings/model_registry_spec.rb @@ -0,0 +1,116 @@ +# frozen_string_literal: true + +require "spec_helper" + +RSpec.describe ClaudeMemory::Embeddings::ModelRegistry do + describe ".find" do + it "returns ModelInfo for a known fastembed model" do + info = described_class.find("BAAI/bge-small-en-v1.5") + expect(info).not_to be_nil + expect(info.provider).to eq("fastembed") + expect(info.dimensions).to eq(384) + end + + it "returns ModelInfo for a known API model" do + info = described_class.find("text-embedding-3-small") + expect(info).not_to be_nil + expect(info.provider).to eq("api") + expect(info.dimensions).to eq(1536) + end + + it "returns ModelInfo for tfidf" do + info = described_class.find("tfidf") + expect(info).not_to be_nil + expect(info.provider).to eq("tfidf") + expect(info.dimensions).to eq(384) + end + + it "returns nil for unknown models" do + expect(described_class.find("unknown-model")).to be_nil + end + end + + describe ".models_for_provider" do + it "returns all fastembed models" do + models = described_class.models_for_provider("fastembed") + expect(models).not_to be_empty + expect(models).to 
all(have_attributes(provider: "fastembed")) + end + + it "returns all API models" do + models = described_class.models_for_provider("api") + expect(models).not_to be_empty + expect(models).to all(have_attributes(provider: "api")) + end + + it "returns empty array for unknown provider" do + expect(described_class.models_for_provider("unknown")).to eq([]) + end + end + + describe ".dimensions_for" do + it "returns dimensions for known models" do + expect(described_class.dimensions_for("BAAI/bge-base-en-v1.5")).to eq(768) + expect(described_class.dimensions_for("text-embedding-3-large")).to eq(3072) + end + + it "returns nil for unknown models" do + expect(described_class.dimensions_for("custom-model")).to be_nil + end + end + + describe ".providers" do + it "returns all provider names" do + providers = described_class.providers + expect(providers).to include("fastembed", "api", "tfidf") + end + end + + describe ".model_names" do + it "returns all model names" do + names = described_class.model_names + expect(names).to include("BAAI/bge-small-en-v1.5", "text-embedding-3-small", "tfidf") + end + end + + describe ".default_for_provider" do + it "returns the default fastembed model" do + info = described_class.default_for_provider("fastembed") + expect(info).not_to be_nil + expect(info.name).to eq("BAAI/bge-small-en-v1.5") + end + + it "returns the default api model" do + info = described_class.default_for_provider("api") + expect(info).not_to be_nil + expect(info.name).to eq("text-embedding-3-small") + end + + it "returns the default tfidf model" do + info = described_class.default_for_provider("tfidf") + expect(info).not_to be_nil + expect(info.name).to eq("tfidf") + end + + it "returns nil for unknown provider" do + expect(described_class.default_for_provider("unknown")).to be_nil + end + end + + describe "MODELS" do + it "all have required fields" do + described_class::MODELS.each do |model| + expect(model.name).to be_a(String) + expect(model.provider).to 
be_a(String) + expect(model.dimensions).to be_a(Integer) + expect(model.dimensions).to be > 0 + expect(model.description).to be_a(String) + end + end + + it "has unique model names" do + names = described_class::MODELS.map(&:name) + expect(names.uniq.size).to eq(names.size) + end + end +end diff --git a/spec/claude_memory/embeddings/resolver_spec.rb b/spec/claude_memory/embeddings/resolver_spec.rb index 968cb84..75347f7 100644 --- a/spec/claude_memory/embeddings/resolver_spec.rb +++ b/spec/claude_memory/embeddings/resolver_spec.rb @@ -4,10 +4,18 @@ RSpec.describe ClaudeMemory::Embeddings do describe ".resolve" do + # Helper to resolve fastembed providers, skipping when gem unavailable + def resolve_or_skip(...) + described_class.resolve(...) + rescue LoadError, StandardError => e + skip "fastembed not available: #{e.message}" + end + it "defaults to tfidf when no name or ENV is set" do provider = described_class.resolve(env: {}) expect(provider).to be_a(ClaudeMemory::Embeddings::Generator) expect(provider.name).to eq("tfidf") + expect(provider.dimensions).to eq(384) end it "resolves 'tfidf' by name" do @@ -16,13 +24,10 @@ end it "resolves 'fastembed' by name" do - # fastembed gem may not be installed; stub the require - stub_const("Fastembed::TextEmbedding", Class.new { define_method(:initialize) { |**_| } }) - allow_any_instance_of(ClaudeMemory::Embeddings::FastembedAdapter).to receive(:require).with("fastembed") - - provider = described_class.resolve("fastembed") + provider = resolve_or_skip("fastembed", env: {}) expect(provider).to be_a(ClaudeMemory::Embeddings::FastembedAdapter) expect(provider.name).to eq("fastembed") + expect(provider.dimensions).to eq(384) end it "resolves 'api' by name with API key" do @@ -52,5 +57,39 @@ described_class.resolve("unknown") }.to raise_error(ArgumentError, /Unknown embedding provider: unknown/) end + + context "with model: parameter" do + it "forwards model to fastembed adapter with correct dimensions" do + adapter = 
resolve_or_skip("fastembed", model: "BAAI/bge-base-en-v1.5", env: {}) + expect(adapter.model_name).to eq("BAAI/bge-base-en-v1.5") + expect(adapter.dimensions).to eq(768) + end + + it "forwards model to api adapter with registry dimensions" do + provider = described_class.resolve("api", model: "text-embedding-3-large", env: {"CLAUDE_MEMORY_EMBEDDING_API_KEY" => "k"}) + expect(provider.dimensions).to eq(3072) + end + + it "infers fastembed provider from model name" do + provider = resolve_or_skip(model: "BAAI/bge-small-en-v1.5", env: {}) + expect(provider).to be_a(ClaudeMemory::Embeddings::FastembedAdapter) + end + + it "infers api provider from api model name" do + provider = described_class.resolve(model: "text-embedding-3-small", env: {"CLAUDE_MEMORY_EMBEDDING_API_KEY" => "k"}) + expect(provider).to be_a(ClaudeMemory::Embeddings::ApiAdapter) + end + + it "reads model from CLAUDE_MEMORY_EMBEDDING_MODEL ENV and infers provider" do + provider = resolve_or_skip(env: {"CLAUDE_MEMORY_EMBEDDING_MODEL" => "BAAI/bge-base-en-v1.5"}) + expect(provider).to be_a(ClaudeMemory::Embeddings::FastembedAdapter) + expect(provider.dimensions).to eq(768) + end + + it "falls back to tfidf when model is unknown and no provider set" do + provider = described_class.resolve(model: "totally-unknown-model", env: {}) + expect(provider).to be_a(ClaudeMemory::Embeddings::Generator) + end + end end end