diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml
index 20cba03..6188c8d 100644
--- a/.github/workflows/test.yml
+++ b/.github/workflows/test.yml
@@ -84,7 +84,7 @@ jobs:
       - name: Start optillm server
         run: |
           echo "Starting optillm server for integration tests..."
-          OPTILLM_API_KEY=optillm python optillm.py --model google/gemma-3-270m-it --port 8000 &
+          OPTILLM_API_KEY=optillm python optillm.py --model Qwen/Qwen2.5-Coder-0.5B-Instruct --port 8000 &
           echo $! > server.pid
 
           # Wait for server to be ready
@@ -179,7 +179,7 @@ jobs:
           echo "Starting optillm server with conversation logging..."
           mkdir -p /tmp/optillm_conversations
           OPTILLM_API_KEY=optillm python optillm.py \
-            --model google/gemma-3-270m-it \
+            --model Qwen/Qwen2.5-Coder-0.5B-Instruct \
            --port 8000 \
             --log-conversations \
             --conversation-log-dir /tmp/optillm_conversations &
diff --git a/optillm/__init__.py b/optillm/__init__.py
index 1350db0..56c5d98 100644
--- a/optillm/__init__.py
+++ b/optillm/__init__.py
@@ -1,5 +1,5 @@
 # Version information
-__version__ = "0.3.14"
+__version__ = "0.3.15"
 
 # Import from server module
 from .server import (
diff --git a/optillm/plugins/compact_plugin.py b/optillm/plugins/compact_plugin.py
new file mode 100644
index 0000000..8eb06a5
--- /dev/null
+++ b/optillm/plugins/compact_plugin.py
@@ -0,0 +1,197 @@
+"""
+Compact plugin for OptiLLM.
+
+Automatically compresses conversation context when it exceeds a token budget,
+preserving recent turns verbatim and generating a structured summary of older
+content — inspired by Claude Code's compact mechanism.
+
+Uses one LLM call to produce a structured summary with:
+    Scope, Key decisions, User preferences, Pending work, Key files referenced.
+Recent turns are preserved verbatim.
+
+Composable with other approaches via the & operator: compact&moa, compact&bon, etc.
+
+Configuration (env vars or request_config):
+    COMPACT_CONTEXT_WINDOW / compact_context_window — max context tokens (default: 128000)
+    COMPACT_THRESHOLD / compact_threshold — trigger ratio 0.0-1.0 (default: 0.75)
+    COMPACT_KEEP_RECENT / compact_keep_recent — turns to preserve verbatim (default: 4)
+"""
+
+import os
+import re
+import logging
+from typing import Tuple, List, Optional
+
+logger = logging.getLogger(__name__)
+
+SLUG = "compact"
+
+DEFAULT_CONTEXT_WINDOW = 128000
+DEFAULT_THRESHOLD = 0.75
+DEFAULT_KEEP_RECENT = 4
+
+COMPACT_SYSTEM_PROMPT = """You are a conversation summarizer. Given a conversation history, produce a structured summary.
+
+Output ONLY this format, nothing else:
+
+<summary>
+Conversation summary:
+- Scope: {N} earlier messages compacted (user={U}, assistant={A}).
+- Key decisions: {list the main decisions or conclusions reached}
+- User preferences: {any stated preferences or constraints}
+- Pending work: {any remaining tasks or next steps mentioned}
+- Key files referenced: {file paths mentioned, if any}
+- Context: {a concise paragraph capturing the essential context needed to continue}
+</summary>
+
+Rules:
+- Be specific: include actual values, names, and file paths — not vague references
+- Be concise: each section should be 1-2 lines maximum
+- Omit pleasantries, greetings, and filler
+- The Context paragraph is the most important part — it should capture everything a new assistant would need to pick up where this left off"""
+
+
+def _get_config(request_config: Optional[dict], key: str, env_var: str, default):
+    val = None
+    if request_config:
+        val = request_config.get(key)
+    if val is None:
+        env_val = os.environ.get(env_var)
+        if env_val is not None:
+            try:
+                val = type(default)(env_val)
+            except (ValueError, TypeError):
+                logger.warning(f"Invalid value for {env_var}: {env_val!r}, using default {default}")
+                val = default
+    return val if val is not None else default
+
+
+def _get_context_window(client, model: str, request_config: Optional[dict]) -> int:
+    """Get context window size: try provider /models endpoint first, then config fallback."""
+    try:
+        model_info = client.models.retrieve(model)
+        for attr in ("context_length", "max_context_length", "context_window",
+                     "max_model_length", "max_position_embeddings"):
+            val = getattr(model_info, attr, None)
+            if val is not None:
+                return int(val)
+    except Exception:
+        pass
+
+    return _get_config(request_config, "compact_context_window", "COMPACT_CONTEXT_WINDOW", DEFAULT_CONTEXT_WINDOW)
+
+
+def estimate_tokens(text: str) -> int:
+    try:
+        import tiktoken
+        enc = tiktoken.encoding_for_model("gpt-4")
+        return len(enc.encode(text))
+    except (ImportError, KeyError):
+        # Rough fallback heuristic: ~4 characters per token
+        return max(1, len(text) // 4)
+
+
+def parse_tagged_conversation(text: str) -> List[Tuple[str, str]]:
+    turns = []
+    for match in re.finditer(r'^(User:|Assistant:)\s*', text, re.MULTILINE):
+        role = "user" if match.group(1) == "User:" else "assistant"
+        start = match.end()
+        next_match = re.search(r'^(User:|Assistant:)', text[start:], re.MULTILINE)
+        if next_match:
+            content = text[start:start + next_match.start()].strip()
+        else:
+            content = text[start:].strip()
+        turns.append((role, content))
+    return turns
+
+
+def reconstruct_tagged(turns: List[Tuple[str, str]]) -> str:
+    lines = []
+    for role, content in turns:
+        tag = "User:" if role == "user" else "Assistant:"
+        lines.append(f"{tag} {content}")
+    return "\n".join(lines)
+
+
+def compress_with_llm(
+    older_turns: List[Tuple[str, str]],
+    system_prompt: str,
+    client,
+    model: str,
+) -> Tuple[Optional[str], int]:
+    conversation_text = reconstruct_tagged(older_turns)
+
+    system_content = COMPACT_SYSTEM_PROMPT
+    if system_prompt:
+        system_content += f"\n\nOriginal system context: {system_prompt}"
+
+    messages = [
+        {"role": "system", "content": system_content},
+        {"role": "user", "content": conversation_text},
+    ]
+
+    try:
+        response = client.chat.completions.create(
+            model=model,
+            messages=messages,
+            max_tokens=2000,
+            temperature=0.3,
+        )
+    except Exception as e:
+        logger.error(f"Compact: LLM compression failed: {e}")
+        return None, 0
+
+    raw = response.choices[0].message.content.strip()
+    tokens_used = response.usage.completion_tokens if response.usage else 0
+
+    match = re.search(r'<summary>(.*?)</summary>', raw, re.DOTALL)
+    if match:
+        summary = match.group(1).strip()
+    else:
+        summary = raw
+
+    return summary, tokens_used
+
+
+def run(
+    system_prompt: str,
+    initial_query: str,
+    client,
+    model: str,
+    request_config: Optional[dict] = None,
+) -> Tuple[str, int]:
+    context_window = _get_context_window(client, model, request_config)
+    threshold = _get_config(request_config, "compact_threshold", "COMPACT_THRESHOLD", DEFAULT_THRESHOLD)
+    keep_recent = _get_config(request_config, "compact_keep_recent", "COMPACT_KEEP_RECENT", DEFAULT_KEEP_RECENT)
+
+    token_count = estimate_tokens(initial_query)
+    budget = int(context_window * threshold)
+
+    if token_count < budget:
+        logger.debug(f"Compact: passthrough ({token_count} tokens < {budget} budget)")
+        return initial_query, 0
+
+    turns = parse_tagged_conversation(initial_query)
+    if len(turns) <= keep_recent:
+        logger.debug(f"Compact: too few turns to compress ({len(turns)} <= {keep_recent})")
+        return initial_query, 0
+
+    split_idx = len(turns) - keep_recent
+    older_turns = turns[:split_idx]
+    recent_turns = turns[split_idx:]
+
+    logger.info(f"Compact: compressing {len(older_turns)} older turns, keeping {len(recent_turns)} recent")
+
+    summary, tokens_used = compress_with_llm(older_turns, system_prompt, client, model)
+
+    if summary is None:
+        logger.warning("Compact: compression failed, returning original query")
+        return initial_query, 0
+
+    compressed_turns = [("user", f"[Conversation summary]:\n{summary}")]
+    compressed_turns.extend(recent_turns)
+
+    result = reconstruct_tagged(compressed_turns)
+    new_token_count = estimate_tokens(result)
+    logger.info(f"Compact: {token_count} -> {new_token_count} tokens (used {tokens_used} for compression)")
+
+    return result, tokens_used
diff --git a/optillm/plugins/json_plugin.py b/optillm/plugins/json_plugin.py
index 07a76ab..c686859 100644
--- a/optillm/plugins/json_plugin.py
+++ b/optillm/plugins/json_plugin.py
@@ -22,7 +22,7 @@ def get_device(self):
         else:
             return torch.device("cpu")
 
-    def __init__(self, model_name: str = "google/gemma-3-270m-it"):
+    def __init__(self, model_name: str = "Qwen/Qwen2.5-Coder-0.5B-Instruct"):
         """Initialize the JSON generator with a specific model."""
         self.device = self.get_device()
         logger.info(f"Using device: {self.device}")
diff --git a/pyproject.toml b/pyproject.toml
index 81a5b0f..94f9a19 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
 
 [project]
 name = "optillm"
-version = "0.3.14"
+version = "0.3.15"
 description = "An optimizing inference proxy for LLMs."
readme = "README.md" license = "Apache-2.0" diff --git a/tests/test_compact_plugin.py b/tests/test_compact_plugin.py new file mode 100644 index 0000000..acd1cfb --- /dev/null +++ b/tests/test_compact_plugin.py @@ -0,0 +1,303 @@ +"""Tests for compact_plugin.""" + +import os +import pytest +from unittest.mock import MagicMock, patch +from optillm.plugins.compact_plugin import ( + estimate_tokens, + parse_tagged_conversation, + reconstruct_tagged, + _get_config, + run, +) + + +def _make_client(summary_text="Test summary with Key decisions: decided X."): + mock_response = MagicMock() + mock_response.choices = [MagicMock()] + mock_response.choices[0].message.content = summary_text + mock_response.usage.completion_tokens = 80 + client = MagicMock() + client.chat.completions.create.return_value = mock_response + client.models.retrieve.return_value = MagicMock(spec=[]) + return client + + +class TestEstimateTokens: + def test_short_text(self): + assert estimate_tokens("hello") >= 1 + + def test_empty_string(self): + assert estimate_tokens("") == 0 + + def test_long_text_returns_positive(self): + assert estimate_tokens("a" * 400) > 0 + + +class TestParseTaggedConversation: + def test_single_user_message(self): + assert parse_tagged_conversation("User: hello") == [("user", "hello")] + + def test_conversation_pair(self): + assert parse_tagged_conversation("User: hi\nAssistant: hello") == [("user", "hi"), ("assistant", "hello")] + + def test_multi_turn(self): + turns = parse_tagged_conversation("User: q1\nAssistant: a1\nUser: q2\nAssistant: a2") + assert len(turns) == 4 + assert turns[0] == ("user", "q1") + assert turns[3] == ("assistant", "a2") + + def test_empty_string(self): + assert parse_tagged_conversation("") == [] + + def test_no_tags(self): + assert parse_tagged_conversation("just some text") == [] + + +class TestReconstructTagged: + def test_roundtrip(self): + result = reconstruct_tagged([("user", "hello"), ("assistant", "world")]) + assert "User: hello" in result + assert "Assistant: world" in result + + def test_empty(self): + assert reconstruct_tagged([]) == "" + + +class TestGetConfig: + def test_default_when_nothing_set(self): + assert _get_config(None, "key", "NONEXISTENT_VAR", 42) == 42 + + def test_request_config_takes_priority(self): + assert _get_config({"compact_threshold": 0.5}, "compact_threshold", "COMPACT_THRESHOLD", 0.75) == 0.5 + + def test_env_var_as_fallback(self): + with patch.dict(os.environ, {"COMPACT_KEEP_RECENT": "6"}): + assert _get_config(None, "compact_keep_recent", "COMPACT_KEEP_RECENT", 4) == 6 + + def test_request_config_overrides_env(self): + with patch.dict(os.environ, {"COMPACT_KEEP_RECENT": "6"}): + assert _get_config({"compact_keep_recent": 2}, "compact_keep_recent", "COMPACT_KEEP_RECENT", 4) == 2 + + +class TestRun: + def test_passthrough_short_conversation(self): + query = "User: hi\nAssistant: hello" + result, tokens = run("system", query, _make_client(), "gpt-4") + assert result == query + assert tokens == 0 + + def test_passthrough_few_turns(self): + turns = [] + for i in range(3): + turns.append(f"User: question {i}") + turns.append(f"Assistant: answer {i}") + query = "\n".join(turns) + result, tokens = run("system", query, _make_client(), "gpt-4", + request_config={"compact_context_window": 100, "compact_threshold": 0.5}) + assert result == query + assert tokens == 0 + + def test_compression_triggered_uses_llm(self): + turns = [] + for i in range(20): + turns.append(f"User: this is a longer question number {i} with extra text to increase token 
count") + turns.append(f"Assistant: this is a longer answer number {i} with extra text to increase token count") + query = "\n".join(turns) + + client = _make_client("\nConversation summary:\n- Scope: 36 messages.\n- Key decisions: decided to use compact.\n- Context: user was testing compression.\n") + result, tokens = run("system", query, client, "gpt-4", + request_config={"compact_context_window": 200, "compact_threshold": 0.3, + "compact_keep_recent": 4}) + + assert tokens == 80 # LLM was called + assert "[Conversation summary]" in result + assert "Scope:" in result + assert "question number 19" in result # last user turn preserved + client.chat.completions.create.assert_called_once() + + def test_structured_summary_format(self): + turns = [] + for i in range(20): + turns.append(f"User: question {i} " + "x" * 100) + turns.append(f"Assistant: answer {i} " + "y" * 100) + query = "\n".join(turns) + + summary_text = "\nConversation summary:\n- Scope: 36 messages.\n- Key decisions: use plugin.\n- Key files: src/main.py.\n- Context: testing.\n" + client = _make_client(summary_text) + result, tokens = run("system", query, client, "gpt-4", + request_config={"compact_context_window": 200, "compact_threshold": 0.1, + "compact_keep_recent": 2}) + + assert "Key decisions:" in result + assert "Key files:" in result + + def test_output_preserves_tag_format(self): + turns = [] + for i in range(10): + turns.append(f"User: question {i} " + "x" * 100) + turns.append(f"Assistant: answer {i} " + "y" * 100) + query = "\n".join(turns) + + client = _make_client() + result, tokens = run("system", query, client, "gpt-4", + request_config={"compact_context_window": 200, "compact_threshold": 0.1, + "compact_keep_recent": 2}) + + parsed = parse_tagged_conversation(result) + assert len(parsed) >= 2 + assert parsed[0][0] == "user" + assert "[Conversation summary]" in parsed[0][1] + + def test_env_var_configuration(self): + turns = [] + for i in range(10): + turns.append(f"User: question {i} " + "x" * 100) + turns.append(f"Assistant: answer {i} " + "y" * 100) + query = "\n".join(turns) + + client = _make_client() + with patch.dict(os.environ, {"COMPACT_CONTEXT_WINDOW": "200", "COMPACT_THRESHOLD": "0.1", + "COMPACT_KEEP_RECENT": "2"}): + result, tokens = run("system", query, client, "gpt-4") + + assert tokens == 80 + assert "[Conversation summary]" in result + + def test_llm_failure_falls_back_to_passthrough(self): + turns = [] + for i in range(10): + turns.append(f"User: question {i} " + "x" * 100) + turns.append(f"Assistant: answer {i} " + "y" * 100) + query = "\n".join(turns) + + client = MagicMock() + client.chat.completions.create.side_effect = Exception("API error") + + result, tokens = run("system", query, client, "gpt-4", + request_config={"compact_context_window": 200, "compact_threshold": 0.1, + "compact_keep_recent": 2}) + assert result == query + assert tokens == 0 + + def test_summary_tag_extraction(self): + turns = [] + for i in range(10): + turns.append(f"User: question {i} " + "x" * 100) + turns.append(f"Assistant: answer {i} " + "y" * 100) + query = "\n".join(turns) + + raw_llm_output = "Here is the summary:\n\nConversation summary:\n- Scope: 10 messages.\n- Key decisions: decided X.\n\nHope this helps!" + client = _make_client(summary_text=raw_llm_output) + + result, tokens = run("system", query, client, "gpt-4", + request_config={"compact_context_window": 200, "compact_threshold": 0.1, + "compact_keep_recent": 2}) + + assert "Here is the summary:" not in result + assert "Hope this helps!" 
+        assert "Scope:" in result
+        assert "Key decisions:" in result
+
+    def test_summary_without_tags_used_raw(self):
+        turns = []
+        for i in range(10):
+            turns.append(f"User: question {i} " + "x" * 100)
+            turns.append(f"Assistant: answer {i} " + "y" * 100)
+        query = "\n".join(turns)
+
+        client = _make_client(summary_text="Plain summary without XML tags. Key decisions: use compact.")
+
+        result, tokens = run("system", query, client, "gpt-4",
+                             request_config={"compact_context_window": 200, "compact_threshold": 0.1,
+                                             "compact_keep_recent": 2})
+
+        assert "Plain summary" in result
+        assert tokens == 80
+
+    def test_system_prompt_included_in_compression(self):
+        turns = []
+        for i in range(10):
+            turns.append(f"User: question {i} " + "x" * 100)
+            turns.append(f"Assistant: answer {i} " + "y" * 100)
+        query = "\n".join(turns)
+
+        client = _make_client()
+        result, tokens = run("You are a medical coding assistant", query, client, "gpt-4",
+                             request_config={"compact_context_window": 200, "compact_threshold": 0.1,
+                                             "compact_keep_recent": 2})
+
+        call_args = client.chat.completions.create.call_args
+        system_content = call_args.kwargs["messages"][0]["content"]
+        assert "medical coding assistant" in system_content
+
+    def test_keep_recent_exceeds_turn_count(self):
+        query = "User: hi\nAssistant: hello"
+        client = _make_client()
+        result, tokens = run("system", query, client, "gpt-4",
+                             request_config={"compact_context_window": 1, "compact_threshold": 0.1,
+                                             "compact_keep_recent": 100})
+        assert result == query
+        assert tokens == 0
+
+    def test_threshold_zero_always_triggers(self):
+        turns = []
+        for i in range(10):
+            turns.append(f"User: question {i} " + "x" * 100)
+            turns.append(f"Assistant: answer {i} " + "y" * 100)
+        query = "\n".join(turns)
+
+        client = _make_client()
+        result, tokens = run("system", query, client, "gpt-4",
+                             request_config={"compact_context_window": 200, "compact_threshold": 0.0,
+                                             "compact_keep_recent": 2})
+
+        assert tokens == 80
+        assert "[Conversation summary]" in result
+
+    def test_malformed_env_var_uses_default(self):
+        with patch.dict(os.environ, {"COMPACT_THRESHOLD": "not_a_number"}):
+            val = _get_config(None, "compact_threshold", "COMPACT_THRESHOLD", 0.75)
+        assert val == 0.75
+
+    def test_embedded_tags_not_split(self):
+        text = "User: I asked my friend: User: what is Python?\nAssistant: Here is the answer"
+        turns = parse_tagged_conversation(text)
+        assert len(turns) == 2
+        assert "friend: User: what is Python?" in turns[0][1]
+
+
+class TestGetContextWindow:
+    def test_provider_returns_context_length(self):
+        from optillm.plugins.compact_plugin import _get_context_window
+        model_info = MagicMock()
+        model_info.context_length = 32768
+        client = MagicMock()
+        client.models.retrieve.return_value = model_info
+        result = _get_context_window(client, "test-model", None)
+        assert result == 32768
+
+    def test_provider_returns_max_context_length(self):
+        from optillm.plugins.compact_plugin import _get_context_window
+        model_info = MagicMock()
+        model_info.context_length = None
+        model_info.max_context_length = 65536
+        client = MagicMock()
+        client.models.retrieve.return_value = model_info
+        result = _get_context_window(client, "test-model", None)
+        assert result == 65536
+
+    def test_provider_no_context_info_falls_back_to_config(self):
+        from optillm.plugins.compact_plugin import _get_context_window
+        model_info = MagicMock(spec=[])
+        client = MagicMock()
+        client.models.retrieve.return_value = model_info
+        result = _get_context_window(client, "test-model", {"compact_context_window": 50000})
+        assert result == 50000
+
+    def test_provider_api_fails_falls_back_to_default(self):
+        from optillm.plugins.compact_plugin import _get_context_window
+        client = MagicMock()
+        client.models.retrieve.side_effect = Exception("not supported")
+        result = _get_context_window(client, "test-model", None)
+        assert result == 128000
diff --git a/tests/test_conversation_logging_server.py b/tests/test_conversation_logging_server.py
index ef22803..106c302 100644
--- a/tests/test_conversation_logging_server.py
+++ b/tests/test_conversation_logging_server.py
@@ -1,7 +1,7 @@
 #!/usr/bin/env python3
 """
 Server-based integration tests for conversation logging with real model
-Tests conversation logging with actual OptILLM server and google/gemma-3-270m-it model
+Tests conversation logging with actual OptILLM server and Qwen/Qwen2.5-Coder-0.5B-Instruct model
 """
 
 import unittest
diff --git a/tests/test_plugins.py b/tests/test_plugins.py
index 3f55c38..d81e334 100644
--- a/tests/test_plugins.py
+++ b/tests/test_plugins.py
@@ -32,7 +32,8 @@ def test_plugin_module_imports():
         'optillm.plugins.longcepo_plugin',
         'optillm.plugins.spl_plugin',
         'optillm.plugins.proxy_plugin',
-        'optillm.plugins.mcp_plugin'
+        'optillm.plugins.mcp_plugin',
+        'optillm.plugins.compact_plugin'
     ]
 
     for module_name in plugin_modules:
@@ -53,7 +54,7 @@ def test_plugin_approach_detection():
     load_plugins()
 
     # Check if known plugins are loaded
-    expected_plugins = ["memory", "readurls", "privacy", "web_search", "deep_research", "deepthink", "longcepo", "spl", "proxy", "mcp"]
+    expected_plugins = ["memory", "readurls", "privacy", "web_search", "deep_research", "deepthink", "longcepo", "spl", "proxy", "mcp", "compact"]
 
     for plugin_name in expected_plugins:
         assert plugin_name in plugin_approaches, f"Plugin {plugin_name} not loaded"
@@ -304,6 +305,14 @@ def test_mcp_plugin():
     assert plugin.SLUG == "mcp"
 
 
+def test_compact_plugin():
+    """Test compact plugin module"""
+    import optillm.plugins.compact_plugin as plugin
+    assert hasattr(plugin, 'run')
+    assert hasattr(plugin, 'SLUG')
+    assert plugin.SLUG == "compact"
+
+
 def test_plugin_subdirectory_imports():
     """Test all plugins with subdirectories can import their submodules"""
     # Test deep_research
diff --git a/tests/test_utils.py b/tests/test_utils.py
index 30039a9..c31162b 100644
--- a/tests/test_utils.py
+++ b/tests/test_utils.py
@@ -12,8 +12,8 @@ from openai import OpenAI
 
 # Standard test model for all tests - small and fast
-TEST_MODEL = "google/gemma-3-270m-it" -TEST_MODEL_MLX = "mlx-community/gemma-3-270m-it-bf16" +TEST_MODEL = "Qwen/Qwen2.5-Coder-0.5B-Instruct" +TEST_MODEL_MLX = "mlx-community/Qwen2.5-Coder-0.5B-Instruct-bf16" def setup_test_env(): """Set up test environment with local inference"""