diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml
index 20cba03..6188c8d 100644
--- a/.github/workflows/test.yml
+++ b/.github/workflows/test.yml
@@ -84,7 +84,7 @@ jobs:
       - name: Start optillm server
         run: |
           echo "Starting optillm server for integration tests..."
-          OPTILLM_API_KEY=optillm python optillm.py --model google/gemma-3-270m-it --port 8000 &
+          OPTILLM_API_KEY=optillm python optillm.py --model Qwen/Qwen2.5-Coder-0.5B-Instruct --port 8000 &
           echo $! > server.pid
           # Wait for server to be ready
@@ -179,7 +179,7 @@ jobs:
echo "Starting optillm server with conversation logging..."
mkdir -p /tmp/optillm_conversations
OPTILLM_API_KEY=optillm python optillm.py \
- --model google/gemma-3-270m-it \
+ --model Qwen/Qwen2.5-Coder-0.5B-Instruct \
--port 8000 \
--log-conversations \
--conversation-log-dir /tmp/optillm_conversations &
diff --git a/optillm/__init__.py b/optillm/__init__.py
index 1350db0..56c5d98 100644
--- a/optillm/__init__.py
+++ b/optillm/__init__.py
@@ -1,5 +1,5 @@
 # Version information
-__version__ = "0.3.14"
+__version__ = "0.3.15"
 
 # Import from server module
 from .server import (
diff --git a/optillm/plugins/compact_plugin.py b/optillm/plugins/compact_plugin.py
new file mode 100644
index 0000000..8eb06a5
--- /dev/null
+++ b/optillm/plugins/compact_plugin.py
@@ -0,0 +1,197 @@
+"""
+Compact plugin for OptiLLM.
+
+Automatically compresses conversation context when it exceeds a token budget,
+preserving recent turns verbatim and generating a structured summary of older
+content — inspired by Claude Code's compact mechanism.
+
+Uses one LLM call to produce a structured summary with:
+ Scope, Key decisions, User preferences, Pending work, Key files referenced.
+Recent turns are preserved verbatim.
+
+Composable with other approaches via & operator: compact&moa, compact&bon, etc.
+
+Configuration (env vars or request_config):
+ COMPACT_CONTEXT_WINDOW / compact_context_window — max context tokens (default: 128000)
+ COMPACT_THRESHOLD / compact_threshold — trigger ratio 0.0-1.0 (default: 0.75)
+ COMPACT_KEEP_RECENT / compact_keep_recent — turns to preserve verbatim (default: 4)
+"""
+
+import os
+import re
+import logging
+from typing import Tuple, List, Optional
+
+logger = logging.getLogger(__name__)
+
+SLUG = "compact"
+
+DEFAULT_CONTEXT_WINDOW = 128000
+DEFAULT_THRESHOLD = 0.75
+DEFAULT_KEEP_RECENT = 4
+
+COMPACT_SYSTEM_PROMPT = """You are a conversation summarizer. Given a conversation history, produce a structured summary.
+
+Output ONLY this format, nothing else:
+
+<summary>
+Conversation summary:
+- Scope: {N} earlier messages compacted (user={U}, assistant={A}).
+- Key decisions: {list the main decisions or conclusions reached}
+- User preferences: {any stated preferences or constraints}
+- Pending work: {any remaining tasks or next steps mentioned}
+- Key files referenced: {file paths mentioned, if any}
+- Context: {a concise paragraph capturing the essential context needed to continue}
+</summary>
+
+Rules:
+- Be specific: include actual values, names, and file paths — not vague references
+- Be concise: each section should be 1-2 lines maximum
+- Omit pleasantries, greetings, and filler
+- The Context paragraph is the most important part — it should capture everything a new assistant would need to pick up where this left off"""
+
+
+def _get_config(request_config: Optional[dict], key: str, env_var: str, default):
+ val = None
+ if request_config:
+ val = request_config.get(key)
+ if val is None:
+ env_val = os.environ.get(env_var)
+ if env_val is not None:
+ try:
+ val = type(default)(env_val)
+ except (ValueError, TypeError):
+ logger.warning(f"Invalid value for {env_var}: {env_val!r}, using default {default}")
+ val = default
+ return val if val is not None else default
+
+
+def _get_context_window(client, model: str, request_config: Optional[dict]) -> int:
+ """Get context window size: try provider /models endpoint first, then config fallback."""
+ try:
+ model_info = client.models.retrieve(model)
+ for attr in ("context_length", "max_context_length", "context_window",
+ "max_model_length", "max_position_embeddings"):
+ val = getattr(model_info, attr, None)
+ if val is not None:
+ return int(val)
+ except Exception:
+ pass
+
+ return _get_config(request_config, "compact_context_window", "COMPACT_CONTEXT_WINDOW", DEFAULT_CONTEXT_WINDOW)
+
+
+def estimate_tokens(text: str) -> int:
+ try:
+ import tiktoken
+ enc = tiktoken.encoding_for_model("gpt-4")
+ return len(enc.encode(text))
+ except (ImportError, KeyError):
+        return max(1, len(text) // 4)  # fallback heuristic: ~4 chars per token
+
+
+def parse_tagged_conversation(text: str) -> List[Tuple[str, str]]:
+ turns = []
+ for match in re.finditer(r'^(User:|Assistant:)\s*', text, re.MULTILINE):
+ role = "user" if match.group(1) == "User:" else "assistant"
+ start = match.end()
+ next_match = re.search(r'^(User:|Assistant:)', text[start:], re.MULTILINE)
+ if next_match:
+ content = text[start:start + next_match.start()].strip()
+ else:
+ content = text[start:].strip()
+ turns.append((role, content))
+ return turns
+
+
+def reconstruct_tagged(turns: List[Tuple[str, str]]) -> str:
+ lines = []
+ for role, content in turns:
+ tag = "User:" if role == "user" else "Assistant:"
+ lines.append(f"{tag} {content}")
+ return "\n".join(lines)
+
+
+def compress_with_llm(
+ older_turns: List[Tuple[str, str]],
+ system_prompt: str,
+ client,
+ model: str,
+) -> Tuple[Optional[str], int]:
+ conversation_text = reconstruct_tagged(older_turns)
+
+ system_content = COMPACT_SYSTEM_PROMPT
+ if system_prompt:
+ system_content += f"\n\nOriginal system context: {system_prompt}"
+
+ messages = [
+ {"role": "system", "content": system_content},
+ {"role": "user", "content": conversation_text},
+ ]
+
+ try:
+ response = client.chat.completions.create(
+ model=model,
+ messages=messages,
+ max_tokens=2000,
+ temperature=0.3,
+ )
+ except Exception as e:
+ logger.error(f"Compact: LLM compression failed: {e}")
+ return None, 0
+
+ raw = response.choices[0].message.content.strip()
+ tokens_used = response.usage.completion_tokens if response.usage else 0
+
+    match = re.search(r'<summary>(.*?)</summary>', raw, re.DOTALL)
+ if match:
+ summary = match.group(1).strip()
+ else:
+ summary = raw
+
+ return summary, tokens_used
+
+
+def run(
+ system_prompt: str,
+ initial_query: str,
+ client,
+ model: str,
+ request_config: Optional[dict] = None,
+) -> Tuple[str, int]:
+ context_window = _get_context_window(client, model, request_config)
+ threshold = _get_config(request_config, "compact_threshold", "COMPACT_THRESHOLD", DEFAULT_THRESHOLD)
+ keep_recent = _get_config(request_config, "compact_keep_recent", "COMPACT_KEEP_RECENT", DEFAULT_KEEP_RECENT)
+
+ token_count = estimate_tokens(initial_query)
+ budget = int(context_window * threshold)
+
+ if token_count < budget:
+ logger.debug(f"Compact: passthrough ({token_count} tokens < {budget} budget)")
+ return initial_query, 0
+
+ turns = parse_tagged_conversation(initial_query)
+ if len(turns) <= keep_recent:
+ logger.debug(f"Compact: too few turns to compress ({len(turns)} <= {keep_recent})")
+ return initial_query, 0
+
+ split_idx = len(turns) - keep_recent
+ older_turns = turns[:split_idx]
+ recent_turns = turns[split_idx:]
+
+ logger.info(f"Compact: compressing {len(older_turns)} older turns, keeping {len(recent_turns)} recent")
+
+ summary, tokens_used = compress_with_llm(older_turns, system_prompt, client, model)
+
+ if summary is None:
+ logger.warning("Compact: compression failed, returning original query")
+ return initial_query, 0
+
+ compressed_turns = [("user", f"[Conversation summary]:\n{summary}")]
+ compressed_turns.extend(recent_turns)
+
+ result = reconstruct_tagged(compressed_turns)
+ new_token_count = estimate_tokens(result)
+ logger.info(f"Compact: {token_count} -> {new_token_count} tokens (used {tokens_used} for compression)")
+
+ return result, tokens_used
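
A usage sketch for review purposes (illustrative, not part of the patch). It assumes optillm's documented model-prefix routing selects the plugin, that a proxy is running at localhost:8000, and that the compact_* keys sent in the request body reach the plugin's request_config; the model name and values are placeholders:

    # Point the standard OpenAI client at the optillm proxy.
    from openai import OpenAI

    client = OpenAI(base_url="http://localhost:8000/v1", api_key="optillm")

    response = client.chat.completions.create(
        # The "compact-" prefix selects the plugin; "compact&moa-..." would
        # compose it with another approach, per the docstring above.
        model="compact-Qwen/Qwen2.5-Coder-0.5B-Instruct",
        messages=[{"role": "user", "content": "Summarize our discussion so far."}],
        extra_body={
            "compact_context_window": 32000,  # request_config override (assumed key passthrough)
            "compact_threshold": 0.75,
            "compact_keep_recent": 4,
        },
    )
    print(response.choices[0].message.content)
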
diff --git a/optillm/plugins/json_plugin.py b/optillm/plugins/json_plugin.py
index 07a76ab..c686859 100644
--- a/optillm/plugins/json_plugin.py
+++ b/optillm/plugins/json_plugin.py
@@ -22,7 +22,7 @@ def get_device(self):
         else:
             return torch.device("cpu")
 
-    def __init__(self, model_name: str = "google/gemma-3-270m-it"):
+    def __init__(self, model_name: str = "Qwen/Qwen2.5-Coder-0.5B-Instruct"):
         """Initialize the JSON generator with a specific model."""
         self.device = self.get_device()
         logger.info(f"Using device: {self.device}")
diff --git a/pyproject.toml b/pyproject.toml
index 81a5b0f..94f9a19 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
 
 [project]
 name = "optillm"
-version = "0.3.14"
+version = "0.3.15"
 description = "An optimizing inference proxy for LLMs."
 readme = "README.md"
 license = "Apache-2.0"
diff --git a/tests/test_compact_plugin.py b/tests/test_compact_plugin.py
new file mode 100644
index 0000000..acd1cfb
--- /dev/null
+++ b/tests/test_compact_plugin.py
@@ -0,0 +1,303 @@
+"""Tests for compact_plugin."""
+
+import os
+import pytest
+from unittest.mock import MagicMock, patch
+from optillm.plugins.compact_plugin import (
+ estimate_tokens,
+ parse_tagged_conversation,
+ reconstruct_tagged,
+ _get_config,
+ run,
+)
+
+
+def _make_client(summary_text="Test summary with Key decisions: decided X."):
+ mock_response = MagicMock()
+ mock_response.choices = [MagicMock()]
+ mock_response.choices[0].message.content = summary_text
+ mock_response.usage.completion_tokens = 80
+ client = MagicMock()
+ client.chat.completions.create.return_value = mock_response
+ client.models.retrieve.return_value = MagicMock(spec=[])
+ return client
+
+
+class TestEstimateTokens:
+ def test_short_text(self):
+ assert estimate_tokens("hello") >= 1
+
+ def test_empty_string(self):
+ assert estimate_tokens("") == 0
+
+ def test_long_text_returns_positive(self):
+ assert estimate_tokens("a" * 400) > 0
+
+
+class TestParseTaggedConversation:
+ def test_single_user_message(self):
+ assert parse_tagged_conversation("User: hello") == [("user", "hello")]
+
+ def test_conversation_pair(self):
+ assert parse_tagged_conversation("User: hi\nAssistant: hello") == [("user", "hi"), ("assistant", "hello")]
+
+ def test_multi_turn(self):
+ turns = parse_tagged_conversation("User: q1\nAssistant: a1\nUser: q2\nAssistant: a2")
+ assert len(turns) == 4
+ assert turns[0] == ("user", "q1")
+ assert turns[3] == ("assistant", "a2")
+
+ def test_empty_string(self):
+ assert parse_tagged_conversation("") == []
+
+ def test_no_tags(self):
+ assert parse_tagged_conversation("just some text") == []
+
+
+class TestReconstructTagged:
+ def test_roundtrip(self):
+ result = reconstruct_tagged([("user", "hello"), ("assistant", "world")])
+ assert "User: hello" in result
+ assert "Assistant: world" in result
+
+ def test_empty(self):
+ assert reconstruct_tagged([]) == ""
+
+
+class TestGetConfig:
+ def test_default_when_nothing_set(self):
+ assert _get_config(None, "key", "NONEXISTENT_VAR", 42) == 42
+
+ def test_request_config_takes_priority(self):
+ assert _get_config({"compact_threshold": 0.5}, "compact_threshold", "COMPACT_THRESHOLD", 0.75) == 0.5
+
+ def test_env_var_as_fallback(self):
+ with patch.dict(os.environ, {"COMPACT_KEEP_RECENT": "6"}):
+ assert _get_config(None, "compact_keep_recent", "COMPACT_KEEP_RECENT", 4) == 6
+
+ def test_request_config_overrides_env(self):
+ with patch.dict(os.environ, {"COMPACT_KEEP_RECENT": "6"}):
+ assert _get_config({"compact_keep_recent": 2}, "compact_keep_recent", "COMPACT_KEEP_RECENT", 4) == 2
+
+
+class TestRun:
+ def test_passthrough_short_conversation(self):
+ query = "User: hi\nAssistant: hello"
+ result, tokens = run("system", query, _make_client(), "gpt-4")
+ assert result == query
+ assert tokens == 0
+
+ def test_passthrough_few_turns(self):
+ turns = []
+ for i in range(3):
+ turns.append(f"User: question {i}")
+ turns.append(f"Assistant: answer {i}")
+ query = "\n".join(turns)
+ result, tokens = run("system", query, _make_client(), "gpt-4",
+ request_config={"compact_context_window": 100, "compact_threshold": 0.5})
+ assert result == query
+ assert tokens == 0
+
+ def test_compression_triggered_uses_llm(self):
+ turns = []
+ for i in range(20):
+ turns.append(f"User: this is a longer question number {i} with extra text to increase token count")
+ turns.append(f"Assistant: this is a longer answer number {i} with extra text to increase token count")
+ query = "\n".join(turns)
+
+        client = _make_client("<summary>\nConversation summary:\n- Scope: 36 messages.\n- Key decisions: decided to use compact.\n- Context: user was testing compression.\n</summary>")
+ result, tokens = run("system", query, client, "gpt-4",
+ request_config={"compact_context_window": 200, "compact_threshold": 0.3,
+ "compact_keep_recent": 4})
+
+ assert tokens == 80 # LLM was called
+ assert "[Conversation summary]" in result
+ assert "Scope:" in result
+ assert "question number 19" in result # last user turn preserved
+ client.chat.completions.create.assert_called_once()
+
+ def test_structured_summary_format(self):
+ turns = []
+ for i in range(20):
+ turns.append(f"User: question {i} " + "x" * 100)
+ turns.append(f"Assistant: answer {i} " + "y" * 100)
+ query = "\n".join(turns)
+
+ summary_text = "\nConversation summary:\n- Scope: 36 messages.\n- Key decisions: use plugin.\n- Key files: src/main.py.\n- Context: testing.\n"
+ client = _make_client(summary_text)
+ result, tokens = run("system", query, client, "gpt-4",
+ request_config={"compact_context_window": 200, "compact_threshold": 0.1,
+ "compact_keep_recent": 2})
+
+ assert "Key decisions:" in result
+ assert "Key files:" in result
+
+ def test_output_preserves_tag_format(self):
+ turns = []
+ for i in range(10):
+ turns.append(f"User: question {i} " + "x" * 100)
+ turns.append(f"Assistant: answer {i} " + "y" * 100)
+ query = "\n".join(turns)
+
+ client = _make_client()
+ result, tokens = run("system", query, client, "gpt-4",
+ request_config={"compact_context_window": 200, "compact_threshold": 0.1,
+ "compact_keep_recent": 2})
+
+ parsed = parse_tagged_conversation(result)
+ assert len(parsed) >= 2
+ assert parsed[0][0] == "user"
+ assert "[Conversation summary]" in parsed[0][1]
+
+ def test_env_var_configuration(self):
+ turns = []
+ for i in range(10):
+ turns.append(f"User: question {i} " + "x" * 100)
+ turns.append(f"Assistant: answer {i} " + "y" * 100)
+ query = "\n".join(turns)
+
+ client = _make_client()
+ with patch.dict(os.environ, {"COMPACT_CONTEXT_WINDOW": "200", "COMPACT_THRESHOLD": "0.1",
+ "COMPACT_KEEP_RECENT": "2"}):
+ result, tokens = run("system", query, client, "gpt-4")
+
+ assert tokens == 80
+ assert "[Conversation summary]" in result
+
+ def test_llm_failure_falls_back_to_passthrough(self):
+ turns = []
+ for i in range(10):
+ turns.append(f"User: question {i} " + "x" * 100)
+ turns.append(f"Assistant: answer {i} " + "y" * 100)
+ query = "\n".join(turns)
+
+ client = MagicMock()
+ client.chat.completions.create.side_effect = Exception("API error")
+
+ result, tokens = run("system", query, client, "gpt-4",
+ request_config={"compact_context_window": 200, "compact_threshold": 0.1,
+ "compact_keep_recent": 2})
+ assert result == query
+ assert tokens == 0
+
+ def test_summary_tag_extraction(self):
+ turns = []
+ for i in range(10):
+ turns.append(f"User: question {i} " + "x" * 100)
+ turns.append(f"Assistant: answer {i} " + "y" * 100)
+ query = "\n".join(turns)
+
+ raw_llm_output = "Here is the summary:\n\nConversation summary:\n- Scope: 10 messages.\n- Key decisions: decided X.\n\nHope this helps!"
+ client = _make_client(summary_text=raw_llm_output)
+
+ result, tokens = run("system", query, client, "gpt-4",
+ request_config={"compact_context_window": 200, "compact_threshold": 0.1,
+ "compact_keep_recent": 2})
+
+ assert "Here is the summary:" not in result
+ assert "Hope this helps!" not in result
+ assert "Scope:" in result
+ assert "Key decisions:" in result
+
+ def test_summary_without_tags_used_raw(self):
+ turns = []
+ for i in range(10):
+ turns.append(f"User: question {i} " + "x" * 100)
+ turns.append(f"Assistant: answer {i} " + "y" * 100)
+ query = "\n".join(turns)
+
+ client = _make_client(summary_text="Plain summary without XML tags. Key decisions: use compact.")
+
+ result, tokens = run("system", query, client, "gpt-4",
+ request_config={"compact_context_window": 200, "compact_threshold": 0.1,
+ "compact_keep_recent": 2})
+
+ assert "Plain summary" in result
+ assert tokens == 80
+
+ def test_system_prompt_included_in_compression(self):
+ turns = []
+ for i in range(10):
+ turns.append(f"User: question {i} " + "x" * 100)
+ turns.append(f"Assistant: answer {i} " + "y" * 100)
+ query = "\n".join(turns)
+
+ client = _make_client()
+ result, tokens = run("You are a medical coding assistant", query, client, "gpt-4",
+ request_config={"compact_context_window": 200, "compact_threshold": 0.1,
+ "compact_keep_recent": 2})
+
+ call_args = client.chat.completions.create.call_args
+ system_content = call_args.kwargs["messages"][0]["content"]
+ assert "medical coding assistant" in system_content
+
+ def test_keep_recent_exceeds_turn_count(self):
+ query = "User: hi\nAssistant: hello"
+ client = _make_client()
+ result, tokens = run("system", query, client, "gpt-4",
+ request_config={"compact_context_window": 1, "compact_threshold": 0.1,
+ "compact_keep_recent": 100})
+ assert result == query
+ assert tokens == 0
+
+ def test_threshold_zero_always_triggers(self):
+ turns = []
+ for i in range(10):
+ turns.append(f"User: question {i} " + "x" * 100)
+ turns.append(f"Assistant: answer {i} " + "y" * 100)
+ query = "\n".join(turns)
+
+ client = _make_client()
+ result, tokens = run("system", query, client, "gpt-4",
+ request_config={"compact_context_window": 200, "compact_threshold": 0.0,
+ "compact_keep_recent": 2})
+
+ assert tokens == 80
+ assert "[Conversation summary]" in result
+
+ def test_malformed_env_var_uses_default(self):
+ with patch.dict(os.environ, {"COMPACT_THRESHOLD": "not_a_number"}):
+ val = _get_config(None, "compact_threshold", "COMPACT_THRESHOLD", 0.75)
+ assert val == 0.75
+
+ def test_embedded_tags_not_split(self):
+ text = "User: I asked my friend: User: what is Python?\nAssistant: Here is the answer"
+ turns = parse_tagged_conversation(text)
+ assert len(turns) == 2
+ assert "friend: User: what is Python?" in turns[0][1]
+
+
+class TestGetContextWindow:
+ def test_provider_returns_context_length(self):
+ from optillm.plugins.compact_plugin import _get_context_window
+ model_info = MagicMock()
+ model_info.context_length = 32768
+ client = MagicMock()
+ client.models.retrieve.return_value = model_info
+ result = _get_context_window(client, "test-model", None)
+ assert result == 32768
+
+ def test_provider_returns_max_context_length(self):
+ from optillm.plugins.compact_plugin import _get_context_window
+ model_info = MagicMock()
+ model_info.context_length = None
+ model_info.max_context_length = 65536
+ client = MagicMock()
+ client.models.retrieve.return_value = model_info
+ result = _get_context_window(client, "test-model", None)
+ assert result == 65536
+
+ def test_provider_no_context_info_falls_back_to_config(self):
+ from optillm.plugins.compact_plugin import _get_context_window
+ model_info = MagicMock(spec=[])
+ client = MagicMock()
+ client.models.retrieve.return_value = model_info
+ result = _get_context_window(client, "test-model", {"compact_context_window": 50000})
+ assert result == 50000
+
+ def test_provider_api_fails_falls_back_to_default(self):
+ from optillm.plugins.compact_plugin import _get_context_window
+ client = MagicMock()
+ client.models.retrieve.side_effect = Exception("not supported")
+ result = _get_context_window(client, "test-model", None)
+ assert result == 128000
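
For a quick manual check outside pytest, the run() entry point can also be driven directly against any OpenAI-compatible endpoint. A minimal sketch, assuming a local server; the model name and config values are placeholders:

    from openai import OpenAI

    from optillm.plugins.compact_plugin import run

    client = OpenAI(base_url="http://localhost:8000/v1", api_key="optillm")

    # Tagged transcript in the User:/Assistant: format the plugin parses.
    transcript = "\n".join(
        f"User: question {i}\nAssistant: answer {i}" for i in range(40)
    )

    result, tokens = run(
        "You are a helpful assistant",  # folded into the summarizer system prompt
        transcript,
        client,
        "Qwen/Qwen2.5-Coder-0.5B-Instruct",
        request_config={"compact_context_window": 2000, "compact_threshold": 0.5},
    )
    print(f"summary-generation tokens: {tokens}")
    print(result[:300])
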
diff --git a/tests/test_conversation_logging_server.py b/tests/test_conversation_logging_server.py
index ef22803..106c302 100644
--- a/tests/test_conversation_logging_server.py
+++ b/tests/test_conversation_logging_server.py
@@ -1,7 +1,7 @@
 #!/usr/bin/env python3
 """
 Server-based integration tests for conversation logging with real model
-Tests conversation logging with actual OptILLM server and google/gemma-3-270m-it model
+Tests conversation logging with actual OptILLM server and Qwen/Qwen2.5-Coder-0.5B-Instruct model
 """
 
 import unittest
diff --git a/tests/test_plugins.py b/tests/test_plugins.py
index 3f55c38..d81e334 100644
--- a/tests/test_plugins.py
+++ b/tests/test_plugins.py
@@ -32,7 +32,8 @@ def test_plugin_module_imports():
         'optillm.plugins.longcepo_plugin',
         'optillm.plugins.spl_plugin',
         'optillm.plugins.proxy_plugin',
-        'optillm.plugins.mcp_plugin'
+        'optillm.plugins.mcp_plugin',
+        'optillm.plugins.compact_plugin'
     ]
 
     for module_name in plugin_modules:
@@ -53,7 +54,7 @@ def test_plugin_approach_detection():
     load_plugins()
 
     # Check if known plugins are loaded
-    expected_plugins = ["memory", "readurls", "privacy", "web_search", "deep_research", "deepthink", "longcepo", "spl", "proxy", "mcp"]
+    expected_plugins = ["memory", "readurls", "privacy", "web_search", "deep_research", "deepthink", "longcepo", "spl", "proxy", "mcp", "compact"]
 
     for plugin_name in expected_plugins:
         assert plugin_name in plugin_approaches, f"Plugin {plugin_name} not loaded"
@@ -304,6 +305,14 @@ def test_mcp_plugin():
     assert plugin.SLUG == "mcp"
 
 
+def test_compact_plugin():
+    """Test compact plugin module"""
+    import optillm.plugins.compact_plugin as plugin
+    assert hasattr(plugin, 'run')
+    assert hasattr(plugin, 'SLUG')
+    assert plugin.SLUG == "compact"
+
+
 def test_plugin_subdirectory_imports():
     """Test all plugins with subdirectories can import their submodules"""
     # Test deep_research
diff --git a/tests/test_utils.py b/tests/test_utils.py
index 30039a9..c31162b 100644
--- a/tests/test_utils.py
+++ b/tests/test_utils.py
@@ -12,8 +12,8 @@
 from openai import OpenAI
 
 # Standard test model for all tests - small and fast
-TEST_MODEL = "google/gemma-3-270m-it"
-TEST_MODEL_MLX = "mlx-community/gemma-3-270m-it-bf16"
+TEST_MODEL = "Qwen/Qwen2.5-Coder-0.5B-Instruct"
+TEST_MODEL_MLX = "mlx-community/Qwen2.5-Coder-0.5B-Instruct-bf16"
 
 def setup_test_env():
     """Set up test environment with local inference"""