diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml
index 20cba03..6188c8d 100644
--- a/.github/workflows/test.yml
+++ b/.github/workflows/test.yml
@@ -84,7 +84,7 @@ jobs:
       - name: Start optillm server
         run: |
           echo "Starting optillm server for integration tests..."
-          OPTILLM_API_KEY=optillm python optillm.py --model google/gemma-3-270m-it --port 8000 &
+          OPTILLM_API_KEY=optillm python optillm.py --model Qwen/Qwen2.5-Coder-0.5B-Instruct --port 8000 &
           echo $! > server.pid
           # Wait for server to be ready
@@ -179,7 +179,7 @@ jobs:
echo "Starting optillm server with conversation logging..."
mkdir -p /tmp/optillm_conversations
OPTILLM_API_KEY=optillm python optillm.py \
- --model google/gemma-3-270m-it \
+ --model Qwen/Qwen2.5-Coder-0.5B-Instruct \
--port 8000 \
--log-conversations \
--conversation-log-dir /tmp/optillm_conversations &
diff --git a/optillm/__init__.py b/optillm/__init__.py
index 1350db0..56c5d98 100644
--- a/optillm/__init__.py
+++ b/optillm/__init__.py
@@ -1,5 +1,5 @@
 # Version information
-__version__ = "0.3.14"
+__version__ = "0.3.15"
 
 # Import from server module
 from .server import (
diff --git a/optillm/plugins/compact_plugin.py b/optillm/plugins/compact_plugin.py
new file mode 100644
index 0000000..8eb06a5
--- /dev/null
+++ b/optillm/plugins/compact_plugin.py
@@ -0,0 +1,197 @@
+"""
+Compact plugin for OptiLLM.
+
+Automatically compresses conversation context when it exceeds a token budget,
+preserving recent turns verbatim and generating a structured summary of older
+content — inspired by Claude Code's compact mechanism.
+
+Uses one LLM call to produce a structured summary with:
+ Scope, Key decisions, User preferences, Pending work, Key files referenced.
+Recent turns are preserved verbatim.
+
+Composable with other approaches via & operator: compact&moa, compact&bon, etc.
+
+Configuration (env vars or request_config):
+ COMPACT_CONTEXT_WINDOW / compact_context_window — max context tokens (default: 128000)
+ COMPACT_THRESHOLD / compact_threshold — trigger ratio 0.0-1.0 (default: 0.75)
+ COMPACT_KEEP_RECENT / compact_keep_recent — turns to preserve verbatim (default: 4)
+"""
+
+import os
+import re
+import logging
+from typing import Tuple, List, Optional
+
+logger = logging.getLogger(__name__)
+
+SLUG = "compact"
+
+DEFAULT_CONTEXT_WINDOW = 128000
+DEFAULT_THRESHOLD = 0.75
+DEFAULT_KEEP_RECENT = 4
+
+COMPACT_SYSTEM_PROMPT = """You are a conversation summarizer. Given a conversation history, produce a structured summary.
+
+Output ONLY this format, nothing else:
+
+<summary>
+Conversation summary:
+- Scope: {N} earlier messages compacted (user={U}, assistant={A}).
+- Key decisions: {list the main decisions or conclusions reached}
+- User preferences: {any stated preferences or constraints}
+- Pending work: {any remaining tasks or next steps mentioned}
+- Key files referenced: {file paths mentioned, if any}
+- Context: {a concise paragraph capturing the essential context needed to continue}
+</summary>
+
+Rules:
+- Be specific: include actual values, names, and file paths — not vague references
+- Be concise: each section should be 1-2 lines maximum
+- Omit pleasantries, greetings, and filler
+- The Context paragraph is the most important part — it should capture everything a new assistant would need to pick up where this left off"""
+
+
+def _get_config(request_config: Optional[dict], key: str, env_var: str, default):
+ val = None
+ if request_config:
+ val = request_config.get(key)
+ if val is None:
+ env_val = os.environ.get(env_var)
+ if env_val is not None:
+ try:
+ val = type(default)(env_val)
+ except (ValueError, TypeError):
+ logger.warning(f"Invalid value for {env_var}: {env_val!r}, using default {default}")
+ val = default
+ return val if val is not None else default
+
+
+def _get_context_window(client, model: str, request_config: Optional[dict]) -> int:
+ """Get context window size: try provider /models endpoint first, then config fallback."""
+ try:
+ model_info = client.models.retrieve(model)
+ for attr in ("context_length", "max_context_length", "context_window",
+ "max_model_length", "max_position_embeddings"):
+ val = getattr(model_info, attr, None)
+ if val is not None:
+ return int(val)
+ except Exception:
+ pass
+
+ return _get_config(request_config, "compact_context_window", "COMPACT_CONTEXT_WINDOW", DEFAULT_CONTEXT_WINDOW)
+
+
+def estimate_tokens(text: str) -> int:
+ try:
+ import tiktoken
+ enc = tiktoken.encoding_for_model("gpt-4")
+ return len(enc.encode(text))
+ except (ImportError, KeyError):
+        return max(1, len(text) // 4)  # fallback heuristic: ~4 chars per token
+
+
+def parse_tagged_conversation(text: str) -> List[Tuple[str, str]]:
+ turns = []
+ for match in re.finditer(r'^(User:|Assistant:)\s*', text, re.MULTILINE):
+ role = "user" if match.group(1) == "User:" else "assistant"
+ start = match.end()
+ next_match = re.search(r'^(User:|Assistant:)', text[start:], re.MULTILINE)
+ if next_match:
+ content = text[start:start + next_match.start()].strip()
+ else:
+ content = text[start:].strip()
+ turns.append((role, content))
+ return turns
+
+
+def reconstruct_tagged(turns: List[Tuple[str, str]]) -> str:
+ lines = []
+ for role, content in turns:
+ tag = "User:" if role == "user" else "Assistant:"
+ lines.append(f"{tag} {content}")
+ return "\n".join(lines)
+
+
+def compress_with_llm(
+ older_turns: List[Tuple[str, str]],
+ system_prompt: str,
+ client,
+ model: str,
+) -> Tuple[Optional[str], int]:
+ conversation_text = reconstruct_tagged(older_turns)
+
+ system_content = COMPACT_SYSTEM_PROMPT
+ if system_prompt:
+ system_content += f"\n\nOriginal system context: {system_prompt}"
+
+ messages = [
+ {"role": "system", "content": system_content},
+ {"role": "user", "content": conversation_text},
+ ]
+
+ try:
+ response = client.chat.completions.create(
+ model=model,
+ messages=messages,
+ max_tokens=2000,
+ temperature=0.3,
+ )
+ except Exception as e:
+ logger.error(f"Compact: LLM compression failed: {e}")
+ return None, 0
+
+ raw = response.choices[0].message.content.strip()
+ tokens_used = response.usage.completion_tokens if response.usage else 0
+
+    match = re.search(r'<summary>(.*?)</summary>', raw, re.DOTALL)
+ if match:
+ summary = match.group(1).strip()
+ else:
+ summary = raw
+
+ return summary, tokens_used
+
+
+def run(
+ system_prompt: str,
+ initial_query: str,
+ client,
+ model: str,
+ request_config: Optional[dict] = None,
+) -> Tuple[str, int]:
+ context_window = _get_context_window(client, model, request_config)
+ threshold = _get_config(request_config, "compact_threshold", "COMPACT_THRESHOLD", DEFAULT_THRESHOLD)
+ keep_recent = _get_config(request_config, "compact_keep_recent", "COMPACT_KEEP_RECENT", DEFAULT_KEEP_RECENT)
+
+ token_count = estimate_tokens(initial_query)
+ budget = int(context_window * threshold)
+
+ if token_count < budget:
+ logger.debug(f"Compact: passthrough ({token_count} tokens < {budget} budget)")
+ return initial_query, 0
+
+ turns = parse_tagged_conversation(initial_query)
+ if len(turns) <= keep_recent:
+ logger.debug(f"Compact: too few turns to compress ({len(turns)} <= {keep_recent})")
+ return initial_query, 0
+
+ split_idx = len(turns) - keep_recent
+ older_turns = turns[:split_idx]
+ recent_turns = turns[split_idx:]
+
+ logger.info(f"Compact: compressing {len(older_turns)} older turns, keeping {len(recent_turns)} recent")
+
+ summary, tokens_used = compress_with_llm(older_turns, system_prompt, client, model)
+
+ if summary is None:
+ logger.warning("Compact: compression failed, returning original query")
+ return initial_query, 0
+
+ compressed_turns = [("user", f"[Conversation summary]:\n{summary}")]
+ compressed_turns.extend(recent_turns)
+
+ result = reconstruct_tagged(compressed_turns)
+ new_token_count = estimate_tokens(result)
+ logger.info(f"Compact: {token_count} -> {new_token_count} tokens (used {tokens_used} for compression)")
+
+ return result, tokens_used
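
A usage sketch for review purposes (illustrative, not part of the patch). It assumes optillm's documented model-prefix routing selects the plugin, that a proxy is running at localhost:8000, and that the compact_* keys sent in the request body reach the plugin's request_config; the model name and values are placeholders:

    # Point the standard OpenAI client at the optillm proxy.
    from openai import OpenAI

    client = OpenAI(base_url="http://localhost:8000/v1", api_key="optillm")

    response = client.chat.completions.create(
        # The "compact-" prefix selects the plugin; "compact&moa-..." would
        # compose it with another approach, per the docstring above.
        model="compact-Qwen/Qwen2.5-Coder-0.5B-Instruct",
        messages=[{"role": "user", "content": "Summarize our discussion so far."}],
        extra_body={
            "compact_context_window": 32000,  # request_config override (assumed key passthrough)
            "compact_threshold": 0.75,
            "compact_keep_recent": 4,
        },
    )
    print(response.choices[0].message.content)
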
diff --git a/optillm/plugins/json_plugin.py b/optillm/plugins/json_plugin.py
index 07a76ab..c686859 100644
--- a/optillm/plugins/json_plugin.py
+++ b/optillm/plugins/json_plugin.py
@@ -22,7 +22,7 @@ def get_device(self):
         else:
             return torch.device("cpu")
 
-    def __init__(self, model_name: str = "google/gemma-3-270m-it"):
+    def __init__(self, model_name: str = "Qwen/Qwen2.5-Coder-0.5B-Instruct"):
         """Initialize the JSON generator with a specific model."""
         self.device = self.get_device()
         logger.info(f"Using device: {self.device}")
diff --git a/pyproject.toml b/pyproject.toml
index 81a5b0f..94f9a19 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
 
 [project]
 name = "optillm"
-version = "0.3.14"
+version = "0.3.15"
 description = "An optimizing inference proxy for LLMs."
 readme = "README.md"
 license = "Apache-2.0"
diff --git a/tests/test_compact_plugin.py b/tests/test_compact_plugin.py
new file mode 100644
index 0000000..acd1cfb
--- /dev/null
+++ b/tests/test_compact_plugin.py
@@ -0,0 +1,303 @@
+"""Tests for compact_plugin."""
+
+import os
+import pytest
+from unittest.mock import MagicMock, patch
+from optillm.plugins.compact_plugin import (
+ estimate_tokens,
+ parse_tagged_conversation,
+ reconstruct_tagged,
+ _get_config,
+ run,
+)
+
+
+def _make_client(summary_text="Test summary with Key decisions: decided X."):
+ mock_response = MagicMock()
+ mock_response.choices = [MagicMock()]
+ mock_response.choices[0].message.content = summary_text
+ mock_response.usage.completion_tokens = 80
+ client = MagicMock()
+ client.chat.completions.create.return_value = mock_response
+ client.models.retrieve.return_value = MagicMock(spec=[])
+ return client
+
+
+class TestEstimateTokens:
+ def test_short_text(self):
+ assert estimate_tokens("hello") >= 1
+
+ def test_empty_string(self):
+ assert estimate_tokens("") == 0
+
+ def test_long_text_returns_positive(self):
+ assert estimate_tokens("a" * 400) > 0
+
+
+class TestParseTaggedConversation:
+ def test_single_user_message(self):
+ assert parse_tagged_conversation("User: hello") == [("user", "hello")]
+
+ def test_conversation_pair(self):
+ assert parse_tagged_conversation("User: hi\nAssistant: hello") == [("user", "hi"), ("assistant", "hello")]
+
+ def test_multi_turn(self):
+ turns = parse_tagged_conversation("User: q1\nAssistant: a1\nUser: q2\nAssistant: a2")
+ assert len(turns) == 4
+ assert turns[0] == ("user", "q1")
+ assert turns[3] == ("assistant", "a2")
+
+ def test_empty_string(self):
+ assert parse_tagged_conversation("") == []
+
+ def test_no_tags(self):
+ assert parse_tagged_conversation("just some text") == []
+
+
+class TestReconstructTagged:
+ def test_roundtrip(self):
+ result = reconstruct_tagged([("user", "hello"), ("assistant", "world")])
+ assert "User: hello" in result
+ assert "Assistant: world" in result
+
+ def test_empty(self):
+ assert reconstruct_tagged([]) == ""
+
+
+class TestGetConfig:
+ def test_default_when_nothing_set(self):
+ assert _get_config(None, "key", "NONEXISTENT_VAR", 42) == 42
+
+ def test_request_config_takes_priority(self):
+ assert _get_config({"compact_threshold": 0.5}, "compact_threshold", "COMPACT_THRESHOLD", 0.75) == 0.5
+
+ def test_env_var_as_fallback(self):
+ with patch.dict(os.environ, {"COMPACT_KEEP_RECENT": "6"}):
+ assert _get_config(None, "compact_keep_recent", "COMPACT_KEEP_RECENT", 4) == 6
+
+ def test_request_config_overrides_env(self):
+ with patch.dict(os.environ, {"COMPACT_KEEP_RECENT": "6"}):
+ assert _get_config({"compact_keep_recent": 2}, "compact_keep_recent", "COMPACT_KEEP_RECENT", 4) == 2
+
+
+class TestRun:
+ def test_passthrough_short_conversation(self):
+ query = "User: hi\nAssistant: hello"
+ result, tokens = run("system", query, _make_client(), "gpt-4")
+ assert result == query
+ assert tokens == 0
+
+ def test_passthrough_few_turns(self):
+ turns = []
+ for i in range(3):
+ turns.append(f"User: question {i}")
+ turns.append(f"Assistant: answer {i}")
+ query = "\n".join(turns)
+ result, tokens = run("system", query, _make_client(), "gpt-4",
+ request_config={"compact_context_window": 100, "compact_threshold": 0.5})
+ assert result == query
+ assert tokens == 0
+
+ def test_compression_triggered_uses_llm(self):
+ turns = []
+ for i in range(20):
+ turns.append(f"User: this is a longer question number {i} with extra text to increase token count")
+ turns.append(f"Assistant: this is a longer answer number {i} with extra text to increase token count")
+ query = "\n".join(turns)
+
+        client = _make_client("<summary>\nConversation summary:\n- Scope: 36 messages.\n- Key decisions: decided to use compact.\n- Context: user was testing compression.\n</summary>")
+ result, tokens = run("system", query, client, "gpt-4",
+ request_config={"compact_context_window": 200, "compact_threshold": 0.3,
+ "compact_keep_recent": 4})
+
+ assert tokens == 80 # LLM was called
+ assert "[Conversation summary]" in result
+ assert "Scope:" in result
+ assert "question number 19" in result # last user turn preserved
+ client.chat.completions.create.assert_called_once()
+
+ def test_structured_summary_format(self):
+ turns = []
+ for i in range(20):
+ turns.append(f"User: question {i} " + "x" * 100)
+ turns.append(f"Assistant: answer {i} " + "y" * 100)
+ query = "\n".join(turns)
+
+ summary_text = "\nConversation summary:\n- Scope: 36 messages.\n- Key decisions: use plugin.\n- Key files: src/main.py.\n- Context: testing.\n"
+ client = _make_client(summary_text)
+ result, tokens = run("system", query, client, "gpt-4",
+ request_config={"compact_context_window": 200, "compact_threshold": 0.1,
+ "compact_keep_recent": 2})
+
+ assert "Key decisions:" in result
+ assert "Key files:" in result
+
+ def test_output_preserves_tag_format(self):
+ turns = []
+ for i in range(10):
+ turns.append(f"User: question {i} " + "x" * 100)
+ turns.append(f"Assistant: answer {i} " + "y" * 100)
+ query = "\n".join(turns)
+
+ client = _make_client()
+ result, tokens = run("system", query, client, "gpt-4",
+ request_config={"compact_context_window": 200, "compact_threshold": 0.1,
+ "compact_keep_recent": 2})
+
+ parsed = parse_tagged_conversation(result)
+ assert len(parsed) >= 2
+ assert parsed[0][0] == "user"
+ assert "[Conversation summary]" in parsed[0][1]
+
+ def test_env_var_configuration(self):
+ turns = []
+ for i in range(10):
+ turns.append(f"User: question {i} " + "x" * 100)
+ turns.append(f"Assistant: answer {i} " + "y" * 100)
+ query = "\n".join(turns)
+
+ client = _make_client()
+ with patch.dict(os.environ, {"COMPACT_CONTEXT_WINDOW": "200", "COMPACT_THRESHOLD": "0.1",
+ "COMPACT_KEEP_RECENT": "2"}):
+ result, tokens = run("system", query, client, "gpt-4")
+
+ assert tokens == 80
+ assert "[Conversation summary]" in result
+
+ def test_llm_failure_falls_back_to_passthrough(self):
+ turns = []
+ for i in range(10):
+ turns.append(f"User: question {i} " + "x" * 100)
+ turns.append(f"Assistant: answer {i} " + "y" * 100)
+ query = "\n".join(turns)
+
+ client = MagicMock()
+ client.chat.completions.create.side_effect = Exception("API error")
+
+ result, tokens = run("system", query, client, "gpt-4",
+ request_config={"compact_context_window": 200, "compact_threshold": 0.1,
+ "compact_keep_recent": 2})
+ assert result == query
+ assert tokens == 0
+
+ def test_summary_tag_extraction(self):
+ turns = []
+ for i in range(10):
+ turns.append(f"User: question {i} " + "x" * 100)
+ turns.append(f"Assistant: answer {i} " + "y" * 100)
+ query = "\n".join(turns)
+
+ raw_llm_output = "Here is the summary:\n\nConversation summary:\n- Scope: 10 messages.\n- Key decisions: decided X.\n\nHope this helps!"
+ client = _make_client(summary_text=raw_llm_output)
+
+ result, tokens = run("system", query, client, "gpt-4",
+ request_config={"compact_context_window": 200, "compact_threshold": 0.1,
+ "compact_keep_recent": 2})
+
+ assert "Here is the summary:" not in result
+ assert "Hope this helps!" not in result
+ assert "Scope:" in result
+ assert "Key decisions:" in result
+
+ def test_summary_without_tags_used_raw(self):
+ turns = []
+ for i in range(10):
+ turns.append(f"User: question {i} " + "x" * 100)
+ turns.append(f"Assistant: answer {i} " + "y" * 100)
+ query = "\n".join(turns)
+
+ client = _make_client(summary_text="Plain summary without XML tags. Key decisions: use compact.")
+
+ result, tokens = run("system", query, client, "gpt-4",
+ request_config={"compact_context_window": 200, "compact_threshold": 0.1,
+ "compact_keep_recent": 2})
+
+ assert "Plain summary" in result
+ assert tokens == 80
+
+ def test_system_prompt_included_in_compression(self):
+ turns = []
+ for i in range(10):
+ turns.append(f"User: question {i} " + "x" * 100)
+ turns.append(f"Assistant: answer {i} " + "y" * 100)
+ query = "\n".join(turns)
+
+ client = _make_client()
+ result, tokens = run("You are a medical coding assistant", query, client, "gpt-4",
+ request_config={"compact_context_window": 200, "compact_threshold": 0.1,
+ "compact_keep_recent": 2})
+
+ call_args = client.chat.completions.create.call_args
+ system_content = call_args.kwargs["messages"][0]["content"]
+ assert "medical coding assistant" in system_content
+
+ def test_keep_recent_exceeds_turn_count(self):
+ query = "User: hi\nAssistant: hello"
+ client = _make_client()
+ result, tokens = run("system", query, client, "gpt-4",
+ request_config={"compact_context_window": 1, "compact_threshold": 0.1,
+ "compact_keep_recent": 100})
+ assert result == query
+ assert tokens == 0
+
+ def test_threshold_zero_always_triggers(self):
+ turns = []
+ for i in range(10):
+ turns.append(f"User: question {i} " + "x" * 100)
+ turns.append(f"Assistant: answer {i} " + "y" * 100)
+ query = "\n".join(turns)
+
+ client = _make_client()
+ result, tokens = run("system", query, client, "gpt-4",
+ request_config={"compact_context_window": 200, "compact_threshold": 0.0,
+ "compact_keep_recent": 2})
+
+ assert tokens == 80
+ assert "[Conversation summary]" in result
+
+ def test_malformed_env_var_uses_default(self):
+ with patch.dict(os.environ, {"COMPACT_THRESHOLD": "not_a_number"}):
+ val = _get_config(None, "compact_threshold", "COMPACT_THRESHOLD", 0.75)
+ assert val == 0.75
+
+ def test_embedded_tags_not_split(self):
+ text = "User: I asked my friend: User: what is Python?\nAssistant: Here is the answer"
+ turns = parse_tagged_conversation(text)
+ assert len(turns) == 2
+ assert "friend: User: what is Python?" in turns[0][1]
+
+
+class TestGetContextWindow:
+ def test_provider_returns_context_length(self):
+ from optillm.plugins.compact_plugin import _get_context_window
+ model_info = MagicMock()
+ model_info.context_length = 32768
+ client = MagicMock()
+ client.models.retrieve.return_value = model_info
+ result = _get_context_window(client, "test-model", None)
+ assert result == 32768
+
+ def test_provider_returns_max_context_length(self):
+ from optillm.plugins.compact_plugin import _get_context_window
+ model_info = MagicMock()
+ model_info.context_length = None
+ model_info.max_context_length = 65536
+ client = MagicMock()
+ client.models.retrieve.return_value = model_info
+ result = _get_context_window(client, "test-model", None)
+ assert result == 65536
+
+ def test_provider_no_context_info_falls_back_to_config(self):
+ from optillm.plugins.compact_plugin import _get_context_window
+ model_info = MagicMock(spec=[])
+ client = MagicMock()
+ client.models.retrieve.return_value = model_info
+ result = _get_context_window(client, "test-model", {"compact_context_window": 50000})
+ assert result == 50000
+
+ def test_provider_api_fails_falls_back_to_default(self):
+ from optillm.plugins.compact_plugin import _get_context_window
+ client = MagicMock()
+ client.models.retrieve.side_effect = Exception("not supported")
+ result = _get_context_window(client, "test-model", None)
+ assert result == 128000
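
For a quick manual check outside pytest, the run() entry point can also be driven directly against any OpenAI-compatible endpoint. A minimal sketch, assuming a local server; the model name and config values are placeholders:

    from openai import OpenAI

    from optillm.plugins.compact_plugin import run

    client = OpenAI(base_url="http://localhost:8000/v1", api_key="optillm")

    # Tagged transcript in the User:/Assistant: format the plugin parses.
    transcript = "\n".join(
        f"User: question {i}\nAssistant: answer {i}" for i in range(40)
    )

    result, tokens = run(
        "You are a helpful assistant",  # folded into the summarizer system prompt
        transcript,
        client,
        "Qwen/Qwen2.5-Coder-0.5B-Instruct",
        request_config={"compact_context_window": 2000, "compact_threshold": 0.5},
    )
    print(f"summary-generation tokens: {tokens}")
    print(result[:300])
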
diff --git a/tests/test_conversation_logging_server.py b/tests/test_conversation_logging_server.py
index ef22803..106c302 100644
--- a/tests/test_conversation_logging_server.py
+++ b/tests/test_conversation_logging_server.py
@@ -1,7 +1,7 @@
 #!/usr/bin/env python3
 """
 Server-based integration tests for conversation logging with real model
-Tests conversation logging with actual OptILLM server and google/gemma-3-270m-it model
+Tests conversation logging with actual OptILLM server and Qwen/Qwen2.5-Coder-0.5B-Instruct model
 """
 
 import unittest
diff --git a/tests/test_plugins.py b/tests/test_plugins.py
index 3f55c38..d81e334 100644
--- a/tests/test_plugins.py
+++ b/tests/test_plugins.py
@@ -32,7 +32,8 @@ def test_plugin_module_imports():
         'optillm.plugins.longcepo_plugin',
         'optillm.plugins.spl_plugin',
         'optillm.plugins.proxy_plugin',
-        'optillm.plugins.mcp_plugin'
+        'optillm.plugins.mcp_plugin',
+        'optillm.plugins.compact_plugin'
     ]
 
     for module_name in plugin_modules:
@@ -53,7 +54,7 @@ def test_plugin_approach_detection():
     load_plugins()
 
     # Check if known plugins are loaded
-    expected_plugins = ["memory", "readurls", "privacy", "web_search", "deep_research", "deepthink", "longcepo", "spl", "proxy", "mcp"]
+    expected_plugins = ["memory", "readurls", "privacy", "web_search", "deep_research", "deepthink", "longcepo", "spl", "proxy", "mcp", "compact"]
 
     for plugin_name in expected_plugins:
         assert plugin_name in plugin_approaches, f"Plugin {plugin_name} not loaded"
@@ -304,6 +305,14 @@ def test_mcp_plugin():
     assert plugin.SLUG == "mcp"
 
 
+def test_compact_plugin():
+    """Test compact plugin module"""
+    import optillm.plugins.compact_plugin as plugin
+    assert hasattr(plugin, 'run')
+    assert hasattr(plugin, 'SLUG')
+    assert plugin.SLUG == "compact"
+
+
 def test_plugin_subdirectory_imports():
     """Test all plugins with subdirectories can import their submodules"""
     # Test deep_research
diff --git a/tests/test_utils.py b/tests/test_utils.py
index 30039a9..c31162b 100644
--- a/tests/test_utils.py
+++ b/tests/test_utils.py
@@ -12,8 +12,8 @@
 from openai import OpenAI
 
 # Standard test model for all tests - small and fast
-TEST_MODEL = "google/gemma-3-270m-it"
-TEST_MODEL_MLX = "mlx-community/gemma-3-270m-it-bf16"
+TEST_MODEL = "Qwen/Qwen2.5-Coder-0.5B-Instruct"
+TEST_MODEL_MLX = "mlx-community/Qwen2.5-Coder-0.5B-Instruct-bf16"
 
 def setup_test_env():
     """Set up test environment with local inference"""