diff --git a/instrumentation-loongsuite/loongsuite-instrumentation-algotune/pyproject.toml b/instrumentation-loongsuite/loongsuite-instrumentation-algotune/pyproject.toml
new file mode 100644
index 000000000..69dbd269e
--- /dev/null
+++ b/instrumentation-loongsuite/loongsuite-instrumentation-algotune/pyproject.toml
@@ -0,0 +1,54 @@
+[build-system]
+requires = ["hatchling"]
+build-backend = "hatchling.build"
+
+[project]
+name = "loongsuite-instrumentation-algotune"
+dynamic = ["version"]
+description = "LoongSuite algotune instrumentation"
+license = "Apache-2.0"
+requires-python = ">=3.10,<4"
+authors = [
+  { name = "OpenTelemetry Authors", email = "cncf-opentelemetry-contributors@lists.cncf.io" },
+]
+classifiers = [
+  "Development Status :: 4 - Beta",
+  "Intended Audience :: Developers",
+  "License :: OSI Approved :: Apache Software License",
+  "Programming Language :: Python",
+  "Programming Language :: Python :: 3",
+  "Programming Language :: Python :: 3.10",
+  "Programming Language :: Python :: 3.11",
+  "Programming Language :: Python :: 3.12",
+  "Programming Language :: Python :: 3.13",
+]
+dependencies = [
+  "opentelemetry-api >= 1.37.0",
+  "opentelemetry-instrumentation >= 0.58b0",
+  "opentelemetry-semantic-conventions >= 0.58b0",
+  "wrapt >= 1.0.0, < 2.0.0",
+]
+
+[project.optional-dependencies]
+instruments = [
+
+]
+
+[project.entry-points.opentelemetry_instrumentor]
+algotune = "opentelemetry.instrumentation.algotune:AlgoTuneInstrumentor"
+
+[project.urls]
+Homepage = "https://github.com/alibaba/loongsuite-python-agent/tree/main/instrumentation-loongsuite/loongsuite-instrumentation-algotune"
+Repository = "https://github.com/alibaba/loongsuite-python-agent"
+
+[tool.hatch.version]
+path = "src/opentelemetry/instrumentation/algotune/version.py"
+
+[tool.hatch.build.targets.sdist]
+include = [
+  "/src",
+  "/tests",
+]
+
+[tool.hatch.build.targets.wheel]
+packages = ["src/opentelemetry"]
diff --git a/instrumentation-loongsuite/loongsuite-instrumentation-algotune/src/opentelemetry/instrumentation/algotune/__init__.py b/instrumentation-loongsuite/loongsuite-instrumentation-algotune/src/opentelemetry/instrumentation/algotune/__init__.py
new file mode 100644
index 000000000..2f154dece
--- /dev/null
+++ b/instrumentation-loongsuite/loongsuite-instrumentation-algotune/src/opentelemetry/instrumentation/algotune/__init__.py
@@ -0,0 +1,304 @@
+"""
+OpenTelemetry AlgoTune Instrumentation
+======================================
+
+Automatic instrumentation for the `AlgoTune
+<https://github.com/oripress/AlgoTune>`_ benchmark framework.
+
+This instrumentor produces the AlgoTune-business span tree
+(``ENTRY`` / ``AGENT`` / ``STEP`` / ``TOOL`` / ``TASK``) and intentionally
+**does not** create LLM spans for the LiteLLM call path. Those are
+expected to be produced by an already-loaded LiteLLM instrumentor (e.g.
+``opentelemetry-instrumentation-litellm`` or
+``openinference-instrumentation-litellm``); they automatically become
+children of the active ``STEP`` span thanks to OpenTelemetry context
+propagation.
+
+A separate, **opt-in** wrapper exists for ``TogetherModel.query``, which
+hits the Together API directly via ``requests.post`` and is therefore
+not covered by the LiteLLM instrumentor. Enable it with the environment
+variable ``ALGOTUNE_OTEL_INSTRUMENT_TOGETHER=true``.
+
+Span hierarchy
+--------------
+
+::
+
+    ENTRY: enter_ai_application_system          ← AlgoTuner.main:main()
+    └── AGENT: invoke_agent AlgoTuner           ← LLMInterface.run_task()
+        ├── STEP: react step  [round=N]         ← get_response + handle_function_call
+        │   ├── LLM:  chat <model>              ← LiteLLM instrumentor (auto)
+        │   │                                     OR TogetherModel.query (this pkg)
+        │   └── TOOL: execute_tool <command>    ← CommandHandlers.handle_command
+        │       └── TASK: run_task benchmark.dataset_eval ← _runner_eval_dataset
+        │           ├── TASK: run_task benchmark.baseline_generation ← get_baseline_times
+        │           └── TASK: run_task benchmark.problem_eval [×N] ← evaluate_single
+        └── ...
+
+Usage
+-----
+
+.. code:: python
+
+    # 1) Load the LiteLLM instrumentor first so LLM spans are produced.
+    from opentelemetry.instrumentation.litellm import LiteLLMInstrumentor
+    LiteLLMInstrumentor().instrument()
+
+    # 2) Then load the AlgoTune instrumentor for business spans.
+    from opentelemetry.instrumentation.algotune import AlgoTuneInstrumentor
+    AlgoTuneInstrumentor().instrument()
+
+    # Run AlgoTune as normal.
+    # python -m AlgoTuner.main --model gpt-4o --task tsp
+
+Configuration
+-------------
+
+Environment variables:
+
+* ``OTEL_INSTRUMENTATION_ALGOTUNE_ENABLED`` — master enable switch (default ``true``).
+* ``OTEL_INSTRUMENTATION_GENAI_CAPTURE_MESSAGE_CONTENT`` — capture
+  tool-call arguments / result messages (default ``false``).
+* ``ALGOTUNE_OTEL_MAX_CONTENT_LENGTH`` — character truncation for string
+  attributes (default ``4096``).
+* ``ALGOTUNE_OTEL_INSTRUMENT_TOGETHER`` — wrap ``TogetherModel.query`` with
+  a manual LLM span (default ``false``).
+
+API
+---
+"""
+
+from __future__ import annotations
+
+import importlib
+import logging
+from typing import Any, Collection
+
+from opentelemetry import trace as trace_api
+from opentelemetry.instrumentation.instrumentor import BaseInstrumentor
+from wrapt import wrap_function_wrapper
+
+from opentelemetry.instrumentation.algotune.config import (
+    ALGOTUNE_OTEL_INSTRUMENT_TOGETHER,
+    OTEL_INSTRUMENTATION_ALGOTUNE_ENABLED,
+)
+from opentelemetry.instrumentation.algotune.package import _instruments
+from opentelemetry.instrumentation.algotune.version import __version__
+
+logger = logging.getLogger(__name__)
+
+__all__ = ["AlgoTuneInstrumentor"]
+
+
+# Patch sites are (logical_name, module_path, qualified_attribute) tuples.
+# We patch the defining module so the wrap survives import-order changes.
+_PATCH_SITES: list[tuple[str, str, str]] = [
+    # (logical_name, module_path, qualified_attribute)
+    ("main", "AlgoTuner.main", "main"),
+    ("run_task", "AlgoTuner.interfaces.llm_interface", "LLMInterface.run_task"),
+    ("get_response", "AlgoTuner.interfaces.llm_interface", "LLMInterface.get_response"),
+    (
+        "handle_function_call",
+        "AlgoTuner.interfaces.llm_interface",
+        "LLMInterface.handle_function_call",
+    ),
+    (
+        "handle_command",
+        "AlgoTuner.interfaces.commands.handlers",
+        "CommandHandlers.handle_command",
+    ),
+    (
+        "_runner_eval_dataset",
+        "AlgoTuner.interfaces.commands.handlers",
+        "CommandHandlers._runner_eval_dataset",
+    ),
+    (
+        "evaluate_single",
+        "AlgoTuner.utils.evaluator.evaluation_orchestrator",
+        "EvaluationOrchestrator.evaluate_single",
+    ),
+    (
+        "get_baseline_times",
+        "AlgoTuner.utils.evaluator.baseline_manager",
+        "BaselineManager.get_baseline_times",
+    ),
+    ("query", "AlgoTuner.models.lite_llm_model", "LiteLLMModel.query"),
+    (
+        "_execute_query",
+        "AlgoTuner.models.lite_llm_model",
+        "LiteLLMModel._execute_query",
+    ),
+]
+
+_TOGETHER_PATCH_SITE: tuple[str, str, str] = (
+    "together_query",
+    "AlgoTuner.models.together_model",
+    "TogetherModel.query",
+)
+
+
+def _safe_wrap(module_path: str, name: str, wrapper: Any) -> bool:
+    """Wrap ``module_path.name``; log and return ``False`` on any failure."""
+    try:
+        wrap_function_wrapper(module_path, name, wrapper)
+        return True
+    except (ImportError, AttributeError) as exc:
+        logger.debug(
+            "AlgoTune: skipping wrap %s.%s (%s)", module_path, name, exc
+        )
+        return False
+    except Exception as exc:  # noqa: BLE001
+        logger.warning(
+            "AlgoTune: could not wrap %s.%s: %s", module_path, name, exc
+        )
+        return False
+
+
+def _safe_unwrap(module_path: str, qualname: str) -> None:
+    """Restore an attribute wrapped by ``wrapt``.
+
+    ``qualname`` may be ``"Class.method"`` or just ``"func"``. We walk the
+    module/class chain and restore ``__wrapped__`` for wrapt proxies only.
+    """
+    from wrapt import ObjectProxy  # noqa: PLC0415
+
+    try:
+        mod = importlib.import_module(module_path)
+    except ImportError:
+        return
+
+    parts = qualname.split(".")
+    parent: Any = mod
+    for part in parts[:-1]:
+        parent = getattr(parent, part, None)
+        if parent is None:
+            return
+    leaf_name = parts[-1]
+    leaf = getattr(parent, leaf_name, None)
+    # ``functools.wraps`` also sets ``__wrapped__``; only undo wrapt proxies.
+    if not isinstance(leaf, ObjectProxy) or not hasattr(leaf, "__wrapped__"):
+        return
+    try:
+        setattr(parent, leaf_name, leaf.__wrapped__)
+    except Exception:  # noqa: BLE001
+        pass
+
+
+class AlgoTuneInstrumentor(BaseInstrumentor):
+    """An instrumentor for the AlgoTune benchmark framework.
+
+    Covers five AlgoTune-business span kinds:
+
+    * **ENTRY** – ``AlgoTuner.main.main``
+    * **AGENT** – ``LLMInterface.run_task``
+    * **STEP**  – ``LLMInterface.get_response`` (open) +
+      ``LLMInterface.handle_function_call`` (close)
+    * **TOOL**  – ``CommandHandlers.handle_command``
+    * **TASK**  – ``CommandHandlers._runner_eval_dataset``,
+      ``EvaluationOrchestrator.evaluate_single``,
+      ``BaselineManager.get_baseline_times``
+
+    The LiteLLM call path (``LiteLLMModel.query`` / ``_execute_query``)
+    is wrapped only to publish ``algo.llm.retry_count`` onto the active
+    STEP span; **no LLM span is created**. LLM spans for that path are
+    expected from a separately-loaded LiteLLM instrumentor.
+
+    The ``TogetherModel.query`` bypass (raw HTTP, not via ``litellm``) is
+    only wrapped when ``ALGOTUNE_OTEL_INSTRUMENT_TOGETHER=true``.
+    """
+
+    def instrumentation_dependencies(self) -> Collection[str]:
+        return _instruments
+
+    def _instrument(self, **kwargs: Any) -> None:
+        if not OTEL_INSTRUMENTATION_ALGOTUNE_ENABLED:
+            logger.info("AlgoTune instrumentation disabled via env var")
+            return
+
+        tracer_provider = kwargs.get("tracer_provider")
+        tracer = trace_api.get_tracer(
+            __name__,
+            __version__,
+            tracer_provider=tracer_provider,
+        )
+
+        from opentelemetry.instrumentation.algotune.internal.wrappers import (
+            EvaluateSingleWrapper,
+            GetBaselineTimesWrapper,
+            GetResponseWrapper,
+            HandleCommandWrapper,
+            HandleFunctionCallWrapper,
+            LiteLLMExecuteQueryWrapper,
+            LiteLLMQueryWrapper,
+            MainWrapper,
+            RunTaskWrapper,
+            RunnerEvalDatasetWrapper,
+            TogetherModelQueryWrapper,
+        )
+
+        wrappers_by_name: dict[str, Any] = {
+            "main": MainWrapper(tracer),
+            "run_task": RunTaskWrapper(tracer),
+            "get_response": GetResponseWrapper(tracer),
+            "handle_function_call": HandleFunctionCallWrapper(),
+            "handle_command": HandleCommandWrapper(tracer),
+            "_runner_eval_dataset": RunnerEvalDatasetWrapper(tracer),
+            "evaluate_single": EvaluateSingleWrapper(tracer),
+            "get_baseline_times": GetBaselineTimesWrapper(tracer),
+            "query": LiteLLMQueryWrapper(),
+            "_execute_query": LiteLLMExecuteQueryWrapper(),
+        }
+
+        for logical_name, module_path, qualname in _PATCH_SITES:
+            wrapper = wrappers_by_name.get(logical_name)
+            if wrapper is None:
+                continue
+            if not _safe_wrap(module_path, qualname, wrapper):
+                logger.info(
+                    "AlgoTune: %s not yet importable; skipping wrap",
+                    f"{module_path}.{qualname}",
+                )
+
+        if ALGOTUNE_OTEL_INSTRUMENT_TOGETHER:
+            logical, module_path, qualname = _TOGETHER_PATCH_SITE
+            _safe_wrap(
+                module_path,
+                qualname,
+                TogetherModelQueryWrapper(tracer),
+            )
+
+        # Best-effort sanity check: warn if no LiteLLM instrumentor is
+        # loaded -- the trace tree will still be valid but LLM spans will
+        # be missing.
+        if not _is_litellm_instrumented():
+            logger.warning(
+                "AlgoTune instrumentation: litellm.completion does not look"
+                " instrumented. LLM spans will be missing from the trace"
+                " tree. Load opentelemetry-instrumentation-litellm (or"
+                " openinference-instrumentation-litellm) before AlgoTune"
+                " starts."
+            )
+
+    def _uninstrument(self, **kwargs: Any) -> None:
+        for _logical, module_path, qualname in _PATCH_SITES:
+            _safe_unwrap(module_path, qualname)
+        _logical, module_path, qualname = _TOGETHER_PATCH_SITE
+        _safe_unwrap(module_path, qualname)
+
+
+def _is_litellm_instrumented() -> bool:
+    """Return ``True`` iff ``litellm.completion`` appears to be wrapped.
+
+    Heuristic: looks for ``__wrapped__`` (set by ``wrapt``, but also by any
+    ``functools.wraps`` decorator, so false positives are possible). Returns
+    ``False`` (warning still emitted) when ``litellm`` is not importable --
+    in that case AlgoTune will fail before we emit spans anyway.
+    """
+    try:
+        import litellm  # noqa: PLC0415
+    except ImportError:
+        return False
+    completion = getattr(litellm, "completion", None)
+    if completion is None:
+        return False
+    return hasattr(completion, "__wrapped__")
diff --git a/instrumentation-loongsuite/loongsuite-instrumentation-algotune/src/opentelemetry/instrumentation/algotune/config.py b/instrumentation-loongsuite/loongsuite-instrumentation-algotune/src/opentelemetry/instrumentation/algotune/config.py
new file mode 100644
index 000000000..f012b43b2
--- /dev/null
+++ b/instrumentation-loongsuite/loongsuite-instrumentation-algotune/src/opentelemetry/instrumentation/algotune/config.py
@@ -0,0 +1,75 @@
+"""Configuration via environment variables for AlgoTune instrumentation."""
+
+from __future__ import annotations
+
+import os
+
+
+def _bool_env(name: str, default: bool) -> bool:
+    """Parse a boolean env var; an unset variable yields ``default``."""
+    raw = os.getenv(name)
+    truthy = ("true", "1", "yes", "on")
+    return default if raw is None else raw.strip().lower() in truthy
+
+
+def _int_env(name: str, default: str) -> int:
+    try:
+        return int(os.getenv(name, default))
+    except ValueError:
+        return int(default)
+
+
+def _float_env(name: str, default: str) -> float:
+    try:
+        return float(os.getenv(name, default))
+    except ValueError:
+        return float(default)
+
+
+def _genai_capture_enabled() -> bool:
+    """Return whether GenAI message-content capture is switched on.
+
+    Accepts plain truthy strings as well as the ``SPAN_ONLY`` /
+    ``SPAN_AND_EVENT`` / ``EVENT_ONLY`` capture modes, case-insensitively.
+    """
+    raw = os.getenv("OTEL_INSTRUMENTATION_GENAI_CAPTURE_MESSAGE_CONTENT")
+    if raw is None:
+        return False
+    truthy = ("TRUE", "1", "YES", "ON")
+    capture_modes = ("SPAN_ONLY", "SPAN_AND_EVENT", "EVENT_ONLY")
+    token = raw.strip().upper()
+    return token in truthy or token in capture_modes
+
+
+# Master enable switch
+OTEL_INSTRUMENTATION_ALGOTUNE_ENABLED = _bool_env(
+    "OTEL_INSTRUMENTATION_ALGOTUNE_ENABLED", True
+)
+
+# Whether to capture potentially sensitive content (tool args/results).
+# LLM message content is controlled by the LiteLLM instrumentor itself.
+OTEL_INSTRUMENTATION_GENAI_CAPTURE_MESSAGE_CONTENT = _genai_capture_enabled()
+
+# Maximum length of any string attribute the instrumentor produces.
+ALGOTUNE_OTEL_MAX_CONTENT_LENGTH = _int_env(
+    "ALGOTUNE_OTEL_MAX_CONTENT_LENGTH", "4096"
+)
+
+# Slow-call thresholds (seconds) used by the Span-to-Metrics processor.
+ALGOTUNE_OTEL_SLOW_TOOL_SECONDS = _float_env(
+    "ALGOTUNE_OTEL_SLOW_TOOL_SECONDS", "30"
+)
+ALGOTUNE_OTEL_SLOW_TASK_SECONDS = _float_env(
+    "ALGOTUNE_OTEL_SLOW_TASK_SECONDS", "60"
+)
+ALGOTUNE_OTEL_SLOW_AGENT_SECONDS = _float_env(
+    "ALGOTUNE_OTEL_SLOW_AGENT_SECONDS", "300"
+)
+
+# Whether to wrap TogetherModel.query() with a manual LLM span.
+# TogetherModel hits the Together API directly via requests.post and is NOT
+# covered by the LiteLLM instrumentor. Default off so the LiteLLM-only
+# environments stay clean.
+ALGOTUNE_OTEL_INSTRUMENT_TOGETHER = _bool_env(
+    "ALGOTUNE_OTEL_INSTRUMENT_TOGETHER", False
+)
diff --git a/instrumentation-loongsuite/loongsuite-instrumentation-algotune/src/opentelemetry/instrumentation/algotune/internal/__init__.py b/instrumentation-loongsuite/loongsuite-instrumentation-algotune/src/opentelemetry/instrumentation/algotune/internal/__init__.py
new file mode 100644
index 000000000..e69de29bb
diff --git a/instrumentation-loongsuite/loongsuite-instrumentation-algotune/src/opentelemetry/instrumentation/algotune/internal/utils.py b/instrumentation-loongsuite/loongsuite-instrumentation-algotune/src/opentelemetry/instrumentation/algotune/internal/utils.py
new file mode 100644
index 000000000..47836b435
--- /dev/null
+++ b/instrumentation-loongsuite/loongsuite-instrumentation-algotune/src/opentelemetry/instrumentation/algotune/internal/utils.py
@@ -0,0 +1,120 @@
+"""Shared helpers for AlgoTune wrappers."""
+
+from __future__ import annotations
+
+from typing import Any, Optional
+from opentelemetry.semconv._incubating.attributes import (
+    gen_ai_attributes as GenAI,
+)
+
+from opentelemetry.instrumentation.algotune.config import (
+    ALGOTUNE_OTEL_MAX_CONTENT_LENGTH,
+)
+
+# Aliyun ARMS GenAI conventions (mirrors the values used by the other Robin
+# instrumentations such as minisweagent / pinchbench).
+GEN_AI_SPAN_KIND = "gen_ai.span.kind"
+GEN_AI_FRAMEWORK = "gen_ai.framework"
+GEN_AI_USAGE_TOTAL_TOKENS = "gen_ai.usage.total_tokens"
+
+ALGOTUNE_FRAMEWORK_VALUE = "AlgoTune"
+
+# Instance attribute names used by wrappers to share state across hooks
+# without polluting AlgoTune's public API.
+INST_STEP_SPAN_ATTR = "_otel_algo_step_span"
+INST_STEP_TOKEN_ATTR = "_otel_algo_step_token"
+INST_ROUND_ATTR = "_otel_algo_round"
+INST_LITELLM_ATTEMPTS_ATTR = "_otel_algo_litellm_attempts"
+
+
+def truncate(text: Any, max_len: int = ALGOTUNE_OTEL_MAX_CONTENT_LENGTH) -> str:
+    """Coerce ``text`` to ``str`` and cap it at ``max_len`` chars (min 0)."""
+    if text is None:
+        return ""
+    try:
+        text = text if isinstance(text, str) else str(text)
+    except Exception:  # noqa: BLE001
+        return ""
+    if len(text) <= max_len:
+        return text
+    if max_len <= 3:
+        # Too short for "..."; clamp so negatives never slice from the end.
+        return text[: max(max_len, 0)]
+    return text[: max_len - 3] + "..."
+
+
+def provider_from_model(model_name: str) -> str:
+    """Guess the provider for a LiteLLM-style model identifier.
+
+    A recognised ``provider/`` prefix (e.g. ``openai/gpt-4o``,
+    ``anthropic/claude-3-5-sonnet``) wins; otherwise, or when the prefix
+    is not recognised, substring heuristics on the whole name are used.
+    Returns ``"unknown"`` for empty names and names matching nothing.
+    """
+    if not model_name:
+        return "unknown"
+    lowered = model_name.lower()
+
+    # Recognised LiteLLM prefixes mapped to the provider name we report.
+    prefix_to_provider = {
+        "openai": "openai",
+        "anthropic": "anthropic",
+        "vertex_ai": "google",
+        "gemini": "google",
+        "google": "google",
+        "mistral": "mistral",
+        "azure": "azure",
+        "azure_ai": "azure",
+        "bedrock": "bedrock",
+        "groq": "groq",
+        "deepseek": "deepseek",
+        "openrouter": "openrouter",
+        "together_ai": "together_ai",
+    }
+
+    head, slash, _tail = lowered.partition("/")
+    if slash and head in prefix_to_provider:
+        return prefix_to_provider[head]
+
+    # Order matters: the first provider with a matching keyword wins.
+    keyword_table = (
+        ("anthropic", ("claude", "anthropic")),
+        ("google", ("gemini", "vertex", "google")),
+        ("mistral", ("mistral",)),
+        ("deepseek", ("deepseek",)),
+        ("dashscope", ("qwen", "dashscope")),
+        ("openai", ("gpt", "openai", "o1", "o3")),
+    )
+    for provider, keywords in keyword_table:
+        if any(keyword in lowered for keyword in keywords):
+            return provider
+    return "unknown"
+
+
+def safe_close_step(instance: Any) -> None:
+    """End any STEP span dangling on ``instance`` and detach its context.
+
+    Used as a safety net in ``run_task``'s ``finally`` block so that a STEP
+    span never outlives the AGENT span (e.g. when ``get_response`` returns
+    None and the loop ``break``s before ``handle_function_call`` runs, or
+    when an exception propagates past STEP cleanup).
+    """
+    from opentelemetry import context as otel_context  # local import
+
+    span = getattr(instance, INST_STEP_SPAN_ATTR, None)
+    token = getattr(instance, INST_STEP_TOKEN_ATTR, None)
+    try:
+        if span is not None and span.is_recording():
+            span.end()
+    except Exception:  # noqa: BLE001
+        pass
+    try:
+        if token is not None:
+            otel_context.detach(token)
+    except Exception:  # noqa: BLE001
+        pass
+    try:
+        setattr(instance, INST_STEP_SPAN_ATTR, None)
+        setattr(instance, INST_STEP_TOKEN_ATTR, None)
+    except Exception:  # noqa: BLE001
+        pass
diff --git a/instrumentation-loongsuite/loongsuite-instrumentation-algotune/src/opentelemetry/instrumentation/algotune/internal/wrappers.py b/instrumentation-loongsuite/loongsuite-instrumentation-algotune/src/opentelemetry/instrumentation/algotune/internal/wrappers.py
new file mode 100644
index 000000000..3f0c7b3bb
--- /dev/null
+++ b/instrumentation-loongsuite/loongsuite-instrumentation-algotune/src/opentelemetry/instrumentation/algotune/internal/wrappers.py
@@ -0,0 +1,1332 @@
+"""Wrapt wrappers for AlgoTune OpenTelemetry instrumentation.
+
+Span hierarchy (final selection)::
+
+    ENTRY: enter_ai_application_system          ← AlgoTuner.main:main()
+    └── AGENT: invoke_agent AlgoTuner           ← LLMInterface.run_task()
+        ├── STEP: react step  [round=N]         ← get_response + handle_function_call
+        │   ├── LLM:  chat <model>              ← LiteLLM instrumentor (auto)
+        │   │                                     OR TogetherModel.query (this pkg)
+        │   └── TOOL: execute_tool <command>    ← CommandHandlers.handle_command
+        │       └── TASK: run_task benchmark.dataset_eval ← _runner_eval_dataset
+        │           ├── TASK: run_task benchmark.baseline_generation ← get_baseline_times
+        │           └── TASK: run_task benchmark.problem_eval [×N] ← evaluate_single
+        └── ...
+
+This module never creates LLM spans for the LiteLLM path. The LiteLLM
+instrumentor (loaded separately at runtime) is responsible for that and
+naturally becomes a child of the active STEP span via OpenTelemetry
+context propagation. The only LLM-layer hook here is a lightweight
+attempt counter (``algo.llm.retry_count``) written onto the STEP span.
+"""
+
+from __future__ import annotations
+
+import json
+import logging
+import os
+import sys
+import uuid
+from typing import Any, Callable, Optional
+
+from opentelemetry import context as otel_context
+from opentelemetry import trace as trace_api
+from opentelemetry.instrumentation.algotune.config import (
+    OTEL_INSTRUMENTATION_GENAI_CAPTURE_MESSAGE_CONTENT,
+)
+from opentelemetry.instrumentation.algotune.internal.utils import (
+    ALGOTUNE_FRAMEWORK_VALUE,
+    GEN_AI_FRAMEWORK,
+    GEN_AI_SPAN_KIND,
+    GEN_AI_USAGE_TOTAL_TOKENS,
+    INST_LITELLM_ATTEMPTS_ATTR,
+    INST_ROUND_ATTR,
+    INST_STEP_SPAN_ATTR,
+    INST_STEP_TOKEN_ATTR,
+    provider_from_model,
+    safe_close_step,
+    truncate,
+)
+from opentelemetry.semconv._incubating.attributes import (
+    gen_ai_attributes as GenAI,
+)
+from opentelemetry.trace import (
+    Span,
+    SpanKind,
+    Status,
+    StatusCode,
+    Tracer,
+    set_span_in_context,
+)
+
+logger = logging.getLogger(__name__)
+
+
+def _algotune_capture_span_content_enabled() -> bool:
+    """Return whether content may be recorded on span attributes.
+
+    Reads the environment on each call; ``EVENT_ONLY`` does not enable it.
+    """
+    mode = os.getenv("OTEL_INSTRUMENTATION_GENAI_CAPTURE_MESSAGE_CONTENT", "")
+    truthy = ("TRUE", "1", "YES", "ON")
+    span_modes = ("SPAN_ONLY", "SPAN_AND_EVENT")
+    token = mode.strip().upper()
+    return token in truthy or token in span_modes
+
+
+def _text_value(value: Any) -> str:
+    """Render ``value`` as text: ``None`` -> ``""``, ``str`` as-is, else JSON."""
+    if value is None or isinstance(value, str):
+        return "" if value is None else value
+    try:
+        return json.dumps(value, ensure_ascii=False, default=str)
+    except Exception:  # noqa: BLE001
+        # Not JSON-serialisable even with ``default=str``; use plain ``str``.
+        return str(value)
+
+
+def _span_message(role: str, content: Any) -> dict[str, Any]:
+    """Build a message dict holding ``content`` as one truncated text part.
+
+    A falsy ``role`` is reported as ``"user"``.
+    """
+    text_part = {"type": "text", "content": truncate(_text_value(content))}
+    return {"role": role or "user", "parts": [text_part]}
+
+
+def _algotune_tool_definitions() -> list[dict[str, Any]]:
+    """Describe AlgoTune's ``COMMAND_FORMATS`` as function-tool definitions.
+
+    Returns an empty list when ``COMMAND_FORMATS`` cannot be imported.
+    """
+    try:
+        from AlgoTuner.interfaces.commands.types import (  # noqa: PLC0415
+            COMMAND_FORMATS,
+        )
+    except Exception:  # noqa: BLE001
+        return []
+
+    def _definition(name: Any, fmt: Any) -> dict[str, Any]:
+        # Missing description/example fall back to generic text.
+        description = str(
+            getattr(fmt, "description", "") or f"AlgoTune command {name}"
+        )
+        example = str(getattr(fmt, "example", "") or "").strip()
+        command_schema = {
+            "type": "string",
+            "description": truncate(example or description),
+        }
+        return {
+            "type": "function",
+            "name": str(name),
+            "description": truncate(description),
+            "parameters": {
+                "type": "object",
+                "properties": {"command": command_schema},
+                "required": ["command"],
+            },
+        }
+
+    return [_definition(name, fmt) for name, fmt in COMMAND_FORMATS.items()]
+
+
+def _agent_content_attributes(instance: Any) -> dict[str, Any]:
+    if not _algotune_capture_span_content_enabled():
+        return {}
+
+    state = getattr(instance, "state", None)
+    messages = list(getattr(state, "messages", None) or [])
+    input_messages: list[dict[str, Any]] = []
+    output_messages: list[dict[str, Any]] = []
+    system_instructions: list[dict[str, Any]] = []
+
+    for msg in messages[-20:]:
+        if not isinstance(msg, dict):
+            continue
+        role = str(msg.get("role") or "user")
+        content = msg.get("content")
+        if role == "assistant":
+            output_messages.append(_span_message("assistant", content))
+        elif role == "system":
+            system_instructions.append(
+                {"type": "text", "content": truncate(_text_value(content))}
+            )
+        else:
+            input_messages.append(_span_message(role, content))
+
+    # AlgoTune puts its application instructions in the first user message.
+    # Surface that separately for UIs that render system instructions.
+    if not system_instructions and messages:
+        first = messages[0]
+        if isinstance(first, dict) and first.get("content"):
+            system_instructions.append(
+                {
+                    "type": "text",
+                    "content": truncate(_text_value(first.get("content"))),
+                }
+            )
+
+    tool_definitions = _algotune_tool_definitions()
+    attrs: dict[str, Any] = {
+        "algo.debug.input_messages.count": len(input_messages),
+        "algo.debug.output_messages.count": len(output_messages),
+        "algo.debug.system_instructions.count": len(system_instructions),
+        "algo.debug.tool_definitions.count": len(tool_definitions),
+    }
+
+    # Keep parent span output compact; large parent attributes are commonly
+    # harder to render than LLM child attributes in trace UIs.
+    output_payload = output_messages[-1:] if output_messages else []
+    attrs["gen_ai.output.messages"] = json.dumps(
+        output_payload, ensure_ascii=False, default=str
+    )
+    if output_payload:
+        try:
+            attrs["output.value"] = truncate(
+                _text_value(output_payload[-1]["parts"][0].get("content", ""))
+            )
+        except Exception:  # noqa: BLE001
+            pass
+
+    if input_messages:
+        attrs["gen_ai.input.messages"] = json.dumps(
+            input_messages[-6:], ensure_ascii=False, default=str
+        )
+    if system_instructions:
+        attrs["gen_ai.system_instructions"] = json.dumps(
+            system_instructions[:1], ensure_ascii=False, default=str
+        )
+    if tool_definitions:
+        attrs["gen_ai.tool.definitions"] = json.dumps(
+            tool_definitions, ensure_ascii=False, default=str
+        )
+    return attrs
+
+
+def _publish_agent_content_attributes(instance: Any, *spans: Span) -> None:
+    """Copy the agent content attributes onto every recording span given."""
+    attrs = _agent_content_attributes(instance)
+    for span in spans if attrs else ():
+        try:
+            if span is None or not span.is_recording():
+                continue
+            span.set_attributes(attrs)
+        except Exception:  # noqa: BLE001
+            pass
+
+
+def _task_json_value(value: Any) -> str:
+    try:
+        return truncate(json.dumps(value, ensure_ascii=False, default=str))
+    except Exception:  # noqa: BLE001
+        return truncate(str(value))
+
+
+def _set_task_input(span: Span, value: Any) -> None:
+    span.set_attribute("input.mime_type", "application/json")
+    span.set_attribute("input.value", _task_json_value(value))
+
+
+def _set_task_output(span: Span, value: Any) -> None:
+    span.set_attribute("output.mime_type", "application/json")
+    span.set_attribute("output.value", _task_json_value(value))
+
+
+# ---------------------------------------------------------------------------
+# ENTRY: AlgoTuner.main.main()
+# ---------------------------------------------------------------------------
+
+
+class MainWrapper:
+    """ENTRY span around ``AlgoTuner.main.main()``."""
+
+    __slots__ = ("_tracer",)
+
+    def __init__(self, tracer: Tracer):
+        self._tracer = tracer
+
+    def __call__(
+        self,
+        wrapped: Callable[..., Any],
+        instance: Any,
+        args: tuple[Any, ...],
+        kwargs: dict[str, Any],
+    ) -> Any:
+        session_id = uuid.uuid4().hex
+        argv_repr = ""
+        try:
+            argv_repr = " ".join(map(str, sys.argv[1:8]))
+        except Exception:  # noqa: BLE001
+            pass
+
+        with self._tracer.start_as_current_span(
+            "enter_ai_application_system", kind=SpanKind.INTERNAL
+        ) as span:
+            span.set_attribute(GEN_AI_SPAN_KIND, "ENTRY")
+            span.set_attribute("gen_ai.operation.name", "enter")
+            span.set_attribute(GEN_AI_FRAMEWORK, ALGOTUNE_FRAMEWORK_VALUE)
+            span.set_attribute("gen_ai.session.id", session_id)
+            if argv_repr:
+                span.set_attribute("algotune.invocation.argv", truncate(argv_repr))
+
+            # Best-effort: pull --model and --task out of sys.argv so the
+            # ENTRY span carries the user's intent before main() finishes.
+            try:
+                argv = list(sys.argv[1:])
+                for i, tok in enumerate(argv):
+                    if tok == "--model" and i + 1 < len(argv):
+                        span.set_attribute(
+                            GenAI.GEN_AI_REQUEST_MODEL, argv[i + 1]
+                        )
+                    elif tok == "--task" and i + 1 < len(argv):
+                        span.set_attribute("algo.task.name", argv[i + 1])
+            except Exception:  # noqa: BLE001
+                pass
+
+            try:
+                return wrapped(*args, **kwargs)
+            except SystemExit as exc:
+                # ``sys.exit("msg")`` exits with status 1; ``None`` means 0.
+                raw = exc.code
+                code = raw if isinstance(raw, int) else int(raw is not None)
+                if code:
+                    span.set_attribute("algotune.exit_code", int(code))
+                    span.set_status(Status(StatusCode.ERROR, f"sys.exit({code})"))
+                raise
+            except MemoryError as exc:
+                span.set_attribute("error.type", "MemoryError")
+                span.record_exception(exc)
+                span.set_status(Status(StatusCode.ERROR, "MemoryError"))
+                raise
+            except Exception as exc:
+                span.record_exception(exc)
+                span.set_status(Status(StatusCode.ERROR))
+                raise
+
+
+# ---------------------------------------------------------------------------
+# AGENT: LLMInterface.run_task()
+# ---------------------------------------------------------------------------
+
+
+class RunTaskWrapper:
+    """AGENT span around ``LLMInterface.run_task()``.
+
+    Besides opening the span, the wrapper resets the per-instance round
+    counter and STEP bookkeeping before the call and, once the call ends
+    (normally or not), closes any dangling STEP span and publishes summary
+    attributes. All bookkeeping is best-effort: a telemetry failure never
+    replaces the wrapped call's own result or exception.
+    """
+
+    __slots__ = ("_tracer",)
+
+    def __init__(self, tracer: Tracer):
+        self._tracer = tracer
+
+    def __call__(
+        self,
+        wrapped: Callable[..., Any],
+        instance: Any,
+        args: tuple[Any, ...],
+        kwargs: dict[str, Any],
+    ) -> Any:
+        """Invoke ``wrapped`` inside an ``invoke_agent AlgoTuner`` span.
+
+        Args:
+            wrapped: The original ``run_task`` callable.
+            instance: Object ``run_task`` is bound to; carries the round
+                counter and STEP state used by the sibling wrappers.
+            args: Positional arguments forwarded to ``wrapped``.
+            kwargs: Keyword arguments forwarded to ``wrapped``.
+
+        Returns:
+            Whatever ``wrapped`` returns.
+
+        Raises:
+            BaseException: Anything raised by ``wrapped`` is re-raised
+                unchanged after the span has been annotated.
+        """
+        # Reset round counter at the beginning of each AGENT invocation.
+        try:
+            setattr(instance, INST_ROUND_ATTR, 0)
+            setattr(instance, INST_STEP_SPAN_ATTR, None)
+            setattr(instance, INST_STEP_TOKEN_ATTR, None)
+        except Exception:  # noqa: BLE001
+            pass
+
+        model_name = str(getattr(instance, "model_name", "") or "")
+        # Captured before the AGENT span becomes current so the enclosing
+        # span can be handed to the content publisher later.
+        parent_span = trace_api.get_current_span()
+
+        with self._tracer.start_as_current_span(
+            "invoke_agent AlgoTuner", kind=SpanKind.INTERNAL
+        ) as span:
+            span.set_attribute(GEN_AI_SPAN_KIND, "AGENT")
+            span.set_attribute(
+                GenAI.GEN_AI_OPERATION_NAME,
+                GenAI.GenAiOperationNameValues.INVOKE_AGENT.value,
+            )
+            span.set_attribute(GEN_AI_FRAMEWORK, ALGOTUNE_FRAMEWORK_VALUE)
+            span.set_attribute(GenAI.GEN_AI_AGENT_NAME, "AlgoTuner")
+            span.set_attribute(
+                GenAI.GEN_AI_AGENT_DESCRIPTION,
+                "Iterative code optimization agent for benchmark tasks",
+            )
+            if model_name:
+                span.set_attribute(GenAI.GEN_AI_REQUEST_MODEL, model_name)
+                span.set_attribute(
+                    GenAI.GEN_AI_PROVIDER_NAME,
+                    provider_from_model(model_name),
+                )
+
+            terminated_reason: str = "unknown"
+            try:
+                result = wrapped(*args, **kwargs)
+                terminated_reason = self._infer_termination_reason(instance)
+                return result
+            except (KeyboardInterrupt, SystemExit) as exc:
+                terminated_reason = type(exc).__qualname__
+                if isinstance(exc, SystemExit):
+                    # Mirror the interpreter: ``None`` -> 0, int -> itself,
+                    # any other object (e.g. a message string) -> 1.
+                    raw_code = exc.code
+                    if raw_code is None:
+                        code = 0
+                    elif isinstance(raw_code, int):
+                        code = int(raw_code)
+                    else:
+                        code = 1
+                    if code:
+                        span.set_status(
+                            Status(StatusCode.ERROR, f"sys.exit({code})")
+                        )
+                raise
+            except Exception as exc:
+                terminated_reason = "exception"
+                span.record_exception(exc)
+                span.set_status(Status(StatusCode.ERROR))
+                raise
+            finally:
+                self._finalize_span(
+                    instance, span, parent_span, terminated_reason
+                )
+
+    @staticmethod
+    def _finalize_span(
+        instance: Any,
+        span: Span,
+        parent_span: Span,
+        terminated_reason: str,
+    ) -> None:
+        """Publish end-of-run attributes; each step is guarded separately so
+        one failure neither skips the rest nor escapes to the caller."""
+        # Always close any dangling STEP span first so the trace tree
+        # never has STEP outliving AGENT.
+        try:
+            safe_close_step(instance)
+        except Exception:  # noqa: BLE001
+            pass
+
+        try:
+            rounds = int(getattr(instance, INST_ROUND_ATTR, 0) or 0)
+        except Exception:  # noqa: BLE001
+            rounds = 0
+        span.set_attribute("algo.agent.total_rounds", rounds)
+        span.set_attribute("algo.agent.final_status", terminated_reason)
+
+        try:
+            _publish_agent_content_attributes(instance, span, parent_span)
+        except Exception:  # noqa: BLE001
+            pass
+
+        # Spend / final eval bookkeeping (best-effort; AlgoTune may
+        # have torn the interface down by now).
+        try:
+            state = getattr(instance, "state", None)
+            if state is not None:
+                spend = getattr(state, "spend", None)
+                if spend is not None:
+                    span.set_attribute("algo.agent.spend_usd", float(spend))
+        except Exception:  # noqa: BLE001
+            pass
+
+        try:
+            final_success = getattr(instance, "_final_eval_success", None)
+            if final_success is not None:
+                span.set_attribute(
+                    "algo.agent.final_eval_success",
+                    bool(final_success),
+                )
+            final_eval_result = getattr(
+                instance, "_final_eval_metrics", None
+            )
+            if isinstance(final_eval_result, dict):
+                ms = final_eval_result.get("mean_speedup")
+                if ms is not None:
+                    try:
+                        span.set_attribute(
+                            "algo.agent.final_mean_speedup", float(ms)
+                        )
+                    except (TypeError, ValueError):
+                        pass
+        except Exception:  # noqa: BLE001
+            pass
+
+        span.add_event(
+            "agent.loop.terminated",
+            {"reason": terminated_reason},
+        )
+
+    @staticmethod
+    def _infer_termination_reason(instance: Any) -> str:
+        """Return ``"terminated_by_limit"`` when ``instance.check_limits()``
+        is callable and truthy, otherwise ``"completed"``.
+
+        Heuristic meant to align with the loop logic in
+        ``LLMInterface.run_task()``. NOTE(review): this calls
+        ``check_limits()`` once more after the run; confirm that call has no
+        side effects in the instrumented application.
+        """
+        try:
+            check = getattr(instance, "check_limits", None)
+            if callable(check) and check():
+                return "terminated_by_limit"
+        except Exception:  # noqa: BLE001
+            pass
+        return "completed"
+
+
+# ---------------------------------------------------------------------------
+# STEP: LLMInterface.get_response() + handle_function_call()
+# ---------------------------------------------------------------------------
+
+
+class GetResponseWrapper:
+    """Start a STEP span whenever ``get_response`` begins a new react round.
+
+    The span is ended here only when the wrapped call raises or returns
+    ``None``. Otherwise it stays open, attached to the current context, and
+    ``HandleFunctionCallWrapper`` is expected to finish it.
+    """
+
+    __slots__ = ("_tracer",)
+
+    def __init__(self, tracer: Tracer):
+        self._tracer = tracer
+
+    def __call__(
+        self,
+        wrapped: Callable[..., Any],
+        instance: Any,
+        args: tuple[Any, ...],
+        kwargs: dict[str, Any],
+    ) -> Any:
+        """Open the STEP span, call ``wrapped`` and return its response."""
+        # A STEP left over from the previous round is closed first; this is
+        # the empty-response retry path, where the loop ``continue``s and
+        # handle_function_call never runs.
+        safe_close_step(instance)
+
+        next_round = int(getattr(instance, INST_ROUND_ATTR, 0) or 0) + 1
+        try:
+            setattr(instance, INST_ROUND_ATTR, next_round)
+            setattr(instance, INST_LITELLM_ATTEMPTS_ATTR, 0)
+        except Exception:  # noqa: BLE001
+            pass
+
+        step_span = self._tracer.start_span(
+            "react step", kind=SpanKind.INTERNAL
+        )
+        for attr_key, attr_value in (
+            (GEN_AI_SPAN_KIND, "STEP"),
+            ("gen_ai.operation.name", "react"),
+            (GEN_AI_FRAMEWORK, ALGOTUNE_FRAMEWORK_VALUE),
+            ("gen_ai.react.round", next_round),
+        ):
+            step_span.set_attribute(attr_key, attr_value)
+
+        # Make the STEP span current and remember both span and token on the
+        # instance so a later wrapper call can end and detach them.
+        token = otel_context.attach(set_span_in_context(step_span))
+        try:
+            setattr(instance, INST_STEP_SPAN_ATTR, step_span)
+            setattr(instance, INST_STEP_TOKEN_ATTR, token)
+        except Exception:  # noqa: BLE001
+            pass
+
+        try:
+            response = wrapped(*args, **kwargs)
+        except BaseException as exc:
+            step_span.set_attribute(
+                "gen_ai.react.finish_reason", type(exc).__qualname__
+            )
+            step_span.record_exception(exc)
+            step_span.set_status(Status(StatusCode.ERROR))
+            self._finish_step(instance, step_span, token)
+            raise
+
+        if response is not None:
+            # Non-empty response: the STEP stays open on purpose.
+            return response
+
+        step_span.set_attribute("algo.step.response_empty", True)
+        step_span.set_attribute(
+            "gen_ai.react.finish_reason", "empty_response_retry"
+        )
+        self._finish_step(instance, step_span, token)
+        return response
+
+    @classmethod
+    def _finish_step(cls, instance: Any, span: Span, token: Any) -> None:
+        """Publish the retry count, end ``span``, then detach ``token`` and
+        clear the STEP state even if ending the span fails."""
+        cls._publish_attempt_count(instance, span)
+        try:
+            span.end()
+        finally:
+            otel_context.detach(token)
+            _clear_step_state(instance)
+
+    @staticmethod
+    def _publish_attempt_count(instance: Any, span: Span) -> None:
+        """Copy a non-zero attempt counter from ``instance`` onto ``span``."""
+        try:
+            recorded = getattr(instance, INST_LITELLM_ATTEMPTS_ATTR, 0)
+            attempts = int(recorded or 0)
+            if attempts:
+                span.set_attribute("algo.llm.retry_count", attempts)
+        except Exception:  # noqa: BLE001
+            pass
+
+
+class HandleFunctionCallWrapper:
+    """Finish the STEP span started by ``GetResponseWrapper`` once the tool
+    call has returned or raised."""
+
+    __slots__ = ()
+
+    def __call__(
+        self,
+        wrapped: Callable[..., Any],
+        instance: Any,
+        args: tuple[Any, ...],
+        kwargs: dict[str, Any],
+    ) -> Any:
+        """Call ``wrapped``, annotate the open STEP span and close it."""
+        # Span and token are read before the call.
+        step_span: Optional[Span] = getattr(
+            instance, INST_STEP_SPAN_ATTR, None
+        )
+        ctx_token = getattr(instance, INST_STEP_TOKEN_ATTR, None)
+
+        try:
+            outcome = wrapped(*args, **kwargs)
+        except BaseException as exc:
+            if self._is_live(step_span):
+                step_span.set_attribute(
+                    "gen_ai.react.finish_reason", type(exc).__qualname__
+                )
+                step_span.record_exception(exc)
+                step_span.set_status(Status(StatusCode.ERROR))
+            self._close_step(instance, step_span, ctx_token)
+            raise
+
+        if self._is_live(step_span):
+            self._annotate_success(instance, step_span, outcome)
+        self._close_step(instance, step_span, ctx_token)
+        return outcome
+
+    @staticmethod
+    def _is_live(span: Optional[Span]) -> bool:
+        """Tell whether ``span`` exists and is still recording."""
+        return span is not None and span.is_recording()
+
+    @staticmethod
+    def _annotate_success(instance: Any, span: Span, result: Any) -> None:
+        """Record command name, finish reason and retry count on ``span``."""
+        command = _extract_command_name(result)
+        if command:
+            span.set_attribute("algo.step.command_name", command)
+        # The finish reason is the same for every successful return.
+        span.set_attribute("gen_ai.react.finish_reason", "tool_executed")
+        try:
+            recorded = getattr(instance, INST_LITELLM_ATTEMPTS_ATTR, 0)
+            attempts = int(recorded or 0)
+            if attempts:
+                span.set_attribute("algo.llm.retry_count", attempts)
+        except Exception:  # noqa: BLE001
+            pass
+
+    @staticmethod
+    def _close_step(
+        instance: Any, span: Optional[Span], token: Optional[Any]
+    ) -> None:
+        """End ``span`` if still recording, detach ``token`` if present and
+        clear the STEP state; failures in either step are ignored."""
+        try:
+            still_open = span is not None and span.is_recording()
+            if still_open:
+                span.end()
+        except Exception:  # noqa: BLE001
+            pass
+        try:
+            if token is not None:
+                otel_context.detach(token)
+        except Exception:  # noqa: BLE001
+            pass
+        _clear_step_state(instance)
+
+
+def _clear_step_state(instance: Any) -> None:
+    """Reset the STEP span and context-token attributes on ``instance``.
+
+    Best-effort: a failure to assign is ignored.
+    """
+    try:
+        for attr_name in (INST_STEP_SPAN_ATTR, INST_STEP_TOKEN_ATTR):
+            setattr(instance, attr_name, None)
+    except Exception:  # noqa: BLE001
+        pass
+
+
+def _extract_command_name(result: Any) -> str:
+    """Recover the executed command name from a ``handle_function_call``
+    result.
+
+    Only dict results are inspected. The keys ``command``, ``name`` and
+    ``cmd`` are tried in that order at the top level and then inside a
+    nested ``data`` dict; the first non-empty string wins. The lookup is
+    kept loose because the AlgoTune handlers vary per command.
+
+    Returns:
+        The command name, or ``""`` when none can be found.
+    """
+    if not isinstance(result, dict):
+        return ""
+
+    name_keys = ("command", "name", "cmd")
+
+    def first_name(payload: dict) -> str:
+        for key in name_keys:
+            candidate = payload.get(key)
+            if isinstance(candidate, str) and candidate:
+                return candidate
+        return ""
+
+    top_level = first_name(result)
+    if top_level:
+        return top_level
+
+    nested = result.get("data")
+    return first_name(nested) if isinstance(nested, dict) else ""
+
+
+# ---------------------------------------------------------------------------
+# TOOL: CommandHandlers.handle_command()
+# ---------------------------------------------------------------------------
+
+
+class HandleCommandWrapper:
+    """TOOL span around ``CommandHandlers.handle_command``.
+
+    The span is named ``execute_tool <command>`` and carries the tool
+    attributes set below. Command arguments and the result message are
+    attached only when ``OTEL_INSTRUMENTATION_GENAI_CAPTURE_MESSAGE_CONTENT``
+    is truthy.
+    """
+
+    __slots__ = ("_tracer",)
+
+    def __init__(self, tracer: Tracer):
+        self._tracer = tracer
+
+    def __call__(
+        self,
+        wrapped: Callable[..., Any],
+        instance: Any,
+        args: tuple[Any, ...],
+        kwargs: dict[str, Any],
+    ) -> Any:
+        """Call ``wrapped`` inside a TOOL span and return its result.
+
+        Exceptions raised by ``wrapped`` are recorded on the span and
+        re-raised unchanged.
+        """
+        # The command is the first positional argument, or the
+        # ``command_str`` keyword when called without positionals.
+        command_obj = args[0] if args else kwargs.get("command_str")
+        cmd_name, cmd_args, is_error_response = _parse_command(command_obj)
+
+        span_name = f"execute_tool {cmd_name or 'unknown'}"
+        with self._tracer.start_as_current_span(
+            span_name, kind=SpanKind.INTERNAL
+        ) as span:
+            span.set_attribute(GEN_AI_SPAN_KIND, "TOOL")
+            span.set_attribute(
+                GenAI.GEN_AI_OPERATION_NAME,
+                GenAI.GenAiOperationNameValues.EXECUTE_TOOL.value,
+            )
+            span.set_attribute(GEN_AI_FRAMEWORK, ALGOTUNE_FRAMEWORK_VALUE)
+            span.set_attribute(GenAI.GEN_AI_TOOL_NAME, cmd_name or "unknown")
+            span.set_attribute(GenAI.GEN_AI_TOOL_TYPE, "function")
+            span.set_attribute(
+                GenAI.GEN_AI_TOOL_DESCRIPTION,
+                "AlgoTune internal command",
+            )
+            # A fresh random id is generated for every call.
+            span.set_attribute(GenAI.GEN_AI_TOOL_CALL_ID, uuid.uuid4().hex)
+
+            # Set when the incoming command object was a dict (see
+            # ``_parse_command``).
+            if is_error_response:
+                span.set_attribute("algotune.command.error_response", True)
+
+            if (
+                OTEL_INSTRUMENTATION_GENAI_CAPTURE_MESSAGE_CONTENT
+                and cmd_args is not None
+            ):
+                try:
+                    # ``default=str`` stringifies values json cannot encode.
+                    span.set_attribute(
+                        GenAI.GEN_AI_TOOL_CALL_ARGUMENTS,
+                        truncate(json.dumps(cmd_args, default=str)),
+                    )
+                except Exception:  # noqa: BLE001
+                    pass
+
+            try:
+                result = wrapped(*args, **kwargs)
+            except Exception as exc:
+                span.record_exception(exc)
+                span.set_status(Status(StatusCode.ERROR))
+                raise
+
+            # Only dict results are inspected; any other result type is
+            # returned without further span attributes.
+            if isinstance(result, dict):
+                # A missing ``success`` key counts as failure.
+                success = bool(result.get("success", False))
+                span.set_attribute("algo.command.success", success)
+                # An error-response command is already flagged above, so it
+                # does not also get an ERROR status here.
+                if not success and not is_error_response:
+                    span.set_status(Status(StatusCode.ERROR, "command failed"))
+
+                if OTEL_INSTRUMENTATION_GENAI_CAPTURE_MESSAGE_CONTENT:
+                    msg = result.get("message")
+                    if msg:
+                        span.set_attribute(
+                            GenAI.GEN_AI_TOOL_CALL_RESULT,
+                            truncate(msg),
+                        )
+
+                # Best-effort snapshot detection (only present for ``edit``).
+                data = result.get("data")
+                if isinstance(data, dict):
+                    snap = data.get("snapshot_saved")
+                    if snap is not None:
+                        try:
+                            span.set_attribute(
+                                "algo.snapshot.saved", bool(snap)
+                            )
+                        except Exception:  # noqa: BLE001
+                            pass
+
+            return result
+
+
+def _parse_command(command_obj: Any) -> tuple[str, Optional[dict], bool]:
+    """Split the object handed to ``handle_command`` into
+    ``(command_name, args_dict, is_error_response)``.
+
+    A dict is treated as a validation/parsing error payload: its
+    ``command`` entry (or ``"error_response"``) becomes the name, there are
+    no arguments, and the flag is ``True``. Anything else is read through
+    its ``command`` and ``args`` attributes (AlgoTune passes a
+    ``ParsedCommand`` dataclass); ``args`` is returned only when it is a
+    dict and the name falls back to ``"unknown"``.
+    """
+    if isinstance(command_obj, dict):
+        # Validation/parsing error dict path.
+        error_name = command_obj.get("command") or "error_response"
+        return str(error_name), None, True
+
+    raw_name = getattr(command_obj, "command", None)
+    raw_args = getattr(command_obj, "args", None)
+    parsed_args = raw_args if isinstance(raw_args, dict) else None
+    return str(raw_name or "unknown"), parsed_args, False
+
+
+# ---------------------------------------------------------------------------
+# TASK(dataset_eval): CommandHandlers._runner_eval_dataset()
+# ---------------------------------------------------------------------------
+
+
+class RunnerEvalDatasetWrapper:
+    """TASK span around ``CommandHandlers._runner_eval_dataset``.
+
+    Records the evaluated subset, the command source and the aggregate
+    evaluation numbers found in the result. Result inspection is
+    best-effort and can never turn a successful call into an exception.
+    """
+
+    __slots__ = ("_tracer",)
+
+    def __init__(self, tracer: Tracer):
+        self._tracer = tracer
+
+    def __call__(
+        self,
+        wrapped: Callable[..., Any],
+        instance: Any,
+        args: tuple[Any, ...],
+        kwargs: dict[str, Any],
+    ) -> Any:
+        """Call ``wrapped`` inside a ``run_task benchmark.dataset_eval`` span.
+
+        Args:
+            wrapped: The original ``_runner_eval_dataset`` callable.
+            instance: Object the callable is bound to; its ``interface``
+                attribute is consulted for ``max_samples``.
+            args: Positional arguments; the first two are read as
+                ``data_subset`` and ``command_source``.
+            kwargs: Keyword arguments forwarded to ``wrapped``.
+
+        Returns:
+            Whatever ``wrapped`` returns.
+
+        Raises:
+            Exception: Re-raised unchanged after being recorded on the span.
+        """
+        data_subset = (
+            args[0] if len(args) >= 1 else kwargs.get("data_subset", "")
+        )
+        command_source = (
+            args[1] if len(args) >= 2 else kwargs.get("command_source", "")
+        )
+
+        with self._tracer.start_as_current_span(
+            "run_task benchmark.dataset_eval", kind=SpanKind.INTERNAL
+        ) as span:
+            span.set_attribute(GEN_AI_SPAN_KIND, "TASK")
+            span.set_attribute(GenAI.GEN_AI_OPERATION_NAME, "run_task")
+            span.set_attribute(GEN_AI_FRAMEWORK, ALGOTUNE_FRAMEWORK_VALUE)
+            span.set_attribute("gen_ai.task.name", "benchmark.dataset_eval")
+            if data_subset:
+                span.set_attribute("algo.eval.subset", str(data_subset))
+            if command_source:
+                span.set_attribute(
+                    "algo.eval.command_source", str(command_source)
+                )
+            _set_task_input(
+                span,
+                {
+                    "task": "benchmark.dataset_eval",
+                    "data_subset": str(data_subset) if data_subset else "",
+                    "command_source": str(command_source)
+                    if command_source
+                    else "",
+                },
+            )
+
+            # ``test_mode`` is true whenever the interface has a
+            # ``max_samples`` value set.
+            interface = getattr(instance, "interface", None)
+            try:
+                max_samples = getattr(interface, "max_samples", None)
+                span.set_attribute(
+                    "algo.eval.test_mode", max_samples is not None
+                )
+            except Exception:  # noqa: BLE001
+                pass
+
+            try:
+                result = wrapped(*args, **kwargs)
+            except Exception as exc:
+                span.record_exception(exc)
+                span.set_status(Status(StatusCode.ERROR))
+                raise
+
+            # From here on the wrapped call has succeeded: nothing below may
+            # prevent ``result`` from reaching the caller.
+            try:
+                self._record_eval_attributes(span, result)
+            except Exception:  # noqa: BLE001
+                pass
+            try:
+                result_data = result.data if hasattr(result, "data") else result
+                _set_task_output(
+                    span,
+                    {
+                        "success": getattr(result, "success", None),
+                        "status": getattr(result, "status", None),
+                        "message": getattr(result, "message", None),
+                        "data": result_data,
+                    },
+                )
+            except Exception:  # noqa: BLE001
+                pass
+            return result
+
+    @staticmethod
+    def _record_eval_attributes(span: Span, result: Any) -> None:
+        """Copy aggregate evaluation numbers from ``result`` onto ``span``.
+
+        ``result`` is typically a ``CommandResult`` dataclass whose ``.data``
+        carries the aggregate values, but raw dicts are accepted too.
+        Values that cannot be converted are skipped.
+        """
+        try:
+            data = result.data if hasattr(result, "data") else result
+        except Exception:  # noqa: BLE001
+            data = None
+
+        if not isinstance(data, dict):
+            return
+
+        # The aggregate payload may live at the top level or inside the
+        # ``aggregate_metrics`` / ``metrics`` / ``raw`` sub-dicts. Later
+        # candidates overwrite attributes set from earlier ones.
+        candidates = [data]
+        for key in ("aggregate_metrics", "metrics", "raw"):
+            sub = data.get(key)
+            if isinstance(sub, dict):
+                candidates.append(sub)
+
+        fields = (
+            ("num_evaluated", "algo.eval.total_problems", int),
+            ("mean_speedup", "algo.eval.mean_speedup", float),
+            ("num_valid", "algo.eval.num_valid", int),
+            ("num_invalid", "algo.eval.num_invalid", int),
+            ("num_timeout", "algo.eval.num_timeout", int),
+        )
+        for src in candidates:
+            for src_key, dst_attr, caster in fields:
+                value = src.get(src_key)
+                if value is None:
+                    continue
+                try:
+                    span.set_attribute(dst_attr, caster(value))
+                except (TypeError, ValueError, OverflowError):
+                    # OverflowError covers e.g. ``int(float("inf"))``.
+                    pass
+
+
+# ---------------------------------------------------------------------------
+# TASK(problem_eval): EvaluationOrchestrator.evaluate_single()
+# ---------------------------------------------------------------------------
+
+
+class EvaluateSingleWrapper:
+    """TASK span around ``EvaluationOrchestrator.evaluate_single``.
+
+    Problem identity is read from the keyword arguments only; a purely
+    positional call is labelled with the placeholder id ``"problem"`` and
+    index ``0``.
+    """
+
+    __slots__ = ("_tracer",)
+
+    def __init__(self, tracer: Tracer):
+        self._tracer = tracer
+
+    def __call__(
+        self,
+        wrapped: Callable[..., Any],
+        instance: Any,
+        args: tuple[Any, ...],
+        kwargs: dict[str, Any],
+    ) -> Any:
+        """Call ``wrapped`` inside a ``run_task benchmark.problem_eval`` span
+        and return its result; exceptions are recorded and re-raised."""
+        problem_id = kwargs.get("problem_id", "problem")
+        problem_index = kwargs.get("problem_index", 0)
+        baseline_time_ms = kwargs.get("baseline_time_ms")
+
+        with self._tracer.start_as_current_span(
+            "run_task benchmark.problem_eval", kind=SpanKind.INTERNAL
+        ) as span:
+            span.set_attribute(GEN_AI_SPAN_KIND, "TASK")
+            span.set_attribute(GenAI.GEN_AI_OPERATION_NAME, "run_task")
+            span.set_attribute(GEN_AI_FRAMEWORK, ALGOTUNE_FRAMEWORK_VALUE)
+            span.set_attribute("gen_ai.task.name", "benchmark.problem_eval")
+            span.set_attribute("algo.problem.id", str(problem_id))
+            try:
+                span.set_attribute("algo.problem.index", int(problem_index))
+            except (TypeError, ValueError):
+                pass
+            if baseline_time_ms is not None:
+                try:
+                    span.set_attribute(
+                        "algo.problem.baseline_time_ms",
+                        float(baseline_time_ms),
+                    )
+                except (TypeError, ValueError):
+                    pass
+
+            task_input = {
+                "task": "benchmark.problem_eval",
+                "problem_id": str(problem_id),
+                "problem_index": problem_index,
+                "baseline_time_ms": baseline_time_ms,
+                "kwargs": kwargs,
+            }
+            _set_task_input(span, task_input)
+
+            try:
+                result = wrapped(*args, **kwargs)
+            except Exception as exc:
+                span.record_exception(exc)
+                span.set_status(Status(StatusCode.ERROR))
+                raise
+
+            self._record_problem_attributes(span, result)
+            try:
+                task_output = {
+                    "speedup": _safe_get(result, "speedup"),
+                    "solver_time_ms": _safe_get(result, "solver_time_ms"),
+                    "is_valid": _safe_get(result, "is_valid"),
+                    "error_type": _safe_get(
+                        _safe_get(result, "execution"), "error_type"
+                    ),
+                }
+                _set_task_output(span, task_output)
+            except Exception:  # noqa: BLE001
+                pass
+            return result
+
+    @staticmethod
+    def _record_problem_attributes(span: Span, result: Any) -> None:
+        """Copy per-problem outcome fields from ``result`` onto ``span``.
+
+        ``ProblemResult`` is a dataclass; ``_safe_get`` also copes with
+        dict-shaped results. Fields that are ``None`` are skipped.
+        """
+        scalar_fields = (
+            ("speedup", "algo.problem.speedup", float),
+            ("solver_time_ms", "algo.problem.solver_time_ms", float),
+            ("is_valid", "algo.problem.is_valid", bool),
+        )
+        for field_name, attr_key, cast in scalar_fields:
+            raw = _safe_get(result, field_name)
+            if raw is None:
+                continue
+            try:
+                span.set_attribute(attr_key, cast(raw))
+            except (TypeError, ValueError):
+                pass
+
+        execution = _safe_get(result, "execution")
+        if execution is None:
+            return
+
+        timed_out = _safe_get(execution, "timeout_occurred")
+        if timed_out is not None:
+            try:
+                span.set_attribute(
+                    "algo.problem.timeout_occurred", bool(timed_out)
+                )
+            except (TypeError, ValueError):
+                pass
+
+        err_type = _safe_get(execution, "error_type")
+        if err_type is not None:
+            # Objects exposing ``.value`` (e.g. enum members) are reported
+            # by that value; anything else by its own string form.
+            span.set_attribute(
+                "algo.problem.error_type",
+                str(getattr(err_type, "value", err_type)),
+            )
+
+
+def _safe_get(obj: Any, name: str) -> Any:
+    """Look up ``name`` on ``obj`` by key for dicts and by attribute
+    otherwise; ``None`` when ``obj`` is ``None`` or the name is missing."""
+    if obj is None:
+        found = None
+    elif isinstance(obj, dict):
+        found = obj.get(name)
+    else:
+        found = getattr(obj, name, None)
+    return found
+
+
+# ---------------------------------------------------------------------------
+# TASK(baseline): BaselineManager.get_baseline_times()
+# ---------------------------------------------------------------------------
+
+
+class GetBaselineTimesWrapper:
+    """TASK span around ``BaselineManager.get_baseline_times``.
+
+    Special-cased to keep the span healthy across ``SystemExit(1)``
+    raised from inside the retry loop on fatal failure. Output publishing
+    is best-effort and can never replace a successful result with an
+    exception.
+    """
+
+    __slots__ = ("_tracer",)
+
+    def __init__(self, tracer: Tracer):
+        self._tracer = tracer
+
+    def __call__(
+        self,
+        wrapped: Callable[..., Any],
+        instance: Any,
+        args: tuple[Any, ...],
+        kwargs: dict[str, Any],
+    ) -> Any:
+        """Call ``wrapped`` inside a ``run_task benchmark.baseline_generation``
+        span.
+
+        Args:
+            wrapped: The original ``get_baseline_times`` callable.
+            instance: Object the callable is bound to; its ``_cache`` dict,
+                when present, is used to report a cache hit.
+            args: Positional arguments; the first one is read as the subset.
+            kwargs: Keyword arguments forwarded to ``wrapped``.
+
+        Returns:
+            Whatever ``wrapped`` returns.
+
+        Raises:
+            BaseException: Anything raised by ``wrapped`` (including
+                ``SystemExit``) is re-raised unchanged after the span has
+                been marked as an error.
+        """
+        subset = args[0] if args else kwargs.get("subset", "")
+        # A cache hit is reported when ``instance._cache`` already holds a
+        # non-None entry for this subset before the call.
+        cache_hit = False
+        try:
+            cache = getattr(instance, "_cache", None)
+            if isinstance(cache, dict) and cache.get(subset) is not None:
+                cache_hit = True
+        except Exception:  # noqa: BLE001
+            pass
+
+        with self._tracer.start_as_current_span(
+            "run_task benchmark.baseline_generation",
+            kind=SpanKind.INTERNAL,
+        ) as span:
+            span.set_attribute(GEN_AI_SPAN_KIND, "TASK")
+            span.set_attribute(GenAI.GEN_AI_OPERATION_NAME, "run_task")
+            span.set_attribute(GEN_AI_FRAMEWORK, ALGOTUNE_FRAMEWORK_VALUE)
+            span.set_attribute(
+                "gen_ai.task.name", "benchmark.baseline_generation"
+            )
+            if subset:
+                span.set_attribute("algo.baseline.subset", str(subset))
+            span.set_attribute("algo.baseline.cache_hit", cache_hit)
+            _set_task_input(
+                span,
+                {
+                    "task": "benchmark.baseline_generation",
+                    "subset": str(subset) if subset else "",
+                    "cache_hit": cache_hit,
+                },
+            )
+
+            try:
+                result = wrapped(*args, **kwargs)
+            except SystemExit as exc:
+                # Mirror the interpreter's handling of ``SystemExit.code``:
+                # ``None`` -> 0, int -> itself, any other object -> 1.
+                raw_code = exc.code
+                if raw_code is None:
+                    code = 0
+                elif isinstance(raw_code, int):
+                    code = int(raw_code)
+                else:
+                    code = 1
+                span.add_event("baseline.fatal_failure", {"exit_code": code})
+                span.set_status(
+                    Status(
+                        StatusCode.ERROR,
+                        "Baseline generation fatal failure",
+                    )
+                )
+                raise
+            except BaseException as exc:
+                span.record_exception(exc)
+                span.set_status(Status(StatusCode.ERROR))
+                raise
+
+            # The wrapped call succeeded: publishing must not keep the
+            # result from reaching the caller.
+            count = len(result) if isinstance(result, dict) else None
+            try:
+                if count is not None:
+                    span.set_attribute("algo.baseline.actual_count", count)
+                _set_task_output(span, {"count": count, "result": result})
+            except Exception:  # noqa: BLE001
+                pass
+            return result
+
+
+# ---------------------------------------------------------------------------
+# LLM retry counters (no spans). Cooperates with the LiteLLM instrumentor
+# which is responsible for actual LLM spans.
+# ---------------------------------------------------------------------------
+
+
+class LiteLLMQueryWrapper:
+    """Wrap ``LiteLLMModel.query`` to publish ``algo.llm.last_call_attempts``
+    onto the span that is current when the call starts. **Never creates a
+    span.**"""
+
+    __slots__ = ()
+
+    def __call__(
+        self,
+        wrapped: Callable[..., Any],
+        instance: Any,
+        args: tuple[Any, ...],
+        kwargs: dict[str, Any],
+    ) -> Any:
+        # The LLMInterface instance (carrying the STEP span) is accessible
+        # from the model only indirectly, so the span that is current at
+        # call time is used instead. No span-kind check is performed here.
+        # NOTE(review): presumably this is the STEP span attached by
+        # GetResponseWrapper; confirm against the call path.
+        step_span = trace_api.get_current_span()
+        # Reset attempt count on this LiteLLMModel instance for this call.
+        try:
+            setattr(instance, "_otel_algo_litellm_call_attempts", 0)
+        except Exception:  # noqa: BLE001
+            pass
+        try:
+            return wrapped(*args, **kwargs)
+        finally:
+            # Runs whether the call returned or raised; the counter is read
+            # back from the instance after the call.
+            try:
+                attempts = int(
+                    getattr(
+                        instance, "_otel_algo_litellm_call_attempts", 0
+                    )
+                    or 0
+                )
+                if (
+                    attempts
+                    and step_span is not None
+                    and step_span.is_recording()
+                ):
+                    # Surface raw per-call attempts as a separate attribute
+                    # (the wrapping STEP also aggregates across multiple
+                    # query() invocations via INST_LITELLM_ATTEMPTS_ATTR).
+                    step_span.set_attribute(
+                        "algo.llm.last_call_attempts", attempts
+                    )
+            except Exception:  # noqa: BLE001
+                pass
+
+
+class LiteLLMExecuteQueryWrapper:
+    """Wrap ``LiteLLMModel._execute_query`` to count attempts.
+
+    Each call corresponds to one ``litellm.completion()`` invocation.
+    Before delegating, two counters are bumped:
+
+    * a per-call counter on the ``LiteLLMModel`` instance, read back by
+      ``LiteLLMQueryWrapper``;
+    * a running total kept as a private attribute on the active span and
+      published there as ``algo.llm.retry_count`` (only while the span is
+      recording).
+    """
+
+    __slots__ = ()
+
+    def __call__(
+        self,
+        wrapped: Callable[..., Any],
+        instance: Any,
+        args: tuple[Any, ...],
+        kwargs: dict[str, Any],
+    ) -> Any:
+        call_attr = "_otel_algo_litellm_call_attempts"
+        span_attr = "_otel_algo_step_attempts"
+
+        # Per-call attempts, kept on the LiteLLMModel instance.
+        try:
+            previous = int(getattr(instance, call_attr, 0) or 0)
+            setattr(instance, call_attr, previous + 1)
+        except Exception:  # noqa: BLE001
+            pass
+
+        # Running total, kept on the active span object itself because the
+        # OTel ``Span`` API offers no way to read an attribute back.
+        span = trace_api.get_current_span()
+        if span is None or not span.is_recording():
+            # No recording span: there is nowhere to publish the total.
+            return wrapped(*args, **kwargs)
+
+        try:
+            total = getattr(span, span_attr, 0) + 1
+            try:
+                setattr(span, span_attr, total)
+            except Exception:  # noqa: BLE001
+                # The span rejects ad-hoc attributes, so the total cannot
+                # be tracked; publish nothing rather than a wrong number.
+                total = 0
+            if total:
+                span.set_attribute("algo.llm.retry_count", total)
+        except Exception:  # noqa: BLE001
+            pass
+
+        return wrapped(*args, **kwargs)
+
+
+# ---------------------------------------------------------------------------
+# LLM (optional bypass): TogetherModel.query()
+# ---------------------------------------------------------------------------
+
+
+class TogetherModelQueryWrapper:
+    """LLM span around ``TogetherModel.query``.
+
+    Together's HTTP client is invoked directly via ``requests.post`` and
+    therefore not covered by the LiteLLM instrumentor. This wrapper is
+    **opt-in** via ``ALGOTUNE_OTEL_INSTRUMENT_TOGETHER=true``.
+
+    The span carries the request model, the sampling defaults found on the
+    instance and, when the wrapped call returns a dict, its cost, token
+    usage and (if content capture is enabled) the output message. An
+    exception from the wrapped call is recorded on the span by the
+    ``start_as_current_span`` context manager and then propagates.
+    """
+
+    __slots__ = ("_tracer",)
+
+    def __init__(self, tracer: Tracer):
+        self._tracer = tracer
+
+    def __call__(
+        self,
+        wrapped: Callable[..., Any],
+        instance: Any,
+        args: tuple[Any, ...],
+        kwargs: dict[str, Any],
+    ) -> Any:
+        model_name = str(getattr(instance, "model_name", "") or "unknown")
+        span_name = f"chat {model_name}"
+        defaults = getattr(instance, "default_params", None) or {}
+
+        with self._tracer.start_as_current_span(
+            span_name, kind=SpanKind.CLIENT
+        ) as span:
+            span.set_attribute(GEN_AI_SPAN_KIND, "LLM")
+            span.set_attribute(
+                GenAI.GEN_AI_OPERATION_NAME,
+                GenAI.GenAiOperationNameValues.CHAT.value,
+            )
+            span.set_attribute(GEN_AI_FRAMEWORK, ALGOTUNE_FRAMEWORK_VALUE)
+            span.set_attribute(GenAI.GEN_AI_REQUEST_MODEL, model_name)
+            span.set_attribute(GenAI.GEN_AI_PROVIDER_NAME, "together_ai")
+
+            # Request parameters are best effort; a bad value is ignored.
+            try:
+                if isinstance(defaults, dict):
+                    if "temperature" in defaults and defaults["temperature"] is not None:
+                        span.set_attribute(
+                            GenAI.GEN_AI_REQUEST_TEMPERATURE,
+                            float(defaults["temperature"]),
+                        )
+                    if "top_p" in defaults and defaults["top_p"] is not None:
+                        span.set_attribute(
+                            GenAI.GEN_AI_REQUEST_TOP_P,
+                            float(defaults["top_p"]),
+                        )
+                    if (
+                        "max_tokens" in defaults
+                        and defaults["max_tokens"] is not None
+                    ):
+                        span.set_attribute(
+                            GenAI.GEN_AI_REQUEST_MAX_TOKENS,
+                            int(defaults["max_tokens"]),
+                        )
+            except Exception:  # noqa: BLE001
+                pass
+
+            # ``start_as_current_span`` records an escaping exception and
+            # sets the ERROR status itself; doing so here as well would put
+            # two exception events on the span.
+            result = wrapped(*args, **kwargs)
+            if isinstance(result, dict):
+                cost = result.get("cost")
+                if cost is not None:
+                    try:
+                        span.set_attribute(
+                            "algo.llm.response_cost_usd", float(cost)
+                        )
+                    except (TypeError, ValueError):
+                        pass
+                usage = result.get("usage")
+                if isinstance(usage, dict):
+                    input_tokens, output_tokens = _extract_together_usage(
+                        usage
+                    )
+                    if input_tokens:
+                        span.set_attribute(
+                            GenAI.GEN_AI_USAGE_INPUT_TOKENS, input_tokens
+                        )
+                    if output_tokens:
+                        span.set_attribute(
+                            GenAI.GEN_AI_USAGE_OUTPUT_TOKENS, output_tokens
+                        )
+                    total = (
+                        usage.get("total_tokens")
+                        if usage.get("total_tokens") is not None
+                        else (input_tokens + output_tokens or None)
+                    )
+                    if total:
+                        try:
+                            span.set_attribute(
+                                GEN_AI_USAGE_TOTAL_TOKENS, int(total)
+                            )
+                        except (TypeError, ValueError):
+                            pass
+                if OTEL_INSTRUMENTATION_GENAI_CAPTURE_MESSAGE_CONTENT:
+                    msg = result.get("message")
+                    if msg:
+                        span.set_attribute(
+                            GenAI.GEN_AI_OUTPUT_MESSAGES, truncate(msg)
+                        )
+            return result
+
+
+def _extract_together_usage(usage: dict) -> tuple[int, int]:
+    """Return ``(input_tokens, output_tokens)`` read from a usage mapping.
+
+    The OpenAI-compatible keys ``prompt_tokens`` / ``completion_tokens``
+    are preferred; ``input_tokens`` / ``output_tokens`` are consulted only
+    when the preferred key is absent or ``None``. A count that is missing
+    or cannot be converted with ``int()`` is reported as ``0``.
+    """
+    counts: list[int] = []
+    for primary, fallback in (
+        ("prompt_tokens", "input_tokens"),
+        ("completion_tokens", "output_tokens"),
+    ):
+        raw = usage.get(primary)
+        if raw is None:
+            raw = usage.get(fallback)
+        try:
+            counts.append(0 if raw is None else int(raw))
+        except (TypeError, ValueError):
+            counts.append(0)
+    prompt_count, completion_count = counts
+    return prompt_count, completion_count
diff --git a/instrumentation-loongsuite/loongsuite-instrumentation-algotune/src/opentelemetry/instrumentation/algotune/package.py b/instrumentation-loongsuite/loongsuite-instrumentation-algotune/src/opentelemetry/instrumentation/algotune/package.py
new file mode 100644
index 000000000..758567afc
--- /dev/null
+++ b/instrumentation-loongsuite/loongsuite-instrumentation-algotune/src/opentelemetry/instrumentation/algotune/package.py
@@ -0,0 +1,3 @@
+_instruments = ()  # empty: no target-package requirements are declared
+
+_supports_metrics = False  # NOTE(review): consumer not visible here; confirm
diff --git a/instrumentation-loongsuite/loongsuite-instrumentation-algotune/src/opentelemetry/instrumentation/algotune/version.py b/instrumentation-loongsuite/loongsuite-instrumentation-algotune/src/opentelemetry/instrumentation/algotune/version.py
new file mode 100644
index 000000000..3dc1f76bc
--- /dev/null
+++ b/instrumentation-loongsuite/loongsuite-instrumentation-algotune/src/opentelemetry/instrumentation/algotune/version.py
@@ -0,0 +1 @@
+__version__ = "0.1.0"  # hatch reads this file ([tool.hatch.version])
diff --git a/instrumentation-loongsuite/loongsuite-instrumentation-bfclv4/CHANGELOG.md b/instrumentation-loongsuite/loongsuite-instrumentation-bfclv4/CHANGELOG.md
new file mode 100644
index 000000000..62fb6539b
--- /dev/null
+++ b/instrumentation-loongsuite/loongsuite-instrumentation-bfclv4/CHANGELOG.md
@@ -0,0 +1,22 @@
+# Changelog
+
+All notable changes to the LoongSuite BFCL v4 instrumentation are documented
+in this file.
+
+## Unreleased
+
+### Added
+
+- Initial release of `loongsuite-instrumentation-bfclv4`.
+- ENTRY span around `bfcl_eval._llm_response_generation.generate_results`.
+- AGENT span around `bfcl_eval.model_handler.base_handler.BaseHandler.inference`
+  with cross-thread OTel context propagation via a narrow patch of
+  `bfcl_eval._llm_response_generation.ThreadPoolExecutor`.
+- STEP spans created by reflectively wrapping each handler's
+  `_query_FC` / `_query_prompting` (discovered via
+  `bfcl_eval.constants.model_config.MODEL_CONFIG_MAPPING`).
+- Per-call TOOL spans emitted by wrapping
+  `bfcl_eval.eval_checker.multi_turn_eval.multi_turn_utils.execute_multi_turn_func_call`.
+- Provider override mapping for OSS handlers (vLLM / SGLang).
+- Multi-turn `bfcl.turn_idx` and ReAct `gen_ai.react.round` tracking via
+  `contextvars`.
diff --git a/instrumentation-loongsuite/loongsuite-instrumentation-bfclv4/README.md b/instrumentation-loongsuite/loongsuite-instrumentation-bfclv4/README.md
new file mode 100644
index 000000000..7a4e5d69d
--- /dev/null
+++ b/instrumentation-loongsuite/loongsuite-instrumentation-bfclv4/README.md
@@ -0,0 +1,79 @@
+# LoongSuite BFCL v4 Instrumentation
+
+LoongSuite Python instrumentation for the [Berkeley Function Call
+Leaderboard v4](https://github.com/ShishirPatil/gorilla/tree/main/berkeley-function-call-leaderboard)
+(`bfcl-eval`, package `bfcl_eval`).
+
+## Span Topology
+
+```
+ENTRY  enter_ai_application_system          gen_ai.span.kind=ENTRY,  op=enter
+└─ AGENT  invoke_agent {test_entry_id}      gen_ai.span.kind=AGENT,  op=invoke_agent
+   ├─ STEP  react step                      gen_ai.span.kind=STEP,   op=react
+   │   ├─ LLM   chat {model}                (created by downstream vendor SDK probe)
+   │   └─ TOOL  execute_tool {fn}           gen_ai.span.kind=TOOL,   op=execute_tool
+   └─ STEP  react step
+       └─ ...
+```
+
+This instrumentation deliberately does **not** create LLM spans. They are
+emitted by the downstream vendor SDK probe (OpenAI / Anthropic / Google /
+DashScope / LiteLLM / etc.) so that token usage and request payloads stay in
+sync with the SDK that actually performed the request.
+
+## Installation
+
+```bash
+pip install loongsuite-instrumentation-bfclv4
+```
+
+## Usage
+
+```bash
+opentelemetry-instrument bfcl generate \
+    --model gpt-4o-2024-11-20-FC \
+    --test-category simple_python \
+    --num-threads 2
+```
+
+Or programmatically:
+
+```python
+from opentelemetry.instrumentation.bfclv4 import BFCLv4Instrumentor
+
+BFCLv4Instrumentor().instrument()
+# ... run BFCL ...
+BFCLv4Instrumentor().uninstrument()
+```
+
+## Compatibility With Downstream LLM SDK Probes
+
+| Scenario | Recommended downstream probe |
+| --- | --- |
+| OpenAI / OpenAI Responses / OSS via vLLM / SGLang / DeepSeek (OpenAI-compatible) | `opentelemetry-instrumentation-openai` |
+| Anthropic / Claude | `loongsuite-instrumentation-claude-agent-sdk` |
+| Gemini / Google | `loongsuite-instrumentation-google-adk` |
+| Qwen / DashScope | `loongsuite-instrumentation-dashscope` |
+| LiteLLM | `loongsuite-instrumentation-litellm` |
+
+## OSS Provider Notes
+
+For OSS handlers (vLLM / SGLang served via the OpenAI-compatible API), the
+BFCL probe sets `gen_ai.provider.name` to `vllm` / `sglang` / `oss` and adds
+`bfcl.oss.backend` for disambiguation. Downstream OpenAI probes will still
+report `gen_ai.provider.name=openai` on the LLM span; this is expected.
+
+## Custom Attributes
+
+| Attribute | Where | Description |
+| --- | --- | --- |
+| `gen_ai.framework` = `bfclv4` | ENTRY/AGENT/STEP/TOOL | Framework tag |
+| `bfcl.test_category` | ENTRY/AGENT | Test category |
+| `bfcl.num_threads` | ENTRY | Configured thread pool size |
+| `bfcl.test_case_count` | ENTRY | Number of test cases |
+| `bfcl.run_ids` | ENTRY | Whether the run targeted specific IDs |
+| `bfcl.test_entry_id` | AGENT | Test entry id |
+| `bfcl.turn_idx` | STEP | Multi-turn turn index (0-based) |
+| `bfcl.query_mode` | STEP | `FC` or `prompting` |
+| `bfcl.oss.backend` | AGENT/STEP | `vllm` / `sglang` / `unknown` (only OSS) |
+| `bfcl.tool.duration_is_estimated` | TOOL | True (latency is averaged across batch) |
diff --git a/instrumentation-loongsuite/loongsuite-instrumentation-bfclv4/pyproject.toml b/instrumentation-loongsuite/loongsuite-instrumentation-bfclv4/pyproject.toml
new file mode 100644
index 000000000..3eeb5d026
--- /dev/null
+++ b/instrumentation-loongsuite/loongsuite-instrumentation-bfclv4/pyproject.toml
@@ -0,0 +1,54 @@
+[build-system]
+requires = ["hatchling"]
+build-backend = "hatchling.build"
+
+[project]
+name = "loongsuite-instrumentation-bfclv4"
+dynamic = ["version"]
+description = "LoongSuite BFCL v4 (Berkeley Function Call Leaderboard) instrumentation"
+readme = "README.md"
+license = "Apache-2.0"
+requires-python = ">=3.10,<4"
+authors = [
+  { name = "OpenTelemetry Authors", email = "cncf-opentelemetry-contributors@lists.cncf.io" },
+]
+classifiers = [
+  "Development Status :: 4 - Beta",
+  "Intended Audience :: Developers",
+  "License :: OSI Approved :: Apache Software License",
+  "Programming Language :: Python",
+  "Programming Language :: Python :: 3.10",
+  "Programming Language :: Python :: 3.11",
+  "Programming Language :: Python :: 3.12",
+]
+dependencies = [
+  "opentelemetry-api >= 1.37.0",
+  "opentelemetry-instrumentation >= 0.58b0",
+  "opentelemetry-semantic-conventions >= 0.58b0",
+  "wrapt >= 1.0.0, < 2.0.0",
+  "opentelemetry-util-genai >= 0.3b0.dev0",
+]
+
+[project.optional-dependencies]
+instruments = [
+  "bfcl-eval >= 4.0.0",
+]
+
+[project.entry-points.opentelemetry_instrumentor]
+bfclv4 = "opentelemetry.instrumentation.bfclv4:BFCLv4Instrumentor"
+
+[project.urls]
+Homepage = "https://github.com/alibaba/loongsuite-python-agent/tree/main/instrumentation-loongsuite/loongsuite-instrumentation-bfclv4"
+Repository = "https://github.com/alibaba/loongsuite-python-agent"
+
+[tool.hatch.version]
+path = "src/opentelemetry/instrumentation/bfclv4/version.py"
+
+[tool.hatch.build.targets.sdist]
+include = [
+  "/src",
+  "/tests",
+]
+
+[tool.hatch.build.targets.wheel]
+packages = ["src/opentelemetry"]
diff --git a/instrumentation-loongsuite/loongsuite-instrumentation-bfclv4/src/opentelemetry/instrumentation/bfclv4/__init__.py b/instrumentation-loongsuite/loongsuite-instrumentation-bfclv4/src/opentelemetry/instrumentation/bfclv4/__init__.py
new file mode 100644
index 000000000..6a7729940
--- /dev/null
+++ b/instrumentation-loongsuite/loongsuite-instrumentation-bfclv4/src/opentelemetry/instrumentation/bfclv4/__init__.py
@@ -0,0 +1,322 @@
+# Copyright The OpenTelemetry Authors
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""LoongSuite BFCL v4 (Berkeley Function Call Leaderboard) instrumentation.
+
+Usage
+-----
+
+.. code:: python
+
+    from opentelemetry.instrumentation.bfclv4 import BFCLv4Instrumentor
+
+    BFCLv4Instrumentor().instrument()
+    # ... run BFCL ...
+    BFCLv4Instrumentor().uninstrument()
+
+API
+---
+"""
+
+from __future__ import annotations
+
+import importlib
+import logging
+from typing import Any, Collection, List, Tuple
+
+from wrapt import wrap_function_wrapper
+
+from opentelemetry.instrumentation.bfclv4.internal.wrappers import (
+    BaseHandlerInferenceWrapper,
+    ExecuteFuncCallWrapper,
+    GenerateResultsWrapper,
+    QueryWrapper,
+    TurnBumpWrapper,
+)
+from opentelemetry.instrumentation.bfclv4.package import _instruments
+from opentelemetry.instrumentation.bfclv4.utils import GenAIHookHelper
+from opentelemetry.instrumentation.instrumentor import BaseInstrumentor
+from opentelemetry.instrumentation.utils import unwrap
+
+logger = logging.getLogger(__name__)
+
+__all__ = ["BFCLv4Instrumentor"]
+
+# Wrap targets as (module, attribute) pairs: ENTRY, then AGENT, then TOOL.
+_GENERATE_RESULTS_MODULE = "bfcl_eval._llm_response_generation"
+_GENERATE_RESULTS_NAME = "generate_results"
+
+_BASE_HANDLER_MODULE = "bfcl_eval.model_handler.base_handler"
+_BASE_HANDLER_NAME = "BaseHandler.inference"
+
+_EXECUTE_TOOL_MODULE = (
+    "bfcl_eval.eval_checker.multi_turn_eval.multi_turn_utils"
+)
+_EXECUTE_TOOL_NAME = "execute_multi_turn_func_call"
+
+
+# ``MODEL_CONFIG_MAPPING`` already imports every concrete handler at module
+# load time, so iterating over its values gives us the canonical handler
+# class set without risking new vendor SDK imports.
+def _iter_handler_classes() -> List[type]:
+    """Return the distinct handler classes listed in ``MODEL_CONFIG_MAPPING``.
+
+    Classes come back in first-seen order, de-duplicated by identity;
+    entries whose ``model_handler`` is not a class are ignored. An empty
+    list is returned when the mapping cannot be imported.
+    """
+    try:
+        from bfcl_eval.constants.model_config import (  # noqa: PLC0415
+            MODEL_CONFIG_MAPPING,
+        )
+    except Exception as exc:  # noqa: BLE001
+        logger.debug("bfclv4: cannot import MODEL_CONFIG_MAPPING: %s", exc)
+        return []
+
+    unique: dict[int, type] = {}
+    for config in MODEL_CONFIG_MAPPING.values():
+        handler_cls = getattr(config, "model_handler", None)
+        # ``isinstance(None, type)`` is false, so this also skips ``None``.
+        if isinstance(handler_cls, type):
+            unique.setdefault(id(handler_cls), handler_cls)
+    return list(unique.values())
+
+
+class BFCLv4Instrumentor(BaseInstrumentor):
+    """An instrumentor for the BFCL v4 (``bfcl_eval``) framework."""
+
+    def __init__(self) -> None:
+        super().__init__()
+        # The ``hasattr`` guards keep bookkeeping from an earlier
+        # construction intact if ``__init__`` runs again on this object.
+        if not hasattr(self, "_wrapped_query_methods"):
+            self._wrapped_query_methods: List[Tuple[type, str]] = []
+        if not hasattr(self, "_wrapped_turn_methods"):
+            self._wrapped_turn_methods: List[Tuple[type, str]] = []
+        if not hasattr(self, "_entry_wrapped"):
+            self._entry_wrapped = False
+        if not hasattr(self, "_inference_wrapped"):
+            self._inference_wrapped = False
+        if not hasattr(self, "_tool_wrapped"):
+            self._tool_wrapped = False
+        if not hasattr(self, "_tool_targets"):
+            self._tool_targets: List[Tuple[str, str]] = []
+
+    def instrumentation_dependencies(self) -> Collection[str]:
+        """Return ``_instruments`` as imported from the ``package`` module."""
+        return _instruments
+
+    def _instrument(self, **kwargs: Any) -> None:  # noqa: D401
+        """Install the ENTRY, AGENT, STEP/turn and TOOL wrappers.
+
+        Each wrap is attempted independently; a failure is only logged."""
+        helper = GenAIHookHelper()
+
+        # 1) ENTRY -----------------------------------------------------
+        try:
+            wrap_function_wrapper(
+                _GENERATE_RESULTS_MODULE,
+                _GENERATE_RESULTS_NAME,
+                GenerateResultsWrapper(helper),
+            )
+            self._entry_wrapped = True
+        except Exception as exc:  # noqa: BLE001
+            logger.warning(
+                "bfclv4: failed to wrap %s.%s: %s",
+                _GENERATE_RESULTS_MODULE,
+                _GENERATE_RESULTS_NAME,
+                exc,
+            )
+
+        # 2) AGENT -----------------------------------------------------
+        try:
+            wrap_function_wrapper(
+                _BASE_HANDLER_MODULE,
+                _BASE_HANDLER_NAME,
+                BaseHandlerInferenceWrapper(helper),
+            )
+            self._inference_wrapped = True
+        except Exception as exc:  # noqa: BLE001
+            logger.warning(
+                "bfclv4: failed to wrap %s.%s: %s",
+                _BASE_HANDLER_MODULE,
+                _BASE_HANDLER_NAME,
+                exc,
+            )
+
+        # 3) STEP + 4) turn maintenance --------------------------------
+        self._instrument_handlers(helper)
+
+        # 5) TOOL ------------------------------------------------------
+        # ``execute_multi_turn_func_call`` is re-exported via ``from ... import``
+        # in several BFCL modules, so wrapping just the source module misses
+        # the call sites that use the local binding. We wrap each known
+        # re-export site as well to guarantee the TOOL span is always emitted.
+        tool_targets = [
+            (_EXECUTE_TOOL_MODULE, _EXECUTE_TOOL_NAME),
+            (
+                "bfcl_eval.model_handler.base_handler",
+                _EXECUTE_TOOL_NAME,
+            ),
+            (
+                "bfcl_eval.eval_checker.multi_turn_eval.multi_turn_checker",
+                _EXECUTE_TOOL_NAME,
+            ),
+        ]
+        wrapper_instance = ExecuteFuncCallWrapper(helper)
+        self._tool_targets = []
+        installed: List[Any] = []
+        for module_name, attr_name in tool_targets:
+            try:
+                module = importlib.import_module(module_name)
+                current = getattr(module, attr_name)
+                # A module first imported after the source was wrapped
+                # re-exports our wrapper object; wrapping that again would
+                # emit two TOOL spans per call, so it is only recorded.
+                if not any(current is seen for seen in installed):
+                    wrap_function_wrapper(
+                        module_name, attr_name, wrapper_instance
+                    )
+                    installed.append(getattr(module, attr_name))
+                self._tool_targets.append((module_name, attr_name))
+            except Exception as exc:  # noqa: BLE001
+                logger.debug(
+                    "bfclv4: failed to wrap %s.%s: %s",
+                    module_name,
+                    attr_name,
+                    exc,
+                )
+        self._tool_wrapped = bool(self._tool_targets)
+
+    def _instrument_handlers(self, helper: GenAIHookHelper) -> None:
+        """Wrap each handler's own ``_query_FC`` / ``_query_prompting`` and
+        turn-maintenance helpers. Only methods found in a class's own
+        ``__dict__`` are considered, and a given function object is wrapped
+        at most once (de-duplicated by ``id``)."""
+        seen_func_ids: set[int] = set()
+
+        query_pairs = (
+            ("_query_FC", "FC"),
+            ("_query_prompting", "prompting"),
+        )
+        turn_pairs = (
+            ("add_first_turn_message_FC", True),
+            ("add_first_turn_message_prompting", True),
+            ("_add_next_turn_user_message_FC", False),
+            ("_add_next_turn_user_message_prompting", False),
+        )
+
+        for cls in _iter_handler_classes():
+            class_dict = getattr(cls, "__dict__", {})
+            for method_name, mode in query_pairs:
+                method = class_dict.get(method_name)
+                if method is None or not callable(method):
+                    continue
+                key = id(method)
+                if key in seen_func_ids:
+                    continue
+                seen_func_ids.add(key)
+                try:
+                    wrap_function_wrapper(
+                        cls.__module__,
+                        f"{cls.__name__}.{method_name}",
+                        QueryWrapper(helper, mode),
+                    )
+                    self._wrapped_query_methods.append((cls, method_name))
+                except Exception as exc:  # noqa: BLE001
+                    logger.debug(
+                        "bfclv4: failed to wrap %s.%s.%s: %s",
+                        cls.__module__,
+                        cls.__name__,
+                        method_name,
+                        exc,
+                    )
+
+            for method_name, is_first in turn_pairs:
+                method = class_dict.get(method_name)
+                if method is None or not callable(method):
+                    continue
+                key = id(method)
+                if key in seen_func_ids:
+                    continue
+                seen_func_ids.add(key)
+                try:
+                    wrap_function_wrapper(
+                        cls.__module__,
+                        f"{cls.__name__}.{method_name}",
+                        TurnBumpWrapper(reset=is_first),
+                    )
+                    self._wrapped_turn_methods.append((cls, method_name))
+                except Exception as exc:  # noqa: BLE001
+                    logger.debug(
+                        "bfclv4: failed to wrap %s.%s.%s: %s",
+                        cls.__module__,
+                        cls.__name__,
+                        method_name,
+                        exc,
+                    )
+
+    def _uninstrument(self, **kwargs: Any) -> None:  # noqa: D401
+        """Undo every wrap recorded by ``_instrument`` and reset the
+        bookkeeping; unwrap failures are only logged at debug level."""
+        if self._tool_wrapped:
+            for module_name, attr_name in getattr(self, "_tool_targets", []):
+                try:
+                    module = importlib.import_module(module_name)
+                    unwrap(module, attr_name)
+                except Exception as exc:  # noqa: BLE001
+                    logger.debug(
+                        "bfclv4: failed to unwrap %s.%s: %s",
+                        module_name,
+                        attr_name,
+                        exc,
+                    )
+            self._tool_targets = []
+            self._tool_wrapped = False
+
+        for cls, method_name in (
+            *self._wrapped_query_methods,
+            *self._wrapped_turn_methods,
+        ):
+            try:
+                unwrap(cls, method_name)
+            except Exception as exc:  # noqa: BLE001
+                logger.debug(
+                    "bfclv4: failed to unwrap %s.%s: %s",
+                    cls.__name__,
+                    method_name,
+                    exc,
+                )
+        self._wrapped_query_methods = []
+        self._wrapped_turn_methods = []
+
+        if self._inference_wrapped:
+            try:
+                base_module = importlib.import_module(_BASE_HANDLER_MODULE)
+                unwrap(base_module.BaseHandler, "inference")
+            except Exception as exc:  # noqa: BLE001
+                logger.debug(
+                    "bfclv4: failed to unwrap BaseHandler.inference: %s", exc
+                )
+            self._inference_wrapped = False
+
+        if self._entry_wrapped:
+            try:
+                module = importlib.import_module(_GENERATE_RESULTS_MODULE)
+                unwrap(module, _GENERATE_RESULTS_NAME)
+            except Exception as exc:  # noqa: BLE001
+                logger.debug(
+                    "bfclv4: failed to unwrap generate_results: %s", exc
+                )
+            self._entry_wrapped = False
diff --git a/instrumentation-loongsuite/loongsuite-instrumentation-bfclv4/src/opentelemetry/instrumentation/bfclv4/internal/__init__.py b/instrumentation-loongsuite/loongsuite-instrumentation-bfclv4/src/opentelemetry/instrumentation/bfclv4/internal/__init__.py
new file mode 100644
index 000000000..b0a6f4284
--- /dev/null
+++ b/instrumentation-loongsuite/loongsuite-instrumentation-bfclv4/src/opentelemetry/instrumentation/bfclv4/internal/__init__.py
@@ -0,0 +1,13 @@
+# Copyright The OpenTelemetry Authors
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
diff --git a/instrumentation-loongsuite/loongsuite-instrumentation-bfclv4/src/opentelemetry/instrumentation/bfclv4/internal/attributes.py b/instrumentation-loongsuite/loongsuite-instrumentation-bfclv4/src/opentelemetry/instrumentation/bfclv4/internal/attributes.py
new file mode 100644
index 000000000..774200aba
--- /dev/null
+++ b/instrumentation-loongsuite/loongsuite-instrumentation-bfclv4/src/opentelemetry/instrumentation/bfclv4/internal/attributes.py
@@ -0,0 +1,38 @@
+# Copyright The OpenTelemetry Authors
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Constant attribute keys used by the BFCL v4 instrumentation."""
+
+from __future__ import annotations
+
+from typing import Final
+
+FRAMEWORK_NAME: Final = "bfclv4"
+
+# gen_ai.* attribute keys declared locally instead of being imported from
+# opentelemetry-semantic-conventions (NOTE(review): confirm still missing).
+GEN_AI_FRAMEWORK: Final = "gen_ai.framework"
+GEN_AI_PROVIDER_NAME: Final = "gen_ai.provider.name"
+
+# BFCL-specific (vendor) attribute keys.
+BFCL_TEST_CATEGORY: Final = "bfcl.test_category"
+BFCL_NUM_THREADS: Final = "bfcl.num_threads"
+BFCL_TEST_CASE_COUNT: Final = "bfcl.test_case_count"
+BFCL_RUN_IDS: Final = "bfcl.run_ids"
+BFCL_TEST_ENTRY_ID: Final = "bfcl.test_entry_id"
+BFCL_TURN_IDX: Final = "bfcl.turn_idx"
+BFCL_QUERY_MODE: Final = "bfcl.query_mode"
+BFCL_OSS_BACKEND: Final = "bfcl.oss.backend"
+BFCL_TOOL_DURATION_IS_ESTIMATED: Final = "bfcl.tool.duration_is_estimated"
+BFCL_TOOL_INDEX: Final = "bfcl.tool.index"
diff --git a/instrumentation-loongsuite/loongsuite-instrumentation-bfclv4/src/opentelemetry/instrumentation/bfclv4/internal/provider.py b/instrumentation-loongsuite/loongsuite-instrumentation-bfclv4/src/opentelemetry/instrumentation/bfclv4/internal/provider.py
new file mode 100644
index 000000000..efa2c77dc
--- /dev/null
+++ b/instrumentation-loongsuite/loongsuite-instrumentation-bfclv4/src/opentelemetry/instrumentation/bfclv4/internal/provider.py
@@ -0,0 +1,71 @@
+# Copyright The OpenTelemetry Authors
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Map BFCL ``ModelStyle`` enum values to ``gen_ai.provider.name``."""
+
+from __future__ import annotations
+
+import os
+from typing import Any, Dict, Tuple
+
+from opentelemetry.instrumentation.bfclv4.internal.attributes import (
+    BFCL_OSS_BACKEND,
+)
+
+# The BFCL backend name (vllm / sglang / ...) is communicated from the ENTRY
+# wrapper to the per-thread STEP/AGENT wrappers via this env var.  The ENTRY
+# wrapper writes it before invoking the wrapped function and, in its
+# ``finally`` clause, restores the previous value (or removes the variable).
+OSS_BACKEND_ENV = "BFCL_BACKEND"
+
+
+def infer_provider(handler: Any) -> Tuple[str, Dict[str, Any]]:
+    """Derive ``(provider_name, extra_attributes)`` from a BFCL handler.
+
+    The provider is ``"unknown"`` when BFCL cannot be imported, when the
+    handler has no ``model_style``, or when the style is not in the table.
+    OSS models take their provider from the ``BFCL_BACKEND`` env var
+    (``vllm``/``sglang``, otherwise ``"oss"``) and add ``bfcl.oss.backend``.
+    """
+    try:
+        from bfcl_eval.constants.enums import (  # noqa: PLC0415
+            ModelStyle,
+        )
+    except ImportError:
+        return "unknown", {}
+
+    style = getattr(handler, "model_style", None)
+    if style is None:
+        return "unknown", {}
+    if style is not ModelStyle.OSSMODEL:
+        provider_by_style = {
+            ModelStyle.OPENAI_COMPLETIONS: "openai",
+            ModelStyle.OPENAI_RESPONSES: "openai",
+            ModelStyle.ANTHROPIC: "anthropic",
+            ModelStyle.GOOGLE: "gcp.gemini",
+            ModelStyle.MISTRAL: "mistral_ai",
+            ModelStyle.COHERE: "cohere",
+            ModelStyle.AMAZON: "aws.bedrock",
+            ModelStyle.FIREWORK_AI: "fireworks_ai",
+            ModelStyle.WRITER: "writer",
+            ModelStyle.NOVITA_AI: "novita",
+            ModelStyle.NEXUS: "nexusflow",
+            ModelStyle.GORILLA: "gorilla",
+        }
+        return provider_by_style.get(style, "unknown"), {}
+
+    backend = (os.getenv(OSS_BACKEND_ENV) or "").lower()
+    recognised = backend in ("vllm", "sglang")
+    provider = backend if recognised else "oss"
+    return provider, {BFCL_OSS_BACKEND: backend if recognised else "unknown"}
diff --git a/instrumentation-loongsuite/loongsuite-instrumentation-bfclv4/src/opentelemetry/instrumentation/bfclv4/internal/state.py b/instrumentation-loongsuite/loongsuite-instrumentation-bfclv4/src/opentelemetry/instrumentation/bfclv4/internal/state.py
new file mode 100644
index 000000000..ae4861035
--- /dev/null
+++ b/instrumentation-loongsuite/loongsuite-instrumentation-bfclv4/src/opentelemetry/instrumentation/bfclv4/internal/state.py
@@ -0,0 +1,93 @@
+# Copyright The OpenTelemetry Authors
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Per-thread ReAct state for the BFCL v4 instrumentation.
+
+We use ``contextvars.ContextVar`` so that each worker thread spawned by the
+BFCL ``ThreadPoolExecutor`` gets its own copy.  ``_ContextPropagatingExecutor``
+in :mod:`threading_propagation` makes sure ENTRY-time context is copied into
+the worker thread; the BaseHandler.inference wrapper then initializes a fresh
+state on top of that copy.
+"""
+
+from __future__ import annotations
+
+import contextvars
+from typing import Any, Dict, Optional
+
+_REACT_STATE: contextvars.ContextVar[Optional[Dict[str, Any]]] = (
+    contextvars.ContextVar("bfclv4_react_state", default=None)
+)  # holds ``None`` until ``init_state()`` stores a dict in the current context
+
+
+def init_state() -> contextvars.Token:
+    """Install a fresh per-AGENT counter dict and return its reset token.
+
+    All counters start at ``0``:
+
+    * ``turn_idx`` - incremented by the wrapper around
+      ``_add_next_turn_user_message_*``; stays ``0`` for single-turn tests.
+    * ``fc_round`` - the ReAct round counter, bumped on every STEP entry so
+      the first STEP within a turn ends up with ``round=1``.
+    * ``tool_index`` - count of executed tool calls within the current AGENT,
+      useful for the TOOL span ``tool_call_id`` synthesis.
+    """
+    counters = ("turn_idx", "fc_round", "tool_index")
+    fresh: Dict[str, Any] = dict.fromkeys(counters, 0)
+    return _REACT_STATE.set(fresh)
+
+
+def reset_state(token: contextvars.Token) -> None:
+    """Restore the state saved in ``token``; a stale or foreign token is ignored."""
+    try:
+        _REACT_STATE.reset(token)
+    except (LookupError, RuntimeError, ValueError):
+        pass  # RuntimeError: token already used; ValueError: other var/Context.
+
+
+def get_state() -> Optional[Dict[str, Any]]:
+    return _REACT_STATE.get()  # ``None`` when no state is set in this context
+
+
+def bump_round() -> int:
+    """Increment and return ``fc_round``; report ``1`` when no state is active."""
+    current = _REACT_STATE.get()
+    if current is not None:
+        current["fc_round"] = current.get("fc_round", 0) + 1
+    return 1 if current is None else current["fc_round"]
+
+
+def reset_round_for_turn() -> None:
+    """Zero ``fc_round`` in the active state; do nothing when there is none."""
+    active = _REACT_STATE.get()
+    if active is not None:
+        active["fc_round"] = 0
+
+
+def bump_turn() -> int:
+    """Advance ``turn_idx``, zero ``fc_round``, return the new turn (``0`` if no state)."""
+    active = _REACT_STATE.get()
+    if active is None:
+        return 0
+    active.update(turn_idx=active.get("turn_idx", 0) + 1, fc_round=0)
+    return active["turn_idx"]
+
+
+def next_tool_index() -> int:
+    """Return the current ``tool_index`` and then increment it (``0`` if no state)."""
+    active = _REACT_STATE.get()
+    if active is None:
+        return 0
+    active["tool_index"] = active.get("tool_index", 0) + 1
+    return active["tool_index"] - 1
diff --git a/instrumentation-loongsuite/loongsuite-instrumentation-bfclv4/src/opentelemetry/instrumentation/bfclv4/internal/threading_propagation.py b/instrumentation-loongsuite/loongsuite-instrumentation-bfclv4/src/opentelemetry/instrumentation/bfclv4/internal/threading_propagation.py
new file mode 100644
index 000000000..d19c05799
--- /dev/null
+++ b/instrumentation-loongsuite/loongsuite-instrumentation-bfclv4/src/opentelemetry/instrumentation/bfclv4/internal/threading_propagation.py
@@ -0,0 +1,43 @@
+# Copyright The OpenTelemetry Authors
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Context-propagating ``ThreadPoolExecutor`` used by the ENTRY wrapper.
+
+``concurrent.futures.ThreadPoolExecutor`` does not automatically copy the
+current ``contextvars`` context (which holds the OTel current span) into
+worker threads.  We subclass it and copy ``contextvars.copy_context()`` per
+``submit`` so the AGENT span created inside the worker thread can attach as
+a child of the ENTRY span.
+
+We only swap the ``ThreadPoolExecutor`` *name* in the
+``bfcl_eval._llm_response_generation`` namespace; the global
+``concurrent.futures.ThreadPoolExecutor`` is untouched.
+"""
+
+from __future__ import annotations
+
+import contextvars
+from concurrent.futures import ThreadPoolExecutor as _RealExecutor
+
+
+class ContextPropagatingExecutor(_RealExecutor):
+    """``ThreadPoolExecutor`` whose tasks run inside the submitter's ``Context``.
+
+    ``submit`` is the only override because BFCL only calls ``submit`` (see
+    ``_llm_response_generation.generate_results``).
+    """
+
+    def submit(self, fn, /, *args, **kwargs):  # type: ignore[override]
+        # Snapshot at submit time; the worker runs ``fn`` inside that copy.
+        return super().submit(contextvars.copy_context().run, fn, *args, **kwargs)
diff --git a/instrumentation-loongsuite/loongsuite-instrumentation-bfclv4/src/opentelemetry/instrumentation/bfclv4/internal/wrappers.py b/instrumentation-loongsuite/loongsuite-instrumentation-bfclv4/src/opentelemetry/instrumentation/bfclv4/internal/wrappers.py
new file mode 100644
index 000000000..42f582c69
--- /dev/null
+++ b/instrumentation-loongsuite/loongsuite-instrumentation-bfclv4/src/opentelemetry/instrumentation/bfclv4/internal/wrappers.py
@@ -0,0 +1,1217 @@
+# Copyright The OpenTelemetry Authors
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Wrapper classes for the BFCL v4 instrumentation.
+
+Each wrapper follows the standard ``wrapt`` callable contract::
+
+    def __call__(self, wrapped, instance, args, kwargs):
+        ...
+
+All wrappers rely on :func:`get_extended_telemetry_handler` (LoongSuite
+``util-genai``) to create the actual spans, so that ENTRY / AGENT / STEP /
+TOOL spans get the canonical ``gen_ai.span.kind`` and operation-name values
+that the LoongSuite semantic-validator expects.
+"""
+
+from __future__ import annotations
+
+import ast
+import importlib
+import inspect
+import logging
+import os
+import sys
+import time
+from contextvars import ContextVar
+from typing import Any, Callable, Iterable, List, Optional
+
+from opentelemetry.instrumentation.bfclv4.internal.attributes import (
+    BFCL_NUM_THREADS,
+    BFCL_OSS_BACKEND,
+    BFCL_QUERY_MODE,
+    BFCL_RUN_IDS,
+    BFCL_TEST_CASE_COUNT,
+    BFCL_TEST_CATEGORY,
+    BFCL_TEST_ENTRY_ID,
+    BFCL_TOOL_DURATION_IS_ESTIMATED,
+    BFCL_TOOL_INDEX,
+    BFCL_TURN_IDX,
+    FRAMEWORK_NAME,
+    GEN_AI_FRAMEWORK,
+    GEN_AI_PROVIDER_NAME,
+)
+from opentelemetry.instrumentation.bfclv4.internal.provider import (
+    OSS_BACKEND_ENV,
+    infer_provider,
+)
+from opentelemetry.instrumentation.bfclv4.internal.state import (
+    bump_round,
+    bump_turn,
+    init_state,
+    next_tool_index,
+    reset_state,
+)
+from opentelemetry.instrumentation.bfclv4.internal.threading_propagation import (
+    ContextPropagatingExecutor,
+)
+from opentelemetry.instrumentation.bfclv4.utils import (
+    GenAIHookHelper,
+    to_text_input,
+    to_text_output,
+    truncate_text,
+)
+from opentelemetry.util.genai.extended_handler import (
+    get_extended_telemetry_handler,
+)
+from opentelemetry.util.genai.extended_types import (
+    EntryInvocation,
+    ExecuteToolInvocation,
+    InvokeAgentInvocation,
+    ReactStepInvocation,
+)
+from opentelemetry.util.genai.types import (
+    FunctionToolDefinition,
+    GenericToolDefinition,
+    Text,
+)
+
+logger = logging.getLogger(__name__)
+
+
+# ---------------------------------------------------------------------------
+# Helpers
+
+
+def _safe_get(obj: Any, key: str, default: Any = None) -> Any:
+    """Read ``key`` from a dict via ``.get`` or from any other object via ``getattr``."""
+    is_mapping = isinstance(obj, dict)
+    return obj.get(key, default) if is_mapping else getattr(obj, key, default)
+
+
+def _flatten_tokens(value: Any) -> Optional[int]:
+    """Sum a possibly nested ``int|float|list|list[list]`` BFCL token field.
+
+    Returns ``None`` when no number is found.  Text and byte strings count
+    as "no number": iterating a ``str`` never bottoms out (each character is
+    itself a ``str``) and iterating ``bytes`` would add up raw byte values.
+    """
+    if isinstance(value, (int, float)):
+        return int(value)
+    if value is None or isinstance(value, (str, bytes, bytearray)):
+        return None
+    if not isinstance(value, Iterable):
+        return None
+
+    # Recurse into every item and keep only the branches that held a number.
+    found = [n for n in map(_flatten_tokens, value) if n is not None]
+    return sum(found) if found else None
+
+
+def _test_category_from_id(test_entry_id: Optional[str]) -> Optional[str]:
+    """Return the id up to its last ``_``; ``None`` for an empty id or one without ``_``."""
+    category, sep, _suffix = (test_entry_id or "").rpartition("_")
+    return category if sep else None
+
+
+def _join_test_category(value: Any) -> Optional[str]:
+    """Render a test-category value as one string; ``None`` stays ``None``."""
+    if value is None or isinstance(value, str):
+        return value
+    if not isinstance(value, (list, tuple, set)):
+        return str(value)
+    # Collections are comma-joined without their ``None`` members; an empty
+    # join collapses to ``None``.
+    return ",".join(map(str, (v for v in value if v is not None))) or None
+
+
+BFCLV4_DEBUG_ENV = "BFCLV4_DEBUG"
+GEN_AI_INPUT_MESSAGES_ATTR = "gen_ai.input.messages"
+GEN_AI_OUTPUT_MESSAGES_ATTR = "gen_ai.output.messages"
+GEN_AI_SYSTEM_INSTRUCTIONS_ATTR = "gen_ai.system_instructions"
+GEN_AI_TOOL_CALL_ARGUMENTS_ATTR = "gen_ai.tool.call.arguments"
+GEN_AI_TOOL_CALL_RESULT_ATTR = "gen_ai.tool.call.result"
+GEN_AI_TOOL_CALL_ID_ATTR = "gen_ai.tool.call.id"
+GEN_AI_TOOL_NAME_ATTR = "gen_ai.tool.name"
+GEN_AI_TOOL_TYPE_ATTR = "gen_ai.tool.type"
+GEN_AI_TOOL_DESCRIPTION_ATTR = "gen_ai.tool.description"
+BFCL_SYNTHETIC_TOOL_CALL = "bfcl.tool.synthetic_from_model_response"  # set to ``True`` on synthetic TOOL spans
+_TOOL_DESCRIPTION_MAP: ContextVar[dict[str, str]] = ContextVar(
+    "bfclv4_tool_description_map", default={}  # NOTE(review): one shared default dict - never mutate the result of ``.get()``
+)
+
+
+
+def _json_attr(value: Any) -> str:
+    """Return ``value`` as JSON text (non-ASCII kept, ``str()`` for unknown types)."""
+    import json
+    try:
+        return json.dumps(value, ensure_ascii=False, default=str)
+    except Exception:  # noqa: BLE001
+        return _safe_str(value)  # last resort when JSON encoding fails
+
+
+def _message_dict(role: str, content: Any) -> dict:
+    """Build a message dict holding ``role`` and a single text part."""
+    # ``content`` is stringified first, then passed through ``truncate_text``.
+    text_part = {"type": "text", "content": truncate_text(_safe_str(content))}
+    return {"role": role, "parts": [text_part]}
+
+
+def _system_instruction_dict(content: Any) -> dict:
+    return {"type": "text", "content": truncate_text(_safe_str(content))}  # one text part; stringified, then truncated
+
+
+def _test_entry_to_messages(test_entry: Any):
+    """Split a BFCL test entry into ``(input_messages, system_instructions)``.
+
+    System text comes from the ``system*`` keys listed below and from any
+    ``role == "system"`` item of ``question``; every other ``question`` item
+    becomes an input message.  A non-dict entry gives ``([], [])``.
+    """
+    if not isinstance(test_entry, dict):
+        return [], []
+    system_keys = (
+        "system", "system_prompt",
+        "system_instruction", "system_instructions",
+    )
+    system_instructions = [
+        Text(content=truncate_text(_safe_str(raw)))
+        for raw in map(test_entry.get, system_keys)
+        if raw not in (None, "", [], {})
+    ]
+
+    inputs: list = []
+    _append_question_messages(test_entry.get("question"), inputs, system_instructions)
+    return inputs, system_instructions
+
+
+def _append_question_messages(value: Any, inputs: list, system_instructions: list) -> None:
+    """Append the message(s) described by ``value`` to the two output lists.
+
+    Lists/tuples are walked recursively.  A dict is one message: its
+    ``role`` (default ``"user"``) picks the list - ``"system"`` goes to
+    ``system_instructions``, anything else to ``inputs`` - and its
+    ``content`` falls back to the dict minus ``role``/``name``/
+    ``tool_call_id``.  Any other value becomes a user message.  Empty
+    values (``None``, ``""``, ``[]``, ``{}``) add nothing.
+    """
+    empty = (None, "", [], {})
+    if value in empty:
+        return
+    if isinstance(value, (list, tuple)):
+        for element in value:
+            _append_question_messages(element, inputs, system_instructions)
+        return
+    if not isinstance(value, dict):
+        inputs.extend(to_text_input("user", truncate_text(_safe_str(value))))
+        return
+    role = str(value.get("role") or "user")
+    body = value.get("content")
+    if body in empty:
+        meta = {"role", "name", "tool_call_id"}
+        body = {k: v for k, v in value.items() if k not in meta}
+    if body in empty:
+        return
+    text = truncate_text(_safe_str(body))
+    if role == "system":
+        system_instructions.append(Text(content=text))
+    else:
+        inputs.extend(to_text_input(role, text))
+
+
+def _test_entry_to_tool_definitions(test_entry: Any) -> list:
+    """Collect de-duplicated tool definitions from a BFCL test entry.
+
+    Reads ``function``/``functions``/``tools``/``tool_definitions`` and then
+    ``missed_function`` (each of its values when it is a dict).  A non-dict
+    entry gives ``[]``.
+    """
+    if not isinstance(test_entry, dict):
+        return []
+
+    keys = ("function", "functions", "tools", "tool_definitions")
+    sources = [test_entry.get(key) for key in keys]
+    missed = test_entry.get("missed_function")
+    sources.extend(missed.values() if isinstance(missed, dict) else [missed])
+    found = [d for src in sources for d in _tool_value_to_definitions(src)]
+    return _dedupe_tool_definitions(found)
+
+
+def _tool_value_to_definitions(value: Any) -> list:
+    """Convert one raw tool description into a list of tool definitions.
+
+    * Empty values, unparsable strings and other scalars give ``[]``.
+    * A string is parsed as JSON and the parsed value is processed.
+    * Lists/tuples are flattened recursively.
+    * A dict wrapping a ``function`` dict is unwrapped; the inner dict
+      inherits the outer ``type`` (default ``"function"``) if it has none.
+    * A dict without ``name``/``function_name``/``tool_name`` gives ``[]``.
+
+    A named dict becomes a ``GenericToolDefinition`` when it has a non-empty
+    ``type`` other than ``"function"`` and no ``parameters``; otherwise it
+    becomes a ``FunctionToolDefinition``.
+    """
+    if value in (None, "", [], {}):
+        return []
+    if isinstance(value, str):
+        try:
+            import json
+
+            value = json.loads(value)
+        except Exception:  # noqa: BLE001
+            return []
+    if isinstance(value, (list, tuple)):
+        return [d for item in value for d in _tool_value_to_definitions(item)]
+    if not isinstance(value, dict):
+        return []
+
+    inner = value.get("function")
+    if isinstance(inner, dict):
+        unwrapped = dict(inner)
+        unwrapped.setdefault("type", value.get("type", "function"))
+        return _tool_value_to_definitions(unwrapped)
+
+    name = value.get("name") or value.get("function_name") or value.get("tool_name")
+    if not name:
+        return []
+    kind, params = value.get("type"), value.get("parameters")
+    if params is None and kind not in (None, "", "function"):
+        return [GenericToolDefinition(name=str(name), type=str(kind))]
+    summary = value.get("description")
+    if summary is not None:
+        summary = _safe_str(summary)
+    return [FunctionToolDefinition(name=str(name), description=summary, parameters=params)]
+
+
+def _dedupe_tool_definitions(definitions: list) -> list:
+    """Drop repeated definitions, keeping the first of each in input order.
+
+    Two definitions are equal when the JSON form of their ``__dict__``
+    (or of their ``repr`` when they have none) is identical.
+    """
+    unique: dict = {}
+    for item in definitions:
+        unique.setdefault(_json_attr(getattr(item, "__dict__", repr(item))), item)
+    return list(unique.values())
+
+
+def _tool_description_map(test_entry: Any) -> dict[str, str]:
+    """Map tool names to descriptions for one BFCL test entry.
+
+    Declared tool definitions win.  Multi-turn BFCL cases often leave ``function``
+    empty and expose tools via ``involved_classes``, so public method docstrings of
+    those BFCL executable classes fill the gaps for ``gen_ai.tool.description``.
+    """
+    descriptions: dict[str, str] = {}
+    for definition in _test_entry_to_tool_definitions(test_entry):
+        name, text = getattr(definition, "name", None), getattr(definition, "description", None)
+        if name and text:
+            descriptions[str(name)] = _safe_str(text)
+    if not isinstance(test_entry, dict):
+        return descriptions
+
+    class_names = test_entry.get("involved_classes") or []
+    class_names = class_names if isinstance(class_names, (list, tuple)) else []
+    try:
+        from bfcl_eval.constants.executable_backend_config import CLASS_FILE_PATH_MAPPING  # noqa: PLC0415
+    except Exception:  # noqa: BLE001
+        CLASS_FILE_PATH_MAPPING = {}
+    for class_name in class_names:
+        module_name = CLASS_FILE_PATH_MAPPING.get(class_name)
+        if not module_name:
+            continue
+        try:
+            cls = getattr(importlib.import_module(module_name), class_name)
+        except Exception:  # noqa: BLE001
+            continue
+        for attr, func in inspect.getmembers(cls, predicate=inspect.isfunction):
+            if attr.startswith("_") or attr in descriptions:
+                continue
+            if doc := inspect.getdoc(func):
+                descriptions[attr] = truncate_text(doc, 1024)
+    return descriptions
+
+
+def _lookup_tool_description(tool_name: Optional[str]) -> Optional[str]:
+    """Return a description for ``tool_name`` or ``None``.
+
+    Order: context-local map, then same-named class-attribute docstrings from
+    the modules in BFCL's ``CLASS_FILE_PATH_MAPPING`` (import failures skipped).
+    """
+    if not tool_name:
+        return None
+    wanted = str(tool_name)
+    if known := _TOOL_DESCRIPTION_MAP.get().get(wanted):
+        return known
+    try:
+        from bfcl_eval.constants.executable_backend_config import CLASS_FILE_PATH_MAPPING  # noqa: PLC0415
+    except Exception:  # noqa: BLE001
+        CLASS_FILE_PATH_MAPPING = {}
+    for module_name in CLASS_FILE_PATH_MAPPING.values():
+        try:
+            module = importlib.import_module(module_name)
+        except Exception:  # noqa: BLE001
+            continue
+        for _, cls in inspect.getmembers(module, inspect.isclass):
+            member = getattr(cls, wanted, None)
+            if member is not None and (doc := inspect.getdoc(member)):
+                return truncate_text(doc, 1024)
+    return None
+
+
+def _normalise_tool_arguments(arguments: Any) -> Any:
+    return arguments if arguments is not None else {}  # ``None`` becomes an empty dict
+
+
+def _extract_questions_from_cases(cases: Any) -> list:
+    """Build user message dicts from the questions of the first 10 cases.
+
+    Non-dict cases and cases whose ``question`` is ``None`` are skipped.
+    """
+    head = cases[:10] if isinstance(cases, (list, tuple)) else ()
+    questions = (case.get("question") for case in head if isinstance(case, dict))
+    return [_message_dict("user", q) for q in questions if q is not None]
+
+
+def _extract_tool_defs_from_cases(cases: Any) -> list:
+    """Build system-instruction dicts from ``function`` of the first 10 cases.
+
+    Non-dict cases and cases whose ``function`` is ``None`` are skipped.
+    """
+    head = cases[:10] if isinstance(cases, (list, tuple)) else ()
+    functions = (case.get("function") for case in head if isinstance(case, dict))
+    return [_system_instruction_dict(f) for f in functions if f is not None]
+
+
+def _set_json_span_attr(span: Any, key: str, value: Any) -> None:
+    """Best-effort: JSON-encode a truthy ``value`` onto a recording ``span``."""
+    if value and span is not None:
+        try:
+            if span.is_recording():
+                span.set_attribute(key, _json_attr(value))
+        except Exception:  # noqa: BLE001
+            logger.debug("bfclv4: failed to set json attr %s", key, exc_info=True)
+
+
+def _span_attr_value(value: Any) -> str:
+    return _json_attr(value) if not isinstance(value, str) else value  # strings pass through unchanged
+
+
+def _set_tool_call_span_attrs(
+    span: Any,
+    *,
+    arguments: Any = None,
+    result: Any = None,
+    description: Optional[str] = None,
+    tool_name: Optional[str] = None,
+    tool_call_id: Optional[str] = None,
+    tool_type: Optional[str] = "function",
+) -> None:
+    """Best-effort: write the ``gen_ai.tool.*`` attributes onto a TOOL span.
+
+    Does nothing when ``span`` is ``None`` or not recording.  Id, name, type
+    and description are set only when truthy; ``arguments``/``result`` are
+    set whenever they are not ``None`` (strings as-is, others JSON-encoded).
+    Errors are debug-logged, never raised.
+    The per-call summary goes to ``logger.debug`` rather than stderr, so the
+    host application's console stays clean unless debug logging is enabled.
+    """
+    if span is None:
+        return
+    try:
+        if not span.is_recording():
+            return
+        plain = (
+            (GEN_AI_TOOL_CALL_ID_ATTR, tool_call_id),
+            (GEN_AI_TOOL_NAME_ATTR, tool_name),
+            (GEN_AI_TOOL_TYPE_ATTR, tool_type),
+        )
+        for key, value in plain:
+            if value:
+                span.set_attribute(key, value)
+        if arguments is not None:
+            span.set_attribute(GEN_AI_TOOL_CALL_ARGUMENTS_ATTR, _span_attr_value(arguments))
+        if result is not None:
+            span.set_attribute(GEN_AI_TOOL_CALL_RESULT_ATTR, _span_attr_value(result))
+        if description:
+            span.set_attribute(GEN_AI_TOOL_DESCRIPTION_ATTR, description)
+        logger.debug(
+            "bfclv4 TOOL attrs: name=%s id=%s has_arguments=%s has_result=%s has_description=%s",
+            tool_name, tool_call_id, arguments is not None, result is not None, bool(description),
+        )
+    except Exception:  # noqa: BLE001
+        logger.debug("bfclv4: failed to set TOOL call attrs", exc_info=True)
+
+
+def _parse_python_call_arguments(func_call: Any) -> Any:
+    """Parse a Python call string such as ``f(1, x=2)`` into an arguments dict.
+
+    Positionals become ``arg_<i>``, keywords keep their name and ``**`` goes
+    under ``"kwargs"``; a call without arguments gives ``None``.  Anything
+    that is not a parsable call expression goes to ``_extract_tool_arguments``.
+    """
+    call = None
+    if isinstance(func_call, str) and "(" in func_call:
+        try:
+            call = ast.parse(func_call, mode="eval").body
+        except SyntaxError:
+            call = None
+    if not isinstance(call, ast.Call):
+        return _extract_tool_arguments(func_call)
+    parsed = {f"arg_{i}": _literal_or_source(a, func_call) for i, a in enumerate(call.args)}
+    for kw in call.keywords:
+        parsed["kwargs" if kw.arg is None else kw.arg] = _literal_or_source(kw.value, func_call)
+    return parsed or None
+
+
+def _literal_or_source(node: ast.AST, source: str) -> Any:
+    """Return ``node``'s literal value, else its text in ``source``, else ``_safe_str(node)``."""
+    try:
+        return ast.literal_eval(node)
+    except Exception:  # noqa: BLE001
+        return _safe_str(node) if (text := ast.get_source_segment(source, node)) is None else text
+
+
+def _iter_model_tool_calls(result_payload: Any):
+    """Generate ``(tool_name, arguments)`` pairs from BFCL single-turn decoded output.
+
+    Dict items give one pair per key, string items are parsed as call text; the rest is ignored.
+    """
+    for entry in result_payload if isinstance(result_payload, list) else ():
+        if isinstance(entry, str):
+            yield _extract_tool_name(entry), _parse_python_call_arguments(entry)
+        elif isinstance(entry, dict):
+            yield from ((str(key), args) for key, args in entry.items())
+
+
+def _emit_synthetic_tool_spans(
+    result_payload: Any,
+    *,
+    test_entry_id: Optional[Any],
+    model_name: Optional[Any],
+) -> int:
+    """Emit TOOL spans for BFCL cases that generate calls but do not execute them.
+
+    Returns how many spans were emitted; a failure while emitting one span is
+    debug-logged and the remaining calls are still processed.
+    """
+    calls = list(_iter_model_tool_calls(result_payload))
+    if not calls:
+        return 0
+    handler_obj = get_extended_telemetry_handler()
+    emitted = 0
+    for index, (tool_name, raw_arguments) in enumerate(calls):
+        description = _lookup_tool_description(tool_name)
+        arguments = _normalise_tool_arguments(raw_arguments)
+        # Built once: the invocation and the span attribute must carry the same id.
+        tool_call_id = _synth_tool_call_id(test_entry_id, model_name, index)
+        tool_inv = ExecuteToolInvocation(
+            tool_name=tool_name or "unknown", tool_call_id=tool_call_id,
+            tool_type="function", tool_description=description,
+            tool_call_arguments=arguments, tool_call_result=None,
+        )
+        try:
+            with handler_obj.execute_tool(tool_inv) as inv:
+                span = inv.span
+                if span is not None and span.is_recording():
+                    span.set_attribute(GEN_AI_FRAMEWORK, FRAMEWORK_NAME)
+                    span.set_attribute(BFCL_TOOL_INDEX, index)
+                    span.set_attribute(BFCL_SYNTHETIC_TOOL_CALL, True)
+                    if test_entry_id is not None:
+                        span.set_attribute(BFCL_TEST_ENTRY_ID, str(test_entry_id))
+                    _set_tool_call_span_attrs(
+                        span, arguments=arguments, description=description,
+                        tool_name=tool_name, tool_call_id=tool_call_id, tool_type="function",
+                    )
+            emitted += 1
+        except Exception:  # noqa: BLE001
+            logger.debug("bfclv4 synthetic TOOL span emission failed", exc_info=True)
+    return emitted
+
+
+# ---------------------------------------------------------------------------
+# ENTRY wrapper
+
+
+class GenerateResultsWrapper:
+    """Wraps ``bfcl_eval._llm_response_generation.generate_results``.
+
+    Responsibilities:
+
+    * Open the ENTRY span (``enter_ai_application_system``).
+    * Temporarily swap the ``ThreadPoolExecutor`` reference inside the BFCL
+      generation module to a context-propagating subclass so that AGENT spans
+      created in worker threads inherit the ENTRY span as parent.
+    * Publish ``args.backend`` to ``BFCL_BACKEND`` so that
+      :func:`infer_provider` can attribute OSS spans to vllm / sglang.
+
+    Both process-wide changes are reverted in a ``finally`` that is entered
+    before they are made, so no later failure can leave them in place.
+    """
+
+    def __init__(self, helper: GenAIHookHelper) -> None:
+        self._helper = helper
+
+    def __call__(self, wrapped: Callable, instance: Any, args, kwargs):  # noqa: D401
+        """Run ``generate_results(args, model_name, test_cases_total)`` in an ENTRY span.
+
+        Calls ``wrapped`` untouched when the BFCL generation module cannot be
+        imported.  The wrapped function's result and exceptions pass through.
+        """
+        cli_args = args[0] if len(args) >= 1 else kwargs.get("args")
+        model_name = args[1] if len(args) >= 2 else kwargs.get("model_name")
+        test_cases_total = args[2] if len(args) >= 3 else kwargs.get("test_cases_total")
+
+        try:
+            from bfcl_eval import (  # noqa: PLC0415
+                _llm_response_generation as _bfcl_gen,
+            )
+        except ImportError:
+            return wrapped(*args, **kwargs)
+
+        backend_value = _safe_get(cli_args, "backend", None) if cli_args is not None else None
+        previous_backend_env = os.environ.get(OSS_BACKEND_ENV)
+        original_executor = getattr(_bfcl_gen, "ThreadPoolExecutor", None)
+        # Patch inside the ``try`` so the ``finally`` undoes it on every exit path.
+        try:
+            if original_executor is not None:
+                _bfcl_gen.ThreadPoolExecutor = ContextPropagatingExecutor
+            if backend_value:
+                os.environ[OSS_BACKEND_ENV] = str(backend_value)
+            return self._run_entry(wrapped, args, kwargs, cli_args, model_name, test_cases_total)
+        finally:
+            if original_executor is not None:
+                try:
+                    _bfcl_gen.ThreadPoolExecutor = original_executor
+                except Exception:  # noqa: BLE001
+                    logger.debug(
+                        "bfclv4 ENTRY: failed to restore ThreadPoolExecutor",
+                        exc_info=True,
+                    )
+            if backend_value:
+                if previous_backend_env is None:
+                    os.environ.pop(OSS_BACKEND_ENV, None)
+                else:
+                    os.environ[OSS_BACKEND_ENV] = previous_backend_env
+
+    @staticmethod
+    def _entry_attributes(cli_args: Any, test_cases_total: Any) -> dict:
+        """Collect the ENTRY span attributes derived from the CLI args and the case list."""
+        attributes: dict = {GEN_AI_FRAMEWORK: FRAMEWORK_NAME}
+        category_value = _join_test_category(_safe_get(cli_args, "test_category", None))
+        if category_value:
+            attributes[BFCL_TEST_CATEGORY] = category_value
+        num_threads = _safe_get(cli_args, "num_threads", None)
+        if num_threads is not None:
+            try:
+                attributes[BFCL_NUM_THREADS] = int(num_threads)
+            except (TypeError, ValueError):
+                pass  # not integer-like: leave the attribute off
+        if isinstance(test_cases_total, (list, tuple)):
+            attributes[BFCL_TEST_CASE_COUNT] = len(test_cases_total)
+        attributes[BFCL_RUN_IDS] = bool(_safe_get(cli_args, "run_ids", False))
+        return attributes
+
+    def _run_entry(self, wrapped, args, kwargs, cli_args, model_name, test_cases_total):
+        """Open the ENTRY span, set its attributes, and call ``wrapped`` inside it."""
+        session_id_default = None
+        if model_name is not None:
+            try:
+                session_id_default = f"{model_name}@{int(time.time())}"
+            except Exception:  # noqa: BLE001
+                session_id_default = None
+        session_id = os.environ.get("BFCL_SESSION_ID") or session_id_default
+
+        entry_inv = EntryInvocation(session_id=session_id)
+        entry_input_messages = _extract_questions_from_cases(test_cases_total)
+        entry_system_instructions = _extract_tool_defs_from_cases(test_cases_total)
+        entry_inv.input_messages = to_text_input("user", _safe_str(entry_input_messages))
+        handler = get_extended_telemetry_handler()
+        attributes = self._entry_attributes(cli_args, test_cases_total)
+
+        with handler.entry(entry_inv) as inv:
+            if inv.span is not None and inv.span.is_recording():
+                for key, value in attributes.items():
+                    try:
+                        inv.span.set_attribute(key, value)
+                    except Exception:  # noqa: BLE001
+                        logger.debug("bfclv4 ENTRY set_attribute(%s) failed", key, exc_info=True)
+                _set_json_span_attr(inv.span, GEN_AI_INPUT_MESSAGES_ATTR, entry_input_messages)
+                _set_json_span_attr(inv.span, GEN_AI_SYSTEM_INSTRUCTIONS_ATTR, entry_system_instructions)
+            result = wrapped(*args, **kwargs)
+            if inv.span is not None and inv.span.is_recording():
+                done = {"model": model_name, "status": "generate_results_completed"}
+                _set_json_span_attr(
+                    inv.span, GEN_AI_OUTPUT_MESSAGES_ATTR, [_message_dict("assistant", done)]
+                )
+            return result
+
+
+# ---------------------------------------------------------------------------
+# AGENT wrapper
+
+# Prefix of the result string BFCL returns when it captured an inference error.
+_BFCL_INFERENCE_ERROR_PREFIX = "Error during inference:"
+
+
+class BaseHandlerInferenceWrapper:
+    """Wraps ``BaseHandler.inference``.
+
+    Creates the AGENT span (kind=AGENT, op=invoke_agent) and initialises the
+    per-thread ReAct state used by the STEP wrapper.
+
+    BFCL's outer ``multi_threaded_inference`` catches every exception and
+    converts it into a ``"Error during inference: ..."`` string; we mirror
+    that behaviour by setting the AGENT span status to ERROR when the
+    returned ``result`` looks like an error string, instead of relying on
+    a re-raised exception.
+    """
+
+    def __init__(self, helper: GenAIHookHelper) -> None:
+        self._helper = helper
+
+    def __call__(self, wrapped: Callable, instance: Any, args, kwargs):  # noqa: D401
+        # ``inference(self, test_entry, include_input_log, exclude_state_log)``
+        test_entry = args[0] if args else kwargs.get("test_entry")
+        if not isinstance(test_entry, dict):
+            return wrapped(*args, **kwargs)
+
+        provider, extra_attrs = infer_provider(instance)
+        request_model = getattr(instance, "model_name", None)
+        test_entry_id = test_entry.get("id")
+        category = _test_category_from_id(test_entry_id)
+        involved_classes = test_entry.get("involved_classes") or []
+        agent_description = (
+            ", ".join(str(c) for c in involved_classes)
+            if isinstance(involved_classes, (list, tuple))
+            else None
+        )
+
+        invocation = InvokeAgentInvocation(
+            provider=provider or "unknown",
+            request_model=request_model,
+            agent_id=test_entry_id,
+            agent_name=category or "bfcl_agent",
+            agent_description=agent_description or None,
+            conversation_id=test_entry_id,
+        )
+
+        token = init_state()
+        tool_description_token = _TOOL_DESCRIPTION_MAP.set(
+            _tool_description_map(test_entry)
+        )
+        handler = get_extended_telemetry_handler()
+        try:
+            with handler.invoke_agent(invocation) as inv:
+                if inv.span is not None and inv.span.is_recording():
+                    inv.span.set_attribute(GEN_AI_FRAMEWORK, FRAMEWORK_NAME)
+                    if provider:
+                        inv.span.set_attribute(GEN_AI_PROVIDER_NAME, provider)
+                    if test_entry_id is not None:
+                        inv.span.set_attribute(
+                            BFCL_TEST_ENTRY_ID, test_entry_id
+                        )
+                    if category is not None:
+                        inv.span.set_attribute(BFCL_TEST_CATEGORY, category)
+                    for key, value in extra_attrs.items():
+                        if value is not None:
+                            inv.span.set_attribute(key, value)
+
+                # Capture inputs for the AGENT. Also write span attributes directly
+                # because util-genai gates message attributes behind experimental
+                # content-capture mode, which makes K8s semantic validation opaque.
+                question = test_entry.get("question")
+                functions = test_entry.get("function")
+                if question is not None:
+                    inv.input_messages = to_text_input(
+                        "user", truncate_text(_safe_str(question))
+                    )
+                if functions is not None:
+                    system_messages = to_text_input(
+                        "system", truncate_text(_safe_str(functions))
+                    )
+                    inv.system_instruction = (
+                        system_messages[0].parts if system_messages else []
+                    )
+                if inv.span is not None and inv.span.is_recording():
+                    _set_json_span_attr(inv.span, GEN_AI_INPUT_MESSAGES_ATTR, [_message_dict("user", question)])
+                    _set_json_span_attr(inv.span, GEN_AI_SYSTEM_INSTRUCTIONS_ATTR, [_system_instruction_dict(functions)])
+                # Run the original inference call. If it raises, the CM
+                # marks the span as failed; we leave it to the handler/CM
+                # to call ``fail_invoke_agent``.
+                result = wrapped(*args, **kwargs)
+
+                # Detect BFCL's own captured error path (no exception raised
+                # but the returned result is the error string).
+                result_payload = (
+                    result[0] if isinstance(result, tuple) and result else None
+                )
+                metadata_payload = (
+                    result[1]
+                    if isinstance(result, tuple) and len(result) >= 2
+                    else None
+                )
+
+                if (
+                    isinstance(result_payload, str)
+                    and result_payload.startswith(_BFCL_INFERENCE_ERROR_PREFIX)
+                    and inv.span is not None
+                    and inv.span.is_recording()
+                ):
+                    try:
+                        from opentelemetry.trace import Status, StatusCode
+
+                        inv.span.set_status(
+                            Status(StatusCode.ERROR, result_payload[:200])
+                        )
+                    except Exception:  # noqa: BLE001
+                        logger.debug(
+                            "bfclv4 AGENT: failed to set ERROR status",
+                            exc_info=True,
+                        )
+
+                if isinstance(metadata_payload, dict):
+                    input_tokens = _flatten_tokens(
+                        metadata_payload.get("input_token_count")
+                    )
+                    output_tokens = _flatten_tokens(
+                        metadata_payload.get("output_token_count")
+                    )
+                    if input_tokens is not None:
+                        inv.input_tokens = input_tokens
+                    if output_tokens is not None:
+                        inv.output_tokens = output_tokens
+
+                if result_payload is not None:
+                    inv.output_messages = to_text_output(
+                        "assistant",
+                        truncate_text(_safe_str(result_payload)),
+                    )
+                    if inv.span is not None and inv.span.is_recording():
+                        _set_json_span_attr(inv.span, GEN_AI_OUTPUT_MESSAGES_ATTR, [_message_dict("assistant", result_payload)])
+
+                _emit_synthetic_tool_spans(
+                    result_payload,
+                    test_entry_id=test_entry_id,
+                    model_name=request_model,
+                )
+
+                return result
+        finally:
+            try:
+                _TOOL_DESCRIPTION_MAP.reset(tool_description_token)
+            except (LookupError, ValueError):
+                pass
+            reset_state(token)
+
+
+def _safe_str(value: Any) -> str:
+    """Return ``value`` as text: ``str`` as-is, else JSON, else ``str()``."""
+    import json
+    if not isinstance(value, str):
+        try:
+            return json.dumps(value, ensure_ascii=False, default=str)
+        except Exception:  # noqa: BLE001
+            try:
+                return str(value)
+            except Exception:  # noqa: BLE001
+                return "<unserialisable>"
+    return value
+
+
+def _result_to_output_messages(result: Any):
+    """Convert an inference result into assistant output messages."""
+    payload = result
+    if isinstance(result, tuple) and result:
+        payload = result[0]
+    if payload in (None, "", [], {}):
+        return []
+    if isinstance(payload, (list, tuple)):
+        nested = (_result_to_output_messages(item) for item in payload)
+        return [message for group in nested for message in group]
+
+    content = _extract_result_content(payload)
+    if content in (None, "", [], {}):
+        return []
+    return to_text_output("assistant", truncate_text(_safe_str(content)))
+
+
+def _extract_result_content(result: Any) -> Any:
+    """Pick the most answer-like value out of an inference result.
+
+    Falls back to the highest-numbered non-empty ``inference_log`` step
+    output/answer and finally to ``result``; non-dicts pass through as-is.
+    """
+    if not isinstance(result, dict):
+        return result
+    preferred_keys = (
+        "final_answer",
+        "answer",
+        "output",
+        "result",
+        "model_response",
+        "model_responses",
+        "inference_output",
+    )
+    for name in preferred_keys:
+        candidate = result.get(name)
+        if candidate not in (None, "", [], {}):
+            return candidate
+    inference_log = result.get("inference_log")
+    if not isinstance(inference_log, dict):
+        return result
+    step_keys = [k for k in inference_log if k.startswith("step_")]
+    step_keys.sort(key=_step_log_sort_key, reverse=True)
+    for step_key in step_keys:
+        step_data = inference_log.get(step_key)
+        if not isinstance(step_data, dict):
+            continue
+        for field in ("inference_output", "inference_answer"):
+            candidate = step_data.get(field)
+            if candidate not in (None, "", [], {}):
+                return candidate
+    return result
+
+
+def _step_log_sort_key(key: str) -> int:
+    try:
+        return int(key[len("step_"):])  # integer after the "step_" prefix
+    except (TypeError, ValueError):
+        return -1  # keys without an integer suffix get the sort key -1
+
+
+# ---------------------------------------------------------------------------
+# STEP wrapper
+
+
+class QueryWrapper:
+    """Wraps ``<Handler>._query_FC`` / ``_query_prompting``.
+
+    Creates a ReAct STEP span around the query and tags it with the query
+    mode, provider, request model and current turn index.
+
+    Token usage is deliberately *not* read here: parsing the response a
+    second time would exhaust single-pass streaming responses (see the
+    comment in ``__call__``). A response that is an iterator is instead
+    drained inside the STEP span and handed back as an iterator over the
+    buffered chunks.
+    """
+
+    def __init__(self, helper: GenAIHookHelper, mode: str) -> None:
+        self._helper = helper
+        self._mode = mode  # "FC" or "prompting"
+
+    def __call__(self, wrapped: Callable, instance: Any, args, kwargs):  # noqa: D401
+        round_idx = bump_round()
+        provider, extra_attrs = infer_provider(instance)
+
+        invocation = ReactStepInvocation(round=round_idx)
+        handler_obj = get_extended_telemetry_handler()
+        with handler_obj.react_step(invocation) as step_inv:
+            span = step_inv.span
+            if span is not None and span.is_recording():
+                span.set_attribute(GEN_AI_FRAMEWORK, FRAMEWORK_NAME)
+                span.set_attribute(BFCL_QUERY_MODE, self._mode)
+                if provider:
+                    span.set_attribute(GEN_AI_PROVIDER_NAME, provider)
+                model_name = getattr(instance, "model_name", None)
+                if model_name:
+                    span.set_attribute(
+                        "gen_ai.request.model", str(model_name)
+                    )
+                from opentelemetry.instrumentation.bfclv4.internal.state import (
+                    get_state,
+                )
+
+                state = get_state()
+                if state is not None:
+                    span.set_attribute(BFCL_TURN_IDX, state.get("turn_idx", 0))
+                for key, value in extra_attrs.items():
+                    if value is not None:
+                        span.set_attribute(key, value)
+
+            # Exceptions propagate untouched: the context manager marks the
+            # span as failed and BFCL's outer try/except turns them into an
+            # "Error during inference: ..." result string at the AGENT layer.
+            api_response, query_latency = wrapped(*args, **kwargs)
+
+            # When the underlying handler returns a streaming wrapper
+            # (e.g. ``ChatStreamWrapper`` from openai-v2), the LLM span and
+            # its OTel context attach are kept alive until the stream is
+            # consumed by BFCL's ``_parse_query_response_*`` *outside* of
+            # this STEP context manager. That breaks the LIFO ordering of
+            # context attach/detach, leaving the LLM span as the "current"
+            # span after the STEP CM exits, which causes the next STEP and
+            # any TOOL spans to be parented to the previous STEP rather
+            # than to the AGENT.
+            #
+            # To preserve LIFO ordering, force-consume the stream here
+            # (inside the STEP context) and replace it with a plain
+            # iterator over the cached chunks. This makes ``stop_llm``
+            # (which detaches the LLM context) run *before* STEP detaches.
+            if hasattr(api_response, "__next__"):
+                try:
+                    chunks = list(api_response)
+                    api_response = iter(chunks)
+                except Exception:  # noqa: BLE001
+                    logger.debug(
+                        "bfclv4 STEP: failed to materialise streaming "
+                        "response; LLM/STEP nesting may be incorrect",
+                        exc_info=True,
+                    )
+
+            # Post-call attribute enrichment - use try/except so that any
+            # vendor-side parsing surprise never breaks BFCL itself.
+            #
+            # IMPORTANT: We must NOT re-call ``_parse_query_response_*`` here,
+            # because for streaming providers (e.g. Qwen DashScope) the
+            # ``api_response`` is a single-pass generator that the parser
+            # consumes; calling it twice leaves BFCL's own subsequent call to
+            # the parser with an exhausted iterator, which crashes inference
+            # with ``UnboundLocalError: chunk``. Token usage will instead be
+            # recovered later from the AGENT-level metadata payload.
+            try:
+                if (
+                    span is not None
+                    and span.is_recording()
+                    and isinstance(query_latency, (int, float))
+                ):
+                    span.set_attribute(
+                        "gen_ai.response.time_to_first_token",
+                        int(float(query_latency) * 1e9),
+                    )
+            except Exception:  # noqa: BLE001
+                logger.debug(
+                    "bfclv4 STEP: post-call enrichment failed", exc_info=True
+                )
+
+            return api_response, query_latency
+
+
+def _infer_finish_reason(model_responses: Any) -> str:
+    """Guess ``gen_ai.react.finish_reason`` from decoded model responses.
+
+    ``None`` is "unknown"; an empty list or a single falsy item is
+    "empty_response" and any other list "tool_calls"; a string is "stop"
+    (prompting models often return plain strings); the rest "continue".
+    """
+    if model_responses is None:
+        return "unknown"
+    if isinstance(model_responses, str):
+        return "stop"
+    if not isinstance(model_responses, list):
+        return "continue"
+    if len(model_responses) <= 1 and not any(model_responses):
+        return "empty_response"
+    return "tool_calls"
+
+
+# ---------------------------------------------------------------------------
+# turn_idx maintenance wrappers (no spans)
+
+
+class TurnBumpWrapper:
+    """Keeps ``bfcl.turn_idx`` in sync; creates no spans.
+
+    ``reset=True`` (meant for ``<Handler>.add_first_turn_message_*``) zeroes
+    the turn and round counters, ``reset=False`` (meant for
+    ``<Handler>._add_next_turn_user_message_*``) advances the turn.
+    """
+
+    def __init__(self, *, reset: bool) -> None:
+        self._reset = reset
+
+    def __call__(self, wrapped: Callable, instance: Any, args, kwargs):  # noqa: D401
+        # State upkeep is best-effort and must never block the wrapped call.
+        try:
+            if not self._reset:
+                bump_turn()
+            else:
+                from opentelemetry.instrumentation.bfclv4.internal.state import (
+                    get_state,
+                )
+
+                current = get_state()
+                if current is not None:
+                    current["turn_idx"] = 0
+                    current["fc_round"] = 0
+        except Exception:  # noqa: BLE001
+            logger.debug(
+                "bfclv4: turn_idx maintenance failed", exc_info=True
+            )
+        return wrapped(*args, **kwargs)
+
+
+# ---------------------------------------------------------------------------
+# TOOL wrapper
+
+
+class ExecuteFuncCallWrapper:
+    """Wraps
+    ``bfcl_eval.eval_checker.multi_turn_eval.multi_turn_utils.execute_multi_turn_func_call``.
+
+    BFCL evaluates a list of function-call strings in a single Python call;
+    we surface each one as its own TOOL span by post-processing the wrapped
+    result.  The spans are emitted after the batch has run, so their
+    duration is not the real per-call latency; they are flagged with
+    ``bfcl.tool.duration_is_estimated=true``.
+    """
+
+    def __init__(self, helper: GenAIHookHelper) -> None:
+        self._helper = helper
+
+    def __call__(self, wrapped: Callable, instance: Any, args, kwargs):  # noqa: D401
+        # ``execute_multi_turn_func_call(func_call_list, initial_config,
+        #                                involved_classes, model_name,
+        #                                test_entry_id, long_context=False,
+        #                                is_evaL_run=False)``
+        func_call_list = args[0] if args else kwargs.get("func_call_list", [])
+        model_name = (
+            args[3]
+            if len(args) >= 4
+            else kwargs.get("model_name")
+        )
+        test_entry_id = (
+            args[4]
+            if len(args) >= 5
+            else kwargs.get("test_entry_id")
+        )
+
+        if not isinstance(func_call_list, list) or not func_call_list:
+            return wrapped(*args, **kwargs)
+
+        t0 = time.perf_counter()
+        try:
+            result = wrapped(*args, **kwargs)
+        finally:
+            elapsed = max(time.perf_counter() - t0, 0.0)
+
+        execution_results: List[str] = []
+        if isinstance(result, tuple) and result:
+            payload = result[0]
+            if isinstance(payload, list):
+                execution_results = list(payload)
+
+        per_call_seconds = (
+            elapsed / len(func_call_list) if func_call_list else 0.0
+        )
+
+        handler_obj = get_extended_telemetry_handler()
+        for index, func_call in enumerate(func_call_list):
+            tool_name = _extract_tool_name(func_call)
+            arguments = _parse_python_call_arguments(func_call)
+            description = _lookup_tool_description(tool_name)
+            execution_result = (
+                execution_results[index]
+                if index < len(execution_results)
+                else None
+            )
+            # Shared by the invocation and the span attributes below.
+            call_id = _synth_tool_call_id(test_entry_id, model_name, index)
+            normalised_arguments = _normalise_tool_arguments(arguments)
+
+            tool_inv = ExecuteToolInvocation(
+                tool_name=tool_name,
+                tool_call_id=call_id,
+                tool_type="function",
+                tool_description=description,
+                tool_call_arguments=normalised_arguments,
+                tool_call_result=execution_result,
+            )
+
+            try:
+                with handler_obj.execute_tool(tool_inv) as inv:
+                    span = inv.span
+                    if span is not None and span.is_recording():
+                        span.set_attribute(GEN_AI_FRAMEWORK, FRAMEWORK_NAME)
+                        span.set_attribute(BFCL_TOOL_INDEX, index)
+                        span.set_attribute(
+                            BFCL_TOOL_DURATION_IS_ESTIMATED, True
+                        )
+                        if test_entry_id is not None:
+                            span.set_attribute(
+                                BFCL_TEST_ENTRY_ID, str(test_entry_id)
+                            )
+                        _set_tool_call_span_attrs(
+                            span,
+                            arguments=normalised_arguments,
+                            result=execution_result,
+                            description=description,
+                            tool_name=tool_name,
+                            tool_call_id=call_id,
+                            tool_type="function",
+                        )
+                        if isinstance(execution_result, str) and execution_result.startswith(
+                            "Error during execution:"
+                        ):
+                            try:
+                                from opentelemetry.trace import (
+                                    Status,
+                                    StatusCode,
+                                )
+
+                                span.set_status(
+                                    Status(
+                                        StatusCode.ERROR,
+                                        execution_result[:200],
+                                    )
+                                )
+                            except Exception:  # noqa: BLE001
+                                logger.debug(
+                                    "bfclv4 TOOL: failed to set ERROR status",
+                                    exc_info=True,
+                                )
+                        # ``per_call_seconds`` is not applied: the span starts
+                        # and ends "now"; the flag above tells consumers so.
+                        _ = per_call_seconds  # unused but documented
+                # Bump a per-AGENT counter for downstream debugging.
+                next_tool_index()
+            except Exception:  # noqa: BLE001
+                logger.debug(
+                    "bfclv4 TOOL: span emission failed for %s",
+                    tool_name,
+                    exc_info=True,
+                )
+
+        return result
+
+
+def _extract_tool_name(func_call: Any) -> str:
+    """Return the bare callable name of an ``obj.method(...)`` call string."""
+    if isinstance(func_call, str) and "(" in func_call:
+        callee = func_call.partition("(")[0].rpartition(".")[2]
+        if callee:
+            return callee
+    return "unknown"
+
+
+def _extract_tool_arguments(func_call: Any) -> Optional[str]:
+    """Return the text between the first ``(`` and the closing ``)``."""
+    if not isinstance(func_call, str):
+        return None
+    _, paren, tail = func_call.partition("(")
+    well_formed = bool(paren) and func_call.endswith(")")
+    return (tail[:-1] or None) if well_formed else func_call
+
+
+def _synth_tool_call_id(
+    test_entry_id: Optional[Any], model_name: Optional[Any], index: int
+) -> str:
+    """Build a synthetic ``<entry id>-<model>-<index>`` tool-call id."""
+
+    # ``None`` pieces are replaced by fixed placeholder words.
+    entry = "no_id" if test_entry_id is None else str(test_entry_id)
+    model = "no_model" if model_name is None else str(model_name)
+    return "-".join((entry, model, str(index)))
\ No newline at end of file
diff --git a/instrumentation-loongsuite/loongsuite-instrumentation-bfclv4/src/opentelemetry/instrumentation/bfclv4/package.py b/instrumentation-loongsuite/loongsuite-instrumentation-bfclv4/src/opentelemetry/instrumentation/bfclv4/package.py
new file mode 100644
index 000000000..66e9fa6e1
--- /dev/null
+++ b/instrumentation-loongsuite/loongsuite-instrumentation-bfclv4/src/opentelemetry/instrumentation/bfclv4/package.py
@@ -0,0 +1,17 @@
+# Copyright The OpenTelemetry Authors
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+_instruments = ("bfcl-eval >= 4.0.0",)
+
+_supports_metrics = False
diff --git a/instrumentation-loongsuite/loongsuite-instrumentation-bfclv4/src/opentelemetry/instrumentation/bfclv4/utils.py b/instrumentation-loongsuite/loongsuite-instrumentation-bfclv4/src/opentelemetry/instrumentation/bfclv4/utils.py
new file mode 100644
index 000000000..c63bbc62b
--- /dev/null
+++ b/instrumentation-loongsuite/loongsuite-instrumentation-bfclv4/src/opentelemetry/instrumentation/bfclv4/utils.py
@@ -0,0 +1,144 @@
+# Copyright The OpenTelemetry Authors
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Helpers for the BFCL v4 instrumentation.
+
+The :class:`GenAIHookHelper` mirrors the helper used by the LoongSuite CrewAI
+instrumentation: it gates ``gen_ai.input.messages`` /
+``gen_ai.output.messages`` / ``gen_ai.system_instructions`` on the standard
+LoongSuite content-capture environment knobs so that prompt content is not
+exported by default.
+"""
+
+from __future__ import annotations
+
+import dataclasses
+import logging
+from typing import Any, Dict, List, Optional
+
+from opentelemetry.semconv._incubating.attributes import gen_ai_attributes
+from opentelemetry.trace import Span
+from opentelemetry.util.genai.types import (
+    ContentCapturingMode,
+    InputMessage,
+    MessagePart,
+    OutputMessage,
+    Text,
+)
+from opentelemetry.util.genai.utils import (
+    gen_ai_json_dumps,
+    get_content_capturing_mode,
+    is_experimental_mode,
+)
+
+logger = logging.getLogger(__name__)
+
+
+class GenAIHookHelper:
+    """Conditionally write prompt / completion content to the span."""
+
+    def __init__(self, capture_content: bool = True) -> None:
+        self.capture_content = capture_content
+
+    def on_completion(
+        self,
+        span: Span,
+        inputs: Optional[List[InputMessage]] = None,
+        outputs: Optional[List[OutputMessage]] = None,
+        system_instructions: Optional[List[MessagePart]] = None,
+        attributes: Optional[Dict[str, Any]] = None,
+    ) -> None:
+        """Export message content (when allowed) and extra attributes.
+
+        Nothing is written unless ``span`` is recording. Message content
+        additionally requires ``capture_content``, experimental mode and a
+        content-capturing mode that includes spans. ``attributes`` entries
+        whose value is ``None`` are skipped.
+        """
+        if not span.is_recording():
+            return
+
+        def put_messages(name: str, items: List[Any]) -> None:
+            payload = [dataclasses.asdict(item) for item in items]
+            span.set_attribute(name, gen_ai_json_dumps(payload))
+
+        if self.capture_content and is_experimental_mode():
+            mode = get_content_capturing_mode()
+            if mode in (
+                ContentCapturingMode.SPAN_ONLY,
+                ContentCapturingMode.SPAN_AND_EVENT,
+            ):
+                if inputs:
+                    put_messages(
+                        gen_ai_attributes.GEN_AI_INPUT_MESSAGES, inputs
+                    )
+                if outputs:
+                    put_messages(
+                        gen_ai_attributes.GEN_AI_OUTPUT_MESSAGES, outputs
+                    )
+                if system_instructions:
+                    put_messages(
+                        gen_ai_attributes.GEN_AI_SYSTEM_INSTRUCTIONS,
+                        system_instructions,
+                    )
+
+        for key, value in (attributes or {}).items():
+            if value is None:
+                continue
+            try:
+                span.set_attribute(key, value)
+            except Exception:  # noqa: BLE001
+                logger.debug(
+                    "bfclv4: failed to set attribute %s", key, exc_info=True
+                )
+
+
+def to_text_input(role: str, content: Any) -> List[InputMessage]:
+    if content in (None, "", [], {}):  # empty content produces no message
+        return []
+    text = content if isinstance(content, str) else _to_safe_str(content)
+    return [InputMessage(role=role, parts=[Text(content=text)])]  # one text part
+
+
+def to_text_output(
+    role: str, content: Any, finish_reason: str = "stop"
+) -> List[OutputMessage]:
+    """Wrap non-empty ``content`` as one text ``OutputMessage``, else ``[]``."""
+    if content in (None, "", [], {}):
+        return []
+    body = content if isinstance(content, str) else _to_safe_str(content)
+    parts = [Text(content=body)]
+    return [
+        OutputMessage(role=role, parts=parts, finish_reason=finish_reason)
+    ]
+
+
+def _to_safe_str(value: Any) -> str:
+    """Serialise ``value`` for a span attribute without ever raising.
+
+    GenAI JSON encoding is tried first, then ``str()``, then a fixed marker.
+    """
+    try:
+        return gen_ai_json_dumps(value)
+    except Exception:  # noqa: BLE001
+        pass
+    try:
+        return str(value)
+    except Exception:  # noqa: BLE001
+        return "<unserialisable>"
+
+
+def truncate_text(value: str, limit: int = 4096) -> str:
+    """Cap ``value`` at ``limit`` characters, noting how many were dropped."""
+    cut = len(value) - limit
+    return value if cut <= 0 else value[:limit] + f"...<truncated {cut} chars>"
diff --git a/instrumentation-loongsuite/loongsuite-instrumentation-bfclv4/src/opentelemetry/instrumentation/bfclv4/version.py b/instrumentation-loongsuite/loongsuite-instrumentation-bfclv4/src/opentelemetry/instrumentation/bfclv4/version.py
new file mode 100644
index 000000000..3263662eb
--- /dev/null
+++ b/instrumentation-loongsuite/loongsuite-instrumentation-bfclv4/src/opentelemetry/instrumentation/bfclv4/version.py
@@ -0,0 +1,15 @@
+# Copyright The OpenTelemetry Authors
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+__version__ = "0.1.3.dev0"
diff --git a/instrumentation-loongsuite/loongsuite-instrumentation-bfclv4/tests/__init__.py b/instrumentation-loongsuite/loongsuite-instrumentation-bfclv4/tests/__init__.py
new file mode 100644
index 000000000..e69de29bb
diff --git a/instrumentation-loongsuite/loongsuite-instrumentation-bfclv4/tests/test_instrumentor.py b/instrumentation-loongsuite/loongsuite-instrumentation-bfclv4/tests/test_instrumentor.py
new file mode 100644
index 000000000..41446ee3b
--- /dev/null
+++ b/instrumentation-loongsuite/loongsuite-instrumentation-bfclv4/tests/test_instrumentor.py
@@ -0,0 +1,52 @@
+# Copyright The OpenTelemetry Authors
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Smoke tests for ``BFCLv4Instrumentor``.
+
+These tests do not require ``bfcl-eval`` to be installed; they only verify
+that importing the package and calling ``instrument()`` / ``uninstrument()``
+works (and degrades gracefully when ``bfcl-eval`` is missing).
+"""
+
+import importlib
+
+import pytest
+
+
+def test_import_instrumentor_package():
+    pkg = importlib.import_module("opentelemetry.instrumentation.bfclv4")
+    assert hasattr(pkg, "BFCLv4Instrumentor")  # public entry point is exported
+
+
+def test_instrumentation_dependencies_listed():
+    from opentelemetry.instrumentation.bfclv4 import BFCLv4Instrumentor
+    from opentelemetry.instrumentation.bfclv4.package import _instruments
+
+    declared = tuple(BFCLv4Instrumentor().instrumentation_dependencies())
+    assert declared == _instruments  # must mirror the package-level tuple
+
+
+def test_instrument_uninstrument_no_bfcl_no_raise():
+    """Round-trip ``instrument()`` / ``uninstrument()`` without raising.
+
+    With ``bfcl-eval`` absent each wrap attempt is expected to log and move
+    on, so neither call may propagate an import failure to the caller.
+    """
+    # Skip when the extended GenAI handler module is not importable.
+    pytest.importorskip("opentelemetry.util.genai.extended_handler")
+    from opentelemetry.instrumentation.bfclv4 import BFCLv4Instrumentor
+
+    instrumentor = BFCLv4Instrumentor()
+    instrumentor.instrument()
+    instrumentor.uninstrument()
diff --git a/instrumentation-loongsuite/loongsuite-instrumentation-bfclv4/tests/test_internals.py b/instrumentation-loongsuite/loongsuite-instrumentation-bfclv4/tests/test_internals.py
new file mode 100644
index 000000000..fb760fd5e
--- /dev/null
+++ b/instrumentation-loongsuite/loongsuite-instrumentation-bfclv4/tests/test_internals.py
@@ -0,0 +1,222 @@
+# Copyright The OpenTelemetry Authors
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Unit tests for the framework-agnostic helpers."""
+
+import contextvars
+
+import pytest
+
+
+def test_state_lifecycle():
+    from opentelemetry.instrumentation.bfclv4.internal.state import (
+        bump_round,
+        bump_turn,
+        get_state,
+        init_state,
+        next_tool_index,
+        reset_state,
+    )
+
+    token = init_state()  # token is handed back to reset_state() in ``finally``
+    try:
+        state = get_state()
+        assert state == {"turn_idx": 0, "fc_round": 0, "tool_index": 0}
+        # Each bump returns the post-increment value of its counter.
+        assert bump_round() == 1
+        assert bump_round() == 2
+        assert bump_turn() == 1
+        # bump_turn resets fc_round
+        state = get_state()
+        assert state["turn_idx"] == 1
+        assert state["fc_round"] == 0
+        assert next_tool_index() == 0  # returns the index *before* advancing
+        assert next_tool_index() == 1
+    finally:
+        reset_state(token)
+
+    # After reset the state should be gone (None default).
+    assert get_state() is None
+
+
+def test_context_propagating_executor_carries_contextvars():
+    from opentelemetry.instrumentation.bfclv4.internal.threading_propagation import (
+        ContextPropagatingExecutor,
+    )
+
+    marker: contextvars.ContextVar[str] = contextvars.ContextVar(
+        "bfclv4_test_cv", default="default"
+    )
+    marker.set("from_main_thread")
+
+    def _peek():  # executed by the pool, not by the submitting thread
+        return marker.get()
+
+    with ContextPropagatingExecutor(max_workers=2) as executor:
+        seen = executor.submit(_peek).result()
+    assert seen == "from_main_thread"
+
+
+def test_extract_tool_name_and_arguments():
+    from opentelemetry.instrumentation.bfclv4.internal.wrappers import (
+        _extract_tool_arguments,
+        _extract_tool_name,
+        _parse_python_call_arguments,
+    )
+
+    # Call text -> expected tool name; text that is not a call maps to "unknown".
+    name_cases = {"calc.add(1, 2)": "add", "list_files()": "list_files"}
+    name_cases["not a call"] = "unknown"
+    for call_text, expected_name in name_cases.items():
+        assert _extract_tool_name(call_text) == expected_name
+    assert _extract_tool_arguments("foo(a=1, b=2)") == "a=1, b=2"
+    assert _extract_tool_arguments("foo()") is None
+    parsed = _parse_python_call_arguments("foo(a=1, b='x')")
+    assert parsed == {"a": 1, "b": "x"}
+
+
+def test_infer_finish_reason_heuristic():
+    from opentelemetry.instrumentation.bfclv4.internal.wrappers import (
+        _infer_finish_reason,
+    )
+
+    cases = [([], "empty_response"), ([[]], "empty_response")]
+    cases += [([{"name": "x"}], "tool_calls"), ("plain string", "stop")]
+    cases += [(None, "unknown")]
+    for raw_result, expected_reason in cases:
+        assert _infer_finish_reason(raw_result) == expected_reason
+
+
+def test_test_entry_to_messages_extracts_genai_content():
+    """System text is split out; the remaining turns become inputs."""
+    from opentelemetry.instrumentation.bfclv4.internal.wrappers import (
+        _test_entry_to_messages,
+    )
+
+    opening_turn = [
+        {"role": "system", "content": "Answer concisely."},
+        {"role": "user", "content": "What is the weather in Paris?"},
+    ]
+    entry = {
+        "id": "simple_001",
+        "system_prompt": "Use the provided tools.",
+        "question": [
+            opening_turn,
+            [{"role": "assistant", "content": "I will check."}],
+        ],
+    }
+
+    messages, instructions = _test_entry_to_messages(entry)
+
+    assert [m.role for m in messages] == ["user", "assistant"]
+    first_texts = [m.parts[0].content for m in messages]
+    assert first_texts == ["What is the weather in Paris?", "I will check."]
+
+    # The entry-level prompt precedes the in-conversation system message.
+    prompts = [part.content for part in instructions]
+    assert prompts == ["Use the provided tools.", "Answer concisely."]
+
+
+def test_test_entry_to_tool_definitions_extracts_bfcl_functions():
+    """All three spec shapes below must surface as tool definitions."""
+    from opentelemetry.instrumentation.bfclv4.internal.wrappers import (
+        _test_entry_to_tool_definitions,
+        _tool_description_map,
+    )
+
+    # Flat spec: name/description/parameters at the top level.
+    weather = {
+        "name": "get_weather",
+        "description": "Get weather information.",
+        "parameters": {
+            "type": "object",
+            "properties": {"location": {"type": "string"}},
+            "required": ["location"],
+        },
+    }
+
+    # Spec nested under a "function" key.
+    flight = {
+        "type": "function",
+        "function": {
+            "name": "book_flight",
+            "description": "Book a flight.",
+            "parameters": {"type": "object"},
+        },
+    }
+
+    # Spec listed under "missed_function" instead of "function".
+    cancel = {
+        "name": "cancel_booking",
+        "description": "Cancel a booking.",
+        "parameters": {"type": "object"},
+    }
+
+    entry = {
+        "id": "simple_001",
+        "function": [weather, flight],
+        "missed_function": {"1": [cancel]},
+    }
+
+    tools = _test_entry_to_tool_definitions(entry)
+
+    tool_names = [tool.name for tool in tools]
+    assert tool_names == ["get_weather", "book_flight", "cancel_booking"]
+    first = tools[0]
+    assert first.type == "function"
+    assert first.parameters["required"] == ["location"]
+    descriptions = _tool_description_map(entry)
+    assert descriptions["get_weather"] == "Get weather information."
+
+
+def test_result_to_output_messages_extracts_last_inference_log_output():
+    """Only the last logged inference output becomes the output message."""
+    from opentelemetry.instrumentation.bfclv4.internal.wrappers import (
+        _result_to_output_messages,
+    )
+
+    # Two recorded steps; the assertions below expect only the later one
+    # ("final") to be reported.
+    inference_log = {
+        "step_0": {"inference_output": {"content": "intermediate"}},
+        "step_1": {"inference_output": {"content": "final"}},
+    }
+    messages = _result_to_output_messages({"inference_log": inference_log})
+
+    assert len(messages) == 1
+    message = messages[0]
+    assert message.role == "assistant"
+    # The raw inference output is expected JSON-encoded in the first part.
+    assert message.parts[0].content == '{"content": "final"}'
+    assert message.finish_reason == "stop"
+
+
+def test_provider_mapping_without_bfcl(monkeypatch):
+    # Skip first: importing the package before this check could raise instead.
+    pytest.importorskip(
+        "opentelemetry.util.genai.extended_types",
+    )
+    from opentelemetry.instrumentation.bfclv4.internal.provider import (
+        infer_provider,
+    )
+
+    class _Dummy:
+        model_style = None
+
+    name, extras = infer_provider(_Dummy())
+    # If bfcl-eval is not installed, ``ModelStyle`` import fails and we get
+    # ``unknown``; otherwise we still get ``unknown`` because ``model_style``
+    # is None.
+    assert name == "unknown"
+    assert extras == {}
diff --git a/instrumentation-loongsuite/loongsuite-instrumentation-claw-eval/pyproject.toml b/instrumentation-loongsuite/loongsuite-instrumentation-claw-eval/pyproject.toml
new file mode 100644
index 000000000..c1124eaa8
--- /dev/null
+++ b/instrumentation-loongsuite/loongsuite-instrumentation-claw-eval/pyproject.toml
@@ -0,0 +1,54 @@
+[build-system]
+requires = ["hatchling"]
+build-backend = "hatchling.build"
+
+[project]
+name = "loongsuite-instrumentation-claw-eval"
+dynamic = ["version"]
+description = "LoongSuite claw-eval instrumentation"
+license = "Apache-2.0"
+requires-python = ">=3.10,<4"
+authors = [
+  { name = "OpenTelemetry Authors", email = "cncf-opentelemetry-contributors@lists.cncf.io" },
+]
+classifiers = [
+  "Development Status :: 4 - Beta",
+  "Intended Audience :: Developers",
+  "License :: OSI Approved :: Apache Software License",
+  "Programming Language :: Python",
+  "Programming Language :: Python :: 3",
+  "Programming Language :: Python :: 3.10",
+  "Programming Language :: Python :: 3.11",
+  "Programming Language :: Python :: 3.12",
+  "Programming Language :: Python :: 3.13",
+]
+dependencies = [
+  "opentelemetry-api >= 1.37.0",
+  "opentelemetry-instrumentation >= 0.58b0",
+  "opentelemetry-semantic-conventions >= 0.58b0",
+  "wrapt >= 1.0.0, < 2.0.0",
+]
+
+[project.optional-dependencies]
+instruments = [
+  "claw-eval >= 0.1.0"
+]
+
+[project.entry-points.opentelemetry_instrumentor]
+claw_eval = "opentelemetry.instrumentation.claw_eval:ClawEvalInstrumentor"
+
+[project.urls]
+Homepage = "https://github.com/alibaba/loongsuite-python-agent/tree/main/instrumentation-loongsuite/loongsuite-instrumentation-claw-eval"
+Repository = "https://github.com/alibaba/loongsuite-python-agent"
+
+[tool.hatch.version]
+path = "src/opentelemetry/instrumentation/claw_eval/version.py"
+
+[tool.hatch.build.targets.sdist]
+include = [
+  "/src",
+  "/tests",
+]
+
+[tool.hatch.build.targets.wheel]
+packages = ["src/opentelemetry"]
diff --git a/instrumentation-loongsuite/loongsuite-instrumentation-claw-eval/src/opentelemetry/instrumentation/claw_eval/__init__.py b/instrumentation-loongsuite/loongsuite-instrumentation-claw-eval/src/opentelemetry/instrumentation/claw_eval/__init__.py
new file mode 100644
index 000000000..6c26aea38
--- /dev/null
+++ b/instrumentation-loongsuite/loongsuite-instrumentation-claw-eval/src/opentelemetry/instrumentation/claw_eval/__init__.py
@@ -0,0 +1,283 @@
+"""
+OpenTelemetry claw-eval Instrumentation
+=======================================
+
+Automatic instrumentation for the `claw-eval
+<https://github.com/claw-eval/claw-eval>`_ evaluation framework.
+
+Uses **wrapt** monkey-patching to wrap the CLI entry points, the agent loop,
+tool dispatchers and compaction (judge calls are wrapped only to suppress
+their nested spans), producing a hierarchical trace:
+
+    ENTRY → AGENT → STEP → TOOL / CHAIN
+
+Usage
+-----
+
+.. code:: python
+
+    from opentelemetry.instrumentation.claw_eval import ClawEvalInstrumentor
+
+    ClawEvalInstrumentor().instrument()
+
+    # Then run claw-eval as normal (CLI or programmatic)
+
+API
+---
+"""
+
+from __future__ import annotations
+
+import importlib
+import logging
+from typing import Any, Collection
+
+from opentelemetry import trace as trace_api
+from opentelemetry.instrumentation.instrumentor import BaseInstrumentor
+from opentelemetry.instrumentation.claw_eval.config import (
+    OTEL_INSTRUMENTATION_CLAW_EVAL_ENABLED,
+)
+from opentelemetry.instrumentation.claw_eval.package import _instruments
+from opentelemetry.instrumentation.claw_eval.version import __version__
+from wrapt import wrap_function_wrapper
+
+logger = logging.getLogger(__name__)
+
+__all__ = ["ClawEvalInstrumentor"]
+
+
+def _unwrap_func(module_path: str, func_name: str) -> None:
+    """Best-effort restore of a module-level function wrapped by *wrapt*."""
+    try:
+        mod = importlib.import_module(module_path)
+        fn = getattr(mod, func_name, None)
+        if fn is not None and hasattr(fn, "__wrapped__"):
+            setattr(mod, func_name, fn.__wrapped__)
+    except Exception as exc:  # uninstrument must never raise; record why
+        logger.debug("Could not unwrap %s.%s: %s", module_path, func_name, exc)
+
+
+def _unwrap_method(
+    module_path: str, class_name: str, method_name: str
+) -> None:
+    """Best-effort restore of a class method wrapped by *wrapt*."""
+    try:
+        mod = importlib.import_module(module_path)
+        cls = getattr(mod, class_name, None)
+        if cls is None:
+            return
+        meth = getattr(cls, method_name, None)
+        if meth is not None and hasattr(meth, "__wrapped__"):
+            setattr(cls, method_name, meth.__wrapped__)
+    except Exception as exc:  # uninstrument must never raise; record why
+        logger.debug("Could not unwrap %s.%s: %s", class_name, method_name, exc)
+
+
+class ClawEvalInstrumentor(BaseInstrumentor):
+    """Instrumentation that adds OpenTelemetry traces to claw-eval.
+
+    Wraps the following symbols via *wrapt*:
+
+    * **ENTRY** — ``cli.cmd_run``, ``cli.cmd_batch``, ``cli._run_single_task``
+    * **AGENT** — ``runner.loop.run_task``
+    * **STEP** — ``OpenAICompatProvider.chat`` rotates STEP spans
+    * **CHAIN** — ``compact.do_auto_compact``
+    * **TOOL**  — ``ToolDispatcher.dispatch``, ``SandboxToolDispatcher.dispatch``
+    * **Judge (suppress only)** — ``LLMJudge.evaluate``, ``evaluate_actions``,
+      ``evaluate_visual``: nested LLM SDK / HTTP spans are suppressed and no
+      judge LLM span is emitted, keeping the trace tail clean.
+    * **Per-task grader (suppress only)** — ``registry.get_grader`` and
+      ``base.load_peer_grader`` are wrapped so any grader class loaded via
+      them has its ``_llm_score_classifications`` (and similar evaluation
+      helpers) auto-suppressed. This catches the per-task grader code paths
+      that talk to ``judge.client.chat.completions.create`` directly,
+      bypassing ``LLMJudge.evaluate*``.
+    """
+
+    def instrumentation_dependencies(self) -> Collection[str]:
+        return _instruments
+
+    def _instrument(self, **kwargs: Any) -> None:
+        if not OTEL_INSTRUMENTATION_CLAW_EVAL_ENABLED:  # flag computed when config is imported
+            logger.info("claw-eval instrumentation disabled via env var")
+            return
+
+        tracer_provider = kwargs.get("tracer_provider")
+        tracer = trace_api.get_tracer(
+            __name__,
+            __version__,
+            tracer_provider=tracer_provider,
+        )
+
+        # Deferred import: only reached when instrumentation is enabled.
+        from opentelemetry.instrumentation.claw_eval.internal.wrappers import (
+            DoAutoCompactWrapper,
+            EntryWrapper,
+            GetGraderWrapper,
+            JudgeWrapper,
+            LoadPeerGraderWrapper,
+            ProviderChatWrapper,
+            RunSingleTaskWrapper,
+            RunTaskWrapper,
+            ToolDispatchWrapper,
+        )
+
+        # --- CLI entry points (ENTRY) ---
+        for func_name, cmd in [("cmd_run", "run"), ("cmd_batch", "batch")]:
+            try:
+                wrap_function_wrapper(
+                    "claw_eval.cli",
+                    func_name,
+                    EntryWrapper(tracer, cmd),
+                )
+            except Exception as exc:
+                logger.warning(
+                    "Could not wrap claw_eval.cli.%s: %s", func_name, exc
+                )
+
+        try:
+            wrap_function_wrapper(
+                "claw_eval.cli",
+                "_run_single_task",
+                RunSingleTaskWrapper(tracer),
+            )
+        except Exception as exc:
+            logger.warning("Could not wrap _run_single_task: %s", exc)
+
+        # --- Agent loop (AGENT) ---
+        try:
+            wrap_function_wrapper(
+                "claw_eval.runner.loop",
+                "run_task",
+                RunTaskWrapper(tracer),
+            )
+        except Exception as exc:
+            logger.warning("Could not wrap run_task: %s", exc)
+
+        # --- Provider chat (STEP rotation) ---
+        try:
+            wrap_function_wrapper(
+                "claw_eval.runner.providers.openai_compat",
+                "OpenAICompatProvider.chat",
+                ProviderChatWrapper(tracer),
+            )
+        except Exception as exc:
+            logger.warning(
+                "Could not wrap OpenAICompatProvider.chat: %s", exc
+            )
+
+        # --- Context compaction (CHAIN) ---
+        try:
+            wrap_function_wrapper(
+                "claw_eval.runner.compact",
+                "do_auto_compact",
+                DoAutoCompactWrapper(tracer),
+            )
+        except Exception as exc:
+            logger.warning("Could not wrap do_auto_compact: %s", exc)
+
+        # --- Tool dispatchers (TOOL) ---
+        try:
+            wrap_function_wrapper(
+                "claw_eval.runner.dispatcher",
+                "ToolDispatcher.dispatch",
+                ToolDispatchWrapper(tracer),
+            )
+        except Exception as exc:
+            logger.warning("Could not wrap ToolDispatcher.dispatch: %s", exc)
+
+        try:
+            wrap_function_wrapper(
+                "claw_eval.runner.sandbox_dispatcher",
+                "SandboxToolDispatcher.dispatch",
+                ToolDispatchWrapper(tracer),  # same wrapper class, separate instance
+            )
+        except Exception as exc:
+            logger.debug(  # NOTE(review): debug, not warning — presumably optional; confirm
+                "Could not wrap SandboxToolDispatcher.dispatch: %s", exc
+            )
+
+        # --- LLM Judge (suppress nested SDK / HTTP spans, no judge span) ---
+        for method in ("evaluate", "evaluate_actions", "evaluate_visual"):
+            try:
+                wrap_function_wrapper(
+                    "claw_eval.graders.llm_judge",
+                    f"LLMJudge.{method}",
+                    JudgeWrapper(tracer, method),
+                )
+            except Exception as exc:
+                logger.warning(
+                    "Could not wrap LLMJudge.%s: %s", method, exc
+                )
+
+        # --- Per-task grader evaluation helpers ---
+        # Per-task ``tasks/T*/grader.py`` defines helpers like
+        # ``_llm_score_classifications`` that bypass ``LLMJudge.evaluate*``
+        # and call ``judge.client.chat.completions.create`` directly.
+        # Hooking the two grader loaders lets us walk each loaded grader's
+        # MRO and install span-suppression on those helpers automatically.
+        try:
+            wrap_function_wrapper(
+                "claw_eval.graders.registry",
+                "get_grader",
+                GetGraderWrapper(tracer),
+            )
+        except Exception as exc:
+            logger.warning("Could not wrap get_grader: %s", exc)
+
+        try:
+            wrap_function_wrapper(
+                "claw_eval.graders.base",
+                "load_peer_grader",
+                LoadPeerGraderWrapper(tracer),
+            )
+        except Exception as exc:
+            logger.warning("Could not wrap load_peer_grader: %s", exc)
+
+    def _uninstrument(self, **kwargs: Any) -> None:
+        # CLI entry points
+        _unwrap_func("claw_eval.cli", "cmd_run")
+        _unwrap_func("claw_eval.cli", "cmd_batch")
+        _unwrap_func("claw_eval.cli", "_run_single_task")
+
+        # Agent loop
+        _unwrap_func("claw_eval.runner.loop", "run_task")
+
+        # Provider chat
+        _unwrap_method(
+            "claw_eval.runner.providers.openai_compat",
+            "OpenAICompatProvider",
+            "chat",
+        )
+
+        # Context compaction
+        _unwrap_func("claw_eval.runner.compact", "do_auto_compact")
+
+        # Tool dispatchers
+        _unwrap_method(
+            "claw_eval.runner.dispatcher",
+            "ToolDispatcher",
+            "dispatch",
+        )
+        _unwrap_method(
+            "claw_eval.runner.sandbox_dispatcher",
+            "SandboxToolDispatcher",
+            "dispatch",
+        )
+
+        # LLM Judge
+        for method in ("evaluate", "evaluate_actions", "evaluate_visual"):
+            _unwrap_method(
+                "claw_eval.graders.llm_judge",
+                "LLMJudge",
+                method,
+            )
+
+        # Per-task grader loaders. Note: dynamically wrapped per-task
+        # ``_llm_score_classifications`` methods on already-loaded grader
+        # classes are intentionally not unwrapped here — those modules are
+        # loaded under synthetic names like ``task_grader_<id>`` and there
+        # is no stable handle to walk. Unwrapping the loaders is enough to
+        # stop *new* graders from getting wrapped after uninstrument.
+        _unwrap_func("claw_eval.graders.registry", "get_grader")
+        _unwrap_func("claw_eval.graders.base", "load_peer_grader")
diff --git a/instrumentation-loongsuite/loongsuite-instrumentation-claw-eval/src/opentelemetry/instrumentation/claw_eval/config.py b/instrumentation-loongsuite/loongsuite-instrumentation-claw-eval/src/opentelemetry/instrumentation/claw_eval/config.py
new file mode 100644
index 000000000..abe5602bd
--- /dev/null
+++ b/instrumentation-loongsuite/loongsuite-instrumentation-claw-eval/src/opentelemetry/instrumentation/claw_eval/config.py
@@ -0,0 +1,25 @@
+"""Configuration via environment variables."""
+
+from __future__ import annotations
+
+import os
+
+
+def _bool_env(name: str, default: bool) -> bool:
+    """Read env var ``name`` as a flag; an unset variable yields ``default``."""
+    raw = os.environ.get(name)
+    truthy = {"true", "1", "yes", "on"}  # compared after strip() + lower()
+    return default if raw is None else raw.strip().lower() in truthy
+
+
+OTEL_INSTRUMENTATION_CLAW_EVAL_ENABLED = _bool_env(  # read once, at import
+    "OTEL_INSTRUMENTATION_CLAW_EVAL_ENABLED", True
+)
+
+OTEL_CLAW_EVAL_CAPTURE_CONTENT = _bool_env(  # off unless explicitly enabled
+    "OTEL_CLAW_EVAL_CAPTURE_CONTENT", False
+)
+
+OTEL_CLAW_EVAL_PROPAGATE_TO_WORKER = _bool_env(  # off unless explicitly enabled
+    "OTEL_CLAW_EVAL_PROPAGATE_TO_WORKER", False
+)
diff --git a/instrumentation-loongsuite/loongsuite-instrumentation-claw-eval/src/opentelemetry/instrumentation/claw_eval/internal/__init__.py b/instrumentation-loongsuite/loongsuite-instrumentation-claw-eval/src/opentelemetry/instrumentation/claw_eval/internal/__init__.py
new file mode 100644
index 000000000..117870f87
--- /dev/null
+++ b/instrumentation-loongsuite/loongsuite-instrumentation-claw-eval/src/opentelemetry/instrumentation/claw_eval/internal/__init__.py
@@ -0,0 +1 @@
+"""Internal helpers for claw-eval instrumentation."""
diff --git a/instrumentation-loongsuite/loongsuite-instrumentation-claw-eval/src/opentelemetry/instrumentation/claw_eval/internal/wrappers.py b/instrumentation-loongsuite/loongsuite-instrumentation-claw-eval/src/opentelemetry/instrumentation/claw_eval/internal/wrappers.py
new file mode 100644
index 000000000..fae491249
--- /dev/null
+++ b/instrumentation-loongsuite/loongsuite-instrumentation-claw-eval/src/opentelemetry/instrumentation/claw_eval/internal/wrappers.py
@@ -0,0 +1,1003 @@
+"""Wrapt wrappers for claw-eval OpenTelemetry instrumentation.
+
+Span hierarchy
+--------------
+ENTRY (cmd_run / cmd_batch / _run_single_task)
+└── AGENT (run_task)
+    ├── STEP (rotated per main-loop provider.chat call)
+    │   ├── TOOL (dispatcher.dispatch / sandbox_dispatcher.dispatch)
+    │   └── CHAIN (do_auto_compact)
+    └── (judge.evaluate* + per-task grader._llm_score_classifications:
+         nested LLM SDK / HTTP spans suppressed, no span emitted)
+"""
+
+from __future__ import annotations
+
+import json
+from contextvars import ContextVar
+from typing import Any
+
+from opentelemetry import context as otel_context
+from opentelemetry.context import _SUPPRESS_INSTRUMENTATION_KEY
+from opentelemetry.semconv._incubating.attributes import (
+    gen_ai_attributes as GenAI,
+)
+from opentelemetry.trace import (
+    SpanKind,
+    Status,
+    StatusCode,
+    Tracer,
+    set_span_in_context,
+)
+
+try:  # optional import: falls back to None when the ARMS extension is absent
+    from aliyun.sdk.extension.arms.semconv import _SUPPRESS_LLM_SDK_KEY
+except ImportError:
+    _SUPPRESS_LLM_SDK_KEY = None  # None marks the key as unavailable
+
+# ---------------------------------------------------------------------------
+# Constants
+# ---------------------------------------------------------------------------
+
+GEN_AI_SPAN_KIND = "gen_ai.span.kind"
+GEN_AI_FRAMEWORK = "gen_ai.framework"
+GEN_AI_TOOL_CALL_ARGUMENTS = "gen_ai.tool.call.arguments"
+GEN_AI_TOOL_CALL_RESULT = "gen_ai.tool.call.result"
+# ``GEN_AI_TOOL_DEFINITIONS`` was added to the upstream semconv after the
+# version vendored by some Aliyun ARMS releases, so we hardcode the spec
+# string instead of reading it from ``gen_ai_attributes``.
+GEN_AI_TOOL_DEFINITIONS = "gen_ai.tool.definitions"
+
+# ---------------------------------------------------------------------------
+# ContextVars for STEP lifecycle & compact-depth tracking
+# ---------------------------------------------------------------------------
+
+_compact_depth: ContextVar[int] = ContextVar(
+    "claw_eval_compact_depth", default=0
+)
+_in_agent_run: ContextVar[bool] = ContextVar(
+    "claw_eval_in_agent_run", default=False
+)
+_step_counter: ContextVar[int] = ContextVar(
+    "claw_eval_step_counter", default=0
+)
+_current_step_span: ContextVar[Any] = ContextVar(
+    "claw_eval_current_step_span", default=None
+)
+_current_step_token: ContextVar[Any] = ContextVar(
+    "claw_eval_current_step_token", default=None
+)
+_in_tool_dispatch: ContextVar[bool] = ContextVar(
+    "claw_eval_in_tool_dispatch", default=False
+)
+
+# Per-call capture state for the active AGENT span. ``RunTaskWrapper`` sets a
+# fresh dict on entry; the lightweight ``provider.chat`` shim installed below
+# pushes data into it. Using a ContextVar keeps concurrent ``run_task``
+# invocations isolated even when they share the same provider instance.
+_agent_capture: ContextVar["dict[str, Any] | None"] = ContextVar(
+    "claw_eval_agent_capture", default=None
+)
+
+# JSON-serialized tool-definition list captured from the ``tools=`` kwarg of
+# the first ``provider.chat`` call inside an AGENT run. Read by
+# ``ToolDispatchWrapper`` to populate ``gen_ai.tool.definitions`` on every
+# TOOL span. Stored as a pre-serialized string so each TOOL span pays only an
+# attribute-set cost, not a JSON-encode cost.
+_agent_tool_definitions: ContextVar[str] = ContextVar(
+    "claw_eval_agent_tool_definitions", default=""
+)
+
+# Per-CLI-invocation capture for the ENTRY span. ``EntryWrapper`` /
+# ``RunSingleTaskWrapper`` initialize a list on entry; each completing AGENT
+# span pushes its own capture dict onto it. The first task prompt and the
+# final agent response surface as ENTRY ``gen_ai.input.messages`` /
+# ``gen_ai.output.messages`` so the trace root carries useful IO.
+_entry_capture: ContextVar["list[dict[str, Any]] | None"] = ContextVar(
+    "claw_eval_entry_capture", default=None
+)
+
+# ---------------------------------------------------------------------------
+# Content helpers
+# ---------------------------------------------------------------------------
+
+
+def _safe_json(obj: Any) -> str:
+    """Encode ``obj`` as JSON text for a span attribute.
+
+    Never truncated (consumers need full payloads); falls back to ``str``.
+    """
+    try:
+        encoded = json.dumps(obj, default=str, ensure_ascii=False)
+    except Exception:
+        encoded = str(obj)
+    return encoded
+
+
+def _extract_tool_result_text(result) -> str:
+    """Collect the text of a ToolResultBlock for ``gen_ai.tool.call.result``.
+
+    Content blocks lacking a truthy ``text`` are skipped and the rest are
+    joined with newlines. A result with no content yields ``""``.
+
+    The text is deliberately left untruncated so downstream consumers see
+    the full payload returned to the agent.
+    """
+    blocks = getattr(result, "content", None)
+    if not blocks:
+        return ""
+    # Read each ``text`` exactly once, then drop the empty/missing ones.
+    texts = (getattr(block, "text", None) for block in blocks)
+    return "\n".join(piece for piece in texts if piece)
+
+
+def _extract_system_prompt(messages) -> str:
+    """Return the first text block of the first ``system`` message.
+
+    Yields ``""`` when there is no such message or it has no text block.
+    """
+    for message in messages or ():
+        if getattr(message, "role", None) == "system":
+            for part in getattr(message, "content", []) or []:
+                if getattr(part, "type", None) == "text":
+                    return getattr(part, "text", "") or ""
+            return ""
+    return ""
+
+
+# ---------------------------------------------------------------------------
+# Spec-compliant message serialization
+# ---------------------------------------------------------------------------
+#
+# These helpers convert claw-eval's internal ``Message``/``ContentBlock``
+# objects into the ARMS GenAI semantic-convention JSON shape documented in
+# ``arms_docs/trace/gen-ai.md`` and the message JSON schemas:
+#
+# * ``gen_ai.input.messages``  — array of ``ChatMessage`` ({role, parts})
+# * ``gen_ai.output.messages`` — array of ``OutputMessage``
+#                                ({role, parts, finish_reason})
+# * ``gen_ai.system_instructions`` — array of parts (TextPart, ...) — note
+#                                    that this is *not* wrapped in a message.
+#
+# Each ``part`` follows the schema:
+#   - TextPart:               {"type": "text", "content": ...}
+#   - ToolCallRequestPart:    {"type": "tool_call", "id", "name", "arguments"}
+#   - ToolCallResponsePart:   {"type": "tool_call_response", "id", "response"}
+
+
+def _block_to_part(block) -> dict[str, Any]:
+    """Map a claw-eval ContentBlock onto a spec-compliant message part.
+
+    ``text`` becomes a text part, ``tool_use`` a ``tool_call`` and
+    ``tool_result`` a ``tool_call_response`` whose response is the
+    newline-joined text of its inner blocks. Every other type is reduced
+    to a bare ``{"type": ...}`` tag (``"unknown"`` when the type is falsy).
+    """
+    kind = getattr(block, "type", "")
+    if kind == "text":
+        return {"type": "text", "content": getattr(block, "text", "") or ""}
+    if kind == "tool_use":
+        return {
+            "type": "tool_call",
+            "id": getattr(block, "id", "") or "",
+            "name": getattr(block, "name", "") or "",
+            "arguments": getattr(block, "input", None),
+        }
+    if kind == "tool_result":
+        nested = getattr(block, "content", []) or []
+        texts = (getattr(item, "text", None) for item in nested)
+        return {
+            "type": "tool_call_response",
+            "id": getattr(block, "tool_use_id", "") or "",
+            "response": "\n".join(text for text in texts if text),
+        }
+    if kind in {"image", "audio", "video"}:
+        return {"type": kind}  # media payloads are not copied onto the span
+    return {"type": kind or "unknown"}
+
+
+def _message_to_chat_message(msg) -> dict[str, Any]:
+    """Build the spec ``ChatMessage`` dict (role + parts) for a ``Message``."""
+    blocks = getattr(msg, "content", None) or []
+    return {
+        "role": getattr(msg, "role", "unknown"),
+        "parts": [_block_to_part(blk) for blk in blocks],
+    }
+
+
+def _infer_finish_reason(message) -> str:
+    """Derive a ``finish_reason`` from the blocks of an output message.
+
+    ``provider.chat`` results carry no upstream ``finish_reason``, so the
+    loop's own convention is mirrored: a message containing any
+    ``tool_use`` block means the model asked for a tool (``"tool_call"``);
+    otherwise generation ended normally (``"stop"``). This keeps the
+    emitted ``OutputMessage`` well-formed for downstream consumers.
+    """
+    blocks = getattr(message, "content", None) or []
+    if any(getattr(blk, "type", "") == "tool_use" for blk in blocks):
+        return "tool_call"
+    return "stop"
+
+
+def _serialize_input_messages(messages) -> str:
+    """Render input ``Message`` objects as the spec's JSON message array."""
+    chat_messages = list(map(_message_to_chat_message, messages or []))
+    try:
+        return json.dumps(chat_messages, ensure_ascii=False, default=str)
+    except Exception:  # best effort: fall back to the Python repr
+        return str(chat_messages)
+
+
+def _serialize_output_message(message) -> str:
+    """Render one response ``Message`` as a JSON ``OutputMessages`` array.
+
+    ``None`` yields ``""``; a missing or falsy role becomes ``"assistant"``.
+    """
+    if message is None:
+        return ""
+    blocks = getattr(message, "content", None) or []
+    output_message = {
+        "role": getattr(message, "role", "assistant") or "assistant",
+        "parts": [_block_to_part(block) for block in blocks],
+        "finish_reason": _infer_finish_reason(message),
+    }
+    try:
+        return json.dumps([output_message], ensure_ascii=False, default=str)
+    except Exception:  # best effort: fall back to the Python repr
+        return str([output_message])
+
+
+def _serialize_system_instructions(text: str) -> str:
+    """Encode a system prompt as a one-part ``SystemInstructions`` array."""
+    if not text:
+        return ""
+    instructions = [{"type": "text", "content": text}]
+    try:
+        return json.dumps(instructions, ensure_ascii=False, default=str)
+    except Exception:  # best effort: fall back to the Python repr
+        return str(instructions)
+
+
+def _build_user_text_messages(text: str) -> str:
+    """Encode a plain user prompt as a one-message ``InputMessages`` JSON.
+
+    An empty prompt yields ``""`` instead of an empty array.
+    """
+    if not text:
+        return ""
+    # Single user turn holding a single TextPart.
+    text_part = {"type": "text", "content": text}
+    user_messages = [{"role": "user", "parts": [text_part]}]
+    try:
+        return json.dumps(user_messages, ensure_ascii=False, default=str)
+    except Exception:  # best effort: fall back to the Python repr
+        return str(user_messages)
+
+
+def _serialize_tool_definitions(tools) -> str:
+    """Encode ``ToolSpec`` objects as the ``gen_ai.tool.definitions`` JSON.
+
+    Each emitted entry is a GenAI ``ToolDefinition``: ``{"type":
+    "function", "name": ..., "description": ..., "parameters": ...}``.
+    Entries without a truthy ``name`` are dropped rather than aborting the
+    whole list; ``""`` is returned when nothing usable remains.
+    """
+    if not tools:
+        return ""
+    def _definition(tool) -> dict[str, Any] | None:
+        tool_name = getattr(tool, "name", None)
+        if not tool_name:
+            return None
+        entry: dict[str, Any] = {"type": "function", "name": str(tool_name)}
+        description = getattr(tool, "description", None)
+        if description:
+            entry["description"] = str(description)
+        # claw-eval calls the schema ``input_schema`` while OpenAI / the OTel
+        # spec say ``parameters``; emit the latter for both spellings.
+        schema = getattr(tool, "input_schema", None)
+        if schema is None:
+            schema = getattr(tool, "parameters", None)
+        if schema is not None:
+            entry["parameters"] = schema
+        return entry
+    definitions = [d for d in map(_definition, tools) if d is not None]
+    if not definitions:
+        return ""
+    try:
+        return json.dumps(definitions, ensure_ascii=False, default=str)
+    except Exception:  # best effort: fall back to the Python repr
+        return str(definitions)
+
+
+# ---------------------------------------------------------------------------
+# STEP lifecycle helpers
+# ---------------------------------------------------------------------------
+
+
+def _end_current_step() -> None:
+    """Close the STEP span tracked in context, if any, and drop its token."""
+    step_span = _current_step_span.get(None)
+    ctx_token = _current_step_token.get(None)
+    if step_span is not None:
+        step_span.end()
+        _current_step_span.set(None)
+    if ctx_token is not None:
+        otel_context.detach(ctx_token)
+        _current_step_token.set(None)
+
+
+def _rotate_step(tracer: Tracer) -> None:
+    """Close the running STEP span, then open and activate the next one.
+
+    The per-run counter is incremented and recorded on the new span as
+    ``gen_ai.react.round``; the span and the token returned by attaching
+    its context are kept in ContextVars for ``_end_current_step``.
+    """
+    _end_current_step()
+    round_number = _step_counter.get(0) + 1
+    _step_counter.set(round_number)
+
+    new_step = tracer.start_span("react step", kind=SpanKind.INTERNAL)
+    invoke_agent = GenAI.GenAiOperationNameValues.INVOKE_AGENT.value
+    new_step.set_attribute(GEN_AI_SPAN_KIND, "STEP")
+    new_step.set_attribute(GenAI.GEN_AI_OPERATION_NAME, invoke_agent)
+    new_step.set_attribute(GEN_AI_FRAMEWORK, "claw-eval")
+    new_step.set_attribute(GenAI.GEN_AI_AGENT_NAME, "claw-eval")
+    new_step.set_attribute("gen_ai.react.round", round_number)
+    _current_step_span.set(new_step)
+    step_token = otel_context.attach(set_span_in_context(new_step))
+    _current_step_token.set(step_token)
+
+
+# ---------------------------------------------------------------------------
+# ENTRY wrappers (cli.cmd_run / cli.cmd_batch)
+# ---------------------------------------------------------------------------
+
+
+def _populate_entry_span(span, captures: list[dict] | None) -> None:
+    """Summarise the captured agent runs on the ENTRY span.
+
+    ENTRY is the trace root of a CLI invocation. Rather than merging
+    possibly conflicting data from several trials/tasks, it reports only
+    the first available input and the last available output. Attributes
+    are set only when a non-empty value is found.
+
+    Args:
+        span: ENTRY span that receives the attributes.
+        captures: Capture dicts of the agent runs, in append order;
+            ``None`` or an empty list leaves the span untouched.
+    """
+    if not captures:
+        return
+
+    def _first_truthy(caps, key: str):
+        # First truthy ``key`` value in iteration order, else "".
+        # ``cap.get`` tolerates captures that lack the key.
+        return next((cap[key] for cap in caps if cap.get(key)), "")
+
+    # Input: already-serialized input messages win; failing that, the first
+    # non-empty task prompt is wrapped as a single user message.
+    input_msgs = _first_truthy(captures, "input_messages_str")
+    if not input_msgs:
+        first_prompt = _first_truthy(captures, "task_prompt")
+        if first_prompt:
+            input_msgs = _build_user_text_messages(first_prompt)
+    if input_msgs:
+        span.set_attribute(GenAI.GEN_AI_INPUT_MESSAGES, input_msgs)
+
+    # Output: the most recent run that produced a response wins (most
+    # likely the final answer the user would care about).
+    output_msgs = _first_truthy(reversed(captures), "last_response_str")
+    if output_msgs:
+        span.set_attribute(GenAI.GEN_AI_OUTPUT_MESSAGES, output_msgs)
+
+
+class EntryWrapper:
+    """Creates an ENTRY span around CLI entry-point functions."""
+
+    __slots__ = ("_tracer", "_command")
+
+    def __init__(self, tracer: Tracer, command: str):
+        self._tracer = tracer
+        self._command = command
+
+    def __call__(self, wrapped, instance, args, kwargs):
+        captures: list[dict] = []
+        cap_tok = _entry_capture.set(captures)
+        try:
+            # ``start_as_current_span`` already records an escaping exception
+            # and sets ERROR status; doing it here too would duplicate it.
+            with self._tracer.start_as_current_span(
+                f"claw-eval {self._command}", kind=SpanKind.INTERNAL
+            ) as span:
+                span.set_attribute(GEN_AI_SPAN_KIND, "ENTRY")
+                span.set_attribute(GEN_AI_FRAMEWORK, "claw-eval")
+                span.set_attribute("claw_eval.command", self._command)
+                try:
+                    return wrapped(*args, **kwargs)
+                finally:
+                    _populate_entry_span(span, captures)
+        finally:
+            _entry_capture.reset(cap_tok)
+
+
+class RunSingleTaskWrapper:
+    """Creates an ENTRY span for batch worker ``_run_single_task``."""
+
+    __slots__ = ("_tracer",)
+
+    def __init__(self, tracer: Tracer):
+        self._tracer = tracer
+
+    def __call__(self, wrapped, instance, args, kwargs):
+        task_dir = args[0] if args else kwargs.get("task_dir", "")
+        captures: list[dict] = []
+        cap_tok = _entry_capture.set(captures)
+        try:
+            # ``start_as_current_span`` already records an escaping exception
+            # and sets ERROR status; doing it here too would duplicate it.
+            with self._tracer.start_as_current_span(
+                "claw-eval batch_worker", kind=SpanKind.INTERNAL
+            ) as span:
+                span.set_attribute(GEN_AI_SPAN_KIND, "ENTRY")
+                span.set_attribute(GEN_AI_FRAMEWORK, "claw-eval")
+                span.set_attribute("claw_eval.command", "batch_worker")
+                if task_dir:
+                    span.set_attribute("claw_eval.task_dir", str(task_dir))
+                try:
+                    result = wrapped(*args, **kwargs)
+                    # Tag the span with the task id when the result has one.
+                    if isinstance(result, dict):
+                        tid = result.get("task_id")
+                        if tid:
+                            span.set_attribute("claw_eval.task_id", str(tid))
+                    return result
+                finally:
+                    _populate_entry_span(span, captures)
+        finally:
+            _entry_capture.reset(cap_tok)
+
+
+# ---------------------------------------------------------------------------
+# AGENT wrapper (runner.loop.run_task)
+# ---------------------------------------------------------------------------
+
+
+class RunTaskWrapper:
+    """Creates an AGENT span and aggregates per-task GenAI attributes.
+
+    The wrapper installs a lightweight, idempotent shim on ``provider.chat``
+    that records the first-call input messages, system prompt, latest response
+    and accumulated token usage into a per-call ``_agent_capture`` dict. On
+    exit the data is written onto the AGENT span using the OTel GenAI
+    semantic conventions (``gen_ai.input.messages``,
+    ``gen_ai.output.messages``, ``gen_ai.system_instructions``,
+    ``gen_ai.usage.{input,output}_tokens``, ``gen_ai.request.model``).
+
+    ``ProviderChatWrapper`` is intentionally left untouched: the shim wraps
+    the *bound* method that already goes through ``ProviderChatWrapper``, so
+    STEP rotation continues to work exactly as before.
+    """
+
+    __slots__ = ("_tracer",)
+
+    def __init__(self, tracer: Tracer):
+        self._tracer = tracer
+
+    def __call__(self, wrapped, instance, args, kwargs):
+        task = args[0] if args else kwargs.get("task")
+        provider = args[1] if len(args) > 1 else kwargs.get("provider")
+        task_id = getattr(task, "task_id", "unknown") if task else "unknown"
+
+        # ``start_as_current_span`` already records an escaping exception and
+        # sets ERROR status, so no manual ``record_exception`` is needed.
+        with self._tracer.start_as_current_span(
+            "invoke_agent claw-eval", kind=SpanKind.INTERNAL
+        ) as span:
+            span.set_attribute(GEN_AI_SPAN_KIND, "AGENT")
+            span.set_attribute(
+                GenAI.GEN_AI_OPERATION_NAME,
+                GenAI.GenAiOperationNameValues.INVOKE_AGENT.value,
+            )
+            span.set_attribute(GEN_AI_FRAMEWORK, "claw-eval")
+            span.set_attribute(GenAI.GEN_AI_AGENT_NAME, "claw-eval")
+            span.set_attribute("claw_eval.task_id", str(task_id))
+
+            model_id = ""
+            if provider is not None:
+                model_id = str(getattr(provider, "model_id", "") or "")
+                if model_id:
+                    span.set_attribute(GenAI.GEN_AI_REQUEST_MODEL, model_id)
+
+            prompt = _get_task_prompt(task)
+            if prompt:
+                span.set_attribute(
+                    GenAI.GEN_AI_AGENT_DESCRIPTION,
+                    prompt,
+                )
+
+            capture: dict[str, Any] = {
+                "input_tokens": 0,
+                "output_tokens": 0,
+                "system_instructions": "",
+                "input_messages_str": "",
+                "last_response_str": "",
+                "task_prompt": prompt,
+                "first_call_done": False,
+            }
+
+            _install_provider_chat_capture_shim(provider)
+
+            tok_agent = _in_agent_run.set(True)
+            tok_cnt = _step_counter.set(0)
+            tok_ss = _current_step_span.set(None)
+            tok_st = _current_step_token.set(None)
+            tok_cap = _agent_capture.set(capture)
+            tok_tools = _agent_tool_definitions.set("")
+
+            try:
+                result = wrapped(*args, **kwargs)
+                total = _step_counter.get(0)
+                if total > 0:
+                    span.set_attribute("claw_eval.total_turns", total)
+                return result
+            finally:
+                try:
+                    _populate_agent_span(span, capture, prompt)
+                    entry_caps = _entry_capture.get(None)
+                    if entry_caps is not None:
+                        entry_caps.append(capture)
+                finally:
+                    # Runs even if populating the span failed: no state leaks.
+                    _end_current_step()
+                    _in_agent_run.reset(tok_agent)
+                    _step_counter.reset(tok_cnt)
+                    _current_step_span.reset(tok_ss)
+                    _current_step_token.reset(tok_st)
+                    _agent_capture.reset(tok_cap)
+                    _agent_tool_definitions.reset(tok_tools)
+
+
+def _install_provider_chat_capture_shim(provider) -> None:
+    """Idempotently install a pass-through shim on ``provider.chat``.
+
+    The shim reads the active capture dict from ``_agent_capture`` and
+    records token usage / input messages / latest response into it. When no
+    capture is active (e.g. provider used outside an AGENT span) the shim is
+    a transparent no-op. Recording is skipped while ``_compact_depth > 0``
+    so the AGENT totals match the framework's own ``total_usage`` accounting
+    (which excludes auto-compact LLM calls).
+
+    Return values pass through untouched; slotted providers are tolerated.
+    """
+    if provider is None:
+        return
+
+    existing = (getattr(provider, "__dict__", None) or {}).get("chat")
+    if getattr(existing, "_claw_eval_capture_shim", False):
+        return
+
+    cls = type(provider)
+    cls_chat = getattr(cls, "chat", None)
+    if cls_chat is None:
+        return
+    try:
+        bound_chat = cls_chat.__get__(provider, cls)
+    except Exception:
+        return
+    if not callable(bound_chat):
+        return
+
+    def chat(messages, *call_args, **call_kwargs):
+        # Capture the tools list *before* delegating so TOOL spans created
+        # inside ``bound_chat`` (none today, but cheap insurance) still see
+        # the populated ContextVar. The capture is idempotent — we only
+        # serialize once per AGENT run.
+        if _compact_depth.get(0) == 0 and not _agent_tool_definitions.get(""):
+            tools_arg = call_kwargs.get("tools")
+            if tools_arg is None and call_args:
+                tools_arg = call_args[0]
+            if tools_arg:
+                try:
+                    serialized = _serialize_tool_definitions(tools_arg)
+                except Exception:
+                    serialized = ""
+                if serialized:
+                    _agent_tool_definitions.set(serialized)
+
+        result = bound_chat(messages, *call_args, **call_kwargs)
+        capture = _agent_capture.get()
+        if capture is None or _compact_depth.get(0) > 0:
+            return result
+        try:
+            response, usage = result
+        except (TypeError, ValueError):
+            # Not a ``(response, usage)`` pair: record nothing, break nothing.
+            return result
+        try:
+            for key in ("input_tokens", "output_tokens"):
+                capture[key] += int(getattr(usage, key, 0) or 0)
+        except Exception:
+            pass
+
+        if not capture.get("first_call_done", False):
+            capture["first_call_done"] = True
+            try:
+                capture["system_instructions"] = _extract_system_prompt(messages)
+                non_system = [
+                    m for m in messages if getattr(m, "role", None) != "system"
+                ]
+                if non_system:
+                    capture["input_messages_str"] = (
+                        _serialize_input_messages(non_system)
+                    )
+            except Exception:
+                pass
+
+        try:
+            capture["last_response_str"] = _serialize_output_message(response)
+        except Exception:
+            pass
+        return result
+
+    chat._claw_eval_capture_shim = True
+    try:
+        provider.chat = chat
+    except Exception:
+        pass
+
+
+def _populate_agent_span(span, capture: dict, task_prompt: str) -> None:
+    """Write the data aggregated during a task run onto the AGENT span.
+
+    Token usage, system instructions, input messages and the last output
+    message are each written under their GenAI semantic-convention key,
+    but only when a non-empty value was captured. With no captured input
+    messages the task prompt is reported as a single user message.
+    """
+    tokens_in = int(capture.get("input_tokens", 0) or 0)
+    tokens_out = int(capture.get("output_tokens", 0) or 0)
+    if tokens_in:
+        span.set_attribute(GenAI.GEN_AI_USAGE_INPUT_TOKENS, tokens_in)
+    if tokens_out:
+        span.set_attribute(GenAI.GEN_AI_USAGE_OUTPUT_TOKENS, tokens_out)
+
+    def _captured(key: str) -> str:
+        # Missing and falsy entries both read as "".
+        return capture.get(key, "") or ""
+
+    system_prompt = _captured("system_instructions")
+    if system_prompt:
+        encoded = _serialize_system_instructions(system_prompt)
+        span.set_attribute(GenAI.GEN_AI_SYSTEM_INSTRUCTIONS, encoded)
+
+    # Captured messages win; otherwise the bare task prompt is wrapped as
+    # one user message.
+    input_messages = _captured("input_messages_str")
+    if not input_messages and task_prompt:
+        input_messages = _build_user_text_messages(task_prompt)
+    if input_messages:
+        span.set_attribute(GenAI.GEN_AI_INPUT_MESSAGES, input_messages)
+
+    final_response = _captured("last_response_str")
+    if final_response:
+        span.set_attribute(GenAI.GEN_AI_OUTPUT_MESSAGES, final_response)
+
+
+def _get_task_prompt(task) -> str:
+    """Return ``task.prompt.text``, or ``""`` when any link is missing."""
+    prompt = None if task is None else getattr(task, "prompt", None)
+    if prompt is None:
+        return ""
+    # A falsy ``text`` (``None`` or empty) also maps to "".
+    text = getattr(prompt, "text", "")
+    return text or ""
+
+
+class ProviderChatWrapper:
+    """Starts a fresh STEP span for every main-loop provider chat call.
+
+    A call made inside an agent run while ``_compact_depth`` is zero ends
+    the previous STEP and opens the next one before delegating, so TOOL
+    spans created afterwards become children of the latest STEP. All other
+    calls are passed straight through.
+    """
+
+    __slots__ = ("_tracer",)
+
+    def __init__(self, tracer: Tracer):
+        self._tracer = tracer
+
+    def __call__(self, wrapped, instance, args, kwargs):
+        outside_compaction = _compact_depth.get(0) == 0
+        inside_agent_run = _in_agent_run.get(False)
+        if inside_agent_run and outside_compaction:
+            # Main-loop LLM call: rotate to the next STEP before delegating.
+            _rotate_step(self._tracer)
+        return wrapped(*args, **kwargs)
+
+
+# ---------------------------------------------------------------------------
+# CHAIN wrapper (compact.do_auto_compact)
+# ---------------------------------------------------------------------------
+
+
+class DoAutoCompactWrapper:
+    """Creates a CHAIN span and bumps ``_compact_depth``.
+
+    Layer is "manual" when a non-None ``focus`` keyword is given, else "auto".
+    """
+
+    __slots__ = ("_tracer",)
+
+    def __init__(self, tracer: Tracer):
+        self._tracer = tracer
+
+    def __call__(self, wrapped, instance, args, kwargs):
+        focus = kwargs.get("focus")
+        layer = "manual" if focus is not None else "auto"
+
+        # ``start_as_current_span`` already records an escaping exception and
+        # sets ERROR status; recording it here as well would duplicate it.
+        with self._tracer.start_as_current_span(
+            "compact", kind=SpanKind.INTERNAL
+        ) as span:
+            span.set_attribute(GEN_AI_SPAN_KIND, "CHAIN")
+            span.set_attribute(GEN_AI_FRAMEWORK, "claw-eval")
+            span.set_attribute("claw_eval.compact.layer", layer)
+            depth_tok = _compact_depth.set(_compact_depth.get(0) + 1)
+            try:
+                return wrapped(*args, **kwargs)
+            finally:
+                _compact_depth.reset(depth_tok)
+
+
+# ---------------------------------------------------------------------------
+# TOOL wrapper (ToolDispatcher.dispatch / SandboxToolDispatcher.dispatch)
+# ---------------------------------------------------------------------------
+
+
+class ToolDispatchWrapper:
+    """Creates a TOOL span for ``dispatch`` calls.
+
+    Uses ``_in_tool_dispatch`` guard to prevent duplicate spans when
+    ``SandboxToolDispatcher.dispatch`` delegates to ``ToolDispatcher.dispatch``.
+
+    An exception raised by ``dispatch`` is recorded on the span once, by
+    ``start_as_current_span`` itself.
+    """
+
+    __slots__ = ("_tracer",)
+
+    def __init__(self, tracer: Tracer):
+        self._tracer = tracer
+
+    def __call__(self, wrapped, instance, args, kwargs):
+        if _in_tool_dispatch.get(False):
+            return wrapped(*args, **kwargs)
+
+        tool_use = args[0] if args else kwargs.get("tool_use")
+        tool_name = getattr(tool_use, "name", "unknown") if tool_use else "unknown"
+        tool_use_id = getattr(tool_use, "id", "") if tool_use else ""
+        tool_input = getattr(tool_use, "input", None) if tool_use else None
+        is_sandbox = hasattr(instance, "_http")
+
+        guard = _in_tool_dispatch.set(True)
+        with self._tracer.start_as_current_span(
+            f"execute_tool {tool_name}", kind=SpanKind.INTERNAL
+        ) as span:
+            span.set_attribute(GEN_AI_SPAN_KIND, "TOOL")
+            span.set_attribute(
+                GenAI.GEN_AI_OPERATION_NAME,
+                GenAI.GenAiOperationNameValues.EXECUTE_TOOL.value,
+            )
+            span.set_attribute(GEN_AI_FRAMEWORK, "claw-eval")
+            span.set_attribute(GenAI.GEN_AI_TOOL_NAME, tool_name)
+            span.set_attribute(GenAI.GEN_AI_TOOL_TYPE, "function")
+            if tool_use_id:
+                span.set_attribute(GenAI.GEN_AI_TOOL_CALL_ID, tool_use_id)
+            tool_defs = _agent_tool_definitions.get("")
+            if tool_defs:
+                span.set_attribute(GEN_AI_TOOL_DEFINITIONS, tool_defs)
+            if is_sandbox:
+                sandbox_url = getattr(instance, "_sandbox_url", None)
+                span.set_attribute(
+                    "claw_eval.sandbox.remote", sandbox_url is not None
+                )
+            if tool_input is not None:
+                span.set_attribute(
+                    GEN_AI_TOOL_CALL_ARGUMENTS,
+                    _safe_json(tool_input),
+                )
+
+            # No ``except`` here: the context manager records the exception
+            # and sets ERROR status; doing it twice would duplicate events.
+            try:
+                result = wrapped(*args, **kwargs)
+                _extract_dispatch_attrs(span, result)
+                return result
+            finally:
+                _in_tool_dispatch.reset(guard)
+
+
+def _extract_dispatch_attrs(span, result) -> None:
+    """Copy latency, HTTP status, error flag and output text onto ``span``.
+
+    Anything but a tuple of at least two items is ignored.
+    """
+    if not (isinstance(result, tuple) and len(result) >= 2):
+        return
+    tool_result, event = result[0], result[1]
+    if (latency_ms := getattr(event, "latency_ms", None)) is not None:
+        span.set_attribute("claw_eval.dispatch.latency_ms", float(latency_ms))
+    if (http_status := getattr(event, "response_status", None)) is not None:
+        span.set_attribute("http.response.status_code", int(http_status))
+    if getattr(tool_result, "is_error", False):
+        span.set_status(Status(StatusCode.ERROR))
+    if output_text := _extract_tool_result_text(tool_result):
+        span.set_attribute(GEN_AI_TOOL_CALL_RESULT, output_text)
+
+
+# ---------------------------------------------------------------------------
+# Judge wrapper (LLMJudge.evaluate / evaluate_actions / evaluate_visual)
+# ---------------------------------------------------------------------------
+
+
+class JudgeWrapper:
+    """Runs judge evaluation calls with nested LLM SDK spans suppressed.
+
+    Judging happens after the agent has finished and is a grading concern,
+    not part of the agent's own reasoning trace, so a dedicated LLM span
+    would only clutter the trace tail. No span is created here on purpose:
+    the call merely runs under the suppression context so the underlying
+    LLM SDK (OpenAI / etc.) does not emit a chat span either.
+    """
+
+    __slots__ = ("_tracer", "_method_name")
+
+    def __init__(self, tracer: Tracer, method_name: str = "evaluate"):
+        self._tracer = tracer
+        self._method_name = method_name
+
+    def __call__(self, wrapped, instance, args, kwargs):
+        token = _maybe_suppress_llm_sdk()
+        try:
+            outcome = wrapped(*args, **kwargs)
+        finally:
+            if token is not None:
+                otel_context.detach(token)
+        return outcome
+
+
+# ---------------------------------------------------------------------------
+# Per-task grader wrappers
+# ---------------------------------------------------------------------------
+#
+# Per-task graders (``tasks/T*/grader.py``) frequently bypass
+# ``LLMJudge.evaluate*`` and call ``judge.client.chat.completions.create``
+# directly inside helpers like ``_llm_score_classifications``. Those calls
+# would otherwise emit a stray "evaluation" LLM span at the very tail of
+# the trace.
+#
+# Rather than statically enumerating every task module, we hook the two
+# loader entry points (``registry.get_grader`` and
+# ``base.load_peer_grader``) and then walk the returned class' MRO to wrap
+# any matching evaluation-helper methods with ``JudgeWrapper``. This keeps
+# coverage automatic for any new task that follows the same naming
+# convention.
+
+
+import wrapt as _wrapt  # local import to avoid widening top-level deps
+
+_GRADER_EVAL_METHOD_NAMES: tuple[str, ...] = (  # grader helpers to wrap
+    "_llm_score_classifications",
+)
+
+_GRADER_WRAP_MARKER = "_claw_eval_judge_wrapped"  # set on wrapped methods
+
+
+def _wrap_grader_eval_methods(cls, tracer: Tracer) -> None:
+    """Patch the evaluation helpers of ``cls`` and its bases with JudgeWrapper.
+
+    Every class in the MRO (``object`` excluded) is inspected for methods
+    named in ``_GRADER_EVAL_METHOD_NAMES`` that it defines itself. Each hit
+    is replaced by a ``wrapt.FunctionWrapper`` flagged with
+    ``_GRADER_WRAP_MARKER``; already-flagged methods are skipped, so
+    repeated loads of the same class (e.g. peer-grader inheritance chains)
+    never wrap twice.
+    """
+    if cls is None or cls is object:
+        return
+    candidates = (
+        (klass, name, klass.__dict__.get(name))
+        for klass in getattr(cls, "__mro__", (cls,))
+        if klass is not object
+        for name in _GRADER_EVAL_METHOD_NAMES
+    )
+    for klass, name, method in candidates:
+        if method is None or getattr(method, _GRADER_WRAP_MARKER, False):
+            continue
+        try:
+            judge = JudgeWrapper(tracer, name)
+            patched = _wrapt.FunctionWrapper(method, judge)
+            setattr(patched, _GRADER_WRAP_MARKER, True)
+            setattr(klass, name, patched)
+        except Exception:
+            # Best effort: a failure only loses suppression for this one
+            # method and must never break grader loading.
+            pass
+
+
+class GetGraderWrapper:
+    """Wraps ``claw_eval.graders.registry.get_grader``.
+
+    Once the upstream loader has returned a grader instance, the MRO of its
+    class is scanned and the evaluation helpers are wrapped, so the inner
+    ``judge.client.chat.completions.create`` calls do not emit a trailing
+    LLM span.
+    """
+
+    __slots__ = ("_tracer",)
+
+    def __init__(self, tracer: Tracer):
+        self._tracer = tracer
+
+    def __call__(self, wrapped, instance, args, kwargs):
+        grader = wrapped(*args, **kwargs)
+        grader_cls = type(grader)
+        try:
+            _wrap_grader_eval_methods(grader_cls, self._tracer)
+        except Exception:
+            # Suppression is best effort; the loaded grader is returned as-is.
+            pass
+        return grader
+
+
+class LoadPeerGraderWrapper:
+    """Wraps ``claw_eval.graders.base.load_peer_grader``.
+
+    A sibling task's ``grader.py`` loads its peer grader lazily while being
+    imported (``_Base = load_peer_grader("T001zh_...")``). Wrapping the
+    returned class here keeps the parent-side evaluation helpers
+    suppressed even when subclasses do not override them.
+    """
+
+    __slots__ = ("_tracer",)
+
+    def __init__(self, tracer: Tracer):
+        self._tracer = tracer
+
+    def __call__(self, wrapped, instance, args, kwargs):
+        loaded_cls = wrapped(*args, **kwargs)
+        try:
+            _wrap_grader_eval_methods(loaded_cls, self._tracer)
+        except Exception:
+            # Suppression is best effort; the loaded class is returned as-is.
+            pass
+        return loaded_cls
+
+
+# ---------------------------------------------------------------------------
+# Shared helpers
+# ---------------------------------------------------------------------------
+
+
+def _maybe_suppress_llm_sdk():
+    """Attach a context that silences nested LLM SDK / HTTP instrumentation.
+
+    The returned token is for the caller to detach. Keys set to ``True``:
+
+    * ``_SUPPRESS_LLM_SDK_KEY`` (unless ``None``) — Aliyun-private, honored by
+      ``aliyun-instrumentation-openai``, ``aliyun-opentelemetry-util-genai``
+      and ``opentelemetry-instrumentation-litellm``.
+    * ``_SUPPRESS_INSTRUMENTATION_KEY`` — OpenTelemetry standard; honored by
+      upstream instrumentors (httpx, requests, urllib3, etc.).
+    """
+    keys = [_SUPPRESS_INSTRUMENTATION_KEY]
+    if _SUPPRESS_LLM_SDK_KEY is not None:
+        keys.insert(0, _SUPPRESS_LLM_SDK_KEY)
+    ctx = otel_context.get_current()
+    for key in keys:
+        ctx = otel_context.set_value(key, True, ctx)
+    return otel_context.attach(ctx)
diff --git a/instrumentation-loongsuite/loongsuite-instrumentation-claw-eval/src/opentelemetry/instrumentation/claw_eval/package.py b/instrumentation-loongsuite/loongsuite-instrumentation-claw-eval/src/opentelemetry/instrumentation/claw_eval/package.py
new file mode 100644
index 000000000..32c50b3db
--- /dev/null
+++ b/instrumentation-loongsuite/loongsuite-instrumentation-claw-eval/src/opentelemetry/instrumentation/claw_eval/package.py
@@ -0,0 +1,3 @@
+_instruments = ("claw-eval >= 0.1.0",)
+
+_supports_metrics = False
diff --git a/instrumentation-loongsuite/loongsuite-instrumentation-claw-eval/src/opentelemetry/instrumentation/claw_eval/version.py b/instrumentation-loongsuite/loongsuite-instrumentation-claw-eval/src/opentelemetry/instrumentation/claw_eval/version.py
new file mode 100644
index 000000000..3dc1f76bc
--- /dev/null
+++ b/instrumentation-loongsuite/loongsuite-instrumentation-claw-eval/src/opentelemetry/instrumentation/claw_eval/version.py
@@ -0,0 +1 @@
+__version__ = "0.1.0"
diff --git a/instrumentation-loongsuite/loongsuite-instrumentation-minisweagent/pyproject.toml b/instrumentation-loongsuite/loongsuite-instrumentation-minisweagent/pyproject.toml
new file mode 100644
index 000000000..6d37e87fe
--- /dev/null
+++ b/instrumentation-loongsuite/loongsuite-instrumentation-minisweagent/pyproject.toml
@@ -0,0 +1,54 @@
+[build-system]
+requires = ["hatchling"]
+build-backend = "hatchling.build"
+
+[project]
+name = "loongsuite-instrumentation-minisweagent"
+dynamic = ["version"]
+description = "LoongSuite mini-swe-agent instrumentation"
+license = "Apache-2.0"
+requires-python = ">=3.10,<4"
+authors = [
+  { name = "OpenTelemetry Authors", email = "cncf-opentelemetry-contributors@lists.cncf.io" },
+]
+classifiers = [
+  "Development Status :: 4 - Beta",
+  "Intended Audience :: Developers",
+  "License :: OSI Approved :: Apache Software License",
+  "Programming Language :: Python",
+  "Programming Language :: Python :: 3",
+  "Programming Language :: Python :: 3.10",
+  "Programming Language :: Python :: 3.11",
+  "Programming Language :: Python :: 3.12",
+  "Programming Language :: Python :: 3.13",
+]
+dependencies = [  # NOTE(review): the package imports opentelemetry.util.genai.* but no distribution providing it is listed — confirm
+  "opentelemetry-api >= 1.37.0",
+  "opentelemetry-instrumentation >= 0.58b0",
+  "opentelemetry-semantic-conventions >= 0.58b0",
+  "wrapt >= 1.0.0, < 2.0.0",
+]
+
+[project.optional-dependencies]
+instruments = [
+  "mini-swe-agent >= 2.2.0",
+]
+
+[project.entry-points.opentelemetry_instrumentor]
+minisweagent = "opentelemetry.instrumentation.minisweagent:MiniSweAgentInstrumentor"
+
+[project.urls]
+Homepage = "https://github.com/alibaba/loongsuite-python-agent/tree/main/instrumentation-loongsuite/loongsuite-instrumentation-minisweagent"
+Repository = "https://github.com/alibaba/loongsuite-python-agent"
+
+[tool.hatch.version]
+path = "src/opentelemetry/instrumentation/minisweagent/version.py"
+
+[tool.hatch.build.targets.sdist]
+include = [
+  "/src",
+  "/tests",
+]
+
+[tool.hatch.build.targets.wheel]
+packages = ["src/opentelemetry"]
diff --git a/instrumentation-loongsuite/loongsuite-instrumentation-minisweagent/src/opentelemetry/instrumentation/minisweagent/__init__.py b/instrumentation-loongsuite/loongsuite-instrumentation-minisweagent/src/opentelemetry/instrumentation/minisweagent/__init__.py
new file mode 100644
index 000000000..04274fa5d
--- /dev/null
+++ b/instrumentation-loongsuite/loongsuite-instrumentation-minisweagent/src/opentelemetry/instrumentation/minisweagent/__init__.py
@@ -0,0 +1,161 @@
+"""
+LoongSuite mini-swe-agent Instrumentation
+=========================================
+
+Automatic instrumentation for the `mini-swe-agent
+<https://github.com/SWE-agent/mini-swe-agent>`_ framework.
+
+Uses **Method C (hybrid)**:
+
+* factory injection via ``get_environment`` → ``TracingEnvironment`` (TOOL / ``execute_tool``)
+* ``wrapt`` on ``DefaultAgent.run`` / ``DefaultAgent.step``, and ENTRY on Typer ``minisweagent.run.mini:app``
+
+LLM spans stay in LiteLLM/OpenAI instrumentation; this package adds Agent/ReAct/ENTRY/TOOL spans and (with the env vars described in the instrumentor docstring) full ARMS-aligned message / tool payloads.
+
+Usage
+-----
+
+.. code:: python
+
+    from opentelemetry.instrumentation.minisweagent import MiniSweAgentInstrumentor
+
+    MiniSweAgentInstrumentor().instrument()
+
+    # Then use mini-swe-agent as normal
+    from minisweagent.models import get_model
+    from minisweagent.environments import get_environment
+    from minisweagent.agents.default import DefaultAgent
+
+    model = get_model("gpt-4o")
+    env = get_environment({"environment_class": "local"})
+    agent = DefaultAgent(model=model, environment=env)
+    agent.run("Fix the bug")
+
+API
+---
+"""
+
+from __future__ import annotations
+
+import logging
+from typing import Any, Collection
+
+from opentelemetry import trace as trace_api
+from opentelemetry.instrumentation.instrumentor import BaseInstrumentor
+from opentelemetry.instrumentation.minisweagent.package import _instruments
+from opentelemetry.instrumentation.minisweagent.version import __version__
+from wrapt import wrap_function_wrapper
+
+logger = logging.getLogger(__name__)
+
+__all__ = ["MiniSweAgentInstrumentor"]
+
+
+class MiniSweAgentInstrumentor(BaseInstrumentor):
+    """An instrumentor for the mini-swe-agent framework.
+
+    Covers GenAI span kinds (ARMS / LoongSuite conventions when
+    ``OTEL_SEMCONV_STABILITY_OPT_IN=gen_ai_latest_experimental`` and
+    ``OTEL_INSTRUMENTATION_GENAI_CAPTURE_MESSAGE_CONTENT=SPAN_ONLY``):
+
+    * **ENTRY** – Typer ``mini`` callable ``app`` (``minisweagent.run.mini:app``), span name ``enter_ai_application_system``
+    * **AGENT** – ``DefaultAgent.run`` via ``invoke_agent`` (+ messages / system instruction / tool definitions)
+    * **STEP** – ``DefaultAgent.step`` (ReAct round)
+    * **TOOL** – ``TracingEnvironment.execute`` (``execute_tool`` for bash)
+
+    LLM-call spans remain with the underlying LiteLLM/OpenAI instrumentation.
+    """
+
+    _original_get_environment = None  # unpatched factory; None while not instrumented
+
+    def instrumentation_dependencies(self) -> Collection[str]:
+        """Return the requirement specifiers of the library this instrumentor targets."""
+        return _instruments
+
+    def _instrument(self, **kwargs: Any) -> None:
+        """Patch the environment factory, the ``mini`` CLI app and ``DefaultAgent.run``/``step`` (best effort)."""
+        tracer = trace_api.get_tracer(__name__, __version__, tracer_provider=kwargs.get("tracer_provider"))
+
+        from opentelemetry.instrumentation.minisweagent.internal.agent_wrappers import (
+            DefaultAgentRunWrapper,
+            DefaultAgentStepWrapper,
+        )
+        from opentelemetry.instrumentation.minisweagent.internal.cli_wrappers import patch_mini_cli_app_module
+        from opentelemetry.instrumentation.minisweagent.internal.delegates import TracingEnvironment
+
+        # --- factory injection: get_environment ---
+        try:
+            import minisweagent.environments as _envs_mod
+
+            owner = self.__class__
+            if owner._original_get_environment is None:
+                owner._original_get_environment = _envs_mod.get_environment
+            # Bind the original in the closure: modules that imported the wrapped factory
+            # by name keep calling it after uninstrument, when the class attribute is None.
+            original_get_environment = owner._original_get_environment
+
+            def _wrapped_get_environment(*args: Any, **kw: Any) -> Any:
+                env = original_get_environment(*args, **kw)
+                if owner._original_get_environment is None:
+                    return env  # uninstrumented: a stale reference passes straight through
+                return TracingEnvironment(env, tracer)
+
+            _envs_mod.get_environment = _wrapped_get_environment
+        except Exception as exc:
+            logger.warning("Could not wrap get_environment: %s", exc)
+
+        try:
+            patch_mini_cli_app_module()
+        except Exception as exc:
+            logger.warning("Could not patch minisweagent.run.mini.app (ENTRY): %s", exc)
+
+        # --- wrapt: DefaultAgent.run / DefaultAgent.step ---
+        try:
+            wrap_function_wrapper(
+                module="minisweagent.agents.default",
+                name="DefaultAgent.run",
+                wrapper=DefaultAgentRunWrapper(tracer),
+            )
+        except Exception as exc:
+            logger.warning("Could not wrap DefaultAgent.run: %s", exc)
+
+        try:
+            wrap_function_wrapper(
+                module="minisweagent.agents.default",
+                name="DefaultAgent.step",
+                wrapper=DefaultAgentStepWrapper(tracer),
+            )
+        except Exception as exc:
+            logger.warning("Could not wrap DefaultAgent.step: %s", exc)
+
+    def _uninstrument(self, **kwargs: Any) -> None:
+        """Undo ``_instrument``: unwrap ``DefaultAgent``, unpatch the CLI app, restore the factory."""
+        # --- restore wrapt patches on DefaultAgent ---
+        try:
+            from minisweagent.agents.default import DefaultAgent
+
+            if hasattr(DefaultAgent.run, "__wrapped__"):
+                DefaultAgent.run = DefaultAgent.run.__wrapped__  # type: ignore[attr-defined]
+            if hasattr(DefaultAgent.step, "__wrapped__"):
+                DefaultAgent.step = DefaultAgent.step.__wrapped__  # type: ignore[attr-defined]
+        except Exception as exc:
+            logger.debug("Could not unwrap DefaultAgent: %s", exc)
+
+        try:
+            from opentelemetry.instrumentation.minisweagent.internal.cli_wrappers import (
+                unpatch_mini_cli_app_module,
+            )
+
+            unpatch_mini_cli_app_module()
+        except Exception as exc:
+            logger.debug("Could not unpatch mini app: %s", exc)
+
+        # --- restore original factory ---
+        if self.__class__._original_get_environment is not None:
+            try:
+                import minisweagent.environments as _envs_mod
+
+                _envs_mod.get_environment = self.__class__._original_get_environment
+                self.__class__._original_get_environment = None
+            except Exception as exc:
+                logger.debug("Could not restore get_environment: %s", exc)
diff --git a/instrumentation-loongsuite/loongsuite-instrumentation-minisweagent/src/opentelemetry/instrumentation/minisweagent/config.py b/instrumentation-loongsuite/loongsuite-instrumentation-minisweagent/src/opentelemetry/instrumentation/minisweagent/config.py
new file mode 100644
index 000000000..ded93cfae
--- /dev/null
+++ b/instrumentation-loongsuite/loongsuite-instrumentation-minisweagent/src/opentelemetry/instrumentation/minisweagent/config.py
@@ -0,0 +1,20 @@
+"""Configuration via environment variables."""
+
+from __future__ import annotations
+
+import os
+
+
+def _int_env(name: str, default: str) -> int:
+    """Read env var *name* as an int; parse *default* instead when it is unset or not an integer."""
+    raw_value = os.environ.get(name, default)
+    try:
+        return int(raw_value)
+    except ValueError:
+        return int(default)
+
+
+# Maximum preview length, in characters, applied by the agent wrapper's task/submission previews.
+OTEL_MINISWEAGENT_TASK_PREVIEW_MAX_LEN = _int_env("OTEL_MINISWEAGENT_TASK_PREVIEW_MAX_LEN", "256")
+# NOTE(review): counterpart for command previews; no reader of this value is visible here.
+OTEL_MINISWEAGENT_COMMAND_PREVIEW_MAX_LEN = _int_env("OTEL_MINISWEAGENT_COMMAND_PREVIEW_MAX_LEN", "256")
diff --git a/instrumentation-loongsuite/loongsuite-instrumentation-minisweagent/src/opentelemetry/instrumentation/minisweagent/internal/__init__.py b/instrumentation-loongsuite/loongsuite-instrumentation-minisweagent/src/opentelemetry/instrumentation/minisweagent/internal/__init__.py
new file mode 100644
index 000000000..0b6c41cd6
--- /dev/null
+++ b/instrumentation-loongsuite/loongsuite-instrumentation-minisweagent/src/opentelemetry/instrumentation/minisweagent/internal/__init__.py
@@ -0,0 +1 @@
+"""Internal helpers for mini-swe-agent instrumentation."""
diff --git a/instrumentation-loongsuite/loongsuite-instrumentation-minisweagent/src/opentelemetry/instrumentation/minisweagent/internal/agent_wrappers.py b/instrumentation-loongsuite/loongsuite-instrumentation-minisweagent/src/opentelemetry/instrumentation/minisweagent/internal/agent_wrappers.py
new file mode 100644
index 000000000..2e99fc56f
--- /dev/null
+++ b/instrumentation-loongsuite/loongsuite-instrumentation-minisweagent/src/opentelemetry/instrumentation/minisweagent/internal/agent_wrappers.py
@@ -0,0 +1,161 @@
+"""wrapt hooks for DefaultAgent.run / DefaultAgent.step (ARMS / util-genai semantics)."""
+
+from __future__ import annotations
+
+import logging
+from typing import Any, Callable
+
+from opentelemetry import context as context_api
+from opentelemetry.trace import Tracer
+
+from opentelemetry.instrumentation.minisweagent.config import (
+    OTEL_MINISWEAGENT_TASK_PREVIEW_MAX_LEN,
+)
+from opentelemetry.instrumentation.minisweagent.internal.conversation import (
+    build_invoke_agent_payload,
+)
+
+logger = logging.getLogger(__name__)
+
+
+def _task_preview(task: str) -> str:
+    """Truncate *task* to the configured preview length; a cut result ends with "..." when there is room."""
+    limit = max(OTEL_MINISWEAGENT_TASK_PREVIEW_MAX_LEN, 0)  # a negative setting means "no preview"
+    if not task or len(task) <= limit:
+        return task or ""
+    # Up to 3 chars leaves no room for text plus an ellipsis: hard-cut rather than slice negatively.
+    return task[: limit - 3] + "..." if limit > 3 else task[:limit]
+
+
+def _request_model_from_agent(instance: Any) -> str | None:
+    """Return the agent's configured model name as a string, if it can be found.
+
+    Follows ``instance.model.config.model_name``; a missing or ``None`` link along the way yields ``None``.
+    """
+    current = instance
+    for attr_name in ("model", "config", "model_name"):
+        current = getattr(current, attr_name, None)
+        if current is None:
+            return None
+    return str(current)
+
+
+def _populate_invoke_from_agent(inv: Any, instance: Any) -> None:
+    """Copy the agent's conversation payload onto *inv*; if building it fails, log at debug and return."""
+    try:
+        built = build_invoke_agent_payload(instance)
+    except Exception:
+        logger.debug("invoke_agent telemetry payload failed", exc_info=True)
+        return
+    # Fixed key order; indexing (not .get) so a missing key raises KeyError.
+    for field in ("system_instruction", "input_messages", "output_messages", "tool_definitions"):
+        setattr(inv, field, built[field])
+
+
+class DefaultAgentRunWrapper:
+    """wrapt wrapper around ``DefaultAgent.run`` that reports each call as an invoke_agent invocation."""
+
+    __slots__ = ("_tracer",)
+
+    def __init__(self, tracer: Tracer):  # noqa: ARG002 — API compatibility
+        self._tracer = tracer
+
+    def __call__(
+        self,
+        wrapped: Callable[..., Any],
+        instance: Any,
+        args: tuple[Any, ...],
+        kwargs: dict[str, Any],
+    ) -> Any:
+        """Call ``wrapped`` inside an invoke_agent invocation; return its result or re-raise its error."""
+        from opentelemetry.util.genai.extended_handler import get_extended_telemetry_handler  # noqa: PLC0415
+        from opentelemetry.util.genai.extended_types import InvokeAgentInvocation  # noqa: PLC0415
+        from opentelemetry.util.genai.types import Error as GenAIError  # noqa: PLC0415
+
+        if args:
+            task = args[0]
+        else:
+            task = kwargs.get("task", "") or ""
+        agent_name = f"{instance.__class__.__module__}.{instance.__class__.__name__}"
+
+        handler = get_extended_telemetry_handler()
+        invocation = InvokeAgentInvocation(provider="minisweagent", agent_name=agent_name)
+        invocation.request_model = _request_model_from_agent(instance)
+        invocation.attributes.setdefault("gen_ai.framework", "minisweagent")
+        preview = _task_preview(str(task))
+        if preview:
+            invocation.attributes["minisweagent.task.preview"] = preview
+
+        instance._otel_msw_round = 0  # noqa: SLF001 — round counter read by the STEP wrapper
+        handler.start_invoke_agent(invocation, context=context_api.get_current())
+        try:
+            result = wrapped(*args, **kwargs)
+        except BaseException as exc:
+            try:
+                _populate_invoke_from_agent(invocation, instance)
+            except Exception:
+                logger.debug("populate invoke_agent on error failed", exc_info=True)
+            if not isinstance(exc, Exception):
+                handler.stop_invoke_agent(invocation)  # non-Exception BaseException: stop, not fail
+            else:
+                handler.fail_invoke_agent(invocation, GenAIError(message=str(exc), type=type(exc)))
+            raise
+
+        try:
+            _populate_invoke_from_agent(invocation, instance)
+            if isinstance(result, dict):
+                exit_status = result.get("exit_status")
+                if exit_status is not None:
+                    invocation.attributes["minisweagent.exit_status"] = str(exit_status)
+                submission = result.get("submission")
+                if submission is not None:
+                    invocation.attributes["minisweagent.submission.preview"] = _task_preview(str(submission))
+        finally:
+            handler.stop_invoke_agent(invocation)
+        return result
+
+
+class DefaultAgentStepWrapper:
+    """wrapt wrapper around ``DefaultAgent.step`` that reports each call as a ReAct step.
+
+    Steps are numbered from 1 using a counter kept on the agent instance.
+    """
+
+    __slots__ = ("_tracer",)
+
+    def __init__(self, tracer: Tracer):  # noqa: ARG002
+        self._tracer = tracer
+
+    def __call__(
+        self,
+        wrapped: Callable[..., Any],
+        instance: Any,
+        args: tuple[Any, ...],
+        kwargs: dict[str, Any],
+    ) -> Any:
+        """Call ``wrapped`` inside a react-step invocation; return its result or re-raise its error."""
+        from minisweagent.exceptions import InterruptAgentFlow  # noqa: PLC0415
+        from opentelemetry.util.genai.extended_handler import get_extended_telemetry_handler  # noqa: PLC0415
+        from opentelemetry.util.genai.extended_types import ReactStepInvocation  # noqa: PLC0415
+        from opentelemetry.util.genai.types import Error as GenAIError  # noqa: PLC0415
+
+        # The counter is reset to 0 by the run wrapper; a missing/None value counts as 0.
+        round_no = int(getattr(instance, "_otel_msw_round", 0) or 0) + 1
+        instance._otel_msw_round = round_no  # noqa: SLF001
+
+        handler = get_extended_telemetry_handler()
+        step = ReactStepInvocation(round=round_no)
+        handler.start_react_step(step, context=context_api.get_current())
+        try:
+            result = wrapped(*args, **kwargs)
+        except BaseException as exc:
+            step.finish_reason = type(exc).__qualname__
+            # Flow-control interrupts and non-Exception BaseExceptions end the step via stop, not fail.
+            if isinstance(exc, InterruptAgentFlow) or not isinstance(exc, Exception):
+                handler.stop_react_step(step)
+            else:
+                handler.fail_react_step(step, GenAIError(message=str(exc), type=type(exc)))
+            raise
+
+        handler.stop_react_step(step)
+        return result
diff --git a/instrumentation-loongsuite/loongsuite-instrumentation-minisweagent/src/opentelemetry/instrumentation/minisweagent/internal/cli_wrappers.py b/instrumentation-loongsuite/loongsuite-instrumentation-minisweagent/src/opentelemetry/instrumentation/minisweagent/internal/cli_wrappers.py
new file mode 100644
index 000000000..761990398
--- /dev/null
+++ b/instrumentation-loongsuite/loongsuite-instrumentation-minisweagent/src/opentelemetry/instrumentation/minisweagent/internal/cli_wrappers.py
@@ -0,0 +1,104 @@
+"""CLI ENTRY: ``mini`` is exposed as Typer ``app``, not Typer-decorated ``main``."""
+
+from __future__ import annotations
+
+import logging
+import sys
+from typing import Any
+
+from opentelemetry import context as context_api
+
+from opentelemetry.instrumentation.minisweagent.internal.conversation import (
+    apply_payload_to_entry_invocation,
+    try_fill_entry_payload_from_mini_trajectory,
+)
+
+logger = logging.getLogger(__name__)
+
+_PATCH_FLAG = "_otel_loongsuite_mini_app_patched"  # attribute set on minisweagent.run.mini once its app is proxied
+_ORIG_APP_ATTR = "_otel_loongsuite_orig_mini_app"  # attribute holding the original app so unpatch can restore it
+
+
+class _MiniTyperAppProxy:
+    """Stand-in for the Typer/Click ``app``: ``__call__`` adds an ENTRY invocation, the rest is forwarded."""
+
+    __slots__ = ("_inner",)
+
+    def __init__(self, inner: Any):
+        object.__setattr__(self, "_inner", inner)
+
+    def _hydrate_entry(self, entry_inv: Any) -> None:
+        """Best effort: copy the mini trajectory file's payload onto *entry_inv*; failures are only logged."""
+        try:
+            payload = try_fill_entry_payload_from_mini_trajectory()
+            if payload:
+                apply_payload_to_entry_invocation(entry_inv, payload)
+        except Exception:
+            logger.debug("ENTRY traj hydrate failed", exc_info=True)
+
+    def __call__(self, *args: Any, **kwargs: Any) -> Any:
+        """Invoke the wrapped app; end the ENTRY invocation with fail on Exception, stop otherwise."""
+        from opentelemetry.util.genai.extended_handler import get_extended_telemetry_handler  # noqa: PLC0415
+        from opentelemetry.util.genai.extended_types import EntryInvocation  # noqa: PLC0415
+        from opentelemetry.util.genai.types import Error as GenAIError  # noqa: PLC0415
+
+        han = get_extended_telemetry_handler()
+        entry_inv = EntryInvocation()
+        han.start_entry(entry_inv, context=context_api.get_current())
+        try:
+            result = self._inner(*args, **kwargs)
+        except Exception as exc:
+            self._hydrate_entry(entry_inv)
+            han.fail_entry(entry_inv, GenAIError(message=str(exc), type=type(exc)))
+            raise
+        except BaseException:
+            # Typer/Click commonly exits by raising SystemExit after the command
+            # callback has completed; the trajectory file is available here.
+            self._hydrate_entry(entry_inv)
+            han.stop_entry(entry_inv)
+            raise
+
+        self._hydrate_entry(entry_inv)
+        han.stop_entry(entry_inv)
+        return result
+
+    def __getattr__(self, name: str) -> Any:
+        # Typer exposes click commands via attribute access — forward everything. Read the slot with
+        # object.__getattribute__: on a bare instance (copy/pickle) ``self._inner`` would recurse here.
+        return getattr(object.__getattribute__(self, "_inner"), name)
+
+
+def patch_mini_cli_app_module() -> None:
+    """Install the ENTRY proxy as ``minisweagent.run.mini.app`` (at most once).
+
+    Also rebinds that module's ``get_environment`` name (if any) to the current factory.
+    """
+    try:
+        import minisweagent.run.mini as mini_mod
+        import minisweagent.environments as envs_mod
+    except Exception as exc:
+        logger.debug("minisweagent.run.mini not available for ENTRY patch: %s", exc)
+        return
+    if hasattr(mini_mod, "get_environment"):
+        mini_mod.get_environment = envs_mod.get_environment
+    already_patched = getattr(mini_mod, _PATCH_FLAG, False)
+    real_app = None if already_patched else getattr(mini_mod, "app", None)
+    if real_app is None or isinstance(real_app, _MiniTyperAppProxy):
+        return
+    setattr(mini_mod, _ORIG_APP_ATTR, real_app)
+    mini_mod.app = _MiniTyperAppProxy(real_app)
+    setattr(mini_mod, _PATCH_FLAG, True)
+
+
+def unpatch_mini_cli_app_module() -> None:
+    """Put back the ``app`` saved by the patch and drop the marker attributes; errors are only logged."""
+    try:
+        mini_mod = sys.modules.get("minisweagent.run.mini")
+        if mini_mod is not None and getattr(mini_mod, _PATCH_FLAG, False):
+            saved_app = getattr(mini_mod, _ORIG_APP_ATTR, None)
+            if saved_app is not None:
+                mini_mod.app = saved_app  # type: ignore[assignment]
+            delattr(mini_mod, _PATCH_FLAG)
+            delattr(mini_mod, _ORIG_APP_ATTR)
+    except Exception as exc:
+        logger.debug("unpatch mini app failed: %s", exc)
diff --git a/instrumentation-loongsuite/loongsuite-instrumentation-minisweagent/src/opentelemetry/instrumentation/minisweagent/internal/conversation.py b/instrumentation-loongsuite/loongsuite-instrumentation-minisweagent/src/opentelemetry/instrumentation/minisweagent/internal/conversation.py
new file mode 100644
index 000000000..ccef1146b
--- /dev/null
+++ b/instrumentation-loongsuite/loongsuite-instrumentation-minisweagent/src/opentelemetry/instrumentation/minisweagent/internal/conversation.py
@@ -0,0 +1,221 @@
+"""Map mini-swe-agent trajectory dicts → OpenTelemetry GenAI message / tool-definition types."""
+
+from __future__ import annotations
+
+import json
+import logging
+from pathlib import Path
+from typing import Any
+
+from opentelemetry.util.genai.types import (
+    FunctionToolDefinition,
+    InputMessage,
+    OutputMessage,
+    Text,
+    ToolCall,
+    ToolCallResponse,
+)
+
+logger = logging.getLogger(__name__)
+
+_TRAJ_MAX_BYTES = 8_000_000  # trajectory files larger than this (bytes) are not read for telemetry
+
+
+def bash_tool_definition() -> FunctionToolDefinition:
+    """Describe mini-swe-agent's bash tool as a ``FunctionToolDefinition``.
+
+    Name, description and parameters are read from ``BASH_TOOL["function"]``.
+    """
+    from minisweagent.models.utils.actions_toolcall import BASH_TOOL  # noqa: PLC0415
+
+    spec = BASH_TOOL["function"]
+    schema = spec.get("parameters") or {}
+    return FunctionToolDefinition(name=spec["name"], description=spec.get("description"), parameters=schema)
+
+
+def _text_parts(content: str | None) -> list[Text]:
+    """Wrap *content* in a single ``Text`` part; ``None`` or whitespace-only content gives ``[]``."""
+    is_blank = content is None or str(content).strip() == ""
+    return [] if is_blank else [Text(content=str(content))]
+
+
+def _normalized_tool_calls(msg: dict[str, Any]) -> list[ToolCall]:
+    """Collect ``ToolCall`` parts from ``msg["tool_calls"]``, or from ``msg["extra"]["actions"]`` if there are none."""
+    parts: list[ToolCall] = []
+    raw = msg.get("tool_calls")
+    if raw:
+        for tc in raw:
+            fn_obj = getattr(tc, "function", None)  # attribute-style entry first ...
+            if fn_obj is None and isinstance(tc, dict):
+                fn_obj = tc.get("function")  # ... then the plain-dict form
+            tc_id = getattr(tc, "id", None)
+            if tc_id is None and isinstance(tc, dict):
+                tc_id = tc.get("id")
+
+            name = "bash"  # defaults used when the entry has no function object
+            raw_args: Any = "{}"
+            if fn_obj is not None:
+                name = getattr(fn_obj, "name", None) or (
+                    fn_obj.get("name") if isinstance(fn_obj, dict) else name
+                )
+                raw_args = getattr(fn_obj, "arguments", None)
+                if raw_args is None and isinstance(fn_obj, dict):
+                    raw_args = fn_obj.get("arguments", "{}")
+            if isinstance(raw_args, str):
+                try:
+                    args_obj = json.loads(raw_args)
+                except json.JSONDecodeError:
+                    args_obj = {"raw": raw_args}  # keep undecodable argument text under "raw"
+            else:
+                args_obj = raw_args if raw_args is not None else {}
+            parts.append(ToolCall(id=tc_id, name=str(name or "bash"), arguments=args_obj))
+
+    extra = msg.get("extra") or {}
+    actions = extra.get("actions") or []
+    if not raw and actions:  # fallback: only when the message carried no tool_calls
+        for act in actions:
+            cmd = act.get("command") if isinstance(act, dict) else None
+            if cmd is None:
+                continue  # non-dict actions and actions without a command are skipped
+            parts.append(
+                ToolCall(
+                    id=act.get("tool_call_id") if isinstance(act, dict) else None,
+                    name="bash",
+                    arguments={"command": cmd},
+                )
+            )
+
+    return parts
+
+
+def split_system_messages(
+    messages: list[dict[str, Any]],
+) -> tuple[list[Text], list[dict[str, Any]]]:
+    """Partition *messages* into system-prompt ``Text`` parts and the remaining messages.
+
+    Order is preserved within each group; entries that are not dicts are dropped.
+    """
+    dict_messages = [m for m in messages if isinstance(m, dict)]
+    system_parts = [
+        Text(content=str(m.get("content", ""))) for m in dict_messages if m.get("role") == "system"
+    ]
+    others = [m for m in dict_messages if m.get("role") != "system"]
+    return system_parts, others
+
+
+def _message_to_semconv_messages(
+    msg: dict[str, Any],
+) -> list[InputMessage | OutputMessage]:
+    """Convert one trajectory message dict into GenAI semconv message objects.
+
+    ``assistant`` becomes an ``OutputMessage`` (text plus tool calls); ``user``, ``tool``,
+    ``exit`` and any unrecognized role become an ``InputMessage``. ``exit`` is reported
+    as a user message whose text is prefixed with ``EXIT: ``.
+    """
+    role = msg.get("role")
+    if role == "user":
+        return [InputMessage(role="user", parts=_text_parts(msg.get("content")))]
+    if role == "tool":
+        tid = msg.get("tool_call_id")
+        return [
+            InputMessage(
+                role="tool",
+                parts=[
+                    ToolCallResponse(
+                        id=tid if isinstance(tid, str) else None,
+                        response=msg.get("content", ""),
+                    )
+                ],
+            )
+        ]
+    if role == "assistant":
+        parts: list[Any] = [*_text_parts(msg.get("content")), *_normalized_tool_calls(msg)]
+        if not parts:
+            parts = [Text(content="")]  # keep at least one part on every assistant message
+        extra = msg.get("extra") or {}
+        finish = "tool_calls" if extra.get("actions") or msg.get("tool_calls") else "stop"
+        return [
+            OutputMessage(
+                role="assistant", parts=parts, finish_reason=finish  # type: ignore[arg-type]
+            )
+        ]
+    if role == "exit":
+        return [
+            InputMessage(
+                role="user",
+                parts=_text_parts(f"EXIT: {msg.get('content', '')}"),
+            )
+        ]
+    # Unknown role: pass the raw content on; _text_parts stringifies it itself. Pre-wrapping it in
+    # str() would turn a missing content into the literal text "None" and defeat its None check.
+    return [InputMessage(role=str(role or "unknown"), parts=_text_parts(msg.get("content")))]
+
+
+def build_invoke_payload_from_messages(messages: list[dict[str, Any]]) -> dict[str, Any]:
+    """Core conversion: trajectory message dicts → invoke_agent / ENTRY payload dict."""
+    sys_inst, rest = split_system_messages(messages)
+    input_messages: list[InputMessage] = []
+    output_messages: list[OutputMessage] = []
+
+    for m in rest:
+        try:
+            converted_batch = _message_to_semconv_messages(m)
+        except Exception:
+            # Skip only the offending message; a loop-wide try would drop every message after it.
+            logger.debug("conversation serialization failed", exc_info=True)
+            continue
+        for converted in converted_batch:
+            (output_messages if isinstance(converted, OutputMessage) else input_messages).append(converted)
+
+    return {
+        "system_instruction": sys_inst,
+        "input_messages": input_messages,
+        "output_messages": output_messages,
+        "tool_definitions": [bash_tool_definition()],
+    }
+
+
+def build_invoke_agent_payload(agent: Any) -> dict[str, Any]:
+    """Build the invoke_agent payload from the dict entries of ``agent.messages`` (missing/None → none)."""
+    snapshot = list(getattr(agent, "messages", None) or [])
+    dict_messages = list(filter(lambda entry: isinstance(entry, dict), snapshot))
+    return build_invoke_payload_from_messages(dict_messages)
+
+
+def try_fill_entry_payload_from_mini_trajectory() -> dict[str, Any] | None:
+    """Read the default mini trajectory file and build ENTRY / invoke payloads.
+
+    Returns ``None`` if minisweagent or the file is missing, too large, unparsable, or has no dict messages.
+    """
+    try:
+        from minisweagent import global_config_dir  # noqa: PLC0415
+    except Exception:
+        return None
+
+    path = Path(global_config_dir) / "last_mini_run.traj.json"
+    if not path.is_file():
+        return None
+    try:
+        if path.stat().st_size > _TRAJ_MAX_BYTES:
+            logger.warning("trajectory too large for telemetry snapshot: %s", path)
+            return None
+        data = json.loads(path.read_text(encoding="utf-8"))
+    except Exception:
+        logger.debug("failed to read mini trajectory %s", path, exc_info=True)
+        return None
+    msgs = data.get("messages") if isinstance(data, dict) else None  # valid JSON may be a list/scalar
+    dict_msgs = [m for m in msgs if isinstance(m, dict)] if isinstance(msgs, list) else []
+    if not dict_msgs:
+        return None
+    try:
+        return build_invoke_payload_from_messages(dict_msgs)
+    except Exception:
+        logger.debug("trajectory payload build failed", exc_info=True)
+        return None
+
+
+def apply_payload_to_entry_invocation(entry_inv: Any, payload: dict[str, Any]) -> None:
+    """Copy the four payload entries onto *entry_inv* as same-named attributes."""
+    # Fixed key order: a missing key raises KeyError after the earlier ones were already copied.
+    for key in ("input_messages", "output_messages", "system_instruction", "tool_definitions"):
+        setattr(entry_inv, key, payload[key])
diff --git a/instrumentation-loongsuite/loongsuite-instrumentation-minisweagent/src/opentelemetry/instrumentation/minisweagent/internal/delegates.py b/instrumentation-loongsuite/loongsuite-instrumentation-minisweagent/src/opentelemetry/instrumentation/minisweagent/internal/delegates.py
new file mode 100644
index 000000000..140d8d6f9
--- /dev/null
+++ b/instrumentation-loongsuite/loongsuite-instrumentation-minisweagent/src/opentelemetry/instrumentation/minisweagent/internal/delegates.py
@@ -0,0 +1,81 @@
+"""Tracing delegates for Environment (factory-injected wrappers).
+
+LLM-call spans remain with LiteLLM/OpenAI instrumentation; this emits execute_tool.
+"""
+
+from __future__ import annotations
+
+import json
+import logging
+from typing import Any
+
+from opentelemetry import context as context_api
+from opentelemetry.trace import Tracer
+
+logger = logging.getLogger(__name__)
+
+
+def _sanitize_tool_result(payload: dict[str, Any]) -> dict[str, Any]:  # JSON round-trip; repr/error dict fallback
+    try:
+        return json.loads(json.dumps(payload, default=str))  # default=str stringifies non-JSON values
+    except (TypeError, ValueError):
+        logger.debug("tool result not JSON-normalizable", exc_info=True)
+    try:  # reached only when the round-trip above failed
+        return {"repr": repr(payload)}
+    except Exception:
+        return {"error": "unserializable_tool_result"}
+
+
+class TracingEnvironment:
+    """Delegates to inner Environment and emits ARMS-aligned TOOL (execute_tool) spans."""
+
+    __slots__ = ("_inner", "_tracer")
+
+    def __init__(self, inner: Any, tracer: Tracer):  # noqa: ARG002
+        object.__setattr__(self, "_inner", inner)
+        object.__setattr__(self, "_tracer", tracer)
+
+    def __getattr__(self, name: str) -> Any:
+        # Only reached for names not found on the proxy. Read the slot with object.__getattribute__:
+        # on a bare instance (copy/pickle) ``self._inner`` would re-enter __getattr__ and recurse.
+        return getattr(object.__getattribute__(self, "_inner"), name)
+
+    def execute(self, action: dict, cwd: str = "", **kwargs: Any) -> dict[str, Any]:
+        """Run the inner ``execute`` inside an execute_tool invocation; result and errors pass through."""
+        from minisweagent.exceptions import InterruptAgentFlow  # noqa: PLC0415
+        from opentelemetry.util.genai.extended_handler import get_extended_telemetry_handler  # noqa: PLC0415
+        from opentelemetry.util.genai.extended_types import ExecuteToolInvocation  # noqa: PLC0415
+        from opentelemetry.util.genai.types import Error as GenAIError  # noqa: PLC0415
+
+        command = action.get("command", "") if isinstance(action, dict) else ""
+        tool_call_id = action.get("tool_call_id") if isinstance(action, dict) else None
+        han = get_extended_telemetry_handler()
+        inv = ExecuteToolInvocation(
+            tool_name="bash",
+            provider="minisweagent",
+            tool_type="function",
+            tool_call_id=tool_call_id if isinstance(tool_call_id, str) else None,
+            tool_description="Execute a bash command",
+            tool_call_arguments={"command": command},
+        )
+
+        han.start_execute_tool(inv, context=context_api.get_current())
+        try:
+            result = self._inner.execute(action, cwd, **kwargs)
+        except InterruptAgentFlow:
+            inv.tool_call_result = {"interrupted": "InterruptAgentFlow"}
+            han.stop_execute_tool(inv)
+            raise
+        except Exception as exc:
+            inv.tool_call_result = {"error": str(exc)}
+            han.fail_execute_tool(inv, GenAIError(message=str(exc), type=type(exc)))
+            raise
+        except BaseException:
+            # KeyboardInterrupt / SystemExit: still end the invocation, as the agent wrappers do.
+            han.stop_execute_tool(inv)
+            raise
+
+        payload_out = dict(result) if isinstance(result, dict) else {"value": result}
+        inv.tool_call_result = _sanitize_tool_result(payload_out)
+        han.stop_execute_tool(inv)
+        return result
diff --git a/instrumentation-loongsuite/loongsuite-instrumentation-minisweagent/src/opentelemetry/instrumentation/minisweagent/package.py b/instrumentation-loongsuite/loongsuite-instrumentation-minisweagent/src/opentelemetry/instrumentation/minisweagent/package.py
new file mode 100644
index 000000000..238a8c08c
--- /dev/null
+++ b/instrumentation-loongsuite/loongsuite-instrumentation-minisweagent/src/opentelemetry/instrumentation/minisweagent/package.py
@@ -0,0 +1,3 @@
+_instruments = ("mini-swe-agent >= 2.2.0",)
+
+_supports_metrics = True
diff --git a/instrumentation-loongsuite/loongsuite-instrumentation-minisweagent/src/opentelemetry/instrumentation/minisweagent/version.py b/instrumentation-loongsuite/loongsuite-instrumentation-minisweagent/src/opentelemetry/instrumentation/minisweagent/version.py
new file mode 100644
index 000000000..3dc1f76bc
--- /dev/null
+++ b/instrumentation-loongsuite/loongsuite-instrumentation-minisweagent/src/opentelemetry/instrumentation/minisweagent/version.py
@@ -0,0 +1 @@
+__version__ = "0.1.0"
diff --git a/instrumentation-loongsuite/loongsuite-instrumentation-openhands/README.rst b/instrumentation-loongsuite/loongsuite-instrumentation-openhands/README.rst
new file mode 100644
index 000000000..cc8d36bdb
--- /dev/null
+++ b/instrumentation-loongsuite/loongsuite-instrumentation-openhands/README.rst
@@ -0,0 +1,93 @@
+OpenTelemetry OpenHands Instrumentation
+========================================
+
+Automatic OpenTelemetry instrumentation for the legacy OpenHands V0 /
+CodeAct runtime.
+
+What is covered
+---------------
+
+This package wraps the V0 ``python -m openhands.core.main`` execution path:
+
+* ``openhands.core.main.run_controller`` for the ENTRY span.
+* ``openhands.core.loop.run_agent_until_done`` for the AGENT span fallback.
+* ``AgentController.__init__`` / ``AgentController.close`` for lifecycle-bound
+  ENTRY and AGENT spans that survive ``python -m`` from-import binding.
+* ``AgentController._step`` for ReAct STEP spans.
+* ``Runtime.run_action`` for TOOL spans.
+* ``LLM.__init__`` to bridge the current OpenHands context into LiteLLM calls.
+
+Span tree
+---------
+
+::
+
+    ENTRY  enter openhands
+    `-- AGENT invoke_agent codeact
+        |-- STEP  react step [xN]
+        |   |-- LLM   chat {model}
+        |   `-- TOOL  execute_tool {tool_name}
+        `-- STEP  react step [...]
+
+``python -m`` and from-import binding
+-------------------------------------
+
+When OpenHands V0 is launched via ``python -m openhands.core.main``, Python
+executes ``main.py`` as ``__main__``. Symbols imported with ``from ... import``
+can be bound before module-level wrappers are installed, so patching
+``openhands.core.main.run_controller`` is not enough by itself.
+
+To keep ENTRY and AGENT spans reliable, this instrumentation primarily opens
+them from ``AgentController.__init__`` and closes them from
+``AgentController.close``. The module-level wrappers remain as a fallback for
+programmatic invocations.
+
+Cross-thread context bridge
+---------------------------
+
+OpenHands V0 may execute controller steps and runtime tool calls in worker
+threads with fresh asyncio loops. The instrumentation stores the active OTel
+context by session id and re-attaches it in STEP, TOOL, and LLM bridge wrappers
+so the trace remains:
+
+``ENTRY -> AGENT -> STEP -> (LLM / TOOL)``.
+
+Semantic-convention I/O capture
+-------------------------------
+
+ENTRY and STEP spans emit ``input.value`` / ``output.value`` and GenAI semantic
+attributes where applicable. AGENT spans use GenAI-native attributes for
+messages without OpenInference ``input.value`` / ``output.value`` mirrors.
+TOOL spans never set ``input.value`` / ``output.value``; they always set
+``gen_ai.tool.call.arguments`` (JSON object string, ``"{}"`` when empty) and
+``gen_ai.tool.call.result``.
+
+* **ENTRY** emits ``gen_ai.input.messages`` and ``gen_ai.output.messages`` using
+  the ARMS parts-based message schema.
+* **AGENT** emits ``gen_ai.input.messages``, ``gen_ai.output.messages``,
+  ``gen_ai.system_instructions``, and ``gen_ai.tool.definitions``.
+* **STEP** emits recent input history and the pending assistant/tool-call
+  output for the ReAct round.
+* **TOOL** emits ``gen_ai.tool.name``, ``gen_ai.tool.type``,
+  ``gen_ai.tool.call.id``, ``gen_ai.tool.description``,
+  ``gen_ai.tool.call.arguments``, and ``gen_ai.tool.call.result``.
+
+Usage
+-----
+
+.. code:: python
+
+    from opentelemetry.instrumentation.openhands import OpenHandsInstrumentor
+
+    OpenHandsInstrumentor().instrument()
+
+Configuration
+-------------
+
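+Environment variables (read once when the instrumentation is imported;
+``true``, ``1``, ``yes`` and ``on`` are treated as true, case-insensitively,
+and any other value as false):
+
+* ``OTEL_INSTRUMENTATION_OPENHANDS_ENABLED`` (default ``true``) -- master
+  switch; when false nothing is patched.
+* ``OTEL_INSTRUMENTATION_OPENHANDS_OUTER_SPANS`` (default ``true``) -- when
+  false the OpenHands wrappers (ENTRY, AGENT, STEP, TOOL and the LLM context
+  bridge) are not installed.
+* ``OTEL_INSTRUMENTATION_OPENHANDS_AUTO_INSTRUMENT_LITELLM`` (default
+  ``true``) -- also enable the LiteLLM instrumentor when it is installed.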
+
+I/O capture is always on and content is emitted in full.
diff --git a/instrumentation-loongsuite/loongsuite-instrumentation-openhands/pyproject.toml b/instrumentation-loongsuite/loongsuite-instrumentation-openhands/pyproject.toml
new file mode 100644
index 000000000..b9f0ae7f4
--- /dev/null
+++ b/instrumentation-loongsuite/loongsuite-instrumentation-openhands/pyproject.toml
@@ -0,0 +1,50 @@
+[build-system]
+requires = ["hatchling"]
+build-backend = "hatchling.build"
+
+[project]
+name = "loongsuite-instrumentation-openhands"
+dynamic = ["version"]
+description = "LoongSuite OpenHands Instrumentation"
+readme = "README.rst"
+license = "Apache-2.0"
+requires-python = ">=3.10"
+authors = [
+  { name = "OpenTelemetry Authors", email = "cncf-opentelemetry-contributors@lists.cncf.io" },
+]
+classifiers = [
+  "Development Status :: 4 - Beta",
+  "Intended Audience :: Developers",
+  "License :: OSI Approved :: Apache Software License",
+  "Programming Language :: Python",
+  "Programming Language :: Python :: 3",
+  "Programming Language :: Python :: 3.10",
+  "Programming Language :: Python :: 3.11",
+  "Programming Language :: Python :: 3.12",
+  "Programming Language :: Python :: 3.13",
+]
+# The package imports ``opentelemetry.trace`` and
+# ``opentelemetry.instrumentation.instrumentor.BaseInstrumentor`` at module
+# import time, so both distributions must be declared alongside wrapt.
+dependencies = [
+  "opentelemetry-api >= 1.37.0",
+  "opentelemetry-instrumentation >= 0.58b0",
+  "wrapt >= 1.0.0, < 2.0.0",
+]
+
+[project.optional-dependencies]
+instruments = []
+
+[project.entry-points.opentelemetry_instrumentor]
+openhands = "opentelemetry.instrumentation.openhands:OpenHandsInstrumentor"
+
+[project.urls]
+Homepage = "https://github.com/alibaba/loongsuite-python-agent/tree/main/instrumentation-loongsuite/loongsuite-instrumentation-openhands"
+Repository = "https://github.com/alibaba/loongsuite-python-agent"
+
+[tool.hatch.version]
+path = "src/opentelemetry/instrumentation/openhands/version.py"
+
+[tool.hatch.build.targets.sdist]
+include = [
+  "/src",
+  "/tests",
+]
+
+[tool.hatch.build.targets.wheel]
+packages = ["src/opentelemetry"]
diff --git a/instrumentation-loongsuite/loongsuite-instrumentation-openhands/src/opentelemetry/instrumentation/openhands/__init__.py b/instrumentation-loongsuite/loongsuite-instrumentation-openhands/src/opentelemetry/instrumentation/openhands/__init__.py
new file mode 100644
index 000000000..a02a7d3b3
--- /dev/null
+++ b/instrumentation-loongsuite/loongsuite-instrumentation-openhands/src/opentelemetry/instrumentation/openhands/__init__.py
@@ -0,0 +1,265 @@
+"""OpenTelemetry OpenHands Instrumentation.
+
+Wraps the legacy V0 (CodeAct + AgentController + Runtime) path:
+
+* V0 — ``python -m openhands.core.main``. We add
+  ``ENTRY → AGENT → STEP → TOOL`` directly on top of the controller / runtime
+  call chain. LLM spans come from the bundled LiteLLM instrumentor.
+
+Usage
+-----
+
+.. code:: python
+
+    from opentelemetry.instrumentation.openhands import OpenHandsInstrumentor
+
+    OpenHandsInstrumentor().instrument()
+"""
+
+from __future__ import annotations
+
+import importlib
+import logging
+from typing import Any, Collection
+
+from opentelemetry import trace as trace_api
+from opentelemetry.instrumentation.instrumentor import BaseInstrumentor
+from wrapt import wrap_function_wrapper
+
+from opentelemetry.instrumentation.openhands.config import (
+    OTEL_INSTRUMENTATION_OPENHANDS_AUTO_INSTRUMENT_LITELLM,
+    OTEL_INSTRUMENTATION_OPENHANDS_ENABLED,
+    OTEL_INSTRUMENTATION_OPENHANDS_OUTER_SPANS,
+)
+from opentelemetry.instrumentation.openhands.package import _instruments
+from opentelemetry.instrumentation.openhands.version import __version__
+
+logger = logging.getLogger(__name__)
+
+__all__ = ["OpenHandsInstrumentor"]
+
+
+# ---------------------------------------------------------------------------
+# Wrap-point registry — single source of truth shared with _uninstrument.
+# Entries: (module, qualified_name)
+# ---------------------------------------------------------------------------
+
+_PATCH_TARGETS: list[tuple[str, str]] = [
+    ("openhands.core.main", "run_controller"),
+    ("openhands.core.loop", "run_agent_until_done"),
+    # AgentController.__init__ / .close are the *primary* ENTRY+AGENT
+    # span source for V0 — they're class methods, so they're patchable
+    # regardless of the from-import binding problem in main.py
+    # (see v0_wrappers.AgentControllerInitWrapper docstring).
+    (
+        "openhands.controller.agent_controller",
+        "AgentController.__init__",
+    ),
+    (
+        "openhands.controller.agent_controller",
+        "AgentController.close",
+    ),
+    (
+        "openhands.controller.agent_controller",
+        "AgentController._step",
+    ),
+    ("openhands.runtime.base", "Runtime.run_action"),
+    # LLM context bridge — re-attaches the current sid-stashed context
+    # (STEP while a step is open) onto every ``LLM.completion`` invocation
+    # so the downstream LiteLLM / Aliyun GenAI auto-instrumentation emits
+    # the LLM span as a child of STEP and shares its ``trace_id``.
+    ("openhands.llm.llm", "LLM.__init__"),
+]
+
+
+def _module_importable(module: str) -> bool:
+    try:
+        importlib.import_module(module)
+        return True
+    except ModuleNotFoundError:
+        return False
+    except Exception:
+        # Other import errors should still let the wrap attempt surface a
+        # warning.
+        return True
+
+
+def _safe_wrap(module: str, name: str, wrapper: Any) -> bool:
+    """Patch ``module.name`` with ``wrapper``; classify failures sensibly."""
+    if not _module_importable(module):
+        # OpenHands versions can move modules around. Missing V0 modules
+        # should not prevent applications from starting.
+        logger.debug(
+            "OpenHands instrumentation: module %s not importable, skipping %s",
+            module,
+            name,
+        )
+        return False
+    try:
+        wrap_function_wrapper(module=module, name=name, wrapper=wrapper)
+        logger.debug("OpenHands instrumentation: wrapped %s.%s", module, name)
+        return True
+    except (AttributeError, ImportError) as exc:
+        # Attribute missing inside the module — usually a version-skew issue.
+        logger.warning(
+            "OpenHands instrumentation: could not wrap %s.%s: %s",
+            module,
+            name,
+            exc,
+        )
+        return False
+    except Exception as exc:  # pragma: no cover - defensive
+        logger.warning(
+            "OpenHands instrumentation: unexpected error wrapping %s.%s: %s",
+            module,
+            name,
+            exc,
+        )
+        return False
+
+
+def _safe_unwrap(module: str, qualname: str) -> None:
+    """Unwrap a previously ``wrapt``-patched function or method."""
+    try:
+        mod = importlib.import_module(module)
+    except Exception:
+        return
+    parts = qualname.split(".")
+    obj: Any = mod
+    parents: list[Any] = [mod]
+    try:
+        for p in parts:
+            obj = getattr(obj, p)
+            parents.append(obj)
+    except Exception:
+        return
+    if not hasattr(obj, "__wrapped__"):
+        return
+    parent = parents[-2]
+    try:
+        setattr(parent, parts[-1], obj.__wrapped__)
+    except Exception:
+        pass
+
+
+class OpenHandsInstrumentor(BaseInstrumentor):
+    """Instrumentation entry point for OpenHands V0."""
+
+    def instrumentation_dependencies(self) -> Collection[str]:
+        return _instruments
+
+    def _instrument(self, **kwargs: Any) -> None:
+        if not OTEL_INSTRUMENTATION_OPENHANDS_ENABLED:
+            logger.info("OpenHands instrumentation disabled via env var")
+            return
+
+        tracer_provider = kwargs.get("tracer_provider")
+        tracer = trace_api.get_tracer(
+            __name__, __version__, tracer_provider=tracer_provider
+        )
+
+        from opentelemetry.instrumentation.openhands.internal.v0_wrappers import (
+            AgentControllerCloseWrapper,
+            AgentControllerInitWrapper,
+            AgentControllerStepWrapper,
+            LLMInitWrapper,
+            RunAgentUntilDoneWrapper,
+            RunControllerWrapper,
+            RuntimeRunActionWrapper,
+        )
+
+        if OTEL_INSTRUMENTATION_OPENHANDS_OUTER_SPANS:
+            self._install_v0_patches(tracer, {
+                "run_controller": RunControllerWrapper,
+                "run_agent_until_done": RunAgentUntilDoneWrapper,
+                "agent_init": AgentControllerInitWrapper,
+                "agent_close": AgentControllerCloseWrapper,
+                "agent_step": AgentControllerStepWrapper,
+                "runtime_run_action": RuntimeRunActionWrapper,
+                "llm_init": LLMInitWrapper,
+            })
+
+        # Auto-enable bundled LiteLLM instrumentation so SDK / V0 LLM
+        # ``litellm.completion()`` calls become LLM spans.
+        if OTEL_INSTRUMENTATION_OPENHANDS_AUTO_INSTRUMENT_LITELLM:
+            self._maybe_enable_litellm(**kwargs)
+
+    def _install_v0_patches(self, tracer, factories) -> None:
+        RunControllerWrapper = factories["run_controller"]
+        RunAgentUntilDoneWrapper = factories["run_agent_until_done"]
+        AgentControllerInitWrapper = factories["agent_init"]
+        AgentControllerCloseWrapper = factories["agent_close"]
+        AgentControllerStepWrapper = factories["agent_step"]
+        RuntimeRunActionWrapper = factories["runtime_run_action"]
+        LLMInitWrapper = factories["llm_init"]
+
+        # `run_controller` and `run_agent_until_done` patches are best-effort:
+        # they only fire when run_controller is called via the proper module
+        # path (programmatic / test). When OpenHands is launched via
+        # ``python -m openhands.core.main``, the from-import binding in
+        # main.py bypasses these patches — the AgentController.__init__ /
+        # .close patches below take over and produce ENTRY+AGENT spans
+        # reliably (class methods are immune to from-import binding).
+        _safe_wrap(
+            "openhands.core.main",
+            "run_controller",
+            RunControllerWrapper(tracer),
+        )
+        _safe_wrap(
+            "openhands.core.loop",
+            "run_agent_until_done",
+            RunAgentUntilDoneWrapper(tracer),
+        )
+        _safe_wrap(
+            "openhands.controller.agent_controller",
+            "AgentController.__init__",
+            AgentControllerInitWrapper(tracer),
+        )
+        _safe_wrap(
+            "openhands.controller.agent_controller",
+            "AgentController.close",
+            AgentControllerCloseWrapper(tracer),
+        )
+        _safe_wrap(
+            "openhands.controller.agent_controller",
+            "AgentController._step",
+            AgentControllerStepWrapper(tracer),
+        )
+        _safe_wrap(
+            "openhands.runtime.base",
+            "Runtime.run_action",
+            RuntimeRunActionWrapper(tracer),
+        )
+        # LLM context bridge — patches ``LLM.__init__`` so every instance's
+        # ``self._completion`` re-attaches the latest sid-stashed context.
+        # See ``LLMInitWrapper`` for why we need this even though the LLM
+        # call is synchronous: in real OpenHands deployments LiteLLM ends
+        # up creating its span in a thread / context that ``contextvars``
+        # didn't propagate STEP into, so we re-attach explicitly.
+        _safe_wrap(
+            "openhands.llm.llm",
+            "LLM.__init__",
+            LLMInitWrapper(tracer),
+        )
+
+    def _maybe_enable_litellm(self, **kwargs: Any) -> None:
+        try:
+            from opentelemetry.instrumentation.litellm import (
+                LiteLLMInstrumentor,
+            )
+        except Exception as exc:
+            logger.debug(
+                "LiteLLM instrumentation not available, skipping: %s", exc
+            )
+            return
+        try:
+            instr = LiteLLMInstrumentor()
+            already = getattr(instr, "_is_instrumented_by_opentelemetry", False)
+            if not already:
+                instr.instrument(**kwargs)
+        except Exception as exc:
+            logger.debug("Could not auto-enable LiteLLM instrumentation: %s", exc)
+
+    def _uninstrument(self, **kwargs: Any) -> None:
+        for module, qualname in _PATCH_TARGETS:
+            _safe_unwrap(module, qualname)
diff --git a/instrumentation-loongsuite/loongsuite-instrumentation-openhands/src/opentelemetry/instrumentation/openhands/config.py b/instrumentation-loongsuite/loongsuite-instrumentation-openhands/src/opentelemetry/instrumentation/openhands/config.py
new file mode 100644
index 000000000..4f5ad38db
--- /dev/null
+++ b/instrumentation-loongsuite/loongsuite-instrumentation-openhands/src/opentelemetry/instrumentation/openhands/config.py
@@ -0,0 +1,25 @@
+"""Environment-variable driven configuration for the OpenHands instrumentation."""
+
+from __future__ import annotations
+
+import os
+
+
+def _bool_env(name: str, default: bool) -> bool:
+    val = os.getenv(name)
+    if val is None:
+        return default
+    return val.strip().lower() in {"true", "1", "yes", "on"}
+
+
+OTEL_INSTRUMENTATION_OPENHANDS_ENABLED = _bool_env(
+    "OTEL_INSTRUMENTATION_OPENHANDS_ENABLED", True
+)
+
+OTEL_INSTRUMENTATION_OPENHANDS_OUTER_SPANS = _bool_env(
+    "OTEL_INSTRUMENTATION_OPENHANDS_OUTER_SPANS", True
+)
+
+OTEL_INSTRUMENTATION_OPENHANDS_AUTO_INSTRUMENT_LITELLM = _bool_env(
+    "OTEL_INSTRUMENTATION_OPENHANDS_AUTO_INSTRUMENT_LITELLM", True
+)
diff --git a/instrumentation-loongsuite/loongsuite-instrumentation-openhands/src/opentelemetry/instrumentation/openhands/internal/__init__.py b/instrumentation-loongsuite/loongsuite-instrumentation-openhands/src/opentelemetry/instrumentation/openhands/internal/__init__.py
new file mode 100644
index 000000000..7b2c8b6a1
--- /dev/null
+++ b/instrumentation-loongsuite/loongsuite-instrumentation-openhands/src/opentelemetry/instrumentation/openhands/internal/__init__.py
@@ -0,0 +1 @@
+"""Internal helpers for OpenHands instrumentation."""
diff --git a/instrumentation-loongsuite/loongsuite-instrumentation-openhands/src/opentelemetry/instrumentation/openhands/internal/constants.py b/instrumentation-loongsuite/loongsuite-instrumentation-openhands/src/opentelemetry/instrumentation/openhands/internal/constants.py
new file mode 100644
index 000000000..6d99a6820
--- /dev/null
+++ b/instrumentation-loongsuite/loongsuite-instrumentation-openhands/src/opentelemetry/instrumentation/openhands/internal/constants.py
@@ -0,0 +1,12 @@
+"""Constant attribute keys & framework identity used across wrappers."""
+
+from __future__ import annotations
+
+GEN_AI_FRAMEWORK = "gen_ai.framework"
+GEN_AI_SPAN_KIND = "gen_ai.span.kind"
+
+FRAMEWORK_NAME = "openhands"
+
+# OpenHands-specific span attributes (namespaced to avoid clashing with the
+# generic GenAI semconv attributes already provided by upstream).
+OH_INITIAL_MESSAGE_PREVIEW = "openhands.initial_message.preview"
diff --git a/instrumentation-loongsuite/loongsuite-instrumentation-openhands/src/opentelemetry/instrumentation/openhands/internal/session_context.py b/instrumentation-loongsuite/loongsuite-instrumentation-openhands/src/opentelemetry/instrumentation/openhands/internal/session_context.py
new file mode 100644
index 000000000..534d3e611
--- /dev/null
+++ b/instrumentation-loongsuite/loongsuite-instrumentation-openhands/src/opentelemetry/instrumentation/openhands/internal/session_context.py
@@ -0,0 +1,196 @@
+"""Cross-thread / cross-loop OTel context bridge keyed by OpenHands session id.
+
+Why this exists
+---------------
+
+OpenHands V0's ``EventStream`` delivers events to subscribers via a
+``ThreadPoolExecutor``. The ``AgentController.on_event`` callback then runs
+
+.. code:: python
+
+    asyncio.get_event_loop().run_until_complete(self._on_event(event))
+
+inside a *worker thread*, which spins up a brand-new asyncio loop with a
+fresh ``contextvars.Context``. This means none of the OTel context (tracer
+spans / baggage) attached on the main coroutine in ``run_controller`` is
+visible inside ``AgentController._step`` or ``Runtime.run_action`` — every
+STEP / TOOL span starts at the **trace root**, fragmenting the trace into
+many disconnected pieces.
+
+This module bridges that gap. We snapshot the OTel context at entry-time
+(``run_controller`` / ``run_agent_until_done``) under the controller's
+session id, and the STEP / TOOL wrappers re-attach the snapshot before
+opening their spans so every span shares a single ``trace_id`` rooted at
+the ENTRY span.
+
+The store is keyed by **session id (sid)** so concurrent benchmark
+sessions stay isolated.
+"""
+
+from __future__ import annotations
+
+import threading
+from typing import Optional
+
+from opentelemetry import context as otel_context
+
+_lock = threading.Lock()
+# Map session id -> OTel Context object. The Context contains the active
+# Span (and any baggage / suppression flags). Re-attaching it makes the
+# stored span the *current* span for whatever thread/loop attaches it.
+_session_contexts: dict[str, otel_context.Context] = {}
+
+# Map session id -> { tool_name: tool_definition_dict }. Captured at
+# AGENT span open from ``controller.agent.tools`` and consumed by the
+# TOOL wrapper to populate ``gen_ai.tool.description`` and friends — the
+# Runtime instance does not have direct access to the agent's tool list.
+_session_tool_registry: dict[str, dict[str, dict]] = {}
+
+# Tracks the most-recent sid we stored a context for. Used as a fallback
+# when a hook point (typically ``Runtime.run_action``) cannot locate the
+# session id from its arguments — in single-session CLI runs this is
+# always the right answer.
+_last_sid: Optional[str] = None
+
+
+def store_context(sid: Optional[str], ctx: otel_context.Context) -> None:
+    """Stash ``ctx`` under ``sid``. Updates ``_last_sid``."""
+    if not sid:
+        return
+    global _last_sid
+    with _lock:
+        _session_contexts[sid] = ctx
+        _last_sid = sid
+
+
+def get_context(sid: Optional[str]) -> Optional[otel_context.Context]:
+    """Return the stashed context for ``sid``, falling back to the last sid."""
+    with _lock:
+        if sid and sid in _session_contexts:
+            return _session_contexts[sid]
+        if _last_sid and _last_sid in _session_contexts:
+            return _session_contexts[_last_sid]
+        return None
+
+
+def clear_context(sid: Optional[str]) -> None:
+    if not sid:
+        return
+    global _last_sid
+    with _lock:
+        _session_contexts.pop(sid, None)
+        _session_tool_registry.pop(sid, None)
+        if _last_sid == sid:
+            _last_sid = None
+
+
+def clear_all() -> None:
+    """Drop everything (only used by tests)."""
+    global _last_sid
+    with _lock:
+        _session_contexts.clear()
+        _session_tool_registry.clear()
+        _last_sid = None
+
+
+# ---------------------------------------------------------------------------
+# Tool registry (per-sid)
+# ---------------------------------------------------------------------------
+
+
+def store_tool_registry(sid: Optional[str], tools: object) -> None:
+    """Index ``tools`` by name and stash under ``sid``.
+
+    ``tools`` is whatever ``controller.agent.tools`` exposes — typically a
+    list of LiteLLM ``ChatCompletionToolParam`` dicts of the form
+    ``{"type": "function", "function": {"name": ..., "description": ..., ...}}``.
+    Anything that doesn't fit that shape is best-effort skipped.
+    """
+    if not sid or not tools:
+        return
+    registry: dict[str, dict] = {}
+    try:
+        for t in tools:  # type: ignore[union-attr]
+            try:
+                if isinstance(t, dict):
+                    fn = t.get("function") or {}
+                    name = fn.get("name") if isinstance(fn, dict) else None
+                else:
+                    fn = getattr(t, "function", None)
+                    name = getattr(fn, "name", None) if fn is not None else None
+                    # Normalize to a dict so the consumer doesn't need type-knowledge.
+                    if name and not isinstance(t, dict):
+                        t = {
+                            "type": getattr(t, "type", "function"),
+                            "function": {
+                                "name": name,
+                                "description": getattr(fn, "description", "") or "",
+                                "parameters": getattr(fn, "parameters", None) or {},
+                            },
+                        }
+                if name:
+                    registry[str(name)] = t
+            except Exception:
+                continue
+    except TypeError:
+        return
+    if not registry:
+        return
+    with _lock:
+        _session_tool_registry[sid] = registry
+
+
+def get_tool_definition(sid: Optional[str], name: Optional[str]) -> Optional[dict]:
+    """Look up a single tool's definition (dict) by name, sid-scoped."""
+    if not name:
+        return None
+    with _lock:
+        if sid and sid in _session_tool_registry:
+            return _session_tool_registry[sid].get(name)
+        # Fallback to the most-recent session — single-CLI-run case.
+        if _last_sid and _last_sid in _session_tool_registry:
+            return _session_tool_registry[_last_sid].get(name)
+        return None
+
+
+def get_tool_registry(sid: Optional[str]) -> Optional[dict[str, dict]]:
+    """Return the full ``{name: definition}`` registry for ``sid``."""
+    with _lock:
+        if sid and sid in _session_tool_registry:
+            return dict(_session_tool_registry[sid])
+        if _last_sid and _last_sid in _session_tool_registry:
+            return dict(_session_tool_registry[_last_sid])
+        return None
+
+
+class AttachedSession:
+    """Context manager that attaches the stashed context for ``sid``.
+
+    Usage::
+
+        with AttachedSession(sid):
+            span = tracer.start_span(...)
+            # span is parented under whatever the stashed context contains
+
+    No-op when no stash exists for the given sid.
+    """
+
+    __slots__ = ("_sid", "_token")
+
+    def __init__(self, sid: Optional[str]):
+        self._sid = sid
+        self._token = None
+
+    def __enter__(self) -> "AttachedSession":
+        ctx = get_context(self._sid)
+        if ctx is not None:
+            self._token = otel_context.attach(ctx)
+        return self
+
+    def __exit__(self, exc_type, exc_val, exc_tb) -> None:
+        if self._token is not None:
+            try:
+                otel_context.detach(self._token)
+            except Exception:
+                pass
+            self._token = None
diff --git a/instrumentation-loongsuite/loongsuite-instrumentation-openhands/src/opentelemetry/instrumentation/openhands/internal/utils.py b/instrumentation-loongsuite/loongsuite-instrumentation-openhands/src/opentelemetry/instrumentation/openhands/internal/utils.py
new file mode 100644
index 000000000..7354bb8b2
--- /dev/null
+++ b/instrumentation-loongsuite/loongsuite-instrumentation-openhands/src/opentelemetry/instrumentation/openhands/internal/utils.py
@@ -0,0 +1,190 @@
+"""Small attribute / argument extraction helpers shared by the wrappers."""
+
+from __future__ import annotations
+
+import json
+from typing import Any
+
+
+def safe_str(value: Any) -> str:
+    """Best-effort string conversion that never raises."""
+    if value is None:
+        return ""
+    try:
+        return str(value)
+    except Exception:
+        return ""
+
+
+def preview(text: Any, max_len: int | None = None) -> str:
+    """Return a string preview of *text* (kept for API compatibility).
+
+    Truncation is no longer applied — captured content is emitted in
+    full so dashboards never lose information. ``max_len`` is accepted
+    but ignored.
+    """
+    return safe_str(text)
+
+
+def maybe_preview(text: Any) -> str:
+    """Alias for :func:`preview` — kept for API compatibility."""
+    return preview(text)
+
+
+def safe_get_attr(obj: Any, *names: str, default: Any = None) -> Any:
+    """Return the first non-None attribute among *names* on *obj*."""
+    for name in names:
+        if obj is None:
+            return default
+        try:
+            v = getattr(obj, name, None)
+        except Exception:
+            v = None
+        if v is not None:
+            return v
+    return default
+
+
+def serialize_message(message: Any) -> str:
+    """Best-effort serialize an OpenHands message-like object to text."""
+    if message is None:
+        return ""
+    if isinstance(message, str):
+        return message
+    text_parts: list[str] = []
+    for attr in ("text", "content", "value"):
+        v = safe_get_attr(message, attr)
+        if isinstance(v, str) and v:
+            return v
+        if isinstance(v, list):
+            for item in v:
+                t = safe_get_attr(item, "text", "content")
+                if isinstance(t, str) and t:
+                    text_parts.append(t)
+    if text_parts:
+        return "\n".join(text_parts)
+    return safe_str(message)
+
+
+def extract_uuid_str(value: Any) -> str:
+    """Convert a UUID-like value to its hex/string form, returning ''."""
+    if value is None:
+        return ""
+    hex_attr = getattr(value, "hex", None)
+    if isinstance(hex_attr, str) and hex_attr:
+        return hex_attr
+    return safe_str(value)
+
+
+# ---------------------------------------------------------------------------
+# Semconv I/O serialization (input.value / output.value)
+# ---------------------------------------------------------------------------
+
+
+def _to_jsonable(obj: Any, depth: int = 0, max_depth: int = 3) -> Any:
+    """Best-effort convert ``obj`` into something json.dumps can serialize."""
+    if obj is None or isinstance(obj, (bool, int, float, str)):
+        return obj
+    if depth >= max_depth:
+        return safe_str(obj)
+    if isinstance(obj, dict):
+        out: dict[str, Any] = {}
+        for k, v in obj.items():
+            try:
+                out[safe_str(k)] = _to_jsonable(v, depth + 1, max_depth)
+            except Exception:
+                out[safe_str(k)] = safe_str(v)
+        return out
+    if isinstance(obj, (list, tuple, set)):
+        return [_to_jsonable(v, depth + 1, max_depth) for v in obj]
+    # Pydantic v2
+    if hasattr(obj, "model_dump"):
+        try:
+            return _to_jsonable(obj.model_dump(), depth + 1, max_depth)
+        except Exception:
+            pass
+    # Dataclass / generic object
+    if hasattr(obj, "__dict__"):
+        try:
+            d = {
+                k: v
+                for k, v in vars(obj).items()
+                if not k.startswith("_")
+                and not callable(v)
+            }
+            if d:
+                return _to_jsonable(d, depth + 1, max_depth)
+        except Exception:
+            pass
+    return safe_str(obj)
+
+
+def to_json_str(obj: Any, max_len: int | None = None) -> str:
+    """Convert ``obj`` to a JSON string. Empty string on failure.
+
+    No truncation is applied — captured content is emitted in full.
+    ``max_len`` is accepted but ignored (kept for API compatibility).
+    """
+    try:
+        jsonable = _to_jsonable(obj)
+        s = json.dumps(jsonable, ensure_ascii=False, default=safe_str)
+    except Exception:
+        s = safe_str(obj)
+    return s or ""
+
+
+def maybe_to_json_str(obj: Any, max_len: int | None = None) -> str:
+    """Alias for :func:`to_json_str` — kept for API compatibility."""
+    return to_json_str(obj, max_len)
+
+
+def messages_to_genai_input(messages: Any) -> str:
+    """Serialize a chat-style ``messages`` list for ``gen_ai.input.messages``.
+
+    Each item is normalized into ``{"role": ..., "content": ...}``. Keeps
+    ``tool_calls`` when present.
+    """
+    if not isinstance(messages, list):
+        return ""
+    norm: list[dict[str, Any]] = []
+    for m in messages:
+        role = safe_get_attr(m, "role")
+        content = safe_get_attr(m, "content")
+        if role is None and content is None and isinstance(m, dict):
+            role = m.get("role")
+            content = m.get("content")
+        if isinstance(content, list):
+            content = "".join(
+                safe_str(safe_get_attr(c, "text") or safe_get_attr(c, "content") or c)
+                for c in content
+            )
+        item: dict[str, Any] = {"role": safe_str(role) or "user", "content": safe_str(content)}
+        tool_calls = safe_get_attr(m, "tool_calls")
+        if tool_calls:
+            item["tool_calls"] = _to_jsonable(tool_calls)
+        norm.append(item)
+    return to_json_str(norm)
+
+
+def action_to_genai_output(action: Any) -> str:
+    """Serialize an OpenHands V0 ``Action`` into a GenAI-style assistant message."""
+    if action is None:
+        return ""
+    action_type = safe_str(safe_get_attr(action, "action") or "")
+    thought = safe_str(safe_get_attr(action, "thought") or "")
+    item: dict[str, Any] = {"role": "assistant"}
+    if thought:
+        item["content"] = thought
+    args: dict[str, Any] = {}
+    for key in ("command", "code", "path", "url", "content", "task_list", "name", "arguments"):
+        v = safe_get_attr(action, key)
+        if v not in (None, "", []):
+            args[key] = _to_jsonable(v)
+    if action_type or args:
+        item["tool_calls"] = [
+            {
+                "type": "function",
+                "function": {"name": action_type or "agent.action", "arguments": args},
+            }
+        ]
+    return to_json_str([item])
diff --git a/instrumentation-loongsuite/loongsuite-instrumentation-openhands/src/opentelemetry/instrumentation/openhands/internal/v0_wrappers.py b/instrumentation-loongsuite/loongsuite-instrumentation-openhands/src/opentelemetry/instrumentation/openhands/internal/v0_wrappers.py
new file mode 100644
index 000000000..93212edd5
--- /dev/null
+++ b/instrumentation-loongsuite/loongsuite-instrumentation-openhands/src/opentelemetry/instrumentation/openhands/internal/v0_wrappers.py
@@ -0,0 +1,2545 @@
+"""Wrappers for the OpenHands **V0** (Legacy CodeAct) architecture.
+
+Trace tree
+----------
+
+::
+
+    ENTRY enter openhands                       (openhands.core.main.run_controller)
+    `-- AGENT invoke_agent codeact              (openhands.core.loop.run_agent_until_done)
+        |-- STEP react step [×N]                (openhands.controller.agent_controller.AgentController._step)
+        |   `-- LLM chat {model}                (litellm — covered by litellm instrumentor)
+        `-- TOOL execute_tool {tool_name}       (openhands.runtime.base.Runtime.run_action)
+
+Context propagation across threads
+----------------------------------
+
+OpenHands V0's ``EventStream`` delivers events via ``ThreadPoolExecutor``,
+and ``AgentController.on_event`` then runs the actual handler with a
+*brand-new* asyncio loop in a worker thread:
+
+.. code:: python
+
+    asyncio.get_event_loop().run_until_complete(self._on_event(event))
+
+Python ``contextvars`` do NOT propagate from the main coroutine into these
+worker threads, so ``AgentController._step`` and ``Runtime.run_action``
+would otherwise start *root* spans with fresh ``trace_id``s, fragmenting
+the trace into many disconnected pieces.
+
+To fix that, we use :mod:`session_context` as a process-wide bridge: the
+ENTRY wrapper stashes the OTel context (carrying the ENTRY+AGENT span
+chain) keyed by session id, and STEP / TOOL wrappers re-attach it before
+opening their span. The result is one trace per session id with the
+correct parent-child links.
+
+I/O capture
+-----------
+
+ENTRY / STEP spans set:
+
+* ``input.value`` and ``output.value`` (OpenInference convention)
+* ``input.mime_type`` / ``output.mime_type``
+* ``gen_ai.input.messages`` / ``gen_ai.output.messages`` where the GenAI
+  semconv applies (LLM-style messages + assistant tool calls)
+
+AGENT spans set GenAI message attributes without OpenInference
+``input.value`` / ``output.value`` mirrors.
+
+TOOL spans set ``gen_ai.tool.call.arguments`` (always, including ``"{}"``
+when empty) and ``gen_ai.tool.call.result`` for observations. They do
+not set OpenInference ``input.value`` / ``output.value``.
+
+Capture is always on and content is emitted untruncated.
+"""
+
+from __future__ import annotations
+
+import json
+import logging
+from typing import Any
+
+from opentelemetry import context as otel_context
+from opentelemetry import trace as trace_api
+from opentelemetry.semconv._incubating.attributes import (
+    gen_ai_attributes as GenAI,
+)
+from opentelemetry.trace import (
+    SpanKind,
+    Status,
+    StatusCode,
+    Tracer,
+    set_span_in_context,
+)
+
+from opentelemetry.instrumentation.openhands.config import (
+    OTEL_INSTRUMENTATION_OPENHANDS_OUTER_SPANS,
+)
+from opentelemetry.instrumentation.openhands.internal.constants import (
+    FRAMEWORK_NAME,
+    GEN_AI_FRAMEWORK,
+    GEN_AI_SPAN_KIND,
+    OH_INITIAL_MESSAGE_PREVIEW,
+)
+from opentelemetry.instrumentation.openhands.internal.session_context import (
+    AttachedSession,
+    clear_context,
+    get_context,
+    get_tool_definition,
+    store_context,
+    store_tool_registry,
+)
+from opentelemetry.instrumentation.openhands.internal.utils import (
+    action_to_genai_output,
+    maybe_preview,
+    maybe_to_json_str,
+    messages_to_genai_input,
+    safe_get_attr,
+    safe_str,
+    serialize_message,
+    to_json_str,
+)
+
+logger = logging.getLogger(__name__)
+
+
+# Constants -----------------------------------------------------------------
+
+# Span attribute keys used by the wrappers in this module. Most live under
+# the ``openhands.*`` namespace.
+OH_AGENT_NAME = "openhands.agent.name"
+# NOTE: despite the ``OH_`` prefix, this key is in the ``gen_ai.*`` namespace.
+OH_REACT_ROUND = "gen_ai.react.round"
+OH_AGENT_STATE = "openhands.agent.state"
+OH_RUNTIME_NAME = "openhands.runtime.name"
+OH_ACTION_TYPE = "openhands.action.type"
+OH_OBSERVATION_TYPE = "openhands.observation.type"
+OH_HISTORY_LENGTH = "openhands.history.length"
+
+# OpenInference / GenAI common I/O attribute keys
+INPUT_VALUE = "input.value"
+INPUT_MIME = "input.mime_type"
+OUTPUT_VALUE = "output.value"
+OUTPUT_MIME = "output.mime_type"
+GEN_AI_INPUT_MESSAGES = "gen_ai.input.messages"
+GEN_AI_OUTPUT_MESSAGES = "gen_ai.output.messages"
+GEN_AI_SYSTEM = "gen_ai.system"
+GEN_AI_AGENT_ID = "gen_ai.agent.id"
+GEN_AI_CONVERSATION_ID = "gen_ai.conversation.id"
+GEN_AI_SESSION_ID = "gen_ai.session.id"
+GEN_AI_REQUEST_MODEL = "gen_ai.request.model"
+GEN_AI_SYSTEM_INSTRUCTIONS = "gen_ai.system_instructions"
+
+# Tool span attributes per ARMS GenAI semconv (gen-ai.md §Tool).
+GEN_AI_TOOL_CALL_ID = "gen_ai.tool.call.id"
+GEN_AI_TOOL_CALL_ARGUMENTS = "gen_ai.tool.call.arguments"
+GEN_AI_TOOL_CALL_RESULT = "gen_ai.tool.call.result"
+GEN_AI_TOOL_DESCRIPTION = "gen_ai.tool.description"
+GEN_AI_TOOL_DEFINITIONS = "gen_ai.tool.definitions"
+
+# Stash slots on AgentController instances (set by AgentControllerInitWrapper).
+# These are attribute *names*; the wrappers read them via ``getattr`` on the
+# controller instance.
+_OWNS_FLAG = "_otel_oh_owns_lifecycle"
+_ENTRY_SPAN_ATTR = "_otel_oh_entry_span"
+_AGENT_SPAN_ATTR = "_otel_oh_agent_span"
+_ENTRY_TOKEN_ATTR = "_otel_oh_entry_token"
+_AGENT_TOKEN_ATTR = "_otel_oh_agent_token"
+# STEP persistence — keeps the *most-recent* STEP span alive across the
+# return of ``_step`` so that ``Runtime.run_action`` (which fires *later*
+# in a thread-pool executor via ``call_sync_from_async``) can re-attach
+# the STEP context and become its child rather than a sibling.
+#
+# IMPORTANT: we deliberately do **not** stash an OTel attach-token across
+# the return of ``_step``. ``otel_context.attach()`` returns a Token that
+# is bound to the ``contextvars.Context`` it was created in; calling
+# ``detach(token)`` from a *different* context raises ``ValueError`` (and
+# in production the Aliyun OTel SDK floods the log with
+# "Token was created in a different Context" errors).  Attach/detach
+# always happen as a balanced pair *inside the same async task*; cross-
+# task / cross-thread propagation goes through the ``Context`` *object*
+# stashed in :mod:`session_context` and re-attached on the consumer side.
+_STEP_SPAN_ATTR = "_otel_oh_step_span"
+_AGENT_CTX_ATTR = "_otel_oh_agent_ctx"  # restore target when STEP closes
+
+
+def _set_common(span: trace_api.Span, kind: str) -> None:
+    """Stamp the span-kind, framework and system attributes onto ``span``."""
+    shared_attributes = (
+        (GEN_AI_SPAN_KIND, kind),
+        (GEN_AI_FRAMEWORK, FRAMEWORK_NAME),
+        (GEN_AI_SYSTEM, FRAMEWORK_NAME),
+    )
+    for attribute_key, attribute_value in shared_attributes:
+        span.set_attribute(attribute_key, attribute_value)
+
+
+def _set_io(
+    span: trace_api.Span,
+    *,
+    input_value: str = "",
+    output_value: str = "",
+    input_messages: str = "",
+    output_messages: str = "",
+    mime: str = "application/json",
+) -> None:
+    """Write the non-empty I/O payloads onto ``span``.
+
+    ``input_value`` / ``output_value`` are each written together with their
+    ``*.mime_type`` companion; the GenAI message payloads are written alone.
+    Empty payloads are skipped, so no attribute is set for them.
+    """
+    # Each entry: (gate payload, attributes written when the payload is non-empty).
+    attribute_groups = (
+        (input_value, ((INPUT_VALUE, input_value), (INPUT_MIME, mime))),
+        (output_value, ((OUTPUT_VALUE, output_value), (OUTPUT_MIME, mime))),
+        (input_messages, ((GEN_AI_INPUT_MESSAGES, input_messages),)),
+        (output_messages, ((GEN_AI_OUTPUT_MESSAGES, output_messages),)),
+    )
+    for payload, attributes in attribute_groups:
+        if not payload:
+            continue
+        for attribute_key, attribute_value in attributes:
+            span.set_attribute(attribute_key, attribute_value)
+
+
+def _extract_model_from_config(config: Any) -> str:
+    """Best-effort lookup of the model name carried by ``config``.
+
+    Looks first at the first entry of a ``config.llms`` dict, then at
+    ``config.llm``. Any exception during either lookup is ignored; returns
+    ``""`` when no model name is found.
+    """
+    if config is None:
+        return ""
+
+    # Strategy 1: first value of the ``llms`` mapping.
+    try:
+        llm_map = safe_get_attr(config, "llms")
+        if isinstance(llm_map, dict) and llm_map:
+            first_llm = next(iter(llm_map.values()))
+            mapped_model = safe_get_attr(first_llm, "model")
+            if mapped_model:
+                return safe_str(mapped_model)
+    except Exception:
+        pass
+
+    # Strategy 2: a single ``llm`` attribute.
+    try:
+        single_model = safe_get_attr(safe_get_attr(config, "llm"), "model")
+        if single_model:
+            return safe_str(single_model)
+    except Exception:
+        pass
+
+    return ""
+
+
+def _extract_input_message_text(initial_user_action: Any) -> str:
+    """Return the text of ``initial_user_action`` as produced by ``serialize_message``."""
+    message_text = serialize_message(initial_user_action)
+    return message_text
+
+
+def _state_to_input_messages(state: Any, max_messages: int = 10) -> str:
+    """Best-effort extract a chat-style messages list from a controller State.
+
+    The actual messages sent to the LLM are built inside ``CodeActAgent.step``
+    and not stored on the controller, so this is a coarse summary derived
+    from ``state.history`` which is reliably available.
+
+    Args:
+        state: Object exposing a ``history`` list of events.
+        max_messages: Number of most-recent events to keep. Values ``<= 0``
+            keep no events.
+
+    Returns:
+        A JSON string of ``{"role", "content", "event"}`` items, or ``""``
+        when ``state.history`` is not a list.
+    """
+    history = safe_get_attr(state, "history") or []
+    if not isinstance(history, list):
+        return ""
+    # ``history[-0:]`` would be the *whole* list, so guard the window size
+    # explicitly instead of slicing with a non-positive bound.
+    recent = history[-max_messages:] if max_messages > 0 else []
+    items: list[dict[str, str]] = []
+    for ev in recent:
+        cls_name = type(ev).__name__
+        # Map common event types to roles.
+        if cls_name == "SystemMessageAction":
+            # The system prompt is a system message regardless of its source.
+            role = "system"
+            content = safe_get_attr(ev, "content") or safe_get_attr(ev, "message") or ""
+        elif cls_name == "MessageAction":
+            # Normalise the source the same way the schema converters do.
+            source = str(safe_get_attr(ev, "source") or "").lower()
+            role = "user" if source == "user" else "assistant"
+            content = safe_get_attr(ev, "content") or safe_get_attr(ev, "message") or ""
+        elif cls_name.endswith("Action"):
+            role = "assistant"
+            content = (
+                safe_get_attr(ev, "thought")
+                or safe_get_attr(ev, "command")
+                or safe_get_attr(ev, "code")
+                or safe_str(ev)
+            )
+        elif cls_name.endswith("Observation"):
+            role = "tool"
+            content = safe_get_attr(ev, "content") or safe_str(ev)
+        else:
+            role = "system"
+            content = safe_str(ev)
+        items.append({"role": role, "content": safe_str(content), "event": cls_name})
+    return to_json_str(items)
+
+
+def _final_state_to_output(state: Any) -> str:
+    """Summarise the controller's final state as a JSON string.
+
+    The summary may contain ``agent_state``, ``last_error``, ``iteration``,
+    ``history_length`` and, when the history holds an ``AgentFinishAction``,
+    that event's ``final_thought`` and ``outputs``. Returns ``""`` when
+    ``state`` is ``None``.
+    """
+    if state is None:
+        return ""
+
+    summary: dict[str, Any] = {}
+
+    agent_state = safe_get_attr(state, "agent_state")
+    if agent_state is not None:
+        summary["agent_state"] = safe_get_attr(agent_state, "value") or safe_str(agent_state)
+
+    last_error = safe_get_attr(state, "last_error")
+    if last_error:
+        summary["last_error"] = safe_str(last_error)
+
+    iteration = safe_get_attr(state, "iteration")
+    if iteration is not None:
+        summary["iteration"] = safe_str(iteration)
+
+    history = safe_get_attr(state, "history") or []
+    if not (isinstance(history, list) and history):
+        return to_json_str(summary)
+
+    summary["history_length"] = len(history)
+    # The most recent AgentFinishAction (if any) supplies the final answer.
+    finish_event = next(
+        (ev for ev in reversed(history) if type(ev).__name__ == "AgentFinishAction"),
+        None,
+    )
+    if finish_event is not None:
+        summary["final_thought"] = safe_str(
+            safe_get_attr(finish_event, "final_thought")
+            or safe_get_attr(finish_event, "thought")
+            or ""
+        )
+        summary["outputs"] = safe_str(safe_get_attr(finish_event, "outputs") or {})
+    return to_json_str(summary)
+
+
+def _entry_input_messages_from_initial(initial_user_action: Any) -> str:
+    """Build the ENTRY span's ``gen_ai.input.messages`` from the initial action.
+
+    Returns a JSON string with one ``user`` message holding a single text
+    part, or ``""`` when no text could be extracted.
+    """
+    text = _extract_input_message_text(initial_user_action)
+    if text:
+        user_message = {"role": "user", "parts": [{"type": "text", "content": text}]}
+        return to_json_str([user_message])
+    return ""
+
+
+def _entry_io_from_state(state: Any) -> tuple[str, str]:
+    """Derive ``(input_messages, output_messages)`` JSON strings for ENTRY.
+
+    Both values come from ``state.history`` when it is a non-empty list. If
+    no output messages result, the serialized final state is wrapped in a
+    single assistant message instead. Either element may be ``""``.
+    """
+    history = safe_get_attr(state, "history") or []
+    input_messages = output_messages = ""
+
+    if isinstance(history, list) and history:
+        input_schema = _history_to_input_messages_schema(history)
+        input_messages = to_json_str(input_schema) if input_schema else ""
+        output_schema = _history_to_output_messages_schema(history)
+        output_messages = to_json_str(output_schema) if output_schema else ""
+
+    if output_messages:
+        return input_messages, output_messages
+
+    # No assistant turn in the history: fall back to the final-state summary.
+    final_state = _final_state_to_output(state)
+    if final_state:
+        fallback_message = {
+            "role": "assistant",
+            "parts": [{"type": "text", "content": final_state}],
+            "finish_reason": "stop",
+        }
+        output_messages = to_json_str([fallback_message])
+    return input_messages, output_messages
+
+
+# ---------------------------------------------------------------------------
+# ARMS GenAI semconv message-schema converters.
+#
+# Per gen-ai.md §LLM/§Agent, gen_ai.input.messages / gen_ai.output.messages
+# / gen_ai.system_instructions follow a "parts"-based structure:
+#
+#     [{"role": "user|assistant|tool|system",
+#       "parts": [{"type": "text|tool_call|tool_call_response|...",
+#                  "content": "...", "name": "...", "id": "...",
+#                  "arguments": {...}, "result": "..."}],
+#       "finish_reason": "stop|...",        # output only
+#     }]
+#
+# The system instructions schema is a flat list of parts:
+#
+#     [{"type": "text", "content": "..."}]
+# ---------------------------------------------------------------------------
+
+
+def _attr_or_item(obj: Any, name: str) -> Any:
+    """Read ``name`` from ``obj`` as an attribute, falling back to a dict key."""
+    value = getattr(obj, name, None)
+    if value is None and isinstance(obj, dict):
+        value = obj.get(name)
+    return value
+
+
+def _llm_tool_call_arguments(tcm: Any, tcid: str) -> Any:
+    """Best-effort harvest of the original LLM-emitted tool-call arguments.
+
+    Walks ``tcm.model_response.choices[*].message.tool_calls`` and reads
+    ``function.arguments`` of every call whose id equals ``tcid`` (every call
+    when ``tcid`` is empty); the last match wins. String arguments are parsed
+    as JSON, with unparseable text preserved as ``{"raw": text}``. Returns
+    ``{}`` when nothing is found or anything goes wrong.
+    """
+    args: Any = {}
+    try:
+        response = safe_get_attr(tcm, "model_response")
+        for choice in _attr_or_item(response, "choices") or []:
+            message = _attr_or_item(choice, "message")
+            for call in _attr_or_item(message, "tool_calls") or []:
+                if tcid and safe_str(_attr_or_item(call, "id")) != tcid:
+                    continue
+                raw = _attr_or_item(_attr_or_item(call, "function"), "arguments")
+                if isinstance(raw, str):
+                    try:
+                        args = json.loads(raw)
+                    except ValueError:
+                        args = {"raw": raw}
+                elif isinstance(raw, dict):
+                    args = raw
+    except Exception:
+        # Telemetry must never break the instrumented call.
+        args = {}
+    return args
+
+
+def _action_event_to_parts(ev: Any) -> list[dict[str, Any]]:
+    """Convert an Action event into a list of ``parts`` for AGENT messages.
+
+    Captures both the model's "thought" text and any ``tool_call`` part
+    derived from ``tool_call_metadata``. When the LLM arguments cannot be
+    recovered, the populated argument-like fields of the event are used
+    instead. If nothing at all can be extracted, a bare ``tool_call`` part
+    named after ``ev.action`` is returned (or an empty list when that is
+    missing too).
+    """
+    parts: list[dict[str, Any]] = []
+    thought = safe_get_attr(ev, "thought")
+    if thought:
+        parts.append({"type": "text", "content": safe_str(thought)})
+    tcm = safe_get_attr(ev, "tool_call_metadata")
+    if tcm is not None:
+        fn_name = safe_str(safe_get_attr(tcm, "function_name") or "")
+        tcid = safe_str(safe_get_attr(tcm, "tool_call_id") or "")
+        args = _llm_tool_call_arguments(tcm, tcid)
+        if not args:
+            # The parsed JSON may be a falsy non-dict (``null``, ``[]``, ``0``,
+            # ``""``); start from a real dict so item assignment cannot raise.
+            args = {}
+            for key in (
+                "command",
+                "code",
+                "path",
+                "url",
+                "content",
+                "task_list",
+                "old_str",
+                "new_str",
+                "file_text",
+            ):
+                v = safe_get_attr(ev, key)
+                if v not in (None, "", [], {}):
+                    args[key] = v
+        if fn_name or tcid or args:
+            parts.append(
+                {
+                    "type": "tool_call",
+                    "id": tcid,
+                    "name": fn_name or safe_str(safe_get_attr(ev, "action") or ""),
+                    "arguments": args,
+                }
+            )
+    if not parts:
+        # Minimal fallback when nothing else could be extracted.
+        action_type = safe_str(safe_get_attr(ev, "action") or "")
+        if action_type:
+            parts.append({"type": "tool_call", "name": action_type, "arguments": {}})
+    return parts
+
+
+def _observation_event_to_parts(ev: Any) -> list[dict[str, Any]]:
+    """Build the single ``tool_call_response`` part for an Observation event.
+
+    ``result`` is a dict of the observation's populated result fields, or the
+    stringified event when none of them carries a value. ``id`` is the tool
+    call id from ``tool_call_metadata`` (``""`` when unavailable).
+    """
+    metadata = safe_get_attr(ev, "tool_call_metadata")
+    if metadata:
+        call_id = safe_str(safe_get_attr(metadata, "tool_call_id") or "")
+    else:
+        call_id = ""
+
+    populated_fields: dict[str, Any] = {
+        field: value
+        for field in ("content", "exit_code", "error", "stdout", "stderr", "url")
+        if (value := safe_get_attr(ev, field)) not in (None, "", [], {})
+    }
+
+    response_part = {
+        "type": "tool_call_response",
+        "id": call_id,
+        "result": populated_fields or safe_str(ev),
+    }
+    return [response_part]
+
+
+def _history_to_input_messages_schema(history: list, max_events: int = 200) -> list[dict[str, Any]]:
+    """Convert ``state.history`` into the ARMS gen_ai.input.messages schema.
+
+    Folds adjacent same-role events into a single message with multiple
+    ``parts``, mirroring how the messages were assembled when sent to
+    the LLM.
+
+    Args:
+        history: Sequence of OpenHands events, oldest first.
+        max_events: Number of most-recent events to convert. Values ``<= 0``
+            convert nothing.
+
+    Returns:
+        A list of ``{"role", "parts"}`` messages; empty when there is
+        nothing to convert. ``SystemMessageAction`` events are omitted.
+    """
+    # ``history[-0:]`` is the *whole* list, so a non-positive budget must be
+    # handled explicitly rather than through the slice.
+    if not history or max_events <= 0:
+        return []
+    items = history[-max_events:]
+    messages: list[dict[str, Any]] = []
+    for ev in items:
+        cls = type(ev).__name__
+        # Determine role + parts for this event.
+        if cls == "SystemMessageAction":
+            # System is reported separately under gen_ai.system_instructions.
+            continue
+        if cls == "MessageAction":
+            src = str(safe_get_attr(ev, "source") or "").lower()
+            role = "user" if src == "user" else "assistant"
+            content = safe_str(safe_get_attr(ev, "content") or "")
+            parts = [{"type": "text", "content": content}]
+        elif cls.endswith("Observation"):
+            role = "tool"
+            parts = _observation_event_to_parts(ev)
+        elif cls.endswith("Action"):
+            role = "assistant"
+            parts = _action_event_to_parts(ev)
+        else:
+            role = "system"
+            parts = [{"type": "text", "content": safe_str(ev)}]
+        # Fold consecutive same-role messages.
+        if messages and messages[-1]["role"] == role:
+            messages[-1]["parts"].extend(parts)
+        else:
+            messages.append({"role": role, "parts": parts})
+    return messages
+
+
+def _history_to_output_messages_schema(history: list) -> list[dict[str, Any]]:
+    """Pull the *final* assistant turn from history per ARMS gen_ai.output.messages.
+
+    Walks back from the end of history and collects assistant-side events
+    (any ``*Action`` that is not a user ``MessageAction``) until the first
+    Observation or user message is reached. When no such event exists, the
+    very last history event is used instead.
+
+    Returns:
+        ``[]`` for an empty history, otherwise a one-element list holding an
+        ``assistant`` message. ``finish_reason`` is always ``"stop"``.
+    """
+    if not history:
+        return []
+    finish_reason = "stop"
+    # Collected newest-first, then reversed once (avoids repeated insert(0)).
+    newest_first: list[Any] = []
+    for ev in reversed(history):
+        cls = type(ev).__name__
+        # Stop once we cross back into tool-result or user-input territory.
+        if cls.endswith("Observation"):
+            break
+        if cls == "MessageAction" and str(
+            safe_get_attr(ev, "source") or ""
+        ).lower() == "user":
+            break
+        if cls.endswith("Action"):
+            newest_first.append(ev)
+    # Fallback: at least include the very last event as the assistant turn.
+    tail_actions = newest_first[::-1] or [history[-1]]
+    parts: list[dict[str, Any]] = []
+    for ev in tail_actions:
+        cls = type(ev).__name__
+        if cls == "MessageAction":
+            content = safe_str(safe_get_attr(ev, "content") or "")
+            if content:
+                parts.append({"type": "text", "content": content})
+        elif cls == "AgentFinishAction":
+            ft = safe_str(safe_get_attr(ev, "final_thought") or "")
+            if ft:
+                parts.append({"type": "text", "content": ft})
+            outputs = safe_get_attr(ev, "outputs")
+            if outputs:
+                parts.append({"type": "text", "content": safe_str(outputs)})
+        else:
+            parts.extend(_action_event_to_parts(ev))
+    if not parts:
+        # The schema expects at least one part per message.
+        parts = [{"type": "text", "content": ""}]
+    return [{"role": "assistant", "parts": parts, "finish_reason": finish_reason}]
+
+
+def _agent_to_system_instructions(agent: Any, state: Any) -> list[dict[str, Any]]:
+    """Build ARMS ``gen_ai.system_instructions`` parts for the agent.
+
+    The text is taken from ``agent.get_system_message().content`` when that
+    call is available and yields content; otherwise from the first
+    ``SystemMessageAction`` in ``state.history`` with non-empty content.
+    Returns a single text part, or ``[]`` when no system text is found.
+    """
+    system_text = ""
+    try:
+        getter = safe_get_attr(agent, "get_system_message")
+        if callable(getter):
+            system_text = safe_str(safe_get_attr(getter(), "content") or "")
+    except Exception:
+        system_text = ""
+
+    if not system_text:
+        history = safe_get_attr(state, "history") or []
+        events = history if isinstance(history, list) else []
+        for ev in events:
+            if type(ev).__name__ != "SystemMessageAction":
+                continue
+            system_text = safe_str(safe_get_attr(ev, "content") or "")
+            if system_text:
+                break
+
+    return [{"type": "text", "content": system_text}] if system_text else []
+
+
+# ---------------------------------------------------------------------------
+# ENTRY: openhands.core.main.run_controller
+# ---------------------------------------------------------------------------
+
+
+class RunControllerWrapper:
+    """ENTRY span around the V0 CLI/headless ``run_controller`` coroutine.
+
+    Stashes the active OTel Context (with the ENTRY span attached) keyed
+    by ``sid`` so STEP / TOOL spans firing in worker threads can re-attach
+    it and remain in the same trace.
+    """
+
+    __slots__ = ("_tracer",)
+
+    def __init__(self, tracer: Tracer):
+        self._tracer = tracer
+
+    def __call__(self, wrapped, instance, args, kwargs):
+        # Returns the coroutine object; the caller awaits it exactly as it
+        # would have awaited ``wrapped(*args, **kwargs)``.
+        return self._impl(wrapped, instance, args, kwargs)
+
+    async def _impl(self, wrapped, instance, args, kwargs):
+        """Run ``wrapped`` inside an ENTRY span and record its I/O.
+
+        When outer spans are disabled the call is passed straight through.
+        Exceptions from ``wrapped`` are recorded on the span and re-raised;
+        failures while capturing output attributes never propagate.
+        """
+        if not OTEL_INSTRUMENTATION_OPENHANDS_OUTER_SPANS:
+            return await wrapped(*args, **kwargs)
+
+        # Arguments are read by keyword first, then by position
+        # (config, initial_user_action, sid).
+        config = kwargs.get("config")
+        if config is None and args:
+            config = args[0]
+        initial_user_action = kwargs.get("initial_user_action")
+        if initial_user_action is None and len(args) >= 2:
+            initial_user_action = args[1]
+        sid = kwargs.get("sid")
+        if sid is None and len(args) >= 3:
+            sid = args[2]
+        # When sid wasn't passed, we don't yet know the auto-generated one;
+        # the controller will publish ``controller.id`` later. We update
+        # the stash again from inside the AGENT wrapper.
+
+        span = self._tracer.start_span("enter openhands", kind=SpanKind.INTERNAL)
+        _set_common(span, "ENTRY")
+        span.set_attribute(GenAI.GEN_AI_OPERATION_NAME, "enter")
+        if sid:
+            span.set_attribute(GEN_AI_SESSION_ID, safe_str(sid))
+            span.set_attribute(GEN_AI_CONVERSATION_ID, safe_str(sid))
+        model = _extract_model_from_config(config)
+        if model:
+            span.set_attribute(GEN_AI_REQUEST_MODEL, model)
+
+        input_text = _extract_input_message_text(initial_user_action)
+        preview = maybe_preview(input_text)
+        if preview:
+            span.set_attribute(OH_INITIAL_MESSAGE_PREVIEW, preview)
+        captured_input = (
+            maybe_to_json_str({"role": "user", "content": input_text})
+            if input_text
+            else ""
+        )
+        if captured_input:
+            entry_input_messages = _entry_input_messages_from_initial(
+                initial_user_action
+            )
+            _set_io(
+                span,
+                input_value=captured_input,
+                input_messages=entry_input_messages,
+            )
+
+        ctx = set_span_in_context(span)
+        token = otel_context.attach(ctx)
+        try:
+            # Stash inside the ``try`` so a failure here still detaches the
+            # context and ends the span in the ``finally`` below.
+            if sid:
+                store_context(sid, ctx)
+            try:
+                result = await wrapped(*args, **kwargs)
+            except BaseException as exc:
+                span.record_exception(exc)
+                span.set_status(Status(StatusCode.ERROR, type(exc).__qualname__))
+                raise
+            try:
+                final_state_repr = _final_state_to_output(result)
+                entry_input_messages, entry_output_messages = _entry_io_from_state(
+                    result
+                )
+                if final_state_repr:
+                    _set_io(
+                        span,
+                        output_value=final_state_repr,
+                        input_messages=entry_input_messages,
+                        output_messages=entry_output_messages,
+                    )
+                    agent_state = safe_get_attr(result, "agent_state")
+                    if agent_state is not None:
+                        span.set_attribute(
+                            OH_AGENT_STATE,
+                            safe_get_attr(agent_state, "value") or safe_str(agent_state),
+                        )
+                elif entry_input_messages or entry_output_messages:
+                    _set_io(
+                        span,
+                        input_messages=entry_input_messages,
+                        output_messages=entry_output_messages,
+                    )
+            except Exception:
+                # Best-effort capture: never fail the instrumented call, but
+                # leave a trace for anyone debugging missing attributes.
+                logger.debug(
+                    "Failed to capture ENTRY span output attributes",
+                    exc_info=True,
+                )
+            return result
+        finally:
+            try:
+                otel_context.detach(token)
+            except Exception:
+                pass
+            try:
+                if sid:
+                    clear_context(sid)
+            finally:
+                # Always end the span, even if clearing the stash fails.
+                span.end()
+
+
+# ---------------------------------------------------------------------------
+# AGENT: openhands.core.loop.run_agent_until_done
+# ---------------------------------------------------------------------------
+
+
+class RunAgentUntilDoneWrapper:
+    """AGENT span around the V0 polling loop.
+
+    Re-attaches the ENTRY context (in case asyncio task creation didn't
+    propagate it for some reason) and re-stashes a fresh context that now
+    also includes the AGENT span — that's what STEP / TOOL re-attach.
+    """
+
+    __slots__ = ("_tracer",)
+
+    def __init__(self, tracer: Tracer):
+        self._tracer = tracer
+
+    def __call__(self, wrapped, instance, args, kwargs):
+        return self._impl(wrapped, instance, args, kwargs)
+
+    async def _impl(self, wrapped, instance, args, kwargs):
+        if not OTEL_INSTRUMENTATION_OPENHANDS_OUTER_SPANS:
+            return await wrapped(*args, **kwargs)
+
+        controller = kwargs.get("controller")
+        if controller is None and args:
+            controller = args[0]
+        agent = safe_get_attr(controller, "agent")
+        agent_name = safe_get_attr(agent, "name") or "codeact"
+        agent_class = (
+            f"{type(agent).__module__}.{type(agent).__name__}" if agent else ""
+        )
+        sid = safe_str(safe_get_attr(controller, "id") or "")
+        llm = safe_get_attr(agent, "llm")
+        llm_config = safe_get_attr(llm, "config")
+        model = safe_get_attr(llm_config, "model") or safe_get_attr(llm, "model")
+
+        # If AgentController.__init__ already opened lifecycle-bound ENTRY+AGENT
+        # spans, do not create a second AGENT here. Just run the loop with the
+        # existing AGENT context current so STEP/LLM/TOOL remain descendants.
+        lifecycle_agent_span = getattr(controller, _AGENT_SPAN_ATTR, None)
+        lifecycle_agent_ctx = getattr(controller, _AGENT_CTX_ATTR, None)
+        if lifecycle_agent_span is not None and lifecycle_agent_ctx is not None:
+            try:
+                _capture_agent_io_attributes(
+                    lifecycle_agent_span,
+                    controller,
+                    agent,
+                    safe_get_attr(controller, "state"),
+                )
+            except Exception:
+                pass
+            lifecycle_token = otel_context.attach(lifecycle_agent_ctx)
+            try:
+                return await wrapped(*args, **kwargs)
+            except BaseException as exc:
+                try:
+                    lifecycle_agent_span.record_exception(exc)
+                    lifecycle_agent_span.set_status(
+                        Status(StatusCode.ERROR, type(exc).__qualname__)
+                    )
+                except Exception:
+                    pass
+                raise
+            finally:
+                try:
+                    state = safe_get_attr(controller, "state")
+                    _capture_agent_io_attributes(
+                        lifecycle_agent_span, controller, agent, state
+                    )
+                    history = safe_get_attr(state, "history") or []
+                    if isinstance(history, list):
+                        lifecycle_agent_span.set_attribute(OH_HISTORY_LENGTH, len(history))
+                except Exception:
+                    pass
+                try:
+                    otel_context.detach(lifecycle_token)
+                except Exception:
+                    pass
+
+        # Bridge: re-attach whatever the ENTRY wrapper stashed (works even
+        # if asyncio.create_task somehow lost the context, and is the only
+        # way for the worker-thread STEP / TOOL spans to find us).
+        attach_ctx = get_context(sid)
+        fallback_entry_span: trace_api.Span | None = None
+        if attach_ctx is None:
+            fallback_entry_span = self._tracer.start_span(
+                "enter openhands", kind=SpanKind.INTERNAL
+            )
+            _set_common(fallback_entry_span, "ENTRY")
+            fallback_entry_span.set_attribute(GenAI.GEN_AI_OPERATION_NAME, "enter")
+            if sid:
+                fallback_entry_span.set_attribute(GEN_AI_SESSION_ID, sid)
+                fallback_entry_span.set_attribute(GEN_AI_CONVERSATION_ID, sid)
+            if agent_class:
+                fallback_entry_span.set_attribute(OH_AGENT_NAME, agent_class)
+            if model:
+                fallback_entry_span.set_attribute(GEN_AI_REQUEST_MODEL, safe_str(model))
+            try:
+                state = safe_get_attr(controller, "state")
+                entry_input_messages, _ = _entry_io_from_state(state)
+                if entry_input_messages:
+                    _set_io(
+                        fallback_entry_span,
+                        input_value=entry_input_messages,
+                        input_messages=entry_input_messages,
+                    )
+            except Exception:
+                pass
+            attach_ctx = set_span_in_context(fallback_entry_span)
+            if sid:
+                store_context(sid, attach_ctx)
+        if attach_ctx is not None:
+            attach_token = otel_context.attach(attach_ctx)
+        else:
+            attach_token = None
+
+        try:
+            span = self._tracer.start_span(
+                f"invoke_agent {agent_name}",
+                kind=SpanKind.INTERNAL,
+                context=attach_ctx,
+            )
+            _set_common(span, "AGENT")
+            span.set_attribute(
+                GenAI.GEN_AI_OPERATION_NAME,
+                GenAI.GenAiOperationNameValues.INVOKE_AGENT.value,
+            )
+            span.set_attribute(GenAI.GEN_AI_AGENT_NAME, safe_str(agent_name))
+            if agent_class:
+                span.set_attribute(OH_AGENT_NAME, agent_class)
+            if sid:
+                span.set_attribute(GEN_AI_SESSION_ID, sid)
+                span.set_attribute(GEN_AI_CONVERSATION_ID, sid)
+                span.set_attribute(GEN_AI_AGENT_ID, sid)
+            if model:
+                span.set_attribute(GEN_AI_REQUEST_MODEL, safe_str(model))
+
+            # Capture the agent's tool registry so the TOOL wrapper (which
+            # only sees a Runtime instance) can resolve tool descriptions
+            # and produce ``gen_ai.tool.description``. Also emit
+            # ``gen_ai.tool.definitions`` on this AGENT span itself per the
+            # ARMS GenAI semconv §Agent — minimal {type,name} entries by
+            # default; full definitions only when content capture is on.
+            try:
+                tools = safe_get_attr(agent, "tools") or []
+                if sid:
+                    store_tool_registry(sid, tools)
+                tool_defs_summary: list[dict[str, Any]] = []
+                for t in tools:
+                    if isinstance(t, dict):
+                        kind = t.get("type") or "function"
+                        fn = t.get("function") or {}
+                        name = fn.get("name") if isinstance(fn, dict) else None
+                    else:
+                        kind = safe_get_attr(t, "type") or "function"
+                        fn = safe_get_attr(t, "function")
+                        name = safe_get_attr(fn, "name")
+                    if not name:
+                        continue
+                    item: dict[str, Any] = {"type": safe_str(kind), "name": safe_str(name)}
+                    if isinstance(fn, dict):
+                        desc = fn.get("description")
+                        params = fn.get("parameters")
+                    else:
+                        desc = safe_get_attr(fn, "description")
+                        params = safe_get_attr(fn, "parameters")
+                    if desc:
+                        item["description"] = safe_str(desc)
+                    if params:
+                        item["parameters"] = params
+                    tool_defs_summary.append(item)
+                if tool_defs_summary:
+                    span.set_attribute(
+                        GEN_AI_TOOL_DEFINITIONS, to_json_str(tool_defs_summary)
+                    )
+            except Exception:
+                pass
+
+            # Capture initial user/system context for AGENT using the same
+            # ARMS message schema as the lifecycle-bound AGENT path.
+            try:
+                state = safe_get_attr(controller, "state")
+                _capture_agent_io_attributes(span, controller, agent, state)
+            except Exception:
+                pass
+
+            # Stash the context that now includes the AGENT span so STEP /
+            # TOOL re-attach correctly even when running in worker threads.
+            ctx_with_agent = set_span_in_context(span)
+            if sid:
+                store_context(sid, ctx_with_agent)
+            # Mirror onto the controller too — STEP wrapper uses this when
+            # closing a STEP to restore the session stash to AGENT instead
+            # of leaving a dangling closed-STEP context behind.
+            if controller is not None:
+                try:
+                    setattr(controller, _AGENT_CTX_ATTR, ctx_with_agent)
+                    setattr(controller, _AGENT_SPAN_ATTR, span)
+                except Exception:
+                    pass
+                if getattr(controller, _STEP_SPAN_ATTR, None) is None:
+                    try:
+                        warmup_step = self._tracer.start_span(
+                            "react step",
+                            kind=SpanKind.INTERNAL,
+                            context=ctx_with_agent,
+                        )
+                        _set_common(warmup_step, "STEP")
+                        warmup_step.set_attribute(GenAI.GEN_AI_OPERATION_NAME, "react")
+                        warmup_step.set_attribute(OH_REACT_ROUND, 1)
+                        warmup_step.set_attribute(
+                            GenAI.GEN_AI_AGENT_NAME, safe_str(agent_name)
+                        )
+                        if sid:
+                            warmup_step.set_attribute(GEN_AI_SESSION_ID, sid)
+                            warmup_step.set_attribute(GEN_AI_CONVERSATION_ID, sid)
+                            warmup_step.set_attribute(GEN_AI_AGENT_ID, sid)
+                        setattr(controller, _STEP_SPAN_ATTR, warmup_step)
+                        setattr(controller, "_otel_oh_round", 1)
+                        setattr(controller, "_otel_oh_step_consumed", False)
+                        if sid:
+                            store_context(sid, set_span_in_context(warmup_step))
+                    except Exception:
+                        pass
+            agent_token = otel_context.attach(ctx_with_agent)
+            try:
+                try:
+                    result = await wrapped(*args, **kwargs)
+                except BaseException as exc:
+                    span.record_exception(exc)
+                    span.set_status(
+                        Status(StatusCode.ERROR, type(exc).__qualname__)
+                    )
+                    raise
+                # Capture final AGENT I/O using ARMS gen_ai.* message attrs.
+                try:
+                    state = safe_get_attr(controller, "state")
+                    _capture_agent_io_attributes(span, controller, agent, state)
+                    if state is not None:
+                        agent_state = safe_get_attr(state, "agent_state")
+                        if agent_state is not None:
+                            span.set_attribute(
+                                OH_AGENT_STATE,
+                                safe_get_attr(agent_state, "value")
+                                or safe_str(agent_state),
+                            )
+                        history = safe_get_attr(state, "history") or []
+                        if isinstance(history, list):
+                            span.set_attribute(OH_HISTORY_LENGTH, len(history))
+                except Exception:
+                    pass
+                return result
+            finally:
+                try:
+                    otel_context.detach(agent_token)
+                except Exception:
+                    pass
+                if controller is not None:
+                    try:
+                        if getattr(controller, _AGENT_SPAN_ATTR, None) is span:
+                            setattr(controller, _AGENT_SPAN_ATTR, None)
+                    except Exception:
+                        pass
+                    try:
+                        _close_open_step(controller)
+                    except Exception:
+                        pass
+                span.end()
+        finally:
+            if attach_token is not None:
+                try:
+                    otel_context.detach(attach_token)
+                except Exception:
+                    pass
+            if fallback_entry_span is not None:
+                try:
+                    state = safe_get_attr(controller, "state")
+                    output_repr = _final_state_to_output(state)
+                    entry_input_messages, entry_output_messages = _entry_io_from_state(
+                        state
+                    )
+                    if output_repr:
+                        _set_io(
+                            fallback_entry_span,
+                            output_value=output_repr,
+                            input_messages=entry_input_messages,
+                            output_messages=entry_output_messages,
+                        )
+                    elif entry_input_messages or entry_output_messages:
+                        _set_io(
+                            fallback_entry_span,
+                            input_messages=entry_input_messages,
+                            output_messages=entry_output_messages,
+                        )
+                    history = safe_get_attr(state, "history") or []
+                    if isinstance(history, list):
+                        fallback_entry_span.set_attribute(OH_HISTORY_LENGTH, len(history))
+                except Exception:
+                    pass
+                try:
+                    fallback_entry_span.end()
+                except Exception:
+                    pass
+                if sid:
+                    try:
+                        clear_context(sid)
+                    except Exception:
+                        pass
+
+
+# ---------------------------------------------------------------------------
+# STEP: AgentController._step
+# ---------------------------------------------------------------------------
+
+
+def _close_open_step(controller: Any) -> None:
+    """Finish whichever STEP span is still parked on ``controller``.
+
+    Does nothing when no STEP span is stored under ``_STEP_SPAN_ATTR``.
+    Otherwise the span is ended, the controller slot is cleared, and the
+    session stash is pointed back at the AGENT context saved under
+    ``_AGENT_CTX_ATTR`` (only when both a session id and that context
+    are available), so later lookups of the session context — e.g. by
+    the TOOL wrapper — see AGENT rather than the STEP that just closed.
+
+    Only the *span* is ended here. This function never attaches or
+    detaches a context token; the STEP wrapper keeps its own
+    attach/detach pair inside the ``_step`` coroutine.
+    """
+    open_step = getattr(controller, _STEP_SPAN_ATTR, None)
+    if open_step is not None:
+        # Best-effort teardown: neither a failing ``end()`` nor a
+        # controller that rejects attribute writes may stop the rest.
+        try:
+            open_step.end()
+        except Exception:
+            pass
+        try:
+            setattr(controller, _STEP_SPAN_ATTR, None)
+        except Exception:
+            pass
+        session_id = safe_str(safe_get_attr(controller, "id") or "")
+        saved_agent_ctx = getattr(controller, _AGENT_CTX_ATTR, None)
+        if saved_agent_ctx is not None and session_id:
+            store_context(session_id, saved_agent_ctx)
+
+
+class AgentControllerStepWrapper:
+    """STEP span around one ReAct iteration of the V0 controller.
+
+    The STEP span is intentionally **kept open across the return of
+    ``_step``**. Why: ``Runtime.run_action`` runs *later*, in a thread-pool
+    executor (``call_sync_from_async`` inside ``_handle_action``), so by
+    the time TOOL fires the STEP coroutine has already returned. Closing
+    STEP at end of ``_step`` would make every TOOL a sibling of STEP
+    (parented under AGENT) instead of a child.
+
+    Lifecycle:
+
+    1. New ``_step`` invoked → close *previous* STEP if any → open new
+       STEP (child of AGENT) → stash STEP context under ``sid`` so that
+       TOOL / LLM spans firing on worker threads re-attach STEP.
+    2. ``_step`` body runs to completion. We do **not** close STEP here.
+    3. The next ``_step`` (or ``AgentController.close``) closes the
+       still-open STEP.
+
+    Two cases end STEP early (see ``_impl``): a body that raises closes
+    STEP before the exception is re-raised, and a freshly opened STEP
+    whose body did no observable work is ended at once and tagged
+    ``openhands.step.empty``.
+    """
+
+    __slots__ = ("_tracer",)
+
+    def __init__(self, tracer: Tracer):
+        """Keep the tracer that will be used to start STEP spans."""
+        self._tracer = tracer
+
+    def __call__(self, wrapped, instance, args, kwargs):
+        """Wrapper entry point taking ``(wrapped, instance, args, kwargs)``.
+
+        Returns the coroutine created by ``_impl`` without awaiting it;
+        the caller of the wrapped method awaits it.
+        """
+        return self._impl(wrapped, instance, args, kwargs)
+
+    @staticmethod
+    def _will_step_be_noop(instance: Any) -> bool:
+        """Return True if this ``_step`` call will short-circuit without
+        producing real work (state != RUNNING, or a pending action is
+        already queued). We skip span emission for these so the round
+        counter stays sequential (1, 2, 3, ...) instead of inflating to
+        (1, 3, 5, ...) with empty 0.5ms STEP spans cluttering the trace.
+
+        This mirrors the early-return checks at the top of
+        ``AgentController._step`` (state-check + ``_pending_action``).
+        We read ``_pending_action_info`` directly rather than going
+        through the ``_pending_action`` *property* — the property has
+        logging side effects (it can emit a "pending action active for
+        Xs" log line at warn-level) that we don't want to trigger from
+        an instrumentation hot path.
+
+        If inspecting the controller raises, the answer is ``False`` so
+        the caller goes on to trace the step.
+        """
+        try:
+            state = safe_get_attr(instance, "state")
+            agent_state = safe_get_attr(state, "agent_state")
+            # AgentState enum value is 'running' (case-insensitive).
+            agent_state_str = (
+                safe_str(safe_get_attr(agent_state, "value") or agent_state).lower()
+            )
+            if agent_state_str != "running":
+                return True
+            # Check the underlying tuple slot, not the property — the
+            # property's getter is non-trivial in OpenHands.
+            if getattr(instance, "_pending_action_info", None) is not None:
+                return True
+        except Exception:
+            return False
+        return False
+
+    @staticmethod
+    def _snapshot_for_work_detection(instance: Any) -> tuple[int, Any]:
+        """Snapshot the bits we need to tell whether ``_step`` body did
+        anything. Returned tuple is (history_length, pending_action_id).
+        Used by ``_impl`` to detect "empty" STEP invocations that get
+        through ``_will_step_be_noop`` (e.g. ``state_tracker`` raised,
+        ``_is_stuck`` early-returned, ``agent.step`` returned ``None``)
+        and shouldn't show up in the trace as 0.3ms placeholder spans.
+
+        ``history_length`` is ``0`` when ``state.history`` is not a list
+        or cannot be read. ``pending_action_id`` is the ``id()`` of the
+        ``_pending_action_info`` object, or ``None`` when that slot is
+        empty or cannot be read.
+        """
+        try:
+            state = safe_get_attr(instance, "state")
+            history = safe_get_attr(state, "history")
+            history_len = len(history) if isinstance(history, list) else 0
+        except Exception:
+            history_len = 0
+        try:
+            info = getattr(instance, "_pending_action_info", None)
+            pending_id = id(info) if info is not None else None
+        except Exception:
+            pending_id = None
+        return history_len, pending_id
+
+    async def _impl(self, wrapped, instance, args, kwargs):
+        """Await ``wrapped`` (the controller's ``_step``) under a STEP span.
+
+        Passes straight through when outer spans are disabled or when
+        ``_will_step_be_noop`` predicts a short-circuit. Otherwise it
+        either reuses the not-yet-consumed warmup STEP or closes the
+        previous STEP and opens a new one, records the step input, and
+        awaits ``wrapped`` with the STEP context attached.
+
+        Returns whatever ``wrapped`` returns. Anything ``wrapped`` raises
+        (``BaseException`` included) is recorded on the span, the STEP
+        is closed, and the same exception object is re-raised.
+        """
+        if not OTEL_INSTRUMENTATION_OPENHANDS_OUTER_SPANS:
+            return await wrapped(*args, **kwargs)
+
+        # Skip no-op _step invocations entirely so the trace shows only
+        # the rounds that actually do work (LLM call + tool dispatch).
+        if self._will_step_be_noop(instance):
+            return await wrapped(*args, **kwargs)
+
+        sid = safe_str(safe_get_attr(instance, "id") or "")
+        agent = safe_get_attr(instance, "agent")
+        agent_name = safe_get_attr(agent, "name") or "codeact"
+
+        # Snapshot the AGENT context if we don't already have one so
+        # ``_close_open_step`` can restore the session stash to AGENT
+        # after STEP ends.
+        # NOTE(review): the ``hasattr`` test is redundant — ``getattr`` with
+        # a ``None`` default already yields ``None`` for a missing attribute.
+        if not hasattr(instance, _AGENT_CTX_ATTR) or getattr(instance, _AGENT_CTX_ATTR, None) is None:
+            try:
+                setattr(instance, _AGENT_CTX_ATTR, get_context(sid))
+            except Exception:
+                pass
+
+        # ----- Reuse warmup STEP if not yet consumed -----
+        # The init wrapper opens a warmup STEP (round 1) so pre-step
+        # actions like RECALL parent under STEP 1. The first real
+        # ``_step`` reuses that STEP (without bumping the round) so the
+        # LLM call + first LLM-driven tool also nest under STEP 1. From
+        # the second real ``_step`` onward, we close the previous STEP
+        # and open a new one with round = previous + 1.
+        existing_step = getattr(instance, _STEP_SPAN_ATTR, None)
+        # A missing flag is treated as "already consumed".
+        consumed = bool(getattr(instance, "_otel_oh_step_consumed", True))
+        reused_warmup = False
+        is_new_span = False
+        if existing_step is not None and not consumed:
+            span = existing_step
+            round_num = int(getattr(instance, "_otel_oh_round", 1) or 1)
+            reused_warmup = True
+            try:
+                setattr(instance, "_otel_oh_step_consumed", True)
+            except Exception:
+                pass
+        else:
+            # Close any still-open consumed STEP from the previous round
+            # before opening a new one.
+            _close_open_step(instance)
+            # Tentative round number — only committed if body does work.
+            round_num = int(getattr(instance, "_otel_oh_round", 0) or 0) + 1
+
+            # Open the new STEP as a child of AGENT. Prefer the explicit
+            # AGENT context (more reliable than relying on contextvars
+            # propagation across asyncio task / thread boundaries).
+            agent_ctx = getattr(instance, _AGENT_CTX_ATTR, None)
+            if agent_ctx is None and sid:
+                agent_ctx = get_context(sid)
+            try:
+                span = self._tracer.start_span(
+                    "react step",
+                    kind=SpanKind.INTERNAL,
+                    context=agent_ctx,
+                )
+            except Exception:
+                # Fall back to current-context-based parenting if explicit
+                # context= isn't accepted (older OTel SDKs).
+                with AttachedSession(sid):
+                    span = self._tracer.start_span(
+                        "react step", kind=SpanKind.INTERNAL
+                    )
+            _set_common(span, "STEP")
+            span.set_attribute(GenAI.GEN_AI_OPERATION_NAME, "react")
+            span.set_attribute(OH_REACT_ROUND, round_num)
+            span.set_attribute(GenAI.GEN_AI_AGENT_NAME, safe_str(agent_name))
+            if sid:
+                span.set_attribute(GEN_AI_SESSION_ID, sid)
+                span.set_attribute(GEN_AI_CONVERSATION_ID, sid)
+                span.set_attribute(GEN_AI_AGENT_ID, sid)
+            is_new_span = True
+            try:
+                setattr(instance, _STEP_SPAN_ATTR, span)
+                setattr(instance, "_otel_oh_step_consumed", True)
+            except Exception:
+                # The span cannot be parked on the controller, so nothing
+                # could close it later: end it now and run untraced.
+                try:
+                    span.end()
+                except Exception:
+                    pass
+                return await wrapped(*args, **kwargs)
+
+        # Capture INPUT: messages going into this step.
+        try:
+            state = safe_get_attr(instance, "state")
+            history = safe_get_attr(state, "history") or []
+            if isinstance(history, list):
+                span.set_attribute(OH_HISTORY_LENGTH, len(history))
+            input_messages = _state_to_input_messages(state)
+            if input_messages:
+                _set_io(
+                    span,
+                    input_value=input_messages,
+                    input_messages=input_messages,
+                )
+        except Exception:
+            pass
+
+        # Build the STEP context object. Cross-thread propagation goes
+        # through this Context object stashed in session_context (TOOL /
+        # LLM wrappers re-attach it inside their own scopes with paired
+        # attach/detach so no token ever crosses a context boundary).
+        step_ctx = set_span_in_context(span)
+        if sid:
+            store_context(sid, step_ctx)
+
+        # Snapshot pre-body state so we can detect "empty" body that
+        # got through ``_will_step_be_noop`` (e.g. ``state_tracker``
+        # raised inside ``_step``, ``_is_stuck`` early-returned, or
+        # ``agent.step`` returned ``None`` / raised handled error).
+        pre_history_len, pre_pending_id = self._snapshot_for_work_detection(
+            instance
+        )
+
+        # Attach STEP for the *body's* contextvars propagation only.
+        # Both attach and the matching detach happen in this coroutine's
+        # own context, so the Aliyun SDK's strict token check is happy.
+        step_token = otel_context.attach(step_ctx)
+        body_error: BaseException | None = None
+        try:
+            result = await wrapped(*args, **kwargs)
+        except BaseException as exc:
+            # Deferred: handled (and re-raised) below, after the detach.
+            body_error = exc
+        finally:
+            try:
+                otel_context.detach(step_token)
+            except Exception:
+                pass
+
+        if body_error is not None:
+            try:
+                span.set_attribute(
+                    "gen_ai.react.finish_reason", type(body_error).__qualname__
+                )
+                span.record_exception(body_error)
+                span.set_status(
+                    Status(StatusCode.ERROR, type(body_error).__qualname__)
+                )
+            except Exception:
+                pass
+            # On error, close STEP now so the failure surfaces cleanly
+            # rather than waiting for the next _step / controller close.
+            _close_open_step(instance)
+            # Make sure the round counter we *tentatively* assigned for
+            # this STEP gets committed so subsequent rounds renumber
+            # past it instead of overlapping.
+            if is_new_span:
+                try:
+                    instance._otel_oh_round = round_num
+                except Exception:
+                    pass
+            raise body_error
+
+        # Detect post-body "empty" STEP — the wrapper passed the
+        # ``_will_step_be_noop`` pre-check but the body still produced
+        # zero observable work (no new history events, no new pending
+        # action). The user has explicitly asked us not to clutter the
+        # trace with sub-millisecond placeholder STEP spans, so:
+        #
+        # * If we *opened* a fresh span this round, end it immediately,
+        #   mark it ``openhands.step.empty=true``, and DO NOT bump the
+        #   committed round counter. Next real _step opens a fresh STEP
+        #   with the same round number — the empty span still appears
+        #   in the trace (we have no way to suppress export from inside
+        #   a wrapper), but with a clear ``empty=true`` marker so it's
+        #   trivially filterable in the dashboard.
+        # * If we *reused* a warmup / persisted STEP that was already
+        #   meaningful (had earlier RECALL/TOOL children), keep it open
+        #   and don't mark it empty — the children give it value.
+        post_history_len, post_pending_id = self._snapshot_for_work_detection(
+            instance
+        )
+        # "Work" means the history grew or a different pending-action
+        # object is now queued.
+        did_work = (
+            post_history_len > pre_history_len
+            or (post_pending_id is not None and post_pending_id != pre_pending_id)
+        )
+
+        if not did_work and is_new_span:
+            try:
+                span.set_attribute("openhands.step.empty", True)
+                span.set_attribute(
+                    "gen_ai.react.finish_reason", "noop_step_body"
+                )
+                span.end()
+            except Exception:
+                pass
+            # Forget this empty STEP so the next _step opens a fresh one
+            # without trying to close-or-reuse this one.
+            try:
+                if getattr(instance, _STEP_SPAN_ATTR, None) is span:
+                    setattr(instance, _STEP_SPAN_ATTR, None)
+            except Exception:
+                pass
+            try:
+                # Roll back to the previous committed round (don't
+                # advance the counter for an empty STEP).
+                instance._otel_oh_round = round_num - 1
+                instance._otel_oh_step_consumed = True
+            except Exception:
+                pass
+            # Restore session stash to AGENT so subsequent TOOLs land
+            # under AGENT (not under a now-ended STEP).
+            if sid:
+                agent_ctx = getattr(instance, _AGENT_CTX_ATTR, None)
+                if agent_ctx is not None:
+                    try:
+                        store_context(sid, agent_ctx)
+                    except Exception:
+                        pass
+            return result
+
+        # Body did work — commit the round counter (we only update it
+        # *after* we're sure the STEP is meaningful).
+        if is_new_span:
+            try:
+                instance._otel_oh_round = round_num
+            except Exception:
+                pass
+
+        # Capture OUTPUT: the freshly-decided pending action.
+        # NOTE(review): this reads the ``_pending_action`` property, which
+        # the ``_will_step_be_noop`` docstring describes as having logging
+        # side effects — confirm that is acceptable on this path.
+        try:
+            pending = getattr(instance, "_pending_action", None)
+            state = safe_get_attr(instance, "state")
+            agent_state = safe_get_attr(state, "agent_state")
+            if agent_state is not None:
+                span.set_attribute(
+                    OH_AGENT_STATE,
+                    safe_get_attr(agent_state, "value")
+                    or safe_str(agent_state),
+                )
+            if pending is not None:
+                action_type = _action_type_value(pending)
+                if action_type:
+                    span.set_attribute(OH_ACTION_TYPE, action_type)
+                out = action_to_genai_output(pending)
+                if out:
+                    _set_io(span, output_value=out, output_messages=out)
+        except Exception:
+            pass
+
+        # Mirror the latest history snapshot back up to the AGENT span
+        # so AGENT's GenAI message attributes stay current during the run
+        # (not just at close-time). Downstream dashboards may read AGENT
+        # before the controller actually closes.
+        try:
+            agent_span = getattr(instance, _AGENT_SPAN_ATTR, None)
+            if agent_span is not None:
+                _capture_agent_io_attributes(
+                    agent_span, instance, agent, safe_get_attr(instance, "state")
+                )
+        except Exception:
+            pass
+
+        # Mark the warmup STEP (round 1) the moment we know it carries
+        # real work — it now contains LLM/TOOL children and matters.
+        if reused_warmup:
+            try:
+                span.set_attribute("openhands.step.warmup_consumed", True)
+            except Exception:
+                pass
+
+        # STEP span stays open here — it lives until the next _step (or
+        # AgentController.close) ends it. Until then any TOOL fired by
+        # Runtime.run_action on a thread-pool worker will re-attach the
+        # STEP context object stashed above and become its child.
+        return result
+
+
+# ---------------------------------------------------------------------------
+# TOOL: Runtime.run_action
+# ---------------------------------------------------------------------------
+
+
+# Maps a canonical action-type string (as produced by
+# ``_action_type_value``) to the tool name reported for it.
+# ``_extract_tool_name`` consults it when the action carries no LLM
+# function name, and ``_is_real_tool_call`` treats its keys as the set of
+# action types that get a TOOL span even without ``tool_call_metadata``.
+_TOOL_KIND_TO_NAME: dict[str, str] = {
+    "run": "bash",
+    "run_ipython": "ipython",
+    "browse_interactive": "browser",
+    "browse": "browser",
+    "edit": "str_replace_editor",
+    "read": "file_read",
+    "write": "file_write",
+    "delegate": "delegate",
+    "finish": "finish",
+    "think": "think",
+    "task_tracking": "task_tracker",
+    "mcp": "mcp",
+    "send_message": "send_message",
+    # ``recall`` is a real (non-LLM-initiated) tool: the controller posts
+    # a RecallAction and the memory subsystem runs it just like any other
+    # action via ``Runtime.run_action``. Worth a TOOL span.
+    "recall": "recall",
+}
+
+# Action types that are *not* real tool calls — they're internal control
+# events posted by the controller / event-stream itself (system prompt,
+# user message, agent-state transition, no-ops). Emitting TOOL spans for
+# these clutters the trace tree and confuses the GenAI semconv (these
+# aren't things the LLM "called").
+#
+# ``_is_real_tool_call`` tests membership in this set before anything
+# else, so a match is dropped even when the action carries
+# ``tool_call_metadata``.
+_INTERNAL_ACTION_TYPES: frozenset[str] = frozenset(
+    {
+        "message",
+        "system",
+        "change_agent_state",
+        "agent_state_changed",
+        "null",
+        "noop",
+    }
+)
+
+
+def _action_type_value(action: Any) -> str:
+    """Return the lower-cased action-type string of ``action``.
+
+    ``action.action`` may be an enum-like object or a plain string.
+    OpenHands declares ``ActionType`` as ``class ActionType(str, Enum)``,
+    and ``str()`` of such a member yields ``'ActionType.MESSAGE'`` rather
+    than the ``'message'`` value needed for filtering and lookups, so:
+
+    * a missing ``action`` attribute gives ``""``;
+    * an object exposing ``.value`` gives that value, lower-cased;
+    * anything else is stringified and lower-cased, with a leading
+      ``"actiontype."`` qualifier removed when present.
+    """
+    declared = safe_get_attr(action, "action")
+    if declared is None:
+        return ""
+    enum_value = safe_get_attr(declared, "value")
+    if enum_value is None:
+        # Not enum-like: ``str(ActionType.MESSAGE)`` lower-cases to
+        # "actiontype.message", so drop that qualifier if it is there.
+        return safe_str(declared).lower().removeprefix("actiontype.")
+    return safe_str(enum_value).lower()
+
+
+def _is_real_tool_call(action: Any) -> bool:
+    """Decide whether ``action`` should get its own TOOL span.
+
+    The checks run in this order; the first one that applies wins:
+
+    * An action whose type is listed in ``_INTERNAL_ACTION_TYPES`` is
+      rejected outright, whether or not it carries
+      ``tool_call_metadata``. These are coordination signals
+      (``MessageAction``, ``SystemMessageAction``,
+      ``ChangeAgentStateAction`` and the like), not tool executions.
+    * Any other action that carries ``tool_call_metadata`` is accepted:
+      it was produced from an LLM ``tool_calls`` response.
+    * Without metadata, the action is accepted only when its type is a
+      key of ``_TOOL_KIND_TO_NAME``. This covers synthesized actions
+      such as ``RECALL`` that do not come from the LLM but are still
+      worth tracing. An action with no resolvable type is rejected.
+    """
+    kind = _action_type_value(action)
+    if kind and kind in _INTERNAL_ACTION_TYPES:
+        return False
+    if safe_get_attr(action, "tool_call_metadata") is None:
+        # No LLM tool-call metadata: fall back to the whitelist.
+        return bool(kind) and kind in _TOOL_KIND_TO_NAME
+    return True
+
+
+def _extract_tool_name(action: Any) -> tuple[str, str]:
+    """Resolve the tool name for ``action`` as ``(tool_name, action_type)``.
+
+    The function name stored on ``action.tool_call_metadata`` wins when
+    it is present and non-empty, since that is the name the LLM used.
+    Otherwise the canonical action type is translated through
+    ``_TOOL_KIND_TO_NAME``; an unmapped type is used as-is, and an empty
+    type becomes ``"agent.action"``.
+    """
+    kind = _action_type_value(action)
+    metadata = safe_get_attr(action, "tool_call_metadata")
+    llm_name = (
+        None if metadata is None else safe_get_attr(metadata, "function_name")
+    )
+    if llm_name:
+        return safe_str(llm_name), kind
+    if kind in _TOOL_KIND_TO_NAME:
+        return _TOOL_KIND_TO_NAME[kind], kind
+    return (kind or "agent.action"), kind
+
+
+def _extract_tool_call_id(action: Any) -> str:
+    """Return ``action.tool_call_metadata.tool_call_id`` as a string.
+
+    Gives ``""`` when the action has no ``tool_call_metadata``; a missing
+    or falsy id is passed through ``safe_str`` as an empty string.
+    """
+    metadata = safe_get_attr(action, "tool_call_metadata")
+    if metadata is not None:
+        call_id = safe_get_attr(metadata, "tool_call_id")
+        return safe_str(call_id or "")
+    return ""
+
+
+def _runtime_sid(instance: Any) -> str:
+    """Look up the session id for a Runtime instance, best effort.
+
+    ``instance.sid`` is preferred; when it is missing or falsy,
+    ``instance.event_stream.sid`` is tried. ``""`` means neither exists.
+    """
+    own_sid = safe_get_attr(instance, "sid")
+    if not own_sid:
+        stream = safe_get_attr(instance, "event_stream")
+        stream_sid = safe_get_attr(stream, "sid")
+        return safe_str(stream_sid) if stream_sid else ""
+    return safe_str(own_sid)
+
+
+class RuntimeRunActionWrapper:
+    """TOOL span around ``Runtime.run_action``.
+
+    Bridges the session context across worker threads, then opens a TOOL
+    span with GenAI tool-call attributes. Arguments are always recorded
+    in ``gen_ai.tool.call.arguments`` (``"{}"`` when none); results go to
+    ``gen_ai.tool.call.result``. No ``input.value`` / ``output.value``.
+
+    Instrumentation failures are contained: when no span can be started
+    the wrapped call runs untraced, and a failure while populating
+    attributes never prevents the wrapped call from running nor the span
+    from being ended.
+    """
+
+    __slots__ = ("_tracer",)
+
+    def __init__(self, tracer: Tracer):
+        self._tracer = tracer
+
+    def __call__(self, wrapped, instance, args, kwargs):
+        """Run ``wrapped`` inside a TOOL span.
+
+        Args:
+            wrapped: The original ``run_action`` callable.
+            instance: The object ``run_action`` is bound to (may be None).
+            args: Positional arguments of the call; ``args[0]`` is taken
+                as the action when present.
+            kwargs: Keyword arguments of the call; ``kwargs["action"]`` is
+                the action when no positional argument was given.
+
+        Returns:
+            Whatever ``wrapped`` returns. Exceptions raised by ``wrapped``
+            are recorded on the span and re-raised unchanged.
+        """
+        if not OTEL_INSTRUMENTATION_OPENHANDS_OUTER_SPANS:
+            return wrapped(*args, **kwargs)
+
+        action = args[0] if args else kwargs.get("action")
+        # Skip internal control events — system prompts, user messages,
+        # agent-state transitions etc. aren't tool calls and shouldn't
+        # appear as TOOL spans alongside the real ones.
+        if not _is_real_tool_call(action):
+            return wrapped(*args, **kwargs)
+
+        tool_name, action_type = _extract_tool_name(action)
+        sid = _runtime_sid(instance)
+
+        span = self._start_tool_span(tool_name, sid)
+        if span is None:
+            # Tracing is unavailable; never let that break the runtime.
+            return wrapped(*args, **kwargs)
+
+        # The TOOL span itself is parented *explicitly* via context= in
+        # ``_start_tool_span``. We additionally attach the session context
+        # throughout the wrapped call so any nested spans created by the
+        # runtime that go through the contextvars propagation path also
+        # inherit the right session — and attaching the TOOL span below
+        # makes it current so child spans nest under TOOL, not under its
+        # parent STEP.
+        with AttachedSession(sid):
+            token = None
+            try:
+                try:
+                    self._populate_tool_span(
+                        span, action, instance, tool_name, action_type, sid
+                    )
+                    token = otel_context.attach(set_span_in_context(span))
+                except Exception as exc:
+                    logger.debug(
+                        "OpenHands instrumentation: TOOL span setup: %s", exc
+                    )
+                try:
+                    observation = wrapped(*args, **kwargs)
+                except BaseException as exc:
+                    # Guard the bookkeeping so it can never mask ``exc``.
+                    try:
+                        span.record_exception(exc)
+                        span.set_status(
+                            Status(StatusCode.ERROR, type(exc).__qualname__)
+                        )
+                    except Exception:
+                        pass
+                    raise
+                try:
+                    _annotate_observation(span, observation)
+                except Exception:
+                    pass
+                return observation
+            finally:
+                if token is not None:
+                    try:
+                        otel_context.detach(token)
+                    except Exception:
+                        pass
+                span.end()
+
+    def _start_tool_span(self, tool_name: str, sid: str):
+        """Start the TOOL span, or return None when that is impossible.
+
+        First tries the session-stashed context (STEP if a step is open,
+        AGENT otherwise) as the *explicit* parent — more robust than
+        relying on contextvars propagation across worker threads. If that
+        fails, retries with the session attached implicitly.
+        """
+        span_name = f"execute_tool {tool_name}"
+        try:
+            return self._tracer.start_span(
+                span_name,
+                kind=SpanKind.INTERNAL,
+                context=get_context(sid),
+            )
+        except Exception:
+            pass
+        try:
+            with AttachedSession(sid):
+                return self._tracer.start_span(
+                    span_name, kind=SpanKind.INTERNAL
+                )
+        except Exception as exc:
+            logger.debug(
+                "OpenHands instrumentation: failed to start TOOL span: %s",
+                exc,
+            )
+            return None
+
+    @staticmethod
+    def _populate_tool_span(
+        span, action, instance, tool_name, action_type, sid
+    ) -> None:
+        """Set the GenAI / OpenHands attributes on a freshly started TOOL span."""
+        # ARMS GenAI semconv (Tool):
+        #   gen_ai.span.kind=TOOL, gen_ai.operation.name=execute_tool,
+        #   gen_ai.tool.name, gen_ai.tool.type
+        #   gen_ai.tool.call.id, gen_ai.tool.description    [recommended]
+        #   gen_ai.tool.call.arguments  (always emitted by this wrapper)
+        #   gen_ai.tool.call.result     (set after the call returns)
+        _set_common(span, "TOOL")
+        span.set_attribute(
+            GenAI.GEN_AI_OPERATION_NAME,
+            GenAI.GenAiOperationNameValues.EXECUTE_TOOL.value,
+        )
+        span.set_attribute(GenAI.GEN_AI_TOOL_NAME, tool_name)
+        span.set_attribute(GenAI.GEN_AI_TOOL_TYPE, "function")
+        tool_call_id = _extract_tool_call_id(action)
+        if tool_call_id:
+            span.set_attribute(GEN_AI_TOOL_CALL_ID, tool_call_id)
+        if action_type:
+            span.set_attribute(OH_ACTION_TYPE, action_type)
+        if instance is not None:
+            span.set_attribute(
+                OH_RUNTIME_NAME,
+                f"{type(instance).__module__}.{type(instance).__name__}",
+            )
+        if sid:
+            span.set_attribute(GEN_AI_SESSION_ID, sid)
+            span.set_attribute(GEN_AI_CONVERSATION_ID, sid)
+
+        # gen_ai.tool.description — looked up via the per-sid registry.
+        try:
+            tool_def = get_tool_definition(sid, tool_name)
+            if tool_def is not None:
+                if isinstance(tool_def, dict):
+                    fn = tool_def.get("function") or {}
+                    desc = (
+                        fn.get("description") if isinstance(fn, dict) else None
+                    )
+                else:
+                    fn = safe_get_attr(tool_def, "function")
+                    desc = safe_get_attr(fn, "description")
+                if desc:
+                    span.set_attribute(GEN_AI_TOOL_DESCRIPTION, safe_str(desc))
+        except Exception:
+            pass
+
+        # gen_ai.tool.call.arguments — always emit (empty object as "{}").
+        # No OpenInference input.value / output.value on TOOL spans.
+        try:
+            args_json = to_json_str(_tool_call_arguments(action)) or "{}"
+        except Exception:
+            args_json = "{}"
+        span.set_attribute(GEN_AI_TOOL_CALL_ARGUMENTS, args_json)
+
+        # The preview attribute is handled separately so a failure here
+        # cannot overwrite the arguments recorded just above.
+        try:
+            preview_field, preview_text = _first_preview_field(action)
+            if preview_text:
+                span.set_attribute(
+                    f"openhands.action.{preview_field}", preview_text
+                )
+        except Exception:
+            pass
+
+
+def _first_preview_field(action: Any) -> tuple[str, str]:
+    """Pick the first non-empty preview attribute off ``action``.
+
+    Attributes are probed in the order ``command``, ``code``, ``path``,
+    ``url``, ``content``. Returns ``(attribute_name, stringified_value)``
+    for the first truthy one, or ``("", "")`` when none is set.
+    """
+    probes = (
+        (name, safe_get_attr(action, name))
+        for name in ("command", "code", "path", "url", "content")
+    )
+    # ``probes`` is lazy, so later attributes are only read when needed.
+    for name, value in probes:
+        if value:
+            return name, safe_str(value)
+    return "", ""
+
+
+# Attribute names read off an Action object as the last-resort source of
+# tool-call arguments (the fallback in ``_tool_call_arguments``). The
+# order here is the key order of the resulting arguments dict.
+_TOOL_ARG_FIELDS: tuple[str, ...] = (
+    "command",
+    "code",
+    "path",
+    "url",
+    "content",
+    "task_list",
+    "name",
+    "arguments",
+    "thought",
+    "is_input",
+    "blocking",
+    "keep_prompt",
+    "translated_ipython_code",
+    "browser_actions",
+    "agent_state",
+    "outputs",
+    "final_thought",
+    "old_str",
+    "new_str",
+    "view_range",
+    "file_text",
+    "insert_line",
+    "start_line",
+    "end_line",
+)
+
+
+def _coerce_tool_arguments(value: Any) -> dict[str, Any]:
+    """Turn a raw tool-call ``arguments`` value into a dict.
+
+    * ``None``, ``""``, ``[]`` and ``{}`` become an empty dict.
+    * A dict is returned as-is.
+    * A string is parsed as JSON: a JSON object is returned directly, any
+      other JSON value is wrapped as ``{"value": parsed}``, and text that
+      is not valid JSON is wrapped as ``{"raw": text}``.
+    * Anything else is wrapped as ``{"value": value}``.
+    """
+    if value in (None, "", [], {}):
+        return {}
+    if isinstance(value, dict):
+        return value
+    if not isinstance(value, str):
+        return {"value": value}
+    try:
+        decoded = json.loads(value)
+    except Exception:
+        return {"raw": value}
+    return decoded if isinstance(decoded, dict) else {"value": decoded}
+
+
+def _tool_call_arguments(action: Any) -> dict[str, Any]:
+    """Build the bare arguments dict for ``gen_ai.tool.call.arguments``.
+
+    Per ARMS GenAI semconv the value is a JSON string of *just* the call
+    arguments — e.g. ``{"location": "San Francisco", "date": "2025-10-01"}``
+    — not the wrapping ``{"tool": ..., "arguments": ...}`` envelope.
+
+    Sources are tried in this order, the first non-empty one winning:
+
+    1. ``action.tool_call_metadata.arguments``;
+    2. the matching tool call inside
+       ``action.tool_call_metadata.model_response`` (matched on
+       ``tool_call_id`` when one is known);
+    3. the ``_TOOL_ARG_FIELDS`` attributes found on the action itself.
+    """
+    if action is None:
+        return {}
+
+    metadata = safe_get_attr(action, "tool_call_metadata")
+    if metadata is not None:
+        from_metadata = _coerce_tool_arguments(
+            safe_get_attr(metadata, "arguments")
+        )
+        if from_metadata:
+            return from_metadata
+
+    response = safe_get_attr(metadata, "model_response") if metadata else None
+    if response is not None:
+        try:
+            raw_choices = (
+                response.choices if hasattr(response, "choices") else None
+            )
+            for choice in raw_choices or []:
+                # Choices / messages / tool calls may be objects or dicts.
+                message = getattr(choice, "message", None)
+                if not message:
+                    message = (
+                        choice.get("message")
+                        if isinstance(choice, dict)
+                        else None
+                    )
+                calls = (
+                    getattr(message, "tool_calls", None)
+                    if message is not None
+                    else None
+                )
+                if not calls:
+                    calls = (
+                        message.get("tool_calls")
+                        if isinstance(message, dict)
+                        else None
+                    )
+                if not calls:
+                    continue
+                wanted_id = safe_str(
+                    safe_get_attr(metadata, "tool_call_id") or ""
+                )
+                for call in calls:
+                    call_is_dict = isinstance(call, dict)
+                    call_id = (
+                        call.get("id")
+                        if call_is_dict
+                        else getattr(call, "id", None)
+                    )
+                    if wanted_id and safe_str(call_id) != wanted_id:
+                        continue
+                    function = (
+                        call.get("function")
+                        if call_is_dict
+                        else getattr(call, "function", None)
+                    )
+                    if isinstance(function, dict):
+                        raw_arguments = function.get("arguments")
+                    else:
+                        raw_arguments = getattr(function, "arguments", None)
+                    coerced = _coerce_tool_arguments(raw_arguments)
+                    if coerced:
+                        return coerced
+        except Exception:
+            pass
+
+    # Fallback: harvest known argument-bearing fields off the Action object.
+    harvested = (
+        (name, safe_get_attr(action, name)) for name in _TOOL_ARG_FIELDS
+    )
+    return {
+        name: value
+        for name, value in harvested
+        if value not in (None, "", [], {})
+    }
+
+
+def _observation_to_result(observation: Any) -> dict[str, Any]:
+    """Collect the observation fields reported in ``gen_ai.tool.call.result``.
+
+    Only a fixed set of attributes is considered, and any whose value is
+    ``None``, ``""``, ``[]`` or ``{}`` is left out. A ``None`` observation
+    yields an empty dict.
+    """
+    if observation is None:
+        return {}
+    result_fields = (
+        "content",
+        "exit_code",
+        "error",
+        "interpreter_details",
+        "command",
+        "stdout",
+        "stderr",
+        "url",
+        "screenshot",
+        "outputs",
+    )
+    pairs = ((name, safe_get_attr(observation, name)) for name in result_fields)
+    return {
+        name: value
+        for name, value in pairs
+        if value not in (None, "", [], {})
+    }
+
+
+def _annotate_observation(span: trace_api.Span, observation: Any) -> None:
+    """Record the outcome of a tool call on its TOOL span.
+
+    Sets the observation type, the exit code (when it converts to an
+    int), the error attribute, and the JSON-encoded
+    ``gen_ai.tool.call.result``. A non-zero exit code or a truthy
+    ``error`` marks the span as ERROR.
+
+    Args:
+        span: The TOOL span to annotate.
+        observation: The value returned by the wrapped ``run_action``;
+            ``None`` leaves the span untouched.
+    """
+    if observation is None:
+        return
+    obs_type = safe_str(
+        safe_get_attr(observation, "observation") or type(observation).__name__
+    )
+    if obs_type:
+        span.set_attribute(OH_OBSERVATION_TYPE, obs_type)
+
+    exit_status_set = False
+    exit_code = safe_get_attr(observation, "exit_code")
+    if exit_code is not None:
+        try:
+            ec = int(exit_code)
+            span.set_attribute("openhands.action.exit_code", ec)
+            if ec != 0:
+                span.set_status(Status(StatusCode.ERROR, f"exit_code={ec}"))
+                exit_status_set = True
+        except (TypeError, ValueError):
+            pass
+
+    error = safe_get_attr(observation, "error")
+    if error:
+        span.set_attribute("openhands.observation.error", safe_str(error))
+        # NOTE(review): some observation types appear to expose ``error``
+        # as a plain boolean flag rather than a message — confirm against
+        # the Observation classes. A bare ``True`` carries no detail, so
+        # don't let it replace the more useful ``exit_code=N`` description
+        # set above.
+        if not (isinstance(error, bool) and exit_status_set):
+            span.set_status(Status(StatusCode.ERROR, safe_str(error)))
+
+    # TOOL spans do not emit OpenInference output.value; the result lives in
+    # the GenAI tool-call result attribute.
+    try:
+        result_payload = _observation_to_result(observation)
+        result_payload.setdefault("observation", obs_type)
+        out = to_json_str(result_payload)
+        if out:
+            span.set_attribute(GEN_AI_TOOL_CALL_RESULT, out)
+    except Exception:
+        pass
+
+
+# ---------------------------------------------------------------------------
+# ENTRY + AGENT (controller-lifecycle bound)
+#
+# Why this exists in addition to RunControllerWrapper / RunAgentUntilDoneWrapper:
+#
+# When OpenHands V0 is launched via ``python -m openhands.core.main``, Python
+# executes ``main.py`` *as ``__main__``*. The ``from openhands.core.loop
+# import run_agent_until_done`` (and other from-imports) at the top of
+# ``main.py`` bind those symbols into ``__main__``'s namespace **before**
+# our instrumentor patches ``openhands.core.main.run_controller`` /
+# ``openhands.core.loop.run_agent_until_done``. The ``__main__`` block's
+# ``asyncio.run(run_controller(...))`` call uses the *unpatched* local
+# reference, so the wrappers above never fire — and the trace appears
+# without an ENTRY span.
+#
+# STEP / TOOL spans work because ``_step`` and ``run_action`` are *class
+# methods*: patching ``AgentController._step`` updates the class object
+# that both ``__main__.AgentController`` and
+# ``openhands.controller.agent_controller.AgentController`` reference, so
+# every method lookup at call time finds the wrapped version.
+#
+# ENTRY+AGENT here exploit the same principle — they hook
+# ``AgentController.__init__`` and ``AgentController.close``, both class
+# methods, so the spans bracket the controller's lifecycle reliably no
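+# matter how ``run_controller`` was invoked. When a session context is
+# already stashed for this sid (i.e. ``RunControllerWrapper`` fired
+# successfully — the API/test-suite code path) only the ENTRY span is
+# skipped: AGENT and the warmup STEP are still opened, parented under
+# the stashed context.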
+# ---------------------------------------------------------------------------
+
+
+def _capture_agent_io_attributes(
+    span: trace_api.Span, controller: Any, agent: Any, state: Any
+) -> None:
+    """Record system instructions and input/output messages on the AGENT span.
+
+    Populates ``gen_ai.system_instructions``, ``gen_ai.input.messages`` and
+    ``gen_ai.output.messages`` following the ARMS GenAI semconv schema.
+    Each attribute is only written when its JSON payload is non-empty, and
+    every failure is swallowed — this is best-effort enrichment.
+    ``controller`` is accepted but not read here.
+    """
+    try:
+        instructions = _agent_to_system_instructions(agent, state)
+        encoded = to_json_str(instructions) if instructions else None
+        if encoded:
+            span.set_attribute(GEN_AI_SYSTEM_INSTRUCTIONS, encoded)
+    except Exception:
+        pass
+
+    try:
+        events = safe_get_attr(state, "history") or []
+        if not (isinstance(events, list) and events):
+            return
+        # Input messages first, then output messages.
+        inbound = _history_to_input_messages_schema(events)
+        if inbound:
+            encoded = to_json_str(inbound)
+            if encoded:
+                span.set_attribute(GEN_AI_INPUT_MESSAGES, encoded)
+        outbound = _history_to_output_messages_schema(events)
+        if outbound:
+            encoded = to_json_str(outbound)
+            if encoded:
+                span.set_attribute(GEN_AI_OUTPUT_MESSAGES, encoded)
+    except Exception:
+        pass
+
+
+def _open_entry_and_agent_for_controller(
+    tracer: Tracer, controller: Any
+) -> None:
+    """Open ENTRY (parent) + AGENT (child) + warmup STEP for ``controller``.
+
+    Opening a *warmup STEP* (round 1) right after AGENT means that any
+    pre-step actions like RECALL — which are dispatched to the runtime
+    *before* the first ``_step`` invocation — become children of STEP 1
+    instead of dangling siblings under AGENT. The first real ``_step``
+    call detects that the warmup STEP isn't yet "consumed" and reuses
+    it (without bumping the round counter) so the LLM call + first
+    LLM-driven tool also nest under STEP 1.
+
+    All inner span creations use the explicit ``context=`` argument
+    (instead of relying on ``contextvars`` propagation through
+    ``otel_context.attach``) — this is the most deterministic way to
+    parent a child span and avoids the entire class of "Token was
+    created in a different Context" failures we used to chase across
+    asyncio-task / thread boundaries.
+
+    Idempotent on ``_OWNS_FLAG`` — safe to call multiple times for the
+    same controller. The session stash is consulted only to decide
+    whether ENTRY has to be created here: when a context is already
+    stashed for the sid, AGENT is parented under it and no ENTRY span is
+    opened. AGENT and the warmup STEP are always opened, because under
+    ``python -m openhands.core.main`` the from-import binding bypasses
+    ``RunControllerWrapper`` and ``RunAgentUntilDoneWrapper``, making the
+    init wrapper the only reliable source of those spans.
+
+    On failure every span started here is ended and the session stash is
+    put back to what it was on entry, so no later TOOL/STEP span is
+    parented under an already-ended span.
+
+    Args:
+        tracer: Tracer used to start the spans.
+        controller: The controller instance; spans and bookkeeping are
+            stored on it as attributes for the close path to pick up.
+    """
+    if not OTEL_INSTRUMENTATION_OPENHANDS_OUTER_SPANS:
+        return
+    if getattr(controller, _OWNS_FLAG, False):
+        # Already opened (e.g. RunControllerWrapper fired first) — log
+        # and bail. We don't want to double-emit ENTRY/AGENT.
+        logger.debug(
+            "OpenHands instrumentation: ENTRY+AGENT already open on "
+            "controller %s — skipping init-wrapper open",
+            id(controller),
+        )
+        return
+
+    sid = safe_str(safe_get_attr(controller, "id") or "")
+    agent = safe_get_attr(controller, "agent")
+    agent_name = safe_get_attr(agent, "name") or "codeact"
+    agent_class = (
+        f"{type(agent).__module__}.{type(agent).__name__}" if agent else ""
+    )
+    llm = safe_get_attr(agent, "llm")
+    llm_config = safe_get_attr(llm, "config")
+    model = safe_get_attr(llm_config, "model") or safe_get_attr(llm, "model")
+
+    # Remember what was stashed before we touch anything so the failure
+    # paths below can put it back.
+    stashed_ctx = get_context(sid)
+
+    def _restore_session_stash() -> None:
+        """Best-effort: reset the per-sid stash to its state on entry."""
+        if not sid:
+            return
+        try:
+            if stashed_ctx is not None:
+                store_context(sid, stashed_ctx)
+            else:
+                clear_context(sid)
+        except Exception:
+            pass
+
+    # ----- ENTRY -----
+    # If RunControllerWrapper already stashed an ENTRY context, parent AGENT
+    # directly under it. Otherwise create the lifecycle-owned ENTRY here.
+    entry: trace_api.Span | None = None
+    entry_ctx = stashed_ctx
+    if entry_ctx is None:
+        try:
+            entry = tracer.start_span("enter openhands", kind=SpanKind.INTERNAL)
+        except Exception as exc:
+            logger.error(
+                "OpenHands instrumentation: failed to start ENTRY span for "
+                "sid=%r: %s",
+                sid,
+                exc,
+                exc_info=True,
+            )
+            return
+
+        try:
+            _set_common(entry, "ENTRY")
+            entry.set_attribute(GenAI.GEN_AI_OPERATION_NAME, "enter")
+            if sid:
+                entry.set_attribute(GEN_AI_SESSION_ID, sid)
+                entry.set_attribute(GEN_AI_CONVERSATION_ID, sid)
+            if agent_class:
+                entry.set_attribute(OH_AGENT_NAME, agent_class)
+            if model:
+                entry.set_attribute(GEN_AI_REQUEST_MODEL, safe_str(model))
+            state = safe_get_attr(controller, "state")
+            entry_input_messages, _ = _entry_io_from_state(state)
+            if entry_input_messages:
+                _set_io(
+                    entry,
+                    input_value=entry_input_messages,
+                    input_messages=entry_input_messages,
+                )
+        except Exception as exc:
+            logger.debug("OpenHands instrumentation: ENTRY attr setup: %s", exc)
+
+        entry_ctx = set_span_in_context(entry)
+
+    # ----- AGENT (child of ENTRY) -----
+    # Pass ``context=entry_ctx`` *explicitly* so AGENT inherits ENTRY
+    # as parent regardless of what the surrounding contextvars look
+    # like (some 3rd-party SDKs reset contextvars between calls).
+    try:
+        agent_span = tracer.start_span(
+            f"invoke_agent {agent_name}",
+            kind=SpanKind.INTERNAL,
+            context=entry_ctx,
+        )
+    except Exception as exc:
+        logger.error(
+            "OpenHands instrumentation: failed to start AGENT span for "
+            "sid=%r: %s",
+            sid,
+            exc,
+            exc_info=True,
+        )
+        if entry is not None:
+            try:
+                entry.end()
+            except Exception:
+                pass
+        return
+
+    try:
+        _set_common(agent_span, "AGENT")
+        agent_span.set_attribute(
+            GenAI.GEN_AI_OPERATION_NAME,
+            GenAI.GenAiOperationNameValues.INVOKE_AGENT.value,
+        )
+        agent_span.set_attribute(GenAI.GEN_AI_AGENT_NAME, safe_str(agent_name))
+        if agent_class:
+            agent_span.set_attribute(OH_AGENT_NAME, agent_class)
+        if sid:
+            agent_span.set_attribute(GEN_AI_SESSION_ID, sid)
+            agent_span.set_attribute(GEN_AI_CONVERSATION_ID, sid)
+            agent_span.set_attribute(GEN_AI_AGENT_ID, sid)
+        if model:
+            agent_span.set_attribute(GEN_AI_REQUEST_MODEL, safe_str(model))
+    except Exception as exc:
+        logger.debug("OpenHands instrumentation: AGENT attr setup: %s", exc)
+
+    # Tool registry + gen_ai.tool.definitions — this path also needs the
+    # registry for downstream TOOL spans (gen_ai.tool.description lookup).
+    try:
+        tools = safe_get_attr(agent, "tools") or []
+        if sid:
+            store_tool_registry(sid, tools)
+        defs_summary: list[dict[str, Any]] = []
+        for t in tools:
+            # Tool definitions may be plain dicts or objects.
+            if isinstance(t, dict):
+                kind = t.get("type") or "function"
+                fn = t.get("function") or {}
+                name = fn.get("name") if isinstance(fn, dict) else None
+            else:
+                kind = safe_get_attr(t, "type") or "function"
+                fn = safe_get_attr(t, "function")
+                name = safe_get_attr(fn, "name")
+            if not name:
+                continue
+            item: dict[str, Any] = {"type": safe_str(kind), "name": safe_str(name)}
+            if isinstance(fn, dict):
+                desc = fn.get("description")
+                params = fn.get("parameters")
+            else:
+                desc = safe_get_attr(fn, "description")
+                params = safe_get_attr(fn, "parameters")
+            if desc:
+                item["description"] = safe_str(desc)
+            if params:
+                item["parameters"] = params
+            defs_summary.append(item)
+        if defs_summary:
+            agent_span.set_attribute(
+                GEN_AI_TOOL_DEFINITIONS, to_json_str(defs_summary)
+            )
+    except Exception:
+        pass
+
+    # Best-effort INPUT + system_instructions capture on AGENT at open
+    # time, so an in-flight read of the AGENT span (e.g. live dashboards)
+    # sees at least the system prompt + initial user message.
+    try:
+        state = safe_get_attr(controller, "state")
+        _capture_agent_io_attributes(agent_span, controller, agent, state)
+    except Exception as exc:
+        logger.debug(
+            "OpenHands instrumentation: AGENT initial I/O capture: %s", exc
+        )
+
+    agent_ctx = set_span_in_context(agent_span)
+    if sid:
+        # Stash ctx-with-AGENT so STEP / TOOL re-attach correctly even
+        # when fired from worker threads with brand-new asyncio loops.
+        store_context(sid, agent_ctx)
+
+    # ----- WARMUP STEP (round 1) -----
+    # Open right after AGENT so any pre-_step actions (RECALL, etc.) that
+    # the controller dispatches to the runtime become children of STEP 1
+    # rather than dangling siblings under AGENT. The first real ``_step``
+    # call detects this open STEP isn't yet "consumed" and reuses it
+    # (preserving the round number) so the LLM call + first LLM-driven
+    # tool also nest under STEP 1 — giving the trace tree:
+    #
+    #   ENTRY > AGENT > STEP 1 > [RECALL, LLM, execute_bash]
+    #                  STEP 2 > [LLM, finish]
+    #                  ...
+    warmup_step_ctx: object | None = None
+    warmup_step_span: trace_api.Span | None = None
+    try:
+        warmup_step_span = tracer.start_span(
+            "react step",
+            kind=SpanKind.INTERNAL,
+            context=agent_ctx,
+        )
+        _set_common(warmup_step_span, "STEP")
+        warmup_step_span.set_attribute(GenAI.GEN_AI_OPERATION_NAME, "react")
+        warmup_step_span.set_attribute(OH_REACT_ROUND, 1)
+        warmup_step_span.set_attribute(
+            GenAI.GEN_AI_AGENT_NAME, safe_str(agent_name)
+        )
+        if sid:
+            warmup_step_span.set_attribute(GEN_AI_SESSION_ID, sid)
+            warmup_step_span.set_attribute(GEN_AI_CONVERSATION_ID, sid)
+            warmup_step_span.set_attribute(GEN_AI_AGENT_ID, sid)
+        warmup_step_ctx = set_span_in_context(warmup_step_span)
+        if sid and warmup_step_ctx is not None:
+            store_context(sid, warmup_step_ctx)
+    except Exception as exc:
+        logger.debug("Failed to open warmup STEP span: %s", exc)
+        # The span may already have been started when setup failed: end
+        # it so it is not leaked, and point the stash back at AGENT.
+        if warmup_step_span is not None:
+            try:
+                warmup_step_span.end()
+            except Exception:
+                pass
+        warmup_step_span = None
+        if sid:
+            try:
+                store_context(sid, agent_ctx)
+            except Exception:
+                pass
+
+    # Stash everything we need to tear down in close().
+    try:
+        setattr(controller, _OWNS_FLAG, True)
+        setattr(controller, _ENTRY_SPAN_ATTR, entry)
+        setattr(controller, _AGENT_SPAN_ATTR, agent_span)
+        # Save the AGENT context so the STEP wrapper can restore the
+        # session stash to AGENT every time it closes a STEP — that way
+        # any TOOL fired between rounds re-attaches AGENT (not a closed
+        # STEP).
+        setattr(controller, _AGENT_CTX_ATTR, agent_ctx)
+        # Stash warmup STEP so the first real ``_step`` reuses it.
+        setattr(controller, _STEP_SPAN_ATTR, warmup_step_span)
+        setattr(
+            controller,
+            "_otel_oh_round",
+            1 if warmup_step_span is not None else 0,
+        )
+        setattr(controller, "_otel_oh_step_consumed", False)
+    except Exception:
+        # If we can't attach to the instance (slots, etc.), close the
+        # spans down so we don't leak them, and undo the stash so nothing
+        # gets parented under the spans we just ended.
+        if warmup_step_span is not None:
+            try:
+                warmup_step_span.end()
+            except Exception:
+                pass
+        try:
+            agent_span.end()
+        except Exception:
+            pass
+        if entry is not None:
+            try:
+                entry.end()
+            except Exception:
+                pass
+        _restore_session_stash()
+        return
+
+    # Log at INFO so the user can verify in their app logs that the
+    # ENTRY+AGENT spans were actually opened (and which trace/span IDs
+    # they got). When a user reports "no ENTRY span" in their backend,
+    # the first thing to check is whether this log line appeared.
+    try:
+        entry_sc = entry.get_span_context() if entry is not None else None
+        agent_sc = agent_span.get_span_context()
+        warmup_sc = (
+            warmup_step_span.get_span_context()
+            if warmup_step_span is not None
+            else None
+        )
+        logger.info(
+            "OpenHands instrumentation: opened ENTRY+AGENT for sid=%r "
+            "(trace_id=%032x entry_span=%016x agent_span=%016x "
+            "warmup_step=%s agent_name=%s model=%s)",
+            sid,
+            entry_sc.trace_id if entry_sc is not None else agent_sc.trace_id,
+            entry_sc.span_id if entry_sc is not None else 0,
+            agent_sc.span_id,
+            f"{warmup_sc.span_id:016x}" if warmup_sc is not None else "none",
+            agent_name,
+            model or "",
+        )
+    except Exception:
+        pass
+
+
+def _close_entry_and_agent_for_controller(
+    controller: Any, *, error: BaseException | None = None
+) -> None:
+    """Tear down the ENTRY+AGENT spans previously opened for ``controller``.
+
+    Also closes any STEP span left open from the last ``_step`` invocation
+    (STEP spans are intentionally persisted across the return of ``_step``
+    so that thread-pooled TOOL / LLM calls fire as their children).
+
+    Args:
+        controller: The controller whose spans were stored on it as
+            attributes when they were opened. Nothing happens unless its
+            ``_OWNS_FLAG`` attribute is truthy.
+        error: Optional exception; when given it is recorded on both the
+            AGENT and ENTRY spans and their status is set to ERROR with
+            the exception's qualified class name.
+    """
+    # Only controllers flagged as owning spans have anything to tear down.
+    if not getattr(controller, _OWNS_FLAG, False):
+        logger.debug(
+            "OpenHands instrumentation: close called on controller %s "
+            "without an open ENTRY/AGENT — nothing to do",
+            id(controller),
+        )
+        return
+    sid = safe_str(safe_get_attr(controller, "id") or "")
+    agent = safe_get_attr(controller, "agent")
+    state = safe_get_attr(controller, "state")
+    # ``entry_span`` may be None: ENTRY is only owned by the controller
+    # when it was created by the init-time open path.
+    entry_span: trace_api.Span | None = getattr(controller, _ENTRY_SPAN_ATTR, None)
+    agent_span: trace_api.Span | None = getattr(controller, _AGENT_SPAN_ATTR, None)
+    # Legacy slots — kept for back-compat with already-instrumented
+    # instances created before we stopped persisting attach-tokens.
+    # If they're set we simply ignore them (any detach attempt across
+    # asyncio task boundaries would raise ``ValueError`` in the Aliyun
+    # SDK; spans alone carry all the parentage info we need).
+    _ = getattr(controller, _AGENT_TOKEN_ATTR, None)
+    _ = getattr(controller, _ENTRY_TOKEN_ATTR, None)
+
+    # Close any STEP span still hanging from the last round before tearing
+    # down AGENT/ENTRY. Restores the session stash to AGENT context so any
+    # in-flight TOOL re-attaches AGENT (not a closed STEP).
+    try:
+        _close_open_step(controller)
+    except Exception:
+        pass
+
+    # Capture I/O attributes on the AGENT span before ending it.
+    if agent_span is not None:
+        try:
+            _capture_agent_io_attributes(agent_span, controller, agent, state)
+        except Exception:
+            pass
+        try:
+            history = safe_get_attr(state, "history") or []
+            if isinstance(history, list):
+                agent_span.set_attribute(OH_HISTORY_LENGTH, len(history))
+            agent_state = safe_get_attr(state, "agent_state")
+            if agent_state is not None:
+                # Prefer the ``.value`` of the state; fall back to its
+                # string form when that is missing or falsy.
+                agent_span.set_attribute(
+                    OH_AGENT_STATE,
+                    safe_get_attr(agent_state, "value") or safe_str(agent_state),
+                )
+        except Exception:
+            pass
+        if error is not None:
+            try:
+                agent_span.record_exception(error)
+                agent_span.set_status(
+                    Status(StatusCode.ERROR, type(error).__qualname__)
+                )
+            except Exception:
+                pass
+
+    # End AGENT (no detach — the token (if any) was attached in the
+    # ``__init__`` task's contextvars context and detaching here would
+    # cross a context boundary, raising ``ValueError`` in the Aliyun
+    # SDK. Legacy code may have set ``agent_token`` on older instances;
+    # we simply leave it alone — detaching is unnecessary because the
+    # span carries its own parentage and contextvars naturally unwind
+    # when the task that attached them exits).
+    if agent_span is not None:
+        try:
+            agent_span.end()
+        except Exception:
+            pass
+
+    # Mirror the most-useful bits onto ENTRY before closing it.
+    if entry_span is not None:
+        try:
+            agent_state = safe_get_attr(state, "agent_state")
+            if agent_state is not None:
+                entry_span.set_attribute(
+                    OH_AGENT_STATE,
+                    safe_get_attr(agent_state, "value") or safe_str(agent_state),
+                )
+            history = safe_get_attr(state, "history") or []
+            if isinstance(history, list):
+                entry_span.set_attribute(OH_HISTORY_LENGTH, len(history))
+            output_repr = _final_state_to_output(state)
+            entry_input_messages, entry_output_messages = _entry_io_from_state(
+                state
+            )
+            # ``output_value`` is only passed when a final output could be
+            # derived; the message lists are passed either way if present.
+            if output_repr:
+                _set_io(
+                    entry_span,
+                    output_value=output_repr,
+                    input_messages=entry_input_messages,
+                    output_messages=entry_output_messages,
+                )
+            elif entry_input_messages or entry_output_messages:
+                _set_io(
+                    entry_span,
+                    input_messages=entry_input_messages,
+                    output_messages=entry_output_messages,
+                )
+        except Exception:
+            pass
+        if error is not None:
+            try:
+                entry_span.record_exception(error)
+                entry_span.set_status(
+                    Status(StatusCode.ERROR, type(error).__qualname__)
+                )
+            except Exception:
+                pass
+
+    # Same as AGENT: end the span; never touch a possibly-leftover token
+    # from an older instrumentation run.
+    if entry_span is not None:
+        try:
+            entry_span.end()
+        except Exception:
+            pass
+
+    # Mirror the open-time INFO log so the user can confirm the spans
+    # actually closed and exported.
+    try:
+        agent_sc = (
+            agent_span.get_span_context() if agent_span is not None else None
+        )
+        entry_sc = (
+            entry_span.get_span_context() if entry_span is not None else None
+        )
+        logger.info(
+            "OpenHands instrumentation: closed ENTRY+AGENT for sid=%r "
+            "(entry_span=%s agent_span=%s rounds=%s error=%s)",
+            sid,
+            f"{entry_sc.span_id:016x}" if entry_sc is not None else "none",
+            f"{agent_sc.span_id:016x}" if agent_sc is not None else "none",
+            getattr(controller, "_otel_oh_round", 0),
+            type(error).__qualname__ if error is not None else "none",
+        )
+    except Exception:
+        pass
+
+    # Drop the per-sid stashed context now that its spans are ended.
+    if sid:
+        try:
+            clear_context(sid)
+        except Exception:
+            pass
+
+    # Wipe stash slots so a re-used controller instance doesn't double-emit.
+    for attr in (
+        _OWNS_FLAG,
+        _ENTRY_SPAN_ATTR,
+        _AGENT_SPAN_ATTR,
+        _ENTRY_TOKEN_ATTR,
+        _AGENT_TOKEN_ATTR,
+        _STEP_SPAN_ATTR,
+        _AGENT_CTX_ATTR,
+        "_otel_oh_step_consumed",
+        "_otel_oh_round",
+    ):
+        try:
+            setattr(controller, attr, None)
+        except Exception:
+            pass
+    try:
+        setattr(controller, _OWNS_FLAG, False)
+    except Exception:
+        pass
+
+
+class AgentControllerInitWrapper:
+    """Open ENTRY + AGENT spans at the end of ``AgentController.__init__``.
+
+    Always reliable under ``python -m openhands.core.main`` because it
+    hooks a class method (immune to from-import binding).
+    """
+
+    __slots__ = ("_tracer",)
+
+    def __init__(self, tracer: Tracer):
+        self._tracer = tracer
+
+    def __call__(self, wrapped, instance, args, kwargs):
+        """Run the real ``__init__`` first, then open spans best-effort."""
+        # No try/except here: whatever the constructor raises must reach
+        # the caller unchanged, and the span logic below is skipped.
+        result = wrapped(*args, **kwargs)
+        try:
+            # Skip delegate sub-controllers — they shouldn't open another
+            # ENTRY span; they live within the parent controller's trace.
+            is_delegate = bool(safe_get_attr(instance, "is_delegate"))
+            if is_delegate:
+                logger.debug(
+                    "OpenHands instrumentation: skipping delegate "
+                    "controller %s for ENTRY/AGENT",
+                    id(instance),
+                )
+            else:
+                _open_entry_and_agent_for_controller(self._tracer, instance)
+        except Exception as exc:
+            # Promote to ERROR — if this fails the user will see "no
+            # ENTRY span" in their backend and we want a loud signal in
+            # the app logs to point at the cause.
+            logger.error(
+                "OpenHands instrumentation: AgentController init wrapper "
+                "failed to open ENTRY/AGENT for controller %s: %s",
+                id(instance),
+                exc,
+                exc_info=True,
+            )
+        return result
+
+
+class AgentControllerCloseWrapper:
+    """End the ENTRY + AGENT spans previously opened in ``__init__``."""
+
+    __slots__ = ()
+
+    def __init__(self, _tracer: Tracer):
+        """Accept and ignore the tracer; closing only needs the stored spans."""
+
+    def __call__(self, wrapped, instance, args, kwargs):
+        """Return the coroutine that awaits ``wrapped`` and then ends spans."""
+        return self._impl(wrapped, instance, args, kwargs)
+
+    async def _impl(self, wrapped, instance, args, kwargs):
+        """Await ``wrapped``; end the spans whether it returns or raises."""
+        failure: BaseException | None = None
+        try:
+            return await wrapped(*args, **kwargs)
+        except BaseException as raised:
+            failure = raised
+            raise
+        finally:
+            try:
+                _close_entry_and_agent_for_controller(instance, error=failure)
+            except Exception as exc:
+                logger.error(
+                    "OpenHands instrumentation: AgentController close "
+                    "wrapper failed to end spans for controller %s: %s",
+                    id(instance),
+                    exc,
+                    exc_info=True,
+                )
+
+
+# ---------------------------------------------------------------------------
+# LLM context bridge: openhands.llm.llm.LLM.__init__
+# ---------------------------------------------------------------------------
+
+
+# Sentinel used to mark already-bridged completion callables so we don't
+# wrap them more than once if ``LLM.__init__`` runs again on the same
+# completion partial (e.g. live config reload).
+_LLM_BRIDGE_FLAG = "_otel_oh_ctx_bridged"
+
+
+class LLMInitWrapper:
+    """Make sure ``LLM.completion`` runs with the current STEP context attached.
+
+    Why this exists
+    ---------------
+    The LLM call inside ``AgentController._step`` is synchronous and *should*
+    inherit our STEP context via ``contextvars`` — but in real OpenHands
+    deployments LiteLLM ends up creating its span with a *different*
+    ``trace_id`` than the surrounding STEP/AGENT/ENTRY tree. Two known ways
+    that can happen:
+
+    * a 3rd-party auto-instrumentation injected before ours stashes the
+      LLM call onto a thread-pool worker (no contextvars propagation);
+    * the call is made from outside any of our wrappers (e.g. a condenser
+      / summarizer worker) where no OTel context is current.
+
+    The fix: at the end of ``LLM.__init__`` we monkey-patch ``self._completion``
+    with a tiny shim that re-attaches the latest sid-stashed context (which,
+    while a STEP is open, is the STEP context — see ``AgentControllerStepWrapper``).
+    The downstream ``opentelemetry-instrumentation-litellm`` (or the Aliyun
+    GenAI auto-instrumentation) will then create the LLM span as a child
+    of STEP and the ``trace_id`` finally lines up.
+    """
+
+    __slots__ = ("_tracer",)
+
+    def __init__(self, tracer: Tracer):
+        # Tracer is stored but never used here — we only re-attach an
+        # existing OTel context so the *real* LLM instrumentor (litellm /
+        # aliyun) emits the span under it. We don't create our own LLM span.
+        self._tracer = tracer
+
+    def __call__(self, wrapped, instance, args, kwargs):
+        result = wrapped(*args, **kwargs)
+        try:
+            self._patch_completion(instance)
+        except Exception as exc:
+            logger.debug("LLM init wrapper failed to bridge completion: %s", exc)
+        return result
+
+    @staticmethod
+    def _bridge(target: Any) -> Any:
+        """Return a flagged shim that calls ``target`` under the session context.
+
+        ``AttachedSession(None)`` re-attaches whatever context the most
+        recent v0 wrapper stashed (STEP if a step is open, AGENT
+        otherwise). When no OpenHands session is active the context
+        manager is a no-op.
+        """
+
+        def bridged(*a: Any, **kw: Any) -> Any:
+            with AttachedSession(None):
+                return target(*a, **kw)
+
+        # Plain functions always accept attributes, so no guard is needed.
+        setattr(bridged, _LLM_BRIDGE_FLAG, True)
+        return bridged
+
+    @staticmethod
+    def _patch_completion(instance: Any) -> None:
+        """Swap ``_completion`` / ``_completion_unwrapped`` for bridged shims.
+
+        Idempotent: a callable that already carries ``_LLM_BRIDGE_FLAG``
+        is never wrapped a second time.
+        """
+        completion = getattr(instance, "_completion", None)
+        if completion is None or getattr(completion, _LLM_BRIDGE_FLAG, False):
+            return
+        try:
+            instance._completion = LLMInitWrapper._bridge(completion)
+        except Exception:
+            return
+        # Mirror onto the unwrapped slot too — some OpenHands codepaths
+        # call ``_completion_unwrapped`` directly when retries are
+        # disabled, and we want them to inherit the same parent context.
+        unwrapped = getattr(instance, "_completion_unwrapped", None)
+        if unwrapped is None or getattr(unwrapped, _LLM_BRIDGE_FLAG, False):
+            return
+        try:
+            instance._completion_unwrapped = LLMInitWrapper._bridge(unwrapped)
+        except Exception:
+            pass
diff --git a/instrumentation-loongsuite/loongsuite-instrumentation-openhands/src/opentelemetry/instrumentation/openhands/package.py b/instrumentation-loongsuite/loongsuite-instrumentation-openhands/src/opentelemetry/instrumentation/openhands/package.py
new file mode 100644
index 000000000..6e3b6b925
--- /dev/null
+++ b/instrumentation-loongsuite/loongsuite-instrumentation-openhands/src/opentelemetry/instrumentation/openhands/package.py
@@ -0,0 +1 @@
+_instruments = ("openhands-ai >= 1.0.0",)
diff --git a/instrumentation-loongsuite/loongsuite-instrumentation-openhands/src/opentelemetry/instrumentation/openhands/version.py b/instrumentation-loongsuite/loongsuite-instrumentation-openhands/src/opentelemetry/instrumentation/openhands/version.py
new file mode 100644
index 000000000..3dc1f76bc
--- /dev/null
+++ b/instrumentation-loongsuite/loongsuite-instrumentation-openhands/src/opentelemetry/instrumentation/openhands/version.py
@@ -0,0 +1 @@
+__version__ = "0.1.0"
diff --git a/instrumentation-loongsuite/loongsuite-instrumentation-openhands/test-requirements.txt b/instrumentation-loongsuite/loongsuite-instrumentation-openhands/test-requirements.txt
new file mode 100644
index 000000000..b5c521bd2
--- /dev/null
+++ b/instrumentation-loongsuite/loongsuite-instrumentation-openhands/test-requirements.txt
@@ -0,0 +1,9 @@
+pytest>=7.0.0
+pytest-asyncio>=0.21.0
+wrapt>=1.0.0
+httpx>=0.24.0
+
+-e ./instrumentation-loongsuite/loongsuite-instrumentation-openhands
+-e ./opentelemetry-instrumentation
+-e ./opentelemetry-sdk
+-e ./opentelemetry-semantic-conventions
diff --git a/instrumentation-loongsuite/loongsuite-instrumentation-openhands/tests/__init__.py b/instrumentation-loongsuite/loongsuite-instrumentation-openhands/tests/__init__.py
new file mode 100644
index 000000000..e69de29bb
diff --git a/instrumentation-loongsuite/loongsuite-instrumentation-openhands/tests/conftest.py b/instrumentation-loongsuite/loongsuite-instrumentation-openhands/tests/conftest.py
new file mode 100644
index 000000000..2fc095575
--- /dev/null
+++ b/instrumentation-loongsuite/loongsuite-instrumentation-openhands/tests/conftest.py
@@ -0,0 +1,247 @@
+"""Shared pytest fixtures and stub modules for the OpenHands instrumentation.
+
+We deliberately don't require ``openhands-ai`` to be installed at test time:
+instead we register lightweight stub modules under the same dotted paths so
+``wrap_function_wrapper`` can patch them. The wrappers themselves only rely on
+the *call signatures* documented in ``execute.md`` — which we faithfully
+reproduce in the stubs.
+"""
+
+from __future__ import annotations
+
+import asyncio
+import sys
+import types
+from dataclasses import dataclass, field
+
+import pytest
+from opentelemetry import trace as trace_api
+from opentelemetry.sdk.trace import TracerProvider
+from opentelemetry.sdk.trace.export import SimpleSpanProcessor
+from opentelemetry.sdk.trace.export.in_memory_span_exporter import (
+    InMemorySpanExporter,
+)
+
+
+def _ensure_stub_module(name: str) -> types.ModuleType:
+    """Return ``sys.modules[name]``, registering empty stubs where missing."""
+    try:
+        return sys.modules[name]
+    except KeyError:
+        stub = sys.modules[name] = types.ModuleType(name)
+    package, _, leaf = name.rpartition(".")
+    if package:  # bind the stub on its (possibly also stubbed) parent
+        setattr(_ensure_stub_module(package), leaf, stub)
+    return stub
+
+
+def _install_v0_stub_modules() -> None:
+    """Install stub modules and classes for the V0 (Legacy CodeAct) hook points."""
+    _ensure_stub_module("openhands")
+    _ensure_stub_module("openhands.core")
+    main_mod = _ensure_stub_module("openhands.core.main")
+    loop_mod = _ensure_stub_module("openhands.core.loop")
+    _ensure_stub_module("openhands.controller")
+    ctrl_mod = _ensure_stub_module("openhands.controller.agent_controller")
+    _ensure_stub_module("openhands.runtime")
+    rt_base = _ensure_stub_module("openhands.runtime.base")
+
+    @dataclass
+    class _AgentState:
+        value: str = "finished"
+
+    @dataclass
+    class _State:
+        agent_state: _AgentState = field(default_factory=_AgentState)
+
+    @dataclass
+    class _LLMConfig:
+        model: str = "qwen3-coder-plus"
+
+    @dataclass
+    class _LLM:
+        config: _LLMConfig = field(default_factory=_LLMConfig)
+
+    @dataclass
+    class _Agent:
+        name: str = "CodeActAgent"
+        llm: _LLM = field(default_factory=_LLM)
+        # Mirrors litellm ChatCompletionToolParam dicts as produced by
+        # openhands.agenthub.codeact_agent.codeact_agent.CodeActAgent._get_tools.
+        tools: list = field(
+            default_factory=lambda: [
+                {
+                    "type": "function",
+                    "function": {
+                        "name": "execute_bash",
+                        "description": "Run a bash command on the runtime sandbox.",
+                        "parameters": {
+                            "type": "object",
+                            "properties": {
+                                "command": {"type": "string"},
+                            },
+                            "required": ["command"],
+                        },
+                    },
+                },
+            ]
+        )
+
+    class AgentController:
+        step_calls = 0
+        close_calls = 0
+
+        def __init__(self, agent=None, sid="sid-test"):
+            self.agent = agent or _Agent()
+            self.id = sid
+            self.state = _State()
+            self._pending_action = None
+            self.is_delegate = False
+
+        async def _step(self) -> None:
+            type(self).step_calls += 1
+            class _Pending:
+                action = "run"
+                command = "echo step"
+                thought = "trying"
+
+            self._pending_action = _Pending()
+
+        async def close(self, set_stop_state: bool = True) -> None:
+            type(self).close_calls += 1
+
+    ctrl_mod.AgentController = AgentController
+
+    class _ToolCallMetadata:
+        """Stand-in for :class:`openhands.events.tool.ToolCallMetadata`."""
+
+        def __init__(self, function_name="", tool_call_id="", arguments=None):
+            import json as _json
+
+            self.function_name = function_name
+            self.tool_call_id = tool_call_id
+
+            class _Fn:
+                def __init__(self, name, args):
+                    self.name = name
+                    self.arguments = _json.dumps(args or {})
+
+            class _TC:
+                def __init__(self, tcid, fn):
+                    self.id = tcid
+                    self.function = fn
+
+            class _Msg:
+                def __init__(self, tcs):
+                    self.tool_calls = tcs
+
+            class _Choice:
+                def __init__(self, msg):
+                    self.message = msg
+
+            class _ModelResp:
+                def __init__(self, choices):
+                    self.choices = choices
+
+            self.model_response = _ModelResp(
+                [_Choice(_Msg([_TC(tool_call_id, _Fn(function_name, arguments))]))]
+            )
+
+    class _Action:
+        def __init__(
+            self,
+            action_type="run",
+            command="echo hi",
+            tool_call_metadata=None,
+        ):
+            self.action = action_type
+            self.command = command
+            self.tool_call_metadata = tool_call_metadata
+
+    class _Observation:
+        def __init__(self, exit_code=0, content=""):
+            self.exit_code = exit_code
+            self.content = content
+            self.observation = "run"
+
+    class Runtime:
+        run_action_calls = 0
+        # Tests can override on the instance to drive observation values.
+        _next_observation: _Observation | None = None
+
+        def __init__(self, sid="sid-test"):
+            self.sid = sid
+
+        def run_action(self, action) -> _Observation:
+            type(self).run_action_calls += 1
+            obs = self._next_observation
+            if obs is not None:
+                self._next_observation = None
+                return obs
+            return _Observation(exit_code=0)
+
+    rt_base.Runtime = Runtime
+    rt_base.Action = _Action
+    rt_base.Observation = _Observation
+    rt_base.ToolCallMetadata = _ToolCallMetadata
+
+    @dataclass
+    class _State2:
+        agent_state: _AgentState = field(default_factory=lambda: _AgentState("finished"))
+
+    async def run_controller(
+        config=None,
+        initial_user_action=None,
+        sid: str | None = None,
+        **kwargs,
+    ):
+        if getattr(main_mod, "_test_raise_cancelled", False):
+            raise asyncio.CancelledError()
+        # Mirror real V0: invoke the agent loop *inside* run_controller so
+        # the AGENT span lives within the ENTRY span (and inherits its
+        # stashed OTel context). Tests can install
+        # ``main_mod._test_inner_args = (controller, runtime)`` to opt in.
+        inner_args = getattr(main_mod, "_test_inner_args", None)
+        if inner_args is not None:
+            controller, runtime = inner_args
+            await loop_mod.run_agent_until_done(controller, runtime, None, [])
+        return _State2()
+
+    main_mod.run_controller = run_controller
+
+    async def run_agent_until_done(controller, runtime, memory, end_states):
+        # Tests can install a custom inner callback to drive STEP / TOOL
+        # spans inside the AGENT span; default is a no-op.
+        cb = getattr(loop_mod, "_test_inner_callback", None)
+        if callable(cb):
+            await cb(controller, runtime)
+        return None
+
+    loop_mod.run_agent_until_done = run_agent_until_done
+
+
+# ---------------------------------------------------------------------------
+# Fixtures
+# ---------------------------------------------------------------------------
+
+
+@pytest.fixture
+def tracer_provider() -> TracerProvider:
+    """SDK provider exporting to memory; the exporter hangs off ``_exporter``."""
+    provider = TracerProvider()
+    provider._exporter = InMemorySpanExporter()  # type: ignore[attr-defined]
+    provider.add_span_processor(SimpleSpanProcessor(provider._exporter))
+    return provider
+
+
+@pytest.fixture
+def stub_openhands_v0_modules() -> None:
+    _install_v0_stub_modules()  # stub classes are rebuilt on every call
+
+
+@pytest.fixture(autouse=True)
+def _reset_global_tracer():
+    """Reset the global provider *and* its set-once latch after each test."""
+    yield
+    trace_api._TRACER_PROVIDER = None  # type: ignore[attr-defined]
+    trace_api._TRACER_PROVIDER_SET_ONCE._done = False  # type: ignore[attr-defined]
diff --git a/instrumentation-loongsuite/loongsuite-instrumentation-openhands/tests/test_v0_tool_attributes.py b/instrumentation-loongsuite/loongsuite-instrumentation-openhands/tests/test_v0_tool_attributes.py
new file mode 100644
index 000000000..91dcae22a
--- /dev/null
+++ b/instrumentation-loongsuite/loongsuite-instrumentation-openhands/tests/test_v0_tool_attributes.py
@@ -0,0 +1,329 @@
+"""ARMS GenAI semconv §Tool conformance tests for the V0 TOOL wrapper.
+
+I/O capture is always on (no env-var gating, no truncation), so the
+TOOL span must carry every attribute the spec calls out — both
+required and recommended — on every run.
+"""
+
+from __future__ import annotations
+
+import asyncio
+import json
+
+import pytest
+
+
+def _spans_by_kind(exporter, kind: str):
+    """Return the finished spans whose ``gen_ai.span.kind`` equals *kind*."""
+    finished = exporter.get_finished_spans()
+    kind_key = "gen_ai.span.kind"
+    # The comprehension keeps export order, so callers may index the result.
+    return [span for span in finished if span.attributes.get(kind_key) == kind]
+
+
+@pytest.fixture
+def instrumented(tracer_provider, stub_openhands_v0_modules):
+    """Yield ``(instrumentor, exporter)``; uninstrument and clear sessions after."""
+    from opentelemetry.instrumentation.openhands import OpenHandsInstrumentor
+    from opentelemetry.instrumentation.openhands.internal import session_context
+    session_context.clear_all()
+    inst = OpenHandsInstrumentor()
+    inst.instrument(tracer_provider=tracer_provider, skip_dep_check=True)
+    try:
+        yield inst, tracer_provider._exporter  # type: ignore[attr-defined]
+    finally:
+        try:
+            inst.uninstrument()
+        except Exception:
+            pass
+        session_context.clear_all()
+
+
+def _run_one_tool_call(rt_base, ctrl_mod, loop_mod, main_mod):
+    """Drive a single ENTRY → AGENT → STEP → TOOL flow through the stub
+    modules, using sid ``tool-sid`` and one ``execute_bash`` tool call."""
+    ctrl = ctrl_mod.AgentController(sid="tool-sid")
+    runtime = rt_base.Runtime(sid="tool-sid")
+    tcm = rt_base.ToolCallMetadata(
+        function_name="execute_bash",
+        tool_call_id="call_abc123",
+        arguments={"command": "ls /tmp", "thought": "list temp"},
+    )
+    action = rt_base.Action(
+        action_type="run",
+        command="ls /tmp",
+        tool_call_metadata=tcm,
+    )
+
+    class MessageAction:
+        content = "list /tmp"
+        source = "user"
+
+    async def _inner(_c, _r):
+        await ctrl._step()
+        runtime.run_action(action)
+
+    loop_mod._test_inner_callback = _inner
+    main_mod._test_inner_args = (ctrl, runtime)
+
+    async def _scenario():
+        await main_mod.run_controller(
+            config=None,
+            initial_user_action=MessageAction(),
+            sid="tool-sid",
+        )
+
+    try:
+        asyncio.run(_scenario())
+    finally:
+        loop_mod._test_inner_callback = None
+        main_mod._test_inner_args = None
+
+
+def test_tool_span_carries_all_arms_required_attributes(instrumented):
+    """One LLM-issued tool call must yield a fully attributed TOOL span."""
+    inst, exporter = instrumented
+    import openhands.controller.agent_controller as ctrl_mod
+    import openhands.core.loop as loop_mod
+    import openhands.core.main as main_mod
+    import openhands.runtime.base as rt_base
+
+    _run_one_tool_call(rt_base, ctrl_mod, loop_mod, main_mod)
+
+    tools = _spans_by_kind(exporter, "TOOL")
+    assert len(tools) == 1
+    tool = tools[0]
+    attrs = tool.attributes
+
+    # Required
+    assert attrs["gen_ai.span.kind"] == "TOOL"
+    assert attrs["gen_ai.operation.name"] == "execute_tool"
+
+    # Span name should be `execute_tool {tool_name}`
+    assert tool.name == "execute_tool execute_bash"
+
+    # Recommended attributes
+    assert attrs["gen_ai.tool.name"] == "execute_bash"
+    assert attrs["gen_ai.tool.type"] == "function"
+    assert attrs["gen_ai.tool.call.id"] == "call_abc123"
+    assert attrs.get("gen_ai.tool.description") == (
+        "Run a bash command on the runtime sandbox."
+    )
+
+    # Arguments should be the BARE JSON dict, not the wrapping
+    # {"tool": ..., "arguments": ...} envelope.
+    args_json = attrs.get("gen_ai.tool.call.arguments")
+    assert args_json is not None
+    args = json.loads(args_json)
+    assert args == {"command": "ls /tmp", "thought": "list temp"}
+
+    # Result should reflect the observation; legacy I/O keys must be absent.
+    result_json = attrs.get("gen_ai.tool.call.result")
+    assert result_json is not None
+    result = json.loads(result_json)
+    assert result.get("exit_code") == 0
+    assert "observation" in result
+    assert "input.value" not in attrs
+    assert "output.value" not in attrs
+
+
+def test_tool_span_falls_back_to_action_field_when_no_tool_call_metadata(
+    instrumented,
+):
+    """Derive ``gen_ai.tool.name`` from the action type when the action has
+    no ``tool_call_metadata`` (e.g. a user-initiated action); the call id is
+    then absent or empty and the arguments come from the action's fields."""
+    inst, exporter = instrumented
+
+    import openhands.controller.agent_controller as ctrl_mod
+    import openhands.core.loop as loop_mod
+    import openhands.core.main as main_mod
+    import openhands.runtime.base as rt_base
+
+    ctrl = ctrl_mod.AgentController(sid="tool-fallback-sid")
+    runtime = rt_base.Runtime(sid="tool-fallback-sid")
+    action = rt_base.Action(action_type="run", command="echo hi")
+
+    class MessageAction:
+        content = "say hi"
+        source = "user"
+
+    async def _inner(_c, _r):
+        await ctrl._step()
+        runtime.run_action(action)
+
+    loop_mod._test_inner_callback = _inner
+    main_mod._test_inner_args = (ctrl, runtime)
+
+    async def _scenario():
+        await main_mod.run_controller(
+            config=None,
+            initial_user_action=MessageAction(),
+            sid="tool-fallback-sid",
+        )
+
+    try:
+        asyncio.run(_scenario())
+    finally:
+        loop_mod._test_inner_callback = None
+        main_mod._test_inner_args = None
+
+    tool = _spans_by_kind(exporter, "TOOL")[0]
+    attrs = tool.attributes
+
+    # Action.action == "run" → tool name "bash"
+    assert attrs["gen_ai.tool.name"] == "bash"
+    assert tool.name == "execute_tool bash"
+    # No tool-call id when the action wasn't from an LLM call
+    assert attrs.get("gen_ai.tool.call.id", "") == ""
+    # Arguments still produced from the action's fields
+    args = json.loads(attrs["gen_ai.tool.call.arguments"])
+    assert args.get("command") == "echo hi"
+
+
+def test_tool_span_reads_arguments_from_tool_call_metadata(instrumented):
+    """A plain-dict ``tool_call_metadata.arguments`` becomes the TOOL arguments."""
+    inst, exporter = instrumented
+    import openhands.controller.agent_controller as ctrl_mod
+    import openhands.core.loop as loop_mod
+    import openhands.core.main as main_mod
+    import openhands.runtime.base as rt_base
+
+    ctrl = ctrl_mod.AgentController(sid="tool-direct-args-sid")
+    runtime = rt_base.Runtime(sid="tool-direct-args-sid")
+
+    class DirectToolCallMetadata:
+        function_name = "execute_bash"
+        tool_call_id = "call_direct_args"
+        arguments = {"command": "pwd", "timeout": 3}
+
+    action = rt_base.Action(
+        action_type="run",
+        command="pwd",
+        tool_call_metadata=DirectToolCallMetadata(),
+    )
+
+    class MessageAction:
+        content = "print cwd"
+        source = "user"
+
+    async def _inner(_c, _r):
+        await ctrl._step()
+        runtime.run_action(action)
+
+    loop_mod._test_inner_callback = _inner
+    main_mod._test_inner_args = (ctrl, runtime)
+
+    async def _scenario():
+        await main_mod.run_controller(
+            config=None,
+            initial_user_action=MessageAction(),
+            sid="tool-direct-args-sid",
+        )
+
+    try:
+        asyncio.run(_scenario())
+    finally:
+        loop_mod._test_inner_callback = None
+        main_mod._test_inner_args = None
+
+    tool = _spans_by_kind(exporter, "TOOL")[0]
+    attrs = tool.attributes
+    assert attrs["gen_ai.tool.call.id"] == "call_direct_args"
+    assert json.loads(attrs["gen_ai.tool.call.arguments"]) == {
+        "command": "pwd",
+        "timeout": 3,
+    }
+
+
+def test_tool_span_always_emits_arguments_attribute(instrumented):
+    """Even an empty command yields ``gen_ai.tool.call.arguments`` of ``{}``."""
+    inst, exporter = instrumented
+    import openhands.controller.agent_controller as ctrl_mod
+    import openhands.core.loop as loop_mod
+    import openhands.core.main as main_mod
+    import openhands.runtime.base as rt_base
+
+    ctrl = ctrl_mod.AgentController(sid="tool-empty-args-sid")
+    runtime = rt_base.Runtime(sid="tool-empty-args-sid")
+    action = rt_base.Action(action_type="run", command="")
+
+    class MessageAction:
+        content = "run empty command"
+        source = "user"
+
+    async def _inner(_c, _r):
+        await ctrl._step()
+        runtime.run_action(action)
+
+    loop_mod._test_inner_callback = _inner
+    main_mod._test_inner_args = (ctrl, runtime)
+
+    async def _scenario():
+        await main_mod.run_controller(
+            config=None,
+            initial_user_action=MessageAction(),
+            sid="tool-empty-args-sid",
+        )
+
+    try:
+        asyncio.run(_scenario())
+    finally:
+        loop_mod._test_inner_callback = None
+        main_mod._test_inner_args = None
+
+    attrs = _spans_by_kind(exporter, "TOOL")[0].attributes
+    assert attrs["gen_ai.tool.call.arguments"] == "{}"
+
+
+def test_agent_io_capture_omits_legacy_and_openinference_attrs(tracer_provider):
+    """AGENT I/O capture sets ``gen_ai.*`` messages and none of the legacy keys."""
+    from opentelemetry.instrumentation.openhands.internal.v0_wrappers import (
+        _capture_agent_io_attributes,
+    )
+
+    class SystemMessageAction:
+        content = "You are helpful."
+
+    class MessageAction:
+        content = "hello"
+        source = "user"
+
+    class AgentFinishAction:
+        final_thought = "done"
+
+    class State:
+        history = [SystemMessageAction(), MessageAction(), AgentFinishAction()]
+
+    tracer = tracer_provider.get_tracer(__name__)
+    with tracer.start_as_current_span("agent") as span:
+        _capture_agent_io_attributes(span, None, None, State())
+    attrs = tracer_provider._exporter.get_finished_spans()[0].attributes  # type: ignore[attr-defined]
+    assert attrs.get("gen_ai.system_instructions")
+    assert attrs.get("gen_ai.input.messages")
+    assert attrs.get("gen_ai.output.messages")
+    assert "gen_ai.system_instruction" not in attrs
+    assert "input.value" not in attrs
+    assert "output.value" not in attrs
+
+
+def test_agent_span_emits_tool_definitions(instrumented):
+    """AGENT span should advertise the agent's available tools per the
+    ARMS GenAI semconv §Agent → ``gen_ai.tool.definitions``."""
+    inst, exporter = instrumented
+
+    import openhands.controller.agent_controller as ctrl_mod
+    import openhands.core.loop as loop_mod
+    import openhands.core.main as main_mod
+    import openhands.runtime.base as rt_base
+
+    _run_one_tool_call(rt_base, ctrl_mod, loop_mod, main_mod)
+    # The stub agent (see conftest) exposes a single ``execute_bash`` tool.
+    agent = _spans_by_kind(exporter, "AGENT")[0]
+    defs_json = agent.attributes.get("gen_ai.tool.definitions")
+    assert defs_json, "AGENT span should set gen_ai.tool.definitions"
+    defs = json.loads(defs_json)
+    assert isinstance(defs, list) and defs
+    assert defs[0]["type"] == "function"
+    assert defs[0]["name"] == "execute_bash"
+    assert "description" in defs[0]
diff --git a/instrumentation-loongsuite/loongsuite-instrumentation-openhands/tests/test_v0_trace_continuity.py b/instrumentation-loongsuite/loongsuite-instrumentation-openhands/tests/test_v0_trace_continuity.py
new file mode 100644
index 000000000..2d2adbd75
--- /dev/null
+++ b/instrumentation-loongsuite/loongsuite-instrumentation-openhands/tests/test_v0_trace_continuity.py
@@ -0,0 +1,252 @@
+"""Cross-thread / cross-loop trace continuity tests for V0 wrappers.
+
+These tests model the *real* OpenHands V0 runtime behaviour: events are
+delivered by ``EventStream`` via a ``ThreadPoolExecutor`` and the controller
+processes them with ``asyncio.get_event_loop().run_until_complete(...)`` —
+which spins a brand-new asyncio loop in the worker thread. Without our
+session-context bridge, STEP / TOOL spans would start fresh root traces.
+
+We assert:
+
+* All ENTRY / AGENT / STEP / TOOL spans share the **same** ``trace_id``.
+* Parent-child wiring is correct (STEP is parented under AGENT, TOOL too).
+* The session-context store is cleaned up after the entry returns.
+* GenAI semantic-convention I/O attributes are populated when content
+  capture is enabled.
+"""
+
+from __future__ import annotations
+
+import asyncio
+import json
+import os
+import threading
+from concurrent.futures import ThreadPoolExecutor
+
+import pytest
+
+
+def _spans_by_kind_attr(exporter, kind: str):
+    """Return the finished spans whose ``gen_ai.span.kind`` attribute equals *kind*."""
+    matching = []
+    for span in exporter.get_finished_spans():
+        if span.attributes.get("gen_ai.span.kind") == kind:
+            matching.append(span)
+    return matching
+
+
+@pytest.fixture
+def instrumented_v0(tracer_provider, stub_openhands_v0_modules):
+    """Yield ``(instrumentor, exporter)`` with the OpenHands instrumentation active.
+
+    The session-context store is cleared before instrumenting and again on
+    teardown; exceptions raised by ``uninstrument()`` are ignored.
+    """
+    from opentelemetry.instrumentation.openhands import OpenHandsInstrumentor
+    from opentelemetry.instrumentation.openhands.internal import session_context
+
+    session_context.clear_all()
+    instrumentor = OpenHandsInstrumentor()
+    instrumentor.instrument(
+        tracer_provider=tracer_provider,
+        skip_dep_check=True,
+    )
+    try:
+        yield instrumentor, tracer_provider._exporter  # type: ignore[attr-defined]
+    finally:
+        try:
+            instrumentor.uninstrument()
+        except Exception:
+            # Best-effort teardown: cleanup problems must not mask the test result.
+            pass
+        session_context.clear_all()
+
+
+def _drive_step_in_worker_thread(controller, runtime, action) -> None:
+    """Reproduce the V0 EventStream → ThreadPoolExecutor → run_until_complete path.
+
+    One STEP followed by one TOOL call is executed on a pool worker thread:
+    ``controller._step()`` is awaited on an event loop created just for that
+    call, then the synchronous ``runtime.run_action(action)`` is invoked.
+
+    The worker thread (a) shares no asyncio loop with the caller and (b) is
+    expected to run without the caller's ``contextvars`` values: on default
+    (GIL-enabled) CPython builds ``ThreadPoolExecutor`` does not propagate
+    the submitting thread's context to its workers.
+
+    Raises:
+        BaseException: whatever the worker raised, re-raised in the caller.
+        concurrent.futures.TimeoutError: if the worker needs more than 5 s.
+    """
+
+    def _worker() -> None:
+        # New event loop per worker — exactly what V0 does.
+        loop = asyncio.new_event_loop()
+        asyncio.set_event_loop(loop)
+        try:
+            loop.run_until_complete(controller._step())
+            # run_action is sync — call it directly inside the worker.
+            runtime.run_action(action)
+        finally:
+            # Do not leave a closed loop installed as the thread's current loop.
+            asyncio.set_event_loop(None)
+            loop.close()
+
+    pool = ThreadPoolExecutor(max_workers=1)
+    try:
+        # Future.result() re-raises anything the worker raised, so no side
+        # channel (error list / Event) is needed to surface failures.
+        pool.submit(_worker).result(timeout=5)
+    finally:
+        # Always release the pool, but never block on a worker that timed out.
+        pool.shutdown(wait=False)
+
+
+def test_all_spans_share_one_trace_id_across_threads(instrumented_v0):
+    """ENTRY / AGENT / STEP / TOOL must land in one trace even though every
+    STEP / TOOL pair runs on a fresh worker thread with its own asyncio loop."""
+    _inst, exporter = instrumented_v0
+
+    import openhands.controller.agent_controller as ctrl_mod
+    import openhands.core.loop as loop_mod
+    import openhands.core.main as main_mod
+    import openhands.runtime.base as rt_base
+
+    session_id = "bench-001"
+    ctrl = ctrl_mod.AgentController(sid=session_id)
+    runtime = rt_base.Runtime(sid=session_id)
+    action = rt_base.Action(action_type="run", command="ls /")
+
+    async def _inner(_controller, _runtime):
+        # Two rounds, each driven on its own worker thread.
+        rounds_left = 2
+        while rounds_left:
+            _drive_step_in_worker_thread(ctrl, runtime, action)
+            rounds_left -= 1
+
+    loop_mod._test_inner_callback = _inner
+    main_mod._test_inner_args = (ctrl, runtime)
+
+    class MessageAction:
+        content = "say hi"
+        source = "user"
+
+    async def _run_entry():
+        await main_mod.run_controller(
+            config=None,
+            initial_user_action=MessageAction(),
+            sid=session_id,
+        )
+
+    try:
+        asyncio.run(_run_entry())
+    finally:
+        # Always reset the stub hooks so later tests start clean.
+        loop_mod._test_inner_callback = None
+        main_mod._test_inner_args = None
+
+    all_spans = exporter.get_finished_spans()
+    spans_by_kind = {
+        kind: _spans_by_kind_attr(exporter, kind)
+        for kind in ("ENTRY", "AGENT", "STEP", "TOOL")
+    }
+
+    expected_counts = (("ENTRY", 1), ("AGENT", 1), ("STEP", 2), ("TOOL", 2))
+    for kind, expected in expected_counts:
+        assert len(spans_by_kind[kind]) == expected
+
+    entry_span = spans_by_kind["ENTRY"][0]
+    agent_span = spans_by_kind["AGENT"][0]
+    expected_trace_id = entry_span.context.trace_id
+
+    # Every exported span belongs to the ENTRY span's trace.
+    for span in all_spans:
+        assert span.context.trace_id == expected_trace_id, (
+            f"span {span.name!r} (kind={span.attributes.get('gen_ai.span.kind')}) "
+            f"has trace_id {span.context.trace_id} but expected {expected_trace_id}"
+        )
+
+    # Parent-child links: AGENT under ENTRY, STEP under AGENT, TOOL under AGENT.
+    assert agent_span.parent is not None
+    assert agent_span.parent.span_id == entry_span.context.span_id
+    agent_span_id = agent_span.context.span_id
+    for child in spans_by_kind["STEP"] + spans_by_kind["TOOL"]:
+        assert child.parent is not None
+        assert child.parent.span_id == agent_span_id
+
+
+def test_session_context_cleared_after_entry(instrumented_v0):
+    """Once ``run_controller`` returns, nothing may remain stored for its sid."""
+    _inst, _exporter = instrumented_v0
+
+    import openhands.core.loop as loop_mod
+    import openhands.core.main as main_mod
+    from opentelemetry.instrumentation.openhands.internal import session_context
+
+    sid = "ephemeral-sid"
+
+    async def _run_entry():
+        user_message = type("Msg", (), {"content": "x", "source": "user"})()
+        await main_mod.run_controller(
+            config=None,
+            initial_user_action=user_message,
+            sid=sid,
+        )
+
+    asyncio.run(_run_entry())
+
+    leftover = session_context.get_context(sid)
+    assert leftover is None
+
+
+def test_io_attributes_on_entry_agent_step(instrumented_v0):
+    """ENTRY / AGENT / STEP / TOOL spans carry the expected I/O attributes."""
+    _inst, exporter = instrumented_v0
+
+    import openhands.controller.agent_controller as ctrl_mod
+    import openhands.core.loop as loop_mod
+    import openhands.core.main as main_mod
+    import openhands.runtime.base as rt_base
+
+    sid = "io-sid"
+    ctrl = ctrl_mod.AgentController(sid=sid)
+    runtime = rt_base.Runtime(sid=sid)
+    action = rt_base.Action(action_type="run", command="cat /etc/hosts")
+
+    # The class must be *named* MessageAction: that is the type name the
+    # AGENT wrapper looks for when computing input.messages.
+    class MessageAction:
+        content = "do the thing"
+        source = "user"
+
+    ctrl.state.history = [MessageAction()]
+
+    async def _inner(_c, _r):
+        await ctrl._step()
+        runtime.run_action(action)
+
+    loop_mod._test_inner_callback = _inner
+    main_mod._test_inner_args = (ctrl, runtime)
+
+    async def _run_entry():
+        await main_mod.run_controller(
+            config=None,
+            initial_user_action=MessageAction(),
+            sid=sid,
+        )
+
+    try:
+        asyncio.run(_run_entry())
+    finally:
+        # Always reset the stub hooks so later tests start clean.
+        loop_mod._test_inner_callback = None
+        main_mod._test_inner_args = None
+
+    def _first_attrs(kind):
+        return _spans_by_kind_attr(exporter, kind)[0].attributes
+
+    entry_attrs = _first_attrs("ENTRY")
+    agent_attrs = _first_attrs("AGENT")
+    step_attrs = _first_attrs("STEP")
+    tool_attrs = _first_attrs("TOOL")
+
+    # ENTRY
+    assert entry_attrs.get("gen_ai.framework") == "openhands"
+    assert entry_attrs.get("gen_ai.system") == "openhands"
+    assert entry_attrs.get("gen_ai.session.id") == sid
+    entry_input = entry_attrs.get("input.value")
+    assert entry_input
+    assert "do the thing" in entry_input
+
+    # AGENT
+    agent_input = agent_attrs.get("gen_ai.input.messages")
+    assert agent_input
+    assert "do the thing" in agent_input
+    for absent_key in ("gen_ai.system_instruction", "input.value", "output.value"):
+        assert absent_key not in agent_attrs
+    assert agent_attrs.get("gen_ai.session.id") == sid
+
+    # STEP
+    assert step_attrs.get("input.value")
+    step_output = step_attrs.get("output.value")
+    assert step_output
+    assert step_attrs.get("gen_ai.output.messages")
+    assert step_attrs.get("openhands.action.type") == "run"
+    assert "tool_calls" in step_output and "echo step" in step_output
+
+    # TOOL spans: arguments only via gen_ai.tool.call.arguments; no input/output.value.
+    assert tool_attrs.get("gen_ai.tool.name") == "bash"
+    for absent_key in ("input.value", "output.value"):
+        assert absent_key not in tool_attrs
+    call_arguments = json.loads(tool_attrs["gen_ai.tool.call.arguments"])
+    assert call_arguments.get("command") == "cat /etc/hosts"
+    tool_result = tool_attrs.get("gen_ai.tool.call.result")
+    assert tool_result
+    assert "exit_code" in tool_result
diff --git a/instrumentation-loongsuite/loongsuite-instrumentation-openhands/tests/test_v0_wrappers.py b/instrumentation-loongsuite/loongsuite-instrumentation-openhands/tests/test_v0_wrappers.py
new file mode 100644
index 000000000..cce832f66
--- /dev/null
+++ b/instrumentation-loongsuite/loongsuite-instrumentation-openhands/tests/test_v0_wrappers.py
@@ -0,0 +1,187 @@
+"""Tests for V0 (Legacy CodeAct) wrappers.
+
+We exercise the four V0 patches (``run_controller``, ``run_agent_until_done``,
+``AgentController._step``, ``Runtime.run_action``) and assert that:
+
+* The ``ENTRY → AGENT → STEP → TOOL`` span tree is produced.
+* Parent-child linkage is correct.
+* Per-action ``gen_ai.tool.name`` is mapped from the V0 ``action`` field.
+"""
+
+from __future__ import annotations
+
+import asyncio
+
+import pytest
+
+
+def _spans_by_kind_attr(exporter, kind: str):
+    """Collect the finished spans tagged with ``gen_ai.span.kind == kind``."""
+
+    def _has_kind(span) -> bool:
+        return span.attributes.get("gen_ai.span.kind") == kind
+
+    return list(filter(_has_kind, exporter.get_finished_spans()))
+
+
+@pytest.fixture
+def instrumented_v0(tracer_provider, stub_openhands_v0_modules):
+    """Yield ``(instrumentor, exporter)`` while the OpenHands instrumentation is on.
+
+    On teardown the instrumentor is uninstrumented; any exception raised by
+    that call is ignored.
+    """
+    from opentelemetry.instrumentation.openhands import OpenHandsInstrumentor
+
+    instrumentor = OpenHandsInstrumentor()
+    instrumentor.instrument(
+        tracer_provider=tracer_provider,
+        skip_dep_check=True,
+    )
+    try:
+        yield instrumentor, tracer_provider._exporter  # type: ignore[attr-defined]
+    finally:
+        try:
+            instrumentor.uninstrument()
+        except Exception:
+            # Best-effort teardown: cleanup problems must not mask the test result.
+            pass
+
+
+def test_v0_full_span_tree(instrumented_v0):
+    """Drive ENTRY, then AGENT with two STEP/TOOL rounds, and check the spans."""
+    _inst, exporter = instrumented_v0
+
+    import openhands.controller.agent_controller as ctrl_mod
+    import openhands.core.loop as loop_mod
+    import openhands.core.main as main_mod
+    import openhands.runtime.base as rt_base
+
+    ctrl = ctrl_mod.AgentController()
+    runtime = rt_base.Runtime()
+    action = rt_base.Action(action_type="run", command="ls /")
+
+    async def _inner(controller, _runtime):
+        rounds_left = 2
+        while rounds_left:
+            await ctrl._step()
+            runtime.run_action(action)
+            rounds_left -= 1
+
+    loop_mod._test_inner_callback = _inner
+
+    async def _run_entry_then_agent():
+        # ENTRY span via run_controller wrapper
+        greeting = type("Msg", (), {"content": "hello"})()
+        await main_mod.run_controller(
+            config=None,
+            initial_user_action=greeting,
+            sid="sid-test",
+        )
+        # AGENT span via run_agent_until_done wrapper (which calls _inner)
+        await loop_mod.run_agent_until_done(ctrl, runtime, None, [])
+
+    try:
+        asyncio.run(_run_entry_then_agent())
+    finally:
+        loop_mod._test_inner_callback = None
+
+    spans_by_kind = {
+        kind: _spans_by_kind_attr(exporter, kind)
+        for kind in ("ENTRY", "AGENT", "STEP", "TOOL")
+    }
+    for kind, expected in (("ENTRY", 1), ("AGENT", 1), ("STEP", 2), ("TOOL", 2)):
+        found = len(spans_by_kind[kind])
+        assert found == expected, f"unexpected {kind} count: {found}"
+
+    entry_span = spans_by_kind["ENTRY"][0]
+    agent_span = spans_by_kind["AGENT"][0]
+
+    assert entry_span.name == "enter openhands"
+    entry_attrs = entry_span.attributes
+    assert entry_attrs.get("gen_ai.framework") == "openhands"
+    assert entry_attrs.get("gen_ai.session.id") == "sid-test"
+
+    assert agent_span.name.startswith("invoke_agent ")
+    agent_attrs = agent_span.attributes
+    assert agent_attrs.get("gen_ai.agent.name") == "CodeActAgent"
+    assert agent_attrs.get("gen_ai.request.model") == "qwen3-coder-plus"
+    for absent_key in ("gen_ai.system_instruction", "input.value", "output.value"):
+        assert absent_key not in agent_attrs
+
+    # All STEP spans share the AGENT as parent.
+    agent_span_id = agent_span.context.span_id
+    for step_span in spans_by_kind["STEP"]:
+        assert step_span.parent is not None
+        assert step_span.parent.span_id == agent_span_id
+        assert step_span.attributes.get("gen_ai.operation.name") == "react"
+        assert step_span.attributes.get("gen_ai.react.round") in (1, 2)
+
+    # TOOL spans are siblings of STEP under AGENT (run_action is called after
+    # _step returns and is no longer in STEP context).
+    for tool_span in spans_by_kind["TOOL"]:
+        tool_attrs = tool_span.attributes
+        assert tool_attrs.get("gen_ai.tool.name") == "bash"
+        assert tool_attrs.get("openhands.action.type") == "run"
+        assert tool_attrs.get("openhands.action.exit_code") == 0
+
+
+def test_v0_step_round_increments_per_controller(instrumented_v0):
+    """``gen_ai.react.round`` is counted separately for each controller."""
+    _inst, exporter = instrumented_v0
+    import openhands.controller.agent_controller as ctrl_mod
+
+    ctrl_a = ctrl_mod.AgentController(sid="A")
+    ctrl_b = ctrl_mod.AgentController(sid="B")
+
+    async def _step_a_twice_then_b():
+        for controller in (ctrl_a, ctrl_a, ctrl_b):
+            await controller._step()
+
+    asyncio.run(_step_a_twice_then_b())
+
+    step_spans = _spans_by_kind_attr(exporter, "STEP")
+    assert len(step_spans) == 3
+
+    def _rounds_for(session_id):
+        return sorted(
+            span.attributes.get("gen_ai.react.round")
+            for span in step_spans
+            if span.attributes.get("gen_ai.session.id") == session_id
+        )
+
+    rounds_a = _rounds_for("A")
+    rounds_b = _rounds_for("B")
+    assert rounds_a == [1, 2]
+    assert rounds_b == [1]
+
+
+def test_v0_runtime_error_observation_marks_span(instrumented_v0):
+    """An observation with exit code 2 yields a TOOL span with ERROR status."""
+    _inst, exporter = instrumented_v0
+    import openhands.runtime.base as rt_base
+
+    class _ErrAction:
+        action = "run"
+        command = "false"
+
+    runtime = rt_base.Runtime()
+    # Use the conftest hook to make the next run_action return an error obs.
+    runtime._next_observation = rt_base.Observation(exit_code=2)
+
+    runtime.run_action(_ErrAction())
+
+    tool_spans = _spans_by_kind_attr(exporter, "TOOL")
+    assert len(tool_spans) == 1
+
+    (tool_span,) = tool_spans
+    assert tool_span.attributes.get("openhands.action.exit_code") == 2
+    assert tool_span.status.status_code.name == "ERROR"
+
+
+def test_v0_run_controller_cancelled_is_not_span_error(instrumented_v0):
+    """``asyncio.CancelledError`` (e.g. wait_for) must not mark ENTRY as ERROR."""
+    _, exporter = instrumented_v0
+    import openhands.core.main as main_mod
+
+    # Ask the stub to raise CancelledError from run_controller.
+    main_mod._test_raise_cancelled = True
+    try:
+        with pytest.raises(asyncio.CancelledError):
+            user_message = type("Msg", (), {"content": "hello"})()
+            cancelled_run = main_mod.run_controller(
+                config=None,
+                initial_user_action=user_message,
+                sid="sid-cancel",
+            )
+            asyncio.run(cancelled_run)
+    finally:
+        main_mod._test_raise_cancelled = False
+
+    entry_spans = _spans_by_kind_attr(exporter, "ENTRY")
+    assert len(entry_spans) == 1
+    assert entry_spans[0].status.status_code.name == "UNSET"
+
diff --git a/instrumentation-loongsuite/loongsuite-instrumentation-slop-code/README.md b/instrumentation-loongsuite/loongsuite-instrumentation-slop-code/README.md
new file mode 100644
index 000000000..4d4f4d7b1
--- /dev/null
+++ b/instrumentation-loongsuite/loongsuite-instrumentation-slop-code/README.md
@@ -0,0 +1,32 @@
+# LoongSuite slop-code-bench Instrumentation
+
+OpenTelemetry instrumentation for the [slop-code-bench](https://github.com/SprocketLab/slop-code-bench) benchmark orchestrator.
+
+## Span Tree
+
+```
+ENTRY  "slop-code.enter"
+└── CHAIN  "workflow.{problem_name}"
+    ├── TASK  "task.{checkpoint_name}"
+    │   └── AGENT  "agent.{agent_type}"
+    │       ├── STEP  "react.step.{N}"          [MiniSWE only]
+    │       └── ...
+    ├── TASK  "task.{checkpoint_name}"
+    │   └── AGENT  "agent.{agent_type}"
+    └── ...
+LLM  "chat {model_name}"                       [Rubric Judge]
+```
+
+## Installation
+
+```bash
+pip install loongsuite-instrumentation-slop-code
+```
+
+## Usage
+
+```python
+from opentelemetry.instrumentation.slop_code import SlopCodeInstrumentor
+
+SlopCodeInstrumentor().instrument()
+```
diff --git a/instrumentation-loongsuite/loongsuite-instrumentation-slop-code/pyproject.toml b/instrumentation-loongsuite/loongsuite-instrumentation-slop-code/pyproject.toml
new file mode 100644
index 000000000..b443381c2
--- /dev/null
+++ b/instrumentation-loongsuite/loongsuite-instrumentation-slop-code/pyproject.toml
@@ -0,0 +1,61 @@
+[build-system]
+requires = ["hatchling"]
+build-backend = "hatchling.build"
+
+[project]
+name = "loongsuite-instrumentation-slop-code"
+dynamic = ["version"]
+description = "LoongSuite slop-code-bench instrumentation"
+readme = "README.md"
+license = "Apache-2.0"
+requires-python = ">=3.10,<4"
+authors = [
+  { name = "Zhiyong Liu", email = "liuzhiyong.lzy@alibaba-inc.com" },
+  { name = "OpenTelemetry Authors", email = "cncf-opentelemetry-contributors@lists.cncf.io" },
+]
+classifiers = [
+  "Development Status :: 4 - Beta",
+  "Intended Audience :: Developers",
+  "License :: OSI Approved :: Apache Software License",
+  "Programming Language :: Python",
+  "Programming Language :: Python :: 3.10",
+  "Programming Language :: Python :: 3.11",
+  "Programming Language :: Python :: 3.12",
+]
+dependencies = [
+  "opentelemetry-api >= 1.37.0",
+  "opentelemetry-instrumentation >= 0.58b0",
+  "opentelemetry-semantic-conventions >= 0.58b0",
+  "wrapt >= 1.14.0, < 2.0.0",
+  "opentelemetry-util-genai >= 0.3b0.dev0",
+]
+
+[project.optional-dependencies]
+instruments = [
+  "slop-code-bench >= 0.1",
+]
+test = [
+  "pytest",
+  "pytest-asyncio",
+  "pytest-forked",
+  "opentelemetry-sdk",
+]
+
+[project.entry-points.opentelemetry_instrumentor]
+slop_code = "opentelemetry.instrumentation.slop_code:SlopCodeInstrumentor"
+
+[project.urls]
+Homepage = "https://github.com/alibaba/loongsuite-python-agent/tree/main/instrumentation-loongsuite/loongsuite-instrumentation-slop-code"
+Repository = "https://github.com/alibaba/loongsuite-python-agent"
+
+[tool.hatch.version]
+path = "src/opentelemetry/instrumentation/slop_code/version.py"
+
+[tool.hatch.build.targets.sdist]
+include = [
+  "/src",
+  "/tests",
+]
+
+[tool.hatch.build.targets.wheel]
+packages = ["src/opentelemetry"]
diff --git a/instrumentation-loongsuite/loongsuite-instrumentation-slop-code/src/opentelemetry/instrumentation/slop_code/__init__.py b/instrumentation-loongsuite/loongsuite-instrumentation-slop-code/src/opentelemetry/instrumentation/slop_code/__init__.py
new file mode 100644
index 000000000..983e60ab8
--- /dev/null
+++ b/instrumentation-loongsuite/loongsuite-instrumentation-slop-code/src/opentelemetry/instrumentation/slop_code/__init__.py
@@ -0,0 +1,246 @@
+# Copyright The OpenTelemetry Authors
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""
+OpenTelemetry slop-code-bench Instrumentation
+
+Instruments the slop-code benchmark orchestrator lifecycle:
+- ENTRY: run_agent (CLI entrypoint) and AgentRunner.run (inside the worker)
+- CHAIN/workflow: run_agent_on_problem (per-problem)
+- TASK: AgentRunner._run_checkpoint (per-checkpoint)
+- AGENT: Agent.run_checkpoint (concrete agent invocation)
+- STEP: MiniSWEAgent.agent_step / MiniSWEAgent.get_observation (ReAct iteration)
+- TOOL: MiniSWEAgent.execute_action
+- LLM: grade_file_async (Rubric Judge)
+"""
+
+import logging
+from typing import Any, Collection
+
+from wrapt import wrap_function_wrapper
+
+from opentelemetry import trace as trace_api
+from opentelemetry.instrumentation.instrumentor import BaseInstrumentor
+from opentelemetry.instrumentation.slop_code.package import _instruments
+from opentelemetry.instrumentation.slop_code.version import __version__
+from opentelemetry.instrumentation.slop_code.wrappers.agent import (
+    _AgentRunCheckpointWrapper,
+)
+from opentelemetry.instrumentation.slop_code.wrappers.entry import (
+    _EntryWrapper,
+    _RunnerEntryWrapper,
+)
+from opentelemetry.instrumentation.slop_code.wrappers.llm import (
+    _RubricGradeWrapper,
+)
+from opentelemetry.instrumentation.slop_code.wrappers.step import (
+    _MiniSWEObservationWrapper,
+    _MiniSWEStepWrapper,
+)
+from opentelemetry.instrumentation.slop_code.wrappers.task import (
+    _TaskRunCheckpointWrapper,
+)
+from opentelemetry.instrumentation.slop_code.wrappers.tool import (
+    _ToolExecuteActionWrapper,
+)
+from opentelemetry.instrumentation.slop_code.wrappers.workflow import (
+    _WorkflowWrapper,
+)
+from opentelemetry.instrumentation.utils import unwrap
+
+logger = logging.getLogger(__name__)
+
+__all__ = ["SlopCodeInstrumentor", "__version__"]
+
+_MODULE_ENTRY = "slop_code.entrypoints.commands.run_agent"
+_MODULE_WORKER = "slop_code.entrypoints.problem_runner.worker"
+# slop_code.entrypoints.problem_runner.driver re-imports
+# `run_agent_on_problem` via `from .worker import run_agent_on_problem`
+# at package-load time, capturing the original function reference. Because
+# our wrap happens after that bind, we must additionally replace the local
+# binding inside `driver` itself, otherwise the worker subprocess still
+# calls the un-wrapped original and the CHAIN span never fires.
+_MODULE_DRIVER = "slop_code.entrypoints.problem_runner.driver"
+_MODULE_RUNNER = "slop_code.agent_runner.runner"
+_MODULE_AGENT = "slop_code.agent_runner.agent"
+_MODULE_MINISWE = "slop_code.agent_runner.agents._miniswe_agent"
+_MODULE_RUBRIC = "slop_code.metrics.rubric.router"
+
+
+class SlopCodeInstrumentor(BaseInstrumentor):
+    """OpenTelemetry instrumentor for slop-code-bench framework.
+
+    Every patched callable is listed once in ``_TARGETS``. ``_instrument``
+    and ``_uninstrument`` both iterate that table, so each hook installed on
+    instrument is also removed on uninstrument.
+    """
+
+    # One row per patched callable:
+    #   (module path, dotted attribute, label used in log messages,
+    #    log level used when (un)wrapping fails, wrapper class)
+    # The orchestrator hooks log failures at WARNING; the MiniSWE and rubric
+    # hooks log them at DEBUG.
+    _TARGETS = (
+        # ENTRY span: run_agent
+        (_MODULE_ENTRY, "run_agent", "run_agent", logging.WARNING, _EntryWrapper),
+        # CHAIN span: run_agent_on_problem
+        (
+            _MODULE_WORKER,
+            "run_agent_on_problem",
+            "run_agent_on_problem",
+            logging.WARNING,
+            _WorkflowWrapper,
+        ),
+        # driver.py imports run_agent_on_problem at module-load time via
+        # `from .worker import ...`, so the local name escapes the
+        # worker-module patch. The worker subprocess inherits this stale
+        # reference via fork(), and CHAIN spans never fire unless the local
+        # re-bind is patched too.
+        (
+            _MODULE_DRIVER,
+            "run_agent_on_problem",
+            "driver.run_agent_on_problem",
+            logging.WARNING,
+            _WorkflowWrapper,
+        ),
+        # ENTRY span inside worker: AgentRunner.run
+        (
+            _MODULE_RUNNER,
+            "AgentRunner.run",
+            "AgentRunner.run",
+            logging.WARNING,
+            _RunnerEntryWrapper,
+        ),
+        # TASK span: AgentRunner._run_checkpoint
+        (
+            _MODULE_RUNNER,
+            "AgentRunner._run_checkpoint",
+            "AgentRunner._run_checkpoint",
+            logging.WARNING,
+            _TaskRunCheckpointWrapper,
+        ),
+        # AGENT span: Agent.run_checkpoint
+        (
+            _MODULE_AGENT,
+            "Agent.run_checkpoint",
+            "Agent.run_checkpoint",
+            logging.WARNING,
+            _AgentRunCheckpointWrapper,
+        ),
+        # STEP span: MiniSWEAgent.agent_step
+        (
+            _MODULE_MINISWE,
+            "MiniSWEAgent.agent_step",
+            "MiniSWEAgent.agent_step",
+            logging.DEBUG,
+            _MiniSWEStepWrapper,
+        ),
+        # STEP end: MiniSWEAgent.get_observation
+        (
+            _MODULE_MINISWE,
+            "MiniSWEAgent.get_observation",
+            "MiniSWEAgent.get_observation",
+            logging.DEBUG,
+            _MiniSWEObservationWrapper,
+        ),
+        # TOOL span: MiniSWEAgent.execute_action
+        (
+            _MODULE_MINISWE,
+            "MiniSWEAgent.execute_action",
+            "MiniSWEAgent.execute_action",
+            logging.DEBUG,
+            _ToolExecuteActionWrapper,
+        ),
+        # LLM span: grade_file_async
+        (
+            _MODULE_RUBRIC,
+            "grade_file_async",
+            "grade_file_async",
+            logging.DEBUG,
+            _RubricGradeWrapper,
+        ),
+    )
+
+    def instrumentation_dependencies(self) -> Collection[str]:
+        """Return the package requirement specifiers this instrumentor targets."""
+        return _instruments
+
+    def _instrument(self, **kwargs: Any) -> None:
+        """Wrap every callable listed in ``_TARGETS``.
+
+        Args:
+            **kwargs: ``tracer_provider`` is read when present and passed to
+                ``get_tracer``.
+
+        A target that cannot be wrapped is logged at the level given in its
+        table row and skipped; the remaining targets are still processed.
+        """
+        tracer = trace_api.get_tracer(
+            __name__,
+            __version__,
+            tracer_provider=kwargs.get("tracer_provider"),
+        )
+        # The worker and driver bindings of run_agent_on_problem are wrapped
+        # with one and the same wrapper instance.
+        shared_wrappers = {_WorkflowWrapper: _WorkflowWrapper(tracer)}
+
+        for module, name, label, level, wrapper_cls in self._TARGETS:
+            try:
+                wrapper = shared_wrappers.get(wrapper_cls)
+                if wrapper is None:
+                    wrapper = wrapper_cls(tracer)
+                wrap_function_wrapper(module=module, name=name, wrapper=wrapper)
+            except Exception as exc:
+                logger.log(level, "Could not wrap %s: %s", label, exc)
+
+    def _uninstrument(self, **kwargs: Any) -> None:
+        """Remove the wrapper from every callable listed in ``_TARGETS``.
+
+        The module paths are the same constants used by ``_instrument``, so
+        the MiniSWE hooks are looked up in the module they were installed
+        on. Failures are logged at DEBUG and do not stop the remaining
+        targets from being unwrapped.
+        """
+        import importlib
+
+        for module, name, label, _level, _wrapper_cls in self._TARGETS:
+            try:
+                owner = importlib.import_module(module)
+                # "Class.method" → resolve the class, then unwrap the method.
+                *parents, attr = name.split(".")
+                for part in parents:
+                    owner = getattr(owner, part)
+                unwrap(owner, attr)
+            except Exception as exc:
+                logger.debug("Could not unwrap %s: %s", label, exc)
diff --git a/instrumentation-loongsuite/loongsuite-instrumentation-slop-code/src/opentelemetry/instrumentation/slop_code/package.py b/instrumentation-loongsuite/loongsuite-instrumentation-slop-code/src/opentelemetry/instrumentation/slop_code/package.py
new file mode 100644
index 000000000..13b6fe785
--- /dev/null
+++ b/instrumentation-loongsuite/loongsuite-instrumentation-slop-code/src/opentelemetry/instrumentation/slop_code/package.py
@@ -0,0 +1,17 @@
+# Copyright The OpenTelemetry Authors
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+_instruments = ("slop-code-bench >= 0.1",)
+
+_supports_metrics = True
diff --git a/instrumentation-loongsuite/loongsuite-instrumentation-slop-code/src/opentelemetry/instrumentation/slop_code/utils.py b/instrumentation-loongsuite/loongsuite-instrumentation-slop-code/src/opentelemetry/instrumentation/slop_code/utils.py
new file mode 100644
index 000000000..34cd7a856
--- /dev/null
+++ b/instrumentation-loongsuite/loongsuite-instrumentation-slop-code/src/opentelemetry/instrumentation/slop_code/utils.py
@@ -0,0 +1,72 @@
+# Copyright The OpenTelemetry Authors
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Utility functions for slop-code instrumentation."""
+
+from typing import Any, Optional
+
+from opentelemetry.trace import Span
+
+SYSTEM_NAME = "slop-code"
+MAX_ATTR_LEN = 1024
+
+
+def safe_get(obj: Any, attr: str, default: Any = None) -> Any:
+    """Read ``obj.<attr>`` without raising.
+
+    Returns *default* when the attribute is missing or when looking it up
+    raises any ``Exception``.
+    """
+    try:
+        value = getattr(obj, attr, default)
+    except Exception:
+        value = default
+    return value
+
+
+def safe_get_nested(obj: Any, *attrs: str, default: Any = None) -> Any:
+    """Safely traverse nested attributes.
+
+    Follows ``obj.<attrs[0]>.<attrs[1]>...`` one attribute at a time.
+
+    Args:
+        obj: Object the traversal starts from.
+        *attrs: Attribute names, outermost first.
+        default: Value returned when the chain cannot be followed.
+
+    Returns:
+        The final attribute value, or *default* when any step is missing,
+        evaluates to ``None`` or raises. With no *attrs*, *obj* itself.
+    """
+    current = obj
+    for attr in attrs:
+        try:
+            current = getattr(current, attr)
+        except Exception:
+            # Same contract as safe_get: a raising property (not only a
+            # missing attribute) must not escape a "safe" accessor.
+            return default
+        if current is None:
+            return default
+    return current
+
+
+def set_optional_attr(span: Span, key: str, value: Optional[Any]) -> None:
+    """Record *value* under *key* on *span*; a ``None`` value is skipped.
+
+    String values longer than ``MAX_ATTR_LEN`` characters are cut to that
+    length before being stored.
+    """
+    if value is None:
+        return
+    too_long = isinstance(value, str) and len(value) > MAX_ATTR_LEN
+    span.set_attribute(key, value[:MAX_ATTR_LEN] if too_long else value)
+
+
+def truncate_text(
+    value: Optional[str], limit: int = MAX_ATTR_LEN
+) -> Optional[str]:
+    """Return a bounded string suitable for span attributes.
+
+    Args:
+        value: Text to bound; ``None`` is accepted and passed through.
+        limit: Maximum number of characters kept.
+
+    Returns:
+        ``None`` when *value* is ``None``; otherwise *value* unchanged when
+        it has at most *limit* characters, else its first *limit* characters.
+    """
+    if value is None:
+        return None
+    return value if len(value) <= limit else value[:limit]
+
+def json_dumps_attr(value: Any) -> str:
+    """Serialize a value as JSON for ARMS GenAI string attributes.
+
+    Non-ASCII characters are kept as-is and objects JSON cannot encode are
+    converted with ``str``. The serialized text is then passed through
+    ``truncate_text``, so an over-long result is cut off and may no longer
+    be parseable JSON.
+    """
+    import json
+
+    serialized = json.dumps(value, ensure_ascii=False, default=str)
+    return truncate_text(serialized)
+
+def genai_messages(messages: Any) -> str:
+    """Normalize chat-like messages to the ARMS GenAI message schema.
+
+    Each item may expose ``role`` / ``content`` as attributes or, for dicts,
+    as keys. A missing or falsy role becomes ``"user"`` and a missing or
+    falsy content becomes ``""``. The result is the JSON text produced by
+    ``json_dumps_attr``; ``None`` or empty *messages* give an empty list.
+    """
+
+    def _field(item: Any, key: str, fallback: str) -> Any:
+        # Attribute first, then dict key, then the fallback.
+        value = safe_get(item, key)
+        if not value and isinstance(item, dict):
+            value = item.get(key)
+        return value or fallback
+
+    normalized = [
+        {
+            "role": str(_field(item, "role", "user")),
+            "parts": [
+                {"type": "text", "content": str(_field(item, "content", ""))}
+            ],
+        }
+        for item in messages or []
+    ]
+    return json_dumps_attr(normalized)
diff --git a/instrumentation-loongsuite/loongsuite-instrumentation-slop-code/src/opentelemetry/instrumentation/slop_code/version.py b/instrumentation-loongsuite/loongsuite-instrumentation-slop-code/src/opentelemetry/instrumentation/slop_code/version.py
new file mode 100644
index 000000000..7bee975f0
--- /dev/null
+++ b/instrumentation-loongsuite/loongsuite-instrumentation-slop-code/src/opentelemetry/instrumentation/slop_code/version.py
@@ -0,0 +1,15 @@
+# Copyright The OpenTelemetry Authors
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+__version__ = "0.5.0.dev"  # version string of the slop_code instrumentation package
diff --git a/instrumentation-loongsuite/loongsuite-instrumentation-slop-code/src/opentelemetry/instrumentation/slop_code/wrappers/__init__.py b/instrumentation-loongsuite/loongsuite-instrumentation-slop-code/src/opentelemetry/instrumentation/slop_code/wrappers/__init__.py
new file mode 100644
index 000000000..b0a6f4284
--- /dev/null
+++ b/instrumentation-loongsuite/loongsuite-instrumentation-slop-code/src/opentelemetry/instrumentation/slop_code/wrappers/__init__.py
@@ -0,0 +1,13 @@
+# Copyright The OpenTelemetry Authors
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
diff --git a/instrumentation-loongsuite/loongsuite-instrumentation-slop-code/src/opentelemetry/instrumentation/slop_code/wrappers/agent.py b/instrumentation-loongsuite/loongsuite-instrumentation-slop-code/src/opentelemetry/instrumentation/slop_code/wrappers/agent.py
new file mode 100644
index 000000000..96d4a0f72
--- /dev/null
+++ b/instrumentation-loongsuite/loongsuite-instrumentation-slop-code/src/opentelemetry/instrumentation/slop_code/wrappers/agent.py
@@ -0,0 +1,103 @@
+# Copyright The OpenTelemetry Authors
+# Licensed under the Apache License, Version 2.0
+
+"""AGENT span wrapper for Agent.run_checkpoint."""
+
+import logging
+
+from opentelemetry import trace as trace_api
+from opentelemetry.instrumentation.slop_code.utils import (
+    SYSTEM_NAME,
+    safe_get,
+    set_optional_attr,
+    genai_messages,
+)
+from opentelemetry.semconv._incubating.attributes import gen_ai_attributes
+from opentelemetry.trace import SpanKind, Status, StatusCode
+from opentelemetry.util.genai.extended_semconv import gen_ai_extended_attributes
+
+logger = logging.getLogger(__name__)
+
+
+def _assistant_messages(instance):
+    """Collect up to the last three non-empty assistant messages, reading ``_steps`` first and ``_messages`` as fallback."""
+    collected = []
+    for step in safe_get(instance, "_steps", []) or []:
+        step_role = safe_get(step, "role")
+        # Match on the role's ``value`` when safe_get finds one (presumably enum-like roles), else on the role itself.
+        is_assistant = str(safe_get(step_role, "value", step_role)).lower().endswith("assistant")
+        text = safe_get(step, "content") if is_assistant else None
+        if text:
+            collected.append({"role": "assistant", "content": text})
+    for msg in ([] if collected else safe_get(instance, "_messages", []) or []):
+        as_dict = msg if isinstance(msg, dict) else {}
+        is_assistant = (safe_get(msg, "role") or as_dict.get("role")) == "assistant"
+        text = (safe_get(msg, "content") or as_dict.get("content")) if is_assistant else None
+        if text:
+            collected.append({"role": "assistant", "content": text})
+    return collected[-3:]
+
+
+class _AgentRunCheckpointWrapper:
+    """Wrap ``Agent.run_checkpoint`` in an ``invoke_agent`` AGENT span with input, output and usage attributes."""
+
+    def __init__(self, tracer: trace_api.Tracer):
+        self._tracer = tracer
+
+    def __call__(self, wrapped, instance, args, kwargs):
+        """Run ``wrapped`` inside the span; its exceptions are recorded once and re-raised."""
+        task_input = args[0] if args else kwargs.get("task")
+        agent_name = type(instance).__name__
+        attrs = {
+            gen_ai_attributes.GEN_AI_OPERATION_NAME: "invoke_agent",
+            gen_ai_attributes.GEN_AI_SYSTEM: SYSTEM_NAME,
+            gen_ai_extended_attributes.GEN_AI_SPAN_KIND: gen_ai_extended_attributes.GenAiSpanKindValues.AGENT.value,
+            "gen_ai.framework": SYSTEM_NAME,
+            "gen_ai.agent.name": agent_name,
+            "gen_ai.agent.id": agent_name,
+            "gen_ai.agent.description": "slop-code benchmark agent",
+            "slop_code.problem.name": str(safe_get(instance, "problem_name", "unknown")),
+        }
+        if task_input is not None:
+            attrs["gen_ai.input.messages"] = genai_messages([{"role": "user", "content": str(task_input)}])
+        # Failures are recorded by the handler below, so automatic recording is disabled to avoid duplicate events.
+        with self._tracer.start_as_current_span(
+            name=f"invoke_agent {agent_name}", kind=SpanKind.INTERNAL, attributes=attrs,
+            record_exception=False, set_status_on_exception=False,
+        ) as span:
+            try:
+                result = wrapped(*args, **kwargs)
+            except Exception as exc:
+                span.record_exception(exc)
+                span.set_status(Status(StatusCode.ERROR, str(exc)))
+                span.set_attribute("error.type", type(exc).__name__)
+                raise
+            try:
+                self._record_result(span, instance, result)
+            except Exception:  # telemetry must never fail a successful run
+                logger.debug("Failed to record agent result attributes", exc_info=True)
+            span.set_status(Status(StatusCode.OK))
+            return result
+
+    @staticmethod
+    def _record_result(span, instance, result) -> None:
+        """Copy token usage, assistant output and run statistics onto ``span``."""
+        agg = getattr(instance, "_otel_slop_aggregate_tokens", {}) or {}
+        input_tokens = int(agg.get("input", 0) or 0)
+        output_tokens = int(agg.get("output", 0) or 0)
+        usage = safe_get(result, "usage") if result is not None else None
+        net_tokens = safe_get(usage, "net_tokens") if usage is not None else None
+        # Use the usage reported on the result only where no step-level totals were accumulated.
+        if net_tokens is not None:
+            input_tokens = input_tokens or int(safe_get(net_tokens, "input", 0) or 0)
+            output_tokens = output_tokens or int(safe_get(net_tokens, "output", 0) or 0)
+        # ``or None`` keeps zero counts off the span; set_optional_attr skips None values.
+        set_optional_attr(span, gen_ai_attributes.GEN_AI_USAGE_INPUT_TOKENS, input_tokens or None)
+        set_optional_attr(span, gen_ai_attributes.GEN_AI_USAGE_OUTPUT_TOKENS, output_tokens or None)
+        set_optional_attr(span, "gen_ai.usage.total_tokens", (input_tokens + output_tokens) or None)
+        messages = _assistant_messages(instance)
+        set_optional_attr(span, "gen_ai.output.messages", genai_messages(messages) if messages else None)
+        if usage is not None:
+            set_optional_attr(span, "slop_code.usage.cost", safe_get(usage, "cost"))
+            set_optional_attr(span, "slop_code.usage.steps", safe_get(usage, "steps"))
+        set_optional_attr(span, "slop_code.elapsed_seconds", safe_get(result, "elapsed") if result is not None else None)
diff --git a/instrumentation-loongsuite/loongsuite-instrumentation-slop-code/src/opentelemetry/instrumentation/slop_code/wrappers/entry.py b/instrumentation-loongsuite/loongsuite-instrumentation-slop-code/src/opentelemetry/instrumentation/slop_code/wrappers/entry.py
new file mode 100644
index 000000000..220f9e27f
--- /dev/null
+++ b/instrumentation-loongsuite/loongsuite-instrumentation-slop-code/src/opentelemetry/instrumentation/slop_code/wrappers/entry.py
@@ -0,0 +1,85 @@
+# Copyright The OpenTelemetry Authors
+# Licensed under the Apache License, Version 2.0
+
+"""ENTRY span wrappers for slop-code benchmark runs."""
+
+import json
+import logging
+
+from opentelemetry import trace as trace_api
+from opentelemetry.instrumentation.slop_code.utils import (
+    SYSTEM_NAME,
+    genai_messages,
+    safe_get,
+    set_optional_attr,
+)
+from opentelemetry.semconv._incubating.attributes import gen_ai_attributes
+from opentelemetry.trace import SpanKind, Status, StatusCode
+from opentelemetry.util.genai.extended_semconv import gen_ai_extended_attributes
+
+logger = logging.getLogger(__name__)
+
+
+class _EntryWrapper:
+    """Wrapper for the top-level CLI run_agent command; runs it inside an ENTRY span."""
+
+    def __init__(self, tracer: trace_api.Tracer):
+        self._tracer = tracer
+
+    def __call__(self, wrapped, instance, args, kwargs):
+        attrs = {
+            gen_ai_attributes.GEN_AI_OPERATION_NAME: "enter",
+            gen_ai_attributes.GEN_AI_SYSTEM: SYSTEM_NAME,
+            gen_ai_extended_attributes.GEN_AI_SPAN_KIND: gen_ai_extended_attributes.GenAiSpanKindValues.ENTRY.value,
+            "gen_ai.framework": SYSTEM_NAME,
+        }
+        # Failures are recorded by the handler below, so automatic recording is disabled to avoid duplicate events.
+        with self._tracer.start_as_current_span(
+            name="enter_ai_application_system", kind=SpanKind.INTERNAL, attributes=attrs, record_exception=False, set_status_on_exception=False
+        ) as span:
+            try:
+                result = wrapped(*args, **kwargs)
+            except Exception as exc:
+                span.record_exception(exc)
+                span.set_status(Status(StatusCode.ERROR, str(exc)))
+                raise
+            span.set_status(Status(StatusCode.OK))
+            return result
+
+
+class _RunnerEntryWrapper:
+    """Create an ENTRY span inside the worker process so child spans share it."""
+
+    def __init__(self, tracer: trace_api.Tracer):
+        self._tracer = tracer
+
+    def __call__(self, wrapped, instance, args, kwargs):
+        problem = safe_get(safe_get(instance, "run_spec"), "problem")
+        attrs = {
+            gen_ai_attributes.GEN_AI_OPERATION_NAME: "enter",
+            gen_ai_attributes.GEN_AI_SYSTEM: SYSTEM_NAME,
+            gen_ai_extended_attributes.GEN_AI_SPAN_KIND: gen_ai_extended_attributes.GenAiSpanKindValues.ENTRY.value,
+            "gen_ai.framework": SYSTEM_NAME,
+            "gen_ai.session.id": str(safe_get(problem, "name", "unknown")),
+        }
+        # Capture the benchmark problem prompt as the application input when available.
+        task = safe_get(problem, "prompt") or safe_get(problem, "statement") or safe_get(problem, "description")
+        if task is not None:
+            attrs["gen_ai.input.messages"] = genai_messages([{"role": "user", "content": str(task)}])
+        # Failures are recorded by the handler below, so automatic recording is disabled to avoid duplicate events.
+        with self._tracer.start_as_current_span(
+            name="enter_ai_application_system", kind=SpanKind.INTERNAL, attributes=attrs, record_exception=False, set_status_on_exception=False
+        ) as span:
+            try:
+                result = wrapped(*args, **kwargs)
+            except Exception as exc:
+                span.record_exception(exc)
+                span.set_status(Status(StatusCode.ERROR, str(exc)))
+                raise
+            try:
+                if result is not None:
+                    set_optional_attr(span, "output.value", json.dumps(result, ensure_ascii=False, default=str)[:1024])
+            except Exception:  # telemetry must never fail a successful run
+                logger.debug("Failed to record runner output", exc_info=True)
+            span.set_status(Status(StatusCode.OK))
+            return result
diff --git a/instrumentation-loongsuite/loongsuite-instrumentation-slop-code/src/opentelemetry/instrumentation/slop_code/wrappers/llm.py b/instrumentation-loongsuite/loongsuite-instrumentation-slop-code/src/opentelemetry/instrumentation/slop_code/wrappers/llm.py
new file mode 100644
index 000000000..5090bc007
--- /dev/null
+++ b/instrumentation-loongsuite/loongsuite-instrumentation-slop-code/src/opentelemetry/instrumentation/slop_code/wrappers/llm.py
@@ -0,0 +1,117 @@
+# Copyright The OpenTelemetry Authors
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""LLM span wrapper for grade_file_async (Rubric Judge)."""
+
+import logging
+
+from opentelemetry import trace as trace_api
+from opentelemetry.instrumentation.slop_code.utils import (
+    SYSTEM_NAME,
+    set_optional_attr,
+    json_dumps_attr,
+    genai_messages,
+)
+from opentelemetry.semconv._incubating.attributes import gen_ai_attributes
+from opentelemetry.trace import SpanKind, Status, StatusCode
+from opentelemetry.util.genai.extended_semconv import gen_ai_extended_attributes
+
+logger = logging.getLogger(__name__)
+
+
+class _RubricGradeWrapper:
+    """Wrapper for grade_file_async (rubric judge) that creates a ``chat`` LLM span."""
+
+    def __init__(self, tracer: trace_api.Tracer):
+        self._tracer = tracer
+
+    @staticmethod
+    def _arg(args, kwargs, name, index, default=None):
+        """Return the argument given as keyword ``name`` or at position ``index``; falsy values such as 0 are kept."""
+        if name in kwargs:
+            return kwargs[name]
+        return args[index] if len(args) > index else default
+
+    async def __call__(self, wrapped, instance, args, kwargs):
+        """Await ``wrapped`` inside the span and record usage, response id and choices from its result."""
+        # grade_file_async(prompt_prefix, criteria_text, file_name, model, provider, temperature, ...)
+        model = self._arg(args, kwargs, "model", 3) or "unknown"
+        provider = self._arg(args, kwargs, "provider", 4)
+        temperature = self._arg(args, kwargs, "temperature", 5)
+        prompt_prefix = self._arg(args, kwargs, "prompt_prefix", 0)
+        criteria_text = self._arg(args, kwargs, "criteria_text", 1)
+        # Determine system name from provider (its ``value`` attribute when present, else its string form).
+        system_name = SYSTEM_NAME
+        if provider is not None:
+            system_name = str(getattr(provider, "value", provider)).lower()
+        attrs = {
+            gen_ai_attributes.GEN_AI_OPERATION_NAME: "chat",
+            gen_ai_attributes.GEN_AI_SYSTEM: system_name,
+            gen_ai_extended_attributes.GEN_AI_SPAN_KIND: gen_ai_extended_attributes.GenAiSpanKindValues.LLM.value,
+            gen_ai_attributes.GEN_AI_REQUEST_MODEL: str(model),
+            "gen_ai.provider.name": system_name,
+            "gen_ai.framework": SYSTEM_NAME,
+        }
+        if prompt_prefix is not None or criteria_text is not None:
+            attrs["gen_ai.input.messages"] = genai_messages([{"role": "user", "content": str(prompt_prefix or "") + "\n\n" + str(criteria_text or "")}])
+        if temperature is not None:
+            attrs[gen_ai_attributes.GEN_AI_REQUEST_TEMPERATURE] = float(temperature)
+
+        # Failures are recorded by the handler below, so automatic recording is disabled to avoid duplicate events.
+        with self._tracer.start_as_current_span(
+            name=f"chat {model}", kind=SpanKind.CLIENT, attributes=attrs,
+            record_exception=False, set_status_on_exception=False,
+        ) as span:
+            try:
+                result = await wrapped(*args, **kwargs)
+            except Exception as e:
+                span.record_exception(e)
+                span.set_status(Status(StatusCode.ERROR, str(e)))
+                raise
+            try:
+                # result is tuple[list[dict], dict[str, Any]]
+                response_data = result[1] if isinstance(result, tuple) and len(result) >= 2 else None
+                if isinstance(response_data, dict):
+                    _set_usage_from_response(span, response_data)
+                    set_optional_attr(span, "gen_ai.response.id", response_data.get("id"))
+                    if response_data.get("choices") is not None:
+                        span.set_attribute("gen_ai.output.messages", json_dumps_attr(response_data.get("choices")))
+            except Exception:  # telemetry must never fail a successful grading call
+                logger.debug("Failed to record rubric grading response", exc_info=True)
+            span.set_status(Status(StatusCode.OK))
+            return result
+
+
+def _set_usage_from_response(span, response_data: dict) -> None:
+    """Copy token usage from ``response_data["usage"]`` onto ``span``; does nothing unless ``usage`` is a dict.
+
+    Both prompt/completion and input/output key names are accepted, and a
+    reported count of ``0`` is kept instead of being treated as missing.
+    """
+    usage = response_data.get("usage")
+    if not isinstance(usage, dict):
+        return
+
+    def first_present(*keys):
+        return next((usage[k] for k in keys if usage.get(k) is not None), None)
+    # OpenRouter format: prompt_tokens / completion_tokens; Bedrock format (normalized): input_tokens / output_tokens
+    input_tokens = first_present("prompt_tokens", "input_tokens")
+    output_tokens = first_present("completion_tokens", "output_tokens")
+    set_optional_attr(span, gen_ai_attributes.GEN_AI_USAGE_INPUT_TOKENS, input_tokens)
+    set_optional_attr(span, gen_ai_attributes.GEN_AI_USAGE_OUTPUT_TOKENS, output_tokens)
+    if input_tokens is not None and output_tokens is not None:
+        set_optional_attr(span, "gen_ai.usage.total_tokens", input_tokens + output_tokens)
+    # Cache tokens (OpenRouter specific)
+    set_optional_attr(span, gen_ai_extended_attributes.GEN_AI_USAGE_CACHE_READ_INPUT_TOKENS, usage.get("cache_read_input_tokens"))
+    set_optional_attr(span, gen_ai_extended_attributes.GEN_AI_USAGE_CACHE_CREATION_INPUT_TOKENS, usage.get("cache_creation_input_tokens"))
diff --git a/instrumentation-loongsuite/loongsuite-instrumentation-slop-code/src/opentelemetry/instrumentation/slop_code/wrappers/step.py b/instrumentation-loongsuite/loongsuite-instrumentation-slop-code/src/opentelemetry/instrumentation/slop_code/wrappers/step.py
new file mode 100644
index 000000000..4650d1689
--- /dev/null
+++ b/instrumentation-loongsuite/loongsuite-instrumentation-slop-code/src/opentelemetry/instrumentation/slop_code/wrappers/step.py
@@ -0,0 +1,140 @@
+# Copyright The OpenTelemetry Authors
+# Licensed under the Apache License, Version 2.0
+
+"""STEP span wrappers for MiniSWEAgent ReAct iterations."""
+
+import logging
+
+from opentelemetry import context as context_api
+from opentelemetry import trace as trace_api
+from opentelemetry.instrumentation.slop_code.utils import (
+    SYSTEM_NAME,
+    safe_get,
+    set_optional_attr,
+    genai_messages,
+)
+from opentelemetry.semconv._incubating.attributes import gen_ai_attributes
+from opentelemetry.trace import SpanKind, Status, StatusCode
+from opentelemetry.util.genai.extended_semconv import gen_ai_extended_attributes
+
+logger = logging.getLogger(__name__)
+
+_STEP_SPAN_ATTR = "_otel_slop_step_span"  # instance attribute holding the currently open STEP span
+_STEP_TOKEN_ATTR = "_otel_slop_step_token"  # instance attribute holding the context token from attaching that span
+_AGG_TOKENS_ATTR = "_otel_slop_aggregate_tokens"  # instance attribute holding the running input/output token totals
+
+
+def _estimate_tokens(text) -> int:
+    """Estimate tokens as one per four characters of ``str(text)``, rounded up; ``None`` and empty text give 0."""
+    rendered = "" if text is None else str(text)
+    # Ceiling division: any non-empty string counts as at least one token.
+    return -(-len(rendered) // 4)
+
+
+def _add_agent_tokens(instance, input_tokens: int, output_tokens: int) -> None:
+    """Add one step's token counts to the running totals stored on ``instance`` under ``_AGG_TOKENS_ATTR``."""
+    totals = getattr(instance, _AGG_TOKENS_ATTR, {"input": 0, "output": 0})
+    totals.update(input=int(totals.get("input", 0)) + int(input_tokens or 0), output=int(totals.get("output", 0)) + int(output_tokens or 0))
+    setattr(instance, _AGG_TOKENS_ATTR, totals)
+
+
+class _MiniSWEStepWrapper:
+    """Start a STEP span before the model call and keep it open for tool execution."""
+
+    def __init__(self, tracer: trace_api.Tracer):
+        self._tracer = tracer
+
+    def __call__(self, wrapped, instance, args, kwargs):
+        # Close a span left open by an earlier step so neither it nor its context token is leaked.
+        if getattr(instance, _STEP_SPAN_ATTR, None) is not None:
+            _finish_step(instance, Status(StatusCode.OK), "stop")
+        usage = safe_get(instance, "usage")
+        messages = safe_get(instance, "_messages", [])
+        attrs = {
+            gen_ai_attributes.GEN_AI_OPERATION_NAME: "react",
+            gen_ai_attributes.GEN_AI_SYSTEM: SYSTEM_NAME,
+            gen_ai_extended_attributes.GEN_AI_SPAN_KIND: gen_ai_extended_attributes.GenAiSpanKindValues.STEP.value,
+            gen_ai_extended_attributes.GEN_AI_REACT_ROUND: (safe_get(usage, "steps", 0) if usage else 0) + 1,
+            "gen_ai.framework": SYSTEM_NAME,
+        }
+        if messages:
+            attrs["gen_ai.input.messages"] = genai_messages(messages)
+        span = self._tracer.start_span("react step", kind=SpanKind.INTERNAL, attributes=attrs)
+        setattr(instance, _STEP_SPAN_ATTR, span)
+        setattr(instance, _STEP_TOKEN_ATTR, context_api.attach(trace_api.set_span_in_context(span)))
+        try:
+            result = wrapped(*args, **kwargs)
+        except Exception as exc:
+            span.record_exception(exc)
+            _finish_step(instance, Status(StatusCode.ERROR, str(exc)), "error")
+            raise
+        try:
+            _record_step_result(instance, span, result, messages)
+        except Exception:  # telemetry must never fail a successful step
+            logger.debug("Failed to record step result", exc_info=True)
+        if result is None:
+            _finish_step(instance, Status(StatusCode.OK), "stop")
+        return result
+
+
+class _MiniSWEObservationWrapper:
+    """Close the STEP span stored on the instance after the wrapped observation call: "error" on exception, else "stop"."""
+
+    def __init__(self, tracer: trace_api.Tracer):
+        self._tracer = tracer
+
+    def __call__(self, wrapped, instance, args, kwargs):
+        try:
+            return wrapped(*args, **kwargs)
+        except Exception as exc:
+            open_span = getattr(instance, _STEP_SPAN_ATTR, None)
+            if open_span is not None:
+                open_span.record_exception(exc)
+            _finish_step(instance, Status(StatusCode.ERROR, str(exc)), "error")
+            raise
+        finally:
+            if getattr(instance, _STEP_SPAN_ATTR, None) is not None:
+                _finish_step(instance, Status(StatusCode.OK), "stop")
+
+
+def _record_step_result(instance, span, result, messages) -> None:
+    """Record token usage, cost and assistant output of one step on ``span``; non-dict results are ignored.
+
+    Missing or zero token counts are replaced by length-based estimates before being set and added to the agent totals.
+    """
+    if not isinstance(result, dict):
+        return
+    reported = result.get("token_usage")
+    reply = result.get("content")
+    prompt_count = (safe_get(reported, "input") if reported is not None else None) or _estimate_tokens(genai_messages(messages))
+    reply_count = (safe_get(reported, "output") if reported is not None else None) or _estimate_tokens(reply)
+    set_optional_attr(span, gen_ai_attributes.GEN_AI_USAGE_INPUT_TOKENS, prompt_count)
+    set_optional_attr(span, gen_ai_attributes.GEN_AI_USAGE_OUTPUT_TOKENS, reply_count)
+    if prompt_count is not None and reply_count is not None:
+        set_optional_attr(span, "gen_ai.usage.total_tokens", prompt_count + reply_count)
+        _add_agent_tokens(instance, prompt_count, reply_count)
+    if reported is not None:
+        set_optional_attr(span, gen_ai_extended_attributes.GEN_AI_USAGE_CACHE_READ_INPUT_TOKENS, safe_get(reported, "cache_read"))
+        set_optional_attr(span, gen_ai_extended_attributes.GEN_AI_USAGE_CACHE_CREATION_INPUT_TOKENS, safe_get(reported, "cache_write"))
+    set_optional_attr(span, "slop_code.step.cost", result.get("step_cost"))
+    if reply is not None:
+        set_optional_attr(span, "gen_ai.output.messages", genai_messages([{"role": "assistant", "content": reply}]))
+
+
+def _finish_step(instance, status: Status, finish_reason: str) -> None:
+    """End the STEP span stored on ``instance`` (if any), detach its context token and clear both attributes."""
+    span, token = getattr(instance, _STEP_SPAN_ATTR, None), getattr(instance, _STEP_TOKEN_ATTR, None)
+    if span is None:
+        return
+    try:
+        span.set_attribute(gen_ai_extended_attributes.GEN_AI_REACT_FINISH_REASON, finish_reason)
+        span.set_status(status)
+        span.end()
+    finally:
+        if token is not None:
+            context_api.detach(token)
+        for name in (_STEP_SPAN_ATTR, _STEP_TOKEN_ATTR):
+            try:
+                delattr(instance, name)
+            except AttributeError:
+                pass
diff --git a/instrumentation-loongsuite/loongsuite-instrumentation-slop-code/src/opentelemetry/instrumentation/slop_code/wrappers/task.py b/instrumentation-loongsuite/loongsuite-instrumentation-slop-code/src/opentelemetry/instrumentation/slop_code/wrappers/task.py
new file mode 100644
index 000000000..812e61b48
--- /dev/null
+++ b/instrumentation-loongsuite/loongsuite-instrumentation-slop-code/src/opentelemetry/instrumentation/slop_code/wrappers/task.py
@@ -0,0 +1,77 @@
+# Copyright The OpenTelemetry Authors
+# Licensed under the Apache License, Version 2.0
+
+"""ENTRY + TASK span wrapper for AgentRunner._run_checkpoint."""
+
+import logging
+
+from opentelemetry import trace as trace_api
+from opentelemetry.instrumentation.slop_code.utils import SYSTEM_NAME, safe_get, set_optional_attr
+from opentelemetry.semconv._incubating.attributes import gen_ai_attributes
+from opentelemetry.trace import SpanKind, Status, StatusCode
+from opentelemetry.util.genai.extended_semconv import gen_ai_extended_attributes
+
+logger = logging.getLogger(__name__)
+
+
+class _TaskRunCheckpointWrapper:
+    """Create an ENTRY span and a child TASK span for each benchmark checkpoint.
+
+    Exceptions from the wrapped call are recorded once on both spans and re-raised.
+    """
+
+    def __init__(self, tracer: trace_api.Tracer):
+        self._tracer = tracer
+
+    def __call__(self, wrapped, instance, args, kwargs):
+        """Run ``wrapped`` inside nested ENTRY/TASK spans and record its outcome on both."""
+        checkpoint = args[0] if args else kwargs.get("checkpoint")
+        is_first_checkpoint = args[2] if len(args) > 2 else kwargs.get("is_first_checkpoint", False)
+        checkpoint_name = safe_get(checkpoint, "name", "unknown")
+        checkpoint_order = safe_get(checkpoint, "order")
+        problem = safe_get(safe_get(instance, "run_spec"), "problem")
+        problem_name = safe_get(problem, "name", checkpoint_name)
+
+        entry_attrs = {
+            gen_ai_attributes.GEN_AI_OPERATION_NAME: "enter",
+            gen_ai_attributes.GEN_AI_SYSTEM: SYSTEM_NAME,
+            gen_ai_extended_attributes.GEN_AI_SPAN_KIND: "ENTRY",
+            "gen_ai.framework": SYSTEM_NAME,
+            "gen_ai.session.id": str(problem_name),
+        }
+        task_attrs = {
+            gen_ai_attributes.GEN_AI_OPERATION_NAME: "run_task",
+            gen_ai_attributes.GEN_AI_SYSTEM: SYSTEM_NAME,
+            gen_ai_extended_attributes.GEN_AI_SPAN_KIND: "TASK",
+            "gen_ai.framework": SYSTEM_NAME,
+            "input.value": str(checkpoint_name),
+            "input.mime_type": "text/plain",
+            "slop_code.checkpoint.name": str(checkpoint_name),
+            "slop_code.is_first_checkpoint": bool(is_first_checkpoint),
+        }
+        if checkpoint_order is not None:
+            task_attrs["slop_code.checkpoint.order"] = checkpoint_order
+
+        # Failures are recorded by the handler below, so automatic recording is disabled to avoid duplicate events.
+        span_options = {"kind": SpanKind.INTERNAL, "record_exception": False, "set_status_on_exception": False}
+        with self._tracer.start_as_current_span(name="enter_ai_application_system", attributes=entry_attrs, **span_options) as entry_span:
+            with self._tracer.start_as_current_span(name=f"run_task {checkpoint_name}", attributes=task_attrs, **span_options) as task_span:
+                try:
+                    result = wrapped(*args, **kwargs)
+                except Exception as exc:
+                    for failed_span in (task_span, entry_span):
+                        failed_span.record_exception(exc)
+                        failed_span.set_status(Status(StatusCode.ERROR, str(exc)))
+                    raise
+                try:
+                    if result is not None:
+                        set_optional_attr(task_span, "slop_code.had_error", safe_get(result, "had_error"))
+                        set_optional_attr(task_span, "slop_code.passed_policy", safe_get(result, "passed_policy"))
+                        set_optional_attr(task_span, "output.value", str(result))
+                        set_optional_attr(task_span, "output.mime_type", "text/plain")
+                        set_optional_attr(entry_span, "output.value", str(result))
+                except Exception:  # telemetry must never fail a successful checkpoint
+                    logger.debug("Failed to record checkpoint result", exc_info=True)
+                task_span.set_status(Status(StatusCode.OK))
+                entry_span.set_status(Status(StatusCode.OK))
+                return result
diff --git a/instrumentation-loongsuite/loongsuite-instrumentation-slop-code/src/opentelemetry/instrumentation/slop_code/wrappers/tool.py b/instrumentation-loongsuite/loongsuite-instrumentation-slop-code/src/opentelemetry/instrumentation/slop_code/wrappers/tool.py
new file mode 100644
index 000000000..cec69b826
--- /dev/null
+++ b/instrumentation-loongsuite/loongsuite-instrumentation-slop-code/src/opentelemetry/instrumentation/slop_code/wrappers/tool.py
@@ -0,0 +1,58 @@
+# Copyright The OpenTelemetry Authors
+# Licensed under the Apache License, Version 2.0
+
+"""TOOL span wrapper for MiniSWEAgent.execute_action."""
+
+import json
+import logging
+from uuid import uuid4
+
+from opentelemetry import trace as trace_api
+from opentelemetry.instrumentation.slop_code.utils import SYSTEM_NAME, truncate_text
+from opentelemetry.semconv._incubating.attributes import gen_ai_attributes
+from opentelemetry.trace import SpanKind, Status, StatusCode
+from opentelemetry.util.genai.extended_semconv import gen_ai_extended_attributes
+
+logger = logging.getLogger(__name__)
+
+
+def _json_attr(value) -> str:  # JSON-encode ``value`` (unsupported types via ``str``) and bound it with truncate_text
+    return truncate_text(json.dumps(value, default=str, ensure_ascii=False))
+
+
+class _ToolExecuteActionWrapper:
+    """Wrap shell/tool execution performed by the benchmark agent in an ``execute_tool bash`` span."""
+
+    def __init__(self, tracer: trace_api.Tracer):
+        self._tracer = tracer
+
+    def __call__(self, wrapped, instance, args, kwargs):
+        action = args[0] if args else kwargs.get("action", {})
+        command = action.get("action") if isinstance(action, dict) else str(action)
+        attrs = {
+            gen_ai_attributes.GEN_AI_OPERATION_NAME: "execute_tool",
+            gen_ai_attributes.GEN_AI_SYSTEM: SYSTEM_NAME,
+            gen_ai_extended_attributes.GEN_AI_SPAN_KIND: "TOOL",
+            "gen_ai.framework": SYSTEM_NAME,
+            "gen_ai.tool.call.id": str(uuid4()),
+            "gen_ai.tool.name": "bash",
+            "gen_ai.tool.type": "function",
+            "gen_ai.tool.description": "Execute a shell command in the benchmark environment",
+            "gen_ai.tool.call.arguments": _json_attr({"command": command}),
+        }
+        with self._tracer.start_as_current_span(name="execute_tool bash", kind=SpanKind.INTERNAL, attributes=attrs,
+                                                record_exception=False, set_status_on_exception=False) as span:  # failures are recorded explicitly below
+            try:
+                result = wrapped(*args, **kwargs)
+            except Exception as exc:
+                span.record_exception(exc)
+                span.set_attribute("gen_ai.tool.call.result", _json_attr({"error": str(exc), "error.type": type(exc).__name__}))
+                span.set_status(Status(StatusCode.ERROR, str(exc)))
+                span.set_attribute("error.type", type(exc).__name__)
+                raise
+            try:
+                span.set_attribute("gen_ai.tool.call.result", _json_attr(result))
+            except Exception:  # telemetry must never fail a successful tool call
+                logger.debug("Failed to record tool result", exc_info=True)
+            span.set_status(Status(StatusCode.OK))
+            return result
diff --git a/instrumentation-loongsuite/loongsuite-instrumentation-slop-code/src/opentelemetry/instrumentation/slop_code/wrappers/workflow.py b/instrumentation-loongsuite/loongsuite-instrumentation-slop-code/src/opentelemetry/instrumentation/slop_code/wrappers/workflow.py
new file mode 100644
index 000000000..5032a48c2
--- /dev/null
+++ b/instrumentation-loongsuite/loongsuite-instrumentation-slop-code/src/opentelemetry/instrumentation/slop_code/wrappers/workflow.py
@@ -0,0 +1,123 @@
+# Copyright The OpenTelemetry Authors
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""CHAIN/workflow span wrapper for run_agent_on_problem."""
+
+import logging
+
+from opentelemetry import trace as trace_api
+from opentelemetry.instrumentation.slop_code.utils import (
+    SYSTEM_NAME,
+    safe_get,
+    safe_get_nested,
+    set_optional_attr,
+)
+from opentelemetry.semconv._incubating.attributes import gen_ai_attributes
+from opentelemetry.trace import SpanKind, Status, StatusCode
+from opentelemetry.util.genai.extended_semconv import gen_ai_extended_attributes
+
+logger = logging.getLogger(__name__)
+
+
+class _WorkflowWrapper:
+    """Wrapper for run_agent_on_problem to create workflow (CHAIN) span."""
+
+    def __init__(self, tracer: trace_api.Tracer) -> None:
+        """Keep the tracer used to open one CHAIN span per wrapped call."""
+        self._tracer = tracer
+
+    def __call__(self, wrapped, instance, args, kwargs):
+        """Run ``wrapped`` in a ``chain <problem_name>`` span, then flush the provider."""
+        # run_agent_on_problem(problem_config, problem_name, config, progress_queue, output_path)
+        problem_name = args[1] if len(args) > 1 else kwargs.get("problem_name", "unknown")
+        config = args[2] if len(args) > 2 else kwargs.get("config")
+
+        span_name = f"chain {problem_name}"
+
+        attrs = {
+            gen_ai_attributes.GEN_AI_OPERATION_NAME: "workflow",
+            gen_ai_attributes.GEN_AI_SYSTEM: SYSTEM_NAME,
+            gen_ai_extended_attributes.GEN_AI_SPAN_KIND: "CHAIN",
+            "gen_ai.framework": SYSTEM_NAME,
+            "input.value": str(problem_name),
+            "slop_code.problem.name": str(problem_name),
+        }
+
+        # Extract optional attributes from config
+        if config is not None:
+            model_name = safe_get_nested(config, "model_def", "name")
+            set_optional_attr_dict(attrs, gen_ai_attributes.GEN_AI_REQUEST_MODEL, model_name)
+
+            agent_type = safe_get_nested(config, "agent_config", "type")
+            set_optional_attr_dict(attrs, "slop_code.agent.type", agent_type)
+
+            pass_policy = safe_get_nested(config, "pass_policy", "value")
+            if pass_policy is None:
+                pass_policy_obj = safe_get(config, "pass_policy")
+                if pass_policy_obj is not None and hasattr(pass_policy_obj, "value"):
+                    pass_policy = pass_policy_obj.value
+            set_optional_attr_dict(attrs, "slop_code.pass_policy", pass_policy)
+
+        try:
+            with self._tracer.start_as_current_span(
+                name=span_name,
+                kind=SpanKind.INTERNAL,
+                attributes={k: v for k, v in attrs.items() if v is not None},
+            ) as span:
+                # No explicit except block: start_as_current_span() records an
+                # escaping exception and sets ERROR status on exit by default, so
+                # recording it here as well duplicated the exception event.
+                result = wrapped(*args, **kwargs)
+
+                if isinstance(result, dict):
+                    summary = result.get("summary")
+                    if isinstance(summary, dict):
+                        set_optional_attr(
+                            span, "slop_code.state", summary.get("state")
+                        )
+                        set_optional_attr(
+                            span,
+                            "slop_code.passed_policy",
+                            summary.get("passed_policy"),
+                        )
+                        set_optional_attr(span, "output.value", str(summary))
+
+                span.set_status(Status(StatusCode.OK))
+                return result
+        finally:
+            # Flush AFTER the `with` block so the workflow span itself
+            # is `on_end`-delivered to the SpanProcessor before we ask it
+            # to drain. run_agent_on_problem is the last meaningful work
+            # item inside the per-problem worker subprocess; once it
+            # returns, the process is reaped by ProcessPoolExecutor's
+            # shutdown which can short-circuit BatchSpanProcessor's
+            # atexit handler. Without this explicit flush the CHAIN span
+            # (and the tail batch of TASK/AGENT/STEP spans) gets dropped.
+            try:
+                provider = trace_api.get_tracer_provider()
+                flush = getattr(provider, "force_flush", None)
+                if callable(flush):
+                    flush(timeout_millis=5000)
+            except Exception as flush_err:  # noqa: BLE001
+                logger.debug(
+                    "force_flush after workflow span failed: %s", flush_err
+                )
+
+
+def set_optional_attr_dict(attrs: dict, key: str, value) -> None:
+    """Store ``value`` under ``key`` unless it is None; long strings are cut to 1024 chars."""
+    if value is None:
+        return
+    too_long = isinstance(value, str) and len(value) > 1024
+    attrs[key] = value[:1024] if too_long else value
diff --git a/instrumentation-loongsuite/loongsuite-instrumentation-slop-code/test-requirements.txt b/instrumentation-loongsuite/loongsuite-instrumentation-slop-code/test-requirements.txt
new file mode 100644
index 000000000..9facd6bc9
--- /dev/null
+++ b/instrumentation-loongsuite/loongsuite-instrumentation-slop-code/test-requirements.txt
@@ -0,0 +1,8 @@
+pytest
+pytest-asyncio
+pytest-forked==1.6.0
+opentelemetry-api
+opentelemetry-sdk
+opentelemetry-instrumentation
+opentelemetry-semantic-conventions
+wrapt
diff --git a/instrumentation-loongsuite/loongsuite-instrumentation-slop-code/tests/__init__.py b/instrumentation-loongsuite/loongsuite-instrumentation-slop-code/tests/__init__.py
new file mode 100644
index 000000000..e69de29bb
diff --git a/instrumentation-loongsuite/loongsuite-instrumentation-slop-code/tests/conftest.py b/instrumentation-loongsuite/loongsuite-instrumentation-slop-code/tests/conftest.py
new file mode 100644
index 000000000..dcda695d0
--- /dev/null
+++ b/instrumentation-loongsuite/loongsuite-instrumentation-slop-code/tests/conftest.py
@@ -0,0 +1,209 @@
+# Copyright The OpenTelemetry Authors
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Test configuration for slop-code instrumentation tests."""
+
+import os
+import sys
+import types
+from unittest.mock import MagicMock
+
+import pytest
+
+os.environ["OTEL_SEMCONV_STABILITY_OPT_IN"] = "gen_ai_latest_experimental"  # set when conftest is imported
+
+
+def _make_module(name):
+    """Return a new empty module named ``name`` with ``__package__`` set to its parent path."""
+    module = types.ModuleType(name)
+    module.__package__ = name.rpartition(".")[0] if "." in name else name
+    return module
+
+
+def _create_mock_slop_code_modules():
+    """Build stand-in slop_code modules, register them in sys.modules, and return them by name."""
+    # Create every package and leaf module of the mocked slop_code tree
+    mod_slop_code = _make_module("slop_code")
+    mod_entrypoints = _make_module("slop_code.entrypoints")
+    mod_commands = _make_module("slop_code.entrypoints.commands")
+    mod_run_agent = _make_module("slop_code.entrypoints.commands.run_agent")
+    mod_problem_runner = _make_module("slop_code.entrypoints.problem_runner")
+    mod_worker = _make_module("slop_code.entrypoints.problem_runner.worker")
+    mod_driver = _make_module("slop_code.entrypoints.problem_runner.driver")
+    mod_agent_runner = _make_module("slop_code.agent_runner")
+    mod_runner = _make_module("slop_code.agent_runner.runner")
+    mod_agent = _make_module("slop_code.agent_runner.agent")
+    mod_agents = _make_module("slop_code.agent_runner.agents")
+    mod_miniswe = _make_module("slop_code.agent_runner.agents.miniswe")
+    mod_metrics = _make_module("slop_code.metrics")
+    mod_rubric = _make_module("slop_code.metrics.rubric")
+    mod_router = _make_module("slop_code.metrics.rubric.router")
+
+    # --- ENTRY: run_agent ---
+    def run_agent(*args, **kwargs):
+        return {"status": "completed"}
+
+    mod_run_agent.run_agent = run_agent
+
+    # --- WORKFLOW: run_agent_on_problem ---
+    def run_agent_on_problem(*args, **kwargs):
+        return {"summary": {"state": "completed", "passed_policy": True}}
+
+    mod_worker.run_agent_on_problem = run_agent_on_problem
+    # driver re-imports the worker name at module load time. This mock mirrors
+    # the same pattern so the instrumentor's driver-side patch has a target.
+    mod_driver.run_agent_on_problem = run_agent_on_problem
+
+    # --- TASK: AgentRunner._run_checkpoint ---
+    class AgentRunner:
+        def __init__(self):
+            self.agent = MagicMock()
+            self.agent.usage = MagicMock()
+            self.agent.usage.net_tokens = MagicMock()
+            self.agent.usage.net_tokens.input = 100
+            self.agent.usage.net_tokens.output = 50
+
+        def _run_checkpoint(self, checkpoint, checkpoint_save_dir, is_first_checkpoint=False):
+            result = MagicMock()
+            result.had_error = False
+            result.passed_policy = True
+            return result
+
+    mod_runner.AgentRunner = AgentRunner
+
+    # --- AGENT: Agent.run_checkpoint ---
+    class Agent:
+        def __init__(self, problem_name="test_problem"):
+            self.problem_name = problem_name
+            self.usage = MagicMock()
+            self.usage.net_tokens = MagicMock()
+            self.usage.net_tokens.input = 100
+            self.usage.net_tokens.output = 50
+            self.usage.steps = 0
+            self.usage.cost = 0.05
+
+        def run_checkpoint(self, task):
+            result = MagicMock()
+            result.usage = self.usage
+            result.elapsed = 10.5
+            return result
+
+    mod_agent.Agent = Agent
+
+    # --- STEP: MiniSWEAgent.agent_step ---
+    class MiniSWEAgent(Agent):
+        def __init__(self, problem_name="test_problem"):
+            super().__init__(problem_name)
+
+        def agent_step(self):
+            return {
+                "token_usage": MagicMock(input=200, output=80, cache_read=50, cache_write=10),
+                "step_cost": 0.01,
+            }
+
+    mod_miniswe.MiniSWEAgent = MiniSWEAgent
+
+    # --- LLM: grade_file_async ---
+    async def grade_file_async(*args, **kwargs):
+        grades = [{"score": 8, "reasoning": "Good code"}]
+        response_data = {
+            "id": "resp-123",
+            "usage": {
+                "prompt_tokens": 500,
+                "completion_tokens": 200,
+                "cache_read_input_tokens": 100,
+                "cache_creation_input_tokens": 50,
+            },
+        }
+        return grades, response_data
+
+    mod_router.grade_file_async = grade_file_async
+
+    # Expose each child module as an attribute of its parent package
+    mod_slop_code.entrypoints = mod_entrypoints
+    mod_slop_code.agent_runner = mod_agent_runner
+    mod_slop_code.metrics = mod_metrics
+    mod_entrypoints.commands = mod_commands
+    mod_entrypoints.problem_runner = mod_problem_runner
+    mod_commands.run_agent = mod_run_agent
+    mod_problem_runner.worker = mod_worker
+    mod_problem_runner.driver = mod_driver
+    mod_agent_runner.runner = mod_runner
+    mod_agent_runner.agent = mod_agent
+    mod_agent_runner.agents = mod_agents
+    mod_agents.miniswe = mod_miniswe
+    mod_metrics.rubric = mod_rubric
+    mod_rubric.router = mod_router
+
+    # Map dotted names to modules; the loop below installs them into sys.modules
+    modules = {
+        "slop_code": mod_slop_code,
+        "slop_code.entrypoints": mod_entrypoints,
+        "slop_code.entrypoints.commands": mod_commands,
+        "slop_code.entrypoints.commands.run_agent": mod_run_agent,
+        "slop_code.entrypoints.problem_runner": mod_problem_runner,
+        "slop_code.entrypoints.problem_runner.worker": mod_worker,
+        "slop_code.entrypoints.problem_runner.driver": mod_driver,
+        "slop_code.agent_runner": mod_agent_runner,
+        "slop_code.agent_runner.runner": mod_runner,
+        "slop_code.agent_runner.agent": mod_agent,
+        "slop_code.agent_runner.agents": mod_agents,
+        "slop_code.agent_runner.agents.miniswe": mod_miniswe,
+        "slop_code.metrics": mod_metrics,
+        "slop_code.metrics.rubric": mod_rubric,
+        "slop_code.metrics.rubric.router": mod_router,
+    }
+
+    for name, mod in modules.items():
+        sys.modules[name] = mod
+
+    return modules
+
+
+# Install the mock modules when conftest is imported, before any instrumentation imports
+_mock_modules = _create_mock_slop_code_modules()
+
+
+@pytest.fixture(scope="function")
+def span_exporter():
+    """Yield a fresh in-memory span exporter and clear it after the test."""
+    from opentelemetry.sdk.trace.export import in_memory_span_exporter
+
+    memory_exporter = in_memory_span_exporter.InMemorySpanExporter()
+    yield memory_exporter
+    # Teardown: runs once the test that requested the fixture has finished.
+    memory_exporter.clear()
+
+
+@pytest.fixture(scope="function")
+def tracer_provider(span_exporter):
+    """Return a TracerProvider wired to ``span_exporter`` via a SimpleSpanProcessor."""
+    from opentelemetry.sdk import trace as sdk_trace
+    from opentelemetry.sdk.trace import export as sdk_export
+    sdk_provider = sdk_trace.TracerProvider()
+    sdk_provider.add_span_processor(sdk_export.SimpleSpanProcessor(span_exporter))
+    return sdk_provider
+
+
+@pytest.fixture(scope="function")
+def instrument(tracer_provider):
+    """Instrument slop_code for one test and uninstrument during teardown."""
+    from opentelemetry.instrumentation import slop_code as slop_code_instrumentation
+
+    options = {"tracer_provider": tracer_provider, "skip_dep_check": True}
+    active = slop_code_instrumentation.SlopCodeInstrumentor()
+    active.instrument(**options)
+    yield active
+    # Teardown: runs once the test that requested the fixture has finished.
+    active.uninstrument()
diff --git a/instrumentation-loongsuite/loongsuite-instrumentation-slop-code/tests/test_agent_span.py b/instrumentation-loongsuite/loongsuite-instrumentation-slop-code/tests/test_agent_span.py
new file mode 100644
index 000000000..d372ba220
--- /dev/null
+++ b/instrumentation-loongsuite/loongsuite-instrumentation-slop-code/tests/test_agent_span.py
@@ -0,0 +1,102 @@
+# Copyright The OpenTelemetry Authors
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Tests for AGENT span (Agent.run_checkpoint)."""
+
+from unittest.mock import MagicMock
+
+import pytest
+
+from opentelemetry.trace import StatusCode
+
+
+class TestAgentSpan:
+    """Verify that Agent.run_checkpoint produces an AGENT span."""
+
+    def test_agent_span_created(self, span_exporter, instrument):
+        """Agent.run_checkpoint should create an AGENT span."""
+        import slop_code.agent_runner.agent as mod
+
+        agent = mod.Agent(problem_name="file_backup")
+        agent.run_checkpoint("solve the bug")
+
+        spans = span_exporter.get_finished_spans()
+        agent_spans = [
+            s for s in spans
+            if s.attributes.get("gen_ai.operation.name") == "invoke_agent"
+        ]
+        assert len(agent_spans) == 1
+
+        span = agent_spans[0]
+        assert span.name == "agent.Agent"
+        assert span.attributes["gen_ai.system"] == "slop-code"
+        assert span.attributes["gen_ai.span.kind"] == "AGENT"
+        assert span.attributes["gen_ai.agent.name"] == "Agent"
+        assert span.attributes["slop_code.problem.name"] == "file_backup"
+        assert span.status.status_code == StatusCode.OK
+
+    def test_agent_span_captures_usage(self, span_exporter, instrument):
+        """AGENT span should capture token usage from result."""
+        import slop_code.agent_runner.agent as mod
+
+        agent = mod.Agent(problem_name="test_prob")
+        agent.run_checkpoint("task")
+
+        spans = span_exporter.get_finished_spans()
+        agent_spans = [
+            s for s in spans
+            if s.attributes.get("gen_ai.operation.name") == "invoke_agent"
+        ]
+        assert len(agent_spans) == 1
+        span = agent_spans[0]
+
+        assert "gen_ai.usage.input_tokens" in span.attributes
+        assert "gen_ai.usage.output_tokens" in span.attributes
+        assert span.attributes["gen_ai.usage.input_tokens"] == 100
+        assert span.attributes["gen_ai.usage.output_tokens"] == 50
+
+    def test_agent_span_error(self, span_exporter, tracer_provider):
+        """Exception in Agent.run_checkpoint should produce error span."""
+        import slop_code.agent_runner.agent as mod
+
+        from opentelemetry.instrumentation.slop_code import SlopCodeInstrumentor
+
+        class FailingAgent(mod.Agent):
+            def run_checkpoint(self, task):
+                raise TimeoutError("Agent timeout")
+
+        OriginalAgent = mod.Agent
+        instrumentor = SlopCodeInstrumentor()
+        # Patch and instrument inside the try block so that a failure during
+        # setup still restores mod.Agent for the tests that run afterwards.
+        try:
+            mod.Agent = FailingAgent
+            instrumentor.instrument(tracer_provider=tracer_provider, skip_dep_check=True)
+            agent = mod.Agent(problem_name="test_prob")
+
+            with pytest.raises(TimeoutError, match="Agent timeout"):
+                agent.run_checkpoint("task")
+
+            spans = span_exporter.get_finished_spans()
+            agent_spans = [
+                s for s in spans
+                if s.attributes.get("gen_ai.operation.name") == "invoke_agent"
+            ]
+            assert len(agent_spans) == 1
+            span = agent_spans[0]
+            assert span.status.status_code == StatusCode.ERROR
+            assert span.attributes.get("error.type") == "TimeoutError"
+        finally:
+            instrumentor.uninstrument()
+            mod.Agent = OriginalAgent
diff --git a/instrumentation-loongsuite/loongsuite-instrumentation-slop-code/tests/test_entry_span.py b/instrumentation-loongsuite/loongsuite-instrumentation-slop-code/tests/test_entry_span.py
new file mode 100644
index 000000000..2f7c1751f
--- /dev/null
+++ b/instrumentation-loongsuite/loongsuite-instrumentation-slop-code/tests/test_entry_span.py
@@ -0,0 +1,74 @@
+# Copyright The OpenTelemetry Authors
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Tests for ENTRY span (run_agent)."""
+
+import pytest
+
+from opentelemetry.trace import StatusCode
+
+
+class TestEntrySpan:
+    """Verify that run_agent produces an ENTRY span."""
+
+    def test_entry_span_created(self, span_exporter, instrument):
+        """run_agent should create an ENTRY span with correct attributes."""
+        import slop_code.entrypoints.commands.run_agent as mod
+
+        mod.run_agent()
+
+        spans = span_exporter.get_finished_spans()
+        entry_spans = [
+            s for s in spans
+            if s.attributes.get("gen_ai.span.kind") == "ENTRY"
+        ]
+        assert len(entry_spans) == 1
+
+        span = entry_spans[0]
+        assert span.name == "slop-code.enter"
+        assert span.attributes["gen_ai.system"] == "slop-code"
+        assert span.attributes["gen_ai.operation.name"] == "enter"
+        assert span.status.status_code == StatusCode.OK
+
+    def test_entry_span_error(self, span_exporter, tracer_provider):
+        """run_agent raising an exception should produce an error ENTRY span."""
+        import slop_code.entrypoints.commands.run_agent as mod
+
+        from opentelemetry.instrumentation.slop_code import SlopCodeInstrumentor
+
+        # Keep the original so the finally block can put it back
+        original = mod.run_agent
+
+        def failing_run_agent(*args, **kwargs):
+            raise RuntimeError("Config error")
+
+        instrumentor = SlopCodeInstrumentor()
+        # Patch and instrument inside the try block so that a failure during
+        # setup still restores mod.run_agent for the tests that run afterwards.
+        try:
+            mod.run_agent = failing_run_agent
+            instrumentor.instrument(tracer_provider=tracer_provider, skip_dep_check=True)
+            with pytest.raises(RuntimeError, match="Config error"):
+                mod.run_agent()
+
+            spans = span_exporter.get_finished_spans()
+            entry_spans = [
+                s for s in spans
+                if s.attributes.get("gen_ai.span.kind") == "ENTRY"
+            ]
+            assert len(entry_spans) == 1
+            assert entry_spans[0].status.status_code == StatusCode.ERROR
+        finally:
+            instrumentor.uninstrument()
+            mod.run_agent = original
diff --git a/instrumentation-loongsuite/loongsuite-instrumentation-slop-code/tests/test_hierarchy.py b/instrumentation-loongsuite/loongsuite-instrumentation-slop-code/tests/test_hierarchy.py
new file mode 100644
index 000000000..d33cc3568
--- /dev/null
+++ b/instrumentation-loongsuite/loongsuite-instrumentation-slop-code/tests/test_hierarchy.py
@@ -0,0 +1,118 @@
+# Copyright The OpenTelemetry Authors
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Tests for span hierarchy and parent-child relationships."""
+
+from unittest.mock import MagicMock
+
+import pytest
+
+from opentelemetry.trace import StatusCode
+
+
+class TestSpanHierarchy:
+    """Check that spans from nested instrumented calls are linked parent-to-child."""
+
+    def test_entry_is_parent_of_workflow(self, span_exporter, instrument):
+        """ENTRY span should be parent of workflow span when called inline."""
+        import slop_code.entrypoints.commands.run_agent as entry_mod
+        import slop_code.entrypoints.problem_runner.worker as worker_mod
+
+        # Swap the callable behind run_agent so it calls run_agent_on_problem inline
+        original = entry_mod.run_agent.__wrapped__
+
+        def run_with_workflow(*args, **kwargs):
+            config = MagicMock()
+            config.model_def = None
+            config.agent_config = None
+            config.pass_policy = None
+            return worker_mod.run_agent_on_problem(
+                MagicMock(), "test_problem", config, MagicMock(), "/tmp"
+            )
+
+        entry_mod.run_agent.__wrapped__ = run_with_workflow
+
+        try:
+            entry_mod.run_agent()
+
+            spans = span_exporter.get_finished_spans()
+            entry_spans = [
+                s for s in spans
+                if s.attributes.get("gen_ai.span.kind") == "ENTRY"
+            ]
+            workflow_spans = [
+                s for s in spans
+                if s.attributes.get("gen_ai.operation.name") == "workflow"
+            ]
+
+            assert len(entry_spans) == 1
+            assert len(workflow_spans) == 1
+
+            entry_span = entry_spans[0]
+            workflow_span = workflow_spans[0]
+
+            # The workflow span must share the entry trace and name the entry span as its parent
+            assert workflow_span.context.trace_id == entry_span.context.trace_id
+            assert workflow_span.parent is not None
+            assert workflow_span.parent.span_id == entry_span.context.span_id
+        finally:
+            entry_mod.run_agent.__wrapped__ = original
+
+    def test_workflow_is_parent_of_task(self, span_exporter, instrument):
+        """Workflow span should be parent of task span when called inline."""
+        import slop_code.agent_runner.runner as runner_mod
+        import slop_code.entrypoints.problem_runner.worker as worker_mod
+
+        original = worker_mod.run_agent_on_problem.__wrapped__  # restored in the finally block
+
+        def workflow_with_task(*args, **kwargs):
+            r = runner_mod.AgentRunner()
+            checkpoint = MagicMock()
+            checkpoint.name = "cp1"
+            checkpoint.order = 1
+            r._run_checkpoint(checkpoint, "/tmp", True)
+            return {"summary": {"state": "completed", "passed_policy": True}}
+
+        worker_mod.run_agent_on_problem.__wrapped__ = workflow_with_task
+
+        try:
+            config = MagicMock()
+            config.model_def = None
+            config.agent_config = None
+            config.pass_policy = None
+            worker_mod.run_agent_on_problem(
+                MagicMock(), "prob1", config, MagicMock(), "/tmp"
+            )
+
+            spans = span_exporter.get_finished_spans()
+            workflow_spans = [
+                s for s in spans
+                if s.attributes.get("gen_ai.operation.name") == "workflow"
+            ]
+            task_spans = [
+                s for s in spans
+                if s.attributes.get("gen_ai.operation.name") == "run_task"
+            ]
+
+            assert len(workflow_spans) == 1
+            assert len(task_spans) == 1
+
+            workflow_span = workflow_spans[0]
+            task_span = task_spans[0]
+
+            assert task_span.context.trace_id == workflow_span.context.trace_id
+            assert task_span.parent is not None
+            assert task_span.parent.span_id == workflow_span.context.span_id
+        finally:
+            worker_mod.run_agent_on_problem.__wrapped__ = original
diff --git a/instrumentation-loongsuite/loongsuite-instrumentation-slop-code/tests/test_llm_span.py b/instrumentation-loongsuite/loongsuite-instrumentation-slop-code/tests/test_llm_span.py
new file mode 100644
index 000000000..c88e46430
--- /dev/null
+++ b/instrumentation-loongsuite/loongsuite-instrumentation-slop-code/tests/test_llm_span.py
@@ -0,0 +1,142 @@
+# Copyright The OpenTelemetry Authors
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Tests for LLM span (grade_file_async - Rubric Judge)."""
+
+from unittest.mock import MagicMock
+
+import pytest
+
+from opentelemetry.trace import SpanKind, StatusCode
+
+
+@pytest.mark.asyncio
+class TestLLMSpan:
+    """Verify that grade_file_async produces an LLM span."""
+
+    async def test_llm_span_created(self, span_exporter, instrument):
+        """grade_file_async should create an LLM span."""
+        import slop_code.metrics.rubric.router as mod
+
+        provider = MagicMock()
+        provider.value = "openrouter"
+
+        await mod.grade_file_async(
+            "prompt_prefix",
+            "criteria_text",
+            "test.py",
+            "anthropic/claude-3.5-sonnet",
+            provider,
+            0.7,
+        )
+
+        spans = span_exporter.get_finished_spans()
+        llm_spans = [
+            s for s in spans
+            if s.attributes.get("gen_ai.span.kind") == "LLM"
+        ]
+        assert len(llm_spans) == 1
+
+        span = llm_spans[0]
+        assert span.name == "chat anthropic/claude-3.5-sonnet"
+        assert span.attributes["gen_ai.system"] == "openrouter"
+        assert span.attributes["gen_ai.operation.name"] == "chat"
+        assert span.attributes["gen_ai.request.model"] == "anthropic/claude-3.5-sonnet"
+        assert span.attributes["gen_ai.request.temperature"] == 0.7
+        assert span.kind == SpanKind.CLIENT
+        assert span.status.status_code == StatusCode.OK
+
+    async def test_llm_span_captures_usage(self, span_exporter, instrument):
+        """LLM span should capture token usage from response."""
+        import slop_code.metrics.rubric.router as mod
+
+        provider = MagicMock()
+        provider.value = "openrouter"
+
+        await mod.grade_file_async(
+            "prefix", "criteria", "file.py",
+            "anthropic/claude-3.5-sonnet", provider, 0.5,
+        )
+
+        spans = span_exporter.get_finished_spans()
+        llm_spans = [
+            s for s in spans
+            if s.attributes.get("gen_ai.span.kind") == "LLM"
+        ]
+        assert len(llm_spans) == 1
+        span = llm_spans[0]
+
+        assert span.attributes["gen_ai.usage.input_tokens"] == 500
+        assert span.attributes["gen_ai.usage.output_tokens"] == 200
+        assert span.attributes["gen_ai.usage.cache_read.input_tokens"] == 100
+        assert span.attributes["gen_ai.usage.cache_creation.input_tokens"] == 50
+        assert span.attributes["gen_ai.response.id"] == "resp-123"
+
+    async def test_llm_span_error(self, span_exporter, tracer_provider):
+        """Exception in grade_file_async should produce an error LLM span."""
+        import slop_code.metrics.rubric.router as mod
+
+        from opentelemetry.instrumentation.slop_code import SlopCodeInstrumentor
+
+        original = mod.grade_file_async
+
+        async def failing_grade(*args, **kwargs):
+            raise ConnectionError("API unreachable")
+
+        instrumentor = SlopCodeInstrumentor()
+        provider = MagicMock()
+        provider.value = "bedrock"
+
+        # Patch and instrument inside the try block so that a failure during
+        # setup still restores mod.grade_file_async for later tests.
+        try:
+            mod.grade_file_async = failing_grade
+            instrumentor.instrument(tracer_provider=tracer_provider, skip_dep_check=True)
+            with pytest.raises(ConnectionError, match="API unreachable"):
+                await mod.grade_file_async(
+                    "prefix", "criteria", "file.py",
+                    "us.anthropic.claude-3-5-sonnet-20241022-v2:0", provider, 0.3,
+                )
+
+            spans = span_exporter.get_finished_spans()
+            llm_spans = [
+                s for s in spans
+                if s.attributes.get("gen_ai.span.kind") == "LLM"
+            ]
+            assert len(llm_spans) == 1
+            assert llm_spans[0].status.status_code == StatusCode.ERROR
+            assert llm_spans[0].attributes["gen_ai.system"] == "bedrock"
+        finally:
+            instrumentor.uninstrument()
+            mod.grade_file_async = original
+
+    async def test_llm_span_bedrock_provider(self, span_exporter, instrument):
+        """LLM span with bedrock provider should use 'bedrock' as system."""
+        import slop_code.metrics.rubric.router as mod
+
+        provider = MagicMock()
+        provider.value = "bedrock"
+
+        await mod.grade_file_async(
+            "prefix", "criteria", "file.py",
+            "us.anthropic.claude-3-5-sonnet-20241022-v2:0", provider, 0.5,
+        )
+
+        spans = span_exporter.get_finished_spans()
+        llm_spans = [
+            s for s in spans
+            if s.attributes.get("gen_ai.span.kind") == "LLM"
+        ]
+        assert len(llm_spans) == 1
+        assert llm_spans[0].attributes["gen_ai.system"] == "bedrock"
diff --git a/instrumentation-loongsuite/loongsuite-instrumentation-slop-code/tests/test_step_span.py b/instrumentation-loongsuite/loongsuite-instrumentation-slop-code/tests/test_step_span.py
new file mode 100644
index 000000000..70e221da2
--- /dev/null
+++ b/instrumentation-loongsuite/loongsuite-instrumentation-slop-code/tests/test_step_span.py
@@ -0,0 +1,133 @@
+# Copyright The OpenTelemetry Authors
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Tests for STEP span (MiniSWEAgent.agent_step)."""
+
+from unittest.mock import MagicMock
+
+import pytest
+
+from opentelemetry.trace import StatusCode
+
+
+class TestStepSpan:
+    """Verify that MiniSWEAgent.agent_step produces a STEP span."""
+
+    def test_step_span_created(self, span_exporter, instrument):
+        """agent_step should create a STEP span with token attributes."""
+        import slop_code.agent_runner.agents.miniswe as mod
+
+        agent = mod.MiniSWEAgent(problem_name="test_prob")
+        result = agent.agent_step()
+
+        spans = span_exporter.get_finished_spans()
+        step_spans = [
+            s for s in spans
+            if s.attributes.get("gen_ai.span.kind") == "STEP"
+        ]
+        assert len(step_spans) == 1
+
+        span = step_spans[0]
+        assert span.name == "react.step.1"
+        assert span.attributes["gen_ai.system"] == "slop-code"
+        assert span.attributes["gen_ai.operation.name"] == "react"
+        assert span.attributes["gen_ai.react.round"] == 1
+        assert span.status.status_code == StatusCode.OK
+
+    def test_step_span_has_token_usage(self, span_exporter, instrument):
+        """STEP span should capture token usage from result."""
+        import slop_code.agent_runner.agents.miniswe as mod
+
+        agent = mod.MiniSWEAgent(problem_name="test_prob")
+        agent.agent_step()
+
+        spans = span_exporter.get_finished_spans()
+        step_spans = [
+            s for s in spans
+            if s.attributes.get("gen_ai.span.kind") == "STEP"
+        ]
+        assert len(step_spans) == 1
+        span = step_spans[0]
+
+        assert span.attributes["gen_ai.usage.input_tokens"] == 200
+        assert span.attributes["gen_ai.usage.output_tokens"] == 80
+        assert span.attributes["gen_ai.usage.cache_read.input_tokens"] == 50
+        assert span.attributes["gen_ai.usage.cache_creation.input_tokens"] == 10
+
+    def test_step_span_increments_round(self, span_exporter, instrument):
+        """Multiple agent_step calls should increment the round number."""
+        import slop_code.agent_runner.agents.miniswe as mod
+
+        agent = mod.MiniSWEAgent(problem_name="test_prob")
+        # Simulate steps=2 already completed
+        agent.usage.steps = 2
+        agent.agent_step()
+
+        spans = span_exporter.get_finished_spans()
+        step_spans = [
+            s for s in spans
+            if s.attributes.get("gen_ai.span.kind") == "STEP"
+        ]
+        assert len(step_spans) == 1
+        assert step_spans[0].name == "react.step.3"
+        assert step_spans[0].attributes["gen_ai.react.round"] == 3
+
+    def test_step_span_error(self, span_exporter, tracer_provider):
+        """Exception in agent_step should produce an error STEP span."""
+        import slop_code.agent_runner.agents.miniswe as mod
+
+        from opentelemetry.instrumentation.slop_code import SlopCodeInstrumentor
+
+        class FailingMiniSWE(mod.MiniSWEAgent):
+            def agent_step(self):
+                raise RuntimeError("LimitsExceeded")
+
+        OriginalClass = mod.MiniSWEAgent
+        mod.MiniSWEAgent = FailingMiniSWE
+
+        instrumentor = SlopCodeInstrumentor()
+        instrumentor.instrument(tracer_provider=tracer_provider, skip_dep_check=True)
+
+        try:
+            agent = mod.MiniSWEAgent(problem_name="test_prob")
+
+            with pytest.raises(RuntimeError, match="LimitsExceeded"):
+                agent.agent_step()
+
+            spans = span_exporter.get_finished_spans()
+            step_spans = [
+                s for s in spans
+                if s.attributes.get("gen_ai.span.kind") == "STEP"
+            ]
+            assert len(step_spans) == 1
+            span = step_spans[0]
+            assert span.status.status_code == StatusCode.ERROR
+            assert span.attributes["gen_ai.react.finish_reason"] == "error"
+        finally:
+            instrumentor.uninstrument()
+            mod.MiniSWEAgent = OriginalClass
+
+    def test_step_span_finish_reason_stop(self, span_exporter, instrument):
+        """Successful step should have finish_reason='stop'."""
+        import slop_code.agent_runner.agents.miniswe as mod
+
+        agent = mod.MiniSWEAgent(problem_name="test_prob")
+        agent.agent_step()
+
+        spans = span_exporter.get_finished_spans()
+        step_spans = [
+            s for s in spans
+            if s.attributes.get("gen_ai.span.kind") == "STEP"
+        ]
+        assert step_spans[0].attributes["gen_ai.react.finish_reason"] == "stop"
diff --git a/instrumentation-loongsuite/loongsuite-instrumentation-slop-code/tests/test_task_span.py b/instrumentation-loongsuite/loongsuite-instrumentation-slop-code/tests/test_task_span.py
new file mode 100644
index 000000000..de3e16a95
--- /dev/null
+++ b/instrumentation-loongsuite/loongsuite-instrumentation-slop-code/tests/test_task_span.py
@@ -0,0 +1,110 @@
+# Copyright The OpenTelemetry Authors
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Tests for TASK span (AgentRunner._run_checkpoint)."""
+
+from unittest.mock import MagicMock
+
+import pytest
+
+from opentelemetry.trace import StatusCode
+
+
+class TestTaskSpan:
+    """Verify that AgentRunner._run_checkpoint produces a TASK span."""
+
+    def test_task_span_created(self, span_exporter, instrument):
+        """_run_checkpoint should create a task span."""
+        import slop_code.agent_runner.runner as mod
+
+        runner = mod.AgentRunner()
+
+        checkpoint = MagicMock()
+        checkpoint.name = "checkpoint_1"
+        checkpoint.order = 1
+
+        result = runner._run_checkpoint(checkpoint, "/tmp/save", True)
+
+        spans = span_exporter.get_finished_spans()
+        task_spans = [
+            s for s in spans
+            if s.attributes.get("gen_ai.operation.name") == "run_task"
+        ]
+        assert len(task_spans) == 1
+
+        span = task_spans[0]
+        assert span.name == "task.checkpoint_1"
+        assert span.attributes["gen_ai.system"] == "slop-code"
+        assert span.attributes["gen_ai.span.kind"] == "TASK"
+        assert span.attributes["slop_code.checkpoint.name"] == "checkpoint_1"
+        assert span.attributes["slop_code.checkpoint.order"] == 1
+        assert span.attributes["slop_code.is_first_checkpoint"] is True
+        assert span.status.status_code == StatusCode.OK
+
+    def test_task_span_error(self, span_exporter, tracer_provider):
+        """Exception in _run_checkpoint should produce an error task span."""
+        import slop_code.agent_runner.runner as mod
+
+        from opentelemetry.instrumentation.slop_code import SlopCodeInstrumentor
+
+        class FailingRunner(mod.AgentRunner):
+            def _run_checkpoint(self, checkpoint, checkpoint_save_dir, is_first_checkpoint=False):
+                raise RuntimeError("Checkpoint failed")
+
+        # Replace class temporarily
+        OriginalRunner = mod.AgentRunner
+        mod.AgentRunner = FailingRunner
+
+        instrumentor = SlopCodeInstrumentor()
+        instrumentor.instrument(tracer_provider=tracer_provider, skip_dep_check=True)
+
+        try:
+            runner = mod.AgentRunner()
+            checkpoint = MagicMock()
+            checkpoint.name = "bad_checkpoint"
+            checkpoint.order = 2
+
+            with pytest.raises(RuntimeError, match="Checkpoint failed"):
+                runner._run_checkpoint(checkpoint, "/tmp/save", False)
+
+            spans = span_exporter.get_finished_spans()
+            task_spans = [
+                s for s in spans
+                if s.attributes.get("gen_ai.operation.name") == "run_task"
+            ]
+            assert len(task_spans) == 1
+            assert task_spans[0].status.status_code == StatusCode.ERROR
+        finally:
+            instrumentor.uninstrument()
+            mod.AgentRunner = OriginalRunner
+
+    def test_task_span_not_first_checkpoint(self, span_exporter, instrument):
+        """Subsequent checkpoint should have is_first_checkpoint=False."""
+        import slop_code.agent_runner.runner as mod
+
+        runner = mod.AgentRunner()
+
+        checkpoint = MagicMock()
+        checkpoint.name = "checkpoint_2"
+        checkpoint.order = 2
+
+        runner._run_checkpoint(checkpoint, "/tmp/save", False)
+
+        spans = span_exporter.get_finished_spans()
+        task_spans = [
+            s for s in spans
+            if s.attributes.get("gen_ai.operation.name") == "run_task"
+        ]
+        assert len(task_spans) == 1
+        assert task_spans[0].attributes["slop_code.is_first_checkpoint"] is False
diff --git a/instrumentation-loongsuite/loongsuite-instrumentation-slop-code/tests/test_workflow_span.py b/instrumentation-loongsuite/loongsuite-instrumentation-slop-code/tests/test_workflow_span.py
new file mode 100644
index 000000000..6d0a79ddc
--- /dev/null
+++ b/instrumentation-loongsuite/loongsuite-instrumentation-slop-code/tests/test_workflow_span.py
@@ -0,0 +1,117 @@
+# Copyright The OpenTelemetry Authors
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Tests for CHAIN/workflow span (run_agent_on_problem)."""
+
+from unittest.mock import MagicMock
+
+import pytest
+
+from opentelemetry.trace import StatusCode
+
+
+class TestWorkflowSpan:
+    """Verify that run_agent_on_problem produces a workflow span."""
+
+    def test_workflow_span_created(self, span_exporter, instrument):
+        """run_agent_on_problem should create a workflow span."""
+        import slop_code.entrypoints.problem_runner.worker as mod
+
+        config = MagicMock()
+        config.model_def = MagicMock()
+        config.model_def.name = "anthropic/claude-3.5-sonnet"
+        config.agent_config = MagicMock()
+        config.agent_config.type = "claude_code"
+        config.pass_policy = MagicMock()
+        config.pass_policy.value = "any"
+
+        result = mod.run_agent_on_problem(
+            MagicMock(),  # problem_config
+            "file_backup",  # problem_name
+            config,  # config
+            MagicMock(),  # progress_queue
+            "/tmp/output",  # output_path
+        )
+
+        spans = span_exporter.get_finished_spans()
+        workflow_spans = [
+            s for s in spans
+            if s.attributes.get("gen_ai.operation.name") == "workflow"
+        ]
+        assert len(workflow_spans) == 1
+
+        span = workflow_spans[0]
+        assert span.name == "workflow.file_backup"
+        assert span.attributes["gen_ai.system"] == "slop-code"
+        assert span.attributes["gen_ai.span.kind"] == "CHAIN"
+        assert span.attributes["slop_code.problem.name"] == "file_backup"
+        assert span.attributes["gen_ai.request.model"] == "anthropic/claude-3.5-sonnet"
+        assert span.attributes["slop_code.agent.type"] == "claude_code"
+        assert span.status.status_code == StatusCode.OK
+
+    def test_workflow_span_error(self, span_exporter, tracer_provider):
+        """Exception in run_agent_on_problem should produce error workflow span."""
+        import slop_code.entrypoints.problem_runner.worker as mod
+
+        from opentelemetry.instrumentation.slop_code import SlopCodeInstrumentor
+
+        original = mod.run_agent_on_problem
+
+        def failing_worker(*args, **kwargs):
+            raise ValueError("Problem not found")
+
+        mod.run_agent_on_problem = failing_worker
+
+        instrumentor = SlopCodeInstrumentor()
+        instrumentor.instrument(tracer_provider=tracer_provider, skip_dep_check=True)
+
+        try:
+            with pytest.raises(ValueError, match="Problem not found"):
+                mod.run_agent_on_problem(
+                    MagicMock(), "missing_problem", MagicMock(), MagicMock(), "/tmp"
+                )
+
+            spans = span_exporter.get_finished_spans()
+            workflow_spans = [
+                s for s in spans
+                if s.attributes.get("gen_ai.operation.name") == "workflow"
+            ]
+            assert len(workflow_spans) == 1
+            assert workflow_spans[0].status.status_code == StatusCode.ERROR
+        finally:
+            instrumentor.uninstrument()
+            mod.run_agent_on_problem = original
+
+    def test_workflow_span_with_none_config_fields(self, span_exporter, instrument):
+        """Workflow span should handle None config fields gracefully."""
+        import slop_code.entrypoints.problem_runner.worker as mod
+
+        config = MagicMock()
+        config.model_def = None
+        config.agent_config = None
+        config.pass_policy = None
+
+        mod.run_agent_on_problem(
+            MagicMock(), "test_problem", config, MagicMock(), "/tmp"
+        )
+
+        spans = span_exporter.get_finished_spans()
+        workflow_spans = [
+            s for s in spans
+            if s.attributes.get("gen_ai.operation.name") == "workflow"
+        ]
+        assert len(workflow_spans) == 1
+        span = workflow_spans[0]
+        assert span.attributes["slop_code.problem.name"] == "test_problem"
+        assert "gen_ai.request.model" not in span.attributes
diff --git a/instrumentation-loongsuite/loongsuite-instrumentation-terminus2/LICENSE b/instrumentation-loongsuite/loongsuite-instrumentation-terminus2/LICENSE
new file mode 100644
index 000000000..261eeb9e9
--- /dev/null
+++ b/instrumentation-loongsuite/loongsuite-instrumentation-terminus2/LICENSE
@@ -0,0 +1,201 @@
+                                 Apache License
+                           Version 2.0, January 2004
+                        http://www.apache.org/licenses/
+
+   TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
+
+   1. Definitions.
+
+      "License" shall mean the terms and conditions for use, reproduction,
+      and distribution as defined by Sections 1 through 9 of this document.
+
+      "Licensor" shall mean the copyright owner or entity authorized by
+      the copyright owner that is granting the License.
+
+      "Legal Entity" shall mean the union of the acting entity and all
+      other entities that control, are controlled by, or are under common
+      control with that entity. For the purposes of this definition,
+      "control" means (i) the power, direct or indirect, to cause the
+      direction or management of such entity, whether by contract or
+      otherwise, or (ii) ownership of fifty percent (50%) or more of the
+      outstanding shares, or (iii) beneficial ownership of such entity.
+
+      "You" (or "Your") shall mean an individual or Legal Entity
+      exercising permissions granted by this License.
+
+      "Source" form shall mean the preferred form for making modifications,
+      including but not limited to software source code, documentation
+      source, and configuration files.
+
+      "Object" form shall mean any form resulting from mechanical
+      transformation or translation of a Source form, including but
+      not limited to compiled object code, generated documentation,
+      and conversions to other media types.
+
+      "Work" shall mean the work of authorship, whether in Source or
+      Object form, made available under the License, as indicated by a
+      copyright notice that is included in or attached to the work
+      (an example is provided in the Appendix below).
+
+      "Derivative Works" shall mean any work, whether in Source or Object
+      form, that is based on (or derived from) the Work and for which the
+      editorial revisions, annotations, elaborations, or other modifications
+      represent, as a whole, an original work of authorship. For the purposes
+      of this License, Derivative Works shall not include works that remain
+      separable from, or merely link (or bind by name) to the interfaces of,
+      the Work and Derivative Works thereof.
+
+      "Contribution" shall mean any work of authorship, including
+      the original version of the Work and any modifications or additions
+      to that Work or Derivative Works thereof, that is intentionally
+      submitted to Licensor for inclusion in the Work by the copyright owner
+      or by an individual or Legal Entity authorized to submit on behalf of
+      the copyright owner. For the purposes of this definition, "submitted"
+      means any form of electronic, verbal, or written communication sent
+      to the Licensor or its representatives, including but not limited to
+      communication on electronic mailing lists, source code control systems,
+      and issue tracking systems that are managed by, or on behalf of, the
+      Licensor for the purpose of discussing and improving the Work, but
+      excluding communication that is conspicuously marked or otherwise
+      designated in writing by the copyright owner as "Not a Contribution."
+
+      "Contributor" shall mean Licensor and any individual or Legal Entity
+      on behalf of whom a Contribution has been received by Licensor and
+      subsequently incorporated within the Work.
+
+   2. Grant of Copyright License. Subject to the terms and conditions of
+      this License, each Contributor hereby grants to You a perpetual,
+      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+      copyright license to reproduce, prepare Derivative Works of,
+      publicly display, publicly perform, sublicense, and distribute the
+      Work and such Derivative Works in Source or Object form.
+
+   3. Grant of Patent License. Subject to the terms and conditions of
+      this License, each Contributor hereby grants to You a perpetual,
+      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+      (except as stated in this section) patent license to make, have made,
+      use, offer to sell, sell, import, and otherwise transfer the Work,
+      where such license applies only to those patent claims licensable
+      by such Contributor that are necessarily infringed by their
+      Contribution(s) alone or by combination of their Contribution(s)
+      with the Work to which such Contribution(s) was submitted. If You
+      institute patent litigation against any entity (including a
+      cross-claim or counterclaim in a lawsuit) alleging that the Work
+      or a Contribution incorporated within the Work constitutes direct
+      or contributory patent infringement, then any patent licenses
+      granted to You under this License for that Work shall terminate
+      as of the date such litigation is filed.
+
+   4. Redistribution. You may reproduce and distribute copies of the
+      Work or Derivative Works thereof in any medium, with or without
+      modifications, and in Source or Object form, provided that You
+      meet the following conditions:
+
+      (a) You must give any other recipients of the Work or
+          Derivative Works a copy of this License; and
+
+      (b) You must cause any modified files to carry prominent notices
+          stating that You changed the files; and
+
+      (c) You must retain, in the Source form of any Derivative Works
+          that You distribute, all copyright, patent, trademark, and
+          attribution notices from the Source form of the Work,
+          excluding those notices that do not pertain to any part of
+          the Derivative Works; and
+
+      (d) If the Work includes a "NOTICE" text file as part of its
+          distribution, then any Derivative Works that You distribute must
+          include a readable copy of the attribution notices contained
+          within such NOTICE file, excluding those notices that do not
+          pertain to any part of the Derivative Works, in at least one
+          of the following places: within a NOTICE text file distributed
+          as part of the Derivative Works; within the Source form or
+          documentation, if provided along with the Derivative Works; or,
+          within a display generated by the Derivative Works, if and
+          wherever such third-party notices normally appear. The contents
+          of the NOTICE file are for informational purposes only and
+          do not modify the License. You may add Your own attribution
+          notices within Derivative Works that You distribute, alongside
+          or as an addendum to the NOTICE text from the Work, provided
+          that such additional attribution notices cannot be construed
+          as modifying the License.
+
+      You may add Your own copyright statement to Your modifications and
+      may provide additional or different license terms and conditions
+      for use, reproduction, or distribution of Your modifications, or
+      for any such Derivative Works as a whole, provided Your use,
+      reproduction, and distribution of the Work otherwise complies with
+      the conditions stated in this License.
+
+   5. Submission of Contributions. Unless You explicitly state otherwise,
+      any Contribution intentionally submitted for inclusion in the Work
+      by You to the Licensor shall be under the terms and conditions of
+      this License, without any additional terms or conditions.
+      Notwithstanding the above, nothing herein shall supersede or modify
+      the terms of any separate license agreement you may have executed
+      with Licensor regarding such Contributions.
+
+   6. Trademarks. This License does not grant permission to use the trade
+      names, trademarks, service marks, or product names of the Licensor,
+      except as required for reasonable and customary use in describing the
+      origin of the Work and reproducing the content of the NOTICE file.
+
+   7. Disclaimer of Warranty. Unless required by applicable law or
+      agreed to in writing, Licensor provides the Work (and each
+      Contributor provides its Contributions) on an "AS IS" BASIS,
+      WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+      implied, including, without limitation, any warranties or conditions
+      of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
+      PARTICULAR PURPOSE. You are solely responsible for determining the
+      appropriateness of using or redistributing the Work and assume any
+      risks associated with Your exercise of permissions under this License.
+
+   8. Limitation of Liability. In no event and under no legal theory,
+      whether in tort (including negligence), contract, or otherwise,
+      unless required by applicable law (such as deliberate and grossly
+      negligent acts) or agreed to in writing, shall any Contributor be
+      liable to You for damages, including any direct, indirect, special,
+      incidental, or consequential damages of any character arising as a
+      result of this License or out of the use or inability to use the
+      Work (including but not limited to damages for loss of goodwill,
+      work stoppage, computer failure or malfunction, or any and all
+      other commercial damages or losses), even if such Contributor
+      has been advised of the possibility of such damages.
+
+   9. Accepting Warranty or Additional Liability. While redistributing
+      the Work or Derivative Works thereof, You may choose to offer,
+      and charge a fee for, acceptance of support, warranty, indemnity,
+      or other liability obligations and/or rights consistent with this
+      License. However, in accepting such obligations, You may act only
+      on Your own behalf and on Your sole responsibility, not on behalf
+      of any other Contributor, and only if You agree to indemnify,
+      defend, and hold each Contributor harmless for any liability
+      incurred by, or claims asserted against, such Contributor by reason
+      of your accepting any such warranty or additional liability.
+
+   END OF TERMS AND CONDITIONS
+
+   APPENDIX: How to apply the Apache License to your work.
+
+      To apply the Apache License to your work, attach the following
+      boilerplate notice, with the fields enclosed by brackets "[]"
+      replaced with your own identifying information. (Don't include
+      the brackets!)  The text should be enclosed in the appropriate
+      comment syntax for the file format. We also recommend that a
+      file or class name and description of purpose be included on the
+      same "printed page" as the copyright notice for easier
+      identification within third-party archives.
+
+   Copyright [yyyy] [name of copyright owner]
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
diff --git a/instrumentation-loongsuite/loongsuite-instrumentation-terminus2/pyproject.toml b/instrumentation-loongsuite/loongsuite-instrumentation-terminus2/pyproject.toml
new file mode 100644
index 000000000..62aaa6e5a
--- /dev/null
+++ b/instrumentation-loongsuite/loongsuite-instrumentation-terminus2/pyproject.toml
@@ -0,0 +1,52 @@
+[build-system]
+requires = ["hatchling"]
+build-backend = "hatchling.build"
+
+[project]
+name = "loongsuite-instrumentation-terminus2"
+dynamic = ["version"]
+description = "LoongSuite Terminus2 Instrumentation"
+license = "Apache-2.0"
+requires-python = ">=3.8"
+authors = [
+  { name = "OpenTelemetry Authors", email = "cncf-opentelemetry-contributors@lists.cncf.io" },
+]
+classifiers = [
+  "Development Status :: 4 - Beta",
+  "Intended Audience :: Developers",
+  "License :: OSI Approved :: Apache Software License",
+  "Programming Language :: Python",
+  "Programming Language :: Python :: 3",
+  "Programming Language :: Python :: 3.8",
+  "Programming Language :: Python :: 3.9",
+  "Programming Language :: Python :: 3.10",
+  "Programming Language :: Python :: 3.11",
+  "Programming Language :: Python :: 3.12",
+]
+dependencies = [  # NOTE(review): the algotune package's pyproject also lists opentelemetry-api / opentelemetry-instrumentation / opentelemetry-semantic-conventions here — confirm whether this package needs them too
+  "wrapt >= 1.0.0, < 2.0.0",
+]
+
+[project.optional-dependencies]
+instruments = [
+  "terminal-bench >= 0.1.0",
+]
+
+[project.entry-points.opentelemetry_instrumentor]
+terminus2 = "opentelemetry.instrumentation.terminus2:Terminus2Instrumentor"
+
+[project.urls]
+Homepage = "https://github.com/alibaba/loongsuite-python-agent/tree/main/instrumentation-loongsuite/loongsuite-instrumentation-terminus2"
+Repository = "https://github.com/alibaba/loongsuite-python-agent"
+
+[tool.hatch.version]
+path = "src/opentelemetry/instrumentation/terminus2/version.py"
+
+[tool.hatch.build.targets.sdist]
+include = [
+  "/src",
+  "/tests",
+]
+
+[tool.hatch.build.targets.wheel]
+packages = ["src/opentelemetry"]
diff --git a/instrumentation-loongsuite/loongsuite-instrumentation-terminus2/src/opentelemetry/instrumentation/terminus2/__init__.py b/instrumentation-loongsuite/loongsuite-instrumentation-terminus2/src/opentelemetry/instrumentation/terminus2/__init__.py
new file mode 100644
index 000000000..f5d018885
--- /dev/null
+++ b/instrumentation-loongsuite/loongsuite-instrumentation-terminus2/src/opentelemetry/instrumentation/terminus2/__init__.py
@@ -0,0 +1,826 @@
+# Copyright The OpenTelemetry Authors
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""
+OpenTelemetry Terminus2 Instrumentation
+
+Provides automatic instrumentation for the terminus-2 agent from terminal-bench
+via external monkey patching (no upstream changes required).
+
+Span hierarchy & semantic mapping (strictly follows ARMS gen-ai semantic
+conventions, see ``arms_docs/trace/gen-ai.md``):
+
+  enter_ai_application_system        (ENTRY  / enter)
+    └── invoke_agent terminus-2      (AGENT  / invoke_agent)
+          └── react step             (STEP   / react)              ── episode N
+                ├── (LLM span produced by ``opentelemetry-instrumentation-litellm``)
+                ├── run_task parse_response (TASK   / run_task)
+                ├── chain summarize  (CHAIN  / task)               ── on overflow
+                └── execute_tool terminal  (TOOL   / execute_tool)
+
+LLM spans are intentionally **not** produced by this package. The underlying
+``LiteLLM.call`` invokes ``litellm.completion`` which is already traced by
+``opentelemetry-instrumentation-litellm``; emitting another span here would
+duplicate that record.
+
+Patch targets (all monkey-patched via ``wrapt.wrap_function_wrapper``):
+
+  P0  Terminus2.perform_task          → ENTRY span (application entry)
+  P0  Terminus2._run_agent_loop       → AGENT span + episode lifecycle
+  P0  Terminus2._execute_commands     → TOOL span
+  P1  Terminus2._handle_llm_interaction → STEP span (per ReAct iteration)
+  P1  TerminusJSONPlainParser.parse_response /
+      TerminusXMLPlainParser.parse_response → TASK span
+  P2  Terminus2._summarize            → CHAIN span (handoff)
+"""
+
+import contextvars
+import json
+import logging
+from typing import Any, Collection
+
+from opentelemetry import context as context_api
+from opentelemetry import trace as trace_api
+from opentelemetry.instrumentation.instrumentor import BaseInstrumentor
+from opentelemetry.instrumentation.utils import unwrap
+from opentelemetry.trace import SpanKind, Status, StatusCode
+from wrapt import wrap_function_wrapper
+
+from aliyun.semconv.trace_v2 import (
+    CommonAttributes,
+    GenAiOperationName,
+    GenAiSpanKind,
+    GenAiToolType,
+    LLMAttributes,
+    ToolAttributes,
+)
+
+from aliyun.sdk.extension.arms.self_monitor.self_monitor_decorator import hook_advice
+
+from opentelemetry.instrumentation.terminus2.package import _instruments
+
+logger = logging.getLogger(__name__)
+
+# ── Framework / agent identifiers ────────────────────────────────────────────
+_FRAMEWORK = "terminal-bench"
+_AGENT_NAME = "terminus-2"
+_TERMINAL_TOOL_NAME = "terminal"
+_TERMINAL_TOOL_DESCRIPTION = "Send keystrokes to a tmux terminal session"
+
+# Spec-defined tool I/O attribute keys (not yet exposed as constants in
+# aliyun.semconv.trace_v2.ToolAttributes; see gen-ai.md §Tool).
+_GEN_AI_TOOL_CALL_ARGUMENTS = "gen_ai.tool.call.arguments"
+_GEN_AI_TOOL_CALL_RESULT = "gen_ai.tool.call.result"
+
+# Message content attributes. These are not exposed by
+# aliyun.semconv.trace_v2.CommonAttributes in all supported versions.
+_GEN_AI_INPUT_MESSAGES = "gen_ai.input.messages"
+_GEN_AI_OUTPUT_MESSAGES = "gen_ai.output.messages"
+
+# ── Span kind / operation values not present in trace_v2 enums ───────────────
+_SPAN_KIND_ENTRY = "ENTRY"
+_SPAN_KIND_STEP = "STEP"
+_OP_ENTER = "enter"
+_OP_REACT = "react"
+_OP_RUN_TASK = "run_task"
+_OP_TASK = "task"
+
+# ── ReAct extension attributes (Alibaba Cloud extension spec) ────────────────
+_GEN_AI_REACT_ROUND = "gen_ai.react.round"
+_GEN_AI_REACT_FINISH_REASON = "gen_ai.react.finish_reason"
+
+# ── Content capture ─────────────────────────────────────────────────────────
+# Inputs / outputs (instruction text, terminal keystrokes, terminal output,
+# AgentResult summary) are captured **unconditionally and untruncated** —
+# they are the primary observability signal for terminus-2. If full content
+# is undesirable in a given deployment, configure exporter-side filtering or
+# attribute-length limits in the SDK instead.
+
+
+def _commands_to_arguments_json(commands) -> str:
+    """Render ``Command``-like objects as the JSON array recorded under
+    ``gen_ai.tool.call.arguments``; missing fields default to ``""`` and
+    ``None``, and ``str()`` of the list is returned if encoding fails."""
+    payload = [
+        {"keystrokes": getattr(command, "keystrokes", ""),
+         "duration_sec": getattr(command, "duration_sec", None)}
+        for command in commands
+    ]
+    try:
+        return json.dumps(payload, ensure_ascii=False)
+    except Exception:
+        return str(payload)
+
+
+def _text_messages_json(role: str, content: Any) -> str:
+    """Wrap ``content`` as a one-message GenAI list and return compact JSON.
+
+    ``content`` is coerced with ``str()``; if JSON encoding fails, ``str()``
+    of the message list is returned instead."""
+    messages = [{"role": role, "parts": [{"type": "text", "content": str(content)}]}]
+    try:
+        return json.dumps(messages, ensure_ascii=False, separators=(",", ":"))
+    except Exception:
+        return str(messages)
+
+
+def _semconv_value(value: Any) -> Any:
+    """Unwrap enum members to ``.value``; pass anything else through."""
+    return value.value if hasattr(value, "value") else value
+
+# ── ReAct step lifecycle tracked via contextvars ────────────────────────────
+# A STEP span stays open across `_handle_llm_interaction` ⇒ `_execute_commands`
+# so both become its children. `_end_current_step` closes it when the next
+# iteration starts, when `_run_agent_loop` exits, or on uninstrument.
+_current_step_span = contextvars.ContextVar(
+    "terminus2_current_step_span", default=None
+)
+_current_step_token = contextvars.ContextVar(
+    "terminus2_current_step_token", default=None
+)
+_react_round_counter = contextvars.ContextVar(
+    "terminus2_react_round_counter", default=0
+)
+
+
+def _end_current_step(finish_reason: str | None = None) -> None:
+    """End the active STEP span, keeping any finish reason it already has."""
+    span, token = _current_step_span.get(), _current_step_token.get()
+    if span is not None:
+        recorded = getattr(span, "attributes", None) or {}  # absent on no-op spans
+        if finish_reason and _GEN_AI_REACT_FINISH_REASON not in recorded:
+            span.set_attribute(_GEN_AI_REACT_FINISH_REASON, finish_reason)
+        span.end()
+        _current_step_span.set(None)
+    if token is not None:
+        context_api.detach(token)
+        _current_step_token.set(None)
+
+
+def _infer_provider_name(model_name: str) -> str:
+    """Guess ``gen_ai.provider.name`` from a model identifier.
+
+    Case-insensitive substring rules, first match wins; otherwise the text
+    before the first ``/``, or ``"unknown"`` (also for an empty name)."""
+    if not model_name:
+        return "unknown"
+    rules = (
+        (("gpt", "o1-", "o3-", "o4-"), "openai"),
+        (("claude", "anthropic"), "anthropic"),
+        (("gemini",), "google"),
+        (("llama", "meta"), "meta"),
+        (("mistral",), "mistral"),
+        (("qwen",), "alibaba"),
+        (("deepseek",), "deepseek"),
+    )
+    lowered = model_name.lower()
+    for needles, provider in rules:
+        if any(needle in lowered for needle in needles):
+            return provider
+    prefix, slash, _ = model_name.partition("/")
+    return prefix if slash else "unknown"
+
+
+# Sentinel attribute that ``_try_wrap`` sets on every target it wraps and
+# ``_try_unwrap`` clears again. It is stored on the target callable itself
+# (not in module-level state) so that duplicate wraps are detected even if
+# this package is loaded as multiple module instances (e.g. wheel install +
+# ``pip install -e`` source, or under different sys.path roots), or if
+# ``_instrument()`` is invoked twice via auto-loader + manual call.
+_TERMINUS2_MARKER = "_otel_terminus2_wrapped"
+
+
+def _resolve_target(module: str, name: str):
+    """Locate the attribute addressed by ``module`` plus a dotted ``name``.
+
+    Returns ``(parent, attr_name, current_value)``; ``current_value`` is
+    ``None`` when only the final attribute is missing. A missing module or
+    intermediate attribute raises.
+    """
+    from importlib import import_module
+
+    parent = import_module(module)
+    *owner_path, attr = name.split(".")
+    for segment in owner_path:
+        parent = getattr(parent, segment)
+    return parent, attr, getattr(parent, attr, None)
+
+
+def _try_wrap(module: str, name: str, wrapper) -> None:
+    """Install ``wrapper`` on ``module.name`` unless it is already installed.
+
+    A sentinel attribute on the target records a successful wrap, so a
+    second call — from another loaded copy of this package or a repeated
+    ``_instrument()`` — is skipped. Every ``Exception`` is logged, not raised.
+    """
+    try:
+        owner, attr_name, existing = _resolve_target(module, name)
+    except Exception as exc:
+        logger.warning(f"Could not resolve {module}.{name}: {exc}")
+        return
+
+    if existing is None:
+        logger.warning(f"{module}.{name} not found")
+        return
+    if getattr(existing, _TERMINUS2_MARKER, False):
+        logger.debug(
+            f"{module}.{name} already wrapped by terminus2 instrumentation, "
+            "skipping"
+        )
+        return
+
+    try:
+        wrap_function_wrapper(module=module, name=name, wrapper=wrapper)
+    except Exception as exc:
+        logger.warning(f"Could not wrap {module}.{name}: {exc}")
+        return
+
+    # Re-read the attribute to get the installed wrapt proxy and tag it. The
+    # proxy forwards attribute writes to the wrapped callable and reads the
+    # value back through itself, so the ``getattr`` check above sees the
+    # marker whichever layer it is handed next time.
+    installed = getattr(owner, attr_name, None)
+    if installed is None:
+        return
+    try:
+        setattr(installed, _TERMINUS2_MARKER, True)
+    except Exception as exc:
+        logger.debug(f"Could not mark {module}.{name}: {exc}")
+
+
+def _try_unwrap(module: str, name: str) -> None:
+    """Reverse of :func:`_try_wrap`.
+
+    The wrapper is removed first and the marker only afterwards, so a failed
+    unwrap keeps the marker and cannot cause a double wrap later.
+    """
+    try:
+        parent, attr, current = _resolve_target(module, name)
+    except Exception:
+        return
+    if current is None or not getattr(current, _TERMINUS2_MARKER, False):
+        return
+
+    try:
+        unwrap(parent, attr)
+    except Exception as e:
+        logger.debug(f"Could not unwrap {module}.{name}: {e}")
+        return
+    try:
+        delattr(getattr(parent, attr), _TERMINUS2_MARKER)
+    except (AttributeError, TypeError):
+        pass  # marker already gone, or the target rejects attribute deletes
+
+
+# ═══════════════════════════════════════════════════════════════════════════
+# Instrumentor
+# ═══════════════════════════════════════════════════════════════════════════
+
+class Terminus2Instrumentor(BaseInstrumentor):
+    """Instrumentor for the terminus-2 agent from terminal-bench.
+
+    ``_PATCH_TARGETS`` is the single list of patch points: ``_instrument``
+    walks it to install the wrappers and ``_uninstrument`` walks it again,
+    in the same order, to remove them.
+    """
+
+    _AGENT_MODULE = "terminal_bench.agents.terminus_2.terminus_2"
+    _JSON_PARSER_MODULE = (
+        "terminal_bench.agents.terminus_2.terminus_json_plain_parser"
+    )
+    _XML_PARSER_MODULE = (
+        "terminal_bench.agents.terminus_2.terminus_xml_plain_parser"
+    )
+
+    # Each entry: (module, dotted attribute, wrapper factory taking a tracer).
+    # Factories are lambdas so the wrapper classes, defined further down in
+    # this module, are only looked up when ``_instrument`` runs.
+    #
+    # NOTE: LLM spans for ``LiteLLM.call`` are NOT produced here —
+    # ``opentelemetry-instrumentation-litellm`` already traces the
+    # underlying ``litellm.completion`` invocation. Wrapping again would
+    # produce duplicate LLM spans for every model call.
+    _PATCH_TARGETS = (
+        # P0 – ENTRY span (application entry point)
+        (
+            _AGENT_MODULE,
+            "Terminus2.perform_task",
+            lambda tracer: _PerformTaskWrapper(tracer),
+        ),
+        # P0 – AGENT span (agent invocation) + ReAct loop lifecycle
+        (
+            _AGENT_MODULE,
+            "Terminus2._run_agent_loop",
+            lambda tracer: _RunAgentLoopWrapper(tracer),
+        ),
+        # P0 – TOOL span for terminal command batch
+        (
+            _AGENT_MODULE,
+            "Terminus2._execute_commands",
+            lambda tracer: _ExecuteCommandsWrapper(tracer),
+        ),
+        # P1 – STEP span per ReAct iteration
+        (
+            _AGENT_MODULE,
+            "Terminus2._handle_llm_interaction",
+            lambda tracer: _HandleLLMInteractionWrapper(tracer),
+        ),
+        # P1 – TASK span for parser (json + xml)
+        (
+            _JSON_PARSER_MODULE,
+            "TerminusJSONPlainParser.parse_response",
+            lambda tracer: _ParseResponseWrapper(tracer, "json"),
+        ),
+        (
+            _XML_PARSER_MODULE,
+            "TerminusXMLPlainParser.parse_response",
+            lambda tracer: _ParseResponseWrapper(tracer, "xml"),
+        ),
+        # P2 – CHAIN span for context-overflow handoff
+        (
+            _AGENT_MODULE,
+            "Terminus2._summarize",
+            lambda tracer: _SummarizeWrapper(tracer),
+        ),
+    )
+
+    def instrumentation_dependencies(self) -> Collection[str]:
+        """Return the package requirement specs this instrumentor targets."""
+        return _instruments
+
+    def _instrument(self, **kwargs: Any) -> None:
+        """Wrap every patch target with a wrapper bound to one tracer.
+
+        An optional ``tracer_provider`` keyword selects the provider the
+        tracer is obtained from.
+        """
+        tracer = trace_api.get_tracer(
+            __name__, "", tracer_provider=kwargs.get("tracer_provider")
+        )
+        for module, name, make_wrapper in self._PATCH_TARGETS:
+            _try_wrap(module, name, make_wrapper(tracer))
+
+    def _uninstrument(self, **kwargs: Any) -> None:
+        """Unwrap every patch target, then close a STEP span left open.
+
+        Targets that are not currently wrapped are skipped by ``_try_unwrap``.
+        """
+        for module, name, _make_wrapper in self._PATCH_TARGETS:
+            _try_unwrap(module, name)
+        _end_current_step()
+
+
+# ═══════════════════════════════════════════════════════════════════════════
+# P0 — ENTRY span: Terminus2.perform_task
+# ═══════════════════════════════════════════════════════════════════════════
+
+class _PerformTaskWrapper:
+    """Wrap ``Terminus2.perform_task`` to produce the **ENTRY** span.
+
+    Per spec: span name ``enter_ai_application_system``,
+    ``gen_ai.span.kind=ENTRY``, ``gen_ai.operation.name=enter``.
+
+    Records the user instruction as ``gen_ai.input.messages`` and a
+    serialized summary of ``AgentResult`` (failure_mode, token totals,
+    marker count) as ``gen_ai.output.messages`` once the task completes.
+    """
+
+    def __init__(self, tracer):
+        self._tracer = tracer
+
+    @hook_advice(
+        instrumentation_name="terminus2",
+        advice_method="perform_task",
+        throw_exception=True,
+    )
+    def __call__(self, wrapped, instance, args, kwargs):
+        """Run ``perform_task`` inside the ENTRY span and return its result."""
+        model_name = getattr(instance, "_model_name", "unknown")
+        instruction = args[0] if args else kwargs.get("instruction", "")
+
+        with self._tracer.start_as_current_span(
+            "enter_ai_application_system",
+            kind=SpanKind.SERVER,
+        ) as span:
+            span.set_attribute(CommonAttributes.GEN_AI_SPAN_KIND, _SPAN_KIND_ENTRY)
+            span.set_attribute(CommonAttributes.GEN_AI_OPERATION_NAME, _OP_ENTER)
+            span.set_attribute(CommonAttributes.GEN_AI_FRAMEWORK, _FRAMEWORK)
+            span.set_attribute(LLMAttributes.GEN_AI_REQUEST_MODEL, model_name)
+            span.set_attribute(
+                LLMAttributes.GEN_AI_PROVIDER_NAME,
+                _infer_provider_name(model_name),
+            )
+
+            if instruction:
+                span.set_attribute(
+                    _GEN_AI_INPUT_MESSAGES,
+                    _text_messages_json("user", instruction),
+                )
+
+            # No local try/except: ``start_as_current_span`` already records
+            # the exception and sets ERROR status when the body raises, so
+            # recording it here as well would duplicate the exception event.
+            result = wrapped(*args, **kwargs)
+
+            input_tokens = getattr(result, "total_input_tokens", 0) or 0
+            output_tokens = getattr(result, "total_output_tokens", 0) or 0
+            failure_mode = getattr(result, "failure_mode", None)
+            failure_mode_str = str(
+                getattr(failure_mode, "value", failure_mode)
+            ) if failure_mode is not None else "none"
+            markers = getattr(result, "timestamped_markers", None) or []
+
+            output_summary = {
+                "failure_mode": failure_mode_str,
+                "total_input_tokens": input_tokens,
+                "total_output_tokens": output_tokens,
+                "marker_count": len(markers),
+            }
+            try:
+                output_value = json.dumps(output_summary, ensure_ascii=False)
+            except Exception:
+                output_value = str(output_summary)
+
+            span.set_attribute(
+                _GEN_AI_OUTPUT_MESSAGES,
+                _text_messages_json("assistant", output_value),
+            )
+            span.set_attribute("terminus2.failure_mode", failure_mode_str)
+
+            # Reaching this point means ``wrapped`` returned without raising.
+            span.set_status(Status(StatusCode.OK))
+            return result
+
+
+# ═══════════════════════════════════════════════════════════════════════════
+# P0 — AGENT span: Terminus2._run_agent_loop
+# ═══════════════════════════════════════════════════════════════════════════
+
+class _RunAgentLoopWrapper:
+    """Wrap ``Terminus2._run_agent_loop`` to produce the **AGENT** span.
+
+    Per spec: span name ``invoke_agent {agent.name}``,
+    ``gen_ai.span.kind=AGENT``, ``gen_ai.operation.name=invoke_agent``.
+
+    The AGENT span precisely brackets the ReAct loop body — STEP / TOOL /
+    TASK / CHAIN children all hang off it. Token totals are aggregated
+    from the ``Chat`` cumulative counters once the loop returns. Also
+    cleans up any trailing STEP span on loop exit.
+    """
+
+    def __init__(self, tracer):
+        self._tracer = tracer
+
+    @hook_advice(
+        instrumentation_name="terminus2",
+        advice_method="run_agent_loop",
+        throw_exception=True,
+    )
+    def __call__(self, wrapped, instance, args, kwargs):
+        """Run the agent loop inside the AGENT span and return its result."""
+        # Reset per-loop ReAct state
+        _react_round_counter.set(0)
+        _end_current_step()
+
+        model_name = getattr(instance, "_model_name", "unknown")
+        parser_name = getattr(instance, "_parser_name", "unknown")
+
+        # _run_agent_loop signature:
+        #   (initial_prompt, session, chat, logging_dir=None,
+        #    original_instruction="")
+        chat = args[2] if len(args) > 2 else kwargs.get("chat")
+        original_instruction = (
+            args[4] if len(args) > 4 else kwargs.get("original_instruction", "")
+        )
+
+        with self._tracer.start_as_current_span(
+            f"invoke_agent {_AGENT_NAME}",
+            kind=SpanKind.INTERNAL,
+        ) as span:
+            span.set_attribute(
+                CommonAttributes.GEN_AI_SPAN_KIND,
+                _semconv_value(GenAiSpanKind.AGENT),
+            )
+            span.set_attribute(
+                CommonAttributes.GEN_AI_OPERATION_NAME,
+                _semconv_value(GenAiOperationName.INVOKE_AGENT),
+            )
+            span.set_attribute(CommonAttributes.GEN_AI_FRAMEWORK, _FRAMEWORK)
+            span.set_attribute("gen_ai.agent.name", _AGENT_NAME)
+            span.set_attribute(
+                "gen_ai.agent.description",
+                "Terminus-2 terminal-bench agent (ReAct loop over a tmux session)",
+            )
+            span.set_attribute(LLMAttributes.GEN_AI_REQUEST_MODEL, model_name)
+            span.set_attribute(
+                LLMAttributes.GEN_AI_PROVIDER_NAME,
+                _infer_provider_name(model_name),
+            )
+            span.set_attribute("terminus2.parser", parser_name)
+
+            if original_instruction:
+                span.set_attribute(
+                    _GEN_AI_INPUT_MESSAGES,
+                    _text_messages_json("user", original_instruction),
+                )
+
+            # ``start_as_current_span`` records the exception and sets ERROR
+            # status itself, so no ``except`` is needed here. ``finally``
+            # closes the trailing STEP span on every exit path, including
+            # ``BaseException`` such as ``KeyboardInterrupt``.
+            try:
+                result = wrapped(*args, **kwargs)
+            finally:
+                _end_current_step(finish_reason="loop_end")
+
+            # Aggregate token usage from the Chat object — captured here so
+            # the totals reflect the full loop, including the bare
+            # ``chat._model.call`` invoked inside ``_summarize``.
+            # ``Chat.total_*_tokens`` returns cumulative counters that
+            # survive context unwinding.
+            if chat is not None:
+                input_tokens = getattr(chat, "total_input_tokens", 0) or 0
+                output_tokens = getattr(chat, "total_output_tokens", 0) or 0
+                span.set_attribute(
+                    LLMAttributes.GEN_AI_USAGE_INPUT_TOKENS, input_tokens
+                )
+                span.set_attribute(
+                    LLMAttributes.GEN_AI_USAGE_OUTPUT_TOKENS, output_tokens
+                )
+                span.set_attribute(
+                    LLMAttributes.GEN_AI_USAGE_TOTAL_TOKENS,
+                    input_tokens + output_tokens,
+                )
+
+            span.set_attribute(
+                "terminus2.react.rounds", _react_round_counter.get()
+            )
+
+            span.set_status(Status(StatusCode.OK))
+            return result
+
+
+# ═══════════════════════════════════════════════════════════════════════════
+# P0 — TOOL span: Terminus2._execute_commands
+# ═══════════════════════════════════════════════════════════════════════════
+
+class _ExecuteCommandsWrapper:
+    """Wrap ``Terminus2._execute_commands`` to produce a **TOOL** span.
+
+    Per spec: span name ``execute_tool {tool_name}``,
+    ``gen_ai.span.kind=TOOL``, ``gen_ai.operation.name=execute_tool``.
+    """
+
+    def __init__(self, tracer):
+        self._tracer = tracer
+
+    @hook_advice(
+        instrumentation_name="terminus2",
+        advice_method="execute_commands",
+        throw_exception=True,
+    )
+    def __call__(self, wrapped, instance, args, kwargs):
+        """Run the command batch inside a TOOL span and return its result."""
+        commands = args[0] if args else kwargs.get("commands", [])
+
+        with self._tracer.start_as_current_span(
+            f"execute_tool {_TERMINAL_TOOL_NAME}",
+            kind=SpanKind.INTERNAL,
+        ) as span:
+            span.set_attribute(
+                CommonAttributes.GEN_AI_SPAN_KIND,
+                _semconv_value(GenAiSpanKind.TOOL),
+            )
+            span.set_attribute(
+                CommonAttributes.GEN_AI_OPERATION_NAME,
+                _semconv_value(GenAiOperationName.EXECUTE_TOOL),
+            )
+            span.set_attribute(CommonAttributes.GEN_AI_FRAMEWORK, _FRAMEWORK)
+            span.set_attribute(ToolAttributes.GEN_AI_TOOL_NAME, _TERMINAL_TOOL_NAME)
+            span.set_attribute(
+                ToolAttributes.GEN_AI_TOOL_DESCRIPTION, _TERMINAL_TOOL_DESCRIPTION
+            )
+            span.set_attribute(
+                ToolAttributes.GEN_AI_TOOL_TYPE,
+                _semconv_value(GenAiToolType.EXTENSION),
+            )
+            span.set_attribute("terminus2.commands.count", len(commands))
+
+            arguments_json = _commands_to_arguments_json(commands)
+            # Spec attribute (gen-ai.md §Tool)
+            span.set_attribute(_GEN_AI_TOOL_CALL_ARGUMENTS, arguments_json)
+
+            # No local try/except: ``start_as_current_span`` already records
+            # the exception and sets ERROR status when the body raises, so
+            # recording it here as well would duplicate the exception event.
+            result = wrapped(*args, **kwargs)
+
+            # The result is unpacked as ``(timeout_occurred, terminal_output)``.
+            timeout_occurred, terminal_output = result
+            span.set_attribute("terminus2.terminal.timeout", timeout_occurred)
+
+            if terminal_output is not None:
+                output_text = str(terminal_output)
+                # Spec attribute (gen-ai.md §Tool)
+                span.set_attribute(_GEN_AI_TOOL_CALL_RESULT, output_text)
+
+            span.set_status(Status(StatusCode.OK))
+            return result
+
+
+# ═══════════════════════════════════════════════════════════════════════════
+# P1 — STEP span: Terminus2._handle_llm_interaction
+# ═══════════════════════════════════════════════════════════════════════════
+
+class _HandleLLMInteractionWrapper:
+    """Wrap ``Terminus2._handle_llm_interaction`` to produce a **STEP** span.
+
+    One STEP span per ReAct iteration. It stays open after this method
+    returns so the following ``_execute_commands`` call becomes its child,
+    and is closed by the next iteration or by ``_RunAgentLoopWrapper``
+    cleanup. If the wrapped call raises, it is ended immediately.
+    """
+
+    def __init__(self, tracer):
+        self._tracer = tracer
+
+    @hook_advice(
+        instrumentation_name="terminus2",
+        advice_method="handle_llm_interaction",
+        throw_exception=True,
+    )
+    def __call__(self, wrapped, instance, args, kwargs):
+        # Close previous STEP first (if any)
+        _end_current_step(finish_reason="next_round")
+
+        round_num = _react_round_counter.get() + 1
+        _react_round_counter.set(round_num)
+        step_span = self._tracer.start_span(
+            "react step",
+            kind=SpanKind.INTERNAL,
+        )
+        step_span.set_attribute(CommonAttributes.GEN_AI_SPAN_KIND, _SPAN_KIND_STEP)
+        step_span.set_attribute(CommonAttributes.GEN_AI_OPERATION_NAME, _OP_REACT)
+        step_span.set_attribute(CommonAttributes.GEN_AI_FRAMEWORK, _FRAMEWORK)
+        step_span.set_attribute(_GEN_AI_REACT_ROUND, round_num)
+
+        ctx = trace_api.set_span_in_context(step_span)
+        token = context_api.attach(ctx)
+        _current_step_span.set(step_span)
+        _current_step_token.set(token)
+
+        try:
+            result = wrapped(*args, **kwargs)
+        except Exception as e:
+            step_span.set_attribute(_GEN_AI_REACT_FINISH_REASON, "error")
+            step_span.record_exception(e)
+            step_span.set_status(Status(StatusCode.ERROR))
+            # End the failed step now so loop-exit cleanup cannot relabel it.
+            _end_current_step()
+            raise
+
+        _, is_task_complete, feedback = result
+        if is_task_complete:
+            step_span.set_attribute(_GEN_AI_REACT_FINISH_REASON, "complete")
+        elif feedback and "ERROR:" in feedback:
+            step_span.set_attribute(_GEN_AI_REACT_FINISH_REASON, "parse_error")
+
+        # Span stays open: closed by next iteration or _RunAgentLoopWrapper
+        return result
+
+
+# ═══════════════════════════════════════════════════════════════════════════
+# P1 — TASK span: parser.parse_response
+# ═══════════════════════════════════════════════════════════════════════════
+
+class _ParseResponseWrapper:
+    """Wrap ``parser.parse_response`` to produce a **TASK** span.
+
+    Per spec: span name ``run_task {task_name}``,
+    ``gen_ai.span.kind=TASK``, ``gen_ai.operation.name=run_task``.
+    """
+
+    def __init__(self, tracer, parser_type):
+        self._tracer = tracer
+        self._parser_type = parser_type
+
+    @hook_advice(
+        instrumentation_name="terminus2",
+        advice_method="parse_response",
+        throw_exception=True,
+    )
+    def __call__(self, wrapped, instance, args, kwargs):
+        """Run ``parse_response`` inside a TASK span and return its result."""
+        # parse_response signature: (self, response: str)
+        response_text = args[0] if args else kwargs.get("response", "")
+
+        with self._tracer.start_as_current_span(
+            "run_task parse_response",
+            kind=SpanKind.INTERNAL,
+        ) as span:
+            span.set_attribute(
+                CommonAttributes.GEN_AI_SPAN_KIND,
+                _semconv_value(GenAiSpanKind.TASK),
+            )
+            span.set_attribute(CommonAttributes.GEN_AI_OPERATION_NAME, _OP_RUN_TASK)
+            span.set_attribute(CommonAttributes.GEN_AI_FRAMEWORK, _FRAMEWORK)
+            span.set_attribute("terminus2.parser", self._parser_type)
+
+            if response_text is not None:
+                span.set_attribute(
+                    _GEN_AI_INPUT_MESSAGES,
+                    _text_messages_json("assistant", response_text),
+                )
+
+            # No local try/except: ``start_as_current_span`` already records
+            # the exception and sets ERROR status when the body raises, so
+            # recording it here as well would duplicate the exception event.
+            result = wrapped(*args, **kwargs)
+
+            span.set_attribute("terminus2.task_complete", result.is_task_complete)
+            span.set_attribute("terminus2.commands.count", len(result.commands))
+
+            output_summary = {
+                "is_task_complete": result.is_task_complete,
+                "commands": [
+                    {
+                        "keystrokes": getattr(c, "keystrokes", ""),
+                        "duration": getattr(c, "duration", None),
+                    }
+                    for c in result.commands
+                ],
+                "error": result.error or "",
+                "warning": result.warning or "",
+            }
+            try:
+                output_value = json.dumps(output_summary, ensure_ascii=False)
+            except Exception:
+                output_value = str(output_summary)
+            span.set_attribute(
+                _GEN_AI_OUTPUT_MESSAGES,
+                _text_messages_json("assistant", output_value),
+            )
+
+            if result.error:
+                span.set_attribute("terminus2.parse.error", str(result.error))
+
+            if result.warning:
+                span.set_attribute("terminus2.parse.warning", str(result.warning))
+
+            # Parser-reported errors are attributes only; the span stays OK.
+            span.set_status(Status(StatusCode.OK))
+            return result
+
+
+# ═══════════════════════════════════════════════════════════════════════════
+# P2 — CHAIN span: Terminus2._summarize
+# ═══════════════════════════════════════════════════════════════════════════
+
+class _SummarizeWrapper:
+    """Wrap ``Terminus2._summarize`` to produce a **CHAIN** span.
+
+    Per spec: span name ``chain {chain_name}``,
+    ``gen_ai.span.kind=CHAIN``. The summarize handoff itself triggers
+    multiple inner LLM calls so it semantically maps to a Chain.
+    """
+
+    def __init__(self, tracer):
+        self._tracer = tracer
+
+    @hook_advice(
+        instrumentation_name="terminus2",
+        advice_method="summarize",
+        throw_exception=True,
+    )
+    def __call__(self, wrapped, instance, args, kwargs):
+        """Run ``_summarize`` inside a CHAIN span and return its result."""
+        with self._tracer.start_as_current_span(
+            "chain summarize",
+            kind=SpanKind.INTERNAL,
+        ) as span:
+            span.set_attribute(
+                CommonAttributes.GEN_AI_SPAN_KIND,
+                _semconv_value(GenAiSpanKind.CHAIN),
+            )
+            span.set_attribute(CommonAttributes.GEN_AI_OPERATION_NAME, _OP_TASK)
+            span.set_attribute(CommonAttributes.GEN_AI_FRAMEWORK, _FRAMEWORK)
+
+            # No local try/except: ``start_as_current_span`` already records
+            # the exception and sets ERROR status when the body raises, so
+            # recording it here as well would duplicate the exception event
+            # on this span.
+            result = wrapped(*args, **kwargs)
+
+            span.set_status(Status(StatusCode.OK))
+            return result
diff --git a/instrumentation-loongsuite/loongsuite-instrumentation-terminus2/src/opentelemetry/instrumentation/terminus2/package.py b/instrumentation-loongsuite/loongsuite-instrumentation-terminus2/src/opentelemetry/instrumentation/terminus2/package.py
new file mode 100644
index 000000000..d92c81333
--- /dev/null
+++ b/instrumentation-loongsuite/loongsuite-instrumentation-terminus2/src/opentelemetry/instrumentation/terminus2/package.py
@@ -0,0 +1,15 @@
+# Copyright The OpenTelemetry Authors
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+_instruments = ("terminal-bench >= 0.1.0",)  # NOTE(review): presumably what the instrumentor's instrumentation_dependencies() returns — confirm
diff --git a/instrumentation-loongsuite/loongsuite-instrumentation-terminus2/src/opentelemetry/instrumentation/terminus2/version.py b/instrumentation-loongsuite/loongsuite-instrumentation-terminus2/src/opentelemetry/instrumentation/terminus2/version.py
new file mode 100644
index 000000000..5fd301e2e
--- /dev/null
+++ b/instrumentation-loongsuite/loongsuite-instrumentation-terminus2/src/opentelemetry/instrumentation/terminus2/version.py
@@ -0,0 +1,15 @@
+# Copyright The OpenTelemetry Authors
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+__version__ = "0.1.0"  # NOTE(review): presumably the hatch dynamic-version source for this package — confirm in its pyproject.toml
diff --git a/instrumentation-loongsuite/loongsuite-instrumentation-terminus2/test-requirements.txt b/instrumentation-loongsuite/loongsuite-instrumentation-terminus2/test-requirements.txt
new file mode 100644
index 000000000..f98537dd8
--- /dev/null
+++ b/instrumentation-loongsuite/loongsuite-instrumentation-terminus2/test-requirements.txt
@@ -0,0 +1,4 @@
+terminal-bench>=0.1.0
+-e aliyun-semantic-conventions
+-e util/opentelemetry-util-http
+-e instrumentation-loongsuite/loongsuite-instrumentation-terminus2
diff --git a/instrumentation-loongsuite/loongsuite-instrumentation-terminus2/tests/__init__.py b/instrumentation-loongsuite/loongsuite-instrumentation-terminus2/tests/__init__.py
new file mode 100644
index 000000000..e69de29bb
diff --git a/instrumentation-loongsuite/loongsuite-instrumentation-vita/README.md b/instrumentation-loongsuite/loongsuite-instrumentation-vita/README.md
new file mode 100644
index 000000000..a91e8d879
--- /dev/null
+++ b/instrumentation-loongsuite/loongsuite-instrumentation-vita/README.md
@@ -0,0 +1,47 @@
+# LoongSuite VitaBench Instrumentation
+
+OpenTelemetry instrumentation for the VitaBench multi-domain simulation framework.
+
+## Installation
+
+```bash
+pip install loongsuite-instrumentation-vita
+```
+
+## Usage
+
+```python
+from opentelemetry.instrumentation.vita import VitaInstrumentor
+
+VitaInstrumentor().instrument()
+```
+
+For GenAI semantic conventions and span-only message content capture, set:
+
+```bash
+export OTEL_SEMCONV_STABILITY_OPT_IN=gen_ai_latest_experimental
+export OTEL_INSTRUMENTATION_GENAI_CAPTURE_MESSAGE_CONTENT=SPAN_ONLY
+```
+
+## VitaBench With DashScope
+
+VitaBench posts directly to the `base_url` configured in `models.yaml`, so the
+DashScope OpenAI-compatible endpoint must include `/chat/completions`. The API
+key must be supplied in the `Authorization` header.
+
+```yaml
+default:
+  base_url: https://dashscope.aliyuncs.com/compatible-mode/v1/chat/completions
+  temperature: 0.0
+  max_input_tokens: 8192
+  headers:
+    Content-Type: "application/json"
+    Authorization: "Bearer ${OPENAI_API_KEY}"
+models:
+  - name: qwen3.6-plus
+    max_tokens: 1024
+    max_input_tokens: 8192
+```
+
+See `examples/vitabench-dashscope` for a runnable setup used by the Kubernetes
+benchmark deployment.
diff --git a/instrumentation-loongsuite/loongsuite-instrumentation-vita/examples/__init__.py b/instrumentation-loongsuite/loongsuite-instrumentation-vita/examples/__init__.py
new file mode 100644
index 000000000..e69de29bb
diff --git a/instrumentation-loongsuite/loongsuite-instrumentation-vita/examples/vitabench-dashscope/README.md b/instrumentation-loongsuite/loongsuite-instrumentation-vita/examples/vitabench-dashscope/README.md
new file mode 100644
index 000000000..7d63531c3
--- /dev/null
+++ b/instrumentation-loongsuite/loongsuite-instrumentation-vita/examples/vitabench-dashscope/README.md
@@ -0,0 +1,23 @@
+# VitaBench DashScope Example
+
+This example runs a single VitaBench delivery task with LoongSuite
+instrumentation and DashScope's OpenAI-compatible chat completions endpoint.
+
+Required environment variables:
+
+```bash
+export OPENAI_API_KEY=<your DashScope API key>
+export OTEL_SEMCONV_STABILITY_OPT_IN=gen_ai_latest_experimental
+export OTEL_INSTRUMENTATION_GENAI_CAPTURE_MESSAGE_CONTENT=SPAN_ONLY
+```
+
+Then run:
+
+```bash
+./setup.sh
+./cmd.sh
+```
+
+`setup.sh` writes `models.yaml` with the full `/chat/completions` endpoint and
+expands `OPENAI_API_KEY` into its `Authorization` header when the script runs,
+so the generated file contains the real key. Do not commit a rendered `models.yaml`.
diff --git a/instrumentation-loongsuite/loongsuite-instrumentation-vita/examples/vitabench-dashscope/cmd.sh b/instrumentation-loongsuite/loongsuite-instrumentation-vita/examples/vitabench-dashscope/cmd.sh
new file mode 100755
index 000000000..813abb713
--- /dev/null
+++ b/instrumentation-loongsuite/loongsuite-instrumentation-vita/examples/vitabench-dashscope/cmd.sh
@@ -0,0 +1,24 @@
+#!/usr/bin/env bash
+# Run one VitaBench delivery task with LoongSuite instrumentation.
+set -euo pipefail
+
+export OTEL_SEMCONV_STABILITY_OPT_IN="${OTEL_SEMCONV_STABILITY_OPT_IN:-gen_ai_latest_experimental}"
+export OTEL_INSTRUMENTATION_GENAI_CAPTURE_MESSAGE_CONTENT="${OTEL_INSTRUMENTATION_GENAI_CAPTURE_MESSAGE_CONTENT:-SPAN_ONLY}"
+
+VITA_ROOT=/work/upstream/vitabench
+if [ ! -d "$VITA_ROOT" ]; then
+  echo "[vita-cmd] vitabench not found, run setup.sh first" >&2
+  exit 1
+fi
+
+cd "$VITA_ROOT"
+export VITA_MODEL_CONFIG_PATH=/work/upstream/vitabench/models.yaml
+
+echo "[vita-cmd] invoking vita run --domain delivery --num-tasks 1"
+loongsuite-instrument vita run \
+  --domain delivery \
+  --user-llm qwen3.6-plus \
+  --agent-llm qwen3.6-plus \
+  --evaluator-llm qwen3.6-plus \
+  --num-tasks 1 \
+  --num-trials 1
diff --git a/instrumentation-loongsuite/loongsuite-instrumentation-vita/examples/vitabench-dashscope/setup.sh b/instrumentation-loongsuite/loongsuite-instrumentation-vita/examples/vitabench-dashscope/setup.sh
new file mode 100755
index 000000000..669ef7602
--- /dev/null
+++ b/instrumentation-loongsuite/loongsuite-instrumentation-vita/examples/vitabench-dashscope/setup.sh
@@ -0,0 +1,35 @@
+#!/usr/bin/env bash
+# Prepare VitaBench and write a DashScope-backed model config.
+set -euo pipefail
+
+: "${OPENAI_API_KEY:?OPENAI_API_KEY is required}"
+mkdir -p /work/upstream
+cd /work/upstream
+
+if [ ! -d vitabench ]; then
+  echo "[vita-setup] cloning vitabench"
+  git clone --depth=1 https://github.com/meituan-longcat/vitabench.git
+fi
+
+cd vitabench
+pip install --quiet --no-deps -e . || pip install --no-deps -e .
+pip install --quiet "openai>=1.0" "pydantic>=2" pyyaml "loguru" "anthropic" \
+  "litellm" "tenacity" "tiktoken" pandas toml addict deepdiff thefuzz \
+  json_repair holidays || echo "[vita-setup] WARNING: dependency install failed; continuing" >&2
+
+umask 077  # models.yaml embeds the API key: create it readable by the owner only
+cat > /work/upstream/vitabench/models.yaml <<YAML
+default:
+  base_url: https://dashscope.aliyuncs.com/compatible-mode/v1/chat/completions
+  temperature: 0.0
+  max_input_tokens: 8192
+  headers:
+    Content-Type: "application/json"
+    Authorization: "Bearer ${OPENAI_API_KEY}"
+models:
+  - name: qwen3.6-plus
+    max_tokens: 1024
+    max_input_tokens: 8192
+YAML
+chmod 600 /work/upstream/vitabench/models.yaml  # also tightens a file left by an earlier run
+echo "[vita-setup] done. config at /work/upstream/vitabench/models.yaml"
diff --git a/instrumentation-loongsuite/loongsuite-instrumentation-vita/pyproject.toml b/instrumentation-loongsuite/loongsuite-instrumentation-vita/pyproject.toml
new file mode 100644
index 000000000..d1df8fa2e
--- /dev/null
+++ b/instrumentation-loongsuite/loongsuite-instrumentation-vita/pyproject.toml
@@ -0,0 +1,55 @@
+[build-system]
+requires = ["hatchling"]
+build-backend = "hatchling.build"
+
+[project]
+name = "loongsuite-instrumentation-vita"
+dynamic = ["version"]
+description = "LoongSuite VitaBench instrumentation"
+readme = "README.md"
+license = "Apache-2.0"
+requires-python = ">=3.10,<4"
+authors = [
+  { name = "Zhiyong Liu", email = "liuzhiyong.lzy@alibaba-inc.com" },
+  { name = "OpenTelemetry Authors", email = "cncf-opentelemetry-contributors@lists.cncf.io" },
+]
+classifiers = [
+  "Development Status :: 4 - Beta",
+  "Intended Audience :: Developers",
+  "License :: OSI Approved :: Apache Software License",
+  "Programming Language :: Python",
+  "Programming Language :: Python :: 3.10",
+  "Programming Language :: Python :: 3.11",
+  "Programming Language :: Python :: 3.12",
+]
+dependencies = [
+  "opentelemetry-api >= 1.37.0",
+  "opentelemetry-instrumentation >= 0.58b0",
+  "opentelemetry-semantic-conventions >= 0.58b0",
+  "wrapt >= 1.0.0, < 2.0.0",
+  "opentelemetry-util-genai >= 0.3b0.dev0",
+]
+
+[project.optional-dependencies]
+instruments = [
+  "vita >= 0.0.1",
+]
+
+[project.entry-points.opentelemetry_instrumentor]
+vita = "opentelemetry.instrumentation.vita:VitaInstrumentor"
+
+[project.urls]
+Homepage = "https://github.com/alibaba/loongsuite-python-agent/tree/main/instrumentation-loongsuite/loongsuite-instrumentation-vita"
+Repository = "https://github.com/alibaba/loongsuite-python-agent"
+
+[tool.hatch.version]
+path = "src/opentelemetry/instrumentation/vita/version.py"
+
+[tool.hatch.build.targets.sdist]
+include = [
+  "/src",
+  "/tests",
+]
+
+[tool.hatch.build.targets.wheel]
+packages = ["src/opentelemetry"]
diff --git a/instrumentation-loongsuite/loongsuite-instrumentation-vita/src/opentelemetry/instrumentation/vita/__init__.py b/instrumentation-loongsuite/loongsuite-instrumentation-vita/src/opentelemetry/instrumentation/vita/__init__.py
new file mode 100644
index 000000000..1e58668a6
--- /dev/null
+++ b/instrumentation-loongsuite/loongsuite-instrumentation-vita/src/opentelemetry/instrumentation/vita/__init__.py
@@ -0,0 +1,223 @@
+# Copyright The OpenTelemetry Authors
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""
+OpenTelemetry VitaBench Instrumentation
+
+Usage
+-----
+.. code:: python
+
+    from opentelemetry.instrumentation.vita import VitaInstrumentor
+
+    VitaInstrumentor().instrument()
+
+    # ... run vitabench tasks ...
+
+    VitaInstrumentor().uninstrument()
+
+API
+---
+"""
+
+from __future__ import annotations
+
+import logging
+from typing import Any, Collection
+
+from wrapt import wrap_function_wrapper
+
+from opentelemetry.instrumentation.instrumentor import BaseInstrumentor
+from opentelemetry.instrumentation.utils import unwrap
+from opentelemetry.instrumentation.vita.package import _instruments
+from opentelemetry.instrumentation.vita.patch import (
+    wrap_generate,
+    wrap_generate_next_message,
+    wrap_get_response,
+    wrap_orchestrator_run,
+    wrap_orchestrator_step,
+    wrap_run_task,
+)
+from opentelemetry.instrumentation.vita.version import __version__
+from opentelemetry.util.genai.extended_handler import ExtendedTelemetryHandler
+
+logger = logging.getLogger(__name__)
+
+__all__ = ["VitaInstrumentor", "__version__"]
+
+
+class VitaInstrumentor(BaseInstrumentor):
+    """OpenTelemetry instrumentor for VitaBench framework.
+
+    Instruments the following components:
+    - vita.run.run_task(): Entry spans (ENTRY)
+    - Orchestrator.run(): Workflow spans (CHAIN)
+    - Orchestrator.step(): ReAct step spans (STEP)
+    - LLMAgent.generate_next_message(): Agent spans (AGENT)
+    - generate(): LLM call spans (LLM)
+    - Environment.get_response(): Tool execution spans (TOOL)
+    """
+
+    def __init__(self):
+        super().__init__()
+        self._handler = None
+
+    def instrumentation_dependencies(self) -> Collection[str]:
+        return _instruments
+
+    def _instrument(self, **kwargs: Any) -> None:
+        """Enable VitaBench instrumentation."""
+        tracer_provider = kwargs.get("tracer_provider")
+        meter_provider = kwargs.get("meter_provider")
+        logger_provider = kwargs.get("logger_provider")
+
+        self._handler = ExtendedTelemetryHandler(
+            tracer_provider=tracer_provider,
+            meter_provider=meter_provider,
+            logger_provider=logger_provider,
+        )
+
+        # Hook #5: generate -> LLM. Wrap this first so modules that import
+        # generate directly (for example vita.agent.llm_agent) bind to the
+        # instrumented function during their import.
+        try:
+            wrap_function_wrapper(
+                module="vita.utils.llm_utils",
+                name="generate",
+                wrapper=lambda w, i, a, k: wrap_generate(
+                    w, i, a, k, handler=self._handler
+                ),
+            )
+            logger.debug("Instrumented vita.utils.llm_utils.generate")
+        except Exception as e:
+            logger.warning(f"Could not wrap vita.utils.llm_utils.generate: {e}")
+
+        # Hook #1: run_task -> ENTRY
+        try:
+            wrap_function_wrapper(
+                module="vita.run",
+                name="run_task",
+                wrapper=lambda w, i, a, k: wrap_run_task(
+                    w, i, a, k, handler=self._handler
+                ),
+            )
+            logger.debug("Instrumented vita.run.run_task")
+        except Exception as e:
+            logger.warning(f"Could not wrap vita.run.run_task: {e}")
+
+        # Hook #2: Orchestrator.run -> CHAIN
+        try:
+            wrap_function_wrapper(
+                module="vita.orchestrator.orchestrator",
+                name="Orchestrator.run",
+                wrapper=lambda w, i, a, k: wrap_orchestrator_run(
+                    w, i, a, k, handler=self._handler
+                ),
+            )
+            logger.debug("Instrumented Orchestrator.run")
+        except Exception as e:
+            logger.warning(f"Could not wrap Orchestrator.run: {e}")
+
+        # Hook #3: Orchestrator.step -> STEP
+        try:
+            wrap_function_wrapper(
+                module="vita.orchestrator.orchestrator",
+                name="Orchestrator.step",
+                wrapper=lambda w, i, a, k: wrap_orchestrator_step(
+                    w, i, a, k, handler=self._handler
+                ),
+            )
+            logger.debug("Instrumented Orchestrator.step")
+        except Exception as e:
+            logger.warning(f"Could not wrap Orchestrator.step: {e}")
+
+        # Hook #4a: LLMAgent.generate_next_message -> AGENT
+        try:
+            wrap_function_wrapper(
+                module="vita.agent.llm_agent",
+                name="LLMAgent.generate_next_message",
+                wrapper=lambda w, i, a, k: wrap_generate_next_message(
+                    w, i, a, k, handler=self._handler
+                ),
+            )
+            logger.debug("Instrumented LLMAgent.generate_next_message")
+        except Exception as e:
+            logger.warning(f"Could not wrap LLMAgent.generate_next_message: {e}")
+
+        # Hook #4b: LLMSoloAgent.generate_next_message -> AGENT
+        try:
+            wrap_function_wrapper(
+                module="vita.agent.llm_agent",
+                name="LLMSoloAgent.generate_next_message",
+                wrapper=lambda w, i, a, k: wrap_generate_next_message(
+                    w, i, a, k, handler=self._handler
+                ),
+            )
+            logger.debug("Instrumented LLMSoloAgent.generate_next_message")
+        except Exception as e:
+            logger.warning(f"Could not wrap LLMSoloAgent.generate_next_message: {e}")
+
+        # Hook #6: Environment.get_response -> TOOL
+        try:
+            wrap_function_wrapper(
+                module="vita.environment.environment",
+                name="Environment.get_response",
+                wrapper=lambda w, i, a, k: wrap_get_response(
+                    w, i, a, k, handler=self._handler
+                ),
+            )
+            logger.debug("Instrumented Environment.get_response")
+        except Exception as e:
+            logger.warning(f"Could not wrap Environment.get_response: {e}")
+
+    def _uninstrument(self, **kwargs: Any) -> None:
+        """Disable VitaBench instrumentation."""
+        try:
+            import vita.run  # noqa: PLC0415
+
+            unwrap(vita.run, "run_task")
+        except Exception as e:
+            logger.debug(f"Failed to uninstrument vita.run.run_task: {e}")
+
+        try:
+            import vita.orchestrator.orchestrator  # noqa: PLC0415
+
+            unwrap(vita.orchestrator.orchestrator.Orchestrator, "run")
+            unwrap(vita.orchestrator.orchestrator.Orchestrator, "step")
+        except Exception as e:
+            logger.debug(f"Failed to uninstrument Orchestrator: {e}")
+
+        try:
+            import vita.agent.llm_agent  # noqa: PLC0415
+
+            unwrap(vita.agent.llm_agent.LLMAgent, "generate_next_message")
+            unwrap(vita.agent.llm_agent.LLMSoloAgent, "generate_next_message")
+        except Exception as e:
+            logger.debug(f"Failed to uninstrument LLMAgent: {e}")
+
+        try:
+            import vita.utils.llm_utils  # noqa: PLC0415
+
+            unwrap(vita.utils.llm_utils, "generate")
+        except Exception as e:
+            logger.debug(f"Failed to uninstrument generate: {e}")
+
+        try:
+            import vita.environment.environment  # noqa: PLC0415
+
+            unwrap(vita.environment.environment.Environment, "get_response")
+        except Exception as e:
+            logger.debug(f"Failed to uninstrument Environment: {e}")
+
+        self._handler = None
diff --git a/instrumentation-loongsuite/loongsuite-instrumentation-vita/src/opentelemetry/instrumentation/vita/package.py b/instrumentation-loongsuite/loongsuite-instrumentation-vita/src/opentelemetry/instrumentation/vita/package.py
new file mode 100644
index 000000000..a776722c9
--- /dev/null
+++ b/instrumentation-loongsuite/loongsuite-instrumentation-vita/src/opentelemetry/instrumentation/vita/package.py
@@ -0,0 +1,3 @@
+_instruments = ("vita >= 0.0.1",)  # returned by VitaInstrumentor.instrumentation_dependencies()
+
+_supports_metrics = False  # NOTE(review): not read by any code visible here — confirm what consumes this flag
diff --git a/instrumentation-loongsuite/loongsuite-instrumentation-vita/src/opentelemetry/instrumentation/vita/patch.py b/instrumentation-loongsuite/loongsuite-instrumentation-vita/src/opentelemetry/instrumentation/vita/patch.py
new file mode 100644
index 000000000..182da38d6
--- /dev/null
+++ b/instrumentation-loongsuite/loongsuite-instrumentation-vita/src/opentelemetry/instrumentation/vita/patch.py
@@ -0,0 +1,432 @@
+# Copyright The OpenTelemetry Authors
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Patch functions for VitaBench instrumentation.
+
+Wraps key vitabench methods to generate OpenTelemetry spans:
+- run_task() -> ENTRY spans
+- Orchestrator.run() -> CHAIN spans
+- Orchestrator.step() -> STEP spans (react)
+- LLMAgent.generate_next_message() -> AGENT spans
+- generate() -> LLM spans
+- Environment.get_response() -> TOOL spans
+"""
+
+from __future__ import annotations
+
+import json
+import logging
+import uuid
+from contextvars import ContextVar
+from typing import Any, Optional
+
+from opentelemetry import trace as trace_api
+from opentelemetry.trace import SpanKind, Status, StatusCode
+from opentelemetry.util.genai.extended_handler import ExtendedTelemetryHandler
+from opentelemetry.util.genai.extended_semconv import gen_ai_extended_attributes
+from opentelemetry.util.genai.extended_types import (
+    EntryInvocation,
+    ExecuteToolInvocation,
+    InvokeAgentInvocation,
+    ReactStepInvocation,
+)
+from opentelemetry.util.genai.types import (
+    Error,
+    InputMessage,
+    LLMInvocation,
+    OutputMessage,
+    Text,
+)
+
+from .utils import (
+    _convert_vita_assistant_to_output,
+    _convert_vita_messages_to_input,
+    _get_tool_definitions,
+    _infer_provider,
+    _MAX_CONTENT_LEN,
+)
+
+logger = logging.getLogger(__name__)
+
+# ReAct step tracking: the currently open step (if any) and how many agent turns have started.
+_react_step_invocation: ContextVar[Optional[ReactStepInvocation]] = ContextVar(
+    "vita_react_step_invocation", default=None
+)  # set by wrap_orchestrator_step; cleared by _close_active_react_step or when a step fails
+_react_step_counter: ContextVar[int] = ContextVar(
+    "vita_react_step_counter", default=0
+)  # reset to 0 by wrap_orchestrator_run; incremented on each agent turn
+
+# Reentrancy guard for AGENT span (LLMSoloAgent extends LLMAgent)
+_in_agent_invoke: ContextVar[bool] = ContextVar(
+    "vita_in_agent_invoke", default=False
+)  # True for the duration of an outer wrap_generate_next_message call in this context
+
+
+def _close_active_react_step(handler: ExtendedTelemetryHandler) -> None:
+    """Close the currently active react_step span, if any."""
+    prev = _react_step_invocation.get()
+    if prev is not None:
+        try:
+            handler.stop_react_step(prev)
+        except Exception as e:
+            logger.debug(f"Failed to close react step: {e}")
+        _react_step_invocation.set(None)
+
+
+# ==================== Hook #1: run_task -> ENTRY ====================
+
+
+def wrap_run_task(
+    wrapped, instance, args, kwargs, handler: ExtendedTelemetryHandler
+):
+    """Wrapper for vita.run.run_task to create ENTRY span."""
+    task = args[1] if len(args) > 1 else kwargs.get("task")
+    domain = args[0] if args else kwargs.get("domain")
+
+    invocation = EntryInvocation(
+        session_id=str(uuid.uuid4()),
+        user_id=None,
+    )
+    invocation.attributes["gen_ai.framework"] = "vitabench"
+
+    if task and hasattr(task, "instructions") and task.instructions:
+        invocation.input_messages = [
+            InputMessage(role="user", parts=[Text(content=str(task.instructions)[:_MAX_CONTENT_LEN])])
+        ]
+
+    handler.start_entry(invocation)
+    try:
+        result = wrapped(*args, **kwargs)
+
+        if result:
+            output_parts = []
+            if hasattr(result, "termination_reason") and result.termination_reason:
+                output_parts.append(Text(content=f"termination: {result.termination_reason}"))
+            if hasattr(result, "reward_info") and result.reward_info:
+                reward = getattr(result.reward_info, "reward", None)
+                if reward is not None:
+                    output_parts.append(Text(content=f"reward: {reward}"))
+            if output_parts:
+                invocation.output_messages = [
+                    OutputMessage(
+                        role="assistant",
+                        parts=output_parts,
+                        finish_reason="stop",
+                    )
+                ]
+
+        handler.stop_entry(invocation)
+        return result
+    except Exception as e:
+        handler.fail_entry(invocation, Error(message=str(e), type=type(e)))
+        raise
+
+
+# ==================== Hook #2: Orchestrator.run -> CHAIN ====================
+
+
+def wrap_orchestrator_run(
+    wrapped, instance, args, kwargs, handler: ExtendedTelemetryHandler
+):
+    """Wrapper for Orchestrator.run to create CHAIN span."""
+    task = getattr(instance, "task", None)
+    domain = getattr(instance, "domain", "unknown")
+    span_name = f"workflow {domain}"
+
+    input_text = ""
+    if task and hasattr(task, "instructions") and task.instructions:
+        input_text = str(task.instructions)[:_MAX_CONTENT_LEN]
+
+    tracer = handler._tracer
+
+    # Reset step counter for this orchestrator run
+    counter_token = _react_step_counter.set(0)
+    step_token = _react_step_invocation.set(None)
+
+    with tracer.start_as_current_span(
+        name=span_name,
+        kind=SpanKind.INTERNAL,
+        attributes={
+            "gen_ai.operation.name": "workflow",
+            "gen_ai.system": "vitabench",
+            gen_ai_extended_attributes.GEN_AI_SPAN_KIND: "CHAIN",
+            "gen_ai.framework": "vitabench",
+        },
+    ) as span:
+        if input_text:
+            span.set_attribute("input.value", input_text)
+
+        try:
+            result = wrapped(*args, **kwargs)
+
+            # Close any remaining open step span
+            _close_active_react_step(handler)
+
+            if result and hasattr(result, "termination_reason") and result.termination_reason:
+                span.set_attribute("output.value", str(result.termination_reason))
+
+            span.set_status(Status(StatusCode.OK))
+            return result
+        except Exception as e:
+            # Close any remaining open step span
+            _close_active_react_step(handler)
+            span.record_exception(e)
+            span.set_status(Status(StatusCode.ERROR))
+            raise
+        finally:
+            _react_step_counter.reset(counter_token)
+            _react_step_invocation.reset(step_token)
+
+
+# ==================== Hook #3: Orchestrator.step -> STEP ====================
+
+
+def wrap_orchestrator_step(
+    wrapped, instance, args, kwargs, handler: ExtendedTelemetryHandler
+):
+    """Wrapper for Orchestrator.step to create STEP span on AGENT turns."""
+    to_role = getattr(instance, "to_role", None)
+
+    # Import Role enum dynamically to avoid import-time dependency
+    _Role = None
+    try:
+        from vita.orchestrator.orchestrator import Role
+        _Role = Role
+    except ImportError:
+        pass
+
+    is_agent_turn = False
+    if _Role is not None:
+        is_agent_turn = (to_role == _Role.AGENT)
+    else:
+        is_agent_turn = (str(to_role) == "Role.AGENT" or str(to_role) == "agent")
+
+    if is_agent_turn:
+        # Close previous STEP span (deferred close strategy)
+        _close_active_react_step(handler)
+
+        step_num = _react_step_counter.get() + 1
+        _react_step_counter.set(step_num)
+
+        step_inv = ReactStepInvocation(round=step_num)
+        handler.start_react_step(step_inv)
+        _react_step_invocation.set(step_inv)
+
+    try:
+        result = wrapped(*args, **kwargs)
+
+        if is_agent_turn:
+            current_step = _react_step_invocation.get()
+            if current_step:
+                done = getattr(instance, "done", False)
+                if done:
+                    term_reason = getattr(instance, "termination_reason", None)
+                    if term_reason:
+                        current_step.finish_reason = (
+                            term_reason.value
+                            if hasattr(term_reason, "value")
+                            else str(term_reason)
+                        )
+                    else:
+                        current_step.finish_reason = "agent_stop"
+                else:
+                    message = getattr(instance, "message", None)
+                    if message and hasattr(message, "is_tool_call") and message.is_tool_call():
+                        current_step.finish_reason = "tool_call"
+                    else:
+                        current_step.finish_reason = "assistant_text"
+
+        return result
+    except Exception as e:
+        current_step = _react_step_invocation.get()
+        if current_step:
+            current_step.finish_reason = "error"
+            handler.fail_react_step(current_step, Error(message=str(e), type=type(e)))
+            _react_step_invocation.set(None)
+        raise
+
+
+# ==================== Hook #4: generate_next_message -> AGENT ====================
+
+
+def wrap_generate_next_message(
+    wrapped, instance, args, kwargs, handler: ExtendedTelemetryHandler
+):
+    """Wrapper for LLMAgent.generate_next_message / LLMSoloAgent.generate_next_message."""
+    # Reentrancy guard
+    if _in_agent_invoke.get():
+        return wrapped(*args, **kwargs)
+    token = _in_agent_invoke.set(True)
+
+    try:
+        agent_name = instance.__class__.__name__
+        model = getattr(instance, "llm", None)
+
+        invocation = InvokeAgentInvocation(
+            provider="vitabench",
+            agent_name=agent_name,
+            request_model=model,
+        )
+
+        # input_messages
+        message = args[0] if args else kwargs.get("message")
+        state = args[1] if len(args) > 1 else kwargs.get("state")
+        if message:
+            invocation.input_messages = _convert_vita_messages_to_input([message])
+
+        # system_instruction
+        if state and hasattr(state, "system_messages") and state.system_messages:
+            invocation.system_instruction = [
+                Text(content=str(sm.content)[:_MAX_CONTENT_LEN])
+                for sm in state.system_messages
+                if sm and getattr(sm, "content", None)
+            ]
+
+        # tool_definitions
+        tools = getattr(instance, "tools", None)
+        tool_defs = _get_tool_definitions(tools)
+        if tool_defs:
+            invocation.tool_definitions = tool_defs
+
+        handler.start_invoke_agent(invocation)
+
+        try:
+            result = wrapped(*args, **kwargs)
+            assistant_msg, _ = result
+
+            # output_messages
+            invocation.output_messages = _convert_vita_assistant_to_output(assistant_msg)
+
+            # token usage
+            usage = getattr(assistant_msg, "usage", None)
+            if usage and isinstance(usage, dict):
+                invocation.input_tokens = usage.get("prompt_tokens")
+                invocation.output_tokens = usage.get("completion_tokens")
+
+            handler.stop_invoke_agent(invocation)
+            return result
+        except Exception as e:
+            handler.fail_invoke_agent(invocation, Error(message=str(e), type=type(e)))
+            raise
+    finally:
+        _in_agent_invoke.reset(token)
+
+
+# ==================== Hook #5: generate -> LLM ====================
+
+
+def wrap_generate(
+    wrapped, instance, args, kwargs, handler: ExtendedTelemetryHandler
+):
+    """Wrapper for vita.utils.llm_utils.generate to create LLM span."""
+    model = args[0] if args else kwargs.get("model", "unknown")
+    messages = args[1] if len(args) > 1 else kwargs.get("messages", [])
+    tools = args[2] if len(args) > 2 else kwargs.get("tools")
+    temperature = kwargs.get("temperature")
+
+    invocation = LLMInvocation(
+        request_model=model or "unknown",
+        provider=_infer_provider(model or ""),
+        temperature=temperature,
+    )
+    invocation.max_tokens = kwargs.get("max_tokens")
+
+    # input_messages
+    invocation.input_messages = _convert_vita_messages_to_input(messages)
+
+    # tool_definitions
+    tool_defs = _get_tool_definitions(tools)
+    if tool_defs:
+        invocation.tool_definitions = tool_defs
+
+    handler.start_llm(invocation)
+
+    try:
+        result = wrapped(*args, **kwargs)
+
+        if result:
+            # output_messages
+            invocation.output_messages = _convert_vita_assistant_to_output(result)
+
+            # response_model_name
+            invocation.response_model_name = model
+
+            # finish_reasons
+            if getattr(result, "tool_calls", None):
+                invocation.finish_reasons = ["tool_calls"]
+            else:
+                invocation.finish_reasons = ["stop"]
+
+            # token usage
+            usage = getattr(result, "usage", None)
+            if usage and isinstance(usage, dict):
+                invocation.input_tokens = usage.get("prompt_tokens")
+                invocation.output_tokens = usage.get("completion_tokens")
+
+        handler.stop_llm(invocation)
+        return result
+    except Exception as e:
+        handler.fail_llm(invocation, Error(message=str(e), type=type(e)))
+        raise
+
+
+# ==================== Hook #6: Environment.get_response -> TOOL ====================
+
+
+def wrap_get_response(
+    wrapped, instance, args, kwargs, handler: ExtendedTelemetryHandler
+):
+    """Wrapper for Environment.get_response to create TOOL span."""
+    message = args[0] if args else kwargs.get("message")
+
+    tool_name = getattr(message, "name", "unknown") if message else "unknown"
+    tool_call_id = getattr(message, "id", None) if message else None
+
+    invocation = ExecuteToolInvocation(
+        tool_name=tool_name,
+        tool_call_id=tool_call_id,
+        provider="vitabench",
+    )
+
+    # tool_call_arguments
+    if message and hasattr(message, "arguments") and message.arguments:
+        try:
+            invocation.tool_call_arguments = json.dumps(
+                message.arguments, ensure_ascii=False, default=str
+            )[:_MAX_CONTENT_LEN]
+        except Exception:
+            invocation.tool_call_arguments = str(message.arguments)[:_MAX_CONTENT_LEN]
+
+    handler.start_execute_tool(invocation)
+
+    try:
+        result = wrapped(*args, **kwargs)
+
+        # tool_call_result
+        if result and getattr(result, "content", None):
+            invocation.tool_call_result = str(result.content)[:_MAX_CONTENT_LEN]
+
+        # Check if tool reported an error
+        if result and getattr(result, "error", False):
+            handler.fail_execute_tool(
+                invocation,
+                Error(message=f"Tool error: {getattr(result, 'content', '')}", type=RuntimeError),
+            )
+        else:
+            handler.stop_execute_tool(invocation)
+
+        return result
+    except Exception as e:
+        handler.fail_execute_tool(invocation, Error(message=str(e), type=type(e)))
+        raise
diff --git a/instrumentation-loongsuite/loongsuite-instrumentation-vita/src/opentelemetry/instrumentation/vita/utils.py b/instrumentation-loongsuite/loongsuite-instrumentation-vita/src/opentelemetry/instrumentation/vita/utils.py
new file mode 100644
index 000000000..0793a6cc0
--- /dev/null
+++ b/instrumentation-loongsuite/loongsuite-instrumentation-vita/src/opentelemetry/instrumentation/vita/utils.py
@@ -0,0 +1,169 @@
+# Copyright The OpenTelemetry Authors
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Utility functions for VitaBench instrumentation.
+
+Handles conversion between vitabench Message types and
+OpenTelemetry GenAI semantic convention types.
+"""
+
+from __future__ import annotations
+
+import json
+import logging
+from typing import Any, List, Optional
+
+from opentelemetry.util.genai.types import (
+    FunctionToolDefinition,
+    InputMessage,
+    OutputMessage,
+    Text,
+    ToolCall as OTelToolCall,
+    ToolCallResponse,
+)
+
+logger = logging.getLogger(__name__)
+
+_MAX_CONTENT_LEN = 4096  # slice bound applied to captured content strings
+
+
+def _convert_vita_messages_to_input(messages: Any) -> List[InputMessage]:
+    """Convert vita message(s) to OTel InputMessages, skipping unconvertible ones."""
+    if not messages:
+        return []
+
+    # A lone message is wrapped; lists and tuples are iterated as-is.
+    if not isinstance(messages, (list, tuple)):
+        messages = [messages]
+
+    result = []
+    for msg in messages:
+        try:
+            role = getattr(msg, "role", None)
+            if role is None:
+                continue
+
+            parts = []
+            content = getattr(msg, "content", None)
+            tool_calls = getattr(msg, "tool_calls", None)
+
+            if role == "tool":
+                msg_id = getattr(msg, "id", None) or ""
+                if content:
+                    parts.append(
+                        ToolCallResponse(
+                            id=msg_id,
+                            response=str(content)[:_MAX_CONTENT_LEN],
+                        )
+                    )
+            else:
+                if content:
+                    parts.append(Text(content=str(content)[:_MAX_CONTENT_LEN]))
+                for tc in tool_calls or ():
+                    tc_args = getattr(tc, "arguments", {})
+                    if isinstance(tc_args, dict):
+                        tc_args = json.dumps(tc_args, ensure_ascii=False, default=str)
+                    parts.append(
+                        OTelToolCall(
+                            name=getattr(tc, "name", ""),
+                            id=getattr(tc, "id", None),
+                            arguments=tc_args,
+                        )
+                    )
+
+            if parts:
+                result.append(InputMessage(role=role, parts=parts))
+        except Exception as e:
+            # Telemetry must never break the instrumented call: log and move on.
+            logger.debug("Error converting vita message: %s", e)
+
+    return result
+
+
+def _convert_vita_assistant_to_output(msg: Any) -> List[OutputMessage]:
+    """Wrap a vita assistant message in a one-element OTel OutputMessage list.
+
+    Text is cut to ``_MAX_CONTENT_LEN`` characters and dict tool-call arguments
+    are JSON-encoded; a falsy ``msg`` yields an empty list.
+    """
+    if not msg:
+        return []
+
+    text = getattr(msg, "content", None)
+    calls = getattr(msg, "tool_calls", None)
+
+    segments = [Text(content=str(text)[:_MAX_CONTENT_LEN])] if text else []
+    for call in calls or ():
+        raw_args = getattr(call, "arguments", {})
+        if isinstance(raw_args, dict):
+            raw_args = json.dumps(raw_args, ensure_ascii=False, default=str)
+        call_name = getattr(call, "name", "")
+        call_id = getattr(call, "id", None)
+        segments.append(
+            OTelToolCall(name=call_name, id=call_id, arguments=raw_args)
+        )
+
+    # A message with neither text nor tool calls still carries one empty part.
+    if not segments:
+        segments.append(Text(content=""))
+    reason = "tool_calls" if calls else "stop"
+    return [
+        OutputMessage(role="assistant", parts=segments, finish_reason=reason)
+    ]
+
+
+def _infer_provider(model_name: str) -> str:
+    """Infer the provider from a model name; "unknown" when nothing matches."""
+    if not model_name:
+        return "unknown"
+    m = model_name.lower()
+    # o-series ids must be whole tokens, so "turbo1" or "pro3" is not OpenAI.
+    tokens = set(m.translate(str.maketrans("/_:.@ ", "------")).split("-"))
+    if "gpt" in m or tokens & {"o1", "o3", "o4"}:
+        return "openai"
+    for needle, provider in (
+        ("claude", "anthropic"), ("qwen", "alibaba_cloud"),
+        ("deepseek", "deepseek"), ("gemini", "google"),
+    ):
+        if needle in m:
+            return provider
+    return "unknown"
+
+
+def _get_tool_definitions(tools: Any) -> Optional[List[FunctionToolDefinition]]:
+    """Extract definitions from a vita Tool list; None if none usable or on error."""
+    if not tools:
+        return None
+    try:
+        defs = []
+        for t in tools:
+            name = getattr(t, "name", None)
+            if not name:
+                continue
+            # The schema may be wrapped as {"function": {...}} or given bare.
+            schema = getattr(t, "openai_schema", None)
+            if isinstance(schema, dict):
+                schema = schema.get("function", schema)
+            parameters = schema.get("parameters") if isinstance(schema, dict) else None
+            defs.append(
+                FunctionToolDefinition(
+                    name=name,
+                    description=getattr(t, "short_desc", None),
+                    parameters=parameters,
+                )
+            )
+        return defs if defs else None
+    except Exception as e:
+        logger.debug("Error extracting vita tool definitions: %s", e)
+        return None
diff --git a/instrumentation-loongsuite/loongsuite-instrumentation-vita/src/opentelemetry/instrumentation/vita/version.py b/instrumentation-loongsuite/loongsuite-instrumentation-vita/src/opentelemetry/instrumentation/vita/version.py
new file mode 100644
index 000000000..26056b5d8
--- /dev/null
+++ b/instrumentation-loongsuite/loongsuite-instrumentation-vita/src/opentelemetry/instrumentation/vita/version.py
@@ -0,0 +1 @@
+__version__ = "0.5.0.dev"
diff --git a/instrumentation-loongsuite/loongsuite-instrumentation-vita/tests/__init__.py b/instrumentation-loongsuite/loongsuite-instrumentation-vita/tests/__init__.py
new file mode 100644
index 000000000..e69de29bb
diff --git a/instrumentation-loongsuite/loongsuite-instrumentation-vita/tests/conftest.py b/instrumentation-loongsuite/loongsuite-instrumentation-vita/tests/conftest.py
new file mode 100644
index 000000000..1e9dac354
--- /dev/null
+++ b/instrumentation-loongsuite/loongsuite-instrumentation-vita/tests/conftest.py
@@ -0,0 +1,100 @@
+# Copyright The OpenTelemetry Authors
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Test configuration for VitaBench instrumentation tests."""
+
+import os
+
+import pytest
+
+from opentelemetry.instrumentation.vita import VitaInstrumentor
+from opentelemetry.sdk._logs import LoggerProvider
+from opentelemetry.sdk._logs.export import (
+    InMemoryLogExporter,
+    SimpleLogRecordProcessor,
+)
+from opentelemetry.sdk.metrics import MeterProvider
+from opentelemetry.sdk.metrics.export import InMemoryMetricReader
+from opentelemetry.sdk.trace import TracerProvider
+from opentelemetry.sdk.trace.export import SimpleSpanProcessor
+from opentelemetry.sdk.trace.export.in_memory_span_exporter import (
+    InMemorySpanExporter,
+)
+
+
+def pytest_configure(config: pytest.Config):  # pytest hook; the env vars set here are never restored
+    os.environ["OTEL_SEMCONV_STABILITY_OPT_IN"] = "gen_ai_latest_experimental"
+    os.environ["OTEL_INSTRUMENTATION_GENAI_CAPTURE_MESSAGE_CONTENT"] = "SPAN_ONLY"
+
+
+# ==================== Exporters ====================
+
+
+@pytest.fixture(scope="function", name="span_exporter")
+def fixture_span_exporter():
+    """Yield a fresh in-memory span exporter for each test."""
+    yield InMemorySpanExporter()
+
+
+@pytest.fixture(scope="function", name="log_exporter")
+def fixture_log_exporter():
+    """Yield a fresh in-memory log exporter for each test."""
+    yield InMemoryLogExporter()
+
+
+@pytest.fixture(scope="function", name="metric_reader")
+def fixture_metric_reader():
+    """Yield a fresh in-memory metric reader for each test."""
+    yield InMemoryMetricReader()
+
+
+# ==================== Providers ====================
+
+
+@pytest.fixture(scope="function", name="tracer_provider")
+def fixture_tracer_provider(span_exporter):
+    traces = TracerProvider()  # finished spans are handed to span_exporter
+    traces.add_span_processor(SimpleSpanProcessor(span_exporter))
+    return traces
+
+
+@pytest.fixture(scope="function", name="logger_provider")
+def fixture_logger_provider(log_exporter):
+    logs = LoggerProvider()  # log records are handed to log_exporter
+    logs.add_log_record_processor(SimpleLogRecordProcessor(log_exporter))
+    return logs
+
+
+@pytest.fixture(scope="function", name="meter_provider")
+def fixture_meter_provider(metric_reader):
+    """Return a MeterProvider whose only reader is ``metric_reader``."""
+    readers = [metric_reader]
+    provider = MeterProvider(metric_readers=readers)
+    return provider
+
+
+# ==================== Instrumentation ====================
+
+
+@pytest.fixture(scope="function")
+def instrument(tracer_provider, logger_provider, meter_provider):
+    """Instrument vita for one test, then uninstrument on teardown."""
+    vita_probe = VitaInstrumentor()
+    vita_probe.instrument(
+        tracer_provider=tracer_provider, logger_provider=logger_provider,
+        meter_provider=meter_provider, skip_dep_check=True,
+    )
+    yield vita_probe
+    vita_probe.uninstrument()
diff --git a/instrumentation-loongsuite/loongsuite-instrumentation-vita/tests/test_instrumentor.py b/instrumentation-loongsuite/loongsuite-instrumentation-vita/tests/test_instrumentor.py
new file mode 100644
index 000000000..a6a2339f8
--- /dev/null
+++ b/instrumentation-loongsuite/loongsuite-instrumentation-vita/tests/test_instrumentor.py
@@ -0,0 +1,478 @@
+# Copyright The OpenTelemetry Authors
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Tests for VitaBench instrumentation.
+
+The suite exercises all execute.md hook points. External I/O is replaced at the
+HTTP/tool boundary, while the Vita agent/orchestrator call chain runs through
+the real framework methods.
+"""
+
+from __future__ import annotations
+
+from types import SimpleNamespace
+from unittest.mock import MagicMock, patch
+
+import pytest
+
+from opentelemetry.instrumentation.vita import VitaInstrumentor
+
+
+# Stand-in model registry that the tests patch over
+# ``vita.utils.llm_utils.models``. Every entry shares one fake endpoint and
+# credential; only the model name (the key) differs.
+FAKE_MODELS_CONFIG = {
+    model_name: {
+        "base_url": "http://fake-api.example.com/v1/chat/completions",
+        "headers": {"Authorization": "Bearer test-key"},
+    }
+    for model_name in (
+        "qwen-max",
+        "gpt-4",
+        "claude-3-opus",
+    )
+}
+
+
+def _make_openai_response(content=None, tool_calls=None, usage=None):
+    """Build an OpenAI-style chat completion payload with one assistant choice."""
+    fallback = {"prompt_tokens": 100, "completion_tokens": 50, "total_tokens": 150}
+    assistant = {"role": "assistant", "content": content}
+    if tool_calls:
+        assistant.update(tool_calls=tool_calls)
+    choice = {"message": assistant, "finish_reason": "stop"}
+    return {
+        "id": "chatcmpl-test", "model": "test-model",
+        "choices": [choice], "usage": usage or fallback,
+    }
+
+
+def _mock_requests_post(response_dict):
+    """Build a MagicMock standing in for a 200 response whose json() is the dict."""
+    fake = MagicMock(status_code=200)
+    fake.json.return_value = response_dict
+    return fake
+
+
+def _tool_call_response():
+    """Return a completion payload whose assistant message calls ``get_order``."""
+    get_order_call = {
+        "id": "call_1",
+        "type": "function",
+        "function": {"name": "get_order", "arguments": '{"order_id": "123"}'},
+    }
+    token_usage = {
+        "prompt_tokens": 100,
+        "completion_tokens": 20,
+        "total_tokens": 120,
+    }
+
+    return _make_openai_response(tool_calls=[get_order_call], usage=token_usage)
+
+
+def _text_response(content="Order 123 has been delivered. ###STOP###"):
+    """Return a text-only completion payload with fixed token usage."""
+    token_usage = {"prompt_tokens": 200, "completion_tokens": 30, "total_tokens": 230}
+    payload = _make_openai_response(content=content, usage=token_usage)
+    return payload
+
+
+class FakeTool:
+    """Minimal tool stub exposing ``name``, ``short_desc`` and ``openai_schema``."""
+    name = "get_order"
+    short_desc = "Get order details"
+    openai_schema = {
+        "type": "function",
+        "function": {
+            "name": name,
+            "description": short_desc,
+            "parameters": {
+                "type": "object", "properties": {"order_id": {"type": "string"}},
+            },
+        },
+    }
+
+
+class FakeTools:  # stand-in for the tools object passed to vita's Environment in these tests
+    def __init__(self):
+        self.db = SimpleNamespace(time="2026-01-01 00:00:00")
+        self._tools = {"get_order": FakeTool()}
+
+    def get_tools(self):  # returns the internal name -> tool mapping itself, not a copy
+        return self._tools
+
+    def use_tool(self, tool_name, **kwargs):  # echoes the call back; never touches a real backend
+        return {"tool": tool_name, "arguments": kwargs, "status": "delivered"}
+
+    def get_db_hash(self):  # constant value, independent of any state
+        return "fake-db-hash"
+
+
+class DeterministicUser:  # scripted user passed to the Orchestrator; always sends the same message
+    def get_init_state(self, message_history=None):  # state is a namespace holding a messages list
+        return SimpleNamespace(messages=message_history or [])
+
+    def generate_next_message(self, message, state):  # the incoming ``message`` is ignored
+        from vita.data_model.message import UserMessage
+
+        user_message = UserMessage(role="user", content="Check order 123")
+        state.messages.append(user_message)  # mutates the caller's state in place
+        return user_message, state
+
+
+def _make_agent():
+    """Build an ``LLMAgent`` on "qwen-max" that is given one ``FakeTool``."""
+    from vita.agent.llm_agent import LLMAgent
+
+    agent_kwargs = dict(
+        tools=[FakeTool()], domain_policy="You are helpful at {time}.",
+        llm="qwen-max", llm_args={},
+        time="2026-01-01 00:00:00", language="english",
+    )
+
+    return LLMAgent(**agent_kwargs)
+
+
+def _make_orchestrator():
+    """Wire a "delivery" Orchestrator from the fake agent, user, tools and task."""
+    from vita.environment.environment import Environment
+    from vita.orchestrator.orchestrator import Orchestrator
+
+    # Collaborators are built in the same order the keyword arguments list them.
+    fake_agent = _make_agent()
+    fake_user = DeterministicUser()
+    fake_env = Environment(domain_name="delivery", tools=FakeTools())
+    fake_task = SimpleNamespace(
+        id="task_001", instructions="Check order 123", message_history=None
+    )
+
+    return Orchestrator(
+        domain="delivery", agent=fake_agent, user=fake_user,
+        environment=fake_env, task=fake_task,
+        max_steps=6, max_errors=3, language="english",
+    )
+
+
+def _span_attrs(spans, name):
+    """Return the attributes of the first span called ``name`` as a dict."""
+    return dict(next(filter(lambda s: s.name == name, spans)).attributes)
+
+
+class TestVitaInstrumentor:  # instrument()/uninstrument() lifecycle and declared dependencies
+    def test_instrument_and_uninstrument(
+        self, tracer_provider, logger_provider, meter_provider
+    ):
+        instrumentor = VitaInstrumentor()
+        instrumentor.instrument(
+            tracer_provider=tracer_provider,
+            logger_provider=logger_provider,
+            meter_provider=meter_provider,
+            skip_dep_check=True,
+        )
+        assert instrumentor._handler is not None
+        instrumentor.uninstrument()
+        assert instrumentor._handler is None  # uninstrument() must clear the handler
+
+    def test_instrumentation_dependencies(self):
+        assert VitaInstrumentor().instrumentation_dependencies() == (
+            "vita >= 0.0.1",
+        )
+
+
+class TestLLMSpan:  # "chat <model>" spans emitted around vita.utils.llm_utils.generate
+    def test_llm_span_text_response(self, instrument, span_exporter):
+        from vita.data_model.message import UserMessage
+        from vita.utils.llm_utils import generate
+
+        with patch("vita.utils.llm_utils.models", FAKE_MODELS_CONFIG), patch(
+            "requests.post",
+            return_value=_mock_requests_post(
+                _make_openai_response(
+                    content="The order has been delivered.",
+                    usage={
+                        "prompt_tokens": 150,
+                        "completion_tokens": 30,
+                        "total_tokens": 180,
+                    },
+                )
+            ),
+        ):
+            result = generate(
+                model="qwen-max",
+                messages=[UserMessage(role="user", content="Where is my order?")],
+            )
+
+        assert result.content == "The order has been delivered."
+        spans = span_exporter.get_finished_spans()
+        attrs = _span_attrs(spans, "chat qwen-max")
+        assert attrs["gen_ai.operation.name"] == "chat"
+        assert attrs["gen_ai.span.kind"] == "LLM"
+        assert attrs["gen_ai.request.model"] == "qwen-max"
+        assert attrs["gen_ai.provider.name"] == "alibaba_cloud"
+        assert attrs["gen_ai.usage.input_tokens"] == 150  # mirrors the mocked prompt_tokens
+        assert attrs["gen_ai.usage.output_tokens"] == 30  # mirrors the mocked completion_tokens
+        assert attrs["gen_ai.response.finish_reasons"] == ("stop",)
+
+    def test_llm_span_tool_call_response(self, instrument, span_exporter):
+        from vita.data_model.message import UserMessage
+        from vita.utils.llm_utils import generate
+
+        with patch("vita.utils.llm_utils.models", FAKE_MODELS_CONFIG), patch(
+            "requests.post", return_value=_mock_requests_post(_tool_call_response())
+        ):
+            result = generate(
+                model="gpt-4",
+                messages=[UserMessage(role="user", content="Check my order")],
+            )
+
+        assert result.tool_calls is not None
+        attrs = _span_attrs(span_exporter.get_finished_spans(), "chat gpt-4")
+        # The mocked payload says finish_reason "stop"; the span must still report tool_calls.
+        assert attrs["gen_ai.response.finish_reasons"] == ("tool_calls",)
+        assert attrs["gen_ai.provider.name"] == "openai"
+
+    def test_llm_span_captures_positional_tools(self, instrument, span_exporter):
+        from vita.data_model.message import UserMessage
+        from vita.utils.llm_utils import generate
+
+        with patch("vita.utils.llm_utils.models", FAKE_MODELS_CONFIG), patch(
+            "requests.post", return_value=_mock_requests_post(_text_response("Done."))
+        ):
+            generate(
+                "qwen-max",
+                [UserMessage(role="user", content="Check my order")],
+                [FakeTool()],
+            )
+
+        attrs = _span_attrs(span_exporter.get_finished_spans(), "chat qwen-max")
+        assert "gen_ai.tool.definitions" in attrs
+
+
+class TestToolSpan:  # "execute_tool <name>" spans emitted around Environment.get_response
+    def test_tool_span_created(self, instrument, span_exporter):
+        from vita.data_model.message import ToolCall
+        from vita.environment.environment import Environment
+
+        env = Environment(domain_name="delivery", tools=FakeTools())
+        result = env.get_response(
+            ToolCall(id="tc_42", name="get_order", arguments={"order_id": "999"})
+        )
+
+        assert result.content is not None
+        attrs = _span_attrs(
+            span_exporter.get_finished_spans(), "execute_tool get_order"
+        )
+        assert attrs["gen_ai.operation.name"] == "execute_tool"
+        assert attrs["gen_ai.span.kind"] == "TOOL"
+        assert attrs["gen_ai.tool.name"] == "get_order"
+        assert attrs["gen_ai.tool.call.id"] == "tc_42"  # taken from the ToolCall id above
+
+    def test_tool_span_on_error(self, instrument, span_exporter):
+        from vita.data_model.message import ToolCall
+        from vita.environment.environment import Environment
+
+        tools = FakeTools()
+        tools.use_tool = MagicMock(side_effect=RuntimeError("Tool failed"))  # force the tool to raise
+        env = Environment(domain_name="delivery", tools=tools)
+        result = env.get_response(
+            ToolCall(id="tc_err", name="get_order", arguments={})
+        )
+
+        assert result.error is True  # the failure is reported on the result, not raised to the caller
+        tool_span = next(
+            s
+            for s in span_exporter.get_finished_spans()
+            if s.name == "execute_tool get_order"
+        )
+        assert tool_span.status.status_code.name == "ERROR"
+
+
+class TestAgentSpan:  # "invoke_agent <class>" spans emitted around generate_next_message
+    def test_agent_span_created_for_llm_agent(self, instrument, span_exporter):
+        from vita.data_model.message import UserMessage
+
+        agent = _make_agent()
+        state = agent.get_init_state([])
+
+        with patch("vita.utils.llm_utils.models", FAKE_MODELS_CONFIG), patch(
+            "requests.post", return_value=_mock_requests_post(_text_response("Sure."))
+        ):
+            assistant_msg, _ = agent.generate_next_message(
+                UserMessage(role="user", content="I need help"), state
+            )
+
+        assert assistant_msg.content == "Sure."
+        spans = span_exporter.get_finished_spans()
+        agent_span = next(s for s in spans if s.name == "invoke_agent LLMAgent")
+        llm_span = next(s for s in spans if s.name == "chat qwen-max")
+        attrs = dict(agent_span.attributes)
+        assert attrs["gen_ai.operation.name"] == "invoke_agent"
+        assert attrs["gen_ai.span.kind"] == "AGENT"
+        assert attrs["gen_ai.agent.name"] == "LLMAgent"
+        assert attrs["gen_ai.request.model"] == "qwen-max"
+        assert llm_span.parent.span_id == agent_span.context.span_id  # LLM span nests under AGENT
+
+    def test_agent_span_created_for_llm_solo_agent(self, instrument, span_exporter):
+        from vita.agent.llm_agent import LLMSoloAgent
+
+        agent = LLMSoloAgent(
+            tools=[FakeTool()],
+            domain_policy="unused",
+            llm="qwen-max",
+            llm_args={},
+            time="2026-01-01 00:00:00",
+            language="english",
+        )
+        state = agent.get_init_state([])
+
+        with patch("vita.utils.llm_utils.models", FAKE_MODELS_CONFIG), patch(
+            "requests.post", return_value=_mock_requests_post(_tool_call_response())
+        ):
+            agent.generate_next_message(None, state)  # called with no incoming message
+
+        attrs = _span_attrs(
+            span_exporter.get_finished_spans(), "invoke_agent LLMSoloAgent"
+        )
+        assert attrs["gen_ai.span.kind"] == "AGENT"
+        assert attrs["gen_ai.agent.name"] == "LLMSoloAgent"
+
+
+class TestStepAndChainSpans:  # CHAIN / STEP span nesting and failure handling for Orchestrator.run
+    def test_orchestrator_run_creates_chain_steps_agents_llms_and_tools(
+        self, instrument, span_exporter
+    ):
+        responses = [
+            _mock_requests_post(_tool_call_response()),  # 1st LLM turn: tool call
+            _mock_requests_post(_text_response()),  # 2nd LLM turn: final text with ###STOP###
+        ]
+
+        with patch("vita.utils.llm_utils.models", FAKE_MODELS_CONFIG), patch(
+            "requests.post", side_effect=responses
+        ):
+            result = _make_orchestrator().run()
+
+        assert result.termination_reason == "agent_stop"
+        spans = span_exporter.get_finished_spans()
+        chain = next(s for s in spans if s.name == "workflow delivery")
+        steps = sorted(
+            [s for s in spans if s.name == "react step"], key=lambda s: s.start_time
+        )
+        agents = sorted(
+            [s for s in spans if s.name == "invoke_agent LLMAgent"],
+            key=lambda s: s.start_time,
+        )
+        llms = sorted(
+            [s for s in spans if s.name == "chat qwen-max"],
+            key=lambda s: s.start_time,
+        )
+        tools = [s for s in spans if s.name == "execute_tool get_order"]
+
+        assert len(steps) == 2
+        assert len(agents) == 2
+        assert len(llms) == 2
+        assert len(tools) == 1
+
+        chain_attrs = dict(chain.attributes)
+        assert chain_attrs["gen_ai.operation.name"] == "workflow"
+        assert chain_attrs["gen_ai.span.kind"] == "CHAIN"
+        assert chain_attrs["gen_ai.framework"] == "vitabench"
+
+        assert dict(steps[0].attributes)["gen_ai.react.round"] == 1  # rounds are 1-based
+        assert dict(steps[1].attributes)["gen_ai.react.round"] == 2
+        for step in steps:
+            assert step.parent.span_id == chain.context.span_id
+        assert agents[0].parent.span_id == steps[0].context.span_id
+        assert agents[1].parent.span_id == steps[1].context.span_id
+        assert llms[0].parent.span_id == agents[0].context.span_id
+        assert llms[1].parent.span_id == agents[1].context.span_id
+        assert tools[0].parent.span_id == steps[0].context.span_id  # TOOL is a sibling of AGENT
+
+    def test_open_step_fails_when_env_turn_raises(self, instrument, span_exporter):
+        with patch("vita.utils.llm_utils.models", FAKE_MODELS_CONFIG), patch(
+            "requests.post", return_value=_mock_requests_post(_tool_call_response())
+        ), patch(
+            "vita.environment.environment.Environment.get_response",
+            side_effect=RuntimeError("env broke"),
+        ):
+            with pytest.raises(RuntimeError, match="env broke"):
+                _make_orchestrator().run()
+
+        spans = span_exporter.get_finished_spans()
+        step = next(s for s in spans if s.name == "react step")
+        chain = next(s for s in spans if s.name == "workflow delivery")
+        step_attrs = dict(step.attributes)
+        assert step.status.status_code.name == "ERROR"
+        assert step_attrs["gen_ai.react.finish_reason"] == "error"
+        assert chain.status.status_code.name == "ERROR"  # the failure also marks the CHAIN span
+
+
+class TestEntrySpan:  # ENTRY span emitted around vita.run.run_task
+    def test_run_task_entry_wraps_orchestrator_trace(self, instrument, span_exporter):
+        from vita.run import run_task
+
+        def fake_internal(**kwargs):  # patched in for vita.run._run_task_internal below
+            return _make_orchestrator().run()
+
+        responses = [
+            _mock_requests_post(_tool_call_response()),
+            _mock_requests_post(_text_response()),
+        ]
+        task = SimpleNamespace(
+            id="task_001",
+            instructions="Check order 123",
+            message_history=None,
+        )
+
+        with patch("vita.run._run_task_internal", side_effect=fake_internal), patch(
+            "vita.utils.llm_utils.models", FAKE_MODELS_CONFIG
+        ), patch("requests.post", side_effect=responses):
+            result = run_task("delivery", task, "llm_agent", "user_simulator")
+
+        assert result.termination_reason == "agent_stop"
+        spans = span_exporter.get_finished_spans()
+        entry = next(s for s in spans if s.name == "enter_ai_application_system")
+        chain = next(s for s in spans if s.name == "workflow delivery")
+        attrs = dict(entry.attributes)
+        assert attrs["gen_ai.operation.name"] == "enter"
+        assert attrs["gen_ai.span.kind"] == "ENTRY"
+        assert attrs["gen_ai.framework"] == "vitabench"
+        assert "gen_ai.session.id" in attrs
+        assert chain.parent.span_id == entry.context.span_id  # CHAIN nests directly under ENTRY
+
+
+class TestProviderInference:  # gen_ai.provider.name as reported for different model names
+    def test_common_provider_names(self, instrument, span_exporter):
+        from vita.data_model.message import UserMessage
+        from vita.utils.llm_utils import generate
+
+        for model in ("gpt-4", "claude-3-opus", "qwen-max"):  # each is a key of FAKE_MODELS_CONFIG
+            with patch("vita.utils.llm_utils.models", FAKE_MODELS_CONFIG), patch(
+                "requests.post",
+                return_value=_mock_requests_post(_make_openai_response(content="Hi")),
+            ):
+                generate(
+                    model=model,
+                    messages=[UserMessage(role="user", content="Hi")],
+                )
+
+        providers = {  # request model -> provider, collected from every "chat ..." span
+            dict(s.attributes)["gen_ai.request.model"]: dict(s.attributes)[
+                "gen_ai.provider.name"
+            ]
+            for s in span_exporter.get_finished_spans()
+            if s.name.startswith("chat ")
+        }
+        assert providers["gpt-4"] == "openai"
+        assert providers["claude-3-opus"] == "anthropic"
+        assert providers["qwen-max"] == "alibaba_cloud"
diff --git a/instrumentation-loongsuite/loongsuite-instrumentation-webarena/pyproject.toml b/instrumentation-loongsuite/loongsuite-instrumentation-webarena/pyproject.toml
new file mode 100644
index 000000000..c2fc31949
--- /dev/null
+++ b/instrumentation-loongsuite/loongsuite-instrumentation-webarena/pyproject.toml
@@ -0,0 +1,54 @@
+[build-system]
+requires = ["hatchling"]
+build-backend = "hatchling.build"
+
+[project]
+name = "loongsuite-instrumentation-webarena"
+dynamic = ["version"]
+description = "LoongSuite webarena instrumentation"
+license = "Apache-2.0"
+requires-python = ">=3.10,<4"
+authors = [
+  { name = "OpenTelemetry Authors", email = "cncf-opentelemetry-contributors@lists.cncf.io" },
+]
+classifiers = [
+  "Development Status :: 4 - Beta",
+  "Intended Audience :: Developers",
+  "License :: OSI Approved :: Apache Software License",
+  "Programming Language :: Python",
+  "Programming Language :: Python :: 3",
+  "Programming Language :: Python :: 3.10",
+  "Programming Language :: Python :: 3.11",
+  "Programming Language :: Python :: 3.12",
+  "Programming Language :: Python :: 3.13",
+]
+dependencies = [
+  "opentelemetry-api >= 1.37.0",
+  "opentelemetry-instrumentation >= 0.58b0",
+  "opentelemetry-semantic-conventions >= 0.58b0",
+  "wrapt >= 1.0.0, < 2.0.0",
+]
+
+[project.optional-dependencies]
+instruments = [
+  "webarena >= 0.0.1"
+]
+
+[project.entry-points.opentelemetry_instrumentor]
+webarena = "opentelemetry.instrumentation.webarena:WebarenaInstrumentor"
+
+[project.urls]
+Homepage = "https://github.com/alibaba/loongsuite-python-agent/tree/main/instrumentation-loongsuite/loongsuite-instrumentation-webarena"
+Repository = "https://github.com/alibaba/loongsuite-python-agent"
+
+[tool.hatch.version]
+path = "src/opentelemetry/instrumentation/webarena/version.py"
+
+[tool.hatch.build.targets.sdist]
+include = [
+  "/src",
+  "/tests",
+]
+
+[tool.hatch.build.targets.wheel]
+packages = ["src/opentelemetry"]
diff --git a/instrumentation-loongsuite/loongsuite-instrumentation-webarena/src/opentelemetry/instrumentation/webarena/__init__.py b/instrumentation-loongsuite/loongsuite-instrumentation-webarena/src/opentelemetry/instrumentation/webarena/__init__.py
new file mode 100644
index 000000000..c822df538
--- /dev/null
+++ b/instrumentation-loongsuite/loongsuite-instrumentation-webarena/src/opentelemetry/instrumentation/webarena/__init__.py
@@ -0,0 +1,231 @@
+"""
+OpenTelemetry WebArena Instrumentation
+======================================
+
+Automatic instrumentation for the
+`WebArena <https://github.com/web-arena-x/webarena>`_ benchmark framework.
+
+Span hierarchy
+--------------
+
+::
+
+    ENTRY  webarena_task                       (per task; ScriptBrowserEnv.reset)
+    └── CHAIN  workflow webarena_task          (same lifecycle as ENTRY)
+         ├── STEP  react step                  (one per ReAct round)
+         │    ├── AGENT  invoke_agent          (PromptAgent.next_action)
+         │    │    ├── TASK  build_prompt_context (PromptConstructor.construct)
+         │    │    └── LLM  chat / text_completion
+         │    │              * OpenAI provider — emitted by the OpenAI SDK probe
+         │    │              * HuggingFace provider — emitted by THIS package
+         │    └── TOOL  execute_tool {action_type}  (ScriptBrowserEnv.step)
+         └── ...
+
+    AGENT  create_agent                        (one-shot; construct_agent)
+
+Design principles
+-----------------
+
+* **Do not double-emit OpenAI LLM spans.** WebArena's
+  ``generate_from_openai_chat_completion`` / ``generate_from_openai_completion``
+  ultimately call ``openai.ChatCompletion.create`` /
+  ``openai.Completion.create`` which already have a dedicated OpenAI SDK
+  instrumentor (e.g. ``opentelemetry-instrumentation-openai``). We rely on
+  *that* instrumentor for token usage / model / finish-reason and let its
+  LLM span attach itself naturally as a child of our AGENT span via the
+  shared OTel context.
+* **HuggingFace path is ours.** The ``text_generation`` client has no
+  off-the-shelf probe, so we wrap
+  ``llms.providers.hf_utils.generate_from_huggingface_completion`` to emit
+  an LLM span for that path.
+* **No invasive rewrite of ``run.py:test()``.** ENTRY / CHAIN / STEP are
+  synthesised by latching on to ``ScriptBrowserEnv.reset`` (task start),
+  ``ScriptBrowserEnv.close`` (batch end) and ``PromptAgent.next_action``
+  (round start). See ``internal/_state.py`` for the state machine.
+
+Usage
+-----
+
+.. code:: python
+
+    from opentelemetry.instrumentation.webarena import WebarenaInstrumentor
+
+    WebarenaInstrumentor().instrument()
+
+    # Then run WebArena as normal (e.g. ``python run.py ...``).
+"""
+
+from __future__ import annotations
+
+import logging
+from typing import Any, Collection
+
+from opentelemetry import trace as trace_api
+from opentelemetry.instrumentation.instrumentor import BaseInstrumentor
+from wrapt import wrap_function_wrapper
+
+from opentelemetry.instrumentation.webarena.package import _instruments
+from opentelemetry.instrumentation.webarena.version import __version__
+
+logger = logging.getLogger(__name__)
+
+__all__ = ["WebarenaInstrumentor"]
+
+
+# WebArena uses *flat* package names (``setup.cfg`` declares ``packages =
+# browser_env, agent, evaluation_harness, llms`` with no ``webarena.``
+# prefix). Patch targets therefore use the bare module names.
+_PATCH_TARGETS = (
+    # Each entry is (module, qualname, wrapper_key). ``wrapper_key`` is the
+    # key used to look up the hook instance in the ``wrappers`` dict that
+    # ``WebarenaInstrumentor._instrument`` builds.
+    ("browser_env.envs", "ScriptBrowserEnv.reset", "_env_reset_wrapper"),
+    ("browser_env.envs", "ScriptBrowserEnv.close", "_env_close_wrapper"),
+    ("browser_env.envs", "ScriptBrowserEnv.step", "_env_step_wrapper"),
+    ("agent.agent", "construct_agent", "_construct_agent_wrapper"),
+    ("agent.agent", "PromptAgent.next_action", "_next_action_wrapper"),
+)
+
+# PromptConstructor.construct is abstract on the base class, so we patch
+# the two known concrete subclasses individually.
+_PROMPT_CONSTRUCTOR_TARGETS = (
+    ("agent.prompts.prompt_constructor", "DirectPromptConstructor.construct"),
+    ("agent.prompts.prompt_constructor", "CoTPromptConstructor.construct"),
+)
+
+_HF_TARGET = ("llms.providers.hf_utils", "generate_from_huggingface_completion")
+
+
+class WebarenaInstrumentor(BaseInstrumentor):
+    """An ``opentelemetry-instrumentation`` plugin for WebArena.
+
+    Spans (see module docstring) are emitted via ``wrapt`` hooks on the five
+    targets listed in ``_PATCH_TARGETS`` and the two concrete
+    ``PromptConstructor.construct`` implementations listed in
+    ``_PROMPT_CONSTRUCTOR_TARGETS``, plus an optional HuggingFace LLM hook.
+    OpenAI LLM spans are intentionally **not** emitted here (the OpenAI SDK
+    probe handles them).
+    """
+
+    # (module, qualname) pairs wrapped by the most recent ``_instrument``
+    # call. Always rebound through ``type(self)``, so the record lives on
+    # the class rather than on an instance.
+    _patched: list[tuple[str, str]] = []
+    # True once the optional HuggingFace hook has been installed.
+    _patched_hf: bool = False
+
+    def instrumentation_dependencies(self) -> Collection[str]:
+        """Return the ``_instruments`` collection from the ``package`` module."""
+        return _instruments
+
+    def _instrument(self, **kwargs: Any) -> None:
+        """Install every WebArena hook that can be resolved.
+
+        Args:
+            **kwargs: may carry ``tracer_provider``; when it is absent
+                ``None`` is handed to ``get_tracer``.
+
+        A target that cannot be wrapped is logged and skipped; it never
+        aborts instrumentation of the remaining targets.
+        """
+        tracer = trace_api.get_tracer(
+            __name__,
+            __version__,
+            tracer_provider=kwargs.get("tracer_provider"),
+        )
+
+        # Imported here rather than at module import time.
+        from opentelemetry.instrumentation.webarena.internal._wrappers import (
+            ConstructAgentWrapper,
+            EnvCloseWrapper,
+            EnvResetWrapper,
+            EnvStepWrapper,
+            HuggingFaceCompletionWrapper,
+            NextActionWrapper,
+            PromptConstructWrapper,
+        )
+
+        wrappers = {
+            "_env_reset_wrapper": EnvResetWrapper(tracer),
+            "_env_close_wrapper": EnvCloseWrapper(),
+            "_env_step_wrapper": EnvStepWrapper(tracer),
+            "_construct_agent_wrapper": ConstructAgentWrapper(tracer),
+            "_next_action_wrapper": NextActionWrapper(tracer),
+        }
+        # One shared hook serves both concrete PromptConstructor subclasses.
+        prompt_wrapper = PromptConstructWrapper(tracer)
+
+        # --- core patches, then PromptConstructor subclasses --------------
+        targets = [
+            (module, qualname, wrappers[wrapper_key])
+            for module, qualname, wrapper_key in _PATCH_TARGETS
+        ]
+        targets.extend(
+            (module, qualname, prompt_wrapper)
+            for module, qualname in _PROMPT_CONSTRUCTOR_TARGETS
+        )
+        # Only successfully wrapped targets are remembered for unwrapping.
+        type(self)._patched = [
+            (module, qualname)
+            for module, qualname, wrapper in targets
+            if self._try_wrap(module, qualname, wrapper)
+        ]
+
+        # --- HuggingFace provider (optional, only if module imports OK) --
+        hf_module, hf_name = _HF_TARGET
+        try:
+            wrap_function_wrapper(
+                module=hf_module,
+                name=hf_name,
+                wrapper=HuggingFaceCompletionWrapper(tracer),
+            )
+        except Exception as exc:  # noqa: BLE001
+            # Optional path: a failure here is only worth a debug line.
+            logger.debug(
+                "WebarenaInstrumentor: skipping HuggingFace wrapper: %s", exc
+            )
+        else:
+            type(self)._patched_hf = True
+
+    @staticmethod
+    def _try_wrap(module: str, qualname: str, wrapper: Any) -> bool:
+        """Wrap ``module.qualname`` with ``wrapper``; return whether it worked.
+
+        Any failure is logged as a warning and reported as ``False``.
+        """
+        try:
+            wrap_function_wrapper(
+                module=module, name=qualname, wrapper=wrapper
+            )
+        except Exception as exc:  # noqa: BLE001
+            logger.warning(
+                "WebarenaInstrumentor: could not wrap %s.%s: %s",
+                module,
+                qualname,
+                exc,
+            )
+            return False
+        return True
+
+    def _uninstrument(self, **kwargs: Any) -> None:
+        """Close any open task spans and restore every wrapped target."""
+        from opentelemetry.instrumentation.webarena.internal import _state as state
+
+        # Always make sure we don't leak open spans on uninstrument. A
+        # failure here must not prevent the unwrapping below, so it is
+        # logged instead of raised.
+        try:
+            state.end_task_spans()
+        except Exception as exc:  # noqa: BLE001
+            logger.debug(
+                "WebarenaInstrumentor: could not close open task spans: %s",
+                exc,
+            )
+
+        # Unwrap each successfully-patched target. The module is imported
+        # lazily inside ``_safe_unwrap`` so uninstrument doesn't fail when
+        # WebArena is no longer importable (e.g. during teardown).
+        for module, qualname in list(type(self)._patched):
+            self._safe_unwrap(module, qualname)
+        type(self)._patched = []
+
+        if type(self)._patched_hf:
+            self._safe_unwrap(_HF_TARGET[0], _HF_TARGET[1])
+            type(self)._patched_hf = False
+
+    @staticmethod
+    def _safe_unwrap(module: str, qualname: str) -> None:
+        """Restore ``module.qualname`` to its ``__wrapped__`` original.
+
+        Import and attribute errors are logged at debug level and otherwise
+        ignored; an attribute without ``__wrapped__`` is left untouched.
+        """
+        try:
+            import importlib  # noqa: PLC0415
+
+            mod = importlib.import_module(module)
+        except Exception as exc:  # noqa: BLE001
+            logger.debug(
+                "WebarenaInstrumentor: could not import %s for unwrap: %s",
+                module,
+                exc,
+            )
+            return
+
+        # ``qualname`` may be dotted ("Class.method"): walk to the owner
+        # object first, then swap the final attribute.
+        *owner_path, leaf = qualname.split(".")
+        try:
+            target = mod
+            for part in owner_path:
+                target = getattr(target, part)
+            attr = getattr(target, leaf, None)
+            if attr is not None and hasattr(attr, "__wrapped__"):
+                setattr(target, leaf, attr.__wrapped__)
+        except Exception as exc:  # noqa: BLE001
+            logger.debug(
+                "WebarenaInstrumentor: could not unwrap %s.%s: %s",
+                module,
+                qualname,
+                exc,
+            )
diff --git a/instrumentation-loongsuite/loongsuite-instrumentation-webarena/src/opentelemetry/instrumentation/webarena/config.py b/instrumentation-loongsuite/loongsuite-instrumentation-webarena/src/opentelemetry/instrumentation/webarena/config.py
new file mode 100644
index 000000000..870338425
--- /dev/null
+++ b/instrumentation-loongsuite/loongsuite-instrumentation-webarena/src/opentelemetry/instrumentation/webarena/config.py
@@ -0,0 +1,40 @@
+"""Configuration via environment variables."""
+
+from __future__ import annotations
+
+import os
+
+
+def _int_env(name: str, default: str) -> int:
+    """Return environment variable ``name`` parsed as an integer.
+
+    ``default`` (given as a string) is used when the variable is unset and
+    also when its value cannot be parsed by ``int``.
+    """
+    raw = os.getenv(name, default)
+    try:
+        parsed = int(raw)
+    except ValueError:
+        parsed = int(default)
+    return parsed
+
+
+def _bool_env(name: str, default: bool = False) -> bool:
+    """Interpret environment variable ``name`` as a boolean flag.
+
+    Returns ``default`` when the variable is unset. Otherwise the value is
+    stripped and lower-cased, and only ``1``, ``true``, ``yes`` and ``on``
+    count as true.
+    """
+    raw = os.getenv(name)
+    if raw is not None:
+        normalised = raw.strip().lower()
+        return normalised in ("1", "true", "yes", "on")
+    return default
+
+
+# Cap on non-content string attribute values (URLs, tool names, etc.).
+# Resolved from the environment once, when this module is imported; an
+# unset or unparsable value falls back to 1024.
+WEBARENA_OTEL_MAX_ATTR_LENGTH = _int_env(
+    "WEBARENA_OTEL_MAX_ATTR_LENGTH", "1024"
+)
+
+# Cap on prompt / message preview length when capture-message-content is on.
+# Resolved the same way at import time, with a fallback of 4096.
+WEBARENA_OTEL_PROMPT_PREVIEW_MAX_LEN = _int_env(
+    "WEBARENA_OTEL_PROMPT_PREVIEW_MAX_LEN", "4096"
+)
+
+
+def capture_message_content() -> bool:
+    """Report whether prompt / completion / tool argument bodies are recorded.
+
+    Driven by the ``OTEL_INSTRUMENTATION_GENAI_CAPTURE_MESSAGE_CONTENT``
+    environment variable, read on every call; capture is off when the
+    variable is unset.
+    """
+    flag_name = "OTEL_INSTRUMENTATION_GENAI_CAPTURE_MESSAGE_CONTENT"
+    return _bool_env(flag_name, False)
diff --git a/instrumentation-loongsuite/loongsuite-instrumentation-webarena/src/opentelemetry/instrumentation/webarena/internal/__init__.py b/instrumentation-loongsuite/loongsuite-instrumentation-webarena/src/opentelemetry/instrumentation/webarena/internal/__init__.py
new file mode 100644
index 000000000..e69de29bb
diff --git a/instrumentation-loongsuite/loongsuite-instrumentation-webarena/src/opentelemetry/instrumentation/webarena/internal/_attrs.py b/instrumentation-loongsuite/loongsuite-instrumentation-webarena/src/opentelemetry/instrumentation/webarena/internal/_attrs.py
new file mode 100644
index 000000000..28db7f0ca
--- /dev/null
+++ b/instrumentation-loongsuite/loongsuite-instrumentation-webarena/src/opentelemetry/instrumentation/webarena/internal/_attrs.py
@@ -0,0 +1,131 @@
+"""Attribute / span-name constants and helpers for WebArena spans."""
+
+from __future__ import annotations
+
+import json
+from typing import Any, Iterable
+
+from opentelemetry.instrumentation.webarena.config import (
+    WEBARENA_OTEL_MAX_ATTR_LENGTH,
+    WEBARENA_OTEL_PROMPT_PREVIEW_MAX_LEN,
+)
+
+# --- vendor-extended attribute names -----------------------------------
+
+GEN_AI_SPAN_KIND = "gen_ai.span.kind"
+GEN_AI_FRAMEWORK = "gen_ai.framework"
+GEN_AI_USAGE_TOTAL_TOKENS = "gen_ai.usage.total_tokens"
+GEN_AI_REACT_ROUND = "gen_ai.react.round"
+GEN_AI_REACT_FINISH_REASON = "gen_ai.react.finish_reason"
+
+# WebArena-specific attribute names
+WEBARENA_TASK_ID = "webarena.task.id"
+WEBARENA_SITES = "webarena.sites"
+WEBARENA_REQUIRE_LOGIN = "webarena.require_login"
+WEBARENA_OBSERVATION_TYPE = "webarena.observation_type"
+WEBARENA_ACTION_SET_TAG = "webarena.action_set_tag"
+WEBARENA_ACTION_TYPE = "webarena.action.type"
+WEBARENA_FAIL_ERROR = "webarena.fail_error"
+WEBARENA_PAGE_URL_BEFORE = "webarena.page.url.before"
+WEBARENA_PAGE_URL_AFTER = "webarena.page.url.after"
+WEBARENA_BROWSER_ELEMENT_ID = "webarena.browser.element_id"
+WEBARENA_OBSERVATION_MAIN_TYPE = "webarena.observation.main_type"
+WEBARENA_STEP_COUNT = "webarena.step.count"
+WEBARENA_TOOL_COUNT = "webarena.tool.count"
+WEBARENA_PARSING_FAILURE_COUNT = "webarena.parsing_failure.count"
+WEBARENA_PREVIOUS_ACTION = "webarena.previous_action"
+WEBARENA_MEMORY_TRAJECTORY_LENGTH = "webarena.memory.trajectory_length"
+WEBARENA_MEMORY_OBS_TEXT_LENGTH = "webarena.memory.obs_text_length"
+
+FRAMEWORK_NAME = "webarena"
+
+
+def truncate(value: str, max_len: int = WEBARENA_OTEL_MAX_ATTR_LENGTH) -> str:
+    """Trim a string attribute to at most ``max_len`` characters.
+
+    Args:
+        value: the text to cap. ``None`` yields ``""`` and any other
+            non-string is converted with ``str`` first.
+        max_len: maximum length of the result. Negative values are treated
+            as ``0``.
+
+    Returns:
+        ``value`` unchanged when it already fits; otherwise a prefix of it.
+        When ``max_len`` is greater than 3 the prefix ends in ``"..."`` and
+        the total length is exactly ``max_len``.
+    """
+    if value is None:
+        return ""
+    if not isinstance(value, str):
+        value = str(value)
+    # A negative cap (e.g. from a misconfigured environment variable) would
+    # turn the slice below into "drop the last N characters" and return
+    # almost the whole string; clamp it so the cap is always honoured.
+    max_len = max(max_len, 0)
+    if len(value) <= max_len:
+        return value
+    if max_len <= 3:
+        # No room for the ellipsis itself: return a bare prefix.
+        return value[:max_len]
+    return value[: max_len - 3] + "..."
+
+
+def truncate_content(value: str) -> str:
+    """Cap a body / message-style attribute.
+
+    Same trimming as ``truncate`` but with the prompt-preview limit
+    ``WEBARENA_OTEL_PROMPT_PREVIEW_MAX_LEN`` instead of the default cap.
+    """
+    limit = WEBARENA_OTEL_PROMPT_PREVIEW_MAX_LEN
+    return truncate(value, limit)
+
+
+def safe_json_dumps(value: Any, max_len: int | None = None) -> str:
+    """Serialise ``value`` to JSON text and cap its length.
+
+    Objects JSON cannot encode are passed through ``str``; if encoding
+    still fails, ``str(value)`` is used instead. The text is then trimmed
+    with ``truncate``, using its default cap when ``max_len`` is ``None``.
+    """
+    try:
+        encoded = json.dumps(value, ensure_ascii=False, default=str)
+    except Exception:  # noqa: BLE001
+        encoded = str(value)
+    return truncate(encoded) if max_len is None else truncate(encoded, max_len)
+
+
+def action_type_name(action: Any) -> str:
+    """Return a printable name for the ``action_type`` of an Action dict.
+
+    ``"UNKNOWN"`` is returned for non-dict input and for a missing or
+    ``None`` ``action_type``. A value exposing a truthy ``name`` attribute
+    yields that name; otherwise the value is looked up in
+    ``browser_env.actions.ActionTypes``, falling back to ``str(value)``
+    when that import or lookup fails.
+    """
+    raw = action.get("action_type") if isinstance(action, dict) else None
+    if raw is None:
+        return "UNKNOWN"
+    enum_name = getattr(raw, "name", None)
+    if enum_name:
+        return str(enum_name)
+    try:
+        from browser_env.actions import ActionTypes  # noqa: PLC0415
+        resolved = ActionTypes(raw).name
+    except Exception:  # noqa: BLE001
+        resolved = str(raw)
+    return resolved
+
+
+def action_arguments(action: Any) -> dict[str, Any]:
+    """Build a compact, JSON-friendly view of an Action dict.
+
+    Only a fixed allow-list of keys is copied, so bulky fields such as
+    ``coords``, ``raw_prediction`` and ``page_screenshot`` never reach the
+    serialised attribute. Entries whose value is ``None``, ``""``, ``[]``
+    or ``{}`` are omitted. The result always carries ``action_type`` as
+    resolved by ``action_type_name``; non-dict input yields ``{}``.
+    """
+    if not isinstance(action, dict):
+        return {}
+    wanted: Iterable[str] = (
+        "element_id",
+        "element_role",
+        "element_name",
+        "url",
+        "text",
+        "key_comb",
+        "direction",
+        "amount",
+        "answer",
+        "pw_code",
+        "nth",
+    )
+    empties = (None, "", [], {})
+    picked: dict[str, Any] = {"action_type": action_type_name(action)}
+    picked.update(
+        {
+            key: val
+            for key in wanted
+            if (val := action.get(key)) not in empties
+        }
+    )
+    return picked
+
+
+def messages_to_input_value(messages: Any) -> str:
+    """Render an LLM/agent prompt compactly for the ``input.value`` attribute.
+
+    Lists are JSON-encoded and capped at the prompt-preview length. Strings
+    are capped as-is. Anything else, and any list whose encoding raises, is
+    converted with ``str`` and then capped.
+    """
+    if isinstance(messages, list):
+        try:
+            return safe_json_dumps(
+                messages, max_len=WEBARENA_OTEL_PROMPT_PREVIEW_MAX_LEN
+            )
+        except Exception:  # noqa: BLE001
+            pass  # fall through to the plain-text rendering below
+    text = messages if isinstance(messages, str) else str(messages)
+    return truncate_content(text)
diff --git a/instrumentation-loongsuite/loongsuite-instrumentation-webarena/src/opentelemetry/instrumentation/webarena/internal/_state.py b/instrumentation-loongsuite/loongsuite-instrumentation-webarena/src/opentelemetry/instrumentation/webarena/internal/_state.py
new file mode 100644
index 000000000..8244ac403
--- /dev/null
+++ b/instrumentation-loongsuite/loongsuite-instrumentation-webarena/src/opentelemetry/instrumentation/webarena/internal/_state.py
@@ -0,0 +1,185 @@
+"""Lifecycle state shared across WebArena wrappers.
+
+WebArena's ``run.py:test()`` is a single function with a *for* loop over
+config files (one task each) and a nested *while* loop (one ReAct round
+each). It exposes no per-task hook, so we synthesise ENTRY / CHAIN / STEP
+spans by latching on to the boundaries that *do* exist:
+
+* ``ScriptBrowserEnv.reset(...)`` — first call after a task starts
+* ``ScriptBrowserEnv.close(...)`` — end of the whole batch
+* ``PromptAgent.next_action(...)`` — start of a new ReAct round
+* ``ScriptBrowserEnv.step(...)`` — execution of the picked action
+
+This module owns the ``ContextVar`` slots used to thread span handles
+between those wrappers in a single process / thread, and the helpers
+that close any spans that may still be open when an outer boundary
+fires.
+"""
+
+from __future__ import annotations
+
+from contextvars import ContextVar
+from typing import Any
+
+from opentelemetry import context as otel_context
+
+# Whether we are currently inside a WebArena task (between an env.reset
+# and the next env.reset / env.close). Used by the AGENT(invoke_agent)
+# wrapper to decide whether STEP rotation is meaningful.
+_in_task: ContextVar[bool] = ContextVar("webarena_in_task", default=False)
+
+# ENTRY span handle + its attached context token.
+_entry_span: ContextVar[Any] = ContextVar("webarena_entry_span", default=None)
+_entry_token: ContextVar[Any] = ContextVar("webarena_entry_token", default=None)
+
+# CHAIN(workflow) span handle + token (always nested inside ENTRY).
+_chain_span: ContextVar[Any] = ContextVar("webarena_chain_span", default=None)
+_chain_token: ContextVar[Any] = ContextVar("webarena_chain_token", default=None)
+
+# Currently active STEP span handle + token.
+_step_span: ContextVar[Any] = ContextVar("webarena_step_span", default=None)
+_step_token: ContextVar[Any] = ContextVar("webarena_step_token", default=None)
+
+# Per-task counters, used to populate STEP attributes / CHAIN summaries.
+_step_counter: ContextVar[int] = ContextVar("webarena_step_counter", default=0)
+_tool_counter: ContextVar[int] = ContextVar("webarena_tool_counter", default=0)
+_parsing_failure_counter: ContextVar[int] = ContextVar(
+    "webarena_parsing_failure_counter", default=0
+)
+
+
+def _detach_token(token: Any) -> None:
+    """Detach ``token`` from the OTel context, ignoring any failure.
+
+    A ``None`` token is a no-op.
+    """
+    if token is not None:
+        try:
+            otel_context.detach(token)
+        except Exception:  # noqa: BLE001
+            # Best effort: a failed detach must not break the caller.
+            pass
+
+
+def _close_slot(span_var: ContextVar[Any], token_var: ContextVar[Any]) -> None:
+    """End the span held in ``span_var`` and detach the token in ``token_var``.
+
+    Shared by ``end_step`` / ``end_chain`` / ``end_entry``. Both slots are
+    reset to ``None`` afterwards. Errors from ``span.end()`` are ignored so
+    that the slot is always cleared and the token always detached.
+    """
+    span = span_var.get(None)
+    token = token_var.get(None)
+    if span is not None:
+        try:
+            span.end()
+        except Exception:  # noqa: BLE001
+            pass
+    span_var.set(None)
+    _detach_token(token)
+    token_var.set(None)
+
+
+def end_step() -> int:
+    """Close the active STEP span (if any) and return the round number it had.
+
+    Returns ``0`` when no STEP was active, or when the round attribute
+    cannot be read from the span.
+    """
+    span = _step_span.get(None)
+    round_no = 0
+    if span is not None:
+        # Read the round before the span is ended by ``_close_slot``.
+        try:
+            round_no = int(span.attributes.get("gen_ai.react.round", 0))  # type: ignore[union-attr]
+        except Exception:  # noqa: BLE001
+            round_no = 0
+    _close_slot(_step_span, _step_token)
+    return round_no
+
+
+def end_chain() -> None:
+    """Close the active CHAIN span (if any) and detach its token."""
+    _close_slot(_chain_span, _chain_token)
+
+
+def end_entry() -> None:
+    """Close the active ENTRY span (if any) and detach its token."""
+    _close_slot(_entry_span, _entry_token)
+
+
+def end_task_spans() -> None:
+    """Tear down all per-task state.
+
+    Spans are closed innermost first (STEP, then CHAIN, then ENTRY); after
+    that the in-task flag is cleared and every per-task counter is zeroed.
+    """
+    for closer in (end_step, end_chain, end_entry):
+        closer()
+    _in_task.set(False)
+    for counter in (_step_counter, _tool_counter, _parsing_failure_counter):
+        counter.set(0)
+
+
+def in_task() -> bool:
+    """Report whether the in-task flag is currently set."""
+    active = _in_task.get(False)
+    return bool(active)
+
+
+def mark_in_task(value: bool) -> None:
+    """Store ``value`` as the in-task flag."""
+    _in_task.set(value)
+
+
+def set_entry(span: Any, token: Any) -> None:
+    """Record the ENTRY span and the context token attached for it."""
+    for slot, held in ((_entry_span, span), (_entry_token, token)):
+        slot.set(held)
+
+
+def set_chain(span: Any, token: Any) -> None:
+    """Record the CHAIN span and the context token attached for it."""
+    for slot, held in ((_chain_span, span), (_chain_token, token)):
+        slot.set(held)
+
+
+def set_step(span: Any, token: Any) -> None:
+    """Record the STEP span and the context token attached for it."""
+    for slot, held in ((_step_span, span), (_step_token, token)):
+        slot.set(held)
+
+
+def get_chain_span() -> Any:
+    """Return the stored CHAIN span, or ``None`` when there is none."""
+    current = _chain_span.get(None)
+    return current
+
+
+def get_entry_span() -> Any:
+    """Return the stored ENTRY span, or ``None`` when there is none."""
+    current = _entry_span.get(None)
+    return current
+
+
+def get_step_span() -> Any:
+    """Return the stored STEP span, or ``None`` when there is none."""
+    current = _step_span.get(None)
+    return current
+
+
+def increment_step() -> int:
+    """Add one to the per-task step counter and return the new value."""
+    updated = 1 + int(_step_counter.get(0))
+    _step_counter.set(updated)
+    return updated
+
+
+def increment_tool() -> int:
+    """Add one to the per-task tool counter and return the new value."""
+    updated = 1 + int(_tool_counter.get(0))
+    _tool_counter.set(updated)
+    return updated
+
+
+def increment_parsing_failure() -> int:
+    """Add one to the parsing-failure counter and return the new value."""
+    updated = 1 + int(_parsing_failure_counter.get(0))
+    _parsing_failure_counter.set(updated)
+    return updated
+
+
+def step_count() -> int:
+    """Return the current value of the per-task step counter."""
+    current = _step_counter.get(0)
+    return int(current)
+
+
+def tool_count() -> int:
+    """Return the current value of the per-task tool counter."""
+    current = _tool_counter.get(0)
+    return int(current)
+
+
+def parsing_failure_count() -> int:
+    """Return the current value of the parsing-failure counter."""
+    current = _parsing_failure_counter.get(0)
+    return int(current)
diff --git a/instrumentation-loongsuite/loongsuite-instrumentation-webarena/src/opentelemetry/instrumentation/webarena/internal/_wrappers.py b/instrumentation-loongsuite/loongsuite-instrumentation-webarena/src/opentelemetry/instrumentation/webarena/internal/_wrappers.py
new file mode 100644
index 000000000..2e87f4399
--- /dev/null
+++ b/instrumentation-loongsuite/loongsuite-instrumentation-webarena/src/opentelemetry/instrumentation/webarena/internal/_wrappers.py
@@ -0,0 +1,836 @@
+"""``wrapt`` hooks that emit WebArena GenAI spans.
+
+Span hierarchy (per task)::
+
+    ENTRY  webarena_task                       (env.reset)
+    └── CHAIN  workflow webarena_task          (env.reset)
+         ├── STEP  react step (round=N)        (next_action enter)
+         │    ├── AGENT  invoke_agent          (next_action body)
+         │    │    ├── TASK  build_prompt_context  (PromptConstructor.construct)
+         │    │    └── LLM   chat / text_completion
+         │    │              (OpenAI: produced by the OpenAI SDK probe;
+         │    │               HuggingFace: produced by this package via
+         │    │               ``generate_from_huggingface_completion``)
+         │    └── TOOL  execute_tool {action_type}   (env.step)
+         └── ...
+
+ENTRY/CHAIN/STEP boundaries are *not* present as discrete functions in
+WebArena, so we synthesise them by latching on to:
+
+* ``ScriptBrowserEnv.reset`` — open ENTRY/CHAIN (one task starts)
+* ``ScriptBrowserEnv.close`` — close any open spans (batch ends)
+* ``PromptAgent.next_action`` — rotate STEP (one ReAct round starts)
+
+A new STEP is closed lazily: by the next ``next_action`` call (next
+round) or by ``env.reset`` / ``env.close`` (next task / batch end).
+That makes us robust against early-stop / STOP-action paths in
+``run.py:test()`` where ``env.step`` is *not* called for the last
+round.
+"""
+
+from __future__ import annotations
+
+import hashlib
+import logging
+from typing import Any, Callable
+
+from opentelemetry import context as otel_context
+from opentelemetry import trace as trace_api
+from opentelemetry.semconv._incubating.attributes import (
+    gen_ai_attributes as GenAI,
+)
+from opentelemetry.trace import (
+    SpanKind,
+    Status,
+    StatusCode,
+    Tracer,
+    set_span_in_context,
+)
+
+from opentelemetry.instrumentation.webarena.config import (
+    capture_message_content,
+)
+from opentelemetry.instrumentation.webarena.internal import _state as state
+from opentelemetry.instrumentation.webarena.internal._attrs import (
+    FRAMEWORK_NAME,
+    GEN_AI_FRAMEWORK,
+    GEN_AI_REACT_FINISH_REASON,
+    GEN_AI_REACT_ROUND,
+    GEN_AI_SPAN_KIND,
+    WEBARENA_ACTION_SET_TAG,
+    WEBARENA_ACTION_TYPE,
+    WEBARENA_BROWSER_ELEMENT_ID,
+    WEBARENA_FAIL_ERROR,
+    WEBARENA_MEMORY_OBS_TEXT_LENGTH,
+    WEBARENA_MEMORY_TRAJECTORY_LENGTH,
+    WEBARENA_OBSERVATION_MAIN_TYPE,
+    WEBARENA_OBSERVATION_TYPE,
+    WEBARENA_PAGE_URL_AFTER,
+    WEBARENA_PAGE_URL_BEFORE,
+    WEBARENA_PARSING_FAILURE_COUNT,
+    WEBARENA_PREVIOUS_ACTION,
+    WEBARENA_REQUIRE_LOGIN,
+    WEBARENA_SITES,
+    WEBARENA_STEP_COUNT,
+    WEBARENA_TASK_ID,
+    WEBARENA_TOOL_COUNT,
+    action_arguments,
+    action_type_name,
+    messages_to_input_value,
+    safe_json_dumps,
+    truncate,
+    truncate_content,
+)
+
+logger = logging.getLogger(__name__)
+
+
+# ---------------------------------------------------------------------------
+# Generic helpers
+# ---------------------------------------------------------------------------
+
+
+def _read_config_file(options: dict[str, Any] | None) -> dict[str, Any] | None:
+    """Load the JSON task config named by ``options["config_file"]``.
+
+    Returns the parsed document when it is a JSON object. Returns ``None``
+    when ``options`` is empty or not a dict, when no config path is given,
+    when the file cannot be read or parsed, or when the document is not an
+    object.
+    """
+    if not options or not isinstance(options, dict):
+        return None
+    path = options.get("config_file")
+    if not path:
+        return None
+    try:
+        import json as _json  # noqa: PLC0415
+        with open(path, "r", encoding="utf-8") as handle:
+            loaded = _json.load(handle)
+    except Exception:  # noqa: BLE001
+        return None
+    return loaded if isinstance(loaded, dict) else None
+
+
+def _set_common_attrs(span: trace_api.Span, kind: str) -> None:
+    """Stamp ``span`` with its span kind and the framework name."""
+    shared = (
+        (GEN_AI_SPAN_KIND, kind),
+        (GEN_AI_FRAMEWORK, FRAMEWORK_NAME),
+    )
+    for key, val in shared:
+        span.set_attribute(key, val)
+
+
+
+# ---------------------------------------------------------------------------
+# ENTRY / CHAIN lifecycle (driven by ScriptBrowserEnv.reset / .close)
+# ---------------------------------------------------------------------------
+
+
+def _open_task_spans(
+    tracer: Tracer,
+    options: dict[str, Any] | None,
+) -> None:
+    """Begin the ENTRY and CHAIN spans of a new WebArena task.
+
+    Any spans still open from a previous task are finalised first. Task
+    metadata is read from the config file named in ``options``; when that
+    fails, the spans are still opened, just without task-specific
+    attributes. Both spans are attached to the OTel context and recorded
+    in ``state``.
+    """
+
+    # Flush the previous task (this also writes its summary attributes).
+    # Nothing happens when no task was open.
+    _close_task_spans()
+
+    task_cfg = _read_config_file(options) or {}
+    task_id = task_cfg.get("task_id")
+    has_task_id = task_id is not None
+    intent = task_cfg.get("intent") or ""
+    sites = task_cfg.get("sites") or []
+    require_login = bool(task_cfg.get("storage_state"))
+
+    entry_name = "enter webarena_task"
+    if has_task_id:
+        entry_name = f"enter webarena_task {task_id}"
+    entry_span = tracer.start_span(entry_name, kind=SpanKind.INTERNAL)
+    _set_common_attrs(entry_span, "ENTRY")
+    entry_span.set_attribute(GenAI.GEN_AI_OPERATION_NAME, "enter")
+    if has_task_id:
+        entry_span.set_attribute(WEBARENA_TASK_ID, str(task_id))
+        try:
+            entry_span.set_attribute(
+                GenAI.GEN_AI_CONVERSATION_ID, str(task_id)
+            )
+        except Exception:  # noqa: BLE001
+            pass
+    if sites:
+        entry_span.set_attribute(WEBARENA_SITES, safe_json_dumps(sites))
+    entry_span.set_attribute(WEBARENA_REQUIRE_LOGIN, require_login)
+    if intent and capture_message_content():
+        entry_span.set_attribute("input.value", truncate_content(intent))
+
+    # ENTRY must be the current span before CHAIN starts, so that CHAIN is
+    # created as its child.
+    state.set_entry(
+        entry_span, otel_context.attach(set_span_in_context(entry_span))
+    )
+
+    chain_span = tracer.start_span(
+        "workflow webarena_task", kind=SpanKind.INTERNAL
+    )
+    _set_common_attrs(chain_span, "CHAIN")
+    chain_span.set_attribute(GenAI.GEN_AI_OPERATION_NAME, "workflow")
+    if intent and capture_message_content():
+        chain_span.set_attribute("input.value", truncate_content(intent))
+    state.set_chain(
+        chain_span, otel_context.attach(set_span_in_context(chain_span))
+    )
+
+    state.mark_in_task(True)
+
+    # Mirror the task id onto the CHAIN span as well.
+    if has_task_id:
+        try:
+            chain_span.set_attribute(WEBARENA_TASK_ID, str(task_id))
+        except Exception:  # noqa: BLE001
+            pass
+
+
+def _close_task_spans() -> None:
+    """Write per-task summary attributes, then end all task spans.
+
+    CHAIN receives the step, tool and parsing-failure counts; ENTRY
+    receives the step count. Failures while writing attributes are
+    ignored so the spans are always closed.
+    """
+
+    chain_span = state.get_chain_span()
+    entry_span = state.get_entry_span()
+    step_total = state.step_count()
+    tool_total = state.tool_count()
+    failure_total = state.parsing_failure_count()
+    if chain_span is not None:
+        summary = (
+            (WEBARENA_STEP_COUNT, step_total),
+            (WEBARENA_TOOL_COUNT, tool_total),
+            (WEBARENA_PARSING_FAILURE_COUNT, failure_total),
+        )
+        try:
+            for key, val in summary:
+                chain_span.set_attribute(key, val)
+        except Exception:  # noqa: BLE001
+            pass
+    if entry_span is not None:
+        try:
+            entry_span.set_attribute(WEBARENA_STEP_COUNT, step_total)
+        except Exception:  # noqa: BLE001
+            pass
+    state.end_task_spans()
+
+
+# ---------------------------------------------------------------------------
+# ScriptBrowserEnv.reset / .close
+# ---------------------------------------------------------------------------
+
+
+class EnvResetWrapper:
+    """``wrapt`` hook for ``env.reset``: each call starts a new task.
+
+    ENTRY and CHAIN spans are opened before the wrapped call. If the call
+    raises, the exception is recorded on ENTRY, the task spans are closed
+    and the exception propagates unchanged.
+    """
+
+    __slots__ = ("_tracer",)
+
+    def __init__(self, tracer: Tracer) -> None:
+        self._tracer = tracer
+
+    def __call__(
+        self,
+        wrapped: Callable[..., Any],
+        instance: Any,
+        args: tuple[Any, ...],
+        kwargs: dict[str, Any],
+    ) -> Any:
+        _open_task_spans(self._tracer, kwargs.get("options"))
+        try:
+            result = wrapped(*args, **kwargs)
+        except BaseException as exc:
+            entry_span = state.get_entry_span()
+            if entry_span is not None:
+                try:
+                    entry_span.record_exception(exc)
+                    entry_span.set_status(Status(StatusCode.ERROR))
+                except Exception:  # noqa: BLE001
+                    pass
+            _close_task_spans()
+            raise
+        return result
+
+
+class EnvCloseWrapper:
+    """``wrapt`` hook for ``env.close``: finalise whatever is still open.
+
+    The task spans are closed after the wrapped call whether it returns or
+    raises.
+    """
+
+    __slots__ = ()
+
+    def __init__(self) -> None:
+        """Stateless hook; there is nothing to initialise."""
+
+    def __call__(
+        self,
+        wrapped: Callable[..., Any],
+        instance: Any,
+        args: tuple[Any, ...],
+        kwargs: dict[str, Any],
+    ) -> Any:
+        try:
+            result = wrapped(*args, **kwargs)
+        finally:
+            _close_task_spans()
+        return result
+
+
+# ---------------------------------------------------------------------------
+# PromptAgent.next_action  → AGENT(invoke_agent), drives STEP rotation
+# ---------------------------------------------------------------------------
+
+
+def _rotate_step(tracer: Tracer) -> trace_api.Span:
+    """Close the current STEP span and start the one for the next round.
+
+    The new span is tagged with the incremented round number, attached to
+    the OTel context, recorded in ``state`` and returned.
+    """
+    state.end_step()
+    current_round = state.increment_step()
+    new_step = tracer.start_span("react step", kind=SpanKind.INTERNAL)
+    _set_common_attrs(new_step, "STEP")
+    step_attrs = (
+        (GenAI.GEN_AI_OPERATION_NAME, "react"),
+        (GEN_AI_REACT_ROUND, current_round),
+    )
+    for key, val in step_attrs:
+        new_step.set_attribute(key, val)
+    state.set_step(
+        new_step, otel_context.attach(set_span_in_context(new_step))
+    )
+    return new_step
+
+
+class NextActionWrapper:
+    """Wrap ``PromptAgent.next_action`` as AGENT(invoke_agent).
+
+    Inside a task, every call first rotates the STEP span (one ReAct round
+    per ``next_action`` call) and then runs the wrapped method inside an
+    ``invoke_agent`` span. The resulting action type, or the exception type
+    on failure, is mirrored onto the current STEP span.
+    """
+
+    __slots__ = ("_tracer",)
+
+    def __init__(self, tracer: Tracer) -> None:
+        self._tracer = tracer
+
+    def __call__(
+        self,
+        wrapped: Callable[..., Any],
+        instance: Any,
+        args: tuple[Any, ...],
+        kwargs: dict[str, Any],
+    ) -> Any:
+        """Trace one ``next_action`` call and return its result unchanged.
+
+        Args:
+            wrapped: The original ``next_action`` callable.
+            instance: The agent the method is bound to.
+            args: Positional arguments; ``args[1]`` is read as the intent
+                and ``args[2]`` as the meta-data dict when present.
+            kwargs: Keyword arguments; ``intent`` and ``meta_data`` are
+                used as fallbacks.
+
+        Raises:
+            BaseException: Anything ``wrapped`` raises is recorded on the
+                AGENT span, tagged on the STEP span and re-raised.
+        """
+        # Each call to next_action begins a new ReAct round.
+        if state.in_task():
+            _rotate_step(self._tracer)
+
+        agent_class = instance.__class__.__name__
+        try:
+            instr_path = getattr(
+                instance.prompt_constructor, "instruction_path", None
+            )
+            instr_stem = (
+                getattr(instr_path, "stem", None) if instr_path else None
+            )
+        except Exception:  # noqa: BLE001
+            instr_stem = None
+        agent_name = (
+            f"{agent_class}:{instr_stem}" if instr_stem else agent_class
+        )
+
+        meta_data: dict[str, Any] = {}
+        if len(args) >= 3 and isinstance(args[2], dict):
+            meta_data = args[2]
+        elif isinstance(kwargs.get("meta_data"), dict):
+            meta_data = kwargs["meta_data"]
+
+        intent: str | None = None
+        if len(args) >= 2 and isinstance(args[1], str):
+            intent = args[1]
+        elif isinstance(kwargs.get("intent"), str):
+            intent = kwargs["intent"]
+
+        # The exception is recorded explicitly in the ``except`` below, so
+        # the context manager's own recording is switched off; otherwise
+        # every ``Exception`` would produce two identical span events.
+        with self._tracer.start_as_current_span(
+            f"invoke_agent {agent_class}",
+            kind=SpanKind.INTERNAL,
+            record_exception=False,
+            set_status_on_exception=False,
+        ) as span:
+            _set_common_attrs(span, "AGENT")
+            span.set_attribute(
+                GenAI.GEN_AI_OPERATION_NAME,
+                GenAI.GenAiOperationNameValues.INVOKE_AGENT.value,
+            )
+            span.set_attribute(GenAI.GEN_AI_AGENT_NAME, agent_name)
+            try:
+                lm_cfg = getattr(instance, "lm_config", None)
+                if lm_cfg is not None:
+                    provider = getattr(lm_cfg, "provider", None)
+                    model = getattr(lm_cfg, "model", None)
+                    if provider:
+                        span.set_attribute(
+                            GenAI.GEN_AI_PROVIDER_NAME, str(provider)
+                        )
+                    if model:
+                        span.set_attribute(
+                            GenAI.GEN_AI_REQUEST_MODEL, str(model)
+                        )
+            except Exception:  # noqa: BLE001
+                pass
+
+            # Last entry of the action history, or the literal "None".
+            previous = "None"
+            history = meta_data.get("action_history") if meta_data else None
+            if isinstance(history, list) and history:
+                previous = str(history[-1])
+            span.set_attribute(WEBARENA_PREVIOUS_ACTION, truncate(previous))
+
+            if intent and capture_message_content():
+                span.set_attribute("input.value", truncate_content(intent))
+
+            try:
+                action = wrapped(*args, **kwargs)
+            except BaseException as exc:
+                reason = type(exc).__qualname__
+                span.record_exception(exc)
+                span.set_status(
+                    Status(
+                        StatusCode.ERROR, f"{type(exc).__name__}: {exc}"
+                    )
+                )
+                span.set_attribute(GEN_AI_REACT_FINISH_REASON, reason)
+                # Tag STEP too, so the failing round is easy to spot.
+                step_span = state.get_step_span()
+                if step_span is not None:
+                    try:
+                        step_span.set_attribute(
+                            GEN_AI_REACT_FINISH_REASON, reason
+                        )
+                        step_span.set_status(Status(StatusCode.ERROR))
+                    except Exception:  # noqa: BLE001
+                        pass
+                raise
+
+            # Successful next_action: record action info, propagate to STEP.
+            atype = action_type_name(action)
+            span.set_attribute(WEBARENA_ACTION_TYPE, atype)
+            raw_pred = (
+                action.get("raw_prediction")
+                if isinstance(action, dict)
+                else None
+            )
+            if raw_pred and capture_message_content():
+                span.set_attribute(
+                    "output.value", truncate_content(str(raw_pred))
+                )
+
+            if atype == "NONE":
+                # PromptAgent fell through every retry of action parsing.
+                state.increment_parsing_failure()
+
+            step_span = state.get_step_span()
+            if step_span is not None:
+                try:
+                    step_span.set_attribute(WEBARENA_ACTION_TYPE, atype)
+                    if atype == "STOP":
+                        step_span.set_attribute(
+                            GEN_AI_REACT_FINISH_REASON, "stop"
+                        )
+                    elif atype == "NONE":
+                        step_span.set_attribute(
+                            GEN_AI_REACT_FINISH_REASON, "parse_failure"
+                        )
+                except Exception:  # noqa: BLE001
+                    pass
+
+            return action
+
+
+# ---------------------------------------------------------------------------
+# PromptConstructor.construct  →  TASK(build_prompt_context)
+# ---------------------------------------------------------------------------
+
+
+class PromptConstructWrapper:
+    """Emit a TASK span for each prompt-construction call.
+
+    The span carries the trajectory length, a small input summary (intent,
+    page URL, previous action), the size of the produced prompt and the
+    length of the latest observation for the constructor's modality.
+    """
+
+    __slots__ = ("_tracer",)
+
+    def __init__(self, tracer: Tracer) -> None:
+        self._tracer = tracer
+
+    def __call__(
+        self,
+        wrapped: Callable[..., Any],
+        instance: Any,
+        args: tuple[Any, ...],
+        kwargs: dict[str, Any],
+    ) -> Any:
+        """Run ``wrapped`` inside a TASK span and return its prompt.
+
+        Args:
+            wrapped: The original ``construct`` callable.
+            instance: The prompt constructor; ``obs_modality`` is read.
+            args: Positional arguments, read as
+                ``(trajectory, intent, meta_data)`` when present.
+            kwargs: Keyword arguments, used as fallbacks for the same
+                three names.
+
+        Raises:
+            BaseException: Anything ``wrapped`` raises, after it has been
+                recorded on the span.
+        """
+        trajectory = args[0] if len(args) >= 1 else kwargs.get("trajectory")
+        intent = args[1] if len(args) >= 2 else kwargs.get("intent")
+        if len(args) >= 3:
+            meta_data = args[2]
+        else:
+            meta_data = kwargs.get("meta_data") or {}
+
+        # The exception is recorded explicitly below; the context manager's
+        # own recording is disabled to avoid a duplicate exception event.
+        with self._tracer.start_as_current_span(
+            "run_task build_prompt_context",
+            kind=SpanKind.INTERNAL,
+            record_exception=False,
+            set_status_on_exception=False,
+        ) as span:
+            _set_common_attrs(span, "TASK")
+            span.set_attribute(GenAI.GEN_AI_OPERATION_NAME, "run_task")
+            span.set_attribute("webarena.task.name", "build_prompt_context")
+
+            try:
+                if trajectory is not None:
+                    span.set_attribute(
+                        WEBARENA_MEMORY_TRAJECTORY_LENGTH,
+                        int(len(trajectory)),
+                    )
+            except Exception:  # noqa: BLE001
+                pass
+
+            # Last entry of the action history, or the literal "None".
+            previous = "None"
+            if isinstance(meta_data, dict):
+                history = meta_data.get("action_history")
+                if isinstance(history, list) and history:
+                    previous = str(history[-1])
+
+            # URL of the page stored in the latest trajectory entry.
+            url_before = ""
+            try:
+                if (
+                    trajectory is not None
+                    and len(trajectory) > 0
+                    and isinstance(trajectory[-1], dict)
+                ):
+                    info = trajectory[-1].get("info") or {}
+                    page = info.get("page") if isinstance(info, dict) else None
+                    if page is not None and getattr(page, "url", None):
+                        url_before = str(page.url)
+            except Exception:  # noqa: BLE001
+                pass
+
+            if capture_message_content():
+                input_summary = {
+                    "intent": str(intent) if intent else "",
+                    "url": url_before,
+                    "previous_action": previous,
+                }
+                span.set_attribute(
+                    "input.value", safe_json_dumps(input_summary)
+                )
+
+            try:
+                prompt = wrapped(*args, **kwargs)
+            except BaseException as exc:
+                span.record_exception(exc)
+                span.set_status(
+                    Status(
+                        StatusCode.ERROR, f"{type(exc).__name__}: {exc}"
+                    )
+                )
+                raise
+
+            try:
+                if isinstance(prompt, list):
+                    span.set_attribute(
+                        "webarena.prompt.messages_count", len(prompt)
+                    )
+                elif isinstance(prompt, str):
+                    span.set_attribute(
+                        "webarena.prompt.length", len(prompt)
+                    )
+            except Exception:  # noqa: BLE001
+                pass
+
+            try:
+                obs_modality = getattr(instance, "obs_modality", None)
+                if (
+                    obs_modality
+                    and trajectory is not None
+                    and len(trajectory) > 0
+                    and isinstance(trajectory[-1], dict)
+                ):
+                    obs = trajectory[-1].get("observation")
+                    if isinstance(obs, dict) and obs_modality in obs:
+                        span.set_attribute(
+                            WEBARENA_MEMORY_OBS_TEXT_LENGTH,
+                            int(len(obs[obs_modality])),
+                        )
+            except Exception:  # noqa: BLE001
+                pass
+
+            # Best effort: the wrapped call already succeeded, so a failure
+            # while serialising the prompt must not reach the application.
+            try:
+                if capture_message_content():
+                    span.set_attribute(
+                        "output.value", messages_to_input_value(prompt)
+                    )
+            except Exception:  # noqa: BLE001
+                pass
+            return prompt
+
+
+# ---------------------------------------------------------------------------
+# ScriptBrowserEnv.step  →  TOOL(execute_tool)
+# ---------------------------------------------------------------------------
+
+
+class EnvStepWrapper:
+    """Wrap a single browser action execution as a TOOL span.
+
+    The span records the action type, the page URL before and after the
+    call, the success flag and error text read from the result tuple, and
+    marks the current STEP span as ``terminated`` when the result says so.
+    """
+
+    __slots__ = ("_tracer",)
+
+    def __init__(self, tracer: Tracer) -> None:
+        self._tracer = tracer
+
+    @staticmethod
+    def _page_url(instance: Any) -> str:
+        """Return ``instance.page.url`` as a string, or ``""`` if absent."""
+        try:
+            page = getattr(instance, "page", None)
+            if page is not None and getattr(page, "url", None):
+                return str(page.url)
+        except Exception:  # noqa: BLE001
+            pass
+        return ""
+
+    def __call__(
+        self,
+        wrapped: Callable[..., Any],
+        instance: Any,
+        args: tuple[Any, ...],
+        kwargs: dict[str, Any],
+    ) -> Any:
+        """Run ``wrapped`` inside a TOOL span and return its result.
+
+        Args:
+            wrapped: The original ``step`` callable.
+            instance: The environment; ``page`` and
+                ``main_observation_type`` are read from it.
+            args: Positional arguments; ``args[0]`` is the action.
+            kwargs: Keyword arguments; ``action`` is the fallback.
+
+        Raises:
+            BaseException: Anything ``wrapped`` raises, after it has been
+                recorded on the span.
+        """
+        action = args[0] if args else kwargs.get("action")
+        atype = action_type_name(action)
+        url_before = self._page_url(instance)
+
+        # The exception is recorded explicitly below; the context manager's
+        # own recording is disabled to avoid a duplicate exception event.
+        with self._tracer.start_as_current_span(
+            f"execute_tool {atype}",
+            kind=SpanKind.INTERNAL,
+            record_exception=False,
+            set_status_on_exception=False,
+        ) as span:
+            _set_common_attrs(span, "TOOL")
+            span.set_attribute(
+                GenAI.GEN_AI_OPERATION_NAME,
+                GenAI.GenAiOperationNameValues.EXECUTE_TOOL.value,
+            )
+            span.set_attribute(GenAI.GEN_AI_TOOL_NAME, atype)
+            span.set_attribute(GenAI.GEN_AI_TOOL_TYPE, "browser_action")
+            if url_before:
+                span.set_attribute(
+                    WEBARENA_PAGE_URL_BEFORE, truncate(url_before)
+                )
+
+            try:
+                main_obs = getattr(instance, "main_observation_type", None)
+                if main_obs:
+                    span.set_attribute(
+                        WEBARENA_OBSERVATION_MAIN_TYPE, str(main_obs)
+                    )
+            except Exception:  # noqa: BLE001
+                pass
+
+            if isinstance(action, dict):
+                eid = action.get("element_id")
+                if eid:
+                    span.set_attribute(
+                        WEBARENA_BROWSER_ELEMENT_ID, str(eid)
+                    )
+
+            if capture_message_content():
+                span.set_attribute(
+                    GenAI.GEN_AI_TOOL_CALL_ARGUMENTS,
+                    safe_json_dumps(action_arguments(action)),
+                )
+
+            # Counted before the call, so failing executions count as well.
+            state.increment_tool()
+
+            try:
+                result = wrapped(*args, **kwargs)
+            except BaseException as exc:
+                span.record_exception(exc)
+                span.set_status(
+                    Status(
+                        StatusCode.ERROR, f"{type(exc).__name__}: {exc}"
+                    )
+                )
+                raise
+
+            url_after = self._page_url(instance)
+            if url_after:
+                span.set_attribute(
+                    WEBARENA_PAGE_URL_AFTER, truncate(url_after)
+                )
+
+            # Only a tuple of at least five items is inspected: index 1 is
+            # read as success, index 2 as terminated and index 4 as info.
+            success = False
+            fail_error = ""
+            terminated = False
+            if isinstance(result, tuple) and len(result) >= 5:
+                try:
+                    success = bool(result[1])
+                    terminated = bool(result[2])
+                    info = result[4] or {}
+                    if isinstance(info, dict):
+                        fail_error = str(info.get("fail_error") or "")
+                except Exception:  # noqa: BLE001
+                    pass
+
+            span.set_attribute("webarena.tool.success", success)
+            if fail_error:
+                span.set_attribute(WEBARENA_FAIL_ERROR, truncate(fail_error))
+                span.set_status(Status(StatusCode.ERROR, fail_error))
+
+            if capture_message_content():
+                span.set_attribute(
+                    GenAI.GEN_AI_TOOL_CALL_RESULT,
+                    safe_json_dumps(
+                        {
+                            "success": success,
+                            "fail_error": fail_error,
+                            "url_after": url_after,
+                            "terminated": terminated,
+                        }
+                    ),
+                )
+
+            step_span = state.get_step_span()
+            if step_span is not None and terminated:
+                try:
+                    step_span.set_attribute(
+                        GEN_AI_REACT_FINISH_REASON, "terminated"
+                    )
+                except Exception:  # noqa: BLE001
+                    pass
+
+            return result
+
+
+# ---------------------------------------------------------------------------
+# construct_agent  →  AGENT(create_agent)
+# ---------------------------------------------------------------------------
+
+
+class ConstructAgentWrapper:
+    """Wrap the agent factory as a one-shot AGENT(create_agent) span.
+
+    Agent settings are read from the first positional argument (or the
+    ``args`` keyword) via ``getattr``, so missing attributes fall back to
+    empty values instead of failing.
+    """
+
+    __slots__ = ("_tracer",)
+
+    def __init__(self, tracer: Tracer) -> None:
+        self._tracer = tracer
+
+    def __call__(
+        self,
+        wrapped: Callable[..., Any],
+        instance: Any,
+        args: tuple[Any, ...],
+        kwargs: dict[str, Any],
+    ) -> Any:
+        """Run ``wrapped`` inside a ``create_agent`` span.
+
+        Returns:
+            Whatever ``wrapped`` returns.
+
+        Raises:
+            BaseException: Anything ``wrapped`` raises, after it has been
+                recorded on the span.
+        """
+        ns_args = args[0] if args else kwargs.get("args")
+        agent_type = getattr(ns_args, "agent_type", None) or "unknown"
+        provider = getattr(ns_args, "provider", None) or ""
+        model = getattr(ns_args, "model", None) or ""
+        instr_path = getattr(ns_args, "instruction_path", None) or ""
+        action_set = getattr(ns_args, "action_set_tag", None) or ""
+
+        # The exception is recorded explicitly below; the context manager's
+        # own recording is disabled to avoid a duplicate exception event.
+        with self._tracer.start_as_current_span(
+            f"create_agent {FRAMEWORK_NAME}",
+            kind=SpanKind.INTERNAL,
+            record_exception=False,
+            set_status_on_exception=False,
+        ) as span:
+            _set_common_attrs(span, "AGENT")
+            span.set_attribute(
+                GenAI.GEN_AI_OPERATION_NAME, "create_agent"
+            )
+            span.set_attribute(
+                GenAI.GEN_AI_AGENT_NAME,
+                truncate(f"{agent_type}:{instr_path}"),
+            )
+            span.set_attribute(
+                GenAI.GEN_AI_AGENT_DESCRIPTION,
+                truncate(
+                    f"provider={provider}, model={model}, action_set={action_set}"
+                ),
+            )
+            try:
+                # The digest is only a short, stable identifier, not a
+                # security measure; saying so keeps md5 usable on
+                # FIPS-restricted Python builds.
+                fingerprint = f"{provider}:{model}:{instr_path}:{action_set}"
+                aid = hashlib.md5(
+                    fingerprint.encode("utf-8"), usedforsecurity=False
+                ).hexdigest()[:16]
+                span.set_attribute(GenAI.GEN_AI_AGENT_ID, aid)
+            except Exception:  # noqa: BLE001
+                pass
+            if provider:
+                span.set_attribute(GenAI.GEN_AI_PROVIDER_NAME, str(provider))
+            if model:
+                span.set_attribute(GenAI.GEN_AI_REQUEST_MODEL, str(model))
+            if action_set:
+                span.set_attribute(WEBARENA_ACTION_SET_TAG, str(action_set))
+            obs_type = getattr(ns_args, "observation_type", None)
+            if obs_type:
+                span.set_attribute(WEBARENA_OBSERVATION_TYPE, str(obs_type))
+
+            try:
+                return wrapped(*args, **kwargs)
+            except BaseException as exc:
+                span.record_exception(exc)
+                span.set_status(
+                    Status(
+                        StatusCode.ERROR, f"{type(exc).__name__}: {exc}"
+                    )
+                )
+                raise
+
+
+# ---------------------------------------------------------------------------
+# generate_from_huggingface_completion  →  LLM(text_completion)
+# ---------------------------------------------------------------------------
+
+
+class HuggingFaceCompletionWrapper:
+    """LLM span for the only WebArena LLM call **not** going through OpenAI SDK.
+
+    Request parameters are read from the call arguments and attached to a
+    ``text_completion`` span; prompt and generation text are attached only
+    when content capture is enabled.
+    """
+
+    __slots__ = ("_tracer",)
+
+    def __init__(self, tracer: Tracer) -> None:
+        self._tracer = tracer
+
+    def __call__(
+        self,
+        wrapped: Callable[..., Any],
+        instance: Any,
+        args: tuple[Any, ...],
+        kwargs: dict[str, Any],
+    ) -> Any:
+        """Run ``wrapped`` inside an LLM span and return its generation.
+
+        Raises:
+            BaseException: Anything ``wrapped`` raises, after it has been
+                recorded on the span.
+        """
+        # Signature:
+        # generate_from_huggingface_completion(
+        #     prompt, model_endpoint, temperature, top_p, max_new_tokens,
+        #     stop_sequences=None,
+        # )
+        def _arg(idx: int, name: str, default: Any = None) -> Any:
+            """Fetch a parameter by position first, then by keyword."""
+            if len(args) > idx:
+                return args[idx]
+            return kwargs.get(name, default)
+
+        prompt = _arg(0, "prompt", "")
+        model_endpoint = _arg(1, "model_endpoint", "")
+        temperature = _arg(2, "temperature")
+        top_p = _arg(3, "top_p")
+        max_new_tokens = _arg(4, "max_new_tokens")
+        stop_sequences = _arg(5, "stop_sequences")
+
+        # The exception is recorded explicitly below; the context manager's
+        # own recording is disabled to avoid a duplicate exception event.
+        with self._tracer.start_as_current_span(
+            f"text_completion {model_endpoint or 'huggingface'}",
+            kind=SpanKind.CLIENT,
+            record_exception=False,
+            set_status_on_exception=False,
+        ) as span:
+            _set_common_attrs(span, "LLM")
+            span.set_attribute(
+                GenAI.GEN_AI_OPERATION_NAME,
+                GenAI.GenAiOperationNameValues.TEXT_COMPLETION.value,
+            )
+            span.set_attribute(GenAI.GEN_AI_PROVIDER_NAME, "huggingface")
+            if model_endpoint:
+                span.set_attribute(
+                    GenAI.GEN_AI_REQUEST_MODEL, str(model_endpoint)
+                )
+                span.set_attribute(
+                    GenAI.GEN_AI_RESPONSE_MODEL, str(model_endpoint)
+                )
+
+            # Each value is converted on its own so that one malformed
+            # parameter does not suppress the remaining attributes.
+            for attr_key, raw_value, convert in (
+                (GenAI.GEN_AI_REQUEST_TEMPERATURE, temperature, float),
+                (GenAI.GEN_AI_REQUEST_TOP_P, top_p, float),
+                (GenAI.GEN_AI_REQUEST_MAX_TOKENS, max_new_tokens, int),
+            ):
+                if raw_value is None:
+                    continue
+                try:
+                    span.set_attribute(attr_key, convert(raw_value))
+                except (TypeError, ValueError):
+                    continue
+
+            if stop_sequences:
+                try:
+                    # A bare string is one stop sequence; ``list()`` would
+                    # split it into single characters.
+                    if isinstance(stop_sequences, str):
+                        stops = [stop_sequences]
+                    else:
+                        stops = list(stop_sequences)
+                    span.set_attribute(
+                        GenAI.GEN_AI_REQUEST_STOP_SEQUENCES, stops
+                    )
+                except Exception:  # noqa: BLE001
+                    pass
+            if capture_message_content() and isinstance(prompt, str) and prompt:
+                span.set_attribute(
+                    "input.value", truncate_content(prompt)
+                )
+
+            try:
+                generation = wrapped(*args, **kwargs)
+            except BaseException as exc:
+                span.record_exception(exc)
+                span.set_status(
+                    Status(
+                        StatusCode.ERROR, f"{type(exc).__name__}: {exc}"
+                    )
+                )
+                raise
+
+            if capture_message_content() and isinstance(generation, str):
+                span.set_attribute(
+                    "output.value", truncate_content(generation)
+                )
+            span.set_attribute("gen_ai.output.type", "text")
+
+            return generation
+
+
+# Wrapper classes exported as this module's public API.
+__all__ = [
+    "ConstructAgentWrapper",
+    "EnvCloseWrapper",
+    "EnvResetWrapper",
+    "EnvStepWrapper",
+    "HuggingFaceCompletionWrapper",
+    "NextActionWrapper",
+    "PromptConstructWrapper",
+]
diff --git a/instrumentation-loongsuite/loongsuite-instrumentation-webarena/src/opentelemetry/instrumentation/webarena/package.py b/instrumentation-loongsuite/loongsuite-instrumentation-webarena/src/opentelemetry/instrumentation/webarena/package.py
new file mode 100644
index 000000000..63ff43cff
--- /dev/null
+++ b/instrumentation-loongsuite/loongsuite-instrumentation-webarena/src/opentelemetry/instrumentation/webarena/package.py
@@ -0,0 +1,3 @@
+# Requirement specifier(s) for the target package of this instrumentation.
+# NOTE(review): presumably returned by the instrumentor's
+# ``instrumentation_dependencies()`` - confirm in the package __init__.
+_instruments = ("webarena >= 0.0.1",)
+
+# This instrumentation does not declare metrics support.
+_supports_metrics = False
diff --git a/instrumentation-loongsuite/loongsuite-instrumentation-webarena/src/opentelemetry/instrumentation/webarena/version.py b/instrumentation-loongsuite/loongsuite-instrumentation-webarena/src/opentelemetry/instrumentation/webarena/version.py
new file mode 100644
index 000000000..3dc1f76bc
--- /dev/null
+++ b/instrumentation-loongsuite/loongsuite-instrumentation-webarena/src/opentelemetry/instrumentation/webarena/version.py
@@ -0,0 +1 @@
+# Version string of the webarena instrumentation package.
+__version__ = "0.1.0"
diff --git a/instrumentation-loongsuite/loongsuite-instrumentation-widesearch/README.md b/instrumentation-loongsuite/loongsuite-instrumentation-widesearch/README.md
new file mode 100644
index 000000000..4b4aac443
--- /dev/null
+++ b/instrumentation-loongsuite/loongsuite-instrumentation-widesearch/README.md
@@ -0,0 +1,17 @@
+# LoongSuite WideSearch Instrumentation
+
+OpenTelemetry instrumentation for the [WideSearch](https://github.com/ByteDance-Seed/WideSearch) multi-agent search framework.
+
+## Installation
+
+```bash
+pip install loongsuite-instrumentation-widesearch
+```
+
+## Usage
+
+```python
+from opentelemetry.instrumentation.widesearch import WideSearchInstrumentor
+
+WideSearchInstrumentor().instrument()
+```
diff --git a/instrumentation-loongsuite/loongsuite-instrumentation-widesearch/pyproject.toml b/instrumentation-loongsuite/loongsuite-instrumentation-widesearch/pyproject.toml
new file mode 100644
index 000000000..9a819d25a
--- /dev/null
+++ b/instrumentation-loongsuite/loongsuite-instrumentation-widesearch/pyproject.toml
@@ -0,0 +1,57 @@
+[build-system]
+requires = ["hatchling"]
+build-backend = "hatchling.build"
+
+[project]
+name = "loongsuite-instrumentation-widesearch"
+dynamic = ["version"]
+description = "LoongSuite WideSearch Instrumentation"
+readme = "README.md"
+license = "Apache-2.0"
+requires-python = ">=3.11"
+authors = [
+    { name = "LoongSuite Python Agent Authors", email = "caishipeng.csp@alibaba-inc.com" },
+    { name = "OpenTelemetry Authors", email = "cncf-opentelemetry-contributors@lists.cncf.io" },
+]
+classifiers = [
+    "Development Status :: 4 - Beta",
+    "Intended Audience :: Developers",
+    "License :: OSI Approved :: Apache Software License",
+    "Programming Language :: Python",
+    "Programming Language :: Python :: 3",
+    "Programming Language :: Python :: 3.11",
+    "Programming Language :: Python :: 3.12",
+    "Programming Language :: Python :: 3.13",
+]
+dependencies = [
+    "opentelemetry-api ~= 1.37",
+    "opentelemetry-instrumentation >= 0.58b0",
+    "opentelemetry-semantic-conventions >= 0.58b0",
+    "opentelemetry-util-genai",
+    "wrapt >= 1.17.3, < 2.0.0",
+]
+
+[project.optional-dependencies]
+instruments = [
+    "widesearch >= 0.1.0",
+]
+test = [
+    "pytest ~= 8.0",
+    "pytest-cov ~= 4.1.0",
+]
+
+[project.entry-points.opentelemetry_instrumentor]
+widesearch = "opentelemetry.instrumentation.widesearch:WideSearchInstrumentor"
+
+[project.urls]
+Homepage = "https://github.com/alibaba/loongsuite-python-agent/tree/main/instrumentation-loongsuite/loongsuite-instrumentation-widesearch"
+Repository = "https://github.com/alibaba/loongsuite-python-agent"
+
+[tool.hatch.version]
+path = "src/opentelemetry/instrumentation/widesearch/version.py"
+
+[tool.hatch.build.targets.sdist]
+include = ["/src", "/tests"]
+
+[tool.hatch.build.targets.wheel]
+packages = ["src/opentelemetry"]
diff --git a/instrumentation-loongsuite/loongsuite-instrumentation-widesearch/src/opentelemetry/instrumentation/widesearch/__init__.py b/instrumentation-loongsuite/loongsuite-instrumentation-widesearch/src/opentelemetry/instrumentation/widesearch/__init__.py
new file mode 100644
index 000000000..9c441d18f
--- /dev/null
+++ b/instrumentation-loongsuite/loongsuite-instrumentation-widesearch/src/opentelemetry/instrumentation/widesearch/__init__.py
@@ -0,0 +1,164 @@
+"""
+WideSearch instrumentation supporting `widesearch >= 0.1.0`.
+
+Usage
+-----
+.. code:: python
+
+    from opentelemetry.instrumentation.widesearch import WideSearchInstrumentor
+
+    WideSearchInstrumentor().instrument()
+
+API
+---
+"""
+
+from __future__ import annotations
+
+import logging
+from typing import Any, Collection
+
+from wrapt import wrap_function_wrapper
+
+from opentelemetry.instrumentation.instrumentor import BaseInstrumentor
+from opentelemetry.instrumentation.utils import unwrap
+from opentelemetry.instrumentation.widesearch.package import _instruments
+from opentelemetry.instrumentation.widesearch.patch import (
+    wrap_create_sub_agents_factory,
+    wrap_invoke_tool_call,
+    wrap_run_single_query,
+    wrap_runner_run,
+    wrap_runner_step,
+)
+from opentelemetry.instrumentation.widesearch.version import __version__
+from opentelemetry.util.genai.extended_handler import ExtendedTelemetryHandler
+
+logger = logging.getLogger(__name__)
+
+# Import paths of the modules whose callables are wrapped; they are passed
+# as ``module=`` to ``wrap_function_wrapper`` in ``_instrument``.
+_RUN_MODULE = "src.agent.run"
+_MULTI_AGENT_MODULE = "src.agent.multi_agent_tools"
+
+__all__ = ["WideSearchInstrumentor", "__version__"]
+
+
+class WideSearchInstrumentor(BaseInstrumentor):
+    """OpenTelemetry instrumentor for WideSearch framework.
+
+    Instruments the following components:
+    - run_single_query(): ENTRY span
+    - Runner.run(): AGENT span (async generator)
+    - Runner._step(): STEP span
+    - Runner._invoke_tool_call(): TOOL spans
+    - create_sub_agents_wrap(): TASK span
+
+    Every target is wrapped independently: a failure to wrap one of them is
+    logged as a warning and does not prevent the others from being wrapped.
+    """
+
+    def __init__(self):
+        super().__init__()
+        # Telemetry handler shared by all wrappers; set in ``_instrument``.
+        self._handler = None
+
+    def instrumentation_dependencies(self) -> Collection[str]:
+        """Return the requirement specifiers of the instrumented package."""
+        return _instruments
+
+    def _make_wrapper(self, patch: Any) -> Any:
+        """Adapt ``patch`` to the four-argument wrapt wrapper signature.
+
+        ``self._handler`` is looked up at call time, so the wrapper always
+        uses the handler that is current when the wrapped callable runs.
+        """
+
+        def _wrapper(wrapped, instance, args, kwargs):
+            return patch(
+                wrapped, instance, args, kwargs, handler=self._handler
+            )
+
+        return _wrapper
+
+    def _instrument(self, **kwargs: Any) -> None:
+        """Create the telemetry handler and wrap all WideSearch targets.
+
+        Args:
+            **kwargs: Optional ``tracer_provider``, ``meter_provider`` and
+                ``logger_provider`` forwarded to the handler.
+        """
+        self._handler = ExtendedTelemetryHandler(
+            tracer_provider=kwargs.get("tracer_provider"),
+            meter_provider=kwargs.get("meter_provider"),
+            logger_provider=kwargs.get("logger_provider"),
+        )
+
+        # (module path, attribute path, patch function), H1..H5 in order:
+        # ENTRY, AGENT, STEP, TOOL and TASK (factory) spans.
+        targets = (
+            (_RUN_MODULE, "run_single_query", wrap_run_single_query),
+            (_RUN_MODULE, "Runner.run", wrap_runner_run),
+            (_RUN_MODULE, "Runner._step", wrap_runner_step),
+            (_RUN_MODULE, "Runner._invoke_tool_call", wrap_invoke_tool_call),
+            (
+                _MULTI_AGENT_MODULE,
+                "create_sub_agents_wrap",
+                wrap_create_sub_agents_factory,
+            ),
+        )
+        for module, name, patch in targets:
+            try:
+                wrap_function_wrapper(
+                    module=module,
+                    name=name,
+                    wrapper=self._make_wrapper(patch),
+                )
+                logger.debug("Instrumented %s", name)
+            except Exception as e:
+                logger.warning("Failed to instrument %s: %s", name, e)
+
+    @staticmethod
+    def _unwrap_each(label: str, targets: Any) -> None:
+        """Unwrap every ``(owner, attribute)`` pair in ``targets``.
+
+        A failure on one pair is logged and the remaining pairs are still
+        processed.
+        """
+        failed = False
+        for owner, attr in targets:
+            try:
+                unwrap(owner, attr)
+            except Exception as e:
+                failed = True
+                logger.warning("Failed to uninstrument %s: %s", label, e)
+        if not failed:
+            logger.debug("Uninstrumented %s", label)
+
+    def _uninstrument(self, **kwargs: Any) -> None:
+        """Remove all wrappers installed by ``_instrument``."""
+        try:
+            import src.agent.run as run_module  # noqa: PLC0415
+
+            runner = run_module.Runner
+        except Exception as e:
+            logger.warning("Failed to uninstrument src.agent.run: %s", e)
+        else:
+            self._unwrap_each(
+                "src.agent.run",
+                (
+                    (run_module, "run_single_query"),
+                    (runner, "run"),
+                    (runner, "_step"),
+                    (runner, "_invoke_tool_call"),
+                ),
+            )
+
+        try:
+            import src.agent.multi_agent_tools as tools_module  # noqa: PLC0415
+        except Exception as e:
+            logger.warning(
+                "Failed to uninstrument src.agent.multi_agent_tools: %s", e
+            )
+        else:
+            self._unwrap_each(
+                "src.agent.multi_agent_tools",
+                ((tools_module, "create_sub_agents_wrap"),),
+            )
+
+        self._handler = None
diff --git a/instrumentation-loongsuite/loongsuite-instrumentation-widesearch/src/opentelemetry/instrumentation/widesearch/package.py b/instrumentation-loongsuite/loongsuite-instrumentation-widesearch/src/opentelemetry/instrumentation/widesearch/package.py
new file mode 100644
index 000000000..bd0572292
--- /dev/null
+++ b/instrumentation-loongsuite/loongsuite-instrumentation-widesearch/src/opentelemetry/instrumentation/widesearch/package.py
@@ -0,0 +1,2 @@
+# Requirement specifier(s) of the package this instrumentation targets.
+_instruments = ("widesearch >= 0.1.0",)
+# This instrumentation does not declare metrics support.
+_supports_metrics = False
diff --git a/instrumentation-loongsuite/loongsuite-instrumentation-widesearch/src/opentelemetry/instrumentation/widesearch/patch.py b/instrumentation-loongsuite/loongsuite-instrumentation-widesearch/src/opentelemetry/instrumentation/widesearch/patch.py
new file mode 100644
index 000000000..0813a7c8e
--- /dev/null
+++ b/instrumentation-loongsuite/loongsuite-instrumentation-widesearch/src/opentelemetry/instrumentation/widesearch/patch.py
@@ -0,0 +1,348 @@
+"""Patch functions for WideSearch instrumentation.
+
+Wraps key WideSearch methods to generate OpenTelemetry spans:
+- run_single_query -> ENTRY span
+- Runner.run -> AGENT span (async generator)
+- Runner._step -> STEP span
+- Runner._invoke_tool_call -> TOOL spans (one per tool_call)
+- create_sub_agents_wrap -> TASK span (on returned closure)
+"""
+
+from __future__ import annotations
+
+import asyncio
+import json
+import logging
+from contextvars import ContextVar
+
+from opentelemetry.trace import SpanKind, StatusCode
+from opentelemetry.trace.status import Status
+from opentelemetry.util.genai.extended_handler import ExtendedTelemetryHandler
+from opentelemetry.util.genai.extended_types import ReactStepInvocation
+from opentelemetry.util.genai.types import Error
+
+from .utils import (
+    _create_agent_invocation,
+    _create_entry_invocation,
+    _create_tool_invocation,
+    _extract_output_messages,
+    _step_to_output_messages,
+)
+
+logger = logging.getLogger(__name__)
+
+_step_counter: ContextVar[int] = ContextVar("ws_step_counter", default=0)
+_in_run_single_query: ContextVar[bool] = ContextVar("ws_in_rsq", default=False)
+
+
+async def wrap_run_single_query(
+    wrapped, instance, args, kwargs, *, handler: ExtendedTelemetryHandler
+):
+    """H1: ENTRY span for run_single_query."""
+    if _in_run_single_query.get():
+        return await wrapped(*args, **kwargs)
+    token = _in_run_single_query.set(True)
+
+    query = args[0] if args else kwargs.get("query", "")
+    system_prompt = kwargs.get("system_prompt") or ""
+    tools_desc_kw = kwargs.get("tools_desc")
+    try:
+        invocation = _create_entry_invocation(
+            query,
+            system_prompt=system_prompt or None,
+            tools_desc=(
+                tools_desc_kw if isinstance(tools_desc_kw, list) else None
+            ),
+        )
+    except Exception as e:
+        logger.debug(f"Failed to create entry invocation: {e}")
+        _in_run_single_query.reset(token)
+        return await wrapped(*args, **kwargs)
+
+    handler.start_entry(invocation)
+
+    try:
+        result = await wrapped(*args, **kwargs)
+        invocation.output_messages = _extract_output_messages(result)
+        handler.stop_entry(invocation)
+        return result
+    except Exception as e:
+        handler.fail_entry(invocation, Error(message=str(e), type=type(e)))
+        raise
+    finally:
+        _in_run_single_query.reset(token)
+
+
+async def wrap_runner_run(
+    wrapped, instance, args, kwargs, *, handler: ExtendedTelemetryHandler
+):
+    """H2: AGENT span for Runner.run (async generator)."""
+    starting_agent = args[0] if args else kwargs.get("starting_agent")
+    user_input = args[1] if len(args) > 1 else kwargs.get("user_input", "")
+    memory = args[2] if len(args) > 2 else kwargs.get("memory")
+    system_prompt = getattr(memory, "system_instructions", None)
+
+    try:
+        invocation = _create_agent_invocation(
+            starting_agent, user_input, system_prompt=system_prompt
+        )
+    except Exception as e:
+        logger.debug(f"Failed to create agent invocation: {e}")
+        async for step in wrapped(*args, **kwargs):
+            yield step
+        return
+
+    counter_token = _step_counter.set(0)
+    handler.start_invoke_agent(invocation)
+
+    try:
+        last_step = None
+        async for step in wrapped(*args, **kwargs):
+            last_step = step
+            yield step
+
+        if last_step:
+            invocation.output_messages = _step_to_output_messages(last_step)
+        handler.stop_invoke_agent(invocation)
+    except GeneratorExit as e:
+        handler.fail_invoke_agent(
+            invocation, Error(message="GeneratorExit", type=GeneratorExit)
+        )
+        raise
+    except Exception as e:
+        handler.fail_invoke_agent(
+            invocation, Error(message=str(e), type=type(e))
+        )
+        raise
+    finally:
+        _step_counter.reset(counter_token)
+
+
+async def wrap_runner_step(
+    wrapped, instance, args, kwargs, *, handler: ExtendedTelemetryHandler
+):
+    """H3: STEP span for Runner._step."""
+    step_num = _step_counter.get() + 1
+    _step_counter.set(step_num)
+
+    invocation = ReactStepInvocation(round=step_num)
+    invocation.attributes["gen_ai.framework"] = "widesearch"
+
+    try:
+        handler.start_react_step(invocation)
+    except Exception as e:
+        logger.debug(f"Failed to start react step: {e}")
+        return await wrapped(*args, **kwargs)
+
+    try:
+        result = await wrapped(*args, **kwargs)
+
+        from src.agent.memory import ActionStep, ActionStepError, StepStatus
+
+        if isinstance(result, ActionStepError):
+            invocation.finish_reason = "error"
+            handler.fail_react_step(
+                invocation,
+                Error(message=result.message, type=type(result)),
+            )
+        else:
+            if result.step_status == StepStatus.FINISHED:
+                invocation.finish_reason = "finished"
+            elif result.error_marker is not None:
+                invocation.finish_reason = "error"
+            else:
+                invocation.finish_reason = "continue"
+            handler.stop_react_step(invocation)
+
+        return result
+    except Exception as e:
+        invocation.finish_reason = "error"
+        handler.fail_react_step(
+            invocation, Error(message=str(e), type=type(e))
+        )
+        raise
+
+
+async def wrap_invoke_tool_call(
+    wrapped, instance, args, kwargs, *, handler: ExtendedTelemetryHandler
+):
+    """H4: TOOL span for each tool_call inside Runner._invoke_tool_call."""
+    agent = args[0] if args else kwargs.get("agent")
+    model_response = args[1] if len(args) > 1 else kwargs.get("model_response")
+
+    if not model_response.outputs:
+        return await wrapped(*args, **kwargs)
+
+    resp = model_response.outputs[0]
+    if not resp.tool_calls:
+        return await wrapped(*args, **kwargs)
+
+    from src.agent.schema import ErrorMarker, ToolCallResult
+
+    async def _call_with_span(tool_call):
+        try:
+            invocation = _create_tool_invocation(tool_call, agent)
+        except Exception as e:
+            logger.debug(f"Failed to create tool invocation: {e}")
+            return await _call_original(tool_call, agent)
+
+        handler.start_execute_tool(invocation)
+
+        tool_name = tool_call.tool_name
+        tool = agent.get_tool_by_name(tool_name)
+        if tool is None:
+            invocation.tool_call_result = f"Tool {tool_name} not found"
+            handler.fail_execute_tool(
+                invocation,
+                Error(
+                    message=f"Tool {tool_name} not found",
+                    type=ValueError,
+                ),
+            )
+            return ToolCallResult(
+                tool_call_id=tool_call.tool_call_id,
+                error_marker=ErrorMarker(message=f"Tool {tool_name} not found"),
+            )
+
+        arguments = tool_call.arguments
+        if isinstance(arguments, str):
+            try:
+                arguments = json.loads(arguments)
+            except json.JSONDecodeError:
+                arguments = {}
+
+        try:
+            response = await tool(**arguments)
+        except Exception as e:
+            invocation.tool_call_result = str(e)
+            handler.fail_execute_tool(
+                invocation, Error(message=str(e), type=type(e))
+            )
+            return ToolCallResult(
+                tool_call_id=tool_call.tool_call_id,
+                error_marker=ErrorMarker(message=str(e)),
+            )
+
+        error_marker = (
+            ErrorMarker(message=response.error) if response.error else None
+        )
+        system_error_marker = (
+            ErrorMarker(message=response.system_error)
+            if response.system_error
+            else None
+        )
+
+        result_content = response.data
+        invocation.tool_call_result = result_content
+
+        if error_marker or system_error_marker:
+            msg = (error_marker or system_error_marker)["message"]
+            handler.fail_execute_tool(
+                invocation, Error(message=msg, type=RuntimeError)
+            )
+        else:
+            handler.stop_execute_tool(invocation)
+
+        return ToolCallResult(
+            tool_call_id=tool_call.tool_call_id,
+            content=result_content,
+            error_marker=error_marker,
+            system_error_marker=system_error_marker,
+            extra=response.extra if response.extra else {},
+        )
+
+    async def _call_original(tool_call, agent):
+        """Fallback: execute tool without span."""
+        tool_name = tool_call.tool_name
+        tool = agent.get_tool_by_name(tool_name)
+        if tool is None:
+            return ToolCallResult(
+                tool_call_id=tool_call.tool_call_id,
+                error_marker=ErrorMarker(message=f"Tool {tool_name} not found"),
+            )
+        arguments = tool_call.arguments
+        if isinstance(arguments, str):
+            try:
+                arguments = json.loads(arguments)
+            except json.JSONDecodeError:
+                arguments = {}
+        try:
+            response = await tool(**arguments)
+        except Exception as e:
+            return ToolCallResult(
+                tool_call_id=tool_call.tool_call_id,
+                error_marker=ErrorMarker(message=str(e)),
+            )
+        return ToolCallResult(
+            tool_call_id=tool_call.tool_call_id,
+            content=response.data,
+            error_marker=(
+                ErrorMarker(message=response.error) if response.error else None
+            ),
+            system_error_marker=(
+                ErrorMarker(message=response.system_error)
+                if response.system_error
+                else None
+            ),
+            extra=response.extra if response.extra else {},
+        )
+
+    tasks = [_call_with_span(tc) for tc in resp.tool_calls]
+    results = await asyncio.gather(*tasks)
+    return [r for r in results if r is not None]
+
+
+def wrap_create_sub_agents_factory(
+    wrapped, instance, args, kwargs, *, handler: ExtendedTelemetryHandler
+):
+    """H5: TASK span wrapping the closure returned by create_sub_agents_wrap."""
+    original_closure = wrapped(*args, **kwargs)
+
+    async def closure_with_task_span(sub_agents):
+        tracer = handler._tracer
+        span_name = "run_task create_sub_agents"
+
+        with tracer.start_as_current_span(
+            name=span_name,
+            kind=SpanKind.INTERNAL,
+        ) as span:
+            span.set_attribute("gen_ai.span.kind", "TASK")
+            span.set_attribute("gen_ai.operation.name", "run_task")
+            span.set_attribute("gen_ai.framework", "widesearch")
+
+            try:
+                safe_input = json.dumps(
+                    [
+                        {
+                            "index": sa.get("index"),
+                            "prompt": sa.get("prompt", "")[:200],
+                        }
+                        for sa in sub_agents
+                    ],
+                    ensure_ascii=False,
+                )
+                span.set_attribute("input.value", safe_input)
+            except Exception:
+                pass
+
+            try:
+                result = await original_closure(sub_agents)
+
+                if result and hasattr(result, "data") and result.data:
+                    output_str = (
+                        result.data
+                        if isinstance(result.data, str)
+                        else json.dumps(result.data, ensure_ascii=False)
+                    )
+                    if len(output_str) > 4096:
+                        output_str = output_str[:4096] + "...(truncated)"
+                    span.set_attribute("output.value", output_str)
+
+                span.set_status(Status(StatusCode.OK))
+                return result
+            except Exception as e:
+                span.record_exception(e)
+                span.set_status(Status(StatusCode.ERROR, str(e)))
+                raise
+
+    return closure_with_task_span
diff --git a/instrumentation-loongsuite/loongsuite-instrumentation-widesearch/src/opentelemetry/instrumentation/widesearch/utils.py b/instrumentation-loongsuite/loongsuite-instrumentation-widesearch/src/opentelemetry/instrumentation/widesearch/utils.py
new file mode 100644
index 000000000..8f85f6c6f
--- /dev/null
+++ b/instrumentation-loongsuite/loongsuite-instrumentation-widesearch/src/opentelemetry/instrumentation/widesearch/utils.py
@@ -0,0 +1,202 @@
+"""Utility functions for WideSearch instrumentation."""
+
+from __future__ import annotations
+
+import json
+import logging
+from typing import Any, List, Optional
+
+from opentelemetry.util.genai.extended_types import (
+    EntryInvocation,
+    ExecuteToolInvocation,
+    InvokeAgentInvocation,
+    ReactStepInvocation,
+)
+from opentelemetry.util.genai.types import (
+    FunctionToolDefinition,
+    InputMessage,
+    OutputMessage,
+    Text,
+    ToolCall as GenAIToolCall,
+    ToolCallResponse,
+)
+
+logger = logging.getLogger(__name__)
+
+
+_FRAMEWORK = "widesearch"
+
+
+def _create_entry_invocation(
+    query: str,
+    *,
+    system_prompt: Optional[str] = None,
+    tools_desc: Optional[List[dict[str, Any]]] = None,
+) -> EntryInvocation:
+    invocation = EntryInvocation()
+    invocation.input_messages = [
+        InputMessage(role="user", parts=[Text(content=query)])
+    ]
+    invocation.attributes["gen_ai.framework"] = _FRAMEWORK
+    if system_prompt:
+        invocation.system_instruction = [Text(content=system_prompt)]
+
+    defs = None
+    if tools_desc:
+        defs = _convert_tools_desc(tools_desc)
+        if defs is not None:
+            invocation.tool_definitions = defs
+
+    return invocation
+
+
+def _create_agent_invocation(
+    agent: Any, user_input: str, system_prompt: Optional[str] = None
+) -> InvokeAgentInvocation:
+    agent_name = getattr(agent, "name", None) or "widesearch-agent"
+
+    request_model = None
+    model_config_name = getattr(agent, "model_config_name", None)
+    if model_config_name:
+        try:
+            from src.utils.config import model_config
+
+            request_model = model_config.get(model_config_name, {}).get(
+                "model_name"
+            )
+        except Exception:
+            pass
+    request_model = request_model or model_config_name
+
+    instructions = system_prompt or getattr(agent, "instructions", None) or ""
+
+    invocation = InvokeAgentInvocation(
+        provider="widesearch",
+        agent_name=agent_name,
+        agent_description=instructions[:200] if instructions else "",
+        request_model=request_model,
+        input_messages=[
+            InputMessage(role="user", parts=[Text(content=user_input)])
+        ],
+    )
+    invocation.attributes["gen_ai.framework"] = _FRAMEWORK
+
+    if instructions:
+        invocation.system_instruction = [Text(content=instructions)]
+
+    tools_desc = getattr(agent, "tools_desc", None)
+    if tools_desc:
+        invocation.tool_definitions = _convert_tools_desc(tools_desc)
+
+    return invocation
+
+
+def _create_tool_invocation(
+    tool_call: Any, agent: Any
+) -> ExecuteToolInvocation:
+    args = tool_call.arguments
+    if isinstance(args, str):
+        try:
+            args = json.loads(args)
+        except (json.JSONDecodeError, ValueError):
+            args = {"raw": args}
+
+    description = None
+    if hasattr(agent, "tools_desc"):
+        for td in agent.tools_desc:
+            func = td.get("function", {})
+            if func.get("name") == tool_call.tool_name:
+                description = func.get("description")
+                break
+
+    invocation = ExecuteToolInvocation(
+        tool_name=tool_call.tool_name,
+        tool_call_id=getattr(tool_call, "tool_call_id", None),
+        tool_call_arguments=args,
+        tool_description=description,
+        tool_type="function",
+    )
+    invocation.attributes["gen_ai.framework"] = _FRAMEWORK
+    return invocation
+
+
+def _extract_output_messages(messages: Any) -> List[OutputMessage]:
+    """Extract output messages from run_single_query return value."""
+    if not messages:
+        return []
+    last_msg = messages[-1]
+    content = ""
+    if isinstance(last_msg, dict):
+        c = last_msg.get("content", {})
+        if isinstance(c, dict):
+            content = c.get("content", "")
+        elif isinstance(c, str):
+            content = c
+    return [
+        OutputMessage(
+            role="assistant",
+            parts=[Text(content=content)],
+            finish_reason="stop",
+        )
+    ]
+
+
+def _step_to_output_messages(step: Any) -> List[OutputMessage]:
+    """Extract output messages from an ActionStep."""
+    content = getattr(step, "content", None) or ""
+    parts = []
+    if content:
+        parts.append(Text(content=content))
+
+    for tool_call in getattr(step, "tool_calls", []) or []:
+        args = getattr(tool_call, "arguments", None)
+        if isinstance(args, str):
+            try:
+                args = json.loads(args)
+            except (json.JSONDecodeError, ValueError):
+                pass
+        parts.append(
+            GenAIToolCall(
+                id=getattr(tool_call, "tool_call_id", None),
+                name=getattr(tool_call, "tool_name", ""),
+                arguments=args,
+            )
+        )
+
+    for tool_result in getattr(step, "tool_call_results", []) or []:
+        result = getattr(tool_result, "content", None)
+        if result is None and getattr(tool_result, "error_marker", None):
+            result = getattr(tool_result, "error_marker", {}).get("message")
+        parts.append(
+            ToolCallResponse(
+                id=getattr(tool_result, "tool_call_id", None),
+                response=result,
+            )
+        )
+
+    finish_reason = "tool_calls" if getattr(step, "tool_calls", None) else "stop"
+    return [
+        OutputMessage(
+            role="assistant",
+            parts=parts or [Text(content="")],
+            finish_reason=finish_reason,
+        )
+    ]
+
+
+def _convert_tools_desc(
+    tools_desc: List[dict],
+) -> Optional[List[FunctionToolDefinition]]:
+    """Convert WideSearch tools_desc to FunctionToolDefinition list."""
+    result = []
+    for td in tools_desc:
+        if td.get("type") == "function":
+            func = td.get("function", {})
+            result.append(
+                FunctionToolDefinition(
+                    name=func.get("name", ""),
+                    description=func.get("description"),
+                    parameters=func.get("parameters"),
+                )
+            )
+    return result if result else None
diff --git a/instrumentation-loongsuite/loongsuite-instrumentation-widesearch/src/opentelemetry/instrumentation/widesearch/version.py b/instrumentation-loongsuite/loongsuite-instrumentation-widesearch/src/opentelemetry/instrumentation/widesearch/version.py
new file mode 100644
index 000000000..26056b5d8
--- /dev/null
+++ b/instrumentation-loongsuite/loongsuite-instrumentation-widesearch/src/opentelemetry/instrumentation/widesearch/version.py
@@ -0,0 +1 @@
+__version__ = "0.5.0.dev"
diff --git a/instrumentation-loongsuite/loongsuite-instrumentation-widesearch/tests/__init__.py b/instrumentation-loongsuite/loongsuite-instrumentation-widesearch/tests/__init__.py
new file mode 100644
index 000000000..e69de29bb
diff --git a/instrumentation-loongsuite/loongsuite-instrumentation-widesearch/tests/conftest.py b/instrumentation-loongsuite/loongsuite-instrumentation-widesearch/tests/conftest.py
new file mode 100644
index 000000000..461bf8e1f
--- /dev/null
+++ b/instrumentation-loongsuite/loongsuite-instrumentation-widesearch/tests/conftest.py
@@ -0,0 +1,412 @@
+"""Test configuration for WideSearch instrumentation tests.
+
+Injects lightweight stub modules for `src.agent.*` into sys.modules
+so that wrap_function_wrapper can find them without installing WideSearch.
+"""
+
+from __future__ import annotations
+
+import os
+import sys
+import types
+from dataclasses import dataclass, field
+from pathlib import Path
+from enum import Enum
+from typing import Any, Callable, List, Literal
+
+# Ensure workspace opentelemetry-util-genai is imported (not stale site-packages).
+_REPO_ROOT = Path(__file__).resolve().parents[3]
+_UTIL_GENAI_SRC = _REPO_ROOT / "util" / "opentelemetry-util-genai" / "src"
+if _UTIL_GENAI_SRC.is_dir() and str(_UTIL_GENAI_SRC) not in sys.path:
+    sys.path.insert(0, str(_UTIL_GENAI_SRC))
+    # Plugins or other loaders may pull opentelemetry.util.genai.* from
+    # site-packages before this conftest runs — drop caches so imports resolve here.
+    for _m in list(sys.modules):
+        if _m == "opentelemetry.util.genai" or _m.startswith(
+            "opentelemetry.util.genai."
+        ):
+            del sys.modules[_m]
+
+_WIDESEARCH_PLUGIN_SRC = Path(__file__).resolve().parents[1] / "src"
+if _WIDESEARCH_PLUGIN_SRC.is_dir() and str(_WIDESEARCH_PLUGIN_SRC) not in sys.path:
+    sys.path.insert(0, str(_WIDESEARCH_PLUGIN_SRC))
+
+import pytest
+
+# ---------------------------------------------------------------------------
+# Stub modules for WideSearch (src.agent.*)
+# ---------------------------------------------------------------------------
+
+
+class StepStatus(str, Enum):
+    USER = "USER"
+    FINISHED = "FINISHED"
+    CONTINUE = "CONTINUE"
+    ERROR = "ERROR"
+
+
+@dataclass
+class ActionStepError:
+    message: str
+    source: Literal["llm"] = "llm"
+
+
+@dataclass
+class ToolCall:
+    tool_name: str
+    arguments: Any
+    tool_call_id: str
+
+
+@dataclass
+class ErrorMarker:  # test stub registered as src.agent.schema.ErrorMarker
+    message: str
+
+    def __getitem__(self, key):  # supports marker["message"] subscript access
+        if key == "message":
+            return self.message
+        raise KeyError(key)  # any other key behaves like a missing dict key
+
+
+@dataclass
+class ToolCallResult:
+    tool_call_id: str
+    content: str | None = None
+    error_marker: Any = None
+    system_error_marker: Any = None
+    extra: dict = field(default_factory=dict)
+
+
+@dataclass
+class LLMOutputItem:
+    role: str = "assistant"
+    content: str | None = None
+    reasoning_content: str | None = None
+    signature: str | None = None
+    tool_calls: list = field(default_factory=list)
+
+
+@dataclass
+class ModelResponse:
+    outputs: list = field(default_factory=list)
+    session_id: str | None = None
+    error_marker: Any = None
+
+
+@dataclass
+class ActionStep:
+    step_status: StepStatus = StepStatus.CONTINUE
+    content: str | None = None
+    reasoning_content: str | None = None
+    signature: str | None = None
+    tool_calls: list = field(default_factory=list)
+    tool_call_results: list = field(default_factory=list)
+    error_marker: Any = None
+
+
+@dataclass
+class UserInputStep:
+    user_input: str
+    step_status: StepStatus = StepStatus.USER
+
+
+@dataclass
+class MemoryTurn:
+    steps: list = field(default_factory=list)
+
+    @property
+    def step_number(self):
+        return sum(1 for s in self.steps if isinstance(s, ActionStep))
+
+    def is_finished(self) -> bool:
+        if not self.steps:
+            return False
+        return self.steps[-1].step_status == StepStatus.FINISHED
+
+
+@dataclass
+class MemoryAgent:  # test stub registered as src.agent.memory.MemoryAgent
+    system_instructions: str | None = None
+    turns: list = field(default_factory=list)
+
+    def insert_user_input(self, user_input: str):  # starts a new turn
+        turn = MemoryTurn()
+        turn.steps.append(UserInputStep(user_input=user_input))
+        self.turns.append(turn)
+        return turn
+
+    def insert_action_step(self, action_step):  # appends to the latest turn
+        last_turn = self.turns[-1]  # IndexError if no turn was started yet
+        last_turn.steps.append(action_step)
+        return last_turn
+
+    def to_message(self, **kwargs):  # stub: always an empty list
+        return []
+
+
+@dataclass
+class InternalResponse:
+    data: Any = None
+    error: str | None = None
+    system_error: str | None = None
+    extra: dict | None = None
+
+
+@dataclass
+class Agent:  # test stub registered as src.agent.agent.Agent
+    name: str = "test-agent"
+    instructions: str | None = "You are a helpful agent."
+    tools: dict = field(default_factory=dict)  # tool name -> tool object
+    tools_desc: list = field(default_factory=list)
+    model_config_name: str = "gpt-4o"  # key present in the stub model_config
+
+    def get_tool_by_name(self, tool_name: str):  # None when not registered
+        return self.tools.get(tool_name)
+
+
+DEFAULT_MAX_STEPS = 50
+DEFAULT_MAX_ERROR_COUNT = 3
+
+
+class Runner:
+    _step_override = None  # Set to a callable to override _step behavior
+
+    @classmethod
+    async def run(
+        cls,
+        starting_agent,
+        user_input: str,
+        memory=None,
+        *,
+        max_steps: int = DEFAULT_MAX_STEPS,
+        llm_error_strategy: str = "retry",
+    ):
+        if memory is None:
+            memory = MemoryAgent(
+                system_instructions=starting_agent.instructions
+            )
+        last_turn = memory.insert_user_input(user_input)
+        step_result = await cls._step(agent=starting_agent, memory=memory)
+        if not isinstance(step_result, ActionStepError):
+            yield step_result
+
+    @classmethod
+    async def _step(cls, *, agent, memory) -> ActionStep | ActionStepError:
+        if cls._step_override is not None:
+            return await cls._step_override(agent=agent, memory=memory)
+        return ActionStep(step_status=StepStatus.FINISHED, content="Done")
+
+    @classmethod
+    async def _invoke_tool_call(
+        cls, agent, model_response
+    ) -> list:
+        return []
+
+
+async def run_single_query(
+    query: str,
+    agent_name: str = "",
+    model_config_name: str = "",
+    tools: dict = None,
+    tools_desc: list = None,
+    system_prompt: str = "",
+):
+    agent_instructions = (
+        system_prompt if system_prompt else "You are a helpful agent."
+    )
+    agent = Agent(
+        name=agent_name,
+        tools=tools or {},
+        tools_desc=tools_desc or [],
+        model_config_name=model_config_name,
+        instructions=agent_instructions,
+    )
+    memory = MemoryAgent(system_instructions=system_prompt)
+
+    # Mirrors real implementation: calls Runner.run as async generator
+    async for step in Runner.run(agent, query, memory):
+        pass
+
+    last_content = "final answer"
+    if memory.turns:
+        last_turn = memory.turns[-1]
+        for s in reversed(last_turn.steps):
+            if isinstance(s, ActionStep) and s.content:
+                last_content = s.content
+                break
+
+    return [
+        {"role": "user", "content": query},
+        {"role": "assistant", "content": {"content": last_content}},
+    ]
+
+
+def _default_tools():
+    return {}
+
+
+def get_system_prompt(language="zh"):
+    return "You are a helpful assistant."
+
+
+def create_sub_agents_wrap(
+    agent_name, model_config_name, tools, tools_desc, system_prompt
+):
+    async def create_sub_agents(sub_agents: list) -> InternalResponse:
+        import json
+
+        results = []
+        for sa in sub_agents:
+            results.append(
+                {"index": sa.get("index"), "prompt": sa.get("prompt", ""), "response": "sub result"}
+            )
+        return InternalResponse(
+            data=json.dumps(results, ensure_ascii=False)
+        )
+
+    return create_sub_agents
+
+
+def _inject_stub_modules():
+    """Inject stub modules into sys.modules so that wrapt can resolve them."""
+    # Create module hierarchy: src -> src.agent -> src.agent.run, etc.
+    src_mod = types.ModuleType("src")
+    src_agent_mod = types.ModuleType("src.agent")
+    src_agent_run_mod = types.ModuleType("src.agent.run")
+    src_agent_multi_agent_tools_mod = types.ModuleType("src.agent.multi_agent_tools")
+    src_agent_memory_mod = types.ModuleType("src.agent.memory")
+    src_agent_schema_mod = types.ModuleType("src.agent.schema")
+    src_agent_tools_mod = types.ModuleType("src.agent.tools")
+    src_agent_prompt_mod = types.ModuleType("src.agent.prompt")
+    src_utils_mod = types.ModuleType("src.utils")
+    src_utils_config_mod = types.ModuleType("src.utils.config")
+
+    # Populate src.agent.run
+    src_agent_run_mod.Runner = Runner
+    src_agent_run_mod.run_single_query = run_single_query
+    src_agent_run_mod.run_turn = None
+    src_agent_run_mod.extract_messages_from_memory = None
+
+    # Populate src.agent.multi_agent_tools
+    src_agent_multi_agent_tools_mod.create_sub_agents_wrap = create_sub_agents_wrap
+
+    # Populate src.agent.memory
+    src_agent_memory_mod.ActionStep = ActionStep
+    src_agent_memory_mod.ActionStepError = ActionStepError
+    src_agent_memory_mod.MemoryAgent = MemoryAgent
+    src_agent_memory_mod.StepStatus = StepStatus
+    src_agent_memory_mod.UserInputStep = UserInputStep
+
+    # Populate src.agent.schema
+    src_agent_schema_mod.ToolCall = ToolCall
+    src_agent_schema_mod.ToolCallResult = ToolCallResult
+    src_agent_schema_mod.ModelResponse = ModelResponse
+    src_agent_schema_mod.ErrorMarker = ErrorMarker
+    src_agent_schema_mod.LLMOutputItem = LLMOutputItem
+
+    # Populate src.agent.tools
+    src_agent_tools_mod.InternalResponse = InternalResponse
+    src_agent_tools_mod._default_tools = {}
+
+    # Populate src.agent.prompt
+    src_agent_prompt_mod.get_system_prompt = get_system_prompt
+
+    # Populate src.agent.agent
+    src_agent_agent_mod = types.ModuleType("src.agent.agent")
+    src_agent_agent_mod.Agent = Agent
+    src_agent_agent_mod.DEFAULT_MAX_STEPS = DEFAULT_MAX_STEPS
+    src_agent_agent_mod.DEFAULT_MAX_ERROR_COUNT = DEFAULT_MAX_ERROR_COUNT
+
+    # Populate src.utils.config
+    src_utils_config_mod.model_config = {
+        "gpt-4o": {"model_name": "gpt-4o-2024-05-13"},
+    }
+
+    # Wire up parent references
+    src_mod.agent = src_agent_mod
+    src_mod.utils = src_utils_mod
+    src_agent_mod.run = src_agent_run_mod
+    src_agent_mod.multi_agent_tools = src_agent_multi_agent_tools_mod
+    src_agent_mod.memory = src_agent_memory_mod
+    src_agent_mod.schema = src_agent_schema_mod
+    src_agent_mod.tools = src_agent_tools_mod
+    src_agent_mod.prompt = src_agent_prompt_mod
+    src_agent_mod.agent = src_agent_agent_mod
+
+    # Register in sys.modules
+    sys.modules["src"] = src_mod
+    sys.modules["src.agent"] = src_agent_mod
+    sys.modules["src.agent.run"] = src_agent_run_mod
+    sys.modules["src.agent.multi_agent_tools"] = src_agent_multi_agent_tools_mod
+    sys.modules["src.agent.memory"] = src_agent_memory_mod
+    sys.modules["src.agent.schema"] = src_agent_schema_mod
+    sys.modules["src.agent.tools"] = src_agent_tools_mod
+    sys.modules["src.agent.prompt"] = src_agent_prompt_mod
+    sys.modules["src.agent.agent"] = src_agent_agent_mod
+    sys.modules["src.utils"] = src_utils_mod
+    sys.modules["src.utils.config"] = src_utils_config_mod
+
+
+# Inject stubs before any test imports the instrumentation module
+_inject_stub_modules()
+
+
+# ---------------------------------------------------------------------------
+# OTel test fixtures
+# ---------------------------------------------------------------------------
+
+
+def pytest_configure(config: pytest.Config):
+    os.environ["OTEL_SEMCONV_STABILITY_OPT_IN"] = "gen_ai_latest_experimental"
+    os.environ["OTEL_INSTRUMENTATION_GENAI_CAPTURE_MESSAGE_CONTENT"] = "span_only"
+
+
+for _m in list(sys.modules):
+    if _m.startswith("opentelemetry.instrumentation.widesearch"):
+        del sys.modules[_m]
+
+from opentelemetry.instrumentation.widesearch import WideSearchInstrumentor
+from opentelemetry.sdk.trace import TracerProvider
+from opentelemetry.sdk.trace.export import SimpleSpanProcessor
+from opentelemetry.sdk.trace.export.in_memory_span_exporter import (
+    InMemorySpanExporter,
+)
+from opentelemetry.sdk.metrics import MeterProvider
+from opentelemetry.sdk.metrics.export import InMemoryMetricReader
+
+
+@pytest.fixture(scope="function", name="span_exporter")
+def fixture_span_exporter():
+    exporter = InMemorySpanExporter()
+    yield exporter
+
+
+@pytest.fixture(scope="function", name="metric_reader")
+def fixture_metric_reader():
+    reader = InMemoryMetricReader()
+    yield reader
+
+
+@pytest.fixture(scope="function", name="tracer_provider")
+def fixture_tracer_provider(span_exporter):
+    provider = TracerProvider()
+    provider.add_span_processor(SimpleSpanProcessor(span_exporter))
+    return provider
+
+
+@pytest.fixture(scope="function", name="meter_provider")
+def fixture_meter_provider(metric_reader):
+    meter_provider = MeterProvider(metric_readers=[metric_reader])
+    return meter_provider
+
+
+@pytest.fixture(scope="function")
+def instrument(tracer_provider, meter_provider):  # instruments for one test
+    instrumentor = WideSearchInstrumentor()
+    instrumentor.instrument(
+        tracer_provider=tracer_provider,
+        meter_provider=meter_provider,
+        skip_dep_check=True,  # WideSearch is not installed; stubs are used
+    )
+    yield instrumentor
+    instrumentor.uninstrument()  # teardown: runs once the test has finished
diff --git a/instrumentation-loongsuite/loongsuite-instrumentation-widesearch/tests/test_widesearch.py b/instrumentation-loongsuite/loongsuite-instrumentation-widesearch/tests/test_widesearch.py
new file mode 100644
index 000000000..3f4be12d9
--- /dev/null
+++ b/instrumentation-loongsuite/loongsuite-instrumentation-widesearch/tests/test_widesearch.py
@@ -0,0 +1,797 @@
+"""Tests for WideSearch instrumentation.
+
+Covers:
+- Instrumentor lifecycle (instrument/uninstrument idempotency)
+- 5 span types: ENTRY, AGENT, STEP, TOOL, TASK
+- Parent-child relationships
+- Key attributes
+- Error paths
+"""
+
+from __future__ import annotations
+
+import asyncio
+import json
+import sys
+from dataclasses import field
+from unittest.mock import AsyncMock, MagicMock, patch
+
+import pytest
+
+from opentelemetry.trace import StatusCode
+
+from .conftest import (
+    ActionStep,
+    ActionStepError,
+    Agent,
+    ErrorMarker,
+    InternalResponse,
+    LLMOutputItem,
+    MemoryAgent,
+    ModelResponse,
+    Runner,
+    StepStatus,
+    ToolCall,
+    ToolCallResult,
+)
+
+
+def _run_async(coro):
+    """Helper to run async coroutines in tests."""
+    loop = asyncio.new_event_loop()
+    try:
+        return loop.run_until_complete(coro)
+    finally:
+        loop.close()
+
+
+def _run_async_gen(async_gen):
+    """Helper to consume an async generator."""
+    async def _consume():
+        results = []
+        async for item in async_gen:
+            results.append(item)
+        return results
+    loop = asyncio.new_event_loop()
+    try:
+        return loop.run_until_complete(_consume())
+    finally:
+        loop.close()
+
+
+# ---------------------------------------------------------------------------
+# Instrumentor Lifecycle Tests
+# ---------------------------------------------------------------------------
+
+
+class TestInstrumentorLifecycle:
+    def test_instrument_and_uninstrument(self, tracer_provider, meter_provider):
+        from opentelemetry.instrumentation.widesearch import WideSearchInstrumentor
+
+        instrumentor = WideSearchInstrumentor()
+        instrumentor.instrument(
+            tracer_provider=tracer_provider,
+            meter_provider=meter_provider,
+            skip_dep_check=True,
+        )
+        assert instrumentor._handler is not None
+        instrumentor.uninstrument()
+        assert instrumentor._handler is None
+
+    def test_double_instrument_uninstrument(self, tracer_provider, meter_provider):
+        from opentelemetry.instrumentation.widesearch import WideSearchInstrumentor
+
+        instrumentor = WideSearchInstrumentor()
+        instrumentor.instrument(
+            tracer_provider=tracer_provider,
+            meter_provider=meter_provider,
+            skip_dep_check=True,
+        )
+        instrumentor.uninstrument()
+
+        instrumentor2 = WideSearchInstrumentor()
+        instrumentor2.instrument(
+            tracer_provider=tracer_provider,
+            meter_provider=meter_provider,
+            skip_dep_check=True,
+        )
+        assert instrumentor2._handler is not None
+        instrumentor2.uninstrument()
+
+    def test_instrumentation_dependencies(self):
+        from opentelemetry.instrumentation.widesearch import WideSearchInstrumentor
+
+        instrumentor = WideSearchInstrumentor()
+        deps = instrumentor.instrumentation_dependencies()
+        assert ("widesearch >= 0.1.0",) == deps
+
+
+# ---------------------------------------------------------------------------
+# ENTRY Span Tests (H1: run_single_query)
+# ---------------------------------------------------------------------------
+
+
+class TestEntrySpan:
+    def test_entry_span_created(self, span_exporter, instrument):
+        """run_single_query should produce an ENTRY span."""
+        from src.agent.run import run_single_query
+
+        _run_async(run_single_query("What is AI?", agent_name="searcher"))
+
+        spans = span_exporter.get_finished_spans()
+        entry_spans = [
+            s for s in spans if s.name == "enter_ai_application_system"
+        ]
+        assert len(entry_spans) == 1
+
+        entry = entry_spans[0]
+        attrs = dict(entry.attributes)
+        assert attrs.get("gen_ai.span.kind") == "ENTRY"
+        assert attrs.get("gen_ai.operation.name") == "enter"
+        assert attrs.get("gen_ai.framework") == "widesearch"
+
+    def test_entry_span_records_gen_ai_arms_semantic_attrs(self, span_exporter, instrument):
+        """ENTRY should record input/output messages, but not agent-only metadata.
+
+        Controlled by OTEL_SEMCONV_STABILITY_OPT_IN + SPAN_ONLY capture mode (see conftest).
+        """
+        from src.agent.run import run_single_query
+
+        tools_desc = [
+            {
+                "type": "function",
+                "function": {
+                    "name": "search_global",
+                    "description": "Search the web",
+                    "properties": {},
+                },
+            }
+        ]
+
+        _run_async(
+            run_single_query(
+                "What is AI?",
+                agent_name="searcher",
+                system_prompt="You are an expert researcher.",
+                tools_desc=tools_desc,
+            )
+        )
+
+        spans = span_exporter.get_finished_spans()
+        entry_spans = [
+            s for s in spans if s.name == "enter_ai_application_system"
+        ]
+        assert len(entry_spans) == 1
+        attrs = dict(entry_spans[0].attributes)
+        assert "gen_ai.input.messages" in attrs
+        assert '"role":"user"' in attrs["gen_ai.input.messages"]
+        assert "gen_ai.output.messages" in attrs
+        assert "gen_ai.system_instructions" not in attrs
+        assert "gen_ai.tool.definitions" not in attrs
+
+    def test_entry_span_error(self, span_exporter, instrument):
+        """ENTRY span should record ERROR on exception."""
+        from src.agent.run import Runner, run_single_query
+
+        async def failing_step(*, agent, memory):
+            raise RuntimeError("LLM connection failed")
+
+        Runner._step_override = failing_step
+
+        try:
+            with pytest.raises(RuntimeError, match="LLM connection failed"):
+                _run_async(run_single_query("test"))
+        finally:
+            Runner._step_override = None
+
+        spans = span_exporter.get_finished_spans()
+        entry_spans = [
+            s for s in spans if s.name == "enter_ai_application_system"
+        ]
+        assert len(entry_spans) == 1
+        assert entry_spans[0].status.status_code == StatusCode.ERROR
+
+
+# ---------------------------------------------------------------------------
+# AGENT Span Tests (H2: Runner.run)
+# ---------------------------------------------------------------------------
+
+
+class TestAgentSpan:
+    def test_agent_span_created(self, span_exporter, instrument):
+        """Runner.run should produce an AGENT span."""
+        from src.agent.run import Runner
+
+        agent = Agent(name="search-agent", model_config_name="gpt-4o")
+
+        async def _run():
+            results = []
+            async for step in Runner.run(agent, "Hello"):
+                results.append(step)
+            return results
+
+        _run_async(_run())
+
+        spans = span_exporter.get_finished_spans()
+        agent_spans = [
+            s for s in spans if "invoke_agent" in s.name
+        ]
+        assert len(agent_spans) == 1
+
+        span = agent_spans[0]
+        attrs = dict(span.attributes)
+        assert attrs.get("gen_ai.span.kind") == "AGENT"
+        assert attrs.get("gen_ai.operation.name") == "invoke_agent"
+        assert attrs.get("gen_ai.agent.name") == "search-agent"
+        assert attrs.get("gen_ai.framework") == "widesearch"
+
+    def test_agent_span_records_gen_ai_arms_semantic_attrs(self, span_exporter, instrument):
+        """AGENT invoke_agent should expose ARMS-aligned message/tool attributes."""
+        from src.agent.run import Runner
+
+        tools_desc = [
+            {
+                "type": "function",
+                "function": {
+                    "name": "add",
+                    "description": "Add numbers",
+                    "parameters": {},
+                },
+            }
+        ]
+
+        agent = Agent(
+            name="search-agent",
+            model_config_name="gpt-4o",
+            tools_desc=tools_desc,
+            instructions="Solve tasks with tools.",
+        )
+
+        async def _run():
+            results = []
+            async for step in Runner.run(agent, "Hello"):
+                results.append(step)
+            return results
+
+        _run_async(_run())
+
+        spans = span_exporter.get_finished_spans()
+        agent_spans = [
+            s for s in spans if "invoke_agent" in s.name
+        ]
+        assert len(agent_spans) == 1
+        attrs = dict(agent_spans[0].attributes)
+        assert "gen_ai.input.messages" in attrs
+        assert '"role":"user"' in attrs["gen_ai.input.messages"]
+        assert "gen_ai.output.messages" in attrs
+        assert "gen_ai.system_instructions" in attrs
+        assert "gen_ai.tool.definitions" in attrs
+        assert "add" in attrs["gen_ai.tool.definitions"]
+
+    def test_agent_span_is_child_of_entry(self, span_exporter, instrument):
+        """AGENT span should be a child of ENTRY span."""
+        from src.agent.run import run_single_query
+
+        _run_async(run_single_query("test query", agent_name="test"))
+
+        spans = span_exporter.get_finished_spans()
+        entry_spans = [
+            s for s in spans if s.name == "enter_ai_application_system"
+        ]
+        agent_spans = [s for s in spans if "invoke_agent" in s.name]
+
+        assert len(entry_spans) == 1
+        assert len(agent_spans) == 1
+
+        entry = entry_spans[0]
+        agent = agent_spans[0]
+        assert agent.parent.span_id == entry.context.span_id
+
+    def test_agent_span_error(self, span_exporter, instrument):
+        """AGENT span should record ERROR when _step raises."""
+        from src.agent.run import Runner
+
+        async def failing_step(*, agent, memory):
+            raise ValueError("Step failure")
+
+        Runner._step_override = failing_step
+        agent = Agent(name="fail-agent")
+
+        async def _run():
+            async for _ in Runner.run(agent, "Hello"):
+                pass
+
+        try:
+            with pytest.raises(ValueError):
+                _run_async(_run())
+        finally:
+            Runner._step_override = None
+
+        spans = span_exporter.get_finished_spans()
+        agent_spans = [s for s in spans if "invoke_agent" in s.name]
+        assert len(agent_spans) == 1
+        assert agent_spans[0].status.status_code == StatusCode.ERROR
+
+
+# ---------------------------------------------------------------------------
+# STEP Span Tests (H3: Runner._step)
+# ---------------------------------------------------------------------------
+
+
+class TestStepSpan:
+    def test_step_span_created(self, span_exporter, instrument):
+        """Runner._step should produce a STEP span."""
+        from src.agent.run import Runner
+
+        agent = Agent(name="stepper")
+
+        async def _run():
+            async for _ in Runner.run(agent, "test"):
+                pass
+
+        _run_async(_run())
+
+        spans = span_exporter.get_finished_spans()
+        step_spans = [s for s in spans if s.name == "react step"]
+        assert len(step_spans) >= 1
+
+        step = step_spans[0]
+        attrs = dict(step.attributes)
+        assert attrs.get("gen_ai.span.kind") == "STEP"
+        assert attrs.get("gen_ai.operation.name") == "react"
+        assert attrs.get("gen_ai.react.round") == 1
+
+    def test_step_span_is_child_of_agent(self, span_exporter, instrument):
+        """STEP span should be child of AGENT span."""
+        from src.agent.run import Runner
+
+        agent = Agent(name="stepper")
+
+        async def _run():
+            async for _ in Runner.run(agent, "test"):
+                pass
+
+        _run_async(_run())
+
+        spans = span_exporter.get_finished_spans()
+        agent_spans = [s for s in spans if "invoke_agent" in s.name]
+        step_spans = [s for s in spans if s.name == "react step"]
+
+        assert len(agent_spans) == 1
+        assert len(step_spans) >= 1
+
+        agent_span = agent_spans[0]
+        step_span = step_spans[0]
+        assert step_span.parent.span_id == agent_span.context.span_id
+
+    def test_step_span_finish_reason_finished(self, span_exporter, instrument):
+        """STEP span should have finish_reason='finished' when step finishes."""
+        from src.agent.run import Runner
+
+        agent = Agent(name="stepper")
+
+        async def _run():
+            async for _ in Runner.run(agent, "test"):
+                pass
+
+        _run_async(_run())
+
+        spans = span_exporter.get_finished_spans()
+        step_spans = [s for s in spans if s.name == "react step"]
+        assert len(step_spans) >= 1
+        attrs = dict(step_spans[0].attributes)
+        assert attrs.get("gen_ai.react.finish_reason") == "finished"
+
+    def test_step_span_error_on_action_step_error(
+        self, span_exporter, instrument
+    ):
+        """STEP span should record ERROR when _step returns ActionStepError."""
+        from src.agent.run import Runner
+
+        async def error_step(*, agent, memory):
+            return ActionStepError(message="LLM timeout")
+
+        Runner._step_override = error_step
+        agent = Agent(name="error-agent")
+
+        try:
+            async def _run():
+                async for _ in Runner.run(agent, "test"):
+                    pass
+
+            _run_async(_run())
+        finally:
+            Runner._step_override = None
+
+        spans = span_exporter.get_finished_spans()
+        step_spans = [s for s in spans if s.name == "react step"]
+        assert len(step_spans) >= 1
+        assert step_spans[0].status.status_code == StatusCode.ERROR
+        attrs = dict(step_spans[0].attributes)
+        assert attrs.get("gen_ai.react.finish_reason") == "error"
+
+
+# ---------------------------------------------------------------------------
+# TOOL Span Tests (H4: Runner._invoke_tool_call)
+# ---------------------------------------------------------------------------
+
+
+class TestToolSpan:
+    def test_tool_span_created(self, span_exporter, instrument):
+        """_invoke_tool_call should produce TOOL spans."""
+        from src.agent.run import Runner
+
+        async def mock_tool(**kwargs):
+            return InternalResponse(data="search results")
+
+        agent = Agent(
+            name="tool-agent",
+            tools={"search_global": mock_tool},
+            tools_desc=[
+                {
+                    "type": "function",
+                    "function": {
+                        "name": "search_global",
+                        "description": "Search the web",
+                        "parameters": {},
+                    },
+                }
+            ],
+        )
+
+        tc = ToolCall(
+            tool_name="search_global",
+            arguments='{"q": "AI"}',
+            tool_call_id="call_123",
+        )
+        model_resp = ModelResponse(
+            outputs=[LLMOutputItem(tool_calls=[tc])]
+        )
+
+        _run_async(Runner._invoke_tool_call(agent, model_resp))
+
+        spans = span_exporter.get_finished_spans()
+        tool_spans = [s for s in spans if "execute_tool" in s.name]
+        assert len(tool_spans) == 1
+
+        span = tool_spans[0]
+        attrs = dict(span.attributes)
+        assert attrs.get("gen_ai.span.kind") == "TOOL"
+        assert attrs.get("gen_ai.operation.name") == "execute_tool"
+        assert attrs.get("gen_ai.tool.name") == "search_global"
+        assert attrs.get("gen_ai.tool.call.id") == "call_123"
+        assert attrs.get("gen_ai.framework") == "widesearch"
+
+    def test_tool_span_records_arguments_and_result(
+        self, span_exporter, instrument
+    ):
+        """TOOL span should record arguments and result."""
+        from src.agent.run import Runner
+
+        async def mock_tool(q=""):
+            return InternalResponse(data=f"results for: {q}")
+
+        agent = Agent(
+            name="tool-agent",
+            tools={"search_global": mock_tool},
+        )
+
+        tc = ToolCall(
+            tool_name="search_global",
+            arguments=json.dumps({"q": "OpenTelemetry"}),
+            tool_call_id="call_456",
+        )
+        model_resp = ModelResponse(
+            outputs=[LLMOutputItem(tool_calls=[tc])]
+        )
+
+        results = _run_async(Runner._invoke_tool_call(agent, model_resp))
+        assert len(results) == 1
+        assert results[0].content == "results for: OpenTelemetry"
+
+        spans = span_exporter.get_finished_spans()
+        tool_spans = [s for s in spans if "execute_tool" in s.name]
+        assert len(tool_spans) == 1
+        attrs = dict(tool_spans[0].attributes)
+        assert "gen_ai.tool.call.arguments" in attrs
+        assert "gen_ai.tool.call.result" in attrs
+
+    def test_tool_span_error_on_missing_tool(self, span_exporter, instrument):
+        """TOOL span should record ERROR when tool not found."""
+        from src.agent.run import Runner
+
+        agent = Agent(name="tool-agent", tools={})
+
+        tc = ToolCall(
+            tool_name="nonexistent_tool",
+            arguments="{}",
+            tool_call_id="call_789",
+        )
+        model_resp = ModelResponse(
+            outputs=[LLMOutputItem(tool_calls=[tc])]
+        )
+
+        results = _run_async(Runner._invoke_tool_call(agent, model_resp))
+        assert len(results) == 1
+        assert results[0].error_marker is not None
+
+        spans = span_exporter.get_finished_spans()
+        tool_spans = [s for s in spans if "execute_tool" in s.name]
+        assert len(tool_spans) == 1
+        assert tool_spans[0].status.status_code == StatusCode.ERROR
+
+    def test_tool_span_error_on_exception(self, span_exporter, instrument):
+        """TOOL span should record ERROR when tool raises exception."""
+        from src.agent.run import Runner
+
+        async def failing_tool(**kwargs):
+            raise ConnectionError("Network error")
+
+        agent = Agent(
+            name="tool-agent",
+            tools={"flaky_tool": failing_tool},
+        )
+
+        tc = ToolCall(
+            tool_name="flaky_tool",
+            arguments="{}",
+            tool_call_id="call_err",
+        )
+        model_resp = ModelResponse(
+            outputs=[LLMOutputItem(tool_calls=[tc])]
+        )
+
+        results = _run_async(Runner._invoke_tool_call(agent, model_resp))
+        assert len(results) == 1
+        assert results[0].error_marker is not None
+        assert "Network error" in results[0].error_marker.message
+
+        spans = span_exporter.get_finished_spans()
+        tool_spans = [s for s in spans if "execute_tool" in s.name]
+        assert len(tool_spans) == 1
+        assert tool_spans[0].status.status_code == StatusCode.ERROR
+
+    def test_multiple_tool_spans(self, span_exporter, instrument):
+        """Multiple tool_calls should produce multiple TOOL spans."""
+        from src.agent.run import Runner
+
+        async def mock_search(**kwargs):
+            return InternalResponse(data="search result")
+
+        async def mock_browse(**kwargs):
+            return InternalResponse(data="page content")
+
+        agent = Agent(
+            name="multi-tool",
+            tools={
+                "search_global": mock_search,
+                "text_browser_view": mock_browse,
+            },
+        )
+
+        tc1 = ToolCall(
+            tool_name="search_global",
+            arguments='{"q": "test"}',
+            tool_call_id="call_1",
+        )
+        tc2 = ToolCall(
+            tool_name="text_browser_view",
+            arguments='{"url": "http://example.com"}',
+            tool_call_id="call_2",
+        )
+        model_resp = ModelResponse(
+            outputs=[LLMOutputItem(tool_calls=[tc1, tc2])]
+        )
+
+        results = _run_async(Runner._invoke_tool_call(agent, model_resp))
+        assert len(results) == 2
+
+        spans = span_exporter.get_finished_spans()
+        tool_spans = [s for s in spans if "execute_tool" in s.name]
+        assert len(tool_spans) == 2
+
+
+# ---------------------------------------------------------------------------
+# TASK Span Tests (H5: create_sub_agents_wrap)
+# ---------------------------------------------------------------------------
+
+
+class TestTaskSpan:
+    def test_task_span_created(self, span_exporter, instrument):
+        """create_sub_agents closure should produce a TASK span."""
+        from src.agent.multi_agent_tools import create_sub_agents_wrap
+
+        closure = create_sub_agents_wrap(
+            "main-agent", "gpt-4o", {}, [], "system prompt"
+        )
+
+        sub_agents = [
+            {"index": 0, "prompt": "Search for X"},
+            {"index": 1, "prompt": "Search for Y"},
+        ]
+
+        result = _run_async(closure(sub_agents))
+        assert result is not None
+
+        spans = span_exporter.get_finished_spans()
+        task_spans = [
+            s for s in spans if s.name == "run_task create_sub_agents"
+        ]
+        assert len(task_spans) == 1
+
+        span = task_spans[0]
+        attrs = dict(span.attributes)
+        assert attrs.get("gen_ai.span.kind") == "TASK"
+        assert attrs.get("gen_ai.operation.name") == "run_task"
+        assert attrs.get("gen_ai.framework") == "widesearch"
+        assert "input.value" in attrs
+
+    def test_task_span_records_output(self, span_exporter, instrument):
+        """TASK span should record output.value."""
+        from src.agent.multi_agent_tools import create_sub_agents_wrap
+
+        closure = create_sub_agents_wrap(
+            "agent", "gpt-4o", {}, [], "prompt"
+        )
+
+        sub_agents = [{"index": 0, "prompt": "find info"}]
+        result = _run_async(closure(sub_agents))
+
+        spans = span_exporter.get_finished_spans()
+        task_spans = [
+            s for s in spans if s.name == "run_task create_sub_agents"
+        ]
+        assert len(task_spans) == 1
+        attrs = dict(task_spans[0].attributes)
+        assert "output.value" in attrs
+
+    def test_task_span_error(self, span_exporter, instrument):
+        """TASK span should record ERROR when closure raises."""
+        from src.agent.multi_agent_tools import create_sub_agents_wrap
+
+        # Temporarily replace create_sub_agents_wrap's inner closure behavior
+        import src.agent.multi_agent_tools as mat
+
+        original = mat.create_sub_agents_wrap
+
+        def error_factory(*args, **kwargs):
+            original_closure = original(*args, **kwargs)
+
+            async def error_closure(sub_agents):
+                raise RuntimeError("Sub-agent execution failed")
+
+            return error_closure
+
+        mat.create_sub_agents_wrap = error_factory
+
+        # Re-instrument to pick up the new function
+        from opentelemetry.instrumentation.widesearch import WideSearchInstrumentor
+
+        instrument.uninstrument()
+        instrument.instrument(
+            tracer_provider=span_exporter._tracer_provider
+            if hasattr(span_exporter, "_tracer_provider")
+            else None,
+            skip_dep_check=True,
+        )
+
+        # Since re-instrumentation is complex, let's just test the wrapper directly
+        # by calling the instrumented version
+        instrument.uninstrument()
+
+        # Simpler approach: directly test the wrap function
+        from opentelemetry.instrumentation.widesearch.patch import (
+            wrap_create_sub_agents_factory,
+        )
+        from opentelemetry.util.genai.extended_handler import (
+            ExtendedTelemetryHandler,
+        )
+        from opentelemetry.sdk.trace import TracerProvider
+        from opentelemetry.sdk.trace.export import SimpleSpanProcessor
+        from opentelemetry.sdk.trace.export.in_memory_span_exporter import (
+            InMemorySpanExporter,
+        )
+
+        exporter = InMemorySpanExporter()
+        tp = TracerProvider()
+        tp.add_span_processor(SimpleSpanProcessor(exporter))
+        handler = ExtendedTelemetryHandler(tracer_provider=tp)
+
+        def failing_factory(*args, **kwargs):
+            async def failing_closure(sub_agents):
+                raise RuntimeError("Boom")
+
+            return failing_closure
+
+        wrapped_factory = wrap_create_sub_agents_factory(
+            failing_factory, None, (), {}, handler=handler
+        )
+
+        with pytest.raises(RuntimeError, match="Boom"):
+            _run_async(wrapped_factory([{"index": 0, "prompt": "x"}]))
+
+        spans = exporter.get_finished_spans()
+        task_spans = [
+            s for s in spans if s.name == "run_task create_sub_agents"
+        ]
+        assert len(task_spans) == 1
+        assert task_spans[0].status.status_code == StatusCode.ERROR
+
+
+# ---------------------------------------------------------------------------
+# Parent-Child Relationship Tests
+# ---------------------------------------------------------------------------
+
+
+class TestParentChildRelationships:
+    def test_full_hierarchy_entry_agent_step(self, span_exporter, instrument):
+        """Full call through run_single_query should produce ENTRY > AGENT > STEP."""
+        from src.agent.run import run_single_query
+
+        _run_async(run_single_query("hierarchy test", agent_name="root"))
+
+        spans = span_exporter.get_finished_spans()
+        entry_spans = [
+            s for s in spans if s.name == "enter_ai_application_system"
+        ]
+        agent_spans = [s for s in spans if "invoke_agent" in s.name]
+        step_spans = [s for s in spans if s.name == "react step"]
+
+        assert len(entry_spans) == 1
+        assert len(agent_spans) == 1
+        assert len(step_spans) >= 1
+
+        entry = entry_spans[0]
+        agent = agent_spans[0]
+        step = step_spans[0]
+
+        # AGENT is child of ENTRY
+        assert agent.parent.span_id == entry.context.span_id
+        # STEP is child of AGENT
+        assert step.parent.span_id == agent.context.span_id
+
+    def test_tool_span_is_child_of_step(self, span_exporter, instrument):
+        """TOOL span should be child of the STEP span when invoked during a step."""
+        from src.agent.run import Runner
+
+        async def mock_tool(**kwargs):
+            return InternalResponse(data="result")
+
+        agent = Agent(
+            name="hierarchy-agent",
+            tools={"my_tool": mock_tool},
+        )
+
+        async def custom_step(*, agent, memory):
+            tc = ToolCall(
+                tool_name="my_tool",
+                arguments="{}",
+                tool_call_id="tc_hier",
+            )
+            model_resp = ModelResponse(
+                outputs=[LLMOutputItem(tool_calls=[tc])]
+            )
+            await Runner._invoke_tool_call(agent, model_resp)
+            return ActionStep(step_status=StepStatus.FINISHED, content="done")
+
+        Runner._step_override = custom_step
+
+        try:
+            async def _run():
+                async for _ in Runner.run(agent, "test"):
+                    pass
+
+            _run_async(_run())
+        finally:
+            Runner._step_override = None
+
+        spans = span_exporter.get_finished_spans()
+        step_spans = [s for s in spans if s.name == "react step"]
+        tool_spans = [s for s in spans if "execute_tool" in s.name]
+
+        assert len(step_spans) >= 1
+        assert len(tool_spans) >= 1
+
+        step_span = step_spans[0]
+        tool_span = tool_spans[0]
+        assert tool_span.parent.span_id == step_span.context.span_id
diff --git a/instrumentation-loongsuite/loongsuite-instrumentation-wildtool/README.md b/instrumentation-loongsuite/loongsuite-instrumentation-wildtool/README.md
new file mode 100644
index 000000000..1b0499fa4
--- /dev/null
+++ b/instrumentation-loongsuite/loongsuite-instrumentation-wildtool/README.md
@@ -0,0 +1,55 @@
+# LoongSuite WildToolBench Instrumentation
+
+OpenTelemetry instrumentation for the [WildToolBench](https://github.com/yupeijei1997/WildToolBench) benchmark framework.
+
+## Installation
+
+WildToolBench is not available on PyPI. Install it from source:
+
+```bash
+pip install -e /path/to/WildToolBench/wild-tool-bench
+pip install loongsuite-instrumentation-wildtool
+```
+
+## Requirements
+
+- **OpenAI provider instrumentation**: To produce LLM spans, you must also enable an OpenAI provider instrumentation (e.g., `opentelemetry-instrumentation-openai` or LoongSuite's equivalent). This plugin creates ENTRY/AGENT/CHAIN/STEP/TOOL spans but does **not** create LLM spans itself.
+
+## Usage
+
+```python
+from opentelemetry.instrumentation.wildtool import WildToolInstrumentor
+
+WildToolInstrumentor().instrument()
+
+# Run WildToolBench as usual — spans are automatically generated.
+```
+
+## Span Topology
+
+```
+ENTRY (enter_ai_application_system)
+└── AGENT (invoke_agent wildtool)
+    └── CHAIN (workflow task_{idx})
+        └── STEP (react step)
+            ├── [LLM span — provider instrumentation]
+            └── TOOL (execute_tool {tool_name})
+```
+
+## Patch Points
+
+| # | Target | Span Type |
+|---|--------|-----------|
+| P1 | `multi_threaded_inference` | ENTRY |
+| P2 | `BaseHandler.inference_multi_turn` | AGENT |
+| P3 | `BaseHandler.inference_and_eval_multi_step` | CHAIN + TOOL |
+| P4 | `BaseHandler._request_tool_call` | STEP |
+| P5 | `BaseHandler._parse_api_response` | (token extraction) |
+
+## Round 2 fixes (see `llm-dev/execute.md` § "修订记录 (Round 2 fix)")
+
+- **H1**: TOOL span is now parented on STEP, not CHAIN. Strategy A enhanced — the chain wrapper holds a `round → STEP span` map and uses `trace.set_span_in_context(step_span)` to anchor each post-hoc TOOL span on the matching STEP. STEP `SpanContext`s remain valid parents even after `end()`.
+- **H2 (provider-name fallback)**: `opentelemetry-instrumentation-openai-v2 == 0.62b1` only emits the legacy `gen_ai.system` attribute on its LLM span; the new `gen_ai.provider.name` attribute is missing. As a *pure fallback*, the wildtool plugin writes both `gen_ai.system="openai"` and `gen_ai.provider.name="openai"` on the **STEP** span (not on the LLM span — that is owned by the OpenAI v2 instrumentation and we do **not** patch it). Once the upstream OpenAI v2 instrumentation emits `gen_ai.provider.name` natively, this fallback can be removed.
+- **M1**: CHAIN span now carries `input.value` (last user message in `inference_data["messages"]`, truncated to 4096 chars) and `output.value` (JSON of `action_name_label`/`task_idx`/`is_optimal`).
+- **M2**: STEP span now carries `gen_ai.react.finish_reason` on error paths. Mapping table is in `execute.md` § "M2: gen_ai.react.finish_reason 取值映射".
+- **M3**: TOOL span explicitly writes `gen_ai.tool.call.arguments` / `gen_ai.tool.call.result` / `gen_ai.tool.description`, bypassing `OTEL_INSTRUMENTATION_GENAI_CAPTURE_*` gating in `opentelemetry-util-genai`. The custom `wildtool.tool.execution_mode = "ground_truth_replay"` is preserved.
diff --git a/instrumentation-loongsuite/loongsuite-instrumentation-wildtool/pyproject.toml b/instrumentation-loongsuite/loongsuite-instrumentation-wildtool/pyproject.toml
new file mode 100644
index 000000000..b8f9f44d0
--- /dev/null
+++ b/instrumentation-loongsuite/loongsuite-instrumentation-wildtool/pyproject.toml
@@ -0,0 +1,66 @@
+[build-system]
+requires = ["hatchling"]
+build-backend = "hatchling.build"
+
+[project]
+name = "loongsuite-instrumentation-wildtool"
+dynamic = ["version"]
+description = "LoongSuite WildToolBench Instrumentation"
+readme = "README.md"
+license = "Apache-2.0"
+requires-python = ">=3.9"
+authors = [
+    { name = "LoongSuite Python Agent Authors", email = "caishipeng.csp@alibaba-inc.com" },
+    { name = "OpenTelemetry Authors", email = "cncf-opentelemetry-contributors@lists.cncf.io" },
+]
+classifiers = [
+    "Development Status :: 4 - Beta",
+    "Intended Audience :: Developers",
+    "License :: OSI Approved :: Apache Software License",
+    "Programming Language :: Python",
+    "Programming Language :: Python :: 3",
+    "Programming Language :: Python :: 3.9",
+    "Programming Language :: Python :: 3.10",
+    "Programming Language :: Python :: 3.11",
+    "Programming Language :: Python :: 3.12",
+    "Programming Language :: Python :: 3.13",
+]
+dependencies = [
+    "opentelemetry-api ~= 1.37",
+    "opentelemetry-instrumentation >= 0.58b0",
+    "opentelemetry-semantic-conventions >= 0.58b0",
+    "opentelemetry-util-genai",
+    "wrapt >= 1.17.3, < 3.0.0",
+]
+
+[project.optional-dependencies]
+instruments = [
+    "openai >= 1.0.0",
+]
+
+test = [
+    "pytest ~= 8.0",
+    "pytest-cov ~= 4.1.0",
+    "pytest-forked >= 1.6.0",
+    "opentelemetry-sdk >= 1.37",
+    "openai >= 1.0.0",
+]
+
+[project.entry-points.opentelemetry_instrumentor]
+wildtool = "opentelemetry.instrumentation.wildtool:WildToolInstrumentor"
+
+[project.urls]
+Homepage = "https://github.com/alibaba/loongsuite-python-agent/tree/main/instrumentation-loongsuite/loongsuite-instrumentation-wildtool"
+Repository = "https://github.com/alibaba/loongsuite-python-agent"
+
+[tool.hatch.version]
+path = "src/opentelemetry/instrumentation/wildtool/version.py"
+
+[tool.hatch.build.targets.sdist]
+include = [
+    "/src",
+    "/tests",
+]
+
+[tool.hatch.build.targets.wheel]
+packages = ["src/opentelemetry"]
diff --git a/instrumentation-loongsuite/loongsuite-instrumentation-wildtool/src/opentelemetry/instrumentation/wildtool/__init__.py b/instrumentation-loongsuite/loongsuite-instrumentation-wildtool/src/opentelemetry/instrumentation/wildtool/__init__.py
new file mode 100644
index 000000000..dad772500
--- /dev/null
+++ b/instrumentation-loongsuite/loongsuite-instrumentation-wildtool/src/opentelemetry/instrumentation/wildtool/__init__.py
@@ -0,0 +1,161 @@
+"""OpenTelemetry WildToolBench Instrumentation"""
+
+import logging
+from typing import Any, Collection
+
+from wrapt import wrap_function_wrapper
+
+from opentelemetry.instrumentation.instrumentor import BaseInstrumentor
+from opentelemetry.instrumentation.utils import unwrap
+from opentelemetry.instrumentation.wildtool.package import _instruments
+from opentelemetry.instrumentation.wildtool.version import __version__
+from opentelemetry.instrumentation.wildtool._wrappers import (
+    WildToolAgentWrapper,
+    WildToolChainWrapper,
+    WildToolEntryWrapper,
+    WildToolParseWrapper,
+    WildToolRequestWrapper,
+)
+from opentelemetry.util.genai.extended_handler import ExtendedTelemetryHandler
+
+logger = logging.getLogger(__name__)
+
+_LLM_RESPONSE_GEN_MODULE = "wtb._llm_response_generation"  # module patched for multi_threaded_inference (P1)
+_BASE_HANDLER_MODULE = "wtb.model_handler.base_handler"  # module holding BaseHandler (P2 / P3)
+
+__all__ = ["WildToolInstrumentor", "__version__"]
+
+
+class WildToolInstrumentor(BaseInstrumentor):
+    """OpenTelemetry instrumentor for WildToolBench framework."""
+
+    def __init__(self):
+        super().__init__()
+        self._handler = None
+        # Track concrete handler subclasses whose abstract _request_tool_call /
+        # _parse_api_response we have already wrapped, so we can unwrap on
+        # uninstrument and avoid double-wrapping.
+        self._patched_handler_classes: set = set()
+        self._request_wrapper = None
+        self._parse_wrapper = None
+
+    def instrumentation_dependencies(self) -> Collection[str]:
+        """Return ``_instruments`` as declared in the sibling ``package`` module."""
+        return _instruments
+
+    def _instrument(self, **kwargs: Any) -> None:
+        """Create the telemetry handler and wrap the static patch points P1-P3."""
+        tracer_provider = kwargs.get("tracer_provider")
+        meter_provider = kwargs.get("meter_provider")
+        logger_provider = kwargs.get("logger_provider")
+
+        self._handler = ExtendedTelemetryHandler(
+            tracer_provider=tracer_provider,
+            meter_provider=meter_provider,
+            logger_provider=logger_provider,
+        )
+        self._request_wrapper = WildToolRequestWrapper(self._handler)
+        self._parse_wrapper = WildToolParseWrapper(self._handler)
+
+        # P1: ENTRY span
+        try:
+            wrap_function_wrapper(
+                _LLM_RESPONSE_GEN_MODULE,
+                "multi_threaded_inference",
+                WildToolEntryWrapper(self._handler),
+            )
+        except Exception as e:
+            logger.warning("Failed to instrument multi_threaded_inference: %s", e)
+
+        # P2: AGENT span
+        try:
+            wrap_function_wrapper(
+                _BASE_HANDLER_MODULE,
+                "BaseHandler.inference_multi_turn",
+                WildToolAgentWrapper(self._handler),
+            )
+        except Exception as e:
+            logger.warning("Failed to instrument inference_multi_turn: %s", e)
+
+        # P3: CHAIN span (+ STEP + TOOL management).
+        # The chain wrapper also lazily patches the concrete subclass'
+        # `_request_tool_call` / `_parse_api_response` on first use, so that
+        # subclasses overriding the abstract base methods are still
+        # intercepted (P4 / P5).
+        try:
+            wrap_function_wrapper(
+                _BASE_HANDLER_MODULE,
+                "BaseHandler.inference_and_eval_multi_step",
+                WildToolChainWrapper(self._handler, self),
+            )
+        except Exception as e:
+            logger.warning(
+                "Failed to instrument inference_and_eval_multi_step: %s", e
+            )
+
+    def ensure_handler_class_patched(self, handler_cls) -> None:
+        """Lazily wrap the concrete handler subclass' P4/P5 methods.
+
+        WildToolBench declares ``_request_tool_call`` and ``_parse_api_response``
+        as abstract on ``BaseHandler``, but real handlers (and tests) override
+        them. Python method resolution dispatches directly to the override and
+        therefore never reaches a wrapper installed on the base class. We
+        instead wrap the override on first invocation per subclass.
+
+        The class object itself is passed to ``wrap_function_wrapper``: looking
+        it up again by ``__module__`` / ``__name__`` fails for nested classes
+        and for classes defined inside a function (typical test doubles).
+        """
+        if handler_cls in self._patched_handler_classes:
+            return
+        if self._request_wrapper is None or self._parse_wrapper is None:
+            return  # not instrumented: never install a ``None`` wrapper
+        self._patched_handler_classes.add(handler_cls)
+
+        for method, wrapper in (
+            ("_request_tool_call", self._request_wrapper),
+            ("_parse_api_response", self._parse_wrapper),
+        ):
+            if method not in handler_cls.__dict__:
+                continue
+            try:
+                wrap_function_wrapper(handler_cls, method, wrapper)
+            except Exception as e:
+                logger.debug(
+                    "Failed to wrap %s.%s.%s: %s",
+                    handler_cls.__module__,
+                    handler_cls.__name__,
+                    method,
+                    e,
+                )
+
+    def _uninstrument(self, **kwargs: Any) -> None:
+        """Unwrap every patched function and reset the instrumentor state."""
+        try:
+            import wtb._llm_response_generation as llm_gen
+
+            unwrap(llm_gen, "multi_threaded_inference")
+        except Exception as e:
+            logger.debug("Failed to uninstrument multi_threaded_inference: %s", e)
+
+        try:
+            import wtb.model_handler.base_handler as bh
+
+            unwrap(bh.BaseHandler, "inference_multi_turn")
+            unwrap(bh.BaseHandler, "inference_and_eval_multi_step")
+        except Exception as e:
+            logger.debug("Failed to uninstrument BaseHandler methods: %s", e)
+
+        for cls in list(self._patched_handler_classes):
+            for method in ("_request_tool_call", "_parse_api_response"):
+                if method in cls.__dict__:
+                    try:
+                        unwrap(cls, method)
+                    except Exception as e:
+                        logger.debug(
+                            "Failed to unwrap %s.%s: %s", cls.__name__, method, e
+                        )
+        self._patched_handler_classes.clear()
+        self._request_wrapper = None
+        self._parse_wrapper = None
+        self._handler = None
diff --git a/instrumentation-loongsuite/loongsuite-instrumentation-wildtool/src/opentelemetry/instrumentation/wildtool/_wrappers.py b/instrumentation-loongsuite/loongsuite-instrumentation-wildtool/src/opentelemetry/instrumentation/wildtool/_wrappers.py
new file mode 100644
index 000000000..8b16d5247
--- /dev/null
+++ b/instrumentation-loongsuite/loongsuite-instrumentation-wildtool/src/opentelemetry/instrumentation/wildtool/_wrappers.py
@@ -0,0 +1,857 @@
+"""Wrapper classes for WildToolBench instrumentation.
+
+Each wrapper corresponds to one patch point and manages the lifecycle
+of one or more span types.
+
+Round 2 fix highlights (see ``llm-dev/execute.md`` § "修订记录 (Round 2 fix)"):
+
+H1
+    TOOL span parent is now STEP rather than CHAIN. Each STEP invocation is
+    appended to a per-chain list in :data:`_chain_step_invocations`; when the
+    chain wrapper post-processes ``inference_log`` it looks up the matching
+    STEP span by ``round`` and uses
+    :func:`opentelemetry.trace.set_span_in_context` so ``start_execute_tool``
+    parents the TOOL span on the STEP context (even if STEP is already
+    closed — its :class:`SpanContext` remains a valid parent reference).
+
+H2
+    The OpenAI v2 provider instrumentation (0.62b1) writes only the legacy
+    ``gen_ai.system`` attribute to its LLM span. The wildtool plugin now
+    writes both ``gen_ai.system`` and ``gen_ai.provider.name`` on the STEP
+    span as a fallback so the new semantic-conventions attribute is present
+    in the trace tree even before the upstream OpenAI v2 instrumentation
+    catches up. We do **not** patch the OpenAI v2 instrumentation itself.
+
+M1
+    ``input.value`` (last user message in the chain's ``messages``, truncated
+    to 4096 chars) and ``output.value`` (a JSON of action label, task index
+    and is_optimal) are written on the CHAIN span.
+
+M2
+    ``gen_ai.react.finish_reason`` is derived from ``inference_log`` on the
+    *last* (currently active) STEP. Mappings:
+
+    ``"parse_tool_calls_failed"``
+        ``error_reason`` contains "parse tool_calls failed".
+    ``"action_name_mismatch"``
+        ``error_reason`` contains "action name not in candidate".
+    ``"empty_response"``
+        ``error_reason`` contains "tool_calls and content are None".
+    ``"error"``
+        request raised an exception (handled in
+        :class:`WildToolRequestWrapper`).
+
+M3
+    ``gen_ai.tool.call.arguments``, ``gen_ai.tool.call.result`` and
+    ``gen_ai.tool.description`` are written explicitly on TOOL spans
+    *before* close as a fallback. ``opentelemetry-util-genai`` gates these
+    sensitive attributes behind ``OTEL_INSTRUMENTATION_GENAI_CAPTURE_*`` env
+    vars; the wildtool plugin always writes them since wtb data is
+    benchmark-synthetic and never PII.
+"""
+
+import json
+import logging
+from contextvars import ContextVar
+from dataclasses import asdict
+from typing import List, Optional
+
+from opentelemetry.trace import StatusCode, set_span_in_context
+from opentelemetry.util.genai.extended_handler import ExtendedTelemetryHandler
+from opentelemetry.util.genai.extended_types import (
+    EntryInvocation,
+    ExecuteToolInvocation,
+    InvokeAgentInvocation,
+    ReactStepInvocation,
+)
+from opentelemetry.util.genai.types import (
+    Error,
+    InputMessage,
+    OutputMessage,
+    Text,
+)
+
+logger = logging.getLogger(__name__)
+
+# ─────────────────────────── ContextVars ───────────────────────────────
+# The CHAIN wrapper opens a new logical "chain" by flipping ``_in_chain``
+# and resetting the counter. The REQUEST wrapper reads these to decide
+# whether to create a STEP span and what round number to assign.
+_in_chain: ContextVar[bool] = ContextVar("_wt_in_chain", default=False)
+
+# Currently open STEP invocation. Used by the parse wrapper to attach
+# token attributes to the right span.
+_step_invocation: ContextVar[Optional[ReactStepInvocation]] = ContextVar(
+    "_wt_step_inv", default=None
+)
+_step_counter: ContextVar[int] = ContextVar("_wt_step_ctr", default=0)  # set to 0 by the chain wrapper on entry
+
+# Per-chain list of every STEP invocation created in the current chain
+# (in `round` order). The chain wrapper allocates this list on entry and
+# uses it after ``wrapped`` returns to re-parent TOOL spans onto the
+# matching STEP. Even if a STEP span is already ``end()``-ed, its
+# :class:`SpanContext` stays valid as a parent reference for new spans.
+_chain_step_invocations: ContextVar[Optional[List[ReactStepInvocation]]] = (
+    ContextVar("_wt_chain_step_invs", default=None)
+)
+
+_PROVIDER_FALLBACK_NAME = "openai"  # H2 fallback provider value (see module docstring); use site not in view
+_INPUT_VALUE_MAX_CHARS = 4096  # cap applied to the CHAIN span ``input.value`` (M1)
+_MESSAGE_CONTENT_MAX_CHARS = 4096  # cap applied to each message text built from tasks / task results
+
+
+def _close_active_step(handler: ExtendedTelemetryHandler) -> None:
+    """Stop the STEP invocation stored in ``_step_invocation`` and clear it."""
+    if (open_step := _step_invocation.get()) is None:
+        return
+    try:
+        handler.stop_react_step(open_step)
+    except Exception as exc:  # noqa: BLE001
+        logger.debug("Failed to close step: %s", exc)
+    _step_invocation.set(None)
+
+
+def _truncate(text: str, max_chars: int) -> str:
+    """Return ``text``, cut to ``max_chars`` plus a marker when it is longer."""
+    overflow = len(text) > max_chars
+    return text[:max_chars] + "...(truncated)" if overflow else text
+
+
+def _stringify(value) -> str:
+    """Return strings unchanged; JSON-encode other values, else use ``str()``."""
+    already_text = isinstance(value, str)
+    try:
+        return value if already_text else json.dumps(value, ensure_ascii=False)
+    except (TypeError, ValueError):
+        return str(value)
+
+
+def _tasks_to_input_messages(test_entry) -> List[InputMessage]:
+    """Convert ``test_entry["english_tasks"]`` into user ``InputMessage`` objects.
+
+    Anything other than a dict holding a list of tasks yields an empty list.
+    Tasks equal to ``None``, ``""``, ``[]`` or ``{}`` are skipped; every other
+    task is stringified and truncated to ``_MESSAGE_CONTENT_MAX_CHARS``.
+
+    Returns:
+        One ``InputMessage`` with role ``"user"`` per remaining task, in order.
+    """
+    tasks = None
+    if isinstance(test_entry, dict):
+        tasks = test_entry.get("english_tasks")
+    if not isinstance(tasks, list):
+        return []
+
+    def _as_user_message(task) -> InputMessage:
+        text = _truncate(_stringify(task), _MESSAGE_CONTENT_MAX_CHARS)
+        return InputMessage(role="user", parts=[Text(content=text)])
+
+    # None / empty-string / empty-container tasks are skipped.
+    return [
+        _as_user_message(task) for task in tasks if task not in (None, "", [], {})
+    ]
+
+
+def _task_results_to_output_messages(result) -> List[OutputMessage]:
+    """Turn a wtb inference result into assistant ``OutputMessage`` objects.
+
+    Per-task results come from ``_extract_task_results``. A task whose output
+    (``_extract_task_result_output``) equals ``None``, ``""``, ``[]`` or ``{}``
+    is skipped; the rest are stringified and truncated to
+    ``_MESSAGE_CONTENT_MAX_CHARS``.
+    """
+    output_messages: List[OutputMessage] = []
+    for entry in _extract_task_results(result):
+        payload = _extract_task_result_output(entry)
+        if payload not in (None, "", [], {}):
+            text = _truncate(_stringify(payload), _MESSAGE_CONTENT_MAX_CHARS)
+            output_messages.append(
+                OutputMessage(
+                    role="assistant",
+                    parts=[Text(content=text)],
+                    finish_reason=_extract_finish_reason(entry),
+                )
+            )
+    return output_messages
+
+
+def _get_message_attributes(input_messages, output_messages) -> dict:
+    """Serialize non-empty message lists into ``gen_ai.*.messages`` JSON strings."""
+    collected = {}
+    try:
+        for attr_name, batch in (
+            ("gen_ai.input.messages", input_messages),
+            ("gen_ai.output.messages", output_messages),
+        ):
+            # Falsy (``None`` / empty) batches contribute no attribute.
+            if batch:
+                rows = [asdict(message) for message in batch]
+                collected[attr_name] = json.dumps(rows, ensure_ascii=False)
+    except Exception as exc:  # noqa: BLE001
+        # Best effort: keep whatever was serialized before the failure.
+        logger.debug("Failed to serialize message attrs: %s", exc)
+    return collected
+
+
+def _set_message_attributes(invocation) -> None:
+    """Merge serialized message attributes into ``invocation`` and its live span."""
+    message_attrs = _get_message_attributes(
+        invocation.input_messages, invocation.output_messages
+    )
+    if message_attrs:
+        invocation.attributes.update(message_attrs)
+        target_span = invocation.span
+        # The span write is skipped when there is no span or it is not recording.
+        if target_span is not None and target_span.is_recording():
+            try:
+                target_span.set_attributes(message_attrs)
+            except Exception as exc:  # noqa: BLE001
+                logger.debug("Failed to set message attrs: %s", exc)
+
+
+def _extract_task_results(result) -> List:
+    """Locate the per-task result list inside a wtb inference result.
+
+    Lists pass through unchanged. A dict is searched under well-known container
+    keys, else wrapped whole when it carries task-level keys; otherwise ``[]``.
+    """
+    if isinstance(result, list):
+        return result
+    if not isinstance(result, dict):
+        return []
+
+    container_keys = (
+        "result",
+        "results",
+        "inference_result",
+        "inference_results",
+        "result_list",
+        "task_results",
+        "answer",
+        "answers",
+    )
+    task_level_keys = (
+        "action_name_label",
+        "is_optimal",
+        "inference_log",
+        "inference_output",
+        "final_answer",
+    )
+    for name in container_keys:
+        candidate = result.get(name)
+        if isinstance(candidate, list):
+            return candidate
+        if isinstance(candidate, dict) or candidate not in (None, "", [], {}):
+            return [candidate]
+
+    return [result] if any(key in result for key in task_level_keys) else []
+
+
+def _extract_task_result_output(task_result):
+    """Pick the most specific output available for one task result.
+
+    A non-dict is returned as-is. For a dict the order is: first non-empty
+    direct answer key, then the ``inference_log`` output, then a summary of
+    ``action_name_label`` / ``is_optimal``; ``None`` when nothing applies.
+    """
+    if not isinstance(task_result, dict):
+        return task_result
+    empty_values = (None, "", [], {})
+    for field in ("final_answer", "answer", "output", "result"):
+        direct = task_result.get(field)
+        if direct not in empty_values:
+            return direct
+    logged = _extract_output_from_inference_log(task_result.get("inference_log"))
+    if logged not in empty_values:
+        return logged
+    label = task_result.get("action_name_label")
+    if label is None and "is_optimal" not in task_result:
+        return None
+    return dict(action_name_label=label, is_optimal=task_result.get("is_optimal"))
+
+
+def _extract_output_from_inference_log(inference_log):
+    """Return the newest non-empty output recorded in a wtb ``inference_log``.
+
+    Steps are scanned from the highest ``step_<n>`` down; each tries the model
+    output fields, the first candidate ``observation``, then the whole answer.
+    """
+    if not isinstance(inference_log, dict):
+        return None
+    empty_values = (None, "", [], {})
+    output_fields = (
+        "content",
+        "reasoning_content",
+        "current_action_name_label",
+        "error_reason",
+    )
+    step_keys = [name for name in inference_log if name.startswith("step_")]
+    for step_key in sorted(step_keys, key=_step_log_sort_key, reverse=True):
+        step = inference_log.get(step_key)
+        if not isinstance(step, dict):
+            continue
+        model_output = step.get("inference_output")
+        if isinstance(model_output, dict):
+            for field in output_fields:
+                found = model_output.get(field)
+                if found not in empty_values:
+                    return found
+        answer = step.get("inference_answer")
+        if isinstance(answer, dict):
+            first = answer.get("candidate_0_answer_function_list")
+            obs = first.get("observation") if isinstance(first, dict) else None
+            if obs not in empty_values:
+                return obs
+            if answer not in empty_values:
+                return answer
+    return None
+
+
+def _step_log_sort_key(key: str) -> int:  # sort key for ``step_<n>`` log entries
+    try:
+        return int(key[len("step_"):])  # numeric suffix, e.g. "step_12" -> 12
+    except (TypeError, ValueError):
+        return -1  # sentinel when the suffix cannot be parsed as an int
+
+
+def _extract_finish_reason(task_result) -> str:
+    """Map a task result to ``"error"`` (label is ``"error"``) or ``"stop"``."""
+    # Anything that is not a dict counts as a normal stop.
+    is_dict = isinstance(task_result, dict)
+    failed = is_dict and task_result.get("action_name_label") == "error"
+    return "error" if failed else "stop"
+
+
+class WildToolEntryWrapper:
+    """P1: Wraps multi_threaded_inference → ENTRY span (telemetry is best-effort)."""
+
+    def __init__(self, handler: ExtendedTelemetryHandler):
+        self._handler = handler
+
+    def __call__(self, wrapped, instance, args, kwargs):
+        # Signature: multi_threaded_inference(handler, model_name, test_case).
+        model_name = args[1] if len(args) > 1 else kwargs.get("model_name", "")
+        test_case = args[2] if len(args) > 2 else kwargs.get("test_case", {})
+        case = test_case if isinstance(test_case, dict) else {}  # None-safe
+        tasks = case.get("english_tasks")
+        invocation = EntryInvocation(
+            session_id=case.get("id"),
+            input_messages=_tasks_to_input_messages(case),
+            attributes={
+                "gen_ai.framework": "wildtool",
+                "gen_ai.request.model": model_name,
+                "wildtool.turn_count": len(tasks) if isinstance(tasks, list) else 0,
+            },
+        )
+        self._handler.start_entry(invocation)
+        _set_message_attributes(invocation)
+        try:
+            result = wrapped(*args, **kwargs)
+        except Exception as e:
+            error = Error(message=str(e), type=type(e))
+            self._handler.fail_entry(invocation, error)
+            raise
+        try:  # telemetry errors must not fail a successful call
+            invocation.output_messages = _task_results_to_output_messages(result)
+            _set_message_attributes(invocation)
+        except Exception as e:  # noqa: BLE001
+            logger.debug("Failed to record entry output: %s", e)
+        self._handler.stop_entry(invocation)
+        return result
+
+
+class WildToolAgentWrapper:
+    """P2: Wraps BaseHandler.inference_multi_turn → AGENT span (best-effort)."""
+
+    def __init__(self, handler: ExtendedTelemetryHandler):
+        self._handler = handler
+
+    @staticmethod
+    def _sum_tokens(value) -> int:
+        """Sum a token count given as a number or as (nested) lists of numbers."""
+        if isinstance(value, (list, tuple)):
+            return sum(WildToolAgentWrapper._sum_tokens(item) for item in value)
+        return value if isinstance(value, (int, float)) else 0
+
+    def __call__(self, wrapped, instance, args, kwargs):
+        test_entry = args[0] if args else kwargs.get("test_entry", {})
+        entry = test_entry if isinstance(test_entry, dict) else {}  # None-safe
+        answers = entry.get("english_answer_list")
+        invocation = InvokeAgentInvocation(
+            provider=None,
+            agent_name=type(instance).__name__,
+            input_messages=_tasks_to_input_messages(entry),
+            conversation_id=entry.get("id"),
+            request_model=getattr(instance, "model_name", None),
+            attributes={
+                "gen_ai.framework": "wildtool",
+                "wildtool.turn_count": len(answers) if isinstance(answers, list) else 0,
+            },
+        )
+        self._handler.start_invoke_agent(invocation)
+        _set_message_attributes(invocation)
+        try:
+            result = wrapped(*args, **kwargs)
+        except Exception as e:
+            error = Error(message=str(e), type=type(e))
+            self._handler.fail_invoke_agent(invocation, error)
+            raise
+        try:  # telemetry errors must not fail a successful call
+            invocation.output_messages = _task_results_to_output_messages(result)
+            _set_message_attributes(invocation)
+            rows = [row for row in (result or []) if isinstance(row, dict)]
+            tokens_in = self._sum_tokens([r.get("input_token_count") for r in rows])
+            tokens_out = self._sum_tokens([r.get("output_token_count") for r in rows])
+            if tokens_in:
+                invocation.input_tokens = tokens_in
+            if tokens_out:
+                invocation.output_tokens = tokens_out
+        except Exception as e:  # noqa: BLE001
+            logger.debug("Failed to record agent output: %s", e)
+        self._handler.stop_invoke_agent(invocation)
+        return result
+
+
+class WildToolChainWrapper:
+    """P3: Wraps BaseHandler.inference_and_eval_multi_step → CHAIN span.
+
+    Also manages the lifecycle of the final STEP span and creates TOOL spans
+    from the returned ``inference_log`` after the original function completes.
+    Round 2 fixes (H1/M1/M2/M3) are implemented here.
+    """
+
+    def __init__(self, handler: ExtendedTelemetryHandler, instrumentor=None):
+        """Store the telemetry handler and the optional owning instrumentor."""
+        self._handler, self._instrumentor = handler, instrumentor
+
+    def __call__(self, wrapped, instance, args, kwargs):
+        """Run ``wrapped`` inside a CHAIN span and post-process its result."""
+        if self._instrumentor is not None and instance is not None:
+            try:
+                self._instrumentor.ensure_handler_class_patched(type(instance))
+            except Exception as e:  # noqa: BLE001
+                logger.debug("Failed to ensure subclass patched: %s", e)
+
+        inference_data = args[0] if args else kwargs.get("inference_data", {})
+        if not isinstance(inference_data, dict):
+            inference_data = {}
+        task_idx = inference_data.get("task_idx", 0)
+        test_entry_id = inference_data.get("test_entry_id", "")
+
+        tracer = self._handler._tracer
+
+        chain_token = _in_chain.set(True)
+        counter_token = _step_counter.set(0)
+        step_token = _step_invocation.set(None)
+        chain_steps: List[ReactStepInvocation] = []
+        chain_steps_token = _chain_step_invocations.set(chain_steps)
+
+        chain_attributes = {
+            "gen_ai.span.kind": "CHAIN",
+            "gen_ai.operation.name": "workflow",
+            "gen_ai.framework": "wildtool",
+            "wildtool.task_idx": task_idx,
+            "wildtool.test_entry_id": test_entry_id,
+        }
+
+        # M1: Capture last user message as ``input.value`` BEFORE running the
+        # wrapped function (the wtb function mutates ``messages`` in place).
+        input_value = self._extract_input_value(inference_data)
+        if input_value is not None:
+            chain_attributes["input.value"] = input_value
+
+        with tracer.start_as_current_span(
+            name=f"workflow task_{task_idx}", attributes=chain_attributes
+        ) as span:
+            try:
+                result = wrapped(*args, **kwargs)
+
+                # M2: Set finish_reason on the currently active (last) STEP
+                # BEFORE we close it. Only the terminal step ever carries an
+                # error finish_reason (every wtb error path triggers `break`).
+                if isinstance(result, dict):
+                    self._apply_last_step_finish_reason(
+                        result.get("inference_log", {})
+                    )
+
+                _close_active_step(self._handler)
+
+                if isinstance(result, dict):
+                    label = result.get("action_name_label", "")
+                    is_optimal = bool(result.get("is_optimal", False))
+                    span.set_attribute("wildtool.action_name_label", label)
+                    span.set_attribute("wildtool.is_optimal", is_optimal)
+
+                    # M1: ``output.value`` summarising chain outcome.
+                    try:
+                        span.set_attribute(
+                            "output.value",
+                            json.dumps(
+                                {
+                                    "action_name_label": label,
+                                    "task_idx": task_idx,
+                                    "is_optimal": is_optimal,
+                                },
+                                ensure_ascii=False,
+                            ),
+                        )
+                    except Exception as e:  # noqa: BLE001
+                        logger.debug("Failed to set output.value: %s", e)
+
+                    # H1 + M3: re-parent TOOL spans on STEP and force-write
+                    # tool call sensitive attributes.
+                    self._create_tool_spans_from_log(
+                        result.get("inference_log", {}),
+                        inference_data,
+                        chain_steps,
+                    )
+
+                span.set_status(StatusCode.OK)
+                return result
+            except Exception:
+                # ``start_as_current_span`` already records the exception and sets
+                # ERROR status on exit; recording it here too duplicated the event.
+                _close_active_step(self._handler)
+                raise
+            finally:
+                _chain_step_invocations.reset(chain_steps_token)
+                _step_counter.reset(counter_token)
+                _step_invocation.reset(step_token)
+                _in_chain.reset(chain_token)
+
+    # -- M1 ---------------------------------------------------------------
+
+    @staticmethod
+    def _extract_input_value(inference_data) -> Optional[str]:
+        """Return the newest user message content (stringified, truncated) or None."""
+        if not isinstance(inference_data, dict):
+            return None
+        history = inference_data.get("messages")
+        if not isinstance(history, list):
+            return None
+        # Walk newest-first so the most recent user turn is the one reported.
+        for entry in history[::-1]:
+            if isinstance(entry, dict) and entry.get("role") == "user":
+                content = entry.get("content")
+                # A user message whose content is None is skipped.
+                if content is not None:
+                    return _truncate(_stringify(content), _INPUT_VALUE_MAX_CHARS)
+        return None
+
+    # -- M2 ---------------------------------------------------------------
+
+    def _apply_last_step_finish_reason(self, inference_log) -> None:
+        """M2: copy the active STEP's error reason from ``inference_log`` onto it."""
+        active = _step_invocation.get()
+        if not isinstance(inference_log, dict) or active is None:
+            return
+        if active.round is None:
+            return
+        logged_step = inference_log.get(f"step_{active.round - 1}")
+        if not isinstance(logged_step, dict):
+            return
+        model_output = logged_step.get("inference_output") or {}
+        if not isinstance(model_output, dict):
+            return
+        finish_reason = self._derive_step_finish_reason(
+            model_output.get("current_action_name_label"),
+            model_output.get("error_reason") or "",
+        )
+        # Assigning the field is sufficient: per the util-genai helper
+        # ``_apply_react_step_finish_attributes`` it is exported as
+        # ``gen_ai.react.finish_reason`` when the step is stopped.
+        if finish_reason is not None:
+            active.finish_reason = finish_reason
+
+    @staticmethod
+    def _derive_step_finish_reason(label, error_reason: str) -> Optional[str]:
+        """Map a wtb step error onto a ``gen_ai.react.finish_reason`` (or None)."""
+        if label != "error":
+            return None
+        # Checked in order; the first marker contained in ``error_reason`` wins.
+        known_markers = (
+            ("parse tool_calls failed", "parse_tool_calls_failed"),
+            ("action name not in candidate", "action_name_mismatch"),
+            ("tool_calls and content are None", "empty_response"),
+        )
+        hits = (name for text, name in known_markers if text in error_reason)
+        # Any other error text falls back to the generic reason.
+        return next(hits, "error")
+
+    # -- H1 + M3 ----------------------------------------------------------
+
+    def _create_tool_spans_from_log(
+        self,
+        inference_log,
+        inference_data,
+        chain_steps: List[ReactStepInvocation],
+    ) -> None:
+        """Post-hoc TOOL span creation from inference_log.
+
+        Every ``step_<n>`` entry labelled "correct" yields one TOOL span per
+        tool call, parented on the STEP span of round ``n + 1`` when one was
+        recorded (H1); steps are visited in numeric order.  Sensitive
+        tool-call attributes are written explicitly on the span (M3).
+        """
+        if not isinstance(inference_log, dict):
+            return
+
+        # round → SpanContext-bearing OTel context for parenting
+        step_ctx_by_round = {}
+        for step_inv in chain_steps:
+            if step_inv.round is None or step_inv.span is None:
+                continue
+            try:
+                step_ctx_by_round[step_inv.round] = set_span_in_context(
+                    step_inv.span
+                )
+            except Exception as e:  # noqa: BLE001
+                logger.debug("Failed to compute step parent context: %s", e)
+
+        # A non-dict inference_data contributes no tools and no messages.
+        data = inference_data if isinstance(inference_data, dict) else {}
+        # tool name → description (for gen_ai.tool.description)
+        tool_desc_map = {}
+        tools = data.get("tools")
+        if isinstance(tools, list):
+            for tool in tools:
+                if not isinstance(tool, dict):
+                    continue
+                func = tool.get("function") or tool
+                if not isinstance(func, dict):
+                    continue
+                name = func.get("name")
+                if name:
+                    tool_desc_map[name] = func.get("description")
+
+        # Extract tool observations from final messages keyed by tool_call_id;
+        # wtb only embeds them in messages (not in inference_answer) for the
+        # tool_call branch.
+        observation_by_call_id = {}
+        messages = data.get("messages")
+        if isinstance(messages, list):
+            for msg in messages:
+                if not isinstance(msg, dict) or msg.get("role") != "tool":
+                    continue
+                tid = msg.get("tool_call_id")
+                if tid is None:
+                    continue
+                content = msg.get("content")
+                if content is None:
+                    continue
+                observation_by_call_id[tid] = (
+                    content if isinstance(content, str) else _stringify(content)
+                )
+
+        # Pair each key with its numeric index so steps sort as numbers (a plain
+        # string sort puts "step_10" before "step_2"); skip non-string keys.
+        numbered_steps = []
+        for key in inference_log:
+            if not (isinstance(key, str) and key.startswith("step_")):
+                continue
+            try:
+                numbered_steps.append((int(key[len("step_"):]), key))
+            except ValueError:
+                continue
+
+        for step_idx, key in sorted(numbered_steps):
+            round_num = step_idx + 1
+
+            step_data = inference_log[key]
+            if not isinstance(step_data, dict):
+                continue
+            output = step_data.get("inference_output") or {}
+            if not isinstance(output, dict):
+                continue
+            tool_calls = output.get("tool_calls")
+            label = output.get("current_action_name_label")
+            if not tool_calls or label != "correct":
+                continue
+
+            answer_data = step_data.get("inference_answer")
+            candidate = None
+            if isinstance(answer_data, dict):
+                candidate = answer_data.get("candidate_0_answer_function_list")
+            candidate_observation = None
+            if isinstance(candidate, dict):
+                candidate_observation = candidate.get("observation")
+
+            parent_ctx = step_ctx_by_round.get(round_num)
+
+            for tc in tool_calls:
+                if not isinstance(tc, dict):
+                    continue
+                func = tc.get("function") or {}
+                if not isinstance(func, dict):
+                    func = {}
+                tool_name = func.get("name") or "unknown"
+                tool_id = tc.get("id")
+                tool_args_raw = func.get("arguments", "")
+                tool_args_str = (
+                    tool_args_raw
+                    if isinstance(tool_args_raw, str)
+                    else _stringify(tool_args_raw)
+                )
+
+                observation_str: Optional[str] = None
+                if tool_id is not None and tool_id in observation_by_call_id:
+                    observation_str = observation_by_call_id[tool_id]
+                elif candidate_observation is not None:
+                    observation_str = (
+                        candidate_observation
+                        if isinstance(candidate_observation, str)
+                        else _stringify(candidate_observation)
+                    )
+
+                description = tool_desc_map.get(tool_name)
+
+                invocation = ExecuteToolInvocation(
+                    tool_name=tool_name,
+                    tool_call_id=tool_id,
+                    tool_call_arguments=tool_args_str,
+                    tool_call_result=observation_str,
+                    tool_type="function",
+                    tool_description=description,
+                    attributes={
+                        "wildtool.tool.execution_mode": "ground_truth_replay",
+                    },
+                )
+
+                try:
+                    self._handler.start_execute_tool(
+                        invocation, context=parent_ctx
+                    )
+                except Exception as e:  # noqa: BLE001
+                    logger.debug("Failed to start_execute_tool: %s", e)
+                    continue
+
+                # M3: explicitly write tool_call sensitive attrs. The
+                # util-genai `_get_tool_call_data_attributes` helper guards
+                # these behind experimental-mode + content-capture-mode env
+                # vars which are not always set in real deployments.
+                tool_span = invocation.span
+                if tool_span is not None and tool_span.is_recording():
+                    try:
+                        tool_span.set_attribute(
+                            "gen_ai.tool.call.arguments", tool_args_str
+                        )
+                        if observation_str is not None:
+                            tool_span.set_attribute(
+                                "gen_ai.tool.call.result", observation_str
+                            )
+                        if description:
+                            tool_span.set_attribute(
+                                "gen_ai.tool.description", description
+                            )
+                    except Exception as e:  # noqa: BLE001
+                        logger.debug("Failed to set tool span attrs: %s", e)
+
+                try:
+                    self._handler.stop_execute_tool(invocation)
+                except Exception as e:  # noqa: BLE001
+                    logger.debug("Failed to stop_execute_tool: %s", e)
+
+
+class WildToolRequestWrapper:
+    """P4: Wraps BaseHandler._request_tool_call.
+
+    Creates STEP span (ReactStepInvocation) before each LLM call and
+    extracts latency from the return value. Also writes the H2
+    provider-name fallback attributes (``gen_ai.system`` +
+    ``gen_ai.provider.name``) on the STEP span. If the wrapped call
+    raises, the step is marked failed and the original exception is
+    re-raised even when the telemetry handler itself fails.
+    """
+
+    def __init__(self, handler: ExtendedTelemetryHandler):
+        self._handler = handler
+
+    def __call__(self, wrapped, instance, args, kwargs):
+        if not _in_chain.get():
+            return wrapped(*args, **kwargs)
+
+        # Close the previous step (the natural end-of-step is when the next
+        # request fires). The STEP span's SpanContext stays valid as a
+        # parent for TOOL spans created later.
+        _close_active_step(self._handler)
+
+        step_num = _step_counter.get() + 1
+        _step_counter.set(step_num)
+
+        step_inv = ReactStepInvocation(round=step_num)
+        try:
+            self._handler.start_react_step(step_inv)
+        except Exception as e:  # noqa: BLE001
+            logger.debug("Failed to start react step: %s", e)
+            return wrapped(*args, **kwargs)
+
+        # H2: provider-name fallback attributes. Written on the STEP, not on
+        # the LLM span, because the LLM span is owned by the OpenAI v2 provider
+        # instrumentation and is created lazily inside the wtb request.
+        span = step_inv.span
+        if span is not None and span.is_recording():
+            try:
+                for attr_name in ("gen_ai.system", "gen_ai.provider.name"):
+                    span.set_attribute(attr_name, _PROVIDER_FALLBACK_NAME)
+            except Exception as e:  # noqa: BLE001
+                logger.debug("Failed to set provider fallback attrs: %s", e)
+
+        # Track this step for H1 TOOL re-parenting.
+        chain_steps = _chain_step_invocations.get()
+        if chain_steps is not None:
+            chain_steps.append(step_inv)
+        _step_invocation.set(step_inv)
+
+        try:
+            result = wrapped(*args, **kwargs)
+        except Exception as e:
+            step_inv.finish_reason = "error"
+            try:
+                self._handler.fail_react_step(
+                    step_inv, Error(message=str(e), type=type(e))
+                )
+            except Exception as fail_err:  # noqa: BLE001
+                # A telemetry failure must never mask the wrapped call's error.
+                logger.debug("Failed to fail react step: %s", fail_err)
+            _step_invocation.set(None)
+            raise
+
+        if isinstance(result, tuple) and len(result) == 2:
+            try:
+                if step_inv.span is not None and step_inv.span.is_recording():
+                    step_inv.span.set_attribute(
+                        "wildtool.latency", float(result[1])
+                    )
+            except Exception as e:  # noqa: BLE001
+                logger.debug("Failed to set wildtool.latency: %s", e)
+        return result
+
+
+class WildToolParseWrapper:
+    """P5: Wraps BaseHandler._parse_api_response.
+
+    Copies token counts from a dict result onto the current STEP span;
+    telemetry errors are logged and never affect the returned value.
+    """
+
+    def __init__(self, handler: ExtendedTelemetryHandler):
+        self._handler = handler
+
+    def __call__(self, wrapped, instance, args, kwargs):
+        result = wrapped(*args, **kwargs)
+        if not isinstance(result, dict):
+            return result
+
+        try:
+            step_inv = _step_invocation.get()
+            span = step_inv.span if step_inv else None
+            if span is not None and span.is_recording():
+                for attr_name, result_key in (
+                    ("gen_ai.usage.input_tokens", "input_token"),
+                    ("gen_ai.usage.output_tokens", "output_token"),
+                ):
+                    if result.get(result_key) is not None:
+                        span.set_attribute(attr_name, result[result_key])
+        except Exception as e:  # noqa: BLE001
+            logger.debug("Failed to set step token usage: %s", e)
+        return result
diff --git a/instrumentation-loongsuite/loongsuite-instrumentation-wildtool/src/opentelemetry/instrumentation/wildtool/package.py b/instrumentation-loongsuite/loongsuite-instrumentation-wildtool/src/opentelemetry/instrumentation/wildtool/package.py
new file mode 100644
index 000000000..1ac5bcfee
--- /dev/null
+++ b/instrumentation-loongsuite/loongsuite-instrumentation-wildtool/src/opentelemetry/instrumentation/wildtool/package.py
@@ -0,0 +1,2 @@
+_instruments = ("openai >= 1.0.0",)  # requirement spec; presumably what the dependency check reads — confirm
+_supports_metrics = False  # NOTE(review): presumably consumed by instrumentation tooling — confirm
diff --git a/instrumentation-loongsuite/loongsuite-instrumentation-wildtool/src/opentelemetry/instrumentation/wildtool/utils.py b/instrumentation-loongsuite/loongsuite-instrumentation-wildtool/src/opentelemetry/instrumentation/wildtool/utils.py
new file mode 100644
index 000000000..c26b7711d
--- /dev/null
+++ b/instrumentation-loongsuite/loongsuite-instrumentation-wildtool/src/opentelemetry/instrumentation/wildtool/utils.py
@@ -0,0 +1,17 @@
+"""Utility functions for WildToolBench instrumentation."""
+
+import json
+from typing import Any, Optional
+
+
+def safe_json_dumps(obj: Any, max_length: int = 10000) -> Optional[str]:
+    """Return obj as JSON text (``str(obj)`` if not serializable), length-capped."""
+    if obj is None:
+        return None
+    try:
+        text = json.dumps(obj, ensure_ascii=False)
+    except (TypeError, ValueError):
+        text = str(obj)  # not JSON-serializable: fall back to its str() form
+    if len(text) > max_length:
+        return text[:max_length] + "...(truncated)"  # same marker on both paths
+    return text
diff --git a/instrumentation-loongsuite/loongsuite-instrumentation-wildtool/src/opentelemetry/instrumentation/wildtool/version.py b/instrumentation-loongsuite/loongsuite-instrumentation-wildtool/src/opentelemetry/instrumentation/wildtool/version.py
new file mode 100644
index 000000000..3dc1f76bc
--- /dev/null
+++ b/instrumentation-loongsuite/loongsuite-instrumentation-wildtool/src/opentelemetry/instrumentation/wildtool/version.py
@@ -0,0 +1 @@
+__version__ = "0.1.0"  # presumably the single source of the package version (hatch dynamic version) — confirm
diff --git a/instrumentation-loongsuite/loongsuite-instrumentation-wildtool/tests/__init__.py b/instrumentation-loongsuite/loongsuite-instrumentation-wildtool/tests/__init__.py
new file mode 100644
index 000000000..e69de29bb
diff --git a/instrumentation-loongsuite/loongsuite-instrumentation-wildtool/tests/conftest.py b/instrumentation-loongsuite/loongsuite-instrumentation-wildtool/tests/conftest.py
new file mode 100644
index 000000000..014186185
--- /dev/null
+++ b/instrumentation-loongsuite/loongsuite-instrumentation-wildtool/tests/conftest.py
@@ -0,0 +1,182 @@
+"""Test configuration for WildToolBench instrumentation tests."""
+
+import json
+import os
+
+import pytest
+
+os.environ.setdefault("OPENAI_API_KEY", "test_key_not_real")  # placeholder, applied only if unset
+os.environ.setdefault("OPENAI_BASE_URL", "http://localhost:9999/v1")  # local URL, applied only if unset
+
+from opentelemetry.instrumentation.wildtool import WildToolInstrumentor
+from opentelemetry.sdk.trace import TracerProvider
+from opentelemetry.sdk.trace.export import SimpleSpanProcessor
+from opentelemetry.sdk.trace.export.in_memory_span_exporter import (
+    InMemorySpanExporter,
+)
+
+
+def pytest_configure(config: pytest.Config):  # opts the run into the experimental GenAI semconv
+    os.environ.update(OTEL_SEMCONV_STABILITY_OPT_IN="gen_ai_latest_experimental")
+
+
+@pytest.fixture(scope="function", name="span_exporter")
+def fixture_span_exporter():
+    """Provide a new InMemorySpanExporter for each test function."""
+    yield InMemorySpanExporter()
+
+
+@pytest.fixture(scope="function", name="tracer_provider")
+def fixture_tracer_provider(span_exporter):
+    sdk_provider = TracerProvider()  # one provider per test function
+    sdk_provider.add_span_processor(SimpleSpanProcessor(span_exporter))  # feeds span_exporter
+    return sdk_provider
+
+
+@pytest.fixture(scope="function")
+def instrument(tracer_provider):
+    """Activate WildToolInstrumentor for one test and uninstrument afterwards."""
+    wildtool_instrumentor = WildToolInstrumentor()
+    wildtool_instrumentor.instrument(
+        tracer_provider=tracer_provider, skip_dep_check=True
+    )
+    yield wildtool_instrumentor
+    wildtool_instrumentor.uninstrument()
+
+
+# ==================== Minimal test data fixtures ====================
+
+
+def _make_chat_completion_response(
+    content=None,
+    tool_calls=None,
+    input_tokens=10,
+    output_tokens=5,
+    model="gpt-4o",
+):
+    """Return a JSON-serializable dict shaped like a ChatCompletion payload."""
+    # Falsy content (None or "") is normalised to an empty string.
+    reply = {"role": "assistant", "content": content if content else ""}
+    # tool_calls is attached to the message only when it is non-empty.
+    if tool_calls:
+        reply["tool_calls"] = tool_calls
+    token_usage = {
+        "prompt_tokens": input_tokens,
+        "completion_tokens": output_tokens,
+        "total_tokens": input_tokens + output_tokens,
+    }
+    choice = {"index": 0, "message": reply, "finish_reason": "stop"}
+    payload = {"id": "chatcmpl-test", "object": "chat.completion"}
+    payload.update(model=model, choices=[choice], usage=token_usage)
+    return payload
+
+
+class FakeChatCompletion:
+    """ChatCompletion stand-in: ``json()`` plus attribute access to payload keys."""
+
+    def __init__(self, data: dict):
+        self._data = data
+
+    def json(self):
+        return json.dumps(self._data)
+
+    def __getattr__(self, name):  # absent keys raise AttributeError; safe if _data is unset
+        return self._data[name] if name in vars(self).get("_data", ()) else super().__getattribute__(name)
+
+
+@pytest.fixture()
+def make_completion():
+    """Return a builder that wraps a generated payload in FakeChatCompletion."""
+
+    def _build(**overrides):
+        return FakeChatCompletion(_make_chat_completion_response(**overrides))
+
+    return _build
+
+
+@pytest.fixture()
+def simple_test_entry():
+    """One-task WildToolBench entry: a get_weather call, then prepare_to_answer."""
+    # The only tool offered to the model.
+    weather_tool = {
+        "type": "function",
+        "function": {
+            "name": "get_weather",
+            "description": "Get weather for a city",
+            "parameters": {
+                "type": "object",
+                "properties": {
+                    "city": {"type": "string"},
+                },
+                "required": ["city"],
+            },
+        },
+    }
+    # First expected answer step: the tool call and its observation.
+    call_weather = {
+        "action": {
+            "name": "get_weather",
+            "arguments": {"city": "Beijing"},
+        },
+        "observation": "Sunny, 25°C",
+        "dependency_list": [],
+    }
+    # Second expected answer step: the final answer, depending on step 0.
+    give_answer = {
+        "action": {
+            "name": "prepare_to_answer",
+            "arguments": {},
+        },
+        "observation": "The weather in Beijing is Sunny, 25°C",
+        "dependency_list": [0],
+    }
+    # A new dict is assembled on every fixture call.
+    return {
+        "id": "wild_tool_bench_test_001",
+        "english_env_info": "2025-01-01",
+        "english_tools": [weather_tool],
+        "english_tasks": ["What is the weather in Beijing?"],
+        "english_answer_list": [[call_weather, give_answer]],
+    }
+
+
+@pytest.fixture()
+def tool_call_response_factory():
+    """Return a builder for responses carrying a single function tool call.
+
+    Dict ``arguments`` are JSON-encoded; any other value is passed through.
+    """
+
+    def _build(tool_name, arguments, tool_call_id="call_001"):
+        if isinstance(arguments, dict):
+            encoded_args = json.dumps(arguments)
+        else:
+            encoded_args = arguments
+        single_call = {
+            "id": tool_call_id,
+            "type": "function",
+            "function": {
+                "name": tool_name,
+                "arguments": encoded_args,
+            },
+        }
+        payload = _make_chat_completion_response(tool_calls=[single_call])
+        return FakeChatCompletion(payload)
+
+    return _build
+
+
+@pytest.fixture()
+def text_response_factory():
+    """Return a builder for responses that carry text and no tool calls."""
+
+    def _build(content, input_tokens=10, output_tokens=5):
+        # The token counts end up in the payload's ``usage`` block.
+        payload = _make_chat_completion_response(
+            content=content,
+            input_tokens=input_tokens,
+            output_tokens=output_tokens,
+        )
+        return FakeChatCompletion(payload)
+
+    return _build
diff --git a/instrumentation-loongsuite/loongsuite-instrumentation-wildtool/tests/test_agent_span.py b/instrumentation-loongsuite/loongsuite-instrumentation-wildtool/tests/test_agent_span.py
new file mode 100644
index 000000000..2711089fc
--- /dev/null
+++ b/instrumentation-loongsuite/loongsuite-instrumentation-wildtool/tests/test_agent_span.py
@@ -0,0 +1,136 @@
+"""Tests for AGENT span (P2: inference_multi_turn)."""
+
+import json
+
+from wtb.model_handler.base_handler import BaseHandler
+
+
+class _StubHandler(BaseHandler):
+    """BaseHandler stub that replays queued responses, one per request."""
+
+    def __init__(self):
+        super().__init__("test-model", 0.0)
+        self._step_responses = []  # filled in by each test before inference
+        self._step_idx = 0  # index of the next response to hand out
+
+    def _request_tool_call(self, inference_data):
+        queued = self._step_responses[self._step_idx]
+        self._step_idx += 1
+        return queued, 0.1  # (response, fixed latency)
+
+    def _parse_api_response(self, api_response):
+        """Reduce the response's JSON payload to content, tool calls and tokens."""
+        payload = json.loads(api_response.json())
+        message = payload["choices"][0]["message"]
+        return {
+            "reasoning_content": None,
+            "content": message.get("content"),
+            "tool_calls": message.get("tool_calls"),
+            "input_token": payload["usage"]["prompt_tokens"],
+            "output_token": payload["usage"]["completion_tokens"],
+        }
+
+
+class TestAgentSpan:
+    def test_agent_span_attributes(
+        self, span_exporter, instrument, simple_test_entry,
+        tool_call_response_factory, text_response_factory,
+    ):
+        """AGENT span should exist with correct attributes and token aggregation."""
+        handler = _StubHandler()
+
+        # Step 0: model returns tool call for get_weather
+        resp0 = tool_call_response_factory(
+            "get_weather", {"city": "Beijing"}, "call_001"
+        )
+        # Step 1: model returns text (prepare_to_answer match)
+        resp1 = text_response_factory(
+            "The weather in Beijing is Sunny, 25°C",
+            input_tokens=20, output_tokens=15,
+        )
+        handler._step_responses = [resp0, resp1]
+
+        result = handler.inference_multi_turn(simple_test_entry)
+        assert result is not None
+
+        spans = span_exporter.get_finished_spans()
+        agent_spans = [s for s in spans if "invoke_agent" in s.name]
+        assert len(agent_spans) == 1
+
+        span = agent_spans[0]
+        assert span.name == "invoke_agent _StubHandler"
+        attrs = dict(span.attributes or {})
+        assert attrs.get("gen_ai.span.kind") == "AGENT"
+        assert attrs.get("gen_ai.operation.name") == "invoke_agent"
+        assert attrs.get("gen_ai.framework") == "wildtool"
+        assert attrs.get("gen_ai.agent.name") == "_StubHandler"
+        assert attrs.get("gen_ai.conversation.id") == "wild_tool_bench_test_001"
+        assert attrs.get("gen_ai.request.model") == "test-model"
+        assert attrs.get("wildtool.turn_count") == 1
+        # Usage is summed over both steps: 10 + 20 input, 5 + 15 output.
+        assert attrs.get("gen_ai.usage.input_tokens") == 30
+        assert attrs.get("gen_ai.usage.output_tokens") == 20
+
+    def test_agent_span_captures_input_and_output_messages(
+        self, span_exporter, instrument, simple_test_entry,
+        tool_call_response_factory, text_response_factory,
+    ):
+        """AGENT span should always carry GenAI input/output messages."""
+        # Same two-step script as above: one tool call, then the final text.
+        handler = _StubHandler()
+        resp0 = tool_call_response_factory(
+            "get_weather", {"city": "Beijing"}, "call_001"
+        )
+        resp1 = text_response_factory("The weather in Beijing is Sunny, 25°C")
+        handler._step_responses = [resp0, resp1]
+
+        handler.inference_multi_turn(simple_test_entry)
+
+        spans = span_exporter.get_finished_spans()
+        agent_span = [s for s in spans if "invoke_agent" in s.name][0]
+        attrs = dict(agent_span.attributes or {})
+        input_messages = json.loads(attrs["gen_ai.input.messages"])
+        output_messages = json.loads(attrs["gen_ai.output.messages"])
+
+        assert input_messages[0]["role"] == "user"
+        assert (
+            input_messages[0]["parts"][0]["content"]
+            == "What is the weather in Beijing?"
+        )
+        assert output_messages[0]["role"] == "assistant"
+        assert (
+            output_messages[0]["parts"][0]["content"]
+            == "The weather in Beijing is Sunny, 25°C"
+        )
+
+    def test_agent_parent_is_entry(
+        self, span_exporter, instrument, simple_test_entry,
+        tool_call_response_factory, text_response_factory,
+    ):
+        """When called via multi_threaded_inference, AGENT span should be child of ENTRY."""
+        from wtb._llm_response_generation import multi_threaded_inference  # noqa: I001, PLC0415
+
+        handler = _StubHandler()
+        resp0 = tool_call_response_factory(
+            "get_weather", {"city": "Beijing"}, "call_001"
+        )
+        resp1 = text_response_factory("The weather in Beijing is Sunny, 25°C")
+        handler._step_responses = [resp0, resp1]
+
+        test_case = simple_test_entry.copy()
+        multi_threaded_inference(handler, "test-model", test_case)
+
+        spans = span_exporter.get_finished_spans()
+        entry_spans = [
+            s for s in spans if s.name == "enter_ai_application_system"
+        ]
+        agent_spans = [s for s in spans if "invoke_agent" in s.name]
+
+        assert len(entry_spans) == 1
+        assert len(agent_spans) == 1
+
+        entry = entry_spans[0]
+        agent = agent_spans[0]
+        assert agent.context.trace_id == entry.context.trace_id
+        assert agent.parent is not None
+        assert agent.parent.span_id == entry.context.span_id
diff --git a/instrumentation-loongsuite/loongsuite-instrumentation-wildtool/tests/test_chain_step_tool_spans.py b/instrumentation-loongsuite/loongsuite-instrumentation-wildtool/tests/test_chain_step_tool_spans.py
new file mode 100644
index 000000000..d7dd7b4aa
--- /dev/null
+++ b/instrumentation-loongsuite/loongsuite-instrumentation-wildtool/tests/test_chain_step_tool_spans.py
@@ -0,0 +1,283 @@
+"""Tests for CHAIN / STEP / TOOL spans (P3, P4, P5)."""
+
+import json
+
+import pytest
+from opentelemetry.trace import StatusCode
+
+from wtb.model_handler.base_handler import BaseHandler
+
+
+class _StubHandler(BaseHandler):
+    """BaseHandler stub whose replies are scripted by the test, in order."""
+
+    def __init__(self):
+        super().__init__("test-model", 0.0)
+        self._step_responses = []  # assigned by each test before inference
+        self._step_idx = 0  # index of the next scripted reply
+
+    def _request_tool_call(self, inference_data):
+        scripted = self._step_responses[self._step_idx]
+        self._step_idx += 1
+        return scripted, 0.05  # (response, fixed latency)
+
+    def _parse_api_response(self, api_response):
+        """Reduce the response's JSON payload to content, tool calls and tokens."""
+        payload = json.loads(api_response.json())
+        message = payload["choices"][0]["message"]
+        return {
+            "reasoning_content": None,
+            "content": message.get("content"),
+            "tool_calls": message.get("tool_calls"),
+            "input_token": payload["usage"]["prompt_tokens"],
+            "output_token": payload["usage"]["completion_tokens"],
+        }
+
+
+class TestChainSpan:
+    def test_chain_span_per_task(
+        self, span_exporter, instrument, simple_test_entry,
+        tool_call_response_factory, text_response_factory,
+    ):
+        """The single task should yield exactly one CHAIN span, "workflow task_0"."""
+        handler = _StubHandler()
+        resp0 = tool_call_response_factory(
+            "get_weather", {"city": "Beijing"}, "call_001"
+        )
+        resp1 = text_response_factory("The weather in Beijing is Sunny, 25°C")
+        handler._step_responses = [resp0, resp1]  # tool call, then final text
+
+        handler.inference_multi_turn(simple_test_entry)
+
+        spans = span_exporter.get_finished_spans()
+        chain_spans = [s for s in spans if s.name.startswith("workflow")]
+        assert len(chain_spans) == 1
+
+        chain = chain_spans[0]
+        assert chain.name == "workflow task_0"
+        attrs = dict(chain.attributes or {})
+        assert attrs.get("gen_ai.span.kind") == "CHAIN"
+        assert attrs.get("gen_ai.operation.name") == "workflow"
+        assert attrs.get("gen_ai.framework") == "wildtool"
+        assert attrs.get("wildtool.task_idx") == 0
+        assert attrs.get("wildtool.test_entry_id") == "wild_tool_bench_test_001"
+        assert attrs.get("wildtool.action_name_label") == "correct"
+        assert attrs.get("wildtool.is_optimal") is True
+
+    def test_chain_parent_is_agent(
+        self, span_exporter, instrument, simple_test_entry,
+        tool_call_response_factory, text_response_factory,
+    ):
+        """CHAIN span should share the AGENT span's trace and name it as parent."""
+        handler = _StubHandler()
+        resp0 = tool_call_response_factory(
+            "get_weather", {"city": "Beijing"}, "call_001"
+        )
+        resp1 = text_response_factory("The weather in Beijing is Sunny, 25°C")
+        handler._step_responses = [resp0, resp1]  # tool call, then final text
+
+        handler.inference_multi_turn(simple_test_entry)
+
+        spans = span_exporter.get_finished_spans()
+        agent_spans = [s for s in spans if "invoke_agent" in s.name]
+        chain_spans = [s for s in spans if s.name.startswith("workflow")]
+
+        assert len(agent_spans) == 1
+        assert len(chain_spans) == 1
+
+        agent = agent_spans[0]
+        chain = chain_spans[0]
+        assert chain.context.trace_id == agent.context.trace_id
+        assert chain.parent is not None
+        assert chain.parent.span_id == agent.context.span_id
+
+
+class TestStepSpans:
+    def test_step_spans_per_chain(
+        self, span_exporter, instrument, simple_test_entry,
+        tool_call_response_factory, text_response_factory,
+    ):
+        """Two scripted requests should yield two STEP spans, rounds 1 and 2."""
+        handler = _StubHandler()
+        resp0 = tool_call_response_factory(
+            "get_weather", {"city": "Beijing"}, "call_001"
+        )
+        resp1 = text_response_factory("The weather in Beijing is Sunny, 25°C")
+        handler._step_responses = [resp0, resp1]  # tool call, then final text
+
+        handler.inference_multi_turn(simple_test_entry)
+
+        spans = span_exporter.get_finished_spans()
+        step_spans = [s for s in spans if s.name == "react step"]
+        assert len(step_spans) == 2
+
+        attrs0 = dict(step_spans[0].attributes or {})
+        attrs1 = dict(step_spans[1].attributes or {})
+        rounds = sorted([attrs0.get("gen_ai.react.round"), attrs1.get("gen_ai.react.round")])
+        assert rounds == [1, 2]  # sorted, so export order does not matter
+
+        for ss in step_spans:
+            a = dict(ss.attributes or {})
+            assert a.get("gen_ai.span.kind") == "STEP"
+            assert a.get("gen_ai.operation.name") == "react"
+
+    def test_step_parent_is_chain(
+        self, span_exporter, instrument, simple_test_entry,
+        tool_call_response_factory, text_response_factory,
+    ):
+        """Every STEP span should be a direct child of the single CHAIN span."""
+        handler = _StubHandler()
+        resp0 = tool_call_response_factory(
+            "get_weather", {"city": "Beijing"}, "call_001"
+        )
+        resp1 = text_response_factory("The weather in Beijing is Sunny, 25°C")
+        handler._step_responses = [resp0, resp1]  # tool call, then final text
+
+        handler.inference_multi_turn(simple_test_entry)
+
+        spans = span_exporter.get_finished_spans()
+        chain_spans = [s for s in spans if s.name.startswith("workflow")]
+        step_spans = [s for s in spans if s.name == "react step"]
+
+        assert len(chain_spans) == 1
+        chain = chain_spans[0]
+
+        for ss in step_spans:
+            assert ss.context.trace_id == chain.context.trace_id
+            assert ss.parent is not None
+            assert ss.parent.span_id == chain.context.span_id
+
+    def test_step_token_attributes(
+        self, span_exporter, instrument, simple_test_entry,
+        tool_call_response_factory, text_response_factory,
+    ):
+        """Each STEP span should carry the token usage of its own response."""
+        handler = _StubHandler()
+        resp0 = tool_call_response_factory(
+            "get_weather", {"city": "Beijing"}, "call_001"
+        )
+        resp1 = text_response_factory(
+            "The weather in Beijing is Sunny, 25°C",
+            input_tokens=25, output_tokens=12,
+        )
+        handler._step_responses = [resp0, resp1]
+
+        handler.inference_multi_turn(simple_test_entry)
+
+        spans = span_exporter.get_finished_spans()
+        step_spans = sorted(
+            [s for s in spans if s.name == "react step"],
+            key=lambda s: s.attributes.get("gen_ai.react.round", 0),
+        )
+        assert len(step_spans) == 2
+
+        # First step: tool_call_response_factory passes no token counts, so the
+        # _make_chat_completion_response defaults (10 input, 5 output) apply.
+        a0 = dict(step_spans[0].attributes or {})
+        assert a0.get("gen_ai.usage.input_tokens") == 10
+        assert a0.get("gen_ai.usage.output_tokens") == 5
+
+        # Second step: 25 input, 12 output
+        a1 = dict(step_spans[1].attributes or {})
+        assert a1.get("gen_ai.usage.input_tokens") == 25
+        assert a1.get("gen_ai.usage.output_tokens") == 12
+
+
+class TestToolSpans:
+    def test_tool_span_attributes(
+        self, span_exporter, instrument, simple_test_entry,
+        tool_call_response_factory, text_response_factory,
+    ):
+        """The one tool call should yield one TOOL span with replay execution_mode."""
+        handler = _StubHandler()
+        resp0 = tool_call_response_factory(
+            "get_weather", {"city": "Beijing"}, "call_001"
+        )
+        resp1 = text_response_factory("The weather in Beijing is Sunny, 25°C")
+        handler._step_responses = [resp0, resp1]  # tool call, then final text
+
+        handler.inference_multi_turn(simple_test_entry)
+
+        spans = span_exporter.get_finished_spans()
+        tool_spans = [s for s in spans if "execute_tool" in s.name]
+        assert len(tool_spans) == 1
+
+        tool = tool_spans[0]
+        assert tool.name == "execute_tool get_weather"
+        attrs = dict(tool.attributes or {})
+        assert attrs.get("gen_ai.span.kind") == "TOOL"
+        assert attrs.get("gen_ai.operation.name") == "execute_tool"
+        assert attrs.get("gen_ai.tool.name") == "get_weather"
+        assert attrs.get("gen_ai.tool.type") == "function"
+        assert (
+            attrs.get("wildtool.tool.execution_mode") == "ground_truth_replay"
+        )
+
+    def test_tool_span_parent_is_chain(
+        self, span_exporter, instrument, simple_test_entry,
+        tool_call_response_factory, text_response_factory,
+    ):
+        """TOOL spans share the CHAIN trace_id; the parent span itself is not asserted."""
+        handler = _StubHandler()
+        resp0 = tool_call_response_factory(
+            "get_weather", {"city": "Beijing"}, "call_001"
+        )
+        resp1 = text_response_factory("The weather in Beijing is Sunny, 25°C")
+        handler._step_responses = [resp0, resp1]  # tool call, then final text
+
+        handler.inference_multi_turn(simple_test_entry)
+
+        spans = span_exporter.get_finished_spans()
+        chain_spans = [s for s in spans if s.name.startswith("workflow")]
+        tool_spans = [s for s in spans if "execute_tool" in s.name]
+
+        assert len(chain_spans) == 1
+        assert len(tool_spans) >= 1
+        # Despite the test name, only trace membership is checked below.
+        chain = chain_spans[0]
+        for ts in tool_spans:
+            assert ts.context.trace_id == chain.context.trace_id
+
+
+class TestSpanHierarchy:
+    def test_full_hierarchy(
+        self, span_exporter, instrument, simple_test_entry,
+        tool_call_response_factory, text_response_factory,
+    ):
+        """Verify ENTRY → AGENT → CHAIN → STEP hierarchy and consistent trace_id."""
+        from wtb._llm_response_generation import multi_threaded_inference
+
+        handler = _StubHandler()
+        resp0 = tool_call_response_factory(
+            "get_weather", {"city": "Beijing"}, "call_001"
+        )
+        resp1 = text_response_factory("The weather in Beijing is Sunny, 25°C")
+        handler._step_responses = [resp0, resp1]  # tool call, then final text
+
+        test_case = simple_test_entry.copy()
+        multi_threaded_inference(handler, "test-model", test_case)
+
+        spans = span_exporter.get_finished_spans()
+        # Bucket the finished spans by kind, using their span names.
+        entry = [s for s in spans if s.name == "enter_ai_application_system"]
+        agent = [s for s in spans if "invoke_agent" in s.name]
+        chain = [s for s in spans if s.name.startswith("workflow")]
+        step = [s for s in spans if s.name == "react step"]
+        tool = [s for s in spans if "execute_tool" in s.name]
+
+        assert len(entry) == 1
+        assert len(agent) == 1
+        assert len(chain) == 1
+        assert len(step) == 2
+        assert len(tool) >= 1  # TOOL spans are only counted; their parent is not checked
+
+        trace_id = entry[0].context.trace_id
+        for s in spans:
+            assert s.context.trace_id == trace_id
+
+        # AGENT parent = ENTRY
+        assert agent[0].parent.span_id == entry[0].context.span_id
+        # CHAIN parent = AGENT
+        assert chain[0].parent.span_id == agent[0].context.span_id
+        # STEP parent = CHAIN
+        for s in step:
+            assert s.parent.span_id == chain[0].context.span_id
diff --git a/instrumentation-loongsuite/loongsuite-instrumentation-wildtool/tests/test_entry_span.py b/instrumentation-loongsuite/loongsuite-instrumentation-wildtool/tests/test_entry_span.py
new file mode 100644
index 000000000..2a1e864b5
--- /dev/null
+++ b/instrumentation-loongsuite/loongsuite-instrumentation-wildtool/tests/test_entry_span.py
@@ -0,0 +1,168 @@
+"""Tests for ENTRY span (P1: multi_threaded_inference).
+
+Module-level imports of ``wtb._llm_response_generation.multi_threaded_inference``
+must be avoided: ``wrapt.wrap_function_wrapper`` patches the attribute on the
+module, but a pre-imported local binding still references the original
+unwrapped function. All tests therefore import the symbol lazily after the
+``instrument`` fixture has run.
+"""
+
+import json
+
+import pytest
+from wtb.model_handler.base_handler import BaseHandler
+
+from opentelemetry.trace import StatusCode
+
+
+class _StubHandler(BaseHandler):
+    """Bare-bones ``BaseHandler`` double used by the ENTRY-span tests.
+
+    ``inference`` is replaced with a deterministic, side-effect-free body that
+    returns a canned one-element result list; the two request/parse hooks
+    raise ``NotImplementedError`` so they cannot be exercised by accident.
+    """
+
+    def __init__(self):
+        super(_StubHandler, self).__init__("test-model", 0.0)
+
+    def _request_tool_call(self, inference_data):
+        raise NotImplementedError()
+
+    def _parse_api_response(self, api_response):
+        raise NotImplementedError()
+
+    def inference(self, test_entry):
+        canned_result = dict(
+            action_name_label="correct",
+            is_optimal=True,
+            inference_log={},
+            latency=[0.1],
+            input_token_count=[10],
+            output_token_count=[5],
+        )
+        # The caller receives a one-element list holding this dict.
+        return [canned_result]
+
+
+class TestEntrySpan:
+    def test_entry_span_created(self, span_exporter, instrument):
+        """A successful call yields exactly one ENTRY span with the expected attributes."""
+        from wtb._llm_response_generation import multi_threaded_inference  # noqa: I001, PLC0415
+
+        stub = _StubHandler()
+        case = {
+            "id": "wild_tool_bench_test_001",
+            "english_tasks": ["task1", "task2"],
+        }
+
+        outcome = multi_threaded_inference(stub, "gpt-4o", case)
+
+        assert outcome is not None
+        assert outcome["id"] == "wild_tool_bench_test_001"
+
+        finished = span_exporter.get_finished_spans()
+        matches = [
+            sp for sp in finished if sp.name == "enter_ai_application_system"
+        ]
+        assert len(matches) == 1
+
+        entry = matches[0]
+        meta = dict(entry.attributes or {})
+        assert meta.get("gen_ai.span.kind") == "ENTRY"
+        assert meta.get("gen_ai.operation.name") == "enter"
+        assert meta.get("gen_ai.framework") == "wildtool"
+        assert meta.get("gen_ai.session.id") == "wild_tool_bench_test_001"
+        assert meta.get("gen_ai.request.model") == "gpt-4o"
+        assert meta.get("wildtool.turn_count") == 2
+        # A successful ENTRY span is expected to keep the default (UNSET)
+        # status and only failures set ERROR, so just rule ERROR out here.
+        assert entry.status.status_code != StatusCode.ERROR
+
+    def test_entry_span_captures_input_and_output_messages(
+        self, span_exporter, instrument,
+    ):
+        """The ENTRY span records the task as input and the reply as output."""
+
+        from opentelemetry.instrumentation.wildtool._wrappers import (  # noqa: PLC0415
+            WildToolEntryWrapper,
+        )
+
+        entry_wrapper = WildToolEntryWrapper(instrument._handler)
+        case = {
+            "id": "wild_tool_bench_test_messages",
+            "english_tasks": ["Search for the capital of France"],
+        }
+
+        def _success(handler, model_name, test_case):
+            return [
+                {
+                    "action_name_label": "correct",
+                    "is_optimal": True,
+                    "inference_log": {
+                        "step_0": {
+                            "inference_output": {
+                                "content": "Paris is the capital of France."
+                            }
+                        }
+                    },
+                }
+            ]
+
+        entry_wrapper(_success, None, (_StubHandler(), "gpt-4o", case), {})
+
+        finished = span_exporter.get_finished_spans()
+        entry = [
+            sp for sp in finished if sp.name == "enter_ai_application_system"
+        ][0]
+        meta = dict(entry.attributes or {})
+        sent = json.loads(meta["gen_ai.input.messages"])
+        received = json.loads(meta["gen_ai.output.messages"])
+
+        assert sent[0]["role"] == "user"
+        assert (
+            sent[0]["parts"][0]["content"]
+            == "Search for the capital of France"
+        )
+        assert received[0]["role"] == "assistant"
+        assert (
+            received[0]["parts"][0]["content"]
+            == "Paris is the capital of France."
+        )
+
+    def test_entry_span_error_path(self, span_exporter, instrument):
+        """An exception escaping the wrapped callable marks the ENTRY span ERROR.
+
+        ``multi_threaded_inference`` itself swallows non-rate-limit errors
+        (that path is covered by
+        test_error_scenarios.test_entry_span_captures_retry_error_path), so
+        it cannot be used to reach the wrapper's failure branch. Instead the
+        test calls ``WildToolEntryWrapper`` directly with a callable that
+        always raises, which bypasses ``multi_threaded_inference``'s own
+        error handling.
+        """
+        from opentelemetry.instrumentation.wildtool._wrappers import (  # noqa: PLC0415
+            WildToolEntryWrapper,
+        )
+
+        entry_wrapper = WildToolEntryWrapper(instrument._handler)
+
+        def _raising(handler, model_name, test_case):
+            raise RuntimeError("API connection failed")
+
+        stub = _StubHandler()
+        case = {
+            "id": "wild_tool_bench_test_002",
+            "english_tasks": ["task1"],
+        }
+
+        with pytest.raises(RuntimeError, match="API connection failed"):
+            entry_wrapper(_raising, None, (stub, "gpt-4o", case), {})
+
+        finished = span_exporter.get_finished_spans()
+        matches = [
+            sp for sp in finished if sp.name == "enter_ai_application_system"
+        ]
+        assert len(matches) == 1
+        entry = matches[0]
+        assert entry.status.status_code == StatusCode.ERROR
diff --git a/instrumentation-loongsuite/loongsuite-instrumentation-wildtool/tests/test_error_scenarios.py b/instrumentation-loongsuite/loongsuite-instrumentation-wildtool/tests/test_error_scenarios.py
new file mode 100644
index 000000000..c14a3f40c
--- /dev/null
+++ b/instrumentation-loongsuite/loongsuite-instrumentation-wildtool/tests/test_error_scenarios.py
@@ -0,0 +1,135 @@
+"""Tests for error/edge-case scenarios."""
+
+import json
+
+import pytest
+from opentelemetry.trace import StatusCode
+
+from wtb.model_handler.base_handler import BaseHandler
+
+
+class _StubHandler(BaseHandler):
+    """``BaseHandler`` double that replays a scripted list of step responses."""
+
+    def __init__(self):
+        super().__init__("test-model", 0.0)
+        self._step_idx = 0
+        self._step_responses = []
+
+    def _request_tool_call(self, inference_data):
+        scripted = self._step_responses[self._step_idx]
+        self._step_idx += 1
+        if not isinstance(scripted, Exception):
+            return scripted, 0.05
+        raise scripted
+
+    def _parse_api_response(self, api_response):
+        payload = json.loads(api_response.json())
+        first_choice = payload["choices"][0]
+        msg = first_choice["message"]
+        return dict(
+            reasoning_content=None,
+            content=msg.get("content"),
+            tool_calls=msg.get("tool_calls"),
+            input_token=payload["usage"]["prompt_tokens"],
+            output_token=payload["usage"]["completion_tokens"],
+        )
+
+
+class TestErrorScenarios:
+    def test_action_name_mismatch(
+        self, span_exporter, instrument, simple_test_entry,
+        tool_call_response_factory,
+    ):
+        """A wrong tool call leaves the CHAIN span OK but labels the action "error"."""
+        stub = _StubHandler()
+        # Scripted reply invokes wrong_tool rather than get_weather.
+        wrong_call = tool_call_response_factory(
+            "wrong_tool", {"x": 1}, "call_bad"
+        )
+        stub._step_responses = [wrong_call]
+
+        stub.inference_multi_turn(simple_test_entry)
+
+        finished = span_exporter.get_finished_spans()
+        workflows = [sp for sp in finished if sp.name.startswith("workflow")]
+        assert len(workflows) == 1
+
+        workflow = workflows[0]
+        meta = dict(workflow.attributes or {})
+        assert meta.get("wildtool.action_name_label") == "error"
+        assert workflow.status.status_code == StatusCode.OK
+
+    def test_empty_response(
+        self, span_exporter, instrument, simple_test_entry,
+        make_completion,
+    ):
+        """A reply with empty content and no tool_calls still yields one labelled CHAIN span."""
+        from tests.conftest import FakeChatCompletion, _make_chat_completion_response
+
+        stub = _StubHandler()
+        blank_reply = FakeChatCompletion(
+            _make_chat_completion_response(content="", tool_calls=None)
+        )
+        stub._step_responses = [blank_reply]
+
+        stub.inference_multi_turn(simple_test_entry)
+
+        finished = span_exporter.get_finished_spans()
+        workflows = [sp for sp in finished if sp.name.startswith("workflow")]
+        assert len(workflows) == 1
+        meta = dict(workflows[0].attributes or {})
+        assert meta.get("wildtool.action_name_label") == "error"
+
+    def test_request_tool_call_exception_sets_error(
+        self, span_exporter, instrument, simple_test_entry,
+    ):
+        """A raising _request_tool_call propagates and marks STEP and CHAIN spans ERROR."""
+        stub = _StubHandler()
+        stub._step_responses = [RuntimeError("Connection timeout")]
+
+        with pytest.raises(RuntimeError, match="Connection timeout"):
+            stub.inference_multi_turn(simple_test_entry)
+
+        finished = span_exporter.get_finished_spans()
+        react_steps = [sp for sp in finished if sp.name == "react step"]
+        assert len(react_steps) == 1
+        assert react_steps[0].status.status_code == StatusCode.ERROR
+
+        workflows = [sp for sp in finished if sp.name.startswith("workflow")]
+        assert len(workflows) == 1
+        assert workflows[0].status.status_code == StatusCode.ERROR
+
+    def test_entry_span_captures_retry_error_path(
+        self, span_exporter, instrument,
+    ):
+        """A non-rate-limit error is returned as an error result, not re-raised.
+        The ENTRY span must still be emitted, and must not be marked ERROR."""
+        from wtb._llm_response_generation import multi_threaded_inference
+
+        stub = _StubHandler()
+
+        def failing_inference(test_entry):
+            raise ValueError("Invalid JSON from model")
+
+        stub.inference = failing_inference
+
+        case = {
+            "id": "wild_tool_bench_err_001",
+            "english_tasks": ["task1"],
+        }
+
+        # No pytest.raises here: the error must come back inside the result.
+        outcome = multi_threaded_inference(stub, "test-model", case)
+        assert "Error during inference" in outcome["result"]
+
+        finished = span_exporter.get_finished_spans()
+        matches = [
+            sp for sp in finished if sp.name == "enter_ai_application_system"
+        ]
+        assert len(matches) == 1
+        # Because multi_threaded_inference handles the exception internally,
+        # the ENTRY wrapper sees a normal return; the status is expected to
+        # stay at the default UNSET, so the only hard requirement is not-ERROR.
+        entry = matches[0]
+        assert entry.status.status_code != StatusCode.ERROR
diff --git a/instrumentation-loongsuite/loongsuite-instrumentation-wildtool/tests/test_instrumentor.py b/instrumentation-loongsuite/loongsuite-instrumentation-wildtool/tests/test_instrumentor.py
new file mode 100644
index 000000000..a8be5b4da
--- /dev/null
+++ b/instrumentation-loongsuite/loongsuite-instrumentation-wildtool/tests/test_instrumentor.py
@@ -0,0 +1,20 @@
+"""Tests for WildToolInstrumentor lifecycle."""
+
+from opentelemetry.instrumentation.wildtool import WildToolInstrumentor
+
+
+class TestWildToolInstrumentor:
+    def test_instrument_and_uninstrument(self, tracer_provider):
+        """instrument() sets the private ``_handler``; uninstrument() resets it to None."""
+        inst = WildToolInstrumentor()
+        # NOTE(review): skip_dep_check presumably bypasses the dependency check - confirm.
+        inst.instrument(tracer_provider=tracer_provider, skip_dep_check=True)
+        installed = inst._handler
+        assert installed is not None
+        inst.uninstrument()
+        assert inst._handler is None
+
+    def test_instrumentation_dependencies(self):
+        """The declared instrumentation dependencies are exactly ``openai >= 1.0.0``."""
+        declared = WildToolInstrumentor().instrumentation_dependencies()
+        assert declared == ("openai >= 1.0.0",)
diff --git a/instrumentation-loongsuite/loongsuite-instrumentation-wildtool/tests/test_round2_fixes.py b/instrumentation-loongsuite/loongsuite-instrumentation-wildtool/tests/test_round2_fixes.py
new file mode 100644
index 000000000..9f4f4d895
--- /dev/null
+++ b/instrumentation-loongsuite/loongsuite-instrumentation-wildtool/tests/test_round2_fixes.py
@@ -0,0 +1,441 @@
+"""Round 2 regression tests covering the H1 / H2 / M1 / M2 / M3 fixes.
+
+See ``llm-dev/execute.md`` § "修订记录 (Round 2 fix)" (i.e. "Revision log") and
+``example-deploy/validation/SUMMARY.md`` for the original validation gaps
+addressed by these tests.
+"""
+
+import json
+
+import pytest
+from opentelemetry.trace import StatusCode
+
+from wtb.model_handler.base_handler import BaseHandler
+
+
+class _StubHandler(BaseHandler):
+    """Network-free ``BaseHandler`` double that replays scripted LLM responses."""
+
+    def __init__(self):
+        super().__init__("test-model", 0.0)
+        self._step_idx = 0
+        self._step_responses = []
+
+    def _request_tool_call(self, inference_data):
+        scripted = self._step_responses[self._step_idx]
+        self._step_idx += 1
+        if not isinstance(scripted, Exception):
+            return scripted, 0.05
+        raise scripted
+
+    def _parse_api_response(self, api_response):
+        payload = json.loads(api_response.json())
+        first_choice = payload["choices"][0]
+        msg = first_choice["message"]
+        return dict(
+            reasoning_content=None,
+            content=msg.get("content"),
+            tool_calls=msg.get("tool_calls"),
+            input_token=payload["usage"]["prompt_tokens"],
+            output_token=payload["usage"]["completion_tokens"],
+        )
+
+
+def _spans_by_kind(spans, kind):  # spans whose "gen_ai.span.kind" attribute equals ``kind``
+    return list(filter(lambda sp: (sp.attributes or {}).get("gen_ai.span.kind") == kind, spans))
+
+
+def _spans_named(spans, name):  # spans whose ``name`` equals ``name`` exactly
+    return list(filter(lambda sp: sp.name == name, spans))
+
+
+def _step_for_round(spans, round_num):  # first "react step" span recorded for ``round_num``
+    for candidate in _spans_named(spans, "react step"):
+        if (candidate.attributes or {}).get("gen_ai.react.round") != round_num:
+            continue
+        return candidate
+    raise AssertionError(f"no STEP span found for round={round_num}")
+
+
+# ============================================================================
+# H1: TOOL span parent_span_id == STEP span_id (was CHAIN before fix)
+# ============================================================================
+
+
+class TestToolParentIsStep:
+    def test_single_tool_parent_is_step_round_one(
+        self, span_exporter, instrument, simple_test_entry,
+        tool_call_response_factory, text_response_factory,
+    ):
+        """With simple_test_entry, the lone TOOL span must hang off the
+        round=1 STEP span rather than off the CHAIN span."""
+        stub = _StubHandler()
+        tool_reply = tool_call_response_factory(
+            "get_weather", {"city": "Beijing"}, "call_001"
+        )
+        final_reply = text_response_factory("The weather in Beijing is Sunny, 25°C")
+        stub._step_responses = [tool_reply, final_reply]
+
+        stub.inference_multi_turn(simple_test_entry)
+
+        finished = span_exporter.get_finished_spans()
+        tools = _spans_by_kind(finished, "TOOL")
+        assert len(tools) == 1, [sp.name for sp in finished]
+
+        tool_span = tools[0]
+        first_step = _step_for_round(finished, 1)
+        workflow = _spans_by_kind(finished, "CHAIN")[0]
+
+        # Core H1 check: the parent must be the STEP span, not the CHAIN span.
+        assert tool_span.parent is not None
+        assert tool_span.parent.span_id == first_step.context.span_id, (
+            "TOOL parent should be STEP round=1, got "
+            f"{tool_span.parent.span_id} (STEP={first_step.context.span_id}, "
+            f"CHAIN={workflow.context.span_id})"
+        )
+        assert tool_span.parent.span_id != workflow.context.span_id
+
+        # The TOOL span must also share the STEP span's trace.
+        assert tool_span.context.trace_id == first_step.context.trace_id
+
+    def test_multi_step_each_tool_parented_to_correct_step(
+        self, span_exporter, instrument,
+        tool_call_response_factory, text_response_factory,
+    ):
+        """Two tool-calling rounds followed by a prepare_to_answer round.
+
+        Every TOOL span has to be a child of the STEP span for its own
+        round - never of the CHAIN span or of another round's STEP span.
+        """
+        stub = _StubHandler()
+        # Entry whose answer list is search -> lookup -> prepare_to_answer.
+        entry = {
+            "id": "wild_tool_bench_multi_001",
+            "english_env_info": "2025-01-01",
+            "english_tools": [
+                {
+                    "type": "function",
+                    "function": {
+                        "name": "search",
+                        "description": "Search items",
+                        "parameters": {
+                            "type": "object",
+                            "properties": {"q": {"type": "string"}},
+                            "required": ["q"],
+                        },
+                    },
+                },
+                {
+                    "type": "function",
+                    "function": {
+                        "name": "lookup",
+                        "description": "Look up details",
+                        "parameters": {
+                            "type": "object",
+                            "properties": {"id": {"type": "string"}},
+                            "required": ["id"],
+                        },
+                    },
+                },
+            ],
+            "english_tasks": ["Find and summarize item X"],
+            "english_answer_list": [
+                [
+                    {
+                        "action": {"name": "search", "arguments": {"q": "X"}},
+                        "observation": "found:item_42",
+                        "dependency_list": [],
+                    },
+                    {
+                        "action": {"name": "lookup", "arguments": {"id": "item_42"}},
+                        "observation": "details:hello",
+                        "dependency_list": [0],
+                    },
+                    {
+                        "action": {"name": "prepare_to_answer", "arguments": {}},
+                        "observation": "Item X is hello.",
+                        "dependency_list": [1],
+                    },
+                ]
+            ],
+        }
+
+        search_reply = tool_call_response_factory(
+            "search", {"q": "X"}, "call_search_1"
+        )
+        lookup_reply = tool_call_response_factory(
+            "lookup", {"id": "item_42"}, "call_lookup_1"
+        )
+        answer_reply = text_response_factory("Item X is hello.")
+        stub._step_responses = [search_reply, lookup_reply, answer_reply]
+
+        stub.inference_multi_turn(entry)
+
+        finished = span_exporter.get_finished_spans()
+        tools = sorted(
+            _spans_by_kind(finished, "TOOL"),
+            key=lambda sp: (sp.attributes or {}).get("gen_ai.tool.name") or "",
+        )
+        assert len(tools) == 2, [sp.name for sp in finished]
+
+        first_step = _step_for_round(finished, 1)
+        second_step = _step_for_round(finished, 2)
+        workflow = _spans_by_kind(finished, "CHAIN")[0]
+
+        lookup_span = next(
+            sp for sp in tools
+            if (sp.attributes or {}).get("gen_ai.tool.name") == "lookup"
+        )
+        search_span = next(
+            sp for sp in tools
+            if (sp.attributes or {}).get("gen_ai.tool.name") == "search"
+        )
+
+        # Expected parents: search under round 1, lookup under round 2.
+        assert search_span.parent.span_id == first_step.context.span_id
+        assert lookup_span.parent.span_id == second_step.context.span_id
+        # And no TOOL span may sit directly under the CHAIN span.
+        for sp in tools:
+            assert sp.parent.span_id != workflow.context.span_id
+            assert sp.context.trace_id == workflow.context.trace_id
+
+
+# ============================================================================
+# M1: CHAIN span carries input.value and output.value
+# ============================================================================
+
+
+class TestChainInputOutputValue:
+    def test_chain_input_value_and_output_value(
+        self, span_exporter, instrument, simple_test_entry,
+        tool_call_response_factory, text_response_factory,
+    ):
+        """The CHAIN span exposes ``input.value`` and a JSON-encoded ``output.value``."""
+        stub = _StubHandler()
+        stub._step_responses = [
+            tool_call_response_factory("get_weather", {"city": "Beijing"}, "call_001"),
+            text_response_factory("The weather in Beijing is Sunny, 25°C"),
+        ]
+
+        stub.inference_multi_turn(simple_test_entry)
+
+        finished = span_exporter.get_finished_spans()
+        workflows = _spans_by_kind(finished, "CHAIN")
+        assert len(workflows) == 1
+        meta = dict(workflows[0].attributes or {})
+
+        # input.value is expected to be the chain's last user message (per the
+        # original author, wtb's _pre_messages_processing appends the task).
+        assert "input.value" in meta, meta
+        assert meta["input.value"] == "What is the weather in Beijing?"
+
+        # output.value is JSON carrying action_name_label, task_idx and is_optimal.
+        assert "output.value" in meta, meta
+        summary = json.loads(meta["output.value"])
+        assert summary["action_name_label"] == "correct"
+        assert summary["task_idx"] == 0
+        assert summary["is_optimal"] is True
+
+    def test_chain_input_value_truncated_when_long(
+        self, span_exporter, instrument,
+        tool_call_response_factory, text_response_factory,
+    ):
+        """An over-long task is truncated before being stored as ``input.value``."""
+        stub = _StubHandler()
+        oversized_task = "x" * 5000
+        entry = {
+            "id": "wild_tool_bench_long_001",
+            "english_env_info": "2025-01-01",
+            "english_tools": [
+                {
+                    "type": "function",
+                    "function": {
+                        "name": "noop",
+                        "description": "noop",
+                        "parameters": {"type": "object", "properties": {}},
+                    },
+                }
+            ],
+            "english_tasks": [oversized_task],
+            "english_answer_list": [
+                [
+                    {
+                        "action": {"name": "prepare_to_answer", "arguments": {}},
+                        "observation": "ok",
+                        "dependency_list": [],
+                    }
+                ]
+            ],
+        }
+        stub._step_responses = [text_response_factory("ok")]
+
+        stub.inference_multi_turn(entry)
+
+        finished = span_exporter.get_finished_spans()
+        workflow = _spans_by_kind(finished, "CHAIN")[0]
+        meta = dict(workflow.attributes or {})
+        assert "input.value" in meta
+        # Upper bound: the cap (taken to be 4096 by default) plus the suffix length.
+        assert len(meta["input.value"]) <= 4096 + len("...(truncated)")
+        assert meta["input.value"].startswith("xxx")
+
+
+# ============================================================================
+# M2: STEP span carries gen_ai.react.finish_reason on error paths
+# ============================================================================
+
+
+class TestStepFinishReason:
+    def test_finish_reason_action_name_mismatch(
+        self, span_exporter, instrument, simple_test_entry,
+        tool_call_response_factory,
+    ):
+        """An unexpected tool name gives the STEP finish_reason=action_name_mismatch."""
+        stub = _StubHandler()
+        stub._step_responses = [
+            tool_call_response_factory("wrong_tool", {"x": 1}, "call_bad")
+        ]
+
+        stub.inference_multi_turn(simple_test_entry)
+
+        finished = span_exporter.get_finished_spans()
+        react_steps = _spans_named(finished, "react step")
+        assert len(react_steps) == 1
+        meta = dict(react_steps[0].attributes or {})
+        assert meta.get("gen_ai.react.finish_reason") == "action_name_mismatch"
+
+    def test_finish_reason_empty_response(
+        self, span_exporter, instrument, simple_test_entry, make_completion,
+    ):
+        """No content and no tool_calls gives the STEP finish_reason=empty_response."""
+        from tests.conftest import (
+            FakeChatCompletion,
+            _make_chat_completion_response,
+        )
+
+        stub = _StubHandler()
+        # The only scripted reply: empty content and no tool calls.
+        blank_reply = FakeChatCompletion(
+            _make_chat_completion_response(content="", tool_calls=None)
+        )
+        stub._step_responses = [blank_reply]
+
+        stub.inference_multi_turn(simple_test_entry)
+
+        finished = span_exporter.get_finished_spans()
+        react_steps = _spans_named(finished, "react step")
+        assert len(react_steps) == 1
+        meta = dict(react_steps[0].attributes or {})
+        assert meta.get("gen_ai.react.finish_reason") == "empty_response"
+
+    def test_finish_reason_request_exception(
+        self, span_exporter, instrument, simple_test_entry,
+    ):
+        """A raising _request_tool_call gives STEP status ERROR and finish_reason=error."""
+        stub = _StubHandler()
+        stub._step_responses = [RuntimeError("Boom")]
+
+        with pytest.raises(RuntimeError):
+            stub.inference_multi_turn(simple_test_entry)
+
+        finished = span_exporter.get_finished_spans()
+        react_steps = _spans_named(finished, "react step")
+        assert len(react_steps) == 1
+        meta = dict(react_steps[0].attributes or {})
+        assert react_steps[0].status.status_code == StatusCode.ERROR
+        assert meta.get("gen_ai.react.finish_reason") == "error"
+
+    def test_finish_reason_omitted_on_success(
+        self, span_exporter, instrument, simple_test_entry,
+        tool_call_response_factory, text_response_factory,
+    ):
+        """Steps that succeed must carry no finish_reason attribute (per execute.md)."""
+        stub = _StubHandler()
+        tool_reply = tool_call_response_factory(
+            "get_weather", {"city": "Beijing"}, "call_001"
+        )
+        final_reply = text_response_factory("OK")
+        stub._step_responses = [tool_reply, final_reply]
+
+        stub.inference_multi_turn(simple_test_entry)
+
+        finished = span_exporter.get_finished_spans()
+        for step in _spans_named(finished, "react step"):
+            meta = dict(step.attributes or {})
+            assert "gen_ai.react.finish_reason" not in meta, (
+                f"unexpected finish_reason on success step round="
+                f"{meta.get('gen_ai.react.round')}: {meta.get('gen_ai.react.finish_reason')}"
+            )
+
+
+# ============================================================================
+# M3: TOOL span carries gen_ai.tool.call.arguments / result / description
+#     (and keeps wildtool.tool.execution_mode)
+# ============================================================================
+
+
+class TestToolSensitiveAttributes:
+    def test_tool_args_result_description_and_execution_mode(
+        self, span_exporter, instrument, simple_test_entry,
+        tool_call_response_factory, text_response_factory,
+    ):
+        """The TOOL span records call arguments, result, description and execution mode."""
+        stub = _StubHandler()
+        stub._step_responses = [
+            tool_call_response_factory("get_weather", {"city": "Beijing"}, "call_001"),
+            text_response_factory("Sunny day"),
+        ]
+
+        stub.inference_multi_turn(simple_test_entry)
+
+        finished = span_exporter.get_finished_spans()
+        tools = _spans_by_kind(finished, "TOOL")
+        assert len(tools) == 1
+        meta = dict(tools[0].attributes or {})
+
+        # M3: the call arguments are stored as a JSON string.
+        raw_args = meta.get("gen_ai.tool.call.arguments")
+        assert raw_args is not None
+        assert json.loads(raw_args) == {"city": "Beijing"}
+
+        # The result is the plain-string observation; per the original author it
+        # comes from the {"role": "tool", ...} message wtb appends after a match.
+        observed = meta.get("gen_ai.tool.call.result")
+        assert observed == "Sunny, 25°C", meta
+
+        # Description is expected to come from the tool's function.description.
+        assert meta.get("gen_ai.tool.description") == "Get weather for a city"
+
+        # The pre-existing custom attribute must remain.
+        assert (
+            meta.get("wildtool.tool.execution_mode")
+            == "ground_truth_replay"
+        )
+
+
+# ============================================================================
+# H2: STEP span carries gen_ai.system / gen_ai.provider.name fallback
+# ============================================================================
+
+
+class TestStepProviderFallback:
+    def test_step_has_provider_name_fallback(
+        self, span_exporter, instrument, simple_test_entry,
+        tool_call_response_factory, text_response_factory,
+    ):
+        """Both STEP spans report "openai" as gen_ai.system and gen_ai.provider.name."""
+        stub = _StubHandler()
+        stub._step_responses = [
+            tool_call_response_factory("get_weather", {"city": "Beijing"}, "call_001"),
+            text_response_factory("OK"),
+        ]
+        stub.inference_multi_turn(simple_test_entry)
+
+        finished = span_exporter.get_finished_spans()
+        react_steps = _spans_named(finished, "react step")
+        assert len(react_steps) == 2
+        for step in react_steps:
+            meta = dict(step.attributes or {})
+            # Both attribute keys are expected to carry the same provider value.
+            assert meta.get("gen_ai.system") == "openai", meta
+            assert meta.get("gen_ai.provider.name") == "openai", meta
diff --git a/packages.txt b/packages.txt
new file mode 100644
index 000000000..cee224898
--- /dev/null
+++ b/packages.txt
@@ -0,0 +1,112 @@
+aiohappyeyeballs==2.6.1
+aiohttp==3.10.2
+aiosignal==1.3.1
+aliyun-instrumentation-sglang @ file:///Users/liuziming/Desktop/loongsuite-python-agent/instrumentation/aliyun-instrumentation-sglang
+aliyun-instrumentation-vllm @ file:///Users/liuziming/Desktop/loongsuite-python-agent/instrumentation/aliyun-instrumentation-vllm
+-e git+https://github.com/alibaba/loongsuite-python-agent.git@fe5b8bf1938dcd449dfa335234b58af81b00bc98#egg=aliyun_sdk_extension_arms&subdirectory=sdk-extension/aliyun-sdk-extension-arms
+aliyun-semantic-conventions==1.2.0
+annotated-types==0.7.0
+anyio==4.10.0
+asgiref==3.8.1
+asttokens==3.0.0
+async-timeout==4.0.3
+attrs==25.3.0
+blinker==1.7.0
+build==1.3.0
+bytecode==0.17.0
+certifi==2024.7.4
+chardet==5.2.0
+charset-normalizer==3.3.2
+click==8.1.7
+cramjam==2.10.0
+crcmod==1.7
+decorator==5.2.1
+Deprecated==1.2.14
+Django==5.2.4
+executing==2.2.1
+fastapi==0.116.1
+filelock==3.19.1
+Flask==3.0.2
+frozenlist==1.4.1
+fsspec==2025.9.0
+googleapis-common-protos==1.70.0
+h11==0.16.0
+http_server_mock==1.7
+httpcore==1.0.9
+httpretty==1.1.4
+httpx==0.28.1
+idna==3.7
+importlib_metadata==8.4.0
+iniconfig==2.0.0
+ipython==9.5.0
+ipython_pygments_lexers==1.1.1
+itsdangerous==2.1.2
+jedi==0.19.2
+Jinja2==3.1.4
+jsonpath==0.82.2
+MarkupSafe==2.1.5
+matplotlib-inline==0.1.7
+mpmath==1.3.0
+multidict==6.0.5
+networkx==3.5
+numpy==2.3.2
+opentelemetry-api==1.30.0
+-e git+https://github.com/alibaba/loongsuite-python-agent.git@fe5b8bf1938dcd449dfa335234b58af81b00bc98#egg=opentelemetry_exporter_otlp_proto_http&subdirectory=exporter/opentelemetry-exporter-otlp-proto-http
+-e git+https://github.com/alibaba/loongsuite-python-agent.git@fe5b8bf1938dcd449dfa335234b58af81b00bc98#egg=opentelemetry_instrumentation&subdirectory=opentelemetry-instrumentation
+-e git+https://github.com/alibaba/loongsuite-python-agent.git@fe5b8bf1938dcd449dfa335234b58af81b00bc98#egg=opentelemetry_instrumentation_aiohttp_client&subdirectory=instrumentation/opentelemetry-instrumentation-aiohttp-client
+opentelemetry-instrumentation-asgi @ file:///Users/liuziming/Desktop/loongsuite-python-agent/instrumentation/opentelemetry-instrumentation-asgi
+-e git+https://github.com/alibaba/loongsuite-python-agent.git@fe5b8bf1938dcd449dfa335234b58af81b00bc98#egg=opentelemetry_instrumentation_django&subdirectory=instrumentation/opentelemetry-instrumentation-django
+opentelemetry-instrumentation-fastapi @ file:///Users/liuziming/Desktop/loongsuite-python-agent/instrumentation/opentelemetry-instrumentation-fastapi
+opentelemetry-instrumentation-flask @ file:///Users/liuziming/Desktop/loongsuite-python-agent/instrumentation/opentelemetry-instrumentation-flask
+opentelemetry-instrumentation-httpx @ file:///Users/liuziming/Desktop/loongsuite-python-agent/instrumentation/opentelemetry-instrumentation-httpx
+opentelemetry-instrumentation-requests @ file:///Users/liuziming/Desktop/loongsuite-python-agent/instrumentation/opentelemetry-instrumentation-requests
+opentelemetry-instrumentation-tornado @ file:///Users/liuziming/Desktop/loongsuite-python-agent/instrumentation/opentelemetry-instrumentation-tornado
+opentelemetry-instrumentation-wsgi==0.51b0
+-e git+https://github.com/alibaba/loongsuite-python-agent.git@fe5b8bf1938dcd449dfa335234b58af81b00bc98#egg=opentelemetry_sdk&subdirectory=opentelemetry-sdk
+-e git+https://github.com/alibaba/loongsuite-python-agent.git@fe5b8bf1938dcd449dfa335234b58af81b00bc98#egg=opentelemetry_semantic_conventions&subdirectory=opentelemetry-semantic-conventions
+-e git+https://github.com/alibaba/loongsuite-python-agent.git@fe5b8bf1938dcd449dfa335234b58af81b00bc98#egg=opentelemetry_test_utils&subdirectory=opentelemetry-test-utils
+-e git+https://github.com/alibaba/loongsuite-python-agent.git@fe5b8bf1938dcd449dfa335234b58af81b00bc98#egg=opentelemetry_util_http&subdirectory=util/opentelemetry-util-http
+packaging==24.0
+parso==0.8.5
+pexpect==4.9.0
+pillow==11.3.0
+pluggy==1.5.0
+prompt_toolkit==3.0.52
+propcache==0.3.2
+protobuf==6.32.0
+psutil==7.0.0
+ptyprocess==0.7.0
+pure_eval==0.2.3
+py-cpuinfo==9.0.0
+pydantic==2.11.7
+pydantic_core==2.33.2
+Pygments==2.19.2
+pyproject_hooks==1.2.0
+pytest==7.4.4
+python-snappy==0.7.3
+PyYAML==6.0.2
+requests==2.32.3
+setproctitle==1.3.6
+setuptools==80.9.0
+sglang==0.4.8
+sniffio==1.3.1
+sqlparse==0.5.3
+stack-data==0.6.3
+starlette==0.47.2
+sympy==1.14.0
+tomli==2.0.1
+tomlkit==0.13.3
+torch==2.8.0
+tornado==6.5.2
+tqdm==4.67.1
+traitlets==5.14.3
+typing-inspection==0.4.1
+typing_extensions==4.12.2
+urllib3==2.2.2
+uvloop==0.21.0
+wcwidth==0.2.13
+Werkzeug==3.0.6
+wheel==0.45.1
+wrapt==1.16.0
+yarl==1.9.4
+zipp==3.19.2
diff --git a/util/opentelemetry-util-genai/src/opentelemetry/util/genai/extended_types.py b/util/opentelemetry-util-genai/src/opentelemetry/util/genai/extended_types.py
index e110fdcd3..d74131cf9 100644
--- a/util/opentelemetry-util-genai/src/opentelemetry/util/genai/extended_types.py
+++ b/util/opentelemetry-util-genai/src/opentelemetry/util/genai/extended_types.py
@@ -297,6 +297,12 @@ class EntryInvocation:
     output_messages: List[OutputMessage] = field(
         default_factory=_new_output_messages
     )
+    system_instruction: List[MessagePart] = field(
+        default_factory=_new_system_instruction
+    )
+    tool_definitions: List[ToolDefinition] = field(
+        default_factory=_new_tool_definitions
+    )
     response_time_to_first_token: int | None = None  # nanoseconds
     monotonic_start_s: float | None = None