diff --git a/instrumentation-loongsuite/loongsuite-instrumentation-algotune/pyproject.toml b/instrumentation-loongsuite/loongsuite-instrumentation-algotune/pyproject.toml new file mode 100644 index 000000000..69dbd269e --- /dev/null +++ b/instrumentation-loongsuite/loongsuite-instrumentation-algotune/pyproject.toml @@ -0,0 +1,54 @@ +[build-system] +requires = ["hatchling"] +build-backend = "hatchling.build" + +[project] +name = "loongsuite-instrumentation-algotune" +dynamic = ["version"] +description = "LoongSuite algotune instrumentation" +license = "Apache-2.0" +requires-python = ">=3.10,<4" +authors = [ + { name = "OpenTelemetry Authors", email = "cncf-opentelemetry-contributors@lists.cncf.io" }, +] +classifiers = [ + "Development Status :: 4 - Beta", + "Intended Audience :: Developers", + "License :: OSI Approved :: Apache Software License", + "Programming Language :: Python", + "Programming Language :: Python :: 3", + "Programming Language :: Python :: 3.10", + "Programming Language :: Python :: 3.11", + "Programming Language :: Python :: 3.12", + "Programming Language :: Python :: 3.13", +] +dependencies = [ + "opentelemetry-api >= 1.37.0", + "opentelemetry-instrumentation >= 0.58b0", + "opentelemetry-semantic-conventions >= 0.58b0", + "wrapt >= 1.0.0, < 2.0.0", +] + +[project.optional-dependencies] +instruments = [ + +] + +[project.entry-points.opentelemetry_instrumentor] +algotune = "opentelemetry.instrumentation.algotune:AlgoTuneInstrumentor" + +[project.urls] +Homepage = "https://github.com/alibaba/loongsuite-python-agent/tree/main/instrumentation-loongsuite/loongsuite-instrumentation-algotune" +Repository = "https://github.com/alibaba/loongsuite-python-agent" + +[tool.hatch.version] +path = "src/opentelemetry/instrumentation/algotune/version.py" + +[tool.hatch.build.targets.sdist] +include = [ + "/src", + "/tests", +] + +[tool.hatch.build.targets.wheel] +packages = ["src/opentelemetry"] diff --git 
a/instrumentation-loongsuite/loongsuite-instrumentation-algotune/src/opentelemetry/instrumentation/algotune/__init__.py b/instrumentation-loongsuite/loongsuite-instrumentation-algotune/src/opentelemetry/instrumentation/algotune/__init__.py new file mode 100644 index 000000000..2f154dece --- /dev/null +++ b/instrumentation-loongsuite/loongsuite-instrumentation-algotune/src/opentelemetry/instrumentation/algotune/__init__.py @@ -0,0 +1,304 @@ +""" +OpenTelemetry AlgoTune Instrumentation +====================================== + +Automatic instrumentation for the `AlgoTune +`_ benchmark framework. + +This instrumentor produces the AlgoTune-business span tree +(``ENTRY`` / ``AGENT`` / ``STEP`` / ``TOOL`` / ``TASK``) and intentionally +**does not** create LLM spans for the LiteLLM call path. Those are +expected to be produced by an already-loaded LiteLLM instrumentor (e.g. +``opentelemetry-instrumentation-litellm`` or +``openinference-instrumentation-litellm``); they automatically become +children of the active ``STEP`` span thanks to OpenTelemetry context +propagation. + +A separate, **opt-in** wrapper exists for ``TogetherModel.query``, which +hits the Together API directly via ``requests.post`` and is therefore +not covered by the LiteLLM instrumentor. Enable it with the environment +variable ``ALGOTUNE_OTEL_INSTRUMENT_TOGETHER=true``. + +Span hierarchy +-------------- + +:: + + ENTRY: enter_ai_application_system ← AlgoTuner.main:main() + └── AGENT: invoke_agent AlgoTuner ← LLMInterface.run_task() + ├── STEP: react step [round=N] ← get_response + handle_function_call + │ ├── LLM: chat ← LiteLLM instrumentor (auto) + │ │ OR TogetherModel.query (this pkg) + │ └── TOOL: execute_tool ← CommandHandlers.handle_command + │ └── TASK: run_task benchmark.dataset_eval ← _runner_eval_dataset + │ ├── TASK: run_task benchmark.baseline_generation ← get_baseline_times + │ └── TASK: run_task benchmark.problem_eval [×N] ← evaluate_single + └── ... + +Usage +----- + +.. 
code:: python + + # 1) Load the LiteLLM instrumentor first so LLM spans are produced. + from opentelemetry.instrumentation.litellm import LiteLLMInstrumentor + LiteLLMInstrumentor().instrument() + + # 2) Then load the AlgoTune instrumentor for business spans. + from opentelemetry.instrumentation.algotune import AlgoTuneInstrumentor + AlgoTuneInstrumentor().instrument() + + # Run AlgoTune as normal. + # python -m AlgoTuner.main --model gpt-4o --task tsp + +Configuration +------------- + +Environment variables: + +* ``OTEL_INSTRUMENTATION_ALGOTUNE_ENABLED`` — master enable switch (default ``true``). +* ``OTEL_INSTRUMENTATION_GENAI_CAPTURE_MESSAGE_CONTENT`` — capture + tool-call arguments / result messages (default ``false``). +* ``ALGOTUNE_OTEL_MAX_CONTENT_LENGTH`` — character truncation for string + attributes (default ``4096``). +* ``ALGOTUNE_OTEL_INSTRUMENT_TOGETHER`` — wrap ``TogetherModel.query`` with + a manual LLM span (default ``false``). + +API +--- +""" + +from __future__ import annotations + +import importlib +import logging +from typing import Any, Collection + +from opentelemetry import trace as trace_api +from opentelemetry.instrumentation.instrumentor import BaseInstrumentor +from wrapt import wrap_function_wrapper + +from opentelemetry.instrumentation.algotune.config import ( + ALGOTUNE_OTEL_INSTRUMENT_TOGETHER, + OTEL_INSTRUMENTATION_ALGOTUNE_ENABLED, +) +from opentelemetry.instrumentation.algotune.package import _instruments +from opentelemetry.instrumentation.algotune.version import __version__ + +logger = logging.getLogger(__name__) + +__all__ = ["AlgoTuneInstrumentor"] + + +# Patch sites are (logical_name, module_path, qualified_attribute) tuples. +# We use the source module so that the wrap survives import-order changes. 
def _safe_unwrap(module_path: str, qualname: str) -> None:
    """Restore an attribute that was previously wrapped with ``wrapt``.

    ``qualname`` may be ``"Class.method"`` or just ``"func"``. The owner
    chain is resolved with ``getattr``; the final attribute is looked up
    statically so the stored object is inspected rather than the result of
    its descriptor binding.

    The attribute is only restored when it is a ``wrapt`` proxy. A plain
    ``__wrapped__`` attribute is not enough: ``functools.wraps`` sets it
    too, and stripping a decorator that the application applied itself
    would change the application's behaviour.

    Every failure mode (module not importable, missing attribute, attribute
    not wrapped, ``setattr`` rejected) is a no-op; nothing is raised.
    """
    import inspect  # noqa: PLC0415 - only needed on the uninstrument path

    try:
        from wrapt import ObjectProxy  # noqa: PLC0415
    except ImportError:
        # Without wrapt this package cannot have wrapped anything.
        return

    try:
        mod = importlib.import_module(module_path)
    except ImportError:
        return

    *owner_names, leaf_name = qualname.split(".")
    parent: Any = mod
    for owner_name in owner_names:
        parent = getattr(parent, owner_name, None)
        if parent is None:
            return

    # Static lookup: returns the object stored on the module/class without
    # invoking ``__get__``, so a wrapped staticmethod/classmethod hands back
    # the proxy itself instead of an already-bound callable.
    leaf = inspect.getattr_static(parent, leaf_name, None)
    if not isinstance(leaf, ObjectProxy):
        return
    original = getattr(leaf, "__wrapped__", None)
    if original is None:
        return
    try:
        setattr(parent, leaf_name, original)
    except Exception as exc:  # noqa: BLE001
        # Best-effort: uninstrumenting must never break the host program.
        logger.debug(
            "AlgoTune: could not unwrap %s.%s: %s", module_path, qualname, exc
        )
+ """ + + def instrumentation_dependencies(self) -> Collection[str]: + return _instruments + + def _instrument(self, **kwargs: Any) -> None: + if not OTEL_INSTRUMENTATION_ALGOTUNE_ENABLED: + logger.info("AlgoTune instrumentation disabled via env var") + return + + tracer_provider = kwargs.get("tracer_provider") + tracer = trace_api.get_tracer( + __name__, + __version__, + tracer_provider=tracer_provider, + ) + + from opentelemetry.instrumentation.algotune.internal.wrappers import ( + EvaluateSingleWrapper, + GetBaselineTimesWrapper, + GetResponseWrapper, + HandleCommandWrapper, + HandleFunctionCallWrapper, + LiteLLMExecuteQueryWrapper, + LiteLLMQueryWrapper, + MainWrapper, + RunTaskWrapper, + RunnerEvalDatasetWrapper, + TogetherModelQueryWrapper, + ) + + wrappers_by_name: dict[str, Any] = { + "main": MainWrapper(tracer), + "run_task": RunTaskWrapper(tracer), + "get_response": GetResponseWrapper(tracer), + "handle_function_call": HandleFunctionCallWrapper(), + "handle_command": HandleCommandWrapper(tracer), + "_runner_eval_dataset": RunnerEvalDatasetWrapper(tracer), + "evaluate_single": EvaluateSingleWrapper(tracer), + "get_baseline_times": GetBaselineTimesWrapper(tracer), + "query": LiteLLMQueryWrapper(), + "_execute_query": LiteLLMExecuteQueryWrapper(), + } + + for logical_name, module_path, qualname in _PATCH_SITES: + wrapper = wrappers_by_name.get(logical_name) + if wrapper is None: + continue + if not _safe_wrap(module_path, qualname, wrapper): + logger.info( + "AlgoTune: %s not yet importable; skipping wrap", + f"{module_path}.{qualname}", + ) + + if ALGOTUNE_OTEL_INSTRUMENT_TOGETHER: + logical, module_path, qualname = _TOGETHER_PATCH_SITE + _safe_wrap( + module_path, + qualname, + TogetherModelQueryWrapper(tracer), + ) + + # Best-effort sanity check: warn if no LiteLLM instrumentor is + # loaded -- the trace tree will still be valid but LLM spans will + # be missing. 
def _bool_env(name: str, default: bool) -> bool:
    """Read a boolean switch from the environment variable ``name``.

    Returns ``default`` when the variable is unset or set to an
    empty/whitespace-only string. Otherwise returns ``True`` only for
    ``true``, ``1``, ``yes`` or ``on`` (case-insensitive, surrounding
    whitespace ignored); any other non-empty value yields ``False``.
    """
    raw = os.getenv(name)
    if raw is None:
        return default
    value = raw.strip().lower()
    if not value:
        # A set-but-empty variable is treated like an unset one, so that
        # e.g. ``FOO=`` in a shell profile cannot silently flip a
        # default-true switch off.
        return default
    return value in {"true", "1", "yes", "on"}
def _genai_capture_enabled() -> bool:
    """Report whether GenAI message-content capture is switched on.

    Reads ``OTEL_INSTRUMENTATION_GENAI_CAPTURE_MESSAGE_CONTENT``. An unset
    variable means ``False``. Otherwise the value is stripped, upper-cased
    and compared against the accepted spellings listed below.
    """
    accepted = (
        "TRUE",
        "1",
        "YES",
        "ON",
        "SPAN_ONLY",
        "SPAN_AND_EVENT",
        "EVENT_ONLY",
    )
    raw = os.environ.get("OTEL_INSTRUMENTATION_GENAI_CAPTURE_MESSAGE_CONTENT")
    return raw is not None and raw.strip().upper() in accepted
+ALGOTUNE_OTEL_INSTRUMENT_TOGETHER = _bool_env( + "ALGOTUNE_OTEL_INSTRUMENT_TOGETHER", False +) diff --git a/instrumentation-loongsuite/loongsuite-instrumentation-algotune/src/opentelemetry/instrumentation/algotune/internal/__init__.py b/instrumentation-loongsuite/loongsuite-instrumentation-algotune/src/opentelemetry/instrumentation/algotune/internal/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/instrumentation-loongsuite/loongsuite-instrumentation-algotune/src/opentelemetry/instrumentation/algotune/internal/utils.py b/instrumentation-loongsuite/loongsuite-instrumentation-algotune/src/opentelemetry/instrumentation/algotune/internal/utils.py new file mode 100644 index 000000000..47836b435 --- /dev/null +++ b/instrumentation-loongsuite/loongsuite-instrumentation-algotune/src/opentelemetry/instrumentation/algotune/internal/utils.py @@ -0,0 +1,120 @@ +"""Shared helpers for AlgoTune wrappers.""" + +from __future__ import annotations + +from typing import Any, Optional +from opentelemetry.semconv._incubating.attributes import ( + gen_ai_attributes as GenAI, +) + +from opentelemetry.instrumentation.algotune.config import ( + ALGOTUNE_OTEL_MAX_CONTENT_LENGTH, +) + +# Aliyun ARMS GenAI conventions (mirrors the values used by the other Robin +# instrumentations such as minisweagent / pinchbench). +GEN_AI_SPAN_KIND = "gen_ai.span.kind" +GEN_AI_FRAMEWORK = "gen_ai.framework" +GEN_AI_USAGE_TOTAL_TOKENS = "gen_ai.usage.total_tokens" + +ALGOTUNE_FRAMEWORK_VALUE = "AlgoTune" + +# Instance attribute names used by wrappers to share state across hooks +# without polluting AlgoTune's public API. 
def provider_from_model(model_name: str) -> str:
    """Best-effort provider inference from a LiteLLM-style model name.

    Resolution order:

    1. Empty/falsy input returns ``"unknown"``.
    2. If the lower-cased name has a ``provider/`` prefix that is one of
       the recognised prefixes, that prefix is returned, with
       ``vertex_ai``/``gemini`` mapped to ``"google"`` and ``azure_ai``
       mapped to ``"azure"``.
    3. Otherwise substring heuristics are tried in a fixed order
       (anthropic, google, mistral, deepseek, dashscope, openai).
    4. OpenAI o-series ids (``o1``, ``o3-mini``, ``o4-mini`` ...) are
       matched as a whole token, so names that merely contain the letters
       (e.g. ``...pro128k``) are not misclassified as OpenAI.
    5. Anything else returns ``"unknown"``.
    """
    import re  # noqa: PLC0415 - keeps the helper self-contained

    if not model_name:
        return "unknown"
    name = model_name.lower()

    prefix, slash, _rest = name.partition("/")
    if slash:
        # LiteLLM accepts a handful of provider prefixes; map common ones.
        known_prefixes = {
            "openai",
            "anthropic",
            "vertex_ai",
            "gemini",
            "google",
            "mistral",
            "azure",
            "azure_ai",
            "bedrock",
            "groq",
            "deepseek",
            "openrouter",
            "together_ai",
        }
        aliases = {"vertex_ai": "google", "gemini": "google", "azure_ai": "azure"}
        if prefix in known_prefixes:
            return aliases.get(prefix, prefix)

    # Order matters: the first matching family wins.
    substring_rules = (
        (("claude", "anthropic"), "anthropic"),
        (("gemini", "vertex", "google"), "google"),
        (("mistral",), "mistral"),
        (("deepseek",), "deepseek"),
        (("qwen", "dashscope"), "dashscope"),
        (("gpt", "openai"), "openai"),
    )
    for needles, provider in substring_rules:
        if any(needle in name for needle in needles):
            return provider

    # o-series model ids must stand alone as a token ("o1", "o3-mini",
    # "x/o4-mini"), i.e. not be glued to other letters or digits.
    if re.search(r"(?<![a-z0-9])o[1-9](?![a-z0-9])", name):
        return "openai"
    return "unknown"
+ + Used as a safety net in ``run_task``'s ``finally`` block so that a STEP + span never outlives the AGENT span (e.g. when ``get_response`` returns + None and the loop ``break``s before ``handle_function_call`` runs, or + when an exception propagates past STEP cleanup). + """ + from opentelemetry import context as otel_context # local import + + span = getattr(instance, INST_STEP_SPAN_ATTR, None) + token = getattr(instance, INST_STEP_TOKEN_ATTR, None) + try: + if span is not None and span.is_recording(): + span.end() + except Exception: # noqa: BLE001 + pass + try: + if token is not None: + otel_context.detach(token) + except Exception: # noqa: BLE001 + pass + try: + setattr(instance, INST_STEP_SPAN_ATTR, None) + setattr(instance, INST_STEP_TOKEN_ATTR, None) + except Exception: # noqa: BLE001 + pass diff --git a/instrumentation-loongsuite/loongsuite-instrumentation-algotune/src/opentelemetry/instrumentation/algotune/internal/wrappers.py b/instrumentation-loongsuite/loongsuite-instrumentation-algotune/src/opentelemetry/instrumentation/algotune/internal/wrappers.py new file mode 100644 index 000000000..3f0c7b3bb --- /dev/null +++ b/instrumentation-loongsuite/loongsuite-instrumentation-algotune/src/opentelemetry/instrumentation/algotune/internal/wrappers.py @@ -0,0 +1,1332 @@ +"""Wrapt wrappers for AlgoTune OpenTelemetry instrumentation. + +Span hierarchy (final selection):: + + ENTRY: enter_ai_application_system ← AlgoTuner.main:main() + └── AGENT: invoke_agent AlgoTuner ← LLMInterface.run_task() + ├── STEP: react step [round=N] ← get_response + handle_function_call + │ ├── LLM: chat ← LiteLLM instrumentor (auto) + │ │ OR TogetherModel.query (this pkg) + │ └── TOOL: execute_tool ← CommandHandlers.handle_command + │ └── TASK: run_task benchmark.dataset_eval ← _runner_eval_dataset + │ ├── TASK: run_task benchmark.baseline_generation ← get_baseline_times + │ └── TASK: run_task benchmark.problem_eval [×N] ← evaluate_single + └── ... 
def _text_value(value: Any) -> str:
    """Render ``value`` as text for use inside a span attribute.

    Strings pass through unchanged and ``None`` becomes the empty string.
    Anything else is JSON-encoded (non-ASCII kept as-is, unserialisable
    objects stringified via ``default=str``); if JSON encoding fails the
    plain ``str()`` of the value is returned instead.
    """
    if isinstance(value, str):
        return value
    if value is None:
        return ""
    try:
        rendered = json.dumps(value, default=str, ensure_ascii=False)
    except Exception:  # noqa: BLE001
        rendered = str(value)
    return rendered
def _agent_content_attributes(instance: Any) -> dict[str, Any]:
    """Build the message-content attributes published on agent spans.

    Returns an empty dict when span content capture is disabled. Otherwise
    the last 20 entries of ``instance.state.messages`` are bucketed by
    role: ``assistant`` entries become output messages, ``system`` entries
    become system-instruction parts, everything else becomes an input
    message. Non-dict entries are skipped.

    If no system entry was seen, the content of the very first message (if
    any) is used as the system instruction.

    The result always carries four ``algo.debug.*.count`` values and
    ``gen_ai.output.messages`` (JSON of at most the latest output message).
    ``output.value``, ``gen_ai.input.messages`` (last six),
    ``gen_ai.system_instructions`` (first one) and
    ``gen_ai.tool.definitions`` are added only when non-empty.
    """
    if not _algotune_capture_span_content_enabled():
        return {}

    def _text_part(content: Any) -> dict[str, Any]:
        return {"type": "text", "content": truncate(_text_value(content))}

    def _dump(payload: Any) -> str:
        return json.dumps(payload, ensure_ascii=False, default=str)

    agent_state = getattr(instance, "state", None)
    history = list(getattr(agent_state, "messages", None) or [])
    inbound: list[dict[str, Any]] = []
    outbound: list[dict[str, Any]] = []
    system_parts: list[dict[str, Any]] = []

    for entry in history[-20:]:
        if not isinstance(entry, dict):
            continue
        role = str(entry.get("role") or "user")
        body = entry.get("content")
        if role == "system":
            system_parts.append(_text_part(body))
        elif role == "assistant":
            outbound.append(_span_message("assistant", body))
        else:
            inbound.append(_span_message(role, body))

    # AlgoTune puts its application instructions in the first user message.
    # Surface that separately for UIs that render system instructions.
    if history and not system_parts:
        head = history[0]
        if isinstance(head, dict) and head.get("content"):
            system_parts.append(_text_part(head.get("content")))

    tools = _algotune_tool_definitions()

    # Keep parent span output compact; large parent attributes are commonly
    # harder to render than LLM child attributes in trace UIs.
    latest_output = outbound[-1:]
    attrs: dict[str, Any] = {
        "algo.debug.input_messages.count": len(inbound),
        "algo.debug.output_messages.count": len(outbound),
        "algo.debug.system_instructions.count": len(system_parts),
        "algo.debug.tool_definitions.count": len(tools),
        "gen_ai.output.messages": _dump(latest_output),
    }
    if latest_output:
        try:
            attrs["output.value"] = truncate(
                _text_value(latest_output[-1]["parts"][0].get("content", ""))
            )
        except Exception:  # noqa: BLE001
            pass

    optional_payloads = (
        ("gen_ai.input.messages", inbound[-6:]),
        ("gen_ai.system_instructions", system_parts[:1]),
        ("gen_ai.tool.definitions", tools),
    )
    for key, payload in optional_payloads:
        if payload:
            attrs[key] = _dump(payload)
    return attrs
class MainWrapper:
    """ENTRY span around ``AlgoTuner.main.main()``.

    The span carries a fresh session id, the first seven command-line
    arguments and, when present on the command line, the values following
    ``--model`` and ``--task``.

    Error reporting is done explicitly by this wrapper, so the tracer's
    automatic exception recording is turned off; otherwise every exception
    would be recorded twice on the span.
    """

    __slots__ = ("_tracer",)

    def __init__(self, tracer: Tracer):
        self._tracer = tracer

    @staticmethod
    def _exit_status(code: Any) -> int:
        """Map a ``SystemExit.code`` to the process exit status.

        ``None`` means success (0), an int is used as-is, and any other
        object (typically a message string passed to ``sys.exit``) results
        in exit status 1.
        """
        if code is None:
            return 0
        if isinstance(code, int):
            return int(code)
        return 1

    def __call__(
        self,
        wrapped: Callable[..., Any],
        instance: Any,
        args: tuple[Any, ...],
        kwargs: dict[str, Any],
    ) -> Any:
        """Run ``wrapped`` inside the ENTRY span and re-raise whatever it raises.

        A non-zero ``SystemExit`` and any ``Exception`` mark the span as
        ERROR; ``SystemExit`` with status 0 leaves the span status unset.
        """
        session_id = uuid.uuid4().hex
        argv_repr = ""
        try:
            argv_repr = " ".join(map(str, sys.argv[1:8]))
        except Exception:  # noqa: BLE001
            pass

        with self._tracer.start_as_current_span(
            "enter_ai_application_system",
            kind=SpanKind.INTERNAL,
            # Exceptions are recorded by the handlers below.
            record_exception=False,
            set_status_on_exception=False,
        ) as span:
            span.set_attribute(GEN_AI_SPAN_KIND, "ENTRY")
            span.set_attribute("gen_ai.operation.name", "enter")
            span.set_attribute(GEN_AI_FRAMEWORK, ALGOTUNE_FRAMEWORK_VALUE)
            span.set_attribute("gen_ai.session.id", session_id)
            if argv_repr:
                span.set_attribute("algotune.invocation.argv", truncate(argv_repr))

            # Best-effort: pull --model and --task out of sys.argv so the
            # ENTRY span carries the user's intent before main() finishes.
            try:
                argv = list(sys.argv[1:])
                for i, tok in enumerate(argv):
                    if tok == "--model" and i + 1 < len(argv):
                        span.set_attribute(
                            GenAI.GEN_AI_REQUEST_MODEL, argv[i + 1]
                        )
                    elif tok == "--task" and i + 1 < len(argv):
                        span.set_attribute("algo.task.name", argv[i + 1])
            except Exception:  # noqa: BLE001
                pass

            try:
                return wrapped(*args, **kwargs)
            except SystemExit as exc:
                # sys.exit("message") terminates with status 1, so a
                # non-int code must not be mistaken for success.
                status = self._exit_status(exc.code)
                if status:
                    span.set_attribute("algotune.exit_code", status)
                    span.set_status(
                        Status(StatusCode.ERROR, f"sys.exit({status})")
                    )
                raise
            except MemoryError as exc:
                span.set_attribute("error.type", "MemoryError")
                span.record_exception(exc)
                span.set_status(Status(StatusCode.ERROR, "MemoryError"))
                raise
            except Exception as exc:
                span.record_exception(exc)
                span.set_status(
                    Status(StatusCode.ERROR, f"{type(exc).__name__}: {exc}")
                )
                raise
+ try: + setattr(instance, INST_ROUND_ATTR, 0) + setattr(instance, INST_STEP_SPAN_ATTR, None) + setattr(instance, INST_STEP_TOKEN_ATTR, None) + except Exception: # noqa: BLE001 + pass + + model_name = str(getattr(instance, "model_name", "") or "") + parent_span = trace_api.get_current_span() + + with self._tracer.start_as_current_span( + "invoke_agent AlgoTuner", kind=SpanKind.INTERNAL + ) as span: + span.set_attribute(GEN_AI_SPAN_KIND, "AGENT") + span.set_attribute( + GenAI.GEN_AI_OPERATION_NAME, + GenAI.GenAiOperationNameValues.INVOKE_AGENT.value, + ) + span.set_attribute(GEN_AI_FRAMEWORK, ALGOTUNE_FRAMEWORK_VALUE) + span.set_attribute(GenAI.GEN_AI_AGENT_NAME, "AlgoTuner") + span.set_attribute( + GenAI.GEN_AI_AGENT_DESCRIPTION, + "Iterative code optimization agent for benchmark tasks", + ) + if model_name: + span.set_attribute(GenAI.GEN_AI_REQUEST_MODEL, model_name) + span.set_attribute( + GenAI.GEN_AI_PROVIDER_NAME, + provider_from_model(model_name), + ) + + terminated_reason: str = "unknown" + try: + result = wrapped(*args, **kwargs) + terminated_reason = self._infer_termination_reason(instance) + return result + except (KeyboardInterrupt, SystemExit) as exc: + terminated_reason = type(exc).__qualname__ + if isinstance(exc, SystemExit): + code = exc.code if isinstance(exc.code, int) else 0 + if code: + span.set_status( + Status(StatusCode.ERROR, f"sys.exit({code})") + ) + raise + except Exception as exc: + terminated_reason = "exception" + span.record_exception(exc) + span.set_status(Status(StatusCode.ERROR)) + raise + finally: + # Always close any dangling STEP span first so the trace tree + # never has STEP outliving AGENT. 
+ safe_close_step(instance) + + rounds = int(getattr(instance, INST_ROUND_ATTR, 0) or 0) + span.set_attribute("algo.agent.total_rounds", rounds) + span.set_attribute("algo.agent.final_status", terminated_reason) + _publish_agent_content_attributes(instance, span, parent_span) + + # Spend / final eval bookkeeping (best-effort; AlgoTune may + # have torn the interface down by now). + try: + state = getattr(instance, "state", None) + if state is not None: + spend = getattr(state, "spend", None) + if spend is not None: + span.set_attribute( + "algo.agent.spend_usd", float(spend) + ) + except Exception: # noqa: BLE001 + pass + + try: + final_success = getattr( + instance, "_final_eval_success", None + ) + if final_success is not None: + span.set_attribute( + "algo.agent.final_eval_success", + bool(final_success), + ) + final_eval_result = getattr( + instance, "_final_eval_metrics", None + ) + if isinstance(final_eval_result, dict): + ms = final_eval_result.get("mean_speedup") + if ms is not None: + try: + span.set_attribute( + "algo.agent.final_mean_speedup", float(ms) + ) + except (TypeError, ValueError): + pass + except Exception: # noqa: BLE001 + pass + + span.add_event( + "agent.loop.terminated", + {"reason": terminated_reason}, + ) + + + @staticmethod + def _infer_termination_reason(instance: Any) -> str: + # Heuristics that align with the loop logic in + # LLMInterface.run_task() (line 996+). 
+ try: + check = getattr(instance, "check_limits", None) + if callable(check) and check(): + return "terminated_by_limit" + except Exception: # noqa: BLE001 + pass + try: + if getattr(instance, "_final_eval_success", False): + return "completed" + except Exception: # noqa: BLE001 + pass + return "completed" + + +# --------------------------------------------------------------------------- +# STEP: LLMInterface.get_response() + handle_function_call() +# --------------------------------------------------------------------------- + + +class GetResponseWrapper: + """Open a STEP span when ``get_response`` starts a new react round.""" + + __slots__ = ("_tracer",) + + def __init__(self, tracer: Tracer): + self._tracer = tracer + + def __call__( + self, + wrapped: Callable[..., Any], + instance: Any, + args: tuple[Any, ...], + kwargs: dict[str, Any], + ) -> Any: + # Close any previously opened STEP span before starting a new one + # (covers the empty-response retry path where the loop ``continue``s + # without invoking handle_function_call). 
+ safe_close_step(instance) + + round_n = int(getattr(instance, INST_ROUND_ATTR, 0) or 0) + 1 + try: + setattr(instance, INST_ROUND_ATTR, round_n) + setattr(instance, INST_LITELLM_ATTEMPTS_ATTR, 0) + except Exception: # noqa: BLE001 + pass + + span = self._tracer.start_span("react step", kind=SpanKind.INTERNAL) + span.set_attribute(GEN_AI_SPAN_KIND, "STEP") + span.set_attribute("gen_ai.operation.name", "react") + span.set_attribute(GEN_AI_FRAMEWORK, ALGOTUNE_FRAMEWORK_VALUE) + span.set_attribute("gen_ai.react.round", round_n) + + ctx = set_span_in_context(span) + token = otel_context.attach(ctx) + try: + setattr(instance, INST_STEP_SPAN_ATTR, span) + setattr(instance, INST_STEP_TOKEN_ATTR, token) + except Exception: # noqa: BLE001 + pass + + try: + response = wrapped(*args, **kwargs) + except BaseException as exc: + span.set_attribute( + "gen_ai.react.finish_reason", type(exc).__qualname__ + ) + span.record_exception(exc) + span.set_status(Status(StatusCode.ERROR)) + self._publish_attempt_count(instance, span) + try: + span.end() + finally: + otel_context.detach(token) + _clear_step_state(instance) + raise + + if response is None: + span.set_attribute("algo.step.response_empty", True) + span.set_attribute( + "gen_ai.react.finish_reason", "empty_response_retry" + ) + self._publish_attempt_count(instance, span) + try: + span.end() + finally: + otel_context.detach(token) + _clear_step_state(instance) + return response + + # Non-empty response: STEP stays open, handle_function_call wrapper + # is responsible for closing it. 
class HandleFunctionCallWrapper:
    """Finish the STEP span that ``GetResponseWrapper`` left open.

    The span and its context token are read from ``instance`` before the
    wrapped ``handle_function_call`` runs.  Whatever the outcome, the span is
    ended, the token is detached and the per-instance STEP state is cleared.
    """

    __slots__ = ()

    def __call__(
        self,
        wrapped: Callable[..., Any],
        instance: Any,
        args: tuple[Any, ...],
        kwargs: dict[str, Any],
    ) -> Any:
        step_span: Optional[Span] = getattr(
            instance, INST_STEP_SPAN_ATTR, None
        )
        ctx_token = getattr(instance, INST_STEP_TOKEN_ATTR, None)

        try:
            outcome = wrapped(*args, **kwargs)
        except BaseException as exc:
            # BaseException, not Exception: the STEP must also be closed when
            # the call is aborted by KeyboardInterrupt / SystemExit.
            if step_span is not None and step_span.is_recording():
                step_span.set_attribute(
                    "gen_ai.react.finish_reason", type(exc).__qualname__
                )
                step_span.record_exception(exc)
                step_span.set_status(Status(StatusCode.ERROR))
            self._close_step(instance, step_span, ctx_token)
            raise

        if step_span is not None and step_span.is_recording():
            self._annotate_success(instance, step_span, outcome)
        self._close_step(instance, step_span, ctx_token)
        return outcome

    @staticmethod
    def _annotate_success(instance: Any, span: Span, outcome: Any) -> None:
        """Record the executed command and the attempt count on ``span``."""
        command = _extract_command_name(outcome)
        if command:
            span.set_attribute("algo.step.command_name", command)
        span.set_attribute("gen_ai.react.finish_reason", "tool_executed")
        try:
            attempts = int(
                getattr(instance, INST_LITELLM_ATTEMPTS_ATTR, 0) or 0
            )
            if attempts:
                span.set_attribute("algo.llm.retry_count", attempts)
        except Exception:  # noqa: BLE001
            pass

    @staticmethod
    def _close_step(
        instance: Any, span: Optional[Span], token: Optional[Any]
    ) -> None:
        """End ``span``, detach ``token`` and reset the STEP state.

        Each stage is best-effort: a failure in one never prevents the next.
        """
        try:
            if span is not None and span.is_recording():
                span.end()
        except Exception:  # noqa: BLE001
            pass
        try:
            if token is not None:
                otel_context.detach(token)
        except Exception:  # noqa: BLE001
            pass
        _clear_step_state(instance)
+ setattr(instance, INST_STEP_SPAN_ATTR, None) + setattr(instance, INST_STEP_TOKEN_ATTR, None) + except Exception: # noqa: BLE001 + pass + + +def _extract_command_name(result: Any) -> str: + """Try to recover the executed command name from ``handle_function_call`` + output.""" + if not isinstance(result, dict): + return "" + # CommandResult-style payloads may carry the command name inside + # ``data`` or via ``status_field``-keyed entries; we keep this loose + # because the AlgoTune handlers vary per command. + for key in ("command", "name", "cmd"): + val = result.get(key) + if isinstance(val, str) and val: + return val + data = result.get("data") + if isinstance(data, dict): + for key in ("command", "name", "cmd"): + val = data.get(key) + if isinstance(val, str) and val: + return val + return "" + + +# --------------------------------------------------------------------------- +# TOOL: CommandHandlers.handle_command() +# --------------------------------------------------------------------------- + + +class HandleCommandWrapper: + """TOOL span around ``CommandHandlers.handle_command``.""" + + __slots__ = ("_tracer",) + + def __init__(self, tracer: Tracer): + self._tracer = tracer + + def __call__( + self, + wrapped: Callable[..., Any], + instance: Any, + args: tuple[Any, ...], + kwargs: dict[str, Any], + ) -> Any: + command_obj = args[0] if args else kwargs.get("command_str") + cmd_name, cmd_args, is_error_response = _parse_command(command_obj) + + span_name = f"execute_tool {cmd_name or 'unknown'}" + with self._tracer.start_as_current_span( + span_name, kind=SpanKind.INTERNAL + ) as span: + span.set_attribute(GEN_AI_SPAN_KIND, "TOOL") + span.set_attribute( + GenAI.GEN_AI_OPERATION_NAME, + GenAI.GenAiOperationNameValues.EXECUTE_TOOL.value, + ) + span.set_attribute(GEN_AI_FRAMEWORK, ALGOTUNE_FRAMEWORK_VALUE) + span.set_attribute(GenAI.GEN_AI_TOOL_NAME, cmd_name or "unknown") + span.set_attribute(GenAI.GEN_AI_TOOL_TYPE, "function") + span.set_attribute( + 
GenAI.GEN_AI_TOOL_DESCRIPTION, + "AlgoTune internal command", + ) + span.set_attribute(GenAI.GEN_AI_TOOL_CALL_ID, uuid.uuid4().hex) + + if is_error_response: + span.set_attribute("algotune.command.error_response", True) + + if ( + OTEL_INSTRUMENTATION_GENAI_CAPTURE_MESSAGE_CONTENT + and cmd_args is not None + ): + try: + span.set_attribute( + GenAI.GEN_AI_TOOL_CALL_ARGUMENTS, + truncate(json.dumps(cmd_args, default=str)), + ) + except Exception: # noqa: BLE001 + pass + + try: + result = wrapped(*args, **kwargs) + except Exception as exc: + span.record_exception(exc) + span.set_status(Status(StatusCode.ERROR)) + raise + + if isinstance(result, dict): + success = bool(result.get("success", False)) + span.set_attribute("algo.command.success", success) + if not success and not is_error_response: + span.set_status(Status(StatusCode.ERROR, "command failed")) + + if OTEL_INSTRUMENTATION_GENAI_CAPTURE_MESSAGE_CONTENT: + msg = result.get("message") + if msg: + span.set_attribute( + GenAI.GEN_AI_TOOL_CALL_RESULT, + truncate(msg), + ) + + # Best-effort snapshot detection (only present for ``edit``). + data = result.get("data") + if isinstance(data, dict): + snap = data.get("snapshot_saved") + if snap is not None: + try: + span.set_attribute( + "algo.snapshot.saved", bool(snap) + ) + except Exception: # noqa: BLE001 + pass + + return result + + +def _parse_command(command_obj: Any) -> tuple[str, Optional[dict], bool]: + """Extract ``(command_name, args_dict, is_error_response)`` from the + command object passed to ``handle_command``. + + AlgoTune passes either a ``ParsedCommand`` dataclass or a structured + error dict (see handlers.py line 226). + """ + if isinstance(command_obj, dict): + # Validation/parsing error dict path. 
+ cmd = command_obj.get("command") or "error_response" + return str(cmd), None, True + name = getattr(command_obj, "command", None) + args = getattr(command_obj, "args", None) + if isinstance(args, dict): + return str(name or "unknown"), args, False + return str(name or "unknown"), None, False + + +# --------------------------------------------------------------------------- +# TASK(dataset_eval): CommandHandlers._runner_eval_dataset() +# --------------------------------------------------------------------------- + + +class RunnerEvalDatasetWrapper: + """TASK span around ``CommandHandlers._runner_eval_dataset``.""" + + __slots__ = ("_tracer",) + + def __init__(self, tracer: Tracer): + self._tracer = tracer + + def __call__( + self, + wrapped: Callable[..., Any], + instance: Any, + args: tuple[Any, ...], + kwargs: dict[str, Any], + ) -> Any: + data_subset = ( + args[0] if len(args) >= 1 else kwargs.get("data_subset", "") + ) + command_source = ( + args[1] if len(args) >= 2 else kwargs.get("command_source", "") + ) + + with self._tracer.start_as_current_span( + "run_task benchmark.dataset_eval", kind=SpanKind.INTERNAL + ) as span: + span.set_attribute(GEN_AI_SPAN_KIND, "TASK") + span.set_attribute(GenAI.GEN_AI_OPERATION_NAME, "run_task") + span.set_attribute(GEN_AI_FRAMEWORK, ALGOTUNE_FRAMEWORK_VALUE) + span.set_attribute("gen_ai.task.name", "benchmark.dataset_eval") + if data_subset: + span.set_attribute("algo.eval.subset", str(data_subset)) + if command_source: + span.set_attribute( + "algo.eval.command_source", str(command_source) + ) + _set_task_input( + span, + { + "task": "benchmark.dataset_eval", + "data_subset": str(data_subset) if data_subset else "", + "command_source": str(command_source) + if command_source + else "", + }, + ) + + interface = getattr(instance, "interface", None) + try: + max_samples = getattr(interface, "max_samples", None) + span.set_attribute( + "algo.eval.test_mode", max_samples is not None + ) + except Exception: # noqa: BLE001 + pass 
+ + try: + result = wrapped(*args, **kwargs) + except Exception as exc: + span.record_exception(exc) + span.set_status(Status(StatusCode.ERROR)) + raise + else: + self._record_eval_attributes(span, result) + try: + result_data = result.data if hasattr(result, "data") else result + _set_task_output( + span, + { + "success": getattr(result, "success", None), + "status": getattr(result, "status", None), + "message": getattr(result, "message", None), + "data": result_data, + }, + ) + except Exception: # noqa: BLE001 + pass + return result + finally: + pass + + @staticmethod + def _record_eval_attributes(span: Span, result: Any) -> None: + # ``result`` is typically a ``CommandResult`` dataclass with .data + # carrying aggregate evaluation values, but downstream code also accepts + # raw dicts. We use getattr/dict-access defensively. + try: + data = result.data if hasattr(result, "data") else result + except Exception: # noqa: BLE001 + data = None + + if not isinstance(data, dict): + return + + # The aggregate payload may live at the top level or inside + # ``data``/``raw``/``metrics``. 
+ candidates = [data] + for key in ("aggregate_metrics", "metrics", "raw"): + sub = data.get(key) if isinstance(data, dict) else None + if isinstance(sub, dict): + candidates.append(sub) + + for src in candidates: + for src_key, dst_attr, caster in ( + ("num_evaluated", "algo.eval.total_problems", int), + ("mean_speedup", "algo.eval.mean_speedup", float), + ("num_valid", "algo.eval.num_valid", int), + ("num_invalid", "algo.eval.num_invalid", int), + ("num_timeout", "algo.eval.num_timeout", int), + ): + if src_key in src and src[src_key] is not None: + try: + span.set_attribute(dst_attr, caster(src[src_key])) + except (TypeError, ValueError): + pass + + +# --------------------------------------------------------------------------- +# TASK(problem_eval): EvaluationOrchestrator.evaluate_single() +# --------------------------------------------------------------------------- + + +class EvaluateSingleWrapper: + """TASK span around ``EvaluationOrchestrator.evaluate_single``.""" + + __slots__ = ("_tracer",) + + def __init__(self, tracer: Tracer): + self._tracer = tracer + + def __call__( + self, + wrapped: Callable[..., Any], + instance: Any, + args: tuple[Any, ...], + kwargs: dict[str, Any], + ) -> Any: + problem_id = kwargs.get("problem_id", "problem") + problem_index = kwargs.get("problem_index", 0) + baseline_time_ms = kwargs.get("baseline_time_ms") + + with self._tracer.start_as_current_span( + "run_task benchmark.problem_eval", kind=SpanKind.INTERNAL + ) as span: + span.set_attribute(GEN_AI_SPAN_KIND, "TASK") + span.set_attribute(GenAI.GEN_AI_OPERATION_NAME, "run_task") + span.set_attribute(GEN_AI_FRAMEWORK, ALGOTUNE_FRAMEWORK_VALUE) + span.set_attribute("gen_ai.task.name", "benchmark.problem_eval") + span.set_attribute("algo.problem.id", str(problem_id)) + try: + span.set_attribute("algo.problem.index", int(problem_index)) + except (TypeError, ValueError): + pass + if baseline_time_ms is not None: + try: + span.set_attribute( + "algo.problem.baseline_time_ms", 
float(baseline_time_ms) + ) + except (TypeError, ValueError): + pass + _set_task_input( + span, + { + "task": "benchmark.problem_eval", + "problem_id": str(problem_id), + "problem_index": problem_index, + "baseline_time_ms": baseline_time_ms, + "kwargs": kwargs, + }, + ) + + try: + result = wrapped(*args, **kwargs) + except Exception as exc: + span.record_exception(exc) + span.set_status(Status(StatusCode.ERROR)) + raise + else: + self._record_problem_attributes(span, result) + try: + _set_task_output( + span, + { + "speedup": _safe_get(result, "speedup"), + "solver_time_ms": _safe_get( + result, "solver_time_ms" + ), + "is_valid": _safe_get(result, "is_valid"), + "error_type": _safe_get( + _safe_get(result, "execution"), + "error_type", + ), + }, + ) + except Exception: # noqa: BLE001 + pass + return result + finally: + pass + + @staticmethod + def _record_problem_attributes(span: Span, result: Any) -> None: + # ``ProblemResult`` is a dataclass; defensive getattr handles + # alternate shapes (dict / namedtuple). 
class GetBaselineTimesWrapper:
    """TASK span around ``BaselineManager.get_baseline_times``.

    ``SystemExit`` is handled separately from other exceptions: it is
    reported as a ``baseline.fatal_failure`` event carrying the process exit
    status instead of a recorded exception, and is then re-raised unchanged.
    """

    __slots__ = ("_tracer",)

    def __init__(self, tracer: Tracer):
        self._tracer = tracer

    def __call__(
        self,
        wrapped: Callable[..., Any],
        instance: Any,
        args: tuple[Any, ...],
        kwargs: dict[str, Any],
    ) -> Any:
        """Run ``wrapped`` inside a ``benchmark.baseline_generation`` span.

        Returns whatever ``wrapped`` returns; every exception it raises is
        propagated after being reflected on the span.
        """
        subset = args[0] if args else kwargs.get("subset", "")

        # A cache hit is inferred from the instance's ``_cache`` dict; any
        # failure while probing it (e.g. an unhashable subset) means "miss".
        cache_hit = False
        try:
            cache = getattr(instance, "_cache", None)
            if isinstance(cache, dict) and cache.get(subset) is not None:
                cache_hit = True
        except Exception:  # noqa: BLE001
            pass

        with self._tracer.start_as_current_span(
            "run_task benchmark.baseline_generation",
            kind=SpanKind.INTERNAL,
        ) as span:
            span.set_attribute(GEN_AI_SPAN_KIND, "TASK")
            span.set_attribute(GenAI.GEN_AI_OPERATION_NAME, "run_task")
            span.set_attribute(GEN_AI_FRAMEWORK, ALGOTUNE_FRAMEWORK_VALUE)
            span.set_attribute(
                "gen_ai.task.name", "benchmark.baseline_generation"
            )
            if subset:
                span.set_attribute("algo.baseline.subset", str(subset))
            span.set_attribute("algo.baseline.cache_hit", cache_hit)
            _set_task_input(
                span,
                {
                    "task": "benchmark.baseline_generation",
                    "subset": str(subset) if subset else "",
                    "cache_hit": cache_hit,
                },
            )

            try:
                result = wrapped(*args, **kwargs)
            except SystemExit as exc:
                span.add_event(
                    "baseline.fatal_failure",
                    {"exit_code": self._exit_status(exc.code)},
                )
                span.set_status(
                    Status(
                        StatusCode.ERROR,
                        "Baseline generation fatal failure",
                    )
                )
                raise
            except BaseException as exc:
                span.record_exception(exc)
                span.set_status(Status(StatusCode.ERROR))
                raise

            if isinstance(result, dict):
                span.set_attribute("algo.baseline.actual_count", len(result))
            _set_task_output(
                span,
                {
                    "count": len(result) if isinstance(result, dict) else None,
                    "result": result,
                },
            )
            return result

    @staticmethod
    def _exit_status(code: Any) -> int:
        """Map a ``SystemExit.code`` to the status the interpreter exits with.

        ``None`` means success (0), an integer is used as-is, and any other
        object (typically a message string) means failure (1).
        """
        if code is None:
            return 0
        if isinstance(code, int):
            return int(code)
        return 1
class LiteLLMExecuteQueryWrapper:
    """Count ``LiteLLMModel._execute_query`` attempts; never creates a span.

    Two counters are advanced *before* the wrapped call runs:

    * ``_otel_algo_litellm_call_attempts`` on the model instance, which
      ``LiteLLMQueryWrapper`` resets per ``query()`` call and reads back to
      publish ``algo.llm.last_call_attempts``;
    * ``_otel_algo_step_attempts`` stored as a private attribute on the
      currently active span object and mirrored to the span attribute
      ``algo.llm.retry_count``.

    The second counter lives on the span because the ``LLMInterface`` that
    owns the STEP span is not reachable from the model instance, and the
    ``Span`` API offers no way to read an attribute back.

    NOTE(review): the published value counts every attempt, including the
    first one, although the attribute is named ``retry_count`` — confirm
    that this is the intended meaning.  Presumably each ``_execute_query``
    call maps to one ``litellm.completion()`` invocation; verify against
    AlgoTune.
    """

    __slots__ = ()

    def __call__(
        self,
        wrapped: Callable[..., Any],
        instance: Any,
        args: tuple[Any, ...],
        kwargs: dict[str, Any],
    ) -> Any:
        self._bump_call_attempts(instance)

        active = trace_api.get_current_span()
        if active is not None and active.is_recording():
            self._bump_span_attempts(active)

        return wrapped(*args, **kwargs)

    @staticmethod
    def _bump_call_attempts(instance: Any) -> None:
        """Increment the per-``query()`` attempt counter on the model."""
        try:
            seen = int(
                getattr(instance, "_otel_algo_litellm_call_attempts", 0) or 0
            )
            setattr(instance, "_otel_algo_litellm_call_attempts", seen + 1)
        except Exception:  # noqa: BLE001
            pass

    @staticmethod
    def _bump_span_attempts(span: Any) -> None:
        """Increment the side-band counter on ``span`` and publish it.

        When the span object rejects attribute assignment the counter cannot
        be persisted, so nothing is published rather than a misleading 1.
        """
        try:
            total = getattr(span, "_otel_algo_step_attempts", 0) + 1
            try:
                setattr(span, "_otel_algo_step_attempts", total)
            except Exception:  # noqa: BLE001
                return
            if total:
                span.set_attribute("algo.llm.retry_count", total)
        except Exception:  # noqa: BLE001
            pass
+ """ + + __slots__ = ("_tracer",) + + def __init__(self, tracer: Tracer): + self._tracer = tracer + + def __call__( + self, + wrapped: Callable[..., Any], + instance: Any, + args: tuple[Any, ...], + kwargs: dict[str, Any], + ) -> Any: + model_name = str(getattr(instance, "model_name", "") or "unknown") + span_name = f"chat {model_name}" + defaults = getattr(instance, "default_params", None) or {} + + with self._tracer.start_as_current_span( + span_name, kind=SpanKind.CLIENT + ) as span: + span.set_attribute(GEN_AI_SPAN_KIND, "LLM") + span.set_attribute( + GenAI.GEN_AI_OPERATION_NAME, + GenAI.GenAiOperationNameValues.CHAT.value, + ) + span.set_attribute(GEN_AI_FRAMEWORK, ALGOTUNE_FRAMEWORK_VALUE) + span.set_attribute(GenAI.GEN_AI_REQUEST_MODEL, model_name) + span.set_attribute(GenAI.GEN_AI_PROVIDER_NAME, "together_ai") + + try: + if isinstance(defaults, dict): + if "temperature" in defaults and defaults["temperature"] is not None: + span.set_attribute( + GenAI.GEN_AI_REQUEST_TEMPERATURE, + float(defaults["temperature"]), + ) + if "top_p" in defaults and defaults["top_p"] is not None: + span.set_attribute( + GenAI.GEN_AI_REQUEST_TOP_P, + float(defaults["top_p"]), + ) + if ( + "max_tokens" in defaults + and defaults["max_tokens"] is not None + ): + span.set_attribute( + GenAI.GEN_AI_REQUEST_MAX_TOKENS, + int(defaults["max_tokens"]), + ) + except Exception: # noqa: BLE001 + pass + + input_tokens = 0 + output_tokens = 0 + try: + result = wrapped(*args, **kwargs) + except Exception as exc: + span.record_exception(exc) + span.set_status(Status(StatusCode.ERROR)) + raise + else: + if isinstance(result, dict): + cost = result.get("cost") + if cost is not None: + try: + span.set_attribute( + "algo.llm.response_cost_usd", float(cost) + ) + except (TypeError, ValueError): + pass + usage = result.get("usage") + if isinstance(usage, dict): + input_tokens, output_tokens = _extract_together_usage( + usage + ) + if input_tokens: + span.set_attribute( + 
GenAI.GEN_AI_USAGE_INPUT_TOKENS, input_tokens + ) + if output_tokens: + span.set_attribute( + GenAI.GEN_AI_USAGE_OUTPUT_TOKENS, output_tokens + ) + total = ( + usage.get("total_tokens") + if usage.get("total_tokens") is not None + else (input_tokens + output_tokens or None) + ) + if total: + try: + span.set_attribute( + GEN_AI_USAGE_TOTAL_TOKENS, int(total) + ) + except (TypeError, ValueError): + pass + if OTEL_INSTRUMENTATION_GENAI_CAPTURE_MESSAGE_CONTENT: + msg = result.get("message") + if msg: + span.set_attribute( + GenAI.GEN_AI_OUTPUT_MESSAGES, truncate(msg) + ) + return result + finally: + pass + + +def _extract_together_usage(usage: dict) -> tuple[int, int]: + """Pick (input_tokens, output_tokens) from Together's usage payload. + + Together returns OpenAI-compatible ``prompt_tokens`` / + ``completion_tokens`` but we tolerate ``input_tokens`` / ``output_tokens`` + as well in case the upstream schema drifts. + """ + inp = usage.get("prompt_tokens") + if inp is None: + inp = usage.get("input_tokens") + out = usage.get("completion_tokens") + if out is None: + out = usage.get("output_tokens") + try: + inp_i = int(inp) if inp is not None else 0 + except (TypeError, ValueError): + inp_i = 0 + try: + out_i = int(out) if out is not None else 0 + except (TypeError, ValueError): + out_i = 0 + return inp_i, out_i diff --git a/instrumentation-loongsuite/loongsuite-instrumentation-algotune/src/opentelemetry/instrumentation/algotune/package.py b/instrumentation-loongsuite/loongsuite-instrumentation-algotune/src/opentelemetry/instrumentation/algotune/package.py new file mode 100644 index 000000000..758567afc --- /dev/null +++ b/instrumentation-loongsuite/loongsuite-instrumentation-algotune/src/opentelemetry/instrumentation/algotune/package.py @@ -0,0 +1,3 @@ +_instruments = () + +_supports_metrics = False diff --git a/instrumentation-loongsuite/loongsuite-instrumentation-algotune/src/opentelemetry/instrumentation/algotune/version.py 
b/instrumentation-loongsuite/loongsuite-instrumentation-algotune/src/opentelemetry/instrumentation/algotune/version.py new file mode 100644 index 000000000..3dc1f76bc --- /dev/null +++ b/instrumentation-loongsuite/loongsuite-instrumentation-algotune/src/opentelemetry/instrumentation/algotune/version.py @@ -0,0 +1 @@ +__version__ = "0.1.0" diff --git a/instrumentation-loongsuite/loongsuite-instrumentation-bfclv4/CHANGELOG.md b/instrumentation-loongsuite/loongsuite-instrumentation-bfclv4/CHANGELOG.md new file mode 100644 index 000000000..62fb6539b --- /dev/null +++ b/instrumentation-loongsuite/loongsuite-instrumentation-bfclv4/CHANGELOG.md @@ -0,0 +1,22 @@ +# Changelog + +All notable changes to the LoongSuite BFCL v4 instrumentation are documented +in this file. + +## Unreleased + +### Added + +- Initial release of `loongsuite-instrumentation-bfclv4`. +- ENTRY span around `bfcl_eval._llm_response_generation.generate_results`. +- AGENT span around `bfcl_eval.model_handler.base_handler.BaseHandler.inference` + with cross-thread OTel context propagation via a narrow patch of + `bfcl_eval._llm_response_generation.ThreadPoolExecutor`. +- STEP spans created by reflectively wrapping each handler's + `_query_FC` / `_query_prompting` (discovered via + `bfcl_eval.constants.model_config.MODEL_CONFIG_MAPPING`). +- Per-call TOOL spans emitted by wrapping + `bfcl_eval.eval_checker.multi_turn_eval.multi_turn_utils.execute_multi_turn_func_call`. +- Provider override mapping for OSS handlers (vLLM / SGLang). +- Multi-turn `bfcl.turn_idx` and ReAct `gen_ai.react.round` tracking via + `contextvars`. 
diff --git a/instrumentation-loongsuite/loongsuite-instrumentation-bfclv4/README.md b/instrumentation-loongsuite/loongsuite-instrumentation-bfclv4/README.md new file mode 100644 index 000000000..7a4e5d69d --- /dev/null +++ b/instrumentation-loongsuite/loongsuite-instrumentation-bfclv4/README.md @@ -0,0 +1,79 @@ +# LoongSuite BFCL v4 Instrumentation + +LoongSuite Python instrumentation for the [Berkeley Function Call +Leaderboard v4](https://github.com/ShishirPatil/gorilla/tree/main/berkeley-function-call-leaderboard) +(`bfcl-eval`, package `bfcl_eval`). + +## Span Topology + +``` +ENTRY enter_ai_application_system gen_ai.span.kind=ENTRY, op=enter +└─ AGENT invoke_agent {test_entry_id} gen_ai.span.kind=AGENT, op=invoke_agent + ├─ STEP react step gen_ai.span.kind=STEP, op=react + │ ├─ LLM chat {model} (created by downstream vendor SDK probe) + │ └─ TOOL execute_tool {fn} gen_ai.span.kind=TOOL, op=execute_tool + └─ STEP react step + └─ ... +``` + +This instrumentation deliberately does **not** create LLM spans. They are +emitted by the downstream vendor SDK probe (OpenAI / Anthropic / Google / +DashScope / LiteLLM / etc.) so that token usage and request payloads stay in +sync with the SDK that actually performed the request. + +## Installation + +```bash +pip install loongsuite-instrumentation-bfclv4 +``` + +## Usage + +```bash +opentelemetry-instrument bfcl generate \ + --model gpt-4o-2024-11-20-FC \ + --test-category simple_python \ + --num-threads 2 +``` + +Or programmatically: + +```python +from opentelemetry.instrumentation.bfclv4 import BFCLv4Instrumentor + +BFCLv4Instrumentor().instrument() +# ... run BFCL ... 
+BFCLv4Instrumentor().uninstrument() +``` + +## Compatibility With Downstream LLM SDK Probes + +| Scenario | Recommended downstream probe | +| --- | --- | +| OpenAI / OpenAI Responses / OSS via vLLM / SGLang / DeepSeek (OpenAI-compatible) | `opentelemetry-instrumentation-openai` | +| Anthropic / Claude | `loongsuite-instrumentation-claude-agent-sdk` | +| Gemini / Google | `loongsuite-instrumentation-google-adk` | +| Qwen / DashScope | `loongsuite-instrumentation-dashscope` | +| LiteLLM | `loongsuite-instrumentation-litellm` | + +## OSS Provider Notes + +For OSS handlers (vLLM / SGLang served via the OpenAI-compatible API), the +BFCL probe sets `gen_ai.provider.name` to `vllm` / `sglang` / `oss` and adds +`bfcl.oss.backend` for disambiguation. Downstream OpenAI probes will still +report `gen_ai.provider.name=openai` on the LLM span; this is expected. + +## Custom Attributes + +| Attribute | Where | Description | +| --- | --- | --- | +| `gen_ai.framework` = `bfclv4` | ENTRY/AGENT/STEP/TOOL | Framework tag | +| `bfcl.test_category` | ENTRY/AGENT | Test category | +| `bfcl.num_threads` | ENTRY | Configured thread pool size | +| `bfcl.test_case_count` | ENTRY | Number of test cases | +| `bfcl.run_ids` | ENTRY | Whether the run targeted specific IDs | +| `bfcl.test_entry_id` | AGENT | Test entry id | +| `bfcl.turn_idx` | STEP | Multi-turn turn index (0-based) | +| `bfcl.query_mode` | STEP | `FC` or `prompting` | +| `bfcl.oss.backend` | AGENT/STEP | `vllm` / `sglang` / `unknown` (only OSS) | +| `bfcl.tool.duration_is_estimated` | TOOL | True (latency is averaged across batch) | diff --git a/instrumentation-loongsuite/loongsuite-instrumentation-bfclv4/pyproject.toml b/instrumentation-loongsuite/loongsuite-instrumentation-bfclv4/pyproject.toml new file mode 100644 index 000000000..3eeb5d026 --- /dev/null +++ b/instrumentation-loongsuite/loongsuite-instrumentation-bfclv4/pyproject.toml @@ -0,0 +1,54 @@ +[build-system] +requires = ["hatchling"] +build-backend = 
"hatchling.build" + +[project] +name = "loongsuite-instrumentation-bfclv4" +dynamic = ["version"] +description = "LoongSuite BFCL v4 (Berkeley Function Call Leaderboard) instrumentation" +readme = "README.md" +license = "Apache-2.0" +requires-python = ">=3.10,<4" +authors = [ + { name = "OpenTelemetry Authors", email = "cncf-opentelemetry-contributors@lists.cncf.io" }, +] +classifiers = [ + "Development Status :: 4 - Beta", + "Intended Audience :: Developers", + "License :: OSI Approved :: Apache Software License", + "Programming Language :: Python", + "Programming Language :: Python :: 3.10", + "Programming Language :: Python :: 3.11", + "Programming Language :: Python :: 3.12", +] +dependencies = [ + "opentelemetry-api >= 1.37.0", + "opentelemetry-instrumentation >= 0.58b0", + "opentelemetry-semantic-conventions >= 0.58b0", + "wrapt >= 1.0.0, < 2.0.0", + "opentelemetry-util-genai >= 0.3b0.dev0", +] + +[project.optional-dependencies] +instruments = [ + "bfcl-eval >= 4.0.0", +] + +[project.entry-points.opentelemetry_instrumentor] +bfclv4 = "opentelemetry.instrumentation.bfclv4:BFCLv4Instrumentor" + +[project.urls] +Homepage = "https://github.com/alibaba/loongsuite-python-agent/tree/main/instrumentation-loongsuite/loongsuite-instrumentation-bfclv4" +Repository = "https://github.com/alibaba/loongsuite-python-agent" + +[tool.hatch.version] +path = "src/opentelemetry/instrumentation/bfclv4/version.py" + +[tool.hatch.build.targets.sdist] +include = [ + "/src", + "/tests", +] + +[tool.hatch.build.targets.wheel] +packages = ["src/opentelemetry"] diff --git a/instrumentation-loongsuite/loongsuite-instrumentation-bfclv4/src/opentelemetry/instrumentation/bfclv4/__init__.py b/instrumentation-loongsuite/loongsuite-instrumentation-bfclv4/src/opentelemetry/instrumentation/bfclv4/__init__.py new file mode 100644 index 000000000..6a7729940 --- /dev/null +++ b/instrumentation-loongsuite/loongsuite-instrumentation-bfclv4/src/opentelemetry/instrumentation/bfclv4/__init__.py @@ -0,0 
+1,322 @@ +# Copyright The OpenTelemetry Authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""LoongSuite BFCL v4 (Berkeley Function Call Leaderboard) instrumentation. + +Usage +----- + +.. code:: python + + from opentelemetry.instrumentation.bfclv4 import BFCLv4Instrumentor + + BFCLv4Instrumentor().instrument() + # ... run BFCL ... + BFCLv4Instrumentor().uninstrument() + +API +--- +""" + +from __future__ import annotations + +import importlib +import logging +from typing import Any, Collection, List, Tuple + +from wrapt import wrap_function_wrapper + +from opentelemetry.instrumentation.bfclv4.internal.wrappers import ( + BaseHandlerInferenceWrapper, + ExecuteFuncCallWrapper, + GenerateResultsWrapper, + QueryWrapper, + TurnBumpWrapper, +) +from opentelemetry.instrumentation.bfclv4.package import _instruments +from opentelemetry.instrumentation.bfclv4.utils import GenAIHookHelper +from opentelemetry.instrumentation.instrumentor import BaseInstrumentor +from opentelemetry.instrumentation.utils import unwrap + +logger = logging.getLogger(__name__) + +__all__ = ["BFCLv4Instrumentor"] + + +_GENERATE_RESULTS_MODULE = "bfcl_eval._llm_response_generation" +_GENERATE_RESULTS_NAME = "generate_results" + +_BASE_HANDLER_MODULE = "bfcl_eval.model_handler.base_handler" +_BASE_HANDLER_NAME = "BaseHandler.inference" + +_EXECUTE_TOOL_MODULE = ( + "bfcl_eval.eval_checker.multi_turn_eval.multi_turn_utils" +) +_EXECUTE_TOOL_NAME = "execute_multi_turn_func_call" + + +# 
# ``MODEL_CONFIG_MAPPING`` already imports every concrete handler at module
# load time, so iterating over its values gives us the canonical handler
# class set without risking new vendor SDK imports.
def _iter_handler_classes() -> List[type]:
    """Return the distinct handler classes found in ``MODEL_CONFIG_MAPPING``.

    Each mapping value is inspected for a ``model_handler`` attribute; only
    values that are real classes are kept, de-duplicated by identity and
    returned in first-seen order. An empty list is returned (and a debug
    message logged) when the BFCL mapping cannot be imported.
    """
    try:
        from bfcl_eval.constants.model_config import (  # noqa: PLC0415
            MODEL_CONFIG_MAPPING,
        )
    except Exception as exc:  # noqa: BLE001
        logger.debug(
            "bfclv4: cannot import MODEL_CONFIG_MAPPING: %s", exc
        )
        return []

    # dict keeps insertion order, so ``setdefault`` keeps the first
    # occurrence of every class exactly once.
    unique_by_id: dict[int, type] = {}
    for config in MODEL_CONFIG_MAPPING.values():
        handler_cls = getattr(config, "model_handler", None)
        if isinstance(handler_cls, type):
            unique_by_id.setdefault(id(handler_cls), handler_cls)
    return list(unique_by_id.values())


class BFCLv4Instrumentor(BaseInstrumentor):
    """An instrumentor for the BFCL v4 (``bfcl_eval``) framework."""

    def __init__(self) -> None:
        super().__init__()
        # Only create the bookkeeping attributes that do not exist yet, so
        # running ``__init__`` again on the same object keeps recorded state.
        bookkeeping_defaults = (
            ("_wrapped_query_methods", list),
            ("_wrapped_turn_methods", list),
            ("_entry_wrapped", bool),
            ("_inference_wrapped", bool),
            ("_tool_wrapped", bool),
            ("_tool_targets", list),
        )
        for attr_name, make_default in bookkeeping_defaults:
            if not hasattr(self, attr_name):
                setattr(self, attr_name, make_default())

    def instrumentation_dependencies(self) -> Collection[str]:
        """Return the package requirements declared in ``package._instruments``."""
        return _instruments

    # ------------------------------------------------------------------
    # _instrument

    def _instrument(self, **kwargs: Any) -> None:  # noqa: D401
        """Install the ENTRY, AGENT, STEP/turn and TOOL wrappers.

        Every wrap attempt is individually guarded: a failure is logged and
        the remaining targets are still attempted.
        """
        helper = GenAIHookHelper()

        # 1) ENTRY and 2) AGENT share the same wrap-or-warn shape.
        single_targets = (
            (
                _GENERATE_RESULTS_MODULE,
                _GENERATE_RESULTS_NAME,
                GenerateResultsWrapper,
                "_entry_wrapped",
            ),
            (
                _BASE_HANDLER_MODULE,
                _BASE_HANDLER_NAME,
                BaseHandlerInferenceWrapper,
                "_inference_wrapped",
            ),
        )
        for module_name, target_name, wrapper_cls, flag_name in single_targets:
            try:
                wrap_function_wrapper(
                    module_name, target_name, wrapper_cls(helper)
                )
            except Exception as exc:  # noqa: BLE001
                logger.warning(
                    "bfclv4: failed to wrap %s.%s: %s",
                    module_name,
                    target_name,
                    exc,
                )
            else:
                setattr(self, flag_name, True)

        # 3) STEP + 4) turn maintenance --------------------------------
        self._instrument_handlers(helper)

        # 5) TOOL ------------------------------------------------------
        # ``execute_multi_turn_func_call`` is re-exported via ``from ... import``
        # in several BFCL modules, so wrapping just the source module misses
        # the call sites that use the local binding. We wrap each known
        # re-export site as well to guarantee the TOOL span is always emitted.
        candidate_sites = (
            _EXECUTE_TOOL_MODULE,
            "bfcl_eval.model_handler.base_handler",
            "bfcl_eval.eval_checker.multi_turn_eval.multi_turn_checker",
        )
        tool_wrapper = ExecuteFuncCallWrapper(helper)
        self._tool_targets = []
        for module_name in candidate_sites:
            try:
                wrap_function_wrapper(
                    module_name, _EXECUTE_TOOL_NAME, tool_wrapper
                )
            except Exception as exc:  # noqa: BLE001
                logger.debug(
                    "bfclv4: failed to wrap %s.%s: %s",
                    module_name,
                    _EXECUTE_TOOL_NAME,
                    exc,
                )
            else:
                self._tool_targets.append((module_name, _EXECUTE_TOOL_NAME))
        self._tool_wrapped = bool(self._tool_targets)

    def _instrument_handlers(self, helper: GenAIHookHelper) -> None:
        """Wrap query and turn-maintenance methods on every handler class.

        Only methods defined directly in a class ``__dict__`` are considered,
        and each underlying object is wrapped at most once (tracked by
        ``id``), so an implementation shared between classes is not wrapped
        twice. Successfully wrapped ``(class, method_name)`` pairs are
        recorded for ``_uninstrument``.
        """
        # (method specs, wrapper factory, registry) for each method family.
        # Query methods are handled before turn methods for every class.
        families = (
            (
                (
                    ("_query_FC", "FC"),
                    ("_query_prompting", "prompting"),
                ),
                lambda mode: QueryWrapper(helper, mode),
                self._wrapped_query_methods,
            ),
            (
                (
                    ("add_first_turn_message_FC", True),
                    ("add_first_turn_message_prompting", True),
                    ("_add_next_turn_user_message_FC", False),
                    ("_add_next_turn_user_message_prompting", False),
                ),
                lambda is_first: TurnBumpWrapper(reset=is_first),
                self._wrapped_turn_methods,
            ),
        )
        wrapped_func_ids: set[int] = set()

        for handler_cls in _iter_handler_classes():
            own_namespace = getattr(handler_cls, "__dict__", {})
            for specs, build_wrapper, registry in families:
                for attr_name, option in specs:
                    target = own_namespace.get(attr_name)
                    if target is None or not callable(target):
                        continue
                    target_id = id(target)
                    if target_id in wrapped_func_ids:
                        continue
                    wrapped_func_ids.add(target_id)
                    try:
                        wrap_function_wrapper(
                            handler_cls.__module__,
                            f"{handler_cls.__name__}.{attr_name}",
                            build_wrapper(option),
                        )
                        registry.append((handler_cls, attr_name))
                    except Exception as exc:  # noqa: BLE001
                        logger.debug(
                            "bfclv4: failed to wrap %s.%s.%s: %s",
                            handler_cls.__module__,
                            handler_cls.__name__,
                            attr_name,
                            exc,
                        )

    # ------------------------------------------------------------------
    # _uninstrument

    def _uninstrument(self, **kwargs: Any) -> None:  # noqa: D401
        """Remove every wrapper recorded by ``_instrument``.

        Unwrap failures are logged at debug level and never raised; all
        bookkeeping is reset afterwards.
        """
        if self._tool_wrapped:
            for module_name, attr_name in getattr(self, "_tool_targets", []):
                try:
                    unwrap(importlib.import_module(module_name), attr_name)
                except Exception as exc:  # noqa: BLE001
                    logger.debug(
                        "bfclv4: failed to unwrap %s.%s: %s",
                        module_name,
                        attr_name,
                        exc,
                    )
            self._tool_targets = []
            self._tool_wrapped = False

        # Query wrappers are removed first, then the turn wrappers.
        for registry_attr in ("_wrapped_query_methods", "_wrapped_turn_methods"):
            for owner_cls, method_name in getattr(self, registry_attr):
                try:
                    unwrap(owner_cls, method_name)
                except Exception as exc:  # noqa: BLE001
                    logger.debug(
                        "bfclv4: failed to unwrap %s.%s: %s",
                        owner_cls.__name__,
                        method_name,
                        exc,
                    )
            setattr(self, registry_attr, [])

        if self._inference_wrapped:
            try:
                handler_module = importlib.import_module(_BASE_HANDLER_MODULE)
                unwrap(handler_module.BaseHandler, "inference")
            except Exception as exc:  # noqa: BLE001
                logger.debug(
                    "bfclv4: failed to unwrap BaseHandler.inference: %s", exc
                )
            self._inference_wrapped = False

        if self._entry_wrapped:
            try:
                entry_module = importlib.import_module(_GENERATE_RESULTS_MODULE)
                unwrap(entry_module, _GENERATE_RESULTS_NAME)
            except Exception as exc:  # noqa: BLE001
                logger.debug(
                    "bfclv4: failed to unwrap generate_results: %s", exc
                )
            self._entry_wrapped = False
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/instrumentation-loongsuite/loongsuite-instrumentation-bfclv4/src/opentelemetry/instrumentation/bfclv4/internal/attributes.py b/instrumentation-loongsuite/loongsuite-instrumentation-bfclv4/src/opentelemetry/instrumentation/bfclv4/internal/attributes.py new file mode 100644 index 000000000..774200aba --- /dev/null +++ b/instrumentation-loongsuite/loongsuite-instrumentation-bfclv4/src/opentelemetry/instrumentation/bfclv4/internal/attributes.py @@ -0,0 +1,38 @@ +# Copyright The OpenTelemetry Authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Constant attribute keys used by the BFCL v4 instrumentation.""" + +from __future__ import annotations + +from typing import Final + +FRAMEWORK_NAME: Final = "bfclv4" + +# gen_ai.* attribute keys that are not exported by +# opentelemetry-semantic-conventions today. +GEN_AI_FRAMEWORK: Final = "gen_ai.framework" +GEN_AI_PROVIDER_NAME: Final = "gen_ai.provider.name" + +# BFCL-specific (vendor) attribute keys. 
+BFCL_TEST_CATEGORY: Final = "bfcl.test_category" +BFCL_NUM_THREADS: Final = "bfcl.num_threads" +BFCL_TEST_CASE_COUNT: Final = "bfcl.test_case_count" +BFCL_RUN_IDS: Final = "bfcl.run_ids" +BFCL_TEST_ENTRY_ID: Final = "bfcl.test_entry_id" +BFCL_TURN_IDX: Final = "bfcl.turn_idx" +BFCL_QUERY_MODE: Final = "bfcl.query_mode" +BFCL_OSS_BACKEND: Final = "bfcl.oss.backend" +BFCL_TOOL_DURATION_IS_ESTIMATED: Final = "bfcl.tool.duration_is_estimated" +BFCL_TOOL_INDEX: Final = "bfcl.tool.index" diff --git a/instrumentation-loongsuite/loongsuite-instrumentation-bfclv4/src/opentelemetry/instrumentation/bfclv4/internal/provider.py b/instrumentation-loongsuite/loongsuite-instrumentation-bfclv4/src/opentelemetry/instrumentation/bfclv4/internal/provider.py new file mode 100644 index 000000000..efa2c77dc --- /dev/null +++ b/instrumentation-loongsuite/loongsuite-instrumentation-bfclv4/src/opentelemetry/instrumentation/bfclv4/internal/provider.py @@ -0,0 +1,71 @@ +# Copyright The OpenTelemetry Authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Map BFCL ``ModelStyle`` enum values to ``gen_ai.provider.name``.""" + +from __future__ import annotations + +import os +from typing import Any, Dict, Tuple + +from opentelemetry.instrumentation.bfclv4.internal.attributes import ( + BFCL_OSS_BACKEND, +) + +# The BFCL backend name (vllm / sglang / ...) is communicated from the ENTRY +# wrapper to the per-thread STEP/AGENT wrappers via this env var. 
# The ENTRY wrapper writes to it before invoking the wrapped function and
# clears it in the ``finally`` clause.
OSS_BACKEND_ENV = "BFCL_BACKEND"


def infer_provider(handler: Any) -> Tuple[str, Dict[str, Any]]:
    """Return ``(provider_name, extra_attributes)`` for a BFCL handler.

    Args:
        handler: Any object; its ``model_style`` attribute (if present) is
            compared against the members of BFCL's ``ModelStyle`` enum.

    Returns:
        A ``(provider_name, extra_attributes)`` tuple. For OSS handlers the
        provider is read from the ``OSS_BACKEND_ENV`` environment variable
        (``vllm`` / ``sglang``, otherwise ``oss``) and the extra attributes
        carry ``BFCL_OSS_BACKEND``. For every other style the extras are
        empty. Falls back to ``("unknown", {})`` if BFCL is not importable,
        the handler has no ``model_style``, or the style is not recognised.

    This function never raises because of a missing ``ModelStyle`` member:
    members are looked up by name with ``getattr`` so a BFCL build that lacks
    one of them only disables that mapping entry.
    """

    try:
        from bfcl_eval.constants.enums import (  # noqa: PLC0415
            ModelStyle,
        )
    except Exception:  # noqa: BLE001
        # Same broad guard as the other lazy ``bfcl_eval`` imports: telemetry
        # must not fail the benchmark when BFCL cannot be imported.
        return "unknown", {}

    style = getattr(handler, "model_style", None)
    if style is None:
        return "unknown", {}

    # ``style`` is not None here, so a missing OSSMODEL member (-> None)
    # simply makes this branch unreachable.
    if style is getattr(ModelStyle, "OSSMODEL", None):
        backend = (os.getenv(OSS_BACKEND_ENV) or "").lower()
        if backend in ("vllm", "sglang"):
            return backend, {BFCL_OSS_BACKEND: backend}
        return "oss", {BFCL_OSS_BACKEND: "unknown"}

    # ModelStyle member name -> gen_ai.provider.name value.
    provider_by_member_name = (
        ("OPENAI_COMPLETIONS", "openai"),
        ("OPENAI_RESPONSES", "openai"),
        ("ANTHROPIC", "anthropic"),
        ("GOOGLE", "gcp.gemini"),
        ("MISTRAL", "mistral_ai"),
        ("COHERE", "cohere"),
        ("AMAZON", "aws.bedrock"),
        ("FIREWORK_AI", "fireworks_ai"),
        ("WRITER", "writer"),
        ("NOVITA_AI", "novita"),
        ("NEXUS", "nexusflow"),
        ("GORILLA", "gorilla"),
    )
    for member_name, provider in provider_by_member_name:
        member = getattr(ModelStyle, member_name, None)
        if member is not None and member == style:
            return provider, {}
    return "unknown", {}
# Holds the per-AGENT ReAct counters for the current context; ``None`` means
# no AGENT state has been initialised in this context.
_REACT_STATE: contextvars.ContextVar[Optional[Dict[str, Any]]] = (
    contextvars.ContextVar("bfclv4_react_state", default=None)
)


def init_state() -> contextvars.Token:
    """Initialise per-AGENT state and return the reset token.

    The returned token must be handed to :func:`reset_state` to restore the
    previous value.
    """
    state: Dict[str, Any] = {
        # ``turn_idx`` is incremented by the wrapper around
        # ``_add_next_turn_user_message_*``; it stays ``0`` for single-turn
        # tests.
        "turn_idx": 0,
        # ``fc_round`` is the ReAct round counter. We bump it on every STEP
        # entry so the first STEP within a turn ends up with ``round=1``.
        "fc_round": 0,
        # Counter of executed tool calls within the current AGENT - useful for
        # the TOOL span ``tool_call_id`` synthesis.
        "tool_index": 0,
    }
    return _REACT_STATE.set(state)


def reset_state(token: contextvars.Token) -> None:
    """Restore the state that was active before :func:`init_state`.

    Best-effort: resetting is silently skipped when the token can no longer
    be applied. ``ContextVar.reset`` raises ``RuntimeError`` for a token that
    was already used and ``ValueError`` for a token from another context or
    variable; both are expected on nested error paths and must not mask the
    original exception.
    """
    try:
        _REACT_STATE.reset(token)
    except (LookupError, ValueError, RuntimeError):
        # Token may have already been reset (e.g. nested error path).
        pass


def get_state() -> Optional[Dict[str, Any]]:
    """Return the current state dict, or ``None`` when none is initialised."""
    return _REACT_STATE.get()


def bump_round() -> int:
    """Increment and return the ReAct round counter.

    Returns ``1`` without storing anything when no state is initialised.
    """
    state = _REACT_STATE.get()
    if state is None:
        return 1
    state["fc_round"] = state.get("fc_round", 0) + 1
    return state["fc_round"]


def reset_round_for_turn() -> None:
    """Reset the ReAct round counter to ``0`` (no-op without state)."""
    state = _REACT_STATE.get()
    if state is None:
        return
    state["fc_round"] = 0


def bump_turn() -> int:
    """Increment the turn index, reset the round counter, return the index.

    Returns ``0`` without storing anything when no state is initialised.
    """
    state = _REACT_STATE.get()
    if state is None:
        return 0
    state["turn_idx"] = state.get("turn_idx", 0) + 1
    state["fc_round"] = 0
    return state["turn_idx"]


def next_tool_index() -> int:
    """Return the current tool index and advance the counter by one.

    Returns ``0`` without storing anything when no state is initialised.
    """
    state = _REACT_STATE.get()
    if state is None:
        return 0
    idx = state.get("tool_index", 0)
    state["tool_index"] = idx + 1
    return idx
class ContextPropagatingExecutor(_RealExecutor):
    """Thread pool whose tasks run inside a copy of the submitter's context.

    ``submit`` is the only overridden entry point; the module notes state
    that BFCL's ``generate_results`` only uses ``submit``.
    """

    def submit(self, fn, /, *args, **kwargs):  # type: ignore[override]
        # Snapshot the context at submission time and let the worker execute
        # ``fn`` through ``Context.run`` so it sees the caller's context
        # variables (including the current OTel span).
        caller_context = contextvars.copy_context()
        return super().submit(caller_context.run, fn, *args, **kwargs)
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Wrapper classes for the BFCL v4 instrumentation. + +Each wrapper follows the standard ``wrapt`` callable contract:: + + def __call__(self, wrapped, instance, args, kwargs): + ... + +All wrappers rely on :func:`get_extended_telemetry_handler` (LoongSuite +``util-genai``) to create the actual spans, so that ENTRY / AGENT / STEP / +TOOL spans get the canonical ``gen_ai.span.kind`` and operation-name values +that the LoongSuite semantic-validator expects. +""" + +from __future__ import annotations + +import ast +import importlib +import inspect +import logging +import os +import sys +import time +from contextvars import ContextVar +from typing import Any, Callable, Iterable, List, Optional + +from opentelemetry.instrumentation.bfclv4.internal.attributes import ( + BFCL_NUM_THREADS, + BFCL_OSS_BACKEND, + BFCL_QUERY_MODE, + BFCL_RUN_IDS, + BFCL_TEST_CASE_COUNT, + BFCL_TEST_CATEGORY, + BFCL_TEST_ENTRY_ID, + BFCL_TOOL_DURATION_IS_ESTIMATED, + BFCL_TOOL_INDEX, + BFCL_TURN_IDX, + FRAMEWORK_NAME, + GEN_AI_FRAMEWORK, + GEN_AI_PROVIDER_NAME, +) +from opentelemetry.instrumentation.bfclv4.internal.provider import ( + OSS_BACKEND_ENV, + infer_provider, +) +from opentelemetry.instrumentation.bfclv4.internal.state import ( + bump_round, + bump_turn, + init_state, + next_tool_index, + reset_state, +) +from opentelemetry.instrumentation.bfclv4.internal.threading_propagation import ( + ContextPropagatingExecutor, +) +from opentelemetry.instrumentation.bfclv4.utils import ( + GenAIHookHelper, + to_text_input, + to_text_output, + truncate_text, +) +from 
def _safe_get(obj: Any, key: str, default: Any = None) -> Any:
    """Read ``key`` from a dict (item lookup) or any other object (attribute).

    Returns ``default`` when the key/attribute is absent.
    """
    if isinstance(obj, dict):
        return obj.get(key, default)
    return getattr(obj, key, default)


def _flatten_tokens(value: Any) -> Optional[int]:
    """Sum a possibly nested ``int|float|list|list[list]`` BFCL token field.

    Returns:
        The integer total of every numeric leaf, or ``None`` when ``value``
        contains no usable number (``None``, empty containers, text,
        non-finite floats, unsupported types).
    """
    if value is None:
        return None
    if isinstance(value, (int, float)):
        try:
            return int(value)
        except (ValueError, OverflowError):
            # NaN / +-inf cannot be represented as a token count.
            return None
    # Text is iterable but never a token count; a one-character string
    # iterates to itself, so recursing into it would never terminate.
    if isinstance(value, (str, bytes, bytearray)):
        return None
    if not isinstance(value, Iterable):
        return None
    leaf_totals = [
        total
        for total in (_flatten_tokens(item) for item in value)
        if total is not None
    ]
    return sum(leaf_totals) if leaf_totals else None


def _test_category_from_id(test_entry_id: Optional[str]) -> Optional[str]:
    """Derive the test category by dropping the last ``_<suffix>`` of an id.

    Returns ``None`` for empty ids, ids without an underscore and values that
    are not strings.
    """
    if not isinstance(test_entry_id, str) or "_" not in test_entry_id:
        return None
    return test_entry_id.rsplit("_", 1)[0]


def _join_test_category(value: Any) -> Optional[str]:
    """Render a test-category value as a single string.

    Strings are returned unchanged, lists/tuples/sets are comma-joined
    (skipping ``None`` items; an empty result becomes ``None``) and any other
    non-``None`` value is passed through ``str``.
    """
    if value is None:
        return None
    if isinstance(value, str):
        return value
    if isinstance(value, (list, tuple, set)):
        joined = ",".join(str(v) for v in value if v is not None)
        return joined or None
    return str(value)
def _json_attr(value: Any) -> str:
    """Serialise ``value`` to JSON text for use as a span attribute.

    Non-ASCII characters are kept as-is and objects JSON cannot encode are
    stringified. If serialisation still fails, the ``_safe_str`` rendering is
    returned instead.
    """
    import json

    try:
        return json.dumps(value, ensure_ascii=False, default=str)
    except Exception:  # noqa: BLE001
        return _safe_str(value)


def _message_dict(role: str, content: Any) -> dict:
    """Build a single-part text message dict for ``role``."""
    text_part = {"type": "text", "content": truncate_text(_safe_str(content))}
    return {"role": role, "parts": [text_part]}


def _system_instruction_dict(content: Any) -> dict:
    """Build a text part dict holding a (truncated) system instruction."""
    rendered = truncate_text(_safe_str(content))
    return {"type": "text", "content": rendered}


def _test_entry_to_messages(test_entry: Any):
    """Split a BFCL test entry into ``(inputs, system_instructions)``.

    System instructions are collected from the ``system*`` keys of the entry;
    the ``question`` value is expanded by ``_append_question_messages``, which
    may add to both lists. Non-dict entries yield two empty lists.
    """
    if not isinstance(test_entry, dict):
        return [], []

    system_keys = (
        "system",
        "system_prompt",
        "system_instruction",
        "system_instructions",
    )
    system_instructions = []
    for system_key in system_keys:
        raw = test_entry.get(system_key)
        if raw in (None, "", [], {}):
            continue
        system_instructions.append(
            Text(content=truncate_text(_safe_str(raw)))
        )

    inputs = []
    _append_question_messages(
        test_entry.get("question"),
        inputs,
        system_instructions,
    )
    return inputs, system_instructions
def _test_entry_to_tool_definitions(test_entry: Any) -> list:
    """Collect de-duplicated tool definitions from a BFCL test entry.

    Looks at the ``function`` / ``functions`` / ``tools`` /
    ``tool_definitions`` keys and at ``missed_function`` (each value when it
    is a dict, otherwise the value itself). Non-dict entries yield ``[]``.
    """
    if not isinstance(test_entry, dict):
        return []

    sources = [
        test_entry.get(key)
        for key in ("function", "functions", "tools", "tool_definitions")
    ]
    missed = test_entry.get("missed_function")
    if isinstance(missed, dict):
        sources.extend(missed.values())
    else:
        sources.append(missed)

    collected = []
    for source in sources:
        collected.extend(_tool_value_to_definitions(source))
    return _dedupe_tool_definitions(collected)


def _tool_value_to_definitions(value: Any) -> list:
    """Convert one raw tool value into a list of tool definition objects.

    Accepts JSON text, lists/tuples (flattened recursively) and dicts,
    including dicts that nest the real definition under ``"function"``.
    Anything without a usable name, or of an unsupported type, yields ``[]``.
    """
    if value in (None, "", [], {}):
        return []

    if isinstance(value, str):
        import json

        try:
            value = json.loads(value)
        except Exception:  # noqa: BLE001
            return []

    if isinstance(value, (list, tuple)):
        return [
            definition
            for element in value
            for definition in _tool_value_to_definitions(element)
        ]

    if not isinstance(value, dict):
        return []

    inner = value.get("function")
    if isinstance(inner, dict):
        # Promote the nested definition, inheriting the outer ``type`` when
        # the nested dict does not declare one.
        promoted = {**inner}
        promoted.setdefault("type", value.get("type", "function"))
        return _tool_value_to_definitions(promoted)

    tool_name = (
        value.get("name") or value.get("function_name") or value.get("tool_name")
    )
    if not tool_name:
        return []

    kind = value.get("type")
    doc = value.get("description")
    params = value.get("parameters")

    is_function_kind = kind in (None, "", "function")
    if params is None and not is_function_kind:
        return [GenericToolDefinition(name=str(tool_name), type=str(kind))]

    return [
        FunctionToolDefinition(
            name=str(tool_name),
            description=None if doc is None else _safe_str(doc),
            parameters=params,
        )
    ]
def _tool_description_map(test_entry: Any) -> dict[str, str]:
    """Map tool names to descriptions for one BFCL test entry.

    Descriptions come first from the entry's own tool definitions; for dict
    entries they are then completed with docstrings of the public functions
    of each class named in ``involved_classes`` (never overriding a name that
    is already present).
    """
    descriptions: dict[str, str] = {}
    for tool_def in _test_entry_to_tool_definitions(test_entry):
        tool_name = getattr(tool_def, "name", None)
        tool_doc = getattr(tool_def, "description", None)
        if tool_name and tool_doc:
            descriptions[str(tool_name)] = _safe_str(tool_doc)

    if not isinstance(test_entry, dict):
        return descriptions

    # Multi-turn BFCL cases often leave ``function`` empty and expose tools via
    # involved_classes. Pull method docstrings from BFCL's executable classes so
    # TOOL spans still carry gen_ai.tool.description.
    class_names = test_entry.get("involved_classes") or []
    try:
        from bfcl_eval.constants.executable_backend_config import (  # noqa: PLC0415
            CLASS_FILE_PATH_MAPPING,
        )
    except Exception:  # noqa: BLE001
        CLASS_FILE_PATH_MAPPING = {}

    if not isinstance(class_names, (list, tuple)):
        class_names = []

    for class_name in class_names:
        module_path = CLASS_FILE_PATH_MAPPING.get(class_name)
        if not module_path:
            continue
        try:
            backend_cls = getattr(importlib.import_module(module_path), class_name)
        except Exception:  # noqa: BLE001
            continue
        members = inspect.getmembers(backend_cls, predicate=inspect.isfunction)
        for member_name, member in members:
            if member_name.startswith("_") or member_name in descriptions:
                continue
            docstring = inspect.getdoc(member)
            if docstring:
                descriptions[member_name] = truncate_text(docstring, 1024)
    return descriptions
def _set_tool_call_span_attrs(
    span: Any,
    *,
    arguments: Any = None,
    result: Any = None,
    description: Optional[str] = None,
    tool_name: Optional[str] = None,
    tool_call_id: Optional[str] = None,
    tool_type: Optional[str] = "function",
) -> None:
    """Copy tool-call details onto a TOOL span as ``gen_ai.tool.*`` attributes.

    Args:
        span: Target span; ``None`` and non-recording spans are ignored.
        arguments: Tool call arguments; non-string values are JSON-encoded.
        result: Tool call result; non-string values are JSON-encoded.
        description: Tool description, set only when non-empty.
        tool_name: Tool name, set only when non-empty.
        tool_call_id: Tool call id, set only when non-empty.
        tool_type: Tool type, set only when non-empty.

    Best-effort: any failure is logged at debug level and never raised.
    Diagnostics go through the module logger instead of being printed to
    stderr on every call, so they stay silent unless debug logging is on.
    """
    if span is None:
        return
    try:
        if not span.is_recording():
            return
        if tool_call_id:
            span.set_attribute(GEN_AI_TOOL_CALL_ID_ATTR, tool_call_id)
        if tool_name:
            span.set_attribute(GEN_AI_TOOL_NAME_ATTR, tool_name)
        if tool_type:
            span.set_attribute(GEN_AI_TOOL_TYPE_ATTR, tool_type)
        if arguments is not None:
            span.set_attribute(
                GEN_AI_TOOL_CALL_ARGUMENTS_ATTR,
                _span_attr_value(arguments),
            )
        if result is not None:
            span.set_attribute(
                GEN_AI_TOOL_CALL_RESULT_ATTR,
                _span_attr_value(result),
            )
        if description:
            span.set_attribute(GEN_AI_TOOL_DESCRIPTION_ATTR, description)
        logger.debug(
            "bfclv4: TOOL attrs name=%s id=%s has_arguments=%s "
            "has_result=%s has_description=%s",
            tool_name,
            tool_call_id,
            arguments is not None,
            result is not None,
            bool(description),
        )
    except Exception:  # noqa: BLE001
        logger.debug("bfclv4: failed to set TOOL call attrs", exc_info=True)
def _emit_synthetic_tool_spans(
    result_payload: Any,
    *,
    test_entry_id: Optional[Any],
    model_name: Optional[Any],
) -> int:
    """Emit TOOL spans for BFCL cases that generate calls but do not execute them.

    Each ``(tool_name, arguments)`` pair produced by
    ``_iter_model_tool_calls(result_payload)`` becomes one TOOL span flagged
    with ``BFCL_SYNTHETIC_TOOL_CALL``.

    Args:
        result_payload: Decoded model output to scan for tool calls.
        test_entry_id: BFCL test entry id, used for the synthetic call id and
            recorded on the span when not ``None``.
        model_name: Model name, used for the synthetic call id.

    Returns:
        The number of spans that were emitted without error.
    """
    calls = list(_iter_model_tool_calls(result_payload))
    if not calls:
        return 0
    handler_obj = get_extended_telemetry_handler()
    emitted = 0
    for index, (tool_name, arguments) in enumerate(calls):
        # Everything per-call lives inside the ``try`` so that a failure in
        # description lookup or invocation construction is treated like any
        # other telemetry failure instead of escaping into BFCL.
        try:
            # Resolve once so the invocation and the span attributes agree.
            resolved_name = tool_name or "unknown"
            call_id = _synth_tool_call_id(test_entry_id, model_name, index)
            call_arguments = _normalise_tool_arguments(arguments)
            description = _lookup_tool_description(tool_name)
            tool_inv = ExecuteToolInvocation(
                tool_name=resolved_name,
                tool_call_id=call_id,
                tool_type="function",
                tool_description=description,
                tool_call_arguments=call_arguments,
                tool_call_result=None,
            )
            with handler_obj.execute_tool(tool_inv) as inv:
                span = inv.span
                if span is not None and span.is_recording():
                    span.set_attribute(GEN_AI_FRAMEWORK, FRAMEWORK_NAME)
                    span.set_attribute(BFCL_TOOL_INDEX, index)
                    span.set_attribute(BFCL_SYNTHETIC_TOOL_CALL, True)
                    if test_entry_id is not None:
                        span.set_attribute(
                            BFCL_TEST_ENTRY_ID, str(test_entry_id)
                        )
                    _set_tool_call_span_attrs(
                        span,
                        arguments=call_arguments,
                        description=description,
                        tool_name=resolved_name,
                        tool_call_id=call_id,
                        tool_type="function",
                    )
            emitted += 1
        except Exception:  # noqa: BLE001
            logger.debug(
                "bfclv4 synthetic TOOL span emission failed", exc_info=True
            )
    return emitted
+ * Temporarily swap the ``ThreadPoolExecutor`` reference inside the BFCL + generation module to a context-propagating subclass so that AGENT spans + created in worker threads inherit the ENTRY span as parent. + * Publish ``args.backend`` to ``BFCL_BACKEND`` so that + :func:`infer_provider` can attribute OSS spans to vllm / sglang. + """ + + def __init__(self, helper: GenAIHookHelper) -> None: + self._helper = helper + + def __call__(self, wrapped: Callable, instance: Any, args, kwargs): # noqa: D401 + # ``generate_results(args, model_name, test_cases_total)`` + cli_args = args[0] if len(args) >= 1 else kwargs.get("args") + model_name = args[1] if len(args) >= 2 else kwargs.get("model_name") + test_cases_total = ( + args[2] if len(args) >= 3 else kwargs.get("test_cases_total") + ) + + try: + from bfcl_eval import ( # noqa: PLC0415 + _llm_response_generation as _bfcl_gen, + ) + except ImportError: + return wrapped(*args, **kwargs) + + original_executor = getattr(_bfcl_gen, "ThreadPoolExecutor", None) + if original_executor is not None: + _bfcl_gen.ThreadPoolExecutor = ContextPropagatingExecutor + + backend_value = ( + _safe_get(cli_args, "backend", None) if cli_args is not None else None + ) + previous_backend_env = os.environ.get(OSS_BACKEND_ENV) + if backend_value: + os.environ[OSS_BACKEND_ENV] = str(backend_value) + + session_id_default = None + if model_name is not None: + try: + session_id_default = f"{model_name}@{int(time.time())}" + except Exception: # noqa: BLE001 + session_id_default = None + session_id = ( + os.environ.get("BFCL_SESSION_ID") or session_id_default + ) + + entry_inv = EntryInvocation(session_id=session_id) + entry_input_messages = _extract_questions_from_cases(test_cases_total) + entry_system_instructions = _extract_tool_defs_from_cases(test_cases_total) + entry_inv.input_messages = to_text_input("user", _safe_str(entry_input_messages)) + handler = get_extended_telemetry_handler() + + attributes = {GEN_AI_FRAMEWORK: FRAMEWORK_NAME} + 
class BaseHandlerInferenceWrapper:
    """Wraps ``BaseHandler.inference``.

    Creates the AGENT span (kind=AGENT, op=invoke_agent) and initialises the
    per-thread ReAct state used by the STEP wrapper.

    BFCL's outer ``multi_threaded_inference`` catches every exception and
    converts it into a ``"Error during inference: ..."`` string; we mirror
    that behaviour by setting the AGENT span status to ERROR when the
    returned ``result`` looks like an error string, instead of relying on
    a re-raised exception.
    """

    def __init__(self, helper: GenAIHookHelper) -> None:
        self._helper = helper

    @staticmethod
    def _is_recording(inv: Any) -> bool:
        """Return True when *inv* carries a span that is still recording."""
        return inv.span is not None and inv.span.is_recording()

    def __call__(self, wrapped: Callable, instance: Any, args, kwargs):  # noqa: D401
        """Run ``inference`` inside an AGENT span.

        Calls that do not carry a dict ``test_entry`` are passed straight
        through to *wrapped* without any span. Exceptions raised by *wrapped*
        propagate unchanged; the ``invoke_agent`` context manager is left to
        record the failure.
        """
        # ``inference(self, test_entry, include_input_log, exclude_state_log)``
        test_entry = args[0] if args else kwargs.get("test_entry")
        if not isinstance(test_entry, dict):
            return wrapped(*args, **kwargs)

        provider, extra_attrs = infer_provider(instance)
        request_model = getattr(instance, "model_name", None)
        test_entry_id = test_entry.get("id")
        category = _test_category_from_id(test_entry_id)
        involved_classes = test_entry.get("involved_classes") or []
        agent_description = (
            ", ".join(str(c) for c in involved_classes)
            if isinstance(involved_classes, (list, tuple))
            else None
        )

        invocation = InvokeAgentInvocation(
            provider=provider or "unknown",
            request_model=request_model,
            agent_id=test_entry_id,
            agent_name=category or "bfcl_agent",
            agent_description=agent_description or None,
            conversation_id=test_entry_id,
        )

        token = init_state()
        tool_description_token = None
        try:
            # Setup that can raise happens inside the ``try`` so the state
            # token acquired above is always released by the ``finally``.
            tool_description_token = _TOOL_DESCRIPTION_MAP.set(
                _tool_description_map(test_entry)
            )
            handler = get_extended_telemetry_handler()
            with handler.invoke_agent(invocation) as inv:
                if self._is_recording(inv):
                    inv.span.set_attribute(GEN_AI_FRAMEWORK, FRAMEWORK_NAME)
                    if provider:
                        inv.span.set_attribute(GEN_AI_PROVIDER_NAME, provider)
                    if test_entry_id is not None:
                        # ``str()`` matches how the TOOL wrappers record the
                        # same attribute and keeps the value a valid
                        # attribute type for non-string ids.
                        inv.span.set_attribute(
                            BFCL_TEST_ENTRY_ID, str(test_entry_id)
                        )
                    if category is not None:
                        inv.span.set_attribute(BFCL_TEST_CATEGORY, category)
                    for key, value in extra_attrs.items():
                        if value is not None:
                            inv.span.set_attribute(key, value)

                # Capture inputs for the AGENT. Also write span attributes
                # directly because util-genai gates message attributes behind
                # experimental content-capture mode, which makes K8s semantic
                # validation opaque.
                question = test_entry.get("question")
                functions = test_entry.get("function")
                if question is not None:
                    inv.input_messages = to_text_input(
                        "user", truncate_text(_safe_str(question))
                    )
                if functions is not None:
                    # Serialise once; ``to_text_input`` returns ``[]`` for
                    # empty content.
                    system_messages = to_text_input(
                        "system", truncate_text(_safe_str(functions))
                    )
                    inv.system_instruction = (
                        system_messages[0].parts if system_messages else []
                    )
                if self._is_recording(inv):
                    _set_json_span_attr(
                        inv.span,
                        GEN_AI_INPUT_MESSAGES_ATTR,
                        [_message_dict("user", question)],
                    )
                    _set_json_span_attr(
                        inv.span,
                        GEN_AI_SYSTEM_INSTRUCTIONS_ATTR,
                        [_system_instruction_dict(functions)],
                    )

                # Run the original inference call. An exception propagates
                # through the ``with`` block untouched.
                result = wrapped(*args, **kwargs)

                # Detect BFCL's own captured error path (no exception raised
                # but the returned result is the error string).
                result_payload = (
                    result[0] if isinstance(result, tuple) and result else None
                )
                metadata_payload = (
                    result[1]
                    if isinstance(result, tuple) and len(result) >= 2
                    else None
                )

                if (
                    isinstance(result_payload, str)
                    and result_payload.startswith(_BFCL_INFERENCE_ERROR_PREFIX)
                    and self._is_recording(inv)
                ):
                    try:
                        from opentelemetry.trace import Status, StatusCode

                        inv.span.set_status(
                            Status(StatusCode.ERROR, result_payload[:200])
                        )
                    except Exception:  # noqa: BLE001
                        logger.debug(
                            "bfclv4 AGENT: failed to set ERROR status",
                            exc_info=True,
                        )

                if isinstance(metadata_payload, dict):
                    input_tokens = _flatten_tokens(
                        metadata_payload.get("input_token_count")
                    )
                    output_tokens = _flatten_tokens(
                        metadata_payload.get("output_token_count")
                    )
                    if input_tokens is not None:
                        inv.input_tokens = input_tokens
                    if output_tokens is not None:
                        inv.output_tokens = output_tokens

                if result_payload is not None:
                    inv.output_messages = to_text_output(
                        "assistant",
                        truncate_text(_safe_str(result_payload)),
                    )
                    if self._is_recording(inv):
                        _set_json_span_attr(
                            inv.span,
                            GEN_AI_OUTPUT_MESSAGES_ATTR,
                            [_message_dict("assistant", result_payload)],
                        )

                # Synthetic TOOL spans are pure telemetry: a failure here must
                # not discard an inference result that was already produced.
                try:
                    _emit_synthetic_tool_spans(
                        result_payload,
                        test_entry_id=test_entry_id,
                        model_name=request_model,
                    )
                except Exception:  # noqa: BLE001
                    logger.debug(
                        "bfclv4 AGENT: synthetic TOOL span emission failed",
                        exc_info=True,
                    )

                return result
        finally:
            try:
                if tool_description_token is not None:
                    _TOOL_DESCRIPTION_MAP.reset(tool_description_token)
            except (LookupError, ValueError):
                pass
            finally:
                # Always drop the per-thread ReAct state, even if the
                # description-map reset failed unexpectedly.
                reset_state(token)
class QueryWrapper:
    """Wraps a handler's ``_query_FC`` / ``_query_prompting``.

    Creates a ReAct STEP span around a single model query and records the
    query mode, provider, request model and current turn index on it.

    Token usage is deliberately NOT attached here: doing so would require
    re-calling the handler's ``_parse_query_response_*``, which consumes
    single-pass streaming responses (see the comment in ``__call__``).
    Token usage is recovered at the AGENT level instead.
    """

    def __init__(self, helper: GenAIHookHelper, mode: str) -> None:
        self._helper = helper
        self._mode = mode  # "FC" or "prompting"

    def __call__(self, wrapped: Callable, instance: Any, args, kwargs):  # noqa: D401
        """Run the query inside a STEP span.

        Returns the ``(api_response, query_latency)`` pair produced by
        *wrapped*; a streaming ``api_response`` is replaced by an iterator
        over its already-consumed chunks. Exceptions from *wrapped*
        propagate unchanged so that the STEP context manager can mark the
        span as failed; BFCL's outer try/except turns them into an
        "Error during inference: ..." result string at the AGENT layer.
        """
        round_idx = bump_round()
        provider, extra_attrs = infer_provider(instance)

        invocation = ReactStepInvocation(round=round_idx)
        handler_obj = get_extended_telemetry_handler()
        with handler_obj.react_step(invocation) as step_inv:
            span = step_inv.span
            if span is not None and span.is_recording():
                span.set_attribute(GEN_AI_FRAMEWORK, FRAMEWORK_NAME)
                span.set_attribute(BFCL_QUERY_MODE, self._mode)
                if provider:
                    span.set_attribute(GEN_AI_PROVIDER_NAME, provider)
                model_name = getattr(instance, "model_name", None)
                if model_name:
                    span.set_attribute(
                        "gen_ai.request.model", str(model_name)
                    )
                from opentelemetry.instrumentation.bfclv4.internal.state import (
                    get_state,
                )

                state = get_state()
                if state is not None:
                    span.set_attribute(BFCL_TURN_IDX, state.get("turn_idx", 0))
                for key, value in extra_attrs.items():
                    if value is not None:
                        span.set_attribute(key, value)

            api_response, query_latency = wrapped(*args, **kwargs)

            # When the underlying handler returns a streaming wrapper
            # (e.g. ``ChatStreamWrapper`` from openai-v2), the LLM span and
            # its OTel context attach are kept alive until the stream is
            # consumed by BFCL's ``_parse_query_response_*`` *outside* of
            # this STEP context manager. That breaks the LIFO ordering of
            # context attach/detach, leaving the LLM span as the "current"
            # span after the STEP CM exits, which causes the next STEP and
            # any TOOL spans to be parented to the previous STEP rather
            # than to the AGENT.
            #
            # To preserve LIFO ordering, force-consume the stream here
            # (inside the STEP context) and replace it with a plain
            # iterator over the cached chunks. This makes ``stop_llm``
            # (which detaches the LLM context) run *before* STEP detaches.
            if (
                api_response is not None
                and hasattr(api_response, "__next__")
                and not isinstance(api_response, (str, bytes))
            ):
                try:
                    chunks = list(api_response)
                    api_response = iter(chunks)
                except Exception:  # noqa: BLE001
                    # NOTE(review): on a mid-stream failure the original,
                    # partially consumed iterator is handed back to BFCL.
                    logger.debug(
                        "bfclv4 STEP: failed to materialise streaming "
                        "response; LLM/STEP nesting may be incorrect",
                        exc_info=True,
                    )

            # Post-call attribute enrichment - guarded so that any
            # vendor-side surprise never breaks BFCL itself.
            #
            # IMPORTANT: We must NOT re-call ``_parse_query_response_*`` here,
            # because for streaming providers (e.g. Qwen DashScope) the
            # ``api_response`` is a single-pass generator that the parser
            # consumes; calling it twice leaves BFCL's own subsequent call to
            # the parser with an exhausted iterator, which crashes inference
            # with ``UnboundLocalError: chunk``. Token usage will instead be
            # recovered later from the AGENT-level metadata payload.
            try:
                if (
                    span is not None
                    and span.is_recording()
                    and isinstance(query_latency, (int, float))
                ):
                    # NOTE(review): assumes ``query_latency`` is in seconds
                    # (scaled by 1e9 here) and that whole-query latency is
                    # an acceptable stand-in for time-to-first-token.
                    span.set_attribute(
                        "gen_ai.response.time_to_first_token",
                        int(float(query_latency) * 1e9),
                    )
            except Exception:  # noqa: BLE001
                logger.debug(
                    "bfclv4 STEP: post-call enrichment failed", exc_info=True
                )

            return api_response, query_latency
class ExecuteFuncCallWrapper:
    """Wraps
    ``bfcl_eval.eval_checker.multi_turn_eval.multi_turn_utils.execute_multi_turn_func_call``.

    BFCL evaluates a list of function-call strings in a single Python call;
    we surface each one as its own TOOL span by post-processing the wrapped
    result. The spans are therefore opened and closed *after* the batch has
    already run, so their start/end times do not reflect real per-call
    latency; every span carries ``BFCL_TOOL_DURATION_IS_ESTIMATED`` to signal
    this limitation to consumers.
    """

    def __init__(self, helper: GenAIHookHelper) -> None:
        self._helper = helper

    def __call__(self, wrapped: Callable, instance: Any, args, kwargs):  # noqa: D401
        """Execute the batch, then emit one TOOL span per function call.

        The wrapped result is always returned unchanged. Span emission is
        best-effort: any failure while building or recording a span is
        logged at DEBUG level and skipped. Exceptions raised by *wrapped*
        itself propagate unchanged.
        """
        # ``execute_multi_turn_func_call(func_call_list, initial_config,
        #                                involved_classes, model_name,
        #                                test_entry_id, long_context=False,
        #                                is_evaL_run=False)``
        func_call_list = (
            args[0] if args else kwargs.get("func_call_list", [])
        )
        model_name = args[3] if len(args) >= 4 else kwargs.get("model_name")
        test_entry_id = (
            args[4] if len(args) >= 5 else kwargs.get("test_entry_id")
        )

        if not isinstance(func_call_list, list) or not func_call_list:
            return wrapped(*args, **kwargs)

        result = wrapped(*args, **kwargs)

        # From here on the tools have already run: nothing below may raise,
        # otherwise BFCL would lose a result it has already paid for.
        try:
            handler_obj = get_extended_telemetry_handler()
        except Exception:  # noqa: BLE001
            logger.debug(
                "bfclv4 TOOL: telemetry handler unavailable", exc_info=True
            )
            return result

        execution_results = []
        if (
            isinstance(result, tuple)
            and result
            and isinstance(result[0], list)
        ):
            execution_results = result[0]

        for index, func_call in enumerate(func_call_list):
            tool_name = "unknown"
            try:
                # Parsing sits inside the ``try``: ``ast.parse`` behind
                # ``_parse_python_call_arguments`` can raise more than
                # ``SyntaxError`` on malformed model output.
                tool_name = _extract_tool_name(func_call)
                call_arguments = _normalise_tool_arguments(
                    _parse_python_call_arguments(func_call)
                )
                description = _lookup_tool_description(tool_name)
                call_id = _synth_tool_call_id(
                    test_entry_id, model_name, index
                )
                execution_result = (
                    execution_results[index]
                    if index < len(execution_results)
                    else None
                )

                tool_inv = ExecuteToolInvocation(
                    tool_name=tool_name,
                    tool_call_id=call_id,
                    tool_type="function",
                    tool_description=description,
                    tool_call_arguments=call_arguments,
                    tool_call_result=execution_result,
                )

                with handler_obj.execute_tool(tool_inv) as inv:
                    span = inv.span
                    recording = span is not None and span.is_recording()
                    if recording:
                        span.set_attribute(GEN_AI_FRAMEWORK, FRAMEWORK_NAME)
                        span.set_attribute(BFCL_TOOL_INDEX, index)
                        span.set_attribute(
                            BFCL_TOOL_DURATION_IS_ESTIMATED, True
                        )
                        if test_entry_id is not None:
                            span.set_attribute(
                                BFCL_TEST_ENTRY_ID, str(test_entry_id)
                            )
                        _set_tool_call_span_attrs(
                            span,
                            arguments=call_arguments,
                            result=execution_result,
                            description=description,
                            tool_name=tool_name,
                            tool_call_id=call_id,
                            tool_type="function",
                        )
                    # BFCL reports a failed call as a result string rather
                    # than an exception; mirror it as an ERROR status.
                    if (
                        recording
                        and isinstance(execution_result, str)
                        and execution_result.startswith(
                            "Error during execution:"
                        )
                    ):
                        from opentelemetry.trace import Status, StatusCode

                        span.set_status(
                            Status(StatusCode.ERROR, execution_result[:200])
                        )
                    # Bump a per-AGENT counter for downstream debugging.
                    next_tool_index()
            except Exception:  # noqa: BLE001
                logger.debug(
                    "bfclv4 TOOL: span emission failed for %s",
                    tool_name,
                    exc_info=True,
                )

        return result
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +_instruments = ("bfcl-eval >= 4.0.0",) + +_supports_metrics = False diff --git a/instrumentation-loongsuite/loongsuite-instrumentation-bfclv4/src/opentelemetry/instrumentation/bfclv4/utils.py b/instrumentation-loongsuite/loongsuite-instrumentation-bfclv4/src/opentelemetry/instrumentation/bfclv4/utils.py new file mode 100644 index 000000000..c63bbc62b --- /dev/null +++ b/instrumentation-loongsuite/loongsuite-instrumentation-bfclv4/src/opentelemetry/instrumentation/bfclv4/utils.py @@ -0,0 +1,144 @@ +# Copyright The OpenTelemetry Authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Helpers for the BFCL v4 instrumentation. + +The :class:`GenAIHookHelper` mirrors the helper used by the LoongSuite CrewAI +instrumentation: it gates ``gen_ai.input.messages`` / +``gen_ai.output.messages`` / ``gen_ai.system_instructions`` on the standard +LoongSuite content-capture environment knobs so that prompt content is not +exported by default. 
class GenAIHookHelper:
    """Conditionally write prompt / completion content to the span."""

    def __init__(self, capture_content: bool = True) -> None:
        self.capture_content = capture_content

    @staticmethod
    def _serialise(items: Any) -> Any:
        """JSON-encode a list of dataclass message objects."""
        return gen_ai_json_dumps([dataclasses.asdict(item) for item in items])

    def _span_capture_enabled(self) -> bool:
        """Tell whether message content may be written onto spans.

        Requires ``capture_content``, experimental mode, and a content
        capturing mode that includes spans - checked in that order.
        """
        if not self.capture_content:
            return False
        if not is_experimental_mode():
            return False
        return get_content_capturing_mode() in (
            ContentCapturingMode.SPAN_ONLY,
            ContentCapturingMode.SPAN_AND_EVENT,
        )

    def on_completion(
        self,
        span: Span,
        inputs: Optional[List[InputMessage]] = None,
        outputs: Optional[List[OutputMessage]] = None,
        system_instructions: Optional[List[MessagePart]] = None,
        attributes: Optional[Dict[str, Any]] = None,
    ) -> None:
        """Record message content (when allowed) and extra attributes.

        Nothing is written to a span that is not recording. Message content
        is only written when span capture is enabled; *attributes* are
        always written, skipping ``None`` values, and a failing
        ``set_attribute`` is logged at DEBUG level and ignored.
        """
        if not span.is_recording():
            return

        if self._span_capture_enabled():
            if inputs:
                span.set_attribute(
                    gen_ai_attributes.GEN_AI_INPUT_MESSAGES,
                    self._serialise(inputs),
                )
            if outputs:
                span.set_attribute(
                    gen_ai_attributes.GEN_AI_OUTPUT_MESSAGES,
                    self._serialise(outputs),
                )
            if system_instructions:
                span.set_attribute(
                    gen_ai_attributes.GEN_AI_SYSTEM_INSTRUCTIONS,
                    self._serialise(system_instructions),
                )

        for name, attr_value in (attributes or {}).items():
            if attr_value is None:
                continue
            try:
                span.set_attribute(name, attr_value)
            except Exception:  # noqa: BLE001
                logger.debug(
                    "bfclv4: failed to set attribute %s", name, exc_info=True
                )
def truncate_text(value: str, limit: int = 4096) -> str:
    """Return *value* cut to *limit* characters, marking any cut with ``...``.

    Text that already fits is returned unchanged. Truncated text keeps its
    first *limit* characters and gains a literal ``"..."`` suffix, so the
    result may be up to three characters longer than *limit*.

    Args:
        value: The text to shorten.
        limit: Maximum number of characters to keep from *value*. Negative
            values are treated as ``0`` so the slice never counts from the
            end of the string.

    Returns:
        *value* itself, or its truncated prefix followed by ``"..."``.
    """
    # A negative limit would make ``value[:limit]`` drop characters from the
    # end instead of keeping a prefix, and would turn "" into "...".
    limit = max(limit, 0)
    if len(value) <= limit:
        return value
    return value[:limit] + "..."
+# See the License for the specific language governing permissions and +# limitations under the License. + +__version__ = "0.1.3.dev0" diff --git a/instrumentation-loongsuite/loongsuite-instrumentation-bfclv4/tests/__init__.py b/instrumentation-loongsuite/loongsuite-instrumentation-bfclv4/tests/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/instrumentation-loongsuite/loongsuite-instrumentation-bfclv4/tests/test_instrumentor.py b/instrumentation-loongsuite/loongsuite-instrumentation-bfclv4/tests/test_instrumentor.py new file mode 100644 index 000000000..41446ee3b --- /dev/null +++ b/instrumentation-loongsuite/loongsuite-instrumentation-bfclv4/tests/test_instrumentor.py @@ -0,0 +1,52 @@ +# Copyright The OpenTelemetry Authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Smoke tests for ``BFCLv4Instrumentor``. + +These tests do not require ``bfcl-eval`` to be installed; they only verify +that importing the package and calling ``instrument()`` / ``uninstrument()`` +works (and degrades gracefully when ``bfcl-eval`` is missing). 
import importlib

import pytest


def test_import_instrumentor_package():
    """The package imports cleanly and exposes ``BFCLv4Instrumentor``."""
    bfclv4_pkg = importlib.import_module(
        "opentelemetry.instrumentation.bfclv4"
    )
    assert hasattr(bfclv4_pkg, "BFCLv4Instrumentor")


def test_instrumentation_dependencies_listed():
    """The instrumentor reports exactly the packages declared in ``package``."""
    from opentelemetry.instrumentation.bfclv4 import BFCLv4Instrumentor
    from opentelemetry.instrumentation.bfclv4.package import _instruments

    declared = tuple(BFCLv4Instrumentor().instrumentation_dependencies())
    assert declared == _instruments


def test_instrument_uninstrument_no_bfcl_no_raise():
    """When ``bfcl-eval`` is missing, every wrap call logs and continues.

    The instrumentor must not raise from ``instrument()`` /
    ``uninstrument()`` even if the target framework cannot be imported.
    """
    # Skip (rather than fail) when the extended GenAI handler is unavailable.
    pytest.importorskip("opentelemetry.util.genai.extended_handler")
    from opentelemetry.instrumentation.bfclv4 import BFCLv4Instrumentor

    instrumentor = BFCLv4Instrumentor()
    instrumentor.instrument()
    instrumentor.uninstrument()
"""Unit tests for the framework-agnostic helpers."""

import contextvars

import pytest


def test_state_lifecycle():
    """State counters start at zero, bump as expected and vanish on reset."""
    from opentelemetry.instrumentation.bfclv4.internal.state import (
        bump_round,
        bump_turn,
        get_state,
        init_state,
        next_tool_index,
        reset_state,
    )

    token = init_state()
    try:
        state = get_state()
        assert state == {"turn_idx": 0, "fc_round": 0, "tool_index": 0}

        assert bump_round() == 1
        assert bump_round() == 2
        assert bump_turn() == 1
        # bump_turn resets fc_round
        state = get_state()
        assert state["turn_idx"] == 1
        assert state["fc_round"] == 0
        assert next_tool_index() == 0
        assert next_tool_index() == 1
    finally:
        reset_state(token)

    # After reset the state should be gone (None default).
    assert get_state() is None


def test_context_propagating_executor_carries_contextvars():
    """A value set in the submitting thread is visible inside the worker."""
    from opentelemetry.instrumentation.bfclv4.internal.threading_propagation import (
        ContextPropagatingExecutor,
    )

    cv: contextvars.ContextVar[str] = contextvars.ContextVar(
        "bfclv4_test_cv", default="default"
    )
    token = cv.set("from_main_thread")

    def _read():
        return cv.get()

    try:
        with ContextPropagatingExecutor(max_workers=2) as pool:
            future = pool.submit(_read)
            assert future.result() == "from_main_thread"
    finally:
        # Restore the variable so the value cannot leak into other tests
        # that run later in the same context.
        cv.reset(token)


def test_extract_tool_name_and_arguments():
    """Tool name / argument extraction from Python-call-style strings."""
    from opentelemetry.instrumentation.bfclv4.internal.wrappers import (
        _extract_tool_arguments,
        _extract_tool_name,
        _parse_python_call_arguments,
    )

    assert _extract_tool_name("calc.add(1, 2)") == "add"
    assert _extract_tool_name("list_files()") == "list_files"
    assert _extract_tool_name("not a call") == "unknown"
    assert _extract_tool_arguments("foo(a=1, b=2)") == "a=1, b=2"
    assert _extract_tool_arguments("foo()") is None
    assert _parse_python_call_arguments("foo(a=1, b='x')") == {
        "a": 1,
        "b": "x",
    }


def test_infer_finish_reason_heuristic():
    """Finish reason is derived from the shape of the model response."""
    from opentelemetry.instrumentation.bfclv4.internal.wrappers import (
        _infer_finish_reason,
    )

    assert _infer_finish_reason([]) == "empty_response"
    assert _infer_finish_reason([[]]) == "empty_response"
    assert _infer_finish_reason([{"name": "x"}]) == "tool_calls"
    assert _infer_finish_reason("plain string") == "stop"
    assert _infer_finish_reason(None) == "unknown"


def test_test_entry_to_messages_extracts_genai_content():
    """System turns become instructions; other turns become input messages."""
    from opentelemetry.instrumentation.bfclv4.internal.wrappers import (
        _test_entry_to_messages,
    )

    test_entry = {
        "id": "simple_001",
        "system_prompt": "Use the provided tools.",
        "question": [
            [
                {"role": "system", "content": "Answer concisely."},
                {"role": "user", "content": "What is the weather in Paris?"},
            ],
            [{"role": "assistant", "content": "I will check."}],
        ],
    }

    inputs, system_instructions = _test_entry_to_messages(test_entry)

    assert [message.role for message in inputs] == ["user", "assistant"]
    assert (
        inputs[0].parts[0].content == "What is the weather in Paris?"
    )
    assert inputs[1].parts[0].content == "I will check."
    assert [part.content for part in system_instructions] == [
        "Use the provided tools.",
        "Answer concisely.",
    ]


def test_test_entry_to_tool_definitions_extracts_bfcl_functions():
    """Flat, OpenAI-wrapped and ``missed_function`` entries are all collected."""
    from opentelemetry.instrumentation.bfclv4.internal.wrappers import (
        _test_entry_to_tool_definitions,
        _tool_description_map,
    )

    test_entry = {
        "id": "simple_001",
        "function": [
            {
                "name": "get_weather",
                "description": "Get weather information.",
                "parameters": {
                    "type": "object",
                    "properties": {"location": {"type": "string"}},
                    "required": ["location"],
                },
            },
            {
                "type": "function",
                "function": {
                    "name": "book_flight",
                    "description": "Book a flight.",
                    "parameters": {"type": "object"},
                },
            },
        ],
        "missed_function": {
            "1": [
                {
                    "name": "cancel_booking",
                    "description": "Cancel a booking.",
                    "parameters": {"type": "object"},
                }
            ]
        },
    }

    definitions = _test_entry_to_tool_definitions(test_entry)

    assert [definition.name for definition in definitions] == [
        "get_weather",
        "book_flight",
        "cancel_booking",
    ]
    assert definitions[0].type == "function"
    assert definitions[0].parameters["required"] == ["location"]
    assert _tool_description_map(test_entry)["get_weather"] == (
        "Get weather information."
    )


def test_result_to_output_messages_extracts_last_inference_log_output():
    """Only the last step's ``inference_output`` becomes the output message."""
    from opentelemetry.instrumentation.bfclv4.internal.wrappers import (
        _result_to_output_messages,
    )

    outputs = _result_to_output_messages(
        {
            "inference_log": {
                "step_0": {
                    "inference_output": {"content": "intermediate"}
                },
                "step_1": {"inference_output": {"content": "final"}},
            }
        }
    )

    assert len(outputs) == 1
    assert outputs[0].role == "assistant"
    assert outputs[0].parts[0].content == '{"content": "final"}'
    assert outputs[0].finish_reason == "stop"


def test_provider_mapping_without_bfcl(monkeypatch):
    """A model without a ``model_style`` maps to the ``unknown`` provider."""
    # Skip before importing the module under test: if the extended GenAI
    # types are missing, the skip has to win over an ImportError.
    pytest.importorskip(
        "opentelemetry.util.genai.extended_types",
    )
    from opentelemetry.instrumentation.bfclv4.internal.provider import (
        infer_provider,
    )

    class _Dummy:
        model_style = None

    name, extras = infer_provider(_Dummy())
    # If bfcl-eval is not installed, ``ModelStyle`` import fails and we get
    # ``unknown``; otherwise we still get ``unknown`` because ``model_style``
    # is None.
    assert name == "unknown"
    assert extras == {}
"""
OpenTelemetry claw-eval Instrumentation
=======================================

Automatic instrumentation for the `claw-eval
`_ evaluation framework.

Uses **wrapt** monkey-patching to wrap key entry points, the agent loop,
tool dispatchers, compaction, and judge calls that should be suppressed from
producing their own spans — producing a hierarchical trace:

    ENTRY → AGENT → STEP → TOOL / CHAIN

Usage
-----

.. code:: python

    from opentelemetry.instrumentation.claw_eval import ClawEvalInstrumentor

    ClawEvalInstrumentor().instrument()

    # Then run claw-eval as normal (CLI or programmatic)

API
---
"""

from __future__ import annotations

import importlib
import logging
from typing import Any, Collection

from opentelemetry import trace as trace_api
from opentelemetry.instrumentation.instrumentor import BaseInstrumentor
from opentelemetry.instrumentation.claw_eval.config import (
    OTEL_INSTRUMENTATION_CLAW_EVAL_ENABLED,
)
from opentelemetry.instrumentation.claw_eval.package import _instruments
from opentelemetry.instrumentation.claw_eval.version import __version__
from wrapt import ObjectProxy, wrap_function_wrapper

logger = logging.getLogger(__name__)

__all__ = ["ClawEvalInstrumentor"]


def _unwrap_func(module_path: str, func_name: str) -> None:
    """Restore a module-level function wrapped by *wrapt*.

    Only attributes that are wrapt proxies are restored. A bare
    ``__wrapped__`` check would also match functions decorated with
    ``functools.wraps`` and strip decorators this package never installed.
    Failures are logged at debug level and never propagate, because
    uninstrumenting must not break the host application.

    Args:
        module_path: Dotted path of the module that owns the function.
        func_name: Attribute name of the function inside that module.
    """
    try:
        mod = importlib.import_module(module_path)
        fn = getattr(mod, func_name, None)
        if isinstance(fn, ObjectProxy) and hasattr(fn, "__wrapped__"):
            setattr(mod, func_name, fn.__wrapped__)
    except Exception as exc:  # noqa: BLE001 - best-effort cleanup
        logger.debug(
            "Could not unwrap %s.%s: %s", module_path, func_name, exc
        )


def _unwrap_method(
    module_path: str, class_name: str, method_name: str
) -> None:
    """Restore a class method wrapped by *wrapt*.

    Same contract as ``_unwrap_func``: only wrapt proxies are restored and
    failures are logged at debug level instead of being raised.

    Args:
        module_path: Dotted path of the module that owns the class.
        class_name: Name of the class inside that module.
        method_name: Name of the wrapped method on the class.
    """
    try:
        mod = importlib.import_module(module_path)
        cls = getattr(mod, class_name, None)
        if cls is None:
            return
        meth = getattr(cls, method_name, None)
        if isinstance(meth, ObjectProxy) and hasattr(meth, "__wrapped__"):
            setattr(cls, method_name, meth.__wrapped__)
    except Exception as exc:  # noqa: BLE001 - best-effort cleanup
        logger.debug(
            "Could not unwrap %s.%s.%s: %s",
            module_path,
            class_name,
            method_name,
            exc,
        )


class ClawEvalInstrumentor(BaseInstrumentor):
    """Instrumentation that adds OpenTelemetry traces to claw-eval.

    Wraps the following symbols via *wrapt*:

    * **ENTRY** — ``cli.cmd_run``, ``cli.cmd_batch``, ``cli._run_single_task``
    * **AGENT** — ``runner.loop.run_task``
    * **STEP** — ``OpenAICompatProvider.chat`` rotates STEP spans
    * **CHAIN** — ``compact.do_auto_compact``
    * **TOOL** — ``ToolDispatcher.dispatch``, ``SandboxToolDispatcher.dispatch``
    * **Judge (suppress only)** — ``LLMJudge.evaluate``, ``evaluate_actions``,
      ``evaluate_visual``: nested LLM SDK / HTTP spans are suppressed and no
      judge LLM span is emitted, keeping the trace tail clean.
    * **Per-task grader (suppress only)** — ``registry.get_grader`` and
      ``base.load_peer_grader`` are wrapped so any grader class loaded via
      them has its ``_llm_score_classifications`` (and similar evaluation
      helpers) auto-suppressed. This catches the per-task grader code paths
      that talk to ``judge.client.chat.completions.create`` directly,
      bypassing ``LLMJudge.evaluate*``.
    """

    def instrumentation_dependencies(self) -> Collection[str]:
        """Return the package requirements this instrumentor targets."""
        return _instruments

    def _instrument(self, **kwargs: Any) -> None:
        """Install every claw-eval wrapper.

        Each target is wrapped independently: a failure to wrap one symbol
        (for example because claw-eval is not installed or a module moved)
        is logged and does not prevent the remaining wrappers from being
        installed.

        Args:
            **kwargs: Standard instrumentor options; only ``tracer_provider``
                is read.
        """
        if not OTEL_INSTRUMENTATION_CLAW_EVAL_ENABLED:
            logger.info("claw-eval instrumentation disabled via env var")
            return

        tracer_provider = kwargs.get("tracer_provider")
        tracer = trace_api.get_tracer(
            __name__,
            __version__,
            tracer_provider=tracer_provider,
        )

        # Imported here rather than at module import time, so merely
        # importing this package does not load the wrapper module.
        from opentelemetry.instrumentation.claw_eval.internal.wrappers import (
            DoAutoCompactWrapper,
            EntryWrapper,
            GetGraderWrapper,
            JudgeWrapper,
            LoadPeerGraderWrapper,
            ProviderChatWrapper,
            RunSingleTaskWrapper,
            RunTaskWrapper,
            ToolDispatchWrapper,
        )

        # --- CLI entry points (ENTRY) ---
        for func_name, cmd in [("cmd_run", "run"), ("cmd_batch", "batch")]:
            try:
                wrap_function_wrapper(
                    "claw_eval.cli",
                    func_name,
                    EntryWrapper(tracer, cmd),
                )
            except Exception as exc:
                logger.warning(
                    "Could not wrap claw_eval.cli.%s: %s", func_name, exc
                )

        try:
            wrap_function_wrapper(
                "claw_eval.cli",
                "_run_single_task",
                RunSingleTaskWrapper(tracer),
            )
        except Exception as exc:
            logger.warning("Could not wrap _run_single_task: %s", exc)

        # --- Agent loop (AGENT) ---
        try:
            wrap_function_wrapper(
                "claw_eval.runner.loop",
                "run_task",
                RunTaskWrapper(tracer),
            )
        except Exception as exc:
            logger.warning("Could not wrap run_task: %s", exc)

        # --- Provider chat (STEP rotation) ---
        try:
            wrap_function_wrapper(
                "claw_eval.runner.providers.openai_compat",
                "OpenAICompatProvider.chat",
                ProviderChatWrapper(tracer),
            )
        except Exception as exc:
            logger.warning(
                "Could not wrap OpenAICompatProvider.chat: %s", exc
            )

        # --- Context compaction (CHAIN) ---
        try:
            wrap_function_wrapper(
                "claw_eval.runner.compact",
                "do_auto_compact",
                DoAutoCompactWrapper(tracer),
            )
        except Exception as exc:
            logger.warning("Could not wrap do_auto_compact: %s", exc)

        # --- Tool dispatchers (TOOL) ---
        try:
            wrap_function_wrapper(
                "claw_eval.runner.dispatcher",
                "ToolDispatcher.dispatch",
                ToolDispatchWrapper(tracer),
            )
        except Exception as exc:
            logger.warning("Could not wrap ToolDispatcher.dispatch: %s", exc)

        try:
            wrap_function_wrapper(
                "claw_eval.runner.sandbox_dispatcher",
                "SandboxToolDispatcher.dispatch",
                ToolDispatchWrapper(tracer),
            )
        except Exception as exc:
            logger.debug(
                "Could not wrap SandboxToolDispatcher.dispatch: %s", exc
            )

        # --- LLM Judge (suppress nested SDK / HTTP spans, no judge span) ---
        for method in ("evaluate", "evaluate_actions", "evaluate_visual"):
            try:
                wrap_function_wrapper(
                    "claw_eval.graders.llm_judge",
                    f"LLMJudge.{method}",
                    JudgeWrapper(tracer, method),
                )
            except Exception as exc:
                logger.warning(
                    "Could not wrap LLMJudge.%s: %s", method, exc
                )

        # --- Per-task grader evaluation helpers ---
        # Per-task ``tasks/T*/grader.py`` defines helpers like
        # ``_llm_score_classifications`` that bypass ``LLMJudge.evaluate*``
        # and call ``judge.client.chat.completions.create`` directly.
        # Hooking the two grader loaders lets us walk each loaded grader's
        # MRO and install span-suppression on those helpers automatically.
        try:
            wrap_function_wrapper(
                "claw_eval.graders.registry",
                "get_grader",
                GetGraderWrapper(tracer),
            )
        except Exception as exc:
            logger.warning("Could not wrap get_grader: %s", exc)

        try:
            wrap_function_wrapper(
                "claw_eval.graders.base",
                "load_peer_grader",
                LoadPeerGraderWrapper(tracer),
            )
        except Exception as exc:
            logger.warning("Could not wrap load_peer_grader: %s", exc)

    def _uninstrument(self, **kwargs: Any) -> None:
        """Remove every wrapper installed by ``_instrument``."""
        # CLI entry points
        _unwrap_func("claw_eval.cli", "cmd_run")
        _unwrap_func("claw_eval.cli", "cmd_batch")
        _unwrap_func("claw_eval.cli", "_run_single_task")

        # Agent loop
        _unwrap_func("claw_eval.runner.loop", "run_task")

        # Provider chat
        _unwrap_method(
            "claw_eval.runner.providers.openai_compat",
            "OpenAICompatProvider",
            "chat",
        )

        # Context compaction
        _unwrap_func("claw_eval.runner.compact", "do_auto_compact")

        # Tool dispatchers
        _unwrap_method(
            "claw_eval.runner.dispatcher",
            "ToolDispatcher",
            "dispatch",
        )
        _unwrap_method(
            "claw_eval.runner.sandbox_dispatcher",
            "SandboxToolDispatcher",
            "dispatch",
        )

        # LLM Judge
        for method in ("evaluate", "evaluate_actions", "evaluate_visual"):
            _unwrap_method(
                "claw_eval.graders.llm_judge",
                "LLMJudge",
                method,
            )

        # Per-task grader loaders. Note: dynamically wrapped per-task
        # ``_llm_score_classifications`` methods on already-loaded grader
        # classes are intentionally not unwrapped here — those modules are
        # loaded under synthetic names like ``task_grader_`` and there
        # is no stable handle to walk. Unwrapping the loaders is enough to
        # stop *new* graders from getting wrapped after uninstrument.
        _unwrap_func("claw_eval.graders.registry", "get_grader")
        _unwrap_func("claw_eval.graders.base", "load_peer_grader")


# ---------------------------------------------------------------------------
# claw_eval/config.py — configuration via environment variables
# ---------------------------------------------------------------------------

import os


def _bool_env(name: str, default: bool) -> bool:
    """Read a boolean flag from the environment.

    Args:
        name: Environment variable to read.
        default: Value returned when the variable is not set at all.

    Returns:
        ``True`` when the variable is set to ``true``, ``1``, ``yes`` or
        ``on`` (case-insensitive, surrounding whitespace ignored). Any other
        value that is set, including the empty string, yields ``False``.
    """
    val = os.getenv(name)
    if val is None:
        return default
    return val.strip().lower() in {"true", "1", "yes", "on"}


# Flags are evaluated once, when the config module is imported.
OTEL_INSTRUMENTATION_CLAW_EVAL_ENABLED = _bool_env(
    "OTEL_INSTRUMENTATION_CLAW_EVAL_ENABLED", True
)

OTEL_CLAW_EVAL_CAPTURE_CONTENT = _bool_env(
    "OTEL_CLAW_EVAL_CAPTURE_CONTENT", False
)

OTEL_CLAW_EVAL_PROPAGATE_TO_WORKER = _bool_env(
    "OTEL_CLAW_EVAL_PROPAGATE_TO_WORKER", False
)

# ---------------------------------------------------------------------------
# claw_eval/internal/__init__.py — internal helpers for claw-eval
# instrumentation (docstring-only module).
# ---------------------------------------------------------------------------
"""Wrapt wrappers for claw-eval OpenTelemetry instrumentation.

Span hierarchy
--------------
ENTRY (cmd_run / cmd_batch / _run_single_task)
└── AGENT (run_task)
    ├── STEP (rotated per main-loop provider.chat call)
    │   ├── TOOL (dispatcher.dispatch / sandbox_dispatcher.dispatch)
    │   └── CHAIN (do_auto_compact)
    └── (judge.evaluate* + per-task grader._llm_score_classifications:
         nested LLM SDK / HTTP spans suppressed, no span emitted)
"""

from __future__ import annotations

import json
from contextvars import ContextVar
from typing import Any

from opentelemetry import context as otel_context
from opentelemetry.context import _SUPPRESS_INSTRUMENTATION_KEY
from opentelemetry.semconv._incubating.attributes import (
    gen_ai_attributes as GenAI,
)
from opentelemetry.trace import (
    SpanKind,
    Status,
    StatusCode,
    Tracer,
    set_span_in_context,
)

# Optional dependency: when the Aliyun ARMS extension is not installed the
# key is ``None``.
# NOTE(review): callers presumably check for ``None`` before using it as a
# context key — confirm at the use sites.
try:
    from aliyun.sdk.extension.arms.semconv import _SUPPRESS_LLM_SDK_KEY
except ImportError:
    _SUPPRESS_LLM_SDK_KEY = None

# ---------------------------------------------------------------------------
# Constants
# ---------------------------------------------------------------------------

# Span attribute names that are written as literal strings rather than read
# from ``gen_ai_attributes``.
GEN_AI_SPAN_KIND = "gen_ai.span.kind"
GEN_AI_FRAMEWORK = "gen_ai.framework"
GEN_AI_TOOL_CALL_ARGUMENTS = "gen_ai.tool.call.arguments"
GEN_AI_TOOL_CALL_RESULT = "gen_ai.tool.call.result"
# ``GEN_AI_TOOL_DEFINITIONS`` was added to the upstream semconv after the
# version vendored by some Aliyun ARMS releases, so we hardcode the spec
# string instead of reading it from ``gen_ai_attributes``.
GEN_AI_TOOL_DEFINITIONS = "gen_ai.tool.definitions"

# ---------------------------------------------------------------------------
# ContextVars for STEP lifecycle & compact-depth tracking
# ---------------------------------------------------------------------------

# NOTE(review): ``_compact_depth``, ``_in_agent_run`` and ``_in_tool_dispatch``
# are read and written outside this part of the file; judging by their names
# they track nesting of compaction, agent runs and tool dispatch — confirm
# against the wrappers that use them.
_compact_depth: ContextVar[int] = ContextVar(
    "claw_eval_compact_depth", default=0
)
_in_agent_run: ContextVar[bool] = ContextVar(
    "claw_eval_in_agent_run", default=False
)
# Incremented by ``_rotate_step`` each time a new STEP span is started.
_step_counter: ContextVar[int] = ContextVar(
    "claw_eval_step_counter", default=0
)
# The active STEP span and the context token from attaching it; both are set
# by ``_rotate_step`` and cleared by ``_end_current_step``.
_current_step_span: ContextVar[Any] = ContextVar(
    "claw_eval_current_step_span", default=None
)
_current_step_token: ContextVar[Any] = ContextVar(
    "claw_eval_current_step_token", default=None
)
_in_tool_dispatch: ContextVar[bool] = ContextVar(
    "claw_eval_in_tool_dispatch", default=False
)

# Per-call capture state for the active AGENT span. ``RunTaskWrapper`` sets a
# fresh dict on entry; the lightweight ``provider.chat`` shim installed below
# pushes data into it. Using a ContextVar keeps concurrent ``run_task``
# invocations isolated even when they share the same provider instance.
_agent_capture: ContextVar["dict[str, Any] | None"] = ContextVar(
    "claw_eval_agent_capture", default=None
)

# JSON-serialized tool-definition list captured from the ``tools=`` kwarg of
# the first ``provider.chat`` call inside an AGENT run. Read by
# ``ToolDispatchWrapper`` to populate ``gen_ai.tool.definitions`` on every
# TOOL span. Stored as a pre-serialized string so each TOOL span pays only an
# attribute-set cost, not a JSON-encode cost.
_agent_tool_definitions: ContextVar[str] = ContextVar(
    "claw_eval_agent_tool_definitions", default=""
)

# Per-CLI-invocation capture for the ENTRY span. ``EntryWrapper`` /
# ``RunSingleTaskWrapper`` initialize a list on entry; each completing AGENT
# span pushes its own capture dict onto it.
# The first task prompt and the final agent response surface as ENTRY
# ``gen_ai.input.messages`` / ``gen_ai.output.messages`` so the trace root
# carries useful IO.
_entry_capture: ContextVar["list[dict[str, Any]] | None"] = ContextVar(
    "claw_eval_entry_capture", default=None
)

# ---------------------------------------------------------------------------
# Content helpers
# ---------------------------------------------------------------------------


def _safe_json(obj: Any) -> str:
    """Render ``obj`` as JSON text suitable for a span attribute.

    Values ``json`` cannot encode natively are stringified via
    ``default=str``; if encoding still fails, ``str(obj)`` is returned.
    The result is deliberately never truncated, because downstream consumers
    (evaluators, SLS analytics) need the complete payload.
    """
    try:
        encoded = json.dumps(obj, ensure_ascii=False, default=str)
    except Exception:
        encoded = str(obj)
    return encoded


def _extract_tool_result_text(result) -> str:
    """Join the text of every content block of a tool result.

    Reads ``result.content`` and concatenates each block's non-empty
    ``text`` attribute with newlines, for use as ``gen_ai.tool.call.result``.
    Missing or empty content yields ``""``. The output is deliberately not
    truncated so consumers see exactly what was returned to the agent.
    """
    blocks = getattr(result, "content", None)
    if not blocks:
        return ""
    candidate_texts = (getattr(block, "text", None) for block in blocks)
    return "\n".join(text for text in candidate_texts if text)


def _extract_system_prompt(messages) -> str:
    """Return the first text block of the first ``role=system`` message.

    Only the first system message is inspected. If it carries no ``text``
    block, or there is no system message at all, the result is ``""``.
    """
    first_system = next(
        (
            msg
            for msg in (messages or ())
            if getattr(msg, "role", None) == "system"
        ),
        None,
    )
    if first_system is None:
        return ""
    for block in getattr(first_system, "content", []) or []:
        if getattr(block, "type", None) == "text":
            return getattr(block, "text", "") or ""
    return ""


# ---------------------------------------------------------------------------
# Spec-compliant message serialization
# ---------------------------------------------------------------------------
#
# These helpers convert claw-eval's internal ``Message``/``ContentBlock``
# objects into the ARMS GenAI semantic-convention JSON shape documented in
# ``arms_docs/trace/gen-ai.md`` and the message JSON schemas:
#
#   * ``gen_ai.input.messages``  — array of ``ChatMessage`` ({role, parts})
#   * ``gen_ai.output.messages`` — array of ``OutputMessage``
#     ({role, parts, finish_reason})
#   * ``gen_ai.system_instructions`` — array of parts (TextPart, ...) — note
#     that this is *not* wrapped in a message.
+# +# Each ``part`` follows the schema: +# - TextPart: {"type": "text", "content": ...} +# - ToolCallRequestPart: {"type": "tool_call", "id", "name", "arguments"} +# - ToolCallResponsePart: {"type": "tool_call_response", "id", "response"} + + +def _block_to_part(block) -> dict[str, Any]: + """Convert a claw-eval ContentBlock to a spec-compliant message part.""" + btype = getattr(block, "type", "") + if btype == "text": + return { + "type": "text", + "content": getattr(block, "text", "") or "", + } + if btype == "tool_use": + return { + "type": "tool_call", + "id": getattr(block, "id", "") or "", + "name": getattr(block, "name", "") or "", + "arguments": getattr(block, "input", None), + } + if btype == "tool_result": + inner_texts: list[str] = [] + for ib in getattr(block, "content", []) or []: + t = getattr(ib, "text", None) + if t: + inner_texts.append(t) + return { + "type": "tool_call_response", + "id": getattr(block, "tool_use_id", "") or "", + "response": "\n".join(inner_texts), + } + if btype in {"image", "audio", "video"}: + return {"type": btype} + return {"type": btype or "unknown"} + + +def _message_to_chat_message(msg) -> dict[str, Any]: + """Convert a claw-eval ``Message`` to a spec ``ChatMessage`` dict.""" + role = getattr(msg, "role", "unknown") + parts = [ + _block_to_part(b) for b in (getattr(msg, "content", None) or []) + ] + return {"role": role, "parts": parts} + + +def _infer_finish_reason(message) -> str: + """Infer ``finish_reason`` for an output message. + + The claw-eval ``Message`` returned from ``provider.chat`` does not carry + the upstream ``finish_reason``; the loop relies on the presence/absence of + ``tool_use`` blocks to decide whether to keep iterating. We mirror that + convention here so downstream consumers get a well-formed + ``OutputMessage``. 
+ """ + for b in getattr(message, "content", None) or []: + if getattr(b, "type", "") == "tool_use": + return "tool_call" + return "stop" + + +def _serialize_input_messages(messages) -> str: + """Serialize a list of input ``Message`` objects to JSON per the spec.""" + arr = [_message_to_chat_message(m) for m in (messages or [])] + try: + return json.dumps(arr, ensure_ascii=False, default=str) + except Exception: + return str(arr) + + +def _serialize_output_message(message) -> str: + """Serialize a single response ``Message`` to a JSON ``OutputMessages`` array.""" + if message is None: + return "" + role = getattr(message, "role", "assistant") or "assistant" + parts = [ + _block_to_part(b) for b in (getattr(message, "content", None) or []) + ] + out = { + "role": role, + "parts": parts, + "finish_reason": _infer_finish_reason(message), + } + try: + return json.dumps([out], ensure_ascii=False, default=str) + except Exception: + return str([out]) + + +def _serialize_system_instructions(text: str) -> str: + """Wrap a system prompt string into a JSON ``SystemInstructions`` array.""" + if not text: + return "" + arr = [{"type": "text", "content": text}] + try: + return json.dumps(arr, ensure_ascii=False, default=str) + except Exception: + return str(arr) + + +def _build_user_text_messages(text: str) -> str: + """Build a one-message ``InputMessages`` JSON for a plain user prompt.""" + if not text: + return "" + arr = [ + { + "role": "user", + "parts": [{"type": "text", "content": text}], + } + ] + try: + return json.dumps(arr, ensure_ascii=False, default=str) + except Exception: + return str(arr) + + +def _serialize_tool_definitions(tools) -> str: + """Serialize a ``ToolSpec`` iterable as the ``gen_ai.tool.definitions`` JSON. + + Per the GenAI semantic convention each entry is a ``ToolDefinition`` object + of the form ``{"type": "function", "name": ..., "description": ..., + "parameters": ...}``. 
Anything not coercible to that shape is skipped so + a malformed entry never aborts serialization for the rest of the list. + """ + if not tools: + return "" + arr: list[dict[str, Any]] = [] + for t in tools: + name = getattr(t, "name", None) + if not name: + continue + entry: dict[str, Any] = {"type": "function", "name": str(name)} + desc = getattr(t, "description", None) + if desc: + entry["description"] = str(desc) + # claw-eval names it ``input_schema``; OpenAI / OTel spec uses + # ``parameters``. Translate so consumers don't have to special-case. + schema = getattr(t, "input_schema", None) + if schema is None: + schema = getattr(t, "parameters", None) + if schema is not None: + entry["parameters"] = schema + arr.append(entry) + if not arr: + return "" + try: + return json.dumps(arr, ensure_ascii=False, default=str) + except Exception: + return str(arr) + + +# --------------------------------------------------------------------------- +# STEP lifecycle helpers +# --------------------------------------------------------------------------- + + +def _end_current_step() -> None: + """End the active STEP span and detach its context token.""" + span = _current_step_span.get(None) + token = _current_step_token.get(None) + if span is not None: + span.end() + _current_step_span.set(None) + if token is not None: + otel_context.detach(token) + _current_step_token.set(None) + + +def _rotate_step(tracer: Tracer) -> None: + """End the previous STEP and start a new one under the current context.""" + _end_current_step() + + step_num = _step_counter.get(0) + 1 + _step_counter.set(step_num) + + step_span = tracer.start_span("react step", kind=SpanKind.INTERNAL) + step_span.set_attribute(GEN_AI_SPAN_KIND, "STEP") + step_span.set_attribute( + GenAI.GEN_AI_OPERATION_NAME, + GenAI.GenAiOperationNameValues.INVOKE_AGENT.value, + ) + step_span.set_attribute(GEN_AI_FRAMEWORK, "claw-eval") + step_span.set_attribute(GenAI.GEN_AI_AGENT_NAME, "claw-eval") + 
step_span.set_attribute("gen_ai.react.round", step_num) + + _current_step_span.set(step_span) + ctx = set_span_in_context(step_span) + token = otel_context.attach(ctx) + _current_step_token.set(token) + + +# --------------------------------------------------------------------------- +# ENTRY wrappers (cli.cmd_run / cli.cmd_batch) +# --------------------------------------------------------------------------- + + +def _populate_entry_span(span, captures: list[dict] | None) -> None: + """Apply the first task prompt and the last agent response to ENTRY span. + + ENTRY is the trace root for a CLI invocation; representing it with the + first user prompt and the final agent response gives the span a useful + summary view without trying to merge potentially conflicting data from + multiple trials/tasks. + """ + if not captures: + return + + # Input: prefer the first agent run's captured input messages (already in + # spec format); otherwise fall back to its task prompt. + input_msgs = "" + for cap in captures: + input_msgs = cap.get("input_messages_str", "") or "" + if input_msgs: + break + if not input_msgs: + for cap in captures: + prompt = cap.get("task_prompt", "") or "" + if prompt: + input_msgs = _build_user_text_messages(prompt) + break + if input_msgs: + span.set_attribute(GenAI.GEN_AI_INPUT_MESSAGES, input_msgs) + + # Output: last agent's last response wins (most likely the final answer + # the user would care about). 
class EntryWrapper:
    """Create an ENTRY span around a CLI entry-point function.

    ``command`` is used in the span name (``claw-eval <command>``) and in the
    ``claw_eval.command`` attribute. While the wrapped function runs, a fresh
    list is published through ``_entry_capture`` so nested agent runs can
    append their capture dicts; ``_populate_entry_span`` summarises that list
    onto the span when the call finishes, whether it returned or raised.
    """

    __slots__ = ("_tracer", "_command")

    def __init__(self, tracer: Tracer, command: str):
        self._tracer = tracer
        self._command = command

    def __call__(self, wrapped, instance, args, kwargs):
        """Run ``wrapped`` inside the ENTRY span (``wrapt`` wrapper signature).

        Returns whatever ``wrapped`` returns; an exception raised by
        ``wrapped`` propagates unchanged.
        """
        captures: list[dict] = []
        cap_tok = _entry_capture.set(captures)
        # Outer try/finally: the ContextVar is restored even when the span
        # cannot be started or configured.
        try:
            # No manual record_exception()/set_status() here. With its default
            # arguments ``start_as_current_span`` already records an exception
            # escaping the ``with`` block and marks the span ERROR, so doing
            # it by hand as well emitted a duplicate exception event.
            with self._tracer.start_as_current_span(
                f"claw-eval {self._command}", kind=SpanKind.INTERNAL
            ) as span:
                span.set_attribute(GEN_AI_SPAN_KIND, "ENTRY")
                span.set_attribute(GEN_AI_FRAMEWORK, "claw-eval")
                span.set_attribute("claw_eval.command", self._command)
                try:
                    return wrapped(*args, **kwargs)
                finally:
                    try:
                        _populate_entry_span(span, captures)
                    except Exception:
                        # Best-effort telemetry: a summarisation failure must
                        # not replace the wrapped call's own result/exception.
                        pass
        finally:
            _entry_capture.reset(cap_tok)
class RunTaskWrapper:
    """Creates an AGENT span and aggregates per-task GenAI attributes.

    The wrapper installs a lightweight, idempotent shim on ``provider.chat``
    that records the first-call input messages, system prompt, latest response
    and accumulated token usage into a per-call ``_agent_capture`` dict. On
    exit the data is written onto the AGENT span using the OTel GenAI
    semantic conventions (``gen_ai.input.messages``,
    ``gen_ai.output.messages``, ``gen_ai.system_instructions``,
    ``gen_ai.usage.{input,output}_tokens``, ``gen_ai.request.model``).

    ``ProviderChatWrapper`` is intentionally left untouched: the shim wraps
    the *bound* method that already goes through ``ProviderChatWrapper``, so
    STEP rotation continues to work exactly as before.
    """

    __slots__ = ("_tracer",)

    def __init__(self, tracer: Tracer):
        self._tracer = tracer

    def __call__(self, wrapped, instance, args, kwargs):
        """Run ``wrapped`` inside the AGENT span (``wrapt`` wrapper signature).

        ``task`` is taken from the first positional argument or the ``task``
        keyword, ``provider`` from the second positional argument or the
        ``provider`` keyword. Returns whatever ``wrapped`` returns; exceptions
        from ``wrapped`` propagate unchanged.
        """
        task = args[0] if args else kwargs.get("task")
        provider = args[1] if len(args) > 1 else kwargs.get("provider")
        task_id = getattr(task, "task_id", "unknown") if task else "unknown"

        # ``start_as_current_span`` records an escaping exception and sets
        # ERROR status by default, so no manual record_exception() is done
        # below (it used to produce a duplicate exception event).
        with self._tracer.start_as_current_span(
            "invoke_agent claw-eval", kind=SpanKind.INTERNAL
        ) as span:
            span.set_attribute(GEN_AI_SPAN_KIND, "AGENT")
            span.set_attribute(
                GenAI.GEN_AI_OPERATION_NAME,
                GenAI.GenAiOperationNameValues.INVOKE_AGENT.value,
            )
            span.set_attribute(GEN_AI_FRAMEWORK, "claw-eval")
            span.set_attribute(GenAI.GEN_AI_AGENT_NAME, "claw-eval")
            span.set_attribute("claw_eval.task_id", str(task_id))

            model_id = ""
            if provider is not None:
                model_id = str(getattr(provider, "model_id", "") or "")
            if model_id:
                span.set_attribute(GenAI.GEN_AI_REQUEST_MODEL, model_id)

            prompt = _get_task_prompt(task)
            if prompt:
                span.set_attribute(
                    GenAI.GEN_AI_AGENT_DESCRIPTION,
                    prompt,
                )

            # Filled in by the provider.chat shim while the task runs.
            capture: dict[str, Any] = {
                "input_tokens": 0,
                "output_tokens": 0,
                "system_instructions": "",
                "input_messages_str": "",
                "last_response_str": "",
                "task_prompt": prompt,
                "first_call_done": False,
            }

            _install_provider_chat_capture_shim(provider)

            # Per-run state; every token is reset in the ``finally`` below so
            # nested or sequential runs start from a clean slate.
            tok_agent = _in_agent_run.set(True)
            tok_cnt = _step_counter.set(0)
            tok_ss = _current_step_span.set(None)
            tok_st = _current_step_token.set(None)
            tok_cap = _agent_capture.set(capture)
            tok_tools = _agent_tool_definitions.set("")

            try:
                result = wrapped(*args, **kwargs)
                # Only reached on success, matching the previous ``else``.
                total = _step_counter.get(0)
                if total > 0:
                    span.set_attribute("claw_eval.total_turns", total)
                return result
            finally:
                try:
                    try:
                        _populate_agent_span(span, capture, prompt)
                    except Exception:
                        # Best-effort telemetry: never let attribute
                        # serialisation replace the task's own outcome.
                        pass
                    # Explicit default, consistent with the other ContextVar
                    # reads: no ENTRY wrapper active means nothing to append to.
                    entry_caps = _entry_capture.get(None)
                    if entry_caps is not None:
                        entry_caps.append(capture)
                finally:
                    # Always close the last STEP and restore per-run state,
                    # even if the bookkeeping above failed.
                    _end_current_step()
                    _in_agent_run.reset(tok_agent)
                    _step_counter.reset(tok_cnt)
                    _current_step_span.reset(tok_ss)
                    _current_step_token.reset(tok_st)
                    _agent_capture.reset(tok_cap)
                    _agent_tool_definitions.reset(tok_tools)


def _install_provider_chat_capture_shim(provider) -> None:
    """Idempotently install a pass-through shim on ``provider.chat``.

    The shim reads the active capture dict from ``_agent_capture`` and
    records token usage / input messages / latest response into it. When no
    capture is active (e.g. provider used outside an AGENT span) the shim is
    a transparent no-op. Recording is skipped while ``_compact_depth > 0``
    so the AGENT totals match the framework's own ``total_usage`` accounting
    (which excludes auto-compact LLM calls).

    Nothing is installed (and nothing is raised) when ``provider`` is
    ``None``, when its class has no callable ``chat``, or when the instance
    does not accept attribute assignment.
    """
    if provider is None:
        return

    # Providers defined with ``__slots__`` have no instance ``__dict__``;
    # reading it unguarded raised AttributeError into the wrapped run_task.
    instance_attrs = getattr(provider, "__dict__", None)
    existing = instance_attrs.get("chat") if instance_attrs else None
    if existing is not None and getattr(existing, "_claw_eval_capture_shim", False):
        return

    cls = type(provider)
    cls_chat = getattr(cls, "chat", None)
    if cls_chat is None:
        return
    try:
        # Bind the class-level attribute so the shim delegates to whatever is
        # installed on the class rather than to an instance override.
        bound_chat = cls_chat.__get__(provider, cls)
    except Exception:
        return
    if not callable(bound_chat):
        return

    def chat(messages, *call_args, **call_kwargs):
        # Capture the tools list *before* delegating so TOOL spans created
        # inside ``bound_chat`` (none today, but cheap insurance) still see
        # the populated ContextVar. The capture is idempotent — we only
        # serialize once per AGENT run.
        if _compact_depth.get(0) == 0 and not _agent_tool_definitions.get(""):
            tools_arg = call_kwargs.get("tools")
            if tools_arg is None and call_args:
                tools_arg = call_args[0]
            if tools_arg:
                try:
                    serialized = _serialize_tool_definitions(tools_arg)
                except Exception:
                    serialized = ""
                if serialized:
                    _agent_tool_definitions.set(serialized)

        response, usage = bound_chat(messages, *call_args, **call_kwargs)
        capture = _agent_capture.get()
        if capture is None or _compact_depth.get(0) > 0:
            return response, usage

        try:
            capture["input_tokens"] += int(
                getattr(usage, "input_tokens", 0) or 0
            )
            capture["output_tokens"] += int(
                getattr(usage, "output_tokens", 0) or 0
            )
        except Exception:
            pass

        # Input messages and system prompt are taken from the first recorded
        # call only; later calls just refresh the latest response.
        if not capture.get("first_call_done", False):
            capture["first_call_done"] = True
            try:
                capture["system_instructions"] = _extract_system_prompt(messages)
                non_system = [
                    m for m in messages
                    if getattr(m, "role", None) != "system"
                ]
                if non_system:
                    capture["input_messages_str"] = (
                        _serialize_input_messages(non_system)
                    )
            except Exception:
                pass

        try:
            capture["last_response_str"] = _serialize_output_message(response)
        except Exception:
            pass

        return response, usage

    # Marker checked at the top of this function to keep installation
    # idempotent.
    chat._claw_eval_capture_shim = True
    try:
        provider.chat = chat
    except Exception:
        # e.g. slotted or read-only instances: run without the capture shim.
        pass


def _populate_agent_span(span, capture: dict, task_prompt: str) -> None:
    """Apply aggregated LLM/token/message data to the AGENT span on exit.

    The GenAI semantic-convention attributes (``gen_ai.input.messages``,
    ``gen_ai.output.messages``, ``gen_ai.system_instructions``,
    ``gen_ai.usage.{input,output}_tokens``) are always written when the data
    has been captured. The AGENT span is the canonical record of a task's IO
    and must surface it now that per-LLM-call spans are suppressed.

    When no input messages were captured, ``task_prompt`` (if non-empty) is
    used to build the input-messages attribute instead.
    """
    inp = int(capture.get("input_tokens", 0) or 0)
    out = int(capture.get("output_tokens", 0) or 0)
    # Zero counts are treated as "nothing captured" and omitted.
    if inp:
        span.set_attribute(GenAI.GEN_AI_USAGE_INPUT_TOKENS, inp)
    if out:
        span.set_attribute(GenAI.GEN_AI_USAGE_OUTPUT_TOKENS, out)

    sys_prompt = capture.get("system_instructions", "") or ""
    if sys_prompt:
        span.set_attribute(
            GenAI.GEN_AI_SYSTEM_INSTRUCTIONS,
            _serialize_system_instructions(sys_prompt),
        )

    input_msgs = capture.get("input_messages_str", "") or ""
    if input_msgs:
        span.set_attribute(GenAI.GEN_AI_INPUT_MESSAGES, input_msgs)
    elif task_prompt:
        span.set_attribute(
            GenAI.GEN_AI_INPUT_MESSAGES,
            _build_user_text_messages(task_prompt),
        )

    last_response_str = capture.get("last_response_str", "") or ""
    if last_response_str:
        span.set_attribute(GenAI.GEN_AI_OUTPUT_MESSAGES, last_response_str)


def _get_task_prompt(task) -> str:
    """Safely extract the prompt text from a TaskDefinition.

    Returns ``task.prompt.text``, or ``""`` when ``task`` is ``None``, has no
    ``prompt``, or the prompt has no (or an empty) ``text``.
    """
    if task is None:
        return ""
    prompt = getattr(task, "prompt", None)
    if prompt is None:
        return ""
    return getattr(prompt, "text", "") or ""
class DoAutoCompactWrapper:
    """Creates a CHAIN span and bumps ``_compact_depth``.

    ``_compact_depth`` is incremented for the duration of the wrapped call and
    restored afterwards. The ``claw_eval.compact.layer`` attribute is
    ``"manual"`` when a ``focus`` keyword argument is supplied and ``"auto"``
    otherwise.
    """

    __slots__ = ("_tracer",)

    def __init__(self, tracer: Tracer):
        self._tracer = tracer

    def __call__(self, wrapped, instance, args, kwargs):
        """Run ``wrapped`` inside the CHAIN span (``wrapt`` wrapper signature)."""
        # NOTE(review): only a keyword ``focus`` is inspected; a positionally
        # passed focus would be reported as "auto" — confirm against callers.
        focus = kwargs.get("focus")
        layer = "manual" if focus is not None else "auto"

        # ``start_as_current_span`` records an escaping exception and sets
        # ERROR status by default; recording it manually as well produced a
        # duplicate exception event, so no ``except`` clause is needed here.
        with self._tracer.start_as_current_span(
            "compact", kind=SpanKind.INTERNAL
        ) as span:
            span.set_attribute(GEN_AI_SPAN_KIND, "CHAIN")
            span.set_attribute(GEN_AI_FRAMEWORK, "claw-eval")
            span.set_attribute("claw_eval.compact.layer", layer)

            depth_tok = _compact_depth.set(_compact_depth.get(0) + 1)
            try:
                return wrapped(*args, **kwargs)
            finally:
                _compact_depth.reset(depth_tok)


# ---------------------------------------------------------------------------
# TOOL wrapper (ToolDispatcher.dispatch / SandboxToolDispatcher.dispatch)
# ---------------------------------------------------------------------------


class ToolDispatchWrapper:
    """Creates a TOOL span for ``dispatch`` calls.

    Uses ``_in_tool_dispatch`` guard to prevent duplicate spans when
    ``SandboxToolDispatcher.dispatch`` delegates to ``ToolDispatcher.dispatch``.
    """

    __slots__ = ("_tracer",)

    def __init__(self, tracer: Tracer):
        self._tracer = tracer

    def __call__(self, wrapped, instance, args, kwargs):
        """Run ``wrapped`` inside a TOOL span (``wrapt`` wrapper signature).

        When a dispatch is already being traced in this context the call is
        passed straight through. Returns whatever ``wrapped`` returns;
        exceptions from ``wrapped`` propagate unchanged.
        """
        if _in_tool_dispatch.get(False):
            return wrapped(*args, **kwargs)

        tool_use = args[0] if args else kwargs.get("tool_use")
        tool_name = getattr(tool_use, "name", "unknown") if tool_use else "unknown"
        tool_use_id = getattr(tool_use, "id", "") if tool_use else ""
        tool_input = getattr(tool_use, "input", None) if tool_use else None
        # A dispatcher exposing ``_http`` is treated as the sandbox variant.
        is_sandbox = hasattr(instance, "_http")

        guard = _in_tool_dispatch.set(True)
        # The guard is reset from an outer ``finally`` so that a failure while
        # starting or decorating the span cannot leave it stuck at True (which
        # would silently disable TOOL spans for the rest of the context).
        try:
            # Exception recording / ERROR status on failure is left to
            # ``start_as_current_span``'s defaults to avoid duplicate events.
            with self._tracer.start_as_current_span(
                f"execute_tool {tool_name}", kind=SpanKind.INTERNAL
            ) as span:
                span.set_attribute(GEN_AI_SPAN_KIND, "TOOL")
                span.set_attribute(
                    GenAI.GEN_AI_OPERATION_NAME,
                    GenAI.GenAiOperationNameValues.EXECUTE_TOOL.value,
                )
                span.set_attribute(GEN_AI_FRAMEWORK, "claw-eval")
                span.set_attribute(GenAI.GEN_AI_TOOL_NAME, tool_name)
                span.set_attribute(GenAI.GEN_AI_TOOL_TYPE, "function")
                if tool_use_id:
                    span.set_attribute(GenAI.GEN_AI_TOOL_CALL_ID, tool_use_id)
                tool_defs = _agent_tool_definitions.get("")
                if tool_defs:
                    span.set_attribute(GEN_AI_TOOL_DEFINITIONS, tool_defs)
                if is_sandbox:
                    sandbox_url = getattr(instance, "_sandbox_url", None)
                    span.set_attribute(
                        "claw_eval.sandbox.remote", sandbox_url is not None
                    )
                if tool_input is not None:
                    span.set_attribute(
                        GEN_AI_TOOL_CALL_ARGUMENTS,
                        _safe_json(tool_input),
                    )

                result = wrapped(*args, **kwargs)
                try:
                    _extract_dispatch_attrs(span, result)
                except Exception:
                    # Best-effort telemetry: an unexpected result shape must
                    # not turn a successful dispatch into a failure.
                    pass
                return result
        finally:
            _in_tool_dispatch.reset(guard)
class JudgeWrapper:
    """Run judge evaluation calls with nested LLM SDK spans suppressed.

    Judging happens after the agent has finished and is an evaluation/grading
    concern rather than part of the agent's own reasoning trace. A dedicated
    LLM span here would only clutter the trace tail, so no span is created;
    the wrapper merely attaches the suppression context so the underlying LLM
    SDK (OpenAI / etc.) does not emit a chat span either.
    """

    __slots__ = ("_tracer", "_method_name")

    def __init__(self, tracer: Tracer, method_name: str = "evaluate"):
        self._tracer = tracer
        self._method_name = method_name

    def __call__(self, wrapped, instance, args, kwargs):
        token = _maybe_suppress_llm_sdk()
        try:
            outcome = wrapped(*args, **kwargs)
        finally:
            # Detach even when the judge call raised.
            if token is not None:
                otel_context.detach(token)
        return outcome


# ---------------------------------------------------------------------------
# Per-task grader wrappers
# ---------------------------------------------------------------------------
#
# Per-task graders (``tasks/T*/grader.py``) frequently bypass
# ``LLMJudge.evaluate*`` and call ``judge.client.chat.completions.create``
# directly inside helpers like ``_llm_score_classifications``. Those calls
# would otherwise emit a stray "evaluation" LLM span at the very tail of
# the trace.
#
# Rather than statically enumerating every task module, we hook the two
# loader entry points (``registry.get_grader`` and
# ``base.load_peer_grader``) and then walk the returned class' MRO to wrap
# any matching evaluation-helper methods with ``JudgeWrapper``. This keeps
# coverage automatic for any new task that follows the same naming
# convention.


import wrapt as _wrapt  # local import to avoid widening top-level deps

_GRADER_EVAL_METHOD_NAMES: tuple[str, ...] = (
    "_llm_score_classifications",
)

_GRADER_WRAP_MARKER = "_claw_eval_judge_wrapped"


def _wrap_grader_eval_methods(
    cls,
    tracer: Tracer,
) -> None:
    """Wrap evaluation helpers defined on ``cls`` or its bases with JudgeWrapper.

    Every class in the MRO except ``object`` is inspected for attributes named
    in ``_GRADER_EVAL_METHOD_NAMES`` that it defines itself. An attribute
    already carrying the ``_GRADER_WRAP_MARKER`` flag is left alone, which
    keeps repeated loads of the same class (e.g. peer-grader inheritance
    chains) from wrapping twice.
    """
    if cls is None or cls is object:
        return

    lineage = getattr(cls, "__mro__", (cls,))
    for klass in lineage:
        if klass is object:
            continue
        for method_name in _GRADER_EVAL_METHOD_NAMES:
            target = klass.__dict__.get(method_name)
            if target is None or getattr(target, _GRADER_WRAP_MARKER, False):
                continue
            try:
                judge = JudgeWrapper(tracer, method_name)
                proxy = _wrapt.FunctionWrapper(target, judge)
                setattr(proxy, _GRADER_WRAP_MARKER, True)
                setattr(klass, method_name, proxy)
            except Exception:
                # Losing suppression for one method is acceptable; breaking
                # grader loading is not.
                pass


class GetGraderWrapper:
    """Wraps ``claw_eval.graders.registry.get_grader``.

    Once the upstream loader has returned a grader instance, the instance's
    class MRO is walked and evaluation helpers are wrapped so the inner
    ``judge.client.chat.completions.create`` calls don't emit a trailing
    LLM span.
    """

    __slots__ = ("_tracer",)

    def __init__(self, tracer: Tracer):
        self._tracer = tracer

    def __call__(self, wrapped, instance, args, kwargs):
        loaded = wrapped(*args, **kwargs)
        try:
            _wrap_grader_eval_methods(type(loaded), self._tracer)
        except Exception:
            pass
        return loaded


class LoadPeerGraderWrapper:
    """Wraps ``claw_eval.graders.base.load_peer_grader``.

    Peer graders are loaded lazily at module-import time of a sibling
    task's ``grader.py`` (``_Base = load_peer_grader("T001zh_...")``).
    Wrapping the returned class here keeps the parent-side evaluation
    helpers suppressed even when subclasses don't override them.
    """

    __slots__ = ("_tracer",)

    def __init__(self, tracer: Tracer):
        self._tracer = tracer

    def __call__(self, wrapped, instance, args, kwargs):
        peer_cls = wrapped(*args, **kwargs)
        try:
            _wrap_grader_eval_methods(peer_cls, self._tracer)
        except Exception:
            pass
        return peer_cls
+ """ + ctx = otel_context.get_current() + if _SUPPRESS_LLM_SDK_KEY is not None: + ctx = otel_context.set_value(_SUPPRESS_LLM_SDK_KEY, True, ctx) + ctx = otel_context.set_value(_SUPPRESS_INSTRUMENTATION_KEY, True, ctx) + return otel_context.attach(ctx) diff --git a/instrumentation-loongsuite/loongsuite-instrumentation-claw-eval/src/opentelemetry/instrumentation/claw_eval/package.py b/instrumentation-loongsuite/loongsuite-instrumentation-claw-eval/src/opentelemetry/instrumentation/claw_eval/package.py new file mode 100644 index 000000000..32c50b3db --- /dev/null +++ b/instrumentation-loongsuite/loongsuite-instrumentation-claw-eval/src/opentelemetry/instrumentation/claw_eval/package.py @@ -0,0 +1,3 @@ +_instruments = ("claw-eval >= 0.1.0",) + +_supports_metrics = False diff --git a/instrumentation-loongsuite/loongsuite-instrumentation-claw-eval/src/opentelemetry/instrumentation/claw_eval/version.py b/instrumentation-loongsuite/loongsuite-instrumentation-claw-eval/src/opentelemetry/instrumentation/claw_eval/version.py new file mode 100644 index 000000000..3dc1f76bc --- /dev/null +++ b/instrumentation-loongsuite/loongsuite-instrumentation-claw-eval/src/opentelemetry/instrumentation/claw_eval/version.py @@ -0,0 +1 @@ +__version__ = "0.1.0" diff --git a/instrumentation-loongsuite/loongsuite-instrumentation-minisweagent/pyproject.toml b/instrumentation-loongsuite/loongsuite-instrumentation-minisweagent/pyproject.toml new file mode 100644 index 000000000..6d37e87fe --- /dev/null +++ b/instrumentation-loongsuite/loongsuite-instrumentation-minisweagent/pyproject.toml @@ -0,0 +1,54 @@ +[build-system] +requires = ["hatchling"] +build-backend = "hatchling.build" + +[project] +name = "loongsuite-instrumentation-minisweagent" +dynamic = ["version"] +description = "LoongSuite mini-swe-agent instrumentation" +license = "Apache-2.0" +requires-python = ">=3.10,<4" +authors = [ + { name = "OpenTelemetry Authors", email = "cncf-opentelemetry-contributors@lists.cncf.io" }, +] 
+classifiers = [ + "Development Status :: 4 - Beta", + "Intended Audience :: Developers", + "License :: OSI Approved :: Apache Software License", + "Programming Language :: Python", + "Programming Language :: Python :: 3", + "Programming Language :: Python :: 3.10", + "Programming Language :: Python :: 3.11", + "Programming Language :: Python :: 3.12", + "Programming Language :: Python :: 3.13", +] +dependencies = [ + "opentelemetry-api >= 1.37.0", + "opentelemetry-instrumentation >= 0.58b0", + "opentelemetry-semantic-conventions >= 0.58b0", + "wrapt >= 1.0.0, < 2.0.0", +] + +[project.optional-dependencies] +instruments = [ + "mini-swe-agent >= 2.2.0", +] + +[project.entry-points.opentelemetry_instrumentor] +minisweagent = "opentelemetry.instrumentation.minisweagent:MiniSweAgentInstrumentor" + +[project.urls] +Homepage = "https://github.com/alibaba/loongsuite-python-agent/tree/main/instrumentation-loongsuite/loongsuite-instrumentation-minisweagent" +Repository = "https://github.com/alibaba/loongsuite-python-agent" + +[tool.hatch.version] +path = "src/opentelemetry/instrumentation/minisweagent/version.py" + +[tool.hatch.build.targets.sdist] +include = [ + "/src", + "/tests", +] + +[tool.hatch.build.targets.wheel] +packages = ["src/opentelemetry"] diff --git a/instrumentation-loongsuite/loongsuite-instrumentation-minisweagent/src/opentelemetry/instrumentation/minisweagent/__init__.py b/instrumentation-loongsuite/loongsuite-instrumentation-minisweagent/src/opentelemetry/instrumentation/minisweagent/__init__.py new file mode 100644 index 000000000..04274fa5d --- /dev/null +++ b/instrumentation-loongsuite/loongsuite-instrumentation-minisweagent/src/opentelemetry/instrumentation/minisweagent/__init__.py @@ -0,0 +1,161 @@ +""" +LoongSuite mini-swe-agent Instrumentation +========================================= + +Automatic instrumentation for the `mini-swe-agent +`_ framework. 
class MiniSweAgentInstrumentor(BaseInstrumentor):
    """An instrumentor for the mini-swe-agent framework.

    Covers GenAI span kinds (ARMS / LoongSuite conventions when
    ``OTEL_SEMCONV_STABILITY_OPT_IN=gen_ai_latest_experimental`` and
    ``OTEL_INSTRUMENTATION_GENAI_CAPTURE_MESSAGE_CONTENT=SPAN_ONLY``):

    * **ENTRY** – Typer ``mini`` callable ``app`` (``minisweagent.run.mini:app``), span name ``enter_ai_application_system``
    * **AGENT** – ``DefaultAgent.run`` via ``invoke_agent`` (+ messages / system instruction / tool definitions)
    * **STEP** – ``DefaultAgent.step`` (ReAct round)
    * **TOOL** – ``TracingEnvironment.execute`` (``execute_tool`` for bash)

    LLM-call spans remain with the underlying LiteLLM/OpenAI instrumentation.
    """

    # Original ``minisweagent.environments.get_environment``, remembered while
    # the factory is patched; ``None`` when not instrumented.
    _original_get_environment = None

    def instrumentation_dependencies(self) -> Collection[str]:
        """Return the package requirements this instrumentor applies to."""
        return _instruments

    def _instrument(self, **kwargs: Any) -> None:
        """Install the factory, CLI and ``DefaultAgent`` patches.

        Each patch is attempted independently; a failure is logged as a
        warning and does not prevent the remaining patches.
        """
        tracer_provider = kwargs.get("tracer_provider")
        tracer = trace_api.get_tracer(
            __name__,
            __version__,
            tracer_provider=tracer_provider,
        )

        from opentelemetry.instrumentation.minisweagent.internal.agent_wrappers import (
            DefaultAgentRunWrapper,
            DefaultAgentStepWrapper,
        )
        from opentelemetry.instrumentation.minisweagent.internal.cli_wrappers import (
            patch_mini_cli_app_module,
        )
        from opentelemetry.instrumentation.minisweagent.internal.delegates import (
            TracingEnvironment,
        )

        # --- factory injection: get_environment ---
        try:
            import minisweagent.environments as _envs_mod

            owner = self.__class__
            if owner._original_get_environment is None:
                owner._original_get_environment = _envs_mod.get_environment
            # Bind the original factory into the closure. Looking it up on the
            # class at call time broke in two ways: it is reset to None by
            # ``_uninstrument`` (so a wrapper reference kept by user code then
            # called ``None``), and for a subclass instance the value is stored
            # on the subclass while the lookup went to the base class.
            original_get_environment = owner._original_get_environment

            def _wrapped_get_environment(*args: Any, **kw: Any) -> Any:
                env = original_get_environment(*args, **kw)
                return TracingEnvironment(env, tracer)

            _envs_mod.get_environment = _wrapped_get_environment
        except Exception as exc:
            logger.warning("Could not wrap get_environment: %s", exc)

        try:
            patch_mini_cli_app_module()
        except Exception as exc:
            logger.warning("Could not patch minisweagent.run.mini.app (ENTRY): %s", exc)

        # --- wrapt: DefaultAgent.run / DefaultAgent.step ---
        try:
            wrap_function_wrapper(
                module="minisweagent.agents.default",
                name="DefaultAgent.run",
                wrapper=DefaultAgentRunWrapper(tracer),
            )
        except Exception as exc:
            logger.warning("Could not wrap DefaultAgent.run: %s", exc)

        try:
            wrap_function_wrapper(
                module="minisweagent.agents.default",
                name="DefaultAgent.step",
                wrapper=DefaultAgentStepWrapper(tracer),
            )
        except Exception as exc:
            logger.warning("Could not wrap DefaultAgent.step: %s", exc)

    def _uninstrument(self, **kwargs: Any) -> None:
        """Undo the patches applied by ``_instrument`` (best effort).

        Failures are logged at debug level and otherwise ignored.
        """
        # --- restore wrapt patches on DefaultAgent ---
        try:
            from minisweagent.agents.default import DefaultAgent

            if hasattr(DefaultAgent.run, "__wrapped__"):
                DefaultAgent.run = DefaultAgent.run.__wrapped__  # type: ignore[attr-defined]
            if hasattr(DefaultAgent.step, "__wrapped__"):
                DefaultAgent.step = DefaultAgent.step.__wrapped__  # type: ignore[attr-defined]
        except Exception as exc:
            logger.debug("Could not unwrap DefaultAgent: %s", exc)

        try:
            from opentelemetry.instrumentation.minisweagent.internal.cli_wrappers import (
                unpatch_mini_cli_app_module,
            )

            unpatch_mini_cli_app_module()
        except Exception as exc:
            logger.debug("Could not unpatch mini app: %s", exc)

        # --- restore original factory ---
        if self.__class__._original_get_environment is not None:
            try:
                import minisweagent.environments as _envs_mod

                _envs_mod.get_environment = self.__class__._original_get_environment
                self.__class__._original_get_environment = None
            except Exception as exc:
                logger.debug("Could not restore get_environment: %s", exc)
def _int_env(name: str, default: str) -> int:
    """Return environment variable *name* parsed as an integer.

    *default* is the textual fallback: it is used when the variable is unset
    and also when the variable's value cannot be parsed as an integer.
    """
    raw_value = os.getenv(name, default)
    try:
        parsed = int(raw_value)
    except ValueError:
        parsed = int(default)
    return parsed
def _task_preview(task: str, max_len: int | None = None) -> str:
    """Return *task* shortened to at most the configured preview length.

    Args:
        task: Text to preview.
        max_len: Optional override for the length limit; when ``None`` the
            module setting ``OTEL_MINISWEAGENT_TASK_PREVIEW_MAX_LEN`` is used.

    Returns:
        ``task`` itself when it fits, otherwise a prefix ending in ``"..."``
        whose total length equals the limit. A limit below 3 leaves no room
        for the ellipsis, so a bare prefix is returned; a limit of zero or
        less yields ``""``.
    """
    limit = OTEL_MINISWEAGENT_TASK_PREVIEW_MAX_LEN if max_len is None else max_len
    if not task or limit <= 0:
        return ""
    if len(task) <= limit:
        return task
    if limit < 3:
        # ``task[: limit - 3]`` would be a negative slice here and return
        # almost the whole string, i.e. far more than the limit allows.
        return task[:limit]
    return task[: limit - 3] + "..."


def _request_model_from_agent(instance: Any) -> str | None:
    """Return ``instance.model.config.model_name`` as a string, if available.

    ``None`` is returned when any link of that attribute chain is missing or
    is itself ``None``.
    """
    model = getattr(instance, "model", None)
    if model is None:
        return None
    cfg = getattr(model, "config", None)
    if cfg is None:
        return None
    mn = getattr(cfg, "model_name", None)
    if mn is not None:
        return str(mn)
    return None


def _populate_invoke_from_agent(inv: Any, instance: Any) -> None:
    """Copy the agent's conversation payload onto the invocation *inv*.

    The payload comes from ``build_invoke_agent_payload(instance)``. If
    building it fails, the failure is logged at debug level and *inv* is left
    untouched, so telemetry problems never affect the agent run.
    """
    try:
        payload = build_invoke_agent_payload(instance)
    except Exception:
        logger.debug("invoke_agent telemetry payload failed", exc_info=True)
        return
    inv.system_instruction = payload["system_instruction"]
    inv.input_messages = payload["input_messages"]
    inv.output_messages = payload["output_messages"]
    inv.tool_definitions = payload["tool_definitions"]
class DefaultAgentStepWrapper:
    """wrapt hook that brackets each wrapped step call in a ReAct STEP invocation.

    The round counter is kept on the agent instance as ``_otel_msw_round`` and
    incremented before every step.
    """

    __slots__ = ("_tracer",)

    def __init__(self, tracer: Tracer):  # noqa: ARG002
        self._tracer = tracer

    def __call__(
        self,
        wrapped: Callable[..., Any],
        instance: Any,
        args: tuple[Any, ...],
        kwargs: dict[str, Any],
    ) -> Any:
        """Run ``wrapped(*args, **kwargs)`` inside a ``ReactStepInvocation``.

        The invocation is always closed: ``fail_react_step`` for ordinary
        exceptions, ``stop_react_step`` for ``InterruptAgentFlow``, for
        non-``Exception`` ``BaseException`` subclasses, and on success.
        Every exception is re-raised unchanged.
        """
        from minisweagent.exceptions import InterruptAgentFlow  # noqa: PLC0415
        from opentelemetry.util.genai.extended_handler import get_extended_telemetry_handler  # noqa: PLC0415
        from opentelemetry.util.genai.extended_types import ReactStepInvocation  # noqa: PLC0415
        from opentelemetry.util.genai.types import Error as GenAIError  # noqa: PLC0415

        previous_round = getattr(instance, "_otel_msw_round", 0) or 0
        round_no = int(previous_round) + 1
        instance._otel_msw_round = round_no  # noqa: SLF001

        handler = get_extended_telemetry_handler()
        step = ReactStepInvocation(round=round_no)
        handler.start_react_step(step, context=context_api.get_current())
        try:
            outcome = wrapped(*args, **kwargs)
        except BaseException as exc:
            step.finish_reason = type(exc).__qualname__
            # InterruptAgentFlow is control flow, not a failure, so it ends
            # the step normally just like non-Exception BaseExceptions do.
            is_failure = isinstance(exc, Exception) and not isinstance(
                exc, InterruptAgentFlow
            )
            if is_failure:
                handler.fail_react_step(
                    step, GenAIError(message=str(exc), type=type(exc))
                )
            else:
                handler.stop_react_step(step)
            raise
        handler.stop_react_step(step)
        return outcome
try_fill_entry_payload_from_mini_trajectory() + if payload: + apply_payload_to_entry_invocation(entry_inv, payload) + except Exception: + logger.debug("ENTRY traj hydrate failed", exc_info=True) + + def __call__(self, *args: Any, **kwargs: Any) -> Any: + from opentelemetry.util.genai.extended_handler import get_extended_telemetry_handler # noqa: PLC0415 + from opentelemetry.util.genai.extended_types import EntryInvocation # noqa: PLC0415 + from opentelemetry.util.genai.types import Error as GenAIError # noqa: PLC0415 + + han = get_extended_telemetry_handler() + entry_inv = EntryInvocation() + han.start_entry(entry_inv, context=context_api.get_current()) + try: + result = self._inner(*args, **kwargs) + except Exception as exc: + self._hydrate_entry(entry_inv) + han.fail_entry( + entry_inv, + GenAIError(message=str(exc), type=type(exc)), + ) + raise + except BaseException: + # Typer/Click commonly exits by raising SystemExit after the command + # callback has completed; the trajectory file is available here. + self._hydrate_entry(entry_inv) + han.stop_entry(entry_inv) + raise + + self._hydrate_entry(entry_inv) + han.stop_entry(entry_inv) + return result + + # Typer exposes click commands via attribute access — forward everything. 
def unpatch_mini_cli_app_module() -> None:
    """Restore the original ``app`` on ``minisweagent.run.mini`` if it was patched.

    Does nothing when the module was never imported or does not carry the
    patch flag. Any failure is logged at debug level and swallowed.
    """
    try:
        module = sys.modules.get("minisweagent.run.mini")
        if module is None:
            return
        if not getattr(module, _PATCH_FLAG, False):
            return
        original_app = getattr(module, _ORIG_APP_ATTR, None)
        if original_app is not None:
            module.app = original_app  # type: ignore[assignment]
        # Remove the bookkeeping attributes so a later patch starts clean.
        for marker in (_PATCH_FLAG, _ORIG_APP_ATTR):
            delattr(module, marker)
    except Exception as exc:
        logger.debug("unpatch mini app failed: %s", exc)
def _text_parts(content: str | None) -> list[Text]:
    """Wrap *content* in a single ``Text`` part.

    Returns an empty list when *content* is ``None`` or its string form is
    empty or whitespace-only.
    """
    if content is None:
        return []
    rendered = str(content)
    if rendered.strip() == "":
        return []
    return [Text(content=rendered)]
def _message_to_semconv_messages(
    msg: dict[str, Any],
) -> list[InputMessage | OutputMessage]:
    """Convert one trajectory message dict into GenAI message objects.

    Mapping by ``msg["role"]``:

    * ``"user"`` -> ``InputMessage(role="user")`` with the text content.
    * ``"tool"`` -> ``InputMessage(role="tool")`` holding one
      ``ToolCallResponse`` (``id`` only when ``tool_call_id`` is a string).
    * ``"assistant"`` -> ``OutputMessage`` with text parts followed by tool
      calls; ``finish_reason`` is ``"tool_calls"`` when the message has
      ``tool_calls`` or ``extra["actions"]``, otherwise ``"stop"``.
    * ``"exit"`` -> user ``InputMessage`` whose text is prefixed ``"EXIT: "``.
    * anything else -> ``InputMessage`` with role ``str(role or "unknown")``.

    Missing or ``None`` content is never rendered as the literal string
    ``"None"``: it yields no text part, or an empty suffix for ``"exit"``.
    """
    role = msg.get("role")
    if role == "user":
        return [InputMessage(role="user", parts=_text_parts(msg.get("content")))]
    if role == "tool":
        tid = msg.get("tool_call_id")
        return [
            InputMessage(
                role="tool",
                parts=[
                    ToolCallResponse(
                        id=tid if isinstance(tid, str) else None,
                        response=msg.get("content", ""),
                    )
                ],
            )
        ]
    if role == "assistant":
        parts: list[Any] = []
        parts.extend(_text_parts(msg.get("content")))
        parts.extend(_normalized_tool_calls(msg))
        if not parts:
            # Keep one (empty) part so the message is never part-less.
            parts = [Text(content="")]
        extra = msg.get("extra") or {}
        finish = (
            "tool_calls" if extra.get("actions") or msg.get("tool_calls") else "stop"
        )
        return [
            OutputMessage(
                role="assistant", parts=parts, finish_reason=finish  # type: ignore[arg-type]
            )
        ]
    if role == "exit":
        exit_content = msg.get("content")
        # An explicit ``None`` would otherwise be formatted as "EXIT: None".
        exit_text = "" if exit_content is None else exit_content
        return [
            InputMessage(
                role="user",
                parts=_text_parts(f"EXIT: {exit_text}"),
            )
        ]
    return [
        InputMessage(
            # ``_text_parts`` stringifies non-None content itself; calling
            # ``str()`` first turned absent content into the text "None".
            role=str(role or "unknown"), parts=_text_parts(msg.get("content"))
        ),
    ]
isinstance(converted, OutputMessage): + output_messages.append(converted) + else: + input_messages.append(converted) + except Exception: + logger.debug("conversation serialization failed", exc_info=True) + + return { + "system_instruction": sys_inst, + "input_messages": input_messages, + "output_messages": output_messages, + "tool_definitions": [bash_tool_definition()], + } + + +def build_invoke_agent_payload(agent: Any) -> dict[str, Any]: + """Produce semantic fields from a DefaultAgent (or duck-typed agent) trajectory.""" + raw_messages = list(getattr(agent, "messages", None) or []) + messages = [m for m in raw_messages if isinstance(m, dict)] + return build_invoke_payload_from_messages(messages) + + +def try_fill_entry_payload_from_mini_trajectory() -> dict[str, Any] | None: + """Read default mini trajectory file and build ENTRY / invoke payloads.""" + try: + from minisweagent import global_config_dir # noqa: PLC0415 + except Exception: + return None + + path = Path(global_config_dir) / "last_mini_run.traj.json" + if not path.is_file(): + return None + try: + if path.stat().st_size > _TRAJ_MAX_BYTES: + logger.warning("trajectory too large for telemetry snapshot: %s", path) + return None + data = json.loads(path.read_text(encoding="utf-8")) + except Exception: + logger.debug("failed to read mini trajectory %s", path, exc_info=True) + return None + + msgs = data.get("messages") + if not isinstance(msgs, list): + return None + dict_msgs = [m for m in msgs if isinstance(m, dict)] + if not dict_msgs: + return None + try: + return build_invoke_payload_from_messages(dict_msgs) + except Exception: + logger.debug("trajectory payload build failed", exc_info=True) + return None + + +def apply_payload_to_entry_invocation(entry_inv: Any, payload: dict[str, Any]) -> None: + entry_inv.input_messages = payload["input_messages"] + entry_inv.output_messages = payload["output_messages"] + entry_inv.system_instruction = payload["system_instruction"] + entry_inv.tool_definitions = 
class TracingEnvironment:
    """Delegates to inner Environment and emits ARMS-aligned TOOL (execute_tool) spans.

    Every attribute other than ``execute`` is forwarded to the wrapped
    environment through ``__getattr__``.
    """

    __slots__ = ("_inner", "_tracer")

    def __init__(self, inner: Any, tracer: Tracer):  # noqa: ARG002
        object.__setattr__(self, "_inner", inner)
        object.__setattr__(self, "_tracer", tracer)

    def __getattr__(self, name: str) -> Any:
        return getattr(self._inner, name)

    def execute(self, action: dict, cwd: str = "", **kwargs: Any) -> dict[str, Any]:
        """Run ``action`` on the inner environment inside an execute_tool invocation.

        Args:
            action: Action mapping; ``command`` and ``tool_call_id`` are read
                from it when it is a dict.
            cwd: Working directory, forwarded positionally to the inner
                ``execute``.
            **kwargs: Forwarded unchanged to the inner ``execute``.

        Returns:
            Whatever the inner ``execute`` returns, unmodified.

        Raises:
            Anything raised by the inner ``execute`` is re-raised after the
            invocation has been closed: failed for ordinary exceptions,
            stopped for ``InterruptAgentFlow`` and for non-``Exception``
            ``BaseException`` subclasses.
        """
        from minisweagent.exceptions import InterruptAgentFlow  # noqa: PLC0415
        from opentelemetry.util.genai.extended_handler import get_extended_telemetry_handler  # noqa: PLC0415
        from opentelemetry.util.genai.extended_types import ExecuteToolInvocation  # noqa: PLC0415
        from opentelemetry.util.genai.types import Error as GenAIError  # noqa: PLC0415

        command = action.get("command", "") if isinstance(action, dict) else ""
        tool_call_id = (
            action.get("tool_call_id") if isinstance(action, dict) else None
        )
        han = get_extended_telemetry_handler()
        inv = ExecuteToolInvocation(
            tool_name="bash",
            provider="minisweagent",
            tool_type="function",
            tool_call_id=tool_call_id if isinstance(tool_call_id, str) else None,
            tool_description="Execute a bash command",
            tool_call_arguments={"command": command},
        )

        han.start_execute_tool(inv, context=context_api.get_current())
        try:
            result = self._inner.execute(action, cwd, **kwargs)
        except InterruptAgentFlow:
            inv.tool_call_result = {"interrupted": "InterruptAgentFlow"}
            han.stop_execute_tool(inv)
            raise
        except Exception as exc:
            inv.tool_call_result = {"error": str(exc)}
            han.fail_execute_tool(
                inv, GenAIError(message=str(exc), type=type(exc))
            )
            raise
        except BaseException as exc:
            # KeyboardInterrupt / SystemExit / GeneratorExit bypass the
            # ``Exception`` handler; without this branch the invocation
            # started above would never be closed.
            inv.tool_call_result = {"interrupted": type(exc).__qualname__}
            han.stop_execute_tool(inv)
            raise

        if isinstance(result, dict):
            payload_out = dict(result)
        else:
            payload_out = {"value": result}
        inv.tool_call_result = _sanitize_tool_result(payload_out)
        han.stop_execute_tool(inv)
        return result
b/instrumentation-loongsuite/loongsuite-instrumentation-minisweagent/src/opentelemetry/instrumentation/minisweagent/version.py new file mode 100644 index 000000000..3dc1f76bc --- /dev/null +++ b/instrumentation-loongsuite/loongsuite-instrumentation-minisweagent/src/opentelemetry/instrumentation/minisweagent/version.py @@ -0,0 +1 @@ +__version__ = "0.1.0" diff --git a/instrumentation-loongsuite/loongsuite-instrumentation-openhands/README.rst b/instrumentation-loongsuite/loongsuite-instrumentation-openhands/README.rst new file mode 100644 index 000000000..cc8d36bdb --- /dev/null +++ b/instrumentation-loongsuite/loongsuite-instrumentation-openhands/README.rst @@ -0,0 +1,93 @@ +OpenTelemetry OpenHands Instrumentation +======================================== + +Automatic OpenTelemetry instrumentation for the legacy OpenHands V0 / +CodeAct runtime. + +What is covered +--------------- + +This package wraps the V0 ``python -m openhands.core.main`` execution path: + +* ``openhands.core.main.run_controller`` for the ENTRY span. +* ``openhands.core.loop.run_agent_until_done`` for the AGENT span fallback. +* ``AgentController.__init__`` / ``AgentController.close`` for lifecycle-bound + ENTRY and AGENT spans that survive ``python -m`` from-import binding. +* ``AgentController._step`` for ReAct STEP spans. +* ``Runtime.run_action`` for TOOL spans. +* ``LLM.__init__`` to bridge the current OpenHands context into LiteLLM calls. + +Span tree +--------- + +:: + + ENTRY enter openhands + `-- AGENT invoke_agent codeact + |-- STEP react step [xN] + | |-- LLM chat {model} + | `-- TOOL execute_tool {tool_name} + `-- STEP react step [...] + +``python -m`` and from-import binding +------------------------------------- + +When OpenHands V0 is launched via ``python -m openhands.core.main``, Python +executes ``main.py`` as ``__main__``. Symbols imported with ``from ... 
import`` +can be bound before module-level wrappers are installed, so patching +``openhands.core.main.run_controller`` is not enough by itself. + +To keep ENTRY and AGENT spans reliable, this instrumentation primarily opens +them from ``AgentController.__init__`` and closes them from +``AgentController.close``. The module-level wrappers remain as a fallback for +programmatic invocations. + +Cross-thread context bridge +--------------------------- + +OpenHands V0 may execute controller steps and runtime tool calls in worker +threads with fresh asyncio loops. The instrumentation stores the active OTel +context by session id and re-attaches it in STEP, TOOL, and LLM bridge wrappers +so the trace remains: + +``ENTRY -> AGENT -> STEP -> (LLM / TOOL)``. + +Semantic-convention I/O capture +------------------------------- + +ENTRY and STEP spans emit ``input.value`` / ``output.value`` and GenAI semantic +attributes where applicable. AGENT spans use GenAI-native attributes for +messages without OpenInference ``input.value`` / ``output.value`` mirrors. +TOOL spans never set ``input.value`` / ``output.value``; they always set +``gen_ai.tool.call.arguments`` (JSON object string, ``"{}"`` when empty) and +``gen_ai.tool.call.result``. + +* **ENTRY** emits ``gen_ai.input.messages`` and ``gen_ai.output.messages`` using + the ARMS parts-based message schema. +* **AGENT** emits ``gen_ai.input.messages``, ``gen_ai.output.messages``, + ``gen_ai.system_instructions``, and ``gen_ai.tool.definitions``. +* **STEP** emits recent input history and the pending assistant/tool-call + output for the ReAct round. +* **TOOL** emits ``gen_ai.tool.name``, ``gen_ai.tool.type``, + ``gen_ai.tool.call.id``, ``gen_ai.tool.description``, + ``gen_ai.tool.call.arguments``, and ``gen_ai.tool.call.result``. + +Usage +----- + +.. 
code:: python + + from opentelemetry.instrumentation.openhands import OpenHandsInstrumentor + + OpenHandsInstrumentor().instrument() + +Configuration +------------- + +Environment variables: + +* ``OTEL_INSTRUMENTATION_OPENHANDS_ENABLED`` (default ``true``) +* ``OTEL_INSTRUMENTATION_OPENHANDS_OUTER_SPANS`` (default ``true``) +* ``OTEL_INSTRUMENTATION_OPENHANDS_AUTO_INSTRUMENT_LITELLM`` (default ``true``) + +I/O capture is always on and content is emitted in full. diff --git a/instrumentation-loongsuite/loongsuite-instrumentation-openhands/pyproject.toml b/instrumentation-loongsuite/loongsuite-instrumentation-openhands/pyproject.toml new file mode 100644 index 000000000..b9f0ae7f4 --- /dev/null +++ b/instrumentation-loongsuite/loongsuite-instrumentation-openhands/pyproject.toml @@ -0,0 +1,50 @@ +[build-system] +requires = ["hatchling"] +build-backend = "hatchling.build" + +[project] +name = "loongsuite-instrumentation-openhands" +dynamic = ["version"] +description = "LoongSuite OpenHands Instrumentation" +readme = "README.rst" +license = "Apache-2.0" +requires-python = ">=3.10" +authors = [ + { name = "OpenTelemetry Authors", email = "cncf-opentelemetry-contributors@lists.cncf.io" }, +] +classifiers = [ + "Development Status :: 4 - Beta", + "Intended Audience :: Developers", + "License :: OSI Approved :: Apache Software License", + "Programming Language :: Python", + "Programming Language :: Python :: 3", + "Programming Language :: Python :: 3.10", + "Programming Language :: Python :: 3.11", + "Programming Language :: Python :: 3.12", + "Programming Language :: Python :: 3.13", +] +dependencies = [ + "wrapt >= 1.0.0, < 2.0.0", +] + +[project.optional-dependencies] +instruments = [] + +[project.entry-points.opentelemetry_instrumentor] +openhands = "opentelemetry.instrumentation.openhands:OpenHandsInstrumentor" + +[project.urls] +Homepage = "https://github.com/alibaba/loongsuite-python-agent/tree/main/instrumentation-loongsuite/loongsuite-instrumentation-openhands" 
+Repository = "https://github.com/alibaba/loongsuite-python-agent" + +[tool.hatch.version] +path = "src/opentelemetry/instrumentation/openhands/version.py" + +[tool.hatch.build.targets.sdist] +include = [ + "/src", + "/tests", +] + +[tool.hatch.build.targets.wheel] +packages = ["src/opentelemetry"] diff --git a/instrumentation-loongsuite/loongsuite-instrumentation-openhands/src/opentelemetry/instrumentation/openhands/__init__.py b/instrumentation-loongsuite/loongsuite-instrumentation-openhands/src/opentelemetry/instrumentation/openhands/__init__.py new file mode 100644 index 000000000..a02a7d3b3 --- /dev/null +++ b/instrumentation-loongsuite/loongsuite-instrumentation-openhands/src/opentelemetry/instrumentation/openhands/__init__.py @@ -0,0 +1,265 @@ +"""OpenTelemetry OpenHands Instrumentation. + +Wraps the legacy V0 (CodeAct + AgentController + Runtime) path: + +* V0 — ``python -m openhands.core.main``. We add + ``ENTRY → AGENT → STEP → TOOL`` directly on top of the controller / runtime + call chain. LLM spans come from the bundled LiteLLM instrumentor. + +Usage +----- + +.. 
code:: python + + from opentelemetry.instrumentation.openhands import OpenHandsInstrumentor + + OpenHandsInstrumentor().instrument() +""" + +from __future__ import annotations + +import importlib +import logging +from typing import Any, Collection + +from opentelemetry import trace as trace_api +from opentelemetry.instrumentation.instrumentor import BaseInstrumentor +from wrapt import wrap_function_wrapper + +from opentelemetry.instrumentation.openhands.config import ( + OTEL_INSTRUMENTATION_OPENHANDS_AUTO_INSTRUMENT_LITELLM, + OTEL_INSTRUMENTATION_OPENHANDS_ENABLED, + OTEL_INSTRUMENTATION_OPENHANDS_OUTER_SPANS, +) +from opentelemetry.instrumentation.openhands.package import _instruments +from opentelemetry.instrumentation.openhands.version import __version__ + +logger = logging.getLogger(__name__) + +__all__ = ["OpenHandsInstrumentor"] + + +# --------------------------------------------------------------------------- +# Wrap-point registry — single source of truth shared with _uninstrument. +# Entries: (module, qualified_name) +# --------------------------------------------------------------------------- + +_PATCH_TARGETS: list[tuple[str, str]] = [ + ("openhands.core.main", "run_controller"), + ("openhands.core.loop", "run_agent_until_done"), + # AgentController.__init__ / .close are the *primary* ENTRY+AGENT + # span source for V0 — they're class methods, so they're patchable + # regardless of the from-import binding problem in main.py + # (see v0_wrappers.AgentControllerInitWrapper docstring). 
def _module_importable(module: str) -> bool:
    """Report whether *module* can be considered present.

    Returns ``False`` only when importing raises ``ModuleNotFoundError``.
    Any other import-time exception still returns ``True`` so that the
    subsequent wrap attempt runs and surfaces its own warning.
    """
    try:
        importlib.import_module(module)
    except ModuleNotFoundError:
        return False
    except Exception:
        # Present but broken: let the caller's wrap attempt report it.
        return True
    return True
def _safe_unwrap(module: str, qualname: str) -> None:
    """Replace ``module.qualname`` with its ``__wrapped__`` original, if any.

    ``qualname`` is a dotted attribute path resolved from the module object
    (for example ``"Class.method"``). The call is a silent no-op when the
    module cannot be imported, the path does not resolve, the resolved object
    has no ``__wrapped__`` attribute, or the final assignment fails.
    """
    try:
        owner: Any = importlib.import_module(module)
    except Exception:
        return
    *path, leaf = qualname.split(".")
    try:
        # Walk down to the object that owns the final attribute.
        for attr in path:
            owner = getattr(owner, attr)
        target = getattr(owner, leaf)
    except Exception:
        return
    if not hasattr(target, "__wrapped__"):
        return
    try:
        setattr(owner, leaf, target.__wrapped__)
    except Exception:
        pass
LLMInitWrapper, + }) + + # Auto-enable bundled LiteLLM instrumentation so SDK / V0 LLM + # ``litellm.completion()`` calls become LLM spans. + if OTEL_INSTRUMENTATION_OPENHANDS_AUTO_INSTRUMENT_LITELLM: + self._maybe_enable_litellm(**kwargs) + + def _install_v0_patches(self, tracer, factories) -> None: + RunControllerWrapper = factories["run_controller"] + RunAgentUntilDoneWrapper = factories["run_agent_until_done"] + AgentControllerInitWrapper = factories["agent_init"] + AgentControllerCloseWrapper = factories["agent_close"] + AgentControllerStepWrapper = factories["agent_step"] + RuntimeRunActionWrapper = factories["runtime_run_action"] + LLMInitWrapper = factories["llm_init"] + + # `run_controller` and `run_agent_until_done` patches are best-effort: + # they only fire when run_controller is called via the proper module + # path (programmatic / test). When OpenHands is launched via + # ``python -m openhands.core.main``, the from-import binding in + # main.py bypasses these patches — the AgentController.__init__ / + # .close patches below take over and produce ENTRY+AGENT spans + # reliably (class methods are immune to from-import binding). + _safe_wrap( + "openhands.core.main", + "run_controller", + RunControllerWrapper(tracer), + ) + _safe_wrap( + "openhands.core.loop", + "run_agent_until_done", + RunAgentUntilDoneWrapper(tracer), + ) + _safe_wrap( + "openhands.controller.agent_controller", + "AgentController.__init__", + AgentControllerInitWrapper(tracer), + ) + _safe_wrap( + "openhands.controller.agent_controller", + "AgentController.close", + AgentControllerCloseWrapper(tracer), + ) + _safe_wrap( + "openhands.controller.agent_controller", + "AgentController._step", + AgentControllerStepWrapper(tracer), + ) + _safe_wrap( + "openhands.runtime.base", + "Runtime.run_action", + RuntimeRunActionWrapper(tracer), + ) + # LLM context bridge — patches ``LLM.__init__`` so every instance's + # ``self._completion`` re-attaches the latest sid-stashed context. 
def _bool_env(name: str, default: bool) -> bool:
    """Read a boolean flag from the environment.

    Returns *default* when the variable is unset. Otherwise the value is
    stripped and lower-cased, and is true only for ``true``, ``1``, ``yes``
    or ``on``; every other value, including the empty string, is false.
    """
    raw = os.getenv(name)
    if raw is not None:
        normalized = raw.strip().lower()
        return normalized in ("true", "1", "yes", "on")
    return default
+OTEL_INSTRUMENTATION_OPENHANDS_AUTO_INSTRUMENT_LITELLM = _bool_env( + "OTEL_INSTRUMENTATION_OPENHANDS_AUTO_INSTRUMENT_LITELLM", True +) diff --git a/instrumentation-loongsuite/loongsuite-instrumentation-openhands/src/opentelemetry/instrumentation/openhands/internal/__init__.py b/instrumentation-loongsuite/loongsuite-instrumentation-openhands/src/opentelemetry/instrumentation/openhands/internal/__init__.py new file mode 100644 index 000000000..7b2c8b6a1 --- /dev/null +++ b/instrumentation-loongsuite/loongsuite-instrumentation-openhands/src/opentelemetry/instrumentation/openhands/internal/__init__.py @@ -0,0 +1 @@ +"""Internal helpers for OpenHands instrumentation.""" diff --git a/instrumentation-loongsuite/loongsuite-instrumentation-openhands/src/opentelemetry/instrumentation/openhands/internal/constants.py b/instrumentation-loongsuite/loongsuite-instrumentation-openhands/src/opentelemetry/instrumentation/openhands/internal/constants.py new file mode 100644 index 000000000..6d99a6820 --- /dev/null +++ b/instrumentation-loongsuite/loongsuite-instrumentation-openhands/src/opentelemetry/instrumentation/openhands/internal/constants.py @@ -0,0 +1,12 @@ +"""Constant attribute keys & framework identity used across wrappers.""" + +from __future__ import annotations + +GEN_AI_FRAMEWORK = "gen_ai.framework" +GEN_AI_SPAN_KIND = "gen_ai.span.kind" + +FRAMEWORK_NAME = "openhands" + +# OpenHands-specific span attributes (namespaced to avoid clashing with the +# generic GenAI semconv attributes already provided by upstream). 
+OH_INITIAL_MESSAGE_PREVIEW = "openhands.initial_message.preview" diff --git a/instrumentation-loongsuite/loongsuite-instrumentation-openhands/src/opentelemetry/instrumentation/openhands/internal/session_context.py b/instrumentation-loongsuite/loongsuite-instrumentation-openhands/src/opentelemetry/instrumentation/openhands/internal/session_context.py new file mode 100644 index 000000000..534d3e611 --- /dev/null +++ b/instrumentation-loongsuite/loongsuite-instrumentation-openhands/src/opentelemetry/instrumentation/openhands/internal/session_context.py @@ -0,0 +1,196 @@ +"""Cross-thread / cross-loop OTel context bridge keyed by OpenHands session id. + +Why this exists +--------------- + +OpenHands V0's ``EventStream`` delivers events to subscribers via a +``ThreadPoolExecutor``. The ``AgentController.on_event`` callback then runs + +.. code:: python + + asyncio.get_event_loop().run_until_complete(self._on_event(event)) + +inside a *worker thread*, which spins up a brand-new asyncio loop with a +fresh ``contextvars.Context``. This means none of the OTel context (tracer +spans / baggage) attached on the main coroutine in ``run_controller`` is +visible inside ``AgentController._step`` or ``Runtime.run_action`` — every +STEP / TOOL span starts at the **trace root**, fragmenting the trace into +many disconnected pieces. + +This module bridges that gap. We snapshot the OTel context at entry-time +(``run_controller`` / ``run_agent_until_done``) under the controller's +session id, and the STEP / TOOL wrappers re-attach the snapshot before +opening their spans so every span shares a single ``trace_id`` rooted at +the ENTRY span. + +The store is keyed by **session id (sid)** so concurrent benchmark +sessions stay isolated. +""" + +from __future__ import annotations + +import threading +from typing import Optional + +from opentelemetry import context as otel_context + +_lock = threading.Lock() +# Map session id -> OTel Context object. 
# session id -> OTel Context. Each stored Context carries the active Span
# (and any baggage / suppression flags); re-attaching it makes the stored
# span the *current* span for whatever thread/loop attaches it.
_session_contexts: dict[str, otel_context.Context] = {}

# Map session id -> { tool_name: tool_definition_dict }.  Captured at
# AGENT span open from ``controller.agent.tools`` and consumed by the
# TOOL wrapper to populate ``gen_ai.tool.description`` and friends — the
# Runtime instance does not have direct access to the agent's tool list.
_session_tool_registry: dict[str, dict[str, dict]] = {}

# Tracks the most-recent sid we stored a context for.  Used as a fallback
# when a hook point (typically ``Runtime.run_action``) cannot locate the
# session id from its arguments — in single-session CLI runs this is
# always the right answer.
_last_sid: Optional[str] = None


def store_context(sid: Optional[str], ctx: otel_context.Context) -> None:
    """Record ``ctx`` for ``sid`` and remember ``sid`` as the latest one.

    A falsy ``sid`` is ignored.
    """
    global _last_sid
    if sid:
        with _lock:
            _session_contexts[sid] = ctx
            _last_sid = sid


def get_context(sid: Optional[str]) -> Optional[otel_context.Context]:
    """Look up the context stored for ``sid``.

    When ``sid`` is falsy or unknown, the context of the most recently
    stored sid is returned instead; ``None`` when neither exists.
    """
    with _lock:
        # Preference order: the requested sid, then the latest stored sid.
        for candidate in (sid, _last_sid):
            if candidate and candidate in _session_contexts:
                return _session_contexts[candidate]
    return None


def clear_context(sid: Optional[str]) -> None:
    """Forget the context and tool registry stored for ``sid``.

    Also resets the "latest sid" marker when it points at ``sid``.
    """
    global _last_sid
    if sid:
        with _lock:
            for store in (_session_contexts, _session_tool_registry):
                store.pop(sid, None)
            if _last_sid == sid:
                _last_sid = None


def clear_all() -> None:
    """Drop everything (only used by tests)."""
    global _last_sid
    with _lock:
        for store in (_session_contexts, _session_tool_registry):
            store.clear()
        _last_sid = None


# ---------------------------------------------------------------------------
# Tool registry (per-sid)
# ---------------------------------------------------------------------------


def _index_tool(tool: object) -> Optional[tuple[str, dict]]:
    """Return ``(name, definition_dict)`` for one tool, or ``None`` if unnamed.

    Dict tools are kept as-is; object tools are normalized into the
    ``{"type": ..., "function": {...}}`` dict shape so consumers do not
    need to know the original type.
    """
    if isinstance(tool, dict):
        function = tool.get("function") or {}
        tool_name = function.get("name") if isinstance(function, dict) else None
        if not tool_name:
            return None
        return str(tool_name), tool

    function = getattr(tool, "function", None)
    tool_name = getattr(function, "name", None) if function is not None else None
    if not tool_name:
        return None
    definition = {
        "type": getattr(tool, "type", "function"),
        "function": {
            "name": tool_name,
            "description": getattr(function, "description", "") or "",
            "parameters": getattr(function, "parameters", None) or {},
        },
    }
    return str(tool_name), definition


def store_tool_registry(sid: Optional[str], tools: object) -> None:
    """Index ``tools`` by name and stash under ``sid``.

    ``tools`` is whatever ``controller.agent.tools`` exposes — typically a
    list of LiteLLM ``ChatCompletionToolParam`` dicts of the form
    ``{"type": "function", "function": {"name": ..., "description": ..., ...}}``.
    Entries that do not fit that shape (or that raise while being read)
    are skipped. Nothing is stored when ``sid`` or ``tools`` is falsy,
    when ``tools`` is not iterable, or when no named tool was found.
    """
    if not sid or not tools:
        return

    by_name: dict[str, dict] = {}
    try:
        for tool in tools:  # type: ignore[union-attr]
            try:
                entry = _index_tool(tool)
            except Exception:
                # One malformed tool must not discard the others.
                continue
            if entry is not None:
                key, definition = entry
                by_name[key] = definition
    except TypeError:
        # ``tools`` turned out not to be iterable.
        return

    if by_name:
        with _lock:
            _session_tool_registry[sid] = by_name


def get_tool_definition(sid: Optional[str], name: Optional[str]) -> Optional[dict]:
    """Look up a single tool's definition (dict) by name, sid-scoped.

    Falls back to the most recently stored session when ``sid`` has no
    registry (the single-CLI-run case).
    """
    if not name:
        return None
    with _lock:
        for candidate in (sid, _last_sid):
            if candidate and candidate in _session_tool_registry:
                return _session_tool_registry[candidate].get(name)
    return None


def get_tool_registry(sid: Optional[str]) -> Optional[dict[str, dict]]:
    """Return a shallow copy of the ``{name: definition}`` registry for ``sid``.

    Uses the same last-session fallback as :func:`get_tool_definition`.
    """
    with _lock:
        for candidate in (sid, _last_sid):
            if candidate and candidate in _session_tool_registry:
                return dict(_session_tool_registry[candidate])
    return None


class AttachedSession:
    """Context manager that attaches the stashed context for ``sid``.

    Usage::

        with AttachedSession(sid):
            span = tracer.start_span(...)
            # span is parented under whatever the stashed context contains

    No-op when no stash exists for the given sid.
    """

    __slots__ = ("_sid", "_token")

    def __init__(self, sid: Optional[str]):
        self._sid = sid
        self._token = None

    def __enter__(self) -> "AttachedSession":
        stashed = get_context(self._sid)
        if stashed is not None:
            self._token = otel_context.attach(stashed)
        return self

    def __exit__(self, exc_type, exc_val, exc_tb) -> None:
        token, self._token = self._token, None
        if token is None:
            return
        try:
            otel_context.detach(token)
        except Exception:
            # Detach failures are deliberately ignored.
            pass
+ return str(value) + except Exception: + return "" + + +def preview(text: Any, max_len: int | None = None) -> str: + """Return a string preview of *text* (kept for API compatibility). + + Truncation is no longer applied — captured content is emitted in + full so dashboards never lose information. ``max_len`` is accepted + but ignored. + """ + return safe_str(text) + + +def maybe_preview(text: Any) -> str: + """Alias for :func:`preview` — kept for API compatibility.""" + return preview(text) + + +def safe_get_attr(obj: Any, *names: str, default: Any = None) -> Any: + """Return the first non-None attribute among *names* on *obj*.""" + for name in names: + if obj is None: + return default + try: + v = getattr(obj, name, None) + except Exception: + v = None + if v is not None: + return v + return default + + +def serialize_message(message: Any) -> str: + """Best-effort serialize an OpenHands message-like object to text.""" + if message is None: + return "" + if isinstance(message, str): + return message + text_parts: list[str] = [] + for attr in ("text", "content", "value"): + v = safe_get_attr(message, attr) + if isinstance(v, str) and v: + return v + if isinstance(v, list): + for item in v: + t = safe_get_attr(item, "text", "content") + if isinstance(t, str) and t: + text_parts.append(t) + if text_parts: + return "\n".join(text_parts) + return safe_str(message) + + +def extract_uuid_str(value: Any) -> str: + """Convert a UUID-like value to its hex/string form, returning ''.""" + if value is None: + return "" + hex_attr = getattr(value, "hex", None) + if isinstance(hex_attr, str) and hex_attr: + return hex_attr + return safe_str(value) + + +# --------------------------------------------------------------------------- +# Semconv I/O serialization (input.value / output.value) +# --------------------------------------------------------------------------- + + +def _to_jsonable(obj: Any, depth: int = 0, max_depth: int = 3) -> Any: + """Best-effort convert ``obj`` into 
something json.dumps can serialize.""" + if obj is None or isinstance(obj, (bool, int, float, str)): + return obj + if depth >= max_depth: + return safe_str(obj) + if isinstance(obj, dict): + out: dict[str, Any] = {} + for k, v in obj.items(): + try: + out[safe_str(k)] = _to_jsonable(v, depth + 1, max_depth) + except Exception: + out[safe_str(k)] = safe_str(v) + return out + if isinstance(obj, (list, tuple, set)): + return [_to_jsonable(v, depth + 1, max_depth) for v in obj] + # Pydantic v2 + if hasattr(obj, "model_dump"): + try: + return _to_jsonable(obj.model_dump(), depth + 1, max_depth) + except Exception: + pass + # Dataclass / generic object + if hasattr(obj, "__dict__"): + try: + d = { + k: v + for k, v in vars(obj).items() + if not k.startswith("_") + and not callable(v) + } + if d: + return _to_jsonable(d, depth + 1, max_depth) + except Exception: + pass + return safe_str(obj) + + +def to_json_str(obj: Any, max_len: int | None = None) -> str: + """Convert ``obj`` to a JSON string. Empty string on failure. + + No truncation is applied — captured content is emitted in full. + ``max_len`` is accepted but ignored (kept for API compatibility). + """ + try: + jsonable = _to_jsonable(obj) + s = json.dumps(jsonable, ensure_ascii=False, default=safe_str) + except Exception: + s = safe_str(obj) + return s or "" + + +def maybe_to_json_str(obj: Any, max_len: int | None = None) -> str: + """Alias for :func:`to_json_str` — kept for API compatibility.""" + return to_json_str(obj, max_len) + + +def messages_to_genai_input(messages: Any) -> str: + """Serialize a chat-style ``messages`` list for ``gen_ai.input.messages``. + + Each item is normalized into ``{"role": ..., "content": ...}``. Keeps + ``tool_calls`` when present. 
+ """ + if not isinstance(messages, list): + return "" + norm: list[dict[str, Any]] = [] + for m in messages: + role = safe_get_attr(m, "role") + content = safe_get_attr(m, "content") + if role is None and content is None and isinstance(m, dict): + role = m.get("role") + content = m.get("content") + if isinstance(content, list): + content = "".join( + safe_str(safe_get_attr(c, "text") or safe_get_attr(c, "content") or c) + for c in content + ) + item: dict[str, Any] = {"role": safe_str(role) or "user", "content": safe_str(content)} + tool_calls = safe_get_attr(m, "tool_calls") + if tool_calls: + item["tool_calls"] = _to_jsonable(tool_calls) + norm.append(item) + return to_json_str(norm) + + +def action_to_genai_output(action: Any) -> str: + """Serialize an OpenHands V0 ``Action`` into a GenAI-style assistant message.""" + if action is None: + return "" + action_type = safe_str(safe_get_attr(action, "action") or "") + thought = safe_str(safe_get_attr(action, "thought") or "") + item: dict[str, Any] = {"role": "assistant"} + if thought: + item["content"] = thought + args: dict[str, Any] = {} + for key in ("command", "code", "path", "url", "content", "task_list", "name", "arguments"): + v = safe_get_attr(action, key) + if v not in (None, "", []): + args[key] = _to_jsonable(v) + if action_type or args: + item["tool_calls"] = [ + { + "type": "function", + "function": {"name": action_type or "agent.action", "arguments": args}, + } + ] + return to_json_str([item]) diff --git a/instrumentation-loongsuite/loongsuite-instrumentation-openhands/src/opentelemetry/instrumentation/openhands/internal/v0_wrappers.py b/instrumentation-loongsuite/loongsuite-instrumentation-openhands/src/opentelemetry/instrumentation/openhands/internal/v0_wrappers.py new file mode 100644 index 000000000..93212edd5 --- /dev/null +++ b/instrumentation-loongsuite/loongsuite-instrumentation-openhands/src/opentelemetry/instrumentation/openhands/internal/v0_wrappers.py @@ -0,0 +1,2545 @@ +"""Wrappers for 
the OpenHands **V0** (Legacy CodeAct) architecture. + +Trace tree +---------- + +:: + + ENTRY enter openhands (openhands.core.main.run_controller) + `-- AGENT invoke_agent codeact (openhands.core.loop.run_agent_until_done) + |-- STEP react step [×N] (openhands.controller.agent_controller.AgentController._step) + | `-- LLM chat {model} (litellm — covered by litellm instrumentor) + `-- TOOL execute_tool {tool_name} (openhands.runtime.base.Runtime.run_action) + +Context propagation across threads +---------------------------------- + +OpenHands V0's ``EventStream`` delivers events via ``ThreadPoolExecutor``, +and ``AgentController.on_event`` then runs the actual handler with a +*brand-new* asyncio loop in a worker thread: + +.. code:: python + + asyncio.get_event_loop().run_until_complete(self._on_event(event)) + +Python ``contextvars`` do NOT propagate from the main coroutine into these +worker threads, so ``AgentController._step`` and ``Runtime.run_action`` +would otherwise start *root* spans with fresh ``trace_id``s, fragmenting +the trace into many disconnected pieces. + +To fix that, we use :mod:`session_context` as a process-wide bridge: the +ENTRY wrapper stashes the OTel context (carrying the ENTRY+AGENT span +chain) keyed by session id, and STEP / TOOL wrappers re-attach it before +opening their span. The result is one trace per session id with the +correct parent-child links. + +I/O capture +----------- + +ENTRY / STEP spans set: + +* ``input.value`` and ``output.value`` (OpenInference convention) +* ``input.mime_type`` / ``output.mime_type`` +* ``gen_ai.input.messages`` / ``gen_ai.output.messages`` where the GenAI + semconv applies (LLM-style messages + assistant tool calls) + +AGENT spans set GenAI message attributes without OpenInference +``input.value`` / ``output.value`` mirrors. + +TOOL spans set ``gen_ai.tool.call.arguments`` (always, including ``"{}"`` +when empty) and ``gen_ai.tool.call.result`` for observations. 
They do +not set OpenInference ``input.value`` / ``output.value``. + +Capture is always on and content is emitted untruncated. +""" + +from __future__ import annotations + +import json +import logging +from typing import Any + +from opentelemetry import context as otel_context +from opentelemetry import trace as trace_api +from opentelemetry.semconv._incubating.attributes import ( + gen_ai_attributes as GenAI, +) +from opentelemetry.trace import ( + SpanKind, + Status, + StatusCode, + Tracer, + set_span_in_context, +) + +from opentelemetry.instrumentation.openhands.config import ( + OTEL_INSTRUMENTATION_OPENHANDS_OUTER_SPANS, +) +from opentelemetry.instrumentation.openhands.internal.constants import ( + FRAMEWORK_NAME, + GEN_AI_FRAMEWORK, + GEN_AI_SPAN_KIND, + OH_INITIAL_MESSAGE_PREVIEW, +) +from opentelemetry.instrumentation.openhands.internal.session_context import ( + AttachedSession, + clear_context, + get_context, + get_tool_definition, + store_context, + store_tool_registry, +) +from opentelemetry.instrumentation.openhands.internal.utils import ( + action_to_genai_output, + maybe_preview, + maybe_to_json_str, + messages_to_genai_input, + safe_get_attr, + safe_str, + serialize_message, + to_json_str, +) + +logger = logging.getLogger(__name__) + + +# Constants ----------------------------------------------------------------- + +OH_AGENT_NAME = "openhands.agent.name" +OH_REACT_ROUND = "gen_ai.react.round" +OH_AGENT_STATE = "openhands.agent.state" +OH_RUNTIME_NAME = "openhands.runtime.name" +OH_ACTION_TYPE = "openhands.action.type" +OH_OBSERVATION_TYPE = "openhands.observation.type" +OH_HISTORY_LENGTH = "openhands.history.length" + +# OpenInference / GenAI common I/O attribute keys +INPUT_VALUE = "input.value" +INPUT_MIME = "input.mime_type" +OUTPUT_VALUE = "output.value" +OUTPUT_MIME = "output.mime_type" +GEN_AI_INPUT_MESSAGES = "gen_ai.input.messages" +GEN_AI_OUTPUT_MESSAGES = "gen_ai.output.messages" +GEN_AI_SYSTEM = "gen_ai.system" +GEN_AI_AGENT_ID = 
"gen_ai.agent.id" +GEN_AI_CONVERSATION_ID = "gen_ai.conversation.id" +GEN_AI_SESSION_ID = "gen_ai.session.id" +GEN_AI_REQUEST_MODEL = "gen_ai.request.model" +GEN_AI_SYSTEM_INSTRUCTIONS = "gen_ai.system_instructions" + +# Tool span attributes per ARMS GenAI semconv (gen-ai.md §Tool). +GEN_AI_TOOL_CALL_ID = "gen_ai.tool.call.id" +GEN_AI_TOOL_CALL_ARGUMENTS = "gen_ai.tool.call.arguments" +GEN_AI_TOOL_CALL_RESULT = "gen_ai.tool.call.result" +GEN_AI_TOOL_DESCRIPTION = "gen_ai.tool.description" +GEN_AI_TOOL_DEFINITIONS = "gen_ai.tool.definitions" + +# Stash slots on AgentController instances (set by AgentControllerInitWrapper). +_OWNS_FLAG = "_otel_oh_owns_lifecycle" +_ENTRY_SPAN_ATTR = "_otel_oh_entry_span" +_AGENT_SPAN_ATTR = "_otel_oh_agent_span" +_ENTRY_TOKEN_ATTR = "_otel_oh_entry_token" +_AGENT_TOKEN_ATTR = "_otel_oh_agent_token" +# STEP persistence — keeps the *most-recent* STEP span alive across the +# return of ``_step`` so that ``Runtime.run_action`` (which fires *later* +# in a thread-pool executor via ``call_sync_from_async``) can re-attach +# the STEP context and become its child rather than a sibling. +# +# IMPORTANT: we deliberately do **not** stash an OTel attach-token across +# the return of ``_step``. ``otel_context.attach()`` returns a Token that +# is bound to the ``contextvars.Context`` it was created in; calling +# ``detach(token)`` from a *different* context raises ``ValueError`` (and +# in production the Aliyun OTel SDK floods the log with +# "Token was created in a different Context" errors). Attach/detach +# always happen as a balanced pair *inside the same async task*; cross- +# task / cross-thread propagation goes through the ``Context`` *object* +# stashed in :mod:`session_context` and re-attached on the consumer side. 
_STEP_SPAN_ATTR = "_otel_oh_step_span"
_AGENT_CTX_ATTR = "_otel_oh_agent_ctx"  # restore target when STEP closes


def _set_common(span: trace_api.Span, kind: str) -> None:
    """Stamp the span kind plus the framework / system identity on ``span``."""
    span.set_attribute(GEN_AI_SPAN_KIND, kind)
    span.set_attribute(GEN_AI_FRAMEWORK, FRAMEWORK_NAME)
    span.set_attribute(GEN_AI_SYSTEM, FRAMEWORK_NAME)


def _set_io(
    span: trace_api.Span,
    *,
    input_value: str = "",
    output_value: str = "",
    input_messages: str = "",
    output_messages: str = "",
    mime: str = "application/json",
) -> None:
    """Set whichever I/O attributes were supplied; empty strings are skipped.

    ``mime`` is recorded alongside ``input_value`` / ``output_value`` only.
    """
    if input_value:
        span.set_attribute(INPUT_VALUE, input_value)
        span.set_attribute(INPUT_MIME, mime)
    if output_value:
        span.set_attribute(OUTPUT_VALUE, output_value)
        span.set_attribute(OUTPUT_MIME, mime)
    if input_messages:
        span.set_attribute(GEN_AI_INPUT_MESSAGES, input_messages)
    if output_messages:
        span.set_attribute(GEN_AI_OUTPUT_MESSAGES, output_messages)


def _extract_model_from_config(config: Any) -> str:
    """Return the model name found on ``config``, or ``""``.

    Looks at the first entry of ``config.llms`` (when it is a non-empty
    dict) and then at ``config.llm``. Any error while probing is ignored.
    """
    if config is None:
        return ""
    try:
        llms = safe_get_attr(config, "llms")
        if isinstance(llms, dict) and llms:
            llm = next(iter(llms.values()))
            model = safe_get_attr(llm, "model")
            if model:
                return safe_str(model)
    except Exception:
        pass
    try:
        llm = safe_get_attr(config, "llm")
        model = safe_get_attr(llm, "model")
        if model:
            return safe_str(model)
    except Exception:
        pass
    return ""


def _extract_input_message_text(initial_user_action: Any) -> str:
    """Pull human-readable text out of an ``initial_user_action`` argument."""
    return serialize_message(initial_user_action)


def _state_to_input_messages(state: Any, max_messages: int = 10) -> str:
    """Best-effort extract a chat-style messages list from a controller State.

    The actual messages sent to the LLM are built inside ``CodeActAgent.step``
    and not stored on the controller, so this is a coarse summary derived
    from ``state.history`` which is reliably available.

    Returns a JSON list of ``{"role", "content", "event"}`` items for the
    last ``max_messages`` events, or ``""`` when history is not a list.
    """
    history = safe_get_attr(state, "history") or []
    if not isinstance(history, list):
        return ""
    items: list[dict[str, str]] = []
    # Keep the most recent ``max_messages`` events for size budget.
    for ev in history[-max_messages:]:
        cls_name = type(ev).__name__
        if cls_name == "SystemMessageAction":
            # The system prompt is neither a user nor an assistant turn.
            role = "system"
            content = safe_get_attr(ev, "content") or safe_get_attr(ev, "message") or ""
        elif cls_name == "MessageAction":
            # ``source`` may be a plain string or an Enum member; str() of
            # an Enum member is "Class.NAME", never its value, so compare
            # on ``.value`` when the object has one.
            source = safe_get_attr(ev, "source")
            source_name = safe_str(
                safe_get_attr(source, "value", default=source)
            ).lower()
            role = "user" if source_name == "user" else "assistant"
            content = safe_get_attr(ev, "content") or safe_get_attr(ev, "message") or ""
        elif cls_name.endswith("Action"):
            role = "assistant"
            content = (
                safe_get_attr(ev, "thought")
                or safe_get_attr(ev, "command")
                or safe_get_attr(ev, "code")
                or safe_str(ev)
            )
        elif cls_name.endswith("Observation"):
            role = "tool"
            content = safe_get_attr(ev, "content") or safe_str(ev)
        else:
            role = "system"
            content = safe_str(ev)
        items.append({"role": role, "content": safe_str(content), "event": cls_name})
    return to_json_str(items)


def _final_state_to_output(state: Any) -> str:
    """Serialize the controller's final state for output.value.

    The JSON payload holds whichever of ``agent_state``, ``last_error``,
    ``iteration``, ``history_length``, ``final_thought`` and ``outputs``
    could be read. Returns ``""`` when ``state`` is ``None``.
    """
    if state is None:
        return ""
    payload: dict[str, Any] = {}
    agent_state = safe_get_attr(state, "agent_state")
    if agent_state is not None:
        payload["agent_state"] = (
            safe_get_attr(agent_state, "value") or safe_str(agent_state)
        )
    last_error = safe_get_attr(state, "last_error")
    if last_error:
        payload["last_error"] = safe_str(last_error)
    iteration = safe_get_attr(state, "iteration")
    if iteration is not None:
        payload["iteration"] = safe_str(iteration)
    history = safe_get_attr(state, "history") or []
    if isinstance(history, list) and history:
        payload["history_length"] = len(history)
        # The most recent AgentFinishAction supplies the final answer summary.
        for ev in reversed(history):
            if type(ev).__name__ == "AgentFinishAction":
                payload["final_thought"] = safe_str(
                    safe_get_attr(ev, "final_thought")
                    or safe_get_attr(ev, "thought")
                    or ""
                )
                payload["outputs"] = safe_str(safe_get_attr(ev, "outputs") or {})
                break
    return to_json_str(payload)


def _entry_input_messages_from_initial(initial_user_action: Any) -> str:
    """Return ARMS gen_ai.input.messages for the ENTRY span.

    Produces a single user message in the "parts" schema, or ``""`` when
    no text could be extracted.
    """
    text = _extract_input_message_text(initial_user_action)
    if not text:
        return ""
    return to_json_str(
        [{"role": "user", "parts": [{"type": "text", "content": text}]}]
    )


def _entry_io_from_state(state: Any) -> tuple[str, str]:
    """Return (input_messages, output_messages) for ENTRY from final state.

    Both values are JSON strings derived from ``state.history``. When the
    history yields no output messages, the serialized final state is
    wrapped as a single assistant text message instead.
    """
    history = safe_get_attr(state, "history") or []
    input_messages = ""
    output_messages = ""
    if isinstance(history, list) and history:
        input_payload = _history_to_input_messages_schema(history)
        if input_payload:
            input_messages = to_json_str(input_payload)
        output_payload = _history_to_output_messages_schema(history)
        if output_payload:
            output_messages = to_json_str(output_payload)
    if not output_messages:
        final_state = _final_state_to_output(state)
        if final_state:
            output_messages = to_json_str(
                [
                    {
                        "role": "assistant",
                        "parts": [{"type": "text", "content": final_state}],
                        "finish_reason": "stop",
                    }
                ]
            )
    return input_messages, output_messages


# ---------------------------------------------------------------------------
# ARMS GenAI semconv message-schema converters.
#
# Per gen-ai.md §LLM/§Agent, gen_ai.input.messages / gen_ai.output.messages
# / gen_ai.system_instructions follow a "parts"-based structure:
#
#   [{"role": "user|assistant|tool|system",
#     "parts": [{"type": "text|tool_call|tool_call_response|...",
#                "content": "...", "name": "...", "id": "...",
#                "arguments": {...}, "result": "..."}],
#     "finish_reason": "stop|...",      # output only
#   }]
#
# The system instructions schema is a flat list of parts:
#
#   [{"type": "text", "content": "..."}]
# ---------------------------------------------------------------------------


def _lookup(obj: Any, name: str) -> Any:
    """Read *name* from a dict key or an object attribute; ``None`` if absent."""
    if obj is None:
        return None
    if isinstance(obj, dict):
        return obj.get(name)
    return getattr(obj, name, None)


def _event_source_name(ev: Any) -> str:
    """Return the lower-cased source of an event (``"user"``, ``"agent"``, ...).

    ``source`` may be a plain string or an Enum member. str() of an Enum
    member is "Class.NAME" rather than its value, so ``.value`` is used
    when the object has one.
    """
    source = safe_get_attr(ev, "source")
    if source is None:
        return ""
    return safe_str(safe_get_attr(source, "value", default=source)).lower()


def _harvest_tool_call_arguments(tcm: Any, tcid: str) -> dict[str, Any]:
    """Best-effort recover the LLM-emitted tool-call arguments as a new dict.

    Walks ``tcm.model_response.choices[*].message.tool_calls`` and, when
    ``tcid`` is non-empty, considers only the call with that id. If
    several calls qualify the last one wins. Always returns a dict the
    caller may mutate; any failure yields ``{}``.
    """
    args: dict[str, Any] = {}
    try:
        response = safe_get_attr(tcm, "model_response")
        for choice in _lookup(response, "choices") or []:
            message = _lookup(choice, "message")
            for call in _lookup(message, "tool_calls") or []:
                if tcid and safe_str(_lookup(call, "id")) != tcid:
                    continue
                raw = _lookup(_lookup(call, "function"), "arguments")
                if isinstance(raw, str):
                    try:
                        parsed = json.loads(raw)
                    except (ValueError, RecursionError):
                        args = {"raw": raw}
                    else:
                        if isinstance(parsed, dict):
                            args = parsed
                        elif parsed:
                            # Valid JSON but not an object (e.g. a list):
                            # keep the original text instead of dropping it.
                            args = {"raw": raw}
                        else:
                            # "null", "[]", '""', "0": nothing usable.
                            args = {}
                elif isinstance(raw, dict):
                    # Copy so later additions never touch the model response.
                    args = dict(raw)
    except Exception:
        args = {}
    return args


def _action_event_to_parts(ev: Any) -> list[dict[str, Any]]:
    """Convert an Action event into a list of ``parts`` for AGENT messages.

    Captures both the model's "thought" text and any ``tool_call`` part
    derived from ``tool_call_metadata``. When the original arguments
    cannot be recovered, well-known attributes of the event itself
    (``command``, ``code``, ``path``, ...) are used instead.
    """
    parts: list[dict[str, Any]] = []
    thought = safe_get_attr(ev, "thought")
    if thought:
        parts.append({"type": "text", "content": safe_str(thought)})
    tcm = safe_get_attr(ev, "tool_call_metadata")
    if tcm is not None:
        fn_name = safe_str(safe_get_attr(tcm, "function_name") or "")
        tcid = safe_str(safe_get_attr(tcm, "tool_call_id") or "")
        args = _harvest_tool_call_arguments(tcm, tcid)
        if not args:
            for key in (
                "command",
                "code",
                "path",
                "url",
                "content",
                "task_list",
                "old_str",
                "new_str",
                "file_text",
            ):
                v = safe_get_attr(ev, key)
                if v not in (None, "", [], {}):
                    args[key] = v
        if fn_name or tcid or args:
            parts.append(
                {
                    "type": "tool_call",
                    "id": tcid,
                    "name": fn_name or safe_str(safe_get_attr(ev, "action") or ""),
                    "arguments": args,
                }
            )
    if not parts:
        # Minimal fallback when nothing else could be extracted.
        action_type = safe_str(safe_get_attr(ev, "action") or "")
        if action_type:
            parts.append({"type": "tool_call", "name": action_type, "arguments": {}})
    return parts


def _observation_event_to_parts(ev: Any) -> list[dict[str, Any]]:
    """Convert an Observation event into ``parts`` for tool-response messages.

    The result is a dict of the non-empty well-known fields, or the
    event's string form when none of them is set.
    """
    tcm = safe_get_attr(ev, "tool_call_metadata")
    tcid = safe_str(safe_get_attr(tcm, "tool_call_id") or "") if tcm else ""
    result_payload: dict[str, Any] = {}
    for key in ("content", "exit_code", "error", "stdout", "stderr", "url"):
        v = safe_get_attr(ev, key)
        if v not in (None, "", [], {}):
            result_payload[key] = v
    return [
        {
            "type": "tool_call_response",
            "id": tcid,
            "result": result_payload or safe_str(ev),
        }
    ]


def _history_to_input_messages_schema(
    history: list, max_events: int = 200
) -> list[dict[str, Any]]:
    """Convert ``state.history`` into the ARMS gen_ai.input.messages schema.

    Only the last ``max_events`` events are considered. Adjacent events
    with the same role are folded into a single message with multiple
    ``parts``. ``SystemMessageAction`` events are omitted because the
    system prompt is reported under gen_ai.system_instructions.
    """
    if not history:
        return []
    messages: list[dict[str, Any]] = []
    for ev in history[-max_events:]:
        cls = type(ev).__name__
        if cls == "SystemMessageAction":
            continue
        if cls == "MessageAction":
            role = "user" if _event_source_name(ev) == "user" else "assistant"
            content = safe_str(safe_get_attr(ev, "content") or "")
            parts = [{"type": "text", "content": content}]
        elif cls.endswith("Observation"):
            role = "tool"
            parts = _observation_event_to_parts(ev)
        elif cls.endswith("Action"):
            role = "assistant"
            parts = _action_event_to_parts(ev)
        else:
            role = "system"
            parts = [{"type": "text", "content": safe_str(ev)}]
        # Fold consecutive same-role messages.
        if messages and messages[-1]["role"] == role:
            messages[-1]["parts"].extend(parts)
        else:
            messages.append({"role": role, "parts": parts})
    return messages


def _history_to_output_messages_schema(history: list) -> list[dict[str, Any]]:
    """Pull the *final* assistant turn from history per ARMS gen_ai.output.messages.

    Walks back from the end of history and collects assistant-side events
    (Actions, including non-user messages) until the previous user message
    or Observation. When nothing is collected, the very last event is used.
    Returns one assistant message whose ``finish_reason`` is ``"stop"``.
    """
    if not history:
        return []
    # No other finish reason can be derived from the history.
    finish_reason = "stop"
    tail_actions: list[Any] = []
    for ev in reversed(history):
        cls = type(ev).__name__
        # Stop once we cross back into tool-result or user-input territory.
        if cls.endswith("Observation"):
            break
        if cls == "MessageAction" and _event_source_name(ev) == "user":
            break
        if cls.endswith("Action"):
            tail_actions.append(ev)
    tail_actions.reverse()  # restore chronological order
    if not tail_actions:
        # Fallback: at least include the very last event as the assistant turn.
        tail_actions = [history[-1]]
    parts: list[dict[str, Any]] = []
    for ev in tail_actions:
        cls = type(ev).__name__
        if cls == "MessageAction":
            content = safe_str(safe_get_attr(ev, "content") or "")
            if content:
                parts.append({"type": "text", "content": content})
        elif cls == "AgentFinishAction":
            ft = safe_str(safe_get_attr(ev, "final_thought") or "")
            if ft:
                parts.append({"type": "text", "content": ft})
            outputs = safe_get_attr(ev, "outputs")
            if outputs:
                parts.append({"type": "text", "content": safe_str(outputs)})
        else:
            parts.extend(_action_event_to_parts(ev))
    if not parts:
        parts = [{"type": "text", "content": ""}]
    return [{"role": "assistant", "parts": parts, "finish_reason": finish_reason}]


def _agent_to_system_instructions(agent: Any, state: Any) -> list[dict[str, Any]]:
    """Return ARMS gen_ai.system_instructions for the controller's agent.

    Tries the explicit ``agent.get_system_message()`` API first (most
    accurate), then falls back to scanning ``state.history`` for a
    ``SystemMessageAction``. Returns ``[]`` when no content is found.
    """
    content = ""
    try:
        gsm = safe_get_attr(agent, "get_system_message")
        if callable(gsm):
            sm = gsm()
            content = safe_str(safe_get_attr(sm, "content") or "")
    except Exception:
        content = ""
    if not content:
        history = safe_get_attr(state, "history") or []
        if isinstance(history, list):
            for ev in history:
                if type(ev).__name__ == "SystemMessageAction":
                    content = safe_str(safe_get_attr(ev, "content") or "")
                    if content:
                        break
    if not content:
        return []
    return [{"type": "text", "content": content}]
+ + Stashes the active OTel Context (with the ENTRY span attached) keyed + by ``sid`` so STEP / TOOL spans firing in worker threads can re-attach + it and remain in the same trace. + """ + + __slots__ = ("_tracer",) + + def __init__(self, tracer: Tracer): + self._tracer = tracer + + def __call__(self, wrapped, instance, args, kwargs): + return self._impl(wrapped, instance, args, kwargs) + + async def _impl(self, wrapped, instance, args, kwargs): + if not OTEL_INSTRUMENTATION_OPENHANDS_OUTER_SPANS: + return await wrapped(*args, **kwargs) + + config = kwargs.get("config") + if config is None and args: + config = args[0] + initial_user_action = kwargs.get("initial_user_action") + if initial_user_action is None and len(args) >= 2: + initial_user_action = args[1] + sid = kwargs.get("sid") + if sid is None and len(args) >= 3: + sid = args[2] + # When sid wasn't passed, we don't yet know the auto-generated one; + # the controller will publish ``controller.id`` later. We update + # the stash again from inside the AGENT wrapper. 
+ + span = self._tracer.start_span("enter openhands", kind=SpanKind.INTERNAL) + _set_common(span, "ENTRY") + span.set_attribute(GenAI.GEN_AI_OPERATION_NAME, "enter") + if sid: + span.set_attribute(GEN_AI_SESSION_ID, safe_str(sid)) + span.set_attribute(GEN_AI_CONVERSATION_ID, safe_str(sid)) + model = _extract_model_from_config(config) + if model: + span.set_attribute(GEN_AI_REQUEST_MODEL, model) + + input_text = _extract_input_message_text(initial_user_action) + preview = maybe_preview(input_text) + if preview: + span.set_attribute(OH_INITIAL_MESSAGE_PREVIEW, preview) + captured_input = ( + maybe_to_json_str({"role": "user", "content": input_text}) + if input_text + else "" + ) + if captured_input: + entry_input_messages = _entry_input_messages_from_initial( + initial_user_action + ) + _set_io( + span, + input_value=captured_input, + input_messages=entry_input_messages, + ) + + ctx = set_span_in_context(span) + token = otel_context.attach(ctx) + if sid: + store_context(sid, ctx) + try: + try: + result = await wrapped(*args, **kwargs) + except BaseException as exc: + span.record_exception(exc) + span.set_status(Status(StatusCode.ERROR, type(exc).__qualname__)) + raise + try: + final_state_repr = _final_state_to_output(result) + entry_input_messages, entry_output_messages = _entry_io_from_state( + result + ) + if final_state_repr: + _set_io( + span, + output_value=final_state_repr, + input_messages=entry_input_messages, + output_messages=entry_output_messages, + ) + agent_state = safe_get_attr(result, "agent_state") + if agent_state is not None: + span.set_attribute( + OH_AGENT_STATE, + safe_get_attr(agent_state, "value") or safe_str(agent_state), + ) + elif entry_input_messages or entry_output_messages: + _set_io( + span, + input_messages=entry_input_messages, + output_messages=entry_output_messages, + ) + except Exception: + pass + return result + finally: + try: + otel_context.detach(token) + except Exception: + pass + if sid: + clear_context(sid) + span.end() + + 
+# --------------------------------------------------------------------------- +# AGENT: openhands.core.loop.run_agent_until_done +# --------------------------------------------------------------------------- + + +class RunAgentUntilDoneWrapper: + """AGENT span around the V0 polling loop. + + Re-attaches the ENTRY context (in case asyncio task creation didn't + propagate it for some reason) and re-stashes a fresh context that now + also includes the AGENT span — that's what STEP / TOOL re-attach. + """ + + __slots__ = ("_tracer",) + + def __init__(self, tracer: Tracer): + self._tracer = tracer + + def __call__(self, wrapped, instance, args, kwargs): + return self._impl(wrapped, instance, args, kwargs) + + async def _impl(self, wrapped, instance, args, kwargs): + if not OTEL_INSTRUMENTATION_OPENHANDS_OUTER_SPANS: + return await wrapped(*args, **kwargs) + + controller = kwargs.get("controller") + if controller is None and args: + controller = args[0] + agent = safe_get_attr(controller, "agent") + agent_name = safe_get_attr(agent, "name") or "codeact" + agent_class = ( + f"{type(agent).__module__}.{type(agent).__name__}" if agent else "" + ) + sid = safe_str(safe_get_attr(controller, "id") or "") + llm = safe_get_attr(agent, "llm") + llm_config = safe_get_attr(llm, "config") + model = safe_get_attr(llm_config, "model") or safe_get_attr(llm, "model") + + # If AgentController.__init__ already opened lifecycle-bound ENTRY+AGENT + # spans, do not create a second AGENT here. Just run the loop with the + # existing AGENT context current so STEP/LLM/TOOL remain descendants. 
+ lifecycle_agent_span = getattr(controller, _AGENT_SPAN_ATTR, None) + lifecycle_agent_ctx = getattr(controller, _AGENT_CTX_ATTR, None) + if lifecycle_agent_span is not None and lifecycle_agent_ctx is not None: + try: + _capture_agent_io_attributes( + lifecycle_agent_span, + controller, + agent, + safe_get_attr(controller, "state"), + ) + except Exception: + pass + lifecycle_token = otel_context.attach(lifecycle_agent_ctx) + try: + return await wrapped(*args, **kwargs) + except BaseException as exc: + try: + lifecycle_agent_span.record_exception(exc) + lifecycle_agent_span.set_status( + Status(StatusCode.ERROR, type(exc).__qualname__) + ) + except Exception: + pass + raise + finally: + try: + state = safe_get_attr(controller, "state") + _capture_agent_io_attributes( + lifecycle_agent_span, controller, agent, state + ) + history = safe_get_attr(state, "history") or [] + if isinstance(history, list): + lifecycle_agent_span.set_attribute(OH_HISTORY_LENGTH, len(history)) + except Exception: + pass + try: + otel_context.detach(lifecycle_token) + except Exception: + pass + + # Bridge: re-attach whatever the ENTRY wrapper stashed (works even + # if asyncio.create_task somehow lost the context, and is the only + # way for the worker-thread STEP / TOOL spans to find us). 
+ attach_ctx = get_context(sid) + fallback_entry_span: trace_api.Span | None = None + if attach_ctx is None: + fallback_entry_span = self._tracer.start_span( + "enter openhands", kind=SpanKind.INTERNAL + ) + _set_common(fallback_entry_span, "ENTRY") + fallback_entry_span.set_attribute(GenAI.GEN_AI_OPERATION_NAME, "enter") + if sid: + fallback_entry_span.set_attribute(GEN_AI_SESSION_ID, sid) + fallback_entry_span.set_attribute(GEN_AI_CONVERSATION_ID, sid) + if agent_class: + fallback_entry_span.set_attribute(OH_AGENT_NAME, agent_class) + if model: + fallback_entry_span.set_attribute(GEN_AI_REQUEST_MODEL, safe_str(model)) + try: + state = safe_get_attr(controller, "state") + entry_input_messages, _ = _entry_io_from_state(state) + if entry_input_messages: + _set_io( + fallback_entry_span, + input_value=entry_input_messages, + input_messages=entry_input_messages, + ) + except Exception: + pass + attach_ctx = set_span_in_context(fallback_entry_span) + if sid: + store_context(sid, attach_ctx) + if attach_ctx is not None: + attach_token = otel_context.attach(attach_ctx) + else: + attach_token = None + + try: + span = self._tracer.start_span( + f"invoke_agent {agent_name}", + kind=SpanKind.INTERNAL, + context=attach_ctx, + ) + _set_common(span, "AGENT") + span.set_attribute( + GenAI.GEN_AI_OPERATION_NAME, + GenAI.GenAiOperationNameValues.INVOKE_AGENT.value, + ) + span.set_attribute(GenAI.GEN_AI_AGENT_NAME, safe_str(agent_name)) + if agent_class: + span.set_attribute(OH_AGENT_NAME, agent_class) + if sid: + span.set_attribute(GEN_AI_SESSION_ID, sid) + span.set_attribute(GEN_AI_CONVERSATION_ID, sid) + span.set_attribute(GEN_AI_AGENT_ID, sid) + if model: + span.set_attribute(GEN_AI_REQUEST_MODEL, safe_str(model)) + + # Capture the agent's tool registry so the TOOL wrapper (which + # only sees a Runtime instance) can resolve tool descriptions + # and produce ``gen_ai.tool.description``. 
Also emit + # ``gen_ai.tool.definitions`` on this AGENT span itself per the + # ARMS GenAI semconv §Agent — minimal {type,name} entries by + # default; full definitions only when content capture is on. + try: + tools = safe_get_attr(agent, "tools") or [] + if sid: + store_tool_registry(sid, tools) + tool_defs_summary: list[dict[str, Any]] = [] + for t in tools: + if isinstance(t, dict): + kind = t.get("type") or "function" + fn = t.get("function") or {} + name = fn.get("name") if isinstance(fn, dict) else None + else: + kind = safe_get_attr(t, "type") or "function" + fn = safe_get_attr(t, "function") + name = safe_get_attr(fn, "name") + if not name: + continue + item: dict[str, Any] = {"type": safe_str(kind), "name": safe_str(name)} + if isinstance(fn, dict): + desc = fn.get("description") + params = fn.get("parameters") + else: + desc = safe_get_attr(fn, "description") + params = safe_get_attr(fn, "parameters") + if desc: + item["description"] = safe_str(desc) + if params: + item["parameters"] = params + tool_defs_summary.append(item) + if tool_defs_summary: + span.set_attribute( + GEN_AI_TOOL_DEFINITIONS, to_json_str(tool_defs_summary) + ) + except Exception: + pass + + # Capture initial user/system context for AGENT using the same + # ARMS message schema as the lifecycle-bound AGENT path. + try: + state = safe_get_attr(controller, "state") + _capture_agent_io_attributes(span, controller, agent, state) + except Exception: + pass + + # Stash the context that now includes the AGENT span so STEP / + # TOOL re-attach correctly even when running in worker threads. + ctx_with_agent = set_span_in_context(span) + if sid: + store_context(sid, ctx_with_agent) + # Mirror onto the controller too — STEP wrapper uses this when + # closing a STEP to restore the session stash to AGENT instead + # of leaving a dangling closed-STEP context behind. 
+ if controller is not None: + try: + setattr(controller, _AGENT_CTX_ATTR, ctx_with_agent) + setattr(controller, _AGENT_SPAN_ATTR, span) + except Exception: + pass + if getattr(controller, _STEP_SPAN_ATTR, None) is None: + try: + warmup_step = self._tracer.start_span( + "react step", + kind=SpanKind.INTERNAL, + context=ctx_with_agent, + ) + _set_common(warmup_step, "STEP") + warmup_step.set_attribute(GenAI.GEN_AI_OPERATION_NAME, "react") + warmup_step.set_attribute(OH_REACT_ROUND, 1) + warmup_step.set_attribute( + GenAI.GEN_AI_AGENT_NAME, safe_str(agent_name) + ) + if sid: + warmup_step.set_attribute(GEN_AI_SESSION_ID, sid) + warmup_step.set_attribute(GEN_AI_CONVERSATION_ID, sid) + warmup_step.set_attribute(GEN_AI_AGENT_ID, sid) + setattr(controller, _STEP_SPAN_ATTR, warmup_step) + setattr(controller, "_otel_oh_round", 1) + setattr(controller, "_otel_oh_step_consumed", False) + if sid: + store_context(sid, set_span_in_context(warmup_step)) + except Exception: + pass + agent_token = otel_context.attach(ctx_with_agent) + try: + try: + result = await wrapped(*args, **kwargs) + except BaseException as exc: + span.record_exception(exc) + span.set_status( + Status(StatusCode.ERROR, type(exc).__qualname__) + ) + raise + # Capture final AGENT I/O using ARMS gen_ai.* message attrs. 
+ try: + state = safe_get_attr(controller, "state") + _capture_agent_io_attributes(span, controller, agent, state) + if state is not None: + agent_state = safe_get_attr(state, "agent_state") + if agent_state is not None: + span.set_attribute( + OH_AGENT_STATE, + safe_get_attr(agent_state, "value") + or safe_str(agent_state), + ) + history = safe_get_attr(state, "history") or [] + if isinstance(history, list): + span.set_attribute(OH_HISTORY_LENGTH, len(history)) + except Exception: + pass + return result + finally: + try: + otel_context.detach(agent_token) + except Exception: + pass + if controller is not None: + try: + if getattr(controller, _AGENT_SPAN_ATTR, None) is span: + setattr(controller, _AGENT_SPAN_ATTR, None) + except Exception: + pass + try: + _close_open_step(controller) + except Exception: + pass + span.end() + finally: + if attach_token is not None: + try: + otel_context.detach(attach_token) + except Exception: + pass + if fallback_entry_span is not None: + try: + state = safe_get_attr(controller, "state") + output_repr = _final_state_to_output(state) + entry_input_messages, entry_output_messages = _entry_io_from_state( + state + ) + if output_repr: + _set_io( + fallback_entry_span, + output_value=output_repr, + input_messages=entry_input_messages, + output_messages=entry_output_messages, + ) + elif entry_input_messages or entry_output_messages: + _set_io( + fallback_entry_span, + input_messages=entry_input_messages, + output_messages=entry_output_messages, + ) + history = safe_get_attr(state, "history") or [] + if isinstance(history, list): + fallback_entry_span.set_attribute(OH_HISTORY_LENGTH, len(history)) + except Exception: + pass + try: + fallback_entry_span.end() + except Exception: + pass + if sid: + try: + clear_context(sid) + except Exception: + pass + + +# --------------------------------------------------------------------------- +# STEP: AgentController._step +# --------------------------------------------------------------------------- 
def _close_open_step(controller: Any) -> None:
    """End the STEP span currently recorded on ``controller``, if there is one.

    The span is read from the ``_STEP_SPAN_ATTR`` attribute; when none is
    set the function does nothing. Otherwise it ends the span, resets the
    attribute to ``None`` and — if the controller has both an ``id`` and a
    context stored under ``_AGENT_CTX_ATTR`` — re-stores that AGENT context
    for the session via ``store_context``.

    Failures while ending the span or resetting the attribute are ignored.
    This function only ends a span; it does not attach or detach any
    context token.
    """
    open_step = getattr(controller, _STEP_SPAN_ATTR, None)
    if open_step is not None:
        try:
            open_step.end()
        except Exception:
            pass
        try:
            setattr(controller, _STEP_SPAN_ATTR, None)
        except Exception:
            pass
        session_id = safe_str(safe_get_attr(controller, "id") or "")
        agent_context = getattr(controller, _AGENT_CTX_ATTR, None)
        if not session_id or agent_context is None:
            return
        # Point the session stash back at AGENT now that STEP is closed.
        store_context(session_id, agent_context)
+ """ + + __slots__ = ("_tracer",) + + def __init__(self, tracer: Tracer): + self._tracer = tracer + + def __call__(self, wrapped, instance, args, kwargs): + return self._impl(wrapped, instance, args, kwargs) + + @staticmethod + def _will_step_be_noop(instance: Any) -> bool: + """Return True if this ``_step`` call will short-circuit without + producing real work (state != RUNNING, or a pending action is + already queued). We skip span emission for these so the round + counter stays sequential (1, 2, 3, ...) instead of inflating to + (1, 3, 5, ...) with empty 0.5ms STEP spans cluttering the trace. + + This mirrors the early-return checks at the top of + ``AgentController._step`` (state-check + ``_pending_action``). + We read ``_pending_action_info`` directly rather than going + through the ``_pending_action`` *property* — the property has + logging side effects (it can emit a "pending action active for + Xs" log line at warn-level) that we don't want to trigger from + an instrumentation hot path. + """ + try: + state = safe_get_attr(instance, "state") + agent_state = safe_get_attr(state, "agent_state") + # AgentState enum value is 'running' (case-insensitive). + agent_state_str = ( + safe_str(safe_get_attr(agent_state, "value") or agent_state).lower() + ) + if agent_state_str != "running": + return True + # Check the underlying tuple slot, not the property — the + # property's getter is non-trivial in OpenHands. + if getattr(instance, "_pending_action_info", None) is not None: + return True + except Exception: + return False + return False + + @staticmethod + def _snapshot_for_work_detection(instance: Any) -> tuple[int, Any]: + """Snapshot the bits we need to tell whether ``_step`` body did + anything. Returned tuple is (history_length, pending_action_id). + Used by ``_impl`` to detect "empty" STEP invocations that get + through ``_will_step_be_noop`` (e.g. 
``state_tracker`` raised, + ``_is_stuck`` early-returned, ``agent.step`` returned ``None``) + and shouldn't show up in the trace as 0.3ms placeholder spans. + """ + try: + state = safe_get_attr(instance, "state") + history = safe_get_attr(state, "history") + history_len = len(history) if isinstance(history, list) else 0 + except Exception: + history_len = 0 + try: + info = getattr(instance, "_pending_action_info", None) + pending_id = id(info) if info is not None else None + except Exception: + pending_id = None + return history_len, pending_id + + async def _impl(self, wrapped, instance, args, kwargs): + if not OTEL_INSTRUMENTATION_OPENHANDS_OUTER_SPANS: + return await wrapped(*args, **kwargs) + + # Skip no-op _step invocations entirely so the trace shows only + # the rounds that actually do work (LLM call + tool dispatch). + if self._will_step_be_noop(instance): + return await wrapped(*args, **kwargs) + + sid = safe_str(safe_get_attr(instance, "id") or "") + agent = safe_get_attr(instance, "agent") + agent_name = safe_get_attr(agent, "name") or "codeact" + + # Snapshot the AGENT context if we don't already have one so + # ``_close_open_step`` can restore the session stash to AGENT + # after STEP ends. + if not hasattr(instance, _AGENT_CTX_ATTR) or getattr(instance, _AGENT_CTX_ATTR, None) is None: + try: + setattr(instance, _AGENT_CTX_ATTR, get_context(sid)) + except Exception: + pass + + # ----- Reuse warmup STEP if not yet consumed ----- + # The init wrapper opens a warmup STEP (round 1) so pre-step + # actions like RECALL parent under STEP 1. The first real + # ``_step`` reuses that STEP (without bumping the round) so the + # LLM call + first LLM-driven tool also nest under STEP 1. From + # the second real ``_step`` onward, we close the previous STEP + # and open a new one with round = previous + 1. 
+ existing_step = getattr(instance, _STEP_SPAN_ATTR, None) + consumed = bool(getattr(instance, "_otel_oh_step_consumed", True)) + reused_warmup = False + is_new_span = False + if existing_step is not None and not consumed: + span = existing_step + round_num = int(getattr(instance, "_otel_oh_round", 1) or 1) + reused_warmup = True + try: + setattr(instance, "_otel_oh_step_consumed", True) + except Exception: + pass + else: + # Close any still-open consumed STEP from the previous round + # before opening a new one. + _close_open_step(instance) + # Tentative round number — only committed if body does work. + round_num = int(getattr(instance, "_otel_oh_round", 0) or 0) + 1 + + # Open the new STEP as a child of AGENT. Prefer the explicit + # AGENT context (more reliable than relying on contextvars + # propagation across asyncio task / thread boundaries). + agent_ctx = getattr(instance, _AGENT_CTX_ATTR, None) + if agent_ctx is None and sid: + agent_ctx = get_context(sid) + try: + span = self._tracer.start_span( + "react step", + kind=SpanKind.INTERNAL, + context=agent_ctx, + ) + except Exception: + # Fall back to current-context-based parenting if explicit + # context= isn't accepted (older OTel SDKs). + with AttachedSession(sid): + span = self._tracer.start_span( + "react step", kind=SpanKind.INTERNAL + ) + _set_common(span, "STEP") + span.set_attribute(GenAI.GEN_AI_OPERATION_NAME, "react") + span.set_attribute(OH_REACT_ROUND, round_num) + span.set_attribute(GenAI.GEN_AI_AGENT_NAME, safe_str(agent_name)) + if sid: + span.set_attribute(GEN_AI_SESSION_ID, sid) + span.set_attribute(GEN_AI_CONVERSATION_ID, sid) + span.set_attribute(GEN_AI_AGENT_ID, sid) + is_new_span = True + try: + setattr(instance, _STEP_SPAN_ATTR, span) + setattr(instance, "_otel_oh_step_consumed", True) + except Exception: + try: + span.end() + except Exception: + pass + return await wrapped(*args, **kwargs) + + # Capture INPUT: messages going into this step. 
+ try: + state = safe_get_attr(instance, "state") + history = safe_get_attr(state, "history") or [] + if isinstance(history, list): + span.set_attribute(OH_HISTORY_LENGTH, len(history)) + input_messages = _state_to_input_messages(state) + if input_messages: + _set_io( + span, + input_value=input_messages, + input_messages=input_messages, + ) + except Exception: + pass + + # Build the STEP context object. Cross-thread propagation goes + # through this Context object stashed in session_context (TOOL / + # LLM wrappers re-attach it inside their own scopes with paired + # attach/detach so no token ever crosses a context boundary). + step_ctx = set_span_in_context(span) + if sid: + store_context(sid, step_ctx) + + # Snapshot pre-body state so we can detect "empty" body that + # got through ``_will_step_be_noop`` (e.g. ``state_tracker`` + # raised inside ``_step``, ``_is_stuck`` early-returned, or + # ``agent.step`` returned ``None`` / raised handled error). + pre_history_len, pre_pending_id = self._snapshot_for_work_detection( + instance + ) + + # Attach STEP for the *body's* contextvars propagation only. + # Both attach and the matching detach happen in this coroutine's + # own context, so the Aliyun SDK's strict token check is happy. + step_token = otel_context.attach(step_ctx) + body_error: BaseException | None = None + try: + result = await wrapped(*args, **kwargs) + except BaseException as exc: + body_error = exc + finally: + try: + otel_context.detach(step_token) + except Exception: + pass + + if body_error is not None: + try: + span.set_attribute( + "gen_ai.react.finish_reason", type(body_error).__qualname__ + ) + span.record_exception(body_error) + span.set_status( + Status(StatusCode.ERROR, type(body_error).__qualname__) + ) + except Exception: + pass + # On error, close STEP now so the failure surfaces cleanly + # rather than waiting for the next _step / controller close. 
+ _close_open_step(instance) + # Make sure the round counter we *tentatively* assigned for + # this STEP gets committed so subsequent rounds renumber + # past it instead of overlapping. + if is_new_span: + try: + instance._otel_oh_round = round_num + except Exception: + pass + raise body_error + + # Detect post-body "empty" STEP — the wrapper passed the + # ``_will_step_be_noop`` pre-check but the body still produced + # zero observable work (no new history events, no new pending + # action). The user has explicitly asked us not to clutter the + # trace with sub-millisecond placeholder STEP spans, so: + # + # * If we *opened* a fresh span this round, end it immediately, + # mark it ``openhands.step.empty=true``, and DO NOT bump the + # committed round counter. Next real _step opens a fresh STEP + # with the same round number — the empty span still appears + # in the trace (we have no way to suppress export from inside + # a wrapper), but with a clear ``empty=true`` marker so it's + # trivially filterable in the dashboard. + # * If we *reused* a warmup / persisted STEP that was already + # meaningful (had earlier RECALL/TOOL children), keep it open + # and don't mark it empty — the children give it value. + post_history_len, post_pending_id = self._snapshot_for_work_detection( + instance + ) + did_work = ( + post_history_len > pre_history_len + or (post_pending_id is not None and post_pending_id != pre_pending_id) + ) + + if not did_work and is_new_span: + try: + span.set_attribute("openhands.step.empty", True) + span.set_attribute( + "gen_ai.react.finish_reason", "noop_step_body" + ) + span.end() + except Exception: + pass + # Forget this empty STEP so the next _step opens a fresh one + # without trying to close-or-reuse this one. + try: + if getattr(instance, _STEP_SPAN_ATTR, None) is span: + setattr(instance, _STEP_SPAN_ATTR, None) + except Exception: + pass + try: + # Roll back to the previous committed round (don't + # advance the counter for an empty STEP). 
+ instance._otel_oh_round = round_num - 1 + instance._otel_oh_step_consumed = True + except Exception: + pass + # Restore session stash to AGENT so subsequent TOOLs land + # under AGENT (not under a now-ended STEP). + if sid: + agent_ctx = getattr(instance, _AGENT_CTX_ATTR, None) + if agent_ctx is not None: + try: + store_context(sid, agent_ctx) + except Exception: + pass + return result + + # Body did work — commit the round counter (we only update it + # *after* we're sure the STEP is meaningful). + if is_new_span: + try: + instance._otel_oh_round = round_num + except Exception: + pass + + # Capture OUTPUT: the freshly-decided pending action. + try: + pending = getattr(instance, "_pending_action", None) + state = safe_get_attr(instance, "state") + agent_state = safe_get_attr(state, "agent_state") + if agent_state is not None: + span.set_attribute( + OH_AGENT_STATE, + safe_get_attr(agent_state, "value") + or safe_str(agent_state), + ) + if pending is not None: + action_type = _action_type_value(pending) + if action_type: + span.set_attribute(OH_ACTION_TYPE, action_type) + out = action_to_genai_output(pending) + if out: + _set_io(span, output_value=out, output_messages=out) + except Exception: + pass + + # Mirror the latest history snapshot back up to the AGENT span + # so AGENT's GenAI message attributes stay current during the run + # (not just at close-time). Downstream dashboards may read AGENT + # before the controller actually closes. + try: + agent_span = getattr(instance, _AGENT_SPAN_ATTR, None) + if agent_span is not None: + _capture_agent_io_attributes( + agent_span, instance, agent, safe_get_attr(instance, "state") + ) + except Exception: + pass + + # Mark the warmup STEP (round 1) the moment we know it carries + # real work — it now contains LLM/TOOL children and matters. 
+ if reused_warmup: + try: + span.set_attribute("openhands.step.warmup_consumed", True) + except Exception: + pass + + # STEP span stays open here — it lives until the next _step (or + # AgentController.close) ends it. Until then any TOOL fired by + # Runtime.run_action on a thread-pool worker will re-attach the + # STEP context object stashed above and become its child. + return result + + +# --------------------------------------------------------------------------- +# TOOL: Runtime.run_action +# --------------------------------------------------------------------------- + + +_TOOL_KIND_TO_NAME: dict[str, str] = { + "run": "bash", + "run_ipython": "ipython", + "browse_interactive": "browser", + "browse": "browser", + "edit": "str_replace_editor", + "read": "file_read", + "write": "file_write", + "delegate": "delegate", + "finish": "finish", + "think": "think", + "task_tracking": "task_tracker", + "mcp": "mcp", + "send_message": "send_message", + # ``recall`` is a real (non-LLM-initiated) tool: the controller posts + # a RecallAction and the memory subsystem runs it just like any other + # action via ``Runtime.run_action``. Worth a TOOL span. + "recall": "recall", +} + +# Action types that are *not* real tool calls — they're internal control +# events posted by the controller / event-stream itself (system prompt, +# user message, agent-state transition, no-ops). Emitting TOOL spans for +# these clutters the trace tree and confuses the GenAI semconv (these +# aren't things the LLM "called"). +_INTERNAL_ACTION_TYPES: frozenset[str] = frozenset( + { + "message", + "system", + "change_agent_state", + "agent_state_changed", + "null", + "noop", + } +) + + +def _action_type_value(action: Any) -> str: + """Best-effort extract the canonical action-type string for ``action``. + + OpenHands declares ``ActionType`` as ``class ActionType(str, Enum)`` + with members like ``MESSAGE = 'message'``. Each Action subclass sets + ``action: str = ActionType.MESSAGE``. 
``str(ActionType.MESSAGE)`` + returns ``'ActionType.MESSAGE'`` (Python's default Enum.__str__), + *not* the value ``'message'`` we want for filtering / lookup. This + helper prefers ``.value`` when the attribute is enum-like, else the + raw string. + """ + raw = safe_get_attr(action, "action") + if raw is None: + return "" + val = safe_get_attr(raw, "value") + if val is not None: + return safe_str(val).lower() + text = safe_str(raw).lower() + # ``str(ActionType.MESSAGE)`` → "actiontype.message"; strip the prefix. + prefix = "actiontype." + if text.startswith(prefix): + return text[len(prefix):] + return text + + +def _is_real_tool_call(action: Any) -> bool: + """Return True iff ``action`` represents a meaningful tool execution. + + Filtering rules (in order): + + 1. **Internal action types are *always* dropped** even when the + action carries ``tool_call_metadata``. OpenHands lets the LLM + produce ``MessageAction`` (via the ``send_message`` "tool"), + ``SystemMessageAction``, ``ChangeAgentStateAction`` etc. — those + are coordination signals, not real tool executions, and they + clutter the trace with sub-millisecond noise spans that the user + has explicitly asked us to suppress. + 2. Otherwise, an action qualifies if it has ``tool_call_metadata`` + (i.e. it was produced from an LLM ``tool_calls`` response — e.g. + ``execute_bash``, ``str_replace_editor``), or + 3. Its action-type is in the executable-tool whitelist + (``_TOOL_KIND_TO_NAME``) — this catches synthesized actions like + ``RECALL`` that don't come from the LLM but are still worth + tracing as TOOL spans (memory retrieval, microagent loading, + etc.). + """ + action_type = _action_type_value(action) + # Always drop internal/system actions regardless of how they were + # produced — see rule 1 above. 
class RuntimeRunActionWrapper:
    """TOOL span around ``Runtime.run_action``.

    Opens an ``execute_tool <name>`` span parented explicitly on the
    context stashed for the runtime's session id, records GenAI tool-call
    attributes on it, makes it the current span for the duration of the
    wrapped call, and ends it afterwards. Arguments are always recorded
    in ``gen_ai.tool.call.arguments`` (``"{}"`` when none); results go to
    ``gen_ai.tool.call.result``. No ``input.value`` / ``output.value``.

    Telemetry failures never prevent the wrapped ``run_action`` from
    running: if the span cannot be started the call proceeds
    uninstrumented, and attribute-recording errors are logged at DEBUG
    and otherwise ignored. Exceptions raised by the wrapped call itself
    are recorded on the span and re-raised unchanged.
    """

    __slots__ = ("_tracer",)

    def __init__(self, tracer: Tracer):
        self._tracer = tracer

    def __call__(self, wrapped, instance, args, kwargs):
        """wrapt-style entry point: ``wrapped`` is the original
        ``run_action``, ``instance`` the Runtime it is bound to."""
        if not OTEL_INSTRUMENTATION_OPENHANDS_OUTER_SPANS:
            return wrapped(*args, **kwargs)

        action = args[0] if args else kwargs.get("action")
        # Skip internal control events — system prompts, user messages,
        # agent-state transitions etc. aren't tool calls and shouldn't
        # appear as TOOL spans alongside the real ones. (What counts as
        # "real" is decided entirely by ``_is_real_tool_call``.)
        if not _is_real_tool_call(action):
            return wrapped(*args, **kwargs)

        try:
            tool_name, action_type = _extract_tool_name(action)
            sid = _runtime_sid(instance)
            span = self._start_tool_span(tool_name, sid)
        except Exception as exc:
            # Without a span there is nothing to instrument; the tool
            # call itself must still happen.
            logger.debug(
                "OpenHands instrumentation: failed to start TOOL span: %s",
                exc,
            )
            return wrapped(*args, **kwargs)

        # The TOOL span itself is parented *explicitly* via context= in
        # ``_start_tool_span``. We additionally attach the session context
        # throughout the wrapped call so any nested spans created by the
        # runtime that go through the contextvars propagation path also
        # inherit the right session — and attaching the TOOL span below
        # makes it current so such child spans nest under TOOL, not under
        # its parent.
        with AttachedSession(sid):
            token = None
            try:
                try:
                    self._set_call_attributes(
                        span, instance, action, tool_name, action_type, sid
                    )
                except Exception as exc:
                    logger.debug(
                        "OpenHands instrumentation: TOOL attr setup: %s", exc
                    )
                try:
                    token = otel_context.attach(set_span_in_context(span))
                except Exception as exc:
                    logger.debug(
                        "OpenHands instrumentation: TOOL context attach: %s",
                        exc,
                    )
                try:
                    observation = wrapped(*args, **kwargs)
                except BaseException as exc:
                    span.record_exception(exc)
                    span.set_status(
                        Status(StatusCode.ERROR, type(exc).__qualname__)
                    )
                    raise
                try:
                    _annotate_observation(span, observation)
                except Exception:
                    pass
                return observation
            finally:
                # Always reached once the span exists, so the span cannot
                # be left un-ended by an instrumentation error above.
                if token is not None:
                    try:
                        otel_context.detach(token)
                    except Exception:
                        pass
                span.end()

    def _start_tool_span(self, tool_name: str, sid: str) -> Any:
        """Start the TOOL span, preferring an explicit parent context.

        The context stashed for ``sid`` is passed as ``context=``; if that
        lookup or span creation fails, the span is started inside
        ``AttachedSession(sid)`` instead so it inherits whatever that
        context manager makes current.
        """
        span_name = f"execute_tool {tool_name}"
        try:
            return self._tracer.start_span(
                span_name,
                kind=SpanKind.INTERNAL,
                context=get_context(sid),
            )
        except Exception:
            with AttachedSession(sid):
                return self._tracer.start_span(
                    span_name, kind=SpanKind.INTERNAL
                )

    @staticmethod
    def _set_call_attributes(
        span: Any,
        instance: Any,
        action: Any,
        tool_name: str,
        action_type: str,
        sid: str,
    ) -> None:
        """Record the pre-call GenAI / OpenHands attributes on ``span``.

        Sets span kind, operation name, tool name/type, and — when
        available — tool call id, action type, runtime class, session id,
        tool description, call arguments and a short preview field.
        """
        _set_common(span, "TOOL")
        span.set_attribute(
            GenAI.GEN_AI_OPERATION_NAME,
            GenAI.GenAiOperationNameValues.EXECUTE_TOOL.value,
        )
        span.set_attribute(GenAI.GEN_AI_TOOL_NAME, tool_name)
        span.set_attribute(GenAI.GEN_AI_TOOL_TYPE, "function")

        tool_call_id = _extract_tool_call_id(action)
        if tool_call_id:
            span.set_attribute(GEN_AI_TOOL_CALL_ID, tool_call_id)
        if action_type:
            # ``action_type`` from ``_extract_tool_name`` is the
            # lowercased value, suitable for ``openhands.action.type``.
            span.set_attribute(OH_ACTION_TYPE, action_type)
        runtime_class = (
            f"{type(instance).__module__}.{type(instance).__name__}"
            if instance
            else ""
        )
        if runtime_class:
            span.set_attribute(OH_RUNTIME_NAME, runtime_class)
        if sid:
            span.set_attribute(GEN_AI_SESSION_ID, sid)
            span.set_attribute(GEN_AI_CONVERSATION_ID, sid)

        # gen_ai.tool.description — looked up via the per-sid registry.
        # Registry entries may be dicts or objects, hence both shapes.
        try:
            tool_def = get_tool_definition(sid, tool_name)
            if tool_def is not None:
                if isinstance(tool_def, dict):
                    fn = tool_def.get("function") or {}
                    desc = fn.get("description") if isinstance(fn, dict) else None
                else:
                    fn = safe_get_attr(tool_def, "function")
                    desc = safe_get_attr(fn, "description")
                if desc:
                    span.set_attribute(GEN_AI_TOOL_DESCRIPTION, safe_str(desc))
        except Exception:
            pass

        # gen_ai.tool.call.arguments — always emitted; "{}" when there are
        # no arguments or they cannot be extracted / serialized.
        try:
            args_json = to_json_str(_tool_call_arguments(action)) or "{}"
        except Exception:
            args_json = "{}"
        span.set_attribute(GEN_AI_TOOL_CALL_ARGUMENTS, args_json)

        # The preview attribute is handled separately so that a failure
        # here cannot overwrite the arguments recorded just above.
        try:
            preview_field, preview_text = _first_preview_field(action)
            if preview_text:
                span.set_attribute(
                    f"openhands.action.{preview_field}", preview_text
                )
        except Exception:
            pass
= ( + "command", + "code", + "path", + "url", + "content", + "task_list", + "name", + "arguments", + "thought", + "is_input", + "blocking", + "keep_prompt", + "translated_ipython_code", + "browser_actions", + "agent_state", + "outputs", + "final_thought", + "old_str", + "new_str", + "view_range", + "file_text", + "insert_line", + "start_line", + "end_line", +) + + +def _coerce_tool_arguments(value: Any) -> dict[str, Any]: + """Normalize tool call arguments to a JSON-object-compatible dict.""" + if value in (None, "", [], {}): + return {} + if isinstance(value, dict): + return value + if isinstance(value, str): + try: + parsed = json.loads(value) + except Exception: + return {"raw": value} + if isinstance(parsed, dict): + return parsed + return {"value": parsed} + return {"value": value} + + +def _tool_call_arguments(action: Any) -> dict[str, Any]: + """Return the bare arguments dict for ``gen_ai.tool.call.arguments``. + + Per ARMS GenAI semconv the value is a JSON string of *just* the call + arguments — e.g. ``{"location": "San Francisco", "date": "2025-10-01"}`` + — not the wrapping ``{"tool": ..., "arguments": ...}`` envelope. + """ + if action is None: + return {} + # When the action came from an LLM tool call, prefer the original + # JSON arguments the model emitted (most faithful to what the LLM + # actually requested). 
+ tcm = safe_get_attr(action, "tool_call_metadata") + if tcm is not None: + direct_args = _coerce_tool_arguments(safe_get_attr(tcm, "arguments")) + if direct_args: + return direct_args + model_response = safe_get_attr(tcm, "model_response") if tcm else None + if model_response is not None: + try: + choices = ( + model_response.choices + if hasattr(model_response, "choices") + else None + ) or [] + for choice in choices: + msg = getattr(choice, "message", None) or ( + choice.get("message") if isinstance(choice, dict) else None + ) + tool_calls = ( + getattr(msg, "tool_calls", None) + if msg is not None + else None + ) or (msg.get("tool_calls") if isinstance(msg, dict) else None) + if not tool_calls: + continue + want_id = safe_str(safe_get_attr(tcm, "tool_call_id") or "") + for tc in tool_calls: + tc_id = ( + getattr(tc, "id", None) + if not isinstance(tc, dict) + else tc.get("id") + ) + if want_id and safe_str(tc_id) != want_id: + continue + fn = ( + getattr(tc, "function", None) + if not isinstance(tc, dict) + else tc.get("function") + ) + raw_args = ( + getattr(fn, "arguments", None) + if not isinstance(fn, dict) + else fn.get("arguments") + ) + parsed_args = _coerce_tool_arguments(raw_args) + if parsed_args: + return parsed_args + except Exception: + pass + # Fallback: harvest known argument-bearing fields off the Action object. 
def _observation_to_result(observation: Any) -> dict[str, Any]:
    """Collect the result-bearing fields of ``observation`` into a dict.

    Only a fixed set of attribute names is consulted, in the order listed
    below, and a field is included only when its value is not one of
    ``None``, ``""``, ``[]`` or ``{}``. The returned dict is what gets
    serialized into ``gen_ai.tool.call.result``. ``None`` yields ``{}``.
    """
    if observation is None:
        return {}

    result_fields = (
        "content",
        "exit_code",
        "error",
        "interpreter_details",
        "command",
        "stdout",
        "stderr",
        "url",
        "screenshot",
        "outputs",
    )
    blanks = (None, "", [], {})
    # Lazily pair each field name with its value so lookups happen once,
    # in declaration order.
    harvested = (
        (name, safe_get_attr(observation, name)) for name in result_fields
    )
    return {name: value for name, value in harvested if value not in blanks}
+ try: + result_payload = _observation_to_result(observation) + result_payload.setdefault("observation", obs_type) + out = to_json_str(result_payload) + if out: + span.set_attribute(GEN_AI_TOOL_CALL_RESULT, out) + except Exception: + pass + + +# --------------------------------------------------------------------------- +# ENTRY + AGENT (controller-lifecycle bound) +# +# Why this exists in addition to RunControllerWrapper / RunAgentUntilDoneWrapper: +# +# When OpenHands V0 is launched via ``python -m openhands.core.main``, Python +# executes ``main.py`` *as ``__main__``*. The ``from openhands.core.loop +# import run_agent_until_done`` (and other from-imports) at the top of +# ``main.py`` bind those symbols into ``__main__``'s namespace **before** +# our instrumentor patches ``openhands.core.main.run_controller`` / +# ``openhands.core.loop.run_agent_until_done``. The ``__main__`` block's +# ``asyncio.run(run_controller(...))`` call uses the *unpatched* local +# reference, so the wrappers above never fire — and the trace appears +# without an ENTRY span. +# +# STEP / TOOL spans work because ``_step`` and ``run_action`` are *class +# methods*: patching ``AgentController._step`` updates the class object +# that both ``__main__.AgentController`` and +# ``openhands.controller.agent_controller.AgentController`` reference, so +# every method lookup at call time finds the wrapped version. +# +# ENTRY+AGENT here exploit the same principle — they hook +# ``AgentController.__init__`` and ``AgentController.close``, both class +# methods, so the spans bracket the controller's lifecycle reliably no +# matter how ``run_controller`` was invoked. They no-op when a session +# context is already stashed for this sid (i.e. ``RunControllerWrapper`` +# fired successfully — the API/test-suite code path). 
def _capture_agent_io_attributes(
    span: trace_api.Span, controller: Any, agent: Any, state: Any
) -> None:
    """Record system instructions and input/output messages on ``span``.

    Three attributes may be written, each only when its source data and
    its JSON serialization are both non-empty:

    * ``GEN_AI_SYSTEM_INSTRUCTIONS`` from ``agent`` / ``state``;
    * ``GEN_AI_INPUT_MESSAGES`` and ``GEN_AI_OUTPUT_MESSAGES`` from
      ``state.history`` (only when that is a non-empty list).

    Best-effort: any exception is swallowed. The instructions and the
    history messages are guarded independently, so a failure in one does
    not block the other. ``controller`` is accepted but not used here.
    """

    def _emit(attribute_key: str, data: Any) -> None:
        # Serialize ``data`` and set it, skipping empty input or output.
        if not data:
            return
        encoded = to_json_str(data)
        if encoded:
            span.set_attribute(attribute_key, encoded)

    try:
        _emit(
            GEN_AI_SYSTEM_INSTRUCTIONS,
            _agent_to_system_instructions(agent, state),
        )
    except Exception:
        pass

    try:
        events = safe_get_attr(state, "history") or []
        if not isinstance(events, list) or not events:
            return
        _emit(GEN_AI_INPUT_MESSAGES, _history_to_input_messages_schema(events))
        _emit(
            GEN_AI_OUTPUT_MESSAGES, _history_to_output_messages_schema(events)
        )
    except Exception:
        pass
+ + All inner span creations use the explicit ``context=`` argument + (instead of relying on ``contextvars`` propagation through + ``otel_context.attach``) — this is the most deterministic way to + parent a child span and avoids the entire class of "Token was + created in a different Context" failures we used to chase across + asyncio-task / thread boundaries. + + Idempotent on ``_OWNS_FLAG`` — safe to call multiple times for the + same controller. Deliberately does **not** check whether a session + context is already stashed: under ``python -m openhands.core.main`` + the from-import binding bypasses ``RunControllerWrapper`` and + ``RunAgentUntilDoneWrapper``, so the init wrapper is the only + reliable source of ENTRY+AGENT and must always run. + """ + if not OTEL_INSTRUMENTATION_OPENHANDS_OUTER_SPANS: + return + if getattr(controller, _OWNS_FLAG, False): + # Already opened (e.g. RunControllerWrapper fired first) — log + # and bail. We don't want to double-emit ENTRY/AGENT. + logger.debug( + "OpenHands instrumentation: ENTRY+AGENT already open on " + "controller %s — skipping init-wrapper open", + id(controller), + ) + return + + sid = safe_str(safe_get_attr(controller, "id") or "") + agent = safe_get_attr(controller, "agent") + agent_name = safe_get_attr(agent, "name") or "codeact" + agent_class = ( + f"{type(agent).__module__}.{type(agent).__name__}" if agent else "" + ) + llm = safe_get_attr(agent, "llm") + llm_config = safe_get_attr(llm, "config") + model = safe_get_attr(llm_config, "model") or safe_get_attr(llm, "model") + + # ----- ENTRY ----- + # If RunControllerWrapper already stashed an ENTRY context, parent AGENT + # directly under it. Otherwise create the lifecycle-owned ENTRY here. 
+ entry: trace_api.Span | None = None + entry_ctx = get_context(sid) + if entry_ctx is None: + try: + entry = tracer.start_span("enter openhands", kind=SpanKind.INTERNAL) + except Exception as exc: + logger.error( + "OpenHands instrumentation: failed to start ENTRY span for " + "sid=%r: %s", + sid, + exc, + exc_info=True, + ) + return + + try: + _set_common(entry, "ENTRY") + entry.set_attribute(GenAI.GEN_AI_OPERATION_NAME, "enter") + if sid: + entry.set_attribute(GEN_AI_SESSION_ID, sid) + entry.set_attribute(GEN_AI_CONVERSATION_ID, sid) + if agent_class: + entry.set_attribute(OH_AGENT_NAME, agent_class) + if model: + entry.set_attribute(GEN_AI_REQUEST_MODEL, safe_str(model)) + state = safe_get_attr(controller, "state") + entry_input_messages, _ = _entry_io_from_state(state) + if entry_input_messages: + _set_io( + entry, + input_value=entry_input_messages, + input_messages=entry_input_messages, + ) + except Exception as exc: + logger.debug("OpenHands instrumentation: ENTRY attr setup: %s", exc) + + entry_ctx = set_span_in_context(entry) + + # ----- AGENT (child of ENTRY) ----- + # Pass ``context=entry_ctx`` *explicitly* so AGENT inherits ENTRY + # as parent regardless of what the surrounding contextvars look + # like (some 3rd-party SDKs reset contextvars between calls). 
+ try: + agent_span = tracer.start_span( + f"invoke_agent {agent_name}", + kind=SpanKind.INTERNAL, + context=entry_ctx, + ) + except Exception as exc: + logger.error( + "OpenHands instrumentation: failed to start AGENT span for " + "sid=%r: %s", + sid, + exc, + exc_info=True, + ) + if entry is not None: + try: + entry.end() + except Exception: + pass + return + + try: + _set_common(agent_span, "AGENT") + agent_span.set_attribute( + GenAI.GEN_AI_OPERATION_NAME, + GenAI.GenAiOperationNameValues.INVOKE_AGENT.value, + ) + agent_span.set_attribute(GenAI.GEN_AI_AGENT_NAME, safe_str(agent_name)) + if agent_class: + agent_span.set_attribute(OH_AGENT_NAME, agent_class) + if sid: + agent_span.set_attribute(GEN_AI_SESSION_ID, sid) + agent_span.set_attribute(GEN_AI_CONVERSATION_ID, sid) + agent_span.set_attribute(GEN_AI_AGENT_ID, sid) + if model: + agent_span.set_attribute(GEN_AI_REQUEST_MODEL, safe_str(model)) + except Exception as exc: + logger.debug("OpenHands instrumentation: AGENT attr setup: %s", exc) + + # Tool registry + gen_ai.tool.definitions — same logic as + # RunAgentUntilDoneWrapper, since this path also needs the + # registry for downstream TOOL spans. 
+ try: + tools = safe_get_attr(agent, "tools") or [] + if sid: + store_tool_registry(sid, tools) + defs_summary: list[dict[str, Any]] = [] + for t in tools: + if isinstance(t, dict): + kind = t.get("type") or "function" + fn = t.get("function") or {} + name = fn.get("name") if isinstance(fn, dict) else None + else: + kind = safe_get_attr(t, "type") or "function" + fn = safe_get_attr(t, "function") + name = safe_get_attr(fn, "name") + if not name: + continue + item: dict[str, Any] = {"type": safe_str(kind), "name": safe_str(name)} + if isinstance(fn, dict): + desc = fn.get("description") + params = fn.get("parameters") + else: + desc = safe_get_attr(fn, "description") + params = safe_get_attr(fn, "parameters") + if desc: + item["description"] = safe_str(desc) + if params: + item["parameters"] = params + defs_summary.append(item) + if defs_summary: + agent_span.set_attribute( + GEN_AI_TOOL_DEFINITIONS, to_json_str(defs_summary) + ) + except Exception: + pass + + # Best-effort INPUT + system_instructions capture on AGENT at open + # time. ``_capture_agent_io_attributes`` will run again at close to + # overwrite these with the *final* state, but having them now means + # an in-flight read of the AGENT span (e.g. live dashboards) sees + # at least the system prompt + initial user message. + try: + state = safe_get_attr(controller, "state") + _capture_agent_io_attributes(agent_span, controller, agent, state) + except Exception as exc: + logger.debug( + "OpenHands instrumentation: AGENT initial I/O capture: %s", exc + ) + + agent_ctx = set_span_in_context(agent_span) + if sid: + # Stash ctx-with-AGENT so STEP / TOOL re-attach correctly even + # when fired from worker threads with brand-new asyncio loops. + # The downstream consumers (STEP / TOOL / LLM bridge) all do + # their own paired attach/detach, so it's safe to share this + # ``Context`` object across asyncio tasks and threads. 
+ store_context(sid, agent_ctx) + + # ----- WARMUP STEP (round 1) ----- + # Open right after AGENT so any pre-_step actions (RECALL, etc.) that + # the controller dispatches to the runtime become children of STEP 1 + # rather than dangling siblings under AGENT. The first real ``_step`` + # call detects this open STEP isn't yet "consumed" and reuses it + # (preserving the round number) so the LLM call + first LLM-driven + # tool also nest under STEP 1 — giving the trace tree: + # + # ENTRY > AGENT > STEP 1 > [RECALL, LLM, execute_bash] + # STEP 2 > [LLM, finish] + # ... + warmup_step_ctx: object | None = None + warmup_step_span: trace_api.Span | None = None + try: + warmup_step_span = tracer.start_span( + "react step", + kind=SpanKind.INTERNAL, + context=agent_ctx, + ) + _set_common(warmup_step_span, "STEP") + warmup_step_span.set_attribute(GenAI.GEN_AI_OPERATION_NAME, "react") + warmup_step_span.set_attribute(OH_REACT_ROUND, 1) + warmup_step_span.set_attribute( + GenAI.GEN_AI_AGENT_NAME, safe_str(agent_name) + ) + if sid: + warmup_step_span.set_attribute(GEN_AI_SESSION_ID, sid) + warmup_step_span.set_attribute(GEN_AI_CONVERSATION_ID, sid) + warmup_step_span.set_attribute(GEN_AI_AGENT_ID, sid) + warmup_step_ctx = set_span_in_context(warmup_step_span) + if sid and warmup_step_ctx is not None: + store_context(sid, warmup_step_ctx) + except Exception as exc: + logger.debug("Failed to open warmup STEP span: %s", exc) + warmup_step_span = None + + # Stash everything we need to tear down in close(). + try: + setattr(controller, _OWNS_FLAG, True) + setattr(controller, _ENTRY_SPAN_ATTR, entry) + setattr(controller, _AGENT_SPAN_ATTR, agent_span) + # Save the AGENT context so the STEP wrapper can restore the + # session stash to AGENT every time it closes a STEP — that way + # any TOOL fired between rounds re-attaches AGENT (not a closed + # STEP). + setattr(controller, _AGENT_CTX_ATTR, agent_ctx) + # Stash warmup STEP so the first real ``_step`` reuses it. 
+ setattr(controller, _STEP_SPAN_ATTR, warmup_step_span) + setattr(controller, "_otel_oh_round", 1 if warmup_step_span is not None else 0) + setattr(controller, "_otel_oh_step_consumed", False) + except Exception: + # If we can't attach to the instance (slots, etc.), close the + # spans down so we don't leak them. + if warmup_step_span is not None: + try: + warmup_step_span.end() + except Exception: + pass + try: + agent_span.end() + except Exception: + pass + if entry is not None: + try: + entry.end() + except Exception: + pass + return + + # Log at INFO so the user can verify in their app logs that the + # ENTRY+AGENT spans were actually opened (and which trace/span IDs + # they got). When a user reports "no ENTRY span" in their backend, + # the first thing to check is whether this log line appeared. + try: + entry_sc = entry.get_span_context() if entry is not None else None + agent_sc = agent_span.get_span_context() + warmup_sc = ( + warmup_step_span.get_span_context() + if warmup_step_span is not None + else None + ) + logger.info( + "OpenHands instrumentation: opened ENTRY+AGENT for sid=%r " + "(trace_id=%032x entry_span=%016x agent_span=%016x " + "warmup_step=%s agent_name=%s model=%s)", + sid, + entry_sc.trace_id if entry_sc is not None else agent_sc.trace_id, + entry_sc.span_id if entry_sc is not None else 0, + agent_sc.span_id, + f"{warmup_sc.span_id:016x}" if warmup_sc is not None else "none", + agent_name, + model or "", + ) + except Exception: + pass + + +def _close_entry_and_agent_for_controller( + controller: Any, *, error: BaseException | None = None +) -> None: + """Tear down the ENTRY+AGENT spans previously opened for ``controller``. + + Also closes any STEP span left open from the last ``_step`` invocation + (STEP spans are intentionally persisted across the return of ``_step`` + so that thread-pooled TOOL / LLM calls fire as their children). 
+ """ + if not getattr(controller, _OWNS_FLAG, False): + logger.debug( + "OpenHands instrumentation: close called on controller %s " + "without an open ENTRY/AGENT — nothing to do", + id(controller), + ) + return + sid = safe_str(safe_get_attr(controller, "id") or "") + agent = safe_get_attr(controller, "agent") + state = safe_get_attr(controller, "state") + entry_span: trace_api.Span | None = getattr(controller, _ENTRY_SPAN_ATTR, None) + agent_span: trace_api.Span | None = getattr(controller, _AGENT_SPAN_ATTR, None) + # Legacy slots — kept for back-compat with already-instrumented + # instances created before we stopped persisting attach-tokens. + # If they're set we simply ignore them (any detach attempt across + # asyncio task boundaries would raise ``ValueError`` in the Aliyun + # SDK; spans alone carry all the parentage info we need). + _ = getattr(controller, _AGENT_TOKEN_ATTR, None) + _ = getattr(controller, _ENTRY_TOKEN_ATTR, None) + + # Close any STEP span still hanging from the last round before tearing + # down AGENT/ENTRY. Restores the session stash to AGENT context so any + # in-flight TOOL re-attaches AGENT (not a closed STEP). + try: + _close_open_step(controller) + except Exception: + pass + + # Capture I/O attributes on the AGENT span before ending it. 
+ if agent_span is not None: + try: + _capture_agent_io_attributes(agent_span, controller, agent, state) + except Exception: + pass + try: + history = safe_get_attr(state, "history") or [] + if isinstance(history, list): + agent_span.set_attribute(OH_HISTORY_LENGTH, len(history)) + agent_state = safe_get_attr(state, "agent_state") + if agent_state is not None: + agent_span.set_attribute( + OH_AGENT_STATE, + safe_get_attr(agent_state, "value") or safe_str(agent_state), + ) + except Exception: + pass + if error is not None: + try: + agent_span.record_exception(error) + agent_span.set_status( + Status(StatusCode.ERROR, type(error).__qualname__) + ) + except Exception: + pass + + # End AGENT (no detach — the token (if any) was attached in the + # ``__init__`` task's contextvars context and detaching here would + # cross a context boundary, raising ``ValueError`` in the Aliyun + # SDK. Legacy code may have set ``agent_token`` on older instances; + # we simply leave it alone — detaching is unnecessary because the + # span carries its own parentage and contextvars naturally unwind + # when the task that attached them exits). + if agent_span is not None: + try: + agent_span.end() + except Exception: + pass + + # Mirror the most-useful bits onto ENTRY before closing it. 
+ if entry_span is not None: + try: + agent_state = safe_get_attr(state, "agent_state") + if agent_state is not None: + entry_span.set_attribute( + OH_AGENT_STATE, + safe_get_attr(agent_state, "value") or safe_str(agent_state), + ) + history = safe_get_attr(state, "history") or [] + if isinstance(history, list): + entry_span.set_attribute(OH_HISTORY_LENGTH, len(history)) + output_repr = _final_state_to_output(state) + entry_input_messages, entry_output_messages = _entry_io_from_state( + state + ) + if output_repr: + _set_io( + entry_span, + output_value=output_repr, + input_messages=entry_input_messages, + output_messages=entry_output_messages, + ) + elif entry_input_messages or entry_output_messages: + _set_io( + entry_span, + input_messages=entry_input_messages, + output_messages=entry_output_messages, + ) + except Exception: + pass + if error is not None: + try: + entry_span.record_exception(error) + entry_span.set_status( + Status(StatusCode.ERROR, type(error).__qualname__) + ) + except Exception: + pass + + # Same as AGENT: end the span; never touch a possibly-leftover token + # from an older instrumentation run. + if entry_span is not None: + try: + entry_span.end() + except Exception: + pass + + # Mirror the open-time INFO log so the user can confirm the spans + # actually closed and exported. 
class AgentControllerInitWrapper:
    """Open ENTRY + AGENT spans once ``AgentController.__init__`` returns.

    The original constructor runs first and any exception it raises
    propagates untouched. Afterwards, unless the controller reports
    ``is_delegate``, the ENTRY/AGENT spans are opened for it. Failures in
    that second phase are logged at ERROR and never reach the caller.
    """

    __slots__ = ("_tracer",)

    def __init__(self, tracer: Tracer):
        self._tracer = tracer

    def __call__(self, wrapped, instance, args, kwargs):
        """wrapt-style hook; returns whatever the wrapped ``__init__`` returns."""
        outcome = wrapped(*args, **kwargs)

        try:
            if safe_get_attr(instance, "is_delegate"):
                # Delegate sub-controllers live inside the parent
                # controller's trace and must not open another ENTRY.
                logger.debug(
                    "OpenHands instrumentation: skipping delegate "
                    "controller %s for ENTRY/AGENT",
                    id(instance),
                )
            else:
                _open_entry_and_agent_for_controller(self._tracer, instance)
        except Exception as exc:
            # ERROR rather than DEBUG: when this fails no ENTRY span is
            # produced, and the log line is the only pointer to why.
            logger.error(
                "OpenHands instrumentation: AgentController init wrapper "
                "failed to open ENTRY/AGENT for controller %s: %s",
                id(instance),
                exc,
                exc_info=True,
            )
        return outcome
# Attribute set on every bridging shim so that a second ``LLM.__init__`` pass
# over the same callable (e.g. live config reload) does not wrap it again.
_LLM_BRIDGE_FLAG = "_otel_oh_ctx_bridged"


class LLMInitWrapper:
    """Make sure ``LLM.completion`` runs with the current STEP context attached.

    Why this exists
    ---------------
    The LLM call inside ``AgentController._step`` is synchronous and *should*
    inherit our STEP context via ``contextvars`` — but in real OpenHands
    deployments LiteLLM ends up creating its span with a *different*
    ``trace_id`` than the surrounding STEP/AGENT/ENTRY tree.  Two known ways
    that can happen:

    * a 3rd-party auto-instrumentation injected before ours stashes the
      LLM call onto a thread-pool worker (no contextvars propagation);
    * the call is made from outside any of our wrappers (e.g. a condenser
      / summarizer worker) where no OTel context is current.

    The fix: after ``LLM.__init__`` returns, ``instance._completion`` (and
    ``instance._completion_unwrapped`` when present) is replaced by a small
    shim that runs the original callable inside ``AttachedSession(None)``.
    A downstream LLM instrumentor can then parent its span under that
    context.

    Everything here is best-effort: a failure to install a shim is logged at
    debug level and never propagates into ``LLM.__init__``.
    """

    __slots__ = ("_tracer",)

    def __init__(self, tracer: Tracer):
        # The tracer is stored but no span is created by this wrapper; it
        # only re-attaches an existing context so the real LLM
        # instrumentor emits its span underneath it.
        self._tracer = tracer

    def __call__(self, wrapped, instance, args, kwargs):
        """Run the wrapped ``__init__`` and then bridge the new instance."""
        result = wrapped(*args, **kwargs)
        try:
            self._patch_completion(instance)
        except Exception as exc:
            logger.debug("LLM init wrapper failed to bridge completion: %s", exc)
        return result

    @staticmethod
    def _bridge(target: Any) -> Any:
        """Return a flagged shim that calls *target* inside ``AttachedSession(None)``."""

        def bridged(*a: Any, **kw: Any) -> Any:
            # ``AttachedSession(None)`` re-attaches whatever context the
            # most recent v0 wrapper stashed (STEP if a step is open,
            # AGENT otherwise).  When no OpenHands session is active the
            # context manager is a no-op.
            with AttachedSession(None):
                return target(*a, **kw)

        # Plain functions always accept attributes, so no guard is needed.
        setattr(bridged, _LLM_BRIDGE_FLAG, True)
        return bridged

    @staticmethod
    def _patch_completion(instance: Any) -> None:
        """Install the bridging shim on *instance*; do nothing if already done.

        ``_completion`` is handled first.  If it is missing, already bridged,
        or cannot be reassigned, ``_completion_unwrapped`` is left untouched.
        """
        completion = getattr(instance, "_completion", None)
        if completion is None or getattr(completion, _LLM_BRIDGE_FLAG, False):
            return
        try:
            instance._completion = LLMInitWrapper._bridge(completion)
        except Exception as exc:
            logger.debug(
                "LLM init wrapper could not replace _completion: %s", exc
            )
            return

        # Mirror onto the unwrapped slot too — some OpenHands codepaths
        # call ``_completion_unwrapped`` directly when retries are
        # disabled, and we want them to inherit the same parent context.
        # It is read *after* the assignment above so that an alias of
        # ``_completion`` is seen as already bridged and skipped.
        unwrapped = getattr(instance, "_completion_unwrapped", None)
        if unwrapped is None or getattr(unwrapped, _LLM_BRIDGE_FLAG, False):
            return
        try:
            instance._completion_unwrapped = LLMInitWrapper._bridge(unwrapped)
        except Exception as exc:
            logger.debug(
                "LLM init wrapper could not replace _completion_unwrapped: %s",
                exc,
            )
./opentelemetry-semantic-conventions diff --git a/instrumentation-loongsuite/loongsuite-instrumentation-openhands/tests/__init__.py b/instrumentation-loongsuite/loongsuite-instrumentation-openhands/tests/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/instrumentation-loongsuite/loongsuite-instrumentation-openhands/tests/conftest.py b/instrumentation-loongsuite/loongsuite-instrumentation-openhands/tests/conftest.py new file mode 100644 index 000000000..2fc095575 --- /dev/null +++ b/instrumentation-loongsuite/loongsuite-instrumentation-openhands/tests/conftest.py @@ -0,0 +1,247 @@ +"""Shared pytest fixtures and stub modules for the OpenHands instrumentation. + +We deliberately don't require ``openhands-ai`` to be installed at test time: +instead we register lightweight stub modules under the same dotted paths so +``wrap_function_wrapper`` can patch them. The wrappers themselves only rely on +the *call signatures* documented in ``execute.md`` — which we faithfully +reproduce in the stubs. 
def _ensure_stub_module(name: str) -> types.ModuleType:
    """Return the module registered as *name*, creating empty stubs as needed.

    Every missing package along the dotted path is created, registered in
    ``sys.modules`` and attached to its parent as an attribute, so both
    ``import a.b.c`` and ``a.b.c`` attribute access work afterwards.

    A module that is already registered is returned as-is, but its parent
    chain is still created and linked when missing; an attribute that is
    already present on a parent is never overwritten in that case.  A
    ``None`` placeholder in ``sys.modules`` is treated as "not registered".
    """
    mod = sys.modules.get(name)
    created = mod is None
    if created:
        mod = types.ModuleType(name)
        sys.modules[name] = mod

    parent_name, _, leaf = name.rpartition(".")
    if parent_name:
        parent = _ensure_stub_module(parent_name)
        # A freshly created stub is always attached; a pre-existing module is
        # only attached when the parent does not expose it yet.
        if created or not hasattr(parent, leaf):
            setattr(parent, leaf, mod)
    return mod
+ tools: list = field( + default_factory=lambda: [ + { + "type": "function", + "function": { + "name": "execute_bash", + "description": "Run a bash command on the runtime sandbox.", + "parameters": { + "type": "object", + "properties": { + "command": {"type": "string"}, + }, + "required": ["command"], + }, + }, + }, + ] + ) + + class AgentController: + step_calls = 0 + close_calls = 0 + + def __init__(self, agent=None, sid="sid-test"): + self.agent = agent or _Agent() + self.id = sid + self.state = _State() + self._pending_action = None + self.is_delegate = False + + async def _step(self) -> None: + type(self).step_calls += 1 + class _Pending: + action = "run" + command = "echo step" + thought = "trying" + + self._pending_action = _Pending() + + async def close(self, set_stop_state: bool = True) -> None: + type(self).close_calls += 1 + + ctrl_mod.AgentController = AgentController + + class _ToolCallMetadata: + """Stand-in for :class:`openhands.events.tool.ToolCallMetadata`.""" + + def __init__(self, function_name="", tool_call_id="", arguments=None): + import json as _json + + self.function_name = function_name + self.tool_call_id = tool_call_id + + class _Fn: + def __init__(self, name, args): + self.name = name + self.arguments = _json.dumps(args or {}) + + class _TC: + def __init__(self, tcid, fn): + self.id = tcid + self.function = fn + + class _Msg: + def __init__(self, tcs): + self.tool_calls = tcs + + class _Choice: + def __init__(self, msg): + self.message = msg + + class _ModelResp: + def __init__(self, choices): + self.choices = choices + + self.model_response = _ModelResp( + [_Choice(_Msg([_TC(tool_call_id, _Fn(function_name, arguments))]))] + ) + + class _Action: + def __init__( + self, + action_type="run", + command="echo hi", + tool_call_metadata=None, + ): + self.action = action_type + self.command = command + self.tool_call_metadata = tool_call_metadata + + class _Observation: + def __init__(self, exit_code=0, content=""): + self.exit_code = 
exit_code + self.content = content + self.observation = "run" + + class Runtime: + run_action_calls = 0 + # Tests can override on the instance to drive observation values. + _next_observation: _Observation | None = None + + def __init__(self, sid="sid-test"): + self.sid = sid + + def run_action(self, action) -> _Observation: + type(self).run_action_calls += 1 + obs = self._next_observation + if obs is not None: + self._next_observation = None + return obs + return _Observation(exit_code=0) + + rt_base.Runtime = Runtime + rt_base.Action = _Action + rt_base.Observation = _Observation + rt_base.ToolCallMetadata = _ToolCallMetadata + + @dataclass + class _State2: + agent_state: _AgentState = field(default_factory=lambda: _AgentState("finished")) + + async def run_controller( + config=None, + initial_user_action=None, + sid: str | None = None, + **kwargs, + ): + if getattr(main_mod, "_test_raise_cancelled", False): + raise asyncio.CancelledError() + # Mirror real V0: invoke the agent loop *inside* run_controller so + # the AGENT span lives within the ENTRY span (and inherits its + # stashed OTel context). Tests can install + # ``main_mod._test_inner_args = (controller, runtime)`` to opt in. + inner_args = getattr(main_mod, "_test_inner_args", None) + if inner_args is not None: + controller, runtime = inner_args + await loop_mod.run_agent_until_done(controller, runtime, None, []) + return _State2() + + main_mod.run_controller = run_controller + + async def run_agent_until_done(controller, runtime, memory, end_states): + # Tests can install a custom inner callback to drive STEP / TOOL + # spans inside the AGENT span; default is a no-op. 
@pytest.fixture
def tracer_provider() -> TracerProvider:
    """Return an SDK ``TracerProvider`` that exports into memory.

    The ``InMemorySpanExporter`` is attached through a ``SimpleSpanProcessor``
    and also stored on the provider as ``_exporter`` so tests can read the
    finished spans back.
    """
    provider = TracerProvider()
    exporter = InMemorySpanExporter()
    provider.add_span_processor(SimpleSpanProcessor(exporter))
    provider._exporter = exporter  # type: ignore[attr-defined]
    return provider


@pytest.fixture
def stub_openhands_v0_modules() -> None:
    """Register the V0 OpenHands stub modules in ``sys.modules``."""
    _install_v0_stub_modules()


@pytest.fixture(autouse=True)
def _reset_global_tracer():
    """Avoid bleed-through of the SDK provider between tests.

    After each test the global tracer provider is cleared.  The API's
    set-once guard is replaced with a fresh instance as well; otherwise a
    test that called ``set_tracer_provider`` would stop every later test from
    installing its own provider.
    """
    yield
    trace_api._TRACER_PROVIDER = None  # type: ignore[attr-defined]
    # Rebuild the guard from its own type so no private import is needed and
    # an API version without the attribute is simply left alone.
    once = getattr(trace_api, "_TRACER_PROVIDER_SET_ONCE", None)
    if once is not None:
        trace_api._TRACER_PROVIDER_SET_ONCE = type(once)()  # type: ignore[attr-defined]
def _run_one_tool_call(
    rt_base,
    ctrl_mod,
    loop_mod,
    main_mod,
    *,
    sid="tool-sid",
    action=None,
    user_text="list /tmp",
):
    """Drive a single ENTRY → AGENT → STEP → TOOL flow.

    Args:
        rt_base: Module exposing ``Runtime``, ``Action`` and
            ``ToolCallMetadata``.
        ctrl_mod: Module exposing ``AgentController``.
        loop_mod: Module whose ``_test_inner_callback`` hook is set for the
            duration of the call.
        main_mod: Module exposing ``run_controller``; its
            ``_test_inner_args`` hook is set for the duration of the call.
        sid: Session id given to the controller, the runtime and
            ``run_controller``.
        action: Action passed to ``runtime.run_action``.  When ``None`` an
            ``execute_bash`` action carrying tool-call metadata
            (``call_abc123``) is built.
        user_text: ``content`` of the initial user message.

    Both test hooks are reset to ``None`` afterwards, even on failure.
    """
    ctrl = ctrl_mod.AgentController(sid=sid)
    runtime = rt_base.Runtime(sid=sid)

    if action is None:
        tcm = rt_base.ToolCallMetadata(
            function_name="execute_bash",
            tool_call_id="call_abc123",
            arguments={"command": "ls /tmp", "thought": "list temp"},
        )
        action = rt_base.Action(
            action_type="run",
            command="ls /tmp",
            tool_call_metadata=tcm,
        )

    class MessageAction:
        content = user_text
        source = "user"

    async def _inner(_c, _r):
        await ctrl._step()
        runtime.run_action(action)

    async def _scenario():
        # ``run_controller`` is called from inside the running loop.
        await main_mod.run_controller(
            config=None,
            initial_user_action=MessageAction(),
            sid=sid,
        )

    loop_mod._test_inner_callback = _inner
    main_mod._test_inner_args = (ctrl, runtime)
    try:
        asyncio.run(_scenario())
    finally:
        loop_mod._test_inner_callback = None
        main_mod._test_inner_args = None
openhands.core.main as main_mod + import openhands.runtime.base as rt_base + + _run_one_tool_call(rt_base, ctrl_mod, loop_mod, main_mod) + + tools = _spans_by_kind(exporter, "TOOL") + assert len(tools) == 1 + tool = tools[0] + attrs = tool.attributes + + # Required + assert attrs["gen_ai.span.kind"] == "TOOL" + assert attrs["gen_ai.operation.name"] == "execute_tool" + + # Span name should be `execute_tool {tool_name}` + assert tool.name == "execute_tool execute_bash" + + # Recommended attributes + assert attrs["gen_ai.tool.name"] == "execute_bash" + assert attrs["gen_ai.tool.type"] == "function" + assert attrs["gen_ai.tool.call.id"] == "call_abc123" + assert attrs.get("gen_ai.tool.description") == ( + "Run a bash command on the runtime sandbox." + ) + + # Arguments should be the BARE JSON dict, not the wrapping + # {"tool": ..., "arguments": ...} envelope. + args_json = attrs.get("gen_ai.tool.call.arguments") + assert args_json is not None + args = json.loads(args_json) + assert args == {"command": "ls /tmp", "thought": "list temp"} + + # Result should reflect the observation. + result_json = attrs.get("gen_ai.tool.call.result") + assert result_json is not None + result = json.loads(result_json) + assert result.get("exit_code") == 0 + assert "observation" in result + assert "input.value" not in attrs + assert "output.value" not in attrs + + +def test_tool_span_falls_back_to_action_field_when_no_tool_call_metadata( + instrumented, +): + """If the action wasn't generated from an LLM tool call (e.g. 
a + user-initiated agent.action), the wrapper should still produce a + sensible ``gen_ai.tool.name`` derived from the action type.""" + inst, exporter = instrumented + + import openhands.controller.agent_controller as ctrl_mod + import openhands.core.loop as loop_mod + import openhands.core.main as main_mod + import openhands.runtime.base as rt_base + + ctrl = ctrl_mod.AgentController(sid="tool-fallback-sid") + runtime = rt_base.Runtime(sid="tool-fallback-sid") + action = rt_base.Action(action_type="run", command="echo hi") + + class MessageAction: + content = "say hi" + source = "user" + + async def _inner(_c, _r): + await ctrl._step() + runtime.run_action(action) + + loop_mod._test_inner_callback = _inner + main_mod._test_inner_args = (ctrl, runtime) + + async def _scenario(): + await main_mod.run_controller( + config=None, + initial_user_action=MessageAction(), + sid="tool-fallback-sid", + ) + + try: + asyncio.run(_scenario()) + finally: + loop_mod._test_inner_callback = None + main_mod._test_inner_args = None + + tool = _spans_by_kind(exporter, "TOOL")[0] + attrs = tool.attributes + + # Action.action == "run" → tool name "bash" + assert attrs["gen_ai.tool.name"] == "bash" + assert tool.name == "execute_tool bash" + # No tool-call id when the action wasn't from an LLM call + assert attrs.get("gen_ai.tool.call.id", "") == "" + # Arguments still produced from the action's fields + args = json.loads(attrs["gen_ai.tool.call.arguments"]) + assert args.get("command") == "echo hi" + + +def test_tool_span_reads_arguments_from_tool_call_metadata(instrumented): + inst, exporter = instrumented + + import openhands.controller.agent_controller as ctrl_mod + import openhands.core.loop as loop_mod + import openhands.core.main as main_mod + import openhands.runtime.base as rt_base + + ctrl = ctrl_mod.AgentController(sid="tool-direct-args-sid") + runtime = rt_base.Runtime(sid="tool-direct-args-sid") + + class DirectToolCallMetadata: + function_name = "execute_bash" + 
tool_call_id = "call_direct_args" + arguments = {"command": "pwd", "timeout": 3} + + action = rt_base.Action( + action_type="run", + command="pwd", + tool_call_metadata=DirectToolCallMetadata(), + ) + + class MessageAction: + content = "print cwd" + source = "user" + + async def _inner(_c, _r): + await ctrl._step() + runtime.run_action(action) + + loop_mod._test_inner_callback = _inner + main_mod._test_inner_args = (ctrl, runtime) + + async def _scenario(): + await main_mod.run_controller( + config=None, + initial_user_action=MessageAction(), + sid="tool-direct-args-sid", + ) + + try: + asyncio.run(_scenario()) + finally: + loop_mod._test_inner_callback = None + main_mod._test_inner_args = None + + tool = _spans_by_kind(exporter, "TOOL")[0] + attrs = tool.attributes + assert attrs["gen_ai.tool.call.id"] == "call_direct_args" + assert json.loads(attrs["gen_ai.tool.call.arguments"]) == { + "command": "pwd", + "timeout": 3, + } + + +def test_tool_span_always_emits_arguments_attribute(instrumented): + inst, exporter = instrumented + + import openhands.controller.agent_controller as ctrl_mod + import openhands.core.loop as loop_mod + import openhands.core.main as main_mod + import openhands.runtime.base as rt_base + + ctrl = ctrl_mod.AgentController(sid="tool-empty-args-sid") + runtime = rt_base.Runtime(sid="tool-empty-args-sid") + action = rt_base.Action(action_type="run", command="") + + class MessageAction: + content = "run empty command" + source = "user" + + async def _inner(_c, _r): + await ctrl._step() + runtime.run_action(action) + + loop_mod._test_inner_callback = _inner + main_mod._test_inner_args = (ctrl, runtime) + + async def _scenario(): + await main_mod.run_controller( + config=None, + initial_user_action=MessageAction(), + sid="tool-empty-args-sid", + ) + + try: + asyncio.run(_scenario()) + finally: + loop_mod._test_inner_callback = None + main_mod._test_inner_args = None + + attrs = _spans_by_kind(exporter, "TOOL")[0].attributes + assert 
attrs["gen_ai.tool.call.arguments"] == "{}" + + +def test_agent_io_capture_omits_legacy_and_openinference_attrs(tracer_provider): + from opentelemetry.instrumentation.openhands.internal.v0_wrappers import ( + _capture_agent_io_attributes, + ) + + class SystemMessageAction: + content = "You are helpful." + + class MessageAction: + content = "hello" + source = "user" + + class AgentFinishAction: + final_thought = "done" + + class State: + history = [SystemMessageAction(), MessageAction(), AgentFinishAction()] + + tracer = tracer_provider.get_tracer(__name__) + with tracer.start_as_current_span("agent") as span: + _capture_agent_io_attributes(span, None, None, State()) + + attrs = tracer_provider._exporter.get_finished_spans()[0].attributes # type: ignore[attr-defined] + assert attrs.get("gen_ai.system_instructions") + assert attrs.get("gen_ai.input.messages") + assert attrs.get("gen_ai.output.messages") + assert "gen_ai.system_instruction" not in attrs + assert "input.value" not in attrs + assert "output.value" not in attrs + + +def test_agent_span_emits_tool_definitions(instrumented): + """AGENT span should advertise the agent's available tools per the + ARMS GenAI semconv §Agent → ``gen_ai.tool.definitions``.""" + inst, exporter = instrumented + + import openhands.controller.agent_controller as ctrl_mod + import openhands.core.loop as loop_mod + import openhands.core.main as main_mod + import openhands.runtime.base as rt_base + + _run_one_tool_call(rt_base, ctrl_mod, loop_mod, main_mod) + + agent = _spans_by_kind(exporter, "AGENT")[0] + defs_json = agent.attributes.get("gen_ai.tool.definitions") + assert defs_json, "AGENT span should set gen_ai.tool.definitions" + defs = json.loads(defs_json) + assert isinstance(defs, list) and defs + assert defs[0]["type"] == "function" + assert defs[0]["name"] == "execute_bash" + assert "description" in defs[0] diff --git a/instrumentation-loongsuite/loongsuite-instrumentation-openhands/tests/test_v0_trace_continuity.py 
b/instrumentation-loongsuite/loongsuite-instrumentation-openhands/tests/test_v0_trace_continuity.py new file mode 100644 index 000000000..2d2adbd75 --- /dev/null +++ b/instrumentation-loongsuite/loongsuite-instrumentation-openhands/tests/test_v0_trace_continuity.py @@ -0,0 +1,252 @@ +"""Cross-thread / cross-loop trace continuity tests for V0 wrappers. + +These tests model the *real* OpenHands V0 runtime behaviour: events are +delivered by ``EventStream`` via a ``ThreadPoolExecutor`` and the controller +processes them with ``asyncio.get_event_loop().run_until_complete(...)`` — +which spins a brand-new asyncio loop in the worker thread. Without our +session-context bridge, STEP / TOOL spans would start fresh root traces. + +We assert: + +* All ENTRY / AGENT / STEP / TOOL spans share the **same** ``trace_id``. +* Parent-child wiring is correct (STEP is parented under AGENT, TOOL too). +* The session-context store is cleaned up after the entry returns. +* GenAI semantic-convention I/O attributes are populated when content + capture is enabled. 
def _drive_step_in_worker_thread(
    controller, runtime, action, timeout: float = 5.0
) -> None:
    """Reproduce the V0 EventStream → ThreadPoolExecutor → run_until_complete path.

    ``controller._step()`` is run to completion on a brand-new asyncio loop
    inside a pool worker thread, then ``runtime.run_action(action)`` is
    called synchronously on that same thread.

    ``ThreadPoolExecutor.submit`` does not carry the submitter's
    ``contextvars`` over to the worker, so the worker does not see the
    caller's current OpenTelemetry context (true for the default CPython
    builds this package targets).

    Args:
        controller: Object with an ``async def _step()`` method.
        runtime: Object with a synchronous ``run_action(action)`` method.
        action: Value forwarded to ``runtime.run_action``.
        timeout: Seconds to wait for the worker before giving up.

    Raises:
        BaseException: Whatever the worker raised, re-raised in the caller.
        concurrent.futures.TimeoutError: If the worker did not finish in time.
    """

    def _worker() -> None:
        # New event loop per worker — exactly what V0 does.
        loop = asyncio.new_event_loop()
        asyncio.set_event_loop(loop)
        try:
            loop.run_until_complete(controller._step())
            # run_action is sync — call it directly inside the worker.
            runtime.run_action(action)
        finally:
            # Do not leave a closed loop registered as the thread's current
            # loop.
            asyncio.set_event_loop(None)
            loop.close()

    pool = ThreadPoolExecutor(max_workers=1)
    try:
        # ``result`` re-raises anything the worker raised, so no separate
        # error list or completion event is needed.
        pool.submit(_worker).result(timeout=timeout)
    finally:
        # Always release the executor.  ``wait=False`` keeps a timed-out
        # test from blocking on a stuck worker; on success the worker has
        # already finished.
        pool.shutdown(wait=False)
(kind={s.attributes.get('gen_ai.span.kind')}) " + f"has trace_id {s.context.trace_id} but expected {trace_id}" + ) + + # Parent-child links: AGENT under ENTRY, STEP under AGENT, TOOL under AGENT + assert agent.parent is not None and agent.parent.span_id == entry.context.span_id + for s in by_kind["STEP"]: + assert s.parent is not None and s.parent.span_id == agent.context.span_id + for t in by_kind["TOOL"]: + assert t.parent is not None and t.parent.span_id == agent.context.span_id + + +def test_session_context_cleared_after_entry(instrumented_v0): + """The per-sid stash must not leak across runs.""" + inst, exporter = instrumented_v0 + + import openhands.core.loop as loop_mod + import openhands.core.main as main_mod + from opentelemetry.instrumentation.openhands.internal import session_context + + async def _scenario(): + await main_mod.run_controller( + config=None, + initial_user_action=type("Msg", (), {"content": "x", "source": "user"})(), + sid="ephemeral-sid", + ) + + asyncio.run(_scenario()) + assert session_context.get_context("ephemeral-sid") is None + + +def test_io_attributes_on_entry_agent_step(instrumented_v0): + """Verify GenAI / OpenInference I/O attributes are populated.""" + inst, exporter = instrumented_v0 + + import openhands.controller.agent_controller as ctrl_mod + import openhands.core.loop as loop_mod + import openhands.core.main as main_mod + import openhands.runtime.base as rt_base + + ctrl = ctrl_mod.AgentController(sid="io-sid") + runtime = rt_base.Runtime(sid="io-sid") + action = rt_base.Action(action_type="run", command="cat /etc/hosts") + + # Seed history with a *MessageAction*-named instance — that's the type + # name the AGENT wrapper looks for when computing input.messages. 
+ class MessageAction: + content = "do the thing" + source = "user" + + ctrl.state.history = [MessageAction()] + + async def _inner(_c, _r): + await ctrl._step() + runtime.run_action(action) + + loop_mod._test_inner_callback = _inner + main_mod._test_inner_args = (ctrl, runtime) + + async def _scenario(): + await main_mod.run_controller( + config=None, + initial_user_action=MessageAction(), + sid="io-sid", + ) + + try: + asyncio.run(_scenario()) + finally: + loop_mod._test_inner_callback = None + main_mod._test_inner_args = None + + entry = _spans_by_kind_attr(exporter, "ENTRY")[0] + agent = _spans_by_kind_attr(exporter, "AGENT")[0] + step = _spans_by_kind_attr(exporter, "STEP")[0] + tool = _spans_by_kind_attr(exporter, "TOOL")[0] + + # ENTRY + assert entry.attributes.get("gen_ai.framework") == "openhands" + assert entry.attributes.get("gen_ai.system") == "openhands" + assert entry.attributes.get("gen_ai.session.id") == "io-sid" + assert entry.attributes.get("input.value") + assert "do the thing" in entry.attributes.get("input.value") + + # AGENT + assert agent.attributes.get("gen_ai.input.messages") + assert "do the thing" in agent.attributes.get("gen_ai.input.messages") + assert "gen_ai.system_instruction" not in agent.attributes + assert "input.value" not in agent.attributes + assert "output.value" not in agent.attributes + assert agent.attributes.get("gen_ai.session.id") == "io-sid" + + # STEP + assert step.attributes.get("input.value") + assert step.attributes.get("output.value") + assert step.attributes.get("gen_ai.output.messages") + assert step.attributes.get("openhands.action.type") == "run" + out = step.attributes.get("output.value") + assert "tool_calls" in out and "echo step" in out + + # TOOL spans: arguments only via gen_ai.tool.call.arguments; no input/output.value. 
+ assert tool.attributes.get("gen_ai.tool.name") == "bash" + assert "input.value" not in tool.attributes + assert "output.value" not in tool.attributes + args = json.loads(tool.attributes["gen_ai.tool.call.arguments"]) + assert args.get("command") == "cat /etc/hosts" + result = tool.attributes.get("gen_ai.tool.call.result") + assert result + assert "exit_code" in result diff --git a/instrumentation-loongsuite/loongsuite-instrumentation-openhands/tests/test_v0_wrappers.py b/instrumentation-loongsuite/loongsuite-instrumentation-openhands/tests/test_v0_wrappers.py new file mode 100644 index 000000000..cce832f66 --- /dev/null +++ b/instrumentation-loongsuite/loongsuite-instrumentation-openhands/tests/test_v0_wrappers.py @@ -0,0 +1,187 @@ +"""Tests for V0 (Legacy CodeAct) wrappers. + +We exercise the four V0 patches (``run_controller``, ``run_agent_until_done``, +``AgentController._step``, ``Runtime.run_action``) and assert that: + +* The ``ENTRY → AGENT → STEP → TOOL`` span tree is produced. +* Parent-child linkage is correct. +* Per-action ``gen_ai.tool.name`` is mapped from the V0 ``action`` field. 
def _spans_by_kind_attr(exporter, kind: str):
    """Return the finished spans whose ``gen_ai.span.kind`` attribute is *kind*.

    Spans are read from ``exporter.get_finished_spans()`` and returned as a
    list in the order the exporter yields them.
    """
    matching = []
    for finished in exporter.get_finished_spans():
        if finished.attributes.get("gen_ai.span.kind") == kind:
            matching.append(finished)
    return matching
STEP count: {len(step)}" + assert len(tool) == 2, f"unexpected TOOL count: {len(tool)}" + + e = entry[0] + a = agent[0] + assert e.name == "enter openhands" + assert e.attributes.get("gen_ai.framework") == "openhands" + assert e.attributes.get("gen_ai.session.id") == "sid-test" + + assert a.name.startswith("invoke_agent ") + assert a.attributes.get("gen_ai.agent.name") == "CodeActAgent" + assert a.attributes.get("gen_ai.request.model") == "qwen3-coder-plus" + assert "gen_ai.system_instruction" not in a.attributes + assert "input.value" not in a.attributes + assert "output.value" not in a.attributes + + # All STEP spans share the AGENT as parent. + for s in step: + assert s.parent is not None + assert s.parent.span_id == a.context.span_id + assert s.attributes.get("gen_ai.operation.name") == "react" + assert s.attributes.get("gen_ai.react.round") in (1, 2) + + # TOOL spans are siblings of STEP under AGENT (run_action is called after + # _step returns and is no longer in STEP context). + for t in tool: + assert t.attributes.get("gen_ai.tool.name") == "bash" + assert t.attributes.get("openhands.action.type") == "run" + assert t.attributes.get("openhands.action.exit_code") == 0 + + +def test_v0_step_round_increments_per_controller(instrumented_v0): + inst, exporter = instrumented_v0 + import openhands.controller.agent_controller as ctrl_mod + + ctrl_a = ctrl_mod.AgentController(sid="A") + ctrl_b = ctrl_mod.AgentController(sid="B") + + async def _go(): + await ctrl_a._step() + await ctrl_a._step() + await ctrl_b._step() + + asyncio.run(_go()) + + step_spans = _spans_by_kind_attr(exporter, "STEP") + assert len(step_spans) == 3 + rounds_a = sorted( + s.attributes.get("gen_ai.react.round") + for s in step_spans + if s.attributes.get("gen_ai.session.id") == "A" + ) + rounds_b = sorted( + s.attributes.get("gen_ai.react.round") + for s in step_spans + if s.attributes.get("gen_ai.session.id") == "B" + ) + assert rounds_a == [1, 2] + assert rounds_b == [1] + + +def 
test_v0_runtime_error_observation_marks_span(instrumented_v0): + inst, exporter = instrumented_v0 + import openhands.runtime.base as rt_base + + runtime = rt_base.Runtime() + + class _ErrAction: + action = "run" + command = "false" + + # Use the conftest hook to make the next run_action return an error obs. + err_obs = rt_base.Observation(exit_code=2) + runtime._next_observation = err_obs + + runtime.run_action(_ErrAction()) + + tool_spans = _spans_by_kind_attr(exporter, "TOOL") + assert len(tool_spans) == 1 + span = tool_spans[0] + assert span.attributes.get("openhands.action.exit_code") == 2 + assert span.status.status_code.name == "ERROR" + + +def test_v0_run_controller_cancelled_is_not_span_error(instrumented_v0): + """``asyncio.CancelledError`` (e.g. wait_for) must not mark ENTRY as ERROR.""" + _, exporter = instrumented_v0 + import openhands.core.main as main_mod + + main_mod._test_raise_cancelled = True + try: + with pytest.raises(asyncio.CancelledError): + asyncio.run( + main_mod.run_controller( + config=None, + initial_user_action=type("Msg", (), {"content": "hello"})(), + sid="sid-cancel", + ) + ) + finally: + main_mod._test_raise_cancelled = False + + entry = _spans_by_kind_attr(exporter, "ENTRY") + assert len(entry) == 1 + assert entry[0].status.status_code.name == "UNSET" + diff --git a/instrumentation-loongsuite/loongsuite-instrumentation-slop-code/README.md b/instrumentation-loongsuite/loongsuite-instrumentation-slop-code/README.md new file mode 100644 index 000000000..4d4f4d7b1 --- /dev/null +++ b/instrumentation-loongsuite/loongsuite-instrumentation-slop-code/README.md @@ -0,0 +1,32 @@ +# LoongSuite slop-code-bench Instrumentation + +OpenTelemetry instrumentation for the [slop-code-bench](https://github.com/SprocketLab/slop-code-bench) benchmark orchestrator. 
+ +## Span Tree + +``` +ENTRY "slop-code.enter" +└── CHAIN "workflow.{problem_name}" + ├── TASK "task.{checkpoint_name}" + │ └── AGENT "agent.{agent_type}" + │ ├── STEP "react.step.{N}" [MiniSWE only] + │ └── ... + ├── TASK "task.{checkpoint_name}" + │ └── AGENT "agent.{agent_type}" + └── ... +LLM "chat {model_name}" [Rubric Judge] +``` + +## Installation + +```bash +pip install loongsuite-instrumentation-slop-code +``` + +## Usage + +```python +from opentelemetry.instrumentation.slop_code import SlopCodeInstrumentor + +SlopCodeInstrumentor().instrument() +``` diff --git a/instrumentation-loongsuite/loongsuite-instrumentation-slop-code/pyproject.toml b/instrumentation-loongsuite/loongsuite-instrumentation-slop-code/pyproject.toml new file mode 100644 index 000000000..b443381c2 --- /dev/null +++ b/instrumentation-loongsuite/loongsuite-instrumentation-slop-code/pyproject.toml @@ -0,0 +1,61 @@ +[build-system] +requires = ["hatchling"] +build-backend = "hatchling.build" + +[project] +name = "loongsuite-instrumentation-slop-code" +dynamic = ["version"] +description = "LoongSuite slop-code-bench instrumentation" +readme = "README.md" +license = "Apache-2.0" +requires-python = ">=3.10,<4" +authors = [ + { name = "Zhiyong Liu", email = "liuzhiyong.lzy@alibaba-inc.com" }, + { name = "OpenTelemetry Authors", email = "cncf-opentelemetry-contributors@lists.cncf.io" }, +] +classifiers = [ + "Development Status :: 4 - Beta", + "Intended Audience :: Developers", + "License :: OSI Approved :: Apache Software License", + "Programming Language :: Python", + "Programming Language :: Python :: 3.10", + "Programming Language :: Python :: 3.11", + "Programming Language :: Python :: 3.12", +] +dependencies = [ + "opentelemetry-api >= 1.37.0", + "opentelemetry-instrumentation >= 0.58b0", + "opentelemetry-semantic-conventions >= 0.58b0", + "wrapt >= 1.14.0, < 2.0.0", + "opentelemetry-util-genai >= 0.3b0.dev0", +] + +[project.optional-dependencies] +instruments = [ + "slop-code-bench >= 
0.1", +] +test = [ + "pytest", + "pytest-asyncio", + "pytest-forked", + "opentelemetry-sdk", +] + +[project.entry-points.opentelemetry_instrumentor] +slop_code = "opentelemetry.instrumentation.slop_code:SlopCodeInstrumentor" + +[project.urls] +Homepage = "https://github.com/alibaba/loongsuite-python-agent/tree/main/instrumentation-loongsuite/loongsuite-instrumentation-slop-code" +Repository = "https://github.com/alibaba/loongsuite-python-agent" + +[tool.hatch.version] +path = "src/opentelemetry/instrumentation/slop_code/version.py" + +[tool.hatch.build.targets.sdist] +include = [ + "/src", + "/tests", +] + +[tool.hatch.build.targets.wheel] +packages = ["src/opentelemetry"] diff --git a/instrumentation-loongsuite/loongsuite-instrumentation-slop-code/src/opentelemetry/instrumentation/slop_code/__init__.py b/instrumentation-loongsuite/loongsuite-instrumentation-slop-code/src/opentelemetry/instrumentation/slop_code/__init__.py new file mode 100644 index 000000000..983e60ab8 --- /dev/null +++ b/instrumentation-loongsuite/loongsuite-instrumentation-slop-code/src/opentelemetry/instrumentation/slop_code/__init__.py @@ -0,0 +1,246 @@ +# Copyright The OpenTelemetry Authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +""" +OpenTelemetry slop-code-bench Instrumentation + +Instruments the slop-code benchmark orchestrator lifecycle: +- ENTRY: run_agent (CLI entrypoint) +- CHAIN/workflow: run_agent_on_problem (per-problem) +- TASK: AgentRunner._run_checkpoint (per-checkpoint) +- AGENT: Agent.run_checkpoint (concrete agent invocation) +- STEP: MiniSWEAgent.agent_step (ReAct iteration) +- LLM: grade_file_async (Rubric Judge) +""" + +import logging +from typing import Any, Collection + +from wrapt import wrap_function_wrapper + +from opentelemetry import trace as trace_api +from opentelemetry.instrumentation.instrumentor import BaseInstrumentor +from opentelemetry.instrumentation.slop_code.package import _instruments +from opentelemetry.instrumentation.slop_code.version import __version__ +from opentelemetry.instrumentation.slop_code.wrappers.agent import ( + _AgentRunCheckpointWrapper, +) +from opentelemetry.instrumentation.slop_code.wrappers.entry import ( + _EntryWrapper, + _RunnerEntryWrapper, +) +from opentelemetry.instrumentation.slop_code.wrappers.llm import ( + _RubricGradeWrapper, +) +from opentelemetry.instrumentation.slop_code.wrappers.step import ( + _MiniSWEObservationWrapper, + _MiniSWEStepWrapper, +) +from opentelemetry.instrumentation.slop_code.wrappers.task import ( + _TaskRunCheckpointWrapper, +) +from opentelemetry.instrumentation.slop_code.wrappers.tool import ( + _ToolExecuteActionWrapper, +) +from opentelemetry.instrumentation.slop_code.wrappers.workflow import ( + _WorkflowWrapper, +) +from opentelemetry.instrumentation.utils import unwrap + +logger = logging.getLogger(__name__) + +__all__ = ["SlopCodeInstrumentor", "__version__"] + +_MODULE_ENTRY = "slop_code.entrypoints.commands.run_agent" +_MODULE_WORKER = "slop_code.entrypoints.problem_runner.worker" +# slop_code.entrypoints.problem_runner.driver re-imports +# `run_agent_on_problem` via `from .worker import run_agent_on_problem` +# at package-load time, capturing the original function reference. 
# Because our wrap happens after that bind, we must additionally replace the
# local binding inside `driver` itself, otherwise the worker subprocess still
# calls the un-wrapped original and the CHAIN span never fires.
_MODULE_DRIVER = "slop_code.entrypoints.problem_runner.driver"
_MODULE_RUNNER = "slop_code.agent_runner.runner"
_MODULE_AGENT = "slop_code.agent_runner.agent"
_MODULE_MINISWE = "slop_code.agent_runner.agents._miniswe_agent"
_MODULE_RUBRIC = "slop_code.metrics.rubric.router"

# Single source of truth for every patch point.  ``_instrument`` and
# ``_uninstrument`` both walk this table, so a hook that is installed is
# always removed again and both sides use the same module path.
# Entry layout: (module, dotted attribute path, wrapper class,
#                log level used when wrapping fails, label for log messages)
_WRAP_TARGETS = (
    (_MODULE_ENTRY, "run_agent", _EntryWrapper, logging.WARNING, "run_agent"),
    (
        _MODULE_WORKER,
        "run_agent_on_problem",
        _WorkflowWrapper,
        logging.WARNING,
        "run_agent_on_problem",
    ),
    # driver.py binds run_agent_on_problem at import time via
    # `from .worker import ...`, so its local name must be patched as well.
    (
        _MODULE_DRIVER,
        "run_agent_on_problem",
        _WorkflowWrapper,
        logging.WARNING,
        "driver.run_agent_on_problem",
    ),
    (
        _MODULE_RUNNER,
        "AgentRunner.run",
        _RunnerEntryWrapper,
        logging.WARNING,
        "AgentRunner.run",
    ),
    (
        _MODULE_RUNNER,
        "AgentRunner._run_checkpoint",
        _TaskRunCheckpointWrapper,
        logging.WARNING,
        "AgentRunner._run_checkpoint",
    ),
    (
        _MODULE_AGENT,
        "Agent.run_checkpoint",
        _AgentRunCheckpointWrapper,
        logging.WARNING,
        "Agent.run_checkpoint",
    ),
    (
        _MODULE_MINISWE,
        "MiniSWEAgent.agent_step",
        _MiniSWEStepWrapper,
        logging.DEBUG,
        "MiniSWEAgent.agent_step",
    ),
    (
        _MODULE_MINISWE,
        "MiniSWEAgent.get_observation",
        _MiniSWEObservationWrapper,
        logging.DEBUG,
        "MiniSWEAgent.get_observation",
    ),
    (
        _MODULE_MINISWE,
        "MiniSWEAgent.execute_action",
        _ToolExecuteActionWrapper,
        logging.DEBUG,
        "MiniSWEAgent.execute_action",
    ),
    (
        _MODULE_RUBRIC,
        "grade_file_async",
        _RubricGradeWrapper,
        logging.DEBUG,
        "grade_file_async",
    ),
)


class SlopCodeInstrumentor(BaseInstrumentor):
    """OpenTelemetry instrumentor for slop-code-bench framework."""

    def instrumentation_dependencies(self) -> Collection[str]:
        """Return the package requirements this instrumentor applies to."""
        return _instruments

    def _instrument(self, **kwargs: Any) -> None:
        """Install every wrapper listed in ``_WRAP_TARGETS``.

        Args:
            **kwargs: Instrumentor options; only ``tracer_provider`` is read.

        A target that cannot be wrapped is logged at the level configured
        for it and skipped, so one missing hook does not prevent the
        remaining ones from being installed.
        """
        tracer = trace_api.get_tracer(
            __name__,
            __version__,
            tracer_provider=kwargs.get("tracer_provider"),
        )

        # One wrapper instance per wrapper class: the worker and driver
        # bindings of run_agent_on_problem share a single _WorkflowWrapper.
        wrappers = {}
        for module, name, wrapper_cls, level, label in _WRAP_TARGETS:
            try:
                if wrapper_cls not in wrappers:
                    wrappers[wrapper_cls] = wrapper_cls(tracer)
                wrap_function_wrapper(
                    module=module,
                    name=name,
                    wrapper=wrappers[wrapper_cls],
                )
            except Exception as exc:
                logger.log(level, "Could not wrap %s: %s", label, exc)

    def _uninstrument(self, **kwargs: Any) -> None:
        """Remove every wrapper that ``_instrument`` may have installed.

        Walks the same ``_WRAP_TARGETS`` table as ``_instrument``.  Failures
        (for example slop_code not being importable) are logged at DEBUG and
        do not stop the remaining targets from being unwrapped.
        """
        # Local import: keeps this block independent of the file's import list.
        import importlib

        for module, name, _wrapper_cls, _level, label in _WRAP_TARGETS:
            try:
                owner = importlib.import_module(module)
                *owner_path, attr = name.split(".")
                # "AgentRunner.run" -> resolve the class, then unwrap "run".
                for part in owner_path:
                    owner = getattr(owner, part)
                unwrap(owner, attr)
            except Exception as exc:
                logger.debug("Could not unwrap %s: %s", label, exc)


# (patch boundary) next new file in this patch:
#   src/opentelemetry/instrumentation/slop_code/package.py
# Copyright The OpenTelemetry Authors
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +_instruments = ("slop-code-bench >= 0.1",) + +_supports_metrics = True diff --git a/instrumentation-loongsuite/loongsuite-instrumentation-slop-code/src/opentelemetry/instrumentation/slop_code/utils.py b/instrumentation-loongsuite/loongsuite-instrumentation-slop-code/src/opentelemetry/instrumentation/slop_code/utils.py new file mode 100644 index 000000000..34cd7a856 --- /dev/null +++ b/instrumentation-loongsuite/loongsuite-instrumentation-slop-code/src/opentelemetry/instrumentation/slop_code/utils.py @@ -0,0 +1,72 @@ +# Copyright The OpenTelemetry Authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
"""Utility functions for slop-code instrumentation."""

import json
from typing import TYPE_CHECKING, Any, Optional

if TYPE_CHECKING:
    # Needed for annotations only; the helpers below just call
    # ``span.set_attribute`` and never touch the class at runtime.
    from opentelemetry.trace import Span

SYSTEM_NAME = "slop-code"
MAX_ATTR_LEN = 1024


def safe_get(obj: Any, attr: str, default: Any = None) -> Any:
    """Return ``obj.attr``, or ``default`` if it is missing or access raises."""
    try:
        return getattr(obj, attr, default)
    except Exception:
        return default


def safe_get_nested(obj: Any, *attrs: str, default: Any = None) -> Any:
    """Follow a chain of attributes, returning ``default`` on any failure.

    ``default`` is returned when an attribute is missing, when any value on
    the path (including the last one) is ``None``, or when attribute access
    raises.  All exceptions are swallowed, matching ``safe_get``, so a
    misbehaving property cannot break the instrumented call.
    """
    current = obj
    for attr in attrs:
        try:
            current = getattr(current, attr)
        except Exception:
            return default
        if current is None:
            return default
    return current


def truncate_text(
    value: Optional[str], limit: int = MAX_ATTR_LEN
) -> Optional[str]:
    """Return ``value`` cut to at most ``limit`` characters.

    ``None`` is passed through unchanged.
    """
    if value is None:
        return value
    return value if len(value) <= limit else value[:limit]


def set_optional_attr(span: "Span", key: str, value: Optional[Any]) -> None:
    """Set a span attribute only if ``value`` is not ``None``.

    String values longer than ``MAX_ATTR_LEN`` are truncated first; falsy
    non-``None`` values such as ``0`` are still recorded.
    """
    if value is None:
        return
    if isinstance(value, str):
        value = truncate_text(value)
    span.set_attribute(key, value)


def json_dumps_attr(value: Any) -> str:
    """Serialize ``value`` as bounded JSON for ARMS GenAI string attributes.

    Values ``json`` cannot encode are stringified via ``default=str``.  If
    encoding still fails (non-string mapping keys, circular references,
    excessive nesting) the plain ``str()`` form is used instead of raising.
    """
    try:
        text = json.dumps(value, ensure_ascii=False, default=str)
    except (TypeError, ValueError, RecursionError):
        text = str(value)
    return truncate_text(text)


def _message_field(item: Any, key: str) -> Any:
    """Read ``key`` from a message given as an object or as a dict."""
    value = safe_get(item, key)
    if not value and isinstance(item, dict):
        value = item.get(key)
    return value


def genai_messages(messages: Any) -> str:
    """Normalize chat-like messages to the ARMS GenAI message schema.

    Args:
        messages: An iterable of messages (objects or dicts exposing
            ``role`` / ``content``, or plain strings), a single dict or
            string, or ``None``.

    Returns:
        A JSON string (truncated to ``MAX_ATTR_LEN``) holding a list of
        ``{"role": ..., "parts": [{"type": "text", "content": ...}]}``.
        A missing role defaults to ``"user"`` and missing content to ``""``.
    """
    # A lone message would otherwise be iterated key-by-key / char-by-char.
    if isinstance(messages, (str, dict)):
        messages = [messages]

    normalized = []
    for item in messages or []:
        if isinstance(item, str):
            # Plain-string message: the text itself is the content.
            role, content = "user", item
        else:
            role = _message_field(item, "role") or "user"
            content = _message_field(item, "content") or ""
        normalized.append(
            {
                "role": str(role),
                "parts": [{"type": "text", "content": str(content)}],
            }
        )
    return json_dumps_attr(normalized)
a/instrumentation-loongsuite/loongsuite-instrumentation-slop-code/src/opentelemetry/instrumentation/slop_code/version.py b/instrumentation-loongsuite/loongsuite-instrumentation-slop-code/src/opentelemetry/instrumentation/slop_code/version.py new file mode 100644 index 000000000..7bee975f0 --- /dev/null +++ b/instrumentation-loongsuite/loongsuite-instrumentation-slop-code/src/opentelemetry/instrumentation/slop_code/version.py @@ -0,0 +1,15 @@ +# Copyright The OpenTelemetry Authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +__version__ = "0.5.0.dev" diff --git a/instrumentation-loongsuite/loongsuite-instrumentation-slop-code/src/opentelemetry/instrumentation/slop_code/wrappers/__init__.py b/instrumentation-loongsuite/loongsuite-instrumentation-slop-code/src/opentelemetry/instrumentation/slop_code/wrappers/__init__.py new file mode 100644 index 000000000..b0a6f4284 --- /dev/null +++ b/instrumentation-loongsuite/loongsuite-instrumentation-slop-code/src/opentelemetry/instrumentation/slop_code/wrappers/__init__.py @@ -0,0 +1,13 @@ +# Copyright The OpenTelemetry Authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# (patch boundary) next new file in this patch:
#   src/opentelemetry/instrumentation/slop_code/wrappers/agent.py
# Copyright The OpenTelemetry Authors
# Licensed under the Apache License, Version 2.0

"""AGENT span wrapper for Agent.run_checkpoint."""

import logging

from opentelemetry import trace as trace_api
from opentelemetry.instrumentation.slop_code.utils import (
    SYSTEM_NAME,
    safe_get,
    set_optional_attr,
    genai_messages,
)
from opentelemetry.semconv._incubating.attributes import gen_ai_attributes
from opentelemetry.trace import SpanKind, Status, StatusCode
from opentelemetry.util.genai.extended_semconv import gen_ai_extended_attributes

logger = logging.getLogger(__name__)

# Per-instance running token totals maintained by the STEP wrapper.
_AGG_TOKENS_ATTR = "_otel_slop_aggregate_tokens"


def _assistant_messages(instance):
    """Return up to the last three assistant messages recorded on ``instance``.

    ``instance._steps`` is consulted first (a step counts when its role, or
    the role's ``.value``, ends with "assistant", case-insensitively).  Only
    if that yields nothing is ``instance._messages`` scanned for entries
    whose role equals ``"assistant"``.  Entries with empty content are
    skipped.
    """
    messages = []
    for step in safe_get(instance, "_steps", []) or []:
        role = safe_get(step, "role")
        role_value = safe_get(role, "value", role)
        if not str(role_value).lower().endswith("assistant"):
            continue
        content = safe_get(step, "content")
        if content:
            messages.append({"role": "assistant", "content": content})

    if not messages:
        for msg in safe_get(instance, "_messages", []) or []:
            is_dict = isinstance(msg, dict)
            role = safe_get(msg, "role") or (msg.get("role") if is_dict else None)
            if role != "assistant":
                continue
            content = safe_get(msg, "content") or (
                msg.get("content") if is_dict else None
            )
            if content:
                messages.append({"role": "assistant", "content": content})
    return messages[-3:]


def _token_snapshot(instance):
    """Return a copy of the instance's aggregated ``input``/``output`` totals."""
    agg = getattr(instance, _AGG_TOKENS_ATTR, None) or {}
    return {
        "input": int(agg.get("input", 0) or 0),
        "output": int(agg.get("output", 0) or 0),
    }


class _AgentRunCheckpointWrapper:
    """Wrapper for Agent.run_checkpoint to create AGENT span."""

    def __init__(self, tracer: trace_api.Tracer):
        self._tracer = tracer

    def __call__(self, wrapped, instance, args, kwargs):
        """Run ``wrapped`` inside an ``invoke_agent`` span.

        The wrapped call's return value and exceptions pass through
        unchanged; a failure while recording result attributes is logged at
        DEBUG instead of being raised into the caller.
        """
        task_input = args[0] if args else kwargs.get("task")
        agent_name = type(instance).__name__
        problem_name = safe_get(instance, "problem_name", "unknown")
        attrs = {
            gen_ai_attributes.GEN_AI_OPERATION_NAME: "invoke_agent",
            gen_ai_attributes.GEN_AI_SYSTEM: SYSTEM_NAME,
            gen_ai_extended_attributes.GEN_AI_SPAN_KIND: gen_ai_extended_attributes.GenAiSpanKindValues.AGENT.value,
            "gen_ai.framework": SYSTEM_NAME,
            "gen_ai.agent.name": agent_name,
            "gen_ai.agent.id": agent_name,
            "gen_ai.agent.description": "slop-code benchmark agent",
            "slop_code.problem.name": str(problem_name),
        }
        if task_input is not None:
            attrs["gen_ai.input.messages"] = genai_messages(
                [{"role": "user", "content": str(task_input)}]
            )

        # The aggregate lives on the agent instance and is never reset, so
        # remember the totals before this call and report only the delta.
        try:
            tokens_before = _token_snapshot(instance)
        except Exception:
            tokens_before = {"input": 0, "output": 0}

        with self._tracer.start_as_current_span(
            name=f"invoke_agent {agent_name}",
            kind=SpanKind.INTERNAL,
            attributes=attrs,
        ) as span:
            try:
                result = wrapped(*args, **kwargs)
            except Exception as exc:
                # start_as_current_span records the exception and sets the
                # ERROR status itself when it propagates; doing it here too
                # would emit the exception event twice.
                span.set_attribute("error.type", type(exc).__name__)
                raise

            try:
                self._record_result(span, instance, result, tokens_before)
            except Exception:
                logger.debug("Failed to record AGENT span result", exc_info=True)
            span.set_status(Status(StatusCode.OK))
            return result

    @staticmethod
    def _record_result(span, instance, result, tokens_before):
        """Copy token usage, output messages and cost from ``result`` to ``span``."""
        tokens_now = _token_snapshot(instance)
        input_tokens = max(0, tokens_now["input"] - tokens_before["input"])
        output_tokens = max(0, tokens_now["output"] - tokens_before["output"])

        usage = safe_get(result, "usage") if result is not None else None
        net_tokens = safe_get(usage, "net_tokens") if usage is not None else None
        # Fall back to the framework's own accounting when no STEP tokens
        # were aggregated during this call.
        if not input_tokens and net_tokens is not None:
            input_tokens = int(safe_get(net_tokens, "input", 0) or 0)
        if not output_tokens and net_tokens is not None:
            output_tokens = int(safe_get(net_tokens, "output", 0) or 0)

        if input_tokens:
            set_optional_attr(
                span, gen_ai_attributes.GEN_AI_USAGE_INPUT_TOKENS, input_tokens
            )
        if output_tokens:
            set_optional_attr(
                span, gen_ai_attributes.GEN_AI_USAGE_OUTPUT_TOKENS, output_tokens
            )
        if input_tokens or output_tokens:
            set_optional_attr(
                span, "gen_ai.usage.total_tokens", input_tokens + output_tokens
            )

        messages = _assistant_messages(instance)
        if messages:
            set_optional_attr(
                span, "gen_ai.output.messages", genai_messages(messages)
            )

        if usage is not None:
            set_optional_attr(span, "slop_code.usage.cost", safe_get(usage, "cost"))
            set_optional_attr(span, "slop_code.usage.steps", safe_get(usage, "steps"))
        set_optional_attr(
            span,
            "slop_code.elapsed_seconds",
            safe_get(result, "elapsed") if result is not None else None,
        )


# (patch boundary) next new file in this patch:
#   src/opentelemetry/instrumentation/slop_code/wrappers/entry.py
# Copyright The OpenTelemetry Authors
# Licensed under the Apache License, Version 2.0

"""ENTRY span wrappers for slop-code benchmark runs."""

import json
import logging

from opentelemetry import trace as trace_api
from opentelemetry.instrumentation.slop_code.utils import (
    SYSTEM_NAME,
    genai_messages,
    safe_get,
    set_optional_attr,
)
from opentelemetry.semconv._incubating.attributes import gen_ai_attributes
from opentelemetry.trace import SpanKind, Status, StatusCode
from opentelemetry.util.genai.extended_semconv import gen_ai_extended_attributes

logger = logging.getLogger(__name__)
class _EntryWrapper:
    """Wrapper for the top-level CLI run_agent command."""

    def __init__(self, tracer: trace_api.Tracer):
        self._tracer = tracer

    def __call__(self, wrapped, instance, args, kwargs):
        """Run ``wrapped`` inside an ENTRY span and return its result."""
        with self._tracer.start_as_current_span(
            name="enter_ai_application_system",
            kind=SpanKind.INTERNAL,
            attributes={
                gen_ai_attributes.GEN_AI_OPERATION_NAME: "enter",
                gen_ai_attributes.GEN_AI_SYSTEM: SYSTEM_NAME,
                gen_ai_extended_attributes.GEN_AI_SPAN_KIND: gen_ai_extended_attributes.GenAiSpanKindValues.ENTRY.value,
                "gen_ai.framework": SYSTEM_NAME,
            },
        ) as span:
            # No explicit except block: start_as_current_span records the
            # exception and sets the ERROR status when one propagates, so
            # recording it here as well would duplicate the exception event.
            result = wrapped(*args, **kwargs)
            span.set_status(Status(StatusCode.OK))
            return result


class _RunnerEntryWrapper:
    """Create an ENTRY span inside the worker process so child spans share it."""

    def __init__(self, tracer: trace_api.Tracer):
        self._tracer = tracer

    def __call__(self, wrapped, instance, args, kwargs):
        """Run ``wrapped`` inside an ENTRY span tagged with the problem name.

        The problem is read from ``instance.run_spec.problem``; its name
        becomes ``gen_ai.session.id`` and its prompt / statement /
        description (first one that is set) becomes the input message.
        A non-``None`` result is stored, truncated, as ``output.value``.
        """
        problem = safe_get(safe_get(instance, "run_spec"), "problem")
        problem_name = safe_get(problem, "name", "unknown")
        attrs = {
            gen_ai_attributes.GEN_AI_OPERATION_NAME: "enter",
            gen_ai_attributes.GEN_AI_SYSTEM: SYSTEM_NAME,
            gen_ai_extended_attributes.GEN_AI_SPAN_KIND: gen_ai_extended_attributes.GenAiSpanKindValues.ENTRY.value,
            "gen_ai.framework": SYSTEM_NAME,
            "gen_ai.session.id": str(problem_name),
        }
        # Capture the benchmark problem prompt as the application input when available.
        task = (
            safe_get(problem, "prompt")
            or safe_get(problem, "statement")
            or safe_get(problem, "description")
        )
        if task is not None:
            attrs["gen_ai.input.messages"] = genai_messages(
                [{"role": "user", "content": str(task)}]
            )

        with self._tracer.start_as_current_span(
            name="enter_ai_application_system",
            kind=SpanKind.INTERNAL,
            attributes=attrs,
        ) as span:
            # Exceptions from ``wrapped`` are recorded once by the context
            # manager (see _EntryWrapper).
            result = wrapped(*args, **kwargs)
            if result is not None:
                # Serializing the result is best-effort: a value json cannot
                # encode must not turn a successful run into a failure.
                try:
                    output = json.dumps(result, ensure_ascii=False, default=str)
                except (TypeError, ValueError, RecursionError):
                    output = str(result)
                set_optional_attr(span, "output.value", output[:1024])
            span.set_status(Status(StatusCode.OK))
            return result


# (patch boundary) next new file in this patch:
#   src/opentelemetry/instrumentation/slop_code/wrappers/llm.py
# Copyright The OpenTelemetry Authors
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
+ +"""LLM span wrapper for grade_file_async (Rubric Judge).""" + +import logging + +from opentelemetry import trace as trace_api +from opentelemetry.instrumentation.slop_code.utils import ( + SYSTEM_NAME, + set_optional_attr, + json_dumps_attr, + genai_messages, +) +from opentelemetry.semconv._incubating.attributes import gen_ai_attributes +from opentelemetry.trace import SpanKind, Status, StatusCode +from opentelemetry.util.genai.extended_semconv import gen_ai_extended_attributes + +logger = logging.getLogger(__name__) + + +class _RubricGradeWrapper: + """Wrapper for grade_file_async to create LLM span.""" + + def __init__(self, tracer: trace_api.Tracer): + self._tracer = tracer + + async def __call__(self, wrapped, instance, args, kwargs): + # grade_file_async(prompt_prefix, criteria_text, file_name, model, provider, temperature, ...) + model = kwargs.get("model") or (args[3] if len(args) > 3 else "unknown") + provider = kwargs.get("provider") or (args[4] if len(args) > 4 else None) + temperature = kwargs.get("temperature") or (args[5] if len(args) > 5 else None) + + # Determine system name from provider + system_name = SYSTEM_NAME + if provider is not None: + provider_val = provider.value if hasattr(provider, "value") else str(provider) + system_name = provider_val.lower() + + span_name = f"chat {model}" + + attrs = { + gen_ai_attributes.GEN_AI_OPERATION_NAME: "chat", + gen_ai_attributes.GEN_AI_SYSTEM: system_name, + gen_ai_extended_attributes.GEN_AI_SPAN_KIND: gen_ai_extended_attributes.GenAiSpanKindValues.LLM.value, + gen_ai_attributes.GEN_AI_REQUEST_MODEL: str(model), + "gen_ai.provider.name": system_name, + "gen_ai.framework": SYSTEM_NAME, + } + + prompt_prefix = args[0] if len(args) > 0 else kwargs.get("prompt_prefix") + criteria_text = args[1] if len(args) > 1 else kwargs.get("criteria_text") + if prompt_prefix is not None or criteria_text is not None: + attrs["gen_ai.input.messages"] = genai_messages([{"role": "user", "content": str(prompt_prefix or "") + 
"\n\n" + str(criteria_text or "")}]) + + if temperature is not None: + attrs[gen_ai_attributes.GEN_AI_REQUEST_TEMPERATURE] = float(temperature) + + with self._tracer.start_as_current_span( + name=span_name, + kind=SpanKind.CLIENT, + attributes=attrs, + ) as span: + try: + result = await wrapped(*args, **kwargs) + + # result is tuple[list[dict], dict[str, Any]] + if isinstance(result, tuple) and len(result) >= 2: + response_data = result[1] + if isinstance(response_data, dict): + _set_usage_from_response(span, response_data) + response_id = response_data.get("id") + set_optional_attr(span, "gen_ai.response.id", response_id) + if response_data.get("choices") is not None: + span.set_attribute("gen_ai.output.messages", json_dumps_attr(response_data.get("choices"))) + + span.set_status(Status(StatusCode.OK)) + return result + except Exception as e: + span.record_exception(e) + span.set_status(Status(StatusCode.ERROR, str(e))) + raise + + +def _set_usage_from_response(span, response_data: dict) -> None: + """Extract and set token usage attributes from response_data.""" + usage = response_data.get("usage") + if not isinstance(usage, dict): + return + + # OpenRouter format: prompt_tokens / completion_tokens + # Bedrock format (normalized): input_tokens / output_tokens + input_tokens = usage.get("prompt_tokens") or usage.get("input_tokens") + output_tokens = usage.get("completion_tokens") or usage.get("output_tokens") + + set_optional_attr(span, gen_ai_attributes.GEN_AI_USAGE_INPUT_TOKENS, input_tokens) + set_optional_attr(span, gen_ai_attributes.GEN_AI_USAGE_OUTPUT_TOKENS, output_tokens) + if input_tokens is not None and output_tokens is not None: + set_optional_attr(span, "gen_ai.usage.total_tokens", input_tokens + output_tokens) + + # Cache tokens (OpenRouter specific) + cache_read = usage.get("cache_read_input_tokens") + set_optional_attr(span, gen_ai_extended_attributes.GEN_AI_USAGE_CACHE_READ_INPUT_TOKENS, cache_read) + + cache_creation = 
usage.get("cache_creation_input_tokens") + set_optional_attr(span, gen_ai_extended_attributes.GEN_AI_USAGE_CACHE_CREATION_INPUT_TOKENS, cache_creation) diff --git a/instrumentation-loongsuite/loongsuite-instrumentation-slop-code/src/opentelemetry/instrumentation/slop_code/wrappers/step.py b/instrumentation-loongsuite/loongsuite-instrumentation-slop-code/src/opentelemetry/instrumentation/slop_code/wrappers/step.py new file mode 100644 index 000000000..4650d1689 --- /dev/null +++ b/instrumentation-loongsuite/loongsuite-instrumentation-slop-code/src/opentelemetry/instrumentation/slop_code/wrappers/step.py @@ -0,0 +1,140 @@ +# Copyright The OpenTelemetry Authors +# Licensed under the Apache License, Version 2.0 + +"""STEP span wrappers for MiniSWEAgent ReAct iterations.""" + +import logging + +from opentelemetry import context as context_api +from opentelemetry import trace as trace_api +from opentelemetry.instrumentation.slop_code.utils import ( + SYSTEM_NAME, + safe_get, + set_optional_attr, + genai_messages, +) +from opentelemetry.semconv._incubating.attributes import gen_ai_attributes +from opentelemetry.trace import SpanKind, Status, StatusCode +from opentelemetry.util.genai.extended_semconv import gen_ai_extended_attributes + +logger = logging.getLogger(__name__) + +_STEP_SPAN_ATTR = "_otel_slop_step_span" +_STEP_TOKEN_ATTR = "_otel_slop_step_token" +_AGG_TOKENS_ATTR = "_otel_slop_aggregate_tokens" + + +def _estimate_tokens(text) -> int: + if text is None: + return 0 + text = str(text) + return max(1, (len(text) + 3) // 4) if text else 0 + + +def _add_agent_tokens(instance, input_tokens: int, output_tokens: int) -> None: + current = getattr(instance, _AGG_TOKENS_ATTR, {"input": 0, "output": 0}) + current["input"] = int(current.get("input", 0)) + int(input_tokens or 0) + current["output"] = int(current.get("output", 0)) + int(output_tokens or 0) + setattr(instance, _AGG_TOKENS_ATTR, current) + + +class _MiniSWEStepWrapper: + """Start a STEP span before the model 
class _MiniSWEObservationWrapper:
    """Close the STEP span stored on the agent once the observation call ends.

    If the wrapped call raises, the exception is recorded on the stored span
    (when there is one) and the step is finished with ERROR / ``"error"``.
    Whenever a span is still stored afterwards, it is finished with
    OK / ``"stop"``.
    """

    def __init__(self, tracer: trace_api.Tracer):
        self._tracer = tracer

    def __call__(self, wrapped, instance, args, kwargs):
        """Run ``wrapped`` and finish the step span, re-raising any exception."""
        try:
            observation = wrapped(*args, **kwargs)
        except Exception as exc:
            step_span = getattr(instance, _STEP_SPAN_ATTR, None)
            if step_span is not None:
                step_span.record_exception(exc)
            error_status = Status(StatusCode.ERROR, str(exc))
            _finish_step(instance, error_status, "error")
            raise
        finally:
            # After the error path the span marker is already cleared, so
            # this only fires when a span is still stored on the instance.
            still_open = getattr(instance, _STEP_SPAN_ATTR, None) is not None
            if still_open:
                _finish_step(instance, Status(StatusCode.OK), "stop")
        return observation
def _finish_step(instance, status: Status, finish_reason: str) -> None:
    """End the STEP span stored on ``instance`` and clear its bookkeeping.

    Does nothing when no STEP span is stored. Otherwise the span receives the
    finish-reason attribute and ``status`` and is ended. Afterwards the
    stored context token (if any) is detached and both bookkeeping attributes
    are removed from ``instance``, even if finalising the span raised.
    """
    step_span = getattr(instance, _STEP_SPAN_ATTR, None)
    ctx_token = getattr(instance, _STEP_TOKEN_ATTR, None)
    if step_span is None:
        return

    try:
        step_span.set_attribute(
            gen_ai_extended_attributes.GEN_AI_REACT_FINISH_REASON,
            finish_reason,
        )
        step_span.set_status(status)
        step_span.end()
    finally:
        if ctx_token is not None:
            context_api.detach(ctx_token)
        # Remove each marker independently; either may already be absent.
        try:
            delattr(instance, _STEP_SPAN_ATTR)
        except AttributeError:
            pass
        try:
            delattr(instance, _STEP_TOKEN_ATTR)
        except AttributeError:
            pass
class _TaskRunCheckpointWrapper:
    """Wrap a checkpoint run in an ENTRY span with a nested TASK span.

    Both spans are opened before the wrapped callable runs and both receive
    the same final status: OK on return, ERROR (with the exception recorded)
    when the wrapped callable raises.
    """

    def __init__(self, tracer: trace_api.Tracer):
        self._tracer = tracer

    def __call__(self, wrapped, instance, args, kwargs):
        """Run ``wrapped`` inside the ENTRY/TASK span pair and return its result."""
        # The checkpoint is read from position 0 and the first-checkpoint
        # flag from position 2, each with a keyword fallback.
        if args:
            checkpoint = args[0]
        else:
            checkpoint = kwargs.get("checkpoint")
        if len(args) > 2:
            first_flag = args[2]
        else:
            first_flag = kwargs.get("is_first_checkpoint", False)

        checkpoint_name = safe_get(checkpoint, "name", "unknown")
        order = safe_get(checkpoint, "order")
        run_spec = safe_get(instance, "run_spec")
        problem = safe_get(run_spec, "problem")
        # The session id falls back to the checkpoint name when no problem
        # name is available.
        problem_name = safe_get(problem, "name", checkpoint_name)
        name_text = str(checkpoint_name)

        entry_attrs = {
            gen_ai_attributes.GEN_AI_OPERATION_NAME: "enter",
            gen_ai_attributes.GEN_AI_SYSTEM: SYSTEM_NAME,
            gen_ai_extended_attributes.GEN_AI_SPAN_KIND: "ENTRY",
            "gen_ai.framework": SYSTEM_NAME,
            "gen_ai.session.id": str(problem_name),
        }
        task_attrs = {
            gen_ai_attributes.GEN_AI_OPERATION_NAME: "run_task",
            gen_ai_attributes.GEN_AI_SYSTEM: SYSTEM_NAME,
            gen_ai_extended_attributes.GEN_AI_SPAN_KIND: "TASK",
            "gen_ai.framework": SYSTEM_NAME,
            "input.value": name_text,
            "input.mime_type": "text/plain",
            "slop_code.checkpoint.name": name_text,
            "slop_code.is_first_checkpoint": bool(first_flag),
        }
        if order is not None:
            task_attrs["slop_code.checkpoint.order"] = order

        with self._tracer.start_as_current_span(
            name="enter_ai_application_system",
            kind=SpanKind.INTERNAL,
            attributes=entry_attrs,
        ) as entry_span:
            with self._tracer.start_as_current_span(
                name=f"run_task {checkpoint_name}",
                kind=SpanKind.INTERNAL,
                attributes=task_attrs,
            ) as task_span:
                try:
                    outcome = wrapped(*args, **kwargs)
                    if outcome is not None:
                        set_optional_attr(task_span, "slop_code.had_error", safe_get(outcome, "had_error"))
                        set_optional_attr(task_span, "slop_code.passed_policy", safe_get(outcome, "passed_policy"))
                        rendered = str(outcome)
                        set_optional_attr(task_span, "output.value", rendered)
                        set_optional_attr(task_span, "output.mime_type", "text/plain")
                        set_optional_attr(entry_span, "output.value", rendered)
                    for open_span in (task_span, entry_span):
                        open_span.set_status(Status(StatusCode.OK))
                    return outcome
                except Exception as exc:
                    message = str(exc)
                    # Mirror the failure on both spans, inner span first.
                    for open_span in (task_span, entry_span):
                        open_span.record_exception(exc)
                        open_span.set_status(Status(StatusCode.ERROR, message))
                    raise
class _ToolExecuteActionWrapper:
    """Wrap shell/tool execution performed by the benchmark agent.

    Each call runs inside an ``execute_tool bash`` span that carries the
    command as the tool-call arguments and the serialized result (or error
    details) as the tool-call result. Span bookkeeping is best-effort: a
    value that cannot be serialized never prevents the tool call from running
    and never turns a successful call into a failure.
    """

    def __init__(self, tracer: trace_api.Tracer):
        self._tracer = tracer

    @staticmethod
    def _attr_text(value) -> str:
        """Serialize ``value`` for a span attribute without raising."""
        try:
            return _json_attr(value)
        except Exception as err:  # noqa: BLE001 - telemetry must not break the tool call
            logger.debug("could not serialize tool span attribute: %s", err)
            return f"<unserializable {type(value).__name__}>"

    def __call__(self, wrapped, instance, args, kwargs):
        """Run ``wrapped`` inside a TOOL span and return its result unchanged.

        Exceptions raised by ``wrapped`` are recorded on the span and
        re-raised.
        """
        action = args[0] if args else kwargs.get("action", {})
        command = action.get("action") if isinstance(action, dict) else str(action)
        attrs = {
            gen_ai_attributes.GEN_AI_OPERATION_NAME: "execute_tool",
            gen_ai_attributes.GEN_AI_SYSTEM: SYSTEM_NAME,
            gen_ai_extended_attributes.GEN_AI_SPAN_KIND: "TOOL",
            "gen_ai.framework": SYSTEM_NAME,
            "gen_ai.tool.call.id": str(uuid4()),
            "gen_ai.tool.name": "bash",
            "gen_ai.tool.type": "function",
            "gen_ai.tool.description": "Execute a shell command in the benchmark environment",
            "gen_ai.tool.call.arguments": self._attr_text({"command": command}),
        }
        with self._tracer.start_as_current_span(
            name="execute_tool bash",
            kind=SpanKind.INTERNAL,
            attributes=attrs,
        ) as span:
            # Only the wrapped call sits inside this try, so a failure while
            # recording the result can no longer be mistaken for a tool error.
            try:
                result = wrapped(*args, **kwargs)
            except Exception as exc:
                span.record_exception(exc)
                span.set_attribute(
                    "gen_ai.tool.call.result",
                    self._attr_text({"error": str(exc), "error.type": type(exc).__name__}),
                )
                span.set_status(Status(StatusCode.ERROR, str(exc)))
                span.set_attribute("error.type", type(exc).__name__)
                raise

            span.set_attribute("gen_ai.tool.call.result", self._attr_text(result))
            span.set_status(Status(StatusCode.OK))
            return result
+ +"""CHAIN/workflow span wrapper for run_agent_on_problem.""" + +import logging + +from opentelemetry import trace as trace_api +from opentelemetry.instrumentation.slop_code.utils import ( + SYSTEM_NAME, + safe_get, + safe_get_nested, + set_optional_attr, +) +from opentelemetry.semconv._incubating.attributes import gen_ai_attributes +from opentelemetry.trace import SpanKind, Status, StatusCode +from opentelemetry.util.genai.extended_semconv import gen_ai_extended_attributes + +logger = logging.getLogger(__name__) + + +class _WorkflowWrapper: + """Wrapper for run_agent_on_problem to create workflow (CHAIN) span.""" + + def __init__(self, tracer: trace_api.Tracer): + self._tracer = tracer + + def __call__(self, wrapped, instance, args, kwargs): + # run_agent_on_problem(problem_config, problem_name, config, progress_queue, output_path) + problem_name = args[1] if len(args) > 1 else kwargs.get("problem_name", "unknown") + config = args[2] if len(args) > 2 else kwargs.get("config") + + span_name = f"chain {problem_name}" + + attrs = { + gen_ai_attributes.GEN_AI_OPERATION_NAME: "workflow", + gen_ai_attributes.GEN_AI_SYSTEM: SYSTEM_NAME, + gen_ai_extended_attributes.GEN_AI_SPAN_KIND: "CHAIN", + "gen_ai.framework": SYSTEM_NAME, + "input.value": str(problem_name), + "slop_code.problem.name": str(problem_name), + } + + # Extract optional attributes from config + if config is not None: + model_name = safe_get_nested(config, "model_def", "name") + set_optional_attr_dict(attrs, gen_ai_attributes.GEN_AI_REQUEST_MODEL, model_name) + + agent_type = safe_get_nested(config, "agent_config", "type") + set_optional_attr_dict(attrs, "slop_code.agent.type", agent_type) + + pass_policy = safe_get_nested(config, "pass_policy", "value") + if pass_policy is None: + pass_policy_obj = safe_get(config, "pass_policy") + if pass_policy_obj is not None and hasattr(pass_policy_obj, "value"): + pass_policy = pass_policy_obj.value + set_optional_attr_dict(attrs, "slop_code.pass_policy", pass_policy) 
+ + try: + with self._tracer.start_as_current_span( + name=span_name, + kind=SpanKind.INTERNAL, + attributes={k: v for k, v in attrs.items() if v is not None}, + ) as span: + try: + result = wrapped(*args, **kwargs) + + if isinstance(result, dict): + summary = result.get("summary") + if isinstance(summary, dict): + set_optional_attr( + span, "slop_code.state", summary.get("state") + ) + set_optional_attr( + span, + "slop_code.passed_policy", + summary.get("passed_policy"), + ) + set_optional_attr(span, "output.value", str(summary)) + + span.set_status(Status(StatusCode.OK)) + return result + except Exception as e: + span.record_exception(e) + span.set_status(Status(StatusCode.ERROR, str(e))) + raise + finally: + # Flush AFTER the `with` block so the workflow span itself + # is `on_end`-delivered to the SpanProcessor before we ask it + # to drain. run_agent_on_problem is the last meaningful work + # item inside the per-problem worker subprocess; once it + # returns, the process is reaped by ProcessPoolExecutor's + # shutdown which can short-circuit BatchSpanProcessor's + # atexit handler. Without this explicit flush the CHAIN span + # (and the tail batch of TASK/AGENT/STEP spans) gets dropped. 
def set_optional_attr_dict(attrs: dict, key: str, value, *, max_length: int = 1024) -> None:
    """Store ``value`` under ``key`` in ``attrs`` unless it is ``None``.

    Args:
        attrs: Attribute dictionary, updated in place.
        key: Attribute name to set.
        value: Attribute value; ``None`` leaves ``attrs`` untouched.
        max_length: Maximum number of characters kept for string values
            (non-negative). Longer strings are cut to this length; values
            that are not strings are stored unchanged.
    """
    if value is None:
        return
    if isinstance(value, str) and len(value) > max_length:
        value = value[:max_length]
    attrs[key] = value
def _make_module(name):
    """Build a real (non-mock) module object called ``name``.

    ``__package__`` is set to everything before the last dot of ``name``, or
    to ``name`` itself when it contains no dot.
    """
    module = types.ModuleType(name)
    parent, dot, _leaf = name.rpartition(".")
    if dot:
        module.__package__ = parent
    else:
        module.__package__ = name
    return module
{"status": "completed"} + + mod_run_agent.run_agent = run_agent + + # --- WORKFLOW: run_agent_on_problem --- + def run_agent_on_problem(*args, **kwargs): + return {"summary": {"state": "completed", "passed_policy": True}} + + mod_worker.run_agent_on_problem = run_agent_on_problem + # driver re-imports the worker name at module load time. This mock mirrors + # the same pattern so the instrumentor's driver-side patch has a target. + mod_driver.run_agent_on_problem = run_agent_on_problem + + # --- TASK: AgentRunner._run_checkpoint --- + class AgentRunner: + def __init__(self): + self.agent = MagicMock() + self.agent.usage = MagicMock() + self.agent.usage.net_tokens = MagicMock() + self.agent.usage.net_tokens.input = 100 + self.agent.usage.net_tokens.output = 50 + + def _run_checkpoint(self, checkpoint, checkpoint_save_dir, is_first_checkpoint=False): + result = MagicMock() + result.had_error = False + result.passed_policy = True + return result + + mod_runner.AgentRunner = AgentRunner + + # --- AGENT: Agent.run_checkpoint --- + class Agent: + def __init__(self, problem_name="test_problem"): + self.problem_name = problem_name + self.usage = MagicMock() + self.usage.net_tokens = MagicMock() + self.usage.net_tokens.input = 100 + self.usage.net_tokens.output = 50 + self.usage.steps = 0 + self.usage.cost = 0.05 + + def run_checkpoint(self, task): + result = MagicMock() + result.usage = self.usage + result.elapsed = 10.5 + return result + + mod_agent.Agent = Agent + + # --- STEP: MiniSWEAgent.agent_step --- + class MiniSWEAgent(Agent): + def __init__(self, problem_name="test_problem"): + super().__init__(problem_name) + + def agent_step(self): + return { + "token_usage": MagicMock(input=200, output=80, cache_read=50, cache_write=10), + "step_cost": 0.01, + } + + mod_miniswe.MiniSWEAgent = MiniSWEAgent + + # --- LLM: grade_file_async --- + async def grade_file_async(*args, **kwargs): + grades = [{"score": 8, "reasoning": "Good code"}] + response_data = { + "id": "resp-123", 
+ "usage": { + "prompt_tokens": 500, + "completion_tokens": 200, + "cache_read_input_tokens": 100, + "cache_creation_input_tokens": 50, + }, + } + return grades, response_data + + mod_router.grade_file_async = grade_file_async + + # Wire parent-child relationships + mod_slop_code.entrypoints = mod_entrypoints + mod_slop_code.agent_runner = mod_agent_runner + mod_slop_code.metrics = mod_metrics + mod_entrypoints.commands = mod_commands + mod_entrypoints.problem_runner = mod_problem_runner + mod_commands.run_agent = mod_run_agent + mod_problem_runner.worker = mod_worker + mod_problem_runner.driver = mod_driver + mod_agent_runner.runner = mod_runner + mod_agent_runner.agent = mod_agent + mod_agent_runner.agents = mod_agents + mod_agents.miniswe = mod_miniswe + mod_metrics.rubric = mod_rubric + mod_rubric.router = mod_router + + # Register all modules in sys.modules + modules = { + "slop_code": mod_slop_code, + "slop_code.entrypoints": mod_entrypoints, + "slop_code.entrypoints.commands": mod_commands, + "slop_code.entrypoints.commands.run_agent": mod_run_agent, + "slop_code.entrypoints.problem_runner": mod_problem_runner, + "slop_code.entrypoints.problem_runner.worker": mod_worker, + "slop_code.entrypoints.problem_runner.driver": mod_driver, + "slop_code.agent_runner": mod_agent_runner, + "slop_code.agent_runner.runner": mod_runner, + "slop_code.agent_runner.agent": mod_agent, + "slop_code.agent_runner.agents": mod_agents, + "slop_code.agent_runner.agents.miniswe": mod_miniswe, + "slop_code.metrics": mod_metrics, + "slop_code.metrics.rubric": mod_rubric, + "slop_code.metrics.rubric.router": mod_router, + } + + for name, mod in modules.items(): + sys.modules[name] = mod + + return modules + + +# Install mock modules before any instrumentation imports +_mock_modules = _create_mock_slop_code_modules() + + +@pytest.fixture(scope="function") +def span_exporter(): + from opentelemetry.sdk.trace.export.in_memory_span_exporter import ( + InMemorySpanExporter, + ) + + exporter 
= InMemorySpanExporter() + yield exporter + exporter.clear() + + +@pytest.fixture(scope="function") +def tracer_provider(span_exporter): + from opentelemetry.sdk.trace import TracerProvider + from opentelemetry.sdk.trace.export import SimpleSpanProcessor + + provider = TracerProvider() + provider.add_span_processor(SimpleSpanProcessor(span_exporter)) + return provider + + +@pytest.fixture(scope="function") +def instrument(tracer_provider): + from opentelemetry.instrumentation.slop_code import SlopCodeInstrumentor + + instrumentor = SlopCodeInstrumentor() + instrumentor.instrument( + tracer_provider=tracer_provider, + skip_dep_check=True, + ) + yield instrumentor + instrumentor.uninstrument() diff --git a/instrumentation-loongsuite/loongsuite-instrumentation-slop-code/tests/test_agent_span.py b/instrumentation-loongsuite/loongsuite-instrumentation-slop-code/tests/test_agent_span.py new file mode 100644 index 000000000..d372ba220 --- /dev/null +++ b/instrumentation-loongsuite/loongsuite-instrumentation-slop-code/tests/test_agent_span.py @@ -0,0 +1,102 @@ +# Copyright The OpenTelemetry Authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
class TestAgentSpan:
    """Verify that Agent.run_checkpoint produces an AGENT span."""

    @staticmethod
    def _invoke_agent_spans(span_exporter):
        """Return the finished spans whose operation name is ``invoke_agent``."""
        return [
            s
            for s in span_exporter.get_finished_spans()
            if s.attributes.get("gen_ai.operation.name") == "invoke_agent"
        ]

    def test_agent_span_created(self, span_exporter, instrument):
        """Agent.run_checkpoint should create an AGENT span."""
        import slop_code.agent_runner.agent as mod

        agent = mod.Agent(problem_name="file_backup")
        # Only the emitted span is inspected; the return value is not needed.
        agent.run_checkpoint("solve the bug")

        agent_spans = self._invoke_agent_spans(span_exporter)
        assert len(agent_spans) == 1

        span = agent_spans[0]
        assert span.name == "agent.Agent"
        assert span.attributes["gen_ai.system"] == "slop-code"
        assert span.attributes["gen_ai.span.kind"] == "AGENT"
        assert span.attributes["gen_ai.agent.name"] == "Agent"
        assert span.attributes["slop_code.problem.name"] == "file_backup"
        assert span.status.status_code == StatusCode.OK

    def test_agent_span_captures_usage(self, span_exporter, instrument):
        """AGENT span should capture token usage from result."""
        import slop_code.agent_runner.agent as mod

        agent = mod.Agent(problem_name="test_prob")
        agent.run_checkpoint("task")

        agent_spans = self._invoke_agent_spans(span_exporter)
        assert len(agent_spans) == 1
        span = agent_spans[0]

        assert "gen_ai.usage.input_tokens" in span.attributes
        assert "gen_ai.usage.output_tokens" in span.attributes
        assert span.attributes["gen_ai.usage.input_tokens"] == 100
        assert span.attributes["gen_ai.usage.output_tokens"] == 50

    def test_agent_span_error(self, span_exporter, tracer_provider):
        """Exception in Agent.run_checkpoint should produce error span."""
        import slop_code.agent_runner.agent as mod

        from opentelemetry.instrumentation.slop_code import SlopCodeInstrumentor

        class FailingAgent(mod.Agent):
            def run_checkpoint(self, task):
                raise TimeoutError("Agent timeout")

        # Swap the class in before instrumenting so the failing
        # implementation is what the instrumentor finds on the module.
        OriginalAgent = mod.Agent
        mod.Agent = FailingAgent

        instrumentor = SlopCodeInstrumentor()
        instrumentor.instrument(tracer_provider=tracer_provider, skip_dep_check=True)

        try:
            agent = mod.Agent(problem_name="test_prob")

            with pytest.raises(TimeoutError, match="Agent timeout"):
                agent.run_checkpoint("task")

            agent_spans = self._invoke_agent_spans(span_exporter)
            assert len(agent_spans) == 1
            span = agent_spans[0]
            assert span.status.status_code == StatusCode.ERROR
            assert span.attributes.get("error.type") == "TimeoutError"
        finally:
            instrumentor.uninstrument()
            mod.Agent = OriginalAgent
class TestEntrySpan:
    """Check the ENTRY span emitted around ``run_agent``."""

    def test_entry_span_created(self, span_exporter, instrument):
        """A successful run_agent call yields exactly one OK ENTRY span."""
        import slop_code.entrypoints.commands.run_agent as mod

        mod.run_agent()

        found = []
        for finished in span_exporter.get_finished_spans():
            if finished.attributes.get("gen_ai.span.kind") == "ENTRY":
                found.append(finished)
        assert len(found) == 1

        entry = found[0]
        assert entry.name == "slop-code.enter"
        assert entry.attributes["gen_ai.system"] == "slop-code"
        assert entry.attributes["gen_ai.operation.name"] == "enter"
        assert entry.status.status_code == StatusCode.OK

    def test_entry_span_error(self, span_exporter, tracer_provider):
        """A run_agent call that raises yields exactly one ERROR ENTRY span."""
        import slop_code.entrypoints.commands.run_agent as mod

        from opentelemetry.instrumentation.slop_code import SlopCodeInstrumentor

        # Keep the real function so it can be put back afterwards.
        saved_run_agent = mod.run_agent

        def failing_run_agent(*args, **kwargs):
            raise RuntimeError("Config error")

        mod.run_agent = failing_run_agent

        instrumentor = SlopCodeInstrumentor()
        instrumentor.instrument(tracer_provider=tracer_provider, skip_dep_check=True)

        try:
            with pytest.raises(RuntimeError, match="Config error"):
                mod.run_agent()

            found = []
            for finished in span_exporter.get_finished_spans():
                if finished.attributes.get("gen_ai.span.kind") == "ENTRY":
                    found.append(finished)
            assert len(found) == 1
            assert found[0].status.status_code == StatusCode.ERROR
        finally:
            instrumentor.uninstrument()
            mod.run_agent = saved_run_agent
class TestSpanHierarchy:
    """Verify parent-child relationships between spans.

    Each test replaces the function behind an instrumented wrapper with one
    that calls the next instrumented layer, then checks that the inner span
    shares the outer span's trace id and names it as parent.
    """

    def test_entry_is_parent_of_workflow(self, span_exporter, instrument):
        """ENTRY span should be parent of workflow span when called inline."""
        import slop_code.entrypoints.commands.run_agent as entry_mod
        import slop_code.entrypoints.problem_runner.worker as worker_mod

        # Patch run_agent to call run_agent_on_problem internally
        # NOTE(review): relies on the instrumented callable exposing a
        # writable ``__wrapped__`` attribute - confirm against the
        # instrumentor's wrapping mechanism.
        original = entry_mod.run_agent.__wrapped__

        def run_with_workflow(*args, **kwargs):
            # Optional config sections are explicitly None so the workflow
            # wrapper does not read auto-created MagicMock children.
            config = MagicMock()
            config.model_def = None
            config.agent_config = None
            config.pass_policy = None
            return worker_mod.run_agent_on_problem(
                MagicMock(), "test_problem", config, MagicMock(), "/tmp"
            )

        entry_mod.run_agent.__wrapped__ = run_with_workflow

        try:
            entry_mod.run_agent()

            spans = span_exporter.get_finished_spans()
            entry_spans = [
                s for s in spans
                if s.attributes.get("gen_ai.span.kind") == "ENTRY"
            ]
            workflow_spans = [
                s for s in spans
                if s.attributes.get("gen_ai.operation.name") == "workflow"
            ]

            assert len(entry_spans) == 1
            assert len(workflow_spans) == 1

            entry_span = entry_spans[0]
            workflow_span = workflow_spans[0]

            # workflow should be child of entry
            assert workflow_span.context.trace_id == entry_span.context.trace_id
            assert workflow_span.parent is not None
            assert workflow_span.parent.span_id == entry_span.context.span_id
        finally:
            # Always restore the real function so later tests are unaffected.
            entry_mod.run_agent.__wrapped__ = original

    def test_workflow_is_parent_of_task(self, span_exporter, instrument):
        """Workflow span should be parent of task span when called inline."""
        import slop_code.agent_runner.runner as runner_mod
        import slop_code.entrypoints.problem_runner.worker as worker_mod

        # Same ``__wrapped__`` swap as above, one layer further down.
        original = worker_mod.run_agent_on_problem.__wrapped__

        def workflow_with_task(*args, **kwargs):
            # Run one checkpoint from inside the workflow call.
            r = runner_mod.AgentRunner()
            checkpoint = MagicMock()
            checkpoint.name = "cp1"
            checkpoint.order = 1
            r._run_checkpoint(checkpoint, "/tmp", True)
            return {"summary": {"state": "completed", "passed_policy": True}}

        worker_mod.run_agent_on_problem.__wrapped__ = workflow_with_task

        try:
            config = MagicMock()
            config.model_def = None
            config.agent_config = None
            config.pass_policy = None
            worker_mod.run_agent_on_problem(
                MagicMock(), "prob1", config, MagicMock(), "/tmp"
            )

            spans = span_exporter.get_finished_spans()
            workflow_spans = [
                s for s in spans
                if s.attributes.get("gen_ai.operation.name") == "workflow"
            ]
            task_spans = [
                s for s in spans
                if s.attributes.get("gen_ai.operation.name") == "run_task"
            ]

            assert len(workflow_spans) == 1
            assert len(task_spans) == 1

            workflow_span = workflow_spans[0]
            task_span = task_spans[0]

            # task should be child of workflow
            assert task_span.context.trace_id == workflow_span.context.trace_id
            assert task_span.parent is not None
            assert task_span.parent.span_id == workflow_span.context.span_id
        finally:
            # Always restore the real function so later tests are unaffected.
            worker_mod.run_agent_on_problem.__wrapped__ = original
# Copyright The OpenTelemetry Authors
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Tests for LLM span (grade_file_async - Rubric Judge)."""

from unittest.mock import MagicMock

import pytest

from opentelemetry.trace import SpanKind, StatusCode


def _llm_spans(span_exporter):
    """Return the finished spans whose ``gen_ai.span.kind`` is ``"LLM"``."""
    return [
        s
        for s in span_exporter.get_finished_spans()
        if s.attributes.get("gen_ai.span.kind") == "LLM"
    ]


def _provider(name):
    """Return a mock provider whose ``value`` attribute is ``name``."""
    provider = MagicMock()
    provider.value = name
    return provider


@pytest.mark.asyncio
class TestLLMSpan:
    """Verify that grade_file_async produces an LLM span."""

    async def test_llm_span_created(self, span_exporter, instrument):
        """grade_file_async should create an LLM span."""
        import slop_code.metrics.rubric.router as mod

        # Unpacking also fails loudly if the instrumented call stops
        # returning a two-item result.
        _grades, _resp = await mod.grade_file_async(
            "prompt_prefix",
            "criteria_text",
            "test.py",
            "anthropic/claude-3.5-sonnet",
            _provider("openrouter"),
            0.7,
        )

        llm_spans = _llm_spans(span_exporter)
        assert len(llm_spans) == 1

        span = llm_spans[0]
        assert span.name == "chat anthropic/claude-3.5-sonnet"
        assert span.attributes["gen_ai.system"] == "openrouter"
        assert span.attributes["gen_ai.operation.name"] == "chat"
        assert (
            span.attributes["gen_ai.request.model"]
            == "anthropic/claude-3.5-sonnet"
        )
        assert span.attributes["gen_ai.request.temperature"] == 0.7
        assert span.kind == SpanKind.CLIENT
        assert span.status.status_code == StatusCode.OK

    async def test_llm_span_captures_usage(self, span_exporter, instrument):
        """LLM span should capture token usage from response."""
        import slop_code.metrics.rubric.router as mod

        await mod.grade_file_async(
            "prefix",
            "criteria",
            "file.py",
            "anthropic/claude-3.5-sonnet",
            _provider("openrouter"),
            0.5,
        )

        llm_spans = _llm_spans(span_exporter)
        assert len(llm_spans) == 1
        span = llm_spans[0]

        # NOTE(review): the expected numbers presumably mirror the stubbed
        # response supplied by the test fixtures - confirm against conftest.
        assert span.attributes["gen_ai.usage.input_tokens"] == 500
        assert span.attributes["gen_ai.usage.output_tokens"] == 200
        assert span.attributes["gen_ai.usage.cache_read.input_tokens"] == 100
        assert (
            span.attributes["gen_ai.usage.cache_creation.input_tokens"] == 50
        )
        assert span.attributes["gen_ai.response.id"] == "resp-123"

    async def test_llm_span_error(self, span_exporter, tracer_provider):
        """Exception in grade_file_async should produce an error LLM span."""
        import slop_code.metrics.rubric.router as mod

        from opentelemetry.instrumentation.slop_code import (
            SlopCodeInstrumentor,
        )

        async def failing_grade(*args, **kwargs):
            raise ConnectionError("API unreachable")

        original = mod.grade_file_async
        # The stand-in goes in before instrumenting, presumably so the
        # instrumentor wraps the failing callable.
        mod.grade_file_async = failing_grade
        try:
            # Everything after the patch runs under try/finally so that a
            # failure in instrument() cannot leave the module patched for
            # later tests.
            instrumentor = SlopCodeInstrumentor()
            instrumentor.instrument(
                tracer_provider=tracer_provider, skip_dep_check=True
            )
            try:
                with pytest.raises(ConnectionError, match="API unreachable"):
                    await mod.grade_file_async(
                        "prefix",
                        "criteria",
                        "file.py",
                        "us.anthropic.claude-3-5-sonnet-20241022-v2:0",
                        _provider("bedrock"),
                        0.3,
                    )

                llm_spans = _llm_spans(span_exporter)
                assert len(llm_spans) == 1
                assert llm_spans[0].status.status_code == StatusCode.ERROR
                assert llm_spans[0].attributes["gen_ai.system"] == "bedrock"
            finally:
                instrumentor.uninstrument()
        finally:
            mod.grade_file_async = original

    async def test_llm_span_bedrock_provider(self, span_exporter, instrument):
        """LLM span with bedrock provider should use 'bedrock' as system."""
        import slop_code.metrics.rubric.router as mod

        await mod.grade_file_async(
            "prefix",
            "criteria",
            "file.py",
            "us.anthropic.claude-3-5-sonnet-20241022-v2:0",
            _provider("bedrock"),
            0.5,
        )

        llm_spans = _llm_spans(span_exporter)
        assert len(llm_spans) == 1
        assert llm_spans[0].attributes["gen_ai.system"] == "bedrock"
# Copyright The OpenTelemetry Authors
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Tests for STEP span (MiniSWEAgent.agent_step)."""

from unittest.mock import MagicMock

import pytest

from opentelemetry.trace import StatusCode


def _step_spans(span_exporter):
    """Return the finished spans whose ``gen_ai.span.kind`` is ``"STEP"``."""
    return [
        s
        for s in span_exporter.get_finished_spans()
        if s.attributes.get("gen_ai.span.kind") == "STEP"
    ]


class TestStepSpan:
    """Verify that MiniSWEAgent.agent_step produces a STEP span."""

    def test_step_span_created(self, span_exporter, instrument):
        """agent_step should create a STEP span with the core attributes."""
        import slop_code.agent_runner.agents.miniswe as mod

        agent = mod.MiniSWEAgent(problem_name="test_prob")
        agent.agent_step()

        step_spans = _step_spans(span_exporter)
        assert len(step_spans) == 1

        span = step_spans[0]
        assert span.name == "react.step.1"
        assert span.attributes["gen_ai.system"] == "slop-code"
        assert span.attributes["gen_ai.operation.name"] == "react"
        assert span.attributes["gen_ai.react.round"] == 1
        assert span.status.status_code == StatusCode.OK

    def test_step_span_has_token_usage(self, span_exporter, instrument):
        """STEP span should capture token usage from result."""
        import slop_code.agent_runner.agents.miniswe as mod

        agent = mod.MiniSWEAgent(problem_name="test_prob")
        agent.agent_step()

        step_spans = _step_spans(span_exporter)
        assert len(step_spans) == 1
        span = step_spans[0]

        # NOTE(review): the expected numbers presumably mirror the stubbed
        # agent supplied by the test fixtures - confirm against conftest.
        assert span.attributes["gen_ai.usage.input_tokens"] == 200
        assert span.attributes["gen_ai.usage.output_tokens"] == 80
        assert span.attributes["gen_ai.usage.cache_read.input_tokens"] == 50
        assert (
            span.attributes["gen_ai.usage.cache_creation.input_tokens"] == 10
        )

    def test_step_span_increments_round(self, span_exporter, instrument):
        """A step taken after two recorded steps should be round 3."""
        import slop_code.agent_runner.agents.miniswe as mod

        agent = mod.MiniSWEAgent(problem_name="test_prob")
        # Simulate steps=2 already completed
        agent.usage.steps = 2
        agent.agent_step()

        step_spans = _step_spans(span_exporter)
        assert len(step_spans) == 1
        assert step_spans[0].name == "react.step.3"
        assert step_spans[0].attributes["gen_ai.react.round"] == 3

    def test_step_span_error(self, span_exporter, tracer_provider):
        """Exception in agent_step should produce an error STEP span."""
        import slop_code.agent_runner.agents.miniswe as mod

        from opentelemetry.instrumentation.slop_code import (
            SlopCodeInstrumentor,
        )

        class FailingMiniSWE(mod.MiniSWEAgent):
            def agent_step(self):
                raise RuntimeError("LimitsExceeded")

        original_class = mod.MiniSWEAgent
        # The failing subclass goes in before instrumenting, presumably so
        # the instrumentor wraps its agent_step.
        mod.MiniSWEAgent = FailingMiniSWE
        try:
            # Everything after the swap runs under try/finally so that a
            # failure in instrument() cannot leave the module patched for
            # later tests.
            instrumentor = SlopCodeInstrumentor()
            instrumentor.instrument(
                tracer_provider=tracer_provider, skip_dep_check=True
            )
            try:
                agent = mod.MiniSWEAgent(problem_name="test_prob")

                with pytest.raises(RuntimeError, match="LimitsExceeded"):
                    agent.agent_step()

                step_spans = _step_spans(span_exporter)
                assert len(step_spans) == 1
                span = step_spans[0]
                assert span.status.status_code == StatusCode.ERROR
                assert span.attributes["gen_ai.react.finish_reason"] == "error"
            finally:
                instrumentor.uninstrument()
        finally:
            mod.MiniSWEAgent = original_class

    def test_step_span_finish_reason_stop(self, span_exporter, instrument):
        """Successful step should have finish_reason='stop'."""
        import slop_code.agent_runner.agents.miniswe as mod

        agent = mod.MiniSWEAgent(problem_name="test_prob")
        agent.agent_step()

        step_spans = _step_spans(span_exporter)
        # Check the count first so a missing span fails as an assertion
        # rather than an IndexError on the lookup below.
        assert len(step_spans) == 1
        assert step_spans[0].attributes["gen_ai.react.finish_reason"] == "stop"
# Copyright The OpenTelemetry Authors
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Tests for TASK span (AgentRunner._run_checkpoint)."""

from unittest.mock import MagicMock

import pytest

from opentelemetry.trace import StatusCode


def _task_spans(span_exporter):
    """Return finished spans whose operation name is ``"run_task"``."""
    return [
        s
        for s in span_exporter.get_finished_spans()
        if s.attributes.get("gen_ai.operation.name") == "run_task"
    ]


def _checkpoint(name, order):
    """Return a mock checkpoint with the given ``name`` and ``order``."""
    checkpoint = MagicMock()
    # ``name`` is assigned after construction because MagicMock(name=...)
    # names the mock itself instead of setting a ``name`` attribute.
    checkpoint.name = name
    checkpoint.order = order
    return checkpoint


class TestTaskSpan:
    """Verify that AgentRunner._run_checkpoint produces a TASK span."""

    def test_task_span_created(self, span_exporter, instrument):
        """_run_checkpoint should create a task span."""
        import slop_code.agent_runner.runner as mod

        runner = mod.AgentRunner()
        runner._run_checkpoint(
            _checkpoint("checkpoint_1", 1), "/tmp/save", True
        )

        task_spans = _task_spans(span_exporter)
        assert len(task_spans) == 1

        span = task_spans[0]
        assert span.name == "task.checkpoint_1"
        assert span.attributes["gen_ai.system"] == "slop-code"
        assert span.attributes["gen_ai.span.kind"] == "TASK"
        assert span.attributes["slop_code.checkpoint.name"] == "checkpoint_1"
        assert span.attributes["slop_code.checkpoint.order"] == 1
        assert span.attributes["slop_code.is_first_checkpoint"] is True
        assert span.status.status_code == StatusCode.OK

    def test_task_span_error(self, span_exporter, tracer_provider):
        """Exception in _run_checkpoint should produce an error task span."""
        import slop_code.agent_runner.runner as mod

        from opentelemetry.instrumentation.slop_code import (
            SlopCodeInstrumentor,
        )

        class FailingRunner(mod.AgentRunner):
            def _run_checkpoint(
                self,
                checkpoint,
                checkpoint_save_dir,
                is_first_checkpoint=False,
            ):
                raise RuntimeError("Checkpoint failed")

        original_runner = mod.AgentRunner
        # The failing subclass goes in before instrumenting, presumably so
        # the instrumentor wraps its _run_checkpoint.
        mod.AgentRunner = FailingRunner
        try:
            # Everything after the swap runs under try/finally so that a
            # failure in instrument() cannot leave the module patched for
            # later tests.
            instrumentor = SlopCodeInstrumentor()
            instrumentor.instrument(
                tracer_provider=tracer_provider, skip_dep_check=True
            )
            try:
                runner = mod.AgentRunner()

                with pytest.raises(RuntimeError, match="Checkpoint failed"):
                    runner._run_checkpoint(
                        _checkpoint("bad_checkpoint", 2), "/tmp/save", False
                    )

                task_spans = _task_spans(span_exporter)
                assert len(task_spans) == 1
                assert task_spans[0].status.status_code == StatusCode.ERROR
            finally:
                instrumentor.uninstrument()
        finally:
            mod.AgentRunner = original_runner

    def test_task_span_not_first_checkpoint(self, span_exporter, instrument):
        """Subsequent checkpoint should have is_first_checkpoint=False."""
        import slop_code.agent_runner.runner as mod

        runner = mod.AgentRunner()
        runner._run_checkpoint(
            _checkpoint("checkpoint_2", 2), "/tmp/save", False
        )

        task_spans = _task_spans(span_exporter)
        assert len(task_spans) == 1
        assert (
            task_spans[0].attributes["slop_code.is_first_checkpoint"] is False
        )
# Copyright The OpenTelemetry Authors
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Tests for CHAIN/workflow span (run_agent_on_problem)."""

from unittest.mock import MagicMock

import pytest

from opentelemetry.trace import StatusCode


def _workflow_spans(span_exporter):
    """Return finished spans whose operation name is ``"workflow"``."""
    return [
        s
        for s in span_exporter.get_finished_spans()
        if s.attributes.get("gen_ai.operation.name") == "workflow"
    ]


class TestWorkflowSpan:
    """Verify that run_agent_on_problem produces a workflow span."""

    def test_workflow_span_created(self, span_exporter, instrument):
        """run_agent_on_problem should create a workflow span."""
        import slop_code.entrypoints.problem_runner.worker as mod

        config = MagicMock()
        config.model_def = MagicMock()
        config.model_def.name = "anthropic/claude-3.5-sonnet"
        config.agent_config = MagicMock()
        config.agent_config.type = "claude_code"
        config.pass_policy = MagicMock()
        config.pass_policy.value = "any"

        mod.run_agent_on_problem(
            MagicMock(),  # problem_config
            "file_backup",  # problem_name
            config,  # config
            MagicMock(),  # progress_queue
            "/tmp/output",  # output_path
        )

        workflow_spans = _workflow_spans(span_exporter)
        assert len(workflow_spans) == 1

        span = workflow_spans[0]
        assert span.name == "workflow.file_backup"
        assert span.attributes["gen_ai.system"] == "slop-code"
        assert span.attributes["gen_ai.span.kind"] == "CHAIN"
        assert span.attributes["slop_code.problem.name"] == "file_backup"
        assert (
            span.attributes["gen_ai.request.model"]
            == "anthropic/claude-3.5-sonnet"
        )
        assert span.attributes["slop_code.agent.type"] == "claude_code"
        assert span.status.status_code == StatusCode.OK

    def test_workflow_span_error(self, span_exporter, tracer_provider):
        """Exception in run_agent_on_problem should produce error workflow span."""
        import slop_code.entrypoints.problem_runner.worker as mod

        from opentelemetry.instrumentation.slop_code import (
            SlopCodeInstrumentor,
        )

        def failing_worker(*args, **kwargs):
            raise ValueError("Problem not found")

        original = mod.run_agent_on_problem
        # The stand-in goes in before instrumenting, presumably so the
        # instrumentor wraps the failing callable.
        mod.run_agent_on_problem = failing_worker
        try:
            # Everything after the patch runs under try/finally so that a
            # failure in instrument() cannot leave the module patched for
            # later tests.
            instrumentor = SlopCodeInstrumentor()
            instrumentor.instrument(
                tracer_provider=tracer_provider, skip_dep_check=True
            )
            try:
                with pytest.raises(ValueError, match="Problem not found"):
                    mod.run_agent_on_problem(
                        MagicMock(),
                        "missing_problem",
                        MagicMock(),
                        MagicMock(),
                        "/tmp",
                    )

                workflow_spans = _workflow_spans(span_exporter)
                assert len(workflow_spans) == 1
                assert (
                    workflow_spans[0].status.status_code == StatusCode.ERROR
                )
            finally:
                instrumentor.uninstrument()
        finally:
            mod.run_agent_on_problem = original

    def test_workflow_span_with_none_config_fields(
        self, span_exporter, instrument
    ):
        """Workflow span should handle None config fields gracefully."""
        import slop_code.entrypoints.problem_runner.worker as mod

        config = MagicMock()
        config.model_def = None
        config.agent_config = None
        config.pass_policy = None

        mod.run_agent_on_problem(
            MagicMock(), "test_problem", config, MagicMock(), "/tmp"
        )

        workflow_spans = _workflow_spans(span_exporter)
        assert len(workflow_spans) == 1
        span = workflow_spans[0]
        assert span.attributes["slop_code.problem.name"] == "test_problem"
        # With no model definition the model attribute is absent, not None.
        assert "gen_ai.request.model" not in span.attributes
+ + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." 
+ + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. 
You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. 
+ + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. 
In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. + + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. 
+ + Copyright [yyyy] [name of copyright owner] + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. diff --git a/instrumentation-loongsuite/loongsuite-instrumentation-terminus2/pyproject.toml b/instrumentation-loongsuite/loongsuite-instrumentation-terminus2/pyproject.toml new file mode 100644 index 000000000..62aaa6e5a --- /dev/null +++ b/instrumentation-loongsuite/loongsuite-instrumentation-terminus2/pyproject.toml @@ -0,0 +1,52 @@ +[build-system] +requires = ["hatchling"] +build-backend = "hatchling.build" + +[project] +name = "loongsuite-instrumentation-terminus2" +dynamic = ["version"] +description = "LoongSuite Terminus2 Instrumentation" +license = "Apache-2.0" +requires-python = ">=3.8" +authors = [ + { name = "OpenTelemetry Authors", email = "cncf-opentelemetry-contributors@lists.cncf.io" }, +] +classifiers = [ + "Development Status :: 4 - Beta", + "Intended Audience :: Developers", + "License :: OSI Approved :: Apache Software License", + "Programming Language :: Python", + "Programming Language :: Python :: 3", + "Programming Language :: Python :: 3.8", + "Programming Language :: Python :: 3.9", + "Programming Language :: Python :: 3.10", + "Programming Language :: Python :: 3.11", + "Programming Language :: Python :: 3.12", +] +dependencies = [ + "wrapt >= 1.0.0, < 2.0.0", +] + +[project.optional-dependencies] +instruments = [ + "terminal-bench >= 0.1.0", +] + +[project.entry-points.opentelemetry_instrumentor] +terminus2 = 
"opentelemetry.instrumentation.terminus2:Terminus2Instrumentor" + +[project.urls] +Homepage = "https://github.com/alibaba/loongsuite-python-agent/tree/main/instrumentation-loongsuite/loongsuite-instrumentation-terminus2" +Repository = "https://github.com/alibaba/loongsuite-python-agent" + +[tool.hatch.version] +path = "src/opentelemetry/instrumentation/terminus2/version.py" + +[tool.hatch.build.targets.sdist] +include = [ + "/src", + "/tests", +] + +[tool.hatch.build.targets.wheel] +packages = ["src/opentelemetry"] diff --git a/instrumentation-loongsuite/loongsuite-instrumentation-terminus2/src/opentelemetry/instrumentation/terminus2/__init__.py b/instrumentation-loongsuite/loongsuite-instrumentation-terminus2/src/opentelemetry/instrumentation/terminus2/__init__.py new file mode 100644 index 000000000..f5d018885 --- /dev/null +++ b/instrumentation-loongsuite/loongsuite-instrumentation-terminus2/src/opentelemetry/instrumentation/terminus2/__init__.py @@ -0,0 +1,826 @@ +# Copyright The OpenTelemetry Authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +""" +OpenTelemetry Terminus2 Instrumentation + +Provides automatic instrumentation for the terminus-2 agent from terminal-bench +via external monkey patching (no upstream changes required). 
+ +Span hierarchy & semantic mapping (strictly follows ARMS gen-ai semantic +conventions, see ``arms_docs/trace/gen-ai.md``): + + enter_ai_application_system (ENTRY / enter) + └── invoke_agent terminus-2 (AGENT / invoke_agent) + └── react step (STEP / react) ── episode N + ├── (LLM span produced by ``opentelemetry-instrumentation-litellm``) + ├── run_task parse_response (TASK / run_task) + ├── chain summarize (CHAIN / task) ── on overflow + └── execute_tool terminal (TOOL / execute_tool) + +LLM spans are intentionally **not** produced by this package. The underlying +``LiteLLM.call`` invokes ``litellm.completion`` which is already traced by +``opentelemetry-instrumentation-litellm``; emitting another span here would +duplicate that record. + +Patch targets (all monkey-patched via ``wrapt.wrap_function_wrapper``): + + P0 Terminus2.perform_task → ENTRY span (application entry) + P0 Terminus2._run_agent_loop → AGENT span + episode lifecycle + P0 Terminus2._execute_commands → TOOL span + P1 Terminus2._handle_llm_interaction → STEP span (per ReAct iteration) + P1 TerminusJSONPlainParser.parse_response / + TerminusXMLPlainParser.parse_response → TASK span + P2 Terminus2._summarize → CHAIN span (handoff) +""" + +import contextvars +import json +import logging +from typing import Any, Collection + +from opentelemetry import context as context_api +from opentelemetry import trace as trace_api +from opentelemetry.instrumentation.instrumentor import BaseInstrumentor +from opentelemetry.instrumentation.utils import unwrap +from opentelemetry.trace import SpanKind, Status, StatusCode +from wrapt import wrap_function_wrapper + +from aliyun.semconv.trace_v2 import ( + CommonAttributes, + GenAiOperationName, + GenAiSpanKind, + GenAiToolType, + LLMAttributes, + ToolAttributes, +) + +from aliyun.sdk.extension.arms.self_monitor.self_monitor_decorator import hook_advice + +from opentelemetry.instrumentation.terminus2.package import _instruments + +logger = logging.getLogger(__name__) 
# ── Framework / agent identifiers ────────────────────────────────────────────
# Fixed names reported on the spans emitted by this module.
_FRAMEWORK = "terminal-bench"
_AGENT_NAME = "terminus-2"
_TERMINAL_TOOL_NAME = "terminal"
_TERMINAL_TOOL_DESCRIPTION = "Send keystrokes to a tmux terminal session"

# Spec-defined tool I/O attribute keys (not yet exposed as constants in
# aliyun.semconv.trace_v2.ToolAttributes; see gen-ai.md §Tool).
_GEN_AI_TOOL_CALL_ARGUMENTS = "gen_ai.tool.call.arguments"
_GEN_AI_TOOL_CALL_RESULT = "gen_ai.tool.call.result"

# Message content attributes. These are not exposed by
# aliyun.semconv.trace_v2.CommonAttributes in all supported versions.
_GEN_AI_INPUT_MESSAGES = "gen_ai.input.messages"
_GEN_AI_OUTPUT_MESSAGES = "gen_ai.output.messages"

# ── Span kind / operation values not present in trace_v2 enums ───────────────
# Used as plain strings for the ENTRY and STEP spans and for the operation
# names of the ENTRY, STEP, TASK and CHAIN spans respectively.
_SPAN_KIND_ENTRY = "ENTRY"
_SPAN_KIND_STEP = "STEP"
_OP_ENTER = "enter"
_OP_REACT = "react"
_OP_RUN_TASK = "run_task"
_OP_TASK = "task"

# ── ReAct extension attributes (Alibaba Cloud extension spec) ────────────────
# Set on STEP spans: the 1-based iteration number and why the step ended.
_GEN_AI_REACT_ROUND = "gen_ai.react.round"
_GEN_AI_REACT_FINISH_REASON = "gen_ai.react.finish_reason"

# ── Content capture ─────────────────────────────────────────────────────────
# Inputs / outputs (instruction text, terminal keystrokes, terminal output,
# AgentResult summary) are captured **unconditionally and untruncated** —
# they are the primary observability signal for terminus-2. If full content
# is undesirable in a given deployment, configure exporter-side filtering or
# attribute-length limits in the SDK instead.
+ + +def _commands_to_arguments_json(commands) -> str: + """Serialize a list of ``Command`` objects into a JSON string for + ``gen_ai.tool.call.arguments``.""" + serialized = [] + for cmd in commands: + serialized.append({ + "keystrokes": getattr(cmd, "keystrokes", ""), + "duration_sec": getattr(cmd, "duration_sec", None), + }) + try: + return json.dumps(serialized, ensure_ascii=False) + except Exception: + return str(serialized) + + +def _text_messages_json(role: str, content: Any) -> str: + """Serialize a single text message using the GenAI message schema.""" + message = { + "role": role, + "parts": [{"type": "text", "content": str(content)}], + } + try: + return json.dumps([message], ensure_ascii=False, separators=(",", ":")) + except Exception: + return str([message]) + + +def _semconv_value(value: Any) -> Any: + """Return enum.value when present, otherwise the value itself.""" + return getattr(value, "value", value) + +# ── ReAct step lifecycle tracked via contextvars ──────────────────────────── +# A STEP span stays open across `_handle_llm_interaction` ⇒ `_execute_commands` +# so both become its children. It is closed when the next iteration starts or +# when `_run_agent_loop` returns. 
# Context-local state for the currently open STEP span: the span itself, the
# context token returned when it was attached, and the running round number.
_current_step_span = contextvars.ContextVar(
    "terminus2_current_step_span", default=None
)
_current_step_token = contextvars.ContextVar(
    "terminus2_current_step_token", default=None
)
_react_round_counter = contextvars.ContextVar(
    "terminus2_react_round_counter", default=0
)


# The annotation is quoted on purpose: the package declares
# ``requires-python = ">=3.8"`` and an unquoted ``str | None`` is evaluated at
# definition time, which raises ``TypeError`` before Python 3.10.
def _end_current_step(finish_reason: "str | None" = None) -> None:
    """End the active ReAct STEP span (if any) and detach its context.

    Args:
        finish_reason: Value for ``gen_ai.react.finish_reason``. It is only
            applied when the span does not already carry that attribute, so
            a specific reason recorded while the step ran ("complete",
            "parse_error", "error") is not replaced by the generic
            "next_round" / "loop_end" passed by the lifecycle callers.
    """
    span = _current_step_span.get()
    token = _current_step_token.get()
    if span is not None:
        if finish_reason:
            # SDK spans expose a read-only ``attributes`` mapping. Spans
            # without it (e.g. non-recording ones) fall back to the old
            # unconditional behaviour.
            recorded = getattr(span, "attributes", None) or {}
            if _GEN_AI_REACT_FINISH_REASON not in recorded:
                span.set_attribute(_GEN_AI_REACT_FINISH_REASON, finish_reason)
        span.end()
        _current_step_span.set(None)
    if token is not None:
        context_api.detach(token)
        _current_step_token.set(None)


# Ordered (substrings, provider) pairs; the first pair with any substring
# present in the lower-cased model name wins.
_PROVIDER_KEYWORDS = (
    (("gpt", "o1-", "o3-", "o4-"), "openai"),
    (("claude", "anthropic"), "anthropic"),
    (("gemini",), "google"),
    (("llama", "meta"), "meta"),
    (("mistral",), "mistral"),
    (("qwen",), "alibaba"),
    (("deepseek",), "deepseek"),
)


def _infer_provider_name(model_name: str) -> str:
    """Infer ``gen_ai.provider.name`` from a model identifier string.

    Returns the provider of the first matching keyword group, otherwise the
    text before the first ``/`` in ``model_name``, otherwise ``"unknown"``
    (also returned for an empty / falsy name).
    """
    if not model_name:
        return "unknown"
    lower = model_name.lower()
    for keywords, provider in _PROVIDER_KEYWORDS:
        if any(keyword in lower for keyword in keywords):
            return provider
    if "/" in model_name:
        return model_name.split("/", 1)[0]
    return "unknown"


# Sentinel attribute attached to every target we successfully wrap. Stored
# on the target callable itself (not in module-level state) so that
# duplicate wraps are detected even if this package is loaded as multiple
# module instances (e.g. wheel install + ``pip install -e`` source, or
# under different sys.path roots), or if ``_instrument()`` is invoked
# twice via auto-loader + manual call.
_TERMINUS2_MARKER = "_otel_terminus2_wrapped"


def _resolve_target(module: str, name: str):
    """Resolve ``module.name`` (where ``name`` may be ``Class.method``).

    Returns ``(parent, attr_name, current_value)`` where ``current_value``
    is ``None`` if the final attribute is absent. Raises if the module
    cannot be imported or an intermediate attribute is missing.
    """
    from importlib import import_module

    parent = import_module(module)
    *owners, attr = name.split(".")
    for owner in owners:
        parent = getattr(parent, owner)
    return parent, attr, getattr(parent, attr, None)


def _try_wrap(module: str, name: str, wrapper) -> None:
    """Wrap ``module.name`` with ``wrapper`` exactly once.

    Idempotency is enforced via a sentinel attribute attached to the
    target — robust against multiple module instances of this package and
    repeated ``_instrument()`` invocations. Every failure is logged and
    swallowed so that a missing or changed target never breaks the host
    application.
    """
    try:
        parent, attr, current = _resolve_target(module, name)
    except Exception as e:
        logger.warning("Could not resolve %s.%s: %s", module, name, e)
        return

    if current is None:
        logger.warning("%s.%s not found", module, name)
        return

    if getattr(current, _TERMINUS2_MARKER, False):
        logger.debug(
            "%s.%s already wrapped by terminus2 instrumentation, skipping",
            module,
            name,
        )
        return

    try:
        wrap_function_wrapper(module=module, name=name, wrapper=wrapper)
    except Exception as e:
        logger.warning("Could not wrap %s.%s: %s", module, name, e)
        return

    # Mark the freshly installed wrapper. wrapt's FunctionWrapper proxies
    # attribute writes to the underlying wrapped object, but reading the
    # attribute back through the proxy returns the same value, so a
    # subsequent ``getattr`` check on either layer detects the marker.
    new_value = getattr(parent, attr, None)
    if new_value is not None:
        try:
            setattr(new_value, _TERMINUS2_MARKER, True)
        except Exception as e:
            logger.debug("Could not mark %s.%s: %s", module, name, e)


def _try_unwrap(module: str, name: str) -> None:
    """Reverse of :func:`_try_wrap`; a no-op for targets we did not mark."""
    try:
        parent, attr, current = _resolve_target(module, name)
    except Exception:
        return

    if current is None or not getattr(current, _TERMINUS2_MARKER, False):
        return

    # Clear the marker on the underlying object first (FunctionWrapper
    # forwards delattr to the wrapped object, so the marker — which was
    # written through to the original — is removed cleanly).
    try:
        delattr(current, _TERMINUS2_MARKER)
    except (AttributeError, TypeError):
        pass

    try:
        unwrap(parent, attr)
    except Exception as e:
        logger.debug("Could not unwrap %s.%s: %s", module, name, e)


# ═══════════════════════════════════════════════════════════════════════════
# Instrumentor
# ═══════════════════════════════════════════════════════════════════════════

_TERMINUS2_MODULE = "terminal_bench.agents.terminus_2.terminus_2"
_JSON_PARSER_MODULE = (
    "terminal_bench.agents.terminus_2.terminus_json_plain_parser"
)
_XML_PARSER_MODULE = (
    "terminal_bench.agents.terminus_2.terminus_xml_plain_parser"
)

# Every (module, dotted name) this package patches, in patch order.
_PATCH_TARGETS = (
    (_TERMINUS2_MODULE, "Terminus2.perform_task"),
    (_TERMINUS2_MODULE, "Terminus2._run_agent_loop"),
    (_TERMINUS2_MODULE, "Terminus2._execute_commands"),
    (_TERMINUS2_MODULE, "Terminus2._handle_llm_interaction"),
    (_JSON_PARSER_MODULE, "TerminusJSONPlainParser.parse_response"),
    (_XML_PARSER_MODULE, "TerminusXMLPlainParser.parse_response"),
    (_TERMINUS2_MODULE, "Terminus2._summarize"),
)


class Terminus2Instrumentor(BaseInstrumentor):
    """Instrumentor for the terminus-2 agent from terminal-bench."""

    def instrumentation_dependencies(self) -> Collection[str]:
        """Return the package requirements this instrumentation targets."""
        return _instruments

    def _instrument(self, **kwargs: Any) -> None:
        """Install all span-producing wrappers.

        Args:
            **kwargs: ``tracer_provider`` (optional) selects the provider
                the tracer is obtained from.
        """
        tracer_provider = kwargs.get("tracer_provider")
        tracer = trace_api.get_tracer(
            __name__, "", tracer_provider=tracer_provider
        )

        # NOTE: LLM spans for ``LiteLLM.call`` are NOT produced here —
        # ``opentelemetry-instrumentation-litellm`` already traces the
        # underlying ``litellm.completion`` invocation. Wrapping again would
        # produce duplicate LLM spans for every model call.
        patches = (
            # P0 – ENTRY span (application entry point)
            (
                _TERMINUS2_MODULE,
                "Terminus2.perform_task",
                _PerformTaskWrapper(tracer),
            ),
            # P0 – AGENT span (agent invocation) + ReAct loop lifecycle
            (
                _TERMINUS2_MODULE,
                "Terminus2._run_agent_loop",
                _RunAgentLoopWrapper(tracer),
            ),
            # P0 – TOOL span for terminal command batch
            (
                _TERMINUS2_MODULE,
                "Terminus2._execute_commands",
                _ExecuteCommandsWrapper(tracer),
            ),
            # P1 – STEP span per ReAct iteration
            (
                _TERMINUS2_MODULE,
                "Terminus2._handle_llm_interaction",
                _HandleLLMInteractionWrapper(tracer),
            ),
            # P1 – TASK span for parser (json + xml)
            (
                _JSON_PARSER_MODULE,
                "TerminusJSONPlainParser.parse_response",
                _ParseResponseWrapper(tracer, "json"),
            ),
            (
                _XML_PARSER_MODULE,
                "TerminusXMLPlainParser.parse_response",
                _ParseResponseWrapper(tracer, "xml"),
            ),
            # P2 – CHAIN span for context-overflow handoff
            (
                _TERMINUS2_MODULE,
                "Terminus2._summarize",
                _SummarizeWrapper(tracer),
            ),
        )
        for module, name, wrapper in patches:
            _try_wrap(module, name, wrapper)

    def _uninstrument(self, **kwargs: Any) -> None:
        """Remove every wrapper and close any STEP span still open."""
        for module, name in _PATCH_TARGETS:
            _try_unwrap(module, name)
        _end_current_step()


# ═══════════════════════════════════════════════════════════════════════════
# P0 — ENTRY span: Terminus2.perform_task
# ═══════════════════════════════════════════════════════════════════════════
class _PerformTaskWrapper:
    """Wrap ``Terminus2.perform_task`` to produce the **ENTRY** span.

    Per spec: span name ``enter_ai_application_system``,
    ``gen_ai.span.kind=ENTRY``, ``gen_ai.operation.name=enter``.

    Records the user instruction as ``gen_ai.input.messages`` and a
    serialized summary of ``AgentResult`` (failure_mode, token totals,
    marker count) as ``gen_ai.output.messages`` once the task completes.
    """

    def __init__(self, tracer):
        self._tracer = tracer

    @hook_advice(
        instrumentation_name="terminus2",
        advice_method="perform_task",
        throw_exception=True,
    )
    def __call__(self, wrapped, instance, args, kwargs):
        """Run ``wrapped`` inside the ENTRY span and return its result.

        Exceptions from ``wrapped`` propagate unchanged.
        """
        model_name = getattr(instance, "_model_name", "unknown")
        instruction = args[0] if args else kwargs.get("instruction", "")

        # ``start_as_current_span`` defaults to record_exception=True and
        # set_status_on_exception=True, so an escaping exception is recorded
        # exactly once by the context manager. Recording it by hand as well
        # would emit a duplicate exception event.
        with self._tracer.start_as_current_span(
            "enter_ai_application_system",
            kind=SpanKind.SERVER,
        ) as span:
            span.set_attribute(
                CommonAttributes.GEN_AI_SPAN_KIND, _SPAN_KIND_ENTRY
            )
            span.set_attribute(
                CommonAttributes.GEN_AI_OPERATION_NAME, _OP_ENTER
            )
            span.set_attribute(CommonAttributes.GEN_AI_FRAMEWORK, _FRAMEWORK)
            span.set_attribute(LLMAttributes.GEN_AI_REQUEST_MODEL, model_name)
            span.set_attribute(
                LLMAttributes.GEN_AI_PROVIDER_NAME,
                _infer_provider_name(model_name),
            )

            if instruction:
                span.set_attribute(
                    _GEN_AI_INPUT_MESSAGES,
                    _text_messages_json("user", instruction),
                )

            result = wrapped(*args, **kwargs)

            input_tokens = getattr(result, "total_input_tokens", 0) or 0
            output_tokens = getattr(result, "total_output_tokens", 0) or 0
            failure_mode = getattr(result, "failure_mode", None)
            if failure_mode is None:
                failure_mode_str = "none"
            else:
                # Enum members report their ``.value``; others their str().
                failure_mode_str = str(
                    getattr(failure_mode, "value", failure_mode)
                )
            markers = getattr(result, "timestamped_markers", None) or []

            output_summary = {
                "failure_mode": failure_mode_str,
                "total_input_tokens": input_tokens,
                "total_output_tokens": output_tokens,
                "marker_count": len(markers),
            }
            try:
                output_value = json.dumps(output_summary, ensure_ascii=False)
            except Exception:
                # Best-effort: never fail the task over telemetry encoding.
                output_value = str(output_summary)

            span.set_attribute(
                _GEN_AI_OUTPUT_MESSAGES,
                _text_messages_json("assistant", output_value),
            )
            span.set_attribute("terminus2.failure_mode", failure_mode_str)

            span.set_status(Status(StatusCode.OK))
            return result


# ═══════════════════════════════════════════════════════════════════════════
# P0 — AGENT span: Terminus2._run_agent_loop
# ═══════════════════════════════════════════════════════════════════════════

class _RunAgentLoopWrapper:
    """Wrap ``Terminus2._run_agent_loop`` to produce the **AGENT** span.

    Per spec: span name ``invoke_agent {agent.name}``,
    ``gen_ai.span.kind=AGENT``, ``gen_ai.operation.name=invoke_agent``.

    The AGENT span precisely brackets the ReAct loop body — STEP / TOOL /
    TASK / CHAIN children all hang off it. Token totals are aggregated
    from the ``Chat`` cumulative counters once the loop returns. Also
    cleans up any trailing STEP span on loop exit.
    """

    def __init__(self, tracer):
        self._tracer = tracer

    @hook_advice(
        instrumentation_name="terminus2",
        advice_method="run_agent_loop",
        throw_exception=True,
    )
    def __call__(self, wrapped, instance, args, kwargs):
        """Run the agent loop inside the AGENT span and return its result.

        Exceptions from ``wrapped`` propagate unchanged; the trailing STEP
        span is closed on both the success and the failure path.
        """
        # Reset per-loop ReAct state
        _react_round_counter.set(0)
        _end_current_step()

        model_name = getattr(instance, "_model_name", "unknown")
        parser_name = getattr(instance, "_parser_name", "unknown")

        # _run_agent_loop signature:
        #   (initial_prompt, session, chat, logging_dir=None,
        #    original_instruction="")
        chat = args[2] if len(args) > 2 else kwargs.get("chat")
        original_instruction = (
            args[4]
            if len(args) > 4
            else kwargs.get("original_instruction", "")
        )

        # Exception recording / ERROR status is left to the context manager
        # (its defaults do both) to avoid duplicate exception events.
        with self._tracer.start_as_current_span(
            f"invoke_agent {_AGENT_NAME}",
            kind=SpanKind.INTERNAL,
        ) as span:
            span.set_attribute(
                CommonAttributes.GEN_AI_SPAN_KIND,
                _semconv_value(GenAiSpanKind.AGENT),
            )
            span.set_attribute(
                CommonAttributes.GEN_AI_OPERATION_NAME,
                _semconv_value(GenAiOperationName.INVOKE_AGENT),
            )
            span.set_attribute(CommonAttributes.GEN_AI_FRAMEWORK, _FRAMEWORK)
            span.set_attribute("gen_ai.agent.name", _AGENT_NAME)
            span.set_attribute(
                "gen_ai.agent.description",
                "Terminus-2 terminal-bench agent (ReAct loop over a tmux session)",
            )
            span.set_attribute(LLMAttributes.GEN_AI_REQUEST_MODEL, model_name)
            span.set_attribute(
                LLMAttributes.GEN_AI_PROVIDER_NAME,
                _infer_provider_name(model_name),
            )
            span.set_attribute("terminus2.parser", parser_name)

            if original_instruction:
                span.set_attribute(
                    _GEN_AI_INPUT_MESSAGES,
                    _text_messages_json("user", original_instruction),
                )

            try:
                result = wrapped(*args, **kwargs)
            finally:
                # Close the last STEP span whether the loop returned or
                # raised, so it never outlives its AGENT parent.
                _end_current_step(finish_reason="loop_end")

            # Aggregate token usage from the Chat object — captured here so
            # the totals reflect the full loop, including the bare
            # ``chat._model.call`` invoked inside ``_summarize``.
            # ``Chat.total_*_tokens`` returns cumulative counters that
            # survive context unwinding.
            if chat is not None:
                input_tokens = getattr(chat, "total_input_tokens", 0) or 0
                output_tokens = getattr(chat, "total_output_tokens", 0) or 0
                span.set_attribute(
                    LLMAttributes.GEN_AI_USAGE_INPUT_TOKENS, input_tokens
                )
                span.set_attribute(
                    LLMAttributes.GEN_AI_USAGE_OUTPUT_TOKENS, output_tokens
                )
                span.set_attribute(
                    LLMAttributes.GEN_AI_USAGE_TOTAL_TOKENS,
                    input_tokens + output_tokens,
                )

            span.set_attribute(
                "terminus2.react.rounds", _react_round_counter.get()
            )

            span.set_status(Status(StatusCode.OK))
            return result


# ═══════════════════════════════════════════════════════════════════════════
# P0 — TOOL span: Terminus2._execute_commands
# ═══════════════════════════════════════════════════════════════════════════

class _ExecuteCommandsWrapper:
    """Wrap ``Terminus2._execute_commands`` to produce a **TOOL** span.

    Per spec: span name ``execute_tool {tool_name}``,
    ``gen_ai.span.kind=TOOL``, ``gen_ai.operation.name=execute_tool``.
    """

    def __init__(self, tracer):
        self._tracer = tracer

    @hook_advice(
        instrumentation_name="terminus2",
        advice_method="execute_commands",
        throw_exception=True,
    )
    def __call__(self, wrapped, instance, args, kwargs):
        """Run the command batch inside a TOOL span and return its result.

        The result is unpacked as ``(timeout_occurred, terminal_output)``.
        Exceptions from ``wrapped`` propagate unchanged.
        """
        commands = args[0] if args else kwargs.get("commands", [])

        # Exception recording / ERROR status is left to the context manager
        # (its defaults do both) to avoid duplicate exception events.
        with self._tracer.start_as_current_span(
            f"execute_tool {_TERMINAL_TOOL_NAME}",
            kind=SpanKind.INTERNAL,
        ) as span:
            span.set_attribute(
                CommonAttributes.GEN_AI_SPAN_KIND,
                _semconv_value(GenAiSpanKind.TOOL),
            )
            span.set_attribute(
                CommonAttributes.GEN_AI_OPERATION_NAME,
                _semconv_value(GenAiOperationName.EXECUTE_TOOL),
            )
            span.set_attribute(CommonAttributes.GEN_AI_FRAMEWORK, _FRAMEWORK)
            span.set_attribute(
                ToolAttributes.GEN_AI_TOOL_NAME, _TERMINAL_TOOL_NAME
            )
            span.set_attribute(
                ToolAttributes.GEN_AI_TOOL_DESCRIPTION,
                _TERMINAL_TOOL_DESCRIPTION,
            )
            span.set_attribute(
                ToolAttributes.GEN_AI_TOOL_TYPE,
                _semconv_value(GenAiToolType.EXTENSION),
            )
            span.set_attribute("terminus2.commands.count", len(commands))

            # Spec attribute (gen-ai.md §Tool)
            span.set_attribute(
                _GEN_AI_TOOL_CALL_ARGUMENTS,
                _commands_to_arguments_json(commands),
            )

            result = wrapped(*args, **kwargs)

            timeout_occurred, terminal_output = result
            span.set_attribute("terminus2.terminal.timeout", timeout_occurred)

            if terminal_output is not None:
                # Spec attribute (gen-ai.md §Tool)
                span.set_attribute(
                    _GEN_AI_TOOL_CALL_RESULT, str(terminal_output)
                )

            span.set_status(Status(StatusCode.OK))
            return result


# ═══════════════════════════════════════════════════════════════════════════
# P1 — STEP span: Terminus2._handle_llm_interaction
# ═══════════════════════════════════════════════════════════════════════════

class _HandleLLMInteractionWrapper:
    """Wrap ``Terminus2._handle_llm_interaction`` to produce a **STEP** span.

    The STEP span represents one ReAct iteration. It opens here, stays open
    after this method returns (so the subsequent ``_execute_commands`` call
    in ``_run_agent_loop`` becomes its child), and is closed on the next
    iteration entry or by ``_RunAgentLoopWrapper`` cleanup.
    """

    def __init__(self, tracer):
        self._tracer = tracer

    @hook_advice(
        instrumentation_name="terminus2",
        advice_method="handle_llm_interaction",
        throw_exception=True,
    )
    def __call__(self, wrapped, instance, args, kwargs):
        """Open a new STEP span, run ``wrapped`` and return its result.

        The result is unpacked as ``(commands, is_task_complete, feedback)``
        to derive ``gen_ai.react.finish_reason``. Exceptions from
        ``wrapped`` are recorded on the span and re-raised.
        """
        # Close previous STEP first (if any)
        _end_current_step(finish_reason="next_round")

        round_num = _react_round_counter.get() + 1
        _react_round_counter.set(round_num)

        # ``start_span`` (not ``start_as_current_span``): the span must
        # outlive this call, so it is attached and ended manually.
        step_span = self._tracer.start_span(
            "react step",
            kind=SpanKind.INTERNAL,
        )
        step_span.set_attribute(
            CommonAttributes.GEN_AI_SPAN_KIND, _SPAN_KIND_STEP
        )
        step_span.set_attribute(
            CommonAttributes.GEN_AI_OPERATION_NAME, _OP_REACT
        )
        step_span.set_attribute(CommonAttributes.GEN_AI_FRAMEWORK, _FRAMEWORK)
        step_span.set_attribute(_GEN_AI_REACT_ROUND, round_num)

        ctx = trace_api.set_span_in_context(step_span)
        token = context_api.attach(ctx)
        _current_step_span.set(step_span)
        _current_step_token.set(token)

        try:
            result = wrapped(*args, **kwargs)
        except Exception as e:
            # No context manager owns this span, so the exception has to be
            # recorded explicitly here.
            step_span.set_attribute(_GEN_AI_REACT_FINISH_REASON, "error")
            step_span.record_exception(e)
            step_span.set_status(Status(StatusCode.ERROR))
            raise

        _commands, is_task_complete, feedback = result

        if is_task_complete:
            step_span.set_attribute(_GEN_AI_REACT_FINISH_REASON, "complete")
        elif feedback and "ERROR:" in feedback:
            step_span.set_attribute(
                _GEN_AI_REACT_FINISH_REASON, "parse_error"
            )

        # Span stays open: closed by next iteration or _RunAgentLoopWrapper
        return result


# ═══════════════════════════════════════════════════════════════════════════
# P1 — TASK span: parser.parse_response
# ═══════════════════════════════════════════════════════════════════════════
class _ParseResponseWrapper:
    """Wrap ``parser.parse_response`` to produce a **TASK** span.

    Per spec: span name ``run_task {task_name}``,
    ``gen_ai.span.kind=TASK``, ``gen_ai.operation.name=run_task``.
    """

    def __init__(self, tracer, parser_type):
        self._tracer = tracer
        # Reported as ``terminus2.parser`` ("json" or "xml" as registered).
        self._parser_type = parser_type

    @hook_advice(
        instrumentation_name="terminus2",
        advice_method="parse_response",
        throw_exception=True,
    )
    def __call__(self, wrapped, instance, args, kwargs):
        """Run the parser inside a TASK span and return its result.

        The raw response is captured as input; the parse outcome
        (completion flag, commands, error, warning) as output. Exceptions
        from ``wrapped`` propagate unchanged.
        """
        # parse_response signature: (self, response: str)
        response_text = args[0] if args else kwargs.get("response", "")

        # ``start_as_current_span`` defaults to record_exception=True and
        # set_status_on_exception=True, so an escaping exception is recorded
        # exactly once by the context manager. Recording it by hand as well
        # would emit a duplicate exception event.
        with self._tracer.start_as_current_span(
            "run_task parse_response",
            kind=SpanKind.INTERNAL,
        ) as span:
            span.set_attribute(
                CommonAttributes.GEN_AI_SPAN_KIND,
                _semconv_value(GenAiSpanKind.TASK),
            )
            span.set_attribute(
                CommonAttributes.GEN_AI_OPERATION_NAME, _OP_RUN_TASK
            )
            span.set_attribute(CommonAttributes.GEN_AI_FRAMEWORK, _FRAMEWORK)
            span.set_attribute("terminus2.parser", self._parser_type)

            if response_text is not None:
                span.set_attribute(
                    _GEN_AI_INPUT_MESSAGES,
                    _text_messages_json("assistant", response_text),
                )

            result = wrapped(*args, **kwargs)

            span.set_attribute(
                "terminus2.task_complete", result.is_task_complete
            )
            span.set_attribute(
                "terminus2.commands.count", len(result.commands)
            )

            output_summary = {
                "is_task_complete": result.is_task_complete,
                "commands": [
                    {
                        "keystrokes": getattr(c, "keystrokes", ""),
                        "duration": getattr(c, "duration", None),
                    }
                    for c in result.commands
                ],
                "error": result.error or "",
                "warning": result.warning or "",
            }
            try:
                output_value = json.dumps(output_summary, ensure_ascii=False)
            except Exception:
                # Best-effort: never fail parsing over telemetry encoding.
                output_value = str(output_summary)
            span.set_attribute(
                _GEN_AI_OUTPUT_MESSAGES,
                _text_messages_json("assistant", output_value),
            )

            if result.error:
                span.set_attribute("terminus2.parse.error", str(result.error))

            if result.warning:
                span.set_attribute(
                    "terminus2.parse.warning", str(result.warning)
                )

            span.set_status(Status(StatusCode.OK))
            return result


# ═══════════════════════════════════════════════════════════════════════════
# P2 — CHAIN span: Terminus2._summarize
# ═══════════════════════════════════════════════════════════════════════════

class _SummarizeWrapper:
    """Wrap ``Terminus2._summarize`` to produce a **CHAIN** span.

    Per spec: span name ``chain {chain_name}``,
    ``gen_ai.span.kind=CHAIN``. The summarize handoff itself triggers
    multiple inner LLM calls so it semantically maps to a Chain.
    """

    def __init__(self, tracer):
        self._tracer = tracer

    @hook_advice(
        instrumentation_name="terminus2",
        advice_method="summarize",
        throw_exception=True,
    )
    def __call__(self, wrapped, instance, args, kwargs):
        """Run ``wrapped`` inside the CHAIN span and return its result.

        Exceptions from ``wrapped`` propagate unchanged; the context
        manager records them and sets ERROR status.
        """
        with self._tracer.start_as_current_span(
            "chain summarize",
            kind=SpanKind.INTERNAL,
        ) as span:
            span.set_attribute(
                CommonAttributes.GEN_AI_SPAN_KIND,
                _semconv_value(GenAiSpanKind.CHAIN),
            )
            span.set_attribute(CommonAttributes.GEN_AI_OPERATION_NAME, _OP_TASK)
            span.set_attribute(CommonAttributes.GEN_AI_FRAMEWORK, _FRAMEWORK)

            result = wrapped(*args, **kwargs)

            span.set_status(Status(StatusCode.OK))
            return result


# Patch boundary kept from the collapsed diff; the content that follows
# belongs to the next file of the patch:
# diff --git a/instrumentation-loongsuite/loongsuite-instrumentation-terminus2/src/opentelemetry/instrumentation/terminus2/package.py b/instrumentation-loongsuite/loongsuite-instrumentation-terminus2/src/opentelemetry/instrumentation/terminus2/package.py
# new file mode 100644
# index 000000000..d92c81333
# --- /dev/null
# +++ b/instrumentation-loongsuite/loongsuite-instrumentation-terminus2/src/opentelemetry/instrumentation/terminus2/package.py
# @@ -0,0 +1,15 @@
+# Copyright The OpenTelemetry Authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +_instruments = ("terminal-bench >= 0.1.0",) diff --git a/instrumentation-loongsuite/loongsuite-instrumentation-terminus2/src/opentelemetry/instrumentation/terminus2/version.py b/instrumentation-loongsuite/loongsuite-instrumentation-terminus2/src/opentelemetry/instrumentation/terminus2/version.py new file mode 100644 index 000000000..5fd301e2e --- /dev/null +++ b/instrumentation-loongsuite/loongsuite-instrumentation-terminus2/src/opentelemetry/instrumentation/terminus2/version.py @@ -0,0 +1,15 @@ +# Copyright The OpenTelemetry Authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +__version__ = "0.1.0" diff --git a/instrumentation-loongsuite/loongsuite-instrumentation-terminus2/test-requirements.txt b/instrumentation-loongsuite/loongsuite-instrumentation-terminus2/test-requirements.txt new file mode 100644 index 000000000..f98537dd8 --- /dev/null +++ b/instrumentation-loongsuite/loongsuite-instrumentation-terminus2/test-requirements.txt @@ -0,0 +1,4 @@ +terminal-bench>=0.1.0 +-e aliyun-semantic-conventions +-e util/opentelemetry-util-http +-e instrumentation-loongsuite/loongsuite-instrumentation-terminus2 diff --git a/instrumentation-loongsuite/loongsuite-instrumentation-terminus2/tests/__init__.py b/instrumentation-loongsuite/loongsuite-instrumentation-terminus2/tests/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/instrumentation-loongsuite/loongsuite-instrumentation-vita/README.md b/instrumentation-loongsuite/loongsuite-instrumentation-vita/README.md new file mode 100644 index 000000000..a91e8d879 --- /dev/null +++ b/instrumentation-loongsuite/loongsuite-instrumentation-vita/README.md @@ -0,0 +1,47 @@ +# LoongSuite VitaBench Instrumentation + +OpenTelemetry instrumentation for the VitaBench multi-domain simulation framework. + +## Installation + +```bash +pip install loongsuite-instrumentation-vita +``` + +## Usage + +```python +from opentelemetry.instrumentation.vita import VitaInstrumentor + +VitaInstrumentor().instrument() +``` + +For GenAI semantic conventions and span-only message content capture, set: + +```bash +export OTEL_SEMCONV_STABILITY_OPT_IN=gen_ai_latest_experimental +export OTEL_INSTRUMENTATION_GENAI_CAPTURE_MESSAGE_CONTENT=SPAN_ONLY +``` + +## VitaBench With DashScope + +VitaBench posts directly to the `base_url` configured in `models.yaml`, so the +DashScope OpenAI-compatible endpoint must include `/chat/completions`. The API +key must be supplied in the `Authorization` header. 
+ +```yaml +default: + base_url: https://dashscope.aliyuncs.com/compatible-mode/v1/chat/completions + temperature: 0.0 + max_input_tokens: 8192 + headers: + Content-Type: "application/json" + Authorization: "Bearer ${OPENAI_API_KEY}" +models: + - name: qwen3.6-plus + max_tokens: 1024 + max_input_tokens: 8192 +``` + +See `examples/vitabench-dashscope` for a runnable setup used by the Kubernetes +benchmark deployment. diff --git a/instrumentation-loongsuite/loongsuite-instrumentation-vita/examples/__init__.py b/instrumentation-loongsuite/loongsuite-instrumentation-vita/examples/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/instrumentation-loongsuite/loongsuite-instrumentation-vita/examples/vitabench-dashscope/README.md b/instrumentation-loongsuite/loongsuite-instrumentation-vita/examples/vitabench-dashscope/README.md new file mode 100644 index 000000000..7d63531c3 --- /dev/null +++ b/instrumentation-loongsuite/loongsuite-instrumentation-vita/examples/vitabench-dashscope/README.md @@ -0,0 +1,23 @@ +# VitaBench DashScope Example + +This example runs a single VitaBench delivery task with LoongSuite +instrumentation and DashScope's OpenAI-compatible chat completions endpoint. + +Required environment variables: + +```bash +export OPENAI_API_KEY= +export OTEL_SEMCONV_STABILITY_OPT_IN=gen_ai_latest_experimental +export OTEL_INSTRUMENTATION_GENAI_CAPTURE_MESSAGE_CONTENT=SPAN_ONLY +``` + +Then run: + +```bash +./setup.sh +./cmd.sh +``` + +`setup.sh` writes `models.yaml` with the full `/chat/completions` endpoint and +injects the API key via the `Authorization` header at runtime. Do not commit a +rendered `models.yaml` containing a real key. 
diff --git a/instrumentation-loongsuite/loongsuite-instrumentation-vita/examples/vitabench-dashscope/cmd.sh b/instrumentation-loongsuite/loongsuite-instrumentation-vita/examples/vitabench-dashscope/cmd.sh new file mode 100755 index 000000000..813abb713 --- /dev/null +++ b/instrumentation-loongsuite/loongsuite-instrumentation-vita/examples/vitabench-dashscope/cmd.sh @@ -0,0 +1,24 @@ +#!/usr/bin/env bash +# Run one VitaBench delivery task with LoongSuite instrumentation. +set -euo pipefail + +export OTEL_SEMCONV_STABILITY_OPT_IN="${OTEL_SEMCONV_STABILITY_OPT_IN:-gen_ai_latest_experimental}" +export OTEL_INSTRUMENTATION_GENAI_CAPTURE_MESSAGE_CONTENT="${OTEL_INSTRUMENTATION_GENAI_CAPTURE_MESSAGE_CONTENT:-SPAN_ONLY}" + +VITA_ROOT=/work/upstream/vitabench +if [ ! -d "$VITA_ROOT" ]; then + echo "[vita-cmd] vitabench not found, run setup.sh first" >&2 + exit 1 +fi + +cd "$VITA_ROOT" +export VITA_MODEL_CONFIG_PATH=/work/upstream/vitabench/models.yaml + +echo "[vita-cmd] invoking vita run --domain delivery --num-tasks 1" +loongsuite-instrument vita run \ + --domain delivery \ + --user-llm qwen3.6-plus \ + --agent-llm qwen3.6-plus \ + --evaluator-llm qwen3.6-plus \ + --num-tasks 1 \ + --num-trials 1 diff --git a/instrumentation-loongsuite/loongsuite-instrumentation-vita/examples/vitabench-dashscope/setup.sh b/instrumentation-loongsuite/loongsuite-instrumentation-vita/examples/vitabench-dashscope/setup.sh new file mode 100755 index 000000000..669ef7602 --- /dev/null +++ b/instrumentation-loongsuite/loongsuite-instrumentation-vita/examples/vitabench-dashscope/setup.sh @@ -0,0 +1,35 @@ +#!/usr/bin/env bash +# Prepare VitaBench and write a DashScope-backed model config. +set -euo pipefail + +: "${OPENAI_API_KEY:?OPENAI_API_KEY is required}" + +mkdir -p /work/upstream +cd /work/upstream + +if [ ! -d vitabench ]; then + echo "[vita-setup] cloning vitabench" + git clone --depth=1 https://github.com/meituan-longcat/vitabench.git +fi + +cd vitabench +pip install --quiet --no-deps -e . 
|| pip install --no-deps -e . +pip install --quiet "openai>=1.0" "pydantic>=2" pyyaml "loguru" "anthropic" \ + "litellm" "tenacity" "tiktoken" pandas toml addict deepdiff thefuzz \ + json_repair holidays || true + +cat > /work/upstream/vitabench/models.yaml <= 1.37.0", + "opentelemetry-instrumentation >= 0.58b0", + "opentelemetry-semantic-conventions >= 0.58b0", + "wrapt >= 1.0.0, < 2.0.0", + "opentelemetry-util-genai >= 0.3b0.dev0", +] + +[project.optional-dependencies] +instruments = [ + "vita >= 0.0.1", +] + +[project.entry-points.opentelemetry_instrumentor] +vita = "opentelemetry.instrumentation.vita:VitaInstrumentor" + +[project.urls] +Homepage = "https://github.com/alibaba/loongsuite-python-agent/tree/main/instrumentation-loongsuite/loongsuite-instrumentation-vita" +Repository = "https://github.com/alibaba/loongsuite-python-agent" + +[tool.hatch.version] +path = "src/opentelemetry/instrumentation/vita/version.py" + +[tool.hatch.build.targets.sdist] +include = [ + "/src", + "/tests", +] + +[tool.hatch.build.targets.wheel] +packages = ["src/opentelemetry"] diff --git a/instrumentation-loongsuite/loongsuite-instrumentation-vita/src/opentelemetry/instrumentation/vita/__init__.py b/instrumentation-loongsuite/loongsuite-instrumentation-vita/src/opentelemetry/instrumentation/vita/__init__.py new file mode 100644 index 000000000..1e58668a6 --- /dev/null +++ b/instrumentation-loongsuite/loongsuite-instrumentation-vita/src/opentelemetry/instrumentation/vita/__init__.py @@ -0,0 +1,223 @@ +# Copyright The OpenTelemetry Authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
logger = logging.getLogger(__name__)

__all__ = ["VitaInstrumentor", "__version__"]


class VitaInstrumentor(BaseInstrumentor):
    """OpenTelemetry instrumentor for VitaBench framework.

    Instruments the following components:
    - vita.run.run_task(): Entry spans (ENTRY)
    - Orchestrator.run(): Workflow spans (CHAIN)
    - Orchestrator.step(): ReAct step spans (STEP)
    - LLMAgent.generate_next_message(): Agent spans (AGENT)
    - generate(): LLM call spans (LLM)
    - Environment.get_response(): Tool execution spans (TOOL)
    """

    # (module, dotted attribute path, patch function), in wrapping order.
    # ``generate`` comes first so modules that import it directly (for
    # example vita.agent.llm_agent) bind to the instrumented function
    # during their import.
    _HOOKS = (
        ("vita.utils.llm_utils", "generate", wrap_generate),
        ("vita.run", "run_task", wrap_run_task),
        (
            "vita.orchestrator.orchestrator",
            "Orchestrator.run",
            wrap_orchestrator_run,
        ),
        (
            "vita.orchestrator.orchestrator",
            "Orchestrator.step",
            wrap_orchestrator_step,
        ),
        (
            "vita.agent.llm_agent",
            "LLMAgent.generate_next_message",
            wrap_generate_next_message,
        ),
        (
            "vita.agent.llm_agent",
            "LLMSoloAgent.generate_next_message",
            wrap_generate_next_message,
        ),
        (
            "vita.environment.environment",
            "Environment.get_response",
            wrap_get_response,
        ),
    )

    # Class-level default instead of an assignment in ``__init__``:
    # BaseInstrumentor hands back one shared instance per class and Python
    # re-runs ``__init__`` on every ``VitaInstrumentor()`` call, so resetting
    # the handler there would silently disable live wrappers.
    _handler = None

    def __init__(self):
        super().__init__()

    def instrumentation_dependencies(self) -> Collection[str]:
        """Return the package requirements this instrumentor targets."""
        return _instruments

    @staticmethod
    def _hook_label(module: str, name: str) -> str:
        """Return the human-readable target name used in log messages."""
        # Class methods are logged as "Class.method"; module-level
        # functions are logged with their full module path.
        return name if "." in name else f"{module}.{name}"

    def _make_wrapper(self, patch_fn):
        """Build a wrapt wrapper that forwards to ``patch_fn`` with the handler.

        The handler is looked up at call time. When it is ``None`` (the
        instrumentor has been uninstrumented but some module still holds a
        reference to the wrapped callable) the original function is called
        untouched instead of crashing inside the patch function.
        """

        def _wrapper(wrapped, instance, args, kwargs):
            handler = self._handler
            if handler is None:
                return wrapped(*args, **kwargs)
            return patch_fn(wrapped, instance, args, kwargs, handler=handler)

        return _wrapper

    def _instrument(self, **kwargs: Any) -> None:
        """Enable VitaBench instrumentation.

        Accepts optional ``tracer_provider``, ``meter_provider`` and
        ``logger_provider`` keyword arguments, which are passed to the
        telemetry handler. A target that cannot be wrapped is logged as a
        warning and does not prevent the remaining targets from being
        wrapped.
        """
        self._handler = ExtendedTelemetryHandler(
            tracer_provider=kwargs.get("tracer_provider"),
            meter_provider=kwargs.get("meter_provider"),
            logger_provider=kwargs.get("logger_provider"),
        )

        for module, name, patch_fn in self._HOOKS:
            label = self._hook_label(module, name)
            try:
                wrap_function_wrapper(
                    module=module,
                    name=name,
                    wrapper=self._make_wrapper(patch_fn),
                )
                logger.debug("Instrumented %s", label)
            except Exception as e:
                logger.warning("Could not wrap %s: %s", label, e)

    def _uninstrument(self, **kwargs: Any) -> None:
        """Disable VitaBench instrumentation.

        Every target is unwrapped independently, so a failure on one
        attribute no longer skips the others.
        """
        import importlib  # noqa: PLC0415

        for module, name, _ in self._HOOKS:
            label = self._hook_label(module, name)
            try:
                owner = importlib.import_module(module)
                *parents, attr = name.split(".")
                for part in parents:
                    owner = getattr(owner, part)
                unwrap(owner, attr)
            except Exception as e:
                logger.debug("Failed to uninstrument %s: %s", label, e)

        self._handler = None
a/instrumentation-loongsuite/loongsuite-instrumentation-vita/src/opentelemetry/instrumentation/vita/package.py b/instrumentation-loongsuite/loongsuite-instrumentation-vita/src/opentelemetry/instrumentation/vita/package.py new file mode 100644 index 000000000..a776722c9 --- /dev/null +++ b/instrumentation-loongsuite/loongsuite-instrumentation-vita/src/opentelemetry/instrumentation/vita/package.py @@ -0,0 +1,3 @@ +_instruments = ("vita >= 0.0.1",) + +_supports_metrics = False diff --git a/instrumentation-loongsuite/loongsuite-instrumentation-vita/src/opentelemetry/instrumentation/vita/patch.py b/instrumentation-loongsuite/loongsuite-instrumentation-vita/src/opentelemetry/instrumentation/vita/patch.py new file mode 100644 index 000000000..182da38d6 --- /dev/null +++ b/instrumentation-loongsuite/loongsuite-instrumentation-vita/src/opentelemetry/instrumentation/vita/patch.py @@ -0,0 +1,432 @@ +# Copyright The OpenTelemetry Authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Patch functions for VitaBench instrumentation. 
logger = logging.getLogger(__name__)

# ContextVars for ReAct step tracking
_react_step_invocation: ContextVar[Optional[ReactStepInvocation]] = ContextVar(
    "vita_react_step_invocation", default=None
)
_react_step_counter: ContextVar[int] = ContextVar(
    "vita_react_step_counter", default=0
)

# Reentrancy guard for AGENT span (LLMSoloAgent extends LLMAgent)
_in_agent_invoke: ContextVar[bool] = ContextVar(
    "vita_in_agent_invoke", default=False
)


def _close_active_react_step(handler: ExtendedTelemetryHandler) -> None:
    """Close the currently active react_step span, if any.

    Failures while stopping the span are logged at debug level; the
    context variable is cleared either way.
    """
    prev = _react_step_invocation.get()
    if prev is not None:
        try:
            handler.stop_react_step(prev)
        except Exception as e:
            logger.debug("Failed to close react step: %s", e)
        _react_step_invocation.set(None)


# ==================== Hook #1: run_task -> ENTRY ====================


def wrap_run_task(
    wrapped, instance, args, kwargs, handler: ExtendedTelemetryHandler
):
    """Wrapper for vita.run.run_task to create ENTRY span.

    The task is read from the second positional argument or the ``task``
    keyword. Its instructions (truncated) become the input message; the
    result's termination reason and reward, when present, become the
    output message. Exceptions are reported to the handler and re-raised.
    """
    task = args[1] if len(args) > 1 else kwargs.get("task")

    invocation = EntryInvocation(
        session_id=str(uuid.uuid4()),
        user_id=None,
    )
    invocation.attributes["gen_ai.framework"] = "vitabench"

    if task and hasattr(task, "instructions") and task.instructions:
        invocation.input_messages = [
            InputMessage(
                role="user",
                parts=[Text(content=str(task.instructions)[:_MAX_CONTENT_LEN])],
            )
        ]

    handler.start_entry(invocation)
    try:
        result = wrapped(*args, **kwargs)

        if result:
            output_parts = []
            if hasattr(result, "termination_reason") and result.termination_reason:
                output_parts.append(
                    Text(content=f"termination: {result.termination_reason}")
                )
            if hasattr(result, "reward_info") and result.reward_info:
                reward = getattr(result.reward_info, "reward", None)
                if reward is not None:
                    output_parts.append(Text(content=f"reward: {reward}"))
            if output_parts:
                invocation.output_messages = [
                    OutputMessage(
                        role="assistant",
                        parts=output_parts,
                        finish_reason="stop",
                    )
                ]

        handler.stop_entry(invocation)
        return result
    except Exception as e:
        handler.fail_entry(invocation, Error(message=str(e), type=type(e)))
        raise


# ==================== Hook #2: Orchestrator.run -> CHAIN ====================


def wrap_orchestrator_run(
    wrapped, instance, args, kwargs, handler: ExtendedTelemetryHandler
):
    """Wrapper for Orchestrator.run to create CHAIN span.

    Resets the per-run step counter and active-step context variables for
    the duration of the call, closes any step span still open when the run
    ends, and restores the previous context values afterwards.
    """
    task = getattr(instance, "task", None)
    domain = getattr(instance, "domain", "unknown")
    span_name = f"workflow {domain}"

    input_text = ""
    if task and hasattr(task, "instructions") and task.instructions:
        input_text = str(task.instructions)[:_MAX_CONTENT_LEN]

    # NOTE(review): relies on the handler's private ``_tracer`` attribute.
    tracer = handler._tracer

    # Reset step counter for this orchestrator run
    counter_token = _react_step_counter.set(0)
    step_token = _react_step_invocation.set(None)

    with tracer.start_as_current_span(
        name=span_name,
        kind=SpanKind.INTERNAL,
        attributes={
            "gen_ai.operation.name": "workflow",
            "gen_ai.system": "vitabench",
            gen_ai_extended_attributes.GEN_AI_SPAN_KIND: "CHAIN",
            "gen_ai.framework": "vitabench",
        },
        # The except branch below records the exception itself; leaving the
        # default (True) would add a second, duplicate exception event when
        # the error propagates out of the ``with`` block.
        record_exception=False,
    ) as span:
        if input_text:
            span.set_attribute("input.value", input_text)

        try:
            result = wrapped(*args, **kwargs)

            # Close any remaining open step span
            _close_active_react_step(handler)

            if (
                result
                and hasattr(result, "termination_reason")
                and result.termination_reason
            ):
                span.set_attribute("output.value", str(result.termination_reason))

            span.set_status(Status(StatusCode.OK))
            return result
        except Exception as e:
            # Close any remaining open step span
            _close_active_react_step(handler)
            span.record_exception(e)
            span.set_status(Status(StatusCode.ERROR))
            raise
        finally:
            _react_step_counter.reset(counter_token)
            _react_step_invocation.reset(step_token)


# ==================== Hook #3: Orchestrator.step -> STEP ====================


def wrap_orchestrator_step(
    wrapped, instance, args, kwargs, handler: ExtendedTelemetryHandler
):
    """Wrapper for Orchestrator.step to create STEP span on AGENT turns.

    A new step span is opened only when ``instance.to_role`` is the agent
    role. The span is left open after the call returns and is closed when
    the next agent turn starts or by the Orchestrator.run wrapper, so work
    done on following non-agent turns stays inside the same step.
    """
    to_role = getattr(instance, "to_role", None)

    # Import Role enum dynamically to avoid import-time dependency
    _Role = None
    try:
        from vita.orchestrator.orchestrator import Role

        _Role = Role
    except ImportError:
        pass

    is_agent_turn = False
    if _Role is not None:
        is_agent_turn = to_role == _Role.AGENT
    else:
        # Fallback when the enum cannot be imported: compare by string form.
        is_agent_turn = str(to_role) == "Role.AGENT" or str(to_role) == "agent"

    if is_agent_turn:
        # Close previous STEP span (deferred close strategy)
        _close_active_react_step(handler)

        step_num = _react_step_counter.get() + 1
        _react_step_counter.set(step_num)

        step_inv = ReactStepInvocation(round=step_num)
        handler.start_react_step(step_inv)
        _react_step_invocation.set(step_inv)

    try:
        result = wrapped(*args, **kwargs)

        if is_agent_turn:
            current_step = _react_step_invocation.get()
            if current_step:
                done = getattr(instance, "done", False)
                if done:
                    term_reason = getattr(instance, "termination_reason", None)
                    if term_reason:
                        current_step.finish_reason = (
                            term_reason.value
                            if hasattr(term_reason, "value")
                            else str(term_reason)
                        )
                    else:
                        current_step.finish_reason = "agent_stop"
                else:
                    message = getattr(instance, "message", None)
                    if (
                        message
                        and hasattr(message, "is_tool_call")
                        and message.is_tool_call()
                    ):
                        current_step.finish_reason = "tool_call"
                    else:
                        current_step.finish_reason = "assistant_text"

        return result
    except Exception as e:
        # Whatever step is open at this point (also on non-agent turns) is
        # marked as failed, because the error happened inside it.
        current_step = _react_step_invocation.get()
        if current_step:
            current_step.finish_reason = "error"
            handler.fail_react_step(
                current_step, Error(message=str(e), type=type(e))
            )
            _react_step_invocation.set(None)
        raise


# ==================== Hook #4: generate_next_message -> AGENT ====================


def wrap_generate_next_message(
    wrapped, instance, args, kwargs, handler: ExtendedTelemetryHandler
):
    """Wrapper for LLMAgent.generate_next_message / LLMSoloAgent.generate_next_message.

    A context-variable guard makes nested invocations (both classes are
    wrapped) produce a single AGENT span. Only exceptions raised by the
    wrapped call mark the span as failed; problems while reading the result
    for telemetry are logged and never reach the caller.
    """
    # Reentrancy guard
    if _in_agent_invoke.get():
        return wrapped(*args, **kwargs)
    token = _in_agent_invoke.set(True)

    try:
        agent_name = instance.__class__.__name__
        model = getattr(instance, "llm", None)

        invocation = InvokeAgentInvocation(
            provider="vitabench",
            agent_name=agent_name,
            request_model=model,
        )

        # input_messages
        message = args[0] if args else kwargs.get("message")
        state = args[1] if len(args) > 1 else kwargs.get("state")
        if message:
            invocation.input_messages = _convert_vita_messages_to_input([message])

        # system_instruction
        if state and hasattr(state, "system_messages") and state.system_messages:
            invocation.system_instruction = [
                Text(content=str(sm.content)[:_MAX_CONTENT_LEN])
                for sm in state.system_messages
                if sm and getattr(sm, "content", None)
            ]

        # tool_definitions
        tools = getattr(instance, "tools", None)
        tool_defs = _get_tool_definitions(tools)
        if tool_defs:
            invocation.tool_definitions = tool_defs

        handler.start_invoke_agent(invocation)

        try:
            result = wrapped(*args, **kwargs)
        except Exception as e:
            handler.fail_invoke_agent(
                invocation, Error(message=str(e), type=type(e))
            )
            raise

        # Enrichment is kept outside the try above: an unexpected return
        # shape must not turn a successful call into a raised exception.
        try:
            assistant_msg, _ = result

            # output_messages
            invocation.output_messages = _convert_vita_assistant_to_output(
                assistant_msg
            )

            # token usage
            usage = getattr(assistant_msg, "usage", None)
            if usage and isinstance(usage, dict):
                invocation.input_tokens = usage.get("prompt_tokens")
                invocation.output_tokens = usage.get("completion_tokens")
        except Exception as e:
            logger.debug("Failed to capture agent output: %s", e)

        handler.stop_invoke_agent(invocation)
        return result
    finally:
        _in_agent_invoke.reset(token)


# ==================== Hook #5: generate -> LLM ====================


def wrap_generate(
    wrapped, instance, args, kwargs, handler: ExtendedTelemetryHandler
):
    """Wrapper for vita.utils.llm_utils.generate to create LLM span.

    ``model``, ``messages`` and ``tools`` are read positionally or by
    keyword; ``temperature`` and ``max_tokens`` are read from keywords only.
    The finish reason is derived from whether the result carries tool calls.
    """
    model = args[0] if args else kwargs.get("model", "unknown")
    messages = args[1] if len(args) > 1 else kwargs.get("messages", [])
    tools = args[2] if len(args) > 2 else kwargs.get("tools")
    temperature = kwargs.get("temperature")

    invocation = LLMInvocation(
        request_model=model or "unknown",
        provider=_infer_provider(model or ""),
        temperature=temperature,
    )
    invocation.max_tokens = kwargs.get("max_tokens")

    # input_messages
    invocation.input_messages = _convert_vita_messages_to_input(messages)

    # tool_definitions
    tool_defs = _get_tool_definitions(tools)
    if tool_defs:
        invocation.tool_definitions = tool_defs

    handler.start_llm(invocation)

    try:
        result = wrapped(*args, **kwargs)

        if result:
            # output_messages
            invocation.output_messages = _convert_vita_assistant_to_output(result)

            # response_model_name: the requested model name is reused here.
            invocation.response_model_name = model

            # finish_reasons
            if getattr(result, "tool_calls", None):
                invocation.finish_reasons = ["tool_calls"]
            else:
                invocation.finish_reasons = ["stop"]

            # token usage
            usage = getattr(result, "usage", None)
            if usage and isinstance(usage, dict):
                invocation.input_tokens = usage.get("prompt_tokens")
                invocation.output_tokens = usage.get("completion_tokens")

        handler.stop_llm(invocation)
        return result
    except Exception as e:
        handler.fail_llm(invocation, Error(message=str(e), type=type(e)))
        raise


# ==================== Hook #6: Environment.get_response -> TOOL ====================


def wrap_get_response(
    wrapped, instance, args, kwargs, handler: ExtendedTelemetryHandler
):
    """Wrapper for Environment.get_response to create TOOL span.

    The tool name, call id and arguments are taken from the incoming
    message. A result whose ``error`` attribute is truthy marks the span as
    failed even though no exception was raised.
    """
    message = args[0] if args else kwargs.get("message")

    tool_name = getattr(message, "name", "unknown") if message else "unknown"
    tool_call_id = getattr(message, "id", None) if message else None

    invocation = ExecuteToolInvocation(
        tool_name=tool_name,
        tool_call_id=tool_call_id,
        provider="vitabench",
    )

    # tool_call_arguments
    if message and hasattr(message, "arguments") and message.arguments:
        try:
            invocation.tool_call_arguments = json.dumps(
                message.arguments, ensure_ascii=False, default=str
            )[:_MAX_CONTENT_LEN]
        except Exception:
            # Fall back to the plain string form when JSON encoding fails.
            invocation.tool_call_arguments = str(message.arguments)[
                :_MAX_CONTENT_LEN
            ]

    handler.start_execute_tool(invocation)

    try:
        result = wrapped(*args, **kwargs)

        # tool_call_result
        if result and getattr(result, "content", None):
            invocation.tool_call_result = str(result.content)[:_MAX_CONTENT_LEN]

        # Check if tool reported an error
        if result and getattr(result, "error", False):
            handler.fail_execute_tool(
                invocation,
                Error(
                    message=f"Tool error: {getattr(result, 'content', '')}",
                    type=RuntimeError,
                ),
            )
        else:
            handler.stop_execute_tool(invocation)

        return result
    except Exception as e:
        handler.fail_execute_tool(invocation, Error(message=str(e), type=type(e)))
        raise
logger = logging.getLogger(__name__)

_MAX_CONTENT_LEN = 4096

# Ordered keyword groups: the first group with a keyword contained in the
# lower-cased model name decides the provider.
_PROVIDER_KEYWORDS = (
    (("gpt", "o1", "o3"), "openai"),
    (("claude",), "anthropic"),
    (("qwen",), "alibaba_cloud"),
    (("deepseek",), "deepseek"),
    (("gemini",), "google"),
)


def _tool_call_parts(tool_calls: Any) -> list:
    """Turn an iterable of vita tool calls into OTel tool-call parts.

    Dict arguments are serialized to a JSON string; any other argument
    value is passed through unchanged.
    """
    converted = []
    for call in tool_calls:
        call_args = getattr(call, "arguments", {})
        if isinstance(call_args, dict):
            call_args = json.dumps(call_args, ensure_ascii=False, default=str)
        converted.append(
            OTelToolCall(
                name=getattr(call, "name", ""),
                id=getattr(call, "id", None),
                arguments=call_args,
            )
        )
    return converted


def _convert_vita_messages_to_input(messages: Any) -> "List[InputMessage]":
    """Convert vita Message list to OTel InputMessage list.

    A single message is accepted as well as a list. Messages without a
    role, messages that yield no parts, and messages whose conversion
    raises are left out of the result.
    """
    if not messages:
        return []

    batch = messages if isinstance(messages, list) else [messages]

    collected = []
    for item in batch:
        try:
            role = getattr(item, "role", None)
            if role is None:
                continue

            body = getattr(item, "content", None)
            calls = getattr(item, "tool_calls", None)
            parts = []

            if role == "tool":
                # Tool messages become a single tool-call response part.
                response_id = getattr(item, "id", None) or ""
                if body:
                    parts.append(
                        ToolCallResponse(
                            id=response_id,
                            response=str(body)[:_MAX_CONTENT_LEN],
                        )
                    )
            else:
                if body:
                    parts.append(Text(content=str(body)[:_MAX_CONTENT_LEN]))
                if calls:
                    parts.extend(_tool_call_parts(calls))

            if parts:
                collected.append(InputMessage(role=role, parts=parts))
        except Exception as e:
            logger.debug(f"Error converting vita message: {e}")
            continue

    return collected


def _convert_vita_assistant_to_output(msg: Any) -> "List[OutputMessage]":
    """Convert vita AssistantMessage to OTel OutputMessage list.

    Returns an empty list for a falsy message; otherwise a one-element
    list whose finish reason is "tool_calls" when the message carries tool
    calls and "stop" when it does not. A message with neither content nor
    tool calls gets one empty text part.
    """
    if not msg:
        return []

    body = getattr(msg, "content", None)
    calls = getattr(msg, "tool_calls", None)

    parts = [Text(content=str(body)[:_MAX_CONTENT_LEN])] if body else []
    if calls:
        parts.extend(_tool_call_parts(calls))

    if calls:
        reason = "tool_calls"
    else:
        reason = "stop"

    if not parts:
        parts.append(Text(content=""))

    return [OutputMessage(role="assistant", parts=parts, finish_reason=reason)]


def _infer_provider(model_name: str) -> str:
    """Infer provider from model name string.

    Matching is a case-insensitive substring test against the keyword
    groups in order; "unknown" is returned for an empty name or when no
    keyword matches.
    """
    if not model_name:
        return "unknown"
    lowered = model_name.lower()
    for keywords, provider in _PROVIDER_KEYWORDS:
        if any(keyword in lowered for keyword in keywords):
            return provider
    return "unknown"


def _get_tool_definitions(tools: Any) -> "Optional[List[FunctionToolDefinition]]":
    """Extract tool definitions from vita Tool list.

    Tools without a name are skipped. Parameters are read from the tool's
    ``openai_schema`` dict (from its "function" entry when present).
    Returns ``None`` when there are no tools, when no definition could be
    built, or when extraction raises.
    """
    if not tools:
        return None

    try:
        collected = []
        for tool in tools:
            tool_name = getattr(tool, "name", None)
            if not tool_name:
                continue
            params = None
            schema = getattr(tool, "openai_schema", None)
            if isinstance(schema, dict):
                params = schema.get("function", schema).get("parameters")
            collected.append(
                FunctionToolDefinition(
                    name=tool_name,
                    description=getattr(tool, "short_desc", None),
                    parameters=params,
                )
            )
        return collected or None
    except Exception:
        return None
a/instrumentation-loongsuite/loongsuite-instrumentation-vita/tests/__init__.py b/instrumentation-loongsuite/loongsuite-instrumentation-vita/tests/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/instrumentation-loongsuite/loongsuite-instrumentation-vita/tests/conftest.py b/instrumentation-loongsuite/loongsuite-instrumentation-vita/tests/conftest.py new file mode 100644 index 000000000..1e9dac354 --- /dev/null +++ b/instrumentation-loongsuite/loongsuite-instrumentation-vita/tests/conftest.py @@ -0,0 +1,100 @@ +# Copyright The OpenTelemetry Authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
def pytest_configure(config: pytest.Config):
    """Set the GenAI semantic-convention environment variables for the run."""
    os.environ.update(
        {
            "OTEL_SEMCONV_STABILITY_OPT_IN": "gen_ai_latest_experimental",
            "OTEL_INSTRUMENTATION_GENAI_CAPTURE_MESSAGE_CONTENT": "SPAN_ONLY",
        }
    )


# ==================== Exporters ====================


@pytest.fixture(scope="function", name="span_exporter")
def fixture_span_exporter():
    """Provide a fresh in-memory span exporter for each test."""
    return InMemorySpanExporter()


@pytest.fixture(scope="function", name="log_exporter")
def fixture_log_exporter():
    """Provide a fresh in-memory log exporter for each test."""
    return InMemoryLogExporter()


@pytest.fixture(scope="function", name="metric_reader")
def fixture_metric_reader():
    """Provide a fresh in-memory metric reader for each test."""
    return InMemoryMetricReader()


# ==================== Providers ====================


@pytest.fixture(scope="function", name="tracer_provider")
def fixture_tracer_provider(span_exporter):
    """Build a tracer provider that exports synchronously to ``span_exporter``."""
    tracer_provider = TracerProvider()
    tracer_provider.add_span_processor(SimpleSpanProcessor(span_exporter))
    return tracer_provider


@pytest.fixture(scope="function", name="logger_provider")
def fixture_logger_provider(log_exporter):
    """Build a logger provider that exports synchronously to ``log_exporter``."""
    logs_provider = LoggerProvider()
    logs_provider.add_log_record_processor(SimpleLogRecordProcessor(log_exporter))
    return logs_provider


@pytest.fixture(scope="function", name="meter_provider")
def fixture_meter_provider(metric_reader):
    """Build a meter provider that reads through ``metric_reader``."""
    return MeterProvider(metric_readers=[metric_reader])


# ==================== Instrumentation ====================


@pytest.fixture(scope="function")
def instrument(tracer_provider, logger_provider, meter_provider):
    """Instrument VitaBench for one test and uninstrument afterwards.

    The dependency check is skipped so the instrumentor is applied
    regardless of which vita distribution is installed.
    """
    vita_instrumentor = VitaInstrumentor()
    vita_instrumentor.instrument(
        tracer_provider=tracer_provider,
        logger_provider=logger_provider,
        meter_provider=meter_provider,
        skip_dep_check=True,
    )
    yield vita_instrumentor
    vita_instrumentor.uninstrument()
+""" + +from __future__ import annotations + +from types import SimpleNamespace +from unittest.mock import MagicMock, patch + +import pytest + +from opentelemetry.instrumentation.vita import VitaInstrumentor + + +FAKE_MODELS_CONFIG = { + "qwen-max": { + "base_url": "http://fake-api.example.com/v1/chat/completions", + "headers": {"Authorization": "Bearer test-key"}, + }, + "gpt-4": { + "base_url": "http://fake-api.example.com/v1/chat/completions", + "headers": {"Authorization": "Bearer test-key"}, + }, + "claude-3-opus": { + "base_url": "http://fake-api.example.com/v1/chat/completions", + "headers": {"Authorization": "Bearer test-key"}, + }, +} + + +def _make_openai_response(content=None, tool_calls=None, usage=None): + message = {"role": "assistant", "content": content} + if tool_calls: + message["tool_calls"] = tool_calls + return { + "id": "chatcmpl-test", + "model": "test-model", + "choices": [{"message": message, "finish_reason": "stop"}], + "usage": usage + or {"prompt_tokens": 100, "completion_tokens": 50, "total_tokens": 150}, + } + + +def _mock_requests_post(response_dict): + mock_resp = MagicMock() + mock_resp.status_code = 200 + mock_resp.json.return_value = response_dict + return mock_resp + + +def _tool_call_response(): + return _make_openai_response( + tool_calls=[ + { + "id": "call_1", + "type": "function", + "function": { + "name": "get_order", + "arguments": '{"order_id": "123"}', + }, + } + ], + usage={"prompt_tokens": 100, "completion_tokens": 20, "total_tokens": 120}, + ) + + +def _text_response(content="Order 123 has been delivered. 
###STOP###"): + return _make_openai_response( + content=content, + usage={"prompt_tokens": 200, "completion_tokens": 30, "total_tokens": 230}, + ) + + +class FakeTool: + name = "get_order" + short_desc = "Get order details" + openai_schema = { + "type": "function", + "function": { + "name": "get_order", + "description": "Get order details", + "parameters": { + "type": "object", + "properties": {"order_id": {"type": "string"}}, + }, + }, + } + + +class FakeTools: + def __init__(self): + self.db = SimpleNamespace(time="2026-01-01 00:00:00") + self._tools = {"get_order": FakeTool()} + + def get_tools(self): + return self._tools + + def use_tool(self, tool_name, **kwargs): + return {"tool": tool_name, "arguments": kwargs, "status": "delivered"} + + def get_db_hash(self): + return "fake-db-hash" + + +class DeterministicUser: + def get_init_state(self, message_history=None): + return SimpleNamespace(messages=message_history or []) + + def generate_next_message(self, message, state): + from vita.data_model.message import UserMessage + + user_message = UserMessage(role="user", content="Check order 123") + state.messages.append(user_message) + return user_message, state + + +def _make_agent(): + from vita.agent.llm_agent import LLMAgent + + return LLMAgent( + tools=[FakeTool()], + domain_policy="You are helpful at {time}.", + llm="qwen-max", + llm_args={}, + time="2026-01-01 00:00:00", + language="english", + ) + + +def _make_orchestrator(): + from vita.environment.environment import Environment + from vita.orchestrator.orchestrator import Orchestrator + + return Orchestrator( + domain="delivery", + agent=_make_agent(), + user=DeterministicUser(), + environment=Environment(domain_name="delivery", tools=FakeTools()), + task=SimpleNamespace( + id="task_001", + instructions="Check order 123", + message_history=None, + ), + max_steps=6, + max_errors=3, + language="english", + ) + + +def _span_attrs(spans, name): + span = next(s for s in spans if s.name == name) + return 
dict(span.attributes) + + +class TestVitaInstrumentor: + def test_instrument_and_uninstrument( + self, tracer_provider, logger_provider, meter_provider + ): + instrumentor = VitaInstrumentor() + instrumentor.instrument( + tracer_provider=tracer_provider, + logger_provider=logger_provider, + meter_provider=meter_provider, + skip_dep_check=True, + ) + assert instrumentor._handler is not None + instrumentor.uninstrument() + assert instrumentor._handler is None + + def test_instrumentation_dependencies(self): + assert VitaInstrumentor().instrumentation_dependencies() == ( + "vita >= 0.0.1", + ) + + +class TestLLMSpan: + def test_llm_span_text_response(self, instrument, span_exporter): + from vita.data_model.message import UserMessage + from vita.utils.llm_utils import generate + + with patch("vita.utils.llm_utils.models", FAKE_MODELS_CONFIG), patch( + "requests.post", + return_value=_mock_requests_post( + _make_openai_response( + content="The order has been delivered.", + usage={ + "prompt_tokens": 150, + "completion_tokens": 30, + "total_tokens": 180, + }, + ) + ), + ): + result = generate( + model="qwen-max", + messages=[UserMessage(role="user", content="Where is my order?")], + ) + + assert result.content == "The order has been delivered." 
+ spans = span_exporter.get_finished_spans() + attrs = _span_attrs(spans, "chat qwen-max") + assert attrs["gen_ai.operation.name"] == "chat" + assert attrs["gen_ai.span.kind"] == "LLM" + assert attrs["gen_ai.request.model"] == "qwen-max" + assert attrs["gen_ai.provider.name"] == "alibaba_cloud" + assert attrs["gen_ai.usage.input_tokens"] == 150 + assert attrs["gen_ai.usage.output_tokens"] == 30 + assert attrs["gen_ai.response.finish_reasons"] == ("stop",) + + def test_llm_span_tool_call_response(self, instrument, span_exporter): + from vita.data_model.message import UserMessage + from vita.utils.llm_utils import generate + + with patch("vita.utils.llm_utils.models", FAKE_MODELS_CONFIG), patch( + "requests.post", return_value=_mock_requests_post(_tool_call_response()) + ): + result = generate( + model="gpt-4", + messages=[UserMessage(role="user", content="Check my order")], + ) + + assert result.tool_calls is not None + attrs = _span_attrs(span_exporter.get_finished_spans(), "chat gpt-4") + assert attrs["gen_ai.response.finish_reasons"] == ("tool_calls",) + assert attrs["gen_ai.provider.name"] == "openai" + + def test_llm_span_captures_positional_tools(self, instrument, span_exporter): + from vita.data_model.message import UserMessage + from vita.utils.llm_utils import generate + + with patch("vita.utils.llm_utils.models", FAKE_MODELS_CONFIG), patch( + "requests.post", return_value=_mock_requests_post(_text_response("Done.")) + ): + generate( + "qwen-max", + [UserMessage(role="user", content="Check my order")], + [FakeTool()], + ) + + attrs = _span_attrs(span_exporter.get_finished_spans(), "chat qwen-max") + assert "gen_ai.tool.definitions" in attrs + assert "get_order" in attrs["gen_ai.tool.definitions"] + + +class TestToolSpan: + def test_tool_span_created(self, instrument, span_exporter): + from vita.data_model.message import ToolCall + from vita.environment.environment import Environment + + env = Environment(domain_name="delivery", tools=FakeTools()) + result = 
env.get_response( + ToolCall(id="tc_42", name="get_order", arguments={"order_id": "999"}) + ) + + assert result.content is not None + attrs = _span_attrs( + span_exporter.get_finished_spans(), "execute_tool get_order" + ) + assert attrs["gen_ai.operation.name"] == "execute_tool" + assert attrs["gen_ai.span.kind"] == "TOOL" + assert attrs["gen_ai.tool.name"] == "get_order" + assert attrs["gen_ai.tool.call.id"] == "tc_42" + + def test_tool_span_on_error(self, instrument, span_exporter): + from vita.data_model.message import ToolCall + from vita.environment.environment import Environment + + tools = FakeTools() + tools.use_tool = MagicMock(side_effect=RuntimeError("Tool failed")) + env = Environment(domain_name="delivery", tools=tools) + result = env.get_response( + ToolCall(id="tc_err", name="get_order", arguments={}) + ) + + assert result.error is True + tool_span = next( + s + for s in span_exporter.get_finished_spans() + if s.name == "execute_tool get_order" + ) + assert tool_span.status.status_code.name == "ERROR" + + +class TestAgentSpan: + def test_agent_span_created_for_llm_agent(self, instrument, span_exporter): + from vita.data_model.message import UserMessage + + agent = _make_agent() + state = agent.get_init_state([]) + + with patch("vita.utils.llm_utils.models", FAKE_MODELS_CONFIG), patch( + "requests.post", return_value=_mock_requests_post(_text_response("Sure.")) + ): + assistant_msg, _ = agent.generate_next_message( + UserMessage(role="user", content="I need help"), state + ) + + assert assistant_msg.content == "Sure." 
+ spans = span_exporter.get_finished_spans() + agent_span = next(s for s in spans if s.name == "invoke_agent LLMAgent") + llm_span = next(s for s in spans if s.name == "chat qwen-max") + attrs = dict(agent_span.attributes) + assert attrs["gen_ai.operation.name"] == "invoke_agent" + assert attrs["gen_ai.span.kind"] == "AGENT" + assert attrs["gen_ai.agent.name"] == "LLMAgent" + assert attrs["gen_ai.request.model"] == "qwen-max" + assert llm_span.parent.span_id == agent_span.context.span_id + + def test_agent_span_created_for_llm_solo_agent(self, instrument, span_exporter): + from vita.agent.llm_agent import LLMSoloAgent + + agent = LLMSoloAgent( + tools=[FakeTool()], + domain_policy="unused", + llm="qwen-max", + llm_args={}, + time="2026-01-01 00:00:00", + language="english", + ) + state = agent.get_init_state([]) + + with patch("vita.utils.llm_utils.models", FAKE_MODELS_CONFIG), patch( + "requests.post", return_value=_mock_requests_post(_tool_call_response()) + ): + agent.generate_next_message(None, state) + + attrs = _span_attrs( + span_exporter.get_finished_spans(), "invoke_agent LLMSoloAgent" + ) + assert attrs["gen_ai.span.kind"] == "AGENT" + assert attrs["gen_ai.agent.name"] == "LLMSoloAgent" + + +class TestStepAndChainSpans: + def test_orchestrator_run_creates_chain_steps_agents_llms_and_tools( + self, instrument, span_exporter + ): + responses = [ + _mock_requests_post(_tool_call_response()), + _mock_requests_post(_text_response()), + ] + + with patch("vita.utils.llm_utils.models", FAKE_MODELS_CONFIG), patch( + "requests.post", side_effect=responses + ): + result = _make_orchestrator().run() + + assert result.termination_reason == "agent_stop" + spans = span_exporter.get_finished_spans() + chain = next(s for s in spans if s.name == "workflow delivery") + steps = sorted( + [s for s in spans if s.name == "react step"], key=lambda s: s.start_time + ) + agents = sorted( + [s for s in spans if s.name == "invoke_agent LLMAgent"], + key=lambda s: s.start_time, + ) + 
llms = sorted( + [s for s in spans if s.name == "chat qwen-max"], + key=lambda s: s.start_time, + ) + tools = [s for s in spans if s.name == "execute_tool get_order"] + + assert len(steps) == 2 + assert len(agents) == 2 + assert len(llms) == 2 + assert len(tools) == 1 + + chain_attrs = dict(chain.attributes) + assert chain_attrs["gen_ai.operation.name"] == "workflow" + assert chain_attrs["gen_ai.span.kind"] == "CHAIN" + assert chain_attrs["gen_ai.framework"] == "vitabench" + + assert dict(steps[0].attributes)["gen_ai.react.round"] == 1 + assert dict(steps[1].attributes)["gen_ai.react.round"] == 2 + for step in steps: + assert step.parent.span_id == chain.context.span_id + assert agents[0].parent.span_id == steps[0].context.span_id + assert agents[1].parent.span_id == steps[1].context.span_id + assert llms[0].parent.span_id == agents[0].context.span_id + assert llms[1].parent.span_id == agents[1].context.span_id + assert tools[0].parent.span_id == steps[0].context.span_id + + def test_open_step_fails_when_env_turn_raises(self, instrument, span_exporter): + with patch("vita.utils.llm_utils.models", FAKE_MODELS_CONFIG), patch( + "requests.post", return_value=_mock_requests_post(_tool_call_response()) + ), patch( + "vita.environment.environment.Environment.get_response", + side_effect=RuntimeError("env broke"), + ): + with pytest.raises(RuntimeError, match="env broke"): + _make_orchestrator().run() + + spans = span_exporter.get_finished_spans() + step = next(s for s in spans if s.name == "react step") + chain = next(s for s in spans if s.name == "workflow delivery") + step_attrs = dict(step.attributes) + assert step.status.status_code.name == "ERROR" + assert step_attrs["gen_ai.react.finish_reason"] == "error" + assert chain.status.status_code.name == "ERROR" + + +class TestEntrySpan: + def test_run_task_entry_wraps_orchestrator_trace(self, instrument, span_exporter): + from vita.run import run_task + + def fake_internal(**kwargs): + return _make_orchestrator().run() + 
+ responses = [ + _mock_requests_post(_tool_call_response()), + _mock_requests_post(_text_response()), + ] + task = SimpleNamespace( + id="task_001", + instructions="Check order 123", + message_history=None, + ) + + with patch("vita.run._run_task_internal", side_effect=fake_internal), patch( + "vita.utils.llm_utils.models", FAKE_MODELS_CONFIG + ), patch("requests.post", side_effect=responses): + result = run_task("delivery", task, "llm_agent", "user_simulator") + + assert result.termination_reason == "agent_stop" + spans = span_exporter.get_finished_spans() + entry = next(s for s in spans if s.name == "enter_ai_application_system") + chain = next(s for s in spans if s.name == "workflow delivery") + attrs = dict(entry.attributes) + assert attrs["gen_ai.operation.name"] == "enter" + assert attrs["gen_ai.span.kind"] == "ENTRY" + assert attrs["gen_ai.framework"] == "vitabench" + assert "gen_ai.session.id" in attrs + assert chain.parent.span_id == entry.context.span_id + + +class TestProviderInference: + def test_common_provider_names(self, instrument, span_exporter): + from vita.data_model.message import UserMessage + from vita.utils.llm_utils import generate + + for model in ("gpt-4", "claude-3-opus", "qwen-max"): + with patch("vita.utils.llm_utils.models", FAKE_MODELS_CONFIG), patch( + "requests.post", + return_value=_mock_requests_post(_make_openai_response(content="Hi")), + ): + generate( + model=model, + messages=[UserMessage(role="user", content="Hi")], + ) + + providers = { + dict(s.attributes)["gen_ai.request.model"]: dict(s.attributes)[ + "gen_ai.provider.name" + ] + for s in span_exporter.get_finished_spans() + if s.name.startswith("chat ") + } + assert providers["gpt-4"] == "openai" + assert providers["claude-3-opus"] == "anthropic" + assert providers["qwen-max"] == "alibaba_cloud" diff --git a/instrumentation-loongsuite/loongsuite-instrumentation-webarena/pyproject.toml b/instrumentation-loongsuite/loongsuite-instrumentation-webarena/pyproject.toml new file 
mode 100644 index 000000000..c2fc31949 --- /dev/null +++ b/instrumentation-loongsuite/loongsuite-instrumentation-webarena/pyproject.toml @@ -0,0 +1,54 @@ +[build-system] +requires = ["hatchling"] +build-backend = "hatchling.build" + +[project] +name = "loongsuite-instrumentation-webarena" +dynamic = ["version"] +description = "LoongSuite webarena instrumentation" +license = "Apache-2.0" +requires-python = ">=3.10,<4" +authors = [ + { name = "OpenTelemetry Authors", email = "cncf-opentelemetry-contributors@lists.cncf.io" }, +] +classifiers = [ + "Development Status :: 4 - Beta", + "Intended Audience :: Developers", + "License :: OSI Approved :: Apache Software License", + "Programming Language :: Python", + "Programming Language :: Python :: 3", + "Programming Language :: Python :: 3.10", + "Programming Language :: Python :: 3.11", + "Programming Language :: Python :: 3.12", + "Programming Language :: Python :: 3.13", +] +dependencies = [ + "opentelemetry-api >= 1.37.0", + "opentelemetry-instrumentation >= 0.58b0", + "opentelemetry-semantic-conventions >= 0.58b0", + "wrapt >= 1.0.0, < 2.0.0", +] + +[project.optional-dependencies] +instruments = [ + "webarena >= 0.0.1" +] + +[project.entry-points.opentelemetry_instrumentor] +webarena = "opentelemetry.instrumentation.webarena:WebarenaInstrumentor" + +[project.urls] +Homepage = "https://github.com/alibaba/loongsuite-python-agent/tree/main/instrumentation-loongsuite/loongsuite-instrumentation-webarena" +Repository = "https://github.com/alibaba/loongsuite-python-agent" + +[tool.hatch.version] +path = "src/opentelemetry/instrumentation/webarena/version.py" + +[tool.hatch.build.targets.sdist] +include = [ + "/src", + "/tests", +] + +[tool.hatch.build.targets.wheel] +packages = ["src/opentelemetry"] diff --git a/instrumentation-loongsuite/loongsuite-instrumentation-webarena/src/opentelemetry/instrumentation/webarena/__init__.py 
b/instrumentation-loongsuite/loongsuite-instrumentation-webarena/src/opentelemetry/instrumentation/webarena/__init__.py new file mode 100644 index 000000000..c822df538 --- /dev/null +++ b/instrumentation-loongsuite/loongsuite-instrumentation-webarena/src/opentelemetry/instrumentation/webarena/__init__.py @@ -0,0 +1,231 @@ +""" +OpenTelemetry WebArena Instrumentation +====================================== + +Automatic instrumentation for the +`WebArena <https://github.com/web-arena-x/webarena>`_ benchmark framework. + +Span hierarchy +-------------- + +:: + + ENTRY webarena_task (per task; ScriptBrowserEnv.reset) + └── CHAIN workflow webarena_task (same lifecycle as ENTRY) + ├── STEP react step (one per ReAct round) + │ ├── AGENT invoke_agent (PromptAgent.next_action) + │ │ ├── TASK build_prompt_context (PromptConstructor.construct) + │ │ └── LLM chat / text_completion + │ │ * OpenAI provider — emitted by the OpenAI SDK probe + │ │ * HuggingFace provider — emitted by THIS package + │ └── TOOL execute_tool {action_type} (ScriptBrowserEnv.step) + └── ... + + AGENT create_agent (one-shot; construct_agent) + +Design principles +----------------- + +* **Do not double-emit OpenAI LLM spans.** WebArena's + ``generate_from_openai_chat_completion`` / ``generate_from_openai_completion`` + ultimately call ``openai.ChatCompletion.create`` / + ``openai.Completion.create`` which already have a dedicated OpenAI SDK + instrumentor (e.g. ``opentelemetry-instrumentation-openai``). We rely on + *that* instrumentor for token usage / model / finish-reason and let its + LLM span attach itself naturally as a child of our AGENT span via the + shared OTel context. +* **HuggingFace path is ours.** The ``text_generation`` client has no + off-the-shelf probe, so we wrap + ``llms.providers.hf_utils.generate_from_huggingface_completion`` to emit + an LLM span for that path. 
+* **No invasive rewrite of ``run.py:test()``.** ENTRY / CHAIN / STEP are + synthesised by latching on to ``ScriptBrowserEnv.reset`` (task start), + ``ScriptBrowserEnv.close`` (batch end) and ``PromptAgent.next_action`` + (round start). See ``internal/_state.py`` for the state machine. + +Usage +----- + +.. code:: python + + from opentelemetry.instrumentation.webarena import WebarenaInstrumentor + + WebarenaInstrumentor().instrument() + + # Then run WebArena as normal (e.g. ``python run.py ...``). +""" + +from __future__ import annotations + +import logging +from typing import Any, Collection + +from opentelemetry import trace as trace_api +from opentelemetry.instrumentation.instrumentor import BaseInstrumentor +from wrapt import wrap_function_wrapper + +from opentelemetry.instrumentation.webarena.package import _instruments +from opentelemetry.instrumentation.webarena.version import __version__ + +logger = logging.getLogger(__name__) + +__all__ = ["WebarenaInstrumentor"] + + +# WebArena uses *flat* package names (``setup.cfg`` declares ``packages = +# browser_env, agent, evaluation_harness, llms`` with no ``webarena.`` +# prefix). Patch targets therefore use the bare module names. +_PATCH_TARGETS = ( + # (module, qualname, wrapper_attr_name) + ("browser_env.envs", "ScriptBrowserEnv.reset", "_env_reset_wrapper"), + ("browser_env.envs", "ScriptBrowserEnv.close", "_env_close_wrapper"), + ("browser_env.envs", "ScriptBrowserEnv.step", "_env_step_wrapper"), + ("agent.agent", "construct_agent", "_construct_agent_wrapper"), + ("agent.agent", "PromptAgent.next_action", "_next_action_wrapper"), +) + +# PromptConstructor.construct is abstract on the base class, so we patch +# the two known concrete subclasses individually. 
+_PROMPT_CONSTRUCTOR_TARGETS = ( + ("agent.prompts.prompt_constructor", "DirectPromptConstructor.construct"), + ("agent.prompts.prompt_constructor", "CoTPromptConstructor.construct"), +) + +_HF_TARGET = ("llms.providers.hf_utils", "generate_from_huggingface_completion") + + +class WebarenaInstrumentor(BaseInstrumentor): + """An ``opentelemetry-instrumentation`` plugin for WebArena. + + Spans (see module docstring) are emitted via ``wrapt`` hooks on six + framework functions plus an optional HuggingFace LLM hook. OpenAI LLM + spans are intentionally **not** emitted here (the OpenAI SDK probe + handles them). + """ + + _patched: list[tuple[str, str]] = [] + _patched_hf: bool = False + + def instrumentation_dependencies(self) -> Collection[str]: + return _instruments + + def _instrument(self, **kwargs: Any) -> None: + tracer_provider = kwargs.get("tracer_provider") + tracer = trace_api.get_tracer( + __name__, __version__, tracer_provider=tracer_provider + ) + + from opentelemetry.instrumentation.webarena.internal._wrappers import ( + ConstructAgentWrapper, + EnvCloseWrapper, + EnvResetWrapper, + EnvStepWrapper, + HuggingFaceCompletionWrapper, + NextActionWrapper, + PromptConstructWrapper, + ) + + wrappers = { + "_env_reset_wrapper": EnvResetWrapper(tracer), + "_env_close_wrapper": EnvCloseWrapper(), + "_env_step_wrapper": EnvStepWrapper(tracer), + "_construct_agent_wrapper": ConstructAgentWrapper(tracer), + "_next_action_wrapper": NextActionWrapper(tracer), + } + + # --- core patches (mandatory) ------------------------------------ + type(self)._patched = [] + for module, qualname, wrapper_key in _PATCH_TARGETS: + try: + wrap_function_wrapper( + module=module, + name=qualname, + wrapper=wrappers[wrapper_key], + ) + type(self)._patched.append((module, qualname)) + except Exception as exc: # noqa: BLE001 + logger.warning( + "WebarenaInstrumentor: could not wrap %s.%s: %s", + module, + qualname, + exc, + ) + + # --- PromptConstructor (two concrete subclasses) 
------------------ + prompt_wrapper = PromptConstructWrapper(tracer) + for module, qualname in _PROMPT_CONSTRUCTOR_TARGETS: + try: + wrap_function_wrapper( + module=module, name=qualname, wrapper=prompt_wrapper + ) + type(self)._patched.append((module, qualname)) + except Exception as exc: # noqa: BLE001 + logger.warning( + "WebarenaInstrumentor: could not wrap %s.%s: %s", + module, + qualname, + exc, + ) + + # --- HuggingFace provider (optional, only if module imports OK) -- + try: + wrap_function_wrapper( + module=_HF_TARGET[0], + name=_HF_TARGET[1], + wrapper=HuggingFaceCompletionWrapper(tracer), + ) + type(self)._patched_hf = True + except Exception as exc: # noqa: BLE001 + logger.debug( + "WebarenaInstrumentor: skipping HuggingFace wrapper: %s", exc + ) + + def _uninstrument(self, **kwargs: Any) -> None: + from opentelemetry.instrumentation.webarena.internal import _state as state + + # Always make sure we don't leak open spans on uninstrument. + try: + state.end_task_spans() + except Exception: # noqa: BLE001 + pass + + # Unwrap each successfully-patched target. We import the module + # lazily so uninstrument doesn't fail when WebArena is no longer + # importable (e.g. during teardown). 
+ for module, qualname in list(type(self)._patched): + self._safe_unwrap(module, qualname) + type(self)._patched = [] + + if type(self)._patched_hf: + self._safe_unwrap(_HF_TARGET[0], _HF_TARGET[1]) + type(self)._patched_hf = False + + @staticmethod + def _safe_unwrap(module: str, qualname: str) -> None: + try: + import importlib # noqa: PLC0415 + + mod = importlib.import_module(module) + except Exception as exc: # noqa: BLE001 + logger.debug( + "WebarenaInstrumentor: could not import %s for unwrap: %s", + module, + exc, + ) + return + + parts = qualname.split(".") + try: + target = mod + for p in parts[:-1]: + target = getattr(target, p) + attr = getattr(target, parts[-1], None) + if attr is not None and hasattr(attr, "__wrapped__"): + setattr(target, parts[-1], attr.__wrapped__) + except Exception as exc: # noqa: BLE001 + logger.debug( + "WebarenaInstrumentor: could not unwrap %s.%s: %s", + module, + qualname, + exc, + ) diff --git a/instrumentation-loongsuite/loongsuite-instrumentation-webarena/src/opentelemetry/instrumentation/webarena/config.py b/instrumentation-loongsuite/loongsuite-instrumentation-webarena/src/opentelemetry/instrumentation/webarena/config.py new file mode 100644 index 000000000..870338425 --- /dev/null +++ b/instrumentation-loongsuite/loongsuite-instrumentation-webarena/src/opentelemetry/instrumentation/webarena/config.py @@ -0,0 +1,40 @@ +"""Configuration via environment variables.""" + +from __future__ import annotations + +import os + + +def _int_env(name: str, default: str) -> int: + try: + return int(os.getenv(name, default)) + except ValueError: + return int(default) + + +def _bool_env(name: str, default: bool = False) -> bool: + raw = os.getenv(name) + if raw is None: + return default + return raw.strip().lower() in {"1", "true", "yes", "on"} + + +# Cap on non-content string attribute values (URLs, tool names, etc.) 
+WEBARENA_OTEL_MAX_ATTR_LENGTH = _int_env( + "WEBARENA_OTEL_MAX_ATTR_LENGTH", "1024" +) + +# Cap on prompt / message preview length when capture-message-content is on +WEBARENA_OTEL_PROMPT_PREVIEW_MAX_LEN = _int_env( + "WEBARENA_OTEL_PROMPT_PREVIEW_MAX_LEN", "4096" +) + + +def capture_message_content() -> bool: + """Whether to record prompt / completion / tool argument bodies. + + Honours the standard semantic-conventions opt-in flag. + """ + return _bool_env( + "OTEL_INSTRUMENTATION_GENAI_CAPTURE_MESSAGE_CONTENT", False + ) diff --git a/instrumentation-loongsuite/loongsuite-instrumentation-webarena/src/opentelemetry/instrumentation/webarena/internal/__init__.py b/instrumentation-loongsuite/loongsuite-instrumentation-webarena/src/opentelemetry/instrumentation/webarena/internal/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/instrumentation-loongsuite/loongsuite-instrumentation-webarena/src/opentelemetry/instrumentation/webarena/internal/_attrs.py b/instrumentation-loongsuite/loongsuite-instrumentation-webarena/src/opentelemetry/instrumentation/webarena/internal/_attrs.py new file mode 100644 index 000000000..28db7f0ca --- /dev/null +++ b/instrumentation-loongsuite/loongsuite-instrumentation-webarena/src/opentelemetry/instrumentation/webarena/internal/_attrs.py @@ -0,0 +1,131 @@ +"""Attribute / span-name constants and helpers for WebArena spans.""" + +from __future__ import annotations + +import json +from typing import Any, Iterable + +from opentelemetry.instrumentation.webarena.config import ( + WEBARENA_OTEL_MAX_ATTR_LENGTH, + WEBARENA_OTEL_PROMPT_PREVIEW_MAX_LEN, +) + +# --- vendor-extended attribute names ----------------------------------- + +GEN_AI_SPAN_KIND = "gen_ai.span.kind" +GEN_AI_FRAMEWORK = "gen_ai.framework" +GEN_AI_USAGE_TOTAL_TOKENS = "gen_ai.usage.total_tokens" +GEN_AI_REACT_ROUND = "gen_ai.react.round" +GEN_AI_REACT_FINISH_REASON = "gen_ai.react.finish_reason" + +# WebArena-specific attribute names +WEBARENA_TASK_ID = 
"webarena.task.id" +WEBARENA_SITES = "webarena.sites" +WEBARENA_REQUIRE_LOGIN = "webarena.require_login" +WEBARENA_OBSERVATION_TYPE = "webarena.observation_type" +WEBARENA_ACTION_SET_TAG = "webarena.action_set_tag" +WEBARENA_ACTION_TYPE = "webarena.action.type" +WEBARENA_FAIL_ERROR = "webarena.fail_error" +WEBARENA_PAGE_URL_BEFORE = "webarena.page.url.before" +WEBARENA_PAGE_URL_AFTER = "webarena.page.url.after" +WEBARENA_BROWSER_ELEMENT_ID = "webarena.browser.element_id" +WEBARENA_OBSERVATION_MAIN_TYPE = "webarena.observation.main_type" +WEBARENA_STEP_COUNT = "webarena.step.count" +WEBARENA_TOOL_COUNT = "webarena.tool.count" +WEBARENA_PARSING_FAILURE_COUNT = "webarena.parsing_failure.count" +WEBARENA_PREVIOUS_ACTION = "webarena.previous_action" +WEBARENA_MEMORY_TRAJECTORY_LENGTH = "webarena.memory.trajectory_length" +WEBARENA_MEMORY_OBS_TEXT_LENGTH = "webarena.memory.obs_text_length" + +FRAMEWORK_NAME = "webarena" + + +def truncate(value: str, max_len: int = WEBARENA_OTEL_MAX_ATTR_LENGTH) -> str: + """Trim a string attribute to ``max_len`` characters with an ellipsis.""" + if value is None: + return "" + if not isinstance(value, str): + value = str(value) + if len(value) <= max_len: + return value + if max_len <= 3: + return value[:max_len] + return value[: max_len - 3] + "..." 
+ + +def truncate_content(value: str) -> str: + """Trim a body / message-style attribute (longer cap than truncate()).""" + return truncate(value, WEBARENA_OTEL_PROMPT_PREVIEW_MAX_LEN) + + +def safe_json_dumps(value: Any, max_len: int | None = None) -> str: + """JSON-encode ``value`` with best-effort fallback to ``str``.""" + try: + text = json.dumps(value, ensure_ascii=False, default=str) + except Exception: # noqa: BLE001 + text = str(value) + if max_len is None: + return truncate(text) + return truncate(text, max_len) + + +def action_type_name(action: Any) -> str: + """Resolve an Action dict's ``action_type`` to its enum name.""" + if not isinstance(action, dict): + return "UNKNOWN" + raw = action.get("action_type") + if raw is None: + return "UNKNOWN" + name = getattr(raw, "name", None) + if name: + return str(name) + try: + from browser_env.actions import ActionTypes # noqa: PLC0415 + return ActionTypes(raw).name + except Exception: # noqa: BLE001 + return str(raw) + + +def action_arguments(action: Any) -> dict[str, Any]: + """Extract a small JSON-friendly subset of an Action dict. + + We deliberately drop high-volume / binary-ish fields like + ``coords``, ``raw_prediction`` and ``page_screenshot`` so the + serialised value stays under the attribute length cap. 
+ """ + if not isinstance(action, dict): + return {} + keep_keys: Iterable[str] = ( + "element_id", + "element_role", + "element_name", + "url", + "text", + "key_comb", + "direction", + "amount", + "answer", + "pw_code", + "nth", + ) + out: dict[str, Any] = {"action_type": action_type_name(action)} + for k in keep_keys: + v = action.get(k) + if v in (None, "", [], {}): + continue + out[k] = v + return out + + +def messages_to_input_value(messages: Any) -> str: + """Compact representation of an LLM/agent prompt for ``input.value``.""" + if isinstance(messages, str): + return truncate_content(messages) + if isinstance(messages, list): + try: + return safe_json_dumps( + messages, max_len=WEBARENA_OTEL_PROMPT_PREVIEW_MAX_LEN + ) + except Exception: # noqa: BLE001 + return truncate_content(str(messages)) + return truncate_content(str(messages)) diff --git a/instrumentation-loongsuite/loongsuite-instrumentation-webarena/src/opentelemetry/instrumentation/webarena/internal/_state.py b/instrumentation-loongsuite/loongsuite-instrumentation-webarena/src/opentelemetry/instrumentation/webarena/internal/_state.py new file mode 100644 index 000000000..8244ac403 --- /dev/null +++ b/instrumentation-loongsuite/loongsuite-instrumentation-webarena/src/opentelemetry/instrumentation/webarena/internal/_state.py @@ -0,0 +1,185 @@ +"""Lifecycle state shared across WebArena wrappers. + +WebArena's ``run.py:test()`` is a single function with a *for* loop over +config files (one task each) and a nested *while* loop (one ReAct round +each). 
It exposes no per-task hook, so we synthesise ENTRY / CHAIN / STEP +spans by latching on to the boundaries that *do* exist: + +* ``ScriptBrowserEnv.reset(...)`` — first call after a task starts +* ``ScriptBrowserEnv.close(...)`` — end of the whole batch +* ``PromptAgent.next_action(...)`` — start of a new ReAct round +* ``ScriptBrowserEnv.step(...)`` — execution of the picked action + +This module owns the ``ContextVar`` slots used to thread span handles +between those wrappers in a single process / thread, and the helpers +that close any spans that may still be open when an outer boundary +fires. +""" + +from __future__ import annotations + +from contextvars import ContextVar +from typing import Any + +from opentelemetry import context as otel_context + +# Whether we are currently inside a WebArena task (between an env.reset +# and the next env.reset / env.close). Used by the AGENT(invoke_agent) +# wrapper to decide whether STEP rotation is meaningful. +_in_task: ContextVar[bool] = ContextVar("webarena_in_task", default=False) + +# ENTRY span handle + its attached context token. +_entry_span: ContextVar[Any] = ContextVar("webarena_entry_span", default=None) +_entry_token: ContextVar[Any] = ContextVar("webarena_entry_token", default=None) + +# CHAIN(workflow) span handle + token (always nested inside ENTRY). +_chain_span: ContextVar[Any] = ContextVar("webarena_chain_span", default=None) +_chain_token: ContextVar[Any] = ContextVar("webarena_chain_token", default=None) + +# Currently active STEP span handle + token. +_step_span: ContextVar[Any] = ContextVar("webarena_step_span", default=None) +_step_token: ContextVar[Any] = ContextVar("webarena_step_token", default=None) + +# Per-task counters, used to populate STEP attributes / CHAIN summaries. 
_step_counter: ContextVar[int] = ContextVar("webarena_step_counter", default=0)
_tool_counter: ContextVar[int] = ContextVar("webarena_tool_counter", default=0)
_parsing_failure_counter: ContextVar[int] = ContextVar(
    "webarena_parsing_failure_counter", default=0
)


def _detach_token(token: Any) -> None:
    """Detach an OTel context token, swallowing already-detached errors."""
    if token is None:
        return
    try:
        otel_context.detach(token)
    except Exception:  # noqa: BLE001
        pass


def _close_slot(span_var: ContextVar[Any], token_var: ContextVar[Any]) -> None:
    """End the span held in *span_var* (if any) and detach *token_var*.

    Shared by ``end_step`` / ``end_chain`` / ``end_entry`` so the three
    slots are always torn down the same way: end the span first, then
    detach its context token, then clear both slots.
    """
    span = span_var.get(None)
    token = token_var.get(None)
    if span is not None:
        try:
            span.end()
        except Exception:  # noqa: BLE001
            pass
        span_var.set(None)
    _detach_token(token)
    token_var.set(None)


def _bump(counter: ContextVar[int]) -> int:
    """Increment *counter* by one and return the new value."""
    value = int(counter.get(0)) + 1
    counter.set(value)
    return value


def end_step() -> int:
    """Close the active STEP span (if any) and return the round number it had.

    Returns ``0`` when no STEP was active or when the span does not expose
    readable attributes.
    """
    span = _step_span.get(None)
    round_no = 0
    if span is not None:
        # The round number must be read before the span is ended/cleared.
        try:
            round_no = int(span.attributes.get("gen_ai.react.round", 0))  # type: ignore[union-attr]
        except Exception:  # noqa: BLE001
            round_no = 0
    _close_slot(_step_span, _step_token)
    return round_no


def end_chain() -> None:
    """Close the active CHAIN span (if any) and detach its token."""
    _close_slot(_chain_span, _chain_token)


def end_entry() -> None:
    """Close the active ENTRY span (if any) and detach its token."""
    _close_slot(_entry_span, _entry_token)


def end_task_spans() -> None:
    """Close STEP → CHAIN → ENTRY (most-nested first) and reset task state."""
    end_step()
    end_chain()
    end_entry()
    _in_task.set(False)
    _step_counter.set(0)
    _tool_counter.set(0)
    _parsing_failure_counter.set(0)


def in_task() -> bool:
    """Return whether a WebArena task is currently marked as active."""
    return bool(_in_task.get(False))


def mark_in_task(value: bool) -> None:
    """Set the "inside a task" flag."""
    _in_task.set(value)


def set_entry(span: Any, token: Any) -> None:
    """Store the ENTRY span and its attached context token."""
    _entry_span.set(span)
    _entry_token.set(token)


def set_chain(span: Any, token: Any) -> None:
    """Store the CHAIN span and its attached context token."""
    _chain_span.set(span)
    _chain_token.set(token)


def set_step(span: Any, token: Any) -> None:
    """Store the STEP span and its attached context token."""
    _step_span.set(span)
    _step_token.set(token)


def get_chain_span() -> Any:
    """Return the stored CHAIN span, or ``None``."""
    return _chain_span.get(None)


def get_entry_span() -> Any:
    """Return the stored ENTRY span, or ``None``."""
    return _entry_span.get(None)


def get_step_span() -> Any:
    """Return the stored STEP span, or ``None``."""
    return _step_span.get(None)


def increment_step() -> int:
    """Bump the per-task STEP counter and return the new value."""
    return _bump(_step_counter)


def increment_tool() -> int:
    """Bump the per-task TOOL counter and return the new value."""
    return _bump(_tool_counter)


def increment_parsing_failure() -> int:
    """Bump the per-task parsing-failure counter and return the new value."""
    return _bump(_parsing_failure_counter)


def step_count() -> int:
    """Return the current per-task STEP count."""
    return int(_step_counter.get(0))


def tool_count() -> int:
    """Return the current per-task TOOL count."""
    return int(_tool_counter.get(0))


def parsing_failure_count() -> int:
    """Return the current per-task parsing-failure count."""
    return int(_parsing_failure_counter.get(0))
+ +Span hierarchy (per task):: + + ENTRY webarena_task (env.reset) + └── CHAIN workflow webarena_task (env.reset) + ├── STEP react step (round=N) (next_action enter) + │ ├── AGENT invoke_agent (next_action body) + │ │ ├── TASK build_prompt_context (PromptConstructor.construct) + │ │ └── LLM chat / text_completion + │ │ (OpenAI: produced by the OpenAI SDK probe; + │ │ HuggingFace: produced by this package via + │ │ ``generate_from_huggingface_completion``) + │ └── TOOL execute_tool {action_type} (env.step) + └── ... + +ENTRY/CHAIN/STEP boundaries are *not* present as discrete functions in +WebArena, so we synthesise them by latching on to: + +* ``ScriptBrowserEnv.reset`` — open ENTRY/CHAIN (one task starts) +* ``ScriptBrowserEnv.close`` — close any open spans (batch ends) +* ``PromptAgent.next_action`` — rotate STEP (one ReAct round starts) + +A new STEP is closed lazily: by the next ``next_action`` call (next +round) or by ``env.reset`` / ``env.close`` (next task / batch end). +That makes us robust against early-stop / STOP-action paths in +``run.py:test()`` where ``env.step`` is *not* called for the last +round. 
def _read_config_file(options: dict[str, Any] | None) -> dict[str, Any] | None:
    """Best-effort: load the WebArena task config attached to ``env.reset``.

    Args:
        options: The ``options`` mapping passed to ``env.reset``; its
            ``"config_file"`` entry is read as the path of a JSON file.

    Returns:
        The parsed JSON object when the file exists, is valid JSON and
        its top-level value is a ``dict``; otherwise ``None``. Never raises
        for ordinary errors (``Exception`` subclasses).
    """
    import json as _json  # noqa: PLC0415
    import os as _os  # noqa: PLC0415

    if not options or not isinstance(options, dict):
        return None
    cfg_file = options.get("config_file")
    if not cfg_file:
        return None
    # open() treats an int (and therefore a bool) as a file descriptor and
    # closes it when the ``with`` block exits -- ``True`` would close stdout.
    # Only genuine path-like values are accepted.
    if not isinstance(cfg_file, (str, bytes, _os.PathLike)):
        return None
    try:
        with open(cfg_file, "r", encoding="utf-8") as f:
            data = _json.load(f)
    except Exception:  # noqa: BLE001 - instrumentation must never break the task
        return None
    return data if isinstance(data, dict) else None
def _close_task_spans() -> None:
    """Write the per-task summary onto CHAIN / ENTRY, then end all task spans.

    CHAIN receives the step, tool and parsing-failure counts; ENTRY receives
    only the step count. Attribute failures are ignored so that
    ``state.end_task_spans()`` always runs.
    """
    step_total = state.step_count()
    chain_summary = (
        (WEBARENA_STEP_COUNT, step_total),
        (WEBARENA_TOOL_COUNT, state.tool_count()),
        (WEBARENA_PARSING_FAILURE_COUNT, state.parsing_failure_count()),
    )
    entry_summary = ((WEBARENA_STEP_COUNT, step_total),)

    for target, summary in (
        (state.get_chain_span(), chain_summary),
        (state.get_entry_span(), entry_summary),
    ):
        if target is None:
            continue
        try:
            for key, value in summary:
                target.set_attribute(key, value)
        except Exception:  # noqa: BLE001
            pass

    state.end_task_spans()
def _rotate_step(tracer: Tracer) -> trace_api.Span:
    """Close the current STEP (if any) and start the next ReAct round.

    The new span is started under whatever context is current once the
    previous STEP has been detached, is attached as the current context,
    and is recorded in ``state`` together with its token.

    Returns:
        The newly started STEP span.
    """
    state.end_step()
    current_round = state.increment_step()

    new_step = tracer.start_span("react step", kind=SpanKind.INTERNAL)
    _set_common_attrs(new_step, "STEP")
    for key, value in (
        (GenAI.GEN_AI_OPERATION_NAME, "react"),
        (GEN_AI_REACT_ROUND, current_round),
    ):
        new_step.set_attribute(key, value)

    state.set_step(
        new_step, otel_context.attach(set_span_in_context(new_step))
    )
    return new_step
+ if state.in_task(): + _rotate_step(self._tracer) + + agent_class = instance.__class__.__name__ + try: + instr_path = getattr( + instance.prompt_constructor, "instruction_path", None + ) + instr_stem = getattr(instr_path, "stem", None) if instr_path else None + except Exception: # noqa: BLE001 + instr_stem = None + agent_name = ( + f"{agent_class}:{instr_stem}" if instr_stem else agent_class + ) + span_name = f"invoke_agent {agent_class}" + + meta_data: dict[str, Any] = {} + if len(args) >= 3 and isinstance(args[2], dict): + meta_data = args[2] + elif "meta_data" in kwargs and isinstance(kwargs["meta_data"], dict): + meta_data = kwargs["meta_data"] + + intent: str | None = None + if len(args) >= 2 and isinstance(args[1], str): + intent = args[1] + elif "intent" in kwargs and isinstance(kwargs["intent"], str): + intent = kwargs["intent"] + + with self._tracer.start_as_current_span( + span_name, kind=SpanKind.INTERNAL + ) as span: + _set_common_attrs(span, "AGENT") + span.set_attribute( + GenAI.GEN_AI_OPERATION_NAME, + GenAI.GenAiOperationNameValues.INVOKE_AGENT.value, + ) + span.set_attribute(GenAI.GEN_AI_AGENT_NAME, agent_name) + try: + lm_cfg = getattr(instance, "lm_config", None) + if lm_cfg is not None: + provider = getattr(lm_cfg, "provider", None) + model = getattr(lm_cfg, "model", None) + if provider: + span.set_attribute( + GenAI.GEN_AI_PROVIDER_NAME, str(provider) + ) + if model: + span.set_attribute( + GenAI.GEN_AI_REQUEST_MODEL, str(model) + ) + except Exception: # noqa: BLE001 + pass + + previous = "None" + if meta_data: + history = meta_data.get("action_history") + if isinstance(history, list) and history: + previous = str(history[-1]) + span.set_attribute(WEBARENA_PREVIOUS_ACTION, truncate(previous)) + + if intent and capture_message_content(): + span.set_attribute("input.value", truncate_content(intent)) + + try: + action = wrapped(*args, **kwargs) + except BaseException as exc: + span.record_exception(exc) + span.set_status(Status(StatusCode.ERROR)) 
class PromptConstructWrapper:
    """TASK-span wrapper for ``PromptConstructor.construct``.

    Each call is recorded as a ``run_task build_prompt_context`` span that
    carries the trajectory length, the prompt size and (when message
    capture is enabled) a JSON summary of the input and the built prompt.
    """

    __slots__ = ("_tracer",)

    def __init__(self, tracer: Tracer) -> None:
        self._tracer = tracer

    def __call__(
        self,
        wrapped: Callable[..., Any],
        instance: Any,
        args: tuple[Any, ...],
        kwargs: dict[str, Any],
    ) -> Any:
        # Arguments may arrive positionally or by keyword.
        positional = len(args)
        trajectory = kwargs.get("trajectory") if positional < 1 else args[0]
        intent = kwargs.get("intent") if positional < 2 else args[1]
        if positional < 3:
            meta_data = kwargs.get("meta_data") or {}
        else:
            meta_data = args[2]

        with self._tracer.start_as_current_span(
            "run_task build_prompt_context", kind=SpanKind.INTERNAL
        ) as span:
            _set_common_attrs(span, "TASK")
            span.set_attribute(GenAI.GEN_AI_OPERATION_NAME, "run_task")
            span.set_attribute("webarena.task.name", "build_prompt_context")

            try:
                if trajectory is not None:
                    span.set_attribute(
                        WEBARENA_MEMORY_TRAJECTORY_LENGTH,
                        int(len(trajectory)),
                    )
            except Exception:  # noqa: BLE001
                pass

            # Most recent action from the history, or the literal "None".
            last_action = "None"
            history = (
                meta_data.get("action_history")
                if isinstance(meta_data, dict)
                else None
            )
            if isinstance(history, list) and history:
                last_action = str(history[-1])

            # URL of the page held by the newest trajectory entry, if any.
            page_url = ""
            try:
                if trajectory is not None and len(trajectory) > 0:
                    newest = trajectory[-1]
                    if isinstance(newest, dict):
                        info = newest.get("info") or {}
                        page = (
                            info.get("page")
                            if isinstance(info, dict)
                            else None
                        )
                        if page is not None and getattr(page, "url", None):
                            page_url = str(page.url)
            except Exception:  # noqa: BLE001
                pass

            if capture_message_content():
                span.set_attribute(
                    "input.value",
                    safe_json_dumps(
                        {
                            "intent": str(intent) if intent else "",
                            "url": page_url,
                            "previous_action": last_action,
                        }
                    ),
                )

            try:
                prompt = wrapped(*args, **kwargs)
            except BaseException as exc:
                span.record_exception(exc)
                span.set_status(Status(StatusCode.ERROR))
                raise

            # Size of the prompt: message count for lists, length for strings.
            try:
                size_key = None
                if isinstance(prompt, list):
                    size_key = "webarena.prompt.messages_count"
                elif isinstance(prompt, str):
                    size_key = "webarena.prompt.length"
                if size_key is not None:
                    span.set_attribute(size_key, len(prompt))
            except Exception:  # noqa: BLE001
                pass

            # Length of the newest observation for the constructor's modality.
            try:
                modality = getattr(instance, "obs_modality", None)
                if (
                    modality
                    and trajectory is not None
                    and len(trajectory) > 0
                ):
                    newest = trajectory[-1]
                    if isinstance(newest, dict):
                        obs = newest.get("observation")
                        if isinstance(obs, dict) and modality in obs:
                            span.set_attribute(
                                WEBARENA_MEMORY_OBS_TEXT_LENGTH,
                                int(len(obs[modality])),
                            )
            except Exception:  # noqa: BLE001
                pass

            if capture_message_content():
                span.set_attribute(
                    "output.value", messages_to_input_value(prompt)
                )
            return prompt
class ConstructAgentWrapper:
    """Wrap the agent factory as a one-shot AGENT(create_agent) span.

    The span records the agent type, provider, model, action-set tag and
    observation type read from the namespace passed to the factory, plus a
    short deterministic ``gen_ai.agent.id`` derived from those values.
    """

    __slots__ = ("_tracer",)

    def __init__(self, tracer: Tracer) -> None:
        self._tracer = tracer

    def __call__(
        self,
        wrapped: Callable[..., Any],
        instance: Any,
        args: tuple[Any, ...],
        kwargs: dict[str, Any],
    ) -> Any:
        """Run the wrapped factory inside a ``create_agent`` span.

        Returns whatever the wrapped factory returns; any exception it
        raises is recorded on the span and re-raised unchanged.
        """
        ns_args = args[0] if args else kwargs.get("args")
        agent_type = getattr(ns_args, "agent_type", None) or "unknown"
        provider = getattr(ns_args, "provider", None) or ""
        model = getattr(ns_args, "model", None) or ""
        instr_path = getattr(ns_args, "instruction_path", None) or ""
        action_set = getattr(ns_args, "action_set_tag", None) or ""

        with self._tracer.start_as_current_span(
            f"create_agent {FRAMEWORK_NAME}", kind=SpanKind.INTERNAL
        ) as span:
            _set_common_attrs(span, "AGENT")
            span.set_attribute(
                GenAI.GEN_AI_OPERATION_NAME, "create_agent"
            )
            span.set_attribute(
                GenAI.GEN_AI_AGENT_NAME,
                truncate(f"{agent_type}:{instr_path}"),
            )
            span.set_attribute(
                GenAI.GEN_AI_AGENT_DESCRIPTION,
                truncate(
                    f"provider={provider}, model={model}, action_set={action_set}"
                ),
            )
            try:
                # MD5 is used purely as a stable fingerprint. Flagging it as
                # non-security keeps it available on FIPS-restricted builds,
                # where a plain md5() call raises and the id would be
                # silently dropped by the except below.
                aid = hashlib.md5(
                    f"{provider}:{model}:{instr_path}:{action_set}".encode("utf-8"),
                    usedforsecurity=False,
                ).hexdigest()[:16]
                span.set_attribute(GenAI.GEN_AI_AGENT_ID, aid)
            except Exception:  # noqa: BLE001
                pass
            if provider:
                span.set_attribute(GenAI.GEN_AI_PROVIDER_NAME, str(provider))
            if model:
                span.set_attribute(GenAI.GEN_AI_REQUEST_MODEL, str(model))
            if action_set:
                span.set_attribute(WEBARENA_ACTION_SET_TAG, str(action_set))
            obs_type = getattr(ns_args, "observation_type", None)
            if obs_type:
                span.set_attribute(WEBARENA_OBSERVATION_TYPE, str(obs_type))

            try:
                result = wrapped(*args, **kwargs)
            except BaseException as exc:
                span.record_exception(exc)
                span.set_status(Status(StatusCode.ERROR))
                raise
            return result
Any], + instance: Any, + args: tuple[Any, ...], + kwargs: dict[str, Any], + ) -> Any: + # Signature: + # generate_from_huggingface_completion( + # prompt, model_endpoint, temperature, top_p, max_new_tokens, + # stop_sequences=None, + # ) + def _arg(idx: int, name: str, default: Any = None) -> Any: + if len(args) > idx: + return args[idx] + return kwargs.get(name, default) + + prompt = _arg(0, "prompt", "") + model_endpoint = _arg(1, "model_endpoint", "") + temperature = _arg(2, "temperature") + top_p = _arg(3, "top_p") + max_new_tokens = _arg(4, "max_new_tokens") + stop_sequences = _arg(5, "stop_sequences") + + span_name = f"text_completion {model_endpoint or 'huggingface'}" + with self._tracer.start_as_current_span( + span_name, kind=SpanKind.CLIENT + ) as span: + _set_common_attrs(span, "LLM") + span.set_attribute( + GenAI.GEN_AI_OPERATION_NAME, + GenAI.GenAiOperationNameValues.TEXT_COMPLETION.value, + ) + span.set_attribute(GenAI.GEN_AI_PROVIDER_NAME, "huggingface") + if model_endpoint: + span.set_attribute( + GenAI.GEN_AI_REQUEST_MODEL, str(model_endpoint) + ) + span.set_attribute( + GenAI.GEN_AI_RESPONSE_MODEL, str(model_endpoint) + ) + try: + if temperature is not None: + span.set_attribute( + GenAI.GEN_AI_REQUEST_TEMPERATURE, float(temperature) + ) + if top_p is not None: + span.set_attribute( + GenAI.GEN_AI_REQUEST_TOP_P, float(top_p) + ) + if max_new_tokens is not None: + span.set_attribute( + GenAI.GEN_AI_REQUEST_MAX_TOKENS, int(max_new_tokens) + ) + except (TypeError, ValueError): + pass + if stop_sequences: + try: + span.set_attribute( + GenAI.GEN_AI_REQUEST_STOP_SEQUENCES, + list(stop_sequences), + ) + except Exception: # noqa: BLE001 + pass + if capture_message_content() and isinstance(prompt, str) and prompt: + span.set_attribute( + "input.value", truncate_content(prompt) + ) + + try: + generation = wrapped(*args, **kwargs) + except BaseException as exc: + span.record_exception(exc) + span.set_status(Status(StatusCode.ERROR)) + raise + + if 
capture_message_content() and isinstance(generation, str): + span.set_attribute( + "output.value", truncate_content(generation) + ) + span.set_attribute("gen_ai.output.type", "text") + + return generation + + +__all__ = [ + "ConstructAgentWrapper", + "EnvCloseWrapper", + "EnvResetWrapper", + "EnvStepWrapper", + "HuggingFaceCompletionWrapper", + "NextActionWrapper", + "PromptConstructWrapper", +] diff --git a/instrumentation-loongsuite/loongsuite-instrumentation-webarena/src/opentelemetry/instrumentation/webarena/package.py b/instrumentation-loongsuite/loongsuite-instrumentation-webarena/src/opentelemetry/instrumentation/webarena/package.py new file mode 100644 index 000000000..63ff43cff --- /dev/null +++ b/instrumentation-loongsuite/loongsuite-instrumentation-webarena/src/opentelemetry/instrumentation/webarena/package.py @@ -0,0 +1,3 @@ +_instruments = ("webarena >= 0.0.1",) + +_supports_metrics = False diff --git a/instrumentation-loongsuite/loongsuite-instrumentation-webarena/src/opentelemetry/instrumentation/webarena/version.py b/instrumentation-loongsuite/loongsuite-instrumentation-webarena/src/opentelemetry/instrumentation/webarena/version.py new file mode 100644 index 000000000..3dc1f76bc --- /dev/null +++ b/instrumentation-loongsuite/loongsuite-instrumentation-webarena/src/opentelemetry/instrumentation/webarena/version.py @@ -0,0 +1 @@ +__version__ = "0.1.0" diff --git a/instrumentation-loongsuite/loongsuite-instrumentation-widesearch/README.md b/instrumentation-loongsuite/loongsuite-instrumentation-widesearch/README.md new file mode 100644 index 000000000..4b4aac443 --- /dev/null +++ b/instrumentation-loongsuite/loongsuite-instrumentation-widesearch/README.md @@ -0,0 +1,17 @@ +# LoongSuite WideSearch Instrumentation + +OpenTelemetry instrumentation for the [WideSearch](https://github.com/ByteDance-Seed/WideSearch) multi-agent search framework. 
+ +## Installation + +```bash +pip install loongsuite-instrumentation-widesearch +``` + +## Usage + +```python +from opentelemetry.instrumentation.widesearch import WideSearchInstrumentor + +WideSearchInstrumentor().instrument() +``` diff --git a/instrumentation-loongsuite/loongsuite-instrumentation-widesearch/pyproject.toml b/instrumentation-loongsuite/loongsuite-instrumentation-widesearch/pyproject.toml new file mode 100644 index 000000000..9a819d25a --- /dev/null +++ b/instrumentation-loongsuite/loongsuite-instrumentation-widesearch/pyproject.toml @@ -0,0 +1,57 @@ +[build-system] +requires = ["hatchling"] +build-backend = "hatchling.build" + +[project] +name = "loongsuite-instrumentation-widesearch" +dynamic = ["version"] +description = "LoongSuite WideSearch Instrumentation" +readme = "README.md" +license = "Apache-2.0" +requires-python = ">=3.11" +authors = [ + { name = "LoongSuite Python Agent Authors", email = "caishipeng.csp@alibaba-inc.com" }, + { name = "OpenTelemetry Authors", email = "cncf-opentelemetry-contributors@lists.cncf.io" }, +] +classifiers = [ + "Development Status :: 4 - Beta", + "Intended Audience :: Developers", + "License :: OSI Approved :: Apache Software License", + "Programming Language :: Python", + "Programming Language :: Python :: 3", + "Programming Language :: Python :: 3.11", + "Programming Language :: Python :: 3.12", + "Programming Language :: Python :: 3.13", +] +dependencies = [ + "opentelemetry-api ~= 1.37", + "opentelemetry-instrumentation >= 0.58b0", + "opentelemetry-semantic-conventions >= 0.58b0", + "opentelemetry-util-genai", + "wrapt >= 1.17.3, < 2.0.0", +] + +[project.optional-dependencies] +instruments = [ + "widesearch >= 0.1.0", +] +test = [ + "pytest ~= 8.0", + "pytest-cov ~= 4.1.0", +] + +[project.entry-points.opentelemetry_instrumentor] +widesearch = "opentelemetry.instrumentation.widesearch:WideSearchInstrumentor" + +[project.urls] +Homepage = 
"https://github.com/alibaba/loongsuite-python-agent/tree/main/instrumentation-loongsuite/loongsuite-instrumentation-widesearch" +Repository = "https://github.com/alibaba/loongsuite-python-agent" + +[tool.hatch.version] +path = "src/opentelemetry/instrumentation/widesearch/version.py" + +[tool.hatch.build.targets.sdist] +include = ["/src", "/tests"] + +[tool.hatch.build.targets.wheel] +packages = ["src/opentelemetry"] diff --git a/instrumentation-loongsuite/loongsuite-instrumentation-widesearch/src/opentelemetry/instrumentation/widesearch/__init__.py b/instrumentation-loongsuite/loongsuite-instrumentation-widesearch/src/opentelemetry/instrumentation/widesearch/__init__.py new file mode 100644 index 000000000..9c441d18f --- /dev/null +++ b/instrumentation-loongsuite/loongsuite-instrumentation-widesearch/src/opentelemetry/instrumentation/widesearch/__init__.py @@ -0,0 +1,164 @@ +""" +WideSearch instrumentation supporting `widesearch >= 0.1.0`. + +Usage +----- +.. code:: python + + from opentelemetry.instrumentation.widesearch import WideSearchInstrumentor + + WideSearchInstrumentor().instrument() + +API +--- +""" + +from __future__ import annotations + +import logging +from typing import Any, Collection + +from wrapt import wrap_function_wrapper + +from opentelemetry.instrumentation.instrumentor import BaseInstrumentor +from opentelemetry.instrumentation.utils import unwrap +from opentelemetry.instrumentation.widesearch.package import _instruments +from opentelemetry.instrumentation.widesearch.patch import ( + wrap_create_sub_agents_factory, + wrap_invoke_tool_call, + wrap_run_single_query, + wrap_runner_run, + wrap_runner_step, +) +from opentelemetry.instrumentation.widesearch.version import __version__ +from opentelemetry.util.genai.extended_handler import ExtendedTelemetryHandler + +logger = logging.getLogger(__name__) + +_RUN_MODULE = "src.agent.run" +_MULTI_AGENT_MODULE = "src.agent.multi_agent_tools" + +__all__ = ["WideSearchInstrumentor", "__version__"] + + 
class WideSearchInstrumentor(BaseInstrumentor):
    """OpenTelemetry instrumentor for WideSearch framework.

    Instruments the following components:
    - run_single_query(): ENTRY span
    - Runner.run(): AGENT span (async generator)
    - Runner._step(): STEP span
    - Runner._invoke_tool_call(): TOOL spans
    - create_sub_agents_wrap(): TASK span
    """

    def __init__(self):
        super().__init__()
        self._handler = None

    def instrumentation_dependencies(self) -> Collection[str]:
        """Return the package requirements this instrumentor targets."""
        return _instruments

    @staticmethod
    def _bind(patch: Any, handler: Any) -> Any:
        """Return a ``wrapt`` wrapper that calls *patch* with *handler* bound.

        The handler is captured by value, so a wrapper that is still
        referenced after ``_uninstrument`` (e.g. through an earlier
        ``from module import name``) keeps a usable handler instead of
        picking up ``self._handler = None``.
        """

        def _wrapper(wrapped, instance, args, kwargs):
            return patch(wrapped, instance, args, kwargs, handler=handler)

        return _wrapper

    def _instrument(self, **kwargs: Any) -> None:
        """Create the telemetry handler and install every hook.

        Each hook is installed independently: a failure is logged as a
        warning and does not prevent the remaining hooks from being set.
        """
        handler = ExtendedTelemetryHandler(
            tracer_provider=kwargs.get("tracer_provider"),
            meter_provider=kwargs.get("meter_provider"),
            logger_provider=kwargs.get("logger_provider"),
        )
        self._handler = handler

        # (module, attribute path, patch function) -- H1..H5 in order:
        # ENTRY, AGENT, STEP, TOOL, TASK (factory).
        targets = (
            (_RUN_MODULE, "run_single_query", wrap_run_single_query),
            (_RUN_MODULE, "Runner.run", wrap_runner_run),
            (_RUN_MODULE, "Runner._step", wrap_runner_step),
            (_RUN_MODULE, "Runner._invoke_tool_call", wrap_invoke_tool_call),
            (
                _MULTI_AGENT_MODULE,
                "create_sub_agents_wrap",
                wrap_create_sub_agents_factory,
            ),
        )
        for module, name, patch in targets:
            try:
                wrap_function_wrapper(
                    module=module,
                    name=name,
                    wrapper=self._bind(patch, handler),
                )
                logger.debug("Instrumented %s", name)
            except Exception as e:  # noqa: BLE001 - never break the host app
                logger.warning("Failed to instrument %s: %s", name, e)

    def _uninstrument(self, **kwargs: Any) -> None:
        """Remove every hook installed by ``_instrument`` and drop the handler."""
        try:
            import src.agent.run  # noqa: PLC0415

            unwrap(src.agent.run, "run_single_query")
            unwrap(src.agent.run.Runner, "run")
            unwrap(src.agent.run.Runner, "_step")
            unwrap(src.agent.run.Runner, "_invoke_tool_call")
            logger.debug("Uninstrumented src.agent.run")
        except Exception as e:  # noqa: BLE001
            logger.warning("Failed to uninstrument src.agent.run: %s", e)

        try:
            import src.agent.multi_agent_tools  # noqa: PLC0415

            unwrap(src.agent.multi_agent_tools, "create_sub_agents_wrap")
            logger.debug("Uninstrumented src.agent.multi_agent_tools")
        except Exception as e:  # noqa: BLE001
            logger.warning(
                "Failed to uninstrument src.agent.multi_agent_tools: %s", e
            )

        self._handler = None
b/instrumentation-loongsuite/loongsuite-instrumentation-widesearch/src/opentelemetry/instrumentation/widesearch/patch.py
new file mode 100644
index 000000000..0813a7c8e
--- /dev/null
+++ b/instrumentation-loongsuite/loongsuite-instrumentation-widesearch/src/opentelemetry/instrumentation/widesearch/patch.py
@@ -0,0 +1,381 @@
+"""Patch functions for WideSearch instrumentation.
+
+Wraps key WideSearch methods to generate OpenTelemetry spans:
+- run_single_query -> ENTRY span
+- Runner.run -> AGENT span (async generator)
+- Runner._step -> STEP span
+- Runner._invoke_tool_call -> TOOL spans (one per tool_call)
+- create_sub_agents_wrap -> TASK span (on returned closure)
+"""
+
+from __future__ import annotations
+
+import asyncio
+import functools
+import json
+import logging
+from contextvars import ContextVar
+
+from opentelemetry.trace import SpanKind, StatusCode
+from opentelemetry.trace.status import Status
+from opentelemetry.util.genai.extended_handler import ExtendedTelemetryHandler
+from opentelemetry.util.genai.extended_types import ReactStepInvocation
+from opentelemetry.util.genai.types import Error
+
+from .utils import (
+    _create_agent_invocation,
+    _create_entry_invocation,
+    _create_tool_invocation,
+    _extract_output_messages,
+    _step_to_output_messages,
+)
+
+logger = logging.getLogger(__name__)
+
+_step_counter: ContextVar[int] = ContextVar("ws_step_counter", default=0)
+_in_run_single_query: ContextVar[bool] = ContextVar("ws_in_rsq", default=False)
+
+
+async def wrap_run_single_query(
+    wrapped, instance, args, kwargs, *, handler: ExtendedTelemetryHandler
+):
+    """H1: ENTRY span for run_single_query.
+
+    Only the outermost call in a context gets a span; re-entrant calls are
+    passed straight through.  A failure while starting the span or while
+    extracting the output is logged at debug level and not raised.
+    """
+    if _in_run_single_query.get():
+        return await wrapped(*args, **kwargs)
+    token = _in_run_single_query.set(True)
+
+    query = args[0] if args else kwargs.get("query", "")
+    system_prompt = kwargs.get("system_prompt") or ""
+    tools_desc_kw = kwargs.get("tools_desc")
+    try:
+        invocation = _create_entry_invocation(
+            query,
+            system_prompt=system_prompt or None,
+            tools_desc=(
+                tools_desc_kw if isinstance(tools_desc_kw, list) else None
+            ),
+        )
+        handler.start_entry(invocation)
+    except Exception as e:
+        # Release the guard before falling back, otherwise it would stay
+        # set for the rest of this context and suppress later ENTRY spans.
+        logger.debug("Failed to start entry span: %s", e)
+        _in_run_single_query.reset(token)
+        return await wrapped(*args, **kwargs)
+
+    try:
+        result = await wrapped(*args, **kwargs)
+    except Exception as e:
+        handler.fail_entry(invocation, Error(message=str(e), type=type(e)))
+        raise
+    finally:
+        _in_run_single_query.reset(token)
+
+    # The wrapped call succeeded: a problem while recording its output must
+    # not be reported to the caller as a failure of the query itself.
+    try:
+        invocation.output_messages = _extract_output_messages(result)
+    except Exception as e:
+        logger.debug("Failed to extract entry output messages: %s", e)
+    handler.stop_entry(invocation)
+    return result
+
+
+async def wrap_runner_run(
+    wrapped, instance, args, kwargs, *, handler: ExtendedTelemetryHandler
+):
+    """H2: AGENT span for Runner.run (async generator).
+
+    Yields every step produced by the wrapped generator unchanged.  The span
+    is stopped when the generator is exhausted and failed when it raises or
+    is closed early by the consumer (``GeneratorExit``).
+    """
+    starting_agent = args[0] if args else kwargs.get("starting_agent")
+    user_input = args[1] if len(args) > 1 else kwargs.get("user_input", "")
+    memory = args[2] if len(args) > 2 else kwargs.get("memory")
+    system_prompt = getattr(memory, "system_instructions", None)
+
+    try:
+        invocation = _create_agent_invocation(
+            starting_agent, user_input, system_prompt=system_prompt
+        )
+        handler.start_invoke_agent(invocation)
+    except Exception as e:
+        logger.debug("Failed to start agent span: %s", e)
+        async for step in wrapped(*args, **kwargs):
+            yield step
+        return
+
+    # Step numbering restarts at 1 for every agent run.
+    counter_token = _step_counter.set(0)
+    try:
+        last_step = None
+        async for step in wrapped(*args, **kwargs):
+            last_step = step
+            yield step
+    except GeneratorExit:
+        handler.fail_invoke_agent(
+            invocation, Error(message="GeneratorExit", type=GeneratorExit)
+        )
+        raise
+    except Exception as e:
+        handler.fail_invoke_agent(
+            invocation, Error(message=str(e), type=type(e))
+        )
+        raise
+    else:
+        if last_step is not None:
+            try:
+                invocation.output_messages = _step_to_output_messages(
+                    last_step
+                )
+            except Exception as e:
+                logger.debug("Failed to extract agent output messages: %s", e)
+        handler.stop_invoke_agent(invocation)
+    finally:
+        try:
+            _step_counter.reset(counter_token)
+        except ValueError:
+            # ContextVar.reset() raises ValueError when the token was created
+            # in a different Context, which happens when the generator is
+            # closed from another task (e.g. by the event loop's finalizer).
+            logger.debug("Step counter token belongs to another context")
+
+
+def _classify_step_result(result):
+    """Map a ``Runner._step`` result to ``(finish_reason, error)``.
+
+    ``error`` is an ``Error`` for an ``ActionStepError`` result, else ``None``.
+    """
+    from src.agent.memory import ActionStepError, StepStatus
+
+    if isinstance(result, ActionStepError):
+        return "error", Error(message=result.message, type=type(result))
+    if result.step_status == StepStatus.FINISHED:
+        return "finished", None
+    if result.error_marker is not None:
+        return "error", None
+    return "continue", None
+
+
+async def wrap_runner_step(
+    wrapped, instance, args, kwargs, *, handler: ExtendedTelemetryHandler
+):
+    """H3: STEP span for Runner._step.
+
+    The round number is the per-context step counter after incrementing it.
+    A failure while classifying the result is logged and does not affect
+    the value returned to the caller.
+    """
+    step_num = _step_counter.get() + 1
+    _step_counter.set(step_num)
+
+    try:
+        invocation = ReactStepInvocation(round=step_num)
+        invocation.attributes["gen_ai.framework"] = "widesearch"
+        handler.start_react_step(invocation)
+    except Exception as e:
+        logger.debug("Failed to start react step: %s", e)
+        return await wrapped(*args, **kwargs)
+
+    try:
+        result = await wrapped(*args, **kwargs)
+    except Exception as e:
+        invocation.finish_reason = "error"
+        handler.fail_react_step(
+            invocation, Error(message=str(e), type=type(e))
+        )
+        raise
+
+    # Classification touches WideSearch internals; if it fails the step span
+    # is still closed and the (successful) result is still returned.
+    error = None
+    try:
+        invocation.finish_reason, error = _classify_step_result(result)
+    except Exception as e:
+        logger.debug("Failed to classify step result: %s", e)
+    if error is not None:
+        handler.fail_react_step(invocation, error)
+    else:
+        handler.stop_react_step(invocation)
+    return result
+
+
+async def wrap_invoke_tool_call(
+    wrapped, instance, args, kwargs, *, handler: ExtendedTelemetryHandler
+):
+    """H4: TOOL span for each tool_call inside Runner._invoke_tool_call.
+
+    When the first model output carries tool calls, this wrapper executes
+    them itself (concurrently, one TOOL span per call) instead of calling the
+    wrapped method, and returns the resulting ``ToolCallResult`` list.  In
+    every other case it delegates to the wrapped method unchanged.
+    """
+    agent = args[0] if args else kwargs.get("agent")
+    model_response = args[1] if len(args) > 1 else kwargs.get("model_response")
+
+    outputs = getattr(model_response, "outputs", None)
+    if not outputs:
+        return await wrapped(*args, **kwargs)
+
+    tool_calls = getattr(outputs[0], "tool_calls", None)
+    if not tool_calls:
+        return await wrapped(*args, **kwargs)
+
+    try:
+        from src.agent.schema import ErrorMarker, ToolCallResult
+    except ImportError as e:
+        logger.debug("Cannot import WideSearch schema: %s", e)
+        return await wrapped(*args, **kwargs)
+
+    def _start_span(tool_call):
+        """Return a started tool invocation, or None if telemetry failed."""
+        try:
+            invocation = _create_tool_invocation(tool_call, agent)
+            handler.start_execute_tool(invocation)
+        except Exception as e:
+            logger.debug("Failed to create tool invocation: %s", e)
+            return None
+        return invocation
+
+    async def _call_tool(tool_call):
+        """Run one tool call and wrap its outcome in a ToolCallResult."""
+        invocation = _start_span(tool_call)
+
+        def _finish(content, error_message=None, error_type=RuntimeError):
+            """Record ``content`` and end the span as stopped or failed."""
+            if invocation is None:
+                return
+            invocation.tool_call_result = content
+            if error_message is None:
+                handler.stop_execute_tool(invocation)
+            else:
+                handler.fail_execute_tool(
+                    invocation,
+                    Error(message=error_message, type=error_type),
+                )
+
+        tool_name = tool_call.tool_name
+        tool = agent.get_tool_by_name(tool_name)
+        if tool is None:
+            message = f"Tool {tool_name} not found"
+            _finish(message, message, ValueError)
+            return ToolCallResult(
+                tool_call_id=tool_call.tool_call_id,
+                error_marker=ErrorMarker(message=message),
+            )
+
+        arguments = tool_call.arguments
+        if isinstance(arguments, str):
+            try:
+                arguments = json.loads(arguments)
+            except json.JSONDecodeError:
+                arguments = {}
+
+        try:
+            response = await tool(**arguments)
+        except Exception as e:
+            _finish(str(e), str(e), type(e))
+            return ToolCallResult(
+                tool_call_id=tool_call.tool_call_id,
+                error_marker=ErrorMarker(message=str(e)),
+            )
+
+        # A tool-reported error takes precedence over a system error when
+        # choosing the message recorded on the span.
+        error_message = response.error or response.system_error or None
+        _finish(response.data, error_message)
+        return ToolCallResult(
+            tool_call_id=tool_call.tool_call_id,
+            content=response.data,
+            error_marker=(
+                ErrorMarker(message=response.error) if response.error else None
+            ),
+            system_error_marker=(
+                ErrorMarker(message=response.system_error)
+                if response.system_error
+                else None
+            ),
+            extra=response.extra if response.extra else {},
+        )
+
+    results = await asyncio.gather(*(_call_tool(tc) for tc in tool_calls))
+    return [r for r in results if r is not None]
+
+
+def wrap_create_sub_agents_factory(
+    wrapped, instance, args, kwargs, *, handler: ExtendedTelemetryHandler
+):
+    """H5: TASK span wrapping the closure returned by create_sub_agents_wrap.
+
+    Calls the wrapped factory and returns a replacement coroutine function
+    that runs the factory's closure inside a ``run_task create_sub_agents``
+    span.  ``functools.wraps`` copies the closure's name and docstring onto
+    the replacement.
+    """
+    original_closure = wrapped(*args, **kwargs)
+
+    @functools.wraps(original_closure)
+    async def closure_with_task_span(sub_agents):
+        tracer = handler._tracer
+        span_name = "run_task create_sub_agents"
+
+        # Exceptions are recorded explicitly below; disabling the automatic
+        # recording avoids a duplicate exception event on the span.
+        with tracer.start_as_current_span(
+            name=span_name,
+            kind=SpanKind.INTERNAL,
+            record_exception=False,
+            set_status_on_exception=False,
+        ) as span:
+            span.set_attribute("gen_ai.span.kind", "TASK")
+            span.set_attribute("gen_ai.operation.name", "run_task")
+            span.set_attribute("gen_ai.framework", "widesearch")
+
+            try:
+                safe_input = json.dumps(
+                    [
+                        {
+                            "index": sa.get("index"),
+                            # ``or ""`` also covers an explicit None prompt.
+                            "prompt": (sa.get("prompt") or "")[:200],
+                        }
+                        for sa in sub_agents
+                    ],
+                    ensure_ascii=False,
+                )
+                span.set_attribute("input.value", safe_input)
+            except Exception as e:
+                logger.debug("Failed to record sub-agent input: %s", e)
+
+            try:
+                result = await original_closure(sub_agents)
+            except Exception as e:
+                span.record_exception(e)
+                span.set_status(Status(StatusCode.ERROR, str(e)))
+                raise
+
+            # Recording the output is best effort: the closure already
+            # succeeded, so a serialisation problem must not fail the call.
+            try:
+                if result and hasattr(result, "data") and result.data:
+                    output_str = (
+                        result.data
+                        if isinstance(result.data, str)
+                        else json.dumps(result.data, ensure_ascii=False)
+                    )
+                    if len(output_str) > 4096:
+                        output_str = output_str[:4096] + "...(truncated)"
+                    span.set_attribute("output.value", output_str)
+            except Exception as e:
+                logger.debug("Failed to record sub-agent output: %s", e)
+
+            span.set_status(Status(StatusCode.OK))
+            return result
+
+    return closure_with_task_span
diff --git a/instrumentation-loongsuite/loongsuite-instrumentation-widesearch/src/opentelemetry/instrumentation/widesearch/utils.py b/instrumentation-loongsuite/loongsuite-instrumentation-widesearch/src/opentelemetry/instrumentation/widesearch/utils.py
new file mode 100644
index 000000000..8f85f6c6f
--- /dev/null
+++ b/instrumentation-loongsuite/loongsuite-instrumentation-widesearch/src/opentelemetry/instrumentation/widesearch/utils.py
@@ -0,0 +1,247 @@
+"""Utility functions for WideSearch instrumentation."""
+
+from __future__ import annotations
+
+import json
+import logging
+from typing import Any, List, Optional
+
+from opentelemetry.util.genai.extended_types import (
+    EntryInvocation,
+    ExecuteToolInvocation,
+    InvokeAgentInvocation,
+    ReactStepInvocation,
+)
+from opentelemetry.util.genai.types import (
+    FunctionToolDefinition,
+    InputMessage,
+    OutputMessage,
+    Text,
+    ToolCall as GenAIToolCall,
+    ToolCallResponse,
+)
+
+logger = logging.getLogger(__name__)
+
+
+_FRAMEWORK = "widesearch"
+
+
+def _error_marker_message(marker: Any) -> Optional[str]:
+    """Return the message of an error marker, or None if it has none.
+
+    Accepts both mapping-style markers (``marker["message"]``) and objects
+    exposing a ``message`` attribute.
+    """
+    if marker is None:
+        return None
+    if isinstance(marker, dict):
+        return marker.get("message")
+    return getattr(marker, "message", None)
+
+
+def _create_entry_invocation(
+    query: str,
+    *,
+    system_prompt: Optional[str] = None,
+    tools_desc: Optional[List[dict[str, Any]]] = None,
+) -> EntryInvocation:
+    """Build the ENTRY invocation for a user query.
+
+    The query becomes the single user input message.  The system prompt and
+    tool definitions are attached only when provided and non-empty.
+    """
+    invocation = EntryInvocation()
+    invocation.input_messages = [
+        InputMessage(role="user", parts=[Text(content=query)])
+    ]
+    invocation.attributes["gen_ai.framework"] = _FRAMEWORK
+    if system_prompt:
+        invocation.system_instruction = [Text(content=system_prompt)]
+
+    defs = _convert_tools_desc(tools_desc) if tools_desc else None
+    if defs is not None:
+        invocation.tool_definitions = defs
+
+    return invocation
+
+
+def _create_agent_invocation(
+    agent: Any, user_input: str, system_prompt: Optional[str] = None
+) -> InvokeAgentInvocation:
+    """Build the AGENT invocation for ``Runner.run``.
+
+    The request model is looked up in WideSearch's ``model_config`` by the
+    agent's ``model_config_name``; when that lookup fails or yields nothing
+    the config name itself is used.
+    """
+    agent_name = getattr(agent, "name", None) or "widesearch-agent"
+
+    request_model = None
+    model_config_name = getattr(agent, "model_config_name", None)
+    if model_config_name:
+        try:
+            from src.utils.config import model_config
+
+            request_model = model_config.get(model_config_name, {}).get(
+                "model_name"
+            )
+        except Exception as e:
+            logger.debug("Failed to resolve model name: %s", e)
+        request_model = request_model or model_config_name
+
+    instructions = system_prompt or getattr(agent, "instructions", None) or ""
+
+    invocation = InvokeAgentInvocation(
+        provider="widesearch",
+        agent_name=agent_name,
+        agent_description=instructions[:200] if instructions else "",
+        request_model=request_model,
+        input_messages=[
+            InputMessage(role="user", parts=[Text(content=user_input)])
+        ],
+    )
+    invocation.attributes["gen_ai.framework"] = _FRAMEWORK
+
+    if instructions:
+        invocation.system_instruction = [Text(content=instructions)]
+
+    tools_desc = getattr(agent, "tools_desc", None)
+    if tools_desc:
+        defs = _convert_tools_desc(tools_desc)
+        if defs is not None:
+            invocation.tool_definitions = defs
+
+    return invocation
+
+
+def _create_tool_invocation(
+    tool_call: Any, agent: Any
+) -> ExecuteToolInvocation:
+    """Build the TOOL invocation for one tool call.
+
+    String arguments are parsed as JSON; unparseable strings are kept under
+    a ``raw`` key.  The tool description is taken from the matching entry
+    of the agent's ``tools_desc``, if any.
+    """
+    args = tool_call.arguments
+    if isinstance(args, str):
+        try:
+            args = json.loads(args)
+        except (json.JSONDecodeError, ValueError):
+            args = {"raw": args}
+
+    description = None
+    # ``or ()`` tolerates agents whose ``tools_desc`` is missing or None.
+    for td in getattr(agent, "tools_desc", None) or ():
+        func = td.get("function") or {}
+        if func.get("name") == tool_call.tool_name:
+            description = func.get("description")
+            break
+
+    invocation = ExecuteToolInvocation(
+        tool_name=tool_call.tool_name,
+        tool_call_id=getattr(tool_call, "tool_call_id", None),
+        tool_call_arguments=args,
+        tool_description=description,
+        tool_type="function",
+    )
+    invocation.attributes["gen_ai.framework"] = _FRAMEWORK
+    return invocation
+
+
+def _extract_output_messages(messages: Any) -> List[OutputMessage]:
+    """Extract output messages from run_single_query return value.
+
+    Only the last message is used.  Its ``content`` may be a plain string
+    or a dict with its own ``content`` key; anything else yields empty text.
+    """
+    if not messages:
+        return []
+    last_msg = messages[-1]
+    content = ""
+    if isinstance(last_msg, dict):
+        c = last_msg.get("content", {})
+        if isinstance(c, dict):
+            content = c.get("content", "")
+        elif isinstance(c, str):
+            content = c
+    return [
+        OutputMessage(
+            role="assistant",
+            parts=[Text(content=content)],
+            finish_reason="stop",
+        )
+    ]
+
+
+def _step_to_output_messages(step: Any) -> List[OutputMessage]:
+    """Extract output messages from an ActionStep.
+
+    The single returned message carries the step's text, its tool calls and
+    its tool results, in that order.
+    """
+    content = getattr(step, "content", None) or ""
+    parts = []
+    if content:
+        parts.append(Text(content=content))
+
+    for tool_call in getattr(step, "tool_calls", []) or []:
+        args = getattr(tool_call, "arguments", None)
+        if isinstance(args, str):
+            try:
+                args = json.loads(args)
+            except (json.JSONDecodeError, ValueError):
+                pass
+        parts.append(
+            GenAIToolCall(
+                id=getattr(tool_call, "tool_call_id", None),
+                name=getattr(tool_call, "tool_name", ""),
+                arguments=args,
+            )
+        )
+
+    for tool_result in getattr(step, "tool_call_results", []) or []:
+        result = getattr(tool_result, "content", None)
+        if result is None:
+            # Fall back to the error text when the tool produced no content.
+            result = _error_marker_message(
+                getattr(tool_result, "error_marker", None)
+            )
+        parts.append(
+            ToolCallResponse(
+                id=getattr(tool_result, "tool_call_id", None),
+                response=result,
+            )
+        )
+
+    finish_reason = "tool_calls" if getattr(step, "tool_calls", None) else "stop"
+    return [
+        OutputMessage(
+            role="assistant",
+            parts=parts or [Text(content="")],
+            finish_reason=finish_reason,
+        )
+    ]
+
+
+def _convert_tools_desc(
+    tools_desc: List[dict],
+) -> Optional[List[FunctionToolDefinition]]:
+    """Convert WideSearch tools_desc to FunctionToolDefinition list.
+
+    Entries whose ``type`` is not ``"function"`` are skipped.  Returns None
+    when no function tool remains.
+    """
+    result = []
+    for td in tools_desc:
+        if td.get("type") == "function":
+            func = td.get("function", {})
+            result.append(
+                FunctionToolDefinition(
+                    name=func.get("name", ""),
+                    description=func.get("description"),
+                    parameters=func.get("parameters"),
+                )
+            )
+    return result if result else None
diff --git
a/instrumentation-loongsuite/loongsuite-instrumentation-widesearch/src/opentelemetry/instrumentation/widesearch/version.py b/instrumentation-loongsuite/loongsuite-instrumentation-widesearch/src/opentelemetry/instrumentation/widesearch/version.py
new file mode 100644
index 000000000..26056b5d8
--- /dev/null
+++ b/instrumentation-loongsuite/loongsuite-instrumentation-widesearch/src/opentelemetry/instrumentation/widesearch/version.py
@@ -0,0 +1 @@
+__version__ = "0.5.0.dev"
diff --git a/instrumentation-loongsuite/loongsuite-instrumentation-widesearch/tests/__init__.py b/instrumentation-loongsuite/loongsuite-instrumentation-widesearch/tests/__init__.py
new file mode 100644
index 000000000..e69de29bb
diff --git a/instrumentation-loongsuite/loongsuite-instrumentation-widesearch/tests/conftest.py b/instrumentation-loongsuite/loongsuite-instrumentation-widesearch/tests/conftest.py
new file mode 100644
index 000000000..461bf8e1f
--- /dev/null
+++ b/instrumentation-loongsuite/loongsuite-instrumentation-widesearch/tests/conftest.py
@@ -0,0 +1,426 @@
+"""Test configuration for WideSearch instrumentation tests.
+
+Injects lightweight stub modules for `src.agent.*` into sys.modules
+so that wrap_function_wrapper can find them without installing WideSearch.
+"""
+
+from __future__ import annotations
+
+import os
+import sys
+import types
+from dataclasses import dataclass, field
+from pathlib import Path
+from enum import Enum
+from typing import Any, Callable, List, Literal
+
+# Ensure workspace opentelemetry-util-genai is imported (not stale site-packages).
+_REPO_ROOT = Path(__file__).resolve().parents[3]
+_UTIL_GENAI_SRC = _REPO_ROOT / "util" / "opentelemetry-util-genai" / "src"
+if _UTIL_GENAI_SRC.is_dir() and str(_UTIL_GENAI_SRC) not in sys.path:
+    sys.path.insert(0, str(_UTIL_GENAI_SRC))
+    # Plugins or other loaders may pull opentelemetry.util.genai.* from
+    # site-packages before this conftest runs — drop caches so imports resolve here.
+    for _m in list(sys.modules):
+        if _m == "opentelemetry.util.genai" or _m.startswith(
+            "opentelemetry.util.genai."
+        ):
+            del sys.modules[_m]
+
+_WIDESEARCH_PLUGIN_SRC = Path(__file__).resolve().parents[1] / "src"
+if _WIDESEARCH_PLUGIN_SRC.is_dir() and str(_WIDESEARCH_PLUGIN_SRC) not in sys.path:
+    sys.path.insert(0, str(_WIDESEARCH_PLUGIN_SRC))
+
+import pytest
+
+# ---------------------------------------------------------------------------
+# Stub modules for WideSearch (src.agent.*)
+# ---------------------------------------------------------------------------
+
+
+class StepStatus(str, Enum):
+    USER = "USER"
+    FINISHED = "FINISHED"
+    CONTINUE = "CONTINUE"
+    ERROR = "ERROR"
+
+
+@dataclass
+class ActionStepError:
+    message: str
+    source: Literal["llm"] = "llm"
+
+
+@dataclass
+class ToolCall:
+    tool_name: str
+    arguments: Any
+    tool_call_id: str
+
+
+@dataclass
+class ErrorMarker:
+    """Stub marker supporting marker["message"] and marker.get("message")."""
+
+    message: str
+
+    def __getitem__(self, key):
+        if key == "message":
+            return self.message
+        raise KeyError(key)
+
+    def get(self, key, default=None):
+        # Mirrors dict.get so code calling marker.get("message") works on
+        # the stub exactly like marker["message"] does.
+        return self.message if key == "message" else default
+
+
+@dataclass
+class ToolCallResult:
+    tool_call_id: str
+    content: str | None = None
+    error_marker: Any = None
+    system_error_marker: Any = None
+    extra: dict = field(default_factory=dict)
+
+
+@dataclass
+class LLMOutputItem:
+    role: str = "assistant"
+    content: str | None = None
+    reasoning_content: str | None = None
+    signature: str | None = None
+    tool_calls: list = field(default_factory=list)
+
+
+@dataclass
+class ModelResponse:
+    outputs: list = field(default_factory=list)
+    session_id: str | None = None
+    error_marker: Any = None
+
+
+@dataclass
+class ActionStep:
+    step_status: StepStatus = StepStatus.CONTINUE
+    content: str | None = None
+    reasoning_content: str | None = None
+    signature: str | None = None
+    tool_calls: list = field(default_factory=list)
+    tool_call_results: list = field(default_factory=list)
+    error_marker: Any = None
+
+
+@dataclass
+class UserInputStep:
+    user_input: str
+    step_status: StepStatus = StepStatus.USER
+
+
+@dataclass
+class MemoryTurn:
+    steps: list = field(default_factory=list)
+
+    @property
+    def step_number(self):
+        return sum(1 for s in self.steps if isinstance(s, ActionStep))
+
+    def is_finished(self) -> bool:
+        if not self.steps:
+            return False
+        return self.steps[-1].step_status == StepStatus.FINISHED
+
+
+@dataclass
+class MemoryAgent:
+    system_instructions: str | None = None
+    turns: list = field(default_factory=list)
+
+    def insert_user_input(self, user_input: str):
+        turn = MemoryTurn()
+        turn.steps.append(UserInputStep(user_input=user_input))
+        self.turns.append(turn)
+        return turn
+
+    def insert_action_step(self, action_step):
+        last_turn = self.turns[-1]
+        last_turn.steps.append(action_step)
+        return last_turn
+
+    def to_message(self, **kwargs):
+        return []
+
+
+@dataclass
+class InternalResponse:
+    data: Any = None
+    error: str | None = None
+    system_error: str | None = None
+    extra: dict | None = None
+
+
+@dataclass
+class Agent:
+    name: str = "test-agent"
+    instructions: str | None = "You are a helpful agent."
+    tools: dict = field(default_factory=dict)
+    tools_desc: list = field(default_factory=list)
+    model_config_name: str = "gpt-4o"
+
+    def get_tool_by_name(self, tool_name: str):
+        return self.tools.get(tool_name)
+
+
+DEFAULT_MAX_STEPS = 50
+DEFAULT_MAX_ERROR_COUNT = 3
+
+
+class Runner:
+    """Stub Runner: each run performs exactly one step."""
+
+    _step_override = None  # Set to a callable to override _step behavior
+
+    @classmethod
+    async def run(
+        cls,
+        starting_agent,
+        user_input: str,
+        memory=None,
+        *,
+        max_steps: int = DEFAULT_MAX_STEPS,
+        llm_error_strategy: str = "retry",
+    ):
+        if memory is None:
+            memory = MemoryAgent(
+                system_instructions=starting_agent.instructions
+            )
+        memory.insert_user_input(user_input)
+        step_result = await cls._step(agent=starting_agent, memory=memory)
+        if not isinstance(step_result, ActionStepError):
+            yield step_result
+
+    @classmethod
+    async def _step(cls, *, agent, memory) -> ActionStep | ActionStepError:
+        if cls._step_override is not None:
+            return await cls._step_override(agent=agent, memory=memory)
+        return ActionStep(step_status=StepStatus.FINISHED, content="Done")
+
+    @classmethod
+    async def _invoke_tool_call(
+        cls, agent, model_response
+    ) -> list:
+        return []
+
+
+async def run_single_query(
+    query: str,
+    agent_name: str = "",
+    model_config_name: str = "",
+    tools: dict | None = None,
+    tools_desc: list | None = None,
+    system_prompt: str = "",
+):
+    """Stub of run_single_query returning a user/assistant message pair."""
+    agent_instructions = (
+        system_prompt if system_prompt else "You are a helpful agent."
+    )
+    agent = Agent(
+        name=agent_name,
+        tools=tools or {},
+        tools_desc=tools_desc or [],
+        model_config_name=model_config_name,
+        instructions=agent_instructions,
+    )
+    memory = MemoryAgent(system_instructions=system_prompt)
+
+    # Mirrors real implementation: calls Runner.run as async generator
+    async for step in Runner.run(agent, query, memory):
+        pass
+
+    last_content = "final answer"
+    if memory.turns:
+        last_turn = memory.turns[-1]
+        for s in reversed(last_turn.steps):
+            if isinstance(s, ActionStep) and s.content:
+                last_content = s.content
+                break
+
+    return [
+        {"role": "user", "content": query},
+        {"role": "assistant", "content": {"content": last_content}},
+    ]
+
+
+def _default_tools():
+    return {}
+
+
+def get_system_prompt(language="zh"):
+    return "You are a helpful assistant."
+
+
+def create_sub_agents_wrap(
+    agent_name, model_config_name, tools, tools_desc, system_prompt
+):
+    """Stub factory returning a closure that fakes one result per sub-agent."""
+    async def create_sub_agents(sub_agents: list) -> InternalResponse:
+        import json
+
+        results = []
+        for sa in sub_agents:
+            results.append(
+                {"index": sa.get("index"), "prompt": sa.get("prompt", ""), "response": "sub result"}
+            )
+        return InternalResponse(
+            data=json.dumps(results, ensure_ascii=False)
+        )
+
+    return create_sub_agents
+
+
+def _inject_stub_modules():
+    """Inject stub modules into sys.modules so that wrapt can resolve them."""
+    # Create module hierarchy: src -> src.agent -> src.agent.run, etc.
+    src_mod = types.ModuleType("src")
+    src_agent_mod = types.ModuleType("src.agent")
+    src_agent_run_mod = types.ModuleType("src.agent.run")
+    src_agent_multi_agent_tools_mod = types.ModuleType("src.agent.multi_agent_tools")
+    src_agent_memory_mod = types.ModuleType("src.agent.memory")
+    src_agent_schema_mod = types.ModuleType("src.agent.schema")
+    src_agent_tools_mod = types.ModuleType("src.agent.tools")
+    src_agent_prompt_mod = types.ModuleType("src.agent.prompt")
+    src_utils_mod = types.ModuleType("src.utils")
+    src_utils_config_mod = types.ModuleType("src.utils.config")
+
+    # Populate src.agent.run
+    src_agent_run_mod.Runner = Runner
+    src_agent_run_mod.run_single_query = run_single_query
+    src_agent_run_mod.run_turn = None
+    src_agent_run_mod.extract_messages_from_memory = None
+
+    # Populate src.agent.multi_agent_tools
+    src_agent_multi_agent_tools_mod.create_sub_agents_wrap = create_sub_agents_wrap
+
+    # Populate src.agent.memory
+    src_agent_memory_mod.ActionStep = ActionStep
+    src_agent_memory_mod.ActionStepError = ActionStepError
+    src_agent_memory_mod.MemoryAgent = MemoryAgent
+    src_agent_memory_mod.StepStatus = StepStatus
+    src_agent_memory_mod.UserInputStep = UserInputStep
+
+    # Populate src.agent.schema
+    src_agent_schema_mod.ToolCall = ToolCall
+    src_agent_schema_mod.ToolCallResult = ToolCallResult
+    src_agent_schema_mod.ModelResponse = ModelResponse
+    src_agent_schema_mod.ErrorMarker = ErrorMarker
+    src_agent_schema_mod.LLMOutputItem = LLMOutputItem
+
+    # Populate src.agent.tools
+    src_agent_tools_mod.InternalResponse = InternalResponse
+    src_agent_tools_mod._default_tools = {}
+
+    # Populate src.agent.prompt
+    src_agent_prompt_mod.get_system_prompt = get_system_prompt
+
+    # Populate src.agent.agent
+    src_agent_agent_mod = types.ModuleType("src.agent.agent")
+    src_agent_agent_mod.Agent = Agent
+    src_agent_agent_mod.DEFAULT_MAX_STEPS = DEFAULT_MAX_STEPS
+    src_agent_agent_mod.DEFAULT_MAX_ERROR_COUNT = DEFAULT_MAX_ERROR_COUNT
+
+    # Populate src.utils.config
+    src_utils_config_mod.model_config = {
+        "gpt-4o": {"model_name": "gpt-4o-2024-05-13"},
+    }
+
+    # Wire up parent references
+    src_mod.agent = src_agent_mod
+    src_mod.utils = src_utils_mod
+    src_agent_mod.run = src_agent_run_mod
+    src_agent_mod.multi_agent_tools = src_agent_multi_agent_tools_mod
+    src_agent_mod.memory = src_agent_memory_mod
+    src_agent_mod.schema = src_agent_schema_mod
+    src_agent_mod.tools = src_agent_tools_mod
+    src_agent_mod.prompt = src_agent_prompt_mod
+    src_agent_mod.agent = src_agent_agent_mod
+    src_utils_mod.config = src_utils_config_mod
+
+    # Register in sys.modules
+    sys.modules["src"] = src_mod
+    sys.modules["src.agent"] = src_agent_mod
+    sys.modules["src.agent.run"] = src_agent_run_mod
+    sys.modules["src.agent.multi_agent_tools"] = src_agent_multi_agent_tools_mod
+    sys.modules["src.agent.memory"] = src_agent_memory_mod
+    sys.modules["src.agent.schema"] = src_agent_schema_mod
+    sys.modules["src.agent.tools"] = src_agent_tools_mod
+    sys.modules["src.agent.prompt"] = src_agent_prompt_mod
+    sys.modules["src.agent.agent"] = src_agent_agent_mod
+    sys.modules["src.utils"] = src_utils_mod
+    sys.modules["src.utils.config"] = src_utils_config_mod
+
+
+# Inject stubs before any test imports the instrumentation module
+_inject_stub_modules()
+
+
+# ---------------------------------------------------------------------------
+# OTel test fixtures
+# ---------------------------------------------------------------------------
+
+
+def pytest_configure(config: pytest.Config):
+    """Enable experimental GenAI semconv and span-only content capture."""
+    os.environ["OTEL_SEMCONV_STABILITY_OPT_IN"] = "gen_ai_latest_experimental"
+    os.environ["OTEL_INSTRUMENTATION_GENAI_CAPTURE_MESSAGE_CONTENT"] = "span_only"
+
+
+for _m in list(sys.modules):
+    if _m.startswith("opentelemetry.instrumentation.widesearch"):
+        del sys.modules[_m]
+
+from opentelemetry.instrumentation.widesearch import WideSearchInstrumentor
+from opentelemetry.sdk.trace import TracerProvider
+from opentelemetry.sdk.trace.export import SimpleSpanProcessor
+from opentelemetry.sdk.trace.export.in_memory_span_exporter import (
+    InMemorySpanExporter,
+)
+from opentelemetry.sdk.metrics import MeterProvider
+from opentelemetry.sdk.metrics.export import InMemoryMetricReader
+
+
+@pytest.fixture(scope="function", name="span_exporter")
+def fixture_span_exporter():
+    exporter = InMemorySpanExporter()
+    yield exporter
+
+
+@pytest.fixture(scope="function", name="metric_reader")
+def fixture_metric_reader():
+    reader = InMemoryMetricReader()
+    yield reader
+
+
+@pytest.fixture(scope="function", name="tracer_provider")
+def fixture_tracer_provider(span_exporter):
+    provider = TracerProvider()
+    provider.add_span_processor(SimpleSpanProcessor(span_exporter))
+    return provider
+
+
+@pytest.fixture(scope="function", name="meter_provider")
+def fixture_meter_provider(metric_reader):
+    meter_provider = MeterProvider(metric_readers=[metric_reader])
+    return meter_provider
+
+
+@pytest.fixture(scope="function")
+def instrument(tracer_provider, meter_provider):
+    """Instrument WideSearch for one test and uninstrument afterwards."""
+    instrumentor = WideSearchInstrumentor()
+    instrumentor.instrument(
+        tracer_provider=tracer_provider,
+        meter_provider=meter_provider,
+        skip_dep_check=True,
+    )
+    yield instrumentor
+    instrumentor.uninstrument()
diff --git a/instrumentation-loongsuite/loongsuite-instrumentation-widesearch/tests/test_widesearch.py b/instrumentation-loongsuite/loongsuite-instrumentation-widesearch/tests/test_widesearch.py
new file mode 100644
index 000000000..3f4be12d9
--- /dev/null
+++ b/instrumentation-loongsuite/loongsuite-instrumentation-widesearch/tests/test_widesearch.py
@@ -0,0 +1,797 @@
+"""Tests for WideSearch instrumentation.
+ +Covers: +- Instrumentor lifecycle (instrument/uninstrument idempotency) +- 5 span types: ENTRY, AGENT, STEP, TOOL, TASK +- Parent-child relationships +- Key attributes +- Error paths +""" + +from __future__ import annotations + +import asyncio +import json +import sys +from dataclasses import field +from unittest.mock import AsyncMock, MagicMock, patch + +import pytest + +from opentelemetry.trace import StatusCode + +from .conftest import ( + ActionStep, + ActionStepError, + Agent, + ErrorMarker, + InternalResponse, + LLMOutputItem, + MemoryAgent, + ModelResponse, + Runner, + StepStatus, + ToolCall, + ToolCallResult, +) + + +def _run_async(coro): + """Helper to run async coroutines in tests.""" + loop = asyncio.new_event_loop() + try: + return loop.run_until_complete(coro) + finally: + loop.close() + + +def _run_async_gen(async_gen): + """Helper to consume an async generator.""" + async def _consume(): + results = [] + async for item in async_gen: + results.append(item) + return results + loop = asyncio.new_event_loop() + try: + return loop.run_until_complete(_consume()) + finally: + loop.close() + + +# --------------------------------------------------------------------------- +# Instrumentor Lifecycle Tests +# --------------------------------------------------------------------------- + + +class TestInstrumentorLifecycle: + def test_instrument_and_uninstrument(self, tracer_provider, meter_provider): + from opentelemetry.instrumentation.widesearch import WideSearchInstrumentor + + instrumentor = WideSearchInstrumentor() + instrumentor.instrument( + tracer_provider=tracer_provider, + meter_provider=meter_provider, + skip_dep_check=True, + ) + assert instrumentor._handler is not None + instrumentor.uninstrument() + assert instrumentor._handler is None + + def test_double_instrument_uninstrument(self, tracer_provider, meter_provider): + from opentelemetry.instrumentation.widesearch import WideSearchInstrumentor + + instrumentor = WideSearchInstrumentor() + 
class TestEntrySpan:
    """ENTRY span coverage for the instrumented ``run_single_query`` call."""

    @staticmethod
    def _entry_spans(exporter):
        """Return the finished spans that carry the ENTRY span name."""
        return [
            span
            for span in exporter.get_finished_spans()
            if span.name == "enter_ai_application_system"
        ]

    def test_entry_span_created(self, span_exporter, instrument):
        """A plain query run yields exactly one ENTRY span with core attributes."""
        from src.agent.run import run_single_query

        query_coro = run_single_query("What is AI?", agent_name="searcher")
        _run_async(query_coro)

        found = self._entry_spans(span_exporter)
        assert len(found) == 1

        recorded = dict(found[0].attributes)
        assert recorded.get("gen_ai.span.kind") == "ENTRY"
        assert recorded.get("gen_ai.operation.name") == "enter"
        assert recorded.get("gen_ai.framework") == "widesearch"

    def test_entry_span_records_gen_ai_arms_semantic_attrs(self, span_exporter, instrument):
        """ENTRY carries input/output messages but none of the agent-only metadata.

        Controlled by OTEL_SEMCONV_STABILITY_OPT_IN + SPAN_ONLY capture mode (see conftest).
        """
        from src.agent.run import run_single_query

        search_tool = {
            "type": "function",
            "function": {
                "name": "search_global",
                "description": "Search the web",
                "properties": {},
            },
        }

        _run_async(
            run_single_query(
                "What is AI?",
                agent_name="searcher",
                system_prompt="You are an expert researcher.",
                tools_desc=[search_tool],
            )
        )

        found = self._entry_spans(span_exporter)
        assert len(found) == 1

        recorded = dict(found[0].attributes)
        # Message payloads are expected on ENTRY ...
        assert "gen_ai.input.messages" in recorded
        assert '"role":"user"' in recorded["gen_ai.input.messages"]
        assert "gen_ai.output.messages" in recorded
        # ... while the system prompt and tool definitions are not.
        assert "gen_ai.system_instructions" not in recorded
        assert "gen_ai.tool.definitions" not in recorded

    def test_entry_span_error(self, span_exporter, instrument):
        """An exception escaping the run marks the ENTRY span as ERROR."""
        from src.agent.run import Runner, run_single_query

        async def failing_step(*, agent, memory):
            raise RuntimeError("LLM connection failed")

        Runner._step_override = failing_step
        try:
            with pytest.raises(RuntimeError, match="LLM connection failed"):
                _run_async(run_single_query("test"))
        finally:
            # Always drop the override so later tests see the default step.
            Runner._step_override = None

        found = self._entry_spans(span_exporter)
        assert len(found) == 1
        assert found[0].status.status_code == StatusCode.ERROR
Runner.run(agent, "Hello"): + results.append(step) + return results + + _run_async(_run()) + + spans = span_exporter.get_finished_spans() + agent_spans = [ + s for s in spans if "invoke_agent" in s.name + ] + assert len(agent_spans) == 1 + + span = agent_spans[0] + attrs = dict(span.attributes) + assert attrs.get("gen_ai.span.kind") == "AGENT" + assert attrs.get("gen_ai.operation.name") == "invoke_agent" + assert attrs.get("gen_ai.agent.name") == "search-agent" + assert attrs.get("gen_ai.framework") == "widesearch" + + def test_agent_span_records_gen_ai_arms_semantic_attrs(self, span_exporter, instrument): + """AGENT invoke_agent should expose ARMS-aligned message/tool attributes.""" + from src.agent.run import Runner + + tools_desc = [ + { + "type": "function", + "function": { + "name": "add", + "description": "Add numbers", + "parameters": {}, + }, + } + ] + + agent = Agent( + name="search-agent", + model_config_name="gpt-4o", + tools_desc=tools_desc, + instructions="Solve tasks with tools.", + ) + + async def _run(): + results = [] + async for step in Runner.run(agent, "Hello"): + results.append(step) + return results + + _run_async(_run()) + + spans = span_exporter.get_finished_spans() + agent_spans = [ + s for s in spans if "invoke_agent" in s.name + ] + assert len(agent_spans) == 1 + attrs = dict(agent_spans[0].attributes) + assert "gen_ai.input.messages" in attrs + assert '"role":"user"' in attrs["gen_ai.input.messages"] + assert "gen_ai.output.messages" in attrs + assert "gen_ai.system_instructions" in attrs + assert "gen_ai.tool.definitions" in attrs + assert "add" in attrs["gen_ai.tool.definitions"] + + def test_agent_span_is_child_of_entry(self, span_exporter, instrument): + """AGENT span should be a child of ENTRY span.""" + from src.agent.run import run_single_query + + _run_async(run_single_query("test query", agent_name="test")) + + spans = span_exporter.get_finished_spans() + entry_spans = [ + s for s in spans if s.name == 
"enter_ai_application_system" + ] + agent_spans = [s for s in spans if "invoke_agent" in s.name] + + assert len(entry_spans) == 1 + assert len(agent_spans) == 1 + + entry = entry_spans[0] + agent = agent_spans[0] + assert agent.parent.span_id == entry.context.span_id + + def test_agent_span_error(self, span_exporter, instrument): + """AGENT span should record ERROR when _step raises.""" + from src.agent.run import Runner + + async def failing_step(*, agent, memory): + raise ValueError("Step failure") + + Runner._step_override = failing_step + agent = Agent(name="fail-agent") + + async def _run(): + async for _ in Runner.run(agent, "Hello"): + pass + + try: + with pytest.raises(ValueError): + _run_async(_run()) + finally: + Runner._step_override = None + + spans = span_exporter.get_finished_spans() + agent_spans = [s for s in spans if "invoke_agent" in s.name] + assert len(agent_spans) == 1 + assert agent_spans[0].status.status_code == StatusCode.ERROR + + +# --------------------------------------------------------------------------- +# STEP Span Tests (H3: Runner._step) +# --------------------------------------------------------------------------- + + +class TestStepSpan: + def test_step_span_created(self, span_exporter, instrument): + """Runner._step should produce a STEP span.""" + from src.agent.run import Runner + + agent = Agent(name="stepper") + + async def _run(): + async for _ in Runner.run(agent, "test"): + pass + + _run_async(_run()) + + spans = span_exporter.get_finished_spans() + step_spans = [s for s in spans if s.name == "react step"] + assert len(step_spans) >= 1 + + step = step_spans[0] + attrs = dict(step.attributes) + assert attrs.get("gen_ai.span.kind") == "STEP" + assert attrs.get("gen_ai.operation.name") == "react" + assert attrs.get("gen_ai.react.round") == 1 + + def test_step_span_is_child_of_agent(self, span_exporter, instrument): + """STEP span should be child of AGENT span.""" + from src.agent.run import Runner + + agent = 
Agent(name="stepper") + + async def _run(): + async for _ in Runner.run(agent, "test"): + pass + + _run_async(_run()) + + spans = span_exporter.get_finished_spans() + agent_spans = [s for s in spans if "invoke_agent" in s.name] + step_spans = [s for s in spans if s.name == "react step"] + + assert len(agent_spans) == 1 + assert len(step_spans) >= 1 + + agent_span = agent_spans[0] + step_span = step_spans[0] + assert step_span.parent.span_id == agent_span.context.span_id + + def test_step_span_finish_reason_finished(self, span_exporter, instrument): + """STEP span should have finish_reason='finished' when step finishes.""" + from src.agent.run import Runner + + agent = Agent(name="stepper") + + async def _run(): + async for _ in Runner.run(agent, "test"): + pass + + _run_async(_run()) + + spans = span_exporter.get_finished_spans() + step_spans = [s for s in spans if s.name == "react step"] + assert len(step_spans) >= 1 + attrs = dict(step_spans[0].attributes) + assert attrs.get("gen_ai.react.finish_reason") == "finished" + + def test_step_span_error_on_action_step_error( + self, span_exporter, instrument + ): + """STEP span should record ERROR when _step returns ActionStepError.""" + from src.agent.run import Runner + + async def error_step(*, agent, memory): + return ActionStepError(message="LLM timeout") + + Runner._step_override = error_step + agent = Agent(name="error-agent") + + try: + async def _run(): + async for _ in Runner.run(agent, "test"): + pass + + _run_async(_run()) + finally: + Runner._step_override = None + + spans = span_exporter.get_finished_spans() + step_spans = [s for s in spans if s.name == "react step"] + assert len(step_spans) >= 1 + assert step_spans[0].status.status_code == StatusCode.ERROR + attrs = dict(step_spans[0].attributes) + assert attrs.get("gen_ai.react.finish_reason") == "error" + + +# --------------------------------------------------------------------------- +# TOOL Span Tests (H4: Runner._invoke_tool_call) +# 
--------------------------------------------------------------------------- + + +class TestToolSpan: + def test_tool_span_created(self, span_exporter, instrument): + """_invoke_tool_call should produce TOOL spans.""" + from src.agent.run import Runner + + async def mock_tool(**kwargs): + return InternalResponse(data="search results") + + agent = Agent( + name="tool-agent", + tools={"search_global": mock_tool}, + tools_desc=[ + { + "type": "function", + "function": { + "name": "search_global", + "description": "Search the web", + "parameters": {}, + }, + } + ], + ) + + tc = ToolCall( + tool_name="search_global", + arguments='{"q": "AI"}', + tool_call_id="call_123", + ) + model_resp = ModelResponse( + outputs=[LLMOutputItem(tool_calls=[tc])] + ) + + _run_async(Runner._invoke_tool_call(agent, model_resp)) + + spans = span_exporter.get_finished_spans() + tool_spans = [s for s in spans if "execute_tool" in s.name] + assert len(tool_spans) == 1 + + span = tool_spans[0] + attrs = dict(span.attributes) + assert attrs.get("gen_ai.span.kind") == "TOOL" + assert attrs.get("gen_ai.operation.name") == "execute_tool" + assert attrs.get("gen_ai.tool.name") == "search_global" + assert attrs.get("gen_ai.tool.call.id") == "call_123" + assert attrs.get("gen_ai.framework") == "widesearch" + + def test_tool_span_records_arguments_and_result( + self, span_exporter, instrument + ): + """TOOL span should record arguments and result.""" + from src.agent.run import Runner + + async def mock_tool(q=""): + return InternalResponse(data=f"results for: {q}") + + agent = Agent( + name="tool-agent", + tools={"search_global": mock_tool}, + ) + + tc = ToolCall( + tool_name="search_global", + arguments=json.dumps({"q": "OpenTelemetry"}), + tool_call_id="call_456", + ) + model_resp = ModelResponse( + outputs=[LLMOutputItem(tool_calls=[tc])] + ) + + results = _run_async(Runner._invoke_tool_call(agent, model_resp)) + assert len(results) == 1 + assert results[0].content == "results for: OpenTelemetry" + 
+ spans = span_exporter.get_finished_spans() + tool_spans = [s for s in spans if "execute_tool" in s.name] + assert len(tool_spans) == 1 + attrs = dict(tool_spans[0].attributes) + assert "gen_ai.tool.call.arguments" in attrs + assert "gen_ai.tool.call.result" in attrs + + def test_tool_span_error_on_missing_tool(self, span_exporter, instrument): + """TOOL span should record ERROR when tool not found.""" + from src.agent.run import Runner + + agent = Agent(name="tool-agent", tools={}) + + tc = ToolCall( + tool_name="nonexistent_tool", + arguments="{}", + tool_call_id="call_789", + ) + model_resp = ModelResponse( + outputs=[LLMOutputItem(tool_calls=[tc])] + ) + + results = _run_async(Runner._invoke_tool_call(agent, model_resp)) + assert len(results) == 1 + assert results[0].error_marker is not None + + spans = span_exporter.get_finished_spans() + tool_spans = [s for s in spans if "execute_tool" in s.name] + assert len(tool_spans) == 1 + assert tool_spans[0].status.status_code == StatusCode.ERROR + + def test_tool_span_error_on_exception(self, span_exporter, instrument): + """TOOL span should record ERROR when tool raises exception.""" + from src.agent.run import Runner + + async def failing_tool(**kwargs): + raise ConnectionError("Network error") + + agent = Agent( + name="tool-agent", + tools={"flaky_tool": failing_tool}, + ) + + tc = ToolCall( + tool_name="flaky_tool", + arguments="{}", + tool_call_id="call_err", + ) + model_resp = ModelResponse( + outputs=[LLMOutputItem(tool_calls=[tc])] + ) + + results = _run_async(Runner._invoke_tool_call(agent, model_resp)) + assert len(results) == 1 + assert results[0].error_marker is not None + assert "Network error" in results[0].error_marker.message + + spans = span_exporter.get_finished_spans() + tool_spans = [s for s in spans if "execute_tool" in s.name] + assert len(tool_spans) == 1 + assert tool_spans[0].status.status_code == StatusCode.ERROR + + def test_multiple_tool_spans(self, span_exporter, instrument): + 
"""Multiple tool_calls should produce multiple TOOL spans.""" + from src.agent.run import Runner + + async def mock_search(**kwargs): + return InternalResponse(data="search result") + + async def mock_browse(**kwargs): + return InternalResponse(data="page content") + + agent = Agent( + name="multi-tool", + tools={ + "search_global": mock_search, + "text_browser_view": mock_browse, + }, + ) + + tc1 = ToolCall( + tool_name="search_global", + arguments='{"q": "test"}', + tool_call_id="call_1", + ) + tc2 = ToolCall( + tool_name="text_browser_view", + arguments='{"url": "http://example.com"}', + tool_call_id="call_2", + ) + model_resp = ModelResponse( + outputs=[LLMOutputItem(tool_calls=[tc1, tc2])] + ) + + results = _run_async(Runner._invoke_tool_call(agent, model_resp)) + assert len(results) == 2 + + spans = span_exporter.get_finished_spans() + tool_spans = [s for s in spans if "execute_tool" in s.name] + assert len(tool_spans) == 2 + + +# --------------------------------------------------------------------------- +# TASK Span Tests (H5: create_sub_agents_wrap) +# --------------------------------------------------------------------------- + + +class TestTaskSpan: + def test_task_span_created(self, span_exporter, instrument): + """create_sub_agents closure should produce a TASK span.""" + from src.agent.multi_agent_tools import create_sub_agents_wrap + + closure = create_sub_agents_wrap( + "main-agent", "gpt-4o", {}, [], "system prompt" + ) + + sub_agents = [ + {"index": 0, "prompt": "Search for X"}, + {"index": 1, "prompt": "Search for Y"}, + ] + + result = _run_async(closure(sub_agents)) + assert result is not None + + spans = span_exporter.get_finished_spans() + task_spans = [ + s for s in spans if s.name == "run_task create_sub_agents" + ] + assert len(task_spans) == 1 + + span = task_spans[0] + attrs = dict(span.attributes) + assert attrs.get("gen_ai.span.kind") == "TASK" + assert attrs.get("gen_ai.operation.name") == "run_task" + assert 
def test_task_span_error(self, span_exporter, instrument):
    """TASK span should record ERROR when the sub-agent closure raises.

    The factory wrapper is exercised directly against a private tracer
    provider and exporter, so the assertion does not depend on the
    module-level patching performed by the ``instrument`` fixture.

    Earlier revisions also replaced ``create_sub_agents_wrap`` on the
    ``src.agent.multi_agent_tools`` module without restoring it and
    uninstrumented the shared fixture twice. None of that contributed to
    the assertions and it leaked state into later tests, so it is gone.
    """
    from opentelemetry.instrumentation.widesearch.patch import (
        wrap_create_sub_agents_factory,
    )
    from opentelemetry.sdk.trace import TracerProvider
    from opentelemetry.sdk.trace.export import SimpleSpanProcessor
    from opentelemetry.sdk.trace.export.in_memory_span_exporter import (
        InMemorySpanExporter,
    )
    from opentelemetry.util.genai.extended_handler import (
        ExtendedTelemetryHandler,
    )

    # Isolated pipeline: spans produced here never reach the shared
    # ``span_exporter`` fixture.
    exporter = InMemorySpanExporter()
    tp = TracerProvider()
    tp.add_span_processor(SimpleSpanProcessor(exporter))
    handler = ExtendedTelemetryHandler(tracer_provider=tp)

    def failing_factory(*args, **kwargs):
        async def failing_closure(sub_agents):
            raise RuntimeError("Boom")

        return failing_closure

    # Called the same way wrapt would call it: (wrapped, instance, args, kwargs).
    wrapped_closure = wrap_create_sub_agents_factory(
        failing_factory, None, (), {}, handler=handler
    )

    with pytest.raises(RuntimeError, match="Boom"):
        _run_async(wrapped_closure([{"index": 0, "prompt": "x"}]))

    task_spans = [
        s
        for s in exporter.get_finished_spans()
        if s.name == "run_task create_sub_agents"
    ]
    assert len(task_spans) == 1
    assert task_spans[0].status.status_code == StatusCode.ERROR
AGENT + assert step.parent.span_id == agent.context.span_id + + def test_tool_span_is_child_of_step(self, span_exporter, instrument): + """TOOL span should be child of the STEP span when invoked during a step.""" + from src.agent.run import Runner + + async def mock_tool(**kwargs): + return InternalResponse(data="result") + + agent = Agent( + name="hierarchy-agent", + tools={"my_tool": mock_tool}, + ) + + async def custom_step(*, agent, memory): + tc = ToolCall( + tool_name="my_tool", + arguments="{}", + tool_call_id="tc_hier", + ) + model_resp = ModelResponse( + outputs=[LLMOutputItem(tool_calls=[tc])] + ) + await Runner._invoke_tool_call(agent, model_resp) + return ActionStep(step_status=StepStatus.FINISHED, content="done") + + Runner._step_override = custom_step + + try: + async def _run(): + async for _ in Runner.run(agent, "test"): + pass + + _run_async(_run()) + finally: + Runner._step_override = None + + spans = span_exporter.get_finished_spans() + step_spans = [s for s in spans if s.name == "react step"] + tool_spans = [s for s in spans if "execute_tool" in s.name] + + assert len(step_spans) >= 1 + assert len(tool_spans) >= 1 + + step_span = step_spans[0] + tool_span = tool_spans[0] + assert tool_span.parent.span_id == step_span.context.span_id diff --git a/instrumentation-loongsuite/loongsuite-instrumentation-wildtool/README.md b/instrumentation-loongsuite/loongsuite-instrumentation-wildtool/README.md new file mode 100644 index 000000000..1b0499fa4 --- /dev/null +++ b/instrumentation-loongsuite/loongsuite-instrumentation-wildtool/README.md @@ -0,0 +1,55 @@ +# LoongSuite WildToolBench Instrumentation + +OpenTelemetry instrumentation for the [WildToolBench](https://github.com/yupeijei1997/WildToolBench) benchmark framework. + +## Installation + +WildToolBench is not available on PyPI. 
Install it from source: + +```bash +pip install -e /path/to/WildToolBench/wild-tool-bench +pip install loongsuite-instrumentation-wildtool +``` + +## Requirements + +- **OpenAI provider instrumentation**: To produce LLM spans, you must also enable an OpenAI provider instrumentation (e.g., `opentelemetry-instrumentation-openai` or LoongSuite's equivalent). This plugin creates ENTRY/AGENT/CHAIN/STEP/TOOL spans but does **not** create LLM spans itself. + +## Usage + +```python +from opentelemetry.instrumentation.wildtool import WildToolInstrumentor + +WildToolInstrumentor().instrument() + +# Run WildToolBench as usual — spans are automatically generated. +``` + +## Span Topology + +``` +ENTRY (enter_ai_application_system) +└── AGENT (invoke_agent wildtool) + └── CHAIN (workflow task_{idx}) + └── STEP (react step) + ├── [LLM span — provider instrumentation] + └── TOOL (execute_tool {tool_name}) +``` + +## Patch Points + +| # | Target | Span Type | +|---|--------|-----------| +| P1 | `multi_threaded_inference` | ENTRY | +| P2 | `BaseHandler.inference_multi_turn` | AGENT | +| P3 | `BaseHandler.inference_and_eval_multi_step` | CHAIN + TOOL | +| P4 | `BaseHandler._request_tool_call` | STEP | +| P5 | `BaseHandler._parse_api_response` | (token extraction) | + +## Round 2 fixes (see `llm-dev/execute.md` § "修订记录 (Round 2 fix)") + +- **H1**: TOOL span is now parented on STEP, not CHAIN. Strategy A enhanced — the chain wrapper holds a `round → STEP span` map and uses `trace.set_span_in_context(step_span)` to anchor each post-hoc TOOL span on the matching STEP. STEP `SpanContext`s remain valid parents even after `end()`. +- **H2 (provider-name fallback)**: `opentelemetry-instrumentation-openai-v2 == 0.62b1` only emits the legacy `gen_ai.system` attribute on its LLM span; the new `gen_ai.provider.name` attribute is missing. 
As a *pure fallback* the wildtool plugin writes both `gen_ai.system="openai"` and `gen_ai.provider.name="openai"` on the **STEP** span (not on the LLM span — that is owned by the OpenAI v2 instrumentation and we do **not** patch it). Once the OpenAI v2 instrumentation upstream emits `gen_ai.provider.name` natively this fallback can be removed. +- **M1**: CHAIN span now carries `input.value` (last user message in `inference_data["messages"]`, truncated to 4096 chars) and `output.value` (JSON of `action_name_label`/`task_idx`/`is_optimal`). +- **M2**: STEP span now carries `gen_ai.react.finish_reason` on error paths. Mapping table is in `execute.md` § "M2: gen_ai.react.finish_reason 取值映射". +- **M3**: TOOL span explicitly writes `gen_ai.tool.call.arguments` / `gen_ai.tool.call.result` / `gen_ai.tool.description`, bypassing `OTEL_INSTRUMENTATION_GENAI_CAPTURE_*` gating in `opentelemetry-util-genai`. The custom `wildtool.tool.execution_mode = "ground_truth_replay"` is preserved. diff --git a/instrumentation-loongsuite/loongsuite-instrumentation-wildtool/pyproject.toml b/instrumentation-loongsuite/loongsuite-instrumentation-wildtool/pyproject.toml new file mode 100644 index 000000000..b8f9f44d0 --- /dev/null +++ b/instrumentation-loongsuite/loongsuite-instrumentation-wildtool/pyproject.toml @@ -0,0 +1,66 @@ +[build-system] +requires = ["hatchling"] +build-backend = "hatchling.build" + +[project] +name = "loongsuite-instrumentation-wildtool" +dynamic = ["version"] +description = "LoongSuite WildToolBench Instrumentation" +readme = "README.md" +license = "Apache-2.0" +requires-python = ">=3.9" +authors = [ + { name = "LoongSuite Python Agent Authors", email = "caishipeng.csp@alibaba-inc.com" }, + { name = "OpenTelemetry Authors", email = "cncf-opentelemetry-contributors@lists.cncf.io" }, +] +classifiers = [ + "Development Status :: 4 - Beta", + "Intended Audience :: Developers", + "License :: OSI Approved :: Apache Software License", + "Programming Language :: Python", + 
"Programming Language :: Python :: 3", + "Programming Language :: Python :: 3.9", + "Programming Language :: Python :: 3.10", + "Programming Language :: Python :: 3.11", + "Programming Language :: Python :: 3.12", + "Programming Language :: Python :: 3.13", +] +dependencies = [ + "opentelemetry-api ~= 1.37", + "opentelemetry-instrumentation >= 0.58b0", + "opentelemetry-semantic-conventions >= 0.58b0", + "opentelemetry-util-genai", + "wrapt >= 1.17.3, < 3.0.0", +] + +[project.optional-dependencies] +instruments = [ + "openai >= 1.0.0", +] + +test = [ + "pytest ~= 8.0", + "pytest-cov ~= 4.1.0", + "pytest-forked >= 1.6.0", + "opentelemetry-sdk >= 1.37", + "openai >= 1.0.0", +] + +[project.entry-points.opentelemetry_instrumentor] +wildtool = "opentelemetry.instrumentation.wildtool:WildToolInstrumentor" + +[project.urls] +Homepage = "https://github.com/alibaba/loongsuite-python-agent/tree/main/instrumentation-loongsuite/loongsuite-instrumentation-wildtool" +Repository = "https://github.com/alibaba/loongsuite-python-agent" + +[tool.hatch.version] +path = "src/opentelemetry/instrumentation/wildtool/version.py" + +[tool.hatch.build.targets.sdist] +include = [ + "/src", + "/tests", +] + +[tool.hatch.build.targets.wheel] +packages = ["src/opentelemetry"] diff --git a/instrumentation-loongsuite/loongsuite-instrumentation-wildtool/src/opentelemetry/instrumentation/wildtool/__init__.py b/instrumentation-loongsuite/loongsuite-instrumentation-wildtool/src/opentelemetry/instrumentation/wildtool/__init__.py new file mode 100644 index 000000000..dad772500 --- /dev/null +++ b/instrumentation-loongsuite/loongsuite-instrumentation-wildtool/src/opentelemetry/instrumentation/wildtool/__init__.py @@ -0,0 +1,161 @@ +"""OpenTelemetry WildToolBench Instrumentation""" + +import logging +from typing import Any, Collection + +from wrapt import wrap_function_wrapper + +from opentelemetry.instrumentation.instrumentor import BaseInstrumentor +from opentelemetry.instrumentation.utils import unwrap 
class WildToolInstrumentor(BaseInstrumentor):
    """OpenTelemetry instrumentor for WildToolBench framework.

    ``_instrument`` wraps three fixed patch points (ENTRY, AGENT, CHAIN).
    The STEP and token-extraction wrappers are installed lazily per concrete
    handler subclass through :meth:`ensure_handler_class_patched`, which the
    chain wrapper receives a reference to via ``self``.
    """

    def __init__(self):
        super().__init__()
        # BaseInstrumentor hands back one shared instance per subclass, so
        # __init__ runs again on every ``WildToolInstrumentor()`` call.
        # Initialise state only once: resetting it while instrumented would
        # drop the handler and the record of patched subclasses, leaving
        # their wrappers impossible to remove in ``_uninstrument``.
        if hasattr(self, "_patched_handler_classes"):
            return
        self._handler = None
        # Track concrete handler subclasses whose abstract _request_tool_call /
        # _parse_api_response we have already wrapped, so we can unwrap on
        # uninstrument and avoid double-wrapping.
        self._patched_handler_classes: set = set()
        self._request_wrapper = None
        self._parse_wrapper = None

    def instrumentation_dependencies(self) -> Collection[str]:
        """Return the package requirements declared in ``package._instruments``."""
        return _instruments

    def _instrument(self, **kwargs: Any) -> None:
        """Create the telemetry handler and wrap the fixed patch points.

        Recognised keyword arguments: ``tracer_provider``, ``meter_provider``
        and ``logger_provider``; each is forwarded to the handler and may be
        absent. A failure to wrap any single target is logged as a warning
        and does not prevent the remaining targets from being wrapped.
        """
        tracer_provider = kwargs.get("tracer_provider")
        meter_provider = kwargs.get("meter_provider")
        logger_provider = kwargs.get("logger_provider")

        self._handler = ExtendedTelemetryHandler(
            tracer_provider=tracer_provider,
            meter_provider=meter_provider,
            logger_provider=logger_provider,
        )
        self._request_wrapper = WildToolRequestWrapper(self._handler)
        self._parse_wrapper = WildToolParseWrapper(self._handler)

        # P1: ENTRY span
        try:
            wrap_function_wrapper(
                _LLM_RESPONSE_GEN_MODULE,
                "multi_threaded_inference",
                WildToolEntryWrapper(self._handler),
            )
        except Exception as e:
            logger.warning("Failed to instrument multi_threaded_inference: %s", e)

        # P2: AGENT span
        try:
            wrap_function_wrapper(
                _BASE_HANDLER_MODULE,
                "BaseHandler.inference_multi_turn",
                WildToolAgentWrapper(self._handler),
            )
        except Exception as e:
            logger.warning("Failed to instrument inference_multi_turn: %s", e)

        # P3: CHAIN span (+ STEP + TOOL management).
        # The chain wrapper also lazily patches the concrete subclass'
        # `_request_tool_call` / `_parse_api_response` on first use, so that
        # subclasses overriding the abstract base methods are still
        # intercepted (P4 / P5).
        try:
            wrap_function_wrapper(
                _BASE_HANDLER_MODULE,
                "BaseHandler.inference_and_eval_multi_step",
                WildToolChainWrapper(self._handler, self),
            )
        except Exception as e:
            logger.warning(
                "Failed to instrument inference_and_eval_multi_step: %s", e
            )

    def ensure_handler_class_patched(self, handler_cls) -> None:
        """Lazily wrap the concrete handler subclass' P4/P5 methods.

        WildToolBench declares ``_request_tool_call`` and ``_parse_api_response``
        as abstract on ``BaseHandler``, but real handlers (and tests) override
        them. Python method resolution dispatches directly to the override and
        therefore never reaches a wrapper installed on the base class. We
        instead wrap the override on first invocation per subclass.

        The class object itself is passed to ``wrap_function_wrapper`` rather
        than a ``module`` + ``ClassName.method`` path: a dotted path built
        from ``__name__`` cannot be resolved for nested or function-local
        handler classes, and ``_uninstrument`` already unwraps by class
        object.

        Does nothing when the instrumentor is not currently instrumented, so
        a ``None`` wrapper is never installed on user code.
        """
        if self._request_wrapper is None or self._parse_wrapper is None:
            return
        if handler_cls in self._patched_handler_classes:
            return
        self._patched_handler_classes.add(handler_cls)

        for method, wrapper in (
            ("_request_tool_call", self._request_wrapper),
            ("_parse_api_response", self._parse_wrapper),
        ):
            # Only wrap methods defined on this class itself; inherited ones
            # belong to (and are tracked under) the class that defines them.
            if method not in handler_cls.__dict__:
                continue
            try:
                wrap_function_wrapper(handler_cls, method, wrapper)
            except Exception as e:
                logger.debug(
                    "Failed to wrap %s.%s.%s: %s",
                    handler_cls.__module__,
                    handler_cls.__name__,
                    method,
                    e,
                )

    def _uninstrument(self, **kwargs: Any) -> None:
        """Remove every wrapper installed by this instrumentor and reset state.

        Each target is unwrapped independently so that a failure on one does
        not leave the others wrapped. Failures are logged at debug level.
        """
        try:
            import wtb._llm_response_generation as llm_gen

            unwrap(llm_gen, "multi_threaded_inference")
        except Exception as e:
            logger.debug("Failed to uninstrument multi_threaded_inference: %s", e)

        try:
            import wtb.model_handler.base_handler as bh
        except Exception as e:
            logger.debug("Failed to uninstrument BaseHandler methods: %s", e)
        else:
            for method in (
                "inference_multi_turn",
                "inference_and_eval_multi_step",
            ):
                try:
                    unwrap(bh.BaseHandler, method)
                except Exception as e:
                    logger.debug(
                        "Failed to uninstrument BaseHandler.%s: %s", method, e
                    )

        for cls in list(self._patched_handler_classes):
            for method in ("_request_tool_call", "_parse_api_response"):
                if method in cls.__dict__:
                    try:
                        unwrap(cls, method)
                    except Exception as e:
                        logger.debug(
                            "Failed to unwrap %s.%s: %s",
                            cls.__name__,
                            method,
                            e,
                        )
        self._patched_handler_classes.clear()
        self._request_wrapper = None
        self._parse_wrapper = None
        self._handler = None
instrumentation. + +Each wrapper corresponds to one patch point and manages the lifecycle +of one or more span types. + +Round 2 fix highlights (see ``llm-dev/execute.md`` § "修订记录 (Round 2 fix)"): + +H1 + TOOL span parent is now STEP rather than CHAIN. Each STEP invocation is + appended to a per-chain list in :data:`_chain_step_invocations`; when the + chain wrapper post-processes ``inference_log`` it looks up the matching + STEP span by ``round`` and uses + :func:`opentelemetry.trace.set_span_in_context` so ``start_execute_tool`` + parents the TOOL span on the STEP context (even if STEP is already + closed — its :class:`SpanContext` remains a valid parent reference). + +H2 + The OpenAI v2 provider instrumentation (0.62b1) writes only the legacy + ``gen_ai.system`` attribute to its LLM span. The wildtool plugin now + writes both ``gen_ai.system`` and ``gen_ai.provider.name`` on the STEP + span as a fallback so the new semantic-conventions attribute is present + in the trace tree even before the upstream OpenAI v2 instrumentation + catches up. We do **not** patch the OpenAI v2 instrumentation itself. + +M1 + ``input.value`` (last user message in the chain's ``messages``, truncated + to 4096 chars) and ``output.value`` (a JSON of action label, task index + and is_optimal) are written on the CHAIN span. + +M2 + ``gen_ai.react.finish_reason`` is derived from ``inference_log`` on the + *last* (currently active) STEP. Mappings: + + ``"parse_tool_calls_failed"`` + ``error_reason`` contains "parse tool_calls failed". + ``"action_name_mismatch"`` + ``error_reason`` contains "action name not in candidate". + ``"empty_response"`` + ``error_reason`` contains "tool_calls and content are None". + ``"error"`` + request raised an exception (handled in + :class:`WildToolRequestWrapper`). + +M3 + ``gen_ai.tool.call.arguments``, ``gen_ai.tool.call.result`` and + ``gen_ai.tool.description`` are written explicitly on TOOL spans + *before* close as a fallback. 
``opentelemetry-util-genai`` gates these + sensitive attributes behind ``OTEL_INSTRUMENTATION_GENAI_CAPTURE_*`` env + vars; the wildtool plugin always writes them since wtb data is + benchmark-synthetic and never PII. +""" + +import json +import logging +from contextvars import ContextVar +from dataclasses import asdict +from typing import List, Optional + +from opentelemetry.trace import StatusCode, set_span_in_context +from opentelemetry.util.genai.extended_handler import ExtendedTelemetryHandler +from opentelemetry.util.genai.extended_types import ( + EntryInvocation, + ExecuteToolInvocation, + InvokeAgentInvocation, + ReactStepInvocation, +) +from opentelemetry.util.genai.types import ( + Error, + InputMessage, + OutputMessage, + Text, +) + +logger = logging.getLogger(__name__) + +# ─────────────────────────── ContextVars ─────────────────────────────── +# The CHAIN wrapper opens a new logical "chain" by flipping ``_in_chain`` +# and resetting the counter. The REQUEST wrapper reads these to decide +# whether to create a STEP span and what round number to assign. +_in_chain: ContextVar[bool] = ContextVar("_wt_in_chain", default=False) + +# Currently open STEP invocation. Used by the parse wrapper to attach +# token attributes to the right span. +_step_invocation: ContextVar[Optional[ReactStepInvocation]] = ContextVar( + "_wt_step_inv", default=None +) +_step_counter: ContextVar[int] = ContextVar("_wt_step_ctr", default=0) + +# Per-chain list of every STEP invocation created in the current chain +# (in `round` order). The chain wrapper allocates this list on entry and +# uses it after ``wrapped`` returns to re-parent TOOL spans onto the +# matching STEP. Even if a STEP span is already ``end()``-ed, its +# :class:`SpanContext` stays valid as a parent reference for new spans. 
+_chain_step_invocations: ContextVar[Optional[List[ReactStepInvocation]]] = ( + ContextVar("_wt_chain_step_invs", default=None) +) + +_PROVIDER_FALLBACK_NAME = "openai" +_INPUT_VALUE_MAX_CHARS = 4096 +_MESSAGE_CONTENT_MAX_CHARS = 4096 + + +def _close_active_step(handler: ExtendedTelemetryHandler) -> None: + """Close the currently active STEP span, if any.""" + prev = _step_invocation.get() + if prev is not None: + try: + handler.stop_react_step(prev) + except Exception as e: # noqa: BLE001 + logger.debug("Failed to close step: %s", e) + _step_invocation.set(None) + + +def _truncate(text: str, max_chars: int) -> str: + if len(text) <= max_chars: + return text + return text[:max_chars] + "...(truncated)" + + +def _stringify(value) -> str: + if isinstance(value, str): + return value + try: + return json.dumps(value, ensure_ascii=False) + except (TypeError, ValueError): + return str(value) + + +def _tasks_to_input_messages(test_entry) -> List[InputMessage]: + if not isinstance(test_entry, dict): + return [] + tasks = test_entry.get("english_tasks") + if not isinstance(tasks, list): + return [] + + messages = [] + for task in tasks: + if task in (None, "", [], {}): + continue + messages.append( + InputMessage( + role="user", + parts=[ + Text( + content=_truncate( + _stringify(task), _MESSAGE_CONTENT_MAX_CHARS + ) + ) + ], + ) + ) + return messages + + +def _task_results_to_output_messages(result) -> List[OutputMessage]: + task_results = _extract_task_results(result) + messages = [] + for task_result in task_results: + content = _extract_task_result_output(task_result) + if content in (None, "", [], {}): + continue + messages.append( + OutputMessage( + role="assistant", + parts=[ + Text( + content=_truncate( + _stringify(content), _MESSAGE_CONTENT_MAX_CHARS + ) + ) + ], + finish_reason=_extract_finish_reason(task_result), + ) + ) + return messages + + +def _get_message_attributes(input_messages, output_messages) -> dict: + attributes = {} + try: + if input_messages: + 
attributes["gen_ai.input.messages"] = json.dumps( + [asdict(message) for message in input_messages], + ensure_ascii=False, + ) + if output_messages: + attributes["gen_ai.output.messages"] = json.dumps( + [asdict(message) for message in output_messages], + ensure_ascii=False, + ) + except Exception as e: # noqa: BLE001 + logger.debug("Failed to serialize message attrs: %s", e) + return attributes + + +def _set_message_attributes(invocation) -> None: + attributes = _get_message_attributes( + invocation.input_messages, invocation.output_messages + ) + if not attributes: + return + invocation.attributes.update(attributes) + span = invocation.span + if span is None or not span.is_recording(): + return + try: + span.set_attributes(attributes) + except Exception as e: # noqa: BLE001 + logger.debug("Failed to set message attrs: %s", e) + + +def _extract_task_results(result) -> List: + if isinstance(result, list): + return result + if not isinstance(result, dict): + return [] + + for key in ( + "result", + "results", + "inference_result", + "inference_results", + "result_list", + "task_results", + "answer", + "answers", + ): + value = result.get(key) + if isinstance(value, list): + return value + if isinstance(value, dict): + return [value] + if value not in (None, "", [], {}): + return [value] + + if any( + key in result + for key in ( + "action_name_label", + "is_optimal", + "inference_log", + "inference_output", + "final_answer", + ) + ): + return [result] + return [] + + +def _extract_task_result_output(task_result): + if not isinstance(task_result, dict): + return task_result + + for key in ("final_answer", "answer", "output", "result"): + value = task_result.get(key) + if value not in (None, "", [], {}): + return value + + inference_log = task_result.get("inference_log") + output_from_log = _extract_output_from_inference_log(inference_log) + if output_from_log not in (None, "", [], {}): + return output_from_log + + label = task_result.get("action_name_label") + if 
label is not None or "is_optimal" in task_result: + return { + "action_name_label": label, + "is_optimal": task_result.get("is_optimal"), + } + return None + + +def _extract_output_from_inference_log(inference_log): + if not isinstance(inference_log, dict): + return None + + for key in sorted( + (k for k in inference_log if k.startswith("step_")), + key=_step_log_sort_key, + reverse=True, + ): + step_data = inference_log.get(key) + if not isinstance(step_data, dict): + continue + + output = step_data.get("inference_output") + if isinstance(output, dict): + for output_key in ( + "content", + "reasoning_content", + "current_action_name_label", + "error_reason", + ): + value = output.get(output_key) + if value not in (None, "", [], {}): + return value + + answer = step_data.get("inference_answer") + if isinstance(answer, dict): + candidate = answer.get("candidate_0_answer_function_list") + if isinstance(candidate, dict): + observation = candidate.get("observation") + if observation not in (None, "", [], {}): + return observation + if answer not in (None, "", [], {}): + return answer + return None + + +def _step_log_sort_key(key: str) -> int: + try: + return int(key[len("step_"):]) + except (TypeError, ValueError): + return -1 + + +def _extract_finish_reason(task_result) -> str: + if isinstance(task_result, dict): + label = task_result.get("action_name_label") + if label == "error": + return "error" + return "stop" + + +class WildToolEntryWrapper: + """P1: Wraps multi_threaded_inference → ENTRY span.""" + + def __init__(self, handler: ExtendedTelemetryHandler): + self._handler = handler + + def __call__(self, wrapped, instance, args, kwargs): + # Signature: multi_threaded_inference(handler, model_name, test_case). + # We only need model_name and test_case for ENTRY attributes; the + # handler instance flows through as args[0] untouched. 
class WildToolAgentWrapper:
    """P2: Wraps BaseHandler.inference_multi_turn → AGENT span.

    The wrapped call and the telemetry post-processing are kept in separate
    ``try`` blocks: only an exception raised by the wrapped function marks
    the AGENT span as failed. A problem while summarising the result (for
    example a malformed token-count field) is logged at debug level and
    never replaces the function's return value.
    """

    def __init__(self, handler: ExtendedTelemetryHandler):
        self._handler = handler

    def __call__(self, wrapped, instance, args, kwargs):
        test_entry = args[0] if args else kwargs.get("test_entry", {})
        # ``_tasks_to_input_messages`` already tolerates non-dict entries;
        # apply the same tolerance to the attribute lookups below.
        entry = test_entry if isinstance(test_entry, dict) else {}
        try:
            turn_count = len(entry.get("english_answer_list", []))
        except TypeError:
            turn_count = 0

        invocation = InvokeAgentInvocation(
            provider=None,
            agent_name=type(instance).__name__,
            input_messages=_tasks_to_input_messages(test_entry),
            conversation_id=entry.get("id"),
            request_model=getattr(instance, "model_name", None),
            attributes={
                "gen_ai.framework": "wildtool",
                "wildtool.turn_count": turn_count,
            },
        )
        self._handler.start_invoke_agent(invocation)
        _set_message_attributes(invocation)

        try:
            result = wrapped(*args, **kwargs)
        except Exception as e:
            _set_message_attributes(invocation)
            self._handler.fail_invoke_agent(
                invocation, Error(message=str(e), type=type(e))
            )
            raise

        # Best effort: describe the outcome on the span.
        try:
            invocation.output_messages = _task_results_to_output_messages(
                result
            )
            _set_message_attributes(invocation)
            self._apply_token_usage(invocation, result)
        except Exception as e:  # noqa: BLE001
            logger.debug("Failed to record agent result: %s", e)

        self._handler.stop_invoke_agent(invocation)
        return result

    @staticmethod
    def _apply_token_usage(invocation, result) -> None:
        """Sum per-task token counts from *result* onto *invocation*.

        Non-dict task results are skipped and a missing or ``None`` count
        field contributes nothing. Totals of zero are not written.
        """
        total_input = 0
        total_output = 0
        for task_result in (result or []):
            if not isinstance(task_result, dict):
                continue
            total_input += sum(task_result.get("input_token_count") or ())
            total_output += sum(task_result.get("output_token_count") or ())
        if total_input:
            invocation.input_tokens = total_input
        if total_output:
            invocation.output_tokens = total_output
function mutates ``messages`` in place). + input_value = self._extract_input_value(inference_data) + if input_value is not None: + chain_attributes["input.value"] = input_value + + with tracer.start_as_current_span( + name=span_name, attributes=chain_attributes + ) as span: + try: + result = wrapped(*args, **kwargs) + + # M2: Set finish_reason on the currently active (last) STEP + # BEFORE we close it. Only the terminal step ever carries an + # error finish_reason (every wtb error path triggers `break`). + if isinstance(result, dict): + self._apply_last_step_finish_reason( + result.get("inference_log", {}) + ) + + _close_active_step(self._handler) + + if isinstance(result, dict): + label = result.get("action_name_label", "") + is_optimal = bool(result.get("is_optimal", False)) + span.set_attribute("wildtool.action_name_label", label) + span.set_attribute("wildtool.is_optimal", is_optimal) + + # M1: ``output.value`` summarising chain outcome. + try: + span.set_attribute( + "output.value", + json.dumps( + { + "action_name_label": label, + "task_idx": task_idx, + "is_optimal": is_optimal, + }, + ensure_ascii=False, + ), + ) + except Exception as e: # noqa: BLE001 + logger.debug("Failed to set output.value: %s", e) + + # H1 + M3: re-parent TOOL spans on STEP and force-write + # tool call sensitive attributes. 
@staticmethod
def _extract_input_value(inference_data) -> Optional[str]:
    """Return the newest user message content, stringified and truncated.

    Looks at ``inference_data["messages"]`` from the end towards the start
    and picks the first dict entry with ``role == "user"`` and a non-``None``
    ``content``. Returns ``None`` when ``inference_data`` is not a dict,
    ``messages`` is not a list, or no such entry exists.
    """
    if not isinstance(inference_data, dict):
        return None
    history = inference_data.get("messages")
    if not isinstance(history, list):
        return None

    for entry in reversed(history):
        is_user_message = isinstance(entry, dict) and entry.get("role") == "user"
        if not is_user_message:
            continue
        payload = entry.get("content")
        if payload is not None:
            return _truncate(_stringify(payload), _INPUT_VALUE_MAX_CHARS)
    return None
+ current_step.finish_reason = reason + + @staticmethod + def _derive_step_finish_reason( + label, error_reason: str + ) -> Optional[str]: + """Map wtb inference_log error_reason → gen_ai.react.finish_reason.""" + if label != "error": + return None + if "parse tool_calls failed" in error_reason: + return "parse_tool_calls_failed" + if "action name not in candidate" in error_reason: + return "action_name_mismatch" + if "tool_calls and content are None" in error_reason: + return "empty_response" + return "error" + + # -- H1 + M3 ---------------------------------------------------------- + + def _create_tool_spans_from_log( + self, + inference_log, + inference_data, + chain_steps: List[ReactStepInvocation], + ) -> None: + """Post-hoc TOOL span creation from inference_log. + + Uses the per-chain STEP invocation list to parent each TOOL span on + the matching STEP span (H1). Sensitive tool-call attributes are + written explicitly on the span (M3) so they appear regardless of + ``OTEL_INSTRUMENTATION_GENAI_CAPTURE_*`` settings. 
+ """ + if not isinstance(inference_log, dict): + return + + # round → SpanContext-bearing OTel context for parenting + step_ctx_by_round = {} + for step_inv in chain_steps: + if step_inv.round is None or step_inv.span is None: + continue + try: + step_ctx_by_round[step_inv.round] = set_span_in_context( + step_inv.span + ) + except Exception as e: # noqa: BLE001 + logger.debug("Failed to compute step parent context: %s", e) + + # tool name → description (for gen_ai.tool.description) + tool_desc_map = {} + tools = inference_data.get("tools") if isinstance( + inference_data, dict + ) else None + if isinstance(tools, list): + for tool in tools: + if not isinstance(tool, dict): + continue + func = tool.get("function") or tool + if not isinstance(func, dict): + continue + name = func.get("name") + desc = func.get("description") + if name: + tool_desc_map[name] = desc + + # Extract tool observations from final messages keyed by tool_call_id; + # wtb only embeds them in messages (not in inference_answer) for the + # tool_call branch. 
+ observation_by_call_id = {} + messages = inference_data.get("messages") if isinstance( + inference_data, dict + ) else None + if isinstance(messages, list): + for msg in messages: + if not isinstance(msg, dict) or msg.get("role") != "tool": + continue + tid = msg.get("tool_call_id") + if tid is None: + continue + content = msg.get("content") + if content is None: + continue + observation_by_call_id[tid] = ( + content if isinstance(content, str) else _stringify(content) + ) + + for key in sorted(k for k in inference_log if k.startswith("step_")): + try: + step_idx = int(key[len("step_"):]) + except ValueError: + continue + round_num = step_idx + 1 + + step_data = inference_log[key] + if not isinstance(step_data, dict): + continue + output = step_data.get("inference_output") or {} + if not isinstance(output, dict): + continue + tool_calls = output.get("tool_calls") + label = output.get("current_action_name_label") + if not tool_calls or label != "correct": + continue + + answer_data = step_data.get("inference_answer") or {} + candidate = ( + answer_data.get("candidate_0_answer_function_list") + if isinstance(answer_data, dict) + else None + ) or {} + candidate_observation = ( + candidate.get("observation") + if isinstance(candidate, dict) + else None + ) + + parent_ctx = step_ctx_by_round.get(round_num) + + for tc in tool_calls: + if not isinstance(tc, dict): + continue + func = tc.get("function") or {} + if not isinstance(func, dict): + func = {} + tool_name = func.get("name", "unknown") + tool_id = tc.get("id") + tool_args_raw = func.get("arguments", "") + tool_args_str = ( + tool_args_raw + if isinstance(tool_args_raw, str) + else _stringify(tool_args_raw) + ) + + observation_str: Optional[str] = None + if tool_id is not None and tool_id in observation_by_call_id: + observation_str = observation_by_call_id[tool_id] + elif candidate_observation is not None: + observation_str = ( + candidate_observation + if isinstance(candidate_observation, str) + else 
class WildToolRequestWrapper:
    """P4: Wraps BaseHandler._request_tool_call.

    Creates STEP span (ReactStepInvocation) before each LLM call.
    Extracts latency from return value. Also writes the H2 provider-name
    fallback attributes (``gen_ai.system`` + ``gen_ai.provider.name``) on
    the STEP span so the new semconv attribute is present in the trace
    even when the upstream OpenAI v2 instrumentation only emits the legacy
    ``gen_ai.system``.

    Outside an active chain (``_in_chain`` is false) the wrapper is a plain
    pass-through. Telemetry failures are logged at debug level and never
    replace the wrapped function's result or exception.
    """

    def __init__(self, handler: ExtendedTelemetryHandler):
        self._handler = handler

    def __call__(self, wrapped, instance, args, kwargs):
        if not _in_chain.get():
            return wrapped(*args, **kwargs)

        # Close the previous step (the natural end-of-step is when the next
        # request fires). The STEP span's SpanContext stays valid as a
        # parent for TOOL spans created later.
        _close_active_step(self._handler)

        step_num = _step_counter.get() + 1
        _step_counter.set(step_num)

        step_inv = ReactStepInvocation(round=step_num)
        try:
            self._handler.start_react_step(step_inv)
        except Exception as e:  # noqa: BLE001
            logger.debug("Failed to start react step: %s", e)
            return wrapped(*args, **kwargs)

        # H2: provider-name fallback attributes. Written on the STEP, not
        # on the LLM span, because the LLM span is owned by the OpenAI v2
        # provider instrumentation and is created lazily inside the wtb
        # request implementation.
        if step_inv.span is not None and step_inv.span.is_recording():
            try:
                step_inv.span.set_attribute(
                    "gen_ai.system", _PROVIDER_FALLBACK_NAME
                )
                step_inv.span.set_attribute(
                    "gen_ai.provider.name", _PROVIDER_FALLBACK_NAME
                )
            except Exception as e:  # noqa: BLE001
                logger.debug("Failed to set provider fallback attrs: %s", e)

        # Track this step for H1 TOOL re-parenting.
        chain_steps = _chain_step_invocations.get()
        if chain_steps is not None:
            chain_steps.append(step_inv)
        _step_invocation.set(step_inv)

        try:
            result = wrapped(*args, **kwargs)
        except Exception as e:
            step_inv.finish_reason = "error"
            try:
                self._handler.fail_react_step(
                    step_inv, Error(message=str(e), type=type(e))
                )
            except Exception as telemetry_error:  # noqa: BLE001
                # Keep the application's exception as the one that
                # propagates, not a failure of the telemetry handler.
                logger.debug(
                    "Failed to fail react step: %s", telemetry_error
                )
            finally:
                # Always clear the active step, even if reporting the
                # failure itself went wrong.
                _step_invocation.set(None)
            raise

        # A two-element tuple is read as (response, latency).
        if isinstance(result, tuple) and len(result) == 2:
            _, latency = result
            try:
                if step_inv.span and step_inv.span.is_recording():
                    step_inv.span.set_attribute(
                        "wildtool.latency", float(latency)
                    )
            except Exception as e:  # noqa: BLE001
                logger.debug("Failed to set wildtool.latency: %s", e)
        return result
def safe_json_dumps(obj: Any, max_length: int = 10000) -> Optional[str]:
    """Safely serialize object to JSON string with length limit.

    Args:
        obj: Value to serialize. ``None`` is passed through as ``None``.
        max_length: Maximum number of characters kept from the serialized
            text before the ``...(truncated)`` marker is appended.

    Returns:
        The JSON text of ``obj``, or ``str(obj)`` when it is not JSON
        serializable. In both cases text longer than ``max_length`` is cut
        and suffixed with ``...(truncated)`` so consumers can tell that the
        value is incomplete.
    """
    if obj is None:
        return None
    try:
        text = json.dumps(obj, ensure_ascii=False)
    except (TypeError, ValueError):
        # Not JSON-serializable: fall back to the plain string form.
        text = str(obj)
    if len(text) > max_length:
        return text[:max_length] + "...(truncated)"
    return text
class FakeChatCompletion:
    """Mimics openai.types.chat.ChatCompletion enough for _parse_api_response.

    The payload dict is exposed both as JSON text via :meth:`json` and as
    attributes (``fake.choices`` reads ``data["choices"]``).
    """

    def __init__(self, data: dict):
        self._data = data

    def json(self):
        """Return the payload serialized as a JSON string."""
        return json.dumps(self._data)

    def __getattr__(self, name):
        # __getattr__ only runs after normal attribute lookup has failed.
        # If ``_data`` itself is missing (instance created without
        # __init__), refuse immediately instead of recursing through
        # ``self._data``.
        if name == "_data":
            raise AttributeError(name)
        try:
            return self._data[name]
        except KeyError:
            # The attribute protocol (hasattr, getattr with a default)
            # expects AttributeError, not KeyError.
            raise AttributeError(name) from None
output_tokens=5): + return FakeChatCompletion( + _make_chat_completion_response( + content=content, + input_tokens=input_tokens, + output_tokens=output_tokens, + ) + ) + + return _factory diff --git a/instrumentation-loongsuite/loongsuite-instrumentation-wildtool/tests/test_agent_span.py b/instrumentation-loongsuite/loongsuite-instrumentation-wildtool/tests/test_agent_span.py new file mode 100644 index 000000000..2711089fc --- /dev/null +++ b/instrumentation-loongsuite/loongsuite-instrumentation-wildtool/tests/test_agent_span.py @@ -0,0 +1,136 @@ +"""Tests for AGENT span (P2: inference_multi_turn).""" + +import json + +from wtb.model_handler.base_handler import BaseHandler + + +class _StubHandler(BaseHandler): + """Minimal handler subclass for testing AGENT span.""" + + def __init__(self): + super().__init__("test-model", 0.0) + self._step_responses = [] + self._step_idx = 0 + + def _request_tool_call(self, inference_data): + resp = self._step_responses[self._step_idx] + self._step_idx += 1 + return resp, 0.1 + + def _parse_api_response(self, api_response): + data = json.loads(api_response.json()) + choice = data["choices"][0] + message = choice["message"] + return { + "reasoning_content": None, + "content": message.get("content"), + "tool_calls": message.get("tool_calls"), + "input_token": data["usage"]["prompt_tokens"], + "output_token": data["usage"]["completion_tokens"], + } + + +class TestAgentSpan: + def test_agent_span_attributes( + self, span_exporter, instrument, simple_test_entry, make_completion, + tool_call_response_factory, text_response_factory, + ): + """AGENT span should exist with correct attributes and token aggregation.""" + handler = _StubHandler() + + # Step 0: model returns tool call for get_weather + resp0 = tool_call_response_factory( + "get_weather", {"city": "Beijing"}, "call_001" + ) + # Step 1: model returns text (prepare_to_answer match) + resp1 = text_response_factory( + "The weather in Beijing is Sunny, 25°C", + input_tokens=20, 
output_tokens=15, + ) + handler._step_responses = [resp0, resp1] + + result = handler.inference_multi_turn(simple_test_entry) + assert result is not None + + spans = span_exporter.get_finished_spans() + agent_spans = [s for s in spans if "invoke_agent" in s.name] + assert len(agent_spans) == 1 + + span = agent_spans[0] + assert span.name == "invoke_agent _StubHandler" + attrs = dict(span.attributes or {}) + assert attrs.get("gen_ai.span.kind") == "AGENT" + assert attrs.get("gen_ai.operation.name") == "invoke_agent" + assert attrs.get("gen_ai.framework") == "wildtool" + assert attrs.get("gen_ai.agent.name") == "_StubHandler" + assert attrs.get("gen_ai.conversation.id") == "wild_tool_bench_test_001" + assert attrs.get("gen_ai.request.model") == "test-model" + assert attrs.get("wildtool.turn_count") == 1 + + assert attrs.get("gen_ai.usage.input_tokens") == 30 + assert attrs.get("gen_ai.usage.output_tokens") == 20 + + def test_agent_span_captures_input_and_output_messages( + self, span_exporter, instrument, simple_test_entry, + tool_call_response_factory, text_response_factory, + ): + """AGENT span should always carry GenAI input/output messages.""" + + handler = _StubHandler() + resp0 = tool_call_response_factory( + "get_weather", {"city": "Beijing"}, "call_001" + ) + resp1 = text_response_factory("The weather in Beijing is Sunny, 25°C") + handler._step_responses = [resp0, resp1] + + handler.inference_multi_turn(simple_test_entry) + + spans = span_exporter.get_finished_spans() + agent_span = [s for s in spans if "invoke_agent" in s.name][0] + attrs = dict(agent_span.attributes or {}) + input_messages = json.loads(attrs["gen_ai.input.messages"]) + output_messages = json.loads(attrs["gen_ai.output.messages"]) + + assert input_messages[0]["role"] == "user" + assert ( + input_messages[0]["parts"][0]["content"] + == "What is the weather in Beijing?" 
+ ) + assert output_messages[0]["role"] == "assistant" + assert ( + output_messages[0]["parts"][0]["content"] + == "The weather in Beijing is Sunny, 25°C" + ) + + def test_agent_parent_is_entry( + self, span_exporter, instrument, simple_test_entry, + tool_call_response_factory, text_response_factory, + ): + """When called via multi_threaded_inference, AGENT span should be child of ENTRY.""" + from wtb._llm_response_generation import multi_threaded_inference # noqa: I001, PLC0415 + + handler = _StubHandler() + resp0 = tool_call_response_factory( + "get_weather", {"city": "Beijing"}, "call_001" + ) + resp1 = text_response_factory("The weather in Beijing is Sunny, 25°C") + handler._step_responses = [resp0, resp1] + + test_case = simple_test_entry.copy() + multi_threaded_inference(handler, "test-model", test_case) + + spans = span_exporter.get_finished_spans() + entry_spans = [ + s for s in spans if s.name == "enter_ai_application_system" + ] + agent_spans = [s for s in spans if "invoke_agent" in s.name] + + assert len(entry_spans) == 1 + assert len(agent_spans) == 1 + + entry = entry_spans[0] + agent = agent_spans[0] + assert agent.context.trace_id == entry.context.trace_id + assert agent.parent is not None + assert agent.parent.span_id == entry.context.span_id diff --git a/instrumentation-loongsuite/loongsuite-instrumentation-wildtool/tests/test_chain_step_tool_spans.py b/instrumentation-loongsuite/loongsuite-instrumentation-wildtool/tests/test_chain_step_tool_spans.py new file mode 100644 index 000000000..d7dd7b4aa --- /dev/null +++ b/instrumentation-loongsuite/loongsuite-instrumentation-wildtool/tests/test_chain_step_tool_spans.py @@ -0,0 +1,283 @@ +"""Tests for CHAIN / STEP / TOOL spans (P3, P4, P5).""" + +import json + +import pytest +from opentelemetry.trace import StatusCode + +from wtb.model_handler.base_handler import BaseHandler + + +class _StubHandler(BaseHandler): + """Minimal handler subclass with controllable responses.""" + + def __init__(self): + 
super().__init__("test-model", 0.0) + self._step_responses = [] + self._step_idx = 0 + + def _request_tool_call(self, inference_data): + resp = self._step_responses[self._step_idx] + self._step_idx += 1 + return resp, 0.05 + + def _parse_api_response(self, api_response): + data = json.loads(api_response.json()) + choice = data["choices"][0] + message = choice["message"] + return { + "reasoning_content": None, + "content": message.get("content"), + "tool_calls": message.get("tool_calls"), + "input_token": data["usage"]["prompt_tokens"], + "output_token": data["usage"]["completion_tokens"], + } + + +class TestChainSpan: + def test_chain_span_per_task( + self, span_exporter, instrument, simple_test_entry, + tool_call_response_factory, text_response_factory, + ): + """Each task should produce one CHAIN span with correct attributes.""" + handler = _StubHandler() + resp0 = tool_call_response_factory( + "get_weather", {"city": "Beijing"}, "call_001" + ) + resp1 = text_response_factory("The weather in Beijing is Sunny, 25°C") + handler._step_responses = [resp0, resp1] + + handler.inference_multi_turn(simple_test_entry) + + spans = span_exporter.get_finished_spans() + chain_spans = [s for s in spans if s.name.startswith("workflow")] + assert len(chain_spans) == 1 + + chain = chain_spans[0] + assert chain.name == "workflow task_0" + attrs = dict(chain.attributes or {}) + assert attrs.get("gen_ai.span.kind") == "CHAIN" + assert attrs.get("gen_ai.operation.name") == "workflow" + assert attrs.get("gen_ai.framework") == "wildtool" + assert attrs.get("wildtool.task_idx") == 0 + assert attrs.get("wildtool.test_entry_id") == "wild_tool_bench_test_001" + assert attrs.get("wildtool.action_name_label") == "correct" + assert attrs.get("wildtool.is_optimal") is True + + def test_chain_parent_is_agent( + self, span_exporter, instrument, simple_test_entry, + tool_call_response_factory, text_response_factory, + ): + """CHAIN span should be child of AGENT span.""" + handler = _StubHandler() 
+ resp0 = tool_call_response_factory( + "get_weather", {"city": "Beijing"}, "call_001" + ) + resp1 = text_response_factory("The weather in Beijing is Sunny, 25°C") + handler._step_responses = [resp0, resp1] + + handler.inference_multi_turn(simple_test_entry) + + spans = span_exporter.get_finished_spans() + agent_spans = [s for s in spans if "invoke_agent" in s.name] + chain_spans = [s for s in spans if s.name.startswith("workflow")] + + assert len(agent_spans) == 1 + assert len(chain_spans) == 1 + + agent = agent_spans[0] + chain = chain_spans[0] + assert chain.context.trace_id == agent.context.trace_id + assert chain.parent is not None + assert chain.parent.span_id == agent.context.span_id + + +class TestStepSpans: + def test_step_spans_per_chain( + self, span_exporter, instrument, simple_test_entry, + tool_call_response_factory, text_response_factory, + ): + """Each _request_tool_call invocation should produce a STEP span.""" + handler = _StubHandler() + resp0 = tool_call_response_factory( + "get_weather", {"city": "Beijing"}, "call_001" + ) + resp1 = text_response_factory("The weather in Beijing is Sunny, 25°C") + handler._step_responses = [resp0, resp1] + + handler.inference_multi_turn(simple_test_entry) + + spans = span_exporter.get_finished_spans() + step_spans = [s for s in spans if s.name == "react step"] + assert len(step_spans) == 2 + + attrs0 = dict(step_spans[0].attributes or {}) + attrs1 = dict(step_spans[1].attributes or {}) + rounds = sorted([attrs0.get("gen_ai.react.round"), attrs1.get("gen_ai.react.round")]) + assert rounds == [1, 2] + + for ss in step_spans: + a = dict(ss.attributes or {}) + assert a.get("gen_ai.span.kind") == "STEP" + assert a.get("gen_ai.operation.name") == "react" + + def test_step_parent_is_chain( + self, span_exporter, instrument, simple_test_entry, + tool_call_response_factory, text_response_factory, + ): + """STEP spans should be children of CHAIN span.""" + handler = _StubHandler() + resp0 = tool_call_response_factory( + 
"get_weather", {"city": "Beijing"}, "call_001" + ) + resp1 = text_response_factory("The weather in Beijing is Sunny, 25°C") + handler._step_responses = [resp0, resp1] + + handler.inference_multi_turn(simple_test_entry) + + spans = span_exporter.get_finished_spans() + chain_spans = [s for s in spans if s.name.startswith("workflow")] + step_spans = [s for s in spans if s.name == "react step"] + + assert len(chain_spans) == 1 + chain = chain_spans[0] + + for ss in step_spans: + assert ss.context.trace_id == chain.context.trace_id + assert ss.parent is not None + assert ss.parent.span_id == chain.context.span_id + + def test_step_token_attributes( + self, span_exporter, instrument, simple_test_entry, + tool_call_response_factory, text_response_factory, + ): + """STEP span should have gen_ai.usage.input_tokens and output_tokens.""" + handler = _StubHandler() + resp0 = tool_call_response_factory( + "get_weather", {"city": "Beijing"}, "call_001" + ) + resp1 = text_response_factory( + "The weather in Beijing is Sunny, 25°C", + input_tokens=25, output_tokens=12, + ) + handler._step_responses = [resp0, resp1] + + handler.inference_multi_turn(simple_test_entry) + + spans = span_exporter.get_finished_spans() + step_spans = sorted( + [s for s in spans if s.name == "react step"], + key=lambda s: s.attributes.get("gen_ai.react.round", 0), + ) + assert len(step_spans) == 2 + + # First step: default 10 input, 5 output from make_completion defaults + a0 = dict(step_spans[0].attributes or {}) + assert a0.get("gen_ai.usage.input_tokens") == 10 + assert a0.get("gen_ai.usage.output_tokens") == 5 + + # Second step: 25 input, 12 output + a1 = dict(step_spans[1].attributes or {}) + assert a1.get("gen_ai.usage.input_tokens") == 25 + assert a1.get("gen_ai.usage.output_tokens") == 12 + + +class TestToolSpans: + def test_tool_span_attributes( + self, span_exporter, instrument, simple_test_entry, + tool_call_response_factory, text_response_factory, + ): + """TOOL span should have correct 
attributes including execution_mode.""" + handler = _StubHandler() + resp0 = tool_call_response_factory( + "get_weather", {"city": "Beijing"}, "call_001" + ) + resp1 = text_response_factory("The weather in Beijing is Sunny, 25°C") + handler._step_responses = [resp0, resp1] + + handler.inference_multi_turn(simple_test_entry) + + spans = span_exporter.get_finished_spans() + tool_spans = [s for s in spans if "execute_tool" in s.name] + assert len(tool_spans) == 1 + + tool = tool_spans[0] + assert tool.name == "execute_tool get_weather" + attrs = dict(tool.attributes or {}) + assert attrs.get("gen_ai.span.kind") == "TOOL" + assert attrs.get("gen_ai.operation.name") == "execute_tool" + assert attrs.get("gen_ai.tool.name") == "get_weather" + assert attrs.get("gen_ai.tool.type") == "function" + assert ( + attrs.get("wildtool.tool.execution_mode") == "ground_truth_replay" + ) + + def test_tool_span_parent_is_chain( + self, span_exporter, instrument, simple_test_entry, + tool_call_response_factory, text_response_factory, + ): + """TOOL spans share the CHAIN trace_id (parent is STEP after Round 2).""" + handler = _StubHandler() + resp0 = tool_call_response_factory( + "get_weather", {"city": "Beijing"}, "call_001" + ) + resp1 = text_response_factory("The weather in Beijing is Sunny, 25°C") + handler._step_responses = [resp0, resp1] + + handler.inference_multi_turn(simple_test_entry) + + spans = span_exporter.get_finished_spans() + chain_spans = [s for s in spans if s.name.startswith("workflow")] + tool_spans = [s for s in spans if "execute_tool" in s.name] + + assert len(chain_spans) == 1 + assert len(tool_spans) >= 1 + + chain = chain_spans[0] + for ts in tool_spans: + assert ts.context.trace_id == chain.context.trace_id + + +class TestSpanHierarchy: + def test_full_hierarchy( + self, span_exporter, instrument, simple_test_entry, + tool_call_response_factory, text_response_factory, + ): + """Verify ENTRY → AGENT → CHAIN → STEP hierarchy and consistent trace_id.""" + from 
wtb._llm_response_generation import multi_threaded_inference + + handler = _StubHandler() + resp0 = tool_call_response_factory( + "get_weather", {"city": "Beijing"}, "call_001" + ) + resp1 = text_response_factory("The weather in Beijing is Sunny, 25°C") + handler._step_responses = [resp0, resp1] + + test_case = simple_test_entry.copy() + multi_threaded_inference(handler, "test-model", test_case) + + spans = span_exporter.get_finished_spans() + + entry = [s for s in spans if s.name == "enter_ai_application_system"] + agent = [s for s in spans if "invoke_agent" in s.name] + chain = [s for s in spans if s.name.startswith("workflow")] + step = [s for s in spans if s.name == "react step"] + tool = [s for s in spans if "execute_tool" in s.name] + + assert len(entry) == 1 + assert len(agent) == 1 + assert len(chain) == 1 + assert len(step) == 2 + assert len(tool) >= 1 + + trace_id = entry[0].context.trace_id + for s in spans: + assert s.context.trace_id == trace_id + + # AGENT parent = ENTRY + assert agent[0].parent.span_id == entry[0].context.span_id + # CHAIN parent = AGENT + assert chain[0].parent.span_id == agent[0].context.span_id + # STEP parent = CHAIN + for s in step: + assert s.parent.span_id == chain[0].context.span_id diff --git a/instrumentation-loongsuite/loongsuite-instrumentation-wildtool/tests/test_entry_span.py b/instrumentation-loongsuite/loongsuite-instrumentation-wildtool/tests/test_entry_span.py new file mode 100644 index 000000000..2a1e864b5 --- /dev/null +++ b/instrumentation-loongsuite/loongsuite-instrumentation-wildtool/tests/test_entry_span.py @@ -0,0 +1,168 @@ +"""Tests for ENTRY span (P1: multi_threaded_inference). + +Module-level imports of ``wtb._llm_response_generation.multi_threaded_inference`` +must be avoided: ``wrapt.wrap_function_wrapper`` patches the attribute on the +module, but a pre-imported local binding still references the original +unwrapped function. 
All tests therefore import the symbol lazily after the +``instrument`` fixture has run. +""" + +import json + +import pytest +from wtb.model_handler.base_handler import BaseHandler + +from opentelemetry.trace import StatusCode + + +class _StubHandler(BaseHandler): + """Minimal handler subclass for testing. + + Overrides ``inference`` so the multi_threaded_inference wrapper invokes a + deterministic, side-effect-free body that returns a fake result dict and + therefore exercises only the ENTRY span codepath. + """ + + def __init__(self): + super().__init__("test-model", 0.0) + + def _request_tool_call(self, inference_data): + raise NotImplementedError + + def _parse_api_response(self, api_response): + raise NotImplementedError + + def inference(self, test_entry): + return [ + { + "action_name_label": "correct", + "is_optimal": True, + "inference_log": {}, + "latency": [0.1], + "input_token_count": [10], + "output_token_count": [5], + } + ] + + +class TestEntrySpan: + def test_entry_span_created(self, span_exporter, instrument): + """ENTRY span should be created with correct attributes.""" + from wtb._llm_response_generation import multi_threaded_inference # noqa: I001, PLC0415 + + handler = _StubHandler() + test_case = { + "id": "wild_tool_bench_test_001", + "english_tasks": ["task1", "task2"], + } + + result = multi_threaded_inference(handler, "gpt-4o", test_case) + + assert result is not None + assert result["id"] == "wild_tool_bench_test_001" + + spans = span_exporter.get_finished_spans() + entry_spans = [ + s for s in spans if s.name == "enter_ai_application_system" + ] + assert len(entry_spans) == 1 + + span = entry_spans[0] + attrs = dict(span.attributes or {}) + assert attrs.get("gen_ai.span.kind") == "ENTRY" + assert attrs.get("gen_ai.operation.name") == "enter" + assert attrs.get("gen_ai.framework") == "wildtool" + assert attrs.get("gen_ai.session.id") == "wild_tool_bench_test_001" + assert attrs.get("gen_ai.request.model") == "gpt-4o" + assert 
attrs.get("wildtool.turn_count") == 2 + # ENTRY spans rely on default OTel status semantics: success leaves + # the span UNSET, failures explicitly mark it ERROR. + assert span.status.status_code != StatusCode.ERROR + + def test_entry_span_captures_input_and_output_messages( + self, span_exporter, instrument, + ): + """ENTRY span should always carry GenAI input/output messages.""" + + from opentelemetry.instrumentation.wildtool._wrappers import ( # noqa: PLC0415 + WildToolEntryWrapper, + ) + + wrapper = WildToolEntryWrapper(instrument._handler) + test_case = { + "id": "wild_tool_bench_test_messages", + "english_tasks": ["Search for the capital of France"], + } + + def _success(handler, model_name, test_case): + return [ + { + "action_name_label": "correct", + "is_optimal": True, + "inference_log": { + "step_0": { + "inference_output": { + "content": "Paris is the capital of France." + } + } + }, + } + ] + + wrapper(_success, None, (_StubHandler(), "gpt-4o", test_case), {}) + + spans = span_exporter.get_finished_spans() + entry_span = [ + s for s in spans if s.name == "enter_ai_application_system" + ][0] + attrs = dict(entry_span.attributes or {}) + input_messages = json.loads(attrs["gen_ai.input.messages"]) + output_messages = json.loads(attrs["gen_ai.output.messages"]) + + assert input_messages[0]["role"] == "user" + assert ( + input_messages[0]["parts"][0]["content"] + == "Search for the capital of France" + ) + assert output_messages[0]["role"] == "assistant" + assert ( + output_messages[0]["parts"][0]["content"] + == "Paris is the capital of France." + ) + + def test_entry_span_error_path(self, span_exporter, instrument): + """The ENTRY wrapper marks the span ERROR when the wrapped callable + raises an unhandled exception. + + ``multi_threaded_inference`` swallows non-rate-limit errors itself + (see test_error_scenarios.test_entry_span_captures_retry_error_path + for that path). 
To exercise the wrapper's failure branch directly we + invoke the underlying ``WildToolEntryWrapper`` with a callable that + deliberately raises, bypassing ``multi_threaded_inference``'s own + error handling. + """ + from opentelemetry.instrumentation.wildtool._wrappers import ( # noqa: PLC0415 + WildToolEntryWrapper, + ) + + wrapper = WildToolEntryWrapper(instrument._handler) + + def _raising(handler, model_name, test_case): + raise RuntimeError("API connection failed") + + handler = _StubHandler() + test_case = { + "id": "wild_tool_bench_test_002", + "english_tasks": ["task1"], + } + + with pytest.raises(RuntimeError, match="API connection failed"): + wrapper(_raising, None, (handler, "gpt-4o", test_case), {}) + + spans = span_exporter.get_finished_spans() + entry_spans = [ + s for s in spans if s.name == "enter_ai_application_system" + ] + assert len(entry_spans) == 1 + span = entry_spans[0] + assert span.status.status_code == StatusCode.ERROR diff --git a/instrumentation-loongsuite/loongsuite-instrumentation-wildtool/tests/test_error_scenarios.py b/instrumentation-loongsuite/loongsuite-instrumentation-wildtool/tests/test_error_scenarios.py new file mode 100644 index 000000000..c14a3f40c --- /dev/null +++ b/instrumentation-loongsuite/loongsuite-instrumentation-wildtool/tests/test_error_scenarios.py @@ -0,0 +1,135 @@ +"""Tests for error/edge-case scenarios.""" + +import json + +import pytest +from opentelemetry.trace import StatusCode + +from wtb.model_handler.base_handler import BaseHandler + + +class _StubHandler(BaseHandler): + """Handler with controllable step responses.""" + + def __init__(self): + super().__init__("test-model", 0.0) + self._step_responses = [] + self._step_idx = 0 + + def _request_tool_call(self, inference_data): + resp = self._step_responses[self._step_idx] + self._step_idx += 1 + if isinstance(resp, Exception): + raise resp + return resp, 0.05 + + def _parse_api_response(self, api_response): + data = json.loads(api_response.json()) + 
choice = data["choices"][0] + message = choice["message"] + return { + "reasoning_content": None, + "content": message.get("content"), + "tool_calls": message.get("tool_calls"), + "input_token": data["usage"]["prompt_tokens"], + "output_token": data["usage"]["completion_tokens"], + } + + +class TestErrorScenarios: + def test_action_name_mismatch( + self, span_exporter, instrument, simple_test_entry, + tool_call_response_factory, + ): + """When model calls wrong tool, CHAIN span should still be OK with error label.""" + handler = _StubHandler() + # Model calls wrong_tool instead of get_weather + resp0 = tool_call_response_factory( + "wrong_tool", {"x": 1}, "call_bad" + ) + handler._step_responses = [resp0] + + handler.inference_multi_turn(simple_test_entry) + + spans = span_exporter.get_finished_spans() + chain_spans = [s for s in spans if s.name.startswith("workflow")] + assert len(chain_spans) == 1 + + chain = chain_spans[0] + attrs = dict(chain.attributes or {}) + assert attrs.get("wildtool.action_name_label") == "error" + assert chain.status.status_code == StatusCode.OK + + def test_empty_response( + self, span_exporter, instrument, simple_test_entry, + make_completion, + ): + """When model returns no content and no tool_calls, process terminates gracefully.""" + from tests.conftest import FakeChatCompletion, _make_chat_completion_response + + handler = _StubHandler() + resp = FakeChatCompletion( + _make_chat_completion_response(content="", tool_calls=None) + ) + handler._step_responses = [resp] + + handler.inference_multi_turn(simple_test_entry) + + spans = span_exporter.get_finished_spans() + chain_spans = [s for s in spans if s.name.startswith("workflow")] + assert len(chain_spans) == 1 + attrs = dict(chain_spans[0].attributes or {}) + assert attrs.get("wildtool.action_name_label") == "error" + + def test_request_tool_call_exception_sets_error( + self, span_exporter, instrument, simple_test_entry, + ): + """Exception in _request_tool_call should produce ERROR 
on STEP span and propagate.""" + handler = _StubHandler() + handler._step_responses = [RuntimeError("Connection timeout")] + + with pytest.raises(RuntimeError, match="Connection timeout"): + handler.inference_multi_turn(simple_test_entry) + + spans = span_exporter.get_finished_spans() + step_spans = [s for s in spans if s.name == "react step"] + assert len(step_spans) == 1 + assert step_spans[0].status.status_code == StatusCode.ERROR + + chain_spans = [s for s in spans if s.name.startswith("workflow")] + assert len(chain_spans) == 1 + assert chain_spans[0].status.status_code == StatusCode.ERROR + + def test_entry_span_captures_retry_error_path( + self, span_exporter, instrument, + ): + """multi_threaded_inference catches non-rate-limit errors and returns error dict. + ENTRY span should still complete successfully (not raise).""" + from wtb._llm_response_generation import multi_threaded_inference + + handler = _StubHandler() + + def failing_inference(test_entry): + raise ValueError("Invalid JSON from model") + + handler.inference = failing_inference + + test_case = { + "id": "wild_tool_bench_err_001", + "english_tasks": ["task1"], + } + + # multi_threaded_inference catches non-rate-limit errors + result = multi_threaded_inference(handler, "test-model", test_case) + assert "Error during inference" in result["result"] + + spans = span_exporter.get_finished_spans() + entry_spans = [ + s for s in spans if s.name == "enter_ai_application_system" + ] + assert len(entry_spans) == 1 + # multi_threaded_inference's own try/except converts the error into a + # normal return, so the ENTRY wrapper observes a successful call and + # leaves the span at the default UNSET status (definitely not ERROR). 
+ span = entry_spans[0] + assert span.status.status_code != StatusCode.ERROR diff --git a/instrumentation-loongsuite/loongsuite-instrumentation-wildtool/tests/test_instrumentor.py b/instrumentation-loongsuite/loongsuite-instrumentation-wildtool/tests/test_instrumentor.py new file mode 100644 index 000000000..a8be5b4da --- /dev/null +++ b/instrumentation-loongsuite/loongsuite-instrumentation-wildtool/tests/test_instrumentor.py @@ -0,0 +1,20 @@ +"""Tests for WildToolInstrumentor lifecycle.""" + +from opentelemetry.instrumentation.wildtool import WildToolInstrumentor + + +class TestWildToolInstrumentor: + def test_instrument_and_uninstrument(self, tracer_provider): + instrumentor = WildToolInstrumentor() + instrumentor.instrument( + tracer_provider=tracer_provider, + skip_dep_check=True, + ) + assert instrumentor._handler is not None + instrumentor.uninstrument() + assert instrumentor._handler is None + + def test_instrumentation_dependencies(self): + instrumentor = WildToolInstrumentor() + deps = instrumentor.instrumentation_dependencies() + assert ("openai >= 1.0.0",) == deps diff --git a/instrumentation-loongsuite/loongsuite-instrumentation-wildtool/tests/test_round2_fixes.py b/instrumentation-loongsuite/loongsuite-instrumentation-wildtool/tests/test_round2_fixes.py new file mode 100644 index 000000000..9f4f4d895 --- /dev/null +++ b/instrumentation-loongsuite/loongsuite-instrumentation-wildtool/tests/test_round2_fixes.py @@ -0,0 +1,441 @@ +"""Round 2 regression tests covering the H1 / H2 / M1 / M2 / M3 fixes. + +See ``llm-dev/execute.md`` § "修订记录 (Round 2 fix)" and +``example-deploy/validation/SUMMARY.md`` for the original validation gaps +addressed by these tests. 
+""" + +import json + +import pytest +from opentelemetry.trace import StatusCode + +from wtb.model_handler.base_handler import BaseHandler + + +class _StubHandler(BaseHandler): + """Minimal handler with controllable LLM responses (no real network).""" + + def __init__(self): + super().__init__("test-model", 0.0) + self._step_responses = [] + self._step_idx = 0 + + def _request_tool_call(self, inference_data): + resp = self._step_responses[self._step_idx] + self._step_idx += 1 + if isinstance(resp, Exception): + raise resp + return resp, 0.05 + + def _parse_api_response(self, api_response): + data = json.loads(api_response.json()) + choice = data["choices"][0] + message = choice["message"] + return { + "reasoning_content": None, + "content": message.get("content"), + "tool_calls": message.get("tool_calls"), + "input_token": data["usage"]["prompt_tokens"], + "output_token": data["usage"]["completion_tokens"], + } + + +def _spans_by_kind(spans, kind): + return [s for s in spans if (s.attributes or {}).get("gen_ai.span.kind") == kind] + + +def _spans_named(spans, name): + return [s for s in spans if s.name == name] + + +def _step_for_round(spans, round_num): + for s in _spans_named(spans, "react step"): + attrs = s.attributes or {} + if attrs.get("gen_ai.react.round") == round_num: + return s + raise AssertionError(f"no STEP span found for round={round_num}") + + +# ============================================================================ +# H1: TOOL span parent_span_id == STEP span_id (was CHAIN before fix) +# ============================================================================ + + +class TestToolParentIsStep: + def test_single_tool_parent_is_step_round_one( + self, span_exporter, instrument, simple_test_entry, + tool_call_response_factory, text_response_factory, + ): + """The single TOOL span in simple_test_entry should be a child of the + first STEP span (round=1), not the CHAIN span.""" + handler = _StubHandler() + resp0 = tool_call_response_factory( + 
"get_weather", {"city": "Beijing"}, "call_001" + ) + resp1 = text_response_factory("The weather in Beijing is Sunny, 25°C") + handler._step_responses = [resp0, resp1] + + handler.inference_multi_turn(simple_test_entry) + + spans = span_exporter.get_finished_spans() + tool_spans = _spans_by_kind(spans, "TOOL") + assert len(tool_spans) == 1, [s.name for s in spans] + + tool = tool_spans[0] + step_round1 = _step_for_round(spans, 1) + chain = _spans_by_kind(spans, "CHAIN")[0] + + # H1 core assertion: parent is STEP, not CHAIN. + assert tool.parent is not None + assert tool.parent.span_id == step_round1.context.span_id, ( + "TOOL parent should be STEP round=1, got " + f"{tool.parent.span_id} (STEP={step_round1.context.span_id}, " + f"CHAIN={chain.context.span_id})" + ) + assert tool.parent.span_id != chain.context.span_id + + # And trace_id of course remains consistent. + assert tool.context.trace_id == step_round1.context.trace_id + + def test_multi_step_each_tool_parented_to_correct_step( + self, span_exporter, instrument, + tool_call_response_factory, text_response_factory, + ): + """multi-step scenario: 2 successful tool steps + 1 prepare_to_answer. + + Each TOOL span must be parented to the STEP span of its own round, + not to the CHAIN or to a different round's STEP. + """ + handler = _StubHandler() + # Test entry with 2 tool steps (search, lookup) then prepare_to_answer. 
+ test_entry = { + "id": "wild_tool_bench_multi_001", + "english_env_info": "2025-01-01", + "english_tools": [ + { + "type": "function", + "function": { + "name": "search", + "description": "Search items", + "parameters": { + "type": "object", + "properties": {"q": {"type": "string"}}, + "required": ["q"], + }, + }, + }, + { + "type": "function", + "function": { + "name": "lookup", + "description": "Look up details", + "parameters": { + "type": "object", + "properties": {"id": {"type": "string"}}, + "required": ["id"], + }, + }, + }, + ], + "english_tasks": ["Find and summarize item X"], + "english_answer_list": [ + [ + { + "action": {"name": "search", "arguments": {"q": "X"}}, + "observation": "found:item_42", + "dependency_list": [], + }, + { + "action": {"name": "lookup", "arguments": {"id": "item_42"}}, + "observation": "details:hello", + "dependency_list": [0], + }, + { + "action": {"name": "prepare_to_answer", "arguments": {}}, + "observation": "Item X is hello.", + "dependency_list": [1], + }, + ] + ], + } + + resp_step1 = tool_call_response_factory( + "search", {"q": "X"}, "call_search_1" + ) + resp_step2 = tool_call_response_factory( + "lookup", {"id": "item_42"}, "call_lookup_1" + ) + resp_step3 = text_response_factory("Item X is hello.") + handler._step_responses = [resp_step1, resp_step2, resp_step3] + + handler.inference_multi_turn(test_entry) + + spans = span_exporter.get_finished_spans() + tool_spans = sorted( + _spans_by_kind(spans, "TOOL"), + key=lambda s: (s.attributes or {}).get("gen_ai.tool.name") or "", + ) + assert len(tool_spans) == 2, [s.name for s in spans] + + step_round1 = _step_for_round(spans, 1) + step_round2 = _step_for_round(spans, 2) + chain = _spans_by_kind(spans, "CHAIN")[0] + + lookup_tool = next( + t for t in tool_spans + if (t.attributes or {}).get("gen_ai.tool.name") == "lookup" + ) + search_tool = next( + t for t in tool_spans + if (t.attributes or {}).get("gen_ai.tool.name") == "search" + ) + + # search → STEP round=1, 
lookup → STEP round=2 + assert search_tool.parent.span_id == step_round1.context.span_id + assert lookup_tool.parent.span_id == step_round2.context.span_id + # Neither parented on CHAIN (the regression we are fixing) + for t in tool_spans: + assert t.parent.span_id != chain.context.span_id + assert t.context.trace_id == chain.context.trace_id + + +# ============================================================================ +# M1: CHAIN span carries input.value and output.value +# ============================================================================ + + +class TestChainInputOutputValue: + def test_chain_input_value_and_output_value( + self, span_exporter, instrument, simple_test_entry, + tool_call_response_factory, text_response_factory, + ): + handler = _StubHandler() + resp0 = tool_call_response_factory( + "get_weather", {"city": "Beijing"}, "call_001" + ) + resp1 = text_response_factory("The weather in Beijing is Sunny, 25°C") + handler._step_responses = [resp0, resp1] + + handler.inference_multi_turn(simple_test_entry) + + spans = span_exporter.get_finished_spans() + chain_spans = _spans_by_kind(spans, "CHAIN") + assert len(chain_spans) == 1 + attrs = dict(chain_spans[0].attributes or {}) + + # input.value: last user message of the chain (prepared by wtb's + # _pre_messages_processing which appends the current task as user). + assert "input.value" in attrs, attrs + assert attrs["input.value"] == "What is the weather in Beijing?" + + # output.value: JSON containing action_name_label, task_idx, is_optimal. 
+ assert "output.value" in attrs, attrs + out = json.loads(attrs["output.value"]) + assert out["action_name_label"] == "correct" + assert out["task_idx"] == 0 + assert out["is_optimal"] is True + + def test_chain_input_value_truncated_when_long( + self, span_exporter, instrument, + tool_call_response_factory, text_response_factory, + ): + """Very long user content should be truncated to keep span attribute small.""" + handler = _StubHandler() + long_text = "x" * 5000 + test_entry = { + "id": "wild_tool_bench_long_001", + "english_env_info": "2025-01-01", + "english_tools": [ + { + "type": "function", + "function": { + "name": "noop", + "description": "noop", + "parameters": {"type": "object", "properties": {}}, + }, + } + ], + "english_tasks": [long_text], + "english_answer_list": [ + [ + { + "action": {"name": "prepare_to_answer", "arguments": {}}, + "observation": "ok", + "dependency_list": [], + } + ] + ], + } + handler._step_responses = [text_response_factory("ok")] + + handler.inference_multi_turn(test_entry) + + spans = span_exporter.get_finished_spans() + chain = _spans_by_kind(spans, "CHAIN")[0] + attrs = dict(chain.attributes or {}) + assert "input.value" in attrs + # Default cap is 4096; truncated form must be <= cap + suffix length. 
+ assert len(attrs["input.value"]) <= 4096 + len("...(truncated)") + assert attrs["input.value"].startswith("xxx") + + +# ============================================================================ +# M2: STEP span carries gen_ai.react.finish_reason on error paths +# ============================================================================ + + +class TestStepFinishReason: + def test_finish_reason_action_name_mismatch( + self, span_exporter, instrument, simple_test_entry, + tool_call_response_factory, + ): + handler = _StubHandler() + # wrong tool name → wtb's "action name not in candidate" branch + handler._step_responses = [ + tool_call_response_factory("wrong_tool", {"x": 1}, "call_bad") + ] + + handler.inference_multi_turn(simple_test_entry) + + spans = span_exporter.get_finished_spans() + steps = _spans_named(spans, "react step") + assert len(steps) == 1 + attrs = dict(steps[0].attributes or {}) + assert attrs.get("gen_ai.react.finish_reason") == "action_name_mismatch" + + def test_finish_reason_empty_response( + self, span_exporter, instrument, simple_test_entry, make_completion, + ): + """Empty content + no tool_calls → STEP gets finish_reason=empty_response.""" + from tests.conftest import ( + FakeChatCompletion, + _make_chat_completion_response, + ) + + handler = _StubHandler() + handler._step_responses = [ + FakeChatCompletion( + _make_chat_completion_response(content="", tool_calls=None) + ) + ] + + handler.inference_multi_turn(simple_test_entry) + + spans = span_exporter.get_finished_spans() + steps = _spans_named(spans, "react step") + assert len(steps) == 1 + attrs = dict(steps[0].attributes or {}) + assert attrs.get("gen_ai.react.finish_reason") == "empty_response" + + def test_finish_reason_request_exception( + self, span_exporter, instrument, simple_test_entry, + ): + """Exception in _request_tool_call → STEP ERROR + finish_reason=error.""" + handler = _StubHandler() + handler._step_responses = [RuntimeError("Boom")] + + with 
pytest.raises(RuntimeError): + handler.inference_multi_turn(simple_test_entry) + + spans = span_exporter.get_finished_spans() + steps = _spans_named(spans, "react step") + assert len(steps) == 1 + attrs = dict(steps[0].attributes or {}) + assert steps[0].status.status_code == StatusCode.ERROR + assert attrs.get("gen_ai.react.finish_reason") == "error" + + def test_finish_reason_omitted_on_success( + self, span_exporter, instrument, simple_test_entry, + tool_call_response_factory, text_response_factory, + ): + """Successful steps should NOT have a finish_reason (per execute.md).""" + handler = _StubHandler() + handler._step_responses = [ + tool_call_response_factory( + "get_weather", {"city": "Beijing"}, "call_001" + ), + text_response_factory("OK"), + ] + handler.inference_multi_turn(simple_test_entry) + + spans = span_exporter.get_finished_spans() + for s in _spans_named(spans, "react step"): + attrs = dict(s.attributes or {}) + assert "gen_ai.react.finish_reason" not in attrs, ( + f"unexpected finish_reason on success step round=" + f"{attrs.get('gen_ai.react.round')}: {attrs.get('gen_ai.react.finish_reason')}" + ) + + +# ============================================================================ +# M3: TOOL span carries gen_ai.tool.call.arguments / result / description +# (and keeps wildtool.tool.execution_mode) +# ============================================================================ + + +class TestToolSensitiveAttributes: + def test_tool_args_result_description_and_execution_mode( + self, span_exporter, instrument, simple_test_entry, + tool_call_response_factory, text_response_factory, + ): + handler = _StubHandler() + resp0 = tool_call_response_factory( + "get_weather", {"city": "Beijing"}, "call_001" + ) + resp1 = text_response_factory("Sunny day") + handler._step_responses = [resp0, resp1] + + handler.inference_multi_turn(simple_test_entry) + + spans = span_exporter.get_finished_spans() + tool_spans = _spans_by_kind(spans, "TOOL") + assert 
len(tool_spans) == 1 + attrs = dict(tool_spans[0].attributes or {}) + + # M3 explicit attrs. + args_attr = attrs.get("gen_ai.tool.call.arguments") + assert args_attr is not None + assert json.loads(args_attr) == {"city": "Beijing"} + + # observation comes from the appended {"role": "tool", ...} message + # written by wtb after the call matches the answer; it's a string. + result_attr = attrs.get("gen_ai.tool.call.result") + assert result_attr == "Sunny, 25°C", attrs + + # description sourced from inference_data["tools"][i].function.description + assert attrs.get("gen_ai.tool.description") == "Get weather for a city" + + # Existing custom attribute must still be present. + assert ( + attrs.get("wildtool.tool.execution_mode") + == "ground_truth_replay" + ) + + +# ============================================================================ +# H2: STEP span carries gen_ai.system / gen_ai.provider.name fallback +# ============================================================================ + + +class TestStepProviderFallback: + def test_step_has_provider_name_fallback( + self, span_exporter, instrument, simple_test_entry, + tool_call_response_factory, text_response_factory, + ): + handler = _StubHandler() + handler._step_responses = [ + tool_call_response_factory( + "get_weather", {"city": "Beijing"}, "call_001" + ), + text_response_factory("OK"), + ] + handler.inference_multi_turn(simple_test_entry) + + spans = span_exporter.get_finished_spans() + steps = _spans_named(spans, "react step") + assert len(steps) == 2 + for s in steps: + attrs = dict(s.attributes or {}) + assert attrs.get("gen_ai.system") == "openai", attrs + assert attrs.get("gen_ai.provider.name") == "openai", attrs diff --git a/packages.txt b/packages.txt new file mode 100644 index 000000000..cee224898 --- /dev/null +++ b/packages.txt @@ -0,0 +1,112 @@ +aiohappyeyeballs==2.6.1 +aiohttp==3.10.2 +aiosignal==1.3.1 +aliyun-instrumentation-sglang @ 
file:///Users/liuziming/Desktop/loongsuite-python-agent/instrumentation/aliyun-instrumentation-sglang +aliyun-instrumentation-vllm @ file:///Users/liuziming/Desktop/loongsuite-python-agent/instrumentation/aliyun-instrumentation-vllm +-e git+https://github.com/alibaba/loongsuite-python-agent.git@fe5b8bf1938dcd449dfa335234b58af81b00bc98#egg=aliyun_sdk_extension_arms&subdirectory=sdk-extension/aliyun-sdk-extension-arms +aliyun-semantic-conventions==1.2.0 +annotated-types==0.7.0 +anyio==4.10.0 +asgiref==3.8.1 +asttokens==3.0.0 +async-timeout==4.0.3 +attrs==25.3.0 +blinker==1.7.0 +build==1.3.0 +bytecode==0.17.0 +certifi==2024.7.4 +chardet==5.2.0 +charset-normalizer==3.3.2 +click==8.1.7 +cramjam==2.10.0 +crcmod==1.7 +decorator==5.2.1 +Deprecated==1.2.14 +Django==5.2.4 +executing==2.2.1 +fastapi==0.116.1 +filelock==3.19.1 +Flask==3.0.2 +frozenlist==1.4.1 +fsspec==2025.9.0 +googleapis-common-protos==1.70.0 +h11==0.16.0 +http_server_mock==1.7 +httpcore==1.0.9 +httpretty==1.1.4 +httpx==0.28.1 +idna==3.7 +importlib_metadata==8.4.0 +iniconfig==2.0.0 +ipython==9.5.0 +ipython_pygments_lexers==1.1.1 +itsdangerous==2.1.2 +jedi==0.19.2 +Jinja2==3.1.4 +jsonpath==0.82.2 +MarkupSafe==2.1.5 +matplotlib-inline==0.1.7 +mpmath==1.3.0 +multidict==6.0.5 +networkx==3.5 +numpy==2.3.2 +opentelemetry-api==1.30.0 +-e git+https://github.com/alibaba/loongsuite-python-agent.git@fe5b8bf1938dcd449dfa335234b58af81b00bc98#egg=opentelemetry_exporter_otlp_proto_http&subdirectory=exporter/opentelemetry-exporter-otlp-proto-http +-e git+https://github.com/alibaba/loongsuite-python-agent.git@fe5b8bf1938dcd449dfa335234b58af81b00bc98#egg=opentelemetry_instrumentation&subdirectory=opentelemetry-instrumentation +-e git+https://github.com/alibaba/loongsuite-python-agent.git@fe5b8bf1938dcd449dfa335234b58af81b00bc98#egg=opentelemetry_instrumentation_aiohttp_client&subdirectory=instrumentation/opentelemetry-instrumentation-aiohttp-client +opentelemetry-instrumentation-asgi @ 
file:///Users/liuziming/Desktop/loongsuite-python-agent/instrumentation/opentelemetry-instrumentation-asgi +-e git+https://github.com/alibaba/loongsuite-python-agent.git@fe5b8bf1938dcd449dfa335234b58af81b00bc98#egg=opentelemetry_instrumentation_django&subdirectory=instrumentation/opentelemetry-instrumentation-django +opentelemetry-instrumentation-fastapi @ file:///Users/liuziming/Desktop/loongsuite-python-agent/instrumentation/opentelemetry-instrumentation-fastapi +opentelemetry-instrumentation-flask @ file:///Users/liuziming/Desktop/loongsuite-python-agent/instrumentation/opentelemetry-instrumentation-flask +opentelemetry-instrumentation-httpx @ file:///Users/liuziming/Desktop/loongsuite-python-agent/instrumentation/opentelemetry-instrumentation-httpx +opentelemetry-instrumentation-requests @ file:///Users/liuziming/Desktop/loongsuite-python-agent/instrumentation/opentelemetry-instrumentation-requests +opentelemetry-instrumentation-tornado @ file:///Users/liuziming/Desktop/loongsuite-python-agent/instrumentation/opentelemetry-instrumentation-tornado +opentelemetry-instrumentation-wsgi==0.51b0 +-e git+https://github.com/alibaba/loongsuite-python-agent.git@fe5b8bf1938dcd449dfa335234b58af81b00bc98#egg=opentelemetry_sdk&subdirectory=opentelemetry-sdk +-e git+https://github.com/alibaba/loongsuite-python-agent.git@fe5b8bf1938dcd449dfa335234b58af81b00bc98#egg=opentelemetry_semantic_conventions&subdirectory=opentelemetry-semantic-conventions +-e git+https://github.com/alibaba/loongsuite-python-agent.git@fe5b8bf1938dcd449dfa335234b58af81b00bc98#egg=opentelemetry_test_utils&subdirectory=opentelemetry-test-utils +-e git+https://github.com/alibaba/loongsuite-python-agent.git@fe5b8bf1938dcd449dfa335234b58af81b00bc98#egg=opentelemetry_util_http&subdirectory=util/opentelemetry-util-http +packaging==24.0 +parso==0.8.5 +pexpect==4.9.0 +pillow==11.3.0 +pluggy==1.5.0 +prompt_toolkit==3.0.52 +propcache==0.3.2 +protobuf==6.32.0 +psutil==7.0.0 +ptyprocess==0.7.0 +pure_eval==0.2.3 
+py-cpuinfo==9.0.0 +pydantic==2.11.7 +pydantic_core==2.33.2 +Pygments==2.19.2 +pyproject_hooks==1.2.0 +pytest==7.4.4 +python-snappy==0.7.3 +PyYAML==6.0.2 +requests==2.32.3 +setproctitle==1.3.6 +setuptools==80.9.0 +sglang==0.4.8 +sniffio==1.3.1 +sqlparse==0.5.3 +stack-data==0.6.3 +starlette==0.47.2 +sympy==1.14.0 +tomli==2.0.1 +tomlkit==0.13.3 +torch==2.8.0 +tornado==6.5.2 +tqdm==4.67.1 +traitlets==5.14.3 +typing-inspection==0.4.1 +typing_extensions==4.12.2 +urllib3==2.2.2 +uvloop==0.21.0 +wcwidth==0.2.13 +Werkzeug==3.0.6 +wheel==0.45.1 +wrapt==1.16.0 +yarl==1.9.4 +zipp==3.19.2 diff --git a/util/opentelemetry-util-genai/src/opentelemetry/util/genai/extended_types.py b/util/opentelemetry-util-genai/src/opentelemetry/util/genai/extended_types.py index e110fdcd3..d74131cf9 100644 --- a/util/opentelemetry-util-genai/src/opentelemetry/util/genai/extended_types.py +++ b/util/opentelemetry-util-genai/src/opentelemetry/util/genai/extended_types.py @@ -297,6 +297,12 @@ class EntryInvocation: output_messages: List[OutputMessage] = field( default_factory=_new_output_messages ) + system_instruction: List[MessagePart] = field( + default_factory=_new_system_instruction + ) + tool_definitions: List[ToolDefinition] = field( + default_factory=_new_tool_definitions + ) response_time_to_first_token: int | None = None # nanoseconds monotonic_start_s: float | None = None