diff --git a/README.md b/README.md index 27c1457..97213c9 100644 --- a/README.md +++ b/README.md @@ -362,6 +362,33 @@ Save this to `dmesg_custom_config.json` and run: node-scraper --plugin-configs=dmesg_custom_config.json run-plugins DmesgPlugin ``` +### Regex helper and `RegexSearchPlugin` + +A small utility of common regex patterns is available at `nodescraper.regex_patterns` to +help build analyzer-friendly `error_regex` dicts. This is useful when composing configs for +`RegexSearchPlugin` or other analyzers that accept `error_regex` lists. + +Python example (programmatic usage): + +```py +from nodescraper import regex_patterns +from nodescraper.plugins.regex_search.regex_search_analyzer import RegexSearchAnalyzer +from nodescraper.plugins.regex_search.regex_search_data import RegexSearchData + +# build error_regex list from named common patterns +rules = regex_patterns.build_error_regex_dicts(["ipv4", "email"], message_template="Found {name}") + +# prepare data and args +data = RegexSearchData(content="2026-05-01T12:00:00,000+00:00 connect from 192.0.2.1") +args = {"error_regex": rules} + +analyzer = RegexSearchAnalyzer(system_info=None) +result = analyzer.analyze_data(data, args) +print(result.events) +``` + +CLI note: `RegexSearchPlugin` accepts `--data` pointing to a file or directory and `--error-regex` entries for patterns when invoked from the CLI or a plugin config JSON. + #### **'compare-runs' subcommand** The `compare-runs` subcommand compares datamodels from two run log directories (e.g. two `nodescraper_log_*` folders). By default, all plugins with data in both runs are compared. 
diff --git a/docs/PLUGIN_DOC.md b/docs/PLUGIN_DOC.md index 5e84641..4339096 100644 --- a/docs/PLUGIN_DOC.md +++ b/docs/PLUGIN_DOC.md @@ -4,6 +4,7 @@ | Plugin | Collection | Analyzer Args | Collection Args | DataModel | Collector | Analyzer | | --- | --- | --- | --- | --- | --- | --- | +| RegexHelper | - | **Utility:** common regex patterns and `build_error_regex_dicts()` to produce analyzer-friendly `error_regex` dicts; useful when composing `RegexSearchPlugin` configs programmatically. | - | - | - | - | | AmdSmiPlugin | bad-pages
firmware --json
list --json
metric -g all
partition --json
process --json
ras --cper --folder={folder}
ras --afid --cper-file {cper_file}
static -g all --json
static -g {gpu_id} --json
topology
version --json
xgmi -l
xgmi -m | **Analyzer Args:**
- `check_static_data`: bool — If True, run static data checks (e.g. driver version, partition mode).
- `expected_gpu_processes`: Optional[int] — Expected number of GPU processes.
- `expected_max_power`: Optional[int] — Expected maximum power value (e.g. watts).
- `expected_driver_version`: Optional[str] — Expected AMD driver version string.
- `expected_memory_partition_mode`: Optional[str] — Expected memory partition mode (e.g. NPS1, NPS4).
- `expected_compute_partition_mode`: Optional[str] — Expected compute partition mode.
- `expected_firmware_versions`: Optional[dict[str, str]] — Expected firmware versions keyed by amd-smi fw_id (e.g. PLDM_BUNDLE).
- `l0_to_recovery_count_error_threshold`: Optional[int] — L0-to-recovery count above which an error is raised.
- `l0_to_recovery_count_warning_threshold`: Optional[int] — L0-to-recovery count above which a warning is raised.
- `vendorid_ep`: Optional[str] — Expected endpoint vendor ID (e.g. for PCIe).
- `vendorid_ep_vf`: Optional[str] — Expected endpoint VF vendor ID.
- `devid_ep`: Optional[str] — Expected endpoint device ID.
- `devid_ep_vf`: Optional[str] — Expected endpoint VF device ID.
- `sku_name`: Optional[str] — Expected SKU name string for GPU.
- `expected_xgmi_speed`: Optional[list[float]] — Expected xGMI speed value(s) (e.g. link rate).
- `analysis_range_start`: Optional[datetime.datetime] — Start of time range for time-windowed analysis.
- `analysis_range_end`: Optional[datetime.datetime] — End of time range for time-windowed analysis. | **Collection Args:**
- `analysis_firmware_ids`: Optional[list[str]] — amd-smi fw_id values to record in analysis_ref.firmware_versions
- `cper_file_path`: Optional[str] — Path to CPER folder or file for RAS AFID collection (ras --afid --cper-file). | [AmdSmiDataModel](#AmdSmiDataModel-Model) | [AmdSmiCollector](#Collector-Class-AmdSmiCollector) | [AmdSmiAnalyzer](#Data-Analyzer-Class-AmdSmiAnalyzer) | | BiosPlugin | sh -c 'cat /sys/devices/virtual/dmi/id/bios_version'
wmic bios get SMBIOSBIOSVersion /Value | **Analyzer Args:**
- `exp_bios_version`: list[str] — Expected BIOS version(s) to match against collected value (str or list).
- `regex_match`: bool — If True, match exp_bios_version as regex; otherwise exact match. | - | [BiosDataModel](#BiosDataModel-Model) | [BiosCollector](#Collector-Class-BiosCollector) | [BiosAnalyzer](#Data-Analyzer-Class-BiosAnalyzer) | | CmdlinePlugin | cat /proc/cmdline | **Analyzer Args:**
- `required_cmdline`: Union[str, List] — Command-line parameters that must be present (e.g. 'pci=bfsort').
- `banned_cmdline`: Union[str, List] — Command-line parameters that must not be present.
- `os_overrides`: Dict[str, nodescraper.plugins.inband.cmdline.cmdlineconfig.OverrideConfig] — Per-OS overrides for required_cmdline and banned_cmdline (keyed by OS identifier).
- `platform_overrides`: Dict[str, nodescraper.plugins.inband.cmdline.cmdlineconfig.OverrideConfig] — Per-platform overrides for required_cmdline and banned_cmdline (keyed by platform). | - | [CmdlineDataModel](#CmdlineDataModel-Model) | [CmdlineCollector](#Collector-Class-CmdlineCollector) | [CmdlineAnalyzer](#Data-Analyzer-Class-CmdlineAnalyzer) |
diff --git a/nodescraper/regex_patterns.py b/nodescraper/regex_patterns.py
new file mode 100644
index 0000000..2920e27
--- /dev/null
+++ b/nodescraper/regex_patterns.py
@@ -0,0 +1,52 @@
+"""Common regex patterns and helpers for building analyzer error rules.
+
+Keep these lightweight and dependency-free so other modules can import them
+without circular imports.
+"""
+from typing import Iterable, List
+
+COMMON_PATTERNS: dict[str, str] = {
+    "ipv4": r"\b(?:25[0-5]|2[0-4]\d|1?\d?\d)(?:\.(?:25[0-5]|2[0-4]\d|1?\d?\d)){3}\b",
+    "mac": r"\b(?:[0-9A-Fa-f]{2}[:-]){5}[0-9A-Fa-f]{2}\b",
+    "uuid": r"\b[0-9a-fA-F]{8}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{12}\b",
+    "iso8601_ts": r"\b\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}(?:\.\d+)?(?:Z|[+-]\d{2}:?\d{2})?\b",
+    "email": r"\b[\w.+-]+@[\w-]+(?:\.[\w-]+)+\b",
+}
+
+
+def get_pattern(name: str) -> str:
+    """Return the raw regex string for a named common pattern.
+
+    Raises KeyError if the name is unknown.
+    """
+    return COMMON_PATTERNS[name]
+
+
+def build_error_regex_dicts(
+    names: Iterable[str],
+    message_template: str = "{name} matched",
+    event_category: str = "UNKNOWN",
+    event_priority: str = "ERROR",
+) -> List[dict]:
+    """Create list of dicts compatible with RegexAnalyzer._convert_and_extend_error_regex.
+
+    Each dict contains keys: 'regex' (string), 'message', 'event_category', 'event_priority'.
+    The analyzer will compile the regex strings into patterns. Raises KeyError for names
+    not present in COMMON_PATTERNS.
+    """
+    out: List[dict] = []
+    for name in names:
+        pat = COMMON_PATTERNS.get(name)
+        if pat is None:
+            raise KeyError(f"Unknown pattern name: {name}")
+        out.append(
+            {
+                "regex": pat,
+                "message": message_template.format(name=name),
+                "event_category": event_category,
+                "event_priority": event_priority,
+            }
+        )
+    return out
+
+
+__all__ = ["COMMON_PATTERNS", "get_pattern", "build_error_regex_dicts"]
diff --git a/test/unit/test_regex_patterns.py b/test/unit/test_regex_patterns.py
new file mode 100644
index 0000000..98d0a11
--- /dev/null
+++ b/test/unit/test_regex_patterns.py
@@ -0,0 +1,22 @@
+import re
+
+from nodescraper import regex_patterns
+
+
+def test_ipv4_pattern_matches():
+    pat = regex_patterns.get_pattern("ipv4")
+    compiled = re.compile(pat)
+    assert compiled.search("address 192.0.2.1")
+
+
+def test_mac_and_uuid_patterns_match():
+    mac = regex_patterns.get_pattern("mac")
+    uuid = regex_patterns.get_pattern("uuid")
+    assert re.search(mac, "found MAC 00:1A:2B:3C:4D:5E")
+    assert re.search(uuid, "id: 123e4567-e89b-12d3-a456-426655440000")
+
+
+def test_build_error_regex_dicts_works():
+    rules = regex_patterns.build_error_regex_dicts(["ipv4", "email"], message_template="got {name}")
+    assert isinstance(rules, list) and len(rules) == 2
+    assert all("regex" in r and "message" in r for r in rules)
diff --git a/test/unit/test_regex_search_analyzer_extra.py b/test/unit/test_regex_search_analyzer_extra.py
new file mode 100644
index 0000000..69eeadf
--- /dev/null
+++ b/test/unit/test_regex_search_analyzer_extra.py
@@ -0,0 +1,29 @@
+from nodescraper.models.systeminfo import SystemInfo
+from nodescraper.plugins.regex_search.regex_search_analyzer import RegexSearchAnalyzer
+from nodescraper.plugins.regex_search.regex_search_data import RegexSearchData
+from nodescraper.plugins.regex_search.analyzer_args import RegexSearchAnalyzerArgs
+
+from nodescraper import regex_patterns
+
+
+def test_regex_search_analyzer_detects_ipv4():
+    system_info = SystemInfo()
+    analyzer = RegexSearchAnalyzer(system_info=system_info)
+
+    # Content includes an ISO-like timestamp and an IPv4 address
+    content = "2026-05-01T12:00:00,000+00:00 Something happened at 192.0.2.123\n"
+    data = RegexSearchData(content=content, data_root="regex_search")
+
+    args = {
+        "error_regex": regex_patterns.build_error_regex_dicts(["ipv4"], message_template="Found {name}"),
+        "num_timestamps": 2,
+        "interval_to_collapse_event": 60,
+    }
+
+    result = analyzer.analyze_data(data, args)
+
+    assert result is not None
+    assert len(result.events) >= 1
+    ev = result.events[0]
+    # matched content should include the IPv4
+    assert "192.0.2.123" in str(ev.data.get("match_content", ""))