Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
156 changes: 33 additions & 123 deletions nodescraper/plugins/inband/rdma/rdma_analyzer.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,106 +37,12 @@ class RdmaAnalyzer(DataAnalyzer[RdmaDataModel, None]):

DATA_MODEL = RdmaDataModel

# Error fields checked from rdma statistic output (bnxt_re, mlx5, ionic, etc.)
ERROR_FIELDS = [
"recoverable_errors",
"tx_roce_errors",
"tx_roce_discards",
"rx_roce_errors",
"rx_roce_discards",
"local_ack_timeout_err",
"packet_seq_err",
"max_retry_exceeded",
"rnr_nak_retry_err",
"implied_nak_seq_err",
"unrecoverable_err",
"bad_resp_err",
"local_qp_op_err",
"local_protection_err",
"mem_mgmt_op_err",
"req_remote_invalid_request",
"req_remote_access_errors",
"remote_op_err",
"duplicate_request",
"res_exceed_max",
"resp_local_length_error",
"res_exceeds_wqe",
"res_opcode_err",
"res_rx_invalid_rkey",
"res_rx_domain_err",
"res_rx_no_perm",
"res_rx_range_err",
"res_tx_invalid_rkey",
"res_tx_domain_err",
"res_tx_no_perm",
"res_tx_range_err",
"res_irrq_oflow",
"res_unsup_opcode",
"res_unaligned_atomic",
"res_rem_inv_err",
"res_mem_err",
"res_srq_err",
"res_cmp_err",
"res_invalid_dup_rkey",
"res_wqe_format_err",
"res_cq_load_err",
"res_srq_load_err",
"res_tx_pci_err",
"res_rx_pci_err",
"out_of_buffer",
"out_of_sequence",
"req_cqe_error",
"req_cqe_flush_error",
"resp_cqe_error",
"resp_cqe_flush_error",
"resp_remote_access_errors",
"req_rx_pkt_seq_err",
"req_rx_rnr_retry_err",
"req_rx_rmt_acc_err",
"req_rx_rmt_req_err",
"req_rx_oper_err",
"req_rx_impl_nak_seq_err",
"req_rx_cqe_err",
"req_rx_cqe_flush",
"req_rx_dup_response",
"req_rx_inval_pkts",
"req_tx_loc_acc_err",
"req_tx_loc_oper_err",
"req_tx_mem_mgmt_err",
"req_tx_retry_excd_err",
"req_tx_loc_sgl_inv_err",
"resp_rx_dup_request",
"resp_rx_outof_buf",
"resp_rx_outouf_seq",
"resp_rx_cqe_err",
"resp_rx_cqe_flush",
"resp_rx_loc_len_err",
"resp_rx_inval_request",
"resp_rx_loc_oper_err",
"resp_rx_outof_atomic",
"resp_tx_pkt_seq_err",
"resp_tx_rmt_inval_req_err",
"resp_tx_rmt_acc_err",
"resp_tx_rmt_oper_err",
"resp_tx_rnr_retry_err",
"resp_tx_loc_sgl_inv_err",
"resp_rx_s0_table_err",
"resp_rx_ccl_cts_outouf_seq",
"tx_rdma_ack_timeout",
"tx_rdma_ccl_cts_ack_timeout",
"rx_rdma_mtu_discard_pkts",
]

CRITICAL_ERROR_FIELDS = [
"unrecoverable_err",
"res_tx_pci_err",
"res_rx_pci_err",
"res_mem_err",
]

def analyze_data(self, data: RdmaDataModel, args: Optional[None] = None) -> TaskResult:
"""Analyze RDMA statistics for non-zero error counters.

Error and critical counter names come from each vendor's statistics model
(ionic / bnxt / mlx prefixes).

Args:
data: RDMA data model with statistic_list (and optionally link_list).
args: Unused (analyzer has no configurable args).
Expand All @@ -150,32 +56,36 @@ def analyze_data(self, data: RdmaDataModel, args: Optional[None] = None) -> Task
return self.result

error_state = False
for idx, stat in enumerate(data.statistic_list):
errors_on_interface = [] # (error_field, value, is_critical)
for error_field in self.ERROR_FIELDS:
value = getattr(stat, error_field, None)
if value is not None and value > 0:
is_critical = error_field in self.CRITICAL_ERROR_FIELDS
errors_on_interface.append((error_field, value, is_critical))
if errors_on_interface:
error_state = True
interface_label = stat.ifname or "unknown"
error_names = [e[0] for e in errors_on_interface]
any_critical = any(e[2] for e in errors_on_interface)
priority = EventPriority.CRITICAL if any_critical else EventPriority.ERROR
errors_data = {field: value for field, value, _ in errors_on_interface}
self._log_event(
category=EventCategory.IO,
description=f"RDMA error detected on {interface_label}: [{', '.join(error_names)}]",
data={
"interface": stat.ifname,
"port": stat.port,
"errors": errors_data,
"statistic_index": idx,
},
priority=priority,
console_log=True,
)

for stat in data.statistic_list:
if stat.vendor_statistics is None:
continue

error_fields = stat.vendor_statistics.error_fields
critical_fields = stat.vendor_statistics.critial_error_fields

for error_field in error_fields + critical_fields:
error_value = getattr(stat.vendor_statistics, error_field, None)

if error_value is not None and error_value > 0:
priority = (
EventPriority.CRITICAL
if error_field in critical_fields
else EventPriority.ERROR
)
self._log_event(
category=EventCategory.NETWORK,
description=f"RDMA error detected: {error_field}",
data={
"interface": stat.ifname,
"port": stat.port,
"error_field": error_field,
"error_count": error_value,
},
priority=priority,
console_log=True,
)
error_state = True

if error_state:
self.result.message = "RDMA errors detected in statistics"
Expand Down
69 changes: 65 additions & 4 deletions nodescraper/plugins/inband/rdma/rdma_collector.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,15 @@
from nodescraper.models import TaskResult
from nodescraper.utils import get_exception_traceback

from .rdmadata import RdmaDataModel, RdmaDevice, RdmaLink, RdmaLinkText, RdmaStatistics
from .rdmadata import (
VENDOR_PREFIX_MAP,
RdmaDataModel,
RdmaDevice,
RdmaLink,
RdmaLinkText,
RdmaStatistics,
RdmaVendorStatistics,
)


class RdmaCollector(InBandDataCollector[RdmaDataModel, None]):
Expand Down Expand Up @@ -172,7 +180,11 @@ def _parse_rdma_link_text(self, output: str) -> list[RdmaLinkText]:
return links

def _get_rdma_statistics(self) -> Optional[list[RdmaStatistics]]:
"""Get RDMA statistics from 'rdma statistic -j'."""
"""Get RDMA statistics from 'rdma statistic -j'.

Warns on unexpected or missing fields relative to the vendor-specific model
for the interface prefix (ionic / bnxt / mlx).
"""
stat_data = self._run_rdma_command(self.CMD_STATISTIC)
if stat_data is None:
return None
Expand All @@ -190,15 +202,64 @@ def _get_rdma_statistics(self) -> Optional[list[RdmaStatistics]]:
priority=EventPriority.WARNING,
)
continue
statistics.append(RdmaStatistics(**stat))

ifname = stat.get("ifname", "")
vendor_stats: Optional[RdmaVendorStatistics] = None
for prefix, vendor_cls in VENDOR_PREFIX_MAP.items():
if ifname.startswith(prefix):
vendor_fields = set(vendor_cls.model_fields.keys())
stat_fields = set(stat.keys()) - {"ifname", "port"}

extra_fields = stat_fields - vendor_fields
if extra_fields:
self._log_event(
category=EventCategory.NETWORK,
description=f"Unexpected fields in RDMA statistic for {ifname}",
data={
"interface": ifname,
"extra_fields": sorted(extra_fields),
},
priority=EventPriority.WARNING,
)

missing_fields = vendor_fields - stat_fields
if missing_fields:
self._log_event(
category=EventCategory.NETWORK,
description=f"Missing fields in RDMA statistic for {ifname}",
data={
"interface": ifname,
"missing_fields": sorted(missing_fields),
},
priority=EventPriority.WARNING,
)

try:
vendor_stats = vendor_cls(**stat)
except ValidationError as ve:
self._log_event(
category=EventCategory.NETWORK,
description=f"Failed to build vendor model for {ifname}",
data={"exception": get_exception_traceback(ve)},
priority=EventPriority.WARNING,
)
break

rdma_stat = RdmaStatistics(
ifname=stat.get("ifname"),
port=stat.get("port"),
vendor_statistics=vendor_stats,
)
statistics.append(rdma_stat)
return statistics
except ValidationError as e:
self._log_event(
category=EventCategory.NETWORK,
description="Failed to build RdmaStatistics model",
data={"exception": get_exception_traceback(e)},
priority=EventPriority.WARNING,
)
return statistics
return None

def _get_rdma_link(self) -> Optional[list[RdmaLink]]:
"""Get RDMA link data from 'rdma link -j'."""
Expand Down
Loading
Loading