diff --git a/cmd/telemetry/telemetry.go b/cmd/telemetry/telemetry.go index 5c8864b8..f2736647 100644 --- a/cmd/telemetry/telemetry.go +++ b/cmd/telemetry/telemetry.go @@ -52,17 +52,19 @@ var ( flagAll bool - flagCPU bool - flagFrequency bool - flagIPC bool - flagC6 bool - flagIRQRate bool - flagMemory bool - flagNetwork bool - flagStorage bool - flagPower bool - flagTemperature bool - flagInstrMix bool + flagCPU bool + flagFrequency bool + flagIPC bool + flagC6 bool + flagIRQRate bool + flagMemory bool + flagNetwork bool + flagStorage bool + flagPower bool + flagTemperature bool + flagInstrMix bool + flagVirtualMemory bool + flagProcess bool flagNoSystemSummary bool @@ -76,17 +78,19 @@ const ( flagAllName = "all" - flagCPUName = "cpu" - flagFrequencyName = "frequency" - flagIPCName = "ipc" - flagC6Name = "c6" - flagIRQRateName = "irqrate" - flagMemoryName = "memory" - flagNetworkName = "network" - flagStorageName = "storage" - flagPowerName = "power" - flagTemperatureName = "temperature" - flagInstrMixName = "instrmix" + flagCPUName = "cpu" + flagFrequencyName = "frequency" + flagIPCName = "ipc" + flagC6Name = "c6" + flagIRQRateName = "irqrate" + flagMemoryName = "memory" + flagNetworkName = "network" + flagStorageName = "storage" + flagPowerName = "power" + flagTemperatureName = "temperature" + flagInstrMixName = "instrmix" + flagVirtualMemoryName = "virtual-memory" + flagProcessName = "process" flagNoSystemSummaryName = "no-summary" @@ -108,6 +112,8 @@ var categories = []app.Category{ {FlagName: flagStorageName, FlagVar: &flagStorage, DefaultValue: false, Help: "monitor storage", Tables: []table.TableDefinition{tableDefinitions[DriveTelemetryTableName]}}, {FlagName: flagIRQRateName, FlagVar: &flagIRQRate, DefaultValue: false, Help: "monitor IRQ rate", Tables: []table.TableDefinition{tableDefinitions[IRQRateTelemetryTableName]}}, {FlagName: flagInstrMixName, FlagVar: &flagInstrMix, DefaultValue: false, Help: "monitor instruction mix", Tables: []table.TableDefinition{tableDefinitions[InstructionTelemetryTableName]}}, + {FlagName: flagVirtualMemoryName, FlagVar: &flagVirtualMemory, DefaultValue: false, Help: "monitor virtual memory", Tables: []table.TableDefinition{tableDefinitions[VirtualMemoryTelemetryTableName]}}, + {FlagName: flagProcessName, FlagVar: &flagProcess, DefaultValue: false, Help: "monitor process telemetry", Tables: []table.TableDefinition{tableDefinitions[ProcessTelemetryTableName]}}, } const ( @@ -338,6 +344,8 @@ func runCmd(cmd *cobra.Command, args []string) error { report.RegisterHTMLRenderer(InstructionTelemetryTableName, instructionTelemetryTableHTMLRenderer) report.RegisterHTMLRenderer(GaudiTelemetryTableName, gaudiTelemetryTableHTMLRenderer) report.RegisterHTMLRenderer(PDUTelemetryTableName, pduTelemetryTableHTMLRenderer) + report.RegisterHTMLRenderer(VirtualMemoryTelemetryTableName, virtualMemoryTelemetryTableHTMLRenderer) + report.RegisterHTMLRenderer(ProcessTelemetryTableName, processTelemetryTableHTMLRenderer) return reportingCommand.Run() } @@ -363,6 +371,9 @@ func summaryFromTableValues(allTableValues []table.TableValues, _ map[string]scr networkReads := getMetricAverage(getTableValues(allTableValues, NetworkTelemetryTableName), []string{"rxkB/s"}, "Time") networkWrites := getMetricAverage(getTableValues(allTableValues, NetworkTelemetryTableName), []string{"txkB/s"}, "Time") memAvail := getMetricAverage(getTableValues(allTableValues, MemoryTelemetryTableName), []string{"avail"}, "Time") + minorFaults := getMetricAverage(getTableValues(allTableValues, VirtualMemoryTelemetryTableName), []string{"Minor Faults/s"}, "Time") + majorFaults := getMetricAverage(getTableValues(allTableValues, VirtualMemoryTelemetryTableName), []string{"Major Faults/s"}, "Time") + ctxSwitches := getMetricAverage(getTableValues(allTableValues, ProcessTelemetryTableName), []string{"Context Switches/s"}, "Time") return table.TableValues{ TableDefinition: table.TableDefinition{ Name: telemetrySummaryTableName, @@ -381,6 +392,9 @@ func summaryFromTableValues(allTableValues []table.TableValues, _ map[string]scr {Name: "Drive Writes (kB/s)", Values: []string{driveWrites}}, {Name: "Network RX (kB/s)", Values: []string{networkReads}}, {Name: "Network TX (kB/s)", Values: []string{networkWrites}}, + {Name: "Minor Page Faults/s", Values: []string{minorFaults}}, + {Name: "Major Page Faults/s", Values: []string{majorFaults}}, + {Name: "Context Switches/s", Values: []string{ctxSwitches}}, }, } } diff --git a/cmd/telemetry/telemetry_renderers.go b/cmd/telemetry/telemetry_renderers.go index f88bd392..c1695d42 100644 --- a/cmd/telemetry/telemetry_renderers.go +++ b/cmd/telemetry/telemetry_renderers.go @@ -673,3 +673,73 @@ func pduTelemetryTableHTMLRenderer(tableValues table.TableValues, targetName str } return telemetryTableHTMLRenderer(tableValues, data, datasetNames, chartConfig, nil) } + +func virtualMemoryTelemetryTableHTMLRenderer(tableValues table.TableValues, targetName string) string { + data := [][]float64{} + datasetNames := []string{} + for _, field := range tableValues.Fields[1:] { + points := []float64{} + for _, val := range field.Values { + if val == "" { + break + } + stat, err := strconv.ParseFloat(val, 64) + if err != nil { + slog.Error("error parsing stat", slog.String("error", err.Error())) + return "" + } + points = append(points, stat) + } + if len(points) > 0 { + data = append(data, points) + datasetNames = append(datasetNames, field.Name) + } + } + chartConfig := report.ChartTemplateStruct{ + ID: fmt.Sprintf("%s%d", tableValues.Name, util.RandUint(10000)), + XaxisText: "Time", + YaxisText: "count per second", + TitleText: "", + DisplayTitle: "false", + DisplayLegend: "true", + AspectRatio: "2", + SuggestedMin: "0", + SuggestedMax: "0", + } + return telemetryTableHTMLRenderer(tableValues, data, datasetNames, chartConfig, nil) +} + +func processTelemetryTableHTMLRenderer(tableValues table.TableValues, targetName string) string { + data := [][]float64{} + datasetNames := []string{} + for _, field := range tableValues.Fields[1:] { + points := []float64{} + for _, val := range field.Values { + if val == "" { + break + } + stat, err := strconv.ParseFloat(val, 64) + if err != nil { + slog.Error("error parsing stat", slog.String("error", err.Error())) + return "" + } + points = append(points, stat) + } + if len(points) > 0 { + data = append(data, points) + datasetNames = append(datasetNames, field.Name) + } + } + chartConfig := report.ChartTemplateStruct{ + ID: fmt.Sprintf("%s%d", tableValues.Name, util.RandUint(10000)), + XaxisText: "Time", + YaxisText: "count per second", + TitleText: "", + DisplayTitle: "false", + DisplayLegend: "true", + AspectRatio: "2", + SuggestedMin: "0", + SuggestedMax: "0", + } + return telemetryTableHTMLRenderer(tableValues, data, datasetNames, chartConfig, nil) +} diff --git a/cmd/telemetry/telemetry_tables.go b/cmd/telemetry/telemetry_tables.go index ec9209b7..d1e43cac 100644 --- a/cmd/telemetry/telemetry_tables.go +++ b/cmd/telemetry/telemetry_tables.go @@ -33,6 +33,8 @@ const ( TemperatureTelemetryTableName = "Temperature Telemetry" GaudiTelemetryTableName = "Gaudi Telemetry" PDUTelemetryTableName = "PDU Telemetry" + VirtualMemoryTelemetryTableName = "Virtual Memory Telemetry" + ProcessTelemetryTableName = "Process Telemetry" ) // telemetry table menu labels @@ -51,6 +53,8 @@ const ( TemperatureTelemetryMenuLabel = "Temperature" GaudiTelemetryMenuLabel = "Gaudi" PDUTelemetryMenuLabel = "PDU" + VirtualMemoryTelemetryMenuLabel = "Virtual Memory" + ProcessTelemetryMenuLabel = "Process" ) var tableDefinitions = map[string]table.TableDefinition{ @@ -177,6 +181,22 @@ var tableDefinitions = map[string]table.TableDefinition{ script.PDUTelemetryScriptName, }, FieldsFunc: pduTelemetryTableValues}, + VirtualMemoryTelemetryTableName: { + Name: VirtualMemoryTelemetryTableName, + MenuLabel: VirtualMemoryTelemetryMenuLabel, + HasRows: true, + ScriptNames: []string{ + script.KernelTelemetryScriptName, + }, + FieldsFunc: virtualMemoryTelemetryTableValues}, + ProcessTelemetryTableName: { + Name: ProcessTelemetryTableName, + MenuLabel: ProcessTelemetryMenuLabel, + HasRows: true, + ScriptNames: []string{ + script.KernelTelemetryScriptName, + }, + FieldsFunc: processTelemetryTableValues}, } func cpuUtilizationTelemetryTableValues(outputs map[string]script.ScriptOutput) []table.Field { @@ -707,3 +727,95 @@ func instructionTelemetryTableValues(outputs map[string]script.ScriptOutput) []t } return fields } + +func virtualMemoryTelemetryTableValues(outputs map[string]script.ScriptOutput) []table.Field { + fields := []table.Field{ + {Name: "Time"}, + {Name: "Minor Faults/s"}, + {Name: "Major Faults/s"}, + {Name: "Pgscan/s"}, + {Name: "Pgsteal/s"}, + {Name: "Swapin/s"}, + {Name: "Swapout/s"}, + } + // the output is in CSV format: + // timestamp,ctx_switches_per_sec,procs_running,procs_blocked,minor_faults_per_sec,major_faults_per_sec,pgscan_per_sec,pgsteal_per_sec,swapin_per_sec,swapout_per_sec + reader := csv.NewReader(strings.NewReader(outputs[script.KernelTelemetryScriptName].Stdout)) + records, err := reader.ReadAll() + if err != nil { + slog.Error("failed to read virtual memory telemetry CSV output", slog.String("error", err.Error())) + return []table.Field{} + } + if len(records) == 0 { + return []table.Field{} + } + // first row is the header, find the indices of the fields we're interested in + header := records[0] + fieldIndices := make(map[string]int) + for i, fieldName := range header { + fieldIndices[fieldName] = i + } + requiredFields := []string{"timestamp", "minor_faults_per_sec", "major_faults_per_sec", "pgscan_per_sec", "pgsteal_per_sec", "swapin_per_sec", "swapout_per_sec"} + for _, field := range requiredFields { + if _, ok := fieldIndices[field]; !ok { + slog.Error("missing expected field in virtual memory telemetry output", slog.String("field", field)) + return []table.Field{} + } + } + // subsequent rows are data + for _, record := range records[1:] { + if len(record) != len(header) { + slog.Error("unexpected number of fields in virtual memory telemetry output", slog.Int("expected", len(header)), slog.Int("got", len(record))) + continue + } + fields[0].Values = append(fields[0].Values, record[fieldIndices["timestamp"]]) + fields[1].Values = append(fields[1].Values, record[fieldIndices["minor_faults_per_sec"]]) + fields[2].Values = append(fields[2].Values, record[fieldIndices["major_faults_per_sec"]]) + fields[3].Values = append(fields[3].Values, record[fieldIndices["pgscan_per_sec"]]) + fields[4].Values = append(fields[4].Values, record[fieldIndices["pgsteal_per_sec"]]) + fields[5].Values = append(fields[5].Values, record[fieldIndices["swapin_per_sec"]]) + fields[6].Values = append(fields[6].Values, record[fieldIndices["swapout_per_sec"]]) + } + return fields +} + +func processTelemetryTableValues(outputs map[string]script.ScriptOutput) []table.Field { + fields := []table.Field{ + {Name: "Time"}, + {Name: "Context Switches/s"}, + } + // the output is in CSV format: + // timestamp,ctx_switches_per_sec,procs_running,procs_blocked,minor_faults_per_sec,major_faults_per_sec,pgscan_per_sec,pgsteal_per_sec,swapin_per_sec,swapout_per_sec + reader := csv.NewReader(strings.NewReader(outputs[script.KernelTelemetryScriptName].Stdout)) + records, err := reader.ReadAll() + if err != nil { + slog.Error("failed to read process telemetry CSV output", slog.String("error", err.Error())) + return []table.Field{} + } + if len(records) == 0 { + return []table.Field{} + } + // first row is the header, find the indices of the fields we're interested in + header := records[0] + fieldIndices := make(map[string]int) + for i, fieldName := range header { + fieldIndices[fieldName] = i + } + requiredFields := []string{"timestamp", "ctx_switches_per_sec"} + for _, field := range requiredFields { + if _, ok := fieldIndices[field]; !ok { + slog.Error("missing expected field in process telemetry output", slog.String("field", field)) + return []table.Field{} + } + } + // subsequent rows are data + for _, record := range records[1:] { + if len(record) != len(header) { + slog.Error("unexpected number of fields in process telemetry output", slog.Int("expected", len(header)), slog.Int("got", len(record))) + continue + } + fields[0].Values = append(fields[0].Values, record[fieldIndices["timestamp"]]) + fields[1].Values = append(fields[1].Values, record[fieldIndices["ctx_switches_per_sec"]]) + } + return fields +} diff --git a/internal/script/scripts.go b/internal/script/scripts.go index af856e87..31a32024 100644 --- a/internal/script/scripts.go +++ b/internal/script/scripts.go @@ -119,6 +119,7 @@ const ( InstructionTelemetryScriptName = "instruction telemetry" GaudiTelemetryScriptName = "gaudi telemetry" PDUTelemetryScriptName = "pdu telemetry" + KernelTelemetryScriptName = "kernel telemetry" // flamegraph scripts FlameGraphScriptName = "flamegraph" // lock scripts @@ -1441,6 +1442,89 @@ for ((i=0; i