From 7d46254484901b0f5a9a926d8ce408a766de80b9 Mon Sep 17 00:00:00 2001 From: Devesh Kumar Singh Date: Fri, 29 May 2026 15:56:39 +0530 Subject: [PATCH] HDDS-15413. Recon and SCM Container Sync Metrics addition. --- .../metrics/ReconScmContainerSyncMetrics.java | 150 ++++++++++++++++-- .../ReconStorageContainerManagerFacade.java | 3 +- .../scm/ReconStorageContainerSyncHelper.java | 50 ++++++ .../TestReconScmContainerSyncMetrics.java | 92 +++++++++++ .../TestReconStorageContainerSyncHelper.java | 43 +++++ 5 files changed, 325 insertions(+), 13 deletions(-) create mode 100644 hadoop-ozone/recon/src/test/java/org/apache/hadoop/ozone/recon/metrics/TestReconScmContainerSyncMetrics.java diff --git a/hadoop-ozone/recon/src/main/java/org/apache/hadoop/ozone/recon/metrics/ReconScmContainerSyncMetrics.java b/hadoop-ozone/recon/src/main/java/org/apache/hadoop/ozone/recon/metrics/ReconScmContainerSyncMetrics.java index d652a63b4ef..2fabb8fb138 100644 --- a/hadoop-ozone/recon/src/main/java/org/apache/hadoop/ozone/recon/metrics/ReconScmContainerSyncMetrics.java +++ b/hadoop-ozone/recon/src/main/java/org/apache/hadoop/ozone/recon/metrics/ReconScmContainerSyncMetrics.java @@ -17,13 +17,22 @@ package org.apache.hadoop.ozone.recon.metrics; +import com.google.common.base.CaseFormat; +import java.util.Collections; +import java.util.EnumMap; +import java.util.Map; +import java.util.concurrent.atomic.AtomicInteger; +import java.util.concurrent.atomic.AtomicLong; import org.apache.hadoop.hdds.annotation.InterfaceAudience; +import org.apache.hadoop.hdds.protocol.proto.HddsProtos; +import org.apache.hadoop.metrics2.MetricsCollector; +import org.apache.hadoop.metrics2.MetricsInfo; +import org.apache.hadoop.metrics2.MetricsRecordBuilder; +import org.apache.hadoop.metrics2.MetricsSource; import org.apache.hadoop.metrics2.MetricsSystem; -import org.apache.hadoop.metrics2.annotation.Metric; import org.apache.hadoop.metrics2.annotation.Metrics; import org.apache.hadoop.metrics2.lib.DefaultMetricsSystem; -import org.apache.hadoop.metrics2.lib.MutableGaugeInt; -import org.apache.hadoop.metrics2.lib.MutableGaugeLong; +import org.apache.hadoop.metrics2.lib.Interns; import org.apache.hadoop.ozone.OzoneConsts; /** @@ -31,11 +40,26 @@ */ @InterfaceAudience.Private @Metrics(about = "Recon SCM Container Sync Metrics", context = OzoneConsts.OZONE) -public final class ReconScmContainerSyncMetrics { +public final class ReconScmContainerSyncMetrics implements MetricsSource { private static final String SOURCE_NAME = ReconScmContainerSyncMetrics.class.getSimpleName(); + private static final HddsProtos.LifeCycleState[] SYNC_STATES = { + HddsProtos.LifeCycleState.OPEN, + HddsProtos.LifeCycleState.QUASI_CLOSED, + HddsProtos.LifeCycleState.CLOSED, + HddsProtos.LifeCycleState.DELETED + }; + + private static final MetricsInfo TARGETED_SYNC_STATUS = Interns.info( + "targetedSyncStatus", + "Targeted sync status: 0=idle, 1=in progress, 2=success, 3=failure"); + + private static final MetricsInfo LAST_TARGETED_SYNC_DURATION_MS = Interns.info( + "lastTargetedSyncDurationMs", + "Time taken by the last targeted sync in milliseconds"); + /** * No targeted sync has run yet, or the latest scheduler cycle did not run one. */ @@ -53,14 +77,22 @@ public final class ReconScmContainerSyncMetrics { */ public static final int TARGETED_SYNC_STATUS_FAILURE = 3; - @Metric(about = "Targeted sync status: 0=idle, 1=in progress, " - + "2=success, 3=failure") - private MutableGaugeInt targetedSyncStatus; - - @Metric(about = "Time taken by the last targeted sync in milliseconds") - private MutableGaugeLong lastTargetedSyncDurationMs; + private final AtomicInteger targetedSyncStatus = new AtomicInteger(); + private final AtomicLong lastTargetedSyncDurationMs = new AtomicLong(); + private final Map + lastContainerSyncDurationMs; + private final Map + lastContainerCountDrift; + private final Map + containerSyncDurationMetricInfo; + private final Map + containerCountDriftMetricInfo; private ReconScmContainerSyncMetrics() { + lastContainerSyncDurationMs = initStateGaugeValues(); + lastContainerCountDrift = initStateGaugeValues(); + containerSyncDurationMetricInfo = initSyncDurationMetricInfo(); + containerCountDriftMetricInfo = initCountDriftMetricInfo(); } public static ReconScmContainerSyncMetrics create() { @@ -83,11 +115,105 @@ public void setLastTargetedSyncDurationMs(long durationMs) { lastTargetedSyncDurationMs.set(durationMs); } + public void setLastContainerSyncDurationMs( + HddsProtos.LifeCycleState state, long durationMs) { + setStateGauge(lastContainerSyncDurationMs, state, durationMs); + } + + public void setLastContainerCountDrift( + HddsProtos.LifeCycleState state, long drift) { + setStateGauge(lastContainerCountDrift, state, drift); + } + public int getTargetedSyncStatus() { - return targetedSyncStatus.value(); + return targetedSyncStatus.get(); } public long getLastTargetedSyncDurationMs() { - return lastTargetedSyncDurationMs.value(); + return lastTargetedSyncDurationMs.get(); + } + + public long getLastContainerSyncDurationMs( + HddsProtos.LifeCycleState state) { + return getStateGauge(lastContainerSyncDurationMs, state); + } + + public long getLastContainerCountDrift( + HddsProtos.LifeCycleState state) { + return getStateGauge(lastContainerCountDrift, state); + } + + @Override + public void getMetrics(MetricsCollector collector, boolean all) { + MetricsRecordBuilder builder = collector.addRecord(SOURCE_NAME); + builder.addGauge(TARGETED_SYNC_STATUS, getTargetedSyncStatus()); + builder.addGauge(LAST_TARGETED_SYNC_DURATION_MS, + getLastTargetedSyncDurationMs()); + for (HddsProtos.LifeCycleState state : SYNC_STATES) { + builder.addGauge(containerSyncDurationMetricInfo.get(state), + getLastContainerSyncDurationMs(state)); + builder.addGauge(containerCountDriftMetricInfo.get(state), + getLastContainerCountDrift(state)); + } + } + + private static Map + initStateGaugeValues() { + Map gauges = + new EnumMap<>(HddsProtos.LifeCycleState.class); + for (HddsProtos.LifeCycleState state : SYNC_STATES) { + gauges.put(state, new AtomicLong()); + } + return Collections.unmodifiableMap(gauges); + } + + private static Map + initSyncDurationMetricInfo() { + Map metrics = + new EnumMap<>(HddsProtos.LifeCycleState.class); + for (HddsProtos.LifeCycleState state : SYNC_STATES) { + String stateName = metricStateName(state); + metrics.put(state, Interns.info( + "last" + stateName + "ContainerSyncDurationMs", + "Time taken by the last " + stateName + + " container sync pass in milliseconds")); + } + return Collections.unmodifiableMap(metrics); + } + + private static Map + initCountDriftMetricInfo() { + Map metrics = + new EnumMap<>(HddsProtos.LifeCycleState.class); + for (HddsProtos.LifeCycleState state : SYNC_STATES) { + String stateName = metricStateName(state); + metrics.put(state, Interns.info( + "last" + stateName + "ContainerCountDrift", + "Last pre-sync observed " + stateName + + " container count drift, computed as SCM count minus Recon count")); + } + return Collections.unmodifiableMap(metrics); + } + + private static String metricStateName(HddsProtos.LifeCycleState state) { + return CaseFormat.UPPER_UNDERSCORE.to( + CaseFormat.UPPER_CAMEL, state.name()); + } + + private static void setStateGauge( + Map gauges, + HddsProtos.LifeCycleState state, + long value) { + AtomicLong gauge = gauges.get(state); + if (gauge != null) { + gauge.set(value); + } + } + + private static long getStateGauge( + Map gauges, + HddsProtos.LifeCycleState state) { + AtomicLong gauge = gauges.get(state); + return gauge != null ? gauge.get() : 0L; } } diff --git a/hadoop-ozone/recon/src/main/java/org/apache/hadoop/ozone/recon/scm/ReconStorageContainerManagerFacade.java b/hadoop-ozone/recon/src/main/java/org/apache/hadoop/ozone/recon/scm/ReconStorageContainerManagerFacade.java index 1712e711fe0..8b5d0f0f9ee 100644 --- a/hadoop-ozone/recon/src/main/java/org/apache/hadoop/ozone/recon/scm/ReconStorageContainerManagerFacade.java +++ b/hadoop-ozone/recon/src/main/java/org/apache/hadoop/ozone/recon/scm/ReconStorageContainerManagerFacade.java @@ -548,7 +548,8 @@ public ReconStorageContainerManagerFacade(OzoneConfiguration conf, containerSyncHelper = new ReconStorageContainerSyncHelper( scmServiceProvider, ozoneConfiguration, - containerManager + containerManager, + containerSyncMetrics ); } diff --git a/hadoop-ozone/recon/src/main/java/org/apache/hadoop/ozone/recon/scm/ReconStorageContainerSyncHelper.java b/hadoop-ozone/recon/src/main/java/org/apache/hadoop/ozone/recon/scm/ReconStorageContainerSyncHelper.java index 9a1aa48a1e1..8a81acae818 100644 --- a/hadoop-ozone/recon/src/main/java/org/apache/hadoop/ozone/recon/scm/ReconStorageContainerSyncHelper.java +++ b/hadoop-ozone/recon/src/main/java/org/apache/hadoop/ozone/recon/scm/ReconStorageContainerSyncHelper.java @@ -42,7 +42,9 @@ import org.apache.hadoop.hdds.scm.container.ContainerNotFoundException; import org.apache.hadoop.hdds.scm.container.common.helpers.ContainerWithPipeline; import org.apache.hadoop.ozone.common.statemachine.InvalidStateTransitionException; +import org.apache.hadoop.ozone.recon.metrics.ReconScmContainerSyncMetrics; import org.apache.hadoop.ozone.recon.spi.StorageContainerServiceProvider; +import org.apache.hadoop.util.Time; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -140,13 +142,22 @@ class ReconStorageContainerSyncHelper { private final StorageContainerServiceProvider scmServiceProvider; private final OzoneConfiguration ozoneConfiguration; private final ReconContainerManager containerManager; + private final ReconScmContainerSyncMetrics containerSyncMetrics; ReconStorageContainerSyncHelper(StorageContainerServiceProvider scmServiceProvider, OzoneConfiguration ozoneConfiguration, ReconContainerManager containerManager) { + this(scmServiceProvider, ozoneConfiguration, containerManager, null); + } + + ReconStorageContainerSyncHelper(StorageContainerServiceProvider scmServiceProvider, + OzoneConfiguration ozoneConfiguration, + ReconContainerManager containerManager, + ReconScmContainerSyncMetrics containerSyncMetrics) { this.scmServiceProvider = scmServiceProvider; this.ozoneConfiguration = ozoneConfiguration; this.containerManager = containerManager; + this.containerSyncMetrics = containerSyncMetrics; } /** @@ -167,8 +178,10 @@ public boolean syncWithSCMContainerInfo() { */ private boolean syncContainersForState(HddsProtos.LifeCycleState scmState, boolean incrementalOpen) { + long startTime = Time.monotonicNow(); try { long total = scmServiceProvider.getContainerCount(scmState); + updateContainerCountDrift(scmState, total); if (total == 0) { LOG.debug("{} sync: no containers found in SCM.", scmState); return true; @@ -222,6 +235,8 @@ private boolean syncContainersForState(HddsProtos.LifeCycleState scmState, } catch (Exception e) { LOG.error("{} sync: unexpected error.", scmState, e); return false; + } finally { + updateContainerSyncDuration(scmState, Time.monotonicNow() - startTime); } } @@ -359,7 +374,9 @@ private int rebuildContainerFromScm(ContainerID containerID, * @return {@code true} if all RPC calls completed without error */ private boolean syncDeletedContainers() { + long startTime = Time.monotonicNow(); try { + updateDeletedContainerCountDrift(); int configuredBatch = ozoneConfiguration.getInt( OZONE_RECON_SCM_DELETED_CONTAINER_CHECK_BATCH_SIZE, OZONE_RECON_SCM_DELETED_CONTAINER_CHECK_BATCH_SIZE_DEFAULT); @@ -390,6 +407,39 @@ private boolean syncDeletedContainers() { } catch (Exception e) { LOG.error("DELETED sync: unexpected error.", e); return false; + } finally { + updateContainerSyncDuration(HddsProtos.LifeCycleState.DELETED, + Time.monotonicNow() - startTime); + } + } + + private void updateDeletedContainerCountDrift() { + if (containerSyncMetrics == null) { + return; + } + try { + long total = scmServiceProvider.getContainerCount( + HddsProtos.LifeCycleState.DELETED); + updateContainerCountDrift(HddsProtos.LifeCycleState.DELETED, total); + } catch (Exception e) { + LOG.warn("DELETED sync: unable to update pre-sync count drift metric.", e); + } + } + + private void updateContainerCountDrift(HddsProtos.LifeCycleState state, + long scmCount) { + if (containerSyncMetrics == null) { + return; + } + long reconCount = containerManager.getContainerStateCount(state); + containerSyncMetrics.setLastContainerCountDrift(state, + scmCount - reconCount); + } + + private void updateContainerSyncDuration(HddsProtos.LifeCycleState state, + long durationMs) { + if (containerSyncMetrics != null) { + containerSyncMetrics.setLastContainerSyncDurationMs(state, durationMs); } } diff --git a/hadoop-ozone/recon/src/test/java/org/apache/hadoop/ozone/recon/metrics/TestReconScmContainerSyncMetrics.java b/hadoop-ozone/recon/src/test/java/org/apache/hadoop/ozone/recon/metrics/TestReconScmContainerSyncMetrics.java new file mode 100644 index 00000000000..69eb45ee6e8 --- /dev/null +++ b/hadoop-ozone/recon/src/test/java/org/apache/hadoop/ozone/recon/metrics/TestReconScmContainerSyncMetrics.java @@ -0,0 +1,92 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.ozone.recon.metrics; + +import static org.apache.hadoop.hdds.protocol.proto.HddsProtos.LifeCycleState.CLOSED; +import static org.apache.hadoop.hdds.protocol.proto.HddsProtos.LifeCycleState.CLOSING; +import static org.apache.hadoop.hdds.protocol.proto.HddsProtos.LifeCycleState.DELETED; +import static org.apache.hadoop.hdds.protocol.proto.HddsProtos.LifeCycleState.DELETING; +import static org.apache.hadoop.hdds.protocol.proto.HddsProtos.LifeCycleState.OPEN; +import static org.apache.hadoop.hdds.protocol.proto.HddsProtos.LifeCycleState.QUASI_CLOSED; +import static org.apache.hadoop.metrics2.lib.Interns.info; +import static org.apache.ozone.test.MetricsAsserts.eqName; +import static org.apache.ozone.test.MetricsAsserts.getLongGauge; +import static org.apache.ozone.test.MetricsAsserts.getMetrics; +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.mockito.Mockito.eq; +import static org.mockito.Mockito.never; +import static org.mockito.Mockito.verify; + +import org.apache.hadoop.metrics2.MetricsRecordBuilder; +import org.junit.jupiter.api.AfterEach; +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.Test; + +/** + * Tests for Recon SCM container sync metrics. + */ +class TestReconScmContainerSyncMetrics { + + private ReconScmContainerSyncMetrics metrics; + + @BeforeEach + void setUp() { + metrics = ReconScmContainerSyncMetrics.create(); + } + + @AfterEach + void tearDown() { + metrics.unRegister(); + } + + @Test + void testStateMetricsAreEmittedForReconciledStatesOnly() { + metrics.setLastContainerSyncDurationMs(OPEN, 10L); + metrics.setLastContainerSyncDurationMs(QUASI_CLOSED, 20L); + metrics.setLastContainerSyncDurationMs(CLOSED, 30L); + metrics.setLastContainerSyncDurationMs(DELETED, 40L); + metrics.setLastContainerCountDrift(OPEN, 2L); + metrics.setLastContainerCountDrift(QUASI_CLOSED, 0L); + metrics.setLastContainerCountDrift(CLOSED, -3L); + metrics.setLastContainerCountDrift(DELETED, 4L); + metrics.setLastContainerCountDrift(CLOSING, 100L); + metrics.setLastContainerSyncDurationMs(DELETING, 200L); + + MetricsRecordBuilder builder = getMetrics(metrics); + + assertEquals(10L, getLongGauge("lastOpenContainerSyncDurationMs", builder)); + assertEquals(20L, + getLongGauge("lastQuasiClosedContainerSyncDurationMs", builder)); + assertEquals(30L, getLongGauge("lastClosedContainerSyncDurationMs", builder)); + assertEquals(40L, getLongGauge("lastDeletedContainerSyncDurationMs", builder)); + assertEquals(2L, getLongGauge("lastOpenContainerCountDrift", builder)); + assertEquals(0L, + getLongGauge("lastQuasiClosedContainerCountDrift", builder)); + assertEquals(-3L, getLongGauge("lastClosedContainerCountDrift", builder)); + assertEquals(4L, getLongGauge("lastDeletedContainerCountDrift", builder)); + + verify(builder, never()).addGauge( + eqName(info("lastClosingContainerSyncDurationMs", "")), eq(100L)); + verify(builder, never()).addGauge( + eqName(info("lastDeletingContainerSyncDurationMs", "")), eq(200L)); + verify(builder, never()).addGauge( + eqName(info("lastClosingContainerCountDrift", "")), eq(100L)); + verify(builder, never()).addGauge( + eqName(info("lastDeletingContainerCountDrift", "")), eq(200L)); + } +} diff --git a/hadoop-ozone/recon/src/test/java/org/apache/hadoop/ozone/recon/scm/TestReconStorageContainerSyncHelper.java b/hadoop-ozone/recon/src/test/java/org/apache/hadoop/ozone/recon/scm/TestReconStorageContainerSyncHelper.java index 03b771e7ed2..3d78c8313d7 100644 --- a/hadoop-ozone/recon/src/test/java/org/apache/hadoop/ozone/recon/scm/TestReconStorageContainerSyncHelper.java +++ b/hadoop-ozone/recon/src/test/java/org/apache/hadoop/ozone/recon/scm/TestReconStorageContainerSyncHelper.java @@ -18,8 +18,12 @@ package org.apache.hadoop.ozone.recon.scm; import static org.apache.hadoop.hdds.protocol.proto.HddsProtos.LifeCycleState.CLOSED; +import static org.apache.hadoop.hdds.protocol.proto.HddsProtos.LifeCycleState.DELETED; +import static org.apache.hadoop.hdds.protocol.proto.HddsProtos.LifeCycleState.OPEN; +import static org.apache.hadoop.hdds.protocol.proto.HddsProtos.LifeCycleState.QUASI_CLOSED; import static org.apache.hadoop.hdds.protocol.proto.HddsProtos.ReplicationFactor.ONE; import static org.apache.hadoop.ozone.recon.ReconServerConfigKeys.OZONE_RECON_SCM_CONTAINER_ID_BATCH_SIZE; +import static org.junit.jupiter.api.Assertions.assertEquals; import static org.junit.jupiter.api.Assertions.assertTrue; import static org.mockito.ArgumentMatchers.any; import static org.mockito.ArgumentMatchers.argThat; @@ -36,6 +40,7 @@ import org.apache.hadoop.hdds.scm.container.ContainerID; import org.apache.hadoop.hdds.scm.container.ContainerInfo; import org.apache.hadoop.hdds.scm.container.common.helpers.ContainerWithPipeline; +import org.apache.hadoop.ozone.recon.metrics.ReconScmContainerSyncMetrics; import org.apache.hadoop.ozone.recon.spi.StorageContainerServiceProvider; import org.junit.jupiter.api.Test; @@ -57,6 +62,44 @@ class TestReconStorageContainerSyncHelper { ); } + @Test + void testContainerSyncMetricsTrackPreSyncDriftAndDuration() throws Exception { + ReconScmContainerSyncMetrics metrics = ReconScmContainerSyncMetrics.create(); + try { + ReconStorageContainerSyncHelper helperWithMetrics = + new ReconStorageContainerSyncHelper( + mockScmServiceProvider, + new OzoneConfiguration(), + mockContainerManager, + metrics); + + when(mockScmServiceProvider.getContainerCount(OPEN)).thenReturn(5L); + when(mockScmServiceProvider.getContainerCount(QUASI_CLOSED)).thenReturn(1L); + when(mockScmServiceProvider.getContainerCount(CLOSED)).thenReturn(8L); + when(mockScmServiceProvider.getContainerCount(DELETED)).thenReturn(9L); + when(mockContainerManager.getContainerStateCount(OPEN)).thenReturn(3); + when(mockContainerManager.getContainerStateCount(QUASI_CLOSED)).thenReturn(1); + when(mockContainerManager.getContainerStateCount(CLOSED)).thenReturn(10); + when(mockContainerManager.getContainerStateCount(DELETED)).thenReturn(7); + when(mockScmServiceProvider.getListOfContainerIDs( + any(), any(Integer.class), any())).thenReturn(Collections.emptyList()); + + boolean result = helperWithMetrics.syncWithSCMContainerInfo(); + + assertTrue(result); + assertEquals(2L, metrics.getLastContainerCountDrift(OPEN)); + assertEquals(0L, metrics.getLastContainerCountDrift(QUASI_CLOSED)); + assertEquals(-2L, metrics.getLastContainerCountDrift(CLOSED)); + assertEquals(2L, metrics.getLastContainerCountDrift(DELETED)); + assertTrue(metrics.getLastContainerSyncDurationMs(OPEN) >= 0); + assertTrue(metrics.getLastContainerSyncDurationMs(QUASI_CLOSED) >= 0); + assertTrue(metrics.getLastContainerSyncDurationMs(CLOSED) >= 0); + assertTrue(metrics.getLastContainerSyncDurationMs(DELETED) >= 0); + } finally { + metrics.unRegister(); + } + } + @Test void testContainerMissingFromReconIsAdded() throws Exception { ContainerID cid = ContainerID.valueOf(42L);