Skip to content

Commit 2c940cf

Browse files
xinlian12annie-mac
and
annie-mac
authored
resetTransitTimeoutAndCancellationCountDuringHighCPULoad (#38157)
* reset transitTimeout and cancellationCount under high CPU load --------- Co-authored-by: annie-mac <xinlian@microsoft.com>
1 parent 0866382 commit 2c940cf

File tree

3 files changed

+17
-4
lines changed

3 files changed

+17
-4
lines changed

sdk/cosmos/azure-cosmos-tests/src/test/java/com/azure/cosmos/implementation/directconnectivity/rntbd/RntbdClientChannelHealthCheckerTests.java

+10
Original file line numberDiff line numberDiff line change
@@ -290,6 +290,7 @@ public void transitTimeoutOnWrite_HighCPULoadTests(boolean withFailureReason) th
290290
Mockito.when(timestampsMock.tansitTimeoutWriteCount()).thenReturn(writeTimeoutCount);
291291
Mockito.when(timestampsMock.lastChannelWriteTime()).thenReturn(lastChannelWriteTime);
292292
Mockito.when(timestampsMock.lastChannelWriteAttemptTime()).thenReturn(lastChannelWriteAttemptTime);
293+
Mockito.doNothing().when(timestampsMock).resetTransitTimeout();
293294

294295
try(MockedStatic<CpuMemoryMonitor> cpuMemoryMonitorMock = Mockito.mockStatic(CpuMemoryMonitor.class)) {
295296
CpuLoadHistory cpuLoadHistoryMock = Mockito.mock(CpuLoadHistory.class);
@@ -300,10 +301,14 @@ public void transitTimeoutOnWrite_HighCPULoadTests(boolean withFailureReason) th
300301
Future<String> healthyResult = healthChecker.isHealthyWithFailureReason(channelMock).sync();
301302
assertThat(healthyResult.isSuccess()).isTrue();
302303
assertThat(healthyResult.getNow()).isEqualTo(RntbdConstants.RntbdHealthCheckResults.SuccessValue);
304+
// Verify under high CPU load, the transitTimeout will be reset
305+
Mockito.verify(timestampsMock, Mockito.times(1)).resetTransitTimeout();
303306
} else {
304307
Future<Boolean> healthyResult = healthChecker.isHealthy(channelMock).sync();
305308
assertThat(healthyResult.isSuccess()).isTrue();
306309
assertThat(healthyResult.getNow()).isTrue();
310+
// Verify under high CPU load, the transitTimeout will be reset
311+
Mockito.verify(timestampsMock, Mockito.times(1)).resetTransitTimeout();
307312
}
308313
}
309314
}
@@ -394,6 +399,7 @@ public void cancellationPronenessOfChannelWithHighCpuLoad_Test(boolean withFailu
394399
Mockito.when(timestampsMock.lastChannelWriteAttemptTime()).thenReturn(lastChannelWriteTime);
395400
Mockito.when(timestampsMock.transitTimeoutCount()).thenReturn(0);
396401
Mockito.when(timestampsMock.cancellationCount()).thenReturn(config.cancellationCountSinceLastReadThreshold());
402+
Mockito.doNothing().when(timestampsMock).resetCancellationCount();
397403

398404
try(MockedStatic<CpuMemoryMonitor> cpuMemoryMonitorMock = Mockito.mockStatic(CpuMemoryMonitor.class)) {
399405
CpuLoadHistory cpuLoadHistoryMock = Mockito.mock(CpuLoadHistory.class);
@@ -404,10 +410,14 @@ public void cancellationPronenessOfChannelWithHighCpuLoad_Test(boolean withFailu
404410
Future<String> healthyResult = healthChecker.isHealthyWithFailureReason(channelMock).sync();
405411
assertThat(healthyResult.isSuccess()).isTrue();
406412
assertThat(healthyResult.getNow()).isEqualTo(RntbdConstants.RntbdHealthCheckResults.SuccessValue);
413+
// Verify under high CPU load, the cancellationCount will be reset
414+
Mockito.verify(timestampsMock, Mockito.times(1)).resetCancellationCount();
407415
} else {
408416
Future<Boolean> healthyResult = healthChecker.isHealthy(channelMock).sync();
409417
assertThat(healthyResult.isSuccess()).isTrue();
410418
assertThat(healthyResult.getNow()).isTrue();
419+
// Verify under high CPU load, the transitTimeout will be reset
420+
Mockito.verify(timestampsMock, Mockito.times(1)).resetCancellationCount();
411421
}
412422
}
413423
}

sdk/cosmos/azure-cosmos/CHANGELOG.md

+1
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@
1212
* Fixed a `NullPointerException` issue in `MetadataRequestRetryPolicy` when `locationEndpointToRoute` is not set. - See [PR 38094](https://github.com/Azure/azure-sdk-for-java/pull/38094)
1313

1414
#### Other Changes
15+
* Reset `transitTimeoutCount` and `cancellationCount` in `RntbdClientChannelHealthChecker` when CPU load is above threshold. - See [PR 38157](https://github.com/Azure/azure-sdk-for-java/pull/38157)
1516

1617
### 4.53.1 (2023-12-06)
1718

sdk/cosmos/azure-cosmos/src/main/java/com/azure/cosmos/implementation/directconnectivity/rntbd/RntbdClientChannelHealthChecker.java

+6-4
Original file line numberDiff line numberDiff line change
@@ -58,8 +58,6 @@ public final class RntbdClientChannelHealthChecker implements ChannelHealthCheck
5858
@JsonProperty
5959
private final long writeDelayLimitInNanos;
6060
@JsonProperty
61-
private final long networkRequestTimeoutInNanos;
62-
@JsonProperty
6361
private final boolean timeoutDetectionEnabled;
6462
@JsonProperty
6563
private final double timeoutDetectionDisableCPUThreshold;
@@ -78,7 +76,6 @@ public final class RntbdClientChannelHealthChecker implements ChannelHealthCheck
7876
@JsonProperty
7977
private final int cancellationCountSinceLastReadThreshold;
8078

81-
8279
// endregion
8380

8481
// region Constructors
@@ -98,7 +95,6 @@ public RntbdClientChannelHealthChecker(final Config config) {
9895
this.idleConnectionTimeoutInNanos = config.idleConnectionTimeoutInNanos();
9996
this.readDelayLimitInNanos = config.receiveHangDetectionTimeInNanos();
10097
this.writeDelayLimitInNanos = config.sendHangDetectionTimeInNanos();
101-
this.networkRequestTimeoutInNanos = config.tcpNetworkRequestTimeoutInNanos();
10298
this.timeoutDetectionEnabled = config.timeoutDetectionEnabled();
10399
this.timeoutDetectionDisableCPUThreshold = config.timeoutDetectionDisableCPUThreshold();
104100
this.timeoutTimeLimitInNanos = config.timeoutDetectionTimeLimitInNanos();
@@ -327,6 +323,9 @@ private String transitTimeoutValidation(Timestamps timestamps, Instant currentTi
327323
// When request timeout due to high CPU,
328324
// close the existing the connection and re-establish a new one will not help the issue but rather make it worse, return fast
329325
if (CpuMemoryMonitor.getCpuLoad().isCpuOverThreshold(this.timeoutDetectionDisableCPUThreshold)) {
326+
// reset the transit timeout here
327+
// else when the CPU back to below the threshold, the connection may still trigger a connection close right away
328+
timestamps.resetTransitTimeout();
330329
return transitTimeoutValidationMessage;
331330
}
332331

@@ -418,6 +417,9 @@ private String isCancellationProneChannel(Timestamps timestamps, Instant current
418417
// When request cancellations are due to high CPU,
419418
// close the existing the connection and re-establish a new one will not help the issue but rather make it worse, return fast
420419
if (CpuMemoryMonitor.getCpuLoad().isCpuOverThreshold(this.timeoutDetectionDisableCPUThreshold)) {
420+
// reset the cancellation count here
421+
// else when the CPU back to below the threshold, the connection may still trigger a connection close right away
422+
timestamps.resetCancellationCount();
421423
return errorMessage;
422424
}
423425

0 commit comments

Comments
 (0)