Skip to content

Commit c9a6eee

Browse files
committed
Fix #6313: add backoff delay for scan server ERROR retries
1 parent 1f18f66 commit c9a6eee

2 files changed

Lines changed: 65 additions & 1 deletion

File tree

core/src/main/java/org/apache/accumulo/core/spi/scan/ConfigurableScanServerSelector.java

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -481,6 +481,14 @@ public Duration getBusyTimeout() {
481481

482482
Duration busyTO = Duration.ofMillis(profile.getBusyTimeout(maxAttempts));
483483

484+
int maxErrorAttempts =
485+
params.getTablets().stream()
486+
.mapToInt(tablet -> (int) params.getAttempts(tablet).stream()
487+
.filter(a -> a.getResult() == ScanServerAttempt.Result.ERROR).count())
488+
.max().orElse(0);
489+
Duration delay = maxErrorAttempts == 0 ? Duration.ZERO
490+
: Duration.ofMillis(Math.min(5000L, 100L * (1L << Math.min(maxErrorAttempts - 1, 30))));
491+
484492
LOG.trace("Returning servers to use: {}", serversToUse);
485493
return new ScanServerSelections() {
486494
@Override
@@ -490,7 +498,7 @@ public String getScanServer(TabletId tabletId) {
490498

491499
@Override
492500
public Duration getDelay() {
493-
return Duration.ZERO;
501+
return delay;
494502
}
495503

496504
@Override

core/src/test/java/org/apache/accumulo/core/spi/scan/ConfigurableScanServerSelectorTest.java

Lines changed: 56 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,7 @@
3030
import java.util.Collection;
3131
import java.util.HashMap;
3232
import java.util.HashSet;
33+
import java.util.List;
3334
import java.util.Map;
3435
import java.util.Optional;
3536
import java.util.Set;
@@ -608,6 +609,61 @@ public void testServerSetChanges() throws Exception {
608609
assertEquals(30, allServersSeen.size());
609610
}
610611

612+
/**
 * Test that the delay returned with the selections backs off based on the number of previous
 * ERROR attempts for a tablet: zero when there are no attempts or only BUSY attempts, 100ms for
 * the first ERROR attempt, doubling with each additional ERROR attempt, and capped at 5000ms.
 */
@Test
613+
public void testErrorDelay() {
614+
ConfigurableScanServerSelector selector = new ConfigurableScanServerSelector();
615+
selector.init(new InitParams(
616+
Set.of("ss1:1", "ss2:2", "ss3:3", "ss4:4", "ss5:5", "ss6:6", "ss7:7", "ss8:8")));
617+
618+
var tabletId = nti("1", "m");
619+
620+
// no previous attempts: no delay
621+
assertEquals(0, selector.selectServers(new SelectorParams(tabletId)).getDelay().toMillis());
622+
623+
// BUSY attempts only: no error-based delay (only ERROR results are counted)
624+
var busyAttempts = List.of(new TestScanServerAttempt("ss1:1", ScanServerAttempt.Result.BUSY));
625+
Map<TabletId,Collection<? extends ScanServerAttempt>> attempts = Map.of(tabletId, busyAttempts);
626+
assertEquals(0, selector.selectServers(new SelectorParams(tabletId, attempts, Map.of()))
627+
.getDelay().toMillis());
628+
629+
// 1 ERROR attempt: 100ms (base delay)
630+
var err1 = List.of(new TestScanServerAttempt("ss1:1", ScanServerAttempt.Result.ERROR));
631+
attempts = Map.of(tabletId, err1);
632+
assertEquals(100, selector.selectServers(new SelectorParams(tabletId, attempts, Map.of()))
633+
.getDelay().toMillis());
634+
635+
// 2 ERROR attempts: 200ms
636+
var err2 = List.of(new TestScanServerAttempt("ss1:1", ScanServerAttempt.Result.ERROR),
637+
new TestScanServerAttempt("ss2:2", ScanServerAttempt.Result.ERROR));
638+
attempts = Map.of(tabletId, err2);
639+
assertEquals(200, selector.selectServers(new SelectorParams(tabletId, attempts, Map.of()))
640+
.getDelay().toMillis());
641+
642+
// 3 ERROR attempts: 400ms
643+
var err3 = List.of(new TestScanServerAttempt("ss1:1", ScanServerAttempt.Result.ERROR),
644+
new TestScanServerAttempt("ss2:2", ScanServerAttempt.Result.ERROR),
645+
new TestScanServerAttempt("ss3:3", ScanServerAttempt.Result.ERROR));
646+
attempts = Map.of(tabletId, err3);
647+
assertEquals(400, selector.selectServers(new SelectorParams(tabletId, attempts, Map.of()))
648+
.getDelay().toMillis());
649+
650+
// 6 ERROR attempts: 100ms * 2^5 = 3200ms, the last value below the 5000ms cap
// NOTE(review): the generated attempts use "ssN:1", which does not match the "ssN:N" servers
// passed to init above - the delay only counts ERROR results, but confirm this is intentional
651+
var err6 = Stream.iterate(1, i -> i <= 6, i -> i + 1)
652+
.map(i -> new TestScanServerAttempt("ss" + i + ":1", ScanServerAttempt.Result.ERROR))
653+
.collect(Collectors.toList());
654+
attempts = Map.of(tabletId, err6);
655+
assertEquals(3200, selector.selectServers(new SelectorParams(tabletId, attempts, Map.of()))
656+
.getDelay().toMillis());
657+
658+
// 7 ERROR attempts: 100ms * 2^6 = 6400ms exceeds the cap, so the delay is capped at 5000ms
659+
var err7 = Stream.iterate(1, i -> i <= 7, i -> i + 1)
660+
.map(i -> new TestScanServerAttempt("ss" + i + ":1", ScanServerAttempt.Result.ERROR))
661+
.collect(Collectors.toList());
662+
attempts = Map.of(tabletId, err7);
663+
assertEquals(5000, selector.selectServers(new SelectorParams(tabletId, attempts, Map.of()))
664+
.getDelay().toMillis());
665+
}
666+
611667
/**
612668
* Test that previous failures are not used again unless all servers have failed
613669
*/

0 commit comments

Comments
 (0)