Skip to content

Commit bb9e88b

Browse files
committed
Fix #34863: check container health during startup to fail early
1 parent 8c272c0 commit bb9e88b

14 files changed

Lines changed: 147 additions & 115 deletions

File tree

src/main/java/eu/openanalytics/containerproxy/backend/IContainerBackend.java

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -108,5 +108,11 @@ default Proxy addRuntimeValuesAfterSpel(Authentication user, ProxySpec spec, Pro
108108
return proxy;
109109
}
110110

111+
/**
112+
* Checks the health of the proxy, by checking the health of the underlying container.
113+
* Assumes that the container has started and reached the running state.
114+
* @param proxy the proxy to check the health of
115+
* @return whether the proxy is healthy
116+
*/
111117
boolean isProxyHealthy(Proxy proxy);
112118
}

src/main/java/eu/openanalytics/containerproxy/backend/dispatcher/proxysharing/ProxySharingDispatcher.java

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -48,7 +48,6 @@
4848
import org.slf4j.LoggerFactory;
4949
import org.springframework.beans.factory.annotation.Autowired;
5050
import org.springframework.context.ApplicationEventPublisher;
51-
import org.springframework.context.annotation.Lazy;
5251
import org.springframework.context.event.EventListener;
5352
import org.springframework.core.env.Environment;
5453
import org.springframework.security.core.Authentication;

src/main/java/eu/openanalytics/containerproxy/backend/docker/DockerSwarmBackend.java

Lines changed: 13 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -44,7 +44,6 @@
4444
import org.mandas.docker.client.LogStream;
4545
import org.mandas.docker.client.exceptions.DockerException;
4646
import org.mandas.docker.client.exceptions.ServiceNotFoundException;
47-
import org.mandas.docker.client.exceptions.TaskNotFoundException;
4847
import org.mandas.docker.client.messages.RegistryAuth;
4948
import org.mandas.docker.client.messages.mount.Mount;
5049
import org.mandas.docker.client.messages.swarm.DnsConfig;
@@ -88,6 +87,8 @@ public class DockerSwarmBackend extends AbstractDockerBackend {
8887

8988
private int serviceWaitTime;
9089

90+
private static final List<String> STARTING_STATES = List.of("new", "pending", "assigned", "accepted", "ready", "preparing", "starting", "running");
91+
9192
@PostConstruct
9293
public void initialize() {
9394
super.initialize();
@@ -221,21 +222,18 @@ public Proxy startContainer(Authentication user, Container initialContainer, Con
221222

222223
// Give the service some time to start up and launch a container.
223224
boolean containerFound = Retrying.retry((currentAttempt, maxAttempts) -> {
224-
try {
225-
Task serviceTask = dockerClient
226-
.listTasks(Task.Criteria.builder().serviceName(serviceName).build())
227-
.stream().findAny().orElseThrow(() -> new IllegalStateException("Swarm service has no tasks"));
228-
if (serviceTask.status().containerStatus() != null) {
229-
rContainerBuilder.id(serviceTask.status().containerStatus().containerId());
230-
return true;
231-
} else if (currentAttempt > 10 && log != null) {
232-
slog.info(proxy, String.format("Docker Swarm Service not ready yet, trying again (%d/%d)", currentAttempt, maxAttempts));
233-
}
234-
} catch (Exception e) {
235-
throw new RuntimeException("Failed to inspect swarm service tasks", e);
225+
Task serviceTask = dockerClient
226+
.listTasks(Task.Criteria.builder().serviceName(serviceName).build())
227+
.stream().findAny().orElseThrow(() -> new IllegalStateException("Swarm service has no tasks"));
228+
if (serviceTask.status().containerStatus() != null && serviceTask.status().state().equals("running")) {
229+
rContainerBuilder.id(serviceTask.status().containerStatus().containerId());
230+
return Retrying.SUCCESS;
231+
} else if (!STARTING_STATES.contains(serviceTask.status().state())) {
232+
slog.warn(proxy, "Docker Swarm container failed: container not running, state reported by docker: " + toJson(serviceTask.status()));
233+
return new Retrying.Result(false, false);
236234
}
237-
return false;
238-
}, serviceWaitTime, true);
235+
return Retrying.FAILURE;
236+
}, serviceWaitTime, "Docker Swarm Service", 10, proxy, slog);
239237

240238
if (!containerFound) {
241239
dockerClient.removeService(serviceId);

src/main/java/eu/openanalytics/containerproxy/backend/ecs/EcsBackend.java

Lines changed: 15 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -96,6 +96,8 @@ public class EcsBackend extends AbstractContainerBackend {
9696
private static final Pattern TAG_VALUE_PATTERN = Pattern.compile("^[a-zA-Z0-9 +-=._:/@]*$");
9797
private static final Pattern LOG_GORUP_REPLACE_PATTERN = Pattern.compile("[^a-zA-Z0-9_\\-.#]");
9898
private static final List<RuntimeValueKey<?>> IGNORED_RUNTIME_VALUES = Arrays.asList(PortMappingsKey.inst, UserGroupsKey.inst);
99+
private static final List<String> STARTING_STATES = List.of("PROVISIONING", "PENDING", "ACTIVATING");
100+
private static final List<String> STOPPING_STATES = List.of("DEACTIVATING", "STOPPING", "DEPROVISIONING", "STOPPED", "DELETED");
99101

100102
private EcsClient ecsClient;
101103
private Boolean enableCloudWatch;
@@ -235,15 +237,17 @@ public Proxy startContainer(Authentication user, Container initialContainer, Con
235237
.cluster(cluster)
236238
.tasks(taskArn));
237239

238-
if (response.hasTasks() && response.tasks().get(0).lastStatus().equals("RUNNING")) {
239-
return true;
240-
} else {
241-
if (currentAttempt > 10) {
242-
slog.info(proxy, String.format("ECS Task not ready yet, trying again (%d/%d)", currentAttempt, maxAttempts));
240+
if (response.hasTasks()) {
241+
Task task = response.tasks().get(0);
242+
if (task.lastStatus().equals("RUNNING")) {
243+
return Retrying.SUCCESS;
244+
} else if (!STARTING_STATES.contains(task.lastStatus()) || !task.desiredStatus().equals("RUNNING")) {
245+
slog.warn(proxy, String.format("ECS container failed: task not running, stopCode: '%s', stoppingAt: '%s', stoppedAt: '%s', stoppedReason: '%s'", task.stopCode(), task.stoppingAt(), task.stoppedAt(), task.stoppedReason()));
246+
return new Retrying.Result(false, false);
243247
}
244-
return false;
245248
}
246-
}, totalWaitMs);
249+
return Retrying.FAILURE;
250+
}, totalWaitMs, "ECS Task", 10, proxy, slog);
247251

248252
proxyStartupLogBuilder.containerStarted(initialContainer.getIndex());
249253

@@ -413,24 +417,19 @@ protected void doStopProxy(Proxy proxy) throws Exception {
413417
ecsClient.deleteTaskDefinitions(builder -> builder.taskDefinitions("sp-task-definition-" + proxy.getId() + ":1"));
414418
}
415419

416-
List<String> stoppingState = Arrays.asList("DEACTIVATING", "STOPPING", "DEPROVISIONING", "STOPPED", "DELETED");
417-
418420
boolean isInactive = Retrying.retry((currentAttempt, maxAttempts) -> {
419421
for (Container container : proxy.getContainers()) {
420422
String taskArn = container.getRuntimeValue(BackendContainerNameKey.inst);
421423
DescribeTasksResponse response = ecsClient.describeTasks(builder -> builder
422424
.cluster(cluster)
423425
.tasks(taskArn));
424426

425-
if (response.hasTasks() && !stoppingState.contains(response.tasks().get(0).lastStatus())) {
426-
if (currentAttempt > 10) {
427-
slog.info(proxy, String.format("ECS Task not in stopping state yet, trying again (%d/%d)", currentAttempt, maxAttempts));
428-
}
429-
return false;
427+
if (response.hasTasks() && !STOPPING_STATES.contains(response.tasks().get(0).lastStatus())) {
428+
return Retrying.FAILURE;
430429
}
431430
}
432-
return true;
433-
}, totalWaitMs);
431+
return Retrying.SUCCESS;
432+
}, totalWaitMs, "Stopping ECS Task", 10, proxy, slog);
434433

435434
if (!isInactive) {
436435
slog.warn(proxy, "Container did not get into stopping state");

src/main/java/eu/openanalytics/containerproxy/backend/kubernetes/KubernetesBackend.java

Lines changed: 35 additions & 30 deletions
Original file line numberDiff line numberDiff line change
@@ -93,7 +93,6 @@
9393
import org.apache.commons.compress.utils.IOUtils;
9494
import org.apache.commons.lang3.StringUtils;
9595
import org.springframework.boot.autoconfigure.condition.ConditionalOnProperty;
96-
import org.springframework.context.ApplicationEventPublisher;
9796
import org.springframework.security.core.Authentication;
9897
import org.springframework.stereotype.Component;
9998

@@ -349,14 +348,17 @@ public Proxy startContainer(Authentication user, Container initialContainer, Con
349348
Pod startedPod = kubeClient.pods().inNamespace(effectiveKubeNamespace).create(patchedPod);
350349

351350
boolean podReady = Retrying.retry((currentAttempt, maxAttempts) -> {
352-
if (!Readiness.getInstance().isReady(kubeClient.resource(startedPod).fromServer().get())) {
353-
if (currentAttempt > 10 && log != null) {
354-
slog.info(proxy, String.format("Kubernetes Pod not ready yet, trying again (%d/%d)", currentAttempt, maxAttempts));
355-
}
356-
return false;
351+
Pod pod = kubeClient.resource(startedPod).fromServer().get();
352+
Optional<String> error = getContainerFailure(pod);
353+
if (error.isPresent()) {
354+
slog.warn(proxy, error.get());
355+
return new Retrying.Result(false, false);
356+
}
357+
if (!Readiness.getInstance().isReady(pod)) {
358+
return Retrying.FAILURE;
357359
}
358-
return true;
359-
}, totalWaitMs);
360+
return Retrying.SUCCESS;
361+
}, totalWaitMs, "Kubernetes Pod", 10, proxy, slog);
360362

361363
if (!podReady) {
362364
// check a final time whether the pod is ready
@@ -396,7 +398,7 @@ public Proxy startContainer(Authentication user, Container initialContainer, Con
396398
.build());
397399

398400
// Workaround: waitUntilReady appears to be buggy.
399-
Retrying.retry((currentAttempt, maxAttempts) -> isServiceReady(kubeClient.resource(startupService).fromServer().get()), 60_000);
401+
Retrying.retry((currentAttempt, maxAttempts) -> new Retrying.Result(isServiceReady(kubeClient.resource(startupService).fromServer().get())), 60_000);
400402

401403
service = kubeClient.resource(startupService).fromServer().get();
402404
portBindings = service.getSpec().getPorts().stream()
@@ -660,7 +662,7 @@ protected void doStopProxy(Proxy proxy) {
660662
String podNamespace = podInfo.get().getNamespace();
661663
String podName = podInfo.get().getName();
662664

663-
if (getContainerFailure(proxy, podNamespace, podName).isPresent()) {
665+
if (getContainerFailure(podNamespace, podName).isPresent()) {
664666
logKubernetesWarnings(proxy, podNamespace, podName);
665667
}
666668

@@ -783,28 +785,31 @@ public List<ExistingContainerInfo> scanExistingContainers() {
783785
return containers;
784786
}
785787

786-
private Optional<String> getContainerFailure(Proxy proxy, String podNamespace, String podName) {
788+
private Optional<String> getContainerFailure(String podNamespace, String podName) {
787789
Pod pod = kubeClient.pods().inNamespace(podNamespace).withName(podName).get();
790+
return getContainerFailure(pod);
791+
}
792+
793+
private Optional<String> getContainerFailure(Pod pod) {
788794
if (pod == null) {
789795
return Optional.of("Kubernetes container failed, pod does not exist");
790-
} else {
791-
Optional<ContainerStatus> containerStatus = pod.getStatus().getContainerStatuses().stream().findFirst();
792-
ContainerState state = null;
793-
if (containerStatus.isPresent() && containerStatus.get().getState() != null) {
794-
state = containerStatus.get().getState();
795-
} else if (containerStatus.isPresent() && containerStatus.get().getLastState() != null) {
796-
state = containerStatus.get().getLastState();
797-
}
798-
if (state != null && state.getTerminated() != null) {
799-
String message = pod.getStatus().getMessage();
800-
String logs = state.getTerminated().getMessage();
801-
String reason = state.getTerminated().getReason();
802-
Integer exitCode = state.getTerminated().getExitCode();
803-
return Optional.of(String.format("Kubernetes pod failed, reason: '%s', exitCode: '%s', node: '%s', message:\n%s\nlogs:\n%s\n", reason, exitCode, pod.getSpec().getNodeName(), message, logs));
804-
}
805-
if (pod.isMarkedForDeletion()) {
806-
return Optional.of(String.format("Kubernetes pod is being terminated, node: '%s'", pod.getSpec().getNodeName()));
807-
}
796+
}
797+
Optional<ContainerStatus> containerStatus = pod.getStatus().getContainerStatuses().stream().findFirst();
798+
ContainerState state = null;
799+
if (containerStatus.isPresent() && containerStatus.get().getState() != null) {
800+
state = containerStatus.get().getState();
801+
} else if (containerStatus.isPresent() && containerStatus.get().getLastState() != null) {
802+
state = containerStatus.get().getLastState();
803+
}
804+
if (state != null && state.getTerminated() != null) {
805+
String message = pod.getStatus().getMessage();
806+
String logs = state.getTerminated().getMessage();
807+
String reason = state.getTerminated().getReason();
808+
Integer exitCode = state.getTerminated().getExitCode();
809+
return Optional.of(String.format("Kubernetes pod failed, reason: '%s', exitCode: '%s', node: '%s', message:\n%s\nlogs:\n%s\n", reason, exitCode, pod.getSpec().getNodeName(), message, logs));
810+
}
811+
if (pod.isMarkedForDeletion()) {
812+
return Optional.of(String.format("Kubernetes pod is being terminated, node: '%s'", pod.getSpec().getNodeName()));
808813
}
809814
return Optional.empty();
810815
}
@@ -817,7 +822,7 @@ public boolean isProxyHealthy(Proxy proxy) {
817822
continue;
818823
}
819824
for (int i = 0; i < 5; i++) {
820-
Optional<String> error = getContainerFailure(proxy, podInfo.get().getNamespace(), podInfo.get().getName());
825+
Optional<String> error = getContainerFailure(podInfo.get().getNamespace(), podInfo.get().getName());
821826
if (error.isPresent()) {
822827
slog.warn(proxy, error.get());
823828
return false;

src/main/java/eu/openanalytics/containerproxy/service/ProxyService.java

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -43,7 +43,6 @@
4343
import eu.openanalytics.containerproxy.spec.expression.SpecExpressionContext;
4444
import eu.openanalytics.containerproxy.spec.expression.SpecExpressionResolver;
4545
import eu.openanalytics.containerproxy.util.ProxyMappingManager;
46-
import eu.openanalytics.containerproxy.util.Retrying;
4746
import org.slf4j.Logger;
4847
import org.slf4j.LoggerFactory;
4948
import org.springframework.context.ApplicationEventPublisher;

src/main/java/eu/openanalytics/containerproxy/util/Retrying.java

Lines changed: 43 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,8 @@
2020
*/
2121
package eu.openanalytics.containerproxy.util;
2222

23+
import eu.openanalytics.containerproxy.model.runtime.Proxy;
24+
import eu.openanalytics.containerproxy.service.StructuredLogger;
2325
import org.slf4j.Logger;
2426
import org.slf4j.LoggerFactory;
2527

@@ -28,35 +30,49 @@ public class Retrying {
2830
private static final Logger log = LoggerFactory.getLogger(Retrying.class);
2931

3032
public static boolean retry(Attempt job, int maxDelay) {
31-
return retry(job, maxDelay, false);
33+
return retry(job, maxDelay, null, -1);
3234
}
3335

34-
public static boolean retry(Attempt job, int maxDelay, boolean retryOnException) {
35-
return retry(job, maxDelay, null, -1, retryOnException);
36+
public static boolean retry(Attempt job, int maxDelay, String logMessage, int logAfterAttempts) {
37+
return retry(job, maxDelay, logMessage, logAfterAttempts, null, null);
3638
}
3739

38-
public static boolean retry(Attempt job, int maxDelay, String logMessage, int logAfterAttmepts, boolean retryOnException) {
39-
boolean retVal = false;
40+
public static boolean retry(Attempt job, int maxDelay, String logMessage, int logAfterAttempts, Proxy proxy, StructuredLogger slog) {
4041
Exception exception = null;
4142
int maxAttempts = numberOfAttempts(maxDelay);
4243
for (int currentAttempt = 0; currentAttempt < maxAttempts; currentAttempt++) {
4344
delay(currentAttempt); // delay here so that we don't delay for the last iteration
4445
try {
45-
if (job.attempt(currentAttempt, maxAttempts)) {
46-
retVal = true;
47-
exception = null;
48-
break;
46+
Result result = job.attempt(currentAttempt, maxAttempts);
47+
if (!result.keepGoing) {
48+
if (result.success && currentAttempt > logAfterAttempts && logMessage != null) {
49+
if (slog != null && proxy != null) {
50+
slog.info(proxy, String.format("Ready: %s", logMessage));
51+
} else {
52+
log.info(String.format("Ready: %s", logMessage));
53+
}
54+
}
55+
return result.success;
4956
}
5057
} catch (Exception e) {
51-
if (retryOnException) exception = e;
52-
else throw new RuntimeException(e);
58+
exception = e;
5359
}
54-
if (currentAttempt > logAfterAttmepts && logMessage != null) {
55-
log.info(String.format("Retry: %s (%d/%d)", logMessage, currentAttempt, maxAttempts));
60+
if (currentAttempt > logAfterAttempts && logMessage != null) {
61+
if (slog != null && proxy != null) {
62+
slog.info(proxy, String.format("Waiting: %s (%d/%d)", logMessage, currentAttempt, maxAttempts));
63+
} else {
64+
log.info(String.format("Waiting: %s (%d/%d)", logMessage, currentAttempt, maxAttempts));
65+
}
5666
}
5767
}
58-
if (exception == null) return retVal;
59-
else throw new RuntimeException(exception);
68+
if (exception != null) {
69+
if (slog != null && proxy != null) {
70+
slog.warn(proxy, exception, String.format("Failed: %s", logMessage));
71+
} else {
72+
log.warn(String.format("Failed: %s", logMessage), exception);
73+
}
74+
}
75+
return false;
6076
}
6177

6278
public static void delay(Integer attempt) {
@@ -84,7 +100,18 @@ public static int numberOfAttempts(Integer maxDelay) {
84100

85101
@FunctionalInterface
86102
public interface Attempt {
87-
boolean attempt(int currentAttempt, int maxAttempts) throws Exception;
103+
Result attempt(int currentAttempt, int maxAttempts) throws Exception;
104+
}
105+
106+
public record Result(boolean success, boolean keepGoing) {
107+
108+
public Result(boolean success) {
109+
this(success, !success);
110+
}
111+
88112
}
89113

114+
public final static Result SUCCESS = new Result(true, false);
115+
public final static Result FAILURE = new Result(false, true);
116+
90117
}

0 commit comments

Comments
 (0)