Skip to content

Commit 099f6bd

Browse files
committed
Fix #34197: show logs for failed delegate proxies
1 parent ceb3e14 commit 099f6bd

8 files changed

Lines changed: 90 additions & 14 deletions

File tree

src/main/java/eu/openanalytics/containerproxy/backend/dispatcher/proxysharing/ProxySharingMicrometer.java

Lines changed: 67 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,12 @@
2020
*/
2121
package eu.openanalytics.containerproxy.backend.dispatcher.proxysharing;
2222

23+
import com.github.benmanes.caffeine.cache.Cache;
24+
import com.github.benmanes.caffeine.cache.Caffeine;
25+
import com.github.benmanes.caffeine.cache.Scheduler;
2326
import eu.openanalytics.containerproxy.backend.dispatcher.proxysharing.store.DelegateProxy;
27+
import eu.openanalytics.containerproxy.backend.dispatcher.proxysharing.store.DelegateProxyStatus;
28+
import eu.openanalytics.containerproxy.event.NewProxyEvent;
2429
import eu.openanalytics.containerproxy.model.runtime.Proxy;
2530
import eu.openanalytics.containerproxy.model.runtime.runtimevalues.BackendContainerName;
2631
import eu.openanalytics.containerproxy.model.runtime.runtimevalues.BackendContainerNameKey;
@@ -30,6 +35,7 @@
3035
import io.micrometer.core.instrument.Tags;
3136
import io.micrometer.core.instrument.search.MeterNotFoundException;
3237
import org.springframework.beans.factory.annotation.Autowired;
38+
import org.springframework.context.event.EventListener;
3339

3440
import javax.annotation.PostConstruct;
3541
import javax.inject.Inject;
@@ -40,6 +46,7 @@
4046
import java.util.Map;
4147
import java.util.Timer;
4248
import java.util.TimerTask;
49+
import java.util.concurrent.TimeUnit;
4350
import java.util.function.ToDoubleFunction;
4451
import java.util.stream.Collectors;
4552

@@ -56,6 +63,16 @@ public class ProxySharingMicrometer implements IStatCollector {
5663
@Autowired(required = false)
5764
private List<ProxySharingScaler> proxySharingScalers = new ArrayList<>();
5865

66+
private final List<String> specIds = new ArrayList<>();
67+
68+
private static final Map<DelegateProxyStatus, Integer> PROXY_STATUS_TO_INTEGER = Map.of(
69+
DelegateProxyStatus.Pending, 1,
70+
DelegateProxyStatus.Available, 10,
71+
DelegateProxyStatus.ToRemove, 20
72+
);
73+
74+
private Cache<String, String> recentProxies;
75+
5976
/**
6077
* Wraps a function that returns an Integer into a function that returns a double.
6178
* When the provided Integer is null, the resulting function returns Double.NaN.
@@ -74,8 +91,13 @@ private static <T> ToDoubleFunction<T> wrapHandleNull(ToLongFunction<T> producer
7491

7592
@PostConstruct
7693
public void init() {
94+
recentProxies = Caffeine.newBuilder()
95+
.scheduler(Scheduler.systemScheduler())
96+
.expireAfterWrite(2, TimeUnit.MINUTES)
97+
.build();
7798
for (ProxySharingDispatcher dispatcher : proxySharingDispatchers) {
7899
String specId = dispatcher.getSpec().getId();
100+
specIds.add(specId);
79101
registry.timer("seats_wait_time", "spec.id", specId);
80102
}
81103
for (ProxySharingScaler scaler : proxySharingScalers) {
@@ -96,16 +118,40 @@ public void registerSeatWaitTime(String specId, Duration time) {
96118
registry.timer("seats_wait_time", "spec.id", specId).record(time);
97119
}
98120

121+
@EventListener
122+
public void onNewProxyEvent(NewProxyEvent event) {
123+
if (event.getUserId() != null || event.getBackendContainerName() == null) {
124+
return;
125+
}
126+
if (specIds.contains(event.getSpecId())) {
127+
recentProxies.put(event.getProxyId(), event.getProxyId());
128+
registry.gauge("delegate_app_info",
129+
Tags.of(
130+
"spec.id", event.getSpecId(),
131+
"proxy.id", event.getProxyId(),
132+
"proxy.created.timestamp", Long.toString(event.getCreatedTimestamp()),
133+
"resource.id", event.getBackendContainerName().getName(),
134+
"proxy.namespace", event.getBackendContainerName().getNamespace()),
135+
PROXY_STATUS_TO_INTEGER.get(DelegateProxyStatus.Pending)
136+
);
137+
}
138+
}
139+
99140
private void updateDelegateAppInfo() {
100141
Map<String, Gauge> existingGauges = getDelegateAppInfoGauges();
101142
for (ProxySharingScaler scaler : proxySharingScalers) {
102143
String specId = scaler.getSpec().getId();
103144
for (DelegateProxy delegateProxy : scaler.getAllDelegateProxies()) {
104145
Proxy proxy = delegateProxy.getProxy();
105-
if (existingGauges.remove(proxy.getId()) != null) {
106-
// gauge already exists, no need to re-create
146+
recentProxies.put(proxy.getId(), proxy.getId());
147+
Gauge existingGauge = existingGauges.remove(proxy.getId());
148+
if (existingGauge != null && existingGauge.value() == PROXY_STATUS_TO_INTEGER.get(delegateProxy.getDelegateProxyStatus())) {
149+
// gauge already exists and value is correct
107150
continue;
108151
}
152+
if (existingGauge != null) {
153+
registry.remove(existingGauge);
154+
}
109155

110156
BackendContainerName backendContainerName = getBackendContainerName(proxy);
111157
if (backendContainerName == null) {
@@ -120,11 +166,29 @@ private void updateDelegateAppInfo() {
120166
"proxy.created.timestamp", Long.toString(proxy.getCreatedTimestamp()),
121167
"resource.id", backendContainerName.getName(),
122168
"proxy.namespace", backendContainerName.getNamespace()),
123-
1
169+
PROXY_STATUS_TO_INTEGER.get(delegateProxy.getDelegateProxyStatus())
124170
);
125171
}
126172
}
127173
for (Gauge gauge : existingGauges.values()) {
174+
String proxyId = gauge.getId().getTag("proxy.id");
175+
if (proxyId != null && recentProxies.getIfPresent(proxyId) != null) {
176+
// this DelegateProxy has been removed, mark it as ToRemove
177+
// when the TTL of this proxy in recentProxies expires, the gauge will be removed
178+
// this waiting period allows the metric system to pick up that the proxy is being removed
179+
registry.remove(gauge);
180+
registry.gauge("delegate_app_info",
181+
Tags.of(
182+
"spec.id", gauge.getId().getTag("spec.id"),
183+
"proxy.id", gauge.getId().getTag("proxy.id"),
184+
"proxy.created.timestamp", gauge.getId().getTag("proxy.created.timestamp"),
185+
"resource.id", gauge.getId().getTag("resource.id"),
186+
"proxy.namespace", gauge.getId().getTag("proxy.namespace")),
187+
PROXY_STATUS_TO_INTEGER.get(DelegateProxyStatus.ToRemove)
188+
);
189+
continue;
190+
}
191+
128192
// the proxy of this gauge no longer exists -> remove the gauge
129193
registry.remove(gauge);
130194
}

src/main/java/eu/openanalytics/containerproxy/backend/dispatcher/proxysharing/ProxySharingScaler.java

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -62,6 +62,7 @@
6262
import eu.openanalytics.containerproxy.util.Sha1;
6363
import org.slf4j.Logger;
6464
import org.slf4j.LoggerFactory;
65+
import org.springframework.beans.factory.annotation.Autowired;
6566
import org.springframework.context.ApplicationEventPublisher;
6667
import org.springframework.context.event.EventListener;
6768
import org.springframework.core.env.Environment;
@@ -122,6 +123,8 @@ public class ProxySharingScaler implements AutoCloseable {
122123
private ApplicationEventPublisher applicationEventPublisher;
123124
@Inject
124125
private Environment environment;
126+
@Autowired(required = false)
127+
private ProxySharingMicrometer proxySharingMicrometer = null;
125128

126129
public ProxySharingScaler(ISeatStore seatStore, ProxySpec proxySpec, IDelegateProxyStore delegateProxyStore) {
127130
this.specExtension = proxySpec.getSpecExtension(ProxySharingSpecExtension.class);

src/main/java/eu/openanalytics/containerproxy/backend/docker/DockerEngineBackend.java

Lines changed: 1 addition & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -183,9 +183,7 @@ public Proxy startContainer(Authentication user, Container initialContainer, Con
183183
proxyStartupLogBuilder.startingContainer(initialContainer.getIndex());
184184
String containerName = spec.getResourceName().getValueOrDefault("sp-container-" + proxy.getId() + "-" + initialContainer.getIndex());
185185
rContainerBuilder.addRuntimeValue(new RuntimeValue(BackendContainerNameKey.inst, new BackendContainerName(containerName)), false);
186-
if (user != null) {
187-
applicationEventPublisher.publishEvent(new NewProxyEvent(proxy.toBuilder().updateContainer(rContainerBuilder.build()).build(), user));
188-
}
186+
applicationEventPublisher.publishEvent(new NewProxyEvent(proxy.toBuilder().updateContainer(rContainerBuilder.build()).build(), user));
189187

190188
ContainerCreation containerCreation = dockerClient.createContainer(containerConfig, containerName);
191189
rContainerBuilder.id(containerCreation.id());

src/main/java/eu/openanalytics/containerproxy/backend/ecs/EcsBackend.java

Lines changed: 1 addition & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -228,9 +228,7 @@ public Proxy startContainer(Authentication user, Container initialContainer, Con
228228
String taskArn = runTaskResponse.tasks().get(0).taskArn();
229229

230230
rContainerBuilder.addRuntimeValue(new RuntimeValue(BackendContainerNameKey.inst, new BackendContainerName(taskArn)), false);
231-
if (user != null) {
232-
applicationEventPublisher.publishEvent(new NewProxyEvent(proxy.toBuilder().updateContainer(rContainerBuilder.build()).build(), user));
233-
}
231+
applicationEventPublisher.publishEvent(new NewProxyEvent(proxy.toBuilder().updateContainer(rContainerBuilder.build()).build(), user));
234232

235233
boolean serviceReady = Retrying.retry((currentAttempt, maxAttempts) -> {
236234
DescribeTasksResponse response = ecsClient.describeTasks(builder -> builder

src/main/java/eu/openanalytics/containerproxy/backend/kubernetes/KubernetesBackend.java

Lines changed: 1 addition & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -337,9 +337,7 @@ public Proxy startContainer(Authentication user, Container initialContainer, Con
337337
// set the BackendContainerName now, so that the pod can be deleted in case other steps of this function fails
338338
rContainerBuilder.addRuntimeValue(new RuntimeValue(BackendContainerNameKey.inst, new BackendContainerName(effectiveKubeNamespace, patchedPod.getMetadata().getName())), false);
339339

340-
if (user != null) {
341-
applicationEventPublisher.publishEvent(new NewProxyEvent(proxy.toBuilder().updateContainer(rContainerBuilder.build()).build(), user));
342-
}
340+
applicationEventPublisher.publishEvent(new NewProxyEvent(proxy.toBuilder().updateContainer(rContainerBuilder.build()).build(), user));
343341

344342
// create additional manifests -> use the effective (i.e. patched) namespace if no namespace is provided
345343
createAdditionalManifests(user, proxySpec, proxy, specExtension, effectiveKubeNamespace, initialContainer);

src/main/java/eu/openanalytics/containerproxy/event/NewProxyEvent.java

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,7 @@
2626
import eu.openanalytics.containerproxy.model.runtime.Proxy;
2727
import eu.openanalytics.containerproxy.model.runtime.runtimevalues.BackendContainerName;
2828
import eu.openanalytics.containerproxy.model.runtime.runtimevalues.BackendContainerNameKey;
29+
import eu.openanalytics.containerproxy.model.runtime.runtimevalues.RuntimeValueKeyRegistry;
2930
import lombok.AccessLevel;
3031
import lombok.EqualsAndHashCode;
3132
import lombok.NoArgsConstructor;
@@ -80,7 +81,7 @@ public NewProxyEvent(Proxy proxy, Authentication authentication) {
8081
proxy.getId(),
8182
proxy.getUserId(),
8283
proxy.getSpecId(),
83-
proxy.getRuntimeValue("SHINYPROXY_APP_INSTANCE"),
84+
proxy.getRuntimeValueOrNull("SHINYPROXY_APP_INSTANCE"),
8485
proxy.getCreatedTimestamp(),
8586
proxy.getContainers().isEmpty() ? null : proxy.getContainers().get(0).getRuntimeObjectOrNull(BackendContainerNameKey.inst),
8687
authentication);

src/main/java/eu/openanalytics/containerproxy/model/runtime/runtimevalues/RuntimeValueStore.java

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -104,4 +104,18 @@ public <T> String getRuntimeValue(RuntimeValueKey<T> key) {
104104
return key.serializeToString(runtimeValue.getObject());
105105
}
106106

107+
public <T> String getRuntimeValueOrNull(RuntimeValueKey<T> key) {
108+
Objects.requireNonNull(key, "key may not be null");
109+
RuntimeValue runtimeValue = getRuntimeValues().get(key);
110+
if (runtimeValue == null) {
111+
return null;
112+
}
113+
return key.serializeToString(runtimeValue.getObject());
114+
}
115+
116+
public String getRuntimeValueOrNull(String keyAsEnvVar) {
117+
Objects.requireNonNull(keyAsEnvVar, "key may not be null");
118+
return getRuntimeValueOrNull(RuntimeValueKeyRegistry.getRuntimeValue(keyAsEnvVar));
119+
}
120+
107121
}

src/main/java/eu/openanalytics/containerproxy/stat/impl/Micrometer.java

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -171,7 +171,7 @@ public void onUserLoginEvent(UserLoginEvent event) {
171171
@EventListener
172172
public void onNewProxyEvent(NewProxyEvent event) {
173173
// must run on each instance (gauge is registered on every instance)
174-
if (event.getBackendContainerName() != null) {
174+
if (event.getUserId() != null && event.getBackendContainerName() != null) {
175175
registry.gauge("appInfo",
176176
Tags.of(
177177
"spec.id", event.getSpecId(),

0 commit comments

Comments
 (0)