Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 9 additions & 0 deletions src/azure-cli/azure/cli/command_modules/acs/_help.py
Original file line number Diff line number Diff line change
Expand Up @@ -558,6 +558,9 @@
- name: --enable-windows-recording-rules
type: bool
short-summary: Enable Windows Recording Rules when enabling the Azure Monitor Metrics addon
- name: --enable-control-plane-metrics --enable-cp-metrics
type: bool
short-summary: Enable collection of Azure Monitor managed Prometheus control plane metrics for managed cluster components (controlplane-apiserver and controlplane-etcd targets by default). Requires Azure Monitor metrics to be enabled (already enabled or via --enable-azure-monitor-metrics).
- name: --enable-azure-monitor-app-monitoring
type: bool
short-summary: Enable Azure Monitor Application Monitoring auto-instrumentation for a Kubernetes cluster.
Expand Down Expand Up @@ -1102,6 +1105,12 @@
- name: --disable-azure-monitor-metrics
type: bool
short-summary: Disable Azure Monitor Metrics Profile. This will delete all DCRA's associated with the cluster, any linked DCRs with the data stream = prometheus-stream and the recording rule groups created by the addon for this AKS cluster.
- name: --enable-control-plane-metrics --enable-cp-metrics
type: bool
short-summary: Enable collection of Azure Monitor managed Prometheus control plane metrics for managed cluster components (controlplane-apiserver and controlplane-etcd targets by default). Requires Azure Monitor metrics to be enabled (already enabled or via --enable-azure-monitor-metrics).
- name: --disable-control-plane-metrics --disable-cp-metrics
type: bool
short-summary: Disable collection of Azure Monitor managed Prometheus control plane metrics. Leaves Azure Monitor metrics enabled.
- name: --enable-azure-monitor-app-monitoring
type: bool
short-summary: Enable Azure Monitor Application Monitoring auto-instrumentation for a Kubernetes cluster.
Expand Down
31 changes: 31 additions & 0 deletions src/azure-cli/azure/cli/command_modules/acs/_params.py
Original file line number Diff line number Diff line change
Expand Up @@ -573,6 +573,17 @@ def load_arguments(self, _):
c.argument('ksm_metric_annotations_allow_list')
c.argument('grafana_resource_id', validator=validate_grafanaresourceid)
c.argument('enable_windows_recording_rules', action='store_true')
c.argument(
'enable_control_plane_metrics',
options_list=['--enable-control-plane-metrics', '--enable-cp-metrics'],
action='store_true',
help=(
'Enable collection of Azure Monitor managed Prometheus control plane metrics for managed '
'cluster components (controlplane-apiserver and controlplane-etcd targets by default). '
'Requires Azure Monitor metrics to be enabled '
'(already enabled or via --enable-azure-monitor-metrics).'
),
)
c.argument('enable_azure_monitor_app_monitoring', action='store_true')
c.argument('node_public_ip_tags', arg_type=tags_type, validator=validate_node_public_ip_tags,
help='space-separated tags: key[=value] [key[=value] ...].')
Expand Down Expand Up @@ -812,6 +823,26 @@ def load_arguments(self, _):
c.argument('grafana_resource_id', validator=validate_grafanaresourceid)
c.argument('enable_windows_recording_rules', action='store_true')
c.argument('disable_azure_monitor_metrics', action='store_true')
c.argument(
'enable_control_plane_metrics',
options_list=['--enable-control-plane-metrics', '--enable-cp-metrics'],
action='store_true',
help=(
'Enable collection of Azure Monitor managed Prometheus control plane metrics for managed '
'cluster components (controlplane-apiserver and controlplane-etcd targets by default). '
'Requires Azure Monitor metrics to be enabled '
'(already enabled or via --enable-azure-monitor-metrics).'
),
)
c.argument(
'disable_control_plane_metrics',
options_list=['--disable-control-plane-metrics', '--disable-cp-metrics'],
action='store_true',
help=(
'Disable collection of Azure Monitor managed Prometheus control plane metrics. '
'Sets azureMonitorProfile.metrics.controlPlane.enabled=false on the cluster.'
),
)
c.argument('enable_azure_monitor_app_monitoring', action='store_true')
c.argument('disable_azure_monitor_app_monitoring', action='store_true')
# azure container storage
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License. See License.txt in the project root for license information.
# --------------------------------------------------------------------------------------------
from azure.cli.command_modules.acs._client_factory import get_container_service_client
from azure.cli.command_modules.acs.azuremonitormetrics.addonput import addon_put
from azure.cli.command_modules.acs.azuremonitormetrics.amg.link import link_grafana_instance
from azure.cli.command_modules.acs.azuremonitormetrics.amw.helper import get_azure_monitor_workspace_resource
Expand All @@ -18,13 +19,58 @@
)
from azure.cli.command_modules.acs.azuremonitormetrics.recordingrules.create import create_rules
from azure.cli.command_modules.acs.azuremonitormetrics.recordingrules.delete import delete_rules
from azure.cli.core.azclierror import InvalidArgumentValueError
from azure.cli.core.azclierror import CLIError, InvalidArgumentValueError, UnknownError
from knack.log import get_logger


logger = get_logger(__name__)


# pylint: disable=line-too-long
def _addon_put_with_control_plane(cmd, cluster_subscription, cluster_resource_group_name, cluster_name):
    """Re-PUT the managed cluster with ``metrics.controlPlane.enabled`` set to true.

    Sibling of ``addon_put`` used by the greenfield
    ``aks create --enable-control-plane-metrics`` path. The initial cluster PUT
    intentionally leaves ``control_plane`` unset so the RP does not schedule the
    control-plane-metrics collection (CCP) pod before the DCRA is created in
    postprocessing. Once the DCRA exists, this PUT is issued so the CCP pod is
    scheduled with its DCRA already in place (race-free).

    :param cmd: CLI command object; ``cmd.cli_ctx`` is used to build the client.
    :param cluster_subscription: subscription handed to the container service client factory.
    :param cluster_resource_group_name: resource group of the managed cluster.
    :param cluster_name: name of the managed cluster.
    :raises UnknownError: if the cluster cannot be fetched, if the fetched cluster
        has no Azure Monitor metrics profile to attach the setting to, or if the
        update request fails.
    """
    client = get_container_service_client(cmd.cli_ctx, cluster_subscription).managed_clusters
    try:
        mc = client.get(cluster_resource_group_name, cluster_name)
    except CLIError as e:
        raise UnknownError(str(e)) from e

    # The control plane setting lives under azure_monitor_profile.metrics. Fail with an
    # explicit error when that section is absent instead of hitting an AttributeError
    # (or issuing a PUT that silently does not enable anything).
    monitor_profile = getattr(mc, "azure_monitor_profile", None)
    metrics = getattr(monitor_profile, "metrics", None) if monitor_profile else None
    if not metrics:
        raise UnknownError(
            f"Cannot enable control plane metrics: managed cluster '{cluster_name}' "
            "has no Azure Monitor metrics profile."
        )

    # Enable metrics if not already enabled (mirrors addon_put).
    if getattr(metrics, "enabled", None) is False:
        metrics.enabled = True

    # Flip control plane now that DCRA exists. Only the import is guarded so that an
    # unrelated failure while building the model is not mistaken for a missing SDK model.
    try:
        from azure.mgmt.containerservice.models import (
            ManagedClusterAzureMonitorProfileMetricsControlPlane,
        )
    except ImportError:
        # Fallback for SDK versions that don't expose the model directly:
        # set a dict that the generated client will serialize as the property.
        metrics.control_plane = {"enabled": True}
    else:
        metrics.control_plane = ManagedClusterAzureMonitorProfileMetricsControlPlane(enabled=True)

    # Unlike the sibling ``addon_put`` (where ``metrics.enabled`` is already true on the
    # cluster from the initial PUT and the postprocessing PUT is just a safety re-affirm),
    # this is the ONLY place where ``controlPlane.enabled`` is set during ``aks create``.
    # Wait for the LRO so the CP flip is durably persisted before the create command
    # returns; otherwise callers / tests that read the cluster immediately could see the
    # pre-flip state.
    try:
        poller = client.begin_create_or_update(cluster_resource_group_name, cluster_name, mc)
        poller.result()
    except Exception as e:
        raise UnknownError(str(e)) from e


# pylint: disable=line-too-long
def link_azure_monitor_profile_artifacts(
cmd,
Expand All @@ -49,7 +95,15 @@ def link_azure_monitor_profile_artifacts(
create_rules(cmd, cluster_subscription, cluster_resource_group_name, cluster_name, azure_monitor_workspace_resource_id, azure_monitor_workspace_location, raw_parameters)
# if aks cluster create flow -> do a PUT on the AKS cluster to enable the addon
if create_flow:
addon_put(cmd, cluster_subscription, cluster_resource_group_name, cluster_name)
# If --enable-control-plane-metrics was specified on create, flip
# metrics.controlPlane.enabled HERE (after DCRA creation) instead of on
# the initial cluster PUT. This avoids the CCP pod being scheduled before
# its DCRA exists (which would cause CrashLoopBackOff until reconciliation).
enable_cp = bool(raw_parameters and raw_parameters.get("enable_control_plane_metrics"))
if enable_cp:
_addon_put_with_control_plane(cmd, cluster_subscription, cluster_resource_group_name, cluster_name)
else:
addon_put(cmd, cluster_subscription, cluster_resource_group_name, cluster_name)


# pylint: disable=line-too-long
Expand Down
3 changes: 3 additions & 0 deletions src/azure-cli/azure/cli/command_modules/acs/custom.py
Original file line number Diff line number Diff line change
Expand Up @@ -1008,6 +1008,7 @@ def aks_create(
ksm_metric_annotations_allow_list=None,
grafana_resource_id=None,
enable_windows_recording_rules=False,
enable_control_plane_metrics=False,
enable_azure_monitor_app_monitoring=False,
# azure container storage
enable_azure_container_storage=None,
Expand Down Expand Up @@ -1209,6 +1210,8 @@ def aks_update(
grafana_resource_id=None,
enable_windows_recording_rules=False,
disable_azure_monitor_metrics=False,
enable_control_plane_metrics=False,
disable_control_plane_metrics=False,
enable_azure_monitor_app_monitoring=False,
disable_azure_monitor_app_monitoring=False,
# azure container storage
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -5956,6 +5956,109 @@ def get_disable_azure_monitor_metrics(self) -> bool:
"""
return self._get_disable_azure_monitor_metrics(enable_validation=True)

def validate_control_plane_metrics_params(self) -> None:
    """Check the control-plane-metrics flags for invalid combinations.

    Covers --enable/--disable-control-plane-metrics against each other and
    against --enable/--disable-azure-monitor-metrics. Nothing is returned;
    the method exists purely to surface validation errors without consuming
    a parameter value.

    Values are read straight from raw_param (not through the getters), so the
    getters can call this from their enable_validation=True path without
    recursing.

    :raises MutuallyExclusiveArgumentError: enable and disable of control plane
        metrics are both requested, or enable is combined with
        --disable-azure-monitor-metrics.
    :raises RequiredArgumentMissingError: enable is requested but Azure Monitor
        metrics is neither already enabled on the cluster nor being enabled by
        this command.
    """
    wants_enable = self.raw_param.get("enable_control_plane_metrics")
    wants_disable = self.raw_param.get("disable_control_plane_metrics")

    # On create, a value already present on the incoming mc object wins over the raw flag.
    if self.decorator_mode == DecoratorMode.CREATE:
        cluster = self.mc
        monitor_profile = cluster and cluster.azure_monitor_profile
        metrics_profile = monitor_profile and monitor_profile.metrics
        control_plane = metrics_profile and metrics_profile.control_plane
        if control_plane:
            wants_enable = control_plane.enabled

    if wants_enable and wants_disable:
        raise MutuallyExclusiveArgumentError(
            "Cannot specify --enable-control-plane-metrics and --disable-control-plane-metrics "
            "at the same time."
        )

    # The remaining checks only apply when control plane metrics is being enabled.
    if not wants_enable:
        return

    # Enabling control plane metrics while disabling the parent metrics profile in the
    # same command would produce an inconsistent payload.
    if self._get_disable_azure_monitor_metrics(False):
        raise MutuallyExclusiveArgumentError(
            "Cannot specify --enable-control-plane-metrics together with "
            "--disable-azure-monitor-metrics."
        )

    # Azure Monitor metrics must be on: either already on the cluster or requested now.
    cluster = self.mc
    monitor_profile = cluster and cluster.azure_monitor_profile
    metrics_profile = monitor_profile and monitor_profile.metrics
    metrics_on_cluster = metrics_profile and metrics_profile.enabled
    metrics_requested = self._get_enable_azure_monitor_metrics(False)
    if not (metrics_on_cluster or metrics_requested):
        raise RequiredArgumentMissingError(
            "--enable-control-plane-metrics requires Azure Monitor metrics to be enabled. "
            "Specify --enable-azure-monitor-metrics or run on a cluster that already has "
            "Azure Monitor metrics enabled."
        )

def _get_enable_control_plane_metrics(self, enable_validation: bool = False) -> bool:
    """Internal getter for enable_control_plane_metrics.

    The raw command value is the starting point. In create mode, a control
    plane setting already present on the `mc` object replaces it. With
    enable_validation=True, validate_control_plane_metrics_params is run
    before the value is handed back.

    :return: bool
    """
    value = self.raw_param.get("enable_control_plane_metrics")

    if self.decorator_mode == DecoratorMode.CREATE:
        # Walk the profile chain step by step; any falsy link stops the lookup.
        cluster = self.mc
        monitor_profile = cluster and cluster.azure_monitor_profile
        metrics_profile = monitor_profile and monitor_profile.metrics
        control_plane = metrics_profile and metrics_profile.control_plane
        if control_plane:
            value = control_plane.enabled

    if enable_validation:
        self.validate_control_plane_metrics_params()

    return bool(value)

def get_enable_control_plane_metrics(self) -> bool:
    """Return the validated value of enable_control_plane_metrics.

    Delegates to the internal getter with validation switched on, so an
    invalid flag combination (for example enable together with disable of
    control plane metrics, or enable without Azure Monitor metrics) raises
    before any value is returned.

    :return: bool
    """
    validated_value = self._get_enable_control_plane_metrics(enable_validation=True)
    return validated_value

def _get_disable_control_plane_metrics(self, enable_validation: bool = False) -> bool:
    """Internal getter for disable_control_plane_metrics.

    Reads the raw command value and coerces it to bool. With
    enable_validation=True, validate_control_plane_metrics_params is run
    before the value is handed back.

    :return: bool
    """
    requested = bool(self.raw_param.get("disable_control_plane_metrics"))
    if enable_validation:
        self.validate_control_plane_metrics_params()
    return requested

def get_disable_control_plane_metrics(self) -> bool:
    """Return the validated value of disable_control_plane_metrics.

    Delegates to the internal getter with validation switched on, so an
    invalid flag combination (for example enable together with disable of
    control plane metrics) raises before any value is returned.

    :return: bool
    """
    validated_value = self._get_disable_control_plane_metrics(enable_validation=True)
    return validated_value

def _get_enable_azure_monitor_app_monitoring(self, enable_validation: bool = False) -> bool:
"""Internal function to obtain the value of enable_azure_monitor_app_monitoring.
This function supports the option of enable_validation. When enabled, if both
Expand Down Expand Up @@ -7714,13 +7817,25 @@ def set_up_azure_monitor_profile(self, mc: ManagedCluster) -> ManagedCluster:
ksm_metric_labels_allow_list = ""
if ksm_metric_annotations_allow_list is None:
ksm_metric_annotations_allow_list = ""
# Surface control-plane-metrics flag combination errors even when the
# parent metrics flag was not specified, so users get a clear error
# instead of a silent ignore when they pass --enable-control-plane-metrics
# on its own.
self.context.validate_control_plane_metrics_params()
if self.context.get_enable_azure_monitor_metrics():
if mc.azure_monitor_profile is None:
mc.azure_monitor_profile = self.models.ManagedClusterAzureMonitorProfile()
mc.azure_monitor_profile.metrics = self.models.ManagedClusterAzureMonitorProfileMetrics(enabled=False)
mc.azure_monitor_profile.metrics.kube_state_metrics = self.models.ManagedClusterAzureMonitorProfileKubeStateMetrics( # pylint:disable=line-too-long
metric_labels_allowlist=str(ksm_metric_labels_allow_list),
metric_annotations_allow_list=str(ksm_metric_annotations_allow_list))
# NOTE: control_plane.enabled is intentionally NOT set here on the create flow.

Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This approach is somewhat unconventional from a client perspective. I've encountered some CRIs reporting that the monitoring addon wasn't functioning as expected when deployed with an ARM template or similar methods. Would it be possible to handle these types of tasks on the server side instead? This would allow users on different clients to have a consistent experience.

Copy link
Copy Markdown
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes, agreed. We've had this in our backlog for a while. The logs and Prometheus addon team has been looking into it this semester, and Rashmi is currently working on the design for that.

Once the logic is moved to the RP, I'll create a PR to remove it from the CLI.

# If we set it on this initial PUT, the RP would schedule the control-plane-metrics
# collection pod (CCP) before the DCRA (Data Collection Rule Association) has been
# created in postprocessing. The CCP would then crash-loop with "DCRA not found"
# until the next reconciliation. Instead, we defer the flip to the addon_put step
# inside link_azure_monitor_profile_artifacts (postprocessing_after_mc_created),
# which runs *after* DCRA creation.
# set intermediate
self.context.set_intermediate("azuremonitormetrics_addon_enabled", True, overwrite_exists=True)
if self.context.get_enable_azure_monitor_app_monitoring():
Expand Down Expand Up @@ -9810,6 +9925,30 @@ def update_azure_monitor_profile(self, mc: ManagedCluster) -> ManagedCluster:
self.context.get_disable_azure_monitor_metrics(),
False)

# Handle enable / disable of control plane metrics independently of the parent metrics flag,
# so users can toggle control plane metrics on a cluster that already has metrics enabled.
if self.context.get_enable_control_plane_metrics():
if mc.azure_monitor_profile is None:
mc.azure_monitor_profile = self.models.ManagedClusterAzureMonitorProfile()
if mc.azure_monitor_profile.metrics is None:
# Should not normally happen — validation requires metrics to be enabled — but guard
# against partially-populated profiles to avoid AttributeError.
mc.azure_monitor_profile.metrics = (
self.models.ManagedClusterAzureMonitorProfileMetrics(enabled=True)
)
mc.azure_monitor_profile.metrics.control_plane = (
self.models.ManagedClusterAzureMonitorProfileMetricsControlPlane(enabled=True)
)

if self.context.get_disable_control_plane_metrics():
if (
mc.azure_monitor_profile and
mc.azure_monitor_profile.metrics
):
mc.azure_monitor_profile.metrics.control_plane = (
self.models.ManagedClusterAzureMonitorProfileMetricsControlPlane(enabled=False)
)

if self.context.get_enable_azure_monitor_app_monitoring():
if mc.azure_monitor_profile is None:
mc.azure_monitor_profile = self.models.ManagedClusterAzureMonitorProfile()
Expand Down
Loading
Loading