[WIP] Install prometheus #2865

Open · wants to merge 1 commit into base: tilonalli
7 changes: 7 additions & 0 deletions main.go
@@ -292,6 +292,12 @@ func main() {
os.Exit(1)
}

monitorCRDExists, err := utils.RequiresMonitorController(mgr.GetConfig())
if err != nil {
setupLog.Error(err, "Failed to determine if monitoring (Prometheus) is required")
os.Exit(1)
}

clusterDomain, err := dns.GetClusterDomain(dns.DefaultResolveConfPath)
if err != nil {
clusterDomain = dns.DefaultClusterDomain
@@ -309,6 +315,7 @@ func main() {
EnterpriseCRDExists: enterpriseCRDExists,
UsePSP: usePSP,
AmazonCRDExists: amazonCRDExists,
MonitorCRDExists: monitorCRDExists,
ClusterDomain: clusterDomain,
KubernetesVersion: kubernetesVersion,
ManageCRDs: manageCRDs,
11 changes: 10 additions & 1 deletion pkg/controller/installation/core_controller.go
@@ -140,6 +140,11 @@ func Add(mgr manager.Manager, opts options.AddOptions) error {
return fmt.Errorf("Failed to create tigera-installation-controller: %w", err)
}

// Watch the Monitor CR
if opts.MonitorCRDExists {
// TODO: watch the Monitor CR
}

// Establish deferred watches against the v3 API that should succeed after the Enterprise API Server becomes available.
if opts.EnterpriseCRDExists {
k8sClient, err := kubernetes.NewForConfig(mgr.GetConfig())
@@ -199,6 +204,7 @@ func newReconciler(mgr manager.Manager, opts options.AddOptions) (*ReconcileInst
calicoWindowsUpgrader: calicoWindowsUpgrader,
namespaceMigration: nm,
amazonCRDExists: opts.AmazonCRDExists,
monitorCRDExists: opts.MonitorCRDExists,
enterpriseCRDsExist: opts.EnterpriseCRDExists,
clusterDomain: opts.ClusterDomain,
manageCRDs: opts.ManageCRDs,
@@ -386,6 +392,7 @@ type ReconcileInstallation struct {
namespaceMigration migration.NamespaceMigration
enterpriseCRDsExist bool
amazonCRDExists bool
monitorCRDExists bool
migrationChecked bool
clusterDomain string
manageCRDs bool
@@ -1141,8 +1148,8 @@ func (r *ReconcileInstallation) Reconcile(ctx context.Context, request reconcile
nodeReporterMetricsPort := defaultNodeReporterPort
var nodePrometheusTLS certificatemanagement.KeyPairInterface
calicoVersion := components.CalicoRelease
-if instance.Spec.Variant == operator.TigeraSecureEnterprise {
+if r.monitorCRDExists {
// Determine the port to use for nodeReporter metrics.
if felixConfiguration.Spec.PrometheusReporterPort != nil {
nodeReporterMetricsPort = *felixConfiguration.Spec.PrometheusReporterPort
Expand Down Expand Up @@ -1170,7 +1177,9 @@ func (r *ReconcileInstallation) Reconcile(ctx context.Context, request reconcile
if prometheusClientCert != nil {
typhaNodeTLS.TrustedBundle.AddCertificates(prometheusClientCert)
}
}

if instance.Spec.Variant == operator.TigeraSecureEnterprise {
// es-kube-controllers needs to trust the ESGW certificate. We'll fetch it here and add it to the trusted bundle.
// Note that although we're adding this to the typhaNodeTLS trusted bundle, it will be used by es-kube-controllers. This is because
// all components within this namespace share a trusted CA bundle.
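
Reviewer note on core_controller.go: the "// TODO: watch the Monitor CR" branch in Add() is the crux of this WIP. A minimal sketch of what the watch might look like, assuming the controller-runtime pattern used for the other CRs in this file (the controller handle c and the operator.Monitor type are inferred from the surrounding code; none of this is in the commit):

// Sketch only: enqueue installation reconciles when the Monitor CR changes.
if opts.MonitorCRDExists {
	if err := c.Watch(&source.Kind{Type: &operator.Monitor{}}, &handler.EnqueueRequestForObject{}); err != nil {
		return fmt.Errorf("tigera-installation-controller failed to watch Monitor resource: %w", err)
	}
}
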
66 changes: 41 additions & 25 deletions pkg/controller/monitor/monitor_controller.go
@@ -60,7 +60,7 @@ const ResourceName = "monitor"
var log = logf.Log.WithName("controller_monitor")

func Add(mgr manager.Manager, opts options.AddOptions) error {
-if !opts.EnterpriseCRDExists {
+if !opts.MonitorCRDExists {
return nil
}

@@ -91,10 +91,12 @@ func Add(mgr manager.Manager, opts options.AddOptions) error {
{Name: networkpolicy.TigeraComponentDefaultDenyPolicyName, Namespace: common.TigeraPrometheusNamespace},
}

if opts.EnterpriseCRDExists {
// Watch for changes to Tier, as its status is used as input to determine whether network policy should be reconciled by this controller.
go utils.WaitToAddTierWatch(networkpolicy.TigeraComponentTierName, controller, k8sClient, log, tierWatchReady)

go utils.WaitToAddNetworkPolicyWatches(controller, k8sClient, log, policyNames)
}

go waitToAddPrometheusWatch(controller, k8sClient, log, prometheusReady)

@@ -111,12 +113,18 @@ func newReconciler(mgr manager.Manager, opts options.AddOptions, prometheusReady
tierWatchReady: tierWatchReady,
clusterDomain: opts.ClusterDomain,
usePSP: opts.UsePSP,
isEnterprise: opts.EnterpriseCRDExists,
}

namespacedNames := []types.NamespacedName{
{Namespace: common.TigeraPrometheusNamespace, Name: fmt.Sprintf("prometheus-%s", monitor.CalicoNodePrometheus)},
}

if r.isEnterprise {
namespacedNames = append(namespacedNames, types.NamespacedName{Namespace: common.TigeraPrometheusNamespace, Name: fmt.Sprintf("alertmanager-%s", monitor.CalicoNodeAlertmanager)})
}

r.status.AddStatefulSets(namespacedNames)

r.status.Run(opts.ShutdownContext)
return r
@@ -182,6 +190,7 @@ type ReconcileMonitor struct {
tierWatchReady *utils.ReadyFlag
clusterDomain string
usePSP bool
isEnterprise bool
}

func (r *ReconcileMonitor) getMonitor(ctx context.Context) (*operatorv1.Monitor, error) {
@@ -310,7 +319,7 @@ func (r *ReconcileMonitor) Reconcile(ctx context.Context, request reconcile.Requ
}

// Validate that the tier watch is ready before querying the tier to ensure we utilize the cache.
-if !r.tierWatchReady.IsReady() {
+if r.isEnterprise && !r.tierWatchReady.IsReady() {
r.status.SetDegraded(operatorv1.ResourceNotReady, "Waiting for Tier watch to be established", nil, reqLogger)
return reconcile.Result{RequeueAfter: 10 * time.Second}, nil
}
@@ -332,30 +341,37 @@ func (r *ReconcileMonitor) Reconcile(ctx context.Context, request reconcile.Requ
// Create a component handler to manage the rendered component.
hdler := utils.NewComponentHandler(log, r.client, r.scheme, instance)

kubeControllersMetricsPort, err := utils.GetKubeControllerMetricsPort(ctx, r.client)
if err != nil {
r.status.SetDegraded(operatorv1.ResourceReadError, "Unable to read KubeControllersConfiguration", err, reqLogger)
return reconcile.Result{}, err
}

monitorCfg := &monitor.Config{
Installation: install,
PullSecrets: pullSecrets,
KeyValidatorConfig: keyValidatorConfig,
ServerTLSSecret: serverTLSSecret,
ClientTLSSecret: clientTLSSecret,
ClusterDomain: r.clusterDomain,
TrustedCertBundle: trustedBundle,
Openshift: r.provider == operatorv1.ProviderOpenShift,
KubeControllerPort: kubeControllersMetricsPort,
UsePSP: r.usePSP,
PrometheusOnly: !r.isEnterprise,
}

// Add in the Alertmanager config secret for Enterprise only.
var createInOperatorNamespace bool
if r.isEnterprise {
alertmanagerConfigSecret, createInOpNs, err := r.readAlertmanagerConfigSecret(ctx)
if err != nil {
r.status.SetDegraded(operatorv1.ResourceReadError, "Error retrieving Alertmanager configuration secret", err, reqLogger)
return reconcile.Result{}, err
}

monitorCfg.AlertmanagerConfigSecret = alertmanagerConfigSecret
createInOperatorNamespace = createInOpNs
}

// Render prometheus component
1 change: 1 addition & 0 deletions pkg/controller/options/options.go
@@ -29,6 +29,7 @@ type AddOptions struct {
DetectedProvider v1.Provider
EnterpriseCRDExists bool
AmazonCRDExists bool
MonitorCRDExists bool
ClusterDomain string
KubernetesVersion *common.VersionInfo
ManageCRDs bool
23 changes: 21 additions & 2 deletions pkg/controller/utils/discovery.go
@@ -55,8 +55,6 @@ func RequiresTigeraSecure(cfg *rest.Config) (bool, error) {
fallthrough
case "ApplicationLayer":
fallthrough
case "Monitor":
fallthrough
case "ManagementCluster":
fallthrough
case "EgressGateway":
@@ -87,6 +85,27 @@ func RequiresAmazonController(cfg *rest.Config) (bool, error) {
return false, nil
}

// RequiresMonitorController determines if the configuration requires that we start the
// monitor controller, which will manage Prometheus.
func RequiresMonitorController(cfg *rest.Config) (bool, error) {
clientset, err := kubernetes.NewForConfig(cfg)
if err != nil {
return false, err
}

// Use the discovery client to determine if the Monitor APIs exist.
resources, err := clientset.Discovery().ServerResourcesForGroupVersion("operator.tigera.io/v1")
if err != nil {
return false, err
}
for _, r := range resources.APIResources {
if r.Kind == "Monitor" {
return true, nil
}
}
return false, nil
}

func MultiTenant(ctx context.Context, c kubernetes.Interface) (bool, error) {
resources, err := c.Discovery().ServerResourcesForGroupVersion("operator.tigera.io/v1")
if err != nil {
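
Reviewer note on discovery.go: RequiresAmazonController and RequiresMonitorController now differ only in the Kind they probe for. A possible follow-up — a sketch only, with the helper name operatorKindExists invented for illustration:

// operatorKindExists reports whether the given Kind is served by the
// operator.tigera.io/v1 group/version, using the discovery client.
func operatorKindExists(cfg *rest.Config, kind string) (bool, error) {
	clientset, err := kubernetes.NewForConfig(cfg)
	if err != nil {
		return false, err
	}
	resources, err := clientset.Discovery().ServerResourcesForGroupVersion("operator.tigera.io/v1")
	if err != nil {
		return false, err
	}
	for _, r := range resources.APIResources {
		if r.Kind == kind {
			return true, nil
		}
	}
	return false, nil
}

Each Requires* helper would then reduce to a one-liner, e.g. return operatorKindExists(cfg, "Monitor").
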
63 changes: 38 additions & 25 deletions pkg/render/monitor/monitor.go
@@ -129,6 +129,7 @@ type Config struct {
Openshift bool
KubeControllerPort int
UsePSP bool
PrometheusOnly bool
}

type monitorComponent struct {
@@ -200,19 +201,24 @@ func (mc *monitorComponent) Objects() ([]client.Object, []client.Object) {
mc.prometheusClusterRole(),
mc.prometheusClusterRoleBinding(),
mc.prometheus(),
mc.prometheusServiceService(),
mc.prometheusServiceClusterRole(),
mc.prometheusServiceClusterRoleBinding(),
mc.serviceMonitorCalicoNode(),
)

if !mc.cfg.PrometheusOnly {
toCreate = append(toCreate,
mc.alertmanagerService(),
mc.alertmanager(),
mc.prometheusRule(),
mc.serviceMonitorElasticsearch(),
mc.serviceMonitorFluentd(),
mc.serviceMonitorQueryServer(),
mc.serviceMonitorCalicoKubeControllers(),
)
}

if mc.cfg.KeyValidatorConfig != nil {
toCreate = append(toCreate, secret.ToRuntimeObjects(mc.cfg.KeyValidatorConfig.RequiredSecrets(common.TigeraPrometheusNamespace)...)...)
toCreate = append(toCreate, configmap.ToRuntimeObjects(mc.cfg.KeyValidatorConfig.RequiredConfigMaps(common.TigeraPrometheusNamespace)...)...)
@@ -735,6 +741,30 @@ func (mc *monitorComponent) prometheusRule() *monitoringv1.PrometheusRule {
}

func (mc *monitorComponent) serviceMonitorCalicoNode() *monitoringv1.ServiceMonitor {
endpoints := []monitoringv1.Endpoint{
{
HonorLabels: true,
Interval: "5s",
Port: "calico-metrics-port",
ScrapeTimeout: "5s",
Scheme: "https",
TLSConfig: mc.tlsConfig(render.CalicoNodeMetricsService),
},
}

if !mc.cfg.PrometheusOnly {
endpoints = append(endpoints,
monitoringv1.Endpoint{
HonorLabels: true,
Interval: "5s",
Port: "calico-bgp-metrics-port",
ScrapeTimeout: "5s",
Scheme: "https",
TLSConfig: mc.tlsConfig(render.CalicoNodeMetricsService),
},
)
}

return &monitoringv1.ServiceMonitor{
TypeMeta: metav1.TypeMeta{Kind: monitoringv1.ServiceMonitorsKind, APIVersion: MonitoringAPIVersion},
ObjectMeta: metav1.ObjectMeta{
@@ -745,24 +775,7 @@ func (mc *monitorComponent) serviceMonitorCalicoNode() *monitoringv1.ServiceMoni
Spec: monitoringv1.ServiceMonitorSpec{
Selector: metav1.LabelSelector{MatchLabels: map[string]string{"k8s-app": "calico-node"}},
NamespaceSelector: monitoringv1.NamespaceSelector{MatchNames: []string{"calico-system"}},
Endpoints: endpoints,
},
}
}
7 changes: 4 additions & 3 deletions pkg/render/node.go
@@ -1401,6 +1401,10 @@ func (c *nodeComponent) nodeEnvVars() []corev1.EnvVar {
{Name: "FELIX_TYPHACERTFILE", Value: c.cfg.TLS.NodeSecret.VolumeMountCertificateFilePath()},
{Name: "FELIX_TYPHAKEYFILE", Value: c.cfg.TLS.NodeSecret.VolumeMountKeyFilePath()},
{Name: "FIPS_MODE_ENABLED", Value: operatorv1.IsFIPSModeEnabledString(c.cfg.Installation.FIPSMode)},
// TODO: Need to check if it is OK to have these values always enabled for prometheus metrics
{Name: "FELIX_PROMETHEUSREPORTERENABLED", Value: "true"},
{Name: "FELIX_PROMETHEUSREPORTERPORT", Value: fmt.Sprintf("%d", c.cfg.NodeReporterMetricsPort)},
{Name: "FELIX_FLOWLOGSFILEENABLED", Value: "true"},
}
// We need at least the CN or URISAN set, we depend on the validation
// done by the core_controller that the Secret will have one.
@@ -1613,9 +1617,6 @@ func (c *nodeComponent) nodeEnvVars() []corev1.EnvVar {
if c.cfg.Installation.Variant == operatorv1.TigeraSecureEnterprise {
// Add in Calico Enterprise specific configuration.
extraNodeEnv := []corev1.EnvVar{
{Name: "FELIX_PROMETHEUSREPORTERENABLED", Value: "true"},
{Name: "FELIX_PROMETHEUSREPORTERPORT", Value: fmt.Sprintf("%d", c.cfg.NodeReporterMetricsPort)},
{Name: "FELIX_FLOWLOGSFILEENABLED", Value: "true"},
{Name: "FELIX_FLOWLOGSFILEINCLUDELABELS", Value: "true"},
{Name: "FELIX_FLOWLOGSFILEINCLUDEPOLICIES", Value: "true"},
{Name: "FELIX_FLOWLOGSFILEINCLUDESERVICE", Value: "true"},
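
Reviewer note on the TODO in nodeEnvVars(): if always-on reporting turns out not to be acceptable for OSS Calico, one option is to gate the reporter variables on the monitor controller being active rather than on the Enterprise variant. A sketch only — the PrometheusMetricsEnabled field and the nodeEnv slice name are assumptions, not part of this PR:

// Hypothetical gating, if always-on is rejected; the flow log variables would
// stay under the Enterprise branch since flow logs are Enterprise-only.
if c.cfg.PrometheusMetricsEnabled {
	nodeEnv = append(nodeEnv,
		corev1.EnvVar{Name: "FELIX_PROMETHEUSREPORTERENABLED", Value: "true"},
		corev1.EnvVar{Name: "FELIX_PROMETHEUSREPORTERPORT", Value: fmt.Sprintf("%d", c.cfg.NodeReporterMetricsPort)},
	)
}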