A Kubernetes operator that bridges Hardware Security Module (HSM) data storage with Kubernetes Secrets, providing true secret portability th
1
fork

Configure Feed

Select the types of activity you want to include in your feed.

add missing files

+1606
+1029
internal/agent/manager.go
··· 1 + /* 2 + Copyright 2025. 3 + 4 + Licensed under the Apache License, Version 2.0 (the "License"); 5 + you may not use this file except in compliance with the License. 6 + You may obtain a copy of the License at 7 + 8 + http://www.apache.org/licenses/LICENSE-2.0 9 + 10 + Unless required by applicable law or agreed to in writing, software 11 + distributed under the License is distributed on an "AS IS" BASIS, 12 + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 + See the License for the specific language governing permissions and 14 + limitations under the License. 15 + */ 16 + 17 + package agent 18 + 19 + import ( 20 + "context" 21 + "fmt" 22 + "slices" 23 + "strings" 24 + "sync" 25 + "time" 26 + 27 + "github.com/go-logr/logr" 28 + appsv1 "k8s.io/api/apps/v1" 29 + corev1 "k8s.io/api/core/v1" 30 + "k8s.io/apimachinery/pkg/api/errors" 31 + "k8s.io/apimachinery/pkg/api/resource" 32 + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" 33 + "k8s.io/apimachinery/pkg/types" 34 + "k8s.io/apimachinery/pkg/util/intstr" 35 + "sigs.k8s.io/controller-runtime/pkg/client" 36 + 37 + hsmv1alpha1 "github.com/evanjarrett/hsm-secrets-operator/api/v1alpha1" 38 + "github.com/evanjarrett/hsm-secrets-operator/internal/hsm" 39 + ) 40 + 41 + // ManagerInterface defines the interface for HSM agent management 42 + // This allows for easier testing with mocks 43 + type ManagerInterface interface { 44 + EnsureAgent(ctx context.Context, hsmPool *hsmv1alpha1.HSMPool) error 45 + CleanupAgent(ctx context.Context, hsmDevice *hsmv1alpha1.HSMDevice) error 46 + } 47 + 48 + // AgentStatus represents the current status of an agent 49 + type AgentStatus string 50 + 51 + const ( 52 + AgentStatusCreating AgentStatus = "Creating" 53 + AgentStatusReady AgentStatus = "Ready" 54 + AgentStatusFailed AgentStatus = "Failed" 55 + ) 56 + 57 + // AgentInfo tracks agent state and connections 58 + type AgentInfo struct { 59 + PodIPs []string 60 + CreatedAt time.Time 61 + LastHealthCheck time.Time 62 + Status AgentStatus 63 + AgentName string 64 + Namespace string 65 + } 66 + 67 + const ( 68 + // AgentNamePrefix is the prefix for HSM agent deployment names 69 + AgentNamePrefix = "hsm-agent" 70 + 71 + // AgentPort is the port the HSM agent serves on (now gRPC) 72 + AgentPort = 9090 73 + 74 + // AgentHealthPort is the port for health checks (HTTP for simplicity) 75 + AgentHealthPort = 8093 76 + ) 77 + 78 + // Manager handles HSM agent pod lifecycle 79 + type Manager struct { 80 + client.Client 81 + AgentImage string 82 + AgentNamespace string 83 + ImageResolver ImageResolver 84 + 85 + // Internal tracking 86 + activeAgents map[string]*AgentInfo // deviceName -> AgentInfo 87 + mu sync.RWMutex 88 + 89 + // Test configuration 90 + TestMode bool // Enable test mode for faster operations 91 + WaitTimeout time.Duration // Timeout for waiting operations (default: 60s) 92 + WaitPollInterval time.Duration // Polling interval for waiting operations (default: 2s) 93 + } 94 + 95 + // ImageResolver interface for dependency injection 96 + type ImageResolver interface { 97 + GetImage(ctx context.Context, defaultImage string) string 98 + } 99 + 100 + // deviceWork represents work to be done for a specific device 101 + type deviceWork struct { 102 + device hsmv1alpha1.DiscoveredDevice 103 + agentName string 104 + agentKey string 105 + index int 106 + } 107 + 108 + // NewManager creates a new agent manager 109 + func NewManager(k8sClient client.Client, namespace string, imageResolver ImageResolver) *Manager { 110 + 111 + m := &Manager{ 112 + Client: k8sClient, 113 + AgentNamespace: namespace, 114 + ImageResolver: imageResolver, 115 + activeAgents: make(map[string]*AgentInfo), 116 + // Default production timeouts 117 + WaitTimeout: 60 * time.Second, 118 + WaitPollInterval: 2 * time.Second, 119 + } 120 + 121 + // If no namespace provided, agents will be deployed in the same namespace as their HSMDevice 122 + // AgentNamespace is only used as a fallback now 123 + 124 + return m 125 + } 126 + 127 + // NewTestManager creates a new agent manager optimized for testing 128 + func NewTestManager(k8sClient client.Client, namespace string, imageResolver ImageResolver) *Manager { 129 + m := &Manager{ 130 + Client: k8sClient, 131 + AgentNamespace: namespace, 132 + ImageResolver: imageResolver, 133 + activeAgents: make(map[string]*AgentInfo), 134 + // Fast test timeouts 135 + TestMode: true, 136 + WaitTimeout: 5 * time.Second, 137 + WaitPollInterval: 100 * time.Millisecond, 138 + } 139 + return m 140 + } 141 + 142 + // generateAgentName creates a consistent agent name for an HSM device 143 + func (m *Manager) generateAgentName(hsmPool *hsmv1alpha1.HSMPool) string { 144 + return fmt.Sprintf("%s-%s", AgentNamePrefix, hsmPool.OwnerReferences[0].Name) 145 + } 146 + 147 + // EnsureAgent ensures HSM agents are deployed for all available devices in the pool 148 + func (m *Manager) EnsureAgent(ctx context.Context, hsmPool *hsmv1alpha1.HSMPool) error { 149 + 150 + // Pre-collect available devices to process (no mutex needed) 151 + workItems := make([]deviceWork, 0, len(hsmPool.Status.AggregatedDevices)) 152 + for i, aggregatedDevice := range hsmPool.Status.AggregatedDevices { 153 + if !aggregatedDevice.Available { 154 + continue 155 + } 156 + workItems = append(workItems, deviceWork{ 157 + device: aggregatedDevice, 158 + agentName: fmt.Sprintf("%s-%d", m.generateAgentName(hsmPool), i), 159 + agentKey: aggregatedDevice.SerialNumber, 160 + index: i, 161 + }) 162 + } 163 + 164 + if len(workItems) == 0 { 165 + return nil // No available devices to process 166 + } 167 + 168 + // Process devices in parallel 169 + var wg sync.WaitGroup 170 + errChan := make(chan error, len(workItems)) 171 + 172 + for _, work := range workItems { 173 + wg.Add(1) 174 + go func(w deviceWork) { 175 + defer wg.Done() 176 + 177 + // Mutex-protected check and update of activeAgents 178 + m.mu.Lock() 179 + needsDeployment := false 180 + if agentInfo, exists := m.activeAgents[w.agentKey]; exists { 181 + if !m.isAgentHealthy(ctx, agentInfo) { 182 + m.removeAgentFromTracking(w.agentKey) 183 + needsDeployment = true 184 + } 185 + } else { 186 + needsDeployment = true 187 + } 188 + m.mu.Unlock() 189 + 190 + // Skip if agent is healthy and tracked 191 + if !needsDeployment { 192 + return 193 + } 194 + 195 + // Deploy agent for this device (Kubernetes API calls - no mutex needed) 196 + if err := m.deployAgentForDevice(ctx, w, hsmPool); err != nil { 197 + errChan <- fmt.Errorf("failed to deploy agent %s: %w", w.agentName, err) 198 + } 199 + }(work) 200 + } 201 + 202 + // Wait for all goroutines to complete 203 + wg.Wait() 204 + close(errChan) 205 + 206 + // Collect any errors 207 + deploymentErrors := make([]error, 0, len(workItems)) 208 + for err := range errChan { 209 + deploymentErrors = append(deploymentErrors, err) 210 + } 211 + 212 + if len(deploymentErrors) > 0 { 213 + return fmt.Errorf("agent deployment errors: %v", deploymentErrors) 214 + } 215 + 216 + return nil 217 + } 218 + 219 + // deployAgentForDevice handles the deployment logic for a single device 220 + func (m *Manager) deployAgentForDevice(ctx context.Context, work deviceWork, hsmPool *hsmv1alpha1.HSMPool) error { 221 + // Check if deployment exists in Kubernetes 222 + var deployment appsv1.Deployment 223 + err := m.Get(ctx, types.NamespacedName{ 224 + Name: work.agentName, 225 + Namespace: hsmPool.Namespace, 226 + }, &deployment) 227 + 228 + if err == nil { 229 + // Agent exists, but check if it needs updating (image version, device/node configuration) 230 + needsUpdate, err := m.agentNeedsUpdate(ctx, &deployment, hsmPool) 231 + if err != nil { 232 + return fmt.Errorf("failed to check if agent deployment %s needs update: %w", work.agentName, err) 233 + } 234 + 235 + // Also check device-specific configuration 236 + if !needsUpdate { 237 + needsUpdate = m.deploymentNeedsUpdateForDevice(&deployment, &work.device) 238 + } 239 + 240 + if needsUpdate { 241 + // Delete existing deployment to trigger recreation 242 + if err := m.Delete(ctx, &deployment); err != nil && !errors.IsNotFound(err) { 243 + return fmt.Errorf("failed to delete outdated agent deployment %s: %w", work.agentName, err) 244 + } 245 + } else { 246 + // Agent exists and is correct - wait for it and track it 247 + podIPs, err := m.waitForAgentReady(ctx, work.agentName, hsmPool.Namespace) 248 + if err != nil { 249 + return fmt.Errorf("failed waiting for existing agent pods %s: %w", work.agentName, err) 250 + } 251 + 252 + // Track the existing agent (mutex-protected) 253 + m.mu.Lock() 254 + agentInfo := &AgentInfo{ 255 + PodIPs: podIPs, 256 + CreatedAt: time.Now(), 257 + LastHealthCheck: time.Now(), 258 + Status: AgentStatusReady, 259 + AgentName: work.agentName, 260 + Namespace: hsmPool.Namespace, 261 + } 262 + m.activeAgents[work.agentKey] = agentInfo 263 + m.mu.Unlock() 264 + return nil 265 + } 266 + } else if !errors.IsNotFound(err) { 267 + return fmt.Errorf("failed to check agent deployment %s: %w", work.agentName, err) 268 + } 269 + 270 + // Create agent deployment for this specific device 271 + if err := m.createAgentDeployment(ctx, hsmPool, &work.device, work.agentName); err != nil { 272 + return fmt.Errorf("failed to create agent deployment %s: %w", work.agentName, err) 273 + } 274 + 275 + // Wait for agent pods to be ready and get their IPs 276 + podIPs, err := m.waitForAgentReady(ctx, work.agentName, hsmPool.Namespace) 277 + if err != nil { 278 + return fmt.Errorf("failed waiting for agent pods %s: %w", work.agentName, err) 279 + } 280 + 281 + // Track the new agent (mutex-protected) 282 + m.mu.Lock() 283 + agentInfo := &AgentInfo{ 284 + PodIPs: podIPs, 285 + CreatedAt: time.Now(), 286 + LastHealthCheck: time.Now(), 287 + Status: AgentStatusReady, 288 + AgentName: work.agentName, 289 + Namespace: hsmPool.Namespace, 290 + } 291 + m.activeAgents[work.agentKey] = agentInfo 292 + m.mu.Unlock() 293 + 294 + return nil 295 + } 296 + 297 + // CleanupAgent removes all HSM agents for the given device when no longer needed 298 + func (m *Manager) CleanupAgent(ctx context.Context, hsmDevice *hsmv1alpha1.HSMDevice) error { 299 + m.mu.Lock() 300 + defer m.mu.Unlock() 301 + 302 + // Check if any HSMSecrets still reference this device 303 + var hsmSecretList hsmv1alpha1.HSMSecretList 304 + if err := m.List(ctx, &hsmSecretList); err != nil { 305 + return fmt.Errorf("failed to list HSMSecrets: %w", err) 306 + } 307 + 308 + // In the HSMPool architecture, cleanup should be based on device availability in pool 309 + // rather than individual secret references, since all secrets can use any available device 310 + // Check if there are any active HSMSecrets - if so, keep the agents running 311 + if len(hsmSecretList.Items) > 0 { 312 + return nil 313 + } 314 + 315 + // Get the HSMPool to find all agent deployments to clean up 316 + poolName := hsmDevice.Name + "-pool" 317 + var hsmPool hsmv1alpha1.HSMPool 318 + if err := m.Get(ctx, types.NamespacedName{ 319 + Name: poolName, 320 + Namespace: hsmDevice.Namespace, 321 + }, &hsmPool); err != nil { 322 + // If pool doesn't exist, try to clean up any remaining tracked agents 323 + return m.cleanupTrackedAgents(ctx, hsmDevice) 324 + } 325 + 326 + // Clean up all agent deployments (one per aggregated device) 327 + for i, aggregatedDevice := range hsmPool.Status.AggregatedDevices { 328 + agentName := fmt.Sprintf("%s-%s-%d", AgentNamePrefix, hsmDevice.Name, i) 329 + agentKey := fmt.Sprintf("%s-%s", hsmDevice.Name, aggregatedDevice.SerialNumber) 330 + 331 + // Remove from internal tracking 332 + m.removeAgentFromTracking(agentKey) 333 + 334 + // Delete deployment 335 + deployment := &appsv1.Deployment{ 336 + ObjectMeta: metav1.ObjectMeta{ 337 + Name: agentName, 338 + Namespace: hsmDevice.Namespace, 339 + }, 340 + } 341 + if err := m.Delete(ctx, deployment); err != nil && !errors.IsNotFound(err) { 342 + return fmt.Errorf("failed to delete agent deployment %s: %w", agentName, err) 343 + } 344 + } 345 + 346 + return nil 347 + } 348 + 349 + // cleanupTrackedAgents cleans up any remaining tracked agents when HSMPool is not available 350 + func (m *Manager) cleanupTrackedAgents(ctx context.Context, hsmDevice *hsmv1alpha1.HSMDevice) error { 351 + // Find all tracked agents for this device 352 + var agentsToCleanup []string 353 + devicePrefix := hsmDevice.Name + "-" 354 + 355 + for agentKey := range m.activeAgents { 356 + if strings.HasPrefix(agentKey, devicePrefix) { 357 + agentsToCleanup = append(agentsToCleanup, agentKey) 358 + } 359 + } 360 + 361 + // Clean up each tracked agent 362 + for _, agentKey := range agentsToCleanup { 363 + agentInfo := m.activeAgents[agentKey] 364 + 365 + // Remove from tracking 366 + m.removeAgentFromTracking(agentKey) 367 + 368 + // Delete deployment 369 + deployment := &appsv1.Deployment{ 370 + ObjectMeta: metav1.ObjectMeta{ 371 + Name: agentInfo.AgentName, 372 + Namespace: agentInfo.Namespace, 373 + }, 374 + } 375 + if err := m.Delete(ctx, deployment); err != nil && !errors.IsNotFound(err) { 376 + return fmt.Errorf("failed to delete agent deployment %s: %w", agentInfo.AgentName, err) 377 + } 378 + } 379 + 380 + return nil 381 + } 382 + 383 + // createAgentDeployment creates the HSM agent deployment for a specific device 384 + func (m *Manager) createAgentDeployment(ctx context.Context, hsmPool *hsmv1alpha1.HSMPool, specificDevice *hsmv1alpha1.DiscoveredDevice, customAgentName string) error { 385 + if specificDevice == nil { 386 + return fmt.Errorf("specificDevice is required") 387 + } 388 + 389 + var agentName string 390 + if customAgentName != "" { 391 + agentName = customAgentName 392 + } else { 393 + agentName = m.generateAgentName(hsmPool) 394 + } 395 + 396 + targetNode := specificDevice.NodeName 397 + devicePath := specificDevice.DevicePath 398 + deviceName := hsmPool.OwnerReferences[0].Name 399 + 400 + // Get discovery image from environment, manager image, or use default 401 + agentImage := m.ImageResolver.GetImage(ctx, "AGENT_IMAGE") 402 + 403 + deployment := &appsv1.Deployment{ 404 + ObjectMeta: metav1.ObjectMeta{ 405 + Name: agentName, 406 + Namespace: hsmPool.Namespace, 407 + Labels: map[string]string{ 408 + "app": agentName, 409 + "app.kubernetes.io/component": "hsm-agent", 410 + "app.kubernetes.io/instance": agentName, 411 + "app.kubernetes.io/name": "hsm-agent", 412 + "app.kubernetes.io/part-of": "hsm-secrets-operator", 413 + "hsm.j5t.io/device": deviceName, 414 + "hsm.j5t.io/serial-number": specificDevice.SerialNumber, 415 + "hsm.j5t.io/device-path": sanitizeLabelValue(specificDevice.DevicePath), 416 + }, 417 + }, 418 + Spec: appsv1.DeploymentSpec{ 419 + Replicas: int32Ptr(1), 420 + Selector: &metav1.LabelSelector{ 421 + MatchLabels: map[string]string{ 422 + "app": agentName, 423 + }, 424 + }, 425 + Template: corev1.PodTemplateSpec{ 426 + ObjectMeta: metav1.ObjectMeta{ 427 + Labels: map[string]string{ 428 + "app": agentName, 429 + "app.kubernetes.io/component": "hsm-agent", 430 + "app.kubernetes.io/instance": agentName, 431 + "app.kubernetes.io/name": "hsm-agent", 432 + "app.kubernetes.io/part-of": "hsm-secrets-operator", 433 + "hsm.j5t.io/device": deviceName, 434 + "hsm.j5t.io/serial-number": specificDevice.SerialNumber, 435 + "hsm.j5t.io/device-path": sanitizeLabelValue(specificDevice.DevicePath), 436 + }, 437 + }, 438 + Spec: corev1.PodSpec{ 439 + // Pin to the specific node with the HSM device 440 + NodeSelector: map[string]string{ 441 + "kubernetes.io/hostname": targetNode, 442 + }, 443 + // Affinity for better scheduling 444 + Affinity: &corev1.Affinity{ 445 + NodeAffinity: &corev1.NodeAffinity{ 446 + RequiredDuringSchedulingIgnoredDuringExecution: &corev1.NodeSelector{ 447 + NodeSelectorTerms: []corev1.NodeSelectorTerm{ 448 + { 449 + MatchExpressions: []corev1.NodeSelectorRequirement{ 450 + { 451 + Key: "kubernetes.io/hostname", 452 + Operator: corev1.NodeSelectorOpIn, 453 + Values: []string{targetNode}, 454 + }, 455 + }, 456 + }, 457 + }, 458 + }, 459 + }, 460 + }, 461 + SecurityContext: &corev1.PodSecurityContext{ 462 + RunAsUser: int64Ptr(0), 463 + RunAsGroup: int64Ptr(0), 464 + RunAsNonRoot: boolPtr(false), 465 + }, 466 + ServiceAccountName: "hsm-secrets-operator", 467 + Containers: []corev1.Container{ 468 + { 469 + Name: "agent", 470 + Image: agentImage, 471 + Command: []string{ 472 + "/entrypoint.sh", 473 + "agent", 474 + }, 475 + Args: []string{ 476 + "--device-name=" + deviceName, 477 + "--port=" + fmt.Sprintf("%d", AgentPort), 478 + "--health-port=" + fmt.Sprintf("%d", AgentHealthPort), 479 + }, 480 + Env: func() []corev1.EnvVar { 481 + env, err := m.buildAgentEnv(ctx, hsmPool) 482 + if err != nil { 483 + // Log error but continue with empty env to avoid breaking deployment creation 484 + return []corev1.EnvVar{} 485 + } 486 + return env 487 + }(), 488 + Ports: []corev1.ContainerPort{ 489 + { 490 + Name: "grpc", 491 + ContainerPort: AgentPort, 492 + Protocol: corev1.ProtocolTCP, 493 + }, 494 + { 495 + Name: "health", 496 + ContainerPort: AgentHealthPort, 497 + Protocol: corev1.ProtocolTCP, 498 + }, 499 + }, 500 + LivenessProbe: &corev1.Probe{ 501 + ProbeHandler: corev1.ProbeHandler{ 502 + HTTPGet: &corev1.HTTPGetAction{ 503 + Path: "/healthz", 504 + Port: intstr.FromInt(AgentHealthPort), 505 + }, 506 + }, 507 + InitialDelaySeconds: 15, 508 + PeriodSeconds: 20, 509 + }, 510 + ReadinessProbe: &corev1.Probe{ 511 + ProbeHandler: corev1.ProbeHandler{ 512 + HTTPGet: &corev1.HTTPGetAction{ 513 + Path: "/readyz", 514 + Port: intstr.FromInt(AgentHealthPort), 515 + }, 516 + }, 517 + InitialDelaySeconds: 5, 518 + PeriodSeconds: 10, 519 + }, 520 + Resources: corev1.ResourceRequirements{ 521 + Requests: corev1.ResourceList{ 522 + corev1.ResourceCPU: resourceQuantity("100m"), 523 + corev1.ResourceMemory: resourceQuantity("128Mi"), 524 + }, 525 + Limits: corev1.ResourceList{ 526 + corev1.ResourceCPU: resourceQuantity("500m"), 527 + corev1.ResourceMemory: resourceQuantity("256Mi"), 528 + }, 529 + }, 530 + SecurityContext: &corev1.SecurityContext{ 531 + Privileged: boolPtr(true), 532 + AllowPrivilegeEscalation: boolPtr(true), 533 + Capabilities: &corev1.Capabilities{ 534 + Drop: []corev1.Capability{}, 535 + Add: []corev1.Capability{ 536 + "SYS_ADMIN", 537 + }, 538 + }, 539 + ReadOnlyRootFilesystem: boolPtr(false), 540 + RunAsNonRoot: boolPtr(false), 541 + RunAsUser: int64Ptr(0), 542 + }, 543 + VolumeMounts: m.buildAgentVolumeMounts(), 544 + }, 545 + }, 546 + Volumes: m.buildAgentVolumes(devicePath), 547 + }, 548 + }, 549 + }, 550 + } 551 + 552 + return m.Create(ctx, deployment) 553 + } 554 + 555 + // buildAgentEnv builds environment variables for the HSM agent 556 + func (m *Manager) buildAgentEnv(ctx context.Context, hsmPool *hsmv1alpha1.HSMPool) ([]corev1.EnvVar, error) { 557 + // Get HSMDevice from owner reference 558 + deviceName := hsmPool.OwnerReferences[0].Name 559 + var hsmDevice hsmv1alpha1.HSMDevice 560 + if err := m.Get(ctx, types.NamespacedName{ 561 + Name: deviceName, 562 + Namespace: hsmPool.Namespace, 563 + }, &hsmDevice); err != nil { 564 + return nil, fmt.Errorf("failed to get HSMDevice %s: %w", deviceName, err) 565 + } 566 + env := []corev1.EnvVar{ 567 + { 568 + Name: "HSM_DEVICE_NAME", 569 + Value: hsmDevice.Name, 570 + }, 571 + { 572 + Name: "HSM_DEVICE_TYPE", 573 + Value: string(hsmDevice.Spec.DeviceType), 574 + }, 575 + } 576 + 577 + // Add PKCS#11 configuration if available 578 + if hsmDevice.Spec.PKCS11 != nil { 579 + env = append(env, []corev1.EnvVar{ 580 + { 581 + Name: "PKCS11_LIBRARY_PATH", 582 + Value: hsmDevice.Spec.PKCS11.LibraryPath, 583 + }, 584 + { 585 + Name: "PKCS11_SLOT_ID", 586 + Value: fmt.Sprintf("%d", hsmDevice.Spec.PKCS11.SlotId), 587 + }, 588 + { 589 + Name: "PKCS11_TOKEN_LABEL", 590 + Value: hsmDevice.Spec.PKCS11.TokenLabel, 591 + }, 592 + }...) 593 + 594 + // Add PIN from secret if configured 595 + if hsmDevice.Spec.PKCS11.PinSecret != nil { 596 + env = append(env, corev1.EnvVar{ 597 + Name: "PKCS11_PIN", 598 + ValueFrom: &corev1.EnvVarSource{ 599 + SecretKeyRef: &corev1.SecretKeySelector{ 600 + LocalObjectReference: corev1.LocalObjectReference{ 601 + Name: hsmDevice.Spec.PKCS11.PinSecret.Name, 602 + }, 603 + Key: hsmDevice.Spec.PKCS11.PinSecret.Key, 604 + }, 605 + }, 606 + }) 607 + } 608 + } 609 + 610 + return env, nil 611 + } 612 + 613 + // buildAgentVolumeMounts builds volume mounts for the HSM agent 614 + func (m *Manager) buildAgentVolumeMounts() []corev1.VolumeMount { 615 + return []corev1.VolumeMount{ 616 + { 617 + Name: "tmp", 618 + MountPath: "/tmp", 619 + }, 620 + { 621 + Name: "hsm-device", 622 + MountPath: "/dev/hsm", 623 + }, 624 + } 625 + } 626 + 627 + // buildAgentVolumes builds volumes for the HSM agent 628 + func (m *Manager) buildAgentVolumes(devicePath string) []corev1.Volume { 629 + return []corev1.Volume{ 630 + { 631 + Name: "tmp", 632 + VolumeSource: corev1.VolumeSource{ 633 + EmptyDir: &corev1.EmptyDirVolumeSource{}, 634 + }, 635 + }, 636 + { 637 + Name: "hsm-device", 638 + VolumeSource: corev1.VolumeSource{ 639 + HostPath: &corev1.HostPathVolumeSource{ 640 + Path: devicePath, 641 + Type: hostPathTypePtr(corev1.HostPathCharDev), 642 + }, 643 + }, 644 + }, 645 + } 646 + } 647 + 648 + // agentNeedsUpdate checks if the agent deployment needs to be updated due to device path or image changes 649 + func (m *Manager) agentNeedsUpdate(ctx context.Context, deployment *appsv1.Deployment, hsmPool *hsmv1alpha1.HSMPool) (bool, error) { 650 + if hsmPool == nil { 651 + return false, nil // No pool available, no update needed 652 + } 653 + // Check if container image needs updating 654 + if len(deployment.Spec.Template.Spec.Containers) == 0 { 655 + return false, fmt.Errorf("deployment has no containers") 656 + } 657 + 658 + container := deployment.Spec.Template.Spec.Containers[0] 659 + currentImage := container.Image 660 + 661 + // Check if image has changed (only if ImageResolver is available) 662 + if m.ImageResolver != nil { 663 + expectedImage := m.ImageResolver.GetImage(ctx, "AGENT_IMAGE") 664 + if currentImage != expectedImage { 665 + // Image has changed, need to update 666 + return true, nil 667 + } 668 + } 669 + 670 + // Extract current volume mounts from deployment 671 + currentDeviceMounts := make(map[string]string) // mount name -> device path 672 + 673 + for _, mount := range container.VolumeMounts { 674 + if mount.Name == "hsm-device" { 675 + // Find corresponding volume 676 + for _, vol := range deployment.Spec.Template.Spec.Volumes { 677 + if vol.Name == mount.Name && vol.HostPath != nil { 678 + currentDeviceMounts[mount.Name] = vol.HostPath.Path 679 + break 680 + } 681 + } 682 + } 683 + } 684 + 685 + // Check if any device paths in the pool differ from current mounts 686 + for _, device := range hsmPool.Status.AggregatedDevices { 687 + if device.DevicePath != "" && device.Available { 688 + // Check if this device path is already mounted 689 + found := false 690 + for _, path := range currentDeviceMounts { 691 + if path == device.DevicePath { 692 + found = true 693 + break 694 + } 695 + } 696 + if !found { 697 + // New device path found that's not in current deployment 698 + return true, nil 699 + } 700 + } 701 + } 702 + 703 + // Check for stale device paths (mounted paths that are no longer in aggregated devices) 704 + for _, currentPath := range currentDeviceMounts { 705 + found := false 706 + for _, device := range hsmPool.Status.AggregatedDevices { 707 + if device.DevicePath == currentPath && device.Available { 708 + found = true 709 + break 710 + } 711 + } 712 + if !found { 713 + // Current mount points to a device path that's no longer available 714 + return true, nil 715 + } 716 + } 717 + 718 + return false, nil 719 + } 720 + 721 + // deploymentNeedsUpdateForDevice checks if a deployment needs to be updated for a specific device 722 + // This is a simplified check that only validates device-specific configuration 723 + func (m *Manager) deploymentNeedsUpdateForDevice(deployment *appsv1.Deployment, aggregatedDevice *hsmv1alpha1.DiscoveredDevice) bool { 724 + // Check node affinity - ensure agent is pinned to the correct node 725 + if deployment.Spec.Template.Spec.Affinity == nil || 726 + deployment.Spec.Template.Spec.Affinity.NodeAffinity == nil || 727 + deployment.Spec.Template.Spec.Affinity.NodeAffinity.RequiredDuringSchedulingIgnoredDuringExecution == nil { 728 + return true // Missing required node affinity 729 + } 730 + 731 + // Check if the node name matches the aggregated device's node 732 + nodeSelector := deployment.Spec.Template.Spec.Affinity.NodeAffinity.RequiredDuringSchedulingIgnoredDuringExecution 733 + if len(nodeSelector.NodeSelectorTerms) == 0 { 734 + return true 735 + } 736 + 737 + // Check if hostname requirement matches the device's node 738 + nodeMatches := false 739 + for _, term := range nodeSelector.NodeSelectorTerms { 740 + for _, expr := range term.MatchExpressions { 741 + if expr.Key == "kubernetes.io/hostname" && expr.Operator == corev1.NodeSelectorOpIn { 742 + if slices.Contains(expr.Values, aggregatedDevice.NodeName) { 743 + nodeMatches = true 744 + } 745 + } 746 + } 747 + } 748 + 749 + if !nodeMatches { 750 + return true // Node doesn't match 751 + } 752 + 753 + // Check device path in volume mounts 754 + for _, vol := range deployment.Spec.Template.Spec.Volumes { 755 + if vol.Name == "hsm-device" && vol.HostPath != nil { 756 + if vol.HostPath.Path != aggregatedDevice.DevicePath { 757 + return true // Device path changed 758 + } 759 + } 760 + } 761 + 762 + return false 763 + } 764 + 765 + // Helper functions 766 + // waitForAgentReady waits for agent pods to be ready and returns their IPs 767 + func (m *Manager) waitForAgentReady(ctx context.Context, agentName, namespace string) ([]string, error) { 768 + // In test mode, simulate immediate readiness for faster tests 769 + if m.TestMode { 770 + return []string{"127.0.0.1"}, nil 771 + } 772 + 773 + timeout := time.After(m.WaitTimeout) 774 + ticker := time.NewTicker(m.WaitPollInterval) 775 + defer ticker.Stop() 776 + 777 + for { 778 + select { 779 + case <-timeout: 780 + return nil, fmt.Errorf("timeout waiting for agent pods to be ready after %v", m.WaitTimeout) 781 + case <-ticker.C: 782 + pods := &corev1.PodList{} 783 + err := m.List(ctx, pods, 784 + client.InNamespace(namespace), 785 + client.MatchingLabels{"app": agentName}, 786 + ) 787 + if err != nil { 788 + continue 789 + } 790 + 791 + var readyPodIPs []string 792 + for _, pod := range pods.Items { 793 + if pod.Status.Phase == corev1.PodRunning && 794 + len(pod.Status.PodIP) > 0 { 795 + // Check if all containers are ready 796 + allReady := true 797 + for _, condition := range pod.Status.Conditions { 798 + if condition.Type == corev1.PodReady { 799 + allReady = condition.Status == corev1.ConditionTrue 800 + break 801 + } 802 + } 803 + if allReady { 804 + readyPodIPs = append(readyPodIPs, pod.Status.PodIP) 805 + } 806 + } 807 + } 808 + 809 + if len(readyPodIPs) > 0 { 810 + return readyPodIPs, nil 811 + } 812 + } 813 + } 814 + } 815 + 816 + // GetAgentInfo returns the AgentInfo for a device 817 + func (m *Manager) GetAgentInfo(deviceName string) (*AgentInfo, bool) { 818 + m.mu.RLock() 819 + defer m.mu.RUnlock() 820 + 821 + agentInfo, exists := m.activeAgents[deviceName] 822 + return agentInfo, exists 823 + } 824 + 825 + // removeAgentFromTracking removes an agent from internal tracking 826 + func (m *Manager) removeAgentFromTracking(deviceName string) { 827 + delete(m.activeAgents, deviceName) 828 + } 829 + 830 + // RemoveAgentFromTracking removes an agent from tracking (public method for testing) 831 + func (m *Manager) RemoveAgentFromTracking(deviceName string) { 832 + m.mu.Lock() 833 + defer m.mu.Unlock() 834 + m.removeAgentFromTracking(deviceName) 835 + } 836 + 837 + // SetAgentInfo sets agent information for testing 838 + func (m *Manager) SetAgentInfo(deviceName string, info *AgentInfo) { 839 + m.mu.Lock() 840 + defer m.mu.Unlock() 841 + 842 + m.activeAgents[deviceName] = info 843 + } 844 + 845 + // isAgentHealthy checks if an agent is healthy by verifying pod IPs 846 + func (m *Manager) isAgentHealthy(ctx context.Context, agentInfo *AgentInfo) bool { 847 + // Simple health check: ensure pod IPs are still valid 848 + // In the future, we can add gRPC health checks here 849 + if len(agentInfo.PodIPs) == 0 { 850 + return false 851 + } 852 + 853 + // Check if pods still exist and are running 854 + pods := &corev1.PodList{} 855 + err := m.List(ctx, pods, 856 + client.InNamespace(agentInfo.Namespace), 857 + client.MatchingLabels{"app": agentInfo.AgentName}, 858 + ) 859 + if err != nil { 860 + return false 861 + } 862 + 863 + runningPods := 0 864 + for _, pod := range pods.Items { 865 + if pod.Status.Phase == corev1.PodRunning { 866 + runningPods++ 867 + } 868 + } 869 + 870 + return runningPods > 0 871 + } 872 + 873 + // GetAgentPodIPs returns all agent pod IPs for a device type from HSMPool 874 + func (m *Manager) GetAgentPodIPs(ctx context.Context, deviceName, namespace string) ([]string, error) { 875 + // Get HSMPool for this device 876 + poolName := deviceName + "-pool" 877 + var hsmPool hsmv1alpha1.HSMPool 878 + if err := m.Get(ctx, types.NamespacedName{ 879 + Name: poolName, 880 + Namespace: namespace, 881 + }, &hsmPool); err != nil { 882 + return nil, fmt.Errorf("failed to get HSMPool %s: %w", poolName, err) 883 + } 884 + 885 + m.mu.RLock() 886 + defer m.mu.RUnlock() 887 + 888 + var allPodIPs []string 889 + 890 + // Collect pod IPs from all agent instances for this device 891 + for _, aggregatedDevice := range hsmPool.Status.AggregatedDevices { 892 + agentKey := fmt.Sprintf("%s-%s", deviceName, aggregatedDevice.SerialNumber) 893 + if agentInfo, exists := m.activeAgents[agentKey]; exists && len(agentInfo.PodIPs) > 0 { 894 + allPodIPs = append(allPodIPs, agentInfo.PodIPs...) 895 + } 896 + } 897 + 898 + if len(allPodIPs) == 0 { 899 + return nil, fmt.Errorf("no active agents found for device %s in pool %s", deviceName, poolName) 900 + } 901 + 902 + return allPodIPs, nil 903 + } 904 + 905 + // sanitizeLabelValue sanitizes a string to be a valid Kubernetes label value 906 + // Kubernetes labels must be alphanumeric, '-', '_', or '.' and start/end with alphanumeric 907 + func sanitizeLabelValue(value string) string { 908 + if len(value) == 0 { 909 + return value 910 + } 911 + 912 + // Replace invalid characters with dashes 913 + sanitized := strings.Map(func(r rune) rune { 914 + if (r >= 'A' && r <= 'Z') || (r >= 'a' && r <= 'z') || (r >= '0' && r <= '9') || r == '-' || r == '_' || r == '.' { 915 + return r 916 + } 917 + return '-' 918 + }, value) 919 + 920 + // Ensure starts and ends with alphanumeric 921 + sanitized = strings.TrimFunc(sanitized, func(r rune) bool { 922 + return (r < 'A' || r > 'Z') && (r < 'a' || r > 'z') && (r < '0' || r > '9') 923 + }) 924 + 925 + // Kubernetes label values have a 63 character limit 926 + if len(sanitized) > 63 { 927 + sanitized = sanitized[:63] 928 + // Re-trim end if we cut off at a non-alphanumeric 929 + sanitized = strings.TrimFunc(sanitized, func(r rune) bool { 930 + return (r < 'A' || r > 'Z') && (r < 'a' || r > 'z') && (r < '0' || r > '9') 931 + }) 932 + } 933 + 934 + return sanitized 935 + } 936 + 937 + // GetGRPCEndpoints returns gRPC endpoints for all agent pods of a device 938 + func (m *Manager) GetGRPCEndpoints(ctx context.Context, deviceName, namespace string) ([]string, error) { 939 + podIPs, err := m.GetAgentPodIPs(ctx, deviceName, namespace) 940 + if err != nil { 941 + return nil, err 942 + } 943 + 944 + endpoints := make([]string, 0, len(podIPs)) 945 + for _, ip := range podIPs { 946 + endpoints = append(endpoints, fmt.Sprintf("%s:%d", ip, AgentPort)) 947 + } 948 + 949 + return endpoints, nil 950 + } 951 + 952 + // CreateGRPCClient creates a gRPC client for the first available agent pod of a device 953 + func (m *Manager) CreateGRPCClient(ctx context.Context, deviceName, namespace string, logger logr.Logger) (hsm.Client, error) { 954 + endpoints, err := m.GetGRPCEndpoints(ctx, deviceName, namespace) 955 + if err != nil { 956 + return nil, err 957 + } 958 + 959 + if len(endpoints) == 0 { 960 + return nil, fmt.Errorf("no agent endpoints available for device %s", deviceName) 961 + } 962 + 963 + // Use the first endpoint for single client 964 + grpcClient, err := NewGRPCClient(endpoints[0], deviceName, logger) 965 + if err != nil { 966 + return nil, fmt.Errorf("failed to create gRPC client for %s: %w", endpoints[0], err) 967 + } 968 + 969 + // Test the connection 970 + if err := grpcClient.Initialize(ctx, hsm.Config{}); err != nil { 971 + if err := grpcClient.Close(); err != nil { 972 + logger.Error(err, "Failed to close gRPC client after failed initialization") 973 + } 974 + return nil, fmt.Errorf("failed to initialize gRPC client for %s: %w", endpoints[0], err) 975 + } 976 + 977 + return grpcClient, nil 978 + } 979 + 980 + // GetAvailableDevices finds all devices with ready HSMPools and active agents 981 + func (m *Manager) GetAvailableDevices(ctx context.Context, namespace string) ([]string, error) { 982 + // List all HSMPools to find all with active agents 983 + var hsmPoolList hsmv1alpha1.HSMPoolList 984 + if err := m.List(ctx, &hsmPoolList, client.InNamespace(namespace)); err != nil { 985 + return nil, fmt.Errorf("failed to list HSM pools: %w", err) 986 + } 987 + 988 + var availableDevices []string 989 + // Check all pools that have active agents 990 + for _, pool := range hsmPoolList.Items { 991 + if pool.Status.Phase != hsmv1alpha1.HSMPoolPhaseReady { 992 + continue 993 + } 994 + 995 + // Extract device name from pool name (remove "-pool" suffix) 996 + deviceName := strings.TrimSuffix(pool.Name, "-pool") 997 + 998 + if podIPs, err := m.GetAgentPodIPs(ctx, deviceName, namespace); err == nil && len(podIPs) > 0 { 999 + availableDevices = append(availableDevices, deviceName) 1000 + } 1001 + } 1002 + 1003 + if len(availableDevices) == 0 { 1004 + return nil, fmt.Errorf("no available HSM agents found") 1005 + } 1006 + 1007 + return availableDevices, nil 1008 + } 1009 + 1010 + func int32Ptr(i int32) *int32 { 1011 + return &i 1012 + } 1013 + 1014 + func int64Ptr(i int64) *int64 { 1015 + return &i 1016 + } 1017 + 1018 + func boolPtr(b bool) *bool { 1019 + return &b 1020 + } 1021 + 1022 + func hostPathTypePtr(t corev1.HostPathType) *corev1.HostPathType { 1023 + return &t 1024 + } 1025 + 1026 + func resourceQuantity(s string) resource.Quantity { 1027 + q, _ := resource.ParseQuantity(s) 1028 + return q 1029 + }
+497
internal/agent/manager_test.go
··· 1 + /* 2 + Copyright 2025. 3 + 4 + Licensed under the Apache License, Version 2.0 (the "License"); 5 + you may not use this file except in compliance with the License. 6 + You may obtain a copy of the License at 7 + 8 + http://www.apache.org/licenses/LICENSE-2.0 9 + 10 + Unless required by applicable law or agreed to in writing, software 11 + distributed under the License is distributed on an "AS IS" BASIS, 12 + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 + See the License for the specific language governing permissions and 14 + limitations under the License. 15 + */ 16 + 17 + package agent 18 + 19 + import ( 20 + "context" 21 + "testing" 22 + 23 + "github.com/stretchr/testify/assert" 24 + "github.com/stretchr/testify/require" 25 + appsv1 "k8s.io/api/apps/v1" 26 + corev1 "k8s.io/api/core/v1" 27 + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" 28 + "k8s.io/apimachinery/pkg/runtime" 29 + "sigs.k8s.io/controller-runtime/pkg/client/fake" 30 + 31 + hsmv1alpha1 "github.com/evanjarrett/hsm-secrets-operator/api/v1alpha1" 32 + ) 33 + 34 + func TestAgentNeedsUpdate(t *testing.T) { 35 + scheme := runtime.NewScheme() 36 + require.NoError(t, hsmv1alpha1.AddToScheme(scheme)) 37 + require.NoError(t, appsv1.AddToScheme(scheme)) 38 + require.NoError(t, corev1.AddToScheme(scheme)) 39 + 40 + tests := []struct { 41 + name string 42 + deployment *appsv1.Deployment 43 + hsmDevice *hsmv1alpha1.HSMDevice 44 + hsmPool *hsmv1alpha1.HSMPool 45 + expectedUpdate bool 46 + expectError bool 47 + }{ 48 + { 49 + name: "no update needed - same device path", 50 + deployment: &appsv1.Deployment{ 51 + ObjectMeta: metav1.ObjectMeta{ 52 + Name: "test-agent", 53 + Namespace: "default", 54 + }, 55 + Spec: appsv1.DeploymentSpec{ 56 + Template: corev1.PodTemplateSpec{ 57 + Spec: corev1.PodSpec{ 58 + Containers: []corev1.Container{ 59 + { 60 + Name: "agent", 61 + VolumeMounts: []corev1.VolumeMount{ 62 + { 63 + Name: "hsm-device", 64 + MountPath: "/dev/hsm", 65 + }, 66 + }, 67 + }, 68 + }, 69 + Volumes: []corev1.Volume{ 70 + { 71 + Name: "hsm-device", 72 + VolumeSource: corev1.VolumeSource{ 73 + HostPath: &corev1.HostPathVolumeSource{ 74 + Path: "/dev/bus/usb/001/015", 75 + }, 76 + }, 77 + }, 78 + }, 79 + }, 80 + }, 81 + }, 82 + }, 83 + hsmDevice: &hsmv1alpha1.HSMDevice{ 84 + ObjectMeta: metav1.ObjectMeta{ 85 + Name: "test-device", 86 + Namespace: "default", 87 + }, 88 + }, 89 + hsmPool: &hsmv1alpha1.HSMPool{ 90 + ObjectMeta: metav1.ObjectMeta{ 91 + Name: "test-device-pool", 92 + Namespace: "default", 93 + }, 94 + Status: hsmv1alpha1.HSMPoolStatus{ 95 + AggregatedDevices: []hsmv1alpha1.DiscoveredDevice{ 96 + { 97 + DevicePath: "/dev/bus/usb/001/015", 98 + Available: true, 99 + }, 100 + }, 101 + }, 102 + }, 103 + expectedUpdate: false, 104 + expectError: false, 105 + }, 106 + { 107 + name: "update needed - device path changed", 108 + deployment: &appsv1.Deployment{ 109 + ObjectMeta: metav1.ObjectMeta{ 110 + Name: "test-agent", 111 + Namespace: "default", 112 + }, 113 + Spec: appsv1.DeploymentSpec{ 114 + Template: corev1.PodTemplateSpec{ 115 + Spec: corev1.PodSpec{ 116 + Containers: []corev1.Container{ 117 + { 118 + Name: "agent", 119 + VolumeMounts: []corev1.VolumeMount{ 120 + { 121 + Name: "hsm-device", 122 + MountPath: "/dev/hsm", 123 + }, 124 + }, 125 + }, 126 + }, 127 + Volumes: []corev1.Volume{ 128 + { 129 + Name: "hsm-device", 130 + VolumeSource: corev1.VolumeSource{ 131 + HostPath: &corev1.HostPathVolumeSource{ 132 + Path: "/dev/bus/usb/001/015", // Old path 133 + }, 134 + }, 135 + }, 136 + }, 137 + }, 138 + }, 139 + }, 140 + }, 141 + hsmDevice: &hsmv1alpha1.HSMDevice{ 142 + ObjectMeta: metav1.ObjectMeta{ 143 + Name: "test-device", 144 + Namespace: "default", 145 + }, 146 + }, 147 + hsmPool: &hsmv1alpha1.HSMPool{ 148 + ObjectMeta: metav1.ObjectMeta{ 149 + Name: "test-device-pool", 150 + Namespace: "default", 151 + }, 152 + Status: hsmv1alpha1.HSMPoolStatus{ 153 + AggregatedDevices: []hsmv1alpha1.DiscoveredDevice{ 154 + { 155 + DevicePath: "/dev/bus/usb/001/016", // New path 156 + Available: true, 157 + }, 158 + }, 159 + }, 160 + }, 161 + expectedUpdate: true, 162 + expectError: false, 163 + }, 164 + { 165 + name: "no update needed - pool not found", 166 + deployment: &appsv1.Deployment{ 167 + ObjectMeta: metav1.ObjectMeta{ 168 + Name: "test-agent", 169 + Namespace: "default", 170 + }, 171 + Spec: appsv1.DeploymentSpec{ 172 + Template: corev1.PodTemplateSpec{ 173 + Spec: corev1.PodSpec{ 174 + Containers: []corev1.Container{ 175 + { 176 + Name: "agent", 177 + }, 178 + }, 179 + }, 180 + }, 181 + }, 182 + }, 183 + hsmDevice: &hsmv1alpha1.HSMDevice{ 184 + ObjectMeta: metav1.ObjectMeta{ 185 + Name: "test-device", 186 + Namespace: "default", 187 + }, 188 + }, 189 + // No HSMPool object created 190 + expectedUpdate: false, 191 + expectError: false, 192 + }, 193 + } 194 + 195 + for _, tt := range tests { 196 + t.Run(tt.name, func(t *testing.T) { 197 + ctx := context.Background() 198 + 199 + // Create fake client with objects 200 + objs := []runtime.Object{tt.hsmDevice} 201 + if tt.hsmPool != nil { 202 + objs = append(objs, tt.hsmPool) 203 + } 204 + 205 + fakeClient := fake.NewClientBuilder(). 206 + WithScheme(scheme). 207 + WithRuntimeObjects(objs...). 208 + Build() 209 + 210 + manager := &Manager{ 211 + Client: fakeClient, 212 + AgentImage: "test-image", 213 + } 214 + 215 + needsUpdate, err := manager.agentNeedsUpdate(ctx, tt.deployment, tt.hsmPool) 216 + 217 + if tt.expectError { 218 + assert.Error(t, err) 219 + } else { 220 + assert.NoError(t, err) 221 + assert.Equal(t, tt.expectedUpdate, needsUpdate) 222 + } 223 + }) 224 + } 225 + } 226 + 227 + func TestAgentTracking(t *testing.T) { 228 + scheme := runtime.NewScheme() 229 + require.NoError(t, hsmv1alpha1.AddToScheme(scheme)) 230 + require.NoError(t, appsv1.AddToScheme(scheme)) 231 + require.NoError(t, corev1.AddToScheme(scheme)) 232 + 233 + t.Run("GetAgentInfo - agent exists", func(t *testing.T) { 234 + fakeClient := fake.NewClientBuilder().WithScheme(scheme).Build() 235 + manager := NewManager(fakeClient, "test-namespace", nil) 236 + 237 + // Add agent to tracking 238 + agentInfo := &AgentInfo{ 239 + PodIPs: []string{"10.1.1.5", "10.1.1.6"}, 240 + Status: AgentStatusReady, 241 + AgentName: "hsm-agent-test-device", 242 + Namespace: "default", 243 + } 244 + manager.activeAgents["test-device"] = agentInfo 245 + 246 + // Test retrieval 247 + retrieved, exists := manager.GetAgentInfo("test-device") 248 + assert.True(t, exists) 249 + assert.Equal(t, agentInfo, retrieved) 250 + }) 251 + 252 + t.Run("GetAgentInfo - agent does not exist", func(t *testing.T) { 253 + fakeClient := fake.NewClientBuilder().WithScheme(scheme).Build() 254 + manager := NewManager(fakeClient, "test-namespace", nil) 255 + 256 + retrieved, exists := manager.GetAgentInfo("nonexistent-device") 257 + assert.False(t, exists) 258 + assert.Nil(t, retrieved) 259 + }) 260 + 261 + // GetAgentPodIPs tests removed - function now uses HSMPool-based lookup 262 + 263 + // GetGRPCEndpoints tests removed - function now uses HSMPool-based lookup 264 + 265 + t.Run("removeAgentFromTracking", func(t *testing.T) { 266 + fakeClient := fake.NewClientBuilder().WithScheme(scheme).Build() 267 + manager := NewManager(fakeClient, "test-namespace", nil) 268 + 269 + // Add agent to tracking 270 + agentInfo := &AgentInfo{ 271 + PodIPs: []string{"10.1.1.5"}, 272 + Status: AgentStatusReady, 273 + } 274 + manager.activeAgents["test-device"] = agentInfo 275 + 276 + // Verify it exists 277 + _, exists := manager.GetAgentInfo("test-device") 278 + assert.True(t, exists) 279 + 280 + // Remove it 281 + manager.removeAgentFromTracking("test-device") 282 + 283 + // Verify it's gone 284 + _, exists = manager.GetAgentInfo("test-device") 285 + assert.False(t, exists) 286 + }) 287 + } 288 + 289 + func TestGetAvailableDevices(t *testing.T) { 290 + scheme := runtime.NewScheme() 291 + require.NoError(t, hsmv1alpha1.AddToScheme(scheme)) 292 + require.NoError(t, appsv1.AddToScheme(scheme)) 293 + require.NoError(t, corev1.AddToScheme(scheme)) 294 + 295 + tests := []struct { 296 + name string 297 + hsmPools []*hsmv1alpha1.HSMPool 298 + agentPods []*corev1.Pod 299 + expectedDevices []string 300 + expectError bool 301 + }{ 302 + { 303 + name: "ready pool but no active agents", 304 + hsmPools: []*hsmv1alpha1.HSMPool{ 305 + { 306 + ObjectMeta: metav1.ObjectMeta{ 307 + Name: "pico-hsm-pool", 308 + Namespace: "test-namespace", 309 + }, 310 + Status: hsmv1alpha1.HSMPoolStatus{ 311 + Phase: hsmv1alpha1.HSMPoolPhaseReady, 312 + AggregatedDevices: []hsmv1alpha1.DiscoveredDevice{ 313 + { 314 + DevicePath: "/dev/bus/usb/001/015", 315 + Available: true, 316 + SerialNumber: "ABC123", 317 + }, 318 + }, 319 + }, 320 + }, 321 + }, 322 + agentPods: []*corev1.Pod{}, 323 + expectedDevices: []string{}, 324 + expectError: true, 325 + }, 326 + { 327 + name: "multiple ready pools but no active agents", 328 + hsmPools: []*hsmv1alpha1.HSMPool{ 329 + { 330 + ObjectMeta: metav1.ObjectMeta{ 331 + Name: "pico-hsm-pool", 332 + Namespace: "test-namespace", 333 + }, 334 + Status: hsmv1alpha1.HSMPoolStatus{ 335 + Phase: hsmv1alpha1.HSMPoolPhaseReady, 336 + }, 337 + }, 338 + { 339 + ObjectMeta: metav1.ObjectMeta{ 340 + Name: "yubikey-pool", 341 + Namespace: "test-namespace", 342 + }, 343 + Status: hsmv1alpha1.HSMPoolStatus{ 344 + Phase: hsmv1alpha1.HSMPoolPhaseReady, 345 + }, 346 + }, 347 + }, 348 + agentPods: []*corev1.Pod{}, 349 + expectedDevices: []string{}, 350 + expectError: true, 351 + }, 352 + { 353 + name: "pool not ready - should be excluded", 354 + hsmPools: []*hsmv1alpha1.HSMPool{ 355 + { 356 + ObjectMeta: metav1.ObjectMeta{ 357 + Name: "pico-hsm-pool", 358 + Namespace: "test-namespace", 359 + }, 360 + Status: hsmv1alpha1.HSMPoolStatus{ 361 + Phase: hsmv1alpha1.HSMPoolPhasePending, 362 + }, 363 + }, 364 + }, 365 + agentPods: []*corev1.Pod{}, 366 + expectedDevices: []string{}, 367 + expectError: true, 368 + }, 369 + { 370 + name: "no pools", 371 + hsmPools: []*hsmv1alpha1.HSMPool{}, 372 + agentPods: []*corev1.Pod{}, 373 + expectedDevices: []string{}, 374 + expectError: true, 375 + }, 376 + } 377 + 378 + for _, tt := range tests { 379 + t.Run(tt.name, func(t *testing.T) { 380 + ctx := context.Background() 381 + 382 + // Create fake client with objects 383 + var objs []runtime.Object 384 + for _, pool := range tt.hsmPools { 385 + objs = append(objs, pool) 386 + } 387 + for _, pod := range tt.agentPods { 388 + objs = append(objs, pod) 389 + } 390 + 391 + fakeClient := fake.NewClientBuilder(). 392 + WithScheme(scheme). 393 + WithRuntimeObjects(objs...). 394 + Build() 395 + 396 + manager := NewManager(fakeClient, "test-namespace", nil) 397 + 398 + devices, err := manager.GetAvailableDevices(ctx, "test-namespace") 399 + 400 + if tt.expectError { 401 + assert.Error(t, err) 402 + } else { 403 + assert.NoError(t, err) 404 + assert.ElementsMatch(t, tt.expectedDevices, devices) 405 + } 406 + }) 407 + } 408 + } 409 + 410 + func TestIsAgentHealthy(t *testing.T) { 411 + scheme := runtime.NewScheme() 412 + require.NoError(t, hsmv1alpha1.AddToScheme(scheme)) 413 + require.NoError(t, appsv1.AddToScheme(scheme)) 414 + require.NoError(t, corev1.AddToScheme(scheme)) 415 + 416 + t.Run("healthy agent with running pods", func(t *testing.T) { 417 + ctx := context.Background() 418 + 419 + // Create running pods 420 + pod1 := &corev1.Pod{ 421 + ObjectMeta: metav1.ObjectMeta{ 422 + Name: "agent-pod-1", 423 + Namespace: "default", 424 + Labels: map[string]string{"app": "hsm-agent-test-device"}, 425 + }, 426 + Status: corev1.PodStatus{ 427 + Phase: corev1.PodRunning, 428 + }, 429 + } 430 + 431 + fakeClient := fake.NewClientBuilder(). 432 + WithScheme(scheme). 433 + WithRuntimeObjects(pod1). 434 + Build() 435 + 436 + manager := NewManager(fakeClient, "test-namespace", nil) 437 + 438 + agentInfo := &AgentInfo{ 439 + PodIPs: []string{"10.1.1.5"}, 440 + Status: AgentStatusReady, 441 + AgentName: "hsm-agent-test-device", 442 + Namespace: "default", 443 + } 444 + 445 + healthy := manager.isAgentHealthy(ctx, agentInfo) 446 + assert.True(t, healthy) 447 + }) 448 + 449 + t.Run("unhealthy agent with no pod IPs", func(t *testing.T) { 450 + ctx := context.Background() 451 + fakeClient := fake.NewClientBuilder().WithScheme(scheme).Build() 452 + manager := NewManager(fakeClient, "test-namespace", nil) 453 + 454 + agentInfo := &AgentInfo{ 455 + PodIPs: []string{}, // No pod IPs 456 + Status: AgentStatusReady, 457 + AgentName: "hsm-agent-test-device", 458 + Namespace: "default", 459 + } 460 + 461 + healthy := manager.isAgentHealthy(ctx, agentInfo) 462 + assert.False(t, healthy) 463 + }) 464 + 465 + t.Run("unhealthy agent with no running pods", func(t *testing.T) { 466 + ctx := context.Background() 467 + 468 + // Create non-running pod 469 + pod1 := &corev1.Pod{ 470 + ObjectMeta: metav1.ObjectMeta{ 471 + Name: "agent-pod-1", 472 + Namespace: "default", 473 + Labels: map[string]string{"app": "hsm-agent-test-device"}, 474 + }, 475 + Status: corev1.PodStatus{ 476 + Phase: corev1.PodFailed, 477 + }, 478 + } 479 + 480 + fakeClient := fake.NewClientBuilder(). 481 + WithScheme(scheme). 482 + WithRuntimeObjects(pod1). 483 + Build() 484 + 485 + manager := NewManager(fakeClient, "test-namespace", nil) 486 + 487 + agentInfo := &AgentInfo{ 488 + PodIPs: []string{"10.1.1.5"}, 489 + Status: AgentStatusReady, 490 + AgentName: "hsm-agent-test-device", 491 + Namespace: "default", 492 + } 493 + 494 + healthy := manager.isAgentHealthy(ctx, agentInfo) 495 + assert.False(t, healthy) 496 + }) 497 + }
+80
internal/api/server_test.go
··· 1 + /* 2 + Copyright 2025. 3 + 4 + Licensed under the Apache License, Version 2.0 (the "License"); 5 + you may not use this file except in compliance with the License. 6 + You may obtain a copy of the License at 7 + 8 + http://www.apache.org/licenses/LICENSE-2.0 9 + 10 + Unless required by applicable law or agreed to in writing, software 11 + distributed under the License is distributed on an "AS IS" BASIS, 12 + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 + See the License for the specific language governing permissions and 14 + limitations under the License. 15 + */ 16 + 17 + package api 18 + 19 + import ( 20 + "context" 21 + "testing" 22 + 23 + "github.com/go-logr/logr" 24 + "github.com/stretchr/testify/assert" 25 + "github.com/stretchr/testify/require" 26 + "k8s.io/apimachinery/pkg/runtime" 27 + "sigs.k8s.io/controller-runtime/pkg/client/fake" 28 + 29 + hsmv1alpha1 "github.com/evanjarrett/hsm-secrets-operator/api/v1alpha1" 30 + "github.com/evanjarrett/hsm-secrets-operator/internal/agent" 31 + ) 32 + 33 + func TestGetAllAvailableAgents(t *testing.T) { 34 + scheme := runtime.NewScheme() 35 + require.NoError(t, hsmv1alpha1.AddToScheme(scheme)) 36 + 37 + tests := []struct { 38 + name string 39 + agentManager *agent.Manager 40 + expectedDevices []string 41 + expectError bool 42 + }{ 43 + { 44 + name: "nil agent manager", 45 + agentManager: nil, 46 + expectedDevices: nil, 47 + expectError: true, 48 + }, 49 + { 50 + name: "valid agent manager with no devices", 51 + agentManager: func() *agent.Manager { 52 + fakeClient := fake.NewClientBuilder().WithScheme(scheme).Build() 53 + return agent.NewManager(fakeClient, "test-namespace", nil) 54 + }(), 55 + expectedDevices: nil, 56 + expectError: true, // GetAvailableDevices returns error when no devices found 57 + }, 58 + } 59 + 60 + for _, tt := range tests { 61 + t.Run(tt.name, func(t *testing.T) { 62 + ctx := context.Background() 63 + 64 + // Create server instance 65 + server := &Server{ 66 + agentManager: tt.agentManager, 67 + logger: logr.Discard(), 68 + } 69 + 70 + devices, err := server.getAllAvailableAgents(ctx, "test-namespace") 71 + 72 + if tt.expectError { 73 + assert.Error(t, err) 74 + } else { 75 + assert.NoError(t, err) 76 + assert.ElementsMatch(t, tt.expectedDevices, devices) 77 + } 78 + }) 79 + } 80 + }