A Kubernetes operator that bridges Hardware Security Module (HSM) data storage with Kubernetes Secrets, providing true secret portability.
/*
Copyright 2025.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package controller

import (
	"context"
	"fmt"
	"slices"
	"sort"
	"strings"
	"time"

	appsv1 "k8s.io/api/apps/v1"
	corev1 "k8s.io/api/core/v1"
	"k8s.io/apimachinery/pkg/api/errors"
	"k8s.io/apimachinery/pkg/api/resource"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	"k8s.io/apimachinery/pkg/runtime"
	"k8s.io/apimachinery/pkg/types"
	"k8s.io/apimachinery/pkg/util/intstr"
	ctrl "sigs.k8s.io/controller-runtime"
	"sigs.k8s.io/controller-runtime/pkg/client"
	"sigs.k8s.io/controller-runtime/pkg/handler"
	"sigs.k8s.io/controller-runtime/pkg/log"
	"sigs.k8s.io/controller-runtime/pkg/reconcile"

	hsmv1alpha1 "github.com/evanjarrett/hsm-secrets-operator/api/v1alpha1"
	"github.com/evanjarrett/hsm-secrets-operator/internal/agent"
	"github.com/evanjarrett/hsm-secrets-operator/internal/config"
)

const (
	// AgentNamePrefix is the prefix for HSM agent deployment names
	AgentNamePrefix = "hsm-agent"

	// AgentPort is the port the HSM agent serves on (now gRPC)
	AgentPort = 9090

	// AgentHealthPort is the port for health checks (HTTP for simplicity)
	AgentHealthPort = 8093
)

// HSMPoolAgentReconciler watches HSMPools and ensures agents are deployed when pools become ready
type HSMPoolAgentReconciler struct {
	client.Client
	Scheme             *runtime.Scheme
	AgentManager       agent.ManagerInterface
	ImageResolver      *config.ImageResolver
	AgentImage         string
	ServiceAccountName string

	// DeviceAbsenceTimeout is the duration after which agents are cleaned up when devices are unavailable
	// Defaults to 2x grace period (10 minutes) if not set
	DeviceAbsenceTimeout time.Duration
}

// +kubebuilder:rbac:groups=hsm.j5t.io,resources=hsmpools,verbs=get;list;watch
// +kubebuilder:rbac:groups=hsm.j5t.io,resources=hsmpools/status,verbs=get
// +kubebuilder:rbac:groups=hsm.j5t.io,resources=hsmdevices,verbs=get;list;watch
// +kubebuilder:rbac:groups=apps,resources=deployments,verbs=get;list;watch;create;update;patch;delete
// +kubebuilder:rbac:groups="",resources=services,verbs=get;list;watch;create;update;patch;delete
// +kubebuilder:rbac:groups="",resources=secrets,verbs=get;list;watch

// Reconcile ensures HSM agents are deployed for ready pools
func (r *HSMPoolAgentReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.Result, error) {
	logger := log.FromContext(ctx)

	// Fetch the HSMPool instance
	var hsmPool hsmv1alpha1.HSMPool
	if err := r.Get(ctx, req.NamespacedName, &hsmPool); err != nil {
		return ctrl.Result{}, client.IgnoreNotFound(err)
	}

	logger.Info("Reconciling HSM agent deployment", "phase", hsmPool.Status.Phase)

	// Only deploy agents for ready pools with discovered hardware
	if hsmPool.Status.Phase == hsmv1alpha1.HSMPoolPhaseReady && len(hsmPool.Status.AggregatedDevices) > 0 {
		// Ensure owner reference exists and get the HSMDevice
		if len(hsmPool.OwnerReferences) == 0 {
			logger.Error(fmt.Errorf("no owner references"), "HSMPool has no owner references", "pool", hsmPool.Name)
			return ctrl.Result{}, nil
		}

		deviceRef := hsmPool.OwnerReferences[0].Name
		// Get the HSMDevice to pass to agent manager
		var hsmDevice hsmv1alpha1.HSMDevice
		if err := r.Get(ctx, client.ObjectKey{
			Name:      deviceRef,
			Namespace: hsmPool.Namespace,
		}, &hsmDevice); err != nil {
			logger.Error(err, "Failed to get referenced HSMDevice", "device", deviceRef)
			// Don't return error - this allows graceful handling of missing devices
			return ctrl.Result{}, nil
		}

		// Ensure agent deployments for all available devices in the pool
		if err := r.ensureAgentDeployments(ctx, &hsmPool); err != nil {
			logger.Error(err, "Failed to ensure HSM agent deployments for pool", "device", deviceRef)
			return ctrl.Result{}, err
		}

		// Notify agent manager to track the agents
		if r.AgentManager != nil {
			if err := r.AgentManager.EnsureAgent(ctx, &hsmPool); err != nil {
				logger.Error(err, "Failed to track HSM agents for pool", "device", deviceRef)
				// Don't return error - deployment succeeded, tracking is secondary
			}
		}
	} else {
		logger.V(1).Info("HSMPool not ready for agent deployment",
			"phase", hsmPool.Status.Phase,
			"devices", len(hsmPool.Status.AggregatedDevices))
	}

	// Check for agents that need cleanup due to prolonged device absence
	if err := r.cleanupStaleAgents(ctx, &hsmPool); err != nil {
		logger.Error(err, "Failed to cleanup stale agents")
		// Don't return error - continue with normal reconciliation
	}

	return ctrl.Result{}, nil
}

// cleanupStaleAgents removes agent deployments for devices that have been unavailable for too long
// Returns nil to ensure reconciliation continues even if cleanup fails for individual devices
func (r *HSMPoolAgentReconciler) cleanupStaleAgents(ctx context.Context, hsmPool *hsmv1alpha1.HSMPool) error { //nolint:unparam
	logger := log.FromContext(ctx)

	// Get the device absence timeout (default to 2x grace period)
	absenceTimeout := r.DeviceAbsenceTimeout
	if absenceTimeout == 0 {
		gracePeriod := 5 * time.Minute // Default grace period
		if hsmPool.Spec.GracePeriod != nil {
			gracePeriod = hsmPool.Spec.GracePeriod.Duration
		}
		absenceTimeout = 2 * gracePeriod // Default to 2x grace period
	}

	// Check if the HSMDevice referenced by this pool should be cleaned up (from ownerReferences)
	if len(hsmPool.OwnerReferences) == 0 {
		logger.V(1).Info("HSMPool has no owner references, skipping cleanup")
		return nil
	}

	deviceRef := hsmPool.OwnerReferences[0].Name
	// Get the HSMDevice
	var hsmDevice hsmv1alpha1.HSMDevice
	if err := r.Get(ctx, client.ObjectKey{
		Name:      deviceRef,
		Namespace: hsmPool.Namespace,
	}, &hsmDevice); err != nil {
		logger.V(1).Info("HSMDevice not found, skipping cleanup check", "device", deviceRef)
		return nil
	}

	// Check if this device has available aggregated devices in the pool
	deviceAvailable := false
	var lastSeenTime time.Time

	for _, aggregatedDevice := range hsmPool.Status.AggregatedDevices {
		if aggregatedDevice.Available {
			deviceAvailable = true
			break
		}
		// Track the most recent LastSeen time for unavailable devices
		if aggregatedDevice.LastSeen.After(lastSeenTime) {
			lastSeenTime = aggregatedDevice.LastSeen.Time
		}
	}

	// If device is not available and hasn't been seen for longer than absence timeout
	if !deviceAvailable {
		timeSinceLastSeen := time.Since(lastSeenTime)

		if lastSeenTime.IsZero() {
			// No devices have ever been seen - check if pool has been around long enough
			poolAge := time.Since(hsmPool.CreationTimestamp.Time)
			if poolAge > absenceTimeout {
				logger.Info("Cleaning up agent for device with no discovered instances",
					"device", deviceRef,
					"poolAge", poolAge,
					"absenceTimeout", absenceTimeout)

				if err := r.cleanupAgentForDevice(ctx, &hsmDevice); err != nil {
					logger.Error(err, "Failed to cleanup agent for device with no instances", "device", deviceRef)
				}
			}
		} else if timeSinceLastSeen > absenceTimeout {
			logger.Info("Cleaning up agent for device absent too long",
				"device", deviceRef,
				"timeSinceLastSeen", timeSinceLastSeen,
				"absenceTimeout", absenceTimeout,
				"lastSeen", lastSeenTime)

			if err := r.cleanupAgentForDevice(ctx, &hsmDevice); err != nil {
				logger.Error(err, "Failed to cleanup agent for absent device", "device", deviceRef)
			}
		} else {
			logger.V(1).Info("Device unavailable but within tolerance",
				"device", deviceRef,
				"timeSinceLastSeen", timeSinceLastSeen,
				"absenceTimeout", absenceTimeout)
		}
	}

	return nil
}

// cleanupAgentForDevice removes the agent deployment for a specific device
func (r *HSMPoolAgentReconciler) cleanupAgentForDevice(ctx context.Context, hsmDevice *hsmv1alpha1.HSMDevice) error {
	logger := log.FromContext(ctx)

	// Get the HSMPool to find all agent deployments to clean up
	poolName := hsmDevice.Name + "-pool"
	var hsmPool hsmv1alpha1.HSMPool
	if err := r.Get(ctx, types.NamespacedName{
		Name:      poolName,
		Namespace: hsmDevice.Namespace,
	}, &hsmPool); err != nil {
		// If pool doesn't exist, try to clean up any agent deployments by pattern
		return r.cleanupAgentDeploymentsByPattern(ctx, hsmDevice)
	}

	// Clean up all agent deployments using stable index mapping
	availableDevices := make([]hsmv1alpha1.DiscoveredDevice, 0, len(hsmPool.Status.AggregatedDevices))
	availableDevices = append(availableDevices, hsmPool.Status.AggregatedDevices...)

	// Sort by serial number for stable index assignment (same as ensureAgentDeployments)
	sort.Slice(availableDevices, func(i, j int) bool {
		return availableDevices[i].SerialNumber < availableDevices[j].SerialNumber
	})

	for i := range availableDevices {
		agentName := fmt.Sprintf("%s-%s-%d", AgentNamePrefix, hsmDevice.Name, i)

		// Delete deployment
		deployment := &appsv1.Deployment{
			ObjectMeta: metav1.ObjectMeta{
				Name:      agentName,
				Namespace: hsmDevice.Namespace,
			},
		}
		if err := r.Delete(ctx, deployment); err != nil && !errors.IsNotFound(err) {
			logger.Error(err, "Failed to delete agent deployment", "deployment", agentName)
		} else {
			logger.Info("Deleted agent deployment", "deployment", agentName)
		}
	}

	// Also clean up tracking in agent manager
	if r.AgentManager != nil {
		if err := r.AgentManager.CleanupAgent(ctx, hsmDevice); err != nil {
			logger.Error(err, "Failed to cleanup agent tracking", "device", hsmDevice.Name)
		}
	}

	return nil
}

// cleanupAgentDeploymentsByPattern removes agent deployments by naming pattern when pool is unavailable
func (r *HSMPoolAgentReconciler) cleanupAgentDeploymentsByPattern(ctx context.Context, hsmDevice *hsmv1alpha1.HSMDevice) error {
	logger := log.FromContext(ctx)

	// List all deployments in the namespace that match our agent pattern
	var deploymentList appsv1.DeploymentList
	if err := r.List(ctx, &deploymentList, client.InNamespace(hsmDevice.Namespace)); err != nil {
		return fmt.Errorf("failed to list deployments: %w", err)
	}

	// Find and delete deployments that match this device
	for _, deployment := range deploymentList.Items {
		// Check if this is an agent deployment for this device
		if deviceName, exists := deployment.Labels["hsm.j5t.io/device"]; exists && deviceName == hsmDevice.Name {
			if err := r.Delete(ctx, &deployment); err != nil && !errors.IsNotFound(err) {
				logger.Error(err, "Failed to delete agent deployment", "deployment", deployment.Name)
			} else {
				logger.Info("Deleted agent deployment", "deployment", deployment.Name)
			}
		}
	}

	// Also clean up tracking in agent manager
	if r.AgentManager != nil {
		if err := r.AgentManager.CleanupAgent(ctx, hsmDevice); err != nil {
			logger.Error(err, "Failed to cleanup agent tracking", "device", hsmDevice.Name)
		}
	}

	return nil
}

// Deployment creation and management functions

// ensureAgentDeployments ensures agent deployments exist for all available devices in the pool
// Handles device migrations by grouping devices by serial number and detecting state changes
func (r *HSMPoolAgentReconciler) ensureAgentDeployments(ctx context.Context, hsmPool *hsmv1alpha1.HSMPool) error {
	logger := log.FromContext(ctx)

	// Group devices by serial number to detect migrations
	devicesBySerial := make(map[string][]hsmv1alpha1.DiscoveredDevice)
	for _, device := range hsmPool.Status.AggregatedDevices {
		devicesBySerial[device.SerialNumber] = append(devicesBySerial[device.SerialNumber], device)
	}

	// Process each unique serial with stable ordering
	serialNumbers := make([]string, 0, len(devicesBySerial))
	for serial := range devicesBySerial {
		serialNumbers = append(serialNumbers, serial)
	}
	sort.Strings(serialNumbers) // Stable ordering

	var deploymentErrors []error

	for i, serial := range serialNumbers {
		devices := devicesBySerial[serial]
		agentName := fmt.Sprintf("%s-%s-%d", AgentNamePrefix, hsmPool.OwnerReferences[0].Name, i)

		// Find the active device (if any) and lost device (if any)
		var activeDevice *hsmv1alpha1.DiscoveredDevice
		var lostDevice *hsmv1alpha1.DiscoveredDevice

		for j := range devices {
			dev := &devices[j]
			if dev.Available {
				activeDevice = dev
			} else {
				lostDevice = dev
			}
		}

		// Decision logic based on device state
		if activeDevice != nil && lostDevice != nil {
			// Migration scenario - device moved nodes
			timeSinceLost := time.Since(lostDevice.LastSeen.Time)
			gracePeriod := 5 * time.Minute
			if hsmPool.Spec.GracePeriod != nil {
				gracePeriod = hsmPool.Spec.GracePeriod.Duration
			}

			if timeSinceLost < gracePeriod && activeDevice.NodeName != lostDevice.NodeName {
				logger.Info("Device migration detected",
					"serial", serial,
					"from", lostDevice.NodeName,
					"to", activeDevice.NodeName,
					"timeSinceLost", timeSinceLost)

				// Ensure agent is on the new node (will handle deletion/creation)
				if err := r.ensureAgentOnNode(ctx, hsmPool, activeDevice, agentName); err != nil {
					logger.Error(err, "Failed to ensure agent on new node after migration", "serial", serial)
					deploymentErrors = append(deploymentErrors, fmt.Errorf("migration failed for %s: %w", serial, err))
					continue
				}
			} else if timeSinceLost < gracePeriod {
				// Device came back on same node within grace period
				logger.Info("Device reconnected on same node",
					"serial", serial,
					"node", activeDevice.NodeName,
					"timeSinceLost", timeSinceLost)

				if err := r.ensureAgentOnNode(ctx, hsmPool, activeDevice, agentName); err != nil {
					logger.Error(err, "Failed to ensure agent after reconnection", "serial", serial)
					deploymentErrors = append(deploymentErrors, fmt.Errorf("reconnection failed for %s: %w", serial, err))
					continue
				}
			}
		} else if activeDevice != nil {
			// Normal case - device is available
			if err := r.ensureAgentOnNode(ctx, hsmPool, activeDevice, agentName); err != nil {
				logger.Error(err, "Failed to ensure agent for available device", "serial", serial)
				deploymentErrors = append(deploymentErrors, fmt.Errorf("agent creation failed for %s: %w", serial, err))
				continue
			}
		} else if lostDevice != nil {
			// Device is lost - check if we should clean up
			timeSinceLost := time.Since(lostDevice.LastSeen.Time)
			gracePeriod := 5 * time.Minute
			if hsmPool.Spec.GracePeriod != nil {
				gracePeriod = hsmPool.Spec.GracePeriod.Duration
			}

			if timeSinceLost > gracePeriod {
				logger.Info("Cleaning up agent for lost device",
					"serial", serial,
					"lastNode", lostDevice.NodeName,
					"timeSinceLost", timeSinceLost)

				if err := r.deleteAgent(ctx, agentName, hsmPool.Namespace); err != nil {
					logger.Error(err, "Failed to delete agent for lost device", "serial", serial)
					deploymentErrors = append(deploymentErrors, fmt.Errorf("cleanup failed for %s: %w", serial, err))
				}
			} else {
				logger.V(1).Info("Device lost but within grace period",
					"serial", serial,
					"timeSinceLost", timeSinceLost,
					"gracePeriod", gracePeriod)
			}
		}
	}

	// Return aggregated errors if any occurred
	if len(deploymentErrors) > 0 {
		return fmt.Errorf("deployment errors occurred: %v", deploymentErrors)
	}

	return nil
}
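// Worked example (hypothetical serials, not from the original source): a pool
// owned by HSMDevice "picohsm" whose aggregated devices carry serials
// {"B2", "A1"} sorts to ["A1", "B2"], yielding agents "hsm-agent-picohsm-0"
// (A1) and "hsm-agent-picohsm-1" (B2). The index stays stable across
// reconciles as long as the set of serials is unchanged, which is why
// cleanupAgentForDevice sorts by serial the same way before reconstructing
// the names to delete.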
// ensureAgentOnNode ensures an agent deployment exists on the correct node for the given device
func (r *HSMPoolAgentReconciler) ensureAgentOnNode(ctx context.Context, hsmPool *hsmv1alpha1.HSMPool, device *hsmv1alpha1.DiscoveredDevice, agentName string) error {
	logger := log.FromContext(ctx)

	// Check if deployment exists
	var deployment appsv1.Deployment
	err := r.Get(ctx, types.NamespacedName{
		Name:      agentName,
		Namespace: hsmPool.Namespace,
	}, &deployment)

	if err == nil {
		// Deployment exists - check if it's on the right node
		if !r.isDeploymentOnNode(&deployment, device.NodeName) {
			logger.Info("Agent on wrong node, recreating",
				"agent", agentName,
				"currentNode", r.getDeploymentNode(&deployment),
				"targetNode", device.NodeName,
				"serial", device.SerialNumber)

			// Delete and recreate
			if err := r.Delete(ctx, &deployment); err != nil && !errors.IsNotFound(err) {
				return fmt.Errorf("failed to delete outdated agent: %w", err)
			}
			// Fall through to create
		} else {
			// Agent is on correct node - check if other details need updating
			needsUpdate, err := r.agentNeedsUpdate(ctx, &deployment, hsmPool)
			if err != nil {
				return fmt.Errorf("failed to check if agent needs update: %w", err)
			}

			if !needsUpdate {
				needsUpdate = r.deploymentNeedsUpdateForDevice(&deployment, device)
			}

			if needsUpdate {
				logger.Info("Agent needs updating, recreating",
					"agent", agentName,
					"node", device.NodeName,
					"serial", device.SerialNumber)

				if err := r.Delete(ctx, &deployment); err != nil && !errors.IsNotFound(err) {
					return fmt.Errorf("failed to delete outdated agent: %w", err)
				}
				// Fall through to create
			} else {
				// Agent is up to date
				logger.V(1).Info("Agent deployment is up to date",
					"agent", agentName,
					"node", device.NodeName,
					"serial", device.SerialNumber)
				return nil
			}
		}
	} else if !errors.IsNotFound(err) {
		return fmt.Errorf("failed to check agent deployment: %w", err)
	}

	// Create agent deployment
	logger.Info("Creating agent deployment",
		"agent", agentName,
		"node", device.NodeName,
		"serial", device.SerialNumber)

	return r.createAgentDeployment(ctx, hsmPool, device, agentName)
}

// isDeploymentOnNode checks if a deployment is pinned to the specified node
func (r *HSMPoolAgentReconciler) isDeploymentOnNode(deployment *appsv1.Deployment, nodeName string) bool {
	if deployment.Spec.Template.Spec.NodeSelector != nil {
		return deployment.Spec.Template.Spec.NodeSelector["kubernetes.io/hostname"] == nodeName
	}
	return false
}

// getDeploymentNode returns the node name that a deployment is pinned to
func (r *HSMPoolAgentReconciler) getDeploymentNode(deployment *appsv1.Deployment) string {
	if deployment.Spec.Template.Spec.NodeSelector != nil {
		return deployment.Spec.Template.Spec.NodeSelector["kubernetes.io/hostname"]
	}
	return ""
}

// deleteAgent deletes an agent deployment by name
func (r *HSMPoolAgentReconciler) deleteAgent(ctx context.Context, name, namespace string) error {
	deployment := &appsv1.Deployment{
		ObjectMeta: metav1.ObjectMeta{
			Name:      name,
			Namespace: namespace,
		},
	}
	if err := r.Delete(ctx, deployment); err != nil && !errors.IsNotFound(err) {
		return fmt.Errorf("failed to delete agent deployment %s: %w", name, err)
	}
	return nil
}

// createAgentDeployment creates the HSM agent deployment for a specific device
func (r *HSMPoolAgentReconciler) createAgentDeployment(ctx context.Context, hsmPool *hsmv1alpha1.HSMPool, specificDevice *hsmv1alpha1.DiscoveredDevice, customAgentName string) error {
	logger := log.FromContext(ctx)

	if specificDevice == nil {
		return fmt.Errorf("specificDevice is required")
	}

	var agentName string
	if customAgentName != "" {
		agentName = customAgentName
	} else {
		agentName = r.generateAgentName(hsmPool)
	}

	targetNode := specificDevice.NodeName
	deviceName := hsmPool.OwnerReferences[0].Name

	// Fetch the HSMDevice to get the device type for extended resource requests
	var hsmDevice hsmv1alpha1.HSMDevice
	if err := r.Get(ctx, types.NamespacedName{Name: deviceName, Namespace: hsmPool.Namespace}, &hsmDevice); err != nil {
		logger.Error(err, "Failed to get HSMDevice for agent deployment", "device", deviceName)
		return fmt.Errorf("failed to get HSMDevice %s: %w", deviceName, err)
	}

	// Build extended resource name from device type (e.g., "hsm.j5t.io/picohsm")
	extendedResourceName := corev1.ResourceName(
		fmt.Sprintf("hsm.j5t.io/%s", strings.ToLower(string(hsmDevice.Spec.DeviceType))),
	)
	logger.V(1).Info("Using extended resource for agent",
		"agent", agentName,
		"resource", extendedResourceName)

	// Get agent image from config or fallback to auto-detection
	var agentImage string
	if r.AgentImage != "" {
		agentImage = r.AgentImage
	} else if r.ImageResolver != nil {
		// Fallback to ImageResolver for backward compatibility or auto-detection
		agentImage = r.ImageResolver.GetImage(ctx, "")
	}

	var replicas int32 = 1
	var rootUserId int64 = 0
	// Fallback to root for USB device access - compensated by distroless
	falsePtr := new(bool)
	*falsePtr = false
	truePtr := new(bool)
	*truePtr = true
	hostPathDirectory := corev1.HostPathDirectory

	deployment := &appsv1.Deployment{
		ObjectMeta: metav1.ObjectMeta{
			Name:      agentName,
			Namespace: hsmPool.Namespace,
			Labels: map[string]string{
				"app":                         agentName,
				"app.kubernetes.io/component": "hsm-agent",
				"app.kubernetes.io/instance":  agentName,
				"app.kubernetes.io/name":      "hsm-agent",
				"app.kubernetes.io/part-of":   "hsm-secrets-operator",
				"hsm.j5t.io/device":           deviceName,
				"hsm.j5t.io/serial-number":    specificDevice.SerialNumber,
				"hsm.j5t.io/device-path":      sanitizeLabelValue(specificDevice.DevicePath),
			},
		},
		Spec: appsv1.DeploymentSpec{
			Replicas: &replicas,
			Selector: &metav1.LabelSelector{
				MatchLabels: map[string]string{
					"app": agentName,
				},
			},
			Template: corev1.PodTemplateSpec{
				ObjectMeta: metav1.ObjectMeta{
					Labels: map[string]string{
						"app":                         agentName,
						"app.kubernetes.io/component": "hsm-agent",
						"app.kubernetes.io/instance":  agentName,
						"app.kubernetes.io/name":      "hsm-agent",
						"app.kubernetes.io/part-of":   "hsm-secrets-operator",
						"hsm.j5t.io/device":           deviceName,
						"hsm.j5t.io/serial-number":    specificDevice.SerialNumber,
						"hsm.j5t.io/device-path":      sanitizeLabelValue(specificDevice.DevicePath),
					},
				},
				Spec: corev1.PodSpec{
					// Pin to the specific node with the HSM device
					NodeSelector: map[string]string{
						"kubernetes.io/hostname": targetNode,
					},
					// Affinity for better scheduling
					Affinity: &corev1.Affinity{
						NodeAffinity: &corev1.NodeAffinity{
							RequiredDuringSchedulingIgnoredDuringExecution: &corev1.NodeSelector{
								NodeSelectorTerms: []corev1.NodeSelectorTerm{
									{
										MatchExpressions: []corev1.NodeSelectorRequirement{
											{
												Key:      "kubernetes.io/hostname",
												Operator: corev1.NodeSelectorOpIn,
												Values:   []string{targetNode},
											},
										},
									},
								},
							},
						},
					},
					SecurityContext: &corev1.PodSecurityContext{
						RunAsUser:    &rootUserId,
						RunAsGroup:   &rootUserId,
						RunAsNonRoot: falsePtr, // Root required for USB access
					},
					ServiceAccountName: r.ServiceAccountName,
					Containers: []corev1.Container{
						{
							Name:  "agent",
							Image: agentImage,
							Args:  r.buildAgentArgs(ctx, hsmPool, deviceName),
							Env:   []corev1.EnvVar{},
							Ports: []corev1.ContainerPort{
								{
									Name:          "grpc",
									ContainerPort: AgentPort,
									Protocol:      corev1.ProtocolTCP,
								},
								{
									Name:          "health",
									ContainerPort: AgentHealthPort,
									Protocol:      corev1.ProtocolTCP,
								},
							},
							LivenessProbe: &corev1.Probe{
								ProbeHandler: corev1.ProbeHandler{
									HTTPGet: &corev1.HTTPGetAction{
										Path: "/healthz",
										Port: intstr.FromInt(AgentHealthPort),
									},
								},
								InitialDelaySeconds: 15,
								PeriodSeconds:       20,
							},
							ReadinessProbe: &corev1.Probe{
								ProbeHandler: corev1.ProbeHandler{
									HTTPGet: &corev1.HTTPGetAction{
										Path: "/readyz",
										Port: intstr.FromInt(AgentHealthPort),
									},
								},
								InitialDelaySeconds: 5,
								PeriodSeconds:       10,
							},
							Resources: corev1.ResourceRequirements{
								Requests: corev1.ResourceList{
									corev1.ResourceCPU:    resource.MustParse("100m"),
									corev1.ResourceMemory: resource.MustParse("128Mi"),
									extendedResourceName:  resource.MustParse("1"), // Request 1 HSM device
								},
								Limits: corev1.ResourceList{
									corev1.ResourceCPU:    resource.MustParse("500m"),
									corev1.ResourceMemory: resource.MustParse("256Mi"),
									extendedResourceName:  resource.MustParse("1"), // Limit to 1 HSM device
								},
							},
							SecurityContext: &corev1.SecurityContext{
								Privileged:               truePtr,
								AllowPrivilegeEscalation: truePtr,
								ReadOnlyRootFilesystem:   falsePtr, // pcscd needs writable /run and /var/lock
								RunAsNonRoot:             falsePtr, // Root required for USB device access
								RunAsUser:                &rootUserId,
								SeccompProfile: &corev1.SeccompProfile{
									Type: corev1.SeccompProfileTypeRuntimeDefault,
								},
							},
							VolumeMounts: []corev1.VolumeMount{
								{
									Name:      "tmp",
									MountPath: "/tmp",
									ReadOnly:  false, // Required for pcscd runtime with readonly filesystem
								},
								{
									Name:      "usb-bus",
									MountPath: "/dev/bus/usb",
									ReadOnly:  false,
								},
								{
									Name:      "pcscd-run",
									MountPath: "/run/pcscd",
									ReadOnly:  false, // Required for pcscd socket
								},
								{
									Name:      "pcscd-lock",
									MountPath: "/var/lock/pcsc",
									ReadOnly:  false, // Required for pcscd locking
								},
							},
						},
					},
					Volumes: []corev1.Volume{
						{
							Name: "tmp",
							VolumeSource: corev1.VolumeSource{
								EmptyDir: &corev1.EmptyDirVolumeSource{},
							},
						},
						{
							Name: "usb-bus",
							VolumeSource: corev1.VolumeSource{
								HostPath: &corev1.HostPathVolumeSource{
									Path: "/dev/bus/usb",
									Type: &hostPathDirectory,
								},
							},
						},
						{
							Name: "pcscd-run",
							VolumeSource: corev1.VolumeSource{
								EmptyDir: &corev1.EmptyDirVolumeSource{},
							},
						},
						{
							Name: "pcscd-lock",
							VolumeSource: corev1.VolumeSource{
								EmptyDir: &corev1.EmptyDirVolumeSource{},
							},
						},
					},
				},
			},
		},
	}

	return r.Create(ctx, deployment)
}

// agentNeedsUpdate checks if the agent deployment needs to be updated due to device path or image changes
func (r *HSMPoolAgentReconciler) agentNeedsUpdate(ctx context.Context, deployment *appsv1.Deployment, hsmPool *hsmv1alpha1.HSMPool) (bool, error) {
	if hsmPool == nil {
		return false, nil // No pool available, no update needed
	}
	// Check if container image needs updating
	if len(deployment.Spec.Template.Spec.Containers) == 0 {
		return false, fmt.Errorf("deployment has no containers")
	}

	container := deployment.Spec.Template.Spec.Containers[0]
	currentImage := container.Image

	// Check if image has changed
	var expectedImage string
	if r.AgentImage != "" {
		expectedImage = r.AgentImage
	} else if r.ImageResolver != nil {
		// Fallback to auto-detection
		expectedImage = r.ImageResolver.GetImage(ctx, "")
	}

	if expectedImage != "" && currentImage != expectedImage {
		// Image has changed, need to update
		return true, nil
	}

	// Device-specific path validation is handled by deploymentNeedsUpdateForDevice
	// This function only checks image changes and other deployment-wide properties

	return false, nil
}

// deploymentNeedsUpdateForDevice checks if a deployment needs to be updated for a specific device
// This is a simplified check that only validates device-specific configuration
func (r *HSMPoolAgentReconciler) deploymentNeedsUpdateForDevice(deployment *appsv1.Deployment, aggregatedDevice *hsmv1alpha1.DiscoveredDevice) bool {
	// Check node affinity - ensure agent is pinned to the correct node
	if deployment.Spec.Template.Spec.Affinity == nil ||
		deployment.Spec.Template.Spec.Affinity.NodeAffinity == nil ||
		deployment.Spec.Template.Spec.Affinity.NodeAffinity.RequiredDuringSchedulingIgnoredDuringExecution == nil {
		return true // Missing required node affinity
	}

	// Check if the node name matches the aggregated device's node
	nodeSelector := deployment.Spec.Template.Spec.Affinity.NodeAffinity.RequiredDuringSchedulingIgnoredDuringExecution
	if len(nodeSelector.NodeSelectorTerms) == 0 {
		return true
	}

	// Check if hostname requirement matches the device's node
	nodeMatches := false
	for _, term := range nodeSelector.NodeSelectorTerms {
		for _, expr := range term.MatchExpressions {
			if expr.Key == "kubernetes.io/hostname" && expr.Operator == corev1.NodeSelectorOpIn {
				if slices.Contains(expr.Values, aggregatedDevice.NodeName) {
					nodeMatches = true
				}
			}
		}
	}

	if !nodeMatches {
		return true // Node doesn't match
	}

	// Check device path in volume mounts
	for _, vol := range deployment.Spec.Template.Spec.Volumes {
		if vol.Name == "hsm-device" && vol.HostPath != nil {
			if vol.HostPath.Path != aggregatedDevice.DevicePath {
				return true // Device path changed
			}
		}
	}

	return false
}

// generateAgentName creates a consistent agent name for an HSM device
func (r *HSMPoolAgentReconciler) generateAgentName(hsmPool *hsmv1alpha1.HSMPool) string {
	return fmt.Sprintf("%s-%s", AgentNamePrefix, hsmPool.OwnerReferences[0].Name)
}

// buildAgentArgs builds CLI arguments for the HSM agent
func (r *HSMPoolAgentReconciler) buildAgentArgs(ctx context.Context, hsmPool *hsmv1alpha1.HSMPool, deviceName string) []string {
	args := []string{
		"--mode=agent",
		"--device-name=" + deviceName,
		"--port=" + fmt.Sprintf("%d", AgentPort),
		"--health-port=" + fmt.Sprintf("%d", AgentHealthPort),
	}

	// Get HSMDevice from owner reference
	var hsmDevice hsmv1alpha1.HSMDevice
	if err := r.Get(ctx, types.NamespacedName{
		Name:      deviceName,
		Namespace: hsmPool.Namespace,
	}, &hsmDevice); err != nil {
		// If we can't get the device, return basic args
		return args
	}

	// Add PKCS#11 configuration if available
	if hsmDevice.Spec.PKCS11 != nil {
		if hsmDevice.Spec.PKCS11.TokenLabel != "" {
			args = append(args, "--token-label="+hsmDevice.Spec.PKCS11.TokenLabel)
		}

		if hsmDevice.Spec.PKCS11.SlotId >= 0 {
			args = append(args, "--slot-id="+fmt.Sprintf("%d", hsmDevice.Spec.PKCS11.SlotId))
		}

		if hsmDevice.Spec.PKCS11.LibraryPath != "" {
			args = append(args, "--pkcs11-library="+hsmDevice.Spec.PKCS11.LibraryPath)
		}
	}

	return args
}

// sanitizeLabelValue sanitizes a string to be a valid Kubernetes label value
// Kubernetes labels must be alphanumeric, '-', '_', or '.' and start/end with alphanumeric
func sanitizeLabelValue(value string) string {
	if len(value) == 0 {
		return value
	}

	// Replace invalid characters with dashes
	sanitized := strings.Map(func(r rune) rune {
		if (r >= 'A' && r <= 'Z') || (r >= 'a' && r <= 'z') || (r >= '0' && r <= '9') || r == '-' || r == '_' || r == '.' {
			return r
		}
		return '-'
	}, value)

	// Ensure starts and ends with alphanumeric
	sanitized = strings.TrimFunc(sanitized, func(r rune) bool {
		return (r < 'A' || r > 'Z') && (r < 'a' || r > 'z') && (r < '0' || r > '9')
	})

	// Kubernetes label values have a 63 character limit
	if len(sanitized) > 63 {
		sanitized = sanitized[:63]
		// Re-trim end if we cut off at a non-alphanumeric
		sanitized = strings.TrimFunc(sanitized, func(r rune) bool {
			return (r < 'A' || r > 'Z') && (r < 'a' || r > 'z') && (r < '0' || r > '9')
		})
	}

	return sanitized
}
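// Worked examples for sanitizeLabelValue (hypothetical inputs): each '/' in a
// USB device path is an invalid label character and becomes '-', and the
// leading dash is then trimmed so the value starts alphanumerically:
//
//	sanitizeLabelValue("/dev/bus/usb/001/004") // "dev-bus-usb-001-004"
//	sanitizeLabelValue("usb:1-1.4")            // "usb-1-1.4"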
// SetupWithManager sets up the controller with the Manager.
func (r *HSMPoolAgentReconciler) SetupWithManager(mgr ctrl.Manager) error {
	return ctrl.NewControllerManagedBy(mgr).
		For(&hsmv1alpha1.HSMPool{}).
		Watches(
			&appsv1.Deployment{},
			handler.EnqueueRequestsFromMapFunc(r.findPoolsForDeployment),
		).
		Named("hsmpool-agent").
		Complete(r)
}

// findPoolsForDeployment maps agent deployments back to HSMPools for reconciliation
func (r *HSMPoolAgentReconciler) findPoolsForDeployment(ctx context.Context, obj client.Object) []reconcile.Request {
	deployment, ok := obj.(*appsv1.Deployment)
	if !ok {
		return nil
	}

	// Check if this is an HSM agent deployment
	deviceName, exists := deployment.Labels["hsm.j5t.io/device"]
	if !exists {
		return nil
	}

	// Find the corresponding HSMPool (agent deployments are created for devices referenced in pools)
	poolName := deviceName + "-pool"

	return []reconcile.Request{
		{
			NamespacedName: client.ObjectKey{
				Name:      poolName,
				Namespace: deployment.Namespace,
			},
		},
	}
}
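For context, here is a minimal sketch of how this reconciler could be wired into a controller-runtime manager from a main package in the same module. Only the reconciler's exported fields and SetupWithManager come from the file above; the scheme registration, image reference, service account name, and timeout value are illustrative assumptions. AgentManager and ImageResolver are left unset, which the reconciler tolerates since it nil-checks both.

package main

import (
	"log"
	"time"

	"k8s.io/apimachinery/pkg/runtime"
	clientgoscheme "k8s.io/client-go/kubernetes/scheme"
	ctrl "sigs.k8s.io/controller-runtime"

	hsmv1alpha1 "github.com/evanjarrett/hsm-secrets-operator/api/v1alpha1"
	"github.com/evanjarrett/hsm-secrets-operator/internal/controller"
)

func main() {
	// Register the built-in types (Deployments, Pods, Services) and the
	// operator's CRDs (HSMPool, HSMDevice) on one scheme. The AddToScheme
	// for the CRD group is assumed to be the standard kubebuilder-generated one.
	scheme := runtime.NewScheme()
	_ = clientgoscheme.AddToScheme(scheme)
	_ = hsmv1alpha1.AddToScheme(scheme)

	mgr, err := ctrl.NewManager(ctrl.GetConfigOrDie(), ctrl.Options{Scheme: scheme})
	if err != nil {
		log.Fatalf("unable to create manager: %v", err)
	}

	r := &controller.HSMPoolAgentReconciler{
		Client:               mgr.GetClient(),
		Scheme:               mgr.GetScheme(),
		AgentImage:           "example.com/hsm-agent:v0.1.0", // hypothetical image reference
		ServiceAccountName:   "hsm-agent",                    // hypothetical service account
		DeviceAbsenceTimeout: 10 * time.Minute,               // matches the documented default: 2x the 5m grace period
	}
	if err := r.SetupWithManager(mgr); err != nil {
		log.Fatalf("unable to set up hsmpool-agent controller: %v", err)
	}

	if err := mgr.Start(ctrl.SetupSignalHandler()); err != nil {
		log.Fatalf("manager exited: %v", err)
	}
}

Because SetupWithManager also watches Deployments and maps them back through findPoolsForDeployment, deleting an agent Deployment by hand requeues the owning HSMPool, and the reconciler recreates the agent on the next pass.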