/*
Copyright 2025.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package controller

import (
	"context"
	"fmt"
	"slices"
	"sort"
	"strings"
	"time"

	appsv1 "k8s.io/api/apps/v1"
	corev1 "k8s.io/api/core/v1"
	"k8s.io/apimachinery/pkg/api/errors"
	"k8s.io/apimachinery/pkg/api/resource"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	"k8s.io/apimachinery/pkg/runtime"
	"k8s.io/apimachinery/pkg/types"
	"k8s.io/apimachinery/pkg/util/intstr"
	ctrl "sigs.k8s.io/controller-runtime"
	"sigs.k8s.io/controller-runtime/pkg/client"
	"sigs.k8s.io/controller-runtime/pkg/handler"
	"sigs.k8s.io/controller-runtime/pkg/log"
	"sigs.k8s.io/controller-runtime/pkg/reconcile"

	hsmv1alpha1 "github.com/evanjarrett/hsm-secrets-operator/api/v1alpha1"
	"github.com/evanjarrett/hsm-secrets-operator/internal/agent"
	"github.com/evanjarrett/hsm-secrets-operator/internal/config"
)

const (
	// AgentNamePrefix is the prefix for HSM agent deployment names
	AgentNamePrefix = "hsm-agent"

	// AgentPort is the port the HSM agent serves gRPC on
	AgentPort = 9090

	// AgentHealthPort is the port for health checks (HTTP for simplicity)
	AgentHealthPort = 8093
)

// HSMPoolAgentReconciler watches HSMPools and ensures agents are deployed when pools become ready
type HSMPoolAgentReconciler struct {
	client.Client
	Scheme             *runtime.Scheme
	AgentManager       agent.ManagerInterface
	ImageResolver      *config.ImageResolver
	AgentImage         string
	ServiceAccountName string

	// DeviceAbsenceTimeout is the duration after which agents are cleaned up when devices are unavailable.
	// Defaults to 2x the pool's grace period (10 minutes with the default 5-minute grace period) if not set.
	DeviceAbsenceTimeout time.Duration
}
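
// A minimal sketch of how this reconciler might be wired into a
// controller-runtime manager, e.g. in cmd/main.go (the "mgr", "agentImage",
// and "setupLog" names are illustrative, not part of this file):
//
//	if err := (&controller.HSMPoolAgentReconciler{
//		Client:             mgr.GetClient(),
//		Scheme:             mgr.GetScheme(),
//		AgentImage:         agentImage,
//		ServiceAccountName: "hsm-agent",
//	}).SetupWithManager(mgr); err != nil {
//		setupLog.Error(err, "unable to create controller", "controller", "HSMPoolAgent")
//		os.Exit(1)
//	}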

// +kubebuilder:rbac:groups=hsm.j5t.io,resources=hsmpools,verbs=get;list;watch
// +kubebuilder:rbac:groups=hsm.j5t.io,resources=hsmpools/status,verbs=get
// +kubebuilder:rbac:groups=hsm.j5t.io,resources=hsmdevices,verbs=get;list;watch
// +kubebuilder:rbac:groups=apps,resources=deployments,verbs=get;list;watch;create;update;patch;delete
// +kubebuilder:rbac:groups="",resources=services,verbs=get;list;watch;create;update;patch;delete
// +kubebuilder:rbac:groups="",resources=secrets,verbs=get;list;watch

// Reconcile ensures HSM agents are deployed for ready pools
func (r *HSMPoolAgentReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.Result, error) {
	logger := log.FromContext(ctx)

	// Fetch the HSMPool instance
	var hsmPool hsmv1alpha1.HSMPool
	if err := r.Get(ctx, req.NamespacedName, &hsmPool); err != nil {
		return ctrl.Result{}, client.IgnoreNotFound(err)
	}

	logger.Info("Reconciling HSM agent deployment", "phase", hsmPool.Status.Phase)

	// Only deploy agents for ready pools with discovered hardware
	if hsmPool.Status.Phase == hsmv1alpha1.HSMPoolPhaseReady && len(hsmPool.Status.AggregatedDevices) > 0 {
		// Ensure an owner reference exists, then fetch the referenced HSMDevice
		if len(hsmPool.OwnerReferences) == 0 {
			logger.Error(fmt.Errorf("no owner references"), "HSMPool has no owner references", "pool", hsmPool.Name)
			return ctrl.Result{}, nil
		}

		deviceRef := hsmPool.OwnerReferences[0].Name
		// Get the HSMDevice to pass to the agent manager
		var hsmDevice hsmv1alpha1.HSMDevice
		if err := r.Get(ctx, client.ObjectKey{
			Name:      deviceRef,
			Namespace: hsmPool.Namespace,
		}, &hsmDevice); err != nil {
			logger.Error(err, "Failed to get referenced HSMDevice", "device", deviceRef)
			// Don't return an error - this allows graceful handling of missing devices
			return ctrl.Result{}, nil
		}

		// Ensure agent deployments for all available devices in the pool
		if err := r.ensureAgentDeployments(ctx, &hsmPool); err != nil {
			logger.Error(err, "Failed to ensure HSM agent deployments for pool", "device", deviceRef)
			return ctrl.Result{}, err
		}

		// Notify the agent manager to track the agents
		if r.AgentManager != nil {
			if err := r.AgentManager.EnsureAgent(ctx, &hsmPool); err != nil {
				logger.Error(err, "Failed to track HSM agents for pool", "device", deviceRef)
				// Don't return an error - deployment succeeded, tracking is secondary
			}
		}
	} else {
		logger.V(1).Info("HSMPool not ready for agent deployment",
			"phase", hsmPool.Status.Phase,
			"devices", len(hsmPool.Status.AggregatedDevices))
	}

	// Check for agents that need cleanup due to prolonged device absence
	if err := r.cleanupStaleAgents(ctx, &hsmPool); err != nil {
		logger.Error(err, "Failed to cleanup stale agents")
		// Don't return an error - continue with normal reconciliation
	}

	return ctrl.Result{}, nil
}

// cleanupStaleAgents removes agent deployments for devices that have been unavailable for too long.
// It returns nil so that reconciliation continues even if cleanup fails for individual devices.
func (r *HSMPoolAgentReconciler) cleanupStaleAgents(ctx context.Context, hsmPool *hsmv1alpha1.HSMPool) error { //nolint:unparam
	logger := log.FromContext(ctx)

	// Get the device absence timeout (defaults to 2x the grace period)
	absenceTimeout := r.DeviceAbsenceTimeout
	if absenceTimeout == 0 {
		gracePeriod := 5 * time.Minute // Default grace period
		if hsmPool.Spec.GracePeriod != nil {
			gracePeriod = hsmPool.Spec.GracePeriod.Duration
		}
		absenceTimeout = 2 * gracePeriod
	}
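
	// For example: with no explicit DeviceAbsenceTimeout and the default
	// 5-minute grace period, agents are cleaned up after 10 minutes
	// (2 * 5m) of device absence.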

	// Check if the HSMDevice referenced by this pool should be cleaned up (from ownerReferences)
	if len(hsmPool.OwnerReferences) == 0 {
		logger.V(1).Info("HSMPool has no owner references, skipping cleanup")
		return nil
	}

	deviceRef := hsmPool.OwnerReferences[0].Name
	// Get the HSMDevice
	var hsmDevice hsmv1alpha1.HSMDevice
	if err := r.Get(ctx, client.ObjectKey{
		Name:      deviceRef,
		Namespace: hsmPool.Namespace,
	}, &hsmDevice); err != nil {
		logger.V(1).Info("HSMDevice not found, skipping cleanup check", "device", deviceRef)
		return nil
	}

	// Check if this device has available aggregated devices in the pool
	deviceAvailable := false
	var lastSeenTime time.Time

	for _, aggregatedDevice := range hsmPool.Status.AggregatedDevices {
		if aggregatedDevice.Available {
			deviceAvailable = true
			break
		}
		// Track the most recent LastSeen time for unavailable devices
		if aggregatedDevice.LastSeen.After(lastSeenTime) {
			lastSeenTime = aggregatedDevice.LastSeen.Time
		}
	}

	// If the device is not available and hasn't been seen for longer than the absence timeout
	if !deviceAvailable {
		timeSinceLastSeen := time.Since(lastSeenTime)

		if lastSeenTime.IsZero() {
			// No devices have ever been seen - check if the pool has been around long enough
			poolAge := time.Since(hsmPool.CreationTimestamp.Time)
			if poolAge > absenceTimeout {
				logger.Info("Cleaning up agent for device with no discovered instances",
					"device", deviceRef,
					"poolAge", poolAge,
					"absenceTimeout", absenceTimeout)

				if err := r.cleanupAgentForDevice(ctx, &hsmDevice); err != nil {
					logger.Error(err, "Failed to cleanup agent for device with no instances", "device", deviceRef)
				}
			}
		} else if timeSinceLastSeen > absenceTimeout {
			logger.Info("Cleaning up agent for device absent too long",
				"device", deviceRef,
				"timeSinceLastSeen", timeSinceLastSeen,
				"absenceTimeout", absenceTimeout,
				"lastSeen", lastSeenTime)

			if err := r.cleanupAgentForDevice(ctx, &hsmDevice); err != nil {
				logger.Error(err, "Failed to cleanup agent for absent device", "device", deviceRef)
			}
		} else {
			logger.V(1).Info("Device unavailable but within tolerance",
				"device", deviceRef,
				"timeSinceLastSeen", timeSinceLastSeen,
				"absenceTimeout", absenceTimeout)
		}
	}

	return nil
}

// cleanupAgentForDevice removes the agent deployments for a specific device
func (r *HSMPoolAgentReconciler) cleanupAgentForDevice(ctx context.Context, hsmDevice *hsmv1alpha1.HSMDevice) error {
	logger := log.FromContext(ctx)

	// Get the HSMPool to find all agent deployments to clean up
	poolName := hsmDevice.Name + "-pool"
	var hsmPool hsmv1alpha1.HSMPool
	if err := r.Get(ctx, types.NamespacedName{
		Name:      poolName,
		Namespace: hsmDevice.Namespace,
	}, &hsmPool); err != nil {
		// If the pool doesn't exist, fall back to cleaning up agent deployments by label
		return r.cleanupAgentDeploymentsByPattern(ctx, hsmDevice)
	}

	// Clean up all agent deployments using the same stable index mapping as ensureAgentDeployments
	allDevices := slices.Clone(hsmPool.Status.AggregatedDevices)

	// Sort by serial number for stable index assignment
	sort.Slice(allDevices, func(i, j int) bool {
		return allDevices[i].SerialNumber < allDevices[j].SerialNumber
	})

	for i := range allDevices {
		agentName := fmt.Sprintf("%s-%s-%d", AgentNamePrefix, hsmDevice.Name, i)

		// Delete the deployment
		deployment := &appsv1.Deployment{
			ObjectMeta: metav1.ObjectMeta{
				Name:      agentName,
				Namespace: hsmDevice.Namespace,
			},
		}
		if err := r.Delete(ctx, deployment); err != nil && !errors.IsNotFound(err) {
			logger.Error(err, "Failed to delete agent deployment", "deployment", agentName)
		} else {
			logger.Info("Deleted agent deployment", "deployment", agentName)
		}
	}

	// Also clean up tracking in the agent manager
	if r.AgentManager != nil {
		if err := r.AgentManager.CleanupAgent(ctx, hsmDevice); err != nil {
			logger.Error(err, "Failed to cleanup agent tracking", "device", hsmDevice.Name)
		}
	}

	return nil
}

// cleanupAgentDeploymentsByPattern removes agent deployments by their device label when the pool is unavailable
func (r *HSMPoolAgentReconciler) cleanupAgentDeploymentsByPattern(ctx context.Context, hsmDevice *hsmv1alpha1.HSMDevice) error {
	logger := log.FromContext(ctx)

	// List all deployments in the namespace that might match our agent pattern
	var deploymentList appsv1.DeploymentList
	if err := r.List(ctx, &deploymentList, client.InNamespace(hsmDevice.Namespace)); err != nil {
		return fmt.Errorf("failed to list deployments: %w", err)
	}

	// Find and delete deployments that match this device
	for _, deployment := range deploymentList.Items {
		// Check if this is an agent deployment for this device
		if deviceName, exists := deployment.Labels["hsm.j5t.io/device"]; exists && deviceName == hsmDevice.Name {
			if err := r.Delete(ctx, &deployment); err != nil && !errors.IsNotFound(err) {
				logger.Error(err, "Failed to delete agent deployment", "deployment", deployment.Name)
			} else {
				logger.Info("Deleted agent deployment", "deployment", deployment.Name)
			}
		}
	}

	// Also clean up tracking in the agent manager
	if r.AgentManager != nil {
		if err := r.AgentManager.CleanupAgent(ctx, hsmDevice); err != nil {
			logger.Error(err, "Failed to cleanup agent tracking", "device", hsmDevice.Name)
		}
	}

	return nil
}
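
// As an aside, the same filtering could be done server-side with a label
// selector; a minimal sketch using controller-runtime's list options
// (equivalent behavior, assuming the same "hsm.j5t.io/device" label):
//
//	err := r.List(ctx, &deploymentList,
//		client.InNamespace(hsmDevice.Namespace),
//		client.MatchingLabels{"hsm.j5t.io/device": hsmDevice.Name})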

// Deployment creation and management functions

// ensureAgentDeployments ensures agent deployments exist for all available devices in the pool.
// It handles device migrations by grouping discovered devices by serial number and detecting
// state changes: a serial seen as both lost and active indicates a migration or reconnection,
// active-only means a normal deployment, and lost-only triggers cleanup after the grace period.
func (r *HSMPoolAgentReconciler) ensureAgentDeployments(ctx context.Context, hsmPool *hsmv1alpha1.HSMPool) error {
	logger := log.FromContext(ctx)

	// Group devices by serial number to detect migrations
	devicesBySerial := make(map[string][]hsmv1alpha1.DiscoveredDevice)
	for _, device := range hsmPool.Status.AggregatedDevices {
		devicesBySerial[device.SerialNumber] = append(devicesBySerial[device.SerialNumber], device)
	}

	// Process each unique serial with stable ordering
	serialNumbers := make([]string, 0, len(devicesBySerial))
	for serial := range devicesBySerial {
		serialNumbers = append(serialNumbers, serial)
	}
	sort.Strings(serialNumbers) // Stable ordering
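
	// For example: serials ["A1B2", "C3D4"] for an owning device named
	// "picohsm" always map to the agent names "hsm-agent-picohsm-0" and
	// "hsm-agent-picohsm-1", regardless of discovery order ("picohsm" and
	// the serials are illustrative).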

	var deploymentErrors []error

	for i, serial := range serialNumbers {
		devices := devicesBySerial[serial]
		agentName := fmt.Sprintf("%s-%s-%d", AgentNamePrefix, hsmPool.OwnerReferences[0].Name, i)

		// Find the active device (if any) and lost device (if any)
		var activeDevice *hsmv1alpha1.DiscoveredDevice
		var lostDevice *hsmv1alpha1.DiscoveredDevice

		for j := range devices {
			dev := &devices[j]
			if dev.Available {
				activeDevice = dev
			} else {
				lostDevice = dev
			}
		}

		// Decision logic based on device state
		if activeDevice != nil && lostDevice != nil {
			// Migration scenario - the device moved nodes
			timeSinceLost := time.Since(lostDevice.LastSeen.Time)
			gracePeriod := 5 * time.Minute
			if hsmPool.Spec.GracePeriod != nil {
				gracePeriod = hsmPool.Spec.GracePeriod.Duration
			}

			if timeSinceLost < gracePeriod && activeDevice.NodeName != lostDevice.NodeName {
				logger.Info("Device migration detected",
					"serial", serial,
					"from", lostDevice.NodeName,
					"to", activeDevice.NodeName,
					"timeSinceLost", timeSinceLost)

				// Ensure the agent is on the new node (handles deletion/creation)
				if err := r.ensureAgentOnNode(ctx, hsmPool, activeDevice, agentName); err != nil {
					logger.Error(err, "Failed to ensure agent on new node after migration", "serial", serial)
					deploymentErrors = append(deploymentErrors, fmt.Errorf("migration failed for %s: %w", serial, err))
					continue
				}
			} else if timeSinceLost < gracePeriod {
				// Device came back on the same node within the grace period
				logger.Info("Device reconnected on same node",
					"serial", serial,
					"node", activeDevice.NodeName,
					"timeSinceLost", timeSinceLost)

				if err := r.ensureAgentOnNode(ctx, hsmPool, activeDevice, agentName); err != nil {
					logger.Error(err, "Failed to ensure agent after reconnection", "serial", serial)
					deploymentErrors = append(deploymentErrors, fmt.Errorf("reconnection failed for %s: %w", serial, err))
					continue
				}
			}
		} else if activeDevice != nil {
			// Normal case - the device is available
			if err := r.ensureAgentOnNode(ctx, hsmPool, activeDevice, agentName); err != nil {
				logger.Error(err, "Failed to ensure agent for available device", "serial", serial)
				deploymentErrors = append(deploymentErrors, fmt.Errorf("agent creation failed for %s: %w", serial, err))
				continue
			}
		} else if lostDevice != nil {
			// The device is lost - check if we should clean up
			timeSinceLost := time.Since(lostDevice.LastSeen.Time)
			gracePeriod := 5 * time.Minute
			if hsmPool.Spec.GracePeriod != nil {
				gracePeriod = hsmPool.Spec.GracePeriod.Duration
			}

			if timeSinceLost > gracePeriod {
				logger.Info("Cleaning up agent for lost device",
					"serial", serial,
					"lastNode", lostDevice.NodeName,
					"timeSinceLost", timeSinceLost)

				if err := r.deleteAgent(ctx, agentName, hsmPool.Namespace); err != nil {
					logger.Error(err, "Failed to delete agent for lost device", "serial", serial)
					deploymentErrors = append(deploymentErrors, fmt.Errorf("cleanup failed for %s: %w", serial, err))
				}
			} else {
				logger.V(1).Info("Device lost but within grace period",
					"serial", serial,
					"timeSinceLost", timeSinceLost,
					"gracePeriod", gracePeriod)
			}
		}
	}

	// Return aggregated errors if any occurred
	if len(deploymentErrors) > 0 {
		return fmt.Errorf("deployment errors occurred: %v", deploymentErrors)
	}

	return nil
}

// ensureAgentOnNode ensures an agent deployment exists on the correct node for the given device
func (r *HSMPoolAgentReconciler) ensureAgentOnNode(ctx context.Context, hsmPool *hsmv1alpha1.HSMPool, device *hsmv1alpha1.DiscoveredDevice, agentName string) error {
	logger := log.FromContext(ctx)

	// Check if the deployment exists
	var deployment appsv1.Deployment
	err := r.Get(ctx, types.NamespacedName{
		Name:      agentName,
		Namespace: hsmPool.Namespace,
	}, &deployment)

	if err == nil {
		// Deployment exists - check if it's on the right node
		if !r.isDeploymentOnNode(&deployment, device.NodeName) {
			logger.Info("Agent on wrong node, recreating",
				"agent", agentName,
				"currentNode", r.getDeploymentNode(&deployment),
				"targetNode", device.NodeName,
				"serial", device.SerialNumber)

			// Delete and recreate
			if err := r.Delete(ctx, &deployment); err != nil && !errors.IsNotFound(err) {
				return fmt.Errorf("failed to delete outdated agent: %w", err)
			}
			// Fall through to create
		} else {
			// Agent is on the correct node - check if other details need updating
			needsUpdate, err := r.agentNeedsUpdate(ctx, &deployment, hsmPool)
			if err != nil {
				return fmt.Errorf("failed to check if agent needs update: %w", err)
			}

			if !needsUpdate {
				needsUpdate = r.deploymentNeedsUpdateForDevice(&deployment, device)
			}

			if needsUpdate {
				logger.Info("Agent needs updating, recreating",
					"agent", agentName,
					"node", device.NodeName,
					"serial", device.SerialNumber)

				if err := r.Delete(ctx, &deployment); err != nil && !errors.IsNotFound(err) {
					return fmt.Errorf("failed to delete outdated agent: %w", err)
				}
				// Fall through to create
			} else {
				// Agent is up to date
				logger.V(1).Info("Agent deployment is up to date",
					"agent", agentName,
					"node", device.NodeName,
					"serial", device.SerialNumber)
				return nil
			}
		}
	} else if !errors.IsNotFound(err) {
		return fmt.Errorf("failed to check agent deployment: %w", err)
	}

	// Create the agent deployment
	logger.Info("Creating agent deployment",
		"agent", agentName,
		"node", device.NodeName,
		"serial", device.SerialNumber)

	return r.createAgentDeployment(ctx, hsmPool, device, agentName)
}

// isDeploymentOnNode checks if a deployment is pinned to the specified node
func (r *HSMPoolAgentReconciler) isDeploymentOnNode(deployment *appsv1.Deployment, nodeName string) bool {
	if deployment.Spec.Template.Spec.NodeSelector != nil {
		return deployment.Spec.Template.Spec.NodeSelector["kubernetes.io/hostname"] == nodeName
	}
	return false
}

// getDeploymentNode returns the node name that a deployment is pinned to
func (r *HSMPoolAgentReconciler) getDeploymentNode(deployment *appsv1.Deployment) string {
	if deployment.Spec.Template.Spec.NodeSelector != nil {
		return deployment.Spec.Template.Spec.NodeSelector["kubernetes.io/hostname"]
	}
	return ""
}

// deleteAgent deletes an agent deployment by name
func (r *HSMPoolAgentReconciler) deleteAgent(ctx context.Context, name, namespace string) error {
	deployment := &appsv1.Deployment{
		ObjectMeta: metav1.ObjectMeta{
			Name:      name,
			Namespace: namespace,
		},
	}
	if err := r.Delete(ctx, deployment); err != nil && !errors.IsNotFound(err) {
		return fmt.Errorf("failed to delete agent deployment %s: %w", name, err)
	}
	return nil
}

// createAgentDeployment creates the HSM agent deployment for a specific device
func (r *HSMPoolAgentReconciler) createAgentDeployment(ctx context.Context, hsmPool *hsmv1alpha1.HSMPool, specificDevice *hsmv1alpha1.DiscoveredDevice, customAgentName string) error {
	logger := log.FromContext(ctx)

	if specificDevice == nil {
		return fmt.Errorf("specificDevice is required")
	}

	agentName := customAgentName
	if agentName == "" {
		agentName = r.generateAgentName(hsmPool)
	}

	targetNode := specificDevice.NodeName
	deviceName := hsmPool.OwnerReferences[0].Name

	// Fetch the HSMDevice to get the device type for extended resource requests
	var hsmDevice hsmv1alpha1.HSMDevice
	if err := r.Get(ctx, types.NamespacedName{Name: deviceName, Namespace: hsmPool.Namespace}, &hsmDevice); err != nil {
		logger.Error(err, "Failed to get HSMDevice for agent deployment", "device", deviceName)
		return fmt.Errorf("failed to get HSMDevice %s: %w", deviceName, err)
	}

	// Build the extended resource name from the device type (e.g. "hsm.j5t.io/picohsm")
	extendedResourceName := corev1.ResourceName(
		fmt.Sprintf("hsm.j5t.io/%s", strings.ToLower(string(hsmDevice.Spec.DeviceType))),
	)
	logger.V(1).Info("Using extended resource for agent",
		"agent", agentName,
		"resource", extendedResourceName)

	// Get the agent image from config, falling back to auto-detection
	var agentImage string
	if r.AgentImage != "" {
		agentImage = r.AgentImage
	} else if r.ImageResolver != nil {
		// Fallback to ImageResolver for backward compatibility or auto-detection
		agentImage = r.ImageResolver.GetImage(ctx, "")
	}

	var replicas int32 = 1
	// The agent runs as root for USB device access; the distroless image
	// limits the attack surface this would otherwise imply.
	var rootUserID int64
	falseVal, trueVal := false, true
	falsePtr, truePtr := &falseVal, &trueVal
	hostPathDirectory := corev1.HostPathDirectory

	deployment := &appsv1.Deployment{
		ObjectMeta: metav1.ObjectMeta{
			Name:      agentName,
			Namespace: hsmPool.Namespace,
			Labels: map[string]string{
				"app":                         agentName,
				"app.kubernetes.io/component": "hsm-agent",
				"app.kubernetes.io/instance":  agentName,
				"app.kubernetes.io/name":      "hsm-agent",
				"app.kubernetes.io/part-of":   "hsm-secrets-operator",
				"hsm.j5t.io/device":           deviceName,
				"hsm.j5t.io/serial-number":    specificDevice.SerialNumber,
				"hsm.j5t.io/device-path":      sanitizeLabelValue(specificDevice.DevicePath),
			},
		},
		Spec: appsv1.DeploymentSpec{
			Replicas: &replicas,
			Selector: &metav1.LabelSelector{
				MatchLabels: map[string]string{
					"app": agentName,
				},
			},
			Template: corev1.PodTemplateSpec{
				ObjectMeta: metav1.ObjectMeta{
					Labels: map[string]string{
						"app":                         agentName,
						"app.kubernetes.io/component": "hsm-agent",
						"app.kubernetes.io/instance":  agentName,
						"app.kubernetes.io/name":      "hsm-agent",
						"app.kubernetes.io/part-of":   "hsm-secrets-operator",
						"hsm.j5t.io/device":           deviceName,
						"hsm.j5t.io/serial-number":    specificDevice.SerialNumber,
						"hsm.j5t.io/device-path":      sanitizeLabelValue(specificDevice.DevicePath),
					},
				},
				Spec: corev1.PodSpec{
					// Pin to the specific node with the HSM device
					NodeSelector: map[string]string{
						"kubernetes.io/hostname": targetNode,
					},
					// Affinity for better scheduling
					Affinity: &corev1.Affinity{
						NodeAffinity: &corev1.NodeAffinity{
							RequiredDuringSchedulingIgnoredDuringExecution: &corev1.NodeSelector{
								NodeSelectorTerms: []corev1.NodeSelectorTerm{
									{
										MatchExpressions: []corev1.NodeSelectorRequirement{
											{
												Key:      "kubernetes.io/hostname",
												Operator: corev1.NodeSelectorOpIn,
												Values:   []string{targetNode},
											},
										},
									},
								},
							},
						},
					},
					SecurityContext: &corev1.PodSecurityContext{
						RunAsUser:    &rootUserID,
						RunAsGroup:   &rootUserID,
						RunAsNonRoot: falsePtr, // Root required for USB access
					},
					ServiceAccountName: r.ServiceAccountName,
					Containers: []corev1.Container{
						{
							Name:  "agent",
							Image: agentImage,
							Args:  r.buildAgentArgs(ctx, hsmPool, deviceName),
							Env:   []corev1.EnvVar{},
							Ports: []corev1.ContainerPort{
								{
									Name:          "grpc",
									ContainerPort: AgentPort,
									Protocol:      corev1.ProtocolTCP,
								},
								{
									Name:          "health",
									ContainerPort: AgentHealthPort,
									Protocol:      corev1.ProtocolTCP,
								},
							},
							LivenessProbe: &corev1.Probe{
								ProbeHandler: corev1.ProbeHandler{
									HTTPGet: &corev1.HTTPGetAction{
										Path: "/healthz",
										Port: intstr.FromInt(AgentHealthPort),
									},
								},
								InitialDelaySeconds: 15,
								PeriodSeconds:       20,
							},
							ReadinessProbe: &corev1.Probe{
								ProbeHandler: corev1.ProbeHandler{
									HTTPGet: &corev1.HTTPGetAction{
										Path: "/readyz",
										Port: intstr.FromInt(AgentHealthPort),
									},
								},
								InitialDelaySeconds: 5,
								PeriodSeconds:       10,
							},
							Resources: corev1.ResourceRequirements{
								Requests: corev1.ResourceList{
									corev1.ResourceCPU:    resource.MustParse("100m"),
									corev1.ResourceMemory: resource.MustParse("128Mi"),
									extendedResourceName:  resource.MustParse("1"), // Request 1 HSM device
								},
								Limits: corev1.ResourceList{
									corev1.ResourceCPU:    resource.MustParse("500m"),
									corev1.ResourceMemory: resource.MustParse("256Mi"),
									extendedResourceName:  resource.MustParse("1"), // Limit to 1 HSM device
								},
							},
							SecurityContext: &corev1.SecurityContext{
								Privileged:               truePtr,
								AllowPrivilegeEscalation: truePtr,
								ReadOnlyRootFilesystem:   falsePtr, // pcscd needs writable /run and /var/lock
								RunAsNonRoot:             falsePtr, // Root required for USB device access
								RunAsUser:                &rootUserID,
								SeccompProfile: &corev1.SeccompProfile{
									Type: corev1.SeccompProfileTypeRuntimeDefault,
								},
							},
							VolumeMounts: []corev1.VolumeMount{
								{
									Name:      "tmp",
									MountPath: "/tmp",
									ReadOnly:  false, // Writable /tmp for pcscd runtime files
								},
								{
									Name:      "usb-bus",
									MountPath: "/dev/bus/usb",
									ReadOnly:  false,
								},
								{
									Name:      "pcscd-run",
									MountPath: "/run/pcscd",
									ReadOnly:  false, // Required for the pcscd socket
								},
								{
									Name:      "pcscd-lock",
									MountPath: "/var/lock/pcsc",
									ReadOnly:  false, // Required for pcscd locking
								},
							},
						},
					},
					Volumes: []corev1.Volume{
						{
							Name: "tmp",
							VolumeSource: corev1.VolumeSource{
								EmptyDir: &corev1.EmptyDirVolumeSource{},
							},
						},
						{
							Name: "usb-bus",
							VolumeSource: corev1.VolumeSource{
								HostPath: &corev1.HostPathVolumeSource{
									Path: "/dev/bus/usb",
									Type: &hostPathDirectory,
								},
							},
						},
						{
							Name: "pcscd-run",
							VolumeSource: corev1.VolumeSource{
								EmptyDir: &corev1.EmptyDirVolumeSource{},
							},
						},
						{
							Name: "pcscd-lock",
							VolumeSource: corev1.VolumeSource{
								EmptyDir: &corev1.EmptyDirVolumeSource{},
							},
						},
					},
				},
			},
		},
	}

	return r.Create(ctx, deployment)
}

// agentNeedsUpdate checks if the agent deployment needs to be updated due to image changes
func (r *HSMPoolAgentReconciler) agentNeedsUpdate(ctx context.Context, deployment *appsv1.Deployment, hsmPool *hsmv1alpha1.HSMPool) (bool, error) {
	if hsmPool == nil {
		return false, nil // No pool available, no update needed
	}
	// Check if the container image needs updating
	if len(deployment.Spec.Template.Spec.Containers) == 0 {
		return false, fmt.Errorf("deployment has no containers")
	}

	container := deployment.Spec.Template.Spec.Containers[0]
	currentImage := container.Image

	// Determine the expected image
	var expectedImage string
	if r.AgentImage != "" {
		expectedImage = r.AgentImage
	} else if r.ImageResolver != nil {
		// Fallback to auto-detection
		expectedImage = r.ImageResolver.GetImage(ctx, "")
	}

	if expectedImage != "" && currentImage != expectedImage {
		// Image has changed, need to update
		return true, nil
	}

	// Device-specific path validation is handled by deploymentNeedsUpdateForDevice;
	// this function only checks image changes and other deployment-wide properties.

	return false, nil
}

// deploymentNeedsUpdateForDevice checks if a deployment needs to be updated for a specific device.
// This is a simplified check that only validates device-specific configuration.
func (r *HSMPoolAgentReconciler) deploymentNeedsUpdateForDevice(deployment *appsv1.Deployment, aggregatedDevice *hsmv1alpha1.DiscoveredDevice) bool {
	// Check node affinity - ensure the agent is pinned to the correct node
	if deployment.Spec.Template.Spec.Affinity == nil ||
		deployment.Spec.Template.Spec.Affinity.NodeAffinity == nil ||
		deployment.Spec.Template.Spec.Affinity.NodeAffinity.RequiredDuringSchedulingIgnoredDuringExecution == nil {
		return true // Missing required node affinity
	}

	// Check if the node name matches the aggregated device's node
	nodeSelector := deployment.Spec.Template.Spec.Affinity.NodeAffinity.RequiredDuringSchedulingIgnoredDuringExecution
	if len(nodeSelector.NodeSelectorTerms) == 0 {
		return true
	}

	// Check if the hostname requirement matches the device's node
	nodeMatches := false
	for _, term := range nodeSelector.NodeSelectorTerms {
		for _, expr := range term.MatchExpressions {
			if expr.Key == "kubernetes.io/hostname" && expr.Operator == corev1.NodeSelectorOpIn {
				if slices.Contains(expr.Values, aggregatedDevice.NodeName) {
					nodeMatches = true
				}
			}
		}
	}

	if !nodeMatches {
		return true // Node doesn't match
	}

	// Check the device path in hostPath volumes (only applies if a per-device
	// "hsm-device" volume is present on the deployment)
	for _, vol := range deployment.Spec.Template.Spec.Volumes {
		if vol.Name == "hsm-device" && vol.HostPath != nil {
			if vol.HostPath.Path != aggregatedDevice.DevicePath {
				return true // Device path changed
			}
		}
	}

	return false
}

// generateAgentName creates a consistent agent name for an HSM device
func (r *HSMPoolAgentReconciler) generateAgentName(hsmPool *hsmv1alpha1.HSMPool) string {
	return fmt.Sprintf("%s-%s", AgentNamePrefix, hsmPool.OwnerReferences[0].Name)
}

// buildAgentArgs builds the CLI arguments for the HSM agent
func (r *HSMPoolAgentReconciler) buildAgentArgs(ctx context.Context, hsmPool *hsmv1alpha1.HSMPool, deviceName string) []string {
	args := []string{
		"--mode=agent",
		"--device-name=" + deviceName,
		"--port=" + fmt.Sprintf("%d", AgentPort),
		"--health-port=" + fmt.Sprintf("%d", AgentHealthPort),
	}

	// Get the HSMDevice from the owner reference
	var hsmDevice hsmv1alpha1.HSMDevice
	if err := r.Get(ctx, types.NamespacedName{
		Name:      deviceName,
		Namespace: hsmPool.Namespace,
	}, &hsmDevice); err != nil {
		// If we can't get the device, return the basic args
		return args
	}

	// Add PKCS#11 configuration if available
	if hsmDevice.Spec.PKCS11 != nil {
		if hsmDevice.Spec.PKCS11.TokenLabel != "" {
			args = append(args, "--token-label="+hsmDevice.Spec.PKCS11.TokenLabel)
		}

		if hsmDevice.Spec.PKCS11.SlotId >= 0 {
			args = append(args, "--slot-id="+fmt.Sprintf("%d", hsmDevice.Spec.PKCS11.SlotId))
		}

		if hsmDevice.Spec.PKCS11.LibraryPath != "" {
			args = append(args, "--pkcs11-library="+hsmDevice.Spec.PKCS11.LibraryPath)
		}
	}

	return args
}
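
// For example, for a device named "picohsm" (illustrative) whose spec sets a
// PKCS#11 token label of "MyToken" and slot 0, buildAgentArgs returns:
//
//	[]string{
//		"--mode=agent",
//		"--device-name=picohsm",
//		"--port=9090",
//		"--health-port=8093",
//		"--token-label=MyToken",
//		"--slot-id=0",
//	}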

// sanitizeLabelValue sanitizes a string to be a valid Kubernetes label value.
// Kubernetes label values must consist of alphanumerics, '-', '_', or '.' and
// must start and end with an alphanumeric character.
func sanitizeLabelValue(value string) string {
	if len(value) == 0 {
		return value
	}

	// Replace invalid characters with dashes
	sanitized := strings.Map(func(r rune) rune {
		if (r >= 'A' && r <= 'Z') || (r >= 'a' && r <= 'z') || (r >= '0' && r <= '9') || r == '-' || r == '_' || r == '.' {
			return r
		}
		return '-'
	}, value)

	// Ensure the value starts and ends with an alphanumeric
	sanitized = strings.TrimFunc(sanitized, func(r rune) bool {
		return (r < 'A' || r > 'Z') && (r < 'a' || r > 'z') && (r < '0' || r > '9')
	})

	// Kubernetes label values have a 63-character limit
	if len(sanitized) > 63 {
		sanitized = sanitized[:63]
		// Re-trim the end in case we cut off at a non-alphanumeric
		sanitized = strings.TrimFunc(sanitized, func(r rune) bool {
			return (r < 'A' || r > 'Z') && (r < 'a' || r > 'z') && (r < '0' || r > '9')
		})
	}

	return sanitized
}
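
// For example (illustrative input), a USB device path becomes a valid label value:
//
//	sanitizeLabelValue("/dev/bus/usb/001/004") == "dev-bus-usb-001-004"
//
// The slashes are mapped to dashes and the leading dash is then trimmed,
// since label values must start and end with an alphanumeric character.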

// SetupWithManager sets up the controller with the Manager.
func (r *HSMPoolAgentReconciler) SetupWithManager(mgr ctrl.Manager) error {
	return ctrl.NewControllerManagedBy(mgr).
		For(&hsmv1alpha1.HSMPool{}).
		Watches(
			&appsv1.Deployment{},
			handler.EnqueueRequestsFromMapFunc(r.findPoolsForDeployment),
		).
		Named("hsmpool-agent").
		Complete(r)
}

// findPoolsForDeployment maps agent deployments back to HSMPools for reconciliation
func (r *HSMPoolAgentReconciler) findPoolsForDeployment(ctx context.Context, obj client.Object) []reconcile.Request {
	deployment, ok := obj.(*appsv1.Deployment)
	if !ok {
		return nil
	}

	// Check if this is an HSM agent deployment
	deviceName, exists := deployment.Labels["hsm.j5t.io/device"]
	if !exists {
		return nil
	}

	// Find the corresponding HSMPool (agent deployments are created for devices referenced in pools)
	poolName := deviceName + "-pool"

	return []reconcile.Request{
		{
			NamespacedName: client.ObjectKey{
				Name:      poolName,
				Namespace: deployment.Namespace,
			},
		},
	}
}
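
// For example: a Deployment labeled "hsm.j5t.io/device": "picohsm" in
// namespace "hsm-system" enqueues a reconcile request for the HSMPool
// "picohsm-pool" in the same namespace ("picohsm" and "hsm-system" are
// illustrative names).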