A Kubernetes operator that bridges Hardware Security Module (HSM) data storage with Kubernetes Secrets, providing true secret portability th
1
fork

Configure Feed

Select the types of activity you want to include in your feed.

Fix an issue where a new agent was not spawned when a second device was plugged in

+308 -68
+2 -2
helm/hsm-secrets-operator/Chart.yaml
··· 2 2 name: hsm-secrets-operator 3 3 description: A Kubernetes operator that bridges Pico HSM binary data storage with Kubernetes Secrets 4 4 type: application 5 - version: 0.5.13 6 - appVersion: v0.5.13 5 + version: 0.5.14 6 + appVersion: v0.5.14 7 7 icon: https://raw.githubusercontent.com/cncf/artwork/master/projects/kubernetes/icon/color/kubernetes-icon-color.svg 8 8 home: https://github.com/evanjarrett/hsm-secrets-operator 9 9 sources:
+306 -66
internal/controller/hsmsecret_controller.go
··· 55 55 OperatorName string 56 56 } 57 57 58 + // HSMDeviceClients holds multiple HSM devices and their clients 59 + type HSMDeviceClients struct { 60 + Devices []*hsmv1alpha1.HSMDevice 61 + Clients []hsm.Client 62 + } 63 + 64 + // Close closes all clients 65 + func (hdc *HSMDeviceClients) Close() error { 66 + var errs []error 67 + for _, hsmClient := range hdc.Clients { 68 + if hsmClient != nil { 69 + if err := hsmClient.Close(); err != nil { 70 + errs = append(errs, err) 71 + } 72 + } 73 + } 74 + if len(errs) > 0 { 75 + return fmt.Errorf("failed to close %d clients: %v", len(errs), errs) 76 + } 77 + return nil 78 + } 79 + 80 + // DeviceInfo holds device data and metadata for version-based conflict resolution 81 + type DeviceInfo struct { 82 + Data hsm.SecretData 83 + Metadata *hsm.SecretMetadata 84 + Version int64 85 + Checksum string 86 + Timestamp time.Time 87 + } 88 + 58 89 // +kubebuilder:rbac:groups=hsm.j5t.io,resources=hsmsecrets,verbs=get;list;watch;create;update;patch;delete 59 90 // +kubebuilder:rbac:groups=hsm.j5t.io,resources=hsmsecrets/status,verbs=get;update;patch 60 91 // +kubebuilder:rbac:groups=hsm.j5t.io,resources=hsmsecrets/finalizers,verbs=update ··· 89 120 return ctrl.Result{}, nil 90 121 } 91 122 92 - // Find target HSM device and ensure agent is running 93 - hsmDevice, agentClient, err := r.ensureHSMAgent(ctx, &hsmSecret) 123 + // Find target HSM devices and ensure agents are running 124 + deviceClients, err := r.ensureHSMAgents(ctx, &hsmSecret) 94 125 if err != nil { 95 - logger.Error(err, "Failed to ensure HSM agent") 126 + logger.Error(err, "Failed to ensure HSM agents") 96 127 return ctrl.Result{RequeueAfter: time.Minute * 5}, nil 97 128 } 129 + defer func() { 130 + if err := deviceClients.Close(); err != nil { 131 + logger.Error(err, "Failed to close device clients") 132 + } 133 + }() 98 134 99 - // Use agent client instead of direct HSM client 100 - if agentClient == nil || !agentClient.IsConnected() { 101 - 
logger.Error(fmt.Errorf("HSM agent not available"), "HSM agent not connected", "device", hsmDevice.Name) 135 + // Check that we have at least one connected client 136 + if len(deviceClients.Clients) == 0 { 137 + logger.Error(fmt.Errorf("no HSM agents available"), "No HSM agents connected") 102 138 return ctrl.Result{RequeueAfter: time.Minute * 2}, nil 139 + } 140 + 141 + // Validate all clients are connected 142 + for i, hsmClient := range deviceClients.Clients { 143 + if hsmClient == nil || !hsmClient.IsConnected() { 144 + logger.Error(fmt.Errorf("HSM agent not available"), "HSM agent not connected", "device", deviceClients.Devices[i].Name) 145 + return ctrl.Result{RequeueAfter: time.Minute * 2}, nil 146 + } 103 147 } 104 148 105 149 // Handle deletion ··· 117 161 return ctrl.Result{Requeue: true}, nil 118 162 } 119 163 120 - // Reconcile the HSMSecret using the agent client 121 - result, err := r.reconcileNormal(ctx, &hsmSecret, agentClient) 164 + // Reconcile the HSMSecret across all available devices 165 + result, err := r.reconcileSecret(ctx, &hsmSecret, deviceClients) 122 166 if err != nil { 123 167 logger.Error(err, "Failed to reconcile HSMSecret") 124 168 r.updateStatus(ctx, &hsmSecret, hsmv1alpha1.SyncStatusError, err.Error()) ··· 127 171 return result, err 128 172 } 129 173 130 - // ensureHSMAgent finds an HSM device for the secret and ensures an agent is running 131 - func (r *HSMSecretReconciler) ensureHSMAgent(ctx context.Context, hsmSecret *hsmv1alpha1.HSMSecret) (*hsmv1alpha1.HSMDevice, hsm.Client, error) { 174 + // ensureHSMAgents finds all HSM devices and ensures agents are running for each 175 + func (r *HSMSecretReconciler) ensureHSMAgents(ctx context.Context, hsmSecret *hsmv1alpha1.HSMSecret) (*HSMDeviceClients, error) { 132 176 logger := log.FromContext(ctx) 133 177 134 - // Find the appropriate HSM device 135 - hsmDevice, err := r.findHSMDeviceForSecret(ctx) 178 + // Find all appropriate HSM devices 179 + hsmDevices, err := 
r.findAllHSMDevices(ctx) 136 180 if err != nil { 137 - return nil, nil, fmt.Errorf("failed to find HSM device for secret: %w", err) 181 + return nil, fmt.Errorf("failed to find HSM devices for secret: %w", err) 138 182 } 139 183 140 - // Ensure agent pod is running for this device 141 184 if r.AgentManager == nil { 142 - return nil, nil, fmt.Errorf("agent manager not configured") 185 + return nil, fmt.Errorf("agent manager not configured") 143 186 } 144 187 145 - // EnsureAgent now returns HTTP endpoint for backward compatibility, but we'll use gRPC 146 - _, err = r.AgentManager.EnsureAgent(ctx, hsmDevice, hsmSecret) 147 - if err != nil { 148 - return nil, nil, fmt.Errorf("failed to ensure HSM agent: %w", err) 188 + deviceClients := &HSMDeviceClients{ 189 + Devices: hsmDevices, 190 + Clients: make([]hsm.Client, 0, len(hsmDevices)), 149 191 } 150 192 151 - // Create gRPC client using agent manager's direct pod connections 152 - agentClient, err := r.AgentManager.CreateSingleGRPCClient(ctx, hsmDevice.Name, logger) 153 - if err != nil { 154 - return nil, nil, fmt.Errorf("failed to create gRPC client: %w", err) 155 - } 193 + // Ensure agent pods are running for all devices and create clients 194 + for _, hsmDevice := range hsmDevices { 195 + // EnsureAgent now returns HTTP endpoint for backward compatibility, but we'll use gRPC 196 + _, err = r.AgentManager.EnsureAgent(ctx, hsmDevice, hsmSecret) 197 + if err != nil { 198 + // Clean up any successful connections before returning error 199 + if err := deviceClients.Close(); err != nil { 200 + logger.Error(err, "Failed to close device clients during cleanup") 201 + } 202 + return nil, fmt.Errorf("failed to ensure HSM agent for device %s: %w", hsmDevice.Name, err) 203 + } 156 204 157 - // Test connection 158 - if !agentClient.IsConnected() { 159 - logger.Info("Waiting for HSM agent to be ready", "device", hsmDevice.Name) 160 - time.Sleep(5 * time.Second) 205 + // Create gRPC client using agent manager's direct pod 
connections 206 + agentClient, err := r.AgentManager.CreateSingleGRPCClient(ctx, hsmDevice.Name, logger) 207 + if err != nil { 208 + // Clean up any successful connections before returning error 209 + if err := deviceClients.Close(); err != nil { 210 + logger.Error(err, "Failed to close device clients during cleanup") 211 + } 212 + return nil, fmt.Errorf("failed to create gRPC client for device %s: %w", hsmDevice.Name, err) 213 + } 161 214 162 - // Test again 215 + // Test connection 163 216 if !agentClient.IsConnected() { 164 - if err := agentClient.Close(); err != nil { 165 - logger.Error(err, "Failed to close gRPC client after failed connection test") 217 + logger.Info("Waiting for HSM agent to be ready", "device", hsmDevice.Name) 218 + time.Sleep(5 * time.Second) 219 + 220 + // Test again 221 + if !agentClient.IsConnected() { 222 + if err := agentClient.Close(); err != nil { 223 + logger.Error(err, "Failed to close gRPC client after failed connection test") 224 + } 225 + // Clean up any successful connections before returning error 226 + if err := deviceClients.Close(); err != nil { 227 + logger.Error(err, "Failed to close device clients during cleanup") 228 + } 229 + return nil, fmt.Errorf("HSM agent not ready after waiting for device %s", hsmDevice.Name) 166 230 } 167 - return nil, nil, fmt.Errorf("HSM agent not ready after waiting") 168 231 } 232 + 233 + deviceClients.Clients = append(deviceClients.Clients, agentClient) 169 234 } 170 235 171 - return hsmDevice, agentClient, nil 236 + return deviceClients, nil 172 237 } 173 238 174 - // reconcileNormal handles normal reconciliation logic 175 - func (r *HSMSecretReconciler) reconcileNormal(ctx context.Context, hsmSecret *hsmv1alpha1.HSMSecret, hsmClient hsm.Client) (ctrl.Result, error) { 239 + // reconcileSecret handles HSM secret reconciliation across all available devices 240 + func (r *HSMSecretReconciler) reconcileSecret(ctx context.Context, hsmSecret *hsmv1alpha1.HSMSecret, deviceClients 
*HSMDeviceClients) (ctrl.Result, error) { 176 241 logger := log.FromContext(ctx) 177 242 178 243 // Set default values ··· 186 251 syncInterval = DefaultSyncInterval 187 252 } 188 253 189 - // Read secret from HSM via agent 190 - hsmData, err := r.readSecretFromHSM(ctx, hsmSecret, hsmClient) 254 + // Read from all devices (handles both single and multi-device scenarios) 255 + if len(deviceClients.Devices) > 1 { 256 + logger.Info("Multi-device setup detected, checking for consistency", "deviceCount", len(deviceClients.Devices)) 257 + } else { 258 + logger.V(1).Info("Single device setup", "deviceCount", len(deviceClients.Devices)) 259 + } 260 + 261 + deviceInfos, primaryDevice, err := r.readFromAllDevices(ctx, hsmSecret, deviceClients) 191 262 if err != nil { 192 - logger.Error(err, "Failed to read secret from HSM", "path", hsmSecret.Name) 263 + logger.Error(err, "Failed to read from devices") 193 264 return ctrl.Result{RequeueAfter: time.Minute * 2}, err 194 265 } 195 266 196 - // Read metadata from HSM via agent 197 - hsmMetadata, err := hsmClient.ReadMetadata(ctx, hsmSecret.Name) 198 - if err != nil { 199 - logger.V(1).Info("Failed to read metadata from HSM (this is normal if no metadata exists)", "path", hsmSecret.Name, "error", err) 200 - hsmMetadata = nil 267 + // Check for inconsistencies and sync if needed (only matters for multi-device) 268 + if len(deviceClients.Devices) > 1 && r.detectInconsistencies(deviceInfos) { 269 + logger.Info("Inconsistency detected between devices, performing sync", "primaryDevice", primaryDevice) 270 + 271 + if err := r.syncAcrossDevices(ctx, hsmSecret, deviceClients, primaryDevice, deviceInfos[primaryDevice]); err != nil { 272 + logger.Error(err, "Failed to sync across devices") 273 + return ctrl.Result{RequeueAfter: time.Minute * 2}, err 274 + } 275 + logger.Info("Successfully synced secret across all devices") 201 276 } 202 277 278 + // Use the primary device data for Kubernetes secret 279 + hsmData := 
deviceInfos[primaryDevice].Data 280 + hsmMetadata := deviceInfos[primaryDevice].Metadata 281 + 282 + return r.updateKubernetesSecret(ctx, hsmSecret, secretName, hsmData, hsmMetadata, syncInterval) 283 + } 284 + 285 + 286 + // readFromAllDevices reads the secret from all devices with version information 287 + func (r *HSMSecretReconciler) readFromAllDevices(ctx context.Context, hsmSecret *hsmv1alpha1.HSMSecret, deviceClients *HSMDeviceClients) (map[string]*DeviceInfo, string, error) { 288 + logger := log.FromContext(ctx) 289 + deviceInfos := make(map[string]*DeviceInfo) 290 + 291 + for i, hsmClient := range deviceClients.Clients { 292 + deviceName := deviceClients.Devices[i].Name 293 + 294 + data, err := hsmClient.ReadSecret(ctx, hsmSecret.Name) 295 + if err != nil { 296 + logger.V(1).Info("Failed to read from device", "device", deviceName, "error", err) 297 + // Continue with other devices - this might be a new device without the secret 298 + continue 299 + } 300 + 301 + // Read metadata to get version information 302 + metadata, err := hsmClient.ReadMetadata(ctx, hsmSecret.Name) 303 + if err != nil { 304 + logger.V(1).Info("Failed to read metadata from device", "device", deviceName, "error", err) 305 + metadata = nil 306 + } 307 + 308 + // Extract version from metadata 309 + var version int64 310 + if metadata != nil && metadata.Labels != nil { 311 + if versionStr, exists := metadata.Labels["sync.version"]; exists { 312 + if parsedVersion, parseErr := r.parseVersion(versionStr); parseErr == nil { 313 + version = parsedVersion 314 + } 315 + } 316 + } 317 + 318 + deviceInfos[deviceName] = &DeviceInfo{ 319 + Data: data, 320 + Metadata: metadata, 321 + Version: version, 322 + Checksum: hsm.CalculateChecksum(data), 323 + Timestamp: time.Now(), 324 + } 325 + } 326 + 327 + if len(deviceInfos) == 0 { 328 + return nil, "", fmt.Errorf("no devices contain the secret %s", hsmSecret.Name) 329 + } 330 + 331 + // Select primary device based on version and HSMSecret status 332 + 
primaryDevice := r.selectPrimaryDevice(deviceInfos, hsmSecret) 333 + 334 + return deviceInfos, primaryDevice, nil 335 + } 336 + 337 + // parseVersion parses version string from metadata 338 + func (r *HSMSecretReconciler) parseVersion(versionStr string) (int64, error) { 339 + var version int64 340 + _, err := fmt.Sscanf(versionStr, "%d", &version) 341 + return version, err 342 + } 343 + 344 + // selectPrimaryDevice chooses the primary device based on version and HSMSecret status 345 + func (r *HSMSecretReconciler) selectPrimaryDevice(deviceInfos map[string]*DeviceInfo, hsmSecret *hsmv1alpha1.HSMSecret) string { 346 + // Check if there's already a designated primary in the status that's still available 347 + if hsmSecret.Status.PrimaryDevice != "" { 348 + if info, exists := deviceInfos[hsmSecret.Status.PrimaryDevice]; exists && info != nil { 349 + return hsmSecret.Status.PrimaryDevice 350 + } 351 + } 352 + 353 + // Find device with highest version number 354 + var bestDevice string 355 + var highestVersion int64 = -1 356 + var mostRecentTime time.Time 357 + 358 + for deviceName, info := range deviceInfos { 359 + // Prefer higher version numbers 360 + if info.Version > highestVersion { 361 + highestVersion = info.Version 362 + bestDevice = deviceName 363 + mostRecentTime = info.Timestamp 364 + } else if info.Version == highestVersion && info.Timestamp.After(mostRecentTime) { 365 + // If versions are equal, prefer more recent timestamp 366 + bestDevice = deviceName 367 + mostRecentTime = info.Timestamp 368 + } 369 + } 370 + 371 + return bestDevice 372 + } 373 + 374 + // detectInconsistencies checks if devices have different versions of the secret 375 + func (r *HSMSecretReconciler) detectInconsistencies(deviceInfos map[string]*DeviceInfo) bool { 376 + if len(deviceInfos) <= 1 { 377 + return false 378 + } 379 + 380 + checksums := make(map[string]int) 381 + for _, info := range deviceInfos { 382 + checksums[info.Checksum]++ 383 + } 384 + 385 + // Inconsistency if we 
have more than one unique checksum 386 + return len(checksums) > 1 387 + } 388 + 389 + // syncAcrossDevices copies the primary device's secret to all other devices with proper versioning 390 + func (r *HSMSecretReconciler) syncAcrossDevices(ctx context.Context, hsmSecret *hsmv1alpha1.HSMSecret, deviceClients *HSMDeviceClients, primaryDevice string, primaryInfo *DeviceInfo) error { 391 + logger := log.FromContext(ctx) 392 + 393 + for i, hsmClient := range deviceClients.Clients { 394 + deviceName := deviceClients.Devices[i].Name 395 + 396 + // Skip the primary device 397 + if deviceName == primaryDevice { 398 + continue 399 + } 400 + 401 + logger.Info("Syncing secret to device", "device", deviceName, "from", primaryDevice) 402 + 403 + // Create metadata with updated version and sync information 404 + newVersion := time.Now().Unix() 405 + metadata := &hsm.SecretMetadata{ 406 + Labels: map[string]string{ 407 + "sync.version": fmt.Sprintf("%d", newVersion), 408 + "sync.primary": primaryDevice, 409 + "sync.timestamp": time.Now().Format(time.RFC3339), 410 + }, 411 + } 412 + 413 + // Copy over other metadata if it exists 414 + if primaryInfo.Metadata != nil { 415 + if metadata.Labels == nil { 416 + metadata.Labels = make(map[string]string) 417 + } 418 + if primaryInfo.Metadata.Description != "" { 419 + metadata.Description = primaryInfo.Metadata.Description 420 + } 421 + if primaryInfo.Metadata.Format != "" { 422 + metadata.Format = primaryInfo.Metadata.Format 423 + } 424 + if primaryInfo.Metadata.DataType != "" { 425 + metadata.DataType = primaryInfo.Metadata.DataType 426 + } 427 + if primaryInfo.Metadata.Source != "" { 428 + metadata.Source = primaryInfo.Metadata.Source 429 + } 430 + // Copy non-sync labels 431 + for key, value := range primaryInfo.Metadata.Labels { 432 + if !strings.HasPrefix(key, "sync.") { 433 + metadata.Labels[key] = value 434 + } 435 + } 436 + } 437 + 438 + // Write the primary device's data with metadata to this device 439 + if err := 
hsmClient.WriteSecretWithMetadata(ctx, hsmSecret.Name, primaryInfo.Data, metadata); err != nil { 440 + logger.Error(err, "Failed to sync secret to device", "device", deviceName) 441 + return fmt.Errorf("failed to sync to device %s: %w", deviceName, err) 442 + } 443 + 444 + logger.Info("Successfully synced secret to device", "device", deviceName, "version", newVersion) 445 + } 446 + 447 + return nil 448 + } 449 + 450 + // updateKubernetesSecret updates the Kubernetes Secret with the given data 451 + func (r *HSMSecretReconciler) updateKubernetesSecret(ctx context.Context, hsmSecret *hsmv1alpha1.HSMSecret, secretName string, hsmData hsm.SecretData, hsmMetadata *hsm.SecretMetadata, syncInterval int32) (ctrl.Result, error) { 452 + logger := log.FromContext(ctx) 453 + 203 454 // Calculate HSM checksum 204 455 hsmChecksum := hsm.CalculateChecksum(hsmData) 205 456 ··· 210 461 Name: secretName, 211 462 } 212 463 213 - err = r.Get(ctx, secretKey, &k8sSecret) 464 + err := r.Get(ctx, secretKey, &k8sSecret) 214 465 if err != nil { 215 466 if errors.IsNotFound(err) { 216 467 // Create new secret ··· 496 747 } 497 748 } 498 749 499 - // readSecretFromHSM attempts to read a secret from HSM via agent 500 - func (r *HSMSecretReconciler) readSecretFromHSM(ctx context.Context, hsmSecret *hsmv1alpha1.HSMSecret, hsmClient hsm.Client) (hsm.SecretData, error) { 501 - logger := log.FromContext(ctx) 502 - 503 - // Read from HSM via agent (sync handles mirroring automatically) 504 - if hsmClient != nil && hsmClient.IsConnected() { 505 - data, err := hsmClient.ReadSecret(ctx, hsmSecret.Name) 506 - if err == nil { 507 - logger.V(1).Info("Successfully read secret from HSM", "path", hsmSecret.Name) 508 - return data, nil 509 - } 510 - logger.V(1).Info("Failed to read from HSM", "error", err) 511 - return nil, err 512 - } 513 - 514 - return nil, fmt.Errorf("HSM client not available or not connected") 515 - } 516 - 517 - // findHSMDeviceForSecret finds the HSMDevice that should contain the secret 
750 + // findAllHSMDevices finds all HSMDevices with ready HSMPools 518 751 // Note: HSMDevices are managed in the operator namespace, not the HSMSecret's namespace 519 - func (r *HSMSecretReconciler) findHSMDeviceForSecret(ctx context.Context) (*hsmv1alpha1.HSMDevice, error) { 752 + func (r *HSMSecretReconciler) findAllHSMDevices(ctx context.Context) ([]*hsmv1alpha1.HSMDevice, error) { 520 753 // List HSMDevices in this operator's namespace (where operator infrastructure is contained) 521 754 var hsmDeviceList hsmv1alpha1.HSMDeviceList 522 755 if err := r.List(ctx, &hsmDeviceList, client.InNamespace(r.OperatorNamespace)); err != nil { 523 756 return nil, fmt.Errorf("failed to list HSM devices: %w", err) 524 757 } 758 + 759 + var readyDevices []*hsmv1alpha1.HSMDevice 525 760 526 761 // Look for devices with associated HSMPools that are ready with available devices 527 762 for _, device := range hsmDeviceList.Items { ··· 538 773 len(pool.Status.AggregatedDevices) > 0 { 539 774 540 775 // This is a suitable device for HSM operations 541 - return &device, nil 776 + deviceCopy := device // Create a copy to avoid issues with loop variable 777 + readyDevices = append(readyDevices, &deviceCopy) 542 778 } 543 779 } 544 780 545 - return nil, fmt.Errorf("no suitable HSM device found in ready state") 781 + if len(readyDevices) == 0 { 782 + return nil, fmt.Errorf("no suitable HSM devices found in ready state") 783 + } 784 + 785 + return readyDevices, nil 546 786 } 547 787 548 788 // shouldHandleSecret determines if this operator instance should handle the given HSMSecret