···33# To re-generate a bundle for another specific version without changing the standard setup, you can:
44# - use the VERSION as arg of the bundle target (e.g make bundle VERSION=0.0.2)
55# - use environment variables to overwrite this value (e.g export VERSION=0.0.2)
66-VERSION ?= 0.5.28
66+VERSION ?= 0.5.29
7788# CHANNELS define the bundle channels used in the bundle.
99# Add a new line here if you would like to change its default config. (E.g CHANNELS = "candidate,fast,stable")
+2-2
helm/hsm-secrets-operator/Chart.yaml
···22name: hsm-secrets-operator
33description: A Kubernetes operator that bridges Pico HSM binary data storage with Kubernetes Secrets
44type: application
55-version: 0.5.28
66-appVersion: v0.5.28
55+version: 0.5.29
66+appVersion: v0.5.29
77icon: https://raw.githubusercontent.com/cncf/artwork/master/projects/kubernetes/icon/color/kubernetes-icon-color.svg
88home: https://github.com/evanjarrett/hsm-secrets-operator
99sources:
+235-22
internal/agent/connection_pool.go
···2727 "github.com/evanjarrett/hsm-secrets-operator/internal/hsm"
2828)
29293030+// ClientWrapper wraps an HSM client to track usage and manage lifecycle
3131+type ClientWrapper struct {
3232+ client hsm.Client
3333+ pool *ConnectionPool
3434+ endpoint string
3535+}
3636+3737+// Implement hsm.Client interface methods by delegating to wrapped client
3838+func (cw *ClientWrapper) Initialize(ctx context.Context, config hsm.Config) error {
3939+ return cw.client.Initialize(ctx, config)
4040+}
4141+4242+func (cw *ClientWrapper) WriteSecret(ctx context.Context, path string, data hsm.SecretData) error {
4343+ return cw.client.WriteSecret(ctx, path, data)
4444+}
4545+4646+func (cw *ClientWrapper) ReadSecret(ctx context.Context, path string) (hsm.SecretData, error) {
4747+ return cw.client.ReadSecret(ctx, path)
4848+}
4949+5050+func (cw *ClientWrapper) DeleteSecret(ctx context.Context, path string) error {
5151+ return cw.client.DeleteSecret(ctx, path)
5252+}
5353+5454+func (cw *ClientWrapper) ListSecrets(ctx context.Context, prefix string) ([]string, error) {
5555+ return cw.client.ListSecrets(ctx, prefix)
5656+}
5757+5858+func (cw *ClientWrapper) WriteSecretWithMetadata(ctx context.Context, path string, data hsm.SecretData, metadata *hsm.SecretMetadata) error {
5959+ return cw.client.WriteSecretWithMetadata(ctx, path, data, metadata)
6060+}
6161+6262+func (cw *ClientWrapper) ReadMetadata(ctx context.Context, path string) (*hsm.SecretMetadata, error) {
6363+ return cw.client.ReadMetadata(ctx, path)
6464+}
6565+6666+func (cw *ClientWrapper) GetInfo(ctx context.Context) (*hsm.HSMInfo, error) {
6767+ return cw.client.GetInfo(ctx)
6868+}
6969+7070+func (cw *ClientWrapper) GetChecksum(ctx context.Context, path string) (string, error) {
7171+ return cw.client.GetChecksum(ctx, path)
7272+}
7373+7474+func (cw *ClientWrapper) IsConnected() bool {
7575+ return cw.client.IsConnected()
7676+}
7777+7878+func (cw *ClientWrapper) Close() error {
7979+ // Mark client as no longer in use when closed
8080+ cw.pool.mutex.Lock()
8181+ if pooled, exists := cw.pool.clients[cw.endpoint]; exists {
8282+ pooled.InUse = false
8383+ cw.pool.logger.V(1).Info("Client marked as not in use", "endpoint", cw.endpoint)
8484+ }
8585+ cw.pool.mutex.Unlock()
8686+8787+ // Note: Don't close the underlying client here, let the pool manage it
8888+ return nil
8989+}
9090+3091// PooledClient represents a cached gRPC client with metadata
3192type PooledClient struct {
3232- Client hsm.Client
3333- Endpoint string
3434- CreatedAt time.Time
3535- LastUsed time.Time
9393+ Client hsm.Client
9494+ Endpoint string
9595+ CreatedAt time.Time
9696+ LastUsed time.Time
9797+ UsageCount int64 // Track how many times this client has been used
9898+ InUse bool // Track if client is currently being used
9999+}

// ConnectionPoolMetrics holds running counters describing how the connection
// pool has behaved since it was created.
type ConnectionPoolMetrics struct {
	// TotalConnections counts attempts to create a new client.
	TotalConnections int64
	// SuccessfulConnections counts clients that were created and cached.
	SuccessfulConnections int64
	// FailedConnections counts client creations or initializations that failed.
	FailedConnections int64
	// ConnectionReuses counts requests served from an already cached client.
	ConnectionReuses int64
	// HealthCheckPasses counts health checks that found the client connected.
	HealthCheckPasses int64
	// HealthCheckFailures counts health checks that found the client disconnected.
	HealthCheckFailures int64
	// ConnectionTimeouts is reported in the stats; no visible code path
	// increments it yet.
	ConnectionTimeouts int64
	// RetryAttempts counts retries performed while obtaining a client.
	RetryAttempts int64
}
3711238113// ConnectionPool manages a pool of gRPC connections to HSM agents
···44119 cleanupInterval time.Duration
45120 stopChan chan struct{}
46121 stopOnce sync.Once
122122+ metrics ConnectionPoolMetrics
47123}
4812449125// NewConnectionPool creates a new connection pool
···6414065141// GetClient returns a cached client or creates a new one
66142func (cp *ConnectionPool) GetClient(ctx context.Context, endpoint string, logger logr.Logger) (hsm.Client, error) {
143143+ return cp.getClientWithRetry(ctx, endpoint, logger, 3) // Retry up to 3 times
144144+}
145145+146146+// getClientWithRetry implements retry logic for client creation
147147+func (cp *ConnectionPool) getClientWithRetry(ctx context.Context, endpoint string, logger logr.Logger, maxRetries int) (hsm.Client, error) {
148148+ var lastErr error
149149+150150+ for attempt := 1; attempt <= maxRetries; attempt++ {
151151+ client, err := cp.getClientAttempt(ctx, endpoint, logger)
152152+ if err == nil {
153153+ // Perform health check on the returned client
154154+ if wrapper, ok := client.(*ClientWrapper); ok {
155155+ if !wrapper.IsConnected() {
156156+ cp.logger.Info("Health check failed on new client", "endpoint", endpoint,
157157+ "attempt", attempt, "error", "client not connected")
158158+ // Remove the client and try again
159159+ cp.RemoveClient(endpoint)
160160+ lastErr = fmt.Errorf("health check failed: client not connected")
161161+ if attempt < maxRetries {
162162+ backoffDuration := time.Duration(attempt) * time.Second
163163+ cp.logger.Info("Retrying client creation after backoff",
164164+ "endpoint", endpoint, "attempt", attempt+1, "backoff", backoffDuration.String())
165165+ time.Sleep(backoffDuration)
166166+ continue
167167+ }
168168+ }
169169+ }
170170+ return client, nil
171171+ }
172172+173173+ lastErr = err
174174+ if attempt < maxRetries {
175175+ cp.metrics.RetryAttempts++
176176+ backoffDuration := time.Duration(attempt) * time.Second
177177+ cp.logger.Info("Client creation failed, retrying after backoff",
178178+ "endpoint", endpoint, "attempt", attempt, "error", err, "backoff", backoffDuration.String())
179179+ time.Sleep(backoffDuration)
180180+ }
181181+ }
182182+183183+ return nil, fmt.Errorf("failed to create client after %d attempts: %w", maxRetries, lastErr)
184184+}
185185+186186+// getClientAttempt performs a single attempt to get or create a client
187187+func (cp *ConnectionPool) getClientAttempt(ctx context.Context, endpoint string, logger logr.Logger) (hsm.Client, error) {
67188 cp.mutex.Lock()
68189 defer cp.mutex.Unlock()
69190···73194 if pooled, exists := cp.clients[endpoint]; exists {
74195 // Check if client is still valid and not too old
75196 if now.Sub(pooled.CreatedAt) < cp.maxAge {
7676- // Update last used time
197197+ // Check if client is currently in use - avoid closing active connections
198198+ if pooled.InUse {
199199+ cp.logger.Info("Client is in use, extending max age", "endpoint", endpoint,
200200+ "age", now.Sub(pooled.CreatedAt).String(), "usage_count", pooled.UsageCount)
201201+ // Extend the client's life while it's in use
202202+ pooled.CreatedAt = now.Add(-cp.maxAge / 2)
203203+ }
204204+205205+ // Update usage tracking
77206 pooled.LastUsed = now
7878- cp.logger.V(1).Info("Reusing cached gRPC client", "endpoint", endpoint,
7979- "age", now.Sub(pooled.CreatedAt).String())
8080- return pooled.Client, nil
207207+ pooled.UsageCount++
208208+ pooled.InUse = true
209209+210210+ cp.logger.Info("Reusing cached gRPC client", "endpoint", endpoint,
211211+ "age", now.Sub(pooled.CreatedAt).String(), "usage_count", pooled.UsageCount)
212212+ cp.metrics.ConnectionReuses++
213213+ return &ClientWrapper{client: pooled.Client, pool: cp, endpoint: endpoint}, nil
81214 } else {
8282- // Client is too old, close it and remove from cache
8383- cp.logger.V(1).Info("gRPC client expired, closing", "endpoint", endpoint,
8484- "age", now.Sub(pooled.CreatedAt).String())
215215+ // Client is too old, but check if it's in use
216216+ if pooled.InUse {
217217+ cp.logger.Info("Client expired but in use, extending life", "endpoint", endpoint,
218218+ "age", now.Sub(pooled.CreatedAt).String())
219219+ pooled.CreatedAt = now.Add(-cp.maxAge / 2)
220220+ pooled.LastUsed = now
221221+ pooled.UsageCount++
222222+ return &ClientWrapper{client: pooled.Client, pool: cp, endpoint: endpoint}, nil
223223+ }
224224+225225+ // Client is too old and not in use, close it and remove from cache
226226+ cp.logger.Info("gRPC client expired, closing", "endpoint", endpoint,
227227+ "age", now.Sub(pooled.CreatedAt).String(), "usage_count", pooled.UsageCount)
85228 if err := pooled.Client.Close(); err != nil {
86229 cp.logger.V(1).Info("Error closing expired client", "endpoint", endpoint, "error", err)
87230 }
···9123492235 // Create new client
93236 cp.logger.V(1).Info("Creating new gRPC client", "endpoint", endpoint)
237237+ cp.metrics.TotalConnections++
94238 client, err := NewGRPCClient(endpoint, logger)
95239 if err != nil {
240240+ cp.metrics.FailedConnections++
96241 return nil, fmt.Errorf("failed to create gRPC client: %w", err)
97242 }
9824399244 // Initialize the connection
100245 if err := client.Initialize(ctx, hsm.Config{}); err != nil {
246246+ cp.metrics.FailedConnections++
101247 if closeErr := client.Close(); closeErr != nil {
102248 cp.logger.V(1).Info("Error closing client after failed initialization",
103249 "endpoint", endpoint, "error", closeErr)
···107253108254 // Cache the client
109255 cp.clients[endpoint] = &PooledClient{
110110- Client: client,
111111- Endpoint: endpoint,
112112- CreatedAt: now,
113113- LastUsed: now,
256256+ Client: client,
257257+ Endpoint: endpoint,
258258+ CreatedAt: now,
259259+ LastUsed: now,
260260+ UsageCount: 1,
261261+ InUse: true,
114262 }
115263116264 cp.logger.Info("Created and cached new gRPC client", "endpoint", endpoint)
117117- return client, nil
265265+ cp.metrics.SuccessfulConnections++
266266+ return &ClientWrapper{client: client, pool: cp, endpoint: endpoint}, nil
118267}
119268120269// RemoveClient removes a client from the pool (useful when agent pods restart)
···173322 var toRemove []string
174323175324 for endpoint, pooled := range cp.clients {
176176- // Remove if too old or unused for too long
177177- if now.Sub(pooled.CreatedAt) > cp.maxAge || now.Sub(pooled.LastUsed) > cp.maxAge {
325325+ // Remove if too old or unused for too long, but not if currently in use
326326+ shouldRemove := (now.Sub(pooled.CreatedAt) > cp.maxAge || now.Sub(pooled.LastUsed) > cp.maxAge) && !pooled.InUse
327327+ if shouldRemove {
328328+ cp.logger.V(1).Info("Marking client for cleanup", "endpoint", endpoint,
329329+ "age", now.Sub(pooled.CreatedAt).String(),
330330+ "last_used_ago", now.Sub(pooled.LastUsed).String(),
331331+ "usage_count", pooled.UsageCount, "in_use", pooled.InUse)
178332 toRemove = append(toRemove, endpoint)
333333+ } else if pooled.InUse && (now.Sub(pooled.CreatedAt) > cp.maxAge) {
334334+ cp.logger.Info("Client is old but still in use, keeping alive", "endpoint", endpoint,
335335+ "age", now.Sub(pooled.CreatedAt).String(), "usage_count", pooled.UsageCount)
179336 }
180337 }
181338···198355 cp.mutex.RLock()
199356 defer cp.mutex.RUnlock()
200357358358+ now := time.Now()
201359 stats := make(map[string]interface{})
202360 stats["active_connections"] = len(cp.clients)
361361+ stats["max_age_seconds"] = cp.maxAge.Seconds()
362362+ stats["cleanup_interval_seconds"] = cp.cleanupInterval.Seconds()
203363204204- endpoints := make([]string, 0, len(cp.clients))
205205- for endpoint := range cp.clients {
206206- endpoints = append(endpoints, endpoint)
364364+ var totalUsage int64
365365+ inUseCount := 0
366366+ clientDetails := make([]map[string]interface{}, 0, len(cp.clients))
367367+368368+ for endpoint, pooled := range cp.clients {
369369+ totalUsage += pooled.UsageCount
370370+ if pooled.InUse {
371371+ inUseCount++
372372+ }
373373+374374+ clientDetails = append(clientDetails, map[string]interface{}{
375375+ "endpoint": endpoint,
376376+ "age_seconds": now.Sub(pooled.CreatedAt).Seconds(),
377377+ "last_used_seconds_ago": now.Sub(pooled.LastUsed).Seconds(),
378378+ "usage_count": pooled.UsageCount,
379379+ "in_use": pooled.InUse,
380380+ })
207381 }
208208- stats["endpoints"] = endpoints
382382+383383+ stats["clients_in_use"] = inUseCount
384384+ stats["total_usage_count"] = totalUsage
385385+ stats["client_details"] = clientDetails
386386+387387+ // Add connection pool metrics
388388+ stats["metrics"] = map[string]interface{}{
389389+ "total_connections": cp.metrics.TotalConnections,
390390+ "successful_connections": cp.metrics.SuccessfulConnections,
391391+ "failed_connections": cp.metrics.FailedConnections,
392392+ "connection_reuses": cp.metrics.ConnectionReuses,
393393+ "health_check_passes": cp.metrics.HealthCheckPasses,
394394+ "health_check_failures": cp.metrics.HealthCheckFailures,
395395+ "connection_timeouts": cp.metrics.ConnectionTimeouts,
396396+ "retry_attempts": cp.metrics.RetryAttempts,
397397+ }
209398210399 return stats
211400}
401401+402402+// HealthCheckClient verifies that a client connection is still healthy
403403+func (cp *ConnectionPool) HealthCheckClient(ctx context.Context, endpoint string) error {
404404+ cp.mutex.RLock()
405405+ pooled, exists := cp.clients[endpoint]
406406+ cp.mutex.RUnlock()
407407+408408+ if !exists {
409409+ return fmt.Errorf("no client found for endpoint %s", endpoint)
410410+ }
411411+412412+ // Check if the client is still connected
413413+ if !pooled.Client.IsConnected() {
414414+ cp.metrics.HealthCheckFailures++
415415+ cp.logger.Info("Health check failed for client, removing from pool",
416416+ "endpoint", endpoint, "error", "client not connected")
417417+ cp.RemoveClient(endpoint)
418418+ return fmt.Errorf("health check failed: client not connected")
419419+ }
420420+421421+ cp.metrics.HealthCheckPasses++
422422+ cp.logger.V(1).Info("Health check passed for client", "endpoint", endpoint)
423423+ return nil
424424+}
+112-10
internal/mirror/manager.go
···2121 "crypto/sha256"
2222 "fmt"
2323 "sort"
2424+ "strings"
2425 "time"
25262627 "github.com/go-logr/logr"
···658659 return mm.executeMirrorPlans(ctx, plans, deviceLookup, logger), nil
659660}
660661661661-// discoverAllSecrets discovers all secrets present on any HSM device
662662+// discoverAllSecrets discovers all secrets present on any HSM device with retry logic
662663func (mm *MirrorManager) discoverAllSecrets(ctx context.Context, devices []hsmv1alpha1.DiscoveredDevice, logger logr.Logger) []string {
663664 secretPaths := make(map[string]bool)
665665+ failedDevices := make([]hsmv1alpha1.DiscoveredDevice, 0)
666666+667667+ logger.Info("Starting secret discovery", "totalDevices", len(devices))
664668665669 for _, device := range devices {
666670 deviceId := device.SerialNumber
667671 deviceLogger := logger.WithValues("device", deviceId)
668672669669- hsmClient, err := mm.agentManager.CreateGRPCClient(ctx, device, deviceLogger)
673673+ secrets, err := mm.discoverDeviceSecretsWithRetry(ctx, device, deviceLogger, 3)
670674 if err != nil {
671671- deviceLogger.Info("Failed to connect to device for discovery, skipping", "error", err)
675675+ deviceLogger.Error(err, "Failed to discover secrets on device after retries")
676676+ failedDevices = append(failedDevices, device)
672677 continue
673678 }
674679675675- secrets, err := hsmClient.ListSecrets(ctx, "")
676676- if err != nil {
677677- deviceLogger.Info("Failed to list secrets on device, skipping", "error", err)
678678- continue
680680+ deviceLogger.Info("Successfully discovered secrets on device", "secretCount", len(secrets))
681681+ for _, secretPath := range secrets {
682682+ secretPaths[secretPath] = true
679683 }
684684+ }
680685681681- deviceLogger.Info("Discovered secrets on device", "secretCount", len(secrets))
682682- for _, secretPath := range secrets {
683683- secretPaths[secretPath] = true
686686+ // Log summary of discovery results
687687+ successCount := len(devices) - len(failedDevices)
688688+ logger.Info("Discovery completed",
689689+ "successfulDevices", successCount,
690690+ "failedDevices", len(failedDevices),
691691+ "totalSecretsFound", len(secretPaths))
692692+693693+ if len(failedDevices) > 0 {
694694+ for _, device := range failedDevices {
695695+ logger.Info("Device failed discovery", "device", device.SerialNumber, "nodeName", device.NodeName, "devicePath", device.DevicePath)
684696 }
685697 }
686698···692704 sort.Strings(result)
693705694706 return result
707707+}
708708+709709+// discoverDeviceSecretsWithRetry attempts to discover secrets from a single device with retry logic
710710+func (mm *MirrorManager) discoverDeviceSecretsWithRetry(ctx context.Context, device hsmv1alpha1.DiscoveredDevice, logger logr.Logger, maxRetries int) ([]string, error) {
711711+ var lastErr error
712712+713713+ for attempt := 1; attempt <= maxRetries; attempt++ {
714714+ attemptLogger := logger.WithValues("attempt", attempt, "maxRetries", maxRetries)
715715+716716+ // Create client with connection pool retry logic
717717+ hsmClient, err := mm.agentManager.CreateGRPCClient(ctx, device, attemptLogger)
718718+ if err != nil {
719719+ lastErr = fmt.Errorf("failed to create gRPC client: %w", err)
720720+ attemptLogger.Info("Failed to connect to device", "error", err)
721721+722722+ if attempt < maxRetries {
723723+ backoffDuration := time.Duration(attempt) * time.Second
724724+ attemptLogger.Info("Retrying device connection after backoff", "backoff", backoffDuration.String())
725725+ time.Sleep(backoffDuration)
726726+ continue
727727+ }
728728+ break
729729+ }
730730+731731+ // Ensure client is closed after use
732732+ defer func() {
733733+ if closeErr := hsmClient.Close(); closeErr != nil {
734734+ logger.V(1).Info("Error closing HSM client", "error", closeErr)
735735+ }
736736+ }()
737737+738738+ // Add timeout for list secrets operation
739739+ listCtx, cancel := context.WithTimeout(ctx, 30*time.Second)
740740+ secrets, err := hsmClient.ListSecrets(listCtx, "")
741741+ cancel()
742742+743743+ if err != nil {
744744+ lastErr = fmt.Errorf("failed to list secrets: %w", err)
745745+ attemptLogger.Info("Failed to list secrets on device", "error", err)
746746+747747+ // Check for specific connection-related errors that might benefit from retry
748748+ if isConnectionError(err) && attempt < maxRetries {
749749+ backoffDuration := time.Duration(attempt) * time.Second
750750+ attemptLogger.Info("Connection error detected, retrying after backoff",
751751+ "backoff", backoffDuration.String(), "error", err)
752752+ time.Sleep(backoffDuration)
753753+ continue
754754+ }
755755+756756+ if attempt < maxRetries {
757757+ backoffDuration := time.Duration(attempt) * time.Second
758758+ attemptLogger.Info("Retrying list secrets after backoff", "backoff", backoffDuration.String())
759759+ time.Sleep(backoffDuration)
760760+ continue
761761+ }
762762+ break
763763+ }
764764+765765+ // Success case
766766+ attemptLogger.Info("Successfully listed secrets", "secretCount", len(secrets))
767767+ return secrets, nil
768768+ }
769769+770770+ return nil, fmt.Errorf("failed to discover secrets after %d attempts: %w", maxRetries, lastErr)
771771+}

// isConnectionError reports whether err looks like a transient connection
// problem that might benefit from a retry. The check is a case-sensitive
// substring match on err.Error(); a nil error is never a connection error.
func isConnectionError(err error) bool {
	if err == nil {
		return false
	}

	errStr := err.Error()
	connectionErrors := []string{
		"grpc: the client connection is closing",
		"connection refused",
		"connection reset",
		"connection timeout",
		// Messages Go actually produces for timeouts: the OS-level
		// ETIMEDOUT text and the net package's deadline error.
		"connection timed out",
		"i/o timeout",
		"context deadline exceeded",
		"rpc error: code = Canceled",
		"rpc error: code = Unavailable",
	}

	for _, connErr := range connectionErrors {
		if strings.Contains(errStr, connErr) {
			return true
		}
	}

	return false
}
696798697799// calculateChecksum calculates SHA256 checksum of secret data