···33# To re-generate a bundle for another specific version without changing the standard setup, you can:
44# - use the VERSION as arg of the bundle target (e.g make bundle VERSION=0.0.2)
55# - use environment variables to overwrite this value (e.g export VERSION=0.0.2)
66-VERSION ?= 0.5.27
66+VERSION ?= 0.5.28
7788# CHANNELS define the bundle channels used in the bundle.
99# Add a new line here if you would like to change its default config. (E.g CHANNELS = "candidate,fast,stable")
+2-2
helm/hsm-secrets-operator/Chart.yaml
···22name: hsm-secrets-operator
33description: A Kubernetes operator that bridges Pico HSM binary data storage with Kubernetes Secrets
44type: application
55-version: 0.5.27
66-appVersion: v0.5.27
55+version: 0.5.28
66+appVersion: v0.5.28
77icon: https://raw.githubusercontent.com/cncf/artwork/master/projects/kubernetes/icon/color/kubernetes-icon-color.svg
88home: https://github.com/evanjarrett/hsm-secrets-operator
99sources:
+211
internal/agent/connection_pool.go
···11+/*
22+Copyright 2025.
33+44+Licensed under the Apache License, Version 2.0 (the "License");
55+you may not use this file except in compliance with the License.
66+You may obtain a copy of the License at
77+88+ http://www.apache.org/licenses/LICENSE-2.0
99+1010+Unless required by applicable law or agreed to in writing, software
1111+distributed under the License is distributed on an "AS IS" BASIS,
1212+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
1313+See the License for the specific language governing permissions and
1414+limitations under the License.
1515+*/
1616+1717+package agent
1818+1919+import (
2020+ "context"
2121+ "fmt"
2222+ "sync"
2323+ "time"
2424+2525+ "github.com/go-logr/logr"
2626+2727+ "github.com/evanjarrett/hsm-secrets-operator/internal/hsm"
2828+)
2929+3030+// PooledClient represents a cached gRPC client with metadata
3131+type PooledClient struct {
3232+ Client hsm.Client
3333+ Endpoint string
3434+ CreatedAt time.Time
3535+ LastUsed time.Time
3636+}
3737+3838+// ConnectionPool manages a pool of gRPC connections to HSM agents
3939+type ConnectionPool struct {
4040+ clients map[string]*PooledClient // endpoint -> client
4141+ mutex sync.RWMutex
4242+ logger logr.Logger
4343+ maxAge time.Duration // Maximum age before client is recreated
4444+ cleanupInterval time.Duration
4545+ stopChan chan struct{}
4646+ stopOnce sync.Once
4747+}
4848+4949+// NewConnectionPool creates a new connection pool
5050+func NewConnectionPool(logger logr.Logger) *ConnectionPool {
5151+ pool := &ConnectionPool{
5252+ clients: make(map[string]*PooledClient),
5353+ logger: logger.WithName("connection-pool"),
5454+ maxAge: 5 * time.Minute, // Recreate connections every 5 minutes
5555+ cleanupInterval: 1 * time.Minute, // Cleanup every minute
5656+ stopChan: make(chan struct{}),
5757+ }
5858+5959+ // Start background cleanup goroutine
6060+ go pool.cleanupLoop()
6161+6262+ return pool
6363+}
6464+6565+// GetClient returns a cached client or creates a new one
6666+func (cp *ConnectionPool) GetClient(ctx context.Context, endpoint string, logger logr.Logger) (hsm.Client, error) {
6767+ cp.mutex.Lock()
6868+ defer cp.mutex.Unlock()
6969+7070+ now := time.Now()
7171+7272+ // Check if we have a cached client
7373+ if pooled, exists := cp.clients[endpoint]; exists {
7474+ // Check if client is still valid and not too old
7575+ if now.Sub(pooled.CreatedAt) < cp.maxAge {
7676+ // Update last used time
7777+ pooled.LastUsed = now
7878+ cp.logger.V(1).Info("Reusing cached gRPC client", "endpoint", endpoint,
7979+ "age", now.Sub(pooled.CreatedAt).String())
8080+ return pooled.Client, nil
8181+ } else {
8282+ // Client is too old, close it and remove from cache
8383+ cp.logger.V(1).Info("gRPC client expired, closing", "endpoint", endpoint,
8484+ "age", now.Sub(pooled.CreatedAt).String())
8585+ if err := pooled.Client.Close(); err != nil {
8686+ cp.logger.V(1).Info("Error closing expired client", "endpoint", endpoint, "error", err)
8787+ }
8888+ delete(cp.clients, endpoint)
8989+ }
9090+ }
9191+9292+ // Create new client
9393+ cp.logger.V(1).Info("Creating new gRPC client", "endpoint", endpoint)
9494+ client, err := NewGRPCClient(endpoint, logger)
9595+ if err != nil {
9696+ return nil, fmt.Errorf("failed to create gRPC client: %w", err)
9797+ }
9898+9999+ // Initialize the connection
100100+ if err := client.Initialize(ctx, hsm.Config{}); err != nil {
101101+ if closeErr := client.Close(); closeErr != nil {
102102+ cp.logger.V(1).Info("Error closing client after failed initialization",
103103+ "endpoint", endpoint, "error", closeErr)
104104+ }
105105+ return nil, fmt.Errorf("failed to initialize gRPC client: %w", err)
106106+ }
107107+108108+ // Cache the client
109109+ cp.clients[endpoint] = &PooledClient{
110110+ Client: client,
111111+ Endpoint: endpoint,
112112+ CreatedAt: now,
113113+ LastUsed: now,
114114+ }
115115+116116+ cp.logger.Info("Created and cached new gRPC client", "endpoint", endpoint)
117117+ return client, nil
118118+}
119119+120120+// RemoveClient removes a client from the pool (useful when agent pods restart)
121121+func (cp *ConnectionPool) RemoveClient(endpoint string) {
122122+ cp.mutex.Lock()
123123+ defer cp.mutex.Unlock()
124124+125125+ if pooled, exists := cp.clients[endpoint]; exists {
126126+ cp.logger.Info("Removing client from pool", "endpoint", endpoint)
127127+ if err := pooled.Client.Close(); err != nil {
128128+ cp.logger.V(1).Info("Error closing removed client", "endpoint", endpoint, "error", err)
129129+ }
130130+ delete(cp.clients, endpoint)
131131+ }
132132+}
133133+134134+// Close closes all connections and stops the pool
135135+func (cp *ConnectionPool) Close() {
136136+ cp.stopOnce.Do(func() {
137137+ close(cp.stopChan)
138138+139139+ cp.mutex.Lock()
140140+ defer cp.mutex.Unlock()
141141+142142+ cp.logger.Info("Closing connection pool", "cached_clients", len(cp.clients))
143143+ for endpoint, pooled := range cp.clients {
144144+ if err := pooled.Client.Close(); err != nil {
145145+ cp.logger.V(1).Info("Error closing pooled client", "endpoint", endpoint, "error", err)
146146+ }
147147+ }
148148+ cp.clients = make(map[string]*PooledClient)
149149+ })
150150+}
151151+152152+// cleanupLoop periodically removes unused/expired connections
153153+func (cp *ConnectionPool) cleanupLoop() {
154154+ ticker := time.NewTicker(cp.cleanupInterval)
155155+ defer ticker.Stop()
156156+157157+ for {
158158+ select {
159159+ case <-ticker.C:
160160+ cp.cleanup()
161161+ case <-cp.stopChan:
162162+ return
163163+ }
164164+ }
165165+}
166166+167167+// cleanup removes expired connections
168168+func (cp *ConnectionPool) cleanup() {
169169+ cp.mutex.Lock()
170170+ defer cp.mutex.Unlock()
171171+172172+ now := time.Now()
173173+ var toRemove []string
174174+175175+ for endpoint, pooled := range cp.clients {
176176+ // Remove if too old or unused for too long
177177+ if now.Sub(pooled.CreatedAt) > cp.maxAge || now.Sub(pooled.LastUsed) > cp.maxAge {
178178+ toRemove = append(toRemove, endpoint)
179179+ }
180180+ }
181181+182182+ if len(toRemove) > 0 {
183183+ cp.logger.V(1).Info("Cleaning up expired connections", "count", len(toRemove))
184184+ for _, endpoint := range toRemove {
185185+ if pooled, exists := cp.clients[endpoint]; exists {
186186+ if err := pooled.Client.Close(); err != nil {
187187+ cp.logger.V(1).Info("Error closing expired client during cleanup",
188188+ "endpoint", endpoint, "error", err)
189189+ }
190190+ delete(cp.clients, endpoint)
191191+ }
192192+ }
193193+ }
194194+}
195195+196196+// GetStats returns pool statistics
197197+func (cp *ConnectionPool) GetStats() map[string]interface{} {
198198+ cp.mutex.RLock()
199199+ defer cp.mutex.RUnlock()
200200+201201+ stats := make(map[string]interface{})
202202+ stats["active_connections"] = len(cp.clients)
203203+204204+ endpoints := make([]string, 0, len(cp.clients))
205205+ for endpoint := range cp.clients {
206206+ endpoints = append(endpoints, endpoint)
207207+ }
208208+ stats["endpoints"] = endpoints
209209+210210+ return stats
211211+}
+4-3
internal/agent/grpc_client.go
···4747 return nil, fmt.Errorf("endpoint cannot be empty")
4848 }
49495050- // Create gRPC connection with keepalive
5050+ // Create gRPC connection with conservative keepalive settings
5151+ // Reduce ping frequency to prevent "too_many_pings" errors
5152 conn, err := grpc.NewClient(endpoint,
5253 grpc.WithTransportCredentials(insecure.NewCredentials()),
5354 grpc.WithKeepaliveParams(keepalive.ClientParameters{
5454- Time: 10 * time.Second,
5555- Timeout: 3 * time.Second,
5555+ Time: 30 * time.Second, // Reduced from 10s to 30s
5656+ Timeout: 10 * time.Second, // Increased from 3s to 10s
5657 PermitWithoutStream: true,
5758 }),
5859 )
+10
internal/agent/grpc_server.go
···2626 "github.com/go-logr/logr"
2727 "google.golang.org/grpc"
2828 "google.golang.org/grpc/codes"
2929+ "google.golang.org/grpc/keepalive"
2930 "google.golang.org/grpc/status"
30313132 hsmv1 "github.com/evanjarrett/hsm-secrets-operator/api/proto/hsm/v1"
···6970 return fmt.Errorf("failed to listen on port %d: %w", s.port, err)
7071 }
71727373+ // Configure server with lenient keepalive policy to prevent "too_many_pings" errors
7274 grpcServer := grpc.NewServer(
7375 grpc.UnaryInterceptor(s.loggingInterceptor),
7676+ grpc.KeepaliveEnforcementPolicy(keepalive.EnforcementPolicy{
7777+ MinTime: 15 * time.Second, // Allow pings every 15s minimum
7878+ PermitWithoutStream: true, // Allow pings without active streams
7979+ }),
8080+ grpc.KeepaliveParams(keepalive.ServerParameters{
8181+ Time: 60 * time.Second, // Send pings every 60s if no activity
8282+ Timeout: 10 * time.Second, // Wait 10s for ping response
8383+ }),
7484 )
75857686 // Register the HSM agent service
+21-13
internal/agent/manager.go
···8383 ImageResolver ImageResolver
84848585 // Internal tracking
8686- activeAgents map[string]*AgentInfo // deviceName -> AgentInfo
8787- mu sync.RWMutex
8686+ activeAgents map[string]*AgentInfo // deviceName -> AgentInfo
8787+ connectionPool *ConnectionPool // Shared connection pool for gRPC clients
8888+ mu sync.RWMutex
88898990 // Test configuration
9091 TestMode bool // Enable test mode for faster operations
···107108108109// NewManager creates a new agent manager
109110func NewManager(k8sClient client.Client, namespace string, imageResolver ImageResolver) *Manager {
111111+ // Create logger for the manager
112112+ logger := logr.FromContextOrDiscard(context.Background()).WithName("agent-manager")
110113111114 m := &Manager{
112115 Client: k8sClient,
113116 AgentNamespace: namespace,
114117 ImageResolver: imageResolver,
115118 activeAgents: make(map[string]*AgentInfo),
119119+ connectionPool: NewConnectionPool(logger),
116120 // Default production timeouts
117121 WaitTimeout: 60 * time.Second,
118122 WaitPollInterval: 2 * time.Second,
···126130127131// NewTestManager creates a new agent manager optimized for testing
128132func NewTestManager(k8sClient client.Client, namespace string, imageResolver ImageResolver) *Manager {
133133+ // Create logger for the test manager
134134+ logger := logr.FromContextOrDiscard(context.Background()).WithName("agent-manager-test")
135135+129136 m := &Manager{
130137 Client: k8sClient,
131138 AgentNamespace: namespace,
132139 ImageResolver: imageResolver,
133140 activeAgents: make(map[string]*AgentInfo),
141141+ connectionPool: NewConnectionPool(logger),
134142 // Fast test timeouts
135143 TestMode: true,
136144 WaitTimeout: 5 * time.Second,
···980988 podIP := targetPod.Status.PodIPs[0].IP
981989 endpoint := fmt.Sprintf("%s:%d", podIP, AgentPort)
982990983983- // Create gRPC client
984984- grpcClient, err := NewGRPCClient(endpoint, logger)
991991+ // Use connection pool to get or create cached client
992992+ // This significantly reduces connection overhead and prevents "too_many_pings" errors
993993+ grpcClient, err := m.connectionPool.GetClient(ctx, endpoint, logger)
985994 if err != nil {
986986- return nil, fmt.Errorf("failed to create gRPC client for %s: %w", endpoint, err)
987987- }
988988-989989- // Test the connection
990990- if err := grpcClient.Initialize(ctx, hsm.Config{}); err != nil {
991991- if err := grpcClient.Close(); err != nil {
992992- logger.Error(err, "Failed to close gRPC client after failed initialization")
993993- }
994994- return nil, fmt.Errorf("failed to initialize gRPC client for %s: %w", endpoint, err)
995995+ return nil, fmt.Errorf("failed to get pooled gRPC client for %s: %w", endpoint, err)
995996 }
996997997998 return grpcClient, nil
···10421043 q, _ := resource.ParseQuantity(s)
10431044 return q
10441045}
10461046+10471047+// Close closes the manager and all its resources including the connection pool
10481048+func (m *Manager) Close() {
10491049+ if m.connectionPool != nil {
10501050+ m.connectionPool.Close()
10511051+ }
10521052+}
+2-25
internal/mirror/manager.go
···112112)
113113114114// buildSecretInventory builds a comprehensive inventory of secrets across all devices
115115+//
116116+//nolint:unparam // Error return preserved for future error handling scenarios
115117func (mm *MirrorManager) buildSecretInventory(ctx context.Context, secretPaths []string, devices []hsmv1alpha1.DiscoveredDevice, logger logr.Logger) (map[string]*SecretInventory, error) {
116118 inventory := make(map[string]*SecretInventory)
117119···145147 }
146148 continue
147149 }
148148-149149- defer func(client hsm.Client, device string) {
150150- if closeErr := client.Close(); closeErr != nil {
151151- logger.V(1).Info("Failed to close gRPC client", "device", device, "error", closeErr)
152152- }
153153- }(grpcClient, deviceId)
154150155151 // Check if device is connected
156152 if !grpcClient.IsConnected() {
···544540 if err != nil {
545541 return nil, nil, fmt.Errorf("failed to create gRPC client: %w", err)
546542 }
547547- defer func() {
548548- if closeErr := grpcClient.Close(); closeErr != nil {
549549- logger.V(1).Info("Failed to close gRPC client", "error", closeErr)
550550- }
551551- }()
552543553544 if !grpcClient.IsConnected() {
554545 return nil, nil, fmt.Errorf("device not connected")
···576567 if err != nil {
577568 return fmt.Errorf("failed to create gRPC client: %w", err)
578569 }
579579- defer func() {
580580- if closeErr := grpcClient.Close(); closeErr != nil {
581581- logger.V(1).Info("Failed to close gRPC client", "error", closeErr)
582582- }
583583- }()
584584-585570 if !grpcClient.IsConnected() {
586571 return fmt.Errorf("device not connected")
587572 }
···600585 if err != nil {
601586 return fmt.Errorf("failed to create gRPC client: %w", err)
602587 }
603603- defer func() {
604604- if closeErr := grpcClient.Close(); closeErr != nil {
605605- logger.V(1).Info("Failed to close gRPC client", "error", closeErr)
606606- }
607607- }()
608588609589 if !grpcClient.IsConnected() {
610590 return fmt.Errorf("device not connected")
···778758 }
779759 logger.Info("HSM agents are ready", "readyDevices", len(devices))
780760 return true, nil
781781- }
782782- if closeErr := grpcClient.Close(); closeErr != nil {
783783- logger.V(1).Info("Failed to close gRPC client", "error", closeErr)
784761 }
785762 }
786763 }