···43434444 // SyncInterval defines how often to check for HSM changes (in seconds)
4545 // Only applies when AutoSync is true
4646- // +kubebuilder:default=300
4646+ // +kubebuilder:default=30
4747 // +optional
4848 SyncInterval int32 `json:"syncInterval,omitempty"`
4949}
···6262 SyncStatusPending SyncStatus = "Pending"
6363)
64646565+// HSMDeviceSync tracks synchronization state for a specific HSM device
6666+type HSMDeviceSync struct {
6767+ // DeviceName is the name of the HSM device
6868+ DeviceName string `json:"deviceName"`
6969+7070+ // LastSyncTime is the timestamp of the last successful sync with this device
7171+ // +optional
7272+ LastSyncTime *metav1.Time `json:"lastSyncTime,omitempty"`
7373+7474+ // Checksum is the SHA256 checksum of the data on this device
7575+ // +optional
7676+ Checksum string `json:"checksum,omitempty"`
7777+7878+ // Status indicates the sync status for this specific device
7979+ // +optional
8080+ Status SyncStatus `json:"status,omitempty"`
8181+8282+ // LastError contains the last error when syncing with this device
8383+ // +optional
8484+ LastError string `json:"lastError,omitempty"`
8585+8686+ // Online indicates if this device is currently available
8787+ // +optional
8888+ Online bool `json:"online,omitempty"`
8989+9090+ // Version is a monotonically increasing counter for conflict resolution
9191+ // Updated each time the secret changes on this device
9292+ // +optional
9393+ Version int64 `json:"version,omitempty"`
9494+}
9595+6596// HSMSecretStatus defines the observed state of HSMSecret.
6697type HSMSecretStatus struct {
6798 // LastSyncTime is the timestamp of the last successful synchronization
6899 // +optional
69100 LastSyncTime *metav1.Time `json:"lastSyncTime,omitempty"`
701017171- // HSMChecksum is the SHA256 checksum of the HSM data
102102+ // HSMChecksum is the SHA256 checksum of the HSM data (deprecated - use DeviceSyncStatus)
72103 // +optional
73104 HSMChecksum string `json:"hsmChecksum,omitempty"`
74105···91122 // SecretRef references the created Kubernetes Secret
92123 // +optional
93124 SecretRef *corev1.ObjectReference `json:"secretRef,omitempty"`
125125+126126+ // DeviceSyncStatus tracks sync status for each HSM device in mirrored setups
127127+ // +optional
128128+ DeviceSyncStatus []HSMDeviceSync `json:"deviceSyncStatus,omitempty"`
129129+130130+ // PrimaryDevice indicates which device is currently considered the primary source of truth
131131+ // Used for conflict resolution in multi-device scenarios
132132+ // +optional
133133+ PrimaryDevice string `json:"primaryDevice,omitempty"`
134134+135135+ // SyncConflict indicates if there are conflicting versions across devices
136136+ // +optional
137137+ SyncConflict bool `json:"syncConflict,omitempty"`
94138}
9513996140// +kubebuilder:object:root=true
+26
api/v1alpha1/zz_generated.deepcopy.go
···189189}
190190191191// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil.
192192+func (in *HSMDeviceSync) DeepCopyInto(out *HSMDeviceSync) {
193193+ *out = *in
194194+ if in.LastSyncTime != nil {
195195+ in, out := &in.LastSyncTime, &out.LastSyncTime
196196+ *out = (*in).DeepCopy()
197197+ }
198198+}
199199+200200+// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new HSMDeviceSync.
201201+func (in *HSMDeviceSync) DeepCopy() *HSMDeviceSync {
202202+ if in == nil {
203203+ return nil
204204+ }
205205+ out := new(HSMDeviceSync)
206206+ in.DeepCopyInto(out)
207207+ return out
208208+}
209209+210210+// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil.
192211func (in *HSMPool) DeepCopyInto(out *HSMPool) {
193212 *out = *in
194213 out.TypeMeta = in.TypeMeta
···414433 in, out := &in.SecretRef, &out.SecretRef
415434 *out = new(corev1.ObjectReference)
416435 **out = **in
436436+ }
437437+ if in.DeviceSyncStatus != nil {
438438+ in, out := &in.DeviceSyncStatus, &out.DeviceSyncStatus
439439+ *out = make([]HSMDeviceSync, len(*in))
440440+ for i := range *in {
441441+ (*in)[i].DeepCopyInto(&(*out)[i])
442442+ }
417443 }
418444}
419445
+52-2
config/crd/bases/hsm.j5t.io_hsmsecrets.yaml
···7070 create
7171 type: string
7272 syncInterval:
7373- default: 300
7373+ default: 30
7474 description: |-
7575 SyncInterval defines how often to check for HSM changes (in seconds)
7676 Only applies when AutoSync is true
···138138 - type
139139 type: object
140140 type: array
141141+ deviceSyncStatus:
142142+ description: DeviceSyncStatus tracks sync status for each HSM device
143143+ in mirrored setups
144144+ items:
145145+ description: HSMDeviceSync tracks synchronization state for a specific
146146+ HSM device
147147+ properties:
148148+ checksum:
149149+ description: Checksum is the SHA256 checksum of the data on
150150+ this device
151151+ type: string
152152+ deviceName:
153153+ description: DeviceName is the name of the HSM device
154154+ type: string
155155+ lastError:
156156+ description: LastError contains the last error when syncing
157157+ with this device
158158+ type: string
159159+ lastSyncTime:
160160+ description: LastSyncTime is the timestamp of the last successful
161161+ sync with this device
162162+ format: date-time
163163+ type: string
164164+ online:
165165+ description: Online indicates if this device is currently available
166166+ type: boolean
167167+ status:
168168+ description: Status indicates the sync status for this specific
169169+ device
170170+ type: string
171171+ version:
172172+ description: |-
173173+ Version is a monotonically increasing counter for conflict resolution
174174+ Updated each time the secret changes on this device
175175+ format: int64
176176+ type: integer
177177+ required:
178178+ - deviceName
179179+ type: object
180180+ type: array
141181 hsmChecksum:
142142- description: HSMChecksum is the SHA256 checksum of the HSM data
182182+ description: HSMChecksum is the SHA256 checksum of the HSM data (deprecated
183183+ - use DeviceSyncStatus)
143184 type: string
144185 lastError:
145186 description: LastError contains the last error message if SyncStatus
···149190 description: LastSyncTime is the timestamp of the last successful
150191 synchronization
151192 format: date-time
193193+ type: string
194194+ primaryDevice:
195195+ description: |-
196196+ PrimaryDevice indicates which device is currently considered the primary source of truth
197197+ Used for conflict resolution in multi-device scenarios
152198 type: string
153199 secretChecksum:
154200 description: SecretChecksum is the SHA256 checksum of the Kubernetes
···197243 type: string
198244 type: object
199245 x-kubernetes-map-type: atomic
246246+ syncConflict:
247247+ description: SyncConflict indicates if there are conflicting versions
248248+ across devices
249249+ type: boolean
200250 syncStatus:
201251 description: SyncStatus indicates the current synchronization status
202252 type: string
···18181919import (
2020 "context"
2121+ "fmt"
2122 "net/http"
2223 "sync"
2424+ "time"
23252426 "github.com/gin-gonic/gin"
2527 "github.com/go-logr/logr"
26282729 "github.com/evanjarrett/hsm-secrets-operator/internal/hsm"
2830)
3131+3232+// WriteResult represents the result of writing to a single device
3333+type WriteResult struct {
3434+ DeviceName string
3535+ Error error
3636+}
29373038// ProxyClient handles HTTP requests and proxies them to gRPC clients
3139// It has methods that match the HTTP endpoints and handle the full request/response cycle
···96104 return grpcClient, nil
97105}
98106107107+// getAllAvailableGRPCClients returns all available gRPC clients for mirroring operations
108108+func (p *ProxyClient) getAllAvailableGRPCClients(c *gin.Context) (map[string]hsm.Client, error) {
109109+ // Extract namespace
110110+ namespace := c.GetHeader("X-Namespace")
111111+ if namespace == "" {
112112+ namespace = "secrets"
113113+ }
114114+115115+ // Get all available devices
116116+ devices, err := p.server.getAllAvailableAgents(c.Request.Context(), namespace)
117117+ if err != nil {
118118+ return nil, err
119119+ }
120120+121121+ clients := make(map[string]hsm.Client)
122122+ p.clientsMutex.Lock()
123123+ defer p.clientsMutex.Unlock()
124124+125125+ for _, deviceName := range devices {
126126+ // Try to get existing client for this device
127127+ if client, exists := p.grpcClients[deviceName]; exists && client.IsConnected() {
128128+ clients[deviceName] = client
129129+ continue
130130+ }
131131+132132+ // Close existing client for this device if it exists but is not connected
133133+ if oldClient, exists := p.grpcClients[deviceName]; exists {
134134+ if closeErr := oldClient.Close(); closeErr != nil {
135135+ p.logger.V(1).Info("Error closing old gRPC client", "device", deviceName, "error", closeErr)
136136+ }
137137+ delete(p.grpcClients, deviceName)
138138+ }
139139+140140+ // Create new gRPC client
141141+ grpcClient, err := p.server.createGRPCClient(c.Request.Context(), deviceName, namespace)
142142+ if err != nil {
143143+ p.logger.V(1).Info("Failed to create gRPC client", "device", deviceName, "error", err)
144144+ continue
145145+ }
146146+147147+ // Cache and include the client
148148+ p.grpcClients[deviceName] = grpcClient
149149+ clients[deviceName] = grpcClient
150150+ p.logger.V(1).Info("Created new gRPC client", "device", deviceName)
151151+ }
152152+153153+ return clients, nil
154154+}
155155+99156// GetInfo handles GET /hsm/info
100157func (p *ProxyClient) GetInfo(c *gin.Context) {
101158 grpcClient, err := p.getOrCreateGRPCClient(c)
···177234 p.server.sendResponse(c, http.StatusOK, "Secret read successfully", response)
178235}
179236180180-// WriteSecret handles POST/PUT /hsm/secrets/:path
237237+// WriteSecret handles POST/PUT /hsm/secrets/:path with mirroring support
181238func (p *ProxyClient) WriteSecret(c *gin.Context) {
182239 path := c.Param("path")
183240 if path == "" {
···189246 var req struct {
190247 Data map[string]string `json:"data" binding:"required"`
191248 Metadata *hsm.SecretMetadata `json:"metadata,omitempty"`
249249+ Mirror *bool `json:"mirror,omitempty"` // Enable/disable mirroring for this request
192250 }
193251 if err := c.ShouldBindJSON(&req); err != nil {
194252 p.server.sendError(c, http.StatusBadRequest, "parse_error", "Failed to parse request body", map[string]any{
···203261 data[key] = []byte(value)
204262 }
205263206206- grpcClient, err := p.getOrCreateGRPCClient(c)
207207- if err != nil {
208208- p.server.sendError(c, http.StatusServiceUnavailable, "no_agent", "No HSM agents available", map[string]any{
209209- "error": err.Error(),
210210- })
211211- return
212212- }
264264+ // Determine if we should mirror this write (default: true)
265265+ shouldMirror := req.Mirror == nil || *req.Mirror
266266+267267+ if shouldMirror {
268268+ // Get all available clients for mirroring
269269+ clients, err := p.getAllAvailableGRPCClients(c)
270270+ if err != nil {
271271+ p.server.sendError(c, http.StatusServiceUnavailable, "no_agents", "No HSM agents available for mirroring", map[string]any{
272272+ "error": err.Error(),
273273+ })
274274+ return
275275+ }
276276+277277+ if len(clients) == 0 {
278278+ p.server.sendError(c, http.StatusServiceUnavailable, "no_agents", "No HSM agents available", nil)
279279+ return
280280+ }
281281+282282+ // Add mirroring metadata
283283+ metadata := req.Metadata
284284+ if metadata == nil {
285285+ metadata = &hsm.SecretMetadata{Tags: make(map[string]string)}
286286+ }
287287+ if metadata.Tags == nil {
288288+ metadata.Tags = make(map[string]string)
289289+ }
290290+ metadata.Tags["sync.version"] = fmt.Sprintf("%d", time.Now().Unix())
291291+ metadata.Tags["sync.timestamp"] = time.Now().Format(time.RFC3339)
292292+ metadata.Tags["sync.mirrored"] = "true"
293293+294294+ // Write to all devices in parallel
295295+ results := p.writeToAllDevices(c.Request.Context(), clients, path, data, metadata)
296296+297297+ // Check results
298298+ successful := 0
299299+ var errors []string
300300+ deviceResults := make(map[string]any)
301301+302302+ for deviceName, result := range results {
303303+ deviceResults[deviceName] = map[string]any{
304304+ "success": result.Error == nil,
305305+ "error": func() string {
306306+ if result.Error != nil {
307307+ return result.Error.Error()
308308+ }
309309+ return ""
310310+ }(),
311311+ }
312312+313313+ if result.Error == nil {
314314+ successful++
315315+ } else {
316316+ errors = append(errors, fmt.Sprintf("%s: %v", deviceName, result.Error))
317317+ p.logger.Error(result.Error, "Failed to write to device", "device", deviceName, "path", path)
318318+ }
319319+ }
320320+321321+ // Consider the operation successful if we wrote to at least one device
322322+ if successful > 0 {
323323+ response := map[string]any{
324324+ "path": path,
325325+ "keys": len(data),
326326+ "mirrored": true,
327327+ "devices": len(clients),
328328+ "successful": successful,
329329+ "deviceResults": deviceResults,
330330+ }
331331+ if metadata != nil {
332332+ response["metadata"] = metadata
333333+ }
334334+ if len(errors) > 0 {
335335+ response["warnings"] = errors
336336+ }
213337214214- if req.Metadata != nil {
215215- err = grpcClient.WriteSecretWithMetadata(c.Request.Context(), path, data, req.Metadata)
338338+ statusCode := http.StatusCreated
339339+ message := "Secret written successfully"
340340+ if successful < len(clients) {
341341+ statusCode = http.StatusPartialContent // 206 indicates partial success
342342+ message = fmt.Sprintf("Secret written to %d/%d devices", successful, len(clients))
343343+ }
344344+345345+ p.server.sendResponse(c, statusCode, message, response)
346346+ } else {
347347+ // All devices failed
348348+ p.server.sendError(c, http.StatusInternalServerError, "write_failed", "Failed to write secret to any HSM device", map[string]any{
349349+ "errors": errors,
350350+ "deviceResults": deviceResults,
351351+ "path": path,
352352+ })
353353+ }
216354 } else {
217217- err = grpcClient.WriteSecret(c.Request.Context(), path, data)
218218- }
355355+ // Single-device write (no mirroring)
356356+ grpcClient, err := p.getOrCreateGRPCClient(c)
357357+ if err != nil {
358358+ p.server.sendError(c, http.StatusServiceUnavailable, "no_agent", "No HSM agents available", map[string]any{
359359+ "error": err.Error(),
360360+ })
361361+ return
362362+ }
219363220220- if err != nil {
221221- p.server.sendError(c, http.StatusInternalServerError, "grpc_error", "Failed to write secret to HSM", map[string]any{
222222- "error": err.Error(),
223223- "path": path,
224224- })
225225- return
226226- }
364364+ if req.Metadata != nil {
365365+ err = grpcClient.WriteSecretWithMetadata(c.Request.Context(), path, data, req.Metadata)
366366+ } else {
367367+ err = grpcClient.WriteSecret(c.Request.Context(), path, data)
368368+ }
227369228228- response := map[string]any{
229229- "path": path,
230230- "keys": len(data),
231231- }
232232- if req.Metadata != nil {
233233- response["metadata"] = req.Metadata
370370+ if err != nil {
371371+ p.server.sendError(c, http.StatusInternalServerError, "grpc_error", "Failed to write secret to HSM", map[string]any{
372372+ "error": err.Error(),
373373+ "path": path,
374374+ })
375375+ return
376376+ }
377377+378378+ response := map[string]any{
379379+ "path": path,
380380+ "keys": len(data),
381381+ "mirrored": false,
382382+ }
383383+ if req.Metadata != nil {
384384+ response["metadata"] = req.Metadata
385385+ }
386386+ p.server.sendResponse(c, http.StatusCreated, "Secret written successfully", response)
234387 }
235235- p.server.sendResponse(c, http.StatusCreated, "Secret written successfully", response)
236388}
237389238238-// DeleteSecret handles DELETE /hsm/secrets/:path
390390+// DeleteSecret handles DELETE /hsm/secrets/:path with mirroring support
239391func (p *ProxyClient) DeleteSecret(c *gin.Context) {
240392 path := c.Param("path")
241393 if path == "" {
···243395 return
244396 }
245397246246- grpcClient, err := p.getOrCreateGRPCClient(c)
247247- if err != nil {
248248- p.server.sendError(c, http.StatusServiceUnavailable, "no_agent", "No HSM agents available", map[string]any{
249249- "error": err.Error(),
250250- })
251251- return
252252- }
398398+ // Check if mirroring is explicitly disabled
399399+ mirror := c.Query("mirror")
400400+ shouldMirror := mirror != "false" // Default to true unless explicitly set to false
401401+402402+ if shouldMirror {
403403+ // Get all available clients for mirroring delete
404404+ clients, err := p.getAllAvailableGRPCClients(c)
405405+ if err != nil {
406406+ p.server.sendError(c, http.StatusServiceUnavailable, "no_agents", "No HSM agents available for mirroring", map[string]any{
407407+ "error": err.Error(),
408408+ })
409409+ return
410410+ }
411411+412412+ if len(clients) == 0 {
413413+ p.server.sendError(c, http.StatusServiceUnavailable, "no_agents", "No HSM agents available", nil)
414414+ return
415415+ }
416416+417417+ // Delete from all devices in parallel
418418+ results := p.deleteFromAllDevices(c.Request.Context(), clients, path)
419419+420420+ // Check results
421421+ successful := 0
422422+ var errors []string
423423+ deviceResults := make(map[string]any)
424424+425425+ for deviceName, result := range results {
426426+ deviceResults[deviceName] = map[string]any{
427427+ "success": result.Error == nil,
428428+ "error": func() string {
429429+ if result.Error != nil {
430430+ return result.Error.Error()
431431+ }
432432+ return ""
433433+ }(),
434434+ }
435435+436436+ if result.Error == nil {
437437+ successful++
438438+ } else {
439439+ errors = append(errors, fmt.Sprintf("%s: %v", deviceName, result.Error))
440440+ p.logger.Error(result.Error, "Failed to delete from device", "device", deviceName, "path", path)
441441+ }
442442+ }
443443+444444+ // Consider the operation successful if we deleted from at least one device
445445+ if successful > 0 {
446446+ response := map[string]any{
447447+ "path": path,
448448+ "mirrored": true,
449449+ "devices": len(clients),
450450+ "successful": successful,
451451+ "deviceResults": deviceResults,
452452+ }
453453+ if len(errors) > 0 {
454454+ response["warnings"] = errors
455455+ }
456456+457457+ statusCode := http.StatusOK
458458+ message := "Secret deleted successfully"
459459+ if successful < len(clients) {
460460+ statusCode = http.StatusPartialContent // 206 indicates partial success
461461+ message = fmt.Sprintf("Secret deleted from %d/%d devices", successful, len(clients))
462462+ }
463463+464464+ p.server.sendResponse(c, statusCode, message, response)
465465+ } else {
466466+ // All devices failed
467467+ p.server.sendError(c, http.StatusInternalServerError, "delete_failed", "Failed to delete secret from any HSM device", map[string]any{
468468+ "errors": errors,
469469+ "deviceResults": deviceResults,
470470+ "path": path,
471471+ })
472472+ }
473473+ } else {
474474+ // Single-device delete (no mirroring)
475475+ grpcClient, err := p.getOrCreateGRPCClient(c)
476476+ if err != nil {
477477+ p.server.sendError(c, http.StatusServiceUnavailable, "no_agent", "No HSM agents available", map[string]any{
478478+ "error": err.Error(),
479479+ })
480480+ return
481481+ }
253482254254- err = grpcClient.DeleteSecret(c.Request.Context(), path)
255255- if err != nil {
256256- p.server.sendError(c, http.StatusInternalServerError, "grpc_error", "Failed to delete secret from HSM", map[string]any{
257257- "error": err.Error(),
258258- "path": path,
259259- })
260260- return
261261- }
483483+ err = grpcClient.DeleteSecret(c.Request.Context(), path)
484484+ if err != nil {
485485+ p.server.sendError(c, http.StatusInternalServerError, "grpc_error", "Failed to delete secret from HSM", map[string]any{
486486+ "error": err.Error(),
487487+ "path": path,
488488+ })
489489+ return
490490+ }
262491263263- response := map[string]any{
264264- "path": path,
492492+ response := map[string]any{
493493+ "path": path,
494494+ "mirrored": false,
495495+ }
496496+ p.server.sendResponse(c, http.StatusOK, "Secret deleted successfully", response)
265497 }
266266- p.server.sendResponse(c, http.StatusOK, "Secret deleted successfully", response)
267498}
268499269500// ReadMetadata handles GET /hsm/secrets/:path/metadata
···395626 p.clientsMutex.RLock()
396627 defer p.clientsMutex.RUnlock()
397628 return len(p.grpcClients)
629629+}
630630+631631+// writeToAllDevices writes secret data to all devices in parallel
632632+func (p *ProxyClient) writeToAllDevices(ctx context.Context, clients map[string]hsm.Client, path string, data hsm.SecretData, metadata *hsm.SecretMetadata) map[string]WriteResult {
633633+ results := make(map[string]WriteResult)
634634+ resultsMutex := sync.Mutex{}
635635+ wg := sync.WaitGroup{}
636636+637637+ for deviceName, client := range clients {
638638+ wg.Add(1)
639639+ go func(deviceName string, client hsm.Client) {
640640+ defer wg.Done()
641641+642642+ var err error
643643+ if metadata != nil {
644644+ err = client.WriteSecretWithMetadata(ctx, path, data, metadata)
645645+ } else {
646646+ err = client.WriteSecret(ctx, path, data)
647647+ }
648648+649649+ resultsMutex.Lock()
650650+ results[deviceName] = WriteResult{
651651+ DeviceName: deviceName,
652652+ Error: err,
653653+ }
654654+ resultsMutex.Unlock()
655655+ }(deviceName, client)
656656+ }
657657+658658+ wg.Wait()
659659+ return results
660660+}
661661+662662+// deleteFromAllDevices deletes secret data from all devices in parallel
663663+func (p *ProxyClient) deleteFromAllDevices(ctx context.Context, clients map[string]hsm.Client, path string) map[string]WriteResult {
664664+ results := make(map[string]WriteResult)
665665+ resultsMutex := sync.Mutex{}
666666+ wg := sync.WaitGroup{}
667667+668668+ for deviceName, client := range clients {
669669+ wg.Add(1)
670670+ go func(deviceName string, client hsm.Client) {
671671+ defer wg.Done()
672672+673673+ err := client.DeleteSecret(ctx, path)
674674+675675+ resultsMutex.Lock()
676676+ results[deviceName] = WriteResult{
677677+ DeviceName: deviceName,
678678+ Error: err,
679679+ }
680680+ resultsMutex.Unlock()
681681+ }(deviceName, client)
682682+ }
683683+684684+ wg.Wait()
685685+ return results
398686}
399687400688// Interface compliance methods (unused in HTTP mode but required for hsm.Client interface)
+42-20
internal/api/server.go
···29293030 hsmv1alpha1 "github.com/evanjarrett/hsm-secrets-operator/api/v1alpha1"
3131 "github.com/evanjarrett/hsm-secrets-operator/internal/agent"
3232- "github.com/evanjarrett/hsm-secrets-operator/internal/discovery"
3332 "github.com/evanjarrett/hsm-secrets-operator/internal/hsm"
3433)
35343635// Server represents the HSM REST API server that proxies requests to agent pods
3736type Server struct {
3838- client client.Client
3939- agentManager *agent.Manager
4040- mirroringManager *discovery.MirroringManager
4141- validator *validator.Validate
4242- logger logr.Logger
4343- router *gin.Engine
4444- proxyClient *ProxyClient
3737+ client client.Client
3838+ agentManager *agent.Manager
3939+ validator *validator.Validate
4040+ logger logr.Logger
4141+ router *gin.Engine
4242+ proxyClient *ProxyClient
4543}
46444745// NewServer creates a new API server instance that proxies to agents
4848-func NewServer(k8sClient client.Client, agentManager *agent.Manager, mirroringManager *discovery.MirroringManager, logger logr.Logger) *Server {
4646+func NewServer(k8sClient client.Client, agentManager *agent.Manager, logger logr.Logger) *Server {
4947 s := &Server{
5050- client: k8sClient,
5151- agentManager: agentManager,
5252- mirroringManager: mirroringManager,
5353- validator: validator.New(),
5454- logger: logger.WithName("api-server"),
4848+ client: k8sClient,
4949+ agentManager: agentManager,
5050+ validator: validator.New(),
5151+ logger: logger.WithName("api-server"),
5552 }
56535754 // Create ProxyClient instance
···8986 // In proxy mode, check if any agents are available
9087 _, agentErr := s.findAvailableAgent(c.Request.Context(), "secrets")
9188 hsmConnected := agentErr == nil
9292- replicationEnabled := s.mirroringManager != nil
9393- activeNodes := 0
94899595- if s.mirroringManager != nil {
9696- // Count active nodes (simplified - in real implementation would check actual node health)
9797- activeNodes = 1 // Current node
9898- }
9090+ // Check if multiple agents are available for replication
9191+ agents, _ := s.getAllAvailableAgents(c.Request.Context(), "secrets")
9292+ replicationEnabled := len(agents) > 1
9393+ activeNodes := len(agents)
999410095 status := "healthy"
10196 if !hsmConnected {
···192187 }
193188194189 return "", fmt.Errorf("no available HSM agents found")
190190+}
191191+192192+// getAllAvailableAgents finds all available HSM agents for mirroring operations
193193+func (s *Server) getAllAvailableAgents(ctx context.Context, namespace string) ([]string, error) {
194194+ if s.agentManager == nil {
195195+ return nil, fmt.Errorf("agent manager not available")
196196+ }
197197+198198+ // List all HSMDevices to find all with active agents
199199+ var hsmDeviceList hsmv1alpha1.HSMDeviceList
200200+ if err := s.client.List(ctx, &hsmDeviceList, client.InNamespace(namespace)); err != nil {
201201+ return nil, fmt.Errorf("failed to list HSM devices: %w", err)
202202+ }
203203+204204+ var availableDevices []string
205205+ // Check all devices that have active agents with pod IPs
206206+ for _, device := range hsmDeviceList.Items {
207207+ if podIPs, err := s.agentManager.GetAgentPodIPs(device.Name); err == nil && len(podIPs) > 0 {
208208+ availableDevices = append(availableDevices, device.Name)
209209+ }
210210+ }
211211+212212+ if len(availableDevices) == 0 {
213213+ return nil, fmt.Errorf("no available HSM agents found")
214214+ }
215215+216216+ return availableDevices, nil
195217}
196218197219// createGRPCClient creates a gRPC client for the specified device using AgentManager
+13-30
internal/controller/hsmsecret_controller.go
···35353636 hsmv1alpha1 "github.com/evanjarrett/hsm-secrets-operator/api/v1alpha1"
3737 "github.com/evanjarrett/hsm-secrets-operator/internal/agent"
3838- "github.com/evanjarrett/hsm-secrets-operator/internal/discovery"
3938 "github.com/evanjarrett/hsm-secrets-operator/internal/hsm"
4039)
4140···4443 HSMSecretFinalizer = "hsmsecret.hsm.j5t.io/finalizer"
45444645 // DefaultSyncInterval is the default sync interval in seconds
4747- DefaultSyncInterval = 300
4646+ DefaultSyncInterval = 30
4847)
49485049// HSMSecretReconciler reconciles a HSMSecret object
5150type HSMSecretReconciler struct {
5251 client.Client
5353- Scheme *runtime.Scheme
5454- MirroringManager *discovery.MirroringManager
5555- AgentManager *agent.Manager
5252+ Scheme *runtime.Scheme
5353+ AgentManager *agent.Manager
5654}
57555856// +kubebuilder:rbac:groups=hsm.j5t.io,resources=hsmsecrets,verbs=get;list;watch;create;update;patch;delete
···176174 syncInterval = DefaultSyncInterval
177175 }
178176179179- // Read secret from HSM with readonly fallback support
180180- hsmData, err := r.readSecretWithFallback(ctx, hsmSecret, hsmClient)
177177+ // Read secret from HSM via agent
178178+ hsmData, err := r.readSecretFromHSM(ctx, hsmSecret, hsmClient)
181179 if err != nil {
182182- logger.Error(err, "Failed to read secret from HSM and mirrors", "path", hsmSecret.Name)
180180+ logger.Error(err, "Failed to read secret from HSM", "path", hsmSecret.Name)
183181 return ctrl.Result{RequeueAfter: time.Minute * 2}, err
184182 }
185183···367365 }
368366}
369367370370-// readSecretWithFallback attempts to read a secret from primary HSM, falling back to mirrors if needed
371371-func (r *HSMSecretReconciler) readSecretWithFallback(ctx context.Context, hsmSecret *hsmv1alpha1.HSMSecret, hsmClient hsm.Client) (hsm.SecretData, error) {
368368+// readSecretFromHSM attempts to read a secret from HSM via agent
369369+func (r *HSMSecretReconciler) readSecretFromHSM(ctx context.Context, hsmSecret *hsmv1alpha1.HSMSecret, hsmClient hsm.Client) (hsm.SecretData, error) {
372370 logger := log.FromContext(ctx)
373371374374- // Try to read from primary HSM first (via agent)
372372+ // Read from HSM via agent (sync handles mirroring automatically)
375373 if hsmClient != nil && hsmClient.IsConnected() {
376374 data, err := hsmClient.ReadSecret(ctx, hsmSecret.Name)
377375 if err == nil {
378378- logger.V(1).Info("Successfully read secret from primary HSM", "path", hsmSecret.Name)
376376+ logger.V(1).Info("Successfully read secret from HSM", "path", hsmSecret.Name)
379377 return data, nil
380378 }
381381- logger.V(1).Info("Failed to read from primary HSM, attempting fallback", "error", err)
379379+ logger.V(1).Info("Failed to read from HSM", "error", err)
380380+ return nil, err
382381 }
383382384384- // If primary failed and we have a mirroring manager, try readonly access from mirrors
385385- if r.MirroringManager != nil {
386386- // Find relevant HSMDevice for this secret path
387387- hsmDevice, err := r.findHSMDeviceForSecret(ctx, hsmSecret)
388388- if err != nil {
389389- logger.Error(err, "Failed to find HSM device for readonly fallback")
390390- } else if hsmDevice != nil {
391391- data, err := r.MirroringManager.GetReadOnlyAccess(ctx, hsmSecret.Name, hsmDevice)
392392- if err == nil {
393393- logger.Info("Successfully read secret from readonly mirror", "path", hsmSecret.Name)
394394- return data, nil
395395- }
396396- logger.V(1).Info("Failed to read from mirrors", "error", err)
397397- }
398398- }
399399-400400- return nil, fmt.Errorf("secret not accessible from primary HSM or mirrors")
383383+ return nil, fmt.Errorf("HSM client not available or not connected")
401384}
402385403386// findHSMDeviceForSecret finds the HSMDevice that should contain the secret
+128
internal/controller/hsmsync_controller.go
···11+/*
22+Copyright 2025.
33+44+Licensed under the Apache License, Version 2.0 (the "License");
55+you may not use this file except in compliance with the License.
66+You may obtain a copy of the License at
77+88+ http://www.apache.org/licenses/LICENSE-2.0
99+1010+Unless required by applicable law or agreed to in writing, software
1111+distributed under the License is distributed on an "AS IS" BASIS,
1212+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
1313+See the License for the specific language governing permissions and
1414+limitations under the License.
1515+*/
1616+1717+package controller
1818+1919+import (
2020+ "context"
2121+ "time"
2222+2323+ "k8s.io/apimachinery/pkg/runtime"
2424+ ctrl "sigs.k8s.io/controller-runtime"
2525+ "sigs.k8s.io/controller-runtime/pkg/client"
2626+ "sigs.k8s.io/controller-runtime/pkg/log"
2727+2828+ hsmv1alpha1 "github.com/evanjarrett/hsm-secrets-operator/api/v1alpha1"
2929+ "github.com/evanjarrett/hsm-secrets-operator/internal/agent"
3030+ "github.com/evanjarrett/hsm-secrets-operator/internal/sync"
3131+)
3232+3333+// HSMSyncReconciler handles multi-device HSM synchronization and conflict resolution
3434+type HSMSyncReconciler struct {
3535+ client.Client
3636+ Scheme *runtime.Scheme
3737+ SyncManager *sync.SyncManager
3838+3939+ // SyncInterval controls how often to perform sync checks (default: 30 seconds)
4040+ SyncInterval time.Duration
4141+}
4242+4343+// NewHSMSyncReconciler creates a new HSM sync reconciler
4444+func NewHSMSyncReconciler(k8sClient client.Client, scheme *runtime.Scheme, agentManager *agent.Manager) *HSMSyncReconciler {
4545+ logger := ctrl.Log.WithName("hsm-sync-controller")
4646+ syncManager := sync.NewSyncManager(k8sClient, agentManager, logger)
4747+4848+ return &HSMSyncReconciler{
4949+ Client: k8sClient,
5050+ Scheme: scheme,
5151+ SyncManager: syncManager,
5252+ SyncInterval: 30 * time.Second, // Default sync interval
5353+ }
5454+}
5555+5656+// +kubebuilder:rbac:groups=hsm.j5t.io,resources=hsmsecrets,verbs=get;list;watch;update;patch
5757+// +kubebuilder:rbac:groups=hsm.j5t.io,resources=hsmsecrets/status,verbs=get;update;patch
5858+// +kubebuilder:rbac:groups=hsm.j5t.io,resources=hsmpools,verbs=get;list;watch
5959+// +kubebuilder:rbac:groups=hsm.j5t.io,resources=hsmdevices,verbs=get;list;watch
6060+6161+// Reconcile performs HSM device synchronization and conflict resolution
6262+func (r *HSMSyncReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.Result, error) {
6363+ logger := log.FromContext(ctx)
6464+6565+ // Fetch the HSMSecret instance
6666+ var hsmSecret hsmv1alpha1.HSMSecret
6767+ if err := r.Get(ctx, req.NamespacedName, &hsmSecret); err != nil {
6868+ return ctrl.Result{}, client.IgnoreNotFound(err)
6969+ }
7070+7171+ // Skip sync if AutoSync is disabled
7272+ if !hsmSecret.Spec.AutoSync {
7373+ logger.V(1).Info("AutoSync disabled, skipping sync", "secret", hsmSecret.Name)
7474+ return ctrl.Result{}, nil
7575+ }
7676+7777+ logger.Info("Starting multi-device HSM sync", "secret", hsmSecret.Name)
7878+7979+ // Perform the sync operation
8080+ result, err := r.SyncManager.SyncSecret(ctx, &hsmSecret)
8181+ if err != nil {
8282+ logger.Error(err, "Failed to perform HSM sync", "secret", hsmSecret.Name)
8383+8484+ // Update status with error
8585+ hsmSecret.Status.SyncStatus = hsmv1alpha1.SyncStatusError
8686+ hsmSecret.Status.LastError = err.Error()
8787+ if updateErr := r.Status().Update(ctx, &hsmSecret); updateErr != nil {
8888+ logger.Error(updateErr, "Failed to update HSMSecret status")
8989+ }
9090+9191+ // Retry sooner on error
9292+ return ctrl.Result{RequeueAfter: r.SyncInterval / 2}, nil
9393+ }
9494+9595+ // Update HSMSecret status with sync results
9696+ if err := r.SyncManager.UpdateHSMSecretStatus(ctx, &hsmSecret, result); err != nil {
9797+ logger.Error(err, "Failed to update HSMSecret status", "secret", hsmSecret.Name)
9898+ return ctrl.Result{RequeueAfter: r.SyncInterval / 2}, err
9999+ }
100100+101101+ // Log sync results
102102+ if result.ConflictDetected {
103103+ logger.Info("Conflict detected and resolved",
104104+ "secret", hsmSecret.Name,
105105+ "primaryDevice", result.PrimaryDevice,
106106+ "devices", len(result.DeviceResults))
107107+ } else {
108108+ logger.V(1).Info("HSM sync completed successfully",
109109+ "secret", hsmSecret.Name,
110110+ "devices", len(result.DeviceResults))
111111+ }
112112+113113+ // Calculate next sync interval based on HSMSecret spec
114114+ syncInterval := r.SyncInterval
115115+ if hsmSecret.Spec.SyncInterval > 0 {
116116+ syncInterval = time.Duration(hsmSecret.Spec.SyncInterval) * time.Second
117117+ }
118118+119119+ return ctrl.Result{RequeueAfter: syncInterval}, nil
120120+}
121121+122122+// SetupWithManager sets up the controller with the Manager.
123123+func (r *HSMSyncReconciler) SetupWithManager(mgr ctrl.Manager) error {
124124+ return ctrl.NewControllerManagedBy(mgr).
125125+ For(&hsmv1alpha1.HSMSecret{}).
126126+ Named("hsmsync").
127127+ Complete(r)
128128+}
-114
internal/discovery/mirroring.go
···11-/*
22-Copyright 2025.
33-44-Licensed under the Apache License, Version 2.0 (the "License");
55-you may not use this file except in compliance with the License.
66-You may obtain a copy of the License at
77-88- http://www.apache.org/licenses/LICENSE-2.0
99-1010-Unless required by applicable law or agreed to in writing, software
1111-distributed under the License is distributed on an "AS IS" BASIS,
1212-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
1313-See the License for the specific language governing permissions and
1414-limitations under the License.
1515-*/
1616-1717-// TODO: This entire mirroring system needs to be redesigned for the new HSMPool architecture.
1818-// The previous implementation tried to modify HSMDevice.Status which no longer exists.
1919-// Providing stub implementations to avoid compilation errors while the new architecture is implemented.
2020-2121-package discovery
2222-2323-import (
2424- "context"
2525- "fmt"
2626- "sync"
2727- "time"
2828-2929- "github.com/go-logr/logr"
3030- ctrl "sigs.k8s.io/controller-runtime"
3131- "sigs.k8s.io/controller-runtime/pkg/client"
3232-3333- hsmv1alpha1 "github.com/evanjarrett/hsm-secrets-operator/api/v1alpha1"
3434- "github.com/evanjarrett/hsm-secrets-operator/internal/hsm"
3535-)
3636-3737-// MirroredSecretData represents secret data with metadata for mirroring
3838-type MirroredSecretData struct {
3939- Path string `json:"path"`
4040- Data hsm.SecretData `json:"data"`
4141- Checksum string `json:"checksum"`
4242- LastModified time.Time `json:"lastModified"`
4343- SourceNode string `json:"sourceNode"`
4444- Metadata map[string]string `json:"metadata"`
4545-}
4646-4747-// MirroringManager handles HSM device mirroring and cross-node synchronization
4848-// TODO: Redesign this for HSMPool architecture
4949-type MirroringManager struct {
5050- client client.Client
5151- logger logr.Logger
5252- mutex sync.RWMutex
5353- hsmClients map[string]hsm.Client
5454- syncTimeout time.Duration
5555-}
5656-5757-// NewMirroringManager creates a new mirroring manager
5858-func NewMirroringManager(k8sClient client.Client, logger logr.Logger) *MirroringManager {
5959- return &MirroringManager{
6060- client: k8sClient,
6161- logger: logger,
6262- hsmClients: make(map[string]hsm.Client),
6363- syncTimeout: 30 * time.Second,
6464- }
6565-}
6666-6767-// RegisterHSMClient registers an HSM client for a specific node
6868-func (m *MirroringManager) RegisterHSMClient(nodeName string, hsmClient hsm.Client) {
6969- m.mutex.Lock()
7070- defer m.mutex.Unlock()
7171- m.hsmClients[nodeName] = hsmClient
7272-}
7373-7474-// UnregisterHSMClient removes an HSM client for a node
7575-func (m *MirroringManager) UnregisterHSMClient(nodeName string) {
7676- m.mutex.Lock()
7777- defer m.mutex.Unlock()
7878- delete(m.hsmClients, nodeName)
7979-}
8080-8181-// SyncDevices synchronizes HSM devices across mirror nodes
8282-// TODO: Redesign for HSMPool architecture
8383-func (m *MirroringManager) SyncDevices(ctx context.Context, hsmDevice *hsmv1alpha1.HSMDevice) error {
8484- m.logger.Info("Device sync needs redesign for HSMPool architecture", "device", hsmDevice.Name)
8585- return fmt.Errorf("device sync functionality needs to be redesigned for HSMPool architecture")
8686-}
8787-8888-// TODO: The following functions will be redesigned for HSMPool architecture:
8989-// - determineMirrorTopology
9090-// - syncFromPrimary
9191-// - updateMirroringStatus
9292-9393-// GetReadOnlyAccess provides read-only access to HSM data during failover scenarios
9494-// TODO: Redesign for HSMPool architecture
9595-func (m *MirroringManager) GetReadOnlyAccess(ctx context.Context, secretPath string, hsmDevice *hsmv1alpha1.HSMDevice) (hsm.SecretData, error) {
9696- m.logger.Info("Read-only access needs redesign for HSMPool architecture",
9797- "device", hsmDevice.Name,
9898- "secretPath", secretPath)
9999- return nil, fmt.Errorf("read-only access functionality needs to be redesigned for HSMPool architecture")
100100-}
101101-102102-// HandleFailover handles automatic failover to a healthy mirror node
103103-// TODO: Redesign for HSMPool architecture
104104-func (m *MirroringManager) HandleFailover(ctx context.Context, hsmDevice *hsmv1alpha1.HSMDevice) error {
105105- m.logger.Info("Failover handling needs redesign for HSMPool architecture", "device", hsmDevice.Name)
106106- return fmt.Errorf("failover functionality needs to be redesigned for HSMPool architecture")
107107-}
108108-109109-// SetupWithManager sets up the mirroring manager with the controller manager
110110-func (m *MirroringManager) SetupWithManager(mgr ctrl.Manager) error {
111111- m.logger.Info("Mirroring manager setup - functionality needs redesign for HSMPool architecture")
112112- // TODO: Set up watches on HSMPool resources instead of HSMDevice
113113- return nil
114114-}
+6-11
internal/modes/manager/manager.go
···4040 "github.com/evanjarrett/hsm-secrets-operator/internal/agent"
4141 "github.com/evanjarrett/hsm-secrets-operator/internal/api"
4242 "github.com/evanjarrett/hsm-secrets-operator/internal/controller"
4343- "github.com/evanjarrett/hsm-secrets-operator/internal/discovery"
4443)
45444645var (
···210209 return err
211210 }
212211213213- // Initialize mirroring manager for HSMSecret controller device failover
214214- // Note: Device discovery is handled by separate discovery daemon
215215- mirroringManager := discovery.NewMirroringManager(mgr.GetClient(), setupLog)
216216-217217- // HSM client registration removed - now handled by agent architecture
212212+ // HSM mirroring is now handled by the sync package and HSMSyncReconciler
213213+ // Device discovery is handled by separate discovery daemon
218214219215 // Agent manager will detect the current namespace automatically
220216 imageResolver := controller.NewImageResolver(mgr.GetClient())
···241237 }
242238243239 if err := (&controller.HSMSecretReconciler{
244244- Client: mgr.GetClient(),
245245- Scheme: mgr.GetScheme(),
246246- MirroringManager: mirroringManager,
247247- AgentManager: agentManager,
240240+ Client: mgr.GetClient(),
241241+ Scheme: mgr.GetScheme(),
242242+ AgentManager: agentManager,
248243 }).SetupWithManager(mgr); err != nil {
249244 setupLog.Error(err, "unable to create controller", "controller", "HSMSecret")
250245 return err
···287282288283 // Start API server if enabled
289284 if enableAPI {
290290- apiServer := api.NewServer(mgr.GetClient(), agentManager, mirroringManager, ctrl.Log.WithName("api"))
285285+ apiServer := api.NewServer(mgr.GetClient(), agentManager, ctrl.Log.WithName("api"))
291286292287 // Start API server in a separate goroutine
293288 go func() {
+304
internal/sync/conflict_resolver.go
···11+/*
22+Copyright 2025.
33+44+Licensed under the Apache License, Version 2.0 (the "License");
55+you may not use this file except in compliance with the License.
66+You may obtain a copy of the License at
77+88+ http://www.apache.org/licenses/LICENSE-2.0
99+1010+Unless required by applicable law or agreed to in writing, software
1111+distributed under the License is distributed on an "AS IS" BASIS,
1212+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
1313+See the License for the specific language governing permissions and
1414+limitations under the License.
1515+*/
1616+1717+package sync
1818+1919+import (
2020+ "context"
2121+ "fmt"
2222+ "time"
2323+2424+ "github.com/go-logr/logr"
2525+ metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
2626+2727+ hsmv1alpha1 "github.com/evanjarrett/hsm-secrets-operator/api/v1alpha1"
2828+ "github.com/evanjarrett/hsm-secrets-operator/internal/hsm"
2929+)
3030+3131+// ConflictResolutionStrategy defines how conflicts should be resolved
3232+type ConflictResolutionStrategy string
3333+3434+const (
3535+ // StrategyLatestVersion resolves conflicts by choosing the device with the highest version
3636+ StrategyLatestVersion ConflictResolutionStrategy = "latest-version"
3737+3838+ // StrategyLatestTimestamp resolves conflicts by choosing the most recently modified data
3939+ StrategyLatestTimestamp ConflictResolutionStrategy = "latest-timestamp"
4040+4141+ // StrategyManualResolution requires manual intervention for conflict resolution
4242+ StrategyManualResolution ConflictResolutionStrategy = "manual"
4343+4444+ // StrategyPrimaryDevice always uses the designated primary device as source of truth
4545+ StrategyPrimaryDevice ConflictResolutionStrategy = "primary-device"
4646+)
4747+4848+// ConflictResolver handles HSM device synchronization conflicts
4949+type ConflictResolver struct {
5050+ logger logr.Logger
5151+ strategy ConflictResolutionStrategy
5252+}
5353+5454+// NewConflictResolver creates a new conflict resolver
5555+func NewConflictResolver(logger logr.Logger, strategy ConflictResolutionStrategy) *ConflictResolver {
5656+ return &ConflictResolver{
5757+ logger: logger.WithName("conflict-resolver"),
5858+ strategy: strategy,
5959+ }
6060+}
6161+6262+// ConflictInfo represents detected conflict information
6363+type ConflictInfo struct {
6464+ SecretPath string
6565+ Devices []DeviceConflictData
6666+ DetectedAt time.Time
6767+ ResolutionRef string // Reference to resolution method used
6868+}
6969+7070+// DeviceConflictData represents conflict data from a specific device
7171+type DeviceConflictData struct {
7272+ DeviceName string
7373+ Checksum string
7474+ Version int64
7575+ Timestamp time.Time
7676+ Data hsm.SecretData
7777+ Online bool
7878+ Error error
7979+}
8080+8181+// ResolveConflict resolves a detected conflict using the configured strategy
8282+func (cr *ConflictResolver) ResolveConflict(ctx context.Context, conflict *ConflictInfo, hsmSecret *hsmv1alpha1.HSMSecret) (*ResolutionResult, error) {
8383+ logger := cr.logger.WithValues("secret", hsmSecret.Name, "strategy", cr.strategy)
8484+ logger.Info("Resolving HSM sync conflict", "devices", len(conflict.Devices))
8585+8686+ switch cr.strategy {
8787+ case StrategyLatestVersion:
8888+ return cr.resolveByLatestVersion(conflict, logger)
8989+ case StrategyLatestTimestamp:
9090+ return cr.resolveByLatestTimestamp(conflict, logger)
9191+ case StrategyPrimaryDevice:
9292+ return cr.resolveByPrimaryDevice(conflict, hsmSecret, logger)
9393+ case StrategyManualResolution:
9494+ return cr.requireManualResolution(conflict, hsmSecret, logger)
9595+ default:
9696+ return nil, fmt.Errorf("unknown conflict resolution strategy: %s", cr.strategy)
9797+ }
9898+}
9999+100100+// ResolutionResult represents the result of conflict resolution
101101+type ResolutionResult struct {
102102+ Winner *DeviceConflictData
103103+ Resolution ConflictResolutionStrategy
104104+ RequiresManualIntervention bool
105105+ SyncTargets []string // Devices that need to be updated
106106+ ResolvedData hsm.SecretData
107107+}
108108+109109+// resolveByLatestVersion chooses the device with the highest version number
110110+func (cr *ConflictResolver) resolveByLatestVersion(conflict *ConflictInfo, logger logr.Logger) (*ResolutionResult, error) {
111111+ var winner *DeviceConflictData
112112+ highestVersion := int64(-1)
113113+114114+ // Find device with highest version among online devices
115115+ for i := range conflict.Devices {
116116+ device := &conflict.Devices[i]
117117+ if device.Online && device.Error == nil && device.Version > highestVersion {
118118+ highestVersion = device.Version
119119+ winner = device
120120+ }
121121+ }
122122+123123+ if winner == nil {
124124+ return nil, fmt.Errorf("no online devices with valid version found")
125125+ }
126126+127127+ // Determine which devices need updating
128128+ var syncTargets []string
129129+ for _, device := range conflict.Devices {
130130+ if device.DeviceName != winner.DeviceName && device.Online && device.Error == nil {
131131+ syncTargets = append(syncTargets, device.DeviceName)
132132+ }
133133+ }
134134+135135+ logger.Info("Resolved conflict by latest version",
136136+ "winner", winner.DeviceName,
137137+ "version", winner.Version,
138138+ "targets", syncTargets)
139139+140140+ return &ResolutionResult{
141141+ Winner: winner,
142142+ Resolution: StrategyLatestVersion,
143143+ SyncTargets: syncTargets,
144144+ ResolvedData: winner.Data,
145145+ }, nil
146146+}
147147+148148+// resolveByLatestTimestamp chooses the device with the most recent timestamp
149149+func (cr *ConflictResolver) resolveByLatestTimestamp(conflict *ConflictInfo, logger logr.Logger) (*ResolutionResult, error) {
150150+ var winner *DeviceConflictData
151151+ var latestTime time.Time
152152+153153+ // Find device with most recent timestamp among online devices
154154+ for i := range conflict.Devices {
155155+ device := &conflict.Devices[i]
156156+ if device.Online && device.Error == nil && device.Timestamp.After(latestTime) {
157157+ latestTime = device.Timestamp
158158+ winner = device
159159+ }
160160+ }
161161+162162+ if winner == nil {
163163+ return nil, fmt.Errorf("no online devices with valid timestamp found")
164164+ }
165165+166166+ // Determine which devices need updating
167167+ var syncTargets []string
168168+ for _, device := range conflict.Devices {
169169+ if device.DeviceName != winner.DeviceName && device.Online && device.Error == nil {
170170+ syncTargets = append(syncTargets, device.DeviceName)
171171+ }
172172+ }
173173+174174+ logger.Info("Resolved conflict by latest timestamp",
175175+ "winner", winner.DeviceName,
176176+ "timestamp", winner.Timestamp,
177177+ "targets", syncTargets)
178178+179179+ return &ResolutionResult{
180180+ Winner: winner,
181181+ Resolution: StrategyLatestTimestamp,
182182+ SyncTargets: syncTargets,
183183+ ResolvedData: winner.Data,
184184+ }, nil
185185+}
186186+187187+// resolveByPrimaryDevice uses the designated primary device as the source of truth
188188+func (cr *ConflictResolver) resolveByPrimaryDevice(conflict *ConflictInfo, hsmSecret *hsmv1alpha1.HSMSecret, logger logr.Logger) (*ResolutionResult, error) {
189189+ primaryDevice := hsmSecret.Status.PrimaryDevice
190190+ if primaryDevice == "" {
191191+ // No primary device set, fall back to latest version strategy
192192+ logger.Info("No primary device set, falling back to latest version strategy")
193193+ return cr.resolveByLatestVersion(conflict, logger)
194194+ }
195195+196196+ // Find the primary device in the conflict data
197197+ var winner *DeviceConflictData
198198+ for i := range conflict.Devices {
199199+ device := &conflict.Devices[i]
200200+ if device.DeviceName == primaryDevice {
201201+ if device.Online && device.Error == nil {
202202+ winner = device
203203+ break
204204+ } else {
205205+ logger.Info("Primary device is offline or has errors, falling back to latest version",
206206+ "primaryDevice", primaryDevice,
207207+ "online", device.Online,
208208+ "error", device.Error)
209209+ return cr.resolveByLatestVersion(conflict, logger)
210210+ }
211211+ }
212212+ }
213213+214214+ if winner == nil {
215215+ logger.Info("Primary device not found in conflict, falling back to latest version", "primaryDevice", primaryDevice)
216216+ return cr.resolveByLatestVersion(conflict, logger)
217217+ }
218218+219219+ // Determine which devices need updating
220220+ var syncTargets []string
221221+ for _, device := range conflict.Devices {
222222+ if device.DeviceName != winner.DeviceName && device.Online && device.Error == nil {
223223+ syncTargets = append(syncTargets, device.DeviceName)
224224+ }
225225+ }
226226+227227+ logger.Info("Resolved conflict by primary device",
228228+ "winner", winner.DeviceName,
229229+ "targets", syncTargets)
230230+231231+ return &ResolutionResult{
232232+ Winner: winner,
233233+ Resolution: StrategyPrimaryDevice,
234234+ SyncTargets: syncTargets,
235235+ ResolvedData: winner.Data,
236236+ }, nil
237237+}
238238+239239+// requireManualResolution marks the conflict as requiring manual intervention
240240+func (cr *ConflictResolver) requireManualResolution(conflict *ConflictInfo, hsmSecret *hsmv1alpha1.HSMSecret, logger logr.Logger) (*ResolutionResult, error) {
241241+ logger.Info("Conflict marked for manual resolution",
242242+ "devices", len(conflict.Devices),
243243+ "secret", hsmSecret.Name)
244244+245245+ // Add condition to HSMSecret indicating manual resolution is required
246246+ now := metav1.NewTime(time.Now())
247247+ condition := metav1.Condition{
248248+ Type: "ConflictResolutionRequired",
249249+ Status: metav1.ConditionTrue,
250250+ Reason: "ManualResolutionRequired",
251251+ Message: fmt.Sprintf("Conflict detected between %d devices requires manual resolution", len(conflict.Devices)),
252252+ LastTransitionTime: now,
253253+ }
254254+255255+ // Update conditions (this would be done by the caller)
256256+ _ = condition
257257+258258+ return &ResolutionResult{
259259+ RequiresManualIntervention: true,
260260+ Resolution: StrategyManualResolution,
261261+ SyncTargets: []string{}, // No automatic sync
262262+ }, nil
263263+}
264264+265265+// DetectConflicts analyzes device sync results to identify conflicts
266266+func (cr *ConflictResolver) DetectConflicts(deviceResults map[string]DeviceResult, secretPath string) (*ConflictInfo, bool) {
267267+ // Group devices by checksum
268268+ checksumGroups := make(map[string][]string)
269269+ conflictDevices := make([]DeviceConflictData, 0)
270270+271271+ for deviceName, result := range deviceResults {
272272+ if result.Online && result.Error == nil && result.Checksum != "" {
273273+ if _, exists := checksumGroups[result.Checksum]; !exists {
274274+ checksumGroups[result.Checksum] = make([]string, 0)
275275+ }
276276+ checksumGroups[result.Checksum] = append(checksumGroups[result.Checksum], deviceName)
277277+278278+ // Add to conflict devices list
279279+ conflictDevices = append(conflictDevices, DeviceConflictData{
280280+ DeviceName: deviceName,
281281+ Checksum: result.Checksum,
282282+ Version: result.Version,
283283+ Timestamp: result.Timestamp,
284284+ Online: result.Online,
285285+ Error: result.Error,
286286+ })
287287+ }
288288+ }
289289+290290+ // Conflict exists if we have more than one checksum group
291291+ hasConflict := len(checksumGroups) > 1 && len(conflictDevices) > 1
292292+293293+ if !hasConflict {
294294+ return nil, false
295295+ }
296296+297297+ conflict := &ConflictInfo{
298298+ SecretPath: secretPath,
299299+ Devices: conflictDevices,
300300+ DetectedAt: time.Now(),
301301+ }
302302+303303+ return conflict, true
304304+}
+405
internal/sync/manager.go
···11+/*
22+Copyright 2025.
33+44+Licensed under the Apache License, Version 2.0 (the "License");
55+you may not use this file except in compliance with the License.
66+You may obtain a copy of the License at
77+88+ http://www.apache.org/licenses/LICENSE-2.0
99+1010+Unless required by applicable law or agreed to in writing, software
1111+distributed under the License is distributed on an "AS IS" BASIS,
1212+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
1313+See the License for the specific language governing permissions and
1414+limitations under the License.
1515+*/
1616+1717+package sync
1818+1919+import (
2020+ "context"
2121+ "crypto/sha256"
2222+ "fmt"
2323+ "sort"
2424+ "time"
2525+2626+ "github.com/go-logr/logr"
2727+ metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
2828+ "sigs.k8s.io/controller-runtime/pkg/client"
2929+3030+ hsmv1alpha1 "github.com/evanjarrett/hsm-secrets-operator/api/v1alpha1"
3131+ "github.com/evanjarrett/hsm-secrets-operator/internal/hsm"
3232+)
3333+3434+// AgentManagerInterface defines the interface for HSM agent management used by sync
3535+type AgentManagerInterface interface {
3636+ CreateSingleGRPCClient(ctx context.Context, deviceName string, logger logr.Logger) (hsm.Client, error)
3737+}
3838+3939+// SyncManager handles multi-device HSM synchronization and conflict resolution
4040+type SyncManager struct {
4141+ client client.Client
4242+ agentManager AgentManagerInterface
4343+ logger logr.Logger
4444+}
4545+4646+// NewSyncManager creates a new sync manager
4747+func NewSyncManager(k8sClient client.Client, agentManager AgentManagerInterface, logger logr.Logger) *SyncManager {
4848+ return &SyncManager{
4949+ client: k8sClient,
5050+ agentManager: agentManager,
5151+ logger: logger.WithName("sync-manager"),
5252+ }
5353+}
5454+5555+// SyncResult represents the result of a sync operation
5656+type SyncResult struct {
5757+ Success bool
5858+ ConflictDetected bool
5959+ PrimaryDevice string
6060+ DeviceResults map[string]DeviceResult
6161+ ResolvedData hsm.SecretData
6262+}
6363+6464+// DeviceResult represents the sync result for a specific device
6565+type DeviceResult struct {
6666+ Online bool
6767+ Checksum string
6868+ Version int64
6969+ Error error
7070+ Timestamp time.Time
7171+}
7272+7373+// SyncSecret performs multi-device synchronization for an HSMSecret
7474+func (sm *SyncManager) SyncSecret(ctx context.Context, hsmSecret *hsmv1alpha1.HSMSecret) (*SyncResult, error) {
7575+ logger := sm.logger.WithValues("secret", hsmSecret.Name, "namespace", hsmSecret.Namespace)
7676+ secretPath := hsmSecret.Name
7777+7878+ // Get all available HSM devices from HSMPools
7979+ devices, err := sm.getAvailableDevices(ctx, hsmSecret.Namespace)
8080+ if err != nil {
8181+ return nil, fmt.Errorf("failed to get available devices: %w", err)
8282+ }
8383+8484+ if len(devices) == 0 {
8585+ return &SyncResult{
8686+ Success: false,
8787+ DeviceResults: make(map[string]DeviceResult),
8888+ }, fmt.Errorf("no HSM devices available")
8989+ }
9090+9191+ logger.Info("Starting multi-device sync", "devices", len(devices))
9292+9393+ // Read data from all online devices
9494+ deviceResults := make(map[string]DeviceResult)
9595+ validDeviceData := make(map[string]hsm.SecretData)
9696+9797+ for _, deviceName := range devices {
9898+ result := sm.readFromDevice(ctx, deviceName, secretPath, logger)
9999+ deviceResults[deviceName] = result
100100+101101+ if result.Online && result.Error == nil {
102102+ // Calculate checksum for the data (we'll get the actual data in practice)
103103+ // For now, simulating based on checksum
104104+ validDeviceData[deviceName] = hsm.SecretData{
105105+ "checksum": []byte(result.Checksum),
106106+ }
107107+ }
108108+ }
109109+110110+ // Detect conflicts and resolve
111111+ conflictDetected := sm.detectConflicts(deviceResults)
112112+ primaryDevice := sm.selectPrimaryDevice(deviceResults, hsmSecret)
113113+114114+ var resolvedData hsm.SecretData
115115+ if primaryDevice != "" && validDeviceData[primaryDevice] != nil {
116116+ resolvedData = validDeviceData[primaryDevice]
117117+ logger.Info("Using primary device data", "primaryDevice", primaryDevice)
118118+ } else if len(validDeviceData) > 0 {
119119+ // Use most recent data if no clear primary
120120+ resolvedData = sm.selectMostRecentData(deviceResults, validDeviceData)
121121+ logger.Info("Using most recent data for resolution")
122122+ }
123123+124124+ // If conflict detected and we have a resolution, sync to all other devices
125125+ if conflictDetected && primaryDevice != "" {
126126+ sm.syncToSecondaryDevices(ctx, devices, primaryDevice, secretPath, resolvedData, logger)
127127+ }
128128+129129+ return &SyncResult{
130130+ Success: len(validDeviceData) > 0,
131131+ ConflictDetected: conflictDetected,
132132+ PrimaryDevice: primaryDevice,
133133+ DeviceResults: deviceResults,
134134+ ResolvedData: resolvedData,
135135+ }, nil
136136+}
137137+138138+// readFromDevice reads secret data from a specific HSM device
139139+func (sm *SyncManager) readFromDevice(ctx context.Context, deviceName, secretPath string, logger logr.Logger) DeviceResult {
140140+ result := DeviceResult{
141141+ Timestamp: time.Now(),
142142+ }
143143+144144+ // Get gRPC client for this device
145145+ grpcClient, err := sm.agentManager.CreateSingleGRPCClient(ctx, deviceName, logger)
146146+ if err != nil {
147147+ result.Error = fmt.Errorf("failed to create gRPC client: %w", err)
148148+ return result
149149+ }
150150+ defer func() {
151151+ if closeErr := grpcClient.Close(); closeErr != nil {
152152+ logger.V(1).Info("Failed to close gRPC client", "error", closeErr)
153153+ }
154154+ }()
155155+156156+ result.Online = grpcClient.IsConnected()
157157+ if !result.Online {
158158+ result.Error = fmt.Errorf("device not connected")
159159+ return result
160160+ }
161161+162162+ // Try to read the secret
163163+ data, err := grpcClient.ReadSecret(ctx, secretPath)
164164+ if err != nil {
165165+ result.Error = fmt.Errorf("failed to read secret: %w", err)
166166+ return result
167167+ }
168168+169169+ // Calculate checksum
170170+ result.Checksum = sm.calculateChecksum(data)
171171+172172+ // Get metadata to extract version (if available)
173173+ metadata, err := grpcClient.ReadMetadata(ctx, secretPath)
174174+ if err == nil && metadata != nil {
175175+ if versionStr, exists := metadata.Tags["sync.version"]; exists {
176176+ if version, parseErr := parseVersion(versionStr); parseErr == nil {
177177+ result.Version = version
178178+ }
179179+ }
180180+ }
181181+182182+ return result
183183+}
184184+185185+// detectConflicts checks if there are conflicting checksums across devices
186186+func (sm *SyncManager) detectConflicts(deviceResults map[string]DeviceResult) bool {
187187+ checksums := make(map[string]int)
188188+ onlineDevices := 0
189189+190190+ for _, result := range deviceResults {
191191+ if result.Online && result.Error == nil && result.Checksum != "" {
192192+ checksums[result.Checksum]++
193193+ onlineDevices++
194194+ }
195195+ }
196196+197197+ // Conflict if we have more than one unique checksum across online devices
198198+ return len(checksums) > 1 && onlineDevices > 1
199199+}
200200+201201+// selectPrimaryDevice chooses the primary device for conflict resolution
202202+func (sm *SyncManager) selectPrimaryDevice(deviceResults map[string]DeviceResult, hsmSecret *hsmv1alpha1.HSMSecret) string {
203203+ // Check if there's already a designated primary in the status
204204+ if hsmSecret.Status.PrimaryDevice != "" {
205205+ if result, exists := deviceResults[hsmSecret.Status.PrimaryDevice]; exists && result.Online && result.Error == nil {
206206+ return hsmSecret.Status.PrimaryDevice
207207+ }
208208+ }
209209+210210+ // Find device with highest version number among online devices
211211+ var bestDevice string
212212+ var highestVersion int64 = -1
213213+ var mostRecentTime time.Time
214214+215215+ for deviceName, result := range deviceResults {
216216+ if result.Online && result.Error == nil {
217217+ // Prefer higher version numbers
218218+ if result.Version > highestVersion {
219219+ highestVersion = result.Version
220220+ bestDevice = deviceName
221221+ mostRecentTime = result.Timestamp
222222+ } else if result.Version == highestVersion && result.Timestamp.After(mostRecentTime) {
223223+ // If versions are equal, prefer more recent timestamp
224224+ bestDevice = deviceName
225225+ mostRecentTime = result.Timestamp
226226+ }
227227+ }
228228+ }
229229+230230+ return bestDevice
231231+}
232232+233233+// selectMostRecentData selects the most recently modified data
234234+func (sm *SyncManager) selectMostRecentData(deviceResults map[string]DeviceResult, validDeviceData map[string]hsm.SecretData) hsm.SecretData {
235235+ var mostRecentDevice string
236236+ var mostRecentTime time.Time
237237+238238+ for deviceName, result := range deviceResults {
239239+ if result.Online && result.Error == nil && result.Timestamp.After(mostRecentTime) {
240240+ mostRecentTime = result.Timestamp
241241+ mostRecentDevice = deviceName
242242+ }
243243+ }
244244+245245+ if mostRecentDevice != "" && validDeviceData[mostRecentDevice] != nil {
246246+ return validDeviceData[mostRecentDevice]
247247+ }
248248+249249+ // Return first available data if no clear winner
250250+ for _, data := range validDeviceData {
251251+ return data
252252+ }
253253+254254+ return nil
255255+}
256256+257257+// syncToSecondaryDevices syncs resolved data to all secondary devices
258258+func (sm *SyncManager) syncToSecondaryDevices(ctx context.Context, devices []string, primaryDevice, secretPath string, data hsm.SecretData, logger logr.Logger) {
259259+ for _, deviceName := range devices {
260260+ if deviceName == primaryDevice {
261261+ continue // Skip primary device
262262+ }
263263+264264+ logger.Info("Syncing to secondary device", "device", deviceName)
265265+266266+ grpcClient, err := sm.agentManager.CreateSingleGRPCClient(ctx, deviceName, logger)
267267+ if err != nil {
268268+ logger.Error(err, "Failed to create gRPC client for sync", "device", deviceName)
269269+ continue
270270+ }
271271+272272+ if !grpcClient.IsConnected() {
273273+ logger.V(1).Info("Device offline, skipping sync", "device", deviceName)
274274+ if closeErr := grpcClient.Close(); closeErr != nil {
275275+ logger.V(1).Info("Failed to close gRPC client", "error", closeErr)
276276+ }
277277+ continue
278278+ }
279279+280280+ // Write data with updated version metadata
281281+ metadata := &hsm.SecretMetadata{
282282+ Tags: map[string]string{
283283+ "sync.version": fmt.Sprintf("%d", time.Now().Unix()),
284284+ "sync.primary": primaryDevice,
285285+ "sync.timestamp": time.Now().Format(time.RFC3339),
286286+ },
287287+ }
288288+289289+ if err := grpcClient.WriteSecretWithMetadata(ctx, secretPath, data, metadata); err != nil {
290290+ logger.Error(err, "Failed to sync to secondary device", "device", deviceName)
291291+ } else {
292292+ logger.Info("Successfully synced to secondary device", "device", deviceName)
293293+ }
294294+295295+ if closeErr := grpcClient.Close(); closeErr != nil {
296296+ logger.V(1).Info("Failed to close gRPC client", "error", closeErr)
297297+ }
298298+ }
299299+}
300300+301301+// getAvailableDevices gets list of available HSM devices from HSMPools
302302+func (sm *SyncManager) getAvailableDevices(ctx context.Context, namespace string) ([]string, error) {
303303+ var hsmPoolList hsmv1alpha1.HSMPoolList
304304+ if err := sm.client.List(ctx, &hsmPoolList, client.InNamespace(namespace)); err != nil {
305305+ return nil, fmt.Errorf("failed to list HSM pools: %w", err)
306306+ }
307307+308308+ deviceNames := make(map[string]bool)
309309+310310+ for _, pool := range hsmPoolList.Items {
311311+ if pool.Status.Phase == hsmv1alpha1.HSMPoolPhaseReady {
312312+ for _, deviceRef := range pool.Spec.HSMDeviceRefs {
313313+ deviceNames[deviceRef] = true
314314+ }
315315+ }
316316+ }
317317+318318+ devices := make([]string, 0, len(deviceNames))
319319+ for deviceName := range deviceNames {
320320+ devices = append(devices, deviceName)
321321+ }
322322+323323+ sort.Strings(devices) // Ensure consistent ordering
324324+ return devices, nil
325325+}
326326+327327+// UpdateHSMSecretStatus updates the HSMSecret status with sync results
328328+func (sm *SyncManager) UpdateHSMSecretStatus(ctx context.Context, hsmSecret *hsmv1alpha1.HSMSecret, result *SyncResult) error {
329329+ now := metav1.NewTime(time.Now())
330330+331331+ // Update overall status
332332+ if result.Success {
333333+ hsmSecret.Status.SyncStatus = hsmv1alpha1.SyncStatusInSync
334334+ hsmSecret.Status.LastSyncTime = &now
335335+ hsmSecret.Status.LastError = ""
336336+ } else {
337337+ hsmSecret.Status.SyncStatus = hsmv1alpha1.SyncStatusError
338338+ hsmSecret.Status.LastError = "Failed to sync with any HSM device"
339339+ }
340340+341341+ hsmSecret.Status.SyncConflict = result.ConflictDetected
342342+ hsmSecret.Status.PrimaryDevice = result.PrimaryDevice
343343+344344+ // Update device-specific sync status
345345+ hsmSecret.Status.DeviceSyncStatus = make([]hsmv1alpha1.HSMDeviceSync, 0, len(result.DeviceResults))
346346+347347+ for deviceName, deviceResult := range result.DeviceResults {
348348+ syncTime := metav1.NewTime(deviceResult.Timestamp)
349349+ deviceSync := hsmv1alpha1.HSMDeviceSync{
350350+ DeviceName: deviceName,
351351+ LastSyncTime: &syncTime,
352352+ Checksum: deviceResult.Checksum,
353353+ Online: deviceResult.Online,
354354+ Version: deviceResult.Version,
355355+ }
356356+357357+ if deviceResult.Error != nil {
358358+ deviceSync.Status = hsmv1alpha1.SyncStatusError
359359+ deviceSync.LastError = deviceResult.Error.Error()
360360+ } else if deviceResult.Online {
361361+ deviceSync.Status = hsmv1alpha1.SyncStatusInSync
362362+ } else {
363363+ deviceSync.Status = hsmv1alpha1.SyncStatusOutOfSync
364364+ }
365365+366366+ hsmSecret.Status.DeviceSyncStatus = append(hsmSecret.Status.DeviceSyncStatus, deviceSync)
367367+ }
368368+369369+ // Update Kubernetes Secret checksum if we have resolved data
370370+ if result.ResolvedData != nil {
371371+ hsmSecret.Status.SecretChecksum = sm.calculateChecksum(result.ResolvedData)
372372+ }
373373+374374+ return sm.client.Status().Update(ctx, hsmSecret)
375375+}
376376+377377+// calculateChecksum calculates SHA256 checksum of secret data
378378+func (sm *SyncManager) calculateChecksum(data hsm.SecretData) string {
379379+ if data == nil {
380380+ return ""
381381+ }
382382+383383+ h := sha256.New()
384384+385385+ // Sort keys for consistent checksum calculation
386386+ keys := make([]string, 0, len(data))
387387+ for k := range data {
388388+ keys = append(keys, k)
389389+ }
390390+ sort.Strings(keys)
391391+392392+ for _, k := range keys {
393393+ h.Write([]byte(k))
394394+ h.Write(data[k])
395395+ }
396396+397397+ return fmt.Sprintf("%x", h.Sum(nil))
398398+}
399399+400400+// Helper function to parse version string
401401+func parseVersion(versionStr string) (int64, error) {
402402+ var version int64
403403+ _, err := fmt.Sscanf(versionStr, "%d", &version)
404404+ return version, err
405405+}