* Fix: Add -admin.grpc flag to worker for explicit gRPC port configuration
* Fix(helm): Add adminGrpcServer to worker configuration
* Refactor: Support host:port.grpcPort address format, revert -admin.grpc flag
* Helm: Conditionally append grpcPort to worker admin address
* weed/admin: fix "send on closed channel" panic in worker gRPC server
  Make unregisterWorker connection-aware to prevent closing channels belonging to newer connections.
* weed/worker: improve gRPC client stability and logging
  - Fix goroutine leak in reconnection logic
  - Refactor reconnection loop to exit on success and prevent busy-waiting
  - Add session identification and enhanced logging to client handlers
  - Use constant for internal reset action and remove unused variables
* weed/worker: fix worker state initialization and add lifecycle logs
  - Revert workerState to use the running boolean correctly
  - Prevent handleStart from failing by checking running state instead of startTime
  - Add more detailed logs for worker startup events
This commit is contained in:
@@ -225,11 +225,11 @@ func (s *WorkerGrpcServer) WorkerStream(stream worker_pb.WorkerService_WorkerStr
|
||||
select {
|
||||
case <-ctx.Done():
|
||||
glog.Infof("Worker %s connection closed: %v", workerID, ctx.Err())
|
||||
s.unregisterWorker(workerID)
|
||||
s.unregisterWorker(conn)
|
||||
return nil
|
||||
case <-connCtx.Done():
|
||||
glog.Infof("Worker %s connection cancelled", workerID)
|
||||
s.unregisterWorker(workerID)
|
||||
s.unregisterWorker(conn)
|
||||
return nil
|
||||
default:
|
||||
}
|
||||
@@ -241,7 +241,7 @@ func (s *WorkerGrpcServer) WorkerStream(stream worker_pb.WorkerService_WorkerStr
|
||||
} else {
|
||||
glog.Errorf("Error receiving from worker %s: %v", workerID, err)
|
||||
}
|
||||
s.unregisterWorker(workerID)
|
||||
s.unregisterWorker(conn)
|
||||
return err
|
||||
}
|
||||
|
||||
@@ -294,7 +294,7 @@ func (s *WorkerGrpcServer) handleWorkerMessage(conn *WorkerConnection, msg *work
|
||||
|
||||
case *worker_pb.WorkerMessage_Shutdown:
|
||||
glog.Infof("Worker %s shutting down: %s", workerID, m.Shutdown.Reason)
|
||||
s.unregisterWorker(workerID)
|
||||
s.unregisterWorker(conn)
|
||||
|
||||
default:
|
||||
glog.Warningf("Unknown message type from worker %s", workerID)
|
||||
@@ -463,17 +463,24 @@ func (s *WorkerGrpcServer) safeCloseOutgoingChannel(conn *WorkerConnection, sour
|
||||
}
|
||||
|
||||
// unregisterWorker removes a worker connection
|
||||
func (s *WorkerGrpcServer) unregisterWorker(workerID string) {
|
||||
func (s *WorkerGrpcServer) unregisterWorker(conn *WorkerConnection) {
|
||||
s.connMutex.Lock()
|
||||
conn, exists := s.connections[workerID]
|
||||
existingConn, exists := s.connections[conn.workerID]
|
||||
if !exists {
|
||||
s.connMutex.Unlock()
|
||||
glog.V(2).Infof("unregisterWorker: worker %s not found in connections map (already unregistered)", workerID)
|
||||
glog.V(2).Infof("unregisterWorker: worker %s not found in connections map (already unregistered)", conn.workerID)
|
||||
return
|
||||
}
|
||||
|
||||
// Only remove if it matches the specific connection instance
|
||||
if existingConn != conn {
|
||||
s.connMutex.Unlock()
|
||||
glog.V(1).Infof("unregisterWorker: worker %s connection replaced, skipping unregister for old connection", conn.workerID)
|
||||
return
|
||||
}
|
||||
|
||||
// Remove from map first to prevent duplicate cleanup attempts
|
||||
delete(s.connections, workerID)
|
||||
delete(s.connections, conn.workerID)
|
||||
s.connMutex.Unlock()
|
||||
|
||||
// Cancel context to signal goroutines to stop
|
||||
@@ -482,7 +489,7 @@ func (s *WorkerGrpcServer) unregisterWorker(workerID string) {
|
||||
// Safely close the outgoing channel with recover to handle potential double-close
|
||||
s.safeCloseOutgoingChannel(conn, "unregisterWorker")
|
||||
|
||||
glog.V(1).Infof("Unregistered worker %s", workerID)
|
||||
glog.V(1).Infof("Unregistered worker %s", conn.workerID)
|
||||
}
|
||||
|
||||
// cleanupRoutine periodically cleans up stale connections
|
||||
@@ -505,16 +512,19 @@ func (s *WorkerGrpcServer) cleanupStaleConnections() {
|
||||
cutoff := time.Now().Add(-2 * time.Minute)
|
||||
|
||||
s.connMutex.Lock()
|
||||
defer s.connMutex.Unlock()
|
||||
|
||||
for workerID, conn := range s.connections {
|
||||
// collect connections to remove first to avoid deadlock if unregisterWorker locks
|
||||
var toRemove []*WorkerConnection
|
||||
for _, conn := range s.connections {
|
||||
if conn.lastSeen.Before(cutoff) {
|
||||
glog.Warningf("Cleaning up stale worker connection: %s", workerID)
|
||||
conn.cancel()
|
||||
s.safeCloseOutgoingChannel(conn, "cleanupStaleConnections")
|
||||
delete(s.connections, workerID)
|
||||
toRemove = append(toRemove, conn)
|
||||
}
|
||||
}
|
||||
s.connMutex.Unlock()
|
||||
|
||||
for _, conn := range toRemove {
|
||||
glog.Warningf("Cleaning up stale worker connection: %s", conn.workerID)
|
||||
s.unregisterWorker(conn)
|
||||
}
|
||||
}
|
||||
|
||||
// GetConnectedWorkers returns a list of currently connected workers
|
||||
|
||||
Reference in New Issue
Block a user