Fix: Add -admin.grpc flag to worker for explicit gRPC port (#7926) (#7927)

* Fix: Add -admin.grpc flag to worker for explicit gRPC port configuration

* Fix(helm): Add adminGrpcServer to worker configuration

* Refactor: Support host:port.grpcPort address format, revert -admin.grpc flag

* Helm: Conditionally append grpcPort to worker admin address

* weed/admin: fix "send on closed channel" panic in worker gRPC server

Make unregisterWorker connection-aware to prevent closing channels
belonging to newer connections.

* weed/worker: improve gRPC client stability and logging

- Fix goroutine leak in reconnection logic
- Refactor reconnection loop to exit on success and prevent busy-waiting
- Add session identification and enhanced logging to client handlers
- Use constant for internal reset action and remove unused variables

* weed/worker: fix worker state initialization and add lifecycle logs

- Revert workerState to use running boolean correctly
- Prevent handleStart from failing by checking the running state instead of startTime
- Add more detailed logs for worker startup events
This commit is contained in:
Chris Lu
2025-12-31 11:55:09 -08:00
committed by GitHub
parent 5a135f8c5a
commit 31a4f57cd9
7 changed files with 146 additions and 69 deletions

View File

@@ -225,11 +225,11 @@ func (s *WorkerGrpcServer) WorkerStream(stream worker_pb.WorkerService_WorkerStr
select {
case <-ctx.Done():
glog.Infof("Worker %s connection closed: %v", workerID, ctx.Err())
s.unregisterWorker(workerID)
s.unregisterWorker(conn)
return nil
case <-connCtx.Done():
glog.Infof("Worker %s connection cancelled", workerID)
s.unregisterWorker(workerID)
s.unregisterWorker(conn)
return nil
default:
}
@@ -241,7 +241,7 @@ func (s *WorkerGrpcServer) WorkerStream(stream worker_pb.WorkerService_WorkerStr
} else {
glog.Errorf("Error receiving from worker %s: %v", workerID, err)
}
s.unregisterWorker(workerID)
s.unregisterWorker(conn)
return err
}
@@ -294,7 +294,7 @@ func (s *WorkerGrpcServer) handleWorkerMessage(conn *WorkerConnection, msg *work
case *worker_pb.WorkerMessage_Shutdown:
glog.Infof("Worker %s shutting down: %s", workerID, m.Shutdown.Reason)
s.unregisterWorker(workerID)
s.unregisterWorker(conn)
default:
glog.Warningf("Unknown message type from worker %s", workerID)
@@ -463,17 +463,24 @@ func (s *WorkerGrpcServer) safeCloseOutgoingChannel(conn *WorkerConnection, sour
}
// unregisterWorker removes a worker connection
func (s *WorkerGrpcServer) unregisterWorker(workerID string) {
func (s *WorkerGrpcServer) unregisterWorker(conn *WorkerConnection) {
s.connMutex.Lock()
conn, exists := s.connections[workerID]
existingConn, exists := s.connections[conn.workerID]
if !exists {
s.connMutex.Unlock()
glog.V(2).Infof("unregisterWorker: worker %s not found in connections map (already unregistered)", workerID)
glog.V(2).Infof("unregisterWorker: worker %s not found in connections map (already unregistered)", conn.workerID)
return
}
// Only remove if it matches the specific connection instance
if existingConn != conn {
s.connMutex.Unlock()
glog.V(1).Infof("unregisterWorker: worker %s connection replaced, skipping unregister for old connection", conn.workerID)
return
}
// Remove from map first to prevent duplicate cleanup attempts
delete(s.connections, workerID)
delete(s.connections, conn.workerID)
s.connMutex.Unlock()
// Cancel context to signal goroutines to stop
@@ -482,7 +489,7 @@ func (s *WorkerGrpcServer) unregisterWorker(workerID string) {
// Safely close the outgoing channel with recover to handle potential double-close
s.safeCloseOutgoingChannel(conn, "unregisterWorker")
glog.V(1).Infof("Unregistered worker %s", workerID)
glog.V(1).Infof("Unregistered worker %s", conn.workerID)
}
// cleanupRoutine periodically cleans up stale connections
@@ -505,16 +512,19 @@ func (s *WorkerGrpcServer) cleanupStaleConnections() {
cutoff := time.Now().Add(-2 * time.Minute)
s.connMutex.Lock()
defer s.connMutex.Unlock()
for workerID, conn := range s.connections {
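// Collect stale connections while holding connMutex and unregister them only after releasing it: unregisterWorker acquires connMutex itself, so calling it under the lock would deadlock.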
var toRemove []*WorkerConnection
for _, conn := range s.connections {
if conn.lastSeen.Before(cutoff) {
glog.Warningf("Cleaning up stale worker connection: %s", workerID)
conn.cancel()
s.safeCloseOutgoingChannel(conn, "cleanupStaleConnections")
delete(s.connections, workerID)
toRemove = append(toRemove, conn)
}
}
s.connMutex.Unlock()
for _, conn := range toRemove {
glog.Warningf("Cleaning up stale worker connection: %s", conn.workerID)
s.unregisterWorker(conn)
}
}
// GetConnectedWorkers returns a list of currently connected workers