Improve-worker (#7367)
* ♻️ refactor(worker): remove goto

* ♻️ refactor(worker): let manager loop exit by itself

* ♻️ refactor(worker): fix race condition when closing worker

  CloseSend is not safe to call while another goroutine is concurrently calling Send; streamCancel already handles proper stream closure. Also, the streamExit signal should be sent AFTER shutdownMsg. The worker now has no race condition when stopped at any moment (hopefully; tested with the -race flag).

* 🐛 fix(task_logger): deadlock in log closure

* 🐛 fix(balance): fix balance task

  Removes the outdated "UnloadVolume" step, as it is handled by "DeleteVolume". #7346
committed by GitHub
parent 557aa4ec09
commit f06ddd05cc
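Context for the race-condition fix: grpc-go permits one goroutine to call Send while another calls Recv, but it forbids two concurrent Sends, and it forbids CloseSend concurrent with Send. Canceling the stream's context, by contrast, is safe from any goroutine. A minimal sketch of the shutdown ordering the message describes (the types are illustrative stand-ins, not the actual SeaweedFS code):

```go
package worker

import "context"

// adminStream is an illustrative stand-in for the generated bidi-stream
// client; only the methods relevant to shutdown are modeled.
type adminStream interface {
	Send(msg interface{}) error
	CloseSend() error
}

// shutdown sketches the order this commit establishes:
//  1. send shutdownMsg while the sender goroutines are still alive,
//  2. only then close streamExit to stop the handler loop,
//  3. cancel the stream context instead of calling CloseSend, since
//     CloseSend must never race with a concurrent Send, while context
//     cancellation is always safe.
func shutdown(s adminStream, streamExit chan struct{}, streamCancel context.CancelFunc, shutdownMsg interface{}) {
	_ = s.Send(shutdownMsg) // best-effort; a failure here is only logged in the real code
	close(streamExit)
	streamCancel()
}
```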
@@ -5,7 +5,6 @@ import (
 	"errors"
 	"fmt"
 	"io"
-	"sync"
 	"time"

 	"github.com/seaweedfs/seaweedfs/weed/glog"
@@ -26,7 +25,6 @@ type GrpcAdminClient struct {
 	dialOption grpc.DialOption

 	cmds chan grpcCommand
-	closeOnce sync.Once

 	// Reconnection parameters
 	maxReconnectAttempts int
@@ -103,12 +101,14 @@ func NewGrpcAdminClient(adminAddress string, workerID string, dialOption grpc.Di
 func (c *GrpcAdminClient) managerLoop() {
 	state := &grpcState{shouldReconnect: true}

+out:
 	for cmd := range c.cmds {
 		switch cmd.action {
 		case ActionConnect:
 			c.handleConnect(cmd, state)
 		case ActionDisconnect:
 			c.handleDisconnect(cmd, state)
+			break out
 		case ActionReconnect:
 			if state.connected || state.reconnecting || !state.shouldReconnect {
 				cmd.resp <- ErrAlreadyConnected
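For context on the `out:` label: in Go, a bare `break` inside a `switch` only exits the switch, not the surrounding `for`, so a labeled break (rather than the old goto) is what lets the manager loop terminate on disconnect. A runnable toy version of the pattern:

```go
package main

import "fmt"

func main() {
	cmds := make(chan string, 3)
	cmds <- "connect"
	cmds <- "disconnect"
	cmds <- "ignored" // never handled: the loop exits on disconnect
	close(cmds)

out:
	for cmd := range cmds {
		switch cmd {
		case "connect":
			fmt.Println("handling connect")
		case "disconnect":
			fmt.Println("handling disconnect")
			break out // a bare break would only leave the switch
		}
	}
	fmt.Println("manager loop exited")
}
```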
@@ -240,9 +240,6 @@ func (c *GrpcAdminClient) reconnect(s *grpcState) error {
 	if s.streamCancel != nil {
 		s.streamCancel()
 	}
-	if s.stream != nil {
-		s.stream.CloseSend()
-	}
 	if s.conn != nil {
 		s.conn.Close()
 	}
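The reconnect path drops its CloseSend call for the same reason: a reconnect can race with in-flight Sends from the handler goroutines, while canceling the per-stream context tears the stream down safely from any goroutine. A minimal sketch, with the RPC itself elided:

```go
package main

import (
	"context"
	"fmt"
)

func main() {
	// In the real client, the bidi stream would be created from this
	// context (e.g. something like client.WorkerStream(ctx)).
	ctx, streamCancel := context.WithCancel(context.Background())

	// Teardown: cancellation aborts the RPC and unblocks any goroutine
	// currently parked in Send or Recv, with no concurrency restrictions.
	streamCancel()
	fmt.Println(ctx.Err()) // context.Canceled
}
```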
@@ -412,9 +409,6 @@ func (c *GrpcAdminClient) Disconnect() error {
 		resp: resp,
 	}
 	err := <-resp
-	c.closeOnce.Do(func() {
-		close(c.cmds)
-	})
 	return err
 }

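The removed closeOnce guard existed because closing an already-closed channel panics. Now that Disconnect no longer closes c.cmds at all (the manager loop ends itself via the labeled break), the guard and the sync import are dead weight; an unclosed channel is simply garbage-collected. For reference, the idiom it implemented:

```go
package main

import "sync"

func main() {
	cmds := make(chan struct{})
	var closeOnce sync.Once

	stop := func() {
		closeOnce.Do(func() { close(cmds) }) // idempotent close
	}
	stop()
	stop() // without the sync.Once, this second close(cmds) would panic
}
```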
@@ -427,9 +421,6 @@ func (c *GrpcAdminClient) handleDisconnect(cmd grpcCommand, s *grpcState) {
 	// Send shutdown signal to stop reconnection loop
 	close(s.reconnectStop)

-	// Send shutdown signal to stop handlers loop
-	close(s.streamExit)
-
 	s.connected = false
 	s.shouldReconnect = false

@@ -452,16 +443,14 @@ func (c *GrpcAdminClient) handleDisconnect(cmd grpcCommand, s *grpcState) {
 		glog.Warningf("Failed to send shutdown message")
 	}

+	// Send shutdown signal to stop handlers loop
+	close(s.streamExit)
+
 	// Cancel stream context
 	if s.streamCancel != nil {
 		s.streamCancel()
 	}

-	// Close stream
-	if s.stream != nil {
-		s.stream.CloseSend()
-	}
-
 	// Close connection
 	if s.conn != nil {
 		s.conn.Close()
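Why streamExit now closes only after the shutdown message goes out: the handler loop is what keeps the stream serviced, so signaling it to exit first could strand or race the final Send. A toy model of that ordering constraint, assuming the final message is routed through the one goroutine that owns Send (a channel stands in for the stream):

```go
package main

import (
	"fmt"
	"sync"
)

func main() {
	streamExit := make(chan struct{})
	outgoing := make(chan string) // stands in for stream.Send
	var wg sync.WaitGroup

	wg.Add(1)
	go func() { // handler loop: the sole sender/servicer of the stream
		defer wg.Done()
		for {
			select {
			case msg := <-outgoing:
				fmt.Println("sent:", msg)
			case <-streamExit:
				return
			}
		}
	}()

	outgoing <- "shutdownMsg" // 1. delivered while the handler still runs
	close(streamExit)         // 2. only now tell the handler to exit
	wg.Wait()
}
```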
@@ -106,15 +106,8 @@ func (t *BalanceTask) Execute(ctx context.Context, params *worker_pb.TaskParams)
 		glog.Warningf("Tail operation failed (may be normal): %v", err)
 	}

-	// Step 5: Unmount from source
-	t.ReportProgress(85.0)
-	t.GetLogger().Info("Unmounting volume from source server")
-	if err := t.unmountVolume(sourceServer, volumeId); err != nil {
-		return fmt.Errorf("failed to unmount volume from source: %v", err)
-	}
-
-	// Step 6: Delete from source
-	t.ReportProgress(95.0)
+	// Step 5: Delete from source
+	t.ReportProgress(90.0)
 	t.GetLogger().Info("Deleting volume from source server")
 	if err := t.deleteVolume(sourceServer, volumeId); err != nil {
 		return fmt.Errorf("failed to delete volume from source: %v", err)
@@ -232,6 +232,7 @@ func (l *FileTaskLogger) LogWithFields(level string, message string, fields map[

 // Close closes the logger and finalizes metadata
 func (l *FileTaskLogger) Close() error {
+	l.Info("Task logger closed for %s", l.taskID)
 	l.mutex.Lock()
 	defer l.mutex.Unlock()

@@ -260,7 +261,6 @@ func (l *FileTaskLogger) Close() error {
 	}

 	l.closed = true
-	l.Info("Task logger closed for %s", l.taskID)

 	return nil
 }
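The deadlock fix moves the closing log line in front of the mutex acquisition. Assuming l.Info ultimately takes l.mutex as well (which the deadlock implies), logging inside the locked region blocks forever because Go's sync.Mutex is not reentrant. A minimal model with illustrative types:

```go
package main

import (
	"fmt"
	"sync"
)

// logger models FileTaskLogger: Info takes the same mutex as Close.
type logger struct {
	mu     sync.Mutex
	closed bool
}

func (l *logger) Info(msg string) {
	l.mu.Lock()
	defer l.mu.Unlock()
	fmt.Println("INFO:", msg)
}

func (l *logger) Close() {
	l.Info("task logger closed") // fixed: log BEFORE taking the lock
	l.mu.Lock()
	defer l.mu.Unlock()
	l.closed = true
	// the removed code logged here, while l.mu was held: self-deadlock
}

func main() {
	l := &logger{}
	l.Close()
}
```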
@@ -7,7 +7,6 @@ import (
 	"os"
 	"path/filepath"
 	"strings"
-	"sync"
 	"time"

 	"github.com/seaweedfs/seaweedfs/weed/glog"
@@ -29,7 +28,6 @@ type Worker struct {
 	cmds           chan workerCommand
 	state          *workerState
 	taskLogHandler *tasks.TaskLogHandler
-	closeOnce      sync.Once
 }

 type workerState struct {
 	running bool
@@ -201,13 +199,14 @@ func (w *Worker) managerLoop() {
 		stopChan:     make(chan struct{}),
 		currentTasks: make(map[string]*types.TaskInput),
 	}
-
+out:
 	for cmd := range w.cmds {
 		switch cmd.action {
 		case ActionStart:
 			w.handleStart(cmd)
 		case ActionStop:
 			w.handleStop(cmd)
+			break out
 		case ActionGetStatus:
 			respCh := cmd.data.(statusResponse)
 			var currentTasks []types.TaskInput
@@ -495,15 +494,15 @@ func (w *Worker) Stop() error {
 	// Wait for tasks to finish
 	timeout := time.NewTimer(30 * time.Second)
 	defer timeout.Stop()
+out:
 	for w.getTaskLoad() > 0 {
 		select {
 		case <-timeout.C:
 			glog.Warningf("Worker %s stopping with %d tasks still running", w.id, w.getTaskLoad())
-			goto end_wait
+			break out
 		case <-time.After(100 * time.Millisecond):
 		}
 	}
-end_wait:

 	// Disconnect from admin server
 	if adminClient := w.getAdmin(); adminClient != nil {
@@ -511,10 +510,6 @@ end_wait:
 			glog.Errorf("Error disconnecting from admin server: %v", err)
 		}
 	}
-
-	w.closeOnce.Do(func() {
-		close(w.cmds)
-	})
 	glog.Infof("Worker %s stopped", w.id)
 	return nil
 }
@@ -731,7 +726,6 @@ func (w *Worker) executeTask(task *types.TaskInput) {
 			fileLogger.Info("Task %s completed successfully", task.ID)
 		}
 	}
-	return
 }

 // completeTask reports task completion to admin server