Files
seaweedFS/weed/server/master_grpc_server_assign.go
Chris Lu 72a8f598f2 Fix Maintenance Task Sorting and Refactor Log Persistence (#8199)
* fix float stepping

* do not auto refresh

* only logs when non 200 status

* fix maintenance task sorting and cleanup redundant handler logic

* Refactor log retrieval to persist to disk and fix slowness

- Move log retrieval to disk-based persistence in GetMaintenanceTaskDetail
- Implement background log fetching on task completion in worker_grpc_server.go
- Implement async background refresh for in-progress tasks
- Completely remove blocking gRPC calls from the UI path to fix 10s timeouts
- Cleanup debug logs and performance profiling code

* Ensure consistent deterministic sorting in config_persistence cleanup

* Replace magic numbers with constants and remove debug logs

- Added descriptive constants for truncation limits and timeouts in admin_server.go and worker_grpc_server.go
- Replaced magic numbers with these constants throughout the codebase
- Verified removal of stdout debug printing
- Ensured consistent truncation logic during log persistence

* Address code review feedback on history truncation and logging logic

- Fix AssignmentHistory double-serialization by copying task in GetMaintenanceTaskDetail
- Fix handleTaskCompletion logging logic (mutually exclusive success/failure logs)
- Remove unused Timeout field from LogRequestContext and sync select timeouts with constants
- Ensure AssignmentHistory is only provided in the top-level field for better JSON structure

* Implement goroutine leak protection and request deduplication

- Add request deduplication in RequestTaskLogs to prevent multiple concurrent fetches for the same task
- Implement safe cleanup in timeout handlers to avoid race conditions in pendingLogRequests map
- Add a 10s cooldown for background log refreshes in GetMaintenanceTaskDetail to prevent spamming
- Ensure all persistent log-fetching goroutines are bounded and efficiently managed

* Fix potential nil pointer panics in maintenance handlers

- Add nil checks for adminServer in ShowTaskDetail, ShowMaintenanceWorkers, and UpdateTaskConfig
- Update getMaintenanceQueueData to return a descriptive error instead of nil when adminServer is uninitialized
- Ensure internal helper methods consistently check for adminServer initialization before use

* Strictly enforce disk-only log reading

- Remove background log fetching from GetMaintenanceTaskDetail to prevent timeouts and network calls during page view
- Remove unused lastLogFetch tracking fields to clean up dead code
- Ensure logs are only updated upon task completion via handleTaskCompletion

* Refactor GetWorkerLogs to read from disk

- Update /api/maintenance/workers/:id/logs endpoint to use configPersistence.LoadTaskExecutionLogs
- Remove synchronous gRPC call RequestTaskLogs to prevent timeouts and bad gateway errors
- Ensure consistent log retrieval behavior across the application (disk-only)

* Fix timestamp parsing in log viewer

- Update task_detail.templ JS to handle both ISO 8601 strings and Unix timestamps
- Fix "Invalid time value" error when displaying logs fetched from disk
- Regenerate templates

* master: fallback to HDD if SSD volumes are full in Assign

* worker: improve EC detection logging and fix skip counters

* worker: add Sync method to TaskLogger interface

* worker: implement Sync and ensure logs are flushed before task completion

* admin: improve task log retrieval with retries and better timeouts

* admin: robust timestamp parsing in task detail view
2026-02-04 08:48:55 -08:00

154 lines
4.3 KiB
Go

package weed_server
import (
"context"
"fmt"
"strings"
"time"
"github.com/seaweedfs/seaweedfs/weed/glog"
"github.com/seaweedfs/seaweedfs/weed/stats"
"github.com/seaweedfs/raft"
"github.com/seaweedfs/seaweedfs/weed/pb/master_pb"
"github.com/seaweedfs/seaweedfs/weed/security"
"github.com/seaweedfs/seaweedfs/weed/storage/needle"
"github.com/seaweedfs/seaweedfs/weed/storage/super_block"
"github.com/seaweedfs/seaweedfs/weed/storage/types"
"github.com/seaweedfs/seaweedfs/weed/topology"
)
func (ms *MasterServer) StreamAssign(server master_pb.Seaweed_StreamAssignServer) error {
for {
req, err := server.Recv()
if err != nil {
glog.Errorf("StreamAssign failed to receive: %v", err)
return err
}
resp, err := ms.Assign(context.Background(), req)
if err != nil {
glog.Errorf("StreamAssign failed to assign: %v", err)
return err
}
if err = server.Send(resp); err != nil {
glog.Errorf("StreamAssign failed to send: %v", err)
return err
}
}
}
func (ms *MasterServer) Assign(ctx context.Context, req *master_pb.AssignRequest) (*master_pb.AssignResponse, error) {
if !ms.Topo.IsLeader() {
return nil, raft.NotLeaderError
}
if req.Count == 0 {
req.Count = 1
}
if req.Replication == "" {
req.Replication = ms.option.DefaultReplicaPlacement
}
replicaPlacement, err := super_block.NewReplicaPlacementFromString(req.Replication)
if err != nil {
return nil, err
}
ttl, err := needle.ReadTTL(req.Ttl)
if err != nil {
return nil, err
}
diskType := types.ToDiskType(req.DiskType)
ver := needle.GetCurrentVersion()
option := &topology.VolumeGrowOption{
Collection: req.Collection,
ReplicaPlacement: replicaPlacement,
Ttl: ttl,
DiskType: diskType,
Preallocate: ms.preallocateSize,
DataCenter: req.DataCenter,
Rack: req.Rack,
DataNode: req.DataNode,
MemoryMapMaxSizeMb: req.MemoryMapMaxSizeMb,
Version: uint32(ver),
}
if !ms.Topo.DataCenterExists(option.DataCenter) {
return nil, fmt.Errorf("data center %v not found in topology", option.DataCenter)
}
vl := ms.Topo.GetVolumeLayout(option.Collection, option.ReplicaPlacement, option.Ttl, option.DiskType)
if req.DiskType == "" {
if writable, _ := vl.GetWritableVolumeCount(); writable == 0 {
if hddVl := ms.Topo.GetVolumeLayout(option.Collection, option.ReplicaPlacement, option.Ttl, types.ToDiskType(types.HddType)); hddVl != nil {
if writable, _ := hddVl.GetWritableVolumeCount(); writable > 0 {
option.DiskType = types.ToDiskType(types.HddType)
vl = hddVl
}
}
}
}
vl.SetLastGrowCount(req.WritableVolumeCount)
var (
lastErr error
maxTimeout = time.Second * 10
startTime = time.Now()
)
for time.Now().Sub(startTime) < maxTimeout {
fid, count, dnList, shouldGrow, err := ms.Topo.PickForWrite(req.Count, option, vl)
if shouldGrow && !vl.HasGrowRequest() && !ms.option.VolumeGrowthDisabled {
if err != nil && ms.Topo.AvailableSpaceFor(option) <= 0 {
err = fmt.Errorf("%s and no free volumes left for %s", err.Error(), option.String())
}
vl.AddGrowRequest()
ms.volumeGrowthRequestChan <- &topology.VolumeGrowRequest{
Option: option,
Count: req.WritableVolumeCount,
Reason: "grpc assign",
}
}
if err != nil {
glog.V(1).Infof("assign %v %v: %v", req, option.String(), err)
stats.MasterPickForWriteErrorCounter.Inc()
lastErr = err
if (req.DataCenter != "" || req.Rack != "") && strings.Contains(err.Error(), topology.NoWritableVolumes) {
break
}
time.Sleep(200 * time.Millisecond)
continue
}
dn := dnList.Head()
if dn == nil {
continue
}
var replicas []*master_pb.Location
for _, r := range dnList.Rest() {
replicas = append(replicas, &master_pb.Location{
Url: r.Url(),
PublicUrl: r.PublicUrl,
GrpcPort: uint32(r.GrpcPort),
DataCenter: r.GetDataCenterId(),
})
}
return &master_pb.AssignResponse{
Fid: fid,
Location: &master_pb.Location{
Url: dn.Url(),
PublicUrl: dn.PublicUrl,
GrpcPort: uint32(dn.GrpcPort),
DataCenter: dn.GetDataCenterId(),
},
Count: count,
Auth: string(security.GenJwtForVolumeServer(ms.guard.SigningKey, ms.guard.ExpiresAfterSec, fid)),
Replicas: replicas,
}, nil
}
if lastErr != nil {
glog.V(0).Infof("assign %v %v: %v", req, option.String(), lastErr)
}
return nil, lastErr
}