Files
seaweedFS/weed/server/volume_grpc_admin.go
Chris Lu 772459f93c fix: restore volume mount when VolumeConfigure fails (#7669)
* fix: restore volume mount when VolumeConfigure fails

When volume.configure.replication command fails (e.g., due to corrupted
.vif file), the volume was left unmounted and the master was already
notified that the volume was deleted, causing the volume to disappear.

This fix attempts to re-mount the volume when ConfigureVolume fails,
restoring the volume state and preventing data loss.

Fixes #7666

* include mount restore error in response message
2025-12-08 16:43:35 -08:00

365 lines
11 KiB
Go

package weed_server
import (
"context"
"fmt"
"path/filepath"
"time"
"github.com/seaweedfs/seaweedfs/weed/util/version"
"github.com/seaweedfs/seaweedfs/weed/storage"
"github.com/seaweedfs/seaweedfs/weed/cluster"
"github.com/seaweedfs/seaweedfs/weed/glog"
"github.com/seaweedfs/seaweedfs/weed/pb"
"github.com/seaweedfs/seaweedfs/weed/pb/filer_pb"
"github.com/seaweedfs/seaweedfs/weed/pb/master_pb"
"github.com/seaweedfs/seaweedfs/weed/pb/volume_server_pb"
"github.com/seaweedfs/seaweedfs/weed/stats"
"github.com/seaweedfs/seaweedfs/weed/storage/needle"
"github.com/seaweedfs/seaweedfs/weed/storage/super_block"
"github.com/seaweedfs/seaweedfs/weed/storage/types"
)
// DeleteCollection handles the gRPC request to delete a collection by passing
// req.Collection to the store's DeleteCollection.
//
// The outcome is logged (error level on failure, verbosity 2 on success). An
// empty response is always returned, together with whatever error the store
// reported.
func (vs *VolumeServer) DeleteCollection(ctx context.Context, req *volume_server_pb.DeleteCollectionRequest) (*volume_server_pb.DeleteCollectionResponse, error) {
	deleteErr := vs.store.DeleteCollection(req.Collection)

	if deleteErr == nil {
		glog.V(2).Infof("delete collection %v", req)
	} else {
		glog.Errorf("delete collection %s: %v", req.Collection, deleteErr)
	}

	return &volume_server_pb.DeleteCollectionResponse{}, deleteErr
}
// AllocateVolume handles the gRPC request to add a volume to the store.
//
// The volume id, needle version and disk type are converted from their wire
// representation; the remaining settings are forwarded from the request and
// from the server's own needleMapKind and ldbTimout fields. The outcome is
// logged, and the store's error (if any) is returned with an empty response.
func (vs *VolumeServer) AllocateVolume(ctx context.Context, req *volume_server_pb.AllocateVolumeRequest) (*volume_server_pb.AllocateVolumeResponse, error) {
	volumeId := needle.VolumeId(req.VolumeId)
	needleVersion := needle.Version(req.Version)
	diskType := types.ToDiskType(req.DiskType)

	addErr := vs.store.AddVolume(
		volumeId,
		req.Collection,
		vs.needleMapKind,
		req.Replication,
		req.Ttl,
		req.Preallocate,
		needleVersion,
		req.MemoryMapMaxSizeMb,
		diskType,
		vs.ldbTimout,
	)

	if addErr == nil {
		glog.V(2).Infof("assign volume %v", req)
	} else {
		glog.Errorf("assign volume %v: %v", req, addErr)
	}

	return &volume_server_pb.AllocateVolumeResponse{}, addErr
}
// VolumeMount handles the gRPC request to mount the volume identified by
// req.VolumeId via the store's MountVolume.
//
// The outcome is logged, and the store's error (if any) is returned with an
// empty response.
func (vs *VolumeServer) VolumeMount(ctx context.Context, req *volume_server_pb.VolumeMountRequest) (*volume_server_pb.VolumeMountResponse, error) {
	volumeId := needle.VolumeId(req.VolumeId)
	mountErr := vs.store.MountVolume(volumeId)

	if mountErr == nil {
		glog.V(2).Infof("volume mount %v", req)
	} else {
		glog.Errorf("volume mount %v: %v", req, mountErr)
	}

	return &volume_server_pb.VolumeMountResponse{}, mountErr
}
// VolumeUnmount handles the gRPC request to unmount the volume identified by
// req.VolumeId via the store's UnmountVolume.
//
// The outcome is logged, and the store's error (if any) is returned with an
// empty response.
func (vs *VolumeServer) VolumeUnmount(ctx context.Context, req *volume_server_pb.VolumeUnmountRequest) (*volume_server_pb.VolumeUnmountResponse, error) {
	volumeId := needle.VolumeId(req.VolumeId)
	unmountErr := vs.store.UnmountVolume(volumeId)

	if unmountErr == nil {
		glog.V(2).Infof("volume unmount %v", req)
	} else {
		glog.Errorf("volume unmount %v: %v", req, unmountErr)
	}

	return &volume_server_pb.VolumeUnmountResponse{}, unmountErr
}
// VolumeDelete handles the gRPC request to delete the volume identified by
// req.VolumeId, forwarding req.OnlyEmpty to the store's DeleteVolume.
//
// The outcome is logged, and the store's error (if any) is returned with an
// empty response.
func (vs *VolumeServer) VolumeDelete(ctx context.Context, req *volume_server_pb.VolumeDeleteRequest) (*volume_server_pb.VolumeDeleteResponse, error) {
	volumeId := needle.VolumeId(req.VolumeId)
	deleteErr := vs.store.DeleteVolume(volumeId, req.OnlyEmpty)

	if deleteErr == nil {
		glog.V(2).Infof("volume delete %v", req)
	} else {
		glog.Errorf("volume delete %v: %v", req, deleteErr)
	}

	return &volume_server_pb.VolumeDeleteResponse{}, deleteErr
}
// VolumeConfigure handles the gRPC request to change a volume's replication
// setting.
//
// The sequence is: validate req.Replication, unmount the volume, call the
// store's ConfigureVolume, and mount the volume again. If ConfigureVolume
// fails, a re-mount is still attempted so the volume is not left unmounted;
// a failure of that restore attempt is appended to the reported error.
//
// Failures are never returned as a Go error: they are described in
// resp.Error and the returned error is always nil.
func (vs *VolumeServer) VolumeConfigure(ctx context.Context, req *volume_server_pb.VolumeConfigureRequest) (*volume_server_pb.VolumeConfigureResponse, error) {
	resp := &volume_server_pb.VolumeConfigureResponse{}
	volumeId := needle.VolumeId(req.VolumeId)

	// Reject a malformed replication string before touching the volume.
	_, parseErr := super_block.NewReplicaPlacementFromString(req.Replication)
	if parseErr != nil {
		resp.Error = fmt.Sprintf("volume configure replication %v: %v", req, parseErr)
		return resp, nil
	}

	// Unmount first; nothing needs restoring if this step fails.
	unmountErr := vs.store.UnmountVolume(volumeId)
	if unmountErr != nil {
		glog.Errorf("volume configure unmount %v: %v", req, unmountErr)
		resp.Error = fmt.Sprintf("volume configure unmount %v: %v", req, unmountErr)
		return resp, nil
	}

	// Modify the volume info file.
	configureErr := vs.store.ConfigureVolume(volumeId, req.Replication)
	if configureErr == nil {
		// Configuration succeeded: mount the volume again.
		if mountErr := vs.store.MountVolume(volumeId); mountErr != nil {
			glog.Errorf("volume configure mount %v: %v", req, mountErr)
			resp.Error = fmt.Sprintf("volume configure mount %v: %v", req, mountErr)
		}
		return resp, nil
	}

	glog.Errorf("volume configure %v: %v", req, configureErr)
	resp.Error = fmt.Sprintf("volume configure %v: %v", req, configureErr)

	// Configuration failed: try to re-mount so the volume state is restored.
	if restoreErr := vs.store.MountVolume(volumeId); restoreErr != nil {
		glog.Errorf("volume configure failed to restore mount %v: %v", req, restoreErr)
		resp.Error += fmt.Sprintf(". Also failed to restore mount: %v", restoreErr)
	}
	return resp, nil
}
// VolumeMarkReadonly handles the gRPC request to mark a volume read-only.
//
// It returns a nil response and an error when the volume is not found in the
// store. Otherwise it proceeds in three steps:
//
//  1. notify the master that the volume is read-only;
//  2. mark the local volume read-only, forwarding req.GetPersist();
//  3. notify the master a second time.
//
// A failure of either notification is returned immediately. A failure of the
// local mark in step 2 is logged, does not prevent step 3, and is returned
// only if step 3 succeeds.
func (vs *VolumeServer) VolumeMarkReadonly(ctx context.Context, req *volume_server_pb.VolumeMarkReadonlyRequest) (*volume_server_pb.VolumeMarkReadonlyResponse, error) {
	volumeId := needle.VolumeId(req.VolumeId)

	volume := vs.store.GetVolume(volumeId)
	if volume == nil {
		return nil, fmt.Errorf("volume %d not found", req.VolumeId)
	}

	resp := &volume_server_pb.VolumeMarkReadonlyResponse{}

	// Step 1: stop the master from redirecting traffic here.
	if notifyErr := vs.notifyMasterVolumeReadonly(volume, true); notifyErr != nil {
		return resp, notifyErr
	}

	// Rare case 1.5 (original author's note): a heartbeat may happen between
	// steps 1 and 2.

	// Step 2: mark the local volume as read-only.
	markErr := vs.store.MarkVolumeReadonly(volumeId, req.GetPersist())
	if markErr == nil {
		glog.V(2).Infof("volume mark readonly %v", req)
	} else {
		glog.Errorf("volume mark readonly %v: %v", req, markErr)
	}

	// Step 3: notify the master again to guard against rare case 1.5.
	if notifyErr := vs.notifyMasterVolumeReadonly(volume, true); notifyErr != nil {
		return resp, notifyErr
	}

	return resp, markErr
}
// notifyMasterVolumeReadonly sends a VolumeMarkReadonly RPC to the master
// returned by vs.GetMaster, reporting the read-only state isReadOnly for
// volume v.
//
// The request carries this server's Ip and Port together with the volume's
// id, collection, replica placement byte, TTL, and disk type. Any failure,
// whether from the RPC itself or from pb.WithMasterClient, is logged and
// returned as an error that names the master.
func (vs *VolumeServer) notifyMasterVolumeReadonly(v *storage.Volume, isReadOnly bool) error {
	markOnMaster := func(masterClient master_pb.SeaweedClient) error {
		request := &master_pb.VolumeMarkReadonlyRequest{
			Ip:               vs.store.Ip,
			Port:             uint32(vs.store.Port),
			VolumeId:         uint32(v.Id),
			Collection:       v.Collection,
			ReplicaPlacement: uint32(v.ReplicaPlacement.Byte()),
			Ttl:              v.Ttl.ToUint32(),
			DiskType:         string(v.DiskType()),
			IsReadonly:       isReadOnly,
		}
		if _, rpcErr := masterClient.VolumeMarkReadonly(context.Background(), request); rpcErr != nil {
			return fmt.Errorf("set volume %d to read only on master: %v", v.Id, rpcErr)
		}
		return nil
	}

	grpcErr := pb.WithMasterClient(false, vs.GetMaster(context.Background()), vs.grpcDialOption, false, markOnMaster)
	if grpcErr == nil {
		return nil
	}

	glog.V(0).Infof("connect to %s: %v", vs.GetMaster(context.Background()), grpcErr)
	return fmt.Errorf("grpc VolumeMarkReadonly with master %s: %v", vs.GetMaster(context.Background()), grpcErr)
}
// VolumeMarkWritable handles the gRPC request to mark a volume writable.
//
// It returns a nil response and an error when the volume is not found in the
// store. Otherwise it marks the local volume writable and then notifies the
// master (isReadOnly=false) so that traffic can be directed here again.
//
// If the local mark fails, the error is logged and returned WITHOUT notifying
// the master: advertising the volume as writable while the local state was
// not changed could send writes to a volume that will not accept them. This
// also guarantees the local error is the one reported, rather than being
// replaced by a subsequent notification error.
func (vs *VolumeServer) VolumeMarkWritable(ctx context.Context, req *volume_server_pb.VolumeMarkWritableRequest) (*volume_server_pb.VolumeMarkWritableResponse, error) {
	resp := &volume_server_pb.VolumeMarkWritableResponse{}
	volumeId := needle.VolumeId(req.VolumeId)

	v := vs.store.GetVolume(volumeId)
	if v == nil {
		return nil, fmt.Errorf("volume %d not found", req.VolumeId)
	}

	if err := vs.store.MarkVolumeWritable(volumeId); err != nil {
		glog.Errorf("volume mark writable %v: %v", req, err)
		// Local state is unchanged; do not tell the master otherwise.
		return resp, err
	}
	glog.V(2).Infof("volume mark writable %v", req)

	// Enable the master to redirect traffic here.
	if err := vs.notifyMasterVolumeReadonly(v, false); err != nil {
		return resp, err
	}

	return resp, nil
}
// VolumeStatus handles the gRPC request for basic statistics of one volume.
//
// It returns a nil response and an error when the volume is not found in the
// store or when its DataBackend is nil. Otherwise the response reports the
// read-only flag, the size from DataBackend.GetStat, the file count, and the
// deleted-file count.
func (vs *VolumeServer) VolumeStatus(ctx context.Context, req *volume_server_pb.VolumeStatusRequest) (*volume_server_pb.VolumeStatusResponse, error) {
	volume := vs.store.GetVolume(needle.VolumeId(req.VolumeId))

	switch {
	case volume == nil:
		return nil, fmt.Errorf("not found volume id %d", req.VolumeId)
	case volume.DataBackend == nil:
		return nil, fmt.Errorf("volume %d data backend not found", req.VolumeId)
	}

	// Only the first GetStat result is used; the other two are discarded.
	// NOTE(review): if the third result is an error, a failed stat is
	// reported as whatever size value GetStat returned — confirm intent.
	size, _, _ := volume.DataBackend.GetStat()

	return &volume_server_pb.VolumeStatusResponse{
		IsReadOnly:       volume.IsReadOnly(),
		VolumeSize:       uint64(size),
		FileCount:        volume.FileCount(),
		FileDeletedCount: volume.DeletedCount(),
	}, nil
}
// VolumeServerStatus handles the gRPC request for server-wide status.
//
// The response carries the memory statistics from stats.MemStat, the version
// string, this server's data center and rack, and one disk status entry per
// store location whose directory can be resolved to an absolute path.
// Locations whose path cannot be resolved are skipped without error.
func (vs *VolumeServer) VolumeServerStatus(ctx context.Context, req *volume_server_pb.VolumeServerStatusRequest) (*volume_server_pb.VolumeServerStatusResponse, error) {
	status := &volume_server_pb.VolumeServerStatusResponse{
		MemoryStatus: stats.MemStat(),
		Version:      version.Version(),
		DataCenter:   vs.dataCenter,
		Rack:         vs.rack,
	}

	for _, location := range vs.store.Locations {
		absDir, absErr := filepath.Abs(location.Directory)
		if absErr != nil {
			continue
		}
		status.DiskStatuses = append(status.DiskStatuses, stats.NewDiskStatus(absDir))
	}

	return status, nil
}
// VolumeServerLeave handles the gRPC request for this volume server to leave:
// it calls vs.StopHeartbeat and returns an empty response with a nil error.
func (vs *VolumeServer) VolumeServerLeave(ctx context.Context, req *volume_server_pb.VolumeServerLeaveRequest) (*volume_server_pb.VolumeServerLeaveResponse, error) {
	vs.StopHeartbeat()

	return &volume_server_pb.VolumeServerLeaveResponse{}, nil
}
// VolumeNeedleStatus handles the gRPC request for the metadata of a single
// needle.
//
// The needle is read from a regular volume when the store has one with the
// requested id, otherwise from an EC volume when one exists; if neither
// exists an error is returned. A read error, or a negative count from the
// read, also yields a nil response and an error.
//
// On success the response reports the needle's id, cookie, size,
// last-modified value, and checksum, plus its TTL string when the needle
// has one.
func (vs *VolumeServer) VolumeNeedleStatus(ctx context.Context, req *volume_server_pb.VolumeNeedleStatusRequest) (*volume_server_pb.VolumeNeedleStatusResponse, error) {
	volumeId := needle.VolumeId(req.VolumeId)
	target := &needle.Needle{
		Id: types.NeedleId(req.NeedleId),
	}

	var (
		readCount int
		readErr   error
	)
	if vs.store.HasVolume(volumeId) {
		readCount, readErr = vs.store.ReadVolumeNeedle(volumeId, target, nil, nil)
	} else if _, isEcVolume := vs.store.FindEcVolume(volumeId); isEcVolume {
		readCount, readErr = vs.store.ReadEcShardNeedle(volumeId, target, nil)
	} else {
		return nil, fmt.Errorf("volume not found %d", req.VolumeId)
	}

	if readErr != nil {
		return nil, readErr
	}
	if readCount < 0 {
		return nil, fmt.Errorf("needle not found %d", target.Id)
	}

	status := &volume_server_pb.VolumeNeedleStatusResponse{
		NeedleId:     uint64(target.Id),
		Cookie:       uint32(target.Cookie),
		Size:         uint32(target.Size),
		LastModified: target.LastModified,
		Crc:          target.Checksum.Value(),
	}
	if target.HasTtl() {
		status.Ttl = target.Ttl.String()
	}
	return status, nil
}
// Ping handles the gRPC ping request. Depending on req.TargetType it pings a
// filer, another volume server, or a master at req.Target and relays that
// peer's start time.
//
// resp.StartTimeNs and resp.StopTimeNs bracket the whole call using the local
// clock. resp.RemoteTimeNs is set from the peer's StartTimeNs whenever the
// peer returned a non-nil response. A target type that matches none of the
// three known kinds results in no remote call and a nil error.
//
// Any error from the remote ping is returned with the target type and
// address prepended; resp is returned non-nil in every case.
func (vs *VolumeServer) Ping(ctx context.Context, req *volume_server_pb.PingRequest) (resp *volume_server_pb.PingResponse, pingErr error) {
	resp = &volume_server_pb.PingResponse{
		StartTimeNs: time.Now().UnixNano(),
	}

	switch req.TargetType {
	case cluster.FilerType:
		pingErr = pb.WithFilerClient(false, 0, pb.ServerAddress(req.Target), vs.grpcDialOption, func(filerClient filer_pb.SeaweedFilerClient) error {
			remote, remoteErr := filerClient.Ping(ctx, &filer_pb.PingRequest{})
			if remote != nil {
				resp.RemoteTimeNs = remote.StartTimeNs
			}
			return remoteErr
		})
	case cluster.VolumeServerType:
		pingErr = pb.WithVolumeServerClient(false, pb.ServerAddress(req.Target), vs.grpcDialOption, func(volumeClient volume_server_pb.VolumeServerClient) error {
			remote, remoteErr := volumeClient.Ping(ctx, &volume_server_pb.PingRequest{})
			if remote != nil {
				resp.RemoteTimeNs = remote.StartTimeNs
			}
			return remoteErr
		})
	case cluster.MasterType:
		pingErr = pb.WithMasterClient(false, pb.ServerAddress(req.Target), vs.grpcDialOption, false, func(masterClient master_pb.SeaweedClient) error {
			remote, remoteErr := masterClient.Ping(ctx, &master_pb.PingRequest{})
			if remote != nil {
				resp.RemoteTimeNs = remote.StartTimeNs
			}
			return remoteErr
		})
	}

	if pingErr != nil {
		pingErr = fmt.Errorf("ping %s %s: %v", req.TargetType, req.Target, pingErr)
	}

	resp.StopTimeNs = time.Now().UnixNano()
	return resp, pingErr
}