fix: sync replica entries before ec.encode and volume.tier.move (#7798)

* fix: sync replica entries before ec.encode and volume.tier.move (#7797) This addresses the data inconsistency risk in multi-replica volumes. When ec.encode or volume.tier.move operates on a multi-replica volume: 1. Find the replica with the highest file count (the 'best' one) 2. Copy missing entries from other replicas INTO this best replica 3. Use this union replica for the destructive operation This ensures no data is lost due to replica inconsistency before EC encoding or tier moving. Added: - command_volume_replica_check.go: Core sync and select logic - command_volume_replica_check_test.go: Test coverage Modified: - command_ec_encode.go: Call syncAndSelectBestReplica before encoding - command_volume_tier_move.go: Call syncAndSelectBestReplica before moving Fixes #7797 * test: add integration test for replicated volume sync during ec.encode * test: improve retry logic for replicated volume integration test * fix: resolve JWT issue in integration tests by using empty security.toml * address review comments: add readNeedleMeta, parallelize status fetch, fix collection param, fix test issues * test: use collection parameter consistently in replica sync test * fix: convert weed binary path to absolute to work with changed working directory * fix: remove skip behavior, keep tests failing on missing binary * fix: always check recency for each needle, add divergent replica test
2025-12-16 23:16:07 -08:00
parent 9c4a2e1b1a
commit 347ed7cbfa
6 changed files with 959 additions and 15 deletions
--- a/weed/shell/command_ec_common.go
+++ b/weed/shell/command_ec_common.go
@@ -202,6 +202,35 @@ func collectEcNodes(commandEnv *CommandEnv, diskType types.DiskType) (ecNodes []
 	return collectEcNodesForDC(commandEnv, "", diskType)
 }

+// collectVolumeIdToCollection returns a map from volume ID to its collection name
+func collectVolumeIdToCollection(t *master_pb.TopologyInfo, vids []needle.VolumeId) map[needle.VolumeId]string {
+	result := make(map[needle.VolumeId]string)
+	if len(vids) == 0 {
+		return result
+	}
+
+	vidSet := make(map[needle.VolumeId]bool)
+	for _, vid := range vids {
+		vidSet[vid] = true
+	}
+
+	for _, dc := range t.DataCenterInfos {
+		for _, r := range dc.RackInfos {
+			for _, dn := range r.DataNodeInfos {
+				for _, diskInfo := range dn.DiskInfos {
+					for _, vi := range diskInfo.VolumeInfos {
+						vid := needle.VolumeId(vi.Id)
+						if vidSet[vid] {
+							result[vid] = vi.Collection
+						}
+					}
+				}
+			}
+		}
+	}
+	return result
+}
+
 func collectCollectionsForVolumeIds(t *master_pb.TopologyInfo, vids []needle.VolumeId) []string {
 	if len(vids) == 0 {
 		return nil
--- a/weed/shell/command_ec_encode.go
+++ b/weed/shell/command_ec_encode.go
@@ -153,6 +153,9 @@ func (c *commandEcEncode) Do(args []string, commandEnv *CommandEnv, writer io.Wr
 		return nil
 	}

+	// Collect volume ID to collection name mapping for the sync operation
+	volumeIdToCollection := collectVolumeIdToCollection(topologyInfo, volumeIds)
+
 	// Collect volume locations BEFORE EC encoding starts to avoid race condition
 	// where the master metadata is updated after EC encoding but before deletion
 	fmt.Printf("Collecting volume locations for %d volumes before EC encoding...\n", len(volumeIds))
@@ -162,7 +165,7 @@ func (c *commandEcEncode) Do(args []string, commandEnv *CommandEnv, writer io.Wr
 	}

 	// encode all requested volumes...
-	if err = doEcEncode(commandEnv, *collection, volumeIds, *maxParallelization); err != nil {
+	if err = doEcEncode(commandEnv, writer, volumeIdToCollection, volumeIds, *maxParallelization); err != nil {
 		return fmt.Errorf("ec encode for volumes %v: %w", volumeIds, err)
 	}
 	// ...re-balance ec shards...
@@ -192,7 +195,7 @@ func volumeLocations(commandEnv *CommandEnv, volumeIds []needle.VolumeId) (map[n
 	return res, nil
 }

-func doEcEncode(commandEnv *CommandEnv, collection string, volumeIds []needle.VolumeId, maxParallelization int) error {
+func doEcEncode(commandEnv *CommandEnv, writer io.Writer, volumeIdToCollection map[needle.VolumeId]string, volumeIds []needle.VolumeId, maxParallelization int) error {
 	if !commandEnv.isLocked() {
 		return fmt.Errorf("lock is lost")
 	}
@@ -217,10 +220,26 @@ func doEcEncode(commandEnv *CommandEnv, collection string, volumeIds []needle.Vo
 		return err
 	}

-	// generate ec shards
+	// Sync replicas and select the best one for each volume (with highest file count)
+	// This addresses data inconsistency risk in multi-replica volumes (issue #7797)
+	// by syncing missing entries between replicas before encoding
+	bestReplicas := make(map[needle.VolumeId]wdclient.Location)
+	for _, vid := range volumeIds {
+		locs := locations[vid]
+		collection := volumeIdToCollection[vid]
+		// Sync missing entries between replicas, then select the best one
+		bestLoc, selectErr := syncAndSelectBestReplica(commandEnv.option.GrpcDialOption, vid, collection, locs, "", writer)
+		if selectErr != nil {
+			return fmt.Errorf("failed to sync and select replica for volume %d: %v", vid, selectErr)
+		}
+		bestReplicas[vid] = bestLoc
+	}
+
+	// generate ec shards using the best replica for each volume
 	ewg.Reset()
-	for i, vid := range volumeIds {
-		target := locations[vid][i%len(locations[vid])]
+	for _, vid := range volumeIds {
+		target := bestReplicas[vid]
+		collection := volumeIdToCollection[vid]
 		ewg.Add(func() error {
 			if err := generateEcShards(commandEnv.option.GrpcDialOption, vid, collection, target.ServerAddress()); err != nil {
 				return fmt.Errorf("generate ec shards for volume %d on %s: %v", vid, target.Url, err)
@@ -239,8 +258,9 @@ func doEcEncode(commandEnv *CommandEnv, collection string, volumeIds []needle.Vo
 	}

 	ewg.Reset()
-	for i, vid := range volumeIds {
-		target := locations[vid][i%len(locations[vid])]
+	for _, vid := range volumeIds {
+		target := bestReplicas[vid]
+		collection := volumeIdToCollection[vid]
 		ewg.Add(func() error {
 			if err := mountEcShards(commandEnv.option.GrpcDialOption, collection, vid, target.ServerAddress(), shardIds); err != nil {
 				return fmt.Errorf("mount ec shards for volume %d on %s: %v", vid, target.Url, err)
--- a/weed/shell/command_volume_replica_check.go
+++ b/weed/shell/command_volume_replica_check.go
@@ -0,0 +1,353 @@
+package shell
+
+import (
+	"bytes"
+	"context"
+	"fmt"
+	"io"
+	"math"
+	"sync"
+	"time"
+
+	"google.golang.org/grpc"
+
+	"github.com/seaweedfs/seaweedfs/weed/operation"
+	"github.com/seaweedfs/seaweedfs/weed/pb"
+	"github.com/seaweedfs/seaweedfs/weed/pb/volume_server_pb"
+	"github.com/seaweedfs/seaweedfs/weed/storage/needle"
+	"github.com/seaweedfs/seaweedfs/weed/storage/needle_map"
+	"github.com/seaweedfs/seaweedfs/weed/wdclient"
+)
+
+// VolumeReplicaStatus represents the status of a volume replica
+type VolumeReplicaStatus struct {
+	Location         wdclient.Location
+	FileCount        uint64
+	FileDeletedCount uint64
+	VolumeSize       uint64
+	IsReadOnly       bool
+	Error            error
+}
+
+// getVolumeReplicaStatus retrieves the current status of a volume replica
+func getVolumeReplicaStatus(grpcDialOption grpc.DialOption, vid needle.VolumeId, location wdclient.Location) VolumeReplicaStatus {
+	status := VolumeReplicaStatus{
+		Location: location,
+	}
+
+	err := operation.WithVolumeServerClient(false, location.ServerAddress(), grpcDialOption, func(volumeServerClient volume_server_pb.VolumeServerClient) error {
+		resp, reqErr := volumeServerClient.VolumeStatus(context.Background(), &volume_server_pb.VolumeStatusRequest{
+			VolumeId: uint32(vid),
+		})
+		if reqErr != nil {
+			return reqErr
+		}
+		if resp != nil {
+			status.FileCount = resp.FileCount
+			status.FileDeletedCount = resp.FileDeletedCount
+			status.VolumeSize = resp.VolumeSize
+			status.IsReadOnly = resp.IsReadOnly
+		}
+		return nil
+	})
+	status.Error = err
+	return status
+}
+
+// getVolumeReplicaStatuses retrieves status for all replicas of a volume in parallel
+func getVolumeReplicaStatuses(grpcDialOption grpc.DialOption, vid needle.VolumeId, locations []wdclient.Location) []VolumeReplicaStatus {
+	statuses := make([]VolumeReplicaStatus, len(locations))
+	var wg sync.WaitGroup
+	for i, location := range locations {
+		wg.Add(1)
+		go func(i int, location wdclient.Location) {
+			defer wg.Done()
+			statuses[i] = getVolumeReplicaStatus(grpcDialOption, vid, location)
+		}(i, location)
+	}
+	wg.Wait()
+	return statuses
+}
+
+// replicaUnionBuilder builds a union replica by copying missing entries from other replicas
+type replicaUnionBuilder struct {
+	grpcDialOption grpc.DialOption
+	writer         io.Writer
+	vid            needle.VolumeId
+	collection     string
+}
+
+// buildUnionReplica finds the largest replica and copies missing entries from other replicas into it.
+// If excludeFromSelection is non-empty, that server won't be selected as the target but will still
+// be used as a source for missing entries.
+// Returns the location of the union replica (the one that now has all entries).
+func (rub *replicaUnionBuilder) buildUnionReplica(locations []wdclient.Location, excludeFromSelection string) (wdclient.Location, int, error) {
+	if len(locations) == 0 {
+		return wdclient.Location{}, 0, fmt.Errorf("no replicas available")
+	}
+	if len(locations) == 1 {
+		if locations[0].Url == excludeFromSelection {
+			return wdclient.Location{}, 0, fmt.Errorf("only replica is excluded")
+		}
+		return locations[0], 0, nil
+	}
+
+	// Step 1: Find the largest replica (highest file count) that's not excluded
+	statuses := getVolumeReplicaStatuses(rub.grpcDialOption, rub.vid, locations)
+
+	bestIdx := -1
+	var bestFileCount uint64
+	for i, s := range statuses {
+		if s.Error == nil && locations[i].Url != excludeFromSelection {
+			if bestIdx == -1 || s.FileCount > bestFileCount {
+				bestIdx = i
+				bestFileCount = s.FileCount
+			}
+		}
+	}
+
+	if bestIdx == -1 {
+		return wdclient.Location{}, 0, fmt.Errorf("could not find valid replica (all excluded or errored)")
+	}
+
+	bestLocation := locations[bestIdx]
+	fmt.Fprintf(rub.writer, "volume %d: selected %s as best replica (file count: %d)\n",
+		rub.vid, bestLocation.Url, bestFileCount)
+
+	// Step 2: Read index database from the best replica
+	bestDB := needle_map.NewMemDb()
+	if bestDB == nil {
+		return wdclient.Location{}, 0, fmt.Errorf("failed to allocate in-memory needle DB")
+	}
+	defer bestDB.Close()
+
+	if err := rub.readIndexDatabase(bestDB, bestLocation.ServerAddress()); err != nil {
+		return wdclient.Location{}, 0, fmt.Errorf("read index from best replica %s: %w", bestLocation.Url, err)
+	}
+
+	// Step 3: For each other replica (including excluded), find entries missing from best and copy them
+	totalSynced := 0
+	cutoffFromAtNs := uint64(time.Now().UnixNano())
+
+	for i, loc := range locations {
+		if i == bestIdx {
+			continue
+		}
+		if statuses[i].Error != nil {
+			fmt.Fprintf(rub.writer, "  skipping %s: %v\n", loc.Url, statuses[i].Error)
+			continue
+		}
+
+		// Read this replica's index
+		otherDB := needle_map.NewMemDb()
+		if otherDB == nil {
+			fmt.Fprintf(rub.writer, "  skipping %s: failed to allocate DB\n", loc.Url)
+			continue
+		}
+
+		if err := rub.readIndexDatabase(otherDB, loc.ServerAddress()); err != nil {
+			otherDB.Close()
+			fmt.Fprintf(rub.writer, "  skipping %s: %v\n", loc.Url, err)
+			continue
+		}
+
+		// Find entries in other that are missing from best
+		var missingNeedles []needle_map.NeedleValue
+
+		otherDB.AscendingVisit(func(nv needle_map.NeedleValue) error {
+			if nv.Size.IsDeleted() {
+				return nil
+			}
+			if _, found := bestDB.Get(nv.Key); !found {
+				// Check if this entry was written too recently (after sync started)
+				// Skip entries written after sync started to avoid copying in-flight writes
+				if needleMeta, err := readNeedleMeta(rub.grpcDialOption, loc.ServerAddress(), uint32(rub.vid), nv); err == nil {
+					if needleMeta.AppendAtNs > cutoffFromAtNs {
+						return nil // Skip entries written after sync started
+					}
+				}
+				missingNeedles = append(missingNeedles, nv)
+			}
+			return nil
+		})
+		otherDB.Close()
+
+		if len(missingNeedles) == 0 {
+			continue
+		}
+
+		// Copy missing entries from this replica to best replica
+		syncedFromThis := 0
+		for _, nv := range missingNeedles {
+			needleBlob, err := rub.readNeedleBlob(loc.ServerAddress(), nv)
+			if err != nil {
+				fmt.Fprintf(rub.writer, "  warning: read needle %d from %s: %v\n", nv.Key, loc.Url, err)
+				continue
+			}
+
+			if err := rub.writeNeedleBlob(bestLocation.ServerAddress(), nv, needleBlob); err != nil {
+				fmt.Fprintf(rub.writer, "  warning: write needle %d to %s: %v\n", nv.Key, bestLocation.Url, err)
+				continue
+			}
+
+			// Also add to bestDB so we don't copy duplicates from other replicas
+			bestDB.Set(nv.Key, nv.Offset, nv.Size)
+			syncedFromThis++
+		}
+
+		if syncedFromThis > 0 {
+			fmt.Fprintf(rub.writer, "  copied %d entries from %s to %s\n",
+				syncedFromThis, loc.Url, bestLocation.Url)
+			totalSynced += syncedFromThis
+		}
+	}
+
+	return bestLocation, totalSynced, nil
+}
+
+func (rub *replicaUnionBuilder) readIndexDatabase(db *needle_map.MemDb, server pb.ServerAddress) error {
+	var buf bytes.Buffer
+
+	err := operation.WithVolumeServerClient(true, server, rub.grpcDialOption, func(volumeServerClient volume_server_pb.VolumeServerClient) error {
+		copyFileClient, err := volumeServerClient.CopyFile(context.Background(), &volume_server_pb.CopyFileRequest{
+			VolumeId:                 uint32(rub.vid),
+			Ext:                      ".idx",
+			CompactionRevision:       math.MaxUint32,
+			StopOffset:               math.MaxInt64,
+			Collection:               rub.collection,
+			IsEcVolume:               false,
+			IgnoreSourceFileNotFound: false,
+		})
+		if err != nil {
+			return fmt.Errorf("start copy: %w", err)
+		}
+
+		for {
+			resp, recvErr := copyFileClient.Recv()
+			if recvErr == io.EOF {
+				break
+			}
+			if recvErr != nil {
+				return fmt.Errorf("receive: %w", recvErr)
+			}
+			buf.Write(resp.FileContent)
+		}
+		return nil
+	})
+	if err != nil {
+		return err
+	}
+
+	return db.LoadFilterFromReaderAt(bytes.NewReader(buf.Bytes()), true, false)
+}
+
+func (rub *replicaUnionBuilder) readNeedleBlob(server pb.ServerAddress, nv needle_map.NeedleValue) ([]byte, error) {
+	var needleBlob []byte
+	err := operation.WithVolumeServerClient(false, server, rub.grpcDialOption, func(client volume_server_pb.VolumeServerClient) error {
+		resp, err := client.ReadNeedleBlob(context.Background(), &volume_server_pb.ReadNeedleBlobRequest{
+			VolumeId: uint32(rub.vid),
+			Offset:   nv.Offset.ToActualOffset(),
+			Size:     int32(nv.Size),
+		})
+		if err != nil {
+			return err
+		}
+		needleBlob = resp.NeedleBlob
+		return nil
+	})
+	return needleBlob, err
+}
+
+func (rub *replicaUnionBuilder) writeNeedleBlob(server pb.ServerAddress, nv needle_map.NeedleValue, needleBlob []byte) error {
+	return operation.WithVolumeServerClient(false, server, rub.grpcDialOption, func(client volume_server_pb.VolumeServerClient) error {
+		_, err := client.WriteNeedleBlob(context.Background(), &volume_server_pb.WriteNeedleBlobRequest{
+			VolumeId:   uint32(rub.vid),
+			NeedleId:   uint64(nv.Key),
+			Size:       int32(nv.Size),
+			NeedleBlob: needleBlob,
+		})
+		return err
+	})
+}
+
+// syncAndSelectBestReplica finds the largest replica, copies missing entries from other replicas
+// into it to create a union, then returns this union replica for the operation.
+// If excludeFromSelection is non-empty, that server won't be selected but will still contribute entries.
+//
+// The process:
+// 1. Find the replica with the highest file count (the "best" one), excluding excludeFromSelection
+// 2. For each other replica, find entries missing from best and copy them to best
+// 3. Return the best replica which now contains the union of all entries
+func syncAndSelectBestReplica(grpcDialOption grpc.DialOption, vid needle.VolumeId, collection string, locations []wdclient.Location, excludeFromSelection string, writer io.Writer) (wdclient.Location, error) {
+	if len(locations) == 0 {
+		return wdclient.Location{}, fmt.Errorf("no replicas available for volume %d", vid)
+	}
+
+	// Filter for checking consistency (exclude the excluded server)
+	var checkLocations []wdclient.Location
+	for _, loc := range locations {
+		if loc.Url != excludeFromSelection {
+			checkLocations = append(checkLocations, loc)
+		}
+	}
+
+	if len(checkLocations) == 0 {
+		return wdclient.Location{}, fmt.Errorf("no replicas available for volume %d after exclusion", vid)
+	}
+
+	if len(checkLocations) == 1 && len(locations) == 1 {
+		return checkLocations[0], nil
+	}
+
+	// Check if replicas are already consistent (skip sync if so)
+	statuses := getVolumeReplicaStatuses(grpcDialOption, vid, locations)
+	var validStatuses []VolumeReplicaStatus
+	for i, s := range statuses {
+		if s.Error == nil {
+			// Include all for consistency check
+			validStatuses = append(validStatuses, s)
+			_ = i
+		}
+	}
+
+	if len(validStatuses) > 1 {
+		allSame := true
+		for _, s := range validStatuses[1:] {
+			if s.FileCount != validStatuses[0].FileCount {
+				allSame = false
+				break
+			}
+		}
+		if allSame {
+			// All replicas are consistent, return the best non-excluded one
+			for _, s := range validStatuses {
+				if s.Location.Url != excludeFromSelection {
+					fmt.Fprintf(writer, "volume %d: all %d replicas are consistent (file count: %d)\n",
+						vid, len(validStatuses), s.FileCount)
+					return s.Location, nil
+				}
+			}
+		}
+	}
+
+	// Replicas are inconsistent, build union on the best replica
+	fmt.Fprintf(writer, "volume %d: replicas are inconsistent, building union...\n", vid)
+
+	builder := &replicaUnionBuilder{
+		grpcDialOption: grpcDialOption,
+		writer:         writer,
+		vid:            vid,
+		collection:     collection,
+	}
+
+	unionLocation, totalSynced, err := builder.buildUnionReplica(locations, excludeFromSelection)
+	if err != nil {
+		return wdclient.Location{}, fmt.Errorf("failed to build union replica: %w", err)
+	}
+
+	if totalSynced > 0 {
+		fmt.Fprintf(writer, "volume %d: added %d entries to union replica %s\n", vid, totalSynced, unionLocation.Url)
+	}
+
+	return unionLocation, nil
+}
--- a/weed/shell/command_volume_replica_check_test.go
+++ b/weed/shell/command_volume_replica_check_test.go
@@ -0,0 +1,273 @@
+package shell
+
+import (
+	"bytes"
+	"testing"
+
+	"github.com/seaweedfs/seaweedfs/weed/storage/needle_map"
+	"github.com/seaweedfs/seaweedfs/weed/storage/types"
+)
+
+func TestBuildUnionFromMultipleIndexDatabases(t *testing.T) {
+	// Test that we can correctly identify missing entries between replicas
+
+	// Create mock index databases representing different replicas
+	replicaA := needle_map.NewMemDb()
+	replicaB := needle_map.NewMemDb()
+	replicaC := needle_map.NewMemDb()
+	defer replicaA.Close()
+	defer replicaB.Close()
+	defer replicaC.Close()
+
+	// Replica A has entries 1, 2, 3, 4, 5
+	replicaA.Set(types.NeedleId(1), types.Offset{}, types.Size(100))
+	replicaA.Set(types.NeedleId(2), types.Offset{}, types.Size(100))
+	replicaA.Set(types.NeedleId(3), types.Offset{}, types.Size(100))
+	replicaA.Set(types.NeedleId(4), types.Offset{}, types.Size(100))
+	replicaA.Set(types.NeedleId(5), types.Offset{}, types.Size(100))
+
+	// Replica B has entries 1, 2, 3, 6, 7 (missing 4, 5 from A, has unique 6, 7)
+	replicaB.Set(types.NeedleId(1), types.Offset{}, types.Size(100))
+	replicaB.Set(types.NeedleId(2), types.Offset{}, types.Size(100))
+	replicaB.Set(types.NeedleId(3), types.Offset{}, types.Size(100))
+	replicaB.Set(types.NeedleId(6), types.Offset{}, types.Size(100))
+	replicaB.Set(types.NeedleId(7), types.Offset{}, types.Size(100))
+
+	// Replica C has entries 1, 2, 8 (minimal overlap, has unique 8)
+	replicaC.Set(types.NeedleId(1), types.Offset{}, types.Size(100))
+	replicaC.Set(types.NeedleId(2), types.Offset{}, types.Size(100))
+	replicaC.Set(types.NeedleId(8), types.Offset{}, types.Size(100))
+
+	// Test: Find entries in B that are missing from A
+	var missingFromA []types.NeedleId
+	replicaB.AscendingVisit(func(nv needle_map.NeedleValue) error {
+		if _, found := replicaA.Get(nv.Key); !found {
+			missingFromA = append(missingFromA, nv.Key)
+		}
+		return nil
+	})
+
+	if len(missingFromA) != 2 {
+		t.Errorf("Expected 2 entries missing from A (6, 7), got %d: %v", len(missingFromA), missingFromA)
+	}
+
+	// Test: Find entries in C that are missing from A
+	var missingFromAinC []types.NeedleId
+	replicaC.AscendingVisit(func(nv needle_map.NeedleValue) error {
+		if _, found := replicaA.Get(nv.Key); !found {
+			missingFromAinC = append(missingFromAinC, nv.Key)
+		}
+		return nil
+	})
+
+	if len(missingFromAinC) != 1 {
+		t.Errorf("Expected 1 entry missing from A in C (8), got %d: %v", len(missingFromAinC), missingFromAinC)
+	}
+
+	// Simulate building union: add missing entries to A
+	for _, id := range missingFromA {
+		replicaA.Set(id, types.Offset{}, types.Size(100))
+	}
+	for _, id := range missingFromAinC {
+		replicaA.Set(id, types.Offset{}, types.Size(100))
+	}
+
+	// Verify A now has all 8 unique entries
+	count := 0
+	replicaA.AscendingVisit(func(nv needle_map.NeedleValue) error {
+		count++
+		return nil
+	})
+
+	if count != 8 {
+		t.Errorf("Expected union to have 8 entries, got %d", count)
+	}
+}
+
+func TestFindLargestReplica(t *testing.T) {
+	// Test that we correctly identify the replica with the most entries
+
+	type replicaInfo struct {
+		url       string
+		fileCount uint64
+	}
+
+	testCases := []struct {
+		name     string
+		replicas []replicaInfo
+		expected string
+	}{
+		{
+			name: "single replica",
+			replicas: []replicaInfo{
+				{"server1:8080", 100},
+			},
+			expected: "server1:8080",
+		},
+		{
+			name: "first is largest",
+			replicas: []replicaInfo{
+				{"server1:8080", 100},
+				{"server2:8080", 50},
+				{"server3:8080", 75},
+			},
+			expected: "server1:8080",
+		},
+		{
+			name: "last is largest",
+			replicas: []replicaInfo{
+				{"server1:8080", 50},
+				{"server2:8080", 75},
+				{"server3:8080", 100},
+			},
+			expected: "server3:8080",
+		},
+		{
+			name: "middle is largest",
+			replicas: []replicaInfo{
+				{"server1:8080", 50},
+				{"server2:8080", 100},
+				{"server3:8080", 75},
+			},
+			expected: "server2:8080",
+		},
+		{
+			name: "all equal - pick first",
+			replicas: []replicaInfo{
+				{"server1:8080", 100},
+				{"server2:8080", 100},
+				{"server3:8080", 100},
+			},
+			expected: "server1:8080",
+		},
+	}
+
+	for _, tc := range testCases {
+		t.Run(tc.name, func(t *testing.T) {
+			// Find the largest
+			bestIdx := 0
+			var bestCount uint64 = 0
+			for i, r := range tc.replicas {
+				if i == 0 || r.fileCount > bestCount {
+					bestIdx = i
+					bestCount = r.fileCount
+				}
+			}
+
+			if tc.replicas[bestIdx].url != tc.expected {
+				t.Errorf("Expected %s, got %s", tc.expected, tc.replicas[bestIdx].url)
+			}
+		})
+	}
+}
+
+func TestDeletedEntriesAreSkipped(t *testing.T) {
+	// Test that deleted entries are not copied during sync
+
+	replicaA := needle_map.NewMemDb()
+	replicaB := needle_map.NewMemDb()
+	defer replicaA.Close()
+	defer replicaB.Close()
+
+	// Replica A has entries 1, 2, 3 (all valid)
+	replicaA.Set(types.NeedleId(1), types.Offset{}, types.Size(100))
+	replicaA.Set(types.NeedleId(2), types.Offset{}, types.Size(100))
+	replicaA.Set(types.NeedleId(3), types.Offset{}, types.Size(100))
+
+	// Replica B has entry 4 valid, entry 5 deleted
+	replicaB.Set(types.NeedleId(4), types.Offset{}, types.Size(100))
+	replicaB.Set(types.NeedleId(5), types.Offset{}, types.Size(-1)) // Deleted (negative size)
+
+	// Find non-deleted entries in B missing from A
+	var missingFromA []types.NeedleId
+	replicaB.AscendingVisit(func(nv needle_map.NeedleValue) error {
+		if nv.Size.IsDeleted() {
+			return nil // Skip deleted
+		}
+		if _, found := replicaA.Get(nv.Key); !found {
+			missingFromA = append(missingFromA, nv.Key)
+		}
+		return nil
+	})
+
+	if len(missingFromA) != 1 {
+		t.Errorf("Expected 1 non-deleted entry missing (4), got %d: %v", len(missingFromA), missingFromA)
+	}
+
+	if len(missingFromA) > 0 && missingFromA[0] != types.NeedleId(4) {
+		t.Errorf("Expected missing entry to be 4, got %d", missingFromA[0])
+	}
+}
+
+func TestReplicaUnionBuilder_EmptyLocations(t *testing.T) {
+	// Test handling of empty locations slice
+	builder := &replicaUnionBuilder{
+		writer: &bytes.Buffer{},
+		vid:    1,
+	}
+
+	_, count, err := builder.buildUnionReplica(nil, "")
+	if err == nil {
+		t.Error("Expected error for empty locations")
+	}
+	if count != 0 {
+		t.Errorf("Expected 0 synced, got %d", count)
+	}
+}
+
+func TestAvoidDuplicateCopies(t *testing.T) {
+	// Test that when building union, we don't copy the same entry multiple times
+	// by updating the best replica's in-memory index after each copy
+
+	bestDB := needle_map.NewMemDb()
+	defer bestDB.Close()
+
+	// Best replica has entries 1, 2
+	bestDB.Set(types.NeedleId(1), types.Offset{}, types.Size(100))
+	bestDB.Set(types.NeedleId(2), types.Offset{}, types.Size(100))
+
+	// Simulate two other replicas both having entry 3
+	otherReplicas := [][]types.NeedleId{
+		{3, 4}, // Replica B has 3, 4
+		{3, 5}, // Replica C has 3, 5
+	}
+
+	copiedEntries := make(map[types.NeedleId]int) // Track how many times each entry is "copied"
+
+	for _, otherEntries := range otherReplicas {
+		for _, id := range otherEntries {
+			if _, found := bestDB.Get(id); !found {
+				// Would copy this entry
+				copiedEntries[id]++
+				// Add to bestDB to prevent duplicate copies
+				bestDB.Set(id, types.Offset{}, types.Size(100))
+			}
+		}
+	}
+
+	// Entry 3 should only be copied once (from first replica that has it)
+	if copiedEntries[types.NeedleId(3)] != 1 {
+		t.Errorf("Entry 3 should be copied exactly once, got %d", copiedEntries[types.NeedleId(3)])
+	}
+
+	// Entry 4 should be copied once
+	if copiedEntries[types.NeedleId(4)] != 1 {
+		t.Errorf("Entry 4 should be copied exactly once, got %d", copiedEntries[types.NeedleId(4)])
+	}
+
+	// Entry 5 should be copied once
+	if copiedEntries[types.NeedleId(5)] != 1 {
+		t.Errorf("Entry 5 should be copied exactly once, got %d", copiedEntries[types.NeedleId(5)])
+	}
+
+	// Best should now have 5 entries total
+	count := 0
+	bestDB.AscendingVisit(func(nv needle_map.NeedleValue) error {
+		count++
+		return nil
+	})
+	if count != 5 {
+		t.Errorf("Expected 5 entries in union, got %d", count)
+	}
+}
+
--- a/weed/shell/command_volume_tier_move.go
+++ b/weed/shell/command_volume_tier_move.go
@@ -102,6 +102,9 @@ func (c *commandVolumeTierMove) Do(args []string, commandEnv *CommandEnv, writer
 	}
 	fmt.Printf("tier move volumes: %v\n", volumeIds)

+	// Collect volume ID to collection name mapping for the sync operation
+	volumeIdToCollection := collectVolumeIdToCollection(topologyInfo, volumeIds)
+
 	_, allLocations := collectVolumeReplicaLocations(topologyInfo)
 	allLocations = filterLocationsByDiskType(allLocations, toDiskType)
 	keepDataNodesSorted(allLocations, toDiskType)
@@ -143,7 +146,8 @@ func (c *commandVolumeTierMove) Do(args []string, commandEnv *CommandEnv, writer
 	}

 	for _, vid := range volumeIds {
-		if err = c.doVolumeTierMove(commandEnv, writer, vid, toDiskType, allLocations); err != nil {
+		collection := volumeIdToCollection[vid]
+		if err = c.doVolumeTierMove(commandEnv, writer, vid, collection, toDiskType, allLocations); err != nil {
 			fmt.Printf("tier move volume %d: %v\n", vid, err)
 		}
 		allLocations = rotateDataNodes(allLocations)
@@ -192,7 +196,7 @@ func isOneOf(server string, locations []wdclient.Location) bool {
 	return false
 }

-func (c *commandVolumeTierMove) doVolumeTierMove(commandEnv *CommandEnv, writer io.Writer, vid needle.VolumeId, toDiskType types.DiskType, allLocations []location) (err error) {
+func (c *commandVolumeTierMove) doVolumeTierMove(commandEnv *CommandEnv, writer io.Writer, vid needle.VolumeId, collection string, toDiskType types.DiskType, allLocations []location) (err error) {
 	// find volume location
 	locations, found := commandEnv.MasterClient.GetLocationsClone(uint32(vid))
 	if !found {
@@ -208,12 +212,18 @@ func (c *commandVolumeTierMove) doVolumeTierMove(commandEnv *CommandEnv, writer
 			if isOneOf(dst.dataNode.Id, locations) {
 				continue
 			}
-			var sourceVolumeServer pb.ServerAddress
-			for _, loc := range locations {
-				if loc.Url != dst.dataNode.Id {
-					sourceVolumeServer = loc.ServerAddress()
-				}
+
+			// Sync replicas and select the best one (with highest file count) for multi-replica volumes
+			// This addresses data inconsistency risk in multi-replica volumes (issue #7797)
+			// by syncing missing entries between replicas before moving
+			sourceLoc, selectErr := syncAndSelectBestReplica(
+				commandEnv.option.GrpcDialOption, vid, collection, locations, dst.dataNode.Id, writer)
+			if selectErr != nil {
+				fmt.Fprintf(writer, "failed to sync and select source replica for volume %d: %v\n", vid, selectErr)
+				continue
 			}
+			sourceVolumeServer := sourceLoc.ServerAddress()
+
 			if sourceVolumeServer == "" {
 				continue
 			}