fix(worker): pass compaction revision and file sizes in EC volume copy (#8835)

* fix(worker): pass compaction revision and file sizes in EC volume copy

The worker EC task was sending CopyFile requests without the current
compaction revision (defaulting to 0) and with StopOffset set to
math.MaxInt64. After a vacuum compaction this caused the volume server
to reject the copy or to return stale data.

Read the volume file status first and forward the compaction revision
and actual file sizes so the copy is consistent with the compacted
volume.

* propagate erasure coding task context

* fix(worker): validate volume file status and detect short copies

Reject zero dat file size from ReadVolumeFileStatus — a zero-sized
snapshot would produce 0-byte copies and broken EC shards.

After streaming, verify totalBytes matches the expected stopOffset
and return an error on short copies instead of logging success.

* fix(worker): reject zero idx file size in volume status validation

A non-empty dat with zero idx indicates an empty or corrupt volume.
Without this guard, copyFileFromSource gets stopOffset=0, produces a
0-byte .idx, passes the short-copy check, and generateEcShardsLocally
runs against a volume with no index.

* fix fake plugin volume file status

* fix plugin volume balance test fixtures
This commit is contained in:
Chris Lu
2026-03-29 18:47:15 -07:00
committed by GitHub
parent e3359badfc
commit d074830016
4 changed files with 264 additions and 29 deletions

View File

@@ -16,6 +16,8 @@ import (
"google.golang.org/protobuf/proto"
)
const testVolumeDatSize = 1 * 1024 * 1024
func TestVolumeBalanceExecutionIntegration(t *testing.T) {
volumeID := uint32(303)
@@ -31,6 +33,7 @@ func TestVolumeBalanceExecutionIntegration(t *testing.T) {
source := pluginworkers.NewVolumeServer(t, "")
target := pluginworkers.NewVolumeServer(t, "")
pluginworkers.WriteTestVolumeFiles(t, source.BaseDir(), volumeID, testVolumeDatSize)
job := &plugin_pb.JobSpec{
JobId: fmt.Sprintf("balance-job-%d", volumeID),
@@ -84,6 +87,9 @@ func TestVolumeBalanceBatchExecutionIntegration(t *testing.T) {
// Build a batch job with 3 volume moves from source → target.
volumeIDs := []uint32{401, 402, 403}
for _, vid := range volumeIDs {
pluginworkers.WriteTestVolumeFiles(t, source.BaseDir(), vid, testVolumeDatSize)
}
moves := make([]*worker_pb.BalanceMoveSpec, len(volumeIDs))
for i, vid := range volumeIDs {
moves[i] = &worker_pb.BalanceMoveSpec{
@@ -139,10 +145,11 @@ func TestVolumeBalanceBatchExecutionIntegration(t *testing.T) {
require.True(t, deletedVols[vid], "volume %d should have been deleted from source", vid)
}
// Pre-delete verification should have called ReadVolumeFileStatus on both
// source and target for each volume.
require.Equal(t, len(volumeIDs), source.ReadFileStatusCount(),
"each move should read source volume status before delete")
// Each move reads source status once before copy and once inside the
// target's fake VolumeCopy implementation, then reads target status once
// before deleting the source.
require.Equal(t, len(volumeIDs)*2, source.ReadFileStatusCount(),
"each move should read source volume status before copy and during target copy")
require.Equal(t, len(volumeIDs), target.ReadFileStatusCount(),
"each move should read target volume status before delete")