fix(volume-rust): fix volume balance between Go and Rust servers (#8915)

Two bugs prevented reliable volume balancing when a Rust volume server
is the copy target:

1. find_last_append_at_ns returned None for delete tombstones (Size==0
   in dat header), falling back to file mtime truncated to seconds.
   This caused the tail step to re-send needles from the last sub-second
   window. Fix: change `needle_size <= 0` to `< 0` since Size==0 delete
   needles still have a valid timestamp in their tail.

2. VolumeTailReceiver called read_body_v2 on delete needles, which have
   no DataSize/Data/flags — only checksum+timestamp+padding after the
   header. Fix: skip read_body_v2 when size == 0, reject negative sizes.

Also:
- Unify gRPC server bind: use TcpListener::bind before spawn for both
  TLS and non-TLS paths, propagating bind errors at startup.
- Add mixed Go+Rust cluster test harness and integration tests covering
  VolumeCopy in both directions, copy with deletes, and full balance
  move with tail tombstone propagation and source deletion.
- Make FindOrBuildRustBinary configurable for default vs no-default
  features (5-byte vs 4-byte offsets, matching `default = ["5bytes"]`).
This commit is contained in:
Chris Lu
2026-04-04 09:13:23 -07:00
committed by GitHub
parent d1823d3784
commit 9add18e169
6 changed files with 1039 additions and 46 deletions

View File

@@ -20,7 +20,7 @@ default = ["5bytes"]
[dependencies]
# Async runtime
tokio = { version = "1", features = ["full"] }
tokio-stream = "0.1"
tokio-stream = { version = "0.1", features = ["net"] }
tokio-io-timeout = "1"
# gRPC + protobuf

View File

@@ -600,35 +600,35 @@ async fn run(
})
};
// Bind the gRPC listener before spawning to propagate bind errors at startup.
let grpc_listener = tokio::net::TcpListener::bind(&grpc_addr)
.await
.unwrap_or_else(|e| panic!("Failed to bind gRPC to {}: {}", grpc_addr, e));
let grpc_local_addr = grpc_listener
.local_addr()
.unwrap_or_else(|e| panic!("Failed to get gRPC local addr: {}", e));
let grpc_handle = {
let grpc_state = state.clone();
let grpc_addr = grpc_addr.clone();
let grpc_tls_acceptor = grpc_tls_acceptor.clone();
let mut shutdown_rx = shutdown_tx.subscribe();
let shutdown_tx_grpc = shutdown_tx.clone();
tokio::spawn(async move {
let addr = tokio::net::lookup_host(&grpc_addr)
.await
.expect("Failed to resolve gRPC address")
.next()
.expect("No addresses found for gRPC bind address");
let grpc_service = VolumeGrpcService {
state: grpc_state.clone(),
};
if let Some(tls_acceptor) = grpc_tls_acceptor {
let listener = tokio::net::TcpListener::bind(&grpc_addr)
.await
.unwrap_or_else(|e| panic!("Failed to bind gRPC to {}: {}", grpc_addr, e));
let incoming = grpc_tls_incoming(listener, tls_acceptor);
let reflection_v1 = tonic_reflection::server::Builder::configure()
.register_encoded_file_descriptor_set(seaweed_volume::pb::FILE_DESCRIPTOR_SET)
.build_v1()
.expect("Failed to build gRPC reflection v1 service");
let reflection_v1alpha = tonic_reflection::server::Builder::configure()
.register_encoded_file_descriptor_set(seaweed_volume::pb::FILE_DESCRIPTOR_SET)
.build_v1alpha()
.expect("Failed to build gRPC reflection v1alpha service");
info!("gRPC server listening on {} (TLS enabled)", addr);
if let Err(e) = build_grpc_server_builder()
let reflection_v1 = tonic_reflection::server::Builder::configure()
.register_encoded_file_descriptor_set(seaweed_volume::pb::FILE_DESCRIPTOR_SET)
.build_v1()
.expect("Failed to build gRPC reflection v1 service");
let reflection_v1alpha = tonic_reflection::server::Builder::configure()
.register_encoded_file_descriptor_set(seaweed_volume::pb::FILE_DESCRIPTOR_SET)
.build_v1alpha()
.expect("Failed to build gRPC reflection v1alpha service");
let result = if let Some(tls_acceptor) = grpc_tls_acceptor {
let incoming = grpc_tls_incoming(grpc_listener, tls_acceptor);
info!("gRPC server listening on {} (TLS enabled)", grpc_local_addr);
build_grpc_server_builder()
.layer(GrpcRequestIdLayer)
.add_service(reflection_v1)
.add_service(reflection_v1alpha)
@@ -637,32 +637,25 @@ async fn run(
let _ = shutdown_rx.recv().await;
})
.await
{
error!("gRPC server error: {}", e);
}
} else {
let reflection_v1 = tonic_reflection::server::Builder::configure()
.register_encoded_file_descriptor_set(seaweed_volume::pb::FILE_DESCRIPTOR_SET)
.build_v1()
.expect("Failed to build gRPC reflection v1 service");
let reflection_v1alpha = tonic_reflection::server::Builder::configure()
.register_encoded_file_descriptor_set(seaweed_volume::pb::FILE_DESCRIPTOR_SET)
.build_v1alpha()
.expect("Failed to build gRPC reflection v1alpha service");
info!("gRPC server listening on {}", addr);
if let Err(e) = build_grpc_server_builder()
let incoming =
tokio_stream::wrappers::TcpListenerStream::new(grpc_listener);
info!("gRPC server listening on {}", grpc_local_addr);
build_grpc_server_builder()
.layer(GrpcRequestIdLayer)
.add_service(reflection_v1)
.add_service(reflection_v1alpha)
.add_service(build_volume_grpc_service(grpc_service))
.serve_with_shutdown(addr, async move {
.serve_with_incoming_shutdown(incoming, async move {
let _ = shutdown_rx.recv().await;
})
.await
{
error!("gRPC server error: {}", e);
}
};
if let Err(ref e) = result {
error!("gRPC server error: {}", e);
let _ = shutdown_tx_grpc.send(());
}
result
})
};
@@ -771,9 +764,40 @@ async fn run(
}))
};
// Wait for all servers
// Wait for servers. Use select! with &mut so the losing handle is not
// dropped, then await it explicitly afterward.
let mut server_err: Option<String> = None;
let mut http_handle = http_handle;
let mut grpc_handle = grpc_handle;
let grpc_finished_first = tokio::select! {
_ = &mut http_handle => false,
_ = &mut grpc_handle => true,
};
// Inspect the gRPC result (already resolved if it finished first,
// otherwise await it now).
let grpc_result = if grpc_finished_first {
grpc_handle.await
} else {
// HTTP finished first; gRPC is still running. Await it.
grpc_handle.await
};
match grpc_result {
Ok(Ok(())) => {}
Ok(Err(e)) => {
let msg = format!("gRPC server exited with error: {}", e);
error!("{}", msg);
server_err = Some(msg);
// serve error already sent shutdown inside the task
}
Err(e) => {
let msg = format!("gRPC task panicked: {}", e);
error!("{}", msg);
server_err = Some(msg);
let _ = shutdown_tx.send(());
}
}
// Ensure the HTTP handle completes too.
let _ = http_handle.await;
let _ = grpc_handle.await;
if let Some(h) = public_handle {
let _ = h.await;
}
@@ -798,6 +822,10 @@ async fn run(
cpu_profile.finish().map_err(std::io::Error::other)?;
}
if let Some(err_msg) = server_err {
return Err(std::io::Error::other(err_msg).into());
}
info!("Volume server stopped.");
Ok(())
}

View File

@@ -1867,6 +1867,7 @@ impl VolumeServer for VolumeGrpcService {
{
let needle_header = resp.needle_header;
let mut needle_body = resp.needle_body;
let resp_version = resp.version;
if needle_header.is_empty() {
continue;
@@ -1891,8 +1892,36 @@ impl VolumeServer for VolumeGrpcService {
// Parse needle from header + body
let mut n = Needle::default();
n.read_header(&needle_header);
n.read_body_v2(&needle_body)
.map_err(|e| Status::internal(format!("parse needle body: {}", e)))?;
if n.size.0 < 0 {
return Err(Status::invalid_argument(format!(
"unexpected negative needle size {} for needle {}",
n.size.0, n.id.0
)));
} else if n.size.0 > 0 {
// Normal needle: parse the body fields (DataSize, Data, flags, etc.)
n.read_body_v2(&needle_body)
.map_err(|e| Status::internal(format!("parse needle body: {}", e)))?;
} else {
// Delete tombstone (size == 0): body is checksum + timestamp
// (V3) or checksum only (V2) + padding. Validate minimum
// footer length for the protocol version.
use crate::storage::types::{
NEEDLE_CHECKSUM_SIZE, TIMESTAMP_SIZE, VERSION_3, Version,
};
let version = Version(resp_version as u8);
let min_footer = if version >= VERSION_3 {
NEEDLE_CHECKSUM_SIZE + TIMESTAMP_SIZE
} else {
NEEDLE_CHECKSUM_SIZE
};
if needle_body.len() < min_footer {
return Err(Status::invalid_argument(format!(
"tombstone needle {} body too short: got {} bytes, need >= {} for version {}",
n.id.0, needle_body.len(), min_footer, resp_version
)));
}
}
// Write needle to local volume
let mut store = state.store.write().unwrap();
@@ -4047,11 +4076,12 @@ fn find_last_append_at_ns(idx_path: &str, dat_path: &str, version: u32) -> Optio
let mut header = [0u8; 16];
dat_file.read_exact(&mut header).ok()?;
let needle_size = i32::from_be_bytes([header[12], header[13], header[14], header[15]]);
if needle_size <= 0 {
if needle_size < 0 {
return None;
}
// Seek to tail: offset + 16 (header) + size -> checksum (4) + timestamp (8)
// For delete needles (size == 0), the tail is right after the header.
let tail_offset = actual_offset as u64 + 16 + needle_size as u64;
dat_file.seek(SeekFrom::Start(tail_offset)).ok()?;