fix(volume-rust): fix volume balance between Go and Rust servers (#8915)
Two bugs prevented reliable volume balancing when a Rust volume server is the copy target:

1. `find_last_append_at_ns` returned `None` for delete tombstones (Size == 0 in the .dat header), falling back to the file mtime truncated to seconds. This caused the tail step to re-send needles from the last sub-second window. Fix: change `needle_size <= 0` to `< 0`, since Size == 0 delete needles still have a valid timestamp in their tail.

2. `VolumeTailReceiver` called `read_body_v2` on delete needles, which have no DataSize/Data/flags — only checksum + timestamp + padding after the header. Fix: skip `read_body_v2` when size == 0, reject negative sizes.

Also:
- Unify gRPC server bind: use `TcpListener::bind` before spawn for both TLS and non-TLS paths, propagating bind errors at startup.
- Add a mixed Go+Rust cluster test harness and integration tests covering VolumeCopy in both directions, copy with deletes, and a full balance move with tail tombstone propagation and source deletion.
- Make FindOrBuildRustBinary configurable for default vs no-default features (4-byte vs 5-byte offsets).
This commit is contained in:
@@ -20,7 +20,7 @@ default = ["5bytes"]
|
||||
[dependencies]
|
||||
# Async runtime
|
||||
tokio = { version = "1", features = ["full"] }
|
||||
tokio-stream = "0.1"
|
||||
tokio-stream = { version = "0.1", features = ["net"] }
|
||||
tokio-io-timeout = "1"
|
||||
|
||||
# gRPC + protobuf
|
||||
|
||||
@@ -600,35 +600,35 @@ async fn run(
|
||||
})
|
||||
};
|
||||
|
||||
// Bind the gRPC listener before spawning to propagate bind errors at startup.
|
||||
let grpc_listener = tokio::net::TcpListener::bind(&grpc_addr)
|
||||
.await
|
||||
.unwrap_or_else(|e| panic!("Failed to bind gRPC to {}: {}", grpc_addr, e));
|
||||
let grpc_local_addr = grpc_listener
|
||||
.local_addr()
|
||||
.unwrap_or_else(|e| panic!("Failed to get gRPC local addr: {}", e));
|
||||
|
||||
let grpc_handle = {
|
||||
let grpc_state = state.clone();
|
||||
let grpc_addr = grpc_addr.clone();
|
||||
let grpc_tls_acceptor = grpc_tls_acceptor.clone();
|
||||
let mut shutdown_rx = shutdown_tx.subscribe();
|
||||
let shutdown_tx_grpc = shutdown_tx.clone();
|
||||
tokio::spawn(async move {
|
||||
let addr = tokio::net::lookup_host(&grpc_addr)
|
||||
.await
|
||||
.expect("Failed to resolve gRPC address")
|
||||
.next()
|
||||
.expect("No addresses found for gRPC bind address");
|
||||
let grpc_service = VolumeGrpcService {
|
||||
state: grpc_state.clone(),
|
||||
};
|
||||
if let Some(tls_acceptor) = grpc_tls_acceptor {
|
||||
let listener = tokio::net::TcpListener::bind(&grpc_addr)
|
||||
.await
|
||||
.unwrap_or_else(|e| panic!("Failed to bind gRPC to {}: {}", grpc_addr, e));
|
||||
let incoming = grpc_tls_incoming(listener, tls_acceptor);
|
||||
let reflection_v1 = tonic_reflection::server::Builder::configure()
|
||||
.register_encoded_file_descriptor_set(seaweed_volume::pb::FILE_DESCRIPTOR_SET)
|
||||
.build_v1()
|
||||
.expect("Failed to build gRPC reflection v1 service");
|
||||
let reflection_v1alpha = tonic_reflection::server::Builder::configure()
|
||||
.register_encoded_file_descriptor_set(seaweed_volume::pb::FILE_DESCRIPTOR_SET)
|
||||
.build_v1alpha()
|
||||
.expect("Failed to build gRPC reflection v1alpha service");
|
||||
info!("gRPC server listening on {} (TLS enabled)", addr);
|
||||
if let Err(e) = build_grpc_server_builder()
|
||||
let reflection_v1 = tonic_reflection::server::Builder::configure()
|
||||
.register_encoded_file_descriptor_set(seaweed_volume::pb::FILE_DESCRIPTOR_SET)
|
||||
.build_v1()
|
||||
.expect("Failed to build gRPC reflection v1 service");
|
||||
let reflection_v1alpha = tonic_reflection::server::Builder::configure()
|
||||
.register_encoded_file_descriptor_set(seaweed_volume::pb::FILE_DESCRIPTOR_SET)
|
||||
.build_v1alpha()
|
||||
.expect("Failed to build gRPC reflection v1alpha service");
|
||||
let result = if let Some(tls_acceptor) = grpc_tls_acceptor {
|
||||
let incoming = grpc_tls_incoming(grpc_listener, tls_acceptor);
|
||||
info!("gRPC server listening on {} (TLS enabled)", grpc_local_addr);
|
||||
build_grpc_server_builder()
|
||||
.layer(GrpcRequestIdLayer)
|
||||
.add_service(reflection_v1)
|
||||
.add_service(reflection_v1alpha)
|
||||
@@ -637,32 +637,25 @@ async fn run(
|
||||
let _ = shutdown_rx.recv().await;
|
||||
})
|
||||
.await
|
||||
{
|
||||
error!("gRPC server error: {}", e);
|
||||
}
|
||||
} else {
|
||||
let reflection_v1 = tonic_reflection::server::Builder::configure()
|
||||
.register_encoded_file_descriptor_set(seaweed_volume::pb::FILE_DESCRIPTOR_SET)
|
||||
.build_v1()
|
||||
.expect("Failed to build gRPC reflection v1 service");
|
||||
let reflection_v1alpha = tonic_reflection::server::Builder::configure()
|
||||
.register_encoded_file_descriptor_set(seaweed_volume::pb::FILE_DESCRIPTOR_SET)
|
||||
.build_v1alpha()
|
||||
.expect("Failed to build gRPC reflection v1alpha service");
|
||||
info!("gRPC server listening on {}", addr);
|
||||
if let Err(e) = build_grpc_server_builder()
|
||||
let incoming =
|
||||
tokio_stream::wrappers::TcpListenerStream::new(grpc_listener);
|
||||
info!("gRPC server listening on {}", grpc_local_addr);
|
||||
build_grpc_server_builder()
|
||||
.layer(GrpcRequestIdLayer)
|
||||
.add_service(reflection_v1)
|
||||
.add_service(reflection_v1alpha)
|
||||
.add_service(build_volume_grpc_service(grpc_service))
|
||||
.serve_with_shutdown(addr, async move {
|
||||
.serve_with_incoming_shutdown(incoming, async move {
|
||||
let _ = shutdown_rx.recv().await;
|
||||
})
|
||||
.await
|
||||
{
|
||||
error!("gRPC server error: {}", e);
|
||||
}
|
||||
};
|
||||
if let Err(ref e) = result {
|
||||
error!("gRPC server error: {}", e);
|
||||
let _ = shutdown_tx_grpc.send(());
|
||||
}
|
||||
result
|
||||
})
|
||||
};
|
||||
|
||||
@@ -771,9 +764,40 @@ async fn run(
|
||||
}))
|
||||
};
|
||||
|
||||
// Wait for all servers
|
||||
// Wait for servers. Use select! with &mut so the losing handle is not
|
||||
// dropped, then await it explicitly afterward.
|
||||
let mut server_err: Option<String> = None;
|
||||
let mut http_handle = http_handle;
|
||||
let mut grpc_handle = grpc_handle;
|
||||
let grpc_finished_first = tokio::select! {
|
||||
_ = &mut http_handle => false,
|
||||
_ = &mut grpc_handle => true,
|
||||
};
|
||||
// Inspect the gRPC result (already resolved if it finished first,
|
||||
// otherwise await it now).
|
||||
let grpc_result = if grpc_finished_first {
|
||||
grpc_handle.await
|
||||
} else {
|
||||
// HTTP finished first; gRPC is still running. Await it.
|
||||
grpc_handle.await
|
||||
};
|
||||
match grpc_result {
|
||||
Ok(Ok(())) => {}
|
||||
Ok(Err(e)) => {
|
||||
let msg = format!("gRPC server exited with error: {}", e);
|
||||
error!("{}", msg);
|
||||
server_err = Some(msg);
|
||||
// serve error already sent shutdown inside the task
|
||||
}
|
||||
Err(e) => {
|
||||
let msg = format!("gRPC task panicked: {}", e);
|
||||
error!("{}", msg);
|
||||
server_err = Some(msg);
|
||||
let _ = shutdown_tx.send(());
|
||||
}
|
||||
}
|
||||
// Ensure the HTTP handle completes too.
|
||||
let _ = http_handle.await;
|
||||
let _ = grpc_handle.await;
|
||||
if let Some(h) = public_handle {
|
||||
let _ = h.await;
|
||||
}
|
||||
@@ -798,6 +822,10 @@ async fn run(
|
||||
cpu_profile.finish().map_err(std::io::Error::other)?;
|
||||
}
|
||||
|
||||
if let Some(err_msg) = server_err {
|
||||
return Err(std::io::Error::other(err_msg).into());
|
||||
}
|
||||
|
||||
info!("Volume server stopped.");
|
||||
Ok(())
|
||||
}
|
||||
|
||||
@@ -1867,6 +1867,7 @@ impl VolumeServer for VolumeGrpcService {
|
||||
{
|
||||
let needle_header = resp.needle_header;
|
||||
let mut needle_body = resp.needle_body;
|
||||
let resp_version = resp.version;
|
||||
|
||||
if needle_header.is_empty() {
|
||||
continue;
|
||||
@@ -1891,8 +1892,36 @@ impl VolumeServer for VolumeGrpcService {
|
||||
// Parse needle from header + body
|
||||
let mut n = Needle::default();
|
||||
n.read_header(&needle_header);
|
||||
n.read_body_v2(&needle_body)
|
||||
.map_err(|e| Status::internal(format!("parse needle body: {}", e)))?;
|
||||
|
||||
if n.size.0 < 0 {
|
||||
return Err(Status::invalid_argument(format!(
|
||||
"unexpected negative needle size {} for needle {}",
|
||||
n.size.0, n.id.0
|
||||
)));
|
||||
} else if n.size.0 > 0 {
|
||||
// Normal needle: parse the body fields (DataSize, Data, flags, etc.)
|
||||
n.read_body_v2(&needle_body)
|
||||
.map_err(|e| Status::internal(format!("parse needle body: {}", e)))?;
|
||||
} else {
|
||||
// Delete tombstone (size == 0): body is checksum + timestamp
|
||||
// (V3) or checksum only (V2) + padding. Validate minimum
|
||||
// footer length for the protocol version.
|
||||
use crate::storage::types::{
|
||||
NEEDLE_CHECKSUM_SIZE, TIMESTAMP_SIZE, VERSION_3, Version,
|
||||
};
|
||||
let version = Version(resp_version as u8);
|
||||
let min_footer = if version >= VERSION_3 {
|
||||
NEEDLE_CHECKSUM_SIZE + TIMESTAMP_SIZE
|
||||
} else {
|
||||
NEEDLE_CHECKSUM_SIZE
|
||||
};
|
||||
if needle_body.len() < min_footer {
|
||||
return Err(Status::invalid_argument(format!(
|
||||
"tombstone needle {} body too short: got {} bytes, need >= {} for version {}",
|
||||
n.id.0, needle_body.len(), min_footer, resp_version
|
||||
)));
|
||||
}
|
||||
}
|
||||
|
||||
// Write needle to local volume
|
||||
let mut store = state.store.write().unwrap();
|
||||
@@ -4047,11 +4076,12 @@ fn find_last_append_at_ns(idx_path: &str, dat_path: &str, version: u32) -> Optio
|
||||
let mut header = [0u8; 16];
|
||||
dat_file.read_exact(&mut header).ok()?;
|
||||
let needle_size = i32::from_be_bytes([header[12], header[13], header[14], header[15]]);
|
||||
if needle_size <= 0 {
|
||||
if needle_size < 0 {
|
||||
return None;
|
||||
}
|
||||
|
||||
// Seek to tail: offset + 16 (header) + size -> checksum (4) + timestamp (8)
|
||||
// For delete needles (size == 0), the tail is right after the header.
|
||||
let tail_offset = actual_offset as u64 + 16 + needle_size as u64;
|
||||
dat_file.seek(SeekFrom::Start(tail_offset)).ok()?;
|
||||
|
||||
|
||||
Reference in New Issue
Block a user