* Prevent split-brain: Persistent ClusterID and Join Validation - Persist ClusterId in Raft store to survive restarts. - Validate ClusterId on Raft command application (piggybacked on MaxVolumeId). - Prevent masters with conflicting ClusterIds from joining/operating together. - Update Telemetry to report the persistent ClusterId. * Refine ClusterID validation based on feedback - Improved error message in cluster_commands.go. - Added ClusterId mismatch check in RaftServer.Recovery. * Handle Raft errors and support Hashicorp Raft for ClusterId - Check for errors when persisting ClusterId in legacy Raft. - Implement ClusterId generation and persistence for Hashicorp Raft leader changes. - Ensure consistent error logging. * Refactor ClusterId validation - Centralize ClusterId mismatch check in Topology.SetClusterId. - Simplify MaxVolumeIdCommand.Apply and RaftServer.Recovery to rely on SetClusterId. * Fix goroutine leak and add timeout - Handle channel closure in Hashicorp Raft leader listener. - Add timeout to Raft Apply call to prevent blocking. * Fix deadlock in legacy Raft listener - Wrap ClusterId generation/persistence in a goroutine to avoid blocking the Raft event loop (deadlock). * Rename ClusterId to SystemId - Renamed ClusterId to SystemId across the codebase (protobuf, topology, server, telemetry). - Regenerated telemetry.pb.go with new field. * Rename SystemId to TopologyId - Rename to SystemId was intermediate step. - Final name is TopologyId for the persistent cluster identifier. - Updated protobuf, topology, raft server, master server, and telemetry. * Optimize Hashicorp Raft listener - Integrated TopologyId generation into existing monitorLeaderLoop. - Removed extra goroutine in master_server.go. * Fix optimistic TopologyId update - Removed premature local state update of TopologyId in master_server.go and raft_hashicorp.go. - State is now solely updated via the Raft state machine Apply/Restore methods after consensus. 
* Add explicit log for recovered TopologyId - Added glog.V(0) info log in RaftServer.Recovery to print the recovered TopologyId on startup. * Add Raft barrier to prevent TopologyId race condition - Implement ensureTopologyId helper method - Send no-op MaxVolumeIdCommand to sync Raft log before checking TopologyId - Ensures persisted TopologyId is recovered before generating new one - Prevents race where generation happens during log replay * Serialize TopologyId generation with mutex - Add topologyIdGenLock mutex to MasterServer struct - Wrap ensureTopologyId method with lock to prevent concurrent generation - Fixes race where event listener and manual leadership check both generate IDs - Second caller waits for first to complete and sees the generated ID * Add TopologyId recovery logging to Apply method - Change log level from V(1) to V(0) for visibility - Log 'Recovered TopologyId' when applying from Raft log - Ensures recovery is visible whether from snapshot or log replay - Matches Recovery() method logging for consistency * Fix Raft barrier timing issue - Add 100ms delay after barrier command to ensure log application completes - Add debug logging to track barrier execution and TopologyId state - Return early if barrier command fails - Prevents TopologyId generation before old logs are fully applied * ensure leader * address comments * address comments * redundant * clean up * double check * refactoring * comment
52 lines
1.2 KiB
Protocol Buffer
syntax = "proto3";

package telemetry;

option go_package = "github.com/seaweedfs/seaweedfs/telemetry/proto";
|
|
|
|
// TelemetryData represents cluster-level telemetry information
message TelemetryData {
  // Unique cluster identifier (TopologyId). Per the change history, this is
  // generated once and persisted via the master's Raft state machine so it
  // survives restarts (it is no longer regenerated in memory).
  string topology_id = 1;

  // SeaweedFS version
  string version = 2;

  // Operating system (e.g., "linux/amd64")
  string os = 3;

  // Field 4 reserved (was features).
  // The name is reserved as well so it cannot be reused by JSON/codegen.
  reserved 4;
  reserved "features";

  // Field 5 reserved (was deployment).
  reserved 5;
  reserved "deployment";

  // Number of volume servers in the cluster
  int32 volume_server_count = 6;

  // Total disk usage across all volume servers (in bytes)
  uint64 total_disk_bytes = 7;

  // Total number of volumes in the cluster
  int32 total_volume_count = 8;

  // Number of filer servers in the cluster
  int32 filer_count = 9;

  // Number of broker servers in the cluster
  int32 broker_count = 10;

  // Unix timestamp when the data was collected
  int64 timestamp = 11;
}
|
|
|
|
// TelemetryRequest is sent from SeaweedFS clusters to the telemetry server
message TelemetryRequest {
  // The collected cluster-level telemetry snapshot being reported.
  TelemetryData data = 1;
}
|
|
|
|
// TelemetryResponse is returned by the telemetry server
message TelemetryResponse {
  // Whether the server accepted the telemetry report.
  // NOTE(review): exact semantics are defined by the server implementation,
  // which is not visible here — assumed from the field name.
  bool success = 1;

  // Human-readable status or error detail accompanying the result
  // (presumably empty on success — verify against the server).
  string message = 2;
}