Files
seaweedFS/weed/telemetry/client.go
Chris Lu 753e1db096 Prevent split-brain: Persistent ClusterID and Join Validation (#8022)
* Prevent split-brain: Persistent ClusterID and Join Validation

- Persist ClusterId in Raft store to survive restarts.
- Validate ClusterId on Raft command application (piggybacked on MaxVolumeId).
- Prevent masters with conflicting ClusterIds from joining/operating together.
- Update Telemetry to report the persistent ClusterId.

* Refine ClusterID validation based on feedback

- Improved error message in cluster_commands.go.
- Added ClusterId mismatch check in RaftServer.Recovery.

* Handle Raft errors and support Hashicorp Raft for ClusterId

- Check for errors when persisting ClusterId in legacy Raft.
- Implement ClusterId generation and persistence for Hashicorp Raft leader changes.
- Ensure consistent error logging.

* Refactor ClusterId validation

- Centralize ClusterId mismatch check in Topology.SetClusterId.
- Simplify MaxVolumeIdCommand.Apply and RaftServer.Recovery to rely on SetClusterId.

* Fix goroutine leak and add timeout

- Handle channel closure in Hashicorp Raft leader listener.
- Add timeout to Raft Apply call to prevent blocking.

* Fix deadlock in legacy Raft listener

- Wrap ClusterId generation/persistence in a goroutine to avoid blocking the Raft event loop (deadlock).

* Rename ClusterId to SystemId

- Renamed ClusterId to SystemId across the codebase (protobuf, topology, server, telemetry).
- Regenerated telemetry.pb.go with new field.

* Rename SystemId to TopologyId

- The rename to SystemId was an intermediate step.
- Final name is TopologyId for the persistent cluster identifier.
- Updated protobuf, topology, raft server, master server, and telemetry.

* Optimize Hashicorp Raft listener

- Integrated TopologyId generation into existing monitorLeaderLoop.
- Removed extra goroutine in master_server.go.

* Fix optimistic TopologyId update

- Removed premature local state update of TopologyId in master_server.go and raft_hashicorp.go.
- State is now solely updated via the Raft state machine Apply/Restore methods after consensus.

* Add explicit log for recovered TopologyId

- Added glog.V(0) info log in RaftServer.Recovery to print the recovered TopologyId on startup.

* Add Raft barrier to prevent TopologyId race condition

- Implement ensureTopologyId helper method
- Send no-op MaxVolumeIdCommand to sync Raft log before checking TopologyId
- Ensures persisted TopologyId is recovered before generating new one
- Prevents race where generation happens during log replay

* Serialize TopologyId generation with mutex

- Add topologyIdGenLock mutex to MasterServer struct
- Wrap ensureTopologyId method with lock to prevent concurrent generation
- Fixes race where event listener and manual leadership check both generate IDs
- Second caller waits for first to complete and sees the generated ID

* Add TopologyId recovery logging to Apply method

- Change log level from V(1) to V(0) for visibility
- Log 'Recovered TopologyId' when applying from Raft log
- Ensures recovery is visible whether from snapshot or log replay
- Matches Recovery() method logging for consistency

* Fix Raft barrier timing issue

- Add 100ms delay after barrier command to ensure log application completes
- Add debug logging to track barrier execution and TopologyId state
- Return early if barrier command fails
- Prevents TopologyId generation before old logs are fully applied

* ensure leader

* address comments

* address comments

* redundant

* clean up

* double check

* refactoring

* comment
2026-01-18 14:02:34 -08:00

120 lines
2.7 KiB
Go

package telemetry
import (
"bytes"
"fmt"
"net/http"
"sync"
"time"
"github.com/google/uuid"
"github.com/seaweedfs/seaweedfs/telemetry/proto"
"github.com/seaweedfs/seaweedfs/weed/glog"
protobuf "google.golang.org/protobuf/proto"
)
// Client posts telemetry data to a configured URL over HTTP.
//
// The embedded RWMutex guards topologyId, which is written by SetTopologyId
// and read while a telemetry message is being prepared.
type Client struct {
	// url is the endpoint telemetry requests are POSTed to.
	url string
	// enabled is the on/off switch given to NewClient; IsEnabled additionally
	// requires url to be non-empty.
	enabled bool
	// instanceID is the identifier generated by NewClient for this Client.
	instanceID string
	// httpClient performs the outgoing HTTP requests.
	httpClient *http.Client
	// topologyId, when non-empty, is copied into every outgoing message.
	topologyId string

	sync.RWMutex
}
// NewClient builds a telemetry Client for the given endpoint.
//
// url and enabled are stored as given. Every call generates a fresh instance
// ID and sets up an HTTP client with a 10-second timeout.
func NewClient(url string, enabled bool) *Client {
	httpClient := &http.Client{Timeout: 10 * time.Second}

	client := &Client{
		url:        url,
		enabled:    enabled,
		httpClient: httpClient,
	}
	// The UUID is generated here and only held in memory on the Client.
	client.instanceID = uuid.New().String()
	return client
}
// SetTopologyId stores topologyId on the client while holding the write lock,
// replacing any previously stored value.
func (c *Client) SetTopologyId(topologyId string) {
	c.Lock()
	c.topologyId = topologyId
	c.Unlock()
}
// IsEnabled reports whether telemetry can be sent: the client must have been
// created with enabled set and must have a non-empty URL.
func (c *Client) IsEnabled() bool {
	if !c.enabled {
		return false
	}
	return c.url != ""
}
// SendTelemetry sends telemetry data synchronously using protobuf format.
//
// It returns nil without doing anything when the client is not enabled.
// Otherwise it clones data, stamps the client's topology ID onto the clone
// (when one has been set) and hands the clone to sendProtobuf. The caller's
// message is never modified.
//
// An error is returned when data is nil, when the clone does not have the
// expected type, or when sendProtobuf fails.
func (c *Client) SendTelemetry(data *proto.TelemetryData) error {
	if !c.IsEnabled() {
		return nil
	}

	// Reject nil up front: cloning a nil message does not yield a usable copy,
	// and the field accesses further down would dereference a nil pointer.
	// SendTelemetryAsync runs this method on its own goroutine, where such a
	// panic would not be recovered.
	if data == nil {
		return fmt.Errorf("telemetry data is nil")
	}

	// Work on a copy to avoid mutating the caller's TelemetryData.
	clonedData, ok := protobuf.Clone(data).(*proto.TelemetryData)
	if !ok {
		return fmt.Errorf("failed to clone telemetry data")
	}

	// Snapshot the topology ID so the read lock covers only the shared field.
	c.RLock()
	topologyId := c.topologyId
	c.RUnlock()
	if topologyId != "" {
		clonedData.TopologyId = topologyId
	}

	return c.sendProtobuf(clonedData)
}
// SendTelemetryAsync sends telemetry data in the background.
//
// When the client is enabled it starts a goroutine that calls SendTelemetry.
// A failure is only logged at verbosity level 1; nothing is reported back to
// the caller. When the client is not enabled the call does nothing.
func (c *Client) SendTelemetryAsync(data *proto.TelemetryData) {
	if c.IsEnabled() {
		go func() {
			err := c.SendTelemetry(data)
			if err == nil {
				return
			}
			glog.V(1).Infof("Failed to send telemetry: %v", err)
		}()
	}
}
// sendProtobuf wraps data in a TelemetryRequest, marshals it with protobuf and
// POSTs the result to c.url.
//
// It returns an error when data is nil, when marshalling or building the
// request fails, when the HTTP round trip fails, or when the server answers
// with any status other than 200 OK.
func (c *Client) sendProtobuf(data *proto.TelemetryData) error {
	// data.Version is read below for the User-Agent header; fail cleanly
	// instead of panicking on a nil pointer.
	if data == nil {
		return fmt.Errorf("telemetry data is nil")
	}

	req := &proto.TelemetryRequest{
		Data: data,
	}
	body, err := protobuf.Marshal(req)
	if err != nil {
		return fmt.Errorf("failed to marshal protobuf: %w", err)
	}

	httpReq, err := http.NewRequest(http.MethodPost, c.url, bytes.NewBuffer(body))
	if err != nil {
		return fmt.Errorf("failed to create request: %w", err)
	}
	httpReq.Header.Set("Content-Type", "application/x-protobuf")
	httpReq.Header.Set("User-Agent", fmt.Sprintf("SeaweedFS/%s", data.Version))

	resp, err := c.httpClient.Do(httpReq)
	if err != nil {
		return fmt.Errorf("failed to send request: %w", err)
	}
	defer resp.Body.Close()

	if resp.StatusCode != http.StatusOK {
		return fmt.Errorf("server returned status %d", resp.StatusCode)
	}

	glog.V(2).Infof("Telemetry sent successfully via protobuf")
	return nil
}
// GetInstanceID returns the instance ID stored on the client, i.e. the value
// assigned when the Client was constructed.
func (c *Client) GetInstanceID() string {
	id := c.instanceID
	return id
}