* Prevent split-brain: Persistent ClusterID and Join Validation - Persist ClusterId in Raft store to survive restarts. - Validate ClusterId on Raft command application (piggybacked on MaxVolumeId). - Prevent masters with conflicting ClusterIds from joining/operating together. - Update Telemetry to report the persistent ClusterId. * Refine ClusterID validation based on feedback - Improved error message in cluster_commands.go. - Added ClusterId mismatch check in RaftServer.Recovery. * Handle Raft errors and support Hashicorp Raft for ClusterId - Check for errors when persisting ClusterId in legacy Raft. - Implement ClusterId generation and persistence for Hashicorp Raft leader changes. - Ensure consistent error logging. * Refactor ClusterId validation - Centralize ClusterId mismatch check in Topology.SetClusterId. - Simplify MaxVolumeIdCommand.Apply and RaftServer.Recovery to rely on SetClusterId. * Fix goroutine leak and add timeout - Handle channel closure in Hashicorp Raft leader listener. - Add timeout to Raft Apply call to prevent blocking. * Fix deadlock in legacy Raft listener - Wrap ClusterId generation/persistence in a goroutine to avoid blocking the Raft event loop (deadlock). * Rename ClusterId to SystemId - Renamed ClusterId to SystemId across the codebase (protobuf, topology, server, telemetry). - Regenerated telemetry.pb.go with new field. * Rename SystemId to TopologyId - Rename to SystemId was intermediate step. - Final name is TopologyId for the persistent cluster identifier. - Updated protobuf, topology, raft server, master server, and telemetry. * Optimize Hashicorp Raft listener - Integrated TopologyId generation into existing monitorLeaderLoop. - Removed extra goroutine in master_server.go. * Fix optimistic TopologyId update - Removed premature local state update of TopologyId in master_server.go and raft_hashicorp.go. - State is now solely updated via the Raft state machine Apply/Restore methods after consensus. 
* Add explicit log for recovered TopologyId - Added glog.V(0) info log in RaftServer.Recovery to print the recovered TopologyId on startup. * Add Raft barrier to prevent TopologyId race condition - Implement ensureTopologyId helper method - Send no-op MaxVolumeIdCommand to sync Raft log before checking TopologyId - Ensures persisted TopologyId is recovered before generating new one - Prevents race where generation happens during log replay * Serialize TopologyId generation with mutex - Add topologyIdGenLock mutex to MasterServer struct - Wrap ensureTopologyId method with lock to prevent concurrent generation - Fixes race where event listener and manual leadership check both generate IDs - Second caller waits for first to complete and sees the generated ID * Add TopologyId recovery logging to Apply method - Change log level from V(1) to V(0) for visibility - Log 'Recovered TopologyId' when applying from Raft log - Ensures recovery is visible whether from snapshot or log replay - Matches Recovery() method logging for consistency * Fix Raft barrier timing issue - Add 100ms delay after barrier command to ensure log application completes - Add debug logging to track barrier execution and TopologyId state - Return early if barrier command fails - Prevents TopologyId generation before old logs are fully applied * ensure leader * address comments * address comments * redundant * clean up * double check * refactoring * comment
120 lines
2.7 KiB
Go
120 lines
2.7 KiB
Go
package telemetry
|
|
|
|
import (
	"bytes"
	"fmt"
	"io"
	"net/http"
	"sync"
	"time"

	"github.com/google/uuid"
	"github.com/seaweedfs/seaweedfs/telemetry/proto"
	"github.com/seaweedfs/seaweedfs/weed/glog"
	protobuf "google.golang.org/protobuf/proto"
)
|
|
|
|
// Client reports anonymous usage telemetry to a remote collector over HTTP
// using protobuf-encoded payloads. The zero value is not usable; construct
// with NewClient. Safe for concurrent use.
type Client struct {
	url        string       // telemetry endpoint URL; empty disables sending
	enabled    bool         // user opt-in flag; combined with url in IsEnabled
	instanceID string       // per-process random UUID, held in memory only
	httpClient *http.Client // shared client with a 10s request timeout
	topologyId string       // persistent cluster identifier, set via SetTopologyId
	sync.RWMutex            // guards topologyId (the only mutable field)
}
|
|
|
|
// NewClient creates a new telemetry client
|
|
func NewClient(url string, enabled bool) *Client {
|
|
return &Client{
|
|
url: url,
|
|
enabled: enabled,
|
|
instanceID: uuid.New().String(), // Generate UUID in memory only
|
|
httpClient: &http.Client{
|
|
Timeout: 10 * time.Second,
|
|
},
|
|
}
|
|
}
|
|
|
|
func (c *Client) SetTopologyId(topologyId string) {
|
|
c.Lock()
|
|
defer c.Unlock()
|
|
c.topologyId = topologyId
|
|
}
|
|
|
|
// IsEnabled returns whether telemetry is enabled
|
|
func (c *Client) IsEnabled() bool {
|
|
return c.enabled && c.url != ""
|
|
}
|
|
|
|
// SendTelemetry sends telemetry data synchronously using protobuf format
|
|
func (c *Client) SendTelemetry(data *proto.TelemetryData) error {
|
|
if !c.IsEnabled() {
|
|
return nil
|
|
}
|
|
|
|
// Work on a copy to avoid mutating the caller's TelemetryData
|
|
clonedData, ok := protobuf.Clone(data).(*proto.TelemetryData)
|
|
if !ok {
|
|
return fmt.Errorf("failed to clone telemetry data")
|
|
}
|
|
|
|
// Set the topology ID
|
|
c.RLock()
|
|
if c.topologyId != "" {
|
|
clonedData.TopologyId = c.topologyId
|
|
}
|
|
c.RUnlock()
|
|
|
|
return c.sendProtobuf(clonedData)
|
|
}
|
|
|
|
// SendTelemetryAsync sends telemetry data asynchronously
|
|
func (c *Client) SendTelemetryAsync(data *proto.TelemetryData) {
|
|
if !c.IsEnabled() {
|
|
return
|
|
}
|
|
|
|
go func() {
|
|
if err := c.SendTelemetry(data); err != nil {
|
|
glog.V(1).Infof("Failed to send telemetry: %v", err)
|
|
}
|
|
}()
|
|
}
|
|
|
|
// sendProtobuf sends data using protobuf format
|
|
func (c *Client) sendProtobuf(data *proto.TelemetryData) error {
|
|
req := &proto.TelemetryRequest{
|
|
Data: data,
|
|
}
|
|
|
|
body, err := protobuf.Marshal(req)
|
|
if err != nil {
|
|
return fmt.Errorf("failed to marshal protobuf: %w", err)
|
|
}
|
|
|
|
httpReq, err := http.NewRequest("POST", c.url, bytes.NewBuffer(body))
|
|
if err != nil {
|
|
return fmt.Errorf("failed to create request: %w", err)
|
|
}
|
|
|
|
httpReq.Header.Set("Content-Type", "application/x-protobuf")
|
|
httpReq.Header.Set("User-Agent", fmt.Sprintf("SeaweedFS/%s", data.Version))
|
|
|
|
resp, err := c.httpClient.Do(httpReq)
|
|
if err != nil {
|
|
return fmt.Errorf("failed to send request: %w", err)
|
|
}
|
|
defer resp.Body.Close()
|
|
|
|
if resp.StatusCode != http.StatusOK {
|
|
return fmt.Errorf("server returned status %d", resp.StatusCode)
|
|
}
|
|
|
|
glog.V(2).Infof("Telemetry sent successfully via protobuf")
|
|
return nil
|
|
}
|
|
|
|
// GetInstanceID returns the in-memory instance UUID generated in
// NewClient. The field is never written after construction, so no
// locking is needed here.
func (c *Client) GetInstanceID() string {
	return c.instanceID
}
|