package weed_server

import (
	"context"
	"errors"
	"fmt"
	"strings"
	"sync/atomic"
	"time"

	"google.golang.org/protobuf/proto"

	"github.com/seaweedfs/seaweedfs/weed/filer"
	"github.com/seaweedfs/seaweedfs/weed/glog"
	"github.com/seaweedfs/seaweedfs/weed/pb/filer_pb"
	"github.com/seaweedfs/seaweedfs/weed/stats"
	"github.com/seaweedfs/seaweedfs/weed/util"
	"github.com/seaweedfs/seaweedfs/weed/util/log_buffer"
)

const (
	// MaxUnsyncedEvents: after this many events have been filtered out without
	// sending anything, send an empty notification carrying the latest
	// timestamp so the client can still advance its checkpoint.
	MaxUnsyncedEvents = 1e3
)

// metadataStreamSender is satisfied by both gRPC stream types and pipelinedSender.
type metadataStreamSender interface {
	Send(*filer_pb.SubscribeMetadataResponse) error
}

const (
	// batchBehindThreshold: when an event's timestamp is older than this
	// relative to the wall clock, the sender switches to batch mode for
	// throughput. When events are close to the current time, they are sent
	// one-by-one for low latency.
	batchBehindThreshold = 2 * time.Minute
	maxBatchSize         = 256
)

// pipelinedSender decouples event reading from gRPC delivery by buffering
// messages in a channel. A dedicated goroutine handles stream.Send(), allowing
// the reader to continue reading ahead without waiting for the client to
// acknowledge each event.
//
// When the client declares support for batching AND events are far behind
// the current time (backlog catch-up), multiple events are packed into a
// single stream.Send() using the Events field. Otherwise events are sent
// one-by-one.
type pipelinedSender struct {
	sendCh   chan *filer_pb.SubscribeMetadataResponse
	errCh    chan error
	done     chan struct{}
	canBatch bool // true only if the client set ClientSupportsBatching
}

func newPipelinedSender(stream metadataStreamSender, bufSize int, clientSupportsBatching bool) *pipelinedSender {
	s := &pipelinedSender{
		sendCh:   make(chan *filer_pb.SubscribeMetadataResponse, bufSize),
		errCh:    make(chan error, 1),
		done:     make(chan struct{}),
		canBatch: clientSupportsBatching,
	}
	go s.sendLoop(stream)
	return s
}

func (s *pipelinedSender) sendLoop(stream metadataStreamSender) {
	defer close(s.done)
	for msg := range s.sendCh {
		shouldBatch := s.canBatch && time.Now().UnixNano()-msg.TsNs > int64(batchBehindThreshold)
		if !shouldBatch {
			// Real-time: send immediately for low latency.
			if err := stream.Send(msg); err != nil {
				s.reportErr(err)
				return
			}
			continue
		}

		// Backlog: batch multiple events into one Send for throughput.
		// The first event goes in the top-level fields; additional events
		// go in the Events slice. Old clients ignore the Events field.
		batch := make([]*filer_pb.SubscribeMetadataResponse, 0, maxBatchSize)
		batch = append(batch, msg)
	drain:
		for len(batch) < maxBatchSize {
			select {
			case next, ok := <-s.sendCh:
				if !ok {
					break drain
				}
				batch = append(batch, next)
			default:
				break drain
			}
		}

		var toSend *filer_pb.SubscribeMetadataResponse
		if len(batch) == 1 {
			toSend = batch[0]
		} else {
			// Pack the batch: the first event is the envelope, the rest go in Events.
			toSend = batch[0]
			toSend.Events = batch[1:]
		}
		if err := stream.Send(toSend); err != nil {
			s.reportErr(err)
			return
		}
		if toSend.Events != nil {
			toSend.Events = nil
		}
	}
}

func (s *pipelinedSender) reportErr(err error) {
	select {
	case s.errCh <- err:
	default:
	}
	// Don't drain sendCh here; Send() detects the exit via <-s.done,
	// and the deferred close(s.done) in sendLoop fires after this returns.
}
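
// flattenBatched is a minimal client-side sketch (not used by the server) of
// how a batching-aware subscriber is expected to consume one message produced
// by sendLoop above: the envelope's top-level fields carry the first event,
// and Events, when present, carries the rest in order. The function name is
// illustrative, not part of any client API.
func flattenBatched(resp *filer_pb.SubscribeMetadataResponse) []*filer_pb.SubscribeMetadataResponse {
	if len(resp.Events) == 0 {
		return []*filer_pb.SubscribeMetadataResponse{resp}
	}
	out := make([]*filer_pb.SubscribeMetadataResponse, 0, 1+len(resp.Events))
	out = append(out, resp) // the envelope itself is the first event; read only its top-level fields
	return append(out, resp.Events...)
}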

func (s *pipelinedSender) Send(msg *filer_pb.SubscribeMetadataResponse) error {
	select {
	case s.sendCh <- msg:
		return nil
	case err := <-s.errCh:
		return err
	case <-s.done:
		// Sender goroutine exited (stream error or shutdown).
		select {
		case err := <-s.errCh:
			return err
		default:
			return fmt.Errorf("pipelined sender closed")
		}
	}
}

func (s *pipelinedSender) Close() error {
	close(s.sendCh)
	<-s.done
	select {
	case err := <-s.errCh:
		return err
	default:
		return nil
	}
}
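
// examplePipelinedSenderUse is a compile-only sketch of the lifecycle the
// subscribe handlers follow: construct the sender, push events through Send,
// then Close to flush buffered messages, reap the goroutine, and surface any
// stream error. The function is illustrative and not called anywhere.
func examplePipelinedSenderUse(stream metadataStreamSender, events []*filer_pb.SubscribeMetadataResponse) error {
	sender := newPipelinedSender(stream, 1024, true)
	for _, ev := range events {
		if err := sender.Send(ev); err != nil {
			sender.Close() // still reap the goroutine; the Send error takes precedence
			return err
		}
	}
	return sender.Close()
}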

func (fs *FilerServer) SubscribeMetadata(req *filer_pb.SubscribeMetadataRequest, stream filer_pb.SeaweedFiler_SubscribeMetadataServer) error {

	if fs.filer.MetaAggregator == nil || !fs.filer.MetaAggregator.HasRemotePeers() {
		return fs.SubscribeLocalMetadata(req, stream)
	}

	ctx := stream.Context()
	peerAddress := findClientAddress(ctx, 0)

	isReplacing, alreadyKnown, clientName := fs.addClient("", req.ClientName, peerAddress, req.ClientId, req.ClientEpoch)
	if isReplacing {
		fs.filer.MetaAggregator.ListenersCond.Broadcast() // nudge subscribers that are waiting
	} else if alreadyKnown {
		fs.filer.MetaAggregator.ListenersCond.Broadcast() // nudge subscribers that are waiting
		return fmt.Errorf("duplicated subscription detected for client %s id %d", clientName, req.ClientId)
	}
	defer func() {
		glog.V(0).Infof("disconnect %v subscriber %s clientId:%d", clientName, req.PathPrefix, req.ClientId)
		fs.deleteClient("", clientName, req.ClientId, req.ClientEpoch)
		fs.filer.MetaAggregator.ListenersCond.Broadcast() // nudge subscribers that are waiting
	}()

	lastReadTime := log_buffer.NewMessagePosition(req.SinceNs, -2)
	glog.V(0).Infof(" %v starts to subscribe %s from %+v", clientName, req.PathPrefix, lastReadTime)

	sender := newPipelinedSender(stream, 1024, req.ClientSupportsBatching)
	defer sender.Close()

	// Register for instant notification when new data arrives in the aggregated
	// log buffer. This replaces the 1127ms sleep with event-driven wake-up.
	aggNotifyName := "aggSubscribe:" + clientName
	aggNotifyChan := fs.filer.MetaAggregator.MetaLogBuffer.RegisterSubscriber(aggNotifyName)
	defer fs.filer.MetaAggregator.MetaLogBuffer.UnregisterSubscriber(aggNotifyName)

	eachEventNotificationFn := fs.eachEventNotificationFn(req, sender, clientName)
	eachLogEntryFn := eachLogEntryFn(eachEventNotificationFn)

	var processedTsNs int64
	var readPersistedLogErr error
	var readInMemoryLogErr error
	var isDone bool

	for {
		glog.V(4).Infof("read on disk %v aggregated subscribe %s from %+v", clientName, req.PathPrefix, lastReadTime)

		if req.ClientSupportsMetadataChunks {
			processedTsNs, isDone, readPersistedLogErr = fs.sendLogFileRefs(ctx, stream, lastReadTime, req.UntilNs)
		} else {
			processedTsNs, isDone, readPersistedLogErr = fs.filer.ReadPersistedLogBuffer(lastReadTime, req.UntilNs, eachLogEntryFn)
		}
		if readPersistedLogErr != nil {
			return fmt.Errorf("reading from persisted logs: %w", readPersistedLogErr)
		}
		if isDone {
			return nil
		}

		glog.V(4).Infof("processed to %v: %v", clientName, processedTsNs)

		if processedTsNs != 0 {
			lastReadTime = log_buffer.NewMessagePosition(processedTsNs, -2)
		} else {
			// No data found on disk.
			// Check whether we previously got ResumeFromDiskError from memory, meaning we are in a gap.
			if errors.Is(readInMemoryLogErr, log_buffer.ResumeFromDiskError) {
				// We have a gap: the requested time is older than the earliest memory
				// time, but there is no data on disk. Skip forward to the earliest
				// memory time to avoid an infinite loop.
				earliestTime := fs.filer.MetaAggregator.MetaLogBuffer.GetEarliestTime()
				if !earliestTime.IsZero() && earliestTime.After(lastReadTime.Time) {
					glog.V(3).Infof("gap detected: skipping from %v to earliest memory time %v for %v", lastReadTime.Time, earliestTime, clientName)
					// Position at the earliest time; the time-based reader will include it.
					lastReadTime = log_buffer.NewMessagePosition(earliestTime.UnixNano(), -2)
					readInMemoryLogErr = nil // clear the error since we are skipping forward
				}
			} else {
				// First pass, or no ResumeFromDiskError yet: check the next day for logs.
				nextDayTs := util.GetNextDayTsNano(lastReadTime.Time.UnixNano())
				position := log_buffer.NewMessagePosition(nextDayTs, -2)
				found, err := fs.filer.HasPersistedLogFiles(position)
				if err != nil {
					return fmt.Errorf("checking persisted log files: %w", err)
				}
				if found {
					lastReadTime = position
				}
			}
		}

		glog.V(4).Infof("read in memory %v aggregated subscribe %s from %+v", clientName, req.PathPrefix, lastReadTime)

		lastReadTime, isDone, readInMemoryLogErr = fs.filer.MetaAggregator.MetaLogBuffer.LoopProcessLogData("aggMeta:"+clientName, lastReadTime, req.UntilNs, func() bool {
			select {
			case <-ctx.Done():
				return false
			default:
			}
			return fs.hasClient(req.ClientId, req.ClientEpoch)
		}, eachLogEntryFn)
		if readInMemoryLogErr != nil {
			if errors.Is(readInMemoryLogErr, log_buffer.ResumeFromDiskError) {
				// Memory says the data is too old; read from disk on the next iteration.
				// If disk also has no data (a gap in history), we will skip forward.
				continue
			}
			glog.Errorf("processed to %v: %v", lastReadTime, readInMemoryLogErr)
			if !errors.Is(readInMemoryLogErr, log_buffer.ResumeError) {
				break
			}
		}
		if isDone {
			return nil
		}
		if !fs.hasClient(req.ClientId, req.ClientEpoch) {
			glog.V(0).Infof("client %v is closed", clientName)
			return nil
		}

		// Wait for new data (event-driven instead of 1127ms polling).
		// Drain any stale notification first to avoid a spurious wake-up.
		select {
		case <-aggNotifyChan:
		default:
		}
		select {
		case <-aggNotifyChan:
		case <-ctx.Done():
			return nil
		}
	}

	return readInMemoryLogErr
}
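
// waitForWake is an illustrative sketch (not called by the server) of the
// drain-then-wait idiom at the bottom of the loop above, assuming the
// chan struct{} wake-up channel shape used with RegisterSubscriber: first
// drop any stale notification so an old wake-up is not mistaken for new data,
// then block until a fresh notification or context cancellation.
func waitForWake(ctx context.Context, notifyChan chan struct{}) bool {
	select {
	case <-notifyChan: // drain a stale notification
	default:
	}
	select {
	case <-notifyChan:
		return true // new data arrived
	case <-ctx.Done():
		return false // client disconnected
	}
}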

func (fs *FilerServer) SubscribeLocalMetadata(req *filer_pb.SubscribeMetadataRequest, stream filer_pb.SeaweedFiler_SubscribeLocalMetadataServer) error {

	ctx := stream.Context()
	peerAddress := findClientAddress(ctx, 0)

	// Use a negative client id to differentiate from addClient()/deleteClient() used in SubscribeMetadata().
	req.ClientId = -req.ClientId

	isReplacing, alreadyKnown, clientName := fs.addClient("local", req.ClientName, peerAddress, req.ClientId, req.ClientEpoch)
	if isReplacing {
		fs.listenersCond.Broadcast() // nudge subscribers that are waiting
	} else if alreadyKnown {
		return fmt.Errorf("duplicated local subscription detected for client %s clientId:%d", clientName, req.ClientId)
	}
	defer func() {
		glog.V(0).Infof("disconnect %v local subscriber %s clientId:%d", clientName, req.PathPrefix, req.ClientId)
		fs.deleteClient("local", clientName, req.ClientId, req.ClientEpoch)
		fs.listenersCond.Broadcast() // nudge subscribers that are waiting
	}()

	lastReadTime := log_buffer.NewMessagePosition(req.SinceNs, -2)
	glog.V(0).Infof(" + %v local subscribe %s from %+v clientId:%d", clientName, req.PathPrefix, lastReadTime, req.ClientId)

	sender := newPipelinedSender(stream, 1024, req.ClientSupportsBatching)
	defer sender.Close()

	eachEventNotificationFn := fs.eachEventNotificationFn(req, sender, clientName)
	eachLogEntryFn := eachLogEntryFn(eachEventNotificationFn)

	var processedTsNs int64
	var readPersistedLogErr error
	var readInMemoryLogErr error
	var isDone bool
	var lastCheckedFlushTsNs int64 = -1 // the last flushed time we checked
	var lastDiskReadTsNs int64 = -1     // the last read position we used for a disk read
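
	// The loop below alternates between two sources:
	//   1. disk: persisted log files, re-read only when a new flush or an
	//      advanced read position suggests there is something new there;
	//   2. memory: LocalMetaLogBuffer, tailed until it reports that the
	//      requested position is too old (ResumeFromDiskError), which routes
	//      the next iteration back to disk.
	// When neither source can make progress, the goroutine blocks on
	// listenersCond instead of polling.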
	for {
		// Check whether new data has been flushed to disk since the last check,
		// or whether the read position advanced.
		currentFlushTsNs := fs.filer.LocalMetaLogBuffer.GetLastFlushTsNs()
		currentReadTsNs := lastReadTime.Time.UnixNano()

		// Read from disk if: first time, a new flush was observed, or the read
		// position advanced (draining a backlog).
		shouldReadFromDisk := lastCheckedFlushTsNs == -1 ||
			currentFlushTsNs > lastCheckedFlushTsNs ||
			currentReadTsNs > lastDiskReadTsNs

		if shouldReadFromDisk {
			// Record the position we are about to read from.
			lastDiskReadTsNs = currentReadTsNs
			glog.V(4).Infof("read on disk %v local subscribe %s from %+v (lastFlushed: %v)", clientName, req.PathPrefix, lastReadTime, time.Unix(0, currentFlushTsNs))
			if req.ClientSupportsMetadataChunks {
				processedTsNs, isDone, readPersistedLogErr = fs.sendLogFileRefs(ctx, stream, lastReadTime, req.UntilNs)
			} else {
				processedTsNs, isDone, readPersistedLogErr = fs.filer.ReadPersistedLogBuffer(lastReadTime, req.UntilNs, eachLogEntryFn)
			}
			if readPersistedLogErr != nil {
				glog.V(0).Infof("read on disk %v local subscribe %s from %+v: %v", clientName, req.PathPrefix, lastReadTime, readPersistedLogErr)
				return fmt.Errorf("reading from persisted logs: %w", readPersistedLogErr)
			}
			if isDone {
				return nil
			}

			// Update the last checked flushed time.
			lastCheckedFlushTsNs = currentFlushTsNs

			if processedTsNs != 0 {
				lastReadTime = log_buffer.NewMessagePosition(processedTsNs, -2)
			} else {
				// No data found on disk.
				// Check whether we previously got ResumeFromDiskError from memory, meaning we are in a gap.
				if errors.Is(readInMemoryLogErr, log_buffer.ResumeFromDiskError) {
					// We have a gap: the requested time is older than the earliest memory
					// time, but there is no data on disk. Skip forward to the earliest
					// memory time to avoid an infinite loop.
					earliestTime := fs.filer.LocalMetaLogBuffer.GetEarliestTime()
					if !earliestTime.IsZero() && earliestTime.After(lastReadTime.Time) {
						glog.V(3).Infof("gap detected: skipping from %v to earliest memory time %v for %v", lastReadTime.Time, earliestTime, clientName)
						// Position at the earliest time; the time-based reader will include it.
						lastReadTime = log_buffer.NewMessagePosition(earliestTime.UnixNano(), -2)
						readInMemoryLogErr = nil // clear the error since we are skipping forward
					} else {
						// No memory data yet; wait for new data (event-driven).
						fs.listenersLock.Lock()
						atomic.AddInt64(&fs.listenersWaits, 1)
						fs.listenersCond.Wait()
						atomic.AddInt64(&fs.listenersWaits, -1)
						fs.listenersLock.Unlock()
						continue
					}
				} else {
					// First pass, or no ResumeFromDiskError yet: check the next day for logs.
					nextDayTs := util.GetNextDayTsNano(lastReadTime.Time.UnixNano())
					position := log_buffer.NewMessagePosition(nextDayTs, -2)
					found, err := fs.filer.HasPersistedLogFiles(position)
					if err != nil {
						return fmt.Errorf("checking persisted log files: %w", err)
					}
					if found {
						lastReadTime = position
					}
				}
			}
		}

		glog.V(3).Infof("read in memory %v local subscribe %s from %+v", clientName, req.PathPrefix, lastReadTime)

		lastReadTime, isDone, readInMemoryLogErr = fs.filer.LocalMetaLogBuffer.LoopProcessLogData("localMeta:"+clientName, lastReadTime, req.UntilNs, func() bool {
			select {
			case <-ctx.Done():
				return false
			default:
			}
			return fs.hasClient(req.ClientId, req.ClientEpoch)
		}, eachLogEntryFn)
		if readInMemoryLogErr != nil {
			if errors.Is(readInMemoryLogErr, log_buffer.ResumeFromDiskError) {
				// The memory buffer says the requested time is too old.
				// Retry the disk read if (a) the flush advanced, or (b) the read
				// position advanced (draining a backlog).
				currentFlushTsNs := fs.filer.LocalMetaLogBuffer.GetLastFlushTsNs()
				currentReadTsNs := lastReadTime.Time.UnixNano()
				if currentFlushTsNs > lastCheckedFlushTsNs || currentReadTsNs > lastDiskReadTsNs {
					glog.V(0).Infof("retry disk read %v local subscribe %s (lastFlushed: %v -> %v, readTs: %v -> %v)",
						clientName, req.PathPrefix,
						time.Unix(0, lastCheckedFlushTsNs), time.Unix(0, currentFlushTsNs),
						time.Unix(0, lastDiskReadTsNs), time.Unix(0, currentReadTsNs))
					continue
				}
				// No progress is possible; wait for new data to arrive (event-driven, not polling).
				fs.listenersLock.Lock()
				atomic.AddInt64(&fs.listenersWaits, 1)
				fs.listenersCond.Wait()
				atomic.AddInt64(&fs.listenersWaits, -1)
				fs.listenersLock.Unlock()
				continue
			}
			glog.Errorf("processed to %v: %v", lastReadTime, readInMemoryLogErr)
			if !errors.Is(readInMemoryLogErr, log_buffer.ResumeError) {
				break
			}
		}
		if isDone {
			return nil
		}
		if !fs.hasClient(req.ClientId, req.ClientEpoch) {
			return nil
		}
	}

	return readInMemoryLogErr
}

func eachLogEntryFn(eachEventNotificationFn func(dirPath string, eventNotification *filer_pb.EventNotification, tsNs int64) error) log_buffer.EachLogEntryFuncType {
	return func(logEntry *filer_pb.LogEntry) (bool, error) {
		event := &filer_pb.SubscribeMetadataResponse{}
		if err := proto.Unmarshal(logEntry.Data, event); err != nil {
			glog.Errorf("unexpected unmarshal filer_pb.SubscribeMetadataResponse: %v", err)
			return false, fmt.Errorf("unexpected unmarshal filer_pb.SubscribeMetadataResponse: %w", err)
		}

		if err := eachEventNotificationFn(event.Directory, event.EventNotification, event.TsNs); err != nil {
			return false, err
		}

		return false, nil
	}
}
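
// encodeLogEntry sketches the inverse of eachLogEntryFn's decode step (useful
// mainly in tests; not called by the server): a SubscribeMetadataResponse is
// proto-marshaled into LogEntry.Data, which is exactly what eachLogEntryFn
// unmarshals back out. The helper name is illustrative.
func encodeLogEntry(event *filer_pb.SubscribeMetadataResponse) (*filer_pb.LogEntry, error) {
	data, err := proto.Marshal(event)
	if err != nil {
		return nil, err
	}
	return &filer_pb.LogEntry{Data: data}, nil
}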

// sendLogFileRefs collects persisted log file chunk references and sends them
// to the client so it can read the data directly from volume servers.
// This does zero volume server I/O; it only lists filer store directory entries.
// It sends directly on the gRPC stream (bypassing pipelinedSender) because ref
// messages have TsNs=0 and must not be batched into Events by the sender.
func (fs *FilerServer) sendLogFileRefs(ctx context.Context, stream metadataStreamSender, startPosition log_buffer.MessagePosition, stopTsNs int64) (lastTsNs int64, isDone bool, err error) {
	refs, lastTsNs, err := fs.filer.CollectLogFileRefs(ctx, startPosition, stopTsNs)
	if err != nil {
		return 0, false, err
	}
	if len(refs) == 0 {
		return 0, false, nil
	}
	const maxRefsPerMessage = 64
	for i := 0; i < len(refs); i += maxRefsPerMessage {
		end := i + maxRefsPerMessage
		if end > len(refs) {
			end = len(refs)
		}
		if err := stream.Send(&filer_pb.SubscribeMetadataResponse{
			LogFileRefs: refs[i:end],
		}); err != nil {
			return lastTsNs, false, err
		}
	}
	return lastTsNs, false, nil
}

func (fs *FilerServer) eachEventNotificationFn(req *filer_pb.SubscribeMetadataRequest, sender metadataStreamSender, clientName string) func(dirPath string, eventNotification *filer_pb.EventNotification, tsNs int64) error {
	filtered := 0

	return func(dirPath string, eventNotification *filer_pb.EventNotification, tsNs int64) error {
		defer func() {
			if filtered > MaxUnsyncedEvents {
				if err := sender.Send(&filer_pb.SubscribeMetadataResponse{
					EventNotification: &filer_pb.EventNotification{},
					TsNs:              tsNs,
				}); err == nil {
					filtered = 0
				}
			}
		}()

		filtered++

		foundSelf := false
		for _, sig := range eventNotification.Signatures {
			if sig == req.Signature && req.Signature != 0 {
				return nil
			}
			if sig == fs.filer.Signature {
				foundSelf = true
			}
		}
		if !foundSelf {
			eventNotification.Signatures = append(eventNotification.Signatures, fs.filer.Signature)
		}

		// Get the complete path to the file or directory.
		var entryName string
		if eventNotification.OldEntry != nil {
			entryName = eventNotification.OldEntry.Name
		} else if eventNotification.NewEntry != nil {
			entryName = eventNotification.NewEntry.Name
		}
		fullpath := util.Join(dirPath, entryName)

		// Skip the filer's internal meta logs.
		if strings.HasPrefix(fullpath, filer.SystemLogDir) {
			return nil
		}

		message := &filer_pb.SubscribeMetadataResponse{
			Directory:         dirPath,
			EventNotification: eventNotification,
			TsNs:              tsNs,
		}
		if !filer_pb.MetadataEventMatchesSubscription(message, req.PathPrefix, req.PathPrefixes, req.Directories) {
			return nil
		}

		// Record the last sent timestamp for this path prefix.
		stats.FilerServerLastSendTsOfSubscribeGauge.WithLabelValues(fs.option.Host.String(), req.ClientName, req.PathPrefix).Set(float64(tsNs))

		if err := sender.Send(message); err != nil {
			glog.V(0).Infof("=> client %v: %+v", clientName, err)
			return err
		}
		filtered = 0
		return nil
	}
}

func (fs *FilerServer) addClient(prefix string, clientType string, clientAddress string, clientId int32, clientEpoch int32) (isReplacing, alreadyKnown bool, clientName string) {
	clientName = clientType + "@" + clientAddress
	glog.V(0).Infof("+ %v listener %v clientId %v clientEpoch %v", prefix, clientName, clientId, clientEpoch)
	if clientId != 0 {
		fs.knownListenersLock.Lock()
		defer fs.knownListenersLock.Unlock()
		epoch, found := fs.knownListeners[clientId]
		if !found || epoch < clientEpoch {
			fs.knownListeners[clientId] = clientEpoch
			isReplacing = true
		} else {
			alreadyKnown = true
		}
	}
	return
}
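
// Illustrative epoch sequence (a sketch derived from the comparisons in
// addClient, deleteClient, and hasClient):
//
//	addClient(id=7, epoch=1)    -> isReplacing=true  (first registration)
//	addClient(id=7, epoch=2)    -> isReplacing=true  (reconnect; stored 1 < 2 is superseded)
//	deleteClient(id=7, epoch=1) -> no-op             (stale defer from the old stream; stored 2 > 1)
//	addClient(id=7, epoch=2)    -> alreadyKnown=true (duplicate; caller rejects the subscription)
//	deleteClient(id=7, epoch=2) -> removed           (stored 2 <= 2)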
func (fs *FilerServer) deleteClient(prefix string, clientName string, clientId int32, clientEpoch int32) {
	glog.V(0).Infof("- %v listener %v clientId %v clientEpoch %v", prefix, clientName, clientId, clientEpoch)
	if clientId != 0 {
		fs.knownListenersLock.Lock()
		defer fs.knownListenersLock.Unlock()
		epoch, found := fs.knownListeners[clientId]
		if found && epoch <= clientEpoch {
			delete(fs.knownListeners, clientId)
		}
	}
}

func (fs *FilerServer) hasClient(clientId int32, clientEpoch int32) bool {
	if clientId != 0 {
		fs.knownListenersLock.Lock()
		defer fs.knownListenersLock.Unlock()
		epoch, found := fs.knownListeners[clientId]
		if found && epoch <= clientEpoch {
			return true
		}
	}
	return false
}
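
// clientLivenessFn is an illustrative sketch (not wired into the handlers) of
// the predicate both subscribe loops pass to LoopProcessLogData: keep
// streaming only while the gRPC context is alive and the client's epoch is
// still registered.
func (fs *FilerServer) clientLivenessFn(ctx context.Context, clientId int32, clientEpoch int32) func() bool {
	return func() bool {
		select {
		case <-ctx.Done():
			return false
		default:
		}
		return fs.hasClient(clientId, clientEpoch)
	}
}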