Files
seaweedFS/weed/command/filer_backup.go
Jaehoon Kim 6cf34f2376 Add -filerExcludePathPattern flag and fix nil panic in -filerExcludeFileName (#8756)
* Fix filerExcludeFileName to support directory names and path components

The original implementation only matched excludeFileName against
message.NewEntry.Name, which caused two issues:

1. Nil pointer panic on delete events (NewEntry is nil)
2. Files inside excluded directories were still backed up because
   the parent directory name was not checked

This patch:
- Checks all path components in resp.Directory against the regexp
- Adds nil guard for message.NewEntry before accessing .Name
- Also checks message.OldEntry.Name for rename/delete events

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>

* Add -filerExcludePathPattern flag and fix nil panic in filerExcludeFileName

Separate concerns between two exclude mechanisms:
- filerExcludeFileName: matches entry name only (leaf node)
- filerExcludePathPattern (NEW): matches any path component via regexp,
  so files inside matched directories are also excluded

Also fixes nil pointer panic when filerExcludeFileName encounters
delete events where NewEntry is nil.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>

* Refactor exclude logic: per-side exclusion for rename events, reduce duplication

- Extract isEntryExcluded() to compute exclusion per old/new side,
  so rename events crossing an exclude boundary are handled as
  delete + create instead of being entirely skipped
- Extract compileExcludePattern() to deduplicate regexp compilation
- Replace strings.Split with allocation-free pathContainsMatch()
- Check message.NewParentPath (not just resp.Directory) for new side

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>

* Move regexp compilation out of retry loop to fail fast on config errors

compileExcludePattern for -filerExcludeFileName and -filerExcludePathPattern
are configuration-time validations that will never succeed on retry.
Move them to runFilerBackup before the reconnect loop and use glog.Fatalf
on failure, so invalid patterns are caught immediately at startup instead
of being retried every 1.7 seconds indefinitely.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>

* Add wildcard matching helpers for path and filename exclusion

* Replace regexp exclude patterns with wildcard-based flags, deprecate -filerExcludeFileName

Add -filerExcludeFileNames and -filerExcludePathPatterns flags that accept
comma-separated wildcard patterns (*, ?) using the existing wildcard library.
Mark -filerExcludeFileName as deprecated but keep its regexp behavior.

---------

Co-authored-by: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
Co-authored-by: Chris Lu <chris.lu@gmail.com>
2026-03-26 10:04:06 -07:00

239 lines
9.8 KiB
Go

package command
import (
"errors"
"fmt"
nethttp "net/http"
"regexp"
"strings"
"time"
"github.com/seaweedfs/seaweedfs/weed/glog"
"github.com/seaweedfs/seaweedfs/weed/pb"
"github.com/seaweedfs/seaweedfs/weed/pb/filer_pb"
"github.com/seaweedfs/seaweedfs/weed/replication/source"
"github.com/seaweedfs/seaweedfs/weed/security"
"github.com/seaweedfs/seaweedfs/weed/util"
"github.com/seaweedfs/seaweedfs/weed/util/http"
"github.com/seaweedfs/seaweedfs/weed/util/wildcard"
"google.golang.org/grpc"
)
type FilerBackupOptions struct {
isActivePassive *bool
filer *string
path *string
excludePaths *string
excludeFileName *string // deprecated: use excludeFileNames
excludeFileNames *string
excludePathPatterns *string
debug *bool
proxyByFiler *bool
doDeleteFiles *bool
disableErrorRetry *bool
ignore404Error *bool
timeAgo *time.Duration
retentionDays *int
}
var (
filerBackupOptions FilerBackupOptions
ignorable404ErrString = fmt.Sprintf("%d %s: %s", nethttp.StatusNotFound, nethttp.StatusText(nethttp.StatusNotFound), http.ErrNotFound.Error())
)
func init() {
cmdFilerBackup.Run = runFilerBackup // break init cycle
filerBackupOptions.filer = cmdFilerBackup.Flag.String("filer", "localhost:8888", "filer of one SeaweedFS cluster")
filerBackupOptions.path = cmdFilerBackup.Flag.String("filerPath", "/", "directory to sync on filer")
filerBackupOptions.excludePaths = cmdFilerBackup.Flag.String("filerExcludePaths", "", "exclude directories to sync on filer")
filerBackupOptions.excludeFileName = cmdFilerBackup.Flag.String("filerExcludeFileName", "", "[DEPRECATED: use -filerExcludeFileNames] exclude file names that match the regexp")
filerBackupOptions.excludeFileNames = cmdFilerBackup.Flag.String("filerExcludeFileNames", "", "comma-separated wildcard patterns to exclude file names, e.g., \"*.tmp,._*\"")
filerBackupOptions.excludePathPatterns = cmdFilerBackup.Flag.String("filerExcludePathPatterns", "", "comma-separated wildcard patterns to exclude paths where any component matches, e.g., \".snapshot,temp*\"")
filerBackupOptions.proxyByFiler = cmdFilerBackup.Flag.Bool("filerProxy", false, "read and write file chunks by filer instead of volume servers")
filerBackupOptions.doDeleteFiles = cmdFilerBackup.Flag.Bool("doDeleteFiles", false, "delete files on the destination")
filerBackupOptions.debug = cmdFilerBackup.Flag.Bool("debug", false, "debug mode to print out received files")
filerBackupOptions.timeAgo = cmdFilerBackup.Flag.Duration("timeAgo", 0, "start time before now. \"300ms\", \"1.5h\" or \"2h45m\". Valid time units are \"ns\", \"us\" (or \"µs\"), \"ms\", \"s\", \"m\", \"h\"")
filerBackupOptions.retentionDays = cmdFilerBackup.Flag.Int("retentionDays", 0, "incremental backup retention days")
filerBackupOptions.disableErrorRetry = cmdFilerBackup.Flag.Bool("disableErrorRetry", false, "disables errors retry, only logs will print")
filerBackupOptions.ignore404Error = cmdFilerBackup.Flag.Bool("ignore404Error", true, "ignore 404 errors from filer")
}
var cmdFilerBackup = &Command{
UsageLine: "filer.backup -filer=<filerHost>:<filerPort> ",
Short: "resume-able continuously replicate files from a SeaweedFS cluster to another location defined in replication.toml",
Long: `resume-able continuously replicate files from a SeaweedFS cluster to another location defined in replication.toml
filer.backup listens on filer notifications. If any file is updated, it will fetch the updated content,
and write to the destination. This is to replace filer.replicate command since additional message queue is not needed.
If restarted and "-timeAgo" is not set, the synchronization will resume from the previous checkpoints, persisted every minute.
A fresh sync will start from the earliest metadata logs. To reset the checkpoints, just set "-timeAgo" to a high value.
`,
}
func runFilerBackup(cmd *Command, args []string) bool {
util.LoadSecurityConfiguration()
util.LoadConfiguration("replication", true)
// Compile exclude patterns once before the retry loop — these are
// configuration errors and must not be retried.
reExcludeFileName, err := compileExcludePattern(*filerBackupOptions.excludeFileName, "exclude file name")
if err != nil {
glog.Fatalf("invalid -filerExcludeFileName: %v", err)
}
excludeFileNames := wildcard.CompileWildcardMatchers(*filerBackupOptions.excludeFileNames)
excludePathPatterns := wildcard.CompileWildcardMatchers(*filerBackupOptions.excludePathPatterns)
grpcDialOption := security.LoadClientTLS(util.GetViper(), "grpc.client")
clientId := util.RandomInt32()
var clientEpoch int32
for {
clientEpoch++
err := doFilerBackup(grpcDialOption, &filerBackupOptions, reExcludeFileName, excludeFileNames, excludePathPatterns, clientId, clientEpoch)
if err != nil {
glog.Errorf("backup from %s: %v", *filerBackupOptions.filer, err)
time.Sleep(1747 * time.Millisecond)
}
}
}
const (
BackupKeyPrefix = "backup."
)
func doFilerBackup(grpcDialOption grpc.DialOption, backupOption *FilerBackupOptions, reExcludeFileName *regexp.Regexp, excludeFileNames []*wildcard.WildcardMatcher, excludePathPatterns []*wildcard.WildcardMatcher, clientId int32, clientEpoch int32) error {
// find data sink
dataSink := findSink(util.GetViper())
if dataSink == nil {
return fmt.Errorf("no data sink configured in replication.toml")
}
sourceFiler := pb.ServerAddress(*backupOption.filer)
sourcePath := *backupOption.path
excludePaths := util.StringSplit(*backupOption.excludePaths, ",")
timeAgo := *backupOption.timeAgo
targetPath := dataSink.GetSinkToDirectory()
debug := *backupOption.debug
// get start time for the data sink
startFrom := time.Unix(0, 0)
sinkId := util.HashStringToLong(dataSink.GetName() + dataSink.GetSinkToDirectory())
if timeAgo.Milliseconds() == 0 {
lastOffsetTsNs, err := getOffset(grpcDialOption, sourceFiler, BackupKeyPrefix, int32(sinkId))
if err != nil {
glog.V(0).Infof("starting from %v", startFrom)
} else {
startFrom = time.Unix(0, lastOffsetTsNs)
glog.V(0).Infof("resuming from %v", startFrom)
}
} else {
startFrom = time.Now().Add(-timeAgo)
glog.V(0).Infof("start time is set to %v", startFrom)
}
// create filer sink
filerSource := &source.FilerSource{}
filerSource.DoInitialize(
sourceFiler.ToHttpAddress(),
sourceFiler.ToGrpcAddress(),
sourcePath,
*backupOption.proxyByFiler)
dataSink.SetSourceFiler(filerSource)
var processEventFn func(*filer_pb.SubscribeMetadataResponse) error
if *backupOption.ignore404Error {
processEventFnGenerated := genProcessFunction(sourcePath, targetPath, excludePaths, reExcludeFileName, excludeFileNames, excludePathPatterns, dataSink, *backupOption.doDeleteFiles, debug)
processEventFn = func(resp *filer_pb.SubscribeMetadataResponse) error {
err := processEventFnGenerated(resp)
if err == nil {
return nil
}
if isIgnorable404(err) {
glog.V(0).Infof("got 404 error for %s, ignore it: %s", getSourceKey(resp), err.Error())
return nil
}
return err
}
} else {
processEventFn = genProcessFunction(sourcePath, targetPath, excludePaths, reExcludeFileName, excludeFileNames, excludePathPatterns, dataSink, *backupOption.doDeleteFiles, debug)
}
processEventFnWithOffset := pb.AddOffsetFunc(processEventFn, 3*time.Second, func(counter int64, lastTsNs int64) error {
glog.V(0).Infof("backup %s progressed to %v %0.2f/sec", sourceFiler, time.Unix(0, lastTsNs), float64(counter)/float64(3))
return setOffset(grpcDialOption, sourceFiler, BackupKeyPrefix, int32(sinkId), lastTsNs)
})
if dataSink.IsIncremental() && *filerBackupOptions.retentionDays > 0 {
go func() {
for {
now := time.Now()
time.Sleep(time.Hour * 24)
key := util.Join(targetPath, now.Add(-1*time.Hour*24*time.Duration(*filerBackupOptions.retentionDays)).Format("2006-01-02"))
_ = dataSink.DeleteEntry(util.Join(targetPath, key), true, true, nil)
glog.V(0).Infof("incremental backup delete directory:%s", key)
}
}()
}
prefix := sourcePath
if !strings.HasSuffix(prefix, "/") {
prefix = prefix + "/"
}
eventErrorType := pb.RetryForeverOnError
if *backupOption.disableErrorRetry {
eventErrorType = pb.TrivialOnError
}
metadataFollowOption := &pb.MetadataFollowOption{
ClientName: "backup_" + dataSink.GetName(),
ClientId: clientId,
ClientEpoch: clientEpoch,
SelfSignature: 0,
PathPrefix: prefix,
AdditionalPathPrefixes: nil,
DirectoriesToWatch: nil,
StartTsNs: startFrom.UnixNano(),
StopTsNs: 0,
EventErrorType: eventErrorType,
}
return pb.FollowMetadata(sourceFiler, grpcDialOption, metadataFollowOption, processEventFnWithOffset)
}
func getSourceKey(resp *filer_pb.SubscribeMetadataResponse) string {
if resp == nil || resp.EventNotification == nil {
return ""
}
message := resp.EventNotification
if message.NewEntry != nil {
return string(util.FullPath(message.NewParentPath).Child(message.NewEntry.Name))
}
if message.OldEntry != nil {
return string(util.FullPath(resp.Directory).Child(message.OldEntry.Name))
}
return ""
}
// isIgnorable404 returns true if the error represents a 404/not-found condition
// that should be silently ignored during backup. This covers:
// - errors wrapping http.ErrNotFound (direct volume server 404 via non-S3 sinks)
// - errors containing the "404 Not Found: not found" status string (S3 sink path
// where AWS SDK breaks the errors.Is unwrap chain)
// - LookupFileId or volume-id-not-found errors from the volume id map
func isIgnorable404(err error) bool {
if errors.Is(err, http.ErrNotFound) {
return true
}
errStr := err.Error()
return strings.Contains(errStr, ignorable404ErrString) ||
strings.Contains(errStr, "LookupFileId") ||
(strings.Contains(errStr, "volume id") && strings.Contains(errStr, "not found"))
}