* feat(filer): add lazy directory listing for remote mounts

  Directory listings on remote mounts previously only queried the local filer store. With lazy mounts the listing was empty; with eager mounts it went stale over time.

  Add on-demand directory listing that fetches from remote and caches results with a 5-minute TTL:
  - Add `ListDirectory` to `RemoteStorageClient` interface (delimiter-based, single-level listing, separate from recursive `Traverse`)
  - Implement in S3, GCS, and Azure backends using each platform's hierarchical listing API
  - Add `maybeLazyListFromRemote` to filer: before each directory listing, check if the directory is under a remote mount with an expired cache, fetch from remote, persist entries to the local store, then let existing listing logic run on the populated store
  - Use singleflight to deduplicate concurrent requests for the same directory
  - Skip local-only entries (no RemoteEntry) to avoid overwriting unsynced uploads
  - Errors are logged and swallowed (availability over consistency)

* refactor: extract xattr key to constant xattrRemoteListingSyncedAt

* feat: make listing cache TTL configurable per mount via listing_cache_ttl_seconds

  Add listing_cache_ttl_seconds field to RemoteStorageLocation protobuf. When 0 (default), lazy directory listing is disabled for that mount. When >0, enables on-demand directory listing with the specified TTL. Expose as -listingCacheTTL flag on remote.mount command.

* refactor: address review feedback for lazy directory listing
  - Add context.Context to ListDirectory interface and all implementations
  - Capture startTime before remote call for accurate TTL tracking
  - Simplify S3 ListDirectory using ListObjectsV2PagesWithContext
  - Make maybeLazyListFromRemote return void (errors always swallowed)
  - Remove redundant trailing-slash path manipulation in caller
  - Update tests to match new signatures

* When an existing entry has Remote != nil, we should merge remote metadata into it rather than replacing it.
* fix(gcs): wrap ListDirectory iterator error with context

  The raw iterator error was returned without bucket/path context, making it harder to debug. Wrap it consistently with the S3 pattern.

* fix(s3): guard against nil pointer dereference in Traverse and ListDirectory

  Some S3-compatible backends may return nil for LastModified, Size, or ETag fields. Check for nil before dereferencing to prevent panics.

* fix(filer): remove blanket 2-minute timeout from lazy listing context

  Individual SDK operations (S3, GCS, Azure) already have per-request timeouts and retry policies. The blanket timeout could cut off large directory listings mid-operation even though individual pages were succeeding.

* fix(filer): preserve trace context in lazy listing with WithoutCancel

  Use context.WithoutCancel(ctx) instead of context.Background() so trace/span values from the incoming request are retained for distributed tracing, while still decoupling cancellation.

* fix(filer): use Store.FindEntry for internal lookups, add Uid/Gid to files, fix updateDirectoryListingSyncedAt
  - Use f.Store.FindEntry instead of f.FindEntry for staleness check and child lookups to avoid unnecessary lazy-fetch overhead
  - Set OS_UID/OS_GID on new file entries for consistency with directories
  - In updateDirectoryListingSyncedAt, use Store.UpdateEntry for existing directories instead of CreateEntry to avoid deleteChunksIfNotNew and NotifyUpdateEvent side effects

* fix(filer): distinguish not-found from store errors in lazy listing

  Previously, any error from Store.FindEntry was treated as "not found," which could cause entry recreation/overwrite on transient DB failures. Now check for filer_pb.ErrNotFound explicitly and skip entries or bail out on real store errors.

* refactor(filer): use errors.Is for ErrNotFound comparisons
322 lines
9.1 KiB
Go
322 lines
9.1 KiB
Go
package gcs
|
|
|
|
import (
|
|
"context"
|
|
"errors"
|
|
"fmt"
|
|
"io"
|
|
"os"
|
|
"reflect"
|
|
"strings"
|
|
"time"
|
|
|
|
"cloud.google.com/go/storage"
|
|
"github.com/seaweedfs/seaweedfs/weed/glog"
|
|
"github.com/seaweedfs/seaweedfs/weed/pb/filer_pb"
|
|
"github.com/seaweedfs/seaweedfs/weed/pb/remote_pb"
|
|
"github.com/seaweedfs/seaweedfs/weed/remote_storage"
|
|
"github.com/seaweedfs/seaweedfs/weed/util"
|
|
"golang.org/x/oauth2"
|
|
"golang.org/x/oauth2/google"
|
|
"google.golang.org/api/iterator"
|
|
"google.golang.org/api/option"
|
|
)
|
|
|
|
func init() {
|
|
remote_storage.RemoteStorageClientMakers["gcs"] = new(gcsRemoteStorageMaker)
|
|
}
|
|
|
|
// gcsRemoteStorageMaker builds GCS-backed remote storage clients (see Make).
// It carries no state of its own.
type gcsRemoteStorageMaker struct{}
|
|
|
|
// HasBucket always reports true for the GCS maker.
func (s gcsRemoteStorageMaker) HasBucket() bool {
	return true
}
|
|
|
|
func (s gcsRemoteStorageMaker) Make(conf *remote_pb.RemoteConf) (remote_storage.RemoteStorageClient, error) {
|
|
client := &gcsRemoteStorageClient{
|
|
conf: conf,
|
|
}
|
|
|
|
googleApplicationCredentials := conf.GcsGoogleApplicationCredentials
|
|
|
|
if googleApplicationCredentials == "" {
|
|
if creds, found := os.LookupEnv("GOOGLE_APPLICATION_CREDENTIALS"); found {
|
|
googleApplicationCredentials = creds
|
|
} else {
|
|
glog.Warningf("no GOOGLE_APPLICATION_CREDENTIALS env variable found, falling back to Application Default Credentials")
|
|
}
|
|
}
|
|
|
|
projectID := conf.GcsProjectId
|
|
if projectID == "" {
|
|
if pid, found := os.LookupEnv("GOOGLE_CLOUD_PROJECT"); found {
|
|
projectID = pid
|
|
} else {
|
|
glog.Warningf("need to specify GOOGLE_CLOUD_PROJECT env variable")
|
|
}
|
|
}
|
|
|
|
var clientOpts []option.ClientOption
|
|
|
|
if googleApplicationCredentials != "" {
|
|
googleApplicationCredentials = util.ResolvePath(googleApplicationCredentials)
|
|
var data []byte
|
|
var err error
|
|
if strings.HasPrefix(googleApplicationCredentials, "{") {
|
|
data = []byte(googleApplicationCredentials)
|
|
} else {
|
|
data, err = os.ReadFile(googleApplicationCredentials)
|
|
if err != nil {
|
|
return nil, fmt.Errorf("failed to read credentials file %s: %w", googleApplicationCredentials, err)
|
|
}
|
|
}
|
|
creds, err := google.CredentialsFromJSON(context.Background(), data, storage.ScopeFullControl)
|
|
if err != nil {
|
|
return nil, fmt.Errorf("failed to parse credentials: %w", err)
|
|
}
|
|
httpClient := oauth2.NewClient(context.Background(), creds.TokenSource)
|
|
clientOpts = append(clientOpts, option.WithHTTPClient(httpClient), option.WithoutAuthentication())
|
|
}
|
|
|
|
c, err := storage.NewClient(context.Background(), clientOpts...)
|
|
if err != nil {
|
|
return nil, fmt.Errorf("failed to create client: %w", err)
|
|
}
|
|
|
|
client.client = c
|
|
client.projectID = projectID
|
|
return client, nil
|
|
}
|
|
|
|
// gcsRemoteStorageClient is the remote storage client backed by Google Cloud
// Storage. Instances are built by gcsRemoteStorageMaker.Make.
type gcsRemoteStorageClient struct {
	// conf is the remote configuration; conf.Name is stamped on returned
	// RemoteEntry values as StorageName.
	conf *remote_pb.RemoteConf
	// client is the underlying GCS client.
	client *storage.Client
	// projectID is required by ListBuckets and CreateBucket; it may be empty.
	projectID string
}
|
|
|
|
// Compile-time check that *gcsRemoteStorageClient implements
// remote_storage.RemoteStorageClient.
var _ remote_storage.RemoteStorageClient = (*gcsRemoteStorageClient)(nil)
|
|
|
|
func (gcs *gcsRemoteStorageClient) Traverse(loc *remote_pb.RemoteStorageLocation, visitFn remote_storage.VisitFunc) (err error) {
|
|
|
|
pathKey := loc.Path[1:]
|
|
|
|
objectIterator := gcs.client.Bucket(loc.Bucket).Objects(context.Background(), &storage.Query{
|
|
Delimiter: "",
|
|
Prefix: pathKey,
|
|
Versions: false,
|
|
})
|
|
|
|
var objectAttr *storage.ObjectAttrs
|
|
for err == nil {
|
|
objectAttr, err = objectIterator.Next()
|
|
if err != nil {
|
|
if err == iterator.Done {
|
|
return nil
|
|
}
|
|
return err
|
|
}
|
|
|
|
key := objectAttr.Name
|
|
key = "/" + key
|
|
dir, name := util.FullPath(key).DirAndName()
|
|
err = visitFn(dir, name, false, &filer_pb.RemoteEntry{
|
|
RemoteMtime: objectAttr.Updated.Unix(),
|
|
RemoteSize: objectAttr.Size,
|
|
RemoteETag: objectAttr.Etag,
|
|
StorageName: gcs.conf.Name,
|
|
})
|
|
}
|
|
return
|
|
}
|
|
|
|
// defaultGCSOpTimeout is the timeout applied to GCS operations that create
// their own context; StatFile uses it.
const defaultGCSOpTimeout = 30 * time.Second
|
|
|
|
func (gcs *gcsRemoteStorageClient) ListDirectory(ctx context.Context, loc *remote_pb.RemoteStorageLocation, visitFn remote_storage.VisitFunc) (err error) {
|
|
pathKey := loc.Path[1:]
|
|
if pathKey != "" && !strings.HasSuffix(pathKey, "/") {
|
|
pathKey += "/"
|
|
}
|
|
|
|
objectIterator := gcs.client.Bucket(loc.Bucket).Objects(ctx, &storage.Query{
|
|
Delimiter: "/",
|
|
Prefix: pathKey,
|
|
Versions: false,
|
|
})
|
|
|
|
for {
|
|
objectAttr, iterErr := objectIterator.Next()
|
|
if iterErr != nil {
|
|
if iterErr == iterator.Done {
|
|
return nil
|
|
}
|
|
return fmt.Errorf("list directory %s%s: %w", loc.Bucket, loc.Path, iterErr)
|
|
}
|
|
|
|
if objectAttr.Prefix != "" {
|
|
// Common prefix → subdirectory
|
|
dirKey := "/" + strings.TrimSuffix(objectAttr.Prefix, "/")
|
|
dir, name := util.FullPath(dirKey).DirAndName()
|
|
if err = visitFn(dir, name, true, nil); err != nil {
|
|
return err
|
|
}
|
|
} else {
|
|
key := "/" + objectAttr.Name
|
|
if strings.HasSuffix(key, "/") {
|
|
continue // skip directory markers
|
|
}
|
|
dir, name := util.FullPath(key).DirAndName()
|
|
if err = visitFn(dir, name, false, &filer_pb.RemoteEntry{
|
|
RemoteMtime: objectAttr.Updated.Unix(),
|
|
RemoteSize: objectAttr.Size,
|
|
RemoteETag: objectAttr.Etag,
|
|
StorageName: gcs.conf.Name,
|
|
}); err != nil {
|
|
return err
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
func (gcs *gcsRemoteStorageClient) StatFile(loc *remote_pb.RemoteStorageLocation) (remoteEntry *filer_pb.RemoteEntry, err error) {
|
|
key := loc.Path[1:]
|
|
ctx, cancel := context.WithTimeout(context.Background(), defaultGCSOpTimeout)
|
|
defer cancel()
|
|
attr, err := gcs.client.Bucket(loc.Bucket).Object(key).Attrs(ctx)
|
|
if err != nil {
|
|
if errors.Is(err, storage.ErrObjectNotExist) {
|
|
return nil, remote_storage.ErrRemoteObjectNotFound
|
|
}
|
|
return nil, fmt.Errorf("stat gcs %s%s: %w", loc.Bucket, loc.Path, err)
|
|
}
|
|
return &filer_pb.RemoteEntry{
|
|
StorageName: gcs.conf.Name,
|
|
RemoteMtime: attr.Updated.Unix(),
|
|
RemoteSize: attr.Size,
|
|
RemoteETag: attr.Etag,
|
|
}, nil
|
|
}
|
|
|
|
func (gcs *gcsRemoteStorageClient) ReadFile(loc *remote_pb.RemoteStorageLocation, offset int64, size int64) (data []byte, err error) {
|
|
|
|
key := loc.Path[1:]
|
|
rangeReader, readErr := gcs.client.Bucket(loc.Bucket).Object(key).NewRangeReader(context.Background(), offset, size)
|
|
if readErr != nil {
|
|
return nil, readErr
|
|
}
|
|
data, err = io.ReadAll(rangeReader)
|
|
|
|
if err != nil {
|
|
return nil, fmt.Errorf("failed to download file %s%s: %v", loc.Bucket, loc.Path, err)
|
|
}
|
|
|
|
return
|
|
}
|
|
|
|
// WriteDirectory is a no-op for GCS and always returns nil.
func (gcs *gcsRemoteStorageClient) WriteDirectory(loc *remote_pb.RemoteStorageLocation, entry *filer_pb.Entry) (err error) {
	return nil
}
|
|
|
|
// RemoveDirectory is a no-op for GCS and always returns nil.
func (gcs *gcsRemoteStorageClient) RemoveDirectory(loc *remote_pb.RemoteStorageLocation) (err error) {
	return nil
}
|
|
|
|
func (gcs *gcsRemoteStorageClient) WriteFile(loc *remote_pb.RemoteStorageLocation, entry *filer_pb.Entry, reader io.Reader) (remoteEntry *filer_pb.RemoteEntry, err error) {
|
|
|
|
key := loc.Path[1:]
|
|
|
|
metadata := toMetadata(entry.Extended)
|
|
wc := gcs.client.Bucket(loc.Bucket).Object(key).NewWriter(context.Background())
|
|
wc.Metadata = metadata
|
|
if _, err = io.Copy(wc, reader); err != nil {
|
|
return nil, fmt.Errorf("upload to gcs %s/%s%s: %v", loc.Name, loc.Bucket, loc.Path, err)
|
|
}
|
|
if err = wc.Close(); err != nil {
|
|
return nil, fmt.Errorf("close gcs %s/%s%s: %v", loc.Name, loc.Bucket, loc.Path, err)
|
|
}
|
|
|
|
// read back the remote entry
|
|
return gcs.readFileRemoteEntry(loc)
|
|
|
|
}
|
|
|
|
func (gcs *gcsRemoteStorageClient) readFileRemoteEntry(loc *remote_pb.RemoteStorageLocation) (*filer_pb.RemoteEntry, error) {
|
|
return gcs.StatFile(loc)
|
|
}
|
|
|
|
// toMetadata converts extended attributes into string-valued object metadata.
// Keys that start with "X-" (case-sensitive) are left out; every other value
// is converted from bytes to a string. The result is never nil.
func toMetadata(attributes map[string][]byte) map[string]string {
	result := make(map[string]string)
	for name, raw := range attributes {
		if !strings.HasPrefix(name, "X-") {
			result[name] = string(raw)
		}
	}
	return result
}
|
|
|
|
func (gcs *gcsRemoteStorageClient) UpdateFileMetadata(loc *remote_pb.RemoteStorageLocation, oldEntry *filer_pb.Entry, newEntry *filer_pb.Entry) (err error) {
|
|
if reflect.DeepEqual(oldEntry.Extended, newEntry.Extended) {
|
|
return nil
|
|
}
|
|
metadata := toMetadata(newEntry.Extended)
|
|
|
|
key := loc.Path[1:]
|
|
|
|
if len(metadata) > 0 {
|
|
_, err = gcs.client.Bucket(loc.Bucket).Object(key).Update(context.Background(), storage.ObjectAttrsToUpdate{
|
|
Metadata: metadata,
|
|
})
|
|
} else {
|
|
// no way to delete the metadata yet
|
|
}
|
|
|
|
return
|
|
}
|
|
func (gcs *gcsRemoteStorageClient) DeleteFile(loc *remote_pb.RemoteStorageLocation) (err error) {
|
|
key := loc.Path[1:]
|
|
if err = gcs.client.Bucket(loc.Bucket).Object(key).Delete(context.Background()); err != nil {
|
|
return fmt.Errorf("gcs delete %s%s: %v", loc.Bucket, key, err)
|
|
}
|
|
return
|
|
}
|
|
|
|
func (gcs *gcsRemoteStorageClient) ListBuckets() (buckets []*remote_storage.Bucket, err error) {
|
|
if gcs.projectID == "" {
|
|
return nil, fmt.Errorf("gcs project id or GOOGLE_CLOUD_PROJECT env variable not set")
|
|
}
|
|
iter := gcs.client.Buckets(context.Background(), gcs.projectID)
|
|
for {
|
|
b, err := iter.Next()
|
|
if err == iterator.Done {
|
|
break
|
|
}
|
|
if err != nil {
|
|
return buckets, err
|
|
}
|
|
buckets = append(buckets, &remote_storage.Bucket{
|
|
Name: b.Name,
|
|
CreatedAt: b.Created,
|
|
})
|
|
}
|
|
return
|
|
}
|
|
|
|
func (gcs *gcsRemoteStorageClient) CreateBucket(name string) (err error) {
|
|
if gcs.projectID == "" {
|
|
return fmt.Errorf("gcs project id or GOOGLE_CLOUD_PROJECT env variable not set")
|
|
}
|
|
err = gcs.client.Bucket(name).Create(context.Background(), gcs.projectID, &storage.BucketAttrs{})
|
|
if err != nil {
|
|
return fmt.Errorf("create bucket %s: %v", name, err)
|
|
}
|
|
return
|
|
}
|
|
|
|
func (gcs *gcsRemoteStorageClient) DeleteBucket(name string) (err error) {
|
|
err = gcs.client.Bucket(name).Delete(context.Background())
|
|
if err != nil {
|
|
return fmt.Errorf("delete bucket %s: %v", name, err)
|
|
}
|
|
return
|
|
}
|