Switch empty-folder cleanup to bucket policy (#8292)

* Fix Spark _temporary cleanup and add issue #8285 regression test

* Generalize empty folder cleanup for Spark temp artifacts

* Revert synchronous folder pruning and add cleanup diagnostics

* Add actionable empty-folder cleanup diagnostics

* Fix Spark temp marker cleanup in async folder cleaner

* Fix Spark temp cleanup with implicit directory markers

* Keep explicit directory markers non-implicit

* logging

* more logs

* Switch empty-folder cleanup to bucket policy

* Seaweed-X-Amz-Allow-Empty-Folders

* less logs

* go vet

* less logs

* refactoring
This commit is contained in:
Chris Lu
2026-02-10 18:38:38 -08:00
committed by GitHub
parent 5c365e7090
commit b57429ef2e
16 changed files with 798 additions and 157 deletions

View File

@@ -14,7 +14,7 @@ func setupSparkIssue8234Env(t *testing.T) *TestEnvironment {
}
env.StartSeaweedFS(t)
t.Cleanup(func() { env.Cleanup() })
t.Cleanup(func() { env.Cleanup(t) })
createObjectBucket(t, env, "test")
env.startSparkContainer(t)

View File

@@ -0,0 +1,180 @@
package spark
import (
"context"
"errors"
"fmt"
"strings"
"testing"
"time"
"github.com/aws/aws-sdk-go-v2/aws"
"github.com/aws/aws-sdk-go-v2/credentials"
"github.com/aws/aws-sdk-go-v2/service/s3"
s3types "github.com/aws/aws-sdk-go-v2/service/s3/types"
)
// TestSparkS3TemporaryDirectoryCleanupIssue8285Regression guards against
// issue #8285: after a Spark parquet write that uses the S3A "directory"
// staging committer, no `_temporary` artifacts may remain under the output
// prefix.
//
// The test runs a PySpark script that writes 200 rows to
// s3a://test/issue-8285/output and reads them back, then performs two checks:
//  1. an immediate listing of the prefix must contain no key with a
//     `_temporary` path segment;
//  2. the well-known `_temporary` directory marker keys must stop answering
//     HEAD requests within 35 seconds.
func TestSparkS3TemporaryDirectoryCleanupIssue8285Regression(t *testing.T) {
	if testing.Short() {
		t.Skip("Skipping Spark integration test in short mode")
	}

	env := setupSparkIssue8234Env(t)

	// PySpark job: configure the directory staging committer, overwrite the
	// target with a 12-partition dataset and print a marker with the row count.
	pyScript := `
import pyspark.sql.functions as F
target = "s3a://test/issue-8285/output"
spark.conf.set("spark.hadoop.fs.s3a.committer.name", "directory")
spark.conf.set("spark.hadoop.fs.s3a.committer.magic.enabled", "false")
spark.conf.set("spark.hadoop.fs.s3a.committer.staging.abort.pending.uploads", "true")
spark.conf.set("spark.hadoop.fs.s3a.committer.staging.conflict-mode", "append")
spark.conf.set("spark.hadoop.fs.s3a.committer.staging.tmp.path", "/tmp")
spark.conf.set("spark.hadoop.fs.s3a.directory.marker.retention", "keep")
df = spark.range(0, 200).repartition(12).withColumn("value", F.col("id") * 2)
df.write.format("parquet").mode("overwrite").save(target)
count = spark.read.parquet(target).count()
print("WRITE_COUNT=" + str(count))
`

	exitCode, sparkOutput := runSparkPyScript(t, env.sparkContainer, pyScript, env.s3Port)
	if exitCode != 0 {
		t.Fatalf("Spark script exited with code %d; output:\n%s", exitCode, sparkOutput)
	}
	if !strings.Contains(sparkOutput, "WRITE_COUNT=200") {
		t.Fatalf("expected write/read success marker in output, got:\n%s", sparkOutput)
	}

	// Check 1: nothing listed under the prefix may live below a `_temporary`
	// path segment.
	allKeys := listObjectKeysByPrefix(t, env, "test", "issue-8285/")
	var leftovers []string
	for i := range allKeys {
		if !hasTemporaryPathSegment(allKeys[i]) {
			continue
		}
		leftovers = append(leftovers, allKeys[i])
	}
	if len(leftovers) != 0 {
		t.Fatalf("issue #8285 regression detected: found lingering _temporary artifacts: %v\nall keys: %v", leftovers, allKeys)
	}

	// Check 2: the directory marker keys themselves must disappear.
	// NOTE(review): the 35s window presumably allows the server-side
	// empty-folder cleanup to run asynchronously - confirm against the filer.
	stillPresent := waitForObjectsToDisappear(t, env, "test", []string{
		"issue-8285/output/_temporary/",
		"issue-8285/output/_temporary/0/",
		"issue-8285/output/_temporary/0/_temporary/",
	}, 35*time.Second)
	if len(stillPresent) != 0 {
		t.Fatalf("issue #8285 regression detected: lingering temporary directories: %v", stillPresent)
	}
}
// listObjectKeysByPrefix returns the keys of every object in bucketName whose
// key starts with prefix, following ListObjectsV2 pagination until exhausted.
// Keys are returned in the order the pages deliver them. Any listing error
// fails the test immediately via t.Fatalf.
func listObjectKeysByPrefix(t *testing.T, env *TestEnvironment, bucketName, prefix string) []string {
	t.Helper()

	s3Client := newS3Client(env)
	request := &s3.ListObjectsV2Input{
		Bucket: aws.String(bucketName),
		Prefix: aws.String(prefix),
	}
	paginator := s3.NewListObjectsV2Paginator(s3Client, request)

	var collected []string
	for paginator.HasMorePages() {
		resp, err := paginator.NextPage(context.Background())
		if err != nil {
			t.Fatalf("failed listing objects for prefix %q: %v", prefix, err)
		}
		for i := range resp.Contents {
			collected = append(collected, aws.ToString(resp.Contents[i].Key))
		}
	}
	return collected
}
// headObjectInfo issues a HEAD request for key in bucketName.
//
// Returns:
//   - (true, contentType, nil) when the object exists;
//   - (false, "", nil) when the error is recognised as "object missing",
//     i.e. its message mentions "NotFound" or "NoSuchKey", or it unwraps to
//     *s3types.NotFound;
//   - (false, "", err) for any other failure.
func headObjectInfo(t *testing.T, env *TestEnvironment, bucketName, key string) (bool, string, error) {
	t.Helper()

	request := &s3.HeadObjectInput{
		Bucket: aws.String(bucketName),
		Key:    aws.String(key),
	}
	resp, headErr := newS3Client(env).HeadObject(context.Background(), request)
	if headErr == nil {
		return true, aws.ToString(resp.ContentType), nil
	}

	// Match on the message text as well as the typed error, so a missing
	// object is recognised whichever way the error surfaces it.
	message := headErr.Error()
	var notFound *s3types.NotFound
	missing := strings.Contains(message, "NotFound") ||
		strings.Contains(message, "NoSuchKey") ||
		errors.As(headErr, &notFound)
	if !missing {
		return false, "", headErr
	}
	return false, "", nil
}
// waitForObjectsToDisappear polls HEAD on each of keys in bucketName until
// every key reports "not found" or timeout elapses.
//
// It returns nil when all keys are gone (or keys is empty). Otherwise it
// returns one entry per still-pending key, in the order given by keys. Each
// entry is the key annotated with the last observation: either
// "exists=true, contentType=..." or "head_error=...".
//
// Polling always happens at least once, even for a non-positive timeout, and
// a final poll is made once the deadline is reached, so an object that
// vanished during the last sleep is not reported as lingering.
func waitForObjectsToDisappear(t *testing.T, env *TestEnvironment, bucketName string, keys []string, timeout time.Duration) []string {
	t.Helper()

	const pollInterval = 2 * time.Second

	deadline := time.Now().Add(timeout)
	pending := make(map[string]struct{}, len(keys))
	details := make(map[string]string, len(keys))
	for _, key := range keys {
		pending[key] = struct{}{}
	}

	for len(pending) > 0 {
		// Deleting from a map while ranging over it is well-defined in Go.
		for key := range pending {
			exists, contentType, err := headObjectInfo(t, env, bucketName, key)
			switch {
			case err != nil:
				// Keep the key pending; a transient error is not proof of deletion.
				details[key] = fmt.Sprintf("%s (head_error=%v)", key, err)
			case !exists:
				delete(pending, key)
				delete(details, key)
			default:
				details[key] = fmt.Sprintf("%s (exists=true, contentType=%q)", key, contentType)
			}
		}
		if len(pending) == 0 {
			return nil
		}

		remaining := time.Until(deadline)
		if remaining <= 0 {
			// The poll above ran at or after the deadline; give up.
			break
		}
		// Never sleep past the deadline, so the last poll lands on it.
		if remaining > pollInterval {
			remaining = pollInterval
		}
		time.Sleep(remaining)
	}
	if len(pending) == 0 {
		return nil
	}

	// Report in the caller's key order rather than random map order.
	lingering := make([]string, 0, len(pending))
	for _, key := range keys {
		if _, ok := pending[key]; !ok {
			continue
		}
		if detail, hasDetail := details[key]; hasDetail {
			lingering = append(lingering, detail)
		} else {
			lingering = append(lingering, key)
		}
	}
	return lingering
}
// newS3Client builds an S3 client that talks to the test environment's local
// S3 endpoint (http://localhost:<env.s3Port>) using the environment's static
// access/secret key pair and the "us-east-1" region. Path-style addressing is
// enabled on the client.
func newS3Client(env *TestEnvironment) *s3.Client {
	endpoint := fmt.Sprintf("http://localhost:%d", env.s3Port)
	staticCreds := credentials.NewStaticCredentialsProvider(env.accessKey, env.secretKey, "")

	cfg := aws.Config{
		Region:       "us-east-1",
		Credentials:  aws.NewCredentialsCache(staticCreds),
		BaseEndpoint: aws.String(endpoint),
	}

	// NOTE(review): path-style is presumably required because bucket names
	// cannot be resolved as subdomains of localhost - confirm.
	usePathStyle := func(opts *s3.Options) {
		opts.UsePathStyle = true
	}
	return s3.NewFromConfig(cfg, usePathStyle)
}
// hasTemporaryPathSegment reports whether any "/"-separated segment of key is
// exactly "_temporary". A single trailing "/" is ignored before splitting, so
// directory-style keys such as "a/_temporary/" match. Segments that merely
// contain the word (e.g. "_temporary_x") do not.
func hasTemporaryPathSegment(key string) bool {
	const marker = "_temporary"

	rest := strings.TrimSuffix(key, "/")
	for {
		slash := strings.IndexByte(rest, '/')
		if slash < 0 {
			// Last (or only) segment.
			return rest == marker
		}
		if rest[:slash] == marker {
			return true
		}
		rest = rest[slash+1:]
	}
}

View File

@@ -8,6 +8,7 @@ import (
"net"
"os"
"os/exec"
"path/filepath"
"sync"
"testing"
"time"
@@ -53,6 +54,8 @@ type TestEnvironment struct {
dockerAvailable bool
weedBinary string
seaweedfsDataDir string
weedLogPath string
weedLogFile *os.File
masterPort int
filerPort int
s3Port int
@@ -113,6 +116,15 @@ func (env *TestEnvironment) StartSeaweedFS(t *testing.T) {
"-s3.config", iamConfigPath,
"-dir", env.seaweedfsDataDir,
)
weedLogPath := filepath.Join(env.seaweedfsDataDir, "weed-mini.log")
weedLogFile, err := os.Create(weedLogPath)
if err != nil {
t.Fatalf("failed to create weed log file: %v", err)
}
env.weedLogPath = weedLogPath
env.weedLogFile = weedLogFile
env.masterProcess.Stdout = weedLogFile
env.masterProcess.Stderr = weedLogFile
env.masterProcess.Env = append(os.Environ(),
"AWS_ACCESS_KEY_ID="+env.accessKey,
"AWS_SECRET_ACCESS_KEY="+env.secretKey,
@@ -160,12 +172,30 @@ func (env *TestEnvironment) startSparkContainer(t *testing.T) {
env.sparkContainer = container
}
func (env *TestEnvironment) Cleanup() {
func (env *TestEnvironment) Cleanup(t *testing.T) {
if env.masterProcess != nil && env.masterProcess.Process != nil {
_ = env.masterProcess.Process.Kill()
_ = env.masterProcess.Wait()
}
clearMiniProcess(env.masterProcess)
if env.weedLogFile != nil {
_ = env.weedLogFile.Close()
}
if t.Failed() && os.Getenv("CI") != "" && env.weedLogPath != "" {
logData, err := os.ReadFile(env.weedLogPath)
if err != nil {
t.Logf("failed to read weed mini log file %s: %v", env.weedLogPath, err)
} else {
// Print the tail to keep CI output manageable while preserving failure context.
const maxTailBytes = 64 * 1024
start := 0
if len(logData) > maxTailBytes {
start = len(logData) - maxTailBytes
}
t.Logf("weed mini logs (tail, %d bytes):\n%s", len(logData)-start, string(logData[start:]))
}
}
if env.sparkContainer != nil {
ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second)