Switch empty-folder cleanup to bucket policy (#8292)

* Fix Spark _temporary cleanup and add issue #8285 regression test * Generalize empty folder cleanup for Spark temp artifacts * Revert synchronous folder pruning and add cleanup diagnostics * Add actionable empty-folder cleanup diagnostics * Fix Spark temp marker cleanup in async folder cleaner * Fix Spark temp cleanup with implicit directory markers * Keep explicit directory markers non-implicit * logging * more logs * Switch empty-folder cleanup to bucket policy * Seaweed-X-Amz-Allow-Empty-Folders * less logs * go vet * less logs * refactoring
2026-02-10 18:38:38 -08:00
parent 5c365e7090
commit b57429ef2e
16 changed files with 798 additions and 157 deletions
--- a/test/s3/spark/issue_8234_repro_test.go
+++ b/test/s3/spark/issue_8234_repro_test.go
@@ -14,7 +14,7 @@ func setupSparkIssue8234Env(t *testing.T) *TestEnvironment {
 	}

 	env.StartSeaweedFS(t)
-	t.Cleanup(func() { env.Cleanup() })
+	t.Cleanup(func() { env.Cleanup(t) })

 	createObjectBucket(t, env, "test")
 	env.startSparkContainer(t)
--- a/test/s3/spark/issue_8285_repro_test.go
+++ b/test/s3/spark/issue_8285_repro_test.go
@@ -0,0 +1,180 @@
+package spark
+
+import (
+	"context"
+	"errors"
+	"fmt"
+	"strings"
+	"testing"
+	"time"
+
+	"github.com/aws/aws-sdk-go-v2/aws"
+	"github.com/aws/aws-sdk-go-v2/credentials"
+	"github.com/aws/aws-sdk-go-v2/service/s3"
+	s3types "github.com/aws/aws-sdk-go-v2/service/s3/types"
+)
+
+func TestSparkS3TemporaryDirectoryCleanupIssue8285Regression(t *testing.T) {
+	if testing.Short() {
+		t.Skip("Skipping Spark integration test in short mode")
+	}
+
+	env := setupSparkIssue8234Env(t)
+
+	script := `
+import pyspark.sql.functions as F
+
+target = "s3a://test/issue-8285/output"
+
+spark.conf.set("spark.hadoop.fs.s3a.committer.name", "directory")
+spark.conf.set("spark.hadoop.fs.s3a.committer.magic.enabled", "false")
+spark.conf.set("spark.hadoop.fs.s3a.committer.staging.abort.pending.uploads", "true")
+spark.conf.set("spark.hadoop.fs.s3a.committer.staging.conflict-mode", "append")
+spark.conf.set("spark.hadoop.fs.s3a.committer.staging.tmp.path", "/tmp")
+spark.conf.set("spark.hadoop.fs.s3a.directory.marker.retention", "keep")
+
+df = spark.range(0, 200).repartition(12).withColumn("value", F.col("id") * 2)
+df.write.format("parquet").mode("overwrite").save(target)
+count = spark.read.parquet(target).count()
+print("WRITE_COUNT=" + str(count))
+`
+
+	code, output := runSparkPyScript(t, env.sparkContainer, script, env.s3Port)
+	if code != 0 {
+		t.Fatalf("Spark script exited with code %d; output:\n%s", code, output)
+	}
+	if !strings.Contains(output, "WRITE_COUNT=200") {
+		t.Fatalf("expected write/read success marker in output, got:\n%s", output)
+	}
+
+	keys := listObjectKeysByPrefix(t, env, "test", "issue-8285/")
+	var temporaryKeys []string
+	for _, key := range keys {
+		if hasTemporaryPathSegment(key) {
+			temporaryKeys = append(temporaryKeys, key)
+		}
+	}
+
+	if len(temporaryKeys) > 0 {
+		t.Fatalf("issue #8285 regression detected: found lingering _temporary artifacts: %v\nall keys: %v", temporaryKeys, keys)
+	}
+
+	temporaryCandidates := []string{
+		"issue-8285/output/_temporary/",
+		"issue-8285/output/_temporary/0/",
+		"issue-8285/output/_temporary/0/_temporary/",
+	}
+	lingering := waitForObjectsToDisappear(t, env, "test", temporaryCandidates, 35*time.Second)
+	if len(lingering) > 0 {
+		t.Fatalf("issue #8285 regression detected: lingering temporary directories: %v", lingering)
+	}
+}
+
+func listObjectKeysByPrefix(t *testing.T, env *TestEnvironment, bucketName, prefix string) []string {
+	t.Helper()
+	client := newS3Client(env)
+
+	pager := s3.NewListObjectsV2Paginator(client, &s3.ListObjectsV2Input{
+		Bucket: aws.String(bucketName),
+		Prefix: aws.String(prefix),
+	})
+
+	var keys []string
+	for pager.HasMorePages() {
+		page, err := pager.NextPage(context.Background())
+		if err != nil {
+			t.Fatalf("failed listing objects for prefix %q: %v", prefix, err)
+		}
+		for _, object := range page.Contents {
+			keys = append(keys, aws.ToString(object.Key))
+		}
+	}
+
+	return keys
+}
+
+func headObjectInfo(t *testing.T, env *TestEnvironment, bucketName, key string) (bool, string, error) {
+	t.Helper()
+
+	client := newS3Client(env)
+	output, err := client.HeadObject(context.Background(), &s3.HeadObjectInput{
+		Bucket: aws.String(bucketName),
+		Key:    aws.String(key),
+	})
+	if err == nil {
+		return true, aws.ToString(output.ContentType), nil
+	}
+
+	var notFound *s3types.NotFound
+	if strings.Contains(err.Error(), "NotFound") || strings.Contains(err.Error(), "NoSuchKey") || errors.As(err, &notFound) {
+		return false, "", nil
+	}
+	return false, "", err
+}
+
+func waitForObjectsToDisappear(t *testing.T, env *TestEnvironment, bucketName string, keys []string, timeout time.Duration) []string {
+	t.Helper()
+
+	deadline := time.Now().Add(timeout)
+	pending := make(map[string]struct{}, len(keys))
+	details := make(map[string]string, len(keys))
+	for _, key := range keys {
+		pending[key] = struct{}{}
+	}
+
+	for len(pending) > 0 && time.Now().Before(deadline) {
+		for key := range pending {
+			exists, contentType, err := headObjectInfo(t, env, bucketName, key)
+			if err != nil {
+				details[key] = fmt.Sprintf("%s (head_error=%v)", key, err)
+				continue
+			}
+			if !exists {
+				delete(pending, key)
+				delete(details, key)
+				continue
+			}
+			details[key] = fmt.Sprintf("%s (exists=true, contentType=%q)", key, contentType)
+		}
+		if len(pending) > 0 {
+			time.Sleep(2 * time.Second)
+		}
+	}
+
+	if len(pending) == 0 {
+		return nil
+	}
+
+	var lingering []string
+	for _, key := range keys {
+		if _, ok := pending[key]; !ok {
+			continue
+		}
+		if detail, hasDetail := details[key]; hasDetail {
+			lingering = append(lingering, detail)
+		} else {
+			lingering = append(lingering, key)
+		}
+	}
+	return lingering
+}
+
+func newS3Client(env *TestEnvironment) *s3.Client {
+	cfg := aws.Config{
+		Region:       "us-east-1",
+		Credentials:  aws.NewCredentialsCache(credentials.NewStaticCredentialsProvider(env.accessKey, env.secretKey, "")),
+		BaseEndpoint: aws.String(fmt.Sprintf("http://localhost:%d", env.s3Port)),
+	}
+	return s3.NewFromConfig(cfg, func(o *s3.Options) {
+		o.UsePathStyle = true
+	})
+}
+
+func hasTemporaryPathSegment(key string) bool {
+	for _, segment := range strings.Split(strings.TrimSuffix(key, "/"), "/") {
+		if segment == "_temporary" {
+			return true
+		}
+	}
+	return false
+}
--- a/test/s3/spark/setup_test.go
+++ b/test/s3/spark/setup_test.go
@@ -8,6 +8,7 @@ import (
 	"net"
 	"os"
 	"os/exec"
+	"path/filepath"
 	"sync"
 	"testing"
 	"time"
@@ -53,6 +54,8 @@ type TestEnvironment struct {
 	dockerAvailable  bool
 	weedBinary       string
 	seaweedfsDataDir string
+	weedLogPath      string
+	weedLogFile      *os.File
 	masterPort       int
 	filerPort        int
 	s3Port           int
@@ -113,6 +116,15 @@ func (env *TestEnvironment) StartSeaweedFS(t *testing.T) {
 		"-s3.config", iamConfigPath,
 		"-dir", env.seaweedfsDataDir,
 	)
+	weedLogPath := filepath.Join(env.seaweedfsDataDir, "weed-mini.log")
+	weedLogFile, err := os.Create(weedLogPath)
+	if err != nil {
+		t.Fatalf("failed to create weed log file: %v", err)
+	}
+	env.weedLogPath = weedLogPath
+	env.weedLogFile = weedLogFile
+	env.masterProcess.Stdout = weedLogFile
+	env.masterProcess.Stderr = weedLogFile
 	env.masterProcess.Env = append(os.Environ(),
 		"AWS_ACCESS_KEY_ID="+env.accessKey,
 		"AWS_SECRET_ACCESS_KEY="+env.secretKey,
@@ -160,12 +172,30 @@ func (env *TestEnvironment) startSparkContainer(t *testing.T) {
 	env.sparkContainer = container
 }

-func (env *TestEnvironment) Cleanup() {
+func (env *TestEnvironment) Cleanup(t *testing.T) {
 	if env.masterProcess != nil && env.masterProcess.Process != nil {
 		_ = env.masterProcess.Process.Kill()
 		_ = env.masterProcess.Wait()
 	}
 	clearMiniProcess(env.masterProcess)
+	if env.weedLogFile != nil {
+		_ = env.weedLogFile.Close()
+	}
+
+	if t.Failed() && os.Getenv("CI") != "" && env.weedLogPath != "" {
+		logData, err := os.ReadFile(env.weedLogPath)
+		if err != nil {
+			t.Logf("failed to read weed mini log file %s: %v", env.weedLogPath, err)
+		} else {
+			// Print the tail to keep CI output manageable while preserving failure context.
+			const maxTailBytes = 64 * 1024
+			start := 0
+			if len(logData) > maxTailBytes {
+				start = len(logData) - maxTailBytes
+			}
+			t.Logf("weed mini logs (tail, %d bytes):\n%s", len(logData)-start, string(logData[start:]))
+		}
+	}

 	if env.sparkContainer != nil {
 		ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second)