Switch empty-folder cleanup to bucket policy (#8292)
* Fix Spark _temporary cleanup and add issue #8285 regression test * Generalize empty folder cleanup for Spark temp artifacts * Revert synchronous folder pruning and add cleanup diagnostics * Add actionable empty-folder cleanup diagnostics * Fix Spark temp marker cleanup in async folder cleaner * Fix Spark temp cleanup with implicit directory markers * Keep explicit directory markers non-implicit * logging * more logs * Switch empty-folder cleanup to bucket policy * Seaweed-X-Amz-Allow-Empty-Folders * less logs * go vet * less logs * refactoring
This commit is contained in:
@@ -14,7 +14,7 @@ func setupSparkIssue8234Env(t *testing.T) *TestEnvironment {
|
||||
}
|
||||
|
||||
env.StartSeaweedFS(t)
|
||||
t.Cleanup(func() { env.Cleanup() })
|
||||
t.Cleanup(func() { env.Cleanup(t) })
|
||||
|
||||
createObjectBucket(t, env, "test")
|
||||
env.startSparkContainer(t)
|
||||
|
||||
180
test/s3/spark/issue_8285_repro_test.go
Normal file
180
test/s3/spark/issue_8285_repro_test.go
Normal file
@@ -0,0 +1,180 @@
|
||||
package spark
|
||||
|
||||
import (
|
||||
"context"
|
||||
"errors"
|
||||
"fmt"
|
||||
"strings"
|
||||
"testing"
|
||||
"time"
|
||||
|
||||
"github.com/aws/aws-sdk-go-v2/aws"
|
||||
"github.com/aws/aws-sdk-go-v2/credentials"
|
||||
"github.com/aws/aws-sdk-go-v2/service/s3"
|
||||
s3types "github.com/aws/aws-sdk-go-v2/service/s3/types"
|
||||
)
|
||||
|
||||
func TestSparkS3TemporaryDirectoryCleanupIssue8285Regression(t *testing.T) {
|
||||
if testing.Short() {
|
||||
t.Skip("Skipping Spark integration test in short mode")
|
||||
}
|
||||
|
||||
env := setupSparkIssue8234Env(t)
|
||||
|
||||
script := `
|
||||
import pyspark.sql.functions as F
|
||||
|
||||
target = "s3a://test/issue-8285/output"
|
||||
|
||||
spark.conf.set("spark.hadoop.fs.s3a.committer.name", "directory")
|
||||
spark.conf.set("spark.hadoop.fs.s3a.committer.magic.enabled", "false")
|
||||
spark.conf.set("spark.hadoop.fs.s3a.committer.staging.abort.pending.uploads", "true")
|
||||
spark.conf.set("spark.hadoop.fs.s3a.committer.staging.conflict-mode", "append")
|
||||
spark.conf.set("spark.hadoop.fs.s3a.committer.staging.tmp.path", "/tmp")
|
||||
spark.conf.set("spark.hadoop.fs.s3a.directory.marker.retention", "keep")
|
||||
|
||||
df = spark.range(0, 200).repartition(12).withColumn("value", F.col("id") * 2)
|
||||
df.write.format("parquet").mode("overwrite").save(target)
|
||||
count = spark.read.parquet(target).count()
|
||||
print("WRITE_COUNT=" + str(count))
|
||||
`
|
||||
|
||||
code, output := runSparkPyScript(t, env.sparkContainer, script, env.s3Port)
|
||||
if code != 0 {
|
||||
t.Fatalf("Spark script exited with code %d; output:\n%s", code, output)
|
||||
}
|
||||
if !strings.Contains(output, "WRITE_COUNT=200") {
|
||||
t.Fatalf("expected write/read success marker in output, got:\n%s", output)
|
||||
}
|
||||
|
||||
keys := listObjectKeysByPrefix(t, env, "test", "issue-8285/")
|
||||
var temporaryKeys []string
|
||||
for _, key := range keys {
|
||||
if hasTemporaryPathSegment(key) {
|
||||
temporaryKeys = append(temporaryKeys, key)
|
||||
}
|
||||
}
|
||||
|
||||
if len(temporaryKeys) > 0 {
|
||||
t.Fatalf("issue #8285 regression detected: found lingering _temporary artifacts: %v\nall keys: %v", temporaryKeys, keys)
|
||||
}
|
||||
|
||||
temporaryCandidates := []string{
|
||||
"issue-8285/output/_temporary/",
|
||||
"issue-8285/output/_temporary/0/",
|
||||
"issue-8285/output/_temporary/0/_temporary/",
|
||||
}
|
||||
lingering := waitForObjectsToDisappear(t, env, "test", temporaryCandidates, 35*time.Second)
|
||||
if len(lingering) > 0 {
|
||||
t.Fatalf("issue #8285 regression detected: lingering temporary directories: %v", lingering)
|
||||
}
|
||||
}
|
||||
|
||||
func listObjectKeysByPrefix(t *testing.T, env *TestEnvironment, bucketName, prefix string) []string {
|
||||
t.Helper()
|
||||
client := newS3Client(env)
|
||||
|
||||
pager := s3.NewListObjectsV2Paginator(client, &s3.ListObjectsV2Input{
|
||||
Bucket: aws.String(bucketName),
|
||||
Prefix: aws.String(prefix),
|
||||
})
|
||||
|
||||
var keys []string
|
||||
for pager.HasMorePages() {
|
||||
page, err := pager.NextPage(context.Background())
|
||||
if err != nil {
|
||||
t.Fatalf("failed listing objects for prefix %q: %v", prefix, err)
|
||||
}
|
||||
for _, object := range page.Contents {
|
||||
keys = append(keys, aws.ToString(object.Key))
|
||||
}
|
||||
}
|
||||
|
||||
return keys
|
||||
}
|
||||
|
||||
func headObjectInfo(t *testing.T, env *TestEnvironment, bucketName, key string) (bool, string, error) {
|
||||
t.Helper()
|
||||
|
||||
client := newS3Client(env)
|
||||
output, err := client.HeadObject(context.Background(), &s3.HeadObjectInput{
|
||||
Bucket: aws.String(bucketName),
|
||||
Key: aws.String(key),
|
||||
})
|
||||
if err == nil {
|
||||
return true, aws.ToString(output.ContentType), nil
|
||||
}
|
||||
|
||||
var notFound *s3types.NotFound
|
||||
if strings.Contains(err.Error(), "NotFound") || strings.Contains(err.Error(), "NoSuchKey") || errors.As(err, ¬Found) {
|
||||
return false, "", nil
|
||||
}
|
||||
return false, "", err
|
||||
}
|
||||
|
||||
func waitForObjectsToDisappear(t *testing.T, env *TestEnvironment, bucketName string, keys []string, timeout time.Duration) []string {
|
||||
t.Helper()
|
||||
|
||||
deadline := time.Now().Add(timeout)
|
||||
pending := make(map[string]struct{}, len(keys))
|
||||
details := make(map[string]string, len(keys))
|
||||
for _, key := range keys {
|
||||
pending[key] = struct{}{}
|
||||
}
|
||||
|
||||
for len(pending) > 0 && time.Now().Before(deadline) {
|
||||
for key := range pending {
|
||||
exists, contentType, err := headObjectInfo(t, env, bucketName, key)
|
||||
if err != nil {
|
||||
details[key] = fmt.Sprintf("%s (head_error=%v)", key, err)
|
||||
continue
|
||||
}
|
||||
if !exists {
|
||||
delete(pending, key)
|
||||
delete(details, key)
|
||||
continue
|
||||
}
|
||||
details[key] = fmt.Sprintf("%s (exists=true, contentType=%q)", key, contentType)
|
||||
}
|
||||
if len(pending) > 0 {
|
||||
time.Sleep(2 * time.Second)
|
||||
}
|
||||
}
|
||||
|
||||
if len(pending) == 0 {
|
||||
return nil
|
||||
}
|
||||
|
||||
var lingering []string
|
||||
for _, key := range keys {
|
||||
if _, ok := pending[key]; !ok {
|
||||
continue
|
||||
}
|
||||
if detail, hasDetail := details[key]; hasDetail {
|
||||
lingering = append(lingering, detail)
|
||||
} else {
|
||||
lingering = append(lingering, key)
|
||||
}
|
||||
}
|
||||
return lingering
|
||||
}
|
||||
|
||||
func newS3Client(env *TestEnvironment) *s3.Client {
|
||||
cfg := aws.Config{
|
||||
Region: "us-east-1",
|
||||
Credentials: aws.NewCredentialsCache(credentials.NewStaticCredentialsProvider(env.accessKey, env.secretKey, "")),
|
||||
BaseEndpoint: aws.String(fmt.Sprintf("http://localhost:%d", env.s3Port)),
|
||||
}
|
||||
return s3.NewFromConfig(cfg, func(o *s3.Options) {
|
||||
o.UsePathStyle = true
|
||||
})
|
||||
}
|
||||
|
||||
// hasTemporaryPathSegment reports whether any slash-separated path segment of
// key (ignoring a trailing slash) is exactly "_temporary" — the scratch
// directory name used by Hadoop/Spark output committers.
func hasTemporaryPathSegment(key string) bool {
	// Wrapping the trimmed key in slashes makes a substring match equivalent
	// to an exact segment comparison, since "_temporary" contains no slash.
	normalized := "/" + strings.TrimSuffix(key, "/") + "/"
	return strings.Contains(normalized, "/_temporary/")
}
|
||||
@@ -8,6 +8,7 @@ import (
|
||||
"net"
|
||||
"os"
|
||||
"os/exec"
|
||||
"path/filepath"
|
||||
"sync"
|
||||
"testing"
|
||||
"time"
|
||||
@@ -53,6 +54,8 @@ type TestEnvironment struct {
|
||||
dockerAvailable bool
|
||||
weedBinary string
|
||||
seaweedfsDataDir string
|
||||
weedLogPath string
|
||||
weedLogFile *os.File
|
||||
masterPort int
|
||||
filerPort int
|
||||
s3Port int
|
||||
@@ -113,6 +116,15 @@ func (env *TestEnvironment) StartSeaweedFS(t *testing.T) {
|
||||
"-s3.config", iamConfigPath,
|
||||
"-dir", env.seaweedfsDataDir,
|
||||
)
|
||||
weedLogPath := filepath.Join(env.seaweedfsDataDir, "weed-mini.log")
|
||||
weedLogFile, err := os.Create(weedLogPath)
|
||||
if err != nil {
|
||||
t.Fatalf("failed to create weed log file: %v", err)
|
||||
}
|
||||
env.weedLogPath = weedLogPath
|
||||
env.weedLogFile = weedLogFile
|
||||
env.masterProcess.Stdout = weedLogFile
|
||||
env.masterProcess.Stderr = weedLogFile
|
||||
env.masterProcess.Env = append(os.Environ(),
|
||||
"AWS_ACCESS_KEY_ID="+env.accessKey,
|
||||
"AWS_SECRET_ACCESS_KEY="+env.secretKey,
|
||||
@@ -160,12 +172,30 @@ func (env *TestEnvironment) startSparkContainer(t *testing.T) {
|
||||
env.sparkContainer = container
|
||||
}
|
||||
|
||||
func (env *TestEnvironment) Cleanup() {
|
||||
func (env *TestEnvironment) Cleanup(t *testing.T) {
|
||||
if env.masterProcess != nil && env.masterProcess.Process != nil {
|
||||
_ = env.masterProcess.Process.Kill()
|
||||
_ = env.masterProcess.Wait()
|
||||
}
|
||||
clearMiniProcess(env.masterProcess)
|
||||
if env.weedLogFile != nil {
|
||||
_ = env.weedLogFile.Close()
|
||||
}
|
||||
|
||||
if t.Failed() && os.Getenv("CI") != "" && env.weedLogPath != "" {
|
||||
logData, err := os.ReadFile(env.weedLogPath)
|
||||
if err != nil {
|
||||
t.Logf("failed to read weed mini log file %s: %v", env.weedLogPath, err)
|
||||
} else {
|
||||
// Print the tail to keep CI output manageable while preserving failure context.
|
||||
const maxTailBytes = 64 * 1024
|
||||
start := 0
|
||||
if len(logData) > maxTailBytes {
|
||||
start = len(logData) - maxTailBytes
|
||||
}
|
||||
t.Logf("weed mini logs (tail, %d bytes):\n%s", len(logData)-start, string(logData[start:]))
|
||||
}
|
||||
}
|
||||
|
||||
if env.sparkContainer != nil {
|
||||
ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second)
|
||||
|
||||
Reference in New Issue
Block a user