Files
seaweedFS/weed/s3api/s3tables/utils.go
Chris Lu b244bb58aa s3tables: redesign Iceberg REST Catalog using iceberg-go and automate integration tests (#8197)
* full integration with iceberg-go

* Table Commit Operations (handleUpdateTable)

* s3tables: fix Iceberg v2 compliance and namespace properties

This commit ensures SeaweedFS Iceberg REST Catalog is compliant with
Iceberg Format Version 2 by:
- Using iceberg-go's table.NewMetadataWithUUID for strict v2 compliance.
- Explicitly initializing namespace properties to empty maps.
- Removing omitempty from required Iceberg response fields.
- Fixing CommitTableRequest unmarshaling using table.Requirements and table.Updates.

* s3tables: automate Iceberg integration tests

- Added Makefile for local test execution and cluster management.
- Added docker-compose for PyIceberg compatibility kit.
- Added Go integration test harness for PyIceberg.
- Updated GitHub CI to run Iceberg catalog tests automatically.

* s3tables: update PyIceberg test suite for compatibility

- Updated test_rest_catalog.py to use latest PyIceberg transaction APIs.
- Updated Dockerfile to include pyarrow and pandas dependencies.
- Improved namespace and table handling in integration tests.

* s3tables: address review feedback on Iceberg Catalog

- Implemented robust metadata version parsing and incrementing.
- Ensured table metadata changes are persisted during commit (handleUpdateTable).
- Standardized namespace property initialization for consistency.
- Fixed unused variable and incorrect struct field build errors.

* s3tables: finalize Iceberg REST Catalog and optimize tests

- Implemented robust metadata versioning and persistence.
- Standardized namespace property initialization.
- Optimized integration tests using pre-built Docker image.
- Added strict property persistence validation to test suite.
- Fixed build errors from previous partial updates.

* Address PR review: fix Table UUID stability, implement S3Tables UpdateTable, and support full metadata persistence individually

* fix: Iceberg catalog stable UUIDs, metadata persistence, and file writing

- Ensure table UUIDs are stable (do not regenerate on load).
- Persist full table metadata (Iceberg JSON) in s3tables extended attributes.
- Add `MetadataVersion` to explicitly track version numbers, replacing regex parsing.
- Implement `saveMetadataFile` to persist metadata JSON files to the Filer on commit.
- Update `CreateTable` and `UpdateTable` handlers to use the new logic.

* test: bind weed mini to 0.0.0.0 in integration tests to fix Docker connectivity

* Iceberg: fix metadata handling in REST catalog

- Add nil guard in createTable
- Fix updateTable to correctly load existing metadata from storage
- Ensure full metadata persistence on updates
- Populate loadTable result with parsed metadata

* S3Tables: add auth checks and fix response fields in UpdateTable

- Add CheckPermissionWithContext to UpdateTable handler
- Include TableARN and MetadataLocation in UpdateTable response
- Use ErrCodeConflict (409) for version token mismatches

* Tests: improve Iceberg catalog test infrastructure and cleanup

- Makefile: use PID file for precise process killing
- test_rest_catalog.py: remove unused variables and fix f-strings

* Iceberg: fix variable shadowing in UpdateTable

- Rename inner loop variable `req` to `requirement` to avoid shadowing outer request variable

* S3Tables: simplify MetadataVersion initialization

- Use `max(req.MetadataVersion, 1)` instead of anonymous function

* Tests: remove Unicode characters from S3 tables integration test logs

- Remove Unicode checkmarks from test output for cleaner logs

* Iceberg: improve metadata persistence robustness

- Fix MetadataLocation in LoadTableResult to fallback to generated location
- Improve saveMetadataFile to ensure directory hierarchy existence and robust error handling
2026-02-03 15:30:04 -08:00

366 lines
12 KiB
Go

package s3tables
import (
"crypto/rand"
"encoding/hex"
"fmt"
"net/url"
"path"
"regexp"
"strings"
"time"
)
// Character-class fragments that are spliced into the ARN regular expressions
// declared below. Each one matches a non-empty run of the allowed characters;
// length limits and start/end rules are enforced separately by the validate*
// functions.
const (
	// bucketNamePatternStr allows lowercase letters, digits, and hyphens.
	bucketNamePatternStr = `[a-z0-9-]+`
	// tableNamespacePatternStr allows lowercase letters, digits, and underscores.
	tableNamespacePatternStr = `[a-z0-9_]+`
	// tableNamePatternStr allows lowercase letters, digits, and underscores.
	tableNamePatternStr = `[a-z0-9_]+`
)
// Regular expressions compiled once at package initialization.
var (
	// bucketARNPattern matches arn:aws:s3tables:{region}:{account}:bucket/{bucket-name}
	// and captures the bucket name. The region and account segments may be any
	// (possibly empty) text without a colon.
	bucketARNPattern = regexp.MustCompile(`^arn:aws:s3tables:[^:]*:[^:]*:bucket/(` + bucketNamePatternStr + `)$`)
	// tableARNPattern matches
	// arn:aws:s3tables:{region}:{account}:bucket/{bucket-name}/table/{namespace}/{table-name}
	// and captures, in order, the bucket name, the namespace, and the table name.
	tableARNPattern = regexp.MustCompile(`^arn:aws:s3tables:[^:]*:[^:]*:bucket/(` + bucketNamePatternStr + `)/table/(` + tableNamespacePatternStr + `)/(` + tableNamePatternStr + `)$`)
	// tagPattern matches a whole tag key or value made only of Unicode letters,
	// separators (\p{Z}), numbers, and the characters _ . : / = + - @.
	// The empty string also matches.
	tagPattern = regexp.MustCompile(`^([\p{L}\p{Z}\p{N}_.:/=+\-@]*)$`)
)
// ARN parsing functions

// parseBucketNameFromARN returns the bucket name embedded in a table bucket ARN.
//
// Expected ARN shape: arn:aws:s3tables:{region}:{account}:bucket/{bucket-name}
//
// An error is returned when the ARN does not match that shape, or when the
// captured bucket name is rejected by isValidBucketName.
func parseBucketNameFromARN(arn string) (string, error) {
	groups := bucketARNPattern.FindStringSubmatch(arn)

	switch {
	case len(groups) != 2:
		// A successful match always yields the full match plus one capture group.
		return "", fmt.Errorf("invalid bucket ARN: %s", arn)
	case !isValidBucketName(groups[1]):
		return "", fmt.Errorf("invalid bucket name in ARN: %s", groups[1])
	}
	return groups[1], nil
}
// ParseBucketNameFromARN is the exported entry point for other packages that
// need to validate a table bucket ARN and extract its bucket name.
// It delegates to parseBucketNameFromARN and returns its result unchanged.
func ParseBucketNameFromARN(arn string) (string, error) {
	return parseBucketNameFromARN(arn)
}
// parseTableFromARN extracts the bucket name, namespace, and table name from a
// table ARN.
//
// Expected ARN shape:
// arn:aws:s3tables:{region}:{account}:bucket/{bucket-name}/table/{namespace}/{table-name}
//
// Only single-segment namespaces are matched by the pattern. Each component is
// additionally checked with validateBucketName, validateNamespace, and
// validateTableName. On failure all three names are returned empty and the
// underlying validation error is wrapped, so callers can inspect it with
// errors.Is / errors.As.
func parseTableFromARN(arn string) (bucketName, namespace, tableName string, err error) {
	matches := tableARNPattern.FindStringSubmatch(arn)
	// Full match plus three capture groups.
	if len(matches) != 4 {
		return "", "", "", fmt.Errorf("invalid table ARN: %s", arn)
	}

	bucketName = matches[1]
	if err := validateBucketName(bucketName); err != nil {
		return "", "", "", fmt.Errorf("invalid bucket name in ARN: %w", err)
	}

	// The pattern already constrains the characters; this applies the remaining
	// namespace rules (length, start/end character, reserved prefix).
	namespace = matches[2]
	if _, err := validateNamespace([]string{namespace}); err != nil {
		return "", "", "", fmt.Errorf("invalid namespace in ARN: %w", err)
	}

	// The table-name capture group only admits [a-z0-9_], so unescaping is
	// effectively a pass-through today; it is kept so the decoding step stays
	// in place if the pattern is ever widened.
	tableName, err = url.PathUnescape(matches[3])
	if err != nil {
		return "", "", "", fmt.Errorf("invalid table name encoding in ARN: %w", err)
	}
	if _, err := validateTableName(tableName); err != nil {
		return "", "", "", fmt.Errorf("invalid table name in ARN: %w", err)
	}

	return bucketName, namespace, tableName, nil
}
// Path helpers

// GetTableBucketPath returns the filer path for a table bucket: TablesPath
// joined with bucketName using slash-separated path.Join (which also cleans
// the result). The bucket name is not validated here.
func GetTableBucketPath(bucketName string) string {
	return path.Join(TablesPath, bucketName)
}
// GetNamespacePath returns the filer path for a namespace: TablesPath joined
// with bucketName and namespace using slash-separated path.Join (which also
// cleans the result). Neither argument is validated here.
func GetNamespacePath(bucketName, namespace string) string {
	return path.Join(TablesPath, bucketName, namespace)
}
// GetTablePath returns the filer path for a table: TablesPath joined with
// bucketName, namespace, and tableName using slash-separated path.Join (which
// also cleans the result). None of the arguments are validated here.
func GetTablePath(bucketName, namespace, tableName string) string {
	return path.Join(TablesPath, bucketName, namespace, tableName)
}
// Metadata structures

// tableBucketMetadata is the JSON-serializable metadata record for a table bucket.
type tableBucketMetadata struct {
	Name           string    `json:"name"`           // bucket name
	CreatedAt      time.Time `json:"createdAt"`      // creation timestamp
	OwnerAccountID string    `json:"ownerAccountId"` // account ID of the owner
}
// namespaceMetadata stores metadata for a namespace.
type namespaceMetadata struct {
	// Namespace holds the namespace levels. validateNamespace in this file only
	// accepts a single level, so this presumably has one element — confirm at
	// the call sites that populate it.
	Namespace      []string  `json:"namespace"`
	CreatedAt      time.Time `json:"createdAt"`      // creation timestamp
	OwnerAccountID string    `json:"ownerAccountId"` // account ID of the owner
}
// tableMetadataInternal stores metadata for a table.
type tableMetadataInternal struct {
	Name           string    `json:"name"`           // table name
	Namespace      string    `json:"namespace"`      // namespace the table belongs to
	Format         string    `json:"format"`         // table format identifier
	CreatedAt      time.Time `json:"createdAt"`      // creation timestamp
	ModifiedAt     time.Time `json:"modifiedAt"`     // last modification timestamp
	OwnerAccountID string    `json:"ownerAccountId"` // account ID of the owner
	// VersionToken is presumably the value produced by generateVersionToken and
	// compared on updates — confirm against the handlers that use it.
	VersionToken string `json:"versionToken"`
	// MetadataVersion is a numeric metadata version counter.
	MetadataVersion int `json:"metadataVersion"`
	// MetadataLocation is left out of the JSON when empty.
	MetadataLocation string `json:"metadataLocation,omitempty"`
	// Metadata is left out of the JSON when nil.
	Metadata *TableMetadata `json:"metadata,omitempty"`
}
// Utility functions

// validateBucketName checks a table bucket name and returns nil when it is
// acceptable, or an error describing the first rule it breaks.
//
// Rules, checked in this order:
//   - the name is not empty and is 3 to 63 bytes long;
//   - the first and last characters are a lowercase letter or a digit;
//   - every character is a lowercase letter, a digit, or a hyphen;
//   - the name does not start with a reserved prefix
//     ("xn--", "sthree-", "amzn-s3-demo-", "aws");
//   - the name does not end with a reserved suffix
//     ("-s3alias", "--ol-s3", "--x-s3", "--table-s3").
func validateBucketName(name string) error {
	isLowerAlnum := func(c byte) bool {
		return ('a' <= c && c <= 'z') || ('0' <= c && c <= '9')
	}

	// Cases are evaluated top to bottom, so the index expressions below are
	// only reached once the length has been established.
	switch n := len(name); {
	case n == 0:
		return fmt.Errorf("bucket name is required")
	case n < 3 || n > 63:
		return fmt.Errorf("bucket name must be between 3 and 63 characters")
	case !isLowerAlnum(name[0]):
		return fmt.Errorf("bucket name must start with a letter or digit")
	case !isLowerAlnum(name[n-1]):
		return fmt.Errorf("bucket name must end with a letter or digit")
	}

	for _, c := range []byte(name) {
		if c != '-' && !isLowerAlnum(c) {
			return fmt.Errorf("bucket name can only contain lowercase letters, numbers, and hyphens")
		}
	}

	for _, prefix := range [...]string{"xn--", "sthree-", "amzn-s3-demo-", "aws"} {
		if strings.HasPrefix(name, prefix) {
			return fmt.Errorf("bucket name cannot start with reserved prefix: %s", prefix)
		}
	}

	for _, suffix := range [...]string{"-s3alias", "--ol-s3", "--x-s3", "--table-s3"} {
		if strings.HasSuffix(name, suffix) {
			return fmt.Errorf("bucket name cannot end with reserved suffix: %s", suffix)
		}
	}

	return nil
}
// ValidateBucketName validates a table bucket name and returns an error if it
// is invalid. It is the exported wrapper around validateBucketName for use by
// other packages and returns that function's result unchanged.
func ValidateBucketName(name string) error {
	return validateBucketName(name)
}
// BuildBucketARN assembles the ARN of a table bucket:
//
//	arn:aws:s3tables:{region}:{accountID}:bucket/{bucketName}
//
// The bucket name must be non-empty and pass validateBucketName; otherwise an
// empty string and the validation error are returned. An empty accountID is
// replaced with DefaultAccountID. An empty region is kept as an empty segment,
// leaving two consecutive colons in the ARN.
func BuildBucketARN(region, accountID, bucketName string) (string, error) {
	if bucketName == "" {
		return "", fmt.Errorf("bucket name is required")
	}
	err := validateBucketName(bucketName)
	if err != nil {
		return "", err
	}

	owner := accountID
	if owner == "" {
		owner = DefaultAccountID
	}

	resource := fmt.Sprintf("bucket/%s", bucketName)
	return buildARN(region, owner, resource), nil
}
// BuildTableARN assembles the ARN of a table:
//
//	arn:aws:s3tables:{region}:{accountID}:bucket/{bucketName}/table/{namespace}/{tableName}
//
// The bucket name, namespace, and table name are checked in that order; each
// must be non-empty and pass validateBucketName, validateNamespace, and
// validateTableName respectively. The first failure is returned together with
// an empty string. The namespace and table name placed in the ARN are the
// values returned by their validators. An empty accountID is replaced with
// DefaultAccountID.
func BuildTableARN(region, accountID, bucketName, namespace, tableName string) (string, error) {
	// Bucket.
	if bucketName == "" {
		return "", fmt.Errorf("bucket name is required")
	}
	if bucketErr := validateBucketName(bucketName); bucketErr != nil {
		return "", bucketErr
	}

	// Namespace (single level only).
	if namespace == "" {
		return "", fmt.Errorf("namespace is required")
	}
	ns, nsErr := validateNamespace([]string{namespace})
	if nsErr != nil {
		return "", nsErr
	}

	// Table.
	if tableName == "" {
		return "", fmt.Errorf("table name is required")
	}
	tbl, tblErr := validateTableName(tableName)
	if tblErr != nil {
		return "", tblErr
	}

	owner := accountID
	if owner == "" {
		owner = DefaultAccountID
	}

	resource := fmt.Sprintf("bucket/%s/table/%s/%s", bucketName, ns, tbl)
	return buildARN(region, owner, resource), nil
}
// buildARN formats an S3 Tables ARN from its variable parts:
//
//	arn:aws:s3tables:{region}:{accountID}:{resourcePath}
//
// No validation is performed; empty arguments produce empty segments.
func buildARN(region, accountID, resourcePath string) string {
	const arnPrefix = "arn:aws:s3tables"
	return fmt.Sprintf("%s:%s:%s:%s", arnPrefix, region, accountID, resourcePath)
}
// ValidateTags validates a set of tags for S3 Tables.
//
// It returns an error when:
//   - there are more than 10 tags;
//   - a key is longer than 128 characters or a value is longer than 256
//     characters;
//   - a key or value contains a character not permitted by tagPattern.
//
// Lengths are measured in Unicode characters (runes), not bytes: tagPattern
// admits non-ASCII letters, and a byte count would reject multi-byte keys and
// values well before they reach the character limit.
//
// Map iteration order is unspecified, so when several tags are invalid the one
// that gets reported may differ between calls.
func ValidateTags(tags map[string]string) error {
	if len(tags) > 10 {
		return fmt.Errorf("validate tags: %d tags more than 10", len(tags))
	}
	for k, v := range tags {
		// len([]rune(s)) counts characters; the compiler turns this form into
		// a rune count without allocating the slice.
		if len([]rune(k)) > 128 {
			return fmt.Errorf("validate tags: tag key longer than 128")
		}
		if !tagPattern.MatchString(k) {
			return fmt.Errorf("validate tags key %s error, incorrect key", k)
		}
		if len([]rune(v)) > 256 {
			return fmt.Errorf("validate tags: tag value longer than 256")
		}
		if !tagPattern.MatchString(v) {
			return fmt.Errorf("validate tags value %s error, incorrect value", v)
		}
	}
	return nil
}
// isValidBucketName reports whether name passes validateBucketName. It is kept
// for compatibility with callers that only need a yes/no answer.
//
// Deprecated: use validateBucketName instead; it returns the reason a name is rejected.
func isValidBucketName(name string) bool {
	return validateBucketName(name) == nil
}
// generateVersionToken returns a fresh version token: 16 bytes read from
// crypto/rand, rendered as 32 lowercase hexadecimal characters.
//
// If the random source reports an error, the token falls back to the current
// UnixNano timestamp in hexadecimal, which is unique enough to proceed but is
// not unpredictable.
func generateVersionToken() string {
	var raw [16]byte
	_, err := rand.Read(raw[:])
	if err == nil {
		return hex.EncodeToString(raw[:])
	}
	// crypto/rand failed; use the timestamp-based fallback.
	return fmt.Sprintf("%x", time.Now().UnixNano())
}
// splitPath breaks a slash-separated path into its parent directory and final
// element, as computed by path.Dir and path.Base. Trailing slashes are
// ignored, and an empty path yields "." for both results.
func splitPath(p string) (dir, name string) {
	return path.Dir(p), path.Base(p)
}
// validateNamespace checks that namespace is a supported, well-formed
// namespace and returns its single name.
//
// Exactly one level is accepted. The name must:
//   - be 1 to 255 bytes long;
//   - not be "." or "..", and not contain "/" (guards against path traversal
//     and multi-segment paths);
//   - start and end with a lowercase letter or a digit;
//   - consist only of lowercase letters, digits, and underscores;
//   - not start with the reserved prefix "aws".
//
// The rules are applied in the order listed and the first violation is
// returned as the error.
func validateNamespace(namespace []string) (string, error) {
	switch len(namespace) {
	case 0:
		return "", fmt.Errorf("namespace is required")
	case 1:
		// Exactly one level: continue below.
	default:
		return "", fmt.Errorf("multi-level namespaces are not supported")
	}

	name := namespace[0]
	isLowerAlnum := func(r rune) bool {
		return ('a' <= r && r <= 'z') || ('0' <= r && r <= '9')
	}
	isForbidden := func(r rune) bool {
		return r != '_' && !isLowerAlnum(r)
	}

	// Cases are evaluated top to bottom, so the index expressions are only
	// reached once the name is known to be non-empty.
	switch {
	case len(name) < 1 || len(name) > 255:
		return "", fmt.Errorf("namespace name must be between 1 and 255 characters")
	case name == "." || name == "..":
		return "", fmt.Errorf("namespace name cannot be '.' or '..'")
	case strings.Contains(name, "/"):
		return "", fmt.Errorf("namespace name cannot contain '/'")
	case !isLowerAlnum(rune(name[0])):
		return "", fmt.Errorf("namespace name must start with a letter or digit")
	case !isLowerAlnum(rune(name[len(name)-1])):
		return "", fmt.Errorf("namespace name must end with a letter or digit")
	case strings.IndexFunc(name, isForbidden) >= 0:
		return "", fmt.Errorf("invalid namespace name: only 'a-z', '0-9', and '_' are allowed")
	case strings.HasPrefix(name, "aws"):
		return "", fmt.Errorf("namespace name cannot start with reserved prefix 'aws'")
	}

	return name, nil
}
// ValidateNamespace is the exported wrapper that lets other packages validate
// a namespace. It delegates to validateNamespace and returns the validated
// namespace name, or an error, unchanged.
func ValidateNamespace(namespace []string) (string, error) {
	return validateNamespace(namespace)
}
// validateTableName checks a table name and returns it unchanged when valid.
//
// The name must:
//   - be 1 to 255 bytes long;
//   - not be "." or "..", and not contain "/";
//   - start with a lowercase letter or a digit;
//   - consist only of lowercase letters, digits, and underscores.
//
// The rules are applied in the order listed and the first violation is
// returned as the error. Unlike validateNamespace, the final character is not
// checked (a trailing underscore is accepted) and there is no reserved-prefix
// rule.
func validateTableName(name string) (string, error) {
	isLowerAlnum := func(r rune) bool {
		return ('a' <= r && r <= 'z') || ('0' <= r && r <= '9')
	}
	isForbidden := func(r rune) bool {
		return r != '_' && !isLowerAlnum(r)
	}

	// Cases are evaluated top to bottom, so name[0] is only read once the name
	// is known to be non-empty.
	switch {
	case len(name) < 1 || len(name) > 255:
		return "", fmt.Errorf("table name must be between 1 and 255 characters")
	case name == "." || name == ".." || strings.Contains(name, "/"):
		return "", fmt.Errorf("invalid table name: cannot be '.', '..' or contain '/'")
	case !isLowerAlnum(rune(name[0])):
		return "", fmt.Errorf("table name must start with a letter or digit")
	case strings.IndexFunc(name, isForbidden) >= 0:
		return "", fmt.Errorf("invalid table name: only 'a-z', '0-9', and '_' are allowed")
	}

	return name, nil
}
// ValidateTableName is the exported wrapper that lets other packages validate
// a table name. It delegates to validateTableName and returns the validated
// name, or an error, unchanged.
func ValidateTableName(name string) (string, error) {
	return validateTableName(name)
}
// flattenNamespace collapses the namespace levels into one dot-separated
// string (the S3 Tables convention). A nil or empty slice yields "".
func flattenNamespace(namespace []string) string {
	// strings.Join already returns "" when there are no elements.
	return strings.Join(namespace, ".")
}