iceberg: add sort-aware compaction rewrite (#8666)

* iceberg: add sort-aware compaction rewrite

* iceberg: share filtered row iteration in compaction

* iceberg: rely on table sort order for sort rewrites

* iceberg: harden sort compaction planning

* iceberg: include rewrite strategy in planning config hash

compactionPlanningConfigHash now incorporates RewriteStrategy and
SortMaxInputBytes so cached planning results are invalidated when
sort strategy settings change. Also use the bytesPerMB constant in
compactionNoEligibleMessage.
This commit is contained in:
Chris Lu
2026-03-17 00:57:32 -07:00
committed by GitHub
parent e5c0889473
commit 55e988a7ee
8 changed files with 950 additions and 135 deletions

View File

@@ -187,19 +187,11 @@ func (h *Handler) Descriptor() *plugin_pb.JobTypeDescriptor {
{
Name: "rewrite_strategy",
Label: "Rewrite Strategy",
-Description: "binpack keeps the current row order; sort rewrites each compaction bin using sort_fields or the table sort order.",
+Description: "binpack keeps the existing row order; sort rewrites each compaction bin using the Iceberg table sort order.",
FieldType: plugin_pb.ConfigFieldType_CONFIG_FIELD_TYPE_STRING,
Widget: plugin_pb.ConfigWidget_CONFIG_WIDGET_TEXT,
Placeholder: "binpack or sort",
},
-{
-Name: "sort_fields",
-Label: "Sort Fields",
-Description: "Comma-separated field names for rewrite_strategy=sort. Blank uses the table sort order when present.",
-FieldType: plugin_pb.ConfigFieldType_CONFIG_FIELD_TYPE_STRING,
-Widget: plugin_pb.ConfigWidget_CONFIG_WIDGET_TEXT,
-Placeholder: "id, created_at",
-},
{
Name: "sort_max_input_mb",
Label: "Sort Max Input (MB)",
@@ -325,8 +317,7 @@ func (h *Handler) Descriptor() *plugin_pb.JobTypeDescriptor {
"max_commit_retries": {Kind: &plugin_pb.ConfigValue_Int64Value{Int64Value: defaultMaxCommitRetries}},
"operations": {Kind: &plugin_pb.ConfigValue_StringValue{StringValue: defaultOperations}},
"apply_deletes": {Kind: &plugin_pb.ConfigValue_BoolValue{BoolValue: true}},
-"rewrite_strategy": {Kind: &plugin_pb.ConfigValue_StringValue{StringValue: "binpack"}},
-"sort_fields": {Kind: &plugin_pb.ConfigValue_StringValue{StringValue: ""}},
+"rewrite_strategy": {Kind: &plugin_pb.ConfigValue_StringValue{StringValue: defaultRewriteStrategy}},
"sort_max_input_mb": {Kind: &plugin_pb.ConfigValue_Int64Value{Int64Value: 0}},
"where": {Kind: &plugin_pb.ConfigValue_StringValue{StringValue: ""}},
},
@@ -355,8 +346,7 @@ func (h *Handler) Descriptor() *plugin_pb.JobTypeDescriptor {
"max_commit_retries": {Kind: &plugin_pb.ConfigValue_Int64Value{Int64Value: defaultMaxCommitRetries}},
"operations": {Kind: &plugin_pb.ConfigValue_StringValue{StringValue: defaultOperations}},
"apply_deletes": {Kind: &plugin_pb.ConfigValue_BoolValue{BoolValue: true}},
-"rewrite_strategy": {Kind: &plugin_pb.ConfigValue_StringValue{StringValue: "binpack"}},
-"sort_fields": {Kind: &plugin_pb.ConfigValue_StringValue{StringValue: ""}},
+"rewrite_strategy": {Kind: &plugin_pb.ConfigValue_StringValue{StringValue: defaultRewriteStrategy}},
"sort_max_input_mb": {Kind: &plugin_pb.ConfigValue_Int64Value{Int64Value: 0}},
"where": {Kind: &plugin_pb.ConfigValue_StringValue{StringValue: ""}},
},