From ba624f1f346f7494c60ca12b317cd3a1602ed09e Mon Sep 17 00:00:00 2001 From: Chris Lu Date: Thu, 26 Mar 2026 17:24:35 -0700 Subject: [PATCH] Rust volume server implementation with CI (#8539) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * Match Go gRPC client transport defaults * Honor Go HTTP idle timeout * Honor maintenanceMBps during volume copy * Honor images.fix.orientation on uploads * Honor cpuprofile when pprof is disabled * Match Go memory status payloads * Propagate request IDs across gRPC calls * Format pending Rust source updates * Match Go stats endpoint payloads * Serve Go volume server UI assets * Enforce Go HTTP whitelist guards * Align Rust metrics admin-port test with Go behavior * Format pending Rust server updates * Honor access.ui without per-request JWT checks * Honor keepLocalDatFile in tier upload shortcut * Honor Go remote volume write mode * Load tier backends from master config * Check master config before loading volumes * Remove vif files on volume destroy * Delete remote tier data on volume destroy * Honor vif version defaults and overrides * Reject mismatched vif bytes offsets * Load remote-only tiered volumes * Report Go tail offsets in sync status * Stream remote dat in incremental copy * Honor collection vif for EC shard config * Persist EC expireAtSec in vif metadata * Stream remote volume reads through HTTP * Serve HTTP ranges from backend source * Match Go ReadAllNeedles scan order * Match Go CopyFile zero-stop metadata * Delete EC volumes with collection cleanup * Drop deleted collection metrics * Match Go tombstone ReadNeedleMeta * Match Go TTL parsing: all-digit default to minutes, two-pass fit algorithm * Match Go needle ID/cookie formatting and name size computation * Match Go image ext checks: webp resize only, no crop; empty healthz body * Match Go Prometheus metric names and add missing handler counter constants * Match Go ReplicaPlacement short string parsing with zero-padding * Add missing EC constants MAX_SHARD_COUNT and MIN_TOTAL_DISKS * Add walk_ecx_stats for accurate EC volume file counts and size * Match Go VolumeStatus dat file size, EC shard stats, and disk pct precision * Match Go needle map: unconditional delete counter, fix redb idx walk offset * Add CompactMapSegment overflow panic guard matching Go * Match Go volume: vif creation, version from superblock, TTL expiry, dedup data_size, garbage_level fallback * Match Go 304 Not Modified: return bare status with no headers * Match Go JWT error message: use "wrong jwt" instead of detailed error * Match Go read handler bare 400, delete error prefix, download throttle timeout * Match Go pretty JSON 1-space indent and "Deletion Failed:" error prefix * Match Go heartbeat: keep is_heartbeating on error, add EC shard identification * Match Go needle ReadBytes V2: tolerate EOF on truncated body * Match Go volume: cookie check on any existing needle, return DataSize, 128KB meta guard * Match Go DeleteCollection: propagate destroy errors * Match Go gRPC: BatchDelete no flag, IncrementalCopy error, FetchAndWrite concurrent, VolumeUnmount/DeleteCollection errors, tail draining, query error code * Match Go Content-Disposition RFC 6266 formatting with RFC 2231 encoding * Match Go Guard isWriteActive: combine whitelist and signing key check * Match Go DeleteCollectionMetrics: use partial label matching * Match Go heartbeat: send state-only delta on volume state changes * Match Go ReadNeedleMeta paged I/O: read header+tail only, skip data; add EIO tracking * Match Go ScrubVolume INDEX mode dispatch; add VolumeCopy preallocation and EC NeedleStatus TODOs * Add read_ec_shard_needle for full needle reconstruction from local EC shards * Make heartbeat master config helpers pub for VolumeCopy preallocation * Match Go gRPC: VolumeCopy preallocation, EC NeedleStatus full read, error message wording * Match Go HTTP responses: omitempty fields, 2-space JSON indent, JWT JSON error, delete pretty/JSONP, 304 Last-Modified, raw write error * Match Go WriteNeedleBlob V3 timestamp patching, fix makeup_diff double padding, count==0 read handling * Add rebuild_ecx_file for EC index reconstruction from data shards * Match Go gRPC: tail header first-chunk-only, EC cleanup on failure, copy append mode, ecx rebuild, compact cancellation * Add EC volume read and delete support in HTTP handlers * Add per-shard EC mount/unmount, location predicate search, idx directory for EC * Add CheckVolumeDataIntegrity on volume load matching Go * Match Go gRPC: EC multi-disk placement, per-shard mount/unmount, no auto-mount on reconstruct, streaming ReadAll/EcShardRead, ReceiveFile cleanup, version check, proxy streaming, redirect Content-Type * Match Go heartbeat metric accounting * Match Go duplicate UUID heartbeat retries * Delete expired EC volumes during heartbeat * Match Go volume heartbeat pruning * Honor master preallocate in volume max * Report remote storage info in heartbeats * Emit EC heartbeat deltas on shard changes * Match Go throttle boundary: use <= instead of <, fix pretty JSON to 1-space * Match Go write_needle_blob monotonic appendAtNs via get_append_at_ns * Match Go VolumeUnmount: idempotent success when volume not found * Match Go TTL Display: return empty string when unit is Empty Go checks `t.Unit == Empty` separately and returns "" for TTLs with nonzero count but Empty unit. Rust only checked is_empty() (count==0 && unit==0), so count>0 with unit=0 would format as "5 " instead of "". * Match Go error behavior for truncated needle data in read_body_v2 Go's readNeedleDataVersion2 returns "index out of range %d" errors (indices 1-7) when needle body or metadata fields are truncated. Rust was silently tolerating truncation and returning Ok. Now returns NeedleError::IndexOutOfRange with the matching index for each field. * Match Go download throttle: return JSON error instead of plain text * Match Go crop params: default x1/y1 to 0 when not provided * Match Go ScrubEcVolume: accumulate total_files from EC shards * Match Go ScrubVolume: count total_files even on scrub error * Match Go VolumeEcShardsCopy: set ignore_source_file_not_found for .vif * Match Go VolumeTailSender: send needle_header on every chunk * Match Go read_super_block: apply replication override from .vif * Match Go check_volume_data_integrity: verify all 10 entries, detect trailing corruption * Match Go WriteNeedleBlob: dedup check before writing during replication * handlers: use meta-only reads for HEAD * handlers: align range parsing and responses with Go * handlers: align upload parsing with Go * deps: enable webp support * Make 5bytes the default feature for idx entry compatibility * Match Go TTL: preserve original unit when count fits in byte * Fix EC locate_needle: use get_actual_size for full needle size * Fix raw body POST: only parse multipart when Content-Type contains form-data * Match Go ReceiveFile: return protocol errors in response body, not gRPC status * add docs * Match Go VolumeEcShardsCopy: append to .ecj file instead of truncating * Match Go ParsePath: support _delta suffix on file IDs for sub-file addressing * Match Go chunk manifest: add Accept-Ranges, Content-Disposition, filename fallback, MIME detection * Match Go privateStoreHandler: use proper JSON error for unsupported methods * Match Go Destroy: add only_empty parameter to reject non-empty volume deletion * Fix compilation: set_read_only_persist and set_writable return () These methods fire-and-forget save_vif internally, so gRPC callers should not try to chain .map_err() on the unit return type. * Match Go SaveVolumeInfo: check writability and propagate errors in save_vif * Match Go VolumeDelete: propagate only_empty to delete_volume for defense in depth The gRPC VolumeDelete handler had a pre-check for only_empty but then passed false to store.delete_volume(), bypassing the store-level check. Go passes req.OnlyEmpty directly to DeleteVolume. Now Rust does the same for defense in depth against TOCTOU races (though the store write lock makes this unlikely). * Match Go ProcessRangeRequest: return full content for empty/oversized ranges Go returns nil from ProcessRangeRequest when ranges are empty or total range size exceeds content length, causing the caller to serve the full content as a normal 200 response. Rust was returning an empty 200 body. * Match Go Query: quote JSON keys in output records Go's ToJson produces valid JSON with quoted keys like {"name":"Alice"}. Rust was producing invalid JSON with unquoted keys like {name:"Alice"}. * Match Go VolumeCopy: reject when no suitable disk location exists Go returns ErrVolumeNoSpaceLeft when no location matches the disk type and has sufficient space. Rust had an unsafe fallback that silently picked the first location regardless of type or available space. * Match Go DeleteVolumeNeedle: check noWriteOrDelete before allowing delete Go checks v.noWriteOrDelete before proceeding with needle deletion, returning "volume is read only" if true. Rust was skipping this check. * Match Go ReceiveFile: prefer HardDrive location for EC and use response-level write errors Two fixes: (1) Go prefers HardDriveType disk location for EC volumes, falling back to first location. Returns "no storage location available" when no locations exist. (2) Write failures are now response-level errors (in response body) instead of gRPC status errors, matching Go. * Match Go CopyFile: sync EC volume journal to disk before copying Go calls ecVolume.Sync() before copying EC volume files to ensure the .ecj journal is flushed to disk. Added sync_to_disk() to EcVolume and call it in the CopyFile EC branch. * Match Go readSuperBlock: propagate replication parse errors Go returns an error when parsing the replication string from the .vif file fails. Rust was silently ignoring the parse failure and using the super block's replication as-is. * Match Go TTL expiry: remove append_at_ns > 0 guard Go computes TTL expiry from AppendAtNs without guarding against zero. When append_at_ns is 0, the expiry is epoch + TTL which is in the past, correctly returning NotFound. Rust's extra guard skipped the check, incorrectly returning success for such needles. * Match Go delete_collection: skip volumes with compaction in progress Go checks !v.isCompactionInProgress.Load() before destroying a volume during collection deletion, skipping compacting volumes. Also changed destroy errors to log instead of aborting the entire collection delete. * Match Go MarkReadonly/MarkWritable: always notify master even on local error Go always notifies the master regardless of whether the local set_read_only_persist or set_writable step fails. The Rust code was using `?` which short-circuited on error, skipping the final master notification. Save the result and defer the `?` until after the notify call. * Match Go PostHandler: return 500 for all write errors Go returns 500 (InternalServerError) for all write failures. Rust was returning 404 for volume-not-found and 403 for read-only volumes. * Match Go makeupDiff: validate .cpd compaction revision is old + 1 Go reads the new .cpd file's super block and verifies the compaction revision is exactly old + 1. Rust only validated the old revision. * Match Go VolumeStatus: check data backend before returning status Go checks v.DataBackend != nil before building the status response, returning an error if missing. Rust was silently returning size 0. * Match Go PostHandler: always include mime field in upload response JSON Go always serializes the mime field even when empty ("mime":""). Rust was omitting it when empty due to Option with skip_serializing_if. * Match Go FindFreeLocation: account for EC shards in free slot calculation Go subtracts EC shard equivalents when computing available volume slots. Rust was only comparing volume count, potentially over-counting free slots on locations with many EC shards. * Match Go privateStoreHandler: use INVALID as metrics label for unsupported methods Go records the method as INVALID in metrics for unsupported HTTP methods. Rust was using the actual method name. * Match Go volume: add commit_compact guard and scrub data size validation Two fixes: (1) commit_compact now checks/sets is_compacting flag to prevent concurrent commits, matching Go's CompareAndSwap guard. (2) scrub now validates total needle sizes against .dat file size. * Match Go gRPC: fix TailSender error propagation, EcShardsInfo all slots, EcShardRead .ecx check Three fixes: (1) VolumeTailSender now propagates binary search errors instead of silently falling back to start. (2) VolumeEcShardsInfo returns entries for all shard slots including unmounted. (3) VolumeEcShardRead checks .ecx index for deletions instead of .ecj. * Match Go metrics: add BuildInfo gauge and connection tracking functions Go exposes a BuildInfo Prometheus metric with version labels, and tracks open connections via stats.ConnectionOpen/Close. Added both to Rust. * Match Go NeedleMap.Delete: use !is_deleted() instead of is_valid() Go's CompactMap.Delete checks !IsDeleted() not IsValid(), so needles with size==0 (live but anomalous) can still be deleted. The Rust code was using is_valid() which returns false for size==0, preventing deletion of such needles. * Match Go fitTtlCount: always normalize TTL to coarsest unit Go's fitTtlCount always converts to seconds first, then finds the coarsest unit that fits in one byte (e.g., 120m → 2h). Rust had an early return for count<=255 that skipped normalization, producing different binary encodings for the same duration. * Match Go BuildInfo metric: correct name and add missing labels Go uses SeaweedFS_build_info (Namespace=SeaweedFS, Subsystem=build, Name=info) with labels [version, commit, sizelimit, goos, goarch]. Rust had SeaweedFS_volumeServer_buildInfo with only [version]. * Match Go HTTP handlers: fix UploadResult fields, DiskStatus JSON, chunk manifest ETag - UploadResult.mime: add skip_serializing_if to omit empty MIME (Go uses omitempty) - UploadResult.contentMd5: only include when request provided Content-MD5 header - Content-MD5 response header: only set when request provided it - DiskStatuses: use camelCase field names (percentFree, percentUsed, diskType) to match Go's protobuf JSON marshaling - Chunk manifest: preserve needle ETag in expanded response headers * Match Go volume: fix version(), integrity check, scrub, and commit_compact - version(): use self.version() instead of self.super_block.version in read_all_needles, check_volume_data_integrity, scan_raw_needles_from to respect volumeInfo.version override - check_volume_data_integrity: initialize healthy_index_size to idx_size (matching Go) and continue on EOF instead of returning error - scrub(): count deleted needles in total_read since they still occupy space in the .dat file (matches Go's totalRead += actualSize for deleted) - commit_compact: clean up .cpd/.cpx files on makeup_diff failure (matches Go's error path cleanup) * Match Go write queue: add 4MB batch byte limit Go's startWorker breaks the batch at either 128 requests or 4MB of accumulated write data. Rust only had the 128-request limit, allowing large writes to accumulate unbounded latency. * Add TTL normalization tests for Go parity verification Test that fit_ttl_count normalizes 120m→2h, 24h→1d, 7d→1w even when count fits in a byte, matching Go's fitTtlCount behavior. * Match Go FindFreeLocation: account for EC shards in free slot calculation Go's free volume count subtracts both regular volumes and EC volumes from max_volume_count. Rust was only counting regular volumes, which could over-report available slots when EC shards are mounted. * Match Go EC volume: mark deletions in .ecx and replay .ecj at startup Go's DeleteNeedleFromEcx marks needles as deleted in the .ecx index in-place (writing TOMBSTONE_FILE_SIZE at the size field) in addition to appending to the .ecj journal. Go's RebuildEcxFile replays .ecj entries into .ecx on startup, then removes the .ecj file. Rust was only appending to .ecj without marking .ecx, which meant deleted EC needles remained readable via .ecx binary search. This fix: - Opens .ecx in read/write mode (was read-only) - Adds mark_needle_deleted_in_ecx: binary search + in-place write - Calls it from journal_delete before appending to .ecj - Adds rebuild_ecx_from_journal: replays .ecj into .ecx on startup * Match Go check_all_ec_shards_deleted: use MAX_SHARD_COUNT instead of hardcoded 14 Go's TotalShardsCount is DataShardsCount + ParityShardsCount = 14 by default, but custom EC configs via .vif can have more shards (up to MaxShardCount = 32). Using MAX_SHARD_COUNT ensures all shard files are checked regardless of EC configuration. * Match Go EC locate: subtract 1 from shard size and use datFileSize override Go's LocateEcShardNeedleInterval passes shard.ecdFileSize-1 to LocateData (shards are padded, -1 avoids overcounting large block rows). When datFileSize is known, Go uses datFileSize/DataShards instead. Rust was passing the raw shard file size without adjustment. * Fix TTL parsing and DiskStatus field names to match Go exactly TTL::read: Go's ReadTTL preserves the original unit (7d stays 7d, not 1w) and errors on count > 255. The previous normalization change was incorrect — Go only normalizes internally via fitTtlCount, not during string parsing. DiskStatus: Go uses encoding/json on protobuf structs, which reads the json struct tags (snake_case: percent_free, percent_used, disk_type), not the protobuf JSON names (camelCase). Revert to snake_case to match Go's actual output. * Fix heartbeat: check leader != current master before redirect, process duplicated UUIDs first Match Go's volume_grpc_client_to_master.go behavior: 1. Only trigger leader redirect when the leader address differs from the current master (prevents unnecessary reconnect loops when master confirms its own address). 2. Process duplicated_uuids before leader redirect check, matching Go's ordering where duplicate UUID detection takes priority. * Remove SetState version check to match Go behavior Go's SetState unconditionally applies the state without any version mismatch check. The Rust version had an extra optimistic concurrency check that would reject valid requests from Go clients that don't track versions. * Fix TTL::read() to normalize via fit_ttl_count matching Go's ReadTTL Go's ReadTTL calls fitTtlCount which converts to seconds and normalizes to the coarsest unit that fits in a byte count (e.g. 120m->2h, 7d->1w, 24h->1d). The Rust version was preserving the original unit, producing different binary encodings on disk and in heartbeat messages. * Always return Content-MD5 header and JSON field on successful writes Go always sets Content-MD5 in the response regardless of whether the request included it. The Rust version was conditionally including it only when the request provided Content-MD5. * Include name and size in UploadResult JSON even when empty/zero Go's encoding/json always includes empty strings and zero values in the upload response. The Rust version was using skip_serializing_if to omit them, causing JSON structure differences. * Include deleted needles in scan_raw_needles_from to match Go Go's ScanVolumeFileFrom visits ALL needles including deleted ones. Skipping deleted entries during incremental copy would cause tombstones to not be propagated, making deleted files reappear on the receiving side. * Match Go NeedleMap.Delete: always write tombstone to idx file Go's NeedleMap.Delete unconditionally writes a tombstone entry to the idx file and updates metrics, even if the needle doesn't exist or is already deleted. This is important for replication where every delete operation must produce an idx write. The Rust version was skipping the tombstone write for non-existent or already-deleted needles. * Limit MIME type to 255 bytes matching Go's CreateNeedleFromRequest * Title-case Seaweed-* pair keys to match Go HTTP header canonicalization * Unify DiskType::Hdd into HardDrive to match Go's single HardDriveType * Skip tombstone entries in walk_ecx_stats total_size matching Go's Raw() * Return EMPTY TTL when computed seconds is zero matching Go's fitTtlCount * Include disk-space-low in Volume.is_read_only() matching Go * Log error on CIDR parse failure in whitelist matching Go's glog.Errorf * Log cookie mismatch in gRPC Query matching Go's V(0).Infof * Fix is_expired volume_size comparison to use < matching Go Go checks `volumeSize < super_block.SuperBlockSize` (strict less-than), but Rust used `<=`. This meant Rust would fail to expire a volume that is exactly SUPER_BLOCK_SIZE bytes. * Apply Go's JWT expiry defaults: 10s write, 60s read Go calls v.SetDefault("jwt.signing.expires_after_seconds", 10) and v.SetDefault("jwt.signing.read.expires_after_seconds", 60). Rust defaulted to 0 for both, which meant tokens would never expire when security.toml has a signing key but omits expires_after_seconds. * Stop [grpc.volume].ca from overriding [grpc].ca matching Go Go reads the gRPC CA file only from config.GetString("grpc.ca"), i.e. the [grpc] section. The [grpc.volume] section only provides cert and key. Rust was also reading ca from [grpc.volume] which would silently override the [grpc].ca value when both were present. * Fix free_volume_count to use EC shard count matching Go Was counting EC volumes instead of EC shards, which underestimates EC space usage. One EC volume with 14 shards uses ~1.4 volume slots, not 1. Now uses Go's formula: ((max - volumes) * DataShardsCount - ecShardCount) / DataShardsCount. * Include preallocate in compaction space check matching Go Go uses max(preallocate, estimatedCompactSize) for the free space check. Rust was only using the estimated volume size, which could start a compaction that fails mid-way if preallocate exceeds the volume size. * Check gzip magic bytes before setting Content-Encoding matching Go Go checks both Accept-Encoding contains "gzip" AND IsGzippedContent (data starts with 0x1f 0x8b) before setting Content-Encoding: gzip. Rust only checked Accept-Encoding, which could incorrectly declare gzip encoding for non-gzip compressed data. * Only set upload response name when needle HasName matching Go Go checks reqNeedle.HasName() before setting ret.Name. Rust always set the name from the filename variable, which could return the fid portion of the path as the name for raw PUT requests without a filename. * Treat MaxVolumeCount==0 as unlimited matching Go's hasFreeDiskLocation Go's hasFreeDiskLocation returns true immediately when MaxVolumeCount is 0, treating it as unlimited. Rust was computing effective_free as <= 0 for max==0, rejecting the location. This could fail volume creation during early startup before the first heartbeat adjusts max. * Read lastAppendAtNs from deleted V3 entries in integrity check Go's doCheckAndFixVolumeData reads AppendAtNs from both live entries (verifyNeedleIntegrity) and deleted tombstones (verifyDeletedNeedleIntegrity). Rust was skipping deleted entries, which could result in a stale last_append_at_ns if the last index entry is a deletion. * Return empty body for empty/oversized range requests matching Go Go's ProcessRangeRequest returns nil (empty body, 200 OK) when parsed ranges are empty or combined range size exceeds total content size. The Rust buffered path incorrectly returned the full file data for both cases. The streaming path already handled this correctly. * Dispatch ScrubEcVolume by mode matching Go's INDEX/LOCAL/FULL Go's ScrubEcVolume switches on mode: INDEX calls v.ScrubIndex() (ecx integrity only), LOCAL calls v.ScrubLocal(), FULL calls vs.store.ScrubEcVolume(). Rust was ignoring the mode and always running verify_ec_shards. Now INDEX mode checks ecx index integrity (sorted overlap detection + file size validation) without shard I/O, while LOCAL/FULL modes run the existing shard verification. * Fix TTL test expectation: 7d normalizes to 1w matching Go's fitTtlCount Go's ReadTTL calls fitTtlCount which normalizes to the coarsest unit that fits: 7 days = 1 week, so "7d" becomes {Count:1, Unit:Week} which displays as "1w". Both Go and Rust normalize identically. * Add version mismatch check to SetState matching Go's State.Update Go's State.Update compares the incoming version with the stored version and returns "version mismatch" error if they differ. This provides optimistic concurrency control. The Rust implementation was accepting any version unconditionally. * Use unquoted keys in Query JSON output matching Go's json.ToJson Go's json.ToJson produces records with unquoted keys like {score:12} not {"score":12}. This is a custom format used internally by SeaweedFS for query results. * Fix TTL test expectation in VolumeNeedleStatus: 7d normalizes to 1w Same normalization as the HTTP test: Go's ReadTTL calls fitTtlCount which converts 7 days to 1 week. * Include ETag header in 304 Not Modified responses matching Go behavior Go sets ETag on the response writer (via SetEtag) before the If-Modified-Since and If-None-Match conditional checks, so both 304 response paths include the ETag header. The Rust implementation was only adding ETag to 200 responses. * Remove needle-name fallback in chunk manifest filename resolution Go's tryHandleChunkedFile only falls back from URL filename to manifest name. Rust had an extra fallback to needle.name that Go does not perform, which could produce different Content-Disposition filenames for chunk manifests. * Validate JWT nbf (Not Before) claim matching Go's jwt-go/v5 Go's jwt.ParseWithClaims validates the nbf claim when present, rejecting tokens whose nbf is in the future. The Rust jsonwebtoken crate defaults validate_nbf to false, so tokens with future nbf were incorrectly accepted. * Set isHeartbeating to true at startup matching Go's VolumeServer init Go unconditionally sets isHeartbeating: true in the VolumeServer struct literal. Rust was starting with false when masters are configured, causing /healthz to return 503 until the first heartbeat succeeds. * Call store.close() on shutdown matching Go's Shutdown() Go's Shutdown() calls vs.store.Close() which closes all volumes and flushes file handles. The Rust server was relying on process exit for cleanup, which could leave data unflushed. * Include server ID in maintenance mode error matching Go's format Go returns "volume server %s is in maintenance mode" with the store ID. Rust was returning a generic "maintenance mode" message. * Fix DiskType test: use HardDrive variant matching Go's HddType="" Go maps both "" and "hdd" to HardDriveType (empty string). The Rust enum variant is HardDrive, not Hdd. The test referenced a nonexistent Hdd variant causing compilation failure. * Do not include ETag in 304 responses matching Go's GetOrHeadHandler Go sets ETag at L235 AFTER the If-Modified-Since and If-None-Match 304 return paths, so Go's 304 responses do not include the ETag header. The Rust code was incorrectly including ETag in both 304 response paths. * Return 400 on malformed query strings in PostHandler matching Go's ParseForm Go's r.ParseForm() returns HTTP 400 with "form parse error: ..." when the query string is malformed. Rust was silently falling back to empty query params via unwrap_or_default(). * Load EC volume version from .vif matching Go's NewEcVolume Go sets ev.Version = needle.Version(volumeInfo.Version) from the .vif file. Rust was always using Version::current() (V3), which would produce wrong needle actual size calculations for volumes created with V1 or V2. * Sync .ecx file before close matching Go's EcVolume.Close Go calls ev.ecxFile.Sync() before closing to ensure in-place deletion marks are flushed to disk. Without this, deletion marks written via MarkNeedleDeleted could be lost on crash. * Validate SuperBlock extra data size matching Go's Bytes() guard Go checks extraSize > 256*256-2 and calls glog.Fatalf to prevent corrupt super block headers. Rust was silently truncating via u16 cast, which would write an incorrect extra_size field. * Update quinn-proto 0.11.13 -> 0.11.14 to fix GHSA-6xvm-j4wr-6v98 Fixes Dependency Review CI failure: quinn-proto < 0.11.14 is vulnerable to unauthenticated remote DoS via panic in QUIC transport parameter parsing. * Skip TestMultipartUploadUsesFormFieldsForTimestampAndTTL for Go server Go's r.FormValue() cannot read multipart text fields after r.MultipartReader() consumes the body, so ts/ttl sent as multipart form fields only work with the Rust volume server. Skip this test when VOLUME_SERVER_IMPL != "rust" to fix CI failure. * Flush .ecx in EC volume sync_to_disk matching Go's Sync() Go's EcVolume.Sync() flushes both the .ecj journal and the .ecx index to disk. The Rust version only flushed .ecj, leaving in-place deletion marks in .ecx unpersisted until close(). This could cause data inconsistency if the server crashes after marking a needle deleted in .ecx but before close(). * Remove .vif file in EC volume destroy matching Go's Destroy() Go's EcVolume.Destroy() removes .ecx, .ecj, and .vif files. The Rust version only removed .ecx and .ecj, leaving orphaned .vif files on disk after EC volume destruction (e.g., after TTL expiry). * Fix is_expired to use <= for SuperBlockSize check matching Go Go checks contentSize <= SuperBlockSize to detect empty volumes (no needles). Rust used < which would incorrectly allow a volume with exactly SuperBlockSize bytes (header only, no data) to proceed to the TTL expiry check and potentially be marked as expired. * Fix read_append_at_ns to read timestamps from tombstone entries Go reads the full needle body for all entries including tombstones (deleted needles with size=0) to extract the actual AppendAtNs timestamp. The Rust version returned 0 early for size <= 0 entries, which would cause the binary search in incremental copy to produce incorrect results for positions containing deleted needles. Now uses get_actual_size to compute the on-disk size (which handles tombstones correctly) and only returns 0 when the actual size is 0. * Add X-Request-Id response header matching Go's requestIDMiddleware Go sets both X-Request-Id and x-amz-request-id response headers. The Rust server only set x-amz-request-id, missing X-Request-Id. * Add skip_serializing_if for UploadResult name and size fields Go's UploadResult uses json:"name,omitempty" and json:"size,omitempty", omitting these fields from JSON when they are zero values (empty string / 0). The Rust struct always serialized them, producing "name":"" and "size":0 where Go would omit them. * Support JSONP/pretty-print for write success responses Go's writeJsonQuiet checks for callback (JSONP) and pretty query parameters on all JSON responses including write success. The Rust write success path used axum::Json directly, bypassing JSONP and pretty-print support. Now uses json_result_with_query to match Go. * Include actual limit in file size limit error message Go returns "file over the limited %d bytes" with the actual limit value included. Rust returned a generic "file size limit exceeded" without the limit value, making it harder to debug. * Extract extension from 2-segment URL paths for image operations Go's parseURLPath extracts the file extension from all URL formats including 2-segment paths like /vid,fid.jpg. The Rust version only handled 3-segment paths (/vid/fid/filename.ext), so extensions in 2-segment paths were lost. This caused image resize/crop operations requested via query params to be silently skipped for those paths. * Add size_hint to TrackedBody so throttled downloads get Content-Length TrackedBody (used for download throttling) did not implement size_hint(), causing HTTP/1.1 to fall back to chunked transfer encoding instead of setting Content-Length. Go always sets Content-Length explicitly for non-range responses. * Add Last-Modified, pairs, and S3 headers to chunk manifest responses Go sets Last-Modified, needle pairs, and S3 pass-through headers on the response writer BEFORE calling tryHandleChunkedFile. Since the Rust chunk manifest handler created fresh response headers and returned early, these headers were missing from chunk manifest responses. Now passes last_modified_str into the chunk manifest handler and applies pairs and S3 pass-through query params (response-cache-control, response-content-encoding, etc.) to the chunk manifest response headers. * Fix multipart fallback to use first part data when no filename Go reads the first part's data unconditionally, then looks for a part with a filename. If none found, Go uses the first part's data (with empty filename). Rust only captured parts with filenames, so when no part had a filename it fell back to the raw multipart body bytes (including boundary delimiters), producing corrupt needle data. * Set HasName and HasMime flags for empty values matching Go Go's CreateNeedleFromRequest sets HasName and HasMime flags even when the filename or MIME type is empty (len < 256 is true for len 0). Rust skipped empty values, causing the on-disk needle format to differ: Go-written needles include extra bytes for the empty name/mime size fields, changing the serialized needle size in the idx entry. This ensures binary format compatibility between Go and Rust servers. * Add is_stopping guard to vacuum_volume_commit matching Go Go's CommitCompactVolume (store_vacuum.go L53-54) checks s.isStopping before committing compaction to prevent file swaps during shutdown. The Rust handler was missing this check, which could allow compaction commits while the server is stopping. * Remove disk_type from required status fields since Go omits it Go's default DiskType is "" (HardDriveType), and protobuf's omitempty tag causes empty strings to be dropped from JSON output. * test: honor rust env in dual volume harness * grpc: notify master after volume lifecycle changes * http: proxy to replicas before download-limit timeout * test: pass readMode to rust volume harnesses * fix store free-location predicate selection * fix volume copy disk placement and heartbeat notification * fix chunk manifest delete replication * fix write replication to survive client disconnects * fix download limit proxy and wait flow * fix crop gating for streamed reads * fix upload limit wait counter behavior * fix chunk manifest image transforms * fix has_resize_ops to check width/height > 0 instead of is_some() Go's shouldResizeImages condition is `width > 0 || height > 0`, so `?width=0` correctly evaluates to false. Rust was using `is_some()` which made `?width=0` evaluate to true, unnecessarily disabling streaming reads for those requests. * fix Content-MD5 to only compute and return when provided by client Go only computes the MD5 of uncompressed data when a Content-MD5 header or multipart field is provided. Rust was always computing and returning it. Also fix the mismatch error message to include size, matching Go's format. * fix save_vif to compute ExpireAtSec from TTL Go's SaveVolumeInfo always computes ExpireAtSec = now + ttlSeconds when the volume has a TTL. The save_vif path (used by set_read_only and set_writable) was missing this computation, causing .vif files to be written without the correct expiration timestamp for TTL volumes. * fix set_writable to not modify no_write_can_delete Go's MarkVolumeWritable only sets noWriteOrDelete=false and persists. Rust was additionally setting no_write_can_delete=has_remote_file, which could incorrectly change the write mode for remote-file volumes when the master explicitly asks to make the volume writable. * fix write_needle_blob_and_index to error on too-small V3 blob Go returns an error when the needle blob is too small for timestamp patching. Rust was silently skipping the patch and writing the blob with a stale/zero timestamp, which could cause data integrity issues during incremental replication that relies on AppendAtNs ordering. * fix VolumeEcShardsToVolume to validate dataShards range Go validates that dataShards is > 0 and <= MaxShardCount before proceeding with EC-to-volume reconstruction. Without this check, a zero or excessively large data_shards value could cause confusing downstream failures. * fix destroy to use VolumeError::NotEmpty instead of generic Io error The dedicated NotEmpty variant exists in the enum but was not being used. This makes error matching consistent with Go's ErrVolumeNotEmpty. * fix SetState to persist state to disk with rollback on failure Go's State.Update saves VolumeServerState to a state.pb file after each SetState call, and rolls back the in-memory state if persistence fails. Rust was only updating in-memory atomics, so maintenance mode would be lost on server restart. Now saves protobuf-encoded state.pb and loads it on startup. * fix VolumeTierMoveDatToRemote to close local dat backend after upload Go calls v.LoadRemoteFile() after saving volume info, which closes the local DataBackend before transitioning to remote storage. Without this, the volume holds a stale file handle to the deleted local .dat file, causing reads to fail until server restart. * fix VolumeTierMoveDatFromRemote to close remote dat backend after download Go calls v.DataBackend.Close() and sets DataBackend=nil after removing the remote file reference. Without this, the stale remote backend state lingers and reads may not discover the newly downloaded local .dat file until server restart. * fix redirect to use internal url instead of public_url Go's proxyReqToTargetServer builds the redirect Location header from loc.Url (the internal URL), not publicUrl. Using public_url could cause redirect failures when internal and external URLs differ. * fix redirect test and add state_file_path to integration test Update redirect unit test to expect internal url (matching the previous fix). Add missing state_file_path field to the integration test VolumeServerState constructor. * fix FetchAndWriteNeedle to await all writes before checking errors Go uses a WaitGroup to await all writes (local + replicas) before checking errors. Rust was short-circuiting on local write failure, which could leave replica writes in-flight without waiting for completion. * fix shutdown to send deregister heartbeat before pre_stop delay Go's StopHeartbeat() closes stopChan immediately on interrupt, causing the heartbeat goroutine to send the deregister heartbeat right away, before the preStopSeconds delay. Rust was only setting is_stopping=true without waking the heartbeat loop, so the deregister was delayed until after the pre_stop sleep. Now we call volume_state_notify.notify_one() to wake the heartbeat immediately. * fix heartbeat response ordering to check duplicate UUIDs first Go processes heartbeat responses in this order: DuplicatedUuids first, then volume options (prealloc/size limit), then leader redirect. Rust was applying volume options before checking for duplicate UUIDs, which meant volume option changes would take effect even when the response contained a duplicate UUID error that should cause an immediate return. * the test thread was blocked * fix(deps): update aws-lc-sys 0.38.0 → 0.39.0 to resolve security advisories Bumps aws-lc-rs 1.16.1 → 1.16.2, pulling in aws-lc-sys 0.39.0 which fixes GHSA-394x-vwmw-crm3 (X.509 Name Constraints wildcard/unicode bypass) and GHSA-9f94-5g5w-gf6r (CRL Distribution Point scope check logic error). * fix: match Go Content-MD5 mismatch error message format Go uses "Content-MD5 did not match md5 of file data expected [X] received [Y] size Z" while Rust had a shorter format. Match the exact Go error string so clients see identical messages. * fix: match Go Bearer token length check (> 7, not >= 7) Go requires len(bearer) > 7 ensuring at least one char after "Bearer ". Rust used >= 7 which would accept an empty token. * fix(deps): drop legacy rustls 0.21 to resolve rustls-webpki GHSA-pwjx-qhcg-rvj4 aws-sdk-s3's default "rustls" feature enables tls-rustls in aws-smithy-runtime, which pulls in legacy-rustls-ring (rustls 0.21 → rustls-webpki 0.101.7, moderate CRL advisory). Replace with explicit default-https-client which uses only rustls 0.23 / rustls-webpki 0.103.9. * fix: use uploaded filename for auto-compression extension detection Go extracts the file extension from pu.FileName (the uploaded filename) for auto-compression decisions. Rust was using the URL path, which typically has no extension for SeaweedFS file IDs. * fix: add CRC legacy Value() backward-compat check on needle read Go double-checks CRC: n.Checksum != crc && uint32(n.Checksum) != crc.Value(). The Value() path is a deprecated transform for compat with seaweed versions prior to commit 056c480eb. Rust had the legacy_value() method but wasn't using it in validation. * fix: remove /stats/* endpoints to match Go (commented out since L130) Go's volume_server.go has the /stats/counter, /stats/memory, and /stats/disk endpoints commented out (lines 130-134). Remove them from the Rust router along with the now-unused whitelist_guard middleware. * fix: filter application/octet-stream MIME for chunk manifests Go's tryHandleChunkedFile (L334) filters out application/octet-stream from chunk manifest MIME types, falling back to extension-based detection. Rust was returning the stored MIME as-is for manifests. * fix: VolumeMarkWritable returns error before notifying master Go returns early at L200 if MarkVolumeWritable fails, before reaching the master notification at L206. Rust was notifying master even on failure, creating inconsistent state where master thinks the volume is writable but local marking failed. * fix: check volume existence before maintenance in MarkReadonly/Writable Go's VolumeMarkReadonly (L239-241) and VolumeMarkWritable (L253-255) look up the volume first, then call makeVolumeReadonly/Writable which checks maintenance. Rust was checking maintenance first, returning "maintenance mode" instead of "not found" for missing volumes. * feat: implement ScrubVolume mark_broken_volumes_readonly (PR #8360) Add the mark_broken_volumes_readonly flag from PR #8360: - Sync proto field (tag 3) to local volume_server.proto - After scrubbing, if flag is set, call makeVolumeReadonly on each broken volume (notify master, mark local readonly, notify again) - Collect errors via joined error semantics matching Go's errors.Join - Factor out make_volume_readonly helper reused by both VolumeMarkReadonly and ScrubVolume Also refactors VolumeMarkReadonly to use the shared helper. * fix(deps): update rustls-webpki 0.103.9 → 0.103.10 (GHSA-pwjx-qhcg-rvj4) CRL Distribution Point matching logic fix for moderate severity advisory about CRLs not considered authoritative. * test: update integration tests for removed /stats/* endpoints Replace tests that expected /stats/* routes to return 200/401 with tests confirming they now fall through to the store handler (400), matching Go's commented-out stats endpoints. * docs: fix misleading comment about default offset feature The comment said "4-byte offsets unless explicitly built with 5-byte support" but the default feature enables 5bytes. This is intentional for production parity with Go -tags 5BytesOffset builds. Fix the comment to match reality. --- .../workflows/rust-volume-server-tests.yml | 242 + .github/workflows/rust_binaries_dev.yml | 165 + .github/workflows/rust_binaries_release.yml | 215 + .gitignore | 1 + VOLUME_SERVER_RUST_PLAN.md | 790 +++ docker/Dockerfile.go_build | 22 + docker/entrypoint.sh | 14 + install.sh | 275 + seaweed-volume/Cargo.lock | 5255 +++++++++++++++++ seaweed-volume/Cargo.toml | 137 + seaweed-volume/DEV_PLAN.md | 105 + seaweed-volume/MISSING_FEATURES.md | 288 + seaweed-volume/PARITY_PLAN.md | 230 + seaweed-volume/README.md | 140 + seaweed-volume/build.rs | 17 + seaweed-volume/proto/master.proto | 474 ++ seaweed-volume/proto/remote.proto | 76 + seaweed-volume/proto/volume_server.proto | 759 +++ seaweed-volume/src/config.rs | 1697 ++++++ seaweed-volume/src/images.rs | 275 + seaweed-volume/src/lib.rs | 27 + seaweed-volume/src/main.rs | 1051 ++++ seaweed-volume/src/metrics.rs | 448 ++ seaweed-volume/src/remote_storage/mod.rs | 157 + seaweed-volume/src/remote_storage/s3.rs | 186 + seaweed-volume/src/remote_storage/s3_tier.rs | 514 ++ seaweed-volume/src/security.rs | 481 ++ seaweed-volume/src/security/tls.rs | 437 ++ seaweed-volume/src/server/debug.rs | 159 + seaweed-volume/src/server/favicon.ico | Bin 0 -> 70 bytes seaweed-volume/src/server/grpc_client.rs | 206 + seaweed-volume/src/server/grpc_server.rs | 4536 ++++++++++++++ seaweed-volume/src/server/handlers.rs | 3913 ++++++++++++ seaweed-volume/src/server/heartbeat.rs | 1576 +++++ seaweed-volume/src/server/memory_status.rs | 102 + seaweed-volume/src/server/mod.rs | 12 + seaweed-volume/src/server/profiling.rs | 187 + seaweed-volume/src/server/request_id.rs | 137 + seaweed-volume/src/server/server_stats.rs | 248 + seaweed-volume/src/server/ui.rs | 507 ++ seaweed-volume/src/server/volume_server.rs | 394 ++ seaweed-volume/src/server/write_queue.rs | 330 ++ seaweed-volume/src/storage/disk_location.rs | 951 +++ .../src/storage/erasure_coding/ec_decoder.rs | 261 + .../src/storage/erasure_coding/ec_encoder.rs | 824 +++ .../src/storage/erasure_coding/ec_locate.rs | 223 + .../src/storage/erasure_coding/ec_shard.rs | 225 + .../src/storage/erasure_coding/ec_volume.rs | 944 +++ .../src/storage/erasure_coding/mod.rs | 16 + seaweed-volume/src/storage/idx/mod.rs | 116 + seaweed-volume/src/storage/mod.rs | 9 + seaweed-volume/src/storage/needle/crc.rs | 73 + seaweed-volume/src/storage/needle/mod.rs | 7 + seaweed-volume/src/storage/needle/needle.rs | 944 +++ seaweed-volume/src/storage/needle/ttl.rs | 302 + seaweed-volume/src/storage/needle_map.rs | 1438 +++++ .../src/storage/needle_map/compact_map.rs | 375 ++ seaweed-volume/src/storage/store.rs | 1297 ++++ seaweed-volume/src/storage/super_block.rs | 289 + seaweed-volume/src/storage/types.rs | 679 +++ seaweed-volume/src/storage/volume.rs | 4246 +++++++++++++ seaweed-volume/src/version.rs | 79 + seaweed-volume/tests/http_integration.rs | 677 +++ .../tools/generate_go_volume_docs.go | 1172 ++++ .../vendor/reed-solomon-erasure/.cargo-ok | 1 + .../reed-solomon-erasure/.cargo_vcs_info.json | 6 + .../reed-solomon-erasure/.gitattributes | 3 + .../vendor/reed-solomon-erasure/.gitignore | 2 + .../vendor/reed-solomon-erasure/CHANGELOG.md | 181 + .../vendor/reed-solomon-erasure/Cargo.toml | 87 + .../reed-solomon-erasure/Cargo.toml.orig | 56 + .../vendor/reed-solomon-erasure/LICENSE | 24 + .../vendor/reed-solomon-erasure/README.md | 166 + .../benches/reconstruct.rs | 108 + .../vendor/reed-solomon-erasure/build.rs | 196 + .../sage/galois_ext_test.sage | 26 + .../reed-solomon-erasure/simd_c/reedsolomon.c | 574 ++ .../reed-solomon-erasure/simd_c/reedsolomon.h | 54 + .../vendor/reed-solomon-erasure/src/core.rs | 927 +++ .../vendor/reed-solomon-erasure/src/errors.rs | 158 + .../reed-solomon-erasure/src/galois_16.rs | 412 ++ .../reed-solomon-erasure/src/galois_8.rs | 621 ++ .../vendor/reed-solomon-erasure/src/lib.rs | 200 + .../vendor/reed-solomon-erasure/src/macros.rs | 245 + .../vendor/reed-solomon-erasure/src/matrix.rs | 425 ++ .../src/tests/galois_16.rs | 489 ++ .../reed-solomon-erasure/src/tests/mod.rs | 2619 ++++++++ test/s3/normal/s3_integration_test.go | 109 +- test/s3/policy/policy_test.go | 84 + test/volume_server/framework/cluster.go | 74 +- test/volume_server/framework/cluster_dual.go | 6 +- .../framework/cluster_interface.go | 63 + .../framework/cluster_interface_test.go | 20 + .../framework/cluster_multi_rust.go | 289 + test/volume_server/framework/cluster_rust.go | 342 ++ .../framework/cluster_rust_test.go | 38 + test/volume_server/grpc/admin_extra_test.go | 75 +- .../grpc/admin_lifecycle_test.go | 10 +- .../grpc/admin_readonly_collection_test.go | 8 +- test/volume_server/grpc/batch_delete_test.go | 8 +- .../grpc/copy_receive_variants_test.go | 73 +- test/volume_server/grpc/copy_sync_test.go | 19 +- test/volume_server/grpc/data_rw_test.go | 6 +- .../grpc/data_stream_success_test.go | 161 +- .../volume_server/grpc/erasure_coding_test.go | 18 +- .../grpc/fetch_remote_s3_test.go | 288 + test/volume_server/grpc/health_state_test.go | 4 +- .../grpc/move_tail_timestamp_test.go | 4 +- .../grpc/production_features_test.go | 338 ++ test/volume_server/grpc/scrub_query_test.go | 16 +- test/volume_server/grpc/tail_test.go | 8 +- .../volume_server/grpc/tiering_remote_test.go | 10 +- test/volume_server/grpc/vacuum_test.go | 4 +- test/volume_server/http/admin_test.go | 21 +- test/volume_server/http/auth_test.go | 14 +- .../volume_server/http/chunk_manifest_test.go | 6 +- .../http/compressed_read_test.go | 2 +- .../volume_server/http/headers_static_test.go | 44 +- .../http/image_transform_test.go | 2 +- .../http/production_features_test.go | 387 ++ .../http/public_cors_methods_test.go | 54 +- .../volume_server/http/range_variants_test.go | 4 +- test/volume_server/http/read_deleted_test.go | 2 +- .../http/read_path_variants_test.go | 8 +- .../http/read_write_delete_test.go | 4 +- .../http/replication_lifecycle_test.go | 63 + test/volume_server/http/throttling_test.go | 18 +- .../http/write_delete_variants_test.go | 4 +- .../http/write_error_variants_test.go | 4 +- test/volume_server/loadtest/loadtest_test.go | 628 ++ test/volume_server/matrix/config_profiles.go | 1 + test/volume_server/rust/rust_volume_test.go | 310 + weed/pb/Makefile | 1 + weed/storage/volume.go | 2 +- weed/storage/volume_vacuum.go | 1 + weed/storage/volume_write.go | 2 + 136 files changed, 52964 insertions(+), 205 deletions(-) create mode 100644 .github/workflows/rust-volume-server-tests.yml create mode 100644 .github/workflows/rust_binaries_dev.yml create mode 100644 .github/workflows/rust_binaries_release.yml create mode 100644 VOLUME_SERVER_RUST_PLAN.md create mode 100755 install.sh create mode 100644 seaweed-volume/Cargo.lock create mode 100644 seaweed-volume/Cargo.toml create mode 100644 seaweed-volume/DEV_PLAN.md create mode 100644 seaweed-volume/MISSING_FEATURES.md create mode 100644 seaweed-volume/PARITY_PLAN.md create mode 100644 seaweed-volume/README.md create mode 100644 seaweed-volume/build.rs create mode 100644 seaweed-volume/proto/master.proto create mode 100644 seaweed-volume/proto/remote.proto create mode 100644 seaweed-volume/proto/volume_server.proto create mode 100644 seaweed-volume/src/config.rs create mode 100644 seaweed-volume/src/images.rs create mode 100644 seaweed-volume/src/lib.rs create mode 100644 seaweed-volume/src/main.rs create mode 100644 seaweed-volume/src/metrics.rs create mode 100644 seaweed-volume/src/remote_storage/mod.rs create mode 100644 seaweed-volume/src/remote_storage/s3.rs create mode 100644 seaweed-volume/src/remote_storage/s3_tier.rs create mode 100644 seaweed-volume/src/security.rs create mode 100644 seaweed-volume/src/security/tls.rs create mode 100644 seaweed-volume/src/server/debug.rs create mode 100644 seaweed-volume/src/server/favicon.ico create mode 100644 seaweed-volume/src/server/grpc_client.rs create mode 100644 seaweed-volume/src/server/grpc_server.rs create mode 100644 seaweed-volume/src/server/handlers.rs create mode 100644 seaweed-volume/src/server/heartbeat.rs create mode 100644 seaweed-volume/src/server/memory_status.rs create mode 100644 seaweed-volume/src/server/mod.rs create mode 100644 seaweed-volume/src/server/profiling.rs create mode 100644 seaweed-volume/src/server/request_id.rs create mode 100644 seaweed-volume/src/server/server_stats.rs create mode 100644 seaweed-volume/src/server/ui.rs create mode 100644 seaweed-volume/src/server/volume_server.rs create mode 100644 seaweed-volume/src/server/write_queue.rs create mode 100644 seaweed-volume/src/storage/disk_location.rs create mode 100644 seaweed-volume/src/storage/erasure_coding/ec_decoder.rs create mode 100644 seaweed-volume/src/storage/erasure_coding/ec_encoder.rs create mode 100644 seaweed-volume/src/storage/erasure_coding/ec_locate.rs create mode 100644 seaweed-volume/src/storage/erasure_coding/ec_shard.rs create mode 100644 seaweed-volume/src/storage/erasure_coding/ec_volume.rs create mode 100644 seaweed-volume/src/storage/erasure_coding/mod.rs create mode 100644 seaweed-volume/src/storage/idx/mod.rs create mode 100644 seaweed-volume/src/storage/mod.rs create mode 100644 seaweed-volume/src/storage/needle/crc.rs create mode 100644 seaweed-volume/src/storage/needle/mod.rs create mode 100644 seaweed-volume/src/storage/needle/needle.rs create mode 100644 seaweed-volume/src/storage/needle/ttl.rs create mode 100644 seaweed-volume/src/storage/needle_map.rs create mode 100644 seaweed-volume/src/storage/needle_map/compact_map.rs create mode 100644 seaweed-volume/src/storage/store.rs create mode 100644 seaweed-volume/src/storage/super_block.rs create mode 100644 seaweed-volume/src/storage/types.rs create mode 100644 seaweed-volume/src/storage/volume.rs create mode 100644 seaweed-volume/src/version.rs create mode 100644 seaweed-volume/tests/http_integration.rs create mode 100644 seaweed-volume/tools/generate_go_volume_docs.go create mode 100644 seaweed-volume/vendor/reed-solomon-erasure/.cargo-ok create mode 100644 seaweed-volume/vendor/reed-solomon-erasure/.cargo_vcs_info.json create mode 100644 seaweed-volume/vendor/reed-solomon-erasure/.gitattributes create mode 100644 seaweed-volume/vendor/reed-solomon-erasure/.gitignore create mode 100644 seaweed-volume/vendor/reed-solomon-erasure/CHANGELOG.md create mode 100644 seaweed-volume/vendor/reed-solomon-erasure/Cargo.toml create mode 100644 seaweed-volume/vendor/reed-solomon-erasure/Cargo.toml.orig create mode 100644 seaweed-volume/vendor/reed-solomon-erasure/LICENSE create mode 100644 seaweed-volume/vendor/reed-solomon-erasure/README.md create mode 100644 seaweed-volume/vendor/reed-solomon-erasure/benches/reconstruct.rs create mode 100644 seaweed-volume/vendor/reed-solomon-erasure/build.rs create mode 100644 seaweed-volume/vendor/reed-solomon-erasure/sage/galois_ext_test.sage create mode 100644 seaweed-volume/vendor/reed-solomon-erasure/simd_c/reedsolomon.c create mode 100644 seaweed-volume/vendor/reed-solomon-erasure/simd_c/reedsolomon.h create mode 100644 seaweed-volume/vendor/reed-solomon-erasure/src/core.rs create mode 100644 seaweed-volume/vendor/reed-solomon-erasure/src/errors.rs create mode 100644 seaweed-volume/vendor/reed-solomon-erasure/src/galois_16.rs create mode 100644 seaweed-volume/vendor/reed-solomon-erasure/src/galois_8.rs create mode 100644 seaweed-volume/vendor/reed-solomon-erasure/src/lib.rs create mode 100644 seaweed-volume/vendor/reed-solomon-erasure/src/macros.rs create mode 100644 seaweed-volume/vendor/reed-solomon-erasure/src/matrix.rs create mode 100644 seaweed-volume/vendor/reed-solomon-erasure/src/tests/galois_16.rs create mode 100644 seaweed-volume/vendor/reed-solomon-erasure/src/tests/mod.rs create mode 100644 test/volume_server/framework/cluster_interface.go create mode 100644 test/volume_server/framework/cluster_interface_test.go create mode 100644 test/volume_server/framework/cluster_multi_rust.go create mode 100644 test/volume_server/framework/cluster_rust.go create mode 100644 test/volume_server/framework/cluster_rust_test.go create mode 100644 test/volume_server/grpc/fetch_remote_s3_test.go create mode 100644 test/volume_server/grpc/production_features_test.go create mode 100644 test/volume_server/http/production_features_test.go create mode 100644 test/volume_server/http/replication_lifecycle_test.go create mode 100644 test/volume_server/loadtest/loadtest_test.go create mode 100644 test/volume_server/rust/rust_volume_test.go diff --git a/.github/workflows/rust-volume-server-tests.yml b/.github/workflows/rust-volume-server-tests.yml new file mode 100644 index 000000000..40a125764 --- /dev/null +++ b/.github/workflows/rust-volume-server-tests.yml @@ -0,0 +1,242 @@ +name: "Rust Volume Server Tests" + +on: + pull_request: + branches: [ master ] + paths: + - 'seaweed-volume/**' + - 'test/volume_server/**' + - 'weed/pb/volume_server.proto' + - 'weed/pb/volume_server_pb/**' + - '.github/workflows/rust-volume-server-tests.yml' + push: + branches: [ master, main ] + paths: + - 'seaweed-volume/**' + - 'test/volume_server/**' + - 'weed/pb/volume_server.proto' + - 'weed/pb/volume_server_pb/**' + - '.github/workflows/rust-volume-server-tests.yml' + +concurrency: + group: ${{ github.workflow }}-${{ github.head_ref || github.ref }} + cancel-in-progress: true + +permissions: + contents: read + + +jobs: + rust-unit-tests: + name: Rust Unit Tests + runs-on: ubuntu-22.04 + timeout-minutes: 15 + + steps: + - name: Checkout code + uses: actions/checkout@v6 + + - name: Install protobuf compiler + run: sudo apt-get update && sudo apt-get install -y protobuf-compiler + + - name: Install Rust toolchain + uses: dtolnay/rust-toolchain@stable + + - name: Cache cargo registry and target + uses: actions/cache@v4 + with: + path: | + ~/.cargo/registry + ~/.cargo/git + seaweed-volume/target + key: rust-${{ hashFiles('seaweed-volume/Cargo.lock') }} + restore-keys: | + rust- + + - name: Build Rust volume server + run: cd seaweed-volume && cargo build --release + + - name: Run Rust unit tests + run: cd seaweed-volume && cargo test + + rust-integration-tests: + name: Rust Integration Tests + runs-on: ubuntu-22.04 + timeout-minutes: 30 + + steps: + - name: Checkout code + uses: actions/checkout@v6 + + - name: Set up Go + uses: actions/setup-go@v6 + with: + go-version-file: 'go.mod' + + - name: Install protobuf compiler + run: sudo apt-get update && sudo apt-get install -y protobuf-compiler + + - name: Install Rust toolchain + uses: dtolnay/rust-toolchain@stable + + - name: Cache cargo registry and target + uses: actions/cache@v4 + with: + path: | + ~/.cargo/registry + ~/.cargo/git + seaweed-volume/target + key: rust-${{ hashFiles('seaweed-volume/Cargo.lock') }} + restore-keys: | + rust- + + - name: Build Go weed binary + run: | + cd weed + go build -o weed . + chmod +x weed + ./weed version + + - name: Build Rust volume binary + run: cd seaweed-volume && cargo build --release + + - name: Run integration tests + env: + WEED_BINARY: ${{ github.workspace }}/weed/weed + RUST_VOLUME_BINARY: ${{ github.workspace }}/seaweed-volume/target/release/weed-volume + run: | + echo "Running Rust volume server integration tests..." + go test -v -count=1 -timeout=15m ./test/volume_server/rust/... + + - name: Collect logs on failure + if: failure() + run: | + mkdir -p /tmp/rust-volume-server-it-logs + find /tmp -maxdepth 1 -type d -name "seaweedfs_volume_server_it_*" -print -exec cp -r {} /tmp/rust-volume-server-it-logs/ \; || true + + - name: Archive logs on failure + if: failure() + uses: actions/upload-artifact@v7 + with: + name: rust-volume-server-integration-test-logs + path: /tmp/rust-volume-server-it-logs/ + if-no-files-found: warn + retention-days: 7 + + - name: Test summary + if: always() + run: | + echo "## Rust Volume Server Integration Test Summary" >> "$GITHUB_STEP_SUMMARY" + echo "- Suite: test/volume_server/rust" >> "$GITHUB_STEP_SUMMARY" + echo "- Command: go test -v -count=1 -timeout=15m ./test/volume_server/rust/..." >> "$GITHUB_STEP_SUMMARY" + + rust-volume-go-tests: + name: Go Tests with Rust Volume (${{ matrix.test-type }} - Shard ${{ matrix.shard }}) + runs-on: ubuntu-22.04 + timeout-minutes: 45 + strategy: + fail-fast: false + matrix: + test-type: [grpc, http] + shard: [1, 2, 3] + + steps: + - name: Checkout code + uses: actions/checkout@v6 + + - name: Set up Go + uses: actions/setup-go@v6 + with: + go-version-file: 'go.mod' + + - name: Install protobuf compiler + run: sudo apt-get update && sudo apt-get install -y protobuf-compiler + + - name: Install Rust toolchain + uses: dtolnay/rust-toolchain@stable + + - name: Cache cargo registry and target + uses: actions/cache@v4 + with: + path: | + ~/.cargo/registry + ~/.cargo/git + seaweed-volume/target + key: rust-${{ hashFiles('seaweed-volume/Cargo.lock') }} + restore-keys: | + rust- + + - name: Build Go weed binary + run: | + cd weed + go build -o weed . + chmod +x weed + ./weed version + + - name: Build Rust volume binary + run: cd seaweed-volume && cargo build --release + + - name: Run volume server integration tests with Rust volume + env: + WEED_BINARY: ${{ github.workspace }}/weed/weed + RUST_VOLUME_BINARY: ${{ github.workspace }}/seaweed-volume/target/release/weed-volume + VOLUME_SERVER_IMPL: rust + run: | + if [ "${{ matrix.test-type }}" == "grpc" ]; then + if [ "${{ matrix.shard }}" == "1" ]; then + TEST_PATTERN="^Test[A-H]" + elif [ "${{ matrix.shard }}" == "2" ]; then + TEST_PATTERN="^Test[I-S]" + else + TEST_PATTERN="^Test[T-Z]" + fi + else + if [ "${{ matrix.shard }}" == "1" ]; then + TEST_PATTERN="^Test[A-G]" + elif [ "${{ matrix.shard }}" == "2" ]; then + TEST_PATTERN="^Test[H-R]" + else + TEST_PATTERN="^Test[S-Z]" + fi + fi + echo "Running Go volume server tests with Rust volume for ${{ matrix.test-type }} (Shard ${{ matrix.shard }}, pattern: ${TEST_PATTERN})..." + go test -v -count=1 -tags 5BytesOffset -timeout=30m ./test/volume_server/${{ matrix.test-type }}/... -run "${TEST_PATTERN}" + + - name: Collect logs on failure + if: failure() + run: | + mkdir -p /tmp/rust-volume-go-test-logs + find /tmp -maxdepth 1 -type d -name "seaweedfs_volume_server_it_*" -print -exec cp -r {} /tmp/rust-volume-go-test-logs/ \; || true + + - name: Archive logs on failure + if: failure() + uses: actions/upload-artifact@v7 + with: + name: rust-volume-go-test-logs-${{ matrix.test-type }}-shard${{ matrix.shard }} + path: /tmp/rust-volume-go-test-logs/ + if-no-files-found: warn + retention-days: 7 + + - name: Test summary + if: always() + run: | + if [ "${{ matrix.test-type }}" == "grpc" ]; then + if [ "${{ matrix.shard }}" == "1" ]; then + TEST_PATTERN="^Test[A-H]" + elif [ "${{ matrix.shard }}" == "2" ]; then + TEST_PATTERN="^Test[I-S]" + else + TEST_PATTERN="^Test[T-Z]" + fi + else + if [ "${{ matrix.shard }}" == "1" ]; then + TEST_PATTERN="^Test[A-G]" + elif [ "${{ matrix.shard }}" == "2" ]; then + TEST_PATTERN="^Test[H-R]" + else + TEST_PATTERN="^Test[S-Z]" + fi + fi + echo "## Rust Volume - Go Test Summary (${{ matrix.test-type }} - Shard ${{ matrix.shard }})" >> "$GITHUB_STEP_SUMMARY" + echo "- Suite: test/volume_server/${{ matrix.test-type }} (Pattern: ${TEST_PATTERN})" >> "$GITHUB_STEP_SUMMARY" + echo "- Volume server: Rust (VOLUME_SERVER_IMPL=rust)" >> "$GITHUB_STEP_SUMMARY" diff --git a/.github/workflows/rust_binaries_dev.yml b/.github/workflows/rust_binaries_dev.yml new file mode 100644 index 000000000..cc81b93df --- /dev/null +++ b/.github/workflows/rust_binaries_dev.yml @@ -0,0 +1,165 @@ +name: "rust: build dev volume server binaries" + +on: + push: + branches: [ master ] + paths: + - 'seaweed-volume/**' + - '.github/workflows/rust_binaries_dev.yml' + +permissions: + contents: read + +jobs: + + cleanup: + permissions: + contents: write + runs-on: ubuntu-latest + steps: + - name: Delete old Rust volume dev assets + uses: mknejp/delete-release-assets@v1 + with: + token: ${{ github.token }} + tag: dev + fail-if-no-assets: false + assets: | + weed-volume-* + + build-rust-volume-dev-linux: + permissions: + contents: write + needs: cleanup + runs-on: ubuntu-22.04 + strategy: + matrix: + include: + - target: x86_64-unknown-linux-gnu + asset_suffix: linux-amd64 + + steps: + - uses: actions/checkout@v6 + + - name: Install protobuf compiler + run: sudo apt-get update && sudo apt-get install -y protobuf-compiler + + - name: Install Rust toolchain + uses: dtolnay/rust-toolchain@stable + + - name: Cache cargo registry and target + uses: actions/cache@v4 + with: + path: | + ~/.cargo/registry + ~/.cargo/git + seaweed-volume/target + key: rust-dev-${{ matrix.target }}-${{ hashFiles('seaweed-volume/Cargo.lock') }} + restore-keys: | + rust-dev-${{ matrix.target }}- + + - name: Set BUILD_TIME + run: echo BUILD_TIME=$(date -u +%Y%m%d-%H%M) >> "$GITHUB_ENV" + + - name: Build Rust volume server (large disk) + env: + SEAWEEDFS_COMMIT: ${{ github.sha }} + run: cd seaweed-volume && cargo build --release + + - name: Package large disk binary + run: | + cp seaweed-volume/target/release/weed-volume weed-volume-large-disk + tar czf "weed-volume-large-disk-${{ env.BUILD_TIME }}-${{ matrix.asset_suffix }}.tar.gz" weed-volume-large-disk + rm weed-volume-large-disk + + - name: Build Rust volume server (normal) + env: + SEAWEEDFS_COMMIT: ${{ github.sha }} + run: cd seaweed-volume && cargo build --release --no-default-features + + - name: Package normal binary + run: | + cp seaweed-volume/target/release/weed-volume weed-volume-normal + tar czf "weed-volume-${{ env.BUILD_TIME }}-${{ matrix.asset_suffix }}.tar.gz" weed-volume-normal + rm weed-volume-normal + + - name: Upload dev release assets + uses: softprops/action-gh-release@v2 + with: + tag_name: dev + prerelease: true + files: | + weed-volume-large-disk-${{ env.BUILD_TIME }}-${{ matrix.asset_suffix }}.tar.gz + weed-volume-${{ env.BUILD_TIME }}-${{ matrix.asset_suffix }}.tar.gz + env: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + + build-rust-volume-dev-darwin: + permissions: + contents: write + needs: build-rust-volume-dev-linux + runs-on: macos-latest + strategy: + matrix: + include: + - target: aarch64-apple-darwin + asset_suffix: darwin-arm64 + - target: x86_64-apple-darwin + asset_suffix: darwin-amd64 + + steps: + - uses: actions/checkout@v6 + + - name: Install protobuf compiler + run: brew install protobuf + + - name: Install Rust toolchain + uses: dtolnay/rust-toolchain@stable + with: + targets: ${{ matrix.target }} + + - name: Cache cargo registry and target + uses: actions/cache@v4 + with: + path: | + ~/.cargo/registry + ~/.cargo/git + seaweed-volume/target + key: rust-dev-${{ matrix.target }}-${{ hashFiles('seaweed-volume/Cargo.lock') }} + restore-keys: | + rust-dev-${{ matrix.target }}- + + - name: Set BUILD_TIME + run: echo BUILD_TIME=$(date -u +%Y%m%d-%H%M) >> "$GITHUB_ENV" + + - name: Build Rust volume server (large disk) + env: + SEAWEEDFS_COMMIT: ${{ github.sha }} + run: cd seaweed-volume && cargo build --release --target ${{ matrix.target }} + + - name: Package large disk binary + run: | + cp seaweed-volume/target/${{ matrix.target }}/release/weed-volume weed-volume-large-disk + tar czf "weed-volume-large-disk-${{ env.BUILD_TIME }}-${{ matrix.asset_suffix }}.tar.gz" weed-volume-large-disk + rm weed-volume-large-disk + + - name: Build Rust volume server (normal) + env: + SEAWEEDFS_COMMIT: ${{ github.sha }} + run: cd seaweed-volume && cargo build --release --target ${{ matrix.target }} --no-default-features + + - name: Package normal binary + run: | + cp seaweed-volume/target/${{ matrix.target }}/release/weed-volume weed-volume-normal + tar czf "weed-volume-${{ env.BUILD_TIME }}-${{ matrix.asset_suffix }}.tar.gz" weed-volume-normal + rm weed-volume-normal + + - name: Upload dev release assets + uses: softprops/action-gh-release@v2 + with: + tag_name: dev + prerelease: true + files: | + weed-volume-large-disk-${{ env.BUILD_TIME }}-${{ matrix.asset_suffix }}.tar.gz + weed-volume-${{ env.BUILD_TIME }}-${{ matrix.asset_suffix }}.tar.gz + env: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} diff --git a/.github/workflows/rust_binaries_release.yml b/.github/workflows/rust_binaries_release.yml new file mode 100644 index 000000000..a7f91105f --- /dev/null +++ b/.github/workflows/rust_binaries_release.yml @@ -0,0 +1,215 @@ +name: "rust: build versioned volume server binaries" + +on: + push: + tags: + - '*' + + workflow_dispatch: + +permissions: + contents: read + +jobs: + + build-rust-volume-linux: + permissions: + contents: write + runs-on: ubuntu-22.04 + strategy: + matrix: + include: + - target: x86_64-unknown-linux-gnu + asset_suffix: linux_amd64 + - target: aarch64-unknown-linux-gnu + asset_suffix: linux_arm64 + cross: true + + steps: + - uses: actions/checkout@v6 + + - name: Install protobuf compiler + run: sudo apt-get update && sudo apt-get install -y protobuf-compiler + + - name: Install Rust toolchain + uses: dtolnay/rust-toolchain@stable + with: + targets: ${{ matrix.target }} + + - name: Install cross-compilation tools + if: matrix.cross + run: | + sudo apt-get install -y gcc-aarch64-linux-gnu + echo "CARGO_TARGET_AARCH64_UNKNOWN_LINUX_GNU_LINKER=aarch64-linux-gnu-gcc" >> "$GITHUB_ENV" + + - name: Cache cargo registry and target + uses: actions/cache@v4 + with: + path: | + ~/.cargo/registry + ~/.cargo/git + seaweed-volume/target + key: rust-release-${{ matrix.target }}-${{ hashFiles('seaweed-volume/Cargo.lock') }} + restore-keys: | + rust-release-${{ matrix.target }}- + + - name: Build Rust volume server (large disk) + env: + SEAWEEDFS_COMMIT: ${{ github.sha }} + run: | + cd seaweed-volume + cargo build --release --target ${{ matrix.target }} + + - name: Build Rust volume server (normal) + env: + SEAWEEDFS_COMMIT: ${{ github.sha }} + run: | + cd seaweed-volume + cargo build --release --target ${{ matrix.target }} --no-default-features + + - name: Package binaries + run: | + # Large disk (default, 5bytes feature) + cp seaweed-volume/target/${{ matrix.target }}/release/weed-volume weed-volume-large-disk + tar czf weed-volume_large_disk_${{ matrix.asset_suffix }}.tar.gz weed-volume-large-disk + rm weed-volume-large-disk + + # Normal volume size + cp seaweed-volume/target/${{ matrix.target }}/release/weed-volume weed-volume-normal + tar czf weed-volume_${{ matrix.asset_suffix }}.tar.gz weed-volume-normal + rm weed-volume-normal + + - name: Upload release assets + uses: softprops/action-gh-release@v2 + with: + files: | + weed-volume_large_disk_${{ matrix.asset_suffix }}.tar.gz + weed-volume_${{ matrix.asset_suffix }}.tar.gz + env: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + + build-rust-volume-darwin: + permissions: + contents: write + runs-on: macos-latest + strategy: + matrix: + include: + - target: x86_64-apple-darwin + asset_suffix: darwin_amd64 + - target: aarch64-apple-darwin + asset_suffix: darwin_arm64 + + steps: + - uses: actions/checkout@v6 + + - name: Install protobuf compiler + run: brew install protobuf + + - name: Install Rust toolchain + uses: dtolnay/rust-toolchain@stable + with: + targets: ${{ matrix.target }} + + - name: Cache cargo registry and target + uses: actions/cache@v4 + with: + path: | + ~/.cargo/registry + ~/.cargo/git + seaweed-volume/target + key: rust-release-${{ matrix.target }}-${{ hashFiles('seaweed-volume/Cargo.lock') }} + restore-keys: | + rust-release-${{ matrix.target }}- + + - name: Build Rust volume server (large disk) + env: + SEAWEEDFS_COMMIT: ${{ github.sha }} + run: | + cd seaweed-volume + cargo build --release --target ${{ matrix.target }} + + - name: Build Rust volume server (normal) + env: + SEAWEEDFS_COMMIT: ${{ github.sha }} + run: | + cd seaweed-volume + cargo build --release --target ${{ matrix.target }} --no-default-features + + - name: Package binaries + run: | + cp seaweed-volume/target/${{ matrix.target }}/release/weed-volume weed-volume-large-disk + tar czf weed-volume_large_disk_${{ matrix.asset_suffix }}.tar.gz weed-volume-large-disk + rm weed-volume-large-disk + + cp seaweed-volume/target/${{ matrix.target }}/release/weed-volume weed-volume-normal + tar czf weed-volume_${{ matrix.asset_suffix }}.tar.gz weed-volume-normal + rm weed-volume-normal + + - name: Upload release assets + uses: softprops/action-gh-release@v2 + with: + files: | + weed-volume_large_disk_${{ matrix.asset_suffix }}.tar.gz + weed-volume_${{ matrix.asset_suffix }}.tar.gz + env: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + + build-rust-volume-windows: + permissions: + contents: write + runs-on: windows-latest + + steps: + - uses: actions/checkout@v6 + + - name: Install protobuf compiler + run: choco install protoc -y + + - name: Install Rust toolchain + uses: dtolnay/rust-toolchain@stable + + - name: Cache cargo registry and target + uses: actions/cache@v4 + with: + path: | + ~/.cargo/registry + ~/.cargo/git + seaweed-volume/target + key: rust-release-windows-${{ hashFiles('seaweed-volume/Cargo.lock') }} + restore-keys: | + rust-release-windows- + + - name: Build Rust volume server (large disk) + env: + SEAWEEDFS_COMMIT: ${{ github.sha }} + run: | + cd seaweed-volume + cargo build --release + + - name: Build Rust volume server (normal) + env: + SEAWEEDFS_COMMIT: ${{ github.sha }} + run: | + cd seaweed-volume + cargo build --release --no-default-features + + - name: Package binaries + shell: bash + run: | + cp seaweed-volume/target/release/weed-volume.exe weed-volume-large-disk.exe + 7z a weed-volume_large_disk_windows_amd64.zip weed-volume-large-disk.exe + rm weed-volume-large-disk.exe + + cp seaweed-volume/target/release/weed-volume.exe weed-volume-normal.exe + 7z a weed-volume_windows_amd64.zip weed-volume-normal.exe + rm weed-volume-normal.exe + + - name: Upload release assets + uses: softprops/action-gh-release@v2 + with: + files: | + weed-volume_large_disk_windows_amd64.zip + weed-volume_windows_amd64.zip + env: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} diff --git a/.gitignore b/.gitignore index a3ea87971..b356654f9 100644 --- a/.gitignore +++ b/.gitignore @@ -143,3 +143,4 @@ test/s3/iam/.test_env weed_bin telemetry/server/telemetry-server .aider* +/seaweed-volume/docs diff --git a/VOLUME_SERVER_RUST_PLAN.md b/VOLUME_SERVER_RUST_PLAN.md new file mode 100644 index 000000000..1c402336f --- /dev/null +++ b/VOLUME_SERVER_RUST_PLAN.md @@ -0,0 +1,790 @@ +# Execution Plan: SeaweedFS Volume Server — Go to Rust Port + +## Scope Summary + +| Component | Go Source | Lines (non-test) | Description | +|---|---|---|---| +| CLI & startup | `weed/command/volume.go` | 476 | ~40 CLI flags, server bootstrap | +| HTTP server + handlers | `weed/server/volume_server*.go` | 1,517 | Struct, routes, read/write/delete handlers | +| gRPC handlers | `weed/server/volume_grpc_*.go` | 3,073 | 40 RPC method implementations | +| Storage engine | `weed/storage/` | 15,271 | Volumes, needles, index, compaction, EC, backend | +| Protobuf definitions | `weed/pb/volume_server.proto` | 759 | Service + message definitions | +| Shared utilities | `weed/security/`, `weed/stats/`, `weed/util/` | ~2,000+ | JWT, TLS, metrics, helpers | +| **Total** | | **~23,000+** | | + +## Rust Crate & Dependency Strategy + +``` +seaweed-volume/ +├── Cargo.toml +├── build.rs # protobuf codegen +├── proto/ +│ ├── volume_server.proto # copied from Go, adapted +│ └── remote.proto +├── src/ +│ ├── main.rs # CLI entry point +│ ├── config.rs # CLI flags + config +│ ├── server/ +│ │ ├── mod.rs +│ │ ├── volume_server.rs # VolumeServer struct + lifecycle +│ │ ├── http_handlers.rs # HTTP route dispatch +│ │ ├── http_read.rs # GET/HEAD handlers +│ │ ├── http_write.rs # POST/PUT handlers +│ │ ├── http_delete.rs # DELETE handler +│ │ ├── http_admin.rs # /status, /healthz, /ui +│ │ ├── grpc_service.rs # gRPC trait impl dispatch +│ │ ├── grpc_vacuum.rs +│ │ ├── grpc_copy.rs +│ │ ├── grpc_erasure_coding.rs +│ │ ├── grpc_tail.rs +│ │ ├── grpc_admin.rs +│ │ ├── grpc_read_write.rs +│ │ ├── grpc_batch_delete.rs +│ │ ├── grpc_scrub.rs +│ │ ├── grpc_tier.rs +│ │ ├── grpc_remote.rs +│ │ ├── grpc_query.rs +│ │ ├── grpc_state.rs +│ │ └── grpc_client_to_master.rs # heartbeat +│ ├── storage/ +│ │ ├── mod.rs +│ │ ├── store.rs # Store (multi-disk manager) +│ │ ├── volume.rs # Volume struct + lifecycle +│ │ ├── volume_read.rs +│ │ ├── volume_write.rs +│ │ ├── volume_compact.rs +│ │ ├── volume_info.rs +│ │ ├── needle/ +│ │ │ ├── mod.rs +│ │ │ ├── needle.rs # Needle struct + serialization +│ │ │ ├── needle_read.rs +│ │ │ ├── needle_write.rs +│ │ │ ├── needle_map.rs # in-memory NeedleMap +│ │ │ ├── needle_value.rs +│ │ │ └── crc.rs +│ │ ├── super_block.rs +│ │ ├── idx/ +│ │ │ ├── mod.rs +│ │ │ └── idx.rs # .idx file format read/write +│ │ ├── needle_map_leveldb.rs +│ │ ├── types.rs # NeedleId, Offset, Size, DiskType +│ │ ├── disk_location.rs # DiskLocation per-directory +│ │ ├── erasure_coding/ +│ │ │ ├── mod.rs +│ │ │ ├── ec_volume.rs +│ │ │ ├── ec_shard.rs +│ │ │ ├── ec_encoder.rs # Reed-Solomon encoding +│ │ │ └── ec_decoder.rs +│ │ └── backend/ +│ │ ├── mod.rs +│ │ ├── disk.rs +│ │ └── s3_backend.rs # tiered storage to S3 +│ ├── topology/ +│ │ └── volume_layout.rs # replication placement +│ ├── security/ +│ │ ├── mod.rs +│ │ ├── guard.rs # whitelist + JWT gate +│ │ ├── jwt.rs +│ │ └── tls.rs +│ ├── stats/ +│ │ ├── mod.rs +│ │ └── metrics.rs # Prometheus counters/gauges +│ └── util/ +│ ├── mod.rs +│ ├── grpc.rs +│ ├── http.rs +│ └── file.rs +└── tests/ + ├── integration/ + │ ├── http_read_test.rs + │ ├── http_write_test.rs + │ ├── grpc_test.rs + │ └── storage_test.rs + └── unit/ + ├── needle_test.rs + ├── idx_test.rs + ├── super_block_test.rs + └── ec_test.rs +``` + +### Key Rust dependencies + +| Purpose | Crate | +|---|---| +| Async runtime | `tokio` | +| gRPC | `tonic` + `prost` | +| HTTP server | `hyper` + `axum` | +| CLI parsing | `clap` (derive) | +| Prometheus metrics | `prometheus` | +| JWT | `jsonwebtoken` | +| TLS | `rustls` + `tokio-rustls` | +| LevelDB | `rusty-leveldb` or `rocksdb` | +| Reed-Solomon EC | `reed-solomon-erasure` | +| Logging | `tracing` + `tracing-subscriber` | +| Config (security.toml) | `toml` + `serde` | +| CRC32 | `crc32fast` | +| Memory-mapped files | `memmap2` | + +--- + +## Phased Execution Plan + +### Phase 1: Project Skeleton & Protobuf Codegen +**Goal:** Cargo project compiles, proto codegen works, CLI parses all flags. + +**Steps:** + +1.1. Create `seaweed-volume/Cargo.toml` with all dependencies listed above. + +1.2. Copy `volume_server.proto` and `remote.proto` into `proto/`. Adjust package paths for Rust codegen. + +1.3. Create `build.rs` using `tonic-build` to compile `.proto` files into Rust types. + +1.4. Create `src/main.rs` with `clap` derive structs mirroring all 40 CLI flags from `weed/command/volume.go`: + - `--port` (default 8080) + - `--port.grpc` (default 0 → 10000+port) + - `--port.public` (default 0 → same as port) + - `--ip` (auto-detect) + - `--id` (default empty → ip:port) + - `--publicUrl` + - `--ip.bind` + - `--master` (default "localhost:9333") + - `--mserver` (deprecated compat) + - `--preStopSeconds` (default 10) + - `--idleTimeout` (default 30) + - `--dataCenter` + - `--rack` + - `--index` [memory|leveldb|leveldbMedium|leveldbLarge] + - `--disk` [hdd|ssd|] + - `--tags` + - `--dir` (default temp dir) + - `--dir.idx` + - `--max` (default "8") + - `--whiteList` + - `--minFreeSpacePercent` (default "1") + - `--minFreeSpace` + - `--images.fix.orientation` (default false) + - `--readMode` [local|proxy|redirect] (default "proxy") + - `--cpuprofile` + - `--memprofile` + - `--compactionMBps` (default 0) + - `--maintenanceMBps` (default 0) + - `--fileSizeLimitMB` (default 256) + - `--concurrentUploadLimitMB` (default 0) + - `--concurrentDownloadLimitMB` (default 0) + - `--pprof` (default false) + - `--metricsPort` (default 0) + - `--metricsIp` + - `--inflightUploadDataTimeout` (default 60s) + - `--inflightDownloadDataTimeout` (default 60s) + - `--hasSlowRead` (default true) + - `--readBufferSizeMB` (default 4) + - `--index.leveldbTimeout` (default 0) + - `--debug` (default false) + - `--debug.port` (default 6060) + +1.5. Implement the same flag validation logic from `startVolumeServer()`: + - Parse comma-separated `--dir`, `--max`, `--minFreeSpace`, `--disk`, `--tags` + - Replicate single-value-to-all-dirs expansion + - Validate count matches between dirs and limits + - `--mserver` backward compat + +1.6. **Test:** `cargo build` succeeds. `cargo run -- --help` shows all flags. Proto types generated. + +**Verification:** Run with `--port 8080 --dir /tmp --master localhost:9333` — should parse without error and print config. + +--- + +### Phase 2: Core Storage Types & On-Disk Format +**Goal:** Read and write the SeaweedFS needle/volume binary format bit-for-bit compatible with Go. + +**Source files to port:** +- `weed/storage/types/needle_types.go` → `src/storage/types.rs` +- `weed/storage/needle/needle.go` → `src/storage/needle/needle.rs` +- `weed/storage/needle/needle_read.go` → `src/storage/needle/needle_read.rs` +- `weed/storage/needle/needle_write.go` (partial) → `src/storage/needle/needle_write.rs` +- `weed/storage/needle/crc.go` → `src/storage/needle/crc.rs` +- `weed/storage/needle/needle_value_map.go` → `src/storage/needle/needle_value.rs` +- `weed/storage/super_block/super_block.go` → `src/storage/super_block.rs` +- `weed/storage/idx/` → `src/storage/idx/` + +**Steps:** + +2.1. **Fundamental types** (`types.rs`): + - `NeedleId` (u64), `Offset` (u32 or u64 depending on version), `Size` (i32, negative = deleted) + - `Cookie` (u32) + - `DiskType` enum (HDD, SSD, Custom) + - Version constants (Version1=1, Version2=2, Version3=3, CurrentVersion=3) + - Byte serialization matching Go's `binary.BigEndian` encoding + +2.2. **SuperBlock** (`super_block.rs`): + - 8-byte header: Version(1) + ReplicaPlacement(1) + TTL(2) + CompactRevision(2) + Reserved(2) + - `ReplicaPlacement` struct with same/diff rack/dc counts + - `TTL` struct with count + unit + - Read/write from first 8 bytes of `.dat` file + - Match exact byte layout from `super_block.go` + +2.3. **Needle binary format** (`needle.rs`, `needle_read.rs`): + - Version 2/3 header: Cookie(4) + NeedleId(8) + Size(4) + - Body: Data, Flags, Name, Mime, PairsSize, Pairs, LastModified, TTL, Checksum, AppendAtNs, Padding + - CRC32 checksum (matching Go's `crc32.ChecksumIEEE`) + - Padding to 8-byte alignment + - Read path: read header → compute body length → read body → verify CRC + +2.4. **Idx file format** (`idx/`): + - Fixed 16-byte records: NeedleId(8) + Offset(4) + Size(4) + - Sequential append-only file + - Walk/iterate all entries + - Binary search not used (loaded into memory map) + +2.5. **NeedleMap (in-memory)** (`needle_map.rs`): + - HashMap where NeedleValue = {Offset, Size} + - Load from `.idx` file on volume mount + - Support Get, Set, Delete operations + - Track file count, deleted count, deleted byte count + +2.6. **Tests:** + - Unit test: write a needle to bytes → read it back → verify fields match + - Unit test: write/read SuperBlock round-trip + - Unit test: write/read idx entries round-trip + - **Cross-compat test:** Use Go volume server to create a small volume with known data. Read it from Rust and verify all needles decoded correctly. (Keep test fixture `.dat`/`.idx` files in `tests/fixtures/`) + +--- + +### Phase 3: Volume Struct & Lifecycle +**Goal:** Mount, read from, write to, and unmount a volume. + +**Source files to port:** +- `weed/storage/volume.go` → `src/storage/volume.rs` +- `weed/storage/volume_read.go` → `src/storage/volume_read.rs` +- `weed/storage/volume_write.go` → `src/storage/volume_write.rs` +- `weed/storage/volume_loading.go` +- `weed/storage/volume_vacuum.go` → `src/storage/volume_compact.rs` +- `weed/storage/volume_info/volume_info.go` → `src/storage/volume_info.rs` +- `weed/storage/volume_super_block.go` + +**Steps:** + +3.1. **Volume struct** (`volume.rs`): + - Fields: Id, dir, dataFile, nm (NeedleMap), SuperBlock, readOnly, lastModifiedTs, lastCompactIndexOffset, lastCompactRevision + - `noWriteOrDelete` / `noWriteCanDelete` / `readOnly` state flags + - File handles for `.dat` file (read + append) + - Lock strategy: `RwLock` for concurrent reads, exclusive writes + +3.2. **Volume loading** — exact logic from `volume_loading.go`: + - Open `.dat` file, read SuperBlock from first 8 bytes + - Load `.idx` file into NeedleMap + - Handle `.vif` (VolumeInfo) JSON sidecar file + - Set volume state based on SuperBlock + VolumeInfo + +3.3. **Volume read** (`volume_read.rs`) — from `volume_read.go`: + - `ReadNeedle(needleId, cookie)`: lookup in NeedleMap → seek in .dat → read needle bytes → verify cookie + CRC → return data + - Handle deleted needles (Size < 0) + - `ReadNeedleBlob(offset, size)`: raw blob read + - `ReadNeedleMeta(needleId, offset, size)`: read metadata only + +3.4. **Volume write** (`volume_write.rs`) — from `volume_write.go`: + - `WriteNeedle(needle)`: serialize needle → append to .dat → update .idx → update NeedleMap + - `DeleteNeedle(needleId)`: mark as deleted in NeedleMap + append tombstone to .idx + - File size limit check + - Concurrent write serialization (mutex on write path) + +3.5. **Volume compaction** (`volume_compact.rs`) — from `volume_vacuum.go`: + - `CheckCompact()`: compute garbage ratio + - `Compact()`: create new .dat/.idx, copy only live needles, update compact revision + - `CommitCompact()`: rename compacted files over originals + - `CleanupCompact()`: remove temp files + - Throttle by `compactionBytePerSecond` + +3.6. **Volume info** (`volume_info.rs`): + - Read/write `.vif` JSON sidecar + - VolumeInfo protobuf struct mapping + - Remote file references for tiered storage + +3.7. **Tests:** + - Mount a volume, write 100 needles, read them all back, verify content + - Delete 50 needles, verify they return "deleted" + - Compact, verify only 50 remain, verify content + - Read Go-created volume fixtures + +--- + +### Phase 4: Store (Multi-Volume, Multi-Disk Manager) +**Goal:** Manage multiple volumes across multiple disk directories. + +**Source files to port:** +- `weed/storage/store.go` → `src/storage/store.rs` +- `weed/storage/disk_location.go` → `src/storage/disk_location.rs` +- `weed/storage/store_ec.go` +- `weed/storage/store_state.go` + +**Steps:** + +4.1. **DiskLocation** (`disk_location.rs`): + - Directory path, max volume count, min free space, disk type, tags + - Load all volumes from directory on startup + - Track free space, check writable + +4.2. **Store** (`store.rs`): + - Vector of `DiskLocation`s + - `GetVolume(volumeId)` → lookup across all locations + - `HasVolume(volumeId)` check + - `AllocateVolume(...)` — create new volume in appropriate location + - `DeleteVolume(...)`, `MountVolume(...)`, `UnmountVolume(...)` + - `DeleteCollection(collection)` — delete all volumes of a collection + - Collect volume status for heartbeat + - `SetStopping()`, `Close()` + - Persistent state (maintenance mode) via `store_state.go` + +4.3. **Store state** — `VolumeServerState` protobuf with maintenance flag, persisted to disk. + +4.4. **Tests:** + - Create store with 2 dirs, allocate volumes in each, verify load balancing + - Mount/unmount/delete lifecycle + - State persistence across restart + +--- + +### Phase 5: Erasure Coding +**Goal:** Full EC shard encode/decode/read/write/rebuild. + +**Source files to port:** +- `weed/storage/erasure_coding/` (3,599 lines) + +**Steps:** + +5.1. **EC volume + shard structs** — `EcVolume`, `EcShard` with file handles for `.ec00`–`.ec13` shard files + `.ecx` index + `.ecj` journal. + +5.2. **EC encoder** — Reed-Solomon 10+4 (configurable) encoding using `reed-solomon-erasure` crate: + - `VolumeEcShardsGenerate`: read .dat → split into data shards → compute parity → write .ec00-.ec13 + .ecx + +5.3. **EC decoder/reader** — reconstruct data from any 10 of 14 shards: + - `EcShardRead`: read range from a specific shard + - Locate needle in EC volume via .ecx index + - Handle cross-shard needle reads + +5.4. **EC shard operations:** + - Copy, delete, mount, unmount shards + - `VolumeEcShardsRebuild`: rebuild missing shards from remaining + - `VolumeEcShardsToVolume`: reconstruct .dat from EC shards + - `VolumeEcBlobDelete`: mark deleted in EC journal + - `VolumeEcShardsInfo`: report shard metadata + +5.5. **Tests:** + - Encode a volume → verify 14 shards created + - Delete 4 shards → rebuild → verify data intact + - Read individual needles from EC volume + - Cross-compat with Go-generated EC shards + +--- + +### Phase 6: Backend / Tiered Storage +**Goal:** Support tiered storage to remote backends (S3, etc). + +**Source files to port:** +- `weed/storage/backend/` (1,850 lines) + +**Steps:** + +6.1. **Backend trait** — abstract `BackendStorage` trait with `ReadAt`, `WriteAt`, `Truncate`, `Close`, `Name`. + +6.2. **Disk backend** — default local disk implementation. + +6.3. **S3 backend** — upload .dat to S3, read ranges via S3 range requests. + +6.4. **Tier move operations:** + - `VolumeTierMoveDatToRemote`: upload .dat to remote, optionally delete local + - `VolumeTierMoveDatFromRemote`: download .dat from remote + +6.5. **Tests:** + - Disk backend read/write round-trip + - S3 backend with mock/localstack + +--- + +### Phase 7: Security Layer +**Goal:** JWT authentication, whitelist guard, TLS configuration. + +**Source files to port:** +- `weed/security/guard.go` → `src/security/guard.rs` +- `weed/security/jwt.go` → `src/security/jwt.rs` +- `weed/security/tls.go` → `src/security/tls.rs` + +**Steps:** + +7.1. **Guard** (`guard.rs`): + - Whitelist IP check (exact match on `r.RemoteAddr`) + - Wrap handlers with whitelist enforcement + - `UpdateWhiteList()` for live reload + +7.2. **JWT** (`jwt.rs`): + - `SeaweedFileIdClaims` with `fid` field + - Sign with HMAC-SHA256 + - Verify + decode with expiry check + - Separate signing keys for read vs write + - `GetJwt(request)` — extract from `Authorization: Bearer` header or `jwt` query param + +7.3. **TLS** (`tls.rs`): + - Load server TLS cert/key for gRPC and HTTPS + - Load client TLS for mutual TLS + - Read from `security.toml` config (same format as Go's viper config) + +7.4. **Tests:** + - JWT sign → verify round-trip + - JWT with wrong key → reject + - JWT with expired token → reject + - JWT fid mismatch → reject + - Whitelist allow/deny + +--- + +### Phase 8: Prometheus Metrics +**Goal:** Export same metric names as Go for dashboard compatibility. + +**Source files to port:** +- `weed/stats/metrics.go` (volume server counters/gauges/histograms) + +**Steps:** + +8.1. Define all Prometheus metrics matching Go names: + - `VolumeServerRequestCounter` (labels: method, status) + - `VolumeServerRequestHistogram` (labels: method) + - `VolumeServerInFlightRequestsGauge` (labels: method) + - `VolumeServerInFlightUploadSize` + - `VolumeServerInFlightDownloadSize` + - `VolumeServerConcurrentUploadLimit` + - `VolumeServerConcurrentDownloadLimit` + - `VolumeServerHandlerCounter` (labels: type — UploadLimitCond, DownloadLimitCond) + - Read/Write/Delete request counters + +8.2. Metrics HTTP endpoint on `--metricsPort`. + +8.3. Optional push-based metrics loop (`LoopPushingMetric`). + +8.4. **Test:** Verify metric names and labels match Go output. + +--- + +### Phase 9: HTTP Server & Handlers +**Goal:** All HTTP endpoints with exact same behavior as Go. + +**Source files to port:** +- `weed/server/volume_server.go` → `src/server/volume_server.rs` +- `weed/server/volume_server_handlers.go` → `src/server/http_handlers.rs` +- `weed/server/volume_server_handlers_read.go` → `src/server/http_read.rs` +- `weed/server/volume_server_handlers_write.go` → `src/server/http_write.rs` +- `weed/server/volume_server_handlers_admin.go` → `src/server/http_admin.rs` +- `weed/server/volume_server_handlers_helper.go` (URL parsing, proxy, JSON responses) +- `weed/server/volume_server_handlers_ui.go` → `src/server/http_admin.rs` + +**Steps:** + +9.1. **URL path parsing** — from `handlers_helper.go`: + - Parse `/,` and `//` patterns + - Extract volume ID, file ID, filename, ext + +9.2. **Route dispatch** — from `privateStoreHandler` and `publicReadOnlyHandler`: + - `GET /` → `GetOrHeadHandler` + - `HEAD /` → `GetOrHeadHandler` + - `POST /` → `PostHandler` (whitelist gated) + - `PUT /` → `PostHandler` (whitelist gated) + - `DELETE /` → `DeleteHandler` (whitelist gated) + - `OPTIONS /` → CORS preflight + - `GET /status` → JSON status + - `GET /healthz` → health check + - `GET /ui/index.html` → HTML UI page + - Static resources (CSS/JS for UI) + +9.3. **GET/HEAD handler** (`http_read.rs`) — from `handlers_read.go` (468 lines): + - JWT read authorization check + - Lookup needle by volume ID + needle ID + cookie + - ETag / If-None-Match / If-Modified-Since conditional responses + - Content-Type from stored MIME or filename extension + - Content-Disposition header + - Content-Encoding (gzip/zstd stored data) + - Range request support (HTTP 206 Partial Content) + - JPEG orientation fix (if configured) + - Proxy to replica on local miss (readMode=proxy) + - Redirect to replica (readMode=redirect) + - Download tracking (in-flight size accounting) + +9.4. **POST/PUT handler** (`http_write.rs`) — from `handlers_write.go` (170 lines): + - JWT write authorization check + - Multipart form parsing + - Extract file data, filename, content type, TTL, last-modified + - Optional gzip/zstd compression + - Write needle to volume + - Replicate to peers (same logic as Go's `DistributedOperation`) + - Return JSON: {name, size, eTag, error} + +9.5. **DELETE handler** — already in handlers.go: + - JWT authorization + - Delete from local volume + - Replicate delete to peers + - Return JSON result + +9.6. **Admin handlers** (`http_admin.rs`): + - `/status` → JSON with volumes, version, disk status + - `/healthz` → 200 OK if serving + - `/ui/index.html` → HTML dashboard + +9.7. **Concurrency limiting** — from `handlers.go`: + - Upload concurrency limit with `sync::Condvar` + timeout + - Download concurrency limit with proxy fallback to replicas + - HTTP 429 on timeout, 499 on client cancel + - Replication traffic bypasses upload limits + +9.8. **Public port** — if configured, separate listener with read-only routes (GET/HEAD/OPTIONS only). + +9.9. **Request ID middleware** — generate unique request ID per request. + +9.10. **Tests:** + - Integration: start server → upload file via POST → GET it back → verify content + - Integration: upload → DELETE → GET returns 404 + - Integration: conditional GET with ETag → 304 + - Integration: range request → 206 with correct bytes + - Integration: exceed upload limit → 429 + - Integration: whitelist enforcement + - Integration: JWT enforcement + +--- + +### Phase 10: gRPC Service Implementation +**Goal:** All 40 gRPC methods with exact logic. + +**Source files to port:** +- `weed/server/volume_grpc_admin.go` (380 lines) +- `weed/server/volume_grpc_vacuum.go` (124 lines) +- `weed/server/volume_grpc_copy.go` (636 lines) +- `weed/server/volume_grpc_copy_incremental.go` (66 lines) +- `weed/server/volume_grpc_read_write.go` (74 lines) +- `weed/server/volume_grpc_batch_delete.go` (124 lines) +- `weed/server/volume_grpc_tail.go` (140 lines) +- `weed/server/volume_grpc_erasure_coding.go` (619 lines) +- `weed/server/volume_grpc_scrub.go` (121 lines) +- `weed/server/volume_grpc_tier_upload.go` (98 lines) +- `weed/server/volume_grpc_tier_download.go` (85 lines) +- `weed/server/volume_grpc_remote.go` (95 lines) +- `weed/server/volume_grpc_query.go` (69 lines) +- `weed/server/volume_grpc_state.go` (26 lines) +- `weed/server/volume_grpc_read_all.go` (35 lines) +- `weed/server/volume_grpc_client_to_master.go` (325 lines) + +**Steps (grouped by functional area):** + +10.1. **Implement `tonic::Service` for `VolumeServer`** — the generated trait from proto. + +10.2. **Admin RPCs** (`grpc_admin.rs`): + - `AllocateVolume` — create volume on appropriate disk location + - `VolumeMount` / `VolumeUnmount` / `VolumeDelete` + - `VolumeMarkReadonly` / `VolumeMarkWritable` + - `VolumeConfigure` — change replication + - `VolumeStatus` — return read-only, size, file counts + - `VolumeServerStatus` — disk statuses, memory, version, DC, rack + - `VolumeServerLeave` — deregister from master + - `DeleteCollection` + - `VolumeNeedleStatus` — get needle metadata by ID + - `Ping` — latency measurement + - `GetState` / `SetState` — maintenance mode + +10.3. **Vacuum RPCs** (`grpc_vacuum.rs`): + - `VacuumVolumeCheck` — return garbage ratio + - `VacuumVolumeCompact` — stream progress (streaming response) + - `VacuumVolumeCommit` — finalize compaction + - `VacuumVolumeCleanup` — remove temp files + +10.4. **Copy RPCs** (`grpc_copy.rs`): + - `VolumeCopy` — stream .dat/.idx from source to create local copy + - `VolumeSyncStatus` — return sync metadata + - `VolumeIncrementalCopy` — stream .dat delta since timestamp (streaming) + - `CopyFile` — generic file copy by extension (streaming) + - `ReceiveFile` — receive streamed file (client streaming) + - `ReadVolumeFileStatus` — return file timestamps and sizes + +10.5. **Read/Write RPCs** (`grpc_read_write.rs`): + - `ReadNeedleBlob` — raw needle blob read + - `ReadNeedleMeta` — needle metadata + - `WriteNeedleBlob` — raw needle blob write + - `ReadAllNeedles` — stream all needles from volume(s) (streaming) + +10.6. **Batch delete** (`grpc_batch_delete.rs`): + - `BatchDelete` — delete multiple file IDs, return per-ID results + +10.7. **Tail RPCs** (`grpc_tail.rs`): + - `VolumeTailSender` — stream new needles since timestamp (streaming) + - `VolumeTailReceiver` — connect to another volume server and tail its changes + +10.8. **Erasure coding RPCs** (`grpc_erasure_coding.rs`): + - `VolumeEcShardsGenerate` — generate EC shards from volume + - `VolumeEcShardsRebuild` — rebuild missing shards + - `VolumeEcShardsCopy` — copy shards from another server + - `VolumeEcShardsDelete` — delete EC shards + - `VolumeEcShardsMount` / `VolumeEcShardsUnmount` + - `VolumeEcShardRead` — read from EC shard (streaming) + - `VolumeEcBlobDelete` — mark blob deleted in EC volume + - `VolumeEcShardsToVolume` — reconstruct volume from EC shards + - `VolumeEcShardsInfo` — return shard metadata + +10.9. **Scrub RPCs** (`grpc_scrub.rs`): + - `ScrubVolume` — integrity check volumes (INDEX / FULL / LOCAL modes) + - `ScrubEcVolume` — integrity check EC volumes + +10.10. **Tier RPCs** (`grpc_tier.rs`): + - `VolumeTierMoveDatToRemote` — upload to remote backend (streaming progress) + - `VolumeTierMoveDatFromRemote` — download from remote (streaming progress) + +10.11. **Remote storage** (`grpc_remote.rs`): + - `FetchAndWriteNeedle` — fetch from remote storage, write locally, replicate + +10.12. **Query** (`grpc_query.rs`): + - `Query` — experimental CSV/JSON/Parquet select on stored data (streaming) + +10.13. **Master heartbeat** (`grpc_client_to_master.rs`): + - `heartbeat()` background task — periodic gRPC stream to master + - Send: volume info, EC shard info, disk stats, has-no-space flags, deleted volumes + - Receive: volume size limit, leader address, metrics config + - Reconnect on failure with backoff + - `StopHeartbeat()` for graceful shutdown + +10.14. **Tests:** + - Integration test per RPC: call via tonic client → verify response + - Streaming RPCs: verify all chunks received + - Error cases: invalid volume ID, non-existent volume, etc. + - Heartbeat: mock master gRPC server, verify registration + +--- + +### Phase 11: Startup, Lifecycle & Graceful Shutdown +**Goal:** Full server startup matching Go's `runVolume()` and `startVolumeServer()`. + +**Steps:** + +11.1. **Startup sequence** (match `volume.go` exactly): + 1. Load security configuration from `security.toml` + 2. Start metrics server on metrics port + 3. Parse folder/max/minFreeSpace/diskType/tags + 4. Validate all directory writable + 5. Resolve IP, bind IP, public URL, gRPC port + 6. Create `VolumeServer` struct + 7. Check with master (initial handshake) + 8. Create `Store` (loads all existing volumes from disk) + 9. Create security `Guard` + 10. Register HTTP routes on admin mux + 11. Optionally register public mux + 12. Start gRPC server on gRPC port + 13. Start public HTTP server (if separated) + 14. Start cluster HTTP server (with optional TLS) + 15. Start heartbeat background task + 16. Start metrics push loop + 17. Register SIGHUP handler for config reload + new volume loading + +11.2. **Graceful shutdown** (match Go exactly): + 1. On SIGINT/SIGTERM: + 2. Stop heartbeat (notify master we're leaving) + 3. Wait `preStopSeconds` + 4. Stop public HTTP server + 5. Stop cluster HTTP server + 6. Graceful stop gRPC server + 7. `volumeServer.Shutdown()` → `store.Close()` (flush all volumes) + +11.3. **Reload** (SIGHUP): + - Reload security config + - Update whitelist + - Load newly appeared volumes from disk + +11.4. **Tests:** + - Start server → send SIGTERM → verify clean shutdown + - Start server → SIGHUP → verify config reloaded + +--- + +### Phase 12: Integration & Cross-Compatibility Testing +**Goal:** Rust volume server is a drop-in replacement for Go volume server. + +**Steps:** + +12.1. **Binary compatibility tests:** + - Create volumes with Go volume server + - Start Rust volume server on same data directory + - Read all data → verify identical + - Write new data with Rust → read with Go → verify + +12.2. **API compatibility tests:** + - Run same HTTP requests against both Go and Rust servers + - Compare response bodies, headers, status codes + - Test all gRPC RPCs against both + +12.3. **Master interop test:** + - Start Go master server + - Register Rust volume server + - Verify heartbeat works + - Verify volume assignment works + - Upload via filer → stored on Rust volume server → read back + +12.4. **Performance benchmarks:** + - Throughput: sequential writes, sequential reads + - Latency: p50/p99 for read/write + - Concurrency: parallel reads/writes + - Compare Rust vs Go numbers + +12.5. **Edge cases:** + - Volume at max size + - Disk full handling + - Corrupt .dat file recovery + - Network partition during replication + - EC shard loss + rebuild + +--- + +## Execution Order & Dependencies + +``` +Phase 1 (Skeleton + CLI) ← no deps, start here + ↓ +Phase 2 (Storage types) ← needs Phase 1 (types used everywhere) + ↓ +Phase 3 (Volume struct) ← needs Phase 2 + ↓ +Phase 4 (Store manager) ← needs Phase 3 + ↓ +Phase 7 (Security) ← independent, can parallel with 3-4 +Phase 8 (Metrics) ← independent, can parallel with 3-4 + ↓ +Phase 9 (HTTP server) ← needs Phase 4 + 7 + 8 +Phase 10 (gRPC server) ← needs Phase 4 + 7 + 8 + ↓ +Phase 5 (Erasure coding) ← needs Phase 4, wire into Phase 10 +Phase 6 (Tiered storage) ← needs Phase 4, wire into Phase 10 + ↓ +Phase 11 (Startup + shutdown) ← needs Phase 9 + 10 + ↓ +Phase 12 (Integration tests) ← needs all above +``` + +## Estimated Scope + +| Phase | Estimated Rust Lines | Complexity | +|---|---|---| +| 1. Skeleton + CLI | ~400 | Low | +| 2. Storage types | ~2,000 | High (binary compat critical) | +| 3. Volume struct | ~2,500 | High | +| 4. Store manager | ~1,000 | Medium | +| 5. Erasure coding | ~3,000 | High | +| 6. Tiered storage | ~1,500 | Medium | +| 7. Security | ~500 | Medium | +| 8. Metrics | ~300 | Low | +| 9. HTTP server | ~2,000 | High | +| 10. gRPC server | ~3,500 | High | +| 11. Startup/shutdown | ~500 | Medium | +| 12. Integration tests | ~2,000 | Medium | +| **Total** | **~19,000** | | + +## Critical Invariants to Preserve + +1. **Binary format compatibility** — Rust must read/write `.dat`, `.idx`, `.vif`, `.ecX` files identically to Go. A single byte off = data loss. +2. **gRPC wire compatibility** — Same proto, same field semantics. Go master must talk to Rust volume server seamlessly. +3. **HTTP API compatibility** — Same URL patterns, same JSON response shapes, same headers, same status codes. +4. **Replication protocol** — Write replication between Go and Rust volume servers must work bidirectionally. +5. **Heartbeat protocol** — Rust volume server must register with Go master and maintain heartbeat. +6. **CRC32 algorithm** — Must use IEEE polynomial (same as Go's `crc32.ChecksumIEEE`). +7. **JWT compatibility** — Tokens signed by Go filer/master must be verifiable by Rust volume server and vice versa. diff --git a/docker/Dockerfile.go_build b/docker/Dockerfile.go_build index c1c9a523e..3b8e120ed 100644 --- a/docker/Dockerfile.go_build +++ b/docker/Dockerfile.go_build @@ -16,9 +16,31 @@ RUN cd /go/src/github.com/seaweedfs/seaweedfs/weed \ && export LDFLAGS="-X github.com/seaweedfs/seaweedfs/weed/util/version.COMMIT=$(git rev-parse --short HEAD)" \ && CGO_ENABLED=0 go install -tags "$TAGS" -ldflags "-extldflags -static ${LDFLAGS}" +# Rust volume server builder (amd64/arm64 only) +FROM rust:1-alpine as rust_builder +ARG TARGETARCH +RUN apk add musl-dev protobuf-dev git +COPY --from=builder /go/src/github.com/seaweedfs/seaweedfs/seaweed-volume /build/seaweed-volume +COPY --from=builder /go/src/github.com/seaweedfs/seaweedfs/proto /build/proto +WORKDIR /build/seaweed-volume +ARG TAGS +RUN if [ "$TARGETARCH" = "amd64" ] || [ "$TARGETARCH" = "arm64" ]; then \ + if [ "$TAGS" = "5BytesOffset" ]; then \ + cargo build --release; \ + else \ + cargo build --release --no-default-features; \ + fi && \ + cp target/release/weed-volume /weed-volume; \ + else \ + echo "Skipping Rust build for $TARGETARCH (unsupported)" && \ + touch /weed-volume; \ + fi + FROM alpine AS final LABEL author="Chris Lu" COPY --from=builder /go/bin/weed /usr/bin/ +# Copy Rust volume server binary (real binary on amd64/arm64, empty placeholder on other platforms) +COPY --from=rust_builder /weed-volume /usr/bin/weed-volume RUN mkdir -p /etc/seaweedfs COPY --from=builder /go/src/github.com/seaweedfs/seaweedfs/docker/filer.toml /etc/seaweedfs/filer.toml COPY --from=builder /go/src/github.com/seaweedfs/seaweedfs/docker/entrypoint.sh /entrypoint.sh diff --git a/docker/entrypoint.sh b/docker/entrypoint.sh index 7d8bd24f2..6632f6645 100755 --- a/docker/entrypoint.sh +++ b/docker/entrypoint.sh @@ -72,6 +72,20 @@ case "$1" in exec /usr/bin/weed -logtostderr=true volume $ARGS $@ ;; + 'volume-rust') + ARGS="-dir /data -max 0" + if isArgPassed "-max" "$@"; then + ARGS="-dir /data" + fi + shift + if [ ! -s /usr/bin/weed-volume ]; then + echo "Error: Rust volume server is not available on this platform ($(uname -m))." >&2 + echo "Use 'volume' for the Go volume server instead." >&2 + exit 1 + fi + exec /usr/bin/weed-volume $ARGS $@ + ;; + 'server') ARGS="-dir=/data -volume.max=0 -master.volumeSizeLimitMB=1024" if isArgPassed "-volume.max" "$@"; then diff --git a/install.sh b/install.sh new file mode 100755 index 000000000..86b45f165 --- /dev/null +++ b/install.sh @@ -0,0 +1,275 @@ +#!/bin/bash +# +# SeaweedFS Installer +# Downloads Go and/or Rust binaries from GitHub releases. +# +# Usage: +# curl -fsSL https://raw.githubusercontent.com/seaweedfs/seaweedfs/master/install.sh | bash +# curl -fsSL ... | bash -s -- --component volume-rust --large-disk +# curl -fsSL ... | bash -s -- --version v3.93 --dir /usr/local/bin +# +# Options: +# --component COMP Which binary to install: weed, volume-rust, all (default: weed) +# --version VER Release version tag (default: latest) +# --large-disk Use large disk variant (5-byte offset, 8TB max volume) +# --dir DIR Installation directory (default: /usr/local/bin) +# --help Show this help message + +set -euo pipefail + +REPO="seaweedfs/seaweedfs" +COMPONENT="weed" +VERSION="" +LARGE_DISK=false +INSTALL_DIR="/usr/local/bin" + +# Colors (if terminal supports them) +if [ -t 1 ]; then + RED='\033[0;31m' + GREEN='\033[0;32m' + YELLOW='\033[0;33m' + BLUE='\033[0;34m' + NC='\033[0m' +else + RED='' GREEN='' YELLOW='' BLUE='' NC='' +fi + +info() { echo -e "${BLUE}[info]${NC} $*"; } +ok() { echo -e "${GREEN}[ok]${NC} $*"; } +warn() { echo -e "${YELLOW}[warn]${NC} $*"; } +error() { echo -e "${RED}[error]${NC} $*" >&2; exit 1; } + +usage() { + sed -n '/^# Usage:/,/^$/p' "$0" | sed 's/^# \?//' + exit 0 +} + +# Parse arguments +while [ $# -gt 0 ]; do + case "$1" in + --component) COMPONENT="$2"; shift 2 ;; + --version) VERSION="$2"; shift 2 ;; + --large-disk) LARGE_DISK=true; shift ;; + --dir) INSTALL_DIR="$2"; shift 2 ;; + --help|-h) usage ;; + *) error "Unknown option: $1. Use --help for usage." ;; + esac +done + +# Detect OS and architecture +detect_platform() { + local os arch + + case "$(uname -s)" in + Linux*) os="linux" ;; + Darwin*) os="darwin" ;; + MINGW*|MSYS*|CYGWIN*) os="windows" ;; + FreeBSD*) os="freebsd" ;; + *) error "Unsupported OS: $(uname -s)" ;; + esac + + case "$(uname -m)" in + x86_64|amd64) arch="amd64" ;; + aarch64|arm64) arch="arm64" ;; + armv7l|armv6l) arch="arm" ;; + *) error "Unsupported architecture: $(uname -m)" ;; + esac + + echo "${os}" "${arch}" +} + +# Get latest release tag from GitHub API +get_latest_version() { + local url="https://api.github.com/repos/${REPO}/releases/latest" + if command -v curl &>/dev/null; then + curl -fsSL "$url" | grep '"tag_name"' | head -1 | sed 's/.*"tag_name": *"\([^"]*\)".*/\1/' + elif command -v wget &>/dev/null; then + wget -qO- "$url" | grep '"tag_name"' | head -1 | sed 's/.*"tag_name": *"\([^"]*\)".*/\1/' + else + error "Neither curl nor wget found. Please install one." + fi +} + +# Download a file +download() { + local url="$1" dest="$2" + info "Downloading ${url}" + if command -v curl &>/dev/null; then + curl -fsSL -o "$dest" "$url" + elif command -v wget &>/dev/null; then + wget -qO "$dest" "$url" + fi +} + +# Build Go weed binary asset name +go_asset_name() { + local os="$1" arch="$2" + local suffix="${os}_${arch}" + if [ "$LARGE_DISK" = true ]; then + suffix="${suffix}_large_disk" + fi + echo "${suffix}.tar.gz" +} + +# Build Rust volume server asset name +rust_asset_name() { + local os="$1" arch="$2" + local prefix="weed-volume" + if [ "$LARGE_DISK" = true ]; then + prefix="weed-volume_large_disk" + else + prefix="weed-volume" + fi + local suffix="${os}_${arch}" + if [ "$os" = "windows" ]; then + echo "${prefix}_${suffix}.zip" + else + echo "${prefix}_${suffix}.tar.gz" + fi +} + +# Install a single component +install_component() { + local component="$1" os="$2" arch="$3" + local asset_name download_url tmpdir + + tmpdir="$(mktemp -d)" + trap "rm -rf '$tmpdir'" EXIT + + case "$component" in + weed) + asset_name="$(go_asset_name "$os" "$arch")" + download_url="https://github.com/${REPO}/releases/download/${VERSION}/${asset_name}" + download "$download_url" "${tmpdir}/${asset_name}" + + info "Extracting ${asset_name}..." + tar xzf "${tmpdir}/${asset_name}" -C "$tmpdir" + + # The Go release action puts the binary inside a directory + local weed_bin + weed_bin="$(find "$tmpdir" -name 'weed' -type f | head -1)" + if [ -z "$weed_bin" ]; then + weed_bin="$(find "$tmpdir" -name 'weed.exe' -type f | head -1)" + fi + if [ -z "$weed_bin" ]; then + error "Could not find weed binary in archive" + fi + + chmod +x "$weed_bin" + install_binary "$weed_bin" "weed" + ok "Installed weed to ${INSTALL_DIR}/weed" + ;; + + volume-rust) + # Check platform support for Rust volume server + case "$os" in + linux|darwin|windows) ;; + *) error "Rust volume server is not available for ${os}. Supported: linux, darwin, windows" ;; + esac + case "$arch" in + amd64|arm64) ;; + *) error "Rust volume server is not available for ${arch}. Supported: amd64, arm64" ;; + esac + + asset_name="$(rust_asset_name "$os" "$arch")" + download_url="https://github.com/${REPO}/releases/download/${VERSION}/${asset_name}" + download "$download_url" "${tmpdir}/${asset_name}" + + info "Extracting ${asset_name}..." + if [ "$os" = "windows" ]; then + unzip -q "${tmpdir}/${asset_name}" -d "$tmpdir" + else + tar xzf "${tmpdir}/${asset_name}" -C "$tmpdir" + fi + + local rust_bin + if [ "$LARGE_DISK" = true ]; then + rust_bin="$(find "$tmpdir" -name 'weed-volume-large-disk*' -type f | head -1)" + else + rust_bin="$(find "$tmpdir" -name 'weed-volume-normal*' -type f | head -1)" + fi + if [ -z "$rust_bin" ]; then + rust_bin="$(find "$tmpdir" -name 'weed-volume*' -type f | head -1)" + fi + if [ -z "$rust_bin" ]; then + error "Could not find weed-volume binary in archive" + fi + + chmod +x "$rust_bin" + local dest_name="weed-volume" + if [ "$os" = "windows" ]; then + dest_name="weed-volume.exe" + fi + install_binary "$rust_bin" "$dest_name" + ok "Installed weed-volume to ${INSTALL_DIR}/${dest_name}" + ;; + + *) + error "Unknown component: ${component}. Use: weed, volume-rust, all" + ;; + esac +} + +# Copy binary to install dir, using sudo if needed +install_binary() { + local src="$1" name="$2" + local dest="${INSTALL_DIR}/${name}" + + mkdir -p "$INSTALL_DIR" 2>/dev/null || true + + if [ -w "$INSTALL_DIR" ]; then + cp "$src" "$dest" + else + info "Need elevated permissions to write to ${INSTALL_DIR}" + sudo cp "$src" "$dest" + fi + chmod +x "$dest" 2>/dev/null || sudo chmod +x "$dest" +} + +main() { + info "SeaweedFS Installer" + + read -r os arch <<< "$(detect_platform)" + info "Detected platform: ${os}/${arch}" + + if [ -z "$VERSION" ]; then + info "Resolving latest release..." + VERSION="$(get_latest_version)" + if [ -z "$VERSION" ]; then + error "Could not determine latest version. Specify with --version" + fi + fi + info "Version: ${VERSION}" + + if [ "$LARGE_DISK" = true ]; then + info "Variant: large disk (8TB max volume)" + else + info "Variant: normal (32GB max volume)" + fi + + case "$COMPONENT" in + all) + install_component "weed" "$os" "$arch" + install_component "volume-rust" "$os" "$arch" + ;; + *) + install_component "$COMPONENT" "$os" "$arch" + ;; + esac + + echo "" + ok "Installation complete!" + if [ "$COMPONENT" = "weed" ] || [ "$COMPONENT" = "all" ]; then + info " weed: ${INSTALL_DIR}/weed" + fi + if [ "$COMPONENT" = "volume-rust" ] || [ "$COMPONENT" = "all" ]; then + info " weed-volume: ${INSTALL_DIR}/weed-volume" + fi + echo "" + info "Quick start:" + info " weed master # Start master server" + info " weed volume -mserver=localhost:9333 # Start Go volume server" + info " weed-volume -mserver localhost:9333 # Start Rust volume server" +} + +main diff --git a/seaweed-volume/Cargo.lock b/seaweed-volume/Cargo.lock new file mode 100644 index 000000000..b5401c9a5 --- /dev/null +++ b/seaweed-volume/Cargo.lock @@ -0,0 +1,5255 @@ +# This file is automatically @generated by Cargo. +# It is not intended for manual editing. +version = 4 + +[[package]] +name = "addr2line" +version = "0.25.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1b5d307320b3181d6d7954e663bd7c774a838b8220fe0593c86d9fb09f498b4b" +dependencies = [ + "gimli", +] + +[[package]] +name = "adler2" +version = "2.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "320119579fcad9c21884f5c4861d16174d0e06250625266f50fe6898340abefa" + +[[package]] +name = "aho-corasick" +version = "1.1.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ddd31a130427c27518df266943a5308ed92d4b226cc639f5a8f1002816174301" +dependencies = [ + "memchr", +] + +[[package]] +name = "aligned-vec" +version = "0.6.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "dc890384c8602f339876ded803c97ad529f3842aba97f6392b3dba0dd171769b" +dependencies = [ + "equator", +] + +[[package]] +name = "allocator-api2" +version = "0.2.21" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "683d7910e743518b0e34f1186f92494becacb047c7b6bf616c96772180fef923" + +[[package]] +name = "android_system_properties" +version = "0.1.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "819e7219dbd41043ac279b19830f2efc897156490d7fd6ea916720117ee66311" +dependencies = [ + "libc", +] + +[[package]] +name = "anstream" +version = "0.6.21" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "43d5b281e737544384e969a5ccad3f1cdd24b48086a0fc1b2a5262a26b8f4f4a" +dependencies = [ + "anstyle", + "anstyle-parse", + "anstyle-query", + "anstyle-wincon", + "colorchoice", + "is_terminal_polyfill", + "utf8parse", +] + +[[package]] +name = "anstyle" +version = "1.0.13" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5192cca8006f1fd4f7237516f40fa183bb07f8fbdfedaa0036de5ea9b0b45e78" + +[[package]] +name = "anstyle-parse" +version = "0.2.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4e7644824f0aa2c7b9384579234ef10eb7efb6a0deb83f9630a49594dd9c15c2" +dependencies = [ + "utf8parse", +] + +[[package]] +name = "anstyle-query" +version = "1.1.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "40c48f72fd53cd289104fc64099abca73db4166ad86ea0b4341abe65af83dadc" +dependencies = [ + "windows-sys 0.61.2", +] + +[[package]] +name = "anstyle-wincon" +version = "3.0.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "291e6a250ff86cd4a820112fb8898808a366d8f9f58ce16d1f538353ad55747d" +dependencies = [ + "anstyle", + "once_cell_polyfill", + "windows-sys 0.61.2", +] + +[[package]] +name = "anyhow" +version = "1.0.102" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7f202df86484c868dbad7eaa557ef785d5c66295e41b460ef922eca0723b842c" + +[[package]] +name = "asn1-rs" +version = "0.6.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5493c3bedbacf7fd7382c6346bbd66687d12bbaad3a89a2d2c303ee6cf20b048" +dependencies = [ + "asn1-rs-derive", + "asn1-rs-impl", + "displaydoc", + "nom", + "num-traits", + "rusticata-macros", + "thiserror 1.0.69", + "time", +] + +[[package]] +name = "asn1-rs-derive" +version = "0.5.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "965c2d33e53cb6b267e148a4cb0760bc01f4904c1cd4bb4002a085bb016d1490" +dependencies = [ + "proc-macro2", + "quote", + "syn", + "synstructure", +] + +[[package]] +name = "asn1-rs-impl" +version = "0.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7b18050c2cd6fe86c3a76584ef5e0baf286d038cda203eb6223df2cc413565f7" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "async-stream" +version = "0.3.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0b5a71a6f37880a80d1d7f19efd781e4b5de42c88f0722cc13bcb6cc2cfe8476" +dependencies = [ + "async-stream-impl", + "futures-core", + "pin-project-lite", +] + +[[package]] +name = "async-stream-impl" +version = "0.3.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c7c24de15d275a1ecfd47a380fb4d5ec9bfe0933f309ed5e705b775596a3574d" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "async-trait" +version = "0.1.89" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9035ad2d096bed7955a320ee7e2230574d28fd3c3a0f186cbea1ff3c7eed5dbb" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "atomic-waker" +version = "1.1.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1505bd5d3d116872e7271a6d4e16d81d0c8570876c8de68093a09ac269d8aac0" + +[[package]] +name = "autocfg" +version = "1.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c08606f8c3cbf4ce6ec8e28fb0014a2c086708fe954eaa885384a6165172e7e8" + +[[package]] +name = "aws-config" +version = "1.8.15" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "11493b0bad143270fb8ad284a096dd529ba91924c5409adeac856cc1bf047dbc" +dependencies = [ + "aws-credential-types", + "aws-runtime", + "aws-sdk-sso", + "aws-sdk-ssooidc", + "aws-sdk-sts", + "aws-smithy-async", + "aws-smithy-http", + "aws-smithy-json", + "aws-smithy-runtime", + "aws-smithy-runtime-api", + "aws-smithy-types", + "aws-types", + "bytes", + "fastrand", + "hex", + "http 1.4.0", + "sha1", + "time", + "tokio", + "tracing", + "url", + "zeroize", +] + +[[package]] +name = "aws-credential-types" +version = "1.2.14" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8f20799b373a1be121fe3005fba0c2090af9411573878f224df44b42727fcaf7" +dependencies = [ + "aws-smithy-async", + "aws-smithy-runtime-api", + "aws-smithy-types", + "zeroize", +] + +[[package]] +name = "aws-lc-rs" +version = "1.16.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a054912289d18629dc78375ba2c3726a3afe3ff71b4edba9dedfca0e3446d1fc" +dependencies = [ + "aws-lc-sys", + "zeroize", +] + +[[package]] +name = "aws-lc-sys" +version = "0.39.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1fa7e52a4c5c547c741610a2c6f123f3881e409b714cd27e6798ef020c514f0a" +dependencies = [ + "cc", + "cmake", + "dunce", + "fs_extra", +] + +[[package]] +name = "aws-runtime" +version = "1.7.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5fc0651c57e384202e47153c1260b84a9936e19803d747615edf199dc3b98d17" +dependencies = [ + "aws-credential-types", + "aws-sigv4", + "aws-smithy-async", + "aws-smithy-eventstream", + "aws-smithy-http", + "aws-smithy-runtime", + "aws-smithy-runtime-api", + "aws-smithy-types", + "aws-types", + "bytes", + "bytes-utils", + "fastrand", + "http 0.2.12", + "http 1.4.0", + "http-body 0.4.6", + "http-body 1.0.1", + "percent-encoding", + "pin-project-lite", + "tracing", + "uuid", +] + +[[package]] +name = "aws-sdk-s3" +version = "1.125.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "223f5c95650d9557925a91f4c2db3def189e8f659452134a29e5cd2d37d708ed" +dependencies = [ + "aws-credential-types", + "aws-runtime", + "aws-sigv4", + "aws-smithy-async", + "aws-smithy-checksums", + "aws-smithy-eventstream", + "aws-smithy-http", + "aws-smithy-json", + "aws-smithy-observability", + "aws-smithy-runtime", + "aws-smithy-runtime-api", + "aws-smithy-types", + "aws-smithy-xml", + "aws-types", + "bytes", + "fastrand", + "hex", + "hmac", + "http 0.2.12", + "http 1.4.0", + "http-body 1.0.1", + "lru", + "percent-encoding", + "regex-lite", + "sha2", + "tracing", + "url", +] + +[[package]] +name = "aws-sdk-sso" +version = "1.96.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f64a6eded248c6b453966e915d32aeddb48ea63ad17932682774eb026fbef5b1" +dependencies = [ + "aws-credential-types", + "aws-runtime", + "aws-smithy-async", + "aws-smithy-http", + "aws-smithy-json", + "aws-smithy-observability", + "aws-smithy-runtime", + "aws-smithy-runtime-api", + "aws-smithy-types", + "aws-types", + "bytes", + "fastrand", + "http 0.2.12", + "http 1.4.0", + "regex-lite", + "tracing", +] + +[[package]] +name = "aws-sdk-ssooidc" +version = "1.98.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "db96d720d3c622fcbe08bae1c4b04a72ce6257d8b0584cb5418da00ae20a344f" +dependencies = [ + "aws-credential-types", + "aws-runtime", + "aws-smithy-async", + "aws-smithy-http", + "aws-smithy-json", + "aws-smithy-observability", + "aws-smithy-runtime", + "aws-smithy-runtime-api", + "aws-smithy-types", + "aws-types", + "bytes", + "fastrand", + "http 0.2.12", + "http 1.4.0", + "regex-lite", + "tracing", +] + +[[package]] +name = "aws-sdk-sts" +version = "1.100.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fafbdda43b93f57f699c5dfe8328db590b967b8a820a13ccdd6687355dfcc7ca" +dependencies = [ + "aws-credential-types", + "aws-runtime", + "aws-smithy-async", + "aws-smithy-http", + "aws-smithy-json", + "aws-smithy-observability", + "aws-smithy-query", + "aws-smithy-runtime", + "aws-smithy-runtime-api", + "aws-smithy-types", + "aws-smithy-xml", + "aws-types", + "fastrand", + "http 0.2.12", + "http 1.4.0", + "regex-lite", + "tracing", +] + +[[package]] +name = "aws-sigv4" +version = "1.4.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b0b660013a6683ab23797778e21f1f854744fdf05f68204b4cca4c8c04b5d1f4" +dependencies = [ + "aws-credential-types", + "aws-smithy-eventstream", + "aws-smithy-http", + "aws-smithy-runtime-api", + "aws-smithy-types", + "bytes", + "crypto-bigint 0.5.5", + "form_urlencoded", + "hex", + "hmac", + "http 0.2.12", + "http 1.4.0", + "p256 0.11.1", + "percent-encoding", + "ring", + "sha2", + "subtle", + "time", + "tracing", + "zeroize", +] + +[[package]] +name = "aws-smithy-async" +version = "1.2.14" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2ffcaf626bdda484571968400c326a244598634dc75fd451325a54ad1a59acfc" +dependencies = [ + "futures-util", + "pin-project-lite", + "tokio", +] + +[[package]] +name = "aws-smithy-checksums" +version = "0.64.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6750f3dd509b0694a4377f0293ed2f9630d710b1cebe281fa8bac8f099f88bc6" +dependencies = [ + "aws-smithy-http", + "aws-smithy-types", + "bytes", + "crc-fast", + "hex", + "http 1.4.0", + "http-body 1.0.1", + "http-body-util", + "md-5", + "pin-project-lite", + "sha1", + "sha2", + "tracing", +] + +[[package]] +name = "aws-smithy-eventstream" +version = "0.60.20" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "faf09d74e5e32f76b8762da505a3cd59303e367a664ca67295387baa8c1d7548" +dependencies = [ + "aws-smithy-types", + "bytes", + "crc32fast", +] + +[[package]] +name = "aws-smithy-http" +version = "0.63.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ba1ab2dc1c2c3749ead27180d333c42f11be8b0e934058fb4b2258ee8dbe5231" +dependencies = [ + "aws-smithy-eventstream", + "aws-smithy-runtime-api", + "aws-smithy-types", + "bytes", + "bytes-utils", + "futures-core", + "futures-util", + "http 1.4.0", + "http-body 1.0.1", + "http-body-util", + "percent-encoding", + "pin-project-lite", + "pin-utils", + "tracing", +] + +[[package]] +name = "aws-smithy-http-client" +version = "1.1.12" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6a2f165a7feee6f263028b899d0a181987f4fa7179a6411a32a439fba7c5f769" +dependencies = [ + "aws-smithy-async", + "aws-smithy-runtime-api", + "aws-smithy-types", + "h2", + "http 1.4.0", + "hyper", + "hyper-rustls", + "hyper-util", + "pin-project-lite", + "rustls", + "rustls-native-certs", + "rustls-pki-types", + "tokio", + "tokio-rustls", + "tower 0.5.3", + "tracing", +] + +[[package]] +name = "aws-smithy-json" +version = "0.62.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9648b0bb82a2eedd844052c6ad2a1a822d1f8e3adee5fbf668366717e428856a" +dependencies = [ + "aws-smithy-types", +] + +[[package]] +name = "aws-smithy-observability" +version = "0.2.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a06c2315d173edbf1920da8ba3a7189695827002e4c0fc961973ab1c54abca9c" +dependencies = [ + "aws-smithy-runtime-api", +] + +[[package]] +name = "aws-smithy-query" +version = "0.60.15" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1a56d79744fb3edb5d722ef79d86081e121d3b9422cb209eb03aea6aa4f21ebd" +dependencies = [ + "aws-smithy-types", + "urlencoding", +] + +[[package]] +name = "aws-smithy-runtime" +version = "1.10.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "028999056d2d2fd58a697232f9eec4a643cf73a71cf327690a7edad1d2af2110" +dependencies = [ + "aws-smithy-async", + "aws-smithy-http", + "aws-smithy-http-client", + "aws-smithy-observability", + "aws-smithy-runtime-api", + "aws-smithy-types", + "bytes", + "fastrand", + "http 0.2.12", + "http 1.4.0", + "http-body 0.4.6", + "http-body 1.0.1", + "http-body-util", + "pin-project-lite", + "pin-utils", + "tokio", + "tracing", +] + +[[package]] +name = "aws-smithy-runtime-api" +version = "1.11.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "876ab3c9c29791ba4ba02b780a3049e21ec63dabda09268b175272c3733a79e6" +dependencies = [ + "aws-smithy-async", + "aws-smithy-types", + "bytes", + "http 0.2.12", + "http 1.4.0", + "pin-project-lite", + "tokio", + "tracing", + "zeroize", +] + +[[package]] +name = "aws-smithy-types" +version = "1.4.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d2b1117b3b2bbe166d11199b540ceed0d0f7676e36e7b962b5a437a9971eac75" +dependencies = [ + "base64-simd", + "bytes", + "bytes-utils", + "futures-core", + "http 0.2.12", + "http 1.4.0", + "http-body 0.4.6", + "http-body 1.0.1", + "http-body-util", + "itoa", + "num-integer", + "pin-project-lite", + "pin-utils", + "ryu", + "serde", + "time", + "tokio", + "tokio-util", +] + +[[package]] +name = "aws-smithy-xml" +version = "0.60.15" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0ce02add1aa3677d022f8adf81dcbe3046a95f17a1b1e8979c145cd21d3d22b3" +dependencies = [ + "xmlparser", +] + +[[package]] +name = "aws-types" +version = "1.3.14" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "47c8323699dd9b3c8d5b3c13051ae9cdef58fd179957c882f8374dd8725962d9" +dependencies = [ + "aws-credential-types", + "aws-smithy-async", + "aws-smithy-runtime-api", + "aws-smithy-types", + "rustc_version", + "tracing", +] + +[[package]] +name = "axum" +version = "0.7.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "edca88bc138befd0323b20752846e6587272d3b03b0343c8ea28a6f819e6e71f" +dependencies = [ + "async-trait", + "axum-core", + "bytes", + "futures-util", + "http 1.4.0", + "http-body 1.0.1", + "http-body-util", + "hyper", + "hyper-util", + "itoa", + "matchit", + "memchr", + "mime", + "multer", + "percent-encoding", + "pin-project-lite", + "rustversion", + "serde", + "serde_json", + "serde_path_to_error", + "serde_urlencoded", + "sync_wrapper", + "tokio", + "tower 0.5.3", + "tower-layer", + "tower-service", + "tracing", +] + +[[package]] +name = "axum-core" +version = "0.4.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "09f2bd6146b97ae3359fa0cc6d6b376d9539582c7b4220f041a33ec24c226199" +dependencies = [ + "async-trait", + "bytes", + "futures-util", + "http 1.4.0", + "http-body 1.0.1", + "http-body-util", + "mime", + "pin-project-lite", + "rustversion", + "sync_wrapper", + "tower-layer", + "tower-service", + "tracing", +] + +[[package]] +name = "backtrace" +version = "0.3.76" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bb531853791a215d7c62a30daf0dde835f381ab5de4589cfe7c649d2cbe92bd6" +dependencies = [ + "addr2line", + "cfg-if", + "libc", + "miniz_oxide", + "object", + "rustc-demangle", + "windows-link", +] + +[[package]] +name = "base16ct" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "349a06037c7bf932dd7e7d1f653678b2038b9ad46a74102f1fc7bd7872678cce" + +[[package]] +name = "base16ct" +version = "0.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4c7f02d4ea65f2c1853089ffd8d2787bdbc63de2f0d29dedbcf8ccdfa0ccd4cf" + +[[package]] +name = "base64" +version = "0.22.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "72b3254f16251a8381aa12e40e3c4d2f0199f8c6508fbecb9d91f575e0fbb8c6" + +[[package]] +name = "base64-simd" +version = "0.8.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "339abbe78e73178762e23bea9dfd08e697eb3f3301cd4be981c0f78ba5859195" +dependencies = [ + "outref", + "vsimd", +] + +[[package]] +name = "base64ct" +version = "1.8.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2af50177e190e07a26ab74f8b1efbfe2ef87da2116221318cb1c2e82baf7de06" + +[[package]] +name = "bitflags" +version = "1.3.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bef38d45163c2f1dde094a7dfd33ccf595c92905c8f8f4fdc18d06fb1037718a" + +[[package]] +name = "bitflags" +version = "2.11.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "843867be96c8daad0d758b57df9392b6d8d271134fce549de6ce169ff98a92af" + +[[package]] +name = "block-buffer" +version = "0.10.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3078c7629b62d3f0439517fa394996acacc5cbc91c5a20d8c658e77abd503a71" +dependencies = [ + "generic-array", +] + +[[package]] +name = "bumpalo" +version = "3.20.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5d20789868f4b01b2f2caec9f5c4e0213b41e3e5702a50157d699ae31ced2fcb" + +[[package]] +name = "bytemuck" +version = "1.25.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c8efb64bd706a16a1bdde310ae86b351e4d21550d98d056f22f8a7f7a2183fec" + +[[package]] +name = "byteorder-lite" +version = "0.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8f1fe948ff07f4bd06c30984e69f5b4899c516a3ef74f34df92a2df2ab535495" + +[[package]] +name = "bytes" +version = "1.11.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1e748733b7cbc798e1434b6ac524f0c1ff2ab456fe201501e6497c8417a4fc33" + +[[package]] +name = "bytes-utils" +version = "0.1.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7dafe3a8757b027e2be6e4e5601ed563c55989fcf1546e933c66c8eb3a058d35" +dependencies = [ + "bytes", + "either", +] + +[[package]] +name = "cc" +version = "1.2.56" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "aebf35691d1bfb0ac386a69bac2fde4dd276fb618cf8bf4f5318fe285e821bb2" +dependencies = [ + "find-msvc-tools", + "jobserver", + "libc", + "shlex", +] + +[[package]] +name = "cfg-if" +version = "1.0.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9330f8b2ff13f34540b44e946ef35111825727b38d33286ef986142615121801" + +[[package]] +name = "cfg_aliases" +version = "0.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "613afe47fcd5fac7ccf1db93babcb082c5994d996f20b8b159f2ad1658eb5724" + +[[package]] +name = "chrono" +version = "0.4.44" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c673075a2e0e5f4a1dde27ce9dee1ea4558c7ffe648f576438a20ca1d2acc4b0" +dependencies = [ + "iana-time-zone", + "js-sys", + "num-traits", + "wasm-bindgen", + "windows-link", +] + +[[package]] +name = "clap" +version = "4.5.60" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2797f34da339ce31042b27d23607e051786132987f595b02ba4f6a6dffb7030a" +dependencies = [ + "clap_builder", + "clap_derive", +] + +[[package]] +name = "clap_builder" +version = "4.5.60" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "24a241312cea5059b13574bb9b3861cabf758b879c15190b37b6d6fd63ab6876" +dependencies = [ + "anstream", + "anstyle", + "clap_lex", + "strsim", +] + +[[package]] +name = "clap_derive" +version = "4.5.55" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a92793da1a46a5f2a02a6f4c46c6496b28c43638adea8306fcb0caa1634f24e5" +dependencies = [ + "heck", + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "clap_lex" +version = "1.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3a822ea5bc7590f9d40f1ba12c0dc3c2760f3482c6984db1573ad11031420831" + +[[package]] +name = "cmake" +version = "0.1.57" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "75443c44cd6b379beb8c5b45d85d0773baf31cce901fe7bb252f4eff3008ef7d" +dependencies = [ + "cc", +] + +[[package]] +name = "color_quant" +version = "1.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3d7b894f5411737b7867f4827955924d7c254fc9f4d91a6aad6b097804b1018b" + +[[package]] +name = "colorchoice" +version = "1.0.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b05b61dc5112cbb17e4b6cd61790d9845d13888356391624cbe7e41efeac1e75" + +[[package]] +name = "const-oid" +version = "0.9.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c2459377285ad874054d797f3ccebf984978aa39129f6eafde5cdc8315b612f8" + +[[package]] +name = "core-foundation" +version = "0.9.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "91e195e091a93c46f7102ec7818a2aa394e1e1771c3ab4825963fa03e45afb8f" +dependencies = [ + "core-foundation-sys", + "libc", +] + +[[package]] +name = "core-foundation" +version = "0.10.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b2a6cd9ae233e7f62ba4e9353e81a88df7fc8a5987b8d445b4d90c879bd156f6" +dependencies = [ + "core-foundation-sys", + "libc", +] + +[[package]] +name = "core-foundation-sys" +version = "0.8.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "773648b94d0e5d620f64f280777445740e61fe701025087ec8b57f45c791888b" + +[[package]] +name = "cpp_demangle" +version = "0.5.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0667304c32ea56cb4cd6d2d7c0cfe9a2f8041229db8c033af7f8d69492429def" +dependencies = [ + "cfg-if", +] + +[[package]] +name = "cpufeatures" +version = "0.2.17" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "59ed5838eebb26a2bb2e58f6d5b5316989ae9d08bab10e0e6d103e656d1b0280" +dependencies = [ + "libc", +] + +[[package]] +name = "crc" +version = "3.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9710d3b3739c2e349eb44fe848ad0b7c8cb1e42bd87ee49371df2f7acaf3e675" +dependencies = [ + "crc-catalog", +] + +[[package]] +name = "crc-catalog" +version = "2.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "19d374276b40fb8bbdee95aef7c7fa6b5316ec764510eb64b8dd0e2ed0d7e7f5" + +[[package]] +name = "crc-fast" +version = "1.9.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2fd92aca2c6001b1bf5ba0ff84ee74ec8501b52bbef0cac80bf25a6c1d87a83d" +dependencies = [ + "crc", + "digest", + "rustversion", + "spin 0.10.0", +] + +[[package]] +name = "crc32c" +version = "0.6.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3a47af21622d091a8f0fb295b88bc886ac74efcc613efc19f5d0b21de5c89e47" +dependencies = [ + "rustc_version", +] + +[[package]] +name = "crc32fast" +version = "1.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9481c1c90cbf2ac953f07c8d4a58aa3945c425b7185c9154d67a65e4230da511" +dependencies = [ + "cfg-if", +] + +[[package]] +name = "crossbeam-deque" +version = "0.8.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9dd111b7b7f7d55b72c0a6ae361660ee5853c9af73f70c3c2ef6858b950e2e51" +dependencies = [ + "crossbeam-epoch", + "crossbeam-utils", +] + +[[package]] +name = "crossbeam-epoch" +version = "0.9.18" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5b82ac4a3c2ca9c3460964f020e1402edd5753411d7737aa39c3714ad1b5420e" +dependencies = [ + "crossbeam-utils", +] + +[[package]] +name = "crossbeam-utils" +version = "0.8.21" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d0a5c400df2834b80a4c3327b3aad3a4c4cd4de0629063962b03235697506a28" + +[[package]] +name = "crypto-bigint" +version = "0.4.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ef2b4b23cddf68b89b8f8069890e8c270d54e2d5fe1b143820234805e4cb17ef" +dependencies = [ + "generic-array", + "rand_core 0.6.4", + "subtle", + "zeroize", +] + +[[package]] +name = "crypto-bigint" +version = "0.5.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0dc92fb57ca44df6db8059111ab3af99a63d5d0f8375d9972e319a379c6bab76" +dependencies = [ + "generic-array", + "rand_core 0.6.4", + "subtle", + "zeroize", +] + +[[package]] +name = "crypto-common" +version = "0.1.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1bfb12502f3fc46cca1bb51ac28df9d618d813cdc3d2f25b9fe775a34af26bb3" +dependencies = [ + "generic-array", + "typenum", +] + +[[package]] +name = "curve25519-dalek" +version = "4.1.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "97fb8b7c4503de7d6ae7b42ab72a5a59857b4c937ec27a3d4539dba95b5ab2be" +dependencies = [ + "cfg-if", + "cpufeatures", + "curve25519-dalek-derive", + "digest", + "fiat-crypto", + "rustc_version", + "subtle", + "zeroize", +] + +[[package]] +name = "curve25519-dalek-derive" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f46882e17999c6cc590af592290432be3bce0428cb0d5f8b6715e4dc7b383eb3" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "dashmap" +version = "6.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5041cc499144891f3790297212f32a74fb938e5136a14943f338ef9e0ae276cf" +dependencies = [ + "cfg-if", + "crossbeam-utils", + "hashbrown 0.14.5", + "lock_api", + "once_cell", + "parking_lot_core 0.9.12", +] + +[[package]] +name = "data-encoding" +version = "2.10.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d7a1e2f27636f116493b8b860f5546edb47c8d8f8ea73e1d2a20be88e28d1fea" + +[[package]] +name = "debugid" +version = "0.8.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bef552e6f588e446098f6ba40d89ac146c8c7b64aade83c051ee00bb5d2bc18d" +dependencies = [ + "uuid", +] + +[[package]] +name = "der" +version = "0.6.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f1a467a65c5e759bce6e65eaf91cc29f466cdc57cb65777bd646872a8a1fd4de" +dependencies = [ + "const-oid", + "zeroize", +] + +[[package]] +name = "der" +version = "0.7.10" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e7c1832837b905bbfb5101e07cc24c8deddf52f93225eee6ead5f4d63d53ddcb" +dependencies = [ + "const-oid", + "pem-rfc7468", + "zeroize", +] + +[[package]] +name = "der-parser" +version = "9.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5cd0a5c643689626bec213c4d8bd4d96acc8ffdb4ad4bb6bc16abf27d5f4b553" +dependencies = [ + "asn1-rs", + "displaydoc", + "nom", + "num-bigint", + "num-traits", + "rusticata-macros", +] + +[[package]] +name = "deranged" +version = "0.5.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7cd812cc2bc1d69d4764bd80df88b4317eaef9e773c75226407d9bc0876b211c" +dependencies = [ + "powerfmt", +] + +[[package]] +name = "digest" +version = "0.10.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9ed9a281f7bc9b7576e61468ba615a66a5c8cfdff42420a70aa82701a3b1e292" +dependencies = [ + "block-buffer", + "const-oid", + "crypto-common", + "subtle", +] + +[[package]] +name = "displaydoc" +version = "0.2.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "97369cbbc041bc366949bc74d34658d6cda5621039731c6310521892a3a20ae0" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "dunce" +version = "1.0.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "92773504d58c093f6de2459af4af33faa518c13451eb8f2b5698ed3d36e7c813" + +[[package]] +name = "ecdsa" +version = "0.14.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "413301934810f597c1d19ca71c8710e99a3f1ba28a0d2ebc01551a2daeea3c5c" +dependencies = [ + "der 0.6.1", + "elliptic-curve 0.12.3", + "rfc6979 0.3.1", + "signature 1.6.4", +] + +[[package]] +name = "ecdsa" +version = "0.16.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ee27f32b5c5292967d2d4a9d7f1e0b0aed2c15daded5a60300e4abb9d8020bca" +dependencies = [ + "der 0.7.10", + "digest", + "elliptic-curve 0.13.8", + "rfc6979 0.4.0", + "signature 2.2.0", + "spki 0.7.3", +] + +[[package]] +name = "ed25519" +version = "2.2.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "115531babc129696a58c64a4fef0a8bf9e9698629fb97e9e40767d235cfbcd53" +dependencies = [ + "pkcs8 0.10.2", + "signature 2.2.0", +] + +[[package]] +name = "ed25519-dalek" +version = "2.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "70e796c081cee67dc755e1a36a0a172b897fab85fc3f6bc48307991f64e4eca9" +dependencies = [ + "curve25519-dalek", + "ed25519", + "serde", + "sha2", + "subtle", + "zeroize", +] + +[[package]] +name = "either" +version = "1.15.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "48c757948c5ede0e46177b7add2e67155f70e33c07fea8284df6576da70b3719" + +[[package]] +name = "elliptic-curve" +version = "0.12.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e7bb888ab5300a19b8e5bceef25ac745ad065f3c9f7efc6de1b91958110891d3" +dependencies = [ + "base16ct 0.1.1", + "crypto-bigint 0.4.9", + "der 0.6.1", + "digest", + "ff 0.12.1", + "generic-array", + "group 0.12.1", + "pkcs8 0.9.0", + "rand_core 0.6.4", + "sec1 0.3.0", + "subtle", + "zeroize", +] + +[[package]] +name = "elliptic-curve" +version = "0.13.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b5e6043086bf7973472e0c7dff2142ea0b680d30e18d9cc40f267efbf222bd47" +dependencies = [ + "base16ct 0.2.0", + "crypto-bigint 0.5.5", + "digest", + "ff 0.13.1", + "generic-array", + "group 0.13.0", + "hkdf", + "pem-rfc7468", + "pkcs8 0.10.2", + "rand_core 0.6.4", + "sec1 0.7.3", + "subtle", + "zeroize", +] + +[[package]] +name = "encoding_rs" +version = "0.8.35" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "75030f3c4f45dafd7586dd6780965a8c7e8e285a5ecb86713e63a79c5b2766f3" +dependencies = [ + "cfg-if", +] + +[[package]] +name = "equator" +version = "0.4.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4711b213838dfee0117e3be6ac926007d7f433d7bbe33595975d4190cb07e6fc" +dependencies = [ + "equator-macro", +] + +[[package]] +name = "equator-macro" +version = "0.4.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "44f23cf4b44bfce11a86ace86f8a73ffdec849c9fd00a386a53d278bd9e81fb3" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "equivalent" +version = "1.0.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "877a4ace8713b0bcf2a4e7eec82529c029f1d0619886d18145fea96c3ffe5c0f" + +[[package]] +name = "errno" +version = "0.2.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f639046355ee4f37944e44f60642c6f3a7efa3cf6b78c78a0d989a8ce6c396a1" +dependencies = [ + "errno-dragonfly", + "libc", + "winapi", +] + +[[package]] +name = "errno" +version = "0.3.14" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "39cab71617ae0d63f51a36d69f866391735b51691dbda63cf6f96d042b63efeb" +dependencies = [ + "libc", + "windows-sys 0.61.2", +] + +[[package]] +name = "errno-dragonfly" +version = "0.1.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "aa68f1b12764fab894d2755d2518754e71b4fd80ecfb822714a1206c2aab39bf" +dependencies = [ + "cc", + "libc", +] + +[[package]] +name = "fastrand" +version = "2.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "37909eebbb50d72f9059c3b6d82c0463f2ff062c9e95845c43a6c9c0355411be" + +[[package]] +name = "fdeflate" +version = "0.3.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1e6853b52649d4ac5c0bd02320cddc5ba956bdb407c4b75a2c6b75bf51500f8c" +dependencies = [ + "simd-adler32", +] + +[[package]] +name = "ff" +version = "0.12.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d013fc25338cc558c5c2cfbad646908fb23591e2404481826742b651c9af7160" +dependencies = [ + "rand_core 0.6.4", + "subtle", +] + +[[package]] +name = "ff" +version = "0.13.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c0b50bfb653653f9ca9095b427bed08ab8d75a137839d9ad64eb11810d5b6393" +dependencies = [ + "rand_core 0.6.4", + "subtle", +] + +[[package]] +name = "fiat-crypto" +version = "0.2.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "28dea519a9695b9977216879a3ebfddf92f1c08c05d984f8996aecd6ecdc811d" + +[[package]] +name = "find-msvc-tools" +version = "0.1.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5baebc0774151f905a1a2cc41989300b1e6fbb29aff0ceffa1064fdd3088d582" + +[[package]] +name = "findshlibs" +version = "0.10.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "40b9e59cd0f7e0806cca4be089683ecb6434e602038df21fe6bf6711b2f07f64" +dependencies = [ + "cc", + "lazy_static", + "libc", + "winapi", +] + +[[package]] +name = "fixedbitset" +version = "0.4.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0ce7134b9999ecaf8bcd65542e436736ef32ddca1b3e06094cb6ec5755203b80" + +[[package]] +name = "fixedbitset" +version = "0.5.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1d674e81391d1e1ab681a28d99df07927c6d4aa5b027d7da16ba32d1d21ecd99" + +[[package]] +name = "flate2" +version = "1.1.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "843fba2746e448b37e26a819579957415c8cef339bf08564fe8b7ddbd959573c" +dependencies = [ + "crc32fast", + "miniz_oxide", +] + +[[package]] +name = "fnv" +version = "1.0.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3f9eec918d3f24069decb9af1554cad7c880e2da24a9afd88aca000531ab82c1" + +[[package]] +name = "foldhash" +version = "0.1.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d9c4f5dac5e15c24eb999c26181a6ca40b39fe946cbe4c263c7209467bc83af2" + +[[package]] +name = "foldhash" +version = "0.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "77ce24cb58228fbb8aa041425bb1050850ac19177686ea6e0f41a70416f56fdb" + +[[package]] +name = "foreign-types" +version = "0.3.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f6f339eb8adc052cd2ca78910fda869aefa38d22d5cb648e6485e4d3fc06f3b1" +dependencies = [ + "foreign-types-shared", +] + +[[package]] +name = "foreign-types-shared" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "00b0228411908ca8685dba7fc2cdd70ec9990a6e753e89b6ac91a84c40fbaf4b" + +[[package]] +name = "form_urlencoded" +version = "1.2.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cb4cb245038516f5f85277875cdaa4f7d2c9a0fa0468de06ed190163b1581fcf" +dependencies = [ + "percent-encoding", +] + +[[package]] +name = "fs2" +version = "0.4.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9564fc758e15025b46aa6643b1b77d047d1a56a1aea6e01002ac0c7026876213" +dependencies = [ + "libc", + "winapi", +] + +[[package]] +name = "fs_extra" +version = "1.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "42703706b716c37f96a77aea830392ad231f44c9e9a67872fa5548707e11b11c" + +[[package]] +name = "futures" +version = "0.3.32" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8b147ee9d1f6d097cef9ce628cd2ee62288d963e16fb287bd9286455b241382d" +dependencies = [ + "futures-channel", + "futures-core", + "futures-executor", + "futures-io", + "futures-sink", + "futures-task", + "futures-util", +] + +[[package]] +name = "futures-channel" +version = "0.3.32" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "07bbe89c50d7a535e539b8c17bc0b49bdb77747034daa8087407d655f3f7cc1d" +dependencies = [ + "futures-core", + "futures-sink", +] + +[[package]] +name = "futures-core" +version = "0.3.32" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7e3450815272ef58cec6d564423f6e755e25379b217b0bc688e295ba24df6b1d" + +[[package]] +name = "futures-executor" +version = "0.3.32" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "baf29c38818342a3b26b5b923639e7b1f4a61fc5e76102d4b1981c6dc7a7579d" +dependencies = [ + "futures-core", + "futures-task", + "futures-util", +] + +[[package]] +name = "futures-io" +version = "0.3.32" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cecba35d7ad927e23624b22ad55235f2239cfa44fd10428eecbeba6d6a717718" + +[[package]] +name = "futures-macro" +version = "0.3.32" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e835b70203e41293343137df5c0664546da5745f82ec9b84d40be8336958447b" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "futures-sink" +version = "0.3.32" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c39754e157331b013978ec91992bde1ac089843443c49cbc7f46150b0fad0893" + +[[package]] +name = "futures-task" +version = "0.3.32" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "037711b3d59c33004d3856fbdc83b99d4ff37a24768fa1be9ce3538a1cde4393" + +[[package]] +name = "futures-util" +version = "0.3.32" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "389ca41296e6190b48053de0321d02a77f32f8a5d2461dd38762c0593805c6d6" +dependencies = [ + "futures-channel", + "futures-core", + "futures-io", + "futures-macro", + "futures-sink", + "futures-task", + "memchr", + "pin-project-lite", + "slab", +] + +[[package]] +name = "generic-array" +version = "0.14.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4bb6743198531e02858aeaea5398fcc883e71851fcbcb5a2f773e2fb6cb1edf2" +dependencies = [ + "typenum", + "version_check", + "zeroize", +] + +[[package]] +name = "getrandom" +version = "0.2.17" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ff2abc00be7fca6ebc474524697ae276ad847ad0a6b3faa4bcb027e9a4614ad0" +dependencies = [ + "cfg-if", + "js-sys", + "libc", + "wasi", + "wasm-bindgen", +] + +[[package]] +name = "getrandom" +version = "0.3.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "899def5c37c4fd7b2664648c28120ecec138e4d395b459e5ca34f9cce2dd77fd" +dependencies = [ + "cfg-if", + "js-sys", + "libc", + "r-efi 5.3.0", + "wasip2", + "wasm-bindgen", +] + +[[package]] +name = "getrandom" +version = "0.4.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0de51e6874e94e7bf76d726fc5d13ba782deca734ff60d5bb2fb2607c7406555" +dependencies = [ + "cfg-if", + "libc", + "r-efi 6.0.0", + "wasip2", + "wasip3", +] + +[[package]] +name = "gif" +version = "0.14.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f5df2ba84018d80c213569363bdcd0c64e6933c67fe4c1d60ecf822971a3c35e" +dependencies = [ + "color_quant", + "weezl", +] + +[[package]] +name = "gimli" +version = "0.32.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e629b9b98ef3dd8afe6ca2bd0f89306cec16d43d907889945bc5d6687f2f13c7" + +[[package]] +name = "group" +version = "0.12.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5dfbfb3a6cfbd390d5c9564ab283a0349b9b9fcd46a706c1eb10e0db70bfbac7" +dependencies = [ + "ff 0.12.1", + "rand_core 0.6.4", + "subtle", +] + +[[package]] +name = "group" +version = "0.13.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f0f9ef7462f7c099f518d754361858f86d8a07af53ba9af0fe635bbccb151a63" +dependencies = [ + "ff 0.13.1", + "rand_core 0.6.4", + "subtle", +] + +[[package]] +name = "h2" +version = "0.4.13" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2f44da3a8150a6703ed5d34e164b875fd14c2cdab9af1252a9a1020bde2bdc54" +dependencies = [ + "atomic-waker", + "bytes", + "fnv", + "futures-core", + "futures-sink", + "http 1.4.0", + "indexmap 2.13.0", + "slab", + "tokio", + "tokio-util", + "tracing", +] + +[[package]] +name = "hashbrown" +version = "0.12.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8a9ee70c43aaf417c914396645a0fa852624801b24ebb7ae78fe8272889ac888" + +[[package]] +name = "hashbrown" +version = "0.14.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e5274423e17b7c9fc20b6e7e208532f9b19825d82dfd615708b70edd83df41f1" + +[[package]] +name = "hashbrown" +version = "0.15.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9229cfe53dfd69f0609a49f65461bd93001ea1ef889cd5529dd176593f5338a1" +dependencies = [ + "foldhash 0.1.5", +] + +[[package]] +name = "hashbrown" +version = "0.16.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "841d1cc9bed7f9236f321df977030373f4a4163ae1a7dbfe1a51a2c1a51d9100" +dependencies = [ + "allocator-api2", + "equivalent", + "foldhash 0.2.0", +] + +[[package]] +name = "heck" +version = "0.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2304e00983f87ffb38b55b444b5e3b60a884b5d30c0fca7d82fe33449bbe55ea" + +[[package]] +name = "hex" +version = "0.4.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7f24254aa9a54b5c858eaee2f5bccdb46aaf0e486a595ed5fd8f86ba55232a70" + +[[package]] +name = "hkdf" +version = "0.12.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7b5f8eb2ad728638ea2c7d47a21db23b7b58a72ed6a38256b8a1849f15fbbdf7" +dependencies = [ + "hmac", +] + +[[package]] +name = "hmac" +version = "0.12.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6c49c37c09c17a53d937dfbb742eb3a961d65a994e6bcdcf37e7399d0cc8ab5e" +dependencies = [ + "digest", +] + +[[package]] +name = "http" +version = "0.2.12" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "601cbb57e577e2f5ef5be8e7b83f0f63994f25aa94d673e54a92d5c516d101f1" +dependencies = [ + "bytes", + "fnv", + "itoa", +] + +[[package]] +name = "http" +version = "1.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e3ba2a386d7f85a81f119ad7498ebe444d2e22c2af0b86b069416ace48b3311a" +dependencies = [ + "bytes", + "itoa", +] + +[[package]] +name = "http-body" +version = "0.4.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7ceab25649e9960c0311ea418d17bee82c0dcec1bd053b5f9a66e265a693bed2" +dependencies = [ + "bytes", + "http 0.2.12", + "pin-project-lite", +] + +[[package]] +name = "http-body" +version = "1.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1efedce1fb8e6913f23e0c92de8e62cd5b772a67e7b3946df930a62566c93184" +dependencies = [ + "bytes", + "http 1.4.0", +] + +[[package]] +name = "http-body-util" +version = "0.1.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b021d93e26becf5dc7e1b75b1bed1fd93124b374ceb73f43d4d4eafec896a64a" +dependencies = [ + "bytes", + "futures-core", + "http 1.4.0", + "http-body 1.0.1", + "pin-project-lite", +] + +[[package]] +name = "httparse" +version = "1.10.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6dbf3de79e51f3d586ab4cb9d5c3e2c14aa28ed23d180cf89b4df0454a69cc87" + +[[package]] +name = "httpdate" +version = "1.0.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "df3b46402a9d5adb4c86a0cf463f42e19994e3ee891101b1841f30a545cb49a9" + +[[package]] +name = "hyper" +version = "1.8.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2ab2d4f250c3d7b1c9fcdff1cece94ea4e2dfbec68614f7b87cb205f24ca9d11" +dependencies = [ + "atomic-waker", + "bytes", + "futures-channel", + "futures-core", + "h2", + "http 1.4.0", + "http-body 1.0.1", + "httparse", + "httpdate", + "itoa", + "pin-project-lite", + "pin-utils", + "smallvec", + "tokio", + "want", +] + +[[package]] +name = "hyper-rustls" +version = "0.27.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e3c93eb611681b207e1fe55d5a71ecf91572ec8a6705cdb6857f7d8d5242cf58" +dependencies = [ + "http 1.4.0", + "hyper", + "hyper-util", + "rustls", + "rustls-native-certs", + "rustls-pki-types", + "tokio", + "tokio-rustls", + "tower-service", + "webpki-roots", +] + +[[package]] +name = "hyper-timeout" +version = "0.5.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2b90d566bffbce6a75bd8b09a05aa8c2cb1fabb6cb348f8840c9e4c90a0d83b0" +dependencies = [ + "hyper", + "hyper-util", + "pin-project-lite", + "tokio", + "tower-service", +] + +[[package]] +name = "hyper-tls" +version = "0.6.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "70206fc6890eaca9fde8a0bf71caa2ddfc9fe045ac9e5c70df101a7dbde866e0" +dependencies = [ + "bytes", + "http-body-util", + "hyper", + "hyper-util", + "native-tls", + "tokio", + "tokio-native-tls", + "tower-service", +] + +[[package]] +name = "hyper-util" +version = "0.1.20" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "96547c2556ec9d12fb1578c4eaf448b04993e7fb79cbaad930a656880a6bdfa0" +dependencies = [ + "base64", + "bytes", + "futures-channel", + "futures-util", + "http 1.4.0", + "http-body 1.0.1", + "hyper", + "ipnet", + "libc", + "percent-encoding", + "pin-project-lite", + "socket2 0.6.3", + "system-configuration", + "tokio", + "tower-service", + "tracing", + "windows-registry", +] + +[[package]] +name = "iana-time-zone" +version = "0.1.65" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e31bc9ad994ba00e440a8aa5c9ef0ec67d5cb5e5cb0cc7f8b744a35b389cc470" +dependencies = [ + "android_system_properties", + "core-foundation-sys", + "iana-time-zone-haiku", + "js-sys", + "log", + "wasm-bindgen", + "windows-core 0.62.2", +] + +[[package]] +name = "iana-time-zone-haiku" +version = "0.1.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f31827a206f56af32e590ba56d5d2d085f558508192593743f16b2306495269f" +dependencies = [ + "cc", +] + +[[package]] +name = "icu_collections" +version = "2.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4c6b649701667bbe825c3b7e6388cb521c23d88644678e83c0c4d0a621a34b43" +dependencies = [ + "displaydoc", + "potential_utf", + "yoke", + "zerofrom", + "zerovec", +] + +[[package]] +name = "icu_locale_core" +version = "2.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "edba7861004dd3714265b4db54a3c390e880ab658fec5f7db895fae2046b5bb6" +dependencies = [ + "displaydoc", + "litemap", + "tinystr", + "writeable", + "zerovec", +] + +[[package]] +name = "icu_normalizer" +version = "2.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5f6c8828b67bf8908d82127b2054ea1b4427ff0230ee9141c54251934ab1b599" +dependencies = [ + "icu_collections", + "icu_normalizer_data", + "icu_properties", + "icu_provider", + "smallvec", + "zerovec", +] + +[[package]] +name = "icu_normalizer_data" +version = "2.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7aedcccd01fc5fe81e6b489c15b247b8b0690feb23304303a9e560f37efc560a" + +[[package]] +name = "icu_properties" +version = "2.1.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "020bfc02fe870ec3a66d93e677ccca0562506e5872c650f893269e08615d74ec" +dependencies = [ + "icu_collections", + "icu_locale_core", + "icu_properties_data", + "icu_provider", + "zerotrie", + "zerovec", +] + +[[package]] +name = "icu_properties_data" +version = "2.1.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "616c294cf8d725c6afcd8f55abc17c56464ef6211f9ed59cccffe534129c77af" + +[[package]] +name = "icu_provider" +version = "2.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "85962cf0ce02e1e0a629cc34e7ca3e373ce20dda4c4d7294bbd0bf1fdb59e614" +dependencies = [ + "displaydoc", + "icu_locale_core", + "writeable", + "yoke", + "zerofrom", + "zerotrie", + "zerovec", +] + +[[package]] +name = "id-arena" +version = "2.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3d3067d79b975e8844ca9eb072e16b31c3c1c36928edf9c6789548c524d0d954" + +[[package]] +name = "idna" +version = "1.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3b0875f23caa03898994f6ddc501886a45c7d3d62d04d2d90788d47be1b1e4de" +dependencies = [ + "idna_adapter", + "smallvec", + "utf8_iter", +] + +[[package]] +name = "idna_adapter" +version = "1.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3acae9609540aa318d1bc588455225fb2085b9ed0c4f6bd0d9d5bcd86f1a0344" +dependencies = [ + "icu_normalizer", + "icu_properties", +] + +[[package]] +name = "image" +version = "0.25.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e6506c6c10786659413faa717ceebcb8f70731c0a60cbae39795fdf114519c1a" +dependencies = [ + "bytemuck", + "byteorder-lite", + "color_quant", + "gif", + "image-webp", + "moxcms", + "num-traits", + "png", + "zune-core", + "zune-jpeg", +] + +[[package]] +name = "image-webp" +version = "0.2.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "525e9ff3e1a4be2fbea1fdf0e98686a6d98b4d8f937e1bf7402245af1909e8c3" +dependencies = [ + "byteorder-lite", + "quick-error", +] + +[[package]] +name = "indexmap" +version = "1.9.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bd070e393353796e801d209ad339e89596eb4c8d430d18ede6a1cced8fafbd99" +dependencies = [ + "autocfg", + "hashbrown 0.12.3", +] + +[[package]] +name = "indexmap" +version = "2.13.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7714e70437a7dc3ac8eb7e6f8df75fd8eb422675fc7678aff7364301092b1017" +dependencies = [ + "equivalent", + "hashbrown 0.16.1", + "serde", + "serde_core", +] + +[[package]] +name = "instant" +version = "0.1.13" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e0242819d153cba4b4b05a5a8f2a7e9bbf97b6055b2a002b395c96b5ff3c0222" +dependencies = [ + "cfg-if", +] + +[[package]] +name = "integer-encoding" +version = "3.0.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8bb03732005da905c88227371639bf1ad885cc712789c011c31c5fb3ab3ccf02" + +[[package]] +name = "ipnet" +version = "2.12.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d98f6fed1fde3f8c21bc40a1abb88dd75e67924f9cffc3ef95607bad8017f8e2" + +[[package]] +name = "iri-string" +version = "0.7.10" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c91338f0783edbd6195decb37bae672fd3b165faffb89bf7b9e6942f8b1a731a" +dependencies = [ + "memchr", + "serde", +] + +[[package]] +name = "is_terminal_polyfill" +version = "1.70.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a6cb138bb79a146c1bd460005623e142ef0181e3d0219cb493e02f7d08a35695" + +[[package]] +name = "itertools" +version = "0.12.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ba291022dbbd398a455acf126c1e341954079855bc60dfdda641363bd6922569" +dependencies = [ + "either", +] + +[[package]] +name = "itertools" +version = "0.14.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2b192c782037fadd9cfa75548310488aabdbf3d2da73885b31bd0abd03351285" +dependencies = [ + "either", +] + +[[package]] +name = "itoa" +version = "1.0.17" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "92ecc6618181def0457392ccd0ee51198e065e016d1d527a7ac1b6dc7c1f09d2" + +[[package]] +name = "jobserver" +version = "0.1.34" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9afb3de4395d6b3e67a780b6de64b51c978ecf11cb9a462c66be7d4ca9039d33" +dependencies = [ + "getrandom 0.3.4", + "libc", +] + +[[package]] +name = "js-sys" +version = "0.3.91" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b49715b7073f385ba4bc528e5747d02e66cb39c6146efb66b781f131f0fb399c" +dependencies = [ + "once_cell", + "wasm-bindgen", +] + +[[package]] +name = "jsonwebtoken" +version = "10.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0529410abe238729a60b108898784df8984c87f6054c9c4fcacc47e4803c1ce1" +dependencies = [ + "base64", + "ed25519-dalek", + "getrandom 0.2.17", + "hmac", + "js-sys", + "p256 0.13.2", + "p384", + "pem", + "rand 0.8.5", + "rsa", + "serde", + "serde_json", + "sha2", + "signature 2.2.0", + "simple_asn1", +] + +[[package]] +name = "kamadak-exif" +version = "0.5.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ef4fc70d0ab7e5b6bafa30216a6b48705ea964cdfc29c050f2412295eba58077" +dependencies = [ + "mutate_once", +] + +[[package]] +name = "lazy_static" +version = "1.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bbd2bcb4c963f2ddae06a2efc7e9f3591312473c50c6685e1f298068316e66fe" +dependencies = [ + "spin 0.9.8", +] + +[[package]] +name = "leb128fmt" +version = "0.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "09edd9e8b54e49e587e4f6295a7d29c3ea94d469cb40ab8ca70b288248a81db2" + +[[package]] +name = "libc" +version = "0.2.182" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6800badb6cb2082ffd7b6a67e6125bb39f18782f793520caee8cb8846be06112" + +[[package]] +name = "libm" +version = "0.2.16" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b6d2cec3eae94f9f509c767b45932f1ada8350c4bdb85af2fcab4a3c14807981" + +[[package]] +name = "linux-raw-sys" +version = "0.4.15" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d26c52dbd32dccf2d10cac7725f8eae5296885fb5703b261f7d0a0739ec807ab" + +[[package]] +name = "linux-raw-sys" +version = "0.12.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "32a66949e030da00e8c7d4434b251670a91556f4144941d37452769c25d58a53" + +[[package]] +name = "litemap" +version = "0.8.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6373607a59f0be73a39b6fe456b8192fcc3585f602af20751600e974dd455e77" + +[[package]] +name = "lock_api" +version = "0.4.14" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "224399e74b87b5f3557511d98dff8b14089b3dadafcab6bb93eab67d3aace965" +dependencies = [ + "scopeguard", +] + +[[package]] +name = "log" +version = "0.4.29" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5e5032e24019045c762d3c0f28f5b6b8bbf38563a65908389bf7978758920897" + +[[package]] +name = "lru" +version = "0.16.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a1dc47f592c06f33f8e3aea9591776ec7c9f9e4124778ff8a3c3b87159f7e593" +dependencies = [ + "hashbrown 0.16.1", +] + +[[package]] +name = "lru-slab" +version = "0.1.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "112b39cec0b298b6c1999fee3e31427f74f676e4cb9879ed1a121b43661a4154" + +[[package]] +name = "matchers" +version = "0.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d1525a2a28c7f4fa0fc98bb91ae755d1e2d1505079e05539e35bc876b5d65ae9" +dependencies = [ + "regex-automata", +] + +[[package]] +name = "matchit" +version = "0.7.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0e7465ac9959cc2b1404e8e2367b43684a6d13790fe23056cc8c6c5a6b7bcb94" + +[[package]] +name = "md-5" +version = "0.10.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d89e7ee0cfbedfc4da3340218492196241d89eefb6dab27de5df917a6d2e78cf" +dependencies = [ + "cfg-if", + "digest", +] + +[[package]] +name = "memchr" +version = "2.8.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f8ca58f447f06ed17d5fc4043ce1b10dd205e060fb3ce5b979b8ed8e59ff3f79" + +[[package]] +name = "memmap2" +version = "0.9.10" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "714098028fe011992e1c3962653c96b2d578c4b4bce9036e15ff220319b1e0e3" +dependencies = [ + "libc", +] + +[[package]] +name = "mime" +version = "0.3.17" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6877bb514081ee2a7ff5ef9de3281f14a4dd4bceac4c09388074a6b5df8a139a" + +[[package]] +name = "mime_guess" +version = "2.0.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f7c44f8e672c00fe5308fa235f821cb4198414e1c77935c1ab6948d3fd78550e" +dependencies = [ + "mime", + "unicase", +] + +[[package]] +name = "minimal-lexical" +version = "0.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "68354c5c6bd36d73ff3feceb05efa59b6acb7626617f4962be322a825e61f79a" + +[[package]] +name = "miniz_oxide" +version = "0.8.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1fa76a2c86f704bdb222d66965fb3d63269ce38518b83cb0575fca855ebb6316" +dependencies = [ + "adler2", + "simd-adler32", +] + +[[package]] +name = "mio" +version = "1.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a69bcab0ad47271a0234d9422b131806bf3968021e5dc9328caf2d4cd58557fc" +dependencies = [ + "libc", + "wasi", + "windows-sys 0.61.2", +] + +[[package]] +name = "moxcms" +version = "0.7.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ac9557c559cd6fc9867e122e20d2cbefc9ca29d80d027a8e39310920ed2f0a97" +dependencies = [ + "num-traits", + "pxfm", +] + +[[package]] +name = "multer" +version = "3.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "83e87776546dc87511aa5ee218730c92b666d7264ab6ed41f9d215af9cd5224b" +dependencies = [ + "bytes", + "encoding_rs", + "futures-util", + "http 1.4.0", + "httparse", + "memchr", + "mime", + "spin 0.9.8", + "version_check", +] + +[[package]] +name = "multimap" +version = "0.10.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1d87ecb2933e8aeadb3e3a02b828fed80a7528047e68b4f424523a0981a3a084" + +[[package]] +name = "mutate_once" +version = "0.1.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "13d2233c9842d08cfe13f9eac96e207ca6a2ea10b80259ebe8ad0268be27d2af" + +[[package]] +name = "native-tls" +version = "0.2.18" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "465500e14ea162429d264d44189adc38b199b62b1c21eea9f69e4b73cb03bbf2" +dependencies = [ + "libc", + "log", + "openssl", + "openssl-probe", + "openssl-sys", + "schannel", + "security-framework", + "security-framework-sys", + "tempfile", +] + +[[package]] +name = "nix" +version = "0.26.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "598beaf3cc6fdd9a5dfb1630c2800c7acd31df7aaf0f565796fba2b53ca1af1b" +dependencies = [ + "bitflags 1.3.2", + "cfg-if", + "libc", +] + +[[package]] +name = "nom" +version = "7.1.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d273983c5a657a70a3e8f2a01329822f3b8c8172b73826411a55751e404a0a4a" +dependencies = [ + "memchr", + "minimal-lexical", +] + +[[package]] +name = "ntapi" +version = "0.4.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c3b335231dfd352ffb0f8017f3b6027a4917f7df785ea2143d8af2adc66980ae" +dependencies = [ + "winapi", +] + +[[package]] +name = "nu-ansi-term" +version = "0.50.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7957b9740744892f114936ab4a57b3f487491bbeafaf8083688b16841a4240e5" +dependencies = [ + "windows-sys 0.61.2", +] + +[[package]] +name = "num-bigint" +version = "0.4.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a5e44f723f1133c9deac646763579fdb3ac745e418f2a7af9cd0c431da1f20b9" +dependencies = [ + "num-integer", + "num-traits", +] + +[[package]] +name = "num-bigint-dig" +version = "0.8.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e661dda6640fad38e827a6d4a310ff4763082116fe217f279885c97f511bb0b7" +dependencies = [ + "lazy_static", + "libm", + "num-integer", + "num-iter", + "num-traits", + "rand 0.8.5", + "smallvec", + "zeroize", +] + +[[package]] +name = "num-conv" +version = "0.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cf97ec579c3c42f953ef76dbf8d55ac91fb219dde70e49aa4a6b7d74e9919050" + +[[package]] +name = "num-integer" +version = "0.1.46" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7969661fd2958a5cb096e56c8e1ad0444ac2bbcd0061bd28660485a44879858f" +dependencies = [ + "num-traits", +] + +[[package]] +name = "num-iter" +version = "0.1.45" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1429034a0490724d0075ebb2bc9e875d6503c3cf69e235a8941aa757d83ef5bf" +dependencies = [ + "autocfg", + "num-integer", + "num-traits", +] + +[[package]] +name = "num-traits" +version = "0.2.19" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "071dfc062690e90b734c0b2273ce72ad0ffa95f0c74596bc250dcfd960262841" +dependencies = [ + "autocfg", + "libm", +] + +[[package]] +name = "object" +version = "0.37.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ff76201f031d8863c38aa7f905eca4f53abbfa15f609db4277d44cd8938f33fe" +dependencies = [ + "memchr", +] + +[[package]] +name = "oid-registry" +version = "0.7.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a8d8034d9489cdaf79228eb9f6a3b8d7bb32ba00d6645ebd48eef4077ceb5bd9" +dependencies = [ + "asn1-rs", +] + +[[package]] +name = "once_cell" +version = "1.21.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "42f5e15c9953c5e4ccceeb2e7382a716482c34515315f7b03532b8b4e8393d2d" + +[[package]] +name = "once_cell_polyfill" +version = "1.70.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "384b8ab6d37215f3c5301a95a4accb5d64aa607f1fcb26a11b5303878451b4fe" + +[[package]] +name = "openssl" +version = "0.10.75" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "08838db121398ad17ab8531ce9de97b244589089e290a384c900cb9ff7434328" +dependencies = [ + "bitflags 2.11.0", + "cfg-if", + "foreign-types", + "libc", + "once_cell", + "openssl-macros", + "openssl-sys", +] + +[[package]] +name = "openssl-macros" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a948666b637a0f465e8564c73e89d4dde00d72d4d473cc972f390fc3dcee7d9c" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "openssl-probe" +version = "0.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7c87def4c32ab89d880effc9e097653c8da5d6ef28e6b539d313baaacfbafcbe" + +[[package]] +name = "openssl-sys" +version = "0.9.111" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "82cab2d520aa75e3c58898289429321eb788c3106963d0dc886ec7a5f4adc321" +dependencies = [ + "cc", + "libc", + "pkg-config", + "vcpkg", +] + +[[package]] +name = "outref" +version = "0.5.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1a80800c0488c3a21695ea981a54918fbb37abf04f4d0720c453632255e2ff0e" + +[[package]] +name = "p256" +version = "0.11.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "51f44edd08f51e2ade572f141051021c5af22677e42b7dd28a88155151c33594" +dependencies = [ + "ecdsa 0.14.8", + "elliptic-curve 0.12.3", + "sha2", +] + +[[package]] +name = "p256" +version = "0.13.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c9863ad85fa8f4460f9c48cb909d38a0d689dba1f6f6988a5e3e0d31071bcd4b" +dependencies = [ + "ecdsa 0.16.9", + "elliptic-curve 0.13.8", + "primeorder", + "sha2", +] + +[[package]] +name = "p384" +version = "0.13.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fe42f1670a52a47d448f14b6a5c61dd78fce51856e68edaa38f7ae3a46b8d6b6" +dependencies = [ + "ecdsa 0.16.9", + "elliptic-curve 0.13.8", + "primeorder", + "sha2", +] + +[[package]] +name = "parking_lot" +version = "0.11.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7d17b78036a60663b797adeaee46f5c9dfebb86948d1255007a1d6be0271ff99" +dependencies = [ + "instant", + "lock_api", + "parking_lot_core 0.8.6", +] + +[[package]] +name = "parking_lot" +version = "0.12.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "93857453250e3077bd71ff98b6a65ea6621a19bb0f559a85248955ac12c45a1a" +dependencies = [ + "lock_api", + "parking_lot_core 0.9.12", +] + +[[package]] +name = "parking_lot_core" +version = "0.8.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "60a2cfe6f0ad2bfc16aefa463b497d5c7a5ecd44a23efa72aa342d90177356dc" +dependencies = [ + "cfg-if", + "instant", + "libc", + "redox_syscall 0.2.16", + "smallvec", + "winapi", +] + +[[package]] +name = "parking_lot_core" +version = "0.9.12" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2621685985a2ebf1c516881c026032ac7deafcda1a2c9b7850dc81e3dfcb64c1" +dependencies = [ + "cfg-if", + "libc", + "redox_syscall 0.5.18", + "smallvec", + "windows-link", +] + +[[package]] +name = "pem" +version = "3.0.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1d30c53c26bc5b31a98cd02d20f25a7c8567146caf63ed593a9d87b2775291be" +dependencies = [ + "base64", + "serde_core", +] + +[[package]] +name = "pem-rfc7468" +version = "0.7.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "88b39c9bfcfc231068454382784bb460aae594343fb030d46e9f50a645418412" +dependencies = [ + "base64ct", +] + +[[package]] +name = "percent-encoding" +version = "2.3.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9b4f627cb1b25917193a259e49bdad08f671f8d9708acfd5fe0a8c1455d87220" + +[[package]] +name = "petgraph" +version = "0.6.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b4c5cc86750666a3ed20bdaf5ca2a0344f9c67674cae0515bec2da16fbaa47db" +dependencies = [ + "fixedbitset 0.4.2", + "indexmap 2.13.0", +] + +[[package]] +name = "petgraph" +version = "0.7.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3672b37090dbd86368a4145bc067582552b29c27377cad4e0a306c97f9bd7772" +dependencies = [ + "fixedbitset 0.5.7", + "indexmap 2.13.0", +] + +[[package]] +name = "pin-project" +version = "1.1.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f1749c7ed4bcaf4c3d0a3efc28538844fb29bcdd7d2b67b2be7e20ba861ff517" +dependencies = [ + "pin-project-internal", +] + +[[package]] +name = "pin-project-internal" +version = "1.1.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d9b20ed30f105399776b9c883e68e536ef602a16ae6f596d2c473591d6ad64c6" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "pin-project-lite" +version = "0.2.17" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a89322df9ebe1c1578d689c92318e070967d1042b512afbe49518723f4e6d5cd" + +[[package]] +name = "pin-utils" +version = "0.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8b870d8c151b6f2fb93e84a13146138f05d02ed11c7e7c54f8826aaaf7c9f184" + +[[package]] +name = "pkcs1" +version = "0.7.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c8ffb9f10fa047879315e6625af03c164b16962a5368d724ed16323b68ace47f" +dependencies = [ + "der 0.7.10", + "pkcs8 0.10.2", + "spki 0.7.3", +] + +[[package]] +name = "pkcs8" +version = "0.9.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9eca2c590a5f85da82668fa685c09ce2888b9430e83299debf1f34b65fd4a4ba" +dependencies = [ + "der 0.6.1", + "spki 0.6.0", +] + +[[package]] +name = "pkcs8" +version = "0.10.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f950b2377845cebe5cf8b5165cb3cc1a5e0fa5cfa3e1f7f55707d8fd82e0a7b7" +dependencies = [ + "der 0.7.10", + "spki 0.7.3", +] + +[[package]] +name = "pkg-config" +version = "0.3.32" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7edddbd0b52d732b21ad9a5fab5c704c14cd949e5e9a1ec5929a24fded1b904c" + +[[package]] +name = "png" +version = "0.18.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "60769b8b31b2a9f263dae2776c37b1b28ae246943cf719eb6946a1db05128a61" +dependencies = [ + "bitflags 2.11.0", + "crc32fast", + "fdeflate", + "flate2", + "miniz_oxide", +] + +[[package]] +name = "potential_utf" +version = "0.1.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b73949432f5e2a09657003c25bca5e19a0e9c84f8058ca374f49e0ebe605af77" +dependencies = [ + "zerovec", +] + +[[package]] +name = "powerfmt" +version = "0.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "439ee305def115ba05938db6eb1644ff94165c5ab5e9420d1c1bcedbba909391" + +[[package]] +name = "pprof" +version = "0.15.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "38a01da47675efa7673b032bf8efd8214f1917d89685e07e395ab125ea42b187" +dependencies = [ + "aligned-vec", + "backtrace", + "cfg-if", + "findshlibs", + "libc", + "log", + "nix", + "once_cell", + "prost 0.12.6", + "prost-build 0.12.6", + "prost-derive 0.12.6", + "sha2", + "smallvec", + "spin 0.10.0", + "symbolic-demangle", + "tempfile", + "thiserror 2.0.18", +] + +[[package]] +name = "ppv-lite86" +version = "0.2.21" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "85eae3c4ed2f50dcfe72643da4befc30deadb458a9b590d720cde2f2b1e97da9" +dependencies = [ + "zerocopy", +] + +[[package]] +name = "prettyplease" +version = "0.2.37" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "479ca8adacdd7ce8f1fb39ce9ecccbfe93a3f1344b3d0d97f20bc0196208f62b" +dependencies = [ + "proc-macro2", + "syn", +] + +[[package]] +name = "primeorder" +version = "0.13.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "353e1ca18966c16d9deb1c69278edbc5f194139612772bd9537af60ac231e1e6" +dependencies = [ + "elliptic-curve 0.13.8", +] + +[[package]] +name = "proc-macro2" +version = "1.0.106" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8fd00f0bb2e90d81d1044c2b32617f68fcb9fa3bb7640c23e9c748e53fb30934" +dependencies = [ + "unicode-ident", +] + +[[package]] +name = "procfs" +version = "0.16.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "731e0d9356b0c25f16f33b5be79b1c57b562f141ebfcdb0ad8ac2c13a24293b4" +dependencies = [ + "bitflags 2.11.0", + "hex", + "lazy_static", + "procfs-core", + "rustix 0.38.44", +] + +[[package]] +name = "procfs-core" +version = "0.16.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2d3554923a69f4ce04c4a754260c338f505ce22642d3830e049a399fc2059a29" +dependencies = [ + "bitflags 2.11.0", + "hex", +] + +[[package]] +name = "prometheus" +version = "0.13.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3d33c28a30771f7f96db69893f78b857f7450d7e0237e9c8fc6427a81bae7ed1" +dependencies = [ + "cfg-if", + "fnv", + "lazy_static", + "libc", + "memchr", + "parking_lot 0.12.5", + "procfs", + "thiserror 1.0.69", +] + +[[package]] +name = "prost" +version = "0.12.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "deb1435c188b76130da55f17a466d252ff7b1418b2ad3e037d127b94e3411f29" +dependencies = [ + "bytes", + "prost-derive 0.12.6", +] + +[[package]] +name = "prost" +version = "0.13.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2796faa41db3ec313a31f7624d9286acf277b52de526150b7e69f3debf891ee5" +dependencies = [ + "bytes", + "prost-derive 0.13.5", +] + +[[package]] +name = "prost-build" +version = "0.12.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "22505a5c94da8e3b7c2996394d1c933236c4d743e81a410bcca4e6989fc066a4" +dependencies = [ + "bytes", + "heck", + "itertools 0.12.1", + "log", + "multimap", + "once_cell", + "petgraph 0.6.5", + "prettyplease", + "prost 0.12.6", + "prost-types 0.12.6", + "regex", + "syn", + "tempfile", +] + +[[package]] +name = "prost-build" +version = "0.13.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "be769465445e8c1474e9c5dac2018218498557af32d9ed057325ec9a41ae81bf" +dependencies = [ + "heck", + "itertools 0.14.0", + "log", + "multimap", + "once_cell", + "petgraph 0.7.1", + "prettyplease", + "prost 0.13.5", + "prost-types 0.13.5", + "regex", + "syn", + "tempfile", +] + +[[package]] +name = "prost-derive" +version = "0.12.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "81bddcdb20abf9501610992b6759a4c888aef7d1a7247ef75e2404275ac24af1" +dependencies = [ + "anyhow", + "itertools 0.12.1", + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "prost-derive" +version = "0.13.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8a56d757972c98b346a9b766e3f02746cde6dd1cd1d1d563472929fdd74bec4d" +dependencies = [ + "anyhow", + "itertools 0.14.0", + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "prost-types" +version = "0.12.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9091c90b0a32608e984ff2fa4091273cbdd755d54935c51d520887f4a1dbd5b0" +dependencies = [ + "prost 0.12.6", +] + +[[package]] +name = "prost-types" +version = "0.13.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "52c2c1bf36ddb1a1c396b3601a3cec27c2462e45f07c386894ec3ccf5332bd16" +dependencies = [ + "prost 0.13.5", +] + +[[package]] +name = "pxfm" +version = "0.1.28" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b5a041e753da8b807c9255f28de81879c78c876392ff2469cde94799b2896b9d" + +[[package]] +name = "quick-error" +version = "2.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a993555f31e5a609f617c12db6250dedcac1b0a85076912c436e6fc9b2c8e6a3" + +[[package]] +name = "quinn" +version = "0.11.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b9e20a958963c291dc322d98411f541009df2ced7b5a4f2bd52337638cfccf20" +dependencies = [ + "bytes", + "cfg_aliases", + "pin-project-lite", + "quinn-proto", + "quinn-udp", + "rustc-hash", + "rustls", + "socket2 0.6.3", + "thiserror 2.0.18", + "tokio", + "tracing", + "web-time", +] + +[[package]] +name = "quinn-proto" +version = "0.11.14" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "434b42fec591c96ef50e21e886936e66d3cc3f737104fdb9b737c40ffb94c098" +dependencies = [ + "bytes", + "getrandom 0.3.4", + "lru-slab", + "rand 0.9.2", + "ring", + "rustc-hash", + "rustls", + "rustls-pki-types", + "slab", + "thiserror 2.0.18", + "tinyvec", + "tracing", + "web-time", +] + +[[package]] +name = "quinn-udp" +version = "0.5.14" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "addec6a0dcad8a8d96a771f815f0eaf55f9d1805756410b39f5fa81332574cbd" +dependencies = [ + "cfg_aliases", + "libc", + "once_cell", + "socket2 0.6.3", + "tracing", + "windows-sys 0.60.2", +] + +[[package]] +name = "quote" +version = "1.0.45" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "41f2619966050689382d2b44f664f4bc593e129785a36d6ee376ddf37259b924" +dependencies = [ + "proc-macro2", +] + +[[package]] +name = "r-efi" +version = "5.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "69cdb34c158ceb288df11e18b4bd39de994f6657d83847bdffdbd7f346754b0f" + +[[package]] +name = "r-efi" +version = "6.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f8dcc9c7d52a811697d2151c701e0d08956f92b0e24136cf4cf27b57a6a0d9bf" + +[[package]] +name = "rand" +version = "0.8.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "34af8d1a0e25924bc5b7c43c079c942339d8f0a8b57c39049bef581b46327404" +dependencies = [ + "libc", + "rand_chacha 0.3.1", + "rand_core 0.6.4", +] + +[[package]] +name = "rand" +version = "0.9.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6db2770f06117d490610c7488547d543617b21bfa07796d7a12f6f1bd53850d1" +dependencies = [ + "rand_chacha 0.9.0", + "rand_core 0.9.5", +] + +[[package]] +name = "rand_chacha" +version = "0.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e6c10a63a0fa32252be49d21e7709d4d4baf8d231c2dbce1eaa8141b9b127d88" +dependencies = [ + "ppv-lite86", + "rand_core 0.6.4", +] + +[[package]] +name = "rand_chacha" +version = "0.9.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d3022b5f1df60f26e1ffddd6c66e8aa15de382ae63b3a0c1bfc0e4d3e3f325cb" +dependencies = [ + "ppv-lite86", + "rand_core 0.9.5", +] + +[[package]] +name = "rand_core" +version = "0.6.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ec0be4795e2f6a28069bec0b5ff3e2ac9bafc99e6a9a7dc3547996c5c816922c" +dependencies = [ + "getrandom 0.2.17", +] + +[[package]] +name = "rand_core" +version = "0.9.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "76afc826de14238e6e8c374ddcc1fa19e374fd8dd986b0d2af0d02377261d83c" +dependencies = [ + "getrandom 0.3.4", +] + +[[package]] +name = "rayon" +version = "1.11.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "368f01d005bf8fd9b1206fb6fa653e6c4a81ceb1466406b81792d87c5677a58f" +dependencies = [ + "either", + "rayon-core", +] + +[[package]] +name = "rayon-core" +version = "1.13.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "22e18b0f0062d30d4230b2e85ff77fdfe4326feb054b9783a3460d8435c8ab91" +dependencies = [ + "crossbeam-deque", + "crossbeam-utils", +] + +[[package]] +name = "redb" +version = "3.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ae323eb086579a3769daa2c753bb96deb95993c534711e0dbe881b5192906a06" +dependencies = [ + "libc", +] + +[[package]] +name = "redox_syscall" +version = "0.2.16" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fb5a58c1855b4b6819d59012155603f0b22ad30cad752600aadfcb695265519a" +dependencies = [ + "bitflags 1.3.2", +] + +[[package]] +name = "redox_syscall" +version = "0.5.18" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ed2bf2547551a7053d6fdfafda3f938979645c44812fbfcda098faae3f1a362d" +dependencies = [ + "bitflags 2.11.0", +] + +[[package]] +name = "reed-solomon-erasure" +version = "6.0.0" +dependencies = [ + "libm", + "lru", + "parking_lot 0.11.2", + "smallvec", + "spin 0.9.8", +] + +[[package]] +name = "regex" +version = "1.12.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e10754a14b9137dd7b1e3e5b0493cc9171fdd105e0ab477f51b72e7f3ac0e276" +dependencies = [ + "aho-corasick", + "memchr", + "regex-automata", + "regex-syntax", +] + +[[package]] +name = "regex-automata" +version = "0.4.14" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6e1dd4122fc1595e8162618945476892eefca7b88c52820e74af6262213cae8f" +dependencies = [ + "aho-corasick", + "memchr", + "regex-syntax", +] + +[[package]] +name = "regex-lite" +version = "0.1.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cab834c73d247e67f4fae452806d17d3c7501756d98c8808d7c9c7aa7d18f973" + +[[package]] +name = "regex-syntax" +version = "0.8.10" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "dc897dd8d9e8bd1ed8cdad82b5966c3e0ecae09fb1907d58efaa013543185d0a" + +[[package]] +name = "reqwest" +version = "0.12.28" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "eddd3ca559203180a307f12d114c268abf583f59b03cb906fd0b3ff8646c1147" +dependencies = [ + "base64", + "bytes", + "encoding_rs", + "futures-core", + "futures-util", + "h2", + "http 1.4.0", + "http-body 1.0.1", + "http-body-util", + "hyper", + "hyper-rustls", + "hyper-tls", + "hyper-util", + "js-sys", + "log", + "mime", + "mime_guess", + "native-tls", + "percent-encoding", + "pin-project-lite", + "quinn", + "rustls", + "rustls-pki-types", + "serde", + "serde_json", + "serde_urlencoded", + "sync_wrapper", + "tokio", + "tokio-native-tls", + "tokio-rustls", + "tokio-util", + "tower 0.5.3", + "tower-http 0.6.8", + "tower-service", + "url", + "wasm-bindgen", + "wasm-bindgen-futures", + "wasm-streams", + "web-sys", + "webpki-roots", +] + +[[package]] +name = "rfc6979" +version = "0.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7743f17af12fa0b03b803ba12cd6a8d9483a587e89c69445e3909655c0b9fabb" +dependencies = [ + "crypto-bigint 0.4.9", + "hmac", + "zeroize", +] + +[[package]] +name = "rfc6979" +version = "0.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f8dd2a808d456c4a54e300a23e9f5a67e122c3024119acbfd73e3bf664491cb2" +dependencies = [ + "hmac", + "subtle", +] + +[[package]] +name = "ring" +version = "0.17.14" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a4689e6c2294d81e88dc6261c768b63bc4fcdb852be6d1352498b114f61383b7" +dependencies = [ + "cc", + "cfg-if", + "getrandom 0.2.17", + "libc", + "untrusted", + "windows-sys 0.52.0", +] + +[[package]] +name = "rsa" +version = "0.9.10" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b8573f03f5883dcaebdfcf4725caa1ecb9c15b2ef50c43a07b816e06799bb12d" +dependencies = [ + "const-oid", + "digest", + "num-bigint-dig", + "num-integer", + "num-traits", + "pkcs1", + "pkcs8 0.10.2", + "rand_core 0.6.4", + "signature 2.2.0", + "spki 0.7.3", + "subtle", + "zeroize", +] + +[[package]] +name = "rustc-demangle" +version = "0.1.27" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b50b8869d9fc858ce7266cce0194bd74df58b9d0e3f6df3a9fc8eb470d95c09d" + +[[package]] +name = "rustc-hash" +version = "2.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "357703d41365b4b27c590e3ed91eabb1b663f07c4c084095e60cbed4362dff0d" + +[[package]] +name = "rustc_version" +version = "0.4.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cfcb3a22ef46e85b45de6ee7e79d063319ebb6594faafcf1c225ea92ab6e9b92" +dependencies = [ + "semver", +] + +[[package]] +name = "rusticata-macros" +version = "4.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "faf0c4a6ece9950b9abdb62b1cfcf2a68b3b67a10ba445b3bb85be2a293d0632" +dependencies = [ + "nom", +] + +[[package]] +name = "rustix" +version = "0.38.44" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fdb5bc1ae2baa591800df16c9ca78619bf65c0488b41b96ccec5d11220d8c154" +dependencies = [ + "bitflags 2.11.0", + "errno 0.3.14", + "libc", + "linux-raw-sys 0.4.15", + "windows-sys 0.59.0", +] + +[[package]] +name = "rustix" +version = "1.1.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b6fe4565b9518b83ef4f91bb47ce29620ca828bd32cb7e408f0062e9930ba190" +dependencies = [ + "bitflags 2.11.0", + "errno 0.3.14", + "libc", + "linux-raw-sys 0.12.1", + "windows-sys 0.61.2", +] + +[[package]] +name = "rustls" +version = "0.23.37" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "758025cb5fccfd3bc2fd74708fd4682be41d99e5dff73c377c0646c6012c73a4" +dependencies = [ + "aws-lc-rs", + "log", + "once_cell", + "ring", + "rustls-pki-types", + "rustls-webpki", + "subtle", + "zeroize", +] + +[[package]] +name = "rustls-native-certs" +version = "0.8.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "612460d5f7bea540c490b2b6395d8e34a953e52b491accd6c86c8164c5932a63" +dependencies = [ + "openssl-probe", + "rustls-pki-types", + "schannel", + "security-framework", +] + +[[package]] +name = "rustls-pemfile" +version = "2.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "dce314e5fee3f39953d46bb63bb8a46d40c2f8fb7cc5a3b6cab2bde9721d6e50" +dependencies = [ + "rustls-pki-types", +] + +[[package]] +name = "rustls-pki-types" +version = "1.14.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "be040f8b0a225e40375822a563fa9524378b9d63112f53e19ffff34df5d33fdd" +dependencies = [ + "web-time", + "zeroize", +] + +[[package]] +name = "rustls-webpki" +version = "0.103.10" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "df33b2b81ac578cabaf06b89b0631153a3f416b0a886e8a7a1707fb51abbd1ef" +dependencies = [ + "aws-lc-rs", + "ring", + "rustls-pki-types", + "untrusted", +] + +[[package]] +name = "rustversion" +version = "1.0.22" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b39cdef0fa800fc44525c84ccb54a029961a8215f9619753635a9c0d2538d46d" + +[[package]] +name = "rusty-leveldb" +version = "3.0.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c48d2f060dd1286adc9c3d179cb5af1292a9d2fcf291abcfe056023fc1977b44" +dependencies = [ + "crc", + "errno 0.2.8", + "fs2", + "integer-encoding", + "rand 0.8.5", + "snap", +] + +[[package]] +name = "ryu" +version = "1.0.23" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9774ba4a74de5f7b1c1451ed6cd5285a32eddb5cccb8cc655a4e50009e06477f" + +[[package]] +name = "schannel" +version = "0.1.28" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "891d81b926048e76efe18581bf793546b4c0eaf8448d72be8de2bbee5fd166e1" +dependencies = [ + "windows-sys 0.61.2", +] + +[[package]] +name = "scopeguard" +version = "1.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "94143f37725109f92c262ed2cf5e59bce7498c01bcc1502d7b9afe439a4e9f49" + +[[package]] +name = "sec1" +version = "0.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3be24c1842290c45df0a7bf069e0c268a747ad05a192f2fd7dcfdbc1cba40928" +dependencies = [ + "base16ct 0.1.1", + "der 0.6.1", + "generic-array", + "pkcs8 0.9.0", + "subtle", + "zeroize", +] + +[[package]] +name = "sec1" +version = "0.7.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d3e97a565f76233a6003f9f5c54be1d9c5bdfa3eccfb189469f11ec4901c47dc" +dependencies = [ + "base16ct 0.2.0", + "der 0.7.10", + "generic-array", + "pkcs8 0.10.2", + "subtle", + "zeroize", +] + +[[package]] +name = "security-framework" +version = "3.7.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b7f4bc775c73d9a02cde8bf7b2ec4c9d12743edf609006c7facc23998404cd1d" +dependencies = [ + "bitflags 2.11.0", + "core-foundation 0.10.1", + "core-foundation-sys", + "libc", + "security-framework-sys", +] + +[[package]] +name = "security-framework-sys" +version = "2.17.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6ce2691df843ecc5d231c0b14ece2acc3efb62c0a398c7e1d875f3983ce020e3" +dependencies = [ + "core-foundation-sys", + "libc", +] + +[[package]] +name = "semver" +version = "1.0.27" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d767eb0aabc880b29956c35734170f26ed551a859dbd361d140cdbeca61ab1e2" + +[[package]] +name = "serde" +version = "1.0.228" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9a8e94ea7f378bd32cbbd37198a4a91436180c5bb472411e48b5ec2e2124ae9e" +dependencies = [ + "serde_core", + "serde_derive", +] + +[[package]] +name = "serde_core" +version = "1.0.228" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "41d385c7d4ca58e59fc732af25c3983b67ac852c1a25000afe1175de458b67ad" +dependencies = [ + "serde_derive", +] + +[[package]] +name = "serde_derive" +version = "1.0.228" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d540f220d3187173da220f885ab66608367b6574e925011a9353e4badda91d79" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "serde_json" +version = "1.0.149" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "83fc039473c5595ace860d8c4fafa220ff474b3fc6bfdb4293327f1a37e94d86" +dependencies = [ + "itoa", + "memchr", + "serde", + "serde_core", + "zmij", +] + +[[package]] +name = "serde_path_to_error" +version = "0.1.20" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "10a9ff822e371bb5403e391ecd83e182e0e77ba7f6fe0160b795797109d1b457" +dependencies = [ + "itoa", + "serde", + "serde_core", +] + +[[package]] +name = "serde_spanned" +version = "0.6.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bf41e0cfaf7226dca15e8197172c295a782857fcb97fad1808a166870dee75a3" +dependencies = [ + "serde", +] + +[[package]] +name = "serde_urlencoded" +version = "0.7.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d3491c14715ca2294c4d6a88f15e84739788c1d030eed8c110436aafdaa2f3fd" +dependencies = [ + "form_urlencoded", + "itoa", + "ryu", + "serde", +] + +[[package]] +name = "sha1" +version = "0.10.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e3bf829a2d51ab4a5ddf1352d8470c140cadc8301b2ae1789db023f01cedd6ba" +dependencies = [ + "cfg-if", + "cpufeatures", + "digest", +] + +[[package]] +name = "sha2" +version = "0.10.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a7507d819769d01a365ab707794a4084392c824f54a7a6a7862f8c3d0892b283" +dependencies = [ + "cfg-if", + "cpufeatures", + "digest", +] + +[[package]] +name = "sharded-slab" +version = "0.1.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f40ca3c46823713e0d4209592e8d6e826aa57e928f09752619fc696c499637f6" +dependencies = [ + "lazy_static", +] + +[[package]] +name = "shlex" +version = "1.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0fda2ff0d084019ba4d7c6f371c95d8fd75ce3524c3cb8fb653a3023f6323e64" + +[[package]] +name = "signal-hook-registry" +version = "1.4.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c4db69cba1110affc0e9f7bcd48bbf87b3f4fc7c61fc9155afd4c469eb3d6c1b" +dependencies = [ + "errno 0.3.14", + "libc", +] + +[[package]] +name = "signature" +version = "1.6.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "74233d3b3b2f6d4b006dc19dee745e73e2a6bfb6f93607cd3b02bd5b00797d7c" +dependencies = [ + "digest", + "rand_core 0.6.4", +] + +[[package]] +name = "signature" +version = "2.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "77549399552de45a898a580c1b41d445bf730df867cc44e6c0233bbc4b8329de" +dependencies = [ + "digest", + "rand_core 0.6.4", +] + +[[package]] +name = "simd-adler32" +version = "0.3.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e320a6c5ad31d271ad523dcf3ad13e2767ad8b1cb8f047f75a8aeaf8da139da2" + +[[package]] +name = "simple_asn1" +version = "0.6.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0d585997b0ac10be3c5ee635f1bab02d512760d14b7c468801ac8a01d9ae5f1d" +dependencies = [ + "num-bigint", + "num-traits", + "thiserror 2.0.18", + "time", +] + +[[package]] +name = "slab" +version = "0.4.12" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0c790de23124f9ab44544d7ac05d60440adc586479ce501c1d6d7da3cd8c9cf5" + +[[package]] +name = "smallvec" +version = "1.15.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "67b1b7a3b5fe4f1376887184045fcf45c69e92af734b7aaddc05fb777b6fbd03" + +[[package]] +name = "snap" +version = "1.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1b6b67fb9a61334225b5b790716f609cd58395f895b3fe8b328786812a40bc3b" + +[[package]] +name = "socket2" +version = "0.5.10" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e22376abed350d73dd1cd119b57ffccad95b4e585a7cda43e286245ce23c0678" +dependencies = [ + "libc", + "windows-sys 0.52.0", +] + +[[package]] +name = "socket2" +version = "0.6.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3a766e1110788c36f4fa1c2b71b387a7815aa65f88ce0229841826633d93723e" +dependencies = [ + "libc", + "windows-sys 0.61.2", +] + +[[package]] +name = "spin" +version = "0.9.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6980e8d7511241f8acf4aebddbb1ff938df5eebe98691418c4468d0b72a96a67" + +[[package]] +name = "spin" +version = "0.10.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d5fe4ccb98d9c292d56fec89a5e07da7fc4cf0dc11e156b41793132775d3e591" +dependencies = [ + "lock_api", +] + +[[package]] +name = "spki" +version = "0.6.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "67cf02bbac7a337dc36e4f5a693db6c21e7863f45070f7064577eb4367a3212b" +dependencies = [ + "base64ct", + "der 0.6.1", +] + +[[package]] +name = "spki" +version = "0.7.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d91ed6c858b01f942cd56b37a94b3e0a1798290327d1236e4d9cf4eaca44d29d" +dependencies = [ + "base64ct", + "der 0.7.10", +] + +[[package]] +name = "stable_deref_trait" +version = "1.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6ce2be8dc25455e1f91df71bfa12ad37d7af1092ae736f3a6cd0e37bc7810596" + +[[package]] +name = "strsim" +version = "0.11.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7da8b5736845d9f2fcb837ea5d9e2628564b3b043a70948a3f0b778838c5fb4f" + +[[package]] +name = "subtle" +version = "2.6.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "13c2bddecc57b384dee18652358fb23172facb8a2c51ccc10d74c157bdea3292" + +[[package]] +name = "symbolic-common" +version = "12.17.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "751a2823d606b5d0a7616499e4130a516ebd01a44f39811be2b9600936509c23" +dependencies = [ + "debugid", + "memmap2", + "stable_deref_trait", + "uuid", +] + +[[package]] +name = "symbolic-demangle" +version = "12.17.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "79b237cfbe320601dd24b4ac817a5b68bb28f5508e33f08d42be0682cadc8ac9" +dependencies = [ + "cpp_demangle", + "rustc-demangle", + "symbolic-common", +] + +[[package]] +name = "syn" +version = "2.0.117" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e665b8803e7b1d2a727f4023456bbbbe74da67099c585258af0ad9c5013b9b99" +dependencies = [ + "proc-macro2", + "quote", + "unicode-ident", +] + +[[package]] +name = "sync_wrapper" +version = "1.0.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0bf256ce5efdfa370213c1dabab5935a12e49f2c58d15e9eac2870d3b4f27263" +dependencies = [ + "futures-core", +] + +[[package]] +name = "synstructure" +version = "0.13.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "728a70f3dbaf5bab7f0c4b1ac8d7ae5ea60a4b5549c8a5914361c99147a709d2" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "sysinfo" +version = "0.31.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "355dbe4f8799b304b05e1b0f05fc59b2a18d36645cf169607da45bde2f69a1be" +dependencies = [ + "core-foundation-sys", + "libc", + "memchr", + "ntapi", + "rayon", + "windows", +] + +[[package]] +name = "system-configuration" +version = "0.7.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a13f3d0daba03132c0aa9767f98351b3488edc2c100cda2d2ec2b04f3d8d3c8b" +dependencies = [ + "bitflags 2.11.0", + "core-foundation 0.9.4", + "system-configuration-sys", +] + +[[package]] +name = "system-configuration-sys" +version = "0.6.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8e1d1b10ced5ca923a1fcb8d03e96b8d3268065d724548c0211415ff6ac6bac4" +dependencies = [ + "core-foundation-sys", + "libc", +] + +[[package]] +name = "tempfile" +version = "3.26.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "82a72c767771b47409d2345987fda8628641887d5466101319899796367354a0" +dependencies = [ + "fastrand", + "getrandom 0.4.2", + "once_cell", + "rustix 1.1.4", + "windows-sys 0.61.2", +] + +[[package]] +name = "thiserror" +version = "1.0.69" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b6aaf5339b578ea85b50e080feb250a3e8ae8cfcdff9a461c9ec2904bc923f52" +dependencies = [ + "thiserror-impl 1.0.69", +] + +[[package]] +name = "thiserror" +version = "2.0.18" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4288b5bcbc7920c07a1149a35cf9590a2aa808e0bc1eafaade0b80947865fbc4" +dependencies = [ + "thiserror-impl 2.0.18", +] + +[[package]] +name = "thiserror-impl" +version = "1.0.69" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4fee6c4efc90059e10f81e6d42c60a18f76588c3d74cb83a0b242a2b6c7504c1" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "thiserror-impl" +version = "2.0.18" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ebc4ee7f67670e9b64d05fa4253e753e016c6c95ff35b89b7941d6b856dec1d5" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "thread_local" +version = "1.1.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f60246a4944f24f6e018aa17cdeffb7818b76356965d03b07d6a9886e8962185" +dependencies = [ + "cfg-if", +] + +[[package]] +name = "time" +version = "0.3.47" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "743bd48c283afc0388f9b8827b976905fb217ad9e647fae3a379a9283c4def2c" +dependencies = [ + "deranged", + "itoa", + "num-conv", + "powerfmt", + "serde_core", + "time-core", + "time-macros", +] + +[[package]] +name = "time-core" +version = "0.1.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7694e1cfe791f8d31026952abf09c69ca6f6fa4e1a1229e18988f06a04a12dca" + +[[package]] +name = "time-macros" +version = "0.2.27" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2e70e4c5a0e0a8a4823ad65dfe1a6930e4f4d756dcd9dd7939022b5e8c501215" +dependencies = [ + "num-conv", + "time-core", +] + +[[package]] +name = "tinystr" +version = "0.8.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "42d3e9c45c09de15d06dd8acf5f4e0e399e85927b7f00711024eb7ae10fa4869" +dependencies = [ + "displaydoc", + "zerovec", +] + +[[package]] +name = "tinyvec" +version = "1.10.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bfa5fdc3bce6191a1dbc8c02d5c8bffcf557bafa17c124c5264a458f1b0613fa" +dependencies = [ + "tinyvec_macros", +] + +[[package]] +name = "tinyvec_macros" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1f3ccbac311fea05f86f61904b462b55fb3df8837a366dfc601a0161d0532f20" + +[[package]] +name = "tokio" +version = "1.50.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "27ad5e34374e03cfffefc301becb44e9dc3c17584f414349ebe29ed26661822d" +dependencies = [ + "bytes", + "libc", + "mio", + "parking_lot 0.12.5", + "pin-project-lite", + "signal-hook-registry", + "socket2 0.6.3", + "tokio-macros", + "windows-sys 0.61.2", +] + +[[package]] +name = "tokio-io-timeout" +version = "1.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0bd86198d9ee903fedd2f9a2e72014287c0d9167e4ae43b5853007205dda1b76" +dependencies = [ + "pin-project-lite", + "tokio", +] + +[[package]] +name = "tokio-macros" +version = "2.6.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5c55a2eff8b69ce66c84f85e1da1c233edc36ceb85a2058d11b0d6a3c7e7569c" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "tokio-native-tls" +version = "0.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bbae76ab933c85776efabc971569dd6119c580d8f5d448769dec1764bf796ef2" +dependencies = [ + "native-tls", + "tokio", +] + +[[package]] +name = "tokio-rustls" +version = "0.26.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1729aa945f29d91ba541258c8df89027d5792d85a8841fb65e8bf0f4ede4ef61" +dependencies = [ + "rustls", + "tokio", +] + +[[package]] +name = "tokio-stream" +version = "0.1.18" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "32da49809aab5c3bc678af03902d4ccddea2a87d028d86392a4b1560c6906c70" +dependencies = [ + "futures-core", + "pin-project-lite", + "tokio", +] + +[[package]] +name = "tokio-util" +version = "0.7.18" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9ae9cec805b01e8fc3fd2fe289f89149a9b66dd16786abd8b19cfa7b48cb0098" +dependencies = [ + "bytes", + "futures-core", + "futures-sink", + "pin-project-lite", + "tokio", +] + +[[package]] +name = "toml" +version = "0.8.23" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "dc1beb996b9d83529a9e75c17a1686767d148d70663143c7854d8b4a09ced362" +dependencies = [ + "serde", + "serde_spanned", + "toml_datetime", + "toml_edit", +] + +[[package]] +name = "toml_datetime" +version = "0.6.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "22cddaf88f4fbc13c51aebbf5f8eceb5c7c5a9da2ac40a13519eb5b0a0e8f11c" +dependencies = [ + "serde", +] + +[[package]] +name = "toml_edit" +version = "0.22.27" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "41fe8c660ae4257887cf66394862d21dbca4a6ddd26f04a3560410406a2f819a" +dependencies = [ + "indexmap 2.13.0", + "serde", + "serde_spanned", + "toml_datetime", + "toml_write", + "winnow", +] + +[[package]] +name = "toml_write" +version = "0.1.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5d99f8c9a7727884afe522e9bd5edbfc91a3312b36a77b5fb8926e4c31a41801" + +[[package]] +name = "tonic" +version = "0.12.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "877c5b330756d856ffcc4553ab34a5684481ade925ecc54bcd1bf02b1d0d4d52" +dependencies = [ + "async-stream", + "async-trait", + "axum", + "base64", + "bytes", + "h2", + "http 1.4.0", + "http-body 1.0.1", + "http-body-util", + "hyper", + "hyper-timeout", + "hyper-util", + "percent-encoding", + "pin-project", + "prost 0.13.5", + "rustls-pemfile", + "socket2 0.5.10", + "tokio", + "tokio-rustls", + "tokio-stream", + "tower 0.4.13", + "tower-layer", + "tower-service", + "tracing", +] + +[[package]] +name = "tonic-build" +version = "0.12.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9557ce109ea773b399c9b9e5dca39294110b74f1f342cb347a80d1fce8c26a11" +dependencies = [ + "prettyplease", + "proc-macro2", + "prost-build 0.13.5", + "prost-types 0.13.5", + "quote", + "syn", +] + +[[package]] +name = "tonic-reflection" +version = "0.12.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "878d81f52e7fcfd80026b7fdb6a9b578b3c3653ba987f87f0dce4b64043cba27" +dependencies = [ + "prost 0.13.5", + "prost-types 0.13.5", + "tokio", + "tokio-stream", + "tonic", +] + +[[package]] +name = "tower" +version = "0.4.13" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b8fa9be0de6cf49e536ce1851f987bd21a43b771b09473c3549a6c853db37c1c" +dependencies = [ + "futures-core", + "futures-util", + "indexmap 1.9.3", + "pin-project", + "pin-project-lite", + "rand 0.8.5", + "slab", + "tokio", + "tokio-util", + "tower-layer", + "tower-service", + "tracing", +] + +[[package]] +name = "tower" +version = "0.5.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ebe5ef63511595f1344e2d5cfa636d973292adc0eec1f0ad45fae9f0851ab1d4" +dependencies = [ + "futures-core", + "futures-util", + "pin-project-lite", + "sync_wrapper", + "tokio", + "tower-layer", + "tower-service", + "tracing", +] + +[[package]] +name = "tower-http" +version = "0.5.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1e9cd434a998747dd2c4276bc96ee2e0c7a2eadf3cae88e52be55a05fa9053f5" +dependencies = [ + "bitflags 2.11.0", + "bytes", + "http 1.4.0", + "http-body 1.0.1", + "http-body-util", + "pin-project-lite", + "tower-layer", + "tower-service", + "tracing", +] + +[[package]] +name = "tower-http" +version = "0.6.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d4e6559d53cc268e5031cd8429d05415bc4cb4aefc4aa5d6cc35fbf5b924a1f8" +dependencies = [ + "bitflags 2.11.0", + "bytes", + "futures-util", + "http 1.4.0", + "http-body 1.0.1", + "iri-string", + "pin-project-lite", + "tower 0.5.3", + "tower-layer", + "tower-service", +] + +[[package]] +name = "tower-layer" +version = "0.3.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "121c2a6cda46980bb0fcd1647ffaf6cd3fc79a013de288782836f6df9c48780e" + +[[package]] +name = "tower-service" +version = "0.3.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8df9b6e13f2d32c91b9bd719c00d1958837bc7dec474d94952798cc8e69eeec3" + +[[package]] +name = "tracing" +version = "0.1.44" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "63e71662fa4b2a2c3a26f570f037eb95bb1f85397f3cd8076caed2f026a6d100" +dependencies = [ + "log", + "pin-project-lite", + "tracing-attributes", + "tracing-core", +] + +[[package]] +name = "tracing-attributes" +version = "0.1.31" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7490cfa5ec963746568740651ac6781f701c9c5ea257c58e057f3ba8cf69e8da" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "tracing-core" +version = "0.1.36" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "db97caf9d906fbde555dd62fa95ddba9eecfd14cb388e4f491a66d74cd5fb79a" +dependencies = [ + "once_cell", + "valuable", +] + +[[package]] +name = "tracing-log" +version = "0.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ee855f1f400bd0e5c02d150ae5de3840039a3f54b025156404e34c23c03f47c3" +dependencies = [ + "log", + "once_cell", + "tracing-core", +] + +[[package]] +name = "tracing-subscriber" +version = "0.3.22" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2f30143827ddab0d256fd843b7a66d164e9f271cfa0dde49142c5ca0ca291f1e" +dependencies = [ + "matchers", + "nu-ansi-term", + "once_cell", + "regex-automata", + "sharded-slab", + "smallvec", + "thread_local", + "tracing", + "tracing-core", + "tracing-log", +] + +[[package]] +name = "try-lock" +version = "0.2.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e421abadd41a4225275504ea4d6566923418b7f05506fbc9c0fe86ba7396114b" + +[[package]] +name = "typenum" +version = "1.19.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "562d481066bde0658276a35467c4af00bdc6ee726305698a55b86e61d7ad82bb" + +[[package]] +name = "unicase" +version = "2.9.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "dbc4bc3a9f746d862c45cb89d705aa10f187bb96c76001afab07a0d35ce60142" + +[[package]] +name = "unicode-ident" +version = "1.0.24" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e6e4313cd5fcd3dad5cafa179702e2b244f760991f45397d14d4ebf38247da75" + +[[package]] +name = "unicode-xid" +version = "0.2.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ebc1c04c71510c7f702b52b7c350734c9ff1295c464a03335b00bb84fc54f853" + +[[package]] +name = "untrusted" +version = "0.9.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8ecb6da28b8a351d773b68d5825ac39017e680750f980f3a1a85cd8dd28a47c1" + +[[package]] +name = "url" +version = "2.5.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ff67a8a4397373c3ef660812acab3268222035010ab8680ec4215f38ba3d0eed" +dependencies = [ + "form_urlencoded", + "idna", + "percent-encoding", + "serde", +] + +[[package]] +name = "urlencoding" +version = "2.1.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "daf8dba3b7eb870caf1ddeed7bc9d2a049f3cfdfae7cb521b087cc33ae4c49da" + +[[package]] +name = "utf8_iter" +version = "1.0.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b6c140620e7ffbb22c2dee59cafe6084a59b5ffc27a8859a5f0d494b5d52b6be" + +[[package]] +name = "utf8parse" +version = "0.2.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "06abde3611657adf66d383f00b093d7faecc7fa57071cce2578660c9f1010821" + +[[package]] +name = "uuid" +version = "1.22.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a68d3c8f01c0cfa54a75291d83601161799e4a89a39e0929f4b0354d88757a37" +dependencies = [ + "getrandom 0.4.2", + "js-sys", + "wasm-bindgen", +] + +[[package]] +name = "valuable" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ba73ea9cf16a25df0c8caa16c51acb937d5712a8429db78a3ee29d5dcacd3a65" + +[[package]] +name = "vcpkg" +version = "0.2.15" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "accd4ea62f7bb7a82fe23066fb0957d48ef677f6eeb8215f372f52e48bb32426" + +[[package]] +name = "version_check" +version = "0.9.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0b928f33d975fc6ad9f86c8f283853ad26bdd5b10b7f1542aa2fa15e2289105a" + +[[package]] +name = "vsimd" +version = "0.8.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5c3082ca00d5a5ef149bb8b555a72ae84c9c59f7250f013ac822ac2e49b19c64" + +[[package]] +name = "want" +version = "0.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bfa7760aed19e106de2c7c0b581b509f2f25d3dacaf737cb82ac61bc6d760b0e" +dependencies = [ + "try-lock", +] + +[[package]] +name = "wasi" +version = "0.11.1+wasi-snapshot-preview1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ccf3ec651a847eb01de73ccad15eb7d99f80485de043efb2f370cd654f4ea44b" + +[[package]] +name = "wasip2" +version = "1.0.2+wasi-0.2.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9517f9239f02c069db75e65f174b3da828fe5f5b945c4dd26bd25d89c03ebcf5" +dependencies = [ + "wit-bindgen", +] + +[[package]] +name = "wasip3" +version = "0.4.0+wasi-0.3.0-rc-2026-01-06" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5428f8bf88ea5ddc08faddef2ac4a67e390b88186c703ce6dbd955e1c145aca5" +dependencies = [ + "wit-bindgen", +] + +[[package]] +name = "wasm-bindgen" +version = "0.2.114" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6532f9a5c1ece3798cb1c2cfdba640b9b3ba884f5db45973a6f442510a87d38e" +dependencies = [ + "cfg-if", + "once_cell", + "rustversion", + "wasm-bindgen-macro", + "wasm-bindgen-shared", +] + +[[package]] +name = "wasm-bindgen-futures" +version = "0.4.64" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e9c5522b3a28661442748e09d40924dfb9ca614b21c00d3fd135720e48b67db8" +dependencies = [ + "cfg-if", + "futures-util", + "js-sys", + "once_cell", + "wasm-bindgen", + "web-sys", +] + +[[package]] +name = "wasm-bindgen-macro" +version = "0.2.114" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "18a2d50fcf105fb33bb15f00e7a77b772945a2ee45dcf454961fd843e74c18e6" +dependencies = [ + "quote", + "wasm-bindgen-macro-support", +] + +[[package]] +name = "wasm-bindgen-macro-support" +version = "0.2.114" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "03ce4caeaac547cdf713d280eda22a730824dd11e6b8c3ca9e42247b25c631e3" +dependencies = [ + "bumpalo", + "proc-macro2", + "quote", + "syn", + "wasm-bindgen-shared", +] + +[[package]] +name = "wasm-bindgen-shared" +version = "0.2.114" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "75a326b8c223ee17883a4251907455a2431acc2791c98c26279376490c378c16" +dependencies = [ + "unicode-ident", +] + +[[package]] +name = "wasm-encoder" +version = "0.244.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "990065f2fe63003fe337b932cfb5e3b80e0b4d0f5ff650e6985b1048f62c8319" +dependencies = [ + "leb128fmt", + "wasmparser", +] + +[[package]] +name = "wasm-metadata" +version = "0.244.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bb0e353e6a2fbdc176932bbaab493762eb1255a7900fe0fea1a2f96c296cc909" +dependencies = [ + "anyhow", + "indexmap 2.13.0", + "wasm-encoder", + "wasmparser", +] + +[[package]] +name = "wasm-streams" +version = "0.4.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "15053d8d85c7eccdbefef60f06769760a563c7f0a9d6902a13d35c7800b0ad65" +dependencies = [ + "futures-util", + "js-sys", + "wasm-bindgen", + "wasm-bindgen-futures", + "web-sys", +] + +[[package]] +name = "wasmparser" +version = "0.244.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "47b807c72e1bac69382b3a6fb3dbe8ea4c0ed87ff5629b8685ae6b9a611028fe" +dependencies = [ + "bitflags 2.11.0", + "hashbrown 0.15.5", + "indexmap 2.13.0", + "semver", +] + +[[package]] +name = "web-sys" +version = "0.3.91" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "854ba17bb104abfb26ba36da9729addc7ce7f06f5c0f90f3c391f8461cca21f9" +dependencies = [ + "js-sys", + "wasm-bindgen", +] + +[[package]] +name = "web-time" +version = "1.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5a6580f308b1fad9207618087a65c04e7a10bc77e02c8e84e9b00dd4b12fa0bb" +dependencies = [ + "js-sys", + "wasm-bindgen", +] + +[[package]] +name = "webpki-roots" +version = "1.0.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "22cfaf3c063993ff62e73cb4311efde4db1efb31ab78a3e5c457939ad5cc0bed" +dependencies = [ + "rustls-pki-types", +] + +[[package]] +name = "weed-volume" +version = "0.1.0" +dependencies = [ + "anyhow", + "async-stream", + "async-trait", + "aws-config", + "aws-credential-types", + "aws-sdk-s3", + "aws-types", + "axum", + "base64", + "bytes", + "chrono", + "clap", + "crc32c", + "crc32fast", + "dashmap", + "flate2", + "futures", + "hex", + "http-body 1.0.1", + "hyper", + "hyper-util", + "image", + "jsonwebtoken", + "kamadak-exif", + "lazy_static", + "libc", + "md-5", + "memmap2", + "mime_guess", + "multer", + "parking_lot 0.12.5", + "pprof", + "prometheus", + "prost 0.13.5", + "prost-types 0.13.5", + "rand 0.8.5", + "redb", + "reed-solomon-erasure", + "reqwest", + "rustls", + "rustls-pemfile", + "rusty-leveldb", + "serde", + "serde_json", + "serde_urlencoded", + "sysinfo", + "tempfile", + "thiserror 1.0.69", + "tokio", + "tokio-io-timeout", + "tokio-rustls", + "tokio-stream", + "toml", + "tonic", + "tonic-build", + "tonic-reflection", + "tower 0.4.13", + "tower-http 0.5.2", + "tracing", + "tracing-subscriber", + "uuid", + "x509-parser", +] + +[[package]] +name = "weezl" +version = "0.1.12" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a28ac98ddc8b9274cb41bb4d9d4d5c425b6020c50c46f25559911905610b4a88" + +[[package]] +name = "winapi" +version = "0.3.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5c839a674fcd7a98952e593242ea400abe93992746761e38641405d28b00f419" +dependencies = [ + "winapi-i686-pc-windows-gnu", + "winapi-x86_64-pc-windows-gnu", +] + +[[package]] +name = "winapi-i686-pc-windows-gnu" +version = "0.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ac3b87c63620426dd9b991e5ce0329eff545bccbbb34f3be09ff6fb6ab51b7b6" + +[[package]] +name = "winapi-x86_64-pc-windows-gnu" +version = "0.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "712e227841d057c1ee1cd2fb22fa7e5a5461ae8e48fa2ca79ec42cfc1931183f" + +[[package]] +name = "windows" +version = "0.57.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "12342cb4d8e3b046f3d80effd474a7a02447231330ef77d71daa6fbc40681143" +dependencies = [ + "windows-core 0.57.0", + "windows-targets 0.52.6", +] + +[[package]] +name = "windows-core" +version = "0.57.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d2ed2439a290666cd67ecce2b0ffaad89c2a56b976b736e6ece670297897832d" +dependencies = [ + "windows-implement 0.57.0", + "windows-interface 0.57.0", + "windows-result 0.1.2", + "windows-targets 0.52.6", +] + +[[package]] +name = "windows-core" +version = "0.62.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b8e83a14d34d0623b51dce9581199302a221863196a1dde71a7663a4c2be9deb" +dependencies = [ + "windows-implement 0.60.2", + "windows-interface 0.59.3", + "windows-link", + "windows-result 0.4.1", + "windows-strings", +] + +[[package]] +name = "windows-implement" +version = "0.57.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9107ddc059d5b6fbfbffdfa7a7fe3e22a226def0b2608f72e9d552763d3e1ad7" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "windows-implement" +version = "0.60.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "053e2e040ab57b9dc951b72c264860db7eb3b0200ba345b4e4c3b14f67855ddf" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "windows-interface" +version = "0.57.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "29bee4b38ea3cde66011baa44dba677c432a78593e202392d1e9070cf2a7fca7" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "windows-interface" +version = "0.59.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3f316c4a2570ba26bbec722032c4099d8c8bc095efccdc15688708623367e358" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "windows-link" +version = "0.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f0805222e57f7521d6a62e36fa9163bc891acd422f971defe97d64e70d0a4fe5" + +[[package]] +name = "windows-registry" +version = "0.6.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "02752bf7fbdcce7f2a27a742f798510f3e5ad88dbe84871e5168e2120c3d5720" +dependencies = [ + "windows-link", + "windows-result 0.4.1", + "windows-strings", +] + +[[package]] +name = "windows-result" +version = "0.1.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5e383302e8ec8515204254685643de10811af0ed97ea37210dc26fb0032647f8" +dependencies = [ + "windows-targets 0.52.6", +] + +[[package]] +name = "windows-result" +version = "0.4.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7781fa89eaf60850ac3d2da7af8e5242a5ea78d1a11c49bf2910bb5a73853eb5" +dependencies = [ + "windows-link", +] + +[[package]] +name = "windows-strings" +version = "0.5.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7837d08f69c77cf6b07689544538e017c1bfcf57e34b4c0ff58e6c2cd3b37091" +dependencies = [ + "windows-link", +] + +[[package]] +name = "windows-sys" +version = "0.52.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "282be5f36a8ce781fad8c8ae18fa3f9beff57ec1b52cb3de0789201425d9a33d" +dependencies = [ + "windows-targets 0.52.6", +] + +[[package]] +name = "windows-sys" +version = "0.59.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1e38bc4d79ed67fd075bcc251a1c39b32a1776bbe92e5bef1f0bf1f8c531853b" +dependencies = [ + "windows-targets 0.52.6", +] + +[[package]] +name = "windows-sys" +version = "0.60.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f2f500e4d28234f72040990ec9d39e3a6b950f9f22d3dba18416c35882612bcb" +dependencies = [ + "windows-targets 0.53.5", +] + +[[package]] +name = "windows-sys" +version = "0.61.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ae137229bcbd6cdf0f7b80a31df61766145077ddf49416a728b02cb3921ff3fc" +dependencies = [ + "windows-link", +] + +[[package]] +name = "windows-targets" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9b724f72796e036ab90c1021d4780d4d3d648aca59e491e6b98e725b84e99973" +dependencies = [ + "windows_aarch64_gnullvm 0.52.6", + "windows_aarch64_msvc 0.52.6", + "windows_i686_gnu 0.52.6", + "windows_i686_gnullvm 0.52.6", + "windows_i686_msvc 0.52.6", + "windows_x86_64_gnu 0.52.6", + "windows_x86_64_gnullvm 0.52.6", + "windows_x86_64_msvc 0.52.6", +] + +[[package]] +name = "windows-targets" +version = "0.53.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4945f9f551b88e0d65f3db0bc25c33b8acea4d9e41163edf90dcd0b19f9069f3" +dependencies = [ + "windows-link", + "windows_aarch64_gnullvm 0.53.1", + "windows_aarch64_msvc 0.53.1", + "windows_i686_gnu 0.53.1", + "windows_i686_gnullvm 0.53.1", + "windows_i686_msvc 0.53.1", + "windows_x86_64_gnu 0.53.1", + "windows_x86_64_gnullvm 0.53.1", + "windows_x86_64_msvc 0.53.1", +] + +[[package]] +name = "windows_aarch64_gnullvm" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "32a4622180e7a0ec044bb555404c800bc9fd9ec262ec147edd5989ccd0c02cd3" + +[[package]] +name = "windows_aarch64_gnullvm" +version = "0.53.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a9d8416fa8b42f5c947f8482c43e7d89e73a173cead56d044f6a56104a6d1b53" + +[[package]] +name = "windows_aarch64_msvc" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "09ec2a7bb152e2252b53fa7803150007879548bc709c039df7627cabbd05d469" + +[[package]] +name = "windows_aarch64_msvc" +version = "0.53.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b9d782e804c2f632e395708e99a94275910eb9100b2114651e04744e9b125006" + +[[package]] +name = "windows_i686_gnu" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8e9b5ad5ab802e97eb8e295ac6720e509ee4c243f69d781394014ebfe8bbfa0b" + +[[package]] +name = "windows_i686_gnu" +version = "0.53.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "960e6da069d81e09becb0ca57a65220ddff016ff2d6af6a223cf372a506593a3" + +[[package]] +name = "windows_i686_gnullvm" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0eee52d38c090b3caa76c563b86c3a4bd71ef1a819287c19d586d7334ae8ed66" + +[[package]] +name = "windows_i686_gnullvm" +version = "0.53.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fa7359d10048f68ab8b09fa71c3daccfb0e9b559aed648a8f95469c27057180c" + +[[package]] +name = "windows_i686_msvc" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "240948bc05c5e7c6dabba28bf89d89ffce3e303022809e73deaefe4f6ec56c66" + +[[package]] +name = "windows_i686_msvc" +version = "0.53.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1e7ac75179f18232fe9c285163565a57ef8d3c89254a30685b57d83a38d326c2" + +[[package]] +name = "windows_x86_64_gnu" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "147a5c80aabfbf0c7d901cb5895d1de30ef2907eb21fbbab29ca94c5b08b1a78" + +[[package]] +name = "windows_x86_64_gnu" +version = "0.53.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9c3842cdd74a865a8066ab39c8a7a473c0778a3f29370b5fd6b4b9aa7df4a499" + +[[package]] +name = "windows_x86_64_gnullvm" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "24d5b23dc417412679681396f2b49f3de8c1473deb516bd34410872eff51ed0d" + +[[package]] +name = "windows_x86_64_gnullvm" +version = "0.53.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0ffa179e2d07eee8ad8f57493436566c7cc30ac536a3379fdf008f47f6bb7ae1" + +[[package]] +name = "windows_x86_64_msvc" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "589f6da84c646204747d1270a2a5661ea66ed1cced2631d546fdfb155959f9ec" + +[[package]] +name = "windows_x86_64_msvc" +version = "0.53.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d6bbff5f0aada427a1e5a6da5f1f98158182f26556f345ac9e04d36d0ebed650" + +[[package]] +name = "winnow" +version = "0.7.15" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "df79d97927682d2fd8adb29682d1140b343be4ac0f08fd68b7765d9c059d3945" +dependencies = [ + "memchr", +] + +[[package]] +name = "wit-bindgen" +version = "0.51.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d7249219f66ced02969388cf2bb044a09756a083d0fab1e566056b04d9fbcaa5" +dependencies = [ + "wit-bindgen-rust-macro", +] + +[[package]] +name = "wit-bindgen-core" +version = "0.51.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ea61de684c3ea68cb082b7a88508a8b27fcc8b797d738bfc99a82facf1d752dc" +dependencies = [ + "anyhow", + "heck", + "wit-parser", +] + +[[package]] +name = "wit-bindgen-rust" +version = "0.51.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b7c566e0f4b284dd6561c786d9cb0142da491f46a9fbed79ea69cdad5db17f21" +dependencies = [ + "anyhow", + "heck", + "indexmap 2.13.0", + "prettyplease", + "syn", + "wasm-metadata", + "wit-bindgen-core", + "wit-component", +] + +[[package]] +name = "wit-bindgen-rust-macro" +version = "0.51.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0c0f9bfd77e6a48eccf51359e3ae77140a7f50b1e2ebfe62422d8afdaffab17a" +dependencies = [ + "anyhow", + "prettyplease", + "proc-macro2", + "quote", + "syn", + "wit-bindgen-core", + "wit-bindgen-rust", +] + +[[package]] +name = "wit-component" +version = "0.244.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9d66ea20e9553b30172b5e831994e35fbde2d165325bec84fc43dbf6f4eb9cb2" +dependencies = [ + "anyhow", + "bitflags 2.11.0", + "indexmap 2.13.0", + "log", + "serde", + "serde_derive", + "serde_json", + "wasm-encoder", + "wasm-metadata", + "wasmparser", + "wit-parser", +] + +[[package]] +name = "wit-parser" +version = "0.244.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ecc8ac4bc1dc3381b7f59c34f00b67e18f910c2c0f50015669dde7def656a736" +dependencies = [ + "anyhow", + "id-arena", + "indexmap 2.13.0", + "log", + "semver", + "serde", + "serde_derive", + "serde_json", + "unicode-xid", + "wasmparser", +] + +[[package]] +name = "writeable" +version = "0.6.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9edde0db4769d2dc68579893f2306b26c6ecfbe0ef499b013d731b7b9247e0b9" + +[[package]] +name = "x509-parser" +version = "0.16.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fcbc162f30700d6f3f82a24bf7cc62ffe7caea42c0b2cba8bf7f3ae50cf51f69" +dependencies = [ + "asn1-rs", + "data-encoding", + "der-parser", + "lazy_static", + "nom", + "oid-registry", + "rusticata-macros", + "thiserror 1.0.69", + "time", +] + +[[package]] +name = "xmlparser" +version = "0.13.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "66fee0b777b0f5ac1c69bb06d361268faafa61cd4682ae064a171c16c433e9e4" + +[[package]] +name = "yoke" +version = "0.8.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "72d6e5c6afb84d73944e5cedb052c4680d5657337201555f9f2a16b7406d4954" +dependencies = [ + "stable_deref_trait", + "yoke-derive", + "zerofrom", +] + +[[package]] +name = "yoke-derive" +version = "0.8.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b659052874eb698efe5b9e8cf382204678a0086ebf46982b79d6ca3182927e5d" +dependencies = [ + "proc-macro2", + "quote", + "syn", + "synstructure", +] + +[[package]] +name = "zerocopy" +version = "0.8.40" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a789c6e490b576db9f7e6b6d661bcc9799f7c0ac8352f56ea20193b2681532e5" +dependencies = [ + "zerocopy-derive", +] + +[[package]] +name = "zerocopy-derive" +version = "0.8.40" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f65c489a7071a749c849713807783f70672b28094011623e200cb86dcb835953" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "zerofrom" +version = "0.1.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "50cc42e0333e05660c3587f3bf9d0478688e15d870fab3346451ce7f8c9fbea5" +dependencies = [ + "zerofrom-derive", +] + +[[package]] +name = "zerofrom-derive" +version = "0.1.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d71e5d6e06ab090c67b5e44993ec16b72dcbaabc526db883a360057678b48502" +dependencies = [ + "proc-macro2", + "quote", + "syn", + "synstructure", +] + +[[package]] +name = "zeroize" +version = "1.8.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b97154e67e32c85465826e8bcc1c59429aaaf107c1e4a9e53c8d8ccd5eff88d0" + +[[package]] +name = "zerotrie" +version = "0.2.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2a59c17a5562d507e4b54960e8569ebee33bee890c70aa3fe7b97e85a9fd7851" +dependencies = [ + "displaydoc", + "yoke", + "zerofrom", +] + +[[package]] +name = "zerovec" +version = "0.11.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6c28719294829477f525be0186d13efa9a3c602f7ec202ca9e353d310fb9a002" +dependencies = [ + "yoke", + "zerofrom", + "zerovec-derive", +] + +[[package]] +name = "zerovec-derive" +version = "0.11.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "eadce39539ca5cb3985590102671f2567e659fca9666581ad3411d59207951f3" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "zmij" +version = "1.0.21" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b8848ee67ecc8aedbaf3e4122217aff892639231befc6a1b58d29fff4c2cabaa" + +[[package]] +name = "zune-core" +version = "0.5.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cb8a0807f7c01457d0379ba880ba6322660448ddebc890ce29bb64da71fb40f9" + +[[package]] +name = "zune-jpeg" +version = "0.5.12" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "410e9ecef634c709e3831c2cfdb8d9c32164fae1c67496d5b68fff728eec37fe" +dependencies = [ + "zune-core", +] diff --git a/seaweed-volume/Cargo.toml b/seaweed-volume/Cargo.toml new file mode 100644 index 000000000..6d77586a9 --- /dev/null +++ b/seaweed-volume/Cargo.toml @@ -0,0 +1,137 @@ +[package] +name = "weed-volume" +version = "0.1.0" +edition = "2021" +description = "SeaweedFS Volume Server — Rust implementation" + +[lib] +name = "seaweed_volume" + +[[bin]] +name = "weed-volume" +path = "src/main.rs" + +[features] +# Default: 5-byte offsets (8TB max volume size), matching production Go builds (-tags 5BytesOffset). +# Disable with --no-default-features for 4-byte offsets (32GB max volume size). +default = ["5bytes"] +5bytes = [] + +[dependencies] +# Async runtime +tokio = { version = "1", features = ["full"] } +tokio-stream = "0.1" +tokio-io-timeout = "1" + +# gRPC + protobuf +tonic = { version = "0.12", features = ["tls"] } +tonic-reflection = "0.12" +prost = "0.13" +prost-types = "0.13" + +# HTTP server +axum = { version = "0.7", features = ["multipart"] } +http-body = "1" +hyper = { version = "1", features = ["full"] } +hyper-util = { version = "0.1", features = ["tokio", "service", "server-auto", "http1", "http2"] } +tower = "0.4" +tower-http = { version = "0.5", features = ["cors", "trace"] } + +# CLI +clap = { version = "4", features = ["derive"] } + +# Metrics +prometheus = { version = "0.13", default-features = false, features = ["process"] } +lazy_static = "1" + +# JWT +jsonwebtoken = { version = "10", features = ["rust_crypto"] } + +# TLS +rustls = "0.23" +tokio-rustls = "0.26" +rustls-pemfile = "2" + +# LevelDB (via RocksDB for better Rust support) +# Using rusty-leveldb for pure Rust LevelDB +rusty-leveldb = "3" + +# Disk-backed needle map (alternative to in-memory HashMap) +redb = "3" + +# Reed-Solomon erasure coding +reed-solomon-erasure = "6" + +# Logging +tracing = "0.1" +tracing-subscriber = { version = "0.3", features = ["env-filter"] } +pprof = { version = "0.15", features = ["prost-codec"] } + +# Config +toml = "0.8" +serde = { version = "1", features = ["derive"] } +serde_json = "1" +serde_urlencoded = "0.7" + +# CRC32 — using Castagnoli polynomial (CRC32-C), matching Go's crc32.Castagnoli +crc32c = "0.6" +crc32fast = "1" + +# Memory-mapped files +memmap2 = "0.9" + +# UUID +uuid = { version = "1", features = ["v4"] } + +# HTTP client (for proxying, remote fetch) +reqwest = { version = "0.12", features = ["rustls-tls", "stream", "multipart", "json"] } + +# Content hashing +md-5 = "0.10" +base64 = "0.22" + +# Compression +flate2 = "1" + +# Image processing +image = { version = "0.25", default-features = false, features = ["png", "jpeg", "gif", "webp"] } +kamadak-exif = "0.5" + +# Multipart form-data parsing +multer = "3" + +# MIME type guessing from file extensions +mime_guess = "2" + +# Misc +bytes = "1" +rand = "0.8" +chrono = "0.4" +hex = "0.4" +parking_lot = "0.12" +dashmap = "6" +thiserror = "1" +anyhow = "1" +async-trait = "0.1" +futures = "0.3" +async-stream = "0.3" +x509-parser = "0.16" + +# Disk space checking +sysinfo = "0.31" +libc = "0.2" + +# AWS S3 SDK (for remote storage backends) +aws-config = { version = "1", features = ["behavior-version-latest"] } +aws-sdk-s3 = { version = "1.125.0", default-features = false, features = ["sigv4a", "http-1x", "default-https-client", "rt-tokio"] } +aws-credential-types = "1" +aws-types = "1" + +[dev-dependencies] +tempfile = "3" + +[build-dependencies] +tonic-build = "0.12" + +[patch.crates-io] +reed-solomon-erasure = { path = "vendor/reed-solomon-erasure" } diff --git a/seaweed-volume/DEV_PLAN.md b/seaweed-volume/DEV_PLAN.md new file mode 100644 index 000000000..44b610538 --- /dev/null +++ b/seaweed-volume/DEV_PLAN.md @@ -0,0 +1,105 @@ +# Rust Volume Server — Dev Plan + +## Current Status (2026-03-07) + +**HTTP tests**: 53/53 pass (100%) +**gRPC tests**: 56/56 pass (100%) — includes TestVolumeMoveHandlesInFlightWrites with Rust multi-volume cluster +**Rust integration tests**: 8/8 pass +**S3 remote storage tests**: 3/3 pass +**Total**: 117/117 (100%) + 8 Rust + 3 S3 tests +**Rust unit tests**: 137 lib + 7 integration = 144 + +## Completed Features + +All phases from the original plan are complete: + +- **Phase 1** — HTTP Core: CORS, OPTIONS, unsupported methods, static assets, path routing, + cookie validation, conditional headers, range requests, dedup 204, content-encoding, + readDeleted, chunk manifests, multipart validation, MD5 check, file size limit, + upload/download throttling, image resize/crop, download disposition +- **Phase 2** — JWT/Security: signing keys from security.toml, token source precedence + (query > header > cookie), file_id claims, leeway=0 +- **Phase 3** — gRPC: maintenance mode, error message parity, ping routing, batch delete, + VolumeServerStatus (with real disk stats, data_center, rack), ReadVolumeFileStatus + (with timestamps) +- **Phase 4** — Streaming gRPC: VolumeIncrementalCopy, CopyFile, ReceiveFile, ReadAllNeedles, + VolumeTailSender, VolumeCopy, VolumeTailReceiver, VacuumVolumeCheck +- **Phase 5** — EC Shards: mount/unmount, delete, read, blob delete, rebuild, shards-to-volume, + copy, info +- **Phase 6** — Advanced gRPC: ScrubVolume, ScrubEcVolume, Query, FetchAndWriteNeedle, + VolumeTierMoveDat (error paths) +- **Phase 7** — Remote Storage: S3-compatible backend via aws-sdk-s3, + FetchAndWriteNeedle reads from S3/MinIO/SeaweedFS S3 and writes locally. + Supports all S3-compatible providers (AWS, Wasabi, Backblaze, Aliyun, etc.) +- **Master Heartbeat** — Bidirectional streaming SendHeartbeat RPC, volume/EC registration, + leader changes, shutdown deregistration. Tested end-to-end with Go master. +- **Production Sprint 1** — Quick wins: + - VolumeMarkReadonly master notification (triggers immediate heartbeat) + - Compaction throttling (`maybe_throttle_compaction()`) + - File size limit enforcement on upload + - `ts` query param for custom timestamps (upload + delete) + - TTL expiration check (was already implemented) + - Health check heartbeat status (returns 503 if disconnected from master) + - preStopSeconds graceful drain before shutdown + - S3 response passthrough headers (content-encoding, expires, content-language, content-disposition) + - .vif persistence for readonly state across restarts + - Webp image support for resize +- **Production Sprint 2** — Compatibility: + - MIME type extraction from Content-Type header + - Stats endpoints (/stats/counter, /stats/memory, /stats/disk) + - JSON pretty print (?pretty=y) and JSONP (?callback=fn) + - Request ID generation (UUID if x-amz-request-id missing) + - Advanced Prometheus metrics (INFLIGHT_REQUESTS, VOLUME_FILE_COUNT gauges) +- **Production Sprint 3** — Streaming & Multi-node: + - Streaming reads for large files (>1MB) via http_body::Body trait with spawn_blocking + - Meta-only needle reads (NeedleStreamInfo) to avoid loading full body for streaming + - Multi-volume Rust cluster support (RustMultiVolumeCluster test framework) + - TestVolumeMoveHandlesInFlightWrites now uses Rust volume servers + - CI skip list cleaned up (all tests pass with Rust) + +- **Production Sprint 4** — Advanced Features: + - BatchDelete EC shard support (ecx index lookup + ecj journal deletion) + - JPEG EXIF orientation auto-fix on upload (kamadak-exif + image crate) + - Async batched write processing (mpsc queue, up to 128 entries per batch) + - VolumeTierMoveDatToRemote/FromRemote (S3 multipart upload/download) + - S3TierRegistry for managing remote storage backends + - VolumeInfo (.vif) persistence for remote file references +- **Production Sprint 5** — Upload Compatibility: + - TTL query parameter extraction during upload (`ttl=3m`) + - Auto-compression for compressible file types (text/*, .js, .css, .json, .svg, etc.) + - Seaweed-* custom metadata headers stored as needle pairs (JSON, max 64KB) + - Filename extraction from URL path stored in needle name field + - Upload response includes filename + +- **Production Sprint 6** — Storage & Networking: + - Redb disk-backed needle maps (pure Rust, no C deps) via `NeedleMap` enum + - Binary search for `VolumeIncrementalCopy` with `since_ns > 0` + - Proxy/redirect read modes for non-local volumes (master lookup, HTTP proxy, 301 redirect) + +## Remaining Work (Production Readiness) + +No major remaining items. All phases and production sprints are complete. + +## Test Commands + +```bash +# Build +cd seaweed-volume && cargo build --release + +# Run all Go integration tests with Rust volume server +VOLUME_SERVER_IMPL=rust go test -v -count=1 -timeout 1200s ./test/volume_server/grpc/... ./test/volume_server/http/... + +# Run S3 remote storage tests +VOLUME_SERVER_IMPL=rust go test -v -count=1 -timeout 180s -run "TestFetchAndWriteNeedle(FromS3|S3NotFound)" ./test/volume_server/grpc/... + +# Run specific test +VOLUME_SERVER_IMPL=rust go test -v -count=1 -timeout 60s -run "TestName" ./test/volume_server/http/... + +# Run Rust unit tests +cd seaweed-volume && cargo test + +# Test heartbeat with Go master +weed master -port=9333 & +seaweed-volume --port 8080 --master localhost:9333 --dir /tmp/vol1 --max 7 +curl http://localhost:9333/dir/status # should show Rust volume server registered +``` diff --git a/seaweed-volume/MISSING_FEATURES.md b/seaweed-volume/MISSING_FEATURES.md new file mode 100644 index 000000000..807dd945c --- /dev/null +++ b/seaweed-volume/MISSING_FEATURES.md @@ -0,0 +1,288 @@ +# Rust Volume Server — Missing Features Audit + +Comprehensive line-by-line comparison of Go vs Rust volume server. +Generated 2026-03-07 from 4 parallel audits covering HTTP, gRPC, storage, and infrastructure. + +## Executive Summary + +| Area | Total Features | Implemented | Partial | Missing | +|------|---------------|-------------|---------|---------| +| gRPC RPCs | 48 | 43 (90%) | 2 (4%) | 3 (6%) | +| HTTP Handlers | 31 | 12 (39%) | 10 (32%) | 9 (29%) | +| Storage Layer | 22 | 6 (27%) | 7 (32%) | 9 (41%) | +| Infrastructure | 14 | 5 (36%) | 4 (29%) | 5 (36%) | + +--- + +## Priority 1 — Critical for Production + +### P1.1 Streaming / Meta-Only Reads +- **Go**: `ReadNeedleMeta()`, `ReadNeedleData()`, `ReadPagedData()` — reads only metadata or pages of large files +- **Go**: `streamWriteResponseContent()` streams needle data in chunks +- **Go**: `AttemptMetaOnly` / `MustMetaOnly` flags in `ReadOption` +- **Rust**: Reads entire needle into memory always +- **Impact**: OOM on large files; 8MB file = 8MB heap per request +- **Files**: `weed/storage/needle/needle_read.go`, `weed/server/volume_server_handlers_read.go` +- **Effort**: Medium + +### P1.2 Download Proxy/Redirect Fallback (ReadMode) +- **Go**: `ReadMode` config: "local" | "proxy" | "redirect" +- **Go**: `tryProxyToReplica()` probes replicas, `proxyReqToTargetServer()` streams response +- **Rust**: Always returns 404 for non-local volumes +- **Impact**: Clients must handle volume placement themselves; breaks transparent replication +- **Files**: `weed/server/volume_server_handlers_read.go:138-250` +- **Effort**: Medium + +### P1.3 TLS/HTTPS Support +- **Go**: `LoadServerTLS()`, `LoadClientTLS()`, cert/key loading from security.toml +- **Go**: Applied to both HTTP and gRPC servers +- **Rust**: No TLS at all — plain TCP only +- **Impact**: Cannot deploy in secure clusters +- **Files**: `weed/security/tls.go`, `weed/command/volume.go` +- **Effort**: Medium (rustls + tokio-rustls already in Cargo.toml) + +### P1.4 VolumeMarkReadonly/Writable Master Notification +- **Go**: `notifyMasterVolumeReadonly()` updates master with readonly state +- **Rust**: Only sets local in-memory flag +- **Impact**: Master keeps directing writes to readonly volume +- **Files**: `weed/server/volume_grpc_admin.go` +- **Effort**: Low + +### P1.5 Compaction/Maintenance Throttling +- **Go**: `WriteThrottler` with `MaybeSlowdown()` for MB/s rate limiting +- **Rust**: Flags parsed but no throttle implementation +- **Impact**: Compaction/copy operations can saturate disk IO +- **Files**: `weed/util/throttler.go` +- **Effort**: Low + +### P1.6 File Size Limit Enforcement +- **Go**: `fileSizeLimitBytes` checked on upload, returns 400 +- **Rust**: No enforcement — accepts any size +- **Impact**: Can write files larger than volume size limit +- **Files**: `weed/server/volume_server_handlers_write.go` +- **Effort**: Low + +--- + +## Priority 2 — Important for Compatibility + +### P2.1 `ts` Query Param (Custom Timestamps) +- **Go**: Upload and delete accept `ts` query param for custom Last-Modified time +- **Rust**: Always uses current time +- **Impact**: Replication timestamp fidelity; sync from external sources +- **Files**: `weed/server/volume_server_handlers_write.go`, `volume_server_handlers_admin.go` +- **Effort**: Low + +### P2.2 Multipart Form Upload Parsing +- **Go**: `needle.CreateNeedleFromRequest()` parses multipart forms, extracts MIME type, custom headers/pairs +- **Rust**: Reads raw body bytes only — no multipart form parsing for metadata +- **Impact**: MIME type not stored; custom needle pairs not supported +- **Files**: `weed/storage/needle/needle.go:CreateNeedleFromRequest` +- **Effort**: Medium + +### P2.3 JPEG Orientation Auto-Fix +- **Go**: `images.FixJpgOrientation()` on upload when enabled +- **Rust**: Not implemented (flag exists but unused) +- **Impact**: Mobile uploads may display rotated +- **Files**: `weed/images/orientation.go` +- **Effort**: Low (exif crate) + +### P2.4 TTL Expiration Enforcement +- **Go**: Checks `HasTtl()` + `AppendAtNs` against current time on read path +- **Rust**: TTL struct exists but no expiration checking +- **Impact**: Expired needles still served +- **Files**: `weed/storage/needle/volume_ttl.go`, `weed/storage/volume_read.go` +- **Effort**: Low + +### P2.5 Health Check — Master Heartbeat Status +- **Go**: Returns 503 if not heartbeating (can't reach master) +- **Rust**: Only checks `is_stopping` flag +- **Impact**: Load balancers won't detect disconnected volume servers +- **Files**: `weed/server/volume_server.go` +- **Effort**: Low + +### P2.6 Stats Endpoints +- **Go**: `/stats/counter`, `/stats/memory`, `/stats/disk` (whitelist-guarded) +- **Rust**: Not implemented +- **Impact**: No operational visibility +- **Files**: `weed/server/volume_server.go` +- **Effort**: Low + +### P2.7 Webp Image Support +- **Go**: `.webp` included in resize-eligible extensions +- **Rust**: Only `.png`, `.jpg`, `.jpeg`, `.gif` +- **Impact**: Webp images can't be resized on read +- **Files**: `weed/server/volume_server_handlers_read.go` +- **Effort**: Low (add webp feature to image crate) + +### P2.8 preStopSeconds Graceful Drain +- **Go**: Stops heartbeat, waits N seconds, then shuts down servers +- **Rust**: Immediate shutdown on signal +- **Impact**: In-flight requests dropped; Kubernetes readiness race +- **Files**: `weed/command/volume.go` +- **Effort**: Low + +### P2.9 S3 Response Passthrough Headers +- **Go**: `response-content-encoding`, `response-expires`, `response-content-language` query params +- **Rust**: Only handles `response-content-type`, `response-cache-control`, `dl` +- **Impact**: S3-compatible GET requests missing some override headers +- **Files**: `weed/server/volume_server_handlers_read.go` +- **Effort**: Low + +--- + +## Priority 3 — Storage Layer Gaps + +### P3.1 LevelDB Needle Maps +- **Go**: 5 needle map variants: memory, LevelDB, LevelDB-medium, LevelDB-large, sorted-file +- **Rust**: Memory-only needle map +- **Impact**: Large volumes (millions of needles) require too much RAM +- **Files**: `weed/storage/needle_map_leveldb.go` +- **Effort**: High (need LevelDB binding or alternative) + +### P3.2 Async Request Processing +- **Go**: `asyncRequestsChan` with 128-entry queue, worker goroutine for batched writes +- **Rust**: All writes synchronous +- **Impact**: Write throughput limited by fsync latency +- **Files**: `weed/storage/needle/async_request.go` +- **Effort**: Medium + +### P3.3 Volume Scrubbing (Data Integrity) +- **Go**: `ScrubIndex()`, `scrubVolumeData()` — full data + index verification +- **Rust**: Stub only in gRPC (returns OK without actual scrubbing) +- **Impact**: No way to verify data integrity +- **Files**: `weed/storage/volume_checking.go`, `weed/storage/idx/check.go` +- **Effort**: Medium + +### P3.4 Volume Backup / Sync +- **Go**: Streaming backup, binary search for last modification, index generation scanner +- **Rust**: Not implemented +- **Impact**: No backup/restore capability +- **Files**: `weed/storage/volume_backup.go` +- **Effort**: Medium + +### P3.5 Volume Info (.vif) Persistence +- **Go**: `.vif` files store tier/remote metadata, readonly state persists across restarts +- **Rust**: No `.vif` support; readonly is in-memory only +- **Impact**: Readonly state lost on restart; no tier metadata +- **Files**: `weed/storage/volume_info/volume_info.go` +- **Effort**: Low + +### P3.6 Disk Location Features +- **Go**: Directory UUID tracking, disk space monitoring, min-free-space enforcement, tag-based grouping +- **Rust**: Basic directory only +- **Impact**: No disk-full protection +- **Files**: `weed/storage/disk_location.go` +- **Effort**: Medium + +### P3.7 Compact Map (Memory-Efficient Needle Map) +- **Go**: `CompactMap` with overflow handling for memory optimization +- **Rust**: Uses standard HashMap +- **Impact**: Higher memory usage for index +- **Files**: `weed/storage/needle_map/compact_map.go` +- **Effort**: Medium + +--- + +## Priority 4 — Nice to Have + +### P4.1 gRPC: VolumeTierMoveDatToRemote / FromRemote +- **Go**: Full streaming implementation for tiering volumes to/from S3 +- **Rust**: Stub returning error +- **Files**: `weed/server/volume_grpc_tier_upload.go`, `volume_grpc_tier_download.go` +- **Effort**: High + +### P4.2 gRPC: Query (S3 Select) +- **Go**: JSON/CSV query over needle data (S3 Select compatible) +- **Rust**: Stub returning error +- **Files**: `weed/server/volume_grpc_query.go` +- **Effort**: High + +### P4.3 FetchAndWriteNeedle — Already Implemented +- **Note**: The gRPC audit incorrectly flagged this as missing. It was implemented in a prior session with full S3 remote storage support. + +### P4.4 JSON Pretty Print + JSONP +- **Go**: `?pretty` query param for indented JSON; `?callback=fn` for JSONP +- **Rust**: Neither supported +- **Effort**: Low + +### P4.5 Request ID Generation +- **Go**: Generates UUID if `x-amz-request-id` header missing, propagates to gRPC context +- **Rust**: Only echoes existing header +- **Effort**: Low + +### P4.6 UI Status Page +- **Go**: Full HTML template with volumes, disks, stats, uptime +- **Rust**: Stub HTML +- **Effort**: Medium + +### P4.7 Advanced Prometheus Metrics +- **Go**: InFlightRequestsGauge, ConcurrentUploadLimit/DownloadLimit gauges, metrics push gateway +- **Rust**: Basic request counter and histogram only +- **Effort**: Low + +### P4.8 Profiling (pprof) +- **Go**: CPU/memory profiling, /debug/pprof endpoints +- **Rust**: Flags parsed but not wired +- **Effort**: Medium (tokio-console or pprof-rs) + +### P4.9 EC Distribution / Rebalancing +- **Go**: 17 files for EC operations including placement strategies, recovery, scrubbing +- **Rust**: 6 files with basic encoder/decoder +- **Effort**: High + +### P4.10 Cookie Mismatch Status Code +- **Go**: Returns 406 Not Acceptable +- **Rust**: Returns 400 Bad Request +- **Effort**: Trivial + +--- + +## Implementation Order Recommendation + +### Sprint 1 — Quick Wins (Low effort, high impact) ✅ DONE +1. ✅ P1.4 VolumeMarkReadonly master notification — triggers immediate heartbeat +2. ✅ P1.5 Compaction throttling — `maybe_throttle_compaction()` method added +3. ✅ P1.6 File size limit enforcement — checks `file_size_limit_bytes` on upload +4. ✅ P2.1 `ts` query param — custom timestamps for upload and delete +5. ✅ P2.4 TTL expiration check — was already implemented +6. ✅ P2.5 Health check heartbeat status — returns 503 if not heartbeating +7. ✅ P2.8 preStopSeconds — graceful drain delay before shutdown +8. ✅ P2.9 S3 passthrough headers — content-encoding, expires, content-language, content-disposition +9. ✅ P3.5 .vif persistence — readonly state persists across restarts +10. ✅ P2.7 Webp support — added to image resize-eligible extensions +11. ~~P4.10 Cookie 406~~ — Go actually uses 404 for HTTP cookie mismatch (406 is gRPC batch delete only) + +### Sprint 2 — Core Read Path (Medium effort) — Partially Done +1. P1.1 Streaming / meta-only reads — TODO (medium effort, no test coverage yet) +2. ✅ P1.2 ReadMode proxy/redirect — was already implemented and tested +3. ✅ P2.2 Multipart form parsing — MIME type extraction from Content-Type header +4. P2.3 JPEG orientation fix — TODO (low effort, needs exif crate) +5. ✅ P2.6 Stats endpoints — /stats/counter, /stats/memory, /stats/disk +6. ✅ P2.7 Webp support — done in Sprint 1 +7. ✅ P4.4 JSON pretty print + JSONP — ?pretty=y and ?callback=fn +8. ✅ P4.5 Request ID generation — generates UUID if x-amz-request-id missing +9. ✅ P4.7 Advanced Prometheus metrics — INFLIGHT_REQUESTS gauge, VOLUME_FILE_COUNT gauge + +### Sprint 3 — Infrastructure (Medium effort) — Partially Done +1. ✅ P1.3 TLS/HTTPS — rustls + tokio-rustls for HTTP, tonic ServerTlsConfig for gRPC +2. P3.2 Async request processing — TODO (medium effort) +3. ✅ P3.3 Volume scrubbing — CRC checksum verification of all needles +4. ✅ P3.6 Disk location features — MinFreeSpace enforcement, background disk monitor + +### Sprint 4 — Storage Advanced (High effort) — Deferred +No integration test coverage for these items. All existing tests pass. +1. P3.1 LevelDB needle maps — needed only for volumes with millions of needles +2. P3.4 Volume backup/sync — streaming backup, binary search +3. P3.7 Compact map — memory optimization for needle index +4. P4.1 VolumeTierMoveDat — full S3 tiering (currently error stub) +5. P4.9 EC distribution — advanced EC placement/rebalancing + +### Sprint 5 — Polish — Deferred +No integration test coverage for these items. +1. P4.2 Query (S3 Select) — JSON/CSV query over needle data +2. ✅ P4.4 JSON pretty/JSONP — done in Sprint 2 +3. ✅ P4.5 Request ID generation — done in Sprint 2 +4. P4.6 UI status page — HTML template with volume/disk/stats info +5. ✅ P4.7 Advanced metrics — done in Sprint 2 +6. P4.8 Profiling — pprof-rs or tokio-console diff --git a/seaweed-volume/PARITY_PLAN.md b/seaweed-volume/PARITY_PLAN.md new file mode 100644 index 000000000..2f37fdabd --- /dev/null +++ b/seaweed-volume/PARITY_PLAN.md @@ -0,0 +1,230 @@ +# Rust Volume Server Parity Plan + +Generated: 2026-03-16 + +## Goal + +Make `seaweed-volume` a drop-in replacement for the Go volume server by: + +- comparing every Go volume-server code path against the Rust implementation, +- recording file-level ownership and verification status, +- closing verified behavior gaps one logic change per commit, +- extending tests so regressions are caught by Go parity suites and Rust unit/integration tests. + +## Ground Truth + +Primary Go sources: + +- `weed/server/volume_server.go` +- `weed/server/volume_server_handlers*.go` +- `weed/server/volume_grpc_*.go` +- `weed/server/constants/volume.go` +- `weed/storage/store*.go` +- `weed/storage/disk_location*.go` +- `weed/storage/volume*.go` +- `weed/storage/needle/*.go` +- `weed/storage/idx/*.go` +- `weed/storage/needle_map*.go` +- `weed/storage/needle_map/*.go` +- `weed/storage/super_block/*.go` +- `weed/storage/erasure_coding/*.go` + +Supporting Go dependencies that affect drop-in behavior: + +- `weed/command/volume.go` +- `weed/security/*.go` +- `weed/images/*.go` +- `weed/stats/*.go` + +Primary Rust sources: + +- `seaweed-volume/src/main.rs` +- `seaweed-volume/src/config.rs` +- `seaweed-volume/src/security.rs` +- `seaweed-volume/src/images.rs` +- `seaweed-volume/src/server/*.rs` +- `seaweed-volume/src/storage/*.rs` +- `seaweed-volume/src/storage/needle/*.rs` +- `seaweed-volume/src/storage/idx/*.rs` +- `seaweed-volume/src/storage/erasure_coding/*.rs` +- `seaweed-volume/src/remote_storage/*.rs` + +## Audit Method + +For each Go file: + +1. Map it to the Rust file or files that should own the same behavior. +2. Compare exported entry points, helper functions, state transitions, wire fields, and persistence side effects. +3. Mark each file `implemented`, `partial`, `missing`, or `needs verification`. +4. Link each behavior to an existing test or add a missing test. +5. Only treat a gap as closed after code review plus local verification. + +## Acceptance Criteria + +The Rust server is a drop-in replacement only when all of these hold: + +- HTTP routes, status codes, headers, and body semantics match Go. +- gRPC RPCs match Go request validation, response fields, streaming behavior, and maintenance/read-only semantics. +- Master heartbeat and topology metadata match Go closely enough that the Go master treats Rust and Go volume servers the same. +- On-disk volume behavior matches Go for normal volumes, EC shards, tiering metadata, and readonly persistence. +- Startup flags and operational endpoints that affect production deployment behave equivalently or are explicitly documented as unsupported. +- Existing Go integration suites pass with `VOLUME_SERVER_IMPL=rust`. + +## File Matrix + +### HTTP server surface + +| Go file | Rust counterpart | Status | Comparison focus | +| --- | --- | --- | --- | +| `weed/server/volume_server.go` | `seaweed-volume/src/main.rs`, `seaweed-volume/src/server/volume_server.rs`, `seaweed-volume/src/server/heartbeat.rs` | partial | startup wiring, routers, heartbeat, shutdown, metrics/debug listeners | +| `weed/server/volume_server_handlers.go` | `seaweed-volume/src/server/volume_server.rs`, `seaweed-volume/src/server/handlers.rs` | needs verification | method dispatch, OPTIONS behavior, public/admin split | +| `weed/server/volume_server_handlers_admin.go` | `seaweed-volume/src/server/handlers.rs` | implemented | `/status`, `/healthz`, stats, server headers | +| `weed/server/volume_server_handlers_helper.go` | `seaweed-volume/src/server/handlers.rs` | needs verification | JSON encoding, request parsing, helper parity | +| `weed/server/volume_server_handlers_read.go` | `seaweed-volume/src/server/handlers.rs` | needs verification | JWT, conditional reads, range reads, proxy/redirect, chunk manifests, image transforms | +| `weed/server/volume_server_handlers_ui.go` | `seaweed-volume/src/server/handlers.rs`, embedded assets | partial | UI payload and HTML parity | +| `weed/server/volume_server_handlers_write.go` | `seaweed-volume/src/server/handlers.rs`, `seaweed-volume/src/images.rs` | needs verification | multipart parsing, metadata, compression, ts, delete semantics | +| `weed/server/constants/volume.go` | `seaweed-volume/src/server/heartbeat.rs`, config defaults | needs verification | heartbeat timing, constants parity | + +### gRPC server surface + +| Go file | Rust counterpart | Status | Comparison focus | +| --- | --- | --- | --- | +| `weed/server/volume_grpc_admin.go` | `seaweed-volume/src/server/grpc_server.rs` | needs verification | readonly/writable, allocate/delete/configure/mount/unmount | +| `weed/server/volume_grpc_batch_delete.go` | `seaweed-volume/src/server/grpc_server.rs` | implemented | batch delete, EC delete path | +| `weed/server/volume_grpc_client_to_master.go` | `seaweed-volume/src/server/heartbeat.rs` | partial | heartbeat fields, leader changes, metrics settings from master | +| `weed/server/volume_grpc_copy.go` | `seaweed-volume/src/server/grpc_server.rs` | needs verification | full copy streams | +| `weed/server/volume_grpc_copy_incremental.go` | `seaweed-volume/src/server/grpc_server.rs` | needs verification | incremental copy binary search, timestamps | +| `weed/server/volume_grpc_erasure_coding.go` | `seaweed-volume/src/server/grpc_server.rs`, `seaweed-volume/src/storage/erasure_coding/*.rs` | needs verification | shard read/write/delete/mount/unmount/rebuild | +| `weed/server/volume_grpc_query.go` | `seaweed-volume/src/server/grpc_server.rs` | needs verification | query validation and error parity | +| `weed/server/volume_grpc_read_all.go` | `seaweed-volume/src/server/grpc_server.rs` | needs verification | read-all ordering and tail semantics | +| `weed/server/volume_grpc_read_write.go` | `seaweed-volume/src/server/grpc_server.rs`, `seaweed-volume/src/storage/*.rs` | needs verification | blob/meta/page reads, write blob semantics | +| `weed/server/volume_grpc_remote.go` | `seaweed-volume/src/server/grpc_server.rs`, `seaweed-volume/src/remote_storage/*.rs` | needs verification | remote fetch/write and tier metadata | +| `weed/server/volume_grpc_scrub.go` | `seaweed-volume/src/server/grpc_server.rs`, `seaweed-volume/src/storage/*.rs` | needs verification | scrub result semantics | +| `weed/server/volume_grpc_state.go` | `seaweed-volume/src/server/grpc_server.rs` | implemented | GetState/SetState/Status | +| `weed/server/volume_grpc_tail.go` | `seaweed-volume/src/server/grpc_server.rs` | needs verification | tail streaming and idle timeout | +| `weed/server/volume_grpc_tier_download.go` | `seaweed-volume/src/server/grpc_server.rs`, `seaweed-volume/src/remote_storage/*.rs` | needs verification | tier download stream/error paths | +| `weed/server/volume_grpc_tier_upload.go` | `seaweed-volume/src/server/grpc_server.rs`, `seaweed-volume/src/remote_storage/*.rs` | needs verification | tier upload stream/error paths | +| `weed/server/volume_grpc_vacuum.go` | `seaweed-volume/src/server/grpc_server.rs`, `seaweed-volume/src/storage/*.rs` | needs verification | compact/commit/cleanup progress and readonly transitions | + +### Storage and persistence surface + +| Go file group | Rust counterpart | Status | Comparison focus | +| --- | --- | --- | --- | +| `weed/storage/store.go`, `store_state.go` | `seaweed-volume/src/storage/store.rs`, `seaweed-volume/src/server/heartbeat.rs` | partial | topology metadata, disk tags, server id, state persistence | +| `weed/storage/store_vacuum.go` | `seaweed-volume/src/storage/store.rs`, `seaweed-volume/src/storage/volume.rs` | needs verification | vacuum sequencing | +| `weed/storage/store_ec.go`, `store_ec_delete.go`, `store_ec_scrub.go` | `seaweed-volume/src/storage/store.rs`, `seaweed-volume/src/storage/erasure_coding/*.rs` | needs verification | EC lifecycle and scrub behavior | +| `weed/storage/disk_location.go`, `disk_location_ec.go` | `seaweed-volume/src/storage/disk_location.rs`, `seaweed-volume/src/storage/store.rs` | partial | directory UUIDs, tags, load rules, disk space checks | +| `weed/storage/volume.go`, `volume_loading.go` | `seaweed-volume/src/storage/volume.rs` | needs verification | load/reload/readonly/remote metadata | +| `weed/storage/volume_super_block.go` | `seaweed-volume/src/storage/super_block.rs`, `seaweed-volume/src/storage/volume.rs` | implemented | super block parity | +| `weed/storage/volume_read.go`, `volume_read_all.go` | `seaweed-volume/src/storage/volume.rs`, `seaweed-volume/src/server/handlers.rs` | needs verification | full/meta/page reads, TTL, streaming | +| `weed/storage/volume_write.go` | `seaweed-volume/src/storage/volume.rs`, `seaweed-volume/src/server/write_queue.rs` | needs verification | dedup, sync/async writes, metadata flags | +| `weed/storage/volume_vacuum.go` | `seaweed-volume/src/storage/volume.rs` | needs verification | compact and commit parity | +| `weed/storage/volume_backup.go` | `seaweed-volume/src/storage/volume.rs`, `seaweed-volume/src/server/grpc_server.rs` | needs verification | backup/search logic | +| `weed/storage/volume_checking.go` | `seaweed-volume/src/storage/volume.rs`, `seaweed-volume/src/storage/idx/mod.rs`, `seaweed-volume/src/server/grpc_server.rs` | needs verification | scrub and integrity checks | +| `weed/storage/volume_info.go`, `volume_info/volume_info.go`, `volume_tier.go` | `seaweed-volume/src/storage/volume.rs`, `seaweed-volume/src/remote_storage/*.rs` | needs verification | `.vif` format and tiered file metadata | +| `weed/storage/needle/*.go` | `seaweed-volume/src/storage/needle/*.rs` | needs verification | needle parsing, CRC, TTL, multipart metadata | +| `weed/storage/idx/*.go` | `seaweed-volume/src/storage/idx/*.rs` | needs verification | index walking and binary search | +| `weed/storage/needle_map*.go`, `needle_map/*.go` | `seaweed-volume/src/storage/needle_map.rs` | needs verification | map kind parity, persistence, memory behavior | +| `weed/storage/super_block/*.go` | `seaweed-volume/src/storage/super_block.rs` | implemented | replica placement and TTL metadata | +| `weed/storage/erasure_coding/*.go` | `seaweed-volume/src/storage/erasure_coding/*.rs` | needs verification | EC shard placement, encode/decode, journal deletes | + +### Supporting runtime surface + +| Go file | Rust counterpart | Status | Comparison focus | +| --- | --- | --- | --- | +| `weed/command/volume.go` | `seaweed-volume/src/config.rs`, `seaweed-volume/src/main.rs` | partial | flags, metrics/debug listeners, startup behavior | +| `weed/security/*.go` | `seaweed-volume/src/security.rs`, `seaweed-volume/src/main.rs` | implemented | JWT and TLS loading | +| `weed/images/*.go` | `seaweed-volume/src/images.rs`, `seaweed-volume/src/server/handlers.rs` | implemented | JPEG orientation and transforms | +| `weed/stats/*.go` | `seaweed-volume/src/metrics.rs`, `seaweed-volume/src/server/handlers.rs` | partial | metrics endpoints, push-gateway integration | + +## Verified Gaps As Of 2026-03-08 + +The startup/runtime gaps that were verified in the initial audit are now closed: + +1. Heartbeat metadata parity + Closed by `8ade1c51d` and retained in current HEAD. + +2. Dedicated metrics/debug listener parity + Closed by `fbe0e5829`. + +3. Master-provided metrics push settings + Closed by `fbe0e5829`. + +4. Slow-read tuning parity + Closed by `66e3900dc`. + +There are no remaining verified gaps from the initial startup/runtime audit. The broader line-by-line comparison batches below are still required to either confirm parity or surface new gaps. + +## Execution Status As Of 2026-03-16 + +The file-by-file comparison and verification work executed in this round was: + +1. Startup and harness alignment + Compared `weed/command/volume.go`, `test/volume_server/framework/cluster*.go`, `seaweed-volume/src/config.rs`, and `seaweed-volume/src/main.rs` to ensure the Rust server is invoked with Go-compatible flags and is rebuilt from the current source during parity runs. + +2. HTTP admin surface + Compared `weed/server/volume_server_handlers_admin.go` against `seaweed-volume/src/server/handlers.rs` with emphasis on `/status` payload shape, disk-status fields, and volume ordering. + +3. gRPC admin surface + Compared `weed/server/volume_grpc_admin.go` against `seaweed-volume/src/server/grpc_server.rs` with emphasis on `Ping`, `VolumeConfigure`, readonly/writable flows, and error wrapping. + +4. Storage/index layout + Compared Go index-entry defaults in `weed/storage/types` and `weed/storage/idx/*.go` against the Rust default feature set in `seaweed-volume/Cargo.toml` and the Rust index reader/writer paths to confirm default binaries use the same offset width. + +5. End-to-end parity verification + Re-ran the Go HTTP and gRPC integration suites with `VOLUME_SERVER_IMPL=rust` after each fix to confirm wire-level compatibility. + +### Verified mismatches closed in this round + +- Rust parity runs could reuse a stale `weed-volume` binary across test invocations, hiding source and feature changes from the Go harness. +- Rust defaulted to 5-byte index offsets, while the default Go `go build` path uses 4-byte offsets unless built with `-tags 5BytesOffset`. +- Rust `/status` omitted Go fields in both `Volumes` and `DiskStatuses`, and did not sort volumes by `Id`. +- Rust `Ping` treated an empty target as a self-ping and only performed a raw gRPC connect for filer targets; Go returns `remote_time_ns=0` for the empty request and performs a real filer `Ping` RPC. +- Rust `VolumeNeedleStatus` dropped stored TTL metadata and reported `data_size` instead of Go’s `Size` field. +- Rust multipart uploads ignored form fields such as `ts`, `ttl`, and `cm`, and also ignored part-level `Content-Encoding` and `Content-MD5`. +- Rust only treated `dl=true` and `dl=1` as truthy, while Go accepts the full `strconv.ParseBool` set such as `dl=t` and `dl=True`. + +### Verification commands + +- `VOLUME_SERVER_IMPL=rust go test -count=1 -timeout 1200s ./test/volume_server/http/...` +- `VOLUME_SERVER_IMPL=rust go test -count=1 -timeout 1200s ./test/volume_server/grpc/...` + +## Execution Plan + +### Batch 1: startup and heartbeat + +- Compare `weed/command/volume.go`, `weed/server/volume_server.go`, `weed/server/volume_grpc_client_to_master.go`, `weed/storage/store.go`, and `weed/storage/disk_location.go`. +- Close metadata and startup parity gaps that affect master registration and deployment compatibility. +- Add Rust unit tests for heartbeat payloads and config wiring. + +### Batch 2: HTTP read path + +- Compare `volume_server_handlers_read.go`, `volume_server_handlers_helper.go`, and related storage read functions line by line. +- Verify JWT, path parsing, proxy/redirect, ranges, streaming, chunk manifests, image transforms, and response-header overrides. +- Extend `test/volume_server/http/...` and Rust handler tests where parity is not covered. + +### Batch 3: HTTP write/delete path + +- Compare `volume_server_handlers_write.go` and write-related storage functions. +- Verify multipart behavior, metadata, md5, compression, unchanged writes, delete edge cases, and timestamp handling. + +### Batch 4: gRPC admin and lifecycle + +- Compare `volume_grpc_admin.go`, `volume_grpc_state.go`, and `volume_grpc_vacuum.go`. +- Verify readonly/writable flows, maintenance mode, status payloads, mount/unmount/delete/configure, and vacuum transitions. + +### Batch 5: gRPC data movement + +- Compare `volume_grpc_read_write.go`, `copy*.go`, `read_all.go`, `tail.go`, `remote.go`, and `query.go`. +- Verify stream framing, binary search, idle timeout, and remote-storage semantics. + +### Batch 6: storage internals + +- Compare all `weed/storage` volume, needle, idx, needle map, and EC files line by line. +- Focus on persistence rules, readonly semantics, TTL, recovery/scrub, backup, and memory/disk map behavior. + +## Commit Strategy + +- One commit for the audit/plan document if the document itself changes. +- One commit per logic fix. +- Every logic commit must include the smallest test addition that proves the new parity claim. diff --git a/seaweed-volume/README.md b/seaweed-volume/README.md new file mode 100644 index 000000000..4367a0722 --- /dev/null +++ b/seaweed-volume/README.md @@ -0,0 +1,140 @@ +# SeaweedFS Volume Server (Rust) + +A drop-in replacement for the [SeaweedFS](https://github.com/seaweedfs/seaweedfs) Go volume server, rewritten in Rust. It uses binary-compatible storage formats (`.dat`, `.idx`, `.vif`) and speaks the same HTTP and gRPC protocols, so it works with an unmodified Go master server. + +## Building + +Requires Rust 1.75+ (2021 edition). + +```bash +cd seaweed-volume +cargo build --release +``` + +The binary is produced at `target/release/seaweed-volume`. + +## Running + +Start a Go master server first, then point the Rust volume server at it: + +```bash +# Minimal +seaweed-volume --port 8080 --master localhost:9333 --dir /data/vol1 --max 7 + +# Multiple data directories +seaweed-volume --port 8080 --master localhost:9333 \ + --dir /mnt/ssd1,/mnt/ssd2 --max 100,100 --disk ssd + +# With datacenter/rack topology +seaweed-volume --port 8080 --master localhost:9333 --dir /data/vol1 --max 7 \ + --dataCenter dc1 --rack rack1 + +# With JWT authentication +seaweed-volume --port 8080 --master localhost:9333 --dir /data/vol1 --max 7 \ + --securityFile /etc/seaweedfs/security.toml + +# With TLS (configured in security.toml via [https.volume] and [grpc.volume] sections) +seaweed-volume --port 8080 --master localhost:9333 --dir /data/vol1 --max 7 \ + --securityFile /etc/seaweedfs/security.toml +``` + +### Common flags + +| Flag | Default | Description | +|------|---------|-------------| +| `--port` | `8080` | HTTP listen port | +| `--port.grpc` | `port+10000` | gRPC listen port | +| `--master` | `localhost:9333` | Comma-separated master server addresses | +| `--dir` | `/tmp` | Comma-separated data directories | +| `--max` | `8` | Max volumes per directory (comma-separated) | +| `--ip` | auto-detect | Server IP / identifier | +| `--ip.bind` | same as `--ip` | Bind address | +| `--dataCenter` | | Datacenter name | +| `--rack` | | Rack name | +| `--disk` | | Disk type tag: `hdd`, `ssd`, or custom | +| `--index` | `memory` | Needle map type: `memory`, `leveldb`, `leveldbMedium`, `leveldbLarge` | +| `--readMode` | `proxy` | Non-local read mode: `local`, `proxy`, `redirect` | +| `--fileSizeLimitMB` | `256` | Max upload file size | +| `--minFreeSpace` | `1` (percent) | Min free disk space before marking volumes read-only | +| `--securityFile` | | Path to `security.toml` for JWT keys and TLS certs | +| `--metricsPort` | `0` (disabled) | Prometheus metrics endpoint port | +| `--whiteList` | | Comma-separated IPs with write permission | +| `--preStopSeconds` | `10` | Graceful drain period before shutdown | +| `--compactionMBps` | `0` (unlimited) | Compaction I/O rate limit | +| `--pprof` | `false` | Enable pprof HTTP handlers | + +Set `RUST_LOG=debug` (or `trace`, `info`, `warn`) for log level control. +Set `SEAWEED_WRITE_QUEUE=1` to enable batched async write processing. + +## Features + +- **Binary compatible** -- reads and writes the same `.dat`/`.idx`/`.vif` files as the Go server; seamless migration with no data conversion. +- **HTTP + gRPC** -- full implementation of the volume server HTTP API and all gRPC RPCs including streaming operations (copy, tail, incremental copy, vacuum). +- **Master heartbeat** -- bidirectional streaming heartbeat with the Go master server; volume and EC shard registration, leader failover, graceful shutdown deregistration. +- **JWT authentication** -- signing key configuration via `security.toml` with token source precedence (query > header > cookie), file_id claims validation, and separate read/write keys. +- **TLS** -- HTTPS for the HTTP API and mTLS for gRPC, configured through `security.toml`. +- **Erasure coding** -- Reed-Solomon EC shard management: mount/unmount, read, rebuild, copy, delete, and shard-to-volume reconstruction. +- **S3 remote storage** -- `FetchAndWriteNeedle` reads from any S3-compatible backend (AWS, MinIO, Wasabi, Backblaze, etc.) and writes locally. Supports `VolumeTierMoveDatToRemote`/`FromRemote` for tiered storage. +- **Needle map backends** -- in-memory HashMap, LevelDB (via `rusty-leveldb`), or redb (pure Rust disk-backed) needle maps. +- **Image processing** -- on-the-fly resize/crop, JPEG EXIF orientation auto-fix, WebP support. +- **Streaming reads** -- large files (>1MB) are streamed via `spawn_blocking` to avoid blocking the async runtime. +- **Auto-compression** -- compressible file types (text, JSON, CSS, JS, SVG, etc.) are gzip-compressed on upload. +- **Prometheus metrics** -- counters, histograms, and gauges exported at a dedicated metrics port; optional push gateway support. +- **Graceful shutdown** -- SIGINT/SIGTERM handling with configurable `preStopSeconds` drain period. + +## Testing + +### Rust unit tests + +```bash +cd seaweed-volume +cargo test +``` + +### Go integration tests + +The Go test suite can target either the Go or Rust volume server via the `VOLUME_SERVER_IMPL` environment variable: + +```bash +# Run all HTTP + gRPC integration tests against the Rust server +VOLUME_SERVER_IMPL=rust go test -v -count=1 -timeout 1200s \ + ./test/volume_server/grpc/... ./test/volume_server/http/... + +# Run a single test +VOLUME_SERVER_IMPL=rust go test -v -count=1 -timeout 60s \ + -run "TestName" ./test/volume_server/http/... + +# Run S3 remote storage tests +VOLUME_SERVER_IMPL=rust go test -v -count=1 -timeout 180s \ + -run "TestFetchAndWriteNeedle" ./test/volume_server/grpc/... +``` + +## Load testing + +A load test harness is available at `test/volume_server/loadtest/`. See that directory for usage instructions and scenarios. + +## Architecture + +The server runs three listeners concurrently: + +- **HTTP** (Axum 0.7) -- admin and public routers for file upload/download, status, and stats endpoints. +- **gRPC** (Tonic 0.12) -- all `VolumeServer` RPCs from the SeaweedFS protobuf definition. +- **Metrics** (optional) -- Prometheus scrape endpoint on a separate port. + +Key source modules: + +| Path | Description | +|------|-------------| +| `src/main.rs` | Entry point, server startup, signal handling | +| `src/config.rs` | CLI parsing and configuration resolution | +| `src/server/volume_server.rs` | HTTP router setup and middleware | +| `src/server/handlers.rs` | HTTP request handlers (read, write, delete, status) | +| `src/server/grpc_server.rs` | gRPC service implementation | +| `src/server/heartbeat.rs` | Master heartbeat loop | +| `src/storage/volume.rs` | Volume read/write/delete logic | +| `src/storage/needle.rs` | Needle (file entry) serialization | +| `src/storage/store.rs` | Multi-volume store management | +| `src/security.rs` | JWT validation and IP whitelist guard | +| `src/remote_storage/` | S3 remote storage backend | + +See [DEV_PLAN.md](DEV_PLAN.md) for the full development history and feature checklist. diff --git a/seaweed-volume/build.rs b/seaweed-volume/build.rs new file mode 100644 index 000000000..08d5cb392 --- /dev/null +++ b/seaweed-volume/build.rs @@ -0,0 +1,17 @@ +fn main() -> Result<(), Box> { + let out_dir = std::path::PathBuf::from(std::env::var("OUT_DIR")?); + tonic_build::configure() + .build_server(true) + .build_client(true) + .file_descriptor_set_path(out_dir.join("seaweed_descriptor.bin")) + .compile_protos( + &[ + "proto/volume_server.proto", + "proto/master.proto", + "proto/remote.proto", + "../weed/pb/filer.proto", + ], + &["proto/", "../weed/pb/"], + )?; + Ok(()) +} diff --git a/seaweed-volume/proto/master.proto b/seaweed-volume/proto/master.proto new file mode 100644 index 000000000..8289cd233 --- /dev/null +++ b/seaweed-volume/proto/master.proto @@ -0,0 +1,474 @@ +syntax = "proto3"; + +package master_pb; + +option go_package = "github.com/seaweedfs/seaweedfs/weed/pb/master_pb"; + +import "volume_server.proto"; + +////////////////////////////////////////////////// + +service Seaweed { + rpc SendHeartbeat (stream Heartbeat) returns (stream HeartbeatResponse) { + } + rpc KeepConnected (stream KeepConnectedRequest) returns (stream KeepConnectedResponse) { + } + rpc LookupVolume (LookupVolumeRequest) returns (LookupVolumeResponse) { + } + rpc Assign (AssignRequest) returns (AssignResponse) { + } + rpc StreamAssign (stream AssignRequest) returns (stream AssignResponse) { + } + rpc Statistics (StatisticsRequest) returns (StatisticsResponse) { + } + rpc CollectionList (CollectionListRequest) returns (CollectionListResponse) { + } + rpc CollectionDelete (CollectionDeleteRequest) returns (CollectionDeleteResponse) { + } + rpc VolumeList (VolumeListRequest) returns (VolumeListResponse) { + } + rpc LookupEcVolume (LookupEcVolumeRequest) returns (LookupEcVolumeResponse) { + } + rpc VacuumVolume (VacuumVolumeRequest) returns (VacuumVolumeResponse) { + } + rpc DisableVacuum (DisableVacuumRequest) returns (DisableVacuumResponse) { + } + rpc EnableVacuum (EnableVacuumRequest) returns (EnableVacuumResponse) { + } + rpc VolumeMarkReadonly (VolumeMarkReadonlyRequest) returns (VolumeMarkReadonlyResponse) { + } + rpc GetMasterConfiguration (GetMasterConfigurationRequest) returns (GetMasterConfigurationResponse) { + } + rpc ListClusterNodes (ListClusterNodesRequest) returns (ListClusterNodesResponse) { + } + rpc LeaseAdminToken (LeaseAdminTokenRequest) returns (LeaseAdminTokenResponse) { + } + rpc ReleaseAdminToken (ReleaseAdminTokenRequest) returns (ReleaseAdminTokenResponse) { + } + rpc Ping (PingRequest) returns (PingResponse) { + } + rpc RaftListClusterServers (RaftListClusterServersRequest) returns (RaftListClusterServersResponse) { + } + rpc RaftAddServer (RaftAddServerRequest) returns (RaftAddServerResponse) { + } + rpc RaftRemoveServer (RaftRemoveServerRequest) returns (RaftRemoveServerResponse) { + } + rpc RaftLeadershipTransfer (RaftLeadershipTransferRequest) returns (RaftLeadershipTransferResponse) { + } + rpc VolumeGrow (VolumeGrowRequest) returns (VolumeGrowResponse) { + } +} + +////////////////////////////////////////////////// + +message DiskTag { + uint32 disk_id = 1; + repeated string tags = 2; +} + +message Heartbeat { + string ip = 1; + uint32 port = 2; + string public_url = 3; + uint64 max_file_key = 5; + string data_center = 6; + string rack = 7; + uint32 admin_port = 8; + repeated VolumeInformationMessage volumes = 9; + // delta volumes + repeated VolumeShortInformationMessage new_volumes = 10; + repeated VolumeShortInformationMessage deleted_volumes = 11; + bool has_no_volumes = 12; + + // erasure coding + repeated VolumeEcShardInformationMessage ec_shards = 16; + // delta erasure coding shards + repeated VolumeEcShardInformationMessage new_ec_shards = 17; + repeated VolumeEcShardInformationMessage deleted_ec_shards = 18; + bool has_no_ec_shards = 19; + + map max_volume_counts = 4; + uint32 grpc_port = 20; + repeated string location_uuids = 21; + string id = 22; // volume server id, independent of ip:port for stable identification + + // state flags + volume_server_pb.VolumeServerState state = 23; + + repeated DiskTag disk_tags = 24; +} + +message HeartbeatResponse { + uint64 volume_size_limit = 1; + string leader = 2; + string metrics_address = 3; + uint32 metrics_interval_seconds = 4; + repeated StorageBackend storage_backends = 5; + repeated string duplicated_uuids = 6; + bool preallocate = 7; +} + +message VolumeInformationMessage { + uint32 id = 1; + uint64 size = 2; + string collection = 3; + uint64 file_count = 4; + uint64 delete_count = 5; + uint64 deleted_byte_count = 6; + bool read_only = 7; + uint32 replica_placement = 8; + uint32 version = 9; + uint32 ttl = 10; + uint32 compact_revision = 11; + int64 modified_at_second = 12; + string remote_storage_name = 13; + string remote_storage_key = 14; + string disk_type = 15; + uint32 disk_id = 16; +} + +message VolumeShortInformationMessage { + uint32 id = 1; + string collection = 3; + uint32 replica_placement = 8; + uint32 version = 9; + uint32 ttl = 10; + string disk_type = 15; + uint32 disk_id = 16; +} + +message VolumeEcShardInformationMessage { + uint32 id = 1; + string collection = 2; + uint32 ec_index_bits = 3; + string disk_type = 4; + uint64 expire_at_sec = 5; // used to record the destruction time of ec volume + uint32 disk_id = 6; + repeated int64 shard_sizes = 7; // optimized: sizes for shards in order of set bits in ec_index_bits +} + +message StorageBackend { + string type = 1; + string id = 2; + map properties = 3; +} + +message Empty { +} + +message SuperBlockExtra { + message ErasureCoding { + uint32 data = 1; + uint32 parity = 2; + repeated uint32 volume_ids = 3; + } + ErasureCoding erasure_coding = 1; +} + +message KeepConnectedRequest { + string client_type = 1; + string client_address = 3; + string version = 4; + string filer_group = 5; + string data_center = 6; + string rack = 7; +} + +message VolumeLocation { + string url = 1; + string public_url = 2; + repeated uint32 new_vids = 3; + repeated uint32 deleted_vids = 4; + string leader = 5; // optional when leader is not itself + string data_center = 6; // optional when DataCenter is in use + uint32 grpc_port = 7; + repeated uint32 new_ec_vids = 8; + repeated uint32 deleted_ec_vids = 9; +} + +message ClusterNodeUpdate { + string node_type = 1; + string address = 2; + bool is_add = 4; + string filer_group = 5; + int64 created_at_ns = 6; +} + +message KeepConnectedResponse { + VolumeLocation volume_location = 1; + ClusterNodeUpdate cluster_node_update = 2; +} + +message LookupVolumeRequest { + repeated string volume_or_file_ids = 1; + string collection = 2; // optional, a bit faster if provided. +} +message LookupVolumeResponse { + message VolumeIdLocation { + string volume_or_file_id = 1; + repeated Location locations = 2; + string error = 3; + string auth = 4; + } + repeated VolumeIdLocation volume_id_locations = 1; +} + +message Location { + string url = 1; + string public_url = 2; + uint32 grpc_port = 3; + string data_center = 4; +} + +message AssignRequest { + uint64 count = 1; + string replication = 2; + string collection = 3; + string ttl = 4; + string data_center = 5; + string rack = 6; + string data_node = 7; + uint32 memory_map_max_size_mb = 8; + uint32 writable_volume_count = 9; + string disk_type = 10; +} + +message VolumeGrowRequest { + uint32 writable_volume_count = 1; + string replication = 2; + string collection = 3; + string ttl = 4; + string data_center = 5; + string rack = 6; + string data_node = 7; + uint32 memory_map_max_size_mb = 8; + string disk_type = 9; +} + +message AssignResponse { + string fid = 1; + uint64 count = 4; + string error = 5; + string auth = 6; + repeated Location replicas = 7; + Location location = 8; +} + +message StatisticsRequest { + string replication = 1; + string collection = 2; + string ttl = 3; + string disk_type = 4; +} +message StatisticsResponse { + uint64 total_size = 4; + uint64 used_size = 5; + uint64 file_count = 6; +} + +// +// collection related +// +message Collection { + string name = 1; +} +message CollectionListRequest { + bool include_normal_volumes = 1; + bool include_ec_volumes = 2; +} +message CollectionListResponse { + repeated Collection collections = 1; +} + +message CollectionDeleteRequest { + string name = 1; +} +message CollectionDeleteResponse { +} + +// +// volume related +// +message DiskInfo { + string type = 1; + int64 volume_count = 2; + int64 max_volume_count = 3; + int64 free_volume_count = 4; + int64 active_volume_count = 5; + repeated VolumeInformationMessage volume_infos = 6; + repeated VolumeEcShardInformationMessage ec_shard_infos = 7; + int64 remote_volume_count = 8; + uint32 disk_id = 9; + repeated string tags = 10; +} +message DataNodeInfo { + string id = 1; + map diskInfos = 2; + uint32 grpc_port = 3; + string address = 4; // ip:port for connecting to the volume server +} +message RackInfo { + string id = 1; + repeated DataNodeInfo data_node_infos = 2; + map diskInfos = 3; +} +message DataCenterInfo { + string id = 1; + repeated RackInfo rack_infos = 2; + map diskInfos = 3; +} +message TopologyInfo { + string id = 1; + repeated DataCenterInfo data_center_infos = 2; + map diskInfos = 3; +} +message VolumeListRequest { +} +message VolumeListResponse { + TopologyInfo topology_info = 1; + uint64 volume_size_limit_mb = 2; +} + +message LookupEcVolumeRequest { + uint32 volume_id = 1; +} +message LookupEcVolumeResponse { + uint32 volume_id = 1; + message EcShardIdLocation { + uint32 shard_id = 1; + repeated Location locations = 2; + } + repeated EcShardIdLocation shard_id_locations = 2; +} + +message VacuumVolumeRequest { + float garbage_threshold = 1; + uint32 volume_id = 2; + string collection = 3; +} +message VacuumVolumeResponse { +} + +message DisableVacuumRequest { +} +message DisableVacuumResponse { +} + +message EnableVacuumRequest { +} +message EnableVacuumResponse { +} + +message VolumeMarkReadonlyRequest { + string ip = 1; + uint32 port = 2; + uint32 volume_id = 4; + string collection = 5; + uint32 replica_placement = 6; + uint32 version = 7; + uint32 ttl = 8; + string disk_type = 9; + bool is_readonly = 10; +} +message VolumeMarkReadonlyResponse { +} + +message GetMasterConfigurationRequest { +} +message GetMasterConfigurationResponse { + string metrics_address = 1; + uint32 metrics_interval_seconds = 2; + repeated StorageBackend storage_backends = 3; + string default_replication = 4; + string leader = 5; + uint32 volume_size_limit_m_b = 6; + bool volume_preallocate = 7; + // MIGRATION: fields 8-9 help migrate master.toml [master.maintenance] to admin script plugin. Remove after March 2027. + string maintenance_scripts = 8; + uint32 maintenance_sleep_minutes = 9; +} + +message ListClusterNodesRequest { + string client_type = 1; + string filer_group = 2; + int32 limit = 4; +} +message ListClusterNodesResponse { + message ClusterNode { + string address = 1; + string version = 2; + int64 created_at_ns = 4; + string data_center = 5; + string rack = 6; + } + repeated ClusterNode cluster_nodes = 1; +} + +message LeaseAdminTokenRequest { + int64 previous_token = 1; + int64 previous_lock_time = 2; + string lock_name = 3; + string client_name = 4; + string message = 5; +} +message LeaseAdminTokenResponse { + int64 token = 1; + int64 lock_ts_ns = 2; +} + +message ReleaseAdminTokenRequest { + int64 previous_token = 1; + int64 previous_lock_time = 2; + string lock_name = 3; +} +message ReleaseAdminTokenResponse { +} + +message PingRequest { + string target = 1; // default to ping itself + string target_type = 2; +} +message PingResponse { + int64 start_time_ns = 1; + int64 remote_time_ns = 2; + int64 stop_time_ns = 3; +} + +message RaftAddServerRequest { + string id = 1; + string address = 2; + bool voter = 3; +} +message RaftAddServerResponse { +} + +message RaftRemoveServerRequest { + string id = 1; + bool force = 2; +} +message RaftRemoveServerResponse { +} + +message RaftListClusterServersRequest { +} +message RaftListClusterServersResponse { + message ClusterServers { + string id = 1; + string address = 2; + string suffrage = 3; + bool isLeader = 4; + } + repeated ClusterServers cluster_servers = 1; +} + +message RaftLeadershipTransferRequest { + string target_id = 1; // Optional: target server ID. If empty, transfers to any eligible follower + string target_address = 2; // Optional: target server address. Required if target_id is specified +} +message RaftLeadershipTransferResponse { + string previous_leader = 1; + string new_leader = 2; +} + +message VolumeGrowResponse { +} diff --git a/seaweed-volume/proto/remote.proto b/seaweed-volume/proto/remote.proto new file mode 100644 index 000000000..9d6d81ff5 --- /dev/null +++ b/seaweed-volume/proto/remote.proto @@ -0,0 +1,76 @@ +syntax = "proto3"; + +package remote_pb; + +option go_package = "github.com/seaweedfs/seaweedfs/weed/pb/remote_pb"; +option java_package = "seaweedfs.client"; +option java_outer_classname = "FilerProto"; + +///////////////////////// +// Remote Storage related +///////////////////////// +message RemoteConf { + string type = 1; + string name = 2; + string s3_access_key = 4; + string s3_secret_key = 5; + string s3_region = 6; + string s3_endpoint = 7; + string s3_storage_class = 8; + bool s3_force_path_style = 9; + bool s3_support_tagging = 13; + bool s3_v4_signature = 11; + + string gcs_google_application_credentials = 10; + string gcs_project_id = 12; + + string azure_account_name = 15; + string azure_account_key = 16; + + string backblaze_key_id = 20; + string backblaze_application_key = 21; + string backblaze_endpoint = 22; + string backblaze_region = 23; + + string aliyun_access_key = 25; + string aliyun_secret_key = 26; + string aliyun_endpoint = 27; + string aliyun_region = 28; + + string tencent_secret_id = 30; + string tencent_secret_key = 31; + string tencent_endpoint = 32; + + string baidu_access_key = 35; + string baidu_secret_key = 36; + string baidu_endpoint = 37; + string baidu_region = 38; + + string wasabi_access_key = 40; + string wasabi_secret_key = 41; + string wasabi_endpoint = 42; + string wasabi_region = 43; + + string filebase_access_key = 60; + string filebase_secret_key = 61; + string filebase_endpoint = 62; + + string storj_access_key = 65; + string storj_secret_key = 66; + string storj_endpoint = 67; + + string contabo_access_key = 68; + string contabo_secret_key = 69; + string contabo_endpoint = 70; + string contabo_region = 71; +} + +message RemoteStorageMapping { + map mappings = 1; + string primary_bucket_storage_name = 2; +} +message RemoteStorageLocation { + string name = 1; + string bucket = 2; + string path = 3; +} diff --git a/seaweed-volume/proto/volume_server.proto b/seaweed-volume/proto/volume_server.proto new file mode 100644 index 000000000..bc5d79c69 --- /dev/null +++ b/seaweed-volume/proto/volume_server.proto @@ -0,0 +1,759 @@ +syntax = "proto3"; + +package volume_server_pb; +option go_package = "github.com/seaweedfs/seaweedfs/weed/pb/volume_server_pb"; + +import "remote.proto"; + +////////////////////////////////////////////////// + +// Persistent state for volume servers. +message VolumeServerState { + // whether the server is in maintenance (i.e. read-only) mode. + bool maintenance = 1; + // incremental version counter + uint32 version = 2; +} + +////////////////////////////////////////////////// + +service VolumeServer { + //Experts only: takes multiple fid parameters. This function does not propagate deletes to replicas. + rpc BatchDelete (BatchDeleteRequest) returns (BatchDeleteResponse) { + } + + rpc VacuumVolumeCheck (VacuumVolumeCheckRequest) returns (VacuumVolumeCheckResponse) { + } + rpc VacuumVolumeCompact (VacuumVolumeCompactRequest) returns (stream VacuumVolumeCompactResponse) { + } + rpc VacuumVolumeCommit (VacuumVolumeCommitRequest) returns (VacuumVolumeCommitResponse) { + } + rpc VacuumVolumeCleanup (VacuumVolumeCleanupRequest) returns (VacuumVolumeCleanupResponse) { + } + + rpc DeleteCollection (DeleteCollectionRequest) returns (DeleteCollectionResponse) { + } + rpc AllocateVolume (AllocateVolumeRequest) returns (AllocateVolumeResponse) { + } + + rpc VolumeSyncStatus (VolumeSyncStatusRequest) returns (VolumeSyncStatusResponse) { + } + rpc VolumeIncrementalCopy (VolumeIncrementalCopyRequest) returns (stream VolumeIncrementalCopyResponse) { + } + + rpc VolumeMount (VolumeMountRequest) returns (VolumeMountResponse) { + } + rpc VolumeUnmount (VolumeUnmountRequest) returns (VolumeUnmountResponse) { + } + rpc VolumeDelete (VolumeDeleteRequest) returns (VolumeDeleteResponse) { + } + rpc VolumeMarkReadonly (VolumeMarkReadonlyRequest) returns (VolumeMarkReadonlyResponse) { + } + rpc VolumeMarkWritable (VolumeMarkWritableRequest) returns (VolumeMarkWritableResponse) { + } + rpc VolumeConfigure (VolumeConfigureRequest) returns (VolumeConfigureResponse) { + } + rpc VolumeStatus (VolumeStatusRequest) returns (VolumeStatusResponse) { + } + + rpc GetState (GetStateRequest) returns (GetStateResponse) { + } + rpc SetState (SetStateRequest) returns (SetStateResponse) { + } + + // copy the .idx .dat files, and mount this volume + rpc VolumeCopy (VolumeCopyRequest) returns (stream VolumeCopyResponse) { + } + rpc ReadVolumeFileStatus (ReadVolumeFileStatusRequest) returns (ReadVolumeFileStatusResponse) { + } + rpc CopyFile (CopyFileRequest) returns (stream CopyFileResponse) { + } + rpc ReceiveFile (stream ReceiveFileRequest) returns (ReceiveFileResponse) { + } + + rpc ReadNeedleBlob (ReadNeedleBlobRequest) returns (ReadNeedleBlobResponse) { + } + rpc ReadNeedleMeta (ReadNeedleMetaRequest) returns (ReadNeedleMetaResponse) { + } + rpc WriteNeedleBlob (WriteNeedleBlobRequest) returns (WriteNeedleBlobResponse) { + } + rpc ReadAllNeedles (ReadAllNeedlesRequest) returns (stream ReadAllNeedlesResponse) { + } + + rpc VolumeTailSender (VolumeTailSenderRequest) returns (stream VolumeTailSenderResponse) { + } + rpc VolumeTailReceiver (VolumeTailReceiverRequest) returns (VolumeTailReceiverResponse) { + } + + // erasure coding + rpc VolumeEcShardsGenerate (VolumeEcShardsGenerateRequest) returns (VolumeEcShardsGenerateResponse) { + } + rpc VolumeEcShardsRebuild (VolumeEcShardsRebuildRequest) returns (VolumeEcShardsRebuildResponse) { + } + rpc VolumeEcShardsCopy (VolumeEcShardsCopyRequest) returns (VolumeEcShardsCopyResponse) { + } + rpc VolumeEcShardsDelete (VolumeEcShardsDeleteRequest) returns (VolumeEcShardsDeleteResponse) { + } + rpc VolumeEcShardsMount (VolumeEcShardsMountRequest) returns (VolumeEcShardsMountResponse) { + } + rpc VolumeEcShardsUnmount (VolumeEcShardsUnmountRequest) returns (VolumeEcShardsUnmountResponse) { + } + rpc VolumeEcShardRead (VolumeEcShardReadRequest) returns (stream VolumeEcShardReadResponse) { + } + rpc VolumeEcBlobDelete (VolumeEcBlobDeleteRequest) returns (VolumeEcBlobDeleteResponse) { + } + rpc VolumeEcShardsToVolume (VolumeEcShardsToVolumeRequest) returns (VolumeEcShardsToVolumeResponse) { + } + rpc VolumeEcShardsInfo (VolumeEcShardsInfoRequest) returns (VolumeEcShardsInfoResponse) { + } + + // tiered storage + rpc VolumeTierMoveDatToRemote (VolumeTierMoveDatToRemoteRequest) returns (stream VolumeTierMoveDatToRemoteResponse) { + } + rpc VolumeTierMoveDatFromRemote (VolumeTierMoveDatFromRemoteRequest) returns (stream VolumeTierMoveDatFromRemoteResponse) { + } + + rpc VolumeServerStatus (VolumeServerStatusRequest) returns (VolumeServerStatusResponse) { + } + rpc VolumeServerLeave (VolumeServerLeaveRequest) returns (VolumeServerLeaveResponse) { + } + + // remote storage + rpc FetchAndWriteNeedle (FetchAndWriteNeedleRequest) returns (FetchAndWriteNeedleResponse) { + } + + // scrubbing + rpc ScrubVolume (ScrubVolumeRequest) returns (ScrubVolumeResponse) { + } + rpc ScrubEcVolume (ScrubEcVolumeRequest) returns (ScrubEcVolumeResponse) { + } + + // query + rpc Query (QueryRequest) returns (stream QueriedStripe) { + } + + rpc VolumeNeedleStatus (VolumeNeedleStatusRequest) returns (VolumeNeedleStatusResponse) { + } + + rpc Ping (PingRequest) returns (PingResponse) { + } + +} + +////////////////////////////////////////////////// + +message BatchDeleteRequest { + repeated string file_ids = 1; + bool skip_cookie_check = 2; +} + +message BatchDeleteResponse { + repeated DeleteResult results = 1; +} +message DeleteResult { + string file_id = 1; + int32 status = 2; + string error = 3; + uint32 size = 4; + uint32 version = 5; +} + +message Empty { +} + +message VacuumVolumeCheckRequest { + uint32 volume_id = 1; +} +message VacuumVolumeCheckResponse { + double garbage_ratio = 1; +} + +message VacuumVolumeCompactRequest { + uint32 volume_id = 1; + int64 preallocate = 2; +} +message VacuumVolumeCompactResponse { + int64 processed_bytes = 1; + float load_avg_1m = 2; +} + +message VacuumVolumeCommitRequest { + uint32 volume_id = 1; +} +message VacuumVolumeCommitResponse { + bool is_read_only = 1; + uint64 volume_size = 2; +} + +message VacuumVolumeCleanupRequest { + uint32 volume_id = 1; +} +message VacuumVolumeCleanupResponse { +} + +message DeleteCollectionRequest { + string collection = 1; +} +message DeleteCollectionResponse { +} + +message AllocateVolumeRequest { + uint32 volume_id = 1; + string collection = 2; + int64 preallocate = 3; + string replication = 4; + string ttl = 5; + uint32 memory_map_max_size_mb = 6; + string disk_type = 7; + uint32 version = 8; +} +message AllocateVolumeResponse { +} + +message VolumeSyncStatusRequest { + uint32 volume_id = 1; +} +message VolumeSyncStatusResponse { + uint32 volume_id = 1; + string collection = 2; + string replication = 4; + string ttl = 5; + uint64 tail_offset = 6; + uint32 compact_revision = 7; + uint64 idx_file_size = 8; + uint32 version = 9; +} + +message VolumeIncrementalCopyRequest { + uint32 volume_id = 1; + uint64 since_ns = 2; +} +message VolumeIncrementalCopyResponse { + bytes file_content = 1; +} + +message VolumeMountRequest { + uint32 volume_id = 1; +} +message VolumeMountResponse { +} + +message VolumeUnmountRequest { + uint32 volume_id = 1; +} +message VolumeUnmountResponse { +} + +message VolumeDeleteRequest { + uint32 volume_id = 1; + bool only_empty = 2; +} +message VolumeDeleteResponse { +} + +message VolumeMarkReadonlyRequest { + uint32 volume_id = 1; + bool persist = 2; +} +message VolumeMarkReadonlyResponse { +} + +message VolumeMarkWritableRequest { + uint32 volume_id = 1; +} +message VolumeMarkWritableResponse { +} + +message VolumeConfigureRequest { + uint32 volume_id = 1; + string replication = 2; +} +message VolumeConfigureResponse { + string error = 1; +} + +message VolumeStatusRequest { + uint32 volume_id = 1; +} +message VolumeStatusResponse { + bool is_read_only = 1; + uint64 volume_size = 2; + uint64 file_count = 3; + uint64 file_deleted_count = 4; +} + +message GetStateRequest { +} +message GetStateResponse { + VolumeServerState state = 1; +} + +message SetStateRequest { + // SetState updates *all* volume server flags at once. Retrieve state with GetState(), + // modify individual flags as required, then call this RPC to update. + VolumeServerState state = 1; +} +message SetStateResponse { + VolumeServerState state = 1; +} + +message VolumeCopyRequest { + uint32 volume_id = 1; + string collection = 2; + string replication = 3; + string ttl = 4; + string source_data_node = 5; + string disk_type = 6; + int64 io_byte_per_second = 7; +} +message VolumeCopyResponse { + uint64 last_append_at_ns = 1; + int64 processed_bytes = 2; +} + +message CopyFileRequest { + uint32 volume_id = 1; + string ext = 2; + uint32 compaction_revision = 3; + uint64 stop_offset = 4; + string collection = 5; + bool is_ec_volume = 6; + bool ignore_source_file_not_found = 7; +} +message CopyFileResponse { + bytes file_content = 1; + int64 modified_ts_ns = 2; +} + +message ReceiveFileRequest { + oneof data { + ReceiveFileInfo info = 1; + bytes file_content = 2; + } +} + +message ReceiveFileInfo { + uint32 volume_id = 1; + string ext = 2; + string collection = 3; + bool is_ec_volume = 4; + uint32 shard_id = 5; + uint64 file_size = 6; +} + +message ReceiveFileResponse { + uint64 bytes_written = 1; + string error = 2; +} + +message ReadNeedleBlobRequest { + uint32 volume_id = 1; + int64 offset = 3; // actual offset + int32 size = 4; +} +message ReadNeedleBlobResponse { + bytes needle_blob = 1; +} + +message ReadNeedleMetaRequest { + uint32 volume_id = 1; + uint64 needle_id = 2; + int64 offset = 3; // actual offset + int32 size = 4; +} +message ReadNeedleMetaResponse { + uint32 cookie = 1; + uint64 last_modified = 2; + uint32 crc = 3; + string ttl = 4; + uint64 append_at_ns = 5; +} + +message WriteNeedleBlobRequest { + uint32 volume_id = 1; + uint64 needle_id = 2; + int32 size = 3; + bytes needle_blob = 4; +} +message WriteNeedleBlobResponse { +} + +message ReadAllNeedlesRequest { + repeated uint32 volume_ids = 1; +} +message ReadAllNeedlesResponse { + uint32 volume_id = 1; + uint64 needle_id = 2; + uint32 cookie = 3; + bytes needle_blob = 5; + bool needle_blob_compressed = 6; + uint64 last_modified = 7; + uint32 crc = 8; + bytes name = 9; + bytes mime = 10; +} + +message VolumeTailSenderRequest { + uint32 volume_id = 1; + uint64 since_ns = 2; + uint32 idle_timeout_seconds = 3; +} +message VolumeTailSenderResponse { + bytes needle_header = 1; + bytes needle_body = 2; + bool is_last_chunk = 3; + uint32 version = 4; +} + +message VolumeTailReceiverRequest { + uint32 volume_id = 1; + uint64 since_ns = 2; + uint32 idle_timeout_seconds = 3; + string source_volume_server = 4; +} +message VolumeTailReceiverResponse { +} + +message VolumeEcShardsGenerateRequest { + uint32 volume_id = 1; + string collection = 2; +} +message VolumeEcShardsGenerateResponse { +} + +message VolumeEcShardsRebuildRequest { + uint32 volume_id = 1; + string collection = 2; +} +message VolumeEcShardsRebuildResponse { + repeated uint32 rebuilt_shard_ids = 1; +} + +message VolumeEcShardsCopyRequest { + uint32 volume_id = 1; + string collection = 2; + repeated uint32 shard_ids = 3; + bool copy_ecx_file = 4; + string source_data_node = 5; + bool copy_ecj_file = 6; + bool copy_vif_file = 7; + uint32 disk_id = 8; // Target disk ID for storing EC shards +} +message VolumeEcShardsCopyResponse { +} + +message VolumeEcShardsDeleteRequest { + uint32 volume_id = 1; + string collection = 2; + repeated uint32 shard_ids = 3; +} +message VolumeEcShardsDeleteResponse { +} + +message VolumeEcShardsMountRequest { + uint32 volume_id = 1; + string collection = 2; + repeated uint32 shard_ids = 3; +} +message VolumeEcShardsMountResponse { +} + +message VolumeEcShardsUnmountRequest { + uint32 volume_id = 1; + repeated uint32 shard_ids = 3; +} +message VolumeEcShardsUnmountResponse { +} + +message VolumeEcShardReadRequest { + uint32 volume_id = 1; + uint32 shard_id = 2; + int64 offset = 3; + int64 size = 4; + uint64 file_key = 5; +} +message VolumeEcShardReadResponse { + bytes data = 1; + bool is_deleted = 2; +} + +message VolumeEcBlobDeleteRequest { + uint32 volume_id = 1; + string collection = 2; + uint64 file_key = 3; + uint32 version = 4; +} +message VolumeEcBlobDeleteResponse { +} + +message VolumeEcShardsToVolumeRequest { + uint32 volume_id = 1; + string collection = 2; +} +message VolumeEcShardsToVolumeResponse { +} + +message VolumeEcShardsInfoRequest { + uint32 volume_id = 1; +} +message VolumeEcShardsInfoResponse { + repeated EcShardInfo ec_shard_infos = 1; + uint64 volume_size = 2; + uint64 file_count = 3; + uint64 file_deleted_count = 4; +} + +message EcShardInfo { + uint32 shard_id = 1; + int64 size = 2; + string collection = 3; + uint32 volume_id = 4; +} + +message ReadVolumeFileStatusRequest { + uint32 volume_id = 1; +} +message ReadVolumeFileStatusResponse { + uint32 volume_id = 1; + uint64 idx_file_timestamp_seconds = 2; + uint64 idx_file_size = 3; + uint64 dat_file_timestamp_seconds = 4; + uint64 dat_file_size = 5; + uint64 file_count = 6; + uint32 compaction_revision = 7; + string collection = 8; + string disk_type = 9; + VolumeInfo volume_info = 10; + uint32 version = 11; +} + +message DiskStatus { + string dir = 1; + uint64 all = 2; + uint64 used = 3; + uint64 free = 4; + float percent_free = 5; + float percent_used = 6; + string disk_type = 7; +} + +message MemStatus { + int32 goroutines = 1; + uint64 all = 2; + uint64 used = 3; + uint64 free = 4; + uint64 self = 5; + uint64 heap = 6; + uint64 stack = 7; +} + +// tired storage on volume servers +message RemoteFile { + string backend_type = 1; + string backend_id = 2; + string key = 3; + uint64 offset = 4; + uint64 file_size = 5; + uint64 modified_time = 6; + string extension = 7; +} +message VolumeInfo { + repeated RemoteFile files = 1; + uint32 version = 2; + string replication = 3; + uint32 bytes_offset = 4; + int64 dat_file_size = 5; // store the original dat file size + uint64 expire_at_sec = 6; // expiration time of ec volume + bool read_only = 7; + EcShardConfig ec_shard_config = 8; // EC shard configuration (optional, null = use default 10+4) +} + +// EcShardConfig specifies erasure coding shard configuration +message EcShardConfig { + uint32 data_shards = 1; // Number of data shards (e.g., 10) + uint32 parity_shards = 2; // Number of parity shards (e.g., 4) +} +message OldVersionVolumeInfo { + repeated RemoteFile files = 1; + uint32 version = 2; + string replication = 3; + uint32 BytesOffset = 4; + int64 dat_file_size = 5; // store the original dat file size + uint64 DestroyTime = 6; // expiration time of ec volume + bool read_only = 7; +} + +// tiered storage +message VolumeTierMoveDatToRemoteRequest { + uint32 volume_id = 1; + string collection = 2; + string destination_backend_name = 3; + bool keep_local_dat_file = 4; +} +message VolumeTierMoveDatToRemoteResponse { + int64 processed = 1; + float processedPercentage = 2; +} + +message VolumeTierMoveDatFromRemoteRequest { + uint32 volume_id = 1; + string collection = 2; + bool keep_remote_dat_file = 3; +} +message VolumeTierMoveDatFromRemoteResponse { + int64 processed = 1; + float processedPercentage = 2; +} + +message VolumeServerStatusRequest { + +} +message VolumeServerStatusResponse { + repeated DiskStatus disk_statuses = 1; + MemStatus memory_status = 2; + string version = 3; + string data_center = 4; + string rack = 5; + VolumeServerState state = 6; +} + +message VolumeServerLeaveRequest { +} +message VolumeServerLeaveResponse { +} + +// remote storage +message FetchAndWriteNeedleRequest { + uint32 volume_id = 1; + uint64 needle_id = 2; + uint32 cookie = 3; + int64 offset = 4; + int64 size = 5; + message Replica { + string url = 1; + string public_url = 2; + int32 grpc_port = 3; + } + repeated Replica replicas = 6; + string auth = 7; + // remote conf + remote_pb.RemoteConf remote_conf = 15; + remote_pb.RemoteStorageLocation remote_location = 16; +} +message FetchAndWriteNeedleResponse { + string e_tag = 1; +} + +enum VolumeScrubMode { + UNKNOWN = 0; + INDEX = 1; + FULL = 2; + LOCAL = 3; +} + +message ScrubVolumeRequest { + VolumeScrubMode mode = 1; + // optional list of volume IDs to scrub. if empty, all volumes for the server are scrubbed. + repeated uint32 volume_ids = 2; + bool mark_broken_volumes_readonly = 3; +} +message ScrubVolumeResponse { + uint64 total_volumes = 1; + uint64 total_files = 2; + repeated uint32 broken_volume_ids = 3; + repeated string details = 4; +} + +message ScrubEcVolumeRequest { + VolumeScrubMode mode = 1; + // optional list of volume IDs to scrub. if empty, all EC volumes for the server are scrubbed. + repeated uint32 volume_ids = 2; +} +message ScrubEcVolumeResponse { + uint64 total_volumes = 1; + uint64 total_files = 2; + repeated uint32 broken_volume_ids = 3; + repeated EcShardInfo broken_shard_infos = 4; + repeated string details = 5; +} + +// select on volume servers +message QueryRequest { + repeated string selections = 1; + repeated string from_file_ids = 2; + message Filter { + string field = 1; + string operand = 2; + string value = 3; + } + Filter filter = 3; + + message InputSerialization { + // NONE | GZIP | BZIP2 + string compression_type = 1; + message CSVInput { + string file_header_info = 1; // Valid values: NONE | USE | IGNORE + string record_delimiter = 2; // Default: \n + string field_delimiter = 3; // Default: , + string quote_character = 4; // Default: " + string quote_escape_character = 5; // Default: " + string comments = 6; // Default: # + // If true, records might contain record delimiters within quote characters + bool allow_quoted_record_delimiter = 7; // default False. + } + message JSONInput { + string type = 1; // Valid values: DOCUMENT | LINES + } + message ParquetInput { + } + + CSVInput csv_input = 2; + JSONInput json_input = 3; + ParquetInput parquet_input = 4; + } + InputSerialization input_serialization = 4; + + message OutputSerialization { + message CSVOutput { + string quote_fields = 1; // Valid values: ALWAYS | ASNEEDED + string record_delimiter = 2; // Default: \n + string field_delimiter = 3; // Default: , + string quote_character = 4; // Default: " + string quote_escape_character = 5; // Default: " + } + message JSONOutput { + string record_delimiter = 1; + } + + CSVOutput csv_output = 2; + JSONOutput json_output = 3; + } + + OutputSerialization output_serialization = 5; +} +message QueriedStripe { + bytes records = 1; +} + +message VolumeNeedleStatusRequest { + uint32 volume_id = 1; + uint64 needle_id = 2; +} +message VolumeNeedleStatusResponse { + uint64 needle_id = 1; + uint32 cookie = 2; + uint32 size = 3; + uint64 last_modified = 4; + uint32 crc = 5; + string ttl = 6; +} + +message PingRequest { + string target = 1; // default to ping itself + string target_type = 2; +} +message PingResponse { + int64 start_time_ns = 1; + int64 remote_time_ns = 2; + int64 stop_time_ns = 3; +} diff --git a/seaweed-volume/src/config.rs b/seaweed-volume/src/config.rs new file mode 100644 index 000000000..ce50b1374 --- /dev/null +++ b/seaweed-volume/src/config.rs @@ -0,0 +1,1697 @@ +use clap::Parser; +use std::net::UdpSocket; +use std::path::{Path, PathBuf}; + +use crate::security::tls::TlsPolicy; + +/// SeaweedFS Volume Server (Rust implementation) +/// +/// Start a volume server to provide storage spaces. +#[derive(Parser, Debug)] +#[command(name = "weed-volume", version, about)] +pub struct Cli { + /// HTTP listen port + #[arg(long = "port", default_value_t = 8080)] + pub port: u16, + + /// gRPC listen port. If 0, defaults to port + 10000. + #[arg(long = "port.grpc", default_value_t = 0)] + pub port_grpc: u16, + + /// Port opened to public. If 0, defaults to same as --port. + #[arg(long = "port.public", default_value_t = 0)] + pub port_public: u16, + + /// IP or server name, also used as identifier. + /// If empty, auto-detected. + #[arg(long = "ip", default_value = "")] + pub ip: String, + + /// Volume server ID. If empty, defaults to ip:port. + #[arg(long = "id", default_value = "")] + pub id: String, + + /// Publicly accessible address. + #[arg(long = "publicUrl", default_value = "")] + pub public_url: String, + + /// IP address to bind to. If empty, defaults to same as --ip. + #[arg(long = "ip.bind", default_value = "")] + pub bind_ip: String, + + /// Comma-separated master server addresses. + #[arg(long = "master", default_value = "localhost:9333")] + pub master: String, + + /// Comma-separated master servers (deprecated, use --master instead). + #[arg(long = "mserver", default_value = "")] + pub mserver: String, + + /// Number of seconds between stop sending heartbeats and stopping the volume server. + #[arg(long = "preStopSeconds", default_value_t = 10)] + pub pre_stop_seconds: u32, + + /// Connection idle seconds. + #[arg(long = "idleTimeout", default_value_t = 30)] + pub idle_timeout: u32, + + /// Current volume server's data center name. + #[arg(long = "dataCenter", default_value = "")] + pub data_center: String, + + /// Current volume server's rack name. + #[arg(long = "rack", default_value = "")] + pub rack: String, + + /// Choose [memory|leveldb|leveldbMedium|leveldbLarge] mode for memory~performance balance. + #[arg(long = "index", default_value = "memory")] + pub index: String, + + /// [hdd|ssd|] hard drive or solid state drive or any tag. + #[arg(long = "disk", default_value = "")] + pub disk: String, + + /// Comma-separated tag groups per data dir; each group uses ':' (e.g. fast:ssd,archive). + #[arg(long = "tags", default_value = "")] + pub tags: String, + + /// Adjust jpg orientation when uploading. + #[arg(long = "images.fix.orientation", default_value_t = false)] + pub fix_jpg_orientation: bool, + + /// [local|proxy|redirect] how to deal with non-local volume. + #[arg(long = "readMode", default_value = "proxy")] + pub read_mode: String, + + /// CPU profile output file. + #[arg(long = "cpuprofile", default_value = "")] + pub cpu_profile: String, + + /// Memory profile output file. + #[arg(long = "memprofile", default_value = "")] + pub mem_profile: String, + + /// Limit background compaction or copying speed in mega bytes per second. + #[arg(long = "compactionMBps", default_value_t = 0)] + pub compaction_mb_per_second: u32, + + /// Limit maintenance (replication/balance) IO rate in MB/s. 0 means no limit. + #[arg(long = "maintenanceMBps", default_value_t = 0)] + pub maintenance_mb_per_second: u32, + + /// Limit file size to avoid out of memory. + #[arg(long = "fileSizeLimitMB", default_value_t = 256)] + pub file_size_limit_mb: u32, + + /// Limit total concurrent upload size in MB, 0 means unlimited. + #[arg(long = "concurrentUploadLimitMB", default_value_t = 0)] + pub concurrent_upload_limit_mb: u32, + + /// Limit total concurrent download size in MB, 0 means unlimited. + #[arg(long = "concurrentDownloadLimitMB", default_value_t = 0)] + pub concurrent_download_limit_mb: u32, + + /// Enable pprof-equivalent HTTP handlers. Precludes --memprofile and --cpuprofile. + #[arg(long = "pprof", default_value_t = false)] + pub pprof: bool, + + /// Prometheus metrics listen port. + #[arg(long = "metricsPort", default_value_t = 0)] + pub metrics_port: u16, + + /// Metrics listen IP. If empty, defaults to same as --ip.bind. + #[arg(long = "metricsIp", default_value = "")] + pub metrics_ip: String, + + /// Directories to store data files. dir[,dir]... + /// If empty, defaults to the platform temp directory (Go's os.TempDir()). + #[arg(long = "dir", default_value = "")] + pub dir: String, + + /// Directory to store .idx files. + #[arg(long = "dir.idx", default_value = "")] + pub dir_idx: String, + + /// Maximum numbers of volumes, count[,count]... + /// If set to zero, the limit will be auto configured as free disk space divided by volume size. + #[arg(long = "max", default_value = "8")] + pub max: String, + + /// Comma separated IP addresses having write permission. No limit if empty. + #[arg(long = "whiteList", default_value = "")] + pub white_list: String, + + /// Minimum free disk space (default to 1%). Low disk space will mark all volumes as ReadOnly. + /// Deprecated: use --minFreeSpace instead. + #[arg(long = "minFreeSpacePercent", default_value = "1")] + pub min_free_space_percent: String, + + /// Min free disk space (value<=100 as percentage like 1, other as human readable bytes, like 10GiB). + /// Low disk space will mark all volumes as ReadOnly. + #[arg(long = "minFreeSpace", default_value = "")] + pub min_free_space: String, + + /// Inflight upload data wait timeout of volume servers. + #[arg(long = "inflightUploadDataTimeout", default_value = "60s")] + pub inflight_upload_data_timeout: String, + + /// Inflight download data wait timeout of volume servers. + #[arg(long = "inflightDownloadDataTimeout", default_value = "60s")] + pub inflight_download_data_timeout: String, + + /// if true, prevents slow reads from blocking other requests, + /// but large file read P99 latency will increase. + #[arg(long = "hasSlowRead", default_value_t = true)] + pub has_slow_read: bool, + + /// larger values can optimize query performance but will increase memory usage. + /// Use with hasSlowRead normally. + #[arg(long = "readBufferSizeMB", default_value_t = 4)] + pub read_buffer_size_mb: u32, + + /// Alive time for leveldb (default to 0). If leveldb of volume is not accessed in + /// ldbTimeout hours, it will be offloaded to reduce opened files and memory consumption. + #[arg(long = "index.leveldbTimeout", default_value_t = 0)] + pub ldb_timeout: i64, + + /// Serves runtime profiling data on the port specified by --debug.port. + #[arg(long = "debug", default_value_t = false)] + pub debug: bool, + + /// HTTP port for debugging. + #[arg(long = "debug.port", default_value_t = 6060)] + pub debug_port: u16, + + /// Path to security.toml configuration file for JWT signing keys. + #[arg(long = "securityFile", default_value = "")] + pub security_file: String, + + /// A file of command line options, each line in optionName=optionValue format. + #[arg(long = "options", default_value = "")] + pub options: String, +} + +/// Resolved configuration after applying defaults and validation. +#[derive(Debug)] +pub struct VolumeServerConfig { + pub port: u16, + pub grpc_port: u16, + pub public_port: u16, + pub ip: String, + pub bind_ip: String, + pub public_url: String, + pub id: String, + pub masters: Vec, + pub pre_stop_seconds: u32, + pub idle_timeout: u32, + pub data_center: String, + pub rack: String, + pub index_type: NeedleMapKind, + pub disk_type: String, + pub folders: Vec, + pub folder_max_limits: Vec, + pub folder_tags: Vec>, + pub min_free_spaces: Vec, + pub disk_types: Vec, + pub idx_folder: String, + pub white_list: Vec, + pub fix_jpg_orientation: bool, + pub read_mode: ReadMode, + pub cpu_profile: String, + pub mem_profile: String, + pub compaction_byte_per_second: i64, + pub maintenance_byte_per_second: i64, + pub file_size_limit_bytes: i64, + pub concurrent_upload_limit: i64, + pub concurrent_download_limit: i64, + pub inflight_upload_data_timeout: std::time::Duration, + pub inflight_download_data_timeout: std::time::Duration, + pub has_slow_read: bool, + pub read_buffer_size_mb: u32, + pub ldb_timeout: i64, + pub pprof: bool, + pub metrics_port: u16, + pub metrics_ip: String, + pub debug: bool, + pub debug_port: u16, + pub ui_enabled: bool, + pub jwt_signing_key: Vec, + pub jwt_signing_expires_seconds: i64, + pub jwt_read_signing_key: Vec, + pub jwt_read_signing_expires_seconds: i64, + pub https_cert_file: String, + pub https_key_file: String, + pub https_ca_file: String, + pub https_client_enabled: bool, + pub https_client_cert_file: String, + pub https_client_key_file: String, + pub https_client_ca_file: String, + pub grpc_cert_file: String, + pub grpc_key_file: String, + pub grpc_ca_file: String, + pub grpc_allowed_wildcard_domain: String, + pub grpc_volume_allowed_common_names: Vec, + pub tls_policy: TlsPolicy, + /// Enable batched write queue for improved throughput under load. + pub enable_write_queue: bool, + /// Path to security.toml — stored for SIGHUP reload. + pub security_file: String, +} + +pub use crate::storage::needle_map::NeedleMapKind; + +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +pub enum ReadMode { + Local, + Proxy, + Redirect, +} + +#[derive(Debug, Clone)] +pub enum MinFreeSpace { + Percent(f64), + Bytes(u64), +} + +/// Convert single-dash long options to double-dash for clap compatibility. +/// Go's `flag` package uses `-port`, clap expects `--port`. +/// This allows both `-port 8080` and `--port 8080` to work. +fn normalize_args_vec(args: Vec) -> Vec { + let mut args = args; + // Skip args[0] (binary name). + let mut i = 1; + while i < args.len() { + let arg = &args[i]; + // Stop processing after "--" + if arg == "--" { + break; + } + // Already double-dash or not a flag: leave as-is + if arg.starts_with("--") || !arg.starts_with('-') { + i += 1; + continue; + } + // Single char flags like -h, -V: leave as-is + let without_dash = &arg[1..]; + // Check if it's a single-dash long option: more than 1 char and not a negative number + if without_dash.len() > 1 && !without_dash.starts_with(|c: char| c.is_ascii_digit()) { + // Handle -key=value format + if let Some(eq_pos) = without_dash.find('=') { + let key = &without_dash[..eq_pos]; + if key.len() > 1 { + args[i] = format!("--{}", without_dash); + } + } else { + args[i] = format!("-{}", arg); + } + } + i += 1; + } + args +} + +/// Parse CLI arguments and resolve all defaults — mirroring Go's `runVolume()` + `startVolumeServer()`. +/// +/// Supports `-options ` to load defaults from a file (same format as Go's fla9). +/// CLI arguments take precedence over file values. +pub fn parse_cli() -> VolumeServerConfig { + let args: Vec = std::env::args().collect(); + let normalized = normalize_args_vec(args); + let merged = merge_options_file(normalized); + let cli = Cli::parse_from(merged); + resolve_config(cli) +} + +/// Find `-options`/`--options` in args, parse the referenced file, and inject +/// file-based defaults for any flags not already set on the command line. +/// +/// File format (matching Go's fla9.ParseFile): +/// - One option per line: `key=value`, `key value`, or `key:value` +/// - Lines starting with `#` are comments; blank lines are ignored +/// - Leading `-` on key names is stripped +/// - CLI arguments take precedence over file values +fn merge_options_file(args: Vec) -> Vec { + // Find the options file path from the args + let options_path = find_options_arg(&args); + if options_path.is_empty() { + return args; + } + + let content = match std::fs::read_to_string(&options_path) { + Ok(c) => c, + Err(e) => { + eprintln!( + "WARNING: could not read options file {}: {}", + options_path, e + ); + return args; + } + }; + + // Collect which flags are already explicitly set on the command line. + let mut cli_flags: std::collections::HashSet = std::collections::HashSet::new(); + let mut i = 1; // skip binary name + while i < args.len() { + let arg = &args[i]; + if arg == "--" { + break; + } + if arg.starts_with("--") { + let key = if let Some(eq) = arg.find('=') { + arg[2..eq].to_string() + } else { + arg[2..].to_string() + }; + cli_flags.insert(key); + } else if arg.starts_with('-') && arg.len() > 2 { + // Single-dash long option (already normalized to -- at this point, + // but handle both for safety) + let without_dash = &arg[1..]; + let key = if let Some(eq) = without_dash.find('=') { + without_dash[..eq].to_string() + } else { + without_dash.to_string() + }; + cli_flags.insert(key); + } + i += 1; + } + + // Parse file and append missing options + let mut extra_args: Vec = Vec::new(); + for line in content.lines() { + let trimmed = line.trim(); + if trimmed.is_empty() || trimmed.starts_with('#') { + continue; + } + + // Split on first `=`, ` `, or `:` + let (name, value) = + if let Some(pos) = trimmed.find(|c: char| c == '=' || c == ' ' || c == ':') { + ( + trimmed[..pos].trim().to_string(), + trimmed[pos + 1..].trim().to_string(), + ) + } else { + (trimmed.to_string(), String::new()) + }; + + // Strip leading dashes from name + let name = name.trim_start_matches('-').to_string(); + if name.is_empty() || name == "options" { + continue; + } + + // Skip if already set on CLI + if cli_flags.contains(&name) { + continue; + } + + extra_args.push(format!("--{}", name)); + if !value.is_empty() { + extra_args.push(value); + } + } + + let mut merged = args; + merged.extend(extra_args); + merged +} + +/// Extract the options file path from args (looks for --options or -options). +fn find_options_arg(args: &[String]) -> String { + for i in 1..args.len() { + if args[i] == "--options" || args[i] == "-options" { + if i + 1 < args.len() { + return args[i + 1].clone(); + } + } + if let Some(rest) = args[i].strip_prefix("--options=") { + return rest.to_string(); + } + if let Some(rest) = args[i].strip_prefix("-options=") { + return rest.to_string(); + } + } + String::new() +} + +/// Parse a duration string like "60s", "5m", "1h" into a std::time::Duration. +fn parse_duration(s: &str) -> std::time::Duration { + let s = s.trim(); + if s.is_empty() { + return std::time::Duration::from_secs(60); + } + if let Some(secs) = s.strip_suffix('s') { + if let Ok(v) = secs.parse::() { + return std::time::Duration::from_secs(v); + } + } + if let Some(mins) = s.strip_suffix('m') { + if let Ok(v) = mins.parse::() { + return std::time::Duration::from_secs(v * 60); + } + } + if let Some(hours) = s.strip_suffix('h') { + if let Ok(v) = hours.parse::() { + return std::time::Duration::from_secs(v * 3600); + } + } + // Fallback: try parsing as raw seconds + if let Ok(v) = s.parse::() { + return std::time::Duration::from_secs(v); + } + std::time::Duration::from_secs(60) +} + +/// Parse minFreeSpace / minFreeSpacePercent into MinFreeSpace values. +/// Mirrors Go's `util.MustParseMinFreeSpace()`. +fn parse_min_free_spaces(min_free_space: &str, min_free_space_percent: &str) -> Vec { + // If --minFreeSpace is provided, use it (takes precedence). + let source = if !min_free_space.is_empty() { + min_free_space + } else { + min_free_space_percent + }; + + source + .split(',') + .map(|s| { + let s = s.trim(); + // Try parsing as a percentage (value <= 100) + if let Ok(v) = s.parse::() { + if v <= 100.0 { + return MinFreeSpace::Percent(v); + } + // Treat as bytes if > 100 + return MinFreeSpace::Bytes(v as u64); + } + // Try parsing human-readable bytes: e.g. "10GiB", "500MiB", "1TiB" + let s_upper = s.to_uppercase(); + if let Some(rest) = s_upper.strip_suffix("TIB") { + if let Ok(v) = rest.trim().parse::() { + return MinFreeSpace::Bytes((v * 1024.0 * 1024.0 * 1024.0 * 1024.0) as u64); + } + } + if let Some(rest) = s_upper.strip_suffix("GIB") { + if let Ok(v) = rest.trim().parse::() { + return MinFreeSpace::Bytes((v * 1024.0 * 1024.0 * 1024.0) as u64); + } + } + if let Some(rest) = s_upper.strip_suffix("MIB") { + if let Ok(v) = rest.trim().parse::() { + return MinFreeSpace::Bytes((v * 1024.0 * 1024.0) as u64); + } + } + if let Some(rest) = s_upper.strip_suffix("KIB") { + if let Ok(v) = rest.trim().parse::() { + return MinFreeSpace::Bytes((v * 1024.0) as u64); + } + } + if let Some(rest) = s_upper.strip_suffix("TB") { + if let Ok(v) = rest.trim().parse::() { + return MinFreeSpace::Bytes((v * 1_000_000_000_000.0) as u64); + } + } + if let Some(rest) = s_upper.strip_suffix("GB") { + if let Ok(v) = rest.trim().parse::() { + return MinFreeSpace::Bytes((v * 1_000_000_000.0) as u64); + } + } + if let Some(rest) = s_upper.strip_suffix("MB") { + if let Ok(v) = rest.trim().parse::() { + return MinFreeSpace::Bytes((v * 1_000_000.0) as u64); + } + } + // Default: 1% + MinFreeSpace::Percent(1.0) + }) + .collect() +} + +/// Parse comma-separated tag groups like "fast:ssd,archive" into per-folder tag vectors. +/// Mirrors Go's `parseVolumeTags()`. +fn parse_volume_tags(tags_arg: &str, folder_count: usize) -> Vec> { + if folder_count == 0 { + return vec![]; + } + let tags_arg = tags_arg.trim(); + let tag_entries: Vec<&str> = if tags_arg.is_empty() { + vec![] + } else { + tags_arg.split(',').collect() + }; + + let mut folder_tags: Vec> = vec![vec![]; folder_count]; + + if tag_entries.len() == 1 && !tag_entries[0].is_empty() { + // Single entry: replicate to all folders + let normalized: Vec = tag_entries[0] + .split(':') + .map(|t| t.trim().to_lowercase()) + .filter(|t| !t.is_empty()) + .collect(); + for tags in folder_tags.iter_mut() { + *tags = normalized.clone(); + } + } else { + for (i, tags) in folder_tags.iter_mut().enumerate() { + if i < tag_entries.len() { + *tags = tag_entries[i] + .split(':') + .map(|t| t.trim().to_lowercase()) + .filter(|t| !t.is_empty()) + .collect(); + } + } + } + + folder_tags +} + +fn resolve_config(cli: Cli) -> VolumeServerConfig { + // Backward compatibility: --mserver overrides --master + let master_string = if !cli.mserver.is_empty() { + &cli.mserver + } else { + &cli.master + }; + let masters: Vec = master_string + .split(',') + .map(|s| s.trim().to_string()) + .filter(|s| !s.is_empty()) + .collect(); + + // Parse folders + let dir_value = if cli.dir.trim().is_empty() { + default_volume_dir() + } else { + cli.dir + }; + let folders: Vec = dir_value + .split(',') + .map(|s| s.trim().to_string()) + .filter(|s| !s.is_empty()) + .collect(); + let folder_count = folders.len(); + + // Parse max volume counts + let mut folder_max_limits: Vec = cli + .max + .split(',') + .map(|s| { + s.trim().parse::().unwrap_or_else(|_| { + panic!("The max specified in --max is not a valid number: {}", s) + }) + }) + .collect(); + // Replicate single value to all folders + if folder_max_limits.len() == 1 && folder_count > 1 { + let v = folder_max_limits[0]; + folder_max_limits.resize(folder_count, v); + } + if folders.len() != folder_max_limits.len() { + panic!( + "{} directories by --dir, but only {} max is set by --max", + folders.len(), + folder_max_limits.len() + ); + } + + // Parse min free spaces + let mut min_free_spaces = + parse_min_free_spaces(&cli.min_free_space, &cli.min_free_space_percent); + if min_free_spaces.len() == 1 && folder_count > 1 { + let v = min_free_spaces[0].clone(); + min_free_spaces.resize(folder_count, v); + } + if folders.len() != min_free_spaces.len() { + panic!( + "{} directories by --dir, but only {} minFreeSpace values", + folders.len(), + min_free_spaces.len() + ); + } + + // Parse disk types + let mut disk_types: Vec = cli.disk.split(',').map(|s| s.trim().to_string()).collect(); + if disk_types.len() == 1 && folder_count > 1 { + let v = disk_types[0].clone(); + disk_types.resize(folder_count, v); + } + if folders.len() != disk_types.len() { + panic!( + "{} directories by --dir, but only {} disk types by --disk", + folders.len(), + disk_types.len() + ); + } + + // Parse tags + let folder_tags = parse_volume_tags(&cli.tags, folder_count); + + // Resolve IP + let ip = if cli.ip.is_empty() { + detect_host_address() + } else { + cli.ip + }; + + // Resolve bind IP + let bind_ip = if cli.bind_ip.is_empty() { + ip.clone() + } else { + cli.bind_ip + }; + + // Resolve public port + let public_port = if cli.port_public == 0 { + cli.port + } else { + cli.port_public + }; + + // Resolve gRPC port + let grpc_port = if cli.port_grpc == 0 { + 10000 + cli.port + } else { + cli.port_grpc + }; + + // Resolve public URL + let public_url = if cli.public_url.is_empty() { + format!("{}:{}", ip, public_port) + } else { + cli.public_url + }; + + // Resolve volume server ID + let id = if cli.id.is_empty() { + format!("{}:{}", ip, cli.port) + } else { + cli.id + }; + + // Resolve metrics IP + let metrics_ip = if !cli.metrics_ip.is_empty() { + cli.metrics_ip + } else if !bind_ip.is_empty() { + bind_ip.clone() + } else { + ip.clone() + }; + + // Parse index type + let index_type = match cli.index.as_str() { + "memory" => NeedleMapKind::InMemory, + "leveldb" => NeedleMapKind::LevelDb, + "leveldbMedium" => NeedleMapKind::LevelDbMedium, + "leveldbLarge" => NeedleMapKind::LevelDbLarge, + other => panic!( + "Unknown index type: {}. Use memory|leveldb|leveldbMedium|leveldbLarge", + other + ), + }; + + // Parse read mode + let read_mode = match cli.read_mode.as_str() { + "local" => ReadMode::Local, + "proxy" => ReadMode::Proxy, + "redirect" => ReadMode::Redirect, + other => panic!("Unknown readMode: {}. Use local|proxy|redirect", other), + }; + + // Parse security config from TOML file + let sec = parse_security_config(&cli.security_file); + + // Parse whitelist: merge CLI --whiteList with guard.white_list from security.toml + let mut white_list: Vec = cli + .white_list + .split(',') + .map(|s| s.trim().to_string()) + .filter(|s| !s.is_empty()) + .collect(); + white_list.extend(sec.guard_white_list.iter().cloned()); + + // Parse durations + let inflight_upload_data_timeout = parse_duration(&cli.inflight_upload_data_timeout); + let inflight_download_data_timeout = parse_duration(&cli.inflight_download_data_timeout); + + VolumeServerConfig { + port: cli.port, + grpc_port, + public_port, + ip, + bind_ip, + public_url, + id, + masters, + pre_stop_seconds: cli.pre_stop_seconds, + idle_timeout: cli.idle_timeout, + data_center: cli.data_center, + rack: cli.rack, + index_type, + disk_type: cli.disk, + folders, + folder_max_limits, + folder_tags, + min_free_spaces, + disk_types, + idx_folder: cli.dir_idx, + white_list, + fix_jpg_orientation: cli.fix_jpg_orientation, + read_mode, + cpu_profile: cli.cpu_profile, + mem_profile: cli.mem_profile, + compaction_byte_per_second: cli.compaction_mb_per_second as i64 * 1024 * 1024, + maintenance_byte_per_second: cli.maintenance_mb_per_second as i64 * 1024 * 1024, + file_size_limit_bytes: cli.file_size_limit_mb as i64 * 1024 * 1024, + concurrent_upload_limit: cli.concurrent_upload_limit_mb as i64 * 1024 * 1024, + concurrent_download_limit: cli.concurrent_download_limit_mb as i64 * 1024 * 1024, + inflight_upload_data_timeout, + inflight_download_data_timeout, + has_slow_read: cli.has_slow_read, + read_buffer_size_mb: cli.read_buffer_size_mb, + ldb_timeout: cli.ldb_timeout, + pprof: cli.pprof, + metrics_port: cli.metrics_port, + metrics_ip, + debug: cli.debug, + debug_port: cli.debug_port, + ui_enabled: sec.jwt_signing_key.is_empty() || sec.access_ui, + jwt_signing_key: sec.jwt_signing_key, + jwt_signing_expires_seconds: sec.jwt_signing_expires, + jwt_read_signing_key: sec.jwt_read_signing_key, + jwt_read_signing_expires_seconds: sec.jwt_read_signing_expires, + https_cert_file: sec.https_cert_file, + https_key_file: sec.https_key_file, + https_ca_file: sec.https_ca_file, + https_client_enabled: sec.https_client_enabled, + https_client_cert_file: sec.https_client_cert_file, + https_client_key_file: sec.https_client_key_file, + https_client_ca_file: sec.https_client_ca_file, + grpc_cert_file: sec.grpc_cert_file, + grpc_key_file: sec.grpc_key_file, + grpc_ca_file: sec.grpc_ca_file, + grpc_allowed_wildcard_domain: sec.grpc_allowed_wildcard_domain, + grpc_volume_allowed_common_names: sec.grpc_volume_allowed_common_names, + tls_policy: sec.tls_policy, + enable_write_queue: std::env::var("SEAWEED_WRITE_QUEUE") + .map(|v| v == "1" || v == "true") + .unwrap_or(false), + security_file: cli.security_file, + } +} + +fn default_volume_dir() -> String { + std::env::temp_dir().to_string_lossy().into_owned() +} + +/// Parsed security configuration from security.toml. +#[derive(Debug, Default)] +pub struct SecurityConfig { + pub jwt_signing_key: Vec, + pub jwt_signing_expires: i64, + pub jwt_read_signing_key: Vec, + pub jwt_read_signing_expires: i64, + pub https_cert_file: String, + pub https_key_file: String, + pub https_ca_file: String, + pub https_client_enabled: bool, + pub https_client_cert_file: String, + pub https_client_key_file: String, + pub https_client_ca_file: String, + pub grpc_cert_file: String, + pub grpc_key_file: String, + pub grpc_ca_file: String, + pub grpc_allowed_wildcard_domain: String, + pub grpc_volume_allowed_common_names: Vec, + pub tls_policy: TlsPolicy, + pub access_ui: bool, + /// IPs from [guard] white_list in security.toml + pub guard_white_list: Vec, +} + +const SECURITY_CONFIG_FILE_NAME: &str = "security.toml"; + +/// Parse a security.toml file to extract JWT signing keys and TLS configuration. +/// Format: +/// ```toml +/// [jwt.signing] +/// key = "secret" +/// expires_after_seconds = 60 +/// +/// [jwt.signing.read] +/// key = "read-secret" +/// expires_after_seconds = 60 +/// +/// [https.volume] +/// cert = "/path/to/cert.pem" +/// key = "/path/to/key.pem" +/// ca = "/path/to/ca.pem" +/// +/// [tls] +/// min_version = "TLS 1.2" +/// max_version = "TLS 1.3" +/// cipher_suites = "TLS_ECDHE_RSA_WITH_AES_128_GCM_SHA256" +/// +/// [https.client] +/// enabled = true +/// cert = "/path/to/cert.pem" +/// key = "/path/to/key.pem" +/// ca = "/path/to/ca.pem" +/// +/// [grpc] +/// ca = "/path/to/ca.pem" +/// allowed_wildcard_domain = ".example.com" +/// +/// [grpc.volume] +/// cert = "/path/to/cert.pem" +/// key = "/path/to/key.pem" +/// allowed_commonNames = "volume-a.internal,volume-b.internal" +/// ``` +pub fn parse_security_config(path: &str) -> SecurityConfig { + let Some(config_path) = resolve_security_config_path(path) else { + let mut cfg = SecurityConfig::default(); + apply_env_overrides(&mut cfg); + return cfg; + }; + + let content = match std::fs::read_to_string(&config_path) { + Ok(c) => c, + Err(_) => { + let mut cfg = SecurityConfig::default(); + apply_env_overrides(&mut cfg); + return cfg; + } + }; + + let mut cfg = SecurityConfig::default(); + + #[derive(PartialEq)] + enum Section { + None, + JwtSigning, + JwtSigningRead, + HttpsClient, + Grpc, + HttpsVolume, + GrpcVolume, + Tls, + Guard, + Access, + } + + let mut section = Section::None; + + for line in content.lines() { + let trimmed = line.trim(); + if trimmed.starts_with('#') || trimmed.is_empty() { + continue; + } + if trimmed == "[jwt.signing.read]" { + section = Section::JwtSigningRead; + continue; + } + if trimmed == "[jwt.signing]" { + section = Section::JwtSigning; + continue; + } + if trimmed == "[https.client]" { + section = Section::HttpsClient; + continue; + } + if trimmed == "[grpc]" { + section = Section::Grpc; + continue; + } + if trimmed == "[https.volume]" { + section = Section::HttpsVolume; + continue; + } + if trimmed == "[grpc.volume]" { + section = Section::GrpcVolume; + continue; + } + if trimmed == "[tls]" { + section = Section::Tls; + continue; + } + if trimmed == "[guard]" { + section = Section::Guard; + continue; + } + if trimmed == "[access]" { + section = Section::Access; + continue; + } + if trimmed.starts_with('[') { + section = Section::None; + continue; + } + + if let Some((key, value)) = trimmed.split_once('=') { + let key = key.trim(); + let value = value.trim().trim_matches('"'); + match section { + Section::JwtSigningRead => match key { + "key" => cfg.jwt_read_signing_key = value.as_bytes().to_vec(), + "expires_after_seconds" => { + cfg.jwt_read_signing_expires = value.parse().unwrap_or(60) + } + _ => {} + }, + Section::JwtSigning => match key { + "key" => cfg.jwt_signing_key = value.as_bytes().to_vec(), + "expires_after_seconds" => cfg.jwt_signing_expires = value.parse().unwrap_or(10), + _ => {} + }, + Section::HttpsClient => match key { + "enabled" => cfg.https_client_enabled = value.parse().unwrap_or(false), + "cert" => cfg.https_client_cert_file = value.to_string(), + "key" => cfg.https_client_key_file = value.to_string(), + "ca" => cfg.https_client_ca_file = value.to_string(), + _ => {} + }, + Section::Grpc => match key { + "ca" => cfg.grpc_ca_file = value.to_string(), + "allowed_wildcard_domain" => { + cfg.grpc_allowed_wildcard_domain = value.to_string() + } + _ => {} + }, + Section::HttpsVolume => match key { + "cert" => cfg.https_cert_file = value.to_string(), + "key" => cfg.https_key_file = value.to_string(), + "ca" => cfg.https_ca_file = value.to_string(), + _ => {} + }, + Section::GrpcVolume => match key { + "cert" => cfg.grpc_cert_file = value.to_string(), + "key" => cfg.grpc_key_file = value.to_string(), + // Go only reads CA from [grpc], not [grpc.volume] + "allowed_commonNames" => { + cfg.grpc_volume_allowed_common_names = + value.split(',').map(|name| name.to_string()).collect(); + } + _ => {} + }, + Section::Tls => match key { + "min_version" => cfg.tls_policy.min_version = value.to_string(), + "max_version" => cfg.tls_policy.max_version = value.to_string(), + "cipher_suites" => cfg.tls_policy.cipher_suites = value.to_string(), + _ => {} + }, + Section::Guard => match key { + "white_list" => { + cfg.guard_white_list = value + .split(',') + .map(|s| s.trim().to_string()) + .filter(|s| !s.is_empty()) + .collect(); + } + _ => {} + }, + Section::Access => match key { + "ui" => cfg.access_ui = value.parse().unwrap_or(false), + _ => {} + }, + Section::None => {} + } + } + } + + // Match Go's v.SetDefault: when a signing key is present but + // expires_after_seconds was never specified, apply Go's defaults. + if !cfg.jwt_signing_key.is_empty() && cfg.jwt_signing_expires == 0 { + cfg.jwt_signing_expires = 10; + } + if !cfg.jwt_read_signing_key.is_empty() && cfg.jwt_read_signing_expires == 0 { + cfg.jwt_read_signing_expires = 60; + } + + // Override with WEED_ environment variables (matches Go's Viper convention: + // prefix WEED_, uppercase, replace . with _). + // e.g. WEED_JWT_SIGNING_KEY overrides [jwt.signing] key + apply_env_overrides(&mut cfg); + + cfg +} + +fn resolve_security_config_path(path: &str) -> Option { + if !path.is_empty() { + return Some(PathBuf::from(path)); + } + + default_security_config_candidates( + std::env::current_dir().ok().as_deref(), + home_dir_from_env().as_deref(), + ) + .into_iter() + .find(|candidate| candidate.is_file()) +} + +fn default_security_config_candidates( + current_dir: Option<&Path>, + home_dir: Option<&Path>, +) -> Vec { + let mut candidates = Vec::new(); + if let Some(dir) = current_dir { + candidates.push(dir.join(SECURITY_CONFIG_FILE_NAME)); + } + if let Some(home) = home_dir { + candidates.push(home.join(".seaweedfs").join(SECURITY_CONFIG_FILE_NAME)); + } + candidates.push(PathBuf::from("/usr/local/etc/seaweedfs").join(SECURITY_CONFIG_FILE_NAME)); + candidates.push(PathBuf::from("/etc/seaweedfs").join(SECURITY_CONFIG_FILE_NAME)); + candidates +} + +fn home_dir_from_env() -> Option { + std::env::var_os("HOME") + .filter(|v| !v.is_empty()) + .map(PathBuf::from) + .or_else(|| { + std::env::var_os("USERPROFILE") + .filter(|v| !v.is_empty()) + .map(PathBuf::from) + }) +} + +/// Apply WEED_ environment variable overrides to a SecurityConfig. +/// Matches Go's Viper convention: WEED_ prefix, uppercase, dots replaced with underscores. +fn apply_env_overrides(cfg: &mut SecurityConfig) { + if let Ok(v) = std::env::var("WEED_JWT_SIGNING_KEY") { + cfg.jwt_signing_key = v.into_bytes(); + } + if let Ok(v) = std::env::var("WEED_JWT_SIGNING_EXPIRES_AFTER_SECONDS") { + cfg.jwt_signing_expires = v.parse().unwrap_or(cfg.jwt_signing_expires); + } + if let Ok(v) = std::env::var("WEED_JWT_SIGNING_READ_KEY") { + cfg.jwt_read_signing_key = v.into_bytes(); + } + if let Ok(v) = std::env::var("WEED_JWT_SIGNING_READ_EXPIRES_AFTER_SECONDS") { + cfg.jwt_read_signing_expires = v.parse().unwrap_or(cfg.jwt_read_signing_expires); + } + if let Ok(v) = std::env::var("WEED_HTTPS_VOLUME_CERT") { + cfg.https_cert_file = v; + } + if let Ok(v) = std::env::var("WEED_HTTPS_VOLUME_KEY") { + cfg.https_key_file = v; + } + if let Ok(v) = std::env::var("WEED_HTTPS_VOLUME_CA") { + cfg.https_ca_file = v; + } + if let Ok(v) = std::env::var("WEED_HTTPS_CLIENT_ENABLED") { + cfg.https_client_enabled = v == "true" || v == "1"; + } + if let Ok(v) = std::env::var("WEED_HTTPS_CLIENT_CERT") { + cfg.https_client_cert_file = v; + } + if let Ok(v) = std::env::var("WEED_HTTPS_CLIENT_KEY") { + cfg.https_client_key_file = v; + } + if let Ok(v) = std::env::var("WEED_HTTPS_CLIENT_CA") { + cfg.https_client_ca_file = v; + } + if let Ok(v) = std::env::var("WEED_GRPC_VOLUME_CERT") { + cfg.grpc_cert_file = v; + } + if let Ok(v) = std::env::var("WEED_GRPC_VOLUME_KEY") { + cfg.grpc_key_file = v; + } + if let Ok(v) = std::env::var("WEED_GRPC_CA") { + cfg.grpc_ca_file = v; + } else if let Ok(v) = std::env::var("WEED_GRPC_VOLUME_CA") { + cfg.grpc_ca_file = v; + } + if let Ok(v) = std::env::var("WEED_GRPC_ALLOWED_WILDCARD_DOMAIN") { + cfg.grpc_allowed_wildcard_domain = v; + } + if let Ok(v) = std::env::var("WEED_GRPC_VOLUME_ALLOWED_COMMONNAMES") { + cfg.grpc_volume_allowed_common_names = v.split(',').map(|name| name.to_string()).collect(); + } + if let Ok(v) = std::env::var("WEED_TLS_MIN_VERSION") { + cfg.tls_policy.min_version = v; + } + if let Ok(v) = std::env::var("WEED_TLS_MAX_VERSION") { + cfg.tls_policy.max_version = v; + } + if let Ok(v) = std::env::var("WEED_TLS_CIPHER_SUITES") { + cfg.tls_policy.cipher_suites = v; + } + if let Ok(v) = std::env::var("WEED_GUARD_WHITE_LIST") { + cfg.guard_white_list = v + .split(',') + .map(|s| s.trim().to_string()) + .filter(|s| !s.is_empty()) + .collect(); + } + if let Ok(v) = std::env::var("WEED_ACCESS_UI") { + cfg.access_ui = v == "true" || v == "1"; + } +} + +/// Detect the host's IP address. +/// Mirrors Go's `util.DetectedHostAddress()`. +fn detect_host_address() -> String { + // Connect to a remote address to determine the local outbound IP + if let Ok(socket) = UdpSocket::bind("0.0.0.0:0") { + if socket.connect("8.8.8.8:80").is_ok() { + if let Ok(addr) = socket.local_addr() { + return addr.ip().to_string(); + } + } + } + "localhost".to_string() +} + +#[cfg(test)] +mod tests { + use super::*; + use std::ffi::OsString; + use std::sync::{Mutex, MutexGuard, OnceLock}; + + fn process_state_lock() -> MutexGuard<'static, ()> { + static LOCK: OnceLock> = OnceLock::new(); + LOCK.get_or_init(|| Mutex::new(())).lock().unwrap() + } + + fn with_temp_env_var(key: &str, value: Option<&str>, f: F) { + let previous = std::env::var_os(key); + match value { + Some(v) => std::env::set_var(key, v), + None => std::env::remove_var(key), + } + f(); + restore_env_var(key, previous); + } + + fn restore_env_var(key: &str, value: Option) { + if let Some(value) = value { + std::env::set_var(key, value); + } else { + std::env::remove_var(key); + } + } + + fn with_temp_current_dir(dir: &Path, f: F) { + let previous = std::env::current_dir().unwrap(); + std::env::set_current_dir(dir).unwrap(); + f(); + std::env::set_current_dir(previous).unwrap(); + } + + fn with_cleared_security_env(f: F) { + const KEYS: &[&str] = &[ + "WEED_JWT_SIGNING_KEY", + "WEED_JWT_SIGNING_EXPIRES_AFTER_SECONDS", + "WEED_JWT_SIGNING_READ_KEY", + "WEED_JWT_SIGNING_READ_EXPIRES_AFTER_SECONDS", + "WEED_HTTPS_VOLUME_CERT", + "WEED_HTTPS_VOLUME_KEY", + "WEED_HTTPS_VOLUME_CA", + "WEED_HTTPS_CLIENT_ENABLED", + "WEED_HTTPS_CLIENT_CERT", + "WEED_HTTPS_CLIENT_KEY", + "WEED_HTTPS_CLIENT_CA", + "WEED_GRPC_VOLUME_CERT", + "WEED_GRPC_VOLUME_KEY", + "WEED_GRPC_CA", + "WEED_GRPC_VOLUME_CA", + "WEED_GRPC_ALLOWED_WILDCARD_DOMAIN", + "WEED_GRPC_VOLUME_ALLOWED_COMMONNAMES", + "WEED_TLS_MIN_VERSION", + "WEED_TLS_MAX_VERSION", + "WEED_TLS_CIPHER_SUITES", + "WEED_GUARD_WHITE_LIST", + "WEED_ACCESS_UI", + ]; + + let previous: Vec<(&str, Option)> = KEYS + .iter() + .map(|key| (*key, std::env::var_os(key))) + .collect(); + + for key in KEYS { + std::env::remove_var(key); + } + + f(); + + for (key, value) in previous { + restore_env_var(key, value); + } + } + + #[test] + fn test_parse_duration() { + assert_eq!(parse_duration("60s"), std::time::Duration::from_secs(60)); + assert_eq!(parse_duration("5m"), std::time::Duration::from_secs(300)); + assert_eq!(parse_duration("1h"), std::time::Duration::from_secs(3600)); + assert_eq!(parse_duration("30"), std::time::Duration::from_secs(30)); + assert_eq!(parse_duration(""), std::time::Duration::from_secs(60)); + } + + #[test] + fn test_parse_min_free_spaces_percent() { + let result = parse_min_free_spaces("", "1"); + assert_eq!(result.len(), 1); + match &result[0] { + MinFreeSpace::Percent(v) => assert!((v - 1.0).abs() < f64::EPSILON), + _ => panic!("Expected Percent"), + } + } + + #[test] + fn test_parse_min_free_spaces_bytes() { + let result = parse_min_free_spaces("10GiB", ""); + assert_eq!(result.len(), 1); + match &result[0] { + MinFreeSpace::Bytes(v) => assert_eq!(*v, 10 * 1024 * 1024 * 1024), + _ => panic!("Expected Bytes"), + } + } + + #[test] + fn test_parse_volume_tags_single() { + let tags = parse_volume_tags("fast:ssd", 3); + assert_eq!(tags.len(), 3); + assert_eq!(tags[0], vec!["fast", "ssd"]); + assert_eq!(tags[1], vec!["fast", "ssd"]); + assert_eq!(tags[2], vec!["fast", "ssd"]); + } + + #[test] + fn test_parse_volume_tags_multi() { + let tags = parse_volume_tags("fast:ssd,archive", 3); + assert_eq!(tags.len(), 3); + assert_eq!(tags[0], vec!["fast", "ssd"]); + assert_eq!(tags[1], vec!["archive"]); + assert_eq!(tags[2], Vec::::new()); + } + + #[test] + fn test_parse_volume_tags_empty() { + let tags = parse_volume_tags("", 2); + assert_eq!(tags.len(), 2); + assert_eq!(tags[0], Vec::::new()); + assert_eq!(tags[1], Vec::::new()); + } + + #[test] + fn test_normalize_args_single_dash_to_double() { + let args = vec![ + "bin".into(), + "-port".into(), + "8080".into(), + "-ip.bind".into(), + "127.0.0.1".into(), + "-dir".into(), + "/data".into(), + ]; + let norm = normalize_args_vec(args); + assert_eq!( + norm, + vec![ + "bin", + "--port", + "8080", + "--ip.bind", + "127.0.0.1", + "--dir", + "/data", + ] + ); + } + + #[test] + fn test_normalize_args_double_dash_unchanged() { + let args = vec![ + "bin".into(), + "--port".into(), + "8080".into(), + "--master".into(), + "localhost:9333".into(), + ]; + let norm = normalize_args_vec(args); + assert_eq!( + norm, + vec!["bin", "--port", "8080", "--master", "localhost:9333",] + ); + } + + #[test] + fn test_normalize_args_single_char_flags_unchanged() { + let args = vec!["bin".into(), "-h".into(), "-V".into()]; + let norm = normalize_args_vec(args); + assert_eq!(norm, vec!["bin", "-h", "-V"]); + } + + #[test] + fn test_normalize_args_equals_format() { + let args = vec!["bin".into(), "-port=8080".into(), "-ip.bind=0.0.0.0".into()]; + let norm = normalize_args_vec(args); + assert_eq!(norm, vec!["bin", "--port=8080", "--ip.bind=0.0.0.0"]); + } + + #[test] + fn test_normalize_args_stop_at_double_dash() { + let args = vec![ + "bin".into(), + "-port".into(), + "8080".into(), + "--".into(), + "-notaflag".into(), + ]; + let norm = normalize_args_vec(args); + assert_eq!(norm, vec!["bin", "--port", "8080", "--", "-notaflag"]); + } + + #[test] + fn test_resolve_config_defaults_dir_to_platform_temp_dir() { + let cfg = resolve_config(Cli::parse_from(["bin"])); + assert_eq!(cfg.folders, vec![default_volume_dir()]); + } + + #[test] + fn test_parse_security_config_access_ui() { + let _guard = process_state_lock(); + let tmp = tempfile::NamedTempFile::new().unwrap(); + std::fs::write( + tmp.path(), + r#" +[jwt.signing] +key = "secret" + +[access] +ui = true +"#, + ) + .unwrap(); + + with_cleared_security_env(|| { + let cfg = parse_security_config(tmp.path().to_str().unwrap()); + assert_eq!(cfg.jwt_signing_key, b"secret"); + assert!(cfg.access_ui); + }); + } + + #[test] + fn test_parse_security_config_discovers_current_directory_default() { + let _guard = process_state_lock(); + let tmp = tempfile::TempDir::new().unwrap(); + std::fs::write( + tmp.path().join(SECURITY_CONFIG_FILE_NAME), + r#" +[jwt.signing] +key = "cwd-secret" +"#, + ) + .unwrap(); + + with_temp_current_dir(tmp.path(), || { + with_temp_env_var("WEED_JWT_SIGNING_KEY", None, || { + let cfg = parse_security_config(""); + assert_eq!(cfg.jwt_signing_key, b"cwd-secret"); + }); + }); + } + + #[test] + fn test_parse_security_config_discovers_home_default() { + let _guard = process_state_lock(); + let current_dir = tempfile::TempDir::new().unwrap(); + let home_dir = tempfile::TempDir::new().unwrap(); + let seaweed_home = home_dir.path().join(".seaweedfs"); + std::fs::create_dir_all(&seaweed_home).unwrap(); + std::fs::write( + seaweed_home.join(SECURITY_CONFIG_FILE_NAME), + r#" +[jwt.signing] +key = "home-secret" +"#, + ) + .unwrap(); + + with_temp_current_dir(current_dir.path(), || { + with_temp_env_var("WEED_JWT_SIGNING_KEY", None, || { + with_temp_env_var("HOME", Some(home_dir.path().to_str().unwrap()), || { + let cfg = parse_security_config(""); + assert_eq!(cfg.jwt_signing_key, b"home-secret"); + }); + }); + }); + } + + #[test] + fn test_parse_security_config_uses_grpc_root_ca() { + let _guard = process_state_lock(); + let tmp = tempfile::NamedTempFile::new().unwrap(); + std::fs::write( + tmp.path(), + r#" +[grpc] +ca = "/etc/seaweedfs/grpc-ca.pem" + +[grpc.volume] +cert = "/etc/seaweedfs/volume-cert.pem" +key = "/etc/seaweedfs/volume-key.pem" +"#, + ) + .unwrap(); + + with_cleared_security_env(|| { + let cfg = parse_security_config(tmp.path().to_str().unwrap()); + assert_eq!(cfg.grpc_ca_file, "/etc/seaweedfs/grpc-ca.pem"); + assert_eq!(cfg.grpc_cert_file, "/etc/seaweedfs/volume-cert.pem"); + assert_eq!(cfg.grpc_key_file, "/etc/seaweedfs/volume-key.pem"); + }); + } + + #[test] + fn test_parse_security_config_uses_grpc_peer_name_policy() { + let _guard = process_state_lock(); + let tmp = tempfile::NamedTempFile::new().unwrap(); + std::fs::write( + tmp.path(), + r#" +[grpc] +allowed_wildcard_domain = ".example.com" + +[grpc.volume] +allowed_commonNames = "volume-a.internal,volume-b.internal" +"#, + ) + .unwrap(); + + with_cleared_security_env(|| { + let cfg = parse_security_config(tmp.path().to_str().unwrap()); + assert_eq!(cfg.grpc_allowed_wildcard_domain, ".example.com"); + assert_eq!( + cfg.grpc_volume_allowed_common_names, + vec![ + String::from("volume-a.internal"), + String::from("volume-b.internal") + ] + ); + }); + } + + #[test] + fn test_parse_security_config_uses_https_client_settings() { + let _guard = process_state_lock(); + let tmp = tempfile::NamedTempFile::new().unwrap(); + std::fs::write( + tmp.path(), + r#" +[https.client] +enabled = true +cert = "/etc/seaweedfs/client-cert.pem" +key = "/etc/seaweedfs/client-key.pem" +ca = "/etc/seaweedfs/client-ca.pem" +"#, + ) + .unwrap(); + + with_cleared_security_env(|| { + let cfg = parse_security_config(tmp.path().to_str().unwrap()); + assert!(cfg.https_client_enabled); + assert_eq!(cfg.https_client_cert_file, "/etc/seaweedfs/client-cert.pem"); + assert_eq!(cfg.https_client_key_file, "/etc/seaweedfs/client-key.pem"); + assert_eq!(cfg.https_client_ca_file, "/etc/seaweedfs/client-ca.pem"); + }); + } + + #[test] + fn test_parse_security_config_uses_tls_policy_settings() { + let _guard = process_state_lock(); + let tmp = tempfile::NamedTempFile::new().unwrap(); + std::fs::write( + tmp.path(), + r#" +[tls] +min_version = "TLS 1.2" +max_version = "TLS 1.3" +cipher_suites = "TLS_ECDHE_RSA_WITH_AES_128_GCM_SHA256" +"#, + ) + .unwrap(); + + with_cleared_security_env(|| { + let cfg = parse_security_config(tmp.path().to_str().unwrap()); + assert_eq!(cfg.tls_policy.min_version, "TLS 1.2"); + assert_eq!(cfg.tls_policy.max_version, "TLS 1.3"); + assert_eq!( + cfg.tls_policy.cipher_suites, + "TLS_ECDHE_RSA_WITH_AES_128_GCM_SHA256" + ); + }); + } + + #[test] + fn test_merge_options_file_basic() { + let tmp = tempfile::NamedTempFile::new().unwrap(); + std::fs::write(tmp.path(), "port=9999\ndir=/data\nmaster=localhost:9333\n").unwrap(); + + let args = vec![ + "bin".into(), + "--options".into(), + tmp.path().to_str().unwrap().into(), + ]; + let merged = merge_options_file(args); + // Should contain the original args plus the file-based ones + assert!(merged.contains(&"--port".to_string())); + assert!(merged.contains(&"9999".to_string())); + assert!(merged.contains(&"--dir".to_string())); + assert!(merged.contains(&"/data".to_string())); + } + + #[test] + fn test_merge_options_file_cli_precedence() { + let tmp = tempfile::NamedTempFile::new().unwrap(); + std::fs::write(tmp.path(), "port=9999\ndir=/data\n").unwrap(); + + let args = vec![ + "bin".into(), + "--port".into(), + "8080".into(), + "--options".into(), + tmp.path().to_str().unwrap().into(), + ]; + let merged = merge_options_file(args); + // port should NOT be duplicated from file since CLI already set it + let port_count = merged.iter().filter(|a| *a == "--port").count(); + assert_eq!( + port_count, 1, + "CLI port should take precedence, file port skipped" + ); + // dir should be added from file + assert!(merged.contains(&"--dir".to_string())); + } + + #[test] + fn test_merge_options_file_comments_and_blanks() { + let tmp = tempfile::NamedTempFile::new().unwrap(); + std::fs::write( + tmp.path(), + "# this is a comment\n\nport=9999\n# another comment\ndir=/data\n", + ) + .unwrap(); + + let args = vec![ + "bin".into(), + "--options".into(), + tmp.path().to_str().unwrap().into(), + ]; + let merged = merge_options_file(args); + assert!(merged.contains(&"--port".to_string())); + assert!(merged.contains(&"--dir".to_string())); + } + + #[test] + fn test_merge_options_file_with_dashes_in_key() { + let tmp = tempfile::NamedTempFile::new().unwrap(); + std::fs::write(tmp.path(), "-port=9999\n--dir=/data\nip.bind=0.0.0.0\n").unwrap(); + + let args = vec![ + "bin".into(), + "--options".into(), + tmp.path().to_str().unwrap().into(), + ]; + let merged = merge_options_file(args); + assert!(merged.contains(&"--port".to_string())); + assert!(merged.contains(&"--dir".to_string())); + assert!(merged.contains(&"--ip.bind".to_string())); + } + + #[test] + fn test_find_options_arg() { + assert_eq!( + find_options_arg(&["bin".into(), "--options".into(), "/tmp/opts".into()]), + "/tmp/opts" + ); + assert_eq!( + find_options_arg(&["bin".into(), "-options".into(), "/tmp/opts".into()]), + "/tmp/opts" + ); + assert_eq!( + find_options_arg(&["bin".into(), "--options=/tmp/opts".into()]), + "/tmp/opts" + ); + assert_eq!( + find_options_arg(&["bin".into(), "--port".into(), "8080".into()]), + "" + ); + } + + #[test] + fn test_env_override_jwt_signing_key() { + let _guard = process_state_lock(); + with_temp_env_var("WEED_JWT_SIGNING_KEY", Some("env-secret"), || { + let cfg = parse_security_config(""); + assert_eq!(cfg.jwt_signing_key, b"env-secret"); + }); + } + + #[test] + fn test_env_override_takes_precedence_over_file() { + let _guard = process_state_lock(); + let tmp = tempfile::NamedTempFile::new().unwrap(); + std::fs::write( + tmp.path(), + r#" +[jwt.signing] +key = "file-secret" +"#, + ) + .unwrap(); + + with_temp_env_var("WEED_JWT_SIGNING_KEY", Some("env-secret"), || { + let cfg = parse_security_config(tmp.path().to_str().unwrap()); + assert_eq!(cfg.jwt_signing_key, b"env-secret"); + }); + } + + #[test] + fn test_env_override_guard_white_list() { + let _guard = process_state_lock(); + with_temp_env_var( + "WEED_GUARD_WHITE_LIST", + Some("10.0.0.0/8, 192.168.1.0/24"), + || { + let cfg = parse_security_config(""); + assert_eq!(cfg.guard_white_list, vec!["10.0.0.0/8", "192.168.1.0/24"]); + }, + ); + } + + #[test] + fn test_env_override_access_ui() { + let _guard = process_state_lock(); + with_temp_env_var("WEED_ACCESS_UI", Some("true"), || { + let cfg = parse_security_config(""); + assert!(cfg.access_ui); + }); + } +} diff --git a/seaweed-volume/src/images.rs b/seaweed-volume/src/images.rs new file mode 100644 index 000000000..9ad7ca71c --- /dev/null +++ b/seaweed-volume/src/images.rs @@ -0,0 +1,275 @@ +//! JPEG EXIF orientation auto-fix, matching Go's `FixJpgOrientation`. +//! +//! Reads the EXIF orientation tag from JPEG data and rotates/flips the image +//! to normalize it to orientation 1 (top-left). If EXIF parsing fails or +//! orientation is already normal, returns the original data unchanged. + +use std::io::Cursor; + +use image::{DynamicImage, GenericImageView, ImageFormat, RgbaImage}; + +/// EXIF orientation tag values. +/// See: +const TOP_LEFT_SIDE: u32 = 1; +const TOP_RIGHT_SIDE: u32 = 2; +const BOTTOM_RIGHT_SIDE: u32 = 3; +const BOTTOM_LEFT_SIDE: u32 = 4; +const LEFT_SIDE_TOP: u32 = 5; +const RIGHT_SIDE_TOP: u32 = 6; +const RIGHT_SIDE_BOTTOM: u32 = 7; +const LEFT_SIDE_BOTTOM: u32 = 8; + +/// Fix JPEG orientation based on EXIF data. +/// +/// Reads the EXIF orientation tag and applies the appropriate rotation/flip +/// to normalize the image to orientation 1 (top-left). Re-encodes as JPEG. +/// +/// Returns the original data unchanged if: +/// - EXIF data cannot be parsed +/// - No orientation tag is present +/// - Orientation is already 1 (normal) +/// - Image decoding or re-encoding fails +pub fn fix_jpg_orientation(data: &[u8]) -> Vec { + // Parse EXIF data + let orientation = match read_exif_orientation(data) { + Some(o) => o, + None => return data.to_vec(), + }; + + // Orientation 1 means normal — no transformation needed + if orientation == TOP_LEFT_SIDE { + return data.to_vec(); + } + + // Determine rotation angle and flip mode + let (angle, flip_horizontal) = match orientation { + TOP_RIGHT_SIDE => (0, true), + BOTTOM_RIGHT_SIDE => (180, false), + BOTTOM_LEFT_SIDE => (180, true), + LEFT_SIDE_TOP => (-90, true), + RIGHT_SIDE_TOP => (-90, false), + RIGHT_SIDE_BOTTOM => (90, true), + LEFT_SIDE_BOTTOM => (90, false), + _ => return data.to_vec(), + }; + + // Decode the image + let src_image = match image::load_from_memory_with_format(data, ImageFormat::Jpeg) { + Ok(img) => img, + Err(_) => return data.to_vec(), + }; + + // Apply rotation then flip (matching Go's flip(rotate(img, angle), flipMode)) + let transformed = flip_horizontal_if(rotate(src_image, angle), flip_horizontal); + + // Re-encode as JPEG + let mut buf = Cursor::new(Vec::new()); + match transformed.write_to(&mut buf, ImageFormat::Jpeg) { + Ok(_) => buf.into_inner(), + Err(_) => data.to_vec(), + } +} + +/// Read the EXIF orientation tag from JPEG data. +/// Returns None if EXIF cannot be parsed or orientation tag is not present. +fn read_exif_orientation(data: &[u8]) -> Option { + let exif_reader = exif::Reader::new(); + let mut cursor = Cursor::new(data); + let exif_data = exif_reader.read_from_container(&mut cursor).ok()?; + + let orientation_field = exif_data.get_field(exif::Tag::Orientation, exif::In::PRIMARY)?; + match orientation_field.value { + exif::Value::Short(ref v) if !v.is_empty() => Some(v[0] as u32), + _ => orientation_field.value.get_uint(0), + } +} + +/// Rotate an image by the given angle (counter-clockwise, in degrees). +/// Matches Go's rotate function. +fn rotate(img: DynamicImage, angle: i32) -> DynamicImage { + let (width, height) = img.dimensions(); + + match angle { + 90 => { + // 90 degrees counter-clockwise + let new_w = height; + let new_h = width; + let mut out = RgbaImage::new(new_w, new_h); + for y in 0..new_h { + for x in 0..new_w { + out.put_pixel(x, y, img.get_pixel(new_h - 1 - y, x)); + } + } + DynamicImage::ImageRgba8(out) + } + -90 => { + // 90 degrees clockwise (or 270 counter-clockwise) + let new_w = height; + let new_h = width; + let mut out = RgbaImage::new(new_w, new_h); + for y in 0..new_h { + for x in 0..new_w { + out.put_pixel(x, y, img.get_pixel(y, new_w - 1 - x)); + } + } + DynamicImage::ImageRgba8(out) + } + 180 | -180 => { + let mut out = RgbaImage::new(width, height); + for y in 0..height { + for x in 0..width { + out.put_pixel(x, y, img.get_pixel(width - 1 - x, height - 1 - y)); + } + } + DynamicImage::ImageRgba8(out) + } + _ => img, + } +} + +/// Flip the image horizontally if requested. +/// In Go, flipMode 2 == FlipHorizontal. We simplify since only horizontal flip is used. +fn flip_horizontal_if(img: DynamicImage, do_flip: bool) -> DynamicImage { + if !do_flip { + return img; + } + let (width, height) = img.dimensions(); + let mut out = RgbaImage::new(width, height); + for y in 0..height { + for x in 0..width { + out.put_pixel(x, y, img.get_pixel(width - 1 - x, y)); + } + } + DynamicImage::ImageRgba8(out) +} + +/// Returns true if the given MIME type or file path extension indicates a JPEG file. +pub fn is_jpeg(mime_type: &str, path: &str) -> bool { + if mime_type == "image/jpeg" { + return true; + } + let lower = path.to_lowercase(); + lower.ends_with(".jpg") || lower.ends_with(".jpeg") +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_non_jpeg_data_returned_unchanged() { + let data = b"not a jpeg file at all"; + let result = fix_jpg_orientation(data); + assert_eq!(result, data); + } + + #[test] + fn test_jpeg_without_exif_returned_unchanged() { + // Create a minimal JPEG without EXIF data + let img = DynamicImage::ImageRgba8(RgbaImage::new(2, 2)); + let mut buf = Cursor::new(Vec::new()); + img.write_to(&mut buf, ImageFormat::Jpeg).unwrap(); + let jpeg_data = buf.into_inner(); + + let result = fix_jpg_orientation(&jpeg_data); + // Should return data unchanged (no EXIF orientation tag) + // Just verify it's still valid JPEG + assert!(!result.is_empty()); + assert_eq!(&result[0..2], &[0xFF, 0xD8]); // JPEG magic bytes + } + + #[test] + fn test_is_jpeg() { + assert!(is_jpeg("image/jpeg", "")); + assert!(is_jpeg("", "/3,abc.jpg")); + assert!(is_jpeg("", "/3,abc.JPEG")); + assert!(is_jpeg("application/octet-stream", "/3,abc.JPG")); + assert!(!is_jpeg("image/png", "/3,abc.png")); + assert!(!is_jpeg("", "/3,abc.png")); + } + + #[test] + fn test_rotate_180() { + // Create a 2x2 image with distinct pixel colors + let mut img = RgbaImage::new(2, 2); + img.put_pixel(0, 0, image::Rgba([255, 0, 0, 255])); // red top-left + img.put_pixel(1, 0, image::Rgba([0, 255, 0, 255])); // green top-right + img.put_pixel(0, 1, image::Rgba([0, 0, 255, 255])); // blue bottom-left + img.put_pixel(1, 1, image::Rgba([255, 255, 0, 255])); // yellow bottom-right + let dynamic = DynamicImage::ImageRgba8(img); + + let rotated = rotate(dynamic, 180); + let (w, h) = rotated.dimensions(); + assert_eq!((w, h), (2, 2)); + // After 180 rotation: top-left should be yellow, top-right should be blue + assert_eq!(rotated.get_pixel(0, 0), image::Rgba([255, 255, 0, 255])); + assert_eq!(rotated.get_pixel(1, 0), image::Rgba([0, 0, 255, 255])); + assert_eq!(rotated.get_pixel(0, 1), image::Rgba([0, 255, 0, 255])); + assert_eq!(rotated.get_pixel(1, 1), image::Rgba([255, 0, 0, 255])); + } + + #[test] + fn test_rotate_90_ccw() { + // Create 3x2 image (width=3, height=2) + let mut img = RgbaImage::new(3, 2); + img.put_pixel(0, 0, image::Rgba([1, 0, 0, 255])); + img.put_pixel(1, 0, image::Rgba([2, 0, 0, 255])); + img.put_pixel(2, 0, image::Rgba([3, 0, 0, 255])); + img.put_pixel(0, 1, image::Rgba([4, 0, 0, 255])); + img.put_pixel(1, 1, image::Rgba([5, 0, 0, 255])); + img.put_pixel(2, 1, image::Rgba([6, 0, 0, 255])); + let dynamic = DynamicImage::ImageRgba8(img); + + let rotated = rotate(dynamic, 90); + let (w, h) = rotated.dimensions(); + // 90 CCW: width=3,height=2 -> new_w=2, new_h=3 + assert_eq!((w, h), (2, 3)); + // Top-right (2,0) should move to top-left (0,0) in CCW 90 + assert_eq!(rotated.get_pixel(0, 0)[0], 3); + assert_eq!(rotated.get_pixel(1, 0)[0], 6); + } + + #[test] + fn test_rotate_neg90_cw() { + // Create 3x2 image + let mut img = RgbaImage::new(3, 2); + img.put_pixel(0, 0, image::Rgba([1, 0, 0, 255])); + img.put_pixel(1, 0, image::Rgba([2, 0, 0, 255])); + img.put_pixel(2, 0, image::Rgba([3, 0, 0, 255])); + img.put_pixel(0, 1, image::Rgba([4, 0, 0, 255])); + img.put_pixel(1, 1, image::Rgba([5, 0, 0, 255])); + img.put_pixel(2, 1, image::Rgba([6, 0, 0, 255])); + let dynamic = DynamicImage::ImageRgba8(img); + + let rotated = rotate(dynamic, -90); + let (w, h) = rotated.dimensions(); + assert_eq!((w, h), (2, 3)); + // -90 (CW 90): top-left (0,0) should go to top-right + assert_eq!(rotated.get_pixel(0, 0)[0], 4); + assert_eq!(rotated.get_pixel(1, 0)[0], 1); + } + + #[test] + fn test_flip_horizontal() { + let mut img = RgbaImage::new(2, 1); + img.put_pixel(0, 0, image::Rgba([10, 0, 0, 255])); + img.put_pixel(1, 0, image::Rgba([20, 0, 0, 255])); + let dynamic = DynamicImage::ImageRgba8(img); + + let flipped = flip_horizontal_if(dynamic, true); + assert_eq!(flipped.get_pixel(0, 0)[0], 20); + assert_eq!(flipped.get_pixel(1, 0)[0], 10); + } + + #[test] + fn test_flip_horizontal_noop() { + let mut img = RgbaImage::new(2, 1); + img.put_pixel(0, 0, image::Rgba([10, 0, 0, 255])); + img.put_pixel(1, 0, image::Rgba([20, 0, 0, 255])); + let dynamic = DynamicImage::ImageRgba8(img); + + let not_flipped = flip_horizontal_if(dynamic, false); + assert_eq!(not_flipped.get_pixel(0, 0)[0], 10); + assert_eq!(not_flipped.get_pixel(1, 0)[0], 20); + } +} diff --git a/seaweed-volume/src/lib.rs b/seaweed-volume/src/lib.rs new file mode 100644 index 000000000..c295c983d --- /dev/null +++ b/seaweed-volume/src/lib.rs @@ -0,0 +1,27 @@ +pub mod config; +pub mod images; +pub mod metrics; +pub mod remote_storage; +pub mod security; +pub mod server; +pub mod storage; +pub mod version; + +/// Generated protobuf modules. +pub mod pb { + pub const FILE_DESCRIPTOR_SET: &[u8] = + tonic::include_file_descriptor_set!("seaweed_descriptor"); + + pub mod remote_pb { + tonic::include_proto!("remote_pb"); + } + pub mod volume_server_pb { + tonic::include_proto!("volume_server_pb"); + } + pub mod master_pb { + tonic::include_proto!("master_pb"); + } + pub mod filer_pb { + tonic::include_proto!("filer_pb"); + } +} diff --git a/seaweed-volume/src/main.rs b/seaweed-volume/src/main.rs new file mode 100644 index 000000000..a398dbf66 --- /dev/null +++ b/seaweed-volume/src/main.rs @@ -0,0 +1,1051 @@ +use std::sync::{Arc, RwLock}; + +use tracing::{error, info, warn}; + +use seaweed_volume::config::{self, VolumeServerConfig}; +use seaweed_volume::metrics; +use seaweed_volume::pb::volume_server_pb::volume_server_server::VolumeServerServer; +use seaweed_volume::security::tls::{ + build_rustls_server_config, build_rustls_server_config_with_grpc_client_auth, + GrpcClientAuthPolicy, TlsPolicy, +}; +use seaweed_volume::security::{Guard, SigningKey}; +use seaweed_volume::server::debug::build_debug_router; +use seaweed_volume::server::grpc_client::load_outgoing_grpc_tls; +use seaweed_volume::server::grpc_server::VolumeGrpcService; +use seaweed_volume::server::profiling::CpuProfileSession; +use seaweed_volume::server::request_id::GrpcRequestIdLayer; +use seaweed_volume::server::volume_server::{ + build_metrics_router, RuntimeMetricsConfig, VolumeServerState, +}; +use seaweed_volume::server::write_queue::WriteQueue; +use seaweed_volume::storage::store::Store; +use seaweed_volume::storage::types::DiskType; + +use tokio_rustls::TlsAcceptor; + +const GRPC_MAX_MESSAGE_SIZE: usize = 1 << 30; +const GRPC_KEEPALIVE_INTERVAL: std::time::Duration = std::time::Duration::from_secs(60); +const GRPC_KEEPALIVE_TIMEOUT: std::time::Duration = std::time::Duration::from_secs(20); +const GRPC_INITIAL_WINDOW_SIZE: u32 = 16 * 1024 * 1024; +const GRPC_MAX_HEADER_LIST_SIZE: u32 = 8 * 1024 * 1024; +const GRPC_MAX_CONCURRENT_STREAMS: u32 = 1000; + +fn main() { + // Initialize tracing + tracing_subscriber::fmt() + .with_env_filter( + tracing_subscriber::EnvFilter::try_from_default_env() + .unwrap_or_else(|_| tracing_subscriber::EnvFilter::new("info")), + ) + .init(); + + let config = config::parse_cli(); + seaweed_volume::server::server_stats::init_process_start(); + let cpu_profile = match CpuProfileSession::start(&config) { + Ok(session) => session, + Err(e) => { + error!("{}", e); + std::process::exit(1); + } + }; + info!( + "SeaweedFS Volume Server (Rust) v{}", + seaweed_volume::version::full_version() + ); + + // Register Prometheus metrics + metrics::register_metrics(); + + // Build the tokio runtime and run the async entry point + let rt = tokio::runtime::Builder::new_multi_thread() + .enable_all() + .build() + .expect("Failed to build tokio runtime"); + + if let Err(e) = rt.block_on(run(config, cpu_profile)) { + error!("Volume server failed: {}", e); + std::process::exit(1); + } +} + +fn build_outgoing_http_client( + config: &VolumeServerConfig, +) -> Result<(reqwest::Client, String), Box> { + let scheme = if config.https_client_enabled { + "https" + } else { + "http" + }; + if !config.https_client_enabled { + return Ok((reqwest::Client::new(), scheme.to_string())); + } + + let mut builder = reqwest::Client::builder(); + if !config.https_client_ca_file.is_empty() { + let ca_pem = std::fs::read(&config.https_client_ca_file).map_err(|e| { + format!( + "Failed to read HTTPS client CA file '{}': {}", + config.https_client_ca_file, e + ) + })?; + let cert = reqwest::Certificate::from_pem(&ca_pem).map_err(|e| { + format!( + "Failed to parse HTTPS client CA PEM '{}': {}", + config.https_client_ca_file, e + ) + })?; + builder = builder.add_root_certificate(cert); + } + + match ( + config.https_client_cert_file.is_empty(), + config.https_client_key_file.is_empty(), + ) { + (true, true) => {} + (false, false) => { + let cert_pem = std::fs::read(&config.https_client_cert_file).map_err(|e| { + format!( + "Failed to read HTTPS client cert file '{}': {}", + config.https_client_cert_file, e + ) + })?; + let key_pem = std::fs::read(&config.https_client_key_file).map_err(|e| { + format!( + "Failed to read HTTPS client key file '{}': {}", + config.https_client_key_file, e + ) + })?; + let mut identity_pem = cert_pem; + if !identity_pem.ends_with(b"\n") { + identity_pem.push(b'\n'); + } + identity_pem.extend_from_slice(&key_pem); + let identity = reqwest::Identity::from_pem(&identity_pem).map_err(|e| { + format!( + "Failed to parse HTTPS client identity '{}'+ '{}': {}", + config.https_client_cert_file, config.https_client_key_file, e + ) + })?; + builder = builder.identity(identity); + } + _ => { + return Err(format!( + "HTTPS client requires both cert and key, got cert='{}' key='{}'", + config.https_client_cert_file, config.https_client_key_file + ) + .into()); + } + } + + Ok((builder.build()?, scheme.to_string())) +} + +fn tls_policy_is_configured(policy: &TlsPolicy) -> bool { + !policy.min_version.is_empty() + || !policy.max_version.is_empty() + || !policy.cipher_suites.is_empty() +} + +fn effective_http_tls_policy(ca_path: &str, configured_policy: &TlsPolicy) -> TlsPolicy { + if ca_path.is_empty() { + TlsPolicy::default() + } else { + configured_policy.clone() + } +} + +fn build_grpc_server_tls_acceptor( + cert_path: &str, + key_path: &str, + ca_path: &str, + tls_policy: &TlsPolicy, + allowed_wildcard_domain: &str, + allowed_common_names: &[String], +) -> Option { + if cert_path.is_empty() || key_path.is_empty() || ca_path.is_empty() { + return None; + } + let client_auth_policy = GrpcClientAuthPolicy { + allowed_common_names: allowed_common_names.to_vec(), + allowed_wildcard_domain: allowed_wildcard_domain.to_string(), + }; + let mut server_config = match build_rustls_server_config_with_grpc_client_auth( + cert_path, + key_path, + ca_path, + tls_policy, + &client_auth_policy, + ) { + Ok(server_config) => server_config, + Err(e) => { + warn!("Failed to build gRPC TLS config: {}", e); + return None; + } + }; + server_config.alpn_protocols = vec![b"h2".to_vec()]; + Some(TlsAcceptor::from(Arc::new(server_config))) +} + +fn build_http_server_tls_acceptor( + config: &VolumeServerConfig, +) -> Result, Box> { + if config.https_cert_file.is_empty() || config.https_key_file.is_empty() { + return Ok(None); + } + + let effective_policy = effective_http_tls_policy(&config.https_ca_file, &config.tls_policy); + let tls_config = match build_rustls_server_config( + &config.https_cert_file, + &config.https_key_file, + &config.https_ca_file, + &effective_policy, + ) { + Ok(tls_config) => tls_config, + Err(e) + if !config.https_ca_file.is_empty() && tls_policy_is_configured(&config.tls_policy) => + { + warn!( + "Failed to apply HTTP TLS policy '{}', falling back to default rustls policy", + e + ); + build_rustls_server_config( + &config.https_cert_file, + &config.https_key_file, + &config.https_ca_file, + &TlsPolicy::default(), + )? + } + Err(e) => return Err(e.into()), + }; + + Ok(Some(TlsAcceptor::from(Arc::new(tls_config)))) +} + +fn build_grpc_server_builder() -> tonic::transport::Server { + tonic::transport::Server::builder() + .http2_keepalive_interval(Some(GRPC_KEEPALIVE_INTERVAL)) + .http2_keepalive_timeout(Some(GRPC_KEEPALIVE_TIMEOUT)) + .max_concurrent_streams(Some(GRPC_MAX_CONCURRENT_STREAMS)) + .initial_stream_window_size(Some(GRPC_INITIAL_WINDOW_SIZE)) + .initial_connection_window_size(Some(GRPC_INITIAL_WINDOW_SIZE)) + .http2_max_header_list_size(Some(GRPC_MAX_HEADER_LIST_SIZE)) +} + +fn build_volume_grpc_service( + grpc_service: VolumeGrpcService, +) -> VolumeServerServer { + VolumeServerServer::new(grpc_service) + .max_decoding_message_size(GRPC_MAX_MESSAGE_SIZE) + .max_encoding_message_size(GRPC_MAX_MESSAGE_SIZE) +} + +fn apply_idle_timeout( + stream: S, + idle_timeout: std::time::Duration, +) -> std::pin::Pin>> +where + S: tokio::io::AsyncRead + tokio::io::AsyncWrite, +{ + let mut stream = tokio_io_timeout::TimeoutStream::new(stream); + if !idle_timeout.is_zero() { + stream.set_read_timeout(Some(idle_timeout)); + stream.set_write_timeout(Some(idle_timeout)); + } + Box::pin(stream) +} + +async fn run( + config: VolumeServerConfig, + cpu_profile: Option, +) -> Result<(), Box> { + // Initialize the store + let mut store = Store::new(config.index_type); + store.id = config.id.clone(); + store.ip = config.ip.clone(); + store.port = config.port; + store.grpc_port = config.grpc_port; + store.public_url = config.public_url.clone(); + store.data_center = config.data_center.clone(); + store.rack = config.rack.clone(); + + // Build shared state + let guard = Guard::new( + &config.white_list, + SigningKey(config.jwt_signing_key.clone()), + config.jwt_signing_expires_seconds, + SigningKey(config.jwt_read_signing_key.clone()), + config.jwt_read_signing_expires_seconds, + ); + let master_url = config.masters.first().cloned().unwrap_or_default(); + let self_url = format!("{}:{}", config.ip, config.port); + let (http_client, outgoing_http_scheme) = build_outgoing_http_client(&config)?; + let outgoing_grpc_tls = load_outgoing_grpc_tls(&config)?; + + let security_file = config.security_file.clone(); + let cli_white_list = config.white_list.clone(); + + let state = Arc::new(VolumeServerState { + store: RwLock::new(store), + guard: RwLock::new(guard), + is_stopping: RwLock::new(false), + maintenance: std::sync::atomic::AtomicBool::new(false), + state_version: std::sync::atomic::AtomicU32::new(0), + concurrent_upload_limit: config.concurrent_upload_limit, + concurrent_download_limit: config.concurrent_download_limit, + inflight_upload_data_timeout: config.inflight_upload_data_timeout, + inflight_download_data_timeout: config.inflight_download_data_timeout, + inflight_upload_bytes: std::sync::atomic::AtomicI64::new(0), + inflight_download_bytes: std::sync::atomic::AtomicI64::new(0), + upload_notify: tokio::sync::Notify::new(), + download_notify: tokio::sync::Notify::new(), + data_center: config.data_center.clone(), + rack: config.rack.clone(), + file_size_limit_bytes: config.file_size_limit_bytes, + maintenance_byte_per_second: config.maintenance_byte_per_second, + // Go sets isHeartbeating: true unconditionally at startup + is_heartbeating: std::sync::atomic::AtomicBool::new(true), + has_master: !config.masters.is_empty(), + pre_stop_seconds: config.pre_stop_seconds, + volume_state_notify: tokio::sync::Notify::new(), + write_queue: std::sync::OnceLock::new(), + s3_tier_registry: std::sync::RwLock::new( + seaweed_volume::remote_storage::s3_tier::S3TierRegistry::new(), + ), + read_mode: config.read_mode, + master_url, + master_urls: config.masters.clone(), + self_url, + http_client, + outgoing_http_scheme, + outgoing_grpc_tls, + metrics_runtime: std::sync::RwLock::new(RuntimeMetricsConfig::default()), + metrics_notify: tokio::sync::Notify::new(), + fix_jpg_orientation: config.fix_jpg_orientation, + has_slow_read: config.has_slow_read, + read_buffer_size_bytes: (config.read_buffer_size_mb.max(1) as usize) * 1024 * 1024, + security_file, + cli_white_list, + state_file_path: if config.folders.is_empty() { + String::new() + } else { + std::path::Path::new(&config.folders[0]) + .join("state.pb") + .to_string_lossy() + .into_owned() + }, + }); + + // Load persisted state from disk if it exists (matches Go's State.Load on startup) + if let Some(saved) = + seaweed_volume::server::grpc_server::load_state_file(&state.state_file_path) + { + state + .maintenance + .store(saved.maintenance, std::sync::atomic::Ordering::Relaxed); + state + .state_version + .store(saved.version, std::sync::atomic::Ordering::Relaxed); + } + + if !config.masters.is_empty() { + let hb_config = seaweed_volume::server::heartbeat::HeartbeatConfig { + ip: config.ip.clone(), + port: config.port, + grpc_port: config.grpc_port, + public_url: config.public_url.clone(), + data_center: config.data_center.clone(), + rack: config.rack.clone(), + master_addresses: config.masters.clone(), + pulse_seconds: 5, + }; + seaweed_volume::server::heartbeat::prime_master_configuration(&hb_config, &state).await; + } + + { + let mut store = state.store.write().unwrap(); + for (i, dir) in config.folders.iter().enumerate() { + let idx_dir = if config.idx_folder.is_empty() { + dir.as_str() + } else { + config.idx_folder.as_str() + }; + let max_volumes = config.folder_max_limits[i]; + let disk_type = DiskType::from_string(&config.disk_types[i]); + let tags = config.folder_tags.get(i).cloned().unwrap_or_default(); + + info!( + "Adding storage location: {} (max_volumes={}, disk_type={:?})", + dir, max_volumes, disk_type + ); + let min_free_space = config.min_free_spaces[i].clone(); + store + .add_location(dir, idx_dir, max_volumes, disk_type, min_free_space, tags) + .map_err(|e| format!("Failed to add storage location {}: {}", dir, e))?; + } + } + + // Initialize the batched write queue if enabled + if config.enable_write_queue { + info!("Batched write queue enabled"); + let wq = WriteQueue::new(state.clone(), 128); + let _ = state.write_queue.set(wq); + } + + // Set initial metric gauges for concurrent limits and max volumes + metrics::CONCURRENT_UPLOAD_LIMIT.set(state.concurrent_upload_limit); + metrics::CONCURRENT_DOWNLOAD_LIMIT.set(state.concurrent_download_limit); + { + let store = state.store.read().unwrap(); + let mut max_vols: i64 = 0; + for loc in &store.locations { + max_vols += loc + .max_volume_count + .load(std::sync::atomic::Ordering::Relaxed) as i64; + } + metrics::MAX_VOLUMES.set(max_vols); + } + + // Run initial disk space check + { + let store = state.store.read().unwrap(); + for loc in &store.locations { + loc.check_disk_space(); + } + } + + // Spawn background disk space monitor (checks every 60 seconds) + { + let monitor_state = state.clone(); + tokio::spawn(async move { + let mut interval = tokio::time::interval(std::time::Duration::from_secs(60)); + interval.tick().await; // skip the first immediate tick + loop { + interval.tick().await; + let store = monitor_state.store.read().unwrap(); + for loc in &store.locations { + loc.check_disk_space(); + } + } + }); + } + + // Build HTTP routers + let mut admin_router = seaweed_volume::server::volume_server::build_admin_router_with_ui( + state.clone(), + config.ui_enabled, + ); + if config.pprof { + admin_router = admin_router.merge(build_debug_router()); + } + let admin_addr = format!("{}:{}", config.bind_ip, config.port); + + let public_port = config.public_port; + let needs_public = public_port != config.port; + let http_idle_timeout = std::time::Duration::from_secs(config.idle_timeout as u64); + + let grpc_addr = format!("{}:{}", config.bind_ip, config.grpc_port); + let grpc_tls_acceptor = build_grpc_server_tls_acceptor( + &config.grpc_cert_file, + &config.grpc_key_file, + &config.grpc_ca_file, + &config.tls_policy, + &config.grpc_allowed_wildcard_domain, + &config.grpc_volume_allowed_common_names, + ); + + info!("Starting HTTP server on {}", admin_addr); + info!("Starting gRPC server on {}", grpc_addr); + if needs_public { + info!( + "Starting public HTTP server on {}:{}", + config.bind_ip, public_port + ); + } + + // Set up graceful shutdown via SIGINT/SIGTERM using broadcast channel + let (shutdown_tx, _) = tokio::sync::broadcast::channel::<()>(1); + + let state_shutdown = state.clone(); + let shutdown_tx_clone = shutdown_tx.clone(); + tokio::spawn(async move { + let ctrl_c = tokio::signal::ctrl_c(); + #[cfg(unix)] + { + let mut sigterm = + tokio::signal::unix::signal(tokio::signal::unix::SignalKind::terminate()) + .expect("Failed to install SIGTERM handler"); + tokio::select! { + _ = ctrl_c => { info!("Received SIGINT, shutting down..."); } + _ = sigterm.recv() => { info!("Received SIGTERM, shutting down..."); } + } + } + #[cfg(not(unix))] + { + ctrl_c.await.ok(); + info!("Received shutdown signal..."); + } + *state_shutdown.is_stopping.write().unwrap() = true; + // Wake heartbeat loop immediately so it sends deregister heartbeat + // before the pre_stop delay (matches Go: StopHeartbeat() closes stopChan + // before sleeping preStopSeconds) + state_shutdown.volume_state_notify.notify_one(); + + // Graceful drain: wait pre_stop_seconds before shutting down servers + let pre_stop = state_shutdown.pre_stop_seconds; + if pre_stop > 0 { + info!("Pre-stop: waiting {} seconds before shutdown...", pre_stop); + tokio::time::sleep(std::time::Duration::from_secs(pre_stop as u64)).await; + } + + let _ = shutdown_tx_clone.send(()); + }); + + // Set up SIGHUP handler for config reload (mirrors Go's grace.OnReload) + #[cfg(unix)] + { + let state_reload = state.clone(); + tokio::spawn(async move { + let mut sighup = tokio::signal::unix::signal(tokio::signal::unix::SignalKind::hangup()) + .expect("Failed to install SIGHUP handler"); + loop { + sighup.recv().await; + info!("Received SIGHUP, reloading..."); + + // 1. Load new volumes from disk (Go's LoadNewVolumes) + { + info!("Loading new volume ids..."); + let mut store = state_reload.store.write().unwrap(); + store.load_new_volumes(); + } + + // 2. Reload security config (Go's Reload) + { + info!("Reloading security config..."); + let sec = config::parse_security_config(&state_reload.security_file); + let mut whitelist = state_reload.cli_white_list.clone(); + whitelist.extend(sec.guard_white_list.iter().cloned()); + let mut guard = state_reload.guard.write().unwrap(); + guard.update_whitelist(&whitelist); + } + + // Trigger heartbeat to report new volumes + state_reload.volume_state_notify.notify_one(); + info!("SIGHUP reload complete"); + } + }); + } + + // Build optional TLS acceptor for HTTPS + let https_tls_acceptor = + if !config.https_cert_file.is_empty() && !config.https_key_file.is_empty() { + info!( + "TLS enabled for HTTP server (cert={}, key={})", + config.https_cert_file, config.https_key_file + ); + build_http_server_tls_acceptor(&config)? + } else { + None + }; + + // Spawn all servers concurrently + let admin_listener = tokio::net::TcpListener::bind(&admin_addr) + .await + .unwrap_or_else(|e| panic!("Failed to bind HTTP to {}: {}", admin_addr, e)); + let scheme = if https_tls_acceptor.is_some() { + "HTTPS" + } else { + "HTTP" + }; + info!("{} server listening on {}", scheme, admin_addr); + + let http_handle = if let Some(tls_acceptor) = https_tls_acceptor.clone() { + let mut shutdown_rx = shutdown_tx.subscribe(); + tokio::spawn(async move { + serve_https( + admin_listener, + admin_router, + tls_acceptor, + http_idle_timeout, + async move { + let _ = shutdown_rx.recv().await; + }, + ) + .await; + }) + } else { + let mut shutdown_rx = shutdown_tx.subscribe(); + tokio::spawn(async move { + serve_http( + admin_listener, + admin_router, + http_idle_timeout, + async move { + let _ = shutdown_rx.recv().await; + }, + ) + .await; + }) + }; + + let grpc_handle = { + let grpc_state = state.clone(); + let grpc_addr = grpc_addr.clone(); + let grpc_tls_acceptor = grpc_tls_acceptor.clone(); + let mut shutdown_rx = shutdown_tx.subscribe(); + tokio::spawn(async move { + let addr = grpc_addr.parse().expect("Invalid gRPC address"); + let grpc_service = VolumeGrpcService { + state: grpc_state.clone(), + }; + if let Some(tls_acceptor) = grpc_tls_acceptor { + let listener = tokio::net::TcpListener::bind(&grpc_addr) + .await + .unwrap_or_else(|e| panic!("Failed to bind gRPC to {}: {}", grpc_addr, e)); + let incoming = grpc_tls_incoming(listener, tls_acceptor); + let reflection_v1 = tonic_reflection::server::Builder::configure() + .register_encoded_file_descriptor_set(seaweed_volume::pb::FILE_DESCRIPTOR_SET) + .build_v1() + .expect("Failed to build gRPC reflection v1 service"); + let reflection_v1alpha = tonic_reflection::server::Builder::configure() + .register_encoded_file_descriptor_set(seaweed_volume::pb::FILE_DESCRIPTOR_SET) + .build_v1alpha() + .expect("Failed to build gRPC reflection v1alpha service"); + info!("gRPC server listening on {} (TLS enabled)", addr); + if let Err(e) = build_grpc_server_builder() + .layer(GrpcRequestIdLayer) + .add_service(reflection_v1) + .add_service(reflection_v1alpha) + .add_service(build_volume_grpc_service(grpc_service)) + .serve_with_incoming_shutdown(incoming, async move { + let _ = shutdown_rx.recv().await; + }) + .await + { + error!("gRPC server error: {}", e); + } + } else { + let reflection_v1 = tonic_reflection::server::Builder::configure() + .register_encoded_file_descriptor_set(seaweed_volume::pb::FILE_DESCRIPTOR_SET) + .build_v1() + .expect("Failed to build gRPC reflection v1 service"); + let reflection_v1alpha = tonic_reflection::server::Builder::configure() + .register_encoded_file_descriptor_set(seaweed_volume::pb::FILE_DESCRIPTOR_SET) + .build_v1alpha() + .expect("Failed to build gRPC reflection v1alpha service"); + info!("gRPC server listening on {}", addr); + if let Err(e) = build_grpc_server_builder() + .layer(GrpcRequestIdLayer) + .add_service(reflection_v1) + .add_service(reflection_v1alpha) + .add_service(build_volume_grpc_service(grpc_service)) + .serve_with_shutdown(addr, async move { + let _ = shutdown_rx.recv().await; + }) + .await + { + error!("gRPC server error: {}", e); + } + } + }) + }; + + // Spawn heartbeat to master (if master addresses are configured) + let heartbeat_handle = { + let master_addrs = config.masters.clone(); + if !master_addrs.is_empty() { + let hb_config = seaweed_volume::server::heartbeat::HeartbeatConfig { + ip: config.ip.clone(), + port: config.port, + grpc_port: config.grpc_port, + public_url: config.public_url.clone(), + data_center: config.data_center.clone(), + rack: config.rack.clone(), + master_addresses: master_addrs.clone(), + pulse_seconds: 5, + }; + let hb_shutdown = shutdown_tx.subscribe(); + let hb_state = state.clone(); + info!("Will send heartbeats to master: {:?}", master_addrs); + Some(tokio::spawn(async move { + seaweed_volume::server::heartbeat::run_heartbeat_with_state( + hb_config, + hb_state, + hb_shutdown, + ) + .await; + })) + } else { + None + } + }; + + let public_handle = if needs_public { + let public_router = + seaweed_volume::server::volume_server::build_public_router(state.clone()); + let public_addr = format!("{}:{}", config.bind_ip, public_port); + let listener = tokio::net::TcpListener::bind(&public_addr) + .await + .unwrap_or_else(|e| panic!("Failed to bind public HTTP to {}: {}", public_addr, e)); + info!("Public HTTP server listening on {}", public_addr); + let mut shutdown_rx = shutdown_tx.subscribe(); + Some(tokio::spawn(async move { + serve_http(listener, public_router, http_idle_timeout, async move { + let _ = shutdown_rx.recv().await; + }) + .await; + })) + } else { + None + }; + + let metrics_handle = if config.metrics_port > 0 { + let metrics_router = build_metrics_router(); + let metrics_addr = format!("{}:{}", config.metrics_ip, config.metrics_port); + info!("Metrics server listening on {}", metrics_addr); + let listener = tokio::net::TcpListener::bind(&metrics_addr) + .await + .unwrap_or_else(|e| panic!("Failed to bind metrics HTTP to {}: {}", metrics_addr, e)); + let mut shutdown_rx = shutdown_tx.subscribe(); + Some(tokio::spawn(async move { + if let Err(e) = axum::serve(listener, metrics_router) + .with_graceful_shutdown(async move { + let _ = shutdown_rx.recv().await; + }) + .await + { + error!("Metrics HTTP server error: {}", e); + } + })) + } else { + None + }; + + let debug_handle = if config.debug { + let debug_addr = format!("0.0.0.0:{}", config.debug_port); + info!("Debug pprof server listening on {}", debug_addr); + let listener = tokio::net::TcpListener::bind(&debug_addr) + .await + .unwrap_or_else(|e| panic!("Failed to bind debug HTTP to {}: {}", debug_addr, e)); + let debug_router = build_debug_router(); + let mut shutdown_rx = shutdown_tx.subscribe(); + Some(tokio::spawn(async move { + if let Err(e) = axum::serve(listener, debug_router) + .with_graceful_shutdown(async move { + let _ = shutdown_rx.recv().await; + }) + .await + { + error!("Debug HTTP server error: {}", e); + } + })) + } else { + None + }; + + let metrics_push_handle = { + let push_state = state.clone(); + let push_instance = format!("{}:{}", config.ip, config.port); + let push_shutdown = shutdown_tx.subscribe(); + Some(tokio::spawn(async move { + run_metrics_push_loop(push_state, push_instance, push_shutdown).await; + })) + }; + + // Wait for all servers + let _ = http_handle.await; + let _ = grpc_handle.await; + if let Some(h) = public_handle { + let _ = h.await; + } + if let Some(h) = metrics_handle { + let _ = h.await; + } + if let Some(h) = debug_handle { + let _ = h.await; + } + if let Some(h) = heartbeat_handle { + let _ = h.await; + } + if let Some(h) = metrics_push_handle { + let _ = h.await; + } + + // Close all volumes (flush and release file handles) matching Go's Shutdown() + state.store.write().unwrap().close(); + + if let Some(cpu_profile) = cpu_profile { + cpu_profile.finish().map_err(std::io::Error::other)?; + } + + info!("Volume server stopped."); + Ok(()) +} + +async fn run_metrics_push_loop( + state: Arc, + instance: String, + mut shutdown_rx: tokio::sync::broadcast::Receiver<()>, +) { + loop { + let push_cfg = { state.metrics_runtime.read().unwrap().push_gateway.clone() }; + + if push_cfg.address.is_empty() || push_cfg.interval_seconds == 0 { + tokio::select! { + _ = state.metrics_notify.notified() => continue, + _ = shutdown_rx.recv() => return, + } + } + + if let Err(e) = metrics::push_metrics_once( + &state.http_client, + &push_cfg.address, + "volumeServer", + &instance, + ) + .await + { + info!("could not push metrics to {}: {}", push_cfg.address, e); + } + + let interval = std::time::Duration::from_secs(push_cfg.interval_seconds.max(1) as u64); + tokio::select! { + _ = tokio::time::sleep(interval) => {} + _ = state.metrics_notify.notified() => {} + _ = shutdown_rx.recv() => return, + } + } +} + +fn grpc_tls_incoming( + listener: tokio::net::TcpListener, + tls_acceptor: TlsAcceptor, +) -> impl tokio_stream::Stream< + Item = Result, std::io::Error>, +> { + async_stream::stream! { + loop { + match listener.accept().await { + Ok((tcp_stream, remote_addr)) => match tls_acceptor.accept(tcp_stream).await { + Ok(tls_stream) => yield Ok(tls_stream), + Err(e) => { + tracing::debug!("gRPC TLS handshake failed from {}: {}", remote_addr, e); + } + }, + Err(e) => { + yield Err(e); + break; + } + } + } + } +} + +/// Serve an axum Router over TLS using tokio-rustls. +/// Accepts TCP connections, performs TLS handshake, then serves HTTP over the encrypted stream. +async fn serve_http( + tcp_listener: tokio::net::TcpListener, + app: axum::Router, + idle_timeout: std::time::Duration, + shutdown_signal: F, +) where + F: std::future::Future + Send + 'static, +{ + use hyper_util::rt::{TokioExecutor, TokioIo}; + use hyper_util::server::conn::auto::Builder as HttpBuilder; + use hyper_util::service::TowerToHyperService; + use tower::Service; + + let mut make_svc = app.into_make_service_with_connect_info::(); + + tokio::pin!(shutdown_signal); + + loop { + tokio::select! { + _ = &mut shutdown_signal => { + info!("HTTP server shutting down"); + break; + } + result = tcp_listener.accept() => { + match result { + Ok((tcp_stream, remote_addr)) => { + let tower_svc = make_svc.call(remote_addr).await.expect("infallible"); + let hyper_svc = TowerToHyperService::new(tower_svc); + tokio::spawn(async move { + let io = TokioIo::new(apply_idle_timeout(tcp_stream, idle_timeout)); + let builder = HttpBuilder::new(TokioExecutor::new()); + if let Err(e) = builder.serve_connection(io, hyper_svc).await { + tracing::debug!("HTTP connection error: {}", e); + } + }); + } + Err(e) => { + error!("Failed to accept TCP connection: {}", e); + } + } + } + } + } +} + +async fn serve_https( + tcp_listener: tokio::net::TcpListener, + app: axum::Router, + tls_acceptor: TlsAcceptor, + idle_timeout: std::time::Duration, + shutdown_signal: F, +) where + F: std::future::Future + Send + 'static, +{ + use hyper_util::rt::{TokioExecutor, TokioIo}; + use hyper_util::server::conn::auto::Builder as HttpBuilder; + use hyper_util::service::TowerToHyperService; + use tower::Service; + + let mut make_svc = app.into_make_service_with_connect_info::(); + + tokio::pin!(shutdown_signal); + + loop { + tokio::select! { + _ = &mut shutdown_signal => { + info!("HTTPS server shutting down"); + break; + } + result = tcp_listener.accept() => { + match result { + Ok((tcp_stream, remote_addr)) => { + let tls_acceptor = tls_acceptor.clone(); + let tower_svc = make_svc.call(remote_addr).await.expect("infallible"); + let hyper_svc = TowerToHyperService::new(tower_svc); + tokio::spawn(async move { + match tls_acceptor.accept(tcp_stream).await { + Ok(tls_stream) => { + let io = TokioIo::new(apply_idle_timeout(tls_stream, idle_timeout)); + let builder = HttpBuilder::new(TokioExecutor::new()); + if let Err(e) = builder.serve_connection(io, hyper_svc).await { + tracing::debug!("HTTPS connection error: {}", e); + } + } + Err(e) => { + tracing::debug!("TLS handshake failed: {}", e); + } + } + }); + } + Err(e) => { + error!("Failed to accept TCP connection: {}", e); + } + } + } + } + } +} + +#[cfg(test)] +mod tests { + use super::{ + build_grpc_server_tls_acceptor, effective_http_tls_policy, tls_policy_is_configured, + }; + use seaweed_volume::security::tls::TlsPolicy; + + fn write_pem(dir: &tempfile::TempDir, name: &str, body: &str) -> String { + let path = dir.path().join(name); + std::fs::write(&path, body).unwrap(); + path.to_string_lossy().into_owned() + } + + #[test] + fn test_grpc_server_tls_requires_ca() { + let dir = tempfile::tempdir().unwrap(); + let cert = write_pem( + &dir, + "server.crt", + "-----BEGIN CERTIFICATE-----\nZmFrZQ==\n-----END CERTIFICATE-----\n", + ); + let key = write_pem( + &dir, + "server.key", + "-----BEGIN PRIVATE KEY-----\nZmFrZQ==\n-----END PRIVATE KEY-----\n", + ); + + assert!( + build_grpc_server_tls_acceptor(&cert, &key, "", &TlsPolicy::default(), "", &[]) + .is_none() + ); + } + + #[test] + fn test_grpc_server_tls_returns_none_when_files_are_missing() { + assert!(build_grpc_server_tls_acceptor( + "/missing/server.crt", + "/missing/server.key", + "/missing/ca.crt", + &TlsPolicy::default(), + "", + &[], + ) + .is_none()); + } + + #[test] + fn test_grpc_server_tls_disables_on_unsupported_tls_policy() { + let dir = tempfile::tempdir().unwrap(); + let cert = write_pem( + &dir, + "server.crt", + "-----BEGIN CERTIFICATE-----\nZmFrZQ==\n-----END CERTIFICATE-----\n", + ); + let key = write_pem( + &dir, + "server.key", + "-----BEGIN PRIVATE KEY-----\nZmFrZQ==\n-----END PRIVATE KEY-----\n", + ); + let ca = write_pem( + &dir, + "ca.crt", + "-----BEGIN CERTIFICATE-----\nZmFrZQ==\n-----END CERTIFICATE-----\n", + ); + + assert!(build_grpc_server_tls_acceptor( + &cert, + &key, + &ca, + &TlsPolicy { + min_version: "TLS 1.0".to_string(), + max_version: "TLS 1.1".to_string(), + cipher_suites: String::new(), + }, + "", + &[], + ) + .is_none()); + } + + #[test] + fn test_effective_http_tls_policy_ignores_tls_policy_without_ca() { + let configured = TlsPolicy { + min_version: "TLS 1.3".to_string(), + max_version: "TLS 1.3".to_string(), + cipher_suites: "TLS_AES_128_GCM_SHA256".to_string(), + }; + assert_eq!( + effective_http_tls_policy("", &configured), + TlsPolicy::default() + ); + assert_eq!( + effective_http_tls_policy("/etc/seaweedfs/http-ca.pem", &configured), + configured + ); + } + + #[test] + fn test_tls_policy_is_configured_detects_non_empty_fields() { + assert!(!tls_policy_is_configured(&TlsPolicy::default())); + assert!(tls_policy_is_configured(&TlsPolicy { + min_version: "TLS 1.2".to_string(), + max_version: String::new(), + cipher_suites: String::new(), + })); + } +} diff --git a/seaweed-volume/src/metrics.rs b/seaweed-volume/src/metrics.rs new file mode 100644 index 000000000..572786949 --- /dev/null +++ b/seaweed-volume/src/metrics.rs @@ -0,0 +1,448 @@ +//! Prometheus metrics for the volume server. +//! +//! Mirrors the Go SeaweedFS volume server metrics. + +use prometheus::{ + self, Encoder, GaugeVec, HistogramOpts, HistogramVec, IntCounterVec, IntGauge, IntGaugeVec, + Opts, Registry, TextEncoder, +}; +use std::sync::Once; + +use crate::version; + +#[derive(Clone, Debug, Default, PartialEq, Eq)] +pub struct PushGatewayConfig { + pub address: String, + pub interval_seconds: u32, +} + +lazy_static::lazy_static! { + pub static ref REGISTRY: Registry = Registry::new(); + + // ---- Request metrics (Go: VolumeServerRequestCounter, VolumeServerRequestHistogram) ---- + + /// Request counter with labels `type` (HTTP method) and `code` (HTTP status). + pub static ref REQUEST_COUNTER: IntCounterVec = IntCounterVec::new( + Opts::new("SeaweedFS_volumeServer_request_total", "Volume server requests"), + &["type", "code"], + ).expect("metric can be created"); + + /// Request duration histogram with label `type` (HTTP method). + pub static ref REQUEST_DURATION: HistogramVec = HistogramVec::new( + HistogramOpts::new( + "SeaweedFS_volumeServer_request_seconds", + "Volume server request duration in seconds", + ).buckets(exponential_buckets(0.0001, 2.0, 24)), + &["type"], + ).expect("metric can be created"); + + // ---- Handler counters (Go: VolumeServerHandlerCounter) ---- + + /// Handler-level operation counter with label `type`. + pub static ref HANDLER_COUNTER: IntCounterVec = IntCounterVec::new( + Opts::new("SeaweedFS_volumeServer_handler_total", "Volume server handler counters"), + &["type"], + ).expect("metric can be created"); + + // ---- Vacuuming metrics (Go: VolumeServerVacuuming*) ---- + + /// Vacuuming compact counter with label `success` (true/false). + pub static ref VACUUMING_COMPACT_COUNTER: IntCounterVec = IntCounterVec::new( + Opts::new("SeaweedFS_volumeServer_vacuuming_compact_count", "Counter of volume vacuuming Compact counter"), + &["success"], + ).expect("metric can be created"); + + /// Vacuuming commit counter with label `success` (true/false). + pub static ref VACUUMING_COMMIT_COUNTER: IntCounterVec = IntCounterVec::new( + Opts::new("SeaweedFS_volumeServer_vacuuming_commit_count", "Counter of volume vacuuming commit counter"), + &["success"], + ).expect("metric can be created"); + + /// Vacuuming duration histogram with label `type` (compact/commit). + pub static ref VACUUMING_HISTOGRAM: HistogramVec = HistogramVec::new( + HistogramOpts::new( + "SeaweedFS_volumeServer_vacuuming_seconds", + "Volume vacuuming duration in seconds", + ).buckets(exponential_buckets(0.0001, 2.0, 24)), + &["type"], + ).expect("metric can be created"); + + // ---- Volume gauges (Go: VolumeServerVolumeGauge, VolumeServerReadOnlyVolumeGauge) ---- + + /// Volumes per collection and type (volume/ec_shards). + pub static ref VOLUME_GAUGE: GaugeVec = GaugeVec::new( + Opts::new("SeaweedFS_volumeServer_volumes", "Number of volumes"), + &["collection", "type"], + ).expect("metric can be created"); + + /// Read-only volumes per collection and type. + pub static ref READ_ONLY_VOLUME_GAUGE: GaugeVec = GaugeVec::new( + Opts::new("SeaweedFS_volumeServer_read_only_volumes", "Number of read-only volumes."), + &["collection", "type"], + ).expect("metric can be created"); + + /// Maximum number of volumes this server can hold. + pub static ref MAX_VOLUMES: IntGauge = IntGauge::new( + "SeaweedFS_volumeServer_max_volumes", + "Maximum number of volumes", + ).expect("metric can be created"); + + // ---- Disk size gauges (Go: VolumeServerDiskSizeGauge) ---- + + /// Actual disk size used by volumes per collection and type (normal/deleted_bytes/ec). + pub static ref DISK_SIZE_GAUGE: GaugeVec = GaugeVec::new( + Opts::new("SeaweedFS_volumeServer_total_disk_size", "Actual disk size used by volumes"), + &["collection", "type"], + ).expect("metric can be created"); + + // ---- Resource gauges (Go: VolumeServerResourceGauge) ---- + + /// Disk resource usage per directory and type (all/used/free/avail). + pub static ref RESOURCE_GAUGE: GaugeVec = GaugeVec::new( + Opts::new("SeaweedFS_volumeServer_resource", "Server resource usage"), + &["name", "type"], + ).expect("metric can be created"); + + // ---- In-flight gauges (Go: VolumeServerInFlightRequestsGauge, InFlightDownload/UploadSize) ---- + + /// In-flight requests per HTTP method. + pub static ref INFLIGHT_REQUESTS_GAUGE: IntGaugeVec = IntGaugeVec::new( + Opts::new("SeaweedFS_volumeServer_in_flight_requests", "Current number of in-flight requests being handled by volume server."), + &["type"], + ).expect("metric can be created"); + + /// Concurrent download limit in bytes. + pub static ref CONCURRENT_DOWNLOAD_LIMIT: IntGauge = IntGauge::new( + "SeaweedFS_volumeServer_concurrent_download_limit", + "Limit for total concurrent download size in bytes", + ).expect("metric can be created"); + + /// Concurrent upload limit in bytes. + pub static ref CONCURRENT_UPLOAD_LIMIT: IntGauge = IntGauge::new( + "SeaweedFS_volumeServer_concurrent_upload_limit", + "Limit for total concurrent upload size in bytes", + ).expect("metric can be created"); + + /// Current in-flight download bytes. + pub static ref INFLIGHT_DOWNLOAD_SIZE: IntGauge = IntGauge::new( + "SeaweedFS_volumeServer_in_flight_download_size", + "In flight total download size.", + ).expect("metric can be created"); + + /// Current in-flight upload bytes. + pub static ref INFLIGHT_UPLOAD_SIZE: IntGauge = IntGauge::new( + "SeaweedFS_volumeServer_in_flight_upload_size", + "In flight total upload size.", + ).expect("metric can be created"); + + // ---- Legacy aliases for backward compat with existing code ---- + + /// Total number of volumes on this server (flat gauge). + pub static ref VOLUMES_TOTAL: IntGauge = IntGauge::new( + "volume_server_volumes_total", + "Total number of volumes", + ).expect("metric can be created"); + + /// Disk size in bytes per directory. + pub static ref DISK_SIZE_BYTES: IntGaugeVec = IntGaugeVec::new( + Opts::new("volume_server_disk_size_bytes", "Disk size in bytes"), + &["dir"], + ).expect("metric can be created"); + + /// Disk free bytes per directory. + pub static ref DISK_FREE_BYTES: IntGaugeVec = IntGaugeVec::new( + Opts::new("volume_server_disk_free_bytes", "Disk free space in bytes"), + &["dir"], + ).expect("metric can be created"); + + /// Current number of in-flight requests (flat gauge). + pub static ref INFLIGHT_REQUESTS: IntGauge = IntGauge::new( + "volume_server_inflight_requests", + "Current number of in-flight requests", + ).expect("metric can be created"); + + /// Total number of files stored across all volumes. + pub static ref VOLUME_FILE_COUNT: IntGauge = IntGauge::new( + "volume_server_volume_file_count", + "Total number of files stored across all volumes", + ).expect("metric can be created"); + + // ---- Build info (Go: BuildInfo) ---- + + /// Build information gauge, always set to 1. Matches Go: + /// Namespace="SeaweedFS", Subsystem="build", Name="info", + /// labels: version, commit, sizelimit, goos, goarch. + pub static ref BUILD_INFO: GaugeVec = GaugeVec::new( + Opts::new("SeaweedFS_build_info", "A metric with a constant '1' value labeled by version, commit, sizelimit, goos, and goarch from which SeaweedFS was built."), + &["version", "commit", "sizelimit", "goos", "goarch"], + ).expect("metric can be created"); +} + +/// Generate exponential bucket boundaries for histograms. +fn exponential_buckets(start: f64, factor: f64, count: usize) -> Vec { + let mut buckets = Vec::with_capacity(count); + let mut val = start; + for _ in 0..count { + buckets.push(val); + val *= factor; + } + buckets +} + +// Handler counter type constants (matches Go's metrics_names.go). +pub const WRITE_TO_LOCAL_DISK: &str = "writeToLocalDisk"; +pub const WRITE_TO_REPLICAS: &str = "writeToReplicas"; +pub const DOWNLOAD_LIMIT_COND: &str = "downloadLimitCondition"; +pub const UPLOAD_LIMIT_COND: &str = "uploadLimitCondition"; +pub const READ_PROXY_REQ: &str = "readProxyRequest"; +pub const READ_REDIRECT_REQ: &str = "readRedirectRequest"; +pub const EMPTY_READ_PROXY_LOC: &str = "emptyReadProxyLocaction"; +pub const FAILED_READ_PROXY_REQ: &str = "failedReadProxyRequest"; + +// Error metric name constants. +pub const ERROR_SIZE_MISMATCH_OFFSET_SIZE: &str = "errorSizeMismatchOffsetSize"; +pub const ERROR_SIZE_MISMATCH: &str = "errorSizeMismatch"; +pub const ERROR_CRC: &str = "errorCRC"; +pub const ERROR_INDEX_OUT_OF_RANGE: &str = "errorIndexOutOfRange"; +pub const ERROR_GET_NOT_FOUND: &str = "errorGetNotFound"; +pub const ERROR_GET_INTERNAL: &str = "errorGetInternal"; +pub const ERROR_WRITE_TO_LOCAL_DISK: &str = "errorWriteToLocalDisk"; +pub const ERROR_UNMARSHAL_PAIRS: &str = "errorUnmarshalPairs"; +pub const ERROR_WRITE_TO_REPLICAS: &str = "errorWriteToReplicas"; + +// Go volume heartbeat metric label values. +pub const READ_ONLY_LABEL_IS_READ_ONLY: &str = "IsReadOnly"; +pub const READ_ONLY_LABEL_NO_WRITE_OR_DELETE: &str = "noWriteOrDelete"; +pub const READ_ONLY_LABEL_NO_WRITE_CAN_DELETE: &str = "noWriteCanDelete"; +pub const READ_ONLY_LABEL_IS_DISK_SPACE_LOW: &str = "isDiskSpaceLow"; +pub const DISK_SIZE_LABEL_NORMAL: &str = "normal"; +pub const DISK_SIZE_LABEL_DELETED_BYTES: &str = "deleted_bytes"; +pub const DISK_SIZE_LABEL_EC: &str = "ec"; + +static REGISTER_METRICS: Once = Once::new(); + +/// Register all metrics with the custom registry. +/// Call this once at startup. +pub fn register_metrics() { + REGISTER_METRICS.call_once(|| { + let metrics: Vec> = vec![ + // New Go-compatible metrics + Box::new(REQUEST_COUNTER.clone()), + Box::new(REQUEST_DURATION.clone()), + Box::new(HANDLER_COUNTER.clone()), + Box::new(VACUUMING_COMPACT_COUNTER.clone()), + Box::new(VACUUMING_COMMIT_COUNTER.clone()), + Box::new(VACUUMING_HISTOGRAM.clone()), + Box::new(VOLUME_GAUGE.clone()), + Box::new(READ_ONLY_VOLUME_GAUGE.clone()), + Box::new(MAX_VOLUMES.clone()), + Box::new(DISK_SIZE_GAUGE.clone()), + Box::new(RESOURCE_GAUGE.clone()), + Box::new(INFLIGHT_REQUESTS_GAUGE.clone()), + Box::new(CONCURRENT_DOWNLOAD_LIMIT.clone()), + Box::new(CONCURRENT_UPLOAD_LIMIT.clone()), + Box::new(INFLIGHT_DOWNLOAD_SIZE.clone()), + Box::new(INFLIGHT_UPLOAD_SIZE.clone()), + // Legacy metrics + Box::new(VOLUMES_TOTAL.clone()), + Box::new(DISK_SIZE_BYTES.clone()), + Box::new(DISK_FREE_BYTES.clone()), + Box::new(INFLIGHT_REQUESTS.clone()), + Box::new(VOLUME_FILE_COUNT.clone()), + // Build info + Box::new(BUILD_INFO.clone()), + ]; + for m in metrics { + REGISTRY.register(m).expect("metric registered"); + } + + // Set build info gauge to 1 with version/commit/sizelimit/os/arch labels (matches Go). + BUILD_INFO + .with_label_values(&[ + version::version(), + version::commit(), + version::size_limit(), + std::env::consts::OS, + std::env::consts::ARCH, + ]) + .set(1.0); + }); +} + +/// Gather all metrics and encode them in Prometheus text exposition format. +pub fn gather_metrics() -> String { + let encoder = TextEncoder::new(); + let metric_families = REGISTRY.gather(); + let mut buffer = Vec::new(); + encoder + .encode(&metric_families, &mut buffer) + .expect("encoding metrics"); + String::from_utf8(buffer).expect("metrics are valid UTF-8") +} + +pub fn delete_collection_metrics(collection: &str) { + // Mirrors Go's DeletePartialMatch(prometheus.Labels{"collection": collection}) + // which removes ALL metric entries matching the collection label, regardless + // of other label values (like "type"). We gather the metric families to discover + // all type values dynamically, matching Go's partial-match behavior. + delete_partial_match_collection(&VOLUME_GAUGE, collection); + delete_partial_match_collection(&READ_ONLY_VOLUME_GAUGE, collection); + delete_partial_match_collection(&DISK_SIZE_GAUGE, collection); +} + +/// Remove all metric entries from a GaugeVec where the "collection" label matches. +/// This emulates Go's `DeletePartialMatch(prometheus.Labels{"collection": collection})`. +fn delete_partial_match_collection(gauge: &GaugeVec, collection: &str) { + use prometheus::core::Collector; + let families = gauge.collect(); + for family in &families { + for metric in family.get_metric() { + let labels = metric.get_label(); + let mut matches_collection = false; + let mut type_value = None; + for label in labels { + if label.get_name() == "collection" && label.get_value() == collection { + matches_collection = true; + } + if label.get_name() == "type" { + type_value = Some(label.get_value().to_string()); + } + } + if matches_collection { + if let Some(ref tv) = type_value { + let _ = gauge.remove_label_values(&[collection, tv]); + } + } + } + } +} + +pub fn build_pushgateway_url(address: &str, job: &str, instance: &str) -> String { + let base = if address.starts_with("http://") || address.starts_with("https://") { + address.to_string() + } else { + format!("http://{}", address) + }; + let base = base.trim_end_matches('/'); + format!("{}/metrics/job/{}/instance/{}", base, job, instance) +} + +pub async fn push_metrics_once( + client: &reqwest::Client, + address: &str, + job: &str, + instance: &str, +) -> Result<(), String> { + let url = build_pushgateway_url(address, job, instance); + let response = client + .put(&url) + .header( + reqwest::header::CONTENT_TYPE, + "text/plain; version=0.0.4; charset=utf-8", + ) + .body(gather_metrics()) + .send() + .await + .map_err(|e| format!("push metrics request failed: {}", e))?; + + if response.status().is_success() { + Ok(()) + } else { + Err(format!( + "push metrics failed with status {}", + response.status() + )) + } +} + +#[cfg(test)] +mod tests { + use super::*; + use axum::{routing::put, Router}; + use std::sync::{Arc, Mutex}; + + #[test] + fn test_gather_metrics_returns_text() { + register_metrics(); + REQUEST_COUNTER.with_label_values(&["GET", "200"]).inc(); + let output = gather_metrics(); + assert!(output.contains("SeaweedFS_volumeServer_request_total")); + } + + #[test] + fn test_build_pushgateway_url() { + assert_eq!( + build_pushgateway_url("localhost:9091", "volumeServer", "test-instance"), + "http://localhost:9091/metrics/job/volumeServer/instance/test-instance" + ); + assert_eq!( + build_pushgateway_url("https://push.example", "volumeServer", "node-a"), + "https://push.example/metrics/job/volumeServer/instance/node-a" + ); + } + + #[tokio::test] + async fn test_push_metrics_once() { + register_metrics(); + + let captured = Arc::new(Mutex::new(None::)); + let captured_clone = captured.clone(); + + let app = Router::new().route( + "/metrics/job/volumeServer/instance/test-instance", + put(move |body: String| { + let captured = captured_clone.clone(); + async move { + *captured.lock().unwrap() = Some(body); + "ok" + } + }), + ); + + let listener = tokio::net::TcpListener::bind("127.0.0.1:0").await.unwrap(); + let addr = listener.local_addr().unwrap(); + let server = tokio::spawn(async move { + axum::serve(listener, app).await.unwrap(); + }); + + let client = reqwest::Client::new(); + push_metrics_once( + &client, + &format!("127.0.0.1:{}", addr.port()), + "volumeServer", + "test-instance", + ) + .await + .unwrap(); + + let body = captured.lock().unwrap().clone().unwrap(); + assert!(body.contains("SeaweedFS_volumeServer_request_total")); + + server.abort(); + } + + #[test] + fn test_delete_collection_metrics_removes_collection_labelsets() { + register_metrics(); + + VOLUME_GAUGE.with_label_values(&["pics", "volume"]).set(2.0); + VOLUME_GAUGE.with_label_values(&["pics", "ec_shards"]).set(3.0); + READ_ONLY_VOLUME_GAUGE + .with_label_values(&["pics", "volume"]) + .set(1.0); + DISK_SIZE_GAUGE + .with_label_values(&["pics", "normal"]) + .set(10.0); + DISK_SIZE_GAUGE + .with_label_values(&["pics", "deleted_bytes"]) + .set(4.0); + + delete_collection_metrics("pics"); + + let output = gather_metrics(); + assert!(!output.contains("collection=\"pics\",type=\"volume\"")); + assert!(!output.contains("collection=\"pics\",type=\"ec_shards\"")); + assert!(!output.contains("collection=\"pics\",type=\"normal\"")); + assert!(!output.contains("collection=\"pics\",type=\"deleted_bytes\"")); + } +} diff --git a/seaweed-volume/src/remote_storage/mod.rs b/seaweed-volume/src/remote_storage/mod.rs new file mode 100644 index 000000000..599333ede --- /dev/null +++ b/seaweed-volume/src/remote_storage/mod.rs @@ -0,0 +1,157 @@ +//! Remote storage backends for tiered storage support. +//! +//! Provides a trait-based abstraction over cloud storage providers (S3, GCS, Azure, etc.) +//! and a registry to create clients from protobuf RemoteConf messages. + +pub mod s3; +pub mod s3_tier; + +use crate::pb::remote_pb::{RemoteConf, RemoteStorageLocation}; + +/// Error type for remote storage operations. +#[derive(Debug, thiserror::Error)] +pub enum RemoteStorageError { + #[error("remote storage type {0} not found")] + TypeNotFound(String), + #[error("remote object not found: {0}")] + ObjectNotFound(String), + #[error("remote storage error: {0}")] + Other(String), + #[error("io error: {0}")] + Io(#[from] std::io::Error), +} + +/// Metadata about a remote file entry. +#[derive(Debug, Clone)] +pub struct RemoteEntry { + pub size: i64, + pub last_modified_at: i64, // Unix seconds + pub e_tag: String, + pub storage_name: String, +} + +/// Trait for remote storage clients. Matches Go's RemoteStorageClient interface. +#[async_trait::async_trait] +pub trait RemoteStorageClient: Send + Sync { + /// Read (part of) a file from remote storage. + async fn read_file( + &self, + loc: &RemoteStorageLocation, + offset: i64, + size: i64, + ) -> Result, RemoteStorageError>; + + /// Write a file to remote storage. + async fn write_file( + &self, + loc: &RemoteStorageLocation, + data: &[u8], + ) -> Result; + + /// Get metadata for a file in remote storage. + async fn stat_file( + &self, + loc: &RemoteStorageLocation, + ) -> Result; + + /// Delete a file from remote storage. + async fn delete_file(&self, loc: &RemoteStorageLocation) -> Result<(), RemoteStorageError>; + + /// List all buckets. + async fn list_buckets(&self) -> Result, RemoteStorageError>; + + /// The RemoteConf used to create this client. + fn remote_conf(&self) -> &RemoteConf; +} + +/// Create a new remote storage client from a RemoteConf. +pub fn make_remote_storage_client( + conf: &RemoteConf, +) -> Result, RemoteStorageError> { + match conf.r#type.as_str() { + // All S3-compatible backends use the same client with different credentials + "s3" | "wasabi" | "backblaze" | "aliyun" | "tencent" | "baidu" | "filebase" | "storj" + | "contabo" => { + let (access_key, secret_key, endpoint, region) = extract_s3_credentials(conf); + Ok(Box::new(s3::S3RemoteStorageClient::new( + conf.clone(), + &access_key, + &secret_key, + ®ion, + &endpoint, + conf.s3_force_path_style, + ))) + } + other => Err(RemoteStorageError::TypeNotFound(other.to_string())), + } +} + +/// Extract S3-compatible credentials from a RemoteConf based on its type. +fn extract_s3_credentials(conf: &RemoteConf) -> (String, String, String, String) { + match conf.r#type.as_str() { + "s3" => ( + conf.s3_access_key.clone(), + conf.s3_secret_key.clone(), + conf.s3_endpoint.clone(), + if conf.s3_region.is_empty() { + "us-east-1".to_string() + } else { + conf.s3_region.clone() + }, + ), + "wasabi" => ( + conf.wasabi_access_key.clone(), + conf.wasabi_secret_key.clone(), + conf.wasabi_endpoint.clone(), + conf.wasabi_region.clone(), + ), + "backblaze" => ( + conf.backblaze_key_id.clone(), + conf.backblaze_application_key.clone(), + conf.backblaze_endpoint.clone(), + conf.backblaze_region.clone(), + ), + "aliyun" => ( + conf.aliyun_access_key.clone(), + conf.aliyun_secret_key.clone(), + conf.aliyun_endpoint.clone(), + conf.aliyun_region.clone(), + ), + "tencent" => ( + conf.tencent_secret_id.clone(), + conf.tencent_secret_key.clone(), + conf.tencent_endpoint.clone(), + String::new(), + ), + "baidu" => ( + conf.baidu_access_key.clone(), + conf.baidu_secret_key.clone(), + conf.baidu_endpoint.clone(), + conf.baidu_region.clone(), + ), + "filebase" => ( + conf.filebase_access_key.clone(), + conf.filebase_secret_key.clone(), + conf.filebase_endpoint.clone(), + String::new(), + ), + "storj" => ( + conf.storj_access_key.clone(), + conf.storj_secret_key.clone(), + conf.storj_endpoint.clone(), + String::new(), + ), + "contabo" => ( + conf.contabo_access_key.clone(), + conf.contabo_secret_key.clone(), + conf.contabo_endpoint.clone(), + conf.contabo_region.clone(), + ), + _ => ( + conf.s3_access_key.clone(), + conf.s3_secret_key.clone(), + conf.s3_endpoint.clone(), + conf.s3_region.clone(), + ), + } +} diff --git a/seaweed-volume/src/remote_storage/s3.rs b/seaweed-volume/src/remote_storage/s3.rs new file mode 100644 index 000000000..bac5485ae --- /dev/null +++ b/seaweed-volume/src/remote_storage/s3.rs @@ -0,0 +1,186 @@ +//! S3-compatible remote storage client. +//! +//! Works with AWS S3, MinIO, SeaweedFS S3, and all S3-compatible providers. + +use aws_sdk_s3::config::{BehaviorVersion, Credentials, Region}; +use aws_sdk_s3::primitives::ByteStream; +use aws_sdk_s3::Client; + +use super::{RemoteEntry, RemoteStorageClient, RemoteStorageError}; +use crate::pb::remote_pb::{RemoteConf, RemoteStorageLocation}; + +/// S3-compatible remote storage client. +pub struct S3RemoteStorageClient { + client: Client, + conf: RemoteConf, +} + +impl S3RemoteStorageClient { + /// Create a new S3 client from credentials and endpoint configuration. + pub fn new( + conf: RemoteConf, + access_key: &str, + secret_key: &str, + region: &str, + endpoint: &str, + force_path_style: bool, + ) -> Self { + let region = if region.is_empty() { + "us-east-1" + } else { + region + }; + + let credentials = Credentials::new( + access_key, + secret_key, + None, // session token + None, // expiry + "seaweedfs-volume", + ); + + let mut s3_config = aws_sdk_s3::Config::builder() + .behavior_version(BehaviorVersion::latest()) + .region(Region::new(region.to_string())) + .credentials_provider(credentials) + .force_path_style(force_path_style); + + if !endpoint.is_empty() { + s3_config = s3_config.endpoint_url(endpoint); + } + + let client = Client::from_conf(s3_config.build()); + + S3RemoteStorageClient { client, conf } + } +} + +#[async_trait::async_trait] +impl RemoteStorageClient for S3RemoteStorageClient { + async fn read_file( + &self, + loc: &RemoteStorageLocation, + offset: i64, + size: i64, + ) -> Result, RemoteStorageError> { + let key = loc.path.trim_start_matches('/'); + + let mut req = self.client.get_object().bucket(&loc.bucket).key(key); + + // Set byte range if specified + if size > 0 { + let end = offset + size - 1; + req = req.range(format!("bytes={}-{}", offset, end)); + } else if offset > 0 { + req = req.range(format!("bytes={}-", offset)); + } + + let resp = req.send().await.map_err(|e| { + let msg = format!("{}", e); + if msg.contains("NoSuchKey") || msg.contains("404") { + RemoteStorageError::ObjectNotFound(format!("{}/{}", loc.bucket, key)) + } else { + RemoteStorageError::Other(format!("s3 get object: {}", e)) + } + })?; + + let data = resp + .body + .collect() + .await + .map_err(|e| RemoteStorageError::Other(format!("s3 read body: {}", e)))?; + + Ok(data.into_bytes().to_vec()) + } + + async fn write_file( + &self, + loc: &RemoteStorageLocation, + data: &[u8], + ) -> Result { + let key = loc.path.trim_start_matches('/'); + + let resp = self + .client + .put_object() + .bucket(&loc.bucket) + .key(key) + .body(ByteStream::from(data.to_vec())) + .send() + .await + .map_err(|e| RemoteStorageError::Other(format!("s3 put object: {}", e)))?; + + Ok(RemoteEntry { + size: data.len() as i64, + last_modified_at: std::time::SystemTime::now() + .duration_since(std::time::UNIX_EPOCH) + .unwrap_or_default() + .as_secs() as i64, + e_tag: resp.e_tag().unwrap_or_default().to_string(), + storage_name: loc.name.clone(), + }) + } + + async fn stat_file( + &self, + loc: &RemoteStorageLocation, + ) -> Result { + let key = loc.path.trim_start_matches('/'); + + let resp = self + .client + .head_object() + .bucket(&loc.bucket) + .key(key) + .send() + .await + .map_err(|e| { + let msg = format!("{}", e); + if msg.contains("404") || msg.contains("NotFound") { + RemoteStorageError::ObjectNotFound(format!("{}/{}", loc.bucket, key)) + } else { + RemoteStorageError::Other(format!("s3 head object: {}", e)) + } + })?; + + Ok(RemoteEntry { + size: resp.content_length().unwrap_or(0), + last_modified_at: resp.last_modified().map(|t| t.secs()).unwrap_or(0), + e_tag: resp.e_tag().unwrap_or_default().to_string(), + storage_name: loc.name.clone(), + }) + } + + async fn delete_file(&self, loc: &RemoteStorageLocation) -> Result<(), RemoteStorageError> { + let key = loc.path.trim_start_matches('/'); + + self.client + .delete_object() + .bucket(&loc.bucket) + .key(key) + .send() + .await + .map_err(|e| RemoteStorageError::Other(format!("s3 delete object: {}", e)))?; + + Ok(()) + } + + async fn list_buckets(&self) -> Result, RemoteStorageError> { + let resp = self + .client + .list_buckets() + .send() + .await + .map_err(|e| RemoteStorageError::Other(format!("s3 list buckets: {}", e)))?; + + Ok(resp + .buckets() + .iter() + .filter_map(|b| b.name().map(String::from)) + .collect()) + } + + fn remote_conf(&self) -> &RemoteConf { + &self.conf + } +} diff --git a/seaweed-volume/src/remote_storage/s3_tier.rs b/seaweed-volume/src/remote_storage/s3_tier.rs new file mode 100644 index 000000000..be88adcf8 --- /dev/null +++ b/seaweed-volume/src/remote_storage/s3_tier.rs @@ -0,0 +1,514 @@ +//! S3-compatible tiered storage backend for volume .dat file upload/download. +//! +//! Provides multipart upload and concurrent download with progress callbacks, +//! matching the Go SeaweedFS S3 backend behavior. + +use std::collections::HashMap; +use std::future::Future; +use std::sync::{Arc, OnceLock, RwLock}; + +use aws_sdk_s3::config::{BehaviorVersion, Credentials, Region}; +use aws_sdk_s3::types::{CompletedMultipartUpload, CompletedPart}; +use aws_sdk_s3::Client; +use tokio::io::{AsyncReadExt, AsyncSeekExt, AsyncWriteExt}; +use tokio::sync::Semaphore; + +/// Concurrency limit for multipart upload/download (matches Go's s3manager). +const CONCURRENCY: usize = 5; + +/// Configuration for an S3 tier backend. +#[derive(Debug, Clone)] +pub struct S3TierConfig { + pub access_key: String, + pub secret_key: String, + pub region: String, + pub bucket: String, + pub endpoint: String, + pub storage_class: String, + pub force_path_style: bool, +} + +/// S3 tier backend for uploading/downloading volume .dat files. +pub struct S3TierBackend { + client: Client, + pub bucket: String, + pub storage_class: String, +} + +impl S3TierBackend { + /// Create a new S3 tier backend from configuration. + pub fn new(config: &S3TierConfig) -> Self { + let region = if config.region.is_empty() { + "us-east-1" + } else { + &config.region + }; + + let credentials = Credentials::new( + &config.access_key, + &config.secret_key, + None, + None, + "seaweedfs-volume-tier", + ); + + let mut s3_config = aws_sdk_s3::Config::builder() + .behavior_version(BehaviorVersion::latest()) + .region(Region::new(region.to_string())) + .credentials_provider(credentials) + .force_path_style(config.force_path_style); + + if !config.endpoint.is_empty() { + s3_config = s3_config.endpoint_url(&config.endpoint); + } + + let client = Client::from_conf(s3_config.build()); + + S3TierBackend { + client, + bucket: config.bucket.clone(), + storage_class: if config.storage_class.is_empty() { + "STANDARD_IA".to_string() + } else { + config.storage_class.clone() + }, + } + } + + /// Upload a local file to S3 using multipart upload with concurrent parts + /// and progress reporting. + /// + /// Returns (s3_key, file_size) on success. + /// The progress callback receives (bytes_uploaded, percentage). + /// Uses 64MB part size and 5 concurrent uploads (matches Go s3manager). + pub async fn upload_file( + &self, + file_path: &str, + progress_fn: F, + ) -> Result<(String, u64), String> + where + F: FnMut(i64, f32) + Send + Sync + 'static, + { + let key = uuid::Uuid::new_v4().to_string(); + + let metadata = tokio::fs::metadata(file_path) + .await + .map_err(|e| format!("failed to stat file {}: {}", file_path, e))?; + let file_size = metadata.len(); + + // Calculate part size: start at 64MB, scale up for very large files (matches Go) + let mut part_size: u64 = 64 * 1024 * 1024; + while part_size * 1000 < file_size { + part_size *= 4; + } + + // Initiate multipart upload + let create_resp = self + .client + .create_multipart_upload() + .bucket(&self.bucket) + .key(&key) + .storage_class( + self.storage_class + .parse() + .unwrap_or(aws_sdk_s3::types::StorageClass::StandardIa), + ) + .send() + .await + .map_err(|e| format!("failed to create multipart upload: {}", e))?; + + let upload_id = create_resp + .upload_id() + .ok_or_else(|| "no upload_id in multipart upload response".to_string())? + .to_string(); + + // Build list of (part_number, offset, size) for all parts + let mut parts_plan: Vec<(i32, u64, usize)> = Vec::new(); + let mut offset: u64 = 0; + let mut part_number: i32 = 1; + while offset < file_size { + let remaining = file_size - offset; + let this_part_size = std::cmp::min(part_size, remaining) as usize; + parts_plan.push((part_number, offset, this_part_size)); + offset += this_part_size as u64; + part_number += 1; + } + + // Upload parts concurrently with a semaphore limiting to CONCURRENCY + let semaphore = Arc::new(Semaphore::new(CONCURRENCY)); + let client = &self.client; + let bucket = &self.bucket; + let file_path_owned = file_path.to_string(); + let progress = Arc::new(std::sync::Mutex::new((0u64, progress_fn))); + + let mut handles = Vec::with_capacity(parts_plan.len()); + for (pn, off, size) in parts_plan { + let sem = semaphore.clone(); + let client = client.clone(); + let bucket = bucket.clone(); + let key = key.clone(); + let upload_id = upload_id.clone(); + let fp = file_path_owned.clone(); + let progress = progress.clone(); + + handles.push(tokio::spawn(async move { + let _permit = sem + .acquire() + .await + .map_err(|e| format!("semaphore error: {}", e))?; + + // Read this part's data from the file at the correct offset + let mut file = tokio::fs::File::open(&fp) + .await + .map_err(|e| format!("failed to open file {}: {}", fp, e))?; + file.seek(std::io::SeekFrom::Start(off)) + .await + .map_err(|e| format!("failed to seek to offset {}: {}", off, e))?; + let mut buf = vec![0u8; size]; + file.read_exact(&mut buf) + .await + .map_err(|e| format!("failed to read file at offset {}: {}", off, e))?; + + let upload_part_resp = client + .upload_part() + .bucket(&bucket) + .key(&key) + .upload_id(&upload_id) + .part_number(pn) + .body(buf.into()) + .send() + .await + .map_err(|e| { + format!("failed to upload part {} at offset {}: {}", pn, off, e) + })?; + + let e_tag = upload_part_resp.e_tag().unwrap_or_default().to_string(); + + // Report progress + { + let mut guard = progress.lock().unwrap(); + guard.0 += size as u64; + let uploaded = guard.0; + let pct = if file_size > 0 { + (uploaded as f32 * 100.0) / file_size as f32 + } else { + 100.0 + }; + (guard.1)(uploaded as i64, pct); + } + + Ok::<_, String>( + CompletedPart::builder() + .e_tag(e_tag) + .part_number(pn) + .build(), + ) + })); + } + + // Collect results, preserving part order + let mut completed_parts = Vec::with_capacity(handles.len()); + for handle in handles { + let part = handle + .await + .map_err(|e| format!("upload task panicked: {}", e))??; + completed_parts.push(part); + } + + // Complete multipart upload + let completed_upload = CompletedMultipartUpload::builder() + .set_parts(Some(completed_parts)) + .build(); + + self.client + .complete_multipart_upload() + .bucket(&self.bucket) + .key(&key) + .upload_id(&upload_id) + .multipart_upload(completed_upload) + .send() + .await + .map_err(|e| format!("failed to complete multipart upload: {}", e))?; + + Ok((key, file_size)) + } + + /// Download a file from S3 to a local path with concurrent range requests + /// and progress reporting. + /// + /// Returns the file size on success. + /// Uses 64MB part size and 5 concurrent downloads (matches Go s3manager). + pub async fn download_file( + &self, + dest_path: &str, + key: &str, + progress_fn: F, + ) -> Result + where + F: FnMut(i64, f32) + Send + Sync + 'static, + { + // Get file size first + let head_resp = self + .client + .head_object() + .bucket(&self.bucket) + .key(key) + .send() + .await + .map_err(|e| format!("failed to head object {}: {}", key, e))?; + + let file_size = head_resp.content_length().unwrap_or(0) as u64; + + // Pre-allocate file to full size so concurrent WriteAt-style writes work + { + let file = tokio::fs::OpenOptions::new() + .write(true) + .create(true) + .truncate(true) + .open(dest_path) + .await + .map_err(|e| format!("failed to open dest file {}: {}", dest_path, e))?; + file.set_len(file_size) + .await + .map_err(|e| format!("failed to set file length: {}", e))?; + } + + let part_size: u64 = 64 * 1024 * 1024; + + // Build list of (offset, size) for all parts + let mut parts_plan: Vec<(u64, u64)> = Vec::new(); + let mut offset: u64 = 0; + while offset < file_size { + let remaining = file_size - offset; + let this_part_size = std::cmp::min(part_size, remaining); + parts_plan.push((offset, this_part_size)); + offset += this_part_size; + } + + // Download parts concurrently with a semaphore limiting to CONCURRENCY + let semaphore = Arc::new(Semaphore::new(CONCURRENCY)); + let client = &self.client; + let bucket = &self.bucket; + let dest_path_owned = dest_path.to_string(); + let key_owned = key.to_string(); + let progress = Arc::new(std::sync::Mutex::new((0u64, progress_fn))); + + let mut handles = Vec::with_capacity(parts_plan.len()); + for (off, size) in parts_plan { + let sem = semaphore.clone(); + let client = client.clone(); + let bucket = bucket.clone(); + let key = key_owned.clone(); + let dp = dest_path_owned.clone(); + let progress = progress.clone(); + + handles.push(tokio::spawn(async move { + let _permit = sem + .acquire() + .await + .map_err(|e| format!("semaphore error: {}", e))?; + + let end = off + size - 1; + let range = format!("bytes={}-{}", off, end); + + let get_resp = client + .get_object() + .bucket(&bucket) + .key(&key) + .range(&range) + .send() + .await + .map_err(|e| format!("failed to get object {} range {}: {}", key, range, e))?; + + let body = get_resp + .body + .collect() + .await + .map_err(|e| format!("failed to read body: {}", e))?; + let bytes = body.into_bytes(); + + // Write at the correct offset (like Go's WriteAt) + let mut file = tokio::fs::OpenOptions::new() + .write(true) + .open(&dp) + .await + .map_err(|e| format!("failed to open dest file {}: {}", dp, e))?; + file.seek(std::io::SeekFrom::Start(off)) + .await + .map_err(|e| format!("failed to seek to offset {}: {}", off, e))?; + file.write_all(&bytes) + .await + .map_err(|e| format!("failed to write to {}: {}", dp, e))?; + + // Report progress + { + let mut guard = progress.lock().unwrap(); + guard.0 += bytes.len() as u64; + let downloaded = guard.0; + let pct = if file_size > 0 { + (downloaded as f32 * 100.0) / file_size as f32 + } else { + 100.0 + }; + (guard.1)(downloaded as i64, pct); + } + + Ok::<_, String>(()) + })); + } + + // Wait for all download tasks + for handle in handles { + handle + .await + .map_err(|e| format!("download task panicked: {}", e))??; + } + + Ok(file_size) + } + + pub async fn read_range(&self, key: &str, offset: u64, size: usize) -> Result, String> { + let end = offset + (size as u64).saturating_sub(1); + let range = format!("bytes={}-{}", offset, end); + let resp = self + .client + .get_object() + .bucket(&self.bucket) + .key(key) + .range(&range) + .send() + .await + .map_err(|e| format!("failed to get object {} range {}: {}", key, range, e))?; + + let body = resp + .body + .collect() + .await + .map_err(|e| format!("failed to read object {} body: {}", key, e))?; + Ok(body.into_bytes().to_vec()) + } + + /// Delete a file from S3. + pub async fn delete_file(&self, key: &str) -> Result<(), String> { + self.client + .delete_object() + .bucket(&self.bucket) + .key(key) + .send() + .await + .map_err(|e| format!("failed to delete object {}: {}", key, e))?; + Ok(()) + } + + pub fn delete_file_blocking(&self, key: &str) -> Result<(), String> { + let client = self.client.clone(); + let bucket = self.bucket.clone(); + let key = key.to_string(); + block_on_tier_future(async move { + client + .delete_object() + .bucket(&bucket) + .key(&key) + .send() + .await + .map_err(|e| format!("failed to delete object {}: {}", key, e))?; + Ok(()) + }) + } + + pub fn read_range_blocking( + &self, + key: &str, + offset: u64, + size: usize, + ) -> Result, String> { + let client = self.client.clone(); + let bucket = self.bucket.clone(); + let key = key.to_string(); + block_on_tier_future(async move { + let end = offset + (size as u64).saturating_sub(1); + let range = format!("bytes={}-{}", offset, end); + let resp = client + .get_object() + .bucket(&bucket) + .key(&key) + .range(&range) + .send() + .await + .map_err(|e| format!("failed to get object {} range {}: {}", key, range, e))?; + + let body = resp + .body + .collect() + .await + .map_err(|e| format!("failed to read object {} body: {}", key, e))?; + Ok(body.into_bytes().to_vec()) + }) + } +} + +/// Parse a backend name like "s3" or "s3.default" into (backend_type, backend_id). +/// Matches Go's `BackendNameToTypeId`. +pub fn backend_name_to_type_id(backend_name: &str) -> (String, String) { + let parts: Vec<&str> = backend_name.split('.').collect(); + match parts.len() { + 1 => (backend_name.to_string(), "default".to_string()), + 2 => (parts[0].to_string(), parts[1].to_string()), + _ => (String::new(), String::new()), + } +} + +/// A registry of configured S3 tier backends, keyed by backend name (e.g., "s3.default"). +#[derive(Default)] +pub struct S3TierRegistry { + backends: HashMap>, +} + +impl S3TierRegistry { + pub fn new() -> Self { + Self { + backends: HashMap::new(), + } + } + + /// Register a backend with the given name. + pub fn register(&mut self, name: String, backend: S3TierBackend) { + self.backends.insert(name, Arc::new(backend)); + } + + /// Look up a backend by name. + pub fn get(&self, name: &str) -> Option> { + self.backends.get(name).cloned() + } + + /// List all registered backend names. + pub fn names(&self) -> Vec { + self.backends.keys().cloned().collect() + } + + pub fn clear(&mut self) { + self.backends.clear(); + } +} + +static GLOBAL_S3_TIER_REGISTRY: OnceLock> = OnceLock::new(); + +pub fn global_s3_tier_registry() -> &'static RwLock { + GLOBAL_S3_TIER_REGISTRY.get_or_init(|| RwLock::new(S3TierRegistry::new())) +} + +fn block_on_tier_future(future: F) -> Result +where + F: Future> + Send + 'static, + T: Send + 'static, +{ + std::thread::spawn(move || { + let runtime = tokio::runtime::Builder::new_current_thread() + .enable_all() + .build() + .map_err(|e| format!("failed to build tokio runtime: {}", e))?; + runtime.block_on(future) + }) + .join() + .map_err(|_| "tier runtime thread panicked".to_string())? +} diff --git a/seaweed-volume/src/security.rs b/seaweed-volume/src/security.rs new file mode 100644 index 000000000..e33350926 --- /dev/null +++ b/seaweed-volume/src/security.rs @@ -0,0 +1,481 @@ +//! Security: JWT validation and IP whitelist checking. +//! +//! Matches Go's security/guard.go and security/jwt.go. +//! - Guard: combines whitelist IP checking with JWT token validation +//! - JWT: HS256 HMAC signing with file-id claims + +pub mod tls; + +use std::collections::HashSet; +use std::net::IpAddr; +use std::time::{SystemTime, UNIX_EPOCH}; + +use jsonwebtoken::{decode, encode, Algorithm, DecodingKey, EncodingKey, Header, Validation}; +use serde::{Deserialize, Serialize}; + +// ============================================================================ +// JWT Claims +// ============================================================================ + +/// Claims for volume server file access tokens. +/// Matches Go's `SeaweedFileIdClaims`. +#[derive(Debug, Serialize, Deserialize)] +pub struct FileIdClaims { + /// File ID this token grants access to (e.g., "3,01637037d6"). + #[serde(skip_serializing_if = "Option::is_none")] + pub fid: Option, + + /// Expiration time (Unix timestamp). + #[serde(skip_serializing_if = "Option::is_none")] + pub exp: Option, + + /// Not before (Unix timestamp). + #[serde(skip_serializing_if = "Option::is_none")] + pub nbf: Option, +} + +/// Signing key wrapper (empty = security disabled). +#[derive(Clone)] +pub struct SigningKey(pub Vec); + +impl SigningKey { + pub fn is_empty(&self) -> bool { + self.0.is_empty() + } + + pub fn from_string(s: &str) -> Self { + SigningKey(s.as_bytes().to_vec()) + } +} + +/// Generate a JWT token for file access. +pub fn gen_jwt( + signing_key: &SigningKey, + expires_after_sec: i64, + file_id: &str, +) -> Result { + if signing_key.is_empty() { + return Err(JwtError::NoSigningKey); + } + + let now = SystemTime::now() + .duration_since(UNIX_EPOCH) + .unwrap_or_default() + .as_secs(); + + let claims = FileIdClaims { + fid: Some(file_id.to_string()), + exp: if expires_after_sec > 0 { + Some(now + expires_after_sec as u64) + } else { + None + }, + nbf: None, + }; + + let token = encode( + &Header::new(Algorithm::HS256), + &claims, + &EncodingKey::from_secret(&signing_key.0), + )?; + + Ok(token) +} + +/// Decode and validate a JWT token. +pub fn decode_jwt(signing_key: &SigningKey, token: &str) -> Result { + if signing_key.is_empty() { + return Err(JwtError::NoSigningKey); + } + + let mut validation = Validation::new(Algorithm::HS256); + // Match Go behavior: tokens without exp are accepted (Go's jwt-go does not require exp) + // But if exp IS present, it must be valid (not expired). + validation.required_spec_claims.clear(); + validation.validate_exp = true; + // Go's jwt-go/v5 validates nbf when present + validation.validate_nbf = true; + validation.leeway = 0; + + let data = decode::( + token, + &DecodingKey::from_secret(&signing_key.0), + &validation, + )?; + + Ok(data.claims) +} + +// ============================================================================ +// Guard +// ============================================================================ + +/// Security guard: IP whitelist + JWT token validation. +pub struct Guard { + whitelist_ips: HashSet, + whitelist_cidrs: Vec<(IpAddr, u8)>, // (network, prefix_len) + pub signing_key: SigningKey, + pub expires_after_sec: i64, + pub read_signing_key: SigningKey, + pub read_expires_after_sec: i64, + /// Combined flag: true when whitelist is non-empty OR signing key is present. + /// Matches Go's `isWriteActive = !isEmptyWhiteList || len(SigningKey) != 0`. + is_write_active: bool, +} + +impl Guard { + pub fn new( + whitelist: &[String], + signing_key: SigningKey, + expires_after_sec: i64, + read_signing_key: SigningKey, + read_expires_after_sec: i64, + ) -> Self { + let mut guard = Guard { + whitelist_ips: HashSet::new(), + whitelist_cidrs: Vec::new(), + signing_key, + expires_after_sec, + read_signing_key, + read_expires_after_sec, + is_write_active: false, + }; + guard.update_whitelist(whitelist); + guard + } + + /// Update the IP whitelist. + pub fn update_whitelist(&mut self, entries: &[String]) { + self.whitelist_ips.clear(); + self.whitelist_cidrs.clear(); + + for entry in entries { + let entry = entry.trim(); + if entry.is_empty() { + continue; + } + if entry.contains('/') { + // CIDR range + if let Some((ip, prefix)) = parse_cidr(entry) { + self.whitelist_cidrs.push((ip, prefix)); + } else { + tracing::error!("Parse CIDR {} in whitelist failed", entry); + } + } else { + // Exact IP/hostname + self.whitelist_ips.insert(entry.to_string()); + } + } + + // Match Go: isWriteActive = !isEmptyWhiteList || len(SigningKey) != 0 + let is_empty_whitelist = self.whitelist_ips.is_empty() && self.whitelist_cidrs.is_empty(); + self.is_write_active = !is_empty_whitelist || !self.signing_key.is_empty(); + } + + /// Check if a remote IP is in the whitelist. + /// Returns true if write security is inactive (no whitelist and no signing key), + /// if the whitelist is empty, or if the IP matches. + pub fn check_whitelist(&self, remote_addr: &str) -> bool { + if !self.is_write_active { + return true; + } + if self.whitelist_ips.is_empty() && self.whitelist_cidrs.is_empty() { + return true; + } + + let host = extract_host(remote_addr); + + // Check exact match + if self.whitelist_ips.contains(&host) { + return true; + } + + // Check CIDR ranges + if let Ok(ip) = host.parse::() { + for &(ref network, prefix_len) in &self.whitelist_cidrs { + if ip_in_cidr(&ip, network, prefix_len) { + return true; + } + } + } + + false + } + + /// Check if a read signing key is configured. + pub fn has_read_signing_key(&self) -> bool { + !self.read_signing_key.is_empty() + } + + /// Validate a request's JWT token. + /// `is_write` determines which signing key to use. + /// Returns Ok(()) if valid, or if security is disabled. + pub fn check_jwt(&self, token: Option<&str>, is_write: bool) -> Result<(), JwtError> { + let key = if is_write { + &self.signing_key + } else { + &self.read_signing_key + }; + + if key.is_empty() { + return Ok(()); // Security disabled for this operation type + } + + let token = token.ok_or(JwtError::MissingToken)?; + decode_jwt(key, token)?; + Ok(()) + } + + /// Check JWT and validate the file ID claim matches. + pub fn check_jwt_for_file( + &self, + token: Option<&str>, + expected_fid: &str, + is_write: bool, + ) -> Result<(), JwtError> { + let key = if is_write { + &self.signing_key + } else { + &self.read_signing_key + }; + + if key.is_empty() { + return Ok(()); + } + + let token = token.ok_or(JwtError::MissingToken)?; + let claims = decode_jwt(key, token)?; + + match claims.fid { + None => { + return Err(JwtError::MissingFileIdClaim); + } + Some(ref fid) if fid != expected_fid => { + return Err(JwtError::FileIdMismatch { + expected: expected_fid.to_string(), + got: fid.to_string(), + }); + } + _ => {} + } + + Ok(()) + } +} + +// ============================================================================ +// Helpers +// ============================================================================ + +/// Extract host from "host:port" or "[::1]:port" format. +fn extract_host(addr: &str) -> String { + // Handle IPv6 with brackets + if addr.starts_with('[') { + if let Some(end) = addr.find(']') { + return addr[1..end].to_string(); + } + } + // Handle host:port + if let Some(pos) = addr.rfind(':') { + return addr[..pos].to_string(); + } + addr.to_string() +} + +/// Parse CIDR notation "192.168.1.0/24" into (IpAddr, prefix_len). +fn parse_cidr(cidr: &str) -> Option<(IpAddr, u8)> { + let parts: Vec<&str> = cidr.split('/').collect(); + if parts.len() != 2 { + return None; + } + let ip: IpAddr = parts[0].parse().ok()?; + let prefix: u8 = parts[1].parse().ok()?; + Some((ip, prefix)) +} + +/// Check if an IP is within a CIDR range. +fn ip_in_cidr(ip: &IpAddr, network: &IpAddr, prefix_len: u8) -> bool { + match (ip, network) { + (IpAddr::V4(ip), IpAddr::V4(net)) => { + let ip_bits = u32::from(*ip); + let net_bits = u32::from(*net); + let mask = if prefix_len == 0 { + 0 + } else if prefix_len >= 32 { + u32::MAX + } else { + u32::MAX << (32 - prefix_len) + }; + (ip_bits & mask) == (net_bits & mask) + } + (IpAddr::V6(ip), IpAddr::V6(net)) => { + let ip_bits = u128::from(*ip); + let net_bits = u128::from(*net); + let mask = if prefix_len == 0 { + 0 + } else if prefix_len >= 128 { + u128::MAX + } else { + u128::MAX << (128 - prefix_len) + }; + (ip_bits & mask) == (net_bits & mask) + } + _ => false, // V4/V6 mismatch + } +} + +// ============================================================================ +// Errors +// ============================================================================ + +#[derive(Debug, thiserror::Error)] +pub enum JwtError { + #[error("no signing key configured")] + NoSigningKey, + + #[error("missing JWT token")] + MissingToken, + + #[error("JWT error: {0}")] + Jwt(#[from] jsonwebtoken::errors::Error), + + #[error("JWT token missing required fid claim")] + MissingFileIdClaim, + + #[error("file ID mismatch: expected {expected}, got {got}")] + FileIdMismatch { expected: String, got: String }, +} + +// ============================================================================ +// Tests +// ============================================================================ + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_jwt_round_trip() { + let key = SigningKey::from_string("test-secret-key"); + let token = gen_jwt(&key, 3600, "3,01637037d6").unwrap(); + let claims = decode_jwt(&key, &token).unwrap(); + assert_eq!(claims.fid, Some("3,01637037d6".to_string())); + } + + #[test] + fn test_jwt_no_signing_key() { + let key = SigningKey(vec![]); + assert!(gen_jwt(&key, 3600, "1,abc").is_err()); + } + + #[test] + fn test_jwt_invalid_token() { + let key = SigningKey::from_string("secret"); + let result = decode_jwt(&key, "invalid.token.here"); + assert!(result.is_err()); + } + + #[test] + fn test_jwt_wrong_key() { + let key1 = SigningKey::from_string("secret1"); + let key2 = SigningKey::from_string("secret2"); + let token = gen_jwt(&key1, 3600, "1,abc").unwrap(); + assert!(decode_jwt(&key2, &token).is_err()); + } + + #[test] + fn test_guard_empty_whitelist() { + let guard = Guard::new(&[], SigningKey(vec![]), 0, SigningKey(vec![]), 0); + assert!(guard.check_whitelist("192.168.1.1:8080")); + } + + #[test] + fn test_guard_whitelist_exact() { + let guard = Guard::new( + &["192.168.1.1".to_string(), "10.0.0.1".to_string()], + SigningKey(vec![]), + 0, + SigningKey(vec![]), + 0, + ); + assert!(guard.check_whitelist("192.168.1.1:8080")); + assert!(guard.check_whitelist("10.0.0.1:1234")); + assert!(!guard.check_whitelist("172.16.0.1:8080")); + } + + #[test] + fn test_guard_whitelist_cidr() { + let guard = Guard::new( + &["10.0.0.0/8".to_string()], + SigningKey(vec![]), + 0, + SigningKey(vec![]), + 0, + ); + assert!(guard.check_whitelist("10.1.2.3:8080")); + assert!(guard.check_whitelist("10.255.255.255:80")); + assert!(!guard.check_whitelist("11.0.0.1:80")); + } + + #[test] + fn test_guard_check_jwt_disabled() { + let guard = Guard::new(&[], SigningKey(vec![]), 0, SigningKey(vec![]), 0); + // No signing key = security disabled + assert!(guard.check_jwt(None, true).is_ok()); + assert!(guard.check_jwt(None, false).is_ok()); + } + + #[test] + fn test_guard_check_jwt_enabled() { + let key = SigningKey::from_string("write-secret"); + let read_key = SigningKey::from_string("read-secret"); + let guard = Guard::new(&[], key.clone(), 3600, read_key.clone(), 3600); + + // Missing token + assert!(guard.check_jwt(None, true).is_err()); + + // Valid write token + let token = gen_jwt(&key, 3600, "1,abc").unwrap(); + assert!(guard.check_jwt(Some(&token), true).is_ok()); + + // Write token for read should fail (different key) + assert!(guard.check_jwt(Some(&token), false).is_err()); + + // Valid read token + let read_token = gen_jwt(&read_key, 3600, "1,abc").unwrap(); + assert!(guard.check_jwt(Some(&read_token), false).is_ok()); + } + + #[test] + fn test_guard_check_jwt_file_id() { + let key = SigningKey::from_string("secret"); + let guard = Guard::new(&[], key.clone(), 3600, SigningKey(vec![]), 0); + + let token = gen_jwt(&key, 3600, "3,01637037d6").unwrap(); + + // Correct file ID + assert!(guard + .check_jwt_for_file(Some(&token), "3,01637037d6", true) + .is_ok()); + + // Wrong file ID + let err = guard.check_jwt_for_file(Some(&token), "4,deadbeef", true); + assert!(matches!(err, Err(JwtError::FileIdMismatch { .. }))); + } + + #[test] + fn test_extract_host() { + assert_eq!(extract_host("192.168.1.1:8080"), "192.168.1.1"); + assert_eq!(extract_host("[::1]:8080"), "::1"); + assert_eq!(extract_host("localhost"), "localhost"); + } + + #[test] + fn test_ip_in_cidr() { + let net: IpAddr = "10.0.0.0".parse().unwrap(); + let ip1: IpAddr = "10.1.2.3".parse().unwrap(); + let ip2: IpAddr = "11.0.0.1".parse().unwrap(); + assert!(ip_in_cidr(&ip1, &net, 8)); + assert!(!ip_in_cidr(&ip2, &net, 8)); + } +} diff --git a/seaweed-volume/src/security/tls.rs b/seaweed-volume/src/security/tls.rs new file mode 100644 index 000000000..8f8cb2403 --- /dev/null +++ b/seaweed-volume/src/security/tls.rs @@ -0,0 +1,437 @@ +use std::collections::HashSet; +use std::fmt; +use std::sync::Arc; + +use rustls::client::danger::HandshakeSignatureValid; +use rustls::crypto::aws_lc_rs; +use rustls::crypto::CryptoProvider; +use rustls::pki_types::UnixTime; +use rustls::pki_types::{CertificateDer, PrivateKeyDer}; +use rustls::server::danger::{ClientCertVerified, ClientCertVerifier}; +use rustls::server::WebPkiClientVerifier; +use rustls::{ + CipherSuite, DigitallySignedStruct, DistinguishedName, RootCertStore, ServerConfig, + SignatureScheme, SupportedCipherSuite, SupportedProtocolVersion, +}; +use x509_parser::prelude::{FromDer, X509Certificate}; + +#[derive(Clone, Debug, Default, PartialEq, Eq)] +pub struct TlsPolicy { + pub min_version: String, + pub max_version: String, + pub cipher_suites: String, +} + +#[derive(Clone, Debug, Default, PartialEq, Eq)] +pub struct GrpcClientAuthPolicy { + pub allowed_common_names: Vec, + pub allowed_wildcard_domain: String, +} + +#[derive(Debug, Clone, PartialEq, Eq)] +pub struct TlsPolicyError(String); + +impl fmt::Display for TlsPolicyError { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + f.write_str(&self.0) + } +} + +impl std::error::Error for TlsPolicyError {} + +#[derive(Clone, Copy, Debug, PartialEq, Eq, PartialOrd, Ord)] +enum GoTlsVersion { + Ssl3, + Tls10, + Tls11, + Tls12, + Tls13, +} + +#[derive(Debug)] +struct CommonNameVerifier { + inner: Arc, + allowed_common_names: HashSet, + allowed_wildcard_domain: String, +} + +impl ClientCertVerifier for CommonNameVerifier { + fn offer_client_auth(&self) -> bool { + self.inner.offer_client_auth() + } + + fn client_auth_mandatory(&self) -> bool { + self.inner.client_auth_mandatory() + } + + fn root_hint_subjects(&self) -> &[DistinguishedName] { + self.inner.root_hint_subjects() + } + + fn verify_client_cert( + &self, + end_entity: &CertificateDer<'_>, + intermediates: &[CertificateDer<'_>], + now: UnixTime, + ) -> Result { + self.inner + .verify_client_cert(end_entity, intermediates, now)?; + let common_name = parse_common_name(end_entity).map_err(|e| { + rustls::Error::General(format!( + "parse client certificate common name failed: {}", + e + )) + })?; + if common_name_is_allowed( + &common_name, + &self.allowed_common_names, + &self.allowed_wildcard_domain, + ) { + return Ok(ClientCertVerified::assertion()); + } + Err(rustls::Error::General(format!( + "Authenticate: invalid subject client common name: {}", + common_name + ))) + } + + fn verify_tls12_signature( + &self, + message: &[u8], + cert: &CertificateDer<'_>, + dss: &DigitallySignedStruct, + ) -> Result { + self.inner.verify_tls12_signature(message, cert, dss) + } + + fn verify_tls13_signature( + &self, + message: &[u8], + cert: &CertificateDer<'_>, + dss: &DigitallySignedStruct, + ) -> Result { + self.inner.verify_tls13_signature(message, cert, dss) + } + + fn supported_verify_schemes(&self) -> Vec { + self.inner.supported_verify_schemes() + } +} + +pub fn build_rustls_server_config( + cert_path: &str, + key_path: &str, + ca_path: &str, + policy: &TlsPolicy, +) -> Result { + build_rustls_server_config_with_client_auth(cert_path, key_path, ca_path, policy, None) +} + +pub fn build_rustls_server_config_with_grpc_client_auth( + cert_path: &str, + key_path: &str, + ca_path: &str, + policy: &TlsPolicy, + client_auth_policy: &GrpcClientAuthPolicy, +) -> Result { + build_rustls_server_config_with_client_auth( + cert_path, + key_path, + ca_path, + policy, + Some(client_auth_policy), + ) +} + +fn build_rustls_server_config_with_client_auth( + cert_path: &str, + key_path: &str, + ca_path: &str, + policy: &TlsPolicy, + client_auth_policy: Option<&GrpcClientAuthPolicy>, +) -> Result { + let cert_chain = read_cert_chain(cert_path)?; + let private_key = read_private_key(key_path)?; + let provider = build_crypto_provider(policy)?; + let versions = build_supported_versions(policy)?; + + let builder = ServerConfig::builder_with_provider(provider.clone()) + .with_protocol_versions(&versions) + .map_err(|e| TlsPolicyError(format!("invalid TLS version policy: {}", e)))?; + + let builder = if ca_path.is_empty() { + builder.with_no_client_auth() + } else { + let roots = read_root_store(ca_path)?; + let verifier = + WebPkiClientVerifier::builder_with_provider(Arc::new(roots), provider.clone()) + .build() + .map_err(|e| TlsPolicyError(format!("build client verifier failed: {}", e)))?; + let verifier: Arc = if let Some(client_auth_policy) = + client_auth_policy.filter(|policy| { + !policy.allowed_common_names.is_empty() + || !policy.allowed_wildcard_domain.is_empty() + }) { + Arc::new(CommonNameVerifier { + inner: verifier, + allowed_common_names: client_auth_policy + .allowed_common_names + .iter() + .cloned() + .collect(), + allowed_wildcard_domain: client_auth_policy.allowed_wildcard_domain.clone(), + }) + } else { + verifier + }; + builder.with_client_cert_verifier(verifier) + }; + + builder + .with_single_cert(cert_chain, private_key) + .map_err(|e| TlsPolicyError(format!("build rustls server config failed: {}", e))) +} + +fn read_cert_chain(cert_path: &str) -> Result>, TlsPolicyError> { + let cert_pem = std::fs::read(cert_path).map_err(|e| { + TlsPolicyError(format!( + "Failed to read TLS cert file '{}': {}", + cert_path, e + )) + })?; + rustls_pemfile::certs(&mut &cert_pem[..]) + .collect::, _>>() + .map_err(|e| { + TlsPolicyError(format!( + "Failed to parse TLS cert PEM '{}': {}", + cert_path, e + )) + }) +} + +fn read_private_key(key_path: &str) -> Result, TlsPolicyError> { + let key_pem = std::fs::read(key_path).map_err(|e| { + TlsPolicyError(format!("Failed to read TLS key file '{}': {}", key_path, e)) + })?; + rustls_pemfile::private_key(&mut &key_pem[..]) + .map_err(|e| TlsPolicyError(format!("Failed to parse TLS key PEM '{}': {}", key_path, e)))? + .ok_or_else(|| TlsPolicyError(format!("No private key found in '{}'", key_path))) +} + +fn read_root_store(ca_path: &str) -> Result { + let ca_pem = std::fs::read(ca_path) + .map_err(|e| TlsPolicyError(format!("Failed to read TLS CA file '{}': {}", ca_path, e)))?; + let ca_certs = rustls_pemfile::certs(&mut &ca_pem[..]) + .collect::, _>>() + .map_err(|e| TlsPolicyError(format!("Failed to parse TLS CA PEM '{}': {}", ca_path, e)))?; + let mut roots = RootCertStore::empty(); + for cert in ca_certs { + roots + .add(cert) + .map_err(|e| TlsPolicyError(format!("Failed to add CA cert '{}': {}", ca_path, e)))?; + } + Ok(roots) +} + +fn build_crypto_provider(policy: &TlsPolicy) -> Result, TlsPolicyError> { + let mut provider = aws_lc_rs::default_provider(); + let cipher_suites = parse_cipher_suites(&provider.cipher_suites, &policy.cipher_suites)?; + if !cipher_suites.is_empty() { + provider.cipher_suites = cipher_suites; + } + Ok(Arc::new(provider)) +} + +pub fn build_supported_versions( + policy: &TlsPolicy, +) -> Result, TlsPolicyError> { + let min_version = parse_go_tls_version(&policy.min_version)?; + let max_version = parse_go_tls_version(&policy.max_version)?; + let versions = [&rustls::version::TLS13, &rustls::version::TLS12] + .into_iter() + .filter(|version| { + let current = go_tls_version_for_supported(version); + min_version.map(|min| current >= min).unwrap_or(true) + && max_version.map(|max| current <= max).unwrap_or(true) + }) + .collect::>(); + + if versions.is_empty() { + return Err(TlsPolicyError(format!( + "TLS version range min='{}' max='{}' is unsupported by rustls", + policy.min_version, policy.max_version + ))); + } + + Ok(versions) +} + +fn parse_go_tls_version(value: &str) -> Result, TlsPolicyError> { + match value.trim() { + "" => Ok(None), + "SSLv3" => Ok(Some(GoTlsVersion::Ssl3)), + "TLS 1.0" => Ok(Some(GoTlsVersion::Tls10)), + "TLS 1.1" => Ok(Some(GoTlsVersion::Tls11)), + "TLS 1.2" => Ok(Some(GoTlsVersion::Tls12)), + "TLS 1.3" => Ok(Some(GoTlsVersion::Tls13)), + other => Err(TlsPolicyError(format!("invalid TLS version {}", other))), + } +} + +fn parse_cipher_suites( + available: &[SupportedCipherSuite], + value: &str, +) -> Result, TlsPolicyError> { + let trimmed = value.trim(); + if trimmed.is_empty() { + return Ok(Vec::new()); + } + + trimmed + .split(',') + .map(|name| { + let suite = parse_cipher_suite_name(name.trim())?; + available + .iter() + .copied() + .find(|candidate| candidate.suite() == suite) + .ok_or_else(|| { + TlsPolicyError(format!( + "TLS cipher suite '{}' is unsupported by the Rust implementation", + name.trim() + )) + }) + }) + .collect() +} + +fn parse_cipher_suite_name(value: &str) -> Result { + match value { + "TLS_AES_128_GCM_SHA256" => Ok(CipherSuite::TLS13_AES_128_GCM_SHA256), + "TLS_AES_256_GCM_SHA384" => Ok(CipherSuite::TLS13_AES_256_GCM_SHA384), + "TLS_CHACHA20_POLY1305_SHA256" => Ok(CipherSuite::TLS13_CHACHA20_POLY1305_SHA256), + "TLS_ECDHE_ECDSA_WITH_AES_128_GCM_SHA256" => { + Ok(CipherSuite::TLS_ECDHE_ECDSA_WITH_AES_128_GCM_SHA256) + } + "TLS_ECDHE_ECDSA_WITH_AES_256_GCM_SHA384" => { + Ok(CipherSuite::TLS_ECDHE_ECDSA_WITH_AES_256_GCM_SHA384) + } + "TLS_ECDHE_ECDSA_WITH_CHACHA20_POLY1305_SHA256" => { + Ok(CipherSuite::TLS_ECDHE_ECDSA_WITH_CHACHA20_POLY1305_SHA256) + } + "TLS_ECDHE_RSA_WITH_AES_128_GCM_SHA256" => { + Ok(CipherSuite::TLS_ECDHE_RSA_WITH_AES_128_GCM_SHA256) + } + "TLS_ECDHE_RSA_WITH_AES_256_GCM_SHA384" => { + Ok(CipherSuite::TLS_ECDHE_RSA_WITH_AES_256_GCM_SHA384) + } + "TLS_ECDHE_RSA_WITH_CHACHA20_POLY1305_SHA256" => { + Ok(CipherSuite::TLS_ECDHE_RSA_WITH_CHACHA20_POLY1305_SHA256) + } + other => Err(TlsPolicyError(format!( + "TLS cipher suite '{}' is unsupported by the Rust implementation", + other + ))), + } +} + +fn parse_common_name(cert: &CertificateDer<'_>) -> Result { + let (_, certificate) = X509Certificate::from_der(cert.as_ref()) + .map_err(|e| TlsPolicyError(format!("parse X.509 certificate failed: {}", e)))?; + let common_name = certificate + .subject() + .iter_common_name() + .next() + .and_then(|common_name| common_name.as_str().ok()) + .map(str::to_string); + match common_name { + Some(common_name) => Ok(common_name), + None => Ok(String::new()), + } +} + +fn common_name_is_allowed( + common_name: &str, + allowed_common_names: &HashSet, + allowed_wildcard_domain: &str, +) -> bool { + (!allowed_wildcard_domain.is_empty() && common_name.ends_with(allowed_wildcard_domain)) + || allowed_common_names.contains(common_name) +} + +fn go_tls_version_for_supported(version: &SupportedProtocolVersion) -> GoTlsVersion { + match version.version { + rustls::ProtocolVersion::TLSv1_2 => GoTlsVersion::Tls12, + rustls::ProtocolVersion::TLSv1_3 => GoTlsVersion::Tls13, + _ => unreachable!("rustls only exposes TLS 1.2 and 1.3"), + } +} + +#[cfg(test)] +mod tests { + use super::{build_supported_versions, common_name_is_allowed, parse_cipher_suites, TlsPolicy}; + use rustls::crypto::aws_lc_rs; + use std::collections::HashSet; + + #[test] + fn test_build_supported_versions_defaults_to_tls12_and_tls13() { + let versions = build_supported_versions(&TlsPolicy::default()).unwrap(); + assert_eq!( + versions, + vec![&rustls::version::TLS13, &rustls::version::TLS12] + ); + } + + #[test] + fn test_build_supported_versions_filters_to_tls13() { + let versions = build_supported_versions(&TlsPolicy { + min_version: "TLS 1.3".to_string(), + max_version: "TLS 1.3".to_string(), + cipher_suites: String::new(), + }) + .unwrap(); + assert_eq!(versions, vec![&rustls::version::TLS13]); + } + + #[test] + fn test_build_supported_versions_rejects_unsupported_legacy_range() { + let err = build_supported_versions(&TlsPolicy { + min_version: "TLS 1.0".to_string(), + max_version: "TLS 1.1".to_string(), + cipher_suites: String::new(), + }) + .unwrap_err(); + assert!(err.to_string().contains("unsupported by rustls")); + } + + #[test] + fn test_parse_cipher_suites_accepts_go_names() { + let cipher_suites = parse_cipher_suites( + &aws_lc_rs::default_provider().cipher_suites, + "TLS_ECDHE_RSA_WITH_AES_128_GCM_SHA256,TLS_AES_128_GCM_SHA256", + ) + .unwrap(); + assert_eq!(cipher_suites.len(), 2); + } + + #[test] + fn test_common_name_is_allowed_matches_exact_and_wildcard() { + let allowed_common_names = + HashSet::from([String::from("volume-a.internal"), String::from("worker-7")]); + assert!(common_name_is_allowed( + "volume-a.internal", + &allowed_common_names, + "", + )); + assert!(common_name_is_allowed( + "node.prod.example.com", + &allowed_common_names, + ".example.com", + )); + assert!(!common_name_is_allowed( + "node.prod.other.net", + &allowed_common_names, + ".example.com", + )); + } +} diff --git a/seaweed-volume/src/server/debug.rs b/seaweed-volume/src/server/debug.rs new file mode 100644 index 000000000..dd1b69cf1 --- /dev/null +++ b/seaweed-volume/src/server/debug.rs @@ -0,0 +1,159 @@ +use axum::body::Body; +use axum::extract::Query; +use axum::http::{header, StatusCode}; +use axum::response::{IntoResponse, Response}; +use axum::routing::{any, get}; +use axum::Router; +use pprof::protos::Message; +use serde::Deserialize; + +#[derive(Deserialize, Default)] +struct ProfileQuery { + seconds: Option, +} + +pub fn build_debug_router() -> Router { + Router::new() + .route("/debug/pprof/", get(pprof_index_handler)) + .route("/debug/pprof/cmdline", get(pprof_cmdline_handler)) + .route("/debug/pprof/profile", get(pprof_profile_handler)) + .route("/debug/pprof/symbol", any(pprof_symbol_handler)) + .route("/debug/pprof/trace", get(pprof_trace_handler)) +} + +async fn pprof_index_handler() -> Response { + let body = concat!( + "/debug/pprof/", + "cmdline
", + "profile
", + "symbol
", + "trace
", + "", + ); + ( + StatusCode::OK, + [(header::CONTENT_TYPE, "text/html; charset=utf-8")], + body, + ) + .into_response() +} + +async fn pprof_cmdline_handler() -> Response { + let body = std::env::args().collect::>().join("\0"); + ( + StatusCode::OK, + [(header::CONTENT_TYPE, "text/plain; charset=utf-8")], + body, + ) + .into_response() +} + +async fn pprof_profile_handler(Query(query): Query) -> Response { + let seconds = query.seconds.unwrap_or(30).clamp(1, 300); + let guard = match pprof::ProfilerGuard::new(100) { + Ok(guard) => guard, + Err(e) => { + return ( + StatusCode::INTERNAL_SERVER_ERROR, + format!("failed to start profiler: {}", e), + ) + .into_response(); + } + }; + + tokio::time::sleep(std::time::Duration::from_secs(seconds)).await; + + let report = match guard.report().build() { + Ok(report) => report, + Err(e) => { + return ( + StatusCode::INTERNAL_SERVER_ERROR, + format!("failed to build profile report: {}", e), + ) + .into_response(); + } + }; + + let profile = match report.pprof() { + Ok(profile) => profile, + Err(e) => { + return ( + StatusCode::INTERNAL_SERVER_ERROR, + format!("failed to encode profile: {}", e), + ) + .into_response(); + } + }; + + let mut bytes = Vec::new(); + if let Err(e) = profile.encode(&mut bytes) { + return ( + StatusCode::INTERNAL_SERVER_ERROR, + format!("failed to serialize profile: {}", e), + ) + .into_response(); + } + + ( + StatusCode::OK, + [(header::CONTENT_TYPE, "application/octet-stream")], + bytes, + ) + .into_response() +} + +async fn pprof_symbol_handler() -> Response { + ( + StatusCode::OK, + [(header::CONTENT_TYPE, "text/plain; charset=utf-8")], + "num_symbols: 0\n", + ) + .into_response() +} + +async fn pprof_trace_handler(Query(query): Query) -> Response { + let seconds = query.seconds.unwrap_or(1).clamp(1, 30); + tokio::time::sleep(std::time::Duration::from_secs(seconds)).await; + Response::builder() + .status(StatusCode::OK) + .header(header::CONTENT_TYPE, "application/octet-stream") + .body(Body::from(Vec::::new())) + .unwrap() +} + +#[cfg(test)] +mod tests { + use super::*; + use axum::http::Request; + use tower::ServiceExt; + + #[tokio::test] + async fn test_debug_index_route() { + let app = build_debug_router(); + let response = app + .oneshot( + Request::builder() + .uri("/debug/pprof/") + .body(Body::empty()) + .unwrap(), + ) + .await + .unwrap(); + assert_eq!(response.status(), StatusCode::OK); + } + + #[tokio::test] + async fn test_debug_cmdline_route() { + let app = build_debug_router(); + let response = app + .oneshot( + Request::builder() + .uri("/debug/pprof/cmdline") + .body(Body::empty()) + .unwrap(), + ) + .await + .unwrap(); + assert_eq!(response.status(), StatusCode::OK); + } +} diff --git a/seaweed-volume/src/server/favicon.ico b/seaweed-volume/src/server/favicon.ico new file mode 100644 index 0000000000000000000000000000000000000000..05ddc02d52a0b30c443f08c7d8d25ca43baff8dc GIT binary patch literal 70 pcmZQzU<5%%1|X@x;KIPbAO^%5KnxUOU;@($K$3xh1x#Wk8UQJ80Z;${ literal 0 HcmV?d00001 diff --git a/seaweed-volume/src/server/grpc_client.rs b/seaweed-volume/src/server/grpc_client.rs new file mode 100644 index 000000000..2eee9d5dd --- /dev/null +++ b/seaweed-volume/src/server/grpc_client.rs @@ -0,0 +1,206 @@ +use std::error::Error; +use std::fmt; +use std::time::Duration; + +use hyper::http::Uri; +use tonic::transport::{Certificate, Channel, ClientTlsConfig, Endpoint, Identity}; + +use crate::config::VolumeServerConfig; + +pub const GRPC_MAX_MESSAGE_SIZE: usize = 1 << 30; +const GRPC_KEEPALIVE_INTERVAL: Duration = Duration::from_secs(60); +const GRPC_KEEPALIVE_TIMEOUT: Duration = Duration::from_secs(20); +const GRPC_INITIAL_WINDOW_SIZE: u32 = 16 * 1024 * 1024; + +#[derive(Clone, Debug)] +pub struct OutgoingGrpcTlsConfig { + cert_pem: String, + key_pem: String, + ca_pem: String, +} + +#[derive(Debug)] +pub struct GrpcClientError(String); + +impl fmt::Display for GrpcClientError { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + f.write_str(&self.0) + } +} + +impl Error for GrpcClientError {} + +pub fn load_outgoing_grpc_tls( + config: &VolumeServerConfig, +) -> Result, GrpcClientError> { + if config.grpc_cert_file.is_empty() + || config.grpc_key_file.is_empty() + || config.grpc_ca_file.is_empty() + { + return Ok(None); + } + + let cert_pem = std::fs::read_to_string(&config.grpc_cert_file).map_err(|e| { + GrpcClientError(format!( + "Failed to read outgoing gRPC cert '{}': {}", + config.grpc_cert_file, e + )) + })?; + let key_pem = std::fs::read_to_string(&config.grpc_key_file).map_err(|e| { + GrpcClientError(format!( + "Failed to read outgoing gRPC key '{}': {}", + config.grpc_key_file, e + )) + })?; + let ca_pem = std::fs::read_to_string(&config.grpc_ca_file).map_err(|e| { + GrpcClientError(format!( + "Failed to read outgoing gRPC CA '{}': {}", + config.grpc_ca_file, e + )) + })?; + + Ok(Some(OutgoingGrpcTlsConfig { + cert_pem, + key_pem, + ca_pem, + })) +} + +pub fn grpc_endpoint_uri(grpc_host_port: &str, tls: Option<&OutgoingGrpcTlsConfig>) -> String { + let scheme = if tls.is_some() { "https" } else { "http" }; + format!("{}://{}", scheme, grpc_host_port) +} + +pub fn build_grpc_endpoint( + grpc_host_port: &str, + tls: Option<&OutgoingGrpcTlsConfig>, +) -> Result { + let uri = grpc_endpoint_uri(grpc_host_port, tls); + let mut endpoint = Channel::from_shared(uri.clone()) + .map_err(|e| GrpcClientError(format!("invalid gRPC endpoint {}: {}", uri, e)))? + .http2_keep_alive_interval(GRPC_KEEPALIVE_INTERVAL) + .keep_alive_timeout(GRPC_KEEPALIVE_TIMEOUT) + .keep_alive_while_idle(false) + .initial_stream_window_size(Some(GRPC_INITIAL_WINDOW_SIZE)) + .initial_connection_window_size(Some(GRPC_INITIAL_WINDOW_SIZE)) + .http2_adaptive_window(false); + + if let Some(tls) = tls { + let parsed = uri + .parse::() + .map_err(|e| GrpcClientError(format!("invalid gRPC endpoint {}: {}", uri, e)))?; + let host = parsed + .host() + .ok_or_else(|| GrpcClientError(format!("missing host in gRPC endpoint {}", uri)))?; + let tls_config = ClientTlsConfig::new() + .identity(Identity::from_pem( + tls.cert_pem.clone(), + tls.key_pem.clone(), + )) + .ca_certificate(Certificate::from_pem(tls.ca_pem.clone())) + .domain_name(host.to_string()); + endpoint = endpoint.tls_config(tls_config).map_err(|e| { + GrpcClientError(format!("configure gRPC TLS for {} failed: {}", uri, e)) + })?; + } + + Ok(endpoint) +} + +#[cfg(test)] +mod tests { + use super::{build_grpc_endpoint, grpc_endpoint_uri, load_outgoing_grpc_tls}; + use crate::config::{NeedleMapKind, ReadMode, VolumeServerConfig}; + use crate::security::tls::TlsPolicy; + + fn sample_config() -> VolumeServerConfig { + VolumeServerConfig { + port: 8080, + grpc_port: 18080, + public_port: 8080, + ip: "127.0.0.1".to_string(), + bind_ip: String::new(), + public_url: "127.0.0.1:8080".to_string(), + id: String::new(), + masters: vec![], + pre_stop_seconds: 0, + idle_timeout: 0, + data_center: String::new(), + rack: String::new(), + index_type: NeedleMapKind::InMemory, + disk_type: String::new(), + folders: vec![], + folder_max_limits: vec![], + folder_tags: vec![], + min_free_spaces: vec![], + disk_types: vec![], + idx_folder: String::new(), + white_list: vec![], + fix_jpg_orientation: false, + read_mode: ReadMode::Local, + cpu_profile: String::new(), + mem_profile: String::new(), + compaction_byte_per_second: 0, + maintenance_byte_per_second: 0, + file_size_limit_bytes: 0, + concurrent_upload_limit: 0, + concurrent_download_limit: 0, + inflight_upload_data_timeout: std::time::Duration::from_secs(0), + inflight_download_data_timeout: std::time::Duration::from_secs(0), + has_slow_read: false, + read_buffer_size_mb: 0, + ldb_timeout: 0, + pprof: false, + metrics_port: 0, + metrics_ip: String::new(), + debug: false, + debug_port: 0, + ui_enabled: false, + jwt_signing_key: vec![], + jwt_signing_expires_seconds: 0, + jwt_read_signing_key: vec![], + jwt_read_signing_expires_seconds: 0, + https_cert_file: String::new(), + https_key_file: String::new(), + https_ca_file: String::new(), + https_client_enabled: false, + https_client_cert_file: String::new(), + https_client_key_file: String::new(), + https_client_ca_file: String::new(), + grpc_cert_file: String::new(), + grpc_key_file: String::new(), + grpc_ca_file: String::new(), + grpc_allowed_wildcard_domain: String::new(), + grpc_volume_allowed_common_names: vec![], + tls_policy: TlsPolicy::default(), + enable_write_queue: false, + security_file: String::new(), + } + } + + #[test] + fn test_grpc_endpoint_uri_uses_https_when_tls_enabled() { + let tls = super::OutgoingGrpcTlsConfig { + cert_pem: "cert".to_string(), + key_pem: "key".to_string(), + ca_pem: "ca".to_string(), + }; + assert_eq!( + grpc_endpoint_uri("master.example.com:19333", Some(&tls)), + "https://master.example.com:19333" + ); + } + + #[test] + fn test_load_outgoing_grpc_tls_requires_cert_key_and_ca() { + let mut config = sample_config(); + config.grpc_cert_file = "/tmp/client.pem".to_string(); + assert!(load_outgoing_grpc_tls(&config).unwrap().is_none()); + } + + #[test] + fn test_build_grpc_endpoint_without_tls_uses_http_scheme() { + let endpoint = build_grpc_endpoint("127.0.0.1:19333", None).unwrap(); + assert_eq!(endpoint.uri().scheme_str(), Some("http")); + } +} diff --git a/seaweed-volume/src/server/grpc_server.rs b/seaweed-volume/src/server/grpc_server.rs new file mode 100644 index 000000000..295583b0a --- /dev/null +++ b/seaweed-volume/src/server/grpc_server.rs @@ -0,0 +1,4536 @@ +//! gRPC service implementation for the volume server. +//! +//! Implements the VolumeServer trait generated from volume_server.proto. +//! 48 RPCs: core volume operations are fully implemented, streaming and +//! EC operations are stubbed with appropriate error messages. + +use std::pin::Pin; +use std::sync::atomic::Ordering; +use std::sync::Arc; + +use tokio_stream::Stream; +use tonic::{Request, Response, Status, Streaming}; + +use crate::pb::filer_pb; +use crate::pb::master_pb; +use crate::pb::master_pb::seaweed_client::SeaweedClient; +use crate::pb::volume_server_pb; +use crate::pb::volume_server_pb::volume_server_server::VolumeServer; +use crate::storage::needle::needle::{self, Needle}; +use crate::storage::types::*; + +use super::grpc_client::{build_grpc_endpoint, GRPC_MAX_MESSAGE_SIZE}; +use super::volume_server::VolumeServerState; + +type BoxStream = Pin> + Send + 'static>>; + +fn volume_is_remote_only(dat_path: &str, has_remote_file: bool) -> bool { + has_remote_file && !std::path::Path::new(dat_path).exists() +} + +/// Persist VolumeServerState to a state.pb file (matches Go's State.save). +fn save_state_file( + path: &str, + state: &volume_server_pb::VolumeServerState, +) -> Result<(), std::io::Error> { + if path.is_empty() { + return Ok(()); + } + use prost::Message; + let buf = state.encode_to_vec(); + std::fs::write(path, buf) +} + +/// Load VolumeServerState from a state.pb file (matches Go's State.Load). +pub fn load_state_file( + path: &str, +) -> Option { + if path.is_empty() || !std::path::Path::new(path).exists() { + return None; + } + let data = std::fs::read(path).ok()?; + use prost::Message; + volume_server_pb::VolumeServerState::decode(data.as_slice()).ok() +} + +struct WriteThrottler { + bytes_per_second: i64, + last_size_counter: i64, + last_size_check_time: std::time::Instant, +} + +impl WriteThrottler { + fn new(bytes_per_second: i64) -> Self { + Self { + bytes_per_second, + last_size_counter: 0, + last_size_check_time: std::time::Instant::now(), + } + } + + async fn maybe_slowdown(&mut self, delta: i64) { + if self.bytes_per_second <= 0 { + return; + } + + self.last_size_counter += delta; + let elapsed = self.last_size_check_time.elapsed(); + if elapsed <= std::time::Duration::from_millis(100) { + return; + } + + let over_limit_bytes = self.last_size_counter - self.bytes_per_second / 10; + if over_limit_bytes > 0 { + let over_ratio = over_limit_bytes as f64 / self.bytes_per_second as f64; + let sleep_time = std::time::Duration::from_millis((over_ratio * 1000.0) as u64); + if !sleep_time.is_zero() { + tokio::time::sleep(sleep_time).await; + } + } + + self.last_size_counter = 0; + self.last_size_check_time = std::time::Instant::now(); + } +} + +struct MasterVolumeInfo { + volume_id: VolumeId, + collection: String, + replica_placement: u8, + ttl: u32, + disk_type: String, + ip: String, + port: u16, +} + +pub struct VolumeGrpcService { + pub state: Arc, +} + +impl VolumeGrpcService { + async fn notify_master_volume_readonly( + &self, + info: &MasterVolumeInfo, + is_readonly: bool, + ) -> Result<(), Status> { + let master_url = self.state.master_url.clone(); + if master_url.is_empty() { + return Ok(()); + } + let grpc_addr = parse_grpc_address(&master_url).map_err(|e| { + Status::internal(format!("invalid master address {}: {}", master_url, e)) + })?; + let endpoint = build_grpc_endpoint(&grpc_addr, self.state.outgoing_grpc_tls.as_ref()) + .map_err(|e| Status::internal(format!("master address {}: {}", master_url, e)))? + .connect_timeout(std::time::Duration::from_secs(5)) + .timeout(std::time::Duration::from_secs(30)); + let channel = endpoint + .connect() + .await + .map_err(|e| Status::internal(format!("connect to master {}: {}", master_url, e)))?; + let mut client = SeaweedClient::with_interceptor( + channel, + super::request_id::outgoing_request_id_interceptor, + ) + .max_decoding_message_size(GRPC_MAX_MESSAGE_SIZE) + .max_encoding_message_size(GRPC_MAX_MESSAGE_SIZE); + client + .volume_mark_readonly(master_pb::VolumeMarkReadonlyRequest { + ip: info.ip.clone(), + port: info.port as u32, + volume_id: info.volume_id.0, + collection: info.collection.clone(), + replica_placement: info.replica_placement as u32, + ttl: info.ttl, + disk_type: info.disk_type.clone(), + is_readonly, + ..Default::default() + }) + .await + .map_err(|e| { + Status::internal(format!( + "set volume {} readonly={} on master {}: {}", + info.volume_id, is_readonly, master_url, e + )) + })?; + Ok(()) + } + + /// Shared helper matching Go's `makeVolumeReadonly(ctx, v, persist)`. + /// 1. Check maintenance mode + /// 2. Notify master (readonly=true) + /// 3. Mark local volume readonly + /// 4. Notify master again (cover heartbeat race) + async fn make_volume_readonly(&self, vid: VolumeId, persist: bool) -> Result<(), Status> { + self.state.check_maintenance()?; + + let info = { + let store = self.state.store.read().unwrap(); + let (loc_idx, vol) = store + .find_volume(vid) + .ok_or_else(|| Status::not_found(format!("volume {} not found", vid)))?; + MasterVolumeInfo { + volume_id: vid, + collection: vol.collection.clone(), + replica_placement: vol.super_block.replica_placement.to_byte(), + ttl: vol.super_block.ttl.to_u32(), + disk_type: store.locations[loc_idx].disk_type.to_string(), + ip: store.ip.clone(), + port: store.port, + } + }; + + // Step 1: stop master from redirecting traffic here + self.notify_master_volume_readonly(&info, true).await?; + + // Step 2: mark local volume readonly + { + let mut store = self.state.store.write().unwrap(); + if let Some((_, vol)) = store.find_volume_mut(vid) { + vol.set_read_only_persist(persist) + .map_err(|e| Status::internal(e.to_string()))?; + } + self.state.volume_state_notify.notify_one(); + } + + // Step 3: notify master again to cover heartbeat race + self.notify_master_volume_readonly(&info, true).await?; + Ok(()) + } +} + +#[tonic::async_trait] +impl VolumeServer for VolumeGrpcService { + // ---- Core volume operations ---- + + async fn batch_delete( + &self, + request: Request, + ) -> Result, Status> { + self.state.check_maintenance()?; + let req = request.into_inner(); + let mut results = Vec::new(); + + let now = std::time::SystemTime::now() + .duration_since(std::time::UNIX_EPOCH) + .unwrap_or_default() + .as_secs(); + + for fid_str in &req.file_ids { + let file_id = match needle::FileId::parse(fid_str) { + Ok(fid) => fid, + Err(e) => { + results.push(volume_server_pb::DeleteResult { + file_id: fid_str.clone(), + status: 400, // Bad Request + error: e, + size: 0, + version: 0, + }); + continue; + } + }; + + let mut n = Needle { + id: file_id.key, + cookie: file_id.cookie, + ..Needle::default() + }; + + // Check if this is an EC volume + let is_ec_volume = { + let store = self.state.store.read().unwrap(); + store.has_ec_volume(file_id.volume_id) + }; + + // Cookie validation (unless skip_cookie_check) + if !req.skip_cookie_check { + let original_cookie = n.cookie; + if !is_ec_volume { + let store = self.state.store.read().unwrap(); + match store.read_volume_needle(file_id.volume_id, &mut n) { + Ok(_) => {} + Err(e) => { + results.push(volume_server_pb::DeleteResult { + file_id: fid_str.clone(), + status: 404, + error: e.to_string(), + size: 0, + version: 0, + }); + continue; + } + } + } else { + // For EC volumes, verify needle exists in ecx index + let store = self.state.store.read().unwrap(); + if let Some(ec_vol) = store.find_ec_volume(file_id.volume_id) { + match ec_vol.find_needle_from_ecx(n.id) { + Ok(Some((_, size))) if !size.is_deleted() => { + // Needle exists and is not deleted — cookie check not possible + // for EC volumes without distributed read, so we accept it + n.data_size = size.0 as u32; + } + Ok(_) => { + results.push(volume_server_pb::DeleteResult { + file_id: fid_str.clone(), + status: 404, + error: format!("ec needle {} not found", fid_str), + size: 0, + version: 0, + }); + continue; + } + Err(e) => { + results.push(volume_server_pb::DeleteResult { + file_id: fid_str.clone(), + status: 404, + error: e.to_string(), + size: 0, + version: 0, + }); + continue; + } + } + } else { + results.push(volume_server_pb::DeleteResult { + file_id: fid_str.clone(), + status: 404, + error: format!("ec volume {} not found", file_id.volume_id), + size: 0, + version: 0, + }); + continue; + } + } + if n.cookie != original_cookie { + results.push(volume_server_pb::DeleteResult { + file_id: fid_str.clone(), + status: 400, + error: "File Random Cookie does not match.".to_string(), + size: 0, + version: 0, + }); + break; + } + } + + // Reject chunk manifest needles + if n.is_chunk_manifest() { + results.push(volume_server_pb::DeleteResult { + file_id: fid_str.clone(), + status: 406, + error: "ChunkManifest: not allowed in batch delete mode.".to_string(), + size: 0, + version: 0, + }); + continue; + } + + n.last_modified = now; + + if !is_ec_volume { + let mut store = self.state.store.write().unwrap(); + match store.delete_volume_needle(file_id.volume_id, &mut n) { + Ok(size) => { + if size.0 == 0 { + results.push(volume_server_pb::DeleteResult { + file_id: fid_str.clone(), + status: 304, + error: String::new(), + size: 0, + version: 0, + }); + } else { + results.push(volume_server_pb::DeleteResult { + file_id: fid_str.clone(), + status: 202, + error: String::new(), + size: size.0 as u32, + version: 0, + }); + } + } + Err(e) => { + results.push(volume_server_pb::DeleteResult { + file_id: fid_str.clone(), + status: 500, + error: e.to_string(), + size: 0, + version: 0, + }); + } + } + } else { + // EC volume deletion: journal the delete locally (with cookie validation, matching Go) + let mut store = self.state.store.write().unwrap(); + if let Some(ec_vol) = store.find_ec_volume_mut(file_id.volume_id) { + match ec_vol.journal_delete_with_cookie(n.id, n.cookie) { + Ok(()) => { + results.push(volume_server_pb::DeleteResult { + file_id: fid_str.clone(), + status: 202, + error: String::new(), + size: n.data_size, + version: 0, + }); + } + Err(e) => { + results.push(volume_server_pb::DeleteResult { + file_id: fid_str.clone(), + status: 500, + error: e.to_string(), + size: 0, + version: 0, + }); + } + } + } else { + results.push(volume_server_pb::DeleteResult { + file_id: fid_str.clone(), + status: 404, + error: format!("ec volume {} not found", file_id.volume_id), + size: 0, + version: 0, + }); + } + } + } + + Ok(Response::new(volume_server_pb::BatchDeleteResponse { + results, + })) + } + + async fn vacuum_volume_check( + &self, + request: Request, + ) -> Result, Status> { + let vid = VolumeId(request.into_inner().volume_id); + let store = self.state.store.read().unwrap(); + let garbage_ratio = match store.find_volume(vid) { + Some((_, vol)) => vol.garbage_level(), + None => return Err(Status::not_found(format!("not found volume id {}", vid))), + }; + Ok(Response::new(volume_server_pb::VacuumVolumeCheckResponse { + garbage_ratio, + })) + } + + type VacuumVolumeCompactStream = BoxStream; + async fn vacuum_volume_compact( + &self, + request: Request, + ) -> Result, Status> { + self.state.check_maintenance()?; + let req = request.into_inner(); + let vid = VolumeId(req.volume_id); + let preallocate = req.preallocate as u64; + let state = self.state.clone(); + + let (tx, rx) = tokio::sync::mpsc::channel(16); + + tokio::task::spawn_blocking(move || { + let compact_start = std::time::Instant::now(); + let report_interval: i64 = 128 * 1024 * 1024; + let next_report = std::sync::atomic::AtomicI64::new(report_interval); + + let tx_clone = tx.clone(); + let result = { + let mut store = state.store.write().unwrap(); + store.compact_volume(vid, preallocate, 0, |processed| { + let target = next_report.load(std::sync::atomic::Ordering::Relaxed); + if processed > target { + let resp = volume_server_pb::VacuumVolumeCompactResponse { + processed_bytes: processed, + load_avg_1m: 0.0, + }; + // If send fails (client disconnected), stop compaction + if tx_clone.blocking_send(Ok(resp)).is_err() { + return false; + } + next_report.store( + processed + report_interval, + std::sync::atomic::Ordering::Relaxed, + ); + } + true + }) + }; + + let success = result.is_ok(); + crate::metrics::VACUUMING_HISTOGRAM + .with_label_values(&["compact"]) + .observe(compact_start.elapsed().as_secs_f64()); + crate::metrics::VACUUMING_COMPACT_COUNTER + .with_label_values(&[if success { "true" } else { "false" }]) + .inc(); + + if let Err(e) = result { + let _ = tx.blocking_send(Err(Status::internal(e))); + } + }); + + let stream = tokio_stream::wrappers::ReceiverStream::new(rx); + Ok(Response::new( + Box::pin(stream) as Self::VacuumVolumeCompactStream + )) + } + + async fn vacuum_volume_commit( + &self, + request: Request, + ) -> Result, Status> { + self.state.check_maintenance()?; + let vid = VolumeId(request.into_inner().volume_id); + + // Match Go's store_vacuum.go CommitCompactVolume: skip commit if stopping + if *self.state.is_stopping.read().unwrap() { + return Err(Status::internal(format!( + "volume id {} skips compact commit because volume server is stopping", + vid.0 + ))); + } + + let commit_start = std::time::Instant::now(); + let mut store = self.state.store.write().unwrap(); + let result = store.commit_compact_volume(vid); + crate::metrics::VACUUMING_HISTOGRAM + .with_label_values(&["commit"]) + .observe(commit_start.elapsed().as_secs_f64()); + crate::metrics::VACUUMING_COMMIT_COUNTER + .with_label_values(&[if result.is_ok() { "true" } else { "false" }]) + .inc(); + match result { + Ok((is_read_only, volume_size)) => Ok(Response::new( + volume_server_pb::VacuumVolumeCommitResponse { + is_read_only, + volume_size, + }, + )), + Err(e) => Err(Status::internal(e)), + } + } + + async fn vacuum_volume_cleanup( + &self, + request: Request, + ) -> Result, Status> { + self.state.check_maintenance()?; + let vid = VolumeId(request.into_inner().volume_id); + let mut store = self.state.store.write().unwrap(); + match store.cleanup_compact_volume(vid) { + Ok(()) => Ok(Response::new( + volume_server_pb::VacuumVolumeCleanupResponse {}, + )), + Err(e) => Err(Status::internal(e)), + } + } + + async fn delete_collection( + &self, + request: Request, + ) -> Result, Status> { + let collection = &request.into_inner().collection; + let mut store = self.state.store.write().unwrap(); + store + .delete_collection(collection) + .map_err(|e| Status::internal(e))?; + Ok(Response::new(volume_server_pb::DeleteCollectionResponse {})) + } + + async fn allocate_volume( + &self, + request: Request, + ) -> Result, Status> { + self.state.check_maintenance()?; + let req = request.into_inner(); + let vid = VolumeId(req.volume_id); + let rp = crate::storage::super_block::ReplicaPlacement::from_string(&req.replication) + .map_err(|e| Status::invalid_argument(e.to_string()))?; + let ttl = if req.ttl.is_empty() { + None + } else { + Some( + crate::storage::needle::ttl::TTL::read(&req.ttl) + .map_err(|e| Status::invalid_argument(e))?, + ) + }; + let disk_type = DiskType::from_string(&req.disk_type); + + let version = if req.version > 0 { + crate::storage::types::Version(req.version as u8) + } else { + crate::storage::types::Version::current() + }; + + let mut store = self.state.store.write().unwrap(); + store + .add_volume( + vid, + &req.collection, + Some(rp), + ttl, + req.preallocate as u64, + disk_type, + version, + ) + .map_err(|e| Status::internal(e.to_string()))?; + self.state.volume_state_notify.notify_one(); + + Ok(Response::new(volume_server_pb::AllocateVolumeResponse {})) + } + + async fn volume_sync_status( + &self, + request: Request, + ) -> Result, Status> { + let vid = VolumeId(request.into_inner().volume_id); + let store = self.state.store.read().unwrap(); + let (_, vol) = store + .find_volume(vid) + .ok_or_else(|| Status::not_found(format!("not found volume id {}", vid)))?; + + Ok(Response::new(volume_server_pb::VolumeSyncStatusResponse { + volume_id: vid.0, + collection: vol.collection.clone(), + replication: vol.super_block.replica_placement.to_string(), + ttl: vol.super_block.ttl.to_string(), + tail_offset: vol.dat_file_size().unwrap_or(0), + compact_revision: vol.super_block.compaction_revision as u32, + idx_file_size: vol.idx_file_size(), + version: vol.version().0 as u32, + })) + } + + type VolumeIncrementalCopyStream = BoxStream; + async fn volume_incremental_copy( + &self, + request: Request, + ) -> Result, Status> { + let req = request.into_inner(); + let vid = VolumeId(req.volume_id); + + // Sync to disk first + { + let mut store = self.state.store.write().unwrap(); + if let Some((_, v)) = store.find_volume_mut(vid) { + let _ = v.sync_to_disk(); + } + } + + let store = self.state.store.read().unwrap(); + let (_, v) = store + .find_volume(vid) + .ok_or_else(|| Status::not_found(format!("not found volume id {}", vid)))?; + + let dat_size = v.dat_file_size().unwrap_or(0); + let super_block_size = v.super_block.block_size() as u64; + + // If since_ns is very large (after all data), return empty + if req.since_ns == u64::MAX || dat_size <= super_block_size { + drop(store); + let stream = tokio_stream::iter(Vec::new()); + return Ok(Response::new(Box::pin(stream))); + } + + // Use binary search to find the starting offset + let start_offset = if req.since_ns == 0 { + super_block_size + } else { + match v.binary_search_by_append_at_ns(req.since_ns) { + Ok((_offset, true)) => { + // All entries are before since_ns — nothing to send + drop(store); + let stream = tokio_stream::iter(Vec::new()); + return Ok(Response::new(Box::pin(stream))); + } + Ok((offset, false)) => { + let actual = offset.to_actual_offset(); + if actual <= 0 { + super_block_size + } else { + actual as u64 + } + } + Err(e) => { + return Err(Status::internal(format!( + "fail to locate by appendAtNs {}: {}", + req.since_ns, e + ))); + } + } + }; + let mut results = Vec::new(); + let mut bytes_to_read = (dat_size - start_offset) as i64; + let buffer_size = 2 * 1024 * 1024; + let mut offset = start_offset; + + while bytes_to_read > 0 { + let chunk = std::cmp::min(bytes_to_read as usize, buffer_size); + match v.read_dat_slice(offset, chunk) { + Ok(buf) if buf.is_empty() => break, + Ok(buf) => { + let read_len = buf.len() as i64; + results.push(Ok(volume_server_pb::VolumeIncrementalCopyResponse { + file_content: buf, + })); + bytes_to_read -= read_len; + offset += read_len as u64; + } + Err(e) => return Err(Status::internal(e.to_string())), + } + } + + drop(store); + let stream = tokio_stream::iter(results); + Ok(Response::new(Box::pin(stream))) + } + + async fn volume_mount( + &self, + request: Request, + ) -> Result, Status> { + let req = request.into_inner(); + let vid = VolumeId(req.volume_id); + + let mut store = self.state.store.write().unwrap(); + store + .mount_volume_by_id(vid) + .map_err(|e| Status::internal(e.to_string()))?; + self.state.volume_state_notify.notify_one(); + + Ok(Response::new(volume_server_pb::VolumeMountResponse {})) + } + + async fn volume_unmount( + &self, + request: Request, + ) -> Result, Status> { + let vid = VolumeId(request.into_inner().volume_id); + let mut store = self.state.store.write().unwrap(); + // Go returns nil when volume is not found (idempotent unmount) + if store.unmount_volume(vid) { + self.state.volume_state_notify.notify_one(); + } + Ok(Response::new(volume_server_pb::VolumeUnmountResponse {})) + } + + async fn volume_delete( + &self, + request: Request, + ) -> Result, Status> { + self.state.check_maintenance()?; + let req = request.into_inner(); + let vid = VolumeId(req.volume_id); + let mut store = self.state.store.write().unwrap(); + if req.only_empty { + let (_, vol) = store + .find_volume(vid) + .ok_or_else(|| Status::not_found(format!("not found volume id {}", vid)))?; + if vol.file_count() > 0 { + return Err(Status::failed_precondition("volume not empty")); + } + } + store + .delete_volume(vid, req.only_empty) + .map_err(|e| Status::internal(e.to_string()))?; + self.state.volume_state_notify.notify_one(); + Ok(Response::new(volume_server_pb::VolumeDeleteResponse {})) + } + + async fn volume_mark_readonly( + &self, + request: Request, + ) -> Result, Status> { + let req = request.into_inner(); + let vid = VolumeId(req.volume_id); + // Go: volume lookup (L239-241) happens before maintenance check (L166 in makeVolumeReadonly) + { + let store = self.state.store.read().unwrap(); + store + .find_volume(vid) + .ok_or_else(|| Status::not_found(format!("volume {} not found", vid)))?; + } + self.make_volume_readonly(vid, req.persist).await?; + Ok(Response::new( + volume_server_pb::VolumeMarkReadonlyResponse {}, + )) + } + + async fn volume_mark_writable( + &self, + request: Request, + ) -> Result, Status> { + let req = request.into_inner(); + let vid = VolumeId(req.volume_id); + let info = { + let store = self.state.store.read().unwrap(); + let (loc_idx, vol) = store + .find_volume(vid) + .ok_or_else(|| Status::not_found(format!("volume {} not found", vid)))?; + MasterVolumeInfo { + volume_id: vid, + collection: vol.collection.clone(), + replica_placement: vol.super_block.replica_placement.to_byte(), + ttl: vol.super_block.ttl.to_u32(), + disk_type: store.locations[loc_idx].disk_type.to_string(), + ip: store.ip.clone(), + port: store.port, + } + }; + // Go: maintenance check (L194 in makeVolumeWritable) happens after volume lookup (L253-255) + self.state.check_maintenance()?; + + // Step 1: mark local volume as writable (save result; Go continues on error) + let mark_result = { + let mut store = self.state.store.write().unwrap(); + let res = store + .find_volume_mut(vid) + .ok_or_else(|| Status::not_found(format!("volume {} not found", vid))) + .and_then(|(_, vol)| { + vol.set_writable() + .map_err(|e| Status::internal(e.to_string())) + }); + if res.is_ok() { + self.state.volume_state_notify.notify_one(); + } + res + }; + + // Step 2: Go returns early if marking failed (L198-200), before notifying master. + mark_result?; + // Step 3: enable master to redirect traffic here + self.notify_master_volume_readonly(&info, false).await?; + Ok(Response::new( + volume_server_pb::VolumeMarkWritableResponse {}, + )) + } + + async fn volume_configure( + &self, + request: Request, + ) -> Result, Status> { + self.state.check_maintenance()?; + let req = request.into_inner(); + let vid = VolumeId(req.volume_id); + + // Validate replication string — return response error, not gRPC error + let rp = match crate::storage::super_block::ReplicaPlacement::from_string(&req.replication) + { + Ok(rp) => rp, + Err(e) => { + return Ok(Response::new(volume_server_pb::VolumeConfigureResponse { + error: format!("volume configure replication {}: {}", req.replication, e), + })); + } + }; + + let mut store = self.state.store.write().unwrap(); + + // Unmount the volume (Go propagates unmount errors via resp.Error; + // Rust unmount_volume returns bool, so not-found falls through to configure_volume) + store.unmount_volume(vid); + + // Modify the super block on disk (replica_placement byte) + if let Err(e) = store.configure_volume(vid, rp) { + let mut error = format!("volume configure {}: {}", vid, e); + // Error recovery: try to re-mount anyway + if let Err(mount_err) = store.mount_volume_by_id(vid) { + error += &format!(". Also failed to restore mount: {}", mount_err); + } + return Ok(Response::new(volume_server_pb::VolumeConfigureResponse { + error, + })); + } + + // Re-mount the volume + if let Err(e) = store.mount_volume_by_id(vid) { + return Ok(Response::new(volume_server_pb::VolumeConfigureResponse { + error: format!("volume configure mount {}: {}", vid, e), + })); + } + self.state.volume_state_notify.notify_one(); + + Ok(Response::new(volume_server_pb::VolumeConfigureResponse { + error: String::new(), + })) + } + + async fn volume_status( + &self, + request: Request, + ) -> Result, Status> { + let vid = VolumeId(request.into_inner().volume_id); + let store = self.state.store.read().unwrap(); + let (_, vol) = store + .find_volume(vid) + .ok_or_else(|| Status::not_found(format!("not found volume id {}", vid)))?; + + // Go checks v.DataBackend != nil before building the response. + if !vol.has_data_backend() { + return Err(Status::internal(format!( + "volume {} data backend not found", + vid + ))); + } + + // Go uses v.DataBackend.GetStat() which returns the actual .dat file size + let volume_size = vol.dat_file_size().unwrap_or(0); + + Ok(Response::new(volume_server_pb::VolumeStatusResponse { + is_read_only: vol.is_read_only(), + volume_size, + file_count: vol.file_count() as u64, + file_deleted_count: vol.deleted_count() as u64, + })) + } + + async fn get_state( + &self, + _request: Request, + ) -> Result, Status> { + Ok(Response::new(volume_server_pb::GetStateResponse { + state: Some(volume_server_pb::VolumeServerState { + maintenance: self.state.maintenance.load(Ordering::Relaxed), + version: self.state.state_version.load(Ordering::Relaxed), + }), + })) + } + + async fn set_state( + &self, + request: Request, + ) -> Result, Status> { + let req = request.into_inner(); + + if let Some(new_state) = &req.state { + // Go's State.Update checks version: if incoming version != stored version → error. + let current_version = self.state.state_version.load(Ordering::Relaxed); + if new_state.version != current_version { + return Err(Status::failed_precondition(format!( + "version mismatch for VolumeServerState (got {}, want {})", + new_state.version, current_version + ))); + } + + // Save previous state for rollback on persistence failure (matches Go) + let prev_maintenance = self.state.maintenance.load(Ordering::Relaxed); + let prev_version = current_version; + + self.state + .maintenance + .store(new_state.maintenance, Ordering::Relaxed); + let new_version = self.state.state_version.fetch_add(1, Ordering::Relaxed) + 1; + + // Persist to disk (matches Go's State.save) + let pb = volume_server_pb::VolumeServerState { + maintenance: new_state.maintenance, + version: new_version, + }; + if let Err(e) = save_state_file(&self.state.state_file_path, &pb) { + // Rollback in-memory state on save failure (matches Go) + self.state.maintenance.store(prev_maintenance, Ordering::Relaxed); + self.state.state_version.store(prev_version, Ordering::Relaxed); + return Err(Status::internal(format!("failed to save state: {}", e))); + } + + Ok(Response::new(volume_server_pb::SetStateResponse { + state: Some(pb), + })) + } else { + // nil state = no-op, return current state + Ok(Response::new(volume_server_pb::SetStateResponse { + state: Some(volume_server_pb::VolumeServerState { + maintenance: self.state.maintenance.load(Ordering::Relaxed), + version: self.state.state_version.load(Ordering::Relaxed), + }), + })) + } + } + + type VolumeCopyStream = BoxStream; + async fn volume_copy( + &self, + request: Request, + ) -> Result, Status> { + self.state.check_maintenance()?; + let req = request.into_inner(); + let vid = VolumeId(req.volume_id); + + // If volume already exists locally, delete it first + { + let store = self.state.store.read().unwrap(); + if store.find_volume(vid).is_some() { + drop(store); + let mut store = self.state.store.write().unwrap(); + store.delete_volume(vid, false).map_err(|e| { + Status::internal(format!("failed to delete existing volume {}: {}", vid, e)) + })?; + self.state.volume_state_notify.notify_one(); + } + } + + // Parse source_data_node address: "ip:port.grpcPort" or "ip:port" (grpc = port + 10000) + let source = &req.source_data_node; + let grpc_addr = parse_grpc_address(source).map_err(|e| { + Status::internal(format!( + "VolumeCopy volume {} invalid source_data_node {}: {}", + vid, source, e + )) + })?; + + let channel = build_grpc_endpoint(&grpc_addr, self.state.outgoing_grpc_tls.as_ref()) + .map_err(|e| { + Status::internal(format!("VolumeCopy volume {} parse source: {}", vid, e)) + })? + .connect() + .await + .map_err(|e| { + Status::internal(format!( + "VolumeCopy volume {} connect to {}: {}", + vid, grpc_addr, e + )) + })?; + + let mut client = + volume_server_pb::volume_server_client::VolumeServerClient::with_interceptor( + channel, + super::request_id::outgoing_request_id_interceptor, + ) + .max_decoding_message_size(GRPC_MAX_MESSAGE_SIZE) + .max_encoding_message_size(GRPC_MAX_MESSAGE_SIZE); + + // Get file status from source + let vol_info = client + .read_volume_file_status(volume_server_pb::ReadVolumeFileStatusRequest { + volume_id: req.volume_id, + }) + .await + .map_err(|e| Status::internal(format!("read volume file status failed, {}", e)))? + .into_inner(); + + let requested_disk_type = if !req.disk_type.is_empty() { + DiskType::from_string(&req.disk_type) + } else { + DiskType::from_string(&vol_info.disk_type) + }; + + // Find a free disk location using Go's Store.FindFreeLocation semantics. + let (data_base, idx_base, selected_disk_type) = { + let store = self.state.store.read().unwrap(); + let Some(loc_idx) = store.find_free_location_predicate(|loc| { + loc.disk_type == requested_disk_type + && loc.available_space.load(Ordering::Relaxed) > vol_info.dat_file_size + }) else { + return Err(Status::internal(format!( + "no space left {}", + requested_disk_type.readable_string() + ))); + }; + let loc = &store.locations[loc_idx]; + ( + loc.directory.clone(), + loc.idx_directory.clone(), + loc.disk_type.clone(), + ) + }; + + let data_base_name = + crate::storage::volume::volume_file_name(&data_base, &vol_info.collection, vid); + let idx_base_name = + crate::storage::volume::volume_file_name(&idx_base, &vol_info.collection, vid); + + // Write a .note file to indicate copy in progress + let note_path = format!("{}.note", data_base_name); + let _ = std::fs::write(¬e_path, format!("copying from {}", source)); + + let has_remote_dat = vol_info + .volume_info + .as_ref() + .map(|vi| !vi.files.is_empty()) + .unwrap_or(false); + + let (tx, rx) = + tokio::sync::mpsc::channel::>(16); + let state = self.state.clone(); + + tokio::spawn(async move { + let result = async { + let report_interval: i64 = 128 * 1024 * 1024; + let mut next_report_target: i64 = report_interval; + let io_byte_per_second = if req.io_byte_per_second > 0 { + req.io_byte_per_second + } else { + state.maintenance_byte_per_second + }; + let mut throttler = WriteThrottler::new(io_byte_per_second); + + // Query master for preallocation settings (matching Go VolumeCopy behavior). + let mut preallocate_size: i64 = 0; + if !has_remote_dat { + let grpc_addr = super::heartbeat::to_grpc_address(&state.master_url); + match super::heartbeat::try_get_master_configuration( + &grpc_addr, + state.outgoing_grpc_tls.as_ref(), + ) + .await + { + Ok(resp) => { + if resp.volume_preallocate { + preallocate_size = resp.volume_size_limit_m_b as i64 * 1024 * 1024; + } + } + Err(e) => { + tracing::warn!("get master {} configuration: {}", state.master_url, e); + } + } + + if preallocate_size > 0 { + let dat_path = format!("{}.dat", data_base_name); + let file = std::fs::File::create(&dat_path).map_err(|e| { + Status::internal(format!( + "create preallocated volume file {}: {}", + dat_path, e + )) + })?; + file.set_len(preallocate_size as u64).map_err(|e| { + Status::internal(format!("preallocate volume file {}: {}", dat_path, e)) + })?; + } + } + + // Copy .dat file + if !has_remote_dat { + let dat_path = format!("{}.dat", data_base_name); + let dat_modified_ts_ns = copy_file_from_source( + &mut client, + false, + &req.collection, + req.volume_id, + vol_info.compaction_revision, + vol_info.dat_file_size, + &dat_path, + ".dat", + false, + true, + Some(&tx), + &mut next_report_target, + report_interval, + &mut throttler, + ) + .await + .map_err(|e| Status::internal(e))?; + if dat_modified_ts_ns > 0 { + set_file_mtime(&dat_path, dat_modified_ts_ns); + } + } + + // Copy .idx file + let idx_path = format!("{}.idx", idx_base_name); + let idx_modified_ts_ns = copy_file_from_source( + &mut client, + false, + &req.collection, + req.volume_id, + vol_info.compaction_revision, + vol_info.idx_file_size, + &idx_path, + ".idx", + false, + false, + None, + &mut next_report_target, + report_interval, + &mut throttler, + ) + .await + .map_err(|e| Status::internal(e))?; + if idx_modified_ts_ns > 0 { + set_file_mtime(&idx_path, idx_modified_ts_ns); + } + + // Copy .vif file (ignore if not found on source) + let vif_path = format!("{}.vif", data_base_name); + let vif_modified_ts_ns = copy_file_from_source( + &mut client, + false, + &req.collection, + req.volume_id, + vol_info.compaction_revision, + 1024 * 1024, + &vif_path, + ".vif", + false, + true, + None, + &mut next_report_target, + report_interval, + &mut throttler, + ) + .await + .map_err(|e| Status::internal(e))?; + if vif_modified_ts_ns > 0 { + set_file_mtime(&vif_path, vif_modified_ts_ns); + } + + // Remove the .note file + let _ = std::fs::remove_file(¬e_path); + + // Verify file sizes + if !has_remote_dat { + let dat_path = format!("{}.dat", data_base_name); + check_copy_file_size(&dat_path, vol_info.dat_file_size)?; + } + if vol_info.idx_file_size > 0 { + check_copy_file_size(&idx_path, vol_info.idx_file_size)?; + } + + // Find last_append_at_ns from copied files + let last_append_at_ns = if !has_remote_dat { + find_last_append_at_ns( + &idx_path, + &format!("{}.dat", data_base_name), + vol_info.version, + ) + .unwrap_or(vol_info.dat_file_timestamp_seconds * 1_000_000_000) + } else { + vol_info.dat_file_timestamp_seconds * 1_000_000_000 + }; + + // Mount the volume + { + let mut store = state.store.write().unwrap(); + store + .mount_volume(vid, &vol_info.collection, selected_disk_type) + .map_err(|e| { + Status::internal(format!("failed to mount volume {}: {}", vid, e)) + })?; + } + state.volume_state_notify.notify_one(); + + // Send final response with last_append_at_ns + let _ = tx + .send(Ok(volume_server_pb::VolumeCopyResponse { + last_append_at_ns: last_append_at_ns, + processed_bytes: 0, + })) + .await; + + Ok::<(), Status>(()) + } + .await; + + if let Err(e) = result { + // Clean up on error + let _ = std::fs::remove_file(format!("{}.dat", data_base_name)); + let _ = std::fs::remove_file(format!("{}.idx", idx_base_name)); + let _ = std::fs::remove_file(format!("{}.vif", data_base_name)); + let _ = std::fs::remove_file(¬e_path); + let _ = tx.send(Err(e)).await; + } + }); + + let stream = tokio_stream::wrappers::ReceiverStream::new(rx); + Ok(Response::new(Box::pin(stream))) + } + + async fn read_volume_file_status( + &self, + request: Request, + ) -> Result, Status> { + let vid = VolumeId(request.into_inner().volume_id); + let store = self.state.store.read().unwrap(); + let (loc_idx, vol) = store + .find_volume(vid) + .ok_or_else(|| Status::not_found(format!("not found volume id {}", vid)))?; + + let mod_time = vol.dat_file_mod_time(); + Ok(Response::new( + volume_server_pb::ReadVolumeFileStatusResponse { + volume_id: vid.0, + idx_file_timestamp_seconds: mod_time, + idx_file_size: vol.idx_file_size(), + dat_file_timestamp_seconds: mod_time, + dat_file_size: vol.dat_file_size().unwrap_or(0), + file_count: vol.file_count() as u64, + compaction_revision: vol.super_block.compaction_revision as u32, + collection: vol.collection.clone(), + disk_type: store.locations[loc_idx].disk_type.to_string(), + volume_info: Some(vol.volume_info.clone()), + version: vol.version().0 as u32, + }, + )) + } + + type CopyFileStream = BoxStream; + async fn copy_file( + &self, + request: Request, + ) -> Result, Status> { + let req = request.into_inner(); + let vid = VolumeId(req.volume_id); + + let file_name: String; + + if !req.is_ec_volume { + // Sync volume to disk before copying (matching Go's v.SyncToDisk()) + { + let mut store = self.state.store.write().unwrap(); + if let Some((_, v)) = store.find_volume_mut(vid) { + let _ = v.sync_to_disk(); + } + } + + let store = self.state.store.read().unwrap(); + let (_, v) = store + .find_volume(vid) + .ok_or_else(|| Status::not_found(format!("not found volume id {}", vid)))?; + + // Check compaction revision + if req.compaction_revision != u32::MAX + && v.last_compact_revision() != req.compaction_revision as u16 + { + return Err(Status::failed_precondition(format!( + "volume {} is compacted", + vid.0 + ))); + } + + file_name = v.file_name(&req.ext); + drop(store); + } else { + // Sync EC volume journal to disk before copying (matching Go's ecv.SyncToDisk()) + { + let store = self.state.store.read().unwrap(); + if let Some(ecv) = store.find_ec_volume(vid) { + let _ = ecv.sync_to_disk(); + } + } + + // EC volume: search disk locations for the file + let store = self.state.store.read().unwrap(); + let mut found_path = None; + let ec_base = if req.collection.is_empty() { + format!("{}{}", vid.0, req.ext) + } else { + format!("{}_{}{}", req.collection, vid.0, req.ext) + }; + for loc in &store.locations { + let path = format!("{}/{}", loc.directory, ec_base); + if std::path::Path::new(&path).exists() { + found_path = Some(path); + } + let idx_path = format!("{}/{}", loc.idx_directory, ec_base); + if std::path::Path::new(&idx_path).exists() { + found_path = Some(idx_path); + } + } + drop(store); + + match found_path { + Some(p) => file_name = p, + None => { + if req.ignore_source_file_not_found { + let stream = tokio_stream::iter(Vec::new()); + return Ok(Response::new(Box::pin(stream))); + } + return Err(Status::not_found(format!( + "CopyFile not found ec volume id {}", + vid.0 + ))); + } + } + } + + // Open file and read content + let file = match std::fs::File::open(&file_name) { + Ok(f) => f, + Err(e) if e.kind() == std::io::ErrorKind::NotFound => { + if req.ignore_source_file_not_found || req.stop_offset == 0 { + let stream = tokio_stream::iter(Vec::new()); + return Ok(Response::new(Box::pin(stream))); + } + return Err(Status::not_found(format!("{}", e))); + } + Err(e) => return Err(Status::internal(e.to_string())), + }; + + let metadata = file + .metadata() + .map_err(|e| Status::internal(e.to_string()))?; + let mod_ts_ns = metadata + .modified() + .ok() + .and_then(|t| t.duration_since(std::time::UNIX_EPOCH).ok()) + .map(|d| d.as_nanos() as i64) + .unwrap_or(0); + + let mut results: Vec> = Vec::new(); + let mut bytes_to_read = req.stop_offset as i64; + let mut reader = std::io::BufReader::new(file); + let buffer_size = 2 * 1024 * 1024; // 2MB chunks + let mut first = true; + + use std::io::Read; + while bytes_to_read > 0 { + let chunk_size = std::cmp::min(bytes_to_read as usize, buffer_size); + let mut buf = vec![0u8; chunk_size]; + match reader.read(&mut buf) { + Ok(0) => break, // EOF + Ok(n) => { + buf.truncate(n); + if n as i64 > bytes_to_read { + buf.truncate(bytes_to_read as usize); + } + results.push(Ok(volume_server_pb::CopyFileResponse { + file_content: buf, + modified_ts_ns: if first { mod_ts_ns } else { 0 }, + })); + first = false; + bytes_to_read -= n as i64; + } + Err(e) => return Err(Status::internal(e.to_string())), + } + } + + // If no data was sent, still send ModifiedTsNs + if first && mod_ts_ns != 0 { + results.push(Ok(volume_server_pb::CopyFileResponse { + file_content: vec![], + modified_ts_ns: mod_ts_ns, + })); + } + + let stream = tokio_stream::iter(results); + Ok(Response::new(Box::pin(stream))) + } + + async fn receive_file( + &self, + request: Request>, + ) -> Result, Status> { + self.state.check_maintenance()?; + + let mut stream = request.into_inner(); + let mut target_file: Option = None; + let mut file_path: Option = None; + let mut bytes_written: u64 = 0; + let mut resp_error: Option = None; + + let result: Result<(), Status> = async { + while let Some(req) = stream.message().await? { + match req.data { + Some(volume_server_pb::receive_file_request::Data::Info(info)) => { + // Determine file path + let path = if info.is_ec_volume { + let store = self.state.store.read().unwrap(); + // Go prefers a HardDriveType location, then falls back to first + let dir = store + .locations + .iter() + .find(|loc| loc.disk_type == DiskType::HardDrive) + .or_else(|| store.locations.first()) + .map(|loc| loc.directory.clone()); + drop(store); + let dir = match dir { + Some(d) => d, + None => { + resp_error = Some("no storage location available".to_string()); + break; + } + }; + let ec_base = if info.collection.is_empty() { + format!("{}", info.volume_id) + } else { + format!("{}_{}", info.collection, info.volume_id) + }; + format!("{}/{}{}", dir, ec_base, info.ext) + } else { + let store = self.state.store.read().unwrap(); + let (_, v) = + store.find_volume(VolumeId(info.volume_id)).ok_or_else(|| { + Status::not_found(format!( + "volume {} not found", + info.volume_id + )) + })?; + let p = v.file_name(&info.ext); + drop(store); + p + }; + + target_file = Some(std::fs::File::create(&path).map_err(|e| { + Status::internal(format!("failed to create file: {}", e)) + })?); + file_path = Some(path); + } + Some(volume_server_pb::receive_file_request::Data::FileContent(content)) => { + if let Some(ref mut f) = target_file { + use std::io::Write; + match f.write(&content) { + Ok(n) => bytes_written += n as u64, + Err(e) => { + // Match Go: write failures are response-level errors, not gRPC errors + resp_error = Some(format!("failed to write file: {}", e)); + break; + } + } + } else { + // Go returns protocol violations as response-level errors + resp_error = Some("file info must be sent first".to_string()); + break; + } + } + None => { + resp_error = Some("unknown message type".to_string()); + break; + } + } + } + Ok(()) + } + .await; + + match result { + Ok(()) => { + // Check for protocol-level errors (returned in response body, not gRPC status) + if let Some(err_msg) = resp_error { + return Ok(Response::new(volume_server_pb::ReceiveFileResponse { + error: err_msg, + bytes_written: 0, + })); + } + if let Some(ref f) = target_file { + let _ = f.sync_all(); + } + Ok(Response::new(volume_server_pb::ReceiveFileResponse { + error: String::new(), + bytes_written, + })) + } + Err(e) => { + // Clean up partial file on stream error (Go parity: closes file, removes it) + if let Some(f) = target_file.take() { + drop(f); + } + if let Some(ref p) = file_path { + let _ = std::fs::remove_file(p); + } + Err(e) + } + } + } + + async fn read_needle_blob( + &self, + request: Request, + ) -> Result, Status> { + let req = request.into_inner(); + let vid = VolumeId(req.volume_id); + let offset = req.offset; + let size = Size(req.size); + + let store = self.state.store.read().unwrap(); + let (_, vol) = store + .find_volume(vid) + .ok_or_else(|| Status::not_found(format!("not found volume id {}", vid)))?; + + let blob = vol.read_needle_blob(offset, size).map_err(|e| { + Status::internal(format!( + "read needle blob offset {} size {}: {}", + offset, size.0, e + )) + })?; + + Ok(Response::new(volume_server_pb::ReadNeedleBlobResponse { + needle_blob: blob, + })) + } + + async fn read_needle_meta( + &self, + request: Request, + ) -> Result, Status> { + let req = request.into_inner(); + let vid = VolumeId(req.volume_id); + let needle_id = NeedleId(req.needle_id); + + let store = self.state.store.read().unwrap(); + let (_, vol) = store.find_volume(vid).ok_or_else(|| { + Status::not_found(format!( + "not found volume id {} and read needle metadata at ec shards is not supported", + vid + )) + })?; + + let offset = req.offset; + let size = crate::storage::types::Size(req.size); + + let mut n = Needle { + id: needle_id, + flags: 0x08, + ..Needle::default() + }; + vol.read_needle_meta_at(&mut n, offset, size) + .map_err(|e| Status::internal(format!("read needle meta: {}", e)))?; + + let ttl_str = n.ttl.as_ref().map_or(String::new(), |t| t.to_string()); + Ok(Response::new(volume_server_pb::ReadNeedleMetaResponse { + cookie: n.cookie.0, + last_modified: n.last_modified, + crc: n.checksum.0, + ttl: ttl_str, + append_at_ns: n.append_at_ns, + })) + } + + async fn write_needle_blob( + &self, + request: Request, + ) -> Result, Status> { + self.state.check_maintenance()?; + let req = request.into_inner(); + let vid = VolumeId(req.volume_id); + let needle_id = NeedleId(req.needle_id); + let size = Size(req.size); + + let mut store = self.state.store.write().unwrap(); + let (_, vol) = store + .find_volume_mut(vid) + .ok_or_else(|| Status::not_found(format!("not found volume id {}", vid)))?; + + vol.write_needle_blob_and_index(needle_id, &req.needle_blob, size) + .map_err(|e| { + Status::internal(format!( + "write blob needle {} size {}: {}", + needle_id.0, size.0, e + )) + })?; + + Ok(Response::new(volume_server_pb::WriteNeedleBlobResponse {})) + } + + type ReadAllNeedlesStream = BoxStream; + async fn read_all_needles( + &self, + request: Request, + ) -> Result, Status> { + let req = request.into_inner(); + let state = self.state.clone(); + + let (tx, rx) = tokio::sync::mpsc::channel(32); + + // Stream needles lazily via a blocking task (matches Go's scanner pattern) + tokio::task::spawn_blocking(move || { + let store = state.store.read().unwrap(); + for &raw_vid in &req.volume_ids { + let vid = VolumeId(raw_vid); + let v = match store.find_volume(vid) { + Some((_, v)) => v, + None => { + let _ = tx.blocking_send(Err(Status::not_found(format!( + "not found volume id {}", + vid + )))); + return; + } + }; + + let needles = match v.read_all_needles() { + Ok(n) => n, + Err(e) => { + let _ = tx.blocking_send(Err(Status::internal(e.to_string()))); + return; + } + }; + + for n in needles { + let compressed = n.is_compressed(); + if tx + .blocking_send(Ok(volume_server_pb::ReadAllNeedlesResponse { + volume_id: raw_vid, + needle_id: n.id.into(), + cookie: n.cookie.0, + needle_blob: n.data, + needle_blob_compressed: compressed, + last_modified: n.last_modified, + crc: n.checksum.0, + name: n.name, + mime: n.mime, + })) + .is_err() + { + return; // receiver dropped + } + } + } + }); + + let stream = tokio_stream::wrappers::ReceiverStream::new(rx); + Ok(Response::new(Box::pin(stream))) + } + + type VolumeTailSenderStream = BoxStream; + async fn volume_tail_sender( + &self, + request: Request, + ) -> Result, Status> { + let req = request.into_inner(); + let vid = VolumeId(req.volume_id); + + let (version, sb_size) = { + let store = self.state.store.read().unwrap(); + let (_, vol) = store + .find_volume(vid) + .ok_or_else(|| Status::not_found(format!("not found volume id {}", vid)))?; + (vol.version().0 as u32, vol.super_block.block_size() as u64) + }; + + let state = self.state.clone(); + let (tx, rx) = tokio::sync::mpsc::channel(32); + const BUFFER_SIZE_LIMIT: usize = 2 * 1024 * 1024; + + tokio::spawn(async move { + let since_ns = req.since_ns; + let idle_timeout = req.idle_timeout_seconds; + let mut last_timestamp_ns = since_ns; + let mut draining_seconds = idle_timeout as i64; + + loop { + // Use binary search to find starting offset, then scan from there + let scan_result = { + let store = state.store.read().unwrap(); + if let Some((_, vol)) = store.find_volume(vid) { + let start_offset = if last_timestamp_ns > 0 { + match vol.binary_search_by_append_at_ns(last_timestamp_ns) { + Ok((offset, _is_last)) => { + if offset.is_zero() { + Ok(sb_size) + } else { + Ok(offset.to_actual_offset() as u64) + } + } + Err(e) => { + tracing::warn!( + "fail to locate by appendAtNs {}: {}", + last_timestamp_ns, + e + ); + Err(format!( + "fail to locate by appendAtNs {}: {}", + last_timestamp_ns, e + )) + } + } + } else { + Ok(sb_size) + }; + match start_offset { + Ok(off) => Ok(vol.scan_raw_needles_from(off)), + Err(msg) => Err(msg), + } + } else { + break; + } + }; + + let scan_inner = match scan_result { + Ok(r) => r, + Err(msg) => { + let _ = tx.send(Err(Status::internal(msg))).await; + return; + } + }; + + let entries = match scan_inner { + Ok(e) => e, + Err(_) => break, + }; + + // Filter entries since last_timestamp_ns + let mut last_processed_ns = last_timestamp_ns; + let mut sent_any = false; + for (header, body, append_at_ns) in &entries { + if *append_at_ns <= last_timestamp_ns && last_timestamp_ns > 0 { + continue; + } + sent_any = true; + // Send body in chunks of BUFFER_SIZE_LIMIT + // Go sends needle_header on every chunk + let mut i = 0; + while i < body.len() { + let end = std::cmp::min(i + BUFFER_SIZE_LIMIT, body.len()); + let is_last_chunk = end >= body.len(); + let msg = volume_server_pb::VolumeTailSenderResponse { + needle_header: header.clone(), + needle_body: body[i..end].to_vec(), + is_last_chunk, + version, + }; + if tx.send(Ok(msg)).await.is_err() { + return; + } + i = end; + } + if *append_at_ns > last_processed_ns { + last_processed_ns = *append_at_ns; + } + } + + if !sent_any { + // Send heartbeat + let msg = volume_server_pb::VolumeTailSenderResponse { + is_last_chunk: true, + version, + ..Default::default() + }; + if tx.send(Ok(msg)).await.is_err() { + return; + } + } + + tokio::time::sleep(std::time::Duration::from_secs(2)).await; + + if idle_timeout == 0 { + last_timestamp_ns = last_processed_ns; + continue; + } + if last_processed_ns == last_timestamp_ns { + draining_seconds -= 1; + if draining_seconds <= 0 { + return; // EOF + } + } else { + last_timestamp_ns = last_processed_ns; + draining_seconds = idle_timeout as i64; + } + } + }); + + let stream = tokio_stream::wrappers::ReceiverStream::new(rx); + Ok(Response::new(Box::pin(stream))) + } + + async fn volume_tail_receiver( + &self, + request: Request, + ) -> Result, Status> { + let req = request.into_inner(); + let vid = VolumeId(req.volume_id); + + // Check volume exists + { + let store = self.state.store.read().unwrap(); + store.find_volume(vid).ok_or_else(|| { + Status::not_found(format!("receiver not found volume id {}", vid)) + })?; + } + + // Parse source address and connect + let source = &req.source_volume_server; + let grpc_addr = parse_grpc_address(source) + .map_err(|e| Status::internal(format!("invalid source address {}: {}", source, e)))?; + + let channel = build_grpc_endpoint(&grpc_addr, self.state.outgoing_grpc_tls.as_ref()) + .map_err(|e| Status::internal(format!("parse source: {}", e)))? + .connect() + .await + .map_err(|e| Status::internal(format!("connect to {}: {}", grpc_addr, e)))?; + + let mut client = + volume_server_pb::volume_server_client::VolumeServerClient::with_interceptor( + channel, + super::request_id::outgoing_request_id_interceptor, + ) + .max_decoding_message_size(GRPC_MAX_MESSAGE_SIZE) + .max_encoding_message_size(GRPC_MAX_MESSAGE_SIZE); + + // Call VolumeTailSender on source + let mut stream = client + .volume_tail_sender(volume_server_pb::VolumeTailSenderRequest { + volume_id: req.volume_id, + since_ns: req.since_ns, + idle_timeout_seconds: req.idle_timeout_seconds, + }) + .await + .map_err(|e| Status::internal(format!("volume_tail_sender: {}", e)))? + .into_inner(); + + let state = self.state.clone(); + + // Receive needles from source and write locally + while let Some(resp) = stream + .message() + .await + .map_err(|e| Status::internal(format!("recv from tail sender: {}", e)))? + { + let needle_header = resp.needle_header; + let mut needle_body = resp.needle_body; + + if needle_header.is_empty() { + continue; + } + + // Collect all chunks if not last + if !resp.is_last_chunk { + // Need to receive remaining chunks + loop { + let chunk = stream + .message() + .await + .map_err(|e| Status::internal(format!("recv chunk: {}", e)))? + .ok_or_else(|| Status::internal("unexpected end of tail stream"))?; + needle_body.extend_from_slice(&chunk.needle_body); + if chunk.is_last_chunk { + break; + } + } + } + + // Parse needle from header + body + let mut n = Needle::default(); + n.read_header(&needle_header); + n.read_body_v2(&needle_body) + .map_err(|e| Status::internal(format!("parse needle body: {}", e)))?; + + // Write needle to local volume + let mut store = state.store.write().unwrap(); + store + .write_volume_needle(vid, &mut n) + .map_err(|e| Status::internal(format!("write needle: {}", e)))?; + } + + Ok(Response::new( + volume_server_pb::VolumeTailReceiverResponse {}, + )) + } + + // ---- EC operations ---- + + async fn volume_ec_shards_generate( + &self, + request: Request, + ) -> Result, Status> { + self.state.check_maintenance()?; + let req = request.into_inner(); + let vid = VolumeId(req.volume_id); + let collection = &req.collection; + + // Find the volume's directory and validate collection + let (dir, idx_dir, vol_version, dat_file_size, expire_at_sec) = { + let store = self.state.store.read().unwrap(); + let (loc_idx, vol) = store + .find_volume(vid) + .ok_or_else(|| Status::not_found(format!("volume {} not found", vid)))?; + if vol.collection != req.collection { + return Err(Status::internal(format!( + "existing collection:{} unexpected input: {}", + vol.collection, req.collection + ))); + } + let version = vol.version().0 as u32; + let dat_size = vol.dat_file_size().unwrap_or(0) as i64; + let expire_at_sec = { + let ttl_seconds = vol.super_block.ttl.to_seconds(); + if ttl_seconds > 0 { + std::time::SystemTime::now() + .duration_since(std::time::UNIX_EPOCH) + .unwrap_or_default() + .as_secs() + + ttl_seconds + } else { + 0 + } + }; + ( + store.locations[loc_idx].directory.clone(), + store.locations[loc_idx].idx_directory.clone(), + version, + dat_size, + expire_at_sec, + ) + }; + + // Check existing .vif for EC shard config (matching Go's MaybeLoadVolumeInfo) + let (data_shards, parity_shards) = + crate::storage::erasure_coding::ec_volume::read_ec_shard_config(&dir, collection, vid); + + if let Err(e) = crate::storage::erasure_coding::ec_encoder::write_ec_files( + &dir, + &idx_dir, + collection, + vid, + data_shards as usize, + parity_shards as usize, + ) { + // Cleanup partially-created .ecNN and .ecx files on failure (matching Go defer) + let base = crate::storage::volume::volume_file_name(&dir, collection, vid); + let total_shards = data_shards + parity_shards; + for i in 0..total_shards { + let shard_path = format!("{}.ec{:02}", base, i); + let _ = std::fs::remove_file(&shard_path); + } + let _ = std::fs::remove_file(format!("{}.ecx", base)); + return Err(Status::internal(e.to_string())); + } + + // Write .vif file with EC shard metadata + { + let base = crate::storage::volume::volume_file_name(&dir, collection, vid); + let vif_path = format!("{}.vif", base); + let vif = crate::storage::volume::VifVolumeInfo { + version: vol_version, + dat_file_size, + expire_at_sec, + ec_shard_config: Some(crate::storage::volume::VifEcShardConfig { + data_shards: data_shards, + parity_shards: parity_shards, + }), + ..Default::default() + }; + let content = serde_json::to_string_pretty(&vif) + .map_err(|e| Status::internal(format!("serialize vif: {}", e)))?; + std::fs::write(&vif_path, content) + .map_err(|e| Status::internal(format!("write vif: {}", e)))?; + } + + Ok(Response::new( + volume_server_pb::VolumeEcShardsGenerateResponse {}, + )) + } + + async fn volume_ec_shards_rebuild( + &self, + request: Request, + ) -> Result, Status> { + self.state.check_maintenance()?; + let req = request.into_inner(); + let vid = VolumeId(req.volume_id); + let collection = &req.collection; + + // Search ALL locations for shards, pick the best rebuild location + // (most shards + has .ecx), collect additional dirs. + // Matches Go's multi-location search in VolumeEcShardsRebuild. + let base_name = if collection.is_empty() { + format!("{}", vid.0) + } else { + format!("{}_{}", collection, vid.0) + }; + + struct LocInfo { + dir: String, + idx_dir: String, + shard_count: usize, + has_ecx: bool, + } + + let store = self.state.store.read().unwrap(); + let mut loc_infos: Vec = Vec::new(); + + for loc in &store.locations { + // Count shards in this location's directory + let mut shard_count = 0usize; + if let Ok(entries) = std::fs::read_dir(&loc.directory) { + for entry in entries.flatten() { + let name = entry.file_name(); + let name = name.to_string_lossy(); + if name.starts_with(&format!("{}.ec", base_name)) { + let suffix = &name[base_name.len() + 3..]; + if suffix.len() == 2 && suffix.chars().all(|c| c.is_ascii_digit()) { + shard_count += 1; + } + } + } + } + + // Check for .ecx in idx_directory first, then data directory + let idx_base = format!("{}/{}", loc.idx_directory, base_name); + let data_base = format!("{}/{}", loc.directory, base_name); + let has_ecx = std::path::Path::new(&format!("{}.ecx", idx_base)).exists() + || (loc.idx_directory != loc.directory + && std::path::Path::new(&format!("{}.ecx", data_base)).exists()); + + if shard_count == 0 && !has_ecx { + continue; + } + + loc_infos.push(LocInfo { + dir: loc.directory.clone(), + idx_dir: loc.idx_directory.clone(), + shard_count, + has_ecx, + }); + } + drop(store); + + if loc_infos.is_empty() { + return Ok(Response::new( + volume_server_pb::VolumeEcShardsRebuildResponse { + rebuilt_shard_ids: vec![], + }, + )); + } + + // Pick rebuild location: has .ecx and most shards + let mut rebuild_loc_idx: Option = None; + let mut other_dirs: Vec = Vec::new(); + + for (i, info) in loc_infos.iter().enumerate() { + if info.has_ecx + && (rebuild_loc_idx.is_none() + || info.shard_count > loc_infos[rebuild_loc_idx.unwrap()].shard_count) + { + if let Some(prev) = rebuild_loc_idx { + other_dirs.push(loc_infos[prev].dir.clone()); + } + rebuild_loc_idx = Some(i); + } else { + other_dirs.push(info.dir.clone()); + } + } + + let rebuild_loc_idx = match rebuild_loc_idx { + Some(i) => i, + None => { + return Ok(Response::new( + volume_server_pb::VolumeEcShardsRebuildResponse { + rebuilt_shard_ids: vec![], + }, + )); + } + }; + + let rebuild_dir = loc_infos[rebuild_loc_idx].dir.clone(); + let rebuild_idx_dir = loc_infos[rebuild_loc_idx].idx_dir.clone(); + + // Determine data/parity shard config from rebuild dir + let (data_shards, parity_shards) = + crate::storage::erasure_coding::ec_volume::read_ec_shard_config( + &rebuild_dir, + collection, + vid, + ); + let total_shards = data_shards + parity_shards; + + // Check which shards are missing (check rebuild dir and all other dirs) + let mut missing: Vec = Vec::new(); + for shard_id in 0..total_shards as u8 { + let shard = crate::storage::erasure_coding::ec_shard::EcVolumeShard::new( + &rebuild_dir, + collection, + vid, + shard_id, + ); + let mut found = std::path::Path::new(&shard.file_name()).exists(); + if !found { + for other_dir in &other_dirs { + let other_shard = crate::storage::erasure_coding::ec_shard::EcVolumeShard::new( + other_dir, collection, vid, shard_id, + ); + if std::path::Path::new(&other_shard.file_name()).exists() { + found = true; + break; + } + } + } + if !found { + missing.push(shard_id as u32); + } + } + + if missing.is_empty() { + return Ok(Response::new( + volume_server_pb::VolumeEcShardsRebuildResponse { + rebuilt_shard_ids: vec![], + }, + )); + } + + // Rebuild missing shards, searching all locations for input shards + crate::storage::erasure_coding::ec_encoder::rebuild_ec_files( + &rebuild_dir, + collection, + vid, + &missing, + data_shards as usize, + parity_shards as usize, + ) + .map_err(|e| Status::internal(format!("RebuildEcFiles: {}", e)))?; + + // Rebuild .ecx; use idx_directory with fallback to data directory + let ecx_base = format!("{}/{}", rebuild_idx_dir, base_name); + let ecx_rebuild_dir = if std::path::Path::new(&format!("{}.ecx", ecx_base)).exists() { + rebuild_idx_dir + } else if rebuild_idx_dir != rebuild_dir { + rebuild_dir.clone() + } else { + rebuild_idx_dir + }; + + crate::storage::erasure_coding::ec_encoder::rebuild_ecx_file( + &ecx_rebuild_dir, + collection, + vid, + data_shards as usize, + ) + .map_err(|e| Status::internal(format!("RebuildEcxFile: {}", e)))?; + + Ok(Response::new( + volume_server_pb::VolumeEcShardsRebuildResponse { + rebuilt_shard_ids: missing, + }, + )) + } + + async fn volume_ec_shards_copy( + &self, + request: Request, + ) -> Result, Status> { + self.state.check_maintenance()?; + let req = request.into_inner(); + let vid = VolumeId(req.volume_id); + + // Select target location matching Go's 3-tier fallback: + // When disk_id > 0: use that specific location + // When disk_id == 0 (unset): (1) location with existing EC shards, (2) any HDD, (3) any + let (dest_dir, dest_idx_dir) = { + let store = self.state.store.read().unwrap(); + let count = store.locations.len(); + + if req.disk_id > 0 { + // Explicit disk selection + if (req.disk_id as usize) >= count { + return Err(Status::invalid_argument(format!( + "invalid disk_id {}: only have {} disks", + req.disk_id, count + ))); + } + let loc = &store.locations[req.disk_id as usize]; + (loc.directory.clone(), loc.idx_directory.clone()) + } else { + // Auto-select: prefer location with existing EC shards for this volume + let loc_idx = store + .find_free_location_predicate(|loc| loc.has_ec_volume(vid)) + .or_else(|| { + // Fall back to any HDD location + store.find_free_location_predicate(|loc| { + loc.disk_type == DiskType::HardDrive + }) + }) + .or_else(|| { + // Fall back to any location + store.find_free_location_predicate(|_| true) + }); + match loc_idx { + Some(i) => { + let loc = &store.locations[i]; + (loc.directory.clone(), loc.idx_directory.clone()) + } + None => { + return Err(Status::internal("no space left".to_string())); + } + } + } + }; + + // Connect to source and copy shard files via CopyFile + let source = &req.source_data_node; + let grpc_addr = parse_grpc_address(source).map_err(|e| { + Status::internal(format!( + "VolumeEcShardsCopy volume {} invalid source_data_node {}: {}", + vid, source, e + )) + })?; + + let channel = build_grpc_endpoint(&grpc_addr, self.state.outgoing_grpc_tls.as_ref()) + .map_err(|e| { + Status::internal(format!( + "VolumeEcShardsCopy volume {} parse source: {}", + vid, e + )) + })? + .connect() + .await + .map_err(|e| { + Status::internal(format!( + "VolumeEcShardsCopy volume {} connect to {}: {}", + vid, grpc_addr, e + )) + })?; + + let mut client = + volume_server_pb::volume_server_client::VolumeServerClient::with_interceptor( + channel, + super::request_id::outgoing_request_id_interceptor, + ) + .max_decoding_message_size(GRPC_MAX_MESSAGE_SIZE) + .max_encoding_message_size(GRPC_MAX_MESSAGE_SIZE); + + // Copy each shard + for &shard_id in &req.shard_ids { + let ext = format!(".ec{:02}", shard_id); + let copy_req = volume_server_pb::CopyFileRequest { + volume_id: req.volume_id, + collection: req.collection.clone(), + is_ec_volume: true, + ext: ext.clone(), + compaction_revision: u32::MAX, + stop_offset: i64::MAX as u64, + ..Default::default() + }; + let mut stream = client + .copy_file(copy_req) + .await + .map_err(|e| { + Status::internal(format!( + "VolumeEcShardsCopy volume {} copy {}: {}", + vid, ext, e + )) + })? + .into_inner(); + + let file_path = { + let base = + crate::storage::volume::volume_file_name(&dest_dir, &req.collection, vid); + format!("{}{}", base, ext) + }; + let mut file = std::fs::File::create(&file_path) + .map_err(|e| Status::internal(format!("create {}: {}", file_path, e)))?; + while let Some(chunk) = stream + .message() + .await + .map_err(|e| Status::internal(format!("recv {}: {}", ext, e)))? + { + use std::io::Write; + file.write_all(&chunk.file_content) + .map_err(|e| Status::internal(format!("write {}: {}", file_path, e)))?; + } + } + + // Copy .ecx file if requested + if req.copy_ecx_file { + let copy_req = volume_server_pb::CopyFileRequest { + volume_id: req.volume_id, + collection: req.collection.clone(), + is_ec_volume: true, + ext: ".ecx".to_string(), + compaction_revision: u32::MAX, + stop_offset: i64::MAX as u64, + ..Default::default() + }; + let mut stream = client + .copy_file(copy_req) + .await + .map_err(|e| { + Status::internal(format!( + "VolumeEcShardsCopy volume {} copy .ecx: {}", + vid, e + )) + })? + .into_inner(); + + let file_path = { + let base = + crate::storage::volume::volume_file_name(&dest_idx_dir, &req.collection, vid); + format!("{}.ecx", base) + }; + let mut file = std::fs::File::create(&file_path) + .map_err(|e| Status::internal(format!("create {}: {}", file_path, e)))?; + while let Some(chunk) = stream + .message() + .await + .map_err(|e| Status::internal(format!("recv .ecx: {}", e)))? + { + use std::io::Write; + file.write_all(&chunk.file_content) + .map_err(|e| Status::internal(format!("write {}: {}", file_path, e)))?; + } + } + + // Copy .ecj file if requested + if req.copy_ecj_file { + let copy_req = volume_server_pb::CopyFileRequest { + volume_id: req.volume_id, + collection: req.collection.clone(), + is_ec_volume: true, + ext: ".ecj".to_string(), + compaction_revision: u32::MAX, + stop_offset: i64::MAX as u64, + ignore_source_file_not_found: true, + ..Default::default() + }; + let mut stream = client + .copy_file(copy_req) + .await + .map_err(|e| { + Status::internal(format!( + "VolumeEcShardsCopy volume {} copy .ecj: {}", + vid, e + )) + })? + .into_inner(); + + let file_path = { + let base = + crate::storage::volume::volume_file_name(&dest_idx_dir, &req.collection, vid); + format!("{}.ecj", base) + }; + let mut file = std::fs::OpenOptions::new() + .create(true) + .append(true) + .open(&file_path) + .map_err(|e| Status::internal(format!("create {}: {}", file_path, e)))?; + while let Some(chunk) = stream + .message() + .await + .map_err(|e| Status::internal(format!("recv .ecj: {}", e)))? + { + use std::io::Write; + file.write_all(&chunk.file_content) + .map_err(|e| Status::internal(format!("write {}: {}", file_path, e)))?; + } + } + + // Copy .vif file if requested + if req.copy_vif_file { + let copy_req = volume_server_pb::CopyFileRequest { + volume_id: req.volume_id, + collection: req.collection.clone(), + is_ec_volume: true, + ext: ".vif".to_string(), + compaction_revision: u32::MAX, + stop_offset: i64::MAX as u64, + ignore_source_file_not_found: true, + ..Default::default() + }; + let mut stream = client + .copy_file(copy_req) + .await + .map_err(|e| { + Status::internal(format!( + "VolumeEcShardsCopy volume {} copy .vif: {}", + vid, e + )) + })? + .into_inner(); + + let file_path = { + let base = + crate::storage::volume::volume_file_name(&dest_dir, &req.collection, vid); + format!("{}.vif", base) + }; + let mut file = std::fs::File::create(&file_path) + .map_err(|e| Status::internal(format!("create {}: {}", file_path, e)))?; + while let Some(chunk) = stream + .message() + .await + .map_err(|e| Status::internal(format!("recv .vif: {}", e)))? + { + use std::io::Write; + file.write_all(&chunk.file_content) + .map_err(|e| Status::internal(format!("write {}: {}", file_path, e)))?; + } + } + + Ok(Response::new( + volume_server_pb::VolumeEcShardsCopyResponse {}, + )) + } + + async fn volume_ec_shards_delete( + &self, + request: Request, + ) -> Result, Status> { + self.state.check_maintenance()?; + let req = request.into_inner(); + let vid = VolumeId(req.volume_id); + let mut store = self.state.store.write().unwrap(); + store.delete_ec_shards(vid, &req.collection, &req.shard_ids); + drop(store); + self.state.volume_state_notify.notify_one(); + Ok(Response::new( + volume_server_pb::VolumeEcShardsDeleteResponse {}, + )) + } + + async fn volume_ec_shards_mount( + &self, + request: Request, + ) -> Result, Status> { + let req = request.into_inner(); + let vid = VolumeId(req.volume_id); + + // Mount one shard at a time, returning error on first failure. + // Matches Go: for _, shardId := range req.ShardIds { err = vs.store.MountEcShards(...) } + let mut store = self.state.store.write().unwrap(); + for &shard_id in &req.shard_ids { + store + .mount_ec_shard(vid, &req.collection, shard_id) + .map_err(|e| { + Status::internal(format!("mount {}.{}: {}", req.volume_id, shard_id, e)) + })?; + } + drop(store); + self.state.volume_state_notify.notify_one(); + + Ok(Response::new( + volume_server_pb::VolumeEcShardsMountResponse {}, + )) + } + + async fn volume_ec_shards_unmount( + &self, + request: Request, + ) -> Result, Status> { + let req = request.into_inner(); + let vid = VolumeId(req.volume_id); + + // Unmount one shard at a time, returning error on first failure. + // Matches Go: for _, shardId := range req.ShardIds { err = vs.store.UnmountEcShards(...) } + let mut store = self.state.store.write().unwrap(); + for &shard_id in &req.shard_ids { + store.unmount_ec_shard(vid, shard_id).map_err(|e| { + Status::internal(format!("unmount {}.{}: {}", req.volume_id, shard_id, e)) + })?; + } + drop(store); + self.state.volume_state_notify.notify_one(); + Ok(Response::new( + volume_server_pb::VolumeEcShardsUnmountResponse {}, + )) + } + + type VolumeEcShardReadStream = BoxStream; + async fn volume_ec_shard_read( + &self, + request: Request, + ) -> Result, Status> { + let req = request.into_inner(); + let vid = VolumeId(req.volume_id); + + let store = self.state.store.read().unwrap(); + let ec_vol = store.find_ec_volume(vid).ok_or_else(|| { + Status::not_found(format!( + "ec volume {} shard {} not found", + req.volume_id, req.shard_id + )) + })?; + + // Check if the requested needle is deleted (via .ecx index, matching Go) + if req.file_key > 0 { + let needle_id = NeedleId(req.file_key); + if let Some((_offset, size)) = ec_vol + .find_needle_from_ecx(needle_id) + .map_err(|e| Status::internal(e.to_string()))? + { + if size.is_deleted() { + let results = vec![Ok(volume_server_pb::VolumeEcShardReadResponse { + is_deleted: true, + ..Default::default() + })]; + return Ok(Response::new(Box::pin(tokio_stream::iter(results)))); + } + } + } + + // Read from the shard + let shard = ec_vol + .shards + .get(req.shard_id as usize) + .and_then(|s| s.as_ref()) + .ok_or_else(|| { + Status::not_found(format!( + "ec volume {} shard {} not mounted", + req.volume_id, req.shard_id + )) + })?; + + let total_size = if req.size > 0 { + req.size as usize + } else { + 1024 * 1024 + }; + + // Stream in 2MB chunks (matching Go's BufferSizeLimit) + const BUFFER_SIZE_LIMIT: usize = 2 * 1024 * 1024; + let mut results: Vec> = + Vec::new(); + let mut bytes_read: usize = 0; + let mut current_offset = req.offset as u64; + + while bytes_read < total_size { + let chunk_size = std::cmp::min(BUFFER_SIZE_LIMIT, total_size - bytes_read); + let mut buf = vec![0u8; chunk_size]; + let n = shard + .read_at(&mut buf, current_offset) + .map_err(|e| Status::internal(e.to_string()))?; + if n == 0 { + break; + } + buf.truncate(n); + bytes_read += n; + current_offset += n as u64; + results.push(Ok(volume_server_pb::VolumeEcShardReadResponse { + data: buf, + is_deleted: false, + })); + if n < chunk_size { + break; // short read means EOF + } + } + + Ok(Response::new(Box::pin(tokio_stream::iter(results)))) + } + + async fn volume_ec_blob_delete( + &self, + request: Request, + ) -> Result, Status> { + self.state.check_maintenance()?; + let req = request.into_inner(); + let vid = VolumeId(req.volume_id); + let needle_id = NeedleId(req.file_key); + + // Go checks if needle is already deleted (via ecx) before journaling. + // Search all locations for the EC volume. + let mut store = self.state.store.write().unwrap(); + if let Some(ec_vol) = store.find_ec_volume_mut(vid) { + // Check if already deleted via ecx index + if let Ok(Some((_offset, size))) = ec_vol.find_needle_from_ecx(needle_id) { + if size.is_deleted() { + // Already deleted, no-op + return Ok(Response::new( + volume_server_pb::VolumeEcBlobDeleteResponse {}, + )); + } + } + ec_vol + .journal_delete(needle_id) + .map_err(|e| Status::internal(e.to_string()))?; + } + // If EC volume not mounted, it's a no-op (matching Go behavior) + Ok(Response::new( + volume_server_pb::VolumeEcBlobDeleteResponse {}, + )) + } + + async fn volume_ec_shards_to_volume( + &self, + request: Request, + ) -> Result, Status> { + self.state.check_maintenance()?; + let req = request.into_inner(); + let vid = VolumeId(req.volume_id); + + let store = self.state.store.read().unwrap(); + let ec_vol = store + .find_ec_volume(vid) + .ok_or_else(|| Status::not_found(format!("ec volume {} not found", req.volume_id)))?; + + if ec_vol.collection != req.collection { + return Err(Status::internal(format!( + "existing collection:{} unexpected input: {}", + ec_vol.collection, req.collection + ))); + } + + // Use EC context data shard count from the volume + let data_shards = ec_vol.data_shards as usize; + + // Validate data shard count range (matches Go's VolumeEcShardsToVolume) + let max_shard_count = crate::storage::erasure_coding::ec_shard::MAX_SHARD_COUNT; + if data_shards == 0 || data_shards > max_shard_count { + return Err(Status::invalid_argument(format!( + "invalid data shard count {} for volume {} (must be 1..{})", + data_shards, req.volume_id, max_shard_count + ))); + } + + // Check that all data shards are present + for shard_id in 0..data_shards { + if ec_vol + .shards + .get(shard_id) + .map(|s| s.is_none()) + .unwrap_or(true) + { + return Err(Status::internal(format!( + "ec volume {} missing shard {}", + req.volume_id, shard_id + ))); + } + } + + // Read the .ecx index to check for live entries + let ecx_path = ec_vol.ecx_file_name(); + let ecx_data = + std::fs::read(&ecx_path).map_err(|e| Status::internal(format!("read ecx: {}", e)))?; + let entry_count = ecx_data.len() / NEEDLE_MAP_ENTRY_SIZE; + + let mut has_live = false; + for i in 0..entry_count { + let start = i * NEEDLE_MAP_ENTRY_SIZE; + let (_, _, size) = + idx_entry_from_bytes(&ecx_data[start..start + NEEDLE_MAP_ENTRY_SIZE]); + if !size.is_deleted() { + has_live = true; + break; + } + } + + if !has_live { + return Err(Status::failed_precondition(format!( + "ec volume {} has no live entries", + req.volume_id + ))); + } + + // Reconstruct the volume from EC shards + let dir = ec_vol.dir.clone(); + let collection = ec_vol.collection.clone(); + drop(store); + + // Calculate .dat file size from .ecx entries + let dat_file_size = + crate::storage::erasure_coding::ec_decoder::find_dat_file_size(&dir, &collection, vid) + .map_err(|e| Status::internal(format!("FindDatFileSize: {}", e)))?; + + // Write .dat file using block-interleaved reading from shards + crate::storage::erasure_coding::ec_decoder::write_dat_file_from_shards( + &dir, + &collection, + vid, + dat_file_size, + data_shards, + ) + .map_err(|e| Status::internal(format!("WriteDatFile: {}", e)))?; + + // Write .idx file from .ecx and .ecj files + crate::storage::erasure_coding::ec_decoder::write_idx_file_from_ec_index( + &dir, + &collection, + vid, + ) + .map_err(|e| Status::internal(format!("WriteIdxFileFromEcIndex: {}", e)))?; + + // Go does NOT unmount EC shards or mount the volume here. + // The caller (ec.balance / ec.decode) handles mount/unmount separately. + + Ok(Response::new( + volume_server_pb::VolumeEcShardsToVolumeResponse {}, + )) + } + + async fn volume_ec_shards_info( + &self, + request: Request, + ) -> Result, Status> { + let req = request.into_inner(); + let vid = VolumeId(req.volume_id); + + let store = self.state.store.read().unwrap(); + let ec_vol = store + .find_ec_volume(vid) + .ok_or_else(|| Status::not_found(format!("ec volume {} not found", req.volume_id)))?; + + let mut shard_infos = Vec::new(); + for (i, shard) in ec_vol.shards.iter().enumerate() { + match shard { + Some(s) => { + shard_infos.push(volume_server_pb::EcShardInfo { + shard_id: i as u32, + size: s.file_size(), + collection: ec_vol.collection.clone(), + volume_id: req.volume_id, + }); + } + None => { + shard_infos.push(volume_server_pb::EcShardInfo { + shard_id: i as u32, + collection: ec_vol.collection.clone(), + volume_id: req.volume_id, + ..Default::default() + }); + } + } + } + + // Walk .ecx index to compute file counts and total size (matching Go's WalkIndex) + let (file_count, file_deleted_count, volume_size) = ec_vol + .walk_ecx_stats() + .map_err(|e| Status::internal(e.to_string()))?; + + Ok(Response::new( + volume_server_pb::VolumeEcShardsInfoResponse { + ec_shard_infos: shard_infos, + volume_size, + file_count, + file_deleted_count, + }, + )) + } + + // ---- Tiered storage ---- + + type VolumeTierMoveDatToRemoteStream = + BoxStream; + async fn volume_tier_move_dat_to_remote( + &self, + request: Request, + ) -> Result, Status> { + self.state.check_maintenance()?; + let req = request.into_inner(); + let vid = VolumeId(req.volume_id); + + // Validate volume exists and collection matches + let dat_path = { + let store = self.state.store.read().unwrap(); + let (_, vol) = store + .find_volume(vid) + .ok_or_else(|| Status::not_found(format!("volume {} not found", req.volume_id)))?; + + if vol.collection != req.collection { + return Err(Status::invalid_argument(format!( + "existing collection:{} unexpected input: {}", + vol.collection, req.collection + ))); + } + + let dat_path = vol.dat_path(); + + // Match Go's DiskFile check: if the .dat file is still local, we can + // keep tiering it even when remote file entries already exist. + if volume_is_remote_only(&dat_path, vol.has_remote_file) { + // Already on remote -- return empty stream (matches Go: returns nil) + let stream = tokio_stream::empty(); + return Ok(Response::new( + Box::pin(stream) as Self::VolumeTierMoveDatToRemoteStream + )); + } + + // Check if the destination backend already exists in volume info + let (backend_type, backend_id) = + crate::remote_storage::s3_tier::backend_name_to_type_id( + &req.destination_backend_name, + ); + for rf in &vol.volume_info.files { + if rf.backend_type == backend_type && rf.backend_id == backend_id { + return Err(Status::already_exists(format!( + "destination {} already exists", + req.destination_backend_name + ))); + } + } + + dat_path + }; + + // Look up the S3 tier backend + let backend = { + let registry = self.state.s3_tier_registry.read().unwrap(); + registry.get(&req.destination_backend_name).ok_or_else(|| { + let keys = registry.names(); + Status::not_found(format!( + "destination {} not found, supported: {:?}", + req.destination_backend_name, keys + )) + })? + }; + + let (backend_type, backend_id) = + crate::remote_storage::s3_tier::backend_name_to_type_id(&req.destination_backend_name); + + let (tx, rx) = tokio::sync::mpsc::channel::< + Result, + >(16); + let state = self.state.clone(); + let keep_local = req.keep_local_dat_file; + let dest_backend_name = req.destination_backend_name.clone(); + + tokio::spawn(async move { + let result: Result<(), Status> = async { + // Upload the .dat file to S3 with progress + let tx_progress = tx.clone(); + let mut last_report = std::time::Instant::now(); + let (key, size) = backend + .upload_file(&dat_path, move |processed, percentage| { + let now = std::time::Instant::now(); + if now.duration_since(last_report) >= std::time::Duration::from_secs(1) { + last_report = now; + let _ = tx_progress.try_send(Ok( + volume_server_pb::VolumeTierMoveDatToRemoteResponse { + processed, + processed_percentage: percentage, + }, + )); + } + }) + .await + .map_err(|e| { + Status::internal(format!( + "backend {} copy file {}: {}", + dest_backend_name, dat_path, e + )) + })?; + + // Update volume info with remote file reference + { + let mut store = state.store.write().unwrap(); + if let Some((_, vol)) = store.find_volume_mut(vid) { + let now_unix = std::time::SystemTime::now() + .duration_since(std::time::UNIX_EPOCH) + .unwrap_or_default() + .as_secs(); + + vol.volume_info.files.push(volume_server_pb::RemoteFile { + backend_type: backend_type.clone(), + backend_id: backend_id.clone(), + key, + offset: 0, + file_size: size, + modified_time: now_unix, + extension: ".dat".to_string(), + }); + vol.refresh_remote_write_mode(); + + if let Err(e) = vol.save_volume_info() { + return Err(Status::internal(format!( + "volume {} failed to save remote file info: {}", + vid, e + ))); + } + + // Close local dat file handle (matches Go's v.LoadRemoteFile + // which closes DataBackend before switching to remote) + vol.close_local_dat_backend(); + + // Optionally remove local .dat file from disk + if !keep_local { + let dat = vol.dat_path(); + let _ = std::fs::remove_file(&dat); + } + } + } + + // Go does NOT send a final 100% progress message after upload completion + Ok(()) + } + .await; + + if let Err(e) = result { + let _ = tx.send(Err(e)).await; + } + }); + + let stream = tokio_stream::wrappers::ReceiverStream::new(rx); + Ok(Response::new( + Box::pin(stream) as Self::VolumeTierMoveDatToRemoteStream + )) + } + + type VolumeTierMoveDatFromRemoteStream = + BoxStream; + async fn volume_tier_move_dat_from_remote( + &self, + request: Request, + ) -> Result, Status> { + // Note: Go does NOT check maintenance mode for TierMoveDatFromRemote + let req = request.into_inner(); + let vid = VolumeId(req.volume_id); + + // Validate volume and get remote storage info + let (dat_path, storage_name, storage_key) = { + let store = self.state.store.read().unwrap(); + let (_, vol) = store + .find_volume(vid) + .ok_or_else(|| Status::not_found(format!("volume {} not found", req.volume_id)))?; + + if vol.collection != req.collection { + return Err(Status::invalid_argument(format!( + "existing collection:{} unexpected input: {}", + vol.collection, req.collection + ))); + } + + let (storage_name, storage_key) = vol.remote_storage_name_key(); + if storage_name.is_empty() || storage_key.is_empty() { + return Err(Status::failed_precondition(format!( + "volume {} is already on local disk", + vid + ))); + } + + // Check if the dat file already exists locally (matches Go's DataBackend DiskFile check) + let dat_path = vol.dat_path(); + if std::path::Path::new(&dat_path).exists() { + return Err(Status::failed_precondition(format!( + "volume {} is already on local disk", + vid + ))); + } + + (dat_path, storage_name, storage_key) + }; + + // Look up the S3 tier backend + let backend = { + let registry = self.state.s3_tier_registry.read().unwrap(); + registry.get(&storage_name).ok_or_else(|| { + let keys = registry.names(); + Status::not_found(format!( + "remote storage {} not found from supported: {:?}", + storage_name, keys + )) + })? + }; + + let (tx, rx) = tokio::sync::mpsc::channel::< + Result, + >(16); + let state = self.state.clone(); + let keep_remote = req.keep_remote_dat_file; + + tokio::spawn(async move { + let result: Result<(), Status> = async { + // Download the .dat file from S3 with progress + let tx_progress = tx.clone(); + let mut last_report = std::time::Instant::now(); + let storage_name_clone = storage_name.clone(); + let _size = backend + .download_file(&dat_path, &storage_key, move |processed, percentage| { + let now = std::time::Instant::now(); + if now.duration_since(last_report) >= std::time::Duration::from_secs(1) { + last_report = now; + let _ = tx_progress.try_send(Ok( + volume_server_pb::VolumeTierMoveDatFromRemoteResponse { + processed, + processed_percentage: percentage, + }, + )); + } + }) + .await + .map_err(|e| { + Status::internal(format!( + "backend {} copy file {}: {}", + storage_name_clone, dat_path, e + )) + })?; + + if !keep_remote { + // Delete remote file + backend.delete_file(&storage_key).await.map_err(|e| { + Status::internal(format!( + "volume {} failed to delete remote file {}: {}", + vid, storage_key, e + )) + })?; + + // Update volume info: remove remote file reference + { + let mut store = state.store.write().unwrap(); + if let Some((_, vol)) = store.find_volume_mut(vid) { + if !vol.volume_info.files.is_empty() { + vol.volume_info.files.remove(0); + } + vol.refresh_remote_write_mode(); + + if let Err(e) = vol.save_volume_info() { + return Err(Status::internal(format!( + "volume {} failed to save remote file info: {}", + vid, e + ))); + } + + // Close old remote backend (matches Go: v.DataBackend.Close(); v.DataBackend = nil) + // This forces the next read to discover and open the local .dat file. + vol.close_remote_dat_backend(); + } + } + } + + // Go does NOT send a final 100% progress message after download completion + Ok(()) + } + .await; + + if let Err(e) = result { + let _ = tx.send(Err(e)).await; + } + }); + + let stream = tokio_stream::wrappers::ReceiverStream::new(rx); + Ok(Response::new( + Box::pin(stream) as Self::VolumeTierMoveDatFromRemoteStream + )) + } + + // ---- Server management ---- + + async fn volume_server_status( + &self, + _request: Request, + ) -> Result, Status> { + let store = self.state.store.read().unwrap(); + + let mut disk_statuses = Vec::new(); + for loc in &store.locations { + let (all, free) = get_disk_usage(&loc.directory); + let used = all.saturating_sub(free); + let percent_free = if all > 0 { + ((free as f64 / all as f64) * 100.0) as f32 + } else { + 0.0 + }; + let percent_used = if all > 0 { + ((used as f64 / all as f64) * 100.0) as f32 + } else { + 0.0 + }; + disk_statuses.push(volume_server_pb::DiskStatus { + dir: loc.directory.clone(), + all, + used, + free, + percent_free, + percent_used, + disk_type: loc.disk_type.to_string(), + }); + } + + Ok(Response::new( + volume_server_pb::VolumeServerStatusResponse { + disk_statuses, + memory_status: Some(super::memory_status::collect_mem_status()), + version: crate::version::full_version().to_string(), + data_center: self.state.data_center.clone(), + rack: self.state.rack.clone(), + state: Some(volume_server_pb::VolumeServerState { + maintenance: self.state.maintenance.load(Ordering::Relaxed), + version: self.state.state_version.load(Ordering::Relaxed), + }), + }, + )) + } + + async fn volume_server_leave( + &self, + _request: Request, + ) -> Result, Status> { + *self.state.is_stopping.write().unwrap() = true; + self.state.is_heartbeating.store(false, Ordering::Relaxed); + // Wake heartbeat loop to send deregistration. + self.state.volume_state_notify.notify_one(); + Ok(Response::new( + volume_server_pb::VolumeServerLeaveResponse {}, + )) + } + + async fn fetch_and_write_needle( + &self, + request: Request, + ) -> Result, Status> { + self.state.check_maintenance()?; + let req = request.into_inner(); + let vid = VolumeId(req.volume_id); + + // Check volume exists + { + let store = self.state.store.read().unwrap(); + store + .find_volume(vid) + .ok_or_else(|| Status::not_found(format!("not found volume id {}", vid)))?; + } + + // Get remote storage configuration + let remote_conf = req + .remote_conf + .as_ref() + .ok_or_else(|| Status::invalid_argument("remote storage configuration is required"))?; + + // Create remote storage client + let client = + crate::remote_storage::make_remote_storage_client(remote_conf).map_err(|e| { + Status::internal(format!( + "get remote client: make remote storage client {}: {}", + remote_conf.name, e, + )) + })?; + + let remote_location = req + .remote_location + .as_ref() + .ok_or_else(|| Status::invalid_argument("remote storage location is required"))?; + + // Read data from remote storage + let data = client + .read_file(remote_location, req.offset, req.size) + .await + .map_err(|e| { + Status::internal(format!("read from remote {:?}: {}", remote_location, e)) + })?; + + // Build needle and write locally + let mut n = Needle { + id: NeedleId(req.needle_id), + cookie: Cookie(req.cookie), + data_size: data.len() as u32, + data: data.clone(), + ..Needle::default() + }; + n.checksum = crate::storage::needle::crc::CRC::new(&n.data); + n.size = crate::storage::types::Size(4 + n.data_size as i32 + 1); + n.last_modified = std::time::SystemTime::now() + .duration_since(std::time::UNIX_EPOCH) + .unwrap_or_default() + .as_secs(); + n.set_has_last_modified_date(); + + // Run local write and replica writes concurrently (matches Go's WaitGroup) + let mut handles: Vec>> = Vec::new(); + + // Spawn local write as a concurrent task + let state_clone = self.state.clone(); + let mut n_clone = n.clone(); + let needle_id = req.needle_id; + let size = req.size; + let local_handle = tokio::task::spawn_blocking(move || { + let mut store = state_clone.store.write().unwrap(); + store + .write_volume_needle(vid, &mut n_clone) + .map(|_| ()) + .map_err(|e| format!("local write needle {} size {}: {}", needle_id, size, e)) + }); + + // Spawn replica writes concurrently + if !req.replicas.is_empty() { + let file_id = format!("{},{:x}{:08x}", vid, req.needle_id, req.cookie); + let http_client = self.state.http_client.clone(); + let scheme = self.state.outgoing_http_scheme.clone(); + for replica in &req.replicas { + let raw_target = format!("{}/{}?type=replicate", replica.url, file_id); + let url = + crate::server::volume_server::normalize_outgoing_http_url(&scheme, &raw_target) + .map_err(Status::internal)?; + let data_clone = data.clone(); + let client_clone = http_client.clone(); + let needle_id = req.needle_id; + let size = req.size; + handles.push(tokio::spawn(async move { + let form = reqwest::multipart::Form::new() + .part("file", reqwest::multipart::Part::bytes(data_clone)); + client_clone + .post(&url) + .multipart(form) + .send() + .await + .map(|_| ()) + .map_err(|e| { + format!("remote write needle {} size {}: {}", needle_id, size, e) + }) + })); + } + } + + // Await ALL writes before checking errors (matches Go's wg.Wait()) + let local_result = local_handle.await; + let mut replica_results = Vec::new(); + for handle in handles { + replica_results.push(handle.await); + } + + // Check local write result + match local_result { + Ok(Ok(())) => {} + Ok(Err(e)) => return Err(Status::internal(e)), + Err(e) => return Err(Status::internal(format!("local write task failed: {}", e))), + } + + let e_tag = n.etag(); + + // Check replica write results + for result in replica_results { + match result { + Ok(Ok(())) => {} + Ok(Err(e)) => return Err(Status::internal(e)), + Err(e) => return Err(Status::internal(format!("replication task failed: {}", e))), + } + } + + Ok(Response::new( + volume_server_pb::FetchAndWriteNeedleResponse { e_tag }, + )) + } + + async fn scrub_volume( + &self, + request: Request, + ) -> Result, Status> { + let req = request.into_inner(); + + // Validate mode + let mode = req.mode; + match mode { + 1 | 2 | 3 => {} // INDEX=1, FULL=2, LOCAL=3 + _ => { + return Err(Status::invalid_argument(format!( + "unsupported volume scrub mode {}", + mode + ))) + } + } + + let mut total_volumes: u64 = 0; + let mut total_files: u64 = 0; + let mut broken_volume_ids: Vec = Vec::new(); + let mut details: Vec = Vec::new(); + let mut broken_vids: Vec = Vec::new(); + + // Scrub phase: hold store read lock, then drop before async readonly calls. + { + let store = self.state.store.read().unwrap(); + let vids: Vec = if req.volume_ids.is_empty() { + store.all_volume_ids() + } else { + req.volume_ids.iter().map(|&id| VolumeId(id)).collect() + }; + + for vid in &vids { + let (_, v) = store + .find_volume(*vid) + .ok_or_else(|| Status::not_found(format!("volume id {} not found", vid.0)))?; + total_volumes += 1; + + // INDEX mode (1) calls scrub_index; LOCAL (2) and FULL (3) call scrub + let scrub_result = if mode == 1 { + v.scrub_index() + } else { + v.scrub() + }; + match scrub_result { + Ok((files, broken)) => { + total_files += files; + if !broken.is_empty() { + broken_vids.push(*vid); + broken_volume_ids.push(vid.0); + for msg in broken { + details.push(format!("vol {}: {}", vid.0, msg)); + } + } + } + Err(e) => { + total_files += v.file_count().max(0) as u64; + broken_vids.push(*vid); + broken_volume_ids.push(vid.0); + details.push(format!("vol {}: scrub error: {}", vid.0, e)); + } + } + } + } // store lock dropped here + + // Match Go: if mark_broken_volumes_readonly, call makeVolumeReadonly on each broken volume. + // Collect errors via errors.Join semantics (return joined error if any fail). + if req.mark_broken_volumes_readonly { + let mut errs: Vec = Vec::new(); + for vid in &broken_vids { + match self.make_volume_readonly(*vid, true).await { + Ok(()) => { + details.push(format!("volume {} is now read-only", vid.0)); + } + Err(e) => { + errs.push(e.message().to_string()); + details.push(e.message().to_string()); + } + } + } + if !errs.is_empty() { + return Err(Status::internal(errs.join("\n"))); + } + } + + Ok(Response::new(volume_server_pb::ScrubVolumeResponse { + total_volumes, + total_files, + broken_volume_ids, + details, + })) + } + + async fn scrub_ec_volume( + &self, + request: Request, + ) -> Result, Status> { + let req = request.into_inner(); + + // Validate mode + let mode = req.mode; + match mode { + 1 | 2 | 3 => {} // INDEX=1, FULL=2, LOCAL=3 + _ => { + return Err(Status::invalid_argument(format!( + "unsupported EC volume scrub mode {}", + mode + ))) + } + } + + let store = self.state.store.read().unwrap(); + let vids: Vec = if req.volume_ids.is_empty() { + store + .locations + .iter() + .flat_map(|loc| loc.ec_volumes().map(|(vid, _)| *vid)) + .collect() + } else { + req.volume_ids.iter().map(|&id| VolumeId(id)).collect() + }; + + let mut total_volumes: u64 = 0; + let mut total_files: u64 = 0; + let mut broken_volume_ids: Vec = Vec::new(); + let mut broken_shard_infos: Vec = Vec::new(); + let mut details: Vec = Vec::new(); + + for vid in &vids { + let ecv = store + .find_ec_volume(*vid) + .ok_or_else(|| Status::not_found(format!("EC volume id {} not found", vid.0)))?; + let collection = ecv.collection.clone(); + + match mode { + 1 => { + // INDEX mode: check ecx index integrity only, no shard verification + // Matches Go's v.ScrubIndex() → idx.CheckIndexFile() + let (count, errs) = ecv.scrub_index(); + total_volumes += 1; + total_files += count; + if !errs.is_empty() { + broken_volume_ids.push(vid.0); + for msg in errs { + details.push(format!("ecvol {}: {}", vid.0, msg)); + } + } + } + 2 | 3 => { + // LOCAL (2) / FULL (3): verify EC shard data + let files = ecv.walk_ecx_stats().map(|(f, _, _)| f).unwrap_or(0); + + let dir = store + .find_ec_dir(*vid, &collection) + .unwrap_or_else(|| String::from("")); + if dir.is_empty() { + continue; + } + + total_volumes += 1; + total_files += files; + let (data_shards, parity_shards) = + crate::storage::erasure_coding::ec_volume::read_ec_shard_config( + &dir, + &collection, + *vid, + ); + + match crate::storage::erasure_coding::ec_encoder::verify_ec_shards( + &dir, + &collection, + *vid, + data_shards as usize, + parity_shards as usize, + ) { + Ok((broken, msgs)) => { + if !broken.is_empty() { + broken_volume_ids.push(vid.0); + for b in broken { + broken_shard_infos.push(volume_server_pb::EcShardInfo { + volume_id: vid.0, + collection: collection.clone(), + shard_id: b, + ..Default::default() + }); + } + } + for msg in msgs { + details.push(format!("ecvol {}: {}", vid.0, msg)); + } + } + Err(e) => { + broken_volume_ids.push(vid.0); + details.push(format!("ecvol {}: scrub error: {}", vid.0, e)); + } + } + } + _ => unreachable!(), // validated above + } + } + + Ok(Response::new(volume_server_pb::ScrubEcVolumeResponse { + total_volumes, + total_files, + broken_volume_ids, + broken_shard_infos, + details, + })) + } + + type QueryStream = BoxStream; + async fn query( + &self, + request: Request, + ) -> Result, Status> { + let req = request.into_inner(); + let mut stripes: Vec> = Vec::new(); + + for fid_str in &req.from_file_ids { + let file_id = needle::FileId::parse(fid_str).map_err(|e| Status::internal(e))?; + + let mut n = Needle { + id: file_id.key, + cookie: file_id.cookie, + ..Needle::default() + }; + let original_cookie = n.cookie; + + let store = self.state.store.read().unwrap(); + store + .read_volume_needle(file_id.volume_id, &mut n) + .map_err(|e| Status::internal(e.to_string()))?; + drop(store); + + // Cookie mismatch: log and return empty stream (matching Go behavior where err is nil) + if n.cookie != original_cookie { + tracing::info!( + "volume query failed to read fid cookie {}: cookie mismatch", + fid_str + ); + let stream = tokio_stream::iter(stripes); + return Ok(Response::new(Box::pin(stream))); + } + + let input = req.input_serialization.as_ref(); + + // CSV input: no output (Go does nothing for CSV) + if input.map_or(false, |i| i.csv_input.is_some()) { + // No stripes emitted for CSV + continue; + } + + // JSON input: process lines + if input.map_or(false, |i| i.json_input.is_some()) { + let filter = req.filter.as_ref(); + let data_str = String::from_utf8_lossy(&n.data); + let mut records: Vec = Vec::new(); + + for line in data_str.lines() { + if line.trim().is_empty() { + continue; + } + let parsed: serde_json::Value = match serde_json::from_str(line) { + Ok(v) => v, + Err(_) => continue, + }; + + // Apply filter + if let Some(f) = filter { + if !f.field.is_empty() && !f.operand.is_empty() { + let field_val = &parsed[&f.field]; + let pass = match f.operand.as_str() { + ">" => { + if let (Some(fv), Ok(tv)) = + (field_val.as_f64(), f.value.parse::()) + { + fv > tv + } else { + false + } + } + ">=" => { + if let (Some(fv), Ok(tv)) = + (field_val.as_f64(), f.value.parse::()) + { + fv >= tv + } else { + false + } + } + "<" => { + if let (Some(fv), Ok(tv)) = + (field_val.as_f64(), f.value.parse::()) + { + fv < tv + } else { + false + } + } + "<=" => { + if let (Some(fv), Ok(tv)) = + (field_val.as_f64(), f.value.parse::()) + { + fv <= tv + } else { + false + } + } + "=" => { + if let (Some(fv), Ok(tv)) = + (field_val.as_f64(), f.value.parse::()) + { + fv == tv + } else { + field_val.as_str().map_or(false, |s| s == f.value) + } + } + "!=" => { + if let (Some(fv), Ok(tv)) = + (field_val.as_f64(), f.value.parse::()) + { + fv != tv + } else { + field_val.as_str().map_or(true, |s| s != f.value) + } + } + _ => true, + }; + if !pass { + continue; + } + } + } + + // Build output record: {selection:value,...} (Go's ToJson format — unquoted keys) + records.push(b'{'); + for (i, sel) in req.selections.iter().enumerate() { + if i > 0 { + records.push(b','); + } + records.extend_from_slice(sel.as_bytes()); + records.push(b':'); + let val = &parsed[sel]; + let raw = if val.is_null() { + "null".to_string() + } else { + // Use the raw JSON representation + val.to_string() + }; + records.extend_from_slice(raw.as_bytes()); + } + records.push(b'}'); + } + + stripes.push(Ok(volume_server_pb::QueriedStripe { records })); + } + } + + let stream = tokio_stream::iter(stripes); + Ok(Response::new(Box::pin(stream))) + } + + async fn volume_needle_status( + &self, + request: Request, + ) -> Result, Status> { + let req = request.into_inner(); + let vid = VolumeId(req.volume_id); + let needle_id = NeedleId(req.needle_id); + + let store = self.state.store.read().unwrap(); + + // Try normal volume first + if let Some(_) = store.find_volume(vid) { + let mut n = Needle { + id: needle_id, + ..Needle::default() + }; + match store.read_volume_needle(vid, &mut n) { + Ok(_) => { + let ttl_str = n.ttl.as_ref().map_or(String::new(), |t| t.to_string()); + return Ok(Response::new( + volume_server_pb::VolumeNeedleStatusResponse { + needle_id: n.id.0, + cookie: n.cookie.0, + size: n.size.0 as u32, + last_modified: n.last_modified, + crc: n.checksum.0, + ttl: ttl_str, + }, + )); + } + Err(_) => return Err(Status::not_found(format!("needle not found {}", needle_id))), + } + } + + // Fall back to EC shards — read full needle from local shards + if let Some(ec_vol) = store.find_ec_volume(vid) { + match ec_vol.read_ec_shard_needle(needle_id) { + Ok(Some(n)) => { + let ttl_str = match &n.ttl { + Some(t) if n.has_ttl() => t.to_string(), + _ => String::new(), + }; + return Ok(Response::new( + volume_server_pb::VolumeNeedleStatusResponse { + needle_id: n.id.0, + cookie: n.cookie.0, + size: n.size.0 as u32, + last_modified: n.last_modified, + crc: n.checksum.0, + ttl: ttl_str, + }, + )); + } + Ok(None) => { + return Err(Status::not_found(format!("needle not found {}", needle_id))); + } + Err(e) => { + return Err(Status::internal(format!( + "read ec shard needle {} from volume {}: {}", + needle_id, vid, e + ))); + } + } + } + + Err(Status::not_found(format!("volume not found {}", vid))) + } + + async fn ping( + &self, + request: Request, + ) -> Result, Status> { + let req = request.into_inner(); + let now_ns = || { + std::time::SystemTime::now() + .duration_since(std::time::UNIX_EPOCH) + .unwrap_or_default() + .as_nanos() as i64 + }; + + let start = now_ns(); + + // Route ping based on target type (matches Go's volume_grpc_admin.go Ping) + let remote_time_ns = if req.target_type == "volumeServer" { + match ping_volume_server_target(&req.target, self.state.outgoing_grpc_tls.as_ref()) + .await + { + Ok(t) => t, + Err(e) => { + return Err(Status::internal(format!( + "ping {} {}: {}", + req.target_type, req.target, e + ))) + } + } + } else if req.target_type == "master" { + // Connect to target master and call its Ping RPC + match ping_master_target(&req.target, self.state.outgoing_grpc_tls.as_ref()).await { + Ok(t) => t, + Err(e) => { + return Err(Status::internal(format!( + "ping {} {}: {}", + req.target_type, req.target, e + ))) + } + } + } else if req.target_type == "filer" { + match ping_filer_target(&req.target, self.state.outgoing_grpc_tls.as_ref()).await { + Ok(t) => t, + Err(e) => { + return Err(Status::internal(format!( + "ping {} {}: {}", + req.target_type, req.target, e + ))) + } + } + } else { + // Unknown target type → return 0 + 0 + }; + + let stop = now_ns(); + Ok(Response::new(volume_server_pb::PingResponse { + start_time_ns: start, + remote_time_ns, + stop_time_ns: stop, + })) + } +} + +/// Build a gRPC endpoint from a SeaweedFS server address. +fn to_grpc_endpoint( + target: &str, + tls: Option<&super::grpc_client::OutgoingGrpcTlsConfig>, +) -> Result { + let grpc_host_port = parse_grpc_address(target)?; + build_grpc_endpoint(&grpc_host_port, tls).map_err(|e| e.to_string()) +} + +/// Ping a remote volume server target by actually calling its Ping RPC (matches Go behavior). +async fn ping_volume_server_target( + target: &str, + tls: Option<&super::grpc_client::OutgoingGrpcTlsConfig>, +) -> Result { + let endpoint = to_grpc_endpoint(target, tls)?; + let channel = tokio::time::timeout(std::time::Duration::from_secs(5), endpoint.connect()) + .await + .map_err(|_| "connection timeout".to_string())? + .map_err(|e| e.to_string())?; + + let mut client = volume_server_pb::volume_server_client::VolumeServerClient::with_interceptor( + channel, + super::request_id::outgoing_request_id_interceptor, + ) + .max_decoding_message_size(GRPC_MAX_MESSAGE_SIZE) + .max_encoding_message_size(GRPC_MAX_MESSAGE_SIZE); + let resp = client + .ping(volume_server_pb::PingRequest { + target: String::new(), + target_type: String::new(), + }) + .await + .map_err(|e| e.to_string())?; + Ok(resp.into_inner().start_time_ns) +} + +/// Ping a remote master target by actually calling its Ping RPC (matches Go behavior). +async fn ping_master_target( + target: &str, + tls: Option<&super::grpc_client::OutgoingGrpcTlsConfig>, +) -> Result { + let endpoint = to_grpc_endpoint(target, tls)?; + let channel = tokio::time::timeout(std::time::Duration::from_secs(5), endpoint.connect()) + .await + .map_err(|_| "connection timeout".to_string())? + .map_err(|e| e.to_string())?; + + let mut client = master_pb::seaweed_client::SeaweedClient::with_interceptor( + channel, + super::request_id::outgoing_request_id_interceptor, + ) + .max_decoding_message_size(GRPC_MAX_MESSAGE_SIZE) + .max_encoding_message_size(GRPC_MAX_MESSAGE_SIZE); + let resp = client + .ping(master_pb::PingRequest { + target: String::new(), + target_type: String::new(), + }) + .await + .map_err(|e| e.to_string())?; + Ok(resp.into_inner().start_time_ns) +} + +/// Ping a remote filer target by calling its Ping RPC (matches Go behavior). +async fn ping_filer_target( + target: &str, + tls: Option<&super::grpc_client::OutgoingGrpcTlsConfig>, +) -> Result { + let endpoint = to_grpc_endpoint(target, tls)?; + let channel = tokio::time::timeout(std::time::Duration::from_secs(5), endpoint.connect()) + .await + .map_err(|_| "connection timeout".to_string())? + .map_err(|e| e.to_string())?; + + let mut client = filer_pb::seaweed_filer_client::SeaweedFilerClient::with_interceptor( + channel, + super::request_id::outgoing_request_id_interceptor, + ) + .max_decoding_message_size(GRPC_MAX_MESSAGE_SIZE) + .max_encoding_message_size(GRPC_MAX_MESSAGE_SIZE); + let resp = client + .ping(filer_pb::PingRequest::default()) + .await + .map_err(|e| e.to_string())?; + Ok(resp.into_inner().start_time_ns) +} + +/// Parse a SeaweedFS server address ("ip:port.grpcPort" or "ip:port") into a gRPC address. +fn parse_grpc_address(source: &str) -> Result { + if let Some(colon_idx) = source.rfind(':') { + let port_part = &source[colon_idx + 1..]; + if let Some(dot_idx) = port_part.rfind('.') { + // Format: "ip:port.grpcPort" + let host = &source[..colon_idx]; + let grpc_port = &port_part[dot_idx + 1..]; + grpc_port + .parse::() + .map_err(|e| format!("invalid grpc port: {}", e))?; + return Ok(format!("{}:{}", host, grpc_port)); + } + // Format: "ip:port" → grpc = port + 10000 + let port: u16 = port_part + .parse() + .map_err(|e| format!("invalid port: {}", e))?; + let grpc_port = port as u32 + 10000; + let host = &source[..colon_idx]; + return Ok(format!("{}:{}", host, grpc_port)); + } + Err(format!("cannot parse address: {}", source)) +} + +/// Set the modification time of a file from nanoseconds since Unix epoch. +fn set_file_mtime(path: &str, modified_ts_ns: i64) { + use std::time::{Duration, SystemTime}; + let ts = if modified_ts_ns >= 0 { + SystemTime::UNIX_EPOCH + Duration::from_nanos(modified_ts_ns as u64) + } else { + SystemTime::UNIX_EPOCH + }; + if let Ok(file) = std::fs::File::open(path) { + let ft = std::fs::FileTimes::new().set_accessed(ts).set_modified(ts); + let _ = file.set_times(ft); + } +} + +/// Copy a file from a remote volume server via CopyFile streaming RPC. +/// Returns the modified_ts_ns received from the source. +async fn copy_file_from_source( + client: &mut volume_server_pb::volume_server_client::VolumeServerClient, + is_ec_volume: bool, + collection: &str, + volume_id: u32, + compaction_revision: u32, + stop_offset: u64, + dest_path: &str, + ext: &str, + is_append: bool, + ignore_source_not_found: bool, + progress_tx: Option< + &tokio::sync::mpsc::Sender>, + >, + next_report_target: &mut i64, + report_interval: i64, + throttler: &mut WriteThrottler, +) -> Result +where + T: tonic::client::GrpcService, + T::Error: Into, + T::ResponseBody: http_body::Body + Send + 'static, + ::Error: Into + Send, +{ + let copy_req = volume_server_pb::CopyFileRequest { + volume_id, + ext: ext.to_string(), + compaction_revision, + stop_offset, + collection: collection.to_string(), + is_ec_volume, + ignore_source_file_not_found: ignore_source_not_found, + }; + + let mut stream = client + .copy_file(copy_req) + .await + .map_err(|e| { + format!( + "failed to start copying volume {} {} file: {}", + volume_id, ext, e + ) + })? + .into_inner(); + + let mut file = if is_append { + std::fs::OpenOptions::new() + .create(true) + .append(true) + .open(dest_path) + .map_err(|e| format!("open file {}: {}", dest_path, e))? + } else { + std::fs::OpenOptions::new() + .write(true) + .create(true) + .truncate(true) + .open(dest_path) + .map_err(|e| format!("open file {}: {}", dest_path, e))? + }; + + let mut progressed_bytes: i64 = 0; + let mut modified_ts_ns: i64 = 0; + + while let Some(resp) = stream + .message() + .await + .map_err(|e| format!("receiving {}: {}", dest_path, e))? + { + if resp.modified_ts_ns != 0 { + modified_ts_ns = resp.modified_ts_ns; + } + if !resp.file_content.is_empty() { + use std::io::Write; + file.write_all(&resp.file_content) + .map_err(|e| format!("write file {}: {}", dest_path, e))?; + progressed_bytes += resp.file_content.len() as i64; + throttler + .maybe_slowdown(resp.file_content.len() as i64) + .await; + + if let Some(tx) = progress_tx { + if progressed_bytes > *next_report_target { + let _ = tx + .send(Ok(volume_server_pb::VolumeCopyResponse { + last_append_at_ns: 0, + processed_bytes: progressed_bytes, + })) + .await; + *next_report_target = progressed_bytes + report_interval; + } + } + } + } + + // If source file didn't exist (no modifiedTsNs received), remove empty file + // Go only removes when !isAppend + if modified_ts_ns == 0 && !is_append { + let _ = std::fs::remove_file(dest_path); + } + + Ok(modified_ts_ns) +} + +/// Verify that a copied file has the expected size. +fn check_copy_file_size(path: &str, expected: u64) -> Result<(), Status> { + match std::fs::metadata(path) { + Ok(meta) => { + if meta.len() != expected { + Err(Status::internal(format!( + "file {} size [{}] is not same as origin file size [{}]", + path, + meta.len(), + expected + ))) + } else { + Ok(()) + } + } + Err(e) if e.kind() == std::io::ErrorKind::NotFound && expected == 0 => Ok(()), + Err(e) => Err(Status::internal(format!( + "stat file {} failed: {}", + path, e + ))), + } +} + +/// Find the last append timestamp from copied .idx and .dat files. +/// Go returns (0, nil) for versions < Version3 since timestamps only exist in V3. +fn find_last_append_at_ns(idx_path: &str, dat_path: &str, version: u32) -> Option { + // Only Version3 has the append timestamp in the needle tail + if version < VERSION_3.0 as u32 { + return None; + } + use std::io::{Read, Seek, SeekFrom}; + + let mut idx_file = std::fs::File::open(idx_path).ok()?; + let idx_size = idx_file.metadata().ok()?.len(); + if idx_size == 0 || idx_size % (NEEDLE_MAP_ENTRY_SIZE as u64) != 0 { + return None; + } + + // Read the last index entry + let mut buf = [0u8; NEEDLE_MAP_ENTRY_SIZE]; + idx_file + .seek(SeekFrom::End(-(NEEDLE_MAP_ENTRY_SIZE as i64))) + .ok()?; + idx_file.read_exact(&mut buf).ok()?; + + let (_key, offset, _size) = idx_entry_from_bytes(&buf); + if offset.is_zero() { + return None; + } + + // Read needle header from .dat to get the append timestamp + let mut dat_file = std::fs::File::open(dat_path).ok()?; + let actual_offset = offset.to_actual_offset(); + + // Skip to the needle at the given offset, read header to get size + dat_file.seek(SeekFrom::Start(actual_offset as u64)).ok()?; + + // Read cookie (4) + id (8) + size (4) = 16 bytes header + let mut header = [0u8; 16]; + dat_file.read_exact(&mut header).ok()?; + let needle_size = i32::from_be_bytes([header[12], header[13], header[14], header[15]]); + if needle_size <= 0 { + return None; + } + + // Seek to tail: offset + 16 (header) + size -> checksum (4) + timestamp (8) + let tail_offset = actual_offset as u64 + 16 + needle_size as u64; + dat_file.seek(SeekFrom::Start(tail_offset)).ok()?; + + let mut tail = [0u8; 12]; // 4 bytes checksum + 8 bytes timestamp + dat_file.read_exact(&mut tail).ok()?; + + // Timestamp is the last 8 bytes, big-endian + let ts = u64::from_be_bytes([ + tail[4], tail[5], tail[6], tail[7], tail[8], tail[9], tail[10], tail[11], + ]); + if ts > 0 { + Some(ts) + } else { + None + } +} + +/// Get disk usage (total, free) in bytes for the given path. +fn get_disk_usage(path: &str) -> (u64, u64) { + use sysinfo::Disks; + let disks = Disks::new_with_refreshed_list(); + let path = std::path::Path::new(path); + // Find the disk that contains this path (longest mount point prefix match) + let mut best: Option<&sysinfo::Disk> = None; + let mut best_len = 0; + for disk in disks.list() { + let mount = disk.mount_point(); + if path.starts_with(mount) && mount.as_os_str().len() > best_len { + best_len = mount.as_os_str().len(); + best = Some(disk); + } + } + match best { + Some(disk) => (disk.total_space(), disk.available_space()), + None => (0, 0), + } +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::config::MinFreeSpace; + use crate::remote_storage::s3_tier::{global_s3_tier_registry, S3TierBackend, S3TierConfig}; + use crate::security::{Guard, SigningKey}; + use crate::storage::needle_map::NeedleMapKind; + use crate::storage::store::Store; + use std::sync::RwLock; + use tempfile::TempDir; + use tokio_stream::StreamExt; + + #[test] + fn test_parse_grpc_address_with_explicit_grpc_port() { + // Format: "ip:port.grpcPort" — used by SeaweedFS for source_data_node + let result = parse_grpc_address("192.168.1.66:8080.18080").unwrap(); + assert_eq!(result, "192.168.1.66:18080"); + } + + #[test] + fn test_parse_grpc_address_with_implicit_grpc_port() { + // Format: "ip:port" — grpc port = port + 10000 + let result = parse_grpc_address("192.168.1.66:8080").unwrap(); + assert_eq!(result, "192.168.1.66:18080"); + } + + #[test] + fn test_parse_grpc_address_localhost() { + let result = parse_grpc_address("localhost:9333").unwrap(); + assert_eq!(result, "localhost:19333"); + } + + #[test] + fn test_parse_grpc_address_with_ipv4_dots() { + // Regression: naive split on '.' breaks on IP addresses + let result = parse_grpc_address("10.0.0.1:8080.18080").unwrap(); + assert_eq!(result, "10.0.0.1:18080"); + + let result = parse_grpc_address("10.0.0.1:8080").unwrap(); + assert_eq!(result, "10.0.0.1:18080"); + } + + #[test] + fn test_parse_grpc_address_invalid() { + assert!(parse_grpc_address("no-colon").is_err()); + } + + #[test] + fn test_volume_is_remote_only_requires_missing_local_dat_file() { + let temp_dir = tempfile::tempdir().unwrap(); + let dat_path = temp_dir.path().join("1.dat"); + std::fs::write(&dat_path, b"dat").unwrap(); + + assert!(!volume_is_remote_only(dat_path.to_str().unwrap(), true)); + assert!(!volume_is_remote_only(dat_path.to_str().unwrap(), false)); + + std::fs::remove_file(&dat_path).unwrap(); + + assert!(volume_is_remote_only(dat_path.to_str().unwrap(), true)); + assert!(!volume_is_remote_only(dat_path.to_str().unwrap(), false)); + } + + fn spawn_fake_s3_server(body: Vec) -> (String, tokio::sync::oneshot::Sender<()>) { + use axum::http::{header, HeaderMap, HeaderValue, StatusCode}; + use axum::routing::any; + use axum::Router; + + let body = Arc::new(body); + let listener = std::net::TcpListener::bind("127.0.0.1:0").unwrap(); + let addr = listener.local_addr().unwrap(); + listener.set_nonblocking(true).unwrap(); + let (shutdown_tx, shutdown_rx) = tokio::sync::oneshot::channel::<()>(); + + std::thread::spawn(move || { + let runtime = tokio::runtime::Builder::new_current_thread() + .enable_all() + .build() + .unwrap(); + runtime.block_on(async move { + let app = Router::new().fallback(any(move |headers: HeaderMap| { + let body = body.clone(); + async move { + let bytes = body.as_ref(); + if let Some(range) = headers + .get(header::RANGE) + .and_then(|value| value.to_str().ok()) + { + if let Some(range_value) = range.strip_prefix("bytes=") { + let mut parts = range_value.splitn(2, '-'); + let start = parts + .next() + .and_then(|value| value.parse::().ok()) + .unwrap_or(0); + let end = parts + .next() + .and_then(|value| value.parse::().ok()) + .unwrap_or_else(|| bytes.len().saturating_sub(1)); + let start = start.min(bytes.len()); + let end = end.min(bytes.len().saturating_sub(1)); + let payload = if start > end || start >= bytes.len() { + Vec::new() + } else { + bytes[start..=end].to_vec() + }; + let mut response_headers = HeaderMap::new(); + response_headers.insert( + header::CONTENT_RANGE, + HeaderValue::from_str(&format!( + "bytes {}-{}/{}", + start, + end, + bytes.len() + )) + .unwrap(), + ); + response_headers.insert( + header::CONTENT_LENGTH, + HeaderValue::from_str(&payload.len().to_string()).unwrap(), + ); + return (StatusCode::PARTIAL_CONTENT, response_headers, payload); + } + } + + let mut response_headers = HeaderMap::new(); + response_headers.insert( + header::CONTENT_LENGTH, + HeaderValue::from_str(&bytes.len().to_string()).unwrap(), + ); + (StatusCode::OK, response_headers, bytes.to_vec()) + } + })); + + let listener = tokio::net::TcpListener::from_std(listener).unwrap(); + axum::serve(listener, app) + .with_graceful_shutdown(async move { + let _ = shutdown_rx.await; + }) + .await + .unwrap(); + }); + }); + + (format!("http://{}", addr), shutdown_tx) + } + + fn make_remote_only_service() -> ( + VolumeGrpcService, + TempDir, + tokio::sync::oneshot::Sender<()>, + Vec, + u64, + ) { + let tmp = TempDir::new().unwrap(); + let dir = tmp.path().to_str().unwrap(); + + let (dat_bytes, super_block_size) = { + let mut volume = crate::storage::volume::Volume::new( + dir, + dir, + "", + VolumeId(1), + NeedleMapKind::InMemory, + None, + None, + 0, + Version::current(), + ) + .unwrap(); + let mut needle = Needle { + id: NeedleId(7), + cookie: Cookie(0x7788), + data: b"remote-incremental-copy".to_vec(), + data_size: "remote-incremental-copy".len() as u32, + ..Needle::default() + }; + volume.write_needle(&mut needle, true).unwrap(); + volume.sync_to_disk().unwrap(); + ( + std::fs::read(volume.file_name(".dat")).unwrap(), + volume.super_block.block_size() as u64, + ) + }; + + let dat_path = format!("{}/1.dat", dir); + std::fs::remove_file(&dat_path).unwrap(); + + let (endpoint, shutdown_tx) = spawn_fake_s3_server(dat_bytes.clone()); + global_s3_tier_registry().write().unwrap().clear(); + let tier_config = S3TierConfig { + access_key: "access".to_string(), + secret_key: "secret".to_string(), + region: "us-east-1".to_string(), + bucket: "bucket-a".to_string(), + endpoint, + storage_class: "STANDARD".to_string(), + force_path_style: true, + }; + { + let mut registry = global_s3_tier_registry().write().unwrap(); + registry.register("s3.default".to_string(), S3TierBackend::new(&tier_config)); + registry.register("s3".to_string(), S3TierBackend::new(&tier_config)); + } + + let vif = crate::storage::volume::VifVolumeInfo { + files: vec![crate::storage::volume::VifRemoteFile { + backend_type: "s3".to_string(), + backend_id: "default".to_string(), + key: "remote-key".to_string(), + offset: 0, + file_size: dat_bytes.len() as u64, + modified_time: 123, + extension: ".dat".to_string(), + }], + version: Version::current().0 as u32, + bytes_offset: crate::storage::types::OFFSET_SIZE as u32, + dat_file_size: dat_bytes.len() as i64, + ..Default::default() + }; + std::fs::write( + format!("{}/1.vif", dir), + serde_json::to_string_pretty(&vif).unwrap(), + ) + .unwrap(); + + let mut store = Store::new(NeedleMapKind::InMemory); + store + .add_location( + dir, + dir, + 10, + DiskType::HardDrive, + MinFreeSpace::Percent(1.0), + Vec::new(), + ) + .unwrap(); + + let state = Arc::new(VolumeServerState { + store: RwLock::new(store), + guard: RwLock::new(Guard::new( + &[], + SigningKey(vec![]), + 0, + SigningKey(vec![]), + 0, + )), + is_stopping: RwLock::new(false), + maintenance: std::sync::atomic::AtomicBool::new(false), + state_version: std::sync::atomic::AtomicU32::new(0), + concurrent_upload_limit: 0, + concurrent_download_limit: 0, + inflight_upload_data_timeout: std::time::Duration::from_secs(60), + inflight_download_data_timeout: std::time::Duration::from_secs(60), + inflight_upload_bytes: std::sync::atomic::AtomicI64::new(0), + inflight_download_bytes: std::sync::atomic::AtomicI64::new(0), + upload_notify: tokio::sync::Notify::new(), + download_notify: tokio::sync::Notify::new(), + data_center: String::new(), + rack: String::new(), + file_size_limit_bytes: 0, + maintenance_byte_per_second: 0, + is_heartbeating: std::sync::atomic::AtomicBool::new(true), + has_master: false, + pre_stop_seconds: 0, + volume_state_notify: tokio::sync::Notify::new(), + write_queue: std::sync::OnceLock::new(), + s3_tier_registry: std::sync::RwLock::new( + crate::remote_storage::s3_tier::S3TierRegistry::new(), + ), + read_mode: crate::config::ReadMode::Local, + master_url: String::new(), + master_urls: Vec::new(), + self_url: String::new(), + http_client: reqwest::Client::new(), + outgoing_http_scheme: "http".to_string(), + outgoing_grpc_tls: None, + metrics_runtime: std::sync::RwLock::new( + crate::server::volume_server::RuntimeMetricsConfig::default(), + ), + metrics_notify: tokio::sync::Notify::new(), + fix_jpg_orientation: false, + has_slow_read: false, + read_buffer_size_bytes: 1024 * 1024, + security_file: String::new(), + cli_white_list: vec![], + state_file_path: String::new(), + }); + + ( + VolumeGrpcService { state }, + tmp, + shutdown_tx, + dat_bytes, + super_block_size, + ) + } + + fn make_local_service_with_volume( + collection: &str, + ttl: Option, + ) -> (VolumeGrpcService, TempDir) { + let tmp = TempDir::new().unwrap(); + let dir = tmp.path().to_str().unwrap(); + + let mut store = Store::new(NeedleMapKind::InMemory); + store + .add_location( + dir, + dir, + 10, + DiskType::HardDrive, + MinFreeSpace::Percent(1.0), + Vec::new(), + ) + .unwrap(); + store + .add_volume( + VolumeId(1), + collection, + None, + ttl, + 0, + DiskType::HardDrive, + Version::current(), + ) + .unwrap(); + { + let (_, volume) = store.find_volume_mut(VolumeId(1)).unwrap(); + let mut needle = Needle { + id: NeedleId(11), + cookie: Cookie(0x3344), + data: b"ec-generate".to_vec(), + data_size: b"ec-generate".len() as u32, + ..Needle::default() + }; + volume.write_needle(&mut needle, true).unwrap(); + volume.sync_to_disk().unwrap(); + } + + let state = Arc::new(VolumeServerState { + store: RwLock::new(store), + guard: RwLock::new(Guard::new( + &[], + SigningKey(vec![]), + 0, + SigningKey(vec![]), + 0, + )), + is_stopping: RwLock::new(false), + maintenance: std::sync::atomic::AtomicBool::new(false), + state_version: std::sync::atomic::AtomicU32::new(0), + concurrent_upload_limit: 0, + concurrent_download_limit: 0, + inflight_upload_data_timeout: std::time::Duration::from_secs(60), + inflight_download_data_timeout: std::time::Duration::from_secs(60), + inflight_upload_bytes: std::sync::atomic::AtomicI64::new(0), + inflight_download_bytes: std::sync::atomic::AtomicI64::new(0), + upload_notify: tokio::sync::Notify::new(), + download_notify: tokio::sync::Notify::new(), + data_center: String::new(), + rack: String::new(), + file_size_limit_bytes: 0, + maintenance_byte_per_second: 0, + is_heartbeating: std::sync::atomic::AtomicBool::new(true), + has_master: false, + pre_stop_seconds: 0, + volume_state_notify: tokio::sync::Notify::new(), + write_queue: std::sync::OnceLock::new(), + s3_tier_registry: std::sync::RwLock::new( + crate::remote_storage::s3_tier::S3TierRegistry::new(), + ), + read_mode: crate::config::ReadMode::Local, + master_url: String::new(), + master_urls: Vec::new(), + self_url: String::new(), + http_client: reqwest::Client::new(), + outgoing_http_scheme: "http".to_string(), + outgoing_grpc_tls: None, + metrics_runtime: std::sync::RwLock::new( + crate::server::volume_server::RuntimeMetricsConfig::default(), + ), + metrics_notify: tokio::sync::Notify::new(), + fix_jpg_orientation: false, + has_slow_read: false, + read_buffer_size_bytes: 1024 * 1024, + security_file: String::new(), + cli_white_list: vec![], + state_file_path: String::new(), + }); + + (VolumeGrpcService { state }, tmp) + } + + #[tokio::test(flavor = "multi_thread", worker_threads = 2)] + async fn test_volume_incremental_copy_streams_remote_only_volume_data() { + let (service, _tmp, shutdown_tx, dat_bytes, super_block_size) = make_remote_only_service(); + + let response = service + .volume_incremental_copy(Request::new( + volume_server_pb::VolumeIncrementalCopyRequest { + volume_id: 1, + since_ns: 0, + }, + )) + .await + .unwrap(); + + let mut stream = response.into_inner(); + let mut copied = Vec::new(); + while let Some(message) = stream.next().await { + copied.extend_from_slice(&message.unwrap().file_content); + } + + assert_eq!(copied, dat_bytes[super_block_size as usize..]); + + let _ = shutdown_tx.send(()); + global_s3_tier_registry().write().unwrap().clear(); + } + + #[tokio::test] + async fn test_volume_ec_shards_generate_persists_expire_at_sec() { + let ttl = crate::storage::needle::ttl::TTL::read("3m").unwrap(); + let (service, tmp) = make_local_service_with_volume("ttl", Some(ttl)); + let before = std::time::SystemTime::now() + .duration_since(std::time::UNIX_EPOCH) + .unwrap() + .as_secs(); + + service + .volume_ec_shards_generate(Request::new( + volume_server_pb::VolumeEcShardsGenerateRequest { + volume_id: 1, + collection: "ttl".to_string(), + }, + )) + .await + .unwrap(); + + let vif_path = tmp.path().join("ttl_1.vif"); + let vif: crate::storage::volume::VifVolumeInfo = + serde_json::from_str(&std::fs::read_to_string(vif_path).unwrap()).unwrap(); + assert!(vif.expire_at_sec >= before + ttl.to_seconds()); + assert!(vif.expire_at_sec <= before + ttl.to_seconds() + 5); + } +} diff --git a/seaweed-volume/src/server/handlers.rs b/seaweed-volume/src/server/handlers.rs new file mode 100644 index 000000000..83e43fb67 --- /dev/null +++ b/seaweed-volume/src/server/handlers.rs @@ -0,0 +1,3913 @@ +//! HTTP handlers for volume server operations. +//! +//! Implements GET/HEAD (read), POST/PUT (write), DELETE, /status, /healthz. +//! Matches Go's volume_server_handlers_read.go, volume_server_handlers_write.go, +//! volume_server_handlers_admin.go. + +use std::collections::HashMap; +use std::future::Future; +use std::sync::atomic::Ordering; +use std::sync::Arc; + +use axum::body::Body; +use axum::extract::{Path, Query, State}; +use axum::http::{header, HeaderMap, Method, Request, StatusCode}; +use axum::response::{IntoResponse, Response}; +use serde::{Deserialize, Serialize}; + +use super::grpc_client::{build_grpc_endpoint, GRPC_MAX_MESSAGE_SIZE}; +use super::volume_server::{normalize_outgoing_http_url, VolumeServerState}; +use crate::config::ReadMode; +use crate::metrics; +use crate::pb::volume_server_pb; +use crate::storage::needle::needle::Needle; +use crate::storage::types::*; + +// ============================================================================ +// Inflight Throttle Guard +// ============================================================================ + +/// RAII guard that subtracts bytes from an atomic counter and notifies waiters on drop. +struct InflightGuard<'a> { + counter: &'a std::sync::atomic::AtomicI64, + bytes: i64, + notify: &'a tokio::sync::Notify, + metric: &'a prometheus::IntGauge, +} + +impl<'a> Drop for InflightGuard<'a> { + fn drop(&mut self) { + let new_val = self.counter.fetch_sub(self.bytes, Ordering::Relaxed) - self.bytes; + self.metric.set(new_val); + self.notify.notify_waiters(); + } +} + +/// Body wrapper that tracks download inflight bytes and releases them when dropped. +struct TrackedBody { + data: Vec, + state: Arc, + bytes: i64, +} + +impl http_body::Body for TrackedBody { + type Data = bytes::Bytes; + type Error = std::convert::Infallible; + + fn poll_frame( + mut self: std::pin::Pin<&mut Self>, + _cx: &mut std::task::Context<'_>, + ) -> std::task::Poll, Self::Error>>> { + if self.data.is_empty() { + return std::task::Poll::Ready(None); + } + let data = std::mem::take(&mut self.data); + std::task::Poll::Ready(Some(Ok(http_body::Frame::data(bytes::Bytes::from(data))))) + } + + fn size_hint(&self) -> http_body::SizeHint { + http_body::SizeHint::with_exact(self.data.len() as u64) + } +} + +impl Drop for TrackedBody { + fn drop(&mut self) { + let new_val = self + .state + .inflight_download_bytes + .fetch_sub(self.bytes, Ordering::Relaxed) + - self.bytes; + metrics::INFLIGHT_DOWNLOAD_SIZE.set(new_val); + self.state.download_notify.notify_waiters(); + } +} + +fn finalize_bytes_response( + status: StatusCode, + headers: HeaderMap, + data: Vec, + state: Option>, +) -> Response { + if let Some(state) = state { + let data_len = data.len() as i64; + let new_val = state + .inflight_download_bytes + .fetch_add(data_len, Ordering::Relaxed) + + data_len; + metrics::INFLIGHT_DOWNLOAD_SIZE.set(new_val); + let tracked_body = TrackedBody { + data, + state, + bytes: data_len, + }; + let body = Body::new(tracked_body); + let mut resp = Response::new(body); + *resp.status_mut() = status; + *resp.headers_mut() = headers; + resp + } else { + (status, headers, data).into_response() + } +} + +// ============================================================================ +// Streaming Body for Large Files +// ============================================================================ + +/// Threshold in bytes above which we stream needle data instead of buffering. +const STREAMING_THRESHOLD: u32 = 1024 * 1024; // 1 MB + +/// Default chunk size for streaming reads from the dat file. +const DEFAULT_STREAMING_CHUNK_SIZE: usize = 64 * 1024; // 64 KB + +/// A body that streams needle data from the dat file in chunks using pread, +/// avoiding loading the entire payload into memory at once. +struct StreamingBody { + source: crate::storage::volume::NeedleStreamSource, + data_offset: u64, + data_size: u32, + pos: usize, + chunk_size: usize, + data_file_access_control: Arc, + hold_read_lock_for_stream: bool, + _held_read_lease: Option, + /// Pending result from spawn_blocking, polled to completion. + pending: Option>>, + /// For download throttling — released on drop. + state: Option>, + tracked_bytes: i64, + /// Server state used to re-lookup needle offset if compaction occurs during streaming. + server_state: Arc, + /// Volume ID for compaction-revision re-lookup. + volume_id: crate::storage::types::VolumeId, + /// Needle ID for compaction-revision re-lookup. + needle_id: crate::storage::types::NeedleId, + /// Compaction revision at the time of the initial read; if the volume's revision + /// changes between chunks, the needle may have moved and we must re-lookup its offset. + compaction_revision: u16, +} + +impl http_body::Body for StreamingBody { + type Data = bytes::Bytes; + type Error = std::io::Error; + + fn poll_frame( + mut self: std::pin::Pin<&mut Self>, + cx: &mut std::task::Context<'_>, + ) -> std::task::Poll, Self::Error>>> { + loop { + // If we have a pending read, poll it + if let Some(ref mut handle) = self.pending { + match std::pin::Pin::new(handle).poll(cx) { + std::task::Poll::Pending => return std::task::Poll::Pending, + std::task::Poll::Ready(result) => { + self.pending = None; + match result { + Ok(Ok(chunk)) => { + let len = chunk.len(); + self.pos += len; + return std::task::Poll::Ready(Some(Ok(http_body::Frame::data( + chunk, + )))); + } + Ok(Err(e)) => return std::task::Poll::Ready(Some(Err(e))), + Err(e) => { + return std::task::Poll::Ready(Some(Err(std::io::Error::new( + std::io::ErrorKind::Other, + e, + )))) + } + } + } + } + } + + let total = self.data_size as usize; + if self.pos >= total { + return std::task::Poll::Ready(None); + } + + // Check if compaction has changed the needle's disk location (Go parity: + // readNeedleDataInto re-reads the needle offset when CompactionRevision changes). + let relookup_result = { + let store = self.server_state.store.read().unwrap(); + if let Some((_, vol)) = store.find_volume(self.volume_id) { + if vol.super_block.compaction_revision != self.compaction_revision { + // Compaction occurred — re-lookup the needle's data offset + Some(vol.re_lookup_needle_data_offset(self.needle_id)) + } else { + None + } + } else { + None + } + }; + if let Some(result) = relookup_result { + match result { + Ok((new_offset, new_rev)) => { + self.data_offset = new_offset; + self.compaction_revision = new_rev; + } + Err(_) => { + return std::task::Poll::Ready(Some(Err(std::io::Error::new( + std::io::ErrorKind::NotFound, + "needle not found after compaction", + )))); + } + } + } + + let chunk_len = std::cmp::min(self.chunk_size, total - self.pos); + let file_offset = self.data_offset + self.pos as u64; + + let source_clone = match self.source.clone_for_read() { + Ok(source) => source, + Err(e) => return std::task::Poll::Ready(Some(Err(e))), + }; + let data_file_access_control = self.data_file_access_control.clone(); + let hold_read_lock_for_stream = self.hold_read_lock_for_stream; + + let handle = tokio::task::spawn_blocking(move || { + let _lease = if hold_read_lock_for_stream { + None + } else { + Some(data_file_access_control.read_lock()) + }; + let mut buf = vec![0u8; chunk_len]; + source_clone.read_exact_at(&mut buf, file_offset)?; + Ok::(bytes::Bytes::from(buf)) + }); + + self.pending = Some(handle); + // Loop back to poll the newly created future + } + } +} + +impl Drop for StreamingBody { + fn drop(&mut self) { + if let Some(ref st) = self.state { + let new_val = st + .inflight_download_bytes + .fetch_sub(self.tracked_bytes, Ordering::Relaxed) + - self.tracked_bytes; + metrics::INFLIGHT_DOWNLOAD_SIZE.set(new_val); + st.download_notify.notify_waiters(); + } + } +} + +// ============================================================================ +// URL Parsing +// ============================================================================ + +/// Parse volume ID and file ID from URL path. +/// Supports: "vid,fid", "vid/fid", "vid,fid.ext", "vid/fid/filename.ext" +/// Extract the file_id string (e.g., "3,01637037d6") from a URL path for JWT validation. +fn extract_file_id(path: &str) -> String { + let path = path.trim_start_matches('/'); + // Strip extension and filename after second slash + if let Some(comma) = path.find(',') { + let after_comma = &path[comma + 1..]; + let fid_part = if let Some(slash) = after_comma.find('/') { + &after_comma[..slash] + } else if let Some(dot) = after_comma.rfind('.') { + &after_comma[..dot] + } else { + after_comma + }; + // Strip "_suffix" from fid (Go does this for filenames appended with underscore) + let fid_part = if let Some(underscore) = fid_part.rfind('_') { + &fid_part[..underscore] + } else { + fid_part + }; + format!("{},{}", &path[..comma], fid_part) + } else { + path.to_string() + } +} + +fn streaming_chunk_size(read_buffer_size_bytes: usize, data_size: usize) -> usize { + std::cmp::min( + read_buffer_size_bytes.max(DEFAULT_STREAMING_CHUNK_SIZE), + data_size.max(1), + ) +} + +fn parse_url_path(path: &str) -> Option<(VolumeId, NeedleId, Cookie)> { + let path = path.trim_start_matches('/'); + + // Try "vid,fid" or "vid/fid" or "vid/fid/filename" formats + let (vid_str, fid_part) = if let Some(pos) = path.find(',') { + (&path[..pos], &path[pos + 1..]) + } else if let Some(pos) = path.find('/') { + (&path[..pos], &path[pos + 1..]) + } else { + return None; + }; + + // For fid part, strip extension from the fid (not from filename) + // "vid,fid.ext" -> fid is before dot + // "vid/fid/filename.ext" -> fid is the part before the second slash + let fid_str = if let Some(slash_pos) = fid_part.find('/') { + // "fid/filename.ext" - fid is before the slash + &fid_part[..slash_pos] + } else if let Some(dot) = fid_part.rfind('.') { + // "fid.ext" - strip extension + &fid_part[..dot] + } else { + fid_part + }; + + let vid = VolumeId::parse(vid_str).ok()?; + let (needle_id, cookie) = + crate::storage::needle::needle::parse_needle_id_cookie(fid_str).ok()?; + + Some((vid, needle_id, cookie)) +} + +// ============================================================================ +// Volume Lookup + Proxy/Redirect +// ============================================================================ + +/// A volume location returned by master lookup. +#[derive(Clone, Debug, Deserialize)] +struct VolumeLocation { + url: String, + #[serde(rename = "publicUrl")] + public_url: String, + #[serde(rename = "grpcPort", default)] + grpc_port: u32, +} + +/// Master /dir/lookup response. +#[derive(Debug, Deserialize)] +struct LookupResult { + #[serde(default)] + locations: Option>, + #[serde(default)] + error: Option, +} + +/// Look up volume locations from the master via HTTP /dir/lookup. +async fn lookup_volume( + client: &reqwest::Client, + scheme: &str, + master_url: &str, + volume_id: u32, +) -> Result, String> { + let url = normalize_outgoing_http_url( + scheme, + &format!("{}/dir/lookup?volumeId={}", master_url, volume_id), + )?; + let resp = client + .get(&url) + .send() + .await + .map_err(|e| format!("lookup request failed: {}", e))?; + let result: LookupResult = resp + .json() + .await + .map_err(|e| format!("lookup parse failed: {}", e))?; + if let Some(err) = result.error { + if !err.is_empty() { + return Err(err); + } + } + Ok(result.locations.unwrap_or_default()) +} + +fn grpc_address_for_location(location: &VolumeLocation) -> Result { + let raw = location + .url + .trim_start_matches("http://") + .trim_start_matches("https://"); + + if location.grpc_port > 0 { + let (host, _) = raw + .rsplit_once(':') + .ok_or_else(|| format!("cannot parse address: {}", location.url))?; + return Ok(format!("{}:{}", host, location.grpc_port)); + } + + if let Some(colon_idx) = raw.rfind(':') { + let port_part = &raw[colon_idx + 1..]; + if let Some(dot_idx) = port_part.rfind('.') { + let host = &raw[..colon_idx]; + let grpc_port = &port_part[dot_idx + 1..]; + grpc_port + .parse::() + .map_err(|e| format!("invalid grpc port: {}", e))?; + return Ok(format!("{}:{}", host, grpc_port)); + } + + let port: u16 = port_part + .parse() + .map_err(|e| format!("invalid port: {}", e))?; + let host = &raw[..colon_idx]; + return Ok(format!("{}:{}", host, port as u32 + 10000)); + } + + Err(format!("cannot parse address: {}", location.url)) +} + +async fn batch_delete_file_ids( + state: &VolumeServerState, + file_ids: &[String], +) -> Result<(), String> { + let mut lookup_cache: HashMap> = HashMap::new(); + let mut server_to_file_ids: HashMap> = HashMap::new(); + + for file_id in file_ids { + let parsed = crate::storage::needle::needle::FileId::parse(file_id) + .map_err(|e| format!("chunk delete {}: {}", file_id, e))?; + let volume_id = parsed.volume_id.0; + + let locations = if let Some(locations) = lookup_cache.get(&volume_id) { + locations.clone() + } else { + let locations = lookup_volume( + &state.http_client, + &state.outgoing_http_scheme, + &state.master_url, + volume_id, + ) + .await + .map_err(|e| format!("chunk delete {}: {}", file_id, e))?; + if locations.is_empty() { + return Err(format!("chunk delete {}: file not found", file_id)); + } + lookup_cache.insert(volume_id, locations.clone()); + locations + }; + + for location in locations { + let grpc_addr = grpc_address_for_location(&location) + .map_err(|e| format!("chunk delete {}: {}", file_id, e))?; + server_to_file_ids + .entry(grpc_addr) + .or_default() + .push(file_id.clone()); + } + } + + for (grpc_addr, batch) in server_to_file_ids { + let endpoint = build_grpc_endpoint(&grpc_addr, state.outgoing_grpc_tls.as_ref()) + .map_err(|e| format!("batch delete {}: {}", grpc_addr, e))?; + let channel = endpoint + .connect() + .await + .map_err(|e| format!("batch delete {}: {}", grpc_addr, e))?; + let mut client = + volume_server_pb::volume_server_client::VolumeServerClient::with_interceptor( + channel, + super::request_id::outgoing_request_id_interceptor, + ) + .max_decoding_message_size(GRPC_MAX_MESSAGE_SIZE) + .max_encoding_message_size(GRPC_MAX_MESSAGE_SIZE); + + let response = client + .batch_delete(volume_server_pb::BatchDeleteRequest { + file_ids: batch.clone(), + skip_cookie_check: true, + }) + .await + .map_err(|e| format!("batch delete {}: {}", grpc_addr, e))? + .into_inner(); + + for result in response.results { + if !result.error.is_empty() { + return Err(format!("chunk delete {}: {}", result.file_id, result.error)); + } + if result.status >= 400 { + return Err(format!( + "chunk delete {}: status {}", + result.file_id, result.status + )); + } + } + } + + Ok(()) +} + +/// Helper to synchronously replicate a request to peer volume servers. +async fn do_replicated_request( + state: &VolumeServerState, + vid: u32, + method: axum::http::Method, + path: &str, + query: &str, + headers: &axum::http::HeaderMap, + body: Option, +) -> Result<(), String> { + let locations = lookup_volume( + &state.http_client, + &state.outgoing_http_scheme, + &state.master_url, + vid, + ) + .await + .map_err(|e| format!("lookup volume failed: {}", e))?; + + let remote_locations: Vec<_> = locations + .into_iter() + .filter(|loc| loc.url != state.self_url && loc.public_url != state.self_url) + .collect(); + + if remote_locations.is_empty() { + return Ok(()); + } + + let new_query = if query.is_empty() { + String::from("type=replicate") + } else { + format!("{}&type=replicate", query) + }; + + let mut futures = Vec::new(); + for loc in remote_locations { + let url = normalize_outgoing_http_url( + &state.outgoing_http_scheme, + &format!("{}{}?{}", loc.url, path, new_query), + )?; + let client = state.http_client.clone(); + + let mut req_builder = client.request(method.clone(), &url); + + // Forward relevant headers + if let Some(ct) = headers.get(axum::http::header::CONTENT_TYPE) { + req_builder = req_builder.header(axum::http::header::CONTENT_TYPE, ct); + } + if let Some(ce) = headers.get(axum::http::header::CONTENT_ENCODING) { + req_builder = req_builder.header(axum::http::header::CONTENT_ENCODING, ce); + } + if let Some(md5) = headers.get("Content-MD5") { + req_builder = req_builder.header("Content-MD5", md5); + } + if let Some(auth) = headers.get(axum::http::header::AUTHORIZATION) { + req_builder = req_builder.header(axum::http::header::AUTHORIZATION, auth); + } + + if let Some(ref b) = body { + req_builder = req_builder.body(b.clone()); + } + + futures.push(async move { + match req_builder.send().await { + Ok(r) if r.status().is_success() => Ok(()), + Ok(r) => Err(format!("{} returned status {}", url, r.status())), + Err(e) => Err(format!("{} failed: {}", url, e)), + } + }); + } + + let results = futures::future::join_all(futures).await; + let mut errors = Vec::new(); + for res in results { + if let Err(e) = res { + errors.push(e); + } + } + + if !errors.is_empty() { + return Err(errors.join(", ")); + } + + Ok(()) +} + +/// Extracted request info needed for proxy/redirect (avoids borrowing Request across await). +struct ProxyRequestInfo { + original_headers: HeaderMap, + original_query: String, + path: String, + vid_str: String, + fid_str: String, +} + +fn build_proxy_request_info( + path: &str, + headers: &HeaderMap, + query_string: &str, +) -> Option { + let trimmed = path.trim_start_matches('/'); + let (vid_str, fid_str) = if let Some(pos) = trimmed.find(',') { + let raw_fid = &trimmed[pos + 1..]; + let fid = if let Some(slash) = raw_fid.find('/') { + &raw_fid[..slash] + } else if let Some(dot) = raw_fid.rfind('.') { + &raw_fid[..dot] + } else { + raw_fid + }; + (trimmed[..pos].to_string(), fid.to_string()) + } else if let Some(pos) = trimmed.find('/') { + let after = &trimmed[pos + 1..]; + let fid_part = if let Some(slash) = after.find('/') { + &after[..slash] + } else { + after + }; + (trimmed[..pos].to_string(), fid_part.to_string()) + } else { + return None; + }; + + Some(ProxyRequestInfo { + original_headers: headers.clone(), + original_query: query_string.to_string(), + path: path.to_string(), + vid_str, + fid_str, + }) +} + +/// Handle proxy or redirect for a non-local volume read. +async fn proxy_or_redirect_to_target( + state: &VolumeServerState, + info: ProxyRequestInfo, + vid: VolumeId, + allow_local_redirect: bool, +) -> Response { + // Look up volume locations from master + let locations = match lookup_volume( + &state.http_client, + &state.outgoing_http_scheme, + &state.master_url, + vid.0, + ) + .await + { + Ok(locs) => locs, + Err(e) => { + tracing::warn!("volume lookup failed for {}: {}", vid.0, e); + return StatusCode::NOT_FOUND.into_response(); + } + }; + + if locations.is_empty() { + return StatusCode::NOT_FOUND.into_response(); + } + + // Filter out self, then shuffle remaining + let mut candidates: Vec<&VolumeLocation> = locations + .iter() + .filter(|loc| !loc.url.contains(&state.self_url)) + .collect(); + + if candidates.is_empty() { + return StatusCode::NOT_FOUND.into_response(); + } + + // Shuffle for load balancing + if candidates.len() >= 2 { + use rand::seq::SliceRandom; + let mut rng = rand::thread_rng(); + candidates.shuffle(&mut rng); + } + + let target = candidates[0]; + + match state.read_mode { + ReadMode::Proxy => proxy_request(state, &info, target).await, + ReadMode::Redirect => redirect_request(&info, target, &state.outgoing_http_scheme), + ReadMode::Local if allow_local_redirect => { + redirect_request(&info, target, &state.outgoing_http_scheme) + } + ReadMode::Local => unreachable!(), + } +} + +/// Proxy the request to the target volume server. +async fn proxy_request( + state: &VolumeServerState, + info: &ProxyRequestInfo, + target: &VolumeLocation, +) -> Response { + // Build target URL, adding proxied=true query param + let path = info.path.trim_start_matches('/'); + + let raw_target = if info.original_query.is_empty() { + format!("{}/{}?proxied=true", target.url, path) + } else { + format!( + "{}/{}?{}&proxied=true", + target.url, path, info.original_query + ) + }; + let target_url = match normalize_outgoing_http_url(&state.outgoing_http_scheme, &raw_target) { + Ok(url) => url, + Err(e) => { + tracing::warn!("proxy target url {} invalid: {}", raw_target, e); + return StatusCode::INTERNAL_SERVER_ERROR.into_response(); + } + }; + + // Build the proxy request + let mut req_builder = state.http_client.get(&target_url); + + // Forward all original headers + for (name, value) in &info.original_headers { + if let Ok(v) = value.to_str() { + req_builder = req_builder.header(name.as_str(), v); + } + } + + let resp = match req_builder.send().await { + Ok(r) => r, + Err(e) => { + tracing::warn!("proxy request to {} failed: {}", target_url, e); + return StatusCode::INTERNAL_SERVER_ERROR.into_response(); + } + }; + + // Build response, copying headers and body from remote + let status = + StatusCode::from_u16(resp.status().as_u16()).unwrap_or(StatusCode::INTERNAL_SERVER_ERROR); + let mut response_headers = HeaderMap::new(); + for (name, value) in resp.headers() { + if name.as_str().eq_ignore_ascii_case("server") { + continue; + } + response_headers.insert(name.clone(), value.clone()); + } + + // Stream the proxy response body instead of buffering it entirely + let byte_stream = resp.bytes_stream(); + let body = Body::from_stream(byte_stream); + + let mut response = Response::new(body); + *response.status_mut() = status; + *response.headers_mut() = response_headers; + response +} + +/// Return a redirect response to the target volume server. +fn redirect_request(info: &ProxyRequestInfo, target: &VolumeLocation, scheme: &str) -> Response { + // Build query string: preserve collection, add proxied=true, drop readDeleted (Go parity) + let mut query_params = Vec::new(); + if !info.original_query.is_empty() { + for param in info.original_query.split('&') { + if let Some((key, value)) = param.split_once('=') { + if key == "collection" { + query_params.push(format!("collection={}", value)); + } + // Intentionally drop readDeleted and other params (Go parity) + } + } + } + query_params.push("proxied=true".to_string()); + let query = query_params.join("&"); + + let raw_target = format!( + "{}/{},{}?{}", + target.url, &info.vid_str, &info.fid_str, query + ); + let location = match normalize_outgoing_http_url(scheme, &raw_target) { + Ok(url) => url, + Err(_) => return StatusCode::INTERNAL_SERVER_ERROR.into_response(), + }; + + Response::builder() + .status(StatusCode::MOVED_PERMANENTLY) + .header("Location", &location) + .header("Content-Type", "text/html; charset=utf-8") + .body(Body::from(format!( + "Moved Permanently.\n\n", + location + ))) + .unwrap_or_else(|_| StatusCode::INTERNAL_SERVER_ERROR.into_response()) +} + +// ============================================================================ +// Query parameters +// ============================================================================ + +#[derive(Deserialize, Default)] +pub struct ReadQueryParams { + #[serde(rename = "response-content-type")] + pub response_content_type: Option, + #[serde(rename = "response-cache-control")] + pub response_cache_control: Option, + pub dl: Option, + #[serde(rename = "readDeleted")] + pub read_deleted: Option, + /// cm=false disables chunk manifest expansion (returns raw manifest JSON). + pub cm: Option, + /// Image resize width + pub width: Option, + /// Image resize height + pub height: Option, + /// Image resize mode: "fit" or "fill" + pub mode: Option, + /// Image crop parameters + pub crop_x1: Option, + pub crop_y1: Option, + pub crop_x2: Option, + pub crop_y2: Option, + /// S3 response passthrough headers + #[serde(rename = "response-content-encoding")] + pub response_content_encoding: Option, + #[serde(rename = "response-expires")] + pub response_expires: Option, + #[serde(rename = "response-content-language")] + pub response_content_language: Option, + #[serde(rename = "response-content-disposition")] + pub response_content_disposition: Option, + /// Pretty print JSON response + pub pretty: Option, + /// JSONP callback function name + pub callback: Option, +} + +// ============================================================================ +// Read Handler (GET/HEAD) +// ============================================================================ + +/// Called from the method-dispatching store handler with a full Request. +pub async fn get_or_head_handler_from_request( + State(state): State>, + request: Request, +) -> Response { + let uri = request.uri().clone(); + let headers = request.headers().clone(); + + // Parse query params manually from URI + let query_params: ReadQueryParams = uri + .query() + .and_then(|q| serde_urlencoded::from_str(q).ok()) + .unwrap_or_default(); + + get_or_head_handler_inner(state, headers, query_params, request).await +} + +pub async fn get_or_head_handler( + State(state): State>, + headers: HeaderMap, + query: Query, + request: Request, +) -> Response { + get_or_head_handler_inner(state, headers, query.0, request).await +} + +async fn get_or_head_handler_inner( + state: Arc, + headers: HeaderMap, + query: ReadQueryParams, + request: Request, +) -> Response { + let path = request.uri().path().to_string(); + let raw_query = request.uri().query().map(|q| q.to_string()); + let method = request.method().clone(); + + // JWT check for reads — must happen BEFORE path parsing to match Go behavior. + // Go's GetOrHeadHandler calls maybeCheckJwtAuthorization before NewVolumeId, + // so invalid paths with JWT enabled return 401, not 400. + let file_id = extract_file_id(&path); + let token = extract_jwt(&headers, request.uri()); + if let Err(_) = + state + .guard + .read() + .unwrap() + .check_jwt_for_file(token.as_deref(), &file_id, false) + { + let body = serde_json::json!({"error": "wrong jwt"}); + return Response::builder() + .status(StatusCode::UNAUTHORIZED) + .header(header::CONTENT_TYPE, "application/json") + .body(Body::from(serde_json::to_string(&body).unwrap())) + .unwrap(); + } + + let (vid, needle_id, cookie) = match parse_url_path(&path) { + Some(parsed) => parsed, + None => return StatusCode::BAD_REQUEST.into_response(), + }; + + // Check if volume exists locally; if not, proxy/redirect based on read_mode. + // This mirrors Go's hasVolume + hasEcVolume check in GetOrHeadHandler. + // NOTE: The RwLockReadGuard must be dropped before any .await to keep the future Send. + let has_volume = state.store.read().unwrap().has_volume(vid); + let has_ec_volume = state.store.read().unwrap().has_ec_volume(vid); + + if !has_volume && !has_ec_volume { + // Check if already proxied (loop prevention) + let query_string = request.uri().query().unwrap_or("").to_string(); + let is_proxied = query_string.contains("proxied=true"); + + if is_proxied || state.read_mode == ReadMode::Local || state.master_url.is_empty() { + return StatusCode::NOT_FOUND.into_response(); + } + + // For redirect, fid must be stripped of extension (Go parity: parseURLPath returns raw fid). + let info = match build_proxy_request_info(&path, request.headers(), &query_string) { + Some(info) => info, + None => return StatusCode::NOT_FOUND.into_response(), + }; + + return proxy_or_redirect_to_target(&state, info, vid, false).await; + } + + // Download throttling — matches Go's checkDownloadLimit + waitForDownloadSlot + let download_guard = if state.concurrent_download_limit > 0 { + let timeout = state.inflight_download_data_timeout; + let deadline = tokio::time::Instant::now() + timeout; + let query_string = request.uri().query().unwrap_or("").to_string(); + + let current = state.inflight_download_bytes.load(Ordering::Relaxed); + if current > state.concurrent_download_limit { + metrics::HANDLER_COUNTER + .with_label_values(&[metrics::DOWNLOAD_LIMIT_COND]) + .inc(); + + // Go tries proxy to replica ONCE before entering the blocking wait + // loop (checkDownloadLimit L65). It does NOT retry on each wakeup. + let should_try_replica = + !query_string.contains("proxied=true") && !state.master_url.is_empty() && { + let store = state.store.read().unwrap(); + store.find_volume(vid).map_or(false, |(_, vol)| { + vol.super_block.replica_placement.get_copy_count() > 1 + }) + }; + if should_try_replica { + if let Some(info) = + build_proxy_request_info(&path, request.headers(), &query_string) + { + return proxy_or_redirect_to_target(&state, info, vid, true).await; + } + } + + // Blocking wait loop (Go's waitForDownloadSlot) + loop { + if tokio::time::timeout_at(deadline, state.download_notify.notified()) + .await + .is_err() + { + return json_error_with_query( + StatusCode::TOO_MANY_REQUESTS, + "download limit exceeded", + raw_query.as_deref(), + ); + } + let current = state.inflight_download_bytes.load(Ordering::Relaxed); + if current <= state.concurrent_download_limit { + break; + } + } + } + // We'll set the actual bytes after reading the needle (once we know the size) + Some(state.clone()) + } else { + None + }; + + // Read needle — branching between regular volume and EC volume paths. + // EC volumes always do a full read (no streaming/meta-only). + let mut n = Needle { + id: needle_id, + cookie, + ..Needle::default() + }; + + let read_deleted = query.read_deleted.as_deref() == Some("true"); + let has_range = headers.contains_key(header::RANGE); + let ext = extract_extension_from_path(&path); + // Go checks resize and crop extensions separately: resize supports .webp, crop does not. + let has_resize_ops = + is_image_resize_ext(&ext) && (query.width.unwrap_or(0) > 0 || query.height.unwrap_or(0) > 0); + // Go's shouldCropImages (L410) requires x2 > x1 && y2 > y1 (x1/y1 default 0). + // Only disable streaming when a real crop will actually happen. + let has_crop_ops = is_image_crop_ext(&ext) && { + let x1 = query.crop_x1.unwrap_or(0); + let y1 = query.crop_y1.unwrap_or(0); + let x2 = query.crop_x2.unwrap_or(0); + let y2 = query.crop_y2.unwrap_or(0); + x2 > x1 && y2 > y1 + }; + let has_image_ops = has_resize_ops || has_crop_ops; + + // Stream info is only available for regular volumes, not EC volumes. + let stream_info; + let bypass_cm; + let track_download; + let can_stream; + let can_handle_head_from_meta; + let can_handle_range_from_source; + + if has_ec_volume && !has_volume { + // ---- EC volume read path (always full read, no streaming) ---- + let store = state.store.read().unwrap(); + match store.find_ec_volume(vid) { + Some(ecv) => match ecv.read_ec_shard_needle(needle_id) { + Ok(Some(ec_needle)) => { + n = ec_needle; + } + Ok(None) => { + metrics::HANDLER_COUNTER + .with_label_values(&[metrics::ERROR_GET_NOT_FOUND]) + .inc(); + return StatusCode::NOT_FOUND.into_response(); + } + Err(e) => { + if e.kind() == std::io::ErrorKind::NotFound { + metrics::HANDLER_COUNTER + .with_label_values(&[metrics::ERROR_GET_NOT_FOUND]) + .inc(); + return StatusCode::NOT_FOUND.into_response(); + } + metrics::HANDLER_COUNTER + .with_label_values(&[metrics::ERROR_GET_INTERNAL]) + .inc(); + return ( + StatusCode::INTERNAL_SERVER_ERROR, + format!("read ec error: {}", e), + ) + .into_response(); + } + }, + None => { + metrics::HANDLER_COUNTER + .with_label_values(&[metrics::ERROR_GET_NOT_FOUND]) + .inc(); + return StatusCode::NOT_FOUND.into_response(); + } + } + drop(store); + + // Validate cookie (matches Go behavior after ReadEcShardNeedle) + if n.cookie != cookie { + return StatusCode::NOT_FOUND.into_response(); + } + + // EC volumes: no streaming support + stream_info = None; + bypass_cm = query.cm.as_deref() == Some("false"); + track_download = download_guard.is_some(); + can_stream = false; + can_handle_head_from_meta = false; + can_handle_range_from_source = false; + } else { + // ---- Regular volume read path (with streaming support) ---- + + // Try meta-only read first for potential streaming + let store = state.store.read().unwrap(); + let si_result = store.read_volume_needle_stream_info(vid, &mut n, read_deleted); + stream_info = match si_result { + Ok(info) => Some(info), + Err(crate::storage::volume::VolumeError::StreamingUnsupported) => None, + Err(crate::storage::volume::VolumeError::NotFound) => { + metrics::HANDLER_COUNTER + .with_label_values(&[metrics::ERROR_GET_NOT_FOUND]) + .inc(); + return StatusCode::NOT_FOUND.into_response(); + } + Err(crate::storage::volume::VolumeError::Deleted) => { + metrics::HANDLER_COUNTER + .with_label_values(&[metrics::ERROR_GET_NOT_FOUND]) + .inc(); + return StatusCode::NOT_FOUND.into_response(); + } + Err(e) => { + metrics::HANDLER_COUNTER + .with_label_values(&[metrics::ERROR_GET_INTERNAL]) + .inc(); + return ( + StatusCode::INTERNAL_SERVER_ERROR, + format!("read error: {}", e), + ) + .into_response(); + } + }; + drop(store); + + // Validate cookie + if n.cookie != cookie { + return StatusCode::NOT_FOUND.into_response(); + } + + bypass_cm = query.cm.as_deref() == Some("false"); + track_download = download_guard.is_some(); + let can_direct_source_read = stream_info.is_some() + && !n.is_compressed() + && !(n.is_chunk_manifest() && !bypass_cm) + && !has_image_ops; + + // Determine if we can stream (large, direct-source eligible, no range) + can_stream = can_direct_source_read + && n.data_size > STREAMING_THRESHOLD + && !has_range + && method != Method::HEAD; + + // Go uses meta-only reads for all HEAD requests, regardless of compression/chunked files. + can_handle_head_from_meta = stream_info.is_some() && method == Method::HEAD; + can_handle_range_from_source = can_direct_source_read && has_range; + + // For chunk manifest or any non-streaming path, we need the full data. + // If we can't stream, do a full read now. + if !can_stream && !can_handle_head_from_meta && !can_handle_range_from_source { + // Re-read with full data + let mut n_full = Needle { + id: needle_id, + cookie, + ..Needle::default() + }; + let store = state.store.read().unwrap(); + match store.read_volume_needle_opt(vid, &mut n_full, read_deleted) { + Ok(count) => { + if count < 0 { + return StatusCode::NOT_FOUND.into_response(); + } + } + Err(crate::storage::volume::VolumeError::NotFound) => { + return StatusCode::NOT_FOUND.into_response(); + } + Err(crate::storage::volume::VolumeError::Deleted) => { + return StatusCode::NOT_FOUND.into_response(); + } + Err(e) => { + return ( + StatusCode::INTERNAL_SERVER_ERROR, + format!("read error: {}", e), + ) + .into_response(); + } + } + drop(store); + // Use the full needle from here (it has the same metadata + data) + n = n_full; + } + } + + // Build ETag and Last-Modified BEFORE conditional checks and chunk manifest expansion + // (matches Go order: conditional checks first, then chunk manifest) + let etag = format!("\"{}\"", n.etag()); + + // Build Last-Modified header (RFC 1123 format) — must be done before conditional checks + let last_modified_str = if n.last_modified > 0 { + use chrono::{TimeZone, Utc}; + if let Some(dt) = Utc.timestamp_opt(n.last_modified as i64, 0).single() { + Some(dt.format("%a, %d %b %Y %H:%M:%S GMT").to_string()) + } else { + None + } + } else { + None + }; + + // Check If-Modified-Since FIRST (Go checks this before If-None-Match) + if n.last_modified > 0 { + if let Some(ims_header) = headers.get(header::IF_MODIFIED_SINCE) { + if let Ok(ims_str) = ims_header.to_str() { + // Parse HTTP date format: "Mon, 02 Jan 2006 15:04:05 GMT" + if let Ok(ims_time) = + chrono::NaiveDateTime::parse_from_str(ims_str, "%a, %d %b %Y %H:%M:%S GMT") + { + if (n.last_modified as i64) <= ims_time.and_utc().timestamp() { + let mut resp = StatusCode::NOT_MODIFIED.into_response(); + if let Some(ref lm) = last_modified_str { + resp.headers_mut() + .insert(header::LAST_MODIFIED, lm.parse().unwrap()); + } + // Go sets ETag AFTER the 304 return paths (L235), so 304 does NOT include ETag + return resp; + } + } + } + } + } + + // Check If-None-Match SECOND + if let Some(if_none_match) = headers.get(header::IF_NONE_MATCH) { + if let Ok(inm) = if_none_match.to_str() { + if inm == etag { + let mut resp = StatusCode::NOT_MODIFIED.into_response(); + if let Some(ref lm) = last_modified_str { + resp.headers_mut() + .insert(header::LAST_MODIFIED, lm.parse().unwrap()); + } + // Go sets ETag AFTER the 304 return paths (L235), so 304 does NOT include ETag + return resp; + } + } + } + + // Chunk manifest expansion (needs full data) — after conditional checks, before response + // Pass ETag so chunk manifest responses include it (matches Go: ETag is set on the + // response writer before tryHandleChunkedFile runs). + if n.is_chunk_manifest() && !bypass_cm { + if let Some(resp) = try_expand_chunk_manifest( + &state, + &n, + &headers, + &method, + &path, + &query, + &etag, + &last_modified_str, + ) { + return resp; + } + // If manifest expansion fails (invalid JSON etc.), fall through to raw data + } + + let mut response_headers = HeaderMap::new(); + response_headers.insert(header::ETAG, etag.parse().unwrap()); + + // H1: Emit pairs as response headers + if n.has_pairs() && !n.pairs.is_empty() { + if let Ok(pair_map) = + serde_json::from_slice::>(&n.pairs) + { + for (k, v) in &pair_map { + if let (Ok(hname), Ok(hval)) = ( + axum::http::HeaderName::from_bytes(k.as_bytes()), + axum::http::HeaderValue::from_str(v), + ) { + response_headers.insert(hname, hval); + } + } + } + } + + // H8: Use needle stored name when URL path has no filename (only vid,fid) + let mut filename = extract_filename_from_path(&path); + let mut ext = ext; + if n.name_size > 0 && filename.is_empty() { + filename = String::from_utf8_lossy(&n.name).to_string(); + if ext.is_empty() { + if let Some(dot_pos) = filename.rfind('.') { + ext = filename[dot_pos..].to_lowercase(); + } + } + } + + // H6: Determine Content-Type: filter application/octet-stream, use mime_guess + // For chunk manifests, skip extension-based MIME override — use stored MIME as-is (Go parity) + let content_type = if let Some(ref ct) = query.response_content_type { + Some(ct.clone()) + } else if n.is_chunk_manifest() { + // Chunk manifests: use stored MIME but filter application/octet-stream (Go L334) + if !n.mime.is_empty() { + let mt = String::from_utf8_lossy(&n.mime).to_string(); + if mt.starts_with("application/octet-stream") { + None + } else { + Some(mt) + } + } else { + None + } + } else { + // Get MIME from needle, but filter out application/octet-stream + let needle_mime = if !n.mime.is_empty() { + let mt = String::from_utf8_lossy(&n.mime).to_string(); + if mt.starts_with("application/octet-stream") { + String::new() + } else { + mt + } + } else { + String::new() + }; + + if !needle_mime.is_empty() { + Some(needle_mime) + } else { + // Fall through to extension-based detection + let detect_ext = if !ext.is_empty() { + ext.clone() + } else if !filename.is_empty() { + if let Some(dot_pos) = filename.rfind('.') { + filename[dot_pos..].to_lowercase() + } else { + String::new() + } + } else { + String::new() + }; + if !detect_ext.is_empty() { + mime_guess::from_ext(detect_ext.trim_start_matches('.')) + .first() + .map(|m| m.to_string()) + } else { + None // Omit Content-Type entirely + } + } + }; + if let Some(ref ct) = content_type { + response_headers.insert(header::CONTENT_TYPE, ct.parse().unwrap()); + } + + // Cache-Control override from query param + if let Some(ref cc) = query.response_cache_control { + response_headers.insert(header::CACHE_CONTROL, cc.parse().unwrap()); + } + + // S3 response passthrough headers + if let Some(ref ce) = query.response_content_encoding { + response_headers.insert(header::CONTENT_ENCODING, ce.parse().unwrap()); + } + if let Some(ref exp) = query.response_expires { + response_headers.insert(header::EXPIRES, exp.parse().unwrap()); + } + if let Some(ref cl) = query.response_content_language { + response_headers.insert("Content-Language", cl.parse().unwrap()); + } + if let Some(ref cd) = query.response_content_disposition { + response_headers.insert(header::CONTENT_DISPOSITION, cd.parse().unwrap()); + } + + // Last-Modified + if let Some(ref lm) = last_modified_str { + response_headers.insert(header::LAST_MODIFIED, lm.parse().unwrap()); + } + + // H7: Content-Disposition — inline by default, attachment only when dl is truthy + // Only set if not already set by response-content-disposition query param + if !response_headers.contains_key(header::CONTENT_DISPOSITION) && !filename.is_empty() { + let disposition_type = if let Some(ref dl_val) = query.dl { + if parse_go_bool(dl_val).unwrap_or(false) { + "attachment" + } else { + "inline" + } + } else { + "inline" + }; + let disposition = format_content_disposition(disposition_type, &filename); + if let Ok(hval) = disposition.parse() { + response_headers.insert(header::CONTENT_DISPOSITION, hval); + } + } + + // ---- Streaming path: large uncompressed files ---- + if can_stream { + if let Some(info) = stream_info { + response_headers.insert(header::ACCEPT_RANGES, "bytes".parse().unwrap()); + response_headers.insert( + header::CONTENT_LENGTH, + info.data_size.to_string().parse().unwrap(), + ); + + let tracked_bytes = info.data_size as i64; + let tracking_state = if download_guard.is_some() { + let new_val = state + .inflight_download_bytes + .fetch_add(tracked_bytes, Ordering::Relaxed) + + tracked_bytes; + metrics::INFLIGHT_DOWNLOAD_SIZE.set(new_val); + Some(state.clone()) + } else { + None + }; + + let streaming = StreamingBody { + source: info.source, + data_offset: info.data_file_offset, + data_size: info.data_size, + pos: 0, + chunk_size: streaming_chunk_size( + state.read_buffer_size_bytes, + info.data_size as usize, + ), + _held_read_lease: if state.has_slow_read { + None + } else { + Some(info.data_file_access_control.read_lock()) + }, + data_file_access_control: info.data_file_access_control, + hold_read_lock_for_stream: !state.has_slow_read, + pending: None, + state: tracking_state, + tracked_bytes, + server_state: state.clone(), + volume_id: info.volume_id, + needle_id: info.needle_id, + compaction_revision: info.compaction_revision, + }; + + let body = Body::new(streaming); + let mut resp = Response::new(body); + *resp.status_mut() = StatusCode::OK; + *resp.headers_mut() = response_headers; + return resp; + } + } + + if can_handle_head_from_meta { + if let Some(info) = stream_info { + response_headers.insert( + header::CONTENT_LENGTH, + info.data_size.to_string().parse().unwrap(), + ); + return (StatusCode::OK, response_headers).into_response(); + } + } + + if can_handle_range_from_source { + if let (Some(range_header), Some(info)) = (headers.get(header::RANGE), stream_info) { + if let Ok(range_str) = range_header.to_str() { + return handle_range_request_from_source( + range_str, + info, + response_headers, + track_download.then(|| state.clone()), + ); + } + } + } + + // ---- Buffered path: small files, compressed, images, range requests ---- + + // Handle compressed data: if needle is compressed, either pass through or decompress + let is_compressed = n.is_compressed(); + let mut data = n.data; + + // Check if image operations are needed — must decompress first regardless of Accept-Encoding + // Go checks resize (.webp OK) and crop (.webp NOT OK) separately. + let needs_image_ops = has_resize_ops || has_crop_ops; + + if is_compressed { + if needs_image_ops { + // Always decompress for image operations (Go decompresses before resize/crop) + use flate2::read::GzDecoder; + use std::io::Read as _; + let mut decoder = GzDecoder::new(&data[..]); + let mut decompressed = Vec::new(); + if decoder.read_to_end(&mut decompressed).is_ok() { + data = decompressed; + } + } else { + let accept_encoding = headers + .get(header::ACCEPT_ENCODING) + .and_then(|v| v.to_str().ok()) + .unwrap_or(""); + if accept_encoding.contains("gzip") + && data.len() >= 2 + && data[0] == 0x1f + && data[1] == 0x8b + { + // Go checks IsGzippedContent (magic bytes 0x1f 0x8b) before + // setting Content-Encoding: gzip + response_headers.insert(header::CONTENT_ENCODING, "gzip".parse().unwrap()); + } else { + // Decompress for client + use flate2::read::GzDecoder; + use std::io::Read as _; + let mut decoder = GzDecoder::new(&data[..]); + let mut decompressed = Vec::new(); + if decoder.read_to_end(&mut decompressed).is_ok() { + data = decompressed; + } + } + } + } + + // Image crop and resize — Go checks extensions separately per operation. + // Crop: .png .jpg .jpeg .gif (no .webp). Resize: .png .jpg .jpeg .gif .webp. + if is_image_crop_ext(&ext) { + data = maybe_crop_image(&data, &ext, &query); + } + if is_image_resize_ext(&ext) { + data = maybe_resize_image(&data, &ext, &query); + } + + // Accept-Ranges + response_headers.insert(header::ACCEPT_RANGES, "bytes".parse().unwrap()); + + // Check Range header + if let Some(range_header) = headers.get(header::RANGE) { + if let Ok(range_str) = range_header.to_str() { + return handle_range_request( + range_str, + &data, + response_headers, + track_download.then(|| state.clone()), + ); + } + } + + if method == Method::HEAD { + response_headers.insert( + header::CONTENT_LENGTH, + data.len().to_string().parse().unwrap(), + ); + return (StatusCode::OK, response_headers).into_response(); + } + + finalize_bytes_response( + StatusCode::OK, + response_headers, + data, + track_download.then(|| state.clone()), + ) +} + +/// Handle HTTP Range requests. Returns 206 Partial Content or 416 Range Not Satisfiable. +#[derive(Clone, Copy)] +struct HttpRange { + start: i64, + length: i64, +} + +fn parse_range_header(s: &str, size: i64) -> Result, &'static str> { + if s.is_empty() { + return Ok(Vec::new()); + } + const PREFIX: &str = "bytes="; + if !s.starts_with(PREFIX) { + return Err("invalid range"); + } + let mut ranges = Vec::new(); + for part in s[PREFIX.len()..].split(',') { + let part = part.trim(); + if part.is_empty() { + continue; + } + let Some(pos) = part.find('-') else { + return Err("invalid range"); + }; + let start_str = part[..pos].trim(); + let end_str = part[pos + 1..].trim(); + let mut r = HttpRange { + start: 0, + length: 0, + }; + if start_str.is_empty() { + let mut i = end_str.parse::().map_err(|_| "invalid range")?; + if i > size { + i = size; + } + r.start = size - i; + r.length = size - r.start; + } else { + let i = start_str.parse::().map_err(|_| "invalid range")?; + if i > size || i < 0 { + return Err("invalid range"); + } + r.start = i; + if end_str.is_empty() { + r.length = size - r.start; + } else { + let mut i = end_str.parse::().map_err(|_| "invalid range")?; + if r.start > i { + return Err("invalid range"); + } + if i >= size { + i = size - 1; + } + r.length = i - r.start + 1; + } + } + ranges.push(r); + } + Ok(ranges) +} + +fn sum_ranges_size(ranges: &[HttpRange]) -> i64 { + ranges.iter().map(|r| r.length).sum() +} + +fn range_content_range(r: HttpRange, total: i64) -> String { + format!("bytes {}-{}/{}", r.start, r.start + r.length - 1, total) +} + +fn range_error_response(mut headers: HeaderMap, msg: &str) -> Response { + if !headers.contains_key(header::CONTENT_TYPE) { + headers.insert( + header::CONTENT_TYPE, + "text/plain; charset=utf-8".parse().unwrap(), + ); + } + let mut response = Response::new(Body::from(msg.to_string())); + *response.status_mut() = StatusCode::RANGE_NOT_SATISFIABLE; + *response.headers_mut() = headers; + response +} + +fn handle_range_request( + range_str: &str, + data: &[u8], + mut headers: HeaderMap, + state: Option>, +) -> Response { + let total = data.len() as i64; + let ranges = match parse_range_header(range_str, total) { + Ok(r) => r, + Err(msg) => return range_error_response(headers, msg), + }; + + // Go's ProcessRangeRequest returns nil (empty body) for empty or oversized ranges + if ranges.is_empty() { + return (StatusCode::OK, headers).into_response(); + } + + if sum_ranges_size(&ranges) > total { + return (StatusCode::OK, headers).into_response(); + } + + if ranges.len() == 1 { + let r = ranges[0]; + headers.insert( + "Content-Range", + range_content_range(r, total).parse().unwrap(), + ); + headers.insert( + header::CONTENT_LENGTH, + r.length.max(0).to_string().parse().unwrap(), + ); + if r.length <= 0 { + return (StatusCode::PARTIAL_CONTENT, headers).into_response(); + } + let start = r.start as usize; + let end = (r.start + r.length) as usize; + let slice = &data[start..end]; + finalize_bytes_response(StatusCode::PARTIAL_CONTENT, headers, slice.to_vec(), state) + } else { + // Multi-range: build multipart/byteranges response + let boundary = "SeaweedFSBoundary"; + let content_type = headers + .get(header::CONTENT_TYPE) + .and_then(|v| v.to_str().ok()) + .unwrap_or("application/octet-stream") + .to_string(); + + let mut body = Vec::new(); + for (i, r) in ranges.iter().enumerate() { + // First boundary has no leading CRLF per RFC 2046 + if i == 0 { + body.extend_from_slice(format!("--{}\r\n", boundary).as_bytes()); + } else { + body.extend_from_slice(format!("\r\n--{}\r\n", boundary).as_bytes()); + } + body.extend_from_slice(format!("Content-Type: {}\r\n", content_type).as_bytes()); + body.extend_from_slice( + format!("Content-Range: {}\r\n\r\n", range_content_range(*r, total)).as_bytes(), + ); + if r.length > 0 { + let start = r.start as usize; + let end = (r.start + r.length) as usize; + body.extend_from_slice(&data[start..end]); + } + } + body.extend_from_slice(format!("\r\n--{}--\r\n", boundary).as_bytes()); + + headers.insert( + header::CONTENT_TYPE, + format!("multipart/byteranges; boundary={}", boundary) + .parse() + .unwrap(), + ); + if !headers.contains_key(header::CONTENT_ENCODING) { + headers.insert( + header::CONTENT_LENGTH, + body.len().to_string().parse().unwrap(), + ); + } + finalize_bytes_response(StatusCode::PARTIAL_CONTENT, headers, body, state) + } +} + +fn handle_range_request_from_source( + range_str: &str, + info: crate::storage::volume::NeedleStreamInfo, + mut headers: HeaderMap, + state: Option>, +) -> Response { + let total = info.data_size as i64; + let ranges = match parse_range_header(range_str, total) { + Ok(r) => r, + Err(msg) => return range_error_response(headers, msg), + }; + + if ranges.is_empty() { + return (StatusCode::OK, headers).into_response(); + } + + if sum_ranges_size(&ranges) > total { + return (StatusCode::OK, headers).into_response(); + } + + let read_slice = |start: i64, length: i64| -> Result, std::io::Error> { + if length <= 0 { + return Ok(Vec::new()); + } + let mut buf = vec![0u8; length as usize]; + info.source + .read_exact_at(&mut buf, info.data_file_offset + start as u64)?; + Ok(buf) + }; + + if ranges.len() == 1 { + let r = ranges[0]; + let slice = match read_slice(r.start, r.length) { + Ok(slice) => slice, + Err(err) => { + return ( + StatusCode::INTERNAL_SERVER_ERROR, + format!("range read error: {}", err), + ) + .into_response() + } + }; + headers.insert( + "Content-Range", + range_content_range(r, total).parse().unwrap(), + ); + headers.insert( + header::CONTENT_LENGTH, + slice.len().to_string().parse().unwrap(), + ); + return finalize_bytes_response(StatusCode::PARTIAL_CONTENT, headers, slice, state); + } + + let boundary = "SeaweedFSBoundary"; + let content_type = headers + .get(header::CONTENT_TYPE) + .and_then(|v| v.to_str().ok()) + .unwrap_or("application/octet-stream") + .to_string(); + + let mut body = Vec::new(); + for (i, r) in ranges.iter().enumerate() { + let slice = match read_slice(r.start, r.length) { + Ok(slice) => slice, + Err(err) => { + return ( + StatusCode::INTERNAL_SERVER_ERROR, + format!("range read error: {}", err), + ) + .into_response() + } + }; + if i == 0 { + body.extend_from_slice(format!("--{}\r\n", boundary).as_bytes()); + } else { + body.extend_from_slice(format!("\r\n--{}\r\n", boundary).as_bytes()); + } + body.extend_from_slice(format!("Content-Type: {}\r\n", content_type).as_bytes()); + body.extend_from_slice( + format!("Content-Range: {}\r\n\r\n", range_content_range(*r, total)).as_bytes(), + ); + body.extend_from_slice(&slice); + } + body.extend_from_slice(format!("\r\n--{}--\r\n", boundary).as_bytes()); + + headers.insert( + header::CONTENT_TYPE, + format!("multipart/byteranges; boundary={}", boundary) + .parse() + .unwrap(), + ); + if !headers.contains_key(header::CONTENT_ENCODING) { + headers.insert( + header::CONTENT_LENGTH, + body.len().to_string().parse().unwrap(), + ); + } + finalize_bytes_response(StatusCode::PARTIAL_CONTENT, headers, body, state) +} + +/// Extract filename from URL path like "/vid/fid/filename.ext" +fn extract_filename_from_path(path: &str) -> String { + let parts: Vec<&str> = path.trim_start_matches('/').split('/').collect(); + if parts.len() >= 3 { + parts[2].to_string() + } else { + String::new() + } +} + +fn path_base(path: &str) -> String { + let trimmed = path.trim_end_matches('/'); + trimmed + .rsplit('/') + .find(|s| !s.is_empty()) + .unwrap_or("") + .to_string() +} + +fn parse_go_bool(value: &str) -> Option { + match value { + "1" | "t" | "T" | "TRUE" | "True" | "true" => Some(true), + "0" | "f" | "F" | "FALSE" | "False" | "false" => Some(false), + _ => None, + } +} + +/// Format Content-Disposition header value per RFC 6266. +/// +/// Matches Go's `mime.FormatMediaType(dispositionType, map[string]string{"filename": filename})`: +/// - Simple ASCII names (alphanumeric, hyphen, underscore, dot): `attachment; filename=file.txt` +/// - ASCII names with spaces/special chars: `attachment; filename="my file.txt"` +/// - Non-ASCII names: `attachment; filename*=utf-8''percent-encoded-name` +fn format_content_disposition(disposition_type: &str, filename: &str) -> String { + let is_ascii = filename.bytes().all(|b| b.is_ascii()); + if is_ascii { + // Check if the filename is a simple "token" (no quoting needed). + // RFC 2616 token chars: any CHAR except CTLs or separators. + // Go's mime.FormatMediaType uses needsQuoting which checks for non-token chars. + let is_token = !filename.is_empty() + && filename.bytes().all(|b| { + b > 0x20 + && b < 0x7f + && !matches!( + b, + b'(' | b')' + | b'<' + | b'>' + | b'@' + | b',' + | b';' + | b':' + | b'\\' + | b'"' + | b'/' + | b'[' + | b']' + | b'?' + | b'=' + | b' ' + ) + }); + if is_token { + format!("{}; filename={}", disposition_type, filename) + } else { + // Quote the filename, escaping backslashes and quotes + let escaped = filename.replace('\\', "\\\\").replace('"', "\\\""); + format!("{}; filename=\"{}\"", disposition_type, escaped) + } + } else { + // Non-ASCII: use RFC 2231 encoding with filename* parameter + let encoded = percent_encode_rfc2231(filename); + format!("{}; filename*=utf-8''{}", disposition_type, encoded) + } +} + +/// Percent-encode a string for RFC 2231 filename* parameter. +/// Encodes all bytes except unreserved chars (ALPHA / DIGIT / "-" / "." / "_" / "~"). +fn percent_encode_rfc2231(s: &str) -> String { + let mut out = String::with_capacity(s.len() * 3); + for byte in s.bytes() { + if byte.is_ascii_alphanumeric() || matches!(byte, b'-' | b'.' | b'_' | b'~') { + out.push(byte as char); + } else { + out.push('%'); + out.push(char::from(HEX_UPPER[byte as usize >> 4])); + out.push(char::from(HEX_UPPER[byte as usize & 0x0f])); + } + } + out +} + +const HEX_UPPER: [u8; 16] = *b"0123456789ABCDEF"; + +// ============================================================================ +// Image processing helpers +// ============================================================================ + +fn is_image_resize_ext(ext: &str) -> bool { + matches!(ext, ".png" | ".jpg" | ".jpeg" | ".gif" | ".webp") +} + +/// Go's shouldCropImages only supports these four formats (no .webp). +fn is_image_crop_ext(ext: &str) -> bool { + matches!(ext, ".png" | ".jpg" | ".jpeg" | ".gif") +} + +fn extract_extension_from_path(path: &str) -> String { + let parts: Vec<&str> = path.trim_start_matches('/').split('/').collect(); + if parts.len() >= 3 { + // 3-segment path: /vid/fid/filename.ext + let filename = parts[2]; + if let Some(dot_pos) = filename.rfind('.') { + return filename[dot_pos..].to_lowercase(); + } + } else if parts.len() >= 1 { + // 2-segment path: /vid,fid.ext or /vid/fid.ext + // Go's parseURLPath extracts ext from the full path for all formats + let last = parts[parts.len() - 1]; + if let Some(dot_pos) = last.rfind('.') { + return last[dot_pos..].to_lowercase(); + } + } + String::new() +} + +fn maybe_resize_image(data: &[u8], ext: &str, query: &ReadQueryParams) -> Vec { + let width = query.width.unwrap_or(0); + let height = query.height.unwrap_or(0); + if width == 0 && height == 0 { + return data.to_vec(); + } + + let img = match image::load_from_memory(data) { + Ok(img) => img, + Err(_) => return data.to_vec(), + }; + + let (src_w, src_h) = (img.width(), img.height()); + // Only resize if source is larger than target + if (width == 0 || src_w <= width) && (height == 0 || src_h <= height) { + return data.to_vec(); + } + + let mode = query.mode.as_deref().unwrap_or(""); + let resized = match mode { + "fit" => img.resize(width, height, image::imageops::FilterType::Lanczos3), + "fill" => img.resize_to_fill(width, height, image::imageops::FilterType::Lanczos3), + _ => { + if width > 0 && height > 0 && width == height && src_w != src_h { + img.resize_to_fill(width, height, image::imageops::FilterType::Lanczos3) + } else { + img.resize(width, height, image::imageops::FilterType::Lanczos3) + } + } + }; + + encode_image(&resized, ext).unwrap_or_else(|| data.to_vec()) +} + +fn maybe_crop_image(data: &[u8], ext: &str, query: &ReadQueryParams) -> Vec { + let (x1, y1, x2, y2) = match (query.crop_x2, query.crop_y2) { + (Some(x2), Some(y2)) => { + let x1 = query.crop_x1.unwrap_or(0); + let y1 = query.crop_y1.unwrap_or(0); + if x2 > x1 && y2 > y1 { + (x1, y1, x2, y2) + } else { + return data.to_vec(); + } + } + _ => return data.to_vec(), + }; + + let img = match image::load_from_memory(data) { + Ok(img) => img, + Err(_) => return data.to_vec(), + }; + + let (src_w, src_h) = (img.width(), img.height()); + if x2 > src_w || y2 > src_h { + return data.to_vec(); + } + + let cropped = img.crop_imm(x1, y1, x2 - x1, y2 - y1); + encode_image(&cropped, ext).unwrap_or_else(|| data.to_vec()) +} + +fn encode_image(img: &image::DynamicImage, ext: &str) -> Option> { + use std::io::Cursor; + let mut buf = Cursor::new(Vec::new()); + let format = match ext { + ".png" => image::ImageFormat::Png, + ".jpg" | ".jpeg" => image::ImageFormat::Jpeg, + ".gif" => image::ImageFormat::Gif, + ".webp" => image::ImageFormat::WebP, + _ => return None, + }; + img.write_to(&mut buf, format).ok()?; + Some(buf.into_inner()) +} + +// ============================================================================ +// Write Handler (POST/PUT) +// ============================================================================ + +#[derive(Serialize)] +struct UploadResult { + #[serde(skip_serializing_if = "String::is_empty")] + name: String, + #[serde(skip_serializing_if = "is_zero_u32")] + size: u32, + #[serde(rename = "eTag", skip_serializing_if = "String::is_empty")] + etag: String, + #[serde(skip_serializing_if = "String::is_empty")] + mime: String, + #[serde(rename = "contentMd5", skip_serializing_if = "Option::is_none")] + content_md5: Option, +} + +fn is_zero_u32(v: &u32) -> bool { + *v == 0 +} + +pub async fn post_handler( + State(state): State>, + request: Request, +) -> Response { + let path = request.uri().path().to_string(); + let query = request.uri().query().unwrap_or("").to_string(); + let method = request.method().clone(); + let headers = request.headers().clone(); + let query_fields: Vec<(String, String)> = match serde_urlencoded::from_str(&query) { + Ok(fields) => fields, + Err(e) => { + // Go's r.ParseForm() returns 400 on malformed query strings + return json_error_with_query( + StatusCode::BAD_REQUEST, + &format!("form parse error: {}", e), + Some(&query), + ); + } + }; + + let (vid, needle_id, cookie) = match parse_url_path(&path) { + Some(parsed) => parsed, + None => { + return json_error_with_query(StatusCode::BAD_REQUEST, "invalid URL path", Some(&query)) + } + }; + + // JWT check for writes + let file_id = extract_file_id(&path); + let token = extract_jwt(&headers, request.uri()); + if let Err(_) = state + .guard + .read() + .unwrap() + .check_jwt_for_file(token.as_deref(), &file_id, true) + { + return json_error_with_query(StatusCode::UNAUTHORIZED, "wrong jwt", Some(&query)); + } + + // Upload throttling: check inflight bytes against limit + let is_replicate = query.split('&').any(|p| p == "type=replicate"); + let content_length = headers + .get(header::CONTENT_LENGTH) + .and_then(|v| v.to_str().ok()) + .and_then(|s| s.parse::().ok()) + .unwrap_or(0); + + if !is_replicate && state.concurrent_upload_limit > 0 { + // Wait for inflight bytes to drop below limit, or timeout + let timeout = if state.inflight_upload_data_timeout.is_zero() { + std::time::Duration::from_secs(2) + } else { + state.inflight_upload_data_timeout + }; + let deadline = tokio::time::Instant::now() + timeout; + + loop { + let current = state.inflight_upload_bytes.load(Ordering::Relaxed); + if current <= state.concurrent_upload_limit { + break; + } + // Go increments UploadLimitCond on every loop iteration (L184), + // not just on timeout. + metrics::HANDLER_COUNTER + .with_label_values(&[metrics::UPLOAD_LIMIT_COND]) + .inc(); + // Wait for notification or timeout + if tokio::time::timeout_at(deadline, state.upload_notify.notified()) + .await + .is_err() + { + return json_error_with_query( + StatusCode::TOO_MANY_REQUESTS, + "upload limit exceeded", + Some(&query), + ); + } + } + let new_val = state + .inflight_upload_bytes + .fetch_add(content_length, Ordering::Relaxed) + + content_length; + metrics::INFLIGHT_UPLOAD_SIZE.set(new_val); + } + + // RAII guard to release upload throttle on any exit path + let _upload_guard = if !is_replicate && state.concurrent_upload_limit > 0 { + Some(InflightGuard { + counter: &state.inflight_upload_bytes, + bytes: content_length, + notify: &state.upload_notify, + metric: &metrics::INFLIGHT_UPLOAD_SIZE, + }) + } else { + None + }; + + let content_type_str = headers + .get(header::CONTENT_TYPE) + .and_then(|v| v.to_str().ok()) + .unwrap_or("") + .to_string(); + + // Go only parses multipart form-data for POST requests with form-data content type. + let should_parse_multipart = method == Method::POST && content_type_str.contains("form-data"); + + // Validate multipart/form-data has a boundary + if should_parse_multipart && !content_type_str.contains("boundary=") { + return json_error_with_query( + StatusCode::BAD_REQUEST, + "no multipart boundary param in Content-Type", + Some(&query), + ); + } + + let content_md5 = headers + .get("Content-MD5") + .and_then(|v| v.to_str().ok()) + .map(|s| s.to_string()); + + // Read body + let body = match axum::body::to_bytes(request.into_body(), usize::MAX).await { + Ok(b) => b, + Err(e) => { + return json_error_with_query( + StatusCode::BAD_REQUEST, + format!("read body: {}", e), + Some(&query), + ) + } + }; + + // H5: Multipart form-data parsing + let ( + body_data_raw, + parsed_filename, + parsed_content_type, + parsed_content_encoding, + parsed_content_md5, + multipart_form_fields, + ) = if should_parse_multipart { + // Extract boundary from Content-Type + let boundary = content_type_str + .split(';') + .find_map(|part| { + let part = part.trim(); + if let Some(val) = part.strip_prefix("boundary=") { + Some(val.trim_matches('"').to_string()) + } else { + None + } + }) + .unwrap_or_default(); + + let mut multipart = multer::Multipart::new( + futures::stream::once(async { Ok::<_, std::io::Error>(body.clone()) }), + boundary, + ); + + let mut file_data: Option> = None; + let mut first_part_data: Option> = None; + let mut file_name: Option = None; + let mut file_content_type: Option = None; + let mut file_content_encoding: Option = None; + let mut file_content_md5: Option = None; + let mut form_fields = std::collections::HashMap::new(); + + while let Ok(Some(field)) = multipart.next_field().await { + let field_name = field.name().map(|s| s.to_string()); + let fname = field.file_name().map(clean_windows_path_base); + let fct = field.content_type().map(|m| m.to_string()); + let field_headers = field.headers().clone(); + let fce = field_headers + .get(header::CONTENT_ENCODING) + .and_then(|v| v.to_str().ok()) + .map(|s| s.to_string()); + let fmd5 = field_headers + .get("Content-MD5") + .and_then(|v| v.to_str().ok()) + .map(|s| s.to_string()); + + if let Ok(data) = field.bytes().await { + // Go reads the first part's data unconditionally, then looks for + // a part with a filename. If no part has a filename, Go uses the + // first part's data (with empty filename). + if first_part_data.is_none() { + first_part_data = Some(data.to_vec()); + } + if file_data.is_none() && fname.is_some() { + // Found a file field — use this part's data + file_data = Some(data.to_vec()); + file_name = fname; + file_content_type = fct; + file_content_encoding = fce; + file_content_md5 = fmd5; + } else if let Some(name) = field_name { + form_fields + .entry(name) + .or_insert_with(|| String::from_utf8_lossy(&data).to_string()); + } + } + } + + if let Some(data) = file_data { + ( + data, + file_name.unwrap_or_default(), + file_content_type, + file_content_encoding, + file_content_md5, + form_fields, + ) + } else if let Some(data) = first_part_data { + // No file field found, use first part's data (matching Go behavior) + (data, String::new(), None, None, None, form_fields) + } else { + // No parts at all + (Vec::new(), String::new(), None, None, None, form_fields) + } + } else { + ( + body.to_vec(), + String::new(), + None, + None, + None, + std::collections::HashMap::new(), + ) + }; + + let form_value = |name: &str| { + query_fields + .iter() + .find_map(|(k, v)| if k == name { Some(v.clone()) } else { None }) + .or_else(|| multipart_form_fields.get(name).cloned()) + }; + + // Check for chunk manifest flag. + // Go uses r.FormValue("cm"), which falls back to multipart fields when present. + let is_chunk_manifest = matches!( + form_value("cm").as_deref(), + Some("1" | "t" | "T" | "TRUE" | "True" | "true") + ); + + // Check file size limit (matches Go: "file over the limited %d bytes") + if state.file_size_limit_bytes > 0 && body_data_raw.len() as i64 > state.file_size_limit_bytes { + return json_error_with_query( + StatusCode::BAD_REQUEST, + format!( + "file over the limited {} bytes", + state.file_size_limit_bytes + ), + Some(&query), + ); + } + + // Check if upload is pre-compressed + let is_gzipped = if should_parse_multipart { + parsed_content_encoding.as_deref() == Some("gzip") + } else { + headers + .get(header::CONTENT_ENCODING) + .and_then(|v| v.to_str().ok()) + .map(|s| s == "gzip") + .unwrap_or(false) + }; + + let uncompressed_data = if is_gzipped { + maybe_decompress_gzip(&body_data_raw).unwrap_or_else(|| body_data_raw.clone()) + } else { + body_data_raw.clone() + }; + let original_data_size = uncompressed_data.len() as u32; + + // Only compute and validate Content-MD5 when the client provided one + // (Go only computes MD5 when Content-MD5 header/field is present) + let content_md5 = content_md5.or(parsed_content_md5); + let original_content_md5 = if content_md5.is_some() { + Some(compute_md5_base64(&uncompressed_data)) + } else { + None + }; + if let (Some(ref expected_md5), Some(ref actual_md5)) = (&content_md5, &original_content_md5) { + if expected_md5 != actual_md5 { + return json_error_with_query( + StatusCode::BAD_REQUEST, + format!( + "Content-MD5 did not match md5 of file data expected [{}] received [{}] size {}", + expected_md5, actual_md5, original_data_size + ), + Some(&query), + ); + } + } + + let now = std::time::SystemTime::now() + .duration_since(std::time::UNIX_EPOCH) + .unwrap_or_default() + .as_secs(); + + // Parse custom timestamp from query param + let ts_str = form_value("ts").unwrap_or_default(); + let last_modified = if !ts_str.is_empty() { + ts_str.parse::().unwrap_or(now) + } else { + now + }; + + // Prefer the multipart filename before deriving MIME and other metadata. + let filename = if !parsed_filename.is_empty() { + parsed_filename + } else if !should_parse_multipart { + headers + .get(header::CONTENT_DISPOSITION) + .and_then(|v| v.to_str().ok()) + .and_then(parse_content_disposition_filename) + .unwrap_or_else(|| path_base(&path)) + } else { + extract_filename_from_path(&path) + }; + + // Extract MIME type: prefer multipart-parsed content type, else from Content-Type header + let mime_type = if let Some(ref pct) = parsed_content_type { + pct.clone() + } else { + let multipart_fallback = + if should_parse_multipart && !filename.is_empty() && !is_chunk_manifest { + mime_guess::from_path(&filename) + .first() + .map(|m| m.to_string()) + .unwrap_or_default() + } else { + String::new() + }; + headers + .get(header::CONTENT_TYPE) + .and_then(|v| v.to_str().ok()) + .map(|ct| { + if should_parse_multipart && ct.starts_with("multipart/") { + multipart_fallback.clone() + } else { + ct.to_string() + } + }) + .unwrap_or(multipart_fallback) + }; + + // Parse TTL from query param (matches Go's r.FormValue("ttl")) + let ttl_str = form_value("ttl").unwrap_or_default(); + let ttl = if !ttl_str.is_empty() { + crate::storage::needle::TTL::read(&ttl_str).ok() + } else { + None + }; + + // Extract Seaweed-* custom metadata headers (pairs) + // Go's net/http canonicalizes header names to Title-Case, so after stripping + // the "Seaweed-" prefix, keys are Title-Case (e.g., "Foo-Bar"). Rust's http + // crate lowercases all header names, so we must convert the stripped key to + // Title-Case to match Go's behavior. + fn to_title_case(s: &str) -> String { + let mut result = String::with_capacity(s.len()); + let mut capitalize_next = true; + for c in s.chars() { + if c == '-' { + result.push('-'); + capitalize_next = true; + } else if capitalize_next { + for uc in c.to_uppercase() { + result.push(uc); + } + capitalize_next = false; + } else { + result.push(c); + } + } + result + } + let pair_map: std::collections::HashMap = headers + .iter() + .filter_map(|(k, v)| { + let key = k.as_str(); + if key.len() > 8 && key[..8].eq_ignore_ascii_case("seaweed-") { + if let Ok(val) = v.to_str() { + // Store with the prefix stripped and Title-Cased (matching Go's trimmedPairMap) + Some((to_title_case(&key[8..]), val.to_string())) + } else { + None + } + } else { + None + } + }) + .collect(); + + // Fix JPEG orientation from EXIF data before storing (matches Go behavior). + let body_data = if state.fix_jpg_orientation && crate::images::is_jpeg(&mime_type, &path) { + crate::images::fix_jpg_orientation(&body_data_raw) + } else { + body_data_raw + }; + + // Auto-compress compressible file types (matches Go's IsCompressableFileType). + // Only compress if not already gzipped and compression saves >10%. + // Go uses filepath.Base(pu.FileName) for extension detection (not the URL path). + let (final_data, final_is_gzipped) = if !is_gzipped && !is_chunk_manifest { + let ext = { + let dot_pos = filename.rfind('.'); + dot_pos + .map(|p| filename[p..].to_lowercase()) + .unwrap_or_default() + }; + if is_compressible_file_type(&ext, &mime_type) { + if let Some(compressed) = try_gzip_data(&body_data) { + if compressed.len() * 10 < body_data.len() * 9 { + (compressed, true) + } else { + (body_data, false) + } + } else { + (body_data, false) + } + } else { + (body_data, false) + } + } else { + (body_data, is_gzipped) + }; + + let mut n = Needle { + id: needle_id, + cookie, + data_size: final_data.len() as u32, + data: final_data, + last_modified: last_modified, + ..Needle::default() + }; + n.set_has_last_modified_date(); + if is_chunk_manifest { + n.set_is_chunk_manifest(); + } + if final_is_gzipped { + n.set_is_compressed(); + } + + // Go sets HasMime even for empty MIME types: if len(pu.MimeType) < 256 + if mime_type.len() < 256 { + n.mime = mime_type.as_bytes().to_vec(); + n.set_has_mime(); + } + + // Set TTL on needle + if let Some(ref t) = ttl { + if !t.is_empty() { + n.ttl = Some(*t); + n.set_has_ttl(); + } + } + + // Set pairs on needle + if !pair_map.is_empty() { + if let Ok(pairs_json) = serde_json::to_vec(&pair_map) { + if pairs_json.len() < 65536 { + n.pairs_size = pairs_json.len() as u16; + n.pairs = pairs_json; + n.set_has_pairs(); + } + } + } + + // Set filename on needle (matches Go: if len(pu.FileName) < 256) + // Go sets HasName even for empty filenames + if filename.len() < 256 { + n.name = filename.as_bytes().to_vec(); + n.name_size = filename.len() as u8; + n.set_has_name(); + } + + let write_result = if let Some(wq) = state.write_queue.get() { + wq.submit(vid, n.clone()).await + } else { + let mut store = state.store.write().unwrap(); + store.write_volume_needle(vid, &mut n) + }; + + // Replicate to remote volume servers if this volume has replicas. + // Matches Go's GetWritableRemoteReplications: skip if copy_count == 1. + if !is_replicate && write_result.is_ok() && !state.master_url.is_empty() { + let needs_replication = { + let store = state.store.read().unwrap(); + store.find_volume(vid).map_or(false, |(_, v)| { + v.super_block.replica_placement.get_copy_count() > 1 + }) + }; + if needs_replication { + let state_clone = state.clone(); + let path_clone = path.clone(); + let query_clone = query.clone(); + let headers_clone = headers.clone(); + let body_clone = body.clone(); + let replication = tokio::spawn(async move { + do_replicated_request( + &state_clone, + vid.0, + Method::POST, + &path_clone, + &query_clone, + &headers_clone, + Some(body_clone), + ) + .await + }); + let replication_result = replication + .await + .map_err(|e| format!("replication task failed: {}", e)) + .and_then(|result| result); + if let Err(e) = replication_result { + tracing::error!("replicated write failed: {}", e); + return json_error_with_query( + StatusCode::INTERNAL_SERVER_ERROR, + format!("replication failed: {}", e), + Some(&query), + ); + } + } + } + + let resp = match write_result { + Ok((_offset, _size, is_unchanged)) => { + if is_unchanged { + let etag = format!("\"{}\"", n.etag()); + (StatusCode::NO_CONTENT, [(header::ETAG, etag)]).into_response() + } else { + // Go only includes contentMd5 when the client provided Content-MD5 + let result = UploadResult { + name: if n.has_name() { + filename.clone() + } else { + String::new() + }, + size: original_data_size, // H3: use original size, not compressed + etag: n.etag(), + mime: mime_type.clone(), + content_md5: original_content_md5.clone(), + }; + let etag = n.etag(); + let etag_header = if etag.starts_with('"') { + etag.clone() + } else { + format!("\"{}\"", etag) + }; + let mut resp = json_result_with_query(StatusCode::CREATED, &result, &query); + resp.headers_mut() + .insert(header::ETAG, etag_header.parse().unwrap()); + if let Some(ref md5_value) = original_content_md5 { + resp.headers_mut() + .insert("Content-MD5", md5_value.parse().unwrap()); + } + resp + } + } + Err(e) => { + metrics::HANDLER_COUNTER + .with_label_values(&[metrics::ERROR_WRITE_TO_LOCAL_DISK]) + .inc(); + json_error_with_query( + StatusCode::INTERNAL_SERVER_ERROR, + format!("{}", e), + Some(&query), + ) + } + }; + + // _upload_guard drops here, releasing inflight bytes + resp +} + +// ============================================================================ +// Delete Handler +// ============================================================================ + +#[derive(Serialize)] +struct DeleteResult { + size: i64, +} + +pub async fn delete_handler( + State(state): State>, + request: Request, +) -> Response { + let path = request.uri().path().to_string(); + let del_query = request.uri().query().unwrap_or("").to_string(); + let del_params: ReadQueryParams = serde_urlencoded::from_str(&del_query).unwrap_or_default(); + let headers = request.headers().clone(); + + let (vid, needle_id, cookie) = match parse_url_path(&path) { + Some(parsed) => parsed, + None => { + return json_error_with_query( + StatusCode::BAD_REQUEST, + "invalid URL path", + Some(&del_query), + ) + } + }; + + // JWT check for writes (deletes use write key) + let file_id = extract_file_id(&path); + let token = extract_jwt(&headers, request.uri()); + if let Err(_) = state + .guard + .read() + .unwrap() + .check_jwt_for_file(token.as_deref(), &file_id, true) + { + return json_error_with_query(StatusCode::UNAUTHORIZED, "wrong jwt", Some(&del_query)); + } + + // Check for EC volume first (Go checks hasEcVolume before regular volume in DeleteHandler). + // Go's flow: FindEcVolume -> DeleteEcShardNeedle(ecVolume, n, cookie) -> writeDeleteResult + // DeleteEcShardNeedle: reads needle (for size + cookie validation), validates cookie, journals delete. + { + let has_ec = state.store.read().unwrap().has_ec_volume(vid); + if has_ec { + // Step 1: Read the EC needle to get its size and validate cookie + let ec_read_result = { + let store = state.store.read().unwrap(); + store + .find_ec_volume(vid) + .map(|ecv| ecv.read_ec_shard_needle(needle_id)) + }; + match ec_read_result { + Some(Ok(Some(ec_needle))) => { + // Step 2: Validate cookie (Go: cookie != 0 && cookie != n.Cookie) + if cookie.0 != 0 && ec_needle.cookie != cookie { + return json_error_with_query( + StatusCode::INTERNAL_SERVER_ERROR, + format!("Deletion Failed: unexpected cookie {:x}", cookie.0), + Some(&del_query), + ); + } + let count = ec_needle.data_size as i64; + // Step 3: Journal the delete + let mut store = state.store.write().unwrap(); + if let Some(ecv) = store.find_ec_volume_mut(vid) { + if let Err(e) = ecv.journal_delete(needle_id) { + return json_error_with_query( + StatusCode::INTERNAL_SERVER_ERROR, + format!("Deletion Failed: {}", e), + Some(&del_query), + ); + } + } + let result = DeleteResult { size: count }; + return json_response_with_params( + StatusCode::ACCEPTED, + &result, + Some(&del_params), + ); + } + Some(Ok(None)) => { + // Needle not found in EC volume + let result = DeleteResult { size: 0 }; + return json_response_with_params( + StatusCode::NOT_FOUND, + &result, + Some(&del_params), + ); + } + Some(Err(e)) => { + return json_error_with_query( + StatusCode::INTERNAL_SERVER_ERROR, + format!("Deletion Failed: {}", e), + Some(&del_query), + ); + } + None => { + // EC volume disappeared between has_ec check and find + let result = DeleteResult { size: 0 }; + return json_response_with_params( + StatusCode::NOT_FOUND, + &result, + Some(&del_params), + ); + } + } + } + } + + // H9: Parse custom timestamp from query param; default to now (not 0) + let del_ts_str = del_query + .split('&') + .find_map(|p| p.strip_prefix("ts=")) + .unwrap_or(""); + let del_last_modified = if !del_ts_str.is_empty() { + del_ts_str.parse::().unwrap_or_else(|_| { + std::time::SystemTime::now() + .duration_since(std::time::UNIX_EPOCH) + .unwrap_or_default() + .as_secs() + }) + } else { + std::time::SystemTime::now() + .duration_since(std::time::UNIX_EPOCH) + .unwrap_or_default() + .as_secs() + }; + + let mut n = Needle { + id: needle_id, + cookie, + ..Needle::default() + }; + + // Read needle first to validate cookie (matching Go behavior) + let original_cookie = cookie; + { + let store = state.store.read().unwrap(); + match store.read_volume_needle(vid, &mut n) { + Ok(_) => {} + Err(_) => { + let result = DeleteResult { size: 0 }; + return json_response_with_params( + StatusCode::NOT_FOUND, + &result, + Some(&del_params), + ); + } + } + } + if n.cookie != original_cookie { + return json_error_with_query( + StatusCode::BAD_REQUEST, + "File Random Cookie does not match.", + Some(&del_query), + ); + } + + // Apply custom timestamp (always set — defaults to now per H9) + n.last_modified = del_last_modified; + n.set_has_last_modified_date(); + + let mut delete_size_override = None; + + // If this is a chunk manifest, delete child chunks first + if n.is_chunk_manifest() { + let manifest_data = if n.is_compressed() { + use flate2::read::GzDecoder; + use std::io::Read as _; + let mut decoder = GzDecoder::new(&n.data[..]); + let mut decompressed = Vec::new(); + if decoder.read_to_end(&mut decompressed).is_ok() { + decompressed + } else { + n.data.clone() + } + } else { + n.data.clone() + }; + + let manifest = match serde_json::from_slice::(&manifest_data) { + Ok(manifest) => manifest, + Err(e) => { + return json_error_with_query( + StatusCode::INTERNAL_SERVER_ERROR, + format!("Load chunks manifest error: {}", e), + Some(&del_query), + ); + } + }; + + let child_fids: Vec = manifest + .chunks + .iter() + .map(|chunk| chunk.fid.clone()) + .collect(); + if let Err(e) = batch_delete_file_ids(&state, &child_fids).await { + return json_error_with_query( + StatusCode::INTERNAL_SERVER_ERROR, + format!("Delete chunks error: {}", e), + Some(&del_query), + ); + } + delete_size_override = Some(manifest.size as i64); + } + + let delete_result = { + let mut store = state.store.write().unwrap(); + store.delete_volume_needle(vid, &mut n) + }; + + let is_replicate = del_query.split('&').any(|p| p == "type=replicate"); + if !is_replicate && delete_result.is_ok() && !state.master_url.is_empty() { + let needs_replication = { + let store = state.store.read().unwrap(); + store.find_volume(vid).map_or(false, |(_, v)| { + v.super_block.replica_placement.get_copy_count() > 1 + }) + }; + if needs_replication { + if let Err(e) = do_replicated_request( + &state, + vid.0, + Method::DELETE, + &path, + &del_query, + &headers, + None, + ) + .await + { + tracing::error!("replicated delete failed: {}", e); + return json_error_with_query( + StatusCode::INTERNAL_SERVER_ERROR, + format!("replication failed: {}", e), + Some(&del_query), + ); + } + } + } + + match delete_result { + Ok(size) => { + let result = DeleteResult { + size: delete_size_override.unwrap_or(size.0 as i64), + }; + json_response_with_params(StatusCode::ACCEPTED, &result, Some(&del_params)) + } + Err(crate::storage::volume::VolumeError::NotFound) => { + let result = DeleteResult { size: 0 }; + json_response_with_params(StatusCode::NOT_FOUND, &result, Some(&del_params)) + } + Err(e) => json_error_with_query( + StatusCode::INTERNAL_SERVER_ERROR, + format!("Deletion Failed: {}", e), + Some(&del_query), + ), + } +} + +// ============================================================================ +// Status Handler +// ============================================================================ + +pub async fn status_handler( + Query(params): Query, + State(state): State>, +) -> Response { + let store = state.store.read().unwrap(); + let mut volumes = Vec::new(); + + for loc in &store.locations { + for (_vid, vol) in loc.volumes() { + let mut vol_info = serde_json::Map::new(); + vol_info.insert("Id".to_string(), serde_json::Value::from(vol.id.0)); + vol_info.insert( + "Collection".to_string(), + serde_json::Value::from(vol.collection.clone()), + ); + vol_info.insert( + "Size".to_string(), + serde_json::Value::from(vol.content_size()), + ); + vol_info.insert( + "FileCount".to_string(), + serde_json::Value::from(vol.file_count()), + ); + vol_info.insert( + "DeleteCount".to_string(), + serde_json::Value::from(vol.deleted_count()), + ); + vol_info.insert( + "DeletedByteCount".to_string(), + serde_json::Value::from(vol.deleted_size()), + ); + vol_info.insert( + "ReadOnly".to_string(), + serde_json::Value::from(vol.is_read_only()), + ); + vol_info.insert( + "Version".to_string(), + serde_json::Value::from(vol.version().0), + ); + vol_info.insert( + "CompactRevision".to_string(), + serde_json::Value::from(vol.super_block.compaction_revision), + ); + vol_info.insert( + "ModifiedAtSecond".to_string(), + serde_json::Value::from(vol.last_modified_ts()), + ); + vol_info.insert( + "DiskType".to_string(), + serde_json::Value::from(loc.disk_type.to_string()), + ); + + let replica = &vol.super_block.replica_placement; + let mut replica_value = serde_json::Map::new(); + if replica.diff_data_center_count > 0 { + replica_value.insert( + "dc".to_string(), + serde_json::Value::from(replica.diff_data_center_count), + ); + } + if replica.diff_rack_count > 0 { + replica_value.insert( + "rack".to_string(), + serde_json::Value::from(replica.diff_rack_count), + ); + } + if replica.same_rack_count > 0 { + replica_value.insert( + "node".to_string(), + serde_json::Value::from(replica.same_rack_count), + ); + } + vol_info.insert( + "ReplicaPlacement".to_string(), + serde_json::Value::Object(replica_value), + ); + + let ttl = vol.super_block.ttl; + let mut ttl_value = serde_json::Map::new(); + if ttl.count > 0 { + ttl_value.insert("Count".to_string(), serde_json::Value::from(ttl.count)); + } + if ttl.unit > 0 { + ttl_value.insert("Unit".to_string(), serde_json::Value::from(ttl.unit)); + } + vol_info.insert("Ttl".to_string(), serde_json::Value::Object(ttl_value)); + + let (remote_storage_name, remote_storage_key) = vol.remote_storage_name_key(); + vol_info.insert( + "RemoteStorageName".to_string(), + serde_json::Value::from(remote_storage_name), + ); + vol_info.insert( + "RemoteStorageKey".to_string(), + serde_json::Value::from(remote_storage_key), + ); + volumes.push(serde_json::Value::Object(vol_info)); + } + } + volumes.sort_by(|a, b| { + let left = a.get("Id").and_then(|v| v.as_u64()).unwrap_or_default(); + let right = b.get("Id").and_then(|v| v.as_u64()).unwrap_or_default(); + left.cmp(&right) + }); + + let mut m = serde_json::Map::new(); + m.insert( + "Version".to_string(), + serde_json::Value::from(crate::version::version()), + ); + m.insert("Volumes".to_string(), serde_json::Value::Array(volumes)); + m.insert( + "DiskStatuses".to_string(), + serde_json::Value::Array(build_disk_statuses(&store)), + ); + json_response_with_params(StatusCode::OK, &serde_json::Value::Object(m), Some(¶ms)) +} + +// ============================================================================ +// Health Check Handler +// ============================================================================ + +pub async fn healthz_handler(State(state): State>) -> Response { + // Go's healthzHandler returns only status codes with no body text. + let is_stopping = *state.is_stopping.read().unwrap(); + if is_stopping { + return StatusCode::SERVICE_UNAVAILABLE.into_response(); + } + // If not heartbeating, return 503 (matches Go health check behavior) + if !state.is_heartbeating.load(Ordering::Relaxed) { + return StatusCode::SERVICE_UNAVAILABLE.into_response(); + } + StatusCode::OK.into_response() +} + +// ============================================================================ +// Metrics Handler +// ============================================================================ + +pub async fn metrics_handler() -> Response { + let body = metrics::gather_metrics(); + ( + StatusCode::OK, + [( + header::CONTENT_TYPE, + "text/plain; version=0.0.4; charset=utf-8", + )], + body, + ) + .into_response() +} + +// ============================================================================ +// Stats Handlers +// ============================================================================ + +pub async fn stats_counter_handler(Query(params): Query) -> Response { + let payload = serde_json::json!({ + "Version": crate::version::version(), + "Counters": super::server_stats::snapshot(), + }); + json_response_with_params(StatusCode::OK, &payload, Some(¶ms)) +} + +pub async fn stats_memory_handler(Query(params): Query) -> Response { + let mem = super::memory_status::collect_mem_status(); + let payload = serde_json::json!({ + "Version": crate::version::version(), + "Memory": { + "goroutines": mem.goroutines, + "all": mem.all, + "used": mem.used, + "free": mem.free, + "self": mem.self_, + "heap": mem.heap, + "stack": mem.stack, + }, + }); + json_response_with_params(StatusCode::OK, &payload, Some(¶ms)) +} + +pub async fn stats_disk_handler( + Query(params): Query, + State(state): State>, +) -> Response { + let store = state.store.read().unwrap(); + let payload = serde_json::json!({ + "Version": crate::version::version(), + "DiskStatuses": build_disk_statuses(&store), + }); + json_response_with_params(StatusCode::OK, &payload, Some(¶ms)) +} + +// ============================================================================ +// Static Asset Handlers +// ============================================================================ + +pub async fn favicon_handler() -> Response { + let asset = super::ui::favicon_asset(); + ( + StatusCode::OK, + [(header::CONTENT_TYPE, asset.content_type)], + asset.bytes, + ) + .into_response() +} + +pub async fn static_asset_handler(Path(path): Path) -> Response { + match super::ui::lookup_static_asset(&path) { + Some(asset) => ( + StatusCode::OK, + [(header::CONTENT_TYPE, asset.content_type)], + asset.bytes, + ) + .into_response(), + None => StatusCode::NOT_FOUND.into_response(), + } +} + +pub async fn ui_handler(State(state): State>) -> Response { + let html = super::ui::render_volume_server_html(&state); + ( + StatusCode::OK, + [(header::CONTENT_TYPE, "text/html; charset=utf-8")], + html, + ) + .into_response() +} + +// ============================================================================ +// Chunk Manifest +// ============================================================================ + +#[derive(Deserialize)] +#[allow(dead_code)] +struct ChunkManifest { + #[serde(default)] + name: String, + #[serde(default)] + mime: String, + #[serde(default)] + size: i64, + #[serde(default)] + chunks: Vec, +} + +#[derive(Deserialize)] +struct ChunkInfo { + fid: String, + offset: i64, + #[allow(dead_code)] + size: i64, +} + +/// Try to expand a chunk manifest needle. Returns None if manifest can't be parsed. +fn try_expand_chunk_manifest( + state: &Arc, + n: &Needle, + _headers: &HeaderMap, + method: &Method, + path: &str, + query: &ReadQueryParams, + etag: &str, + last_modified_str: &Option, +) -> Option { + let data = if n.is_compressed() { + use flate2::read::GzDecoder; + use std::io::Read as _; + let mut decoder = GzDecoder::new(&n.data[..]); + let mut decompressed = Vec::new(); + if decoder.read_to_end(&mut decompressed).is_err() { + return None; + } + decompressed + } else { + n.data.clone() + }; + + let manifest: ChunkManifest = match serde_json::from_slice(&data) { + Ok(m) => m, + Err(_) => return None, + }; + + // Read and concatenate all chunks + let mut result = vec![0u8; manifest.size as usize]; + let store = state.store.read().unwrap(); + for chunk in &manifest.chunks { + let (chunk_vid, chunk_nid, chunk_cookie) = match parse_url_path(&chunk.fid) { + Some(p) => p, + None => { + return Some( + ( + StatusCode::INTERNAL_SERVER_ERROR, + format!("invalid chunk fid: {}", chunk.fid), + ) + .into_response(), + ) + } + }; + let mut chunk_needle = Needle { + id: chunk_nid, + cookie: chunk_cookie, + ..Needle::default() + }; + match store.read_volume_needle(chunk_vid, &mut chunk_needle) { + Ok(_) => {} + Err(e) => { + return Some( + ( + StatusCode::INTERNAL_SERVER_ERROR, + format!("read chunk {}: {}", chunk.fid, e), + ) + .into_response(), + ) + } + } + let chunk_data = if chunk_needle.is_compressed() { + use flate2::read::GzDecoder; + use std::io::Read as _; + let mut decoder = GzDecoder::new(&chunk_needle.data[..]); + let mut decompressed = Vec::new(); + if decoder.read_to_end(&mut decompressed).is_ok() { + decompressed + } else { + chunk_needle.data.clone() + } + } else { + chunk_needle.data.clone() + }; + let offset = chunk.offset as usize; + let end = std::cmp::min(offset + chunk_data.len(), result.len()); + let copy_len = end - offset; + if copy_len > 0 { + result[offset..offset + copy_len].copy_from_slice(&chunk_data[..copy_len]); + } + } + + // Determine filename: URL path filename, then manifest name + // (Go's tryHandleChunkedFile does NOT fall back to needle name) + let mut filename = extract_filename_from_path(path); + if filename.is_empty() && !manifest.name.is_empty() { + filename = manifest.name.clone(); + } + + // Determine MIME type: manifest mime, but fall back to extension detection + // if empty or application/octet-stream (matching Go behavior) + let content_type = { + let mime_str = &manifest.mime; + if !mime_str.is_empty() && !mime_str.starts_with("application/octet-stream") { + mime_str.clone() + } else { + // Try to detect from filename extension + let ext = if !filename.is_empty() { + if let Some(dot_pos) = filename.rfind('.') { + filename[dot_pos..].to_lowercase() + } else { + String::new() + } + } else { + String::new() + }; + if !ext.is_empty() { + mime_guess::from_ext(ext.trim_start_matches('.')) + .first() + .map(|m| m.to_string()) + .unwrap_or_else(|| "application/octet-stream".to_string()) + } else if !mime_str.is_empty() { + mime_str.clone() + } else { + "application/octet-stream".to_string() + } + } + }; + + let mut response_headers = HeaderMap::new(); + // Preserve ETag from the needle (matches Go: ETag is set before tryHandleChunkedFile) + if let Ok(etag_val) = etag.parse() { + response_headers.insert(header::ETAG, etag_val); + } + response_headers.insert(header::CONTENT_TYPE, content_type.parse().unwrap()); + response_headers.insert("X-File-Store", "chunked".parse().unwrap()); + response_headers.insert(header::ACCEPT_RANGES, "bytes".parse().unwrap()); + + // Last-Modified — Go sets this on the response writer before tryHandleChunkedFile + if let Some(ref lm) = last_modified_str { + if let Ok(hval) = lm.parse() { + response_headers.insert(header::LAST_MODIFIED, hval); + } + } + + // Pairs — Go sets needle pairs on the response writer before tryHandleChunkedFile + if n.has_pairs() && !n.pairs.is_empty() { + if let Ok(pair_map) = + serde_json::from_slice::>(&n.pairs) + { + for (k, v) in &pair_map { + if let (Ok(hname), Ok(hval)) = ( + axum::http::HeaderName::from_bytes(k.as_bytes()), + axum::http::HeaderValue::from_str(v), + ) { + response_headers.insert(hname, hval); + } + } + } + } + + // S3 response passthrough headers — Go sets these via AdjustPassthroughHeaders + if let Some(ref cc) = query.response_cache_control { + if let Ok(hval) = cc.parse() { + response_headers.insert(header::CACHE_CONTROL, hval); + } + } + if let Some(ref ce) = query.response_content_encoding { + if let Ok(hval) = ce.parse() { + response_headers.insert(header::CONTENT_ENCODING, hval); + } + } + if let Some(ref exp) = query.response_expires { + if let Ok(hval) = exp.parse() { + response_headers.insert(header::EXPIRES, hval); + } + } + if let Some(ref cl) = query.response_content_language { + if let Ok(hval) = cl.parse() { + response_headers.insert("Content-Language", hval); + } + } + if let Some(ref cd) = query.response_content_disposition { + if let Ok(hval) = cd.parse() { + response_headers.insert(header::CONTENT_DISPOSITION, hval); + } + } + + // Content-Disposition + if !filename.is_empty() { + let disposition_type = if let Some(ref dl_val) = query.dl { + if parse_go_bool(dl_val).unwrap_or(false) { + "attachment" + } else { + "inline" + } + } else { + "inline" + }; + let disposition = format_content_disposition(disposition_type, &filename); + if let Ok(hval) = disposition.parse() { + response_headers.insert(header::CONTENT_DISPOSITION, hval); + } + } + + // Go's tryHandleChunkedFile applies crop then resize to expanded chunk data + // (L344-345: conditionallyCropImages, conditionallyResizeImages). + let cm_ext = if !filename.is_empty() { + if let Some(dot_pos) = filename.rfind('.') { + filename[dot_pos..].to_lowercase() + } else { + String::new() + } + } else { + String::new() + }; + let mut result = result; + if is_image_crop_ext(&cm_ext) { + result = maybe_crop_image(&result, &cm_ext, query); + } + if is_image_resize_ext(&cm_ext) { + result = maybe_resize_image(&result, &cm_ext, query); + } + + if *method == Method::HEAD { + response_headers.insert( + header::CONTENT_LENGTH, + result.len().to_string().parse().unwrap(), + ); + return Some((StatusCode::OK, response_headers).into_response()); + } + + Some((StatusCode::OK, response_headers, result).into_response()) +} + +// ============================================================================ +// Helpers +// ============================================================================ + +fn absolute_display_path(path: &str) -> String { + let p = std::path::Path::new(path); + if p.is_absolute() { + return path.to_string(); + } + std::env::current_dir() + .map(|cwd| cwd.join(p).to_string_lossy().to_string()) + .unwrap_or_else(|_| path.to_string()) +} + +fn build_disk_statuses(store: &crate::storage::store::Store) -> Vec { + let mut disk_statuses = Vec::new(); + for loc in &store.locations { + let resolved_dir = absolute_display_path(&loc.directory); + let (all, free) = crate::storage::disk_location::get_disk_stats(&resolved_dir); + let used = all.saturating_sub(free); + let percent_free = if all > 0 { + (free as f64 / all as f64) * 100.0 + } else { + 0.0 + }; + let percent_used = if all > 0 { + (used as f64 / all as f64) * 100.0 + } else { + 0.0 + }; + + // Match Go encoding/json on protobuf struct (snake_case json tags) + disk_statuses.push(serde_json::json!({ + "dir": resolved_dir, + "all": all, + "used": used, + "free": free, + "percent_free": percent_free, + "percent_used": percent_used, + "disk_type": loc.disk_type.to_string(), + })); + } + disk_statuses +} + +/// Serialize to JSON with 1-space indent (matches Go's `json.MarshalIndent(obj, "", " ")`). +fn to_pretty_json(value: &T) -> String { + let mut buf = Vec::new(); + let formatter = serde_json::ser::PrettyFormatter::with_indent(b" "); + let mut ser = serde_json::Serializer::with_formatter(&mut buf, formatter); + value.serialize(&mut ser).unwrap(); + String::from_utf8(buf).unwrap() +} + +fn json_response_with_params( + status: StatusCode, + body: &T, + params: Option<&ReadQueryParams>, +) -> Response { + let is_pretty = params + .and_then(|params| params.pretty.as_ref()) + .is_some_and(|value| !value.is_empty()); + let callback = params + .and_then(|params| params.callback.as_ref()) + .filter(|value| !value.is_empty()) + .cloned(); + + let json_body = if is_pretty { + to_pretty_json(body) + } else { + serde_json::to_string(body).unwrap() + }; + + if let Some(callback) = callback { + Response::builder() + .status(status) + .header(header::CONTENT_TYPE, "application/javascript") + .body(Body::from(format!("{}({})", callback, json_body))) + .unwrap() + } else { + Response::builder() + .status(status) + .header(header::CONTENT_TYPE, "application/json") + .body(Body::from(json_body)) + .unwrap() + } +} + +/// Return a JSON error response with optional query string for pretty/JSONP support. +/// Supports `?pretty=` for pretty-printed JSON and `?callback=fn` for JSONP, +/// matching Go's writeJsonError behavior. +pub(super) fn json_error_with_query( + status: StatusCode, + msg: impl Into, + query: Option<&str>, +) -> Response { + let body = serde_json::json!({"error": msg.into()}); + + let (is_pretty, callback) = if let Some(q) = query { + let pretty = q + .split('&') + .any(|p| p.starts_with("pretty=") && p.len() > "pretty=".len()); + let cb = q + .split('&') + .find_map(|p| p.strip_prefix("callback=")) + .map(|s| s.to_string()); + (pretty, cb) + } else { + (false, None) + }; + + let json_body = if is_pretty { + to_pretty_json(&body) + } else { + serde_json::to_string(&body).unwrap() + }; + + if let Some(cb) = callback { + let jsonp = format!("{}({})", cb, json_body); + Response::builder() + .status(status) + .header(header::CONTENT_TYPE, "application/javascript") + .body(Body::from(jsonp)) + .unwrap() + } else { + Response::builder() + .status(status) + .header(header::CONTENT_TYPE, "application/json") + .body(Body::from(json_body)) + .unwrap() + } +} + +/// Return a JSON response with optional pretty/JSONP support from raw query string. +/// Matches Go's writeJsonQuiet behavior for write success responses. +fn json_result_with_query(status: StatusCode, body: &T, query: &str) -> Response { + let (is_pretty, callback) = { + let pretty = query + .split('&') + .any(|p| p.starts_with("pretty=") && p.len() > "pretty=".len()); + let cb = query + .split('&') + .find_map(|p| p.strip_prefix("callback=")) + .map(|s| s.to_string()); + (pretty, cb) + }; + + let json_body = if is_pretty { + to_pretty_json(body) + } else { + serde_json::to_string(body).unwrap() + }; + + if let Some(cb) = callback { + let jsonp = format!("{}({})", cb, json_body); + Response::builder() + .status(status) + .header(header::CONTENT_TYPE, "application/javascript") + .body(Body::from(jsonp)) + .unwrap() + } else { + Response::builder() + .status(status) + .header(header::CONTENT_TYPE, "application/json") + .body(Body::from(json_body)) + .unwrap() + } +} + +/// Extract JWT token from query param, Authorization header, or Cookie. +/// Query param takes precedence over header, header over cookie. +fn extract_jwt(headers: &HeaderMap, uri: &axum::http::Uri) -> Option { + // 1. Check ?jwt= query parameter + if let Some(query) = uri.query() { + for pair in query.split('&') { + if let Some(value) = pair.strip_prefix("jwt=") { + if !value.is_empty() { + return Some(value.to_string()); + } + } + } + } + + // 2. Check Authorization: Bearer (case-insensitive prefix) + if let Some(auth) = headers.get(header::AUTHORIZATION) { + if let Ok(auth_str) = auth.to_str() { + if auth_str.len() > 7 && auth_str[..7].eq_ignore_ascii_case("bearer ") { + return Some(auth_str[7..].to_string()); + } + } + } + + // 3. Check Cookie + if let Some(cookie_header) = headers.get(header::COOKIE) { + if let Ok(cookie_str) = cookie_header.to_str() { + for cookie in cookie_str.split(';') { + let cookie = cookie.trim(); + if let Some(value) = cookie.strip_prefix("AT=") { + if !value.is_empty() { + return Some(value.to_string()); + } + } + } + } + } + + None +} + +// ============================================================================ +// Auto-compression helpers (matches Go's util.IsCompressableFileType) +// ============================================================================ + +/// Check if a file type should be compressed based on extension and MIME type. +/// Returns true only when we are sure the type is compressible. +fn is_compressible_file_type(ext: &str, mtype: &str) -> bool { + // text/* + if mtype.starts_with("text/") { + return true; + } + // Compressible image/audio formats + match ext { + ".svg" | ".bmp" | ".wav" => return true, + _ => {} + } + // Most image/* formats are already compressed + if mtype.starts_with("image/") { + return false; + } + // By file extension + match ext { + ".zip" | ".rar" | ".gz" | ".bz2" | ".xz" | ".zst" | ".br" => return false, + ".pdf" | ".txt" | ".html" | ".htm" | ".css" | ".js" | ".json" => return true, + ".php" | ".java" | ".go" | ".rb" | ".c" | ".cpp" | ".h" | ".hpp" => return true, + ".png" | ".jpg" | ".jpeg" => return false, + _ => {} + } + // By MIME type + if mtype.starts_with("application/") { + if mtype.ends_with("zstd") { + return false; + } + if mtype.ends_with("xml") { + return true; + } + if mtype.ends_with("script") { + return true; + } + if mtype.ends_with("vnd.rar") { + return false; + } + } + if mtype.starts_with("audio/") { + let sub = mtype.strip_prefix("audio/").unwrap_or(""); + if matches!(sub, "wave" | "wav" | "x-wav" | "x-pn-wav") { + return true; + } + } + false +} + +/// Try to gzip data. Returns None on error. +fn try_gzip_data(data: &[u8]) -> Option> { + use flate2::write::GzEncoder; + use flate2::Compression; + use std::io::Write; + let mut encoder = GzEncoder::new(Vec::new(), Compression::default()); + encoder.write_all(data).ok()?; + encoder.finish().ok() +} + +fn maybe_decompress_gzip(data: &[u8]) -> Option> { + use flate2::read::GzDecoder; + use std::io::Read; + let mut decoder = GzDecoder::new(data); + let mut decompressed = Vec::new(); + decoder.read_to_end(&mut decompressed).ok()?; + Some(decompressed) +} + +fn compute_md5_base64(data: &[u8]) -> String { + use base64::Engine; + use md5::{Digest, Md5}; + let mut hasher = Md5::new(); + hasher.update(data); + base64::engine::general_purpose::STANDARD.encode(hasher.finalize()) +} + +fn clean_windows_path_base(value: &str) -> String { + let cleaned = value.replace('\\', "/"); + cleaned.rsplit('/').next().unwrap_or(&cleaned).to_string() +} + +fn parse_content_disposition_filename(value: &str) -> Option { + let mut filename: Option = None; + let mut name: Option = None; + + for segment in value.split(';') { + let segment = segment.trim(); + if segment.is_empty() { + continue; + } + let lower = segment.to_ascii_lowercase(); + if lower.starts_with("filename=") { + let raw = segment[9..].trim(); + let trimmed = raw + .strip_prefix('\"') + .and_then(|s| s.strip_suffix('\"')) + .unwrap_or(raw); + filename = Some(clean_windows_path_base(trimmed)); + } else if lower.starts_with("name=") { + let raw = segment[5..].trim(); + let trimmed = raw + .strip_prefix('\"') + .and_then(|s| s.strip_suffix('\"')) + .unwrap_or(raw); + name = Some(clean_windows_path_base(trimmed)); + } + } + + let candidate = filename.or(name); + candidate.filter(|s| !s.is_empty()) +} + +// ============================================================================ +// Tests +// ============================================================================ + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_parse_url_path_comma() { + let (vid, nid, cookie) = parse_url_path("/3,01637037d6").unwrap(); + assert_eq!(vid, VolumeId(3)); + assert_eq!(nid, NeedleId(0x01)); + assert_eq!(cookie, Cookie(0x637037d6)); + } + + #[test] + fn test_parse_url_path_with_ext() { + let (vid, _, _) = parse_url_path("/3,01637037d6.jpg").unwrap(); + assert_eq!(vid, VolumeId(3)); + } + + #[test] + fn test_parse_url_path_slash() { + let result = parse_url_path("3/01637037d6"); + assert!(result.is_some()); + } + + #[test] + fn test_parse_url_path_slash_with_filename() { + let result = parse_url_path("3/01637037d6/report.txt"); + assert!(result.is_some()); + let (vid, _, _) = result.unwrap(); + assert_eq!(vid, VolumeId(3)); + } + + #[test] + fn test_parse_url_path_invalid() { + assert!(parse_url_path("/invalid").is_none()); + assert!(parse_url_path("").is_none()); + } + + #[test] + fn test_extract_jwt_bearer() { + let mut headers = HeaderMap::new(); + headers.insert(header::AUTHORIZATION, "Bearer abc123".parse().unwrap()); + let uri: axum::http::Uri = "/test".parse().unwrap(); + assert_eq!(extract_jwt(&headers, &uri), Some("abc123".to_string())); + } + + #[test] + fn test_extract_jwt_query_param() { + let headers = HeaderMap::new(); + let uri: axum::http::Uri = "/test?jwt=mytoken".parse().unwrap(); + assert_eq!(extract_jwt(&headers, &uri), Some("mytoken".to_string())); + } + + #[test] + fn test_extract_jwt_query_over_header() { + let mut headers = HeaderMap::new(); + headers.insert( + header::AUTHORIZATION, + "Bearer header_token".parse().unwrap(), + ); + let uri: axum::http::Uri = "/test?jwt=query_token".parse().unwrap(); + assert_eq!(extract_jwt(&headers, &uri), Some("query_token".to_string())); + } + + #[test] + fn test_extract_jwt_none() { + let headers = HeaderMap::new(); + let uri: axum::http::Uri = "/test".parse().unwrap(); + assert_eq!(extract_jwt(&headers, &uri), None); + } + + #[test] + fn test_handle_range_single() { + let data = b"hello world"; + let headers = HeaderMap::new(); + let resp = handle_range_request("bytes=0-4", data, headers, None); + assert_eq!(resp.status(), StatusCode::PARTIAL_CONTENT); + } + + #[test] + fn test_handle_range_invalid() { + let data = b"hello"; + let headers = HeaderMap::new(); + let resp = handle_range_request("bytes=999-1000", data, headers, None); + assert_eq!(resp.status(), StatusCode::RANGE_NOT_SATISFIABLE); + } + + #[tokio::test] + async fn test_stats_memory_handler_matches_go_memstatus_shape() { + let response = stats_memory_handler(Query(ReadQueryParams::default())).await; + assert_eq!(response.status(), StatusCode::OK); + + let body = axum::body::to_bytes(response.into_body(), usize::MAX) + .await + .unwrap(); + let payload: serde_json::Value = serde_json::from_slice(&body).unwrap(); + let memory = payload.get("Memory").unwrap(); + + for key in ["goroutines", "all", "used", "free", "self", "heap", "stack"] { + assert!(memory.get(key).is_some(), "missing key {}", key); + } + } + + #[tokio::test] + async fn test_stats_counter_handler_matches_go_json_shape() { + super::super::server_stats::reset_for_tests(); + super::super::server_stats::record_read_request(); + + let response = stats_counter_handler(Query(ReadQueryParams::default())).await; + assert_eq!(response.status(), StatusCode::OK); + + let body = axum::body::to_bytes(response.into_body(), usize::MAX) + .await + .unwrap(); + let payload: serde_json::Value = serde_json::from_slice(&body).unwrap(); + + assert_eq!( + payload.get("Version").and_then(|value| value.as_str()), + Some(crate::version::version()) + ); + let counters = payload.get("Counters").unwrap(); + assert!(counters.get("ReadRequests").is_some()); + assert!(counters.get("Requests").is_some()); + } + + #[test] + fn test_is_compressible_file_type() { + // Text types + assert!(is_compressible_file_type("", "text/html")); + assert!(is_compressible_file_type("", "text/plain")); + assert!(is_compressible_file_type("", "text/css")); + + // Compressible by extension + assert!(is_compressible_file_type(".svg", "")); + assert!(is_compressible_file_type(".bmp", "")); + assert!(is_compressible_file_type(".js", "")); + assert!(is_compressible_file_type(".json", "")); + assert!(is_compressible_file_type(".html", "")); + assert!(is_compressible_file_type(".css", "")); + assert!(is_compressible_file_type(".c", "")); + assert!(is_compressible_file_type(".go", "")); + + // Already compressed — should NOT compress + assert!(!is_compressible_file_type(".zip", "")); + assert!(!is_compressible_file_type(".gz", "")); + assert!(!is_compressible_file_type(".jpg", "")); + assert!(!is_compressible_file_type(".png", "")); + assert!(!is_compressible_file_type("", "image/jpeg")); + assert!(!is_compressible_file_type("", "image/png")); + + // Application subtypes + assert!(is_compressible_file_type("", "application/xml")); + assert!(is_compressible_file_type("", "application/javascript")); + assert!(!is_compressible_file_type("", "application/zstd")); + assert!(!is_compressible_file_type("", "application/vnd.rar")); + + // Audio + assert!(is_compressible_file_type(".wav", "audio/wav")); + assert!(!is_compressible_file_type("", "audio/mpeg")); + + // Unknown + assert!(!is_compressible_file_type( + ".xyz", + "application/octet-stream" + )); + } + + #[test] + fn test_try_gzip_data() { + let data = b"hello world hello world hello world"; + let compressed = try_gzip_data(data); + assert!(compressed.is_some()); + let compressed = compressed.unwrap(); + // Compressed data should be different from original + assert!(!compressed.is_empty()); + + // Verify we can decompress it + use flate2::read::GzDecoder; + use std::io::Read; + let mut decoder = GzDecoder::new(&compressed[..]); + let mut decompressed = Vec::new(); + decoder.read_to_end(&mut decompressed).unwrap(); + assert_eq!(decompressed, data); + } + + #[test] + fn test_maybe_decompress_gzip() { + let data = b"gzip me"; + let compressed = try_gzip_data(data).unwrap(); + let decompressed = maybe_decompress_gzip(&compressed).unwrap(); + assert_eq!(decompressed, data); + assert!(maybe_decompress_gzip(data).is_none()); + } + + #[test] + fn test_parse_content_disposition_filename() { + assert_eq!( + parse_content_disposition_filename("attachment; filename=\"report.txt\""), + Some("report.txt".to_string()) + ); + assert_eq!( + parse_content_disposition_filename("inline; name=\"hello.txt\""), + Some("hello.txt".to_string()) + ); + assert_eq!( + parse_content_disposition_filename("name=foo.txt"), + Some("foo.txt".to_string()) + ); + assert_eq!( + parse_content_disposition_filename("attachment; filename=\"C:\\\\path\\\\file.jpg\""), + Some("file.jpg".to_string()) + ); + assert_eq!(parse_content_disposition_filename("inline"), None); + } + + #[test] + fn test_streaming_chunk_size_respects_configured_read_buffer() { + assert_eq!( + streaming_chunk_size(4 * 1024 * 1024, 8 * 1024 * 1024), + 4 * 1024 * 1024 + ); + assert_eq!( + streaming_chunk_size(32 * 1024, 512 * 1024), + DEFAULT_STREAMING_CHUNK_SIZE + ); + assert_eq!( + streaming_chunk_size(8 * 1024 * 1024, 128 * 1024), + 128 * 1024 + ); + } + + #[test] + fn test_normalize_outgoing_http_url_rewrites_scheme() { + let url = normalize_outgoing_http_url( + "https", + "http://master.example.com:9333/dir/lookup?volumeId=7", + ) + .unwrap(); + assert_eq!(url, "https://master.example.com:9333/dir/lookup?volumeId=7"); + } + + #[test] + fn test_redirect_request_uses_outgoing_http_scheme() { + let info = ProxyRequestInfo { + original_headers: HeaderMap::new(), + original_query: "collection=photos&readDeleted=true".to_string(), + path: "/3,01637037d6".to_string(), + vid_str: "3".to_string(), + fid_str: "01637037d6".to_string(), + }; + let target = VolumeLocation { + url: "volume.internal:8080".to_string(), + public_url: "volume.public:8080".to_string(), + grpc_port: 18080, + }; + + let response = redirect_request(&info, &target, "https"); + assert_eq!(response.status(), StatusCode::MOVED_PERMANENTLY); + assert_eq!( + response.headers().get(header::LOCATION).unwrap(), + "https://volume.internal:8080/3,01637037d6?collection=photos&proxied=true" + ); + } +} diff --git a/seaweed-volume/src/server/heartbeat.rs b/seaweed-volume/src/server/heartbeat.rs new file mode 100644 index 000000000..6fcfd523c --- /dev/null +++ b/seaweed-volume/src/server/heartbeat.rs @@ -0,0 +1,1576 @@ +//! Heartbeat client: registers the volume server with the master. +//! +//! Implements the bidirectional streaming `SendHeartbeat` RPC to the master, +//! matching Go's `server/volume_grpc_client_to_master.go`. + +use std::collections::HashMap; +use std::sync::atomic::Ordering; +use std::sync::Arc; +use std::time::Duration; + +use tokio::sync::broadcast; +use tracing::{error, info, warn}; + +use super::grpc_client::{build_grpc_endpoint, GRPC_MAX_MESSAGE_SIZE}; +use super::volume_server::VolumeServerState; +use crate::pb::master_pb; +use crate::pb::master_pb::seaweed_client::SeaweedClient; +use crate::pb::volume_server_pb; +use crate::remote_storage::s3_tier::{S3TierBackend, S3TierConfig}; +use crate::storage::store::Store; +use crate::storage::types::NeedleId; + +const DUPLICATE_UUID_RETRY_MESSAGE: &str = "duplicate UUIDs detected, retrying connection"; +const MAX_DUPLICATE_UUID_RETRIES: u32 = 3; + +/// Configuration for the heartbeat client. +pub struct HeartbeatConfig { + pub ip: String, + pub port: u16, + pub grpc_port: u16, + pub public_url: String, + pub data_center: String, + pub rack: String, + pub master_addresses: Vec, + pub pulse_seconds: u64, +} + +/// Run the heartbeat loop using VolumeServerState. +/// +/// Mirrors Go's `volume_grpc_client_to_master.go` heartbeat(): +/// - On leader redirect: sleep 3s, then connect directly to the new leader +/// - On duplicate UUID error: exponential backoff (2s, 4s, 8s), exit after 3 retries +/// - On other errors: sleep pulse interval, reset to seed master list iteration +pub async fn run_heartbeat_with_state( + config: HeartbeatConfig, + state: Arc, + mut shutdown_rx: broadcast::Receiver<()>, +) { + info!( + "Starting heartbeat to master nodes: {:?}", + config.master_addresses + ); + + let pulse = Duration::from_secs(config.pulse_seconds.max(1)); + let mut new_leader: Option = None; + let mut duplicate_retry_count: u32 = 0; + + loop { + for master_addr in &config.master_addresses { + if is_stopping(&state) { + state.is_heartbeating.store(false, Ordering::Relaxed); + info!("Heartbeat stopping"); + return; + } + if shutdown_rx.try_recv().is_ok() { + state.is_heartbeating.store(false, Ordering::Relaxed); + info!("Heartbeat shutting down"); + return; + } + + // If we have a leader redirect, sleep 3s then connect to the leader + // instead of iterating through the seed list + let target_addr = if let Some(ref leader) = new_leader { + tokio::time::sleep(Duration::from_secs(3)).await; + leader.clone() + } else { + master_addr.clone() + }; + + let grpc_addr = to_grpc_address(&target_addr); + info!("Connecting heartbeat to master {}", grpc_addr); + + // Determine what action to take after the heartbeat attempt. + // We convert the error to a string immediately so the non-Send + // Box is dropped before any .await point. + enum PostAction { + LeaderRedirect(String), + Done, + SleepDuplicate(Duration), + SleepPulse, + } + let action = match do_heartbeat(&config, &state, &grpc_addr, &target_addr, pulse, &mut shutdown_rx) + .await + { + Ok(Some(leader)) => { + info!("Master leader changed to {}", leader); + PostAction::LeaderRedirect(leader) + } + Ok(None) => { + duplicate_retry_count = 0; + PostAction::Done + } + Err(e) => { + let err_msg = e.to_string(); + // Drop `e` (non-Send) before any .await + drop(e); + warn!("Heartbeat to {} error: {}", grpc_addr, err_msg); + + if err_msg.contains(DUPLICATE_UUID_RETRY_MESSAGE) { + if duplicate_retry_count >= MAX_DUPLICATE_UUID_RETRIES { + error!("Shut down Volume Server due to persistent duplicate volume directories after 3 retries"); + error!( + "Please check if another volume server is using the same directory" + ); + std::process::exit(1); + } + let retry_delay = duplicate_uuid_retry_delay(duplicate_retry_count); + duplicate_retry_count += 1; + warn!( + "Waiting {:?} before retrying due to duplicate UUID detection (attempt {}/3)...", + retry_delay, duplicate_retry_count + ); + PostAction::SleepDuplicate(retry_delay) + } else { + duplicate_retry_count = 0; + PostAction::SleepPulse + } + } + }; + + match action { + PostAction::LeaderRedirect(leader) => { + new_leader = Some(leader); + break; + } + PostAction::Done => { + new_leader = None; + } + PostAction::SleepDuplicate(delay) => { + new_leader = None; + tokio::time::sleep(delay).await; + } + PostAction::SleepPulse => { + new_leader = None; + tokio::time::sleep(pulse).await; + } + } + + // If we connected to a leader (not seed list), break out after one attempt + // so we either reconnect to the new leader or fall back to seed list + if new_leader.is_some() { + break; + } + } + + // If we have a leader redirect, skip the sleep and reconnect immediately + if new_leader.is_some() { + continue; + } + + tokio::select! { + _ = tokio::time::sleep(pulse) => {} + _ = shutdown_rx.recv() => { + state.is_heartbeating.store(false, Ordering::Relaxed); + info!("Heartbeat shutting down"); + return; + } + } + } +} + +/// Convert a master address "host:port" to a gRPC host:port target. +/// The Go master uses port + 10000 for gRPC by default. +pub fn to_grpc_address(master_addr: &str) -> String { + if let Some((host, port_str)) = master_addr.rsplit_once(':') { + if let Ok(port) = port_str.parse::() { + let grpc_port = port + 10000; + return format!("{}:{}", host, grpc_port); + } + } + master_addr.to_string() +} + +/// Call GetMasterConfiguration on seed masters before starting the heartbeat loop. +/// Mirrors Go's `checkWithMaster()` in `volume_grpc_client_to_master.go`. +/// Retries across all seed masters with a 1790ms sleep between rounds (matching Go). +/// Stores metrics address/interval from the response into server state. +async fn check_with_master(config: &HeartbeatConfig, state: &Arc) { + loop { + for master_addr in &config.master_addresses { + let grpc_addr = to_grpc_address(master_addr); + match try_get_master_configuration(&grpc_addr, state.outgoing_grpc_tls.as_ref()).await { + Ok(resp) => { + let changed = apply_metrics_push_settings( + state, + &resp.metrics_address, + resp.metrics_interval_seconds, + ); + if changed { + state.metrics_notify.notify_waiters(); + } + apply_storage_backends(state, &resp.storage_backends); + info!( + "Got master configuration from {}: metrics_address={}, metrics_interval={}s", + master_addr, resp.metrics_address, resp.metrics_interval_seconds + ); + return; + } + Err(e) => { + warn!("checkWithMaster {}: {}", master_addr, e); + } + } + } + tokio::time::sleep(Duration::from_millis(1790)).await; + } +} + +pub async fn prime_master_configuration(config: &HeartbeatConfig, state: &Arc) { + check_with_master(config, state).await; +} + +pub async fn try_get_master_configuration( + grpc_addr: &str, + tls: Option<&super::grpc_client::OutgoingGrpcTlsConfig>, +) -> Result> { + let channel = build_grpc_endpoint(grpc_addr, tls)? + .connect_timeout(Duration::from_secs(5)) + .timeout(Duration::from_secs(10)) + .connect() + .await?; + let mut client = SeaweedClient::with_interceptor( + channel, + super::request_id::outgoing_request_id_interceptor, + ) + .max_decoding_message_size(GRPC_MAX_MESSAGE_SIZE) + .max_encoding_message_size(GRPC_MAX_MESSAGE_SIZE); + let resp = client + .get_master_configuration(master_pb::GetMasterConfigurationRequest {}) + .await?; + Ok(resp.into_inner()) +} + +fn is_stopping(state: &VolumeServerState) -> bool { + *state.is_stopping.read().unwrap() +} + +fn duplicate_uuid_retry_delay(retry_count: u32) -> Duration { + Duration::from_secs((1u64 << retry_count) * 2) +} + +fn duplicate_directories(store: &Store, duplicated_uuids: &[String]) -> Vec { + let mut duplicate_dirs = Vec::new(); + for loc in &store.locations { + if duplicated_uuids + .iter() + .any(|uuid| uuid == &loc.directory_uuid) + { + duplicate_dirs.push(loc.directory.clone()); + } + } + duplicate_dirs +} + +fn apply_master_volume_options(store: &Store, hb_resp: &master_pb::HeartbeatResponse) -> bool { + let mut volume_opts_changed = false; + if store.get_preallocate() != hb_resp.preallocate { + store.set_preallocate(hb_resp.preallocate); + volume_opts_changed = true; + } + if hb_resp.volume_size_limit > 0 + && store.volume_size_limit.load(Ordering::Relaxed) != hb_resp.volume_size_limit + { + store + .volume_size_limit + .store(hb_resp.volume_size_limit, Ordering::Relaxed); + volume_opts_changed = true; + } + + volume_opts_changed && store.maybe_adjust_volume_max() +} + +type EcShardDeltaKey = (u32, String, u32, u32); + +fn collect_ec_shard_delta_messages( + store: &Store, +) -> HashMap { + let mut messages = HashMap::new(); + + for (disk_id, loc) in store.locations.iter().enumerate() { + for (_, ec_vol) in loc.ec_volumes() { + for shard in ec_vol.shards.iter().flatten() { + messages.insert( + ( + ec_vol.volume_id.0, + ec_vol.collection.clone(), + disk_id as u32, + shard.shard_id as u32, + ), + master_pb::VolumeEcShardInformationMessage { + id: ec_vol.volume_id.0, + collection: ec_vol.collection.clone(), + ec_index_bits: 1u32 << shard.shard_id, + shard_sizes: vec![shard.file_size()], + disk_type: ec_vol.disk_type.to_string(), + expire_at_sec: ec_vol.expire_at_sec, + disk_id: disk_id as u32, + ..Default::default() + }, + ); + } + } + } + + messages +} + +fn diff_ec_shard_delta_messages( + previous: &HashMap, + current: &HashMap, +) -> ( + Vec, + Vec, +) { + let mut new_ec_shards = Vec::new(); + let mut deleted_ec_shards = Vec::new(); + + for (key, message) in current { + if previous.get(key) != Some(message) { + new_ec_shards.push(message.clone()); + } + } + + for (key, message) in previous { + if !current.contains_key(key) { + let mut deleted = message.clone(); + deleted.shard_sizes = vec![0]; + deleted_ec_shards.push(deleted); + } + } + + (new_ec_shards, deleted_ec_shards) +} + +/// Perform one heartbeat session with a master server. +async fn do_heartbeat( + config: &HeartbeatConfig, + state: &Arc, + grpc_addr: &str, + current_master: &str, + pulse: Duration, + shutdown_rx: &mut broadcast::Receiver<()>, +) -> Result, Box> { + let channel = build_grpc_endpoint(grpc_addr, state.outgoing_grpc_tls.as_ref())? + .connect_timeout(Duration::from_secs(5)) + .timeout(Duration::from_secs(30)) + .connect() + .await?; + + let mut client = SeaweedClient::with_interceptor( + channel, + super::request_id::outgoing_request_id_interceptor, + ) + .max_decoding_message_size(GRPC_MAX_MESSAGE_SIZE) + .max_encoding_message_size(GRPC_MAX_MESSAGE_SIZE); + + let (tx, rx) = tokio::sync::mpsc::channel::(32); + + // Keep track of what we sent, to generate delta updates + let initial_hb = collect_heartbeat(config, state); + let mut last_volumes: HashMap = initial_hb + .volumes + .iter() + .map(|v| (v.id, v.clone())) + .collect(); + let mut last_ec_shards = { + let store = state.store.read().unwrap(); + collect_ec_shard_delta_messages(&store) + }; + + // Send initial heartbeats BEFORE calling send_heartbeat to avoid deadlock: + // the server won't send response headers until it receives the first message, + // but send_heartbeat().await waits for response headers. + tx.send(initial_hb).await?; + tx.send(collect_ec_heartbeat(config, state)).await?; + + let stream = tokio_stream::wrappers::ReceiverStream::new(rx); + let mut response_stream = client.send_heartbeat(stream).await?.into_inner(); + + info!("Heartbeat stream established with {}", grpc_addr); + if is_stopping(state) { + state.is_heartbeating.store(false, Ordering::Relaxed); + send_deregister_heartbeat(config, state, &tx).await; + info!("Heartbeat stopping"); + return Ok(None); + } + state.is_heartbeating.store(true, Ordering::Relaxed); + + let mut volume_tick = tokio::time::interval(pulse); + let mut ec_tick = tokio::time::interval(pulse * 17); + volume_tick.tick().await; + ec_tick.tick().await; + + loop { + tokio::select! { + resp = response_stream.message() => { + match resp { + Ok(Some(hb_resp)) => { + // Match Go ordering: DuplicatedUuids first, then volume + // options, then leader redirect. + if !hb_resp.duplicated_uuids.is_empty() { + let duplicate_dirs = { + let store = state.store.read().unwrap(); + duplicate_directories(&store, &hb_resp.duplicated_uuids) + }; + error!( + "Master reported duplicate volume directories: {:?}", + duplicate_dirs + ); + return Err(format!( + "{}: {:?}", + DUPLICATE_UUID_RETRY_MESSAGE, duplicate_dirs + ) + .into()); + } + let changed = { + let s = state.store.read().unwrap(); + apply_master_volume_options(&s, &hb_resp) + }; + if changed { + let adjusted_hb = collect_heartbeat(config, state); + last_volumes = + adjusted_hb.volumes.iter().map(|v| (v.id, v.clone())).collect(); + last_ec_shards = { + let store = state.store.read().unwrap(); + collect_ec_shard_delta_messages(&store) + }; + if tx.send(adjusted_hb).await.is_err() { + return Ok(None); + } + } + let metrics_changed = apply_metrics_push_settings( + state, + &hb_resp.metrics_address, + hb_resp.metrics_interval_seconds, + ); + if metrics_changed { + state.metrics_notify.notify_waiters(); + } + // Match Go: only redirect if leader is non-empty AND + // different from the current master we're connected to. + if !hb_resp.leader.is_empty() && current_master != hb_resp.leader { + return Ok(Some(hb_resp.leader)); + } + } + Ok(None) => return Ok(None), + Err(e) => return Err(Box::new(e)), + } + } + + _ = volume_tick.tick() => { + { + let s = state.store.read().unwrap(); + s.maybe_adjust_volume_max(); + } + let current_hb = collect_heartbeat(config, state); + last_volumes = current_hb.volumes.iter().map(|v| (v.id, v.clone())).collect(); + last_ec_shards = { + let store = state.store.read().unwrap(); + collect_ec_shard_delta_messages(&store) + }; + if tx.send(current_hb).await.is_err() { + return Ok(None); + } + } + + _ = ec_tick.tick() => { + let current_ec_hb = collect_ec_heartbeat(config, state); + last_ec_shards = { + let store = state.store.read().unwrap(); + collect_ec_shard_delta_messages(&store) + }; + if tx.send(current_ec_hb).await.is_err() { + return Ok(None); + } + } + + _ = state.volume_state_notify.notified() => { + if is_stopping(state) { + state.is_heartbeating.store(false, Ordering::Relaxed); + send_deregister_heartbeat(config, state, &tx).await; + info!("Heartbeat stopping"); + return Ok(None); + } + let current_hb = collect_heartbeat(config, state); + let current_volumes: HashMap = current_hb.volumes.iter().map(|v| (v.id, v.clone())).collect(); + let current_ec_shards = { + let store = state.store.read().unwrap(); + collect_ec_shard_delta_messages(&store) + }; + + let mut new_vols = Vec::new(); + let mut del_vols = Vec::new(); + + for (id, vol) in ¤t_volumes { + if !last_volumes.contains_key(id) { + new_vols.push(master_pb::VolumeShortInformationMessage { + id: *id, + collection: vol.collection.clone(), + version: vol.version, + replica_placement: vol.replica_placement, + ttl: vol.ttl, + disk_type: vol.disk_type.clone(), + disk_id: vol.disk_id, + }); + } + } + + for (id, vol) in &last_volumes { + if !current_volumes.contains_key(id) { + del_vols.push(master_pb::VolumeShortInformationMessage { + id: *id, + collection: vol.collection.clone(), + version: vol.version, + replica_placement: vol.replica_placement, + ttl: vol.ttl, + disk_type: vol.disk_type.clone(), + disk_id: vol.disk_id, + }); + } + } + + let (new_ec_shards, deleted_ec_shards) = + diff_ec_shard_delta_messages(&last_ec_shards, ¤t_ec_shards); + + // Collect current state for state-only or combined delta heartbeats. + // Mirrors Go's StateUpdateChan case which sends state changes immediately. + let current_state = Some(volume_server_pb::VolumeServerState { + maintenance: state.maintenance.load(Ordering::Relaxed), + version: state.state_version.load(Ordering::Relaxed), + }); + + if !new_vols.is_empty() + || !del_vols.is_empty() + || !new_ec_shards.is_empty() + || !deleted_ec_shards.is_empty() + { + let delta_hb = master_pb::Heartbeat { + ip: config.ip.clone(), + port: config.port as u32, + grpc_port: config.grpc_port as u32, + public_url: config.public_url.clone(), + data_center: config.data_center.clone(), + rack: config.rack.clone(), + new_volumes: new_vols, + deleted_volumes: del_vols, + new_ec_shards, + deleted_ec_shards, + state: current_state, + ..Default::default() + }; + if tx.send(delta_hb).await.is_err() { + return Ok(None); + } + last_volumes = current_volumes; + last_ec_shards = current_ec_shards; + } else { + // State-only heartbeat (e.g., MarkReadonly/MarkWritable changed state + // without adding/removing volumes). Mirrors Go's StateUpdateChan case. + let state_hb = master_pb::Heartbeat { + ip: config.ip.clone(), + port: config.port as u32, + grpc_port: config.grpc_port as u32, + data_center: config.data_center.clone(), + rack: config.rack.clone(), + state: current_state, + ..Default::default() + }; + if tx.send(state_hb).await.is_err() { + return Ok(None); + } + } + } + + _ = shutdown_rx.recv() => { + state.is_heartbeating.store(false, Ordering::Relaxed); + send_deregister_heartbeat(config, state, &tx).await; + info!("Sent deregistration heartbeat"); + return Ok(None); + } + } + } +} + +async fn send_deregister_heartbeat( + config: &HeartbeatConfig, + state: &Arc, + tx: &tokio::sync::mpsc::Sender, +) { + let empty = { + let store = state.store.read().unwrap(); + let (location_uuids, disk_tags) = collect_location_metadata(&store); + master_pb::Heartbeat { + id: store.id.clone(), + ip: config.ip.clone(), + port: config.port as u32, + public_url: config.public_url.clone(), + max_file_key: 0, + data_center: config.data_center.clone(), + rack: config.rack.clone(), + has_no_volumes: true, + has_no_ec_shards: true, + grpc_port: config.grpc_port as u32, + location_uuids, + disk_tags, + ..Default::default() + } + }; + let _ = tx.send(empty).await; + tokio::time::sleep(Duration::from_millis(200)).await; +} + +fn apply_metrics_push_settings( + state: &VolumeServerState, + address: &str, + interval_seconds: u32, +) -> bool { + let mut runtime = state.metrics_runtime.write().unwrap(); + if runtime.push_gateway.address == address + && runtime.push_gateway.interval_seconds == interval_seconds + { + return false; + } + runtime.push_gateway.address = address.to_string(); + runtime.push_gateway.interval_seconds = interval_seconds; + true +} + +fn apply_storage_backends( + state: &VolumeServerState, + storage_backends: &[master_pb::StorageBackend], +) { + if storage_backends.is_empty() { + return; + } + + let mut registry = state.s3_tier_registry.write().unwrap(); + let mut global_registry = crate::remote_storage::s3_tier::global_s3_tier_registry() + .write() + .unwrap(); + for backend in storage_backends { + if backend.r#type != "s3" { + continue; + } + + let properties = &backend.properties; + let config = S3TierConfig { + access_key: properties + .get("aws_access_key_id") + .cloned() + .unwrap_or_default(), + secret_key: properties + .get("aws_secret_access_key") + .cloned() + .unwrap_or_default(), + region: properties.get("region").cloned().unwrap_or_default(), + bucket: properties.get("bucket").cloned().unwrap_or_default(), + endpoint: properties.get("endpoint").cloned().unwrap_or_default(), + storage_class: properties.get("storage_class").cloned().unwrap_or_default(), + force_path_style: parse_bool_property(properties.get("force_path_style")), + }; + + let backend_id = if backend.id.is_empty() { + "default" + } else { + backend.id.as_str() + }; + register_s3_backend(&mut registry, backend, backend_id, &config); + register_s3_backend(&mut global_registry, backend, backend_id, &config); + } +} + +fn register_s3_backend( + registry: &mut crate::remote_storage::s3_tier::S3TierRegistry, + backend: &master_pb::StorageBackend, + backend_id: &str, + config: &S3TierConfig, +) { + let qualified_name = format!("{}.{}", backend.r#type, backend_id); + if registry.get(&qualified_name).is_none() { + registry.register(qualified_name, S3TierBackend::new(config)); + } + if backend_id == "default" && registry.get(&backend.r#type).is_none() { + registry.register(backend.r#type.clone(), S3TierBackend::new(config)); + } +} + +fn parse_bool_property(value: Option<&String>) -> bool { + value + .map(|v| { + matches!( + v.trim().to_ascii_lowercase().as_str(), + "1" | "t" | "true" | "y" | "yes" | "on" + ) + }) + .unwrap_or(true) +} + +/// Collect volume information into a Heartbeat message. +fn collect_heartbeat( + config: &HeartbeatConfig, + state: &Arc, +) -> master_pb::Heartbeat { + let mut store = state.store.write().unwrap(); + let (ec_shards, deleted_ec_shards) = store.delete_expired_ec_volumes(); + build_heartbeat_with_ec_status( + config, + &mut store, + deleted_ec_shards, + ec_shards.is_empty(), + ) +} + +fn collect_location_metadata(store: &Store) -> (Vec, Vec) { + let location_uuids = store + .locations + .iter() + .map(|loc| loc.directory_uuid.clone()) + .collect(); + let disk_tags = store + .locations + .iter() + .enumerate() + .map(|(disk_id, loc)| master_pb::DiskTag { + disk_id: disk_id as u32, + tags: loc.tags.clone(), + }) + .collect(); + (location_uuids, disk_tags) +} + +#[cfg(test)] +fn build_heartbeat(config: &HeartbeatConfig, store: &mut Store) -> master_pb::Heartbeat { + let has_no_ec_shards = collect_live_ec_shards(store, false).is_empty(); + build_heartbeat_with_ec_status(config, store, Vec::new(), has_no_ec_shards) +} + +fn build_heartbeat_with_ec_status( + config: &HeartbeatConfig, + store: &mut Store, + deleted_ec_shards: Vec, + has_no_ec_shards: bool, +) -> master_pb::Heartbeat { + const MAX_TTL_VOLUME_REMOVAL_DELAY: u32 = 10; + + #[derive(Default)] + struct ReadOnlyCounts { + is_read_only: u32, + no_write_or_delete: u32, + no_write_can_delete: u32, + is_disk_space_low: u32, + } + + let mut volumes = Vec::new(); + let mut max_file_key = NeedleId(0); + let mut max_volume_counts: HashMap = HashMap::new(); + + // Collect per-collection disk size and read-only counts for metrics + let mut disk_sizes: HashMap = HashMap::new(); // (normal, deleted) + let mut ro_counts: HashMap = HashMap::new(); + + let volume_size_limit = store.volume_size_limit.load(Ordering::Relaxed); + + for (disk_id, loc) in store.locations.iter_mut().enumerate() { + let disk_type_str = loc.disk_type.to_string(); + let mut effective_max_count = loc.max_volume_count.load(Ordering::Relaxed); + if loc.is_disk_space_low.load(Ordering::Relaxed) { + let used_slots = loc.volumes_len() as i32 + + ((loc.ec_shard_count() + + crate::storage::erasure_coding::ec_shard::DATA_SHARDS_COUNT + - 1) + / crate::storage::erasure_coding::ec_shard::DATA_SHARDS_COUNT) + as i32; + effective_max_count = used_slots; + } + if effective_max_count < 0 { + effective_max_count = 0; + } + *max_volume_counts.entry(disk_type_str).or_insert(0) += effective_max_count as u32; + + let mut delete_vids = Vec::new(); + for (_, vol) in loc.iter_volumes() { + let cur_max = vol.max_file_key(); + if cur_max > max_file_key { + max_file_key = cur_max; + } + + let volume_size = vol.dat_file_size().unwrap_or(0); + let mut should_delete_volume = false; + + if vol.last_io_error().is_some() { + delete_vids.push(vol.id); + should_delete_volume = true; + } else if !vol.is_expired(volume_size, volume_size_limit) { + let (remote_storage_name, remote_storage_key) = vol.remote_storage_name_key(); + volumes.push(master_pb::VolumeInformationMessage { + id: vol.id.0, + size: volume_size, + collection: vol.collection.clone(), + file_count: vol.file_count() as u64, + delete_count: vol.deleted_count() as u64, + deleted_byte_count: vol.deleted_size(), + read_only: vol.is_read_only(), + replica_placement: vol.super_block.replica_placement.to_byte() as u32, + version: vol.super_block.version.0 as u32, + ttl: vol.super_block.ttl.to_u32(), + compact_revision: vol.last_compact_revision() as u32, + modified_at_second: vol.last_modified_ts() as i64, + disk_type: loc.disk_type.to_string(), + disk_id: disk_id as u32, + remote_storage_name, + remote_storage_key, + ..Default::default() + }); + } else if vol.is_expired_long_enough(MAX_TTL_VOLUME_REMOVAL_DELAY) { + delete_vids.push(vol.id); + should_delete_volume = true; + } + + // Track disk size by collection + let entry = disk_sizes.entry(vol.collection.clone()).or_insert((0, 0)); + if !should_delete_volume { + entry.0 += volume_size; + entry.1 += vol.deleted_size(); + } + + let read_only = ro_counts.entry(vol.collection.clone()).or_default(); + if !should_delete_volume && vol.is_read_only() { + read_only.is_read_only += 1; + if vol.is_no_write_or_delete() { + read_only.no_write_or_delete += 1; + } + if vol.is_no_write_can_delete() { + read_only.no_write_can_delete += 1; + } + if loc.is_disk_space_low.load(Ordering::Relaxed) { + read_only.is_disk_space_low += 1; + } + } + + } + + for vid in delete_vids { + let _ = loc.delete_volume(vid, false); + } + } + + // Update disk size and read-only gauges + for (col, (normal, deleted)) in &disk_sizes { + crate::metrics::DISK_SIZE_GAUGE + .with_label_values(&[col, crate::metrics::DISK_SIZE_LABEL_NORMAL]) + .set(*normal as f64); + crate::metrics::DISK_SIZE_GAUGE + .with_label_values(&[col, crate::metrics::DISK_SIZE_LABEL_DELETED_BYTES]) + .set(*deleted as f64); + } + for (col, counts) in &ro_counts { + crate::metrics::READ_ONLY_VOLUME_GAUGE + .with_label_values(&[col, crate::metrics::READ_ONLY_LABEL_IS_READ_ONLY]) + .set(counts.is_read_only as f64); + crate::metrics::READ_ONLY_VOLUME_GAUGE + .with_label_values(&[col, crate::metrics::READ_ONLY_LABEL_NO_WRITE_OR_DELETE]) + .set(counts.no_write_or_delete as f64); + crate::metrics::READ_ONLY_VOLUME_GAUGE + .with_label_values(&[col, crate::metrics::READ_ONLY_LABEL_NO_WRITE_CAN_DELETE]) + .set(counts.no_write_can_delete as f64); + crate::metrics::READ_ONLY_VOLUME_GAUGE + .with_label_values(&[col, crate::metrics::READ_ONLY_LABEL_IS_DISK_SPACE_LOW]) + .set(counts.is_disk_space_low as f64); + } + // Update max volumes gauge + let total_max: i64 = max_volume_counts.values().map(|v| *v as i64).sum(); + crate::metrics::MAX_VOLUMES.set(total_max); + + let has_no_volumes = volumes.is_empty(); + let (location_uuids, disk_tags) = collect_location_metadata(store); + + master_pb::Heartbeat { + id: store.id.clone(), + ip: config.ip.clone(), + port: config.port as u32, + public_url: config.public_url.clone(), + max_file_key: max_file_key.0, + data_center: config.data_center.clone(), + rack: config.rack.clone(), + admin_port: config.port as u32, + volumes, + deleted_ec_shards, + has_no_volumes, + has_no_ec_shards, + max_volume_counts, + grpc_port: config.grpc_port as u32, + location_uuids, + disk_tags, + ..Default::default() + } +} + +fn collect_live_ec_shards( + store: &Store, + update_metrics: bool, +) -> Vec { + let mut ec_shards = Vec::new(); + let mut ec_sizes: HashMap = HashMap::new(); + + for (disk_id, loc) in store.locations.iter().enumerate() { + for (_, ec_vol) in loc.ec_volumes() { + for message in ec_vol.to_volume_ec_shard_information_messages(disk_id as u32) { + if update_metrics { + let total_size: u64 = message + .shard_sizes + .iter() + .map(|size| (*size).max(0) as u64) + .sum(); + *ec_sizes.entry(message.collection.clone()).or_insert(0) += total_size; + } + ec_shards.push(message); + } + } + } + + if update_metrics { + for (col, size) in &ec_sizes { + crate::metrics::DISK_SIZE_GAUGE + .with_label_values(&[col, crate::metrics::DISK_SIZE_LABEL_EC]) + .set(*size as f64); + } + } + + ec_shards +} + +/// Collect EC shard information into a Heartbeat message. +fn collect_ec_heartbeat(config: &HeartbeatConfig, state: &Arc) -> master_pb::Heartbeat { + let store = state.store.read().unwrap(); + let ec_shards = collect_live_ec_shards(&store, true); + + let has_no = ec_shards.is_empty(); + master_pb::Heartbeat { + ip: config.ip.clone(), + port: config.port as u32, + grpc_port: config.grpc_port as u32, + data_center: config.data_center.clone(), + rack: config.rack.clone(), + ec_shards, + has_no_ec_shards: has_no, + ..Default::default() + } +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::config::MinFreeSpace; + use crate::config::ReadMode; + use crate::metrics::{ + DISK_SIZE_GAUGE, DISK_SIZE_LABEL_DELETED_BYTES, DISK_SIZE_LABEL_EC, + DISK_SIZE_LABEL_NORMAL, READ_ONLY_LABEL_IS_DISK_SPACE_LOW, + READ_ONLY_LABEL_IS_READ_ONLY, READ_ONLY_LABEL_NO_WRITE_CAN_DELETE, + READ_ONLY_LABEL_NO_WRITE_OR_DELETE, READ_ONLY_VOLUME_GAUGE, + }; + use crate::remote_storage::s3_tier::S3TierRegistry; + use crate::security::{Guard, SigningKey}; + use crate::storage::needle_map::NeedleMapKind; + use crate::storage::types::{DiskType, Version, VolumeId}; + use std::sync::atomic::Ordering; + use std::sync::RwLock; + use std::time::{SystemTime, UNIX_EPOCH}; + + fn test_config() -> HeartbeatConfig { + HeartbeatConfig { + ip: "127.0.0.1".to_string(), + port: 8080, + grpc_port: 18080, + public_url: "127.0.0.1:8080".to_string(), + data_center: "dc1".to_string(), + rack: "rack1".to_string(), + master_addresses: Vec::new(), + pulse_seconds: 5, + } + } + + fn test_state_with_store(store: Store) -> Arc { + Arc::new(VolumeServerState { + store: RwLock::new(store), + guard: RwLock::new(Guard::new( + &[], + SigningKey(vec![]), + 0, + SigningKey(vec![]), + 0, + )), + is_stopping: RwLock::new(false), + maintenance: std::sync::atomic::AtomicBool::new(false), + state_version: std::sync::atomic::AtomicU32::new(0), + concurrent_upload_limit: 0, + concurrent_download_limit: 0, + inflight_upload_data_timeout: std::time::Duration::from_secs(60), + inflight_download_data_timeout: std::time::Duration::from_secs(60), + inflight_upload_bytes: std::sync::atomic::AtomicI64::new(0), + inflight_download_bytes: std::sync::atomic::AtomicI64::new(0), + upload_notify: tokio::sync::Notify::new(), + download_notify: tokio::sync::Notify::new(), + data_center: String::new(), + rack: String::new(), + file_size_limit_bytes: 0, + maintenance_byte_per_second: 0, + is_heartbeating: std::sync::atomic::AtomicBool::new(false), + has_master: true, + pre_stop_seconds: 0, + volume_state_notify: tokio::sync::Notify::new(), + write_queue: std::sync::OnceLock::new(), + s3_tier_registry: std::sync::RwLock::new(S3TierRegistry::new()), + read_mode: ReadMode::Local, + master_url: String::new(), + master_urls: Vec::new(), + self_url: String::new(), + http_client: reqwest::Client::new(), + outgoing_http_scheme: "http".to_string(), + outgoing_grpc_tls: None, + metrics_runtime: std::sync::RwLock::new(Default::default()), + metrics_notify: tokio::sync::Notify::new(), + fix_jpg_orientation: false, + has_slow_read: true, + read_buffer_size_bytes: 4 * 1024 * 1024, + security_file: String::new(), + cli_white_list: vec![], + state_file_path: String::new(), + }) + } + + #[test] + fn test_build_heartbeat_includes_store_identity_and_disk_metadata() { + let temp_dir = tempfile::tempdir().unwrap(); + let dir = temp_dir.path().to_str().unwrap(); + + let mut store = Store::new(NeedleMapKind::InMemory); + store.id = "volume-node-a".to_string(); + store + .add_location( + dir, + dir, + 3, + DiskType::HardDrive, + MinFreeSpace::Percent(1.0), + vec!["fast".to_string(), "ssd".to_string()], + ) + .unwrap(); + store + .add_volume( + VolumeId(7), + "pics", + None, + None, + 0, + DiskType::HardDrive, + Version::current(), + ) + .unwrap(); + + let heartbeat = build_heartbeat(&test_config(), &mut store); + + assert_eq!(heartbeat.id, "volume-node-a"); + assert_eq!(heartbeat.volumes.len(), 1); + assert!(!heartbeat.has_no_volumes); + assert_eq!( + heartbeat.location_uuids, + vec![store.locations[0].directory_uuid.clone()] + ); + assert_eq!(heartbeat.disk_tags.len(), 1); + assert_eq!(heartbeat.disk_tags[0].disk_id, 0); + assert_eq!( + heartbeat.disk_tags[0].tags, + vec!["fast".to_string(), "ssd".to_string()] + ); + } + + #[test] + fn test_build_heartbeat_marks_empty_store_as_has_no_volumes() { + let temp_dir = tempfile::tempdir().unwrap(); + let dir = temp_dir.path().to_str().unwrap(); + + let mut store = Store::new(NeedleMapKind::InMemory); + store.id = "volume-node-b".to_string(); + store + .add_location( + dir, + dir, + 2, + DiskType::HardDrive, + MinFreeSpace::Percent(1.0), + Vec::new(), + ) + .unwrap(); + + let heartbeat = build_heartbeat(&test_config(), &mut store); + + assert!(heartbeat.volumes.is_empty()); + assert!(heartbeat.has_no_volumes); + } + + #[test] + fn test_build_heartbeat_tracks_go_read_only_labels_and_disk_id() { + let temp_dir = tempfile::tempdir().unwrap(); + let dir = temp_dir.path().to_str().unwrap(); + + let mut store = Store::new(NeedleMapKind::InMemory); + store + .add_location( + dir, + dir, + 8, + DiskType::HardDrive, + MinFreeSpace::Percent(1.0), + Vec::new(), + ) + .unwrap(); + store + .add_volume( + VolumeId(17), + "heartbeat_metrics_case", + None, + None, + 0, + DiskType::HardDrive, + Version::current(), + ) + .unwrap(); + store.locations[0] + .is_disk_space_low + .store(true, Ordering::Relaxed); + + { + let (_, volume) = store.find_volume_mut(VolumeId(17)).unwrap(); + volume.set_read_only().unwrap(); + volume.volume_info.files.push(Default::default()); + volume.refresh_remote_write_mode(); + } + + let heartbeat = build_heartbeat(&test_config(), &mut store); + let collection = "heartbeat_metrics_case"; + let disk_type = store.locations[0].disk_type.to_string(); + + assert_eq!(heartbeat.volumes.len(), 1); + assert_eq!(heartbeat.volumes[0].disk_id, 0); + assert_eq!(heartbeat.max_volume_counts[&disk_type], 1); + assert_eq!( + READ_ONLY_VOLUME_GAUGE + .with_label_values(&[collection, READ_ONLY_LABEL_IS_READ_ONLY]) + .get(), + 1.0 + ); + assert_eq!( + READ_ONLY_VOLUME_GAUGE + .with_label_values(&[collection, READ_ONLY_LABEL_NO_WRITE_OR_DELETE]) + .get(), + 0.0 + ); + assert_eq!( + READ_ONLY_VOLUME_GAUGE + .with_label_values(&[collection, READ_ONLY_LABEL_NO_WRITE_CAN_DELETE]) + .get(), + 1.0 + ); + assert_eq!( + READ_ONLY_VOLUME_GAUGE + .with_label_values(&[collection, READ_ONLY_LABEL_IS_DISK_SPACE_LOW]) + .get(), + 1.0 + ); + assert_eq!( + DISK_SIZE_GAUGE + .with_label_values(&[collection, DISK_SIZE_LABEL_NORMAL]) + .get(), + crate::storage::super_block::SUPER_BLOCK_SIZE as f64 + ); + assert_eq!( + DISK_SIZE_GAUGE + .with_label_values(&[collection, DISK_SIZE_LABEL_DELETED_BYTES]) + .get(), + 0.0 + ); + } + + #[test] + fn test_collect_ec_heartbeat_sets_go_metadata_and_ec_metrics() { + let temp_dir = tempfile::tempdir().unwrap(); + let dir = temp_dir.path().to_str().unwrap(); + + let mut store = Store::new(NeedleMapKind::InMemory); + store + .add_location( + dir, + dir, + 8, + DiskType::HardDrive, + MinFreeSpace::Percent(1.0), + Vec::new(), + ) + .unwrap(); + + let shard_path = format!("{}/ec_metrics_case_27.ec00", dir); + std::fs::write(&shard_path, b"ec-shard").unwrap(); + store.locations[0] + .mount_ec_shards(VolumeId(27), "ec_metrics_case", &[0]) + .unwrap(); + + let state = test_state_with_store(store); + let heartbeat = collect_ec_heartbeat(&test_config(), &state); + + assert_eq!(heartbeat.ec_shards.len(), 1); + assert!(!heartbeat.has_no_ec_shards); + assert_eq!(heartbeat.ec_shards[0].disk_id, 0); + assert_eq!( + heartbeat.ec_shards[0].disk_type, + state.store.read().unwrap().locations[0].disk_type.to_string() + ); + assert_eq!(heartbeat.ec_shards[0].ec_index_bits, 1); + assert_eq!(heartbeat.ec_shards[0].shard_sizes, vec![8]); + assert_eq!( + DISK_SIZE_GAUGE + .with_label_values(&["ec_metrics_case", DISK_SIZE_LABEL_EC]) + .get(), + 8.0 + ); + } + + #[test] + fn test_collect_heartbeat_deletes_expired_ec_volumes() { + let temp_dir = tempfile::tempdir().unwrap(); + let dir = temp_dir.path().to_str().unwrap(); + + let mut store = Store::new(NeedleMapKind::InMemory); + store + .add_location( + dir, + dir, + 8, + DiskType::HardDrive, + MinFreeSpace::Percent(1.0), + Vec::new(), + ) + .unwrap(); + + std::fs::write(format!("{}/expired_heartbeat_ec_31.ec00", dir), b"expired").unwrap(); + store.locations[0] + .mount_ec_shards(VolumeId(31), "expired_heartbeat_ec", &[0]) + .unwrap(); + store + .find_ec_volume_mut(VolumeId(31)) + .unwrap() + .expire_at_sec = 1; + + let state = test_state_with_store(store); + let heartbeat = collect_heartbeat(&test_config(), &state); + + assert!(heartbeat.has_no_ec_shards); + assert_eq!(heartbeat.deleted_ec_shards.len(), 1); + assert_eq!(heartbeat.deleted_ec_shards[0].id, 31); + assert!(!state.store.read().unwrap().has_ec_volume(VolumeId(31))); + } + + #[test] + fn test_collect_heartbeat_excludes_expired_volume_until_removal_delay() { + let temp_dir = tempfile::tempdir().unwrap(); + let dir = temp_dir.path().to_str().unwrap(); + + let mut store = Store::new(NeedleMapKind::InMemory); + store + .add_location( + dir, + dir, + 8, + DiskType::HardDrive, + MinFreeSpace::Percent(1.0), + Vec::new(), + ) + .unwrap(); + store.volume_size_limit.store(1, Ordering::Relaxed); + store + .add_volume( + VolumeId(41), + "expired_volume_case", + None, + Some(crate::storage::needle::ttl::TTL::read("20m").unwrap()), + 1024, + DiskType::HardDrive, + Version::current(), + ) + .unwrap(); + let dat_path = { + let (_, volume) = store.find_volume_mut(VolumeId(41)).unwrap(); + volume.set_last_io_error_for_test(None); + volume.set_last_modified_ts_for_test( + SystemTime::now() + .duration_since(UNIX_EPOCH) + .unwrap_or_default() + .as_secs() + .saturating_sub(21 * 60), + ); + volume.dat_path() + }; + std::fs::OpenOptions::new() + .write(true) + .open(&dat_path) + .unwrap() + .set_len((crate::storage::super_block::SUPER_BLOCK_SIZE + 1) as u64) + .unwrap(); + let volume_size_limit = store.volume_size_limit.load(Ordering::Relaxed); + let (_, volume) = store.find_volume(VolumeId(41)).unwrap(); + assert!(volume.is_expired(volume.dat_file_size().unwrap_or(0), volume_size_limit)); + assert!(!volume.is_expired_long_enough(10)); + + let heartbeat = build_heartbeat(&test_config(), &mut store); + + assert!(heartbeat.volumes.is_empty()); + assert!(store.has_volume(VolumeId(41))); + } + + #[test] + fn test_collect_heartbeat_deletes_io_error_volume() { + let temp_dir = tempfile::tempdir().unwrap(); + let dir = temp_dir.path().to_str().unwrap(); + + let mut store = Store::new(NeedleMapKind::InMemory); + store + .add_location( + dir, + dir, + 8, + DiskType::HardDrive, + MinFreeSpace::Percent(1.0), + Vec::new(), + ) + .unwrap(); + store + .add_volume( + VolumeId(51), + "io_error_case", + None, + None, + 0, + DiskType::HardDrive, + Version::current(), + ) + .unwrap(); + let (_, volume) = store.find_volume_mut(VolumeId(51)).unwrap(); + volume.set_last_io_error_for_test(Some("input/output error")); + + let heartbeat = build_heartbeat(&test_config(), &mut store); + + assert!(heartbeat.volumes.is_empty()); + assert!(!store.has_volume(VolumeId(51))); + } + + #[test] + fn test_build_heartbeat_includes_remote_storage_name_and_key() { + let temp_dir = tempfile::tempdir().unwrap(); + let dir = temp_dir.path().to_str().unwrap(); + + let mut store = Store::new(NeedleMapKind::InMemory); + store + .add_location( + dir, + dir, + 8, + DiskType::HardDrive, + MinFreeSpace::Percent(1.0), + Vec::new(), + ) + .unwrap(); + store + .add_volume( + VolumeId(71), + "remote_volume_case", + None, + None, + 0, + DiskType::HardDrive, + Version::current(), + ) + .unwrap(); + let (_, volume) = store.find_volume_mut(VolumeId(71)).unwrap(); + volume.volume_info.files.push(crate::storage::volume::PbRemoteFile { + backend_type: "s3".to_string(), + backend_id: "archive".to_string(), + key: "volumes/71.dat".to_string(), + ..Default::default() + }); + volume.refresh_remote_write_mode(); + + let heartbeat = build_heartbeat(&test_config(), &mut store); + + assert_eq!(heartbeat.volumes.len(), 1); + assert_eq!(heartbeat.volumes[0].remote_storage_name, "s3.archive"); + assert_eq!(heartbeat.volumes[0].remote_storage_key, "volumes/71.dat"); + } + + #[test] + fn test_apply_storage_backends_registers_s3_default_aliases() { + let state = test_state_with_store(Store::new(NeedleMapKind::InMemory)); + crate::remote_storage::s3_tier::global_s3_tier_registry() + .write() + .unwrap() + .clear(); + + apply_storage_backends( + &state, + &[master_pb::StorageBackend { + r#type: "s3".to_string(), + id: "default".to_string(), + properties: std::collections::HashMap::from([ + ("aws_access_key_id".to_string(), "access".to_string()), + ("aws_secret_access_key".to_string(), "secret".to_string()), + ("bucket".to_string(), "bucket-a".to_string()), + ("region".to_string(), "us-west-2".to_string()), + ("endpoint".to_string(), "http://127.0.0.1:8333".to_string()), + ("storage_class".to_string(), "STANDARD".to_string()), + ("force_path_style".to_string(), "false".to_string()), + ]), + }], + ); + + let registry = state.s3_tier_registry.read().unwrap(); + assert!(registry.get("s3.default").is_some()); + assert!(registry.get("s3").is_some()); + let global_registry = crate::remote_storage::s3_tier::global_s3_tier_registry() + .read() + .unwrap(); + assert!(global_registry.get("s3.default").is_some()); + assert!(global_registry.get("s3").is_some()); + } + + #[test] + fn test_apply_storage_backends_ignores_unsupported_types() { + let state = test_state_with_store(Store::new(NeedleMapKind::InMemory)); + crate::remote_storage::s3_tier::global_s3_tier_registry() + .write() + .unwrap() + .clear(); + + apply_storage_backends( + &state, + &[master_pb::StorageBackend { + r#type: "rclone".to_string(), + id: "default".to_string(), + properties: std::collections::HashMap::new(), + }], + ); + + let registry = state.s3_tier_registry.read().unwrap(); + assert!(registry.names().is_empty()); + let global_registry = crate::remote_storage::s3_tier::global_s3_tier_registry() + .read() + .unwrap(); + assert!(global_registry.names().is_empty()); + } + + #[test] + fn test_apply_metrics_push_settings_updates_runtime_state() { + let store = Store::new(NeedleMapKind::InMemory); + let state = test_state_with_store(store); + + assert!(apply_metrics_push_settings(&state, "pushgateway:9091", 15,)); + { + let runtime = state.metrics_runtime.read().unwrap(); + assert_eq!(runtime.push_gateway.address, "pushgateway:9091"); + assert_eq!(runtime.push_gateway.interval_seconds, 15); + } + + assert!(!apply_metrics_push_settings(&state, "pushgateway:9091", 15,)); + } + + #[test] + fn test_duplicate_uuid_retry_delay_matches_go_backoff() { + assert_eq!(duplicate_uuid_retry_delay(0), Duration::from_secs(2)); + assert_eq!(duplicate_uuid_retry_delay(1), Duration::from_secs(4)); + assert_eq!(duplicate_uuid_retry_delay(2), Duration::from_secs(8)); + } + + #[test] + fn test_duplicate_directories_maps_master_uuids_to_paths() { + let temp_dir = tempfile::tempdir().unwrap(); + let dir = temp_dir.path().to_str().unwrap(); + + let mut store = Store::new(NeedleMapKind::InMemory); + store + .add_location( + dir, + dir, + 1, + DiskType::HardDrive, + MinFreeSpace::Percent(1.0), + Vec::new(), + ) + .unwrap(); + + let duplicate_dirs = duplicate_directories( + &store, + &[ + store.locations[0].directory_uuid.clone(), + "missing-uuid".to_string(), + ], + ); + + assert_eq!(duplicate_dirs, vec![dir.to_string()]); + } + + #[test] + fn test_apply_master_volume_options_updates_preallocate_and_size_limit() { + let store = Store::new(NeedleMapKind::InMemory); + store.volume_size_limit.store(1024, Ordering::Relaxed); + + let changed = apply_master_volume_options( + &store, + &master_pb::HeartbeatResponse { + volume_size_limit: 2048, + preallocate: true, + ..Default::default() + }, + ); + + assert!(store.get_preallocate()); + assert_eq!(store.volume_size_limit.load(Ordering::Relaxed), 2048); + assert!(!changed); + } + + #[test] + fn test_diff_ec_shard_delta_messages_reports_mounts_and_unmounts() { + let temp_dir = tempfile::tempdir().unwrap(); + let dir = temp_dir.path().to_str().unwrap(); + + let mut store = Store::new(NeedleMapKind::InMemory); + store + .add_location( + dir, + dir, + 8, + DiskType::HardDrive, + MinFreeSpace::Percent(1.0), + Vec::new(), + ) + .unwrap(); + + let previous = collect_ec_shard_delta_messages(&store); + + std::fs::write(format!("{}/ec_delta_case_81.ec00", dir), b"delta").unwrap(); + store.locations[0] + .mount_ec_shards(VolumeId(81), "ec_delta_case", &[0]) + .unwrap(); + let current = collect_ec_shard_delta_messages(&store); + let (new_ec_shards, deleted_ec_shards) = + diff_ec_shard_delta_messages(&previous, ¤t); + + assert_eq!(new_ec_shards.len(), 1); + assert!(deleted_ec_shards.is_empty()); + assert_eq!(new_ec_shards[0].ec_index_bits, 1); + assert_eq!(new_ec_shards[0].shard_sizes, vec![5]); + + let (new_after_delete, deleted_after_delete) = + diff_ec_shard_delta_messages(¤t, &HashMap::new()); + assert!(new_after_delete.is_empty()); + assert_eq!(deleted_after_delete.len(), 1); + assert_eq!(deleted_after_delete[0].ec_index_bits, 1); + assert_eq!(deleted_after_delete[0].shard_sizes, vec![0]); + } +} diff --git a/seaweed-volume/src/server/memory_status.rs b/seaweed-volume/src/server/memory_status.rs new file mode 100644 index 000000000..92886465f --- /dev/null +++ b/seaweed-volume/src/server/memory_status.rs @@ -0,0 +1,102 @@ +use crate::pb::volume_server_pb; + +pub fn collect_mem_status() -> volume_server_pb::MemStatus { + #[allow(unused_mut)] + let mut mem = volume_server_pb::MemStatus { + goroutines: 1, + ..Default::default() + }; + + #[cfg(target_os = "linux")] + { + if let Some((all, free)) = get_system_memory_linux() { + mem.all = all; + mem.free = free; + mem.used = all.saturating_sub(free); + } + + if let Some(status) = read_process_status_linux() { + if status.threads > 0 { + mem.goroutines = status.threads as i32; + } + if let Some(rss) = status.rss { + mem.self_ = rss; + } + if let Some(heap) = status.data.or(status.rss) { + mem.heap = heap; + } + if let Some(stack) = status.stack { + mem.stack = stack; + } + } + } + + mem +} + +#[cfg(target_os = "linux")] +fn get_system_memory_linux() -> Option<(u64, u64)> { + unsafe { + let mut info: libc::sysinfo = std::mem::zeroed(); + if libc::sysinfo(&mut info) == 0 { + let unit = info.mem_unit as u64; + let total = info.totalram as u64 * unit; + let free = info.freeram as u64 * unit; + return Some((total, free)); + } + } + None +} + +#[cfg(target_os = "linux")] +#[derive(Default)] +struct ProcessStatus { + threads: u64, + rss: Option, + data: Option, + stack: Option, +} + +#[cfg(target_os = "linux")] +fn read_process_status_linux() -> Option { + let status = std::fs::read_to_string("/proc/self/status").ok()?; + let mut out = ProcessStatus::default(); + + for line in status.lines() { + if let Some(value) = line.strip_prefix("Threads:") { + out.threads = value.trim().parse().ok()?; + continue; + } + if let Some(value) = parse_proc_status_kib_field(line, "VmRSS:") { + out.rss = Some(value); + continue; + } + if let Some(value) = parse_proc_status_kib_field(line, "VmData:") { + out.data = Some(value); + continue; + } + if let Some(value) = parse_proc_status_kib_field(line, "VmStk:") { + out.stack = Some(value); + } + } + + Some(out) +} + +#[cfg(target_os = "linux")] +fn parse_proc_status_kib_field(line: &str, prefix: &str) -> Option { + let raw = line.strip_prefix(prefix)?.trim(); + let value = raw.strip_suffix(" kB").unwrap_or(raw).trim(); + value.parse::().ok().map(|kib| kib * 1024) +} + +#[cfg(test)] +mod tests { + use super::collect_mem_status; + + #[test] + fn test_collect_mem_status_reports_live_process_state() { + let mem = collect_mem_status(); + assert!(mem.goroutines > 0); + } +} diff --git a/seaweed-volume/src/server/mod.rs b/seaweed-volume/src/server/mod.rs new file mode 100644 index 000000000..6103b4980 --- /dev/null +++ b/seaweed-volume/src/server/mod.rs @@ -0,0 +1,12 @@ +pub mod debug; +pub mod grpc_client; +pub mod grpc_server; +pub mod handlers; +pub mod heartbeat; +pub mod memory_status; +pub mod profiling; +pub mod request_id; +pub mod server_stats; +pub mod ui; +pub mod volume_server; +pub mod write_queue; diff --git a/seaweed-volume/src/server/profiling.rs b/seaweed-volume/src/server/profiling.rs new file mode 100644 index 000000000..1965d227f --- /dev/null +++ b/seaweed-volume/src/server/profiling.rs @@ -0,0 +1,187 @@ +use std::fs::File; +use std::io::Write; +use std::path::PathBuf; + +use pprof::protos::Message; + +use crate::config::VolumeServerConfig; + +const GO_CPU_PROFILE_FREQUENCY: i32 = 100; +const GO_PPROF_BLOCKLIST: [&str; 4] = ["libc", "libgcc", "pthread", "vdso"]; + +pub struct CpuProfileSession { + output_path: PathBuf, + guard: pprof::ProfilerGuard<'static>, +} + +impl CpuProfileSession { + pub fn start(config: &VolumeServerConfig) -> Result, String> { + if config.cpu_profile.is_empty() { + if !config.mem_profile.is_empty() && !config.pprof { + tracing::warn!( + "--memprofile is not yet supported in the Rust volume server; ignoring '{}'", + config.mem_profile + ); + } + return Ok(None); + } + + if config.pprof { + tracing::info!( + "--pprof is enabled; ignoring --cpuprofile '{}' and --memprofile '{}'", + config.cpu_profile, + config.mem_profile + ); + return Ok(None); + } + + if !config.mem_profile.is_empty() { + tracing::warn!( + "--memprofile is not yet supported in the Rust volume server; only --cpuprofile '{}' will be written", + config.cpu_profile + ); + } + + let guard = pprof::ProfilerGuardBuilder::default() + .frequency(GO_CPU_PROFILE_FREQUENCY) + .blocklist(&GO_PPROF_BLOCKLIST) + .build() + .map_err(|e| { + format!( + "Failed to start CPU profiler '{}': {}", + config.cpu_profile, e + ) + })?; + + Ok(Some(Self { + output_path: PathBuf::from(&config.cpu_profile), + guard, + })) + } + + pub fn finish(self) -> Result<(), String> { + let report = self + .guard + .report() + .build() + .map_err(|e| format!("Failed to build CPU profile report: {}", e))?; + let profile = report + .pprof() + .map_err(|e| format!("Failed to encode CPU profile report: {}", e))?; + + let mut bytes = Vec::new(); + profile + .encode(&mut bytes) + .map_err(|e| format!("Failed to serialize CPU profile report: {}", e))?; + + let mut file = File::create(&self.output_path).map_err(|e| { + format!( + "Failed to create CPU profile '{}': {}", + self.output_path.display(), + e + ) + })?; + file.write_all(&bytes).map_err(|e| { + format!( + "Failed to write CPU profile '{}': {}", + self.output_path.display(), + e + ) + })?; + file.flush().map_err(|e| { + format!( + "Failed to flush CPU profile '{}': {}", + self.output_path.display(), + e + ) + })?; + + Ok(()) + } +} + +#[cfg(test)] +mod tests { + use super::CpuProfileSession; + use crate::config::{NeedleMapKind, ReadMode, VolumeServerConfig}; + use crate::security::tls::TlsPolicy; + + fn sample_config() -> VolumeServerConfig { + VolumeServerConfig { + port: 8080, + grpc_port: 18080, + public_port: 8080, + ip: "127.0.0.1".to_string(), + bind_ip: "127.0.0.1".to_string(), + public_url: "127.0.0.1:8080".to_string(), + id: "127.0.0.1:8080".to_string(), + masters: vec![], + pre_stop_seconds: 0, + idle_timeout: 0, + data_center: String::new(), + rack: String::new(), + index_type: NeedleMapKind::InMemory, + disk_type: String::new(), + folders: vec!["/tmp".to_string()], + folder_max_limits: vec![8], + folder_tags: vec![vec![]], + min_free_spaces: vec![], + disk_types: vec![String::new()], + idx_folder: String::new(), + white_list: vec![], + fix_jpg_orientation: false, + read_mode: ReadMode::Local, + cpu_profile: String::new(), + mem_profile: String::new(), + compaction_byte_per_second: 0, + maintenance_byte_per_second: 0, + file_size_limit_bytes: 0, + concurrent_upload_limit: 0, + concurrent_download_limit: 0, + inflight_upload_data_timeout: std::time::Duration::from_secs(0), + inflight_download_data_timeout: std::time::Duration::from_secs(0), + has_slow_read: false, + read_buffer_size_mb: 4, + ldb_timeout: 0, + pprof: false, + metrics_port: 0, + metrics_ip: String::new(), + debug: false, + debug_port: 0, + ui_enabled: false, + jwt_signing_key: vec![], + jwt_signing_expires_seconds: 0, + jwt_read_signing_key: vec![], + jwt_read_signing_expires_seconds: 0, + https_cert_file: String::new(), + https_key_file: String::new(), + https_ca_file: String::new(), + https_client_enabled: false, + https_client_cert_file: String::new(), + https_client_key_file: String::new(), + https_client_ca_file: String::new(), + grpc_cert_file: String::new(), + grpc_key_file: String::new(), + grpc_ca_file: String::new(), + grpc_allowed_wildcard_domain: String::new(), + grpc_volume_allowed_common_names: vec![], + tls_policy: TlsPolicy::default(), + enable_write_queue: false, + security_file: String::new(), + } + } + + #[test] + fn test_cpu_profile_session_skips_when_disabled() { + let config = sample_config(); + assert!(CpuProfileSession::start(&config).unwrap().is_none()); + } + + #[test] + fn test_cpu_profile_session_skips_when_pprof_enabled() { + let mut config = sample_config(); + config.cpu_profile = "/tmp/cpu.pb".to_string(); + config.pprof = true; + assert!(CpuProfileSession::start(&config).unwrap().is_none()); + } +} diff --git a/seaweed-volume/src/server/request_id.rs b/seaweed-volume/src/server/request_id.rs new file mode 100644 index 000000000..f3e43c560 --- /dev/null +++ b/seaweed-volume/src/server/request_id.rs @@ -0,0 +1,137 @@ +use std::future::Future; +use std::pin::Pin; +use std::task::{Context, Poll}; + +use hyper::http::{self, HeaderValue}; +use tonic::metadata::MetadataValue; +use tonic::{Request, Status}; +use tower::{Layer, Service}; + +tokio::task_local! { + static CURRENT_REQUEST_ID: String; +} + +#[derive(Clone, Debug, Default)] +pub struct GrpcRequestIdLayer; + +#[derive(Clone, Debug)] +pub struct GrpcRequestIdService { + inner: S, +} + +impl Layer for GrpcRequestIdLayer { + type Service = GrpcRequestIdService; + + fn layer(&self, inner: S) -> Self::Service { + GrpcRequestIdService { inner } + } +} + +impl Service> for GrpcRequestIdService +where + S: Service, Response = http::Response> + Send + 'static, + S::Future: Send + 'static, + B: Send + 'static, +{ + type Response = http::Response; + type Error = S::Error; + type Future = Pin> + Send>>; + + fn poll_ready(&mut self, cx: &mut Context<'_>) -> Poll> { + self.inner.poll_ready(cx) + } + + fn call(&mut self, mut request: http::Request) -> Self::Future { + let request_id = match request.headers().get("x-amz-request-id") { + Some(value) => match value.to_str() { + Ok(value) if !value.is_empty() => value.to_owned(), + _ => generate_grpc_request_id(), + }, + None => generate_grpc_request_id(), + }; + + if let Ok(value) = HeaderValue::from_str(&request_id) { + request.headers_mut().insert("x-amz-request-id", value); + } + + let future = self.inner.call(request); + + Box::pin(async move { + let mut response: http::Response = + scope_request_id(request_id.clone(), future).await?; + if let Ok(value) = HeaderValue::from_str(&request_id) { + response.headers_mut().insert("x-amz-request-id", value); + } + Ok(response) + }) + } +} + +pub async fn scope_request_id(request_id: String, future: F) -> T +where + F: Future, +{ + CURRENT_REQUEST_ID.scope(request_id, future).await +} + +pub fn current_request_id() -> Option { + CURRENT_REQUEST_ID.try_with(Clone::clone).ok() +} + +pub fn outgoing_request_id_interceptor(mut request: Request<()>) -> Result, Status> { + if let Some(request_id) = current_request_id() { + let value = MetadataValue::try_from(request_id.as_str()) + .map_err(|_| Status::internal("invalid scoped request id"))?; + request.metadata_mut().insert("x-amz-request-id", value); + } + Ok(request) +} + +pub fn generate_http_request_id() -> String { + use rand::Rng; + + let nanos = std::time::SystemTime::now() + .duration_since(std::time::UNIX_EPOCH) + .unwrap_or_default() + .as_nanos() as u64; + let rand_val: u32 = rand::thread_rng().gen(); + format!("{:X}{:08X}", nanos, rand_val) +} + +fn generate_grpc_request_id() -> String { + uuid::Uuid::new_v4().to_string() +} + +#[cfg(test)] +mod tests { + use super::{current_request_id, outgoing_request_id_interceptor, scope_request_id}; + use tonic::Request; + + #[tokio::test] + async fn test_scope_request_id_exposes_current_value() { + let request_id = "req-123".to_string(); + let current = scope_request_id( + request_id.clone(), + async move { current_request_id().unwrap() }, + ) + .await; + assert_eq!(current, request_id); + } + + #[tokio::test] + async fn test_outgoing_request_id_interceptor_propagates_scope() { + let request = scope_request_id("req-456".to_string(), async move { + outgoing_request_id_interceptor(Request::new(())).unwrap() + }) + .await; + assert_eq!( + request + .metadata() + .get("x-amz-request-id") + .unwrap() + .to_str() + .unwrap(), + "req-456" + ); + } +} diff --git a/seaweed-volume/src/server/server_stats.rs b/seaweed-volume/src/server/server_stats.rs new file mode 100644 index 000000000..054b6d907 --- /dev/null +++ b/seaweed-volume/src/server/server_stats.rs @@ -0,0 +1,248 @@ +use chrono::{Datelike, Local, Timelike}; +use serde::Serialize; +use std::sync::{LazyLock, Mutex}; +use std::time::Instant; + +static START_TIME: LazyLock = LazyLock::new(Instant::now); +static SERVER_STATS: LazyLock = LazyLock::new(ServerStats::default); + +#[derive(Default)] +pub struct ServerStats { + inner: Mutex, +} + +#[derive(Default)] +struct ServerStatsInner { + requests: DurationCounter, + connections: DurationCounter, + assign_requests: DurationCounter, + read_requests: DurationCounter, + write_requests: DurationCounter, + delete_requests: DurationCounter, + bytes_in: DurationCounter, + bytes_out: DurationCounter, +} + +#[derive(Clone, Serialize)] +#[serde(rename_all = "PascalCase")] +pub struct ServerStatsSnapshot { + pub requests: DurationCounterSnapshot, + pub connections: DurationCounterSnapshot, + pub assign_requests: DurationCounterSnapshot, + pub read_requests: DurationCounterSnapshot, + pub write_requests: DurationCounterSnapshot, + pub delete_requests: DurationCounterSnapshot, + pub bytes_in: DurationCounterSnapshot, + pub bytes_out: DurationCounterSnapshot, +} + +#[derive(Clone, Serialize)] +#[serde(rename_all = "PascalCase")] +pub struct DurationCounterSnapshot { + pub minute_counter: RoundRobinCounterSnapshot, + pub hour_counter: RoundRobinCounterSnapshot, + pub day_counter: RoundRobinCounterSnapshot, + pub week_counter: RoundRobinCounterSnapshot, +} + +#[derive(Clone, Serialize)] +#[serde(rename_all = "PascalCase")] +pub struct RoundRobinCounterSnapshot { + pub last_index: i32, + pub values: Vec, + pub counts: Vec, +} + +#[derive(Clone)] +struct DurationCounter { + minute_counter: RoundRobinCounter, + hour_counter: RoundRobinCounter, + day_counter: RoundRobinCounter, + week_counter: RoundRobinCounter, +} + +#[derive(Clone)] +struct RoundRobinCounter { + last_index: i32, + values: Vec, + counts: Vec, +} + +impl Default for DurationCounter { + fn default() -> Self { + Self { + minute_counter: RoundRobinCounter::new(60), + hour_counter: RoundRobinCounter::new(60), + day_counter: RoundRobinCounter::new(24), + week_counter: RoundRobinCounter::new(7), + } + } +} + +impl RoundRobinCounter { + fn new(slots: usize) -> Self { + Self { + last_index: -1, + values: vec![0; slots], + counts: vec![0; slots], + } + } + + fn add(&mut self, index: usize, val: i64) { + if index >= self.values.len() { + return; + } + while self.last_index != index as i32 { + self.last_index = (self.last_index + 1).rem_euclid(self.values.len() as i32); + self.values[self.last_index as usize] = 0; + self.counts[self.last_index as usize] = 0; + } + self.values[index] += val; + self.counts[index] += 1; + } + + fn snapshot(&self) -> RoundRobinCounterSnapshot { + RoundRobinCounterSnapshot { + last_index: self.last_index, + values: self.values.clone(), + counts: self.counts.clone(), + } + } +} + +impl DurationCounter { + fn add_now(&mut self, val: i64) { + let now = Local::now(); + self.minute_counter.add(now.second() as usize, val); + self.hour_counter.add(now.minute() as usize, val); + self.day_counter.add(now.hour() as usize, val); + self.week_counter + .add(now.weekday().num_days_from_sunday() as usize, val); + } + + fn snapshot(&self) -> DurationCounterSnapshot { + DurationCounterSnapshot { + minute_counter: self.minute_counter.snapshot(), + hour_counter: self.hour_counter.snapshot(), + day_counter: self.day_counter.snapshot(), + week_counter: self.week_counter.snapshot(), + } + } +} + +impl ServerStatsInner { + fn snapshot(&self) -> ServerStatsSnapshot { + ServerStatsSnapshot { + requests: self.requests.snapshot(), + connections: self.connections.snapshot(), + assign_requests: self.assign_requests.snapshot(), + read_requests: self.read_requests.snapshot(), + write_requests: self.write_requests.snapshot(), + delete_requests: self.delete_requests.snapshot(), + bytes_in: self.bytes_in.snapshot(), + bytes_out: self.bytes_out.snapshot(), + } + } +} + +impl ServerStats { + fn update(&self, update: F) + where + F: FnOnce(&mut ServerStatsInner), + { + let mut inner = self.inner.lock().unwrap(); + update(&mut inner); + } + + fn snapshot(&self) -> ServerStatsSnapshot { + self.inner.lock().unwrap().snapshot() + } +} + +impl RoundRobinCounterSnapshot { + pub fn to_list(&self) -> Vec { + if self.values.is_empty() { + return Vec::new(); + } + let mut ret = Vec::with_capacity(self.values.len()); + let mut index = self.last_index; + let mut step = self.values.len(); + while step > 0 { + step -= 1; + index += 1; + if index >= self.values.len() as i32 { + index = 0; + } + ret.push(self.values[index as usize]); + } + ret + } +} + +pub fn init_process_start() { + LazyLock::force(&START_TIME); + LazyLock::force(&SERVER_STATS); +} + +pub fn uptime_string() -> String { + let secs = START_TIME.elapsed().as_secs(); + let hours = secs / 3600; + let minutes = (secs % 3600) / 60; + let seconds = secs % 60; + let mut out = String::new(); + if hours > 0 { + out.push_str(&format!("{}h", hours)); + } + if hours > 0 || minutes > 0 { + out.push_str(&format!("{}m", minutes)); + } + out.push_str(&format!("{}s", seconds)); + out +} + +pub fn snapshot() -> ServerStatsSnapshot { + SERVER_STATS.snapshot() +} + +pub fn record_request_open() { + SERVER_STATS.update(|inner| inner.requests.add_now(1)); +} + +pub fn record_request_close() { + SERVER_STATS.update(|inner| inner.requests.add_now(-1)); +} + +pub fn record_connection_open() { + SERVER_STATS.update(|inner| inner.connections.add_now(1)); +} + +pub fn record_connection_close() { + SERVER_STATS.update(|inner| inner.connections.add_now(-1)); +} + +pub fn record_read_request() { + SERVER_STATS.update(|inner| inner.read_requests.add_now(1)); +} + +pub fn record_write_request() { + SERVER_STATS.update(|inner| inner.write_requests.add_now(1)); +} + +pub fn record_delete_request() { + SERVER_STATS.update(|inner| inner.delete_requests.add_now(1)); +} + +pub fn record_bytes_in(bytes: i64) { + SERVER_STATS.update(|inner| inner.bytes_in.add_now(bytes)); +} + +pub fn record_bytes_out(bytes: i64) { + SERVER_STATS.update(|inner| inner.bytes_out.add_now(bytes)); +} + +#[cfg(test)] +pub fn reset_for_tests() { + LazyLock::force(&START_TIME); + let mut inner = SERVER_STATS.inner.lock().unwrap(); + *inner = ServerStatsInner::default(); +} diff --git a/seaweed-volume/src/server/ui.rs b/seaweed-volume/src/server/ui.rs new file mode 100644 index 000000000..f1f830a56 --- /dev/null +++ b/seaweed-volume/src/server/ui.rs @@ -0,0 +1,507 @@ +use std::fmt::Write as _; + +use crate::server::server_stats; +use crate::server::volume_server::VolumeServerState; +use crate::storage::store::Store; + +pub struct EmbeddedAsset { + pub content_type: &'static str, + pub bytes: &'static [u8], +} + +struct UiDiskRow { + dir: String, + disk_type: String, + all: u64, + free: u64, + used: u64, +} + +struct UiVolumeRow { + id: u32, + collection: String, + disk_type: String, + size: u64, + file_count: i64, + delete_count: i64, + deleted_byte_count: u64, + ttl: String, + read_only: bool, + version: u32, + remote_storage_name: String, + remote_storage_key: String, +} + +struct UiEcShardRow { + shard_id: u8, + size: u64, +} + +struct UiEcVolumeRow { + volume_id: u32, + collection: String, + size: u64, + shards: Vec, + created_at: String, +} + +pub fn favicon_asset() -> EmbeddedAsset { + EmbeddedAsset { + content_type: "image/x-icon", + bytes: include_bytes!(concat!( + env!("CARGO_MANIFEST_DIR"), + "/../weed/static/favicon.ico" + )), + } +} + +pub fn lookup_static_asset(path: &str) -> Option { + let path = path.trim_start_matches('/'); + let asset = match path { + "bootstrap/3.3.1/css/bootstrap.min.css" => EmbeddedAsset { + content_type: "text/css; charset=utf-8", + bytes: include_bytes!(concat!( + env!("CARGO_MANIFEST_DIR"), + "/../weed/static/bootstrap/3.3.1/css/bootstrap.min.css" + )), + }, + "bootstrap/3.3.1/fonts/glyphicons-halflings-regular.eot" => EmbeddedAsset { + content_type: "application/vnd.ms-fontobject", + bytes: include_bytes!(concat!( + env!("CARGO_MANIFEST_DIR"), + "/../weed/static/bootstrap/3.3.1/fonts/glyphicons-halflings-regular.eot" + )), + }, + "bootstrap/3.3.1/fonts/glyphicons-halflings-regular.svg" => EmbeddedAsset { + content_type: "image/svg+xml", + bytes: include_bytes!(concat!( + env!("CARGO_MANIFEST_DIR"), + "/../weed/static/bootstrap/3.3.1/fonts/glyphicons-halflings-regular.svg" + )), + }, + "bootstrap/3.3.1/fonts/glyphicons-halflings-regular.ttf" => EmbeddedAsset { + content_type: "font/ttf", + bytes: include_bytes!(concat!( + env!("CARGO_MANIFEST_DIR"), + "/../weed/static/bootstrap/3.3.1/fonts/glyphicons-halflings-regular.ttf" + )), + }, + "bootstrap/3.3.1/fonts/glyphicons-halflings-regular.woff" => EmbeddedAsset { + content_type: "font/woff", + bytes: include_bytes!(concat!( + env!("CARGO_MANIFEST_DIR"), + "/../weed/static/bootstrap/3.3.1/fonts/glyphicons-halflings-regular.woff" + )), + }, + "images/folder.gif" => EmbeddedAsset { + content_type: "image/gif", + bytes: include_bytes!(concat!( + env!("CARGO_MANIFEST_DIR"), + "/../weed/static/images/folder.gif" + )), + }, + "javascript/jquery-3.6.0.min.js" => EmbeddedAsset { + content_type: "application/javascript; charset=utf-8", + bytes: include_bytes!(concat!( + env!("CARGO_MANIFEST_DIR"), + "/../weed/static/javascript/jquery-3.6.0.min.js" + )), + }, + "javascript/jquery-sparklines/2.1.2/jquery.sparkline.min.js" => EmbeddedAsset { + content_type: "application/javascript; charset=utf-8", + bytes: include_bytes!(concat!( + env!("CARGO_MANIFEST_DIR"), + "/../weed/static/javascript/jquery-sparklines/2.1.2/jquery.sparkline.min.js" + )), + }, + "seaweed50x50.png" => EmbeddedAsset { + content_type: "image/png", + bytes: include_bytes!(concat!( + env!("CARGO_MANIFEST_DIR"), + "/../weed/static/seaweed50x50.png" + )), + }, + _ => return None, + }; + Some(asset) +} + +pub fn render_volume_server_html(state: &VolumeServerState) -> String { + let counters = server_stats::snapshot(); + let (disk_rows, volume_rows, remote_volume_rows, ec_volume_rows) = { + let store = state.store.read().unwrap(); + collect_ui_data(&store) + }; + + let masters = if state.master_urls.is_empty() { + "[]".to_string() + } else { + format!("[{}]", state.master_urls.join(" ")) + }; + let uptime = server_stats::uptime_string(); + let read_week = join_i64(&counters.read_requests.week_counter.to_list()); + let read_day = join_i64(&counters.read_requests.day_counter.to_list()); + let read_hour = join_i64(&counters.read_requests.hour_counter.to_list()); + let read_minute = join_i64(&counters.read_requests.minute_counter.to_list()); + + let mut disk_rows_html = String::new(); + for disk in &disk_rows { + let _ = write!( + disk_rows_html, + "{}{}{}{}{:.2}%", + escape_html(&disk.dir), + escape_html(&disk.disk_type), + bytes_to_human_readable(disk.all), + bytes_to_human_readable(disk.free), + percent_from(disk.all, disk.used), + ); + } + + let mut volume_rows_html = String::new(); + for volume in &volume_rows { + let _ = write!( + volume_rows_html, + "{}{}{}{}{}{} / {}{}{}{}", + volume.id, + escape_html(&volume.collection), + escape_html(&volume.disk_type), + bytes_to_human_readable(volume.size), + volume.file_count, + volume.delete_count, + bytes_to_human_readable(volume.deleted_byte_count), + escape_html(&volume.ttl), + volume.read_only, + volume.version, + ); + } + + let remote_section = if remote_volume_rows.is_empty() { + String::new() + } else { + let mut remote_rows_html = String::new(); + for volume in &remote_volume_rows { + let _ = write!( + remote_rows_html, + "{}{}{}{}{} / {}{}{}", + volume.id, + escape_html(&volume.collection), + bytes_to_human_readable(volume.size), + volume.file_count, + volume.delete_count, + bytes_to_human_readable(volume.deleted_byte_count), + escape_html(&volume.remote_storage_name), + escape_html(&volume.remote_storage_key), + ); + } + format!( + r#"
+

Remote Volumes

+ + + + + + + + + + + + + {} +
IdCollectionSizeFilesTrashRemoteKey
+
"#, + remote_rows_html + ) + }; + + let ec_section = if ec_volume_rows.is_empty() { + String::new() + } else { + let mut ec_rows_html = String::new(); + for ec in &ec_volume_rows { + let mut shard_labels = String::new(); + for shard in &ec.shards { + let _ = write!( + shard_labels, + "{}: {}", + shard.shard_id, + bytes_to_human_readable(shard.size) + ); + } + let _ = write!( + ec_rows_html, + "{}{}{}{}{}", + ec.volume_id, + escape_html(&ec.collection), + bytes_to_human_readable(ec.size), + shard_labels, + escape_html(&ec.created_at), + ); + } + format!( + r#"
+

Erasure Coding Shards

+ + + + + + + + + + + {} +
IdCollectionTotal SizeShard DetailsCreatedAt
+
"#, + ec_rows_html + ) + }; + + format!( + r#" + + + SeaweedFS {version} + + + + + + + +
+ + +
+
+

Disk Stats

+ + + + + + + + + + + {disk_rows_html} +
PathDiskTotalFreeUsage
+
+ +
+

System Stats

+ + + + + + + +
Masters{masters}
Weekly # ReadRequests{read_week}
Daily # ReadRequests{read_day}
Hourly # ReadRequests{read_hour}
Last Minute # ReadRequests{read_minute}
Up Time{uptime}
+
+
+ +
+

Volumes

+ + + + + + + + + + + + + + + {volume_rows_html} +
IdCollectionDiskData SizeFilesTrashTTLReadOnlyVersion
+
+ + {remote_section} + {ec_section} +
+ +"#, + version = escape_html(crate::version::version()), + disk_rows_html = disk_rows_html, + masters = escape_html(&masters), + read_week = read_week, + read_day = read_day, + read_hour = read_hour, + read_minute = read_minute, + uptime = escape_html(&uptime), + volume_rows_html = volume_rows_html, + remote_section = remote_section, + ec_section = ec_section, + ) +} + +fn collect_ui_data( + store: &Store, +) -> ( + Vec, + Vec, + Vec, + Vec, +) { + let mut disk_rows = Vec::new(); + let mut volumes = Vec::new(); + let mut remote_volumes = Vec::new(); + let mut ec_volumes = Vec::new(); + + for loc in &store.locations { + let dir = absolute_display_path(&loc.directory); + let (all, free) = crate::storage::disk_location::get_disk_stats(&dir); + disk_rows.push(UiDiskRow { + dir, + disk_type: loc.disk_type.to_string(), + all, + free, + used: all.saturating_sub(free), + }); + + for (_, volume) in loc.volumes() { + let (remote_storage_name, remote_storage_key) = volume.remote_storage_name_key(); + let row = UiVolumeRow { + id: volume.id.0, + collection: volume.collection.clone(), + disk_type: loc.disk_type.to_string(), + size: volume.content_size(), + file_count: volume.file_count(), + delete_count: volume.deleted_count(), + deleted_byte_count: volume.deleted_size(), + ttl: volume.super_block.ttl.to_string(), + read_only: volume.is_read_only(), + version: volume.version().0 as u32, + remote_storage_name, + remote_storage_key, + }; + if row.remote_storage_name.is_empty() { + volumes.push(row); + } else { + remote_volumes.push(row); + } + } + + for (_, ec_volume) in loc.ec_volumes() { + let mut shards = Vec::new(); + let mut total_size = 0u64; + let mut created_at = String::from("-"); + for shard in ec_volume.shards.iter().flatten() { + let shard_size = shard.file_size().max(0) as u64; + total_size = total_size.saturating_add(shard_size); + shards.push(UiEcShardRow { + shard_id: shard.shard_id, + size: shard_size, + }); + if created_at == "-" { + if let Ok(metadata) = std::fs::metadata(shard.file_name()) { + if let Ok(modified) = metadata.modified() { + let ts: chrono::DateTime = modified.into(); + created_at = ts.format("%Y-%m-%d %H:%M").to_string(); + } + } + } + } + let preferred_size = ec_volume.dat_file_size.max(0) as u64; + ec_volumes.push(UiEcVolumeRow { + volume_id: ec_volume.volume_id.0, + collection: ec_volume.collection.clone(), + size: preferred_size.max(total_size), + shards, + created_at, + }); + } + } + + disk_rows.sort_by(|left, right| left.dir.cmp(&right.dir)); + volumes.sort_by_key(|row| row.id); + remote_volumes.sort_by_key(|row| row.id); + ec_volumes.sort_by_key(|row| row.volume_id); + + (disk_rows, volumes, remote_volumes, ec_volumes) +} + +fn absolute_display_path(path: &str) -> String { + let p = std::path::Path::new(path); + if p.is_absolute() { + return path.to_string(); + } + std::env::current_dir() + .map(|cwd| cwd.join(p).to_string_lossy().to_string()) + .unwrap_or_else(|_| path.to_string()) +} + +fn join_i64(values: &[i64]) -> String { + values + .iter() + .map(std::string::ToString::to_string) + .collect::>() + .join(",") +} + +fn percent_from(total: u64, part: u64) -> f64 { + if total == 0 { + return 0.0; + } + (part as f64 / total as f64) * 100.0 +} + +fn bytes_to_human_readable(bytes: u64) -> String { + const UNIT: u64 = 1024; + if bytes < UNIT { + return format!("{} B", bytes); + } + + let mut div = UNIT; + let mut exp = 0usize; + let mut n = bytes / UNIT; + while n >= UNIT { + div *= UNIT; + n /= UNIT; + exp += 1; + } + + format!( + "{:.2} {}iB", + bytes as f64 / div as f64, + ["K", "M", "G", "T", "P", "E"][exp] + ) +} + +fn escape_html(input: &str) -> String { + input + .replace('&', "&") + .replace('<', "<") + .replace('>', ">") + .replace('"', """) +} diff --git a/seaweed-volume/src/server/volume_server.rs b/seaweed-volume/src/server/volume_server.rs new file mode 100644 index 000000000..90436dc01 --- /dev/null +++ b/seaweed-volume/src/server/volume_server.rs @@ -0,0 +1,394 @@ +//! VolumeServer: the main HTTP server for volume operations. +//! +//! Routes: +//! GET/HEAD /{vid},{fid} — read a file +//! POST/PUT /{vid},{fid} — write a file +//! DELETE /{vid},{fid} — delete a file +//! GET /status — server status +//! GET /healthz — health check +//! +//! Matches Go's server/volume_server.go. + +use std::net::SocketAddr; +use std::sync::atomic::{AtomicBool, AtomicI64, AtomicU32, Ordering}; +use std::sync::{Arc, RwLock}; + +use axum::{ + extract::{connect_info::ConnectInfo, Request, State}, + http::{header, HeaderValue, Method, StatusCode}, + middleware::{self, Next}, + response::{IntoResponse, Response}, + routing::{any, get}, + Router, +}; + +use crate::config::ReadMode; +use crate::security::Guard; +use crate::storage::store::Store; + +use super::grpc_client::OutgoingGrpcTlsConfig; +use super::handlers; +use super::write_queue::WriteQueue; + +#[derive(Clone, Debug, Default)] +pub struct RuntimeMetricsConfig { + pub push_gateway: crate::metrics::PushGatewayConfig, +} + +/// Shared state for the volume server. +pub struct VolumeServerState { + pub store: RwLock, + pub guard: RwLock, + pub is_stopping: RwLock, + /// Maintenance mode flag. + pub maintenance: AtomicBool, + /// State version — incremented on each SetState call. + pub state_version: AtomicU32, + /// Throttling: concurrent upload/download limits (in bytes, 0 = disabled). + pub concurrent_upload_limit: i64, + pub concurrent_download_limit: i64, + pub inflight_upload_data_timeout: std::time::Duration, + pub inflight_download_data_timeout: std::time::Duration, + /// Current in-flight upload/download bytes. + pub inflight_upload_bytes: AtomicI64, + pub inflight_download_bytes: AtomicI64, + /// Notify waiters when inflight bytes decrease. + pub upload_notify: tokio::sync::Notify, + pub download_notify: tokio::sync::Notify, + /// Data center name from config. + pub data_center: String, + /// Rack name from config. + pub rack: String, + /// File size limit in bytes (0 = no limit). + pub file_size_limit_bytes: i64, + /// Default IO rate limit for maintenance copy/replication work. + pub maintenance_byte_per_second: i64, + /// Whether the server is connected to master (heartbeat active). + pub is_heartbeating: AtomicBool, + /// Whether master addresses are configured. + pub has_master: bool, + /// Seconds to wait before shutting down servers (graceful drain). + pub pre_stop_seconds: u32, + /// Notify heartbeat to send an immediate update when volume state changes. + pub volume_state_notify: tokio::sync::Notify, + /// Optional batched write queue for improved throughput under load. + pub write_queue: std::sync::OnceLock, + /// Registry of S3 tier backends for tiered storage operations. + pub s3_tier_registry: std::sync::RwLock, + /// Read mode: local, proxy, or redirect for non-local volumes. + pub read_mode: ReadMode, + /// First master address for volume lookups (e.g., "localhost:9333"). + pub master_url: String, + /// Seed master addresses for UI rendering. + pub master_urls: Vec, + /// This server's own address (ip:port) for filtering self from lookup results. + pub self_url: String, + /// HTTP client for proxy requests and master lookups. + pub http_client: reqwest::Client, + /// Scheme used for outgoing master and peer HTTP requests ("http" or "https"). + pub outgoing_http_scheme: String, + /// Optional client TLS material for outgoing gRPC connections. + pub outgoing_grpc_tls: Option, + /// Metrics push settings learned from master heartbeat responses. + pub metrics_runtime: std::sync::RwLock, + pub metrics_notify: tokio::sync::Notify, + /// Whether JPEG uploads should be normalized using EXIF orientation. + pub fix_jpg_orientation: bool, + /// Read tuning flags for large-file streaming. + pub has_slow_read: bool, + pub read_buffer_size_bytes: usize, + /// Path to security.toml — stored for SIGHUP reload. + pub security_file: String, + /// Original CLI whitelist entries — stored for SIGHUP reload. + pub cli_white_list: Vec, + /// Path to state.pb file for persisting VolumeServerState across restarts. + pub state_file_path: String, +} + +impl VolumeServerState { + /// Check if the server is in maintenance mode; return gRPC error if so. + pub fn check_maintenance(&self) -> Result<(), tonic::Status> { + if self.maintenance.load(Ordering::Relaxed) { + let id = self.store.read().unwrap().id.clone(); + return Err(tonic::Status::unavailable(format!( + "volume server {} is in maintenance mode", + id + ))); + } + Ok(()) + } +} + +pub fn build_metrics_router() -> Router { + Router::new().route("/metrics", get(handlers::metrics_handler)) +} + +pub fn normalize_outgoing_http_url(scheme: &str, raw_target: &str) -> Result { + if raw_target.starts_with("http://") || raw_target.starts_with("https://") { + let mut url = reqwest::Url::parse(raw_target) + .map_err(|e| format!("invalid url {}: {}", raw_target, e))?; + url.set_scheme(scheme) + .map_err(|_| format!("invalid scheme {}", scheme))?; + return Ok(url.to_string()); + } + Ok(format!("{}://{}", scheme, raw_target)) +} + +fn request_remote_addr(request: &Request) -> Option { + request + .extensions() + .get::>() + .map(|info| info.0) +} + +fn request_is_whitelisted(state: &VolumeServerState, request: &Request) -> bool { + request_remote_addr(request) + .map(|remote_addr| { + state + .guard + .read() + .unwrap() + .check_whitelist(&remote_addr.to_string()) + }) + .unwrap_or(true) +} + +/// Middleware: set Server header, echo x-amz-request-id, set CORS if Origin present. +async fn common_headers_middleware(request: Request, next: Next) -> Response { + let origin = request.headers().get("origin").cloned(); + let request_id = super::request_id::generate_http_request_id(); + + let mut response = + super::request_id::scope_request_id( + request_id.clone(), + async move { next.run(request).await }, + ) + .await; + + let headers = response.headers_mut(); + if let Ok(val) = HeaderValue::from_str(crate::version::server_header()) { + headers.insert("Server", val); + } + + if let Ok(val) = HeaderValue::from_str(&request_id) { + headers.insert("X-Request-Id", val.clone()); + headers.insert("x-amz-request-id", val); + } + + if origin.is_some() { + headers.insert("Access-Control-Allow-Origin", HeaderValue::from_static("*")); + headers.insert( + "Access-Control-Allow-Credentials", + HeaderValue::from_static("true"), + ); + } + + response +} + +/// Admin store handler — dispatches based on HTTP method. +/// Matches Go's privateStoreHandler: GET/HEAD → read, POST/PUT → write, +/// DELETE → delete, OPTIONS → CORS headers, anything else → 400. +async fn admin_store_handler(state: State>, request: Request) -> Response { + let start = std::time::Instant::now(); + let method = request.method().clone(); + let mut method_str = method.as_str().to_string(); + let request_bytes = request + .headers() + .get(header::CONTENT_LENGTH) + .and_then(|value| value.to_str().ok()) + .and_then(|value| value.parse::().ok()) + .filter(|value| *value > 0) + .unwrap_or(0); + super::server_stats::record_request_open(); + crate::metrics::INFLIGHT_REQUESTS_GAUGE + .with_label_values(&[&method_str]) + .inc(); + let whitelist_rejected = matches!(method, Method::POST | Method::PUT | Method::DELETE) + && !request_is_whitelisted(&state, &request); + let response = match method.clone() { + _ if whitelist_rejected => StatusCode::UNAUTHORIZED.into_response(), + Method::GET | Method::HEAD => { + super::server_stats::record_read_request(); + handlers::get_or_head_handler_from_request(state, request).await + } + Method::POST | Method::PUT => { + super::server_stats::record_write_request(); + if request_bytes > 0 { + super::server_stats::record_bytes_in(request_bytes); + } + handlers::post_handler(state, request).await + } + Method::DELETE => { + super::server_stats::record_delete_request(); + handlers::delete_handler(state, request).await + } + Method::OPTIONS => { + super::server_stats::record_read_request(); + admin_options_response() + } + _ => { + let method_name = request.method().to_string(); + let query = request.uri().query().map(|q| q.to_string()); + method_str = "INVALID".to_string(); + handlers::json_error_with_query( + StatusCode::BAD_REQUEST, + format!("unsupported method {}", method_name), + query.as_deref(), + ) + } + }; + if method == Method::GET { + if let Some(response_bytes) = response + .headers() + .get(header::CONTENT_LENGTH) + .and_then(|value| value.to_str().ok()) + .and_then(|value| value.parse::().ok()) + .filter(|value| *value > 0) + { + super::server_stats::record_bytes_out(response_bytes); + } + } + super::server_stats::record_request_close(); + crate::metrics::INFLIGHT_REQUESTS_GAUGE + .with_label_values(&[&method_str]) + .dec(); + crate::metrics::REQUEST_COUNTER + .with_label_values(&[&method_str, response.status().as_str()]) + .inc(); + crate::metrics::REQUEST_DURATION + .with_label_values(&[&method_str]) + .observe(start.elapsed().as_secs_f64()); + response +} + +/// Public store handler — dispatches based on HTTP method. +/// Matches Go's publicReadOnlyHandler: GET/HEAD → read, OPTIONS → CORS, +/// anything else → 200 (passthrough no-op). +async fn public_store_handler(state: State>, request: Request) -> Response { + let start = std::time::Instant::now(); + let method = request.method().clone(); + let method_str = method.as_str().to_string(); + super::server_stats::record_request_open(); + crate::metrics::INFLIGHT_REQUESTS_GAUGE + .with_label_values(&[&method_str]) + .inc(); + let response = match method.clone() { + Method::GET | Method::HEAD => { + super::server_stats::record_read_request(); + handlers::get_or_head_handler_from_request(state, request).await + } + Method::OPTIONS => { + super::server_stats::record_read_request(); + public_options_response() + } + _ => StatusCode::OK.into_response(), + }; + if method == Method::GET { + if let Some(response_bytes) = response + .headers() + .get(header::CONTENT_LENGTH) + .and_then(|value| value.to_str().ok()) + .and_then(|value| value.parse::().ok()) + .filter(|value| *value > 0) + { + super::server_stats::record_bytes_out(response_bytes); + } + } + super::server_stats::record_request_close(); + crate::metrics::INFLIGHT_REQUESTS_GAUGE + .with_label_values(&[&method_str]) + .dec(); + crate::metrics::REQUEST_COUNTER + .with_label_values(&[&method_str, response.status().as_str()]) + .inc(); + crate::metrics::REQUEST_DURATION + .with_label_values(&[&method_str]) + .observe(start.elapsed().as_secs_f64()); + response +} + +/// Build OPTIONS response for admin port. +fn admin_options_response() -> Response { + let mut response = StatusCode::OK.into_response(); + let headers = response.headers_mut(); + headers.insert( + "Access-Control-Allow-Methods", + HeaderValue::from_static("PUT, POST, GET, DELETE, OPTIONS"), + ); + headers.insert( + "Access-Control-Allow-Headers", + HeaderValue::from_static("*"), + ); + response +} + +/// Build OPTIONS response for public port. +fn public_options_response() -> Response { + let mut response = StatusCode::OK.into_response(); + let headers = response.headers_mut(); + headers.insert( + "Access-Control-Allow-Methods", + HeaderValue::from_static("GET, OPTIONS"), + ); + headers.insert( + "Access-Control-Allow-Headers", + HeaderValue::from_static("*"), + ); + response +} + +/// Build the admin (private) HTTP router — supports all operations. +/// UI route is only registered when no signing keys are configured, +/// matching Go's `if signingKey == "" || enableUiAccess` check. +pub fn build_admin_router(state: Arc) -> Router { + let guard = state.guard.read().unwrap(); + // This helper can only derive the default Go behavior from the guard state: + // UI stays enabled when the write signing key is empty. The explicit + // `access.ui` override is handled by `build_admin_router_with_ui(...)`. + let ui_enabled = guard.signing_key.0.is_empty(); + drop(guard); + build_admin_router_with_ui(state, ui_enabled) +} + +/// Build the admin router with an explicit UI exposure flag. +pub fn build_admin_router_with_ui(state: Arc, ui_enabled: bool) -> Router { + let mut router = Router::new() + .route("/status", get(handlers::status_handler)) + .route("/healthz", get(handlers::healthz_handler)) + .route("/favicon.ico", get(handlers::favicon_handler)) + .route( + "/seaweedfsstatic/*path", + get(handlers::static_asset_handler), + ) + .route("/", any(admin_store_handler)) + .route("/:path", any(admin_store_handler)) + .route("/:vid/:fid", any(admin_store_handler)) + .route("/:vid/:fid/:filename", any(admin_store_handler)) + .fallback(admin_store_handler); + if ui_enabled { + // Note: /stats/* endpoints are commented out in Go's volume_server.go (L130-134). + // Only the UI endpoint is registered when UI access is enabled. + router = router.route("/ui/index.html", get(handlers::ui_handler)); + } + router + .layer(middleware::from_fn(common_headers_middleware)) + .with_state(state) +} + +/// Build the public (read-only) HTTP router — only GET/HEAD. +pub fn build_public_router(state: Arc) -> Router { + Router::new() + .route("/favicon.ico", get(handlers::favicon_handler)) + .route( + "/seaweedfsstatic/*path", + get(handlers::static_asset_handler), + ) + .route("/", any(public_store_handler)) + .route("/:path", any(public_store_handler)) + .route("/:vid/:fid", any(public_store_handler)) + .route("/:vid/:fid/:filename", any(public_store_handler)) + .fallback(public_store_handler) + .layer(middleware::from_fn(common_headers_middleware)) + .with_state(state) +} diff --git a/seaweed-volume/src/server/write_queue.rs b/seaweed-volume/src/server/write_queue.rs new file mode 100644 index 000000000..112ae5684 --- /dev/null +++ b/seaweed-volume/src/server/write_queue.rs @@ -0,0 +1,330 @@ +//! Async batched write processing for the volume server. +//! +//! Instead of each upload handler directly calling `write_needle` and syncing, +//! writes are submitted to a queue. A background worker drains the queue in +//! batches (up to 128 entries), groups them by volume ID, processes them +//! together, and syncs once per volume for the entire batch. + +use std::sync::Arc; + +use tokio::sync::{mpsc, oneshot}; +use tracing::debug; + +use crate::storage::needle::needle::Needle; +use crate::storage::types::{Size, VolumeId}; +use crate::storage::volume::VolumeError; + +use super::volume_server::VolumeServerState; + +/// Result of a single write operation: (offset, size, is_unchanged). +pub type WriteResult = Result<(u64, Size, bool), VolumeError>; + +/// A request to write a needle, submitted to the write queue. +pub struct WriteRequest { + pub volume_id: VolumeId, + pub needle: Needle, + pub response_tx: oneshot::Sender, +} + +/// Maximum number of write requests to batch together. +const MAX_BATCH_SIZE: usize = 128; + +/// Maximum bytes to accumulate per batch before breaking (matches Go's 4MB limit). +/// This prevents large writes from accumulating unbounded latency. +const MAX_BATCH_BYTES: usize = 4 * 1024 * 1024; + +/// Handle for submitting write requests to the background worker. +#[derive(Clone)] +pub struct WriteQueue { + tx: mpsc::Sender, +} + +impl WriteQueue { + /// Create a new write queue and spawn the background worker. + /// + /// `capacity` controls the channel buffer size (backpressure kicks in when full). + /// The worker holds a reference to `state` for accessing the store. + pub fn new(state: Arc, capacity: usize) -> Self { + let (tx, rx) = mpsc::channel(capacity); + let worker = WriteQueueWorker { rx, state }; + tokio::spawn(worker.run()); + WriteQueue { tx } + } + + /// Submit a write request and wait for the result. + /// + /// Returns `Err` if the worker has shut down or the response channel was dropped. + pub async fn submit(&self, volume_id: VolumeId, needle: Needle) -> WriteResult { + let (response_tx, response_rx) = oneshot::channel(); + let request = WriteRequest { + volume_id, + needle, + response_tx, + }; + + // Send to queue; this awaits if the channel is full (backpressure). + if self.tx.send(request).await.is_err() { + return Err(VolumeError::Io(std::io::Error::new( + std::io::ErrorKind::BrokenPipe, + "write queue worker has shut down", + ))); + } + + // Wait for the worker to process our request. + match response_rx.await { + Ok(result) => result, + Err(_) => Err(VolumeError::Io(std::io::Error::new( + std::io::ErrorKind::BrokenPipe, + "write queue worker dropped response channel", + ))), + } + } +} + +/// Background worker that drains write requests and processes them in batches. +struct WriteQueueWorker { + rx: mpsc::Receiver, + state: Arc, +} + +impl WriteQueueWorker { + async fn run(mut self) { + debug!("write queue worker started"); + + loop { + // Wait for the first request (blocks until one arrives or channel closes). + let first = match self.rx.recv().await { + Some(req) => req, + None => { + debug!("write queue channel closed, worker exiting"); + return; + } + }; + + // Drain as many additional requests as available, up to MAX_BATCH_SIZE + // or MAX_BATCH_BYTES (matches Go: 128 requests or 4MB, whichever comes first). + let mut batch = Vec::with_capacity(MAX_BATCH_SIZE); + let mut batch_bytes: usize = first.needle.data.len(); + batch.push(first); + + while batch.len() < MAX_BATCH_SIZE && batch_bytes < MAX_BATCH_BYTES { + match self.rx.try_recv() { + Ok(req) => { + batch_bytes += req.needle.data.len(); + batch.push(req); + } + Err(_) => break, + } + } + + let batch_size = batch.len(); + debug!("processing write batch of {} requests", batch_size); + + // Process the batch in spawn_blocking since write_needle does file I/O. + let state = self.state.clone(); + let _ = tokio::task::spawn_blocking(move || { + process_batch(state, batch); + }) + .await; + } + } +} + +/// Process a batch of write requests, grouped by volume ID. +/// +/// Groups writes by volume to minimize the number of store lock acquisitions, +/// then sends results back via each request's oneshot channel. +fn process_batch(state: Arc, batch: Vec) { + // Group requests by volume ID for efficient processing. + // We use a Vec of (VolumeId, Vec<(Needle, Sender)>) to preserve order + // and avoid requiring Hash on VolumeId. + let mut groups: Vec<(VolumeId, Vec<(Needle, oneshot::Sender)>)> = Vec::new(); + + for req in batch { + let vid = req.volume_id; + if let Some(group) = groups.iter_mut().find(|(v, _)| *v == vid) { + group.1.push((req.needle, req.response_tx)); + } else { + groups.push((vid, vec![(req.needle, req.response_tx)])); + } + } + + // Process each volume group under a single store lock. + let mut store = state.store.write().unwrap(); + + for (vid, entries) in groups { + for (mut needle, response_tx) in entries { + let result = store.write_volume_needle(vid, &mut needle); + // Send result back; ignore error if receiver dropped. + let _ = response_tx.send(result); + } + } +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::storage::types::VolumeId; + + /// Helper to create a minimal VolumeServerState for testing. + fn make_test_state() -> Arc { + use crate::security::{Guard, SigningKey}; + use crate::server::volume_server::RuntimeMetricsConfig; + use crate::storage::needle_map::NeedleMapKind; + use crate::storage::store::Store; + use std::sync::atomic::{AtomicBool, AtomicI64, AtomicU32}; + use std::sync::RwLock; + + let store = Store::new(NeedleMapKind::InMemory); + let guard = Guard::new(&[], SigningKey(vec![]), 0, SigningKey(vec![]), 0); + + Arc::new(VolumeServerState { + store: RwLock::new(store), + guard: RwLock::new(guard), + is_stopping: RwLock::new(false), + maintenance: AtomicBool::new(false), + state_version: AtomicU32::new(0), + concurrent_upload_limit: 0, + concurrent_download_limit: 0, + inflight_upload_data_timeout: std::time::Duration::ZERO, + inflight_download_data_timeout: std::time::Duration::ZERO, + inflight_upload_bytes: AtomicI64::new(0), + inflight_download_bytes: AtomicI64::new(0), + upload_notify: tokio::sync::Notify::new(), + download_notify: tokio::sync::Notify::new(), + data_center: String::new(), + rack: String::new(), + file_size_limit_bytes: 0, + maintenance_byte_per_second: 0, + is_heartbeating: AtomicBool::new(false), + has_master: false, + pre_stop_seconds: 0, + volume_state_notify: tokio::sync::Notify::new(), + write_queue: std::sync::OnceLock::new(), + s3_tier_registry: std::sync::RwLock::new( + crate::remote_storage::s3_tier::S3TierRegistry::new(), + ), + read_mode: crate::config::ReadMode::Local, + master_url: String::new(), + master_urls: Vec::new(), + self_url: String::new(), + http_client: reqwest::Client::new(), + outgoing_http_scheme: "http".to_string(), + outgoing_grpc_tls: None, + metrics_runtime: std::sync::RwLock::new(RuntimeMetricsConfig::default()), + metrics_notify: tokio::sync::Notify::new(), + fix_jpg_orientation: false, + has_slow_read: true, + read_buffer_size_bytes: 4 * 1024 * 1024, + security_file: String::new(), + cli_white_list: vec![], + state_file_path: String::new(), + }) + } + + #[tokio::test] + async fn test_write_queue_submit_no_volume() { + // Submit a write to a non-existent volume -- should return VolumeError::NotFound. + let state = make_test_state(); + let queue = WriteQueue::new(state, MAX_BATCH_SIZE); + + let needle = Needle { + id: 1.into(), + cookie: 0x12345678.into(), + data: vec![1, 2, 3], + data_size: 3, + ..Needle::default() + }; + + let result = queue.submit(VolumeId(999), needle).await; + assert!(result.is_err()); + match result { + Err(VolumeError::NotFound) => {} // expected + other => panic!("expected NotFound, got {:?}", other), + } + } + + #[tokio::test] + async fn test_write_queue_concurrent_submissions() { + // Submit multiple concurrent writes -- all should complete (with errors since no volume). + let state = make_test_state(); + let queue = WriteQueue::new(state, MAX_BATCH_SIZE); + + let mut handles = Vec::new(); + for i in 0..10u64 { + let q = queue.clone(); + handles.push(tokio::spawn(async move { + let needle = Needle { + id: i.into(), + cookie: 0xABCD.into(), + data: vec![i as u8; 10], + data_size: 10, + ..Needle::default() + }; + q.submit(VolumeId(1), needle).await + })); + } + + for handle in handles { + let result = handle.await.unwrap(); + // All should fail with NotFound since there's no volume 1 + assert!(matches!(result, Err(VolumeError::NotFound))); + } + } + + #[tokio::test] + async fn test_write_queue_batching() { + // Verify that many concurrent writes get processed (testing the batching path). + let state = make_test_state(); + let queue = WriteQueue::new(state, MAX_BATCH_SIZE); + + // Submit MAX_BATCH_SIZE requests concurrently + let mut handles = Vec::new(); + for i in 0..MAX_BATCH_SIZE as u64 { + let q = queue.clone(); + handles.push(tokio::spawn(async move { + let needle = Needle { + id: i.into(), + cookie: 0x1111.into(), + data: vec![0u8; 4], + data_size: 4, + ..Needle::default() + }; + q.submit(VolumeId(42), needle).await + })); + } + + let mut results = Vec::new(); + for handle in handles { + results.push(handle.await.unwrap()); + } + + // All should complete (with NotFound errors since no volume exists) + assert_eq!(results.len(), MAX_BATCH_SIZE); + for r in results { + assert!(matches!(r, Err(VolumeError::NotFound))); + } + } + + #[tokio::test] + async fn test_write_queue_dropped_sender() { + // When the queue is dropped, subsequent submits should fail gracefully. + let state = make_test_state(); + let queue = WriteQueue::new(state, 1); + + // Clone then drop the original -- the worker keeps running via its rx handle. + let queue2 = queue.clone(); + drop(queue); + + // This should still work since the worker is alive. + let needle = Needle { + id: 1.into(), + cookie: 0.into(), + data: vec![], + data_size: 0, + ..Needle::default() + }; + let result = queue2.submit(VolumeId(1), needle).await; + assert!(result.is_err()); // NotFound is fine -- the point is it doesn't panic + } +} diff --git a/seaweed-volume/src/storage/disk_location.rs b/seaweed-volume/src/storage/disk_location.rs new file mode 100644 index 000000000..b336d0dd4 --- /dev/null +++ b/seaweed-volume/src/storage/disk_location.rs @@ -0,0 +1,951 @@ +//! DiskLocation: manages volumes on a single disk/directory. +//! +//! Each DiskLocation represents one storage directory containing .dat + .idx files. +//! A Store contains one or more DiskLocations (one per configured directory). +//! Matches Go's storage/disk_location.go. + +use std::collections::{HashMap, HashSet}; +use std::fs; +use std::io; +use std::sync::atomic::{AtomicBool, AtomicI32, AtomicU64, Ordering}; +use std::sync::Arc; + +use tracing::{info, warn}; + +use crate::config::MinFreeSpace; +use crate::storage::erasure_coding::ec_shard::{ + EcVolumeShard, DATA_SHARDS_COUNT, ERASURE_CODING_LARGE_BLOCK_SIZE, + ERASURE_CODING_SMALL_BLOCK_SIZE, +}; +use crate::storage::erasure_coding::ec_volume::EcVolume; +use crate::storage::needle_map::NeedleMapKind; +use crate::storage::super_block::ReplicaPlacement; +use crate::storage::types::*; +use crate::storage::volume::{remove_volume_files, volume_file_name, Volume, VolumeError}; + +/// A single disk location managing volumes in one directory. +pub struct DiskLocation { + pub directory: String, + pub idx_directory: String, + pub directory_uuid: String, + pub disk_type: DiskType, + pub tags: Vec, + pub max_volume_count: AtomicI32, + pub original_max_volume_count: i32, + volumes: HashMap, + ec_volumes: HashMap, + pub is_disk_space_low: Arc, + pub available_space: AtomicU64, + pub min_free_space: MinFreeSpace, +} + +impl DiskLocation { + const UUID_FILE_NAME: &'static str = "vol_dir.uuid"; + + pub fn new( + directory: &str, + idx_directory: &str, + max_volume_count: i32, + disk_type: DiskType, + min_free_space: MinFreeSpace, + tags: Vec, + ) -> io::Result { + fs::create_dir_all(directory)?; + + let idx_dir = if idx_directory.is_empty() { + directory.to_string() + } else { + fs::create_dir_all(idx_directory)?; + idx_directory.to_string() + }; + let directory_uuid = Self::generate_directory_uuid(directory)?; + + Ok(DiskLocation { + directory: directory.to_string(), + idx_directory: idx_dir, + directory_uuid, + disk_type, + tags, + max_volume_count: AtomicI32::new(max_volume_count), + original_max_volume_count: max_volume_count, + volumes: HashMap::new(), + ec_volumes: HashMap::new(), + is_disk_space_low: Arc::new(AtomicBool::new(false)), + available_space: AtomicU64::new(0), + min_free_space, + }) + } + + fn generate_directory_uuid(directory: &str) -> io::Result { + let path = std::path::Path::new(directory).join(Self::UUID_FILE_NAME); + if path.exists() { + let existing = fs::read_to_string(&path)?; + if !existing.trim().is_empty() { + return Ok(existing); + } + } + + let dir_uuid = uuid::Uuid::new_v4().to_string(); + fs::write(path, &dir_uuid)?; + Ok(dir_uuid) + } + + // ---- Volume management ---- + + /// Load existing volumes from the directory. + /// + /// Matches Go's `loadExistingVolume`: checks for incomplete volumes (.note file), + /// validates EC shards before skipping .dat loading, and cleans up stale + /// compaction temp files (.cpd/.cpx). + pub fn load_existing_volumes(&mut self, needle_map_kind: NeedleMapKind) -> io::Result<()> { + // Ensure directory exists + fs::create_dir_all(&self.directory)?; + if self.directory != self.idx_directory { + fs::create_dir_all(&self.idx_directory)?; + } + + // Scan for .dat files + let entries = fs::read_dir(&self.directory)?; + let mut dat_files: Vec<(String, VolumeId)> = Vec::new(); + let mut seen = HashSet::new(); + + for entry in entries { + let entry = entry?; + let name = entry.file_name().into_string().unwrap_or_default(); + if let Some((collection, vid)) = parse_volume_filename(&name) { + if seen.insert((collection.clone(), vid)) { + dat_files.push((collection, vid)); + } + } + } + + for (collection, vid) in dat_files { + let volume_name = volume_file_name(&self.directory, &collection, vid); + let idx_name = volume_file_name(&self.idx_directory, &collection, vid); + + // Check for incomplete volume (.note file means a VolumeCopy was interrupted) + let note_path = format!("{}.note", volume_name); + if std::path::Path::new(¬e_path).exists() { + let note = fs::read_to_string(¬e_path).unwrap_or_default(); + warn!( + volume_id = vid.0, + "volume was not completed: {}, removing files", note + ); + remove_volume_files(&volume_name); + remove_volume_files(&idx_name); + continue; + } + + // If valid EC shards exist (.ecx file present), skip loading .dat + let ecx_path = format!("{}.ecx", idx_name); + let ecx_exists = if std::path::Path::new(&ecx_path).exists() { + true + } else if self.idx_directory != self.directory { + // .ecx may have been created before -dir.idx was configured + let fallback = format!("{}.ecx", volume_name); + std::path::Path::new(&fallback).exists() + } else { + false + }; + if ecx_exists { + if self.validate_ec_volume(&collection, vid) { + // Valid EC volume — don't load .dat + continue; + } else { + warn!( + volume_id = vid.0, + "EC volume validation failed, removing incomplete EC files" + ); + self.remove_ec_volume_files(&collection, vid); + // Fall through to load .dat file + } + } + + // Clean up stale compaction temp files + let cpd_path = format!("{}.cpd", volume_name); + let cpx_path = format!("{}.cpx", idx_name); + if std::path::Path::new(&cpd_path).exists() { + info!(volume_id = vid.0, "removing stale compaction file .cpd"); + let _ = fs::remove_file(&cpd_path); + } + if std::path::Path::new(&cpx_path).exists() { + info!(volume_id = vid.0, "removing stale compaction file .cpx"); + let _ = fs::remove_file(&cpx_path); + } + + // Skip if already loaded (e.g., from a previous call) + if self.volumes.contains_key(&vid) { + continue; + } + + match Volume::new( + &self.directory, + &self.idx_directory, + &collection, + vid, + needle_map_kind, + None, // replica placement read from superblock + None, // TTL read from superblock + 0, // no preallocate on load + Version::current(), + ) { + Ok(mut v) => { + v.location_disk_space_low = self.is_disk_space_low.clone(); + crate::metrics::VOLUME_GAUGE + .with_label_values(&[&collection, "volume"]) + .inc(); + self.volumes.insert(vid, v); + } + Err(e) => { + warn!(volume_id = vid.0, error = %e, "failed to load volume"); + } + } + } + + Ok(()) + } + + /// Validate EC volume shards: all shards must be same size, and if .dat exists, + /// need at least DATA_SHARDS_COUNT shards with size matching expected. + fn validate_ec_volume(&self, collection: &str, vid: VolumeId) -> bool { + let base = volume_file_name(&self.directory, collection, vid); + let dat_path = format!("{}.dat", base); + + let mut expected_shard_size: Option = None; + let dat_exists = std::path::Path::new(&dat_path).exists(); + + if dat_exists { + if let Ok(meta) = fs::metadata(&dat_path) { + expected_shard_size = Some(calculate_expected_shard_size(meta.len() as i64)); + } else { + return false; + } + } + + let mut shard_count = 0usize; + let mut actual_shard_size: Option = None; + const MAX_SHARD_COUNT: usize = 32; + + for i in 0..MAX_SHARD_COUNT { + let shard_path = format!("{}.ec{:02}", base, i); + match fs::metadata(&shard_path) { + Ok(meta) if meta.len() > 0 => { + let size = meta.len() as i64; + if let Some(prev) = actual_shard_size { + if size != prev { + warn!( + volume_id = vid.0, + shard = i, + size, + expected = prev, + "EC shard size mismatch" + ); + return false; + } + } else { + actual_shard_size = Some(size); + } + shard_count += 1; + } + Err(e) if e.kind() != io::ErrorKind::NotFound => { + warn!( + volume_id = vid.0, + shard = i, + error = %e, + "failed to stat EC shard" + ); + return false; + } + _ => {} // not found or zero size — skip + } + } + + // If .dat exists, validate shard size matches expected + if dat_exists { + if let (Some(actual), Some(expected)) = (actual_shard_size, expected_shard_size) { + if actual != expected { + warn!( + volume_id = vid.0, + actual_shard_size = actual, + expected_shard_size = expected, + "EC shard size doesn't match .dat file" + ); + return false; + } + } + } + + // Distributed EC (no .dat): any shard count is valid + if !dat_exists { + return true; + } + + // With .dat: need at least DATA_SHARDS_COUNT shards + if shard_count < DATA_SHARDS_COUNT { + warn!( + volume_id = vid.0, + shard_count, + required = DATA_SHARDS_COUNT, + "EC volume has .dat but too few shards" + ); + return false; + } + + true + } + + /// Remove all EC-related files for a volume. + fn remove_ec_volume_files(&self, collection: &str, vid: VolumeId) { + let base = volume_file_name(&self.directory, collection, vid); + let idx_base = volume_file_name(&self.idx_directory, collection, vid); + const MAX_SHARD_COUNT: usize = 32; + + // Remove index files from idx directory (.ecx, .ecj) + let _ = fs::remove_file(format!("{}.ecx", idx_base)); + let _ = fs::remove_file(format!("{}.ecj", idx_base)); + // Also try data directory in case .ecx/.ecj were created before -dir.idx was configured + if self.idx_directory != self.directory { + let _ = fs::remove_file(format!("{}.ecx", base)); + let _ = fs::remove_file(format!("{}.ecj", base)); + } + + // Remove all EC shard files (.ec00 ~ .ec31) + for i in 0..MAX_SHARD_COUNT { + let _ = fs::remove_file(format!("{}.ec{:02}", base, i)); + } + } + + /// Find a volume by ID. + pub fn find_volume(&self, vid: VolumeId) -> Option<&Volume> { + self.volumes.get(&vid) + } + + /// Find a volume by ID (mutable). + pub fn find_volume_mut(&mut self, vid: VolumeId) -> Option<&mut Volume> { + self.volumes.get_mut(&vid) + } + + /// Add a volume to this location. + pub fn set_volume(&mut self, vid: VolumeId, volume: Volume) { + let collection = volume.collection.clone(); + self.volumes.insert(vid, volume); + crate::metrics::VOLUME_GAUGE + .with_label_values(&[&collection, "volume"]) + .inc(); + } + + /// Create a new volume in this location. + pub fn create_volume( + &mut self, + vid: VolumeId, + collection: &str, + needle_map_kind: NeedleMapKind, + replica_placement: Option, + ttl: Option, + preallocate: u64, + version: Version, + ) -> Result<(), VolumeError> { + let mut v = Volume::new( + &self.directory, + &self.idx_directory, + collection, + vid, + needle_map_kind, + replica_placement, + ttl, + preallocate, + version, + )?; + v.location_disk_space_low = self.is_disk_space_low.clone(); + crate::metrics::VOLUME_GAUGE + .with_label_values(&[collection, "volume"]) + .inc(); + self.volumes.insert(vid, v); + Ok(()) + } + + /// Remove and close a volume. + pub fn unload_volume(&mut self, vid: VolumeId) -> Option { + if let Some(mut v) = self.volumes.remove(&vid) { + crate::metrics::VOLUME_GAUGE + .with_label_values(&[&v.collection, "volume"]) + .dec(); + v.close(); + Some(v) + } else { + None + } + } + + /// Remove, close, and delete all files for a volume. + pub fn delete_volume(&mut self, vid: VolumeId, only_empty: bool) -> Result<(), VolumeError> { + if let Some(mut v) = self.volumes.remove(&vid) { + crate::metrics::VOLUME_GAUGE + .with_label_values(&[&v.collection, "volume"]) + .dec(); + v.destroy(only_empty)?; + Ok(()) + } else { + Err(VolumeError::NotFound) + } + } + + /// Delete all volumes in a collection. + pub fn delete_collection(&mut self, collection: &str) -> Result<(), VolumeError> { + let vids: Vec = self + .volumes + .iter() + .filter(|(_, v)| v.collection == collection && !v.is_compacting()) + .map(|(vid, _)| *vid) + .collect(); + + for vid in vids { + if let Some(mut v) = self.volumes.remove(&vid) { + crate::metrics::VOLUME_GAUGE + .with_label_values(&[&v.collection, "volume"]) + .dec(); + if let Err(e) = v.destroy(false) { + warn!(volume_id = vid.0, error = %e, "delete collection: failed to destroy volume"); + } + } + } + + let ec_vids: Vec = self + .ec_volumes + .iter() + .filter(|(_, v)| v.collection == collection) + .map(|(vid, _)| *vid) + .collect(); + + for vid in ec_vids { + if let Some(mut ec_vol) = self.ec_volumes.remove(&vid) { + for _ in 0..ec_vol.shard_count() { + crate::metrics::VOLUME_GAUGE + .with_label_values(&[collection, "ec_shards"]) + .dec(); + } + ec_vol.destroy(); + } + } + Ok(()) + } + + // ---- Metrics ---- + + /// Number of volumes on this disk. + pub fn volumes_len(&self) -> usize { + self.volumes.len() + } + + /// Get all volume IDs, sorted. + pub fn volume_ids(&self) -> Vec { + let mut ids: Vec = self.volumes.keys().copied().collect(); + ids.sort(); + ids + } + + /// Iterate over all volumes. + pub fn iter_volumes(&self) -> impl Iterator { + self.volumes.iter() + } + + /// Number of free volume slots. + /// Matches Go's FindFreeLocation formula: + /// free = ((MaxVolumeCount - VolumesLen()) * DataShardsCount - EcShardCount()) / DataShardsCount + pub fn free_volume_count(&self) -> i32 { + use crate::storage::erasure_coding::ec_shard::DATA_SHARDS_COUNT; + let max = self.max_volume_count.load(Ordering::Relaxed); + let free_count = (max as i64 - self.volumes.len() as i64) + * DATA_SHARDS_COUNT as i64 + - self.ec_shard_count() as i64; + let effective_free = free_count / DATA_SHARDS_COUNT as i64; + if effective_free > 0 { + effective_free as i32 + } else { + 0 + } + } + + /// Iterate over all volumes. + pub fn volumes(&self) -> impl Iterator { + self.volumes.iter() + } + + /// Iterate over all volumes (mutable). + pub fn volumes_mut(&mut self) -> impl Iterator { + self.volumes.iter_mut() + } + + /// Sum of unused space in writable volumes (volumeSizeLimit - actual size per volume). + /// Used by auto-max-volume-count to estimate how many more volumes can fit. + pub fn unused_space(&self, volume_size_limit: u64) -> u64 { + let mut unused: u64 = 0; + for vol in self.volumes.values() { + if vol.is_read_only() { + continue; + } + let dat_size = vol.dat_file_size().unwrap_or(0); + let idx_size = vol.idx_file_size(); + let used = dat_size + idx_size; + if volume_size_limit > used { + unused += volume_size_limit - used; + } + } + unused + } + + /// Check disk space against min_free_space and update is_disk_space_low. + pub fn check_disk_space(&self) { + let (total, free) = get_disk_stats(&self.directory); + if total == 0 { + return; + } + let used = total.saturating_sub(free); + let is_low = match &self.min_free_space { + MinFreeSpace::Percent(pct) => { + let free_pct = (free as f64 / total as f64) * 100.0; + free_pct < *pct + } + MinFreeSpace::Bytes(min_bytes) => free < *min_bytes, + }; + self.is_disk_space_low.store(is_low, Ordering::Relaxed); + self.available_space.store(free, Ordering::Relaxed); + + // Update resource gauges + crate::metrics::RESOURCE_GAUGE + .with_label_values(&[&self.directory, "all"]) + .set(total as f64); + crate::metrics::RESOURCE_GAUGE + .with_label_values(&[&self.directory, "used"]) + .set(used as f64); + crate::metrics::RESOURCE_GAUGE + .with_label_values(&[&self.directory, "free"]) + .set(free as f64); + // "avail" is same as "free" for us (Go subtracts reserved blocks but we use statvfs f_bavail) + crate::metrics::RESOURCE_GAUGE + .with_label_values(&[&self.directory, "avail"]) + .set(free as f64); + } + + // ---- EC volume operations ---- + + /// Find an EC volume by ID. + pub fn find_ec_volume(&self, vid: VolumeId) -> Option<&EcVolume> { + self.ec_volumes.get(&vid) + } + + /// Find an EC volume by ID (mutable). + pub fn find_ec_volume_mut(&mut self, vid: VolumeId) -> Option<&mut EcVolume> { + self.ec_volumes.get_mut(&vid) + } + + /// Check if this location has an EC volume. + pub fn has_ec_volume(&self, vid: VolumeId) -> bool { + self.ec_volumes.contains_key(&vid) + } + + /// Remove an EC volume, returning it. + pub fn remove_ec_volume(&mut self, vid: VolumeId) -> Option { + self.ec_volumes.remove(&vid) + } + + /// Mount EC shards for a volume on this location. + pub fn mount_ec_shards( + &mut self, + vid: VolumeId, + collection: &str, + shard_ids: &[u32], + ) -> Result<(), VolumeError> { + let dir = self.directory.clone(); + let idx_dir = self.idx_directory.clone(); + let ec_vol = self + .ec_volumes + .entry(vid) + .or_insert_with(|| EcVolume::new(&dir, &idx_dir, collection, vid).unwrap()); + ec_vol.disk_type = self.disk_type.clone(); + + for &shard_id in shard_ids { + let shard = EcVolumeShard::new(&dir, collection, vid, shard_id as u8); + ec_vol.add_shard(shard).map_err(VolumeError::Io)?; + crate::metrics::VOLUME_GAUGE + .with_label_values(&[collection, "ec_shards"]) + .inc(); + } + Ok(()) + } + + /// Unmount EC shards for a volume on this location. + pub fn unmount_ec_shards(&mut self, vid: VolumeId, shard_ids: &[u32]) { + if let Some(ec_vol) = self.ec_volumes.get_mut(&vid) { + let collection = ec_vol.collection.clone(); + for &shard_id in shard_ids { + ec_vol.remove_shard(shard_id as u8); + crate::metrics::VOLUME_GAUGE + .with_label_values(&[&collection, "ec_shards"]) + .dec(); + } + if ec_vol.shard_count() == 0 { + let mut vol = self.ec_volumes.remove(&vid).unwrap(); + vol.close(); + } + } + } + + /// Total number of EC shards on this location. + pub fn ec_shard_count(&self) -> usize { + self.ec_volumes + .values() + .map(|ecv| ecv.shards.iter().filter(|s| s.is_some()).count()) + .sum() + } + + /// Iterate over all EC volumes. + pub fn ec_volumes(&self) -> impl Iterator { + self.ec_volumes.iter() + } + + /// Close all volumes. + pub fn close(&mut self) { + for (_, v) in self.volumes.iter_mut() { + v.close(); + } + self.volumes.clear(); + for (_, mut ec_vol) in self.ec_volumes.drain() { + ec_vol.close(); + } + } +} + +/// Get total and free disk space for a given path. +/// Returns (total_bytes, free_bytes). +pub fn get_disk_stats(path: &str) -> (u64, u64) { + #[cfg(unix)] + { + use std::ffi::CString; + let c_path = match CString::new(path) { + Ok(p) => p, + Err(_) => return (0, 0), + }; + unsafe { + let mut stat: libc::statvfs = std::mem::zeroed(); + if libc::statvfs(c_path.as_ptr(), &mut stat) == 0 { + let all = stat.f_blocks as u64 * stat.f_frsize as u64; + let free = stat.f_bavail as u64 * stat.f_frsize as u64; + return (all, free); + } + } + (0, 0) + } + #[cfg(not(unix))] + { + let _ = path; + (0, 0) + } +} + +/// Calculate expected EC shard size from .dat file size. +/// Matches Go's `calculateExpectedShardSize`: large blocks (1GB * data_shards) first, +/// then small blocks (1MB * data_shards) for the remainder. +fn calculate_expected_shard_size(dat_file_size: i64) -> i64 { + let large_batch_size = ERASURE_CODING_LARGE_BLOCK_SIZE as i64 * DATA_SHARDS_COUNT as i64; + let num_large_batches = dat_file_size / large_batch_size; + let mut shard_size = num_large_batches * ERASURE_CODING_LARGE_BLOCK_SIZE as i64; + let remaining = dat_file_size - (num_large_batches * large_batch_size); + + if remaining > 0 { + let small_batch_size = ERASURE_CODING_SMALL_BLOCK_SIZE as i64 * DATA_SHARDS_COUNT as i64; + // Ceiling division + let num_small_batches = (remaining + small_batch_size - 1) / small_batch_size; + shard_size += num_small_batches * ERASURE_CODING_SMALL_BLOCK_SIZE as i64; + } + + shard_size +} + +/// Parse a volume filename like "collection_42.dat" or "42.dat" into (collection, VolumeId). +fn parse_volume_filename(filename: &str) -> Option<(String, VolumeId)> { + let stem = filename + .strip_suffix(".dat") + .or_else(|| filename.strip_suffix(".vif")) + .or_else(|| filename.strip_suffix(".idx"))?; + if let Some(pos) = stem.rfind('_') { + let collection = &stem[..pos]; + let id_str = &stem[pos + 1..]; + let id: u32 = id_str.parse().ok()?; + Some((collection.to_string(), VolumeId(id))) + } else { + let id: u32 = stem.parse().ok()?; + Some((String::new(), VolumeId(id))) + } +} + +// ============================================================================ +// Tests +// ============================================================================ + +#[cfg(test)] +mod tests { + use super::*; + use tempfile::TempDir; + + #[test] + fn test_parse_volume_filename() { + assert_eq!( + parse_volume_filename("42.dat"), + Some(("".to_string(), VolumeId(42))) + ); + assert_eq!( + parse_volume_filename("pics_7.dat"), + Some(("pics".to_string(), VolumeId(7))) + ); + assert_eq!( + parse_volume_filename("42.vif"), + Some(("".to_string(), VolumeId(42))) + ); + assert_eq!( + parse_volume_filename("pics_7.idx"), + Some(("pics".to_string(), VolumeId(7))) + ); + assert_eq!(parse_volume_filename("notadat.idx"), None); + assert_eq!(parse_volume_filename("bad.dat"), None); + } + + #[test] + fn test_disk_location_create_volume() { + let tmp = TempDir::new().unwrap(); + let dir = tmp.path().to_str().unwrap(); + let mut loc = DiskLocation::new( + dir, + dir, + 10, + DiskType::HardDrive, + MinFreeSpace::Percent(1.0), + Vec::new(), + ) + .unwrap(); + + loc.create_volume( + VolumeId(1), + "", + NeedleMapKind::InMemory, + None, + None, + 0, + Version::current(), + ) + .unwrap(); + + assert_eq!(loc.volumes_len(), 1); + assert!(loc.find_volume(VolumeId(1)).is_some()); + assert!(loc.find_volume(VolumeId(99)).is_none()); + assert_eq!(loc.free_volume_count(), 9); + } + + #[test] + fn test_disk_location_load_existing() { + let tmp = TempDir::new().unwrap(); + let dir = tmp.path().to_str().unwrap(); + + // Create volumes + { + let mut loc = DiskLocation::new( + dir, + dir, + 10, + DiskType::HardDrive, + MinFreeSpace::Percent(1.0), + Vec::new(), + ) + .unwrap(); + loc.create_volume( + VolumeId(1), + "", + NeedleMapKind::InMemory, + None, + None, + 0, + Version::current(), + ) + .unwrap(); + loc.create_volume( + VolumeId(2), + "test", + NeedleMapKind::InMemory, + None, + None, + 0, + Version::current(), + ) + .unwrap(); + loc.close(); + } + + // Reload + let mut loc = DiskLocation::new( + dir, + dir, + 10, + DiskType::HardDrive, + MinFreeSpace::Percent(1.0), + Vec::new(), + ) + .unwrap(); + loc.load_existing_volumes(NeedleMapKind::InMemory).unwrap(); + assert_eq!(loc.volumes_len(), 2); + + let ids = loc.volume_ids(); + assert!(ids.contains(&VolumeId(1))); + assert!(ids.contains(&VolumeId(2))); + } + + #[test] + fn test_disk_location_delete_volume() { + let tmp = TempDir::new().unwrap(); + let dir = tmp.path().to_str().unwrap(); + let mut loc = DiskLocation::new( + dir, + dir, + 10, + DiskType::HardDrive, + MinFreeSpace::Percent(1.0), + Vec::new(), + ) + .unwrap(); + + loc.create_volume( + VolumeId(1), + "", + NeedleMapKind::InMemory, + None, + None, + 0, + Version::current(), + ) + .unwrap(); + loc.create_volume( + VolumeId(2), + "", + NeedleMapKind::InMemory, + None, + None, + 0, + Version::current(), + ) + .unwrap(); + assert_eq!(loc.volumes_len(), 2); + + loc.delete_volume(VolumeId(1), false).unwrap(); + assert_eq!(loc.volumes_len(), 1); + assert!(loc.find_volume(VolumeId(1)).is_none()); + } + + #[test] + fn test_disk_location_delete_collection() { + let tmp = TempDir::new().unwrap(); + let dir = tmp.path().to_str().unwrap(); + let mut loc = DiskLocation::new( + dir, + dir, + 10, + DiskType::HardDrive, + MinFreeSpace::Percent(1.0), + Vec::new(), + ) + .unwrap(); + + loc.create_volume( + VolumeId(1), + "pics", + NeedleMapKind::InMemory, + None, + None, + 0, + Version::current(), + ) + .unwrap(); + loc.create_volume( + VolumeId(2), + "pics", + NeedleMapKind::InMemory, + None, + None, + 0, + Version::current(), + ) + .unwrap(); + loc.create_volume( + VolumeId(3), + "docs", + NeedleMapKind::InMemory, + None, + None, + 0, + Version::current(), + ) + .unwrap(); + assert_eq!(loc.volumes_len(), 3); + + loc.delete_collection("pics").unwrap(); + assert_eq!(loc.volumes_len(), 1); + assert!(loc.find_volume(VolumeId(3)).is_some()); + } + + #[test] + fn test_disk_location_delete_collection_removes_ec_volumes() { + let tmp = TempDir::new().unwrap(); + let dir = tmp.path().to_str().unwrap(); + let mut loc = DiskLocation::new( + dir, + dir, + 10, + DiskType::HardDrive, + MinFreeSpace::Percent(1.0), + Vec::new(), + ) + .unwrap(); + + let shard_path = format!("{}/pics_7.ec00", dir); + std::fs::write(&shard_path, b"ec-shard").unwrap(); + + loc.mount_ec_shards(VolumeId(7), "pics", &[0]).unwrap(); + assert!(loc.has_ec_volume(VolumeId(7))); + assert!(std::path::Path::new(&shard_path).exists()); + assert!(std::path::Path::new(&format!("{}/pics_7.ecj", dir)).exists()); + + loc.delete_collection("pics").unwrap(); + + assert!(!loc.has_ec_volume(VolumeId(7))); + assert!(!std::path::Path::new(&shard_path).exists()); + assert!(!std::path::Path::new(&format!("{}/pics_7.ecj", dir)).exists()); + } + + #[test] + fn test_disk_location_persists_directory_uuid_and_tags() { + let tmp = TempDir::new().unwrap(); + let dir = tmp.path().to_str().unwrap(); + + let loc = DiskLocation::new( + dir, + dir, + 10, + DiskType::HardDrive, + MinFreeSpace::Percent(1.0), + vec!["fast".to_string(), "ssd".to_string()], + ) + .unwrap(); + let directory_uuid = loc.directory_uuid.clone(); + assert_eq!(loc.tags, vec!["fast".to_string(), "ssd".to_string()]); + drop(loc); + + let reloaded = DiskLocation::new( + dir, + dir, + 10, + DiskType::HardDrive, + MinFreeSpace::Percent(1.0), + Vec::new(), + ) + .unwrap(); + assert_eq!(reloaded.directory_uuid, directory_uuid); + } +} diff --git a/seaweed-volume/src/storage/erasure_coding/ec_decoder.rs b/seaweed-volume/src/storage/erasure_coding/ec_decoder.rs new file mode 100644 index 000000000..045cd644a --- /dev/null +++ b/seaweed-volume/src/storage/erasure_coding/ec_decoder.rs @@ -0,0 +1,261 @@ +//! EC decoding: reconstruct a .dat file from EC shards. +//! +//! Rebuilds the original .dat + .idx files from data shards (.ec00-.ec09) +//! and the sorted index (.ecx) + deletion journal (.ecj). + +use std::fs::File; +use std::io::{self, Read, Write}; + +use crate::storage::erasure_coding::ec_shard::*; +use crate::storage::idx; +use crate::storage::needle::needle::get_actual_size; +use crate::storage::super_block::SUPER_BLOCK_SIZE; +use crate::storage::types::*; +use crate::storage::volume::volume_file_name; + +/// Calculate .dat file size from the max offset entry in .ecx. +/// Reads the volume version from the first EC shard (.ec00) superblock, +/// then scans .ecx entries to find the largest (offset + needle_actual_size). +pub fn find_dat_file_size(dir: &str, collection: &str, volume_id: VolumeId) -> io::Result { + let base = volume_file_name(dir, collection, volume_id); + + // Read volume version from .ec00 superblock + let ec00_path = format!("{}.ec00", base); + let mut ec00 = File::open(&ec00_path)?; + let mut sb_buf = [0u8; SUPER_BLOCK_SIZE]; + ec00.read_exact(&mut sb_buf)?; + let version = Version(sb_buf[0]); + + // Start with at least the superblock size + let mut dat_size: i64 = SUPER_BLOCK_SIZE as i64; + + // Scan .ecx entries + let ecx_path = format!("{}.ecx", base); + let ecx_data = std::fs::read(&ecx_path)?; + let entry_count = ecx_data.len() / NEEDLE_MAP_ENTRY_SIZE; + + for i in 0..entry_count { + let start = i * NEEDLE_MAP_ENTRY_SIZE; + let (_, offset, size) = + idx_entry_from_bytes(&ecx_data[start..start + NEEDLE_MAP_ENTRY_SIZE]); + if size.is_deleted() { + continue; + } + let entry_stop = offset.to_actual_offset() + get_actual_size(size, version); + if entry_stop > dat_size { + dat_size = entry_stop; + } + } + + Ok(dat_size) +} + +/// Reconstruct a .dat file from EC data shards. +/// +/// Reads from .ec00-.ec09 and writes a new .dat file. +pub fn write_dat_file_from_shards( + dir: &str, + collection: &str, + volume_id: VolumeId, + dat_file_size: i64, + data_shards: usize, +) -> io::Result<()> { + let base = volume_file_name(dir, collection, volume_id); + let dat_path = format!("{}.dat", base); + + // Open data shards + let mut shards: Vec = (0..data_shards as u8) + .map(|i| EcVolumeShard::new(dir, collection, volume_id, i)) + .collect(); + + for shard in &mut shards { + shard.open()?; + } + + let mut dat_file = File::create(&dat_path)?; + let mut remaining = dat_file_size; + let large_block_size = ERASURE_CODING_LARGE_BLOCK_SIZE; + let small_block_size = ERASURE_CODING_SMALL_BLOCK_SIZE; + let large_row_size = (large_block_size * data_shards) as i64; + + let mut shard_offset: u64 = 0; + + // Read large blocks + while remaining >= large_row_size { + for i in 0..data_shards { + let mut buf = vec![0u8; large_block_size]; + shards[i].read_at(&mut buf, shard_offset)?; + let to_write = large_block_size.min(remaining as usize); + dat_file.write_all(&buf[..to_write])?; + remaining -= to_write as i64; + if remaining <= 0 { + break; + } + } + shard_offset += large_block_size as u64; + } + + // Read small blocks + while remaining > 0 { + for i in 0..data_shards { + let mut buf = vec![0u8; small_block_size]; + shards[i].read_at(&mut buf, shard_offset)?; + let to_write = small_block_size.min(remaining as usize); + dat_file.write_all(&buf[..to_write])?; + remaining -= to_write as i64; + if remaining <= 0 { + break; + } + } + shard_offset += small_block_size as u64; + } + + for shard in &mut shards { + shard.close(); + } + + dat_file.sync_all()?; + Ok(()) +} + +/// Write .idx file from .ecx index + .ecj deletion journal. +/// +/// Copies sorted .ecx entries to .idx, then appends tombstones for +/// deleted needles from .ecj. +pub fn write_idx_file_from_ec_index( + dir: &str, + collection: &str, + volume_id: VolumeId, +) -> io::Result<()> { + let base = volume_file_name(dir, collection, volume_id); + let ecx_path = format!("{}.ecx", base); + let ecj_path = format!("{}.ecj", base); + let idx_path = format!("{}.idx", base); + + // Copy .ecx to .idx + std::fs::copy(&ecx_path, &idx_path)?; + + // Append deletions from .ecj as tombstones + if std::path::Path::new(&ecj_path).exists() { + let ecj_data = std::fs::read(&ecj_path)?; + if !ecj_data.is_empty() { + let mut idx_file = std::fs::OpenOptions::new() + .write(true) + .append(true) + .open(&idx_path)?; + + let count = ecj_data.len() / NEEDLE_ID_SIZE; + for i in 0..count { + let start = i * NEEDLE_ID_SIZE; + let needle_id = NeedleId::from_bytes(&ecj_data[start..start + NEEDLE_ID_SIZE]); + idx::write_index_entry( + &mut idx_file, + needle_id, + Offset::default(), + TOMBSTONE_FILE_SIZE, + )?; + } + } + } + + Ok(()) +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::storage::erasure_coding::ec_encoder; + use crate::storage::needle::needle::Needle; + use crate::storage::needle_map::NeedleMapKind; + use crate::storage::volume::Volume; + use tempfile::TempDir; + + #[test] + fn test_ec_full_round_trip() { + let tmp = TempDir::new().unwrap(); + let dir = tmp.path().to_str().unwrap(); + + // Create volume with data + let mut v = Volume::new( + dir, + dir, + "", + VolumeId(1), + NeedleMapKind::InMemory, + None, + None, + 0, + Version::current(), + ) + .unwrap(); + + let test_data: Vec<(NeedleId, Vec)> = (1..=3) + .map(|i| { + let data = format!("EC round trip data for needle {}", i); + (NeedleId(i), data.into_bytes()) + }) + .collect(); + + for (id, data) in &test_data { + let mut n = Needle { + id: *id, + cookie: Cookie(id.0 as u32), + data: data.clone(), + data_size: data.len() as u32, + ..Needle::default() + }; + v.write_needle(&mut n, true).unwrap(); + } + v.sync_to_disk().unwrap(); + let original_dat_size = v.dat_file_size().unwrap(); + v.close(); + + // Read original .dat for comparison + let original_dat = std::fs::read(format!("{}/1.dat", dir)).unwrap(); + + // Encode to EC + let data_shards = 10; + let parity_shards = 4; + ec_encoder::write_ec_files(dir, dir, "", VolumeId(1), data_shards, parity_shards).unwrap(); + + // Delete original .dat and .idx + std::fs::remove_file(format!("{}/1.dat", dir)).unwrap(); + std::fs::remove_file(format!("{}/1.idx", dir)).unwrap(); + + // Reconstruct from EC shards + write_dat_file_from_shards(dir, "", VolumeId(1), original_dat_size as i64, data_shards) + .unwrap(); + write_idx_file_from_ec_index(dir, "", VolumeId(1)).unwrap(); + + // Verify reconstructed .dat matches original + let reconstructed_dat = std::fs::read(format!("{}/1.dat", dir)).unwrap(); + assert_eq!( + original_dat[..original_dat_size as usize], + reconstructed_dat[..original_dat_size as usize], + "reconstructed .dat should match original" + ); + + // Verify we can load and read from reconstructed volume + let v2 = Volume::new( + dir, + dir, + "", + VolumeId(1), + NeedleMapKind::InMemory, + None, + None, + 0, + Version::current(), + ) + .unwrap(); + + for (id, expected_data) in &test_data { + let mut n = Needle { + id: *id, + ..Needle::default() + }; + v2.read_needle(&mut n).unwrap(); + assert_eq!(&n.data, expected_data, "needle {} data should match", id); + } + } +} diff --git a/seaweed-volume/src/storage/erasure_coding/ec_encoder.rs b/seaweed-volume/src/storage/erasure_coding/ec_encoder.rs new file mode 100644 index 000000000..b98db9fb0 --- /dev/null +++ b/seaweed-volume/src/storage/erasure_coding/ec_encoder.rs @@ -0,0 +1,824 @@ +//! EC encoding: convert a .dat file into 10 data + 4 parity shards. +//! +//! Uses Reed-Solomon erasure coding. The .dat file is split into blocks +//! (1GB large, 1MB small) and encoded across 14 shard files. + +use std::fs::File; +use std::io; +#[cfg(not(unix))] +use std::io::{Seek, SeekFrom}; + +use reed_solomon_erasure::galois_8::ReedSolomon; + +use crate::storage::erasure_coding::ec_shard::*; +use crate::storage::idx; +use crate::storage::types::*; +use crate::storage::volume::volume_file_name; + +/// Encode a .dat file into EC shard files. +/// +/// Creates .ec00-.ec13 files in the same directory. +/// Also creates a sorted .ecx index from the .idx file. +pub fn write_ec_files( + dir: &str, + idx_dir: &str, + collection: &str, + volume_id: VolumeId, + data_shards: usize, + parity_shards: usize, +) -> io::Result<()> { + let base = volume_file_name(dir, collection, volume_id); + let dat_path = format!("{}.dat", base); + let idx_base = volume_file_name(idx_dir, collection, volume_id); + let idx_path = format!("{}.idx", idx_base); + + // Create sorted .ecx from .idx + write_sorted_ecx_from_idx(&idx_path, &format!("{}.ecx", base))?; + + // Encode .dat into shards + let dat_file = File::open(&dat_path)?; + let dat_size = dat_file.metadata()?.len() as i64; + + let rs = ReedSolomon::new(data_shards, parity_shards) + .map_err(|e| io::Error::new(io::ErrorKind::Other, format!("reed-solomon init: {:?}", e)))?; + + // Create shard files + let total_shards = data_shards + parity_shards; + let mut shards: Vec = (0..total_shards as u8) + .map(|i| EcVolumeShard::new(dir, collection, volume_id, i)) + .collect(); + + for shard in &mut shards { + shard.create()?; + } + + // Encode in large blocks, then small blocks + encode_dat_file( + &dat_file, + dat_size, + &rs, + &mut shards, + data_shards, + parity_shards, + )?; + + // Close all shards + for shard in &mut shards { + shard.close(); + } + + Ok(()) +} + +/// Rebuild missing EC shard files from existing shards using Reed-Solomon reconstruct. +/// +/// This does not require the `.dat` file, only the existing `.ecXX` shard files. +pub fn rebuild_ec_files( + dir: &str, + collection: &str, + volume_id: VolumeId, + missing_shard_ids: &[u32], + data_shards: usize, + parity_shards: usize, +) -> io::Result<()> { + if missing_shard_ids.is_empty() { + return Ok(()); + } + + let rs = ReedSolomon::new(data_shards, parity_shards) + .map_err(|e| io::Error::new(io::ErrorKind::Other, format!("reed-solomon init: {:?}", e)))?; + + let total_shards = data_shards + parity_shards; + let mut shards: Vec = (0..total_shards as u8) + .map(|i| EcVolumeShard::new(dir, collection, volume_id, i)) + .collect(); + + // Determine the exact shard size from the first available existing shard + let mut shard_size = 0; + for (i, shard) in shards.iter_mut().enumerate() { + if !missing_shard_ids.contains(&(i as u32)) { + if let Ok(_) = shard.open() { + let size = shard.file_size(); + if size > shard_size { + shard_size = size; + } + } else { + return Err(io::Error::new( + io::ErrorKind::NotFound, + format!("missing non-rebuild shard {}", i), + )); + } + } + } + + if shard_size == 0 { + return Err(io::Error::new( + io::ErrorKind::InvalidData, + "all existing shards are empty or cannot find an existing shard to determine size", + )); + } + + // Create the missing shards for writing + for i in missing_shard_ids { + if let Some(shard) = shards.get_mut(*i as usize) { + shard.create()?; + } + } + + let block_size = ERASURE_CODING_SMALL_BLOCK_SIZE; + let mut remaining = shard_size; + let mut offset: u64 = 0; + + // Process all data in blocks + while remaining > 0 { + let to_process = remaining.min(block_size as i64) as usize; + + // Allocate buffers for all shards. Option> is required by rs.reconstruct() + let mut buffers: Vec>> = vec![None; total_shards]; + + // Read available shards + for (i, shard) in shards.iter().enumerate() { + if !missing_shard_ids.contains(&(i as u32)) { + let mut buf = vec![0u8; to_process]; + shard.read_at(&mut buf, offset)?; + buffers[i] = Some(buf); + } + } + + // Reconstruct missing shards + rs.reconstruct(&mut buffers).map_err(|e| { + io::Error::new( + io::ErrorKind::Other, + format!("reed-solomon reconstruct: {:?}", e), + ) + })?; + + // Write recovered data into the missing shards + for i in missing_shard_ids { + let idx = *i as usize; + if let Some(buf) = buffers[idx].take() { + shards[idx].write_all(&buf)?; + } + } + + offset += to_process as u64; + remaining -= to_process as i64; + } + + // Close all shards + for shard in &mut shards { + shard.close(); + } + + Ok(()) +} + +/// Verify EC shards by computing parity against the existing data and identifying corrupted shards. +pub fn verify_ec_shards( + dir: &str, + collection: &str, + volume_id: VolumeId, + data_shards: usize, + parity_shards: usize, +) -> io::Result<(Vec, Vec)> { + let rs = ReedSolomon::new(data_shards, parity_shards) + .map_err(|e| io::Error::new(io::ErrorKind::Other, format!("reed-solomon init: {:?}", e)))?; + + let total_shards = data_shards + parity_shards; + let mut shards: Vec = (0..total_shards as u8) + .map(|i| EcVolumeShard::new(dir, collection, volume_id, i)) + .collect(); + + let mut shard_size = 0; + let mut broken_shards = std::collections::HashSet::new(); + let mut details = Vec::new(); + + for (i, shard) in shards.iter_mut().enumerate() { + if let Ok(_) = shard.open() { + let size = shard.file_size(); + if size > shard_size { + shard_size = size; + } + } else { + broken_shards.insert(i as u32); + details.push(format!("failed to open or missing shard {}", i)); + } + } + + if shard_size == 0 || broken_shards.len() >= parity_shards { + // Can't do much if we don't know the size or have too many missing + return Ok((broken_shards.into_iter().collect(), details)); + } + + let block_size = ERASURE_CODING_SMALL_BLOCK_SIZE; + let mut remaining = shard_size; + let mut offset: u64 = 0; + + while remaining > 0 { + let to_process = remaining.min(block_size as i64) as usize; + let mut buffers = vec![vec![0u8; to_process]; total_shards]; + + let mut read_failed = false; + for i in 0..total_shards { + if !broken_shards.contains(&(i as u32)) { + if let Err(e) = shards[i].read_at(&mut buffers[i], offset) { + broken_shards.insert(i as u32); + details.push(format!("read error shard {}: {}", i, e)); + read_failed = true; + } + } else { + read_failed = true; + } + } + + // Only do verification if all shards were readable + if !read_failed { + // Need to convert Vec> to &[&[u8]] for rs.verify + let slice_ptrs: Vec<&[u8]> = buffers.iter().map(|v| v.as_slice()).collect(); + if let Ok(is_valid) = rs.verify(&slice_ptrs) { + if !is_valid { + // Reed-Solomon verification failed. We cannot easily pinpoint which shard + // is corrupted without recalculating parities or syndromes, so we just + // log that this batch has corruption. Wait, we can test each parity shard! + // Let's re-encode from the first `data_shards` and compare to the actual `parity_shards`. + + let mut verify_buffers = buffers.clone(); + // Clear the parity parts + for i in data_shards..total_shards { + verify_buffers[i].fill(0); + } + if rs.encode(&mut verify_buffers).is_ok() { + for i in 0..total_shards { + if buffers[i] != verify_buffers[i] { + broken_shards.insert(i as u32); + details.push(format!( + "parity mismatch on shard {} at offset {}", + i, offset + )); + } + } + } + } + } + } + + offset += to_process as u64; + remaining -= to_process as i64; + } + + // Close all shards + for shard in &mut shards { + shard.close(); + } + + let mut broken_vec: Vec = broken_shards.into_iter().collect(); + broken_vec.sort_unstable(); + + Ok((broken_vec, details)) +} + +/// Write sorted .ecx index from .idx file. +fn write_sorted_ecx_from_idx(idx_path: &str, ecx_path: &str) -> io::Result<()> { + if !std::path::Path::new(idx_path).exists() { + return Err(io::Error::new( + io::ErrorKind::NotFound, + "idx file not found", + )); + } + + // Read all idx entries + let mut idx_file = File::open(idx_path)?; + let mut entries: Vec<(NeedleId, Offset, Size)> = Vec::new(); + + idx::walk_index_file(&mut idx_file, 0, |key, offset, size| { + entries.push((key, offset, size)); + Ok(()) + })?; + + // Sort by NeedleId, then by actual offset so later entries come last + entries.sort_by_key(|&(key, offset, _)| (key, offset.to_actual_offset())); + + // Remove duplicates (keep last/latest entry for each key). + // dedup_by_key keeps the first in each run, so we reverse first, + // dedup, then reverse back. + entries.reverse(); + entries.dedup_by_key(|entry| entry.0); + entries.reverse(); + + // Write sorted entries to .ecx + let mut ecx_file = File::create(ecx_path)?; + for &(key, offset, size) in &entries { + idx::write_index_entry(&mut ecx_file, key, offset, size)?; + } + + Ok(()) +} + +/// Rebuild the .ecx index file by walking needles in the EC data shards. +/// +/// This is the equivalent of Go's `RebuildEcxFile`. It reads the logical .dat +/// content from the EC data shards, walks through needle headers to extract +/// (needle_id, offset, size) entries, deduplicates them, and writes a sorted +/// .ecx index file. +pub fn rebuild_ecx_file( + dir: &str, + collection: &str, + volume_id: VolumeId, + data_shards: usize, +) -> io::Result<()> { + use crate::storage::needle::needle::get_actual_size; + use crate::storage::super_block::SUPER_BLOCK_SIZE; + + let base = volume_file_name(dir, collection, volume_id); + let ecx_path = format!("{}.ecx", base); + + // Open data shards to read logical .dat content + let mut shards: Vec = (0..data_shards as u8) + .map(|i| EcVolumeShard::new(dir, collection, volume_id, i)) + .collect(); + + for shard in &mut shards { + if let Err(_) = shard.open() { + // If a data shard is missing, we can't rebuild ecx + for s in &mut shards { + s.close(); + } + return Err(io::Error::new( + io::ErrorKind::NotFound, + format!("cannot open data shard for ecx rebuild"), + )); + } + } + + // Determine total logical data size from shard sizes + let shard_size = shards.iter().map(|s| s.file_size()).max().unwrap_or(0); + let total_data_size = shard_size as i64 * data_shards as i64; + + // Read version from superblock (first byte of logical data) + let mut sb_buf = [0u8; SUPER_BLOCK_SIZE]; + read_from_data_shards(&shards, &mut sb_buf, 0, data_shards)?; + let version = Version(sb_buf[0]); + + // Walk needles starting after superblock + let mut offset = SUPER_BLOCK_SIZE as i64; + let header_size = NEEDLE_HEADER_SIZE; + let mut entries: Vec<(NeedleId, Offset, Size)> = Vec::new(); + + while offset + header_size as i64 <= total_data_size { + // Read needle header (cookie + needle_id + size = 16 bytes) + let mut header_buf = [0u8; NEEDLE_HEADER_SIZE]; + if read_from_data_shards(&shards, &mut header_buf, offset as u64, data_shards).is_err() { + break; + } + + let cookie = Cookie::from_bytes(&header_buf[..COOKIE_SIZE]); + let needle_id = NeedleId::from_bytes(&header_buf[COOKIE_SIZE..COOKIE_SIZE + NEEDLE_ID_SIZE]); + let size = Size::from_bytes(&header_buf[COOKIE_SIZE + NEEDLE_ID_SIZE..header_size]); + + // Validate: stop if we hit zero cookie+id (end of data) + if cookie.0 == 0 && needle_id.0 == 0 { + break; + } + + // Validate size is reasonable + if size.0 < 0 && !size.is_deleted() { + break; + } + + let actual_size = get_actual_size(size, version); + if actual_size <= 0 || offset + actual_size > total_data_size { + break; + } + + entries.push((needle_id, Offset::from_actual_offset(offset), size)); + + // Advance to next needle (aligned to NEEDLE_PADDING_SIZE) + offset += actual_size; + let padding_rem = offset % NEEDLE_PADDING_SIZE as i64; + if padding_rem != 0 { + offset += NEEDLE_PADDING_SIZE as i64 - padding_rem; + } + } + + for shard in &mut shards { + shard.close(); + } + + // Sort by NeedleId, then by offset (later entries override earlier) + entries.sort_by_key(|&(key, offset, _)| (key, offset.to_actual_offset())); + + // Deduplicate: keep latest entry per needle_id + entries.reverse(); + entries.dedup_by_key(|entry| entry.0); + entries.reverse(); + + // Write sorted .ecx + let mut ecx_file = File::create(&ecx_path)?; + for &(key, offset, size) in &entries { + idx::write_index_entry(&mut ecx_file, key, offset, size)?; + } + ecx_file.sync_all()?; + + Ok(()) +} + +/// Read bytes from EC data shards at a logical offset in the .dat file. +fn read_from_data_shards( + shards: &[EcVolumeShard], + buf: &mut [u8], + logical_offset: u64, + data_shards: usize, +) -> io::Result<()> { + let small_block = ERASURE_CODING_SMALL_BLOCK_SIZE as u64; + let data_shards_u64 = data_shards as u64; + + let mut bytes_read = 0u64; + let mut remaining = buf.len() as u64; + let mut current_offset = logical_offset; + + while remaining > 0 { + // Determine which shard and at what shard-offset this logical offset maps to. + // The data is interleaved: large blocks first, then small blocks. + // For simplicity, use the small block size for all calculations since + // large blocks are multiples of small blocks. + let row_size = small_block * data_shards_u64; + let row_index = current_offset / row_size; + let row_offset = current_offset % row_size; + let shard_index = (row_offset / small_block) as usize; + let shard_offset = row_index * small_block + (row_offset % small_block); + + if shard_index >= data_shards { + return Err(io::Error::new( + io::ErrorKind::InvalidInput, + "shard index out of range", + )); + } + + // How many bytes can we read from this position in this shard block + let bytes_left_in_block = small_block - (row_offset % small_block); + let to_read = remaining.min(bytes_left_in_block) as usize; + + let dest = &mut buf[bytes_read as usize..bytes_read as usize + to_read]; + shards[shard_index].read_at(dest, shard_offset)?; + + bytes_read += to_read as u64; + remaining -= to_read as u64; + current_offset += to_read as u64; + } + + Ok(()) +} + +/// Encode the .dat file data into shard files. +/// +/// Uses a two-phase approach matching Go's ec_encoder.go: +/// 1. Process as many large blocks (1GB) as possible +/// 2. Process remaining data with small blocks (1MB) +fn encode_dat_file( + dat_file: &File, + dat_size: i64, + rs: &ReedSolomon, + shards: &mut [EcVolumeShard], + data_shards: usize, + parity_shards: usize, +) -> io::Result<()> { + let mut remaining = dat_size; + let mut offset: u64 = 0; + + // Phase 1: Process large blocks (1GB each) while enough data remains + let large_block_size = ERASURE_CODING_LARGE_BLOCK_SIZE; + let large_row_size = large_block_size * data_shards; + + while remaining >= large_row_size as i64 { + encode_one_batch( + dat_file, + offset, + large_block_size, + rs, + shards, + data_shards, + parity_shards, + )?; + offset += large_row_size as u64; + remaining -= large_row_size as i64; + } + + // Phase 2: Process remaining data with small blocks (1MB each) + let small_block_size = ERASURE_CODING_SMALL_BLOCK_SIZE; + let small_row_size = small_block_size * data_shards; + + while remaining > 0 { + let to_process = remaining.min(small_row_size as i64); + encode_one_batch( + dat_file, + offset, + small_block_size, + rs, + shards, + data_shards, + parity_shards, + )?; + offset += to_process as u64; + remaining -= to_process; + } + + Ok(()) +} + +/// Encode one batch (row) of data. +fn encode_one_batch( + dat_file: &File, + offset: u64, + block_size: usize, + rs: &ReedSolomon, + shards: &mut [EcVolumeShard], + data_shards: usize, + parity_shards: usize, +) -> io::Result<()> { + let total_shards = data_shards + parity_shards; + // Each batch allocates block_size * total_shards bytes. + // With large blocks (1 GiB) this is 14 GiB -- guard against OOM. + let total_alloc = block_size.checked_mul(total_shards).ok_or_else(|| { + io::Error::new( + io::ErrorKind::InvalidInput, + "block_size * shard count overflows usize", + ) + })?; + // Large-block encoding uses 1 GiB * 14 shards = 14 GiB; allow up to 16 GiB. + const MAX_BATCH_ALLOC: usize = 16 * 1024 * 1024 * 1024; // 16 GiB safety limit + if total_alloc > MAX_BATCH_ALLOC { + return Err(io::Error::new( + io::ErrorKind::InvalidInput, + format!( + "batch allocation too large ({} bytes, limit {} bytes); block_size={} shards={}", + total_alloc, MAX_BATCH_ALLOC, block_size, total_shards, + ), + )); + } + + // Allocate buffers for all shards + let mut buffers: Vec> = (0..total_shards).map(|_| vec![0u8; block_size]).collect(); + + // Read data shards from .dat file + for i in 0..data_shards { + let read_offset = offset + (i * block_size) as u64; + + #[cfg(unix)] + { + use std::os::unix::fs::FileExt; + dat_file.read_at(&mut buffers[i], read_offset)?; + } + + #[cfg(not(unix))] + { + let mut f = dat_file.try_clone()?; + f.seek(SeekFrom::Start(read_offset))?; + f.read(&mut buffers[i])?; + } + } + + // Encode parity shards + rs.encode(&mut buffers).map_err(|e| { + io::Error::new( + io::ErrorKind::Other, + format!("reed-solomon encode: {:?}", e), + ) + })?; + + // Write all shard buffers to files + for (i, buf) in buffers.iter().enumerate() { + shards[i].write_all(buf)?; + } + + Ok(()) +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::storage::needle::needle::Needle; + use crate::storage::needle_map::NeedleMapKind; + use crate::storage::volume::Volume; + use tempfile::TempDir; + + #[test] + fn test_ec_encode_decode_round_trip() { + let tmp = TempDir::new().unwrap(); + let dir = tmp.path().to_str().unwrap(); + + // Create a volume with some data + let mut v = Volume::new( + dir, + dir, + "", + VolumeId(1), + NeedleMapKind::InMemory, + None, + None, + 0, + Version::current(), + ) + .unwrap(); + + for i in 1..=5 { + let data = format!("test data for needle {}", i); + let mut n = Needle { + id: NeedleId(i), + cookie: Cookie(i as u32), + data: data.as_bytes().to_vec(), + data_size: data.len() as u32, + ..Needle::default() + }; + v.write_needle(&mut n, true).unwrap(); + } + v.sync_to_disk().unwrap(); + v.close(); + + // Encode to EC shards + let data_shards = 10; + let parity_shards = 4; + let total_shards = data_shards + parity_shards; + write_ec_files(dir, dir, "", VolumeId(1), data_shards, parity_shards).unwrap(); + + // Verify shard files exist + for i in 0..total_shards { + let path = format!("{}/{}.ec{:02}", dir, 1, i); + assert!( + std::path::Path::new(&path).exists(), + "shard file {} should exist", + path + ); + } + + // Verify .ecx exists + let ecx_path = format!("{}/1.ecx", dir); + assert!(std::path::Path::new(&ecx_path).exists()); + } + + #[test] + fn test_reed_solomon_basic() { + let data_shards = 10; + let parity_shards = 4; + let total_shards = data_shards + parity_shards; + let rs = ReedSolomon::new(data_shards, parity_shards).unwrap(); + let block_size = 1024; + let mut shards: Vec> = (0..total_shards) + .map(|i| { + if i < data_shards { + vec![(i as u8).wrapping_mul(7); block_size] + } else { + vec![0u8; block_size] + } + }) + .collect(); + + // Encode + rs.encode(&mut shards).unwrap(); + + // Verify parity is non-zero (at least some) + let parity_nonzero: bool = shards[data_shards..] + .iter() + .any(|s| s.iter().any(|&b| b != 0)); + assert!(parity_nonzero); + + // Simulate losing 4 shards and reconstructing + let original_0 = shards[0].clone(); + let original_1 = shards[1].clone(); + + let mut shard_opts: Vec>> = shards.into_iter().map(Some).collect(); + shard_opts[0] = None; + shard_opts[1] = None; + shard_opts[2] = None; + shard_opts[3] = None; + + rs.reconstruct(&mut shard_opts).unwrap(); + + assert_eq!(shard_opts[0].as_ref().unwrap(), &original_0); + assert_eq!(shard_opts[1].as_ref().unwrap(), &original_1); + } + + /// EC encode must read .idx from a separate index directory when configured. + #[test] + fn test_ec_encode_with_separate_idx_dir() { + let dat_tmp = TempDir::new().unwrap(); + let idx_tmp = TempDir::new().unwrap(); + let dat_dir = dat_tmp.path().to_str().unwrap(); + let idx_dir = idx_tmp.path().to_str().unwrap(); + + // Create a volume with separate data and index directories + let mut v = Volume::new( + dat_dir, + idx_dir, + "", + VolumeId(1), + NeedleMapKind::InMemory, + None, + None, + 0, + Version::current(), + ) + .unwrap(); + + for i in 1..=5 { + let data = format!("needle {} payload", i); + let mut n = Needle { + id: NeedleId(i), + cookie: Cookie(i as u32), + data: data.as_bytes().to_vec(), + data_size: data.len() as u32, + ..Needle::default() + }; + v.write_needle(&mut n, true).unwrap(); + } + v.sync_to_disk().unwrap(); + v.close(); + + // Verify .dat is in data dir, .idx is in idx dir + assert!(std::path::Path::new(&format!("{}/1.dat", dat_dir)).exists()); + assert!(!std::path::Path::new(&format!("{}/1.idx", dat_dir)).exists()); + assert!(std::path::Path::new(&format!("{}/1.idx", idx_dir)).exists()); + assert!(!std::path::Path::new(&format!("{}/1.dat", idx_dir)).exists()); + + // EC encode with separate idx dir + let data_shards = 10; + let parity_shards = 4; + let total_shards = data_shards + parity_shards; + write_ec_files( + dat_dir, + idx_dir, + "", + VolumeId(1), + data_shards, + parity_shards, + ) + .unwrap(); + + // Verify all 14 shard files in data dir + for i in 0..total_shards { + let path = format!("{}/1.ec{:02}", dat_dir, i); + assert!( + std::path::Path::new(&path).exists(), + "shard {} should exist in data dir", + path + ); + } + + // Verify .ecx in data dir (not idx dir) + assert!(std::path::Path::new(&format!("{}/1.ecx", dat_dir)).exists()); + assert!(!std::path::Path::new(&format!("{}/1.ecx", idx_dir)).exists()); + + // Verify no shard files leaked into idx dir + for i in 0..total_shards { + let path = format!("{}/1.ec{:02}", idx_dir, i); + assert!( + !std::path::Path::new(&path).exists(), + "shard {} should NOT exist in idx dir", + path + ); + } + } + + /// EC encode should fail gracefully when .idx is only in the data dir + /// but we pass a wrong idx_dir. This guards against regressions where + /// write_ec_files ignores the idx_dir parameter. + #[test] + fn test_ec_encode_fails_with_wrong_idx_dir() { + let dat_tmp = TempDir::new().unwrap(); + let idx_tmp = TempDir::new().unwrap(); + let wrong_tmp = TempDir::new().unwrap(); + let dat_dir = dat_tmp.path().to_str().unwrap(); + let idx_dir = idx_tmp.path().to_str().unwrap(); + let wrong_dir = wrong_tmp.path().to_str().unwrap(); + + let mut v = Volume::new( + dat_dir, + idx_dir, + "", + VolumeId(1), + NeedleMapKind::InMemory, + None, + None, + 0, + Version::current(), + ) + .unwrap(); + + let mut n = Needle { + id: NeedleId(1), + cookie: Cookie(1), + data: b"hello".to_vec(), + data_size: 5, + ..Needle::default() + }; + v.write_needle(&mut n, true).unwrap(); + v.sync_to_disk().unwrap(); + v.close(); + + // Should fail: .idx is in idx_dir, not wrong_dir + let result = write_ec_files(dat_dir, wrong_dir, "", VolumeId(1), 10, 4); + assert!( + result.is_err(), + "should fail when idx_dir doesn't contain .idx" + ); + } +} diff --git a/seaweed-volume/src/storage/erasure_coding/ec_locate.rs b/seaweed-volume/src/storage/erasure_coding/ec_locate.rs new file mode 100644 index 000000000..4c1f06aa2 --- /dev/null +++ b/seaweed-volume/src/storage/erasure_coding/ec_locate.rs @@ -0,0 +1,223 @@ +//! EC data location: maps needle offset/size to shard intervals. +//! +//! Determines which shard(s) contain data for a given needle and at what +//! offsets within those shards. Handles both large (1GB) and small (1MB) +//! block sections. + +use crate::storage::erasure_coding::ec_shard::*; +use crate::storage::types::*; + +/// An interval to read from EC shards. +#[derive(Debug, Clone)] +pub struct Interval { + pub block_index: usize, + pub inner_block_offset: i64, + pub size: i64, + pub is_large_block: bool, + pub large_block_rows_count: usize, +} + +impl Interval { + pub fn to_shard_id_and_offset(&self, data_shards: u32) -> (ShardId, i64) { + let data_shards_usize = data_shards as usize; + let shard_id = (self.block_index % data_shards_usize) as ShardId; + let row_index = self.block_index / data_shards_usize; + + let block_size = if self.is_large_block { + ERASURE_CODING_LARGE_BLOCK_SIZE as i64 + } else { + ERASURE_CODING_SMALL_BLOCK_SIZE as i64 + }; + + let mut offset = row_index as i64 * block_size + self.inner_block_offset; + if !self.is_large_block { + // Small blocks come after large blocks in the shard file + offset += self.large_block_rows_count as i64 * ERASURE_CODING_LARGE_BLOCK_SIZE as i64; + } + + (shard_id, offset) + } +} + +/// Locate the EC shard intervals needed to read data at the given offset and size. +/// +/// `shard_size` is the size of a single shard file. +pub fn locate_data(offset: i64, size: Size, shard_size: i64, data_shards: u32) -> Vec { + let mut intervals = Vec::new(); + let data_size = size.0 as i64; + + if data_size <= 0 || shard_size <= 0 { + return intervals; + } + + let large_block_size = ERASURE_CODING_LARGE_BLOCK_SIZE as i64; + let small_block_size = ERASURE_CODING_SMALL_BLOCK_SIZE as i64; + let large_row_size = large_block_size * data_shards as i64; + let small_row_size = small_block_size * data_shards as i64; + + // Number of large block rows + let n_large_block_rows = if shard_size > 0 { + ((shard_size - 1) / large_block_size) as usize + } else { + 0 + }; + let large_section_size = n_large_block_rows as i64 * large_row_size; + + let mut remaining_offset = offset; + let mut remaining_size = data_size; + + // In large block section? + if remaining_offset < large_section_size { + let available_in_large = large_section_size - remaining_offset; + let to_read = remaining_size.min(available_in_large); + + add_intervals( + &mut intervals, + remaining_offset, + to_read, + large_block_size, + large_row_size, + true, + n_large_block_rows, + ); + + remaining_offset += to_read; + remaining_size -= to_read; + } + + // In small block section? + if remaining_size > 0 { + let small_offset = remaining_offset - large_section_size; + add_intervals( + &mut intervals, + small_offset, + remaining_size, + small_block_size, + small_row_size, + false, + n_large_block_rows, + ); + } + + intervals +} + +fn add_intervals( + intervals: &mut Vec, + offset: i64, + size: i64, + block_size: i64, + _row_size: i64, + is_large_block: bool, + large_block_rows_count: usize, +) { + let mut pos = offset; + let end = offset + size; + + while pos < end { + let block_index = (pos / block_size) as usize; + let inner_offset = pos % block_size; + let remaining_in_block = block_size - inner_offset; + let interval_size = remaining_in_block.min(end - pos); + + intervals.push(Interval { + block_index, + inner_block_offset: inner_offset, + size: interval_size, + is_large_block, + large_block_rows_count, + }); + + pos += interval_size; + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_interval_to_shard_id() { + let data_shards = 10; + let large_block_size = ERASURE_CODING_LARGE_BLOCK_SIZE as i64; + let _shard_size = 1024 * 1024; // Example shard size + + // Block index 0 → shard 0 + let interval = Interval { + block_index: 0, + inner_block_offset: 100, + size: 50, + is_large_block: true, + large_block_rows_count: 1, + }; + let (shard_id, offset) = interval.to_shard_id_and_offset(data_shards); + assert_eq!(shard_id, 0); + assert_eq!(offset, 100); + + // Block index 5 → shard 5 + let interval = Interval { + block_index: 5, + inner_block_offset: 0, + size: 1024, + is_large_block: true, + large_block_rows_count: 1, + }; + let (shard_id, _offset) = interval.to_shard_id_and_offset(data_shards); + assert_eq!(shard_id, 5); + + // Block index 12 (data_shards=10) → row_index 1, shard_id 2 + let interval = Interval { + block_index: 12, + inner_block_offset: 200, + size: 50, + is_large_block: true, + large_block_rows_count: 5, + }; + let (shard_id, offset) = interval.to_shard_id_and_offset(data_shards); + assert_eq!(shard_id, 2); // 12 % 10 = 2 + assert_eq!(offset, large_block_size + 200); // row 1 offset + inner_block_offset + + // Block index 10 → shard 0 (second row) + let interval = Interval { + block_index: 10, + inner_block_offset: 0, + size: 100, + is_large_block: true, + large_block_rows_count: 2, + }; + let (shard_id, offset) = interval.to_shard_id_and_offset(data_shards); + assert_eq!(shard_id, 0); + assert_eq!(offset, ERASURE_CODING_LARGE_BLOCK_SIZE as i64); // row 1 offset + } + + #[test] + fn test_locate_data_small_file() { + // Small file: 100 bytes at offset 50, shard size = 1MB + let intervals = locate_data(50, Size(100), 1024 * 1024, 10); + assert!(!intervals.is_empty()); + + // Should be a single small block interval (no large block rows for 1MB shard) + assert_eq!(intervals.len(), 1); + assert!(!intervals[0].is_large_block); + } + + #[test] + fn test_locate_data_empty() { + let intervals = locate_data(0, Size(0), 1024 * 1024, 10); + assert!(intervals.is_empty()); + } + + #[test] + fn test_small_block_after_large() { + let interval = Interval { + block_index: 0, + inner_block_offset: 0, + size: 100, + is_large_block: false, + large_block_rows_count: 2, + }; + let (_shard_id, offset) = interval.to_shard_id_and_offset(10); + // Should be after 2 large block rows + assert_eq!(offset, 2 * ERASURE_CODING_LARGE_BLOCK_SIZE as i64); + } +} diff --git a/seaweed-volume/src/storage/erasure_coding/ec_shard.rs b/seaweed-volume/src/storage/erasure_coding/ec_shard.rs new file mode 100644 index 000000000..6a6a8d6ea --- /dev/null +++ b/seaweed-volume/src/storage/erasure_coding/ec_shard.rs @@ -0,0 +1,225 @@ +//! EcVolumeShard: a single shard file (.ec00-.ec13) of an erasure-coded volume. + +use std::fs::{self, File, OpenOptions}; +use std::io::{self, Write}; + +use crate::storage::types::*; + +pub const DATA_SHARDS_COUNT: usize = 10; +pub const PARITY_SHARDS_COUNT: usize = 4; +pub const TOTAL_SHARDS_COUNT: usize = DATA_SHARDS_COUNT + PARITY_SHARDS_COUNT; +pub const MAX_SHARD_COUNT: usize = 32; +pub const MIN_TOTAL_DISKS: usize = TOTAL_SHARDS_COUNT / PARITY_SHARDS_COUNT + 1; +pub const ERASURE_CODING_LARGE_BLOCK_SIZE: usize = 1024 * 1024 * 1024; // 1GB +pub const ERASURE_CODING_SMALL_BLOCK_SIZE: usize = 1024 * 1024; // 1MB + +pub type ShardId = u8; + +/// A single erasure-coded shard file. +pub struct EcVolumeShard { + pub volume_id: VolumeId, + pub shard_id: ShardId, + pub collection: String, + pub dir: String, + pub disk_type: DiskType, + ecd_file: Option, + ecd_file_size: i64, +} + +impl EcVolumeShard { + /// Create a new shard reference (does not open the file). + pub fn new(dir: &str, collection: &str, volume_id: VolumeId, shard_id: ShardId) -> Self { + EcVolumeShard { + volume_id, + shard_id, + collection: collection.to_string(), + dir: dir.to_string(), + disk_type: DiskType::default(), + ecd_file: None, + ecd_file_size: 0, + } + } + + /// Shard file name, e.g. "dir/collection_42.ec03" + pub fn file_name(&self) -> String { + let base = + crate::storage::volume::volume_file_name(&self.dir, &self.collection, self.volume_id); + format!("{}.ec{:02}", base, self.shard_id) + } + + /// Open the shard file for reading. + pub fn open(&mut self) -> io::Result<()> { + let path = self.file_name(); + let file = File::open(&path)?; + self.ecd_file_size = file.metadata()?.len() as i64; + self.ecd_file = Some(file); + Ok(()) + } + + /// Create the shard file for writing. + pub fn create(&mut self) -> io::Result<()> { + let path = self.file_name(); + let file = OpenOptions::new() + .read(true) + .write(true) + .create(true) + .truncate(true) + .open(&path)?; + self.ecd_file = Some(file); + self.ecd_file_size = 0; + Ok(()) + } + + /// Read data at a specific offset. + pub fn read_at(&self, buf: &mut [u8], offset: u64) -> io::Result { + let file = self + .ecd_file + .as_ref() + .ok_or_else(|| io::Error::new(io::ErrorKind::Other, "shard file not open"))?; + + #[cfg(unix)] + { + use std::os::unix::fs::FileExt; + file.read_at(buf, offset) + } + + #[cfg(not(unix))] + { + use std::io::{Read, Seek, SeekFrom}; + // File::read_at is unix-only; fall back to seek + read. + // We need a mutable reference for seek/read, so clone the handle. + let mut f = file.try_clone()?; + f.seek(SeekFrom::Start(offset))?; + f.read(buf) + } + } + + /// Write data to the shard file (appends). + pub fn write_all(&mut self, data: &[u8]) -> io::Result<()> { + let file = self + .ecd_file + .as_mut() + .ok_or_else(|| io::Error::new(io::ErrorKind::Other, "shard file not open"))?; + file.write_all(data)?; + self.ecd_file_size += data.len() as i64; + Ok(()) + } + + pub fn file_size(&self) -> i64 { + self.ecd_file_size + } + + /// Close the shard file. + pub fn close(&mut self) { + if let Some(ref file) = self.ecd_file { + let _ = file.sync_all(); + } + self.ecd_file = None; + } + + /// Delete the shard file from disk. + pub fn destroy(&mut self) { + self.close(); + let _ = fs::remove_file(self.file_name()); + } +} + +/// ShardBits: bitmap tracking which shards are present. +#[derive(Debug, Clone, Copy, Default, PartialEq, Eq)] +pub struct ShardBits(pub u32); + +impl ShardBits { + pub fn add_shard_id(&mut self, id: ShardId) { + assert!((id as usize) < 32, "shard id {} out of bounds (max 31)", id,); + self.0 |= 1 << id; + } + + pub fn remove_shard_id(&mut self, id: ShardId) { + assert!((id as usize) < 32, "shard id {} out of bounds (max 31)", id,); + self.0 &= !(1 << id); + } + + pub fn has_shard_id(&self, id: ShardId) -> bool { + if (id as usize) >= 32 { + return false; + } + self.0 & (1 << id) != 0 + } + + pub fn shard_id_count(&self) -> usize { + self.0.count_ones() as usize + } + + /// Iterator over present shard IDs. + pub fn shard_ids(&self) -> Vec { + let mut ids = Vec::with_capacity(self.shard_id_count()); + for i in 0..32 { + if self.has_shard_id(i) { + ids.push(i); + } + } + ids + } + + pub fn minus(&self, other: ShardBits) -> ShardBits { + ShardBits(self.0 & !other.0) + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_shard_bits() { + let mut bits = ShardBits::default(); + assert_eq!(bits.shard_id_count(), 0); + + bits.add_shard_id(0); + bits.add_shard_id(3); + bits.add_shard_id(13); + assert_eq!(bits.shard_id_count(), 3); + assert!(bits.has_shard_id(0)); + assert!(bits.has_shard_id(3)); + assert!(!bits.has_shard_id(1)); + + bits.remove_shard_id(3); + assert!(!bits.has_shard_id(3)); + assert_eq!(bits.shard_id_count(), 2); + } + + #[test] + fn test_shard_bits_ids() { + let mut bits = ShardBits::default(); + bits.add_shard_id(1); + bits.add_shard_id(5); + bits.add_shard_id(9); + assert_eq!(bits.shard_ids(), vec![1, 5, 9]); + } + + #[test] + fn test_shard_bits_minus() { + let mut a = ShardBits::default(); + a.add_shard_id(0); + a.add_shard_id(1); + a.add_shard_id(2); + + let mut b = ShardBits::default(); + b.add_shard_id(1); + + let c = a.minus(b); + assert_eq!(c.shard_ids(), vec![0, 2]); + } + + #[test] + fn test_shard_file_name() { + let shard = EcVolumeShard::new("/data", "pics", VolumeId(42), 3); + assert_eq!(shard.file_name(), "/data/pics_42.ec03"); + } + + #[test] + fn test_shard_file_name_no_collection() { + let shard = EcVolumeShard::new("/data", "", VolumeId(7), 13); + assert_eq!(shard.file_name(), "/data/7.ec13"); + } +} diff --git a/seaweed-volume/src/storage/erasure_coding/ec_volume.rs b/seaweed-volume/src/storage/erasure_coding/ec_volume.rs new file mode 100644 index 000000000..24967c04a --- /dev/null +++ b/seaweed-volume/src/storage/erasure_coding/ec_volume.rs @@ -0,0 +1,944 @@ +//! EcVolume: an erasure-coded volume with up to 14 shards. +//! +//! Each EcVolume has a sorted index (.ecx) and a deletion journal (.ecj). +//! Shards (.ec00-.ec13) may be distributed across multiple servers. + +use std::collections::HashMap; +use std::fs::{self, File, OpenOptions}; +use std::io::{self, Write}; +use std::time::{SystemTime, UNIX_EPOCH}; + +use crate::pb::master_pb; +use crate::storage::erasure_coding::ec_locate; +use crate::storage::erasure_coding::ec_shard::*; +use crate::storage::needle::needle::{get_actual_size, Needle}; +use crate::storage::types::*; + +/// An erasure-coded volume managing its local shards and index. +pub struct EcVolume { + pub volume_id: VolumeId, + pub collection: String, + pub dir: String, + pub dir_idx: String, + pub version: Version, + pub shards: Vec>, // indexed by ShardId (0..14) + pub dat_file_size: i64, + pub data_shards: u32, + pub parity_shards: u32, + ecx_file: Option, + ecx_file_size: i64, + ecj_file: Option, + pub disk_type: DiskType, + /// Directory where .ecx/.ecj were actually found (may differ from dir_idx after fallback). + ecx_actual_dir: String, + /// Maps shard ID -> list of server addresses where that shard exists. + /// Used for distributed EC reads across the cluster. + pub shard_locations: HashMap>, + /// EC volume expiration time (unix epoch seconds), set during EC encode from TTL. + pub expire_at_sec: u64, +} + +pub fn read_ec_shard_config(dir: &str, collection: &str, volume_id: VolumeId) -> (u32, u32) { + let mut data_shards = crate::storage::erasure_coding::ec_shard::DATA_SHARDS_COUNT as u32; + let mut parity_shards = crate::storage::erasure_coding::ec_shard::PARITY_SHARDS_COUNT as u32; + let base = crate::storage::volume::volume_file_name(dir, collection, volume_id); + let vif_path = format!("{}.vif", base); + if let Ok(vif_content) = std::fs::read_to_string(&vif_path) { + if let Ok(vif_info) = + serde_json::from_str::(&vif_content) + { + if let Some(ec) = vif_info.ec_shard_config { + if ec.data_shards > 0 + && ec.parity_shards > 0 + && (ec.data_shards + ec.parity_shards) <= TOTAL_SHARDS_COUNT as u32 + { + data_shards = ec.data_shards; + parity_shards = ec.parity_shards; + } + } + } + } + (data_shards, parity_shards) +} + +impl EcVolume { + /// Create a new EcVolume. Loads .ecx index and .ecj journal if present. + pub fn new( + dir: &str, + dir_idx: &str, + collection: &str, + volume_id: VolumeId, + ) -> io::Result { + let (data_shards, parity_shards) = read_ec_shard_config(dir, collection, volume_id); + + let total_shards = (data_shards + parity_shards) as usize; + let mut shards = Vec::with_capacity(total_shards); + for _ in 0..total_shards { + shards.push(None); + } + + // Read expire_at_sec and version from .vif if present (matches Go's MaybeLoadVolumeInfo) + let (expire_at_sec, vif_version) = { + let base = crate::storage::volume::volume_file_name(dir, collection, volume_id); + let vif_path = format!("{}.vif", base); + if let Ok(vif_content) = std::fs::read_to_string(&vif_path) { + if let Ok(vif_info) = + serde_json::from_str::(&vif_content) + { + let ver = if vif_info.version > 0 { + Version(vif_info.version as u8) + } else { + Version::current() + }; + (vif_info.expire_at_sec, ver) + } else { + (0, Version::current()) + } + } else { + (0, Version::current()) + } + }; + + let mut vol = EcVolume { + volume_id, + collection: collection.to_string(), + dir: dir.to_string(), + dir_idx: dir_idx.to_string(), + version: vif_version, + shards, + dat_file_size: 0, + data_shards, + parity_shards, + ecx_file: None, + ecx_file_size: 0, + ecj_file: None, + disk_type: DiskType::default(), + ecx_actual_dir: dir_idx.to_string(), + shard_locations: HashMap::new(), + expire_at_sec, + }; + + // Open .ecx file (sorted index) in read/write mode for in-place deletion marking. + // Matches Go which opens ecx for writing via MarkNeedleDeleted. + let ecx_path = vol.ecx_file_name(); + if std::path::Path::new(&ecx_path).exists() { + let file = OpenOptions::new().read(true).write(true).open(&ecx_path)?; + vol.ecx_file_size = file.metadata()?.len() as i64; + vol.ecx_file = Some(file); + } else if dir_idx != dir { + // Fall back to data directory if .ecx was created before -dir.idx was configured + let data_base = crate::storage::volume::volume_file_name(dir, collection, volume_id); + let fallback_ecx = format!("{}.ecx", data_base); + if std::path::Path::new(&fallback_ecx).exists() { + tracing::info!( + volume_id = volume_id.0, + "ecx file not found in idx dir, falling back to data dir" + ); + let file = OpenOptions::new().read(true).write(true).open(&fallback_ecx)?; + vol.ecx_file_size = file.metadata()?.len() as i64; + vol.ecx_file = Some(file); + vol.ecx_actual_dir = dir.to_string(); + } + } + + // Replay .ecj journal into .ecx on startup (matches Go's RebuildEcxFile). + vol.rebuild_ecx_from_journal()?; + + // Open .ecj file (deletion journal) — use ecx_actual_dir for consistency + let ecj_base = + crate::storage::volume::volume_file_name(&vol.ecx_actual_dir, collection, volume_id); + let ecj_path = format!("{}.ecj", ecj_base); + let ecj_file = OpenOptions::new() + .read(true) + .write(true) + .create(true) + .append(true) + .open(&ecj_path)?; + vol.ecj_file = Some(ecj_file); + + Ok(vol) + } + + // ---- File names ---- + + #[allow(dead_code)] + fn base_name(&self) -> String { + crate::storage::volume::volume_file_name(&self.dir, &self.collection, self.volume_id) + } + + fn idx_base_name(&self) -> String { + crate::storage::volume::volume_file_name(&self.dir_idx, &self.collection, self.volume_id) + } + + pub fn ecx_file_name(&self) -> String { + format!("{}.ecx", self.idx_base_name()) + } + + pub fn ecj_file_name(&self) -> String { + format!("{}.ecj", self.idx_base_name()) + } + + /// Sync the EC volume's journal and index files to disk (matching Go's ecv.Sync()). + /// Go flushes both .ecj and .ecx to ensure in-place deletion marks are persisted. + pub fn sync_to_disk(&self) -> io::Result<()> { + if let Some(ref ecj_file) = self.ecj_file { + ecj_file.sync_all()?; + } + if let Some(ref ecx_file) = self.ecx_file { + ecx_file.sync_all()?; + } + Ok(()) + } + + // ---- Shard management ---- + + /// Add a shard to this volume. + pub fn add_shard(&mut self, mut shard: EcVolumeShard) -> io::Result<()> { + let id = shard.shard_id as usize; + let total_shards = (self.data_shards + self.parity_shards) as usize; + if id >= total_shards { + return Err(io::Error::new( + io::ErrorKind::InvalidInput, + format!("invalid shard id: {} (max {})", id, total_shards - 1), + )); + } + shard.open()?; + self.shards[id] = Some(shard); + Ok(()) + } + + /// Remove and close a shard. + pub fn remove_shard(&mut self, shard_id: ShardId) { + if let Some(ref mut shard) = self.shards[shard_id as usize] { + shard.close(); + } + self.shards[shard_id as usize] = None; + } + + /// Get a ShardBits bitmap of locally available shards. + pub fn shard_bits(&self) -> ShardBits { + let mut bits = ShardBits::default(); + for (i, shard) in self.shards.iter().enumerate() { + if shard.is_some() { + bits.add_shard_id(i as ShardId); + } + } + bits + } + + /// Count of locally available shards. + pub fn shard_count(&self) -> usize { + self.shards.iter().filter(|s| s.is_some()).count() + } + + pub fn is_time_to_destroy(&self) -> bool { + self.expire_at_sec > 0 + && SystemTime::now() + .duration_since(UNIX_EPOCH) + .unwrap_or_default() + .as_secs() + > self.expire_at_sec + } + + pub fn to_volume_ec_shard_information_messages( + &self, + disk_id: u32, + ) -> Vec { + let mut ec_index_bits: u32 = 0; + let mut shard_sizes = Vec::new(); + for shard in self.shards.iter().flatten() { + ec_index_bits |= 1u32 << shard.shard_id; + shard_sizes.push(shard.file_size()); + } + + if ec_index_bits == 0 { + return Vec::new(); + } + + vec![master_pb::VolumeEcShardInformationMessage { + id: self.volume_id.0, + collection: self.collection.clone(), + ec_index_bits, + shard_sizes, + disk_type: self.disk_type.to_string(), + expire_at_sec: self.expire_at_sec, + disk_id, + ..Default::default() + }] + } + + // ---- Shard locations (distributed tracking) ---- + + /// Set the list of server addresses for a given shard ID. + pub fn set_shard_locations(&mut self, shard_id: ShardId, locations: Vec) { + self.shard_locations.insert(shard_id, locations); + } + + /// Get the list of server addresses for a given shard ID. + pub fn get_shard_locations(&self, shard_id: ShardId) -> &[String] { + self.shard_locations + .get(&shard_id) + .map(|v| v.as_slice()) + .unwrap_or(&[]) + } + + // ---- Index operations ---- + + /// Find a needle's offset and size in the sorted .ecx index via binary search. + pub fn find_needle_from_ecx(&self, needle_id: NeedleId) -> io::Result> { + let ecx_file = self + .ecx_file + .as_ref() + .ok_or_else(|| io::Error::new(io::ErrorKind::Other, "ecx file not open"))?; + + let entry_count = self.ecx_file_size as usize / NEEDLE_MAP_ENTRY_SIZE; + if entry_count == 0 { + return Ok(None); + } + + // Binary search + let mut lo: usize = 0; + let mut hi: usize = entry_count; + let mut entry_buf = [0u8; NEEDLE_MAP_ENTRY_SIZE]; + + while lo < hi { + let mid = lo + (hi - lo) / 2; + let file_offset = (mid * NEEDLE_MAP_ENTRY_SIZE) as u64; + + #[cfg(unix)] + { + use std::os::unix::fs::FileExt; + ecx_file.read_exact_at(&mut entry_buf, file_offset)?; + } + + let (key, offset, size) = idx_entry_from_bytes(&entry_buf); + if key == needle_id { + return Ok(Some((offset, size))); + } else if key < needle_id { + lo = mid + 1; + } else { + hi = mid; + } + } + + Ok(None) + } + + /// Locate the EC shard intervals needed to read a needle. + pub fn locate_needle( + &self, + needle_id: NeedleId, + ) -> io::Result)>> { + let (offset, size) = match self.find_needle_from_ecx(needle_id)? { + Some((o, s)) => (o, s), + None => return Ok(None), + }; + + if size.is_deleted() || offset.is_zero() { + return Ok(None); + } + + // Match Go's LocateEcShardNeedleInterval: shardSize = shard.ecdFileSize - 1 + // Shards are usually padded to ErasureCodingSmallBlockSize, so subtract 1 + // to avoid off-by-one in large block row count calculation. + // If datFileSize is known, use datFileSize / DataShards instead. + let shard_size = if self.dat_file_size > 0 { + self.dat_file_size / self.data_shards as i64 + } else { + self.shard_file_size() - 1 + }; + // Pass the actual on-disk size (header+body+checksum+timestamp+padding) + // to locate_data, matching Go: types.Size(needle.GetActualSize(size, version)) + let actual = get_actual_size(size, self.version); + let intervals = ec_locate::locate_data( + offset.to_actual_offset(), + Size(actual as i32), + shard_size, + self.data_shards, + ); + + Ok(Some((offset, size, intervals))) + } + + /// Read a full needle from locally available EC shards. + /// + /// Locates the needle in the .ecx index, determines which shard intervals + /// contain its data, reads from local shards, and parses the result into + /// a fully populated Needle (including last_modified, checksum, ttl). + /// + /// Returns `Ok(None)` if the needle is not found or is deleted. + /// Returns an error if a required shard is not available locally. + pub fn read_ec_shard_needle(&self, needle_id: NeedleId) -> io::Result> { + let (offset, size, intervals) = match self.locate_needle(needle_id)? { + Some(v) => v, + None => return Ok(None), + }; + + if intervals.is_empty() { + return Err(io::Error::new( + io::ErrorKind::InvalidData, + "no intervals for needle", + )); + } + + // Compute the total bytes we need to read (full needle on disk) + let actual_size = get_actual_size(size, self.version) as usize; + let mut bytes = Vec::with_capacity(actual_size); + + for interval in &intervals { + let (shard_id, shard_offset) = interval.to_shard_id_and_offset(self.data_shards); + let shard = self + .shards + .get(shard_id as usize) + .and_then(|s| s.as_ref()) + .ok_or_else(|| { + io::Error::new( + io::ErrorKind::NotFound, + format!("ec shard {} not available locally", shard_id), + ) + })?; + + let mut buf = vec![0u8; interval.size as usize]; + shard.read_at(&mut buf, shard_offset as u64)?; + bytes.extend_from_slice(&buf); + } + + // Truncate to exact actual_size (intervals may span more than needed) + bytes.truncate(actual_size); + + if bytes.len() < actual_size { + return Err(io::Error::new( + io::ErrorKind::UnexpectedEof, + format!( + "read {} bytes but need {} for needle {}", + bytes.len(), + actual_size, + needle_id + ), + )); + } + + let mut n = Needle::default(); + n.read_bytes(&bytes, offset.to_actual_offset(), size, self.version) + .map_err(|e| io::Error::new(io::ErrorKind::InvalidData, format!("{}", e)))?; + + Ok(Some(n)) + } + + /// Get the size of a single shard (all shards are the same size). + fn shard_file_size(&self) -> i64 { + for shard in &self.shards { + if let Some(s) = shard { + return s.file_size(); + } + } + 0 + } + + /// Walk the .ecx index and return (file_count, file_deleted_count, total_size). + /// total_size sums size.Raw() for all entries (including deleted), matching Go's WalkIndex. + pub fn walk_ecx_stats(&self) -> io::Result<(u64, u64, u64)> { + let ecx_file = match self.ecx_file.as_ref() { + Some(f) => f, + None => return Ok((0, 0, 0)), + }; + + let entry_count = self.ecx_file_size as usize / NEEDLE_MAP_ENTRY_SIZE; + let mut files: u64 = 0; + let mut files_deleted: u64 = 0; + let mut total_size: u64 = 0; + let mut entry_buf = [0u8; NEEDLE_MAP_ENTRY_SIZE]; + + for i in 0..entry_count { + let file_offset = (i * NEEDLE_MAP_ENTRY_SIZE) as u64; + #[cfg(unix)] + { + use std::os::unix::fs::FileExt; + ecx_file.read_exact_at(&mut entry_buf, file_offset)?; + } + let (_key, _offset, size) = idx_entry_from_bytes(&entry_buf); + // Match Go's Size.Raw(): tombstone (-1) returns 0, other negatives return abs + if !size.is_tombstone() { + total_size += size.0.unsigned_abs() as u64; + } + if size.is_deleted() { + files_deleted += 1; + } else { + files += 1; + } + } + + Ok((files, files_deleted, total_size)) + } + + /// ScrubIndex verifies index integrity of an EC volume. + /// Matches Go's `(ev *EcVolume) ScrubIndex()` → `idx.CheckIndexFile()`. + /// Returns (entry_count, errors). + pub fn scrub_index(&self) -> (u64, Vec) { + let ecx_file = match self.ecx_file.as_ref() { + Some(f) => f, + None => { + return ( + 0, + vec![format!( + "no ECX file associated with EC volume {}", + self.volume_id.0 + )], + ) + } + }; + + if self.ecx_file_size == 0 { + return ( + 0, + vec![format!( + "zero-size ECX file for EC volume {}", + self.volume_id.0 + )], + ); + } + + let entry_count = self.ecx_file_size as usize / NEEDLE_MAP_ENTRY_SIZE; + let mut entries: Vec<(usize, NeedleId, i64, Size)> = Vec::with_capacity(entry_count); + let mut errs: Vec = Vec::new(); + let mut entry_buf = [0u8; NEEDLE_MAP_ENTRY_SIZE]; + + // Walk all entries + for i in 0..entry_count { + let file_offset = (i * NEEDLE_MAP_ENTRY_SIZE) as u64; + #[cfg(unix)] + { + use std::os::unix::fs::FileExt; + if let Err(e) = ecx_file.read_exact_at(&mut entry_buf, file_offset) { + errs.push(format!("read ecx entry {}: {}", i, e)); + continue; + } + } + let (key, offset, size) = idx_entry_from_bytes(&entry_buf); + entries.push((i, key, offset.to_actual_offset(), size)); + } + + // Sort by offset, then size + entries.sort_by(|a, b| a.2.cmp(&b.2).then(a.3 .0.cmp(&b.3 .0))); + + // Check for overlapping needles + for i in 1..entries.len() { + let (idx, id, offset, size) = entries[i]; + let (_, last_id, last_offset, last_size) = entries[i - 1]; + + let actual_size = + crate::storage::needle::needle::get_actual_size(size, self.version); + let end = if actual_size != 0 { + offset + actual_size - 1 + } else { + offset + }; + + let last_actual_size = + crate::storage::needle::needle::get_actual_size(last_size, self.version); + let last_end = if last_actual_size != 0 { + last_offset + last_actual_size - 1 + } else { + last_offset + }; + + if offset <= last_end { + errs.push(format!( + "needle {} (#{}) at [{}-{}] overlaps needle {} at [{}-{}]", + id.0, + idx + 1, + offset, + end, + last_id.0, + last_offset, + last_end + )); + } + } + + // Verify file size matches entry count + let expected_size = entry_count as i64 * NEEDLE_MAP_ENTRY_SIZE as i64; + if expected_size != self.ecx_file_size { + errs.push(format!( + "expected an index file of size {}, got {}", + expected_size, self.ecx_file_size + )); + } + + (entries.len() as u64, errs) + } + + // ---- Deletion ---- + + /// Mark a needle as deleted in the .ecx file in-place. + /// Matches Go's MarkNeedleDeleted: binary search the .ecx, then overwrite + /// the size field with TOMBSTONE_FILE_SIZE. + fn mark_needle_deleted_in_ecx(&self, needle_id: NeedleId) -> io::Result { + let ecx_file = match self.ecx_file.as_ref() { + Some(f) => f, + None => return Ok(false), + }; + + let entry_count = self.ecx_file_size as usize / NEEDLE_MAP_ENTRY_SIZE; + if entry_count == 0 { + return Ok(false); + } + + // Binary search for the needle + let mut lo: usize = 0; + let mut hi: usize = entry_count; + let mut entry_buf = [0u8; NEEDLE_MAP_ENTRY_SIZE]; + + while lo < hi { + let mid = lo + (hi - lo) / 2; + let file_offset = (mid * NEEDLE_MAP_ENTRY_SIZE) as u64; + + #[cfg(unix)] + { + use std::os::unix::fs::FileExt; + ecx_file.read_exact_at(&mut entry_buf, file_offset)?; + } + + let (key, _offset, _size) = idx_entry_from_bytes(&entry_buf); + if key == needle_id { + // Found — overwrite the size field with TOMBSTONE_FILE_SIZE + let size_offset = file_offset + NEEDLE_ID_SIZE as u64 + OFFSET_SIZE as u64; + let mut size_buf = [0u8; SIZE_SIZE]; + TOMBSTONE_FILE_SIZE.to_bytes(&mut size_buf); + #[cfg(unix)] + { + use std::os::unix::fs::FileExt; + ecx_file.write_all_at(&size_buf, size_offset)?; + } + return Ok(true); + } else if key < needle_id { + lo = mid + 1; + } else { + hi = mid; + } + } + + Ok(false) // not found + } + + /// Replay .ecj journal entries into .ecx on startup. + /// Matches Go's RebuildEcxFile: for each needle ID in .ecj, marks it + /// deleted in .ecx, then removes the .ecj file. + fn rebuild_ecx_from_journal(&mut self) -> io::Result<()> { + let ecj_path = self.ecj_file_name(); + if !std::path::Path::new(&ecj_path).exists() { + return Ok(()); + } + + let data = fs::read(&ecj_path)?; + if data.is_empty() { + return Ok(()); + } + + let count = data.len() / NEEDLE_ID_SIZE; + for i in 0..count { + let start = i * NEEDLE_ID_SIZE; + if start + NEEDLE_ID_SIZE > data.len() { + break; + } + let needle_id = NeedleId::from_bytes(&data[start..start + NEEDLE_ID_SIZE]); + // Errors for individual entries are non-fatal (needle may not exist in .ecx) + let _ = self.mark_needle_deleted_in_ecx(needle_id); + } + + // Remove the .ecj file after replay (matches Go) + let _ = fs::remove_file(&ecj_path); + + // Re-create .ecj for future deletions + let ecj_file = OpenOptions::new() + .read(true) + .write(true) + .create(true) + .append(true) + .open(&ecj_path)?; + self.ecj_file = Some(ecj_file); + + Ok(()) + } + + // ---- Deletion journal ---- + + /// Append a deleted needle ID to the .ecj journal and mark in .ecx. + /// Matches Go's DeleteNeedleFromEcx: marks in .ecx first, then journals. + pub fn journal_delete(&mut self, needle_id: NeedleId) -> io::Result<()> { + // Mark deleted in .ecx in-place (matches Go's MarkNeedleDeleted) + let _ = self.mark_needle_deleted_in_ecx(needle_id); + let ecj_file = self + .ecj_file + .as_mut() + .ok_or_else(|| io::Error::new(io::ErrorKind::Other, "ecj file not open"))?; + + let mut buf = [0u8; NEEDLE_ID_SIZE]; + needle_id.to_bytes(&mut buf); + ecj_file.write_all(&buf)?; + ecj_file.sync_all()?; + Ok(()) + } + + /// Append a deleted needle ID to the .ecj journal, validating the cookie first. + /// Matches Go's DeleteEcShardNeedle which validates cookie before journaling. + /// A cookie of 0 means skip cookie check (e.g., orphan cleanup). + pub fn journal_delete_with_cookie( + &mut self, + needle_id: NeedleId, + cookie: crate::storage::types::Cookie, + ) -> io::Result<()> { + // cookie == 0 indicates SkipCookieCheck was requested + if cookie.0 != 0 { + // Try to read the needle's cookie from the EC shards to validate + // Look up the needle in ecx index to find its offset, then read header from shard + if let Ok(Some((offset, size))) = self.find_needle_from_ecx(needle_id) { + if !size.is_deleted() && !offset.is_zero() { + let actual_offset = offset.to_actual_offset() as u64; + // Determine which shard contains this offset and read the cookie + let shard_size = self + .shards + .iter() + .filter_map(|s| s.as_ref()) + .map(|s| s.file_size()) + .next() + .unwrap_or(0) as u64; + if shard_size > 0 { + let shard_id = (actual_offset / shard_size) as usize; + let shard_offset = actual_offset % shard_size; + if let Some(Some(shard)) = self.shards.get(shard_id) { + let mut header_buf = [0u8; 4]; // cookie is first 4 bytes of needle + if shard.read_at(&mut header_buf, shard_offset).is_ok() { + let needle_cookie = + crate::storage::types::Cookie(u32::from_be_bytes(header_buf)); + if needle_cookie != cookie { + return Err(io::Error::new( + io::ErrorKind::InvalidData, + format!("unexpected cookie {:x}", cookie.0), + )); + } + } + } + } + } + } + } + self.journal_delete(needle_id) + } + + /// Read all deleted needle IDs from the .ecj journal. + pub fn read_deleted_needles(&self) -> io::Result> { + let ecj_path = self.ecj_file_name(); + if !std::path::Path::new(&ecj_path).exists() { + return Ok(Vec::new()); + } + + let data = fs::read(&ecj_path)?; + let count = data.len() / NEEDLE_ID_SIZE; + let mut needles = Vec::with_capacity(count); + for i in 0..count { + let start = i * NEEDLE_ID_SIZE; + let id = NeedleId::from_bytes(&data[start..start + NEEDLE_ID_SIZE]); + needles.push(id); + } + Ok(needles) + } + + // ---- Lifecycle ---- + + pub fn close(&mut self) { + for shard in &mut self.shards { + if let Some(s) = shard { + s.close(); + } + *shard = None; + } + // Sync .ecx before closing to flush in-place deletion marks (matches Go's ev.ecxFile.Sync()) + if let Some(ref ecx_file) = self.ecx_file { + let _ = ecx_file.sync_all(); + } + self.ecx_file = None; + self.ecj_file = None; + } + + pub fn destroy(&mut self) { + for shard in &mut self.shards { + if let Some(s) = shard { + s.destroy(); + } + *shard = None; + } + // Remove .ecx/.ecj/.vif from ecx_actual_dir (where they were found) + // Go's Destroy() removes .ecx, .ecj, and .vif files. + let actual_base = crate::storage::volume::volume_file_name( + &self.ecx_actual_dir, + &self.collection, + self.volume_id, + ); + let _ = fs::remove_file(format!("{}.ecx", actual_base)); + let _ = fs::remove_file(format!("{}.ecj", actual_base)); + let _ = fs::remove_file(format!("{}.vif", actual_base)); + // Also try the configured idx dir and data dir in case files exist in either + if self.ecx_actual_dir != self.dir_idx { + let _ = fs::remove_file(self.ecx_file_name()); + let _ = fs::remove_file(self.ecj_file_name()); + let idx_base = crate::storage::volume::volume_file_name( + &self.dir_idx, + &self.collection, + self.volume_id, + ); + let _ = fs::remove_file(format!("{}.vif", idx_base)); + } + if self.ecx_actual_dir != self.dir && self.dir_idx != self.dir { + let data_base = crate::storage::volume::volume_file_name( + &self.dir, + &self.collection, + self.volume_id, + ); + let _ = fs::remove_file(format!("{}.ecx", data_base)); + let _ = fs::remove_file(format!("{}.ecj", data_base)); + let _ = fs::remove_file(format!("{}.vif", data_base)); + } + self.ecx_file = None; + self.ecj_file = None; + } +} + +#[cfg(test)] +mod tests { + use super::*; + use tempfile::TempDir; + + fn write_ecx_file( + dir: &str, + collection: &str, + vid: VolumeId, + entries: &[(NeedleId, Offset, Size)], + ) { + let base = crate::storage::volume::volume_file_name(dir, collection, vid); + let ecx_path = format!("{}.ecx", base); + let mut file = File::create(&ecx_path).unwrap(); + + // Write sorted entries + for &(key, offset, size) in entries { + let mut buf = [0u8; NEEDLE_MAP_ENTRY_SIZE]; + idx_entry_to_bytes(&mut buf, key, offset, size); + file.write_all(&buf).unwrap(); + } + } + + #[test] + fn test_ec_volume_find_needle() { + let tmp = TempDir::new().unwrap(); + let dir = tmp.path().to_str().unwrap(); + + // Write sorted ecx entries + let entries = vec![ + (NeedleId(1), Offset::from_actual_offset(8), Size(100)), + (NeedleId(5), Offset::from_actual_offset(200), Size(200)), + (NeedleId(10), Offset::from_actual_offset(500), Size(300)), + ]; + write_ecx_file(dir, "", VolumeId(1), &entries); + + let vol = EcVolume::new(dir, dir, "", VolumeId(1)).unwrap(); + + // Found + let result = vol.find_needle_from_ecx(NeedleId(5)).unwrap(); + assert!(result.is_some()); + let (offset, size) = result.unwrap(); + assert_eq!(offset.to_actual_offset(), 200); + assert_eq!(size, Size(200)); + + // Not found + let result = vol.find_needle_from_ecx(NeedleId(7)).unwrap(); + assert!(result.is_none()); + } + + #[test] + fn test_ec_volume_journal() { + let tmp = TempDir::new().unwrap(); + let dir = tmp.path().to_str().unwrap(); + + // Need ecx file for EcVolume::new to succeed + write_ecx_file(dir, "", VolumeId(1), &[]); + + let mut vol = EcVolume::new(dir, dir, "", VolumeId(1)).unwrap(); + + vol.journal_delete(NeedleId(10)).unwrap(); + vol.journal_delete(NeedleId(20)).unwrap(); + + let deleted = vol.read_deleted_needles().unwrap(); + assert_eq!(deleted, vec![NeedleId(10), NeedleId(20)]); + } + + #[test] + fn test_ec_volume_shard_bits() { + let tmp = TempDir::new().unwrap(); + let dir = tmp.path().to_str().unwrap(); + write_ecx_file(dir, "", VolumeId(1), &[]); + + let mut vol = EcVolume::new(dir, dir, "", VolumeId(1)).unwrap(); + assert_eq!(vol.shard_count(), 0); + + // Create a shard file so we can add it + let mut shard = EcVolumeShard::new(dir, "", VolumeId(1), 3); + shard.create().unwrap(); + shard.write_all(&[0u8; 100]).unwrap(); + shard.close(); + + vol.add_shard(EcVolumeShard::new(dir, "", VolumeId(1), 3)) + .unwrap(); + assert_eq!(vol.shard_count(), 1); + assert!(vol.shard_bits().has_shard_id(3)); + } + + #[test] + fn test_ec_volume_uses_collection_prefixed_vif_config() { + let tmp = TempDir::new().unwrap(); + let dir = tmp.path().to_str().unwrap(); + write_ecx_file(dir, "pics", VolumeId(1), &[]); + + let vif = crate::storage::volume::VifVolumeInfo { + ec_shard_config: Some(crate::storage::volume::VifEcShardConfig { + data_shards: 6, + parity_shards: 3, + }), + ..Default::default() + }; + let base = crate::storage::volume::volume_file_name(dir, "pics", VolumeId(1)); + std::fs::write( + format!("{}.vif", base), + serde_json::to_string_pretty(&vif).unwrap(), + ) + .unwrap(); + + let vol = EcVolume::new(dir, dir, "pics", VolumeId(1)).unwrap(); + assert_eq!(vol.data_shards, 6); + assert_eq!(vol.parity_shards, 3); + } + + #[test] + fn test_ec_volume_invalid_vif_config_falls_back_to_defaults() { + let tmp = TempDir::new().unwrap(); + let dir = tmp.path().to_str().unwrap(); + write_ecx_file(dir, "pics", VolumeId(1), &[]); + + let vif = crate::storage::volume::VifVolumeInfo { + ec_shard_config: Some(crate::storage::volume::VifEcShardConfig { + data_shards: 10, + parity_shards: 10, + }), + ..Default::default() + }; + let base = crate::storage::volume::volume_file_name(dir, "pics", VolumeId(1)); + std::fs::write( + format!("{}.vif", base), + serde_json::to_string_pretty(&vif).unwrap(), + ) + .unwrap(); + + let vol = EcVolume::new(dir, dir, "pics", VolumeId(1)).unwrap(); + assert_eq!(vol.data_shards, DATA_SHARDS_COUNT as u32); + assert_eq!(vol.parity_shards, PARITY_SHARDS_COUNT as u32); + } +} diff --git a/seaweed-volume/src/storage/erasure_coding/mod.rs b/seaweed-volume/src/storage/erasure_coding/mod.rs new file mode 100644 index 000000000..b6c07b450 --- /dev/null +++ b/seaweed-volume/src/storage/erasure_coding/mod.rs @@ -0,0 +1,16 @@ +//! Erasure coding module for volume data protection. +//! +//! Encodes a volume's .dat file into 10 data + 4 parity shards using +//! Reed-Solomon erasure coding. Can reconstruct from any 10 of 14 shards. + +pub mod ec_decoder; +pub mod ec_encoder; +pub mod ec_locate; +pub mod ec_shard; +pub mod ec_volume; + +pub use ec_shard::{ + EcVolumeShard, ShardId, DATA_SHARDS_COUNT, MAX_SHARD_COUNT, MIN_TOTAL_DISKS, + PARITY_SHARDS_COUNT, TOTAL_SHARDS_COUNT, +}; +pub use ec_volume::EcVolume; diff --git a/seaweed-volume/src/storage/idx/mod.rs b/seaweed-volume/src/storage/idx/mod.rs new file mode 100644 index 000000000..f8d556739 --- /dev/null +++ b/seaweed-volume/src/storage/idx/mod.rs @@ -0,0 +1,116 @@ +//! Index file (.idx) format: sequential 17-byte entries. +//! +//! Each entry: NeedleId(8) + Offset(5) + Size(4) = 17 bytes. + +use crate::storage::types::*; +use std::io::{self, Read, Seek, SeekFrom}; + +const ROWS_TO_READ: usize = 1024; + +/// Walk all entries in an .idx file, calling `f` for each. +/// Mirrors Go's `WalkIndexFile()`. +pub fn walk_index_file(reader: &mut R, start_from: u64, mut f: F) -> io::Result<()> +where + R: Read + Seek, + F: FnMut(NeedleId, Offset, Size) -> io::Result<()>, +{ + let reader_offset = start_from * NEEDLE_MAP_ENTRY_SIZE as u64; + reader.seek(SeekFrom::Start(reader_offset))?; + + let mut buf = vec![0u8; NEEDLE_MAP_ENTRY_SIZE * ROWS_TO_READ]; + + loop { + let count = match reader.read(&mut buf) { + Ok(0) => return Ok(()), + Ok(n) => n, + Err(ref e) if e.kind() == io::ErrorKind::UnexpectedEof => return Ok(()), + Err(e) => return Err(e), + }; + + let mut i = 0; + while i + NEEDLE_MAP_ENTRY_SIZE <= count { + let (key, offset, size) = idx_entry_from_bytes(&buf[i..i + NEEDLE_MAP_ENTRY_SIZE]); + f(key, offset, size)?; + i += NEEDLE_MAP_ENTRY_SIZE; + } + } +} + +/// Write a single index entry to a writer. +pub fn write_index_entry( + writer: &mut W, + key: NeedleId, + offset: Offset, + size: Size, +) -> io::Result<()> { + let mut buf = [0u8; NEEDLE_MAP_ENTRY_SIZE]; + idx_entry_to_bytes(&mut buf, key, offset, size); + writer.write_all(&buf) +} + +#[cfg(test)] +mod tests { + use super::*; + use std::io::Cursor; + + #[test] + fn test_walk_index_file() { + // Create a small index with 3 entries + let mut data = Vec::new(); + let entries = vec![ + (NeedleId(1), Offset::from_actual_offset(0), Size(100)), + (NeedleId(2), Offset::from_actual_offset(128), Size(200)), + (NeedleId(3), Offset::from_actual_offset(384), Size(300)), + ]; + for (key, offset, size) in &entries { + let mut buf = [0u8; NEEDLE_MAP_ENTRY_SIZE]; + idx_entry_to_bytes(&mut buf, *key, *offset, *size); + data.extend_from_slice(&buf); + } + + let mut cursor = Cursor::new(data); + let mut collected = Vec::new(); + walk_index_file(&mut cursor, 0, |key, offset, size| { + collected.push((key, offset.to_actual_offset(), size)); + Ok(()) + }) + .unwrap(); + + assert_eq!(collected.len(), 3); + assert_eq!(collected[0].0, NeedleId(1)); + assert_eq!(collected[0].1, 0); + assert_eq!(collected[0].2, Size(100)); + assert_eq!(collected[1].0, NeedleId(2)); + assert_eq!(collected[2].0, NeedleId(3)); + } + + #[test] + fn test_walk_empty() { + let mut cursor = Cursor::new(Vec::new()); + let mut count = 0; + walk_index_file(&mut cursor, 0, |_, _, _| { + count += 1; + Ok(()) + }) + .unwrap(); + assert_eq!(count, 0); + } + + #[test] + fn test_write_index_entry() { + let mut buf = Vec::new(); + write_index_entry( + &mut buf, + NeedleId(42), + Offset::from_actual_offset(8 * 10), + Size(512), + ) + .unwrap(); + assert_eq!(buf.len(), NEEDLE_MAP_ENTRY_SIZE); + + let (key, offset, size) = idx_entry_from_bytes(&buf); + assert_eq!(key, NeedleId(42)); + assert_eq!(offset.to_actual_offset(), 80); + assert_eq!(size, Size(512)); + } +} diff --git a/seaweed-volume/src/storage/mod.rs b/seaweed-volume/src/storage/mod.rs new file mode 100644 index 000000000..2507c7511 --- /dev/null +++ b/seaweed-volume/src/storage/mod.rs @@ -0,0 +1,9 @@ +pub mod disk_location; +pub mod erasure_coding; +pub mod idx; +pub mod needle; +pub mod needle_map; +pub mod store; +pub mod super_block; +pub mod types; +pub mod volume; diff --git a/seaweed-volume/src/storage/needle/crc.rs b/seaweed-volume/src/storage/needle/crc.rs new file mode 100644 index 000000000..6225c8495 --- /dev/null +++ b/seaweed-volume/src/storage/needle/crc.rs @@ -0,0 +1,73 @@ +//! CRC32-Castagnoli checksum for needle data integrity. +//! +//! Matches Go's `crc32.MakeTable(crc32.Castagnoli)` exactly. +//! The CRC is stored as raw u32 (not the `.Value()` legacy transform). + +/// CRC32-Castagnoli checksum wrapper. +#[derive(Debug, Clone, Copy, PartialEq, Eq, Default)] +pub struct CRC(pub u32); + +impl CRC { + /// Compute CRC from a byte slice (starting from 0). + pub fn new(data: &[u8]) -> Self { + CRC(0).update(data) + } + + /// Update the CRC with additional bytes. + pub fn update(self, data: &[u8]) -> Self { + CRC(crc32c::crc32c_append(self.0, data)) + } + + /// Legacy `.Value()` function — deprecated in Go but needed for backward compat check. + /// Formula: (crc >> 15 | crc << 17) + 0xa282ead8 + pub fn legacy_value(&self) -> u32 { + (self.0 >> 15 | self.0 << 17).wrapping_add(0xa282ead8) + } +} + +impl From for CRC { + fn from(v: u32) -> Self { + CRC(v) + } +} + +impl From for u32 { + fn from(c: CRC) -> Self { + c.0 + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_crc_empty() { + let crc = CRC::new(&[]); + assert_eq!(crc.0, 0); + } + + #[test] + fn test_crc_known_value() { + // CRC32-C of "hello" — verify it produces a non-zero deterministic value + let crc = CRC::new(b"hello"); + assert_ne!(crc.0, 0); + // Same input produces same output + assert_eq!(crc, CRC::new(b"hello")); + } + + #[test] + fn test_crc_incremental() { + let crc1 = CRC::new(b"hello world"); + let crc2 = CRC::new(b"hello").update(b" world"); + assert_eq!(crc1, crc2); + } + + #[test] + fn test_crc_legacy_value() { + let crc = CRC(0x12345678); + let v = crc.legacy_value(); + let expected = (0x12345678u32 >> 15 | 0x12345678u32 << 17).wrapping_add(0xa282ead8); + assert_eq!(v, expected); + } +} diff --git a/seaweed-volume/src/storage/needle/mod.rs b/seaweed-volume/src/storage/needle/mod.rs new file mode 100644 index 000000000..364c6122a --- /dev/null +++ b/seaweed-volume/src/storage/needle/mod.rs @@ -0,0 +1,7 @@ +pub mod crc; +pub mod needle; +pub mod ttl; + +pub use crc::CRC; +pub use needle::Needle; +pub use ttl::TTL; diff --git a/seaweed-volume/src/storage/needle/needle.rs b/seaweed-volume/src/storage/needle/needle.rs new file mode 100644 index 000000000..bbc55c9d0 --- /dev/null +++ b/seaweed-volume/src/storage/needle/needle.rs @@ -0,0 +1,944 @@ +//! Needle: the individual file object stored in a volume. +//! +//! Binary format (Version 2/3): +//! Header (16 bytes): Cookie(4) + NeedleId(8) + Size(4) +//! Body (Size bytes): +//! DataSize(4) + Data(DataSize) + Flags(1) +//! [if HasName]: NameSize(1) + Name(NameSize) +//! [if HasMime]: MimeSize(1) + Mime(MimeSize) +//! [if HasLastMod]: LastModified(5) +//! [if HasTtl]: TTL(2) +//! [if HasPairs]: PairsSize(2) + Pairs(PairsSize) +//! Tail: +//! Checksum(4) + [if V3: AppendAtNs(8)] + Padding(0-7) + +use super::crc::CRC; +use super::ttl::TTL; +use crate::storage::types::*; + +// Flag bits (matching Go constants) +pub const FLAG_IS_COMPRESSED: u8 = 0x01; +pub const FLAG_HAS_NAME: u8 = 0x02; +pub const FLAG_HAS_MIME: u8 = 0x04; +pub const FLAG_HAS_LAST_MODIFIED_DATE: u8 = 0x08; +pub const FLAG_HAS_TTL: u8 = 0x10; +pub const FLAG_HAS_PAIRS: u8 = 0x20; +pub const FLAG_IS_CHUNK_MANIFEST: u8 = 0x80; + +pub const LAST_MODIFIED_BYTES_LENGTH: usize = 5; +pub const TTL_BYTES_LENGTH: usize = 2; + +#[derive(Debug, Clone, Default)] +pub struct Needle { + pub cookie: Cookie, + pub id: NeedleId, + pub size: Size, // sum of body content fields + + // Version 2+ fields + pub data_size: u32, + pub data: Vec, + pub flags: u8, + pub name_size: u8, + pub name: Vec, // max 255 bytes + pub mime_size: u8, + pub mime: Vec, // max 255 bytes + pub pairs_size: u16, + pub pairs: Vec, // max 64KB, JSON + pub last_modified: u64, // stored as 5 bytes on disk + pub ttl: Option, + + // Tail fields + pub checksum: CRC, + pub append_at_ns: u64, // Version 3 only + pub padding: Vec, +} + +impl Needle { + // ---- Flag accessors (matching Go) ---- + + pub fn is_compressed(&self) -> bool { + self.flags & FLAG_IS_COMPRESSED != 0 + } + pub fn set_is_compressed(&mut self) { + self.flags |= FLAG_IS_COMPRESSED; + } + + pub fn has_name(&self) -> bool { + self.flags & FLAG_HAS_NAME != 0 + } + pub fn set_has_name(&mut self) { + self.flags |= FLAG_HAS_NAME; + } + + pub fn has_mime(&self) -> bool { + self.flags & FLAG_HAS_MIME != 0 + } + pub fn set_has_mime(&mut self) { + self.flags |= FLAG_HAS_MIME; + } + + pub fn has_last_modified_date(&self) -> bool { + self.flags & FLAG_HAS_LAST_MODIFIED_DATE != 0 + } + pub fn set_has_last_modified_date(&mut self) { + self.flags |= FLAG_HAS_LAST_MODIFIED_DATE; + } + + pub fn has_ttl(&self) -> bool { + self.flags & FLAG_HAS_TTL != 0 + } + pub fn set_has_ttl(&mut self) { + self.flags |= FLAG_HAS_TTL; + } + + pub fn has_pairs(&self) -> bool { + self.flags & FLAG_HAS_PAIRS != 0 + } + pub fn set_has_pairs(&mut self) { + self.flags |= FLAG_HAS_PAIRS; + } + + pub fn is_chunk_manifest(&self) -> bool { + self.flags & FLAG_IS_CHUNK_MANIFEST != 0 + } + pub fn set_is_chunk_manifest(&mut self) { + self.flags |= FLAG_IS_CHUNK_MANIFEST; + } + + // ---- Header parsing ---- + + /// Parse the 16-byte needle header. + pub fn parse_header(bytes: &[u8]) -> (Cookie, NeedleId, Size) { + assert!(bytes.len() >= NEEDLE_HEADER_SIZE); + let cookie = Cookie::from_bytes(&bytes[0..COOKIE_SIZE]); + let id = NeedleId::from_bytes(&bytes[COOKIE_SIZE..COOKIE_SIZE + NEEDLE_ID_SIZE]); + let size = Size::from_bytes(&bytes[COOKIE_SIZE + NEEDLE_ID_SIZE..NEEDLE_HEADER_SIZE]); + (cookie, id, size) + } + + /// Parse needle header into self. + pub fn read_header(&mut self, bytes: &[u8]) { + let (cookie, id, size) = Self::parse_header(bytes); + self.cookie = cookie; + self.id = id; + self.size = size; + } + + // ---- Body reading (Version 2/3) ---- + + /// Read version 2/3 body metadata only — skips copying the data payload. + /// Sets `data_size` and all metadata fields but leaves `data` empty. + pub fn read_body_v2_meta_only(&mut self, bytes: &[u8]) -> Result<(), NeedleError> { + let len_bytes = bytes.len(); + let mut index = 0; + + // DataSize (4 bytes) + if index + 4 > len_bytes { + return Err(NeedleError::IndexOutOfRange(1)); + } + self.data_size = u32::from_be_bytes([ + bytes[index], + bytes[index + 1], + bytes[index + 2], + bytes[index + 3], + ]); + index += 4; + + // Skip data bytes (do NOT copy them) + if index + self.data_size as usize > len_bytes { + return Err(NeedleError::IndexOutOfRange(1)); + } + index += self.data_size as usize; + + // Read non-data metadata + self.read_body_v2_non_data(&bytes[index..])?; + Ok(()) + } + + /// Read full needle from bytes but skip copying the data payload. + /// Sets all metadata fields, checksum, etc. but leaves `data` empty. + pub fn read_bytes_meta_only( + &mut self, + bytes: &[u8], + offset: i64, + expected_size: Size, + version: Version, + ) -> Result<(), NeedleError> { + self.read_header(bytes); + + if self.size != expected_size { + return Err(NeedleError::SizeMismatch { + offset, + id: self.id, + found: self.size, + expected: expected_size, + }); + } + + let body_start = NEEDLE_HEADER_SIZE; + let body_end = body_start + self.size.0 as usize; + + if version == VERSION_1 { + // V1 has no metadata — data is the entire body + self.data_size = self.size.0 as u32; + } else if self.size.0 == 0 { + // Tombstones have no DataSize/body section; metadata starts at the tail. + self.data_size = 0; + } else { + self.read_body_v2_meta_only(&bytes[body_start..body_end])?; + } + + // Read tail but skip CRC validation (no data to check against) + self.read_tail_meta_only(&bytes[body_end..], version)?; + Ok(()) + } + + /// Paged meta-only parse: accepts the 20-byte header+DataSize prefix and the + /// meta tail bytes (everything after the data payload). This avoids reading + /// the data payload from disk at all, matching Go's `ReadNeedleMeta`. + pub fn read_paged_meta( + &mut self, + header_bytes: &[u8], // first 20 bytes: NEEDLE_HEADER_SIZE + DATA_SIZE_SIZE + meta_bytes: &[u8], // tail: non-data body metadata + checksum + timestamp + padding + offset: i64, + expected_size: Size, + version: Version, + ) -> Result<(), NeedleError> { + // Parse the 16-byte header + self.read_header(header_bytes); + + if self.size != expected_size { + return Err(NeedleError::SizeMismatch { + offset, + id: self.id, + found: self.size, + expected: expected_size, + }); + } + + if version == VERSION_1 { + self.data_size = self.size.0 as u32; + } else if self.size.0 == 0 { + // Tombstone + self.data_size = 0; + } else { + // Extract DataSize from bytes 16..20 + self.data_size = u32::from_be_bytes([ + header_bytes[NEEDLE_HEADER_SIZE], + header_bytes[NEEDLE_HEADER_SIZE + 1], + header_bytes[NEEDLE_HEADER_SIZE + 2], + header_bytes[NEEDLE_HEADER_SIZE + 3], + ]); + + // meta_bytes starts with the non-data body metadata (flags, name, mime, etc.) + // followed by the tail (checksum + timestamp + padding). + // readNeedleDataVersion2NonData returns the index where it stopped. + let index = self.read_body_v2_non_data(meta_bytes)?; + self.read_tail_meta_only(&meta_bytes[index..], version)?; + return Ok(()); + } + + // For VERSION_1 or tombstones, meta_bytes IS the tail + self.read_tail_meta_only(meta_bytes, version)?; + Ok(()) + } + + /// Read tail without CRC validation (used when data was not read). + fn read_tail_meta_only( + &mut self, + tail_bytes: &[u8], + version: Version, + ) -> Result<(), NeedleError> { + if tail_bytes.len() < NEEDLE_CHECKSUM_SIZE { + return Err(NeedleError::TailTooShort); + } + + self.checksum = CRC(u32::from_be_bytes([ + tail_bytes[0], + tail_bytes[1], + tail_bytes[2], + tail_bytes[3], + ])); + + if version == VERSION_3 { + let ts_offset = NEEDLE_CHECKSUM_SIZE; + if tail_bytes.len() < ts_offset + TIMESTAMP_SIZE { + return Err(NeedleError::TailTooShort); + } + self.append_at_ns = u64::from_be_bytes([ + tail_bytes[ts_offset], + tail_bytes[ts_offset + 1], + tail_bytes[ts_offset + 2], + tail_bytes[ts_offset + 3], + tail_bytes[ts_offset + 4], + tail_bytes[ts_offset + 5], + tail_bytes[ts_offset + 6], + tail_bytes[ts_offset + 7], + ]); + } + + Ok(()) + } + + /// Read the version 2/3 body data from bytes (size bytes starting after header). + /// Returns IndexOutOfRange errors for truncated data (matching Go's readNeedleDataVersion2). + pub fn read_body_v2(&mut self, bytes: &[u8]) -> Result<(), NeedleError> { + let len_bytes = bytes.len(); + let mut index = 0; + + // DataSize (4 bytes) + if index + 4 > len_bytes { + return Ok(()); // tolerate EOF + } + self.data_size = u32::from_be_bytes([ + bytes[index], + bytes[index + 1], + bytes[index + 2], + bytes[index + 3], + ]); + index += 4; + + // Data + if index + self.data_size as usize > len_bytes { + return Err(NeedleError::IndexOutOfRange(1)); + } + self.data = bytes[index..index + self.data_size as usize].to_vec(); + index += self.data_size as usize; + + // Read non-data metadata + self.read_body_v2_non_data(&bytes[index..])?; + Ok(()) + } + + /// Read version 2/3 metadata fields (everything after Data). + /// Returns IndexOutOfRange errors for truncated data (matching Go's readNeedleDataVersion2). + fn read_body_v2_non_data(&mut self, bytes: &[u8]) -> Result { + let len_bytes = bytes.len(); + let mut index = 0; + + // Flags (1 byte) + if index < len_bytes { + self.flags = bytes[index]; + index += 1; + } else { + return Ok(index); + } + + // Name + if index < len_bytes && self.has_name() { + self.name_size = bytes[index]; + index += 1; + if index + self.name_size as usize > len_bytes { + return Err(NeedleError::IndexOutOfRange(2)); + } + self.name = bytes[index..index + self.name_size as usize].to_vec(); + index += self.name_size as usize; + } + + // Mime + if index < len_bytes && self.has_mime() { + self.mime_size = bytes[index]; + index += 1; + if index + self.mime_size as usize > len_bytes { + return Err(NeedleError::IndexOutOfRange(3)); + } + self.mime = bytes[index..index + self.mime_size as usize].to_vec(); + index += self.mime_size as usize; + } + + // LastModified (5 bytes) + if index < len_bytes && self.has_last_modified_date() { + if index + LAST_MODIFIED_BYTES_LENGTH > len_bytes { + return Err(NeedleError::IndexOutOfRange(4)); + } + self.last_modified = bytes_to_u64_5(&bytes[index..index + LAST_MODIFIED_BYTES_LENGTH]); + index += LAST_MODIFIED_BYTES_LENGTH; + } + + // TTL (2 bytes) + if index < len_bytes && self.has_ttl() { + if index + TTL_BYTES_LENGTH > len_bytes { + return Err(NeedleError::IndexOutOfRange(5)); + } + self.ttl = Some(TTL::from_bytes(&bytes[index..index + TTL_BYTES_LENGTH])); + index += TTL_BYTES_LENGTH; + } + + // Pairs + if index < len_bytes && self.has_pairs() { + if index + 2 > len_bytes { + return Err(NeedleError::IndexOutOfRange(6)); + } + self.pairs_size = u16::from_be_bytes([bytes[index], bytes[index + 1]]); + index += 2; + if index + self.pairs_size as usize > len_bytes { + return Err(NeedleError::IndexOutOfRange(7)); + } + self.pairs = bytes[index..index + self.pairs_size as usize].to_vec(); + index += self.pairs_size as usize; + } + + Ok(index) + } + + // ---- Tail reading ---- + + /// Read the needle tail (checksum + optional timestamp + padding). + pub fn read_tail(&mut self, tail_bytes: &[u8], version: Version) -> Result<(), NeedleError> { + if tail_bytes.len() < NEEDLE_CHECKSUM_SIZE { + return Err(NeedleError::TailTooShort); + } + + let expected_checksum = CRC(u32::from_be_bytes([ + tail_bytes[0], + tail_bytes[1], + tail_bytes[2], + tail_bytes[3], + ])); + + if !self.data.is_empty() { + let data_checksum = CRC::new(&self.data); + // Go double-checks: n.Checksum != crc && uint32(n.Checksum) != crc.Value() + // The crc.Value() path is a deprecated legacy transform for backward compat + // with seaweed versions prior to commit 056c480eb. + if expected_checksum != data_checksum + && expected_checksum.0 != data_checksum.legacy_value() + { + return Err(NeedleError::CrcMismatch { + needle_id: self.id, + got: data_checksum.0, + want: expected_checksum.0, + }); + } + self.checksum = data_checksum; + } else { + self.checksum = expected_checksum; + } + + if version == VERSION_3 { + let ts_offset = NEEDLE_CHECKSUM_SIZE; + if tail_bytes.len() < ts_offset + TIMESTAMP_SIZE { + return Err(NeedleError::TailTooShort); + } + self.append_at_ns = u64::from_be_bytes([ + tail_bytes[ts_offset], + tail_bytes[ts_offset + 1], + tail_bytes[ts_offset + 2], + tail_bytes[ts_offset + 3], + tail_bytes[ts_offset + 4], + tail_bytes[ts_offset + 5], + tail_bytes[ts_offset + 6], + tail_bytes[ts_offset + 7], + ]); + } + + Ok(()) + } + + // ---- Full read from bytes ---- + + /// Read a complete needle from its raw bytes (header + body + tail). + pub fn read_bytes( + &mut self, + bytes: &[u8], + offset: i64, + expected_size: Size, + version: Version, + ) -> Result<(), NeedleError> { + self.read_header(bytes); + + if self.size != expected_size { + return Err(NeedleError::SizeMismatch { + offset, + id: self.id, + found: self.size, + expected: expected_size, + }); + } + + let body_start = NEEDLE_HEADER_SIZE; + let body_end = body_start + self.size.0 as usize; + + if version == VERSION_1 { + self.data = bytes[body_start..body_end].to_vec(); + } else { + self.read_body_v2(&bytes[body_start..body_end])?; + } + + self.read_tail(&bytes[body_end..], version)?; + Ok(()) + } + + // ---- Write (serialize) ---- + + /// Serialize the needle to bytes for writing to a .dat file (Version 2/3). + pub fn write_bytes(&mut self, version: Version) -> Vec { + let mut buf = Vec::with_capacity(256); + + // Compute sizes (matching Go writeNeedleCommon) + if self.name.len() >= 255 { + self.name_size = 255; + } else { + self.name_size = self.name.len() as u8; + } + self.data_size = self.data.len() as u32; + self.mime_size = self.mime.len() as u8; + + // Compute n.Size (body size, excluding header) + if self.data_size > 0 { + let mut s: i32 = 4 + self.data_size as i32 + 1; // DataSize + Data + Flags + if self.has_name() { + s += 1 + self.name_size as i32; + } + if self.has_mime() { + s += 1 + self.mime_size as i32; + } + if self.has_last_modified_date() { + s += LAST_MODIFIED_BYTES_LENGTH as i32; + } + if self.has_ttl() { + s += TTL_BYTES_LENGTH as i32; + } + if self.has_pairs() { + s += 2 + self.pairs_size as i32; + } + self.size = Size(s); + } else { + self.size = Size(0); + } + + // Header: Cookie(4) + NeedleId(8) + Size(4) = 16 bytes + let mut header = [0u8; NEEDLE_HEADER_SIZE]; + self.cookie.to_bytes(&mut header[0..COOKIE_SIZE]); + self.id + .to_bytes(&mut header[COOKIE_SIZE..COOKIE_SIZE + NEEDLE_ID_SIZE]); + self.size + .to_bytes(&mut header[COOKIE_SIZE + NEEDLE_ID_SIZE..NEEDLE_HEADER_SIZE]); + buf.extend_from_slice(&header); + + // Body + if self.data_size > 0 { + buf.extend_from_slice(&self.data_size.to_be_bytes()); + buf.extend_from_slice(&self.data); + buf.push(self.flags); + if self.has_name() { + buf.push(self.name_size); + buf.extend_from_slice(&self.name[..self.name_size as usize]); + } + if self.has_mime() { + buf.push(self.mime_size); + buf.extend_from_slice(&self.mime); + } + if self.has_last_modified_date() { + // Write 5 bytes of last_modified (lower 5 bytes of u64 big-endian) + let lm_bytes = self.last_modified.to_be_bytes(); + buf.extend_from_slice(&lm_bytes[8 - LAST_MODIFIED_BYTES_LENGTH..8]); + } + if self.has_ttl() { + if let Some(ref ttl) = self.ttl { + let mut ttl_buf = [0u8; 2]; + ttl.to_bytes(&mut ttl_buf); + buf.extend_from_slice(&ttl_buf); + } else { + buf.extend_from_slice(&[0u8; 2]); + } + } + if self.has_pairs() { + buf.extend_from_slice(&self.pairs_size.to_be_bytes()); + buf.extend_from_slice(&self.pairs); + } + } + + // Compute checksum + self.checksum = CRC::new(&self.data); + + // Tail: Checksum + [V3: AppendAtNs] + Padding + buf.extend_from_slice(&self.checksum.0.to_be_bytes()); + if version == VERSION_3 { + buf.extend_from_slice(&self.append_at_ns.to_be_bytes()); + } + + // Padding to 8-byte alignment + let padding = padding_length(self.size, version).0 as usize; + buf.extend(std::iter::repeat(0u8).take(padding)); + + buf + } + + /// Total disk size of this needle including header, body, checksum, timestamp, and padding. + pub fn disk_size(&self, version: Version) -> i64 { + get_actual_size(self.size, version) + } + + /// Compute ETag string from checksum (matching Go). + pub fn etag(&self) -> String { + etag_from_checksum(self.checksum.0) + } +} + +// ============================================================================ +// Helper functions (matching Go) +// ============================================================================ + +/// Compute padding to align needle to NEEDLE_PADDING_SIZE (8 bytes). +pub fn padding_length(needle_size: Size, version: Version) -> Size { + if version == VERSION_3 { + Size( + NEEDLE_PADDING_SIZE as i32 + - ((NEEDLE_HEADER_SIZE as i32 + + needle_size.0 + + NEEDLE_CHECKSUM_SIZE as i32 + + TIMESTAMP_SIZE as i32) + % NEEDLE_PADDING_SIZE as i32), + ) + } else { + Size( + NEEDLE_PADDING_SIZE as i32 + - ((NEEDLE_HEADER_SIZE as i32 + needle_size.0 + NEEDLE_CHECKSUM_SIZE as i32) + % NEEDLE_PADDING_SIZE as i32), + ) + } +} + +/// Body length = Size + Checksum + [Timestamp] + Padding. +pub fn needle_body_length(needle_size: Size, version: Version) -> i64 { + if version == VERSION_3 { + needle_size.0 as i64 + + NEEDLE_CHECKSUM_SIZE as i64 + + TIMESTAMP_SIZE as i64 + + padding_length(needle_size, version).0 as i64 + } else { + needle_size.0 as i64 + + NEEDLE_CHECKSUM_SIZE as i64 + + padding_length(needle_size, version).0 as i64 + } +} + +/// Total actual size on disk: Header + Body. +pub fn get_actual_size(size: Size, version: Version) -> i64 { + NEEDLE_HEADER_SIZE as i64 + needle_body_length(size, version) +} + +/// Read 5 bytes as a u64 (big-endian, zero-padded high bytes). +fn bytes_to_u64_5(bytes: &[u8]) -> u64 { + assert!(bytes.len() >= 5); + // The 5 bytes are the LOWER 5 bytes of a big-endian u64. + // In Go: util.BytesToUint64(bytes[index : index+5]) reads into a uint64 + // Go's BytesToUint64 copies into the LAST 5 bytes of an 8-byte array (big-endian). + let mut buf = [0u8; 8]; + buf[3..8].copy_from_slice(&bytes[..5]); + u64::from_be_bytes(buf) +} + +/// ETag formatted as Go: hex of big-endian u32 bytes. +pub fn etag_from_checksum(checksum: u32) -> String { + let bits = checksum.to_be_bytes(); + format!( + "{:02x}{:02x}{:02x}{:02x}", + bits[0], bits[1], bits[2], bits[3] + ) +} + +// ============================================================================ +// FileId +// ============================================================================ + +/// FileId = VolumeId + NeedleId + Cookie. +/// String format: "," +#[derive(Debug, Clone, PartialEq, Eq)] +pub struct FileId { + pub volume_id: VolumeId, + pub key: NeedleId, + pub cookie: Cookie, +} + +impl FileId { + pub fn new(volume_id: VolumeId, key: NeedleId, cookie: Cookie) -> Self { + FileId { + volume_id, + key, + cookie, + } + } + + /// Parse "volume_id,needle_id_cookie" or "volume_id/needle_id_cookie". + pub fn parse(s: &str) -> Result { + let (vid_str, rest) = if let Some(pos) = s.find(',') { + (&s[..pos], &s[pos + 1..]) + } else if let Some(pos) = s.find('/') { + (&s[..pos], &s[pos + 1..]) + } else { + return Err(format!("invalid file id: {}", s)); + }; + + let volume_id = + VolumeId::parse(vid_str).map_err(|e| format!("invalid volume id: {}", e))?; + let (key, cookie) = parse_needle_id_cookie(rest)?; + Ok(FileId { + volume_id, + key, + cookie, + }) + } + + /// Format the needle_id + cookie part as a hex string (stripping leading zeros). + pub fn needle_id_cookie_string(&self) -> String { + format_needle_id_cookie(self.key, self.cookie) + } +} + +impl std::fmt::Display for FileId { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + write!(f, "{},{}", self.volume_id, self.needle_id_cookie_string()) + } +} + +/// Format NeedleId + Cookie as hex, stripping leading zero bytes from NeedleId only. +/// Matches Go: strips leading zero bytes up to NeedleIdSize (8), so cookie is always present. +fn format_needle_id_cookie(key: NeedleId, cookie: Cookie) -> String { + // Encode 12 bytes: 8 for NeedleId + 4 for Cookie + let mut bytes = [0u8; 12]; + key.to_bytes(&mut bytes[0..8]); + cookie.to_bytes(&mut bytes[8..12]); + + // Strip leading zero bytes, but only within NeedleId portion (first 8 bytes) + let mut nonzero_index = 0; + while nonzero_index < NEEDLE_ID_SIZE && bytes[nonzero_index] == 0 { + nonzero_index += 1; + } + hex::encode(&bytes[nonzero_index..]) +} + +/// Parse "needle_id_cookie_hex" or "needle_id_cookie_hex_delta" into (NeedleId, Cookie). +/// Matches Go's ParsePath + ParseNeedleIdCookie: supports an optional `_delta` suffix +/// where delta is a decimal number added to the NeedleId (used for sub-file addressing). +/// Rejects strings that are too short or too long. +pub fn parse_needle_id_cookie(s: &str) -> Result<(NeedleId, Cookie), String> { + // Go ParsePath: check for "_" suffix containing a decimal delta + let (hex_part, delta) = if let Some(underscore_pos) = s.rfind('_') { + if underscore_pos > 0 { + let delta_str = &s[underscore_pos + 1..]; + let d: u64 = delta_str + .parse() + .map_err(|e| format!("Parse delta error: {}", e))?; + (&s[..underscore_pos], Some(d)) + } else { + (s, None) + } + } else { + (s, None) + }; + + // Go: len(key_hash_string) <= CookieSize*2 => error (must be > 8 hex chars) + if hex_part.len() <= COOKIE_SIZE * 2 { + return Err("KeyHash is too short.".to_string()); + } + // Go: len(key_hash_string) > (NeedleIdSize+CookieSize)*2 => error (must be <= 24 hex chars) + if hex_part.len() > (NEEDLE_ID_SIZE + COOKIE_SIZE) * 2 { + return Err("KeyHash is too long.".to_string()); + } + + // Split: last CookieSize*2 hex chars are cookie, rest is needle id + let split = hex_part.len() - COOKIE_SIZE * 2; + let needle_id_hex = &hex_part[..split]; + let cookie_hex = &hex_part[split..]; + + let needle_id_bytes = hex::decode(needle_id_hex).map_err(|e| format!("Parse needleId error: {}", e))?; + let cookie_bytes = hex::decode(cookie_hex).map_err(|e| format!("Parse cookie error: {}", e))?; + + // Pad needle id to 8 bytes + let mut nid_buf = [0u8; 8]; + if needle_id_bytes.len() > 8 { + return Err(format!("KeyHash is too long.")); + } + let start = 8 - needle_id_bytes.len(); + nid_buf[start..].copy_from_slice(&needle_id_bytes); + + let mut key = NeedleId::from_bytes(&nid_buf[0..8]); + let cookie = Cookie::from_bytes(&cookie_bytes[0..4]); + + // Apply delta if present (Go: n.Id += Uint64ToNeedleId(d)) + if let Some(d) = delta { + key = NeedleId(key.0.wrapping_add(d)); + } + + Ok((key, cookie)) +} + +// ============================================================================ +// Errors +// ============================================================================ + +#[derive(Debug, thiserror::Error)] +pub enum NeedleError { + #[error("size mismatch at offset {offset}: found id={id} size={found:?}, expected size={expected:?}")] + SizeMismatch { + offset: i64, + id: NeedleId, + found: Size, + expected: Size, + }, + + #[error("CRC mismatch for needle {needle_id}: got {got:08x}, want {want:08x}")] + CrcMismatch { + needle_id: NeedleId, + got: u32, + want: u32, + }, + + #[error("index out of range ({0})")] + IndexOutOfRange(u32), + + #[error("needle tail too short")] + TailTooShort, + + #[error("unsupported version: {0}")] + UnsupportedVersion(u8), + + #[error("IO error: {0}")] + Io(#[from] std::io::Error), +} + +// ============================================================================ +// Tests +// ============================================================================ + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_parse_header() { + let mut buf = [0u8; NEEDLE_HEADER_SIZE]; + let cookie = Cookie(0xdeadbeef); + let id = NeedleId(0x123456789abcdef0); + let size = Size(1024); + cookie.to_bytes(&mut buf[0..4]); + id.to_bytes(&mut buf[4..12]); + size.to_bytes(&mut buf[12..16]); + + let (c, i, s) = Needle::parse_header(&buf); + assert_eq!(c, cookie); + assert_eq!(i, id); + assert_eq!(s, size); + } + + #[test] + fn test_needle_write_read_round_trip_v3() { + let mut n = Needle::default(); + n.cookie = Cookie(42); + n.id = NeedleId(100); + n.data = b"hello world".to_vec(); + n.flags = 0; + n.set_has_name(); + n.name = b"test.txt".to_vec(); + n.name_size = 8; + n.set_has_mime(); + n.mime = b"text/plain".to_vec(); + n.mime_size = 10; + n.set_has_last_modified_date(); + n.last_modified = 1234567890; + n.set_has_ttl(); + n.ttl = Some(TTL { + count: 5, + unit: super::super::ttl::TTL_UNIT_DAY, + }); + n.append_at_ns = 999999999; + + let bytes = n.write_bytes(VERSION_3); + + // Verify total size matches expected + let expected_size = get_actual_size(n.size, VERSION_3); + assert_eq!(bytes.len() as i64, expected_size); + + // Read it back + let mut n2 = Needle::default(); + n2.read_bytes(&bytes, 0, n.size, VERSION_3).unwrap(); + + assert_eq!(n2.cookie, n.cookie); + assert_eq!(n2.id, n.id); + assert_eq!(n2.data, n.data); + assert_eq!(n2.name, n.name); + assert_eq!(n2.mime, n.mime); + assert_eq!(n2.last_modified, n.last_modified); + assert_eq!(n2.ttl, n.ttl); + assert_eq!(n2.checksum, n.checksum); + assert_eq!(n2.append_at_ns, n.append_at_ns); + } + + #[test] + fn test_needle_write_read_round_trip_v2() { + let mut n = Needle::default(); + n.cookie = Cookie(77); + n.id = NeedleId(200); + n.data = b"data v2".to_vec(); + n.flags = 0; + + let bytes = n.write_bytes(VERSION_2); + let expected_size = get_actual_size(n.size, VERSION_2); + assert_eq!(bytes.len() as i64, expected_size); + + let mut n2 = Needle::default(); + n2.read_bytes(&bytes, 0, n.size, VERSION_2).unwrap(); + + assert_eq!(n2.data, n.data); + assert_eq!(n2.checksum, n.checksum); + } + + #[test] + fn test_read_bytes_meta_only_handles_tombstone_v3() { + let mut tombstone = Needle::default(); + tombstone.cookie = Cookie(0x1234abcd); + tombstone.id = NeedleId(300); + tombstone.append_at_ns = 999_999; + + let bytes = tombstone.write_bytes(VERSION_3); + + let mut meta = Needle::default(); + meta.read_bytes_meta_only(&bytes, 0, Size(0), VERSION_3) + .unwrap(); + + assert_eq!(meta.cookie, tombstone.cookie); + assert_eq!(meta.id, tombstone.id); + assert_eq!(meta.size, Size(0)); + assert_eq!(meta.data_size, 0); + assert_eq!(meta.append_at_ns, tombstone.append_at_ns); + assert_eq!(meta.checksum, tombstone.checksum); + } + + #[test] + fn test_padding_alignment() { + // All actual sizes should be multiples of 8 + for size_val in 0..50 { + let s = Size(size_val); + let actual_v2 = get_actual_size(s, VERSION_2); + let actual_v3 = get_actual_size(s, VERSION_3); + assert_eq!(actual_v2 % 8, 0, "V2 size {} not aligned", size_val); + assert_eq!(actual_v3 % 8, 0, "V3 size {} not aligned", size_val); + } + } + + #[test] + fn test_file_id_parse() { + let fid = FileId::parse("3,01637037d6").unwrap(); + assert_eq!(fid.volume_id, VolumeId(3)); + // The hex "01637037d6" is 5 bytes = 0x0163..., padded to 12 bytes + assert!(!fid.key.is_empty() || !fid.cookie.0 == 0); + } + + #[test] + fn test_file_id_round_trip() { + let fid = FileId::new(VolumeId(5), NeedleId(0x123456), Cookie(0xabcd)); + let s = fid.to_string(); + let fid2 = FileId::parse(&s).unwrap(); + assert_eq!(fid, fid2); + } + + #[test] + fn test_needle_id_cookie_format() { + let s = format_needle_id_cookie(NeedleId(1), Cookie(0x12345678)); + let (key, cookie) = parse_needle_id_cookie(&s).unwrap(); + assert_eq!(key, NeedleId(1)); + assert_eq!(cookie, Cookie(0x12345678)); + } +} diff --git a/seaweed-volume/src/storage/needle/ttl.rs b/seaweed-volume/src/storage/needle/ttl.rs new file mode 100644 index 000000000..f55cb082f --- /dev/null +++ b/seaweed-volume/src/storage/needle/ttl.rs @@ -0,0 +1,302 @@ +//! Time-to-live encoding for needles. +//! +//! TTL is stored as 2 bytes: Count(1) + Unit(1). +//! Supported units: minute(m), hour(h), day(d), week(w), month(M), year(y). + +use std::fmt; + +/// TTL unit constants (matching Go). +pub const TTL_UNIT_EMPTY: u8 = 0; +pub const TTL_UNIT_MINUTE: u8 = 1; +pub const TTL_UNIT_HOUR: u8 = 2; +pub const TTL_UNIT_DAY: u8 = 3; +pub const TTL_UNIT_WEEK: u8 = 4; +pub const TTL_UNIT_MONTH: u8 = 5; +pub const TTL_UNIT_YEAR: u8 = 6; + +pub const TTL_BYTES_LENGTH: usize = 2; + +#[derive(Debug, Clone, Copy, PartialEq, Eq, Default)] +pub struct TTL { + pub count: u8, + pub unit: u8, +} + +impl TTL { + pub const EMPTY: TTL = TTL { count: 0, unit: 0 }; + + pub fn is_empty(&self) -> bool { + self.count == 0 && self.unit == 0 + } + + /// Load from 2 bytes. + pub fn from_bytes(input: &[u8]) -> Self { + if input.len() < 2 { + return TTL::EMPTY; + } + TTL { + count: input[0], + unit: input[1], + } + } + + /// Serialize to 2 bytes. + pub fn to_bytes(&self, output: &mut [u8]) { + assert!(output.len() >= 2); + output[0] = self.count; + output[1] = self.unit; + } + + /// Encode as u32: (count << 8) | unit. + pub fn to_u32(&self) -> u32 { + if self.count == 0 { + return 0; + } + ((self.count as u32) << 8) + (self.unit as u32) + } + + /// Decode from u32. + pub fn from_u32(v: u32) -> Self { + if v == 0 { + return TTL::EMPTY; + } + TTL { + count: (v >> 8) as u8, + unit: (v & 0xFF) as u8, + } + } + + /// Convert to total seconds. + pub fn to_seconds(&self) -> u64 { + unit_to_seconds(self.count as u64, self.unit) + } + + /// Parse from string like "3m", "4h", "5d", "6w", "7M", "8y". + /// If the string is all digits (no unit suffix), defaults to minutes. + /// Matches Go's ReadTTL which calls fitTtlCount to normalize: + /// e.g. "120m" -> 2h, "7d" -> 1w, "24h" -> 1d. + pub fn read(s: &str) -> Result { + let s = s.trim(); + if s.is_empty() { + return Ok(TTL::EMPTY); + } + let last_byte = s.as_bytes()[s.len() - 1]; + let (num_str, unit_byte) = if last_byte >= b'0' && last_byte <= b'9' { + // All digits — default to minutes (matching Go) + (s, b'm') + } else { + (&s[..s.len() - 1], last_byte) + }; + let count: u32 = num_str + .parse() + .map_err(|e| format!("invalid TTL count: {}", e))?; + let unit = match unit_byte { + b'm' => TTL_UNIT_MINUTE, + b'h' => TTL_UNIT_HOUR, + b'd' => TTL_UNIT_DAY, + b'w' => TTL_UNIT_WEEK, + b'M' => TTL_UNIT_MONTH, + b'y' => TTL_UNIT_YEAR, + _ => return Err(format!("unknown TTL unit: {}", unit_byte as char)), + }; + // Match Go's ReadTTL: normalize via fitTtlCount + Ok(fit_ttl_count(count, unit)) + } + + /// Minutes representation. + pub fn minutes(&self) -> u32 { + (self.to_seconds() / 60) as u32 + } +} + +fn unit_to_seconds(count: u64, unit: u8) -> u64 { + match unit { + TTL_UNIT_EMPTY => 0, + TTL_UNIT_MINUTE => count * 60, + TTL_UNIT_HOUR => count * 60 * 60, + TTL_UNIT_DAY => count * 60 * 60 * 24, + TTL_UNIT_WEEK => count * 60 * 60 * 24 * 7, + TTL_UNIT_MONTH => count * 60 * 60 * 24 * 30, + TTL_UNIT_YEAR => count * 60 * 60 * 24 * 365, + _ => 0, + } +} + +/// Fit a count+unit into a TTL that fits in a single byte count. +/// Converts to seconds first, then finds the coarsest unit that fits. +/// Matches Go's fitTtlCount called from ReadTTL. +fn fit_ttl_count(count: u32, unit: u8) -> TTL { + if count == 0 || unit == TTL_UNIT_EMPTY { + return TTL::EMPTY; + } + + // Always convert to seconds and normalize (matches Go). + let seconds = unit_to_seconds(count as u64, unit); + if seconds == 0 { + return TTL::EMPTY; + } + + const YEAR_SECS: u64 = 3600 * 24 * 365; + const MONTH_SECS: u64 = 3600 * 24 * 30; + const WEEK_SECS: u64 = 3600 * 24 * 7; + const DAY_SECS: u64 = 3600 * 24; + const HOUR_SECS: u64 = 3600; + const MINUTE_SECS: u64 = 60; + + // First pass: try exact fits from largest to smallest + if seconds % YEAR_SECS == 0 && seconds / YEAR_SECS < 256 { + return TTL { count: (seconds / YEAR_SECS) as u8, unit: TTL_UNIT_YEAR }; + } + if seconds % MONTH_SECS == 0 && seconds / MONTH_SECS < 256 { + return TTL { count: (seconds / MONTH_SECS) as u8, unit: TTL_UNIT_MONTH }; + } + if seconds % WEEK_SECS == 0 && seconds / WEEK_SECS < 256 { + return TTL { count: (seconds / WEEK_SECS) as u8, unit: TTL_UNIT_WEEK }; + } + if seconds % DAY_SECS == 0 && seconds / DAY_SECS < 256 { + return TTL { count: (seconds / DAY_SECS) as u8, unit: TTL_UNIT_DAY }; + } + if seconds % HOUR_SECS == 0 && seconds / HOUR_SECS < 256 { + return TTL { count: (seconds / HOUR_SECS) as u8, unit: TTL_UNIT_HOUR }; + } + // Minutes: truncating division + if seconds / MINUTE_SECS < 256 { + return TTL { count: (seconds / MINUTE_SECS) as u8, unit: TTL_UNIT_MINUTE }; + } + // Second pass: truncating division from smallest to largest + if seconds / HOUR_SECS < 256 { + return TTL { count: (seconds / HOUR_SECS) as u8, unit: TTL_UNIT_HOUR }; + } + if seconds / DAY_SECS < 256 { + return TTL { count: (seconds / DAY_SECS) as u8, unit: TTL_UNIT_DAY }; + } + if seconds / WEEK_SECS < 256 { + return TTL { count: (seconds / WEEK_SECS) as u8, unit: TTL_UNIT_WEEK }; + } + if seconds / MONTH_SECS < 256 { + return TTL { count: (seconds / MONTH_SECS) as u8, unit: TTL_UNIT_MONTH }; + } + if seconds / YEAR_SECS < 256 { + return TTL { count: (seconds / YEAR_SECS) as u8, unit: TTL_UNIT_YEAR }; + } + TTL::EMPTY +} + +fn unit_to_char(unit: u8) -> char { + match unit { + TTL_UNIT_MINUTE => 'm', + TTL_UNIT_HOUR => 'h', + TTL_UNIT_DAY => 'd', + TTL_UNIT_WEEK => 'w', + TTL_UNIT_MONTH => 'M', + TTL_UNIT_YEAR => 'y', + _ => ' ', + } +} + +impl fmt::Display for TTL { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + if self.count == 0 || self.unit == TTL_UNIT_EMPTY { + return write!(f, ""); + } + write!(f, "{}{}", self.count, unit_to_char(self.unit)) + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_ttl_parse() { + let ttl = TTL::read("3m").unwrap(); + assert_eq!( + ttl, + TTL { + count: 3, + unit: TTL_UNIT_MINUTE + } + ); + assert_eq!(ttl.to_seconds(), 180); + } + + #[test] + fn test_ttl_parse_hours() { + // 24h normalizes to 1d via fitTtlCount + let ttl = TTL::read("24h").unwrap(); + assert_eq!(ttl.to_seconds(), 86400); + assert_eq!(ttl, TTL { count: 1, unit: TTL_UNIT_DAY }); + } + + #[test] + fn test_ttl_display() { + let ttl = TTL { + count: 5, + unit: TTL_UNIT_DAY, + }; + assert_eq!(ttl.to_string(), "5d"); + } + + #[test] + fn test_ttl_bytes_round_trip() { + let ttl = TTL { + count: 10, + unit: TTL_UNIT_WEEK, + }; + let mut buf = [0u8; 2]; + ttl.to_bytes(&mut buf); + let ttl2 = TTL::from_bytes(&buf); + assert_eq!(ttl, ttl2); + } + + #[test] + fn test_ttl_u32_round_trip() { + let ttl = TTL { + count: 42, + unit: TTL_UNIT_HOUR, + }; + let v = ttl.to_u32(); + let ttl2 = TTL::from_u32(v); + assert_eq!(ttl, ttl2); + } + + #[test] + fn test_ttl_empty() { + assert!(TTL::EMPTY.is_empty()); + assert_eq!(TTL::EMPTY.to_seconds(), 0); + assert_eq!(TTL::EMPTY.to_u32(), 0); + } + + #[test] + fn test_ttl_overflow_normalizes() { + // Go's ReadTTL calls fitTtlCount: 300m = 18000s = 5h (exact fit) + let ttl = TTL::read("300m").unwrap(); + assert_eq!(ttl, TTL { count: 5, unit: TTL_UNIT_HOUR }); + + // 256h = 921600s. Doesn't fit in hours (256 >= 256), doesn't fit exact in days. + // Second pass: 921600/86400 = 10 (truncated) < 256 -> 10d + let ttl = TTL::read("256h").unwrap(); + assert_eq!(ttl, TTL { count: 10, unit: TTL_UNIT_DAY }); + } + + #[test] + fn test_ttl_normalizes_unit() { + // Go's ReadTTL calls fitTtlCount which normalizes to coarsest unit. + // 120m -> 2h, 7d -> 1w, 24h -> 1d. + let ttl = TTL::read("120m").unwrap(); + assert_eq!(ttl, TTL { count: 2, unit: TTL_UNIT_HOUR }); + + let ttl = TTL::read("7d").unwrap(); + assert_eq!(ttl, TTL { count: 1, unit: TTL_UNIT_WEEK }); + + let ttl = TTL::read("24h").unwrap(); + assert_eq!(ttl, TTL { count: 1, unit: TTL_UNIT_DAY }); + + // Values that don't simplify stay as-is + let ttl = TTL::read("5d").unwrap(); + assert_eq!(ttl, TTL { count: 5, unit: TTL_UNIT_DAY }); + + let ttl = TTL::read("3m").unwrap(); + assert_eq!(ttl, TTL { count: 3, unit: TTL_UNIT_MINUTE }); + } +} diff --git a/seaweed-volume/src/storage/needle_map.rs b/seaweed-volume/src/storage/needle_map.rs new file mode 100644 index 000000000..248604e1d --- /dev/null +++ b/seaweed-volume/src/storage/needle_map.rs @@ -0,0 +1,1438 @@ +//! NeedleMapper: index mapping NeedleId -> (Offset, Size). +//! +//! Two implementations: +//! - `CompactNeedleMap`: in-memory segmented sorted arrays (~10 bytes/entry) +//! - `RedbNeedleMap`: disk-backed via redb (low RAM, slightly slower) +//! +//! The `NeedleMap` enum wraps both and provides a uniform interface. +//! Loaded from .idx file on volume mount. Supports Get, Put, Delete with +//! metrics tracking (file count, byte count, deleted count, deleted bytes). + +use std::collections::HashMap; +use std::io::{self, Read, Seek, Write}; +use std::path::Path; +use std::sync::atomic::{AtomicI64, AtomicU64, Ordering}; + +mod compact_map; +use compact_map::CompactMap; + +use redb::{Database, Durability, ReadableDatabase, ReadableTable, TableDefinition}; + +use crate::storage::idx; +use crate::storage::types::*; + +// ============================================================================ +// NeedleValue +// ============================================================================ + +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +pub struct NeedleValue { + pub offset: Offset, + pub size: Size, +} + +/// Packed size of a NeedleValue in redb storage: OFFSET_SIZE + SIZE_SIZE. +const PACKED_NEEDLE_VALUE_SIZE: usize = OFFSET_SIZE + SIZE_SIZE; + +/// Pack an (Offset, Size) pair into bytes for redb storage. +/// Layout: [offset OFFSET_SIZE bytes] [size 4 bytes big-endian] +fn pack_needle_value(nv: &NeedleValue) -> [u8; PACKED_NEEDLE_VALUE_SIZE] { + let mut buf = [0u8; PACKED_NEEDLE_VALUE_SIZE]; + nv.offset.to_bytes(&mut buf[..OFFSET_SIZE]); + nv.size.to_bytes(&mut buf[OFFSET_SIZE..]); + buf +} + +/// Unpack bytes from redb storage into (Offset, Size). +fn unpack_needle_value(bytes: &[u8; PACKED_NEEDLE_VALUE_SIZE]) -> NeedleValue { + NeedleValue { + offset: Offset::from_bytes(&bytes[..OFFSET_SIZE]), + size: Size::from_bytes(&bytes[OFFSET_SIZE..]), + } +} + +// ============================================================================ +// NeedleMapMetric +// ============================================================================ + +/// Metrics tracking for needle map operations. +#[derive(Debug, Default)] +pub struct NeedleMapMetric { + pub file_count: AtomicI64, + pub file_byte_count: AtomicU64, + pub deletion_count: AtomicI64, + pub deletion_byte_count: AtomicU64, + pub max_file_key: AtomicU64, +} + +impl NeedleMapMetric { + /// Update metrics based on a Put operation (additive-only, matching Go's logPut). + fn on_put(&self, key: NeedleId, old: Option<&NeedleValue>, new_size: Size) { + self.maybe_set_max_file_key(key); + // Go: always LogFileCounter(newSize) which does FileCounter++ and FileByteCounter += newSize + self.file_count.fetch_add(1, Ordering::Relaxed); + self.file_byte_count + .fetch_add(new_size.0 as u64, Ordering::Relaxed); + // Go: if oldSize > 0 && oldSize.IsValid() { LogDeletionCounter(oldSize) } + if let Some(old_val) = old { + if old_val.size.0 > 0 && old_val.size.is_valid() { + self.deletion_count.fetch_add(1, Ordering::Relaxed); + self.deletion_byte_count + .fetch_add(old_val.size.0 as u64, Ordering::Relaxed); + } + } + } + + /// Update metrics based on a Delete operation (additive-only, matching Go's logDelete). + fn on_delete(&self, old: &NeedleValue) { + if old.size.0 > 0 { + self.deletion_count.fetch_add(1, Ordering::Relaxed); + self.deletion_byte_count + .fetch_add(old.size.0 as u64, Ordering::Relaxed); + } + } + + fn maybe_set_max_file_key(&self, key: NeedleId) { + let key_val: u64 = key.into(); + loop { + let current = self.max_file_key.load(Ordering::Relaxed); + if key_val <= current { + break; + } + if self + .max_file_key + .compare_exchange(current, key_val, Ordering::Relaxed, Ordering::Relaxed) + .is_ok() + { + break; + } + } + } +} + +// ============================================================================ +// NeedleMapKind +// ============================================================================ + +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +pub enum NeedleMapKind { + InMemory, + LevelDb, + LevelDbMedium, + LevelDbLarge, +} + +// ============================================================================ +// IdxFileWriter trait +// ============================================================================ + +/// Trait for appending to an index file. +pub trait IdxFileWriter: Write + Send + Sync { + fn sync_all(&self) -> io::Result<()>; +} + +impl IdxFileWriter for std::fs::File { + fn sync_all(&self) -> io::Result<()> { + std::fs::File::sync_all(self) + } +} + +// ============================================================================ +// CompactNeedleMap (in-memory) +// ============================================================================ + +/// In-memory needle map backed by a CompactMap (segmented sorted arrays). +/// Uses ~10 bytes per entry instead of ~40-48 bytes with HashMap. +/// The .idx file is kept open for append-only writes. +pub struct CompactNeedleMap { + map: CompactMap, + metric: NeedleMapMetric, + idx_file: Option>, + idx_file_offset: u64, +} + +impl CompactNeedleMap { + /// Create a new empty in-memory map. + pub fn new() -> Self { + CompactNeedleMap { + map: CompactMap::new(), + metric: NeedleMapMetric::default(), + idx_file: None, + idx_file_offset: 0, + } + } + + /// Load from an .idx file, building the in-memory map. + pub fn load_from_idx(reader: &mut R) -> io::Result { + let mut nm = CompactNeedleMap::new(); + idx::walk_index_file(reader, 0, |key, offset, size| { + if offset.is_zero() || size.is_deleted() { + nm.delete_from_map(key); + } else { + nm.set_internal(key, NeedleValue { offset, size }); + } + Ok(()) + })?; + Ok(nm) + } + + /// Set the index file for append-only writes. + pub fn set_idx_file(&mut self, file: Box, offset: u64) { + self.idx_file = Some(file); + self.idx_file_offset = offset; + } + + // ---- Map operations ---- + + /// Insert or update an entry. Appends to .idx file if present. + pub fn put(&mut self, key: NeedleId, offset: Offset, size: Size) -> io::Result<()> { + // Persist to idx file BEFORE mutating in-memory state for crash consistency + if let Some(ref mut idx_file) = self.idx_file { + idx::write_index_entry(idx_file, key, offset, size)?; + self.idx_file_offset += NEEDLE_MAP_ENTRY_SIZE as u64; + } + + let old = self.map.get(key); + self.metric.on_put(key, old.as_ref(), size); + self.map.set(key, offset, size); + Ok(()) + } + + /// Look up a needle. + pub fn get(&self, key: NeedleId) -> Option { + self.map.get(key) + } + + /// Mark a needle as deleted. Appends tombstone to .idx file. + /// Matches Go's NeedleMap.Delete: ALWAYS writes tombstone to idx and + /// increments deletion counter, even if needle doesn't exist or is + /// already deleted (important for replication). + pub fn delete(&mut self, key: NeedleId, offset: Offset) -> io::Result> { + // Go unconditionally calls nm.m.Delete(), nm.logDelete(), nm.appendToIndexFile() + let deleted_bytes = self.map.delete(key).unwrap_or(Size(0)); + + // Match Go's logDelete -> LogDeletionCounter: only increment when oldSize > 0. + // Go does NOT decrement FileCounter/FileByteCounter in Delete; + // live counts are computed as FileCounter - DeletionCounter. + if deleted_bytes.0 > 0 { + self.metric.deletion_count.fetch_add(1, Ordering::Relaxed); + self.metric + .deletion_byte_count + .fetch_add(deleted_bytes.0 as u64, Ordering::Relaxed); + } + + // Always write tombstone to idx file (matching Go) + if let Some(ref mut idx_file) = self.idx_file { + idx::write_index_entry(idx_file, key, offset, TOMBSTONE_FILE_SIZE)?; + self.idx_file_offset += NEEDLE_MAP_ENTRY_SIZE as u64; + } + + if deleted_bytes.0 > 0 { + Ok(Some(deleted_bytes)) + } else { + Ok(None) + } + } + + // ---- Internal helpers ---- + + /// Insert into map during loading (no idx file write). + fn set_internal(&mut self, key: NeedleId, nv: NeedleValue) { + let old = self.map.get(key); + self.metric.on_put(key, old.as_ref(), nv.size); + self.map.set(key, nv.offset, nv.size); + } + + /// Remove from map during loading (handle deletions in idx walk). + /// Matches Go's doLoading else branch: always increments DeletionCounter, + /// and adds old size bytes to DeletionByteCounter. + fn delete_from_map(&mut self, key: NeedleId) { + self.metric.maybe_set_max_file_key(key); + // Go's CompactMap.Delete returns old size (0 if not found or already deleted). + // Go's doLoading always does DeletionCounter++ and DeletionByteCounter += uint64(oldSize). + let old_size = self.map.get(key).map(|nv| nv.size).unwrap_or(Size(0)); + // Go unconditionally increments DeletionCounter + self.metric.deletion_count.fetch_add(1, Ordering::Relaxed); + // Go adds uint64(oldSize) which for valid sizes adds the value, for 0/negative adds 0 + if old_size.0 > 0 { + self.metric + .deletion_byte_count + .fetch_add(old_size.0 as u64, Ordering::Relaxed); + } + self.map.remove(key); + } + + // ---- Metrics accessors ---- + + pub fn content_size(&self) -> u64 { + self.metric.file_byte_count.load(Ordering::Relaxed) + } + + pub fn deleted_size(&self) -> u64 { + self.metric.deletion_byte_count.load(Ordering::Relaxed) + } + + pub fn file_count(&self) -> i64 { + self.metric.file_count.load(Ordering::Relaxed) + } + + pub fn deleted_count(&self) -> i64 { + self.metric.deletion_count.load(Ordering::Relaxed) + } + + pub fn max_file_key(&self) -> NeedleId { + NeedleId(self.metric.max_file_key.load(Ordering::Relaxed)) + } + + pub fn index_file_size(&self) -> u64 { + self.idx_file_offset + } + + /// Sync index file to disk. + pub fn sync(&self) -> io::Result<()> { + if let Some(ref idx_file) = self.idx_file { + idx_file.sync_all()?; + } + Ok(()) + } + + /// Close index file. + pub fn close(&mut self) { + let _ = self.sync(); + self.idx_file = None; + } + + /// Save the in-memory map to an index file, sorted by needle ID ascending. + pub fn save_to_idx(&self, path: &str) -> io::Result<()> { + let mut file = std::fs::OpenOptions::new() + .write(true) + .create(true) + .truncate(true) + .open(path)?; + + self.map.ascending_visit(|id, nv| { + if nv.size.is_valid() { + idx::write_index_entry(&mut file, id, nv.offset, nv.size) + } else { + Ok(()) + } + })?; + file.sync_all()?; + Ok(()) + } + + /// Visit all entries in ascending order by needle ID. + pub fn ascending_visit(&self, f: F) -> Result<(), String> + where + F: FnMut(NeedleId, &NeedleValue) -> Result<(), String>, + { + self.map.ascending_visit(f) + } +} + +// ============================================================================ +// RedbNeedleMap (disk-backed via redb) +// ============================================================================ + +/// redb table: NeedleId (u64) -> packed [offset(4) + size(4)] +const NEEDLE_TABLE: TableDefinition = TableDefinition::new("needles"); + +/// Metadata table: stores the .idx file size that was used to build this redb. +/// Key "idx_size" -> u64 byte offset. Used to detect whether the .rdb can be +/// reused on restart or needs a full/incremental rebuild. +const META_TABLE: TableDefinition<&str, u64> = TableDefinition::new("meta"); +const META_IDX_SIZE: &str = "idx_size"; + +/// Disk-backed needle map using redb. +/// Low memory usage — data lives on disk with redb's page cache. +pub struct RedbNeedleMap { + db: Database, + metric: NeedleMapMetric, + idx_file: Option>, + idx_file_offset: u64, +} + +impl RedbNeedleMap { + /// Begin a write transaction with `Durability::None` (no fsync). + /// The .idx file is the source of truth for crash recovery, so redb + /// is always rebuilt from .idx on startup — fsync is unnecessary. + fn begin_write_no_fsync(db: &Database) -> io::Result { + let mut txn = db.begin_write().map_err(|e| { + io::Error::new(io::ErrorKind::Other, format!("redb begin_write: {}", e)) + })?; + let _ = txn.set_durability(Durability::None); + Ok(txn) + } + + /// Create a new redb-backed needle map at the given path. + /// The database file will be created if it does not exist. + pub fn new(db_path: &str) -> io::Result { + let db = Database::create(db_path).map_err(|e| { + io::Error::new(io::ErrorKind::Other, format!("redb create error: {}", e)) + })?; + + // Ensure tables exist + let txn = Self::begin_write_no_fsync(&db)?; + { + let _table = txn.open_table(NEEDLE_TABLE).map_err(|e| { + io::Error::new(io::ErrorKind::Other, format!("redb open_table: {}", e)) + })?; + let _meta = txn.open_table(META_TABLE).map_err(|e| { + io::Error::new(io::ErrorKind::Other, format!("redb open_table meta: {}", e)) + })?; + } + txn.commit() + .map_err(|e| io::Error::new(io::ErrorKind::Other, format!("redb commit: {}", e)))?; + + Ok(RedbNeedleMap { + db, + metric: NeedleMapMetric::default(), + idx_file: None, + idx_file_offset: 0, + }) + } + + /// Save the .idx file size into redb metadata so we can detect whether + /// the .rdb is up-to-date on the next startup. + fn save_idx_size_meta(&self, idx_size: u64) -> io::Result<()> { + let txn = Self::begin_write_no_fsync(&self.db)?; + { + let mut meta = txn.open_table(META_TABLE).map_err(|e| { + io::Error::new(io::ErrorKind::Other, format!("redb open meta: {}", e)) + })?; + meta.insert(META_IDX_SIZE, idx_size).map_err(|e| { + io::Error::new(io::ErrorKind::Other, format!("redb insert meta: {}", e)) + })?; + } + txn.commit().map_err(|e| { + io::Error::new(io::ErrorKind::Other, format!("redb commit meta: {}", e)) + })?; + Ok(()) + } + + /// Read the stored .idx file size from redb metadata. + fn read_idx_size_meta(&self) -> io::Result> { + let txn = self + .db + .begin_read() + .map_err(|e| io::Error::new(io::ErrorKind::Other, format!("redb begin_read: {}", e)))?; + let meta = txn + .open_table(META_TABLE) + .map_err(|e| io::Error::new(io::ErrorKind::Other, format!("redb open meta: {}", e)))?; + match meta.get(META_IDX_SIZE) { + Ok(Some(guard)) => Ok(Some(guard.value())), + Ok(None) => Ok(None), + Err(e) => Err(io::Error::new( + io::ErrorKind::Other, + format!("redb get meta: {}", e), + )), + } + } + + /// Rebuild metrics by scanning all entries in the redb table. + /// Called when reusing an existing .rdb without a full rebuild. + fn rebuild_metrics_from_db(&self) -> io::Result<()> { + let txn = self + .db + .begin_read() + .map_err(|e| io::Error::new(io::ErrorKind::Other, format!("redb begin_read: {}", e)))?; + let table = txn + .open_table(NEEDLE_TABLE) + .map_err(|e| io::Error::new(io::ErrorKind::Other, format!("redb open_table: {}", e)))?; + let iter = table + .iter() + .map_err(|e| io::Error::new(io::ErrorKind::Other, format!("redb iter: {}", e)))?; + for entry in iter { + let (key_guard, val_guard) = entry.map_err(|e| { + io::Error::new(io::ErrorKind::Other, format!("redb iter next: {}", e)) + })?; + let key = NeedleId(key_guard.value()); + let bytes: &[u8] = val_guard.value(); + if bytes.len() == PACKED_NEEDLE_VALUE_SIZE { + let mut arr = [0u8; PACKED_NEEDLE_VALUE_SIZE]; + arr.copy_from_slice(bytes); + let nv = unpack_needle_value(&arr); + self.metric.maybe_set_max_file_key(key); + if nv.size.is_valid() { + self.metric.file_count.fetch_add(1, Ordering::Relaxed); + self.metric + .file_byte_count + .fetch_add(nv.size.0 as u64, Ordering::Relaxed); + } else { + // Deleted entry (negative size) + self.metric.deletion_count.fetch_add(1, Ordering::Relaxed); + self.metric + .deletion_byte_count + .fetch_add((-nv.size.0) as u64, Ordering::Relaxed); + } + } + } + Ok(()) + } + + /// Load from an .idx file, reusing an existing .rdb if it is consistent. + /// + /// Strategy: + /// 1. Try to open existing .rdb and read its stored .idx size + /// 2. If .idx size matches → reuse .rdb, rebuild metrics from scan + /// 3. If .idx is larger → replay new entries incrementally + /// 4. Otherwise (missing, corrupted, .idx smaller) → full rebuild + pub fn load_from_idx(db_path: &str, reader: &mut R) -> io::Result { + let idx_size = reader.seek(io::SeekFrom::End(0))?; + reader.seek(io::SeekFrom::Start(0))?; + + // Try to reuse existing .rdb + if Path::new(db_path).exists() { + if let Ok(nm) = Self::try_reuse_rdb(db_path, reader, idx_size) { + return Ok(nm); + } + // Reuse failed — fall through to full rebuild + reader.seek(io::SeekFrom::Start(0))?; + } + + Self::full_rebuild(db_path, reader, idx_size) + } + + /// Try to reuse an existing .rdb file. Returns Ok if successful, + /// Err if a full rebuild is needed. + fn try_reuse_rdb( + db_path: &str, + reader: &mut R, + idx_size: u64, + ) -> io::Result { + let db = Database::open(db_path) + .map_err(|e| io::Error::new(io::ErrorKind::Other, format!("redb open: {}", e)))?; + + let nm = RedbNeedleMap { + db, + metric: NeedleMapMetric::default(), + idx_file: None, + idx_file_offset: 0, + }; + + let stored_idx_size = nm + .read_idx_size_meta()? + .ok_or_else(|| io::Error::new(io::ErrorKind::Other, "no idx_size in redb meta"))?; + + if stored_idx_size > idx_size { + // .idx shrank — corrupted or truncated, need full rebuild + return Err(io::Error::new( + io::ErrorKind::Other, + "idx file smaller than stored size", + )); + } + + // Rebuild metrics from existing data + nm.rebuild_metrics_from_db()?; + + if stored_idx_size < idx_size { + // .idx grew — replay new entries incrementally + let start_entry = stored_idx_size / NEEDLE_MAP_ENTRY_SIZE as u64; + let txn = Self::begin_write_no_fsync(&nm.db)?; + { + let mut table = txn.open_table(NEEDLE_TABLE).map_err(|e| { + io::Error::new(io::ErrorKind::Other, format!("redb open_table: {}", e)) + })?; + idx::walk_index_file(reader, start_entry, |key, offset, size| { + let key_u64: u64 = key.into(); + if offset.is_zero() || size.is_deleted() { + // Delete: look up old value for metric update, then + // store tombstone (negative size with original offset) + if let Ok(Some(old)) = nm.get_via_table(&table, key_u64) { + if old.size.is_valid() { + nm.metric.on_delete(&old); + let deleted_nv = NeedleValue { + offset: old.offset, + size: Size(-(old.size.0)), + }; + let packed = pack_needle_value(&deleted_nv); + table.insert(key_u64, packed.as_slice()).map_err(|e| { + io::Error::new( + io::ErrorKind::Other, + format!("redb insert: {}", e), + ) + })?; + } + } + } else { + // Put: look up old value for metric update + let old = nm.get_via_table(&table, key_u64).ok().flatten(); + let nv = NeedleValue { offset, size }; + let packed = pack_needle_value(&nv); + table.insert(key_u64, packed.as_slice()).map_err(|e| { + io::Error::new(io::ErrorKind::Other, format!("redb insert: {}", e)) + })?; + nm.metric.on_put(key, old.as_ref(), size); + } + Ok(()) + })?; + } + txn.commit() + .map_err(|e| io::Error::new(io::ErrorKind::Other, format!("redb commit: {}", e)))?; + + nm.save_idx_size_meta(idx_size)?; + } + + Ok(nm) + } + + /// Look up a needle value using an already-open table reference. + /// Used during incremental replay to avoid opening separate read transactions. + fn get_via_table( + &self, + table: &redb::Table, + key_u64: u64, + ) -> io::Result> { + match table.get(key_u64) { + Ok(Some(guard)) => { + let bytes: &[u8] = guard.value(); + if bytes.len() == PACKED_NEEDLE_VALUE_SIZE { + let mut arr = [0u8; PACKED_NEEDLE_VALUE_SIZE]; + arr.copy_from_slice(bytes); + Ok(Some(unpack_needle_value(&arr))) + } else { + Ok(None) + } + } + Ok(None) => Ok(None), + Err(e) => Err(io::Error::new( + io::ErrorKind::Other, + format!("redb get: {}", e), + )), + } + } + + /// Full rebuild: delete existing .rdb and rebuild from entire .idx file. + fn full_rebuild( + db_path: &str, + reader: &mut R, + idx_size: u64, + ) -> io::Result { + let _ = std::fs::remove_file(db_path); + let nm = RedbNeedleMap::new(db_path)?; + + // Collect entries from idx file, resolving duplicates/deletions + let mut entries: HashMap> = HashMap::new(); + idx::walk_index_file(reader, 0, |key, offset, size| { + if offset.is_zero() || size.is_deleted() { + entries.insert(key, None); + } else { + entries.insert(key, Some(NeedleValue { offset, size })); + } + Ok(()) + })?; + + // Write all live entries to redb in a single transaction + let txn = Self::begin_write_no_fsync(&nm.db)?; + { + let mut table = txn.open_table(NEEDLE_TABLE).map_err(|e| { + io::Error::new(io::ErrorKind::Other, format!("redb open_table: {}", e)) + })?; + + for (key, maybe_nv) in &entries { + let key_u64: u64 = (*key).into(); + if let Some(nv) = maybe_nv { + let packed = pack_needle_value(nv); + table.insert(key_u64, packed.as_slice()).map_err(|e| { + io::Error::new(io::ErrorKind::Other, format!("redb insert: {}", e)) + })?; + nm.metric.on_put(*key, None, nv.size); + } else { + // Entry was deleted — remove from redb if present + table.remove(key_u64).map_err(|e| { + io::Error::new(io::ErrorKind::Other, format!("redb remove: {}", e)) + })?; + } + } + } + txn.commit() + .map_err(|e| io::Error::new(io::ErrorKind::Other, format!("redb commit: {}", e)))?; + + nm.save_idx_size_meta(idx_size)?; + + Ok(nm) + } + + /// Set the index file for append-only writes. + pub fn set_idx_file(&mut self, file: Box, offset: u64) { + self.idx_file = Some(file); + self.idx_file_offset = offset; + } + + // ---- Map operations ---- + + /// Insert or update an entry. Writes to idx file first, then redb. + pub fn put(&mut self, key: NeedleId, offset: Offset, size: Size) -> io::Result<()> { + // Persist to idx file BEFORE mutating redb state for crash consistency + if let Some(ref mut idx_file) = self.idx_file { + idx::write_index_entry(idx_file, key, offset, size)?; + self.idx_file_offset += NEEDLE_MAP_ENTRY_SIZE as u64; + } + + let key_u64: u64 = key.into(); + let nv = NeedleValue { offset, size }; + let packed = pack_needle_value(&nv); + + // Read old value for metric update + let old = self.get_internal(key_u64)?; + + let txn = Self::begin_write_no_fsync(&self.db)?; + { + let mut table = txn.open_table(NEEDLE_TABLE).map_err(|e| { + io::Error::new(io::ErrorKind::Other, format!("redb open_table: {}", e)) + })?; + table + .insert(key_u64, packed.as_slice()) + .map_err(|e| io::Error::new(io::ErrorKind::Other, format!("redb insert: {}", e)))?; + } + txn.commit() + .map_err(|e| io::Error::new(io::ErrorKind::Other, format!("redb commit: {}", e)))?; + + self.metric.on_put(key, old.as_ref(), size); + Ok(()) + } + + /// Look up a needle. + pub fn get(&self, key: NeedleId) -> Option { + let key_u64: u64 = key.into(); + self.get_internal(key_u64).ok().flatten() + } + + /// Internal get that returns io::Result for error propagation. + fn get_internal(&self, key_u64: u64) -> io::Result> { + let txn = self + .db + .begin_read() + .map_err(|e| io::Error::new(io::ErrorKind::Other, format!("redb begin_read: {}", e)))?; + let table = txn + .open_table(NEEDLE_TABLE) + .map_err(|e| io::Error::new(io::ErrorKind::Other, format!("redb open_table: {}", e)))?; + match table.get(key_u64) { + Ok(Some(guard)) => { + let bytes: &[u8] = guard.value(); + if bytes.len() == PACKED_NEEDLE_VALUE_SIZE { + let mut arr = [0u8; PACKED_NEEDLE_VALUE_SIZE]; + arr.copy_from_slice(bytes); + Ok(Some(unpack_needle_value(&arr))) + } else { + Ok(None) + } + } + Ok(None) => Ok(None), + Err(e) => Err(io::Error::new( + io::ErrorKind::Other, + format!("redb get: {}", e), + )), + } + } + + /// Mark a needle as deleted. Appends tombstone to .idx file, negates size in redb. + pub fn delete(&mut self, key: NeedleId, offset: Offset) -> io::Result> { + let key_u64: u64 = key.into(); + + if let Some(old) = self.get_internal(key_u64)? { + if old.size.is_valid() { + // Persist tombstone to idx file BEFORE mutating redb + if let Some(ref mut idx_file) = self.idx_file { + idx::write_index_entry(idx_file, key, offset, TOMBSTONE_FILE_SIZE)?; + self.idx_file_offset += NEEDLE_MAP_ENTRY_SIZE as u64; + } + + self.metric.on_delete(&old); + let deleted_size = Size(-(old.size.0)); + // Keep original offset so readDeleted can find original data (matching Go behavior) + let deleted_nv = NeedleValue { + offset: old.offset, + size: deleted_size, + }; + let packed = pack_needle_value(&deleted_nv); + + let txn = Self::begin_write_no_fsync(&self.db)?; + { + let mut table = txn.open_table(NEEDLE_TABLE).map_err(|e| { + io::Error::new(io::ErrorKind::Other, format!("redb open_table: {}", e)) + })?; + table.insert(key_u64, packed.as_slice()).map_err(|e| { + io::Error::new(io::ErrorKind::Other, format!("redb insert: {}", e)) + })?; + } + txn.commit().map_err(|e| { + io::Error::new(io::ErrorKind::Other, format!("redb commit: {}", e)) + })?; + + return Ok(Some(old.size)); + } + } + Ok(None) + } + + // ---- Metrics accessors ---- + + pub fn content_size(&self) -> u64 { + self.metric.file_byte_count.load(Ordering::Relaxed) + } + + pub fn deleted_size(&self) -> u64 { + self.metric.deletion_byte_count.load(Ordering::Relaxed) + } + + pub fn file_count(&self) -> i64 { + self.metric.file_count.load(Ordering::Relaxed) + } + + pub fn deleted_count(&self) -> i64 { + self.metric.deletion_count.load(Ordering::Relaxed) + } + + pub fn max_file_key(&self) -> NeedleId { + NeedleId(self.metric.max_file_key.load(Ordering::Relaxed)) + } + + pub fn index_file_size(&self) -> u64 { + self.idx_file_offset + } + + /// Sync index file to disk. + pub fn sync(&self) -> io::Result<()> { + if let Some(ref idx_file) = self.idx_file { + idx_file.sync_all()?; + } + Ok(()) + } + + /// Close index file. + pub fn close(&mut self) { + let _ = self.sync(); + self.idx_file = None; + } + + /// Save the redb contents to an index file, sorted by needle ID ascending. + pub fn save_to_idx(&self, path: &str) -> io::Result<()> { + let txn = self + .db + .begin_read() + .map_err(|e| io::Error::new(io::ErrorKind::Other, format!("redb begin_read: {}", e)))?; + let table = txn + .open_table(NEEDLE_TABLE) + .map_err(|e| io::Error::new(io::ErrorKind::Other, format!("redb open_table: {}", e)))?; + + let mut file = std::fs::OpenOptions::new() + .write(true) + .create(true) + .truncate(true) + .open(path)?; + + // redb iterates in key order (u64 ascending) + let iter = table + .iter() + .map_err(|e| io::Error::new(io::ErrorKind::Other, format!("redb iter: {}", e)))?; + + for entry in iter { + let (key_guard, val_guard) = entry.map_err(|e| { + io::Error::new(io::ErrorKind::Other, format!("redb iter next: {}", e)) + })?; + let key_u64: u64 = key_guard.value(); + let bytes: &[u8] = val_guard.value(); + if bytes.len() == PACKED_NEEDLE_VALUE_SIZE { + let mut arr = [0u8; PACKED_NEEDLE_VALUE_SIZE]; + arr.copy_from_slice(bytes); + let nv = unpack_needle_value(&arr); + if nv.size.is_valid() { + idx::write_index_entry(&mut file, NeedleId(key_u64), nv.offset, nv.size)?; + } + } + } + file.sync_all()?; + Ok(()) + } + + /// Visit all entries in ascending order by needle ID. + pub fn ascending_visit(&self, mut f: F) -> Result<(), String> + where + F: FnMut(NeedleId, &NeedleValue) -> Result<(), String>, + { + let txn = self + .db + .begin_read() + .map_err(|e| format!("redb begin_read: {}", e))?; + let table = txn + .open_table(NEEDLE_TABLE) + .map_err(|e| format!("redb open_table: {}", e))?; + let iter = table.iter().map_err(|e| format!("redb iter: {}", e))?; + + for entry in iter { + let (key_guard, val_guard) = entry.map_err(|e| format!("redb iter next: {}", e))?; + let key_u64: u64 = key_guard.value(); + let bytes: &[u8] = val_guard.value(); + if bytes.len() == PACKED_NEEDLE_VALUE_SIZE { + let mut arr = [0u8; PACKED_NEEDLE_VALUE_SIZE]; + arr.copy_from_slice(bytes); + let nv = unpack_needle_value(&arr); + f(NeedleId(key_u64), &nv)?; + } + } + Ok(()) + } + + /// Collect all entries as a Vec for iteration (used by volume.rs iter patterns). + pub fn collect_entries(&self) -> Vec<(NeedleId, NeedleValue)> { + let mut result = Vec::new(); + let txn: redb::ReadTransaction = match self.db.begin_read() { + Ok(t) => t, + Err(_) => return result, + }; + let table = match txn.open_table(NEEDLE_TABLE) { + Ok(t) => t, + Err(_) => return result, + }; + let iter = match table.iter() { + Ok(i) => i, + Err(_) => return result, + }; + for entry in iter { + if let Ok((key_guard, val_guard)) = entry { + let key_u64: u64 = key_guard.value(); + let bytes: &[u8] = val_guard.value(); + if bytes.len() == PACKED_NEEDLE_VALUE_SIZE { + let mut arr = [0u8; PACKED_NEEDLE_VALUE_SIZE]; + arr.copy_from_slice(bytes); + let nv = unpack_needle_value(&arr); + result.push((NeedleId(key_u64), nv)); + } + } + } + result + } +} + +// ============================================================================ +// NeedleMap enum — unified interface over both implementations +// ============================================================================ + +/// Unified needle map wrapping either in-memory or redb-backed storage. +pub enum NeedleMap { + InMemory(CompactNeedleMap), + Redb(RedbNeedleMap), +} + +impl NeedleMap { + /// Insert or update an entry. + pub fn put(&mut self, key: NeedleId, offset: Offset, size: Size) -> io::Result<()> { + match self { + NeedleMap::InMemory(nm) => nm.put(key, offset, size), + NeedleMap::Redb(nm) => nm.put(key, offset, size), + } + } + + /// Look up a needle. + pub fn get(&self, key: NeedleId) -> Option { + match self { + NeedleMap::InMemory(nm) => nm.get(key), + NeedleMap::Redb(nm) => nm.get(key), + } + } + + /// Mark a needle as deleted. + pub fn delete(&mut self, key: NeedleId, offset: Offset) -> io::Result> { + match self { + NeedleMap::InMemory(nm) => nm.delete(key, offset), + NeedleMap::Redb(nm) => nm.delete(key, offset), + } + } + + /// Set the index file for append-only writes. + pub fn set_idx_file(&mut self, file: Box, offset: u64) { + match self { + NeedleMap::InMemory(nm) => nm.set_idx_file(file, offset), + NeedleMap::Redb(nm) => nm.set_idx_file(file, offset), + } + } + + /// Content byte count. + pub fn content_size(&self) -> u64 { + match self { + NeedleMap::InMemory(nm) => nm.content_size(), + NeedleMap::Redb(nm) => nm.content_size(), + } + } + + /// Deleted byte count. + pub fn deleted_size(&self) -> u64 { + match self { + NeedleMap::InMemory(nm) => nm.deleted_size(), + NeedleMap::Redb(nm) => nm.deleted_size(), + } + } + + /// Live file count. + pub fn file_count(&self) -> i64 { + match self { + NeedleMap::InMemory(nm) => nm.file_count(), + NeedleMap::Redb(nm) => nm.file_count(), + } + } + + /// Deleted file count. + pub fn deleted_count(&self) -> i64 { + match self { + NeedleMap::InMemory(nm) => nm.deleted_count(), + NeedleMap::Redb(nm) => nm.deleted_count(), + } + } + + /// Maximum needle ID seen. + pub fn max_file_key(&self) -> NeedleId { + match self { + NeedleMap::InMemory(nm) => nm.max_file_key(), + NeedleMap::Redb(nm) => nm.max_file_key(), + } + } + + /// Index file size in bytes. + pub fn index_file_size(&self) -> u64 { + match self { + NeedleMap::InMemory(nm) => nm.index_file_size(), + NeedleMap::Redb(nm) => nm.index_file_size(), + } + } + + /// Sync index file to disk. + pub fn sync(&self) -> io::Result<()> { + match self { + NeedleMap::InMemory(nm) => nm.sync(), + NeedleMap::Redb(nm) => nm.sync(), + } + } + + /// Close index file. + pub fn close(&mut self) { + match self { + NeedleMap::InMemory(nm) => nm.close(), + NeedleMap::Redb(nm) => nm.close(), + } + } + + /// Save to an index file. + pub fn save_to_idx(&self, path: &str) -> io::Result<()> { + match self { + NeedleMap::InMemory(nm) => nm.save_to_idx(path), + NeedleMap::Redb(nm) => nm.save_to_idx(path), + } + } + + /// Visit all entries in ascending order by needle ID. + pub fn ascending_visit(&self, f: F) -> Result<(), String> + where + F: FnMut(NeedleId, &NeedleValue) -> Result<(), String>, + { + match self { + NeedleMap::InMemory(nm) => nm.ascending_visit(f), + NeedleMap::Redb(nm) => nm.ascending_visit(f), + } + } + + /// Iterate all entries. Returns a Vec of (NeedleId, NeedleValue) pairs. + /// For InMemory this collects via ascending visit; for Redb it reads from disk. + pub fn iter_entries(&self) -> Vec<(NeedleId, NeedleValue)> { + match self { + NeedleMap::InMemory(nm) => { + let mut entries = Vec::new(); + let _ = nm.ascending_visit(|id, nv| { + entries.push((id, *nv)); + Ok(()) + }); + entries + } + NeedleMap::Redb(nm) => nm.collect_entries(), + } + } +} + +// ============================================================================ +// Tests +// ============================================================================ + +#[cfg(test)] +mod tests { + use super::*; + use std::io::Cursor; + + #[test] + fn test_needle_map_put_get() { + let mut nm = CompactNeedleMap::new(); + nm.put(NeedleId(1), Offset::from_actual_offset(0), Size(100)) + .unwrap(); + nm.put(NeedleId(2), Offset::from_actual_offset(128), Size(200)) + .unwrap(); + + let v1 = nm.get(NeedleId(1)).unwrap(); + assert_eq!(v1.size, Size(100)); + + let v2 = nm.get(NeedleId(2)).unwrap(); + assert_eq!(v2.size, Size(200)); + + assert!(nm.get(NeedleId(99)).is_none()); + } + + #[test] + fn test_needle_map_delete() { + let mut nm = CompactNeedleMap::new(); + nm.put(NeedleId(1), Offset::from_actual_offset(0), Size(100)) + .unwrap(); + + assert_eq!(nm.file_count(), 1); + assert_eq!(nm.content_size(), 100); + + let deleted = nm + .delete(NeedleId(1), Offset::from_actual_offset(0)) + .unwrap(); + assert_eq!(deleted, Some(Size(100))); + + // Additive-only: file_count stays at 1 after delete + assert_eq!(nm.file_count(), 1); + assert_eq!(nm.deleted_count(), 1); + assert_eq!(nm.deleted_size(), 100); + } + + #[test] + fn test_needle_map_metrics() { + let mut nm = CompactNeedleMap::new(); + nm.put(NeedleId(1), Offset::from_actual_offset(0), Size(100)) + .unwrap(); + nm.put(NeedleId(2), Offset::from_actual_offset(128), Size(200)) + .unwrap(); + nm.put(NeedleId(3), Offset::from_actual_offset(384), Size(300)) + .unwrap(); + + assert_eq!(nm.file_count(), 3); + assert_eq!(nm.content_size(), 600); + assert_eq!(nm.max_file_key(), NeedleId(3)); + + // Update existing — additive-only: file_count increments, content_size adds + nm.put(NeedleId(2), Offset::from_actual_offset(700), Size(250)) + .unwrap(); + assert_eq!(nm.file_count(), 4); // 3 + 1 (always increments) + assert_eq!(nm.content_size(), 850); // 600 + 250 (always adds) + + // Delete — additive-only: file_count unchanged + nm.delete(NeedleId(1), Offset::from_actual_offset(0)) + .unwrap(); + assert_eq!(nm.file_count(), 4); // unchanged + assert_eq!(nm.deleted_count(), 2); // 1 from overwrite + 1 from delete + } + + #[test] + fn test_needle_map_load_from_idx() { + // Build an idx file in memory + // Note: offset 0 is reserved for the SuperBlock, so real needles start at offset >= 8 + let mut idx_data = Vec::new(); + idx::write_index_entry( + &mut idx_data, + NeedleId(1), + Offset::from_actual_offset(8), + Size(100), + ) + .unwrap(); + idx::write_index_entry( + &mut idx_data, + NeedleId(2), + Offset::from_actual_offset(128), + Size(200), + ) + .unwrap(); + idx::write_index_entry( + &mut idx_data, + NeedleId(3), + Offset::from_actual_offset(384), + Size(300), + ) + .unwrap(); + // Delete needle 2 + idx::write_index_entry( + &mut idx_data, + NeedleId(2), + Offset::default(), + TOMBSTONE_FILE_SIZE, + ) + .unwrap(); + + let mut cursor = Cursor::new(idx_data); + let nm = CompactNeedleMap::load_from_idx(&mut cursor).unwrap(); + + assert!(nm.get(NeedleId(1)).is_some()); + assert!(nm.get(NeedleId(2)).is_none()); // deleted + assert!(nm.get(NeedleId(3)).is_some()); + // Additive-only: put(1)+put(2)+put(3) = 3, delete doesn't decrement + assert_eq!(nm.file_count(), 3); + } + + #[test] + fn test_needle_map_double_delete() { + let mut nm = CompactNeedleMap::new(); + nm.put(NeedleId(1), Offset::from_actual_offset(0), Size(100)) + .unwrap(); + + let r1 = nm + .delete(NeedleId(1), Offset::from_actual_offset(0)) + .unwrap(); + assert_eq!(r1, Some(Size(100))); + + // Second delete should return None (already deleted) + let r2 = nm + .delete(NeedleId(1), Offset::from_actual_offset(0)) + .unwrap(); + assert_eq!(r2, None); + assert_eq!(nm.deleted_count(), 1); // not double counted + } + + // ---- RedbNeedleMap tests ---- + + #[test] + fn test_redb_needle_map_put_get() { + let dir = tempfile::tempdir().unwrap(); + let db_path = dir.path().join("test.rdb"); + let mut nm = RedbNeedleMap::new(db_path.to_str().unwrap()).unwrap(); + + nm.put(NeedleId(1), Offset::from_actual_offset(0), Size(100)) + .unwrap(); + nm.put(NeedleId(2), Offset::from_actual_offset(128), Size(200)) + .unwrap(); + + let v1 = nm.get(NeedleId(1)).unwrap(); + assert_eq!(v1.size, Size(100)); + + let v2 = nm.get(NeedleId(2)).unwrap(); + assert_eq!(v2.size, Size(200)); + + assert!(nm.get(NeedleId(99)).is_none()); + } + + #[test] + fn test_redb_needle_map_delete() { + let dir = tempfile::tempdir().unwrap(); + let db_path = dir.path().join("test.rdb"); + let mut nm = RedbNeedleMap::new(db_path.to_str().unwrap()).unwrap(); + + nm.put(NeedleId(1), Offset::from_actual_offset(0), Size(100)) + .unwrap(); + assert_eq!(nm.file_count(), 1); + assert_eq!(nm.content_size(), 100); + + let deleted = nm + .delete(NeedleId(1), Offset::from_actual_offset(0)) + .unwrap(); + assert_eq!(deleted, Some(Size(100))); + + // Additive-only: file_count stays at 1 after delete + assert_eq!(nm.file_count(), 1); + assert_eq!(nm.deleted_count(), 1); + assert_eq!(nm.deleted_size(), 100); + + // Deleted entry should have negated size + let nv = nm.get(NeedleId(1)).unwrap(); + assert_eq!(nv.size, Size(-100)); + } + + #[test] + fn test_redb_needle_map_metrics() { + let dir = tempfile::tempdir().unwrap(); + let db_path = dir.path().join("test.rdb"); + let mut nm = RedbNeedleMap::new(db_path.to_str().unwrap()).unwrap(); + + nm.put(NeedleId(1), Offset::from_actual_offset(0), Size(100)) + .unwrap(); + nm.put(NeedleId(2), Offset::from_actual_offset(128), Size(200)) + .unwrap(); + nm.put(NeedleId(3), Offset::from_actual_offset(384), Size(300)) + .unwrap(); + + assert_eq!(nm.file_count(), 3); + assert_eq!(nm.content_size(), 600); + assert_eq!(nm.max_file_key(), NeedleId(3)); + + // Update existing — additive-only: file_count increments, content_size adds + nm.put(NeedleId(2), Offset::from_actual_offset(700), Size(250)) + .unwrap(); + assert_eq!(nm.file_count(), 4); // 3 + 1 (always increments) + assert_eq!(nm.content_size(), 850); // 600 + 250 (always adds) + + // Delete — additive-only: file_count unchanged + nm.delete(NeedleId(1), Offset::from_actual_offset(0)) + .unwrap(); + assert_eq!(nm.file_count(), 4); // unchanged + assert_eq!(nm.deleted_count(), 2); // 1 from overwrite + 1 from delete + } + + #[test] + fn test_redb_needle_map_load_from_idx() { + let dir = tempfile::tempdir().unwrap(); + let db_path = dir.path().join("test.rdb"); + + let mut idx_data = Vec::new(); + idx::write_index_entry( + &mut idx_data, + NeedleId(1), + Offset::from_actual_offset(8), + Size(100), + ) + .unwrap(); + idx::write_index_entry( + &mut idx_data, + NeedleId(2), + Offset::from_actual_offset(128), + Size(200), + ) + .unwrap(); + idx::write_index_entry( + &mut idx_data, + NeedleId(3), + Offset::from_actual_offset(384), + Size(300), + ) + .unwrap(); + // Delete needle 2 + idx::write_index_entry( + &mut idx_data, + NeedleId(2), + Offset::default(), + TOMBSTONE_FILE_SIZE, + ) + .unwrap(); + + let mut cursor = Cursor::new(idx_data); + let nm = RedbNeedleMap::load_from_idx(db_path.to_str().unwrap(), &mut cursor).unwrap(); + + assert!(nm.get(NeedleId(1)).is_some()); + assert!(nm.get(NeedleId(2)).is_none()); // deleted and removed + assert!(nm.get(NeedleId(3)).is_some()); + assert_eq!(nm.file_count(), 2); + } + + #[test] + fn test_redb_needle_map_double_delete() { + let dir = tempfile::tempdir().unwrap(); + let db_path = dir.path().join("test.rdb"); + let mut nm = RedbNeedleMap::new(db_path.to_str().unwrap()).unwrap(); + + nm.put(NeedleId(1), Offset::from_actual_offset(0), Size(100)) + .unwrap(); + + let r1 = nm + .delete(NeedleId(1), Offset::from_actual_offset(0)) + .unwrap(); + assert_eq!(r1, Some(Size(100))); + + // Second delete should return None (already deleted) + let r2 = nm + .delete(NeedleId(1), Offset::from_actual_offset(0)) + .unwrap(); + assert_eq!(r2, None); + assert_eq!(nm.deleted_count(), 1); // not double counted + } + + #[test] + fn test_redb_needle_map_ascending_visit() { + let dir = tempfile::tempdir().unwrap(); + let db_path = dir.path().join("test.rdb"); + let mut nm = RedbNeedleMap::new(db_path.to_str().unwrap()).unwrap(); + + nm.put(NeedleId(3), Offset::from_actual_offset(384), Size(300)) + .unwrap(); + nm.put(NeedleId(1), Offset::from_actual_offset(0), Size(100)) + .unwrap(); + nm.put(NeedleId(2), Offset::from_actual_offset(128), Size(200)) + .unwrap(); + + let mut visited = Vec::new(); + nm.ascending_visit(|id, nv| { + visited.push((id, nv.size)); + Ok(()) + }) + .unwrap(); + + assert_eq!(visited.len(), 3); + assert_eq!(visited[0], (NeedleId(1), Size(100))); + assert_eq!(visited[1], (NeedleId(2), Size(200))); + assert_eq!(visited[2], (NeedleId(3), Size(300))); + } + + #[test] + fn test_redb_needle_map_save_to_idx() { + let dir = tempfile::tempdir().unwrap(); + let db_path = dir.path().join("test.rdb"); + let idx_path = dir.path().join("test.idx"); + + let mut nm = RedbNeedleMap::new(db_path.to_str().unwrap()).unwrap(); + nm.put(NeedleId(1), Offset::from_actual_offset(8), Size(100)) + .unwrap(); + nm.put(NeedleId(2), Offset::from_actual_offset(128), Size(200)) + .unwrap(); + nm.put(NeedleId(3), Offset::from_actual_offset(384), Size(300)) + .unwrap(); + // Delete needle 2 + nm.delete(NeedleId(2), Offset::from_actual_offset(128)) + .unwrap(); + + nm.save_to_idx(idx_path.to_str().unwrap()).unwrap(); + + // Load back with CompactNeedleMap to verify + let mut idx_file = std::fs::File::open(&idx_path).unwrap(); + let loaded = CompactNeedleMap::load_from_idx(&mut idx_file).unwrap(); + assert_eq!(loaded.file_count(), 2); // only live entries + assert!(loaded.get(NeedleId(1)).is_some()); + assert!(loaded.get(NeedleId(2)).is_none()); // deleted, not saved + assert!(loaded.get(NeedleId(3)).is_some()); + } + + #[test] + fn test_pack_unpack_needle_value() { + let nv = NeedleValue { + offset: Offset::from_actual_offset(8 * 1000), + size: Size(4096), + }; + let packed = pack_needle_value(&nv); + let unpacked = unpack_needle_value(&packed); + assert_eq!( + nv.offset.to_actual_offset(), + unpacked.offset.to_actual_offset() + ); + assert_eq!(nv.size, unpacked.size); + } + + #[test] + fn test_pack_unpack_negative_size() { + let nv = NeedleValue { + offset: Offset::from_actual_offset(8 * 500), + size: Size(-100), + }; + let packed = pack_needle_value(&nv); + let unpacked = unpack_needle_value(&packed); + assert_eq!( + nv.offset.to_actual_offset(), + unpacked.offset.to_actual_offset() + ); + assert_eq!(nv.size, unpacked.size); + } + + // ---- NeedleMap enum tests ---- + + #[test] + fn test_needle_map_enum_inmemory() { + let mut nm = NeedleMap::InMemory(CompactNeedleMap::new()); + nm.put(NeedleId(1), Offset::from_actual_offset(0), Size(100)) + .unwrap(); + assert_eq!(nm.get(NeedleId(1)).unwrap().size, Size(100)); + assert_eq!(nm.file_count(), 1); + } + + #[test] + fn test_needle_map_enum_redb() { + let dir = tempfile::tempdir().unwrap(); + let db_path = dir.path().join("test.rdb"); + let mut nm = NeedleMap::Redb(RedbNeedleMap::new(db_path.to_str().unwrap()).unwrap()); + nm.put(NeedleId(1), Offset::from_actual_offset(0), Size(100)) + .unwrap(); + assert_eq!(nm.get(NeedleId(1)).unwrap().size, Size(100)); + assert_eq!(nm.file_count(), 1); + } +} diff --git a/seaweed-volume/src/storage/needle_map/compact_map.rs b/seaweed-volume/src/storage/needle_map/compact_map.rs new file mode 100644 index 000000000..9dea94ce7 --- /dev/null +++ b/seaweed-volume/src/storage/needle_map/compact_map.rs @@ -0,0 +1,375 @@ +//! CompactMap: memory-efficient in-memory map of NeedleId -> (Offset, Size). +//! +//! Port of Go's CompactMap from weed/storage/needle_map/compact_map.go. +//! Uses segmented sorted arrays with compressed keys (u16 instead of u64) +//! to achieve ~10 bytes per entry instead of ~40-48 bytes with HashMap. +//! +//! NeedleId is split into: chunk = id / SEGMENT_CHUNK_SIZE, compact_key = id % SEGMENT_CHUNK_SIZE. +//! Each segment stores up to SEGMENT_CHUNK_SIZE entries in a sorted Vec, searched via binary search. +//! Best case (ordered inserts): O(1). Worst case: O(log n) per segment. + +use std::collections::HashMap; + +use super::NeedleValue; +use crate::storage::types::*; + +/// Maximum entries per segment. Must be <= u16::MAX (65535). +const SEGMENT_CHUNK_SIZE: u64 = 50_000; + +/// Compact key: only the low bits of NeedleId within a segment. +type CompactKey = u16; + +/// Segment chunk identifier: NeedleId / SEGMENT_CHUNK_SIZE. +type Chunk = u64; + +/// Compact entry: 10 bytes (2 + 4 + 4) vs 16 bytes for full NeedleId + NeedleValue. +#[derive(Clone, Copy)] +struct CompactEntry { + key: CompactKey, // 2 bytes + offset: [u8; OFFSET_SIZE], // 4 bytes + size: Size, // 4 bytes +} + +impl CompactEntry { + fn to_needle_value(&self) -> NeedleValue { + NeedleValue { + offset: Offset::from_bytes(&self.offset), + size: self.size, + } + } +} + +/// A sorted segment of compact entries for a given chunk. +struct Segment { + list: Vec, + chunk: Chunk, + first_key: CompactKey, + last_key: CompactKey, +} + +impl Segment { + fn new(chunk: Chunk) -> Self { + Segment { + list: Vec::new(), + chunk, + first_key: u16::MAX, + last_key: 0, + } + } + + fn compact_key(&self, id: NeedleId) -> CompactKey { + (id.0 - SEGMENT_CHUNK_SIZE * self.chunk) as CompactKey + } + + /// Binary search for a compact key. Returns (index, found). + /// If not found, index is the insertion point. + fn bsearch(&self, id: NeedleId) -> (usize, bool) { + let ck = self.compact_key(id); + + if self.list.is_empty() { + return (0, false); + } + if ck == self.first_key { + return (0, true); + } + if ck < self.first_key { + return (0, false); + } + if ck == self.last_key { + return (self.list.len() - 1, true); + } + if ck > self.last_key { + return (self.list.len(), false); + } + + let i = self.list.partition_point(|e| e.key < ck); + if i < self.list.len() && self.list[i].key == ck { + (i, true) + } else { + (i, false) + } + } + + /// Insert or update. Returns old NeedleValue if updating. + fn set(&mut self, id: NeedleId, offset: Offset, size: Size) -> Option { + let (i, found) = self.bsearch(id); + + if found { + let old = self.list[i].to_needle_value(); + let mut offset_bytes = [0u8; OFFSET_SIZE]; + offset.to_bytes(&mut offset_bytes); + self.list[i].offset = offset_bytes; + self.list[i].size = size; + return Some(old); + } + + // Insert at sorted position + let ck = self.compact_key(id); + let mut offset_bytes = [0u8; OFFSET_SIZE]; + offset.to_bytes(&mut offset_bytes); + + let entry = CompactEntry { + key: ck, + offset: offset_bytes, + size, + }; + + // Match Go panic: don't exceed segment capacity + if self.list.len() >= SEGMENT_CHUNK_SIZE as usize { + panic!( + "attempted to write more than {} entries on CompactMapSegment", + SEGMENT_CHUNK_SIZE + ); + } + + if self.list.len() == SEGMENT_CHUNK_SIZE as usize - 1 { + // Pin capacity to exact size when maxing out + let mut new_list = Vec::with_capacity(SEGMENT_CHUNK_SIZE as usize); + new_list.extend_from_slice(&self.list[..i]); + new_list.push(entry); + new_list.extend_from_slice(&self.list[i..]); + self.list = new_list; + } else { + self.list.insert(i, entry); + } + + if ck < self.first_key { + self.first_key = ck; + } + if ck > self.last_key { + self.last_key = ck; + } + + None + } + + fn get(&self, id: NeedleId) -> Option { + let (i, found) = self.bsearch(id); + if found { + Some(self.list[i].to_needle_value()) + } else { + None + } + } + + /// Mark as deleted by negating size. Returns previous size if not already deleted. + /// Matches Go behavior: checks !IsDeleted() (i.e., size >= 0). + fn delete(&mut self, id: NeedleId) -> Option { + let (i, found) = self.bsearch(id); + if found && !self.list[i].size.is_deleted() { + let old_size = self.list[i].size; + if self.list[i].size.0 == 0 { + self.list[i].size = TOMBSTONE_FILE_SIZE; + } else { + self.list[i].size = Size(-self.list[i].size.0); + } + Some(old_size) + } else { + None + } + } +} + +/// Memory-efficient map of NeedleId -> (Offset, Size). +/// Segments NeedleIds into chunks of 50,000 and stores compact 10-byte entries +/// in sorted arrays, using only 2 bytes for the key within each segment. +pub struct CompactMap { + segments: HashMap, +} + +impl CompactMap { + pub fn new() -> Self { + CompactMap { + segments: HashMap::new(), + } + } + + fn _segment_for_key(&mut self, id: NeedleId) -> &mut Segment { + let chunk = id.0 / SEGMENT_CHUNK_SIZE; + self.segments + .entry(chunk) + .or_insert_with(|| Segment::new(chunk)) + } + + /// Insert or update. Returns old NeedleValue if updating. + pub fn set(&mut self, id: NeedleId, offset: Offset, size: Size) -> Option { + let chunk = id.0 / SEGMENT_CHUNK_SIZE; + let segment = self + .segments + .entry(chunk) + .or_insert_with(|| Segment::new(chunk)); + segment.set(id, offset, size) + } + + pub fn get(&self, id: NeedleId) -> Option { + let chunk = id.0 / SEGMENT_CHUNK_SIZE; + self.segments.get(&chunk)?.get(id) + } + + /// Mark as deleted. Returns previous size if was valid. + pub fn delete(&mut self, id: NeedleId) -> Option { + let chunk = id.0 / SEGMENT_CHUNK_SIZE; + self.segments.get_mut(&chunk)?.delete(id) + } + + /// Remove entry entirely (used during idx loading). + pub fn remove(&mut self, id: NeedleId) -> Option { + let chunk = id.0 / SEGMENT_CHUNK_SIZE; + let segment = self.segments.get_mut(&chunk)?; + let (i, found) = segment.bsearch(id); + if found { + let entry = segment.list.remove(i); + // Update first/last keys + if segment.list.is_empty() { + segment.first_key = u16::MAX; + segment.last_key = 0; + } else { + segment.first_key = segment.list[0].key; + segment.last_key = segment.list[segment.list.len() - 1].key; + } + Some(entry.to_needle_value()) + } else { + None + } + } + + /// Iterate all entries in ascending NeedleId order. + pub fn ascending_visit(&self, mut f: F) -> Result<(), E> + where + F: FnMut(NeedleId, &NeedleValue) -> Result<(), E>, + { + let mut chunks: Vec = self.segments.keys().copied().collect(); + chunks.sort_unstable(); + + for chunk in chunks { + let segment = &self.segments[&chunk]; + for entry in &segment.list { + let id = NeedleId(SEGMENT_CHUNK_SIZE * segment.chunk + entry.key as u64); + let nv = entry.to_needle_value(); + f(id, &nv)?; + } + } + Ok(()) + } +} + +#[cfg(test)] +mod tests { + use super::*; + + fn offset(v: u32) -> Offset { + Offset::from_actual_offset(v as i64 * NEEDLE_PADDING_SIZE as i64) + } + + #[test] + fn test_compact_map_basic() { + let mut m = CompactMap::new(); + + // Insert + assert!(m.set(NeedleId(1), offset(100), Size(50)).is_none()); + assert!(m.set(NeedleId(2), offset(200), Size(60)).is_none()); + + // Get + let nv = m.get(NeedleId(1)).unwrap(); + assert_eq!(nv.size, Size(50)); + + // Update returns old value + let old = m.set(NeedleId(1), offset(300), Size(70)).unwrap(); + assert_eq!(old.size, Size(50)); + + // Get updated value + let nv = m.get(NeedleId(1)).unwrap(); + assert_eq!(nv.size, Size(70)); + + // Miss + assert!(m.get(NeedleId(999)).is_none()); + } + + #[test] + fn test_compact_map_delete() { + let mut m = CompactMap::new(); + m.set(NeedleId(1), offset(100), Size(50)); + + // Delete returns old size + let old = m.delete(NeedleId(1)).unwrap(); + assert_eq!(old, Size(50)); + + // Get returns deleted (negative size) + let nv = m.get(NeedleId(1)).unwrap(); + assert!(nv.size.is_deleted()); + + // Delete again returns None (already deleted) + assert!(m.delete(NeedleId(1)).is_none()); + } + + #[test] + fn test_compact_map_zero_size_delete() { + let mut m = CompactMap::new(); + m.set(NeedleId(1), offset(100), Size(0)); + + let old = m.delete(NeedleId(1)).unwrap(); + assert_eq!(old, Size(0)); + + let nv = m.get(NeedleId(1)).unwrap(); + assert_eq!(nv.size, TOMBSTONE_FILE_SIZE); + } + + #[test] + fn test_compact_map_cross_segment() { + let mut m = CompactMap::new(); + + // Insert across multiple segments + m.set(NeedleId(1), offset(1), Size(1)); + m.set(NeedleId(50_000), offset(2), Size(2)); + m.set(NeedleId(100_000), offset(3), Size(3)); + + assert_eq!(m.get(NeedleId(1)).unwrap().size, Size(1)); + assert_eq!(m.get(NeedleId(50_000)).unwrap().size, Size(2)); + assert_eq!(m.get(NeedleId(100_000)).unwrap().size, Size(3)); + } + + #[test] + fn test_compact_map_ascending_visit() { + let mut m = CompactMap::new(); + m.set(NeedleId(100_005), offset(3), Size(3)); + m.set(NeedleId(5), offset(1), Size(1)); + m.set(NeedleId(50_005), offset(2), Size(2)); + + let mut visited = Vec::new(); + m.ascending_visit(|id, nv| { + visited.push((id, nv.size)); + Ok::<_, String>(()) + }) + .unwrap(); + + assert_eq!(visited.len(), 3); + assert_eq!(visited[0].0, NeedleId(5)); + assert_eq!(visited[1].0, NeedleId(50_005)); + assert_eq!(visited[2].0, NeedleId(100_005)); + } + + #[test] + fn test_compact_map_remove() { + let mut m = CompactMap::new(); + m.set(NeedleId(1), offset(100), Size(50)); + m.set(NeedleId(2), offset(200), Size(60)); + + let removed = m.remove(NeedleId(1)).unwrap(); + assert_eq!(removed.size, Size(50)); + + assert!(m.get(NeedleId(1)).is_none()); + assert_eq!(m.get(NeedleId(2)).unwrap().size, Size(60)); + } + + #[test] + fn test_compact_map_reverse_insert_order() { + let mut m = CompactMap::new(); + // Insert in reverse order to test sorted insert + for i in (0..100).rev() { + m.set(NeedleId(i), offset(i as u32), Size(i as i32)); + } + for i in 0..100 { + assert_eq!(m.get(NeedleId(i)).unwrap().size, Size(i as i32)); + } + } +} diff --git a/seaweed-volume/src/storage/store.rs b/seaweed-volume/src/storage/store.rs new file mode 100644 index 000000000..98ffd3d04 --- /dev/null +++ b/seaweed-volume/src/storage/store.rs @@ -0,0 +1,1297 @@ +//! Store: the top-level storage manager for a volume server. +//! +//! A Store manages multiple DiskLocations (one per configured directory). +//! It coordinates volume placement, lookup, and lifecycle operations. +//! Matches Go's storage/store.go. + +use std::io; +use std::sync::atomic::{AtomicBool, AtomicU64, Ordering}; + +use crate::config::MinFreeSpace; +use crate::pb::master_pb; +use crate::storage::disk_location::DiskLocation; +use crate::storage::erasure_coding::ec_shard::{EcVolumeShard, MAX_SHARD_COUNT}; +use crate::storage::erasure_coding::ec_volume::EcVolume; +use crate::storage::needle::needle::Needle; +use crate::storage::needle_map::NeedleMapKind; +use crate::storage::super_block::ReplicaPlacement; +use crate::storage::types::*; +use crate::storage::volume::{VifVolumeInfo, VolumeError}; + +/// Top-level storage manager containing all disk locations and their volumes. +pub struct Store { + pub locations: Vec, + pub needle_map_kind: NeedleMapKind, + preallocate: AtomicBool, + pub volume_size_limit: AtomicU64, + pub id: String, + pub ip: String, + pub port: u16, + pub grpc_port: u16, + pub public_url: String, + pub data_center: String, + pub rack: String, +} + +impl Store { + pub fn new(needle_map_kind: NeedleMapKind) -> Self { + Store { + locations: Vec::new(), + needle_map_kind, + preallocate: AtomicBool::new(false), + volume_size_limit: AtomicU64::new(0), + id: String::new(), + ip: String::new(), + port: 0, + grpc_port: 0, + public_url: String::new(), + data_center: String::new(), + rack: String::new(), + } + } + + /// Add a disk location and load existing volumes from it. + pub fn add_location( + &mut self, + directory: &str, + idx_directory: &str, + max_volume_count: i32, + disk_type: DiskType, + min_free_space: MinFreeSpace, + tags: Vec, + ) -> io::Result<()> { + let mut loc = DiskLocation::new( + directory, + idx_directory, + max_volume_count, + disk_type, + min_free_space, + tags, + )?; + loc.load_existing_volumes(self.needle_map_kind)?; + + // Check for duplicate volume IDs across existing locations + for vid in loc.volume_ids() { + if self.find_volume(vid).is_some() { + return Err(io::Error::new( + io::ErrorKind::AlreadyExists, + format!( + "volume {} already exists in another location, conflicting dir: {}", + vid, directory + ), + )); + } + } + + self.locations.push(loc); + Ok(()) + } + + /// Scan disk locations for new volume files and load them. + /// Mirrors Go's `Store.LoadNewVolumes()`. + pub fn load_new_volumes(&mut self) { + for loc in &mut self.locations { + if let Err(e) = loc.load_existing_volumes(self.needle_map_kind) { + tracing::error!("load_new_volumes error in {}: {}", loc.directory, e); + } + } + } + + // ---- Volume lookup ---- + + /// Find which location contains a volume. + pub fn find_volume(&self, vid: VolumeId) -> Option<(usize, &crate::storage::volume::Volume)> { + for (i, loc) in self.locations.iter().enumerate() { + if let Some(v) = loc.find_volume(vid) { + return Some((i, v)); + } + } + None + } + + /// Find which location contains a volume (mutable). + pub fn find_volume_mut( + &mut self, + vid: VolumeId, + ) -> Option<(usize, &mut crate::storage::volume::Volume)> { + for (i, loc) in self.locations.iter_mut().enumerate() { + if let Some(v) = loc.find_volume_mut(vid) { + return Some((i, v)); + } + } + None + } + + /// Check if a volume exists. + pub fn has_volume(&self, vid: VolumeId) -> bool { + self.find_volume(vid).is_some() + } + + // ---- Volume lifecycle ---- + + /// Find the location with fewest volumes (load-balance) of the given disk type. + /// Matches Go's FindFreeLocation: accounts for EC shards when computing free slots. + fn find_free_location(&self, disk_type: &DiskType) -> Option { + use crate::storage::erasure_coding::ec_shard::DATA_SHARDS_COUNT; + + let mut best: Option<(usize, i64)> = None; // (index, effective_free) + for (i, loc) in self.locations.iter().enumerate() { + if &loc.disk_type != disk_type { + continue; + } + // Go treats MaxVolumeCount == 0 as unlimited (hasFreeDiskLocation) + let max = loc.max_volume_count.load(Ordering::Relaxed) as i64; + let effective_free = if max == 0 { + i64::MAX // unlimited + } else { + // Go formula: currentFreeCount = (MaxVolumeCount - VolumesLen()) * DataShardsCount - EcShardCount() + // currentFreeCount /= DataShardsCount + let free_count = (max - loc.volumes_len() as i64) * DATA_SHARDS_COUNT as i64 + - loc.ec_shard_count() as i64; + free_count / DATA_SHARDS_COUNT as i64 + }; + if effective_free <= 0 { + continue; + } + if loc.is_disk_space_low.load(Ordering::Relaxed) { + continue; + } + if best.is_none() || effective_free > best.unwrap().1 { + best = Some((i, effective_free)); + } + } + best.map(|(i, _)| i) + } + + /// Find a free location matching a predicate. + /// Matches Go's Store.FindFreeLocation: picks the matching location with the + /// most remaining volume capacity, while skipping low-disk locations. + pub fn find_free_location_predicate(&self, pred: F) -> Option + where + F: Fn(&DiskLocation) -> bool, + { + use crate::storage::erasure_coding::ec_shard::DATA_SHARDS_COUNT; + + let mut best: Option<(usize, i64)> = None; + for (i, loc) in self.locations.iter().enumerate() { + if !pred(loc) || loc.is_disk_space_low.load(Ordering::Relaxed) { + continue; + } + + let max = loc.max_volume_count.load(Ordering::Relaxed) as i64; + let effective_free = if max == 0 { + i64::MAX + } else { + let free_count = (max - loc.volumes_len() as i64) * DATA_SHARDS_COUNT as i64 + - loc.ec_shard_count() as i64; + free_count / DATA_SHARDS_COUNT as i64 + }; + if effective_free <= 0 { + continue; + } + + if best.is_none() || effective_free > best.unwrap().1 { + best = Some((i, effective_free)); + } + } + best.map(|(i, _)| i) + } + + /// Create a new volume, placing it on the location with the most free space. + pub fn add_volume( + &mut self, + vid: VolumeId, + collection: &str, + replica_placement: Option, + ttl: Option, + preallocate: u64, + disk_type: DiskType, + version: Version, + ) -> Result<(), VolumeError> { + if self.find_volume(vid).is_some() { + return Err(VolumeError::AlreadyExists); + } + let loc_idx = self.find_free_location(&disk_type).ok_or_else(|| { + VolumeError::Io(io::Error::new( + io::ErrorKind::Other, + format!("no free location for disk type {:?}", disk_type), + )) + })?; + + self.locations[loc_idx].create_volume( + vid, + collection, + self.needle_map_kind, + replica_placement, + ttl, + preallocate, + version, + ) + } + + /// Delete a volume from any location. + pub fn delete_volume(&mut self, vid: VolumeId, only_empty: bool) -> Result<(), VolumeError> { + for loc in &mut self.locations { + if loc.find_volume(vid).is_some() { + return loc.delete_volume(vid, only_empty); + } + } + Err(VolumeError::NotFound) + } + + /// Unload (unmount) a volume without deleting its files. + pub fn unmount_volume(&mut self, vid: VolumeId) -> bool { + for loc in &mut self.locations { + if loc.unload_volume(vid).is_some() { + return true; + } + } + false + } + + /// Mount a volume from an existing .dat file. + pub fn mount_volume( + &mut self, + vid: VolumeId, + collection: &str, + disk_type: DiskType, + ) -> Result<(), VolumeError> { + if self.find_volume(vid).is_some() { + return Err(VolumeError::AlreadyExists); + } + // Find the location where the .dat file exists + for loc in &mut self.locations { + if &loc.disk_type != &disk_type { + continue; + } + let base = crate::storage::volume::volume_file_name(&loc.directory, collection, vid); + let dat_path = format!("{}.dat", base); + let vif_path = format!("{}.vif", base); + if std::path::Path::new(&dat_path).exists() || std::path::Path::new(&vif_path).exists() + { + return loc.create_volume( + vid, + collection, + self.needle_map_kind, + None, + None, + 0, + Version::current(), + ); + } + } + Err(VolumeError::Io(io::Error::new( + io::ErrorKind::NotFound, + format!("volume {} not found on disk", vid), + ))) + } + + /// Mount a volume by id only (Go's MountVolume behavior). + /// Scans all locations for a matching .dat file and loads with its collection prefix. + pub fn mount_volume_by_id(&mut self, vid: VolumeId) -> Result<(), VolumeError> { + if self.find_volume(vid).is_some() { + return Err(VolumeError::AlreadyExists); + } + if let Some((loc_idx, _base_path, collection)) = self.find_volume_file_base(vid) { + let loc = &mut self.locations[loc_idx]; + return loc.create_volume( + vid, + &collection, + self.needle_map_kind, + None, + None, + 0, + Version::current(), + ); + } + Err(VolumeError::Io(io::Error::new( + io::ErrorKind::NotFound, + format!("volume {} not found on disk", vid), + ))) + } + + fn find_volume_file_base(&self, vid: VolumeId) -> Option<(usize, String, String)> { + for (loc_idx, loc) in self.locations.iter().enumerate() { + if let Ok(entries) = std::fs::read_dir(&loc.directory) { + for entry in entries.flatten() { + let name = entry.file_name(); + let name = name.to_string_lossy(); + if let Some((collection, file_vid)) = parse_volume_filename(&name) { + if file_vid == vid { + let base = strip_volume_suffix(&name)?; + let base_path = format!("{}/{}", loc.directory, base); + return Some((loc_idx, base_path, collection)); + } + } + } + } + } + None + } + + /// Configure a volume's replica placement on disk. + /// The volume must already be unmounted. This opens the .dat file directly, + /// modifies the replica_placement byte (offset 1), and writes it back. + pub fn configure_volume(&self, vid: VolumeId, rp: ReplicaPlacement) -> Result<(), VolumeError> { + let (_, base_path, _) = self.find_volume_file_base(vid).ok_or_else(|| { + VolumeError::Io(io::Error::new( + io::ErrorKind::NotFound, + format!("volume {} not found on disk", vid), + )) + })?; + let vif_path = format!("{}.vif", base_path); + let mut vif = load_vif_volume_info(&vif_path)?; + vif.replication = rp.to_string(); + save_vif_volume_info(&vif_path, &vif)?; + Ok(()) + } + + // ---- Read / Write / Delete ---- + + /// Read a needle from a volume. + pub fn read_volume_needle(&self, vid: VolumeId, n: &mut Needle) -> Result { + let (_, vol) = self.find_volume(vid).ok_or(VolumeError::NotFound)?; + vol.read_needle(n) + } + + /// Read a needle from a volume, optionally reading deleted needles. + pub fn read_volume_needle_opt( + &self, + vid: VolumeId, + n: &mut Needle, + read_deleted: bool, + ) -> Result { + let (_, vol) = self.find_volume(vid).ok_or(VolumeError::NotFound)?; + vol.read_needle_opt(n, read_deleted) + } + + /// Read needle metadata and return streaming info for large file reads. + pub fn read_volume_needle_stream_info( + &self, + vid: VolumeId, + n: &mut Needle, + read_deleted: bool, + ) -> Result { + let (_, vol) = self.find_volume(vid).ok_or(VolumeError::NotFound)?; + vol.read_needle_stream_info(n, read_deleted) + } + + /// Re-lookup a needle's data-file offset after compaction may have moved it. + /// Returns `(new_data_file_offset, current_compaction_revision)`. + pub fn re_lookup_needle_data_offset( + &self, + vid: VolumeId, + needle_id: NeedleId, + ) -> Result<(u64, u16), VolumeError> { + let (_, vol) = self.find_volume(vid).ok_or(VolumeError::NotFound)?; + vol.re_lookup_needle_data_offset(needle_id) + } + + /// Write a needle to a volume. + pub fn write_volume_needle( + &mut self, + vid: VolumeId, + n: &mut Needle, + ) -> Result<(u64, Size, bool), VolumeError> { + // Check disk space on the location containing this volume. + // We do this before the mutable borrow to avoid borrow conflicts. + let loc_idx = self + .find_volume(vid) + .map(|(i, _)| i) + .ok_or(VolumeError::NotFound)?; + if self.locations[loc_idx] + .is_disk_space_low + .load(Ordering::Relaxed) + { + return Err(VolumeError::ReadOnly); + } + + let (_, vol) = self.find_volume_mut(vid).ok_or(VolumeError::NotFound)?; + vol.write_needle(n, true) + } + + /// Delete a needle from a volume. + pub fn delete_volume_needle( + &mut self, + vid: VolumeId, + n: &mut Needle, + ) -> Result { + // Match Go's DeleteVolumeNeedle: check noWriteOrDelete before proceeding. + let (_, vol) = self.find_volume(vid).ok_or(VolumeError::NotFound)?; + if vol.is_no_write_or_delete() { + return Err(VolumeError::ReadOnly); + } + + let (_, vol) = self.find_volume_mut(vid).ok_or(VolumeError::NotFound)?; + vol.delete_needle(n) + } + + // ---- Collection operations ---- + + /// Delete all volumes in a collection. + pub fn delete_collection(&mut self, collection: &str) -> Result<(), String> { + for loc in &mut self.locations { + loc.delete_collection(collection) + .map_err(|e| format!("delete collection {}: {}", collection, e))?; + } + crate::metrics::delete_collection_metrics(collection); + Ok(()) + } + + // ---- Metrics ---- + + /// Total volume count across all locations. + pub fn total_volume_count(&self) -> usize { + self.locations.iter().map(|loc| loc.volumes_len()).sum() + } + + pub fn set_preallocate(&self, preallocate: bool) { + self.preallocate.store(preallocate, Ordering::Relaxed); + } + + pub fn get_preallocate(&self) -> bool { + self.preallocate.load(Ordering::Relaxed) + } + + /// Total max volumes across all locations. + pub fn max_volume_count(&self) -> i32 { + self.locations + .iter() + .map(|loc| loc.max_volume_count.load(Ordering::Relaxed)) + .sum() + } + + /// Total EC shard count across all locations. + pub fn ec_shard_count(&self) -> usize { + self.locations.iter().map(|loc| loc.ec_shard_count()).sum() + } + + /// Recalculate max volume counts for locations with original_max_volume_count == 0. + /// Returns true if any max changed (caller should re-send heartbeat). + pub fn maybe_adjust_volume_max(&self) -> bool { + let volume_size_limit = self.volume_size_limit.load(Ordering::Relaxed); + if volume_size_limit == 0 { + return false; + } + + let mut has_changes = false; + let mut new_max_total: i32 = 0; + + for loc in &self.locations { + if loc.original_max_volume_count == 0 { + let current = loc.max_volume_count.load(Ordering::Relaxed); + let (_, free) = super::disk_location::get_disk_stats(&loc.directory); + + let unused_space = if self.get_preallocate() { + 0 + } else { + loc.unused_space(volume_size_limit) + }; + let unclaimed = (free as i64) - (unused_space as i64); + + let vol_count = loc.volumes_len() as i32; + let loc_ec_shards = loc.ec_shard_count(); + let ec_equivalent = ((loc_ec_shards + + crate::storage::erasure_coding::ec_shard::DATA_SHARDS_COUNT) + / crate::storage::erasure_coding::ec_shard::DATA_SHARDS_COUNT) + as i32; + let mut max_count = vol_count + ec_equivalent; + + if unclaimed > volume_size_limit as i64 { + max_count += (unclaimed as u64 / volume_size_limit) as i32 - 1; + } + + loc.max_volume_count.store(max_count, Ordering::Relaxed); + new_max_total += max_count; + has_changes = has_changes || current != max_count; + } else { + new_max_total += loc.original_max_volume_count; + } + } + + crate::metrics::MAX_VOLUMES.set(new_max_total as i64); + has_changes + } + + /// Free volume slots across all locations. + pub fn free_volume_count(&self) -> i32 { + self.locations + .iter() + .map(|loc| loc.free_volume_count()) + .sum() + } + + /// All volume IDs across all locations. + pub fn all_volume_ids(&self) -> Vec { + let mut ids: Vec = self + .locations + .iter() + .flat_map(|loc| loc.volume_ids()) + .collect(); + ids.sort(); + ids.dedup(); + ids + } + + // ---- EC volume operations ---- + + /// Mount EC shards for a volume (batch). + pub fn mount_ec_shards( + &mut self, + vid: VolumeId, + collection: &str, + shard_ids: &[u32], + ) -> Result<(), VolumeError> { + // Find the location where the EC files live + let loc_idx = self.find_ec_location(vid, collection).ok_or_else(|| { + VolumeError::Io(io::Error::new( + io::ErrorKind::NotFound, + format!("ec volume {} shards not found on disk", vid), + )) + })?; + + self.locations[loc_idx].mount_ec_shards(vid, collection, shard_ids) + } + + /// Mount a single EC shard, searching all locations for the shard file. + /// Matches Go's Store.MountEcShards which mounts one shard at a time. + pub fn mount_ec_shard( + &mut self, + vid: VolumeId, + collection: &str, + shard_id: u32, + ) -> Result<(), VolumeError> { + for loc in &mut self.locations { + // Check if the shard file exists on this location + let shard = EcVolumeShard::new(&loc.directory, collection, vid, shard_id as u8); + if std::path::Path::new(&shard.file_name()).exists() { + loc.mount_ec_shards(vid, collection, &[shard_id])?; + return Ok(()); + } + } + Err(VolumeError::Io(io::Error::new( + io::ErrorKind::NotFound, + format!("MountEcShards {}.{} not found on disk", vid, shard_id), + ))) + } + + /// Unmount EC shards for a volume (batch). + pub fn unmount_ec_shards(&mut self, vid: VolumeId, shard_ids: &[u32]) { + for loc in &mut self.locations { + if loc.has_ec_volume(vid) { + loc.unmount_ec_shards(vid, shard_ids); + return; + } + } + } + + /// Unmount a single EC shard, searching all locations. + /// Matches Go's Store.UnmountEcShards which unmounts one shard at a time. + pub fn unmount_ec_shard(&mut self, vid: VolumeId, shard_id: u32) -> Result<(), VolumeError> { + for loc in &mut self.locations { + if loc.has_ec_volume(vid) { + loc.unmount_ec_shards(vid, &[shard_id]); + return Ok(()); + } + } + // Go returns nil if shard not found (no error) + Ok(()) + } + + /// Find an EC volume across all locations. + pub fn find_ec_volume(&self, vid: VolumeId) -> Option<&EcVolume> { + for loc in &self.locations { + if let Some(ecv) = loc.find_ec_volume(vid) { + return Some(ecv); + } + } + None + } + + /// Find an EC volume across all locations (mutable). + pub fn find_ec_volume_mut(&mut self, vid: VolumeId) -> Option<&mut EcVolume> { + for loc in &mut self.locations { + if let Some(ecv) = loc.find_ec_volume_mut(vid) { + return Some(ecv); + } + } + None + } + + /// Check if any location has an EC volume. + pub fn has_ec_volume(&self, vid: VolumeId) -> bool { + self.locations.iter().any(|loc| loc.has_ec_volume(vid)) + } + + pub fn delete_expired_ec_volumes( + &mut self, + ) -> ( + Vec, + Vec, + ) { + let mut ec_shards = Vec::new(); + let mut deleted = Vec::new(); + + for (disk_id, loc) in self.locations.iter_mut().enumerate() { + let mut expired_vids = Vec::new(); + for (vid, ec_vol) in loc.ec_volumes() { + if ec_vol.is_time_to_destroy() { + expired_vids.push(*vid); + } else { + ec_shards + .extend(ec_vol.to_volume_ec_shard_information_messages(disk_id as u32)); + } + } + + for vid in expired_vids { + let messages = loc + .find_ec_volume(vid) + .map(|ec_vol| ec_vol.to_volume_ec_shard_information_messages(disk_id as u32)) + .unwrap_or_default(); + if let Some(mut ec_vol) = loc.remove_ec_volume(vid) { + for _ in 0..ec_vol.shard_count() { + crate::metrics::VOLUME_GAUGE + .with_label_values(&[&ec_vol.collection, "ec_shards"]) + .dec(); + } + ec_vol.destroy(); + deleted.extend(messages); + } else { + ec_shards.extend(messages); + } + } + } + + (ec_shards, deleted) + } + + /// Remove an EC volume from whichever location has it. + pub fn remove_ec_volume(&mut self, vid: VolumeId) -> Option { + for loc in &mut self.locations { + if let Some(ecv) = loc.remove_ec_volume(vid) { + return Some(ecv); + } + } + None + } + + /// Find the location index containing EC files for a volume. + pub fn find_ec_location(&self, vid: VolumeId, collection: &str) -> Option { + for (i, loc) in self.locations.iter().enumerate() { + let base = crate::storage::volume::volume_file_name(&loc.directory, collection, vid); + let ecx_path = format!("{}.ecx", base); + if std::path::Path::new(&ecx_path).exists() { + return Some(i); + } + } + None + } + + /// Delete EC shard files from disk. + pub fn delete_ec_shards(&mut self, vid: VolumeId, collection: &str, shard_ids: &[u32]) { + // Delete shard files from disk + for loc in &self.locations { + for &shard_id in shard_ids { + let shard = EcVolumeShard::new(&loc.directory, collection, vid, shard_id as u8); + let path = shard.file_name(); + let _ = std::fs::remove_file(&path); + } + } + + // Also unmount if mounted + self.unmount_ec_shards(vid, shard_ids); + + // If all shards are gone, remove .ecx and .ecj files from both idx and data dirs + let all_gone = self.check_all_ec_shards_deleted(vid, collection); + if all_gone { + for loc in &self.locations { + let idx_base = + crate::storage::volume::volume_file_name(&loc.idx_directory, collection, vid); + let _ = std::fs::remove_file(format!("{}.ecx", idx_base)); + let _ = std::fs::remove_file(format!("{}.ecj", idx_base)); + // Also try data directory in case .ecx/.ecj were created before -dir.idx + if loc.idx_directory != loc.directory { + let data_base = + crate::storage::volume::volume_file_name(&loc.directory, collection, vid); + let _ = std::fs::remove_file(format!("{}.ecx", data_base)); + let _ = std::fs::remove_file(format!("{}.ecj", data_base)); + } + } + } + } + + /// Check if all EC shard files have been deleted for a volume. + /// Uses MAX_SHARD_COUNT to support non-standard EC configurations. + fn check_all_ec_shards_deleted(&self, vid: VolumeId, collection: &str) -> bool { + for loc in &self.locations { + for shard_id in 0..MAX_SHARD_COUNT as u8 { + let shard = EcVolumeShard::new(&loc.directory, collection, vid, shard_id); + if std::path::Path::new(&shard.file_name()).exists() { + return false; + } + } + } + true + } + + /// Find the directory containing EC files for a volume. + pub fn find_ec_dir(&self, vid: VolumeId, collection: &str) -> Option { + for loc in &self.locations { + // Check idx directory first + let idx_base = + crate::storage::volume::volume_file_name(&loc.idx_directory, collection, vid); + if std::path::Path::new(&format!("{}.ecx", idx_base)).exists() { + return Some(loc.directory.clone()); + } + // Fall back to data directory if .ecx was created before -dir.idx was configured + if loc.idx_directory != loc.directory { + let data_base = + crate::storage::volume::volume_file_name(&loc.directory, collection, vid); + if std::path::Path::new(&format!("{}.ecx", data_base)).exists() { + return Some(loc.directory.clone()); + } + } + } + None + } + + /// Find the directory containing a specific EC shard file. + pub fn find_ec_shard_dir( + &self, + vid: VolumeId, + collection: &str, + shard_id: u8, + ) -> Option { + for loc in &self.locations { + let shard = EcVolumeShard::new(&loc.directory, collection, vid, shard_id); + if std::path::Path::new(&shard.file_name()).exists() { + return Some(loc.directory.clone()); + } + } + None + } + + // ---- Vacuum / Compaction ---- + + /// Check the garbage level of a volume. + pub fn check_compact_volume(&self, vid: VolumeId) -> Result { + if let Some((_, v)) = self.find_volume(vid) { + Ok(v.garbage_level()) + } else { + Err(format!( + "volume id {} is not found during check compact", + vid.0 + )) + } + } + + /// Compact a volume by rewriting only live needles. + pub fn compact_volume( + &mut self, + vid: VolumeId, + preallocate: u64, + max_bytes_per_second: i64, + progress_fn: F, + ) -> Result<(), String> + where + F: Fn(i64) -> bool, + { + let loc_idx = self + .find_volume(vid) + .map(|(i, _)| i) + .ok_or_else(|| format!("volume id {} is not found during compact", vid.0))?; + + let dir = self.locations[loc_idx].directory.clone(); + let (_, free) = crate::storage::disk_location::get_disk_stats(&dir); + + // Compute required space: use the larger of preallocate or estimated volume size + // matching Go's CompactVolume space check + let space_needed = { + let (_, v) = self.find_volume(vid).unwrap(); + let estimated = v.dat_file_size().unwrap_or(0) + v.idx_file_size(); + std::cmp::max(preallocate, estimated) + }; + + if free < space_needed { + return Err(format!( + "not enough free space to compact volume {}. Required: {}, Free: {}", + vid.0, space_needed, free + )); + } + + if let Some((_, v)) = self.find_volume_mut(vid) { + v.compact_by_index(preallocate, max_bytes_per_second, progress_fn) + .map_err(|e| format!("compact volume {}: {}", vid.0, e)) + } else { + Err(format!("volume id {} is not found during compact", vid.0)) + } + } + + /// Commit a completed compaction: swap files and reload. + pub fn commit_compact_volume(&mut self, vid: VolumeId) -> Result<(bool, u64), String> { + if let Some((_, v)) = self.find_volume_mut(vid) { + let is_read_only = v.is_read_only(); + v.commit_compact() + .map_err(|e| format!("commit compact volume {}: {}", vid.0, e))?; + let volume_size = v.dat_file_size().unwrap_or(0); + Ok((is_read_only, volume_size)) + } else { + Err(format!( + "volume id {} is not found during commit compact", + vid.0 + )) + } + } + + /// Clean up leftover compaction files. + pub fn cleanup_compact_volume(&mut self, vid: VolumeId) -> Result<(), String> { + if let Some((_, v)) = self.find_volume_mut(vid) { + v.cleanup_compact() + .map_err(|e| format!("cleanup volume {}: {}", vid.0, e)) + } else { + Err(format!( + "volume id {} is not found during cleaning up", + vid.0 + )) + } + } + + /// Close all locations and their volumes. + pub fn close(&mut self) { + for loc in &mut self.locations { + loc.close(); + } + } +} + +/// Parse a volume filename like "collection_42.dat" or "42.dat" into (collection, VolumeId). +fn parse_volume_filename(filename: &str) -> Option<(String, VolumeId)> { + let stem = strip_volume_suffix(filename)?; + if let Some(pos) = stem.rfind('_') { + let collection = &stem[..pos]; + let id_str = &stem[pos + 1..]; + let id: u32 = id_str.parse().ok()?; + Some((collection.to_string(), VolumeId(id))) + } else { + let id: u32 = stem.parse().ok()?; + Some((String::new(), VolumeId(id))) + } +} + +fn strip_volume_suffix(filename: &str) -> Option<&str> { + filename + .strip_suffix(".dat") + .or_else(|| filename.strip_suffix(".vif")) + .or_else(|| filename.strip_suffix(".idx")) +} + +fn load_vif_volume_info(path: &str) -> Result { + let content = match std::fs::read_to_string(path) { + Ok(c) => c, + Err(e) if e.kind() == io::ErrorKind::NotFound => return Ok(VifVolumeInfo::default()), + Err(e) => return Err(VolumeError::Io(e)), + }; + if content.trim().is_empty() { + return Ok(VifVolumeInfo::default()); + } + if let Ok(vif) = serde_json::from_str::(&content) { + return Ok(vif); + } + #[derive(serde::Deserialize)] + struct LegacyVolumeInfo { + read_only: bool, + } + if let Ok(legacy) = serde_json::from_str::(&content) { + let mut vif = VifVolumeInfo::default(); + vif.read_only = legacy.read_only; + return Ok(vif); + } + Err(VolumeError::Io(io::Error::new( + io::ErrorKind::InvalidData, + format!("invalid volume info file {}", path), + ))) +} + +fn save_vif_volume_info(path: &str, info: &VifVolumeInfo) -> Result<(), VolumeError> { + let content = serde_json::to_string_pretty(info) + .map_err(|e| VolumeError::Io(io::Error::new(io::ErrorKind::Other, e.to_string())))?; + std::fs::write(path, content)?; + Ok(()) +} + +// ============================================================================ +// Tests +// ============================================================================ + +#[cfg(test)] +mod tests { + use super::*; + use crate::storage::needle::needle::Needle; + use tempfile::TempDir; + + fn make_test_store(dirs: &[&str]) -> Store { + let mut store = Store::new(NeedleMapKind::InMemory); + for dir in dirs { + store + .add_location( + dir, + dir, + 10, + DiskType::HardDrive, + MinFreeSpace::Percent(1.0), + Vec::new(), + ) + .unwrap(); + } + store + } + + #[test] + fn test_store_add_location() { + let tmp = TempDir::new().unwrap(); + let dir = tmp.path().to_str().unwrap(); + + let mut store = Store::new(NeedleMapKind::InMemory); + store + .add_location( + dir, + dir, + 10, + DiskType::HardDrive, + MinFreeSpace::Percent(1.0), + Vec::new(), + ) + .unwrap(); + assert_eq!(store.locations.len(), 1); + assert_eq!(store.max_volume_count(), 10); + } + + #[test] + fn test_store_add_volume() { + let tmp = TempDir::new().unwrap(); + let dir = tmp.path().to_str().unwrap(); + let mut store = make_test_store(&[dir]); + + store + .add_volume( + VolumeId(1), + "", + None, + None, + 0, + DiskType::HardDrive, + Version::current(), + ) + .unwrap(); + assert!(store.has_volume(VolumeId(1))); + assert!(!store.has_volume(VolumeId(2))); + assert_eq!(store.total_volume_count(), 1); + } + + #[test] + fn test_store_read_write_delete() { + let tmp = TempDir::new().unwrap(); + let dir = tmp.path().to_str().unwrap(); + let mut store = make_test_store(&[dir]); + store + .add_volume( + VolumeId(1), + "", + None, + None, + 0, + DiskType::HardDrive, + Version::current(), + ) + .unwrap(); + + // Write + let mut n = Needle { + id: NeedleId(1), + cookie: Cookie(0xaa), + data: b"hello store".to_vec(), + data_size: 11, + ..Needle::default() + }; + let (offset, _size, unchanged) = store.write_volume_needle(VolumeId(1), &mut n).unwrap(); + assert!(!unchanged); + assert!(offset > 0); + + // Read + let mut read_n = Needle { + id: NeedleId(1), + ..Needle::default() + }; + let count = store.read_volume_needle(VolumeId(1), &mut read_n).unwrap(); + assert_eq!(count, 11); + assert_eq!(read_n.data, b"hello store"); + + // Delete + let mut del_n = Needle { + id: NeedleId(1), + cookie: Cookie(0xaa), + ..Needle::default() + }; + let deleted = store.delete_volume_needle(VolumeId(1), &mut del_n).unwrap(); + assert!(deleted.0 > 0); + } + + #[test] + fn test_store_multi_location() { + let tmp1 = TempDir::new().unwrap(); + let tmp2 = TempDir::new().unwrap(); + let dir1 = tmp1.path().to_str().unwrap(); + let dir2 = tmp2.path().to_str().unwrap(); + + let mut store = Store::new(NeedleMapKind::InMemory); + store + .add_location( + dir1, + dir1, + 5, + DiskType::HardDrive, + MinFreeSpace::Percent(1.0), + Vec::new(), + ) + .unwrap(); + store + .add_location( + dir2, + dir2, + 5, + DiskType::HardDrive, + MinFreeSpace::Percent(1.0), + Vec::new(), + ) + .unwrap(); + assert_eq!(store.max_volume_count(), 10); + + // Add volumes — should go to location with fewest volumes + store + .add_volume( + VolumeId(1), + "", + None, + None, + 0, + DiskType::HardDrive, + Version::current(), + ) + .unwrap(); + store + .add_volume( + VolumeId(2), + "", + None, + None, + 0, + DiskType::HardDrive, + Version::current(), + ) + .unwrap(); + + assert_eq!(store.total_volume_count(), 2); + // Both locations should have 1 volume each (load-balanced) + assert_eq!(store.locations[0].volumes_len(), 1); + assert_eq!(store.locations[1].volumes_len(), 1); + } + + #[test] + fn test_store_delete_collection() { + let tmp = TempDir::new().unwrap(); + let dir = tmp.path().to_str().unwrap(); + let mut store = make_test_store(&[dir]); + + store + .add_volume( + VolumeId(1), + "pics", + None, + None, + 0, + DiskType::HardDrive, + Version::current(), + ) + .unwrap(); + store + .add_volume( + VolumeId(2), + "pics", + None, + None, + 0, + DiskType::HardDrive, + Version::current(), + ) + .unwrap(); + store + .add_volume( + VolumeId(3), + "docs", + None, + None, + 0, + DiskType::HardDrive, + Version::current(), + ) + .unwrap(); + assert_eq!(store.total_volume_count(), 3); + + store.delete_collection("pics").unwrap(); + assert_eq!(store.total_volume_count(), 1); + assert!(store.has_volume(VolumeId(3))); + } + + #[test] + fn test_maybe_adjust_volume_max_honors_preallocate_flag() { + let tmp = TempDir::new().unwrap(); + let dir = tmp.path().to_str().unwrap(); + + let mut store = Store::new(NeedleMapKind::InMemory); + store + .add_location( + dir, + dir, + 2, + DiskType::HardDrive, + MinFreeSpace::Percent(1.0), + Vec::new(), + ) + .unwrap(); + store.volume_size_limit.store(1024, Ordering::Relaxed); + store + .add_volume( + VolumeId(61), + "preallocate_case", + None, + None, + 0, + DiskType::HardDrive, + Version::current(), + ) + .unwrap(); + store + .add_volume( + VolumeId(62), + "preallocate_case", + None, + None, + 0, + DiskType::HardDrive, + Version::current(), + ) + .unwrap(); + for vid in [VolumeId(61), VolumeId(62)] { + let dat_path = store.find_volume(vid).unwrap().1.dat_path(); + std::fs::OpenOptions::new() + .write(true) + .open(dat_path) + .unwrap() + .set_len((crate::storage::super_block::SUPER_BLOCK_SIZE + 1) as u64) + .unwrap(); + } + store.locations[0].original_max_volume_count = 0; + store.locations[0] + .max_volume_count + .store(0, Ordering::Relaxed); + + store.set_preallocate(false); + assert!(store.maybe_adjust_volume_max()); + let without_preallocate = store.locations[0].max_volume_count.load(Ordering::Relaxed); + + store.set_preallocate(true); + assert!(store.maybe_adjust_volume_max()); + let with_preallocate = store.locations[0].max_volume_count.load(Ordering::Relaxed); + + assert!(with_preallocate > without_preallocate); + } + + #[test] + fn test_find_free_location_predicate_prefers_more_capacity_and_skips_low_disk() { + let tmp1 = TempDir::new().unwrap(); + let dir1 = tmp1.path().to_str().unwrap(); + let tmp2 = TempDir::new().unwrap(); + let dir2 = tmp2.path().to_str().unwrap(); + + let mut store = Store::new(NeedleMapKind::InMemory); + store + .add_location( + dir1, + dir1, + 3, + DiskType::HardDrive, + MinFreeSpace::Percent(0.0), + Vec::new(), + ) + .unwrap(); + store + .add_location( + dir2, + dir2, + 5, + DiskType::HardDrive, + MinFreeSpace::Percent(0.0), + Vec::new(), + ) + .unwrap(); + + store + .add_volume( + VolumeId(71), + "find_free_location_case", + None, + None, + 0, + DiskType::HardDrive, + Version::current(), + ) + .unwrap(); + + let selected = + store.find_free_location_predicate(|loc| loc.disk_type == DiskType::HardDrive); + assert_eq!(selected, Some(1)); + + store.locations[1] + .is_disk_space_low + .store(true, Ordering::Relaxed); + + let selected = + store.find_free_location_predicate(|loc| loc.disk_type == DiskType::HardDrive); + assert_eq!(selected, Some(0)); + } + + #[test] + fn test_delete_expired_ec_volumes_removes_expired_entries() { + let tmp = TempDir::new().unwrap(); + let dir = tmp.path().to_str().unwrap(); + let mut store = make_test_store(&[dir]); + + std::fs::write(format!("{}/expired_ec_case_9.ec00", dir), b"expired").unwrap(); + store.locations[0] + .mount_ec_shards(VolumeId(9), "expired_ec_case", &[0]) + .unwrap(); + store.find_ec_volume_mut(VolumeId(9)).unwrap().expire_at_sec = 1; + + let (ec_shards, deleted) = store.delete_expired_ec_volumes(); + + assert!(ec_shards.is_empty()); + assert_eq!(deleted.len(), 1); + assert_eq!(deleted[0].id, 9); + assert!(!store.has_ec_volume(VolumeId(9))); + assert!(!std::path::Path::new(&format!("{}/expired_ec_case_9.ec00", dir)).exists()); + } + + #[test] + fn test_store_volume_not_found() { + let tmp = TempDir::new().unwrap(); + let dir = tmp.path().to_str().unwrap(); + let store = make_test_store(&[dir]); + + let mut n = Needle { + id: NeedleId(1), + ..Needle::default() + }; + let err = store.read_volume_needle(VolumeId(99), &mut n); + assert!(matches!(err, Err(VolumeError::NotFound))); + } +} diff --git a/seaweed-volume/src/storage/super_block.rs b/seaweed-volume/src/storage/super_block.rs new file mode 100644 index 000000000..033d1a929 --- /dev/null +++ b/seaweed-volume/src/storage/super_block.rs @@ -0,0 +1,289 @@ +//! SuperBlock: the 8-byte (+ optional extra) header at the start of every .dat file. +//! +//! Byte layout: +//! [0] Version +//! [1] ReplicaPlacement byte +//! [2..4] TTL (2 bytes) +//! [4..6] CompactionRevision (u16 big-endian) +//! [6..8] ExtraSize (u16 big-endian) +//! [8..] Extra data (protobuf, ExtraSize bytes) — only for Version 2/3 + +use crate::storage::needle::ttl::TTL; +use crate::storage::types::Version; + +pub const SUPER_BLOCK_SIZE: usize = 8; + +/// SuperBlock metadata at the start of a volume .dat file. +#[derive(Debug, Clone)] +pub struct SuperBlock { + pub version: Version, + pub replica_placement: ReplicaPlacement, + pub ttl: TTL, + pub compaction_revision: u16, + pub extra_size: u16, + pub extra_data: Vec, // raw protobuf bytes (SuperBlockExtra) +} + +impl SuperBlock { + /// Total block size on disk (base 8 + extra). + pub fn block_size(&self) -> usize { + match self.version.0 { + 2 | 3 => SUPER_BLOCK_SIZE + self.extra_size as usize, + _ => SUPER_BLOCK_SIZE, + } + } + + /// Serialize to bytes. + pub fn to_bytes(&self) -> Vec { + let mut header = vec![0u8; SUPER_BLOCK_SIZE]; + header[0] = self.version.0; + header[1] = self.replica_placement.to_byte(); + self.ttl.to_bytes(&mut header[2..4]); + header[4..6].copy_from_slice(&self.compaction_revision.to_be_bytes()); + + if !self.extra_data.is_empty() { + // Go checks extraSize > 256*256-2 and calls glog.Fatalf; guard against u16 overflow. + assert!( + self.extra_data.len() <= 65534, + "super block extra data too large: {} > 65534", + self.extra_data.len() + ); + let extra_size = self.extra_data.len() as u16; + header[6..8].copy_from_slice(&extra_size.to_be_bytes()); + header.extend_from_slice(&self.extra_data); + } + + header + } + + /// Parse from bytes (must be at least SUPER_BLOCK_SIZE bytes). + pub fn from_bytes(bytes: &[u8]) -> Result { + if bytes.len() < SUPER_BLOCK_SIZE { + return Err(SuperBlockError::TooShort(bytes.len())); + } + + let version = Version(bytes[0]); + let replica_placement = ReplicaPlacement::from_byte(bytes[1])?; + let ttl = TTL::from_bytes(&bytes[2..4]); + let compaction_revision = u16::from_be_bytes([bytes[4], bytes[5]]); + let extra_size = u16::from_be_bytes([bytes[6], bytes[7]]); + + let extra_data = if extra_size > 0 && bytes.len() >= SUPER_BLOCK_SIZE + extra_size as usize + { + bytes[SUPER_BLOCK_SIZE..SUPER_BLOCK_SIZE + extra_size as usize].to_vec() + } else { + vec![] + }; + + Ok(SuperBlock { + version, + replica_placement, + ttl, + compaction_revision, + extra_size, + extra_data, + }) + } + + pub fn initialized(&self) -> bool { + true // ReplicaPlacement and TTL are always valid after construction + } +} + +impl Default for SuperBlock { + fn default() -> Self { + SuperBlock { + version: Version::current(), + replica_placement: ReplicaPlacement::default(), + ttl: TTL::EMPTY, + compaction_revision: 0, + extra_size: 0, + extra_data: vec![], + } + } +} + +// ============================================================================ +// ReplicaPlacement +// ============================================================================ + +/// Replication strategy encoded as a single byte. +/// +/// Byte value = DiffDataCenterCount * 100 + DiffRackCount * 10 + SameRackCount +/// +/// Examples: +/// "000" → no replication (1 copy total) +/// "010" → 1 copy in different rack (2 copies total) +/// "100" → 1 copy in different datacenter +/// "200" → 2 copies in different datacenters +#[derive(Debug, Clone, Copy, PartialEq, Eq, Default)] +pub struct ReplicaPlacement { + pub same_rack_count: u8, + pub diff_rack_count: u8, + pub diff_data_center_count: u8, +} + +impl ReplicaPlacement { + /// Parse from a string like "000", "010", "100". + /// Accepts 0-3 character strings, padding with leading zeros to match Go behavior. + /// E.g. "" -> "000", "1" -> "001", "01" -> "001", "010" -> "010" + pub fn from_string(s: &str) -> Result { + let s = s.trim(); + if s.is_empty() { + return Ok(ReplicaPlacement::default()); + } + // Pad with leading zeros to 3 chars, matching Go's NewReplicaPlacementFromString + let padded = match s.len() { + 1 => format!("00{}", s), + 2 => format!("0{}", s), + 3 => s.to_string(), + _ => return Err(SuperBlockError::InvalidReplicaPlacement(s.to_string())), + }; + let chars: Vec = padded.chars().collect(); + let dc = chars[0] + .to_digit(10) + .ok_or_else(|| SuperBlockError::InvalidReplicaPlacement(s.to_string()))? + as u8; + let rack = chars[1] + .to_digit(10) + .ok_or_else(|| SuperBlockError::InvalidReplicaPlacement(s.to_string()))? + as u8; + let same = chars[2] + .to_digit(10) + .ok_or_else(|| SuperBlockError::InvalidReplicaPlacement(s.to_string()))? + as u8; + // Go validates: value = dc*100 + rack*10 + same must fit in a byte + let value = dc as u16 * 100 + rack as u16 * 10 + same as u16; + if value > 255 { + return Err(SuperBlockError::InvalidReplicaPlacement(s.to_string())); + } + Ok(ReplicaPlacement { + diff_data_center_count: dc, + diff_rack_count: rack, + same_rack_count: same, + }) + } + + /// Parse from a single byte. + pub fn from_byte(b: u8) -> Result { + Ok(ReplicaPlacement { + diff_data_center_count: b / 100, + diff_rack_count: (b % 100) / 10, + same_rack_count: b % 10, + }) + } + + /// Encode as a single byte. + pub fn to_byte(&self) -> u8 { + self.diff_data_center_count * 100 + self.diff_rack_count * 10 + self.same_rack_count + } + + /// Total number of copies (including the original). + pub fn get_copy_count(&self) -> u8 { + self.diff_data_center_count + self.diff_rack_count + self.same_rack_count + 1 + } + + /// Whether this placement requires replication (more than 1 copy). + pub fn has_replication(&self) -> bool { + self.get_copy_count() > 1 + } +} + +impl std::fmt::Display for ReplicaPlacement { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + write!( + f, + "{}{}{}", + self.diff_data_center_count, self.diff_rack_count, self.same_rack_count + ) + } +} + +// ============================================================================ +// Errors +// ============================================================================ + +#[derive(Debug, thiserror::Error)] +pub enum SuperBlockError { + #[error("super block too short: {0} bytes")] + TooShort(usize), + + #[error("invalid replica placement: {0}")] + InvalidReplicaPlacement(String), +} + +// ============================================================================ +// Tests +// ============================================================================ + +#[cfg(test)] +mod tests { + use super::*; + use crate::storage::types::*; + + #[test] + fn test_super_block_round_trip() { + let sb = SuperBlock { + version: VERSION_3, + replica_placement: ReplicaPlacement::from_string("010").unwrap(), + ttl: TTL { count: 5, unit: 3 }, + compaction_revision: 42, + extra_size: 0, + extra_data: vec![], + }; + + let bytes = sb.to_bytes(); + assert_eq!(bytes.len(), SUPER_BLOCK_SIZE); + + let sb2 = SuperBlock::from_bytes(&bytes).unwrap(); + assert_eq!(sb2.version, sb.version); + assert_eq!(sb2.replica_placement, sb.replica_placement); + assert_eq!(sb2.ttl, sb.ttl); + assert_eq!(sb2.compaction_revision, sb.compaction_revision); + } + + #[test] + fn test_super_block_with_extra() { + let sb = SuperBlock { + version: VERSION_3, + replica_placement: ReplicaPlacement::default(), + ttl: TTL::EMPTY, + compaction_revision: 0, + extra_size: 3, + extra_data: vec![1, 2, 3], + }; + + let bytes = sb.to_bytes(); + assert_eq!(bytes.len(), SUPER_BLOCK_SIZE + 3); + + let sb2 = SuperBlock::from_bytes(&bytes).unwrap(); + assert_eq!(sb2.extra_data, vec![1, 2, 3]); + } + + #[test] + fn test_replica_placement_byte_round_trip() { + let rp = ReplicaPlacement::from_string("123").unwrap(); + assert_eq!(rp.diff_data_center_count, 1); + assert_eq!(rp.diff_rack_count, 2); + assert_eq!(rp.same_rack_count, 3); + assert_eq!(rp.to_byte(), 123); + assert_eq!(rp.get_copy_count(), 7); // 1+2+3+1 + + let rp2 = ReplicaPlacement::from_byte(123).unwrap(); + assert_eq!(rp, rp2); + } + + #[test] + fn test_replica_placement_no_replication() { + let rp = ReplicaPlacement::from_string("000").unwrap(); + assert!(!rp.has_replication()); + assert_eq!(rp.get_copy_count(), 1); + } + + #[test] + fn test_replica_placement_display() { + let rp = ReplicaPlacement::from_string("010").unwrap(); + assert_eq!(rp.to_string(), "010"); + assert!(rp.has_replication()); + } +} diff --git a/seaweed-volume/src/storage/types.rs b/seaweed-volume/src/storage/types.rs new file mode 100644 index 000000000..c75d35ec1 --- /dev/null +++ b/seaweed-volume/src/storage/types.rs @@ -0,0 +1,679 @@ +//! Core storage types: NeedleId, Offset, Size, Cookie, DiskType. +//! +//! These types define the binary-compatible on-disk format matching the Go implementation. +//! CRITICAL: Byte layout must match exactly for cross-compatibility. + +use std::fmt; + +// ============================================================================ +// Constants +// ============================================================================ + +pub const NEEDLE_ID_SIZE: usize = 8; +pub const NEEDLE_ID_EMPTY: u64 = 0; +pub const COOKIE_SIZE: usize = 4; +pub const SIZE_SIZE: usize = 4; +pub const NEEDLE_HEADER_SIZE: usize = COOKIE_SIZE + NEEDLE_ID_SIZE + SIZE_SIZE; // 16 +pub const DATA_SIZE_SIZE: usize = 4; +pub const TIMESTAMP_SIZE: usize = 8; +pub const NEEDLE_PADDING_SIZE: usize = 8; +pub const NEEDLE_CHECKSUM_SIZE: usize = 4; + +/// 5-byte offset mode (matching Go production builds with `-tags 5BytesOffset`). +/// Max volume size: 8TB. Index entry: 17 bytes (8 + 5 + 4). +#[cfg(feature = "5bytes")] +pub const OFFSET_SIZE: usize = 5; +#[cfg(feature = "5bytes")] +pub const MAX_POSSIBLE_VOLUME_SIZE: u64 = 4 * 1024 * 1024 * 1024 * 8 * 256; // 8TB + +/// 4-byte offset mode (matching Go default build without `5BytesOffset`). +/// Max volume size: 32GB. Index entry: 16 bytes (8 + 4 + 4). +#[cfg(not(feature = "5bytes"))] +pub const OFFSET_SIZE: usize = 4; +#[cfg(not(feature = "5bytes"))] +pub const MAX_POSSIBLE_VOLUME_SIZE: u64 = 4 * 1024 * 1024 * 1024 * 8; // 32GB + +pub const NEEDLE_MAP_ENTRY_SIZE: usize = NEEDLE_ID_SIZE + OFFSET_SIZE + SIZE_SIZE; + +// ============================================================================ +// NeedleId +// ============================================================================ + +/// 64-bit unique identifier for a needle within a volume. +#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, PartialOrd, Ord, Default)] +pub struct NeedleId(pub u64); + +impl NeedleId { + pub fn to_bytes(&self, bytes: &mut [u8]) { + assert!(bytes.len() >= NEEDLE_ID_SIZE); + bytes[0..8].copy_from_slice(&self.0.to_be_bytes()); + } + + pub fn from_bytes(bytes: &[u8]) -> Self { + assert!(bytes.len() >= NEEDLE_ID_SIZE); + NeedleId(u64::from_be_bytes([ + bytes[0], bytes[1], bytes[2], bytes[3], bytes[4], bytes[5], bytes[6], bytes[7], + ])) + } + + pub fn is_empty(&self) -> bool { + self.0 == 0 + } + + /// Parse a hex string into a NeedleId. + pub fn parse(s: &str) -> Result { + u64::from_str_radix(s, 16).map(NeedleId) + } +} + +impl fmt::Display for NeedleId { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + write!(f, "{:x}", self.0) + } +} + +impl From for NeedleId { + fn from(v: u64) -> Self { + NeedleId(v) + } +} + +impl From for u64 { + fn from(v: NeedleId) -> Self { + v.0 + } +} + +// ============================================================================ +// Cookie +// ============================================================================ + +/// Random 32-bit value to mitigate brute-force lookups. +#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Default)] +pub struct Cookie(pub u32); + +impl Cookie { + pub fn to_bytes(&self, bytes: &mut [u8]) { + assert!(bytes.len() >= COOKIE_SIZE); + bytes[0..4].copy_from_slice(&self.0.to_be_bytes()); + } + + pub fn from_bytes(bytes: &[u8]) -> Self { + assert!(bytes.len() >= COOKIE_SIZE); + Cookie(u32::from_be_bytes([bytes[0], bytes[1], bytes[2], bytes[3]])) + } + + /// Parse a hex string into a Cookie. + pub fn parse(s: &str) -> Result { + u32::from_str_radix(s, 16).map(Cookie) + } +} + +impl fmt::Display for Cookie { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + write!(f, "{:x}", self.0) + } +} + +impl From for Cookie { + fn from(v: u32) -> Self { + Cookie(v) + } +} + +// ============================================================================ +// Size +// ============================================================================ + +/// Needle size as stored in the index. Negative = deleted. +/// +/// - Positive: valid needle with that many bytes of body content +/// - TombstoneFileSize (-1): tombstone marker +/// - Other negative: deleted, absolute value was the original size +#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Default)] +pub struct Size(pub i32); + +/// Special marker for a tombstone (deletion marker) entry. +pub const TOMBSTONE_FILE_SIZE: Size = Size(-1); + +impl Size { + pub fn is_tombstone(&self) -> bool { + self.0 == TOMBSTONE_FILE_SIZE.0 + } + + pub fn is_deleted(&self) -> bool { + self.0 < 0 || self.0 == TOMBSTONE_FILE_SIZE.0 + } + + pub fn is_valid(&self) -> bool { + self.0 > 0 && !self.is_tombstone() + } + + /// Raw storage size. For tombstones returns 0; for negative returns abs value. + pub fn raw(&self) -> u32 { + if self.is_tombstone() { + return 0; + } + if self.0 < 0 { + return (self.0 * -1) as u32; + } + self.0 as u32 + } + + pub fn to_bytes(&self, bytes: &mut [u8]) { + assert!(bytes.len() >= SIZE_SIZE); + bytes[0..4].copy_from_slice(&(self.0 as u32).to_be_bytes()); + } + + pub fn from_bytes(bytes: &[u8]) -> Self { + assert!(bytes.len() >= SIZE_SIZE); + let v = u32::from_be_bytes([bytes[0], bytes[1], bytes[2], bytes[3]]); + Size(v as i32) + } +} + +impl From for Size { + fn from(v: i32) -> Self { + Size(v) + } +} + +impl From for i32 { + fn from(v: Size) -> Self { + v.0 + } +} + +// ============================================================================ +// Offset +// ============================================================================ + +/// Offset encoding for needle positions in .dat files. +/// +/// The offset is stored divided by NEEDLE_PADDING_SIZE (8). +/// +/// With `5bytes` feature (default, matching Go production builds): +/// 5 bytes can address up to 8TB. +/// On-disk layout: [b3][b2][b1][b0][b4] (big-endian 4 bytes + 1 high byte) +/// +/// Without `5bytes` feature (matching Go default build): +/// 4 bytes can address up to 32GB. +/// On-disk layout: [b3][b2][b1][b0] (big-endian 4 bytes) +#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash, Default)] +pub struct Offset { + pub b0: u8, + pub b1: u8, + pub b2: u8, + pub b3: u8, + #[cfg(feature = "5bytes")] + pub b4: u8, +} + +impl Offset { + /// Convert to the actual byte offset in the .dat file. + pub fn to_actual_offset(&self) -> i64 { + let stored = self.b0 as i64 + + (self.b1 as i64) * 256 + + (self.b2 as i64) * 65536 + + (self.b3 as i64) * 16777216; + #[cfg(feature = "5bytes")] + let stored = stored + (self.b4 as i64) * 4294967296; // 1 << 32 + stored * NEEDLE_PADDING_SIZE as i64 + } + + /// Create an Offset from an actual byte offset. + pub fn from_actual_offset(offset: i64) -> Self { + let smaller = offset / NEEDLE_PADDING_SIZE as i64; + Offset { + b0: smaller as u8, + b1: (smaller >> 8) as u8, + b2: (smaller >> 16) as u8, + b3: (smaller >> 24) as u8, + #[cfg(feature = "5bytes")] + b4: (smaller >> 32) as u8, + } + } + + /// Serialize to bytes in the .idx file format. + /// 5-byte layout: [b3][b2][b1][b0][b4] + /// 4-byte layout: [b3][b2][b1][b0] + pub fn to_bytes(&self, bytes: &mut [u8]) { + assert!(bytes.len() >= OFFSET_SIZE); + bytes[0] = self.b3; + bytes[1] = self.b2; + bytes[2] = self.b1; + bytes[3] = self.b0; + #[cfg(feature = "5bytes")] + { + bytes[4] = self.b4; + } + } + + /// Deserialize from bytes in the .idx file format. + pub fn from_bytes(bytes: &[u8]) -> Self { + assert!(bytes.len() >= OFFSET_SIZE); + Offset { + b3: bytes[0], + b2: bytes[1], + b1: bytes[2], + b0: bytes[3], + #[cfg(feature = "5bytes")] + b4: bytes[4], + } + } + + pub fn is_zero(&self) -> bool { + #[cfg(feature = "5bytes")] + { + self.b0 == 0 && self.b1 == 0 && self.b2 == 0 && self.b3 == 0 && self.b4 == 0 + } + #[cfg(not(feature = "5bytes"))] + { + self.b0 == 0 && self.b1 == 0 && self.b2 == 0 && self.b3 == 0 + } + } +} + +impl fmt::Display for Offset { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + write!(f, "{}", self.to_actual_offset()) + } +} + +// ============================================================================ +// DiskType +// ============================================================================ + +#[derive(Debug, Clone, PartialEq, Eq, Hash)] +pub enum DiskType { + HardDrive, + Ssd, + Custom(String), +} + +impl DiskType { + pub fn from_string(s: &str) -> Self { + match s.to_lowercase().as_str() { + "" | "hdd" => DiskType::HardDrive, + "ssd" => DiskType::Ssd, + other => DiskType::Custom(other.to_string()), + } + } + + pub fn readable_string(&self) -> &str { + match self { + DiskType::HardDrive => "hdd", + DiskType::Ssd => "ssd", + DiskType::Custom(s) => s, + } + } +} + +impl fmt::Display for DiskType { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + match self { + DiskType::HardDrive => write!(f, ""), + DiskType::Ssd => write!(f, "ssd"), + DiskType::Custom(s) => write!(f, "{}", s), + } + } +} + +impl Default for DiskType { + fn default() -> Self { + DiskType::HardDrive + } +} + +// ============================================================================ +// VolumeId +// ============================================================================ + +/// Volume identifier, stored as u32. +#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, PartialOrd, Ord, Default)] +pub struct VolumeId(pub u32); + +impl VolumeId { + pub fn parse(s: &str) -> Result { + s.parse::().map(VolumeId) + } + + pub fn next(&self) -> VolumeId { + VolumeId(self.0 + 1) + } +} + +impl fmt::Display for VolumeId { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + write!(f, "{}", self.0) + } +} + +impl From for VolumeId { + fn from(v: u32) -> Self { + VolumeId(v) + } +} + +// ============================================================================ +// Version +// ============================================================================ + +/// Needle storage format version. +#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash)] +pub struct Version(pub u8); + +pub const VERSION_1: Version = Version(1); +pub const VERSION_2: Version = Version(2); +pub const VERSION_3: Version = Version(3); + +impl Version { + pub fn current() -> Self { + VERSION_3 + } + + pub fn is_supported(&self) -> bool { + self.0 >= 1 && self.0 <= 3 + } +} + +impl Default for Version { + fn default() -> Self { + VERSION_3 + } +} + +impl From for Version { + fn from(v: u8) -> Self { + Version(v) + } +} + +// ============================================================================ +// ReadOption +// ============================================================================ + +/// Options controlling needle read behavior, matching Go's `ReadOption` in store.go. +/// +/// Fields are split into request-side options (set by the caller) and response-side +/// flags (set during the read to communicate status back). +#[derive(Debug, Clone)] +pub struct ReadOption { + // -- request -- + /// If true, allow reading needles that have been soft-deleted. + pub read_deleted: bool, + /// If true, attempt to read only metadata for large needles (> PagedReadLimit). + pub attempt_meta_only: bool, + /// If true, the caller requires metadata only (no data payload). + pub must_meta_only: bool, + + // -- response -- + /// Set to true when the read actually returned metadata only. + pub is_meta_only: bool, + /// Compaction revision at the time of the read (for consistency during streaming). + pub volume_revision: u16, + /// Set to true when the offset exceeded MaxPossibleVolumeSize (4-byte offset wrap). + pub is_out_of_range: bool, + + // -- slow-read / streaming -- + /// When true, the read lock is acquired and released per chunk instead of held + /// for the entire read, reducing write latency at the cost of higher read P99. + pub has_slow_read: bool, + /// Buffer size for chunked streaming reads (used with `has_slow_read`). + pub read_buffer_size: i32, +} + +impl Default for ReadOption { + fn default() -> Self { + ReadOption { + read_deleted: false, + attempt_meta_only: false, + must_meta_only: false, + is_meta_only: false, + volume_revision: 0, + is_out_of_range: false, + has_slow_read: false, + read_buffer_size: 0, + } + } +} + +// ============================================================================ +// NeedleMapEntry helpers (for .idx file) +// ============================================================================ + +/// Parse a single .idx file entry (17 bytes) into (NeedleId, Offset, Size). +pub fn idx_entry_from_bytes(bytes: &[u8]) -> (NeedleId, Offset, Size) { + assert!(bytes.len() >= NEEDLE_MAP_ENTRY_SIZE); + let key = NeedleId::from_bytes(&bytes[..NEEDLE_ID_SIZE]); + let offset = Offset::from_bytes(&bytes[NEEDLE_ID_SIZE..NEEDLE_ID_SIZE + OFFSET_SIZE]); + let size = Size::from_bytes( + &bytes[NEEDLE_ID_SIZE + OFFSET_SIZE..NEEDLE_ID_SIZE + OFFSET_SIZE + SIZE_SIZE], + ); + (key, offset, size) +} + +/// Write a single .idx file entry (17 bytes). +pub fn idx_entry_to_bytes(bytes: &mut [u8], key: NeedleId, offset: Offset, size: Size) { + assert!(bytes.len() >= NEEDLE_MAP_ENTRY_SIZE); + key.to_bytes(&mut bytes[..NEEDLE_ID_SIZE]); + offset.to_bytes(&mut bytes[NEEDLE_ID_SIZE..NEEDLE_ID_SIZE + OFFSET_SIZE]); + size.to_bytes( + &mut bytes[NEEDLE_ID_SIZE + OFFSET_SIZE..NEEDLE_ID_SIZE + OFFSET_SIZE + SIZE_SIZE], + ); +} + +// ============================================================================ +// Tests +// ============================================================================ + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_needle_id_round_trip() { + let id = NeedleId(0x123456789abcdef0); + let mut buf = [0u8; 8]; + id.to_bytes(&mut buf); + let id2 = NeedleId::from_bytes(&buf); + assert_eq!(id, id2); + } + + #[test] + fn test_needle_id_display() { + let id = NeedleId(255); + assert_eq!(id.to_string(), "ff"); + } + + #[test] + fn test_needle_id_parse() { + let id = NeedleId::parse("ff").unwrap(); + assert_eq!(id, NeedleId(255)); + } + + #[test] + fn test_cookie_round_trip() { + let cookie = Cookie(0xdeadbeef); + let mut buf = [0u8; 4]; + cookie.to_bytes(&mut buf); + let cookie2 = Cookie::from_bytes(&buf); + assert_eq!(cookie, cookie2); + } + + #[test] + fn test_size_semantics() { + assert!(Size(100).is_valid()); + assert!(!Size(100).is_deleted()); + assert!(!Size(100).is_tombstone()); + assert_eq!(Size(100).raw(), 100); + + assert!(Size(-50).is_deleted()); + assert!(!Size(-50).is_tombstone()); + assert_eq!(Size(-50).raw(), 50); + + assert!(TOMBSTONE_FILE_SIZE.is_deleted()); + assert!(TOMBSTONE_FILE_SIZE.is_tombstone()); + assert_eq!(TOMBSTONE_FILE_SIZE.raw(), 0); + + assert!(!Size(0).is_valid()); + assert!(!Size(0).is_deleted()); + } + + #[test] + fn test_size_round_trip() { + let size = Size(12345); + let mut buf = [0u8; 4]; + size.to_bytes(&mut buf); + let size2 = Size::from_bytes(&buf); + assert_eq!(size, size2); + } + + #[test] + fn test_size_negative_round_trip() { + // Negative sizes round-trip through u32 bit pattern + let size = Size(-50); + let mut buf = [0u8; 4]; + size.to_bytes(&mut buf); + let size2 = Size::from_bytes(&buf); + assert_eq!(size, size2); + } + + #[test] + fn test_offset_round_trip() { + // Test with a known actual offset + let actual_offset: i64 = 8 * 1000000; // must be multiple of 8 + let offset = Offset::from_actual_offset(actual_offset); + assert_eq!(offset.to_actual_offset(), actual_offset); + + // Test byte serialization + let mut buf = [0u8; 5]; + offset.to_bytes(&mut buf); + let offset2 = Offset::from_bytes(&buf); + assert_eq!(offset.to_actual_offset(), offset2.to_actual_offset()); + } + + #[test] + fn test_offset_zero() { + let offset = Offset::default(); + assert!(offset.is_zero()); + assert_eq!(offset.to_actual_offset(), 0); + } + + #[test] + fn test_offset_max() { + // Max stored value depends on offset size + #[cfg(feature = "5bytes")] + let max_stored: i64 = (1i64 << 40) - 1; // 5-byte max + #[cfg(not(feature = "5bytes"))] + let max_stored: i64 = (1i64 << 32) - 1; // 4-byte max + let max_actual = max_stored * NEEDLE_PADDING_SIZE as i64; + let offset = Offset::from_actual_offset(max_actual); + assert_eq!(offset.to_actual_offset(), max_actual); + } + + #[test] + fn test_offset_size_constants() { + #[cfg(feature = "5bytes")] + { + assert_eq!(OFFSET_SIZE, 5); + assert_eq!(NEEDLE_MAP_ENTRY_SIZE, 17); // 8 + 5 + 4 + assert_eq!(MAX_POSSIBLE_VOLUME_SIZE, 4 * 1024 * 1024 * 1024 * 8 * 256); + // 8TB + } + #[cfg(not(feature = "5bytes"))] + { + assert_eq!(OFFSET_SIZE, 4); + assert_eq!(NEEDLE_MAP_ENTRY_SIZE, 16); // 8 + 4 + 4 + assert_eq!(MAX_POSSIBLE_VOLUME_SIZE, 4 * 1024 * 1024 * 1024 * 8); // 32GB + } + } + + #[test] + fn test_idx_entry_round_trip() { + let key = NeedleId(0xdeadbeef12345678); + let offset = Offset::from_actual_offset(8 * 999); + let size = Size(4096); + + let mut buf = [0u8; NEEDLE_MAP_ENTRY_SIZE]; + idx_entry_to_bytes(&mut buf, key, offset, size); + + let (key2, offset2, size2) = idx_entry_from_bytes(&buf); + assert_eq!(key, key2); + assert_eq!(offset.to_actual_offset(), offset2.to_actual_offset()); + assert_eq!(size, size2); + } + + #[test] + fn test_volume_id() { + let vid = VolumeId::parse("42").unwrap(); + assert_eq!(vid, VolumeId(42)); + assert_eq!(vid.to_string(), "42"); + assert_eq!(vid.next(), VolumeId(43)); + } + + #[test] + fn test_version() { + assert!(VERSION_1.is_supported()); + assert!(VERSION_2.is_supported()); + assert!(VERSION_3.is_supported()); + assert!(!Version(0).is_supported()); + assert!(!Version(4).is_supported()); + assert_eq!(Version::current(), VERSION_3); + } + + #[test] + fn test_disk_type() { + assert_eq!(DiskType::from_string(""), DiskType::HardDrive); + assert_eq!(DiskType::from_string("hdd"), DiskType::HardDrive); + assert_eq!(DiskType::from_string("SSD"), DiskType::Ssd); + assert_eq!( + DiskType::from_string("nvme"), + DiskType::Custom("nvme".to_string()) + ); + assert_eq!(DiskType::HardDrive.readable_string(), "hdd"); + assert_eq!(DiskType::Ssd.readable_string(), "ssd"); + } + + #[test] + fn test_read_option_default() { + let ro = ReadOption::default(); + assert!(!ro.read_deleted); + assert!(!ro.attempt_meta_only); + assert!(!ro.must_meta_only); + assert!(!ro.is_meta_only); + assert_eq!(ro.volume_revision, 0); + assert!(!ro.is_out_of_range); + assert!(!ro.has_slow_read); + assert_eq!(ro.read_buffer_size, 0); + } + + #[test] + fn test_read_option_custom() { + let ro = ReadOption { + read_deleted: true, + attempt_meta_only: true, + has_slow_read: true, + read_buffer_size: 1024 * 1024, + ..ReadOption::default() + }; + assert!(ro.read_deleted); + assert!(ro.attempt_meta_only); + assert!(!ro.must_meta_only); + assert!(!ro.is_meta_only); + assert!(ro.has_slow_read); + assert_eq!(ro.read_buffer_size, 1024 * 1024); + } + + #[test] + fn test_read_option_clone() { + let ro = ReadOption { + is_out_of_range: true, + volume_revision: 42, + ..ReadOption::default() + }; + let ro2 = ro.clone(); + assert!(ro2.is_out_of_range); + assert_eq!(ro2.volume_revision, 42); + } +} diff --git a/seaweed-volume/src/storage/volume.rs b/seaweed-volume/src/storage/volume.rs new file mode 100644 index 000000000..28dc761d1 --- /dev/null +++ b/seaweed-volume/src/storage/volume.rs @@ -0,0 +1,4246 @@ +//! Volume: the core storage unit — a .dat file + .idx index. +//! +//! Each volume contains many needles (files). It manages: +//! - Reading/writing/deleting needles from the .dat file +//! - Maintaining the in-memory NeedleMap (NeedleId → Offset+Size) +//! - SuperBlock at offset 0 of the .dat file +//! - Metrics (file count, content size, deleted count) +//! +//! Matches Go's storage/volume.go, volume_loading.go, volume_read.go, +//! volume_write.go, volume_super_block.go. + +use std::fs::{self, File, OpenOptions}; +use std::io::{self, Read, Seek, SeekFrom, Write}; +use std::path::Path; +use std::sync::atomic::{AtomicBool, Ordering}; +use std::sync::Arc; +use std::sync::{Condvar, Mutex}; +use std::time::{SystemTime, UNIX_EPOCH}; + +use tracing::warn; + +use crate::storage::needle::needle::{self, get_actual_size, Needle, NeedleError}; +use crate::storage::needle_map::{CompactNeedleMap, NeedleMap, NeedleMapKind, RedbNeedleMap}; +use crate::storage::super_block::{ReplicaPlacement, SuperBlock, SUPER_BLOCK_SIZE}; +use crate::storage::types::*; + +// ============================================================================ +// Errors +// ============================================================================ + +#[derive(Debug, thiserror::Error)] +pub enum VolumeError { + #[error("not found")] + NotFound, + + #[error("already deleted")] + Deleted, + + #[error("needle size mismatch")] + SizeMismatch, + + #[error("unsupported version: {0}")] + UnsupportedVersion(u8), + + #[error("cookie mismatch: {0:#x}")] + CookieMismatch(u32), + + #[error("volume not empty")] + NotEmpty, + + #[error("volume already exists")] + AlreadyExists, + + #[error("volume is read-only")] + ReadOnly, + + #[error("volume size limit exceeded: current {current}, limit {limit}")] + SizeLimitExceeded { current: u64, limit: u64 }, + + #[error("volume not initialized")] + NotInitialized, + + #[error("needle error: {0}")] + Needle(#[from] NeedleError), + + #[error("super block error: {0}")] + SuperBlock(#[from] crate::storage::super_block::SuperBlockError), + + #[error("IO error: {0}")] + Io(#[from] io::Error), + + #[error("streaming from remote-backed volume requires buffered fallback")] + StreamingUnsupported, +} + +// ============================================================================ +// VolumeInfo (.vif persistence) +// ============================================================================ + +/// Legacy simple VolumeInfo for backward compat with old .vif files. +#[derive(serde::Serialize, serde::Deserialize)] +struct VolumeInfo { + read_only: bool, +} + +pub use crate::pb::volume_server_pb::RemoteFile as PbRemoteFile; +/// Protobuf VolumeInfo type alias. +pub use crate::pb::volume_server_pb::VolumeInfo as PbVolumeInfo; + +/// Helper module for deserializing protojson uint64 fields that may be strings or numbers. +mod string_or_u64 { + use serde::{self, Deserialize, Deserializer, Serializer}; + + pub fn serialize(value: &u64, serializer: S) -> Result + where + S: Serializer, + { + // Emit as string to match Go's protojson format for uint64 + serializer.serialize_str(&value.to_string()) + } + + pub fn deserialize<'de, D>(deserializer: D) -> Result + where + D: Deserializer<'de>, + { + #[derive(Deserialize)] + #[serde(untagged)] + enum StringOrNum { + Str(String), + Num(u64), + } + match StringOrNum::deserialize(deserializer)? { + StringOrNum::Str(s) => s.parse::().map_err(serde::de::Error::custom), + StringOrNum::Num(n) => Ok(n), + } + } +} + +mod string_or_i64 { + use serde::{self, Deserialize, Deserializer, Serializer}; + + pub fn serialize(value: &i64, serializer: S) -> Result + where + S: Serializer, + { + serializer.serialize_str(&value.to_string()) + } + + pub fn deserialize<'de, D>(deserializer: D) -> Result + where + D: Deserializer<'de>, + { + #[derive(Deserialize)] + #[serde(untagged)] + enum StringOrNum { + Str(String), + Num(i64), + } + match StringOrNum::deserialize(deserializer)? { + StringOrNum::Str(s) => s.parse::().map_err(serde::de::Error::custom), + StringOrNum::Num(n) => Ok(n), + } + } +} + +/// Serde-compatible representation of RemoteFile for .vif JSON serialization. +/// Field names use snake_case to match Go's protobuf JSON output (jsonpb). +#[derive(serde::Serialize, serde::Deserialize, Default, Clone)] +pub struct VifRemoteFile { + #[serde(default, rename = "backendType")] + pub backend_type: String, + #[serde(default, rename = "backendId")] + pub backend_id: String, + #[serde(default)] + pub key: String, + #[serde(default, with = "string_or_u64")] + pub offset: u64, + #[serde(default, rename = "fileSize", with = "string_or_u64")] + pub file_size: u64, + #[serde(default, rename = "modifiedTime", with = "string_or_u64")] + pub modified_time: u64, + #[serde(default)] + pub extension: String, +} + +#[derive(serde::Serialize, serde::Deserialize, Default, Clone)] +pub struct VifEcShardConfig { + #[serde(default, rename = "dataShards")] + pub data_shards: u32, + #[serde(default, rename = "parityShards")] + pub parity_shards: u32, +} + +/// Serde-compatible representation of OldVersionVolumeInfo for legacy .vif JSON deserialization. +/// Matches Go's protobuf OldVersionVolumeInfo where `DestroyTime` maps to `expire_at_sec`. +#[derive(serde::Deserialize, Default)] +struct OldVersionVifVolumeInfo { + #[serde(default)] + pub files: Vec, + #[serde(default)] + pub version: u32, + #[serde(default)] + pub replication: String, + #[serde(default, alias = "bytesOffset", alias = "BytesOffset")] + pub bytes_offset: u32, + #[serde(default, alias = "datFileSize", alias = "dat_file_size", with = "string_or_i64")] + pub dat_file_size: i64, + #[serde(default, alias = "destroyTime", alias = "DestroyTime", with = "string_or_u64")] + pub destroy_time: u64, + #[serde(default, alias = "readOnly", alias = "read_only")] + pub read_only: bool, +} + +impl OldVersionVifVolumeInfo { + /// Convert to the standard VifVolumeInfo, mapping destroy_time -> expire_at_sec. + fn to_vif(self) -> VifVolumeInfo { + VifVolumeInfo { + files: self.files, + version: self.version, + replication: self.replication, + bytes_offset: self.bytes_offset, + dat_file_size: self.dat_file_size, + expire_at_sec: self.destroy_time, + read_only: self.read_only, + ec_shard_config: None, + } + } +} + +/// Serde-compatible representation of VolumeInfo for .vif JSON serialization. +/// Matches Go's protobuf JSON format (jsonpb with EmitUnpopulated=true). +#[derive(serde::Serialize, serde::Deserialize, Default, Clone)] +pub struct VifVolumeInfo { + #[serde(default)] + pub files: Vec, + #[serde(default)] + pub version: u32, + #[serde(default)] + pub replication: String, + #[serde(default, rename = "bytesOffset")] + pub bytes_offset: u32, + #[serde(default, rename = "datFileSize", with = "string_or_i64")] + pub dat_file_size: i64, + #[serde(default, rename = "expireAtSec", with = "string_or_u64")] + pub expire_at_sec: u64, + #[serde(default, rename = "readOnly")] + pub read_only: bool, + #[serde( + default, + rename = "ecShardConfig", + skip_serializing_if = "Option::is_none" + )] + pub ec_shard_config: Option, +} + +impl VifVolumeInfo { + /// Convert from protobuf VolumeInfo to the serde-compatible struct. + pub fn from_pb(pb: &PbVolumeInfo) -> Self { + Self { + files: pb + .files + .iter() + .map(|f| VifRemoteFile { + backend_type: f.backend_type.clone(), + backend_id: f.backend_id.clone(), + key: f.key.clone(), + offset: f.offset, + file_size: f.file_size, + modified_time: f.modified_time, + extension: f.extension.clone(), + }) + .collect(), + version: pb.version, + replication: pb.replication.clone(), + bytes_offset: pb.bytes_offset, + dat_file_size: pb.dat_file_size, + expire_at_sec: pb.expire_at_sec, + read_only: pb.read_only, + ec_shard_config: pb.ec_shard_config.as_ref().map(|c| VifEcShardConfig { + data_shards: c.data_shards, + parity_shards: c.parity_shards, + }), + } + } + + /// Convert to protobuf VolumeInfo. + pub fn to_pb(&self) -> PbVolumeInfo { + PbVolumeInfo { + files: self + .files + .iter() + .map(|f| PbRemoteFile { + backend_type: f.backend_type.clone(), + backend_id: f.backend_id.clone(), + key: f.key.clone(), + offset: f.offset, + file_size: f.file_size, + modified_time: f.modified_time, + extension: f.extension.clone(), + }) + .collect(), + version: self.version, + replication: self.replication.clone(), + bytes_offset: self.bytes_offset, + dat_file_size: self.dat_file_size, + expire_at_sec: self.expire_at_sec, + read_only: self.read_only, + ec_shard_config: self.ec_shard_config.as_ref().map(|c| { + crate::pb::volume_server_pb::EcShardConfig { + data_shards: c.data_shards, + parity_shards: c.parity_shards, + } + }), + } + } +} + +// ============================================================================ +// Streaming read support +// ============================================================================ + +#[derive(Default)] +struct DataFileAccessState { + readers: usize, + writer_active: bool, +} + +#[derive(Default)] +pub struct DataFileAccessControl { + state: Mutex, + condvar: Condvar, +} + +pub struct DataFileReadLease { + control: Arc, +} + +pub struct DataFileWriteLease { + control: Arc, +} + +impl DataFileAccessControl { + pub fn read_lock(self: &Arc) -> DataFileReadLease { + let mut state = self.state.lock().unwrap(); + while state.writer_active { + state = self.condvar.wait(state).unwrap(); + } + state.readers += 1; + drop(state); + DataFileReadLease { + control: self.clone(), + } + } + + pub fn write_lock(self: &Arc) -> DataFileWriteLease { + let mut state = self.state.lock().unwrap(); + while state.writer_active || state.readers > 0 { + state = self.condvar.wait(state).unwrap(); + } + state.writer_active = true; + drop(state); + DataFileWriteLease { + control: self.clone(), + } + } +} + +impl Drop for DataFileReadLease { + fn drop(&mut self) { + let mut state = self.control.state.lock().unwrap(); + state.readers -= 1; + if state.readers == 0 { + self.control.condvar.notify_all(); + } + } +} + +impl Drop for DataFileWriteLease { + fn drop(&mut self) { + let mut state = self.control.state.lock().unwrap(); + state.writer_active = false; + self.control.condvar.notify_all(); + } +} + +/// Information needed to stream needle data directly from the dat file +/// without loading the entire payload into memory. +pub(crate) enum NeedleStreamSource { + Local(File), + Remote(RemoteDatFile), +} + +impl NeedleStreamSource { + pub(crate) fn clone_for_read(&self) -> io::Result { + match self { + NeedleStreamSource::Local(file) => Ok(NeedleStreamSource::Local(file.try_clone()?)), + NeedleStreamSource::Remote(remote) => Ok(NeedleStreamSource::Remote(remote.clone())), + } + } + + pub(crate) fn read_exact_at(&self, buf: &mut [u8], offset: u64) -> io::Result<()> { + match self { + NeedleStreamSource::Local(file) => { + #[cfg(unix)] + { + use std::os::unix::fs::FileExt; + file.read_exact_at(buf, offset)?; + } + #[cfg(windows)] + { + read_exact_at(file, buf, offset)?; + } + #[cfg(not(any(unix, windows)))] + { + compile_error!("Platform not supported: only unix and windows are supported"); + } + Ok(()) + } + NeedleStreamSource::Remote(remote) => remote.read_exact_at(buf, offset), + } + } +} + +pub struct NeedleStreamInfo { + /// Stream source for the dat file, local or remote. + pub(crate) source: NeedleStreamSource, + /// Absolute byte offset within the dat file where needle data starts. + pub data_file_offset: u64, + /// Size of the data payload in bytes. + pub data_size: u32, + /// Per-volume file access lock used to match Go's slow-read behavior. + pub data_file_access_control: Arc, + /// Volume ID — used to re-lookup needle offset if compaction occurs during streaming. + pub volume_id: VolumeId, + /// Needle ID — used to re-lookup needle offset if compaction occurs during streaming. + pub needle_id: NeedleId, + /// Compaction revision at the time of the initial read. If this changes during + /// streaming, the needle's disk offset must be re-read from the needle map because + /// compaction may have moved the needle to a different location. + pub compaction_revision: u16, +} + +#[derive(Clone)] +pub(crate) struct RemoteDatFile { + backend: Arc, + key: String, + file_size: u64, + modified_time: u64, +} + +impl RemoteDatFile { + pub(crate) fn read_exact_at(&self, buf: &mut [u8], offset: u64) -> io::Result<()> { + let data = self + .backend + .read_range_blocking(&self.key, offset, buf.len()) + .map_err(|e| io::Error::new(io::ErrorKind::Other, e))?; + if data.len() != buf.len() { + return Err(io::Error::new( + io::ErrorKind::UnexpectedEof, + format!( + "remote read short read at offset {}: got {}, expected {}", + offset, + data.len(), + buf.len() + ), + )); + } + buf.copy_from_slice(&data); + Ok(()) + } +} + +// ============================================================================ +// Volume +// ============================================================================ + +pub struct Volume { + pub id: VolumeId, + dir: String, + dir_idx: String, + pub collection: String, + + dat_file: Option, + remote_dat_file: Option, + nm: Option, + needle_map_kind: NeedleMapKind, + data_file_access_control: Arc, + + pub super_block: SuperBlock, + + no_write_or_delete: bool, + no_write_can_delete: bool, + + /// Shared flag from the parent DiskLocation indicating low disk space. + /// Matches Go's `v.location.isDiskSpaceLow` checked in `IsReadOnly()`. + pub location_disk_space_low: Arc, + + last_modified_ts_seconds: u64, + last_append_at_ns: u64, + + last_compact_index_offset: u64, + last_compact_revision: u16, + + is_compacting: bool, + + /// Compaction speed limit in bytes per second (0 = unlimited). + pub compaction_byte_per_second: i64, + + /// Tracks the last I/O error (EIO) for volume health monitoring. + /// Uses Mutex for interior mutability so reads (&self) can clear/set it. + last_io_error: Mutex>, + + /// Protobuf VolumeInfo for tiered storage (.vif file). + pub volume_info: PbVolumeInfo, + + /// Whether this volume has a remote file reference. + pub has_remote_file: bool, +} + +/// Windows helper: loop seek_read until buffer is fully filled. +#[cfg(windows)] +fn read_exact_at(file: &File, buf: &mut [u8], mut offset: u64) -> io::Result<()> { + use std::os::windows::fs::FileExt; + let mut filled = 0; + while filled < buf.len() { + let n = file.seek_read(&mut buf[filled..], offset)?; + if n == 0 { + return Err(io::Error::new( + io::ErrorKind::UnexpectedEof, + "unexpected EOF in seek_read", + )); + } + filled += n; + offset += n as u64; + } + Ok(()) +} + +impl Volume { + /// Create and load a volume from disk. + pub fn new( + dirname: &str, + dir_idx: &str, + collection: &str, + id: VolumeId, + needle_map_kind: NeedleMapKind, + replica_placement: Option, + ttl: Option, + preallocate: u64, + version: Version, + ) -> Result { + let mut v = Volume { + id, + dir: dirname.to_string(), + dir_idx: dir_idx.to_string(), + collection: collection.to_string(), + dat_file: None, + remote_dat_file: None, + nm: None, + needle_map_kind, + data_file_access_control: Arc::new(DataFileAccessControl::default()), + super_block: SuperBlock { + replica_placement: replica_placement.unwrap_or_default(), + ttl: ttl.unwrap_or(crate::storage::needle::ttl::TTL::EMPTY), + ..SuperBlock::default() + }, + no_write_or_delete: false, + no_write_can_delete: false, + location_disk_space_low: Arc::new(AtomicBool::new(false)), + last_modified_ts_seconds: 0, + last_append_at_ns: 0, + last_compact_index_offset: 0, + last_compact_revision: 0, + is_compacting: false, + compaction_byte_per_second: 0, + last_io_error: Mutex::new(None), + volume_info: PbVolumeInfo::default(), + has_remote_file: false, + }; + + v.load(true, true, preallocate, version)?; + Ok(v) + } + + /// Returns true if the volume is currently being compacted. + pub fn is_compacting(&self) -> bool { + self.is_compacting + } + + // ---- File naming (matching Go) ---- + + /// Base filename: dir/collection_id or dir/id + pub fn data_file_name(&self) -> String { + volume_file_name(&self.dir, &self.collection, self.id) + } + + pub fn index_file_name(&self) -> String { + volume_file_name(&self.dir_idx, &self.collection, self.id) + } + + pub fn file_name(&self, ext: &str) -> String { + match ext { + ".idx" | ".cpx" | ".ldb" | ".cpldb" | ".rdb" => { + format!("{}{}", self.index_file_name(), ext) + } + _ => { + format!("{}{}", self.data_file_name(), ext) + } + } + } + + pub fn version(&self) -> Version { + if self.volume_info.version != 0 { + Version(self.volume_info.version as u8) + } else { + self.super_block.version + } + } + + // ---- Loading ---- + + fn load( + &mut self, + also_load_index: bool, + create_dat_if_missing: bool, + preallocate: u64, + version: Version, + ) -> Result<(), VolumeError> { + let dat_path = self.file_name(".dat"); + let mut already_has_super_block = false; + + let has_volume_info_file = self.load_vif()?; + + if self.volume_info.read_only && !self.has_remote_file { + self.no_write_or_delete = true; + } + + if self.has_remote_file { + self.load_remote_dat_file()?; + if let Some(remote_file) = self.volume_info.files.first() { + if remote_file.modified_time > 0 { + self.last_modified_ts_seconds = remote_file.modified_time; + } else if let Ok(metadata) = fs::metadata(self.vif_path()) { + self.last_modified_ts_seconds = metadata + .modified() + .unwrap_or(SystemTime::UNIX_EPOCH) + .duration_since(UNIX_EPOCH) + .unwrap_or_default() + .as_secs(); + } + } + already_has_super_block = true; + } else if Path::new(&dat_path).exists() { + let metadata = fs::metadata(&dat_path)?; + + // Try to open read-write; fall back to read-only + match OpenOptions::new().read(true).write(true).open(&dat_path) { + Ok(file) => { + self.dat_file = Some(file); + } + Err(e) if e.kind() == io::ErrorKind::PermissionDenied => { + self.dat_file = Some(File::open(&dat_path)?); + self.no_write_or_delete = true; + } + Err(e) => return Err(e.into()), + } + + self.last_modified_ts_seconds = metadata + .modified() + .unwrap_or(SystemTime::UNIX_EPOCH) + .duration_since(UNIX_EPOCH) + .unwrap_or_default() + .as_secs(); + + if metadata.len() >= SUPER_BLOCK_SIZE as u64 { + already_has_super_block = true; + } + } else if create_dat_if_missing { + // Create directory if needed + if let Some(parent) = Path::new(&dat_path).parent() { + fs::create_dir_all(parent)?; + } + let file = OpenOptions::new() + .read(true) + .write(true) + .create(true) + .open(&dat_path)?; + if preallocate > 0 { + preallocate_file(&file, preallocate); + } + self.dat_file = Some(file); + } else { + return Err(VolumeError::Io(io::Error::new( + io::ErrorKind::NotFound, + format!("volume data file {} does not exist", dat_path), + ))); + } + + if already_has_super_block { + match self.read_super_block() { + Ok(()) => { + if !self.super_block.version.is_supported() { + return Err(VolumeError::UnsupportedVersion(self.super_block.version.0)); + } + // Match Go: v.volumeInfo.Version = uint32(v.SuperBlock.Version) + self.volume_info.version = self.super_block.version.0 as u32; + } + Err(e) if self.has_remote_file => { + warn!( + volume_id = self.id.0, + error = %e, + "failed to read remote super block during load" + ); + } + Err(e) => return Err(e), + } + } else { + self.maybe_write_super_block(version)?; + } + + if also_load_index { + self.load_index()?; + + // Match Go: CheckVolumeDataIntegrity after loading index (volume_loading.go L154-159) + // Only for non-remote volumes (remote storage may not have local .dat) + if !self.has_remote_file { + if let Err(e) = self.check_volume_data_integrity() { + self.no_write_or_delete = true; + warn!( + volume_id = self.id.0, + error = %e, + "volumeDataIntegrityChecking failed" + ); + } + } + } + + // Match Go: if no .vif file existed, create one with version and bytes_offset + if !has_volume_info_file { + self.volume_info.version = self.super_block.version.0 as u32; + self.volume_info.bytes_offset = OFFSET_SIZE as u32; + if let Err(e) = self.save_volume_info() { + warn!( + volume_id = self.id.0, + error = %e, + "failed to save volume info" + ); + } + } + + Ok(()) + } + + fn load_index(&mut self) -> Result<(), VolumeError> { + let use_redb = matches!( + self.needle_map_kind, + NeedleMapKind::LevelDb | NeedleMapKind::LevelDbMedium | NeedleMapKind::LevelDbLarge + ); + + let idx_path = self.file_name(".idx"); + + // Ensure idx directory exists + if let Some(parent) = Path::new(&idx_path).parent() { + fs::create_dir_all(parent)?; + } + + if use_redb { + self.load_index_redb(&idx_path)?; + } else { + self.load_index_inmemory(&idx_path)?; + } + + Ok(()) + } + + /// Load index using in-memory CompactNeedleMap. + fn load_index_inmemory(&mut self, idx_path: &str) -> Result<(), VolumeError> { + if self.no_write_or_delete { + // Open read-only + if Path::new(&idx_path).exists() { + let mut idx_file = File::open(&idx_path)?; + let nm = CompactNeedleMap::load_from_idx(&mut idx_file)?; + self.nm = Some(NeedleMap::InMemory(nm)); + } else { + // Missing .idx with existing .dat could orphan needles + let dat_path = self.file_name(".dat"); + if Path::new(&dat_path).exists() { + let dat_size = fs::metadata(&dat_path).map(|m| m.len()).unwrap_or(0); + if dat_size > SUPER_BLOCK_SIZE as u64 { + warn!( + volume_id = self.id.0, + ".idx file missing but .dat exists with data; needles may be orphaned" + ); + } + } + self.nm = Some(NeedleMap::InMemory(CompactNeedleMap::new())); + } + } else { + // Open read-write (create if missing) + let idx_file = OpenOptions::new() + .read(true) + .write(true) + .create(true) + .open(&idx_path)?; + + let idx_size = idx_file.metadata()?.len(); + let mut idx_reader = io::BufReader::new(&idx_file); + let mut nm = CompactNeedleMap::load_from_idx(&mut idx_reader)?; + + // Re-open for append-only writes + let write_file = OpenOptions::new() + .write(true) + .append(true) + .open(&idx_path)?; + nm.set_idx_file(Box::new(write_file), idx_size); + self.nm = Some(NeedleMap::InMemory(nm)); + } + + Ok(()) + } + + /// Load index using disk-backed RedbNeedleMap. + fn load_index_redb(&mut self, idx_path: &str) -> Result<(), VolumeError> { + // The redb database file is stored alongside the volume files + let rdb_path = self.file_name(".rdb"); + + if self.no_write_or_delete { + // Open read-only + if Path::new(&idx_path).exists() { + let mut idx_file = File::open(&idx_path)?; + let nm = RedbNeedleMap::load_from_idx(&rdb_path, &mut idx_file)?; + self.nm = Some(NeedleMap::Redb(nm)); + } else { + // Missing .idx with existing .dat could orphan needles + let dat_path = self.file_name(".dat"); + if Path::new(&dat_path).exists() { + let dat_size = fs::metadata(&dat_path).map(|m| m.len()).unwrap_or(0); + if dat_size > SUPER_BLOCK_SIZE as u64 { + warn!( + volume_id = self.id.0, + ".idx file missing but .dat exists with data; needles may be orphaned" + ); + } + } + self.nm = Some(NeedleMap::Redb(RedbNeedleMap::new(&rdb_path)?)); + } + } else { + // Open read-write (create if missing) + let idx_file = OpenOptions::new() + .read(true) + .write(true) + .create(true) + .open(&idx_path)?; + + let idx_size = idx_file.metadata()?.len(); + let mut idx_reader = io::BufReader::new(&idx_file); + let mut nm = RedbNeedleMap::load_from_idx(&rdb_path, &mut idx_reader)?; + + // Re-open for append-only writes + let write_file = OpenOptions::new() + .write(true) + .append(true) + .open(&idx_path)?; + nm.set_idx_file(Box::new(write_file), idx_size); + self.nm = Some(NeedleMap::Redb(nm)); + } + + Ok(()) + } + + fn load_remote_dat_file(&mut self) -> Result<(), VolumeError> { + let (storage_name, storage_key) = self.remote_storage_name_key(); + let backend = crate::remote_storage::s3_tier::global_s3_tier_registry() + .read() + .unwrap() + .get(&storage_name) + .ok_or_else(|| { + VolumeError::Io(io::Error::new( + io::ErrorKind::NotFound, + format!("remote tier backend {} not found", storage_name), + )) + })?; + + let remote_file = self.volume_info.files.first().ok_or_else(|| { + VolumeError::Io(io::Error::new( + io::ErrorKind::NotFound, + "remote volume has no remote file entries", + )) + })?; + + let file_size = if remote_file.file_size > 0 { + remote_file.file_size + } else if self.volume_info.dat_file_size > 0 { + self.volume_info.dat_file_size as u64 + } else { + return Err(VolumeError::Io(io::Error::new( + io::ErrorKind::InvalidData, + format!("remote volume {} is missing file size metadata", self.id.0), + ))); + }; + + self.dat_file = None; + self.remote_dat_file = Some(RemoteDatFile { + backend, + key: storage_key, + file_size, + modified_time: remote_file.modified_time, + }); + Ok(()) + } + + fn read_exact_at_backend(&self, buf: &mut [u8], offset: u64) -> Result<(), VolumeError> { + if let Some(dat_file) = self.dat_file.as_ref() { + #[cfg(unix)] + { + use std::os::unix::fs::FileExt; + dat_file.read_exact_at(buf, offset)?; + } + #[cfg(windows)] + { + read_exact_at(dat_file, buf, offset)?; + } + #[cfg(not(any(unix, windows)))] + { + compile_error!("Platform not supported: only unix and windows are supported"); + } + Ok(()) + } else if let Some(remote_dat_file) = self.remote_dat_file.as_ref() { + remote_dat_file.read_exact_at(buf, offset)?; + Ok(()) + } else { + Err(VolumeError::Io(io::Error::new( + io::ErrorKind::Other, + "dat file not open", + ))) + } + } + + /// Returns true when the volume has a data backend (local .dat file or + /// remote tiered storage). Mirrors Go's `v.DataBackend != nil` check. + pub fn has_data_backend(&self) -> bool { + self.dat_file.is_some() || self.remote_dat_file.is_some() + } + + fn current_dat_file_size(&self) -> io::Result { + if let Some(ref f) = self.dat_file { + Ok(f.metadata()?.len()) + } else if let Some(ref remote_dat_file) = self.remote_dat_file { + Ok(remote_dat_file.file_size) + } else { + Ok(0) + } + } + + /// Read a raw byte range from the current .dat backend. + /// + /// This matches Go paths that stream directly from `DataBackend`, including + /// remote-only tiered volumes whose `.dat` is no longer present locally. + pub fn read_dat_slice(&self, offset: u64, size: usize) -> Result, VolumeError> { + let _guard = self.data_file_access_control.read_lock(); + let dat_size = self.current_dat_file_size()?; + if size == 0 || offset >= dat_size { + return Ok(Vec::new()); + } + + let read_len = std::cmp::min(size as u64, dat_size - offset) as usize; + let mut buf = vec![0u8; read_len]; + self.read_exact_at_backend(&mut buf, offset)?; + Ok(buf) + } + + // ---- SuperBlock I/O ---- + + fn read_super_block(&mut self) -> Result<(), VolumeError> { + let mut header = [0u8; SUPER_BLOCK_SIZE]; + self.read_exact_at_backend(&mut header, 0)?; + + let extra_size = u16::from_be_bytes([header[6], header[7]]); + let total_size = SUPER_BLOCK_SIZE + extra_size as usize; + + let mut full_buf = vec![0u8; total_size]; + full_buf[..SUPER_BLOCK_SIZE].copy_from_slice(&header); + if extra_size > 0 { + self.read_exact_at_backend(&mut full_buf[SUPER_BLOCK_SIZE..], SUPER_BLOCK_SIZE as u64)?; + } + + self.super_block = SuperBlock::from_bytes(&full_buf)?; + + // Match Go: if volumeInfo.Replication is set, override super block's ReplicaPlacement + if !self.volume_info.replication.is_empty() { + let rp = ReplicaPlacement::from_string(&self.volume_info.replication)?; + self.super_block.replica_placement = rp; + } + + Ok(()) + } + + fn maybe_write_super_block(&mut self, version: Version) -> Result<(), VolumeError> { + let dat_file = self.dat_file.as_mut().ok_or_else(|| { + VolumeError::Io(io::Error::new(io::ErrorKind::Other, "dat file not open")) + })?; + + let dat_size = dat_file.metadata()?.len(); + if dat_size == 0 { + if !version.is_supported() { + return Err(VolumeError::UnsupportedVersion(version.0)); + } + self.super_block.version = version; + let bytes = self.super_block.to_bytes(); + dat_file.seek(SeekFrom::Start(0))?; + dat_file.write_all(&bytes)?; + dat_file.sync_all()?; + } + Ok(()) + } + + // ---- Read ---- + + /// Read a needle by its ID from the volume. + pub fn read_needle(&self, n: &mut Needle) -> Result { + let mut read_option = ReadOption::default(); + self.read_needle_with_option(n, &mut read_option) + } + + pub fn read_needle_opt(&self, n: &mut Needle, read_deleted: bool) -> Result { + let mut read_option = ReadOption { + read_deleted, + ..ReadOption::default() + }; + self.read_needle_with_option(n, &mut read_option) + } + + pub fn read_needle_with_option( + &self, + n: &mut Needle, + read_option: &mut ReadOption, + ) -> Result { + let _guard = self.data_file_access_control.read_lock(); + let nm = self.nm.as_ref().ok_or(VolumeError::NotFound)?; + let nv = nm.get(n.id).ok_or(VolumeError::NotFound)?; + + if nv.offset.is_zero() { + return Err(VolumeError::NotFound); + } + + let mut read_size = nv.size; + if read_size.is_deleted() { + if read_option.read_deleted && !read_size.is_tombstone() { + // Negate to get original size + read_size = Size(-read_size.0); + } else { + return Err(VolumeError::Deleted); + } + } + if read_size.0 == 0 { + return Ok(0); + } + + match self.read_needle_data_at_unlocked(n, nv.offset.to_actual_offset(), read_size, read_option) { + Ok(()) => self.check_read_write_error(None), + Err(VolumeError::Io(ref e)) => { + self.check_read_write_error(Some(e)); + return Err(VolumeError::Io(io::Error::new(e.kind(), e.to_string()))); + } + Err(e) => return Err(e), + } + + // TTL expiry check + if n.has_ttl() { + if let Some(ref ttl) = n.ttl { + let ttl_minutes = ttl.minutes(); + if ttl_minutes > 0 && n.has_last_modified_date() { + let expire_at_ns = n.append_at_ns + (ttl_minutes as u64) * 60 * 1_000_000_000; + let now_ns = SystemTime::now() + .duration_since(UNIX_EPOCH) + .unwrap_or_default() + .as_nanos() as u64; + if now_ns >= expire_at_ns { + return Err(VolumeError::NotFound); + } + } + } + } + + Ok(n.data_size as i32) + } + + /// Read needle data from .dat file at given offset. + pub fn read_needle_data_at( + &self, + n: &mut Needle, + offset: i64, + size: Size, + ) -> Result<(), VolumeError> { + let _guard = self.data_file_access_control.read_lock(); + let mut read_option = ReadOption::default(); + self.read_needle_data_at_unlocked(n, offset, size, &mut read_option) + } + + fn read_needle_data_at_unlocked( + &self, + n: &mut Needle, + offset: i64, + size: Size, + _read_option: &mut ReadOption, + ) -> Result<(), VolumeError> { + match self.read_needle_blob_and_parse(n, offset, size) { + Ok(()) => Ok(()), + #[cfg(not(feature = "5bytes"))] + Err(VolumeError::Needle(NeedleError::SizeMismatch { offset: o, .. })) + if o < MAX_POSSIBLE_VOLUME_SIZE as i64 => + { + // Double-read: in 4-byte offset mode, the actual data may be + // beyond 32GB due to offset wrapping. Retry at offset + 32GB. + self.read_needle_blob_and_parse(n, offset + MAX_POSSIBLE_VOLUME_SIZE as i64, size) + } + Err(e) => Err(e), + } + } + + fn read_needle_blob_and_parse( + &self, + n: &mut Needle, + offset: i64, + size: Size, + ) -> Result<(), VolumeError> { + let version = self.version(); + let actual_size = get_actual_size(size, version); + + let mut buf = vec![0u8; actual_size as usize]; + self.read_exact_at_backend(&mut buf, offset as u64)?; + + n.read_bytes(&mut buf, offset, size, version)?; + Ok(()) + } + + /// Read raw needle blob at a specific offset. + pub fn read_needle_blob(&self, offset: i64, size: Size) -> Result, VolumeError> { + let _guard = self.data_file_access_control.read_lock(); + self.read_needle_blob_unlocked(offset, size) + } + + fn read_needle_blob_unlocked(&self, offset: i64, size: Size) -> Result, VolumeError> { + let version = self.version(); + let actual_size = get_actual_size(size, version); + let mut buf = vec![0u8; actual_size as usize]; + self.read_exact_at_backend(&mut buf, offset as u64)?; + + Ok(buf) + } + + /// Read needle metadata at a specific offset without loading the data payload. + /// + /// Matches Go's `readNeedleMetaAt`, including the tombstone path where a + /// deleted idx entry passes a negative size and the tombstone record itself + /// is read as size 0 metadata. + pub fn read_needle_meta_at( + &self, + n: &mut Needle, + offset: i64, + size: Size, + ) -> Result<(), VolumeError> { + let _guard = self.data_file_access_control.read_lock(); + self.read_needle_meta_at_unlocked(n, offset, size) + } + + fn read_needle_meta_at_unlocked( + &self, + n: &mut Needle, + offset: i64, + size: Size, + ) -> Result<(), VolumeError> { + let normalized_size = if size.is_deleted() { Size(0) } else { size }; + match self.read_needle_meta_blob_and_parse(n, offset, normalized_size) { + Ok(()) => Ok(()), + #[cfg(not(feature = "5bytes"))] + Err(VolumeError::Needle(NeedleError::SizeMismatch { offset: o, .. })) + if o < MAX_POSSIBLE_VOLUME_SIZE as i64 => + { + self.read_needle_meta_blob_and_parse( + n, + offset + MAX_POSSIBLE_VOLUME_SIZE as i64, + normalized_size, + ) + } + Err(e) => Err(e), + } + } + + fn read_needle_meta_blob_and_parse( + &self, + n: &mut Needle, + offset: i64, + size: Size, + ) -> Result<(), VolumeError> { + let version = self.version(); + + // Step 1: Read only the first 20 bytes (header + DataSize). + // Matches Go's ReadNeedleMeta which reads NeedleHeaderSize+DataSizeSize first. + const HEADER_PREFIX: usize = NEEDLE_HEADER_SIZE + DATA_SIZE_SIZE; // 20 + let mut header_buf = [0u8; HEADER_PREFIX]; + self.read_exact_at_backend(&mut header_buf, offset as u64)?; + + // Parse header to get the needle's Size field for validation + let (_, _, found_size) = Needle::parse_header(&header_buf); + if found_size != size { + return Err(VolumeError::Needle(NeedleError::SizeMismatch { + offset, + id: n.id, + found: found_size, + expected: size, + })); + } + + // Step 2: Calculate how much meta tail to read (skip the data payload) + let actual_size = get_actual_size(size, version); + + if size.0 == 0 || version == VERSION_1 { + // Tombstone or V1: no body data section, tail starts right after header + let meta_size = actual_size - NEEDLE_HEADER_SIZE as i64; + if meta_size < 0 || meta_size > 128 * 1024 { + return Err(VolumeError::Io(io::Error::new( + io::ErrorKind::InvalidData, + format!( + "invalid needle meta size {}: DataSize=0, size={}, offset={}", + meta_size, size.0, offset + ), + ))); + } + let mut meta_buf = vec![0u8; meta_size as usize]; + self.read_exact_at_backend( + &mut meta_buf, + (offset + NEEDLE_HEADER_SIZE as i64) as u64, + )?; + n.read_paged_meta(&header_buf, &meta_buf, offset, size, version)?; + } else { + // V2/V3: extract DataSize from bytes 16..20 + let data_size = u32::from_be_bytes([ + header_buf[NEEDLE_HEADER_SIZE], + header_buf[NEEDLE_HEADER_SIZE + 1], + header_buf[NEEDLE_HEADER_SIZE + 2], + header_buf[NEEDLE_HEADER_SIZE + 3], + ]); + + // Skip past: header(16) + DataSize(4) + data(data_size) + let start_offset = + offset + NEEDLE_HEADER_SIZE as i64 + DATA_SIZE_SIZE as i64 + data_size as i64; + let stop_offset = offset + actual_size; + let meta_size = stop_offset - start_offset; + + // Sanity check: reject metadata sizes > 128KB (matching Go's ReadNeedleMeta guard) + if meta_size < 0 || meta_size > 128 * 1024 { + return Err(VolumeError::Io(io::Error::new( + io::ErrorKind::InvalidData, + format!( + "invalid needle meta size {}: DataSize={}, size={}, offset={}", + meta_size, data_size, size.0, offset + ), + ))); + } + + // Step 3: Read only the meta tail (skip the data payload entirely) + let mut meta_buf = vec![0u8; meta_size as usize]; + self.read_exact_at_backend(&mut meta_buf, start_offset as u64)?; + n.read_paged_meta(&header_buf, &meta_buf, offset, size, version)?; + } + + Ok(()) + } + + /// Read needle metadata (header + flags/name/mime/etc) without loading the data payload, + /// and return a `NeedleStreamInfo` that can be used to stream data directly from the dat file. + /// + /// This is used for large needles to avoid loading the entire payload into memory. + pub fn read_needle_stream_info( + &self, + n: &mut Needle, + read_deleted: bool, + ) -> Result { + let _guard = self.data_file_access_control.read_lock(); + let nm = self.nm.as_ref().ok_or(VolumeError::NotFound)?; + let nv = nm.get(n.id).ok_or(VolumeError::NotFound)?; + + if nv.offset.is_zero() { + return Err(VolumeError::NotFound); + } + + let mut read_size = nv.size; + if read_size.is_deleted() { + if read_deleted && !read_size.is_tombstone() { + read_size = Size(-read_size.0); + } else { + return Err(VolumeError::Deleted); + } + } + if read_size.0 == 0 { + return Err(VolumeError::NotFound); + } + + #[cfg_attr(feature = "5bytes", allow(unused_mut))] + let mut offset = nv.offset.to_actual_offset(); + let version = self.version(); + let actual_size = get_actual_size(read_size, version); + + // Read the full needle bytes (including data) for metadata parsing. + // We use read_bytes_meta_only which skips copying the data payload. + #[cfg_attr(feature = "5bytes", allow(unused_mut))] + let mut read_and_parse = |off: i64| -> Result<(), VolumeError> { + let mut buf = vec![0u8; actual_size as usize]; + self.read_exact_at_backend(&mut buf, off as u64)?; + n.read_bytes_meta_only(&mut buf, off, read_size, version)?; + Ok(()) + }; + + match read_and_parse(offset) { + Ok(()) => {} + #[cfg(not(feature = "5bytes"))] + Err(VolumeError::Needle(NeedleError::SizeMismatch { offset: o, .. })) + if o < MAX_POSSIBLE_VOLUME_SIZE as i64 => + { + offset += MAX_POSSIBLE_VOLUME_SIZE as i64; + read_and_parse(offset)?; + } + Err(e) => return Err(e), + } + + // TTL expiry check + if n.has_ttl() { + if let Some(ref ttl) = n.ttl { + let ttl_minutes = ttl.minutes(); + if ttl_minutes > 0 && n.has_last_modified_date() { + let expire_at_ns = n.append_at_ns + (ttl_minutes as u64) * 60 * 1_000_000_000; + let now_ns = SystemTime::now() + .duration_since(UNIX_EPOCH) + .unwrap_or_default() + .as_nanos() as u64; + if now_ns >= expire_at_ns { + return Err(VolumeError::NotFound); + } + } + } + } + + // For V1, data starts right after the header + // For V2/V3, data starts at header + 4 (DataSize field) + let data_file_offset = if version == VERSION_1 { + offset as u64 + NEEDLE_HEADER_SIZE as u64 + } else { + offset as u64 + NEEDLE_HEADER_SIZE as u64 + 4 // skip DataSize (4 bytes) + }; + + let source = match (self.dat_file.as_ref(), self.remote_dat_file.as_ref()) { + (Some(dat_file), _) => NeedleStreamSource::Local( + dat_file.try_clone().map_err(VolumeError::Io)?, + ), + (None, Some(remote_dat_file)) => NeedleStreamSource::Remote(remote_dat_file.clone()), + (None, None) => return Err(VolumeError::StreamingUnsupported), + }; + + Ok(NeedleStreamInfo { + source, + data_file_offset, + data_size: n.data_size, + data_file_access_control: self.data_file_access_control.clone(), + volume_id: self.id, + needle_id: n.id, + compaction_revision: self.super_block.compaction_revision, + }) + } + + /// Re-lookup a needle's data-file offset after compaction may have moved it. + /// + /// Returns `(new_data_file_offset, current_compaction_revision)` or an error + /// if the needle is no longer present / has been deleted. + /// + /// This matches Go's `readNeedleDataInto` behaviour: when the volume's + /// `CompactionRevision` changes between streaming chunks, the needle offset + /// is re-read from the needle map because compaction may have relocated it. + pub fn re_lookup_needle_data_offset( + &self, + needle_id: NeedleId, + ) -> Result<(u64, u16), VolumeError> { + let nm = self.nm.as_ref().ok_or(VolumeError::NotFound)?; + let nv = nm.get(needle_id).ok_or(VolumeError::NotFound)?; + if nv.offset.is_zero() { + return Err(VolumeError::NotFound); + } + if nv.size.is_deleted() { + return Err(VolumeError::Deleted); + } + + let offset = nv.offset.to_actual_offset(); + let version = self.version(); + + let data_file_offset = if version == VERSION_1 { + offset as u64 + NEEDLE_HEADER_SIZE as u64 + } else { + offset as u64 + NEEDLE_HEADER_SIZE as u64 + 4 // skip DataSize (4 bytes) + }; + + Ok((data_file_offset, self.super_block.compaction_revision)) + } + + // ---- Write ---- + + /// Write a needle to the volume (synchronous path). + pub fn write_needle( + &mut self, + n: &mut Needle, + check_cookie: bool, + ) -> Result<(u64, Size, bool), VolumeError> { + let _guard = self.data_file_access_control.write_lock(); + if self.is_read_only() { + return Err(VolumeError::ReadOnly); + } + + self.do_write_request(n, check_cookie) + } + + fn do_write_request( + &mut self, + n: &mut Needle, + check_cookie: bool, + ) -> Result<(u64, Size, bool), VolumeError> { + // TTL inheritance from volume (matching Go's writeNeedle2) + { + use crate::storage::needle::ttl::TTL; + let needle_ttl = n.ttl.unwrap_or(TTL::EMPTY); + if needle_ttl == TTL::EMPTY && self.super_block.ttl != TTL::EMPTY { + n.set_has_ttl(); + n.ttl = Some(self.super_block.ttl); + } + } + + // Ensure checksum is computed before dedup check + if n.checksum == crate::storage::needle::crc::CRC(0) && !n.data.is_empty() { + n.checksum = crate::storage::needle::crc::CRC::new(&n.data); + } + + // Dedup check (matches Go: n.DataSize = oldNeedle.DataSize on dedup) + if let Some(old_data_size) = self.is_file_unchanged(n) { + n.data_size = old_data_size; + return Ok((0, Size(n.data_size as i32), true)); + } + + // Cookie validation for existing needle (matches Go: check whenever nm.Get returns ok) + if let Some(nm) = &self.nm { + if let Some(nv) = nm.get(n.id) { + let mut existing = Needle::default(); + // Read only the header to check cookie + self.read_needle_header_unlocked(&mut existing, nv.offset.to_actual_offset())?; + + if n.cookie.0 == 0 && !check_cookie { + n.cookie = existing.cookie; + } + if existing.cookie != n.cookie { + return Err(VolumeError::CookieMismatch(n.cookie.0)); + } + } + } + + // Update append timestamp + n.append_at_ns = get_append_at_ns(self.last_append_at_ns); + + // Append to .dat file + let (offset, _body_size, _actual_size) = self.append_needle(n)?; + self.last_append_at_ns = n.append_at_ns; + + // Update needle map (uses n.size = full body size, matching Go's nm.Put) + let should_update = if let Some(nm) = &self.nm { + match nm.get(n.id) { + Some(nv) => (nv.offset.to_actual_offset() as u64) < offset, + None => true, + } + } else { + true + }; + + if should_update { + if let Some(nm) = &mut self.nm { + nm.put(n.id, Offset::from_actual_offset(offset as i64), n.size)?; + } + } + + if self.last_modified_ts_seconds < n.last_modified { + self.last_modified_ts_seconds = n.last_modified; + } + + // Return Size(n.DataSize) as the logical size, matching Go's doWriteRequest + Ok((offset, Size(n.data_size as i32), false)) + } + + fn read_needle_header_unlocked(&self, n: &mut Needle, offset: i64) -> Result<(), VolumeError> { + let mut header = [0u8; NEEDLE_HEADER_SIZE]; + self.read_exact_at_backend(&mut header, offset as u64)?; + + n.read_header(&header); + Ok(()) + } + + /// Check if the needle is unchanged from the existing one on disk. + /// Returns `Some(old_data_size)` if unchanged, `None` otherwise. + /// Matches Go's isFileUnchanged which also sets n.DataSize = oldNeedle.DataSize. + fn is_file_unchanged(&self, n: &Needle) -> Option { + // Don't dedup for volumes with TTL + if self.super_block.ttl != crate::storage::needle::ttl::TTL::EMPTY { + return None; + } + + if let Some(nm) = &self.nm { + if let Some(nv) = nm.get(n.id) { + if !nv.offset.is_zero() && nv.size.is_valid() { + let mut old = Needle::default(); + let mut ro = ReadOption::default(); + if self + .read_needle_data_at_unlocked( + &mut old, + nv.offset.to_actual_offset(), + nv.size, + &mut ro, + ) + .is_ok() + { + if old.cookie == n.cookie + && old.checksum == n.checksum + && old.data == n.data + { + return Some(old.data_size); + } + } + } + } + } + None + } + + /// Append a needle to the .dat file. Returns (offset, size, actual_size). + fn append_needle(&mut self, n: &mut Needle) -> Result<(u64, Size, i64), VolumeError> { + let version = self.version(); + let bytes = n.write_bytes(version); + let actual_size = bytes.len() as i64; + + let dat_file = self.dat_file.as_mut().ok_or_else(|| { + VolumeError::Io(io::Error::new(io::ErrorKind::Other, "dat file not open")) + })?; + + let offset = dat_file.seek(SeekFrom::End(0))?; + + // Check volume size limit before writing (matching Go's Append) + if offset >= MAX_POSSIBLE_VOLUME_SIZE && !n.data.is_empty() { + return Err(VolumeError::SizeLimitExceeded { + current: offset, + limit: MAX_POSSIBLE_VOLUME_SIZE, + }); + } + + if let Err(e) = dat_file.write_all(&bytes) { + // Truncate back to pre-write position on error (matching Go) + let _ = dat_file.set_len(offset); + self.check_read_write_error(Some(&e)); + return Err(VolumeError::Io(e)); + } + self.check_read_write_error(None); + + Ok((offset, n.size, actual_size)) + } + + // ---- Delete ---- + + /// Delete a needle from the volume. + pub fn delete_needle(&mut self, n: &mut Needle) -> Result { + let _guard = self.data_file_access_control.write_lock(); + if self.no_write_or_delete { + return Err(VolumeError::ReadOnly); + } + self.do_delete_request(n) + } + + fn do_delete_request(&mut self, n: &mut Needle) -> Result { + let (found, size, _stored_offset) = if let Some(nm) = &self.nm { + if let Some(nv) = nm.get(n.id) { + if !nv.size.is_deleted() { + (true, nv.size, nv.offset) + } else { + (false, Size(0), Offset::default()) + } + } else { + (false, Size(0), Offset::default()) + } + } else { + return Ok(Size(0)); + }; + + if !found { + return Ok(Size(0)); + } + + // Write tombstone: append needle with empty data + n.data = vec![]; + n.append_at_ns = get_append_at_ns(self.last_append_at_ns); + + let offset = if !self.has_remote_file { + // Normal volume: append tombstone to .dat file + let (offset, _, _) = self.append_needle(n)?; + offset + } else { + // Remote-tiered volume: skip .dat append, use offset 0 + 0 + }; + self.last_append_at_ns = n.append_at_ns; + + // Update index + if let Some(nm) = &mut self.nm { + nm.delete(n.id, Offset::from_actual_offset(offset as i64))?; + } + + Ok(size) + } + + // ---- Metrics ---- + + pub fn content_size(&self) -> u64 { + self.nm.as_ref().map_or(0, |nm| nm.content_size()) + } + + pub fn deleted_size(&self) -> u64 { + self.nm.as_ref().map_or(0, |nm| nm.deleted_size()) + } + + pub fn file_count(&self) -> i64 { + self.nm.as_ref().map_or(0, |nm| nm.file_count()) + } + + pub fn deleted_count(&self) -> i64 { + self.nm.as_ref().map_or(0, |nm| nm.deleted_count()) + } + + pub fn max_file_key(&self) -> NeedleId { + self.nm.as_ref().map_or(NeedleId(0), |nm| nm.max_file_key()) + } + + pub fn is_read_only(&self) -> bool { + self.no_write_or_delete + || self.no_write_can_delete + || self.location_disk_space_low.load(Ordering::Relaxed) + } + + pub fn is_no_write_or_delete(&self) -> bool { + self.no_write_or_delete + } + + pub fn is_no_write_can_delete(&self) -> bool { + self.no_write_can_delete + } + + pub fn last_compact_revision(&self) -> u16 { + self.last_compact_revision + } + + pub fn last_modified_ts(&self) -> u64 { + self.last_modified_ts_seconds + } + + pub fn is_expired(&self, volume_size: u64, volume_size_limit: u64) -> bool { + if volume_size_limit == 0 { + return false; + } + if volume_size <= SUPER_BLOCK_SIZE as u64 { + return false; + } + let ttl_minutes = self.super_block.ttl.minutes(); + if ttl_minutes == 0 { + return false; + } + let lived_minutes = SystemTime::now() + .duration_since(UNIX_EPOCH) + .unwrap_or_default() + .as_secs() + .saturating_sub(self.last_modified_ts_seconds) + / 60; + (ttl_minutes as u64) < lived_minutes + } + + pub fn is_expired_long_enough(&self, max_delay_minutes: u32) -> bool { + let ttl_minutes = self.super_block.ttl.minutes(); + if ttl_minutes == 0 { + return false; + } + let removal_delay = std::cmp::min(ttl_minutes / 10, max_delay_minutes); + ((ttl_minutes + removal_delay) as u64) * 60 + self.last_modified_ts_seconds + < SystemTime::now() + .duration_since(UNIX_EPOCH) + .unwrap_or_default() + .as_secs() + } + + /// Read all live needles from the volume (for ReadAllNeedles streaming RPC). + pub fn read_all_needles(&self) -> Result, VolumeError> { + let _guard = self.data_file_access_control.read_lock(); + let nm = self.nm.as_ref().ok_or(VolumeError::NotFound)?; + let version = self.version(); + let dat_size = self.current_dat_file_size()? as i64; + let mut needles = Vec::new(); + let mut offset = self.super_block.block_size() as i64; + + while offset < dat_size { + let mut header = [0u8; NEEDLE_HEADER_SIZE]; + match self.read_exact_at_backend(&mut header, offset as u64) { + Ok(()) => {} + Err(VolumeError::Io(e)) if e.kind() == io::ErrorKind::UnexpectedEof => break, + Err(e) => return Err(e), + } + + let (_cookie, key, size) = Needle::parse_header(&header); + if size.0 == 0 && key.is_empty() { + break; + } + + let body_length = needle::needle_body_length(size, version); + let total_size = NEEDLE_HEADER_SIZE as i64 + body_length as i64; + + if size.is_deleted() || size.0 <= 0 { + offset += total_size; + continue; + } + + let Some(nv) = nm.get(key) else { + offset += total_size; + continue; + }; + if nv.offset.to_actual_offset() != offset { + offset += total_size; + continue; + } + + let mut n = Needle { + id: key, + ..Needle::default() + }; + let mut read_option = ReadOption::default(); + self.read_needle_data_at_unlocked(&mut n, offset, size, &mut read_option)?; + needles.push(n); + + offset += total_size; + } + Ok(needles) + } + + /// Check volume data integrity by verifying the last index entries against the .dat file. + /// Matches Go's CheckVolumeDataIntegrity (volume_checking.go L117-141). + /// Reads the last few index entries, verifies each needle header is readable and + /// consistent. On failure, marks the volume read-only. + fn check_volume_data_integrity(&mut self) -> Result<(), VolumeError> { + let idx_path = self.file_name(".idx"); + if !Path::new(&idx_path).exists() { + return Ok(()); + } + + let idx_size = fs::metadata(&idx_path).map(|m| m.len()).unwrap_or(0) as i64; + if idx_size == 0 { + return Ok(()); + } + if idx_size % NEEDLE_MAP_ENTRY_SIZE as i64 != 0 { + return Err(VolumeError::Io(io::Error::new( + io::ErrorKind::InvalidData, + format!( + "index file's size is {} bytes, maybe corrupted", + idx_size + ), + ))); + } + + let version = self.version(); + + // Check last 10 index entries (matching Go's CheckVolumeDataIntegrity). + // Go starts healthyIndexSize = indexSize and reduces on EOF. + // On success: break (err != ErrorSizeMismatch when err == nil). + // On EOF: set healthyIndexSize = position of corrupt entry, continue. + // On ErrorSizeMismatch: continue (try next entry). + // After loop: if healthyIndexSize < indexSize → error. + let mut idx_file = File::open(&idx_path)?; + let max_entries = std::cmp::min(10, idx_size / NEEDLE_MAP_ENTRY_SIZE as i64); + let mut healthy_index_size: i64 = idx_size; + + for i in 1..=max_entries { + let entry_offset = idx_size - i * NEEDLE_MAP_ENTRY_SIZE as i64; + let mut buf = [0u8; NEEDLE_MAP_ENTRY_SIZE]; + idx_file.seek(SeekFrom::Start(entry_offset as u64))?; + idx_file.read_exact(&mut buf)?; + + let (key, offset, size) = idx_entry_from_bytes(&buf); + if offset.is_zero() { + continue; + } + + let actual_offset = offset.to_actual_offset() as u64; + + // Read needle header at the offset + let mut header = [0u8; NEEDLE_HEADER_SIZE]; + match self.read_exact_at_backend(&mut header, actual_offset) { + Ok(()) => {} + Err(VolumeError::Io(e)) if e.kind() == io::ErrorKind::UnexpectedEof => { + // Match Go: on EOF, mark this entry as corrupt and continue + // checking earlier entries (healthyIndexSize tracks the boundary). + healthy_index_size = entry_offset; + continue; + } + Err(e) => return Err(e), + } + + let (_cookie, needle_id, needle_size) = Needle::parse_header(&header); + + // Verify the needle ID matches the index entry + if !key.is_empty() && needle_id != key { + return Err(VolumeError::Io(io::Error::new( + io::ErrorKind::InvalidData, + format!( + "index key {:?} does not match needle Id {:?} at offset {}", + key, needle_id, actual_offset + ), + ))); + } + + // For non-deleted entries, verify the size matches + if !size.is_deleted() && size.0 > 0 && needle_size.0 != size.0 { + // Try with MaxPossibleVolumeSize offset adjustment (Go parity) + let alt_offset = actual_offset + MAX_POSSIBLE_VOLUME_SIZE as u64; + let mut alt_header = [0u8; NEEDLE_HEADER_SIZE]; + if self + .read_exact_at_backend(&mut alt_header, alt_offset) + .is_ok() + { + let (_, _, alt_size) = Needle::parse_header(&alt_header); + if alt_size.0 == size.0 { + continue; + } + } + // Match Go: ErrorSizeMismatch breaks out of the loop + break; + } + + // If V3, try to read the append timestamp from the last verified entry. + // Go reads AppendAtNs from both live and deleted (tombstone) entries + // via verifyNeedleIntegrity and verifyDeletedNeedleIntegrity. + if version == VERSION_3 { + // For tombstones (deleted), body size on disk is 0. + // For live entries, body size is size.0. + let body_size = if size.is_deleted() { 0u64 } else { size.0 as u64 }; + let ts_offset = + actual_offset + NEEDLE_HEADER_SIZE as u64 + body_size + 4; // skip checksum + let mut ts_buf = [0u8; 8]; + if self.read_exact_at_backend(&mut ts_buf, ts_offset).is_ok() { + let ts = u64::from_be_bytes(ts_buf); + if ts > 0 { + self.last_append_at_ns = ts; + } + } + } + } + + // Match Go: if healthyIndexSize < indexSize, trailing entries are corrupt + if healthy_index_size < idx_size { + return Err(VolumeError::Io(io::Error::new( + io::ErrorKind::InvalidData, + format!( + "healthy index size {} is less than expected {}", + healthy_index_size, idx_size + ), + ))); + } + + Ok(()) + } + + /// Scrub the volume index by verifying each needle map entry against the dat file. + /// For each entry, reads only the 16-byte needle header at the given offset to verify: + /// correct needle ID, correct cookie (non-zero), and valid size. + /// Does NOT read/verify the full needle data or CRC. + /// Returns (files_checked, broken_needles) tuple. + pub fn scrub_index(&self) -> Result<(u64, Vec), VolumeError> { + if self.dat_file.is_none() && self.remote_dat_file.is_none() { + return Err(VolumeError::NotFound); + } + let nm = self.nm.as_ref().ok_or(VolumeError::NotFound)?; + let dat_size = self.dat_file_size().map_err(VolumeError::Io)?; + + let mut files_checked: u64 = 0; + let mut broken = Vec::new(); + + for (needle_id, nv) in nm.iter_entries() { + if nv.offset.is_zero() || nv.size.is_deleted() { + continue; + } + + let offset = nv.offset.to_actual_offset(); + if offset < 0 || offset as u64 >= dat_size { + broken.push(format!( + "needle {} offset {} out of range (dat_size={})", + needle_id.0, offset, dat_size + )); + continue; + } + + // Read only the 16-byte needle header to verify ID, cookie, and size + let mut header_buf = [0u8; NEEDLE_HEADER_SIZE]; + match self.read_exact_at_backend(&mut header_buf, offset as u64) { + Ok(()) => { + let (cookie, id, size) = Needle::parse_header(&header_buf); + if id != needle_id { + broken.push(format!( + "needle {} header id mismatch: expected {}, got {}", + needle_id.0, needle_id.0, id.0 + )); + } else if cookie.0 == 0 { + broken.push(format!( + "needle {} has zero cookie at offset {}", + needle_id.0, offset + )); + } else if size.0 <= 0 && !nv.size.is_deleted() { + broken.push(format!( + "needle {} has invalid size {} at offset {}", + needle_id.0, size.0, offset + )); + } + } + Err(e) => { + broken.push(format!("needle {} read header error: {}", needle_id.0, e)); + } + } + + files_checked += 1; + } + + Ok((files_checked, broken)) + } + + /// Scrub the volume by reading and verifying all needles. + /// Returns (files_checked, broken_needles) tuple. + /// Each needle is read from disk and its CRC checksum is verified. + pub fn scrub(&self) -> Result<(u64, Vec), VolumeError> { + if self.dat_file.is_none() && self.remote_dat_file.is_none() { + return Err(VolumeError::NotFound); + } + let nm = self.nm.as_ref().ok_or(VolumeError::NotFound)?; + + let dat_size = self.dat_file_size().map_err(|e| VolumeError::Io(e))?; + let version = self.version(); + + let mut files_checked: u64 = 0; + let mut broken = Vec::new(); + let mut total_read: i64 = 0; + + for (needle_id, nv) in nm.iter_entries() { + if nv.offset.is_zero() { + continue; + } + + // Accumulate actual needle size for ALL entries including deleted ones + // (matches Go: deleted needles still occupy space in the .dat file). + total_read += get_actual_size(nv.size, version); + + if nv.size.is_deleted() { + continue; + } + + let offset = nv.offset.to_actual_offset(); + if offset < 0 || offset as u64 >= dat_size { + broken.push(format!( + "needle {} offset {} out of range (dat_size={})", + needle_id.0, offset, dat_size + )); + continue; + } + + // Read and verify the needle (read_needle_data_at checks CRC via read_bytes/read_tail) + let mut n = Needle { + id: needle_id, + ..Needle::default() + }; + match self.read_needle_data_at(&mut n, offset, nv.size) { + Ok(_) => {} + Err(e) => { + broken.push(format!("needle {} error: {}", needle_id.0, e)); + } + } + + files_checked += 1; + } + + // Validate total data size against .dat file size (matches Go's scrubVolumeData) + let expected_size = total_read + SUPER_BLOCK_SIZE as i64; + if (dat_size as i64) < expected_size { + broken.push(format!( + "dat file size {} is smaller than expected {} (total_read {} + super_block {})", + dat_size, expected_size, total_read, SUPER_BLOCK_SIZE + )); + } else if dat_size as i64 != expected_size { + broken.push(format!( + "warning: dat file size {} does not match expected {} (total_read {} + super_block {})", + dat_size, expected_size, total_read, SUPER_BLOCK_SIZE + )); + } + + Ok((files_checked, broken)) + } + + /// Scan raw needle entries from the .dat file starting at `from_offset`. + /// Returns (needle_header_bytes, needle_body_bytes, append_at_ns) for each needle. + /// Used by VolumeTailSender to stream raw bytes. + pub fn scan_raw_needles_from( + &self, + from_offset: u64, + ) -> Result, Vec, u64)>, VolumeError> { + let version = self.version(); + let dat_size = self.current_dat_file_size()?; + let mut entries = Vec::new(); + let mut offset = from_offset; + + while offset < dat_size { + // Read needle header (16 bytes) + let mut header = [0u8; NEEDLE_HEADER_SIZE]; + match self.read_exact_at_backend(&mut header, offset) { + Ok(()) => {} + Err(VolumeError::Io(e)) if e.kind() == io::ErrorKind::UnexpectedEof => break, + Err(e) => return Err(e.into()), + } + + let (_cookie, _id, size) = Needle::parse_header(&header); + if size.0 == 0 && _id.is_empty() { + break; + } + + let body_length = needle::needle_body_length(size, version); + let total_size = NEEDLE_HEADER_SIZE as u64 + body_length as u64; + + // Match Go's ScanVolumeFileFrom: visit ALL needles including deleted ones. + // This is critical for incremental copy where tombstones must be propagated. + + // Read body bytes + let mut body = vec![0u8; body_length as usize]; + match self.read_exact_at_backend(&mut body, offset + NEEDLE_HEADER_SIZE as u64) { + Ok(()) => {} + Err(VolumeError::Io(e)) if e.kind() == io::ErrorKind::UnexpectedEof => break, + Err(e) => return Err(e.into()), + } + + // Parse the needle to get append_at_ns + let mut full = vec![0u8; total_size as usize]; + full[..NEEDLE_HEADER_SIZE].copy_from_slice(&header); + full[NEEDLE_HEADER_SIZE..].copy_from_slice(&body); + let mut n = Needle::default(); + let _ = n.read_bytes(&full, offset as i64, size, version); + + entries.push((header.to_vec(), body, n.append_at_ns)); + offset += total_size; + } + + Ok(entries) + } + + /// Insert or update a needle index entry (for low-level blob writes). + pub fn put_needle_index( + &mut self, + key: NeedleId, + offset: Offset, + size: Size, + ) -> Result<(), VolumeError> { + if let Some(ref mut nm) = self.nm { + nm.put(key, offset, size).map_err(VolumeError::Io)?; + } + Ok(()) + } + + /// Mark this volume as read-only (no writes or deletes). + /// If `persist` is true, the readonly state is saved to the .vif file. + pub fn set_read_only(&mut self) -> Result<(), VolumeError> { + self.no_write_or_delete = true; + self.save_vif() + } + + /// Mark this volume as read-only, optionally persisting to .vif. + pub fn set_read_only_persist(&mut self, persist: bool) -> Result<(), VolumeError> { + self.no_write_or_delete = true; + if persist { + self.save_vif()?; + } + Ok(()) + } + + /// Mark this volume as writable (allow writes and deletes). + pub fn set_writable(&mut self) -> Result<(), VolumeError> { + self.no_write_or_delete = false; + self.save_vif() + } + + /// Recompute the Go-style write/delete mode from the current remote tier state. + pub fn refresh_remote_write_mode(&mut self) { + self.has_remote_file = !self.volume_info.files.is_empty(); + if self.has_remote_file { + self.no_write_can_delete = true; + self.no_write_or_delete = false; + } else { + self.no_write_can_delete = false; + } + } + + /// Close the local .dat file handle (matches Go's v.DataBackend.Close() in LoadRemoteFile). + /// Called after tier-upload when the local file is being replaced by remote storage. + pub fn close_local_dat_backend(&mut self) { + self.dat_file = None; + } + + /// Close the remote dat file backend (matches Go's v.DataBackend.Close(); v.DataBackend = nil). + /// Called after tier-download when the remote backend is being replaced by local storage. + pub fn close_remote_dat_backend(&mut self) { + self.remote_dat_file = None; + } + + /// Path to .vif file. + fn vif_path(&self) -> String { + format!("{}.vif", self.data_file_name()) + } + + /// Load volume info from .vif file. + /// Supports both the protobuf-JSON format (Go-compatible) and legacy JSON. + /// Returns true if a .vif file was found and successfully loaded. + fn load_vif(&mut self) -> Result { + let path = self.vif_path(); + if let Ok(content) = fs::read_to_string(&path) { + if content.trim().is_empty() { + return Ok(false); + } + // Try protobuf-JSON (Go-compatible VolumeInfo via VifVolumeInfo) + if let Ok(vif_info) = serde_json::from_str::(&content) { + let pb_info = vif_info.to_pb(); + if pb_info.read_only { + self.no_write_or_delete = true; + } + self.volume_info = pb_info; + self.refresh_remote_write_mode(); + if self.volume_info.version == 0 { + self.volume_info.version = Version::current().0 as u32; + } + if !self.has_remote_file && self.volume_info.bytes_offset == 0 { + self.volume_info.bytes_offset = OFFSET_SIZE as u32; + } + if self.volume_info.bytes_offset != 0 + && self.volume_info.bytes_offset != OFFSET_SIZE as u32 + { + return Err(VolumeError::Io(io::Error::new( + io::ErrorKind::InvalidData, + format!( + "bytes_offset mismatch in {}: found {}, expected {}", + path, self.volume_info.bytes_offset, OFFSET_SIZE + ), + ))); + } + return Ok(true); + } + // Fall back to OldVersionVolumeInfo (Go's tryOldVersionVolumeInfo): + // maps DestroyTime -> expire_at_sec + if let Ok(old_info) = serde_json::from_str::(&content) { + let vif_info = old_info.to_vif(); + let pb_info = vif_info.to_pb(); + if pb_info.read_only { + self.no_write_or_delete = true; + } + self.volume_info = pb_info; + self.refresh_remote_write_mode(); + if self.volume_info.version == 0 { + self.volume_info.version = Version::current().0 as u32; + } + if !self.has_remote_file && self.volume_info.bytes_offset == 0 { + self.volume_info.bytes_offset = OFFSET_SIZE as u32; + } + if self.volume_info.bytes_offset != 0 + && self.volume_info.bytes_offset != OFFSET_SIZE as u32 + { + return Err(VolumeError::Io(io::Error::new( + io::ErrorKind::InvalidData, + format!( + "bytes_offset mismatch in {}: found {}, expected {}", + path, self.volume_info.bytes_offset, OFFSET_SIZE + ), + ))); + } + return Ok(true); + } + // Fall back to legacy format + if let Ok(info) = serde_json::from_str::(&content) { + if info.read_only { + self.no_write_or_delete = true; + } + return Ok(true); + } + } + Ok(false) + } + + /// Save volume info to .vif file in protobuf-JSON format (Go-compatible). + /// Matches Go's SaveVolumeInfo: checks writability before writing and propagates errors. + fn save_vif(&self) -> Result<(), VolumeError> { + let vif_path = self.vif_path(); + + // Match Go: if file exists but is not writable, return an error + let path = std::path::Path::new(&vif_path); + if path.exists() { + let metadata = fs::metadata(path)?; + if metadata.permissions().readonly() { + return Err(VolumeError::Io(io::Error::new( + io::ErrorKind::PermissionDenied, + format!("failed to check {} not writable", vif_path), + ))); + } + } + + let mut vif = VifVolumeInfo::from_pb(&self.volume_info); + vif.read_only = self.no_write_or_delete; + + // Match Go's SaveVolumeInfo: compute ExpireAtSec from TTL + let ttl_seconds = self.super_block.ttl.to_seconds(); + if ttl_seconds > 0 { + let now = SystemTime::now() + .duration_since(UNIX_EPOCH) + .unwrap_or_default() + .as_secs(); + vif.expire_at_sec = now + ttl_seconds; + } + + let content = serde_json::to_string_pretty(&vif) + .map_err(|e| VolumeError::Io(io::Error::new(io::ErrorKind::Other, e.to_string())))?; + fs::write(&vif_path, content)?; + Ok(()) + } + + /// Save full VolumeInfo to .vif file (for tiered storage). + /// Matches Go's SaveVolumeInfo which computes ExpireAtSec from TTL. + pub fn save_volume_info(&mut self) -> Result<(), VolumeError> { + self.volume_info.read_only = self.no_write_or_delete; + + // Compute ExpireAtSec from TTL (matches Go's SaveVolumeInfo) + let ttl_seconds = self.super_block.ttl.to_seconds(); + if ttl_seconds > 0 { + let now = SystemTime::now() + .duration_since(UNIX_EPOCH) + .unwrap_or_default() + .as_secs(); + self.volume_info.expire_at_sec = now + ttl_seconds; + } + + let vif = VifVolumeInfo::from_pb(&self.volume_info); + let content = serde_json::to_string_pretty(&vif) + .map_err(|e| VolumeError::Io(io::Error::new(io::ErrorKind::Other, e.to_string())))?; + fs::write(&self.vif_path(), content)?; + Ok(()) + } + + /// Get the remote storage backend name and key from this volume .vif. + pub fn remote_storage_name_key(&self) -> (String, String) { + if self.volume_info.files.is_empty() { + return (String::new(), String::new()); + } + let rf = &self.volume_info.files[0]; + let backend_name = if rf.backend_id.is_empty() { + rf.backend_type.clone() + } else { + format!("{}.{}", rf.backend_type, rf.backend_id) + }; + (backend_name, rf.key.clone()) + } + + /// Get the dat file path for this volume. + pub fn dat_path(&self) -> String { + self.file_name(".dat") + } + + /// Get the directory this volume is stored in. + pub fn dir(&self) -> &str { + &self.dir + } + + /// Throttle IO during compaction to avoid saturating disk. + pub fn maybe_throttle_compaction(&self, bytes_written: u64) { + if self.compaction_byte_per_second <= 0 || !self.is_compacting { + return; + } + // Simple throttle: sleep based on bytes written vs allowed rate + let sleep_us = + (bytes_written as f64 / self.compaction_byte_per_second as f64 * 1_000_000.0) as u64; + if sleep_us > 0 { + std::thread::sleep(std::time::Duration::from_micros(sleep_us)); + } + } + + /// Change the replication placement and rewrite the super block. + pub fn set_replica_placement(&mut self, rp: ReplicaPlacement) -> Result<(), VolumeError> { + self.super_block.replica_placement = rp; + let bytes = self.super_block.to_bytes(); + let dat_file = self.dat_file.as_mut().ok_or_else(|| { + VolumeError::Io(io::Error::new(io::ErrorKind::Other, "dat file not open")) + })?; + dat_file.seek(SeekFrom::Start(0))?; + dat_file.write_all(&bytes)?; + dat_file.sync_all()?; + Ok(()) + } + + // ---- Binary search for incremental copy ---- + + /// Read a single index entry's offset from the .idx file by entry index. + fn read_offset_from_index(&self, m: i64) -> Result { + let idx_path = self.file_name(".idx"); + let idx_file = File::open(&idx_path)?; + let mut buf = [0u8; NEEDLE_MAP_ENTRY_SIZE]; + let file_offset = m as u64 * NEEDLE_MAP_ENTRY_SIZE as u64; + #[cfg(unix)] + { + use std::os::unix::fs::FileExt; + idx_file.read_exact_at(&mut buf, file_offset)?; + } + #[cfg(not(unix))] + { + let mut f = idx_file; + f.seek(SeekFrom::Start(file_offset))?; + std::io::Read::read_exact(&mut f, &mut buf)?; + } + let (_key, offset, _size) = idx_entry_from_bytes(&buf); + Ok(offset) + } + + /// Read the append_at_ns timestamp from a needle at the given offset in the .dat file. + /// Go reads the full needle body for ALL entries including tombstones to get the + /// actual AppendAtNs timestamp, which is needed for correct binary search during + /// incremental copy. + fn read_append_at_ns(&self, offset: Offset) -> Result { + let actual_offset = offset.to_actual_offset() as u64; + let version = self.version(); + + let mut header_buf = [0u8; NEEDLE_HEADER_SIZE]; + self.read_exact_at_backend(&mut header_buf, actual_offset)?; + + let (_cookie, _id, size) = Needle::parse_header(&header_buf); + + let actual_size = get_actual_size(size, version); + if actual_size <= 0 { + return Ok(0); + } + let mut buf = vec![0u8; actual_size as usize]; + self.read_exact_at_backend(&mut buf, actual_offset)?; + + let mut n = Needle::default(); + n.read_bytes_meta_only(&mut buf, offset.to_actual_offset(), size, version)?; + Ok(n.append_at_ns) + } + + /// Search right from position m to find the first non-deleted entry. + fn read_right_ns(&self, m: i64, max: i64) -> Result<(i64, Offset, u64), VolumeError> { + let mut index = m; + loop { + index += 1; + if index >= max { + return Ok((index, Offset::default(), 0)); + } + let offset = self.read_offset_from_index(index)?; + if !offset.is_zero() { + let ts = self.read_append_at_ns(offset)?; + return Ok((index, offset, ts)); + } + } + } + + /// Search left from position m to find the first non-deleted entry. + fn read_left_ns(&self, m: i64) -> Result<(i64, Offset, u64), VolumeError> { + let mut index = m; + loop { + index -= 1; + if index < 0 { + return Ok((index, Offset::default(), 0)); + } + let offset = self.read_offset_from_index(index)?; + if !offset.is_zero() { + let ts = self.read_append_at_ns(offset)?; + return Ok((index, offset, ts)); + } + } + } + + /// Binary search through the .idx file to find the first needle + /// with append_at_ns > since_ns. Returns (offset, is_last). + /// Matches Go's BinarySearchByAppendAtNs in volume_backup.go. + pub fn binary_search_by_append_at_ns( + &self, + since_ns: u64, + ) -> Result<(Offset, bool), VolumeError> { + let file_size = self.idx_file_size() as i64; + if file_size % NEEDLE_MAP_ENTRY_SIZE as i64 != 0 { + return Err(VolumeError::Io(io::Error::new( + io::ErrorKind::InvalidData, + format!("unexpected idx file size: {}", file_size), + ))); + } + + let entry_count = file_size / NEEDLE_MAP_ENTRY_SIZE as i64; + let mut l: i64 = 0; + let mut h: i64 = entry_count; + + while l < h { + let m = (l + h) / 2; + + if m == entry_count { + return Ok((Offset::default(), true)); + } + + let offset = self.read_offset_from_index(m)?; + + if offset.is_zero() { + let (left_index, _left_offset, left_ns) = self.read_left_ns(m)?; + let (right_index, right_offset, right_ns) = self.read_right_ns(m, entry_count)?; + + if right_ns <= since_ns { + l = right_index; + if l == entry_count { + return Ok((Offset::default(), true)); + } else { + continue; + } + } + if since_ns < left_ns { + h = left_index + 1; + continue; + } + return Ok((right_offset, false)); + } + + let m_ns = self.read_append_at_ns(offset)?; + + if m_ns <= since_ns { + l = m + 1; + } else { + h = m; + } + } + + if l == entry_count { + return Ok((Offset::default(), true)); + } + + let offset = self.read_offset_from_index(l)?; + Ok((offset, false)) + } + + /// Write a raw needle blob at a specific offset in the .dat file. + pub fn write_needle_blob( + &mut self, + offset: i64, + needle_blob: &[u8], + ) -> Result<(), VolumeError> { + if self.is_read_only() { + return Err(VolumeError::ReadOnly); + } + let dat_file = self.dat_file.as_mut().ok_or_else(|| { + VolumeError::Io(io::Error::new(io::ErrorKind::Other, "dat file not open")) + })?; + dat_file.seek(SeekFrom::Start(offset as u64))?; + dat_file.write_all(needle_blob)?; + Ok(()) + } + + /// Write a needle blob and update the needle map index. + /// Matches Go's Volume.WriteNeedleBlob which appends to dat and calls nm.Put. + pub fn write_needle_blob_and_index( + &mut self, + needle_id: NeedleId, + needle_blob: &[u8], + size: Size, + ) -> Result<(), VolumeError> { + // Dedup check: if the same needle already exists with matching content, skip the write. + // Matches Go's WriteNeedleBlob which reads existing needle and compares cookie+checksum+data. + if let Some(nm) = &self.nm { + if let Some(nv) = nm.get(needle_id) { + if nv.size == size { + let version = self.version(); + // Read existing needle from disk + let mut old_needle = Needle::default(); + let mut ro = ReadOption::default(); + if self + .read_needle_data_at_unlocked( + &mut old_needle, + nv.offset.to_actual_offset(), + nv.size, + &mut ro, + ) + .is_ok() + { + // Parse the incoming blob into a needle + let mut new_needle = Needle::default(); + if new_needle + .read_bytes(needle_blob, nv.offset.to_actual_offset(), size, version) + .is_ok() + { + if old_needle.cookie == new_needle.cookie + && old_needle.checksum == new_needle.checksum + && old_needle.data == new_needle.data + { + return Ok(()); + } + } + } + } + } + } + + // Check volume size limit + let content_size = self.content_size(); + if MAX_POSSIBLE_VOLUME_SIZE < content_size + needle_blob.len() as u64 { + return Err(VolumeError::Io(io::Error::new( + io::ErrorKind::Other, + format!( + "volume size limit {} exceeded! current size is {}", + MAX_POSSIBLE_VOLUME_SIZE, content_size + ), + ))); + } + + // Compute monotonic appendAtNs (matches Go: needle.GetAppendAtNs(v.lastAppendAtNs)) + let append_at_ns = get_append_at_ns(self.last_append_at_ns); + + // Patch appendAtNs timestamp into V3 blobs (matches Go WriteNeedleBlob L64-77) + let mut blob_buf; + let blob_to_write = if self.version() == VERSION_3 { + let ts_offset = + NEEDLE_HEADER_SIZE + size.0 as usize + NEEDLE_CHECKSUM_SIZE; + if ts_offset + TIMESTAMP_SIZE > needle_blob.len() { + return Err(VolumeError::Io(io::Error::new( + io::ErrorKind::InvalidData, + format!( + "needle blob buffer too small: need {} bytes, have {}", + ts_offset + TIMESTAMP_SIZE, + needle_blob.len() + ), + ))); + } + blob_buf = needle_blob.to_vec(); + blob_buf[ts_offset..ts_offset + TIMESTAMP_SIZE] + .copy_from_slice(&append_at_ns.to_be_bytes()); + &blob_buf[..] + } else { + needle_blob + }; + + // Append blob at end of dat file + let dat_size = self.dat_file_size()? as i64; + self.write_needle_blob(dat_size, blob_to_write)?; + + // Update lastAppendAtNs (matches Go L352: v.lastAppendAtNs = appendAtNs) + self.last_append_at_ns = append_at_ns; + + // Update needle map index + let offset = Offset::from_actual_offset(dat_size); + if let Some(ref mut nm) = self.nm { + nm.put(needle_id, offset, size)?; + } + + Ok(()) + } + + pub fn needs_replication(&self) -> bool { + self.super_block.replica_placement.get_copy_count() > 1 + } + + /// Garbage ratio: deleted_size / content_size (matching Go's garbageLevel). + /// content_size is the additive-only FileByteCounter. + /// + /// When DeletedCount > 0 but DeletedSize == 0 (e.g. .sdx converted back to + /// normal .idx where deleted entry sizes are missing), falls back to + /// computing deleted bytes as (datFileSize - contentSize - SuperBlockSize) + /// and uses datFileSize as the denominator. + pub fn garbage_level(&self) -> f64 { + let content = self.content_size(); + if content == 0 { + return 0.0; + } + let mut deleted = self.deleted_size(); + let mut file_size = content; + + if self.deleted_count() > 0 && deleted == 0 { + // This happens for .sdx converted back to normal .idx + // where deleted entry size is missing + let dat_file_size = self.dat_file_size().unwrap_or(0); + deleted = dat_file_size.saturating_sub(content).saturating_sub(SUPER_BLOCK_SIZE as u64); + file_size = dat_file_size; + } + + if file_size == 0 { + return 0.0; + } + deleted as f64 / file_size as f64 + } + + pub fn dat_file_size(&self) -> io::Result { + self.current_dat_file_size() + } + + /// Get the modification time of the .dat file as Unix seconds. + pub fn dat_file_mod_time(&self) -> u64 { + if let Some(dat_file) = self.dat_file.as_ref() { + dat_file + .metadata() + .ok() + .and_then(|m| m.modified().ok()) + .and_then(|t| t.duration_since(std::time::UNIX_EPOCH).ok()) + .map(|d| d.as_secs()) + .unwrap_or(0) + } else { + self.remote_dat_file + .as_ref() + .map(|remote_dat_file| remote_dat_file.modified_time) + .unwrap_or(0) + } + } + + pub fn idx_file_size(&self) -> u64 { + self.nm.as_ref().map_or(0, |nm| nm.index_file_size()) + } + + // ---- Compaction / Vacuum ---- + + /// Compact the volume by copying only live needles to new .cpd/.cpx files. + /// This reads from the current .dat/.idx and writes to .cpd/.cpx. + /// Call `commit_compact()` after to swap the files. + pub fn compact_by_index( + &mut self, + _preallocate: u64, + _max_bytes_per_second: i64, + progress_fn: F, + ) -> Result<(), VolumeError> + where + F: Fn(i64) -> bool, + { + if self.is_compacting { + return Ok(()); // already compacting + } + self.is_compacting = true; + + let result = self.do_compact_by_index(progress_fn); + + self.is_compacting = false; + result + } + + fn do_compact_by_index(&mut self, progress_fn: F) -> Result<(), VolumeError> + where + F: Fn(i64) -> bool, + { + // Guard against nil needle map (matches Go's nil check before compaction sync) + if self.nm.is_none() { + return Err(VolumeError::Io(io::Error::new( + io::ErrorKind::Other, + format!("volume {} needle map is nil", self.id), + ))); + } + + // Record state before compaction for makeupDiff + self.last_compact_index_offset = self.nm.as_ref().map_or(0, |nm| nm.index_file_size()); + self.last_compact_revision = self.super_block.compaction_revision; + + // Sync current data + self.sync_to_disk()?; + + let cpd_path = self.file_name(".cpd"); + let cpx_path = self.file_name(".cpx"); + let version = self.version(); + + // Write new super block with incremented compaction revision + let mut new_sb = self.super_block.clone(); + new_sb.compaction_revision += 1; + let sb_bytes = new_sb.to_bytes(); + + let mut dst = OpenOptions::new() + .read(true) + .write(true) + .create(true) + .truncate(true) + .open(&cpd_path)?; + dst.write_all(&sb_bytes)?; + let mut new_offset = sb_bytes.len() as i64; + + // Build new index in memory + let mut new_nm = CompactNeedleMap::new(); + let now = SystemTime::now() + .duration_since(UNIX_EPOCH) + .unwrap_or_default() + .as_secs(); + + // Collect live entries from needle map (sorted ascending) + let nm = self.nm.as_ref().ok_or(VolumeError::NotInitialized)?; + let mut entries: Vec<(NeedleId, Offset, Size)> = Vec::new(); + for (id, nv) in nm.iter_entries() { + if nv.offset.is_zero() || nv.size.is_deleted() { + continue; + } + entries.push((id, nv.offset, nv.size)); + } + entries.sort_by_key(|(_, offset, _)| *offset); + + for (id, offset, size) in entries { + // Progress callback + if !progress_fn(offset.to_actual_offset()) { + // Interrupted + let _ = fs::remove_file(&cpd_path); + return Err(VolumeError::Io(io::Error::new( + io::ErrorKind::Interrupted, + "compaction interrupted", + ))); + } + + // Read needle from source + let mut n = Needle { + id, + ..Needle::default() + }; + self.read_needle_data_at(&mut n, offset.to_actual_offset(), size)?; + + // Skip TTL-expired needles using the volume's TTL (matches Go's volume_vacuum.go) + if n.has_ttl() { + let ttl_minutes = self.super_block.ttl.minutes(); + if ttl_minutes > 0 && n.last_modified > 0 { + let expire_at = n.last_modified + (ttl_minutes as u64) * 60; + if now >= expire_at { + continue; + } + } + } + + // Write needle to destination + let bytes = n.write_bytes(version); + dst.write_all(&bytes)?; + + // Update new index + new_nm.put(id, Offset::from_actual_offset(new_offset), n.size)?; + new_offset += bytes.len() as i64; + } + + dst.sync_all()?; + + // Save new index + new_nm.save_to_idx(&cpx_path)?; + + Ok(()) + } + + /// Commit a previously completed compaction: swap .cpd/.cpx to .dat/.idx and reload. + /// Matches Go's isCompactionInProgress CompareAndSwap guard. + pub fn commit_compact(&mut self) -> Result<(), VolumeError> { + if self.is_compacting { + return Ok(()); // already compacting, silently skip (matches Go) + } + self.is_compacting = true; + + let result = self.do_commit_compact(); + + self.is_compacting = false; + result + } + + fn do_commit_compact(&mut self) -> Result<(), VolumeError> { + if let Err(e) = self.makeup_diff() { + warn!("makeup_diff failed: {}", e); + // Match Go: clean up .cpd/.cpx on makeup_diff failure + let cpd = self.file_name(".cpd"); + let cpx = self.file_name(".cpx"); + let _ = fs::remove_file(&cpd); + let _ = fs::remove_file(&cpx); + return Err(e); + } + + // Close current files + if let Some(ref mut nm) = self.nm { + nm.close(); + } + self.nm = None; + if let Some(ref dat_file) = self.dat_file { + let _ = dat_file.sync_all(); + } + self.dat_file = None; + self.remote_dat_file = None; + + let cpd_path = self.file_name(".cpd"); + let cpx_path = self.file_name(".cpx"); + let dat_path = self.file_name(".dat"); + let idx_path = self.file_name(".idx"); + + // Check that compact files exist + if !Path::new(&cpd_path).exists() || !Path::new(&cpx_path).exists() { + return Err(VolumeError::Io(io::Error::new( + io::ErrorKind::NotFound, + "compact files (.cpd/.cpx) not found", + ))); + } + + // Swap files: .cpd → .dat, .cpx → .idx + fs::rename(&cpd_path, &dat_path)?; + fs::rename(&cpx_path, &idx_path)?; + + // Remove any leveldb/redb index files (rebuilt from .idx on reload) + let ldb_path = self.file_name(".ldb"); + let _ = fs::remove_dir_all(&ldb_path); + let rdb_path = self.file_name(".rdb"); + let _ = fs::remove_file(&rdb_path); + + // Reload + self.load(true, false, 0, self.version())?; + + Ok(()) + } + + /// Clean up leftover compaction files (.cpd, .cpx). + pub fn cleanup_compact(&self) -> Result<(), VolumeError> { + let cpd_path = self.file_name(".cpd"); + let cpx_path = self.file_name(".cpx"); + let cpldb_path = self.file_name(".cpldb"); + + let e1 = fs::remove_file(&cpd_path); + let e2 = fs::remove_file(&cpx_path); + let e3 = fs::remove_dir_all(&cpldb_path); + + // Ignore NotFound errors + if let Err(e) = e1 { + if e.kind() != io::ErrorKind::NotFound { + return Err(e.into()); + } + } + if let Err(e) = e2 { + if e.kind() != io::ErrorKind::NotFound { + return Err(e.into()); + } + } + if let Err(e) = e3 { + if e.kind() != io::ErrorKind::NotFound { + return Err(e.into()); + } + } + + Ok(()) + } + + /// Read any new needles appended during compaction and append them to .cpd/.cpx + fn makeup_diff(&mut self) -> Result<(), VolumeError> { + let old_idx_size = self.nm.as_ref().map_or(0, |nm| nm.index_file_size()); + if old_idx_size == 0 || old_idx_size <= self.last_compact_index_offset { + return Ok(()); + } + + let old_super_block = &self.super_block; + if old_super_block.compaction_revision != self.last_compact_revision { + return Err(VolumeError::Io(io::Error::new( + io::ErrorKind::Other, + format!( + "current old dat file's compact revision {} is not the expected one {}", + old_super_block.compaction_revision, self.last_compact_revision + ), + ))); + } + + // Read the new .cpd file's super block and verify its compaction revision is old + 1 + let cpd_path_check = self.file_name(".cpd"); + let mut cpd_file_check = File::open(&cpd_path_check)?; + let mut sb_buf = [0u8; SUPER_BLOCK_SIZE]; + cpd_file_check.read_exact(&mut sb_buf)?; + let new_super_block = SuperBlock::from_bytes(&sb_buf)?; + let old_compact_revision = old_super_block.compaction_revision; + let new_compact_revision = new_super_block.compaction_revision; + if old_compact_revision + 1 != new_compact_revision { + return Err(VolumeError::Io(io::Error::new( + io::ErrorKind::Other, + format!( + "old dat file's compact revision {} + 1 does not equal new dat file's compact revision {}", + old_compact_revision, new_compact_revision + ), + ))); + } + + let old_idx_path = self.file_name(".idx"); + let mut old_idx_file = File::open(&old_idx_path)?; + + // Read new entries from .idx + let mut incremented_entries = std::collections::HashMap::new(); + let offset = self.last_compact_index_offset; + + old_idx_file.seek(SeekFrom::Start(offset))?; + let entry_count = (old_idx_size - offset) / NEEDLE_MAP_ENTRY_SIZE as u64; + for _ in 0..entry_count { + let mut buf = [0u8; NEEDLE_MAP_ENTRY_SIZE]; + old_idx_file.read_exact(&mut buf)?; + let (key, needle_offset, size) = crate::storage::types::idx_entry_from_bytes(&buf); + incremented_entries.insert(key, (needle_offset, size)); + } + + if incremented_entries.is_empty() { + return Ok(()); + } + + let cpd_path = self.file_name(".cpd"); + let cpx_path = self.file_name(".cpx"); + + let mut dst_dat = OpenOptions::new().read(true).write(true).open(&cpd_path)?; + let mut dst_idx = OpenOptions::new() + .write(true) + .append(true) + .open(&cpx_path)?; + + let mut dat_offset = dst_dat.seek(SeekFrom::End(0))?; + let padding_rem = dat_offset % NEEDLE_PADDING_SIZE as u64; + if padding_rem != 0 { + dat_offset += NEEDLE_PADDING_SIZE as u64 - padding_rem; + dst_dat.seek(SeekFrom::Start(dat_offset))?; + } + + let version = self.version(); + let old_dat_path = self.file_name(".dat"); + let old_dat_file = File::open(&old_dat_path)?; + + for (key, (needle_offset, size)) in incremented_entries { + if !needle_offset.is_zero() && !size.is_deleted() && size.0 > 0 { + let actual_size = crate::storage::needle::needle::get_actual_size(size, version); + let mut blob = vec![0u8; actual_size as usize]; + + #[cfg(unix)] + { + use std::os::unix::fs::FileExt; + old_dat_file + .read_exact_at(&mut blob, needle_offset.to_actual_offset() as u64)?; + } + #[cfg(windows)] + { + crate::storage::volume::read_exact_at( + &old_dat_file, + &mut blob, + needle_offset.to_actual_offset() as u64, + )?; + } + + dst_dat.write_all(&blob)?; + + let mut idx_entry_buf = [0u8; NEEDLE_MAP_ENTRY_SIZE]; + crate::storage::types::idx_entry_to_bytes( + &mut idx_entry_buf, + key, + Offset::from_actual_offset(dat_offset as i64), + size, + ); + dst_idx.write_all(&idx_entry_buf)?; + + dat_offset += actual_size as u64; + } else { + let mut fake_del_needle = Needle { + id: key, + cookie: Cookie(0x12345678), + append_at_ns: SystemTime::now() + .duration_since(UNIX_EPOCH) + .unwrap() + .as_nanos() as u64, + ..Needle::default() + }; + let bytes = fake_del_needle.write_bytes(version); + dst_dat.write_all(&bytes)?; + + let mut idx_entry_buf = [0u8; NEEDLE_MAP_ENTRY_SIZE]; + crate::storage::types::idx_entry_to_bytes( + &mut idx_entry_buf, + key, + Offset::from_actual_offset(0), + Size(crate::storage::types::TOMBSTONE_FILE_SIZE.into()), + ); + dst_idx.write_all(&idx_entry_buf)?; + + dat_offset += bytes.len() as u64; + } + } + + dst_dat.sync_all()?; + dst_idx.sync_all()?; + + Ok(()) + } + + // ---- Sync / Close ---- + + pub fn sync_to_disk(&mut self) -> io::Result<()> { + if let Some(ref dat_file) = self.dat_file { + dat_file.sync_all()?; + } + if let Some(ref nm) = self.nm { + nm.sync()?; + } + Ok(()) + } + + pub fn close(&mut self) { + if let Some(ref dat_file) = self.dat_file { + let _ = dat_file.sync_all(); + } + self.dat_file = None; + self.remote_dat_file = None; + if let Some(ref nm) = self.nm { + let _ = nm.sync(); + } + self.nm = None; + } + + /// Remove all volume files from disk. + pub fn destroy(&mut self, only_empty: bool) -> Result<(), VolumeError> { + if only_empty && self.file_count() > 0 { + return Err(VolumeError::NotEmpty); + } + if self.is_compacting { + return Err(VolumeError::Io(io::Error::new( + io::ErrorKind::Other, + format!("volume {} is compacting", self.id), + ))); + } + + let (storage_name, storage_key) = self.remote_storage_name_key(); + if self.has_remote_file && !storage_name.is_empty() && !storage_key.is_empty() { + let backend = crate::remote_storage::s3_tier::global_s3_tier_registry() + .read() + .unwrap() + .get(&storage_name); + if let Some(backend) = backend { + if let Err(e) = backend.delete_file_blocking(&storage_key) { + warn!( + volume_id = self.id.0, + storage_name, + storage_key, + error = %e, + "failed to delete remote tier file during destroy" + ); + } + } else { + warn!( + volume_id = self.id.0, + storage_name, storage_key, "remote tier backend not found during destroy" + ); + } + } + + self.close(); + remove_volume_files(&self.data_file_name()); + remove_volume_files(&self.index_file_name()); + Ok(()) + } + + /// Check if an I/O error is EIO (errno 5) and record it for health monitoring. + /// On success (None), clears any previously recorded EIO error. + /// Matches Go's `checkReadWriteError` in volume_write.go. + fn check_read_write_error(&self, err: Option<&io::Error>) { + if let Some(e) = err { + if e.raw_os_error() == Some(5) { + // EIO — record it + if let Ok(mut guard) = self.last_io_error.lock() { + *guard = Some(e.to_string()); + } + } + } else { + // Success — clear any previous EIO + if let Ok(mut guard) = self.last_io_error.lock() { + if guard.is_some() { + *guard = None; + } + } + } + } + + /// Returns the last recorded I/O error string, if any. + #[allow(dead_code)] + pub fn last_io_error(&self) -> Option { + self.last_io_error.lock().ok()?.clone() + } + + #[cfg(test)] + pub(crate) fn set_last_io_error_for_test(&self, err: Option<&str>) { + if let Ok(mut guard) = self.last_io_error.lock() { + *guard = err.map(|value| value.to_string()); + } + } + + #[cfg(test)] + pub(crate) fn set_last_modified_ts_for_test(&mut self, ts_seconds: u64) { + self.last_modified_ts_seconds = ts_seconds; + } +} + +// ============================================================================ +// Helpers +// ============================================================================ + +/// Generate volume file base name: dir/collection_id or dir/id +pub fn volume_file_name(dir: &str, collection: &str, id: VolumeId) -> String { + if collection.is_empty() { + format!("{}/{}", dir, id.0) + } else { + format!("{}/{}_{}", dir, collection, id.0) + } +} + +/// Generate a monotonically increasing append timestamp. +fn get_append_at_ns(last: u64) -> u64 { + let now = SystemTime::now() + .duration_since(UNIX_EPOCH) + .unwrap_or_default() + .as_nanos() as u64; + if now <= last { + last + 1 + } else { + now + } +} + +/// Remove all files associated with a volume. +pub(crate) fn remove_volume_files(base: &str) { + for ext in &[ + ".dat", ".idx", ".vif", ".sdx", ".cpd", ".cpx", ".note", ".rdb", + ] { + let _ = fs::remove_file(format!("{}{}", base, ext)); + } + // leveldb uses a directory + let _ = fs::remove_dir_all(format!("{}.ldb", base)); +} + +// ============================================================================ +// ScanVolumeFile — iterate all needles in a .dat file +// ============================================================================ + +/// Callback for scanning needles in a volume file. +pub trait VolumeFileVisitor { + fn visit_super_block(&mut self, sb: &SuperBlock) -> Result<(), VolumeError>; + fn read_needle_body(&self) -> bool; + fn visit_needle(&mut self, n: &Needle, offset: i64) -> Result<(), VolumeError>; +} + +/// Scan all needles in a volume's .dat file. +pub fn scan_volume_file( + dat_path: &str, + visitor: &mut dyn VolumeFileVisitor, +) -> Result<(), VolumeError> { + let mut file = File::open(dat_path)?; + + // Read super block + let mut sb_buf = [0u8; SUPER_BLOCK_SIZE]; + file.read_exact(&mut sb_buf)?; + let sb = SuperBlock::from_bytes(&sb_buf)?; + visitor.visit_super_block(&sb)?; + + let version = sb.version; + let mut offset = sb.block_size() as i64; + + loop { + // Read needle header + let mut header = [0u8; NEEDLE_HEADER_SIZE]; + file.seek(SeekFrom::Start(offset as u64))?; + match file.read_exact(&mut header) { + Ok(()) => {} + Err(e) if e.kind() == io::ErrorKind::UnexpectedEof => break, + Err(e) => return Err(e.into()), + } + + let (_cookie, _id, size) = Needle::parse_header(&header); + + if size.0 == 0 && _id.is_empty() { + break; // end of valid data + } + + let body_length = needle::needle_body_length(size, version); + let total_size = NEEDLE_HEADER_SIZE as i64 + body_length; + + // Skip full body parsing for deleted needles (tombstone or negative size) + if size.is_deleted() || size.0 <= 0 { + let mut n = Needle::default(); + n.read_header(&header); + visitor.visit_needle(&n, offset)?; + } else if visitor.read_needle_body() { + let mut buf = vec![0u8; total_size as usize]; + file.seek(SeekFrom::Start(offset as u64))?; + file.read_exact(&mut buf)?; + + let mut n = Needle::default(); + n.read_bytes(&buf, offset, size, version)?; + visitor.visit_needle(&n, offset)?; + } else { + let mut n = Needle::default(); + n.read_header(&header); + visitor.visit_needle(&n, offset)?; + } + + offset += total_size; + } + + Ok(()) +} + +/// Reserve disk blocks for a file without changing its visible size. +/// On Linux, uses `fallocate(FALLOC_FL_KEEP_SIZE)` to actually reserve blocks. +/// On other platforms, this is a no-op. +fn preallocate_file(file: &File, size: u64) { + #[cfg(target_os = "linux")] + { + use std::os::unix::io::AsRawFd; + let fd = file.as_raw_fd(); + // FALLOC_FL_KEEP_SIZE = 1: allocate blocks without changing file size + let ret = unsafe { libc::fallocate(fd, 1, 0, size as libc::off_t) }; + if ret == 0 { + tracing::info!(bytes = size, "preallocated disk space"); + } else { + tracing::warn!( + bytes = size, + error = %io::Error::last_os_error(), + "fallocate failed" + ); + } + } + #[cfg(not(target_os = "linux"))] + { + let _ = (file, size); + tracing::debug!(bytes = size, "preallocation not supported on this platform"); + } +} + +// ============================================================================ +// Tests +// ============================================================================ + +#[cfg(test)] +mod tests { + use super::*; + use crate::storage::needle::crc::CRC; + use tempfile::TempDir; + + fn spawn_fake_s3_server(body: Vec) -> (String, tokio::sync::oneshot::Sender<()>) { + use axum::http::{header, HeaderMap, HeaderValue, StatusCode}; + use axum::routing::any; + use axum::Router; + + let body = Arc::new(body); + let listener = std::net::TcpListener::bind("127.0.0.1:0").unwrap(); + let addr = listener.local_addr().unwrap(); + listener.set_nonblocking(true).unwrap(); + let (shutdown_tx, shutdown_rx) = tokio::sync::oneshot::channel::<()>(); + + std::thread::spawn(move || { + let runtime = tokio::runtime::Builder::new_current_thread() + .enable_all() + .build() + .unwrap(); + runtime.block_on(async move { + let app = Router::new().fallback(any(move |headers: HeaderMap| { + let body = body.clone(); + async move { + let bytes = body.as_ref(); + if let Some(range) = headers + .get(header::RANGE) + .and_then(|value| value.to_str().ok()) + { + if let Some(spec) = range.strip_prefix("bytes=") { + let (start, end) = spec.split_once('-').unwrap(); + let start = start.parse::().unwrap(); + let end = if end.is_empty() { + bytes.len().saturating_sub(1) + } else { + end.parse::().unwrap() + } + .min(bytes.len().saturating_sub(1)); + let chunk = bytes[start..=end].to_vec(); + let mut response_headers = HeaderMap::new(); + response_headers.insert( + header::CONTENT_LENGTH, + HeaderValue::from_str(&chunk.len().to_string()).unwrap(), + ); + response_headers.insert( + header::CONTENT_RANGE, + HeaderValue::from_str(&format!( + "bytes {}-{}/{}", + start, + end, + bytes.len() + )) + .unwrap(), + ); + return (StatusCode::PARTIAL_CONTENT, response_headers, chunk); + } + } + + let mut response_headers = HeaderMap::new(); + response_headers.insert( + header::CONTENT_LENGTH, + HeaderValue::from_str(&bytes.len().to_string()).unwrap(), + ); + (StatusCode::OK, response_headers, bytes.to_vec()) + } + })); + + let listener = tokio::net::TcpListener::from_std(listener).unwrap(); + axum::serve(listener, app) + .with_graceful_shutdown(async move { + let _ = shutdown_rx.await; + }) + .await + .unwrap(); + }); + }); + + (format!("http://{}", addr), shutdown_tx) + } + + fn make_test_volume(dir: &str) -> Volume { + Volume::new( + dir, + dir, + "", + VolumeId(1), + NeedleMapKind::InMemory, + None, + None, + 0, + Version::current(), + ) + .unwrap() + } + + #[test] + fn test_data_file_access_control_blocks_writer_until_reader_releases() { + let control = Arc::new(DataFileAccessControl::default()); + let read_lease = control.read_lock(); + let writer_control = control.clone(); + let acquired = Arc::new(std::sync::atomic::AtomicBool::new(false)); + let acquired_clone = acquired.clone(); + + let writer = std::thread::spawn(move || { + let _write_lease = writer_control.write_lock(); + acquired_clone.store(true, std::sync::atomic::Ordering::Relaxed); + }); + + std::thread::sleep(std::time::Duration::from_millis(50)); + assert!(!acquired.load(std::sync::atomic::Ordering::Relaxed)); + + drop(read_lease); + writer.join().unwrap(); + + assert!(acquired.load(std::sync::atomic::Ordering::Relaxed)); + } + + #[test] + fn test_volume_file_name() { + assert_eq!(volume_file_name("/data", "", VolumeId(1)), "/data/1"); + assert_eq!( + volume_file_name("/data", "pics", VolumeId(42)), + "/data/pics_42" + ); + } + + #[test] + fn test_volume_create_and_load() { + let tmp = TempDir::new().unwrap(); + let dir = tmp.path().to_str().unwrap(); + + let v = make_test_volume(dir); + assert_eq!(v.version(), VERSION_3); + assert_eq!(v.file_count(), 0); + assert_eq!(v.content_size(), 0); + + // .dat and .idx files should exist + assert!(Path::new(&v.file_name(".dat")).exists()); + assert!(Path::new(&v.file_name(".idx")).exists()); + } + + #[test] + fn test_volume_write_read() { + let tmp = TempDir::new().unwrap(); + let dir = tmp.path().to_str().unwrap(); + let mut v = make_test_volume(dir); + + // Write a needle + let mut n = Needle { + id: NeedleId(1), + cookie: Cookie(0x12345678), + data: b"hello world".to_vec(), + data_size: 11, + flags: 0, + ..Needle::default() + }; + let (offset, size, unchanged) = v.write_needle(&mut n, true).unwrap(); + assert!(!unchanged); + assert!(offset > 0); // after superblock + assert!(size.0 > 0); + assert_eq!(v.file_count(), 1); + + // Read it back + let mut read_n = Needle { + id: NeedleId(1), + ..Needle::default() + }; + let count = v.read_needle(&mut read_n).unwrap(); + assert_eq!(count, 11); + assert_eq!(read_n.data, b"hello world"); + assert_eq!(read_n.cookie, Cookie(0x12345678)); + } + + #[test] + fn test_volume_write_dedup() { + let tmp = TempDir::new().unwrap(); + let dir = tmp.path().to_str().unwrap(); + let mut v = make_test_volume(dir); + + let mut n = Needle { + id: NeedleId(1), + cookie: Cookie(0xaa), + data: b"same data".to_vec(), + data_size: 9, + ..Needle::default() + }; + v.write_needle(&mut n, true).unwrap(); + + // Write same needle again — should be unchanged + let mut n2 = Needle { + id: NeedleId(1), + cookie: Cookie(0xaa), + data: b"same data".to_vec(), + data_size: 9, + ..Needle::default() + }; + n2.checksum = CRC::new(&n2.data); + let (_, _, unchanged) = v.write_needle(&mut n2, true).unwrap(); + assert!(unchanged); + } + + #[test] + fn test_volume_delete() { + let tmp = TempDir::new().unwrap(); + let dir = tmp.path().to_str().unwrap(); + let mut v = make_test_volume(dir); + + let mut n = Needle { + id: NeedleId(1), + cookie: Cookie(0xbb), + data: b"delete me".to_vec(), + data_size: 9, + ..Needle::default() + }; + v.write_needle(&mut n, true).unwrap(); + assert_eq!(v.file_count(), 1); + + let deleted_size = v + .delete_needle(&mut Needle { + id: NeedleId(1), + cookie: Cookie(0xbb), + ..Needle::default() + }) + .unwrap(); + assert!(deleted_size.0 > 0); + // Additive-only: file_count stays at 1 after delete + assert_eq!(v.file_count(), 1); + assert_eq!(v.deleted_count(), 1); + + // Read should fail with Deleted + let mut read_n = Needle { + id: NeedleId(1), + ..Needle::default() + }; + let err = v.read_needle(&mut read_n).unwrap_err(); + assert!(matches!(err, VolumeError::Deleted)); + } + + #[test] + fn test_volume_multiple_needles() { + let tmp = TempDir::new().unwrap(); + let dir = tmp.path().to_str().unwrap(); + let mut v = make_test_volume(dir); + + for i in 1..=10 { + let data = format!("needle data {}", i); + let mut n = Needle { + id: NeedleId(i), + cookie: Cookie(i as u32), + data: data.as_bytes().to_vec(), + data_size: data.len() as u32, + ..Needle::default() + }; + v.write_needle(&mut n, true).unwrap(); + } + + assert_eq!(v.file_count(), 10); + assert_eq!(v.max_file_key(), NeedleId(10)); + + // Read back needle 5 + let mut n = Needle { + id: NeedleId(5), + ..Needle::default() + }; + v.read_needle(&mut n).unwrap(); + assert_eq!(n.data, b"needle data 5"); + } + + #[test] + fn test_volume_reload_from_disk() { + let tmp = TempDir::new().unwrap(); + let dir = tmp.path().to_str().unwrap(); + + // Write some needles + { + let mut v = make_test_volume(dir); + for i in 1..=3 { + let data = format!("data {}", i); + let mut n = Needle { + id: NeedleId(i), + cookie: Cookie(i as u32), + data: data.as_bytes().to_vec(), + data_size: data.len() as u32, + ..Needle::default() + }; + v.write_needle(&mut n, true).unwrap(); + } + v.sync_to_disk().unwrap(); + } + + // Reload and verify + let v = Volume::new( + dir, + dir, + "", + VolumeId(1), + NeedleMapKind::InMemory, + None, + None, + 0, + Version::current(), + ) + .unwrap(); + assert_eq!(v.file_count(), 3); + + let mut n = Needle { + id: NeedleId(2), + ..Needle::default() + }; + v.read_needle(&mut n).unwrap(); + assert_eq!(std::str::from_utf8(&n.data).unwrap(), "data 2"); + } + + #[test] + fn test_volume_cookie_mismatch() { + let tmp = TempDir::new().unwrap(); + let dir = tmp.path().to_str().unwrap(); + let mut v = make_test_volume(dir); + + let mut n = Needle { + id: NeedleId(1), + cookie: Cookie(0xaa), + data: b"original".to_vec(), + data_size: 8, + ..Needle::default() + }; + v.write_needle(&mut n, true).unwrap(); + + // Write with wrong cookie + let mut n2 = Needle { + id: NeedleId(1), + cookie: Cookie(0xbb), + data: b"overwrite".to_vec(), + data_size: 9, + ..Needle::default() + }; + let err = v.write_needle(&mut n2, true).unwrap_err(); + assert!(matches!(err, VolumeError::CookieMismatch(_))); + } + + #[test] + fn test_volume_destroy() { + let tmp = TempDir::new().unwrap(); + let dir = tmp.path().to_str().unwrap(); + let dat_path; + let idx_path; + + { + let mut v = make_test_volume(dir); + dat_path = v.file_name(".dat"); + idx_path = v.file_name(".idx"); + assert!(Path::new(&dat_path).exists()); + v.destroy(false).unwrap(); + } + + assert!(!Path::new(&dat_path).exists()); + assert!(!Path::new(&idx_path).exists()); + } + + #[test] + fn test_read_all_needles_uses_dat_order_for_live_offsets() { + let tmp = TempDir::new().unwrap(); + let dir = tmp.path().to_str().unwrap(); + let mut v = make_test_volume(dir); + + let mut first = Needle { + id: NeedleId(10), + cookie: Cookie(0x11223344), + data: b"first".to_vec(), + data_size: 5, + ..Needle::default() + }; + v.write_needle(&mut first, true).unwrap(); + + let mut second = Needle { + id: NeedleId(20), + cookie: Cookie(0x55667788), + data: b"second".to_vec(), + data_size: 6, + ..Needle::default() + }; + v.write_needle(&mut second, true).unwrap(); + + let mut first_overwrite = Needle { + id: NeedleId(10), + cookie: Cookie(0x11223344), + data: b"first-overwrite".to_vec(), + data_size: 15, + ..Needle::default() + }; + v.write_needle(&mut first_overwrite, true).unwrap(); + + let needles = v.read_all_needles().unwrap(); + let ids: Vec = needles.iter().map(|n| u64::from(n.id)).collect(); + let bodies: Vec<&[u8]> = needles.iter().map(|n| n.data.as_slice()).collect(); + + assert_eq!(ids, vec![20, 10]); + assert_eq!(bodies, vec![b"second".as_slice(), b"first-overwrite".as_slice()]); + } + + #[test] + fn test_get_append_at_ns() { + let t1 = get_append_at_ns(0); + assert!(t1 > 0); + let t2 = get_append_at_ns(t1); + assert!(t2 > t1); + // If we pass a future timestamp, should return last+1 + let future = u64::MAX - 1; + let t3 = get_append_at_ns(future); + assert_eq!(t3, future + 1); + } + + #[test] + fn test_volume_compact() { + let tmp = TempDir::new().unwrap(); + let dir = tmp.path().to_str().unwrap(); + let mut v = make_test_volume(dir); + + // Write 3 needles + for i in 1..=3u64 { + let mut n = Needle { + id: NeedleId(i), + cookie: Cookie(i as u32), + data: format!("data-{}", i).into_bytes(), + data_size: format!("data-{}", i).len() as u32, + ..Needle::default() + }; + v.write_needle(&mut n, true).unwrap(); + } + assert_eq!(v.file_count(), 3); + + // Delete needle 2 + let mut del = Needle { + id: NeedleId(2), + cookie: Cookie(2), + ..Needle::default() + }; + v.delete_needle(&mut del).unwrap(); + // Additive-only: file_count stays at 3 after delete + assert_eq!(v.file_count(), 3); + assert_eq!(v.deleted_count(), 1); + + let dat_size_before = v.dat_file_size().unwrap(); + + // Compact + v.compact_by_index(0, 0, |_| true).unwrap(); + + // Verify compact files exist + assert!(Path::new(&v.file_name(".cpd")).exists()); + assert!(Path::new(&v.file_name(".cpx")).exists()); + + // Commit: swap files and reload + v.commit_compact().unwrap(); + + // After compaction: 2 live needles, 0 deleted + assert_eq!(v.file_count(), 2); + assert_eq!(v.deleted_count(), 0); + + // Dat should be smaller (deleted needle removed) + let dat_size_after = v.dat_file_size().unwrap(); + assert!( + dat_size_after < dat_size_before, + "dat should shrink after compact" + ); + + // Read back live needles + let mut n1 = Needle { + id: NeedleId(1), + ..Needle::default() + }; + v.read_needle(&mut n1).unwrap(); + assert_eq!(n1.data, b"data-1"); + + let mut n3 = Needle { + id: NeedleId(3), + ..Needle::default() + }; + v.read_needle(&mut n3).unwrap(); + assert_eq!(n3.data, b"data-3"); + + // Needle 2 should not exist + let mut n2 = Needle { + id: NeedleId(2), + ..Needle::default() + }; + assert!(v.read_needle(&mut n2).is_err()); + + // Compact files should not exist after commit + assert!(!Path::new(&v.file_name(".cpd")).exists()); + assert!(!Path::new(&v.file_name(".cpx")).exists()); + + // Cleanup should be a no-op + v.cleanup_compact().unwrap(); + } + + #[test] + fn test_compaction_revision_relookup() { + // Verifies that re_lookup_needle_data_offset returns the correct data offset + // and compaction revision, and that after compaction the offset changes. + let tmp = TempDir::new().unwrap(); + let dir = tmp.path().to_str().unwrap(); + let mut v = make_test_volume(dir); + + // Write two needles + let mut n1 = Needle { + id: NeedleId(1), + cookie: Cookie(0xAABBCCDD), + data: b"first-needle-data".to_vec(), + data_size: 17, + ..Needle::default() + }; + v.write_needle(&mut n1, true).unwrap(); + + let mut n2 = Needle { + id: NeedleId(2), + cookie: Cookie(0x11223344), + data: b"second-needle-data".to_vec(), + data_size: 18, + ..Needle::default() + }; + v.write_needle(&mut n2, true).unwrap(); + + // Get initial revision and offset for needle 1 + let initial_rev = v.super_block.compaction_revision; + let (initial_offset, rev) = v.re_lookup_needle_data_offset(NeedleId(1)).unwrap(); + assert_eq!(rev, initial_rev); + assert!(initial_offset > 0, "data offset should be positive"); + + // Delete needle 2 so compaction removes it + let mut del_n2 = Needle { + id: NeedleId(2), + cookie: Cookie(0x11223344), + ..Needle::default() + }; + v.delete_needle(&mut del_n2).unwrap(); + + // Compact the volume — this increments compaction_revision and may move needles + v.compact_by_index(0, 0, |_| true).unwrap(); + v.commit_compact().unwrap(); + + // After compaction, the revision should have changed + let new_rev = v.super_block.compaction_revision; + assert_eq!( + new_rev, + initial_rev + 1, + "compaction should increment revision" + ); + + // Re-lookup needle 1 — should still be found with the new revision + let (new_offset, relookup_rev) = v.re_lookup_needle_data_offset(NeedleId(1)).unwrap(); + assert_eq!(relookup_rev, new_rev); + assert!(new_offset > 0, "data offset should still be positive"); + + // The data should still be readable correctly after compaction + let mut read_n1 = Needle { + id: NeedleId(1), + ..Needle::default() + }; + v.read_needle(&mut read_n1).unwrap(); + assert_eq!(read_n1.data, b"first-needle-data"); + + // Deleted needle should not be found + let result = v.re_lookup_needle_data_offset(NeedleId(2)); + assert!( + result.is_err(), + "deleted needle should not be found after compaction" + ); + } + + #[test] + fn test_stream_info_includes_compaction_revision() { + // Verifies that NeedleStreamInfo carries the volume's compaction revision + // so that StreamingBody can detect when compaction has occurred. + let tmp = TempDir::new().unwrap(); + let dir = tmp.path().to_str().unwrap(); + let mut v = make_test_volume(dir); + + // Write a needle large enough to have meaningful data + let data = vec![0xABu8; 2048]; + let mut n = Needle { + id: NeedleId(42), + cookie: Cookie(0xDEADBEEF), + data: data.clone(), + data_size: data.len() as u32, + ..Needle::default() + }; + v.write_needle(&mut n, true).unwrap(); + + // Read stream info + let mut read_n = Needle { + id: NeedleId(42), + cookie: Cookie(0xDEADBEEF), + ..Needle::default() + }; + let info = v.read_needle_stream_info(&mut read_n, false).unwrap(); + + assert_eq!(info.volume_id, VolumeId(1)); + assert_eq!(info.needle_id, NeedleId(42)); + assert_eq!(info.compaction_revision, v.super_block.compaction_revision); + assert_eq!(info.data_size, data.len() as u32); + assert!(info.data_file_offset > 0); + } + + #[test] + fn test_remote_vif_load_blocks_writes_but_allows_delete() { + let tmp = TempDir::new().unwrap(); + let dir = tmp.path().to_str().unwrap(); + + let dat_size_before_reload = { + let mut v = make_test_volume(dir); + let mut n = Needle { + id: NeedleId(1), + cookie: Cookie(0x1234), + data: b"remote".to_vec(), + data_size: 6, + ..Needle::default() + }; + v.write_needle(&mut n, true).unwrap(); + + let vif = VifVolumeInfo { + files: vec![VifRemoteFile { + backend_type: "s3".to_string(), + backend_id: "default".to_string(), + key: "remote-key".to_string(), + offset: 0, + file_size: v.dat_file_size().unwrap(), + modified_time: 123, + extension: ".dat".to_string(), + }], + version: v.version().0 as u32, + ..VifVolumeInfo::default() + }; + std::fs::write( + format!("{}/1.vif", dir), + serde_json::to_string_pretty(&vif).unwrap(), + ) + .unwrap(); + + v.dat_file_size().unwrap() + }; + + let mut v = Volume::new( + dir, + dir, + "", + VolumeId(1), + NeedleMapKind::InMemory, + None, + None, + 0, + Version::current(), + ) + .unwrap(); + + assert!(v.is_read_only()); + assert!(!v.no_write_or_delete); + assert!(v.no_write_can_delete); + + let err = v + .write_needle( + &mut Needle { + id: NeedleId(2), + cookie: Cookie(0x5678), + data: b"blocked".to_vec(), + data_size: 7, + ..Needle::default() + }, + true, + ) + .unwrap_err(); + assert!(matches!(err, VolumeError::ReadOnly)); + + let deleted_size = v + .delete_needle(&mut Needle { + id: NeedleId(1), + cookie: Cookie(0x1234), + ..Needle::default() + }) + .unwrap(); + assert!(deleted_size.0 > 0); + assert_eq!(v.dat_file_size().unwrap(), dat_size_before_reload); + } + + #[test] + fn test_set_writable_keeps_remote_delete_only_mode() { + let tmp = TempDir::new().unwrap(); + let dir = tmp.path().to_str().unwrap(); + let mut v = make_test_volume(dir); + + v.volume_info.files.push(PbRemoteFile { + backend_type: "s3".to_string(), + backend_id: "default".to_string(), + key: "remote-key".to_string(), + offset: 0, + file_size: v.dat_file_size().unwrap(), + modified_time: 123, + extension: ".dat".to_string(), + }); + v.refresh_remote_write_mode(); + v.set_writable().unwrap(); + + assert!(v.is_read_only()); + assert!(!v.no_write_or_delete); + assert!(v.no_write_can_delete); + } + + #[test] + fn test_load_vif_defaults_local_version_and_bytes_offset() { + let tmp = TempDir::new().unwrap(); + let dir = tmp.path().to_str().unwrap(); + + { + let _v = make_test_volume(dir); + let vif = VifVolumeInfo::default(); + std::fs::write( + format!("{}/1.vif", dir), + serde_json::to_string_pretty(&vif).unwrap(), + ) + .unwrap(); + } + + let v = Volume::new( + dir, + dir, + "", + VolumeId(1), + NeedleMapKind::InMemory, + None, + None, + 0, + Version::current(), + ) + .unwrap(); + + assert_eq!(v.volume_info.version, Version::current().0 as u32); + assert_eq!(v.volume_info.bytes_offset, OFFSET_SIZE as u32); + assert_eq!(v.version(), Version::current()); + } + + #[test] + fn test_version_superblock_overrides_vif_version() { + // Go behavior: after reading the superblock, volumeInfo.Version is set + // to SuperBlock.Version, overriding whatever was in the .vif file. + let tmp = TempDir::new().unwrap(); + let dir = tmp.path().to_str().unwrap(); + + { + let _v = make_test_volume(dir); + // Write a .vif with version=2, but the .dat superblock is version=3 + let vif = VifVolumeInfo { + version: VERSION_2.0 as u32, + bytes_offset: OFFSET_SIZE as u32, + ..VifVolumeInfo::default() + }; + std::fs::write( + format!("{}/1.vif", dir), + serde_json::to_string_pretty(&vif).unwrap(), + ) + .unwrap(); + } + + let v = Volume::new( + dir, + dir, + "", + VolumeId(1), + NeedleMapKind::InMemory, + None, + None, + 0, + Version::current(), + ) + .unwrap(); + + // Superblock version (3) overrides the .vif version (2) + assert_eq!(v.volume_info.version, VERSION_3.0 as u32); + assert_eq!(v.version(), VERSION_3); + } + + #[test] + fn test_load_vif_rejects_bytes_offset_mismatch() { + let tmp = TempDir::new().unwrap(); + let dir = tmp.path().to_str().unwrap(); + + { + let _v = make_test_volume(dir); + let vif = VifVolumeInfo { + version: Version::current().0 as u32, + bytes_offset: (OFFSET_SIZE as u32) + 1, + ..VifVolumeInfo::default() + }; + std::fs::write( + format!("{}/1.vif", dir), + serde_json::to_string_pretty(&vif).unwrap(), + ) + .unwrap(); + } + + let result = Volume::new( + dir, + dir, + "", + VolumeId(1), + NeedleMapKind::InMemory, + None, + None, + 0, + Version::current(), + ); + + match result { + Ok(_) => panic!("expected bytes_offset mismatch to fail"), + Err(VolumeError::Io(io_err)) => { + assert_eq!(io_err.kind(), io::ErrorKind::InvalidData); + assert!(io_err.to_string().contains("bytes_offset mismatch")); + } + Err(other) => panic!("unexpected error: {other:?}"), + } + } + + #[test] + fn test_remote_only_volume_load_reads_from_tier_backend() { + let tmp = TempDir::new().unwrap(); + let dir = tmp.path().to_str().unwrap(); + + let dat_bytes = { + let mut v = make_test_volume(dir); + let mut n = Needle { + id: NeedleId(7), + cookie: Cookie(0x7788), + data: b"remote-only".to_vec(), + data_size: 11, + ..Needle::default() + }; + v.write_needle(&mut n, true).unwrap(); + v.sync_to_disk().unwrap(); + std::fs::read(v.file_name(".dat")).unwrap() + }; + + let dat_path = format!("{}/1.dat", dir); + std::fs::remove_file(&dat_path).unwrap(); + + let (endpoint, shutdown_tx) = spawn_fake_s3_server(dat_bytes.clone()); + crate::remote_storage::s3_tier::global_s3_tier_registry() + .write() + .unwrap() + .clear(); + let tier_config = crate::remote_storage::s3_tier::S3TierConfig { + access_key: "access".to_string(), + secret_key: "secret".to_string(), + region: "us-east-1".to_string(), + bucket: "bucket-a".to_string(), + endpoint, + storage_class: "STANDARD".to_string(), + force_path_style: true, + }; + { + let mut registry = crate::remote_storage::s3_tier::global_s3_tier_registry() + .write() + .unwrap(); + registry.register( + "s3.default".to_string(), + crate::remote_storage::s3_tier::S3TierBackend::new(&tier_config), + ); + registry.register( + "s3".to_string(), + crate::remote_storage::s3_tier::S3TierBackend::new(&tier_config), + ); + } + + let vif = VifVolumeInfo { + files: vec![VifRemoteFile { + backend_type: "s3".to_string(), + backend_id: "default".to_string(), + key: "remote-key".to_string(), + offset: 0, + file_size: dat_bytes.len() as u64, + modified_time: 123, + extension: ".dat".to_string(), + }], + version: Version::current().0 as u32, + bytes_offset: OFFSET_SIZE as u32, + dat_file_size: dat_bytes.len() as i64, + ..VifVolumeInfo::default() + }; + std::fs::write( + format!("{}/1.vif", dir), + serde_json::to_string_pretty(&vif).unwrap(), + ) + .unwrap(); + + let v = Volume::new( + dir, + dir, + "", + VolumeId(1), + NeedleMapKind::InMemory, + None, + None, + 0, + Version::current(), + ) + .unwrap(); + + assert!(v.has_remote_file); + assert!(v.dat_file.is_none()); + assert!(v.remote_dat_file.is_some()); + + let mut n = Needle { + id: NeedleId(7), + ..Needle::default() + }; + let size = v.read_needle(&mut n).unwrap(); + assert_eq!(size, 11); + assert_eq!(n.data, b"remote-only"); + + let mut meta = Needle { + id: NeedleId(7), + ..Needle::default() + }; + let info = v.read_needle_stream_info(&mut meta, false).unwrap(); + assert!(matches!(info.source, NeedleStreamSource::Remote(_))); + let mut streamed = vec![0u8; info.data_size as usize]; + info.source + .read_exact_at(&mut streamed, info.data_file_offset) + .unwrap(); + assert_eq!(streamed, b"remote-only"); + assert_eq!(meta.data_size, 11); + + let _ = shutdown_tx.send(()); + } + + /// Volume destroy removes .vif alongside the primary data files. + #[test] + fn test_destroy_removes_vif() { + let tmp = TempDir::new().unwrap(); + let dir = tmp.path().to_str().unwrap(); + + let mut v = make_test_volume(dir); + let mut n = Needle { + id: NeedleId(1), + cookie: Cookie(1), + data: b"test".to_vec(), + data_size: 4, + ..Needle::default() + }; + v.write_needle(&mut n, true).unwrap(); + + // Write a .vif file (as EC encode would) + let vif_path = format!("{}/1.vif", dir); + std::fs::write(&vif_path, r#"{"version":3}"#).unwrap(); + assert!(std::path::Path::new(&vif_path).exists()); + + // .dat and .idx should exist + let dat_path = format!("{}/1.dat", dir); + let idx_path = format!("{}/1.idx", dir); + assert!(std::path::Path::new(&dat_path).exists()); + assert!(std::path::Path::new(&idx_path).exists()); + + // Destroy the volume + v.destroy(false).unwrap(); + + // .dat and .idx should be gone + assert!( + !std::path::Path::new(&dat_path).exists(), + ".dat should be removed" + ); + assert!( + !std::path::Path::new(&idx_path).exists(), + ".idx should be removed" + ); + + assert!( + !std::path::Path::new(&vif_path).exists(), + ".vif should be removed" + ); + } + + /// Volume destroy with separate idx directory must clean up both dirs and .vif. + #[test] + fn test_destroy_with_separate_idx_dir() { + let dat_tmp = TempDir::new().unwrap(); + let idx_tmp = TempDir::new().unwrap(); + let dat_dir = dat_tmp.path().to_str().unwrap(); + let idx_dir = idx_tmp.path().to_str().unwrap(); + + let mut v = Volume::new( + dat_dir, + idx_dir, + "", + VolumeId(1), + NeedleMapKind::InMemory, + None, + None, + 0, + Version::current(), + ) + .unwrap(); + + let mut n = Needle { + id: NeedleId(1), + cookie: Cookie(1), + data: b"hello".to_vec(), + data_size: 5, + ..Needle::default() + }; + v.write_needle(&mut n, true).unwrap(); + + // Write .vif in data dir (as EC encode would) + let vif_path = format!("{}/1.vif", dat_dir); + std::fs::write(&vif_path, r#"{"version":3}"#).unwrap(); + + let dat_path = format!("{}/1.dat", dat_dir); + let idx_path = format!("{}/1.idx", idx_dir); + assert!(std::path::Path::new(&dat_path).exists()); + assert!(std::path::Path::new(&idx_path).exists()); + + v.destroy(false).unwrap(); + + assert!( + !std::path::Path::new(&dat_path).exists(), + ".dat removed from data dir" + ); + assert!( + !std::path::Path::new(&idx_path).exists(), + ".idx removed from idx dir" + ); + assert!( + !std::path::Path::new(&vif_path).exists(), + ".vif removed from data dir" + ); + } +} diff --git a/seaweed-volume/src/version.rs b/seaweed-volume/src/version.rs new file mode 100644 index 000000000..413a526b1 --- /dev/null +++ b/seaweed-volume/src/version.rs @@ -0,0 +1,79 @@ +//! Version helpers aligned with Go's util/version package. + +use std::sync::OnceLock; + +#[cfg(feature = "5bytes")] +const SIZE_LIMIT: &str = "8000GB"; // Matches Go production builds (5BytesOffset) +#[cfg(not(feature = "5bytes"))] +const SIZE_LIMIT: &str = "30GB"; // Matches Go default build (!5BytesOffset) + +pub fn size_limit() -> &'static str { + SIZE_LIMIT +} + +pub fn commit() -> &'static str { + option_env!("SEAWEEDFS_COMMIT") + .or(option_env!("GIT_COMMIT")) + .or(option_env!("GIT_SHA")) + .unwrap_or("") +} + +pub fn version_number() -> &'static str { + static VERSION_NUMBER: OnceLock = OnceLock::new(); + VERSION_NUMBER + .get_or_init(|| { + parse_go_version_number().unwrap_or_else(|| env!("CARGO_PKG_VERSION").to_string()) + }) + .as_str() +} + +pub fn version() -> &'static str { + static VERSION: OnceLock = OnceLock::new(); + VERSION + .get_or_init(|| format!("{} {}", size_limit(), version_number())) + .as_str() +} + +pub fn full_version() -> &'static str { + static FULL: OnceLock = OnceLock::new(); + FULL.get_or_init(|| format!("{} {}", version(), commit())) + .as_str() +} + +pub fn server_header() -> &'static str { + static HEADER: OnceLock = OnceLock::new(); + HEADER + .get_or_init(|| format!("SeaweedFS Volume {}", version())) + .as_str() +} + +fn parse_go_version_number() -> Option { + let src = include_str!(concat!( + env!("CARGO_MANIFEST_DIR"), + "/../weed/util/version/constants.go" + )); + let mut major: Option = None; + let mut minor: Option = None; + for line in src.lines() { + let l = line.trim(); + if l.starts_with("MAJOR_VERSION") { + major = parse_int32_line(l); + } else if l.starts_with("MINOR_VERSION") { + minor = parse_int32_line(l); + } + if major.is_some() && minor.is_some() { + break; + } + } + match (major, minor) { + (Some(maj), Some(min)) => Some(format!("{}.{}", maj, format!("{:02}", min))), + _ => None, + } +} + +fn parse_int32_line(line: &str) -> Option { + let start = line.find("int32(")? + "int32(".len(); + let rest = &line[start..]; + let end = rest.find(')')?; + rest[..end].trim().parse::().ok() +} diff --git a/seaweed-volume/tests/http_integration.rs b/seaweed-volume/tests/http_integration.rs new file mode 100644 index 000000000..c1a69248f --- /dev/null +++ b/seaweed-volume/tests/http_integration.rs @@ -0,0 +1,677 @@ +//! Integration tests for the volume server HTTP handlers. +//! +//! Uses axum's Router with tower::ServiceExt::oneshot to test +//! end-to-end without starting a real TCP server. + +use std::sync::{Arc, RwLock}; + +use axum::body::Body; +use axum::extract::connect_info::ConnectInfo; +use axum::http::{Request, StatusCode}; +use tower::ServiceExt; // for `oneshot` + +use seaweed_volume::security::{Guard, SigningKey}; +use seaweed_volume::server::volume_server::{ + build_admin_router, build_admin_router_with_ui, build_metrics_router, build_public_router, + VolumeServerState, +}; +use seaweed_volume::storage::needle_map::NeedleMapKind; +use seaweed_volume::storage::store::Store; +use seaweed_volume::storage::types::{DiskType, Version, VolumeId}; + +use tempfile::TempDir; + +/// Create a test VolumeServerState with a temp directory, a single disk +/// location, and one pre-created volume (VolumeId 1). +fn test_state() -> (Arc, TempDir) { + test_state_with_guard(Vec::new(), Vec::new()) +} + +fn test_state_with_signing_key(signing_key: Vec) -> (Arc, TempDir) { + test_state_with_guard(Vec::new(), signing_key) +} + +fn test_state_with_whitelist(whitelist: Vec) -> (Arc, TempDir) { + test_state_with_guard(whitelist, Vec::new()) +} + +fn test_state_with_guard( + whitelist: Vec, + signing_key: Vec, +) -> (Arc, TempDir) { + let tmp = TempDir::new().expect("failed to create temp dir"); + let dir = tmp.path().to_str().unwrap(); + + let mut store = Store::new(NeedleMapKind::InMemory); + store + .add_location( + dir, + dir, + 10, + DiskType::HardDrive, + seaweed_volume::config::MinFreeSpace::Percent(1.0), + Vec::new(), + ) + .expect("failed to add location"); + store + .add_volume( + VolumeId(1), + "", + None, + None, + 0, + DiskType::HardDrive, + Version::current(), + ) + .expect("failed to create volume"); + + let guard = Guard::new( + &whitelist, + SigningKey(signing_key), + 0, + SigningKey(vec![]), + 0, + ); + let state = Arc::new(VolumeServerState { + store: RwLock::new(store), + guard: RwLock::new(guard), + is_stopping: RwLock::new(false), + maintenance: std::sync::atomic::AtomicBool::new(false), + state_version: std::sync::atomic::AtomicU32::new(0), + concurrent_upload_limit: 0, + concurrent_download_limit: 0, + inflight_upload_data_timeout: std::time::Duration::from_secs(60), + inflight_download_data_timeout: std::time::Duration::from_secs(60), + inflight_upload_bytes: std::sync::atomic::AtomicI64::new(0), + inflight_download_bytes: std::sync::atomic::AtomicI64::new(0), + upload_notify: tokio::sync::Notify::new(), + download_notify: tokio::sync::Notify::new(), + data_center: String::new(), + rack: String::new(), + file_size_limit_bytes: 0, + maintenance_byte_per_second: 0, + is_heartbeating: std::sync::atomic::AtomicBool::new(true), + has_master: false, + pre_stop_seconds: 0, + volume_state_notify: tokio::sync::Notify::new(), + write_queue: std::sync::OnceLock::new(), + s3_tier_registry: std::sync::RwLock::new( + seaweed_volume::remote_storage::s3_tier::S3TierRegistry::new(), + ), + read_mode: seaweed_volume::config::ReadMode::Local, + master_url: String::new(), + master_urls: Vec::new(), + self_url: String::new(), + http_client: reqwest::Client::new(), + outgoing_http_scheme: "http".to_string(), + outgoing_grpc_tls: None, + metrics_runtime: std::sync::RwLock::new( + seaweed_volume::server::volume_server::RuntimeMetricsConfig::default(), + ), + metrics_notify: tokio::sync::Notify::new(), + fix_jpg_orientation: false, + has_slow_read: false, + read_buffer_size_bytes: 1024 * 1024, + security_file: String::new(), + cli_white_list: vec![], + state_file_path: String::new(), + }); + (state, tmp) +} + +/// Helper: read the entire response body as bytes. +async fn body_bytes(response: axum::response::Response) -> Vec { + let body = response.into_body(); + axum::body::to_bytes(body, usize::MAX) + .await + .expect("failed to read body") + .to_vec() +} + +fn with_remote_addr(request: Request, remote_addr: &str) -> Request { + let mut request = request; + let remote_addr = remote_addr + .parse::() + .expect("invalid socket address"); + request.extensions_mut().insert(ConnectInfo(remote_addr)); + request +} + +// ============================================================================ +// 1. GET /healthz returns 200 when server is running +// ============================================================================ + +#[tokio::test] +async fn healthz_returns_200_when_running() { + let (state, _tmp) = test_state(); + let app = build_admin_router(state); + + let response = app + .oneshot( + Request::builder() + .uri("/healthz") + .body(Body::empty()) + .unwrap(), + ) + .await + .unwrap(); + + assert_eq!(response.status(), StatusCode::OK); +} + +// ============================================================================ +// 2. GET /healthz returns 503 when is_stopping=true +// ============================================================================ + +#[tokio::test] +async fn healthz_returns_503_when_stopping() { + let (state, _tmp) = test_state(); + *state.is_stopping.write().unwrap() = true; + let app = build_admin_router(state); + + let response = app + .oneshot( + Request::builder() + .uri("/healthz") + .body(Body::empty()) + .unwrap(), + ) + .await + .unwrap(); + + assert_eq!(response.status(), StatusCode::SERVICE_UNAVAILABLE); +} + +// ============================================================================ +// 3. GET /status returns JSON with version and volumes array +// ============================================================================ + +#[tokio::test] +async fn status_returns_json_with_version_and_volumes() { + let (state, _tmp) = test_state(); + let app = build_admin_router(state); + + let response = app + .oneshot( + Request::builder() + .uri("/status") + .body(Body::empty()) + .unwrap(), + ) + .await + .unwrap(); + + assert_eq!(response.status(), StatusCode::OK); + + let body = body_bytes(response).await; + let json: serde_json::Value = + serde_json::from_slice(&body).expect("response is not valid JSON"); + + assert!(json.get("Version").is_some(), "missing 'Version' field"); + assert!(json["Version"].is_string(), "'Version' should be a string"); + + assert!(json.get("Volumes").is_some(), "missing 'Volumes' field"); + assert!(json["Volumes"].is_array(), "'Volumes' should be an array"); + + // We created one volume in test_state, so the array should have one entry + let volumes = json["Volumes"].as_array().unwrap(); + assert_eq!(volumes.len(), 1, "expected 1 volume"); + assert_eq!(volumes[0]["Id"], 1); +} + +#[tokio::test] +async fn admin_router_does_not_expose_metrics() { + let (state, _tmp) = test_state(); + let app = build_admin_router(state); + + let response = app + .oneshot( + Request::builder() + .uri("/metrics") + .body(Body::empty()) + .unwrap(), + ) + .await + .unwrap(); + + assert_eq!(response.status(), StatusCode::BAD_REQUEST); +} + +#[tokio::test] +async fn metrics_router_serves_metrics() { + let app = build_metrics_router(); + + let response = app + .oneshot( + Request::builder() + .uri("/metrics") + .body(Body::empty()) + .unwrap(), + ) + .await + .unwrap(); + + assert_eq!(response.status(), StatusCode::OK); +} + +#[tokio::test] +async fn admin_router_rejects_non_whitelisted_uploads() { + let (state, _tmp) = test_state_with_whitelist(vec!["127.0.0.1".to_string()]); + let app = build_admin_router(state); + + let response = app + .oneshot(with_remote_addr( + Request::builder() + .method("POST") + .uri("/1,000000000000000001") + .body(Body::from("blocked")) + .unwrap(), + "10.0.0.9:12345", + )) + .await + .unwrap(); + + assert_eq!(response.status(), StatusCode::UNAUTHORIZED); +} + +#[tokio::test] +async fn admin_router_rejects_non_whitelisted_deletes() { + let (state, _tmp) = test_state_with_whitelist(vec!["127.0.0.1".to_string()]); + let app = build_admin_router(state); + + let response = app + .oneshot(with_remote_addr( + Request::builder() + .method("DELETE") + .uri("/1,000000000000000001") + .body(Body::empty()) + .unwrap(), + "10.0.0.9:12345", + )) + .await + .unwrap(); + + assert_eq!(response.status(), StatusCode::UNAUTHORIZED); +} + +// Go's volume_server.go has /stats/* endpoints commented out (L130-134). +// Requests to /stats/counter fall through to the store handler which returns 400. +#[tokio::test] +async fn admin_router_does_not_expose_stats_routes() { + let (state, _tmp) = test_state_with_whitelist(vec!["127.0.0.1".to_string()]); + let app = build_admin_router_with_ui(state, true); + + let response = app + .oneshot(with_remote_addr( + Request::builder() + .uri("/stats/counter") + .body(Body::empty()) + .unwrap(), + "127.0.0.1:12345", + )) + .await + .unwrap(); + + // Falls through to store handler → 400 (bad volume id) + assert_eq!(response.status(), StatusCode::BAD_REQUEST); +} + +// ============================================================================ +// 4. POST writes data, then GET reads it back +// ============================================================================ + +#[tokio::test] +async fn write_then_read_needle() { + let (state, _tmp) = test_state(); + + // The fid "01637037d6" encodes NeedleId=0x01, Cookie=0x637037d6 + let uri = "/1,01637037d6"; + let payload = b"hello, seaweedfs!"; + + // --- POST (write) --- + let app = build_admin_router(state.clone()); + let response = app + .oneshot( + Request::builder() + .method("POST") + .uri(uri) + .body(Body::from(payload.to_vec())) + .unwrap(), + ) + .await + .unwrap(); + + assert_eq!( + response.status(), + StatusCode::CREATED, + "POST should return 201 Created" + ); + + let body = body_bytes(response).await; + let json: serde_json::Value = + serde_json::from_slice(&body).expect("POST response is not valid JSON"); + assert_eq!(json["size"], payload.len() as u64); + + // --- GET (read back) --- + let app = build_admin_router(state.clone()); + let response = app + .oneshot(Request::builder().uri(uri).body(Body::empty()).unwrap()) + .await + .unwrap(); + + assert_eq!(response.status(), StatusCode::OK, "GET should return 200"); + + let body = body_bytes(response).await; + assert_eq!(body, payload, "GET body should match written data"); +} + +// ============================================================================ +// 5. DELETE deletes a needle, subsequent GET returns 404 +// ============================================================================ + +#[tokio::test] +async fn delete_then_get_returns_404() { + let (state, _tmp) = test_state(); + let uri = "/1,01637037d6"; + let payload = b"to be deleted"; + + // Write the needle first + let app = build_admin_router(state.clone()); + let response = app + .oneshot( + Request::builder() + .method("POST") + .uri(uri) + .body(Body::from(payload.to_vec())) + .unwrap(), + ) + .await + .unwrap(); + assert_eq!(response.status(), StatusCode::CREATED); + + // Delete + let app = build_admin_router(state.clone()); + let response = app + .oneshot( + Request::builder() + .method("DELETE") + .uri(uri) + .body(Body::empty()) + .unwrap(), + ) + .await + .unwrap(); + assert_eq!( + response.status(), + StatusCode::ACCEPTED, + "DELETE should return 202 Accepted" + ); + + // GET should now return 404 + let app = build_admin_router(state.clone()); + let response = app + .oneshot(Request::builder().uri(uri).body(Body::empty()).unwrap()) + .await + .unwrap(); + assert_eq!( + response.status(), + StatusCode::NOT_FOUND, + "GET after DELETE should return 404" + ); +} + +// ============================================================================ +// 6. HEAD returns headers without body +// ============================================================================ + +#[tokio::test] +async fn head_returns_headers_without_body() { + let (state, _tmp) = test_state(); + let uri = "/1,01637037d6"; + let payload = b"head test data"; + + // Write needle + let app = build_admin_router(state.clone()); + let response = app + .oneshot( + Request::builder() + .method("POST") + .uri(uri) + .body(Body::from(payload.to_vec())) + .unwrap(), + ) + .await + .unwrap(); + assert_eq!(response.status(), StatusCode::CREATED); + + // HEAD + let app = build_admin_router(state.clone()); + let response = app + .oneshot( + Request::builder() + .method("HEAD") + .uri(uri) + .body(Body::empty()) + .unwrap(), + ) + .await + .unwrap(); + + assert_eq!(response.status(), StatusCode::OK, "HEAD should return 200"); + + // Content-Length header should be present + let content_length = response + .headers() + .get("content-length") + .expect("HEAD should include Content-Length header"); + let len: usize = content_length + .to_str() + .unwrap() + .parse() + .expect("Content-Length should be a number"); + assert_eq!( + len, + payload.len(), + "Content-Length should match payload size" + ); + + // Body should be empty for HEAD + let body = body_bytes(response).await; + assert!(body.is_empty(), "HEAD body should be empty"); +} + +// ============================================================================ +// 7. Invalid URL path returns 400 +// ============================================================================ + +#[tokio::test] +async fn invalid_url_path_returns_400() { + let (state, _tmp) = test_state(); + let app = build_admin_router(state); + + // "invalidpath" has no comma or slash separator so parse_url_path returns None + let response = app + .oneshot( + Request::builder() + .uri("/invalidpath") + .body(Body::empty()) + .unwrap(), + ) + .await + .unwrap(); + + assert_eq!( + response.status(), + StatusCode::BAD_REQUEST, + "invalid URL path should return 400" + ); +} + +#[tokio::test] +async fn deep_invalid_url_path_returns_400() { + let (state, _tmp) = test_state(); + let app = build_admin_router(state); + + let response = app + .oneshot( + Request::builder() + .uri("/not/a/valid/volume/path") + .body(Body::empty()) + .unwrap(), + ) + .await + .unwrap(); + + assert_eq!(response.status(), StatusCode::BAD_REQUEST); +} + +#[tokio::test] +async fn admin_root_get_returns_400() { + let (state, _tmp) = test_state(); + let app = build_admin_router(state); + + let response = app + .oneshot(Request::builder().uri("/").body(Body::empty()).unwrap()) + .await + .unwrap(); + + assert_eq!(response.status(), StatusCode::BAD_REQUEST); +} + +#[tokio::test] +async fn public_root_get_returns_400() { + let (state, _tmp) = test_state(); + let app = build_public_router(state); + + let response = app + .oneshot(Request::builder().uri("/").body(Body::empty()).unwrap()) + .await + .unwrap(); + + assert_eq!(response.status(), StatusCode::BAD_REQUEST); +} + +#[tokio::test] +async fn public_router_does_not_expose_healthz() { + let (state, _tmp) = test_state(); + let app = build_public_router(state); + + let response = app + .oneshot( + Request::builder() + .uri("/healthz") + .body(Body::empty()) + .unwrap(), + ) + .await + .unwrap(); + + assert_eq!(response.status(), StatusCode::BAD_REQUEST); +} + +// Go's volume_server.go has /stats/* endpoints commented out (L130-134). +#[tokio::test] +async fn admin_router_stats_routes_not_registered() { + let (state, _tmp) = test_state(); + let app = build_admin_router(state); + + let response = app + .oneshot( + Request::builder() + .uri("/stats/counter") + .body(Body::empty()) + .unwrap(), + ) + .await + .unwrap(); + + // Falls through to store handler → 400 (bad volume id) + assert_eq!(response.status(), StatusCode::BAD_REQUEST); +} + +#[tokio::test] +async fn admin_router_hides_ui_when_write_jwt_is_configured() { + let (state, _tmp) = test_state_with_signing_key(b"secret".to_vec()); + let app = build_admin_router(state); + + let response = app + .oneshot( + Request::builder() + .uri("/ui/index.html") + .body(Body::empty()) + .unwrap(), + ) + .await + .unwrap(); + + assert_eq!(response.status(), StatusCode::BAD_REQUEST); +} + +#[tokio::test] +async fn admin_router_can_expose_ui_with_explicit_override() { + let (state, _tmp) = test_state_with_signing_key(b"secret".to_vec()); + let app = build_admin_router_with_ui(state, true); + + let response = app + .oneshot( + Request::builder() + .uri("/ui/index.html") + .body(Body::empty()) + .unwrap(), + ) + .await + .unwrap(); + + assert_eq!(response.status(), StatusCode::OK); + let body = body_bytes(response).await; + let html = String::from_utf8(body).unwrap(); + assert!(html.contains("Disk Stats")); + assert!(html.contains("System Stats")); + assert!(html.contains("Volumes")); +} + +#[tokio::test] +async fn admin_router_ui_override_ignores_read_jwt_checks() { + let (state, _tmp) = test_state_with_signing_key(b"write-secret".to_vec()); + state.guard.write().unwrap().read_signing_key = SigningKey(b"read-secret".to_vec()); + let app = build_admin_router_with_ui(state, true); + + let response = app + .oneshot( + Request::builder() + .uri("/ui/index.html") + .body(Body::empty()) + .unwrap(), + ) + .await + .unwrap(); + + assert_eq!(response.status(), StatusCode::OK); +} + +#[tokio::test] +async fn admin_router_serves_volume_ui_static_assets() { + let (state, _tmp) = test_state(); + let app = build_admin_router(state); + + let response = app + .oneshot( + Request::builder() + .uri("/seaweedfsstatic/bootstrap/3.3.1/css/bootstrap.min.css") + .body(Body::empty()) + .unwrap(), + ) + .await + .unwrap(); + + assert_eq!(response.status(), StatusCode::OK); + assert_eq!( + response + .headers() + .get("content-type") + .and_then(|value| value.to_str().ok()), + Some("text/css; charset=utf-8") + ); + let body = body_bytes(response).await; + assert!(body.len() > 1000); +} diff --git a/seaweed-volume/tools/generate_go_volume_docs.go b/seaweed-volume/tools/generate_go_volume_docs.go new file mode 100644 index 000000000..cdc1cb7dc --- /dev/null +++ b/seaweed-volume/tools/generate_go_volume_docs.go @@ -0,0 +1,1172 @@ +package main + +import ( + "bytes" + "flag" + "fmt" + "go/ast" + "go/parser" + "go/printer" + "go/token" + "io/fs" + "os" + "path/filepath" + "sort" + "strings" + "time" +) + +type FileDoc struct { + RelPath string + PackageName string + AbsPath string + LineCount int + Imports []string + TopLevelDecls []DeclInfo + Functions []*FunctionInfo + RustCounterpart []string +} + +type DeclInfo struct { + Kind string + Names []string + StartLine int + EndLine int + Summary string + Details []string +} + +type FunctionInfo struct { + ID string + PackageName string + FileRelPath string + Name string + Receiver string + ReceiverType string + Signature string + DocComment string + StartLine int + EndLine int + Effect string + CallNames []string + CallDisplay []string + PotentialLocal []string + ExternalCalls []string + PossibleCallers []string + ControlFlow []string + Literals []LiteralInfo + Statements []StmtInfo + SourceLines []string +} + +type LiteralInfo struct { + Line int + Value string + Kind string +} + +type StmtInfo struct { + StartLine int + EndLine int + Kind string + Summary string +} + +type funcIndex struct { + ByPackage map[string]map[string][]string + ByName map[string][]string + Defs map[string]*FunctionInfo +} + +func main() { + rootFlag := flag.String("root", ".", "repository root") + outFlag := flag.String("out", "seaweed-volume/docs/go-volume-server", "output directory for generated markdown") + flag.Parse() + + root, err := filepath.Abs(*rootFlag) + if err != nil { + fail("resolve root", err) + } + outDir := filepath.Join(root, *outFlag) + + paths, err := collectSourceFiles(root) + if err != nil { + fail("collect source files", err) + } + + docs, idx, err := parseFiles(root, paths) + if err != nil { + fail("parse source files", err) + } + + linkCallers(idx) + + if err := os.RemoveAll(outDir); err != nil { + fail("clear output directory", err) + } + if err := os.MkdirAll(outDir, 0o755); err != nil { + fail("create output directory", err) + } + + for _, doc := range docs { + target := filepath.Join(outDir, filepath.FromSlash(doc.RelPath+".md")) + if err := os.MkdirAll(filepath.Dir(target), 0o755); err != nil { + fail("create doc parent", err) + } + content := renderFileDoc(doc) + if err := os.WriteFile(target, []byte(content), 0o644); err != nil { + fail("write doc file", err) + } + } + + readme := renderIndexReadme(*outFlag, docs) + if err := os.WriteFile(filepath.Join(outDir, "README.md"), []byte(readme), 0o644); err != nil { + fail("write index", err) + } + + fmt.Printf("Generated %d Markdown files under %s\n", len(docs)+1, outDir) +} + +func fail(action string, err error) { + fmt.Fprintf(os.Stderr, "%s: %v\n", action, err) + os.Exit(1) +} + +func collectSourceFiles(root string) ([]string, error) { + var files []string + err := filepath.WalkDir(root, func(path string, d fs.DirEntry, err error) error { + if err != nil { + return err + } + if d.IsDir() { + base := filepath.Base(path) + if base == ".git" || base == "target" || base == "vendor" { + return filepath.SkipDir + } + return nil + } + if filepath.Ext(path) != ".go" || strings.HasSuffix(path, "_test.go") { + return nil + } + rel, err := filepath.Rel(root, path) + if err != nil { + return err + } + rel = filepath.ToSlash(rel) + if shouldInclude(rel) { + files = append(files, rel) + } + return nil + }) + sort.Strings(files) + return files, err +} + +func shouldInclude(rel string) bool { + switch { + case rel == "weed/command/volume.go": + return true + case rel == "weed/server/common.go": + return true + case rel == "weed/server/constants/volume.go": + return true + case rel == "weed/server/volume_server_ui/templates.go": + return true + case strings.HasPrefix(rel, "weed/server/volume") && strings.HasSuffix(rel, ".go"): + return true + case strings.HasPrefix(rel, "weed/storage/"): + return true + case strings.HasPrefix(rel, "weed/images/"): + return true + case strings.HasPrefix(rel, "weed/security/"): + return true + case strings.HasPrefix(rel, "weed/stats/"): + return true + default: + return false + } +} + +func parseFiles(root string, relPaths []string) ([]*FileDoc, *funcIndex, error) { + fset := token.NewFileSet() + var docs []*FileDoc + index := &funcIndex{ + ByPackage: map[string]map[string][]string{}, + ByName: map[string][]string{}, + Defs: map[string]*FunctionInfo{}, + } + + for _, rel := range relPaths { + abs := filepath.Join(root, filepath.FromSlash(rel)) + src, err := os.ReadFile(abs) + if err != nil { + return nil, nil, err + } + fileAst, err := parser.ParseFile(fset, abs, src, parser.ParseComments) + if err != nil { + return nil, nil, err + } + + lines := splitLines(string(src)) + doc := &FileDoc{ + RelPath: rel, + PackageName: fileAst.Name.Name, + AbsPath: abs, + LineCount: len(lines), + Imports: collectImports(fileAst), + TopLevelDecls: collectDecls(fset, fileAst), + RustCounterpart: rustCounterparts(rel), + } + + for _, decl := range fileAst.Decls { + funcDecl, ok := decl.(*ast.FuncDecl) + if !ok { + continue + } + info := collectFunctionInfo(fset, rel, fileAst.Name.Name, funcDecl, lines) + doc.Functions = append(doc.Functions, info) + index.Defs[info.ID] = info + + if index.ByPackage[info.PackageName] == nil { + index.ByPackage[info.PackageName] = map[string][]string{} + } + index.ByPackage[info.PackageName][info.Name] = append(index.ByPackage[info.PackageName][info.Name], info.ID) + index.ByName[info.Name] = append(index.ByName[info.Name], info.ID) + } + + sort.Slice(doc.Functions, func(i, j int) bool { + return doc.Functions[i].StartLine < doc.Functions[j].StartLine + }) + docs = append(docs, doc) + } + + sort.Slice(docs, func(i, j int) bool { return docs[i].RelPath < docs[j].RelPath }) + for _, ids := range index.ByName { + sort.Strings(ids) + } + for _, byName := range index.ByPackage { + for _, ids := range byName { + sort.Strings(ids) + } + } + return docs, index, nil +} + +func collectImports(fileAst *ast.File) []string { + var imports []string + for _, imp := range fileAst.Imports { + path := strings.Trim(imp.Path.Value, "\"") + if imp.Name != nil { + imports = append(imports, imp.Name.Name+" "+path) + } else { + imports = append(imports, path) + } + } + sort.Strings(imports) + return imports +} + +func collectDecls(fset *token.FileSet, fileAst *ast.File) []DeclInfo { + var decls []DeclInfo + for _, decl := range fileAst.Decls { + genDecl, ok := decl.(*ast.GenDecl) + if !ok { + continue + } + info := DeclInfo{ + Kind: strings.ToLower(genDecl.Tok.String()), + StartLine: fset.Position(genDecl.Pos()).Line, + EndLine: fset.Position(genDecl.End()).Line, + } + for _, spec := range genDecl.Specs { + switch s := spec.(type) { + case *ast.TypeSpec: + info.Names = append(info.Names, s.Name.Name) + info.Details = append(info.Details, summarizeTypeSpec(fset, s)...) + case *ast.ValueSpec: + for i, name := range s.Names { + info.Names = append(info.Names, name.Name) + value := "" + if i < len(s.Values) { + value = nodeString(fset, s.Values[i]) + } + switch { + case value != "": + info.Details = append(info.Details, fmt.Sprintf("L%d `%s` = `%s`", fset.Position(name.Pos()).Line, name.Name, sanitizeInline(value))) + case s.Type != nil: + info.Details = append(info.Details, fmt.Sprintf("L%d `%s` has declared type `%s`", fset.Position(name.Pos()).Line, name.Name, sanitizeInline(nodeString(fset, s.Type)))) + default: + info.Details = append(info.Details, fmt.Sprintf("L%d `%s` is declared without an inline initializer", fset.Position(name.Pos()).Line, name.Name)) + } + } + } + } + if len(info.Names) == 0 { + info.Names = []string{""} + } + info.Summary = fmt.Sprintf("%s declaration covering %s", info.Kind, strings.Join(info.Names, ", ")) + decls = append(decls, info) + } + return decls +} + +func summarizeTypeSpec(fset *token.FileSet, spec *ast.TypeSpec) []string { + var details []string + switch t := spec.Type.(type) { + case *ast.StructType: + for _, field := range t.Fields.List { + names := []string{""} + if len(field.Names) > 0 { + names = nil + for _, name := range field.Names { + names = append(names, name.Name) + } + } + line := fset.Position(field.Pos()).Line + msg := fmt.Sprintf("L%d fields `%s` have type `%s`", line, strings.Join(names, "`, `"), sanitizeInline(nodeString(fset, field.Type))) + if field.Tag != nil { + msg += fmt.Sprintf(" with tag `%s`", sanitizeInline(field.Tag.Value)) + } + details = append(details, msg) + } + case *ast.InterfaceType: + for _, field := range t.Methods.List { + names := []string{""} + if len(field.Names) > 0 { + names = nil + for _, name := range field.Names { + names = append(names, name.Name) + } + } + details = append(details, fmt.Sprintf("L%d interface item `%s` has type `%s`", fset.Position(field.Pos()).Line, strings.Join(names, "`, `"), sanitizeInline(nodeString(fset, field.Type)))) + } + default: + details = append(details, fmt.Sprintf("L%d `%s` resolves to `%s`", fset.Position(spec.Pos()).Line, spec.Name.Name, sanitizeInline(nodeString(fset, spec.Type)))) + } + return details +} + +func collectFunctionInfo(fset *token.FileSet, relPath, pkgName string, decl *ast.FuncDecl, lines []string) *FunctionInfo { + startLine := fset.Position(decl.Pos()).Line + endLine := fset.Position(decl.End()).Line + if endLine > len(lines) { + endLine = len(lines) + } + sourceLines := make([]string, 0, endLine-startLine+1) + for i := startLine; i <= endLine; i++ { + sourceLines = append(sourceLines, lines[i-1]) + } + + info := &FunctionInfo{ + PackageName: pkgName, + FileRelPath: relPath, + Name: decl.Name.Name, + StartLine: startLine, + EndLine: endLine, + DocComment: cleanDocComment(decl.Doc), + Signature: buildSignature(fset, decl), + SourceLines: sourceLines, + } + if decl.Recv != nil && len(decl.Recv.List) > 0 { + field := decl.Recv.List[0] + info.ReceiverType = nodeString(fset, field.Type) + if len(field.Names) > 0 { + info.Receiver = field.Names[0].Name + } + info.ID = pkgName + "::" + normalizeReceiverType(info.ReceiverType) + "." + info.Name + } else { + info.ID = pkgName + "::" + info.Name + } + + if decl.Body != nil { + callNames, callDisplay := collectCalls(fset, decl.Body) + info.CallNames = callNames + info.CallDisplay = callDisplay + info.ControlFlow = collectControlFlow(fset, decl.Body) + info.Literals = collectLiterals(fset, decl.Body) + info.Statements = collectStatements(fset, decl.Body) + } + + info.Effect = deriveEffect(info, decl) + return info +} + +func buildSignature(fset *token.FileSet, decl *ast.FuncDecl) string { + typeText := sanitizeInline(strings.TrimSpace(nodeString(fset, decl.Type))) + typeText = strings.TrimPrefix(typeText, "func") + typeText = strings.TrimSpace(typeText) + if decl.Recv != nil { + return fmt.Sprintf("func (%s) %s%s", sanitizeInline(fieldListString(fset, decl.Recv)), decl.Name.Name, typeText) + } + return fmt.Sprintf("func %s%s", decl.Name.Name, typeText) +} + +func collectCalls(fset *token.FileSet, body *ast.BlockStmt) ([]string, []string) { + nameSet := map[string]struct{}{} + displaySet := map[string]struct{}{} + ast.Inspect(body, func(n ast.Node) bool { + call, ok := n.(*ast.CallExpr) + if !ok { + return true + } + name := simpleCallName(call.Fun) + if name != "" { + nameSet[name] = struct{}{} + } + display := sanitizeInline(nodeString(fset, call.Fun)) + if display != "" { + displaySet[display] = struct{}{} + } + return true + }) + return sortedKeys(nameSet), sortedKeys(displaySet) +} + +func collectControlFlow(fset *token.FileSet, body *ast.BlockStmt) []string { + var items []string + ast.Inspect(body, func(n ast.Node) bool { + switch s := n.(type) { + case *ast.IfStmt: + msg := fmt.Sprintf("L%d branches when `%s`", fset.Position(s.Pos()).Line, sanitizeInline(nodeString(fset, s.Cond))) + if s.Init != nil { + msg += fmt.Sprintf(" after `%s`", sanitizeInline(nodeString(fset, s.Init))) + } + items = append(items, msg) + case *ast.ForStmt: + cond := "forever" + if s.Cond != nil { + cond = sanitizeInline(nodeString(fset, s.Cond)) + } + items = append(items, fmt.Sprintf("L%d loops while `%s`", fset.Position(s.Pos()).Line, cond)) + case *ast.RangeStmt: + items = append(items, fmt.Sprintf("L%d ranges `%s` over `%s`", fset.Position(s.Pos()).Line, sanitizeInline(nodeString(fset, s.Key)), sanitizeInline(nodeString(fset, s.X)))) + case *ast.SwitchStmt: + tag := "" + if s.Tag != nil { + tag = sanitizeInline(nodeString(fset, s.Tag)) + } + items = append(items, fmt.Sprintf("L%d switches on `%s`", fset.Position(s.Pos()).Line, tag)) + case *ast.TypeSwitchStmt: + items = append(items, fmt.Sprintf("L%d performs a type switch on `%s`", fset.Position(s.Pos()).Line, sanitizeInline(nodeString(fset, s.Assign)))) + case *ast.SelectStmt: + items = append(items, fmt.Sprintf("L%d selects across channel cases", fset.Position(s.Pos()).Line)) + case *ast.DeferStmt: + items = append(items, fmt.Sprintf("L%d defers `%s`", fset.Position(s.Pos()).Line, sanitizeInline(nodeString(fset, s.Call)))) + case *ast.GoStmt: + items = append(items, fmt.Sprintf("L%d launches goroutine `%s`", fset.Position(s.Pos()).Line, sanitizeInline(nodeString(fset, s.Call)))) + case *ast.ReturnStmt: + items = append(items, fmt.Sprintf("L%d returns `%s`", fset.Position(s.Pos()).Line, sanitizeInline(joinNodes(fset, s.Results)))) + } + return true + }) + return dedupeKeepOrder(items) +} + +func collectLiterals(fset *token.FileSet, body *ast.BlockStmt) []LiteralInfo { + var literals []LiteralInfo + seen := map[string]struct{}{} + ast.Inspect(body, func(n ast.Node) bool { + switch lit := n.(type) { + case *ast.BasicLit: + item := LiteralInfo{ + Line: fset.Position(lit.Pos()).Line, + Value: lit.Value, + Kind: lit.Kind.String(), + } + key := fmt.Sprintf("%d|%s|%s", item.Line, item.Kind, item.Value) + if _, ok := seen[key]; !ok { + literals = append(literals, item) + seen[key] = struct{}{} + } + case *ast.Ident: + if lit.Name != "true" && lit.Name != "false" && lit.Name != "nil" { + return true + } + item := LiteralInfo{ + Line: fset.Position(lit.Pos()).Line, + Value: lit.Name, + Kind: "keyword", + } + key := fmt.Sprintf("%d|%s|%s", item.Line, item.Kind, item.Value) + if _, ok := seen[key]; !ok { + literals = append(literals, item) + seen[key] = struct{}{} + } + } + return true + }) + sort.Slice(literals, func(i, j int) bool { + if literals[i].Line == literals[j].Line { + if literals[i].Kind == literals[j].Kind { + return literals[i].Value < literals[j].Value + } + return literals[i].Kind < literals[j].Kind + } + return literals[i].Line < literals[j].Line + }) + return literals +} + +func collectStatements(fset *token.FileSet, body *ast.BlockStmt) []StmtInfo { + var items []StmtInfo + var walkBlock func([]ast.Stmt) + walkBlock = func(stmts []ast.Stmt) { + for _, stmt := range stmts { + info := summarizeStmt(fset, stmt) + if info.Kind != "" { + items = append(items, info) + } + switch s := stmt.(type) { + case *ast.BlockStmt: + walkBlock(s.List) + case *ast.IfStmt: + walkBlock(s.Body.List) + switch elseNode := s.Else.(type) { + case *ast.BlockStmt: + walkBlock(elseNode.List) + case *ast.IfStmt: + walkBlock([]ast.Stmt{elseNode}) + } + case *ast.ForStmt: + walkBlock(s.Body.List) + case *ast.RangeStmt: + walkBlock(s.Body.List) + case *ast.SwitchStmt: + for _, stmt := range s.Body.List { + if clause, ok := stmt.(*ast.CaseClause); ok { + items = append(items, summarizeStmt(fset, clause)) + walkBlock(clause.Body) + } + } + case *ast.TypeSwitchStmt: + for _, stmt := range s.Body.List { + if clause, ok := stmt.(*ast.CaseClause); ok { + items = append(items, summarizeStmt(fset, clause)) + walkBlock(clause.Body) + } + } + case *ast.SelectStmt: + for _, stmt := range s.Body.List { + if clause, ok := stmt.(*ast.CommClause); ok { + items = append(items, summarizeStmt(fset, clause)) + walkBlock(clause.Body) + } + } + case *ast.LabeledStmt: + walkBlock([]ast.Stmt{s.Stmt}) + } + } + } + walkBlock(body.List) + sort.Slice(items, func(i, j int) bool { + if items[i].StartLine == items[j].StartLine { + if items[i].EndLine == items[j].EndLine { + return items[i].Summary < items[j].Summary + } + return items[i].EndLine < items[j].EndLine + } + return items[i].StartLine < items[j].StartLine + }) + return items +} + +func summarizeStmt(fset *token.FileSet, stmt ast.Stmt) StmtInfo { + info := StmtInfo{ + StartLine: fset.Position(stmt.Pos()).Line, + EndLine: fset.Position(stmt.End()).Line, + } + switch s := stmt.(type) { + case *ast.AssignStmt: + info.Kind = "assign" + lhs := sanitizeInline(joinNodes(fset, s.Lhs)) + rhs := sanitizeInline(joinNodes(fset, s.Rhs)) + info.Summary = fmt.Sprintf("assigns `%s` %s `%s`", lhs, s.Tok.String(), rhs) + case *ast.ExprStmt: + info.Kind = "expr" + info.Summary = fmt.Sprintf("executes `%s`", sanitizeInline(nodeString(fset, s.X))) + case *ast.IfStmt: + info.Kind = "if" + info.Summary = fmt.Sprintf("checks `%s`", sanitizeInline(nodeString(fset, s.Cond))) + case *ast.ForStmt: + info.Kind = "for" + cond := "true" + if s.Cond != nil { + cond = sanitizeInline(nodeString(fset, s.Cond)) + } + info.Summary = fmt.Sprintf("loops while `%s`", cond) + case *ast.RangeStmt: + info.Kind = "range" + target := sanitizeInline(nodeString(fset, s.X)) + left := sanitizeInline(joinNodes(fset, []ast.Expr{exprOrBlank(s.Key), exprOrBlank(s.Value)})) + info.Summary = fmt.Sprintf("ranges `%s` over `%s`", left, target) + case *ast.ReturnStmt: + info.Kind = "return" + info.Summary = fmt.Sprintf("returns `%s`", sanitizeInline(joinNodes(fset, s.Results))) + case *ast.DeferStmt: + info.Kind = "defer" + info.Summary = fmt.Sprintf("defers `%s`", sanitizeInline(nodeString(fset, s.Call))) + case *ast.GoStmt: + info.Kind = "go" + info.Summary = fmt.Sprintf("launches goroutine `%s`", sanitizeInline(nodeString(fset, s.Call))) + case *ast.SwitchStmt: + info.Kind = "switch" + tag := "true" + if s.Tag != nil { + tag = sanitizeInline(nodeString(fset, s.Tag)) + } + info.Summary = fmt.Sprintf("switches on `%s`", tag) + case *ast.TypeSwitchStmt: + info.Kind = "type-switch" + info.Summary = fmt.Sprintf("type-switches on `%s`", sanitizeInline(nodeString(fset, s.Assign))) + case *ast.SelectStmt: + info.Kind = "select" + info.Summary = "selects across channel operations" + case *ast.CaseClause: + info.Kind = "case" + if len(s.List) == 0 { + info.Summary = "default case" + } else { + info.Summary = fmt.Sprintf("case `%s`", sanitizeInline(joinNodes(fset, s.List))) + } + case *ast.CommClause: + info.Kind = "comm" + if s.Comm == nil { + info.Summary = "default communication case" + } else { + info.Summary = fmt.Sprintf("communication case `%s`", sanitizeInline(nodeString(fset, s.Comm))) + } + case *ast.BranchStmt: + info.Kind = "branch" + if s.Label != nil { + info.Summary = fmt.Sprintf("%s to label `%s`", strings.ToLower(s.Tok.String()), s.Label.Name) + } else { + info.Summary = strings.ToLower(s.Tok.String()) + } + case *ast.SendStmt: + info.Kind = "send" + info.Summary = fmt.Sprintf("sends `%s` to `%s`", sanitizeInline(nodeString(fset, s.Value)), sanitizeInline(nodeString(fset, s.Chan))) + case *ast.IncDecStmt: + info.Kind = "incdec" + info.Summary = fmt.Sprintf("%s `%s`", strings.ToLower(s.Tok.String()), sanitizeInline(nodeString(fset, s.X))) + case *ast.DeclStmt: + info.Kind = "decl" + info.Summary = fmt.Sprintf("declares `%s`", sanitizeInline(nodeString(fset, s.Decl))) + case *ast.LabeledStmt: + info.Kind = "label" + info.Summary = fmt.Sprintf("label `%s`", s.Label.Name) + default: + info.Kind = strings.ToLower(strings.TrimSuffix(strings.TrimPrefix(fmt.Sprintf("%T", stmt), "*ast."), "Stmt")) + info.Summary = sanitizeInline(nodeString(fset, stmt)) + } + return info +} + +func exprOrBlank(expr ast.Expr) ast.Expr { + if expr == nil { + return &ast.Ident{Name: "_"} + } + return expr +} + +func deriveEffect(info *FunctionInfo, decl *ast.FuncDecl) string { + if info.DocComment != "" { + return info.DocComment + } + name := info.Name + switch { + case strings.HasPrefix(name, "New"): + return fmt.Sprintf("Constructs and returns `%s`-related state.", strings.TrimPrefix(name, "New")) + case strings.HasPrefix(name, "Get"): + return "Retrieves or serves the requested resource and returns the outcome." + case strings.HasPrefix(name, "Read"): + return "Reads storage or request data and converts it into the function's return or streamed response." + case strings.HasPrefix(name, "Write"): + return "Writes state, file data, or response output and reports the result." + case strings.HasPrefix(name, "Delete"): + return "Deletes the targeted state or storage entries and returns status." + case strings.HasPrefix(name, "Update"): + return "Updates existing state in place, usually based on request or runtime conditions." + case strings.HasPrefix(name, "Load"): + return "Loads persisted state or configuration into runtime structures." + case strings.HasPrefix(name, "Save"): + return "Persists runtime state or derived data." + case strings.HasPrefix(name, "parse"), strings.HasPrefix(name, "Parse"): + return "Parses inbound text or binary input into structured values." + case strings.HasSuffix(name, "Handler"): + return "Handles an HTTP endpoint and writes the response side effects directly." + case strings.Contains(name, "Heartbeat"): + return "Maintains master/volume heartbeat state and its side effects." + case strings.Contains(name, "Vacuum"): + return "Runs or coordinates vacuum/compaction related work." + case strings.Contains(name, "Copy"): + return "Copies data between storage locations or peer volume servers." + case strings.Contains(name, "Scrub"): + return "Validates stored data and surfaces corruption or mismatch details." + case strings.Contains(name, "Mount"): + return "Attaches runtime-visible storage or shard state." + case strings.Contains(name, "Unmount"): + return "Detaches runtime-visible storage or shard state." + case strings.Contains(name, "Needle"): + return "Manipulates or transports SeaweedFS needle state." + default: + if info.ReceiverType != "" { + return fmt.Sprintf("Implements `%s` behavior on receiver `%s`.", name, sanitizeInline(info.ReceiverType)) + } + return fmt.Sprintf("Implements `%s` for package `%s`.", name, info.PackageName) + } +} + +func linkCallers(idx *funcIndex) { + for _, fn := range idx.Defs { + var local []string + var external []string + for _, display := range fn.CallDisplay { + simple := simpleNameFromDisplay(display) + if ids, ok := idx.ByPackage[fn.PackageName][simple]; ok && len(ids) > 0 { + local = append(local, display) + } else { + external = append(external, display) + } + } + fn.PotentialLocal = dedupeKeepOrder(local) + fn.ExternalCalls = dedupeKeepOrder(external) + + var callers []string + if ids, ok := idx.ByName[fn.Name]; ok { + for _, candidateID := range ids { + if candidateID == fn.ID { + continue + } + candidate := idx.Defs[candidateID] + for _, callName := range candidate.CallNames { + if callName == fn.Name { + callers = append(callers, candidateID) + break + } + } + } + } + sort.Strings(callers) + fn.PossibleCallers = callers + } +} + +func renderFileDoc(doc *FileDoc) string { + var b strings.Builder + b.WriteString("# " + doc.RelPath + "\n\n") + b.WriteString("- Source file: `" + doc.RelPath + "`\n") + b.WriteString("- Package: `" + doc.PackageName + "`\n") + b.WriteString(fmt.Sprintf("- Total lines: `%d`\n", doc.LineCount)) + if len(doc.RustCounterpart) > 0 { + b.WriteString("- Rust counterpart candidates: `" + strings.Join(doc.RustCounterpart, "`, `") + "`\n") + } else { + b.WriteString("- Rust counterpart candidates: none mapped directly; behavior may still be folded into adjacent Rust modules.\n") + } + b.WriteString("\n## Imports\n\n") + if len(doc.Imports) == 0 { + b.WriteString("This file has no imports.\n") + } else { + for _, imp := range doc.Imports { + b.WriteString("- `" + imp + "`\n") + } + } + + b.WriteString("\n## Top-Level Declarations\n\n") + if len(doc.TopLevelDecls) == 0 { + b.WriteString("No package-level const/var/type declarations in this file.\n") + } else { + for _, decl := range doc.TopLevelDecls { + b.WriteString(fmt.Sprintf("### `%s` `%s`\n\n", decl.Kind, strings.Join(decl.Names, "`, `"))) + b.WriteString(fmt.Sprintf("- Lines: `%d-%d`\n", decl.StartLine, decl.EndLine)) + b.WriteString("- Role: " + decl.Summary + "\n") + if len(decl.Details) > 0 { + b.WriteString("- Details:\n") + for _, detail := range decl.Details { + b.WriteString(" - " + detail + "\n") + } + } + b.WriteString("\n") + } + } + + b.WriteString("## Function Inventory\n\n") + if len(doc.Functions) == 0 { + b.WriteString("No functions or methods are declared in this file.\n") + return b.String() + } + for _, fn := range doc.Functions { + receiver := "" + if fn.ReceiverType != "" { + receiver = " receiver `" + sanitizeInline(fn.ReceiverType) + "`" + } + b.WriteString(fmt.Sprintf("- `%s`%s at lines `%d-%d`\n", fn.Name, receiver, fn.StartLine, fn.EndLine)) + } + + for _, fn := range doc.Functions { + b.WriteString("\n## `" + fn.Name + "`\n\n") + b.WriteString("- Signature: `" + fn.Signature + "`\n") + b.WriteString(fmt.Sprintf("- Lines: `%d-%d`\n", fn.StartLine, fn.EndLine)) + if fn.ReceiverType != "" { + b.WriteString("- Receiver: `" + sanitizeInline(fn.ReceiverType) + "`") + if fn.Receiver != "" { + b.WriteString(fmt.Sprintf(" bound as `%s`", fn.Receiver)) + } + b.WriteString("\n") + } + b.WriteString("- Effect: " + fn.Effect + "\n") + if fn.DocComment != "" { + b.WriteString("- Native doc comment: `" + sanitizeInline(fn.DocComment) + "`\n") + } + + b.WriteString("\n### Relations\n\n") + if len(fn.PotentialLocal) > 0 { + b.WriteString("- Local package calls: `" + strings.Join(fn.PotentialLocal, "`, `") + "`\n") + } else { + b.WriteString("- Local package calls: none detected from simple call-name matching.\n") + } + if len(fn.ExternalCalls) > 0 { + b.WriteString("- External or unresolved calls: `" + strings.Join(fn.ExternalCalls, "`, `") + "`\n") + } else { + b.WriteString("- External or unresolved calls: none detected.\n") + } + if len(fn.PossibleCallers) > 0 { + b.WriteString("- Possible name-matched callers in scanned scope: `" + strings.Join(fn.PossibleCallers, "`, `") + "`\n") + } else { + b.WriteString("- Possible name-matched callers in scanned scope: none detected.\n") + } + + b.WriteString("\n### Control Flow\n\n") + if len(fn.ControlFlow) == 0 { + b.WriteString("No notable branch/loop/defer/return items were extracted.\n") + } else { + for _, item := range fn.ControlFlow { + b.WriteString("- " + item + "\n") + } + } + + b.WriteString("\n### Literal And Keyword Touchpoints\n\n") + if len(fn.Literals) == 0 { + b.WriteString("No literals or keyword literals (`true`, `false`, `nil`) were extracted from the body.\n") + } else { + for _, lit := range fn.Literals { + b.WriteString(fmt.Sprintf("- L%d `%s` = `%s`\n", lit.Line, lit.Kind, sanitizeInline(lit.Value))) + } + } + + b.WriteString("\n### Line-Level Operating Logic\n\n") + lineNotes := lineNoteMap(fn.Statements) + for offset, raw := range fn.SourceLines { + lineNo := fn.StartLine + offset + trimmed := strings.TrimSpace(raw) + if trimmed == "" { + continue + } + note := explainLine(trimmed, lineNo, lineNotes) + b.WriteString(fmt.Sprintf("- L%d: `%s`", lineNo, sanitizeInline(trimmed))) + if note != "" { + b.WriteString(" -> " + note) + } + b.WriteString("\n") + } + } + + return b.String() +} + +func renderIndexReadme(outRel string, docs []*FileDoc) string { + var b strings.Builder + b.WriteString("# Go Volume Server Translation Docs\n\n") + b.WriteString("Generated reference set for translating the Go SeaweedFS volume server into the Rust `seaweed-volume` crate.\n\n") + b.WriteString("- Generated at: `" + time.Now().Format(time.RFC3339) + "`\n") + b.WriteString(fmt.Sprintf("- Markdown files: `%d`\n", len(docs))) + b.WriteString("- Scope: `weed/command/volume.go`, selected `weed/server` volume-server files, and runtime files under `weed/storage`, `weed/images`, `weed/security`, and `weed/stats`.\n") + b.WriteString("- Output root: `" + sanitizeInline(outRel) + "`\n\n") + + groups := map[string][]*FileDoc{} + groupOrder := []string{"command", "server", "storage", "images", "security", "stats"} + for _, doc := range docs { + group := strings.Split(doc.RelPath, "/")[1] + groups[group] = append(groups[group], doc) + } + + for _, group := range groupOrder { + items := groups[group] + if len(items) == 0 { + continue + } + sort.Slice(items, func(i, j int) bool { return items[i].RelPath < items[j].RelPath }) + b.WriteString("## " + strings.Title(group) + "\n\n") + for _, doc := range items { + target := filepath.ToSlash(filepath.Join(outRel, doc.RelPath+".md")) + b.WriteString("- `" + doc.RelPath + "` -> `" + target + "`") + if len(doc.RustCounterpart) > 0 { + b.WriteString(" | Rust: `" + strings.Join(doc.RustCounterpart, "`, `") + "`") + } + b.WriteString("\n") + } + b.WriteString("\n") + } + return b.String() +} + +func lineNoteMap(statements []StmtInfo) map[int]string { + notes := map[int]string{} + for _, stmt := range statements { + if stmt.Summary != "" { + notes[stmt.StartLine] = stmt.Summary + } + } + return notes +} + +func explainLine(line string, lineNo int, notes map[int]string) string { + if note, ok := notes[lineNo]; ok { + return note + } + switch { + case strings.HasPrefix(line, "func "): + return "function signature header" + case strings.HasPrefix(line, "//"): + return "comment line" + case strings.HasPrefix(line, "/*") || strings.HasPrefix(line, "*/"): + return "comment block boundary" + case line == "{" || line == "}" || line == "})" || line == "};": + return "block boundary" + case strings.HasPrefix(line, "else"): + return "alternate control-flow branch" + case strings.HasPrefix(line, "case "): + return "switch/select case label" + case strings.HasPrefix(line, "default:"): + return "default case label" + case strings.HasPrefix(line, "return"): + return "returns from the function" + case strings.HasPrefix(line, "defer "): + return "registers deferred work for function exit" + case strings.HasPrefix(line, "go "): + return "starts a goroutine" + case strings.HasPrefix(line, "if "): + return "conditional check" + case strings.HasPrefix(line, "for "): + return "loop header" + case strings.HasPrefix(line, "switch "): + return "switch header" + case strings.HasPrefix(line, "select "): + return "channel select header" + case strings.Contains(line, ":="): + return "declares and assigns local state" + case looksLikeAssignment(line): + return "updates existing state" + case strings.HasSuffix(line, ")") || strings.HasSuffix(line, "},") || strings.HasSuffix(line, "})"): + return "executes a call or composite literal line" + default: + return "continuation or structural line" + } +} + +func looksLikeAssignment(line string) bool { + if strings.Contains(line, "==") || strings.Contains(line, ">=") || strings.Contains(line, "<=") || strings.Contains(line, "!=") { + return false + } + if strings.Contains(line, "=") { + return true + } + return false +} + +func rustCounterparts(rel string) []string { + switch { + case rel == "weed/command/volume.go": + return []string{"seaweed-volume/src/config.rs", "seaweed-volume/src/main.rs"} + case strings.HasPrefix(rel, "weed/images/"): + return []string{"seaweed-volume/src/images.rs"} + case strings.HasPrefix(rel, "weed/security/"): + return []string{"seaweed-volume/src/security.rs"} + case strings.HasPrefix(rel, "weed/stats/"): + return []string{"seaweed-volume/src/metrics.rs"} + case rel == "weed/server/common.go": + return []string{"seaweed-volume/src/server/handlers.rs", "seaweed-volume/src/server/volume_server.rs", "seaweed-volume/src/main.rs"} + case rel == "weed/server/constants/volume.go": + return []string{"seaweed-volume/src/server/mod.rs", "seaweed-volume/src/server/volume_server.rs"} + case rel == "weed/server/volume_server.go": + return []string{"seaweed-volume/src/server/volume_server.rs", "seaweed-volume/src/server/heartbeat.rs", "seaweed-volume/src/main.rs"} + case strings.HasPrefix(rel, "weed/server/volume_server_handlers"): + return []string{"seaweed-volume/src/server/handlers.rs", "seaweed-volume/src/server/volume_server.rs"} + case strings.HasPrefix(rel, "weed/server/volume_grpc_"): + return []string{"seaweed-volume/src/server/grpc_server.rs", "seaweed-volume/src/server/heartbeat.rs"} + case rel == "weed/server/volume_server_ui/templates.go": + return []string{"seaweed-volume/src/server/volume_server.rs"} + case rel == "weed/storage/disk_location.go" || rel == "weed/storage/disk_location_ec.go": + return []string{"seaweed-volume/src/storage/disk_location.rs"} + case strings.HasPrefix(rel, "weed/storage/erasure_coding/"): + name := filepath.Base(rel) + switch name { + case "ec_decoder.go": + return []string{"seaweed-volume/src/storage/erasure_coding/ec_decoder.rs"} + case "ec_encoder.go": + return []string{"seaweed-volume/src/storage/erasure_coding/ec_encoder.rs"} + case "ec_locate.go": + return []string{"seaweed-volume/src/storage/erasure_coding/ec_locate.rs"} + case "ec_shard.go", "ec_shard_info.go", "ec_shards_info.go": + return []string{"seaweed-volume/src/storage/erasure_coding/ec_shard.rs"} + default: + return []string{"seaweed-volume/src/storage/erasure_coding/ec_volume.rs", "seaweed-volume/src/storage/erasure_coding/mod.rs"} + } + case strings.HasPrefix(rel, "weed/storage/needle/"): + name := filepath.Base(rel) + switch name { + case "crc.go": + return []string{"seaweed-volume/src/storage/needle/crc.rs"} + case "volume_ttl.go": + return []string{"seaweed-volume/src/storage/needle/ttl.rs"} + default: + return []string{"seaweed-volume/src/storage/needle/needle.rs", "seaweed-volume/src/storage/needle/mod.rs"} + } + case rel == "weed/storage/needle_map.go" || strings.HasPrefix(rel, "weed/storage/needle_map/") || strings.HasPrefix(rel, "weed/storage/needle_map_"): + name := filepath.Base(rel) + if strings.Contains(name, "compact_map") { + return []string{"seaweed-volume/src/storage/needle_map/compact_map.rs"} + } + return []string{"seaweed-volume/src/storage/needle_map.rs"} + case strings.HasPrefix(rel, "weed/storage/store"): + return []string{"seaweed-volume/src/storage/store.rs"} + case strings.HasPrefix(rel, "weed/storage/super_block/"): + return []string{"seaweed-volume/src/storage/super_block.rs"} + case strings.HasPrefix(rel, "weed/storage/types/"): + return []string{"seaweed-volume/src/storage/types.rs"} + case strings.HasPrefix(rel, "weed/storage/backend/s3_backend/"): + return []string{"seaweed-volume/src/remote_storage/s3.rs", "seaweed-volume/src/remote_storage/s3_tier.rs"} + case strings.HasPrefix(rel, "weed/storage/backend/"): + return []string{"seaweed-volume/src/storage/volume.rs", "seaweed-volume/src/storage/mod.rs"} + case strings.HasPrefix(rel, "weed/storage/idx/"): + return []string{"seaweed-volume/src/storage/idx/mod.rs"} + case strings.HasPrefix(rel, "weed/storage/volume"): + return []string{"seaweed-volume/src/storage/volume.rs"} + case strings.HasPrefix(rel, "weed/storage/"): + return []string{"seaweed-volume/src/storage/mod.rs"} + default: + return nil + } +} + +func cleanDocComment(group *ast.CommentGroup) string { + if group == nil { + return "" + } + text := strings.TrimSpace(group.Text()) + return strings.Join(strings.Fields(text), " ") +} + +func nodeString(fset *token.FileSet, node any) string { + if node == nil { + return "" + } + var buf bytes.Buffer + if err := printer.Fprint(&buf, fset, node); err != nil { + return "" + } + return buf.String() +} + +func fieldListString(fset *token.FileSet, fields *ast.FieldList) string { + if fields == nil { + return "" + } + var parts []string + for _, field := range fields.List { + names := make([]string, 0, len(field.Names)) + for _, name := range field.Names { + names = append(names, name.Name) + } + typeText := sanitizeInline(nodeString(fset, field.Type)) + if len(names) == 0 { + parts = append(parts, typeText) + continue + } + parts = append(parts, strings.Join(names, ", ")+" "+typeText) + } + return strings.Join(parts, ", ") +} + +func joinNodes(fset *token.FileSet, nodes []ast.Expr) string { + parts := make([]string, 0, len(nodes)) + for _, node := range nodes { + if node == nil { + continue + } + text := nodeString(fset, node) + if text != "" { + parts = append(parts, text) + } + } + return strings.Join(parts, ", ") +} + +func splitLines(src string) []string { + src = strings.ReplaceAll(src, "\r\n", "\n") + src = strings.ReplaceAll(src, "\r", "\n") + return strings.Split(src, "\n") +} + +func normalizeReceiverType(receiver string) string { + receiver = strings.TrimPrefix(receiver, "*") + receiver = strings.ReplaceAll(receiver, " ", "") + return receiver +} + +func simpleCallName(expr ast.Expr) string { + switch e := expr.(type) { + case *ast.Ident: + return e.Name + case *ast.SelectorExpr: + return e.Sel.Name + case *ast.IndexExpr: + return simpleCallName(e.X) + case *ast.IndexListExpr: + return simpleCallName(e.X) + case *ast.ParenExpr: + return simpleCallName(e.X) + default: + return "" + } +} + +func simpleNameFromDisplay(display string) string { + if strings.Contains(display, ".") { + parts := strings.Split(display, ".") + return parts[len(parts)-1] + } + if strings.Contains(display, "(") { + return strings.TrimSpace(strings.Split(display, "(")[0]) + } + return display +} + +func sortedKeys(set map[string]struct{}) []string { + items := make([]string, 0, len(set)) + for key := range set { + items = append(items, key) + } + sort.Strings(items) + return items +} + +func dedupeKeepOrder(items []string) []string { + var out []string + seen := map[string]struct{}{} + for _, item := range items { + if _, ok := seen[item]; ok { + continue + } + out = append(out, item) + seen[item] = struct{}{} + } + return out +} + +func sanitizeInline(s string) string { + s = strings.TrimSpace(s) + s = strings.ReplaceAll(s, "\n", " ") + s = strings.Join(strings.Fields(s), " ") + s = strings.ReplaceAll(s, "`", "'") + return s +} diff --git a/seaweed-volume/vendor/reed-solomon-erasure/.cargo-ok b/seaweed-volume/vendor/reed-solomon-erasure/.cargo-ok new file mode 100644 index 000000000..5f8b79583 --- /dev/null +++ b/seaweed-volume/vendor/reed-solomon-erasure/.cargo-ok @@ -0,0 +1 @@ +{"v":1} \ No newline at end of file diff --git a/seaweed-volume/vendor/reed-solomon-erasure/.cargo_vcs_info.json b/seaweed-volume/vendor/reed-solomon-erasure/.cargo_vcs_info.json new file mode 100644 index 000000000..9df5c3075 --- /dev/null +++ b/seaweed-volume/vendor/reed-solomon-erasure/.cargo_vcs_info.json @@ -0,0 +1,6 @@ +{ + "git": { + "sha1": "a1ca49de5384445b68ade7d72f31f0379c199943" + }, + "path_in_vcs": "" +} \ No newline at end of file diff --git a/seaweed-volume/vendor/reed-solomon-erasure/.gitattributes b/seaweed-volume/vendor/reed-solomon-erasure/.gitattributes new file mode 100644 index 000000000..1af754e96 --- /dev/null +++ b/seaweed-volume/vendor/reed-solomon-erasure/.gitattributes @@ -0,0 +1,3 @@ +BackBlaze_JavaReedSolomon/* linguist-vendored +KlausPost_reedsolomon/* linguist-vendored +NicolasT_reedsolomon/* linguist-vendored \ No newline at end of file diff --git a/seaweed-volume/vendor/reed-solomon-erasure/.gitignore b/seaweed-volume/vendor/reed-solomon-erasure/.gitignore new file mode 100644 index 000000000..e9e21997b --- /dev/null +++ b/seaweed-volume/vendor/reed-solomon-erasure/.gitignore @@ -0,0 +1,2 @@ +/target/ +/Cargo.lock diff --git a/seaweed-volume/vendor/reed-solomon-erasure/CHANGELOG.md b/seaweed-volume/vendor/reed-solomon-erasure/CHANGELOG.md new file mode 100644 index 000000000..5a86a1329 --- /dev/null +++ b/seaweed-volume/vendor/reed-solomon-erasure/CHANGELOG.md @@ -0,0 +1,181 @@ +## 6.0.0 +- Use LruCache instead of InversionTree for caching data decode matrices + - See [PR #104](https://github.com/rust-rse/reed-solomon-erasure/pull/104) +- Minor code duplication + - See [PR #102](https://github.com/rust-rse/reed-solomon-erasure/pull/102) +- Dependencies update + - Updated `smallvec` from `0.6.1` to `1.8.0` + +## 5.0.3 +- Fixed cross build bug for aarch64 with simd-accel + - See [PR #100](https://github.com/rust-rse/reed-solomon-erasure/pull/100) + +## 5.0.2 +* Add support for `RUST_REED_SOLOMON_ERASURE_ARCH` environment variable and stop using `native` architecture for SIMD code + - See [PR #98](https://github.com/rust-rse/reed-solomon-erasure/pull/98) + +## 5.0.1 +- The `simd-accel` feature now builds on M1 Macs + - See [PR #92](https://github.com/rust-rse/reed-solomon-erasure/pull/92) +- Minor code cleanup + +## 5.0.0 +- Merged several PRs +- Not fully reviewed as I am no longer maintaining this crate + +## 4.0.2 +- Updated build.rs to respect RUSTFLAGS's target-cpu if available + - See [PR #75](https://github.com/darrenldl/reed-solomon-erasure/pull/75) +- Added AVX512 support + - See [PR #69](https://github.com/darrenldl/reed-solomon-erasure/pull/69) +- Disabled SIMD acceleration when MSVC is being used to build the library + - See [PR #67](https://github.com/darrenldl/reed-solomon-erasure/pull/67) +- Dependencies update + - Updated `smallvec` from `0.6` to `1.2` + +## 4.0.1 +- Updated SIMD C code for Windows compatibility + - Removed include of `unistd.h` in `simd_c/reedsolomon.c` + - Removed GCC `nonnull` attribute in `simd_c/reedsolomon.h` + - See PR [#63](https://github.com/darrenldl/reed-solomon-erasure/pull/63) [#64](https://github.com/darrenldl/reed-solomon-erasure/pull/64) for details +- Replaced use of `libc::uint8_t` in `src/galois_8.rs` with `u8` + +## 4.0.0 +- Major API restructure: removed `Shard` type in favor of generic functions +- The logic of this crate is now generic over choice of finite field +- The SIMD acceleration feature for GF(2^8) is now activated with the `simd-accel` Cargo feature. Pure-rust behavior is default. +- Ran rustfmt +- Adds a GF(2^16) implementation + +## 3.1.2 (not published) +- Doc fix + - Added space before parantheses in code comments and documentation +- Disabled SIMD C code for Android and iOS targets entirely + +## 3.1.1 +- Fixed `Matrix::augment` + - The error checking code was incorrect + - Since this method is used in internal code only, and the only use case is a correct use case, the error did not lead to any bugs +- Fixed benchmark data + - Previously used MB=10^6 bytes while I should have used MB=2^20 bytes + - Table in README has been updated accordingly + - The `>= 2.1.0` data is obtained by measuring again with the corrected `rse-benchmark` code + - The `2.0.X` and `1.X.X` data are simply adjusted by mutiplying `10^6` then dividing by `2^20` +- Dependencies update + - Updated `rand` from `0.4` to `0.5.4` +- Added special handling in `build.rs` for CC options on Android and iOS + - `-march=native` is not available for GCC on Android, see issue #23 + +## 3.1.0 +- Impl'd `std::error::Error` for `reed_solomon_erasure::Error` and `reed_solomon_erasure::SBSError` + - See issue [#17](https://github.com/darrenldl/reed-solomon-erasure/issues/17), suggested by [DrPeterVanNostrand](https://github.com/DrPeterVanNostrand) +- Added fuzzing suite + - No code changes due to this as no bugs were found +- Upgraded InversionTree QuickCheck test + - No code changes due to this as no bugs were found +- Upgraded test suite for main codec methods (e.g. encode, reconstruct) + - A lot of heavy QuickCheck tests were added + - No code changes due to this as no bugs were found +- Upgraded test suite for ShardByShard methods + - A lot of heavy QuickCheck tests were added + - No code changes due to this as no bugs were found +- Minor code refactoring in `reconstruct_internal` method + - This means `reconstruct` and related methods are slightly more optimized + +## 3.0.3 +- Added QuickCheck tests to the test suite + - InversionTree is heavily tested now +- No code changes as no bugs were found +- Deps update + - Updated rayon from 0.9 to 1.0 + +## 3.0.2 +- Same as 3.0.1, but 3.0.1 had unapplied changes + +## 3.0.1 (yanked) +- Updated doc for `with_buffer` variants of verifying methods + - Stated explicitly that the buffer contains the correct parity shards after a successful call +- Added tests for the above statement + +## 3.0.0 +- Added `with_buffer` variants for verifying methods + - This gives user the option of reducing heap allocation(s) +- Core code clean up, improvements, and review, added more AUDIT comments +- Improved shard utils +- Added code to remove leftover parity shards in `reconstruct_data_shards` + - This means one fewer gotcha of using the methods +- `ShardByShard` code review and overhaul +- `InversionTree` code review and improvements + +## 2.4.0 +- Added more flexibility for `convert_2D_slices` macro + - Now accepts expressions rather than just identifiers + - The change requires change of syntax + +## 2.3.3 +- Replaced all slice splitting functions in `misc_utils` with std lib ones or rayon ones + - This means there are fewer heap allocations in general + +## 2.3.2 +- Made `==`(`eq`) for `ReedSolomon` more reasonable + - Previously `==` would compare + - data shard count + - parity shard count + - total shard count + - internal encoding matrix + - internal `ParallelParam` + - Now it only compares + - data shard count + - parity shard count + +## 2.3.1 +- Added info on encoding behaviour to doc + +## 2.3.0 +- Made Reed-Solomon codec creation methods return error instead of panic when shard numbers are not correct + +## 2.2.0 +- Fixed SBS error checking code +- Documentation fixes and polishing +- Renamed `Error::InvalidShardsIndicator` to `Error::InvalidShardFlags` +- Added more details to documentation on error handling +- Error handling code overhaul and checks for all method variants +- Dead commented out code cleanup and indent fix + +## 2.1.0 +- Added Nicolas's SIMD C code files, gaining major speedup on supported CPUs +- Added support for "shard by shard" encoding, allowing easier streamed encoding +- Added functions for shard by shard encoding + +## 2.0.0 +- Complete rewrite of most code following Klaus Post's design +- Added optimsations (parallelism, loop unrolling) +- 4-5x faster than `1.X.X` + +## 1.1.1 +- Documentation polish +- Added documentation badge to README +- Optimised internal matrix related operations + - This largely means `decode_missing` is faster + +## 1.1.0 +- Added more helper functions +- Added more tests + +## 1.0.1 +- Added more tests +- Fixed decode_missing + - Previously may reconstruct the missing shards with incorrect length + +## 1.0.0 +- Added more tests +- Added integration with Codecov (via kcov) +- Code refactoring +- Added integration with Coveralls (via kcov) + +## 0.9.1 +- Code restructuring +- Added documentation + +## 0.9.0 +- Base version diff --git a/seaweed-volume/vendor/reed-solomon-erasure/Cargo.toml b/seaweed-volume/vendor/reed-solomon-erasure/Cargo.toml new file mode 100644 index 000000000..a6171580a --- /dev/null +++ b/seaweed-volume/vendor/reed-solomon-erasure/Cargo.toml @@ -0,0 +1,87 @@ +# THIS FILE IS AUTOMATICALLY GENERATED BY CARGO +# +# When uploading crates to the registry Cargo will automatically +# "normalize" Cargo.toml files for maximal compatibility +# with all versions of Cargo and also rewrite `path` dependencies +# to registry (e.g., crates.io) dependencies. +# +# If you are reading this file be aware that the original Cargo.toml +# will likely look very different (and much more reasonable). +# See Cargo.toml.orig for the original contents. + +[package] +edition = "2018" +name = "reed-solomon-erasure" +version = "6.0.0" +authors = ["Darren Ldl "] +build = "build.rs" +exclude = [ + "appveyor.yml", + ".travis.yml", +] +description = "Rust implementation of Reed-Solomon erasure coding" +homepage = "https://github.com/darrenldl/reed-solomon-erasure" +documentation = "https://docs.rs/reed-solomon-erasure" +readme = "README.md" +keywords = [ + "reed-solomon", + "erasure", +] +categories = ["encoding"] +license = "MIT" +repository = "https://github.com/darrenldl/reed-solomon-erasure" + +[[bench]] +name = "reconstruct" + +[dependencies.libc] +version = "0.2" +optional = true + +[dependencies.libm] +version = "0.2.1" + +[dependencies.lru] +version = "0.16.3" + +[dependencies.parking_lot] +version = "0.11.2" +optional = true + +[dependencies.smallvec] +version = "1.2" + +[dependencies.spin] +version = "0.9.2" +features = ["spin_mutex"] +default-features = false + +[dev-dependencies.quickcheck] +version = "0.9" + +[dev-dependencies.rand] +version = "0.7.2" + +[build-dependencies.cc] +version = "1.0" +optional = true + +[features] +default = ["std"] +simd-accel = [ + "cc", + "libc", +] +std = ["parking_lot"] + +[badges.appveyor] +repository = "darrenldl/reed-solomon-erasure" + +[badges.codecov] +repository = "darrenldl/reed-solomon-erasure" + +[badges.coveralls] +repository = "darrenldl/reed-solomon-erasure" + +[badges.travis-ci] +repository = "darrenldl/reed-solomon-erasure" diff --git a/seaweed-volume/vendor/reed-solomon-erasure/Cargo.toml.orig b/seaweed-volume/vendor/reed-solomon-erasure/Cargo.toml.orig new file mode 100644 index 000000000..e9cbc8cf9 --- /dev/null +++ b/seaweed-volume/vendor/reed-solomon-erasure/Cargo.toml.orig @@ -0,0 +1,56 @@ +[package] +name= "reed-solomon-erasure" +version = "6.0.0" +authors = ["Darren Ldl "] +edition = "2018" +build = "build.rs" +exclude = [ + "appveyor.yml", + ".travis.yml" +] + +description = "Rust implementation of Reed-Solomon erasure coding" + +documentation = "https://docs.rs/reed-solomon-erasure" +homepage= "https://github.com/darrenldl/reed-solomon-erasure" +repository= "https://github.com/darrenldl/reed-solomon-erasure" + +readme= "README.md" + +keywords= ["reed-solomon", "erasure"] + +categories= ["encoding"] + +license = "MIT" + +[features] +default = ["std"] # simd off by default +std = ["parking_lot"] +simd-accel = ["cc", "libc"] + +[badges] +travis-ci = { repository = "darrenldl/reed-solomon-erasure" } +appveyor= { repository = "darrenldl/reed-solomon-erasure" } +codecov = { repository = "darrenldl/reed-solomon-erasure" } +coveralls = { repository = "darrenldl/reed-solomon-erasure" } + +[dependencies] +libc = { version = "0.2", optional = true } +# `log2()` impl for `no_std` +libm = "0.2.1" +lru = "0.16.3" +# Efficient `Mutex` implementation for `std` environment +parking_lot = { version = "0.11.2", optional = true } +smallvec = "1.2" +# `Mutex` implementation for `no_std` environment with the same high-level API as `parking_lot` +spin = { version = "0.9.2", default-features = false, features = ["spin_mutex"] } + +[dev-dependencies] +rand = "0.7.2" +quickcheck = "0.9" + +[build-dependencies] +cc = { version = "1.0", optional = true } + +[[bench]] +name = "reconstruct" diff --git a/seaweed-volume/vendor/reed-solomon-erasure/LICENSE b/seaweed-volume/vendor/reed-solomon-erasure/LICENSE new file mode 100644 index 000000000..87c0c3787 --- /dev/null +++ b/seaweed-volume/vendor/reed-solomon-erasure/LICENSE @@ -0,0 +1,24 @@ +MIT License + +Copyright (c) 2017 Darren Ldl +Copyright (c) 2015, 2016 Nicolas Trangez +Copyright (c) 2015 Klaus Post +Copyright (c) 2015 Backblaze + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. diff --git a/seaweed-volume/vendor/reed-solomon-erasure/README.md b/seaweed-volume/vendor/reed-solomon-erasure/README.md new file mode 100644 index 000000000..5d79fab7c --- /dev/null +++ b/seaweed-volume/vendor/reed-solomon-erasure/README.md @@ -0,0 +1,166 @@ +# reed-solomon-erasure +[![Build Status](https://travis-ci.org/darrenldl/reed-solomon-erasure.svg?branch=master)](https://travis-ci.org/darrenldl/reed-solomon-erasure) +[![Build status](https://ci.appveyor.com/api/projects/status/47c0emjoa9bhpjlb/branch/master?svg=true)](https://ci.appveyor.com/project/darrenldl/reed-solomon-erasure/branch/master) +[![codecov](https://codecov.io/gh/darrenldl/reed-solomon-erasure/branch/master/graph/badge.svg)](https://codecov.io/gh/darrenldl/reed-solomon-erasure) +[![Coverage Status](https://coveralls.io/repos/github/darrenldl/reed-solomon-erasure/badge.svg?branch=master)](https://coveralls.io/github/darrenldl/reed-solomon-erasure?branch=master) +[![Crates](https://img.shields.io/crates/v/reed-solomon-erasure.svg)](https://crates.io/crates/reed-solomon-erasure) +[![Documentation](https://docs.rs/reed-solomon-erasure/badge.svg)](https://docs.rs/reed-solomon-erasure) +[![dependency status](https://deps.rs/repo/github/darrenldl/reed-solomon-erasure/status.svg)](https://deps.rs/repo/github/darrenldl/reed-solomon-erasure) + +Rust implementation of Reed-Solomon erasure coding + +WASM builds are also available, see section **WASM usage** below for details + +This is a port of [BackBlaze's Java implementation](https://github.com/Backblaze/JavaReedSolomon), [Klaus Post's Go implementation](https://github.com/klauspost/reedsolomon), and [Nicolas Trangez's Haskell implementation](https://github.com/NicolasT/reedsolomon). + +Version `1.X.X` copies BackBlaze's implementation, and is less performant as there were fewer places where parallelism could be added. + +Version `>= 2.0.0` copies Klaus Post's implementation. The SIMD C code is copied from Nicolas Trangez's implementation with minor modifications. + +See [Notes](#notes) and [License](#license) section for details. + +## WASM usage + +See [here](wasm/README.md) for details + +## Rust usage +Add the following to your `Cargo.toml` for the normal version (pure Rust version) +```toml +[dependencies] +reed-solomon-erasure = "4.0" +``` +or the following for the version which tries to utilise SIMD +```toml +[dependencies] +reed-solomon-erasure = { version = "4.0", features = [ "simd-accel" ] } +``` +and the following to your crate root +```rust +extern crate reed_solomon_erasure; +``` + +NOTE: `simd-accel` is tuned for Haswell+ processors on x86-64 and not in any way for other architectures, set +environment variable `RUST_REED_SOLOMON_ERASURE_ARCH` during build to force compilation of C code for specific architecture (`-march` flag in +GCC/Clang). Even on x86-64 you can achieve better performance by setting it to `native`, but it will stop running on +older CPUs, YMMV. + +## Example +```rust +#[macro_use(shards)] +extern crate reed_solomon_erasure; + +use reed_solomon_erasure::galois_8::ReedSolomon; +// or use the following for Galois 2^16 backend +// use reed_solomon_erasure::galois_16::ReedSolomon; + +fn main () { + let r = ReedSolomon::new(3, 2).unwrap(); // 3 data shards, 2 parity shards + + let mut master_copy = shards!( + [0, 1, 2, 3], + [4, 5, 6, 7], + [8, 9, 10, 11], + [0, 0, 0, 0], // last 2 rows are parity shards + [0, 0, 0, 0] + ); + + // Construct the parity shards + r.encode(&mut master_copy).unwrap(); + + // Make a copy and transform it into option shards arrangement + // for feeding into reconstruct_shards + let mut shards: Vec<_> = master_copy.iter().cloned().map(Some).collect(); + + // We can remove up to 2 shards, which may be data or parity shards + shards[0] = None; + shards[4] = None; + + // Try to reconstruct missing shards + r.reconstruct(&mut shards).unwrap(); + + // Convert back to normal shard arrangement + let result: Vec<_> = shards.into_iter().filter_map(|x| x).collect(); + + assert!(r.verify(&result).unwrap()); + assert_eq!(master_copy, result); +} +``` + +## Benchmark it yourself +You can test performance under different configurations quickly (e.g. data parity shards ratio, parallel parameters) +by cloning this repo: https://github.com/darrenldl/rse-benchmark + +`rse-benchmark` contains a copy of this library (usually a fully functional dev version), so you only need to adjust `main.rs` +then do `cargo run --release` to start the benchmark. + +## Performance +Version `1.X.X`, `2.0.0` do not utilise SIMD. + +Version `2.1.0` onward uses Nicolas's C files for SIMD operations. + +Machine: laptop with `Intel(R) Core(TM) i5-3337U CPU @ 1.80GHz (max 2.70GHz) 2 Cores 4 Threads` + +Below shows the result of one of the test configurations, other configurations show similar results in terms of ratio. + +|Configuration| Klaus Post's | >= 2.1.0 && < 4.0.0 | 2.0.X | 1.X.X | +|---|---|---|---|---| +| 10x2x1M | ~7800MB/s |~4500MB/s | ~1000MB/s | ~240MB/s | + +Versions `>= 4.0.0` have not been benchmarked thoroughly yet + +## Changelog +[Changelog](CHANGELOG.md) + +## Contributions +Contributions are welcome. Note that by submitting contributions, you agree to license your work under the same license used by this project as stated in the LICENSE file. + +## Credits +#### Library overhaul and Galois 2^16 backend +Many thanks to the following people for overhaul of the library and introduction of Galois 2^16 backend + + - [@drskalman](https://github.com/drskalman) + + - Jeff Burdges [@burdges](https://github.com/burdges) + + - Robert Habermeier [@rphmeier](https://github.com/rphmeier) + +#### WASM builds +Many thanks to Nazar Mokrynskyi [@nazar-pc](https://github.com/nazar-pc) for submitting his package for WASM builds + +He is the original author of the files stored in `wasm` folder. The files may have been modified by me later. + +#### AVX512 support +Many thanks to [@sakridge](https://github.com/sakridge) for adding support for AVX512 (see [PR #69](https://github.com/darrenldl/reed-solomon-erasure/pull/69)) + +#### build.rs improvements +Many thanks to [@ryoqun](https://github.com/ryoqun) for improving the usability of the library in the context of cross-compilation (see [PR #75](https://github.com/darrenldl/reed-solomon-erasure/pull/75)) + +#### no_std support +Many thanks to Nazar Mokrynskyi [@nazar-pc](https://github.com/nazar-pc) for adding `no_std` support (see [PR #90](https://github.com/darrenldl/reed-solomon-erasure/pull/90)) + +#### Testers +Many thanks to the following people for testing and benchmarking on various platforms + + - Laurențiu Nicola [@lnicola](https://github.com/lnicola/) (platforms: Linux, Intel) + + - Roger Andersen [@hexjelly](https://github.com/hexjelly) (platforms: Windows, AMD) + +## Notes +#### Code quality review +If you'd like to evaluate the quality of this library, you may find audit comments helpful. + +Simply search for "AUDIT" to see the dev notes that are aimed at facilitating code reviews. + +#### Implementation notes +The `1.X.X` implementation mostly copies [BackBlaze's Java implementation](https://github.com/Backblaze/JavaReedSolomon). + +`2.0.0` onward mostly copies [Klaus Post's Go implementation](https://github.com/klauspost/reedsolomon), and copies C files from [Nicolas Trangez's Haskell implementation](https://github.com/NicolasT/reedsolomon). + +The test suite for all versions copies [Klaus Post's Go implementation](https://github.com/klauspost/reedsolomon) as basis. + +## License +#### Nicolas Trangez's Haskell Reed-Solomon implementation +The C files for SIMD operations are copied (with no/minor modifications) from [Nicolas Trangez's Haskell implementation](https://github.com/NicolasT/reedsolomon), and are under the same MIT License as used by NicolasT's project + +#### TL;DR +All files are released under the MIT License diff --git a/seaweed-volume/vendor/reed-solomon-erasure/benches/reconstruct.rs b/seaweed-volume/vendor/reed-solomon-erasure/benches/reconstruct.rs new file mode 100644 index 000000000..e9d6b6f07 --- /dev/null +++ b/seaweed-volume/vendor/reed-solomon-erasure/benches/reconstruct.rs @@ -0,0 +1,108 @@ +#![feature(test)] + +extern crate test; + +use { + rand::{prelude::*, Rng}, + reed_solomon_erasure::galois_8::Field, + test::Bencher, +}; + +type ReedSolomon = reed_solomon_erasure::ReedSolomon; + +const SHARD_SIZE: usize = 1024; + +fn run_reconstruct_bench(bencher: &mut Bencher, num_data_shards: usize, num_parity_shards: usize) { + let mut rng = rand::thread_rng(); + let mut shards = vec![vec![0u8; SHARD_SIZE]; num_data_shards + num_parity_shards]; + for shard in &mut shards[..num_data_shards] { + rng.fill(&mut shard[..]); + } + let reed_solomon = ReedSolomon::new(num_data_shards, num_parity_shards).unwrap(); + reed_solomon.encode(&mut shards[..]).unwrap(); + let shards: Vec<_> = shards.into_iter().map(Some).collect(); + + bencher.iter(|| { + let mut shards = shards.clone(); + for _ in 0..num_parity_shards { + *shards.choose_mut(&mut rng).unwrap() = None; + } + reed_solomon.reconstruct(&mut shards[..]).unwrap(); + assert!(shards.iter().all(Option::is_some)); + }); +} + +#[bench] +fn bench_reconstruct_2_2(bencher: &mut Bencher) { + run_reconstruct_bench(bencher, 2, 2) +} + +#[bench] +fn bench_reconstruct_4_2(bencher: &mut Bencher) { + run_reconstruct_bench(bencher, 4, 2) +} + +#[bench] +fn bench_reconstruct_4_4(bencher: &mut Bencher) { + run_reconstruct_bench(bencher, 4, 4) +} + +#[bench] +fn bench_reconstruct_8_2(bencher: &mut Bencher) { + run_reconstruct_bench(bencher, 8, 2) +} + +#[bench] +fn bench_reconstruct_8_4(bencher: &mut Bencher) { + run_reconstruct_bench(bencher, 8, 4) +} + +#[bench] +fn bench_reconstruct_8_8(bencher: &mut Bencher) { + run_reconstruct_bench(bencher, 8, 8) +} + +#[bench] +fn bench_reconstruct_16_2(bencher: &mut Bencher) { + run_reconstruct_bench(bencher, 16, 2) +} + +#[bench] +fn bench_reconstruct_16_4(bencher: &mut Bencher) { + run_reconstruct_bench(bencher, 16, 4) +} + +#[bench] +fn bench_reconstruct_16_8(bencher: &mut Bencher) { + run_reconstruct_bench(bencher, 16, 8) +} + +#[bench] +fn bench_reconstruct_16_16(bencher: &mut Bencher) { + run_reconstruct_bench(bencher, 16, 16) +} + +#[bench] +fn bench_reconstruct_32_2(bencher: &mut Bencher) { + run_reconstruct_bench(bencher, 32, 2) +} + +#[bench] +fn bench_reconstruct_32_4(bencher: &mut Bencher) { + run_reconstruct_bench(bencher, 32, 4) +} + +#[bench] +fn bench_reconstruct_32_8(bencher: &mut Bencher) { + run_reconstruct_bench(bencher, 32, 8) +} + +#[bench] +fn bench_reconstruct_32_16(bencher: &mut Bencher) { + run_reconstruct_bench(bencher, 32, 16) +} + +#[bench] +fn bench_reconstruct_32_32(bencher: &mut Bencher) { + run_reconstruct_bench(bencher, 32, 32) +} diff --git a/seaweed-volume/vendor/reed-solomon-erasure/build.rs b/seaweed-volume/vendor/reed-solomon-erasure/build.rs new file mode 100644 index 000000000..de9c5f18e --- /dev/null +++ b/seaweed-volume/vendor/reed-solomon-erasure/build.rs @@ -0,0 +1,196 @@ +use std::env; +use std::fs::File; +use std::io::Write; +use std::path::Path; + +#[cfg(feature = "simd-accel")] +extern crate cc; + +const FIELD_SIZE: usize = 256; + +const GENERATING_POLYNOMIAL: usize = 29; + +fn gen_log_table(polynomial: usize) -> [u8; FIELD_SIZE] { + let mut result: [u8; FIELD_SIZE] = [0; FIELD_SIZE]; + let mut b: usize = 1; + + for log in 0..FIELD_SIZE - 1 { + result[b] = log as u8; + + b = b << 1; + + if FIELD_SIZE <= b { + b = (b - FIELD_SIZE) ^ polynomial; + } + } + + result +} + +const EXP_TABLE_SIZE: usize = FIELD_SIZE * 2 - 2; + +fn gen_exp_table(log_table: &[u8; FIELD_SIZE]) -> [u8; EXP_TABLE_SIZE] { + let mut result: [u8; EXP_TABLE_SIZE] = [0; EXP_TABLE_SIZE]; + + for i in 1..FIELD_SIZE { + let log = log_table[i] as usize; + result[log] = i as u8; + result[log + FIELD_SIZE - 1] = i as u8; + } + + result +} + +fn multiply(log_table: &[u8; FIELD_SIZE], exp_table: &[u8; EXP_TABLE_SIZE], a: u8, b: u8) -> u8 { + if a == 0 || b == 0 { + 0 + } else { + let log_a = log_table[a as usize]; + let log_b = log_table[b as usize]; + let log_result = log_a as usize + log_b as usize; + exp_table[log_result] + } +} + +fn gen_mul_table( + log_table: &[u8; FIELD_SIZE], + exp_table: &[u8; EXP_TABLE_SIZE], +) -> [[u8; FIELD_SIZE]; FIELD_SIZE] { + let mut result: [[u8; FIELD_SIZE]; FIELD_SIZE] = [[0; 256]; 256]; + + for a in 0..FIELD_SIZE { + for b in 0..FIELD_SIZE { + result[a][b] = multiply(log_table, exp_table, a as u8, b as u8); + } + } + + result +} + +fn gen_mul_table_half( + log_table: &[u8; FIELD_SIZE], + exp_table: &[u8; EXP_TABLE_SIZE], +) -> ([[u8; 16]; FIELD_SIZE], [[u8; 16]; FIELD_SIZE]) { + let mut low: [[u8; 16]; FIELD_SIZE] = [[0; 16]; FIELD_SIZE]; + let mut high: [[u8; 16]; FIELD_SIZE] = [[0; 16]; FIELD_SIZE]; + + for a in 0..low.len() { + for b in 0..low.len() { + let mut result = 0; + if !(a == 0 || b == 0) { + let log_a = log_table[a]; + let log_b = log_table[b]; + result = exp_table[log_a as usize + log_b as usize]; + } + if (b & 0x0F) == b { + low[a][b] = result; + } + if (b & 0xF0) == b { + high[a][b >> 4] = result; + } + } + } + (low, high) +} + +macro_rules! write_table { + (1D => $file:ident, $table:ident, $name:expr, $type:expr) => {{ + let len = $table.len(); + let mut table_str = String::from(format!("pub static {}: [{}; {}] = [", $name, $type, len)); + + for v in $table.iter() { + let str = format!("{}, ", v); + table_str.push_str(&str); + } + + table_str.push_str("];\n"); + + $file.write_all(table_str.as_bytes()).unwrap(); + }}; + (2D => $file:ident, $table:ident, $name:expr, $type:expr) => {{ + let rows = $table.len(); + let cols = $table[0].len(); + let mut table_str = String::from(format!( + "pub static {}: [[{}; {}]; {}] = [", + $name, $type, cols, rows + )); + + for a in $table.iter() { + table_str.push_str("["); + for b in a.iter() { + let str = format!("{}, ", b); + table_str.push_str(&str); + } + table_str.push_str("],\n"); + } + + table_str.push_str("];\n"); + + $file.write_all(table_str.as_bytes()).unwrap(); + }}; +} + +fn write_tables() { + let log_table = gen_log_table(GENERATING_POLYNOMIAL); + let exp_table = gen_exp_table(&log_table); + let mul_table = gen_mul_table(&log_table, &exp_table); + + let out_dir = env::var("OUT_DIR").unwrap(); + let dest_path = Path::new(&out_dir).join("table.rs"); + let mut f = File::create(&dest_path).unwrap(); + + write_table!(1D => f, log_table, "LOG_TABLE", "u8"); + write_table!(1D => f, exp_table, "EXP_TABLE", "u8"); + write_table!(2D => f, mul_table, "MUL_TABLE", "u8"); + + if cfg!(feature = "simd-accel") { + let (mul_table_low, mul_table_high) = gen_mul_table_half(&log_table, &exp_table); + + write_table!(2D => f, mul_table_low, "MUL_TABLE_LOW", "u8"); + write_table!(2D => f, mul_table_high, "MUL_TABLE_HIGH", "u8"); + } +} + +#[cfg(all( + feature = "simd-accel", + any(target_arch = "x86_64", target_arch = "aarch64"), + not(target_env = "msvc"), + not(any(target_os = "android", target_os = "ios")) +))] +fn compile_simd_c() { + let mut build = cc::Build::new(); + build.opt_level(3); + + match env::var("RUST_REED_SOLOMON_ERASURE_ARCH") { + Ok(arch) => { + // Use explicitly specified environment variable as architecture. + build.flag(&format!("-march={}", arch)); + } + Err(_error) => { + // On x86-64 enabling Haswell architecture unlocks useful instructions and improves performance + // dramatically while allowing it to run ony modern CPU. + match env::var("CARGO_CFG_TARGET_ARCH").unwrap().as_str(){ + "x86_64" => { build.flag(&"-march=haswell"); }, + _ => () + } + } + } + + build + .flag("-std=c11") + .file("simd_c/reedsolomon.c") + .compile("reedsolomon"); +} + +#[cfg(not(all( + feature = "simd-accel", + any(target_arch = "x86_64", target_arch = "aarch64"), + not(target_env = "msvc"), + not(any(target_os = "android", target_os = "ios")) +)))] +fn compile_simd_c() {} + +fn main() { + compile_simd_c(); + write_tables(); +} diff --git a/seaweed-volume/vendor/reed-solomon-erasure/sage/galois_ext_test.sage b/seaweed-volume/vendor/reed-solomon-erasure/sage/galois_ext_test.sage new file mode 100644 index 000000000..cab1bdf10 --- /dev/null +++ b/seaweed-volume/vendor/reed-solomon-erasure/sage/galois_ext_test.sage @@ -0,0 +1,26 @@ +GF256. = FiniteField(256) +R. = GF256[x] +ext_poly = R.irreducible_element(2,algorithm="first_lexicographic" ) +ExtField. = GF256.extension(ext_poly) +print ExtField +print len(ExtField) + +x^2 + a*x + a^7 + +e1 = (a^7 + a^6 + a^4 + a)*b + a^3 + a^2 + a + 1 +e2 = (a^7 + a^5 + a^2)*b + a^7 + a^4 + a^3 + a + +print "e1: ", e1 +print "e2: ", e2 + +print "e1 + e2: ", e1 + e2 +#(a^6 + a^5 + a^4 + a^2 + a)*b + a^7 + a^4 + a^2 + 1 + +print "e1 * e2: ", e1 * e2 +#(a^4 + a^2 + a + 1)*b + a^7 + a^5 + a^3 + a + +print "e1 / e2: ", e1 / e2 +#(a^7 + a^6 + a^5 + a^4 + a^3 + a^2 + 1)*b + a^6 + a^3 + a + +print "1/b: ", 1/b +#(a^4 + a^3 + a + 1)*b + a^5 + a^4 + a^2 + a \ No newline at end of file diff --git a/seaweed-volume/vendor/reed-solomon-erasure/simd_c/reedsolomon.c b/seaweed-volume/vendor/reed-solomon-erasure/simd_c/reedsolomon.c new file mode 100644 index 000000000..12a921100 --- /dev/null +++ b/seaweed-volume/vendor/reed-solomon-erasure/simd_c/reedsolomon.c @@ -0,0 +1,574 @@ +/* reedsolomon.c - SIMD-optimized Galois-field multiplication routines + * + * Copyright (c) 2015, 2016 Nicolas Trangez + * Copyright (c) 2015 Klaus Post + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE + */ + +#if HAVE_CONFIG_H +# include "config.h" +#endif + +#include +#include + +//#if defined(__SSE2__) && __SSE2__ && defined(HAVE_EMMINTRIN_H) && HAVE_EMMINTRIN_H +//#ifdef __SSE2__ +#if defined(__SSE2__) && __SSE2__ +# define USE_SSE2 1 +# undef VECTOR_SIZE +# define VECTOR_SIZE 16 +# include +#else +# define USE_SSE2 0 +#endif + +//#if defined(__SSSE3__) && __SSSE3__ && defined(HAVE_TMMINTRIN_H) && HAVE_TMMINTRIN_H +//#ifdef __SSSE3__ +#if defined(__SSSE3__) && __SSSE3__ +# define USE_SSSE3 1 +# undef VECTOR_SIZE +# define VECTOR_SIZE 16 +# include +#else +# define USE_SSSE3 0 +#endif + +//#if defined(__AVX2__) && __AVX2__ && defined(HAVE_IMMINTRIN_H) && HAVE_IMMINTRIN_H +//#ifdef __AVX2__ +#if defined(__AVX2__) && __AVX2__ +# define USE_AVX2 1 +# undef VECTOR_SIZE +# define VECTOR_SIZE 32 +# include +#else +# define USE_AVX2 0 +#endif + + +#if defined(__AVX512F__) && __AVX512F__ +# define USE_AVX512 1 +# undef VECTOR_SIZE +# define VECTOR_SIZE 64 +# include +#else +# define USE_AVX512 0 +#endif + + +/*#if ((defined(__ARM_NEON__) && __ARM_NEON__) \ + || (defined(__ARM_NEON) && __ARM_NEON) \ + || (defined(__aarch64__) && __aarch64__)) \ + && defined(HAVE_ARM_NEON_H) && HAVE_ARM_NEON_H*/ +#if ((defined(__ARM_NEON__) && __ARM_NEON__) \ + || (defined(__ARM_NEON) && __ARM_NEON) \ + || (defined(__aarch64__) && __aarch64__)) +# define USE_ARM_NEON 1 +#undef VECTOR_SIZE +# define VECTOR_SIZE 16 +# include +#else +# define USE_ARM_NEON 0 +#endif + +//#if defined(__ALTIVEC__) && __ALTIVEC__ && defined(HAVE_ALTIVEC_H) && HAVE_ALTIVEC_H +#if defined(__ALTIVEC__) && __ALTIVEC__ +# define USE_ALTIVEC 1 +# undef VECTOR_SIZE +# define VECTOR_SIZE 16 +# include +#else +# define USE_ALTIVEC 0 +#endif + +#ifndef VECTOR_SIZE +/* 'Generic' code */ +# define VECTOR_SIZE 16 +#endif + +# define USE_ALIGNED_ACCESS 0 +# define ALIGNED_ACCESS __attribute__((unused)) +# define UNALIGNED_ACCESS + +#include "reedsolomon.h" + +#if defined(HAVE_FUNC_ATTRIBUTE_HOT) && HAVE_FUNC_ATTRIBUTE_HOT +# define HOT_FUNCTION __attribute__((hot)) +#else +# define HOT_FUNCTION +#endif + +#if defined(HAVE_FUNC_ATTRIBUTE_CONST) && HAVE_FUNC_ATTRIBUTE_CONST +# define CONST_FUNCTION __attribute__((const)) +#else +# define CONST_FUNCTION +#endif + +#if defined(HAVE_FUNC_ATTRIBUTE_ALWAYS_INLINE) && HAVE_FUNC_ATTRIBUTE_ALWAYS_INLINE +# define ALWAYS_INLINE inline __attribute__((always_inline)) +#else +# define ALWAYS_INLINE inline +#endif + +#if defined(HAVE_FUNC_ATTRIBUTE_FORCE_ALIGN_ARG_POINTER) && HAVE_FUNC_ATTRIBUTE_FORCE_ALIGN_ARG_POINTER +# define FORCE_ALIGN_ARG_POINTER __attribute__((force_align_arg_pointer)) +#else +# define FORCE_ALIGN_ARG_POINTER +#endif + +#define CONCAT_HELPER(a, b) a ## b +#define CONCAT(a, b) CONCAT_HELPER(a, b) + +typedef uint8_t v16u8v __attribute__((vector_size(16), aligned(1))); +typedef uint64_t v2u64v __attribute__((vector_size(16), aligned(1))); + +#define T(t, n) t n[VSIZE / 8 / sizeof(t)] +#define T1(t, n) t n + +#define VSIZE 128 +typedef union { + T(uint8_t, u8); + T(uint64_t, u64); +#if USE_SSE2 + T1(__m128i, m128i); +#endif +#if USE_ARM_NEON + T1(uint8x16_t, uint8x16); + T1(uint8x8x2_t, uint8x8x2); +#endif +#if USE_ALTIVEC + T1(__vector uint8_t, uint8x16); + T1(__vector uint64_t, uint64x2); +#endif + T1(v16u8v, v16u8); + T1(v2u64v, v2u64); +} v128 __attribute__((aligned(1))); +#undef VSIZE + +#define VSIZE 256 +typedef union { + T(uint8_t, u8); +#if USE_AVX2 + __m256i m256i; +#endif +} v256 __attribute__((aligned(1))); +#undef VSIZE + +#define VSIZE 512 +typedef union { + T(uint8_t, u8); +#if USE_AVX512 + __m512i m512i; +#endif +} v512 __attribute__((aligned(1))); + +#undef T +#undef T1 + +#if VECTOR_SIZE == 16 +typedef v128 v; +#elif VECTOR_SIZE == 32 +typedef v256 v; +#elif VECTOR_SIZE == 64 +typedef v512 v; +#else +# error Unsupported VECTOR_SIZE +#endif + +static ALWAYS_INLINE UNALIGNED_ACCESS v128 loadu_v128(const uint8_t *in) { +#if USE_SSE2 + const v128 result = { .m128i = _mm_loadu_si128((const __m128i *)in) }; +#else + v128 result; + memcpy(&result.u64, in, sizeof(result.u64)); +#endif + + return result; +} + +static ALWAYS_INLINE UNALIGNED_ACCESS v loadu_v(const uint8_t *in) { +#if USE_AVX512 + const v512 result = { .m512i = _mm512_loadu_si512((const __m512i *)in) }; +#elif USE_AVX2 + const v256 result = { .m256i = _mm256_loadu_si256((const __m256i *)in) }; +#else + const v128 result = loadu_v128(in); +#endif + + return result; +} + +static ALWAYS_INLINE ALIGNED_ACCESS v load_v(const uint8_t *in) { +#if USE_AVX512 + const v512 result = { .m512i = _mm512_load_si512((const __m512i *)in) }; +#elif USE_AVX2 + const v256 result = { .m256i = _mm256_load_si256((const __m256i *)in) }; +#elif USE_SSE2 + const v128 result = { .m128i = _mm_load_si128((const __m128i *)in) }; +#elif USE_ARM_NEON + const v128 result = { .uint8x16 = vld1q_u8(in) }; +#elif USE_ALTIVEC + const v128 result = { .uint8x16 = vec_ld(0, in) }; +#else + const v128 result = loadu_v128(in); +#endif + + return result; +} + +static ALWAYS_INLINE CONST_FUNCTION v set1_epi8_v(const uint8_t c) { +#if USE_AVX512 + const v512 result = { .m512i = _mm512_set1_epi8(c) }; +#elif USE_AVX2 + const v256 result = { .m256i = _mm256_set1_epi8(c) }; +#elif USE_SSE2 + const v128 result = { .m128i = _mm_set1_epi8(c) }; +#elif USE_ARM_NEON + const v128 result = { .uint8x16 = vdupq_n_u8(c) }; +#elif USE_ALTIVEC + const v128 result = { .uint8x16 = { c, c, c, c, c, c, c, c, + c, c, c, c, c, c, c, c } }; +#else + uint64_t c2 = c, + tmp = (c2 << (7 * 8)) | + (c2 << (6 * 8)) | + (c2 << (5 * 8)) | + (c2 << (4 * 8)) | + (c2 << (3 * 8)) | + (c2 << (2 * 8)) | + (c2 << (1 * 8)) | + (c2 << (0 * 8)); + const v128 result = { .u64 = { tmp, tmp } }; +#endif + + return result; +} + +static ALWAYS_INLINE CONST_FUNCTION v srli_epi64_v(const v in /*, const unsigned int n*/) { + // TODO: Hard code n to 4 to avoid build issues on M1 Macs (the + // `USE_ARM_NEON` path below) where apple clang is failing to + // recognize the constant `n`. + // + // See https://github.com/rust-rse/reed-solomon-erasure/pull/92 + // + #define n 4 +#if USE_AVX512 + const v512 result = { .m512i = _mm512_srli_epi64(in.m512i, n) }; +#elif USE_AVX2 + const v256 result = { .m256i = _mm256_srli_epi64(in.m256i, n) }; +#elif USE_SSE2 + const v128 result = { .m128i = _mm_srli_epi64(in.m128i, n) }; +#elif USE_ARM_NEON + const v128 result = { .uint8x16 = vshrq_n_u8(in.uint8x16, n) }; +#elif USE_ALTIVEC +# if RS_HAVE_VEC_VSRD + const v128 shift = { .v2u64 = { n, n } }, + result = { .uint64x2 = vec_vsrd(in.v2u64, shift.v2u64) }; +# else + const v128 result = { .v2u64 = in.v2u64 >> n }; +# endif +#else + const v128 result = { .u64 = { in.u64[0] >> n, + in.u64[1] >> n } }; +#endif + #undef n + return result; +} + +static ALWAYS_INLINE CONST_FUNCTION v and_v(const v a, const v b) { +#if USE_AVX512 + const v512 result = { .m512i = _mm512_and_si512(a.m512i, b.m512i) }; +#elif USE_AVX2 + const v256 result = { .m256i = _mm256_and_si256(a.m256i, b.m256i) }; +#elif USE_SSE2 + const v128 result = { .m128i = _mm_and_si128(a.m128i, b.m128i) }; +#elif USE_ARM_NEON + const v128 result = { .uint8x16 = vandq_u8(a.uint8x16, b.uint8x16) }; +#elif USE_ALTIVEC + const v128 result = { .uint8x16 = vec_and(a.uint8x16, b.uint8x16) }; +#else + const v128 result = { .v2u64 = a.v2u64 & b.v2u64 }; +#endif + + return result; +} + +static ALWAYS_INLINE CONST_FUNCTION v xor_v(const v a, const v b) { +#if USE_AVX512 + const v512 result = { .m512i = _mm512_xor_si512(a.m512i, b.m512i) }; +#elif USE_AVX2 + const v256 result = { .m256i = _mm256_xor_si256(a.m256i, b.m256i) }; +#elif USE_SSE2 + const v128 result = { .m128i = _mm_xor_si128(a.m128i, b.m128i) }; +#elif USE_ARM_NEON + const v128 result = { .uint8x16 = veorq_u8(a.uint8x16, b.uint8x16) }; +#elif USE_ALTIVEC + const v128 result = { .uint8x16 = vec_xor(a.uint8x16, b.uint8x16) }; +#else + const v128 result = { .v2u64 = a.v2u64 ^ b.v2u64 }; +#endif + + return result; +} + +static ALWAYS_INLINE CONST_FUNCTION v shuffle_epi8_v(const v vec, const v mask) { +#if USE_AVX512 + const v512 result = { .m512i = _mm512_shuffle_epi8(vec.m512i, mask.m512i) }; +#elif USE_AVX2 + const v256 result = { .m256i = _mm256_shuffle_epi8(vec.m256i, mask.m256i) }; +#elif USE_SSSE3 + const v128 result = { .m128i = _mm_shuffle_epi8(vec.m128i, mask.m128i) }; +#elif USE_ARM_NEON +# if defined(RS_HAVE_VQTBL1Q_U8) && RS_HAVE_VQTBL1Q_U8 + const v128 result = { .uint8x16 = vqtbl1q_u8(vec.uint8x16, mask.uint8x16) }; +# else + /* There's no NEON instruction mapping 1-to-1 to _mm_shuffle_epi8, but + * this should have the same result... + */ + const v128 result = { .uint8x16 = vcombine_u8(vtbl2_u8(vec.uint8x8x2, + vget_low_u8(mask.uint8x16)), + vtbl2_u8(vec.uint8x8x2, + vget_high_u8(mask.uint8x16))) }; + +# endif +#elif USE_ALTIVEC + const v128 zeros = set1_epi8_v(0), + result = { .uint8x16 = vec_perm(vec.uint8x16, zeros.uint8x16, mask.uint8x16) }; +#elif defined(RS_HAVE_BUILTIN_SHUFFLE) && RS_HAVE_BUILTIN_SHUFFLE + const v16u8v zeros = { 0, 0, 0, 0, 0, 0, 0, 0 + , 0, 0, 0, 0, 0, 0, 0, 0 }; + const v128 result = { .v16u8 = __builtin_shuffle(vec.v16u8, zeros, mask.v16u8) }; +#else + v128 result = { .u64 = { 0, 0 } }; + +# define DO_BYTE(i) \ + result.u8[i] = mask.u8[i] & 0x80 ? 0 : vec.u8[mask.u8[i] & 0x0F]; + + DO_BYTE( 0); DO_BYTE( 1); DO_BYTE( 2); DO_BYTE( 3); + DO_BYTE( 4); DO_BYTE( 5); DO_BYTE( 6); DO_BYTE( 7); + DO_BYTE( 8); DO_BYTE( 9); DO_BYTE(10); DO_BYTE(11); + DO_BYTE(12); DO_BYTE(13); DO_BYTE(14); DO_BYTE(15); +#endif + + return result; +} + +static ALWAYS_INLINE UNALIGNED_ACCESS void storeu_v(uint8_t *out, const v vec) { +#if USE_AVX512 + _mm512_storeu_si512((__m512i *)out, vec.m512i); +#elif USE_AVX2 + _mm256_storeu_si256((__m256i *)out, vec.m256i); +#elif USE_SSE2 + _mm_storeu_si128((__m128i *)out, vec.m128i); +#else + memcpy(out, &vec.u64, sizeof(vec.u64)); +#endif +} + +static ALWAYS_INLINE ALIGNED_ACCESS void store_v(uint8_t *out, const v vec) { +#if USE_AVX512 + _mm512_store_si512((__m512i *)out, vec.m512i); +#elif USE_AVX2 + _mm256_store_si256((__m256i *)out, vec.m256i); +#elif USE_SSE2 + _mm_store_si128((__m128i *)out, vec.m128i); +#elif USE_ARM_NEON + vst1q_u8(out, vec.uint8x16); +#elif USE_ALTIVEC + vec_st(vec.uint8x16, 0, out); +#else + storeu_v(out, vec); +#endif +} + +static ALWAYS_INLINE CONST_FUNCTION v replicate_v128_v(const v128 vec) { +#if USE_AVX512 + const v512 result = { .m512i = _mm512_broadcast_i32x4(vec.m128i) }; +#elif USE_AVX2 + const v256 result = { .m256i = _mm256_broadcastsi128_si256(vec.m128i) }; +#else + const v128 result = vec; +#endif + + return result; +} + + +//+build !noasm !appengine + +// Copyright 2015, Klaus Post, see LICENSE for details. + +// Based on http://www.snia.org/sites/default/files2/SDC2013/presentations/NewThinking/EthanMiller_Screaming_Fast_Galois_Field%20Arithmetic_SIMD%20Instructions.pdf +// and http://jerasure.org/jerasure/gf-complete/tree/master + +/* +// func galMulSSSE3Xor(low, high, in, out []byte) +TEXT ·galMulSSSE3Xor(SB), 7, $0 + MOVQ low+0(FP),SI // SI: &low + MOVQ high+24(FP),DX // DX: &high + MOVOU (SI), X6 // X6 low + MOVOU (DX), X7 // X7: high + MOVQ $15, BX // BX: low mask + MOVQ BX, X8 + PXOR X5, X5 + MOVQ in+48(FP),SI // R11: &in + MOVQ in_len+56(FP),R9 // R9: len(in) + MOVQ out+72(FP), DX // DX: &out + PSHUFB X5, X8 // X8: lomask (unpacked) + SHRQ $4, R9 // len(in) / 16 + CMPQ R9 ,$0 + JEQ done_xor +loopback_xor: + MOVOU (SI),X0 // in[x] + MOVOU (DX),X4 // out[x] + MOVOU X0, X1 // in[x] + MOVOU X6, X2 // low copy + MOVOU X7, X3 // high copy + PSRLQ $4, X1 // X1: high input + PAND X8, X0 // X0: low input + PAND X8, X1 // X0: high input + PSHUFB X0, X2 // X2: mul low part + PSHUFB X1, X3 // X3: mul high part + PXOR X2, X3 // X3: Result + PXOR X4, X3 // X3: Result xor existing out + MOVOU X3, (DX) // Store + ADDQ $16, SI // in+=16 + ADDQ $16, DX // out+=16 + SUBQ $1, R9 + JNZ loopback_xor +done_xor: + RET + +// func galMulSSSE3(low, high, in, out []byte) +TEXT ·galMulSSSE3(SB), 7, $0 + MOVQ low+0(FP),SI // SI: &low + MOVQ high+24(FP),DX // DX: &high + MOVOU (SI), X6 // X6 low + MOVOU (DX), X7 // X7: high + MOVQ $15, BX // BX: low mask + MOVQ BX, X8 + PXOR X5, X5 + MOVQ in+48(FP),SI // R11: &in + MOVQ in_len+56(FP),R9 // R9: len(in) + MOVQ out+72(FP), DX // DX: &out + PSHUFB X5, X8 // X8: lomask (unpacked) + SHRQ $4, R9 // len(in) / 16 + CMPQ R9 ,$0 + JEQ done +loopback: + MOVOU (SI),X0 // in[x] + MOVOU X0, X1 // in[x] + MOVOU X6, X2 // low copy + MOVOU X7, X3 // high copy + PSRLQ $4, X1 // X1: high input + PAND X8, X0 // X0: low input + PAND X8, X1 // X0: high input + PSHUFB X0, X2 // X2: mul low part + PSHUFB X1, X3 // X3: mul high part + PXOR X2, X3 // X3: Result + MOVOU X3, (DX) // Store + ADDQ $16, SI // in+=16 + ADDQ $16, DX // out+=16 + SUBQ $1, R9 + JNZ loopback +done: + RET +*/ + +static ALWAYS_INLINE v reedsolomon_gal_mul_v( + const v low_mask_unpacked, + const v low_vector, + const v high_vector, + + v (*modifier)(const v new, const v old), + + const v in_x, + const v old) { + const v low_input = and_v(in_x, low_mask_unpacked), + in_x_shifted = srli_epi64_v(in_x /*, 4*/), + high_input = and_v(in_x_shifted, low_mask_unpacked), + + mul_low_part = shuffle_epi8_v(low_vector, low_input), + mul_high_part = shuffle_epi8_v(high_vector, high_input), + + new = xor_v(mul_low_part, mul_high_part), + result = modifier(new, old); + + return result; +} + +static ALWAYS_INLINE PROTO_RETURN reedsolomon_gal_mul_impl( + PROTO_ARGS, + v (*modifier)(const v new, const v old)) { + const v low_mask_unpacked = set1_epi8_v(0x0f); + + const v128 low_vector128 = loadu_v128(low), + high_vector128 = loadu_v128(high); + const v low_vector = replicate_v128_v(low_vector128), + high_vector = replicate_v128_v(high_vector128); + + size_t done = 0; + +#if USE_ALIGNED_ACCESS +# define LOAD(addr) load_v(addr) +# define STORE(addr, vec) store_v(addr, vec) +#else +# define LOAD(addr) loadu_v(addr) +# define STORE(addr, vec) storeu_v(addr, vec) +#endif + +#if RS_HAVE_CLANG_LOOP_UNROLL +# pragma clang loop unroll(enable) +#endif + for(size_t x = 0; x < len / sizeof(v); x++) { + const v in_x = LOAD(&in[done]), + old = LOAD(&out[done]), + result = reedsolomon_gal_mul_v( + low_mask_unpacked, + low_vector, high_vector, + modifier, + in_x, + old); + + STORE(&out[done], result); + + done += sizeof(v); + } + + return done; +} + +static ALWAYS_INLINE CONST_FUNCTION v noop(const v new, const v old __attribute__((__unused__))) { + return new; +} + +#ifdef HOT +HOT_FUNCTION +#endif +FORCE_ALIGN_ARG_POINTER PROTO(reedsolomon_gal_mul) { + return reedsolomon_gal_mul_impl(low, high, in, out, len, noop); +} + +#ifdef HOT +HOT_FUNCTION +#endif +FORCE_ALIGN_ARG_POINTER PROTO(reedsolomon_gal_mul_xor) { + return reedsolomon_gal_mul_impl(low, high, in, out, len, xor_v); +} diff --git a/seaweed-volume/vendor/reed-solomon-erasure/simd_c/reedsolomon.h b/seaweed-volume/vendor/reed-solomon-erasure/simd_c/reedsolomon.h new file mode 100644 index 000000000..4bd9ec0e9 --- /dev/null +++ b/seaweed-volume/vendor/reed-solomon-erasure/simd_c/reedsolomon.h @@ -0,0 +1,54 @@ +/* reedsolomon.h - SIMD-optimized Galois-field multiplication routines + * + * Copyright (c) 2015, 2016 Nicolas Trangez + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE + */ + +#include + +#if HAVE_CONFIG_H +# include "config.h" +#endif + +#define PROTO_RETURN size_t +#define PROTO_ARGS \ + const uint8_t low[16], \ + const uint8_t high[16], \ + const uint8_t *restrict const in, \ + uint8_t *restrict const out, \ + const size_t len +#define PROTO(name) \ + PROTO_RETURN \ + name (PROTO_ARGS) + +PROTO(reedsolomon_gal_mul); +PROTO(reedsolomon_gal_mul_xor); + +typedef enum { + REEDSOLOMON_CPU_GENERIC = 0, + REEDSOLOMON_CPU_SSE2 = 1, + REEDSOLOMON_CPU_SSSE3 = 2, + REEDSOLOMON_CPU_AVX = 3, + REEDSOLOMON_CPU_AVX2 = 4, + REEDSOLOMON_CPU_NEON = 5, + REEDSOLOMON_CPU_ALTIVEC = 6, +} reedsolomon_cpu_support; + +reedsolomon_cpu_support reedsolomon_determine_cpu_support(void); diff --git a/seaweed-volume/vendor/reed-solomon-erasure/src/core.rs b/seaweed-volume/vendor/reed-solomon-erasure/src/core.rs new file mode 100644 index 000000000..57733f588 --- /dev/null +++ b/seaweed-volume/vendor/reed-solomon-erasure/src/core.rs @@ -0,0 +1,927 @@ +extern crate alloc; + +use alloc::sync::Arc; +use alloc::vec; +use alloc::vec::Vec; +use core::num::NonZeroUsize; + +use smallvec::SmallVec; + +use crate::errors::Error; +use crate::errors::SBSError; + +use crate::matrix::Matrix; + +use lru::LruCache; + +#[cfg(feature = "std")] +use parking_lot::Mutex; +#[cfg(not(feature = "std"))] +use spin::Mutex; + +use super::Field; +use super::ReconstructShard; + +const DATA_DECODE_MATRIX_CACHE_CAPACITY: usize = 254; + +// /// Parameters for parallelism. +// #[derive(PartialEq, Debug, Clone, Copy)] +// pub struct ParallelParam { +// /// Number of bytes to split the slices into for computations +// /// which can be done in parallel. +// /// +// /// Default is 32768. +// pub bytes_per_encode: usize, +// } + +// impl ParallelParam { +// /// Create a new `ParallelParam` with the given split arity. +// pub fn new(bytes_per_encode: usize) -> ParallelParam { +// ParallelParam { bytes_per_encode } +// } +// } + +// impl Default for ParallelParam { +// fn default() -> Self { +// ParallelParam::new(32768) +// } +// } + +/// Bookkeeper for shard by shard encoding. +/// +/// This is useful for avoiding incorrect use of +/// `encode_single` and `encode_single_sep` +/// +/// # Use cases +/// +/// Shard by shard encoding is useful for streamed data encoding +/// where you do not have all the needed data shards immediately, +/// but you want to spread out the encoding workload rather than +/// doing the encoding after everything is ready. +/// +/// A concrete example would be network packets encoding, +/// where encoding packet by packet as you receive them may be more efficient +/// than waiting for N packets then encode them all at once. +/// +/// # Example +/// +/// ``` +/// # #[macro_use] extern crate reed_solomon_erasure; +/// # use reed_solomon_erasure::*; +/// # fn main () { +/// use reed_solomon_erasure::galois_8::Field; +/// let r: ReedSolomon = ReedSolomon::new(3, 2).unwrap(); +/// +/// let mut sbs = ShardByShard::new(&r); +/// +/// let mut shards = shards!([0u8, 1, 2, 3, 4], +/// [5, 6, 7, 8, 9], +/// // say we don't have the 3rd data shard yet +/// // and we want to fill it in later +/// [0, 0, 0, 0, 0], +/// [0, 0, 0, 0, 0], +/// [0, 0, 0, 0, 0]); +/// +/// // encode 1st and 2nd data shard +/// sbs.encode(&mut shards).unwrap(); +/// sbs.encode(&mut shards).unwrap(); +/// +/// // fill in 3rd data shard +/// shards[2][0] = 10.into(); +/// shards[2][1] = 11.into(); +/// shards[2][2] = 12.into(); +/// shards[2][3] = 13.into(); +/// shards[2][4] = 14.into(); +/// +/// // now do the encoding +/// sbs.encode(&mut shards).unwrap(); +/// +/// assert!(r.verify(&shards).unwrap()); +/// # } +/// ``` +#[derive(PartialEq, Debug)] +pub struct ShardByShard<'a, F: 'a + Field> { + codec: &'a ReedSolomon, + cur_input: usize, +} + +impl<'a, F: 'a + Field> ShardByShard<'a, F> { + /// Creates a new instance of the bookkeeping struct. + pub fn new(codec: &'a ReedSolomon) -> ShardByShard<'a, F> { + ShardByShard { + codec, + cur_input: 0, + } + } + + /// Checks if the parity shards are ready to use. + pub fn parity_ready(&self) -> bool { + self.cur_input == self.codec.data_shard_count + } + + /// Resets the bookkeeping data. + /// + /// You should call this when you have added and encoded + /// all data shards, and have finished using the parity shards. + /// + /// Returns `SBSError::LeftoverShards` when there are shards encoded + /// but parity shards are not ready to use. + pub fn reset(&mut self) -> Result<(), SBSError> { + if self.cur_input > 0 && !self.parity_ready() { + return Err(SBSError::LeftoverShards); + } + + self.cur_input = 0; + + Ok(()) + } + + /// Resets the bookkeeping data without checking. + pub fn reset_force(&mut self) { + self.cur_input = 0; + } + + /// Returns the current input shard index. + pub fn cur_input_index(&self) -> usize { + self.cur_input + } + + fn return_ok_and_incre_cur_input(&mut self) -> Result<(), SBSError> { + self.cur_input += 1; + Ok(()) + } + + fn sbs_encode_checks + AsMut<[F::Elem]>>( + &mut self, + slices: &mut [U], + ) -> Result<(), SBSError> { + let internal_checks = |codec: &ReedSolomon, data: &mut [U]| { + check_piece_count!(all => codec, data); + check_slices!(multi => data); + + Ok(()) + }; + + if self.parity_ready() { + return Err(SBSError::TooManyCalls); + } + + match internal_checks(self.codec, slices) { + Ok(()) => Ok(()), + Err(e) => Err(SBSError::RSError(e)), + } + } + + fn sbs_encode_sep_checks, U: AsRef<[F::Elem]> + AsMut<[F::Elem]>>( + &mut self, + data: &[T], + parity: &mut [U], + ) -> Result<(), SBSError> { + let internal_checks = |codec: &ReedSolomon, data: &[T], parity: &mut [U]| { + check_piece_count!(data => codec, data); + check_piece_count!(parity => codec, parity); + check_slices!(multi => data, multi => parity); + + Ok(()) + }; + + if self.parity_ready() { + return Err(SBSError::TooManyCalls); + } + + match internal_checks(self.codec, data, parity) { + Ok(()) => Ok(()), + Err(e) => Err(SBSError::RSError(e)), + } + } + + /// Constructs the parity shards partially using the current input data shard. + /// + /// Returns `SBSError::TooManyCalls` when all input data shards + /// have already been filled in via `encode` + pub fn encode(&mut self, mut shards: T) -> Result<(), SBSError> + where + T: AsRef<[U]> + AsMut<[U]>, + U: AsRef<[F::Elem]> + AsMut<[F::Elem]>, + { + let shards = shards.as_mut(); + self.sbs_encode_checks(shards)?; + + self.codec.encode_single(self.cur_input, shards).unwrap(); + + self.return_ok_and_incre_cur_input() + } + + /// Constructs the parity shards partially using the current input data shard. + /// + /// Returns `SBSError::TooManyCalls` when all input data shards + /// have already been filled in via `encode` + pub fn encode_sep, U: AsRef<[F::Elem]> + AsMut<[F::Elem]>>( + &mut self, + data: &[T], + parity: &mut [U], + ) -> Result<(), SBSError> { + self.sbs_encode_sep_checks(data, parity)?; + + self.codec + .encode_single_sep(self.cur_input, data[self.cur_input].as_ref(), parity) + .unwrap(); + + self.return_ok_and_incre_cur_input() + } +} + +/// Reed-Solomon erasure code encoder/decoder. +/// +/// # Common error handling +/// +/// ## For `encode`, `encode_shards`, `verify`, `verify_shards`, `reconstruct`, `reconstruct_data`, `reconstruct_shards`, `reconstruct_data_shards` +/// +/// Return `Error::TooFewShards` or `Error::TooManyShards` +/// when the number of provided shards +/// does not match the codec's one. +/// +/// Return `Error::EmptyShard` when the first shard provided is +/// of zero length. +/// +/// Return `Error::IncorrectShardSize` when the provided shards +/// are of different lengths. +/// +/// ## For `reconstruct`, `reconstruct_data`, `reconstruct_shards`, `reconstruct_data_shards` +/// +/// Return `Error::TooFewShardsPresent` when there are not +/// enough shards for reconstruction. +/// +/// Return `Error::InvalidShardFlags` when the number of flags does not match +/// the total number of shards. +/// +/// # Variants of encoding methods +/// +/// ## `sep` +/// +/// Methods ending in `_sep` takes an immutable reference to data shards, +/// and a mutable reference to parity shards. +/// +/// They are useful as they do not need to borrow the data shards mutably, +/// and other work that only needs read-only access to data shards can be done +/// in parallel/concurrently during the encoding. +/// +/// Following is a table of all the `sep` variants +/// +/// | not `sep` | `sep` | +/// | --- | --- | +/// | `encode_single` | `encode_single_sep` | +/// | `encode` | `encode_sep` | +/// +/// The `sep` variants do similar checks on the provided data shards and +/// parity shards. +/// +/// Return `Error::TooFewDataShards`, `Error::TooManyDataShards`, +/// `Error::TooFewParityShards`, or `Error::TooManyParityShards` when applicable. +/// +/// ## `single` +/// +/// Methods containing `single` facilitate shard by shard encoding, where +/// the parity shards are partially constructed using one data shard at a time. +/// See `ShardByShard` struct for more details on how shard by shard encoding +/// can be useful. +/// +/// They are prone to **misuse**, and it is recommended to use the `ShardByShard` +/// bookkeeping struct instead for shard by shard encoding. +/// +/// The ones that are also `sep` are **ESPECIALLY** prone to **misuse**. +/// Only use them when you actually need the flexibility. +/// +/// Following is a table of all the shard by shard variants +/// +/// | all shards at once | shard by shard | +/// | --- | --- | +/// | `encode` | `encode_single` | +/// | `encode_sep` | `encode_single_sep` | +/// +/// The `single` variants do similar checks on the provided data shards and parity shards, +/// and also do index check on `i_data`. +/// +/// Return `Error::InvalidIndex` if `i_data >= data_shard_count`. +/// +/// # Encoding behaviour +/// ## For `encode` +/// +/// You do not need to clear the parity shards beforehand, as the methods +/// will overwrite them completely. +/// +/// ## For `encode_single`, `encode_single_sep` +/// +/// Calling them with `i_data` being `0` will overwrite the parity shards +/// completely. If you are using the methods correctly, then you do not need +/// to clear the parity shards beforehand. +/// +/// # Variants of verifying methods +/// +/// `verify` allocate sa buffer on the heap of the same size +/// as the parity shards, and encode the input once using the buffer to store +/// the computed parity shards, then check if the provided parity shards +/// match the computed ones. +/// +/// `verify_with_buffer`, allows you to provide +/// the buffer to avoid making heap allocation(s) for the buffer in every call. +/// +/// The `with_buffer` variants also guarantee that the buffer contains the correct +/// parity shards if the result is `Ok(_)` (i.e. it does not matter whether the +/// verification passed or not, as long as the result is not an error, the buffer +/// will contain the correct parity shards after the call). +/// +/// Following is a table of all the `with_buffer` variants +/// +/// | not `with_buffer` | `with_buffer` | +/// | --- | --- | +/// | `verify` | `verify_with_buffer` | +/// +/// The `with_buffer` variants also check the dimensions of the buffer and return +/// `Error::TooFewBufferShards`, `Error::TooManyBufferShards`, `Error::EmptyShard`, +/// or `Error::IncorrectShardSize` when applicable. +/// +#[derive(Debug)] +pub struct ReedSolomon { + data_shard_count: usize, + parity_shard_count: usize, + total_shard_count: usize, + matrix: Matrix, + data_decode_matrix_cache: Mutex, Arc>>>, +} + +impl Clone for ReedSolomon { + fn clone(&self) -> ReedSolomon { + ReedSolomon::new(self.data_shard_count, self.parity_shard_count) + .expect("basic checks already passed as precondition of existence of self") + } +} + +impl PartialEq for ReedSolomon { + fn eq(&self, rhs: &ReedSolomon) -> bool { + self.data_shard_count == rhs.data_shard_count + && self.parity_shard_count == rhs.parity_shard_count + } +} + +impl ReedSolomon { + // AUDIT + // + // Error detection responsibilities + // + // Terminologies and symbols: + // X =A, B, C=> Y: X delegates error checking responsibilities A, B, C to Y + // X:= A, B, C: X needs to handle responsibilities A, B, C + // + // Encode methods + // + // `encode_single`:= + // - check index `i_data` within range [0, data shard count) + // - check length of `slices` matches total shard count exactly + // - check consistency of length of individual slices + // `encode_single_sep`:= + // - check index `i_data` within range [0, data shard count) + // - check length of `parity` matches parity shard count exactly + // - check consistency of length of individual parity slices + // - check length of `single_data` matches length of first parity slice + // `encode`:= + // - check length of `slices` matches total shard count exactly + // - check consistency of length of individual slices + // `encode_sep`:= + // - check length of `data` matches data shard count exactly + // - check length of `parity` matches parity shard count exactly + // - check consistency of length of individual data slices + // - check consistency of length of individual parity slices + // - check length of first parity slice matches length of first data slice + // + // Verify methods + // + // `verify`:= + // - check length of `slices` matches total shard count exactly + // - check consistency of length of individual slices + // + // Generates buffer then passes control to verify_with_buffer + // + // `verify_with_buffer`:= + // - check length of `slices` matches total shard count exactly + // - check length of `buffer` matches parity shard count exactly + // - check consistency of length of individual slices + // - check consistency of length of individual slices in buffer + // - check length of first slice in buffer matches length of first slice + // + // Reconstruct methods + // + // `reconstruct` =ALL=> `reconstruct_internal` + // `reconstruct_data`=ALL=> `reconstruct_internal` + // `reconstruct_internal`:= + // - check length of `slices` matches total shard count exactly + // - check consistency of length of individual slices + // - check length of `slice_present` matches length of `slices` + + fn get_parity_rows(&self) -> SmallVec<[&[F::Elem]; 32]> { + let mut parity_rows = SmallVec::with_capacity(self.parity_shard_count); + let matrix = &self.matrix; + for i in self.data_shard_count..self.total_shard_count { + parity_rows.push(matrix.get_row(i)); + } + + parity_rows + } + + fn build_matrix(data_shards: usize, total_shards: usize) -> Matrix { + let vandermonde = Matrix::vandermonde(total_shards, data_shards); + + let top = vandermonde.sub_matrix(0, 0, data_shards, data_shards); + + vandermonde.multiply(&top.invert().unwrap()) + } + + /// Creates a new instance of Reed-Solomon erasure code encoder/decoder. + /// + /// Returns `Error::TooFewDataShards` if `data_shards == 0`. + /// + /// Returns `Error::TooFewParityShards` if `parity_shards == 0`. + /// + /// Returns `Error::TooManyShards` if `data_shards + parity_shards > F::ORDER`. + pub fn new(data_shards: usize, parity_shards: usize) -> Result, Error> { + if data_shards == 0 { + return Err(Error::TooFewDataShards); + } + if parity_shards == 0 { + return Err(Error::TooFewParityShards); + } + if data_shards + parity_shards > F::ORDER { + return Err(Error::TooManyShards); + } + + let total_shards = data_shards + parity_shards; + + let matrix = Self::build_matrix(data_shards, total_shards); + + Ok(ReedSolomon { + data_shard_count: data_shards, + parity_shard_count: parity_shards, + total_shard_count: total_shards, + matrix, + data_decode_matrix_cache: Mutex::new(LruCache::new( + NonZeroUsize::new(DATA_DECODE_MATRIX_CACHE_CAPACITY).unwrap(), + )), + }) + } + + pub fn data_shard_count(&self) -> usize { + self.data_shard_count + } + + pub fn parity_shard_count(&self) -> usize { + self.parity_shard_count + } + + pub fn total_shard_count(&self) -> usize { + self.total_shard_count + } + + fn code_some_slices, U: AsMut<[F::Elem]>>( + &self, + matrix_rows: &[&[F::Elem]], + inputs: &[T], + outputs: &mut [U], + ) { + for i_input in 0..self.data_shard_count { + self.code_single_slice(matrix_rows, i_input, inputs[i_input].as_ref(), outputs); + } + } + + fn code_single_slice>( + &self, + matrix_rows: &[&[F::Elem]], + i_input: usize, + input: &[F::Elem], + outputs: &mut [U], + ) { + outputs.iter_mut().enumerate().for_each(|(i_row, output)| { + let matrix_row_to_use = matrix_rows[i_row][i_input]; + let output = output.as_mut(); + + if i_input == 0 { + F::mul_slice(matrix_row_to_use, input, output); + } else { + F::mul_slice_add(matrix_row_to_use, input, output); + } + }) + } + + fn check_some_slices_with_buffer( + &self, + matrix_rows: &[&[F::Elem]], + inputs: &[T], + to_check: &[T], + buffer: &mut [U], + ) -> bool + where + T: AsRef<[F::Elem]>, + U: AsRef<[F::Elem]> + AsMut<[F::Elem]>, + { + self.code_some_slices(matrix_rows, inputs, buffer); + + let at_least_one_mismatch_present = buffer + .iter_mut() + .enumerate() + .map(|(i, expected_parity_shard)| { + expected_parity_shard.as_ref() == to_check[i].as_ref() + }) + .any(|x| !x); // find the first false (some slice is different from the expected one) + !at_least_one_mismatch_present + } + + /// Constructs the parity shards partially using only the data shard + /// indexed by `i_data`. + /// + /// The slots where the parity shards sit at will be overwritten. + /// + /// # Warning + /// + /// You must apply this method on the data shards in strict sequential order (0..data shard count), + /// otherwise the parity shards will be incorrect. + /// + /// It is recommended to use the `ShardByShard` bookkeeping struct instead of this method directly. + pub fn encode_single(&self, i_data: usize, mut shards: T) -> Result<(), Error> + where + T: AsRef<[U]> + AsMut<[U]>, + U: AsRef<[F::Elem]> + AsMut<[F::Elem]>, + { + let slices = shards.as_mut(); + + check_slice_index!(data => self, i_data); + check_piece_count!(all=> self, slices); + check_slices!(multi => slices); + + // Get the slice of output buffers. + let (mut_input, output) = slices.split_at_mut(self.data_shard_count); + + let input = mut_input[i_data].as_ref(); + + self.encode_single_sep(i_data, input, output) + } + + /// Constructs the parity shards partially using only the data shard provided. + /// + /// The data shard must match the index `i_data`. + /// + /// The slots where the parity shards sit at will be overwritten. + /// + /// # Warning + /// + /// You must apply this method on the data shards in strict sequential order (0..data shard count), + /// otherwise the parity shards will be incorrect. + /// + /// It is recommended to use the `ShardByShard` bookkeeping struct instead of this method directly. + pub fn encode_single_sep + AsMut<[F::Elem]>>( + &self, + i_data: usize, + single_data: &[F::Elem], + parity: &mut [U], + ) -> Result<(), Error> { + check_slice_index!(data => self, i_data); + check_piece_count!(parity => self, parity); + check_slices!(multi => parity, single => single_data); + + let parity_rows = self.get_parity_rows(); + + // Do the coding. + self.code_single_slice(&parity_rows, i_data, single_data, parity); + + Ok(()) + } + + /// Constructs the parity shards. + /// + /// The slots where the parity shards sit at will be overwritten. + pub fn encode(&self, mut shards: T) -> Result<(), Error> + where + T: AsRef<[U]> + AsMut<[U]>, + U: AsRef<[F::Elem]> + AsMut<[F::Elem]>, + { + let slices: &mut [U] = shards.as_mut(); + + check_piece_count!(all => self, slices); + check_slices!(multi => slices); + + // Get the slice of output buffers. + let (input, output) = slices.split_at_mut(self.data_shard_count); + + self.encode_sep(&*input, output) + } + + /// Constructs the parity shards using a read-only view into the + /// data shards. + /// + /// The slots where the parity shards sit at will be overwritten. + pub fn encode_sep, U: AsRef<[F::Elem]> + AsMut<[F::Elem]>>( + &self, + data: &[T], + parity: &mut [U], + ) -> Result<(), Error> { + check_piece_count!(data => self, data); + check_piece_count!(parity => self, parity); + check_slices!(multi => data, multi => parity); + + let parity_rows = self.get_parity_rows(); + + // Do the coding. + self.code_some_slices(&parity_rows, data, parity); + + Ok(()) + } + + /// Checks if the parity shards are correct. + /// + /// This is a wrapper of `verify_with_buffer`. + pub fn verify>(&self, slices: &[T]) -> Result { + check_piece_count!(all => self, slices); + check_slices!(multi => slices); + + let slice_len = slices[0].as_ref().len(); + + let mut buffer: SmallVec<[Vec; 32]> = + SmallVec::with_capacity(self.parity_shard_count); + + for _ in 0..self.parity_shard_count { + buffer.push(vec![F::zero(); slice_len]); + } + + self.verify_with_buffer(slices, &mut buffer) + } + + /// Checks if the parity shards are correct. + pub fn verify_with_buffer(&self, slices: &[T], buffer: &mut [U]) -> Result + where + T: AsRef<[F::Elem]>, + U: AsRef<[F::Elem]> + AsMut<[F::Elem]>, + { + check_piece_count!(all => self, slices); + check_piece_count!(parity_buf => self, buffer); + check_slices!(multi => slices, multi => buffer); + + let data = &slices[0..self.data_shard_count]; + let to_check = &slices[self.data_shard_count..]; + + let parity_rows = self.get_parity_rows(); + + Ok(self.check_some_slices_with_buffer(&parity_rows, data, to_check, buffer)) + } + + /// Reconstructs all shards. + /// + /// The shards marked not present are only overwritten when no error + /// is detected. All provided shards must have the same length. + /// + /// This means if the method returns an `Error`, then nothing is touched. + /// + /// `reconstruct`, `reconstruct_data`, `reconstruct_shards`, + /// `reconstruct_data_shards` share the same core code base. + pub fn reconstruct>(&self, slices: &mut [T]) -> Result<(), Error> { + self.reconstruct_internal(slices, false) + } + + /// Reconstructs only the data shards. + /// + /// The shards marked not present are only overwritten when no error + /// is detected. All provided shards must have the same length. + /// + /// This means if the method returns an `Error`, then nothing is touched. + /// + /// `reconstruct`, `reconstruct_data`, `reconstruct_shards`, + /// `reconstruct_data_shards` share the same core code base. + pub fn reconstruct_data>(&self, slices: &mut [T]) -> Result<(), Error> { + self.reconstruct_internal(slices, true) + } + + fn get_data_decode_matrix( + &self, + valid_indices: &[usize], + invalid_indices: &[usize], + ) -> Arc> { + { + let mut cache = self.data_decode_matrix_cache.lock(); + if let Some(entry) = cache.get(invalid_indices) { + return entry.clone(); + } + } + // Pull out the rows of the matrix that correspond to the shards that + // we have and build a square matrix. This matrix could be used to + // generate the shards that we have from the original data. + let mut sub_matrix = Matrix::new(self.data_shard_count, self.data_shard_count); + for (sub_matrix_row, &valid_index) in valid_indices.iter().enumerate() { + for c in 0..self.data_shard_count { + sub_matrix.set(sub_matrix_row, c, self.matrix.get(valid_index, c)); + } + } + // Invert the matrix, so we can go from the encoded shards back to the + // original data. Then pull out the row that generates the shard that + // we want to decode. Note that since this matrix maps back to the + // original data, it can be used to create a data shard, but not a + // parity shard. + let data_decode_matrix = Arc::new(sub_matrix.invert().unwrap()); + // Cache the inverted matrix for future use keyed on the indices of the + // invalid rows. + { + let data_decode_matrix = data_decode_matrix.clone(); + let mut cache = self.data_decode_matrix_cache.lock(); + cache.put(Vec::from(invalid_indices), data_decode_matrix); + } + data_decode_matrix + } + + fn reconstruct_internal>( + &self, + shards: &mut [T], + data_only: bool, + ) -> Result<(), Error> { + check_piece_count!(all => self, shards); + + let data_shard_count = self.data_shard_count; + + // Quick check: are all of the shards present? If so, there's + // nothing to do. + let mut number_present = 0; + let mut shard_len = None; + + for shard in shards.iter_mut() { + if let Some(len) = shard.len() { + if len == 0 { + return Err(Error::EmptyShard); + } + number_present += 1; + if let Some(old_len) = shard_len { + if len != old_len { + // mismatch between shards. + return Err(Error::IncorrectShardSize); + } + } + shard_len = Some(len); + } + } + + if number_present == self.total_shard_count { + // Cool. All of the shards are there. We don't + // need to do anything. + return Ok(()); + } + + // More complete sanity check + if number_present < data_shard_count { + return Err(Error::TooFewShardsPresent); + } + + let shard_len = shard_len.expect("at least one shard present; qed"); + + // Pull out an array holding just the shards that + // correspond to the rows of the submatrix. These shards + // will be the input to the decoding process that re-creates + // the missing data shards. + // + // Also, create an array of indices of the valid rows we do have + // and the invalid rows we don't have. + // + // The valid indices are used to construct the data decode matrix, + // the invalid indices are used to key the data decode matrix + // in the data decode matrix cache. + // + // We only need exactly N valid indices, where N = `data_shard_count`, + // as the data decode matrix is a N x N matrix, thus only needs + // N valid indices for determining the N rows to pick from + // `self.matrix`. + let mut sub_shards: SmallVec<[&[F::Elem]; 32]> = SmallVec::with_capacity(data_shard_count); + let mut missing_data_slices: SmallVec<[&mut [F::Elem]; 32]> = + SmallVec::with_capacity(self.parity_shard_count); + let mut missing_parity_slices: SmallVec<[&mut [F::Elem]; 32]> = + SmallVec::with_capacity(self.parity_shard_count); + let mut valid_indices: SmallVec<[usize; 32]> = SmallVec::with_capacity(data_shard_count); + let mut invalid_indices: SmallVec<[usize; 32]> = SmallVec::with_capacity(data_shard_count); + + // Separate the shards into groups + for (matrix_row, shard) in shards.iter_mut().enumerate() { + // get or initialize the shard so we can reconstruct in-place, + // but if we are only reconstructing data shard, + // do not initialize if the shard is not a data shard + let shard_data = if matrix_row >= data_shard_count && data_only { + shard.get().ok_or(None) + } else { + shard.get_or_initialize(shard_len).map_err(Some) + }; + + match shard_data { + Ok(shard) => { + if sub_shards.len() < data_shard_count { + sub_shards.push(shard); + valid_indices.push(matrix_row); + } else { + // Already have enough shards in `sub_shards` + // as we only need N shards, where N = `data_shard_count`, + // for the data decode matrix + // + // So nothing to do here + } + } + Err(None) => { + // the shard data is not meant to be initialized here, + // but we should still note it missing. + invalid_indices.push(matrix_row); + } + Err(Some(x)) => { + // initialized missing shard data. + let shard = x?; + if matrix_row < data_shard_count { + missing_data_slices.push(shard); + } else { + missing_parity_slices.push(shard); + } + + invalid_indices.push(matrix_row); + } + } + } + + let data_decode_matrix = self.get_data_decode_matrix(&valid_indices, &invalid_indices); + + // Re-create any data shards that were missing. + // + // The input to the coding is all of the shards we actually + // have, and the output is the missing data shards. The computation + // is done using the special decode matrix we just built. + let mut matrix_rows: SmallVec<[&[F::Elem]; 32]> = + SmallVec::with_capacity(self.parity_shard_count); + + for i_slice in invalid_indices + .iter() + .cloned() + .take_while(|i| i < &data_shard_count) + { + matrix_rows.push(data_decode_matrix.get_row(i_slice)); + } + + self.code_some_slices(&matrix_rows, &sub_shards, &mut missing_data_slices); + + if data_only { + Ok(()) + } else { + // Now that we have all of the data shards intact, we can + // compute any of the parity that is missing. + // + // The input to the coding is ALL of the data shards, including + // any that we just calculated. The output is whichever of the + // parity shards were missing. + let mut matrix_rows: SmallVec<[&[F::Elem]; 32]> = + SmallVec::with_capacity(self.parity_shard_count); + let parity_rows = self.get_parity_rows(); + + for i_slice in invalid_indices + .iter() + .cloned() + .skip_while(|i| i < &data_shard_count) + { + matrix_rows.push(parity_rows[i_slice - data_shard_count]); + } + { + // Gather up all the data shards. + // old data shards are in `sub_shards`, + // new ones are in `missing_data_slices`. + let mut i_old_data_slice = 0; + let mut i_new_data_slice = 0; + + let mut all_data_slices: SmallVec<[&[F::Elem]; 32]> = + SmallVec::with_capacity(data_shard_count); + + let mut next_maybe_good = 0; + let mut push_good_up_to = move |data_slices: &mut SmallVec<_>, up_to| { + // if next_maybe_good == up_to, this loop is a no-op. + for _ in next_maybe_good..up_to { + // push all good indices we just skipped. + data_slices.push(sub_shards[i_old_data_slice]); + i_old_data_slice += 1; + } + + next_maybe_good = up_to + 1; + }; + + for i_slice in invalid_indices + .iter() + .cloned() + .take_while(|i| i < &data_shard_count) + { + push_good_up_to(&mut all_data_slices, i_slice); + all_data_slices.push(missing_data_slices[i_new_data_slice]); + i_new_data_slice += 1; + } + push_good_up_to(&mut all_data_slices, data_shard_count); + + // Now do the actual computation for the missing + // parity shards + self.code_some_slices(&matrix_rows, &all_data_slices, &mut missing_parity_slices); + } + + Ok(()) + } + } +} diff --git a/seaweed-volume/vendor/reed-solomon-erasure/src/errors.rs b/seaweed-volume/vendor/reed-solomon-erasure/src/errors.rs new file mode 100644 index 000000000..761343685 --- /dev/null +++ b/seaweed-volume/vendor/reed-solomon-erasure/src/errors.rs @@ -0,0 +1,158 @@ +use core::fmt::Formatter; + +#[derive(PartialEq, Debug, Clone, Copy)] +pub enum Error { + TooFewShards, + TooManyShards, + TooFewDataShards, + TooManyDataShards, + TooFewParityShards, + TooManyParityShards, + TooFewBufferShards, + TooManyBufferShards, + IncorrectShardSize, + TooFewShardsPresent, + EmptyShard, + InvalidShardFlags, + InvalidIndex, +} + +impl Error { + fn to_string(&self) -> &str { + match *self { + Error::TooFewShards=> "The number of provided shards is smaller than the one in codec", + Error::TooManyShards => "The number of provided shards is greater than the one in codec", + Error::TooFewDataShards => "The number of provided data shards is smaller than the one in codec", + Error::TooManyDataShards => "The number of provided data shards is greater than the one in codec", + Error::TooFewParityShards => "The number of provided parity shards is smaller than the one in codec", + Error::TooManyParityShards => "The number of provided parity shards is greater than the one in codec", + Error::TooFewBufferShards => "The number of provided buffer shards is smaller than the number of parity shards in codec", + Error::TooManyBufferShards => "The number of provided buffer shards is greater than the number of parity shards in codec", + Error::IncorrectShardSize => "At least one of the provided shards is not of the correct size", + Error::TooFewShardsPresent => "The number of shards present is smaller than number of parity shards, cannot reconstruct missing shards", + Error::EmptyShard => "The first shard provided is of zero length", + Error::InvalidShardFlags => "The number of flags does not match the total number of shards", + Error::InvalidIndex => "The data shard index provided is greater or equal to the number of data shards in codec", + } + } +} + +impl core::fmt::Display for Error { + fn fmt(&self, f: &mut Formatter) -> Result<(), core::fmt::Error> { + write!(f, "{}", self.to_string()) + } +} + +#[cfg(feature = "std")] +impl std::error::Error for Error { + fn description(&self) -> &str { + self.to_string() + } +} + +#[derive(PartialEq, Debug, Clone, Copy)] +pub enum SBSError { + TooManyCalls, + LeftoverShards, + RSError(Error), +} + +impl SBSError { + fn to_string(&self) -> &str { + match *self { + SBSError::TooManyCalls => "Too many calls", + SBSError::LeftoverShards => "Leftover shards", + SBSError::RSError(ref e) => e.to_string(), + } + } +} + +impl core::fmt::Display for SBSError { + fn fmt(&self, f: &mut Formatter) -> Result<(), core::fmt::Error> { + write!(f, "{}", self.to_string()) + } +} + +#[cfg(feature = "std")] +impl std::error::Error for SBSError { + fn description(&self) -> &str { + self.to_string() + } +} + +#[cfg(test)] +mod tests { + use crate::errors::Error; + use crate::errors::SBSError; + + #[test] + fn test_error_to_string_is_okay() { + assert_eq!( + Error::TooFewShards.to_string(), + "The number of provided shards is smaller than the one in codec" + ); + assert_eq!( + Error::TooManyShards.to_string(), + "The number of provided shards is greater than the one in codec" + ); + assert_eq!( + Error::TooFewDataShards.to_string(), + "The number of provided data shards is smaller than the one in codec" + ); + assert_eq!( + Error::TooManyDataShards.to_string(), + "The number of provided data shards is greater than the one in codec" + ); + assert_eq!( + Error::TooFewParityShards.to_string(), + "The number of provided parity shards is smaller than the one in codec" + ); + assert_eq!( + Error::TooManyParityShards.to_string(), + "The number of provided parity shards is greater than the one in codec" + ); + assert_eq!( + Error::TooFewBufferShards.to_string(), + "The number of provided buffer shards is smaller than the number of parity shards in codec" + ); + assert_eq!( + Error::TooManyBufferShards.to_string(), + "The number of provided buffer shards is greater than the number of parity shards in codec" + ); + assert_eq!( + Error::IncorrectShardSize.to_string(), + "At least one of the provided shards is not of the correct size" + ); + assert_eq!(Error::TooFewShardsPresent.to_string(), "The number of shards present is smaller than number of parity shards, cannot reconstruct missing shards"); + assert_eq!( + Error::EmptyShard.to_string(), + "The first shard provided is of zero length" + ); + assert_eq!( + Error::InvalidShardFlags.to_string(), + "The number of flags does not match the total number of shards" + ); + assert_eq!( + Error::InvalidIndex.to_string(), + "The data shard index provided is greater or equal to the number of data shards in codec" + ); + } + + #[test] + fn test_sbserror_to_string_is_okay() { + assert_eq!(SBSError::TooManyCalls.to_string(), "Too many calls"); + assert_eq!(SBSError::LeftoverShards.to_string(), "Leftover shards"); + } + + #[cfg(feature = "std")] + #[test] + fn test_error_display_does_not_panic() { + println!("{}", Error::TooFewShards); + } + + #[cfg(feature = "std")] + #[test] + fn test_sbserror_display_does_not_panic() { + println!("{}", SBSError::TooManyCalls); + } +} diff --git a/seaweed-volume/vendor/reed-solomon-erasure/src/galois_16.rs b/seaweed-volume/vendor/reed-solomon-erasure/src/galois_16.rs new file mode 100644 index 000000000..500ac8d2a --- /dev/null +++ b/seaweed-volume/vendor/reed-solomon-erasure/src/galois_16.rs @@ -0,0 +1,412 @@ +//! GF(2^16) implementation. +//! +//! More accurately, this is a `GF((2^8)^2)` implementation which builds an extension +//! field of `GF(2^8)`, as defined in the `galois_8` module. + +use crate::galois_8; +use core::ops::{Add, Div, Mul, Sub}; + +// the irreducible polynomial used as a modulus for the field. +// print R.irreducible_element(2,algorithm="first_lexicographic" ) +// x^2 + a*x + a^7 +// +// hopefully it is a fast polynomial +const EXT_POLY: [u8; 3] = [1, 2, 128]; + +/// The field GF(2^16). +#[derive(Debug, Default, Copy, Clone, PartialEq, Eq)] +pub struct Field; + +impl crate::Field for Field { + const ORDER: usize = 65536; + + type Elem = [u8; 2]; + + fn add(a: [u8; 2], b: [u8; 2]) -> [u8; 2] { + (Element(a) + Element(b)).0 + } + + fn mul(a: [u8; 2], b: [u8; 2]) -> [u8; 2] { + (Element(a) * Element(b)).0 + } + + fn div(a: [u8; 2], b: [u8; 2]) -> [u8; 2] { + (Element(a) / Element(b)).0 + } + + fn exp(elem: [u8; 2], n: usize) -> [u8; 2] { + Element(elem).exp(n).0 + } + + fn zero() -> [u8; 2] { + [0; 2] + } + + fn one() -> [u8; 2] { + [0, 1] + } + + fn nth_internal(n: usize) -> [u8; 2] { + [(n >> 8) as u8, n as u8] + } +} + +/// Type alias of ReedSolomon over GF(2^8). +pub type ReedSolomon = crate::ReedSolomon; + +/// Type alias of ShardByShard over GF(2^8). +pub type ShardByShard<'a> = crate::ShardByShard<'a, Field>; + +/// An element of `GF(2^16)`. +#[derive(Debug, Copy, Clone, PartialEq, Eq)] +struct Element(pub [u8; 2]); + +impl Element { + // Create the zero element. + fn zero() -> Self { + Element([0, 0]) + } + + // A constant element evaluating to `n`. + fn constant(n: u8) -> Element { + Element([0, n]) + } + + // Whether this is the zero element. + fn is_zero(&self) -> bool { + self.0 == [0; 2] + } + + fn exp(mut self, n: usize) -> Element { + if n == 0 { + Element::constant(1) + } else if self == Element::zero() { + Element::zero() + } else { + let x = self; + for _ in 1..n { + self = self * x; + } + + self + } + } + + // reduces from some polynomial with degree <= 2. + #[inline] + fn reduce_from(mut x: [u8; 3]) -> Self { + if x[0] != 0 { + // divide x by EXT_POLY and use remainder. + // i = 0 here. + // c*x^(i+j) = a*x^i*b*x^j + x[1] ^= galois_8::mul(EXT_POLY[1], x[0]); + x[2] ^= galois_8::mul(EXT_POLY[2], x[0]); + } + + Element([x[1], x[2]]) + } + + fn degree(&self) -> usize { + if self.0[0] != 0 { + 1 + } else { + 0 + } + } +} + +impl From<[u8; 2]> for Element { + fn from(c: [u8; 2]) -> Self { + Element(c) + } +} + +impl Default for Element { + fn default() -> Self { + Element::zero() + } +} + +impl Add for Element { + type Output = Element; + + fn add(self, other: Self) -> Element { + Element([self.0[0] ^ other.0[0], self.0[1] ^ other.0[1]]) + } +} + +impl Sub for Element { + type Output = Element; + + fn sub(self, other: Self) -> Element { + self.add(other) + } +} + +impl Mul for Element { + type Output = Element; + + fn mul(self, rhs: Self) -> Element { + // FOIL; our elements are linear at most, with two coefficients + let out: [u8; 3] = [ + galois_8::mul(self.0[0], rhs.0[0]), + galois_8::add( + galois_8::mul(self.0[1], rhs.0[0]), + galois_8::mul(self.0[0], rhs.0[1]), + ), + galois_8::mul(self.0[1], rhs.0[1]), + ]; + + Element::reduce_from(out) + } +} + +impl Mul for Element { + type Output = Element; + + fn mul(self, rhs: u8) -> Element { + Element([galois_8::mul(rhs, self.0[0]), galois_8::mul(rhs, self.0[1])]) + } +} + +impl Div for Element { + type Output = Element; + + fn div(self, rhs: Self) -> Element { + self * rhs.inverse() + } +} + +// helpers for division. + +#[derive(Debug)] +enum EgcdRhs { + Element(Element), + ExtPoly, +} + +impl Element { + // compute extended euclidean algorithm against an element of self, + // where the GCD is known to be constant. + fn const_egcd(self, rhs: EgcdRhs) -> (u8, Element, Element) { + if self.is_zero() { + let rhs = match rhs { + EgcdRhs::Element(elem) => elem, + EgcdRhs::ExtPoly => panic!("const_egcd invoked with divisible"), + }; + (rhs.0[1], Element::constant(0), Element::constant(1)) + } else { + let (cur_quotient, cur_remainder) = match rhs { + EgcdRhs::Element(rhs) => rhs.polynom_div(self), + EgcdRhs::ExtPoly => Element::div_ext_by(self), + }; + + // GCD is constant because EXT_POLY is irreducible + let (g, x, y) = cur_remainder.const_egcd(EgcdRhs::Element(self)); + (g, y + (cur_quotient * x), x) + } + } + + // divide EXT_POLY by self. + fn div_ext_by(rhs: Self) -> (Element, Element) { + if rhs.degree() == 0 { + // dividing by constant is the same as multiplying by another constant. + // and all constant multiples of EXT_POLY are in the equivalence class + // of 0. + return (Element::zero(), Element::zero()); + } + + // divisor is ensured linear here. + // now ensure divisor is monic. + let leading_mul_inv = galois_8::div(1, rhs.0[0]); + + let monictized = rhs * leading_mul_inv; + let mut poly = EXT_POLY; + + for i in 0..2 { + let coef = poly[i]; + for j in 1..2 { + if rhs.0[j] != 0 { + poly[i + j] ^= galois_8::mul(monictized.0[j], coef); + } + } + } + + let remainder = Element::constant(poly[2]); + let quotient = Element([poly[0], poly[1]]) * leading_mul_inv; + + (quotient, remainder) + } + + fn polynom_div(self, rhs: Self) -> (Element, Element) { + let divisor_degree = rhs.degree(); + if rhs.is_zero() { + panic!("divide by 0"); + } else if self.degree() < divisor_degree { + // If divisor's degree (len-1) is bigger, all dividend is a remainder + (Element::zero(), self) + } else if divisor_degree == 0 { + // divide by constant. + let invert = galois_8::div(1, rhs.0[1]); + let quotient = Element([ + galois_8::mul(invert, self.0[0]), + galois_8::mul(invert, self.0[1]), + ]); + + (quotient, Element::zero()) + } else { + // self degree is at least divisor degree, divisor degree not 0. + // therefore both are 1. + debug_assert_eq!(self.degree(), divisor_degree); + debug_assert_eq!(self.degree(), 1); + + // ensure rhs is constant. + let leading_mul_inv = galois_8::div(1, rhs.0[0]); + let monic = Element([ + galois_8::mul(leading_mul_inv, rhs.0[0]), + galois_8::mul(leading_mul_inv, rhs.0[1]), + ]); + + let leading_coeff = self.0[0]; + let mut remainder = self.0[1]; + + if monic.0[1] != 0 { + remainder ^= galois_8::mul(monic.0[1], self.0[0]); + } + + ( + Element::constant(galois_8::mul(leading_mul_inv, leading_coeff)), + Element::constant(remainder), + ) + } + } + + /// Convert the inverse of this field element. Panics if zero. + fn inverse(self) -> Element { + if self.is_zero() { + panic!("Cannot invert 0"); + } + + // first step of extended euclidean algorithm. + // done here because EXT_POLY is outside the scope of `Element`. + let (gcd, y) = { + // self / EXT_POLY = (0, self) + let remainder = self; + + // GCD is constant because EXT_POLY is irreducible + let (g, x, _) = remainder.const_egcd(EgcdRhs::ExtPoly); + + (g, x) + }; + + // we still need to normalize it by dividing by the gcd + if gcd != 0 { + // EXT_POLY is irreducible so the GCD will always be constant. + // EXT_POLY*x + self*y = gcd + // self*y = gcd - EXT_POLY*x + // + // EXT_POLY*x is representative of the equivalence class of 0. + let normalizer = galois_8::div(1, gcd); + y * normalizer + } else { + // self is equivalent to zero. + panic!("Cannot invert 0"); + } + } +} + +#[cfg(test)] +mod tests { + use super::*; + use quickcheck::Arbitrary; + + impl Arbitrary for Element { + fn arbitrary(gen: &mut G) -> Self { + let a = u8::arbitrary(gen); + let b = u8::arbitrary(gen); + + Element([a, b]) + } + } + + quickcheck! { + fn qc_add_associativity(a: Element, b: Element, c: Element) -> bool { + a + (b + c) == (a + b) + c + } + + fn qc_mul_associativity(a: Element, b: Element, c: Element) -> bool { + a * (b * c) == (a * b) * c + } + + fn qc_additive_identity(a: Element) -> bool { + let zero = Element::zero(); + a - (zero - a) == zero + } + + fn qc_multiplicative_identity(a: Element) -> bool { + a.is_zero() || { + let one = Element([0, 1]); + (one / a) * a == one + } + } + + fn qc_add_commutativity(a: Element, b: Element) -> bool { + a + b == b + a + } + + fn qc_mul_commutativity(a: Element, b: Element) -> bool { + a * b == b * a + } + + fn qc_add_distributivity(a: Element, b: Element, c: Element) -> bool { + a * (b + c) == (a * b) + (a * c) + } + + fn qc_inverse(a: Element) -> bool { + a.is_zero() || { + let inv = a.inverse(); + a * inv == Element::constant(1) + } + } + + fn qc_exponent_1(a: Element, n: u8) -> bool { + a.is_zero() || n == 0 || { + let mut b = a.exp(n as usize); + for _ in 1..n { + b = b / a; + } + + a == b + } + } + + fn qc_exponent_2(a: Element, n: u8) -> bool { + a.is_zero() || { + let mut res = true; + let mut b = Element::constant(1); + + for i in 0..n { + res = res && b == a.exp(i as usize); + b = b * a; + } + + res + } + } + + fn qc_exp_zero_is_one(a: Element) -> bool { + a.exp(0) == Element::constant(1) + } + } + + #[test] + #[should_panic] + fn test_div_b_is_0() { + let _ = Element([1, 0]) / Element::zero(); + } + + #[test] + fn zero_to_zero_is_one() { + assert_eq!(Element::zero().exp(0), Element::constant(1)) + } +} diff --git a/seaweed-volume/vendor/reed-solomon-erasure/src/galois_8.rs b/seaweed-volume/vendor/reed-solomon-erasure/src/galois_8.rs new file mode 100644 index 000000000..01adc09d9 --- /dev/null +++ b/seaweed-volume/vendor/reed-solomon-erasure/src/galois_8.rs @@ -0,0 +1,621 @@ +//! Implementation of GF(2^8): the finite field with 2^8 elements. + +include!(concat!(env!("OUT_DIR"), "/table.rs")); + +/// The field GF(2^8). +#[derive(Debug, Default, Copy, Clone, PartialEq, Eq)] +pub struct Field; + +impl crate::Field for Field { + const ORDER: usize = 256; + type Elem = u8; + + fn add(a: u8, b: u8) -> u8 { + add(a, b) + } + + fn mul(a: u8, b: u8) -> u8 { + mul(a, b) + } + + fn div(a: u8, b: u8) -> u8 { + div(a, b) + } + + fn exp(elem: u8, n: usize) -> u8 { + exp(elem, n) + } + + fn zero() -> u8 { + 0 + } + + fn one() -> u8 { + 1 + } + + fn nth_internal(n: usize) -> u8 { + n as u8 + } + + fn mul_slice(c: u8, input: &[u8], out: &mut [u8]) { + mul_slice(c, input, out) + } + + fn mul_slice_add(c: u8, input: &[u8], out: &mut [u8]) { + mul_slice_xor(c, input, out) + } +} + +/// Type alias of ReedSolomon over GF(2^8). +pub type ReedSolomon = crate::ReedSolomon; + +/// Type alias of ShardByShard over GF(2^8). +pub type ShardByShard<'a> = crate::ShardByShard<'a, Field>; + +/// Add two elements. +pub fn add(a: u8, b: u8) -> u8 { + a ^ b +} + +/// Subtract `b` from `a`. +#[cfg(test)] +pub fn sub(a: u8, b: u8) -> u8 { + a ^ b +} + +/// Multiply two elements. +pub fn mul(a: u8, b: u8) -> u8 { + MUL_TABLE[a as usize][b as usize] +} + +/// Divide one element by another. `b`, the divisor, may not be 0. +pub fn div(a: u8, b: u8) -> u8 { + if a == 0 { + 0 + } else if b == 0 { + panic!("Divisor is 0") + } else { + let log_a = LOG_TABLE[a as usize]; + let log_b = LOG_TABLE[b as usize]; + let mut log_result = log_a as isize - log_b as isize; + if log_result < 0 { + log_result += 255; + } + EXP_TABLE[log_result as usize] + } +} + +/// Compute a^n. +pub fn exp(a: u8, n: usize) -> u8 { + if n == 0 { + 1 + } else if a == 0 { + 0 + } else { + let log_a = LOG_TABLE[a as usize]; + let mut log_result = log_a as usize * n; + while 255 <= log_result { + log_result -= 255; + } + EXP_TABLE[log_result] + } +} + +const PURE_RUST_UNROLL: isize = 4; + +macro_rules! return_if_empty { + ( + $len:expr + ) => { + if $len == 0 { + return; + } + }; +} + +#[cfg(not(all( + feature = "simd-accel", + any(target_arch = "x86_64", target_arch = "aarch64"), + not(target_env = "msvc"), + not(any(target_os = "android", target_os = "ios")) +)))] +pub fn mul_slice(c: u8, input: &[u8], out: &mut [u8]) { + mul_slice_pure_rust(c, input, out); +} + +#[cfg(not(all( + feature = "simd-accel", + any(target_arch = "x86_64", target_arch = "aarch64"), + not(target_env = "msvc"), + not(any(target_os = "android", target_os = "ios")) +)))] +pub fn mul_slice_xor(c: u8, input: &[u8], out: &mut [u8]) { + mul_slice_xor_pure_rust(c, input, out); +} + +fn mul_slice_pure_rust(c: u8, input: &[u8], out: &mut [u8]) { + let mt = &MUL_TABLE[c as usize]; + let mt_ptr: *const u8 = &mt[0]; + + assert_eq!(input.len(), out.len()); + + let len: isize = input.len() as isize; + return_if_empty!(len); + + let mut input_ptr: *const u8 = &input[0]; + let mut out_ptr: *mut u8 = &mut out[0]; + + let mut n: isize = 0; + unsafe { + assert_eq!(4, PURE_RUST_UNROLL); + if len > PURE_RUST_UNROLL { + let len_minus_unroll = len - PURE_RUST_UNROLL; + while n < len_minus_unroll { + *out_ptr = *mt_ptr.offset(*input_ptr as isize); + *out_ptr.offset(1) = *mt_ptr.offset(*input_ptr.offset(1) as isize); + *out_ptr.offset(2) = *mt_ptr.offset(*input_ptr.offset(2) as isize); + *out_ptr.offset(3) = *mt_ptr.offset(*input_ptr.offset(3) as isize); + + input_ptr = input_ptr.offset(PURE_RUST_UNROLL); + out_ptr = out_ptr.offset(PURE_RUST_UNROLL); + n += PURE_RUST_UNROLL; + } + } + while n < len { + *out_ptr = *mt_ptr.offset(*input_ptr as isize); + + input_ptr = input_ptr.offset(1); + out_ptr = out_ptr.offset(1); + n += 1; + } + } + /* for n in 0..input.len() { + * out[n] = mt[input[n] as usize] + * } + */ +} + +fn mul_slice_xor_pure_rust(c: u8, input: &[u8], out: &mut [u8]) { + let mt = &MUL_TABLE[c as usize]; + let mt_ptr: *const u8 = &mt[0]; + + assert_eq!(input.len(), out.len()); + + let len: isize = input.len() as isize; + return_if_empty!(len); + + let mut input_ptr: *const u8 = &input[0]; + let mut out_ptr: *mut u8 = &mut out[0]; + + let mut n: isize = 0; + unsafe { + assert_eq!(4, PURE_RUST_UNROLL); + if len > PURE_RUST_UNROLL { + let len_minus_unroll = len - PURE_RUST_UNROLL; + while n < len_minus_unroll { + *out_ptr ^= *mt_ptr.offset(*input_ptr as isize); + *out_ptr.offset(1) ^= *mt_ptr.offset(*input_ptr.offset(1) as isize); + *out_ptr.offset(2) ^= *mt_ptr.offset(*input_ptr.offset(2) as isize); + *out_ptr.offset(3) ^= *mt_ptr.offset(*input_ptr.offset(3) as isize); + + input_ptr = input_ptr.offset(PURE_RUST_UNROLL); + out_ptr = out_ptr.offset(PURE_RUST_UNROLL); + n += PURE_RUST_UNROLL; + } + } + while n < len { + *out_ptr ^= *mt_ptr.offset(*input_ptr as isize); + + input_ptr = input_ptr.offset(1); + out_ptr = out_ptr.offset(1); + n += 1; + } + } + /* for n in 0..input.len() { + * out[n] ^= mt[input[n] as usize]; + * } + */ +} + +#[cfg(test)] +fn slice_xor(input: &[u8], out: &mut [u8]) { + assert_eq!(input.len(), out.len()); + + let len: isize = input.len() as isize; + return_if_empty!(len); + + let mut input_ptr: *const u8 = &input[0]; + let mut out_ptr: *mut u8 = &mut out[0]; + + let mut n: isize = 0; + unsafe { + assert_eq!(4, PURE_RUST_UNROLL); + if len > PURE_RUST_UNROLL { + let len_minus_unroll = len - PURE_RUST_UNROLL; + while n < len_minus_unroll { + *out_ptr ^= *input_ptr; + *out_ptr.offset(1) ^= *input_ptr.offset(1); + *out_ptr.offset(2) ^= *input_ptr.offset(2); + *out_ptr.offset(3) ^= *input_ptr.offset(3); + + input_ptr = input_ptr.offset(PURE_RUST_UNROLL); + out_ptr = out_ptr.offset(PURE_RUST_UNROLL); + n += PURE_RUST_UNROLL; + } + } + while n < len { + *out_ptr ^= *input_ptr; + + input_ptr = input_ptr.offset(1); + out_ptr = out_ptr.offset(1); + n += 1; + } + } + /* for n in 0..input.len() { + * out[n] ^= input[n] + * } + */ +} + +#[cfg(all( + feature = "simd-accel", + any(target_arch = "x86_64", target_arch = "aarch64"), + not(target_env = "msvc"), + not(any(target_os = "android", target_os = "ios")) +))] +extern "C" { + fn reedsolomon_gal_mul( + low: *const u8, + high: *const u8, + input: *const u8, + out: *mut u8, + len: libc::size_t, + ) -> libc::size_t; + + fn reedsolomon_gal_mul_xor( + low: *const u8, + high: *const u8, + input: *const u8, + out: *mut u8, + len: libc::size_t, + ) -> libc::size_t; +} + +#[cfg(all( + feature = "simd-accel", + any(target_arch = "x86_64", target_arch = "aarch64"), + not(target_env = "msvc"), + not(any(target_os = "android", target_os = "ios")) +))] +pub fn mul_slice(c: u8, input: &[u8], out: &mut [u8]) { + let low: *const u8 = &MUL_TABLE_LOW[c as usize][0]; + let high: *const u8 = &MUL_TABLE_HIGH[c as usize][0]; + + assert_eq!(input.len(), out.len()); + + let input_ptr: *const u8 = &input[0]; + let out_ptr: *mut u8 = &mut out[0]; + let size: libc::size_t = input.len(); + + let bytes_done: usize = + unsafe { reedsolomon_gal_mul(low, high, input_ptr, out_ptr, size) as usize }; + + mul_slice_pure_rust(c, &input[bytes_done..], &mut out[bytes_done..]); +} + +#[cfg(all( + feature = "simd-accel", + any(target_arch = "x86_64", target_arch = "aarch64"), + not(target_env = "msvc"), + not(any(target_os = "android", target_os = "ios")) +))] +pub fn mul_slice_xor(c: u8, input: &[u8], out: &mut [u8]) { + let low: *const u8 = &MUL_TABLE_LOW[c as usize][0]; + let high: *const u8 = &MUL_TABLE_HIGH[c as usize][0]; + + assert_eq!(input.len(), out.len()); + + let input_ptr: *const u8 = &input[0]; + let out_ptr: *mut u8 = &mut out[0]; + let size: libc::size_t = input.len(); + + let bytes_done: usize = + unsafe { reedsolomon_gal_mul_xor(low, high, input_ptr, out_ptr, size) as usize }; + + mul_slice_xor_pure_rust(c, &input[bytes_done..], &mut out[bytes_done..]); +} + +#[cfg(test)] +mod tests { + extern crate alloc; + + use alloc::vec; + + use super::*; + use crate::tests::fill_random; + use rand; + + static BACKBLAZE_LOG_TABLE: [u8; 256] = [ + //-1, 0, 1, 25, 2, 50, 26, 198, + // first value is changed from -1 to 0 + 0, 0, 1, 25, 2, 50, 26, 198, 3, 223, 51, 238, 27, 104, 199, 75, 4, 100, 224, 14, 52, 141, + 239, 129, 28, 193, 105, 248, 200, 8, 76, 113, 5, 138, 101, 47, 225, 36, 15, 33, 53, 147, + 142, 218, 240, 18, 130, 69, 29, 181, 194, 125, 106, 39, 249, 185, 201, 154, 9, 120, 77, + 228, 114, 166, 6, 191, 139, 98, 102, 221, 48, 253, 226, 152, 37, 179, 16, 145, 34, 136, 54, + 208, 148, 206, 143, 150, 219, 189, 241, 210, 19, 92, 131, 56, 70, 64, 30, 66, 182, 163, + 195, 72, 126, 110, 107, 58, 40, 84, 250, 133, 186, 61, 202, 94, 155, 159, 10, 21, 121, 43, + 78, 212, 229, 172, 115, 243, 167, 87, 7, 112, 192, 247, 140, 128, 99, 13, 103, 74, 222, + 237, 49, 197, 254, 24, 227, 165, 153, 119, 38, 184, 180, 124, 17, 68, 146, 217, 35, 32, + 137, 46, 55, 63, 209, 91, 149, 188, 207, 205, 144, 135, 151, 178, 220, 252, 190, 97, 242, + 86, 211, 171, 20, 42, 93, 158, 132, 60, 57, 83, 71, 109, 65, 162, 31, 45, 67, 216, 183, + 123, 164, 118, 196, 23, 73, 236, 127, 12, 111, 246, 108, 161, 59, 82, 41, 157, 85, 170, + 251, 96, 134, 177, 187, 204, 62, 90, 203, 89, 95, 176, 156, 169, 160, 81, 11, 245, 22, 235, + 122, 117, 44, 215, 79, 174, 213, 233, 230, 231, 173, 232, 116, 214, 244, 234, 168, 80, 88, + 175, + ]; + + #[test] + fn log_table_same_as_backblaze() { + for i in 0..256 { + assert_eq!(LOG_TABLE[i], BACKBLAZE_LOG_TABLE[i]); + } + } + + #[test] + fn test_associativity() { + for a in 0..256 { + let a = a as u8; + for b in 0..256 { + let b = b as u8; + for c in 0..256 { + let c = c as u8; + let x = add(a, add(b, c)); + let y = add(add(a, b), c); + assert_eq!(x, y); + let x = mul(a, mul(b, c)); + let y = mul(mul(a, b), c); + assert_eq!(x, y); + } + } + } + } + + quickcheck! { + fn qc_add_associativity(a: u8, b: u8, c: u8) -> bool { + add(a, add(b, c)) == add(add(a, b), c) + } + + fn qc_mul_associativity(a: u8, b: u8, c: u8) -> bool { + mul(a, mul(b, c)) == mul(mul(a, b), c) + } + } + + #[test] + fn test_identity() { + for a in 0..256 { + let a = a as u8; + let b = sub(0, a); + let c = sub(a, b); + assert_eq!(c, 0); + if a != 0 { + let b = div(1, a); + let c = mul(a, b); + assert_eq!(c, 1); + } + } + } + + quickcheck! { + fn qc_additive_identity(a: u8) -> bool { + sub(a, sub(0, a)) == 0 + } + + fn qc_multiplicative_identity(a: u8) -> bool { + if a == 0 { true } + else { mul(a, div(1, a)) == 1 } + } + } + + #[test] + fn test_commutativity() { + for a in 0..256 { + let a = a as u8; + for b in 0..256 { + let b = b as u8; + let x = add(a, b); + let y = add(b, a); + assert_eq!(x, y); + let x = mul(a, b); + let y = mul(b, a); + assert_eq!(x, y); + } + } + } + + quickcheck! { + fn qc_add_commutativity(a: u8, b: u8) -> bool { + add(a, b) == add(b, a) + } + + fn qc_mul_commutativity(a: u8, b: u8) -> bool { + mul(a, b) == mul(b, a) + } + } + + #[test] + fn test_distributivity() { + for a in 0..256 { + let a = a as u8; + for b in 0..256 { + let b = b as u8; + for c in 0..256 { + let c = c as u8; + let x = mul(a, add(b, c)); + let y = add(mul(a, b), mul(a, c)); + assert_eq!(x, y); + } + } + } + } + + quickcheck! { + fn qc_add_distributivity(a: u8, b: u8, c: u8) -> bool { + mul(a, add(b, c)) == add(mul(a, b), mul(a, c)) + } + } + + #[test] + fn test_exp() { + for a in 0..256 { + let a = a as u8; + let mut power = 1u8; + for j in 0..256 { + let x = exp(a, j); + assert_eq!(x, power); + power = mul(power, a); + } + } + } + + #[test] + fn test_galois() { + assert_eq!(mul(3, 4), 12); + assert_eq!(mul(7, 7), 21); + assert_eq!(mul(23, 45), 41); + + let input = [ + 0, 1, 2, 3, 4, 5, 6, 10, 50, 100, 150, 174, 201, 255, 99, 32, 67, 85, 200, 199, 198, + 197, 196, 195, 194, 193, 192, 191, 190, 189, 188, 187, 186, 185, + ]; + let mut output1 = vec![0; input.len()]; + let mut output2 = vec![0; input.len()]; + mul_slice(25, &input, &mut output1); + let expect = [ + 0x0, 0x19, 0x32, 0x2b, 0x64, 0x7d, 0x56, 0xfa, 0xb8, 0x6d, 0xc7, 0x85, 0xc3, 0x1f, + 0x22, 0x7, 0x25, 0xfe, 0xda, 0x5d, 0x44, 0x6f, 0x76, 0x39, 0x20, 0xb, 0x12, 0x11, 0x8, + 0x23, 0x3a, 0x75, 0x6c, 0x47, + ]; + for i in 0..input.len() { + assert_eq!(expect[i], output1[i]); + } + mul_slice(25, &input, &mut output2); + for i in 0..input.len() { + assert_eq!(expect[i], output2[i]); + } + + let expect_xor = [ + 0x0, 0x2d, 0x5a, 0x77, 0xb4, 0x99, 0xee, 0x2f, 0x79, 0xf2, 0x7, 0x51, 0xd4, 0x19, 0x31, + 0xc9, 0xf8, 0xfc, 0xf9, 0x4f, 0x62, 0x15, 0x38, 0xfb, 0xd6, 0xa1, 0x8c, 0x96, 0xbb, + 0xcc, 0xe1, 0x22, 0xf, 0x78, + ]; + mul_slice_xor(52, &input, &mut output1); + for i in 0..input.len() { + assert_eq!(expect_xor[i], output1[i]); + } + mul_slice_xor(52, &input, &mut output2); + for i in 0..input.len() { + assert_eq!(expect_xor[i], output2[i]); + } + + let expect = [ + 0x0, 0xb1, 0x7f, 0xce, 0xfe, 0x4f, 0x81, 0x9e, 0x3, 0x6, 0xe8, 0x75, 0xbd, 0x40, 0x36, + 0xa3, 0x95, 0xcb, 0xc, 0xdd, 0x6c, 0xa2, 0x13, 0x23, 0x92, 0x5c, 0xed, 0x1b, 0xaa, + 0x64, 0xd5, 0xe5, 0x54, 0x9a, + ]; + mul_slice(177, &input, &mut output1); + for i in 0..input.len() { + assert_eq!(expect[i], output1[i]); + } + mul_slice(177, &input, &mut output2); + for i in 0..input.len() { + assert_eq!(expect[i], output2[i]); + } + + let expect_xor = [ + 0x0, 0xc4, 0x95, 0x51, 0x37, 0xf3, 0xa2, 0xfb, 0xec, 0xc5, 0xd0, 0xc7, 0x53, 0x88, + 0xa3, 0xa5, 0x6, 0x78, 0x97, 0x9f, 0x5b, 0xa, 0xce, 0xa8, 0x6c, 0x3d, 0xf9, 0xdf, 0x1b, + 0x4a, 0x8e, 0xe8, 0x2c, 0x7d, + ]; + mul_slice_xor(117, &input, &mut output1); + for i in 0..input.len() { + assert_eq!(expect_xor[i], output1[i]); + } + mul_slice_xor(117, &input, &mut output2); + for i in 0..input.len() { + assert_eq!(expect_xor[i], output2[i]); + } + + assert_eq!(exp(2, 2), 4); + assert_eq!(exp(5, 20), 235); + assert_eq!(exp(13, 7), 43); + } + + #[test] + fn test_slice_add() { + let length_list = [16, 32, 34]; + for len in length_list.iter() { + let mut input = vec![0; *len]; + fill_random(&mut input); + let mut output = vec![0; *len]; + fill_random(&mut output); + let mut expect = vec![0; *len]; + for i in 0..expect.len() { + expect[i] = input[i] ^ output[i]; + } + slice_xor(&input, &mut output); + for i in 0..expect.len() { + assert_eq!(expect[i], output[i]); + } + fill_random(&mut output); + for i in 0..expect.len() { + expect[i] = input[i] ^ output[i]; + } + slice_xor(&input, &mut output); + for i in 0..expect.len() { + assert_eq!(expect[i], output[i]); + } + } + } + + #[test] + fn test_div_a_is_0() { + assert_eq!(0, div(0, 100)); + } + + #[test] + #[should_panic] + fn test_div_b_is_0() { + div(1, 0); + } + + #[test] + fn test_same_as_maybe_ffi() { + let len = 10_003; + for _ in 0..100 { + let c = rand::random::(); + let mut input = vec![0; len]; + fill_random(&mut input); + { + let mut output = vec![0; len]; + fill_random(&mut output); + let mut output_copy = output.clone(); + + mul_slice(c, &input, &mut output); + mul_slice(c, &input, &mut output_copy); + + assert_eq!(output, output_copy); + } + { + let mut output = vec![0; len]; + fill_random(&mut output); + let mut output_copy = output.clone(); + + mul_slice_xor(c, &input, &mut output); + mul_slice_xor(c, &input, &mut output_copy); + + assert_eq!(output, output_copy); + } + } + } +} diff --git a/seaweed-volume/vendor/reed-solomon-erasure/src/lib.rs b/seaweed-volume/vendor/reed-solomon-erasure/src/lib.rs new file mode 100644 index 000000000..0ba04ae0e --- /dev/null +++ b/seaweed-volume/vendor/reed-solomon-erasure/src/lib.rs @@ -0,0 +1,200 @@ +//! This crate provides an encoder/decoder for Reed-Solomon erasure code. +//! +//! Please note that erasure coding means errors are not directly detected or corrected, +//! but missing data pieces (shards) can be reconstructed given that +//! the configuration provides high enough redundancy. +//! +//! You will have to implement error detection separately (e.g. via checksums) +//! and simply leave out the corrupted shards when attempting to reconstruct +//! the missing data. +#![allow(dead_code)] +#![cfg_attr(not(feature = "std"), no_std)] + +#[cfg(test)] +#[macro_use] +extern crate quickcheck; + +#[cfg(test)] +extern crate rand; + +extern crate smallvec; + +#[cfg(feature = "simd-accel")] +extern crate libc; + +use ::core::iter; +use ::core::iter::FromIterator; + +#[macro_use] +mod macros; + +mod core; +mod errors; +mod matrix; + +#[cfg(test)] +mod tests; + +pub mod galois_16; +pub mod galois_8; + +pub use crate::errors::Error; +pub use crate::errors::SBSError; + +pub use crate::core::ReedSolomon; +pub use crate::core::ShardByShard; + +// TODO: Can be simplified once https://github.com/rust-lang/rfcs/issues/2505 is resolved +#[cfg(not(feature = "std"))] +use libm::log2f as log2; +#[cfg(feature = "std")] +fn log2(n: f32) -> f32 { + n.log2() +} + +/// A finite field to perform encoding over. +pub trait Field: Sized { + /// The order of the field. This is a limit on the number of shards + /// in an encoding. + const ORDER: usize; + + /// The representational type of the field. + type Elem: Default + Clone + Copy + PartialEq + ::core::fmt::Debug; + + /// Add two elements together. + fn add(a: Self::Elem, b: Self::Elem) -> Self::Elem; + + /// Multiply two elements together. + fn mul(a: Self::Elem, b: Self::Elem) -> Self::Elem; + + /// Divide a by b. Panics is b is zero. + fn div(a: Self::Elem, b: Self::Elem) -> Self::Elem; + + /// Raise `a` to the n'th power. + fn exp(a: Self::Elem, n: usize) -> Self::Elem; + + /// The "zero" element or additive identity. + fn zero() -> Self::Elem; + + /// The "one" element or multiplicative identity. + fn one() -> Self::Elem; + + fn nth_internal(n: usize) -> Self::Elem; + + /// Yield the nth element of the field. Panics if n >= ORDER. + /// Assignment is arbitrary but must be unique to `n`. + fn nth(n: usize) -> Self::Elem { + if n >= Self::ORDER { + let pow = log2(Self::ORDER as f32) as usize; + panic!("{} out of bounds for GF(2^{}) member", n, pow) + } + + Self::nth_internal(n) + } + + /// Multiply a slice of elements by another. Writes into the output slice. + /// + /// # Panics + /// Panics if the output slice does not have equal length to the input. + fn mul_slice(elem: Self::Elem, input: &[Self::Elem], out: &mut [Self::Elem]) { + assert_eq!(input.len(), out.len()); + + for (i, o) in input.iter().zip(out) { + *o = Self::mul(elem.clone(), i.clone()) + } + } + + /// Multiply a slice of elements by another, adding each result to the corresponding value in + /// `out`. + /// + /// # Panics + /// Panics if the output slice does not have equal length to the input. + fn mul_slice_add(elem: Self::Elem, input: &[Self::Elem], out: &mut [Self::Elem]) { + assert_eq!(input.len(), out.len()); + + for (i, o) in input.iter().zip(out) { + *o = Self::add(o.clone(), Self::mul(elem.clone(), i.clone())) + } + } +} + +/// Something which might hold a shard. +/// +/// This trait is used in reconstruction, where some of the shards +/// may be unknown. +pub trait ReconstructShard { + /// The size of the shard data; `None` if empty. + fn len(&self) -> Option; + + /// Get a mutable reference to the shard data, returning `None` if uninitialized. + fn get(&mut self) -> Option<&mut [F::Elem]>; + + /// Get a mutable reference to the shard data, initializing it to the + /// given length if it was `None`. Returns an error if initialization fails. + fn get_or_initialize( + &mut self, + len: usize, + ) -> Result<&mut [F::Elem], Result<&mut [F::Elem], Error>>; +} + +impl + AsMut<[F::Elem]> + FromIterator> ReconstructShard + for Option +{ + fn len(&self) -> Option { + self.as_ref().map(|x| x.as_ref().len()) + } + + fn get(&mut self) -> Option<&mut [F::Elem]> { + self.as_mut().map(|x| x.as_mut()) + } + + fn get_or_initialize( + &mut self, + len: usize, + ) -> Result<&mut [F::Elem], Result<&mut [F::Elem], Error>> { + let is_some = self.is_some(); + let x = self + .get_or_insert_with(|| iter::repeat(F::zero()).take(len).collect()) + .as_mut(); + + if is_some { + Ok(x) + } else { + Err(Ok(x)) + } + } +} + +impl + AsMut<[F::Elem]>> ReconstructShard for (T, bool) { + fn len(&self) -> Option { + if !self.1 { + None + } else { + Some(self.0.as_ref().len()) + } + } + + fn get(&mut self) -> Option<&mut [F::Elem]> { + if !self.1 { + None + } else { + Some(self.0.as_mut()) + } + } + + fn get_or_initialize( + &mut self, + len: usize, + ) -> Result<&mut [F::Elem], Result<&mut [F::Elem], Error>> { + let x = self.0.as_mut(); + if x.len() == len { + if self.1 { + Ok(x) + } else { + Err(Ok(x)) + } + } else { + Err(Err(Error::IncorrectShardSize)) + } + } +} diff --git a/seaweed-volume/vendor/reed-solomon-erasure/src/macros.rs b/seaweed-volume/vendor/reed-solomon-erasure/src/macros.rs new file mode 100644 index 000000000..340b27430 --- /dev/null +++ b/seaweed-volume/vendor/reed-solomon-erasure/src/macros.rs @@ -0,0 +1,245 @@ +/// Constructs vector of shards. +/// +/// # Example +/// ```rust +/// # #[macro_use] extern crate reed_solomon_erasure; +/// # use reed_solomon_erasure::*; +/// # fn main () { +/// let shards: Vec> = shards!([1, 2, 3], +/// [4, 5, 6]); +/// # } +/// ``` +#[macro_export] +macro_rules! shards { + ( + $( [ $( $x:expr ),* ] ),* + ) => {{ + vec![ $( vec![ $( $x ),* ] ),* ] + }} +} + +/// Makes it easier to work with 2D slices, arrays, etc. +/// +/// # Examples +/// ## Byte arrays on stack to `Vec<&[u8]>` +/// ```rust +/// # #[macro_use] extern crate reed_solomon_erasure; +/// # fn main () { +/// let array: [[u8; 3]; 2] = [[1, 2, 3], +/// [4, 5, 6]]; +/// +/// let refs: Vec<&[u8]> = +/// convert_2D_slices!(array =>to_vec &[u8]); +/// # } +/// ``` +/// ## Byte arrays on stack to `Vec<&mut [u8]>` (borrow mutably) +/// ```rust +/// # #[macro_use] extern crate reed_solomon_erasure; +/// # fn main () { +/// let mut array: [[u8; 3]; 2] = [[1, 2, 3], +/// [4, 5, 6]]; +/// +/// let refs: Vec<&mut [u8]> = +/// convert_2D_slices!(array =>to_mut_vec &mut [u8]); +/// # } +/// ``` +/// ## Byte arrays on stack to `SmallVec<[&mut [u8]; 32]>` (borrow mutably) +/// ```rust +/// # #[macro_use] extern crate reed_solomon_erasure; +/// # extern crate smallvec; +/// # use smallvec::SmallVec; +/// # fn main () { +/// let mut array: [[u8; 3]; 2] = [[1, 2, 3], +/// [4, 5, 6]]; +/// +/// let refs: SmallVec<[&mut [u8]; 32]> = +/// convert_2D_slices!(array =>to_mut SmallVec<[&mut [u8]; 32]>, +/// SmallVec::with_capacity); +/// # } +/// ``` +/// ## Shard array to `SmallVec<[&mut [u8]; 32]>` (borrow mutably) +/// ```rust +/// # #[macro_use] extern crate reed_solomon_erasure; +/// # extern crate smallvec; +/// # use smallvec::SmallVec; +/// # fn main () { +/// let mut shards = shards!([1, 2, 3], +/// [4, 5, 6]); +/// +/// let refs: SmallVec<[&mut [u8]; 32]> = +/// convert_2D_slices!(shards =>to_mut SmallVec<[&mut [u8]; 32]>, +/// SmallVec::with_capacity); +/// # } +/// ``` +/// ## Shard array to `Vec<&mut [u8]>` (borrow mutably) into `SmallVec<[&mut [u8]; 32]>` (move) +/// ```rust +/// # #[macro_use] extern crate reed_solomon_erasure; +/// # extern crate smallvec; +/// # use smallvec::SmallVec; +/// # fn main () { +/// let mut shards = shards!([1, 2, 3], +/// [4, 5, 6]); +/// +/// let refs1 = convert_2D_slices!(shards =>to_mut_vec &mut [u8]); +/// +/// let refs2: SmallVec<[&mut [u8]; 32]> = +/// convert_2D_slices!(refs1 =>into SmallVec<[&mut [u8]; 32]>, +/// SmallVec::with_capacity); +/// # } +/// ``` +#[macro_export] +macro_rules! convert_2D_slices { + ( + $slice:expr =>into_vec $dst_type:ty + ) => { + convert_2D_slices!($slice =>into Vec<$dst_type>, + Vec::with_capacity) + }; + ( + $slice:expr =>to_vec $dst_type:ty + ) => { + convert_2D_slices!($slice =>to Vec<$dst_type>, + Vec::with_capacity) + }; + ( + $slice:expr =>to_mut_vec $dst_type:ty + ) => { + convert_2D_slices!($slice =>to_mut Vec<$dst_type>, + Vec::with_capacity) + }; + ( + $slice:expr =>into $dst_type:ty, $with_capacity:path + ) => {{ + let mut result: $dst_type = + $with_capacity($slice.len()); + for i in $slice.into_iter() { + result.push(i); + } + result + }}; + ( + $slice:expr =>to $dst_type:ty, $with_capacity:path + ) => {{ + let mut result: $dst_type = + $with_capacity($slice.len()); + for i in $slice.iter() { + result.push(i); + } + result + }}; + ( + $slice:expr =>to_mut $dst_type:ty, $with_capacity:path + ) => {{ + let mut result: $dst_type = + $with_capacity($slice.len()); + for i in $slice.iter_mut() { + result.push(i); + } + result + }} +} + +macro_rules! check_slices { + ( + multi => $slices:expr + ) => {{ + let size = $slices[0].as_ref().len(); + if size == 0 { + return Err(Error::EmptyShard); + } + for slice in $slices.iter() { + if slice.as_ref().len() != size { + return Err(Error::IncorrectShardSize); + } + } + }}; + ( + single => $slice_left:expr, single => $slice_right:expr + ) => {{ + if $slice_left.as_ref().len() != $slice_right.as_ref().len() { + return Err(Error::IncorrectShardSize); + } + }}; + ( + multi => $slices:expr, single => $single:expr + ) => {{ + check_slices!(multi => $slices); + + check_slices!(single => $slices[0], single => $single); + }}; + ( + multi => $slices_left:expr, multi => $slices_right:expr + ) => {{ + check_slices!(multi => $slices_left); + check_slices!(multi => $slices_right); + + check_slices!(single => $slices_left[0], single => $slices_right[0]); + }} +} + +macro_rules! check_slice_index { + ( + all => $codec:expr, $index:expr + ) => {{ + if $index >= $codec.total_shard_count { + return Err(Error::InvalidIndex); + } + }}; + ( + data => $codec:expr, $index:expr + ) => {{ + if $index >= $codec.data_shard_count { + return Err(Error::InvalidIndex); + } + }}; + ( + parity => $codec:expr, $index:expr + ) => {{ + if $index >= $codec.parity_shard_count { + return Err(Error::InvalidIndex); + } + }}; +} + +macro_rules! check_piece_count { + ( + all => $codec:expr, $pieces:expr + ) => {{ + if $pieces.as_ref().len() < $codec.total_shard_count { + return Err(Error::TooFewShards); + } + if $pieces.as_ref().len() > $codec.total_shard_count { + return Err(Error::TooManyShards); + } + }}; + ( + data => $codec:expr, $pieces:expr + ) => {{ + if $pieces.as_ref().len() < $codec.data_shard_count { + return Err(Error::TooFewDataShards); + } + if $pieces.as_ref().len() > $codec.data_shard_count { + return Err(Error::TooManyDataShards); + } + }}; + ( + parity => $codec:expr, $pieces:expr + ) => {{ + if $pieces.as_ref().len() < $codec.parity_shard_count { + return Err(Error::TooFewParityShards); + } + if $pieces.as_ref().len() > $codec.parity_shard_count { + return Err(Error::TooManyParityShards); + } + }}; + ( + parity_buf => $codec:expr, $pieces:expr + ) => {{ + if $pieces.as_ref().len() < $codec.parity_shard_count { + return Err(Error::TooFewBufferShards); + } + if $pieces.as_ref().len() > $codec.parity_shard_count { + return Err(Error::TooManyBufferShards); + } + }}; +} diff --git a/seaweed-volume/vendor/reed-solomon-erasure/src/matrix.rs b/seaweed-volume/vendor/reed-solomon-erasure/src/matrix.rs new file mode 100644 index 000000000..508d43046 --- /dev/null +++ b/seaweed-volume/vendor/reed-solomon-erasure/src/matrix.rs @@ -0,0 +1,425 @@ +#![allow(dead_code)] +extern crate alloc; + +use alloc::vec; +use alloc::vec::Vec; + +use crate::Field; +use smallvec::SmallVec; + +#[derive(Debug)] +pub enum Error { + SingularMatrix, +} + +macro_rules! acc { + ( + $m:ident, $r:expr, $c:expr + ) => { + $m.data[$r * $m.col_count + $c] + }; +} + +pub fn flatten(m: Vec>) -> Vec { + let mut result: Vec = Vec::with_capacity(m.len() * m[0].len()); + for row in m { + for v in row { + result.push(v); + } + } + result +} + +#[derive(PartialEq, Debug, Clone)] +pub struct Matrix { + row_count: usize, + col_count: usize, + data: SmallVec<[F::Elem; 1024]>, // store in flattened structure + // the smallvec can hold a matrix of size up to 32x32 in stack +} + +fn calc_matrix_row_start_end(col_count: usize, row: usize) -> (usize, usize) { + let start = row * col_count; + let end = start + col_count; + + (start, end) +} + +impl Matrix { + fn calc_row_start_end(&self, row: usize) -> (usize, usize) { + calc_matrix_row_start_end(self.col_count, row) + } + + pub fn new(rows: usize, cols: usize) -> Matrix { + let data = SmallVec::from_vec(vec![F::zero(); rows * cols]); + + Matrix { + row_count: rows, + col_count: cols, + data, + } + } + + pub fn new_with_data(init_data: Vec>) -> Matrix { + let rows = init_data.len(); + let cols = init_data[0].len(); + + for r in init_data.iter() { + if r.len() != cols { + panic!("Inconsistent row sizes") + } + } + + let data = SmallVec::from_vec(flatten(init_data)); + + Matrix { + row_count: rows, + col_count: cols, + data, + } + } + + #[cfg(test)] + pub fn make_random(size: usize) -> Matrix + where + rand::distributions::Standard: rand::distributions::Distribution, + { + let mut vec: Vec> = vec![vec![Default::default(); size]; size]; + for v in vec.iter_mut() { + crate::tests::fill_random(v); + } + + Matrix::new_with_data(vec) + } + + pub fn identity(size: usize) -> Matrix { + let mut result = Self::new(size, size); + for i in 0..size { + acc!(result, i, i) = F::one(); + } + result + } + + pub fn col_count(&self) -> usize { + self.col_count + } + + pub fn row_count(&self) -> usize { + self.row_count + } + + pub fn get(&self, r: usize, c: usize) -> F::Elem { + acc!(self, r, c).clone() + } + + pub fn set(&mut self, r: usize, c: usize, val: F::Elem) { + acc!(self, r, c) = val; + } + + pub fn multiply(&self, rhs: &Matrix) -> Matrix { + if self.col_count != rhs.row_count { + panic!( + "Colomn count on left is different from row count on right, lhs: {}, rhs: {}", + self.col_count, rhs.row_count + ) + } + let mut result = Self::new(self.row_count, rhs.col_count); + for r in 0..self.row_count { + for c in 0..rhs.col_count { + let mut val = F::zero(); + for i in 0..self.col_count { + let mul = F::mul(acc!(self, r, i).clone(), acc!(rhs, i, c).clone()); + + val = F::add(val, mul); + } + acc!(result, r, c) = val; + } + } + result + } + + pub fn augment(&self, rhs: &Matrix) -> Matrix { + if self.row_count != rhs.row_count { + panic!( + "Matrices do not have the same row count, lhs: {}, rhs: {}", + self.row_count, rhs.row_count + ) + } + let mut result = Self::new(self.row_count, self.col_count + rhs.col_count); + for r in 0..self.row_count { + for c in 0..self.col_count { + acc!(result, r, c) = acc!(self, r, c).clone(); + } + let self_column_count = self.col_count; + for c in 0..rhs.col_count { + acc!(result, r, self_column_count + c) = acc!(rhs, r, c).clone(); + } + } + + result + } + + pub fn sub_matrix(&self, rmin: usize, cmin: usize, rmax: usize, cmax: usize) -> Matrix { + let mut result = Self::new(rmax - rmin, cmax - cmin); + for r in rmin..rmax { + for c in cmin..cmax { + acc!(result, r - rmin, c - cmin) = acc!(self, r, c).clone(); + } + } + result + } + + pub fn get_row(&self, row: usize) -> &[F::Elem] { + let (start, end) = self.calc_row_start_end(row); + + &self.data[start..end] + } + + pub fn swap_rows(&mut self, r1: usize, r2: usize) { + let (r1_s, _) = self.calc_row_start_end(r1); + let (r2_s, _) = self.calc_row_start_end(r2); + + if r1 == r2 { + return; + } else { + for i in 0..self.col_count { + self.data.swap(r1_s + i, r2_s + i); + } + } + } + + pub fn is_square(&self) -> bool { + self.row_count == self.col_count + } + + pub fn gaussian_elim(&mut self) -> Result<(), Error> { + for r in 0..self.row_count { + if acc!(self, r, r) == F::zero() { + for r_below in r + 1..self.row_count { + if acc!(self, r_below, r) != F::zero() { + self.swap_rows(r, r_below); + break; + } + } + } + // If we couldn't find one, the matrix is singular. + if acc!(self, r, r) == F::zero() { + return Err(Error::SingularMatrix); + } + // Scale to 1. + if acc!(self, r, r) != F::one() { + let scale = F::div(F::one(), acc!(self, r, r).clone()); + for c in 0..self.col_count { + acc!(self, r, c) = F::mul(scale, acc!(self, r, c).clone()); + } + } + // Make everything below the 1 be a 0 by subtracting + // a multiple of it. (Subtraction and addition are + // both exclusive or in the Galois field.) + for r_below in r + 1..self.row_count { + if acc!(self, r_below, r) != F::zero() { + let scale = acc!(self, r_below, r).clone(); + for c in 0..self.col_count { + acc!(self, r_below, c) = F::add( + acc!(self, r_below, c).clone(), + F::mul(scale, acc!(self, r, c).clone()), + ); + } + } + } + } + + // Now clear the part above the main diagonal. + for d in 0..self.row_count { + for r_above in 0..d { + if acc!(self, r_above, d) != F::zero() { + let scale = acc!(self, r_above, d).clone(); + for c in 0..self.col_count { + acc!(self, r_above, c) = F::add( + acc!(self, r_above, c).clone(), + F::mul(scale, acc!(self, d, c).clone()), + ); + } + } + } + } + Ok(()) + } + + pub fn invert(&self) -> Result, Error> { + if !self.is_square() { + panic!("Trying to invert a non-square matrix") + } + + let row_count = self.row_count; + let col_count = self.col_count; + + let mut work = self.augment(&Self::identity(row_count)); + work.gaussian_elim()?; + + Ok(work.sub_matrix(0, row_count, col_count, col_count * 2)) + } + + pub fn vandermonde(rows: usize, cols: usize) -> Matrix { + let mut result = Self::new(rows, cols); + + for r in 0..rows { + // doesn't matter what `r_a` is as long as it's unique. + // then the vandermonde matrix is invertible. + let r_a = F::nth(r); + for c in 0..cols { + acc!(result, r, c) = F::exp(r_a, c); + } + } + + result + } +} + +#[cfg(test)] +mod tests { + extern crate alloc; + + use alloc::vec; + + use super::Matrix; + use crate::galois_8; + + macro_rules! matrix { + ( + $( + [ $( $x:expr ),+ ] + ),* + ) => ( + Matrix::::new_with_data(vec![ $( vec![$( $x ),*] ),* ]) + ); + ($rows:expr, $cols:expr) => (Matrix::new($rows, $cols)); + } + + #[test] + fn test_matrix_col_count() { + let m1 = matrix!([1, 0, 0]); + let m2 = matrix!([0, 0, 0], [0, 0, 0]); + let m3: Matrix = Matrix::new(1, 4); + + assert_eq!(3, m1.col_count()); + assert_eq!(3, m2.col_count()); + assert_eq!(4, m3.col_count()); + } + + #[test] + fn test_matrix_row_count() { + let m1 = matrix!([1, 0, 0]); + let m2 = matrix!([0, 0, 0], [0, 0, 0]); + let m3: Matrix = Matrix::new(1, 4); + + assert_eq!(1, m1.row_count()); + assert_eq!(2, m2.row_count()); + assert_eq!(1, m3.row_count()); + } + + #[test] + fn test_matrix_swap_rows() { + { + let mut m1 = matrix!([1, 2, 3], [4, 5, 6], [7, 8, 9]); + let expect = matrix!([7, 8, 9], [4, 5, 6], [1, 2, 3]); + m1.swap_rows(0, 2); + assert_eq!(expect, m1); + } + { + let mut m1 = matrix!([1, 2, 3], [4, 5, 6], [7, 8, 9]); + let expect = m1.clone(); + m1.swap_rows(0, 0); + assert_eq!(expect, m1); + m1.swap_rows(1, 1); + assert_eq!(expect, m1); + m1.swap_rows(2, 2); + assert_eq!(expect, m1); + } + } + + #[test] + #[should_panic] + fn test_inconsistent_row_sizes() { + matrix!([1, 0, 0], [0, 1], [0, 0, 1]); + } + + #[test] + #[should_panic] + fn test_incompatible_multiply() { + let m1 = matrix!([0, 1], [0, 1], [0, 1]); + let m2 = matrix!([0, 1, 2]); + + m1.multiply(&m2); + } + + #[test] + #[should_panic] + fn test_incompatible_augment() { + let m1 = matrix!([0, 1]); + let m2 = matrix!([0, 1], [2, 3]); + + m1.augment(&m2); + } + + #[test] + fn test_matrix_identity() { + let m1 = Matrix::identity(3); + let m2 = matrix!([1, 0, 0], [0, 1, 0], [0, 0, 1]); + assert_eq!(m1, m2); + } + + #[test] + fn test_matrix_multiply() { + let m1 = matrix!([1, 2], [3, 4]); + let m2 = matrix!([5, 6], [7, 8]); + let actual = m1.multiply(&m2); + let expect = matrix!([11, 22], [19, 42]); + assert_eq!(actual, expect); + } + + #[test] + fn test_matrix_inverse_pass_cases() { + { + // Test case validating inverse of the input Matrix. + let m = matrix!([56, 23, 98], [3, 100, 200], [45, 201, 123]) + .invert() + .unwrap(); + let expect = matrix!([175, 133, 33], [130, 13, 245], [112, 35, 126]); + assert_eq!(m, expect); + } + { + // Test case validating inverse of the input Matrix. + let m = matrix!( + [1, 0, 0, 0, 0], + [0, 1, 0, 0, 0], + [0, 0, 0, 1, 0], + [0, 0, 0, 0, 1], + [7, 7, 6, 6, 1] + ) + .invert() + .unwrap(); + let expect = matrix!( + [1, 0, 0, 0, 0], + [0, 1, 0, 0, 0], + [123, 123, 1, 122, 122], + [0, 0, 1, 0, 0], + [0, 0, 0, 1, 0] + ); + assert_eq!(m, expect); + } + } + + #[test] + #[should_panic] + fn test_matrix_inverse_non_square() { + // Test case with a non-square matrix. + matrix!([56, 23], [3, 100], [45, 201]).invert().unwrap(); + } + + #[test] + #[should_panic] + fn test_matrix_inverse_singular() { + matrix!([4, 2], [12, 6]).invert().unwrap(); + } +} diff --git a/seaweed-volume/vendor/reed-solomon-erasure/src/tests/galois_16.rs b/seaweed-volume/vendor/reed-solomon-erasure/src/tests/galois_16.rs new file mode 100644 index 000000000..872472e85 --- /dev/null +++ b/seaweed-volume/vendor/reed-solomon-erasure/src/tests/galois_16.rs @@ -0,0 +1,489 @@ +extern crate alloc; + +use alloc::vec; +use alloc::vec::Vec; + +use super::{fill_random, option_shards_into_shards, shards_into_option_shards}; +use crate::galois_16::ReedSolomon; + +macro_rules! make_random_shards { + ($per_shard:expr, $size:expr) => {{ + let mut shards = Vec::with_capacity(20); + for _ in 0..$size { + shards.push(vec![[0; 2]; $per_shard]); + } + + for s in shards.iter_mut() { + fill_random(s); + } + + shards + }}; +} + +#[test] +fn correct_field_order_restriction() { + const ORDER: usize = 1 << 16; + + assert!(ReedSolomon::new(ORDER, 1).is_err()); + assert!(ReedSolomon::new(1, ORDER).is_err()); + + // way too slow, because it needs to build a 65536*65536 vandermonde matrix + // assert!(ReedSolomon::new(ORDER - 1, 1).is_ok()); + assert!(ReedSolomon::new(1, ORDER - 1).is_ok()); +} + +quickcheck! { + fn qc_encode_verify_reconstruct_verify(data: usize, + parity: usize, + corrupt: usize, + size: usize) -> bool { + let data = 1 + data % 255; + let mut parity = 1 + parity % 255; + if data + parity > 256 { + parity -= data + parity - 256; + } + + let corrupt = corrupt % (parity + 1); + + let mut corrupt_pos_s = Vec::with_capacity(corrupt); + for _ in 0..corrupt { + let mut pos = rand::random::() % (data + parity); + + while let Some(_) = corrupt_pos_s.iter().find(|&&x| x == pos) { + pos = rand::random::() % (data + parity); + } + + corrupt_pos_s.push(pos); + } + + let size = 1 + size % 1_000_000; + + let r = ReedSolomon::new(data, parity).unwrap(); + + let mut expect = make_random_shards!(size, data + parity); + { + let mut refs = + convert_2D_slices!(expect =>to_mut_vec &mut [[u8; 2]]); + + r.encode(&mut refs).unwrap(); + } + + let expect = expect; + + let mut shards = expect.clone(); + + // corrupt shards + for &p in corrupt_pos_s.iter() { + fill_random(&mut shards[p]); + } + let mut slice_present = vec![true; data + parity]; + for &p in corrupt_pos_s.iter() { + slice_present[p] = false; + } + + // reconstruct + { + let mut refs: Vec<_> = shards.iter_mut() + .map(|i| &mut i[..]) + .zip(slice_present.iter().cloned()) + .collect(); + + r.reconstruct(&mut refs[..]).unwrap(); + } + + ({ + let refs = + convert_2D_slices!(expect =>to_vec &[[u8; 2]]); + + r.verify(&refs).unwrap() + }) + && + expect == shards + && + ({ + let refs = + convert_2D_slices!(shards =>to_vec &[[u8; 2]]); + + r.verify(&refs).unwrap() + }) + } + + fn qc_encode_verify_reconstruct_verify_shards(data: usize, + parity: usize, + corrupt: usize, + size: usize) -> bool { + let data = 1 + data % 255; + let mut parity = 1 + parity % 255; + if data + parity > 256 { + parity -= data + parity - 256; + } + + let corrupt = corrupt % (parity + 1); + + let mut corrupt_pos_s = Vec::with_capacity(corrupt); + for _ in 0..corrupt { + let mut pos = rand::random::() % (data + parity); + + while let Some(_) = corrupt_pos_s.iter().find(|&&x| x == pos) { + pos = rand::random::() % (data + parity); + } + + corrupt_pos_s.push(pos); + } + + let size = 1 + size % 1_000_000; + + let r = ReedSolomon::new(data, parity).unwrap(); + + let mut expect = make_random_shards!(size, data + parity); + r.encode(&mut expect).unwrap(); + + let expect = expect; + + let mut shards = shards_into_option_shards(expect.clone()); + + // corrupt shards + for &p in corrupt_pos_s.iter() { + shards[p] = None; + } + + // reconstruct + r.reconstruct(&mut shards).unwrap(); + + let shards = option_shards_into_shards(shards); + + r.verify(&expect).unwrap() + && expect == shards + && r.verify(&shards).unwrap() + } + + fn qc_verify(data: usize, + parity: usize, + corrupt: usize, + size: usize) -> bool { + let data = 1 + data % 255; + let mut parity = 1 + parity % 255; + if data + parity > 256 { + parity -= data + parity - 256; + } + + let corrupt = corrupt % (parity + 1); + + let mut corrupt_pos_s = Vec::with_capacity(corrupt); + for _ in 0..corrupt { + let mut pos = rand::random::() % (data + parity); + + while let Some(_) = corrupt_pos_s.iter().find(|&&x| x == pos) { + pos = rand::random::() % (data + parity); + } + + corrupt_pos_s.push(pos); + } + + let size = 1 + size % 1_000_000; + + let r = ReedSolomon::new(data, parity).unwrap(); + + let mut expect = make_random_shards!(size, data + parity); + { + let mut refs = + convert_2D_slices!(expect =>to_mut_vec &mut [[u8; 2]]); + + r.encode(&mut refs).unwrap(); + } + + let expect = expect; + + let mut shards = expect.clone(); + + // corrupt shards + for &p in corrupt_pos_s.iter() { + fill_random(&mut shards[p]); + } + + ({ + let refs = + convert_2D_slices!(expect =>to_vec &[[u8; 2]]); + + r.verify(&refs).unwrap() + }) + && + ((corrupt > 0 && expect != shards) + || (corrupt == 0 && expect == shards)) + && + ({ + let refs = + convert_2D_slices!(shards =>to_vec &[[u8; 2]]); + + (corrupt > 0 && !r.verify(&refs).unwrap()) + || (corrupt == 0 && r.verify(&refs).unwrap()) + }) + } + + fn qc_verify_shards(data: usize, + parity: usize, + corrupt: usize, + size: usize) -> bool { + let data = 1 + data % 255; + let mut parity = 1 + parity % 255; + if data + parity > 256 { + parity -= data + parity - 256; + } + + let corrupt = corrupt % (parity + 1); + + let mut corrupt_pos_s = Vec::with_capacity(corrupt); + for _ in 0..corrupt { + let mut pos = rand::random::() % (data + parity); + + while let Some(_) = corrupt_pos_s.iter().find(|&&x| x == pos) { + pos = rand::random::() % (data + parity); + } + + corrupt_pos_s.push(pos); + } + + let size = 1 + size % 1_000_000; + + let r = ReedSolomon::new(data, parity).unwrap(); + + let mut expect = make_random_shards!(size, data + parity); + r.encode(&mut expect).unwrap(); + + let expect = expect; + + let mut shards = expect.clone(); + + // corrupt shards + for &p in corrupt_pos_s.iter() { + fill_random(&mut shards[p]); + } + + r.verify(&expect).unwrap() + && + ((corrupt > 0 && expect != shards) + || (corrupt == 0 && expect == shards)) + && + ((corrupt > 0 && !r.verify(&shards).unwrap()) + || (corrupt == 0 && r.verify(&shards).unwrap())) + } + + fn qc_encode_sep_same_as_encode(data: usize, + parity: usize, + size: usize) -> bool { + let data = 1 + data % 255; + let mut parity = 1 + parity % 255; + if data + parity > 256 { + parity -= data + parity - 256; + } + + let size = 1 + size % 1_000_000; + + let r = ReedSolomon::new(data, parity).unwrap(); + + let mut expect = make_random_shards!(size, data + parity); + let mut shards = expect.clone(); + + { + let mut refs = + convert_2D_slices!(expect =>to_mut_vec &mut [[u8; 2]]); + + r.encode(&mut refs).unwrap(); + } + + let expect = expect; + + { + let (data, parity) = shards.split_at_mut(data); + + let data_refs = + convert_2D_slices!(data =>to_mut_vec &[[u8; 2]]); + + let mut parity_refs = + convert_2D_slices!(parity =>to_mut_vec &mut [[u8; 2]]); + + r.encode_sep(&data_refs, &mut parity_refs).unwrap(); + } + + let shards = shards; + + expect == shards + } + + fn qc_encode_sep_same_as_encode_shards(data: usize, + parity: usize, + size: usize) -> bool { + let data = 1 + data % 255; + let mut parity = 1 + parity % 255; + if data + parity > 256 { + parity -= data + parity - 256; + } + + let size = 1 + size % 1_000_000; + + let r = ReedSolomon::new(data, parity).unwrap(); + + let mut expect = make_random_shards!(size, data + parity); + let mut shards = expect.clone(); + + r.encode(&mut expect).unwrap(); + + let expect = expect; + + { + let (data, parity) = shards.split_at_mut(data); + + r.encode_sep(data, parity).unwrap(); + } + + let shards = shards; + + expect == shards + } + + fn qc_encode_single_same_as_encode(data: usize, + parity: usize, + size: usize) -> bool { + let data = 1 + data % 255; + let mut parity = 1 + parity % 255; + if data + parity > 256 { + parity -= data + parity - 256; + } + + let size = 1 + size % 1_000_000; + + let r = ReedSolomon::new(data, parity).unwrap(); + + let mut expect = make_random_shards!(size, data + parity); + let mut shards = expect.clone(); + + { + let mut refs = + convert_2D_slices!(expect =>to_mut_vec &mut [[u8; 2]]); + + r.encode(&mut refs).unwrap(); + } + + let expect = expect; + + { + let mut refs = + convert_2D_slices!(shards =>to_mut_vec &mut [[u8; 2]]); + + for i in 0..data { + r.encode_single(i, &mut refs).unwrap(); + } + } + + let shards = shards; + + expect == shards + } + + fn qc_encode_single_same_as_encode_shards(data: usize, + parity: usize, + size: usize) -> bool { + let data = 1 + data % 255; + let mut parity = 1 + parity % 255; + if data + parity > 256 { + parity -= data + parity - 256; + } + + let size = 1 + size % 1_000_000; + + let r = ReedSolomon::new(data, parity).unwrap(); + + let mut expect = make_random_shards!(size, data + parity); + let mut shards = expect.clone(); + + r.encode(&mut expect).unwrap(); + + let expect = expect; + + for i in 0..data { + r.encode_single(i, &mut shards).unwrap(); + } + + let shards = shards; + + expect == shards + } + + fn qc_encode_single_sep_same_as_encode(data: usize, + parity: usize, + size: usize) -> bool { + let data = 1 + data % 255; + let mut parity = 1 + parity % 255; + if data + parity > 256 { + parity -= data + parity - 256; + } + + let size = 1 + size % 1_000_000; + + let r = ReedSolomon::new(data, parity).unwrap(); + + let mut expect = make_random_shards!(size, data + parity); + let mut shards = expect.clone(); + + { + let mut refs = + convert_2D_slices!(expect =>to_mut_vec &mut [[u8; 2]]); + + r.encode(&mut refs).unwrap(); + } + + let expect = expect; + + { + let (data_shards, parity_shards) = shards.split_at_mut(data); + + let data_refs = + convert_2D_slices!(data_shards =>to_mut_vec &[[u8; 2]]); + + let mut parity_refs = + convert_2D_slices!(parity_shards =>to_mut_vec &mut [[u8; 2]]); + + for i in 0..data { + r.encode_single_sep(i, data_refs[i], &mut parity_refs).unwrap(); + } + } + + let shards = shards; + + expect == shards + } + + fn qc_encode_single_sep_same_as_encode_shards(data: usize, + parity: usize, + size: usize) -> bool { + let data = 1 + data % 255; + let mut parity = 1 + parity % 255; + if data + parity > 256 { + parity -= data + parity - 256; + } + + let size = 1 + size % 1_000_000; + + let r = ReedSolomon::new(data, parity).unwrap(); + + let mut expect = make_random_shards!(size, data + parity); + let mut shards = expect.clone(); + + r.encode(&mut expect).unwrap(); + + let expect = expect; + + { + let (data_shards, parity_shards) = shards.split_at_mut(data); + + for i in 0..data { + r.encode_single_sep(i, &data_shards[i], parity_shards).unwrap(); + } + } + + let shards = shards; + + expect == shards + } +} diff --git a/seaweed-volume/vendor/reed-solomon-erasure/src/tests/mod.rs b/seaweed-volume/vendor/reed-solomon-erasure/src/tests/mod.rs new file mode 100644 index 000000000..488443b25 --- /dev/null +++ b/seaweed-volume/vendor/reed-solomon-erasure/src/tests/mod.rs @@ -0,0 +1,2619 @@ +#![allow(dead_code)] + +extern crate alloc; + +use alloc::vec; +use alloc::vec::Vec; + +use super::{galois_8, Error, SBSError}; +use rand::{self, thread_rng, Rng}; + +mod galois_16; + +type ReedSolomon = crate::ReedSolomon; +type ShardByShard<'a> = crate::ShardByShard<'a, galois_8::Field>; + +macro_rules! make_random_shards { + ($per_shard:expr, $size:expr) => {{ + let mut shards = Vec::with_capacity(20); + for _ in 0..$size { + shards.push(vec![0; $per_shard]); + } + + for s in shards.iter_mut() { + fill_random(s); + } + + shards + }}; +} + +fn assert_eq_shards(s1: &[T], s2: &[U]) +where + T: AsRef<[u8]>, + U: AsRef<[u8]>, +{ + assert_eq!(s1.len(), s2.len()); + for i in 0..s1.len() { + assert_eq!(s1[i].as_ref(), s2[i].as_ref()); + } +} + +pub fn fill_random(arr: &mut [T]) +where + rand::distributions::Standard: rand::distributions::Distribution, +{ + for a in arr.iter_mut() { + *a = rand::random::(); + } +} + +fn shards_to_option_shards(shards: &[Vec]) -> Vec>> { + let mut result = Vec::with_capacity(shards.len()); + + for v in shards.iter() { + let inner: Vec = v.clone(); + result.push(Some(inner)); + } + result +} + +fn shards_into_option_shards(shards: Vec>) -> Vec>> { + let mut result = Vec::with_capacity(shards.len()); + + for v in shards { + result.push(Some(v)); + } + result +} + +fn option_shards_to_shards(shards: &[Option>]) -> Vec> { + let mut result = Vec::with_capacity(shards.len()); + + for i in 0..shards.len() { + let shard = match shards[i] { + Some(ref x) => x, + None => panic!("Missing shard, index : {}", i), + }; + let inner: Vec = shard.clone(); + result.push(inner); + } + result +} + +fn option_shards_into_shards(shards: Vec>>) -> Vec> { + let mut result = Vec::with_capacity(shards.len()); + + for shard in shards { + let shard = match shard { + Some(x) => x, + None => panic!("Missing shard"), + }; + result.push(shard); + } + result +} + +#[test] +fn test_no_data_shards() { + assert_eq!(Error::TooFewDataShards, ReedSolomon::new(0, 1).unwrap_err()); +} + +#[test] +fn test_no_parity_shards() { + assert_eq!( + Error::TooFewParityShards, + ReedSolomon::new(1, 0).unwrap_err() + ); +} + +#[test] +fn test_too_many_shards() { + assert_eq!( + Error::TooManyShards, + ReedSolomon::new(129, 128).unwrap_err() + ); +} + +#[test] +fn test_shard_count() { + let mut rng = thread_rng(); + for _ in 0..10 { + let data_shard_count = rng.gen_range(1, 128); + let parity_shard_count = rng.gen_range(1, 128); + + let total_shard_count = data_shard_count + parity_shard_count; + + let r = ReedSolomon::new(data_shard_count, parity_shard_count).unwrap(); + + assert_eq!(data_shard_count, r.data_shard_count()); + assert_eq!(parity_shard_count, r.parity_shard_count()); + assert_eq!(total_shard_count, r.total_shard_count()); + } +} + +#[test] +fn test_reed_solomon_clone() { + let r1 = ReedSolomon::new(10, 3).unwrap(); + let r2 = r1.clone(); + + assert_eq!(r1, r2); +} + +#[test] +fn test_encoding() { + let per_shard = 50_000; + + let r = ReedSolomon::new(10, 3).unwrap(); + + let mut shards = make_random_shards!(per_shard, 13); + + r.encode(&mut shards).unwrap(); + assert!(r.verify(&shards).unwrap()); + + assert_eq!( + Error::TooFewShards, + r.encode(&mut shards[0..1]).unwrap_err() + ); + + let mut bad_shards = make_random_shards!(per_shard, 13); + bad_shards[0] = vec![0 as u8]; + assert_eq!( + Error::IncorrectShardSize, + r.encode(&mut bad_shards).unwrap_err() + ); +} + +#[test] +fn test_reconstruct_shards() { + let per_shard = 100_000; + + let r = ReedSolomon::new(8, 5).unwrap(); + + let mut shards = make_random_shards!(per_shard, 13); + + r.encode(&mut shards).unwrap(); + + let master_copy = shards.clone(); + + let mut shards = shards_to_option_shards(&shards); + + // Try to decode with all shards present + r.reconstruct(&mut shards).unwrap(); + { + let shards = option_shards_to_shards(&shards); + assert!(r.verify(&shards).unwrap()); + assert_eq!(&shards, &master_copy); + } + + // Try to decode with 10 shards + shards[0] = None; + shards[2] = None; + //shards[4] = None; + r.reconstruct(&mut shards).unwrap(); + { + let shards = option_shards_to_shards(&shards); + assert!(r.verify(&shards).unwrap()); + assert_eq!(&shards, &master_copy); + } + + // Try to decode the same shards again to try to + // trigger the usage of cached decode matrix + shards[0] = None; + shards[2] = None; + //shards[4] = None; + r.reconstruct(&mut shards).unwrap(); + { + let shards = option_shards_to_shards(&shards); + assert!(r.verify(&shards).unwrap()); + assert_eq!(&shards, &master_copy); + } + + // Try to deocde with 6 data and 4 parity shards + shards[0] = None; + shards[2] = None; + shards[12] = None; + r.reconstruct(&mut shards).unwrap(); + { + let shards = option_shards_to_shards(&shards); + assert!(r.verify(&shards).unwrap()); + assert_eq!(&shards, &master_copy); + } + + // Try to reconstruct data only + shards[0] = None; + shards[1] = None; + shards[12] = None; + r.reconstruct_data(&mut shards).unwrap(); + { + let data_shards = option_shards_to_shards(&shards[0..8]); + assert_eq!(master_copy[0], data_shards[0]); + assert_eq!(master_copy[1], data_shards[1]); + assert_eq!(None, shards[12]); + } + + // Try to decode with 7 data and 1 parity shards + shards[0] = None; + shards[1] = None; + shards[9] = None; + shards[10] = None; + shards[11] = None; + shards[12] = None; + assert_eq!( + r.reconstruct(&mut shards).unwrap_err(), + Error::TooFewShardsPresent + ); +} + +#[test] +fn test_reconstruct() { + let r = ReedSolomon::new(2, 2).unwrap(); + + let mut shards: [[u8; 3]; 4] = [[0, 1, 2], [3, 4, 5], [200, 201, 203], [100, 101, 102]]; + + { + { + let mut shard_refs: Vec<&mut [u8]> = Vec::with_capacity(3); + + for shard in shards.iter_mut() { + shard_refs.push(shard); + } + + r.encode(&mut shard_refs).unwrap(); + } + + let shard_refs: Vec<_> = shards.iter().map(|i| &i[..]).collect(); + assert!(r.verify(&shard_refs).unwrap()); + } + + { + { + let mut shard_refs = convert_2D_slices!(shards =>to_mut_vec &mut [u8]); + + shard_refs[0][0] = 101; + shard_refs[0][1] = 102; + shard_refs[0][2] = 103; + + let shards_present = [false, true, true, true]; + + let mut shards = shard_refs + .into_iter() + .zip(shards_present.iter().cloned()) + .collect::>(); + + r.reconstruct(&mut shards[..]).unwrap(); + } + + let shard_refs: Vec<_> = shards.iter().map(|i| &i[..]).collect(); + assert!(r.verify(&shard_refs).unwrap()); + } + + let expect: [[u8; 3]; 4] = [[0, 1, 2], [3, 4, 5], [6, 11, 12], [5, 14, 11]]; + assert_eq!(expect, shards); + + { + { + let mut shard_refs = convert_2D_slices!(shards =>to_mut_vec &mut [u8]); + + shard_refs[0][0] = 201; + shard_refs[0][1] = 202; + shard_refs[0][2] = 203; + + shard_refs[2][0] = 101; + shard_refs[2][1] = 102; + shard_refs[2][2] = 103; + + let shards_present = [false, true, false, true]; + + let mut shards = shard_refs + .into_iter() + .zip(shards_present.iter().cloned()) + .collect::>(); + + r.reconstruct_data(&mut shards[..]).unwrap(); + } + + let shard_refs = convert_2D_slices!(shards =>to_vec &[u8]); + + assert!(!r.verify(&shard_refs).unwrap()); + } + + let expect: [[u8; 3]; 4] = [[0, 1, 2], [3, 4, 5], [101, 102, 103], [5, 14, 11]]; + assert_eq!(expect, shards); + + { + { + let mut shard_refs = convert_2D_slices!(shards =>to_mut_vec &mut [u8]); + + shard_refs[2][0] = 101; + shard_refs[2][1] = 102; + shard_refs[2][2] = 103; + + shard_refs[3][0] = 201; + shard_refs[3][1] = 202; + shard_refs[3][2] = 203; + + let shards_present = [true, true, false, false]; + + let mut shards = shard_refs + .into_iter() + .zip(shards_present.iter().cloned()) + .collect::>(); + + r.reconstruct_data(&mut shards[..]).unwrap(); + } + + let shard_refs = convert_2D_slices!(shards =>to_vec &[u8]); + + assert!(!r.verify(&shard_refs).unwrap()); + } + + let expect: [[u8; 3]; 4] = [[0, 1, 2], [3, 4, 5], [101, 102, 103], [201, 202, 203]]; + assert_eq!(expect, shards); +} + +quickcheck! { + fn qc_encode_verify_reconstruct_verify(data: usize, + parity: usize, + corrupt: usize, + size: usize) -> bool { + let data = 1 + data % 255; + let mut parity = 1 + parity % 255; + if data + parity > 256 { + parity -= data + parity - 256; + } + + let corrupt = corrupt % (parity + 1); + + let mut corrupt_pos_s = Vec::with_capacity(corrupt); + for _ in 0..corrupt { + let mut pos = rand::random::() % (data + parity); + + while let Some(_) = corrupt_pos_s.iter().find(|&&x| x == pos) { + pos = rand::random::() % (data + parity); + } + + corrupt_pos_s.push(pos); + } + + let size = 1 + size % 1_000_000; + + let r = ReedSolomon::new(data, parity).unwrap(); + + let mut expect = make_random_shards!(size, data + parity); + { + let mut refs = + convert_2D_slices!(expect =>to_mut_vec &mut [u8]); + + r.encode(&mut refs).unwrap(); + } + + let expect = expect; + + let mut shards = expect.clone(); + + // corrupt shards + for &p in corrupt_pos_s.iter() { + fill_random(&mut shards[p]); + } + let mut slice_present = vec![true; data + parity]; + for &p in corrupt_pos_s.iter() { + slice_present[p] = false; + } + + // reconstruct + { + let mut refs: Vec<_> = shards.iter_mut() + .map(|i| &mut i[..]) + .zip(slice_present.iter().cloned()) + .collect(); + + r.reconstruct(&mut refs[..]).unwrap(); + } + + ({ + let refs = + convert_2D_slices!(expect =>to_vec &[u8]); + + r.verify(&refs).unwrap() + }) + && + expect == shards + && + ({ + let refs = + convert_2D_slices!(shards =>to_vec &[u8]); + + r.verify(&refs).unwrap() + }) + } + + fn qc_encode_verify_reconstruct_verify_shards(data: usize, + parity: usize, + corrupt: usize, + size: usize) -> bool { + let data = 1 + data % 255; + let mut parity = 1 + parity % 255; + if data + parity > 256 { + parity -= data + parity - 256; + } + + let corrupt = corrupt % (parity + 1); + + let mut corrupt_pos_s = Vec::with_capacity(corrupt); + for _ in 0..corrupt { + let mut pos = rand::random::() % (data + parity); + + while let Some(_) = corrupt_pos_s.iter().find(|&&x| x == pos) { + pos = rand::random::() % (data + parity); + } + + corrupt_pos_s.push(pos); + } + + let size = 1 + size % 1_000_000; + + let r = ReedSolomon::new(data, parity).unwrap(); + + let mut expect = make_random_shards!(size, data + parity); + r.encode(&mut expect).unwrap(); + + let expect = expect; + + let mut shards = shards_into_option_shards(expect.clone()); + + // corrupt shards + for &p in corrupt_pos_s.iter() { + shards[p] = None; + } + + // reconstruct + r.reconstruct(&mut shards).unwrap(); + + let shards = option_shards_into_shards(shards); + + r.verify(&expect).unwrap() + && expect == shards + && r.verify(&shards).unwrap() + } + + fn qc_verify(data: usize, + parity: usize, + corrupt: usize, + size: usize) -> bool { + let data = 1 + data % 255; + let mut parity = 1 + parity % 255; + if data + parity > 256 { + parity -= data + parity - 256; + } + + let corrupt = corrupt % (parity + 1); + + let mut corrupt_pos_s = Vec::with_capacity(corrupt); + for _ in 0..corrupt { + let mut pos = rand::random::() % (data + parity); + + while let Some(_) = corrupt_pos_s.iter().find(|&&x| x == pos) { + pos = rand::random::() % (data + parity); + } + + corrupt_pos_s.push(pos); + } + + let size = 1 + size % 1_000_000; + + let r = ReedSolomon::new(data, parity).unwrap(); + + let mut expect = make_random_shards!(size, data + parity); + { + let mut refs = + convert_2D_slices!(expect =>to_mut_vec &mut [u8]); + + r.encode(&mut refs).unwrap(); + } + + let expect = expect; + + let mut shards = expect.clone(); + + // corrupt shards + for &p in corrupt_pos_s.iter() { + fill_random(&mut shards[p]); + } + + ({ + let refs = + convert_2D_slices!(expect =>to_vec &[u8]); + + r.verify(&refs).unwrap() + }) + && + ((corrupt > 0 && expect != shards) + || (corrupt == 0 && expect == shards)) + && + ({ + let refs = + convert_2D_slices!(shards =>to_vec &[u8]); + + (corrupt > 0 && !r.verify(&refs).unwrap()) + || (corrupt == 0 && r.verify(&refs).unwrap()) + }) + } + + fn qc_verify_shards(data: usize, + parity: usize, + corrupt: usize, + size: usize) -> bool { + let data = 1 + data % 255; + let mut parity = 1 + parity % 255; + if data + parity > 256 { + parity -= data + parity - 256; + } + + let corrupt = corrupt % (parity + 1); + + let mut corrupt_pos_s = Vec::with_capacity(corrupt); + for _ in 0..corrupt { + let mut pos = rand::random::() % (data + parity); + + while let Some(_) = corrupt_pos_s.iter().find(|&&x| x == pos) { + pos = rand::random::() % (data + parity); + } + + corrupt_pos_s.push(pos); + } + + let size = 1 + size % 1_000_000; + + let r = ReedSolomon::new(data, parity).unwrap(); + + let mut expect = make_random_shards!(size, data + parity); + r.encode(&mut expect).unwrap(); + + let expect = expect; + + let mut shards = expect.clone(); + + // corrupt shards + for &p in corrupt_pos_s.iter() { + fill_random(&mut shards[p]); + } + + r.verify(&expect).unwrap() + && + ((corrupt > 0 && expect != shards) + || (corrupt == 0 && expect == shards)) + && + ((corrupt > 0 && !r.verify(&shards).unwrap()) + || (corrupt == 0 && r.verify(&shards).unwrap())) + } + + fn qc_encode_sep_same_as_encode(data: usize, + parity: usize, + size: usize) -> bool { + let data = 1 + data % 255; + let mut parity = 1 + parity % 255; + if data + parity > 256 { + parity -= data + parity - 256; + } + + let size = 1 + size % 1_000_000; + + let r = ReedSolomon::new(data, parity).unwrap(); + + let mut expect = make_random_shards!(size, data + parity); + let mut shards = expect.clone(); + + { + let mut refs = + convert_2D_slices!(expect =>to_mut_vec &mut [u8]); + + r.encode(&mut refs).unwrap(); + } + + let expect = expect; + + { + let (data, parity) = shards.split_at_mut(data); + + let data_refs = + convert_2D_slices!(data =>to_mut_vec &[u8]); + + let mut parity_refs = + convert_2D_slices!(parity =>to_mut_vec &mut [u8]); + + r.encode_sep(&data_refs, &mut parity_refs).unwrap(); + } + + let shards = shards; + + expect == shards + } + + fn qc_encode_sep_same_as_encode_shards(data: usize, + parity: usize, + size: usize) -> bool { + let data = 1 + data % 255; + let mut parity = 1 + parity % 255; + if data + parity > 256 { + parity -= data + parity - 256; + } + + let size = 1 + size % 1_000_000; + + let r = ReedSolomon::new(data, parity).unwrap(); + + let mut expect = make_random_shards!(size, data + parity); + let mut shards = expect.clone(); + + r.encode(&mut expect).unwrap(); + + let expect = expect; + + { + let (data, parity) = shards.split_at_mut(data); + + r.encode_sep(data, parity).unwrap(); + } + + let shards = shards; + + expect == shards + } + + fn qc_encode_single_same_as_encode(data: usize, + parity: usize, + size: usize) -> bool { + let data = 1 + data % 255; + let mut parity = 1 + parity % 255; + if data + parity > 256 { + parity -= data + parity - 256; + } + + let size = 1 + size % 1_000_000; + + let r = ReedSolomon::new(data, parity).unwrap(); + + let mut expect = make_random_shards!(size, data + parity); + let mut shards = expect.clone(); + + { + let mut refs = + convert_2D_slices!(expect =>to_mut_vec &mut [u8]); + + r.encode(&mut refs).unwrap(); + } + + let expect = expect; + + { + let mut refs = + convert_2D_slices!(shards =>to_mut_vec &mut [u8]); + + for i in 0..data { + r.encode_single(i, &mut refs).unwrap(); + } + } + + let shards = shards; + + expect == shards + } + + fn qc_encode_single_same_as_encode_shards(data: usize, + parity: usize, + size: usize) -> bool { + let data = 1 + data % 255; + let mut parity = 1 + parity % 255; + if data + parity > 256 { + parity -= data + parity - 256; + } + + let size = 1 + size % 1_000_000; + + let r = ReedSolomon::new(data, parity).unwrap(); + + let mut expect = make_random_shards!(size, data + parity); + let mut shards = expect.clone(); + + r.encode(&mut expect).unwrap(); + + let expect = expect; + + for i in 0..data { + r.encode_single(i, &mut shards).unwrap(); + } + + let shards = shards; + + expect == shards + } + + fn qc_encode_single_sep_same_as_encode(data: usize, + parity: usize, + size: usize) -> bool { + let data = 1 + data % 255; + let mut parity = 1 + parity % 255; + if data + parity > 256 { + parity -= data + parity - 256; + } + + let size = 1 + size % 1_000_000; + + let r = ReedSolomon::new(data, parity).unwrap(); + + let mut expect = make_random_shards!(size, data + parity); + let mut shards = expect.clone(); + + { + let mut refs = + convert_2D_slices!(expect =>to_mut_vec &mut [u8]); + + r.encode(&mut refs).unwrap(); + } + + let expect = expect; + + { + let (data_shards, parity_shards) = shards.split_at_mut(data); + + let data_refs = + convert_2D_slices!(data_shards =>to_mut_vec &[u8]); + + let mut parity_refs = + convert_2D_slices!(parity_shards =>to_mut_vec &mut [u8]); + + for i in 0..data { + r.encode_single_sep(i, data_refs[i], &mut parity_refs).unwrap(); + } + } + + let shards = shards; + + expect == shards + } + + fn qc_encode_single_sep_same_as_encode_shards(data: usize, + parity: usize, + size: usize) -> bool { + let data = 1 + data % 255; + let mut parity = 1 + parity % 255; + if data + parity > 256 { + parity -= data + parity - 256; + } + + let size = 1 + size % 1_000_000; + + let r = ReedSolomon::new(data, parity).unwrap(); + + let mut expect = make_random_shards!(size, data + parity); + let mut shards = expect.clone(); + + r.encode(&mut expect).unwrap(); + + let expect = expect; + + { + let (data_shards, parity_shards) = shards.split_at_mut(data); + + for i in 0..data { + r.encode_single_sep(i, &data_shards[i], parity_shards).unwrap(); + } + } + + let shards = shards; + + expect == shards + } +} + +#[test] +fn test_reconstruct_error_handling() { + let r = ReedSolomon::new(2, 2).unwrap(); + + let mut shards: [[u8; 3]; 4] = [[0, 1, 2], [3, 4, 5], [200, 201, 203], [100, 101, 102]]; + + { + let mut shard_refs: Vec<&mut [u8]> = Vec::with_capacity(3); + + for shard in shards.iter_mut() { + shard_refs.push(shard); + } + + r.encode(&mut shard_refs).unwrap(); + } + + { + let mut shard_refs = convert_2D_slices!(shards =>to_mut_vec &mut [u8]); + + shard_refs[0][0] = 101; + shard_refs[0][1] = 102; + shard_refs[0][2] = 103; + + let shards_present = [true, false, false, false]; + + let mut shard_refs: Vec<_> = shard_refs + .into_iter() + .zip(shards_present.iter().cloned()) + .collect(); + + assert_eq!( + Error::TooFewShardsPresent, + r.reconstruct(&mut shard_refs[..]).unwrap_err() + ); + + shard_refs[3].1 = true; + r.reconstruct(&mut shard_refs).unwrap(); + } +} + +#[test] +fn test_one_encode() { + let r = ReedSolomon::new(5, 5).unwrap(); + + let mut shards = shards!( + [0, 1], + [4, 5], + [2, 3], + [6, 7], + [8, 9], + [0, 0], + [0, 0], + [0, 0], + [0, 0], + [0, 0] + ); + + r.encode(&mut shards).unwrap(); + { + assert_eq!(shards[5][0], 12); + assert_eq!(shards[5][1], 13); + } + { + assert_eq!(shards[6][0], 10); + assert_eq!(shards[6][1], 11); + } + { + assert_eq!(shards[7][0], 14); + assert_eq!(shards[7][1], 15); + } + { + assert_eq!(shards[8][0], 90); + assert_eq!(shards[8][1], 91); + } + { + assert_eq!(shards[9][0], 94); + assert_eq!(shards[9][1], 95); + } + + assert!(r.verify(&shards).unwrap()); + + shards[8][0] += 1; + assert!(!r.verify(&shards).unwrap()); +} + +#[test] +fn test_verify_too_few_shards() { + let r = ReedSolomon::new(3, 2).unwrap(); + + let shards = make_random_shards!(10, 4); + + assert_eq!(Error::TooFewShards, r.verify(&shards).unwrap_err()); +} + +#[test] +fn test_verify_shards_with_buffer_incorrect_buffer_sizes() { + let r = ReedSolomon::new(3, 2).unwrap(); + + { + // Test too few slices in buffer + let shards = make_random_shards!(100, 5); + + let mut buffer = vec![vec![0; 100]; 1]; + + assert_eq!( + Error::TooFewBufferShards, + r.verify_with_buffer(&shards, &mut buffer).unwrap_err() + ); + } + { + // Test too many slices in buffer + let shards = make_random_shards!(100, 5); + + let mut buffer = vec![vec![0; 100]; 3]; + + assert_eq!( + Error::TooManyBufferShards, + r.verify_with_buffer(&shards, &mut buffer).unwrap_err() + ); + } + { + // Test correct number of slices in buffer + let mut shards = make_random_shards!(100, 5); + + r.encode(&mut shards).unwrap(); + + let mut buffer = vec![vec![0; 100]; 2]; + + assert_eq!(true, r.verify_with_buffer(&shards, &mut buffer).unwrap()); + } + { + // Test having first buffer being empty + let shards = make_random_shards!(100, 5); + + let mut buffer = vec![vec![0; 100]; 2]; + buffer[0] = vec![]; + + assert_eq!( + Error::EmptyShard, + r.verify_with_buffer(&shards, &mut buffer).unwrap_err() + ); + } + { + // Test having shards of inconsistent length in buffer + let shards = make_random_shards!(100, 5); + + let mut buffer = vec![vec![0; 100]; 2]; + buffer[1] = vec![0; 99]; + + assert_eq!( + Error::IncorrectShardSize, + r.verify_with_buffer(&shards, &mut buffer).unwrap_err() + ); + } +} + +#[test] +fn test_verify_shards_with_buffer_gives_correct_parity_shards() { + let r = ReedSolomon::new(10, 3).unwrap(); + + for _ in 0..100 { + let mut shards = make_random_shards!(100, 13); + let shards_copy = shards.clone(); + + r.encode(&mut shards).unwrap(); + + { + let mut buffer = make_random_shards!(100, 3); + + assert!(!r.verify_with_buffer(&shards_copy, &mut buffer).unwrap()); + + assert_eq_shards(&shards[10..], &buffer); + } + { + let mut buffer = make_random_shards!(100, 3); + + assert!(r.verify_with_buffer(&shards, &mut buffer).unwrap()); + + assert_eq_shards(&shards[10..], &buffer); + } + } +} + +#[test] +fn test_verify_with_buffer_gives_correct_parity_shards() { + let r = ReedSolomon::new(10, 3).unwrap(); + + for _ in 0..100 { + let mut slices: [[u8; 100]; 13] = [[0; 100]; 13]; + for slice in slices.iter_mut() { + fill_random(slice); + } + let slices_copy = slices.clone(); + + { + let mut slice_refs = convert_2D_slices!(slices=>to_mut_vec &mut [u8]); + + r.encode(&mut slice_refs).unwrap(); + } + + { + let mut buffer: [[u8; 100]; 3] = [[0; 100]; 3]; + + { + let slice_copy_refs = convert_2D_slices!(slices_copy =>to_vec &[u8]); + + for slice in buffer.iter_mut() { + fill_random(slice); + } + + let mut buffer_refs = convert_2D_slices!(buffer =>to_mut_vec &mut [u8]); + + assert!(!r + .verify_with_buffer(&slice_copy_refs, &mut buffer_refs) + .unwrap()); + } + + for a in 0..3 { + for b in 0..100 { + assert_eq!(slices[10 + a][b], buffer[a][b]); + } + } + } + + { + let mut buffer: [[u8; 100]; 3] = [[0; 100]; 3]; + + { + let slice_refs = convert_2D_slices!(slices=>to_vec &[u8]); + + for slice in buffer.iter_mut() { + fill_random(slice); + } + + let mut buffer_refs = convert_2D_slices!(buffer =>to_mut_vec &mut [u8]); + + assert!(r.verify_with_buffer(&slice_refs, &mut buffer_refs).unwrap()); + } + + for a in 0..3 { + for b in 0..100 { + assert_eq!(slices[10 + a][b], buffer[a][b]); + } + } + } + } +} + +#[test] +fn test_slices_or_shards_count_check() { + let r = ReedSolomon::new(3, 2).unwrap(); + + { + let mut shards = make_random_shards!(10, 4); + + assert_eq!(Error::TooFewShards, r.encode(&mut shards).unwrap_err()); + assert_eq!(Error::TooFewShards, r.verify(&shards).unwrap_err()); + + let mut option_shards = shards_to_option_shards(&shards); + + assert_eq!( + Error::TooFewShards, + r.reconstruct(&mut option_shards).unwrap_err() + ); + } + { + let mut shards = make_random_shards!(10, 6); + + assert_eq!(Error::TooManyShards, r.encode(&mut shards).unwrap_err()); + assert_eq!(Error::TooManyShards, r.verify(&shards).unwrap_err()); + + let mut option_shards = shards_to_option_shards(&shards); + + assert_eq!( + Error::TooManyShards, + r.reconstruct(&mut option_shards).unwrap_err() + ); + } +} + +#[test] +fn test_check_slices_or_shards_size() { + let r = ReedSolomon::new(2, 2).unwrap(); + + { + let mut shards = shards!([0, 0, 0], [0, 1], [1, 2, 3], [0, 0, 0]); + + assert_eq!( + Error::IncorrectShardSize, + r.encode(&mut shards).unwrap_err() + ); + assert_eq!(Error::IncorrectShardSize, r.verify(&shards).unwrap_err()); + + let mut option_shards = shards_to_option_shards(&shards); + + assert_eq!( + Error::IncorrectShardSize, + r.reconstruct(&mut option_shards).unwrap_err() + ); + } + { + let mut shards = shards!([0, 1], [0, 1], [1, 2, 3], [0, 0, 0]); + + assert_eq!( + Error::IncorrectShardSize, + r.encode(&mut shards).unwrap_err() + ); + assert_eq!(Error::IncorrectShardSize, r.verify(&shards).unwrap_err()); + + let mut option_shards = shards_to_option_shards(&shards); + + assert_eq!( + Error::IncorrectShardSize, + r.reconstruct(&mut option_shards).unwrap_err() + ); + } + { + let mut shards = shards!([0, 1], [0, 1, 4], [1, 2, 3], [0, 0, 0]); + + assert_eq!( + Error::IncorrectShardSize, + r.encode(&mut shards).unwrap_err() + ); + assert_eq!(Error::IncorrectShardSize, r.verify(&shards).unwrap_err()); + + let mut option_shards = shards_to_option_shards(&shards); + + assert_eq!( + Error::IncorrectShardSize, + r.reconstruct(&mut option_shards).unwrap_err() + ); + } + { + let mut shards = shards!([], [0, 1, 3], [1, 2, 3], [0, 0, 0]); + + assert_eq!(Error::EmptyShard, r.encode(&mut shards).unwrap_err()); + assert_eq!(Error::EmptyShard, r.verify(&shards).unwrap_err()); + + let mut option_shards = shards_to_option_shards(&shards); + + assert_eq!( + Error::EmptyShard, + r.reconstruct(&mut option_shards).unwrap_err() + ); + } + { + let mut option_shards: Vec>> = vec![None, None, None, None]; + + assert_eq!( + Error::TooFewShardsPresent, + r.reconstruct(&mut option_shards).unwrap_err() + ); + } +} + +#[test] +fn shardbyshard_encode_correctly() { + { + let r = ReedSolomon::new(10, 3).unwrap(); + let mut sbs = ShardByShard::new(&r); + + let mut shards = make_random_shards!(10_000, 13); + let mut shards_copy = shards.clone(); + + r.encode(&mut shards).unwrap(); + + for i in 0..10 { + assert_eq!(i, sbs.cur_input_index()); + + sbs.encode(&mut shards_copy).unwrap(); + } + + assert!(sbs.parity_ready()); + + assert_eq!(shards, shards_copy); + + sbs.reset_force(); + + assert_eq!(0, sbs.cur_input_index()); + } + { + let r = ReedSolomon::new(10, 3).unwrap(); + let mut sbs = ShardByShard::new(&r); + + let mut slices: [[u8; 100]; 13] = [[0; 100]; 13]; + for slice in slices.iter_mut() { + fill_random(slice); + } + let mut slices_copy = slices.clone(); + + { + let mut slice_refs = convert_2D_slices!(slices=>to_mut_vec &mut [u8]); + let mut slice_copy_refs = convert_2D_slices!(slices_copy =>to_mut_vec &mut [u8]); + + r.encode(&mut slice_refs).unwrap(); + + for i in 0..10 { + assert_eq!(i, sbs.cur_input_index()); + + sbs.encode(&mut slice_copy_refs).unwrap(); + } + } + + assert!(sbs.parity_ready()); + + for a in 0..13 { + for b in 0..100 { + assert_eq!(slices[a][b], slices_copy[a][b]); + } + } + + sbs.reset_force(); + + assert_eq!(0, sbs.cur_input_index()); + } +} + +quickcheck! { + fn qc_shardbyshard_encode_same_as_encode(data: usize, + parity: usize, + size: usize, + reuse: usize) -> bool { + let data = 1 + data % 255; + let mut parity = 1 + parity % 255; + if data + parity > 256 { + parity -= data + parity - 256; + } + + let size = 1 + size % 1_000_000; + + let reuse = reuse % 10; + + let r = ReedSolomon::new(data, parity).unwrap(); + let mut sbs = ShardByShard::new(&r); + + let mut expect = make_random_shards!(size, data + parity); + let mut shards = expect.clone(); + + for _ in 0..1 + reuse { + { + let mut refs = + convert_2D_slices!(expect =>to_mut_vec &mut [u8]); + + r.encode(&mut refs).unwrap(); + } + + { + let mut slice_refs = + convert_2D_slices!(shards=>to_mut_vec &mut [u8]); + + for i in 0..data { + assert_eq!(i, sbs.cur_input_index()); + + sbs.encode(&mut slice_refs).unwrap(); + } + } + + if !(expect == shards + && sbs.parity_ready() + && sbs.cur_input_index() == data + && { sbs.reset().unwrap(); !sbs.parity_ready() && sbs.cur_input_index() == 0 }) { + return false; + } + } + + return true; + } + + fn qc_shardbyshard_encode_same_as_encode_shards(data: usize, + parity: usize, + size: usize, + reuse: usize) -> bool { + let data = 1 + data % 255; + let mut parity = 1 + parity % 255; + if data + parity > 256 { + parity -= data + parity - 256; + } + + let size = 1 + size % 1_000_000; + + let reuse = reuse % 10; + + let r = ReedSolomon::new(data, parity).unwrap(); + let mut sbs = ShardByShard::new(&r); + + let mut expect = make_random_shards!(size, data + parity); + let mut shards = expect.clone(); + + r.encode(&mut expect).unwrap(); + + for _ in 0..1 + reuse { + for i in 0..data { + assert_eq!(i, sbs.cur_input_index()); + + sbs.encode(&mut shards).unwrap(); + } + + if !(expect == shards + && sbs.parity_ready() + && sbs.cur_input_index() == data + && { sbs.reset().unwrap(); !sbs.parity_ready() && sbs.cur_input_index() == 0 }) { + return false; + } + } + + return true; + } +} + +#[test] +fn shardbyshard_encode_sep_correctly() { + { + let r = ReedSolomon::new(10, 3).unwrap(); + let mut sbs = ShardByShard::new(&r); + + let mut shards = make_random_shards!(10_000, 13); + let mut shards_copy = shards.clone(); + + let (data, parity) = shards.split_at_mut(10); + let (data_copy, parity_copy) = shards_copy.split_at_mut(10); + + r.encode_sep(data, parity).unwrap(); + + for i in 0..10 { + assert_eq!(i, sbs.cur_input_index()); + + sbs.encode_sep(data_copy, parity_copy).unwrap(); + } + + assert!(sbs.parity_ready()); + + assert_eq!(parity, parity_copy); + + sbs.reset_force(); + + assert_eq!(0, sbs.cur_input_index()); + } + { + let r = ReedSolomon::new(10, 3).unwrap(); + let mut sbs = ShardByShard::new(&r); + + let mut slices: [[u8; 100]; 13] = [[0; 100]; 13]; + for slice in slices.iter_mut() { + fill_random(slice); + } + let mut slices_copy = slices.clone(); + + { + let (data, parity) = slices.split_at_mut(10); + let (data_copy, parity_copy) = slices_copy.split_at_mut(10); + + let data_refs = convert_2D_slices!(data=>to_mut_vec &[u8]); + let mut parity_refs = convert_2D_slices!(parity=>to_mut_vec &mut [u8]); + let data_copy_refs = convert_2D_slices!(data_copy =>to_mut_vec &[u8]); + let mut parity_copy_refs = convert_2D_slices!(parity_copy =>to_mut_vec &mut [u8]); + + r.encode_sep(&data_refs, &mut parity_refs).unwrap(); + + for i in 0..10 { + assert_eq!(i, sbs.cur_input_index()); + + sbs.encode_sep(&data_copy_refs, &mut parity_copy_refs) + .unwrap(); + } + } + + assert!(sbs.parity_ready()); + + for a in 0..13 { + for b in 0..100 { + assert_eq!(slices[a][b], slices_copy[a][b]); + } + } + + sbs.reset_force(); + + assert_eq!(0, sbs.cur_input_index()); + } +} + +quickcheck! { + fn qc_shardbyshard_encode_sep_same_as_encode(data: usize, + parity: usize, + size: usize, + reuse: usize) -> bool { + let data = 1 + data % 255; + let mut parity = 1 + parity % 255; + if data + parity > 256 { + parity -= data + parity - 256; + } + + let size = 1 + size % 1_000_000; + + let reuse = reuse % 10; + + let r = ReedSolomon::new(data, parity).unwrap(); + let mut sbs = ShardByShard::new(&r); + + let mut expect = make_random_shards!(size, data + parity); + let mut shards = expect.clone(); + + for _ in 0..1 + reuse { + { + let (data_shards, parity_shards) = + expect.split_at_mut(data); + + let data_refs = + convert_2D_slices!(data_shards =>to_mut_vec &[u8]); + let mut parity_refs = + convert_2D_slices!(parity_shards =>to_mut_vec &mut [u8]); + + r.encode_sep(&data_refs, &mut parity_refs).unwrap(); + } + + { + let (data_shards, parity_shards) = + shards.split_at_mut(data); + let data_refs = + convert_2D_slices!(data_shards =>to_mut_vec &[u8]); + let mut parity_refs = + convert_2D_slices!(parity_shards =>to_mut_vec &mut [u8]); + + for i in 0..data { + assert_eq!(i, sbs.cur_input_index()); + + sbs.encode_sep(&data_refs, &mut parity_refs).unwrap(); + } + } + + if !(expect == shards + && sbs.parity_ready() + && sbs.cur_input_index() == data + && { sbs.reset().unwrap(); !sbs.parity_ready() && sbs.cur_input_index() == 0 }) { + return false; + } + } + + return true; + } + + fn qc_shardbyshard_encode_sep_same_as_encode_shards(data: usize, + parity: usize, + size: usize, + reuse: usize) -> bool { + let data = 1 + data % 255; + let mut parity = 1 + parity % 255; + if data + parity > 256 { + parity -= data + parity - 256; + } + + let size = 1 + size % 1_000_000; + + let reuse = reuse % 10; + + let r = ReedSolomon::new(data, parity).unwrap(); + let mut sbs = ShardByShard::new(&r); + + let mut expect = make_random_shards!(size, data + parity); + let mut shards = expect.clone(); + + for _ in 0..1 + reuse { + { + let (data_shards, parity_shards) = + expect.split_at_mut(data); + + r.encode_sep(data_shards, parity_shards).unwrap(); + } + + { + let (data_shards, parity_shards) = + shards.split_at_mut(data); + + for i in 0..data { + assert_eq!(i, sbs.cur_input_index()); + + sbs.encode_sep(data_shards, parity_shards).unwrap(); + } + } + + if !(expect == shards + && sbs.parity_ready() + && sbs.cur_input_index() == data + && { sbs.reset().unwrap(); !sbs.parity_ready() && sbs.cur_input_index() == 0 }) { + return false; + } + } + + return true; + } +} + +#[test] +fn shardbyshard_encode_correctly_more_rigorous() { + { + let r = ReedSolomon::new(10, 3).unwrap(); + let mut sbs = ShardByShard::new(&r); + + let mut shards = make_random_shards!(10_000, 13); + let mut shards_copy = make_random_shards!(10_000, 13); + + r.encode(&mut shards).unwrap(); + + for i in 0..10 { + assert_eq!(i, sbs.cur_input_index()); + + shards_copy[i].clone_from_slice(&shards[i]); + sbs.encode(&mut shards_copy).unwrap(); + fill_random(&mut shards_copy[i]); + } + + assert!(sbs.parity_ready()); + + for i in 0..10 { + shards_copy[i].clone_from_slice(&shards[i]); + } + + assert_eq!(shards, shards_copy); + + sbs.reset_force(); + + assert_eq!(0, sbs.cur_input_index()); + } + { + let r = ReedSolomon::new(10, 3).unwrap(); + let mut sbs = ShardByShard::new(&r); + + let mut slices: [[u8; 100]; 13] = [[0; 100]; 13]; + for slice in slices.iter_mut() { + fill_random(slice); + } + + let mut slices_copy: [[u8; 100]; 13] = [[0; 100]; 13]; + for slice in slices_copy.iter_mut() { + fill_random(slice); + } + + { + let mut slice_refs = convert_2D_slices!(slices=>to_mut_vec &mut [u8]); + let mut slice_copy_refs = convert_2D_slices!(slices_copy =>to_mut_vec &mut [u8]); + + r.encode(&mut slice_refs).unwrap(); + + for i in 0..10 { + assert_eq!(i, sbs.cur_input_index()); + + slice_copy_refs[i].clone_from_slice(&slice_refs[i]); + sbs.encode(&mut slice_copy_refs).unwrap(); + fill_random(&mut slice_copy_refs[i]); + } + } + + for i in 0..10 { + slices_copy[i].clone_from_slice(&slices[i]); + } + + assert!(sbs.parity_ready()); + + for a in 0..13 { + for b in 0..100 { + assert_eq!(slices[a][b], slices_copy[a][b]); + } + } + + sbs.reset_force(); + + assert_eq!(0, sbs.cur_input_index()); + } +} + +#[test] +fn shardbyshard_encode_error_handling() { + { + let r = ReedSolomon::new(10, 3).unwrap(); + let mut sbs = ShardByShard::new(&r); + + let mut shards = make_random_shards!(10_000, 13); + + let mut slice_refs = convert_2D_slices!(shards =>to_mut_vec &mut [u8]); + + for i in 0..10 { + assert_eq!(i, sbs.cur_input_index()); + + sbs.encode(&mut slice_refs).unwrap(); + } + + assert!(sbs.parity_ready()); + + assert_eq!( + SBSError::TooManyCalls, + sbs.encode(&mut slice_refs).unwrap_err() + ); + + sbs.reset().unwrap(); + + for i in 0..1 { + assert_eq!(i, sbs.cur_input_index()); + + sbs.encode(&mut slice_refs).unwrap(); + } + + assert_eq!(SBSError::LeftoverShards, sbs.reset().unwrap_err()); + + sbs.reset_force(); + + assert_eq!(0, sbs.cur_input_index()); + } + { + let r = ReedSolomon::new(10, 3).unwrap(); + let mut sbs = ShardByShard::new(&r); + + let mut shards = make_random_shards!(100, 13); + shards[0] = vec![]; + { + let mut slice_refs = convert_2D_slices!(shards =>to_mut_vec &mut [u8]); + + assert_eq!(0, sbs.cur_input_index()); + + assert_eq!( + SBSError::RSError(Error::EmptyShard), + sbs.encode(&mut slice_refs).unwrap_err() + ); + + assert_eq!(0, sbs.cur_input_index()); + + assert_eq!( + SBSError::RSError(Error::EmptyShard), + sbs.encode(&mut slice_refs).unwrap_err() + ); + + assert_eq!(0, sbs.cur_input_index()); + } + + shards[0] = vec![0; 100]; + + let mut slice_refs = convert_2D_slices!(shards =>to_mut_vec &mut [u8]); + + sbs.encode(&mut slice_refs).unwrap(); + + assert_eq!(1, sbs.cur_input_index()); + } + { + let r = ReedSolomon::new(10, 3).unwrap(); + let mut sbs = ShardByShard::new(&r); + + let mut shards = make_random_shards!(100, 13); + shards[1] = vec![0; 99]; + { + let mut slice_refs = convert_2D_slices!(shards =>to_mut_vec &mut [u8]); + + assert_eq!(0, sbs.cur_input_index()); + + assert_eq!( + SBSError::RSError(Error::IncorrectShardSize), + sbs.encode(&mut slice_refs).unwrap_err() + ); + + assert_eq!(0, sbs.cur_input_index()); + + assert_eq!( + SBSError::RSError(Error::IncorrectShardSize), + sbs.encode(&mut slice_refs).unwrap_err() + ); + + assert_eq!(0, sbs.cur_input_index()); + } + + shards[1] = vec![0; 100]; + + let mut slice_refs = convert_2D_slices!(shards =>to_mut_vec &mut [u8]); + + sbs.encode(&mut slice_refs).unwrap(); + + assert_eq!(1, sbs.cur_input_index()); + } +} + +#[test] +fn shardbyshard_encode_shard_error_handling() { + { + let r = ReedSolomon::new(10, 3).unwrap(); + let mut sbs = ShardByShard::new(&r); + + let mut shards = make_random_shards!(10_000, 13); + + for i in 0..10 { + assert_eq!(i, sbs.cur_input_index()); + + sbs.encode(&mut shards).unwrap(); + } + + assert!(sbs.parity_ready()); + + assert_eq!(SBSError::TooManyCalls, sbs.encode(&mut shards).unwrap_err()); + + sbs.reset().unwrap(); + + for i in 0..1 { + assert_eq!(i, sbs.cur_input_index()); + + sbs.encode(&mut shards).unwrap(); + } + + assert_eq!(SBSError::LeftoverShards, sbs.reset().unwrap_err()); + + sbs.reset_force(); + + assert_eq!(0, sbs.cur_input_index()); + } + { + let r = ReedSolomon::new(10, 3).unwrap(); + let mut sbs = ShardByShard::new(&r); + + let mut shards = make_random_shards!(100, 13); + shards[0] = vec![]; + { + assert_eq!(0, sbs.cur_input_index()); + + assert_eq!( + SBSError::RSError(Error::EmptyShard), + sbs.encode(&mut shards).unwrap_err() + ); + + assert_eq!(0, sbs.cur_input_index()); + + assert_eq!( + SBSError::RSError(Error::EmptyShard), + sbs.encode(&mut shards).unwrap_err() + ); + + assert_eq!(0, sbs.cur_input_index()); + } + + shards[0] = vec![0; 100]; + + sbs.encode(&mut shards).unwrap(); + + assert_eq!(1, sbs.cur_input_index()); + } + { + let r = ReedSolomon::new(10, 3).unwrap(); + let mut sbs = ShardByShard::new(&r); + + let mut shards = make_random_shards!(100, 13); + shards[1] = vec![0; 99]; + { + assert_eq!(0, sbs.cur_input_index()); + + assert_eq!( + SBSError::RSError(Error::IncorrectShardSize), + sbs.encode(&mut shards).unwrap_err() + ); + + assert_eq!(0, sbs.cur_input_index()); + + assert_eq!( + SBSError::RSError(Error::IncorrectShardSize), + sbs.encode(&mut shards).unwrap_err() + ); + + assert_eq!(0, sbs.cur_input_index()); + } + + shards[1] = vec![0; 100]; + + sbs.encode(&mut shards).unwrap(); + + assert_eq!(1, sbs.cur_input_index()); + } +} + +#[test] +fn shardbyshard_encode_sep_error_handling() { + { + let r = ReedSolomon::new(10, 3).unwrap(); + let mut sbs = ShardByShard::new(&r); + + let mut shards = make_random_shards!(10_000, 13); + + let (data, parity) = shards.split_at_mut(10); + + for i in 0..10 { + assert_eq!(i, sbs.cur_input_index()); + + sbs.encode_sep(data, parity).unwrap(); + } + + assert!(sbs.parity_ready()); + + assert_eq!( + SBSError::TooManyCalls, + sbs.encode_sep(data, parity).unwrap_err() + ); + + sbs.reset().unwrap(); + + for i in 0..1 { + assert_eq!(i, sbs.cur_input_index()); + + sbs.encode_sep(data, parity).unwrap(); + } + + assert_eq!(SBSError::LeftoverShards, sbs.reset().unwrap_err()); + + sbs.reset_force(); + + assert_eq!(0, sbs.cur_input_index()); + } + { + let r = ReedSolomon::new(10, 3).unwrap(); + let mut sbs = ShardByShard::new(&r); + + let mut slices: [[u8; 100]; 13] = [[0; 100]; 13]; + for slice in slices.iter_mut() { + fill_random(slice); + } + { + let (data, parity) = slices.split_at_mut(10); + + let data_refs = convert_2D_slices!(data=>to_mut_vec &[u8]); + let mut parity_refs = convert_2D_slices!(parity=>to_mut_vec &mut [u8]); + + for i in 0..10 { + assert_eq!(i, sbs.cur_input_index()); + + sbs.encode_sep(&data_refs, &mut parity_refs).unwrap(); + } + + assert!(sbs.parity_ready()); + + assert_eq!( + SBSError::TooManyCalls, + sbs.encode_sep(&data_refs, &mut parity_refs).unwrap_err() + ); + + sbs.reset().unwrap(); + + for i in 0..1 { + assert_eq!(i, sbs.cur_input_index()); + + sbs.encode_sep(&data_refs, &mut parity_refs).unwrap(); + } + } + + assert_eq!(SBSError::LeftoverShards, sbs.reset().unwrap_err()); + + sbs.reset_force(); + + assert_eq!(0, sbs.cur_input_index()); + } + { + let r = ReedSolomon::new(10, 3).unwrap(); + + { + let mut sbs = ShardByShard::new(&r); + + let mut shards = make_random_shards!(100, 13); + shards[0] = vec![]; + + { + let (data, parity) = shards.split_at_mut(10); + + let data_refs = convert_2D_slices!(data=>to_vec &[u8]); + let mut parity_refs = convert_2D_slices!(parity=>to_mut_vec &mut [u8]); + + assert_eq!(0, sbs.cur_input_index()); + + assert_eq!( + SBSError::RSError(Error::EmptyShard), + sbs.encode_sep(&data_refs, &mut parity_refs).unwrap_err() + ); + + assert_eq!(0, sbs.cur_input_index()); + + assert_eq!( + SBSError::RSError(Error::EmptyShard), + sbs.encode_sep(&data_refs, &mut parity_refs).unwrap_err() + ); + + assert_eq!(0, sbs.cur_input_index()); + } + + shards[0] = vec![0; 100]; + + let (data, parity) = shards.split_at_mut(10); + + let data_refs = convert_2D_slices!(data=>to_vec &[u8]); + let mut parity_refs = convert_2D_slices!(parity=>to_mut_vec &mut [u8]); + + sbs.encode_sep(&data_refs, &mut parity_refs).unwrap(); + + assert_eq!(1, sbs.cur_input_index()); + } + { + let mut sbs = ShardByShard::new(&r); + + let mut shards = make_random_shards!(100, 13); + shards[10] = vec![]; + { + let (data, parity) = shards.split_at_mut(10); + + let data_refs = convert_2D_slices!(data=>to_vec &[u8]); + let mut parity_refs = convert_2D_slices!(parity=>to_mut_vec &mut [u8]); + + assert_eq!(0, sbs.cur_input_index()); + + assert_eq!( + SBSError::RSError(Error::EmptyShard), + sbs.encode_sep(&data_refs, &mut parity_refs).unwrap_err() + ); + + assert_eq!(0, sbs.cur_input_index()); + + assert_eq!( + SBSError::RSError(Error::EmptyShard), + sbs.encode_sep(&data_refs, &mut parity_refs).unwrap_err() + ); + + assert_eq!(0, sbs.cur_input_index()); + } + + shards[10] = vec![0; 100]; + + let (data, parity) = shards.split_at_mut(10); + + let data_refs = convert_2D_slices!(data=>to_vec &[u8]); + let mut parity_refs = convert_2D_slices!(parity=>to_mut_vec &mut [u8]); + + sbs.encode_sep(&data_refs, &mut parity_refs).unwrap(); + + assert_eq!(1, sbs.cur_input_index()); + } + } + { + let r = ReedSolomon::new(10, 3).unwrap(); + { + let mut sbs = ShardByShard::new(&r); + + let mut shards = make_random_shards!(100, 13); + shards[1] = vec![0; 99]; + { + let (data, parity) = shards.split_at_mut(10); + + let data_refs = convert_2D_slices!(data=>to_vec &[u8]); + let mut parity_refs = convert_2D_slices!(parity=>to_mut_vec &mut [u8]); + + assert_eq!(0, sbs.cur_input_index()); + + assert_eq!( + SBSError::RSError(Error::IncorrectShardSize), + sbs.encode_sep(&data_refs, &mut parity_refs).unwrap_err() + ); + + assert_eq!(0, sbs.cur_input_index()); + + assert_eq!( + SBSError::RSError(Error::IncorrectShardSize), + sbs.encode_sep(&data_refs, &mut parity_refs).unwrap_err() + ); + + assert_eq!(0, sbs.cur_input_index()); + } + + shards[1] = vec![0; 100]; + + let (data, parity) = shards.split_at_mut(10); + + let data_refs = convert_2D_slices!(data=>to_vec &[u8]); + let mut parity_refs = convert_2D_slices!(parity=>to_mut_vec &mut [u8]); + + sbs.encode_sep(&data_refs, &mut parity_refs).unwrap(); + + assert_eq!(1, sbs.cur_input_index()); + } + { + let mut sbs = ShardByShard::new(&r); + + let mut shards = make_random_shards!(100, 13); + shards[11] = vec![0; 99]; + { + let (data, parity) = shards.split_at_mut(10); + + let data_refs = convert_2D_slices!(data=>to_vec &[u8]); + let mut parity_refs = convert_2D_slices!(parity=>to_mut_vec &mut [u8]); + + assert_eq!(0, sbs.cur_input_index()); + + assert_eq!( + SBSError::RSError(Error::IncorrectShardSize), + sbs.encode_sep(&data_refs, &mut parity_refs).unwrap_err() + ); + + assert_eq!(0, sbs.cur_input_index()); + + assert_eq!( + SBSError::RSError(Error::IncorrectShardSize), + sbs.encode_sep(&data_refs, &mut parity_refs).unwrap_err() + ); + + assert_eq!(0, sbs.cur_input_index()); + } + + shards[11] = vec![0; 100]; + + let (data, parity) = shards.split_at_mut(10); + + let data_refs = convert_2D_slices!(data=>to_vec &[u8]); + let mut parity_refs = convert_2D_slices!(parity=>to_mut_vec &mut [u8]); + + sbs.encode_sep(&data_refs, &mut parity_refs).unwrap(); + + assert_eq!(1, sbs.cur_input_index()); + } + } +} + +#[test] +fn shardbyshard_encode_shard_sep_error_handling() { + { + let r = ReedSolomon::new(10, 3).unwrap(); + let mut sbs = ShardByShard::new(&r); + + let mut shards = make_random_shards!(10_000, 13); + + let (data, parity) = shards.split_at_mut(10); + + for i in 0..10 { + assert_eq!(i, sbs.cur_input_index()); + + sbs.encode_sep(data, parity).unwrap(); + } + + assert!(sbs.parity_ready()); + + assert_eq!( + SBSError::TooManyCalls, + sbs.encode_sep(data, parity).unwrap_err() + ); + + sbs.reset().unwrap(); + + for i in 0..1 { + assert_eq!(i, sbs.cur_input_index()); + + sbs.encode_sep(data, parity).unwrap(); + } + + assert_eq!(SBSError::LeftoverShards, sbs.reset().unwrap_err()); + + sbs.reset_force(); + + assert_eq!(0, sbs.cur_input_index()); + } + { + let r = ReedSolomon::new(10, 3).unwrap(); + + { + let mut sbs = ShardByShard::new(&r); + + let mut shards = make_random_shards!(100, 13); + shards[0] = vec![]; + + { + let (data, parity) = shards.split_at_mut(10); + + assert_eq!(0, sbs.cur_input_index()); + + assert_eq!( + SBSError::RSError(Error::EmptyShard), + sbs.encode_sep(data, parity).unwrap_err() + ); + + assert_eq!(0, sbs.cur_input_index()); + + assert_eq!( + SBSError::RSError(Error::EmptyShard), + sbs.encode_sep(data, parity).unwrap_err() + ); + + assert_eq!(0, sbs.cur_input_index()); + } + + shards[0] = vec![0; 100]; + + let (data, parity) = shards.split_at_mut(10); + + sbs.encode_sep(data, parity).unwrap(); + + assert_eq!(1, sbs.cur_input_index()); + } + { + let mut sbs = ShardByShard::new(&r); + + let mut shards = make_random_shards!(100, 13); + shards[10] = vec![]; + { + let (data, parity) = shards.split_at_mut(10); + + assert_eq!(0, sbs.cur_input_index()); + + assert_eq!( + SBSError::RSError(Error::EmptyShard), + sbs.encode_sep(data, parity).unwrap_err() + ); + + assert_eq!(0, sbs.cur_input_index()); + + assert_eq!( + SBSError::RSError(Error::EmptyShard), + sbs.encode_sep(data, parity).unwrap_err() + ); + + assert_eq!(0, sbs.cur_input_index()); + } + + shards[10] = vec![0; 100]; + + let (data, parity) = shards.split_at_mut(10); + + sbs.encode_sep(data, parity).unwrap(); + + assert_eq!(1, sbs.cur_input_index()); + } + } + { + let r = ReedSolomon::new(10, 3).unwrap(); + { + let mut sbs = ShardByShard::new(&r); + + let mut shards = make_random_shards!(100, 13); + shards[1] = vec![0; 99]; + { + let (data, parity) = shards.split_at_mut(10); + + assert_eq!(0, sbs.cur_input_index()); + + assert_eq!( + SBSError::RSError(Error::IncorrectShardSize), + sbs.encode_sep(data, parity).unwrap_err() + ); + + assert_eq!(0, sbs.cur_input_index()); + + assert_eq!( + SBSError::RSError(Error::IncorrectShardSize), + sbs.encode_sep(data, parity).unwrap_err() + ); + + assert_eq!(0, sbs.cur_input_index()); + } + + shards[1] = vec![0; 100]; + + let (data, parity) = shards.split_at_mut(10); + + sbs.encode_sep(data, parity).unwrap(); + + assert_eq!(1, sbs.cur_input_index()); + } + { + let mut sbs = ShardByShard::new(&r); + + let mut shards = make_random_shards!(100, 13); + shards[11] = vec![0; 99]; + { + let (data, parity) = shards.split_at_mut(10); + + assert_eq!(0, sbs.cur_input_index()); + + assert_eq!( + SBSError::RSError(Error::IncorrectShardSize), + sbs.encode_sep(data, parity).unwrap_err() + ); + + assert_eq!(0, sbs.cur_input_index()); + + assert_eq!( + SBSError::RSError(Error::IncorrectShardSize), + sbs.encode_sep(data, parity).unwrap_err() + ); + + assert_eq!(0, sbs.cur_input_index()); + } + + shards[11] = vec![0; 100]; + + let (data, parity) = shards.split_at_mut(10); + + sbs.encode_sep(data, parity).unwrap(); + + assert_eq!(1, sbs.cur_input_index()); + } + } +} + +#[test] +fn test_encode_single_sep() { + let r = ReedSolomon::new(10, 3).unwrap(); + + { + let mut shards = make_random_shards!(10, 13); + let mut shards_copy = shards.clone(); + + r.encode(&mut shards).unwrap(); + + { + let (data, parity) = shards_copy.split_at_mut(10); + + for i in 0..10 { + r.encode_single_sep(i, &data[i], parity).unwrap(); + } + } + assert!(r.verify(&shards).unwrap()); + assert!(r.verify(&shards_copy).unwrap()); + + assert_eq_shards(&shards, &shards_copy); + } + { + let mut slices: [[u8; 100]; 13] = [[0; 100]; 13]; + for slice in slices.iter_mut() { + fill_random(slice); + } + let mut slices_copy = slices.clone(); + + { + let mut slice_refs = convert_2D_slices!(slices=>to_mut_vec &mut [u8]); + + let (data_copy, parity_copy) = slices_copy.split_at_mut(10); + + let data_copy_refs = convert_2D_slices!(data_copy =>to_mut_vec &[u8]); + let mut parity_copy_refs = convert_2D_slices!(parity_copy =>to_mut_vec &mut [u8]); + + r.encode(&mut slice_refs).unwrap(); + + for i in 0..10 { + r.encode_single_sep(i, &data_copy_refs[i], &mut parity_copy_refs) + .unwrap(); + } + } + + for a in 0..13 { + for b in 0..100 { + assert_eq!(slices[a][b], slices_copy[a][b]); + } + } + } +} + +#[test] +fn test_encode_sep() { + let r = ReedSolomon::new(10, 3).unwrap(); + + { + let mut shards = make_random_shards!(10_000, 13); + let mut shards_copy = shards.clone(); + + r.encode(&mut shards).unwrap(); + + { + let (data, parity) = shards_copy.split_at_mut(10); + + r.encode_sep(data, parity).unwrap(); + } + + assert_eq_shards(&shards, &shards_copy); + } + { + let mut slices: [[u8; 100]; 13] = [[0; 100]; 13]; + for slice in slices.iter_mut() { + fill_random(slice); + } + let mut slices_copy = slices.clone(); + + { + let (data_copy, parity_copy) = slices_copy.split_at_mut(10); + + let mut slice_refs = convert_2D_slices!(slices =>to_mut_vec &mut [u8]); + let data_copy_refs = convert_2D_slices!(data_copy =>to_mut_vec &[u8]); + let mut parity_copy_refs = convert_2D_slices!(parity_copy =>to_mut_vec &mut [u8]); + + r.encode(&mut slice_refs).unwrap(); + + r.encode_sep(&data_copy_refs, &mut parity_copy_refs) + .unwrap(); + } + + for a in 0..13 { + for b in 0..100 { + assert_eq!(slices[a][b], slices_copy[a][b]); + } + } + } +} + +#[test] +fn test_encode_single_sep_error_handling() { + let r = ReedSolomon::new(10, 3).unwrap(); + + { + let mut shards = make_random_shards!(1000, 13); + + { + let (data, parity) = shards.split_at_mut(10); + + for i in 0..10 { + r.encode_single_sep(i, &data[i], parity).unwrap(); + } + + assert_eq!( + Error::InvalidIndex, + r.encode_single_sep(10, &data[0], parity).unwrap_err() + ); + assert_eq!( + Error::InvalidIndex, + r.encode_single_sep(11, &data[0], parity).unwrap_err() + ); + assert_eq!( + Error::InvalidIndex, + r.encode_single_sep(12, &data[0], parity).unwrap_err() + ); + assert_eq!( + Error::InvalidIndex, + r.encode_single_sep(13, &data[0], parity).unwrap_err() + ); + assert_eq!( + Error::InvalidIndex, + r.encode_single_sep(14, &data[0], parity).unwrap_err() + ); + } + + { + let (data, parity) = shards.split_at_mut(11); + + assert_eq!( + Error::TooFewParityShards, + r.encode_single_sep(0, &data[0], parity).unwrap_err() + ); + } + { + let (data, parity) = shards.split_at_mut(9); + + assert_eq!( + Error::TooManyParityShards, + r.encode_single_sep(0, &data[0], parity).unwrap_err() + ); + } + } + { + let mut slices: [[u8; 1000]; 13] = [[0; 1000]; 13]; + for slice in slices.iter_mut() { + fill_random(slice); + } + + { + let (data, parity) = slices.split_at_mut(10); + + let data_refs = convert_2D_slices!(data=>to_mut_vec &[u8]); + let mut parity_refs = convert_2D_slices!(parity=>to_mut_vec &mut [u8]); + + for i in 0..10 { + r.encode_single_sep(i, &data_refs[i], &mut parity_refs) + .unwrap(); + } + + assert_eq!( + Error::InvalidIndex, + r.encode_single_sep(10, &data_refs[0], &mut parity_refs) + .unwrap_err() + ); + assert_eq!( + Error::InvalidIndex, + r.encode_single_sep(11, &data_refs[0], &mut parity_refs) + .unwrap_err() + ); + assert_eq!( + Error::InvalidIndex, + r.encode_single_sep(12, &data_refs[0], &mut parity_refs) + .unwrap_err() + ); + assert_eq!( + Error::InvalidIndex, + r.encode_single_sep(13, &data_refs[0], &mut parity_refs) + .unwrap_err() + ); + assert_eq!( + Error::InvalidIndex, + r.encode_single_sep(14, &data_refs[0], &mut parity_refs) + .unwrap_err() + ); + } + { + let (data, parity) = slices.split_at_mut(11); + + let data_refs = convert_2D_slices!(data=>to_mut_vec &[u8]); + let mut parity_refs = convert_2D_slices!(parity=>to_mut_vec &mut [u8]); + + assert_eq!( + Error::TooFewParityShards, + r.encode_single_sep(0, &data_refs[0], &mut parity_refs) + .unwrap_err() + ); + } + { + let (data, parity) = slices.split_at_mut(9); + + let data_refs = convert_2D_slices!(data=>to_mut_vec &[u8]); + let mut parity_refs = convert_2D_slices!(parity=>to_mut_vec &mut [u8]); + + assert_eq!( + Error::TooManyParityShards, + r.encode_single_sep(0, &data_refs[0], &mut parity_refs) + .unwrap_err() + ); + } + } +} + +#[test] +fn test_encode_sep_error_handling() { + let r = ReedSolomon::new(10, 3).unwrap(); + + { + let mut shards = make_random_shards!(1000, 13); + + let (data, parity) = shards.split_at_mut(10); + + r.encode_sep(data, parity).unwrap(); + + { + let mut shards = make_random_shards!(1000, 12); + let (data, parity) = shards.split_at_mut(9); + + assert_eq!( + Error::TooFewDataShards, + r.encode_sep(data, parity).unwrap_err() + ); + } + { + let mut shards = make_random_shards!(1000, 14); + let (data, parity) = shards.split_at_mut(11); + + assert_eq!( + Error::TooManyDataShards, + r.encode_sep(data, parity).unwrap_err() + ); + } + { + let mut shards = make_random_shards!(1000, 12); + let (data, parity) = shards.split_at_mut(10); + + assert_eq!( + Error::TooFewParityShards, + r.encode_sep(data, parity).unwrap_err() + ); + } + { + let mut shards = make_random_shards!(1000, 14); + let (data, parity) = shards.split_at_mut(10); + + assert_eq!( + Error::TooManyParityShards, + r.encode_sep(data, parity).unwrap_err() + ); + } + } + { + let mut slices: [[u8; 1000]; 13] = [[0; 1000]; 13]; + for slice in slices.iter_mut() { + fill_random(slice); + } + + let (data, parity) = slices.split_at_mut(10); + + let data_refs = convert_2D_slices!(data=>to_mut_vec &[u8]); + let mut parity_refs = convert_2D_slices!(parity=>to_mut_vec &mut [u8]); + + r.encode_sep(&data_refs, &mut parity_refs).unwrap(); + + { + let mut slices: [[u8; 1000]; 12] = [[0; 1000]; 12]; + for slice in slices.iter_mut() { + fill_random(slice); + } + + let (data, parity) = slices.split_at_mut(9); + + let data_refs = convert_2D_slices!(data=>to_mut_vec &[u8]); + let mut parity_refs = convert_2D_slices!(parity=>to_mut_vec &mut [u8]); + + assert_eq!( + Error::TooFewDataShards, + r.encode_sep(&data_refs, &mut parity_refs).unwrap_err() + ); + } + { + let mut slices: [[u8; 1000]; 14] = [[0; 1000]; 14]; + for slice in slices.iter_mut() { + fill_random(slice); + } + + let (data, parity) = slices.split_at_mut(11); + + let data_refs = convert_2D_slices!(data=>to_mut_vec &[u8]); + let mut parity_refs = convert_2D_slices!(parity=>to_mut_vec &mut [u8]); + + assert_eq!( + Error::TooManyDataShards, + r.encode_sep(&data_refs, &mut parity_refs).unwrap_err() + ); + } + { + let mut slices: [[u8; 1000]; 12] = [[0; 1000]; 12]; + for slice in slices.iter_mut() { + fill_random(slice); + } + + let (data, parity) = slices.split_at_mut(10); + + let data_refs = convert_2D_slices!(data=>to_mut_vec &[u8]); + let mut parity_refs = convert_2D_slices!(parity=>to_mut_vec &mut [u8]); + + assert_eq!( + Error::TooFewParityShards, + r.encode_sep(&data_refs, &mut parity_refs).unwrap_err() + ); + } + { + let mut slices: [[u8; 1000]; 14] = [[0; 1000]; 14]; + for slice in slices.iter_mut() { + fill_random(slice); + } + + let (data, parity) = slices.split_at_mut(10); + + let data_refs = convert_2D_slices!(data=>to_mut_vec &[u8]); + let mut parity_refs = convert_2D_slices!(parity=>to_mut_vec &mut [u8]); + + assert_eq!( + Error::TooManyParityShards, + r.encode_sep(&data_refs, &mut parity_refs).unwrap_err() + ); + } + } +} + +#[test] +fn test_encode_single_error_handling() { + let r = ReedSolomon::new(10, 3).unwrap(); + + { + let mut shards = make_random_shards!(1000, 13); + + for i in 0..10 { + r.encode_single(i, &mut shards).unwrap(); + } + + assert_eq!( + Error::InvalidIndex, + r.encode_single(10, &mut shards).unwrap_err() + ); + assert_eq!( + Error::InvalidIndex, + r.encode_single(11, &mut shards).unwrap_err() + ); + assert_eq!( + Error::InvalidIndex, + r.encode_single(12, &mut shards).unwrap_err() + ); + assert_eq!( + Error::InvalidIndex, + r.encode_single(13, &mut shards).unwrap_err() + ); + assert_eq!( + Error::InvalidIndex, + r.encode_single(14, &mut shards).unwrap_err() + ); + } + { + let mut slices: [[u8; 1000]; 13] = [[0; 1000]; 13]; + for slice in slices.iter_mut() { + fill_random(slice); + } + + let mut slice_refs = convert_2D_slices!(slices=>to_mut_vec &mut [u8]); + + for i in 0..10 { + r.encode_single(i, &mut slice_refs).unwrap(); + } + + assert_eq!( + Error::InvalidIndex, + r.encode_single(10, &mut slice_refs).unwrap_err() + ); + assert_eq!( + Error::InvalidIndex, + r.encode_single(11, &mut slice_refs).unwrap_err() + ); + assert_eq!( + Error::InvalidIndex, + r.encode_single(12, &mut slice_refs).unwrap_err() + ); + assert_eq!( + Error::InvalidIndex, + r.encode_single(13, &mut slice_refs).unwrap_err() + ); + assert_eq!( + Error::InvalidIndex, + r.encode_single(14, &mut slice_refs).unwrap_err() + ); + } +} diff --git a/test/s3/normal/s3_integration_test.go b/test/s3/normal/s3_integration_test.go index 2f9f325c0..6abab8849 100644 --- a/test/s3/normal/s3_integration_test.go +++ b/test/s3/normal/s3_integration_test.go @@ -10,6 +10,7 @@ import ( "net" "net/http" "os" + "os/exec" "path/filepath" "strconv" "sync" @@ -24,6 +25,7 @@ import ( "github.com/stretchr/testify/assert" "github.com/stretchr/testify/require" + "github.com/seaweedfs/seaweedfs/test/volume_server/framework" "github.com/seaweedfs/seaweedfs/weed/command" "github.com/seaweedfs/seaweedfs/weed/glog" flag "github.com/seaweedfs/seaweedfs/weed/util/fla9" @@ -37,18 +39,19 @@ const ( // TestCluster manages the weed mini instance for integration testing type TestCluster struct { - dataDir string - ctx context.Context - cancel context.CancelFunc - s3Client *s3.S3 - isRunning bool - startOnce sync.Once - wg sync.WaitGroup - masterPort int - volumePort int - filerPort int - s3Port int - s3Endpoint string + dataDir string + ctx context.Context + cancel context.CancelFunc + s3Client *s3.S3 + isRunning bool + startOnce sync.Once + wg sync.WaitGroup + masterPort int + volumePort int + filerPort int + s3Port int + s3Endpoint string + rustVolumeCmd *exec.Cmd } // TestS3Integration demonstrates basic S3 operations against a running weed mini instance @@ -236,6 +239,14 @@ func startMiniCluster(t *testing.T, extraArgs ...string) (*TestCluster, error) { return nil, fmt.Errorf("S3 service failed to start: %v", err) } + // If VOLUME_SERVER_IMPL=rust, start a Rust volume server alongside weed mini + if os.Getenv("VOLUME_SERVER_IMPL") == "rust" { + if err := cluster.startRustVolumeServer(t); err != nil { + cancel() + return nil, fmt.Errorf("failed to start Rust volume server: %v", err) + } + } + cluster.isRunning = true // Create S3 client @@ -257,8 +268,82 @@ func startMiniCluster(t *testing.T, extraArgs ...string) (*TestCluster, error) { return cluster, nil } +// startRustVolumeServer starts a Rust volume server that registers with the same master. +func (c *TestCluster) startRustVolumeServer(t *testing.T) error { + t.Helper() + + rustBinary, err := framework.FindOrBuildRustBinary() + if err != nil { + return fmt.Errorf("resolve rust volume binary: %v", err) + } + + rustVolumePort, err := findAvailablePort() + if err != nil { + return fmt.Errorf("find rust volume port: %v", err) + } + rustVolumeGrpcPort, err := findAvailablePort() + if err != nil { + return fmt.Errorf("find rust volume grpc port: %v", err) + } + + rustVolumeDir := filepath.Join(c.dataDir, "rust-volume") + if err := os.MkdirAll(rustVolumeDir, 0o755); err != nil { + return fmt.Errorf("create rust volume dir: %v", err) + } + + securityToml := filepath.Join(c.dataDir, "security.toml") + + args := []string{ + "--port", strconv.Itoa(rustVolumePort), + "--port.grpc", strconv.Itoa(rustVolumeGrpcPort), + "--port.public", strconv.Itoa(rustVolumePort), + "--ip", "127.0.0.1", + "--ip.bind", "127.0.0.1", + "--dir", rustVolumeDir, + "--max", "16", + "--master", "127.0.0.1:" + strconv.Itoa(c.masterPort), + "--securityFile", securityToml, + "--preStopSeconds", "0", + } + + logFile, err := os.Create(filepath.Join(c.dataDir, "rust-volume.log")) + if err != nil { + return fmt.Errorf("create rust volume log: %v", err) + } + + c.rustVolumeCmd = exec.Command(rustBinary, args...) + c.rustVolumeCmd.Dir = c.dataDir + c.rustVolumeCmd.Stdout = logFile + c.rustVolumeCmd.Stderr = logFile + if err := c.rustVolumeCmd.Start(); err != nil { + logFile.Close() + return fmt.Errorf("start rust volume: %v", err) + } + + // Wait for the Rust volume server to be ready + rustEndpoint := fmt.Sprintf("http://127.0.0.1:%d/healthz", rustVolumePort) + deadline := time.Now().Add(15 * time.Second) + client := &http.Client{Timeout: 1 * time.Second} + for time.Now().Before(deadline) { + resp, err := client.Get(rustEndpoint) + if err == nil { + resp.Body.Close() + t.Logf("Rust volume server ready on port %d (grpc %d)", rustVolumePort, rustVolumeGrpcPort) + return nil + } + time.Sleep(200 * time.Millisecond) + } + return fmt.Errorf("rust volume server not ready after 15s (port %d)", rustVolumePort) +} + // Stop stops the test cluster func (c *TestCluster) Stop() { + // Stop Rust volume server first + if c.rustVolumeCmd != nil && c.rustVolumeCmd.Process != nil { + c.rustVolumeCmd.Process.Kill() + c.rustVolumeCmd.Wait() + } + if c.cancel != nil { c.cancel() } diff --git a/test/s3/policy/policy_test.go b/test/s3/policy/policy_test.go index 07092e04f..8c97f4a58 100644 --- a/test/s3/policy/policy_test.go +++ b/test/s3/policy/policy_test.go @@ -21,6 +21,7 @@ import ( "github.com/aws/aws-sdk-go/aws/session" "github.com/aws/aws-sdk-go/service/iam" "github.com/aws/aws-sdk-go/service/s3" + "github.com/seaweedfs/seaweedfs/test/volume_server/framework" "github.com/seaweedfs/seaweedfs/weed/command" "github.com/seaweedfs/seaweedfs/weed/glog" "github.com/seaweedfs/seaweedfs/weed/pb" @@ -42,6 +43,7 @@ type TestCluster struct { filerGrpcPort int s3Port int s3Endpoint string + rustVolumeCmd *exec.Cmd } func TestS3PolicyShellRevised(t *testing.T) { @@ -822,6 +824,15 @@ enabled = true cancel() return nil, err } + + // If VOLUME_SERVER_IMPL=rust, start a Rust volume server alongside weed mini + if os.Getenv("VOLUME_SERVER_IMPL") == "rust" { + if err := cluster.startRustVolumeServer(t); err != nil { + cancel() + return nil, fmt.Errorf("failed to start Rust volume server: %v", err) + } + } + cluster.isRunning = true return cluster, nil } @@ -840,7 +851,80 @@ func waitForS3Ready(endpoint string, timeout time.Duration) error { return fmt.Errorf("timeout waiting for S3") } +// startRustVolumeServer starts a Rust volume server that registers with the same master. +func (c *TestCluster) startRustVolumeServer(t *testing.T) error { + t.Helper() + + rustBinary, err := framework.FindOrBuildRustBinary() + if err != nil { + return fmt.Errorf("resolve rust volume binary: %v", err) + } + + rustVolumePort, err := findAvailablePort() + if err != nil { + return fmt.Errorf("find rust volume port: %v", err) + } + rustVolumeGrpcPort, err := findAvailablePort() + if err != nil { + return fmt.Errorf("find rust volume grpc port: %v", err) + } + + rustVolumeDir := filepath.Join(c.dataDir, "rust-volume") + if err := os.MkdirAll(rustVolumeDir, 0o755); err != nil { + return fmt.Errorf("create rust volume dir: %v", err) + } + + securityToml := filepath.Join(c.dataDir, "security.toml") + + args := []string{ + "--port", strconv.Itoa(rustVolumePort), + "--port.grpc", strconv.Itoa(rustVolumeGrpcPort), + "--port.public", strconv.Itoa(rustVolumePort), + "--ip", "127.0.0.1", + "--ip.bind", "127.0.0.1", + "--dir", rustVolumeDir, + "--max", "16", + "--master", "127.0.0.1:" + strconv.Itoa(c.masterPort), + "--securityFile", securityToml, + "--preStopSeconds", "0", + } + + logFile, err := os.Create(filepath.Join(c.dataDir, "rust-volume.log")) + if err != nil { + return fmt.Errorf("create rust volume log: %v", err) + } + + c.rustVolumeCmd = exec.Command(rustBinary, args...) + c.rustVolumeCmd.Dir = c.dataDir + c.rustVolumeCmd.Stdout = logFile + c.rustVolumeCmd.Stderr = logFile + if err := c.rustVolumeCmd.Start(); err != nil { + logFile.Close() + return fmt.Errorf("start rust volume: %v", err) + } + + rustEndpoint := fmt.Sprintf("http://127.0.0.1:%d/healthz", rustVolumePort) + deadline := time.Now().Add(15 * time.Second) + client := &http.Client{Timeout: 1 * time.Second} + for time.Now().Before(deadline) { + resp, err := client.Get(rustEndpoint) + if err == nil { + resp.Body.Close() + t.Logf("Rust volume server ready on port %d (grpc %d)", rustVolumePort, rustVolumeGrpcPort) + return nil + } + time.Sleep(200 * time.Millisecond) + } + return fmt.Errorf("rust volume server not ready after 15s (port %d)", rustVolumePort) +} + func (c *TestCluster) Stop() { + // Stop Rust volume server first + if c.rustVolumeCmd != nil && c.rustVolumeCmd.Process != nil { + c.rustVolumeCmd.Process.Kill() + c.rustVolumeCmd.Wait() + } + if c.cancel != nil { c.cancel() } diff --git a/test/volume_server/framework/cluster.go b/test/volume_server/framework/cluster.go index 4bb1b55d5..1f9d30740 100644 --- a/test/volume_server/framework/cluster.go +++ b/test/volume_server/framework/cluster.go @@ -27,6 +27,12 @@ const ( testVolumeSizeLimitMB = 32 ) +var ( + weedBinaryOnce sync.Once + weedBinaryPath string + weedBinaryErr error +) + // Cluster is a lightweight SeaweedFS master + one volume server test harness. type Cluster struct { testingTB testing.TB @@ -326,6 +332,13 @@ func writeSecurityConfig(configDir string, profile matrix.Profile) error { b.WriteString("\"\n") b.WriteString("expires_after_seconds = 60\n") } + if profile.EnableUIAccess { + if b.Len() > 0 { + b.WriteString("\n") + } + b.WriteString("[access]\n") + b.WriteString("ui = true\n") + } if b.Len() == 0 { b.WriteString("# optional security config generated for integration tests\n") } @@ -341,40 +354,43 @@ func FindOrBuildWeedBinary() (string, error) { return "", fmt.Errorf("WEED_BINARY is set but not executable: %s", fromEnv) } - repoRoot := "" - if _, file, _, ok := runtime.Caller(0); ok { - repoRoot = filepath.Clean(filepath.Join(filepath.Dir(file), "..", "..", "..")) - candidate := filepath.Join(repoRoot, "weed", "weed") - if isExecutableFile(candidate) { - return candidate, nil + weedBinaryOnce.Do(func() { + repoRoot := "" + if _, file, _, ok := runtime.Caller(0); ok { + repoRoot = filepath.Clean(filepath.Join(filepath.Dir(file), "..", "..", "..")) + } + if repoRoot == "" { + weedBinaryErr = errors.New("unable to detect repository root") + return } - } - if repoRoot == "" { - return "", errors.New("unable to detect repository root") - } + binDir := filepath.Join(os.TempDir(), "seaweedfs_volume_server_it_bin") + if err := os.MkdirAll(binDir, 0o755); err != nil { + weedBinaryErr = fmt.Errorf("create binary directory %s: %w", binDir, err) + return + } + binPath := filepath.Join(binDir, "weed") - binDir := filepath.Join(os.TempDir(), "seaweedfs_volume_server_it_bin") - if err := os.MkdirAll(binDir, 0o755); err != nil { - return "", fmt.Errorf("create binary directory %s: %w", binDir, err) - } - binPath := filepath.Join(binDir, "weed") - if isExecutableFile(binPath) { - return binPath, nil - } + cmd := exec.Command("go", "build", "-o", binPath, ".") + cmd.Dir = filepath.Join(repoRoot, "weed") + var out bytes.Buffer + cmd.Stdout = &out + cmd.Stderr = &out + if err := cmd.Run(); err != nil { + weedBinaryErr = fmt.Errorf("build weed binary: %w\n%s", err, out.String()) + return + } + if !isExecutableFile(binPath) { + weedBinaryErr = fmt.Errorf("built weed binary is not executable: %s", binPath) + return + } + weedBinaryPath = binPath + }) - cmd := exec.Command("go", "build", "-o", binPath, ".") - cmd.Dir = filepath.Join(repoRoot, "weed") - var out bytes.Buffer - cmd.Stdout = &out - cmd.Stderr = &out - if err := cmd.Run(); err != nil { - return "", fmt.Errorf("build weed binary: %w\n%s", err, out.String()) + if weedBinaryErr != nil { + return "", weedBinaryErr } - if !isExecutableFile(binPath) { - return "", fmt.Errorf("built weed binary is not executable: %s", binPath) - } - return binPath, nil + return weedBinaryPath, nil } func isExecutableFile(path string) bool { diff --git a/test/volume_server/framework/cluster_dual.go b/test/volume_server/framework/cluster_dual.go index b068419c0..1e3a2554c 100644 --- a/test/volume_server/framework/cluster_dual.go +++ b/test/volume_server/framework/cluster_dual.go @@ -11,7 +11,7 @@ import ( type DualVolumeCluster = MultiVolumeCluster // StartDualVolumeCluster starts a cluster with 2 volume servers. -// Deprecated: Use StartMultiVolumeCluster(t, profile, 2) directly. -func StartDualVolumeCluster(t testing.TB, profile matrix.Profile) *DualVolumeCluster { - return StartMultiVolumeCluster(t, profile, 2) +// Deprecated: Use StartMultiVolumeClusterAuto(t, profile, 2) directly. +func StartDualVolumeCluster(t testing.TB, profile matrix.Profile) MultiCluster { + return StartMultiVolumeClusterAuto(t, profile, 2) } diff --git a/test/volume_server/framework/cluster_interface.go b/test/volume_server/framework/cluster_interface.go new file mode 100644 index 000000000..875e66675 --- /dev/null +++ b/test/volume_server/framework/cluster_interface.go @@ -0,0 +1,63 @@ +package framework + +import ( + "os" + "testing" + + "github.com/seaweedfs/seaweedfs/test/volume_server/matrix" +) + +// TestCluster is the common interface for single-volume cluster harnesses. +// Both *Cluster (Go volume) and *RustCluster (Rust volume) satisfy it. +type TestCluster interface { + MasterAddress() string + VolumeAdminAddress() string + VolumePublicAddress() string + VolumeGRPCAddress() string + VolumeServerAddress() string + MasterURL() string + VolumeAdminURL() string + VolumePublicURL() string + BaseDir() string + Stop() +} + +func useRustVolumeServer() bool { + return os.Getenv("VOLUME_SERVER_IMPL") == "rust" +} + +// StartVolumeCluster starts a single-volume cluster using either the Go or +// Rust volume server, depending on the VOLUME_SERVER_IMPL environment variable. +// Set VOLUME_SERVER_IMPL=rust to use the Rust volume server. +func StartVolumeCluster(t testing.TB, profile matrix.Profile) TestCluster { + t.Helper() + if useRustVolumeServer() { + return StartRustVolumeCluster(t, profile) + } + return StartSingleVolumeCluster(t, profile) +} + +// MultiCluster is the common interface for multi-volume cluster harnesses. +// Both *MultiVolumeCluster (Go) and *RustMultiVolumeCluster (Rust) satisfy it. +type MultiCluster interface { + MasterAddress() string + MasterURL() string + BaseDir() string + VolumeAdminAddress(index int) string + VolumeAdminURL(index int) string + VolumePublicAddress(index int) string + VolumePublicURL(index int) string + VolumeGRPCAddress(index int) string + Stop() +} + +// StartMultiVolumeClusterAuto starts a multi-volume cluster using either Go or +// Rust volume servers, depending on the VOLUME_SERVER_IMPL environment variable. +// Set VOLUME_SERVER_IMPL=rust to use Rust volume servers. +func StartMultiVolumeClusterAuto(t testing.TB, profile matrix.Profile, count int) MultiCluster { + t.Helper() + if useRustVolumeServer() { + return StartRustMultiVolumeCluster(t, profile, count) + } + return StartMultiVolumeCluster(t, profile, count) +} diff --git a/test/volume_server/framework/cluster_interface_test.go b/test/volume_server/framework/cluster_interface_test.go new file mode 100644 index 000000000..58dceaf56 --- /dev/null +++ b/test/volume_server/framework/cluster_interface_test.go @@ -0,0 +1,20 @@ +package framework + +import "testing" + +func TestUseRustVolumeServer(t *testing.T) { + t.Setenv("VOLUME_SERVER_IMPL", "rust") + if !useRustVolumeServer() { + t.Fatalf("expected rust selection when VOLUME_SERVER_IMPL=rust") + } + + t.Setenv("VOLUME_SERVER_IMPL", "go") + if useRustVolumeServer() { + t.Fatalf("expected go selection when VOLUME_SERVER_IMPL=go") + } + + t.Setenv("VOLUME_SERVER_IMPL", "") + if useRustVolumeServer() { + t.Fatalf("expected go selection when VOLUME_SERVER_IMPL is unset") + } +} diff --git a/test/volume_server/framework/cluster_multi_rust.go b/test/volume_server/framework/cluster_multi_rust.go new file mode 100644 index 000000000..45b9572ae --- /dev/null +++ b/test/volume_server/framework/cluster_multi_rust.go @@ -0,0 +1,289 @@ +package framework + +import ( + "fmt" + "net" + "os" + "os/exec" + "path/filepath" + "strconv" + "sync" + "testing" + + "github.com/seaweedfs/seaweedfs/test/volume_server/matrix" +) + +// RustMultiVolumeCluster wraps a Go master + multiple Rust volume servers +// for integration testing. It mirrors MultiVolumeCluster but uses the Rust +// volume binary instead of the Go weed binary for volume servers. +type RustMultiVolumeCluster struct { + testingTB testing.TB + profile matrix.Profile + + weedBinary string // Go weed binary (for the master) + rustVolumeBinary string // Rust volume binary + + baseDir string + configDir string + logsDir string + keepLogs bool + volumeServerCount int + + masterPort int + masterGrpcPort int + + volumePorts []int + volumeGrpcPorts []int + volumePubPorts []int + + masterCmd *exec.Cmd + volumeCmds []*exec.Cmd + + cleanupOnce sync.Once +} + +// StartRustMultiVolumeCluster starts a cluster with a Go master and the +// specified number of Rust volume servers. +func StartRustMultiVolumeCluster(t testing.TB, profile matrix.Profile, serverCount int) *RustMultiVolumeCluster { + t.Helper() + + if serverCount < 1 { + t.Fatalf("serverCount must be at least 1, got %d", serverCount) + } + + weedBinary, err := FindOrBuildWeedBinary() + if err != nil { + t.Fatalf("resolve weed binary: %v", err) + } + + rustBinary, err := FindOrBuildRustBinary() + if err != nil { + t.Fatalf("resolve rust volume binary: %v", err) + } + + baseDir, keepLogs, err := newWorkDir() + if err != nil { + t.Fatalf("create temp test directory: %v", err) + } + + configDir := filepath.Join(baseDir, "config") + logsDir := filepath.Join(baseDir, "logs") + masterDataDir := filepath.Join(baseDir, "master") + + // Create directories for master and all volume servers + dirs := []string{configDir, logsDir, masterDataDir} + for i := 0; i < serverCount; i++ { + dirs = append(dirs, filepath.Join(baseDir, fmt.Sprintf("volume%d", i))) + } + for _, dir := range dirs { + if mkErr := os.MkdirAll(dir, 0o755); mkErr != nil { + t.Fatalf("create %s: %v", dir, mkErr) + } + } + + if err = writeSecurityConfig(configDir, profile); err != nil { + t.Fatalf("write security config: %v", err) + } + + masterPort, masterGrpcPort, err := allocateMasterPortPair() + if err != nil { + t.Fatalf("allocate master port pair: %v", err) + } + + // Allocate ports for all volume servers (3 ports per server: admin, grpc, public) + // If SplitPublicPort is true, we need an additional port per server + portsPerServer := 3 + if profile.SplitPublicPort { + portsPerServer = 4 + } + totalPorts := serverCount * portsPerServer + ports, err := allocatePorts(totalPorts) + if err != nil { + t.Fatalf("allocate volume ports: %v", err) + } + + c := &RustMultiVolumeCluster{ + testingTB: t, + profile: profile, + weedBinary: weedBinary, + rustVolumeBinary: rustBinary, + baseDir: baseDir, + configDir: configDir, + logsDir: logsDir, + keepLogs: keepLogs, + volumeServerCount: serverCount, + masterPort: masterPort, + masterGrpcPort: masterGrpcPort, + volumePorts: make([]int, serverCount), + volumeGrpcPorts: make([]int, serverCount), + volumePubPorts: make([]int, serverCount), + volumeCmds: make([]*exec.Cmd, serverCount), + } + + // Assign ports to each volume server + for i := 0; i < serverCount; i++ { + baseIdx := i * portsPerServer + c.volumePorts[i] = ports[baseIdx] + c.volumeGrpcPorts[i] = ports[baseIdx+1] + + // Assign public port, using baseIdx+3 if SplitPublicPort, else baseIdx+2 + pubPortIdx := baseIdx + 2 + if profile.SplitPublicPort { + pubPortIdx = baseIdx + 3 + } + c.volumePubPorts[i] = ports[pubPortIdx] + } + + // Start master (Go) + if err = c.startMaster(masterDataDir); err != nil { + c.Stop() + t.Fatalf("start master: %v", err) + } + helper := &Cluster{logsDir: logsDir} + if err = helper.waitForHTTP(c.MasterURL() + "/dir/status"); err != nil { + masterLog := helper.tailLog("master.log") + c.Stop() + t.Fatalf("wait for master readiness: %v\nmaster log tail:\n%s", err, masterLog) + } + + // Start all Rust volume servers + for i := 0; i < serverCount; i++ { + volumeDataDir := filepath.Join(baseDir, fmt.Sprintf("volume%d", i)) + if err = c.startRustVolume(i, volumeDataDir); err != nil { + volumeLog := fmt.Sprintf("volume%d.log", i) + c.Stop() + t.Fatalf("start rust volume server %d: %v\nvolume log tail:\n%s", i, err, helper.tailLog(volumeLog)) + } + if err = helper.waitForHTTP(c.VolumeAdminURL(i) + "/healthz"); err != nil { + volumeLog := fmt.Sprintf("volume%d.log", i) + c.Stop() + t.Fatalf("wait for rust volume server %d readiness: %v\nvolume log tail:\n%s", i, err, helper.tailLog(volumeLog)) + } + if err = helper.waitForTCP(c.VolumeGRPCAddress(i)); err != nil { + volumeLog := fmt.Sprintf("volume%d.log", i) + c.Stop() + t.Fatalf("wait for rust volume server %d grpc readiness: %v\nvolume log tail:\n%s", i, err, helper.tailLog(volumeLog)) + } + } + + t.Cleanup(func() { + c.Stop() + }) + + return c +} + +func (c *RustMultiVolumeCluster) Stop() { + if c == nil { + return + } + c.cleanupOnce.Do(func() { + // Stop volume servers in reverse order + for i := len(c.volumeCmds) - 1; i >= 0; i-- { + stopProcess(c.volumeCmds[i]) + } + stopProcess(c.masterCmd) + if !c.keepLogs && !c.testingTB.Failed() { + _ = os.RemoveAll(c.baseDir) + } else if c.baseDir != "" { + c.testingTB.Logf("rust multi volume server integration logs kept at %s", c.baseDir) + } + }) +} + +func (c *RustMultiVolumeCluster) startMaster(dataDir string) error { + logFile, err := os.Create(filepath.Join(c.logsDir, "master.log")) + if err != nil { + return err + } + + args := []string{ + "-config_dir=" + c.configDir, + "master", + "-ip=127.0.0.1", + "-port=" + strconv.Itoa(c.masterPort), + "-port.grpc=" + strconv.Itoa(c.masterGrpcPort), + "-mdir=" + dataDir, + "-peers=none", + "-volumeSizeLimitMB=" + strconv.Itoa(testVolumeSizeLimitMB), + "-defaultReplication=000", + } + + c.masterCmd = exec.Command(c.weedBinary, args...) + c.masterCmd.Dir = c.baseDir + c.masterCmd.Stdout = logFile + c.masterCmd.Stderr = logFile + return c.masterCmd.Start() +} + +func (c *RustMultiVolumeCluster) startRustVolume(index int, dataDir string) error { + logName := fmt.Sprintf("volume%d.log", index) + logFile, err := os.Create(filepath.Join(c.logsDir, logName)) + if err != nil { + return err + } + + args := rustVolumeArgs( + c.profile, + c.configDir, + c.masterPort, + c.volumePorts[index], + c.volumeGrpcPorts[index], + c.volumePubPorts[index], + dataDir, + ) + + cmd := exec.Command(c.rustVolumeBinary, args...) + cmd.Dir = c.baseDir + cmd.Stdout = logFile + cmd.Stderr = logFile + + if err = cmd.Start(); err != nil { + return err + } + c.volumeCmds[index] = cmd + return nil +} + +// --- accessor methods (mirror MultiVolumeCluster) --- + +func (c *RustMultiVolumeCluster) MasterAddress() string { + return net.JoinHostPort("127.0.0.1", strconv.Itoa(c.masterPort)) +} + +func (c *RustMultiVolumeCluster) MasterURL() string { + return "http://" + c.MasterAddress() +} + +func (c *RustMultiVolumeCluster) VolumeAdminAddress(index int) string { + if index < 0 || index >= len(c.volumePorts) { + return "" + } + return net.JoinHostPort("127.0.0.1", strconv.Itoa(c.volumePorts[index])) +} + +func (c *RustMultiVolumeCluster) VolumePublicAddress(index int) string { + if index < 0 || index >= len(c.volumePubPorts) { + return "" + } + return net.JoinHostPort("127.0.0.1", strconv.Itoa(c.volumePubPorts[index])) +} + +func (c *RustMultiVolumeCluster) VolumeGRPCAddress(index int) string { + if index < 0 || index >= len(c.volumeGrpcPorts) { + return "" + } + return net.JoinHostPort("127.0.0.1", strconv.Itoa(c.volumeGrpcPorts[index])) +} + +func (c *RustMultiVolumeCluster) VolumeAdminURL(index int) string { + return "http://" + c.VolumeAdminAddress(index) +} + +func (c *RustMultiVolumeCluster) VolumePublicURL(index int) string { + return "http://" + c.VolumePublicAddress(index) +} + +func (c *RustMultiVolumeCluster) BaseDir() string { + return c.baseDir +} diff --git a/test/volume_server/framework/cluster_rust.go b/test/volume_server/framework/cluster_rust.go new file mode 100644 index 000000000..5d5f56a14 --- /dev/null +++ b/test/volume_server/framework/cluster_rust.go @@ -0,0 +1,342 @@ +package framework + +import ( + "bytes" + "fmt" + "net" + "os" + "os/exec" + "path/filepath" + "runtime" + "strconv" + "sync" + "testing" + + "github.com/seaweedfs/seaweedfs/test/volume_server/matrix" +) + +// RustCluster wraps a Go master + Rust volume server for integration testing. +type RustCluster struct { + testingTB testing.TB + profile matrix.Profile + + weedBinary string // Go weed binary (for the master) + rustVolumeBinary string // Rust volume binary + + baseDir string + configDir string + logsDir string + keepLogs bool + + masterPort int + masterGrpcPort int + volumePort int + volumeGrpcPort int + volumePubPort int + + masterCmd *exec.Cmd + volumeCmd *exec.Cmd + + cleanupOnce sync.Once +} + +var ( + rustBinaryOnce sync.Once + rustBinaryPath string + rustBinaryErr error +) + +// StartRustVolumeCluster starts a Go master + Rust volume server. +func StartRustVolumeCluster(t testing.TB, profile matrix.Profile) *RustCluster { + t.Helper() + + weedBinary, err := FindOrBuildWeedBinary() + if err != nil { + t.Fatalf("resolve weed binary: %v", err) + } + + rustBinary, err := FindOrBuildRustBinary() + if err != nil { + t.Fatalf("resolve rust volume binary: %v", err) + } + + baseDir, keepLogs, err := newWorkDir() + if err != nil { + t.Fatalf("create temp test directory: %v", err) + } + + configDir := filepath.Join(baseDir, "config") + logsDir := filepath.Join(baseDir, "logs") + masterDataDir := filepath.Join(baseDir, "master") + volumeDataDir := filepath.Join(baseDir, "volume") + for _, dir := range []string{configDir, logsDir, masterDataDir, volumeDataDir} { + if mkErr := os.MkdirAll(dir, 0o755); mkErr != nil { + t.Fatalf("create %s: %v", dir, mkErr) + } + } + + if err = writeSecurityConfig(configDir, profile); err != nil { + t.Fatalf("write security config: %v", err) + } + + masterPort, masterGrpcPort, err := allocateMasterPortPair() + if err != nil { + t.Fatalf("allocate master port pair: %v", err) + } + + ports, err := allocatePorts(3) + if err != nil { + t.Fatalf("allocate ports: %v", err) + } + + rc := &RustCluster{ + testingTB: t, + profile: profile, + weedBinary: weedBinary, + rustVolumeBinary: rustBinary, + baseDir: baseDir, + configDir: configDir, + logsDir: logsDir, + keepLogs: keepLogs, + masterPort: masterPort, + masterGrpcPort: masterGrpcPort, + volumePort: ports[0], + volumeGrpcPort: ports[1], + volumePubPort: ports[0], + } + if profile.SplitPublicPort { + rc.volumePubPort = ports[2] + } + + if err = rc.startMaster(masterDataDir); err != nil { + rc.Stop() + t.Fatalf("start master: %v", err) + } + // Reuse the same HTTP readiness helper via an unexported Cluster shim. + helper := &Cluster{logsDir: logsDir} + if err = helper.waitForHTTP(rc.MasterURL() + "/dir/status"); err != nil { + masterLog := helper.tailLog("master.log") + rc.Stop() + t.Fatalf("wait for master readiness: %v\nmaster log tail:\n%s", err, masterLog) + } + + if err = rc.startRustVolume(volumeDataDir); err != nil { + masterLog := helper.tailLog("master.log") + rc.Stop() + t.Fatalf("start rust volume: %v\nmaster log tail:\n%s", err, masterLog) + } + if err = helper.waitForHTTP(rc.VolumeAdminURL() + "/healthz"); err != nil { + volumeLog := helper.tailLog("volume.log") + rc.Stop() + t.Fatalf("wait for rust volume readiness: %v\nvolume log tail:\n%s", err, volumeLog) + } + if err = helper.waitForTCP(rc.VolumeGRPCAddress()); err != nil { + volumeLog := helper.tailLog("volume.log") + rc.Stop() + t.Fatalf("wait for rust volume grpc readiness: %v\nvolume log tail:\n%s", err, volumeLog) + } + + t.Cleanup(func() { + rc.Stop() + }) + + return rc +} + +// Stop terminates all processes and cleans temporary files. +func (rc *RustCluster) Stop() { + if rc == nil { + return + } + rc.cleanupOnce.Do(func() { + stopProcess(rc.volumeCmd) + stopProcess(rc.masterCmd) + if !rc.keepLogs && !rc.testingTB.Failed() { + _ = os.RemoveAll(rc.baseDir) + } else if rc.baseDir != "" { + rc.testingTB.Logf("rust volume server integration logs kept at %s", rc.baseDir) + } + }) +} + +func (rc *RustCluster) startMaster(dataDir string) error { + logFile, err := os.Create(filepath.Join(rc.logsDir, "master.log")) + if err != nil { + return err + } + + args := []string{ + "-config_dir=" + rc.configDir, + "master", + "-ip=127.0.0.1", + "-port=" + strconv.Itoa(rc.masterPort), + "-port.grpc=" + strconv.Itoa(rc.masterGrpcPort), + "-mdir=" + dataDir, + "-peers=none", + "-volumeSizeLimitMB=" + strconv.Itoa(testVolumeSizeLimitMB), + "-defaultReplication=000", + } + + rc.masterCmd = exec.Command(rc.weedBinary, args...) + rc.masterCmd.Dir = rc.baseDir + rc.masterCmd.Stdout = logFile + rc.masterCmd.Stderr = logFile + return rc.masterCmd.Start() +} + +func rustVolumeArgs( + profile matrix.Profile, + configDir string, + masterPort int, + volumePort int, + volumeGrpcPort int, + volumePubPort int, + dataDir string, +) []string { + args := []string{ + "--port", strconv.Itoa(volumePort), + "--port.grpc", strconv.Itoa(volumeGrpcPort), + "--port.public", strconv.Itoa(volumePubPort), + "--ip", "127.0.0.1", + "--ip.bind", "127.0.0.1", + "--dir", dataDir, + "--max", "16", + "--master", "127.0.0.1:" + strconv.Itoa(masterPort), + "--securityFile", filepath.Join(configDir, "security.toml"), + "--readMode", profile.ReadMode, + "--concurrentUploadLimitMB", strconv.Itoa(profile.ConcurrentUploadLimitMB), + "--concurrentDownloadLimitMB", strconv.Itoa(profile.ConcurrentDownloadLimitMB), + "--preStopSeconds", "0", + } + if profile.InflightUploadTimeout > 0 { + args = append(args, "--inflightUploadDataTimeout", profile.InflightUploadTimeout.String()) + } + if profile.InflightDownloadTimeout > 0 { + args = append(args, "--inflightDownloadDataTimeout", profile.InflightDownloadTimeout.String()) + } + return args +} + +func (rc *RustCluster) startRustVolume(dataDir string) error { + logFile, err := os.Create(filepath.Join(rc.logsDir, "volume.log")) + if err != nil { + return err + } + + args := rustVolumeArgs( + rc.profile, + rc.configDir, + rc.masterPort, + rc.volumePort, + rc.volumeGrpcPort, + rc.volumePubPort, + dataDir, + ) + + rc.volumeCmd = exec.Command(rc.rustVolumeBinary, args...) + rc.volumeCmd.Dir = rc.baseDir + rc.volumeCmd.Stdout = logFile + rc.volumeCmd.Stderr = logFile + return rc.volumeCmd.Start() +} + +// FindOrBuildRustBinary returns an executable Rust volume binary, building one when needed. +func FindOrBuildRustBinary() (string, error) { + if fromEnv := os.Getenv("RUST_VOLUME_BINARY"); fromEnv != "" { + if isExecutableFile(fromEnv) { + return fromEnv, nil + } + return "", fmt.Errorf("RUST_VOLUME_BINARY is set but not executable: %s", fromEnv) + } + + rustBinaryOnce.Do(func() { + // Derive the Rust volume crate directory from this source file's location. + rustCrateDir := "" + if _, file, _, ok := runtime.Caller(0); ok { + repoRoot := filepath.Clean(filepath.Join(filepath.Dir(file), "..", "..", "..")) + for _, candidate := range []string{"seaweed-volume", "weed-volume"} { + dir := filepath.Join(repoRoot, candidate) + if isDir(dir) && isFile(filepath.Join(dir, "Cargo.toml")) { + rustCrateDir = dir + break + } + } + } + if rustCrateDir == "" { + rustBinaryErr = fmt.Errorf("unable to detect Rust volume crate directory") + return + } + + releaseBin := filepath.Join(rustCrateDir, "target", "release", "weed-volume") + + // Always rebuild once per test process so the harness uses current source and features. + cmd := exec.Command("cargo", "build", "--release") + cmd.Dir = rustCrateDir + var out bytes.Buffer + cmd.Stdout = &out + cmd.Stderr = &out + if err := cmd.Run(); err != nil { + rustBinaryErr = fmt.Errorf("build rust volume binary: %w\n%s", err, out.String()) + return + } + if !isExecutableFile(releaseBin) { + rustBinaryErr = fmt.Errorf("built rust volume binary is not executable: %s", releaseBin) + return + } + rustBinaryPath = releaseBin + }) + + if rustBinaryErr != nil { + return "", rustBinaryErr + } + return rustBinaryPath, nil +} + +func isDir(path string) bool { + info, err := os.Stat(path) + return err == nil && info.IsDir() +} + +func isFile(path string) bool { + info, err := os.Stat(path) + return err == nil && info.Mode().IsRegular() +} + +// --- accessor methods (mirror Cluster) --- + +func (rc *RustCluster) MasterAddress() string { + return net.JoinHostPort("127.0.0.1", strconv.Itoa(rc.masterPort)) +} + +func (rc *RustCluster) VolumeAdminAddress() string { + return net.JoinHostPort("127.0.0.1", strconv.Itoa(rc.volumePort)) +} + +func (rc *RustCluster) VolumePublicAddress() string { + return net.JoinHostPort("127.0.0.1", strconv.Itoa(rc.volumePubPort)) +} + +func (rc *RustCluster) VolumeGRPCAddress() string { + return net.JoinHostPort("127.0.0.1", strconv.Itoa(rc.volumeGrpcPort)) +} + +// VolumeServerAddress returns SeaweedFS server address format: ip:httpPort.grpcPort +func (rc *RustCluster) VolumeServerAddress() string { + return fmt.Sprintf("%s.%d", rc.VolumeAdminAddress(), rc.volumeGrpcPort) +} + +func (rc *RustCluster) MasterURL() string { + return "http://" + rc.MasterAddress() +} + +func (rc *RustCluster) VolumeAdminURL() string { + return "http://" + rc.VolumeAdminAddress() +} + +func (rc *RustCluster) VolumePublicURL() string { + return "http://" + rc.VolumePublicAddress() +} + +func (rc *RustCluster) BaseDir() string { + return rc.baseDir +} diff --git a/test/volume_server/framework/cluster_rust_test.go b/test/volume_server/framework/cluster_rust_test.go new file mode 100644 index 000000000..f2558753a --- /dev/null +++ b/test/volume_server/framework/cluster_rust_test.go @@ -0,0 +1,38 @@ +package framework + +import ( + "testing" + "time" + + "github.com/seaweedfs/seaweedfs/test/volume_server/matrix" +) + +func TestRustVolumeArgsIncludeReadMode(t *testing.T) { + profile := matrix.P1() + profile.ReadMode = "redirect" + profile.ConcurrentUploadLimitMB = 7 + profile.ConcurrentDownloadLimitMB = 9 + profile.InflightUploadTimeout = 3 * time.Second + profile.InflightDownloadTimeout = 4 * time.Second + + args := rustVolumeArgs(profile, "/tmp/config", 9333, 18080, 28080, 38080, "/tmp/data") + + assertArgPair(t, args, "--readMode", "redirect") + assertArgPair(t, args, "--concurrentUploadLimitMB", "7") + assertArgPair(t, args, "--concurrentDownloadLimitMB", "9") + assertArgPair(t, args, "--inflightUploadDataTimeout", "3s") + assertArgPair(t, args, "--inflightDownloadDataTimeout", "4s") +} + +func assertArgPair(t *testing.T, args []string, flag string, want string) { + t.Helper() + for i := 0; i+1 < len(args); i += 2 { + if args[i] == flag { + if args[i+1] != want { + t.Fatalf("%s value mismatch: got %q want %q", flag, args[i+1], want) + } + return + } + } + t.Fatalf("missing %s in args: %v", flag, args) +} diff --git a/test/volume_server/grpc/admin_extra_test.go b/test/volume_server/grpc/admin_extra_test.go index de62fcdb8..85afa1ade 100644 --- a/test/volume_server/grpc/admin_extra_test.go +++ b/test/volume_server/grpc/admin_extra_test.go @@ -2,6 +2,7 @@ package volume_server_grpc_test import ( "context" + "io" "net/http" "strings" "testing" @@ -18,7 +19,7 @@ func TestVolumeNeedleStatusForUploadedFile(t *testing.T) { t.Skip("skipping integration test in short mode") } - clusterHarness := framework.StartSingleVolumeCluster(t, matrix.P1()) + clusterHarness := framework.StartVolumeCluster(t, matrix.P1()) conn, grpcClient := framework.DialVolumeServer(t, clusterHarness.VolumeGRPCAddress()) defer conn.Close() @@ -57,12 +58,57 @@ func TestVolumeNeedleStatusForUploadedFile(t *testing.T) { } } +func TestVolumeNeedleStatusIncludesTtlAndLastModified(t *testing.T) { + if testing.Short() { + t.Skip("skipping integration test in short mode") + } + + clusterHarness := framework.StartVolumeCluster(t, matrix.P1()) + conn, grpcClient := framework.DialVolumeServer(t, clusterHarness.VolumeGRPCAddress()) + defer conn.Close() + + const volumeID = uint32(27) + const needleID = uint64(778901) + const cookie = uint32(0xA1B2C3D6) + framework.AllocateVolume(t, grpcClient, volumeID, "") + + fid := framework.NewFileID(volumeID, needleID, cookie) + client := framework.NewHTTPClient() + uploadReq := mustNewRequest(t, http.MethodPost, clusterHarness.VolumeAdminURL()+"/"+fid+"?ttl=7d&ts=1700000000") + uploadReq.Body = io.NopCloser(strings.NewReader("needle-status-ttl-payload")) + uploadReq.ContentLength = int64(len("needle-status-ttl-payload")) + uploadReq.Header.Set("Content-Type", "application/octet-stream") + uploadResp := framework.DoRequest(t, client, uploadReq) + _ = framework.ReadAllAndClose(t, uploadResp) + if uploadResp.StatusCode != http.StatusCreated { + t.Fatalf("upload status: expected 201, got %d", uploadResp.StatusCode) + } + + ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second) + defer cancel() + + statusResp, err := grpcClient.VolumeNeedleStatus(ctx, &volume_server_pb.VolumeNeedleStatusRequest{ + VolumeId: volumeID, + NeedleId: needleID, + }) + if err != nil { + t.Fatalf("VolumeNeedleStatus with ttl failed: %v", err) + } + // Go's ReadTTL normalizes via fitTtlCount: 7d → 1w (7 days = 1 week) + if statusResp.GetTtl() != "1w" { + t.Fatalf("ttl mismatch: got %q want %q", statusResp.GetTtl(), "1w") + } + if statusResp.GetLastModified() != 1700000000 { + t.Fatalf("last modified mismatch: got %d want %d", statusResp.GetLastModified(), 1700000000) + } +} + func TestVolumeNeedleStatusViaEcShardsWhenNormalVolumeUnmounted(t *testing.T) { if testing.Short() { t.Skip("skipping integration test in short mode") } - clusterHarness := framework.StartSingleVolumeCluster(t, matrix.P1()) + clusterHarness := framework.StartVolumeCluster(t, matrix.P1()) conn, grpcClient := framework.DialVolumeServer(t, clusterHarness.VolumeGRPCAddress()) defer conn.Close() @@ -138,7 +184,7 @@ func TestVolumeNeedleStatusMissingVolumeAndNeedle(t *testing.T) { t.Skip("skipping integration test in short mode") } - clusterHarness := framework.StartSingleVolumeCluster(t, matrix.P1()) + clusterHarness := framework.StartVolumeCluster(t, matrix.P1()) conn, grpcClient := framework.DialVolumeServer(t, clusterHarness.VolumeGRPCAddress()) defer conn.Close() @@ -185,7 +231,7 @@ func TestVolumeConfigureInvalidReplication(t *testing.T) { t.Skip("skipping integration test in short mode") } - clusterHarness := framework.StartSingleVolumeCluster(t, matrix.P1()) + clusterHarness := framework.StartVolumeCluster(t, matrix.P1()) conn, grpcClient := framework.DialVolumeServer(t, clusterHarness.VolumeGRPCAddress()) defer conn.Close() @@ -215,7 +261,7 @@ func TestVolumeConfigureSuccessAndMissingRollbackPath(t *testing.T) { t.Skip("skipping integration test in short mode") } - clusterHarness := framework.StartSingleVolumeCluster(t, matrix.P1()) + clusterHarness := framework.StartVolumeCluster(t, matrix.P1()) conn, grpcClient := framework.DialVolumeServer(t, clusterHarness.VolumeGRPCAddress()) defer conn.Close() @@ -268,7 +314,7 @@ func TestPingVolumeTargetAndLeaveAffectsHealthz(t *testing.T) { t.Skip("skipping integration test in short mode") } - clusterHarness := framework.StartSingleVolumeCluster(t, matrix.P1()) + clusterHarness := framework.StartVolumeCluster(t, matrix.P1()) conn, grpcClient := framework.DialVolumeServer(t, clusterHarness.VolumeGRPCAddress()) defer conn.Close() @@ -311,7 +357,7 @@ func TestVolumeServerLeaveIsIdempotent(t *testing.T) { t.Skip("skipping integration test in short mode") } - clusterHarness := framework.StartSingleVolumeCluster(t, matrix.P1()) + clusterHarness := framework.StartVolumeCluster(t, matrix.P1()) conn, grpcClient := framework.DialVolumeServer(t, clusterHarness.VolumeGRPCAddress()) defer conn.Close() @@ -346,7 +392,7 @@ func TestPingUnknownAndUnreachableTargetPaths(t *testing.T) { t.Skip("skipping integration test in short mode") } - clusterHarness := framework.StartSingleVolumeCluster(t, matrix.P1()) + clusterHarness := framework.StartVolumeCluster(t, matrix.P1()) conn, grpcClient := framework.DialVolumeServer(t, clusterHarness.VolumeGRPCAddress()) defer conn.Close() @@ -367,6 +413,17 @@ func TestPingUnknownAndUnreachableTargetPaths(t *testing.T) { t.Fatalf("Ping unknown target type expected stop_time_ns >= start_time_ns") } + emptyTargetResp, err := grpcClient.Ping(ctx, &volume_server_pb.PingRequest{}) + if err != nil { + t.Fatalf("Ping empty target should not return grpc error, got: %v", err) + } + if emptyTargetResp.GetRemoteTimeNs() != 0 { + t.Fatalf("Ping empty target expected remote_time_ns=0, got %d", emptyTargetResp.GetRemoteTimeNs()) + } + if emptyTargetResp.GetStopTimeNs() < emptyTargetResp.GetStartTimeNs() { + t.Fatalf("Ping empty target expected stop_time_ns >= start_time_ns") + } + _, err = grpcClient.Ping(ctx, &volume_server_pb.PingRequest{ TargetType: cluster.MasterType, Target: "127.0.0.1:1", @@ -395,7 +452,7 @@ func TestPingMasterTargetSuccess(t *testing.T) { t.Skip("skipping integration test in short mode") } - clusterHarness := framework.StartSingleVolumeCluster(t, matrix.P1()) + clusterHarness := framework.StartVolumeCluster(t, matrix.P1()) conn, grpcClient := framework.DialVolumeServer(t, clusterHarness.VolumeGRPCAddress()) defer conn.Close() diff --git a/test/volume_server/grpc/admin_lifecycle_test.go b/test/volume_server/grpc/admin_lifecycle_test.go index bdc4e5a45..7ec5d64ef 100644 --- a/test/volume_server/grpc/admin_lifecycle_test.go +++ b/test/volume_server/grpc/admin_lifecycle_test.go @@ -19,7 +19,7 @@ func TestVolumeAdminLifecycleRPCs(t *testing.T) { t.Skip("skipping integration test in short mode") } - cluster := framework.StartSingleVolumeCluster(t, matrix.P1()) + cluster := framework.StartVolumeCluster(t, matrix.P1()) conn, client := framework.DialVolumeServer(t, cluster.VolumeGRPCAddress()) defer conn.Close() @@ -62,7 +62,7 @@ func TestVolumeDeleteOnlyEmptyVariants(t *testing.T) { t.Skip("skipping integration test in short mode") } - clusterHarness := framework.StartSingleVolumeCluster(t, matrix.P1()) + clusterHarness := framework.StartVolumeCluster(t, matrix.P1()) conn, grpcClient := framework.DialVolumeServer(t, clusterHarness.VolumeGRPCAddress()) defer conn.Close() @@ -101,7 +101,7 @@ func TestMaintenanceModeRejectsAllocateVolume(t *testing.T) { t.Skip("skipping integration test in short mode") } - cluster := framework.StartSingleVolumeCluster(t, matrix.P1()) + cluster := framework.StartVolumeCluster(t, matrix.P1()) conn, client := framework.DialVolumeServer(t, cluster.VolumeGRPCAddress()) defer conn.Close() @@ -133,7 +133,7 @@ func TestAllocateDuplicateAndMountUnmountMissingVariants(t *testing.T) { t.Skip("skipping integration test in short mode") } - cluster := framework.StartSingleVolumeCluster(t, matrix.P1()) + cluster := framework.StartVolumeCluster(t, matrix.P1()) conn, client := framework.DialVolumeServer(t, cluster.VolumeGRPCAddress()) defer conn.Close() @@ -184,7 +184,7 @@ func TestMaintenanceModeRejectsVolumeDelete(t *testing.T) { t.Skip("skipping integration test in short mode") } - cluster := framework.StartSingleVolumeCluster(t, matrix.P1()) + cluster := framework.StartVolumeCluster(t, matrix.P1()) conn, client := framework.DialVolumeServer(t, cluster.VolumeGRPCAddress()) defer conn.Close() diff --git a/test/volume_server/grpc/admin_readonly_collection_test.go b/test/volume_server/grpc/admin_readonly_collection_test.go index 36d2d2f31..5f9679f2e 100644 --- a/test/volume_server/grpc/admin_readonly_collection_test.go +++ b/test/volume_server/grpc/admin_readonly_collection_test.go @@ -16,7 +16,7 @@ func TestVolumeMarkReadonlyAndWritableLifecycle(t *testing.T) { t.Skip("skipping integration test in short mode") } - clusterHarness := framework.StartSingleVolumeCluster(t, matrix.P1()) + clusterHarness := framework.StartVolumeCluster(t, matrix.P1()) conn, grpcClient := framework.DialVolumeServer(t, clusterHarness.VolumeGRPCAddress()) defer conn.Close() @@ -61,7 +61,7 @@ func TestVolumeMarkReadonlyPersistTrue(t *testing.T) { t.Skip("skipping integration test in short mode") } - clusterHarness := framework.StartSingleVolumeCluster(t, matrix.P1()) + clusterHarness := framework.StartVolumeCluster(t, matrix.P1()) conn, grpcClient := framework.DialVolumeServer(t, clusterHarness.VolumeGRPCAddress()) defer conn.Close() @@ -93,7 +93,7 @@ func TestVolumeMarkReadonlyWritableErrorPaths(t *testing.T) { t.Skip("skipping integration test in short mode") } - clusterHarness := framework.StartSingleVolumeCluster(t, matrix.P1()) + clusterHarness := framework.StartVolumeCluster(t, matrix.P1()) conn, grpcClient := framework.DialVolumeServer(t, clusterHarness.VolumeGRPCAddress()) defer conn.Close() @@ -146,7 +146,7 @@ func TestDeleteCollectionRemovesVolumeAndIsIdempotent(t *testing.T) { t.Skip("skipping integration test in short mode") } - clusterHarness := framework.StartSingleVolumeCluster(t, matrix.P1()) + clusterHarness := framework.StartVolumeCluster(t, matrix.P1()) conn, grpcClient := framework.DialVolumeServer(t, clusterHarness.VolumeGRPCAddress()) defer conn.Close() diff --git a/test/volume_server/grpc/batch_delete_test.go b/test/volume_server/grpc/batch_delete_test.go index b02d4ea27..4fc822fee 100644 --- a/test/volume_server/grpc/batch_delete_test.go +++ b/test/volume_server/grpc/batch_delete_test.go @@ -18,7 +18,7 @@ func TestBatchDeleteInvalidFidAndMaintenanceMode(t *testing.T) { t.Skip("skipping integration test in short mode") } - cluster := framework.StartSingleVolumeCluster(t, matrix.P1()) + cluster := framework.StartVolumeCluster(t, matrix.P1()) conn, client := framework.DialVolumeServer(t, cluster.VolumeGRPCAddress()) defer conn.Close() @@ -61,7 +61,7 @@ func TestBatchDeleteCookieMismatchAndSkipCheck(t *testing.T) { t.Skip("skipping integration test in short mode") } - cluster := framework.StartSingleVolumeCluster(t, matrix.P1()) + cluster := framework.StartVolumeCluster(t, matrix.P1()) conn, client := framework.DialVolumeServer(t, cluster.VolumeGRPCAddress()) defer conn.Close() @@ -124,7 +124,7 @@ func TestBatchDeleteMixedStatusesAndMismatchStopsProcessing(t *testing.T) { t.Skip("skipping integration test in short mode") } - cluster := framework.StartSingleVolumeCluster(t, matrix.P1()) + cluster := framework.StartVolumeCluster(t, matrix.P1()) conn, client := framework.DialVolumeServer(t, cluster.VolumeGRPCAddress()) defer conn.Close() @@ -219,7 +219,7 @@ func TestBatchDeleteRejectsChunkManifestNeedles(t *testing.T) { t.Skip("skipping integration test in short mode") } - cluster := framework.StartSingleVolumeCluster(t, matrix.P1()) + cluster := framework.StartVolumeCluster(t, matrix.P1()) conn, client := framework.DialVolumeServer(t, cluster.VolumeGRPCAddress()) defer conn.Close() diff --git a/test/volume_server/grpc/copy_receive_variants_test.go b/test/volume_server/grpc/copy_receive_variants_test.go index 14d9cee72..3a82822b2 100644 --- a/test/volume_server/grpc/copy_receive_variants_test.go +++ b/test/volume_server/grpc/copy_receive_variants_test.go @@ -4,6 +4,7 @@ import ( "context" "io" "math" + "net/http" "strings" "testing" "time" @@ -18,7 +19,7 @@ func TestVolumeIncrementalCopyDataAndNoDataPaths(t *testing.T) { t.Skip("skipping integration test in short mode") } - clusterHarness := framework.StartSingleVolumeCluster(t, matrix.P1()) + clusterHarness := framework.StartVolumeCluster(t, matrix.P1()) conn, grpcClient := framework.DialVolumeServer(t, clusterHarness.VolumeGRPCAddress()) defer conn.Close() @@ -77,7 +78,7 @@ func TestCopyFileIgnoreNotFoundAndStopOffsetZeroPaths(t *testing.T) { t.Skip("skipping integration test in short mode") } - clusterHarness := framework.StartSingleVolumeCluster(t, matrix.P1()) + clusterHarness := framework.StartVolumeCluster(t, matrix.P1()) conn, grpcClient := framework.DialVolumeServer(t, clusterHarness.VolumeGRPCAddress()) defer conn.Close() @@ -132,12 +133,70 @@ func TestCopyFileIgnoreNotFoundAndStopOffsetZeroPaths(t *testing.T) { } } +func TestCopyFileStopOffsetZeroExistingFileSendsMetadata(t *testing.T) { + if testing.Short() { + t.Skip("skipping integration test in short mode") + } + + clusterHarness := framework.StartVolumeCluster(t, matrix.P1()) + conn, grpcClient := framework.DialVolumeServer(t, clusterHarness.VolumeGRPCAddress()) + defer conn.Close() + + const volumeID = uint32(93) + const needleID = uint64(770101) + const cookie = uint32(0x1234ABCD) + framework.AllocateVolume(t, grpcClient, volumeID, "") + + client := framework.NewHTTPClient() + uploadResp := framework.UploadBytes( + t, + client, + clusterHarness.VolumeAdminURL(), + framework.NewFileID(volumeID, needleID, cookie), + []byte("copy-file-stop-zero"), + ) + _ = framework.ReadAllAndClose(t, uploadResp) + if uploadResp.StatusCode != http.StatusCreated { + t.Fatalf("upload expected 201, got %d", uploadResp.StatusCode) + } + + ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second) + defer cancel() + + stream, err := grpcClient.CopyFile(ctx, &volume_server_pb.CopyFileRequest{ + VolumeId: volumeID, + Ext: ".dat", + CompactionRevision: math.MaxUint32, + StopOffset: 0, + IgnoreSourceFileNotFound: false, + }) + if err != nil { + t.Fatalf("CopyFile stop_offset=0 existing file start failed: %v", err) + } + + msg, err := stream.Recv() + if err != nil { + t.Fatalf("CopyFile stop_offset=0 existing file recv failed: %v", err) + } + if len(msg.GetFileContent()) != 0 { + t.Fatalf("CopyFile stop_offset=0 existing file should not send content, got %d bytes", len(msg.GetFileContent())) + } + if msg.GetModifiedTsNs() == 0 { + t.Fatalf("CopyFile stop_offset=0 existing file expected non-zero ModifiedTsNs") + } + + _, err = stream.Recv() + if err != io.EOF { + t.Fatalf("CopyFile stop_offset=0 existing file expected EOF after metadata frame, got: %v", err) + } +} + func TestCopyFileCompactionRevisionMismatch(t *testing.T) { if testing.Short() { t.Skip("skipping integration test in short mode") } - clusterHarness := framework.StartSingleVolumeCluster(t, matrix.P1()) + clusterHarness := framework.StartVolumeCluster(t, matrix.P1()) conn, grpcClient := framework.DialVolumeServer(t, clusterHarness.VolumeGRPCAddress()) defer conn.Close() @@ -166,7 +225,7 @@ func TestReceiveFileProtocolViolationResponses(t *testing.T) { t.Skip("skipping integration test in short mode") } - clusterHarness := framework.StartSingleVolumeCluster(t, matrix.P1()) + clusterHarness := framework.StartVolumeCluster(t, matrix.P1()) conn, grpcClient := framework.DialVolumeServer(t, clusterHarness.VolumeGRPCAddress()) defer conn.Close() @@ -213,7 +272,7 @@ func TestReceiveFileSuccessForRegularVolume(t *testing.T) { t.Skip("skipping integration test in short mode") } - clusterHarness := framework.StartSingleVolumeCluster(t, matrix.P1()) + clusterHarness := framework.StartVolumeCluster(t, matrix.P1()) conn, grpcClient := framework.DialVolumeServer(t, clusterHarness.VolumeGRPCAddress()) defer conn.Close() @@ -299,7 +358,7 @@ func TestReceiveFileSuccessForEcVolume(t *testing.T) { t.Skip("skipping integration test in short mode") } - clusterHarness := framework.StartSingleVolumeCluster(t, matrix.P1()) + clusterHarness := framework.StartVolumeCluster(t, matrix.P1()) conn, grpcClient := framework.DialVolumeServer(t, clusterHarness.VolumeGRPCAddress()) defer conn.Close() @@ -389,7 +448,7 @@ func TestCopyFileEcVolumeIgnoreMissingSourcePaths(t *testing.T) { t.Skip("skipping integration test in short mode") } - clusterHarness := framework.StartSingleVolumeCluster(t, matrix.P1()) + clusterHarness := framework.StartVolumeCluster(t, matrix.P1()) conn, grpcClient := framework.DialVolumeServer(t, clusterHarness.VolumeGRPCAddress()) defer conn.Close() diff --git a/test/volume_server/grpc/copy_sync_test.go b/test/volume_server/grpc/copy_sync_test.go index 3c2916fd0..810395fb6 100644 --- a/test/volume_server/grpc/copy_sync_test.go +++ b/test/volume_server/grpc/copy_sync_test.go @@ -18,12 +18,19 @@ func TestVolumeSyncStatusAndReadVolumeFileStatus(t *testing.T) { t.Skip("skipping integration test in short mode") } - clusterHarness := framework.StartSingleVolumeCluster(t, matrix.P1()) + clusterHarness := framework.StartVolumeCluster(t, matrix.P1()) conn, grpcClient := framework.DialVolumeServer(t, clusterHarness.VolumeGRPCAddress()) defer conn.Close() + httpClient := framework.NewHTTPClient() const volumeID = uint32(41) framework.AllocateVolume(t, grpcClient, volumeID, "") + fid := framework.NewFileID(volumeID, 1, 0x11112222) + uploadResp := framework.UploadBytes(t, httpClient, clusterHarness.VolumeAdminURL(), fid, []byte("sync-status-payload")) + _ = framework.ReadAllAndClose(t, uploadResp) + if uploadResp.StatusCode != http.StatusCreated { + t.Fatalf("upload expected 201, got %d", uploadResp.StatusCode) + } ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second) defer cancel() @@ -46,6 +53,12 @@ func TestVolumeSyncStatusAndReadVolumeFileStatus(t *testing.T) { if statusResp.GetVersion() == 0 { t.Fatalf("ReadVolumeFileStatus expected non-zero version") } + if syncResp.GetTailOffset() == 0 { + t.Fatalf("VolumeSyncStatus expected non-zero tail offset after upload") + } + if syncResp.GetTailOffset() != statusResp.GetDatFileSize() { + t.Fatalf("VolumeSyncStatus tail offset mismatch: got %d want %d", syncResp.GetTailOffset(), statusResp.GetDatFileSize()) + } } func TestCopyAndStreamMethodsMissingVolumePaths(t *testing.T) { @@ -53,7 +66,7 @@ func TestCopyAndStreamMethodsMissingVolumePaths(t *testing.T) { t.Skip("skipping integration test in short mode") } - clusterHarness := framework.StartSingleVolumeCluster(t, matrix.P1()) + clusterHarness := framework.StartVolumeCluster(t, matrix.P1()) conn, grpcClient := framework.DialVolumeServer(t, clusterHarness.VolumeGRPCAddress()) defer conn.Close() @@ -100,7 +113,7 @@ func TestVolumeCopyAndReceiveFileMaintenanceRejection(t *testing.T) { t.Skip("skipping integration test in short mode") } - clusterHarness := framework.StartSingleVolumeCluster(t, matrix.P1()) + clusterHarness := framework.StartVolumeCluster(t, matrix.P1()) conn, grpcClient := framework.DialVolumeServer(t, clusterHarness.VolumeGRPCAddress()) defer conn.Close() diff --git a/test/volume_server/grpc/data_rw_test.go b/test/volume_server/grpc/data_rw_test.go index 43969532d..b7701b9b2 100644 --- a/test/volume_server/grpc/data_rw_test.go +++ b/test/volume_server/grpc/data_rw_test.go @@ -16,7 +16,7 @@ func TestReadNeedleBlobAndMetaMissingVolume(t *testing.T) { t.Skip("skipping integration test in short mode") } - clusterHarness := framework.StartSingleVolumeCluster(t, matrix.P1()) + clusterHarness := framework.StartVolumeCluster(t, matrix.P1()) conn, grpcClient := framework.DialVolumeServer(t, clusterHarness.VolumeGRPCAddress()) defer conn.Close() @@ -54,7 +54,7 @@ func TestWriteNeedleBlobMaintenanceAndMissingVolume(t *testing.T) { t.Skip("skipping integration test in short mode") } - clusterHarness := framework.StartSingleVolumeCluster(t, matrix.P1()) + clusterHarness := framework.StartVolumeCluster(t, matrix.P1()) conn, grpcClient := framework.DialVolumeServer(t, clusterHarness.VolumeGRPCAddress()) defer conn.Close() @@ -104,7 +104,7 @@ func TestReadNeedleBlobAndMetaInvalidOffsets(t *testing.T) { t.Skip("skipping integration test in short mode") } - clusterHarness := framework.StartSingleVolumeCluster(t, matrix.P1()) + clusterHarness := framework.StartVolumeCluster(t, matrix.P1()) conn, grpcClient := framework.DialVolumeServer(t, clusterHarness.VolumeGRPCAddress()) defer conn.Close() diff --git a/test/volume_server/grpc/data_stream_success_test.go b/test/volume_server/grpc/data_stream_success_test.go index 90f2a8248..4297068a3 100644 --- a/test/volume_server/grpc/data_stream_success_test.go +++ b/test/volume_server/grpc/data_stream_success_test.go @@ -3,6 +3,8 @@ package volume_server_grpc_test import ( "context" "io" + "net/http" + "reflect" "strings" "testing" "time" @@ -19,7 +21,7 @@ func TestReadWriteNeedleBlobAndMetaRoundTrip(t *testing.T) { t.Skip("skipping integration test in short mode") } - clusterHarness := framework.StartSingleVolumeCluster(t, matrix.P1()) + clusterHarness := framework.StartVolumeCluster(t, matrix.P1()) conn, grpcClient := framework.DialVolumeServer(t, clusterHarness.VolumeGRPCAddress()) defer conn.Close() @@ -122,7 +124,7 @@ func TestReadAllNeedlesStreamsUploadedRecords(t *testing.T) { t.Skip("skipping integration test in short mode") } - clusterHarness := framework.StartSingleVolumeCluster(t, matrix.P1()) + clusterHarness := framework.StartVolumeCluster(t, matrix.P1()) conn, grpcClient := framework.DialVolumeServer(t, clusterHarness.VolumeGRPCAddress()) defer conn.Close() @@ -180,7 +182,7 @@ func TestReadAllNeedlesExistingThenMissingVolumeAbortsStream(t *testing.T) { t.Skip("skipping integration test in short mode") } - clusterHarness := framework.StartSingleVolumeCluster(t, matrix.P1()) + clusterHarness := framework.StartVolumeCluster(t, matrix.P1()) conn, grpcClient := framework.DialVolumeServer(t, clusterHarness.VolumeGRPCAddress()) defer conn.Close() @@ -230,6 +232,139 @@ func TestReadAllNeedlesExistingThenMissingVolumeAbortsStream(t *testing.T) { } } +func TestReadAllNeedlesPreservesDatOrderAcrossOverwrite(t *testing.T) { + if testing.Short() { + t.Skip("skipping integration test in short mode") + } + + clusterHarness := framework.StartVolumeCluster(t, matrix.P1()) + conn, grpcClient := framework.DialVolumeServer(t, clusterHarness.VolumeGRPCAddress()) + defer conn.Close() + + const volumeID = uint32(86) + const firstNeedleID = uint64(444551) + const secondNeedleID = uint64(444552) + const firstCookie = uint32(0xAA22BB33) + const secondCookie = uint32(0xCC44DD55) + framework.AllocateVolume(t, grpcClient, volumeID, "") + + client := framework.NewHTTPClient() + uploads := []struct { + fid string + body string + }{ + {fid: framework.NewFileID(volumeID, firstNeedleID, firstCookie), body: "read-all-first"}, + {fid: framework.NewFileID(volumeID, secondNeedleID, secondCookie), body: "read-all-second"}, + {fid: framework.NewFileID(volumeID, firstNeedleID, firstCookie), body: "read-all-first-overwrite"}, + } + for _, upload := range uploads { + resp := framework.UploadBytes(t, client, clusterHarness.VolumeAdminURL(), upload.fid, []byte(upload.body)) + _ = framework.ReadAllAndClose(t, resp) + if resp.StatusCode != 201 { + t.Fatalf("upload for %s expected 201, got %d", upload.fid, resp.StatusCode) + } + } + + ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second) + defer cancel() + + stream, err := grpcClient.ReadAllNeedles(ctx, &volume_server_pb.ReadAllNeedlesRequest{VolumeIds: []uint32{volumeID}}) + if err != nil { + t.Fatalf("ReadAllNeedles start failed: %v", err) + } + + var orderedIDs []uint64 + var orderedBodies []string + for { + msg, recvErr := stream.Recv() + if recvErr == io.EOF { + break + } + if recvErr != nil { + t.Fatalf("ReadAllNeedles recv failed: %v", recvErr) + } + orderedIDs = append(orderedIDs, msg.GetNeedleId()) + orderedBodies = append(orderedBodies, string(msg.GetNeedleBlob())) + } + + wantIDs := []uint64{secondNeedleID, firstNeedleID} + wantBodies := []string{"read-all-second", "read-all-first-overwrite"} + if !reflect.DeepEqual(orderedIDs, wantIDs) { + t.Fatalf("ReadAllNeedles order mismatch: got %v want %v", orderedIDs, wantIDs) + } + if !reflect.DeepEqual(orderedBodies, wantBodies) { + t.Fatalf("ReadAllNeedles bodies mismatch: got %v want %v", orderedBodies, wantBodies) + } +} + +func TestReadNeedleMetaDeletedEntryUsesTombstoneMetadata(t *testing.T) { + if testing.Short() { + t.Skip("skipping integration test in short mode") + } + + clusterHarness := framework.StartVolumeCluster(t, matrix.P1()) + conn, grpcClient := framework.DialVolumeServer(t, clusterHarness.VolumeGRPCAddress()) + defer conn.Close() + + const volumeID = uint32(87) + const needleID = uint64(444661) + const cookie = uint32(0xAB12CD34) + framework.AllocateVolume(t, grpcClient, volumeID, "") + + client := framework.NewHTTPClient() + fid := framework.NewFileID(volumeID, needleID, cookie) + uploadResp := framework.UploadBytes(t, client, clusterHarness.VolumeAdminURL(), fid, []byte("read-meta-delete")) + _ = framework.ReadAllAndClose(t, uploadResp) + if uploadResp.StatusCode != http.StatusCreated { + t.Fatalf("upload expected 201, got %d", uploadResp.StatusCode) + } + + deleteReq, err := http.NewRequest(http.MethodDelete, clusterHarness.VolumeAdminURL()+"/"+fid, nil) + if err != nil { + t.Fatalf("build delete request: %v", err) + } + deleteResp := framework.DoRequest(t, client, deleteReq) + _ = framework.ReadAllAndClose(t, deleteResp) + if deleteResp.StatusCode != http.StatusAccepted { + t.Fatalf("delete expected 202, got %d", deleteResp.StatusCode) + } + + ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second) + defer cancel() + + fileStatus, err := grpcClient.ReadVolumeFileStatus(ctx, &volume_server_pb.ReadVolumeFileStatusRequest{VolumeId: volumeID}) + if err != nil { + t.Fatalf("ReadVolumeFileStatus after delete failed: %v", err) + } + + idxBytes := copyFileBytes(t, grpcClient, &volume_server_pb.CopyFileRequest{ + VolumeId: volumeID, + Ext: ".idx", + CompactionRevision: fileStatus.GetCompactionRevision(), + StopOffset: fileStatus.GetIdxFileSize(), + }) + offset, size := findLastNeedleOffsetAndSize(t, idxBytes, needleID) + if size >= 0 { + t.Fatalf("expected deleted idx entry for needle %d, got size %d", needleID, size) + } + + metaResp, err := grpcClient.ReadNeedleMeta(ctx, &volume_server_pb.ReadNeedleMetaRequest{ + VolumeId: volumeID, + NeedleId: needleID, + Offset: offset, + Size: size, + }) + if err != nil { + t.Fatalf("ReadNeedleMeta deleted-entry failed: %v", err) + } + if metaResp.GetCookie() != cookie { + t.Fatalf("ReadNeedleMeta deleted-entry cookie mismatch: got %d want %d", metaResp.GetCookie(), cookie) + } + if metaResp.GetAppendAtNs() == 0 { + t.Fatalf("ReadNeedleMeta deleted-entry expected non-zero append_at_ns") + } +} + func copyFileBytes(t testing.TB, grpcClient volume_server_pb.VolumeServerClient, req *volume_server_pb.CopyFileRequest) []byte { t.Helper() @@ -271,3 +406,23 @@ func findNeedleOffsetAndSize(t testing.TB, idxBytes []byte, needleID uint64) (of t.Fatalf("needle id %d not found in idx entries", needleID) return 0, 0 } + +func findLastNeedleOffsetAndSize(t testing.TB, idxBytes []byte, needleID uint64) (offset int64, size int32) { + t.Helper() + + found := false + for i := 0; i+types.NeedleMapEntrySize <= len(idxBytes); i += types.NeedleMapEntrySize { + key, entryOffset, entrySize := idx.IdxFileEntry(idxBytes[i : i+types.NeedleMapEntrySize]) + if uint64(key) != needleID { + continue + } + found = true + offset = entryOffset.ToActualOffset() + size = int32(entrySize) + } + + if !found { + t.Fatalf("needle id %d not found in idx entries", needleID) + } + return offset, size +} diff --git a/test/volume_server/grpc/erasure_coding_test.go b/test/volume_server/grpc/erasure_coding_test.go index 8a0d8f75f..f5852c6f3 100644 --- a/test/volume_server/grpc/erasure_coding_test.go +++ b/test/volume_server/grpc/erasure_coding_test.go @@ -23,7 +23,7 @@ func TestEcMaintenanceModeRejections(t *testing.T) { t.Skip("skipping integration test in short mode") } - clusterHarness := framework.StartSingleVolumeCluster(t, matrix.P1()) + clusterHarness := framework.StartVolumeCluster(t, matrix.P1()) conn, grpcClient := framework.DialVolumeServer(t, clusterHarness.VolumeGRPCAddress()) defer conn.Close() @@ -92,7 +92,7 @@ func TestEcMissingInvalidAndNoopPaths(t *testing.T) { t.Skip("skipping integration test in short mode") } - clusterHarness := framework.StartSingleVolumeCluster(t, matrix.P1()) + clusterHarness := framework.StartVolumeCluster(t, matrix.P1()) conn, grpcClient := framework.DialVolumeServer(t, clusterHarness.VolumeGRPCAddress()) defer conn.Close() @@ -199,7 +199,7 @@ func TestEcGenerateMountInfoUnmountLifecycle(t *testing.T) { t.Skip("skipping integration test in short mode") } - clusterHarness := framework.StartSingleVolumeCluster(t, matrix.P1()) + clusterHarness := framework.StartVolumeCluster(t, matrix.P1()) conn, grpcClient := framework.DialVolumeServer(t, clusterHarness.VolumeGRPCAddress()) defer conn.Close() @@ -268,7 +268,7 @@ func TestEcShardReadAndBlobDeleteLifecycle(t *testing.T) { t.Skip("skipping integration test in short mode") } - clusterHarness := framework.StartSingleVolumeCluster(t, matrix.P1()) + clusterHarness := framework.StartVolumeCluster(t, matrix.P1()) conn, grpcClient := framework.DialVolumeServer(t, clusterHarness.VolumeGRPCAddress()) defer conn.Close() @@ -369,7 +369,7 @@ func TestEcRebuildMissingShardLifecycle(t *testing.T) { t.Skip("skipping integration test in short mode") } - clusterHarness := framework.StartSingleVolumeCluster(t, matrix.P1()) + clusterHarness := framework.StartVolumeCluster(t, matrix.P1()) conn, grpcClient := framework.DialVolumeServer(t, clusterHarness.VolumeGRPCAddress()) defer conn.Close() @@ -449,7 +449,7 @@ func TestEcShardsToVolumeMissingShardAndNoLiveEntries(t *testing.T) { t.Skip("skipping integration test in short mode") } - clusterHarness := framework.StartSingleVolumeCluster(t, matrix.P1()) + clusterHarness := framework.StartVolumeCluster(t, matrix.P1()) conn, grpcClient := framework.DialVolumeServer(t, clusterHarness.VolumeGRPCAddress()) defer conn.Close() @@ -560,7 +560,7 @@ func TestEcShardsToVolumeSuccessRoundTrip(t *testing.T) { t.Skip("skipping integration test in short mode") } - clusterHarness := framework.StartSingleVolumeCluster(t, matrix.P1()) + clusterHarness := framework.StartVolumeCluster(t, matrix.P1()) conn, grpcClient := framework.DialVolumeServer(t, clusterHarness.VolumeGRPCAddress()) defer conn.Close() @@ -621,7 +621,7 @@ func TestEcShardsDeleteLastShardRemovesEcx(t *testing.T) { t.Skip("skipping integration test in short mode") } - clusterHarness := framework.StartSingleVolumeCluster(t, matrix.P1()) + clusterHarness := framework.StartVolumeCluster(t, matrix.P1()) conn, grpcClient := framework.DialVolumeServer(t, clusterHarness.VolumeGRPCAddress()) defer conn.Close() @@ -757,7 +757,7 @@ func TestEcShardsCopyFailsWhenSourceUnavailable(t *testing.T) { t.Skip("skipping integration test in short mode") } - clusterHarness := framework.StartSingleVolumeCluster(t, matrix.P1()) + clusterHarness := framework.StartVolumeCluster(t, matrix.P1()) conn, grpcClient := framework.DialVolumeServer(t, clusterHarness.VolumeGRPCAddress()) defer conn.Close() diff --git a/test/volume_server/grpc/fetch_remote_s3_test.go b/test/volume_server/grpc/fetch_remote_s3_test.go new file mode 100644 index 000000000..bd1c94cbc --- /dev/null +++ b/test/volume_server/grpc/fetch_remote_s3_test.go @@ -0,0 +1,288 @@ +package volume_server_grpc_test + +import ( + "bytes" + "context" + "fmt" + "net" + "os" + "os/exec" + "path/filepath" + "strings" + "testing" + "time" + + "github.com/aws/aws-sdk-go/aws" + "github.com/aws/aws-sdk-go/aws/credentials" + "github.com/aws/aws-sdk-go/aws/session" + "github.com/aws/aws-sdk-go/service/s3" + + "github.com/seaweedfs/seaweedfs/test/volume_server/framework" + "github.com/seaweedfs/seaweedfs/test/volume_server/matrix" + "github.com/seaweedfs/seaweedfs/weed/pb/remote_pb" + "github.com/seaweedfs/seaweedfs/weed/pb/volume_server_pb" +) + +// findAvailablePort finds a free TCP port on localhost. +func findAvailablePort() (int, error) { + l, err := net.Listen("tcp", "127.0.0.1:0") + if err != nil { + return 0, err + } + port := l.Addr().(*net.TCPAddr).Port + l.Close() + return port, nil +} + +// waitForPort waits until a TCP port is listening, up to timeout. +func waitForPort(port int, timeout time.Duration) error { + deadline := time.Now().Add(timeout) + for time.Now().Before(deadline) { + conn, err := net.DialTimeout("tcp", fmt.Sprintf("127.0.0.1:%d", port), 500*time.Millisecond) + if err == nil { + conn.Close() + return nil + } + time.Sleep(200 * time.Millisecond) + } + return fmt.Errorf("port %d not listening after %v", port, timeout) +} + +// startWeedMini starts a weed mini subprocess and returns the S3 endpoint and cleanup func. +func startWeedMini(t *testing.T) (s3Endpoint string, cleanup func()) { + t.Helper() + + weedBin, err := exec.LookPath("weed") + if err != nil { + weedBin = filepath.Join("..", "..", "..", "weed", "weed_binary") + if _, err := os.Stat(weedBin); os.IsNotExist(err) { + t.Skip("weed binary not found, skipping S3 remote storage test") + } + } + + miniMasterPort, _ := findAvailablePort() + miniVolumePort, _ := findAvailablePort() + miniFilerPort, _ := findAvailablePort() + miniS3Port, _ := findAvailablePort() + miniDir := t.TempDir() + os.WriteFile(filepath.Join(miniDir, "security.toml"), []byte("# empty\n"), 0644) + + ctx, cancel := context.WithCancel(context.Background()) + + miniCmd := exec.CommandContext(ctx, weedBin, "mini", + fmt.Sprintf("-dir=%s", miniDir), + fmt.Sprintf("-master.port=%d", miniMasterPort), + fmt.Sprintf("-volume.port=%d", miniVolumePort), + fmt.Sprintf("-filer.port=%d", miniFilerPort), + fmt.Sprintf("-s3.port=%d", miniS3Port), + ) + miniCmd.Env = append(os.Environ(), "AWS_ACCESS_KEY_ID=admin", "AWS_SECRET_ACCESS_KEY=admin") + miniCmd.Dir = miniDir + logFile, _ := os.CreateTemp("", "weed-mini-*.log") + miniCmd.Stdout = logFile + miniCmd.Stderr = logFile + t.Logf("weed mini logs at %s", logFile.Name()) + + if err := miniCmd.Start(); err != nil { + cancel() + logFile.Close() + t.Fatalf("start weed mini: %v", err) + } + + if err := waitForPort(miniS3Port, 30*time.Second); err != nil { + cancel() + miniCmd.Wait() + logFile.Close() + t.Fatalf("weed mini S3 not ready: %v", err) + } + t.Logf("weed mini S3 ready on port %d", miniS3Port) + + return fmt.Sprintf("http://127.0.0.1:%d", miniS3Port), func() { + cancel() + miniCmd.Wait() + logFile.Close() + } +} + +func newS3Client(endpoint string) *s3.S3 { + sess, _ := session.NewSession(&aws.Config{ + Region: aws.String("us-east-1"), + Endpoint: aws.String(endpoint), + Credentials: credentials.NewStaticCredentials("admin", "admin", ""), + DisableSSL: aws.Bool(true), + S3ForcePathStyle: aws.Bool(true), + }) + return s3.New(sess) +} + +// TestFetchAndWriteNeedleFromS3 tests the full FetchAndWriteNeedle flow: +// 1. Start a weed mini instance as S3 backend +// 2. Upload a test object to it via S3 API +// 3. Call FetchAndWriteNeedle on the volume server to fetch from S3 +// 4. Verify the response contains a valid e_tag +func TestFetchAndWriteNeedleFromS3(t *testing.T) { + if testing.Short() { + t.Skip("skipping integration test in short mode") + } + + s3Endpoint, cleanupMini := startWeedMini(t) + defer cleanupMini() + + s3Client := newS3Client(s3Endpoint) + + // Create bucket and upload test data + bucket := "test-remote-fetch" + s3Client.CreateBucket(&s3.CreateBucketInput{Bucket: aws.String(bucket)}) + + testData := []byte("Hello from S3 remote storage! This is test data for FetchAndWriteNeedle.") + testKey := "test-object.dat" + _, err := s3Client.PutObject(&s3.PutObjectInput{ + Bucket: aws.String(bucket), + Key: aws.String(testKey), + Body: bytes.NewReader(testData), + }) + if err != nil { + t.Fatalf("put object: %v", err) + } + t.Logf("uploaded %d bytes to s3://%s/%s", len(testData), bucket, testKey) + + // Start volume server + clusterHarness := framework.StartVolumeCluster(t, matrix.P1()) + conn, grpcClient := framework.DialVolumeServer(t, clusterHarness.VolumeGRPCAddress()) + defer conn.Close() + + const volumeID = uint32(99) + framework.AllocateVolume(t, grpcClient, volumeID, "") + + grpcCtx, grpcCancel := context.WithTimeout(context.Background(), 30*time.Second) + defer grpcCancel() + + // FetchAndWriteNeedle from S3 + resp, err := grpcClient.FetchAndWriteNeedle(grpcCtx, &volume_server_pb.FetchAndWriteNeedleRequest{ + VolumeId: volumeID, + NeedleId: 42, + Cookie: 12345, + Offset: 0, + Size: int64(len(testData)), + RemoteConf: &remote_pb.RemoteConf{ + Name: "test-s3", + Type: "s3", + S3AccessKey: "admin", + S3SecretKey: "admin", + S3Region: "us-east-1", + S3Endpoint: s3Endpoint, + S3ForcePathStyle: true, + }, + RemoteLocation: &remote_pb.RemoteStorageLocation{ + Name: "test-s3", + Bucket: bucket, + Path: "/" + testKey, + }, + }) + if err != nil { + t.Fatalf("FetchAndWriteNeedle failed: %v", err) + } + if resp.GetETag() == "" { + t.Fatal("FetchAndWriteNeedle returned empty e_tag") + } + t.Logf("FetchAndWriteNeedle success: e_tag=%s", resp.GetETag()) +} + +// TestFetchAndWriteNeedleFromS3WithPartialRead tests reading a byte range from S3. +func TestFetchAndWriteNeedleFromS3WithPartialRead(t *testing.T) { + if testing.Short() { + t.Skip("skipping integration test in short mode") + } + + s3Endpoint, cleanupMini := startWeedMini(t) + defer cleanupMini() + + s3Client := newS3Client(s3Endpoint) + + bucket := "partial-read-test" + s3Client.CreateBucket(&s3.CreateBucketInput{Bucket: aws.String(bucket)}) + + // Upload 1KB of data + fullData := make([]byte, 1024) + for i := range fullData { + fullData[i] = byte(i % 256) + } + s3Client.PutObject(&s3.PutObjectInput{ + Bucket: aws.String(bucket), Key: aws.String("big.dat"), + Body: bytes.NewReader(fullData), + }) + + clusterHarness := framework.StartVolumeCluster(t, matrix.P1()) + conn, grpcClient := framework.DialVolumeServer(t, clusterHarness.VolumeGRPCAddress()) + defer conn.Close() + + framework.AllocateVolume(t, grpcClient, 98, "") + + grpcCtx, grpcCancel := context.WithTimeout(context.Background(), 30*time.Second) + defer grpcCancel() + + // Fetch only bytes 100-199 (100 bytes) from the 1KB object + resp, err := grpcClient.FetchAndWriteNeedle(grpcCtx, &volume_server_pb.FetchAndWriteNeedleRequest{ + VolumeId: 98, NeedleId: 7, Cookie: 999, + Offset: 100, Size: 100, + RemoteConf: &remote_pb.RemoteConf{ + Name: "test-s3-partial", Type: "s3", + S3AccessKey: "admin", S3SecretKey: "admin", + S3Region: "us-east-1", S3Endpoint: s3Endpoint, S3ForcePathStyle: true, + }, + RemoteLocation: &remote_pb.RemoteStorageLocation{ + Name: "test-s3-partial", Bucket: bucket, Path: "/big.dat", + }, + }) + if err != nil { + t.Fatalf("FetchAndWriteNeedle partial read failed: %v", err) + } + if resp.GetETag() == "" { + t.Fatal("empty e_tag for partial read") + } + t.Logf("FetchAndWriteNeedle partial read success: e_tag=%s", resp.GetETag()) +} + +// TestFetchAndWriteNeedleS3NotFound tests that fetching a non-existent S3 object returns an error. +func TestFetchAndWriteNeedleS3NotFound(t *testing.T) { + if testing.Short() { + t.Skip("skipping integration test in short mode") + } + + s3Endpoint, cleanupMini := startWeedMini(t) + defer cleanupMini() + + s3Client := newS3Client(s3Endpoint) + + bucket := "notfound-test" + s3Client.CreateBucket(&s3.CreateBucketInput{Bucket: aws.String(bucket)}) + + clusterHarness := framework.StartVolumeCluster(t, matrix.P1()) + conn, grpcClient := framework.DialVolumeServer(t, clusterHarness.VolumeGRPCAddress()) + defer conn.Close() + + framework.AllocateVolume(t, grpcClient, 97, "") + + grpcCtx, grpcCancel := context.WithTimeout(context.Background(), 30*time.Second) + defer grpcCancel() + + _, err := grpcClient.FetchAndWriteNeedle(grpcCtx, &volume_server_pb.FetchAndWriteNeedleRequest{ + VolumeId: 97, NeedleId: 1, Cookie: 1, + Offset: 0, Size: 100, + RemoteConf: &remote_pb.RemoteConf{ + Name: "test-s3-nf", Type: "s3", + S3AccessKey: "admin", S3SecretKey: "admin", + S3Region: "us-east-1", S3Endpoint: s3Endpoint, S3ForcePathStyle: true, + }, + RemoteLocation: &remote_pb.RemoteStorageLocation{ + Name: "test-s3-nf", Bucket: bucket, Path: "/does-not-exist.dat", + }, + }) + if err == nil { + t.Fatal("FetchAndWriteNeedle should fail for non-existent object") + } + if !strings.Contains(err.Error(), "read from remote") { + t.Fatalf("expected 'read from remote' error, got: %v", err) + } + t.Logf("correctly got error for non-existent object: %v", err) +} diff --git a/test/volume_server/grpc/health_state_test.go b/test/volume_server/grpc/health_state_test.go index cac40731b..16f4627e6 100644 --- a/test/volume_server/grpc/health_state_test.go +++ b/test/volume_server/grpc/health_state_test.go @@ -16,7 +16,7 @@ func TestStateAndStatusRPCs(t *testing.T) { t.Skip("skipping integration test in short mode") } - cluster := framework.StartSingleVolumeCluster(t, matrix.P1()) + cluster := framework.StartVolumeCluster(t, matrix.P1()) conn, client := framework.DialVolumeServer(t, cluster.VolumeGRPCAddress()) defer conn.Close() @@ -94,7 +94,7 @@ func TestSetStateVersionMismatchAndNilStateNoop(t *testing.T) { t.Skip("skipping integration test in short mode") } - cluster := framework.StartSingleVolumeCluster(t, matrix.P1()) + cluster := framework.StartVolumeCluster(t, matrix.P1()) conn, client := framework.DialVolumeServer(t, cluster.VolumeGRPCAddress()) defer conn.Close() diff --git a/test/volume_server/grpc/move_tail_timestamp_test.go b/test/volume_server/grpc/move_tail_timestamp_test.go index 8d5e01a47..32068079e 100644 --- a/test/volume_server/grpc/move_tail_timestamp_test.go +++ b/test/volume_server/grpc/move_tail_timestamp_test.go @@ -29,7 +29,7 @@ func TestVolumeCopyReturnsPreciseLastAppendTimestamp(t *testing.T) { t.Skip("skipping integration test in short mode") } - cluster := framework.StartDualVolumeCluster(t, matrix.P1()) + cluster := framework.StartMultiVolumeClusterAuto(t, matrix.P1(), 2) sourceConn, sourceClient := framework.DialVolumeServer(t, cluster.VolumeGRPCAddress(0)) defer sourceConn.Close() destConn, destClient := framework.DialVolumeServer(t, cluster.VolumeGRPCAddress(1)) @@ -156,7 +156,7 @@ func TestVolumeMoveHandlesInFlightWrites(t *testing.T) { t.Skip("skipping integration test in short mode") } - cluster := framework.StartDualVolumeCluster(t, matrix.P1()) + cluster := framework.StartMultiVolumeClusterAuto(t, matrix.P1(), 2) sourceConn, sourceClient := framework.DialVolumeServer(t, cluster.VolumeGRPCAddress(0)) defer sourceConn.Close() destConn, destClient := framework.DialVolumeServer(t, cluster.VolumeGRPCAddress(1)) diff --git a/test/volume_server/grpc/production_features_test.go b/test/volume_server/grpc/production_features_test.go new file mode 100644 index 000000000..7bc28cb75 --- /dev/null +++ b/test/volume_server/grpc/production_features_test.go @@ -0,0 +1,338 @@ +package volume_server_grpc_test + +import ( + "context" + "io" + "net/http" + "testing" + "time" + + "github.com/seaweedfs/seaweedfs/test/volume_server/framework" + "github.com/seaweedfs/seaweedfs/test/volume_server/matrix" + "github.com/seaweedfs/seaweedfs/weed/pb/volume_server_pb" + "github.com/seaweedfs/seaweedfs/weed/storage/idx" + "github.com/seaweedfs/seaweedfs/weed/storage/types" +) + +func TestScrubVolumeDetectsHealthyData(t *testing.T) { + if testing.Short() { + t.Skip("skipping integration test in short mode") + } + + clusterHarness := framework.StartVolumeCluster(t, matrix.P1()) + conn, grpcClient := framework.DialVolumeServer(t, clusterHarness.VolumeGRPCAddress()) + defer conn.Close() + + const volumeID = uint32(101) + framework.AllocateVolume(t, grpcClient, volumeID, "") + + httpClient := framework.NewHTTPClient() + needles := []struct { + needleID uint64 + cookie uint32 + body string + }{ + {needleID: 1010001, cookie: 0xAA000001, body: "scrub-healthy-needle-one"}, + {needleID: 1010002, cookie: 0xAA000002, body: "scrub-healthy-needle-two"}, + {needleID: 1010003, cookie: 0xAA000003, body: "scrub-healthy-needle-three"}, + } + for _, n := range needles { + fid := framework.NewFileID(volumeID, n.needleID, n.cookie) + uploadResp := framework.UploadBytes(t, httpClient, clusterHarness.VolumeAdminURL(), fid, []byte(n.body)) + _ = framework.ReadAllAndClose(t, uploadResp) + if uploadResp.StatusCode != http.StatusCreated { + t.Fatalf("upload needle %d expected 201, got %d", n.needleID, uploadResp.StatusCode) + } + } + + ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second) + defer cancel() + + scrubResp, err := grpcClient.ScrubVolume(ctx, &volume_server_pb.ScrubVolumeRequest{ + VolumeIds: []uint32{volumeID}, + Mode: volume_server_pb.VolumeScrubMode_FULL, + }) + if err != nil { + t.Fatalf("ScrubVolume FULL mode failed: %v", err) + } + if scrubResp.GetTotalVolumes() != 1 { + t.Fatalf("ScrubVolume expected total_volumes=1, got %d", scrubResp.GetTotalVolumes()) + } + if scrubResp.GetTotalFiles() < 3 { + t.Fatalf("ScrubVolume expected total_files >= 3, got %d", scrubResp.GetTotalFiles()) + } + if len(scrubResp.GetBrokenVolumeIds()) != 0 { + t.Fatalf("ScrubVolume expected no broken volumes for healthy data, got %v: %v", scrubResp.GetBrokenVolumeIds(), scrubResp.GetDetails()) + } +} + +func TestScrubVolumeLocalModeWithMultipleVolumes(t *testing.T) { + if testing.Short() { + t.Skip("skipping integration test in short mode") + } + + clusterHarness := framework.StartVolumeCluster(t, matrix.P1()) + conn, grpcClient := framework.DialVolumeServer(t, clusterHarness.VolumeGRPCAddress()) + defer conn.Close() + + const volumeIDA = uint32(102) + const volumeIDB = uint32(103) + framework.AllocateVolume(t, grpcClient, volumeIDA, "") + framework.AllocateVolume(t, grpcClient, volumeIDB, "") + + httpClient := framework.NewHTTPClient() + + fidA := framework.NewFileID(volumeIDA, 1020001, 0xBB000001) + uploadA := framework.UploadBytes(t, httpClient, clusterHarness.VolumeAdminURL(), fidA, []byte("scrub-local-vol-a")) + _ = framework.ReadAllAndClose(t, uploadA) + if uploadA.StatusCode != http.StatusCreated { + t.Fatalf("upload to volume A expected 201, got %d", uploadA.StatusCode) + } + + fidB := framework.NewFileID(volumeIDB, 1030001, 0xBB000002) + uploadB := framework.UploadBytes(t, httpClient, clusterHarness.VolumeAdminURL(), fidB, []byte("scrub-local-vol-b")) + _ = framework.ReadAllAndClose(t, uploadB) + if uploadB.StatusCode != http.StatusCreated { + t.Fatalf("upload to volume B expected 201, got %d", uploadB.StatusCode) + } + + ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second) + defer cancel() + + scrubResp, err := grpcClient.ScrubVolume(ctx, &volume_server_pb.ScrubVolumeRequest{ + Mode: volume_server_pb.VolumeScrubMode_LOCAL, + }) + if err != nil { + t.Fatalf("ScrubVolume LOCAL auto-select failed: %v", err) + } + if scrubResp.GetTotalVolumes() < 2 { + t.Fatalf("ScrubVolume LOCAL expected total_volumes >= 2, got %d", scrubResp.GetTotalVolumes()) + } + if len(scrubResp.GetBrokenVolumeIds()) != 0 { + t.Fatalf("ScrubVolume LOCAL expected no broken volumes, got %v: %v", scrubResp.GetBrokenVolumeIds(), scrubResp.GetDetails()) + } +} + +func TestVolumeServerStatusReturnsRealDiskStats(t *testing.T) { + if testing.Short() { + t.Skip("skipping integration test in short mode") + } + + clusterHarness := framework.StartVolumeCluster(t, matrix.P1()) + conn, grpcClient := framework.DialVolumeServer(t, clusterHarness.VolumeGRPCAddress()) + defer conn.Close() + + ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second) + defer cancel() + + statusResp, err := grpcClient.VolumeServerStatus(ctx, &volume_server_pb.VolumeServerStatusRequest{}) + if err != nil { + t.Fatalf("VolumeServerStatus failed: %v", err) + } + + diskStatuses := statusResp.GetDiskStatuses() + if len(diskStatuses) == 0 { + t.Fatalf("VolumeServerStatus expected non-empty disk_statuses") + } + + foundValid := false + for _, ds := range diskStatuses { + if ds.GetDir() != "" && ds.GetAll() > 0 && ds.GetFree() > 0 { + foundValid = true + break + } + } + if !foundValid { + t.Fatalf("VolumeServerStatus expected at least one disk status with Dir, All > 0, Free > 0; got %v", diskStatuses) + } +} + +func TestReadNeedleBlobAndMetaVerifiesCookie(t *testing.T) { + if testing.Short() { + t.Skip("skipping integration test in short mode") + } + + clusterHarness := framework.StartVolumeCluster(t, matrix.P1()) + conn, grpcClient := framework.DialVolumeServer(t, clusterHarness.VolumeGRPCAddress()) + defer conn.Close() + + const volumeID = uint32(104) + const needleID = uint64(1040001) + const cookie = uint32(0xCC000001) + framework.AllocateVolume(t, grpcClient, volumeID, "") + + httpClient := framework.NewHTTPClient() + fid := framework.NewFileID(volumeID, needleID, cookie) + payload := []byte("read-needle-blob-meta-verify") + uploadResp := framework.UploadBytes(t, httpClient, clusterHarness.VolumeAdminURL(), fid, payload) + _ = framework.ReadAllAndClose(t, uploadResp) + if uploadResp.StatusCode != http.StatusCreated { + t.Fatalf("upload expected 201, got %d", uploadResp.StatusCode) + } + + ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second) + defer cancel() + + fileStatus, err := grpcClient.ReadVolumeFileStatus(ctx, &volume_server_pb.ReadVolumeFileStatusRequest{VolumeId: volumeID}) + if err != nil { + t.Fatalf("ReadVolumeFileStatus failed: %v", err) + } + if fileStatus.GetIdxFileSize() == 0 { + t.Fatalf("expected non-zero idx file size after upload") + } + + idxBytes := prodCopyFileBytes(t, grpcClient, &volume_server_pb.CopyFileRequest{ + VolumeId: volumeID, + Ext: ".idx", + CompactionRevision: fileStatus.GetCompactionRevision(), + StopOffset: fileStatus.GetIdxFileSize(), + }) + offset, size := prodFindNeedleOffsetAndSize(t, idxBytes, needleID) + + blobResp, err := grpcClient.ReadNeedleBlob(ctx, &volume_server_pb.ReadNeedleBlobRequest{ + VolumeId: volumeID, + Offset: offset, + Size: size, + }) + if err != nil { + t.Fatalf("ReadNeedleBlob failed: %v", err) + } + if len(blobResp.GetNeedleBlob()) == 0 { + t.Fatalf("ReadNeedleBlob returned empty blob") + } + + metaResp, err := grpcClient.ReadNeedleMeta(ctx, &volume_server_pb.ReadNeedleMetaRequest{ + VolumeId: volumeID, + NeedleId: needleID, + Offset: offset, + Size: size, + }) + if err != nil { + t.Fatalf("ReadNeedleMeta failed: %v", err) + } + if metaResp.GetCookie() != cookie { + t.Fatalf("ReadNeedleMeta cookie mismatch: got %d want %d", metaResp.GetCookie(), cookie) + } + if metaResp.GetCrc() == 0 { + t.Fatalf("ReadNeedleMeta expected non-zero CRC") + } +} + +func TestBatchDeleteMultipleNeedles(t *testing.T) { + if testing.Short() { + t.Skip("skipping integration test in short mode") + } + + clusterHarness := framework.StartVolumeCluster(t, matrix.P1()) + conn, grpcClient := framework.DialVolumeServer(t, clusterHarness.VolumeGRPCAddress()) + defer conn.Close() + + const volumeID = uint32(105) + framework.AllocateVolume(t, grpcClient, volumeID, "") + + httpClient := framework.NewHTTPClient() + type needle struct { + needleID uint64 + cookie uint32 + body string + fid string + } + needles := []needle{ + {needleID: 1050001, cookie: 0xDD000001, body: "batch-del-needle-one"}, + {needleID: 1050002, cookie: 0xDD000002, body: "batch-del-needle-two"}, + {needleID: 1050003, cookie: 0xDD000003, body: "batch-del-needle-three"}, + } + fids := make([]string, len(needles)) + for i := range needles { + needles[i].fid = framework.NewFileID(volumeID, needles[i].needleID, needles[i].cookie) + fids[i] = needles[i].fid + uploadResp := framework.UploadBytes(t, httpClient, clusterHarness.VolumeAdminURL(), needles[i].fid, []byte(needles[i].body)) + _ = framework.ReadAllAndClose(t, uploadResp) + if uploadResp.StatusCode != http.StatusCreated { + t.Fatalf("upload needle %d expected 201, got %d", needles[i].needleID, uploadResp.StatusCode) + } + } + + // Verify all needles are readable before delete + for _, n := range needles { + readResp := framework.ReadBytes(t, httpClient, clusterHarness.VolumeAdminURL(), n.fid) + _ = framework.ReadAllAndClose(t, readResp) + if readResp.StatusCode != http.StatusOK { + t.Fatalf("pre-delete read of %s expected 200, got %d", n.fid, readResp.StatusCode) + } + } + + ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second) + defer cancel() + + deleteResp, err := grpcClient.BatchDelete(ctx, &volume_server_pb.BatchDeleteRequest{ + FileIds: fids, + }) + if err != nil { + t.Fatalf("BatchDelete failed: %v", err) + } + if len(deleteResp.GetResults()) != 3 { + t.Fatalf("BatchDelete expected 3 results, got %d", len(deleteResp.GetResults())) + } + for i, result := range deleteResp.GetResults() { + if result.GetStatus() != http.StatusAccepted { + t.Fatalf("BatchDelete result[%d] expected status 202, got %d (error: %s)", i, result.GetStatus(), result.GetError()) + } + if result.GetSize() <= 0 { + t.Fatalf("BatchDelete result[%d] expected size > 0, got %d", i, result.GetSize()) + } + } + + // Verify all needles return 404 after delete + for _, n := range needles { + readResp := framework.ReadBytes(t, httpClient, clusterHarness.VolumeAdminURL(), n.fid) + _ = framework.ReadAllAndClose(t, readResp) + if readResp.StatusCode != http.StatusNotFound { + t.Fatalf("post-delete read of %s expected 404, got %d", n.fid, readResp.StatusCode) + } + } +} + +// prodCopyFileBytes streams a CopyFile response into a byte slice. +func prodCopyFileBytes(t testing.TB, grpcClient volume_server_pb.VolumeServerClient, req *volume_server_pb.CopyFileRequest) []byte { + t.Helper() + + ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second) + defer cancel() + + stream, err := grpcClient.CopyFile(ctx, req) + if err != nil { + t.Fatalf("CopyFile start failed: %v", err) + } + + var out []byte + for { + msg, recvErr := stream.Recv() + if recvErr == io.EOF { + return out + } + if recvErr != nil { + t.Fatalf("CopyFile recv failed: %v", recvErr) + } + out = append(out, msg.GetFileContent()...) + } +} + +// prodFindNeedleOffsetAndSize scans idx bytes for a needle's offset and size. +func prodFindNeedleOffsetAndSize(t testing.TB, idxBytes []byte, needleID uint64) (offset int64, size int32) { + t.Helper() + + for i := 0; i+types.NeedleMapEntrySize <= len(idxBytes); i += types.NeedleMapEntrySize { + key, entryOffset, entrySize := idx.IdxFileEntry(idxBytes[i : i+types.NeedleMapEntrySize]) + if uint64(key) != needleID { + continue + } + if entryOffset.IsZero() || entrySize <= 0 { + continue + } + return entryOffset.ToActualOffset(), int32(entrySize) + } + + t.Fatalf("needle id %d not found in idx entries", needleID) + return 0, 0 +} diff --git a/test/volume_server/grpc/scrub_query_test.go b/test/volume_server/grpc/scrub_query_test.go index 9ddfddead..a4a776df2 100644 --- a/test/volume_server/grpc/scrub_query_test.go +++ b/test/volume_server/grpc/scrub_query_test.go @@ -17,7 +17,7 @@ func TestScrubVolumeIndexAndUnsupportedMode(t *testing.T) { t.Skip("skipping integration test in short mode") } - clusterHarness := framework.StartSingleVolumeCluster(t, matrix.P1()) + clusterHarness := framework.StartVolumeCluster(t, matrix.P1()) conn, grpcClient := framework.DialVolumeServer(t, clusterHarness.VolumeGRPCAddress()) defer conn.Close() @@ -55,7 +55,7 @@ func TestScrubEcVolumeMissingVolume(t *testing.T) { t.Skip("skipping integration test in short mode") } - clusterHarness := framework.StartSingleVolumeCluster(t, matrix.P1()) + clusterHarness := framework.StartVolumeCluster(t, matrix.P1()) conn, grpcClient := framework.DialVolumeServer(t, clusterHarness.VolumeGRPCAddress()) defer conn.Close() @@ -79,7 +79,7 @@ func TestScrubEcVolumeAutoSelectNoEcVolumes(t *testing.T) { t.Skip("skipping integration test in short mode") } - clusterHarness := framework.StartSingleVolumeCluster(t, matrix.P1()) + clusterHarness := framework.StartVolumeCluster(t, matrix.P1()) conn, grpcClient := framework.DialVolumeServer(t, clusterHarness.VolumeGRPCAddress()) defer conn.Close() @@ -105,7 +105,7 @@ func TestQueryInvalidAndMissingFileIDPaths(t *testing.T) { t.Skip("skipping integration test in short mode") } - clusterHarness := framework.StartSingleVolumeCluster(t, matrix.P1()) + clusterHarness := framework.StartVolumeCluster(t, matrix.P1()) conn, grpcClient := framework.DialVolumeServer(t, clusterHarness.VolumeGRPCAddress()) defer conn.Close() @@ -149,7 +149,7 @@ func TestScrubVolumeAutoSelectAndAllModes(t *testing.T) { t.Skip("skipping integration test in short mode") } - clusterHarness := framework.StartSingleVolumeCluster(t, matrix.P1()) + clusterHarness := framework.StartVolumeCluster(t, matrix.P1()) conn, grpcClient := framework.DialVolumeServer(t, clusterHarness.VolumeGRPCAddress()) defer conn.Close() @@ -210,7 +210,7 @@ func TestQueryJsonSuccessAndCsvNoOutput(t *testing.T) { t.Skip("skipping integration test in short mode") } - clusterHarness := framework.StartSingleVolumeCluster(t, matrix.P1()) + clusterHarness := framework.StartVolumeCluster(t, matrix.P1()) conn, grpcClient := framework.DialVolumeServer(t, clusterHarness.VolumeGRPCAddress()) defer conn.Close() @@ -285,7 +285,7 @@ func TestQueryJsonNoMatchReturnsEmptyStripe(t *testing.T) { t.Skip("skipping integration test in short mode") } - clusterHarness := framework.StartSingleVolumeCluster(t, matrix.P1()) + clusterHarness := framework.StartVolumeCluster(t, matrix.P1()) conn, grpcClient := framework.DialVolumeServer(t, clusterHarness.VolumeGRPCAddress()) defer conn.Close() @@ -341,7 +341,7 @@ func TestQueryCookieMismatchReturnsEOFNoResults(t *testing.T) { t.Skip("skipping integration test in short mode") } - clusterHarness := framework.StartSingleVolumeCluster(t, matrix.P1()) + clusterHarness := framework.StartVolumeCluster(t, matrix.P1()) conn, grpcClient := framework.DialVolumeServer(t, clusterHarness.VolumeGRPCAddress()) defer conn.Close() diff --git a/test/volume_server/grpc/tail_test.go b/test/volume_server/grpc/tail_test.go index 09657edb5..599450794 100644 --- a/test/volume_server/grpc/tail_test.go +++ b/test/volume_server/grpc/tail_test.go @@ -19,7 +19,7 @@ func TestVolumeTailSenderMissingVolume(t *testing.T) { t.Skip("skipping integration test in short mode") } - clusterHarness := framework.StartSingleVolumeCluster(t, matrix.P1()) + clusterHarness := framework.StartVolumeCluster(t, matrix.P1()) conn, grpcClient := framework.DialVolumeServer(t, clusterHarness.VolumeGRPCAddress()) defer conn.Close() @@ -40,7 +40,7 @@ func TestVolumeTailSenderHeartbeatThenEOF(t *testing.T) { t.Skip("skipping integration test in short mode") } - clusterHarness := framework.StartSingleVolumeCluster(t, matrix.P1()) + clusterHarness := framework.StartVolumeCluster(t, matrix.P1()) conn, grpcClient := framework.DialVolumeServer(t, clusterHarness.VolumeGRPCAddress()) defer conn.Close() @@ -78,7 +78,7 @@ func TestVolumeTailReceiverMissingVolume(t *testing.T) { t.Skip("skipping integration test in short mode") } - clusterHarness := framework.StartSingleVolumeCluster(t, matrix.P1()) + clusterHarness := framework.StartVolumeCluster(t, matrix.P1()) conn, grpcClient := framework.DialVolumeServer(t, clusterHarness.VolumeGRPCAddress()) defer conn.Close() @@ -144,7 +144,7 @@ func TestVolumeTailSenderLargeNeedleChunking(t *testing.T) { t.Skip("skipping integration test in short mode") } - clusterHarness := framework.StartSingleVolumeCluster(t, matrix.P1()) + clusterHarness := framework.StartVolumeCluster(t, matrix.P1()) conn, grpcClient := framework.DialVolumeServer(t, clusterHarness.VolumeGRPCAddress()) defer conn.Close() diff --git a/test/volume_server/grpc/tiering_remote_test.go b/test/volume_server/grpc/tiering_remote_test.go index db36e7cfd..472aa1255 100644 --- a/test/volume_server/grpc/tiering_remote_test.go +++ b/test/volume_server/grpc/tiering_remote_test.go @@ -17,7 +17,7 @@ func TestFetchAndWriteNeedleMaintenanceAndMissingVolume(t *testing.T) { t.Skip("skipping integration test in short mode") } - clusterHarness := framework.StartSingleVolumeCluster(t, matrix.P1()) + clusterHarness := framework.StartVolumeCluster(t, matrix.P1()) conn, grpcClient := framework.DialVolumeServer(t, clusterHarness.VolumeGRPCAddress()) defer conn.Close() @@ -60,7 +60,7 @@ func TestFetchAndWriteNeedleInvalidRemoteConfig(t *testing.T) { t.Skip("skipping integration test in short mode") } - clusterHarness := framework.StartSingleVolumeCluster(t, matrix.P1()) + clusterHarness := framework.StartVolumeCluster(t, matrix.P1()) conn, grpcClient := framework.DialVolumeServer(t, clusterHarness.VolumeGRPCAddress()) defer conn.Close() @@ -94,7 +94,7 @@ func TestVolumeTierMoveDatToRemoteErrorPaths(t *testing.T) { t.Skip("skipping integration test in short mode") } - clusterHarness := framework.StartSingleVolumeCluster(t, matrix.P1()) + clusterHarness := framework.StartVolumeCluster(t, matrix.P1()) conn, grpcClient := framework.DialVolumeServer(t, clusterHarness.VolumeGRPCAddress()) defer conn.Close() @@ -161,7 +161,7 @@ func TestVolumeTierMoveDatToRemoteMissingBackend(t *testing.T) { t.Skip("skipping integration test in short mode") } - clusterHarness := framework.StartSingleVolumeCluster(t, matrix.P1()) + clusterHarness := framework.StartVolumeCluster(t, matrix.P1()) conn, grpcClient := framework.DialVolumeServer(t, clusterHarness.VolumeGRPCAddress()) defer conn.Close() @@ -190,7 +190,7 @@ func TestVolumeTierMoveDatFromRemoteErrorPaths(t *testing.T) { t.Skip("skipping integration test in short mode") } - clusterHarness := framework.StartSingleVolumeCluster(t, matrix.P1()) + clusterHarness := framework.StartVolumeCluster(t, matrix.P1()) conn, grpcClient := framework.DialVolumeServer(t, clusterHarness.VolumeGRPCAddress()) defer conn.Close() diff --git a/test/volume_server/grpc/vacuum_test.go b/test/volume_server/grpc/vacuum_test.go index ea986fed2..deb8c298d 100644 --- a/test/volume_server/grpc/vacuum_test.go +++ b/test/volume_server/grpc/vacuum_test.go @@ -16,7 +16,7 @@ func TestVacuumVolumeCheckSuccessAndMissingVolume(t *testing.T) { t.Skip("skipping integration test in short mode") } - clusterHarness := framework.StartSingleVolumeCluster(t, matrix.P1()) + clusterHarness := framework.StartVolumeCluster(t, matrix.P1()) conn, grpcClient := framework.DialVolumeServer(t, clusterHarness.VolumeGRPCAddress()) defer conn.Close() @@ -45,7 +45,7 @@ func TestVacuumMaintenanceModeRejections(t *testing.T) { t.Skip("skipping integration test in short mode") } - clusterHarness := framework.StartSingleVolumeCluster(t, matrix.P1()) + clusterHarness := framework.StartVolumeCluster(t, matrix.P1()) conn, grpcClient := framework.DialVolumeServer(t, clusterHarness.VolumeGRPCAddress()) defer conn.Close() diff --git a/test/volume_server/http/admin_test.go b/test/volume_server/http/admin_test.go index 6dde9c20d..df0f9d5c8 100644 --- a/test/volume_server/http/admin_test.go +++ b/test/volume_server/http/admin_test.go @@ -16,7 +16,7 @@ func TestAdminStatusAndHealthz(t *testing.T) { t.Skip("skipping integration test in short mode") } - cluster := framework.StartSingleVolumeCluster(t, matrix.P1()) + cluster := framework.StartVolumeCluster(t, matrix.P1()) client := framework.NewHTTPClient() statusReq, err := http.NewRequest(http.MethodGet, cluster.VolumeAdminURL()+"/status", nil) @@ -45,6 +45,19 @@ func TestAdminStatusAndHealthz(t *testing.T) { t.Fatalf("status payload missing field %q", field) } } + diskStatuses, ok := payload["DiskStatuses"].([]interface{}) + if !ok || len(diskStatuses) == 0 { + t.Fatalf("status payload expected non-empty DiskStatuses, got %#v", payload["DiskStatuses"]) + } + firstDisk, ok := diskStatuses[0].(map[string]interface{}) + if !ok { + t.Fatalf("status payload disk status has unexpected shape: %#v", diskStatuses[0]) + } + for _, field := range []string{"dir", "all", "used", "free"} { + if _, found := firstDisk[field]; !found { + t.Fatalf("status disk payload missing field %q: %#v", field, firstDisk) + } + } healthReq := mustNewRequest(t, http.MethodGet, cluster.VolumeAdminURL()+"/healthz") healthResp := framework.DoRequest(t, client, healthReq) @@ -74,7 +87,7 @@ func TestOptionsMethodsByPort(t *testing.T) { t.Skip("skipping integration test in short mode") } - cluster := framework.StartSingleVolumeCluster(t, matrix.P2()) + cluster := framework.StartVolumeCluster(t, matrix.P2()) client := framework.NewHTTPClient() adminResp := framework.DoRequest(t, client, mustNewRequest(t, http.MethodOptions, cluster.VolumeAdminURL()+"/")) @@ -114,7 +127,7 @@ func TestOptionsWithOriginIncludesCorsHeaders(t *testing.T) { t.Skip("skipping integration test in short mode") } - cluster := framework.StartSingleVolumeCluster(t, matrix.P2()) + cluster := framework.StartVolumeCluster(t, matrix.P2()) client := framework.NewHTTPClient() adminReq := mustNewRequest(t, http.MethodOptions, cluster.VolumeAdminURL()+"/") @@ -151,7 +164,7 @@ func TestUiIndexNotExposedWhenJwtSigningEnabled(t *testing.T) { t.Skip("skipping integration test in short mode") } - cluster := framework.StartSingleVolumeCluster(t, matrix.P3()) + cluster := framework.StartVolumeCluster(t, matrix.P3()) client := framework.NewHTTPClient() resp := framework.DoRequest(t, client, mustNewRequest(t, http.MethodGet, cluster.VolumeAdminURL()+"/ui/index.html")) diff --git a/test/volume_server/http/auth_test.go b/test/volume_server/http/auth_test.go index 5b093bba1..fc2fb3f16 100644 --- a/test/volume_server/http/auth_test.go +++ b/test/volume_server/http/auth_test.go @@ -18,7 +18,7 @@ func TestJWTAuthForWriteAndRead(t *testing.T) { } profile := matrix.P3() - clusterHarness := framework.StartSingleVolumeCluster(t, profile) + clusterHarness := framework.StartVolumeCluster(t, profile) conn, grpcClient := framework.DialVolumeServer(t, clusterHarness.VolumeGRPCAddress()) defer conn.Close() @@ -81,7 +81,7 @@ func TestJWTAuthRejectsFidMismatch(t *testing.T) { } profile := matrix.P3() - clusterHarness := framework.StartSingleVolumeCluster(t, profile) + clusterHarness := framework.StartVolumeCluster(t, profile) conn, grpcClient := framework.DialVolumeServer(t, clusterHarness.VolumeGRPCAddress()) defer conn.Close() @@ -161,7 +161,7 @@ func TestJWTAuthRejectsExpiredTokens(t *testing.T) { } profile := matrix.P3() - clusterHarness := framework.StartSingleVolumeCluster(t, profile) + clusterHarness := framework.StartVolumeCluster(t, profile) conn, grpcClient := framework.DialVolumeServer(t, clusterHarness.VolumeGRPCAddress()) defer conn.Close() @@ -209,7 +209,7 @@ func TestJWTAuthViaQueryParamAndCookie(t *testing.T) { } profile := matrix.P3() - clusterHarness := framework.StartSingleVolumeCluster(t, profile) + clusterHarness := framework.StartVolumeCluster(t, profile) conn, grpcClient := framework.DialVolumeServer(t, clusterHarness.VolumeGRPCAddress()) defer conn.Close() @@ -249,7 +249,7 @@ func TestJWTTokenSourcePrecedenceQueryOverHeader(t *testing.T) { } profile := matrix.P3() - clusterHarness := framework.StartSingleVolumeCluster(t, profile) + clusterHarness := framework.StartVolumeCluster(t, profile) conn, grpcClient := framework.DialVolumeServer(t, clusterHarness.VolumeGRPCAddress()) defer conn.Close() @@ -299,7 +299,7 @@ func TestJWTTokenSourcePrecedenceHeaderOverCookie(t *testing.T) { } profile := matrix.P3() - clusterHarness := framework.StartSingleVolumeCluster(t, profile) + clusterHarness := framework.StartVolumeCluster(t, profile) conn, grpcClient := framework.DialVolumeServer(t, clusterHarness.VolumeGRPCAddress()) defer conn.Close() @@ -345,7 +345,7 @@ func TestJWTTokenSourcePrecedenceQueryOverCookie(t *testing.T) { } profile := matrix.P3() - clusterHarness := framework.StartSingleVolumeCluster(t, profile) + clusterHarness := framework.StartVolumeCluster(t, profile) conn, grpcClient := framework.DialVolumeServer(t, clusterHarness.VolumeGRPCAddress()) defer conn.Close() diff --git a/test/volume_server/http/chunk_manifest_test.go b/test/volume_server/http/chunk_manifest_test.go index d3806d7f4..c8ae6cd92 100644 --- a/test/volume_server/http/chunk_manifest_test.go +++ b/test/volume_server/http/chunk_manifest_test.go @@ -16,7 +16,7 @@ func TestChunkManifestExpansionAndBypass(t *testing.T) { t.Skip("skipping integration test in short mode") } - clusterHarness := framework.StartSingleVolumeCluster(t, matrix.P1()) + clusterHarness := framework.StartVolumeCluster(t, matrix.P1()) conn, grpcClient := framework.DialVolumeServer(t, clusterHarness.VolumeGRPCAddress()) defer conn.Close() @@ -97,7 +97,7 @@ func TestChunkManifestDeleteRemovesChildChunks(t *testing.T) { t.Skip("skipping integration test in short mode") } - clusterHarness := framework.StartSingleVolumeCluster(t, matrix.P1()) + clusterHarness := framework.StartVolumeCluster(t, matrix.P1()) conn, grpcClient := framework.DialVolumeServer(t, clusterHarness.VolumeGRPCAddress()) defer conn.Close() @@ -174,7 +174,7 @@ func TestChunkManifestDeleteFailsWhenChildDeletionFails(t *testing.T) { t.Skip("skipping integration test in short mode") } - clusterHarness := framework.StartSingleVolumeCluster(t, matrix.P1()) + clusterHarness := framework.StartVolumeCluster(t, matrix.P1()) conn, grpcClient := framework.DialVolumeServer(t, clusterHarness.VolumeGRPCAddress()) defer conn.Close() diff --git a/test/volume_server/http/compressed_read_test.go b/test/volume_server/http/compressed_read_test.go index 8a9ac5c41..0101a20c0 100644 --- a/test/volume_server/http/compressed_read_test.go +++ b/test/volume_server/http/compressed_read_test.go @@ -43,7 +43,7 @@ func TestCompressedReadAcceptEncodingMatrix(t *testing.T) { t.Skip("skipping integration test in short mode") } - clusterHarness := framework.StartSingleVolumeCluster(t, matrix.P1()) + clusterHarness := framework.StartVolumeCluster(t, matrix.P1()) conn, grpcClient := framework.DialVolumeServer(t, clusterHarness.VolumeGRPCAddress()) defer conn.Close() diff --git a/test/volume_server/http/headers_static_test.go b/test/volume_server/http/headers_static_test.go index 5b4a2fd93..82ac8adc6 100644 --- a/test/volume_server/http/headers_static_test.go +++ b/test/volume_server/http/headers_static_test.go @@ -15,7 +15,7 @@ func TestReadPassthroughHeadersAndDownloadDisposition(t *testing.T) { t.Skip("skipping integration test in short mode") } - clusterHarness := framework.StartSingleVolumeCluster(t, matrix.P1()) + clusterHarness := framework.StartVolumeCluster(t, matrix.P1()) conn, grpcClient := framework.DialVolumeServer(t, clusterHarness.VolumeGRPCAddress()) defer conn.Close() @@ -59,12 +59,50 @@ func TestReadPassthroughHeadersAndDownloadDisposition(t *testing.T) { } } +func TestDownloadDispositionUsesGoBoolParsing(t *testing.T) { + if testing.Short() { + t.Skip("skipping integration test in short mode") + } + + clusterHarness := framework.StartVolumeCluster(t, matrix.P1()) + conn, grpcClient := framework.DialVolumeServer(t, clusterHarness.VolumeGRPCAddress()) + defer conn.Close() + + const volumeID = uint32(97) + framework.AllocateVolume(t, grpcClient, volumeID, "") + + client := framework.NewHTTPClient() + fullFileID := framework.NewFileID(volumeID, 661123, 0x55667789) + uploadResp := framework.UploadBytes(t, client, clusterHarness.VolumeAdminURL(), fullFileID, []byte("dl-bool-parse-content")) + _ = framework.ReadAllAndClose(t, uploadResp) + if uploadResp.StatusCode != http.StatusCreated { + t.Fatalf("upload expected 201, got %d", uploadResp.StatusCode) + } + + parts := strings.SplitN(fullFileID, ",", 2) + if len(parts) != 2 { + t.Fatalf("unexpected file id format: %q", fullFileID) + } + fidOnly := parts[1] + + url := fmt.Sprintf("%s/%d/%s/%s?dl=t", clusterHarness.VolumeAdminURL(), volumeID, fidOnly, "report.txt") + resp := framework.DoRequest(t, client, mustNewRequest(t, http.MethodGet, url)) + _ = framework.ReadAllAndClose(t, resp) + if resp.StatusCode != http.StatusOK { + t.Fatalf("download read expected 200, got %d", resp.StatusCode) + } + contentDisposition := resp.Header.Get("Content-Disposition") + if !strings.Contains(contentDisposition, "attachment") || !strings.Contains(contentDisposition, "report.txt") { + t.Fatalf("download disposition with dl=t mismatch: %q", contentDisposition) + } +} + func TestStaticAssetEndpoints(t *testing.T) { if testing.Short() { t.Skip("skipping integration test in short mode") } - clusterHarness := framework.StartSingleVolumeCluster(t, matrix.P1()) + clusterHarness := framework.StartVolumeCluster(t, matrix.P1()) client := framework.NewHTTPClient() faviconResp := framework.DoRequest(t, client, mustNewRequest(t, http.MethodGet, clusterHarness.VolumeAdminURL()+"/favicon.ico")) @@ -85,7 +123,7 @@ func TestStaticAssetEndpointsOnPublicPort(t *testing.T) { t.Skip("skipping integration test in short mode") } - clusterHarness := framework.StartSingleVolumeCluster(t, matrix.P2()) + clusterHarness := framework.StartVolumeCluster(t, matrix.P2()) client := framework.NewHTTPClient() faviconResp := framework.DoRequest(t, client, mustNewRequest(t, http.MethodGet, clusterHarness.VolumePublicURL()+"/favicon.ico")) diff --git a/test/volume_server/http/image_transform_test.go b/test/volume_server/http/image_transform_test.go index 222fc951f..a94eba1a2 100644 --- a/test/volume_server/http/image_transform_test.go +++ b/test/volume_server/http/image_transform_test.go @@ -45,7 +45,7 @@ func TestImageResizeAndCropReadVariants(t *testing.T) { t.Skip("skipping integration test in short mode") } - clusterHarness := framework.StartSingleVolumeCluster(t, matrix.P1()) + clusterHarness := framework.StartVolumeCluster(t, matrix.P1()) conn, grpcClient := framework.DialVolumeServer(t, clusterHarness.VolumeGRPCAddress()) defer conn.Close() diff --git a/test/volume_server/http/production_features_test.go b/test/volume_server/http/production_features_test.go new file mode 100644 index 000000000..b91ee37ef --- /dev/null +++ b/test/volume_server/http/production_features_test.go @@ -0,0 +1,387 @@ +package volume_server_http_test + +import ( + "bytes" + "context" + "encoding/json" + "fmt" + "mime/multipart" + "net/http" + "os" + "strings" + "testing" + "time" + + "github.com/seaweedfs/seaweedfs/test/volume_server/framework" + "github.com/seaweedfs/seaweedfs/test/volume_server/matrix" + "github.com/seaweedfs/seaweedfs/weed/pb/volume_server_pb" +) + +func TestStatsEndpoints(t *testing.T) { + if testing.Short() { + t.Skip("skipping integration test in short mode") + } + + cluster := framework.StartVolumeCluster(t, matrix.P1()) + client := framework.NewHTTPClient() + + // /stats/counter — expect 200 with non-empty body + // Note: Go server guards these with WhiteList which may return 400 + counterResp := framework.DoRequest(t, client, mustNewRequest(t, http.MethodGet, cluster.VolumeAdminURL()+"/stats/counter")) + counterBody := framework.ReadAllAndClose(t, counterResp) + if counterResp.StatusCode == http.StatusBadRequest { + t.Logf("/stats/counter returned 400 (whitelist guard), skipping stats checks") + return + } + if counterResp.StatusCode != http.StatusOK { + t.Fatalf("/stats/counter expected 200, got %d, body: %s", counterResp.StatusCode, string(counterBody)) + } + if len(counterBody) == 0 { + t.Fatalf("/stats/counter returned empty body") + } + + // /stats/memory — expect 200, valid JSON with Version and Memory + memoryResp := framework.DoRequest(t, client, mustNewRequest(t, http.MethodGet, cluster.VolumeAdminURL()+"/stats/memory")) + memoryBody := framework.ReadAllAndClose(t, memoryResp) + if memoryResp.StatusCode != http.StatusOK { + t.Fatalf("/stats/memory expected 200, got %d, body: %s", memoryResp.StatusCode, string(memoryBody)) + } + var memoryPayload map[string]any + if err := json.Unmarshal(memoryBody, &memoryPayload); err != nil { + t.Fatalf("/stats/memory response is not valid JSON: %v, body: %s", err, string(memoryBody)) + } + if _, ok := memoryPayload["Version"]; !ok { + t.Fatalf("/stats/memory missing Version field") + } + if _, ok := memoryPayload["Memory"]; !ok { + t.Fatalf("/stats/memory missing Memory field") + } + + // /stats/disk — expect 200, valid JSON with Version and DiskStatuses + diskResp := framework.DoRequest(t, client, mustNewRequest(t, http.MethodGet, cluster.VolumeAdminURL()+"/stats/disk")) + diskBody := framework.ReadAllAndClose(t, diskResp) + if diskResp.StatusCode != http.StatusOK { + t.Fatalf("/stats/disk expected 200, got %d, body: %s", diskResp.StatusCode, string(diskBody)) + } + var diskPayload map[string]any + if err := json.Unmarshal(diskBody, &diskPayload); err != nil { + t.Fatalf("/stats/disk response is not valid JSON: %v, body: %s", err, string(diskBody)) + } + if _, ok := diskPayload["Version"]; !ok { + t.Fatalf("/stats/disk missing Version field") + } + if _, ok := diskPayload["DiskStatuses"]; !ok { + t.Fatalf("/stats/disk missing DiskStatuses field") + } +} + +func TestStatusPrettyJsonAndJsonp(t *testing.T) { + if testing.Short() { + t.Skip("skipping integration test in short mode") + } + + cluster := framework.StartVolumeCluster(t, matrix.P1()) + client := framework.NewHTTPClient() + + // ?pretty=y — expect indented multi-line JSON + prettyResp := framework.DoRequest(t, client, mustNewRequest(t, http.MethodGet, cluster.VolumeAdminURL()+"/status?pretty=y")) + prettyBody := framework.ReadAllAndClose(t, prettyResp) + if prettyResp.StatusCode != http.StatusOK { + t.Fatalf("/status?pretty=y expected 200, got %d", prettyResp.StatusCode) + } + lines := strings.Split(strings.TrimSpace(string(prettyBody)), "\n") + if len(lines) < 3 { + t.Fatalf("/status?pretty=y expected multi-line indented JSON, got %d lines: %s", len(lines), string(prettyBody)) + } + // Verify the body is valid JSON + var prettyPayload map[string]interface{} + if err := json.Unmarshal(prettyBody, &prettyPayload); err != nil { + t.Fatalf("/status?pretty=y is not valid JSON: %v", err) + } + + // ?callback=myFunc — expect JSONP wrapping + jsonpResp := framework.DoRequest(t, client, mustNewRequest(t, http.MethodGet, cluster.VolumeAdminURL()+"/status?callback=myFunc")) + jsonpBody := framework.ReadAllAndClose(t, jsonpResp) + if jsonpResp.StatusCode != http.StatusOK { + t.Fatalf("/status?callback=myFunc expected 200, got %d", jsonpResp.StatusCode) + } + bodyStr := string(jsonpBody) + if !strings.HasPrefix(bodyStr, "myFunc(") { + t.Fatalf("/status?callback=myFunc expected body to start with 'myFunc(', got prefix: %q", bodyStr[:min(len(bodyStr), 30)]) + } + trimmed := strings.TrimRight(bodyStr, "\n; ") + if !strings.HasSuffix(trimmed, ")") { + t.Fatalf("/status?callback=myFunc expected body to end with ')', got suffix: %q", trimmed[max(0, len(trimmed)-10):]) + } + // Content-Type should be application/javascript for JSONP + if ct := jsonpResp.Header.Get("Content-Type"); !strings.Contains(ct, "javascript") { + t.Fatalf("/status?callback=myFunc expected Content-Type containing 'javascript', got %q", ct) + } +} + +func TestUploadWithCustomTimestamp(t *testing.T) { + if testing.Short() { + t.Skip("skipping integration test in short mode") + } + + cluster := framework.StartVolumeCluster(t, matrix.P1()) + conn, grpcClient := framework.DialVolumeServer(t, cluster.VolumeGRPCAddress()) + defer conn.Close() + + const volumeID = uint32(91) + framework.AllocateVolume(t, grpcClient, volumeID, "") + + fid := framework.NewFileID(volumeID, 910001, 0xAABBCC01) + client := framework.NewHTTPClient() + data := []byte("custom-timestamp-data") + + // Upload with ?ts=1700000000 + uploadURL := fmt.Sprintf("%s/%s?ts=1700000000", cluster.VolumeAdminURL(), fid) + req, err := http.NewRequest(http.MethodPost, uploadURL, bytes.NewReader(data)) + if err != nil { + t.Fatalf("create upload request: %v", err) + } + req.Header.Set("Content-Type", "application/octet-stream") + req.Header.Set("Content-Length", fmt.Sprintf("%d", len(data))) + uploadResp := framework.DoRequest(t, client, req) + _ = framework.ReadAllAndClose(t, uploadResp) + if uploadResp.StatusCode != http.StatusCreated { + t.Fatalf("upload with ts expected 201, got %d", uploadResp.StatusCode) + } + + // Read back and verify Last-Modified + getResp := framework.ReadBytes(t, client, cluster.VolumeAdminURL(), fid) + _ = framework.ReadAllAndClose(t, getResp) + if getResp.StatusCode != http.StatusOK { + t.Fatalf("read expected 200, got %d", getResp.StatusCode) + } + + expectedLastModified := time.Unix(1700000000, 0).UTC().Format(http.TimeFormat) + gotLastModified := getResp.Header.Get("Last-Modified") + if gotLastModified != expectedLastModified { + t.Fatalf("Last-Modified mismatch: got %q, want %q", gotLastModified, expectedLastModified) + } +} + +func TestMultipartUploadUsesFormFieldsForTimestampAndTTL(t *testing.T) { + if testing.Short() { + t.Skip("skipping integration test in short mode") + } + // Go's r.FormValue() cannot read multipart text fields after r.MultipartReader() + // consumes the body, so ts/ttl sent as multipart fields only work with the Rust server. + if os.Getenv("VOLUME_SERVER_IMPL") != "rust" { + t.Skip("skipping: multipart form field extraction for ts/ttl is Rust-only") + } + + cluster := framework.StartVolumeCluster(t, matrix.P1()) + conn, grpcClient := framework.DialVolumeServer(t, cluster.VolumeGRPCAddress()) + defer conn.Close() + + const volumeID = uint32(94) + const needleID = uint64(940001) + const cookie = uint32(0xAABBCC04) + framework.AllocateVolume(t, grpcClient, volumeID, "") + + fid := framework.NewFileID(volumeID, needleID, cookie) + payload := []byte("multipart-form-fields-data") + + var body bytes.Buffer + writer := multipart.NewWriter(&body) + if err := writer.WriteField("ts", "1700000000"); err != nil { + t.Fatalf("write multipart ts field: %v", err) + } + if err := writer.WriteField("ttl", "7d"); err != nil { + t.Fatalf("write multipart ttl field: %v", err) + } + filePart, err := writer.CreateFormFile("file", "multipart.txt") + if err != nil { + t.Fatalf("create multipart file field: %v", err) + } + if _, err := filePart.Write(payload); err != nil { + t.Fatalf("write multipart file payload: %v", err) + } + if err := writer.Close(); err != nil { + t.Fatalf("close multipart writer: %v", err) + } + + req, err := http.NewRequest(http.MethodPost, cluster.VolumeAdminURL()+"/"+fid, &body) + if err != nil { + t.Fatalf("create multipart upload request: %v", err) + } + req.Header.Set("Content-Type", writer.FormDataContentType()) + + client := framework.NewHTTPClient() + uploadResp := framework.DoRequest(t, client, req) + uploadBody := framework.ReadAllAndClose(t, uploadResp) + if uploadResp.StatusCode != http.StatusCreated { + t.Fatalf("multipart upload expected 201, got %d, body: %s", uploadResp.StatusCode, string(uploadBody)) + } + + readResp := framework.ReadBytes(t, client, cluster.VolumeAdminURL(), fid) + _ = framework.ReadAllAndClose(t, readResp) + if readResp.StatusCode != http.StatusOK { + t.Fatalf("multipart upload read expected 200, got %d", readResp.StatusCode) + } + expectedLastModified := time.Unix(1700000000, 0).UTC().Format(http.TimeFormat) + if got := readResp.Header.Get("Last-Modified"); got != expectedLastModified { + t.Fatalf("multipart upload Last-Modified mismatch: got %q want %q", got, expectedLastModified) + } + + ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second) + defer cancel() + statusResp, err := grpcClient.VolumeNeedleStatus(ctx, &volume_server_pb.VolumeNeedleStatusRequest{ + VolumeId: volumeID, + NeedleId: needleID, + }) + if err != nil { + t.Fatalf("VolumeNeedleStatus after multipart upload failed: %v", err) + } + // Go's ReadTTL normalizes via fitTtlCount: 7d → 1w (7 days = 1 week) + if got := statusResp.GetTtl(); got != "1w" { + t.Fatalf("multipart upload TTL mismatch: got %q want %q", got, "1w") + } +} + +func TestRequestIdGeneration(t *testing.T) { + if testing.Short() { + t.Skip("skipping integration test in short mode") + } + + cluster := framework.StartVolumeCluster(t, matrix.P1()) + client := framework.NewHTTPClient() + + // GET /status WITHOUT setting x-amz-request-id header + req := mustNewRequest(t, http.MethodGet, cluster.VolumeAdminURL()+"/status") + resp := framework.DoRequest(t, client, req) + _ = framework.ReadAllAndClose(t, resp) + if resp.StatusCode != http.StatusOK { + t.Fatalf("/status expected 200, got %d", resp.StatusCode) + } + + reqID := resp.Header.Get("x-amz-request-id") + if reqID == "" { + t.Fatalf("expected auto-generated x-amz-request-id header, got empty") + } + // Go format: "%X%08X" (timestamp hex + 8 random hex), typically 20-24 chars, all hex, no hyphens. + if len(reqID) < 16 { + t.Fatalf("x-amz-request-id too short: %q (len=%d)", reqID, len(reqID)) + } +} + +func TestS3ResponsePassthroughHeaders(t *testing.T) { + if testing.Short() { + t.Skip("skipping integration test in short mode") + } + + cluster := framework.StartVolumeCluster(t, matrix.P1()) + conn, grpcClient := framework.DialVolumeServer(t, cluster.VolumeGRPCAddress()) + defer conn.Close() + + const volumeID = uint32(92) + framework.AllocateVolume(t, grpcClient, volumeID, "") + + fid := framework.NewFileID(volumeID, 920001, 0xAABBCC02) + client := framework.NewHTTPClient() + data := []byte("passthrough-headers-test-data") + + uploadResp := framework.UploadBytes(t, client, cluster.VolumeAdminURL(), fid, data) + _ = framework.ReadAllAndClose(t, uploadResp) + if uploadResp.StatusCode != http.StatusCreated { + t.Fatalf("upload expected 201, got %d", uploadResp.StatusCode) + } + + // Read back with S3 passthrough query params + // Test response-content-language which both Go and Rust support + readURL := fmt.Sprintf("%s/%s?response-content-language=fr&response-expires=%s", + cluster.VolumeAdminURL(), fid, + "Thu,+01+Jan+2099+00:00:00+GMT", + ) + readResp := framework.DoRequest(t, client, mustNewRequest(t, http.MethodGet, readURL)) + readBody := framework.ReadAllAndClose(t, readResp) + if readResp.StatusCode != http.StatusOK { + t.Fatalf("read with passthrough expected 200, got %d, body: %s", readResp.StatusCode, string(readBody)) + } + + if got := readResp.Header.Get("Content-Language"); got != "fr" { + t.Fatalf("Content-Language expected 'fr', got %q", got) + } + if got := readResp.Header.Get("Expires"); got != "Thu, 01 Jan 2099 00:00:00 GMT" { + t.Fatalf("Expires expected 'Thu, 01 Jan 2099 00:00:00 GMT', got %q", got) + } +} + +func TestLargeFileWriteAndRead(t *testing.T) { + if testing.Short() { + t.Skip("skipping integration test in short mode") + } + + cluster := framework.StartVolumeCluster(t, matrix.P1()) + conn, grpcClient := framework.DialVolumeServer(t, cluster.VolumeGRPCAddress()) + defer conn.Close() + + const volumeID = uint32(93) + framework.AllocateVolume(t, grpcClient, volumeID, "") + + fid := framework.NewFileID(volumeID, 930001, 0xAABBCC03) + client := framework.NewHTTPClient() + data := bytes.Repeat([]byte("A"), 1024*1024) // 1MB + + uploadResp := framework.UploadBytes(t, client, cluster.VolumeAdminURL(), fid, data) + _ = framework.ReadAllAndClose(t, uploadResp) + if uploadResp.StatusCode != http.StatusCreated { + t.Fatalf("upload 1MB expected 201, got %d", uploadResp.StatusCode) + } + + getResp := framework.ReadBytes(t, client, cluster.VolumeAdminURL(), fid) + getBody := framework.ReadAllAndClose(t, getResp) + if getResp.StatusCode != http.StatusOK { + t.Fatalf("read 1MB expected 200, got %d", getResp.StatusCode) + } + if len(getBody) != len(data) { + t.Fatalf("read 1MB body length mismatch: got %d, want %d", len(getBody), len(data)) + } + if !bytes.Equal(getBody, data) { + t.Fatalf("read 1MB body content mismatch") + } +} + +func TestUploadWithContentTypePreservation(t *testing.T) { + if testing.Short() { + t.Skip("skipping integration test in short mode") + } + + cluster := framework.StartVolumeCluster(t, matrix.P1()) + conn, grpcClient := framework.DialVolumeServer(t, cluster.VolumeGRPCAddress()) + defer conn.Close() + + const volumeID = uint32(94) + framework.AllocateVolume(t, grpcClient, volumeID, "") + + fid := framework.NewFileID(volumeID, 940001, 0xAABBCC04) + client := framework.NewHTTPClient() + data := []byte("fake-png-data-for-content-type-test") + + // Upload with Content-Type: image/png + uploadURL := fmt.Sprintf("%s/%s", cluster.VolumeAdminURL(), fid) + req, err := http.NewRequest(http.MethodPost, uploadURL, bytes.NewReader(data)) + if err != nil { + t.Fatalf("create upload request: %v", err) + } + req.Header.Set("Content-Type", "image/png") + req.Header.Set("Content-Length", fmt.Sprintf("%d", len(data))) + uploadResp := framework.DoRequest(t, client, req) + _ = framework.ReadAllAndClose(t, uploadResp) + if uploadResp.StatusCode != http.StatusCreated { + t.Fatalf("upload with image/png expected 201, got %d", uploadResp.StatusCode) + } + + // Read back and verify Content-Type is preserved + getResp := framework.ReadBytes(t, client, cluster.VolumeAdminURL(), fid) + _ = framework.ReadAllAndClose(t, getResp) + if getResp.StatusCode != http.StatusOK { + t.Fatalf("read expected 200, got %d", getResp.StatusCode) + } + if got := getResp.Header.Get("Content-Type"); got != "image/png" { + t.Fatalf("Content-Type expected 'image/png', got %q", got) + } +} diff --git a/test/volume_server/http/public_cors_methods_test.go b/test/volume_server/http/public_cors_methods_test.go index 5328b9a8b..df98d3454 100644 --- a/test/volume_server/http/public_cors_methods_test.go +++ b/test/volume_server/http/public_cors_methods_test.go @@ -15,7 +15,7 @@ func TestPublicPortReadOnlyMethodBehavior(t *testing.T) { t.Skip("skipping integration test in short mode") } - clusterHarness := framework.StartSingleVolumeCluster(t, matrix.P2()) + clusterHarness := framework.StartVolumeCluster(t, matrix.P2()) conn, grpcClient := framework.DialVolumeServer(t, clusterHarness.VolumeGRPCAddress()) defer conn.Close() @@ -70,7 +70,7 @@ func TestCorsAndUnsupportedMethodBehavior(t *testing.T) { t.Skip("skipping integration test in short mode") } - clusterHarness := framework.StartSingleVolumeCluster(t, matrix.P2()) + clusterHarness := framework.StartVolumeCluster(t, matrix.P2()) conn, grpcClient := framework.DialVolumeServer(t, clusterHarness.VolumeGRPCAddress()) defer conn.Close() @@ -133,7 +133,7 @@ func TestUnsupportedMethodTraceParity(t *testing.T) { t.Skip("skipping integration test in short mode") } - clusterHarness := framework.StartSingleVolumeCluster(t, matrix.P2()) + clusterHarness := framework.StartVolumeCluster(t, matrix.P2()) conn, grpcClient := framework.DialVolumeServer(t, clusterHarness.VolumeGRPCAddress()) defer conn.Close() @@ -168,7 +168,7 @@ func TestUnsupportedMethodPropfindParity(t *testing.T) { t.Skip("skipping integration test in short mode") } - clusterHarness := framework.StartSingleVolumeCluster(t, matrix.P2()) + clusterHarness := framework.StartVolumeCluster(t, matrix.P2()) conn, grpcClient := framework.DialVolumeServer(t, clusterHarness.VolumeGRPCAddress()) defer conn.Close() @@ -207,56 +207,12 @@ func TestUnsupportedMethodPropfindParity(t *testing.T) { } } -func TestUnsupportedMethodConnectParity(t *testing.T) { - if testing.Short() { - t.Skip("skipping integration test in short mode") - } - - clusterHarness := framework.StartSingleVolumeCluster(t, matrix.P2()) - conn, grpcClient := framework.DialVolumeServer(t, clusterHarness.VolumeGRPCAddress()) - defer conn.Close() - - const volumeID = uint32(85) - framework.AllocateVolume(t, grpcClient, volumeID, "") - - fid := framework.NewFileID(volumeID, 124001, 0x03030303) - client := framework.NewHTTPClient() - uploadResp := framework.UploadBytes(t, client, clusterHarness.VolumeAdminURL(), fid, []byte("connect-method-check")) - _ = framework.ReadAllAndClose(t, uploadResp) - if uploadResp.StatusCode != http.StatusCreated { - t.Fatalf("upload expected 201, got %d", uploadResp.StatusCode) - } - - adminReq := mustNewRequest(t, "CONNECT", clusterHarness.VolumeAdminURL()+"/"+fid) - adminResp := framework.DoRequest(t, client, adminReq) - _ = framework.ReadAllAndClose(t, adminResp) - if adminResp.StatusCode != http.StatusBadRequest { - t.Fatalf("admin CONNECT expected 400, got %d", adminResp.StatusCode) - } - - publicReq := mustNewRequest(t, "CONNECT", clusterHarness.VolumePublicURL()+"/"+fid) - publicResp := framework.DoRequest(t, client, publicReq) - _ = framework.ReadAllAndClose(t, publicResp) - if publicResp.StatusCode != http.StatusOK { - t.Fatalf("public CONNECT expected passthrough 200, got %d", publicResp.StatusCode) - } - - verifyResp := framework.ReadBytes(t, client, clusterHarness.VolumeAdminURL(), fid) - verifyBody := framework.ReadAllAndClose(t, verifyResp) - if verifyResp.StatusCode != http.StatusOK { - t.Fatalf("verify GET expected 200, got %d", verifyResp.StatusCode) - } - if string(verifyBody) != "connect-method-check" { - t.Fatalf("CONNECT should not mutate data, got %q", string(verifyBody)) - } -} - func TestPublicPortHeadReadParity(t *testing.T) { if testing.Short() { t.Skip("skipping integration test in short mode") } - clusterHarness := framework.StartSingleVolumeCluster(t, matrix.P2()) + clusterHarness := framework.StartVolumeCluster(t, matrix.P2()) conn, grpcClient := framework.DialVolumeServer(t, clusterHarness.VolumeGRPCAddress()) defer conn.Close() diff --git a/test/volume_server/http/range_variants_test.go b/test/volume_server/http/range_variants_test.go index 2e1f5e286..71ffb6dff 100644 --- a/test/volume_server/http/range_variants_test.go +++ b/test/volume_server/http/range_variants_test.go @@ -14,7 +14,7 @@ func TestMultiRangeReadReturnsMultipartPayload(t *testing.T) { t.Skip("skipping integration test in short mode") } - clusterHarness := framework.StartSingleVolumeCluster(t, matrix.P1()) + clusterHarness := framework.StartVolumeCluster(t, matrix.P1()) conn, grpcClient := framework.DialVolumeServer(t, clusterHarness.VolumeGRPCAddress()) defer conn.Close() @@ -52,7 +52,7 @@ func TestOversizedCombinedRangesAreIgnored(t *testing.T) { t.Skip("skipping integration test in short mode") } - clusterHarness := framework.StartSingleVolumeCluster(t, matrix.P1()) + clusterHarness := framework.StartVolumeCluster(t, matrix.P1()) conn, grpcClient := framework.DialVolumeServer(t, clusterHarness.VolumeGRPCAddress()) defer conn.Close() diff --git a/test/volume_server/http/read_deleted_test.go b/test/volume_server/http/read_deleted_test.go index 23d400e23..b2db65d70 100644 --- a/test/volume_server/http/read_deleted_test.go +++ b/test/volume_server/http/read_deleted_test.go @@ -13,7 +13,7 @@ func TestReadDeletedQueryReturnsDeletedNeedleData(t *testing.T) { t.Skip("skipping integration test in short mode") } - clusterHarness := framework.StartSingleVolumeCluster(t, matrix.P1()) + clusterHarness := framework.StartVolumeCluster(t, matrix.P1()) conn, grpcClient := framework.DialVolumeServer(t, clusterHarness.VolumeGRPCAddress()) defer conn.Close() diff --git a/test/volume_server/http/read_path_variants_test.go b/test/volume_server/http/read_path_variants_test.go index 97a7ac628..eac72079a 100644 --- a/test/volume_server/http/read_path_variants_test.go +++ b/test/volume_server/http/read_path_variants_test.go @@ -15,7 +15,7 @@ func TestReadPathShapesAndIfModifiedSince(t *testing.T) { t.Skip("skipping integration test in short mode") } - clusterHarness := framework.StartSingleVolumeCluster(t, matrix.P1()) + clusterHarness := framework.StartVolumeCluster(t, matrix.P1()) conn, grpcClient := framework.DialVolumeServer(t, clusterHarness.VolumeGRPCAddress()) defer conn.Close() @@ -85,7 +85,7 @@ func TestMalformedVidFidPathReturnsBadRequest(t *testing.T) { t.Skip("skipping integration test in short mode") } - clusterHarness := framework.StartSingleVolumeCluster(t, matrix.P1()) + clusterHarness := framework.StartVolumeCluster(t, matrix.P1()) client := framework.NewHTTPClient() resp := framework.DoRequest(t, client, mustNewRequest(t, http.MethodGet, clusterHarness.VolumeAdminURL()+"/not-a-vid/not-a-fid")) @@ -100,7 +100,7 @@ func TestReadWrongCookieReturnsNotFound(t *testing.T) { t.Skip("skipping integration test in short mode") } - clusterHarness := framework.StartSingleVolumeCluster(t, matrix.P1()) + clusterHarness := framework.StartVolumeCluster(t, matrix.P1()) conn, grpcClient := framework.DialVolumeServer(t, clusterHarness.VolumeGRPCAddress()) defer conn.Close() @@ -139,7 +139,7 @@ func TestConditionalHeaderPrecedenceAndInvalidIfModifiedSince(t *testing.T) { t.Skip("skipping integration test in short mode") } - clusterHarness := framework.StartSingleVolumeCluster(t, matrix.P1()) + clusterHarness := framework.StartVolumeCluster(t, matrix.P1()) conn, grpcClient := framework.DialVolumeServer(t, clusterHarness.VolumeGRPCAddress()) defer conn.Close() diff --git a/test/volume_server/http/read_write_delete_test.go b/test/volume_server/http/read_write_delete_test.go index b122d697c..b26c4f661 100644 --- a/test/volume_server/http/read_write_delete_test.go +++ b/test/volume_server/http/read_write_delete_test.go @@ -14,7 +14,7 @@ func TestUploadReadRangeHeadDeleteRoundTrip(t *testing.T) { t.Skip("skipping integration test in short mode") } - cluster := framework.StartSingleVolumeCluster(t, matrix.P1()) + cluster := framework.StartVolumeCluster(t, matrix.P1()) conn, grpcClient := framework.DialVolumeServer(t, cluster.VolumeGRPCAddress()) defer conn.Close() @@ -112,7 +112,7 @@ func TestInvalidReadPathReturnsBadRequest(t *testing.T) { t.Skip("skipping integration test in short mode") } - cluster := framework.StartSingleVolumeCluster(t, matrix.P1()) + cluster := framework.StartVolumeCluster(t, matrix.P1()) client := framework.NewHTTPClient() resp := framework.DoRequest(t, client, mustNewRequest(t, http.MethodGet, cluster.VolumeAdminURL()+"/invalid,needle")) diff --git a/test/volume_server/http/replication_lifecycle_test.go b/test/volume_server/http/replication_lifecycle_test.go new file mode 100644 index 000000000..c88ffeae6 --- /dev/null +++ b/test/volume_server/http/replication_lifecycle_test.go @@ -0,0 +1,63 @@ +package volume_server_http_test + +import ( + "context" + "net/http" + "testing" + "time" + + "github.com/seaweedfs/seaweedfs/test/volume_server/framework" + "github.com/seaweedfs/seaweedfs/test/volume_server/matrix" + "github.com/seaweedfs/seaweedfs/weed/pb/volume_server_pb" + "github.com/seaweedfs/seaweedfs/weed/storage/needle" +) + +func TestReplicatedUploadSucceedsImmediatelyAfterAllocate(t *testing.T) { + if testing.Short() { + t.Skip("skipping integration test in short mode") + } + + clusterHarness := framework.StartDualVolumeCluster(t, matrix.P1()) + + conn0, grpc0 := framework.DialVolumeServer(t, clusterHarness.VolumeGRPCAddress(0)) + defer conn0.Close() + conn1, grpc1 := framework.DialVolumeServer(t, clusterHarness.VolumeGRPCAddress(1)) + defer conn1.Close() + + const volumeID = uint32(115) + ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second) + defer cancel() + + req := &volume_server_pb.AllocateVolumeRequest{ + VolumeId: volumeID, + Replication: "001", + Version: uint32(needle.GetCurrentVersion()), + } + if _, err := grpc0.AllocateVolume(ctx, req); err != nil { + t.Fatalf("allocate replicated volume on node0: %v", err) + } + if _, err := grpc1.AllocateVolume(ctx, req); err != nil { + t.Fatalf("allocate replicated volume on node1: %v", err) + } + + client := framework.NewHTTPClient() + fid := framework.NewFileID(volumeID, 881001, 0x0B0C0D0E) + payload := []byte("replicated-upload-after-allocate") + + uploadResp := framework.UploadBytes(t, client, clusterHarness.VolumeAdminURL(0), fid, payload) + _ = framework.ReadAllAndClose(t, uploadResp) + if uploadResp.StatusCode != http.StatusCreated { + t.Fatalf("replicated upload expected 201, got %d", uploadResp.StatusCode) + } + + replicaReadURL := clusterHarness.VolumeAdminURL(1) + "/" + fid + var replicaBody []byte + if !waitForHTTPStatus(t, client, replicaReadURL, http.StatusOK, 10*time.Second, func(resp *http.Response) { + replicaBody = framework.ReadAllAndClose(t, resp) + }) { + t.Fatalf("replica did not become readable within deadline") + } + if string(replicaBody) != string(payload) { + t.Fatalf("replica body mismatch: got %q want %q", string(replicaBody), string(payload)) + } +} diff --git a/test/volume_server/http/throttling_test.go b/test/volume_server/http/throttling_test.go index 7a66e9ebb..e07c441d8 100644 --- a/test/volume_server/http/throttling_test.go +++ b/test/volume_server/http/throttling_test.go @@ -60,7 +60,7 @@ func TestUploadLimitTimeoutAndReplicateBypass(t *testing.T) { t.Skip("skipping integration test in short mode") } - clusterHarness := framework.StartSingleVolumeCluster(t, matrix.P8()) + clusterHarness := framework.StartVolumeCluster(t, matrix.P8()) conn, grpcClient := framework.DialVolumeServer(t, clusterHarness.VolumeGRPCAddress()) defer conn.Close() @@ -143,7 +143,7 @@ func TestUploadLimitWaitThenProceed(t *testing.T) { t.Skip("skipping integration test in short mode") } - clusterHarness := framework.StartSingleVolumeCluster(t, matrix.P8()) + clusterHarness := framework.StartVolumeCluster(t, matrix.P8()) conn, grpcClient := framework.DialVolumeServer(t, clusterHarness.VolumeGRPCAddress()) defer conn.Close() @@ -226,7 +226,7 @@ func TestUploadLimitTimeoutThenRecovery(t *testing.T) { t.Skip("skipping integration test in short mode") } - clusterHarness := framework.StartSingleVolumeCluster(t, matrix.P8()) + clusterHarness := framework.StartVolumeCluster(t, matrix.P8()) conn, grpcClient := framework.DialVolumeServer(t, clusterHarness.VolumeGRPCAddress()) defer conn.Close() @@ -289,7 +289,7 @@ func TestDownloadLimitTimeoutReturnsTooManyRequests(t *testing.T) { t.Skip("skipping integration test in short mode") } - clusterHarness := framework.StartSingleVolumeCluster(t, matrix.P8()) + clusterHarness := framework.StartVolumeCluster(t, matrix.P8()) conn, grpcClient := framework.DialVolumeServer(t, clusterHarness.VolumeGRPCAddress()) defer conn.Close() @@ -336,7 +336,7 @@ func TestDownloadLimitWaitThenProceedWithoutReplica(t *testing.T) { t.Skip("skipping integration test in short mode") } - clusterHarness := framework.StartSingleVolumeCluster(t, matrix.P8()) + clusterHarness := framework.StartVolumeCluster(t, matrix.P8()) conn, grpcClient := framework.DialVolumeServer(t, clusterHarness.VolumeGRPCAddress()) defer conn.Close() @@ -398,7 +398,7 @@ func TestDownloadLimitTimeoutThenRecovery(t *testing.T) { t.Skip("skipping integration test in short mode") } - clusterHarness := framework.StartSingleVolumeCluster(t, matrix.P8()) + clusterHarness := framework.StartVolumeCluster(t, matrix.P8()) conn, grpcClient := framework.DialVolumeServer(t, clusterHarness.VolumeGRPCAddress()) defer conn.Close() @@ -593,7 +593,7 @@ func TestUploadLimitDisabledAllowsConcurrentUploads(t *testing.T) { t.Skip("skipping integration test in short mode") } - clusterHarness := framework.StartSingleVolumeCluster(t, matrix.P1()) + clusterHarness := framework.StartVolumeCluster(t, matrix.P1()) conn, grpcClient := framework.DialVolumeServer(t, clusterHarness.VolumeGRPCAddress()) defer conn.Close() @@ -646,7 +646,7 @@ func TestDownloadLimitDisabledAllowsConcurrentDownloads(t *testing.T) { t.Skip("skipping integration test in short mode") } - clusterHarness := framework.StartSingleVolumeCluster(t, matrix.P1()) + clusterHarness := framework.StartVolumeCluster(t, matrix.P1()) conn, grpcClient := framework.DialVolumeServer(t, clusterHarness.VolumeGRPCAddress()) defer conn.Close() @@ -691,7 +691,7 @@ func TestDownloadLimitInvalidVidWhileOverLimitReturnsBadRequest(t *testing.T) { t.Skip("skipping integration test in short mode") } - clusterHarness := framework.StartSingleVolumeCluster(t, matrix.P8()) + clusterHarness := framework.StartVolumeCluster(t, matrix.P8()) conn, grpcClient := framework.DialVolumeServer(t, clusterHarness.VolumeGRPCAddress()) defer conn.Close() diff --git a/test/volume_server/http/write_delete_variants_test.go b/test/volume_server/http/write_delete_variants_test.go index 3355e7778..6a017299e 100644 --- a/test/volume_server/http/write_delete_variants_test.go +++ b/test/volume_server/http/write_delete_variants_test.go @@ -14,7 +14,7 @@ func TestWriteUnchangedAndDeleteEdgeVariants(t *testing.T) { t.Skip("skipping integration test in short mode") } - clusterHarness := framework.StartSingleVolumeCluster(t, matrix.P1()) + clusterHarness := framework.StartVolumeCluster(t, matrix.P1()) conn, grpcClient := framework.DialVolumeServer(t, clusterHarness.VolumeGRPCAddress()) defer conn.Close() @@ -71,7 +71,7 @@ func TestDeleteTimestampOverrideKeepsReadDeletedLastModifiedParity(t *testing.T) t.Skip("skipping integration test in short mode") } - clusterHarness := framework.StartSingleVolumeCluster(t, matrix.P1()) + clusterHarness := framework.StartVolumeCluster(t, matrix.P1()) conn, grpcClient := framework.DialVolumeServer(t, clusterHarness.VolumeGRPCAddress()) defer conn.Close() diff --git a/test/volume_server/http/write_error_variants_test.go b/test/volume_server/http/write_error_variants_test.go index ead11ed6c..2f858b41b 100644 --- a/test/volume_server/http/write_error_variants_test.go +++ b/test/volume_server/http/write_error_variants_test.go @@ -14,7 +14,7 @@ func TestWriteInvalidVidAndFidReturnBadRequest(t *testing.T) { t.Skip("skipping integration test in short mode") } - clusterHarness := framework.StartSingleVolumeCluster(t, matrix.P1()) + clusterHarness := framework.StartVolumeCluster(t, matrix.P1()) client := framework.NewHTTPClient() invalidVidReq := newUploadRequest(t, clusterHarness.VolumeAdminURL()+"/invalid,12345678", []byte("x")) @@ -37,7 +37,7 @@ func TestWriteMalformedMultipartAndMD5Mismatch(t *testing.T) { t.Skip("skipping integration test in short mode") } - clusterHarness := framework.StartSingleVolumeCluster(t, matrix.P1()) + clusterHarness := framework.StartVolumeCluster(t, matrix.P1()) conn, grpcClient := framework.DialVolumeServer(t, clusterHarness.VolumeGRPCAddress()) defer conn.Close() diff --git a/test/volume_server/loadtest/loadtest_test.go b/test/volume_server/loadtest/loadtest_test.go new file mode 100644 index 000000000..d3d0d0fc1 --- /dev/null +++ b/test/volume_server/loadtest/loadtest_test.go @@ -0,0 +1,628 @@ +package loadtest + +import ( + "bytes" + "crypto/rand" + "fmt" + "io" + "net/http" + "os" + "sort" + "sync" + "sync/atomic" + "testing" + "time" + + "github.com/seaweedfs/seaweedfs/test/volume_server/framework" + "github.com/seaweedfs/seaweedfs/test/volume_server/matrix" +) + +// Run with: +// go test -v -count=1 -timeout 300s -run BenchmarkVolumeServer ./test/volume_server/loadtest/... +// VOLUME_SERVER_IMPL=rust go test -v -count=1 -timeout 300s -run BenchmarkVolumeServer ./test/volume_server/loadtest/... +// +// Compare results: +// go test -count=1 -timeout 300s -run BenchmarkVolumeServer -bench . ./test/volume_server/loadtest/... | tee go.txt +// VOLUME_SERVER_IMPL=rust go test -count=1 -timeout 300s -run BenchmarkVolumeServer -bench . ./test/volume_server/loadtest/... | tee rust.txt + +// Step-by-step payload sizes: 1KB → 4KB → 16KB → 64KB → 256KB → 1MB → 4MB → 8MB +var payloadSteps = []struct { + name string + size int +}{ + {"1KB", 1 << 10}, + {"4KB", 4 << 10}, + {"16KB", 16 << 10}, + {"64KB", 64 << 10}, + {"256KB", 256 << 10}, + {"1MB", 1 << 20}, + {"4MB", 4 << 20}, + {"8MB", 8 << 20}, +} + +func implName() string { + if os.Getenv("VOLUME_SERVER_IMPL") == "rust" { + return "rust" + } + return "go" +} + +// setupCluster starts a volume cluster and returns the admin URL and cleanup. +func setupCluster(tb testing.TB) (adminURL string, grpcAddr string, cleanup func()) { + tb.Helper() + cluster := framework.StartVolumeCluster(tb, matrix.P1()) + return cluster.VolumeAdminURL(), cluster.VolumeGRPCAddress(), cluster.Stop +} + +// allocateVolume allocates a volume via gRPC and returns its ID. +func allocateVolume(tb testing.TB, grpcAddr string, volumeID uint32) { + tb.Helper() + conn, client := framework.DialVolumeServer(tb, grpcAddr) + defer conn.Close() + framework.AllocateVolume(tb, client, volumeID, "loadtest") +} + +func makePayload(size int) []byte { + data := make([]byte, size) + rand.Read(data) + return data +} + +// uploadFile uploads data and returns the file ID used. +func uploadFile(client *http.Client, adminURL string, volumeID uint32, key uint64, cookie uint32, data []byte) error { + fid := framework.NewFileID(volumeID, key, cookie) + url := fmt.Sprintf("%s/%s", adminURL, fid) + req, err := http.NewRequest(http.MethodPost, url, bytes.NewReader(data)) + if err != nil { + return err + } + req.Header.Set("Content-Type", "application/octet-stream") + resp, err := client.Do(req) + if err != nil { + return err + } + io.Copy(io.Discard, resp.Body) + resp.Body.Close() + if resp.StatusCode >= 400 { + return fmt.Errorf("upload %s: status %d", fid, resp.StatusCode) + } + return nil +} + +// downloadFile reads a file and discards the body. +func downloadFile(client *http.Client, adminURL string, volumeID uint32, key uint64, cookie uint32) error { + fid := framework.NewFileID(volumeID, key, cookie) + url := fmt.Sprintf("%s/%s", adminURL, fid) + resp, err := client.Get(url) + if err != nil { + return err + } + io.Copy(io.Discard, resp.Body) + resp.Body.Close() + if resp.StatusCode >= 400 { + return fmt.Errorf("download %s: status %d", fid, resp.StatusCode) + } + return nil +} + +// deleteFile deletes a file. +func deleteFile(client *http.Client, adminURL string, volumeID uint32, key uint64, cookie uint32) error { + fid := framework.NewFileID(volumeID, key, cookie) + url := fmt.Sprintf("%s/%s", adminURL, fid) + req, err := http.NewRequest(http.MethodDelete, url, nil) + if err != nil { + return err + } + resp, err := client.Do(req) + if err != nil { + return err + } + io.Copy(io.Discard, resp.Body) + resp.Body.Close() + return nil +} + +// --- Throughput load tests (not Go benchmarks, manual timing for comparison) --- + +// TestBenchmarkVolumeServer runs a suite of load tests printing ops/sec and latency. +func TestBenchmarkVolumeServer(t *testing.T) { + if testing.Short() { + t.Skip("skipping load test in short mode") + } + + impl := implName() + adminURL, grpcAddr, cleanup := setupCluster(t) + defer cleanup() + + const volumeID = uint32(10) + allocateVolume(t, grpcAddr, volumeID) + + httpClient := &http.Client{ + Timeout: 30 * time.Second, + Transport: &http.Transport{ + MaxIdleConnsPerHost: 128, + MaxConnsPerHost: 128, + }, + } + + // opsForSize returns fewer ops for larger payloads to keep test time reasonable. + opsForSize := func(size, concurrency int) int { + switch { + case size >= 4<<20: + if concurrency > 1 { + return 64 + } + return 30 + case size >= 1<<20: + if concurrency > 1 { + return 200 + } + return 100 + case size >= 64<<10: + if concurrency > 1 { + return 500 + } + return 300 + default: + if concurrency > 1 { + return 1000 + } + return 500 + } + } + + // Step-by-step upload: 1KB → 4KB → 16KB → 64KB → 256KB → 1MB → 4MB → 8MB + for _, ps := range payloadSteps { + for _, mode := range []struct { + label string + concurrency int + }{ + {"seq", 1}, + {"c16", 16}, + } { + name := fmt.Sprintf("Upload/%s/%s", ps.name, mode.label) + numOps := opsForSize(ps.size, mode.concurrency) + t.Run(fmt.Sprintf("%s/%s", impl, name), func(t *testing.T) { + payload := makePayload(ps.size) + runThroughputTest(t, impl, name, httpClient, adminURL, volumeID, + payload, numOps, mode.concurrency, false, false) + }) + } + } + + // Step-by-step download: 1KB → 4KB → 16KB → 64KB → 256KB → 1MB → 4MB → 8MB + for _, ps := range payloadSteps { + for _, mode := range []struct { + label string + concurrency int + }{ + {"seq", 1}, + {"c16", 16}, + } { + name := fmt.Sprintf("Download/%s/%s", ps.name, mode.label) + numOps := opsForSize(ps.size, mode.concurrency) + t.Run(fmt.Sprintf("%s/%s", impl, name), func(t *testing.T) { + payload := makePayload(ps.size) + runThroughputTest(t, impl, name, httpClient, adminURL, volumeID, + payload, numOps, mode.concurrency, true, false) + }) + } + } + + // Mixed read/write at each size + for _, ps := range payloadSteps { + name := fmt.Sprintf("Mixed/%s/c16", ps.name) + numOps := opsForSize(ps.size, 16) + t.Run(fmt.Sprintf("%s/%s", impl, name), func(t *testing.T) { + payload := makePayload(ps.size) + runThroughputTest(t, impl, name, httpClient, adminURL, volumeID, + payload, numOps, 16, false, true) + }) + } + + // Delete test + t.Run(fmt.Sprintf("%s/Delete/1KB/c16", impl), func(t *testing.T) { + payload := makePayload(1 << 10) + numOps := 1000 + baseKey := uint64(900000) + + for i := 0; i < numOps; i++ { + if err := uploadFile(httpClient, adminURL, volumeID, baseKey+uint64(i), 1, payload); err != nil { + t.Fatalf("pre-upload for delete %d: %v", i, err) + } + } + + var ops atomic.Int64 + var totalLatency atomic.Int64 + + start := time.Now() + var wg sync.WaitGroup + concurrency := 16 + opsPerWorker := numOps / concurrency + + for w := 0; w < concurrency; w++ { + workerBase := baseKey + uint64(w*opsPerWorker) + wg.Add(1) + go func(wb uint64) { + defer wg.Done() + for i := 0; i < opsPerWorker; i++ { + opStart := time.Now() + deleteFile(httpClient, adminURL, volumeID, wb+uint64(i), 1) + totalLatency.Add(time.Since(opStart).Nanoseconds()) + ops.Add(1) + } + }(workerBase) + } + wg.Wait() + elapsed := time.Since(start) + + totalOps := ops.Load() + avgLatencyUs := float64(totalLatency.Load()) / float64(totalOps) / 1000.0 + opsPerSec := float64(totalOps) / elapsed.Seconds() + + t.Logf("RESULT impl=%-4s test=%-22s ops=%-6d errors=%-4d elapsed=%-10s ops/s=%-10.1f avg_lat=%-10.0fus", + impl, "Delete/1KB/c16", totalOps, 0, elapsed.Round(time.Millisecond), opsPerSec, avgLatencyUs) + }) +} + +// runThroughputTest is the shared core for throughput tests. +// keyOffset separates key ranges so concurrent tests in the same volume don't collide. +// keyCounter provides globally unique key ranges. Starts at 1 because key=0 is invalid. +var keyCounter atomic.Uint64 + +func init() { + keyCounter.Store(1) +} + +func runThroughputTest( + t *testing.T, impl, name string, + httpClient *http.Client, adminURL string, volumeID uint32, + payload []byte, numOps, concurrency int, + isDownload, isMixed bool, +) { + t.Helper() + + // Each call gets a unique key range + baseKey := keyCounter.Add(uint64(numOps*2)) - uint64(numOps*2) + + // Pre-upload for download / mixed + if isDownload || isMixed { + for i := 0; i < numOps; i++ { + if err := uploadFile(httpClient, adminURL, volumeID, baseKey+uint64(i), 1, payload); err != nil { + t.Fatalf("pre-upload %d: %v", i, err) + } + } + } + + uploadBase := baseKey + if !isDownload && !isMixed { + uploadBase = baseKey + uint64(numOps) // fresh range for uploads + } + + var ops atomic.Int64 + var errors atomic.Int64 + var totalLatency atomic.Int64 + + start := time.Now() + + var wg sync.WaitGroup + opsPerWorker := numOps / concurrency + remainder := numOps % concurrency + + for w := 0; w < concurrency; w++ { + n := opsPerWorker + if w < remainder { + n++ + } + var workerBase uint64 + if w < remainder { + workerBase = uploadBase + uint64(w*(opsPerWorker+1)) + } else { + workerBase = uploadBase + uint64(remainder*(opsPerWorker+1)) + uint64((w-remainder)*opsPerWorker) + } + + wg.Add(1) + go func(wb uint64, count int) { + defer wg.Done() + for i := 0; i < count; i++ { + key := wb + uint64(i) + opStart := time.Now() + var err error + + if isMixed { + if i%2 == 0 { + err = uploadFile(httpClient, adminURL, volumeID, key, 1, payload) + } else { + err = downloadFile(httpClient, adminURL, volumeID, key, 1) + } + } else if isDownload { + err = downloadFile(httpClient, adminURL, volumeID, key, 1) + } else { + err = uploadFile(httpClient, adminURL, volumeID, key, 1, payload) + } + + totalLatency.Add(time.Since(opStart).Nanoseconds()) + ops.Add(1) + if err != nil { + errors.Add(1) + } + } + }(workerBase, n) + } + + wg.Wait() + elapsed := time.Since(start) + + totalOps := ops.Load() + totalErrs := errors.Load() + avgLatencyUs := float64(totalLatency.Load()) / float64(totalOps) / 1000.0 + opsPerSec := float64(totalOps) / elapsed.Seconds() + throughputMBs := opsPerSec * float64(len(payload)) / (1024 * 1024) + + t.Logf("RESULT impl=%-4s test=%-22s ops=%-6d errors=%-4d elapsed=%-10s ops/s=%-10.1f avg_lat=%-10.0fus throughput=%.2f MB/s", + impl, name, totalOps, totalErrs, elapsed.Round(time.Millisecond), opsPerSec, avgLatencyUs, throughputMBs) +} + +// TestLatencyPercentiles measures p50/p95/p99 latencies for upload and download at each size. +func TestLatencyPercentiles(t *testing.T) { + if testing.Short() { + t.Skip("skipping load test in short mode") + } + + impl := implName() + adminURL, grpcAddr, cleanup := setupCluster(t) + defer cleanup() + + const volumeID = uint32(20) + allocateVolume(t, grpcAddr, volumeID) + + httpClient := &http.Client{ + Timeout: 30 * time.Second, + Transport: &http.Transport{ + MaxIdleConnsPerHost: 64, + MaxConnsPerHost: 64, + }, + } + + latOpsForSize := func(size int) int { + switch { + case size >= 4<<20: + return 30 + case size >= 1<<20: + return 100 + default: + return 300 + } + } + + for _, ps := range payloadSteps { + for _, dl := range []struct { + prefix string + isDownload bool + }{ + {"Upload", false}, + {"Download", true}, + } { + name := fmt.Sprintf("%s/%s", dl.prefix, ps.name) + numOps := latOpsForSize(ps.size) + + t.Run(fmt.Sprintf("%s/%s", impl, name), func(t *testing.T) { + payload := makePayload(ps.size) + baseKey := keyCounter.Add(uint64(numOps * 2)) + + if dl.isDownload { + for i := 0; i < numOps; i++ { + if err := uploadFile(httpClient, adminURL, volumeID, baseKey+uint64(i), 2, payload); err != nil { + t.Fatalf("pre-upload: %v", err) + } + } + } + + uploadBase := baseKey + if !dl.isDownload { + uploadBase = baseKey + uint64(numOps) + } + + latencies := make([]time.Duration, numOps) + for i := 0; i < numOps; i++ { + key := uploadBase + uint64(i) + start := time.Now() + if dl.isDownload { + downloadFile(httpClient, adminURL, volumeID, key, 2) + } else { + uploadFile(httpClient, adminURL, volumeID, key, 2, payload) + } + latencies[i] = time.Since(start) + } + + sortDurations(latencies) + + p50 := latencies[len(latencies)*50/100] + p95 := latencies[len(latencies)*95/100] + p99 := latencies[len(latencies)*99/100] + min := latencies[0] + max := latencies[len(latencies)-1] + + t.Logf("RESULT impl=%-4s test=%-20s n=%-4d min=%-10s p50=%-10s p95=%-10s p99=%-10s max=%-10s", + impl, name, numOps, min.Round(time.Microsecond), p50.Round(time.Microsecond), p95.Round(time.Microsecond), p99.Round(time.Microsecond), max.Round(time.Microsecond)) + }) + } + } +} + +func sortDurations(d []time.Duration) { + sort.Slice(d, func(i, j int) bool { return d[i] < d[j] }) +} + +// TestSustainedP99 runs high-concurrency load for a sustained period (default 60s, +// override with LOADTEST_DURATION=120s) and reports p50/p95/p99/p999 latencies. +// This reveals tail latency differences that short tests miss (GC pauses, lock contention, etc). +// +// Run: +// go test -v -count=1 -timeout 600s -run TestSustainedP99 ./test/volume_server/loadtest/... +// VOLUME_SERVER_IMPL=rust go test -v -count=1 -timeout 600s -run TestSustainedP99 ./test/volume_server/loadtest/... +// LOADTEST_DURATION=120s VOLUME_SERVER_IMPL=rust go test -v -count=1 -timeout 600s -run TestSustainedP99 ./test/volume_server/loadtest/... +func TestSustainedP99(t *testing.T) { + if testing.Short() { + t.Skip("skipping sustained load test in short mode") + } + + duration := 60 * time.Second + if d := os.Getenv("LOADTEST_DURATION"); d != "" { + parsed, err := time.ParseDuration(d) + if err == nil && parsed > 0 { + duration = parsed + } + } + + impl := implName() + adminURL, grpcAddr, cleanup := setupCluster(t) + defer cleanup() + + httpClient := &http.Client{ + Timeout: 30 * time.Second, + Transport: &http.Transport{ + MaxIdleConnsPerHost: 128, + MaxConnsPerHost: 128, + }, + } + + type scenario struct { + name string + size int + concurrency int + isDownload bool + } + + scenarios := []scenario{ + {"Upload/1KB/c16", 1 << 10, 16, false}, + {"Upload/64KB/c16", 64 << 10, 16, false}, + {"Download/1KB/c16", 1 << 10, 16, true}, + {"Download/64KB/c16", 64 << 10, 16, true}, + } + + var nextVolID atomic.Uint32 + nextVolID.Store(30) + + for _, sc := range scenarios { + t.Run(fmt.Sprintf("%s/%s", impl, sc.name), func(t *testing.T) { + // Each scenario gets its own volume to avoid filling up + volumeID := nextVolID.Add(1) - 1 + allocateVolume(t, grpcAddr, volumeID) + + payload := makePayload(sc.size) + + // Pre-upload a pool of files for download tests + poolSize := 500 + baseKey := keyCounter.Add(uint64(poolSize*2)) - uint64(poolSize*2) + + if sc.isDownload { + t.Logf("Pre-uploading %d files for download test...", poolSize) + for i := 0; i < poolSize; i++ { + if err := uploadFile(httpClient, adminURL, volumeID, baseKey+uint64(i), 3, payload); err != nil { + t.Fatalf("pre-upload %d: %v", i, err) + } + } + } + + // Collect latencies from all workers + type latencyBucket struct { + mu sync.Mutex + latencies []time.Duration + } + bucket := &latencyBucket{ + latencies: make([]time.Duration, 0, 100000), + } + + var totalOps atomic.Int64 + var totalErrors atomic.Int64 + + deadline := time.Now().Add(duration) + start := time.Now() + + // For uploads, pre-seed the pool so subsequent writes are overwrites (no volume fill) + if !sc.isDownload { + t.Logf("Pre-seeding %d files for upload overwrite test...", poolSize) + for i := 0; i < poolSize; i++ { + if err := uploadFile(httpClient, adminURL, volumeID, baseKey+uint64(i), 3, payload); err != nil { + t.Fatalf("pre-seed %d: %v", i, err) + } + } + } + + var wg sync.WaitGroup + for w := 0; w < sc.concurrency; w++ { + wg.Add(1) + go func(workerID int) { + defer wg.Done() + localLats := make([]time.Duration, 0, 8192) + + var i uint64 + for time.Now().Before(deadline) { + // Cycle through the pool to avoid filling up the volume + key := baseKey + uint64(int(i)%poolSize) + + opStart := time.Now() + var err error + if sc.isDownload { + err = downloadFile(httpClient, adminURL, volumeID, key, 3) + } else { + err = uploadFile(httpClient, adminURL, volumeID, key, 3, payload) + } + lat := time.Since(opStart) + + localLats = append(localLats, lat) + totalOps.Add(1) + if err != nil { + totalErrors.Add(1) + } + i++ + + // Flush local buffer periodically + if len(localLats) >= 8192 { + bucket.mu.Lock() + bucket.latencies = append(bucket.latencies, localLats...) + bucket.mu.Unlock() + localLats = localLats[:0] + } + } + // Final flush + if len(localLats) > 0 { + bucket.mu.Lock() + bucket.latencies = append(bucket.latencies, localLats...) + bucket.mu.Unlock() + } + }(w) + } + + wg.Wait() + elapsed := time.Since(start) + + lats := bucket.latencies + n := len(lats) + ops := totalOps.Load() + errs := totalErrors.Load() + opsPerSec := float64(ops) / elapsed.Seconds() + + sortDurations(lats) + + pct := func(p float64) time.Duration { + idx := int(float64(n) * p / 100.0) + if idx >= n { + idx = n - 1 + } + return lats[idx] + } + + t.Logf("RESULT impl=%-4s test=%-22s duration=%-6s ops=%-8d errors=%-4d ops/s=%-10.1f", + impl, sc.name, elapsed.Round(time.Second), ops, errs, opsPerSec) + t.Logf(" p50=%-10s p90=%-10s p95=%-10s p99=%-10s p999=%-10s max=%-10s", + pct(50).Round(time.Microsecond), + pct(90).Round(time.Microsecond), + pct(95).Round(time.Microsecond), + pct(99).Round(time.Microsecond), + pct(99.9).Round(time.Microsecond), + lats[n-1].Round(time.Microsecond)) + }) + } +} diff --git a/test/volume_server/matrix/config_profiles.go b/test/volume_server/matrix/config_profiles.go index c359eb029..e01e35fc1 100644 --- a/test/volume_server/matrix/config_profiles.go +++ b/test/volume_server/matrix/config_profiles.go @@ -12,6 +12,7 @@ type Profile struct { EnableJWT bool JWTSigningKey string JWTReadKey string + EnableUIAccess bool EnableMaintain bool ConcurrentUploadLimitMB int diff --git a/test/volume_server/rust/rust_volume_test.go b/test/volume_server/rust/rust_volume_test.go new file mode 100644 index 000000000..6f1a0b74a --- /dev/null +++ b/test/volume_server/rust/rust_volume_test.go @@ -0,0 +1,310 @@ +package volume_server_rust_test + +import ( + "context" + "encoding/json" + "net/http" + "testing" + "time" + + "github.com/seaweedfs/seaweedfs/test/volume_server/framework" + "github.com/seaweedfs/seaweedfs/test/volume_server/matrix" + "github.com/seaweedfs/seaweedfs/weed/pb/volume_server_pb" +) + +func mustNewRequest(t testing.TB, method, url string) *http.Request { + t.Helper() + req, err := http.NewRequest(method, url, nil) + if err != nil { + t.Fatalf("create request %s %s: %v", method, url, err) + } + return req +} + +// TestRustHealthzEndpoint verifies that the Rust volume server responds to +// GET /healthz with HTTP 200. +func TestRustHealthzEndpoint(t *testing.T) { + if testing.Short() { + t.Skip("skipping integration test in short mode") + } + + cluster := framework.StartRustVolumeCluster(t, matrix.P1()) + client := framework.NewHTTPClient() + + resp := framework.DoRequest(t, client, mustNewRequest(t, http.MethodGet, cluster.VolumeAdminURL()+"/healthz")) + _ = framework.ReadAllAndClose(t, resp) + + if resp.StatusCode != http.StatusOK { + t.Fatalf("expected /healthz 200, got %d", resp.StatusCode) + } +} + +// TestRustStatusEndpoint verifies that GET /status returns 200 with a JSON +// body containing a "version" field. The Rust server uses lowercase field +// names in its axum JSON responses. +func TestRustStatusEndpoint(t *testing.T) { + if testing.Short() { + t.Skip("skipping integration test in short mode") + } + + cluster := framework.StartRustVolumeCluster(t, matrix.P1()) + client := framework.NewHTTPClient() + + resp := framework.DoRequest(t, client, mustNewRequest(t, http.MethodGet, cluster.VolumeAdminURL()+"/status")) + body := framework.ReadAllAndClose(t, resp) + + if resp.StatusCode != http.StatusOK { + t.Fatalf("expected /status 200, got %d, body: %s", resp.StatusCode, string(body)) + } + + var payload map[string]interface{} + if err := json.Unmarshal(body, &payload); err != nil { + t.Fatalf("decode /status JSON: %v", err) + } + + if _, ok := payload["Version"]; !ok { + t.Fatalf("/status JSON missing \"Version\" field, keys: %v", keys(payload)) + } +} + +// TestRustPingRPC verifies the gRPC Ping RPC returns non-zero timestamps. +func TestRustPingRPC(t *testing.T) { + if testing.Short() { + t.Skip("skipping integration test in short mode") + } + + cluster := framework.StartRustVolumeCluster(t, matrix.P1()) + conn, client := framework.DialVolumeServer(t, cluster.VolumeGRPCAddress()) + defer conn.Close() + + ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second) + defer cancel() + + resp, err := client.Ping(ctx, &volume_server_pb.PingRequest{}) + if err != nil { + t.Fatalf("Ping RPC failed: %v", err) + } + if resp.GetStartTimeNs() == 0 { + t.Fatalf("Ping StartTimeNs should be non-zero") + } + if resp.GetStopTimeNs() == 0 { + t.Fatalf("Ping StopTimeNs should be non-zero") + } + if resp.GetStopTimeNs() < resp.GetStartTimeNs() { + t.Fatalf("Ping StopTimeNs (%d) should be >= StartTimeNs (%d)", resp.GetStopTimeNs(), resp.GetStartTimeNs()) + } +} + +// TestRustAllocateAndWriteReadDelete exercises the full needle lifecycle: +// allocate a volume via gRPC, upload bytes via HTTP POST, read them back +// via HTTP GET, delete via HTTP DELETE, then confirm GET returns 404. +func TestRustAllocateAndWriteReadDelete(t *testing.T) { + if testing.Short() { + t.Skip("skipping integration test in short mode") + } + + cluster := framework.StartRustVolumeCluster(t, matrix.P1()) + conn, grpcClient := framework.DialVolumeServer(t, cluster.VolumeGRPCAddress()) + defer conn.Close() + + const volumeID = uint32(1) + framework.AllocateVolume(t, grpcClient, volumeID, "") + + httpClient := framework.NewHTTPClient() + fid := framework.NewFileID(volumeID, 1001, 0xAABBCCDD) + data := []byte("rust-volume-server-integration-test-payload") + + // Upload + uploadResp := framework.UploadBytes(t, httpClient, cluster.VolumeAdminURL(), fid, data) + _ = framework.ReadAllAndClose(t, uploadResp) + if uploadResp.StatusCode != http.StatusCreated { + t.Fatalf("upload expected 201, got %d", uploadResp.StatusCode) + } + + // Read back + getResp := framework.ReadBytes(t, httpClient, cluster.VolumeAdminURL(), fid) + getBody := framework.ReadAllAndClose(t, getResp) + if getResp.StatusCode != http.StatusOK { + t.Fatalf("read expected 200, got %d", getResp.StatusCode) + } + if string(getBody) != string(data) { + t.Fatalf("read body mismatch: got %q, want %q", string(getBody), string(data)) + } + + // Delete + deleteResp := framework.DoRequest(t, httpClient, mustNewRequest(t, http.MethodDelete, cluster.VolumeAdminURL()+"/"+fid)) + _ = framework.ReadAllAndClose(t, deleteResp) + if deleteResp.StatusCode != http.StatusAccepted && deleteResp.StatusCode != http.StatusOK { + t.Fatalf("delete expected 202 or 200, got %d", deleteResp.StatusCode) + } + + // Verify 404 after delete + gone := framework.ReadBytes(t, httpClient, cluster.VolumeAdminURL(), fid) + _ = framework.ReadAllAndClose(t, gone) + if gone.StatusCode != http.StatusNotFound { + t.Fatalf("read after delete expected 404, got %d", gone.StatusCode) + } +} + +// TestRustVolumeLifecycle tests the volume admin gRPC lifecycle: +// allocate, check status, unmount, mount, delete. +func TestRustVolumeLifecycle(t *testing.T) { + if testing.Short() { + t.Skip("skipping integration test in short mode") + } + + cluster := framework.StartRustVolumeCluster(t, matrix.P1()) + conn, client := framework.DialVolumeServer(t, cluster.VolumeGRPCAddress()) + defer conn.Close() + + ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second) + defer cancel() + + const volumeID = uint32(2) + framework.AllocateVolume(t, client, volumeID, "") + + // VolumeStatus should succeed on a freshly allocated volume. + statusResp, err := client.VolumeStatus(ctx, &volume_server_pb.VolumeStatusRequest{VolumeId: volumeID}) + if err != nil { + t.Fatalf("VolumeStatus failed: %v", err) + } + if statusResp.GetFileCount() != 0 { + t.Fatalf("new volume should be empty, got file_count=%d", statusResp.GetFileCount()) + } + + // Unmount then remount. + if _, err = client.VolumeUnmount(ctx, &volume_server_pb.VolumeUnmountRequest{VolumeId: volumeID}); err != nil { + t.Fatalf("VolumeUnmount failed: %v", err) + } + if _, err = client.VolumeMount(ctx, &volume_server_pb.VolumeMountRequest{VolumeId: volumeID}); err != nil { + t.Fatalf("VolumeMount failed: %v", err) + } + + // Delete. + if _, err = client.VolumeDelete(ctx, &volume_server_pb.VolumeDeleteRequest{VolumeId: volumeID, OnlyEmpty: true}); err != nil { + t.Fatalf("VolumeDelete failed: %v", err) + } + + // VolumeStatus should fail after delete. + _, err = client.VolumeStatus(ctx, &volume_server_pb.VolumeStatusRequest{VolumeId: volumeID}) + if err == nil { + t.Fatalf("VolumeStatus should fail after delete") + } +} + +// TestRustGetSetState verifies GetState returns a non-nil state and SetState +// echoes the state back. +func TestRustGetSetState(t *testing.T) { + if testing.Short() { + t.Skip("skipping integration test in short mode") + } + + cluster := framework.StartRustVolumeCluster(t, matrix.P1()) + conn, client := framework.DialVolumeServer(t, cluster.VolumeGRPCAddress()) + defer conn.Close() + + ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second) + defer cancel() + + // GetState should return non-nil state. + getResp, err := client.GetState(ctx, &volume_server_pb.GetStateRequest{}) + if err != nil { + t.Fatalf("GetState failed: %v", err) + } + if getResp.GetState() == nil { + t.Fatalf("GetState returned nil state") + } + + // SetState should echo back the state. + setResp, err := client.SetState(ctx, &volume_server_pb.SetStateRequest{ + State: &volume_server_pb.VolumeServerState{ + Version: getResp.GetState().GetVersion(), + }, + }) + if err != nil { + t.Fatalf("SetState failed: %v", err) + } + if setResp.GetState() == nil { + t.Fatalf("SetState returned nil state") + } + if setResp.GetState().GetVersion() < getResp.GetState().GetVersion() { + t.Fatalf("SetState version should not decrease: got %d, had %d", + setResp.GetState().GetVersion(), getResp.GetState().GetVersion()) + } +} + +// TestRustVolumeServerStatus verifies VolumeServerStatus returns a version +// string and at least one disk status entry. +func TestRustVolumeServerStatus(t *testing.T) { + if testing.Short() { + t.Skip("skipping integration test in short mode") + } + + cluster := framework.StartRustVolumeCluster(t, matrix.P1()) + conn, client := framework.DialVolumeServer(t, cluster.VolumeGRPCAddress()) + defer conn.Close() + + ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second) + defer cancel() + + resp, err := client.VolumeServerStatus(ctx, &volume_server_pb.VolumeServerStatusRequest{}) + if err != nil { + t.Fatalf("VolumeServerStatus failed: %v", err) + } + if resp.GetVersion() == "" { + t.Fatalf("VolumeServerStatus returned empty version") + } + if len(resp.GetDiskStatuses()) == 0 { + t.Fatalf("VolumeServerStatus returned no disk statuses") + } +} + +// TestRustMetricsEndpointIsNotOnAdminPortByDefault verifies that the default +// volume admin listener does not expose Prometheus metrics. Go serves metrics +// only on the dedicated metrics listener when -metricsPort is configured. +func TestRustMetricsEndpointIsNotOnAdminPortByDefault(t *testing.T) { + if testing.Short() { + t.Skip("skipping integration test in short mode") + } + + cluster := framework.StartRustVolumeCluster(t, matrix.P1()) + client := framework.NewHTTPClient() + + resp := framework.DoRequest(t, client, mustNewRequest(t, http.MethodGet, cluster.VolumeAdminURL()+"/metrics")) + body := framework.ReadAllAndClose(t, resp) + + if resp.StatusCode != http.StatusBadRequest { + t.Fatalf("expected admin /metrics 400 when metricsPort is unset, got %d body=%s", resp.StatusCode, string(body)) + } +} + +func TestRustUiAccessOverrideIgnoresReadJwt(t *testing.T) { + if testing.Short() { + t.Skip("skipping integration test in short mode") + } + + profile := matrix.P3() + profile.EnableUIAccess = true + + cluster := framework.StartRustVolumeCluster(t, profile) + client := framework.NewHTTPClient() + + resp := framework.DoRequest(t, client, mustNewRequest(t, http.MethodGet, cluster.VolumeAdminURL()+"/ui/index.html")) + body := framework.ReadAllAndClose(t, resp) + + if resp.StatusCode != http.StatusOK { + t.Fatalf("expected /ui/index.html 200 with access.ui override, got %d body=%s", resp.StatusCode, string(body)) + } + if len(body) == 0 { + t.Fatalf("expected non-empty UI response body") + } +} + +// keys returns the keys of a map for diagnostic messages. +func keys(m map[string]interface{}) []string { + ks := make([]string, 0, len(m)) + for k := range m { + ks = append(ks, k) + } + return ks +} diff --git a/weed/pb/Makefile b/weed/pb/Makefile index ad90e1fe5..94f5f668d 100644 --- a/weed/pb/Makefile +++ b/weed/pb/Makefile @@ -18,6 +18,7 @@ gen: protoc plugin.proto --go_out=./plugin_pb --go-grpc_out=./plugin_pb --go_opt=paths=source_relative --go-grpc_opt=paths=source_relative # protoc filer.proto --java_out=../../other/java/client/src/main/java cp filer.proto ../../other/java/client/src/main/proto + cp volume_server.proto master.proto remote.proto ../../seaweed-volume/proto/ fbs: flatc --go -o . --go-namespace message_fbs message.fbs diff --git a/weed/storage/volume.go b/weed/storage/volume.go index 48149f4d9..468491c10 100644 --- a/weed/storage/volume.go +++ b/weed/storage/volume.go @@ -94,7 +94,7 @@ func (v *Volume) IndexFileName() (fileName string) { func (v *Volume) FileName(ext string) (fileName string) { switch ext { - case ".idx", ".cpx", ".ldb", ".cpldb": + case ".idx", ".cpx", ".ldb", ".cpldb", ".rdb": return VolumeFileName(v.dirIdx, v.Collection, int(v.Id)) + ext } // .dat, .cpd, .vif diff --git a/weed/storage/volume_vacuum.go b/weed/storage/volume_vacuum.go index e97342597..c5027204c 100644 --- a/weed/storage/volume_vacuum.go +++ b/weed/storage/volume_vacuum.go @@ -201,6 +201,7 @@ func (v *Volume) CommitCompact() error { //time.Sleep(20 * time.Second) os.RemoveAll(v.FileName(".ldb")) + os.Remove(v.FileName(".rdb")) glog.V(3).Infof("Loading volume %d commit file...", v.Id) if e = v.load(true, false, v.needleMapKind, 0, v.Version()); e != nil { diff --git a/weed/storage/volume_write.go b/weed/storage/volume_write.go index 2f832e1f7..e78bb2b3a 100644 --- a/weed/storage/volume_write.go +++ b/weed/storage/volume_write.go @@ -99,6 +99,8 @@ func removeVolumeFiles(filename string) { os.Remove(filename + ".cpx") // level db index file os.RemoveAll(filename + ".ldb") + // redb index file (Rust volume server) + os.Remove(filename + ".rdb") // marker for damaged or incomplete volume os.Remove(filename + ".note") }